diff --git a/.github/ISSUE_TEMPLATE/40-tflite-op-request.md b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md
new file mode 100644
index 0000000000000000000000000000000000000000..7b391279e479ade4ed5327728f19be8752e11507
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/40-tflite-op-request.md
@@ -0,0 +1,24 @@
+---
+name: TensorFlow Lite Op Request
+about: Use this template for reporting ops you are using or missing.
+
+---
+
+
+**System information**
+- OS Platform and Distribution (e.g., Linux Ubuntu 16.04):
+- TensorFlow installed from (source or binary):
+- TensorFlow version (or github SHA if from source):
+
+
+**Provide the text output from tflite_convert**
+
+```
+# Copy and paste here
+```
+
+Also, please include a link to a GraphDef or the model if possible.
+
+**Any other info / logs**
+
+Include any logs or source code that would be helpful to diagnose the problem. If including tracebacks, please include the full traceback. Large logs and files should be attached.
diff --git a/.gitignore b/.gitignore
index 57d84228cfd037325716b5faa56c17f7424fe713..90324058600bee46af56e49028977971848a80de 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,7 +24,7 @@ Pods
 Podfile.lock
 *.pbxproj
 *.xcworkspacedata
-/tensorflow/lite/downloads/**
+/tensorflow/lite/tools/make/downloads/**
 /tensorflow/lite/gen/**
 /tensorflow/lite/examples/ios/simple/data/*.txt
 /tensorflow/lite/examples/ios/simple/data/*.tflite
diff --git a/CODEOWNERS b/CODEOWNERS
index 54a61a4d72c40d297d90d53e223f64f813d9167d..cb3fa2312405ce44d5dfc30ea4164740f436e07e 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,7 +1,7 @@
 # Where component owners are known, add them here.
 
 /tenosrflow/core/debug @caisq
-/tensorflow/core/nccl/ @azaks @csigg
+/tensorflow/core/nccl/ @azaks2 @chsigg
 /tensorflow/core/platform/windows/ @mrry
 /tensorflow/core/platform/s3 @yongtang
 /tensorflow/go @asimshankar
@@ -51,13 +51,13 @@
 /tensorflow/contrib/pi_examples/ @maciekcc
 /tensorflow/contrib/quantization/ @petewarden
 /tensorflow/contrib/rnn/ @ebrevdo @scottzhu
-/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenl
+/tensorflow/contrib/saved_model/ @nfiedel @sukritiramesh @allenlavoie
 /tensorflow/contrib/seq2seq/ @ebrevdo @lmthang
 /tensorflow/contrib/session_bundle/ @nfiedel @sukritiramesh
 /tensorflow/contrib/slim/ @sguada @thenbasilmanran
 /tensorflow/contrib/stateless/ @girving @alextp
 /tensorflow/contrib/tensor_forest/ @gilberthendry @thomascolthurst @yupbank
-/tensorflow/contrib/tensorrt/ @aaroey
+/tensorflow/contrib/tensorrt/ @aaroey @smit-hinsu @azaks2
 # NEED OWNER: /tensorflow/contrib/testing/
 /tensorflow/contrib/timeseries/ @allenlavoie
 /tensorflow/contrib/tpu/ @frankchn @saeta @jhseu @sourabhbajaj
diff --git a/README.md b/README.md
index 8af5370befbb090966a8b3af54d80c84a969aaa5..044174947a094d43a51f7140dd40ec0f17801d40 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,14 @@
 |-----------------|
 | [![Documentation](https://img.shields.io/badge/api-reference-blue.svg)](https://www.tensorflow.org/api_docs/) |
 
-**TensorFlow** is an open source software library for numerical computation using
-data flow graphs.  The graph nodes represent mathematical operations, while
+**TensorFlow** is an open source software library for numerical computation
+using data flow graphs. The graph nodes represent mathematical operations, while
 the graph edges represent the multidimensional data arrays (tensors) that flow
-between them.  This flexible architecture enables you to deploy computation to one
-or more CPUs or GPUs in a desktop, server, or mobile device without rewriting
-code.  TensorFlow also includes [TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard), a data visualization toolkit.
+between them. This flexible architecture enables you to deploy computation to
+one or more CPUs or GPUs in a desktop, server, or mobile device without
+rewriting code. TensorFlow also includes
+[TensorBoard](https://github.com/tensorflow/tensorboard), a data visualization
+toolkit.
 
 TensorFlow was originally developed by researchers and engineers
 working on the Google Brain team within Google's Machine Intelligence Research
@@ -111,22 +113,24 @@ The TensorFlow project strives to abide by generally accepted best practices in
 Build Type                                                                                                                                                                                      | Status                                                                                                                                                                                   | Artifacts
 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------
 **IBM s390x**                                                                                                                                                                                   | [![Build Status](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/badge/icon)](http://ibmz-ci.osuosl.org/job/TensorFlow_IBMZ_CI/)                                                        | TBA
-**IBM ppc64le CPU**                                                                                                                                                                             | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_Ubuntu_16.04_CPU/)                                      | TBA
+**IBM ppc64le CPU**                                                                                                                                                                             | [![Build Status](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/badge/icon)](http://powerci.osuosl.org/job/TensorFlow_PPC64LE_CPU_Build/)                                    | TBA
 **IBM ppc64le GPU** Nightly                                                                                                                                                                     | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)            | [Nightly](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Nightly_Artifact/)
 **IBM ppc64le GPU** Stable Release                                                                                                                                                              | [![Build Status](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/badge/icon)](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)                  | [Release](https://powerci.osuosl.org/job/TensorFlow_PPC64LE_GPU_Release_Build/)
 **Linux CPU with Intel® MKL-DNN** Nightly                                                                                                                                                       | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-linux-cpu/)                                | [Nightly](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-whl-nightly/)
 **Linux CPU with Intel® MKL-DNN** Python 2.7<br> **Linux CPU with Intel® MKL-DNN** Python 3.4<br> **Linux CPU with Intel® MKL-DNN** Python 3.5<br> **Linux CPU with Intel® MKL-DNN** Python 3.6 | [![Build Status](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/badge/icon)](https://tensorflow-ci.intel.com/job/tensorflow-mkl-build-release-whl/lastStableBuild) | [1.11.0 py2.7](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp27-cp27mu-linux_x86_64.whl)<br>[1.11.0 py3.4](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp34-cp34m-linux_x86_64.whl)<br>[1.11.0 py3.5](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp35-cp35m-linux_x86_64.whl)<br>[1.11.0 py3.6](https://storage.googleapis.com/intel-optimized-tensorflow/tensorflow-1.11.0-cp36-cp36m-linux_x86_64.whl)
 
 ## For more information
-* [TensorFlow Website](https://www.tensorflow.org)
-* [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/)
-* [TensorFlow Model Zoo](https://github.com/tensorflow/models)
-* [TensorFlow Twitter](https://twitter.com/tensorflow)
-* [TensorFlow Blog](https://medium.com/tensorflow)
-* [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
-* [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap)
-* [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
-* [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
+
+*   [TensorFlow Website](https://www.tensorflow.org)
+*   [TensorFlow Tutorials](https://www.tensorflow.org/tutorials/)
+*   [TensorFlow Model Zoo](https://github.com/tensorflow/models)
+*   [TensorFlow Twitter](https://twitter.com/tensorflow)
+*   [TensorFlow Blog](https://medium.com/tensorflow)
+*   [TensorFlow Course at Stanford](https://web.stanford.edu/class/cs20si)
+*   [TensorFlow Roadmap](https://www.tensorflow.org/community/roadmap)
+*   [TensorFlow White Papers](https://www.tensorflow.org/about/bib)
+*   [TensorFlow YouTube Channel](https://www.youtube.com/channel/UC0rqucBdTuFTjJiefW5t-IQ)
+*   [TensorFlow Visualization Toolkit](https://github.com/tensorflow/tensorboard)
 
 Learn more about the TensorFlow community at the [community page of tensorflow.org](https://www.tensorflow.org/community) for a few ways to participate.
 
diff --git a/WORKSPACE b/WORKSPACE
index 0c7bc085b512b084b9470abe17326d7c119aa327..7cc08e0164a202581ad7ebbe107a9e19410e70e4 100644
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -1,5 +1,7 @@
 workspace(name = "org_tensorflow")
 
+load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
+
 http_archive(
     name = "io_bazel_rules_closure",
     sha256 = "a38539c5b5c358548e75b44141b4ab637bba7c4dc02b46b1f62a96d6433f56ae",
@@ -57,9 +59,9 @@ android_workspace()
 # Please add all new TensorFlow dependencies in workspace.bzl.
 tf_workspace()
 
-new_http_archive(
+http_archive(
     name = "inception_v1",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "7efe12a8363f09bc24d7b7a450304a15655a57a7751929b2c1593a71183bb105",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/inception_v1.zip",
@@ -67,9 +69,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_ssd",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "bddd81ea5c80a97adfac1c9f770e6f55cbafd7cce4d3bbe15fbeb041e6b8f3e8",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/object_detection/ssd_mobilenet_v1_android_export.zip",
@@ -77,9 +79,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "mobile_multibox",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "859edcddf84dddb974c36c36cfc1f74555148e9c9213dedacf1d6b613ad52b96",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/mobile_multibox_v1a.zip",
@@ -87,9 +89,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "stylize",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "3d374a730aef330424a356a8d4f04d8a54277c425e274ecb7d9c83aa912c6bfa",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/stylize_v1.zip",
@@ -97,9 +99,9 @@ new_http_archive(
     ],
 )
 
-new_http_archive(
+http_archive(
     name = "speech_commands",
-    build_file = "models.BUILD",
+    build_file = "//:models.BUILD",
     sha256 = "c3ec4fea3158eb111f1d932336351edfe8bd515bb6e87aad4f25dbad0a600d0c",
     urls = [
         "http://storage.googleapis.com/download.tensorflow.org/models/speech_commands_v0.01.zip",
diff --git a/configure.py b/configure.py
index 234561d94a46f57c4de5ca487360e2d5a3dfdb2f..6c905a0be3d685b5921dfbc5bddfbe6471a82625 100644
--- a/configure.py
+++ b/configure.py
@@ -238,6 +238,13 @@ def setup_python(environ_cp):
   write_to_bazelrc('build --python_path=\"%s"' % python_bin_path)
   environ_cp['PYTHON_BIN_PATH'] = python_bin_path
 
+  # If choosen python_lib_path is from a path specified in the PYTHONPATH
+  # variable, need to tell bazel to include PYTHONPATH
+  if environ_cp.get('PYTHONPATH'):
+    python_paths = environ_cp.get('PYTHONPATH').split(':')
+    if python_lib_path in python_paths:
+      write_action_env_to_bazelrc('PYTHONPATH', environ_cp.get('PYTHONPATH'))
+
   # Write tools/python_bin_path.sh
   with open(
       os.path.join(_TF_WORKSPACE_ROOT, 'tools', 'python_bin_path.sh'),
@@ -445,11 +452,12 @@ def convert_version_to_int(version):
   return int(version_str)
 
 
-def check_bazel_version(min_version):
-  """Check installed bazel version is at least min_version.
+def check_bazel_version(min_version, max_version):
+  """Check installed bazel version is between min_version and max_version.
 
   Args:
     min_version: string for minimum bazel version.
+    max_version: string for maximum bazel version.
 
   Returns:
     The bazel version detected.
@@ -467,6 +475,7 @@ def check_bazel_version(min_version):
 
   min_version_int = convert_version_to_int(min_version)
   curr_version_int = convert_version_to_int(curr_version)
+  max_version_int = convert_version_to_int(max_version)
 
   # Check if current bazel version can be detected properly.
   if not curr_version_int:
@@ -480,6 +489,10 @@ def check_bazel_version(min_version):
     print('Please upgrade your bazel installation to version %s or higher to '
           'build TensorFlow!' % min_version)
     sys.exit(0)
+  if curr_version_int > max_version_int:
+    print('Please downgrade your bazel installation to version %s or lower to '
+          'build TensorFlow!' % max_version)
+    sys.exit(0)
   return curr_version
 
 
@@ -859,7 +872,7 @@ def set_tf_cuda_version(environ_cp):
     cuda_toolkit_paths_full = [
         os.path.join(cuda_toolkit_path, x) for x in cuda_rt_lib_paths
     ]
-    if any([os.path.exists(x) for x in cuda_toolkit_paths_full]):
+    if any(os.path.exists(x) for x in cuda_toolkit_paths_full):
       break
 
     # Reset and retry
@@ -1552,7 +1565,7 @@ def main():
   # environment variables.
   environ_cp = dict(os.environ)
 
-  check_bazel_version('0.15.0')
+  check_bazel_version('0.15.0', '0.20.0')
 
   reset_tf_configure_bazelrc()
   # Explicitly import tools/bazel.rc, this is needed for Bazel 0.19.0 or later
@@ -1694,6 +1707,7 @@ def main():
   config_info_line('nohdfs', 'Disable HDFS support.')
   config_info_line('noignite', 'Disable Apacha Ignite support.')
   config_info_line('nokafka', 'Disable Apache Kafka support.')
+  config_info_line('nonccl', 'Disable NVIDIA NCCL support.')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/BUILD b/tensorflow/BUILD
index 859dc3b8d77be66e0f51e15d86188399273af23f..fd4b94202aad24a82abef8abd16431f61a8326f0 100644
--- a/tensorflow/BUILD
+++ b/tensorflow/BUILD
@@ -43,6 +43,11 @@ TENSORFLOW_API_INIT_FILES_V2 = (
     TENSORFLOW_API_INIT_FILES + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
 )
 
+# @unused
+TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT = (
+    TENSORFLOW_API_INIT_FILES_V1 + get_compat_files(TENSORFLOW_API_INIT_FILES_V1, 1)
+)
+
 # Config setting used when building for products
 # which requires restricted licenses to be avoided.
 config_setting(
@@ -213,31 +218,37 @@ config_setting(
 #
 config_setting(
     name = "no_aws_support",
-    define_values = {"no_aws_support": "false"},
+    define_values = {"no_aws_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_gcp_support",
-    define_values = {"no_gcp_support": "false"},
+    define_values = {"no_gcp_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_hdfs_support",
-    define_values = {"no_hdfs_support": "false"},
+    define_values = {"no_hdfs_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_ignite_support",
-    define_values = {"no_ignite_support": "false"},
+    define_values = {"no_ignite_support": "true"},
     visibility = ["//visibility:public"],
 )
 
 config_setting(
     name = "no_kafka_support",
-    define_values = {"no_kafka_support": "false"},
+    define_values = {"no_kafka_support": "true"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "no_nccl_support",
+    define_values = {"no_nccl_support": "true"},
     visibility = ["//visibility:public"],
 )
 
@@ -350,7 +361,7 @@ package_group(
         "-//third_party/tensorflow/python/estimator",
         "//learning/meta_rank/...",
         "//tensorflow/...",
-        "//tensorflow_estimator/...",
+        "//tensorflow_estimator/contrib/...",
         "//tensorflow_fold/llgtm/...",
         "//tensorflow_text/...",
         "//third_party/py/tensor2tensor/...",
@@ -554,18 +565,24 @@ genrule(
     }),
     outs = ["__init__.py"],
     cmd = select({
-        "api_version_2": "cp $(@D)/_api/v2/__init__.py $(OUTS)",
-        "//conditions:default": "cp $(@D)/_api/v1/__init__.py $(OUTS)",
+        "api_version_2": "cp $(@D)/_api/v2/v2.py $(OUTS)",
+        "//conditions:default": "cp $(@D)/_api/v1/v1.py $(OUTS)",
     }),
 )
 
 gen_api_init_files(
     name = "tf_python_api_gen_v1",
-    srcs = ["api_template_v1.__init__.py"],
+    srcs = [
+        "api_template_v1.__init__.py",
+        "compat_template_v1.__init__.py",
+    ],
     api_version = 1,
+    compat_api_versions = [1],
+    compat_init_templates = ["compat_template_v1.__init__.py"],
     output_dir = "_api/v1/",
-    output_files = TENSORFLOW_API_INIT_FILES_V1,
+    output_files = TENSORFLOW_API_INIT_FILES_V1_WITH_COMPAT,
     output_package = "tensorflow._api.v1",
+    root_file_name = "v1.py",
     root_init_template = "api_template_v1.__init__.py",
 )
 
@@ -581,6 +598,7 @@ gen_api_init_files(
     output_dir = "_api/v2/",
     output_files = TENSORFLOW_API_INIT_FILES_V2,
     output_package = "tensorflow._api.v2",
+    root_file_name = "v2.py",
     root_init_template = "api_template.__init__.py",
 )
 
diff --git a/tensorflow/api_template.__init__.py b/tensorflow/api_template.__init__.py
index 0d49756838505289a960a6cabeb7cab02fad995b..d81cf067eb07e88e2b8a86cf5643674235eb3f3b 100644
--- a/tensorflow/api_template.__init__.py
+++ b/tensorflow/api_template.__init__.py
@@ -21,8 +21,6 @@ from __future__ import print_function as _print_function
 import os as _os
 
 # pylint: disable=g-bad-import-order
-from tensorflow.python import pywrap_tensorflow  # pylint: disable=unused-import
-
 from tensorflow.python.tools import component_api_helper as _component_api_helper
 _component_api_helper.package_hook(
     parent_package_str=__name__,
@@ -30,16 +28,16 @@ _component_api_helper.package_hook(
 
 # API IMPORTS PLACEHOLDER
 
-from tensorflow.python.platform import flags  # pylint: disable=g-import-not-at-top
-
 # Make sure directory containing top level submodules is in
 # the __path__ so that "from tensorflow.foo import bar" works.
-_tf_api_dir = _os.path.dirname(_os.path.dirname(app.__file__))  # pylint: disable=undefined-variable
+# We're using bitwise, but there's nothing special about that.
+_tf_api_dir = _os.path.dirname(_os.path.dirname(bitwise.__file__))  # pylint: disable=undefined-variable
 if _tf_api_dir not in __path__:
   __path__.append(_tf_api_dir)
 
-# Calls to enable and disable features.
-enable_eager_execution()  # pylint: disable=undefined-variable
+# Enable TF2 behaviors
+from tensorflow.python.compat import compat as _compat  # pylint: disable=g-import-not-at-top
+_compat.enable_v2_behavior()
 
 # These symbols appear because we import the python package which
 # in turn imports from tensorflow.core and tensorflow.python. They
diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD
index b8db1b2144978e97bd32f62e643c2c4a7fcf1654..25df970ecab0757f23465ab19e7f45de0c759458 100644
--- a/tensorflow/c/BUILD
+++ b/tensorflow/c/BUILD
@@ -60,6 +60,7 @@ tf_cuda_library(
             "//tensorflow/core:framework",
             "//tensorflow/core:lib",
             "//tensorflow/core:op_gen_lib",
+            "//tensorflow/core/distributed_runtime:server_lib",
         ],
     }),
 )
@@ -120,7 +121,8 @@ tf_cuda_library(
         ":c_api",
         ":c_api_internal",
         "//tensorflow/c/eager:c_api",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
+        "//tensorflow/c/eager:c_api_internal",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/contrib/tpu:all_ops",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
@@ -173,6 +175,60 @@ tf_cuda_library(
     ],
 )
 
+tf_cuda_library(
+    name = "env",
+    srcs = [
+        "env.cc",
+    ],
+    hdrs = [
+        "env.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":tf_status_helper",
+            "//tensorflow/core:framework",
+            "//tensorflow/core:platform_env",
+            "//tensorflow/core:lib",
+        ],
+    }) + [":c_api_internal"],
+)
+
+tf_cuda_library(
+    name = "kernels",
+    srcs = [
+        "kernels.cc",
+    ],
+    hdrs = [
+        "kernels.h",
+    ],
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = select({
+        "//tensorflow:android": [
+            ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
+            "//tensorflow/core:android_tensorflow_lib_lite",
+        ],
+        "//conditions:default": [
+            ":c_api",
+            ":c_api_internal",
+            ":tf_status_helper",
+            "//tensorflow/core:framework",
+        ],
+    }),
+)
+
 # -----------------------------------------------------------------------------
 # Tests
 
@@ -208,7 +264,10 @@ tf_cuda_cc_test(
         "//tensorflow:darwin": ["-headerpad_max_install_names"],
         "//conditions:default": [],
     }),
-    tags = ["noasan"],
+    tags = [
+        "no_oss",  # http://b/119522529
+        "noasan",
+    ],
     # We must ensure that the dependencies can be dynamically linked since
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
@@ -237,7 +296,7 @@ tf_cuda_cc_test(
 
 tf_cc_test(
     name = "c_api_experimental_test",
-    size = "small",
+    size = "medium",
     srcs = ["c_api_experimental_test.cc"],
     data = ["testdata/tf_record"],
     linkopts = select({
@@ -248,8 +307,11 @@ tf_cc_test(
     # the shared library must be able to use core:framework.
     # linkstatic = tf_kernel_tests_linkstatic(),
     deps = [
+        ":c_api",
         ":c_api_experimental",
         ":c_test_util",
+        "//tensorflow/c/eager:c_api",
+        "//tensorflow/c/eager:c_api_test_util",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
@@ -300,6 +362,51 @@ tf_kernel_library(
     alwayslink = 1,
 )
 
+tf_cuda_cc_test(
+    name = "env_test",
+    size = "small",
+    srcs = ["env_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":env",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
+tf_cuda_cc_test(
+    name = "kernels_test",
+    size = "small",
+    srcs = ["kernels_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    tags = ["noasan"],
+    # We must ensure that the dependencies can be dynamically linked since
+    # the shared library must be able to use core:framework.
+    # linkstatic = tf_kernel_tests_linkstatic(),
+    deps = [
+        ":c_api",
+        ":kernels",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:proto_text",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 # -----------------------------------------------------------------------------
 # Python API target
 
diff --git a/tensorflow/c/c_api.cc b/tensorflow/c/c_api.cc
index f13e8777dff164bcd8eedf46310ae846abd0c804..94d18eb8b04e3534be547aca5cfbb32da40ffbf6 100644
--- a/tensorflow/c/c_api.cc
+++ b/tensorflow/c/c_api.cc
@@ -136,16 +136,22 @@ const char* TF_Message(const TF_Status* s) {
 namespace {
 class TF_ManagedBuffer : public TensorBuffer {
  public:
-  void* data_;
-  size_t len_;
-  void (*deallocator_)(void* data, size_t len, void* arg);
-  void* deallocator_arg_;
+  TF_ManagedBuffer(void* data, size_t len,
+                   void (*deallocator)(void* data, size_t len, void* arg),
+                   void* deallocator_arg)
+      : TensorBuffer(data),
+        len_(len),
+        deallocator_(deallocator),
+        deallocator_arg_(deallocator_arg) {}
+
+  const size_t len_;
+  void (*const deallocator_)(void* data, size_t len, void* arg);
+  void* const deallocator_arg_;
 
   ~TF_ManagedBuffer() override {
-    (*deallocator_)(data_, len_, deallocator_arg_);
+    (*deallocator_)(data(), len_, deallocator_arg_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -199,8 +205,7 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     dimvec[i] = static_cast<tensorflow::int64>(dims[i]);
   }
 
-  TF_ManagedBuffer* buf = new TF_ManagedBuffer;
-  buf->len_ = len;
+  TF_ManagedBuffer* buf = nullptr;
   if (dtype != TF_STRING && dtype != TF_RESOURCE &&
       tensorflow::DataTypeCanUseMemcpy(static_cast<DataType>(dtype)) &&
       reinterpret_cast<intptr_t>(data) % std::max(1, EIGEN_MAX_ALIGN_BYTES) !=
@@ -212,17 +217,15 @@ TF_Tensor* TF_NewTensor(TF_DataType dtype, const int64_t* dims, int num_dims,
     //
     // Other types have the same representation, so copy only if it is safe to
     // do so.
-    buf->data_ = allocate_tensor("TF_NewTensor", len);
-    std::memcpy(buf->data_, data, len);
-    buf->deallocator_ = deallocate_buffer;
-    buf->deallocator_arg_ = nullptr;
+    buf = new TF_ManagedBuffer(allocate_tensor("TF_NewTensor", len), len,
+                               deallocate_buffer, nullptr);
+    std::memcpy(buf->data(), data, len);
     // Free the original buffer.
     deallocator(data, len, deallocator_arg);
   } else {
-    buf->data_ = data;
-    buf->deallocator_ = deallocator;
-    buf->deallocator_arg_ = deallocator_arg;
+    buf = new TF_ManagedBuffer(data, len, deallocator, deallocator_arg);
   }
+
   TF_Tensor* ret = new TF_Tensor{dtype, TensorShape(dimvec), buf};
   size_t elem_size = TF_DataTypeSize(dtype);
   if (elem_size > 0 && len < (elem_size * ret->shape.num_elements())) {
@@ -477,9 +480,9 @@ static TF_Tensor* EmptyTensor(TF_DataType dtype, const TensorShape& shape) {
   CHECK_EQ(nelems, 0);
   static_assert(sizeof(int64_t) == sizeof(tensorflow::int64),
                 "64-bit int types should match in size");
-  return TF_NewTensor(dtype, reinterpret_cast<const int64_t*>(dims.data()),
-                      shape.dims(), reinterpret_cast<void*>(&empty), 0,
-                      [](void*, size_t, void*) {}, nullptr);
+  return TF_NewTensor(
+      dtype, reinterpret_cast<const int64_t*>(dims.data()), shape.dims(),
+      reinterpret_cast<void*>(&empty), 0, [](void*, size_t, void*) {}, nullptr);
 }
 
 // Non-static for testing.
@@ -1592,18 +1595,20 @@ TF_AttrMetadata TF_OperationGetAttrMetadata(TF_Operation* oper,
     break;                                            \
   }
 
-      LIST_CASE(s, TF_ATTR_STRING, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().s_size();
-                     ++i) { metadata.total_size += attr->list().s(i).size(); });
+      LIST_CASE(
+          s, TF_ATTR_STRING, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().s_size();
+               ++i) { metadata.total_size += attr->list().s(i).size(); });
       LIST_CASE(i, TF_ATTR_INT);
       LIST_CASE(f, TF_ATTR_FLOAT);
       LIST_CASE(b, TF_ATTR_BOOL);
       LIST_CASE(type, TF_ATTR_TYPE);
-      LIST_CASE(shape, TF_ATTR_SHAPE, metadata.total_size = 0;
-                for (int i = 0; i < attr->list().shape_size(); ++i) {
-                  const auto& s = attr->list().shape(i);
-                  metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
-                });
+      LIST_CASE(
+          shape, TF_ATTR_SHAPE, metadata.total_size = 0;
+          for (int i = 0; i < attr->list().shape_size(); ++i) {
+            const auto& s = attr->list().shape(i);
+            metadata.total_size += s.unknown_rank() ? 0 : s.dim_size();
+          });
       LIST_CASE(tensor, TF_ATTR_TENSOR);
       LIST_CASE(tensor, TF_ATTR_FUNC);
 #undef LIST_CASE
diff --git a/tensorflow/c/c_api.h b/tensorflow/c/c_api.h
index 3d56268110edbe96616201d15a69cc8c84d3115a..c7abba85521fccec07983cd5ab4f94a8368d6181 100644
--- a/tensorflow/c/c_api.h
+++ b/tensorflow/c/c_api.h
@@ -91,7 +91,7 @@ extern "C" {
 // --------------------------------------------------------------------------
 // TF_Version returns a string describing version information of the
 // TensorFlow library. TensorFlow using semantic versioning.
-TF_CAPI_EXPORT extern const char* TF_Version();
+TF_CAPI_EXPORT extern const char* TF_Version(void);
 
 // --------------------------------------------------------------------------
 // TF_DataType holds the type for a scalar value.  E.g., one slot in a tensor.
@@ -157,7 +157,7 @@ typedef enum TF_Code {
 typedef struct TF_Status TF_Status;
 
 // Return a new status object.
-TF_CAPI_EXPORT extern TF_Status* TF_NewStatus();
+TF_CAPI_EXPORT extern TF_Status* TF_NewStatus(void);
 
 // Delete a previously created status object.
 TF_CAPI_EXPORT extern void TF_DeleteStatus(TF_Status*);
@@ -196,7 +196,7 @@ TF_CAPI_EXPORT extern TF_Buffer* TF_NewBufferFromString(const void* proto,
                                                         size_t proto_len);
 
 // Useful for passing *out* a protobuf.
-TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer();
+TF_CAPI_EXPORT extern TF_Buffer* TF_NewBuffer(void);
 
 TF_CAPI_EXPORT extern void TF_DeleteBuffer(TF_Buffer*);
 
@@ -305,7 +305,7 @@ TF_CAPI_EXPORT extern size_t TF_StringEncodedSize(size_t len);
 typedef struct TF_SessionOptions TF_SessionOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions();
+TF_CAPI_EXPORT extern TF_SessionOptions* TF_NewSessionOptions(void);
 
 // Set the target in TF_SessionOptions.options.
 // target can be empty, a single entry, or a comma separated list of entries.
@@ -338,7 +338,7 @@ TF_CAPI_EXPORT extern void TF_DeleteSessionOptions(TF_SessionOptions*);
 typedef struct TF_Graph TF_Graph;
 
 // Return a new graph object.
-TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph();
+TF_CAPI_EXPORT extern TF_Graph* TF_NewGraph(void);
 
 // Destroy an options object.  Graph will be deleted once no more
 // TFSession's are referencing it.
@@ -890,7 +890,8 @@ TF_CAPI_EXPORT extern void TF_GraphVersions(TF_Graph* graph,
 // TF_GraphImportGraphDef.
 typedef struct TF_ImportGraphDefOptions TF_ImportGraphDefOptions;
 
-TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions();
+TF_CAPI_EXPORT extern TF_ImportGraphDefOptions* TF_NewImportGraphDefOptions(
+    void);
 TF_CAPI_EXPORT extern void TF_DeleteImportGraphDefOptions(
     TF_ImportGraphDefOptions* opts);
 
@@ -1611,7 +1612,7 @@ TF_CAPI_EXPORT extern void TF_DeleteLibraryHandle(TF_Library* lib_handle);
 //
 // The data in the buffer will be the serialized OpList proto for ops registered
 // in this address space.
-TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList();
+TF_CAPI_EXPORT extern TF_Buffer* TF_GetAllOpList(void);
 
 // TF_ApiDefMap encapsulates a collection of API definitions for an operation.
 //
diff --git a/tensorflow/c/c_api_experimental.cc b/tensorflow/c/c_api_experimental.cc
index fabe2fa0f60bc8baafa7f83802da74bb7ab93c6d..38e29aa74a90f4e85d1369b6928a5a58c531b2da 100644
--- a/tensorflow/c/c_api_experimental.cc
+++ b/tensorflow/c/c_api_experimental.cc
@@ -15,13 +15,18 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 
+#include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/core/common_runtime/eager/attr_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/graph/graph.h"
 #include "tensorflow/core/graph/node_builder.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/net.h"
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/tensorflow_server.pb.h"
@@ -51,8 +56,8 @@ void TF_EnableXLACompilation(TF_SessionOptions* options, unsigned char enable) {
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -71,8 +76,8 @@ TF_Buffer* TF_CreateConfig(unsigned char enable_xla_compilation,
     // These XLA flags are needed to trigger XLA properly from C (more generally
     // non-Python) clients. If this API is called again with `enable` set to
     // false, it is safe to keep these flag values as is.
-    tensorflow::legacy_flags::MarkForCompilationPassFlags* flags =
-        tensorflow::legacy_flags::GetMarkForCompilationPassFlags();
+    tensorflow::MarkForCompilationPassFlags* flags =
+        tensorflow::GetMarkForCompilationPassFlags();
     flags->tf_xla_cpu_global_jit = true;
     flags->tf_xla_min_cluster_size = 1;
   } else {
@@ -6525,7 +6530,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/cycle_length"
+      name: "ExperimentalParallelInterleaveDataset/cycle_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6546,7 +6551,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/block_length"
+      name: "ExperimentalParallelInterleaveDataset/block_length"
       op: "Const"
       attr {
         key: "dtype"
@@ -6567,7 +6572,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/sloppy"
+      name: "ExperimentalParallelInterleaveDataset/sloppy"
       op: "Const"
       attr {
         key: "dtype"
@@ -6588,7 +6593,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/buffer_output_elements"
+      name: "ExperimentalParallelInterleaveDataset/buffer_output_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6609,7 +6614,7 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset/prefetch_input_elements"
+      name: "ExperimentalParallelInterleaveDataset/prefetch_input_elements"
       op: "Const"
       attr {
         key: "dtype"
@@ -6630,14 +6635,14 @@ library {
       }
     }
     node_def {
-      name: "ParallelInterleaveDataset"
-      op: "ParallelInterleaveDataset"
+      name: "ExperimentalParallelInterleaveDataset"
+      op: "ExperimentalParallelInterleaveDataset"
       input: "RepeatDataset:handle:0"
-      input: "ParallelInterleaveDataset/cycle_length:output:0"
-      input: "ParallelInterleaveDataset/block_length:output:0"
-      input: "ParallelInterleaveDataset/sloppy:output:0"
-      input: "ParallelInterleaveDataset/buffer_output_elements:output:0"
-      input: "ParallelInterleaveDataset/prefetch_input_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/cycle_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/block_length:output:0"
+      input: "ExperimentalParallelInterleaveDataset/sloppy:output:0"
+      input: "ExperimentalParallelInterleaveDataset/buffer_output_elements:output:0"
+      input: "ExperimentalParallelInterleaveDataset/prefetch_input_elements:output:0"
       attr {
         key: "Targuments"
         value {
@@ -6737,7 +6742,7 @@ library {
     node_def {
       name: "ShuffleDataset_2"
       op: "ShuffleDataset"
-      input: "ParallelInterleaveDataset:handle:0"
+      input: "ExperimentalParallelInterleaveDataset:handle:0"
       input: "ShuffleDataset_2/buffer_size_1:output:0"
       input: "ShuffleDataset_2/seed_2:output:0"
       input: "ShuffleDataset_2/seed2_2:output:0"
@@ -8739,14 +8744,65 @@ void TFE_TensorHandlePrintDebugString(TFE_TensorHandle* handle) {
   TF_DeleteStatus(status);
 }
 
-TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
-                                                      const char* errMsg) {
+struct TFE_ExecuteOpNotification {
+  TFE_ExecuteOpNotification() : status(TF_NewStatus(), TF_DeleteStatus) {}
+  tensorflow::Notification n;
+  std::unique_ptr<tensorflow::Thread> thread;
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status;
+};
+
+TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(TFE_Op* op,
+                                                    TFE_TensorHandle** retvals,
+                                                    int* num_retvals,
+                                                    TF_Status* status) {
+  TFE_ExecuteOpNotification* n = new TFE_ExecuteOpNotification;
+
+  n->thread.reset(op->operation.EagerContext()->TFEnv()->StartThread(
+      tensorflow::ThreadOptions(), "ExecuteOpThread",
+      [op, retvals, num_retvals, n]() {
+        TFE_Execute(op, retvals, num_retvals, n->status.get());
+        n->n.Notify();
+      }));
+
+  return n;
+}
+
+void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status) {
+  if (notification == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification is a nullptr.");
+
+    return;
+  }
+  if (notification->thread == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "Passed in notification didn't start a thread correctly. Cleaning up "
+        "this notification. Please re-execute the operation to get a new "
+        "notification.");
+
+    delete notification;
+    return;
+  }
+
+  notification->n.WaitForNotification();
+
+  status->status = notification->status->status;
+
+  delete notification;
+}
+
+void TF_MakeInternalErrorStatus(TF_Status* status, const char* errMsg) {
   status->status = tensorflow::errors::Internal(errMsg);
 }
 
 // This builder is used in the eager API to build a NodeDef.
 struct TF_AttrBuilder : public tensorflow::AttrBuilder {
   using tensorflow::AttrBuilder::AttrBuilder;
+  // The string buffers to make sure that any `attr_name` we pass into
+  // `builder->Set()` will outlive the subsequent
+  // `TF_AttrBuilderCheckCanRunOnDevice()` call(s) on the same `builder`.
+  std::set<std::string> attr_names;
 };
 
 TF_AttrBuilder* TF_NewAttrBuilder(const char* op_name) {
@@ -8757,13 +8813,15 @@ void TF_DeleteAttrBuilder(TF_AttrBuilder* builder) { delete builder; }
 
 void TF_AttrBuilderSetType(TF_AttrBuilder* builder, const char* attr_name,
                            TF_DataType value) {
-  builder->Set(attr_name, static_cast<tensorflow::DataType>(value));
+  auto iter = builder->attr_names.insert(attr_name).first;
+  builder->Set((*iter).c_str(), static_cast<tensorflow::DataType>(value));
 }
 
 void TF_AttrBuilderSetTypeList(TF_AttrBuilder* builder, const char* attr_name,
                                const TF_DataType* values, int num_values) {
+  auto iter = builder->attr_names.insert(attr_name).first;
   builder->Set(
-      attr_name,
+      (*iter).c_str(),
       tensorflow::gtl::ArraySlice<const tensorflow::DataType>(
           reinterpret_cast<const tensorflow::DataType*>(values), num_values));
 }
@@ -8800,3 +8858,31 @@ const char* TF_GetNumberAttrForOpListInput(const char* op_name, int input_index,
   // The returned string is owned by OpRegistry, so liveness is not a concern.
   return input_arg.number_attr().c_str();
 }
+
+int TF_OpIsStateful(const char* op_type, TF_Status* status) {
+  const tensorflow::OpRegistrationData* op_reg_data;
+  status->status =
+      tensorflow::OpRegistry::Global()->LookUp(op_type, &op_reg_data);
+  if (!status->status.ok()) {
+    return 0;
+  }
+  return op_reg_data->op_def.is_stateful();
+}
+
+void TF_InitMain(const char* usage, int* argc, char*** argv) {
+  tensorflow::port::InitMain(usage, argc, argv);
+}
+
+int TF_PickUnusedPortOrDie() {
+  return tensorflow::internal::PickUnusedPortOrDie();
+}
+
+TFE_TensorHandle* TFE_NewTensorHandleFromScalar(TF_DataType dtype_arg,
+                                                void* data, size_t len) {
+  auto dtype = static_cast<tensorflow::DataType>(dtype_arg);
+  DCHECK(tensorflow::DataTypeCanUseMemcpy(dtype));
+
+  tensorflow::Tensor tensor(dtype, tensorflow::TensorShape({}));
+  std::memcpy(tensorflow::TensorCApi::Buffer(tensor)->data(), data, len);
+  return new TFE_TensorHandle(tensor, nullptr, nullptr);
+}
diff --git a/tensorflow/c/c_api_experimental.h b/tensorflow/c/c_api_experimental.h
index 6639b0be72bdf81d0e3c806770364d7bc5082ad2..3e3a485eb763b871b0551414c4ef04746b2ed9a3 100644
--- a/tensorflow/c/c_api_experimental.h
+++ b/tensorflow/c/c_api_experimental.h
@@ -180,6 +180,25 @@ TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_DequeueVariantTensor(
 TF_CAPI_EXPORT extern void TFE_TensorHandlePrintDebugString(
     TFE_TensorHandle* handle);
 
+typedef struct TFE_ExecuteOpNotification TFE_ExecuteOpNotification;
+
+// Allows invoking a kernel asynchronously, and explicitly returns a
+// notification that can be waited upon. This always executes the kernel in a
+// new thread.
+// 1. `retvals` and `num_retvals` can only be consumed after
+// `TFE_ExecuteOp` returns successfully. They shouldn't be used
+// if the return is unsuccessful
+// 2. These new APIs cannot be used together with the TFE context level async
+// support.
+TF_CAPI_EXPORT extern TFE_ExecuteOpNotification* TFE_ExecuteOpInNewThread(
+    TFE_Op* op, TFE_TensorHandle** retvals, int* num_retvals,
+    TF_Status* status);
+
+// Waits to complete the op execution, and cleans up the notification.
+// Errors reported by op execution are set in `status`.
+TF_CAPI_EXPORT extern void TFE_ExecuteOpNotificationWaitAndDelete(
+    TFE_ExecuteOpNotification* notification, TF_Status* status);
+
 TF_CAPI_EXPORT extern void TF_MakeInternalErrorStatus(TF_Status* status,
                                                       const char* errMsg);
 
@@ -209,6 +228,24 @@ TF_CAPI_EXPORT extern void TF_AttrBuilderCheckCanRunOnDevice(
 TF_CAPI_EXPORT extern const char* TF_GetNumberAttrForOpListInput(
     const char* op_name, int input_index, TF_Status* status);
 
+// Returns 1 if the op is stateful, 0 otherwise. The return value is undefined
+// if the status is not ok.
+TF_CAPI_EXPORT extern int TF_OpIsStateful(const char* op_type,
+                                          TF_Status* status);
+
+// Platform specific initialization routine. Very few platforms actually require
+// this to be called.
+TF_CAPI_EXPORT void TF_InitMain(const char* usage, int* argc, char*** argv);
+
+// Platform-specific implementation to return an unused port. (This should used
+// in tests only.)
+TF_CAPI_EXPORT int TF_PickUnusedPortOrDie(void);
+
+// Fast path method that makes constructing a single scalar tensor require less
+// overhead and copies.
+TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_NewTensorHandleFromScalar(
+    TF_DataType dtype, void* scalar, size_t len);
+
 #ifdef __cplusplus
 } /* end extern "C" */
 #endif
diff --git a/tensorflow/c/c_api_experimental_test.cc b/tensorflow/c/c_api_experimental_test.cc
index c6effd39697e0397278770b53e98508074f99862..daa7701b7fe7e8ce757b6504329cf6434ad39778 100644
--- a/tensorflow/c/c_api_experimental_test.cc
+++ b/tensorflow/c/c_api_experimental_test.cc
@@ -15,6 +15,8 @@ limitations under the License.
 
 #include "tensorflow/c/c_api_experimental.h"
 #include "tensorflow/c/c_test_util.h"
+#include "tensorflow/c/eager/c_api.h"
+#include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
@@ -162,5 +164,137 @@ protocol: "grpc"
   TF_DeleteStatus(status);
 }
 
+TEST(CAPI_EXPERIMENTAL, IsStateful) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  int assign = TF_OpIsStateful("AssignAddVariableOp", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(assign, 1);
+  int id = TF_OpIsStateful("Identity", status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  EXPECT_EQ(id, 0);
+}
+
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Simple) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  TFE_Op* matmul_op = MatMulOp(ctx, m, m);
+
+  TFE_TensorHandle* retvals[1] = {nullptr};
+  int num_retvals = 1;
+
+  auto* r =
+      TFE_ExecuteOpInNewThread(matmul_op, &retvals[0], &num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(r, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(7, product[0]);
+  EXPECT_EQ(10, product[1]);
+  EXPECT_EQ(15, product[2]);
+  EXPECT_EQ(22, product[3]);
+
+  TFE_DeleteOp(matmul_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
+// Perform a send/recv test. Recv blocks, so they need to be executed
+// asynchronously.
+TEST(CAPI_EXPERIMENTAL, TFE_ExecuteOpInNewThreadTest_Blocking) {
+  TF_Status* status = TF_NewStatus();
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_Context* ctx = TFE_NewContext(opts, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_DeleteContextOptions(opts);
+
+  // Returns a 2x2 float32 Tensor on the CPU, with data 1., 2., 3., 4.
+  TFE_TensorHandle* m = TestMatrixTensorHandle();
+
+  // Build a send op.
+  TFE_Op* send_op = TFE_NewOp(ctx, "_Send", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(send_op, m, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  string tensor_name = "Tensor";
+  TFE_OpSetAttrType(send_op, "T", TF_FLOAT);
+  TFE_OpSetAttrString(send_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  string send_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(send_op, "send_device_incarnation", 1234);
+  string recv_device = "/job:localhost/replica:0/task:0/device:CPU:0";
+  TFE_OpSetAttrString(send_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(send_op, "client_terminated", true);
+
+  // Build a recv op.
+  TFE_Op* recv_op = TFE_NewOp(ctx, "_Recv", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TFE_OpSetAttrType(recv_op, "tensor_type", TF_FLOAT);
+  TFE_OpSetAttrString(recv_op, "tensor_name", tensor_name.c_str(),
+                      tensor_name.size());
+  TFE_OpSetAttrString(recv_op, "send_device", send_device.c_str(),
+                      send_device.size());
+  TFE_OpSetAttrInt(recv_op, "send_device_incarnation", 1234);
+  TFE_OpSetAttrString(recv_op, "recv_device", recv_device.c_str(),
+                      recv_device.size());
+  TFE_OpSetAttrBool(recv_op, "client_terminated", true);
+
+  TFE_TensorHandle* send_retvals;
+  int send_num_retvals = 0;
+  auto* send_result = TFE_ExecuteOpInNewThread(send_op, &send_retvals,
+                                               &send_num_retvals, status);
+
+  TFE_TensorHandle* recv_retvals[1] = {nullptr};
+  int recv_num_retvals = 1;
+  auto* recv_result = TFE_ExecuteOpInNewThread(recv_op, &recv_retvals[0],
+                                               &recv_num_retvals, status);
+
+  TFE_ExecuteOpNotificationWaitAndDelete(send_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_ExecuteOpNotificationWaitAndDelete(recv_result, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  TF_Tensor* t = TFE_TensorHandleResolve(recv_retvals[0], status);
+  ASSERT_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+
+  float product[4] = {0};
+  EXPECT_EQ(sizeof(product), TF_TensorByteSize(t));
+  memcpy(&product[0], TF_TensorData(t), TF_TensorByteSize(t));
+  TF_DeleteTensor(t);
+  EXPECT_EQ(1, product[0]);
+  EXPECT_EQ(2, product[1]);
+  EXPECT_EQ(3, product[2]);
+  EXPECT_EQ(4, product[3]);
+
+  TFE_DeleteOp(send_op);
+  TFE_DeleteOp(recv_op);
+  TFE_DeleteTensorHandle(m);
+
+  TFE_DeleteTensorHandle(recv_retvals[0]);
+  TFE_DeleteContext(ctx);
+  TF_DeleteStatus(status);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/c/c_api_function.cc b/tensorflow/c/c_api_function.cc
index f68f8a3e90a971b5e4a024feaf26ba498afc48da..28b9f8df9c873ee394eb6a241dd9ac06ba6c8796 100644
--- a/tensorflow/c/c_api_function.cc
+++ b/tensorflow/c/c_api_function.cc
@@ -392,26 +392,26 @@ Status ProcessInputs(
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   input_tensors->reserve(ninputs);
   for (int i = 0; i < ninputs; ++i) {
-    const Node& node = inputs[i].oper->node;
+    Node* node = &inputs[i].oper->node;
     int idx = inputs[i].index;
 
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing input ", i, " into function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while processing input ", i,
                                     " into function '", fn_name, "'");
 
-    input_tensors->emplace_back(&node, idx);
+    input_tensors->emplace_back(node, idx);
 
-    const auto& iter = input_nodes->find(&node);
+    const auto& iter = input_nodes->find(node);
     if (iter == input_nodes->end()) {
-      input_nodes->insert({&node, {idx}});
+      input_nodes->insert({node, {idx}});
     } else {
       auto& indices = iter->second;
       if (std::find(indices.begin(), indices.end(), idx) != indices.end()) {
-        return InvalidArgument("TF_Output ", node.name(), ":", idx,
+        return InvalidArgument("TF_Output ", node->name(), ":", idx,
                                " appears more than once in the input list");
       }
       indices.push_back(idx);
@@ -428,16 +428,16 @@ Status ProcessOutputs(const TF_Graph* fn_body, const char* fn_name,
     EXCLUSIVE_LOCKS_REQUIRED(fn_body->mu) {
   output_tensors->reserve(noutputs);
   for (int i = 0; i < noutputs; ++i) {
-    const Node& node = outputs[i].oper->node;
+    Node* node = &outputs[i].oper->node;
     int idx = outputs[i].index;
     TF_RETURN_WITH_CONTEXT_IF_ERROR(
-        fn_body->graph.IsValidOutputTensor(&node, idx),
+        fn_body->graph.IsValidOutputTensor(node, idx),
         "Encountered while processing output ", i, " from function '", fn_name,
         "'");
-    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(&node, idx),
+    TF_RETURN_WITH_CONTEXT_IF_ERROR(ValidateNonRefOutput(node, idx),
                                     "Encountered while creating function '",
                                     fn_name, "'");
-    output_tensors->emplace_back(&node, idx);
+    output_tensors->emplace_back(node, idx);
   }
   return Status::OK();
 }
diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD
index ba3d8533db7623b8fa7fdf35093abcd1450776b1..c34a84fcfee9b6ba9a7be86ae16e2856a2d343c7 100644
--- a/tensorflow/c/eager/BUILD
+++ b/tensorflow/c/eager/BUILD
@@ -50,6 +50,7 @@ tf_cuda_library(
         ],
         "//conditions:default": [],
     }) + [
+        "@com_google_absl//absl/memory",
         "//tensorflow/core/common_runtime/eager:eager_operation",
         "//tensorflow/core/distributed_runtime/eager:eager_client",
         "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client",
@@ -143,6 +144,7 @@ tf_cuda_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc
index 408277468d7beb23d1b2ab7f9bbccac16332e55a..027d752f420238da867cb9d8c116640e1730caaa 100755
--- a/tensorflow/c/eager/c_api.cc
+++ b/tensorflow/c/eager/c_api.cc
@@ -21,9 +21,11 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/c/c_api.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/eager/c_api_internal.h"
+#include "tensorflow/core/platform/host_info.h"
 #ifdef TENSORFLOW_EAGER_USE_XLA
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #endif  // TENSORFLOW_EAGER_USE_XLA
@@ -79,7 +81,7 @@ tensorflow::Status GetAllRemoteDevices(
     const std::vector<string>& remote_workers,
     tensorflow::WorkerCacheInterface* worker_cache,
     std::unique_ptr<tensorflow::DeviceMgr>* device_mgr) {
-  std::vector<tensorflow::Device*> remote_devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> remote_devices;
   tensorflow::Status status;
   // TODO(nareshmodi) do this in parallel instead of serially.
   for (const string& remote_worker : remote_workers) {
@@ -92,7 +94,7 @@ tensorflow::Status GetAllRemoteDevices(
           status = s;
           if (s.ok()) {
             for (tensorflow::Device* d : *devices) {
-              remote_devices.push_back(d);
+              remote_devices.emplace_back(d);
             }
           }
           n.Notify();
@@ -100,7 +102,7 @@ tensorflow::Status GetAllRemoteDevices(
     n.WaitForNotification();
   }
   std::unique_ptr<tensorflow::DeviceMgr> remote_device_mgr(
-      new tensorflow::DeviceMgr(remote_devices));
+      new tensorflow::DeviceMgr(std::move(remote_devices)));
 
   TF_RETURN_IF_ERROR(status);
 
@@ -261,13 +263,13 @@ TF_CAPI_EXPORT extern void TFE_ContextSetAsyncForThread(TFE_Context* ctx,
 void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; }
 
 TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) {
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   status->status = tensorflow::DeviceFactory::AddDevices(
       opts->session_options.options, "/job:localhost/replica:0/task:0",
       &devices);
   if (!status->status.ok()) return nullptr;
   std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+      new tensorflow::DeviceMgr(std::move(devices)));
 
   tensorflow::Rendezvous* r =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
@@ -409,6 +411,18 @@ const char* TFE_TensorHandleDeviceName(TFE_TensorHandle* h, TF_Status* status) {
                         : d->name().c_str();
 }
 
+const char* TFE_TensorHandleBackingDeviceName(TFE_TensorHandle* h,
+                                              TF_Status* status) {
+  if (h == nullptr || h->handle == nullptr) {
+    status->status = tensorflow::errors::InvalidArgument(
+        "The passed in handle is a nullptr");
+    return nullptr;
+  }
+  tensorflow::Device* d = h->handle->device();
+  return (d == nullptr) ? "/job:localhost/replica:0/task:0/device:CPU:0"
+                        : d->name().c_str();
+}
+
 TF_CAPI_EXPORT extern TFE_TensorHandle* TFE_TensorHandleCopySharingTensor(
     TFE_TensorHandle* h, TF_Status* status) {
   if (h == nullptr || h->handle == nullptr) {
@@ -458,13 +472,20 @@ TFE_Op* TFE_NewOp(TFE_Context* ctx, const char* op_or_function_name,
                   TF_Status* status) {
   const char* name = op_or_function_name;  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  status->status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status->status.ok()) return new TFE_Op(ctx, name, types);
-  if (TF_GetCode(status) == TF_NOT_FOUND) {
-    if (ctx->context.FindFunctionByName(name)) {
-      status->status = tensorflow::Status::OK();
-      return new TFE_Op(ctx, name, nullptr);
+  bool is_function = false;
+  status->status = tensorflow::AttrTypeMapForOp(name, &types, &is_function);
+  if (status->status.ok()) {
+    if (is_function && !ctx->context.FindFunctionByName(name)) {
+      status->status = tensorflow::errors::NotFound(
+          "'", name,
+          "' is neither a type of a primitive operation nor a name "
+          "of a function registered in binary running on ",
+          tensorflow::port::Hostname(),
+          ". Make sure the operation or function is "
+          "registered in the binary running in this process.");
+      return nullptr;
     }
+    return new TFE_Op(ctx, name, is_function, types);
   }
   return nullptr;
 }
@@ -497,12 +518,6 @@ void TFE_OpAddInput(TFE_Op* op, TFE_TensorHandle* h, TF_Status* status) {
 TF_AttrType TFE_OpGetAttrType(TFE_Op* op, const char* attr_name,
                               unsigned char* is_list, TF_Status* status) {
   TF_AttrType ret;
-  if (op->operation.is_function()) {
-    status->status = tensorflow::errors::Unimplemented(
-        "TODO(apassos): Support for attributes for TensorFlow functions is not "
-        "ready yet.");
-    return TF_ATTR_INT;  // The compiler requires that we return something.
-  }
   status->status = tensorflow::AttrTypeByName(*op->operation.AttrTypes(),
                                               attr_name, &ret, is_list);
   return ret;
diff --git a/tensorflow/c/eager/c_api.h b/tensorflow/c/eager/c_api.h
index b2454d872207e26feb3764671474a5d87c01f84d..f80ae5a6d02d4d613c95cf8486e0fc0aeed3affc 100755
--- a/tensorflow/c/eager/c_api.h
+++ b/tensorflow/c/eager/c_api.h
@@ -48,7 +48,7 @@ extern "C" {
 typedef struct TFE_ContextOptions TFE_ContextOptions;
 
 // Return a new options object.
-TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions();
+TF_CAPI_EXPORT extern TFE_ContextOptions* TFE_NewContextOptions(void);
 
 // Set the config in TF_ContextOptions.options.
 // config should be a serialized tensorflow.ConfigProto proto.
@@ -169,10 +169,33 @@ TF_CAPI_EXPORT extern int64_t TFE_TensorHandleNumElements(TFE_TensorHandle* h,
 TF_CAPI_EXPORT extern int64_t TFE_TensorHandleDim(TFE_TensorHandle* h,
                                                   int dim_index,
                                                   TF_Status* status);
+
+// Returns the device of the operation that produced `h`.
+// If `h` was produced by a copy, returns the destination device of
+// the copy. Note that returned device name is not always the device
+// holding the tensor handle's memory. If you want the latter, use
+// TFE_TensorHandleBackingDeviceName.
+// This function will block till the operation that produces `h` has completed.
+//
+// Device on which the kernel of the operation that produced `h` ran.
+//
+// If `h` was produced by a copy, returns the destination device of
+// the copy.
+//
+// Note that returned device name is not always the device that owns the memory
+// that backs the tensor handle. For the latter see
+// TFE_TensorHandleBackingDeviceName.
+//
 // This function will block till the operation that produces `h` has completed.
 TF_CAPI_EXPORT extern const char* TFE_TensorHandleDeviceName(
     TFE_TensorHandle* h, TF_Status* status);
 
+// Returns the name of the device in whose memory `h` resides.
+//
+// This function will block till the operation that produces `h` has completed.
+TF_CAPI_EXPORT extern const char* TFE_TensorHandleBackingDeviceName(
+    TFE_TensorHandle* h, TF_Status* status);
+
 // Return a pointer to a new TFE_TensorHandle that shares the underlying tensor
 // with `h`. On success, `status` is set to OK. On failure, `status` reflects
 // the error and a nullptr is returned.
diff --git a/tensorflow/c/eager/c_api_internal.h b/tensorflow/c/eager/c_api_internal.h
index fa1b22e3af487b19b8b7885b7c3740b6249c73eb..67bc1bcd24605f8363d6a7c8d5d6a0836a42fc82 100644
--- a/tensorflow/c/eager/c_api_internal.h
+++ b/tensorflow/c/eager/c_api_internal.h
@@ -93,10 +93,9 @@ struct TFE_TensorDebugInfo {
 };
 
 struct TFE_Op {
-  // t is NULL iff the TFE_Op corresponds to a TensorFlow function instead of a
-  // primitive operation.
-  TFE_Op(TFE_Context* ctx, const char* op, const tensorflow::AttrTypeMap* t)
-      : operation(&ctx->context, op, t) {}
+  TFE_Op(TFE_Context* ctx, const char* op, bool is_function,
+         const tensorflow::AttrTypeMap* t)
+      : operation(&ctx->context, op, is_function, t) {}
 
   tensorflow::EagerOperation operation;
 };
diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc
index 55331022b9dbd0696928fa44430f340f371432ac..6b39b79ee82f9c7baaf856e573a42b7da65691e5 100644
--- a/tensorflow/c/eager/c_api_test.cc
+++ b/tensorflow/c/eager/c_api_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/c/eager/c_api.h"
 
 #include <string.h>
+#include "absl/strings/match.h"
 #include "tensorflow/c/eager/c_api_test_util.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_server_lib.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -589,9 +590,22 @@ void TensorHandleCopyBetweenTwoGPUDevices(bool async) {
   TF_DeviceList* devices = TFE_ContextListDevices(ctx, status.get());
   ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
   const int num_devices = TF_DeviceListCount(devices);
+  bool has_gpu0 = false;
+  bool has_gpu1 = false;
+  for (int i = 0; i < num_devices; ++i) {
+    const char* dev = TF_DeviceListName(devices, i, status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    string device_name(dev);
+    if (device_name.find("GPU:0") != string::npos) {
+      has_gpu0 = true;
+    }
+    if (device_name.find("GPU:1") != string::npos) {
+      has_gpu1 = true;
+    }
+  }
 
   const char* kCPUDevice = "CPU:0";
-  if (num_devices < 3) {
+  if (!has_gpu0 || !has_gpu1) {
     TF_DeleteDeviceList(devices);
     TF_DeleteTensor(t);
     TFE_DeleteTensorHandle(hcpu);
@@ -781,6 +795,14 @@ TEST(CAPI, TensorHandleNullptr) {
 
   TF_SetStatus(status.get(), TF_OK, "");
 
+  device_name = TFE_TensorHandleBackingDeviceName(h, status.get());
+  ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
+  ASSERT_EQ(device_name, nullptr);
+  ASSERT_EQ("The passed in handle is a nullptr",
+            string(TF_Message(status.get())));
+
+  TF_SetStatus(status.get(), TF_OK, "");
+
   int num_dims = TFE_TensorHandleNumDims(h, status.get());
   ASSERT_EQ(TF_INVALID_ARGUMENT, TF_GetCode(status.get()));
   ASSERT_EQ(num_dims, -1);
@@ -796,6 +818,62 @@ TEST(CAPI, TensorHandleNullptr) {
             string(TF_Message(status.get())));
 }
 
+TEST(CAPI, TensorHandleDevices) {
+  std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)> status(
+      TF_NewStatus(), TF_DeleteStatus);
+  TFE_ContextOptions* opts = TFE_NewContextOptions();
+  TFE_Context* ctx = TFE_NewContext(opts, status.get());
+  TFE_DeleteContextOptions(opts);
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+
+  TFE_TensorHandle* hcpu = TestMatrixTensorHandle();
+  const char* device_name = TFE_TensorHandleDeviceName(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(device_name, "CPU:0")) << device_name;
+  const char* backing_device_name =
+      TFE_TensorHandleBackingDeviceName(hcpu, status.get());
+  ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0"))
+      << backing_device_name;
+
+  // Disable the test if no GPU is present.
+  string gpu_device_name;
+  if (GetDeviceName(ctx, &gpu_device_name, "GPU")) {
+    TFE_TensorHandle* hgpu = TFE_TensorHandleCopyToDevice(
+        hcpu, ctx, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    TFE_Op* shape_op = ShapeOp(ctx, hgpu);
+    TFE_OpSetDevice(shape_op, gpu_device_name.c_str(), status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+    TFE_TensorHandle* retvals[1];
+    int num_retvals = 1;
+    TFE_Execute(shape_op, &retvals[0], &num_retvals, status.get());
+    ASSERT_TRUE(TF_GetCode(status.get()) == TF_OK) << TF_Message(status.get());
+
+    // .device of shape is GPU since the op is executed on GPU
+    device_name = TFE_TensorHandleDeviceName(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_TRUE(absl::StrContains(device_name, "GPU:0")) << device_name;
+
+    // .backing_device of shape is CPU since the tensor is backed by CPU
+    backing_device_name =
+        TFE_TensorHandleBackingDeviceName(retvals[0], status.get());
+    ASSERT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+    ASSERT_TRUE(absl::StrContains(backing_device_name, "CPU:0"))
+        << backing_device_name;
+
+    TFE_DeleteOp(shape_op);
+    TFE_DeleteTensorHandle(retvals[0]);
+    TFE_DeleteTensorHandle(hgpu);
+  }
+
+  TFE_DeleteTensorHandle(hcpu);
+  TFE_ContextAsyncWait(ctx, status.get());
+  EXPECT_EQ(TF_OK, TF_GetCode(status.get())) << TF_Message(status.get());
+  TFE_DeleteContext(ctx);
+}
+
 void Execute_MatMul_CPU(bool async) {
   TF_Status* status = TF_NewStatus();
   TFE_ContextOptions* opts = TFE_NewContextOptions();
diff --git a/tensorflow/c/eager/c_api_test_util.cc b/tensorflow/c/eager/c_api_test_util.cc
index 008f088c2dcdd7d9114103516a4702e47a55c6de..bd38127d50c171af801dd1b937acefdba491b4a6 100644
--- a/tensorflow/c/eager/c_api_test_util.cc
+++ b/tensorflow/c/eager/c_api_test_util.cc
@@ -104,6 +104,19 @@ TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b) {
   return op;
 }
 
+TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a) {
+  TF_Status* status = TF_NewStatus();
+
+  TFE_Op* op = TFE_NewOp(ctx, "Shape", status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TFE_OpAddInput(op, a, status);
+  CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status);
+  TF_DeleteStatus(status);
+  TFE_OpSetAttrType(op, "T", TFE_TensorHandleDataType(a));
+
+  return op;
+}
+
 TFE_TensorHandle* TestAxisTensorHandle() {
   int64_t dims[] = {1};
   int data[] = {1};
diff --git a/tensorflow/c/eager/c_api_test_util.h b/tensorflow/c/eager/c_api_test_util.h
index 474cae67c89249af3a62707f0db00ba458ca8f31..75ef9459e93b4f2ed471c423a34565594efc1714 100644
--- a/tensorflow/c/eager/c_api_test_util.h
+++ b/tensorflow/c/eager/c_api_test_util.h
@@ -37,6 +37,9 @@ TFE_TensorHandle* TestMatrixTensorHandle3X2();
 // Return a matmul op multiplying `a` by `b`.
 TFE_Op* MatMulOp(TFE_Context* ctx, TFE_TensorHandle* a, TFE_TensorHandle* b);
 
+// Return a shape op fetching the shape of `a`.
+TFE_Op* ShapeOp(TFE_Context* ctx, TFE_TensorHandle* a);
+
 // Return an 1-D INT32 tensor containing a single value 1.
 TFE_TensorHandle* TestAxisTensorHandle();
 
diff --git a/tensorflow/c/eager/tape.h b/tensorflow/c/eager/tape.h
index 5ba55a203ff70cc64c07e96b5a869a1f11c9334e..5c11f51e8749de84547ae873f5f55ebd42bc4b3d 100644
--- a/tensorflow/c/eager/tape.h
+++ b/tensorflow/c/eager/tape.h
@@ -141,8 +141,9 @@ class GradientTape {
   // null. The result is populated with one tensor per target element.
   Status ComputeGradient(
       const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-      gtl::ArraySlice<int64> target_tensor_ids,
-      gtl::ArraySlice<int64> source_tensor_id,
+      const gtl::ArraySlice<int64> target_tensor_ids,
+      const gtl::ArraySlice<int64> source_tensor_ids,
+      const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
       gtl::ArraySlice<Gradient*> output_gradients,
       std::vector<Gradient*>* result);
 
@@ -396,6 +397,7 @@ template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status InitialGradients(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
     gtl::ArraySlice<int64> target_tensor_ids,
+    gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients, const TensorTape& tensor_tape,
     const OpTape<BackwardFunction, TapeTensor>& op_tape,
     gtl::FlatMap<int64, std::vector<Gradient*>>* result) {
@@ -425,8 +427,13 @@ Status InitialGradients(
               "none of operations outputs match expected tensor");
         }
       } else {
-        // No record of the target tensor found on the tape, so no gradient
-        // needs to be computed from it. Do nothing.
+        // This target tensor was not generated by any operation recorded on
+        // the tape, so no gradient needs to be computed from it unless this
+        // target is also a source.
+        auto source_tensor = sources_that_are_targets.find(id);
+        if (source_tensor != sources_that_are_targets.end()) {
+          (*result)[id].push_back(vspace.Ones(source_tensor->second));
+        }
       }
     } else {
       (*result)[id].push_back(output_gradients[i]);
@@ -467,8 +474,9 @@ constexpr int kMinAggregateBytes = 128 * 1024 * 1024;
 template <typename Gradient, typename BackwardFunction, typename TapeTensor>
 Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
     const VSpace<Gradient, BackwardFunction, TapeTensor>& vspace,
-    gtl::ArraySlice<int64> target_tensor_ids,
-    gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::ArraySlice<int64> target_tensor_ids,
+    const gtl::ArraySlice<int64> source_tensor_ids,
+    const gtl::FlatMap<int64, TapeTensor> sources_that_are_targets,
     gtl::ArraySlice<Gradient*> output_gradients,
     std::vector<Gradient*>* result) {
   gtl::FlatSet<int64> sources_set(source_tensor_ids.begin(),
@@ -478,7 +486,8 @@ Status GradientTape<Gradient, BackwardFunction, TapeTensor>::ComputeGradient(
   std::vector<int64> op_stack =
       InitialStack(state.op_tape, state.op_missing_tensor);
   gtl::FlatMap<int64, std::vector<Gradient*>> gradients;
-  Status s = InitialGradients(vspace, target_tensor_ids, output_gradients,
+  Status s = InitialGradients(vspace, target_tensor_ids,
+                              sources_that_are_targets, output_gradients,
                               tensor_tape_, state.op_tape, &gradients);
   auto cleanup = [this, &state]() {
     if (!persistent_) {
diff --git a/tensorflow/c/env.cc b/tensorflow/c/env.cc
new file mode 100644
index 0000000000000000000000000000000000000000..07b9e8b940c55caf62ae0b81b884bf313d335459
--- /dev/null
+++ b/tensorflow/c/env.cc
@@ -0,0 +1,161 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/types.h"
+
+struct TF_StringStream {
+  std::vector<::tensorflow::string>* list;
+  size_t position;
+};
+
+void TF_CreateDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->CreateDir(dirname));
+}
+
+void TF_DeleteDir(const char* dirname, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteDir(dirname));
+}
+
+void TF_DeleteRecursively(const char* dirname, int64_t* undeleted_file_count,
+                          int64_t* undeleted_dir_count, TF_Status* status) {
+  ::tensorflow::int64 f, d;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteRecursively(dirname, &f, &d));
+  *undeleted_file_count = f;
+  *undeleted_dir_count = d;
+}
+
+void TF_FileStat(const char* filename, TF_FileStatistics* stats,
+                 TF_Status* status) {
+  ::tensorflow::FileStatistics cc_stats;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->Stat(filename, &cc_stats);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    stats->length = cc_stats.length;
+    stats->mtime_nsec = cc_stats.mtime_nsec;
+    stats->is_directory = cc_stats.is_directory;
+  }
+}
+
+void TF_NewWritableFile(const char* filename, TF_WritableFileHandle** handle,
+                        TF_Status* status) {
+  std::unique_ptr<::tensorflow::WritableFile> f;
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Status s =
+      ::tensorflow::Env::Default()->NewWritableFile(filename, &f);
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+
+  if (s.ok()) {
+    *handle = reinterpret_cast<TF_WritableFileHandle*>(f.release());
+  }
+}
+
+void TF_CloseWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Close());
+  delete cc_file;
+}
+
+void TF_SyncWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Sync());
+}
+
+void TF_FlushWritableFile(TF_WritableFileHandle* handle, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, cc_file->Flush());
+}
+
+void TF_AppendWritableFile(TF_WritableFileHandle* handle, const char* data,
+                           size_t length, TF_Status* status) {
+  auto* cc_file = reinterpret_cast<::tensorflow::WritableFile*>(handle);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, cc_file->Append(::tensorflow::StringPiece{data, length}));
+}
+
+void TF_DeleteFile(const char* filename, TF_Status* status) {
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->DeleteFile(filename));
+}
+
+bool TF_StringStreamNext(TF_StringStream* list, const char** result) {
+  if (list->position >= list->list->size()) {
+    *result = nullptr;
+    return false;
+  }
+
+  *result = list->list->at(list->position++).c_str();
+  return true;
+}
+
+void TF_StringStreamDone(TF_StringStream* list) {
+  delete list->list;
+  delete list;
+}
+TF_StringStream* TF_GetChildren(const char* dirname, TF_Status* status) {
+  auto* children = new std::vector<::tensorflow::string>;
+
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(
+      status, ::tensorflow::Env::Default()->GetChildren(dirname, children));
+
+  auto* list = new TF_StringStream;
+  list->list = children;
+  list->position = 0;
+  return list;
+}
+
+TF_StringStream* TF_GetLocalTempDirectories() {
+  auto* tmpdirs = new std::vector<::tensorflow::string>;
+
+  ::tensorflow::Env::Default()->GetLocalTempDirectories(tmpdirs);
+
+  auto* list = new TF_StringStream;
+  list->list = tmpdirs;
+  list->position = 0;
+  return list;
+}
+
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void) {
+  return ::tensorflow::Env::Default()->NowNanos();
+}
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void) {
+  return ::tensorflow::Env::Default()->NowMicros();
+}
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void) {
+  return ::tensorflow::Env::Default()->NowSeconds();
+}
diff --git a/tensorflow/c/env.h b/tensorflow/c/env.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d27c5da37735042c7476b591e57486dbde33152
--- /dev/null
+++ b/tensorflow/c/env.h
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_ENV_H_
+#define TENSORFLOW_C_ENV_H_
+
+#include "tensorflow/c/c_api.h"
+
+// --------------------------------------------------------------------------
+// C API for tensorflow::Env.
+
+struct TF_WritableFileHandle;
+struct TF_StringStream;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct TF_FileStatistics {
+  // The length of the file in bytes.
+  int64_t length;
+  // The last modified time in nanoseconds.
+  int64_t mtime_nsec;
+  // Whether the name refers to a directory.
+  bool is_directory;
+} TF_FileStatistics;
+
+// Creates the specified directory. Typical status code are:
+//  * TF_OK - successfully created the directory
+//  * TF_ALREADY_EXISTS - directory already exists
+//  * TF_PERMISSION_DENIED - dirname is not writable
+TF_CAPI_EXPORT extern void TF_CreateDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory. Typical status codes are:
+//  * TF_OK - successfully deleted the directory
+//  * TF_FAILED_PRECONDITION - the directory is not empty
+TF_CAPI_EXPORT extern void TF_DeleteDir(const char* dirname, TF_Status* status);
+
+// Deletes the specified directory and all subdirectories and files underneath
+// it. This is accomplished by traversing the directory tree rooted at dirname
+// and deleting entries as they are encountered.
+//
+// If dirname itself is not readable or does not exist, *undeleted_dir_count is
+// set to 1, *undeleted_file_count is set to 0 and an appropriate status (e.g.
+// TF_NOT_FOUND) is returned.
+//
+// If dirname and all its descendants were successfully deleted, TF_OK is
+// returned and both error counters are set to zero.
+//
+// Otherwise, while traversing the tree, undeleted_file_count and
+// undeleted_dir_count are updated if an entry of the corresponding type could
+// not be deleted. The returned error status represents the reason that any one
+// of these entries could not be deleted.
+//
+// Typical status codes:
+//  * TF_OK - dirname exists and we were able to delete everything underneath
+//  * TF_NOT_FOUND - dirname doesn't exist
+//  * TF_PERMISSION_DENIED - dirname or some descendant is not writable
+//  * TF_UNIMPLEMENTED - some underlying functions (like Delete) are not
+//    implemented
+TF_CAPI_EXPORT extern void TF_DeleteRecursively(const char* dirname,
+                                                int64_t* undeleted_file_count,
+                                                int64_t* undeleted_dir_count,
+                                                TF_Status* status);
+
+// Obtains statistics for the given path. If status is TF_OK, *stats is
+// updated, otherwise it is not touched.
+TF_CAPI_EXPORT extern void TF_FileStat(const char* filename,
+                                       TF_FileStatistics* stats,
+                                       TF_Status* status);
+
+// Creates or truncates the given filename and returns a handle to be used for
+// appending data to the file. If status is TF_OK, *handle is updated and the
+// caller is responsible for freeing it (see TF_CloseWritableFile).
+TF_CAPI_EXPORT extern void TF_NewWritableFile(const char* filename,
+                                              TF_WritableFileHandle** handle,
+                                              TF_Status* status);
+
+// Closes the given handle and frees its memory. If there was a problem closing
+// the file, it is indicated by status. Memory is freed in any case.
+TF_CAPI_EXPORT extern void TF_CloseWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Syncs content of the handle to the filesystem. Blocks waiting for the
+// filesystem to indicate that the content has been persisted.
+TF_CAPI_EXPORT extern void TF_SyncWritableFile(TF_WritableFileHandle* handle,
+                                               TF_Status* status);
+
+// Flush local buffers to the filesystem. If the process terminates after a
+// successful flush, the contents may still be persisted, since the underlying
+// filesystem may eventually flush the contents.  If the OS or machine crashes
+// after a successful flush, the contents may or may not be persisted, depending
+// on the implementation.
+TF_CAPI_EXPORT extern void TF_FlushWritableFile(TF_WritableFileHandle* handle,
+                                                TF_Status* status);
+
+// Appends the given bytes to the file. Any failure to do so is indicated in
+// status.
+TF_CAPI_EXPORT extern void TF_AppendWritableFile(TF_WritableFileHandle* handle,
+                                                 const char* data,
+                                                 size_t length,
+                                                 TF_Status* status);
+
+// Deletes the named file and indicates whether successful in *status.
+TF_CAPI_EXPORT extern void TF_DeleteFile(const char* filename,
+                                         TF_Status* status);
+
+// Retrieves the next item from the given TF_StringStream and places a pointer
+// to it in *result. If no more items are in the list, *result is set to NULL
+// and false is returned.
+//
+// Ownership of the items retrieved with this function remains with the library.
+// Item points are invalidated after a call to TF_StringStreamDone.
+TF_CAPI_EXPORT extern bool TF_StringStreamNext(TF_StringStream* list,
+                                               const char** result);
+
+// Frees the resources associated with given string list. All pointers returned
+// by TF_StringStreamNext are invalid after this call.
+TF_CAPI_EXPORT extern void TF_StringStreamDone(TF_StringStream* list);
+
+// Retrieves the list of children of the given directory. You can iterate
+// through the list with TF_StringStreamNext. The caller is responsible for
+// freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetChildren(const char* filename,
+                                                      TF_Status* status);
+
+// Retrieves a list of directory names on the local machine that may be used for
+// temporary storage. You can iterate through the list with TF_StringStreamNext.
+// The caller is responsible for freeing the list (see TF_StringStreamDone).
+TF_CAPI_EXPORT extern TF_StringStream* TF_GetLocalTempDirectories(void);
+
+// Returns the number of nanoseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowNanos(void);
+
+// Returns the number of microseconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowMicros(void);
+
+// Returns the number of seconds since the Unix epoch.
+TF_CAPI_EXPORT extern uint64_t TF_NowSeconds(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // TENSORFLOW_C_ENV_H_
diff --git a/tensorflow/c/env_test.cc b/tensorflow/c/env_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e2206c6befd2167346c64032940d6e8c631e4a3e
--- /dev/null
+++ b/tensorflow/c/env_test.cc
@@ -0,0 +1,100 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/env.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/types.h"
+
+#define ASSERT_TF_OK(x) ASSERT_EQ(TF_OK, TF_GetCode(x))
+
+TEST(TestEnv, TestDirHandling) {
+  TF_StringStream* tempdirs = TF_GetLocalTempDirectories();
+  const char* tempdir;
+  bool found = false;
+  while (TF_StringStreamNext(tempdirs, &tempdir)) {
+    found = true;
+
+    TF_Status* s = TF_NewStatus();
+
+    ::tensorflow::string dirpath =
+        ::tensorflow::io::JoinPath(tempdir, "somedir");
+    TF_CreateDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_CreateDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    ::tensorflow::string filepath =
+        ::tensorflow::io::JoinPath(dirpath, "somefile.txt");
+    TF_WritableFileHandle* handle;
+    TF_NewWritableFile(filepath.c_str(), &handle, s);
+    ASSERT_TF_OK(s) << "NewWritableFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    const char* data = "Hello, world!\n";
+    TF_AppendWritableFile(handle, data, strlen(data), s);
+    ASSERT_TF_OK(s) << "TF_AppendWritableFile failed to append data to file at "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_CloseWritableFile(handle, s);
+    ASSERT_TF_OK(s) << "TF_CloseWritableFile failed to close handle to "
+                    << filepath << ": " << TF_Message(s);
+
+    TF_StringStream* children = TF_GetChildren(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_GetChildren failed for " << dirpath;
+    const char* childpath;
+    ASSERT_TRUE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(::tensorflow::string(childpath), "somefile.txt");
+    // There should only be one file in this directory.
+    ASSERT_FALSE(TF_StringStreamNext(children, &childpath));
+    ASSERT_EQ(childpath, nullptr);
+    TF_StringStreamDone(children);
+
+    TF_FileStatistics stats;
+    TF_FileStat(filepath.c_str(), &stats, s);
+    ASSERT_EQ(stats.length, strlen(data));
+    ASSERT_FALSE(stats.is_directory);
+    ASSERT_GT(stats.mtime_nsec, 0);
+
+    // Trying to delete a non-empty directory should fail.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_NE(TF_OK, TF_GetCode(s))
+        << "TF_DeleteDir unexpectedly succeeded with a non-empty directory "
+        << dirpath;
+
+    TF_DeleteFile(filepath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteFile failed for " << filepath << ": "
+                    << TF_Message(s);
+
+    // Now deleting the directory should work.
+    TF_DeleteDir(dirpath.c_str(), s);
+    ASSERT_TF_OK(s) << "TF_DeleteDir failed for " << dirpath << ": "
+                    << TF_Message(s);
+
+    TF_DeleteStatus(s);
+    break;
+  }
+
+  ASSERT_TRUE(found) << "expected at least one temp dir";
+
+  TF_StringStreamDone(tempdirs);
+}
+
+TEST(TestEnv, TestTimeFunctions) {
+  ASSERT_GE(TF_NowSeconds(), 946684800);  // Midnight Jan 1, 2000
+  ASSERT_GE(TF_NowMicros(), 946684800 * 1e6);
+  ASSERT_GE(TF_NowNanos(), 946684800 * 1e9);
+}
diff --git a/tensorflow/c/kernels.cc b/tensorflow/c/kernels.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2a4eaecb6cf2740a522b1e849d1306ebde6c4577
--- /dev/null
+++ b/tensorflow/c/kernels.cc
@@ -0,0 +1,160 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/c/c_api_internal.h"
+#include "tensorflow/c/kernels.h"
+#include "tensorflow/c/tf_status_helper.h"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+
+// This file forms the basis of a stable ABI for third-party kernel
+// implementations. It is crucial that changes to this file are made cautiously
+// and with a focus on maintaining both source and binary compatibility.
+
+struct TF_KernelBuilder {
+  ::tensorflow::KernelDefBuilder* cc_builder;
+
+  void* (*create_function)(TF_OpKernelConstruction*);
+  void (*compute_function)(void*, TF_OpKernelContext*);
+  void (*delete_function)(void*);
+};
+
+TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*)) {
+  TF_KernelBuilder* result = new TF_KernelBuilder;
+  result->cc_builder = new ::tensorflow::KernelDefBuilder(op_name);
+  result->cc_builder->Device(device_name);
+  result->create_function = create_func;
+  result->compute_function = compute_func;
+  result->delete_function = delete_func;
+  return result;
+}
+
+void TF_DeleteKernelBuilder(TF_KernelBuilder* builder) {
+  DCHECK_NE(builder, nullptr);
+  delete builder->cc_builder;
+  delete builder;
+}
+
+namespace tensorflow {
+namespace {
+
+// An OpKernel whose methods delegate to C function pointers.
+class COpKernel : public OpKernel {
+ public:
+  explicit COpKernel(OpKernelConstruction* ctx,
+                     void* (*create_func)(TF_OpKernelConstruction*),
+                     void (*compute_func)(void*, TF_OpKernelContext*),
+                     void (*delete_func)(void*))
+      : OpKernel(ctx), compute_func_(compute_func), delete_func_(delete_func) {
+    if (create_func != nullptr) {
+      c_kernel_ =
+          (*create_func)(reinterpret_cast<TF_OpKernelConstruction*>(ctx));
+    } else {
+      c_kernel_ = nullptr;
+    }
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    (*compute_func_)(c_kernel_, reinterpret_cast<TF_OpKernelContext*>(ctx));
+  }
+
+  ~COpKernel() override {
+    if (delete_func_ != nullptr) {
+      (*delete_func_)(c_kernel_);
+    }
+  }
+
+ private:
+  void (*compute_func_)(void*, TF_OpKernelContext* context);
+  void (*delete_func_)(void*);
+  void* c_kernel_;
+};
+
+// A KernelFactory that returns COpKernel instances.
+class KernelBuilderFactory
+    : public ::tensorflow::kernel_factory::OpKernelFactory {
+ public:
+  explicit KernelBuilderFactory(TF_KernelBuilder* builder)
+      : builder_(builder) {}
+  ::tensorflow::OpKernel* Create(
+      ::tensorflow::OpKernelConstruction* context) override {
+    return new ::tensorflow::COpKernel(context, builder_->create_function,
+                                       builder_->compute_function,
+                                       builder_->delete_function);
+  }
+  ~KernelBuilderFactory() override { TF_DeleteKernelBuilder(builder_); }
+
+ private:
+  TF_KernelBuilder* builder_;
+};
+}  // namespace
+}  // namespace tensorflow
+
+void TF_RegisterKernelBuilder(const char* name, TF_KernelBuilder* builder,
+                              TF_Status* status) {
+  using tensorflow::register_kernel::Name;
+
+  tensorflow::kernel_factory::OpKernelRegistrar(
+      builder->cc_builder->Build(), name,
+      absl::make_unique<tensorflow::KernelBuilderFactory>(builder));
+
+  TF_SetStatus(status, TF_OK, "");
+}
+
+int TF_NumInputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_inputs();
+}
+
+int TF_NumOutputs(TF_OpKernelContext* ctx) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  return cc_ctx->num_outputs();
+}
+
+void TF_GetInput(TF_OpKernelContext* ctx, int i, TF_Tensor** tensor,
+                 TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  const ::tensorflow::Tensor& cc_tensor(cc_ctx->input(i));
+  TF_Tensor* result = ::tensorflow::TF_TensorFromTensor(cc_tensor, status);
+  if (TF_GetCode(status) == TF_OK) {
+    *tensor = result;
+  }
+}
+
+void TF_SetOutput(TF_OpKernelContext* ctx, int i, const TF_Tensor* tensor,
+                  TF_Status* status) {
+  auto* cc_ctx = reinterpret_cast<::tensorflow::OpKernelContext*>(ctx);
+  if (i < 0 || i >= cc_ctx->num_inputs()) {
+    TF_SetStatus(status, TF_OUT_OF_RANGE, "input index out of range");
+    return;
+  }
+  ::tensorflow::Tensor cc_tensor;
+  ::tensorflow::Status s = ::tensorflow::TF_TensorToTensor(tensor, &cc_tensor);
+  TF_SetStatus(status, TF_OK, "");
+  ::tensorflow::Set_TF_Status_from_Status(status, s);
+  if (s.ok()) {
+    cc_ctx->set_output(i, cc_tensor);
+  }
+}
diff --git a/tensorflow/c/kernels.h b/tensorflow/c/kernels.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a91aa184f11ac8e45b38a1d106c7b445747a7c1
--- /dev/null
+++ b/tensorflow/c/kernels.h
@@ -0,0 +1,118 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_C_KERNELS_H_
+#define TENSORFLOW_C_KERNELS_H_
+
+#include "tensorflow/c/c_api.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// --------------------------------------------------------------------------
+// C API for TensorFlow Kernels.
+//
+// This API allows developers to register custom kernel implementations for
+// TensorFlow.
+//
+// See c_api.h header comments for a discussion about API conventions.
+//
+// Users wishing to extend TensorFlow with new kernels will call
+// `TF_NewKernelBuilder`. The resulting kernel builder can be registered with
+// `TF_RegisterKernelBuilder`, which will allow TF to construct user-provided
+// kernels when necessary.
+
+struct TF_KernelBuilder;
+struct TF_OpKernelConstruction;
+struct TF_OpKernelContext;
+
+// Allocates a new kernel builder and returns a pointer to it.
+//
+// If non-null, TensorFlow will call create_func when it needs to instantiate
+// the kernel. The pointer returned by create_func will be passed to
+// compute_func and delete_func, thereby functioning as a "this" pointer for
+// referring to kernel instances.
+//
+// The TF_OpKernelConstruction pointer passed to create_func is owned by
+// TensorFlow and will be deleted once create_func returns. It must not be used
+// after this.
+//
+// When TensorFlow needs to perform a computation with this kernel, it will
+// call compute_func. This function will receive the pointer returned by
+// create_func (or null if no create_func was provided), along with the inputs
+// to the computation.
+//
+// The TF_OpKernelContext pointer received by compute_func is owned by
+// TensorFlow and will be deleted once compute_func returns. It must not be used
+// after this.
+//
+// Finally, when TensorFlow no longer needs the kernel, it will call
+// delete_func if one is provided. This function will receive the pointer
+// returned in `create_func` or nullptr if no `create_func` was provided.
+//
+// The caller should pass the result of this function to
+// TF_RegisterKernelBuilder, which will take ownership of the pointer. If, for
+// some reason, the kernel builder will not be registered, the caller should
+// delete it with TF_DeleteKernelBuilder.
+TF_CAPI_EXPORT extern TF_KernelBuilder* TF_NewKernelBuilder(
+    const char* op_name, const char* device_name,
+    void* (*create_func)(TF_OpKernelConstruction*),
+    void (*compute_func)(void*, TF_OpKernelContext*),
+    void (*delete_func)(void*));
+
+// Register the given kernel builder with the TensorFlow runtime. If
+// registration fails, the given status will be populated.
+//
+// This call takes ownership of the `builder` pointer.
+TF_CAPI_EXPORT extern void TF_RegisterKernelBuilder(const char* kernel_name,
+                                                    TF_KernelBuilder* builder,
+                                                    TF_Status* status);
+
+// Deletes the given TF_KernelBuilder. This should be called only if the kernel
+// builder is not registered with TensorFlow via TF_RegisterKernelBuilder.
+TF_CAPI_EXPORT extern void TF_DeleteKernelBuilder(TF_KernelBuilder* builder);
+
+// --------------------------------------------------------------------------
+// OpKernelContext routines
+
+// TF_NumInputs returns the number of inputs available in ctx.
+TF_CAPI_EXPORT extern int TF_NumInputs(TF_OpKernelContext* ctx);
+
+// TF_NumOutputs returns the number of outputs to be placed in *ctx by the
+// kernel.
+TF_CAPI_EXPORT extern int TF_NumOutputs(TF_OpKernelContext* ctx);
+
+// Retrieves the ith input from ctx. If TF_GetCode(status) is TF_OK, *tensor is
+// populated and its ownership is passed to the caller. In any other case,
+// *tensor is not modified.
+//
+// If i < 0 or i >= TF_NumInputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_GetInput(TF_OpKernelContext* ctx, int i,
+                                       TF_Tensor** tensor, TF_Status* status);
+
+// Sets the ith output of ctx to tensor. If TF_GetCode(status) is anything but
+// TF_OK, ctx is left unmodified.
+//
+// If i < 0 or i >= TF_NumOutputs(ctx), *status is set to TF_OUT_OF_RANGE.
+TF_CAPI_EXPORT extern void TF_SetOutput(TF_OpKernelContext* ctx, int i,
+                                        const TF_Tensor* tensor,
+                                        TF_Status* status);
+
+#ifdef __cplusplus
+} /* end extern "C" */
+#endif
+
+#endif  // TENSORFLOW_C_KERNELS_H_
diff --git a/tensorflow/c/kernels_test.cc b/tensorflow/c/kernels_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e659ee3c3d258a626ccf03a782ec031b5a703a48
--- /dev/null
+++ b/tensorflow/c/kernels_test.cc
@@ -0,0 +1,203 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/c/kernels.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/core/framework/kernel_def.pb.h"
+#include "tensorflow/core/framework/node_def.pb_text.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+struct MyCustomKernel {
+  bool created;
+  bool compute_called;
+};
+
+static bool delete_called = false;
+
+static void* MyCreateFunc(TF_OpKernelConstruction* ctx) {
+  struct MyCustomKernel* s = new struct MyCustomKernel;
+  s->created = true;
+  s->compute_called = false;
+  return s;
+}
+
+static void MyComputeFunc(void* kernel, TF_OpKernelContext* ctx) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  s->compute_called = true;
+}
+
+static void MyDeleteFunc(void* kernel) {
+  struct MyCustomKernel* s = static_cast<struct MyCustomKernel*>(kernel);
+  EXPECT_TRUE(s->created);
+  EXPECT_TRUE(s->compute_called);
+  delete_called = true;
+  delete s;
+}
+
+namespace tensorflow {
+
+static std::unique_ptr<OpKernel> GetFakeKernel(const char* device_name,
+                                               const char* op_name,
+                                               Status* status) {
+  NodeDef def;
+  def.set_op(op_name);
+  def.set_device(device_name);
+  def.add_input("input1");
+  def.add_input("input2");
+  return CreateOpKernel(DeviceType(device_name), nullptr, nullptr, def, 1,
+                        status);
+}
+
+// Tests registration of a single C kernel and checks that calls through the
+// C/C++ boundary are being made.
+TEST(TestKernel, TestRegisterKernelBuilder) {
+  const char* kernel_name = "SomeKernelName";
+  const char* op_name = "FooOp";
+  const char* device_name = "FakeDeviceName1";
+
+  REGISTER_OP(op_name)
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(
+      op_name, device_name, &MyCreateFunc, &MyComputeFunc, &MyDeleteFunc);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_Buffer* buf = TF_GetRegisteredKernelsForOp(op_name, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    KernelList list;
+    list.ParseFromArray(buf->data, buf->length);
+    ASSERT_EQ(1, list.kernel_size());
+    ASSERT_EQ(device_name, list.kernel(0).device_type());
+    TF_DeleteBuffer(buf);
+    TF_DeleteStatus(status);
+  }
+
+  {
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+    kernel->Compute(nullptr);
+  }
+
+  ASSERT_TRUE(delete_called);
+}
+
+class DummyDevice : public DeviceBase {
+ public:
+  DummyDevice(Env* env, bool save) : DeviceBase(env), save_(save) {}
+  bool RequiresRecordingAccessedTensors() const override { return save_; }
+  Allocator* GetAllocator(AllocatorAttributes /*attr*/) override {
+    return cpu_allocator();
+  }
+
+ private:
+  bool save_;
+};
+
+TEST(TestKernel, TestInputAndOutputCount) {
+  const char* kernel_name = "InputOutputCounterKernel";
+  const char* op_name = "BarOp";
+  const char* device_name = "FakeDeviceName2";
+
+  REGISTER_OP(op_name)
+      .Input("input1: double")
+      .Input("input2: uint8")
+      .Output("output1: uint8");
+
+  static int num_inputs = 0;
+  static int num_outputs = 0;
+
+  // A kernel whose Compute function has a side-effect of updating num_inputs
+  // and num_outputs. Various functions on TF_OpKernelContext are also
+  // exercised.
+  auto my_compute_func = [](void* kernel, TF_OpKernelContext* ctx) {
+    num_inputs = TF_NumInputs(ctx);
+    num_outputs = TF_NumOutputs(ctx);
+
+    TF_Tensor* input = nullptr;
+    TF_Status* s = TF_NewStatus();
+    TF_GetInput(ctx, 0, &input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s)) << "Failed to get input: " << TF_Message(s);
+    EXPECT_EQ(123, *static_cast<tensorflow::uint8*>(TF_TensorData(input)));
+    TF_GetInput(ctx, -1, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+    TF_GetInput(ctx, 3, &input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    // Copy the input tensor to output.
+    TF_SetOutput(ctx, 0, input, s);
+    EXPECT_EQ(TF_OK, TF_GetCode(s));
+
+    TF_SetOutput(ctx, 24, input, s);
+    EXPECT_EQ(TF_OUT_OF_RANGE, TF_GetCode(s));
+
+    TF_DeleteStatus(s);
+    if (input != nullptr) {
+      TF_DeleteTensor(input);
+    }
+  };
+
+  TF_KernelBuilder* builder = TF_NewKernelBuilder(op_name, device_name, nullptr,
+                                                  my_compute_func, nullptr);
+
+  {
+    TF_Status* status = TF_NewStatus();
+    TF_RegisterKernelBuilder(kernel_name, builder, status);
+    EXPECT_EQ(TF_OK, TF_GetCode(status));
+    TF_DeleteStatus(status);
+  }
+
+  {
+    OpKernelContext::Params p;
+    DummyDevice dummy_device(nullptr, false);
+    p.device = &dummy_device;
+
+    Tensor t(tensorflow::uint8(123));
+
+    gtl::InlinedVector<TensorValue, 4> inputs;
+    // Simulate 2 inputs
+    inputs.emplace_back(&t);
+    inputs.emplace_back();
+    p.inputs = &inputs;
+
+    Status status;
+    std::unique_ptr<OpKernel> kernel =
+        GetFakeKernel(device_name, op_name, &status);
+    TF_EXPECT_OK(status);
+    ASSERT_NE(nullptr, kernel.get());
+
+    p.op_kernel = kernel.get();
+    OpKernelContext ctx(&p);
+    kernel->Compute(&ctx);
+
+    ASSERT_EQ(2, num_inputs);
+    ASSERT_EQ(1, num_outputs);
+    ASSERT_EQ(123, ctx.mutable_output(0)->scalar<tensorflow::uint8>()());
+  }
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/c/python_api.cc b/tensorflow/c/python_api.cc
index 247236b760dd8c07bbb08426100b6a4d34296d2e..98d8393332269ae349cf8aa5c0b612c6f17172e6 100644
--- a/tensorflow/c/python_api.cc
+++ b/tensorflow/c/python_api.cc
@@ -160,4 +160,17 @@ void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
   ic->set_output_handle_shapes_and_types(output.index, shapes_and_types);
 }
 
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status) {
+  mutex_lock l(graph->mu);
+  status->status = graph->graph.AddWhileInputHack(&new_src.oper->node,
+                                                  new_src.index, &dst->node);
+  if (status->status.ok()) {
+    // This modification only updates the destination node for
+    // the purposes of running this graph in a session. Thus, we don't
+    // record the source node as being modified.
+    RecordMutation(graph, *dst, "adding input tensor");
+  }
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/c/python_api.h b/tensorflow/c/python_api.h
index 5cce84020bc68d912d259f51512341eb5f464a2c..44779ca656165dd65590cb5e9ea3ccf71165ed63 100644
--- a/tensorflow/c/python_api.h
+++ b/tensorflow/c/python_api.h
@@ -34,6 +34,7 @@ void SetAttr(TF_Graph* graph, TF_Operation* op, const char* attr_name,
 
 void SetRequestedDevice(TF_Graph* graph, TF_Operation* op, const char* device);
 
+// Updates 'dst' to consume 'new_src'.
 void UpdateEdge(TF_Graph* graph, TF_Output new_src, TF_Input dst,
                 TF_Status* status);
 
@@ -65,6 +66,13 @@ std::string GetHandleShapeAndType(TF_Graph* graph, TF_Output output);
 // because I couldn't get SWIG to work otherwise.
 void SetHandleShapeAndType(TF_Graph* graph, TF_Output output, const void* proto,
                            size_t proto_len, TF_Status* status);
+
+// This method is used to add a new input edge to 'dst', which must be a While
+// op. The While op's "T" attribute must have already been updated to include
+// the new edge. This is used to construct tf.while_loop gradients.
+void AddWhileInputHack(TF_Graph* graph, TF_Output new_src, TF_Operation* dst,
+                       TF_Status* status);
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_C_PYTHON_API_H_
diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD
index 83353b79f722f0a95f508b32d4a49b14b35624fb..a09becc49b10d2c58f98fbcc11df5190f794c1d4 100644
--- a/tensorflow/cc/BUILD
+++ b/tensorflow/cc/BUILD
@@ -489,6 +489,7 @@ tf_gen_op_wrappers_cc(
         "image_ops",
         "io_ops",
         "linalg_ops",
+        "list_ops",
         "logging_ops",
         "lookup_ops",
         "manip_ops",
diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD
index 3d3895c8fa82c3c0e2974228e9cad767d0e00df4..52345a376cc29ee47ccb9888c9bb26292468b5a9 100644
--- a/tensorflow/cc/saved_model/BUILD
+++ b/tensorflow/cc/saved_model/BUILD
@@ -133,5 +133,6 @@ filegroup(
         "testdata/half_plus_two_pbtxt/**",
         "testdata/half_plus_two_main_op/**",
         "testdata/half_plus_two/**",
+        "testdata/half_plus_two_v2/**",
     ]),
 )
diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h
index 645a3f101d1ae7dda88ec4ca622c694dc5a7a919..6f00dc324bd7054b28de2c35023581e1666bfa01 100644
--- a/tensorflow/cc/saved_model/constants.h
+++ b/tensorflow/cc/saved_model/constants.h
@@ -33,10 +33,10 @@ constexpr char kSavedModelFilenamePb[] = "saved_model.pb";
 /// SavedModel text format proto filename.
 constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt";
 
-/// SavedModel legacy init op key.
+/// SavedModel legacy init op collection key. Used in v1 SavedModels.
 constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op";
 
-/// SavedModel main op key.
+/// SavedModel main op collection key. Used in v1 SavedModels.
 constexpr char kSavedModelMainOpKey[] = "saved_model_main_op";
 
 /// Directory in which to save the SavedModel variables.
@@ -45,6 +45,11 @@ constexpr char kSavedModelVariablesDirectory[] = "variables";
 /// SavedModel variables filename.
 constexpr char kSavedModelVariablesFilename[] = "variables";
 
+/// SavedModel SignatureDef keys for the initialization and train ops. Used in
+/// V2 SavedModels.
+constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op";
+constexpr char kSavedModelTrainOpSignatureKey[] = "__saved_model_train_op";
+
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_CC_SAVED_MODEL_CONSTANTS_H_
diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc
index c6abe2f41b9b5ec2faee6f65b429ff606f8ac08e..85d3dd01fa51b3c3ba6fcbf5faac03f1ff5630e2 100644
--- a/tensorflow/cc/saved_model/loader.cc
+++ b/tensorflow/cc/saved_model/loader.cc
@@ -122,34 +122,54 @@ Status RunOnce(const RunOptions& run_options,
   return run_status;
 }
 
-bool HasMainOp(const MetaGraphDef& meta_graph_def) {
+// RunInitOp will return OK if the initialization op was run successfully.
+// An empty init_op_name indicates that there are no init ops to run.
+Status RunInitOp(const RunOptions& run_options, const string& export_dir,
+                 const MetaGraphDef& meta_graph_def,
+                 const std::vector<AssetFileDef>& asset_file_defs,
+                 Session* session, const string& init_op_name) {
+  if (!init_op_name.empty()) {
+    LOG(INFO) << "Running initialization op on SavedModel bundle.";
+    std::vector<std::pair<string, Tensor>> inputs;
+    AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
+    RunMetadata run_metadata;
+    return RunOnce(run_options, inputs, {}, {init_op_name},
+                   nullptr /* outputs */, &run_metadata, session);
+  }
+  return Status::OK();
+}
+
+// A SavedModel may store the name of the initialization op to run in the
+// in the SignatureDef (v2) or a collection (v1). If an init_op collection
+// exists, then the collection must contain exactly one op.
+Status GetInitOp(const string& export_dir, const MetaGraphDef& meta_graph_def,
+                 string* init_op_name) {
+  const auto& sig_def_map = meta_graph_def.signature_def();
+  const auto& init_op_sig_it =
+      meta_graph_def.signature_def().find(kSavedModelInitOpSignatureKey);
+  if (init_op_sig_it != sig_def_map.end()) {
+    *init_op_name = init_op_sig_it->second.outputs()
+                        .find(kSavedModelInitOpSignatureKey)
+                        ->second.name();
+    return Status::OK();
+  }
+
   const auto& collection_def_map = meta_graph_def.collection_def();
+  string init_op_collection_key;
   if (collection_def_map.find(kSavedModelMainOpKey) !=
       collection_def_map.end()) {
-    return true;
+    init_op_collection_key = kSavedModelMainOpKey;
+  } else {
+    init_op_collection_key = kSavedModelLegacyInitOpKey;
   }
-  return false;
-}
 
-Status RunMainOp(const RunOptions& run_options, const string& export_dir,
-                 const MetaGraphDef& meta_graph_def,
-                 const std::vector<AssetFileDef>& asset_file_defs,
-                 Session* session, const string& main_op_key) {
-  LOG(INFO) << "Running MainOp with key " << main_op_key
-            << " on SavedModel bundle.";
-  const auto& collection_def_map = meta_graph_def.collection_def();
-  const auto main_op_it = collection_def_map.find(main_op_key);
-  if (main_op_it != collection_def_map.end()) {
-    if (main_op_it->second.node_list().value_size() != 1) {
+  const auto init_op_it = collection_def_map.find(init_op_collection_key);
+  if (init_op_it != collection_def_map.end()) {
+    if (init_op_it->second.node_list().value_size() != 1) {
       return errors::FailedPrecondition(
           strings::StrCat("Expected exactly one main op in : ", export_dir));
     }
-    std::vector<std::pair<string, Tensor>> inputs;
-    AddAssetsTensorsToInputs(export_dir, asset_file_defs, &inputs);
-    RunMetadata run_metadata;
-    const StringPiece main_op_name = main_op_it->second.node_list().value(0);
-    return RunOnce(run_options, inputs, {}, {string(main_op_name)},
-                   nullptr /* outputs */, &run_metadata, session);
+    *init_op_name = init_op_it->second.node_list().value(0);
   }
   return Status::OK();
 }
@@ -193,6 +213,15 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir,
 
 Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def,
                         std::vector<AssetFileDef>* asset_file_defs) {
+  // With SavedModel v2, we write asset file def into metagraph instead of
+  // collection, so read from metagraph first.
+  if (meta_graph_def.asset_file_def_size() > 0) {
+    for (const auto& asset : meta_graph_def.asset_file_def()) {
+      asset_file_defs->push_back(asset);
+    }
+    return Status::OK();
+  }
+  // Fall back to read from collection to be backward compatible with v1.
   const auto& collection_def_map = meta_graph_def.collection_def();
   const auto assets_it = collection_def_map.find(kSavedModelAssetsKey);
   if (assets_it == collection_def_map.end()) {
@@ -227,15 +256,12 @@ Status LoadSavedModelInternal(const SessionOptions& session_options,
                  bundle->meta_graph_def.saver_def().restore_op_name(),
                  bundle->meta_graph_def.saver_def().filename_tensor_name(),
                  asset_file_defs, bundle->session.get()));
-  if (HasMainOp(bundle->meta_graph_def)) {
-    TF_RETURN_IF_ERROR(RunMainOp(run_options, export_dir,
-                                 bundle->meta_graph_def, asset_file_defs,
-                                 bundle->session.get(), kSavedModelMainOpKey));
-  } else {
-    TF_RETURN_IF_ERROR(RunMainOp(
-        run_options, export_dir, bundle->meta_graph_def, asset_file_defs,
-        bundle->session.get(), kSavedModelLegacyInitOpKey));
-  }
+  string init_op_name;
+  TF_RETURN_IF_ERROR(
+      GetInitOp(export_dir, bundle->meta_graph_def, &init_op_name));
+  TF_RETURN_IF_ERROR(RunInitOp(run_options, export_dir, bundle->meta_graph_def,
+                               asset_file_defs, bundle->session.get(),
+                               init_op_name));
   return Status::OK();
 }
 
diff --git a/tensorflow/cc/saved_model/loader_test.cc b/tensorflow/cc/saved_model/loader_test.cc
index 72b8bc18710b0ee77cb01ed3ad0c2abb5183efb2..597e42bb65ab5536664089f7e65ec52d77fc8f23 100644
--- a/tensorflow/cc/saved_model/loader_test.cc
+++ b/tensorflow/cc/saved_model/loader_test.cc
@@ -36,6 +36,8 @@ constexpr char kTestDataMainOp[] =
     "cc/saved_model/testdata/half_plus_two_main_op/00000123";
 constexpr char kTestDataSharded[] =
     "cc/saved_model/testdata/half_plus_two/00000123";
+constexpr char kTestDataInitOpV2[] =
+    "cc/saved_model/testdata/half_plus_two_v2/00000123";
 
 class LoaderTest : public ::testing::Test {
  protected:
@@ -227,5 +229,17 @@ TEST_F(LoaderTest, MaybeSavedModelDirectory) {
   EXPECT_FALSE(MaybeSavedModelDirectory(invalid_export_dir));
 }
 
+TEST_F(LoaderTest, SavedModelInitOpV2Format) {
+  SavedModelBundle bundle;
+  SessionOptions session_options;
+  RunOptions run_options;
+
+  const string export_dir =
+      io::JoinPath(testing::TensorFlowSrcRoot(), kTestDataInitOpV2);
+  TF_ASSERT_OK(LoadSavedModel(session_options, run_options, export_dir,
+                              {kSavedModelTagServe}, &bundle));
+  CheckSavedModelBundle(export_dir, bundle);
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f9ff036688007836524129e23f5cf82edd1e8910
--- /dev/null
+++ b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/assets/foo.txt
@@ -0,0 +1 @@
+asset-file-contents
\ No newline at end of file
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb
new file mode 100644
index 0000000000000000000000000000000000000000..a10bbf8fb6bca0fcee6414b2927d2f706de85ebc
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/saved_model.pb differ
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..15b75d6ef6bffc336d138d923badb3928b8c4c13
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.data-00000-of-00001 differ
diff --git a/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index
new file mode 100644
index 0000000000000000000000000000000000000000..7ec9fb4fe2dd21d0a6c324aecd7658fc37cf2326
Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/half_plus_two_v2/00000123/variables/variables.index differ
diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc
index b17bc658fa06b9feb7edb292bd89ef31e6309169..ab1c1be344e2257721507543bc7647d4ff4becb2 100644
--- a/tensorflow/compiler/aot/codegen.cc
+++ b/tensorflow/compiler/aot/codegen.cc
@@ -164,7 +164,8 @@ string RewriteWithName(const string& name, string code,
 }
 
 // Generate methods for args (inputs).
-Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
+Status GenArgMethods(const tf2xla::Config& config,
+                     const xla::ProgramShapeProto& ps,
                      const CompileResult& compile_result, string* methods) {
   size_t num_args = ps.parameters_size();
   if (config.feed_size() != num_args) {
@@ -174,9 +175,10 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
   }
   for (int i = 0; i < num_args; ++i) {
     std::vector<std::pair<string, string>> rewrites;
-    TF_RETURN_IF_ERROR(AddRewritesForShape(i, ps.parameters(i), &rewrites));
+    TF_RETURN_IF_ERROR(
+        AddRewritesForShape(i, xla::Shape(ps.parameters(i)), &rewrites));
     const string code = R"(
-  void set_arg{{NAME}}_data(void* data) {
+  void set_arg{{NAME}}_data(const void* data) {
     set_arg_data({{I}}, data);
   }
   {{TYPE}}* arg{{NAME}}_data() {
@@ -204,7 +206,7 @@ Status GenArgMethods(const tf2xla::Config& config, const xla::ProgramShape& ps,
 
 // Generate methods for results (outputs).
 Status GenResultMethods(const tf2xla::Config& config,
-                        const xla::ProgramShape& ps, string* methods) {
+                        const xla::ProgramShapeProto& ps, string* methods) {
   if (ps.result().element_type() != xla::TUPLE) {
     // The XlaCompiler we use to build the xla computation always generates a
     // tuple result, and we rely on this to simplify code generation.
@@ -217,8 +219,8 @@ Status GenResultMethods(const tf2xla::Config& config,
   }
   for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) {
     std::vector<std::pair<string, string>> rewrites;
-    TF_RETURN_IF_ERROR(
-        AddRewritesForShape(i, ps.result().tuple_shapes(i), &rewrites));
+    TF_RETURN_IF_ERROR(AddRewritesForShape(
+        i, xla::Shape(ps.result().tuple_shapes(i)), &rewrites));
     string code = R"(
   {{TYPE}}* result{{NAME}}_data() {
     return static_cast<{{TYPE}}*>(result_data({{I}}));
@@ -336,7 +338,7 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config,
       ExtractEntryParamBufferInfos(buffer_infos);
   std::vector<BufferInfo> buffer_infos_for_temps =
       ExtractTempBufferInfos(buffer_infos);
-  const xla::ProgramShape& ps = compile_result.program_shape;
+  const xla::ProgramShapeProto& ps = compile_result.program_shape;
   string methods_arg, methods_result;
   TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg));
   TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result));
@@ -548,8 +550,8 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
   static const char** StaticResultNames() {{RESULT_NAMES_CODE}}
 
   // Shape of the args and results.
-  static const xla::ProgramShape* StaticProgramShape() {
-    static const xla::ProgramShape* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
+  static const xla::ProgramShapeProto* StaticProgramShape() {
+    static const xla::ProgramShapeProto* kShape = {{PROGRAM_SHAPE_SHIM_EXPRESSION}};
     return kShape;
   }
 
@@ -587,7 +589,7 @@ class {{CLASS}} : public tensorflow::XlaCompiledCpuFunction {
       {"{{METHODS_RESULT}}\n", methods_result},
       {"{{NS_END}}\n", ns_end},
       {"{{NS_START}}\n", ns_start},
-      {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(ps)},
+      {"{{PROGRAM_SHAPE}}", xla::ShapeUtil::HumanString(xla::ProgramShape(ps))},
       {"{{PROGRAM_SHAPE_SHIM_EXPRESSION}}",
        metadata_result.program_shape_access_shim},
       {"{{RESULT_INDEX}}", absl::StrCat(result_index)},
@@ -615,11 +617,11 @@ static string CreateUniqueIdentifier(const CodegenOpts& opts,
 Status GenerateMetadata(const CodegenOpts& opts,
                         const CompileResult& compile_result,
                         MetadataResult* metadata_result) {
-  std::unique_ptr<xla::ProgramShape> program_shape;
+  std::unique_ptr<xla::ProgramShapeProto> program_shape;
 
   if (opts.gen_program_shape) {
     program_shape =
-        absl::make_unique<xla::ProgramShape>(compile_result.program_shape);
+        absl::make_unique<xla::ProgramShapeProto>(compile_result.program_shape);
 
     // The parameter names are currently meaningless, and redundant with the
     // rest of our metadata, so clear them out to avoid confusion and save
@@ -631,8 +633,8 @@ Status GenerateMetadata(const CodegenOpts& opts,
   // a shim that evaluates to nullptr, which is what we want.
 
   ProtobufToEmbed program_shape_protobuf{
-      CreateUniqueIdentifier(opts, "ProgramShape"), "xla::ProgramShape",
-      program_shape.get()};
+      CreateUniqueIdentifier(opts, "ProgramShapeProto"),
+      "xla::ProgramShapeProto", program_shape.get()};
 
   ProtobufToEmbed hlo_profile_printer_data_protobuf{
       CreateUniqueIdentifier(opts, "HloProfilePrinterData"),
diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h
index 90410c46a8e36e44454f1219ad76d0fb0937070d..9485e86b10e225a3c9c12eafd9905bdf7c15c9fa 100644
--- a/tensorflow/compiler/aot/codegen.h
+++ b/tensorflow/compiler/aot/codegen.h
@@ -57,7 +57,7 @@ struct MetadataResult {
   std::vector<string> header_variable_decls;
 
   // program_shape_access_shim is a C++ expression that constructs the
-  // xla::ProgramShape instance for the CompileResult passed to
+  // xla::ProgramShapeProto instance for the CompileResult passed to
   // GenerateMetadata.
   string program_shape_access_shim;
 
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc
index bb288d23000527be74f01630d20bbf82e50007ce..c1788ca32a1d099284eeb870f9513891051fd29e 100644
--- a/tensorflow/compiler/aot/codegen_test.cc
+++ b/tensorflow/compiler/aot/codegen_test.cc
@@ -181,13 +181,15 @@ TEST(CodegenTest, Golden) {
        BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1),
        BufferInfo::MakeTempBuffer(3), BufferInfo::MakeTempBuffer(120)},
       5, {}));
-  compile_result.program_shape = xla::ShapeUtil::MakeProgramShape(
-      {
-          xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
-          xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
-      },
-      xla::ShapeUtil::MakeTupleShape(
-          {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}));
+  compile_result.program_shape =
+      xla::ShapeUtil::MakeProgramShape(
+          {
+              xla::ShapeUtil::MakeShape(xla::F32, {1, 2}),
+              xla::ShapeUtil::MakeShape(xla::S64, {3, 4}),
+          },
+          xla::ShapeUtil::MakeTupleShape(
+              {xla::ShapeUtil::MakeShape(xla::U32, {5, 6})}))
+          .ToProto();
   compile_result.entry_point = "entry_point";
   compile_result.pointer_size = 8;
 
diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden
index e4d8a02877c75fa72c5747650ab9c7ac229955b3..968afad65ed6d4b5510687df484b7ce6743f6a85 100644
--- a/tensorflow/compiler/aot/codegen_test_h.golden
+++ b/tensorflow/compiler/aot/codegen_test_h.golden
@@ -22,7 +22,7 @@ extern "C" void entry_point(
     void* result, const xla::ExecutableRunOptions* run_options,
     const void** args, void** temps, tensorflow::int64* profile_counters);
 
-extern "C" char __tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[];
+extern "C" char __tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[];
 
 
 namespace foo {
@@ -114,7 +114,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   //   with dim indices specifying which value. No bounds checking is performed
   //   on dim indices.
 
-  void set_arg0_data(void* data) {
+  void set_arg0_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg0_data() {
@@ -132,7 +132,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg_myfeed_data(void* data) {
+  void set_arg_myfeed_data(const void* data) {
     set_arg_data(0, data);
   }
   float* arg_myfeed_data() {
@@ -150,7 +150,7 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
         arg_data(0)))[dim0][dim1];
   }
 
-  void set_arg1_data(void* data) {
+  void set_arg1_data(const void* data) {
     set_arg_data(1, data);
   }
   tensorflow::int64* arg1_data() {
@@ -253,10 +253,10 @@ class MyClass : public tensorflow::XlaCompiledCpuFunction {
   }
 
   // Shape of the args and results.
-  static const xla::ProgramShape* StaticProgramShape() {
-    static const xla::ProgramShape* kShape = []() {
-    xla::ProgramShape* proto = new xla::ProgramShape;
-    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShape_protobuf_array_contents[0], 52);
+  static const xla::ProgramShapeProto* StaticProgramShape() {
+    static const xla::ProgramShapeProto* kShape = []() {
+    xla::ProgramShapeProto* proto = new xla::ProgramShapeProto;
+    proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 52);
     return proto;
   }();
     return kShape;
diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden
index eb001c5d45bdfefc76629d7303d89f5480432235..ce8e5ec8c96a2c3696f14b8eea206d648182ecb5 100644
Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ
diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc
index 2b5f97b34cd928d32eb220536342c715d91d45bb..9fc223bdc7c0e207ce2005cb86250aa77e709df8 100644
--- a/tensorflow/compiler/aot/compile.cc
+++ b/tensorflow/compiler/aot/compile.cc
@@ -56,17 +56,23 @@ Status CompileXla(xla::CompileOnlyClient* client,
     return errors::Unknown("Couldn't get XLA program shape: ",
                            pshape_or.status().error_message());
   }
-  compile_result->program_shape = *pshape_or.ValueOrDie();
-  xla::ProgramShape* pshape = &compile_result->program_shape;
-  std::vector<const xla::Shape*> arg_layouts;
-  arg_layouts.reserve(pshape->parameters_size());
+  compile_result->program_shape = pshape_or.ValueOrDie()->ToProto();
+  xla::ProgramShapeProto* pshape = &compile_result->program_shape;
+
+  // AotXlaComputationInstance::argument_layouts is a vector of Shape
+  // pointers. Accumulate the Shape objects themselves in a separate vector
+  // while building the vector of pointers.
+  std::vector<const xla::Shape*> arg_layout_ptrs(pshape->parameters_size());
+  std::vector<xla::Shape> arg_layouts(pshape->parameters_size());
   for (int i = 0; i < pshape->parameters_size(); ++i) {
-    arg_layouts.push_back(pshape->mutable_parameters(i));
+    arg_layouts[i] = xla::Shape(*pshape->mutable_parameters(i));
+    arg_layout_ptrs[i] = &arg_layouts[i];
   }
   xla::CompileOnlyClient::AotXlaComputationInstance instance;
   instance.computation = &computation;
-  instance.argument_layouts = std::move(arg_layouts);
-  instance.result_layout = &pshape->result();
+  instance.argument_layouts = std::move(arg_layout_ptrs);
+  xla::Shape result_shape(pshape->result());
+  instance.result_layout = &result_shape;
   xla::StatusOr<std::vector<std::unique_ptr<xla::AotCompilationResult>>>
       aot_or = client->CompileAheadOfTime({instance}, aot_opts);
   if (!aot_or.ok()) {
diff --git a/tensorflow/compiler/aot/compile.h b/tensorflow/compiler/aot/compile.h
index e03c5b1aa77c1262ed903aae3072ef65f34d80a2..ee7bb26fabd2d897b85b62f38778ecbfe2238eb6 100644
--- a/tensorflow/compiler/aot/compile.h
+++ b/tensorflow/compiler/aot/compile.h
@@ -33,9 +33,9 @@ namespace tfcompile {
 struct CompileResult {
   // Contains object file and meta-info.
   std::unique_ptr<xla::cpu::CpuAotCompilationResult> aot;
-  xla::ProgramShape program_shape;  // Static shape of args and results.
-  string entry_point;               // Name of generated function.
-  int pointer_size = 0;             // Size of a pointer in bytes.
+  xla::ProgramShapeProto program_shape;  // Static shape of args and results.
+  string entry_point;                    // Name of generated function.
+  int pointer_size = 0;                  // Size of a pointer in bytes.
 };
 
 // CompileGraph compiles the graph_def into an object file containing a function
diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc
index f10852c7850f61bfd8b99fa9f1648202d182085e..4dd79e5882d7da61be029735ef2b165908c599f9 100644
--- a/tensorflow/compiler/aot/tests/tfcompile_test.cc
+++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc
@@ -526,13 +526,15 @@ TEST(TFCompileTest, ProgramShape) {
 
   // muladd has the program shape defined.
   MatMulAndAddComp muladd;
-  const xla::ProgramShape* muladd_shape = muladd.ProgramShape();
+  const xla::ProgramShapeProto* muladd_shape = muladd.ProgramShape();
   ASSERT_TRUE(muladd_shape != nullptr);
   ASSERT_EQ(muladd_shape->parameters_size(), 2);
-  EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(0), f32_2x2));
-  EXPECT_TRUE(ShapeUtil::Compatible(muladd_shape->parameters(1), f32_2x2));
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(0)), f32_2x2));
+  EXPECT_TRUE(
+      ShapeUtil::Compatible(xla::Shape(muladd_shape->parameters(1)), f32_2x2));
 
-  const xla::Shape& muladd_result = muladd_shape->result();
+  const xla::Shape muladd_result(muladd_shape->result());
   ASSERT_EQ(muladd_result.element_type(), xla::TUPLE);
   ASSERT_EQ(ShapeUtil::TupleElementCount(muladd_result), 2);
   const xla::Shape& muladd_result0 =
diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD
index 162a137fa7a5573056911d19472de4261574137a..15dcbb2641eca031e82db9aa58dee6a14ab0a2cc 100644
--- a/tensorflow/compiler/jit/BUILD
+++ b/tensorflow/compiler/jit/BUILD
@@ -23,7 +23,6 @@ package(
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
-load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
 
@@ -38,7 +37,7 @@ cc_library(
         ":xla_cpu_device",
         ":xla_cpu_jit",
         "//tensorflow/compiler/plugin",
-    ] + if_cuda_is_configured([
+    ] + if_cuda([
         ":xla_gpu_device",
         ":xla_gpu_jit",
     ]),
@@ -51,6 +50,7 @@ cc_library(
     deps = [
         ":jit_compilation_passes",
         "//tensorflow/compiler/jit/kernels:xla_ops",
+        "//tensorflow/compiler/tf2xla/kernels:xla_cpu_only_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_dummy_ops",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",
@@ -76,10 +76,11 @@ cc_library(
     srcs = ["xla_cpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
+        ":flags",
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
-        "//tensorflow/compiler/jit/legacy_flags:xla_device_flags",
         "//tensorflow/compiler/tf2xla:xla_compiler",
         "//tensorflow/compiler/tf2xla/kernels:xla_ops",
         "//tensorflow/compiler/xla/service:cpu_plugin",  # buildcleaner: keep
@@ -95,6 +96,7 @@ cc_library(
     srcs = ["xla_gpu_device.cc"],
     visibility = [":friends"],
     deps = [
+        ":create_xla_launch_op",  # buildcleaner: keep
         ":jit_compilation_passes",
         ":xla_device",
         "//tensorflow/compiler/jit/kernels:xla_ops",
@@ -104,6 +106,7 @@ cc_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings",
     ],
     alwayslink = 1,
 )
@@ -210,6 +213,18 @@ cc_library(
 
 # Internal targets below this point.
 
+cc_library(
+    name = "flags",
+    srcs = ["flags.cc"],
+    hdrs = ["flags.h"],
+    visibility = [":friends"],
+    deps = [
+        "//tensorflow/compiler/xla:parse_flags_from_env",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "common",
     srcs = [
@@ -256,6 +271,7 @@ cc_library(
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
@@ -268,6 +284,7 @@ cc_library(
         "//tensorflow/core/kernels:variable_ops",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -487,6 +504,7 @@ cc_library(
     deps = [
         ":common",
         ":encapsulate_util",
+        ":flags",
         ":shape_inference_helpers",
         ":union_find",
         ":xla_cluster_util",
@@ -494,8 +512,6 @@ cc_library(
         "//tensorflow/cc:ops",
         "//tensorflow/cc:scope_internal",
         "//tensorflow/compiler/jit/graphcycles",
-        "//tensorflow/compiler/jit/legacy_flags:build_xla_ops_pass_flags",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/jit/ops:xla_ops",
         "//tensorflow/compiler/tf2xla:dump_graph",
         "//tensorflow/compiler/tf2xla:resource_operation_table",
@@ -544,25 +560,6 @@ cc_library(
     hdrs = ["union_find.h"],
 )
 
-cc_library(
-    name = "producer_consumer_queue",
-    hdrs = ["producer_consumer_queue.h"],
-    deps = ["//tensorflow/core:lib"],
-)
-
-tf_cc_test(
-    name = "producer_consumer_queue_test",
-    size = "small",
-    srcs = ["producer_consumer_queue_test.cc"],
-    deps = [
-        ":producer_consumer_queue",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-        "//tensorflow/core:test_main",
-        "//tensorflow/core:testlib",
-    ],
-)
-
 tf_cc_test(
     name = "deadness_analysis_test",
     size = "small",
@@ -743,7 +740,10 @@ tf_custom_op_py_library(
     visibility = [
         ":friends",
     ],
-    deps = ["//tensorflow/compiler/jit/ops:xla_ops_wrapper_py"],
+    deps = [
+        "//tensorflow/compiler/jit/ops:xla_ops_grad",
+        "//tensorflow/compiler/jit/ops:xla_ops_wrapper_py",
+    ],
 )
 
 # This target can be used by XLA device plugins to prevent circular dependencies, and provides access to all of the required headers for building a device library.
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass.cc b/tensorflow/compiler/jit/build_xla_ops_pass.cc
index 93637a69d5d7b6bf9e9ce784ae521ef0e9b121b9..9f4042630edaec1b9519b6434d859a48372e8b15 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/control_flow_ops.h"
 #include "tensorflow/compiler/jit/defs.h"
 #include "tensorflow/compiler/jit/encapsulate_subgraphs_pass.h"
-#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_jit_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -320,10 +320,10 @@ Status BuildXlaOpsPass::Run(const GraphOptimizationPassOptions& options) {
                     return IsXlaCompiledKernel(*n);
                   });
 
-  bool lazy_compilation_enabled = enable_lazy_compilation_
-                                      ? *enable_lazy_compilation_
-                                      : legacy_flags::GetBuildXlaOpsPassFlags()
-                                            .tf_xla_enable_lazy_compilation;
+  bool lazy_compilation_enabled =
+      enable_lazy_compilation_
+          ? *enable_lazy_compilation_
+          : GetBuildXlaOpsPassFlags().tf_xla_enable_lazy_compilation;
 
   for (Node* n : xla_compiled_kernels) {
     TF_RETURN_IF_ERROR(ReplaceNodeWithXlaCompileAndXlaRun(
diff --git a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
index 11df946cc186660242574c2644463a26ead44f1f..48a23a4c1711ac88a329723c46559112d5a39dbd 100644
--- a/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
+++ b/tensorflow/compiler/jit/build_xla_ops_pass_test.cc
@@ -42,14 +42,8 @@ class BuildXlaOpsTest : public ::testing::Test {
               .ok());
   }
 
-  void TearDown() override {
-    for (Device* device : devices_) {
-      delete device;
-    }
-  }
-
  private:
-  std::vector<Device*> devices_;
+  std::vector<std::unique_ptr<Device>> devices_;
 };
 
 using ::tensorflow::testing::FindNodeByName;
diff --git a/tensorflow/compiler/jit/create_xla_launch_op_test.cc b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
index 73866607621cd745f6e640a14405daebf0dd9985..0f872a480f4d4843217f1df3452c4dc62531264e 100644
--- a/tensorflow/compiler/jit/create_xla_launch_op_test.cc
+++ b/tensorflow/compiler/jit/create_xla_launch_op_test.cc
@@ -59,8 +59,9 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 1});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) {
@@ -69,7 +70,7 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
     lib_def_ = absl::make_unique<FunctionLibraryDefinition>(
         OpRegistry::Global(), proto);
     OptimizerOptions opts;
-    device_mgr_ = absl::make_unique<DeviceMgr>(devices_);
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
     pflr_ = absl::make_unique<ProcessFunctionLibraryRuntime>(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, /*default_thread_pool=*/nullptr, /*cluster_flr=*/nullptr);
@@ -77,7 +78,6 @@ class CreateXlaLaunchOpTest : public ::testing::Test {
   }
 
   FunctionLibraryRuntime* flr_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
diff --git a/tensorflow/compiler/jit/encapsulate_util.cc b/tensorflow/compiler/jit/encapsulate_util.cc
index 28ec37b1b9c8a1a306b5e778bac5b6ba01c2c997..1f4b9c90a4ff0b1166cdb7b5942771b350740ef3 100644
--- a/tensorflow/compiler/jit/encapsulate_util.cc
+++ b/tensorflow/compiler/jit/encapsulate_util.cc
@@ -86,7 +86,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
       continue;
     } else if (src_xla_computation && !dst_xla_computation) {
       if (src_outside_compilation) {
-        // Case 1d: outside compilation to host computation control edge.
+        // Case 1c: outside compilation to host computation control edge.
         edges_to_remove.push_back(e);
 
         TF_RETURN_IF_ERROR(AppendToListAttr<string>(
@@ -94,7 +94,7 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
       }
     } else if (!src_xla_computation && dst_xla_computation) {
       if (dst_outside_compilation) {
-        // Case 1d: host computation control to outside compilation edge.
+        // Case 1c: host computation control to outside compilation edge.
         edges_to_remove.push_back(e);
 
         TF_RETURN_IF_ERROR(AppendToListAttr<string>(
@@ -103,40 +103,24 @@ Status ProcessControlEdges(Graph* g, const string& xla_computation_attr_name,
     } else {  // src_xla_computation && dst_xla_computation
       if (*src_xla_computation != *dst_xla_computation) {
         if (src_outside_compilation && dst_outside_compilation) {
-          // Case 1c: outside compilation to outside compilation control edge.
+          // Case 1b: outside compilation to outside compilation control edge.
           edges_to_remove.push_back(e);
 
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
         } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1b: outside compilation to another XLA computaition control
+          // Case 1a: outside compilation to another XLA computaition control
           // edge.
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->src(), kXlaConnectedToOtherXlaComputationAttrName,
               *dst_xla_computation));
         } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1b: another XLA computaition to outside compilation control
+          // Case 1a: another XLA computaition to outside compilation control
           // edge.
           TF_RETURN_IF_ERROR(AppendToListAttr<string>(
               e->dst(), kXlaConnectedFromOtherXlaComputationAttrName,
               *src_xla_computation));
         }
-      } else {  // *src_xla_computation == *dst_xla_computation
-        if (src_outside_compilation && dst_outside_compilation) {
-          if (*src_outside_compilation != *dst_outside_compilation) {
-            // Case 1c: outside compilation to outside compilation control edge.
-            edges_to_remove.push_back(e);
-
-            TF_RETURN_IF_ERROR(AppendToListAttr<string>(
-                e->dst(), kXlaControlDependenciesAttrName, e->src()->name()));
-          }
-        } else if (src_outside_compilation && !dst_outside_compilation) {
-          // Case 1a: outside compilation to its XLA computation control edge.
-          ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true);
-        } else if (!src_outside_compilation && dst_outside_compilation) {
-          // Case 1a: XLA computation to outside compilation in it control edge.
-          ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true);
-        }
       }
     }
   }
@@ -181,12 +165,6 @@ Status ProcessXlaToXlaDataEdges(Graph* g,
         edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
         VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
       }
-    } else {  // *src_xla_computation == *dst_xla_computation
-      if (src_outside_compilation && dst_outside_compilation &&
-          *src_outside_compilation != *dst_outside_compilation) {
-        edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
-        VLOG(4) << "XLA -> XLA edge: " << e->DebugString();
-      }
     }
   }
 
@@ -263,7 +241,7 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
 
   // Remove the edge from host to outside compilation. Add a placeholder as
   // outside compilation node input.
-  std::map<string, Node*> placeholders;
+  std::map<std::pair<string, int>, Node*> placeholders;
   for (int i = 0; i < edges.size(); i++) {
     Node* dst = g->FindNodeId(edges[i].dst_node_id);
     const Edge* e;
@@ -275,9 +253,10 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
     // Find or create placeholder node.
     string new_name =
         edges[i].is_host_to_outside_compilation
-            ? absl::StrCat(src->name(), "_host_to_oc_placeholder")
-            : absl::StrCat(src->name(), "_oc_to_host_placeholder");
-    auto iter = placeholders.find(new_name);
+            ? absl::StrCat(src->name(), "_host_to_oc_placeholder_", src_output)
+            : absl::StrCat(src->name(), "_oc_to_host_placeholder_", src_output);
+    auto placeholder_index = std::make_pair(src->name(), src_output);
+    auto iter = placeholders.find(placeholder_index);
     Node* placeholder_node;
     if (iter == placeholders.end()) {
       NodeDefBuilder placeholder_builder(new_name, "Placeholder");
@@ -310,7 +289,7 @@ Status ProcessDataEdgeBetweenOutsideCompilationAndHostComputation(
       Status s;
       placeholder_node = g->AddNode(placeholder_def, &s);
       TF_RETURN_IF_ERROR(s);
-      placeholders[new_name] = placeholder_node;
+      placeholders[placeholder_index] = placeholder_node;
     } else {
       placeholder_node = iter->second;
     }
@@ -594,14 +573,244 @@ Status AddControlDependencies(
   return Status::OK();
 }
 
+// Step 1 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges to remove. We should not remove the edge while iterating.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->edges()) {
+    if (!e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation) {
+      if (*src_outside_compilation != *dst_outside_compilation) {
+        // Case 1a: outside compilation to outside compilation control edge.
+        edges_to_remove.push_back(e);
+
+        TF_RETURN_IF_ERROR(AppendToListAttr<string>(
+            e->dst(), kXlaControlDependenciesWithinXlaClusterAttrName,
+            e->src()->name()));
+      }
+    } else if (src_outside_compilation && !dst_outside_compilation) {
+      // Case 1b: outside compilation to its XLA computation control edge.
+      ReplaceAttr(e->src(), kXlaConnectedToXlaComputationAttrName, true);
+    } else if (!src_outside_compilation && dst_outside_compilation) {
+      // Case 1b: XLA computation to outside compilation in it control edge.
+      ReplaceAttr(e->dst(), kXlaConnectedFromXlaComputationAttrName, true);
+    }
+  }
+
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PreprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PreprocessEdgesBetweenOutsideCompilations` for details.
+Status PreprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather edges between outside compilation and host computation. Notice that
+  // we do not store `Edge*` directly because we remove some nodes while adding
+  // Identity nodes, and those Edge pointers might be invalidated.
+  struct EdgeInfo {
+    int dst_input, dst_node_id;
+  };
+  std::vector<EdgeInfo> edges;
+  for (const Edge* e : g->edges()) {
+    if (e->IsControlEdge()) {
+      continue;
+    }
+
+    auto src_outside_compilation =
+        GetStringAttr(*e->src(), outside_compilation_attr_name);
+    auto dst_outside_compilation =
+        GetStringAttr(*e->dst(), outside_compilation_attr_name);
+
+    if (src_outside_compilation && dst_outside_compilation &&
+        *src_outside_compilation != *dst_outside_compilation) {
+      edges.push_back(EdgeInfo{e->dst_input(), e->dst()->id()});
+      VLOG(4) << "Oc -> oc edge: " << e->DebugString();
+    }
+  }
+
+  // Remove the edge from host to outside compilation. Add a placeholder as
+  // outside compilation node input.
+  std::map<std::pair<string, int>, Node*> placeholders;
+  for (int i = 0; i < edges.size(); i++) {
+    Node* dst = g->FindNodeId(edges[i].dst_node_id);
+    const Edge* e;
+    TF_RETURN_IF_ERROR(dst->input_edge(edges[i].dst_input, &e));
+    Node* src = e->src();
+    int src_output = e->src_output(), dst_input = e->dst_input();
+    g->RemoveEdge(e);
+
+    // Find or create placeholder node.
+    string new_name =
+        absl::StrCat(src->name(), "_oc_to_oc_placeholder_", src_output);
+    auto placeholder_index = std::make_pair(src->name(), src_output);
+    auto iter = placeholders.find(placeholder_index);
+    Node* placeholder_node;
+    if (iter == placeholders.end()) {
+      NodeDefBuilder placeholder_builder(new_name, "Placeholder");
+      placeholder_builder.Attr("dtype", src->output_type(src_output));
+      string outside_compilation_attr;
+      TF_RETURN_IF_ERROR(GetNodeAttr(dst->attrs(),
+                                     outside_compilation_attr_name,
+                                     &outside_compilation_attr));
+      placeholder_builder.Attr(outside_compilation_attr_name,
+                               outside_compilation_attr);
+      placeholder_builder.Attr(kOutsideCompilationOriginalNodeAttrName,
+                               src->name());
+      placeholder_builder.Attr(kOutsideCompilationSrcOutputAttrName,
+                               src_output);
+      NodeDef placeholder_def;
+      TF_RETURN_IF_ERROR(placeholder_builder.Finalize(&placeholder_def));
+      Status s;
+      placeholder_node = g->AddNode(placeholder_def, &s);
+      TF_RETURN_IF_ERROR(s);
+      placeholders[placeholder_index] = placeholder_node;
+    } else {
+      placeholder_node = iter->second;
+    }
+    g->AddEdge(placeholder_node, 0, dst, dst_input);
+
+    // Replace `e->dst()` because its input node changed.
+    NodeDef new_def = dst->def();
+    *new_def.mutable_input(dst_input) = placeholder_node->name();
+    TF_ASSIGN_OR_RETURN(Node * dst_replace_node, ReplaceNode(g, dst, new_def));
+
+    // Other edge in `edges` might have `e->dst()` as src or dst
+    // node. Before removing `e->dst()`, replace those edges with
+    // corresponding edges for `dst_replace_node`.
+    for (int j = i + 1; j < edges.size(); j++) {
+      if (edges[j].dst_node_id == edges[i].dst_node_id) {
+        edges[j].dst_node_id = dst_replace_node->id();
+      }
+    }
+  }
+  return Status::OK();
+}
+
+// Step 1 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessDataEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Gather all outside compilation to outside compilation nodes.
+  std::vector<Node*> placeholder_nodes;
+  for (Node* n : g->nodes()) {
+    if (n->type_string() == "Placeholder" &&
+        HasNodeAttr(n->def(), kOutsideCompilationOriginalNodeAttrName)) {
+      placeholder_nodes.push_back(n);
+    }
+  }
+
+  // Remove the placeholder nodes, and reconnect original edge.
+  auto node_name_index = g->BuildNodeNameIndex();
+  for (auto n : placeholder_nodes) {
+    string node_name;
+    int node_src_output;
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationOriginalNodeAttrName, &node_name));
+    TF_RETURN_IF_ERROR(GetNodeAttr(
+        n->attrs(), kOutsideCompilationSrcOutputAttrName, &node_src_output));
+    auto iter = node_name_index.find(node_name);
+    if (iter == node_name_index.end()) {
+      return errors::Internal(
+          "Cannot find original node for oc -> host placeholder node ",
+          node_name);
+    }
+
+    // Change all usage node to use the original node instead.
+    Node* original_node = iter->second;
+    std::vector<const Edge*> control_edges;
+    std::vector<OutEdgeInfo> data_edges;
+    for (auto e : n->out_edges()) {
+      if (e->IsControlEdge()) {
+        control_edges.push_back(e);
+      } else {
+        data_edges.push_back({e->dst(), e->src_output(), e->dst_input()});
+      }
+    }
+    for (const Edge* e : control_edges) {
+      g->AddControlEdge(original_node, e->dst());
+      g->RemoveEdge(e);
+    }
+    for (int i = 0; i < data_edges.size(); i++) {
+      Node* dst = data_edges[i].dst;
+      NodeDef new_def = dst->def();
+      int dst_input = data_edges[i].dst_input;
+      *new_def.mutable_input(dst_input) =
+          absl::StrCat(original_node->name(), ":", node_src_output);
+      TF_ASSIGN_OR_RETURN(Node * replace_node, ReplaceNode(g, dst, new_def));
+
+      const Edge* edge_to_replace = nullptr;
+      TF_RETURN_IF_ERROR(replace_node->input_edge(dst_input, &edge_to_replace));
+      g->RemoveEdge(edge_to_replace);
+      g->AddEdge(original_node, node_src_output, replace_node, dst_input);
+
+      // Other edges might have `dst` as dst node. Update those edges with
+      // `replace_node`.
+      for (int j = i + 1; j < data_edges.size(); j++) {
+        if (data_edges[j].dst == dst) {
+          data_edges[j].dst = replace_node;
+        }
+      }
+
+      // Other placeholder node might have `dst` as original node. Update
+      // `node_name_index` with `replace_node`.
+      node_name_index[replace_node->name()] = replace_node;
+    }
+
+    // Remove placeholder node.
+    g->RemoveNode(n);
+  }
+  return Status::OK();
+}
+
+// Step 2 for `PostprocessEdgesBetweenOutsideCompilations`. See comments of
+// `PostprocessEdgesBetweenOutsideCompilations` for details.
+Status PostprocessControlEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  auto node_name_index = g->BuildNodeNameIndex();
+
+  // Reconnect outside compilation to outside compilation control edge.
+  for (Node* n : g->nodes()) {
+    std::vector<string> control_deps;
+    Status s =
+        GetNodeAttr(n->attrs(), kXlaControlDependenciesWithinXlaClusterAttrName,
+                    &control_deps);
+    if (!s.ok()) {
+      if (s.code() != error::NOT_FOUND) {
+        return s;
+      } else {
+        continue;
+      }
+    } else {
+      n->ClearAttr(kXlaControlDependenciesWithinXlaClusterAttrName);
+      for (const string& control_input : control_deps) {
+        auto iter = node_name_index.find(control_input);
+        if (iter == node_name_index.end()) {
+          return errors::Internal("Cannot find original node for ",
+                                  control_input);
+        }
+        g->AddControlEdge(iter->second, n);
+      }
+    }
+  }
+  return Status::OK();
+}
 }  // namespace
 
 const char kXlaInferredShapesAttrName[] = "_xla_inferred_shapes";
 
-const char kXlaConnectedToXlaComputationAttrName[] =
-    "_xla_connected_to_xla_computation";
-const char kXlaConnectedFromXlaComputationAttrName[] =
-    "_xla_connected_from_xla_computation";
 const char kXlaConnectedToOtherXlaComputationAttrName[] =
     "_xla_connected_to_other_xla_computation";
 const char kXlaConnectedFromOtherXlaComputationAttrName[] =
@@ -616,6 +825,15 @@ const char kHostToOutsideCompilationOriginalNodeAttrName[] =
     "_xla_host_to_oc_node_name";
 const char kHostToOutsideCompilationSrcOutputAttrName[] =
     "_xla_host_to_oc_src_output";
+const char kXlaConnectedToXlaComputationAttrName[] =
+    "_xla_connected_to_xla_computation";
+const char kXlaConnectedFromXlaComputationAttrName[] =
+    "_xla_connected_from_xla_computation";
+const char kOutsideCompilationOriginalNodeAttrName[] =
+    "_xla_oc_to_oc_node_name";
+const char kOutsideCompilationSrcOutputAttrName[] = "_xla_oc_to_oc_src_output";
+const char kXlaControlDependenciesWithinXlaClusterAttrName[] =
+    "_xla_control_dependencies_within_xla_cluster";
 
 Status PerformStaticShapeInferenceBeforeEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
@@ -699,4 +917,39 @@ Status PostprocessForEncapsulation(
   return Status::OK();
 }
 
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  // Remove edges from source node to outside compilation nodes, and edges
+  // from outside compilation nodes to sink node.
+  std::vector<const Edge*> edges_to_remove;
+  for (const Edge* e : g->source_node()->out_edges()) {
+    if (HasNodeAttr(e->dst()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (const Edge* e : g->sink_node()->in_edges()) {
+    if (HasNodeAttr(e->src()->def(), outside_compilation_attr_name)) {
+      edges_to_remove.push_back(e);
+    }
+  }
+  for (auto e : edges_to_remove) {
+    g->RemoveEdge(e);
+  }
+
+  TF_RETURN_IF_ERROR(PreprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PreprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name) {
+  TF_RETURN_IF_ERROR(PostprocessDataEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  TF_RETURN_IF_ERROR(PostprocessControlEdgesBetweenOutsideCompilations(
+      g, outside_compilation_attr_name));
+  return Status::OK();
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/encapsulate_util.h b/tensorflow/compiler/jit/encapsulate_util.h
index 5e0c4bf6a0cc92d69209595e257989665404db6b..e363bc5754ac395bae262dc67a780a0173efaf5e 100644
--- a/tensorflow/compiler/jit/encapsulate_util.h
+++ b/tensorflow/compiler/jit/encapsulate_util.h
@@ -44,14 +44,6 @@ Status PerformStaticShapeInferenceBeforeEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
     const string& outside_compilation_attr_name);
 
-// Attribute indicating that some ops in this node's XLA computation has control
-// dependency on this node. Attribute value will always be "true".
-extern const char kXlaConnectedToXlaComputationAttrName[];
-
-// Attribute indicating that this node has control dependency on some ops in
-// this node's XLA computation. Attribute value will always be "true".
-extern const char kXlaConnectedFromXlaComputationAttrName[];
-
 // Attribute indicating that some ops in other XLA computation has control
 // dependency on this node. Attribute value will be a list of string (XLA
 // computation names).
@@ -81,6 +73,14 @@ extern const char kOutsideCompilationToHostOriginalNodeAttrName[];
 // int (src_output for original edge).
 extern const char kOutsideCompilationToHostSrcOutputAttrName[];
 
+// Attribute indicating that some ops in this node's XLA computation has control
+// dependency on this node. Attribute value will always be "true".
+extern const char kXlaConnectedToXlaComputationAttrName[];
+
+// Attribute indicating that this node has control dependency on some ops in
+// this node's XLA computation. Attribute value will always be "true".
+extern const char kXlaConnectedFromXlaComputationAttrName[];
+
 // Attribute indicating that this is an Placeholder node added to act as a
 // temporary input node for an host node. Attribute value will be string
 // (original input node name).
@@ -91,19 +91,31 @@ extern const char kHostToOutsideCompilationOriginalNodeAttrName[];
 // for original edge).
 extern const char kHostToOutsideCompilationSrcOutputAttrName[];
 
-// Preprocesses the graph for encapsulation. It will perform the following
-// operations in order:
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// string (original input node name).
+extern const char kOutsideCompilationOriginalNodeAttrName[];
+
+// Attribute indicating that this is an Placeholder node added to act as a
+// temporary input node for an outside compilation node. Attribute value will be
+// int (src_output for original edge).
+extern const char kOutsideCompilationSrcOutputAttrName[];
+
+// Attribute indicating that this node has control dependencies on some other
+// nodes within the same XLA cluster. Attribute value will be a list of string
+// (node names).
+extern const char kXlaControlDependenciesWithinXlaClusterAttrName[];
+
+// Preprocesses edges between different XLA clusters for encapsulation. It will
+// perform the following operations in order:
 //
-// 1a. For control edges between outside compilation and its XLA computation,
-//     add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the
-//     outside compilation node.
-// 1b. For control edges between outside compilation and another XLA
+// 1a. For control edges between outside compilation and another XLA
 //     computation, add attr "kXlaConnected{From, To}OtherXlaComputationAttrName
 //     = XLA computation node name" to the outside compilation node.
-// 1c. For control edges between different outside compilations, remove the edge
-//     and add attr "kXlaControlDependenciesAttrName = src node name" to dst
-//     node.
-// 1d. For control edges between outside compilation and host computation,
+// 1b. For control edges between different outside compilations (in different
+//     XLA computations), remove the edge and add attr
+//     "kXlaControlDependenciesAttrName = src node name" to dst node.
+// 1c. For control edges between outside compilation and host computation,
 //     remove the edge and add attr "kXlaControlDependenciesAttrName = src node
 //     name" to dst node.
 // 2. For data edges between different XLA computations, if either src or dst
@@ -146,26 +158,53 @@ struct XlaClusterInfo {
   const std::map<string, int> host_compute_core;
 };
 
-// Postprocesses the graph for encapsulation. This function reverts what
-// `PreprocessForEncapsulation` did. It will perform the following operations in
-// order:
+// Postprocesses edges between different XLA clusters for encapsulation. This
+// function reverts what `PreprocessForEncapsulation` did. It will perform the
+// following operations in order:
 //
 // 1. Remove Placeholder nodes between outside compilation and host computation
 //     (created in `PreprocessForEncapsulation` step 3).
 // 2. Remove Identity nodes created in `PreprocessForEncapsulation` step 2.
-// 3a. Reconnect control edges between different outside compilations (marked by
-//     `PreprocessForEncapsulation` step 1c) and control edges between outside
-//     compilation and host computation (marked by `PreprocessForEncapsulation`
-//     step 1d).
-// 3b. Reconnect control edges between outside compilation and another XLA
-//     computation (marked by `PreprocessForEncapsulation` step 1b).
-// Notice that control edges marked by `PreprocessForEncapsulation` step 1a are
-// not handled here. They are handled in `RewriteOutsideCompilationSubgraphFn`.
+// 3a. Reconnect control edges between outside compilation and another XLA
+//     computation (marked by `PreprocessForEncapsulation` step 1a).
+// 3b. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessForEncapsulation` step 1b).
+// 3c. Reconnect control edges between outside compilation and host computation
+//     (marked by `PreprocessForEncapsulation` step 1c).
 Status PostprocessForEncapsulation(
     Graph* g, const string& xla_computation_attr_name,
     const string& outside_compilation_attr_name,
     const std::unordered_map<string, XlaClusterInfo>& clusters);
 
+// Preprocesses edges within the same XLA cluster. It will perform the following
+// operations in order:
+//
+// 0.  Remove edges from source node to outside compilation nodes, and edges
+//     from outside compilation nodes to sink node.
+// 1a. For edges between different outside compilation clusters, remove the edge
+//     and add attr "kXlaControlDependenciesWithinXlaClusterAttrName = src node
+//     name" to dst node.
+// 1b. For control edges between outside compilation and its XLA computation,
+//     add attr "kXlaConnected{From, To}XlaComputationAttrName = true" to the
+//     outside compilation node.
+// 2.  For data edges between different outside compilations, remove the edge
+//     and create a Placeholder node as dst node's input.
+Status PreprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
+
+// Postprocesses edges within the same XLA cluster. This function reverts what
+// `PreprocessEdgesBetweenOutsideCompilations` did. It will perform the
+// following operations in order:
+//
+// 1. Remove Placeholder nodes between different outside compilations (created
+//    in `PreprocessEdgesBetweenOutsideCompilations` step 2).
+// 2a. Reconnect control edges between different outside compilations (marked by
+//     `PreprocessEdgesBetweenOutsideCompilations` step 1a).
+// Notice that control edges marked by
+// `PreprocessEdgesBetweenOutsideCompilations` step 1b are not handled here.
+// They are handled in `RewriteOutsideCompilationSubgraphFn`.
+Status PostprocessEdgesBetweenOutsideCompilations(
+    Graph* g, const string& outside_compilation_attr_name);
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_JIT_ENCAPSULATE_UTIL_H_
diff --git a/tensorflow/compiler/jit/encapsulate_util_test.cc b/tensorflow/compiler/jit/encapsulate_util_test.cc
index 7255df3112916b7abcc98ff8204efc8c02209b13..3b8b49cb92f3e453883a8e64e12ce3748a5173f6 100644
--- a/tensorflow/compiler/jit/encapsulate_util_test.cc
+++ b/tensorflow/compiler/jit/encapsulate_util_test.cc
@@ -107,28 +107,19 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) {
   identity4_node->AddAttr("_xla", "1");
   identity4_node->AddAttr("_oc", "0");
   identity5_node->AddAttr("_xla", "1");
-  // Case 1a: control edges between outside compilation and its XLA computation.
-  g.AddControlEdge(add_node, identity0_node);
-  g.AddControlEdge(identity0_node, identity1_node);
-  // Case 1b: control edges between outside compilation and another XLA
+  // Case 1a: control edges between outside compilation and another XLA
   // computation.
   g.AddControlEdge(identity0_node, identity3_node);
   g.AddControlEdge(identity1_node, identity4_node);
-  // Case 1c: control edges between different outside compilations.
+  // Case 1b: control edges between different outside compilations.
   g.AddControlEdge(identity0_node, identity4_node);
-  // Case 1d: control edges between outside compilation and host computation.
+  // Case 1c: control edges between outside compilation and host computation.
   g.AddControlEdge(const0_node, identity0_node);
   g.AddControlEdge(identity0_node, identity2_node);
 
   TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
 
-  // Case 1a: add attr "_xla_connected_{from/to}_xla_computation = true" to the
-  // outside compilation node.
-  EXPECT_TRUE(HasNodeAttr(identity0_node->def(),
-                          kXlaConnectedFromXlaComputationAttrName));
-  EXPECT_TRUE(HasNodeAttr(identity0_node->def(),
-                          kXlaConnectedToXlaComputationAttrName));
-  // Case 1b: add attr "_xla_control_deps_{from/to} = XLA computation node name"
+  // Case 1a: add attr "_xla_control_deps_{from/to} = XLA computation node name"
   // to the outside compilation node.
   std::vector<string> attr;
   TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
@@ -140,13 +131,13 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) {
                           kXlaConnectedFromOtherXlaComputationAttrName, &attr));
   EXPECT_EQ(attr.size(), 1);
   EXPECT_EQ(attr[0], "0");
-  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
+  // Case 1b: add attr "_xla_control_deps = src node name" to dst node.
   attr.clear();
   TF_CHECK_OK(GetNodeAttr(identity4_node->def(),
                           kXlaControlDependenciesAttrName, &attr));
   EXPECT_EQ(attr.size(), 1);
   EXPECT_EQ(attr[0], "identity0");
-  // Case 1d: add attr "_xla_control_deps = src node name" to dst node.
+  // Case 1c: add attr "_xla_control_deps = src node name" to dst node.
   attr.clear();
   TF_CHECK_OK(GetNodeAttr(identity0_node->def(),
                           kXlaControlDependenciesAttrName, &attr));
@@ -162,23 +153,33 @@ TEST(PreprocessForEncapsulationTest, ControlEdges) {
 TEST(PreprocessForEncapsulationTest, DataEdges) {
   // Build the graph:
   // "const_0" and "const_1" in host computation
+  // "identityn0" = ("const_0", "const_1") in host computation 0
   // "add0" = "const_0" + "const_1" in XLA computation 0
   // "add1" = "add0" + "const_0" in XLA computation 0 & outside compilation 0
   // "identity0" = "add1" in XLA computation 0
   // "add2" = "add1" + "identity0" in host computation
   // "add3" = "add1" + "add2" in XLA computation 1
-  // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 1
+  // "add4" = "identity0" + "add2" in XLA computation 1 & outside compilation 0
+  // "add5" = "identityn0"[0] + "identityn0"[1] in XLA computation 1 &
+  //                                               outside compilation 0
+  // "identityn1" = ("identityn0"[0], "identityn0"[1]) in XLA computation 1 &
+  //                                                   outside compilation 0
   // "identity1" = "add4" in XLA computation 1
   // "identity2" = "identity1" in host computation
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
   Output const_0 = ops::Const(s.WithOpName("const_0"), 1, {});
   Output const_1 = ops::Const(s.WithOpName("const_1"), 2, {});
+  auto identityn0 =
+      ops::IdentityN(s.WithOpName("identityn_0"), {const_0, const_1});
   Output add0 = ops::Add(s.WithOpName("add0"), const_0, const_1);
   Output add1 = ops::Add(s.WithOpName("add1"), add0, const_0);
   Output identity0 = ops::Identity(s.WithOpName("identity0"), add1);
   Output add2 = ops::Add(s.WithOpName("add2"), add1, identity0);
   Output add3 = ops::Add(s.WithOpName("add3"), add1, add2);
   Output add4 = ops::Add(s.WithOpName("add4"), identity0, add2);
+  Output add5 = ops::Add(s.WithOpName("add5"), identityn0[0], identityn0[1]);
+  auto identityn1 = ops::IdentityN(s.WithOpName("identityn_1"),
+                                   {identityn0[0], identityn0[1]});
   Output identity1 = ops::Identity(s.WithOpName("identity1"), add4);
   Output identity2 = ops::Identity(s.WithOpName("identity2"), add4);
   Graph g(OpRegistry::Global());
@@ -189,6 +190,8 @@ TEST(PreprocessForEncapsulationTest, DataEdges) {
   Node *add0_node = node_index["add0"], *add1_node = node_index["add1"],
        *identity0_node = node_index["identity0"],
        *add3_node = node_index["add3"], *add4_node = node_index["add4"],
+       *add5_node = node_index["add5"],
+       *identityn1_node = node_index["identityn_1"],
        *identity1_node = node_index["identity1"];
   add0_node->AddAttr("_xla", "0");
   add1_node->AddAttr("_xla", "0");
@@ -197,6 +200,10 @@ TEST(PreprocessForEncapsulationTest, DataEdges) {
   add3_node->AddAttr("_xla", "1");
   add4_node->AddAttr("_xla", "1");
   add4_node->AddAttr("_oc", "0");
+  add5_node->AddAttr("_xla", "1");
+  add5_node->AddAttr("_oc", "0");
+  identityn1_node->AddAttr("_xla", "1");
+  identityn1_node->AddAttr("_oc", "0");
   identity1_node->AddAttr("_xla", "1");
 
   TF_CHECK_OK(PreprocessForEncapsulation(&g, "_xla", "_oc"));
@@ -214,8 +221,9 @@ TEST(PreprocessForEncapsulationTest, DataEdges) {
   EXPECT_NE(bridge_identity0_add4, nullptr);
   // Step 3: add placeholder for edges between host computation and outside
   // compilation.
-  EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder");
-  Node *add1_oc_to_host_placeholder = node_index["add1_oc_to_host_placeholder"];
+  EXPECT_EQ(bridge_add1_add3->def().input(0), "add1_oc_to_host_placeholder_0");
+  Node *add1_oc_to_host_placeholder =
+      node_index["add1_oc_to_host_placeholder_0"];
   TF_CHECK_OK(GetNodeAttr(add1_oc_to_host_placeholder->attrs(),
                           kOutsideCompilationToHostOriginalNodeAttrName, &str));
   EXPECT_EQ(str, "add1");
@@ -226,15 +234,34 @@ TEST(PreprocessForEncapsulationTest, DataEdges) {
   add4_node = node_index["add4"];
   ASSERT_NE(add4_node, nullptr);
   EXPECT_EQ(add4_node->def().input(0),
-            "bridge_identity0_add4_host_to_oc_placeholder");
+            "bridge_identity0_add4_host_to_oc_placeholder_0");
   Node *identity0_host_to_oc_placeholder =
-      node_index["bridge_identity0_add4_host_to_oc_placeholder"];
+      node_index["bridge_identity0_add4_host_to_oc_placeholder_0"];
   TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
                           kHostToOutsideCompilationOriginalNodeAttrName, &str));
   EXPECT_EQ(str, "bridge_identity0_add4");
   TF_CHECK_OK(GetNodeAttr(identity0_host_to_oc_placeholder->attrs(),
                           kHostToOutsideCompilationSrcOutputAttrName, &i));
   EXPECT_EQ(i, 0);
+
+  // Check different placeholder nodes are created for different src_output.
+  Node *placeholder0 = node_index["identityn_0_host_to_oc_placeholder_0"],
+       *placeholder1 = node_index["identityn_0_host_to_oc_placeholder_1"];
+  EXPECT_NE(placeholder0, nullptr);
+  EXPECT_NE(placeholder1, nullptr);
+  // Check we only have 2 placeholder nodes created for "identityn_0".
+  int placeholder_count = 0;
+  for (Node *n : g.nodes()) {
+    if (HasNodeAttr(n->def(), kHostToOutsideCompilationOriginalNodeAttrName)) {
+      string attr;
+      TF_CHECK_OK(GetNodeAttr(
+          n->attrs(), kHostToOutsideCompilationOriginalNodeAttrName, &attr));
+      if (attr == "identityn_0") {
+        ++placeholder_count;
+      }
+    }
+  }
+  EXPECT_EQ(placeholder_count, 2);
 }
 
 TEST(PostprocessForEncapsulationTest, ControlEdges) {
diff --git a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
index 2ce6fa73fc448ca83fa392aa909cb385453eb8b6..d334100aa4a915a87fb05d371e0e3379a7ee05f2 100644
--- a/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
+++ b/tensorflow/compiler/jit/encapsulate_xla_computations_pass.cc
@@ -195,8 +195,11 @@ Status RewriteSubgraph(const std::vector<OutputTensor>& arg_source_tensors,
         e->dst()->attrs().Find(kXlaClusterAttr) == nullptr &&
         e->dst()->type_string() != kXlaClusterOutput) {
       return errors::InvalidArgument(
-          "Undeclared output of XLA computation. A common cause of this error "
-          "is variable initializers that depend on the XLA computation. Edge: ",
+          "Undeclared output of XLA computation. Some common causes of this "
+          "error are: 1) variable initializers that depend on the XLA "
+          "computation; 2) gradient computations that depend on the XLA "
+          "computation, which can be mitigated by moving gradient computations "
+          "inside XLA computation. Offending edge: ",
           e->src()->name(), ":", e->src_output(), " -> ", e->dst()->name(), ":",
           e->dst_input());
     }
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
index 8b3587c5087a0651c466f53f3709ba21e75dd273..e3c7e2f89be9b37b51a633dabb099969c181013f 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass.cc
@@ -366,7 +366,7 @@ Status ReplaceOrRemoveOutsideCompilationCallNode(
 //    replace this node with compilation result node.
 // 3) all outside compilation graphs.
 Status ConstructHostGraph(
-    const string& xla_cluster_name,
+    const string& xla_cluster_name, const string& outside_compilation_attr_name,
     const std::vector<string>& outside_compilation_host_graphs,
     FunctionLibraryDefinition* fld, std::unique_ptr<Graph>* host_graph) {
   host_graph->reset(new Graph(fld));
@@ -476,6 +476,10 @@ Status ConstructHostGraph(
       host_graph->get(),
       std::unordered_set<const Node*>{(*host_graph)->sink_node()});
 
+  // Postprocess edges between different outside compilations.
+  TF_RETURN_IF_ERROR(PostprocessEdgesBetweenOutsideCompilations(
+      host_graph->get(), outside_compilation_attr_name));
+
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_host_graph_for_",
@@ -801,6 +805,11 @@ Status ExtractOutsideCompilationForFunction(
       },
       &fbody));
   std::unique_ptr<FunctionBody> fbody_deleter(fbody);
+
+  // Preprocess edges between different outside compilations. They will be
+  // restored in `ConstructHostGraph()`.
+  TF_RETURN_IF_ERROR(PreprocessEdgesBetweenOutsideCompilations(
+      fbody->graph, outside_compilation_attr_name));
   if (VLOG_IS_ON(4)) {
     dump_graph::DumpGraphToFile(
         absl::StrCat("extract_outside_compilation_for_func_before_", func_name),
@@ -860,8 +869,9 @@ Status ExtractOutsideCompilationForFunction(
 
   // Construct host graph.
   if (!outside_compilation_host_graphs.empty()) {
-    TF_RETURN_IF_ERROR(ConstructHostGraph(
-        xla_cluster_name, outside_compilation_host_graphs, fld, host_graph));
+    TF_RETURN_IF_ERROR(
+        ConstructHostGraph(xla_cluster_name, outside_compilation_attr_name,
+                           outside_compilation_host_graphs, fld, host_graph));
   }
 
   // Remove the outside compilation graphs from function library.
diff --git a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
index c5bd64f004ef98853955372680277e04c16bdc9e..bff956100da661b679b4557fce53671e6cef88c5 100644
--- a/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/extract_outside_compilation_pass_test.cc
@@ -290,21 +290,18 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shapes", &shapes));
   EXPECT_EQ(shapes.size(), 1);
   EXPECT_EQ(shapes[0].dim_size(), 1);
-  // Check XlaHostCompute nodes' "shape_inference_graph" attr. "0" should have a
-  // non-empty value, and "1" should have an empty value.
+  // Check XlaHostCompute nodes' "shape_inference_graph" attr. Both should have
+  // empty values.
   string shape_inference_graph;
   TF_CHECK_OK(GetNodeAttr(host_compute_0->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
-  EXPECT_EQ(shape_inference_graph,
-            "_outside_compilation_shape_inference_cluster_0");
+  EXPECT_EQ(shape_inference_graph, "");
   TF_CHECK_OK(GetNodeAttr(host_compute_1->attrs(), "shape_inference_graph",
                           &shape_inference_graph));
   EXPECT_EQ(shape_inference_graph, "");
 
   // Check `shape_inference_graphs`.
-  EXPECT_EQ(shape_inference_graphs.size(), 1);
-  EXPECT_EQ(shape_inference_graphs[0],
-            "_outside_compilation_shape_inference_cluster_0");
+  EXPECT_EQ(shape_inference_graphs.size(), 0);
 
   // Check `host_graph`: verify we have key placeholder and sequencer.
   Node *key_placeholder = nullptr, *sequencer = nullptr;
@@ -333,8 +330,8 @@ TEST(ExtractOutsideCompilationForFunctionTest, Basic) {
       send_recv_nodes.push_back(n);
     }
   }
-  EXPECT_EQ(num_send_from_host, 2);
-  EXPECT_EQ(num_recv_at_host, 2);
+  EXPECT_EQ(num_send_from_host, 1);
+  EXPECT_EQ(num_recv_at_host, 1);
   for (Node *n : send_recv_nodes) {
     Node *input_node;
     TF_CHECK_OK(n->input_node(n->num_inputs() - 1, &input_node));
diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc
new file mode 100644
index 0000000000000000000000000000000000000000..98e344b3a080aa8aab27cd41564a90427bac151e
--- /dev/null
+++ b/tensorflow/compiler/jit/flags.cc
@@ -0,0 +1,152 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <mutex>  // NOLINT
+
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/compiler/xla/parse_flags_from_env.h"
+#include "tensorflow/core/util/command_line_flags.h"
+
+namespace tensorflow {
+namespace {
+
+BuildXlaOpsPassFlags* build_ops_flags;
+DumpGraphFlags* dump_graph_flags;
+MarkForCompilationPassFlags* mark_for_compilation_flags;
+XlaDeviceFlags* device_flags;
+XlaOpsCommonFlags* ops_flags;
+
+std::vector<Flag>* flag_list;
+std::once_flag flags_init;
+
+void AppendDumpGraphFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_dump_graph_prefix", &dump_graph_flags->tf_dump_graph_prefix,
+           "Path prefix to which graphs dumped during debugging should be "
+           "written."),
+  };
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AppendMarkForCompilationPassFlagsInternal(std::vector<Flag>* flag_list) {
+  std::vector<Flag> new_flags = {
+      Flag("tf_xla_auto_jit", &mark_for_compilation_flags->tf_xla_auto_jit,
+           "Control compilation of operators into XLA computations on CPU and "
+           "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
+           "things very likely to be improved; 2 = on for everything.  "
+           "Experimental."),
+      Flag("tf_xla_min_cluster_size",
+           &mark_for_compilation_flags->tf_xla_min_cluster_size,
+           "Minimum number of operators in an XLA compilation. Ignored for "
+           "operators placed on an XLA device or operators explicitly marked "
+           "for compilation."),
+      Flag("tf_xla_max_cluster_size",
+           &mark_for_compilation_flags->tf_xla_max_cluster_size,
+           "Maximum number of operators in an XLA compilation."),
+      Flag("tf_xla_clustering_debug",
+           &mark_for_compilation_flags->tf_xla_clustering_debug,
+           "Dump graphs during XLA compilation."),
+      Flag("tf_xla_cpu_global_jit",
+           &mark_for_compilation_flags->tf_xla_cpu_global_jit,
+           "Enables global JIT compilation for CPU via SessionOptions."),
+      Flag("tf_xla_clustering_fuel",
+           &mark_for_compilation_flags->tf_xla_clustering_fuel,
+           "Places an artificial limit on the number of ops marked as "
+           "eligible for clustering."),
+      Flag("tf_xla_fusion_only",
+           &mark_for_compilation_flags->tf_xla_fusion_only,
+           "enable fusion of element-wise operations only using XLA when "
+           "global_jit_level is ON*.")};
+  flag_list->insert(flag_list->end(), new_flags.begin(), new_flags.end());
+}
+
+void AllocateAndParseFlags() {
+  build_ops_flags = new BuildXlaOpsPassFlags;
+  build_ops_flags->tf_xla_enable_lazy_compilation = true;
+
+  dump_graph_flags = new DumpGraphFlags;
+  dump_graph_flags->tf_dump_graph_prefix = "/tmp/";
+
+  mark_for_compilation_flags = new MarkForCompilationPassFlags;
+  mark_for_compilation_flags->tf_xla_auto_jit = 0;
+  mark_for_compilation_flags->tf_xla_min_cluster_size = 2;
+  mark_for_compilation_flags->tf_xla_max_cluster_size =
+      std::numeric_limits<int32>::max();
+  mark_for_compilation_flags->tf_xla_clustering_debug = false;
+  mark_for_compilation_flags->tf_xla_cpu_global_jit = false;
+  mark_for_compilation_flags->tf_xla_clustering_fuel =
+      std::numeric_limits<int64>::max();
+  mark_for_compilation_flags->tf_xla_fusion_only = false;
+
+  device_flags = new XlaDeviceFlags;
+  device_flags->tf_xla_compile_on_demand = false;
+
+  ops_flags = new XlaOpsCommonFlags;
+  ops_flags->tf_xla_always_defer_compilation = false;
+
+  flag_list = new std::vector<Flag>({
+      Flag("tf_xla_enable_lazy_compilation",
+           &build_ops_flags->tf_xla_enable_lazy_compilation, ""),
+
+      Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand,
+           "Switch a device into 'on-demand' mode, where instead of "
+           "autoclustering ops are compiled one by one just-in-time."),
+
+      Flag("tf_xla_always_defer_compilation",
+           &ops_flags->tf_xla_always_defer_compilation, ""),
+  });
+  AppendDumpGraphFlagsInternal(flag_list);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+  xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", *flag_list);
+}
+
+}  // namespace
+
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *build_ops_flags;
+}
+
+DumpGraphFlags* GetDumpGraphFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return dump_graph_flags;
+}
+
+MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return mark_for_compilation_flags;
+}
+
+XlaDeviceFlags* GetXlaDeviceFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return device_flags;
+}
+
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags() {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  return *ops_flags;
+}
+
+void AppendMarkForCompilationPassFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendMarkForCompilationPassFlagsInternal(flag_list);
+}
+
+void AppendDumpGraphFlags(std::vector<Flag>* flag_list) {
+  std::call_once(flags_init, &AllocateAndParseFlags);
+  AppendDumpGraphFlagsInternal(flag_list);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h b/tensorflow/compiler/jit/flags.h
similarity index 57%
rename from tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
rename to tensorflow/compiler/jit/flags.h
index 79b47357a179d2d9e0d1b6bf9c9f814288bcd5e1..5ddea588eef5270880d91623dc05893da265960a 100644
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h
+++ b/tensorflow/compiler/jit/flags.h
@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
+#ifndef TENSORFLOW_COMPILER_JIT_FLAGS_H_
+#define TENSORFLOW_COMPILER_JIT_FLAGS_H_
 
 #include <vector>
 
@@ -24,15 +22,8 @@ limitations under the License.
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(
-    std::vector<tensorflow::Flag>* flag_list);
 
-// The values of flags associated with the XLA bridge's
-// mark_for_compilation_pass module.
+// Flags associated with the XLA bridge's mark_for_compilation_pass module.
 struct MarkForCompilationPassFlags {
   int32 tf_xla_auto_jit;  // Control compilation of operators into XLA
                           // computations on CPU and GPU devices.  0 = use
@@ -57,12 +48,56 @@ struct MarkForCompilationPassFlags {
                             // only using XLA.
 };
 
-// Return a pointer to the MarkForCompilationPassFlags struct;
+// Flags associated with the XLA bridge's xla_device module.
+struct XlaDeviceFlags {
+  // Switch the CPU device into "on-demand" mode, where instead of
+  // autoclustering ops are compiled one by one just-in-time.
+  // Enabling this mode by a legacy flag is a temporary mechanism. When this
+  // feature is battle-tested, we will switch this to be a session option.
+  bool tf_xla_compile_on_demand;
+};
+
+// Flags common to the _Xla* ops and their kernels.
+struct XlaOpsCommonFlags {
+  // If true, _XlaCompile always refuses to compile the cluster, which means the
+  // XLA clusters always run in the TF executor.  Defaults to false.
+  bool tf_xla_always_defer_compilation;
+};
+
+// Flags for the build_xla_ops pass.
+struct BuildXlaOpsPassFlags {
+  // Enables lazy compilation for TF/XLA (only when auto-clustering) if true.
+  // Defaults to true.
+  bool tf_xla_enable_lazy_compilation;
+};
+
+// Flags for the XLA bridge's dump_graph module.
+struct DumpGraphFlags {
+  // Path prefix to which graphs dumped during debugging should be written.
+  string tf_dump_graph_prefix;
+};
+
+// Return a pointer to the DumpGraphFlags struct;
 // repeated calls return the same pointer.
 // This should be called only after Flags::Parse() has returned.
+
+// Getters for flags structs defined above.  The first call to any of these
+// parses TF_XLA_FLAGS for all of them.  Those functions which return a pointer
+// always return the same pointer.
 MarkForCompilationPassFlags* GetMarkForCompilationPassFlags();
+const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
+XlaDeviceFlags* GetXlaDeviceFlags();
+const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
+DumpGraphFlags* GetDumpGraphFlags();
+
+// Appends the flag definitions associated with
+// MarkForCompilationPassFlags/DumpGraphFlags to `flag_list`.
+//
+// Has the side-effect of parsing TF_XLA_FLAGS if that hasn't happened yet.
+void AppendMarkForCompilationPassFlags(
+    std::vector<tensorflow::Flag>* flag_list);
+void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
 
-}  // namespace legacy_flags
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_MARK_FOR_COMPILATION_PASS_FLAGS_H_
+#endif  // TENSORFLOW_COMPILER_JIT_FLAGS_H_
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
index d984ca15cb722821b2a466a90387a29cbc1d1097..ce53f70b79d97ab087fefe542920b33f883632a2 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass.cc
@@ -23,7 +23,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/math_ops.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/cc/ops/xla_ops.h"
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
@@ -208,8 +208,12 @@ Status ComputeSliceSize(const Scope& host_scope,
     DCHECK_EQ(slice_size.back().type(), DT_INT64);
   }
 
-  *size = ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
-                      ops::Const(host_scope.WithOpName("concat_axis"), 0));
+  // Trivial ConcatV2 nodes (with exactly one input) are disallowed.
+  *size =
+      slice_size.size() == 1
+          ? slice_size[0]
+          : ops::Concat(host_scope.WithOpName("slice_size"), slice_size,
+                        ops::Const(host_scope.WithOpName("concat_axis"), 0));
   return Status::OK();
 }
 
@@ -242,6 +246,9 @@ Status ConvertTensorFlowSliceToStaticShapedSlice(
                      .WithOpName("static_shaped_slice"),
                  slice_inputs_int64.input, slice_inputs_int64.begin, slice_size)
           .node();
+
+  TF_RETURN_IF_ERROR(main_scope.status());
+
   std::vector<string> compile_time_const_inputs;
   compile_time_const_inputs.push_back("size");
   (*result)->AddAttr(kXlaCompileTimeConstantInputsAttr,
@@ -284,49 +291,45 @@ Status RewriteSlice(Graph* g, Node* slice, const SliceInputs& slice_inputs,
   return Status::OK();
 }
 
-// If `n` is a slice we can rewrite to have a static shape (i.e. have the output
-// shape only depend on the "size" input) then returns the a SliceInputs
-// representing the inputs to `n`.  Otherwise returns nullopt.
-StatusOrOptional<SliceInputs> IsRewritableSlice(Node* n) {
+// Return true if `n` is a slice we can rewrite to have a static shape
+// (i.e. have the output shape only depend on the "size" input).
+xla::StatusOr<bool> IsRewritableSlice(Node* n) {
   if (n->type_string() != "Slice") {
-    return {absl::nullopt};
+    return false;
   }
 
   if (!GetXlaClusterForNode(*n).has_value()) {
     // There is no need to change slice ops outside XLA clusters.
-    return {absl::nullopt};
+    return false;
   }
 
   TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
                       GetSliceInputs(n));
   if (!slice_inputs.has_value()) {
-    return {absl::nullopt};
+    return false;
   }
 
   // If slice_size[i] < -1 for any i then executing the slice will throw an
   // error, and we don't do anything here.
-  bool slice_is_ok = absl::c_all_of(slice_inputs->size_as_vector,
-                                    [](int64 size_i) { return size_i >= -1; });
-  if (!slice_is_ok) {
-    return {absl::nullopt};
-  }
-
-  return slice_inputs;
+  return absl::c_all_of(slice_inputs->size_as_vector,
+                        [](int64 size_i) { return size_i >= -1; });
 }
 
 Status FindAndRewriteSlices(Graph* g, bool* changed) {
-  std::vector<std::pair<Node*, SliceInputs>> slices_to_rewrite;
+  std::vector<Node*> slices_to_rewrite;
   for (Node* n : g->nodes()) {
-    TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
-                        IsRewritableSlice(n));
-    if (slice_inputs.has_value()) {
-      slices_to_rewrite.push_back({n, std::move(*slice_inputs)});
+    TF_ASSIGN_OR_RETURN(bool is_rewritable, IsRewritableSlice(n));
+    if (is_rewritable) {
+      slices_to_rewrite.push_back(n);
     }
   }
 
-  for (const auto& pair : slices_to_rewrite) {
-    TF_RETURN_IF_ERROR(RewriteSlice(g, pair.first, pair.second,
-                                    *GetXlaClusterForNode(*pair.first)));
+  for (Node* n : slices_to_rewrite) {
+    TF_ASSIGN_OR_RETURN(absl::optional<SliceInputs> slice_inputs,
+                        GetSliceInputs(n));
+    TF_RET_CHECK(slice_inputs.has_value());
+    TF_RETURN_IF_ERROR(
+        RewriteSlice(g, n, *slice_inputs, *GetXlaClusterForNode(*n)));
   }
 
   if (!slices_to_rewrite.empty()) {
@@ -342,8 +345,7 @@ Status FindAndRewriteSlices(Graph* g, bool* changed) {
 
 Status IncreaseDynamismForAutoJitPass::Run(
     const GraphOptimizationPassOptions& options) {
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_clustering_debug) {
     dump_graph::DumpGraphToFile("before_increase_dynamism_for_auto_jit_pass",
                                 **options.graph, options.flib_def);
diff --git a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
index 0f6f612e967035f6af3e4aff2a499d5cedd018af..a2f1b831ad7605237e23c15cc43b337e06265553 100644
--- a/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
+++ b/tensorflow/compiler/jit/increase_dynamism_for_auto_jit_pass_test.cc
@@ -27,6 +27,7 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+using ::testing::_;
 using testing::matchers::AssignedDevice;
 using testing::matchers::Attr;
 using testing::matchers::Const;
@@ -142,6 +143,26 @@ TEST(SliceToDynamicSliceRewriteTest, Basic) {
   EXPECT_THAT(static_shaped_slice, m_dynamic_slice);
 }
 
+TEST(SliceToDynamicSliceRewriteTest, SliceFromVector) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size = ops::Const(root.WithOpName("size"), {-1});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  EXPECT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(result->nodes(), Not(Contains(NodeWith(Op("ConcatV2")))));
+}
+
 TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
@@ -166,18 +187,18 @@ TEST(SliceToDynamicSliceRewriteTest, ControlDependencePreserved) {
                        CtrlDeps(NodeWith(Op("Placeholder"), Name("control")))));
 }
 
+int64 ToInt64(int v) { return static_cast<int64>(v); }
+
 TEST(SliceToDynamicSliceRewriteTest, Int64Indices) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
                    .WithAssignedDevice(kDeviceName)
                    .WithXlaCluster("cluster_0");
 
-  auto to_int64 = [](int v) { return static_cast<int64>(v); };
-
   Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
   Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
   Output size =
-      ops::Const(root.WithOpName("size"), {to_int64(-1), to_int64(500)});
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(500)});
   Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
 
   std::unique_ptr<Graph> result;
@@ -252,13 +273,35 @@ TEST(SliceToDynamicSliceRewriteTest, DontRewriteSliceWithNonConstSize) {
                                     Attr(kXlaCompileTimeConstantInputsAttr)))));
 }
 
+TEST(SliceToDynamicSliceRewriteTest, ScalarSlice) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
+  Output size = ops::Const<int64>(root.WithOpName("size"), {});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(), "slice/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_THAT(static_shaped_slice,
+              NodeWith(Op("Slice"), Attr(kXlaCompileTimeConstantInputsAttr),
+                       Inputs(_, _, Out(NodeWith(Name(size.node()->name()))))));
+}
+
 TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
   Scope root = Scope::NewRootScope()
                    .ExitOnError()
                    .WithAssignedDevice(kDeviceName)
                    .WithXlaCluster("cluster_0");
 
-  auto to_int64 = [](int v) { return static_cast<int64>(v); };
+  auto ToInt64 = [](int v) { return static_cast<int64>(v); };
 
   Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
   Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT64);
@@ -271,7 +314,7 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
       ops::Slice(root.WithOpName("slice"), input, begin, size_placeholder);
 
   Output size =
-      ops::Const(root.WithOpName("size"), {{to_int64(-1)}, {to_int64(500)}});
+      ops::Const(root.WithOpName("size"), {{ToInt64(-1)}, {ToInt64(500)}});
   TF_ASSERT_OK(root.graph()->UpdateEdge(size.node(), 0, slice.node(), 2));
 
   std::unique_ptr<Graph> result;
@@ -281,5 +324,82 @@ TEST(SliceToDynamicSliceRewriteTest, IndicesNotVector) {
               Not(Contains(NodeWith(Op("Slice"),
                                     Attr(kXlaCompileTimeConstantInputsAttr)))));
 }
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceInput) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input = ops::Placeholder(root.WithOpName("input"), DT_FLOAT);
+  Output begin = ops::Placeholder(root.WithOpName("begin"), DT_INT32);
+  Output size_a = ops::Const(root.WithOpName("size_a"), {-1, 500});
+  Output slice = ops::Slice(root.WithOpName("slice"), input, begin, size_a);
+
+  Output size_b = ops::Const(root.WithOpName("size_a"), {-1, 200});
+  Output slice_with_slice_input = ops::Slice(
+      root.WithOpName("slice_with_slice_input"), slice, begin, size_b);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_input/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(Out(NodeWith(
+                     Op("Slice"),
+                     Name("slice/static_shaped_slice/static_shaped_slice"))),
+                 _, _)));
+}
+
+TEST(SliceToDynamicSliceRewriteTest, SliceWithSliceBegin) {
+  Scope root = Scope::NewRootScope()
+                   .ExitOnError()
+                   .WithAssignedDevice(kDeviceName)
+                   .WithXlaCluster("cluster_0");
+
+  Output input_float =
+      ops::Placeholder(root.WithOpName("input_float"), DT_FLOAT);
+  Output input_i64 = ops::Placeholder(root.WithOpName("input_i64"), DT_INT64);
+
+  Output begin_begin =
+      ops::Placeholder(root.WithOpName("begin_begin"), DT_INT32);
+  Output begin_size = ops::Const(root.WithOpName("begin_size"), {-1});
+  Output begin =
+      ops::Slice(root.WithOpName("begin"), input_i64, begin_begin, begin_size);
+
+  Output size =
+      ops::Const(root.WithOpName("size"), {ToInt64(-1), ToInt64(200)});
+  Output slice_with_slice_begin = ops::Slice(
+      root.WithOpName("slice_with_slice_begin"), input_float, begin, size);
+
+  std::unique_ptr<Graph> result;
+  TF_ASSERT_OK(IncreaseDynamismForAutoJit(root, &result));
+
+  Node* static_shaped_slice = testing::FindNodeByName(
+      result.get(),
+      "slice_with_slice_begin/static_shaped_slice/static_shaped_slice");
+  ASSERT_NE(static_shaped_slice, nullptr);
+  EXPECT_EQ(static_shaped_slice->output_type(0), DT_FLOAT)
+      << "Expected DT_FLOAT, was "
+      << DataType_Name(static_shaped_slice->output_type(0));
+  EXPECT_THAT(
+      static_shaped_slice,
+      NodeWith(
+          Op("Slice"),
+          Inputs(_,
+                 Out(NodeWith(
+                     Op("Slice"),
+                     Name("begin/static_shaped_slice/static_shaped_slice"))),
+                 _)));
+}
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD
index 830db9ebdd92608c375ad778eced833e26729325..0583774714c6db7a2fa515fc8a0d304e1898db97 100644
--- a/tensorflow/compiler/jit/kernels/BUILD
+++ b/tensorflow/compiler/jit/kernels/BUILD
@@ -12,10 +12,10 @@ cc_library(
     hdrs = ["xla_ops.h"],
     deps = [
         "//tensorflow/compiler/jit:common",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:xla_compilation_cache",
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/jit:xla_launch_util",
-        "//tensorflow/compiler/jit/legacy_flags:xla_ops_common_flags",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:tf2xla_util",
         "//tensorflow/compiler/tf2xla:xla_compiler",
diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc
index 055de7afcc538a1a1183f3687d998a5b2211c887..ad71df5a694a5f8da94675049df1062a7edb6253 100644
--- a/tensorflow/compiler/jit/kernels/xla_ops.cc
+++ b/tensorflow/compiler/jit/kernels/xla_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "absl/container/flat_hash_map.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/jit/defs.h"
-#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/tf2xla_util.h"
 #include "tensorflow/compiler/tf2xla/xla_compiler.h"
@@ -418,7 +418,7 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) {
     cannot_compile_cluster = cannot_compile_cluster_;
   }
 
-  if (legacy_flags::GetXlaOpsCommonFlags().tf_xla_always_defer_compilation ||
+  if (GetXlaOpsCommonFlags().tf_xla_always_defer_compilation ||
       cannot_compile_cluster) {
     executable = nullptr;
   } else {
diff --git a/tensorflow/compiler/jit/legacy_flags/BUILD b/tensorflow/compiler/jit/legacy_flags/BUILD
deleted file mode 100644
index 5fa6c85f06f863f5d18bc4939ffa0ae820d222bd..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/BUILD
+++ /dev/null
@@ -1,65 +0,0 @@
-# Legacy command line flags for the XLA bridge libraries.
-
-# Please do not add more flags to this package.
-
-# The XLA bridge libraries were written in an environment that allowed
-# command-line flags to be scattered freely throughout the libraries.  This
-# model, while initially convenient, leads to a proliferation in unused command
-# line flags in tests and binaries, and serious problems in servers, where one
-# might wish parameters to be different in independent RPC calls to the same
-# routine.
-#
-# Please don't add more flags.  If you're a library author, pass options and
-# parameters explicitly through the library's interface.
-
-licenses(["notice"])  # Apache 2.0
-
-package(default_visibility = ["//tensorflow:internal"])
-
-cc_library(
-    name = "mark_for_compilation_pass_flags",
-    srcs = ["mark_for_compilation_pass_flags.cc"],
-    hdrs = ["mark_for_compilation_pass_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "xla_device_flags",
-    srcs = ["xla_device_flags.cc"],
-    hdrs = ["xla_device_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "build_xla_ops_pass_flags",
-    srcs = ["build_xla_ops_pass_flags.cc"],
-    hdrs = ["build_xla_ops_pass_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
-
-cc_library(
-    name = "xla_ops_common_flags",
-    srcs = ["xla_ops_common_flags.cc"],
-    hdrs = ["xla_ops_common_flags.h"],
-    deps =
-        [
-            "//tensorflow/compiler/xla:parse_flags_from_env",
-            "//tensorflow/core:framework_internal",
-            "//tensorflow/core:lib",
-        ],
-)
diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc
deleted file mode 100644
index 961c17c17eac891261530ef25baaa50f8496c331..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <mutex>  // NOLINT
-
-#include "tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h"
-#include "tensorflow/compiler/xla/parse_flags_from_env.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-namespace {
-
-BuildXlaOpsPassFlags* flags;
-std::vector<Flag>* flag_list;
-std::once_flag flags_init;
-
-void AllocateAndParseFlags() {
-  flags = new BuildXlaOpsPassFlags;
-  flags->tf_xla_enable_lazy_compilation = true;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_enable_lazy_compilation",
-           &flags->tf_xla_enable_lazy_compilation, ""),
-  });
-  xla::ParseFlagsFromEnv(*flag_list);
-}
-
-}  // namespace
-
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags() {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  return *flags;
-}
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h b/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h
deleted file mode 100644
index 9aa5cf64d6db56ae36875ca08d2ae88c73604733..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/build_xla_ops_pass_flags.h
+++ /dev/null
@@ -1,37 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Flags for the build_xla_ops pass.
-struct BuildXlaOpsPassFlags {
-  // Enables lazy compilation for TF/XLA (only when auto-clustering) if true.
-  // Defaults to true.
-  bool tf_xla_enable_lazy_compilation;
-};
-
-// Parses the flags in BuildXlaOpsPassFlags from the TF_XLA_FLAGS environment
-// variable and returns a reference to the parsed copy.  Parses TF_XLA_FLAGS
-// only the first time this routine is called.
-const BuildXlaOpsPassFlags& GetBuildXlaOpsPassFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_BUILD_XLA_OPS_PASS_FLAGS_H_
diff --git a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc b/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
deleted file mode 100644
index bad306e0b0a3061ba13dc69c08066c642667a2b9..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's mark_for_compilation_pass module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
-#include "tensorflow/compiler/xla/parse_flags_from_env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static MarkForCompilationPassFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new MarkForCompilationPassFlags;
-  flags->tf_xla_auto_jit = 0;
-  flags->tf_xla_min_cluster_size = 2;
-  flags->tf_xla_max_cluster_size = std::numeric_limits<int32>::max();
-  flags->tf_xla_clustering_debug = false;
-  flags->tf_xla_cpu_global_jit = false;
-  flags->tf_xla_clustering_fuel = std::numeric_limits<int64>::max();
-  flags->tf_xla_fusion_only = false;
-  flag_list = new std::vector<Flag>(
-      {Flag("tf_xla_auto_jit", &flags->tf_xla_auto_jit,
-            "Control compilation of operators into XLA computations on CPU and "
-            "GPU devices.  0 = use ConfigProto setting; -1 = off; 1 = on for "
-            "things very likely to be improved; 2 = on for everything.  "
-            "Experimental."),
-       Flag("tf_xla_min_cluster_size", &flags->tf_xla_min_cluster_size,
-            "Minimum number of operators in an XLA compilation. Ignored for "
-            "operators placed on an XLA device or operators explicitly marked "
-            "for compilation."),
-       Flag("tf_xla_max_cluster_size", &flags->tf_xla_max_cluster_size,
-            "Maximum number of operators in an XLA compilation."),
-       Flag("tf_xla_clustering_debug", &flags->tf_xla_clustering_debug,
-            "Dump graphs during XLA compilation."),
-       Flag("tf_xla_cpu_global_jit", &flags->tf_xla_cpu_global_jit,
-            "Enables global JIT compilation for CPU via SessionOptions."),
-       Flag("tf_xla_clustering_fuel", &flags->tf_xla_clustering_fuel,
-            "Places an artificial limit on the number of ops marked as "
-            "eligible for clustering."),
-       Flag("tf_xla_fusion_only", &flags->tf_xla_fusion_only,
-            "enable fusion of element-wise operations only using XLA when "
-            "global_jit_level is ON*.")});
-  xla::ParseFlagsFromEnv(*flag_list);
-
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << "Parsed MarkForCompilationPassFlags:";
-    VLOG(1) << "  tf_xla_auto_jit = " << flags->tf_xla_auto_jit;
-    VLOG(1) << "  tf_xla_min_cluster_size = " << flags->tf_xla_min_cluster_size;
-    VLOG(1) << "  tf_xla_max_cluster_size = " << flags->tf_xla_max_cluster_size;
-    VLOG(1) << "  tf_xla_clustering_debug = " << flags->tf_xla_clustering_debug;
-    VLOG(1) << "  tf_xla_cpu_global_jit = " << flags->tf_xla_cpu_global_jit;
-    VLOG(1) << "  tf_xla_clustering_fuel = " << flags->tf_xla_clustering_fuel;
-    VLOG(1) << "  tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
-  }
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// mark_for_compilation_pass module.
-void AppendMarkForCompilationPassFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the MarkForCompilationPassFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-MarkForCompilationPassFlags* GetMarkForCompilationPassFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
deleted file mode 100644
index 76b80d3034c8a13a1ddf1afe548d5c3d9c7b2cec..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.cc
+++ /dev/null
@@ -1,56 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
-#include "tensorflow/compiler/xla/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static XlaDeviceFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new XlaDeviceFlags;
-  flags->tf_xla_compile_on_demand = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_compile_on_demand", &flags->tf_xla_compile_on_demand,
-           "Switch a device into 'on-demand' mode, where instead of "
-           "autoclustering ops are compiled one by one just-in-time."),
-  });
-  xla::ParseFlagsFromEnv(*flag_list);
-}
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
deleted file mode 100644
index 27b22121ac1e089bd5d5a494e1e3fb60b05bc76d..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_device_flags.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
-
-// Legacy flags for the XLA bridge's xla_device module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// The values of flags associated with the XLA bridge's
-// xla_device module.
-typedef struct {
-  // Switch the CPU device into "on-demand" mode, where instead of
-  // autoclustering ops are compiled one by one just-in-time.
-  // Enabling this mode by a legacy flag is a temporary mechanism. When this
-  // feature is battle-tested, we will switch this to be a session option.
-  bool tf_xla_compile_on_demand;
-} XlaDeviceFlags;
-
-// Return a pointer to the XlaDeviceFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-XlaDeviceFlags* GetXlaDeviceFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_DEVICE_FLAGS_H_
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc
deleted file mode 100644
index 1443d48a734c0a44c1cd91d8d1218bdbed7f765c..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.cc
+++ /dev/null
@@ -1,52 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <mutex>  // NOLINT
-#include <vector>
-
-#include "tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h"
-#include "tensorflow/compiler/xla/parse_flags_from_env.h"
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-XlaOpsCommonFlags* flags;
-std::vector<Flag>* flag_list;
-std::once_flag flags_init;
-
-void AllocateAndParseFlags() {
-  flags = new XlaOpsCommonFlags;
-  flags->tf_xla_always_defer_compilation = false;
-  flag_list = new std::vector<Flag>({
-      Flag("tf_xla_always_defer_compilation",
-           &flags->tf_xla_always_defer_compilation, ""),
-  });
-  xla::ParseFlagsFromEnv(*flag_list);
-
-  if (VLOG_IS_ON(1)) {
-    VLOG(1) << "Parsed XlaOpsCommonFlags:";
-    VLOG(1) << "  tf_xla_always_defer_compilation = "
-            << flags->tf_xla_always_defer_compilation;
-  }
-}
-
-const XlaOpsCommonFlags& GetXlaOpsCommonFlags() {
-  std::call_once(flags_init, &AllocateAndParseFlags);
-  return *flags;
-}
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h b/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h
deleted file mode 100644
index 7c5c1818ef2d1dcf38c324a2c926db9c4bfa8ef5..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/legacy_flags/xla_ops_common_flags.h
+++ /dev/null
@@ -1,36 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
-#define TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Flags common to the _Xla* ops and their kernels.
-struct XlaOpsCommonFlags {
-  // If true, _XlaCompile always refuses to compile the cluster, which means the
-  // XLA clusters always run in the TF executor.  Defaults to false.
-  bool tf_xla_always_defer_compilation;
-};
-
-// Parses the flags in XlaOpsCommonFlags from the TF_XLA_FLAGS environment
-// variable and returns a reference to the parsed copy.  Parses TF_XLA_FLAGS
-// only the first time this routine is called.
-const XlaOpsCommonFlags& GetXlaOpsCommonFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_LEGACY_FLAGS_XLA_OPS_COMMON_FLAGS_H_
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
index 70033cae0afacb6a25598ee1abf2aeb2721e7496..6618e3a58ab7b6374ed775cd6e4e18a6a4975588 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc
@@ -24,8 +24,8 @@ limitations under the License.
 #include "absl/container/flat_hash_set.h"
 #include "tensorflow/compiler/jit/deadness_analysis.h"
 #include "tensorflow/compiler/jit/defs.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/graphcycles/graphcycles.h"
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
 #include "tensorflow/compiler/jit/union_find.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/const_analysis.h"
@@ -72,6 +72,11 @@ struct OperationFilter {
   // to resort to a dummy implementation. Currently Assert and CheckNumerics ops
   // have dummy XLA implementations.
   bool allow_dummy_ops;
+
+  // Whether ops that produce or consume DT_VARIANT values are allowed.  We
+  // don't auto-cluster these ops because we don't yet support live-in or
+  // live-out DT_VARIANT values.
+  bool allow_ops_producing_or_consuming_variant;
 };
 
 bool IsDummyImplOp(absl::string_view op_name) {
@@ -81,7 +86,13 @@ bool IsDummyImplOp(absl::string_view op_name) {
 bool IsStatefulRandomOp(absl::string_view op_name) {
   return op_name == "RandomUniform" || op_name == "RandomShuffle" ||
          op_name == "RandomUniformInt" || op_name == "RandomStandardNormal" ||
-         op_name == "TruncatedNormal";
+         op_name == "TruncatedNormal" || op_name == "Multinomial";
+}
+
+bool OpProducesOrConsumesVariant(const Node& node) {
+  auto is_variant = [](DataType dtype) { return dtype == DT_VARIANT; };
+  return absl::c_any_of(node.input_types(), is_variant) ||
+         absl::c_any_of(node.output_types(), is_variant);
 }
 
 bool HasXLAKernel(const Node& node, const DeviceType& jit_device_type) {
@@ -246,6 +257,10 @@ bool IsCompilableCall(const NodeDef& call_def,
     if (!op_filter.allow_dummy_ops && IsDummyImplOp(node->type_string())) {
       return false;
     }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      return false;
+    }
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, op_filter, depth + 1,
                           lib_runtime)) {
@@ -427,8 +442,7 @@ Status FindCompilationCandidates(
       BackwardsConstAnalysis(graph, /*compile_time_const_arg_indices=*/nullptr,
                              &compile_time_const_nodes));
 
-  int64& fuel =
-      legacy_flags::GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
+  int64& fuel = GetMarkForCompilationPassFlags()->tf_xla_clustering_fuel;
 
   // Iterate over nodes in sorted order so that compiler fuel is deterministic.
   // We can't simply pass op_nodes().begin() and op_nodes().end to the
@@ -471,16 +485,15 @@ Status FindCompilationCandidates(
         XlaOpRegistry::GetCompilationDevice(device_type.type(), &registration));
     DeviceType jit_device_type(registration->compilation_device_name);
 
+    bool always_auto_cluster = registration->autoclustering_policy ==
+                               XlaOpRegistry::AutoclusteringPolicy::kAlways;
+
     OperationFilter op_filter;
     op_filter.allow_resource_ops = registration->compile_resource_ops;
-    op_filter.allow_stateful_rng_ops =
-        (registration->autoclustering_policy ==
-         XlaOpRegistry::AutoclusteringPolicy::kAlways);
-    op_filter.allow_control_trigger =
-        (registration->autoclustering_policy ==
-         XlaOpRegistry::AutoclusteringPolicy::kAlways);
-    op_filter.allow_dummy_ops = (registration->autoclustering_policy ==
-                                 XlaOpRegistry::AutoclusteringPolicy::kAlways);
+    op_filter.allow_stateful_rng_ops = always_auto_cluster;
+    op_filter.allow_control_trigger = always_auto_cluster;
+    op_filter.allow_dummy_ops = always_auto_cluster;
+    op_filter.allow_ops_producing_or_consuming_variant = always_auto_cluster;
 
     if (!HasXLAKernel(*node, jit_device_type) &&
         !IsCompilableCall(node->def(), jit_device_type, op_filter, 0,
@@ -504,6 +517,12 @@ Status FindCompilationCandidates(
               << node->type_string() << ")";
       continue;
     }
+    if (!op_filter.allow_ops_producing_or_consuming_variant &&
+        OpProducesOrConsumesVariant(*node)) {
+      VLOG(2) << "Rejecting " << node->name()
+              << ": produces or consumes DT_VARIANT";
+      continue;
+    }
 
     if (!op_filter.allow_resource_ops &&
         (HasResourceOutput(*node) || IsNonResourceVarResourceOp(*node))) {
@@ -607,8 +626,7 @@ OptimizerOptions::GlobalJitLevel GetGlobalJitLevel(
     // To set compilation to be on by default, change the following line.
     global_jit_level = OptimizerOptions::OFF;
   }
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   if (flags->tf_xla_auto_jit == -1 ||
       (1 <= flags->tf_xla_auto_jit && flags->tf_xla_auto_jit <= 2)) {
     // If the flag tf_xla_auto_jit is a valid, non-zero setting, it overrides
@@ -641,6 +659,7 @@ bool IsCompilable(FunctionLibraryRuntime* flr, const NodeDef& ndef) {
   op_filter.allow_stateful_rng_ops = true;
   op_filter.allow_control_trigger = true;
   op_filter.allow_dummy_ops = true;
+  op_filter.allow_ops_producing_or_consuming_variant = true;
 
   return IsCompilableCall(ndef, jit_device_type, op_filter, 0, flr);
 }
@@ -651,8 +670,7 @@ Status MarkForCompilationPass::Run(
   // device ahead of time.
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
   bool fusion_only = flags->tf_xla_fusion_only;
 
   VLOG(1) << "flags->tf_xla_fusion_only = " << flags->tf_xla_fusion_only;
@@ -953,8 +971,7 @@ Status MarkForCompilationPass::RunImpl(
 
   OptimizerOptions::GlobalJitLevel global_jit_level =
       GetGlobalJitLevel(options);
-  legacy_flags::MarkForCompilationPassFlags* flags =
-      legacy_flags::GetMarkForCompilationPassFlags();
+  MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
 
   // Repeatedly contract edges between clusters that are on the same device,
   // provided the contraction would not create a cycle.
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
index 24d78c077268f83cebbdafddc1a658ae8dc6b8d8..bf2c5508ea9e987e80093f4c2e15d3ff5191126f 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/control_flow_ops_internal.h"
 #include "tensorflow/cc/ops/function_ops.h"
+#include "tensorflow/cc/ops/list_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
@@ -1147,5 +1148,80 @@ TEST(XlaCompilationTest, DontAutoClusterDummyOps) {
   EXPECT_EQ(clusters["test/check"], "");
 }
 
+TEST(XlaCompilationTest, DontAutoClusterOpsProducingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_reserve"], "");
+}
+
+TEST(XlaCompilationTest, DontAutoClusterOpsConsumingVariant) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output dummy_input =
+      ops::Placeholder(root.WithOpName("test/dummy_input"), DT_INT64);
+  Output variant_input =
+      ops::Placeholder(root.WithOpName("test/variant_input"), DT_VARIANT);
+
+  // Create one more node so that we don't avoid creating a cluster solely
+  // because it would be trivial.
+  Output dummy_cast =
+      ops::Cast(root.WithOpName("test/dummy_cast"), dummy_input, DT_INT32);
+
+  Output tensor_list_element_shape = ops::TensorListElementShape(
+      root.WithOpName("test/tensor_list_element_shape"), variant_input,
+      DT_INT32);
+
+  root.graph()->AddControlEdge(dummy_cast.node(),
+                               tensor_list_element_shape.node());
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_EQ(clusters["test/tensor_list_element_shape"], "");
+}
+
+TEST(XlaCompilationTest, ClusterOpsProducingVariantIfOnXlaDevice) {
+  Scope root = Scope::NewRootScope().ExitOnError();
+  Output a = ops::Placeholder(root.WithOpName("test/a"), DT_INT64);
+  Output b = ops::Placeholder(root.WithOpName("test/b"), DT_INT64);
+
+  Output cast_a = ops::Cast(root.WithOpName("test/cast_a"), a, DT_INT32);
+  Output cast_b = ops::Cast(root.WithOpName("test/cast_b"), b, DT_INT32);
+
+  Output tensor_list_reserve = ops::TensorListReserve(
+      root.WithOpName("test/tensor_list_reserve"), cast_a, cast_b, DT_FLOAT);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(root.ToGraph(graph.get()));
+
+  string xla_cpu_device = "/job:worker/replica:0/task:0/device:XLA_CPU:0";
+  for (Node* n : graph->nodes()) {
+    if (absl::StartsWith(n->name(), /*prefix=*/"test/")) {
+      n->set_assigned_device_name(xla_cpu_device);
+    }
+  }
+
+  TF_ASSERT_OK(MarkForCompilationPassTestHelper::MarkForCompilation(&graph));
+
+  std::unordered_map<string, string> clusters = GetClusters(*graph);
+  EXPECT_NE(clusters["test/tensor_list_reserve"], "");
+}
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
index d56d0f8ccfcdab40003be38059228cb255921b64..64a3301745790132fe3149bf8fb52d6c45ecc3c1 100644
--- a/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
+++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test_helper.cc
@@ -34,15 +34,9 @@ namespace tensorflow {
   //
   // It may be worth refactoring out XlaOpRegistry::RegisterCompilationDevice to
   // make this more direct, but probably not worth it solely for this test.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(*session_options, "", &devices));
 
-  auto delete_devices = gtl::MakeCleanup([&] {
-    for (Device* d : devices) {
-      delete d;
-    }
-  });
-
   GraphOptimizationPassOptions opt_options;
   opt_options.graph = graph;
   opt_options.session_options = session_options;
diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD
index f72224545b25bc7100e0b6788e6fbf0a7ca63dad..64409d9334751e0edfce9091a4e5697dd2c712c5 100644
--- a/tensorflow/compiler/jit/ops/BUILD
+++ b/tensorflow/compiler/jit/ops/BUILD
@@ -18,3 +18,9 @@ tf_gen_op_wrapper_py(
     out = "xla_ops.py",
     deps = ["//tensorflow/compiler/jit/ops:xla_ops"],
 )
+
+py_library(
+    name = "xla_ops_grad",
+    srcs = ["xla_ops_grad.py"],
+    deps = ["//tensorflow/python:framework_ops"],
+)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn.py b/tensorflow/compiler/jit/ops/xla_ops_grad.py
similarity index 62%
rename from tensorflow/contrib/estimator/python/estimator/dnn.py
rename to tensorflow/compiler/jit/ops/xla_ops_grad.py
index 10f657df8de64cc96f0cf04f434a77df66629dca..2d31d8dc714307a48932d061fb1af643940a0872 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn.py
+++ b/tensorflow/compiler/jit/ops/xla_ops_grad.py
@@ -1,3 +1,4 @@
+"""Gradients for XLA ops."""
 # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -12,21 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""dnn python module.
-
-Importing from tensorflow.python.estimator is unsupported
-and will soon break!
-"""
-# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow_estimator.contrib.estimator.python.estimator import dnn
+from tensorflow.python.framework import ops
 
-# Include attrs that start with single underscore.
-_HAS_DYNAMIC_ATTRIBUTES = True
-dnn.__all__ = [s for s in dir(dnn) if not s.startswith('__')]
 
-from tensorflow_estimator.contrib.estimator.python.estimator.dnn import *
+@ops.RegisterGradient("XlaClusterOutput")
+def _XlaClusterOutputGrad(_, grad):
+  del grad  # unused
+  raise RuntimeError("Gradient computation of graph in xla.compile() is "
+                     "prohibited because it can cause performance degradation."
+                     "Please move gradient computation inside xla.compile().")
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc
index 36b345ecbff8d5f6ba3c241b9e164f677236c20d..42ea3926e16ae791dbe1bede3b8742383db7667c 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass.cc
@@ -26,6 +26,10 @@ limitations under the License.
 
 namespace tensorflow {
 namespace {
+
+bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
+
+namespace reduce_device_to_host_copies {
 Status FindNodesToDecluster(const Graph& graph,
                             absl::flat_hash_set<Node*>* result,
                             absl::Span<Node* const> post_order) {
@@ -140,8 +144,6 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) {
   return Status::OK();
 }
 
-bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
-
 // Clones nodes to outside their cluster to avoid device-to-host copies.  For
 // instance, converts this:
 //
@@ -168,7 +170,7 @@ bool NotBackedge(const Edge& edge) { return !edge.src()->IsNextIteration(); }
 // where the ===> arrow has a hostmem source and destination and would entail a
 // device to host copy if the source and destination were not in the same XLA
 // cluster.
-Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   // When deciding whether to decluster a particular node, we base our decision
   // on if we've decided that some of its consumers have to be declustered too.
   // Iterating the graph in post-order guarantees that consumers have been
@@ -206,7 +208,9 @@ Status PartiallyDeclusterToRemoveDeviceToHostCopies(Graph* graph) {
 
   return Status::OK();
 }
+}  // namespace reduce_device_to_host_copies
 
+namespace reduce_recompilation {
 bool IsIntraClusterEdge(const Edge& edge) {
   absl::optional<absl::string_view> src_cluster_name =
       GetXlaClusterForNode(*edge.src());
@@ -269,7 +273,7 @@ Status MustCompileNode(const Node* n, bool* must_compile) {
 // regress performance in any significant manner.  We will have to revisit this
 // algorith with a more complex cost model if this assumption turns out to be
 // incorrect.
-Status DeclusterNodesToReduceRecompilations(Graph* graph) {
+Status PartiallyDeclusterGraph(Graph* graph) {
   std::vector<bool> compile_time_const_nodes(graph->num_node_ids());
   TF_RETURN_IF_ERROR(BackwardsConstAnalysis(
       *graph, nullptr, &compile_time_const_nodes, IsIntraClusterEdge));
@@ -322,7 +326,7 @@ Status DeclusterNodesToReduceRecompilations(Graph* graph) {
 
   return Status::OK();
 }
-
+}  // namespace reduce_recompilation
 }  // namespace
 
 Status PartiallyDeclusterPass::Run(
@@ -334,8 +338,9 @@ Status PartiallyDeclusterPass::Run(
 
   Graph* graph = options.graph->get();
 
-  TF_RETURN_IF_ERROR(PartiallyDeclusterToRemoveDeviceToHostCopies(graph));
-  TF_RETURN_IF_ERROR(DeclusterNodesToReduceRecompilations(graph));
+  TF_RETURN_IF_ERROR(
+      reduce_device_to_host_copies::PartiallyDeclusterGraph(graph));
+  TF_RETURN_IF_ERROR(reduce_recompilation::PartiallyDeclusterGraph(graph));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/partially_decluster_pass_test.cc b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
index 1fc5da5071f7aa6f6dd6636aacd60e33c12431a6..38a54cc5efae35ad77b6dc8039c653e920cfc071 100644
--- a/tensorflow/compiler/jit/partially_decluster_pass_test.cc
+++ b/tensorflow/compiler/jit/partially_decluster_pass_test.cc
@@ -386,7 +386,7 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) {
   TF_ASSERT_OK(s.ToGraph(graph.get()));
 
   // This is needed to register the XLA_GPU device.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_ASSERT_OK(DeviceFactory::AddDevices(
       SessionOptions(), "/job:localhost/replica:0/task:0", &devices));
 
@@ -400,10 +400,6 @@ TEST(PartiallyDeclusterPassTest, DontDeclusterXlaDeviceOps) {
   TF_ASSERT_OK(PartiallyDecluster(&graph));
 
   EXPECT_EQ(GetXlaClusterForNode(*n), "cluster_0");
-
-  for (Device* d : devices) {
-    delete d;
-  }
 }
 
 TEST(PartiallyDeclusterPassTest, DontDeclusterNonTensorFlowOps) {
diff --git a/tensorflow/compiler/jit/producer_consumer_queue.h b/tensorflow/compiler/jit/producer_consumer_queue.h
deleted file mode 100644
index 7c8c04152d2f3a0fd46711df24756b7e68b967ea..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue.h
+++ /dev/null
@@ -1,132 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-#define TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
-
-#include <deque>
-#include "tensorflow/core/platform/logging.h"
-#include "tensorflow/core/platform/mutex.h"
-
-namespace tensorflow {
-
-// A thread-safe, first-in-first-out queue.
-template <typename T>
-class ProducerConsumerQueue {
- public:
-  ProducerConsumerQueue()
-      : capacity_(std::numeric_limits<std::size_t>::max()) {}
-  ~ProducerConsumerQueue() = default;
-
-  // Wait until the queue is non-full, then append a copy of v.
-  void Put(const T &v);
-
-  // Wait until the queue is non-empty, then remove and return the head value.
-  T Get();
-
-  // If the queue is non-empty, remove the head value, placing it in *pv, and
-  // return true; otherwise return false.
-  bool TryGet(T *pv);
-
-  // Set the capacity of the queue; the queue is full whenever count() >=
-  // capacity().  The initial value is the maximum size_t.  Requires size > 0.
-  void set_capacity(std::size_t size);
-
-  // Return the capacity of the queue.
-  std::size_t capacity() const;
-
-  // Return the number of elements in the queue.
-  std::size_t count() const;
-
-  // Implementation details follow.  Clients should ignore.
- private:
-  mutable tensorflow::mutex mu_;  // protects all fields below
-  tensorflow::condition_variable non_empty_ GUARDED_BY(mu_);
-  tensorflow::condition_variable non_full_ GUARDED_BY(mu_);
-  std::size_t capacity_ GUARDED_BY(mu_);
-  std::deque<T> queue_ GUARDED_BY(mu_);
-
-  TF_DISALLOW_COPY_AND_ASSIGN(ProducerConsumerQueue);
-};
-
-// ------------------------------------------------------
-// Implementation details follow.  Clients should ignore.
-
-// Wait until the queue is non-full, then append a copy of v.
-template <typename T>
-void ProducerConsumerQueue<T>::Put(const T &v) {
-  mutex_lock lock(mu_);
-  while (queue_.size() >= capacity_) {
-    non_full_.wait(lock);
-  }
-  queue_.push_back(v);
-  non_empty_.notify_one();
-}
-
-// Wait until the queue is non-empty, then remove and return the head value.
-template <typename T>
-T ProducerConsumerQueue<T>::Get() {
-  mutex_lock lock(mu_);
-  while (queue_.empty()) {
-    non_empty_.wait(lock);
-  }
-  non_full_.notify_one();
-  T result_value = queue_.front();
-  queue_.pop_front();
-  return result_value;
-}
-
-// If the queue is non-empty, remove the head value, placing it in *pv, and
-// return true; otherwise return false.
-template <typename T>
-bool ProducerConsumerQueue<T>::TryGet(T *pv) {
-  mutex_lock lock(mu_);
-  bool got_element = !queue_.empty();
-  if (got_element) {
-    non_full_.notify_one();
-    *pv = queue_.front();
-    queue_.pop_front();
-  }
-  return got_element;
-}
-
-// Set the capacity of the queue; the queue is full whenever count() >=
-// capacity().  The initial value is the maximum size_t.  Requires size > 0.
-template <typename T>
-void ProducerConsumerQueue<T>::set_capacity(std::size_t size) {
-  mutex_lock lock(mu_);
-  CHECK_NE(size, 0);
-  capacity_ = size;
-  non_full_.notify_all();
-}
-
-// Return the capacity of the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::capacity() const {
-  mutex_lock lock(mu_);
-  std::size_t max_elements = capacity_;
-  return max_elements;
-}
-
-// Return the number of elements in the queue.
-template <typename T>
-std::size_t ProducerConsumerQueue<T>::count() const {
-  mutex_lock lock(mu_);
-  std::size_t num_elements = queue_.size();
-  return num_elements;
-}
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_JIT_PRODUCER_CONSUMER_QUEUE_H_
diff --git a/tensorflow/compiler/jit/producer_consumer_queue_test.cc b/tensorflow/compiler/jit/producer_consumer_queue_test.cc
deleted file mode 100644
index f61260c6e52756ee039829afdc7452f5f760c221..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/jit/producer_consumer_queue_test.cc
+++ /dev/null
@@ -1,139 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/jit/producer_consumer_queue.h"
-
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
-#include "tensorflow/core/platform/test.h"
-
-namespace tensorflow {
-namespace {
-
-typedef ProducerConsumerQueue<int> IntQueue;
-
-// Insert integers between low inclusive and high exclusive into q.
-void PushRange(IntQueue *q, int low, int high) {
-  while (low != high) {
-    q->Put(low);
-    VLOG(2) << "Pushing " << low;
-    ++low;
-  }
-}
-
-// Push the numbers between 0 and 999 inclusive from several threads in the
-// pool.
-void PushRanges(IntQueue *queue, thread::ThreadPool *pool) {
-  VLOG(1) << "Adding 20-36";
-  pool->Schedule([queue] { PushRange(queue, 20, 36); });
-  VLOG(1) << "Adding 7-20";
-  pool->Schedule([queue] { PushRange(queue, 7, 20); });
-  VLOG(1) << "Adding 36-501";
-  pool->Schedule([queue] { PushRange(queue, 36, 501); });
-  VLOG(1) << "Adding 501-1000";
-  pool->Schedule([queue] { PushRange(queue, 501, 1000); });
-  VLOG(1) << "Adding 0-5";
-  pool->Schedule([queue] { PushRange(queue, 0, 5); });
-  VLOG(1) << "Adding 5-7";
-  pool->Schedule([queue] { PushRange(queue, 5, 7); });
-}
-
-// Pop elements from queue using Get().  Make sure that exactly <high> elements
-// were present and their values are all integers between 0 and high-1
-// inclusive.
-void GetRange(IntQueue *queue, int high) {
-  VLOG(1) << "Testing Wait";
-  std::vector<int> results;
-  for (int i = 0; i != high; ++i) {
-    int r = queue->Get();
-    VLOG(2) << "Waited and got " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK(results[i] == i);
-  }
-}
-
-// Pop elements from queue using TryGet().  Make sure that exactly <high>
-// elements were present and their values are all integers between 0 and high-1
-// inclusive.
-void TryGetRange(IntQueue *queue, int high) {
-  std::vector<int> results;
-  // Give up if we don't get all the elements back from the queue
-  // in 10 seconds.
-  int timeout = 10;
-  int r;
-  for (int i = 0; i != high; ++i) {
-    while (!queue->TryGet(&r)) {
-      if (!timeout--) {
-        LOG(FATAL) << "Can't find all elements in the queue";
-      }
-      VLOG(1) << "Sleeping for a second...";
-      sleep(1);
-    }
-    VLOG(2) << "Popped " << r;
-    results.push_back(r);
-  }
-  CHECK_EQ(queue->count(), 0);
-  CHECK(!queue->TryGet(&r));
-  std::sort(results.begin(), results.end());
-  for (int i = 0; i != high; ++i) {
-    CHECK_EQ(i, results[i]);
-  }
-}
-
-const int kNumThreads = 15;
-
-TEST(ProducerConsumerQueue, GetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  GetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, TryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    PushRanges(&queue, &pool);
-  }
-  TryGetRange(&queue, 1000);
-}
-
-TEST(ProducerConsumerQueue, ParallelGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { GetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-TEST(ProducerConsumerQueue, ParallelTryGetRange) {
-  IntQueue queue;
-  {
-    thread::ThreadPool pool(Env::Default(), "test", kNumThreads);
-    pool.Schedule([&queue] { TryGetRange(&queue, 1000); });
-    PushRanges(&queue, &pool);
-  }
-}
-
-}  // namespace
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
index 1fe612d43d10030675cf307b109e4dcc89cb2d79..c7e8d61d280a33a83c3386d8ef801018634d31ec 100644
--- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
+++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc
@@ -142,11 +142,22 @@ Status XlaCompileOnDemandOp::Compile(
         TF_RETURN_IF_ERROR(ctx->allocate_temp(
             device_tensor.dtype(), device_tensor.shape(), &host_tensor, attrs));
         Notification n;
+        Status status;
         ctx->op_device_context()->CopyDeviceTensorToCPU(
             &device_tensor, "ConstantArgument",
             reinterpret_cast<Device*>(ctx->device()), &host_tensor,
-            [&](Status status) { n.Notify(); });
+            [&](Status s) {
+              status = s;
+              n.Notify();
+            });
         n.WaitForNotification();
+        if (!status.ok()) {
+          LOG(ERROR) << "Copying tensor of shape "
+                     << device_tensor.shape().DebugString() << " from "
+                     << ctx->device()->name() << "to CPU failed with "
+                     << status.ToString();
+          return status;
+        }
         constant_arguments[i] = host_tensor;
       }
     }
@@ -189,6 +200,7 @@ Status XlaCompileOnDemandOp::Compile(
   std::map<int, OptionalTensor> variable_args = GetVariables(ctx);
 
   std::vector<XlaCompiler::Argument> args;
+
   TF_RETURN_IF_ERROR(XlaComputationLaunchContext::BuildXlaCompilerArguments(
       constant_arguments, variable_args, ctx, &args));
 
diff --git a/tensorflow/compiler/jit/xla_cpu_device.cc b/tensorflow/compiler/jit/xla_cpu_device.cc
index 116e0756036e722c13f27579aa0e0876d2e846a7..e9770647e7ba96cc1db026d12d5f11f52ce98d35 100644
--- a/tensorflow/compiler/jit/xla_cpu_device.cc
+++ b/tensorflow/compiler/jit/xla_cpu_device.cc
@@ -17,8 +17,8 @@ limitations under the License.
 // operators using XLA via the XLA "Host" (CPU) backend.
 
 #include "absl/memory/memory.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
-#include "tensorflow/compiler/jit/legacy_flags/xla_device_flags.h"
 #include "tensorflow/compiler/jit/xla_compile_on_demand_op.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -31,13 +31,13 @@ namespace tensorflow {
 class XlaCpuDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
-Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
-                                          const string& name_prefix,
-                                          std::vector<Device*>* devices) {
-  legacy_flags::XlaDeviceFlags* flags = legacy_flags::GetXlaDeviceFlags();
+Status XlaCpuDeviceFactory::CreateDevices(
+    const SessionOptions& session_options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
+  XlaDeviceFlags* flags = GetXlaDeviceFlags();
   bool compile_on_demand = flags->tf_xla_compile_on_demand;
 
   XlaOpRegistry::DeviceRegistration registration;
@@ -64,7 +64,18 @@ Status XlaCpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
   options.compilation_device_name = DEVICE_CPU_XLA_JIT;
   options.use_multiple_streams = false;
   auto device = absl::make_unique<XlaDevice>(session_options, options);
-  devices->push_back(device.release());
+
+  // Setting GpuDeviceInfo because eager runtime relies on the device
+  // context in tensorflow_gpu_device_info(). Also,
+  // tensorflow_gpu_device_info() == nullptr is used as an IsCPU test.
+  // We need XlaCpuDevice to be treated not as CPU because it allocates
+  // XlaTensors, not regular Tensors.
+  Status status = device->UseGpuDeviceInfo();
+  if (!status.ok()) {
+    errors::AppendToMessage(&status, "while setting up ", DEVICE_GPU_XLA_JIT);
+    return status;
+  }
+  devices->push_back(std::move(device));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc
index 5c1b55cb57f58387086ab9eaf924d0beffb43e18..4201ff91a89b1bee370e6a43337c51abe3bf974a 100644
--- a/tensorflow/compiler/jit/xla_device.cc
+++ b/tensorflow/compiler/jit/xla_device.cc
@@ -218,6 +218,9 @@ XlaDevice::XlaDevice(const SessionOptions& session_options,
 XlaDevice::~XlaDevice() {
   VLOG(1) << "Destroying XLA device " << jit_device_name_ << " " << this;
   mutex_lock lock(mu_);
+  while (outstanding_asynchronous_operations_ > 0) {
+    outstanding_asynchronous_operations_cv_.wait(lock);
+  }
   if (device_context_) {
     device_context_->Unref();
   }
@@ -384,6 +387,7 @@ void XlaDevice::ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
 
 Status XlaDevice::Sync() {
   VLOG(1) << "XlaDevice::Sync";
+  tracing::ScopedActivity activity("XlaDevice::Sync", /*is_expensive=*/true);
   std::shared_ptr<se::Stream> stream;
   {
     mutex_lock lock(mu_);
@@ -391,13 +395,46 @@ Status XlaDevice::Sync() {
   }
   if (!stream) return Status::OK();
 
-  if (!stream->parent()->SynchronizeAllActivity() || !stream->ok()) {
+  Status status = stream->BlockHostUntilDone();
+  {
+    mutex_lock lock(mu_);
+    while (outstanding_asynchronous_operations_ > 0) {
+      outstanding_asynchronous_operations_cv_.wait(lock);
+    }
+  }
+  TF_RETURN_IF_ERROR(status);
+  if (!stream->ok()) {
     return errors::Internal("XlaDevice::Sync() failed.");
   }
   VLOG(1) << "XlaDevice::Sync completed";
   return Status::OK();
 }
 
+void XlaDevice::Sync(const DoneCallback& done) {
+  VLOG(1) << "XlaDevice::Sync (asynchronous)";
+  std::shared_ptr<se::Stream> stream;
+  {
+    mutex_lock lock(mu_);
+    stream = stream_;
+  }
+  if (!stream) {
+    done(Status::OK());
+    return;
+  }
+
+  stream->ThenEnqueueOnBackgroundThread(
+      [this, stream, done](se::StreamExecutor*) {
+        tracing::ScopedActivity activity("XlaDevice::Sync::Callback",
+                                         /*is_expensive=*/true);
+        mutex_lock lock(mu_);
+        while (outstanding_asynchronous_operations_ > 0) {
+          outstanding_asynchronous_operations_cv_.wait(lock);
+        }
+        done(stream->ok() ? Status::OK()
+                          : errors::Internal("XlaDevice::Sync() failed."));
+      });
+}
+
 Status XlaDevice::MakeTensorFromProto(const TensorProto& tensor_proto,
                                       const AllocatorAttributes alloc_attrs,
                                       Tensor* tensor) {
@@ -441,6 +478,49 @@ bool XlaDevice::RequiresSyncOnCompletion() const {
   return sync_on_completion_;
 }
 
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice* device)
+    : device_(device) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::~AsynchronousOperationHandle() {
+  if (device_) {
+    mutex_lock lock(device_->mu_);
+    --device_->outstanding_asynchronous_operations_;
+    device_->outstanding_asynchronous_operations_cv_.notify_all();
+  }
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    const XlaDevice::AsynchronousOperationHandle& other)
+    : device_(other.device_) {
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+}
+
+XlaDevice::AsynchronousOperationHandle::AsynchronousOperationHandle(
+    XlaDevice::AsynchronousOperationHandle&& other)
+    : device_(other.device_) {
+  other.device_ = nullptr;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(const XlaDevice::AsynchronousOperationHandle& other) {
+  device_ = other.device_;
+  mutex_lock lock(device_->mu_);
+  ++device_->outstanding_asynchronous_operations_;
+  return *this;
+}
+
+XlaDevice::AsynchronousOperationHandle& XlaDevice::AsynchronousOperationHandle::
+operator=(XlaDevice::AsynchronousOperationHandle&& other) {
+  device_ = other.device_;
+  other.device_ = nullptr;
+  return *this;
+}
+
 XlaDeviceOpRegistrations* RegisterXlaDeviceKernels(const char* device,
                                                    const char* jit_device) {
   // Any op assigned to the device that isn't rewritten by the graph rewriter
diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h
index 49f53b477ef5508a23812453cb61e29a8d8b9379..c8bb276cdb9673fdcba4cc15a9f33ecd3ae96dbb 100644
--- a/tensorflow/compiler/jit/xla_device.h
+++ b/tensorflow/compiler/jit/xla_device.h
@@ -135,6 +135,7 @@ class XlaDevice : public LocalDevice {
   void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context,
                     AsyncOpKernel::DoneCallback done) override;
   Status Sync() override;
+  void Sync(const DoneCallback& done) override;
 
   Status FillContextMap(const Graph* graph,
                         DeviceContextMap* device_context_map) override
@@ -164,7 +165,30 @@ class XlaDevice : public LocalDevice {
 
   bool RequiresSyncOnCompletion() const override LOCKS_EXCLUDED(mu_);
 
+  // A simple RAII handle. On construction the device's
+  // outstanding_asynchronous_operations_ field is incremented; on destruction
+  // it is decremented.
+  class AsynchronousOperationHandle {
+   public:
+    AsynchronousOperationHandle(XlaDevice* device);
+    ~AsynchronousOperationHandle();
+    AsynchronousOperationHandle(const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle(AsynchronousOperationHandle&& other);
+    AsynchronousOperationHandle& operator=(
+        const AsynchronousOperationHandle& other);
+    AsynchronousOperationHandle& operator=(AsynchronousOperationHandle&& other);
+
+   private:
+    XlaDevice* device_ = nullptr;
+  };
+
+  AsynchronousOperationHandle CreateAsynchronousOperationHandle() {
+    return AsynchronousOperationHandle(this);
+  }
+
  private:
+  friend class AsynchronousOperationHandle;
+
   xla::LocalClient* client() const;
   Allocator* GetAllocatorLocked(AllocatorAttributes attr)
       EXCLUSIVE_LOCKS_REQUIRED(mu_);
@@ -227,6 +251,11 @@ class XlaDevice : public LocalDevice {
   // True if the device requires XlaDevice::Sync to be called on completion
   // regardless of status.
   bool sync_on_completion_ GUARDED_BY(mu_) = false;
+
+  // Count of outstanding asynchronous operations which must be zero on Sync()
+  // completion.
+  int64 outstanding_asynchronous_operations_ GUARDED_BY(mu_) = 0;
+  condition_variable outstanding_asynchronous_operations_cv_;
 };
 
 // Builds OpKernel registrations on 'device' for the JIT operators
diff --git a/tensorflow/compiler/jit/xla_device_ops.h b/tensorflow/compiler/jit/xla_device_ops.h
index adf0f994b84d9fbf918a5b2478aa7d106853e038..927f983ba9ef23c8509523f42366c0c89c29db9f 100644
--- a/tensorflow/compiler/jit/xla_device_ops.h
+++ b/tensorflow/compiler/jit/xla_device_ops.h
@@ -203,6 +203,8 @@ class XlaAssignVariableOp : public OpKernel {
                               .HostMemory("output")                            \
                               .TypeConstraint<ResourceHandle>("T"),            \
                           ArgOp);                                              \
+  REGISTER_KERNEL_BUILDER(                                                     \
+      Name(kArgOp).Device(DEVICE).TypeConstraint<Variant>("T"), ArgOp);        \
                                                                                \
   REGISTER_KERNEL_BUILDER(Name(kRetOp)                                         \
                               .Device(DEVICE)                                  \
diff --git a/tensorflow/compiler/jit/xla_gpu_device.cc b/tensorflow/compiler/jit/xla_gpu_device.cc
index 441970169581d53e0d8683b98d26712445b170ea..0191315a66f4d331e54fadc9dc6a073a05fd67ef 100644
--- a/tensorflow/compiler/jit/xla_gpu_device.cc
+++ b/tensorflow/compiler/jit/xla_gpu_device.cc
@@ -16,7 +16,10 @@ limitations under the License.
 // Registers the XLA_GPU device, which is an XlaDevice instantiation that runs
 // operators using XLA via the XLA "CUDA" (GPU) backend.
 
+#include <set>
 #include "absl/memory/memory.h"
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/compiler/jit/kernels/xla_ops.h"
 #include "tensorflow/compiler/jit/xla_device.h"
 #include "tensorflow/compiler/jit/xla_device_ops.h"
@@ -29,12 +32,12 @@ namespace tensorflow {
 class XlaGpuDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
-Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
-                                          const string& name_prefix,
-                                          std::vector<Device*>* devices) {
+Status XlaGpuDeviceFactory::CreateDevices(
+    const SessionOptions& session_options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   XlaOpRegistry::DeviceRegistration registration;
   registration.compilation_device_name = DEVICE_GPU_XLA_JIT;
   registration.autoclustering_policy =
@@ -52,8 +55,35 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
     VLOG(1) << "Failed to create XLA_GPU device: " << platform.status();
     return Status::OK();
   }
-
-  for (int i = 0; i < platform.ValueOrDie()->VisibleDeviceCount(); ++i) {
+  string allowed_gpus =
+      session_options.config.gpu_options().visible_device_list();
+  std::set<int> gpu_ids;
+  int num_visible_devices = platform.ValueOrDie()->VisibleDeviceCount();
+  if (allowed_gpus.empty()) {
+    for (int i = 0; i < num_visible_devices; ++i) {
+      gpu_ids.insert(i);
+    }
+  } else {
+    // For loop below is copied from gpu/gpu_device.cc. It validates
+    // the visible_device_list and populates gpu_ids set.
+    const std::vector<string> visible_devices =
+        absl::StrSplit(allowed_gpus, ',');
+    for (const string& platform_gpu_id_str : visible_devices) {
+      int32 platform_gpu_id;
+      if (!absl::SimpleAtoi(platform_gpu_id_str, &platform_gpu_id)) {
+        return errors::InvalidArgument(
+            "Could not parse entry in 'visible_device_list': '",
+            platform_gpu_id_str, "'. visible_device_list = ", allowed_gpus);
+      }
+      if (platform_gpu_id < 0 || platform_gpu_id >= num_visible_devices) {
+        return errors::InvalidArgument(
+            "'visible_device_list' listed an invalid GPU id '", platform_gpu_id,
+            "' but visible device count is ", num_visible_devices);
+      }
+      gpu_ids.insert(platform_gpu_id);
+    }
+  }
+  for (int i : gpu_ids) {
     XlaDevice::Options options;
     options.platform = platform.ValueOrDie();
     options.device_name_prefix = name_prefix;
@@ -70,7 +100,7 @@ Status XlaGpuDeviceFactory::CreateDevices(const SessionOptions& session_options,
       return status;
     }
 
-    devices->push_back(device.release());
+    devices->push_back(std::move(device));
   }
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_interpreter_device.cc b/tensorflow/compiler/jit/xla_interpreter_device.cc
index e828bae865d630bd40f227943cdabb2d8d95ca48..4007309ed1c57b663dca5bac0df11260bf1327f3 100644
--- a/tensorflow/compiler/jit/xla_interpreter_device.cc
+++ b/tensorflow/compiler/jit/xla_interpreter_device.cc
@@ -33,12 +33,12 @@ constexpr std::array<DataType, 9> kExecAllTypes = {
 class XlaInterpreterDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 };
 
 Status XlaInterpreterDeviceFactory::CreateDevices(
     const SessionOptions& session_options, const string& name_prefix,
-    std::vector<Device*>* devices) {
+    std::vector<std::unique_ptr<Device>>* devices) {
   static XlaDeviceOpRegistrations* registrations = RegisterXlaDeviceKernels(
       DEVICE_XLA_INTERPRETER, DEVICE_INTERPRETER_XLA_JIT);
   (void)registrations;
@@ -61,8 +61,7 @@ Status XlaInterpreterDeviceFactory::CreateDevices(
   options.device_ordinal = 0;
   options.compilation_device_name = DEVICE_INTERPRETER_XLA_JIT;
   options.use_multiple_streams = false;
-  auto device = absl::make_unique<XlaDevice>(session_options, options);
-  devices->push_back(device.release());
+  devices->push_back(absl::make_unique<XlaDevice>(session_options, options));
 
   return Status::OK();
 }
diff --git a/tensorflow/compiler/jit/xla_launch_util.h b/tensorflow/compiler/jit/xla_launch_util.h
index 437db019a0eabe66417725148d8b121842e90479..554227f09de0ab4d9e07f199b957657f3121ff06 100644
--- a/tensorflow/compiler/jit/xla_launch_util.h
+++ b/tensorflow/compiler/jit/xla_launch_util.h
@@ -199,19 +199,17 @@ class XlaTensorBuffer : public TensorBuffer {
  public:
   XlaTensorBuffer(const void* ptr, size_t expected_size, size_t actual_size,
                   Allocator* allocator)
-      : expected_size_(expected_size),
+      : TensorBuffer(const_cast<void*>(ptr)),
+        expected_size_(expected_size),
         actual_size_(actual_size),
-        allocator_(allocator) {
-    data_ = const_cast<void*>(ptr);
-  }
+        allocator_(allocator) {}
 
   ~XlaTensorBuffer() override {
-    if (data_) {
-      allocator_->DeallocateRaw(data_);
+    if (data()) {
+      allocator_->DeallocateRaw(data());
     }
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return expected_size_; }
 
   TensorBuffer* root_buffer() override { return this; }
@@ -231,7 +229,6 @@ class XlaTensorBuffer : public TensorBuffer {
   }
 
  private:
-  void* data_;
   size_t expected_size_;
   size_t actual_size_;
   Allocator* allocator_;
diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD
index 6b8e6bba1e1bbfd773141d33721e4d7e30420a11..093b61629cd0b04d5d8488139b8d7262b739f86d 100644
--- a/tensorflow/compiler/tests/BUILD
+++ b/tensorflow/compiler/tests/BUILD
@@ -375,27 +375,6 @@ tf_xla_py_test(
     ],
 )
 
-tf_xla_py_test(
-    name = "resampler_ops_test",
-    size = "small",
-    srcs = ["resampler_ops_test.py"],
-    disabled_backends = [
-        # TODO(b/74459949) Support BatchDot in CPU backend.
-        "cpu",
-        "cpu_ondemand",
-    ],
-    # TODO(b/112295522): figure out how to make OSS build pass.
-    tags = ["no_oss"],
-    deps = [
-        ":xla_test",
-        "//tensorflow/contrib/resampler:resampler_ops",
-        "//tensorflow/contrib/resampler:resampler_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform_test",
-    ],
-)
-
 tf_xla_py_test(
     name = "dynamic_stitch_test",
     size = "small",
@@ -429,13 +408,6 @@ tf_xla_py_test(
     name = "eager_test",
     size = "large",
     srcs = ["eager_test.py"],
-    disabled_backends = [
-        # TODO(b/78199195) Support XLA CPU devices in eager runtime
-        "cpu",
-        "cpu_ondemand",
-        # TODO(b/78468222) Enable GPU backend
-        "gpu",
-    ],
     deps = [
         ":xla_test",
         "//tensorflow/python:array_ops",
@@ -474,7 +446,6 @@ tf_xla_py_test(
         "//tensorflow/python:extra_py_tests_deps",
         "//tensorflow/python:framework",
         "//tensorflow/python:platform_test",
-        "//tensorflow/python:spectral_ops",
         "//tensorflow/python/ops/signal",
     ],
 )
diff --git a/tensorflow/compiler/tests/adagrad_da_test.py b/tensorflow/compiler/tests/adagrad_da_test.py
index 69fb3ec2964a09508e612515b9e291fc14121d68..e9c2d363acab96c0fb968cb7f901ce105ea8703e 100644
--- a/tensorflow/compiler/tests/adagrad_da_test.py
+++ b/tensorflow/compiler/tests/adagrad_da_test.py
@@ -50,8 +50,8 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
@@ -63,9 +63,9 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
         # For -0.1*3.0*(0.1 - 0)/(0 + sqrt(0.1 + 0.1*0.1)) = -0.904534
         # similarly for others.
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in self.float_types:
@@ -87,16 +87,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.904534, -1.603567]), var0.eval())
+            np.array([-0.904534, -1.603567]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.094821, -0.189358]), var1.eval())
+            np.array([-0.094821, -0.189358]), self.evaluate(var1))
 
   def testAdagradDAWithL1(self):
     for dtype in self.float_types:
@@ -118,16 +118,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.895489, -1.59555]), var0.eval())
+            np.array([-0.895489, -1.59555]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.085339, -0.17989]), var1.eval())
+            np.array([-0.085339, -0.17989]), self.evaluate(var1))
 
   def testAdagradDAWithL1_L2(self):
     for dtype in self.float_types:
@@ -149,16 +149,16 @@ class AdagradDAOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run a step of AdagradDA
         update.run()
 
         self.assertAllCloseAccordingToType(
-            np.array([-0.046907, -0.093659]), var0.eval())
+            np.array([-0.046907, -0.093659]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([-0.004275, -0.009023]), var1.eval())
+            np.array([-0.004275, -0.009023]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adagrad_test.py b/tensorflow/compiler/tests/adagrad_test.py
index ab69319c59fb07e7ce56c3c287a50a6290effdfd..e26483303c3934fd51675cb1fbc998b276caf527 100644
--- a/tensorflow/compiler/tests/adagrad_test.py
+++ b/tensorflow/compiler/tests/adagrad_test.py
@@ -42,17 +42,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testTensorLearningRate(self):
@@ -68,17 +70,19 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
   def testSharing(self):
@@ -103,18 +107,20 @@ class AdagradOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval(),
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0),
             float_rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval(),
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1),
             float_rtol=1e-5)
 
 
diff --git a/tensorflow/compiler/tests/adam_test.py b/tensorflow/compiler/tests/adam_test.py
index 058576b3d4b695209952158769162bb24e7ccfce..8bcff9d379d34f8a6bb8b0fdc60b7588c6d80be9 100644
--- a/tensorflow/compiler/tests/adam_test.py
+++ b/tensorflow/compiler/tests/adam_test.py
@@ -75,23 +75,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRate(self):
     for dtype in self.float_types:
@@ -117,23 +118,24 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testSharing(self):
     for dtype in self.float_types:
@@ -162,13 +164,14 @@ class AdamOptimizerTest(xla_test.XLATestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run(feed_dict={grads0: grads0_np, grads1: grads1_np})
           else:
@@ -178,8 +181,8 @@ class AdamOptimizerTest(xla_test.XLATestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/adamax_test.py b/tensorflow/compiler/tests/adamax_test.py
index 3ed1d41b7121f44dd7470f61180f7a7055369174..961b46375c941bdc3922e460a2f58345086dbceb 100644
--- a/tensorflow/compiler/tests/adamax_test.py
+++ b/tensorflow/compiler/tests/adamax_test.py
@@ -78,8 +78,8 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
@@ -87,14 +87,17 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         for t in range(1, 4):
           update.run()
 
-          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta1_power))
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval(), rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval(), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var1_np, self.evaluate(var1), rtol=1e-2)
           self.assertEqual("var0_%d/AdaMax:0" % (i,),
                            opt.get_slot(var=var0, name="m").name)
 
@@ -118,22 +121,23 @@ class AdaMaxOptimizerTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power = opt._get_beta_accumulators()
 
         # Run 3 steps of AdaMax
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
           update.run()
 
           var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/compiler/tests/addsign_test.py b/tensorflow/compiler/tests/addsign_test.py
index 1bc07ace23ccdc83103abe71ee11b72994c75a6d..a37c97e6d374440aeb860b9d02f2d5dd95c91f62 100644
--- a/tensorflow/compiler/tests/addsign_test.py
+++ b/tensorflow/compiler/tests/addsign_test.py
@@ -90,8 +90,8 @@ class AddSignTest(xla_test.XLATestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of AddSign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class AddSignTest(xla_test.XLATestCase):
 
           # Validate updated params
           self.assertAllCloseAccordingToType(
-              var0_np, var0.eval(), half_rtol=1e-2)
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              var0_np, self.evaluate(var0), half_rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/binary_ops_test.py b/tensorflow/compiler/tests/binary_ops_test.py
index 332381c59eed06d5697e58efb1d8fa2b6ef604d2..9a5423c1b2a5df7880453cbb328f6a8174066255 100644
--- a/tensorflow/compiler/tests/binary_ops_test.py
+++ b/tensorflow/compiler/tests/binary_ops_test.py
@@ -218,6 +218,21 @@ class BinaryOpsTest(xla_test.XLATestCase):
             ],
             equality_test=self.ListsAreClose)
 
+      # TF doesn't define these for bf16.
+      if dtype != dtypes.bfloat16.as_numpy_dtype:
+        self._testBinary(
+            gen_math_ops.xdivy,
+            np.array([0, 4, 3, 2, 1, 0], dtype=dtype),
+            np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype),
+            expected=np.array([0, 0.8, 0.5, 0.285714, 0.125, 0], dtype=dtype))
+
+        self._testBinary(
+            gen_math_ops.xlogy,
+            np.array([0, 4, 3, 2, 1, 0], dtype=dtype),
+            np.array([0, 5, 6, 7, 8, float("NaN")], dtype=dtype),
+            expected=np.array([0, 6.437752, 5.375278, 3.89182, 2.079442, 0],
+                              dtype=dtype))
+
   def testIntOps(self):
     for dtype in self.signed_int_types:
       self._testBinary(
diff --git a/tensorflow/compiler/tests/categorical_op_test.py b/tensorflow/compiler/tests/categorical_op_test.py
index a57d1dc81ea2c9c188b0a3005904738aa8156bf3..5d5e486f616937601214aa169a4c329ab78932c8 100644
--- a/tensorflow/compiler/tests/categorical_op_test.py
+++ b/tensorflow/compiler/tests/categorical_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import stateless_random_ops
 from tensorflow.python.platform import googletest
 
 
@@ -56,11 +57,11 @@ class CategoricalTest(xla_test.XLATestCase):
     Returns:
       Frequencies from sampled classes; shape [batch_size, num_classes].
     """
-    with self.cached_session() as sess, self.test_scope():
+    with self.cached_session(), self.test_scope():
       random_seed.set_random_seed(1618)
       op = random_ops.multinomial(logits, num_samples,
                                   output_dtype=dtypes.int32)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -79,15 +80,15 @@ class CategoricalTest(xla_test.XLATestCase):
 
   def _testRngIsNotConstant(self, rng, dtype, output_dtype):
     # Tests that 'rng' does not always return the same value.
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         x = rng(dtype, output_dtype)
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -107,12 +108,12 @@ class CategoricalTest(xla_test.XLATestCase):
   def testCategoricalIsInRange(self):
     for dtype in self.float_types:
       for output_dtype in self.output_dtypes():
-        with self.cached_session() as sess:
+        with self.cached_session():
           with self.test_scope():
             x = random_ops.multinomial(
                 array_ops.ones(shape=[1, 20], dtype=dtype), 1000,
                 output_dtype=output_dtype)
-          y = sess.run(x)
+          y = self.evaluate(x)
           self.assertTrue((y >= 0).sum() == 1000)
           self.assertTrue((y < 20).sum() == 1000)
 
@@ -138,6 +139,57 @@ class CategoricalTest(xla_test.XLATestCase):
       chi2 = self._chi2(probs, freqs)
       self.assertLess(chi2, 1e-3)
 
+  def testStatelessMultinomialIsInRange(self):
+    for dtype in self.float_types:
+      for output_dtype in self.output_dtypes():
+        with self.cached_session() as sess:
+          with self.test_scope():
+            seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+            x = stateless_random_ops.stateless_multinomial(
+                array_ops.ones(shape=[1, 20], dtype=dtype),
+                1000,
+                seed_t,
+                output_dtype=output_dtype)
+          y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+          self.assertTrue((y >= 0).sum() == 1000)
+          self.assertTrue((y < 20).sum() == 1000)
+
+  def testDeterminismMultinomial(self):
+    # Stateless values should be equal iff the seeds are equal (roughly)
+    num_samples = 10
+    with self.cached_session(), self.test_scope():
+      seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+      seeds = [(x, y) for x in range(5) for y in range(5)] * 3
+      for logits in ([[0.1, 0.25, 0.5, 0.15]], [[0.5, 0.5], [0.8, 0.2],
+                                                [0.25, 0.75]]):
+        pure = stateless_random_ops.stateless_multinomial(
+            logits, num_samples, seed=seed_t)
+        values = [(seed, pure.eval(feed_dict={seed_t: seed})) for seed in seeds]
+        for s0, v0 in values:
+          for s1, v1 in values:
+            self.assertEqual(s0 == s1, np.all(v0 == v1))
+
+  def testEmpty(self):
+    with self.cached_session():
+      with self.test_scope():
+        x = random_ops.multinomial(
+            array_ops.zeros([42, 40]), 0, output_dtype=dtypes.int32)
+        y = self.evaluate(x)
+        self.assertEqual(y.shape, (42, 0))
+
+  def testEmptyStateless(self):
+    with self.cached_session() as sess:
+      with self.test_scope():
+        seed_t = array_ops.placeholder(dtypes.int32, shape=[2])
+        x = stateless_random_ops.stateless_multinomial(
+            array_ops.zeros([42, 40]),
+            0,
+            seed=seed_t,
+            output_dtype=dtypes.int32)
+        y = sess.run(x, {seed_t: [0x12345678, 0xabcdef12]})
+        self.assertEqual(y.shape, (42, 0))
+
+
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/compiler/tests/clustering_test.py b/tensorflow/compiler/tests/clustering_test.py
index 88bd58b2da6b2892f898ad10f3467d8ce39d6388..ef2d7af69deeebd5f4c4c7225d7027f8f76bf861 100644
--- a/tensorflow/compiler/tests/clustering_test.py
+++ b/tensorflow/compiler/tests/clustering_test.py
@@ -43,7 +43,7 @@ class ClusteringTest(xla_test.XLATestCase):
         input1 = constant_op.constant(val1, name="const1")
         input2 = constant_op.constant(val2, name="const2")
         output = math_ops.add(input1, input2)
-      result = output.eval()
+      result = self.evaluate(output)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testAddFromCpuMultiple(self):
@@ -57,7 +57,7 @@ class ClusteringTest(xla_test.XLATestCase):
       with self.test_scope():
         output = math_ops.add(input1, input2)
       for _ in xrange(10):
-        result = output.eval()
+        result = self.evaluate(output)
         self.assertAllClose(result, expected, rtol=1e-3)
 
   def testDeadlock(self):
diff --git a/tensorflow/compiler/tests/concat_ops_test.py b/tensorflow/compiler/tests/concat_ops_test.py
index 2d225ad226cac368042b95eae8fc29e6fd8e82e0..2187f57960f80300d631bdc7eb8fe5e9c8dddeea 100644
--- a/tensorflow/compiler/tests/concat_ops_test.py
+++ b/tensorflow/compiler/tests/concat_ops_test.py
@@ -72,7 +72,7 @@ class ConcatTest(xla_test.XLATestCase):
       x2 = constant_op.constant(p2)
       with self.test_scope():
         c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
@@ -150,7 +150,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 1)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
   def testGradientsSimpleAll(self):
@@ -177,7 +177,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 0)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -205,7 +205,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, 2)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -242,7 +242,7 @@ class ConcatTest(xla_test.XLATestCase):
             [float(x) for x in grad_inp.flatten()], shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, concat_dim)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -254,7 +254,7 @@ class ConcatTest(xla_test.XLATestCase):
   def DISABLED_testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         for shape0 in (), (2,):
           axis = len(shape0)
@@ -270,7 +270,7 @@ class ConcatTest(xla_test.XLATestCase):
                 self.assertAllEqual(c.eval(), correct)
                 # Check gradients
                 dc = np.random.randn(*c.get_shape().as_list())
-                dxs = sess.run(gradients_impl.gradients(c, xs, dc))
+                dxs = self.evaluate(gradients_impl.gradients(c, xs, dc))
                 self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
 
   def testConcatTuple(self):
@@ -280,7 +280,7 @@ class ConcatTest(xla_test.XLATestCase):
       with self.test_scope():
         concat_list_t = array_ops.concat([c1, c2], 0)
         concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+      self.assertAllEqual(concat_list_t.eval(), self.evaluate(concat_tuple_t))
 
   def testConcatNoScalars(self):
     with self.cached_session():
@@ -330,47 +330,47 @@ class ConcatTest(xla_test.XLATestCase):
 class ConcatOffsetTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         cdim = constant_op.constant(1, dtypes.int32)
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-        ans = sess.run(off)
+        ans = self.evaluate(off)
         self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
 
 class PackTest(xla_test.XLATestCase):
 
   def testBasic(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant([2, 3, 5], dtypes.int32)
         s1 = constant_op.constant([2, 7, 5], dtypes.int32)
         s2 = constant_op.constant([2, 20, 5], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[2, 3, 5], [2, 7, 5], [2, 20, 5]])
 
   def testScalars(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant(2, dtypes.int32)
         s1 = constant_op.constant(3, dtypes.int32)
         s2 = constant_op.constant(5, dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [2, 3, 5])
 
   def testEmpty(self):
-    with self.cached_session() as sess:
+    with self.cached_session():
       with self.test_scope():
         s0 = constant_op.constant([[]], dtypes.int32)
         s1 = constant_op.constant([[]], dtypes.int32)
         s2 = constant_op.constant([[]], dtypes.int32)
         packed = array_ops.stack([s0, s1, s2])
-        ans = sess.run(packed)
+        ans = self.evaluate(packed)
         self.assertAllEqual(ans, [[[]], [[]], [[]]])
 
 
diff --git a/tensorflow/compiler/tests/conv3d_test.py b/tensorflow/compiler/tests/conv3d_test.py
index d59fd0236f4f7da2bbfb3409342c7f70f8f5d1f6..01cc1b6392845be2418c50d55be97487eb290843 100644
--- a/tensorflow/compiler/tests/conv3d_test.py
+++ b/tensorflow/compiler/tests/conv3d_test.py
@@ -85,7 +85,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -135,7 +135,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -173,7 +173,7 @@ class Conv3DTransposeTest(xla_test.XLATestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/compiler/tests/dense_layer_test.py b/tensorflow/compiler/tests/dense_layer_test.py
index d1b90f098d7d6574999ba0af44b285f5ad5e4f8d..bf5ea7b1fb6fb3c774c4db20d059f131990d20d3 100644
--- a/tensorflow/compiler/tests/dense_layer_test.py
+++ b/tensorflow/compiler/tests/dense_layer_test.py
@@ -42,7 +42,7 @@ def GetRunMetadataLabels(run_metadata):
 
 def InLabels(labels, substr):
   """Returns true iff one of the labels contains substr."""
-  return any([substr in x for x in labels])
+  return any(substr in x for x in labels)
 
 
 class DenseLayerTest(test.TestCase):
@@ -72,7 +72,7 @@ class DenseLayerTest(test.TestCase):
       x = array_ops.placeholder(shape=[None, None, 3], dtype=np.float32)
       y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -97,7 +97,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
@@ -126,7 +126,7 @@ class DenseLayerTest(test.TestCase):
       with jit_scope():
         y = layers.dense(x, 3)
 
-      sess.run(variables.initialize_all_variables())
+      self.evaluate(variables.initialize_all_variables())
       run_metadata = config_pb2.RunMetadata()
       test_utils.RunWithWarmup(
           sess,
diff --git a/tensorflow/compiler/tests/dynamic_stitch_test.py b/tensorflow/compiler/tests/dynamic_stitch_test.py
index 50b04daa6b9f4159a3c4bdeecaf900a5b35a833c..e89cf975f5d889091ce92a35165aef55ee5ad4b0 100644
--- a/tensorflow/compiler/tests/dynamic_stitch_test.py
+++ b/tensorflow/compiler/tests/dynamic_stitch_test.py
@@ -58,6 +58,15 @@ class DynamicStitchTest(xla_test.XLATestCase):
         [idx1, idx2], [val1, val2],
         expected=np.array([[], [], [], []], np.int32))
 
+  def testEmptyIndex(self):
+    idx1 = np.array([], dtype=np.int32)
+    idx2 = np.array([[], []], dtype=np.int32)
+    val1 = np.ndarray(shape=(0, 9), dtype=np.int32)
+    val2 = np.ndarray(shape=(2, 0, 9), dtype=np.int32)
+    self._AssertDynamicStitchResultIs([idx1, idx2], [val1, val2],
+                                      expected=np.ndarray(
+                                          shape=(0, 9), dtype=np.int32))
+
   def testSimple1D(self):
     val1 = np.array([0, 4, 7], dtype=np.int32)
     val2 = np.array([1, 6, 2, 3, 5], dtype=np.int32)
diff --git a/tensorflow/compiler/tests/eager_test.py b/tensorflow/compiler/tests/eager_test.py
index 63cee550fde9d9d4314b1541fba191df776a4da2..2af32b537ba53723370faf81aebf308a465718c7 100644
--- a/tensorflow/compiler/tests/eager_test.py
+++ b/tensorflow/compiler/tests/eager_test.py
@@ -101,12 +101,12 @@ class EagerTest(xla_test.XLATestCase):
       self.assertAllEqual(15, product)
 
     # Run some ops graphly
-    with context.graph_mode(), self.cached_session() as sess:
+    with context.graph_mode(), self.cached_session():
       with self.test_scope():
         three = constant_op.constant(3)
         five = constant_op.constant(5)
         product = three * five
-        self.assertAllEqual(15, sess.run(product))
+        self.assertAllEqual(15, self.evaluate(product))
 
   def testDegenerateSlices(self):
     with self.test_scope():
diff --git a/tensorflow/compiler/tests/fft_test.py b/tensorflow/compiler/tests/fft_test.py
index e92afd5d6feb42ece233ee521e3a796c4bc3914a..0edd0c35aa2d417a3ed24decbaa0b5d62d35bb62 100644
--- a/tensorflow/compiler/tests/fft_test.py
+++ b/tensorflow/compiler/tests/fft_test.py
@@ -27,8 +27,7 @@ from tensorflow.compiler.tests import xla_test
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
-from tensorflow.python.ops import signal
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.platform import googletest
 
 BATCH_DIMS = (3, 5)
@@ -107,39 +106,39 @@ class FFTTest(xla_test.XLATestCase):
 
   def testFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.fft,
-                          spectral_ops.fft)
+                          signal.fft)
 
   def testFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.fft2,
-                          spectral_ops.fft2d)
+                          signal.fft2d)
 
   def testFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.fftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.fft3d)
+                          signal.fft3d)
 
   def testIFFT(self):
     self._VerifyFftMethod(INNER_DIMS_1D, lambda x: x, np.fft.ifft,
-                          spectral_ops.ifft)
+                          signal.ifft)
 
   def testIFFT2D(self):
     self._VerifyFftMethod(INNER_DIMS_2D, lambda x: x, np.fft.ifft2,
-                          spectral_ops.ifft2d)
+                          signal.ifft2d)
 
   def testIFFT3D(self):
     self._VerifyFftMethod(INNER_DIMS_3D, lambda x: x,
                           lambda x: np.fft.ifftn(x, axes=(-3, -2, -1)),
-                          spectral_ops.ifft3d)
+                          signal.ifft3d)
 
   def testRFFT(self):
     self._VerifyFftMethod(
         INNER_DIMS_1D, np.real, lambda x: np.fft.rfft(x, n=x.shape[-1]),
-        lambda x: spectral_ops.rfft(x, fft_length=[x.shape[-1].value]))
+        lambda x: signal.rfft(x, fft_length=[x.shape[-1].value]))
 
   def testRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.rfft2d(
+      return signal.rfft2d(
           x, fft_length=[x.shape[-2].value, x.shape[-1].value])
 
     self._VerifyFftMethod(
@@ -153,16 +152,33 @@ class FFTTest(xla_test.XLATestCase):
           x, axes=(-3, -2, -1), s=[x.shape[-3], x.shape[-2], x.shape[-1]])
 
     def _tf_fn(x):
-      return spectral_ops.rfft3d(
+      return signal.rfft3d(
           x,
           fft_length=[x.shape[-3].value, x.shape[-2].value, x.shape[-1].value])
 
     self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn)
 
+  def testRFFT3DMismatchedSize(self):
+
+    def _to_expected(x):
+      return np.fft.rfftn(
+          x,
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _tf_fn(x):
+      return signal.rfft3d(
+          x,
+          fft_length=[
+              x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2
+          ])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, np.real, _to_expected, _tf_fn)
+
   def testIRFFT(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
+      return signal.irfft(x, fft_length=[2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
         INNER_DIMS_1D, lambda x: np.fft.rfft(np.real(x), n=x.shape[-1]),
@@ -171,7 +187,7 @@ class FFTTest(xla_test.XLATestCase):
   def testIRFFT2D(self):
 
     def _tf_fn(x):
-      return spectral_ops.irfft2d(
+      return signal.irfft2d(
           x, fft_length=[x.shape[-2].value, 2 * (x.shape[-1].value - 1)])
 
     self._VerifyFftMethod(
@@ -195,7 +211,7 @@ class FFTTest(xla_test.XLATestCase):
           s=[x.shape[-3], x.shape[-2], 2 * (x.shape[-1] - 1)])
 
     def _tf_fn(x):
-      return spectral_ops.irfft3d(
+      return signal.irfft3d(
           x,
           fft_length=[
               x.shape[-3].value, x.shape[-2].value, 2 * (x.shape[-1].value - 1)
@@ -203,6 +219,30 @@ class FFTTest(xla_test.XLATestCase):
 
     self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn)
 
+  def testIRFFT3DMismatchedSize(self):
+
+    def _to_input(x):
+      return np.fft.rfftn(
+          np.real(x),
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _to_expected(x):
+      return np.fft.irfftn(
+          x,
+          axes=(-3, -2, -1),
+          s=[x.shape[-3] // 2, x.shape[-2], x.shape[-1] * 2])
+
+    def _tf_fn(x):
+      return signal.irfft3d(
+          x,
+          fft_length=[
+              x.shape[-3].value // 2, x.shape[-2].value, x.shape[-1].value * 2
+          ])
+
+    self._VerifyFftMethod(INNER_DIMS_3D, _to_input, _to_expected, _tf_fn)
+
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/compiler/tests/fifo_queue_test.py b/tensorflow/compiler/tests/fifo_queue_test.py
index 8c7edfd277c992c35a81dd5f261256a86352254e..91d77d2f791834346f43aecb60d116ddbf2faa6e 100644
--- a/tensorflow/compiler/tests/fifo_queue_test.py
+++ b/tensorflow/compiler/tests/fifo_queue_test.py
@@ -129,7 +129,7 @@ class FIFOQueueTest(xla_test.XLATestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -192,9 +192,9 @@ class FIFOQueueTest(xla_test.XLATestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/ftrl_test.py b/tensorflow/compiler/tests/ftrl_test.py
index 5b197afd655404e4e36a8b3442f8db60cb1d648d..b078053cdbd6d129645734492d34dd25d28ab3ef 100644
--- a/tensorflow/compiler/tests/ftrl_test.py
+++ b/tensorflow/compiler/tests/ftrl_test.py
@@ -50,14 +50,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivAdagradTest_AdagradPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -65,14 +65,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     adagrad_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Adagrad for a few steps
     for _ in range(steps):
       adagrad_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_FtrlPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -85,14 +85,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run Ftrl for a few steps
     for _ in range(steps):
       ftrl_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def equivGradientDescentTest_GradientDescentPart(self, steps, dtype):
     var0, var1, grads0, grads1 = self.initVariableAndGradient(dtype)
@@ -100,14 +100,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
     sgd_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
     # Fetch params to validate initial values
-    self.assertAllClose([0.0, 0.0], var0.eval())
-    self.assertAllClose([0.0, 0.0], var1.eval())
+    self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+    self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
     # Run GradientDescent for a few steps
     for _ in range(steps):
       sgd_update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testFtrlwithoutRegularization(self):
     for dtype in self.float_types:
@@ -124,8 +124,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([0.0, 0.0], var0.eval())
-        self.assertAllClose([0.0, 0.0], var1.eval())
+        self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+        self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -134,12 +134,12 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-2.60260963, -4.29698515]),
-            var0.eval(),
+            self.evaluate(var0),
             float_rtol=1e-4,
             half_rtol=1e-2)
         self.assertAllCloseAccordingToType(
             np.array([-0.28432083, -0.56694895]),
-            var1.eval(),
+            self.evaluate(var1),
             float_rtol=1e-5,
             half_rtol=1e-2)
 
@@ -158,8 +158,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 3 steps FTRL
         for _ in range(3):
@@ -167,10 +167,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-2.55607247, -3.98729396]), var0.eval(), 1e-5, 1e-5,
+            np.array([-2.55607247, -3.98729396]),
+            self.evaluate(var0),
+            1e-5,
+            1e-5,
             float_rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.28232238, -0.56096673]), var1.eval(), 1e-5, 1e-5)
+            np.array([-0.28232238, -0.56096673]), self.evaluate(var1), 1e-5,
+            1e-5)
 
   def testFtrlWithL1(self):
     for dtype in self.float_types:
@@ -187,8 +191,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -197,12 +201,14 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         # Validate updated params
         self.assertAllCloseAccordingToType(
             np.array([-7.66718769, -10.91273689]),
-            var0.eval(),
+            self.evaluate(var0),
             rtol=1e-4,
             bfloat16_rtol=1e-1,
             bfloat16_atol=1e-1)
         self.assertAllCloseAccordingToType(
-            np.array([-0.93460727, -1.86147261]), var1.eval(), rtol=1e-4)
+            np.array([-0.93460727, -1.86147261]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL1_L2(self):
     for dtype in self.float_types:
@@ -219,8 +225,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([4.0, 3.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -228,9 +234,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.24059935, -0.46829352]), var0.eval(), rtol=1e-5)
+            np.array([-0.24059935, -0.46829352]),
+            self.evaluate(var0),
+            rtol=1e-5)
         self.assertAllCloseAccordingToType(
-            np.array([-0.02406147, -0.04830509]), var1.eval(), rtol=1e-5)
+            np.array([-0.02406147, -0.04830509]),
+            self.evaluate(var1),
+            rtol=1e-5)
 
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
@@ -254,8 +264,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         ftrl_update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([4.0, 3.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([4.0, 3.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -263,9 +273,13 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-0.22578996, -0.44345799]), var0.eval(), rtol=1e-4)
+            np.array([-0.22578996, -0.44345799]),
+            self.evaluate(var0),
+            rtol=1e-4)
         self.assertAllCloseAccordingToType(
-            np.array([-0.14378493, -0.13229476]), var1.eval(), rtol=1e-4)
+            np.array([-0.14378493, -0.13229476]),
+            self.evaluate(var1),
+            rtol=1e-4)
 
   def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
     """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
@@ -291,8 +305,8 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
         update1 = opt1.apply_gradients([(grads1, var1)])
         variables.global_variables_initializer().run()
 
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([1.0, 2.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var1))
 
         # Run 10 steps FTRL
         for _ in range(10):
@@ -301,7 +315,7 @@ class FtrlOptimizerTest(xla_test.XLATestCase):
 
         # var0 is experiencing L2 shrinkage so it should be smaller than var1
         # in magnitude.
-        self.assertTrue((var0.eval()**2 < var1.eval()**2).all())
+        self.assertTrue((var0.eval()**2 < self.evaluate(var1)**2).all())
         accum0 = list(opt0._slots["accum"].values())[0].eval()
         accum1 = list(opt1._slots["accum"].values())[0].eval()
         # L2 shrinkage should not change how we update grad accumulator.
diff --git a/tensorflow/compiler/tests/function_test.py b/tensorflow/compiler/tests/function_test.py
index b1891b918c6584abce9da382088ed0037f5319fb..a61827c2ae44de117abad5b7db5c6bcd78fa171e 100644
--- a/tensorflow/compiler/tests/function_test.py
+++ b/tensorflow/compiler/tests/function_test.py
@@ -40,7 +40,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -50,7 +50,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testNestedFunctions(self):
@@ -66,7 +66,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([4, 3, 2, 1]).reshape([2, 2]).astype(np.float32)
     expected = APlus2B(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -76,7 +76,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_g = Foo(a, b)
-      result = sess.run(call_g)
+      result = self.evaluate(call_g)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testFunctionMultipleRetvals(self):
@@ -90,7 +90,7 @@ class FunctionTest(xla_test.XLATestCase):
     bval = np.array([5, 6, 7, 8]).reshape([2, 2]).astype(np.float32)
     expected = Func(aval, bval)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
 
       @function.Defun(dtypes.float32, dtypes.float32)
       def Foo(a, b):
@@ -100,7 +100,7 @@ class FunctionTest(xla_test.XLATestCase):
       b = constant_op.constant(bval, name="b")
       with self.test_scope():
         call_f = Foo(a, b)
-      result = sess.run(call_f)
+      result = self.evaluate(call_f)
     self.assertAllClose(result, expected, rtol=1e-3)
 
   def testCompileTimeConstantsInDefun(self):
diff --git a/tensorflow/compiler/tests/jit_test.py b/tensorflow/compiler/tests/jit_test.py
index 6f51ae33a1b0fc8670ddf0cacb03a3b5a9176a91..dbea9849e217519874352b789588a2af62f1c826 100644
--- a/tensorflow/compiler/tests/jit_test.py
+++ b/tensorflow/compiler/tests/jit_test.py
@@ -75,7 +75,7 @@ def RunMetadataLabels(run_metadata):
 
 def InLabels(labels, substr):
   """Returns true iff one of the labels contains substr."""
-  return any([substr in x for x in labels])
+  return any(substr in x for x in labels)
 
 
 def MetadataHasXlaRunOp(run_metadata):
diff --git a/tensorflow/compiler/tests/listdiff_op_test.py b/tensorflow/compiler/tests/listdiff_op_test.py
index 58622114e4f552fb71db9b040a39b57d7da0037c..0210201fa71a6e790e94667073ab4dba542537a5 100644
--- a/tensorflow/compiler/tests/listdiff_op_test.py
+++ b/tensorflow/compiler/tests/listdiff_op_test.py
@@ -33,13 +33,13 @@ class ListDiffTest(xla_test.XLATestCase):
   def _testListDiff(self, x, y, out, idx):
     for dtype in [dtypes.int32, dtypes.int64]:
       for index_dtype in [dtypes.int32, dtypes.int64]:
-        with self.cached_session() as sess:
+        with self.cached_session():
           x_tensor = ops.convert_to_tensor(x, dtype=dtype)
           y_tensor = ops.convert_to_tensor(y, dtype=dtype)
           with self.test_scope():
             out_tensor, idx_tensor = array_ops.listdiff(
                 x_tensor, y_tensor, out_idx=index_dtype)
-            tf_out, tf_idx = sess.run([out_tensor, idx_tensor])
+            tf_out, tf_idx = self.evaluate([out_tensor, idx_tensor])
         self.assertAllEqual(out, tf_out)
         self.assertAllEqual(idx, tf_idx)
         self.assertEqual(1, out_tensor.get_shape().ndims)
diff --git a/tensorflow/compiler/tests/lrn_ops_test.py b/tensorflow/compiler/tests/lrn_ops_test.py
index c6ad67993e8bc196a74c9a328df8c9200c92c575..5dddf6ae4e8c8a3d5e9eb7b2c62298df02a0093c 100644
--- a/tensorflow/compiler/tests/lrn_ops_test.py
+++ b/tensorflow/compiler/tests/lrn_ops_test.py
@@ -120,8 +120,8 @@ class LRNTest(xla_test.XLATestCase):
       with self.test_scope():
         actual = gen_nn_ops.lrn_grad(out_grads, in_image, out_image,
                                      depth_radius, bias, alpha, beta)
-      expected_val = expected.eval()
-      actual_val = actual.eval()
+      expected_val = self.evaluate(expected)
+      actual_val = self.evaluate(actual)
     self.assertAllClose(actual_val, expected_val, rtol=1e-3)
 
 
diff --git a/tensorflow/compiler/tests/lstm_test.py b/tensorflow/compiler/tests/lstm_test.py
index 265c0b6d1412de7be3a5bf5e79129cb330ceb162..776ed899e68ddd3893b8bb30b7c8034297aa6515 100644
--- a/tensorflow/compiler/tests/lstm_test.py
+++ b/tensorflow/compiler/tests/lstm_test.py
@@ -88,8 +88,8 @@ class LSTMTest(test.TestCase):
                  (basename, m_prev_scalar, c_prev_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM step.
-      sess.run(variables.global_variables_initializer())
-      return sess.run([m, c])
+      self.evaluate(variables.global_variables_initializer())
+      return self.evaluate([m, c])
 
   def testLSTMCell(self):
     # Run with all-0 weights, no padding.
@@ -173,8 +173,8 @@ class LSTMTest(test.TestCase):
                  (basename, m_init_scalar, c_init_scalar, pad_scalar))
 
       # Initialize variables and run the unrolled LSTM layer.
-      sess.run(variables.global_variables_initializer())
-      return sess.run(out_seq)
+      self.evaluate(variables.global_variables_initializer())
+      return self.evaluate(out_seq)
 
   def testLSTMLayer(self):
     # Run with all-0 weights, no padding.
diff --git a/tensorflow/compiler/tests/momentum_test.py b/tensorflow/compiler/tests/momentum_test.py
index f77521a7c49dba39849869ddceb7c0e885147722..3416f7dbd6bdd264bf79785084f981f5b07cb8a9 100644
--- a/tensorflow/compiler/tests/momentum_test.py
+++ b/tensorflow/compiler/tests/momentum_test.py
@@ -61,37 +61,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def testNesterovMomentum(self):
     for dtype in self.float_types:
@@ -115,8 +121,8 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
               var0_np, accum0_np, var0_np * 0.8, 0.1, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
               var1_np, accum1_np, 0.9, 0.1, 0.9)
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTensorLearningRateAndMomentum(self):
     for dtype in self.float_types:
@@ -141,37 +147,43 @@ class MomentumOptimizerTest(xla_test.XLATestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/placeholder_test.py b/tensorflow/compiler/tests/placeholder_test.py
index 77bb839409f0c323ff6ed2c8d6bd105d3003b398..9671ae0ae973ff82d22744a1feb9b4293d94bbdd 100644
--- a/tensorflow/compiler/tests/placeholder_test.py
+++ b/tensorflow/compiler/tests/placeholder_test.py
@@ -33,7 +33,7 @@ class PlaceholderTest(xla_test.XLATestCase):
       ph = array_ops.placeholder_with_default(v, shape=[])
       out = ph * 2
       sess.run(variables.variables_initializer([v]))
-      self.assertEqual(8.0, sess.run(out))
+      self.assertEqual(8.0, self.evaluate(out))
 
   def test_placeholder_with_default_fed(self):
     with self.cached_session() as sess, self.test_scope():
diff --git a/tensorflow/compiler/tests/powersign_test.py b/tensorflow/compiler/tests/powersign_test.py
index 86536da7fed0e2309beb32fee9c7c605491592ed..5b35c20027700b34500a31e174061d7087094b61 100644
--- a/tensorflow/compiler/tests/powersign_test.py
+++ b/tensorflow/compiler/tests/powersign_test.py
@@ -91,8 +91,8 @@ class PowerSignTest(xla_test.XLATestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 7 steps of powersign
         # first 4 steps with positive gradient
@@ -125,8 +125,8 @@ class PowerSignTest(xla_test.XLATestCase):
           )
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testDense(self):
     decay_steps = 10
diff --git a/tensorflow/compiler/tests/proximal_adagrad_test.py b/tensorflow/compiler/tests/proximal_adagrad_test.py
index c41b4171e26af4f7ad0237d7407a5b3691299595..63cc51a470164915b2614a06d18ca1850bb64a3c 100644
--- a/tensorflow/compiler/tests/proximal_adagrad_test.py
+++ b/tensorflow/compiler/tests/proximal_adagrad_test.py
@@ -45,15 +45,17 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-2.60260963, -4.29698515]), var0.eval())
-      self.assertAllClose(np.array([-0.28432083, -0.56694895]), var1.eval())
+      self.assertAllClose(
+          np.array([-2.60260963, -4.29698515]), self.evaluate(var0))
+      self.assertAllClose(
+          np.array([-0.28432083, -0.56694895]), self.evaluate(var1))
       opt_vars = opt.variables()
       self.assertStartsWith(opt_vars[0].name, var0._shared_name)
       self.assertStartsWith(opt_vars[1].name, var1._shared_name)
@@ -74,14 +76,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
-      self.assertAllClose(np.array([-1.60261, -2.296985]), var0.eval())
-      self.assertAllClose(np.array([3.715679, 2.433051]), var1.eval())
+      self.assertAllClose(np.array([-1.60261, -2.296985]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.715679, 2.433051]), self.evaluate(var1))
 
   def testProximalAdagradWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -98,14 +100,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad
       for _ in range(10):
         update.run()
-      self.assertAllClose(np.array([-6.663634, -9.190331]), var0.eval())
-      self.assertAllClose(np.array([2.959304, 1.029232]), var1.eval())
+      self.assertAllClose(np.array([-6.663634, -9.190331]), self.evaluate(var0))
+      self.assertAllClose(np.array([2.959304, 1.029232]), self.evaluate(var1))
 
   def testProximalAdagradWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -122,15 +124,15 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Adagrad.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -141,14 +143,14 @@ class ProximalAdagradOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivAdagradwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/proximal_gradient_descent_test.py b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
index 3d808e6b8a71ef9fa60b671d07bfd907e9f58efc..5aec433be765dd0a04bd7ab10d5c39a5a7f48c5c 100644
--- a/tensorflow/compiler/tests/proximal_gradient_descent_test.py
+++ b/tensorflow/compiler/tests/proximal_gradient_descent_test.py
@@ -42,15 +42,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([0.0, 0.0], var0.eval())
-      self.assertAllClose([0.0, 0.0], var1.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var0))
+      self.assertAllClose([0.0, 0.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent.
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([-0.9, -1.8]), var0.eval())
-      self.assertAllClose(np.array([-0.09, -0.18]), var1.eval())
+      self.assertAllClose(np.array([-0.9, -1.8]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.09, -0.18]), self.evaluate(var1))
 
   def testProximalGradientDescentwithoutRegularization2(self):
     with self.cached_session(), self.test_scope():
@@ -64,15 +64,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 3 steps Proximal Gradient Descent
       for _ in range(3):
         update.run()
 
-      self.assertAllClose(np.array([0.1, 0.2]), var0.eval())
-      self.assertAllClose(np.array([3.91, 2.82]), var1.eval())
+      self.assertAllClose(np.array([0.1, 0.2]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.91, 2.82]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1(self):
     with self.cached_session(), self.test_scope():
@@ -86,15 +86,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps proximal gradient descent.
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-1.988, -3.988001]), var0.eval())
-      self.assertAllClose(np.array([3.67, 2.37]), var1.eval())
+      self.assertAllClose(np.array([-1.988, -3.988001]), self.evaluate(var0))
+      self.assertAllClose(np.array([3.67, 2.37]), self.evaluate(var1))
 
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session(), self.test_scope():
@@ -108,15 +108,15 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([4.0, 3.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([4.0, 3.0], self.evaluate(var1))
 
       # Run 10 steps Proximal Gradient Descent
       for _ in range(10):
         update.run()
 
-      self.assertAllClose(np.array([-0.0495, -0.0995]), var0.eval())
-      self.assertAllClose(np.array([-0.0045, -0.0095]), var1.eval())
+      self.assertAllClose(np.array([-0.0495, -0.0995]), self.evaluate(var0))
+      self.assertAllClose(np.array([-0.0045, -0.0095]), self.evaluate(var1))
 
   def applyOptimizer(self, opt, steps=5):
     var0 = resource_variable_ops.ResourceVariable([1.0, 2.0])
@@ -127,14 +127,14 @@ class ProximalGradientDescentOptimizerTest(xla_test.XLATestCase):
     update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
     variables.global_variables_initializer().run()
 
-    self.assertAllClose([1.0, 2.0], var0.eval())
-    self.assertAllClose([3.0, 4.0], var1.eval())
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
     # Run ProximalAdagrad for a few steps
     for _ in range(steps):
       update.run()
 
-    return var0.eval(), var1.eval()
+    return self.evaluate(var0), self.evaluate(var1)
 
   def testEquivGradientDescentwithoutRegularization(self):
     with self.cached_session(), self.test_scope():
diff --git a/tensorflow/compiler/tests/qr_op_test.py b/tensorflow/compiler/tests/qr_op_test.py
index 236b1b881dcaffc1a5b0c6395f0605c1d7ef0269..b4d4193e35f9e0e3b23d0242ed076dd811f4ee2b 100644
--- a/tensorflow/compiler/tests/qr_op_test.py
+++ b/tensorflow/compiler/tests/qr_op_test.py
@@ -63,7 +63,7 @@ class QrOpTest(xla_test.XLATestCase, parameterized.TestCase):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    precision = self.AdjustedNorm(xx.eval() - identity.eval())
+    precision = self.AdjustedNorm(xx.eval() - self.evaluate(identity))
     self.assertTrue(np.all(precision < 5.0))
 
   def _test(self, dtype, shape, full_matrices):
diff --git a/tensorflow/compiler/tests/random_ops_test.py b/tensorflow/compiler/tests/random_ops_test.py
index 36ef6ed5fee78bad10bb1ee0bf3eb7824d05c206..97ffad34c00b8ec16eb1ec109ba5d980e0ce673d 100644
--- a/tensorflow/compiler/tests/random_ops_test.py
+++ b/tensorflow/compiler/tests/random_ops_test.py
@@ -46,9 +46,9 @@ class RandomOpsTest(xla_test.XLATestCase):
 
       # The random-number generator, if working correctly, should produce the
       # same output multiple times with low probability.
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # deterministic output, all three outputs will be bitwise identical.
@@ -83,7 +83,7 @@ class RandomOpsTest(xla_test.XLATestCase):
         with self.test_scope():
           x = random_ops.random_uniform(
               shape=[1000], dtype=dtype, minval=-2, maxval=33)
-        y = sess.run(x)
+        y = self.evaluate(x)
         self.assertTrue((y >= -2).sum() == 1000)
         self.assertTrue((y < 33).sum() == 1000)
 
@@ -102,7 +102,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.cached_session() as sess:
         with self.test_scope():
           x = random_ops.truncated_normal(shape=[count], dtype=dtype)
-        y = sess.run(x)
+        y = self.evaluate(x)
 
         def normal_cdf(x):
           return .5 * math.erfc(-x / math.sqrt(2))
@@ -111,7 +111,7 @@ class RandomOpsTest(xla_test.XLATestCase):
           return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi)
 
         def probit(x, sess=sess):
-          return sess.run(special_math.ndtri(x))
+          return self.evaluate(special_math.ndtri(x))
 
         a = -2.
         b = 2.
@@ -148,7 +148,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = math_ops.range(1 << 16)
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = range(1 << 16)
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
@@ -159,7 +159,7 @@ class RandomOpsTest(xla_test.XLATestCase):
       with self.test_scope():
         x = array_ops.diag(math_ops.range(20))
         shuffle = random_ops.random_shuffle(x)
-      result = sess.run(shuffle)
+      result = self.evaluate(shuffle)
       expected = np.diag(range(20)).flatten()
       # Compare sets to avoid randomness behavior changes but make sure still
       # have all the values.
diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc
index a6b58020126a3297944f199e99b0801387615564..d23fd125163d1afe8c7fd5e008d4b617ff4b2874 100644
--- a/tensorflow/compiler/tests/randomized_tests.cc
+++ b/tensorflow/compiler/tests/randomized_tests.cc
@@ -3382,10 +3382,10 @@ int main(int argc, char** argv) {
   }
   // XLA devices register kernels at construction time; create all known devices
   // to make sure the kernels are registered.
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
       tensorflow::SessionOptions(), "", &devices));
-  tensorflow::DeviceMgr device_mgr(devices);
+  tensorflow::DeviceMgr device_mgr(std::move(devices));
 
   tensorflow::Device* ignored;
   TF_QCHECK_OK(
diff --git a/tensorflow/compiler/tests/reduce_ops_test.py b/tensorflow/compiler/tests/reduce_ops_test.py
index 132c59c32c9db0c8759bdbb31f8613c3ef88b485..e8fc81bbb5472669c408b8bbdbcdfcdcf461131f 100644
--- a/tensorflow/compiler/tests/reduce_ops_test.py
+++ b/tensorflow/compiler/tests/reduce_ops_test.py
@@ -91,6 +91,7 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase):
       np.array([], dtype=np.bool).reshape(0, 3),
       np.array([[False, True, False], [True, True, False]]),
   ]
+  ONES = [np.ones([34000, 2])]
 
   def testReduceSumF32(self, index_dtype):
     self._testReduction(math_ops.reduce_sum, np.sum, np.float32, self.REAL_DATA,
@@ -149,6 +150,11 @@ class ReduceOpsTest(xla_test.XLATestCase, parameterized.TestCase):
     self._testReduction(math_ops.reduce_mean, np.mean, np.float32,
                         self.NONEMPTY_REAL_DATA, index_dtype)
 
+  def testReduceMeanF16(self, index_dtype):
+    if np.float16 in self.all_types:
+      self._testReduction(math_ops.reduce_mean, np.mean, np.float16, self.ONES,
+                          index_dtype)
+
   def testReduceMeanC64(self, index_dtype):
     self._testReduction(math_ops.reduce_mean, np.mean, np.complex64,
                         self.NONEMPTY_COMPLEX_DATA, index_dtype)
diff --git a/tensorflow/compiler/tests/rmsprop_test.py b/tensorflow/compiler/tests/rmsprop_test.py
index 8840a1329a907bddc6ef1cb6dd1c2a6d234def5c..dc3e90b4afa41c08d899ee195d42fb91678bad1c 100644
--- a/tensorflow/compiler/tests/rmsprop_test.py
+++ b/tensorflow/compiler/tests/rmsprop_test.py
@@ -76,7 +76,7 @@ class RmspropTest(xla_test.XLATestCase):
           rms_opt = rmsprop.RMSPropOptimizer(learning_rate, centered=centered)
           rms_update = rms_opt.apply_gradients(
               zip([grads0, grads1], [var0, var1]))
-          variables.global_variables_initializer().run()
+          self.evaluate(variables.global_variables_initializer())
 
           mg0 = rms_opt.get_slot(var0, "mg")
           self.assertEqual(mg0 is not None, centered)
@@ -92,12 +92,12 @@ class RmspropTest(xla_test.XLATestCase):
           self.assertTrue(mom1 is not None)
 
           # Fetch params to validate initial values
-          self.assertAllClose([1.0, 2.0], var0.eval())
-          self.assertAllClose([3.0, 4.0], var1.eval())
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
           # Run 3 steps of RMSProp
           for _ in range(3):
-            rms_update.run()
+            self.evaluate(rms_update)
 
             var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
                 var0_np,
@@ -118,14 +118,14 @@ class RmspropTest(xla_test.XLATestCase):
 
             # Validate updated params
             if centered:
-              self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-              self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-            self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-            self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-            self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-            self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-            self.assertAllCloseAccordingToType(var0_np, var0.eval())
-            self.assertAllCloseAccordingToType(var1_np, var1.eval())
+              self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+              self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+            self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+            self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+            self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+            self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+            self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+            self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tests/scan_ops_test.py b/tensorflow/compiler/tests/scan_ops_test.py
index 897db384b7e8067b0460b5f344201f101a4d8479..17639bd8a755b9e9f5acc77979ac7a4149f112db 100644
--- a/tensorflow/compiler/tests/scan_ops_test.py
+++ b/tensorflow/compiler/tests/scan_ops_test.py
@@ -71,7 +71,7 @@ def handle_options(func, x, axis, exclusive, reverse):
 
 class CumsumTest(xla_test.XLATestCase):
 
-  valid_dtypes = [np.float32]
+  valid_dtypes = [np.float32, np.int32]
 
   def axis_dtypes(self):
     return set(self.int_types).intersection([np.int32, np.int64])
@@ -149,7 +149,7 @@ class CumsumTest(xla_test.XLATestCase):
 
 class CumprodTest(xla_test.XLATestCase):
 
-  valid_dtypes = [np.float32]
+  valid_dtypes = [np.float32, np.int32]
 
   def axis_dtypes(self):
     return set(self.int_types).intersection([np.int32, np.int64])
diff --git a/tensorflow/compiler/tests/stateless_random_ops_test.py b/tensorflow/compiler/tests/stateless_random_ops_test.py
index 21708aa15877647e2a979a5a2674dfb734700df3..ee7ca7e6f196e114ff18e2597145e5c198980b08 100644
--- a/tensorflow/compiler/tests/stateless_random_ops_test.py
+++ b/tensorflow/compiler/tests/stateless_random_ops_test.py
@@ -156,7 +156,7 @@ class StatelessRandomOpsTest(xla_test.XLATestCase):
           return math.exp(-(x**2) / 2.) / math.sqrt(2 * math.pi)
 
         def probit(x, sess=sess):
-          return sess.run(special_math.ndtri(x))
+          return self.evaluate(special_math.ndtri(x))
 
         a = -2.
         b = 2.
diff --git a/tensorflow/compiler/tests/tensor_array_ops_test.py b/tensorflow/compiler/tests/tensor_array_ops_test.py
index 46ca371c8abf1cb4710717a183ee12820c4c4ca0..d7e26d79c4c054860ade5c8960a3bca984e020b0 100644
--- a/tensorflow/compiler/tests/tensor_array_ops_test.py
+++ b/tensorflow/compiler/tests/tensor_array_ops_test.py
@@ -79,7 +79,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.stack()
 
       self.assertAllEqual(
-          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]), c0.eval())
+          convert([[[4.0, 5.0]], [[6.0, 7.0]], [[8.0, 9.0]]]),
+          self.evaluate(c0))
 
   def testTensorArrayWritePack(self):
     for dtype in self.numeric_tf_types:
@@ -97,7 +98,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
       c0 = w2.stack()
 
-      self.assertAllEqual([3, 0, 1], c0.eval().shape)
+      self.assertAllEqual([3, 0, 1], self.evaluate(c0).shape)
 
   def _testTensorArrayWriteConcat(self, tf_dtype):
     with self.cached_session(), self.test_scope():
@@ -113,8 +114,8 @@ class TensorArrayTest(xla_test.XLATestCase):
       c0 = w2.concat()
 
       self.assertAllEqual(
-          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0],
-                   [106.0, 107.0], [8.0, 9.0], [204.0, 205.0]]), c0.eval())
+          convert([[4.0, 5.0], [104.0, 105.0], [6.0, 7.0], [106.0, 107.0],
+                   [8.0, 9.0], [204.0, 205.0]]), self.evaluate(c0))
 
   def testTensorArrayWriteConcat(self):
     for dtype in self.numeric_tf_types:
@@ -341,7 +342,7 @@ class TensorArrayTest(xla_test.XLATestCase):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtype2, flow_in=w0.flow)
         with self.assertRaisesOpError("TensorArray dtype is "):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
         # Test reading from a different index than the one we wrote to
         w0.read(1)
@@ -422,7 +423,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       w2 = h2.write(0, 5.0)
       r2 = w2.read(0)
       r = r1 + r2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testTensorArrayGradientWriteReadType(self, dtype):
     with self.cached_session() as session, self.test_scope():
@@ -504,7 +505,7 @@ class TensorArrayTest(xla_test.XLATestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0],  # concat gradient
             ])
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
@@ -526,7 +527,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       with ops.control_dependencies([r0_readtwice]):
         r1_readtwice = w_readtwice.read(0)
 
-      self.assertAllEqual([1.0, -1.0], r1_readtwice.eval())
+      self.assertAllEqual([1.0, -1.0], self.evaluate(r1_readtwice))
 
   def _testTensorArrayGradientUnpackRead(self):
     with self.cached_session() as session, self.test_scope():
@@ -592,7 +593,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       s = ta.size()
-      self.assertAllEqual(3, s.eval())
+      self.assertAllEqual(3, self.evaluate(s))
 
   def testWriteCloseTensorArray(self):
     with self.cached_session(), self.test_scope():
@@ -722,7 +723,7 @@ class TensorArrayTest(xla_test.XLATestCase):
 
   #     r = acc2.stack()
   #     grad = gradients_impl.gradients(r, [x])[0]
-  #     self.assertAllClose(31.0, grad.eval())
+  #     self.assertAllClose(31.0, self.evaluate(grad))
 
   def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.cached_session() as session, self.test_scope():
@@ -912,7 +913,7 @@ class TensorArrayTest(xla_test.XLATestCase):
       self.assertEqual(0, ta.size().eval())
       ta = ta.unstack(array_ops.zeros([0, 3, 5]))
       packed = ta.stack()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
       self.assertAllEqual([0, 5], ta.concat().eval().shape)
@@ -1041,8 +1042,8 @@ class TensorArrayTest(xla_test.XLATestCase):
           (read0, read1, size0, size1))
 
       # Tests that the control dependencies was added and executed.
-      self.assertEqual(1, v0.eval())
-      self.assertEqual(1, v1.eval())
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual(1, self.evaluate(v1))
 
       # Tests correct TensorArray.
       self.assertEqual(read0_v, 0)
diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py
index d612d3b32dd6b0893508413b337ea9ad95ef6dd7..95c9e7ffd4651642781143c2c1940b0e51e1e470 100644
--- a/tensorflow/compiler/tests/unary_ops_test.py
+++ b/tensorflow/compiler/tests/unary_ops_test.py
@@ -481,6 +481,72 @@ class UnaryOpsTest(xla_test.XLATestCase):
           np.array([-1, -0.5, 0, 0.3], dtype=dtype),
           expected=np.array([-1., -0.5, 0., 0.296875], dtype=dtype))
 
+      def quantize_and_dequantize_v2_round_half_up(x):
+        return array_ops.quantize_and_dequantize_v2(
+            x,
+            -1,
+            1.0,
+            signed_input=True,
+            num_bits=8,
+            range_given=True,
+            round_mode="HALF_UP")
+
+      self._assertOpOutputMatchesExpected(
+          quantize_and_dequantize_v2_round_half_up,
+          np.array([-0.8, -0.5, 0, 0.3, 0.8, -2, 33], dtype=dtype),
+          expected=np.array([
+              -102.0 / 127,
+              -63.0 / 127,
+              0,
+              38.0 / 127,
+              102.0 / 127,
+              -128.0 / 127,
+              1,
+          ],
+                            dtype=dtype))
+
+      def quantize_and_dequantize_v2_round_half_to_even(x):
+        return array_ops.quantize_and_dequantize_v2(
+            x,
+            -1.0,
+            1.0,
+            signed_input=True,
+            num_bits=8,
+            range_given=True,
+            round_mode="HALF_TO_EVEN")
+
+      self._assertOpOutputMatchesExpected(
+          quantize_and_dequantize_v2_round_half_to_even,
+          np.array(
+              [
+                  -0.8,
+                  # The -0.5 should become -63.5 after scaling and with
+                  # rounding this should become -64. But with the test
+                  # unary_ops_test_cpu_ondemand, this fails as the result
+                  # before scaling becomes -63.499996 and gets rounded to -63.
+                  # TODO(sreenik): Some one more familiar with this test needs
+                  # to take a look and resolve this. This works on all other
+                  # variations of the platform like cpu, and gpu.
+                  # -0.5,
+                  0,
+                  0.3,
+                  0.8,
+                  -2,
+                  33
+              ],
+              dtype=dtype),
+          expected=np.array(
+              [
+                  -102.0 / 127,
+                  # -64.0 / 127,
+                  0,
+                  38.0 / 127,
+                  102.0 / 127,
+                  -128.0 / 127,
+                  1,
+              ],
+              dtype=dtype))
+
       def quantize_and_dequantize_v3(x):
         return array_ops.quantize_and_dequantize_v3(
             x, -127, 127, num_bits=8, signed_input=True, range_given=False)
diff --git a/tensorflow/compiler/tests/variable_ops_test.py b/tensorflow/compiler/tests/variable_ops_test.py
index 77cdeac8168aa71555955b141852587d62ab59d3..fcd7ac5ba1ca5049246e93e6f5f76746fb28c6b8 100644
--- a/tensorflow/compiler/tests/variable_ops_test.py
+++ b/tensorflow/compiler/tests/variable_ops_test.py
@@ -77,7 +77,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         sess.run(variables.variables_initializer([v]))
         x = v.sparse_read(2)
         self.assertAllClose(
-            np.array([8j, 9, 10, 11]).astype(dtype), sess.run(x))
+            np.array([8j, 9, 10, 11]).astype(dtype), self.evaluate(x))
 
   def testSparseRead1DIndices(self):
     for dtype in self.numeric_types:
@@ -89,7 +89,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         x = v.sparse_read([2, 1])
         self.assertAllClose(
             np.array([[8, 9, 10, 11], [4, 5, 6j, 7]]).astype(dtype),
-            sess.run(x))
+            self.evaluate(x))
 
   def testSparseRead2DIndices(self):
     for dtype in self.numeric_types:
@@ -102,7 +102,7 @@ class VariableOpsTest(xla_test.XLATestCase):
         self.assertAllClose(
             np.array([[[8, 9, 10, 11], [4, 5, 6, 7]],
                       [[0, 1, 2j, 3], [8, 9, 10, 11]]]).astype(dtype),
-            sess.run(x))
+            self.evaluate(x))
 
   def testSparseRead2DIndices3DTensor(self):
     for dtype in self.numeric_types:
@@ -115,9 +115,9 @@ class VariableOpsTest(xla_test.XLATestCase):
         x = v.sparse_read([[2, 1], [3, 0]])
         self.assertAllClose(
             np.array(
-                [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]
-                 ], [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]
-                ],).astype(dtype), sess.run(x))
+                [[[[20, 21, 22], [23, 24j, 25]], [[10, 11, 12], [13, 14, 15]]],
+                 [[[30, 31, 32], [33, 34, 35]], [[0, 1, 2], [3, 4, 5]]]
+                ],).astype(dtype), self.evaluate(x))
 
   def testShape(self):
     for dtype in self.numeric_types:
@@ -229,7 +229,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[3], [7]])
+      self.assertAllEqual(self.evaluate(read), [[3], [7]])
 
   def testScatterSub(self):
     with self.test_session() as sess, self.test_scope():
@@ -242,7 +242,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [1], constant_op.constant([[2]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[4], [-1]])
+      self.assertAllEqual(self.evaluate(read), [[4], [-1]])
 
   def testScatterMul(self):
     with self.test_session() as sess, self.test_scope():
@@ -255,7 +255,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant([[5]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDiv(self):
     with self.test_session() as sess, self.test_scope():
@@ -268,7 +268,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertAllEqual(sess.run(read), [[2]])
+      self.assertAllEqual(self.evaluate(read), [[2]])
 
   def testScatterMin(self):
     with self.test_session() as sess, self.test_scope():
@@ -281,7 +281,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMax(self):
     with self.test_session() as sess, self.test_scope():
@@ -294,7 +294,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterUpdate(self):
     with self.test_session() as sess, self.test_scope():
@@ -307,7 +307,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_update(
               handle, [0], constant_op.constant([[3]], dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterAddScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -320,7 +320,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_add(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterSubScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -333,7 +333,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_sub(
               handle, [0], constant_op.constant(2, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[-1]])
+      self.assertEqual(self.evaluate(read), [[-1]])
 
   def testScatterMulScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -346,7 +346,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_mul(
               handle, [0], constant_op.constant(5, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[5]])
+      self.assertEqual(self.evaluate(read), [[5]])
 
   def testScatterDivScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -359,7 +359,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_div(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[2]])
+      self.assertEqual(self.evaluate(read), [[2]])
 
   def testScatterMinScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -372,7 +372,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_min(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[3]])
+      self.assertEqual(self.evaluate(read), [[3]])
 
   def testScatterMaxScalar(self):
     with self.test_session() as sess, self.test_scope():
@@ -385,7 +385,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           resource_variable_ops.resource_scatter_max(
               handle, [0], constant_op.constant(3, dtype=dtypes.int32)))
       read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
-      self.assertEqual(sess.run(read), [[6]])
+      self.assertEqual(self.evaluate(read), [[6]])
 
   def testScatterNdAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -400,7 +400,7 @@ class VariableOpsTest(xla_test.XLATestCase):
       sess.run(gen_state_ops.resource_scatter_nd_add(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
   def testScatterNdUpdateAddOps(self):
     with self.test_session() as sess, self.test_scope():
@@ -416,7 +416,7 @@ class VariableOpsTest(xla_test.XLATestCase):
           gen_state_ops.resource_scatter_nd_update(handle, indices, updates))
       read = resource_variable_ops.read_variable_op(
           handle, dtype=dtypes.float32)
-      self.assertAllClose(expected, sess.run(read))
+      self.assertAllClose(expected, self.evaluate(read))
 
 
 class StridedSliceAssignChecker(object):
diff --git a/tensorflow/compiler/tests/xla_device_test.py b/tensorflow/compiler/tests/xla_device_test.py
index 28d61fb07dcb665fa0dbe3f3e566e291e24fa662..ef55292b1be91a731ec556d7efa9cdf1a696e5cc 100644
--- a/tensorflow/compiler/tests/xla_device_test.py
+++ b/tensorflow/compiler/tests/xla_device_test.py
@@ -81,7 +81,7 @@ class XlaDeviceTest(xla_test.XLATestCase):
     with self.cached_session() as sess:
       with self.test_scope():
         x = gen_control_flow_ops.control_trigger()
-      sess.run(x)
+      self.evaluate(x)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD
index e0171415492658a76b25167107e01300ee4bde88..5a0d9b9af9d55a8dee809d3cf909bce39c3b8b6c 100644
--- a/tensorflow/compiler/tf2xla/BUILD
+++ b/tensorflow/compiler/tf2xla/BUILD
@@ -9,6 +9,7 @@ package_group(
         "//tensorflow/compiler/jit/...",
         "//tensorflow/compiler/tests/...",
         "//tensorflow/compiler/tf2xla/...",
+        "//tensorflow/contrib/compiler/...",
     ],
 )
 
@@ -195,8 +196,8 @@ cc_library(
         ":sharding_util",
         ":side_effect_util",
         ":tf2xla_util",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/compiler/jit:xla_cluster_util",
-        "//tensorflow/compiler/jit/legacy_flags:mark_for_compilation_pass_flags",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
@@ -204,13 +205,13 @@ cc_library(
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client",
         "//tensorflow/compiler/xla/client:client_library",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client:xla_computation",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:framework",
@@ -221,6 +222,7 @@ cc_library(
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
     alwayslink = 1,
@@ -437,21 +439,15 @@ cc_library(
     name = "dump_graph",
     srcs = [
         "dump_graph.cc",
-        "dump_graph_flags.cc",
-        "dump_graph_flags.h",
     ],
     hdrs = [
         "dump_graph.h",
     ],
     deps = [
-        "//tensorflow/compiler/xla:parse_flags_from_env",
-        "//tensorflow/core:core_cpu",
-        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/compiler/jit:flags",
         "//tensorflow/core:framework",
-        "//tensorflow/core:framework_internal",
-        "//tensorflow/core:lib",
+        "//tensorflow/core:graph",
         "//tensorflow/core:protos_all_cc",
-        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/dump_graph.cc b/tensorflow/compiler/tf2xla/dump_graph.cc
index 380c6a7e23da92d949b26876836b999bf6406c6c..64fdbbebc65bff4ed0b965fcdd534cc9696472b6 100644
--- a/tensorflow/compiler/tf2xla/dump_graph.cc
+++ b/tensorflow/compiler/tf2xla/dump_graph.cc
@@ -18,87 +18,26 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/dump_graph.h"
 
-#include "absl/strings/str_cat.h"
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
-#include "tensorflow/core/platform/env.h"
-#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/compiler/jit/flags.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 namespace dump_graph {
 
-namespace {
-
-struct NameCounts {
-  mutex counts_mutex;
-  std::unordered_map<string, int> counts;
-};
-
-string MakeUniqueFilename(string name) {
-  static NameCounts& instance = *new NameCounts;
-
-  // Remove illegal characters from `name`.
-  for (int i = 0; i < name.size(); ++i) {
-    char ch = name[i];
-    if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
-      name[i] = '_';
-    }
-  }
-
-  int count;
-  {
-    mutex_lock lock(instance.counts_mutex);
-    count = instance.counts[name]++;
-  }
-
-  string filename = name;
-  if (count > 0) {
-    absl::StrAppend(&filename, "_", count);
-  }
-  absl::StrAppend(&filename, ".pbtxt");
-  return filename;
-}
-
-string WriteTextProtoToUniqueFile(
-    Env* env, const string& name, const char* proto_type,
-    const ::tensorflow::protobuf::Message& proto) {
-  const string& dirname =
-      legacy_flags::GetDumpGraphFlags()->tf_dump_graph_prefix;
-  Status status = env->RecursivelyCreateDir(dirname);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to create " << dirname << " for dumping "
-                 << proto_type << ": " << status;
-    return "(unavailable)";
-  }
-  string filepath = absl::StrCat(dirname, "/", MakeUniqueFilename(name));
-  status = WriteTextProto(Env::Default(), filepath, proto);
-  if (!status.ok()) {
-    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
-                 << " : " << status;
-    return "(unavailable)";
-  }
-  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
-  return filepath;
-}
-
-}  // anonymous namespace
-
 string DumpGraphDefToFile(const string& name, GraphDef const& graph_def) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef",
-                                    graph_def);
+  return tensorflow::DumpGraphDefToFile(
+      name, graph_def, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpGraphToFile(const string& name, Graph const& graph,
                        const FunctionLibraryDefinition* flib_def) {
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  if (flib_def) {
-    *graph_def.mutable_library() = flib_def->ToProto();
-  }
-  return DumpGraphDefToFile(name, graph_def);
+  return tensorflow::DumpGraphToFile(name, graph, flib_def,
+                                     GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef) {
-  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef);
+  return tensorflow::DumpFunctionDefToFile(
+      name, fdef, GetDumpGraphFlags()->tf_dump_graph_prefix);
 }
 
 }  // namespace dump_graph
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.cc b/tensorflow/compiler/tf2xla/dump_graph_flags.cc
deleted file mode 100644
index 2eb1f8cd849b67922f94cfe3f88456b0d6beeaf8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <mutex>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/dump_graph_flags.h"
-#include "tensorflow/compiler/xla/parse_flags_from_env.h"
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Pointers to the parsed value of the flags and flag descriptors, initialized
-// via flags_init.
-static DumpGraphFlags* flags;
-static std::vector<Flag>* flag_list;
-static std::once_flag flags_init;
-
-// Allocate *flags.  Called via call_once(&flags_init,...).
-static void AllocateFlags() {
-  flags = new DumpGraphFlags;
-  flags->tf_dump_graph_prefix = "/tmp/";
-  flag_list = new std::vector<Flag>({
-      Flag("tf_dump_graph_prefix", &flags->tf_dump_graph_prefix,
-           "Path prefix to which graphs dumped during debugging should be "
-           "written."),
-  });
-  xla::ParseFlagsFromEnv(*flag_list);
-}
-
-// Append to *append_to flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<Flag>* append_to) {
-  std::call_once(flags_init, &AllocateFlags);
-  append_to->insert(append_to->end(), flag_list->begin(), flag_list->end());
-}
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags() {
-  std::call_once(flags_init, &AllocateFlags);
-  return flags;
-}
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/dump_graph_flags.h b/tensorflow/compiler/tf2xla/dump_graph_flags.h
deleted file mode 100644
index 80a3307d920f2cc3d668d507786a02e43589f86f..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/dump_graph_flags.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-#define TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
-
-// Legacy flags for the XLA bridge's dump_graph module.
-
-#include <vector>
-
-#include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
-
-namespace tensorflow {
-namespace legacy_flags {
-
-// Append to *flag_list flag definitions associated with the XLA bridge's
-// dump_graph module.
-void AppendDumpGraphFlags(std::vector<tensorflow::Flag>* flag_list);
-
-// The values of flags associated with the XLA bridge's
-// dump_graph module.
-typedef struct {
-  string tf_dump_graph_prefix;  // Path prefix to which graphs dumped during
-                                // debugging should be written.
-} DumpGraphFlags;
-
-// Return a pointer to the DumpGraphFlags struct;
-// repeated calls return the same pointer.
-// This should be called only after Flags::Parse() has returned.
-DumpGraphFlags* GetDumpGraphFlags();
-
-}  // namespace legacy_flags
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_DUMP_GRAPH_FLAGS_H_
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
index 9ef9f49f422ec4dfaf538ac3c0754ba3609d3f88..3dfd3f854c8646ebbf06d3378201d22e8741b7eb 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc
@@ -75,6 +75,25 @@ Status FunctionalizeControlFlow(Graph* graph,
   return FunctionalizeControlFlow(/*lookup_library=*/nullptr, graph, library);
 }
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library) {
+  return FunctionalizeControlFlowForGraphDef(/*lookup_library=*/nullptr,
+                                             graph_def, library);
+}
+
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library) {
+  FunctionDefLibrary function_lib = graph_def->library();
+  Graph graph(OpRegistry::Global());
+
+  TF_RETURN_IF_ERROR(ConvertGraphDefToGraph({}, *graph_def, &graph));
+  TF_RETURN_IF_ERROR(FunctionalizeControlFlow(lookup_library, &graph, library));
+  graph.ToGraphDef(graph_def);
+  std::swap(*graph_def->mutable_library(), function_lib);
+  return Status::OK();
+}
+
 Status FunctionalizeControlFlowForFunction(
     const string& func_name, const string& new_func_name,
     const protobuf::Map<string, tensorflow::AttrValue>& attrs,
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow.h b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
index ba99205640ccdc83a3a4d50e3ec474907894a835..91d33fa405834d7f1f8f66180583580f4f2e448a 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow.h
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.h
@@ -33,6 +33,12 @@ Status FunctionalizeControlFlow(const FunctionLibraryDefinition* lookup_library,
                                 Graph* graph,
                                 FunctionLibraryDefinition* library);
 
+Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def,
+                                           FunctionLibraryDefinition* library);
+Status FunctionalizeControlFlowForGraphDef(
+    const FunctionLibraryDefinition* lookup_library, GraphDef* graph_def,
+    FunctionLibraryDefinition* library);
+
 // This pass looks at the graph and all associated FunctionDefs, and turns
 // traditional control flow structure (Switch/Merge/etc.) into functional
 // control flow structure (If/While).
diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
index c3841f996f801e855da75b23f01d41674ec51c4d..9784985af83a18619d837528f99a60b98a501ec5 100644
--- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
+++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc
@@ -95,77 +95,87 @@ TEST(FunctionalizeControlFlow, Conditional) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    string op_name;
+    NameAttrList then_fn;
+    NameAttrList else_fn;
+    TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
+    InstantiationResultForTest else_result;
+    TF_EXPECT_OK(
+        InstantiateFunctionForTest(else_fn.name(), library, &else_result));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
+      auto if_op = ops::If(scope.WithOpName(op_name), less,
+                           std::initializer_list<Input>{less, y, x}, {DT_INT32},
+                           then_fn, else_fn);
+      auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-  string op_name;
-  NameAttrList then_fn;
-  NameAttrList else_fn;
-  TF_EXPECT_OK(FindIfThenAndElse(graph_def, &op_name, &then_fn, &else_fn));
-  InstantiationResultForTest else_result;
-  TF_EXPECT_OK(
-      InstantiateFunctionForTest(else_fn.name(), library, &else_result));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto y = ops::Placeholder(scope.WithOpName("y"), DT_INT32);
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto less = ops::Less(scope.WithOpName("cond/Less"), y, x);
-    auto if_op = ops::If(scope.WithOpName(op_name), less,
-                         std::initializer_list<Input>{less, y, x}, {DT_INT32},
-                         then_fn, else_fn);
-    auto id = ops::Identity(scope.WithOpName("cond/Merge"), if_op.output[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // then body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
-    auto cond = ops::Const(
-        scope.WithOpName("cond").WithControlDependencies(identity), 17);
-    auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(then_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+    // then body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity"), arg_0);
+      auto cond = ops::Const(
+          scope.WithOpName("cond").WithControlDependencies(identity), 17);
+      auto mul = ops::Mul(scope.WithOpName("cond/Mul"), arg_1, cond);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), mul, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(then_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-  // else body.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
-    auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
-    auto cond_1 = ops::Const(
-        scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
-    auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(else_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}), result.arg_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // else body.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg_0 = ops::_Arg(scope.WithOpName("_arg0"), DT_BOOL, 0);
+      auto arg_1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg_2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto identity = ops::Identity(scope.WithOpName("cond/Identity_1"), arg_0);
+      auto cond_1 = ops::Const(
+          scope.WithOpName("cond_1").WithControlDependencies(identity), 23);
+      auto add = ops::Add(scope.WithOpName("cond/false/add"), arg_2, cond_1);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(else_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      EXPECT_EQ((DataTypeVector{DT_BOOL, DT_INT32, DT_INT32}),
+                result.arg_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -239,75 +249,77 @@ TEST(FunctionalizeControlFlow, OneLoopVar) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
-// @function.Defun(noinline=True)
-// def increment_fn(x):
-//   return [x + 1]
-// Define the above function, and add it to the given graph. It's used as the
-// while loop body in NoinlineLoopBody test.
-Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
+FunctionDef GetNoinlineFunctionDef() {
   FunctionDef fdef = FunctionDefHelper::Create(
       "increment_fn", {"x:int32"}, {"add:int32"}, {},
       {
@@ -316,8 +328,17 @@ Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
       },
       {{"add", "add_0:z:0"}});
   (*fdef.mutable_attr())["_noinline"].set_b(true);
+  return fdef;
+}
+
+// @function.Defun(noinline=True)
+// def increment_fn(x):
+//   return [x + 1]
+// Define the above function, and add it to the given graph. It's used as the
+// while loop body in NoinlineLoopBody test.
+Status AddNoinlineFunctionToGraph(const string& node_name, Graph* graph) {
   FunctionDefLibrary fdef_lib;
-  *(fdef_lib.add_function()) = fdef;
+  *(fdef_lib.add_function()) = GetNoinlineFunctionDef();
   TF_RETURN_IF_ERROR(graph->AddFunctionLibrary(fdef_lib));
   NodeDef increment_fn;
   increment_fn.set_name(node_name);
@@ -376,55 +397,88 @@ TEST(FunctionalizeControlFlow, NoinlineLoopBody) {
   FunctionLibraryDefinition lookup_lib(graph.flib_def());
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
   // Function increment_fn will be copied from lookup_lib to library.
-  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
+  *(optimized_graph_def.mutable_library()->add_function()) =
+      GetNoinlineFunctionDef();
 
-  NameAttrList cond_fn, body_fn;
-  TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+  TF_ASSERT_OK(FunctionalizeControlFlowForGraphDef(
+      &lookup_lib, &optimized_graph_def, &library));
+  TF_ASSERT_OK(FunctionalizeControlFlow(&lookup_lib, &graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_ASSERT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      TF_ASSERT_OK(
+          AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      NodeDef retval;
+      retval.set_name("_retval0_RetVal");
+      retval.set_op(FunctionLibraryDefinition::kRetOp);
+      *retval.add_input() = noinline_node_name;
+      (*retval.mutable_attr())["T"].set_type(DT_INT32);
+      (*retval.mutable_attr())["index"].set_i(0);
+      Status status;
+      scope.graph()->AddNode(retval, &status);
+      TF_ASSERT_OK(status);
+
+      GraphDef expected;
+      TF_ASSERT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      // Verify that increment_fn has been copied to library.
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      // Ignore the function library when comparing the graphs.
+      expected.clear_library();
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
+}
 
-  // Body graph.
+TEST(FunctionalizeControlFlow, MissingFunctionDefInLibrary) {
+  const string& noinline_node_name = "while/increment_fn";
+  Graph graph(OpRegistry::Global());
   {
     Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+    auto identity = ops::Identity(scope.WithOpName("while/Identity"), source);
     TF_ASSERT_OK(AddNoinlineFunctionToGraph(noinline_node_name, scope.graph()));
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    NodeDef retval;
-    retval.set_name("_retval0_RetVal");
-    retval.set_op(FunctionLibraryDefinition::kRetOp);
-    *retval.add_input() = noinline_node_name;
-    (*retval.mutable_attr())["T"].set_type(DT_INT32);
-    (*retval.mutable_attr())["index"].set_i(0);
-    Status status;
-    scope.graph()->AddNode(retval, &status);
-    TF_ASSERT_OK(status);
-
-    GraphDef expected;
-    TF_ASSERT_OK(scope.ToGraphDef(&expected));
+    TF_ASSERT_OK(scope.ToGraph(&graph));
+  }
 
-    InstantiationResultForTest result;
-    // Verify that increment_fn has been copied to library.
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+  FunctionLibraryDefinition lookup_lib(graph.flib_def());
+  FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+  graph_def.clear_library();
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    // Ignore the function library when comparing the graphs.
-    expected.clear_library();
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
+  Status status =
+      FunctionalizeControlFlowForGraphDef(&lookup_lib, &graph_def, &library);
+  EXPECT_EQ(tensorflow::error::NOT_FOUND, status.code());
 }
 
 // Tests functionalizing OneLoopVar where the loop value is not used post the
@@ -467,65 +521,72 @@ TEST(FunctionalizeControlFlow, OneLoopVarWithoutExit) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{source}, cond_fn, body_fn);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto source = ops::Placeholder(scope.WithOpName("source"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{source}, cond_fn, body_fn);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
-    auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
-    auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+    // Condition graph
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("while/Less/y").WithControlDependencies(arg), 10);
+      auto less = ops::Less(scope.WithOpName("while/Less"), arg, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto identity = ops::Identity(scope.WithOpName("while/Identity"), arg);
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/y").WithControlDependencies(identity), 1);
+      auto add = ops::Add(scope.WithOpName("while/add"), identity, one);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_INT32}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -608,86 +669,95 @@ TEST(FunctionalizeControlFlow, TwoLoopVars) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
+
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList cond_fn, body_fn;
+    TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
+      auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
+      auto while_op =
+          ops::While(scope.WithOpName("while/LoopCond"),
+                     std::initializer_list<Input>{x, y}, cond_fn, body_fn);
+      auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
+      auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList cond_fn, body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &cond_fn, &body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("Placeholder/x"), DT_INT32);
-    auto y = ops::Placeholder(scope.WithOpName("Placeholder/y"), DT_INT32);
-    auto while_op =
-        ops::While(scope.WithOpName("while/LoopCond"),
-                   std::initializer_list<Input>{x, y}, cond_fn, body_fn);
-    auto sink_x = ops::Identity(scope.WithOpName("sink_x"), while_op[0]);
-    auto sink_y = ops::Identity(scope.WithOpName("sink_y"), while_op[1]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+    // Condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto three = ops::Const<int32>(scope.WithOpName("while/cond/three")
+                                         .WithControlDependencies(arg0.output),
+                                     3);
+      auto cond_add =
+          ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
+      auto ten = ops::Const<int32>(scope.WithOpName("while/cond/ten")
                                        .WithControlDependencies(arg0.output),
-                                   3);
-    auto cond_add =
-        ops::Add(scope.WithOpName("while/cond/Add"), arg0.output, three);
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("while/cond/ten").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-
-    auto identity_x = ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
-    auto identity_y = ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
-        1);
-    auto two = ops::Const<int32>(
-        scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
-        2);
+                                   10);
+      auto less = ops::Less(scope.WithOpName("while/cond/Less"), cond_add, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
 
-    auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
-    auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(cond_fn.name(), library, &result));
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(InstantiateFunctionForTest(body_fn.name(), library, &result));
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+
+      auto identity_x =
+          ops::Identity(scope.WithOpName("while/Identity/x"), arg0);
+      auto identity_y =
+          ops::Identity(scope.WithOpName("while/Identity/y"), arg1);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("while/add/one").WithControlDependencies(identity_x),
+          1);
+      auto two = ops::Const<int32>(
+          scope.WithOpName("while/mul/two").WithControlDependencies(identity_x),
+          2);
+
+      auto add = ops::Add(scope.WithOpName("while/add"), identity_x, one);
+      auto mul = ops::Add(scope.WithOpName("while/mul"), identity_y, two);
+      auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), mul, 1);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32}), result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
@@ -841,177 +911,192 @@ TEST(FunctionalizeControlFlow, Complex) {
   }
 
   FunctionLibraryDefinition library(OpRegistry::Global(), {});
+  GraphDef optimized_graph_def;
+  graph.ToGraphDef(&optimized_graph_def);
+  TF_ASSERT_OK(
+      FunctionalizeControlFlowForGraphDef(&optimized_graph_def, &library));
   TF_ASSERT_OK(FunctionalizeControlFlow(&graph, &library));
+  GraphDef converted_graph_def;
+  graph.ToGraphDef(&converted_graph_def);
 
-  GraphDef graph_def;
-  graph.ToGraphDef(&graph_def);
-
-  NameAttrList outer_cond_fn, outer_body_fn;
-  TF_EXPECT_OK(FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
-
-  // Outer graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
-    auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
-    auto y = ops::Add(scope.WithOpName("y"), x, three);
-
-    auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
-                                TensorShape({}));
-
-    auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
-
-    auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
-                               std::initializer_list<Input>{zero, y, x, var},
-                               outer_cond_fn, outer_body_fn);
-    auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-    TF_EXPECT_GRAPH_EQ(expected, graph_def);
-  }
-
-  // Outer condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto ten = ops::Const<int32>(
-        scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
-        10);
-    auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Outer body graph.
-  NameAttrList inner_cond_fn, inner_body_fn;
-  {
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
-
-    // Find the inner condition and body names.
-    TF_EXPECT_OK(
-        FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
-
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
-    auto one_j = ops::Const<int32>(
-        scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
-    auto while_op =
-        ops::While(scope.WithOpName("outer/LoopCond_1"),
-                   std::initializer_list<Input>{one_j, arg1, arg2, arg3},
-                   inner_cond_fn, inner_body_fn);
-
-    auto one_outer = ops::Const<int32>(
-        scope.WithOpName("outer/add/y").WithControlDependencies(identity_i), 1);
-    auto add_i =
-        ops::Add(scope.WithOpName("outer/add")
-                     .WithControlDependencies(absl::Span<const Operation>{
-                         while_op[0].op(), while_op[1].op()}),
-                 identity_i, one_outer);
-
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
-    auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner condition graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto five = ops::Const<int32>(
-        scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0), 5);
-    auto less_j = ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
-    auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
-
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
-
-    InstantiationResultForTest result;
+  for (const GraphDef& graph_def : {optimized_graph_def, converted_graph_def}) {
+    NameAttrList outer_cond_fn, outer_body_fn;
     TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
-
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
-  }
-
-  // Inner body graph.
-  {
-    Scope scope = Scope::NewRootScope().ExitOnError();
-    auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
-    auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
-    auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
-    auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
-
-    auto identity_j =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
-    auto identity_k =
-        ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
-
-    auto mul_jk =
-        ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
-    auto add_jkx = ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
-    auto assign = ops::AssignAddVariableOp(
-        scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
-
-    auto one = ops::Const<int32>(
-        scope.WithOpName("outer/inner/One")
-            .WithControlDependencies(
-                absl::Span<const Operation>{assign.operation}),
-        1);
-    auto add_j =
-        ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+        FindWhileCondAndBody(graph_def, &outer_cond_fn, &outer_body_fn));
+
+    // Outer graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto x = ops::Placeholder(scope.WithOpName("x"), DT_INT32);
+      auto three = ops::Const<int32>(scope.WithOpName("three"), 3);
+      auto y = ops::Add(scope.WithOpName("y"), x, three);
+
+      auto var = ops::VarHandleOp(scope.WithOpName("Variable"), DT_INT32,
+                                  TensorShape({}));
+
+      auto zero = ops::Const<int32>(scope.WithOpName("outer/Const"), 0);
+
+      auto while_op = ops::While(scope.WithOpName("outer/LoopCond"),
+                                 std::initializer_list<Input>{zero, y, x, var},
+                                 outer_cond_fn, outer_body_fn);
+      auto sink = ops::Identity(scope.WithOpName("sink"), while_op[0]);
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+      TF_EXPECT_GRAPH_EQ(expected, graph_def);
+    }
 
-    auto retval0 = ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
-    auto retval1 =
-        ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
-    auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+    // Outer condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto ten = ops::Const<int32>(
+          scope.WithOpName("outer/Less/y").WithControlDependencies(arg0.output),
+          10);
+      auto less = ops::Less(scope.WithOpName("outer/Less_i"), arg0, ten);
+      auto retval = ops::_Retval(scope.WithOpName("_retval0_RetVal"), less, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    GraphDef expected;
-    TF_EXPECT_OK(scope.ToGraphDef(&expected));
+    // Outer body graph.
+    NameAttrList inner_cond_fn, inner_body_fn;
+    {
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(outer_body_fn.name(), library, &result));
+
+      // Find the inner condition and body names.
+      TF_EXPECT_OK(
+          FindWhileCondAndBody(result.gdef, &inner_cond_fn, &inner_body_fn));
+
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_i = ops::Identity(scope.WithOpName("outer/Identity"), arg0);
+      auto one_j = ops::Const<int32>(
+          scope.WithOpName("outer/j").WithControlDependencies(identity_i), 1);
+      auto while_op =
+          ops::While(scope.WithOpName("outer/LoopCond_1"),
+                     std::initializer_list<Input>{one_j, arg1, arg2, arg3},
+                     inner_cond_fn, inner_body_fn);
+
+      auto one_outer = ops::Const<int32>(
+          scope.WithOpName("outer/add/y").WithControlDependencies(identity_i),
+          1);
+      auto add_i =
+          ops::Add(scope.WithOpName("outer/add")
+                       .WithControlDependencies(absl::Span<const Operation>{
+                           while_op[0].op(), while_op[1].op()}),
+                   identity_i, one_outer);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_i, 0);
+      auto retval1 = ops::_Retval(scope.WithOpName("_retval1_RetVal"), arg1, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    InstantiationResultForTest result;
-    TF_EXPECT_OK(
-        InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+    // Inner condition graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto five = ops::Const<int32>(
+          scope.WithOpName("outer/inner/Five").WithControlDependencies(arg0),
+          5);
+      auto less_j =
+          ops::Less(scope.WithOpName("outer/inner/Less_j"), arg0, five);
+      auto retval =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), less_j, 0);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_cond_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ(DataTypeVector{DT_BOOL}, result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
 
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
-              result.arg_types);
-    EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}), result.ret_types);
-    TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    // Inner body graph.
+    {
+      Scope scope = Scope::NewRootScope().ExitOnError();
+      auto arg0 = ops::_Arg(scope.WithOpName("_arg0"), DT_INT32, 0);
+      auto arg1 = ops::_Arg(scope.WithOpName("_arg1"), DT_INT32, 1);
+      auto arg2 = ops::_Arg(scope.WithOpName("_arg2"), DT_INT32, 2);
+      auto arg3 = ops::_Arg(scope.WithOpName("_arg3"), DT_RESOURCE, 3);
+
+      auto identity_j =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_j"), arg0);
+      auto identity_k =
+          ops::Identity(scope.WithOpName("outer/inner/Identity_k"), arg1);
+
+      auto mul_jk =
+          ops::Mul(scope.WithOpName("outer/inner/mul"), identity_j, identity_k);
+      auto add_jkx =
+          ops::Add(scope.WithOpName("outer/inner/add"), mul_jk, arg2);
+      auto assign = ops::AssignAddVariableOp(
+          scope.WithOpName("outer/inner/assign_add"), arg3, add_jkx);
+
+      auto one = ops::Const<int32>(
+          scope.WithOpName("outer/inner/One")
+              .WithControlDependencies(
+                  absl::Span<const Operation>{assign.operation}),
+          1);
+      auto add_j =
+          ops::Add(scope.WithOpName("outer/inner/add_j"), identity_j, one);
+
+      auto retval0 =
+          ops::_Retval(scope.WithOpName("_retval0_RetVal"), add_j, 0);
+      auto retval1 =
+          ops::_Retval(scope.WithOpName("_retval1_RetVal"), identity_k, 1);
+      auto retval2 = ops::_Retval(scope.WithOpName("_retval2_RetVal"), arg2, 2);
+
+      GraphDef expected;
+      TF_EXPECT_OK(scope.ToGraphDef(&expected));
+
+      InstantiationResultForTest result;
+      TF_EXPECT_OK(
+          InstantiateFunctionForTest(inner_body_fn.name(), library, &result));
+
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32, DT_RESOURCE}),
+                result.arg_types);
+      EXPECT_EQ((DataTypeVector{DT_INT32, DT_INT32, DT_INT32}),
+                result.ret_types);
+      TF_EXPECT_GRAPH_EQ(expected, result.gdef);
+    }
   }
 }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD
index d85b4f5ae0cb9c7d2476158a5830f921742ae980..8bc329229648c5aced8d06c99b170803bb3a90f8 100644
--- a/tensorflow/compiler/tf2xla/kernels/BUILD
+++ b/tensorflow/compiler/tf2xla/kernels/BUILD
@@ -121,13 +121,11 @@ tf_kernel_library(
         ":while_op",
         "//tensorflow/compiler/tf2xla:common",
         "//tensorflow/compiler/tf2xla:xla_compiler",
-        "//tensorflow/compiler/tf2xla/lib:batch_dot",
         "//tensorflow/compiler/tf2xla/lib:broadcast",
         "//tensorflow/compiler/tf2xla/lib:cholesky",
         "//tensorflow/compiler/tf2xla/lib:qr",
         "//tensorflow/compiler/tf2xla/lib:random",
         "//tensorflow/compiler/tf2xla/lib:scatter",
-        "//tensorflow/compiler/tf2xla/lib:triangular_solve",
         "//tensorflow/compiler/tf2xla/lib:util",
         "//tensorflow/compiler/tf2xla/lib:while_loop",
         "//tensorflow/compiler/tf2xla/ops:xla_ops",
@@ -144,10 +142,11 @@ tf_kernel_library(
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/compiler/xla/client/lib:matrix",
         "//tensorflow/compiler/xla/client/lib:pooling",
         "//tensorflow/compiler/xla/client/lib:prng",
         "//tensorflow/compiler/xla/client/lib:sorting",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:framework",
         "//tensorflow/core:image_ops_op_lib",
         "//tensorflow/core:lib",
@@ -196,7 +195,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
@@ -216,7 +214,6 @@ cc_library(
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
         "//tensorflow/core:framework",
         "//tensorflow/core/kernels:bounds_check",
         "//tensorflow/core/kernels:conv_ops",
diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
index 2db2514397deca39e6874cf994532a20d2186316..795ea09831e183a26fb3498b9bbaf9c3adaef9ed 100644
--- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc
@@ -50,7 +50,7 @@ class XlaArgOp : public XlaOpKernel {
       return;
     }
 
-    const XlaExpression& arg = XlaContext::Get(ctx).args()[index_];
+    const XlaExpression& arg = ctx->xla_context()->args()[index_];
     OP_REQUIRES(ctx, arg.kind() != XlaExpression::Kind::kInvalid,
                 errors::InvalidArgument("Invalid/missing argument expression"));
     ctx->SetOutputExpression(0, arg);
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
index 4cfe946b2e6146f034867c06e996ffae42b90705..1b254e328a8c71bd81a0ec700e2af1d81a5fa67a 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc
@@ -13,9 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
+#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 
 namespace tensorflow {
 namespace {
@@ -28,9 +30,11 @@ class BatchMatMulOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = BatchDot(ctx->Input(0), ctx->Input(1),
-                           /*transpose_x=*/adj_x_, /*transpose_y=*/adj_y_,
-                           /*conjugate_x=*/adj_x_, /*conjugate_y=*/adj_y_);
+    auto result =
+        xla::BatchDot(MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(0), adj_x_), adj_x_),
+                      MaybeTransposeInMinorDims(
+                          MaybeConjugate(ctx->Input(1), adj_y_), adj_y_));
     ctx->SetOutput(0, result);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
index a267c0c72fce67d7c22c55a57f8d5ac4ffd2b7e2..0e2f335f3354e3ae6008bdc0ac0b80683fe479c1 100644
--- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc
@@ -115,9 +115,9 @@ class FusedBatchNormGradOp : public XlaOpKernel {
     // operators. For now, cast everything to the statistics type (which
     // may be more precise than the input type).
     auto grad_backprop =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), scale_dtype);
+        XlaHelpers::ConvertElementType(ctx->Input(0), scale_dtype);
     auto activations =
-        XlaHelpers::ConvertElementType(b, ctx->Input(1), scale_dtype);
+        XlaHelpers::ConvertElementType(ctx->Input(1), scale_dtype);
     auto scale = ctx->Input(2);
     auto mean = ctx->Input(3);
     auto var = ctx->Input(4);
@@ -151,11 +151,11 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       const DataType accumulation_type =
           XlaHelpers::SumAccumulationType(scale_dtype);
       auto converted =
-          XlaHelpers::ConvertElementType(b, grad_backprop, accumulation_type);
+          XlaHelpers::ConvertElementType(grad_backprop, accumulation_type);
       auto reduce =
           xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                       *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
-      offset_backprop = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
+      offset_backprop = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       // scratch1 = rsqrt(pop_var + epsilon)
       auto neg_half = XlaHelpers::FloatLiteral(b, scale_dtype, -0.5);
@@ -165,19 +165,18 @@ class FusedBatchNormGradOp : public XlaOpKernel {
       // scratch2 = sum(y_backprop * (x - mean))
       auto mul =
           xla::Mul(grad_backprop, xla::Sub(activations, mean, {feature_index}));
-      converted = XlaHelpers::ConvertElementType(b, mul, accumulation_type);
+      converted = XlaHelpers::ConvertElementType(mul, accumulation_type);
       reduce =
           xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                       *ctx->GetOrCreateAdd(accumulation_type), reduction_dims);
-      auto scratch2 = XlaHelpers::ConvertElementType(b, reduce, scale_dtype);
+      auto scratch2 = XlaHelpers::ConvertElementType(reduce, scale_dtype);
 
       x_backprop =
           xla::Mul(grad_backprop, xla::Mul(scratch1, scale), {feature_index});
       scale_backprop = xla::Mul(scratch1, scratch2);
     }
 
-    ctx->SetOutput(0,
-                   XlaHelpers::ConvertElementType(b, x_backprop, input_dtype));
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(x_backprop, input_dtype));
     ctx->SetOutput(1, scale_backprop);
     ctx->SetOutput(2, offset_backprop);
     ctx->SetConstantOutput(3, Tensor());
diff --git a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
index 41f540506ba41fbe7f91393e7b8e26a89e72ef0a..e7f369b761f36a717ea5fb536780af91a8955b1e 100644
--- a/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/bias_ops.cc
@@ -107,11 +107,11 @@ class BiasAddGradOp : public XlaOpKernel {
     const DataType accumulation_type =
         XlaHelpers::SumAccumulationType(input_type(0));
     auto converted =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+        XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type);
     auto reduce =
         xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                     *ctx->GetOrCreateAdd(accumulation_type), reduce_dims);
-    ctx->SetOutput(0, XlaHelpers::ConvertElementType(b, reduce, input_type(0)));
+    ctx->SetOutput(0, XlaHelpers::ConvertElementType(reduce, input_type(0)));
   }
 
  private:
diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
index 47e517a6576d3a848bc41ceb703df2bd778c4a35..5e9280c1fe692037b0a842a92ef5a8c28b854a54 100644
--- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc
@@ -43,6 +43,9 @@ namespace {
         const std::vector<int64>& extend_dimensions) override {          \
       xla::XlaBuilder* b = ctx->builder();                               \
       (void)b;                                                           \
+      (void)lhs_shape;                                                   \
+      (void)rhs_shape;                                                   \
+      (void)extend_dimensions;                                           \
       return HLO;                                                        \
     }                                                                    \
   };                                                                     \
@@ -103,23 +106,23 @@ static xla::XlaOp FloorDivImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
 XLA_MAKE_BINARY(FloorDiv,
                 FloorDivImpl(b, input_type(0), lhs, rhs, broadcast_helper));
 
-static xla::XlaOp XlogyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
-                            xla::XlaOp y, const BCast& broadcast_helper) {
+xla::XlaOp XlogyImpl(xla::XlaOp x, xla::XlaOp y,
+                     const BCast& broadcast_helper) {
   std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
-  auto zero = XlaHelpers::Zero(b, dtype);
+  auto zero = xla::ZerosLike(x);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Mul(x, xla::Log(y)));
 }
-XLA_MAKE_BINARY(Xlogy, XlogyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
+XLA_MAKE_BINARY(Xlogy, XlogyImpl(lhs, rhs, broadcast_helper));
 
-static xla::XlaOp XdivyImpl(xla::XlaBuilder* b, DataType dtype, xla::XlaOp x,
-                            xla::XlaOp y, const BCast& broadcast_helper) {
+xla::XlaOp XdivyImpl(xla::XlaOp x, xla::XlaOp y,
+                     const BCast& broadcast_helper) {
   std::tie(x, y) = XlaBinaryOp::Broadcast(x, y, broadcast_helper);
-  auto zero = XlaHelpers::Zero(b, dtype);
+  auto zero = xla::ZerosLike(x);
   auto is_zero = xla::Eq(x, zero);
   return xla::Select(is_zero, zero, xla::Div(x, y));
 }
-XLA_MAKE_BINARY(Xdivy, XdivyImpl(b, input_type(0), lhs, rhs, broadcast_helper));
+XLA_MAKE_BINARY(Xdivy, XdivyImpl(lhs, rhs, broadcast_helper));
 
 // Implementation of FloorMod. Pseudo-code:
 // T trunc_mod = std::fmod(x, y);
diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
index ad85940920ebb82e72331516e3fe46c79f853892..7199b9b6feb36dd45ef51f4c38463bc715fcc38a 100644
--- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc
@@ -21,10 +21,13 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.pb.h"
 
 namespace tensorflow {
 namespace {
@@ -57,11 +60,9 @@ class CategoricalOp : public XlaOpKernel {
     const int64 batch_size = logits_shape.dim_size(0);
     const int64 num_classes = logits_shape.dim_size(1);
 
-    xla::XlaBuilder* builder = ctx->builder();
-
     xla::Shape uniform_shape;
     int class_dimension;
-    if (num_samples > 1) {
+    if (num_samples != 1) {
       std::array<int64, 3> uniform_shape_array = {
           {batch_size, num_samples, num_classes}};
       xla::PrimitiveType uniform_xla_type;
@@ -83,16 +84,16 @@ class CategoricalOp : public XlaOpKernel {
           xla::ShapeUtil::MakeShape(uniform_xla_type, uniform_shape_array);
       class_dimension = 1;
     }
-    xla::XlaOp uniforms =
-        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
-                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    xla::PrimitiveType type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(input_type(0), &type));
+    xla::XlaOp log_uniforms = GetLogUniforms(uniform_shape, type, ctx);
 
     // Use Gumbel softmax trick to generate categorical samples.
     // See:
     // https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/
     // TODO(b/68769470): Switch to using a cumulative sum approach.
     auto softmax_entries =
-        xla::Sub(logits, xla::Log(-xla::Log(uniforms)),
+        xla::Sub(logits, log_uniforms,
                  /*broadcast_dimensions=*/{0, class_dimension});
 
     xla::PrimitiveType xla_output_type;
@@ -107,6 +108,16 @@ class CategoricalOp : public XlaOpKernel {
     ctx->SetOutput(0, argmax);
   }
 
+  virtual xla::XlaOp GetLogUniforms(xla::Shape uniform_shape,
+                                    xla::PrimitiveType type,
+                                    XlaOpKernelContext* ctx) {
+    xla::XlaBuilder* builder = ctx->builder();
+    auto uniforms =
+        xla::RngUniform(XlaHelpers::Zero(builder, input_type(0)),
+                        XlaHelpers::One(builder, input_type(0)), uniform_shape);
+    return xla::Log(-xla::Log(uniforms));
+  }
+
  private:
   TF_DISALLOW_COPY_AND_ASSIGN(CategoricalOp);
 };
@@ -115,5 +126,48 @@ class CategoricalOp : public XlaOpKernel {
 REGISTER_XLA_OP(Name("Multinomial").CompileTimeConstantInput("num_samples"),
                 CategoricalOp);
 
+class StatelessCategoricalOp : public CategoricalOp {
+ public:
+  explicit StatelessCategoricalOp(OpKernelConstruction* ctx)
+      : CategoricalOp(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &dtype_));
+  }
+
+  xla::XlaOp GetLogUniforms(xla::Shape uniform_shape, xla::PrimitiveType type,
+                            XlaOpKernelContext* ctx) override {
+    xla::XlaOp seed = ctx->Input(2);
+    auto seed0 = xla::Reshape(xla::Slice(seed, {0}, {1}, {1}), {});
+    auto seed1 = xla::Reshape(xla::Slice(seed, {1}, {2}, {1}), {});
+
+    xla::XlaBuilder* builder = ctx->builder();
+    if (uniform_shape.element_type() == xla::BF16) {
+      uniform_shape.set_element_type(xla::F32);
+    }
+    auto uniforms = xla::StatelessRngUniform(
+        {seed0, seed1}, uniform_shape, XlaHelpers::Zero(builder, DT_FLOAT),
+        XlaHelpers::One(builder, DT_FLOAT));
+    return xla::ConvertElementType(xla::Log(-xla::Log(uniforms)), type);
+  }
+
+  void Compile(XlaOpKernelContext* ctx) override {
+    TensorShape seed_shape = ctx->InputShape(2);
+    OP_REQUIRES(ctx, seed_shape.dims() == 1 && seed_shape.dim_size(0) == 2,
+                errors::InvalidArgument("seed must have shape [2], not ",
+                                        seed_shape.DebugString()));
+    CategoricalOp::Compile(ctx);
+  }
+
+ private:
+  DataType dtype_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(StatelessCategoricalOp);
+};
+
+REGISTER_XLA_OP(Name("StatelessMultinomial")
+                    .CompileTimeConstantInput("num_samples")
+                    .TypeConstraint("T", {DT_FLOAT, DT_BFLOAT16})
+                    .TypeConstraint("Tseed", DT_INT32),
+                StatelessCategoricalOp);
+
 }  // anonymous namespace
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
index c9a1be494066e4f935a1d818bc86c86333e34fae..641fefafb357f6ad10483c454600f3dadd4f8cb7 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -65,60 +64,63 @@ xla::Shape ExpandedFilterShapeForDepthwiseConvolution(const xla::Shape& shape) {
 //   0 0 1 1 0 0   0 0 1 1 0 0
 //   0 0 0 0 1 1   0 0 0 0 1 1
 //
-// The first step is to create a one tensor, A, that is [3]
-//   0 1 2
+// The first step is to create a iota A with iota_dimension = 2
+//   0 0 0 0 0 0   0 0 0 0 0 0
+//   1 1 1 1 1 1   1 1 1 1 1 1
+//   2 2 2 2 2 2   2 2 2 2 2 2
 //
-// and another tensor, B,  that is [3 * 2]
-//   0 1 2 3 4 5
+//   0 0 0 0 0 0   0 0 0 0 0 0
+//   1 1 1 1 1 1   1 1 1 1 1 1
+//   2 2 2 2 2 2   2 2 2 2 2 2
 //
-// and divide B it by 2 to get
-//   0 0 1 1 2 2
+// and another iota B with iota_dimension = 3
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
 //
-// then we broadcast the B to [2, 2, 3, 3 * 2]
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
+//   0 1 2 3 4 5  0 1 2 3 4 5
 //
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
-//   0 0 1 1 2 2   0 0 1 1 2 2
+// and divide B by 2 to get
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
 //
-// Finally compare A and broadcasted B in dimension 2 amd return the result at
-// the beginning of the comment.
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//   0 0 1 1 2 2  0 0 1 1 2 2
+//
+// Finally compare A and B and return the result at the beginning of the
+// comment.
 xla::XlaOp CreateExpandedFilterMask(const xla::Shape& filter_shape,
                                     xla::XlaBuilder* builder) {
   xla::Shape expanded_filter_shape =
       ExpandedFilterShapeForDepthwiseConvolution(filter_shape);
   int64 depthwise_multiplier =
       filter_shape.dimensions(filter_shape.dimensions_size() - 1);
-  int64 input_feature =
-      filter_shape.dimensions(filter_shape.dimensions_size() - 2);
-
-  // Create a M sized linspace and an M*N sized linspace that will be
-  // broadcasted into perpendicular dimensions and compared.
-  xla::XlaOp input_feature_iota = xla::Iota(builder, xla::S32, input_feature);
-  xla::XlaOp expanded_feature_iota =
-      xla::Iota(builder, xla::S32, input_feature * depthwise_multiplier);
 
-  // Divide the M*N sized linspace by the depthwise_multiplier to create
-  // [0 0 1 1 2 2] in the example in the function comment.
+  // Create two iotas with the shape of the expanded filter, one of them with
+  // the iota dimension chosen as the feature dimension, and the other a iota
+  // with the iota dimension chosen as the expanded output feature dimension.
+  std::vector<int64> iota_dimensions(expanded_filter_shape.dimensions().begin(),
+                                     expanded_filter_shape.dimensions().end());
+  xla::Shape iota_shape = xla::ShapeUtil::MakeShape(xla::S32, iota_dimensions);
+  xla::XlaOp input_feature_iota = xla::Iota(
+      builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 2);
+  xla::XlaOp expanded_feature_iota = xla::Iota(
+      builder, iota_shape, /*iota_dimension=*/iota_dimensions.size() - 1);
+
+  // Divide 'expanded_feature_iota' by the depthwise_multiplier to create
+  // [0 0 1 1 2 2] ... in the example in the function comment.
   expanded_feature_iota =
       xla::Div(expanded_feature_iota,
                XlaHelpers::IntegerLiteral(builder, DataType::DT_INT32,
                                           depthwise_multiplier));
 
-  // Broadcast the N*M linspace to [H, W, ..., M, M*N].
-  std::vector<int64> expanded_feature_broadcast_dims(
-      expanded_filter_shape.dimensions().begin(),
-      expanded_filter_shape.dimensions().end());
-  expanded_feature_broadcast_dims.pop_back();
-  auto broadcasted_expanded_feature_iota =
-      xla::Broadcast(expanded_feature_iota, expanded_feature_broadcast_dims);
-
-  // Compare the broadcasted linspace to the input feature linspace in the
-  // input feature dimension to create a diagonal predicate.
-  return xla::Eq(broadcasted_expanded_feature_iota, input_feature_iota,
-                 {expanded_filter_shape.dimensions_size() - 2});
+  // Compare 'input_feature_iota' with 'expanded_feature_iota' to create a
+  // diagonal predicate.
+  return xla::Eq(expanded_feature_iota, input_feature_iota);
 }
 
 // Reshapes a filter of shape [H, W, ..., M, N] to [H, W, ..., 1, M*N]. Used to
diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
index d820528a43064e327cb90e5a2889f77ab1f3f3e2..eafdba876ae9e2c38694f065cf83bb3725b8460e 100644
--- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc
@@ -22,7 +22,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/core/framework/node_def_util.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
index 49c12fc232092873b69961644a059abc6035f64f..ee79cbc70da269be7586c47b4fd33c901f4fd581 100644
--- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc
@@ -19,7 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
index b2f6ef43fa9765b0d6da8e3215cbea5b56b4fe05..6e6ba21daf5bf3eab5bfc15378e77b6dd253da7c 100644
--- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc
@@ -113,8 +113,20 @@ class DynamicStitchOp : public XlaOpKernel {
       }
     }
     int number_of_indices = max_index + 1;
-    OP_REQUIRES(ctx, number_of_indices > 0,
-                errors::InvalidArgument("no indices supplied"));
+    int64 result_rank = 1 + data0_shape.dims() - indices0_shape.dims();
+    if (number_of_indices == 0) {
+      std::vector<int64> result_shape(result_rank);
+      for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
+        result_shape[d - indices0_shape.dims() + 1] = data0_shape.dim_size(d);
+      }
+      xla::PrimitiveType element_type =
+          ctx->input_xla_type(ctx->num_inputs() - 1);
+      xla::Literal empty_literal = xla::Literal::CreateFromShape(
+          xla::ShapeUtil::MakeShape(element_type, result_shape));
+      ctx->SetOutput(0, xla::ConstantLiteral(ctx->builder(), empty_literal));
+      return;
+    }
+
     // Construct the reverse mapping, for each index, of which slice of which
     // input it comes from.
     std::vector<int32> src_input_vector(number_of_indices);
@@ -157,12 +169,9 @@ class DynamicStitchOp : public XlaOpKernel {
 
     // Set up the vectors for slicing: the first dimension will vary
     // slice by slice, and the rest take the full common extra shape.
-    std::vector<int64> slice_start(1 + data0_shape.dims() -
-                                   indices0_shape.dims());
-    std::vector<int64> slice_limit(1 + data0_shape.dims() -
-                                   indices0_shape.dims());
-    std::vector<int64> stride(1 + data0_shape.dims() - indices0_shape.dims(),
-                              1);
+    std::vector<int64> slice_start(result_rank);
+    std::vector<int64> slice_limit(result_rank);
+    std::vector<int64> stride(result_rank, 1);
     for (int d = indices0_shape.dims(); d < data0_shape.dims(); d++) {
       slice_limit[1 + d - indices0_shape.dims()] = data0_shape.dim_size(d);
     }
diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
index c68b0bfd7961892294c2931e5c4c44de534a7740..29687c7b82f92d9f336854c4575746589c63b64f 100644
--- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc
@@ -17,7 +17,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/util/tensor_format.h"
 
diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
index cdba6680dee3fade5bdf0c453ed672b653072b0d..142be030f737f105980ab9c80a5a849e1ca6eb47 100644
--- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc
@@ -260,19 +260,19 @@ class FakeQuantWithMinMaxVarsGradOp : public XlaOpKernel {
     xla::XlaOp below_min = xla::Lt(input, nudged_input_min);
     xla::XlaOp select1 = xla::Select(below_min, gradient, zeroes);
     xla::XlaOp reduce1 = xla::ReduceAll(
-        XlaHelpers::ConvertElementType(b, select1, accumulation_type),
+        XlaHelpers::ConvertElementType(select1, accumulation_type),
         XlaHelpers::Zero(b, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type));
-    xla::XlaOp output1 = XlaHelpers::ConvertElementType(b, reduce1, data_type);
+    xla::XlaOp output1 = XlaHelpers::ConvertElementType(reduce1, data_type);
     ctx->SetOutput(1, output1);
 
     xla::XlaOp above_max = xla::Gt(input, nudged_input_max);
     xla::XlaOp select2 = xla::Select(above_max, gradient, zeroes);
     xla::XlaOp reduce2 = xla::ReduceAll(
-        XlaHelpers::ConvertElementType(b, select2, accumulation_type),
+        XlaHelpers::ConvertElementType(select2, accumulation_type),
         XlaHelpers::Zero(b, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type));
-    xla::XlaOp output2 = XlaHelpers::ConvertElementType(b, reduce2, data_type);
+    xla::XlaOp output2 = XlaHelpers::ConvertElementType(reduce2, data_type);
     ctx->SetOutput(2, output2);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
index 9b06357d9b78be6d7b64e88a97f45f6c19176fc8..6df8b5367d2390e65995beb1583b225755e6ee9f 100644
--- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -50,11 +51,36 @@ class GenericFftOp : public XlaOpKernel {
         errors::InvalidArgument("input must be at least 1 dimensional"));
 
     std::vector<int64> fft_length;
+    xla::XlaOp input = ctx->Input(0);
     if (fft_type_ == FftType::RFFT || fft_type_ == FftType::IRFFT) {
       OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntVector(1, &fft_length));
       OP_REQUIRES(ctx, fft_length.size() == fft_rank_,
                   errors::InvalidArgument("fft_length must be length ",
                                           fft_rank_, " vector"));
+
+      // Zero pad or truncate the axes we're doing FFT on.
+      absl::InlinedVector<int64, 4> slice_sizes = input_shape.dim_sizes();
+      std::vector<std::pair<int64, int64>> padding_sizes(slice_sizes.size());
+      std::vector<int64> expected_sizes = fft_length;
+      // IRFFT wants the innermost axis to be n / 2 + 1.
+      if (fft_type_ == FftType::IRFFT) {
+        expected_sizes[fft_rank_ - 1] = fft_length[fft_rank_ - 1] / 2 + 1;
+      }
+      for (int i = 0; i < fft_rank_; i++) {
+        int index = input_shape.dims() - fft_rank_ + i;
+        if (input_shape.dim_size(index) > expected_sizes[i]) {
+          slice_sizes[index] = expected_sizes[i];
+        } else {
+          padding_sizes[index].second =
+              expected_sizes[i] - input_shape.dim_size(index);
+        }
+      }
+
+      std::vector<int64> start_indices(input_shape.dims(), 0);
+      std::vector<int64> strides(input_shape.dims(), 1);
+      input = xla::Pad(xla::Slice(input, start_indices, slice_sizes, strides),
+                       XlaHelpers::Zero(ctx->builder(), ctx->input_type(0)),
+                       xla::MakeEdgePaddingConfig(padding_sizes));
     } else {
       // Innermost axis provides the FFT length.
       for (int i = 0; i < fft_rank_; i++) {
@@ -63,7 +89,7 @@ class GenericFftOp : public XlaOpKernel {
       }
     }
 
-    xla::XlaOp fft = xla::Fft(ctx->Input(0), fft_type_, fft_length);
+    xla::XlaOp fft = xla::Fft(input, fft_type_, fft_length);
     ctx->SetOutput(0, fft);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc
index 56da50f140893c68c8a1556853884720b21c7229..b5e083912555c865b5eadc7697075c9ca4451ca9 100644
--- a/tensorflow/compiler/tf2xla/kernels/if_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc
@@ -72,7 +72,7 @@ void XlaIfOp::Compile(XlaOpKernelContext* ctx) {
       arg.shape = resource->shape();
       OP_REQUIRES(ctx, arg.initialized,
                   errors::Unimplemented("Uninitialized arguments: ", arg.name));
-      arg.tensor_array_size = resource->tensor_array_size();
+      arg.max_array_size = resource->max_array_size();
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
index b49b2516d8b829a550071bc7580d350328833f32..e9bb0a77e99d144863b027bd214081316d61c314 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc
@@ -191,12 +191,11 @@ class AdjustContrastOpV2 : public XlaOpKernel {
     DataType type = context->input_type(0);
 
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
-    auto converted =
-        XlaHelpers::ConvertElementType(b, input, accumulation_type);
+    auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto reduce = xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                               *context->GetOrCreateAdd(accumulation_type),
                               {height_dim, width_dim});
-    auto output = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto output = XlaHelpers::ConvertElementType(reduce, type);
     output =
         xla::Div(output, XlaHelpers::FloatLiteral(b, type, height * width));
 
diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
index 0c7ca602bfacd598dada0303d3a3e77fe7f1b0fc..5a10c52ba8b6d4fab73f0dda67cbd52fd625e76b 100644
--- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/kernel_def_builder.h"
 #include "tensorflow/core/framework/register_types.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
index e310db2162da0997204f85bc3ca42e7b0460e1e3..e2c05b648bb194b1b452c527ddb1a2c5995b1217 100644
--- a/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
+++ b/tensorflow/compiler/tf2xla/kernels/index_ops_cpu.cc
@@ -30,7 +30,9 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-// The logic below uses a custom-call to implement argmax.
+// The logic below uses a custom-call to implement argmax when possible. When
+// custom-call is not allowed or input shapes are not supported, this kernel
+// falls back to using XLA HLO native ArgMax.
 //
 // Also see b/29507024 for first-class XLA support for indexing ops.
 class ArgMaxCustomCallOp : public XlaOpKernel {
@@ -50,27 +52,40 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     // overhead, when compiling ahead-of-time.
     int64 dim;
     OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(1, &dim));
-    OP_REQUIRES(ctx, dim >= 0, errors::InvalidArgument("dim must be >= 0"));
-    OP_REQUIRES(
-        ctx, dim < input_shape.dims(),
-        errors::InvalidArgument("dim must be < input rank (",
-                                input_shape.dims(), "), but got: ", dim));
-    const int64 dim_size = input_shape.dim_size(dim);
-    OP_REQUIRES(ctx, dim_size > 0,
+
+    const int input_dims = input_shape.dims();
+    const int axis = dim < 0 ? dim + input_dims : dim;
+    OP_REQUIRES(ctx, axis >= 0 && axis < input_dims,
+                errors::InvalidArgument("Expected dimension in the range [",
+                                        -input_dims, ", ", input_dims,
+                                        "), but got ", dim));
+
+    const int64 axis_size = input_shape.dim_size(axis);
+    OP_REQUIRES(ctx, axis_size > 0,
                 errors::InvalidArgument(
                     "Reduction axis ", dim,
                     " is empty in shape: ", input_shape.DebugString()));
 
-    // The output shape is the input shape contracted along dim.
+    const DataType dtype = output_type(0);
+    xla::PrimitiveType output_type;
+    OP_REQUIRES_OK(ctx, DataTypeToPrimitiveType(dtype, &output_type));
+
+    // Fall back to XLA ArgMax HLO when CustomCall is not allowed or when input
+    // shape isn't supported.
+    if (!ctx->compiler()->options().allow_cpu_custom_calls ||
+        (input_dims != 1 && input_dims != 2)) {
+      xla::XlaOp output = XlaHelpers::ArgMax(ctx->Input(0), output_type, axis);
+      ctx->SetOutput(0, output);
+      return;
+    }
+
+    xla::XlaOp output;
+    // The output shape is the input shape contracted along axis.
     TensorShape output_shape;
     for (int d = 0; d < input_shape.dims() - 1; ++d) {
-      output_shape.AddDim(input_shape.dim_size((d < dim) ? d : d + 1));
+      output_shape.AddDim(input_shape.dim_size((d < axis) ? d : d + 1));
     }
 
-    // For now we use a custom-call, only for the 1d and 2d cases.
-    OP_REQUIRES(ctx, XlaContext::Get(ctx).allow_cpu_custom_calls(),
-                errors::InvalidArgument(
-                    "ArgMax implementation requires a CustomCall on CPU"));
     xla::XlaBuilder& b = *ctx->builder();
 
     // XLA passes <out> to the function, so it is not included here.
@@ -84,7 +99,7 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
       args.push_back(xla::ConstantLiteral(
           &b, xla::LiteralUtil::CreateR1<int64>(output_shape.dim_sizes())));
       args.push_back(
-          xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0<int32>(dim)));
+          xla::ConstantLiteral(&b, xla::LiteralUtil::CreateR0<int32>(axis)));
     }
 
     // The argmax function expects row-major layout.
@@ -101,24 +116,15 @@ class ArgMaxCustomCallOp : public XlaOpKernel {
     }
 
     // Tell XLA to call the custom code, defined in
-    // index_ops_kernel_argmax_float_1d.cc.
-    xla::XlaOp output;
-    switch (input_shape.dims()) {
-      case 1:
-        output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args,
-                                           xla_shape, arg_shapes);
-        break;
-      case 2:
-        output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args,
-                                           xla_shape, arg_shapes);
-        break;
-      default:
-        OP_REQUIRES(ctx, false,
-                    errors::Unimplemented(
-                        "Argmax is only implemented for 1d and 2d tensors"
-                        ", but got shape: ",
-                        input_shape.DebugString()));
+    // index_ops_kernel_argmax_float_{1, 2}d.cc.
+    if (input_dims == 1) {
+      output = xla::CustomCallWithLayout(&b, "argmax_float_1d_xla_impl", args,
+                                         xla_shape, arg_shapes);
+    } else {
+      output = xla::CustomCallWithLayout(&b, "argmax_float_2d_xla_impl", args,
+                                         xla_shape, arg_shapes);
     }
+    output = xla::ConvertElementType(output, output_type);
     ctx->SetOutput(0, output);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
index f028e361bccd51de0bd69a1d2227c7afaed53455..93f029731c34e84000a3dc00df8af05654cccf2d 100644
--- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc
@@ -37,12 +37,11 @@ class L2LossOp : public XlaOpKernel {
 
     //  output = sum(t ** 2) / 2
     const DataType accumulation_type = XlaHelpers::SumAccumulationType(dtype);
-    auto t =
-        XlaHelpers::ConvertElementType(b, ctx->Input(0), accumulation_type);
+    auto t = XlaHelpers::ConvertElementType(ctx->Input(0), accumulation_type);
     auto square = xla::Mul(t, t);
     auto reduce = xla::Reduce(square, XlaHelpers::Zero(b, accumulation_type),
                               *ctx->GetOrCreateAdd(accumulation_type), dims);
-    auto deconverted = XlaHelpers::ConvertElementType(b, reduce, dtype);
+    auto deconverted = XlaHelpers::ConvertElementType(reduce, dtype);
     auto two = XlaHelpers::IntegerLiteral(b, dtype, 2);
     ctx->SetOutput(0, xla::Div(deconverted, two));
   }
diff --git a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
index 87ee2d3aede50eb24e65570f106d49030e1d4236..987901d82b3f3798dd52f18ef2497b8f0cf80b11 100644
--- a/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/lrn_ops.cc
@@ -49,16 +49,14 @@ class LRNOp : public XlaOpKernel {
     // We use a window of depth_radius_ * 2 + 1, to account for the current
     // element and a depth_radius_ on either side.
     auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
-    auto converted =
-        XlaHelpers::ConvertElementType(builder, input, accumulation_type);
+    auto converted = XlaHelpers::ConvertElementType(input, accumulation_type);
     auto squared = xla::Mul(converted, converted);
     auto reduce = xla::ReduceWindow(
         squared, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto sqr_sum =
-        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
+    auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0));
 
     auto scale = xla::Pow(
         xla::Add(xla::ConstantR0<float>(builder, bias_),
@@ -138,15 +136,14 @@ class LRNGradOp : public XlaOpKernel {
 
     auto accumulation_type = XlaHelpers::SumAccumulationType(input_type(0));
     auto converted =
-        XlaHelpers::ConvertElementType(builder, in_image, accumulation_type);
+        XlaHelpers::ConvertElementType(in_image, accumulation_type);
     auto squared = xla::Mul(converted, converted);
     auto reduce = xla::ReduceWindow(
         squared, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto sqr_sum =
-        XlaHelpers::ConvertElementType(builder, reduce, input_type(0));
+    auto sqr_sum = XlaHelpers::ConvertElementType(reduce, input_type(0));
 
     auto norm =
         xla::Add(xla::ConstantR0<float>(builder, bias_),
@@ -157,15 +154,13 @@ class LRNGradOp : public XlaOpKernel {
                  xla::Div(out_image, norm)),
         in_grads);
 
-    auto converted_dy =
-        XlaHelpers::ConvertElementType(builder, dy, accumulation_type);
+    auto converted_dy = XlaHelpers::ConvertElementType(dy, accumulation_type);
     auto dy_reduce = xla::ReduceWindow(
         converted_dy, XlaHelpers::Zero(builder, accumulation_type),
         *ctx->GetOrCreateAdd(accumulation_type),
         /* window_dimensions = */ {1, 1, 1, depth_radius_ * 2 + 1},
         /* window_strides = */ {1, 1, 1, 1}, xla::Padding::kSame);
-    auto dy_reduced =
-        XlaHelpers::ConvertElementType(builder, dy_reduce, input_type(0));
+    auto dy_reduced = XlaHelpers::ConvertElementType(dy_reduce, input_type(0));
 
     xla::XlaOp gradients = xla::Add(
         xla::Mul(in_image, dy_reduced),
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
index 8dfd7de591c4a3c4768dd60b41e03d294ad49397..2dd0a710e47ec8cad6153402fdb3be59f5868205 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_band_part_op.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 
 namespace tensorflow {
@@ -61,11 +61,11 @@ class MatrixBandPartOp : public XlaOpKernel {
 
     // Compute 'offset', which is how many diagonals we are above/below the
     // diagonal.
-    xla::XlaOp iota_m = xla::Iota(builder, index_xla_type, m);
-    xla::XlaOp iota_n = xla::Iota(builder, index_xla_type, n);
+    xla::Shape iota_shape = xla::ShapeUtil::MakeShape(index_xla_type, {m, n});
+    xla::XlaOp iota_m = xla::Iota(builder, iota_shape, /*iota_dimension=*/0);
+    xla::XlaOp iota_n = xla::Iota(builder, iota_shape, /*iota_dimension=*/1);
 
-    auto offset = xla::Sub(xla::Broadcast(iota_n, {m}), iota_m,
-                           /*broadcast_dimensions=*/{0});
+    auto offset = xla::Sub(iota_n, iota_m);
 
     // If num_lower or num_upper are negative, include all lower/upper
     // diagonals.
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
index c0ca881ff82cee04e0c5e35f9a2d5732fabdd8a6..4f980b6d14ed667bdf4756ed740894098cae5919 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_set_diag_op.cc
@@ -16,7 +16,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 
 namespace tensorflow {
diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
index f4def11d08c31513aec5aad15187016a7294c2fd..90c0ebefb24ec2c4378782e9b15d3f57c33032a4 100644
--- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 namespace tensorflow {
 namespace {
@@ -29,7 +29,7 @@ class MatrixTriangularSolveOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    auto result = TriangularSolve(
+    auto result = xla::TriangularSolve(
         ctx->Input(0), ctx->Input(1), /*left_side=*/true,
         /*lower=*/lower_, /*transpose_a=*/adjoint_, /*conjugate_a=*/adjoint_);
     ctx->SetOutput(0, result);
diff --git a/tensorflow/compiler/tf2xla/kernels/permute_op.cc b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
index 94b51e1a586c6cf623c181abf200b91851c7ba05..71920bf5c1e6aa5981aafa8b611cc01c0917e02b 100644
--- a/tensorflow/compiler/tf2xla/kernels/permute_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/permute_op.cc
@@ -75,8 +75,7 @@ class DataFormatVecPermuteOp : public XlaOpKernel {
     }
     auto keys = xla::ConstantR1(builder, absl::Span<const int32>(dst_indices));
     if (input_rank == 2) {
-      keys = xla::BroadcastInDim(
-          keys, xla::ShapeUtil::MakeShape(xla::S32, {4, 2}), {0});
+      keys = xla::BroadcastInDim(keys, {4, 2}, {0});
     }
     auto sorted = xla::Sort(keys, {ctx->Input(0)}, 0);
     auto output = xla::GetTupleElement(sorted, 1);
diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
index a259da6383d461fd11b0d79096bf66aae7ddef06..06c6cc37ec90192486ba15010bfeb763a9ffb987 100644
--- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc
@@ -152,7 +152,12 @@ class MaxPoolOp : public PoolingOp {
  public:
   MaxPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
-                  /*reduction_type=*/ctx->input_type(0)) {}
+                  /*reduction_type=*/ctx->input_type(0)) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -180,10 +185,6 @@ class MaxPool2DOp : public MaxPoolOp {
  public:
   explicit MaxPool2DOp(OpKernelConstruction* ctx)
       : MaxPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("MaxPool"), MaxPool2DOp);
@@ -204,7 +205,12 @@ class AvgPoolOp : public PoolingOp {
   AvgPoolOp(OpKernelConstruction* ctx, int num_spatial_dims)
       : PoolingOp(ctx, /*num_spatial_dims=*/num_spatial_dims,
                   /*reduction_type=*/
-                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {}
+                  XlaHelpers::SumAccumulationType(ctx->input_type(0))) {
+    string data_format_str;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
+    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
+  }
 
   void Compile(XlaOpKernelContext* ctx) override {
     auto ksize_or_error = GetKernelSize(ctx);
@@ -241,10 +247,6 @@ class AvgPool2DOp : public AvgPoolOp {
  public:
   explicit AvgPool2DOp(OpKernelConstruction* ctx)
       : AvgPoolOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format_str;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format_str));
-    OP_REQUIRES(ctx, FormatFromString(data_format_str, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(Name("AvgPool"), AvgPool2DOp);
@@ -390,6 +392,11 @@ class AvgPoolGradOp : public XlaOpKernel {
     OP_REQUIRES(ctx, ksize_[0] == 1 && stride_[0] == 1,
                 errors::Unimplemented(
                     "Pooling is not yet supported on the batch dimension."));
+
+    string data_format;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
+    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
+                errors::InvalidArgument("Invalid data format"));
   }
 
   int num_dims() const { return num_spatial_dims_ + 2; }
@@ -449,10 +456,6 @@ class AvgPool2DGradOp : public AvgPoolGradOp {
  public:
   explicit AvgPool2DGradOp(OpKernelConstruction* ctx)
       : AvgPoolGradOp(ctx, /*num_spatial_dims=*/2) {
-    string data_format;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("data_format", &data_format));
-    OP_REQUIRES(ctx, FormatFromString(data_format, &data_format_),
-                errors::InvalidArgument("Invalid data format"));
   }
 };
 REGISTER_XLA_OP(
diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
index 6f4ed496a1774dde68dd9d5fbd37995d615b678c..7fe102428db1cc5ce16037f56fa301d1941da8e3 100644
--- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/core/platform/macros.h"
@@ -26,12 +27,26 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
+enum QuantizerRoundMode {
+  // Round half up: if the fraction of y is exactly 0.5, then
+  // round(y) = y + 0.5
+  // E.g., -5.5 gets rounded to -5, -5.4 goes to -5,
+  // 5.4 goes to 5, and 5.5 goes to 6.
+  ROUND_HALF_UP,
+  // Round half to even: if the fraction of y is exactly 0.5, then round(y) is
+  // the nearest even integer to y.
+  // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes
+  // -24, and -24.5 gets rounded to 24.
+  ROUND_HALF_TO_EVEN,
+};
+
 class QuantizeAndDequantizeOp : public XlaOpKernel {
  public:
   explicit QuantizeAndDequantizeOp(OpKernelConstruction* ctx)
       : XlaOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("signed_input", &signed_input_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
+    round_mode_ = ROUND_HALF_TO_EVEN;
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
@@ -117,8 +132,17 @@ class QuantizeAndDequantizeOp : public XlaOpKernel {
       // in that case they were measured from the tensor.
       input = Clamp(min_range, input, max_range);
     }
-    xla::XlaOp result =
-        Floor((input - min_range) * scale + half) * inverse_scale + min_range;
+    xla::XlaOp result;
+    switch (round_mode_) {
+      case ROUND_HALF_TO_EVEN: {
+        result = xla::RoundToEven(input * scale) * inverse_scale;
+        break;
+      }
+      case ROUND_HALF_UP: {
+        result = Floor(input * scale + half) * inverse_scale;
+        break;
+      }
+    }
     ctx->SetOutput(0, result);
   }
 
@@ -126,6 +150,7 @@ class QuantizeAndDequantizeOp : public XlaOpKernel {
   int64 num_bits_ = -1;
   bool signed_input_;
   bool range_given_;
+  QuantizerRoundMode round_mode_;
 };
 
 class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp {
@@ -136,6 +161,20 @@ class QuantizeAndDequantizeV2Op : public QuantizeAndDequantizeOp {
     OP_REQUIRES(ctx, num_bits_ > 0 && num_bits_ < (signed_input_ ? 62 : 63),
                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
                                         " with signed_input_ ", signed_input_));
+    string round_mode_string;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
+    OP_REQUIRES(
+        ctx,
+        (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"),
+        errors::InvalidArgument("Round mode string must be "
+                                "'HALF_UP' or "
+                                "'HALF_TO_EVEN', is '" +
+                                round_mode_string + "'"));
+    if (round_mode_string == "HALF_UP") {
+      round_mode_ = ROUND_HALF_UP;
+    } else if (round_mode_string == "HALF_TO_EVEN") {
+      round_mode_ = ROUND_HALF_TO_EVEN;
+    }
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
index 415ce9b77ffeac8a6a5f3c23537afb16c1d3567c..8822e29f7e77b1cbc6fa6ca61d0062d9b1b0c36e 100644
--- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/tensor.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
index 107fa62967a55dffcfff8728b65338564e5202d2..65e158d64fdd7df62d50b81c9e488b2d03476fb7 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc
@@ -113,12 +113,21 @@ class MeanOp : public XlaReductionOp {
     xla::Add(scalar_lhs, scalar_rhs);
   }
 
-  xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                            const xla::XlaOp& reduce_output,
-                            int64 num_elements_reduced) override {
-    auto divisor = XlaHelpers::IntegerLiteral(builder, input_type(0),
-                                              num_elements_reduced);
-    return reduce_output / divisor;
+  xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* /*builder*/, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce) override {
+    if (dimensions_to_reduce.empty()) {
+      return reduce_output;
+    }
+    auto divisor = xla::GetDimensionSize(input, dimensions_to_reduce[0]);
+    for (int i = 1; i < dimensions_to_reduce.size(); i++) {
+      auto size = xla::GetDimensionSize(input, dimensions_to_reduce[i]);
+      divisor = xla::Mul(divisor, size);
+    }
+    divisor = xla::ConvertElementType(divisor, xla_reduction_type_);
+    return XlaHelpers::ConvertElementType(reduce_output / divisor,
+                                          input_type(0));
   }
 };
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
index 466e79828d111ee7cadcf713703e8f252c63e62c..af716eab79886791e8507a84984b7ca60865d00e 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h
@@ -48,13 +48,14 @@ class XlaReductionOp : public XlaOpKernel {
                             const xla::XlaOp& scalar_rhs) = 0;
 
   // Applies a transformation to the output of the reduction. The desired
-  // computation should be added to 'builder'. Argument 'reduce_output' is the
-  // output of the reduction. 'num_elements_reduced' is the number of elements
-  // that contributed to the reduction. Returns the transformed reduction
-  // output, Defaults to returning 'reduce_output' unchanged.
-  virtual xla::XlaOp BuildFinalizer(xla::XlaBuilder* builder,
-                                    const xla::XlaOp& reduce_output,
-                                    int64 num_elements_reduced);
+  // computation should be added to 'builder'. Argument 'input' is the original
+  // input of the reduction; 'reduce_output' is the output of the reduction.
+  // Returns the transformed reduction output. Defaults to returning
+  // 'reduce_output' converted to the input type.
+  virtual xla::XlaOp BuildFinalizer(
+      xla::XlaBuilder* builder, const xla::XlaOp& input,
+      const xla::XlaOp& reduce_output,
+      const std::vector<int64>& dimensions_to_reduce);
 
   void Compile(XlaOpKernelContext* ctx) override;
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
index 118f2798d559f43acb7f6394a7337426164325ef..2ca2a85244b4edfe75db3d4fff6c2058adc2bf71 100644
--- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc
@@ -35,12 +35,13 @@ XlaReductionOp::XlaReductionOp(OpKernelConstruction* ctx,
       ctx, DataTypeToPrimitiveType(reduction_type_, &xla_reduction_type_));
 }
 
-// Unless BuildFinalizer is overridden the reduction has no
-// finalizer.
-xla::XlaOp XlaReductionOp::BuildFinalizer(xla::XlaBuilder* builder,
-                                          const xla::XlaOp& reduce_output,
-                                          int64 num_elements_reduced) {
-  return reduce_output;
+// The default finalizer converts the results back into the input type. This can
+// be overridden.
+xla::XlaOp XlaReductionOp::BuildFinalizer(
+    xla::XlaBuilder* /*builder*/, const xla::XlaOp& /*input*/,
+    const xla::XlaOp& reduce_output,
+    const std::vector<int64>& /*dimensions_to_reduce*/) {
+  return XlaHelpers::ConvertElementType(reduce_output, input_type(0));
 }
 
 void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
@@ -71,7 +72,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
 
   absl::InlinedVector<bool, 4> bitmap(data_shape.dims(), false);
   std::vector<int64> xla_axes;
-  int64 num_elements_reduced = 1LL;
   for (int64 i = 0; i < axes_tensor_shape.num_elements(); ++i) {
     int64 index = axes[i];
     OP_REQUIRES(ctx,
@@ -82,7 +82,6 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
     index = (index + data_shape.dims()) % data_shape.dims();
     bitmap[index] = true;
     xla_axes.push_back(index);
-    num_elements_reduced *= data_shape.dim_size(index);
   }
 
   std::vector<int64> final_shape;
@@ -118,8 +117,7 @@ void XlaReductionOp::Compile(XlaOpKernelContext* ctx) {
   xla::XlaComputation reduction_computation = r.Build().ConsumeValueOrDie();
 
   auto reduce = xla::Reduce(data, initial, reduction_computation, xla_axes);
-  auto deconverted = XlaHelpers::ConvertElementType(b, reduce, input_type(0));
-  auto finalized = BuildFinalizer(b, deconverted, num_elements_reduced);
+  auto finalized = BuildFinalizer(b, data, reduce, xla_axes);
   auto result = keep_dims_ ? xla::Reshape(finalized, final_shape) : finalized;
   ctx->SetOutput(0, result);
 }
diff --git a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
index 847704608fb32b43ffb61f87556d5231b9e39cde..54d34a38abc4948a1a08197d72e3e7f763649093 100644
--- a/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/resampler_ops.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/array4d.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -44,9 +43,6 @@ namespace {
 
 using xla::XlaOp;
 
-// TODO(b/112295522): note that sampling from image boundary is not currently
-// being handled properly.
-
 // Calculates the bilinear weight tensor, given basis ratio (px, py) of the
 // sampling position:
 //    W = [(1-px)*(1-py), px*(1-py), (1-px)*py, px*py]
@@ -70,11 +66,8 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio,
   std::vector<int64> last_two_dims_indices = {(broadcast_dims_size - 2),
                                               (broadcast_dims_size - 1)};
 
-  xla::Shape broadcast_shape =
-      xla::ShapeUtil::MakeShape(xla_type, broadcast_dims);
-
   auto broadcast_first_term =
-      xla::BroadcastInDim(first_term, broadcast_shape, last_two_dims_indices);
+      xla::BroadcastInDim(first_term, broadcast_dims, last_two_dims_indices);
 
   // Ratio is of the same dimension as warp, which is [batch, dim_0,... dim_n,
   // 2], we broadcast ratio tensor to 'broadcast_dim' by keeping the
@@ -85,7 +78,7 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio,
   ratio_broadcast_indices.erase(ratio_broadcast_indices.end() - 2);
 
   auto broadcast_ratio =
-      xla::BroadcastInDim(ratio, broadcast_shape, ratio_broadcast_indices);
+      xla::BroadcastInDim(ratio, broadcast_dims, ratio_broadcast_indices);
 
   auto first_term_subtract_weights = broadcast_first_term - broadcast_ratio;
 
@@ -96,7 +89,7 @@ XlaOp BilinearWeights(XlaOpKernelContext* ctx, XlaOp ratio,
   sign_change = xla::ConvertElementType(sign_change, xla_type);
 
   auto broadcast_sign_change =
-      xla::BroadcastInDim(sign_change, broadcast_shape, last_two_dims_indices);
+      xla::BroadcastInDim(sign_change, broadcast_dims, last_two_dims_indices);
 
   auto flipped = first_term_subtract_weights * broadcast_sign_change;
 
@@ -232,21 +225,19 @@ XlaOp CalculateGradData(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
 
   std::vector<int64> weights_with_channels_dims = reshaped_weights_dims;
   weights_with_channels_dims.push_back(data_channels);
-  auto weights_with_channels_shape =
-      xla::ShapeUtil::MakeShape(warp_type, weights_with_channels_dims);
   std::vector<int64> reshaped_weights_indices(reshaped_weights_dims.size());
   std::iota(reshaped_weights_indices.begin(), reshaped_weights_indices.end(),
             0);
 
   // The dimension is [batch, dim_0, ..., dim_n, 2, 2, data_channel].
   auto broadcast_reshaped_weights = xla::BroadcastInDim(
-      reshaped_weights, weights_with_channels_shape, reshaped_weights_indices);
+      reshaped_weights, weights_with_channels_dims, reshaped_weights_indices);
 
   std::vector<int64> grad_output_indices(warp_dims_without_last_dims.size());
   std::iota(grad_output_indices.begin(), grad_output_indices.end(), 0);
   grad_output_indices.push_back(weights_with_channels_dims.size() - 1);
   XlaOp broadcast_grad_output = xla::BroadcastInDim(
-      grad_output, weights_with_channels_shape, grad_output_indices);
+      grad_output, weights_with_channels_dims, grad_output_indices);
 
   auto grad_output_multiply_weights =
       broadcast_grad_output * broadcast_reshaped_weights;
@@ -294,13 +285,10 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
   std::vector<int64> warp_dims_without_last_dims(warp_dims.begin(),
                                                  warp_dims.end() - 1);
 
+  // With dimension [batch, dim_0, ...dim_n, 4]
   std::vector<int64> neighbor_broadcast_dims = warp_dims_without_last_dims;
   neighbor_broadcast_dims.push_back(4);
 
-  // With dimension [batch, dim_0, ...dim_n, 4]
-  auto neighbor_broadcast_shape =
-      xla::ShapeUtil::MakeShape(data_type, neighbor_broadcast_dims);
-
   // The dimension is [batch, dim_0, ... dim_n, 4, data_channels]
   auto neighbors_data = Gather2by2Neighbors(
       ctx->builder(), data, gather_indices, data_channels, warp_shape.dims());
@@ -326,7 +314,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
       xla::BroadcastInDim(
           xla::ConvertElementType(
               xla::ConstantR1<float>(ctx->builder(), {0, 0, -1, 1}), data_type),
-          neighbor_broadcast_shape, {last_warp_dim}),
+          neighbor_broadcast_dims, {last_warp_dim}),
       neighbors_data, dot_dims, /*precision_config=*/nullptr);
 
   // img_cxfy - img_fxfy
@@ -334,7 +322,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
       xla::BroadcastInDim(
           xla::ConvertElementType(
               xla::ConstantR1<float>(ctx->builder(), {-1, 1, 0, 0}), data_type),
-          neighbor_broadcast_shape, {last_warp_dim}),
+          neighbor_broadcast_dims, {last_warp_dim}),
       neighbors_data, dot_dims, /*precision_config=*/nullptr);
 
   // img_cxcy - img_cxfy
@@ -342,7 +330,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
       xla::BroadcastInDim(
           xla::ConvertElementType(
               xla::ConstantR1<float>(ctx->builder(), {0, -1, 0, 1}), data_type),
-          neighbor_broadcast_shape, {last_warp_dim}),
+          neighbor_broadcast_dims, {last_warp_dim}),
       neighbors_data, dot_dims, /*precision_config=*/nullptr);
 
   // img_fxcy - img_fxfy
@@ -350,7 +338,7 @@ XlaOp CalculateGradWarp(XlaOpKernelContext* ctx, XlaOp grad_output, XlaOp ratio,
       xla::BroadcastInDim(
           xla::ConvertElementType(
               xla::ConstantR1<float>(ctx->builder(), {-1, 0, 1, 0}), data_type),
-          neighbor_broadcast_shape, {last_warp_dim}),
+          neighbor_broadcast_dims, {last_warp_dim}),
       neighbors_data, dot_dims, /*precision_config=*/nullptr);
 
   // Slice out x and y.
@@ -421,12 +409,13 @@ class ResamplerOp : public XlaOpKernel {
     OP_REQUIRES(ctx, warp_shape.dim_size(last_warp_dim) == 2,
                 errors::InvalidArgument(
                     "the last dimension of warp must be exactly size 2."));
+    xla::PrimitiveType warp_type = ctx->input_xla_type(1);
 
     XlaOp data = ctx->Input("data");
     XlaOp warp = ctx->Input("warp");
 
     // Find the coordinates of the top left corner for the 2x2 region to be
-    // sampled from. The dimensions are (batch, dim_0, ... dim_n, 2) where the
+    // sampled from. The dimensions are [batch, dim_0, ... dim_n, 2] where the
     // last dimension of size 2 in turn is [x, y].
     XlaOp top_left = xla::ConvertElementType(warp, xla::U32);
 
@@ -457,10 +446,54 @@ class ResamplerOp : public XlaOpKernel {
     dot_dims.add_lhs_contracting_dimensions(warp_shape.dims() - 1);
     dot_dims.add_rhs_contracting_dimensions(warp_shape.dims() - 1);
 
+    // The dimension is [batch, dim_0, ...dim_n, data_channels].
     auto blended_pixels = xla::DotGeneral(weights, neighbors_data, dot_dims,
                                           /*precision_config=*/nullptr);
 
-    ctx->SetOutput(0, blended_pixels);
+    // Handle out of boundary cases by constructing a predicate mask array based
+    // on the in-bound condition, and output 0 for the blended pixel value if
+    // out-bound. The dimension is the same as top_left: [batch, dim_0,
+    // ...dim_n, 2] where the last dimension of size 2 is the [x, y] coordinate.
+
+    auto is_ge_zero = xla::Ge(warp, xla::ZerosLike(warp));
+
+    auto is_lt_image_size = xla::Lt(
+        warp,
+        xla::ConvertElementType(
+            xla::ConstantR1<float>(
+                ctx->builder(),
+                {/*width=*/static_cast<float>(data_shape.dim_size(2) - 1),
+                 /*height=*/static_cast<float>(data_shape.dim_size(1) - 1)}),
+            warp_type),
+        /*broadcast_dimensions=*/{warp_shape.dims() - 1});
+
+    auto is_in_bound_x_y = xla::And(is_ge_zero, is_lt_image_size);
+    // Reduce along last dimension. The resulting dimension is:
+    // [batch, dim_0, ...dim_n].
+    auto is_in_bound = xla::Reduce(
+        is_in_bound_x_y, xla::ConstantR0<bool>(ctx->builder(), true),
+        xla::CreateScalarAndComputation(xla::PrimitiveType::PRED,
+                                        ctx->builder()),
+        {last_warp_dim});
+
+    // Broadcast 'is_in_bound' to the same dimension as 'blended_pixels', which
+    // is the dimension of the result:
+    //  [batch, dim_0, ...dim_n, data_channels].
+    auto warp_dims = warp_shape.dim_sizes();
+    std::vector<int64> result_dims(warp_dims.begin(), warp_dims.end() - 1);
+    result_dims.push_back(data_channels);
+
+    std::vector<int64> broadcasted_dims(warp_dims.size() - 1);
+    std::iota(broadcasted_dims.begin(), broadcasted_dims.end(), 0);
+    auto broadcasted_is_in_bound =
+        xla::BroadcastInDim(is_in_bound, result_dims, broadcasted_dims);
+
+    // Set out of bound samples to zero.
+    auto zeros =
+        xla::Broadcast(xla::Zero(ctx->builder(), data_type), result_dims);
+    auto result = xla::Select(broadcasted_is_in_bound, blended_pixels, zeros);
+
+    ctx->SetOutput(0, result);
   }
 };
 
@@ -473,6 +506,8 @@ class ResamplerGradOp : public XlaOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("T", &output_dtype));
   }
 
+  // TODO(b/112295522): note that sampling from image boundary is not currently
+  // being handled properly.
   void Compile(XlaOpKernelContext* ctx) override {
     TensorShape data_shape_tf = ctx->InputShape("data");
     OP_REQUIRES(ctx, data_shape_tf.dims() == 4,
diff --git a/tensorflow/compiler/tf2xla/kernels/retval_op.cc b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
index 6970dd0a00641c9f88571561501fb3454fb3eab3..e4046c795577983bff1a8053743bf4d3a258e583 100644
--- a/tensorflow/compiler/tf2xla/kernels/retval_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/retval_op.cc
@@ -47,8 +47,7 @@ class RetvalOp : public XlaOpKernel {
       // compilation.
       OP_REQUIRES_OK(ctx, frame->SetRetval(index_, input));
     } else {
-      XlaContext& xla_context = XlaContext::Get(ctx);
-      xla_context.SetRetval(index_, ctx->InputExpression(0));
+      ctx->xla_context()->SetRetval(index_, ctx->InputExpression(0));
     }
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
index 7ff3e9163811434e8d621795c22bf8304ba7a1ed..d7b38e86cc985d608116488f9e76756a8e904f9c 100644
--- a/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/reverse_sequence_op.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
index b5fd7850bfca01868273c40cbf86188bd815be5b..4b9e1a578be2445091228953df7e5c5e82b42c28 100644
--- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc
@@ -39,8 +39,8 @@ namespace {
 
 // TODO(phawkins): implement double-sized windowed reductions in XLA and remove
 // the type constraint.
-constexpr std::array<DataType, 3> kScanOpTypes = {
-    {DT_HALF, DT_BFLOAT16, DT_FLOAT}};
+constexpr std::array<DataType, 4> kScanOpTypes = {
+    {DT_HALF, DT_BFLOAT16, DT_FLOAT, DT_INT32}};
 
 class ScanOp : public XlaOpKernel {
  public:
@@ -103,11 +103,10 @@ class ScanOp : public XlaOpKernel {
       reducer = ctx->GetOrCreateMul(dtype);
     }
     auto output = xla::ReduceWindowWithGeneralPadding(
-        XlaHelpers::ConvertElementType(builder, ctx->Input(0), dtype), init,
-        *reducer, window_dims, window_strides,
+        XlaHelpers::ConvertElementType(ctx->Input(0), dtype), init, *reducer,
+        window_dims, window_strides,
         /*base_dilations=*/{}, /*window_dilations=*/{}, padding);
-    output =
-        XlaHelpers::ConvertElementType(builder, output, ctx->input_type(0));
+    output = XlaHelpers::ConvertElementType(output, ctx->input_type(0));
 
     // In exclusive mode, we have computed an extra element containing the sum
     // of all the input elements. Slice off this extra "last" element.
diff --git a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
index a7f5a8f1698b9d02560de427d356e9e6be5caa7c..84470b230d421658e0d79dcecb175a24155f49b7 100644
--- a/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sendrecv_ops.cc
@@ -42,7 +42,7 @@ SendOp::SendOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
 }
 
 void SendOp::Compile(XlaOpKernelContext* ctx) {
-  XlaCompiler* compiler = XlaContext::Get(ctx).compiler();
+  XlaCompiler* compiler = ctx->compiler();
   xla::ChannelHandle channel;
   OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel));
   xla::Send(ctx->Input(0), channel);
@@ -73,7 +73,7 @@ RecvOp::RecvOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {
 }
 
 void RecvOp::Compile(XlaOpKernelContext* ctx) {
-  XlaCompiler* compiler = XlaContext::Get(ctx).compiler();
+  XlaCompiler* compiler = ctx->compiler();
   xla::ChannelHandle channel;
   OP_REQUIRES_OK(ctx, compiler->GetChannelHandle(tensor_name_, &channel));
   ctx->SetOutput(0, xla::Recv(ctx->builder(), shape_, channel));
diff --git a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
index 60b011ba6d9b64a89e4228ba2a213f72b67a462d..b1fa2915d59e4e5e2f2523e20e9a37898d087117 100644
--- a/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/sequence_ops.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_helpers.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
index d6bd927135c013ac1ec3f6547aef358dc2741896..20da8033536e3af3da0fcb216db45f808cacc1d5 100644
--- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc
@@ -71,7 +71,7 @@ class SoftmaxOp : public XlaOpKernel {
     auto reduce =
         xla::Reduce(converted, xla::Zero(b, xla_accumulation_type),
                     *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-    auto sum = XlaHelpers::ConvertElementType(b, reduce, type);
+    auto sum = XlaHelpers::ConvertElementType(reduce, type);
     auto softmax =
         log_
             // softmax = shifted_logits - log(sum(exp(shifted_logits)))
@@ -111,11 +111,11 @@ std::pair<xla::XlaOp, xla::XlaOp> CrossEntropyWithLogits(
   // sum_{class} (exp(logits - max_logits))
   const DataType accumulation_type = XlaHelpers::SumAccumulationType(type);
   auto converted =
-      XlaHelpers::ConvertElementType(b, exp_shifted_logits, accumulation_type);
+      XlaHelpers::ConvertElementType(exp_shifted_logits, accumulation_type);
   auto reduce =
       xla::Reduce(converted, XlaHelpers::Zero(b, accumulation_type),
                   *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-  auto sum_exp = XlaHelpers::ConvertElementType(b, reduce, type);
+  auto sum_exp = XlaHelpers::ConvertElementType(reduce, type);
 
   // log(sum(exp(logits - max_logits)))
   auto log_sum_exp = xla::Log(sum_exp);
@@ -126,11 +126,10 @@ std::pair<xla::XlaOp, xla::XlaOp> CrossEntropyWithLogits(
   // (The subtraction broadcasts along the batch dimension.)
   auto sub = xla::Sub(shifted_logits, log_sum_exp, {kBatchDim});
   auto mul = xla::Mul(xla::Neg(labels), sub);
-  auto sum =
-      xla::Reduce(XlaHelpers::ConvertElementType(b, mul, accumulation_type),
-                  XlaHelpers::Zero(b, accumulation_type),
-                  *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
-  auto loss = XlaHelpers::ConvertElementType(b, sum, type);
+  auto sum = xla::Reduce(XlaHelpers::ConvertElementType(mul, accumulation_type),
+                         XlaHelpers::Zero(b, accumulation_type),
+                         *ctx->GetOrCreateAdd(accumulation_type), {kClassDim});
+  auto loss = XlaHelpers::ConvertElementType(sum, type);
 
   // backprop: prob - labels, where
   //   prob = exp(logits - max_logits) / sum(exp(logits - max_logits))
diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
index 7b96b43ad834c28aa0283c5ef4ac516618ca5134..8e9e4daf99d3dd3b8e149e3f3e5f6c27665c0fcb 100644
--- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc
@@ -69,7 +69,7 @@ Status MaybeInitializeStack(xla::XlaBuilder* builder, XlaResource* resource,
   }
 
   TensorShape stack_shape;
-  stack_shape.AddDim(resource->tensor_array_size());
+  stack_shape.AddDim(resource->max_array_size());
   stack_shape.AppendShape(elem_shape);
 
   if (!resource->initialized()) {
@@ -97,10 +97,10 @@ class StackOp : public XlaOpKernel {
   }
 
   void Compile(XlaOpKernelContext* ctx) override {
-    int64 size;
-    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &size));
+    int64 max_size;
+    OP_REQUIRES_OK(ctx, ctx->ConstantInputAsIntScalar(0, &max_size));
     OP_REQUIRES(
-        ctx, size >= 0,
+        ctx, max_size >= 0,
         errors::InvalidArgument(
             "XLA compilation requires a fixed stack size upper bound. If "
             "you are using tf.while_loop, set the maximum_iterations parameter "
@@ -108,14 +108,9 @@ class StackOp : public XlaOpKernel {
 
     // We defer initializing the Stack resource until we see the first push.
     // Otherwise we do not know the shape of the stack elements.
-    xla::XlaOp value;
-    XlaContext& xc = XlaContext::Get(ctx);
-    XlaResource* resource;
-    string name = absl::StrCat("Stack: ", stack_name_);
-    OP_REQUIRES_OK(
-        ctx, xc.CreateResource(XlaResource::kStack, -1, std::move(name), dtype_,
-                               TensorShape(), value, /*tensor_array_size=*/size,
-                               /*tensor_array_gradients=*/{}, &resource));
+    XlaResource* resource =
+        ctx->xla_context()->AddResource(XlaResource::CreateStack(
+            /*name=*/absl::StrCat("Stack: ", stack_name_), dtype_, max_size));
     ctx->SetResourceOutput(0, resource);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
index 5db52781be473a9a1aef0adf105e3edf69ccd306..50653d7b3973b73d580cdeec5d71943b575d7cc9 100644
--- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/prng.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/core/framework/op_kernel.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
index 252967a74649f5089f0cb0a9166b1d2b6e094f27..939d7e19515a1cb41e3e23e9d1fa957ae09ecab7 100644
--- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc
@@ -61,8 +61,8 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder,
         " but op has dtype ", DataTypeString(dtype), ".");
   }
 
-  TF_RET_CHECK(resource->tensor_array_size() >= 0)
-      << resource->name() << " size " << resource->tensor_array_size();
+  TF_RET_CHECK(resource->max_array_size() >= 0)
+      << resource->name() << " size " << resource->max_array_size();
 
   if (!resource->initialized()) {
     TF_RETURN_IF_ERROR(resource->SetTypeAndShape(dtype, elem_shape));
@@ -78,7 +78,7 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder,
         XLAShapeToTensorShape(shape_or_status.ValueOrDie(), &shape));
 
     TensorShape ta_shape;
-    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AddDim(resource->max_array_size());
     ta_shape.AppendShape(elem_shape);
     if (ta_shape != shape) {
       return errors::InvalidArgument(
@@ -114,7 +114,7 @@ Status CheckTensorArrayIsInitialized(const string& op_name,
 Status GetTensorArrayShape(const XlaResource* resource,
                            xla::XlaBuilder* builder, TensorShape* shape) {
   *shape = resource->shape();
-  shape->InsertDim(0, resource->tensor_array_size());
+  shape->InsertDim(0, resource->max_array_size());
   return Status::OK();
 }
 
@@ -166,13 +166,10 @@ class TensorArrayOp : public XlaOpKernel {
       value = xla::Broadcast(zero, ta_shape.dim_sizes());
     }
 
-    XlaContext& xc = XlaContext::Get(ctx);
-    XlaResource* var;
-    string name = absl::StrCat("TensorArray: ", tensor_array_name_);
-    OP_REQUIRES_OK(
-        ctx, xc.CreateResource(XlaResource::kTensorArray, -1, std::move(name),
-                               dtype_, shape, value, /*tensor_array_size=*/size,
-                               /*tensor_array_gradients=*/{}, &var));
+    XlaResource* var =
+        ctx->xla_context()->AddResource(XlaResource::CreateTensorArray(
+            /*name=*/absl::StrCat("TensorArray: ", tensor_array_name_), dtype_,
+            shape, /*initial_value=*/value, /*max_array_size=*/size));
     ctx->SetResourceOutput(0, var);
 
     Tensor flow(DT_FLOAT, TensorShape({}));
@@ -517,14 +514,13 @@ class TensorArraySplitOp : public XlaOpKernel {
     xla::XlaOp ta = resource->value();
 
     TensorShape ta_shape;
-    ta_shape.AddDim(resource->tensor_array_size());
+    ta_shape.AddDim(resource->max_array_size());
     ta_shape.AppendShape(elem_shape);
 
-    OP_REQUIRES(
-        ctx, lengths.size() == resource->tensor_array_size(),
-        errors::InvalidArgument(
-            "TensorArray's size is not equal to the size of lengths (",
-            lengths.size(), " vs. ", resource->tensor_array_size(), ")"));
+    OP_REQUIRES(ctx, lengths.size() == resource->max_array_size(),
+                errors::InvalidArgument(
+                    "TensorArray's size is not equal to the size of lengths (",
+                    lengths.size(), " vs. ", resource->max_array_size(), ")"));
 
     const xla::XlaOp value = ctx->Input(1);
     const xla::XlaOp flow = ctx->Input(3);
@@ -562,8 +558,7 @@ class TensorArraySizeOp : public XlaOpKernel {
     XlaResource* var;
     OP_REQUIRES_OK(ctx, ctx->GetResourceInput(0, &var));
     Tensor size_tensor(DT_INT32, {});
-    size_tensor.scalar<int32>()() =
-        static_cast<int32>(var->tensor_array_size());
+    size_tensor.scalar<int32>()() = static_cast<int32>(var->max_array_size());
     ctx->SetConstantOutput(0, size_tensor);
   }
 
diff --git a/tensorflow/compiler/tf2xla/kernels/topk_op.cc b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
index 8a0c94cfae1b298bd62a3231caf39ecf9b32880e..ee3bdf3394e37c757f31724e73e95417becaa534 100644
--- a/tensorflow/compiler/tf2xla/kernels/topk_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/topk_op.cc
@@ -15,7 +15,6 @@ limitations under the License.
 
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/tf2xla/xla_op_registry.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
diff --git a/tensorflow/compiler/tf2xla/kernels/training_ops.cc b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
index 7077c2e3a546e198bdb4ff944ea531f3158810f2..960c1462ceb8c00a2d6c96564f6c985fd1caef0f 100644
--- a/tensorflow/compiler/tf2xla/kernels/training_ops.cc
+++ b/tensorflow/compiler/tf2xla/kernels/training_ops.cc
@@ -320,9 +320,8 @@ class ResourceApplyAdagradDA : public XlaOpKernel {
     xla::XlaOp lr = ctx->Input(4);
     xla::XlaOp l1 = ctx->Input(5);
     xla::XlaOp l2 = ctx->Input(6);
-    xla::XlaBuilder* const b = ctx->builder();
     xla::XlaOp global_step =
-        XlaHelpers::ConvertElementType(b, ctx->Input(7), dtype_);
+        XlaHelpers::ConvertElementType(ctx->Input(7), dtype_);
 
     accum = accum + grad;
     squared_accum = squared_accum + xla::Square(grad);
diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc
index 559414eeaa5fec75e5a9d1866baaf738c024cd15..ce007fc04a818869686b9936a1607cee42665e87 100644
--- a/tensorflow/compiler/tf2xla/kernels/while_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc
@@ -64,7 +64,7 @@ Status MakeXlaCompilerArgumentsFromInputs(
       if (!arg.initialized) {
         *has_uninitialized_vars = true;
       }
-      arg.tensor_array_size = resource->tensor_array_size();
+      arg.max_array_size = resource->max_array_size();
       for (const auto& gradient : resource->tensor_array_gradients()) {
         arg.tensor_array_gradients.insert(gradient.first);
       }
diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
index a9f88a6df2539b06ff44fb0aa49c2f2ae1389100..ad8e707e1116d01d492575986a7ab9586022f6b3 100644
--- a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
+++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc
@@ -89,13 +89,10 @@ class XlaBroadcastHelperOp : public XlaOpKernel {
               lhs_shape.DebugString(), " and ", rhs_shape.DebugString()));
       broadcast_shape[dim] = min_rank_shape->dim_size(i);
     }
-    xla::PrimitiveType type = context->input_xla_type(0);
-    xla::Shape broadcast_xla_shape =
-        xla::ShapeUtil::MakeShape(type, broadcast_shape);
     if (broadcast_lhs) {
-      lhs = xla::BroadcastInDim(lhs, broadcast_xla_shape, broadcast_dims);
+      lhs = xla::BroadcastInDim(lhs, broadcast_shape, broadcast_dims);
     } else {
-      rhs = xla::BroadcastInDim(rhs, broadcast_xla_shape, broadcast_dims);
+      rhs = xla::BroadcastInDim(rhs, broadcast_shape, broadcast_dims);
     }
     context->SetOutput(0, lhs);
     context->SetOutput(1, rhs);
diff --git a/tensorflow/compiler/tf2xla/lib/BUILD b/tensorflow/compiler/tf2xla/lib/BUILD
index 1ce3930fd1cd91f8e8dfb765b49be2dc969d1bd7..3e7a761120317ff85947559b7b2e52be9232afb7 100644
--- a/tensorflow/compiler/tf2xla/lib/BUILD
+++ b/tensorflow/compiler/tf2xla/lib/BUILD
@@ -17,20 +17,6 @@ filegroup(
 
 load("//tensorflow/compiler/xla/tests:build_defs.bzl", "xla_test")
 
-cc_library(
-    name = "batch_dot",
-    srcs = ["batch_dot.cc"],
-    hdrs = ["batch_dot.h"],
-    deps = [
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/core:lib",
-    ],
-)
-
 cc_library(
     name = "broadcast",
     srcs = ["broadcast.cc"],
@@ -52,8 +38,6 @@ cc_library(
     srcs = ["cholesky.cc"],
     hdrs = ["cholesky.h"],
     deps = [
-        ":batch_dot",
-        ":triangular_solve",
         ":util",
         ":while_loop",
         "//tensorflow/compiler/xla:literal",
@@ -63,6 +47,9 @@ cc_library(
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/compiler/xla/client/lib:triangular_solve",
         "//tensorflow/core:lib",
     ],
 )
@@ -87,7 +74,6 @@ cc_library(
     srcs = ["qr.cc"],
     hdrs = ["qr.h"],
     deps = [
-        ":batch_dot",
         ":util",
         ":while_loop",
         "//tensorflow/compiler/xla:literal_util",
@@ -99,7 +85,8 @@ cc_library(
         "//tensorflow/compiler/xla/client/lib:arithmetic",
         "//tensorflow/compiler/xla/client/lib:constants",
         "//tensorflow/compiler/xla/client/lib:math",
-        "//tensorflow/compiler/xla/client/lib:numeric",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
         "//tensorflow/core:lib",
     ],
 )
@@ -124,51 +111,6 @@ cc_library(
     ],
 )
 
-cc_library(
-    name = "triangular_solve",
-    srcs = ["triangular_solve.cc"],
-    hdrs = ["triangular_solve.h"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:status_macros",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:util",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/client:xla_computation",
-        "//tensorflow/compiler/xla/client/lib:constants",
-        "//tensorflow/compiler/xla/client/lib:numeric",
-        "//tensorflow/core:lib",
-    ],
-)
-
-xla_test(
-    name = "triangular_solve_test",
-    srcs = ["triangular_solve_test.cc"],
-    tags = ["noasan"],  # sometimes times out, http://b/78650012
-    deps = [
-        ":triangular_solve",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/client:xla_builder",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "util",
     srcs = ["util.cc"],
@@ -187,29 +129,6 @@ cc_library(
     ],
 )
 
-xla_test(
-    name = "util_test",
-    srcs = ["util_test.cc"],
-    deps = [
-        ":batch_dot",
-        ":util",
-        "//tensorflow/compiler/xla:array2d",
-        "//tensorflow/compiler/xla:literal",
-        "//tensorflow/compiler/xla:shape_util",
-        "//tensorflow/compiler/xla:statusor",
-        "//tensorflow/compiler/xla:test",
-        "//tensorflow/compiler/xla:types",
-        "//tensorflow/compiler/xla:xla_data_proto",
-        "//tensorflow/compiler/xla/client:global_data",
-        "//tensorflow/compiler/xla/client:local_client",
-        "//tensorflow/compiler/xla/tests:client_library_test_base",
-        "//tensorflow/compiler/xla/tests:literal_test_util",
-        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:test",
-    ],
-)
-
 cc_library(
     name = "while_loop",
     srcs = ["while_loop.cc"],
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.cc b/tensorflow/compiler/tf2xla/lib/batch_dot.cc
deleted file mode 100644
index 5400e8834cb9807f6dd71abe7789b2672e29e905..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-
-#include <memory>
-#include <vector>
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/shape_util.h"
-#include "tensorflow/compiler/xla/status_macros.h"
-#include "tensorflow/compiler/xla/statusor.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-
-xla::XlaOp BatchDot(xla::XlaOp x, xla::XlaOp y, bool transpose_x,
-                    bool transpose_y, bool conjugate_x, bool conjugate_y,
-                    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape x_shape, builder->GetShape(x));
-    TF_ASSIGN_OR_RETURN(xla::Shape y_shape, builder->GetShape(y));
-
-    // Check that both tensors have the same number of dimensions. There must be
-    // at least two (the batch dimensions can be empty).
-    if (xla::ShapeUtil::Rank(x_shape) != xla::ShapeUtil::Rank(y_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot have different ranks: ",
-          xla::ShapeUtil::HumanString(x_shape), " vs. ",
-          xla::ShapeUtil::HumanString(y_shape));
-    }
-    const int ndims = xla::ShapeUtil::Rank(x_shape);
-    if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to BatchedDot must have rank >= 2: ", ndims);
-    }
-
-    // The batch dimensions must be equal and the matrix dimensions must be
-    // valid.
-    std::vector<int64> batch_dimension_numbers;
-    for (int i = 0; i < ndims - 2; ++i) {
-      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
-        return errors::InvalidArgument(
-            "Dimension ", i, " of inputs to BatchedDot must be equal: ",
-            xla::ShapeUtil::HumanString(x_shape), " vs ",
-            xla::ShapeUtil::HumanString(y_shape));
-      }
-      batch_dimension_numbers.push_back(i);
-    }
-
-    int x_inner_dim = transpose_x ? (ndims - 2) : (ndims - 1);
-    int y_inner_dim = transpose_y ? (ndims - 1) : (ndims - 2);
-    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
-      return errors::InvalidArgument(
-          "Dimensions ", x_inner_dim, " and ", y_inner_dim,
-          " of arguments to BatchedDot must be equal: ",
-          xla::ShapeUtil::HumanString(x_shape), " transpose: ", transpose_x,
-          " vs. ", xla::ShapeUtil::HumanString(y_shape),
-          " transpose: ", transpose_y);
-    }
-
-    // Check for zero lhs/rhs dim size.
-    if (xla::ShapeUtil::IsZeroElementArray(x_shape) ||
-        xla::ShapeUtil::IsZeroElementArray(y_shape)) {
-      std::vector<int64> dimensions(batch_dimension_numbers.size());
-      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
-        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
-      }
-      int x_outer_dim = transpose_x ? (ndims - 1) : (ndims - 2);
-      int y_outer_dim = transpose_y ? (ndims - 2) : (ndims - 1);
-      dimensions.push_back(x_shape.dimensions(x_outer_dim));
-      dimensions.push_back(y_shape.dimensions(y_outer_dim));
-      return xla::Broadcast(
-          xla::ConstantLiteral(builder,
-                               xla::LiteralUtil::Zero(x_shape.element_type())),
-          dimensions);
-    }
-
-    if (x_shape.element_type() == xla::C64 && conjugate_x) {
-      x = xla::Conj(x);
-    }
-    if (y_shape.element_type() == xla::C64 && conjugate_y) {
-      y = xla::Conj(y);
-    }
-
-    xla::PrecisionConfig precision_proto;
-    precision_proto.add_operand_precision(precision);
-    precision_proto.add_operand_precision(precision);
-
-    xla::DotDimensionNumbers dot_dnums;
-    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
-    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
-    for (auto batch_dimension_number : batch_dimension_numbers) {
-      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
-      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
-    }
-
-    return xla::DotGeneral(x, y, dot_dnums, &precision_proto);
-  });
-}
-
-}  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/batch_dot.h b/tensorflow/compiler/tf2xla/lib/batch_dot.h
deleted file mode 100644
index 6edd63a4d3b66c21aa4cce8c9f36eef0dc363cd8..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/tf2xla/lib/batch_dot.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
-
-#include "tensorflow/compiler/xla/client/xla_builder.h"
-#include "tensorflow/compiler/xla/xla_data.pb.h"
-
-namespace tensorflow {
-
-// Multiplies slices of two tensors in batches.
-
-// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
-// viewed as an element of a batch), and arranges the individual results
-// in a single output tensor of the same batch size. Each of the
-// individual slices can optionally be transposed before multiplication by
-// setting the `transpose_x` or `transpose_y` flag to `true`. Similarly, each
-// can be elementwise-complex-conjugated by setting the `conjugate_x` or
-// `conjugate_y` flag to `true`. To apply a Hermitian adjoint to `x`, set both
-// `transpose_x` and `conjugate_x` to `true`, and analogously for `y`.
-//
-// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
-// and `[..., r_y, c_y]`.
-//
-// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
-//
-//     r_o = c_x if transpose_x else r_x
-//     c_o = r_y if transpose_y else c_y
-//
-// It is computed as:
-//
-//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
-xla::XlaOp BatchDot(
-    xla::XlaOp x, xla::XlaOp y, bool transpose_x = false,
-    bool transpose_y = false, bool conjugate_x = false,
-    bool conjugate_y = false,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
-
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_BATCH_DOT_H_
diff --git a/tensorflow/compiler/tf2xla/lib/broadcast.cc b/tensorflow/compiler/tf2xla/lib/broadcast.cc
index 3e402ef855cd7c114332d84032bc869232404fc8..be31f116686a2e302ece730e9d03312a45888a61 100644
--- a/tensorflow/compiler/tf2xla/lib/broadcast.cc
+++ b/tensorflow/compiler/tf2xla/lib/broadcast.cc
@@ -80,10 +80,8 @@ xla::StatusOr<xla::XlaOp> BroadcastTo(xla::XlaOp input,
     broadcast_dim = broadcast_shape_size - broadcast_dim - 1;
   }
   absl::c_reverse(broadcast_shape);
-  xla::XlaOp output = xla::BroadcastInDim(
-      input,
-      xla::ShapeUtil::MakeShape(input_shape.element_type(), broadcast_shape),
-      broadcast_dims);
+  xla::XlaOp output =
+      xla::BroadcastInDim(input, broadcast_shape, broadcast_dims);
   if (broadcast_shape != output_dims) {
     output = xla::Reshape(output, output_dims);
   }
diff --git a/tensorflow/compiler/tf2xla/lib/cholesky.cc b/tensorflow/compiler/tf2xla/lib/cholesky.cc
index ab3d0a566839343828d176d9a46672824e425613..550ab5b05693b79e60e49577309328ac6846d3f9 100644
--- a/tensorflow/compiler/tf2xla/lib/cholesky.cc
+++ b/tensorflow/compiler/tf2xla/lib/cholesky.cc
@@ -18,11 +18,12 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -101,10 +102,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // a[..., i, i]
       auto a_ii = DynamicSliceInMinorDims(body_a, {i, i}, {1, 1});
       // np.dot(row, np.swapaxes(row, -1, -2))
-      auto diag_dot = BatchDot(row, row,
-                               /*transpose_x=*/false,
-                               /*transpose_y=*/true, /*conjugate_x=*/false,
-                               /*conjugate_y=*/false, precision);
+      auto diag_dot = BatchDot(row, TransposeInMinorDims(row), precision);
       // l[..., i, i] = np.sqrt(a[..., i, i] - np.dot(row,
       //                                              np.swapaxes(row, -1, -2)))
       auto l_ii =
@@ -122,10 +120,7 @@ xla::XlaOp CholeskyUnblocked(xla::XlaOp a,
       // The columns in [i, n] are zeroed out in `row`, so we just have to
       // zero out rows above i+1 after the BatchDot. np.dot(l[..., :, :i],
       // r.T)
-      auto dot = BatchDot(body_l, row,
-                          /*transpose_x=*/false,
-                          /*transpose_y=*/true, /*conjugate_x=*/false,
-                          /*conjugate_y=*/false, precision);
+      auto dot = BatchDot(body_l, TransposeInMinorDims(row), precision);
       // np.dot(l[..., i+1:, :i], r.T)
       auto dot_ip1 =
           xla::Select(xla::Le(mask_range_col, i), mask_zeros_col, dot);
@@ -185,9 +180,7 @@ xla::XlaOp Cholesky(xla::XlaOp a, int64 block_size,
         // a[i:, i:i+k] -= np.dot(l[i:, :i], np.transpose(l[i:i+k, :i]))
         auto lhs = SliceInMinorDims(l, {i, 0}, {n, i});
         auto rhs = SliceInMinorDims(l, {i, 0}, {i + k, i});
-        auto delta = BatchDot(lhs, rhs, /*transpose_x=*/false,
-                              /*transpose_y=*/true, /*conjugate_x=*/false,
-                              /*conjugate_y=*/false, precision);
+        auto delta = BatchDot(lhs, TransposeInMinorDims(rhs), precision);
         auto before = SliceInMinorDims(a, {i, i}, {n, i + k});
         a = UpdateSliceInMinorDims(a, before - delta, {i, i});
       }
diff --git a/tensorflow/compiler/tf2xla/lib/qr.cc b/tensorflow/compiler/tf2xla/lib/qr.cc
index 6b3f2b6e065b5c99e2d0248237369ecc30188aa5..d6007748609fdd161cb89692a167eb7ed12fe00c 100644
--- a/tensorflow/compiler/tf2xla/lib/qr.cc
+++ b/tensorflow/compiler/tf2xla/lib/qr.cc
@@ -18,13 +18,13 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
 #include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/tf2xla/lib/while_loop.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -191,12 +191,8 @@ xla::StatusOr<QRBlockResult> QRBlock(
     auto v_broadcast = xla::Reshape(v, shape);
     // a[:, :] -= tau * np.dot(v[:, np.newaxis],
     //                          np.dot(v[np.newaxis, :], a[:, :]))
-    auto vva =
-        BatchDot(v_broadcast, a, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    vva =
-        BatchDot(v_broadcast, vva, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto vva = BatchDot(v_broadcast, a, precision);
+    vva = BatchDot(TransposeInMinorDims(v_broadcast), vva, precision);
     a = a - xla::Mul(tau, vva,
                      /*broadcast_dimensions=*/batch_dim_indices);
 
@@ -278,12 +274,9 @@ xla::StatusOr<xla::XlaOp> ComputeWYRepresentation(
     auto beta = DynamicSliceInMinorDims(taus, {j}, {1});
 
     // yv has shape [..., n, 1]
-    auto yv = BatchDot(y, v, /*transpose_x=*/true, /*transpose_y=*/false,
-                       /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto yv = BatchDot(TransposeInMinorDims(y), v, precision);
     // wyv has shape [..., m, 1]
-    auto wyv =
-        BatchDot(w, yv, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto wyv = BatchDot(w, yv, precision);
 
     auto z = xla::Mul(
         -beta, v + wyv,
@@ -375,23 +368,15 @@ xla::StatusOr<QRDecompositionResult> QRDecomposition(
 
     // a[i:, i+k:] += np.dot(Y, np.dot(W.T, a[i:, i+k:]))
     auto a_panel = SliceInMinorDims(a, {i, i + k}, {m, n});
-    auto a_update =
-        BatchDot(w, a_panel, /*transpose_x=*/true, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    a_update =
-        BatchDot(y, a_update, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+    auto a_update = BatchDot(TransposeInMinorDims(w), a_panel, precision);
+    a_update = BatchDot(y, a_update, precision);
     a_panel = a_panel + a_update;
     a = UpdateSliceInMinorDims(a, a_panel, {i, i + k});
 
     // q[:, i:] += np.dot(np.dot(q[:, i:], W), Y.T))
     auto q_panel = SliceInMinorDims(q, {0, i}, {m, m});
-    auto q_update =
-        BatchDot(q_panel, w, /*transpose_x=*/false, /*transpose_y=*/false,
-                 /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
-    q_update = BatchDot(q_update, y, /*transpose_x=*/false,
-                        /*transpose_y=*/true, /*conjugate_x=*/false,
-                        /*conjugate_y=*/false, precision);
+    auto q_update = BatchDot(q_panel, w, precision);
+    q_update = BatchDot(q_update, TransposeInMinorDims(y), precision);
     q_panel = q_panel + q_update;
     q = UpdateSliceInMinorDims(q, q_panel, {0, i});
   }
diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc
index 804671fbc75b0a5a6e04b204822b6f084013cd8b..c0bd172d17c192435ba8ee196f9def0491c0bf5c 100644
--- a/tensorflow/compiler/tf2xla/lib/util.cc
+++ b/tensorflow/compiler/tf2xla/lib/util.cc
@@ -113,36 +113,6 @@ xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
   return xla::ConstantLiteral(builder, literal);
 }
 
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_RET_CHECK(start.size() == end.size());
-    int64 n_minor_dims = start.size();
-
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - n_minor_dims);
-
-    // Prepends 0s in the major dim
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + major_dims.size());
-
-    // Prepends the shape of the major dims.
-    std::vector<int64> padded_end(n_dims);
-    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
-    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
-
-    std::vector<int64> strides(n_dims, 1);
-    return xla::Slice(x, padded_start, padded_end, strides);
-  });
-}
 
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys) {
@@ -152,100 +122,4 @@ std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
   return output;
 }
 
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    int64 n_minor_dims = starts.size();
-    TF_RET_CHECK(n_minor_dims == sizes.size());
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    auto major_dims = xla::AsInt64Slice(shape.dimensions())
-                          .subspan(
-                              /*pos=*/0,
-                              /*len=*/n_dims - sizes.size());
-    auto padded_starts = PrependZerosInMajorDims(x, starts);
-    auto padded_sizes = ConcatVectors(major_dims, sizes);
-    return xla::DynamicSlice(x, padded_starts, padded_sizes);
-  });
-}
-
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
-    std::vector<int32> start_as_int32(start.begin(), start.end());
-    auto start_constant = xla::ConstantR1<int32>(builder, start_as_int32);
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_ASSIGN_OR_RETURN(xla::Shape start_constant_shape,
-                        builder->GetShape(start_constant));
-    const int64 start_length =
-        xla::ShapeUtil::GetDimension(start_constant_shape, -1);
-    TF_RET_CHECK(start_length == n_dims);
-    return xla::DynamicUpdateSlice(x, update, start_constant);
-  });
-}
-
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    const int64 n_minor_dims = start.size();
-    TF_RET_CHECK(n_minor_dims <= n_dims);
-    std::vector<int64> padded_start(n_dims, 0);
-    std::copy(start.begin(), start.end(),
-              padded_start.begin() + (n_dims - n_minor_dims));
-    return UpdateSlice(x, update, padded_start);
-  });
-}
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts) {
-  auto padded_starts = PrependZerosInMajorDims(x, starts);
-  return xla::DynamicUpdateSlice(x, update, padded_starts);
-}
-
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    auto zero = xla::Reshape(xla::ConstantR0<int32>(builder, 0), {1});
-    std::vector<xla::XlaOp> padded_starts(n_dims, zero);
-    for (int i = 0; i < starts.size(); ++i) {
-      padded_starts[n_dims - starts.size() + i] = xla::Reshape(starts[i], {1});
-    }
-    return xla::ConcatInDim(builder, padded_starts, 0);
-  });
-}
-
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    const int64 n_dims = xla::ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    std::vector<int64> permutation(n_dims);
-    std::iota(permutation.begin(), permutation.end(), 0);
-    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
-    return xla::Transpose(x, permutation);
-  });
-}
-
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate) {
-  xla::XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(x));
-    auto perform_conj = shape.element_type() == xla::C64 && conjugate;
-    return perform_conj ? xla::Conj(x) : x;
-  });
-}
-
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/lib/util.h b/tensorflow/compiler/tf2xla/lib/util.h
index 80e9e5b002d49581209e608b98606e02709c5876..aec8061cb4322b8d315b6cdc80c7fff1e0cb4cb1 100644
--- a/tensorflow/compiler/tf2xla/lib/util.h
+++ b/tensorflow/compiler/tf2xla/lib/util.h
@@ -38,44 +38,10 @@ xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
 xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type,
                           int64 value);
 
-// Builds a vector of zeros of length rank(x) with the last values being
-// those in `starts`.
-xla::XlaOp PrependZerosInMajorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts);
-
-// Performs a slice in the minor dimensions of a Tensor.
-xla::XlaOp SliceInMinorDims(xla::XlaOp x, absl::Span<const int64> start,
-                            absl::Span<const int64> end);
-
 // Returns the concatenation of `xs` and `ys`.
 std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
                                  absl::Span<const int64> ys);
 
-// Performs a dynamic slice in the minor dimensions of a Tensor.
-xla::XlaOp DynamicSliceInMinorDims(xla::XlaOp x,
-                                   absl::Span<const xla::XlaOp> starts,
-                                   absl::Span<const int64> sizes);
-
-// Updates a slice of 'x', i.e.,
-// x[start[0], ..., start[n]] = update
-xla::XlaOp UpdateSlice(xla::XlaOp x, xla::XlaOp update,
-                       absl::Span<const int64> start);
-
-// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
-// x[..., start[0], ..., start[n]] = update
-xla::XlaOp UpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                  absl::Span<const int64> start);
-
-xla::XlaOp DynamicUpdateSliceInMinorDims(xla::XlaOp x, xla::XlaOp update,
-                                         absl::Span<const xla::XlaOp> starts);
-
-// Transposes a stack of matrices `x` by swapping the last two dimensions.
-xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
-
-// Applies a complex conjugation operation if `a` is complex and `conjugate_a`
-// is true, otherwise returns its argument.
-xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
-
 }  // namespace tensorflow
 
 #endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_UTIL_H_
diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD
index c9f486edc8d30954619db0967c988fe8e26938de..fef97b98c376d9df8bbfd9cb6651216895e46bf4 100644
--- a/tensorflow/compiler/tf2xla/python/BUILD
+++ b/tensorflow/compiler/tf2xla/python/BUILD
@@ -1,11 +1,13 @@
 licenses(["notice"])  # Apache 2.0
 
+package_group(
+    name = "friends",
+    includes = ["//tensorflow:internal"],
+)
+
 package(
     default_visibility = [
-        "//learning/deepmind/public/wavenet/python:__subpackages__",
-        "//learning/deepmind/research/alphastar:__subpackages__",
-        "//learning/tfx:__subpackages__",
-        "//tensorflow:internal",
+        ":friends",
     ],
 )
 
diff --git a/tensorflow/compiler/tf2xla/shape_util.h b/tensorflow/compiler/tf2xla/shape_util.h
index f7e34a5b40c2f9244c029ed325a76322b8cf54dd..0b231ea8e7a2d8e303e91911e2e0a36fc83e78b4 100644
--- a/tensorflow/compiler/tf2xla/shape_util.h
+++ b/tensorflow/compiler/tf2xla/shape_util.h
@@ -18,6 +18,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 #define TENSORFLOW_COMPILER_TF2XLA_SHAPE_UTIL_H_
 
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/framework/tensor_shape.h"
 #include "tensorflow/core/framework/types.pb.h"
diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
index 425e769346ffcbc548495d93cb7adc779f860110..c7341cf8b9e8d7a06fd304ae8766420d20f0c16e 100644
--- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h
@@ -26,7 +26,7 @@ limitations under the License.
 // Forward-declare, rather than include, to reduce code size for users that
 // never use this functionality.
 namespace xla {
-class ProgramShape;
+class ProgramShapeProto;
 class HloProfilePrinterData;
 }
 
@@ -84,7 +84,7 @@ class XlaCompiledCpuFunction {
     void set_result_names(const char** result_names) {
       result_names_ = result_names;
     }
-    void set_program_shape(const xla::ProgramShape* program_shape) {
+    void set_program_shape(const xla::ProgramShapeProto* program_shape) {
       program_shape_ = program_shape;
     }
     const xla::HloProfilePrinterData* hlo_profile_printer_data() const {
@@ -122,7 +122,7 @@ class XlaCompiledCpuFunction {
     const char** result_names_ = nullptr;
 
     // [Optional] Arg and result shapes.
-    const xla::ProgramShape* program_shape_ = nullptr;
+    const xla::ProgramShapeProto* program_shape_ = nullptr;
 
     // [Optional] Profile printer data.  Null if profiling is disabled.
     const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
@@ -206,8 +206,14 @@ class XlaCompiledCpuFunction {
   //
   // Aliasing of argument and result buffers is not allowed, and results in
   // undefined behavior.
-  void set_arg_data(size_t index, void* data) {
-    buffer_table_[arg_index_table_[index]] = data;
+  void set_arg_data(size_t index, const void* data) {
+    // The const_cast is safe because the generated code does not write to arg
+    // buffers.
+    //
+    // buffer_table_ contains pointers to buffers that _will_ be written to by
+    // generated code so it would be misleading to make buffer_table_ a `const
+    // void**`.
+    buffer_table_[arg_index_table_[index]] = const_cast<void*>(data);
   }
 
   // ------------------------------
@@ -264,7 +270,7 @@ class XlaCompiledCpuFunction {
 
   // Returns the shape of the args and results. May return nullptr if the
   // program shape isn't available.
-  const xla::ProgramShape* ProgramShape() const { return program_shape_; }
+  const xla::ProgramShapeProto* ProgramShape() const { return program_shape_; }
 
   bool hlo_profiling_enabled() const {
     return hlo_profile_printer_data_ != nullptr;
@@ -287,11 +293,6 @@ class XlaCompiledCpuFunction {
 
   // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]]
   // for XLA generated code to be able to find it.
-  //
-  // For now we need to keep around the args_ array because there is code that
-  // depends on args() returning a void**.  However, in the future we may remove
-  // args_ in favor of using buffer_table_ as the sole storage for the
-  // arguments.
   const int32* const arg_index_table_;
 
   // The number of incoming arguments.
@@ -310,7 +311,7 @@ class XlaCompiledCpuFunction {
   // Optional metadata.
   const char** arg_names_ = nullptr;
   const char** result_names_ = nullptr;
-  const xla::ProgramShape* program_shape_ = nullptr;
+  const xla::ProgramShapeProto* program_shape_ = nullptr;
   const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr;
 };
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc
index a08d030ce710bdb97910c01a64f80199fc10d649..ee461a3c07d4db514c7697e005a9371be4b54dd0 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler.cc
@@ -158,7 +158,8 @@ Status BuildComputation(
     xla::XlaBuilder* builder, xla::XlaComputation* computation,
     int* num_computation_outputs, int* num_nonconst_outputs,
     std::vector<XlaCompiler::OutputDescription>* outputs,
-    std::vector<XlaCompiler::ResourceUpdate>* resource_updates) {
+    std::vector<XlaCompiler::ResourceUpdate>* resource_updates,
+    xla::Shape* output_shape) {
   // Attach a common operator name as metadata. This has no semantic effect — it
   // merely makes the HLO graph more readable when visualized via TensorBoard,
   // since TensorBoard forms groups out of operators with similar names.
@@ -176,6 +177,10 @@ Status BuildComputation(
 
   std::vector<xla::XlaOp> elems;
   elems.reserve(retvals.size());
+
+  // Keeps track of which retvals have layout to update. The first element is
+  // the output index, second element is the new layout.
+  std::vector<std::pair<int64, xla::Layout>> retval_to_update_layout;
   for (int i = 0; i < retvals.size(); ++i) {
     XlaCompiler::OutputDescription& output = (*outputs)[i];
     const XlaExpression& retval = retvals[i];
@@ -202,10 +207,12 @@ Status BuildComputation(
           TF_ASSIGN_OR_RETURN(xla::Shape shape, shape_representation_fn(
                                                     output.shape, output.type));
           value = xla::Reshape(value, xla::AsInt64Slice(shape.dimensions()));
+          retval_to_update_layout.emplace_back(elems.size(), shape.layout());
         } else if (it != retval_cores.end()) {
           // Apply the sharding to the output, if there is a core assignment.
           value = identity_op(value);
         }
+
         elems.push_back(value);
         break;
       }
@@ -297,6 +304,21 @@ Status BuildComputation(
     return computation_status.status();
   }
   *computation = computation_status.ConsumeValueOrDie();
+
+  TF_ASSIGN_OR_RETURN(const auto& program_shape,
+                      computation->GetProgramShape());
+  *output_shape = program_shape.result();
+  // Update the output layout to the layout of retval.
+  for (auto& update : retval_to_update_layout) {
+    if (!always_return_tuple && elems.size() == 1) {
+      *output_shape->mutable_layout() = update.second;
+      continue;
+    }
+
+    xla::Shape* output_sub_shape =
+        xla::ShapeUtil::GetMutableSubshape(output_shape, {update.first});
+    *output_sub_shape->mutable_layout() = update.second;
+  }
   return Status::OK();
 }
 
@@ -304,10 +326,10 @@ Status BuildComputation(
 
 bool XlaCompiler::Argument::operator==(
     const XlaCompiler::Argument& other) const {
-  if (std::tie(kind, resource_kind, type, name, initialized, tensor_array_size,
+  if (std::tie(kind, resource_kind, type, name, initialized, max_array_size,
                tensor_array_gradients) !=
       std::tie(other.kind, other.resource_kind, other.type, other.name,
-               other.initialized, other.tensor_array_size,
+               other.initialized, other.max_array_size,
                other.tensor_array_gradients)) {
     return false;
   }
@@ -337,8 +359,8 @@ string XlaCompiler::Argument::HumanString() const {
       string output = absl::StrCat("kind=resource", common, " resource_kind=",
                                    XlaResource::KindToString(resource_kind),
                                    " initialized=", initialized);
-      if (tensor_array_size >= 0) {
-        absl::StrAppend(&output, " tensor_array_size=", tensor_array_size);
+      if (max_array_size >= 0) {
+        absl::StrAppend(&output, " max_array_size=", max_array_size);
       }
       if (!tensor_array_gradients.empty()) {
         absl::StrAppend(&output, " tensor_array_gradients=",
@@ -358,7 +380,7 @@ XlaCompiler::XlaCompiler(XlaCompiler::Options options)
       initialization_status_(Status::OK()),
       next_step_id_(1),
       device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)),
-      device_mgr_({device_}) {
+      device_mgr_(absl::WrapUnique(device_)) {
   CHECK(!options_.device_type.type_string().empty());
   if (options_.populate_resource_manager) {
     initialization_status_ =
@@ -545,12 +567,12 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
           return Status::OK();
         }
         case XlaResource::kTensorArray: {
-          if (arg.tensor_array_size < 0) {
+          if (arg.max_array_size < 0) {
             return errors::InvalidArgument(
-                "Negative tensor_array_size in XLAShapeForArgument");
+                "Negative max_array_size in XLAShapeForArgument");
           }
           TensorShape shape;
-          shape.AddDim(arg.tensor_array_size);
+          shape.AddDim(arg.max_array_size);
           shape.AppendShape(arg.shape);
           TF_RETURN_IF_ERROR(TensorShapeToXLAShape(arg.type, shape, xla_shape));
 
@@ -562,12 +584,12 @@ Status XlaCompiler::XLAShapeForArgument(const XlaCompiler::Argument& arg,
           return Status::OK();
         }
         case XlaResource::kStack: {
-          if (arg.tensor_array_size < 0) {
+          if (arg.max_array_size < 0) {
             return errors::InvalidArgument(
-                "Negative tensor_array_size in XLAShapeForArgument");
+                "Negative max_array_size in XLAShapeForArgument");
           }
           TensorShape shape;
-          shape.AddDim(arg.tensor_array_size);
+          shape.AddDim(arg.max_array_size);
           shape.AppendShape(arg.shape);
           xla::Shape buffer_shape;
           TF_RETURN_IF_ERROR(
@@ -613,21 +635,23 @@ Status XlaCompiler::BuildArguments(
     const XlaCompiler::Argument& arg = args[i];
     XlaExpression& arg_expression = (*arg_expressions)[i];
     switch (arg.kind) {
-      case XlaCompiler::Argument::kResource:
+      case XlaCompiler::Argument::kResource: {
         TF_RET_CHECK(arg.resource_kind != XlaResource::kInvalid);
         // TODO(phawkins): this code assumes that resource arguments do not
         // alias.
-        XlaResource* resource;
-        TF_RETURN_IF_ERROR(context->CreateResource(
-            arg.resource_kind, i, arg.name, arg.type, arg.shape, xla::XlaOp(),
-            /*tensor_array_size=*/arg.tensor_array_size,
-            /*tensor_array_gradients=*/arg.tensor_array_gradients, &resource));
+        XlaResource* resource =
+            context->AddResource(absl::make_unique<XlaResource>(
+                arg.resource_kind, i, arg.name, arg.type, arg.shape,
+                xla::XlaOp(),
+                /*max_array_size=*/arg.max_array_size,
+                /*tensor_array_gradients=*/arg.tensor_array_gradients,
+                /*tensor_array_multiple_writes_aggregate=*/true));
         arg_expression = XlaExpression::Resource(resource);
         if (arg.initialized) {
           input_mapping->push_back(i);
         }
-
         break;
+      }
       case XlaCompiler::Argument::kParameter:
       case XlaCompiler::Argument::kToken: {
         input_mapping->push_back(i);
@@ -901,9 +925,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
                                    options_.device_type, name));
 
   xla::XlaBuilder builder(name);
-  XlaContext* context =
-      new XlaContext(this, &builder, options_.allow_cpu_custom_calls,
-                     &options_.shape_representation_fn);
+  XlaContext* context = new XlaContext(this, &builder);
   core::ScopedUnref context_unref(context);
 
   std::vector<XlaCompiler::Argument> real_args(args.begin(), args.end());
@@ -988,23 +1010,12 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options,
       options.return_updated_values_for_all_resources,
       options.always_return_tuple, &builder, result->computation.get(),
       &num_computation_outputs, &num_nonconst_outputs, &result->outputs,
-      &result->resource_updates));
+      &result->resource_updates, &result->xla_output_shape));
 
   VLOG(2) << "Outputs: total: " << context->retvals().size()
           << " nonconstant: " << num_nonconst_outputs;
-
-  // Compute the XLA output shape, if there is a computation with non-constant
-  // outputs.
-  TF_ASSIGN_OR_RETURN(std::unique_ptr<xla::ProgramShape> computation_shape,
-                      client()->GetComputationShape(*result->computation));
-
-  result->xla_output_shape.Swap(computation_shape->mutable_result());
   VLOG(2) << "XLA output shape: "
-          << xla::ShapeUtil::HumanString(result->xla_output_shape);
-
-  // Tensorflow expects a major-to-minor order of results.
-  xla::LayoutUtil::SetToDefaultLayout(&result->xla_output_shape);
-
+          << xla::ShapeUtil::HumanStringWithLayout(result->xla_output_shape);
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/tf2xla/xla_compiler.h b/tensorflow/compiler/tf2xla/xla_compiler.h
index 63426124686e1b92a3534b7e365b8282008b8455..0d801b73a8c2651305328384377751254ecaa41d 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler.h
+++ b/tensorflow/compiler/tf2xla/xla_compiler.h
@@ -150,7 +150,7 @@ class XlaCompiler {
 
     // For a TensorArray or Stack resource, what is the array's declared size?
     // (Used for lazy initialization.)
-    int64 tensor_array_size = -1;
+    int64 max_array_size = -1;
 
     // TensorArray resource parameters are passed as (array, gradient array 0,
     // ..., gradient array k), where the gradient arrays are in the same order
diff --git a/tensorflow/compiler/tf2xla/xla_compiler_test.cc b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
index aaee208f6349d56f685481977cea55c8dd5e7938..fe2a5f5b0c9ea6b5f2bb71df836fdcabf9a0cf23 100644
--- a/tensorflow/compiler/tf2xla/xla_compiler_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_compiler_test.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/cc/ops/function_ops.h"
 #include "tensorflow/cc/ops/resource_variable_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/side_effect_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
@@ -649,7 +650,7 @@ TEST_F(XlaCompilerTest, CanPassTensorArraysToAndFromComputation) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad2"};
 
   // Compiles the graph.
@@ -708,7 +709,7 @@ TEST_F(XlaCompilerTest, UnwrittenTensorArrayGradientsAreNotComputationOutputs) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
   // Compiles the graph.
@@ -740,7 +741,7 @@ TEST_F(XlaCompilerTest, NewTensorArrayGradientsAreComputationOutputs) {
   args[0].initialized = true;
   args[0].type = DT_INT32;
   args[0].shape = TensorShape({});
-  args[0].tensor_array_size = 2;
+  args[0].max_array_size = 2;
   args[0].tensor_array_gradients = {"grad1"};
 
   // Compiles the graph.
@@ -910,6 +911,82 @@ TEST_F(XlaCompilerTest, Variables) {
   RunAndCheckVariablesComputation(client_, result);
 }
 
+TEST_F(XlaCompilerTest, ResultLayoutSingle) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET"), a, 0);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  auto compile_options = XlaCompiler::CompileOptions();
+  compile_options.always_return_tuple = false;
+  TF_ASSERT_OK(compiler.CompileGraph(compile_options, "id", std::move(graph),
+                                     args, &result));
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1})));
+}
+
+TEST_F(XlaCompilerTest, ResultLayoutMultiple) {
+  Scope scope = Scope::NewRootScope().ExitOnError();
+  auto a = ops::_Arg(scope.WithOpName("A"), DT_INT32, 0);
+  auto b = ops::_Retval(scope.WithOpName("RET1"), a, 0);
+  auto c = ops::_Retval(scope.WithOpName("RET2"), a, 1);
+
+  std::unique_ptr<Graph> graph(new Graph(OpRegistry::Global()));
+  TF_ASSERT_OK(scope.ToGraph(graph.get()));
+
+  // Builds a description of the arguments.
+  std::vector<XlaCompiler::Argument> args(1);
+  args[0].kind = XlaCompiler::Argument::kParameter;
+  args[0].type = DT_INT32;
+  args[0].shape = TensorShape({2, 3});
+
+  auto options = DefaultOptions();
+  // Sets the representation function to return a non-default layout.
+  options.shape_representation_fn =
+      [](const TensorShape& shape, DataType type) -> xla::StatusOr<xla::Shape> {
+    xla::Shape xla_shape;
+    TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
+    *xla_shape.mutable_layout() = xla::LayoutUtil::MakeLayout({0, 1});
+    return xla_shape;
+  };
+
+  // Compiles the graph.
+  XlaCompiler compiler(options);
+
+  XlaCompiler::CompilationResult result;
+  TF_ASSERT_OK(compiler.CompileGraph(XlaCompiler::CompileOptions(), "id",
+                                     std::move(graph), args, &result));
+  xla::Shape result_shape =
+      xla::ShapeUtil::MakeShapeWithLayout(xla::S32, {2, 3}, {0, 1});
+
+  EXPECT_TRUE(xla::ShapeUtil::Equal(
+      result.xla_output_shape,
+      xla::ShapeUtil::MakeTupleShape({result_shape, result_shape})));
+}
+
 // Tests a simple graph that reads and writes a variable.
 TEST_F(XlaCompilerTest, ReturnResourceHandleOnly) {
   Scope scope = Scope::NewRootScope().ExitOnError();
diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc
index 43095fbb47351617a0de12a088c947106ccaa641..a69af70503376b6c0905deb8980abdc3254a6e47 100644
--- a/tensorflow/compiler/tf2xla/xla_context.cc
+++ b/tensorflow/compiler/tf2xla/xla_context.cc
@@ -54,25 +54,14 @@ const char XlaContext::kXlaContextResourceName[] = "_xla_context";
   return *context;
 }
 
-/* static */ XlaContext& XlaContext::Get(const XlaOpKernelContext* ctx) {
-  return Get(ctx->op_kernel_context());
-}
-
 void XlaContext::set_args(std::vector<XlaExpression> args) {
   args_ = std::move(args);
 }
 
-XlaContext::XlaContext(
-    XlaCompiler* compiler, xla::XlaBuilder* builder,
-    bool allow_cpu_custom_calls,
-    const std::function<xla::StatusOr<xla::Shape>(
-        const TensorShape&, DataType)>* shape_representation_fn)
-    : compiler_(compiler),
-      builder_(builder),
-      allow_cpu_custom_calls_(allow_cpu_custom_calls),
-      shape_representation_fn_(shape_representation_fn) {}
+XlaContext::XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder)
+    : compiler_(compiler), builder_(builder) {}
 
-string XlaContext::DebugString() { return "TLA JIT context"; }
+string XlaContext::DebugString() { return "XLA JIT context"; }
 
 void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   if (retvals_.size() <= index) {
@@ -81,21 +70,9 @@ void XlaContext::SetRetval(int index, const XlaExpression& expression) {
   retvals_[index] = expression;
 }
 
-Status XlaContext::CreateResource(
-    XlaResource::Kind kind, int arg_num, string name, DataType type,
-    TensorShape shape, const xla::XlaOp& handle, int64 tensor_array_size,
-    const std::set<string>& tensor_array_gradients, XlaResource** resource) {
-  resources_.emplace_back(
-      new XlaResource(kind, arg_num, std::move(name), type, std::move(shape),
-                      handle, tensor_array_size, tensor_array_gradients,
-                      /*tensor_array_multiple_writes_aggregate=*/false));
-  *resource = resources_.back().get();
-  return Status::OK();
-}
-
-xla::StatusOr<xla::Shape> XlaContext::RepresentationShape(
-    const TensorShape& shape, DataType type) const {
-  return (*shape_representation_fn_)(shape, type);
+XlaResource* XlaContext::AddResource(std::unique_ptr<XlaResource> resource) {
+  resources_.push_back(std::move(resource));
+  return resources_.back().get();
 }
 
 const xla::XlaComputation* XlaContext::GetOrCreateMax(const DataType type) {
diff --git a/tensorflow/compiler/tf2xla/xla_context.h b/tensorflow/compiler/tf2xla/xla_context.h
index dbfd344c9bad8a5d05abb6a3b902ed3baebbe02a..0767d1faac14cedb8666f6cc37175eb7b55f6158 100644
--- a/tensorflow/compiler/tf2xla/xla_context.h
+++ b/tensorflow/compiler/tf2xla/xla_context.h
@@ -41,14 +41,10 @@ class XlaContext : public ResourceBase {
  public:
   // Retrieves the XlaContext of the current compilation.
   static XlaContext& Get(const OpKernelContext* ctx);
-  static XlaContext& Get(const XlaOpKernelContext* ctx);
 
   // Creates a new XlaContext. See the documentation on the class data fields
   // for descriptions of the arguments.
-  XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder,
-             bool allow_cpu_custom_calls,
-             const std::function<xla::StatusOr<xla::Shape>(
-                 const TensorShape&, DataType)>* shape_representation_fn);
+  XlaContext(XlaCompiler* compiler, xla::XlaBuilder* builder);
 
   // Virtual method defined by ResourceBase.
   string DebugString() override;
@@ -58,8 +54,6 @@ class XlaContext : public ResourceBase {
   // Returns the XlaBuilder that Ops use for compiling new expressions.
   xla::XlaBuilder* builder() { return builder_; }
 
-  bool allow_cpu_custom_calls() const { return allow_cpu_custom_calls_; }
-
   const std::vector<XlaExpression>& args() const { return args_; }
   void set_args(std::vector<XlaExpression> args);
 
@@ -70,25 +64,13 @@ class XlaContext : public ResourceBase {
   // grows the return values vector to size index+1 if it is smaller.
   void SetRetval(int index, const XlaExpression& expression);
 
-  // Creates a resource with resource `kind` and initial value `handle`. `name`
-  // is a descriptive name for use in error messages. See the `XlaResource`
-  // constructor for a description of the remaining arguments.
-  // Fails if the resource already exists.
-  Status CreateResource(XlaResource::Kind kind, int arg_num, string name,
-                        DataType type, TensorShape shape,
-                        const xla::XlaOp& handle, int64 tensor_array_size,
-                        const std::set<string>& tensor_array_gradients,
-                        XlaResource** resource);
+  // Adds 'resource' to the set of resources owned by the context.
+  XlaResource* AddResource(std::unique_ptr<XlaResource> resource);
 
   const std::vector<std::unique_ptr<XlaResource>>& resources() {
     return resources_;
   }
 
-  // Returns the XLA shape to be used to represent a variable of TF `shape`
-  // and `type`, or of an argument or return value of a top-level computation.
-  xla::StatusOr<xla::Shape> RepresentationShape(const TensorShape& shape,
-                                                DataType type) const;
-
   // Get an XLA lambda to compute Max. This is cached in the
   // XlaContext since it may be used by multiple Ops. There is a
   // separate specialization of the computation for each DataType.
@@ -118,9 +100,6 @@ class XlaContext : public ResourceBase {
   // The XlaBuilder used to construct the subgraph's compiled representation.
   xla::XlaBuilder* builder_;
 
-  // Allow ops to emit CustomCall operations for CPU.
-  const bool allow_cpu_custom_calls_;
-
   // Arguments to the Tensorflow graph, indexed by _Arg index.
   // Includes both compile-time constant arguments and runtime parameters.
   std::vector<XlaExpression> args_;
@@ -131,11 +110,6 @@ class XlaContext : public ResourceBase {
   // Holds ownership of resources. The resources are not ordered.
   std::vector<std::unique_ptr<XlaResource>> resources_;
 
-  // Describes the on-host shapes of parameters and return values. Also see:
-  // XlaDevice::Options::shape_representation_fn.
-  const std::function<xla::StatusOr<xla::Shape>(const TensorShape&, DataType)>*
-      shape_representation_fn_;
-
   // Cache of prebuilt computations indexed by their type.
   using ComputationMap = std::map<DataType, xla::XlaComputation>;
 
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc
index 9a34cd8c6ae2dc6d52a3cc69168df96f5322c6da..c2c0751211180c3715a19d6c78e34659fd18914e 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.cc
+++ b/tensorflow/compiler/tf2xla/xla_helpers.cc
@@ -26,7 +26,6 @@ limitations under the License.
 #include "tensorflow/compiler/tf2xla/xla_op_kernel.h"
 #include "tensorflow/compiler/xla/client/lib/arithmetic.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -216,8 +215,7 @@ DataType XlaHelpers::SumAccumulationType(const DataType& dtype) {
   return dtype;
 }
 
-xla::XlaOp XlaHelpers::ConvertElementType(xla::XlaBuilder* const builder,
-                                          const xla::XlaOp& operand,
+xla::XlaOp XlaHelpers::ConvertElementType(const xla::XlaOp& operand,
                                           const DataType new_element_type) {
   xla::PrimitiveType convert_to;
   TF_CHECK_OK(DataTypeToPrimitiveType(new_element_type, &convert_to));
diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h
index 39578144caaadf293d24ea91aa874e56e27ecc01..4858dfee55a393d04cd2af83916eeb40820ee368 100644
--- a/tensorflow/compiler/tf2xla/xla_helpers.h
+++ b/tensorflow/compiler/tf2xla/xla_helpers.h
@@ -80,8 +80,7 @@ class XlaHelpers {
 
   // A helper for creating a ConvertElementType xla op given a DataType rather
   // than the xla::PrimitiveType.
-  static xla::XlaOp ConvertElementType(xla::XlaBuilder* const builder,
-                                       const xla::XlaOp& operand,
+  static xla::XlaOp ConvertElementType(const xla::XlaOp& operand,
                                        const DataType new_element_type);
 };
 
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
index 86a78ee429e8913edb4a948727fa692083c472f4..fabbcd04fed96ad814d04c2df9394f43bfe0cf99 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc
@@ -133,7 +133,8 @@ XlaJitCompiledCpuFunction::Compile(
   jit->executable_ = std::move(executable);
   jit->buffer_infos_ = std::move(buffer_infos);
   jit->arg_index_table_ = std::move(arg_index_table);
-  jit->program_shape_ = std::move(program_shape);
+  jit->program_shape_ =
+      absl::make_unique<xla::ProgramShapeProto>(program_shape->ToProto());
   jit->static_data_.set_raw_function(raw_function);
   jit->static_data_.set_buffer_infos(jit->buffer_infos_.data());
   jit->static_data_.set_num_buffers(jit->buffer_infos_.size());
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
index d3c8f22a8078d03d15447ed200c914390f40b04f..a5392057177e983e11787c31bb496a8947add1e6 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.h
@@ -80,8 +80,10 @@ class XlaJitCompiledCpuFunction {
   std::vector<const char*> arg_names_;
   std::vector<const char*> result_names_;
 
-  // The backing data for the program shape.
-  std::unique_ptr<const xla::ProgramShape> program_shape_;
+  // The backing data for the program shape. The proto form of program shape is
+  // used because the program shape is serialized and embedded in the object
+  // file.
+  std::unique_ptr<const xla::ProgramShapeProto> program_shape_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
index 6d49298a6f3e8a726695fafc42f3c5341fe98b5f..8846088678b53f6b9ecff0de732d6b5c82392b5a 100644
--- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
+++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc
@@ -116,13 +116,13 @@ TEST(XlaJitCompiledCpuFunction, Sum) {
   // Check program shape.
   using xla::ShapeUtil;
   const xla::Shape s32 = ShapeUtil::MakeShape(xla::S32, {});
-  const xla::ProgramShape* program_shape = function.ProgramShape();
-  ASSERT_TRUE(program_shape != nullptr);
-  ASSERT_EQ(program_shape->parameters_size(), 2);
-  EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(0), s32));
-  EXPECT_TRUE(ShapeUtil::Compatible(program_shape->parameters(1), s32));
+  ASSERT_TRUE(function.ProgramShape() != nullptr);
+  const xla::ProgramShape program_shape(*function.ProgramShape());
+  ASSERT_EQ(program_shape.parameters_size(), 2);
+  EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(0), s32));
+  EXPECT_TRUE(ShapeUtil::Compatible(program_shape.parameters(1), s32));
 
-  const xla::Shape& result = program_shape->result();
+  const xla::Shape& result = program_shape.result();
   ASSERT_EQ(result.element_type(), xla::TUPLE);
   ASSERT_EQ(ShapeUtil::TupleElementCount(result), 1);
   const xla::Shape& result0 = ShapeUtil::GetTupleElementShape(result, 0);
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
index 8dd8def0549f2b39d4c9863bb535f19703c3ef22..58808c76de6330a6b28e21dbdead03dea25847f6 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc
@@ -36,8 +36,16 @@ bool XlaOpKernelContext::ValidateInputsAreSameShape(OpKernel* op) {
   return context_->ValidateInputsAreSameShape(op);
 }
 
+XlaContext* XlaOpKernelContext::xla_context() const {
+  return &XlaContext::Get(context_);
+}
+
 xla::XlaBuilder* XlaOpKernelContext::builder() const {
-  return XlaContext::Get(this).builder();
+  return xla_context()->builder();
+}
+
+XlaCompiler* XlaOpKernelContext::compiler() const {
+  return xla_context()->compiler();
 }
 
 // Retrieves an XlaExpression that was allocated by a previous Op.
@@ -338,8 +346,8 @@ Status XlaOpKernelContext::ConstantInputList(
 namespace {
 
 Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
-                               const OpKernelContext* ctx, TensorShape* shape,
-                               xla::XlaOp* value) {
+                               const XlaOpKernelContext* ctx,
+                               TensorShape* shape, xla::XlaOp* value) {
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
   TF_RET_CHECK(variable != nullptr);
@@ -357,10 +365,9 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
     *shape = variable->shape();
   }
 
-  XlaContext& xla_context = XlaContext::Get(ctx);
-  TF_ASSIGN_OR_RETURN(
-      xla::Shape representation_shape,
-      xla_context.RepresentationShape(variable->shape(), variable->type()));
+  TF_ASSIGN_OR_RETURN(xla::Shape representation_shape,
+                      ctx->compiler()->options().shape_representation_fn(
+                          variable->shape(), variable->type()));
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(
       TensorShapeToXLAShape(variable->type(), variable->shape(), &xla_shape));
@@ -377,15 +384,15 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type,
 Status XlaOpKernelContext::ReadVariableInput(int index, DataType type,
                                              TensorShape* shape,
                                              xla::XlaOp* value) {
-  return ReadVariableInputTensor(context_->input(index), type, context_, shape,
+  return ReadVariableInputTensor(context_->input(index), type, this, shape,
                                  value);
 }
 
 Status XlaOpKernelContext::ReadVariableInput(absl::string_view name,
                                              DataType type, TensorShape* shape,
                                              xla::XlaOp* value) {
-  return ReadVariableInputTensor(GetInputTensorByName(name), type, context_,
-                                 shape, value);
+  return ReadVariableInputTensor(GetInputTensorByName(name), type, this, shape,
+                                 value);
 }
 
 Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type,
@@ -464,7 +471,7 @@ Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) {
 namespace {
 
 Status AssignVariableTensor(const Tensor& tensor, DataType type,
-                            const OpKernelContext* ctx, xla::XlaOp handle,
+                            const XlaOpKernelContext* ctx, xla::XlaOp handle,
                             xla::XlaBuilder* builder) {
   const XlaExpression* expression = CastExpressionFromTensor(tensor);
   XlaResource* variable = expression->resource();
@@ -481,9 +488,9 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
 
   TF_RETURN_IF_ERROR(variable->SetTypeAndShape(type, shape));
 
-  XlaContext& xla_context = XlaContext::Get(ctx);
-  TF_ASSIGN_OR_RETURN(xla::Shape representation_shape,
-                      xla_context.RepresentationShape(shape, type));
+  TF_ASSIGN_OR_RETURN(
+      xla::Shape representation_shape,
+      ctx->compiler()->options().shape_representation_fn(shape, type));
   xla::Shape xla_shape;
   TF_RETURN_IF_ERROR(TensorShapeToXLAShape(type, shape, &xla_shape));
   if (!xla::ShapeUtil::Compatible(xla_shape, representation_shape)) {
@@ -498,19 +505,15 @@ Status AssignVariableTensor(const Tensor& tensor, DataType type,
 Status XlaOpKernelContext::AssignVariable(int input_index, DataType type,
                                           xla::XlaOp handle) {
   TF_RET_CHECK(handle.valid());
-  return AssignVariableTensor(context_->input(input_index), type, context_,
-                              handle, builder());
+  return AssignVariableTensor(context_->input(input_index), type, this, handle,
+                              builder());
 }
 
 Status XlaOpKernelContext::AssignVariable(absl::string_view name, DataType type,
                                           xla::XlaOp handle) {
   TF_RET_CHECK(handle.valid());
-  return AssignVariableTensor(GetInputTensorByName(name), type, context_,
-                              handle, builder());
-}
-
-XlaCompiler* XlaOpKernelContext::compiler() const {
-  return XlaContext::Get(context_).compiler();
+  return AssignVariableTensor(GetInputTensorByName(name), type, this, handle,
+                              builder());
 }
 
 void XlaOpKernelContext::CtxFailure(const Status& s) {
@@ -530,22 +533,22 @@ void XlaOpKernelContext::CtxFailureWithWarning(const char* file, int line,
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMax(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMax(type);
+  return xla_context()->GetOrCreateMax(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMin(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMin(type);
+  return xla_context()->GetOrCreateMin(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateAdd(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateAdd(type);
+  return xla_context()->GetOrCreateAdd(type);
 }
 
 const xla::XlaComputation* XlaOpKernelContext::GetOrCreateMul(
     const DataType type) {
-  return XlaContext::Get(context_).GetOrCreateMul(type);
+  return xla_context()->GetOrCreateMul(type);
 }
 
 const Tensor& XlaOpKernelContext::GetInputTensorByName(absl::string_view name) {
diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.h b/tensorflow/compiler/tf2xla/xla_op_kernel.h
index c06efa2c474c5ec3cb5d75d94ba15d4096faa085..1858844bc05a6e12abbf07af83cad816590ddd03 100644
--- a/tensorflow/compiler/tf2xla/xla_op_kernel.h
+++ b/tensorflow/compiler/tf2xla/xla_op_kernel.h
@@ -60,6 +60,8 @@ class XlaOpKernelContext {
  public:
   explicit XlaOpKernelContext(OpKernelContext* context);
 
+  XlaContext* xla_context() const;
+
   // Returns the XLA XlaBuilder containing the output of compilation.
   xla::XlaBuilder* builder() const;
 
diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc
index dcd0e9c5c1f20c07c6d2b6fd7315a861817bc523..14237df69081016817fbd1a5332f22996e7f264d 100644
--- a/tensorflow/compiler/tf2xla/xla_op_registry.cc
+++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc
@@ -18,7 +18,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
-#include "tensorflow/compiler/jit/legacy_flags/mark_for_compilation_pass_flags.h"
+#include "tensorflow/compiler/jit/flags.h"
 #include "tensorflow/compiler/jit/xla_cluster_util.h"
 #include "tensorflow/compiler/tf2xla/type_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -130,8 +130,7 @@ XlaOpRegistry::~XlaOpRegistry() = default;
   // Lazily register the CPU and GPU JIT devices the first time
   // GetCompilationDevice is called.
   static void* registration_init = [&registry]() {
-    legacy_flags::MarkForCompilationPassFlags* flags =
-        legacy_flags::GetMarkForCompilationPassFlags();
+    MarkForCompilationPassFlags* flags = GetMarkForCompilationPassFlags();
     bool cpu_global_jit = flags->tf_xla_cpu_global_jit;
 
     mutex_lock lock(registry.mutex_);
diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc
index a322eb9015e829fd468133f3de6c12aad7e4ff74..48a3c012727acd8472d3d5d4072ae700f5497d96 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.cc
+++ b/tensorflow/compiler/tf2xla/xla_resource.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <functional>
 #include <memory>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/compiler/tf2xla/shape_util.h"
 #include "tensorflow/compiler/tf2xla/sharding_util.h"
 #include "tensorflow/compiler/tf2xla/xla_context.h"
@@ -39,9 +40,29 @@ namespace tensorflow {
   }
 }
 
+/*static*/ std::unique_ptr<XlaResource> XlaResource::CreateStack(
+    string name, DataType type, int64 max_size) {
+  return absl::make_unique<XlaResource>(
+      XlaResource::kStack, /*arg_num=*/-1, std::move(name), type, TensorShape(),
+      /*initial_value=*/xla::XlaOp(),
+      /*max_array_size=*/max_size,
+      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_multiple_writes_aggregate=*/false);
+}
+
+/*static*/ std::unique_ptr<XlaResource> XlaResource::CreateTensorArray(
+    string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
+    int64 max_array_size) {
+  return absl::make_unique<XlaResource>(
+      XlaResource::kTensorArray, /*arg_num=*/-1, std::move(name), type, shape,
+      initial_value, max_array_size,
+      /*tensor_array_gradients=*/std::set<string>{},
+      /*tensor_array_multiple_writes_aggregate=*/false);
+}
+
 XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
                          TensorShape shape, const xla::XlaOp& initial_value,
-                         int64 tensor_array_size,
+                         int64 max_array_size,
                          const std::set<string>& tensor_array_gradients,
                          bool tensor_array_multiple_writes_aggregate)
     : kind_(kind),
@@ -51,7 +72,7 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
       shape_(std::move(shape)),
       value_(initial_value),
       initial_value_(initial_value),
-      tensor_array_size_(tensor_array_size),
+      max_array_size_(max_array_size),
       tensor_array_multiple_writes_aggregate_(
           tensor_array_multiple_writes_aggregate) {
   CHECK(kind_ != kInvalid);
@@ -60,7 +81,7 @@ XlaResource::XlaResource(Kind kind, int arg_num, string name, DataType type,
     tensor_array_gradients_[gradient].reset(new XlaResource(
         /*kind=*/kTensorArray, /*arg_num=*/-1,
         /*name=*/absl::StrCat("TensorArrayGrad: ", name_), type_, shape_,
-        xla::XlaOp(), tensor_array_size_, /*tensor_array_gradients=*/{},
+        xla::XlaOp(), max_array_size_, /*tensor_array_gradients=*/{},
         /*tensor_array_multiple_writes_aggregate=*/true));
   }
 }
@@ -113,7 +134,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
     }
     case kTensorArray: {
       TensorShape ta_shape;
-      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AddDim(max_array_size_);
       ta_shape.AppendShape(shape_);
       value_ = xla::Broadcast(XlaHelpers::Zero(builder, type_),
                               ta_shape.dim_sizes());
@@ -121,7 +142,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) {
     }
     case kStack: {
       TensorShape ta_shape;
-      ta_shape.AddDim(tensor_array_size_);
+      ta_shape.AddDim(max_array_size_);
       ta_shape.AppendShape(shape_);
       value_ =
           xla::Tuple(builder, {xla::Broadcast(XlaHelpers::Zero(builder, type_),
@@ -146,14 +167,14 @@ Status XlaResource::GetOrCreateTensorArrayGradient(const string& source,
   std::unique_ptr<XlaResource>& gradient = tensor_array_gradients_[source];
   if (!gradient) {
     TensorShape ta_shape;
-    ta_shape.AddDim(tensor_array_size_);
+    ta_shape.AddDim(max_array_size_);
     ta_shape.AppendShape(shape_);
     xla::XlaOp gradient_value =
         xla::Broadcast(XlaHelpers::Zero(builder, type_), ta_shape.dim_sizes());
     gradient.reset(
         new XlaResource(/*kind=*/kTensorArray, /*arg_num=*/-1,
                         /*name=*/absl::StrCat("TensorArrayGrad: ", name_),
-                        type_, shape_, gradient_value, tensor_array_size_,
+                        type_, shape_, gradient_value, max_array_size_,
                         /*tensor_array_gradients=*/{},
                         /*tensor_array_multiple_writes_aggregate=*/true));
   }
diff --git a/tensorflow/compiler/tf2xla/xla_resource.h b/tensorflow/compiler/tf2xla/xla_resource.h
index 857b9a928bb824656f637b2b1ca2fc02a1bef139..736588bb8b89ba756cdce77eeebff8d1fcf4774c 100644
--- a/tensorflow/compiler/tf2xla/xla_resource.h
+++ b/tensorflow/compiler/tf2xla/xla_resource.h
@@ -38,9 +38,18 @@ class XlaResource {
   };
   static absl::string_view KindToString(Kind kind);
 
+  // Creates a new Stack resource.
+  static std::unique_ptr<XlaResource> CreateStack(string name, DataType type,
+                                                  int64 max_size);
+
+  // Creates a new TensorArray resource.
+  static std::unique_ptr<XlaResource> CreateTensorArray(
+      string name, DataType type, TensorShape shape, xla::XlaOp initial_value,
+      int64 max_array_size);
+
   XlaResource(Kind kind, int arg_num, string name, DataType type,
               TensorShape shape, const xla::XlaOp& initial_value,
-              int64 tensor_array_size,
+              int64 max_array_size,
               const std::set<string>& tensor_array_gradients,
               bool tensor_array_multiple_writes_aggregate);
 
@@ -119,12 +128,12 @@ class XlaResource {
   // TODO(phawkins): refactor this code to use subclasses, rather than putting
   // kind-specific fields in XlaResource.
 
-  // 'tensor_array_size' stores the expected size of the TensorArray or Stack.
+  // 'max_array_size' stores the expected size of the TensorArray or Stack.
   // We need to store this since sometimes TensorArrays must be initialized
   // lazily since we do not know the element shape at construction time.
   // Used by both TensorArrays and Stacks.
-  int64 tensor_array_size() const { return tensor_array_size_; }
-  void set_tensor_array_size(int64 size) { tensor_array_size_ = size; }
+  int64 max_array_size() const { return max_array_size_; }
+  void set_max_array_size(int64 size) { max_array_size_ = size; }
 
   bool tensor_array_multiple_writes_aggregate() const {
     return tensor_array_multiple_writes_aggregate_;
@@ -151,7 +160,7 @@ class XlaResource {
   xla::XlaOp value_;
   xla::XlaOp initial_value_;
 
-  int64 tensor_array_size_ = -1;
+  int64 max_array_size_ = -1;
   bool tensor_array_multiple_writes_aggregate_ = false;
 
   std::map<string, std::unique_ptr<XlaResource>> tensor_array_gradients_;
diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD
index 91096cf1d043eb652756f77b7594780124260766..4360e0857964b0ac63fc887e269b04a4b00d854a 100644
--- a/tensorflow/compiler/xla/BUILD
+++ b/tensorflow/compiler/xla/BUILD
@@ -226,12 +226,14 @@ cc_library(
         "index_util.cc",
         "layout_util.cc",
         "primitive_util.cc",
+        "shape.cc",
         "shape_util.cc",
     ],
     hdrs = [
         "index_util.h",
         "layout_util.h",
         "primitive_util.h",
+        "shape.h",
         "shape_util.h",
     ],
     visibility = ["//visibility:public"],
@@ -254,6 +256,23 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "shape_test",
+    srcs = ["shape_test.cc"],
+    deps = [
+        ":shape_util",
+        ":status_macros",
+        ":test",
+        ":test_helpers",
+        ":types",
+        ":util",
+        ":xla_data_proto",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test_main",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
 tf_cc_test(
     name = "shape_util_test",
     srcs = ["shape_util_test.cc"],
@@ -745,6 +764,8 @@ cc_library(
             "//tensorflow/core:framework_internal",
             "//tensorflow/core:lib",
             "@com_google_absl//absl/strings",
+            "@com_google_absl//absl/strings:str_format",
+            "@com_google_absl//absl/types:span",
         ],
 )
 
diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h
index 782c966b4c57672d137569a318fb20ace14d493b..e4aca98f67d50287a83afc6f41a59458f3df2da2 100644
--- a/tensorflow/compiler/xla/array2d.h
+++ b/tensorflow/compiler/xla/array2d.h
@@ -104,7 +104,7 @@ std::unique_ptr<Array2D<NativeT>> MakeLinspaceArray2D(double from, double to,
   int64 count = n1 * n2;
   NativeT step =
       static_cast<NativeT>((count > 1) ? (to - from) / (count - 1) : 0);
-  auto set = [&array, n1, n2](int64 index, NativeT value) {
+  auto set = [&array, n2](int64 index, NativeT value) {
     (*array)(index / n2, index % n2) = value;
   };
   for (int64 i = 0; i < count - 1; ++i) {
diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD
index 42da0ebf4992884187bbe21701a44d8ba2fccd64..fe99564d3c671cd7890e1fa26fcd2e3384972983 100644
--- a/tensorflow/compiler/xla/client/BUILD
+++ b/tensorflow/compiler/xla/client/BUILD
@@ -81,6 +81,7 @@ cc_library(
         "//tensorflow/core:lib",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
+        "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
     ],
 )
@@ -90,11 +91,12 @@ cc_library(
     srcs = ["executable_build_options.cc"],
     hdrs = ["executable_build_options.h"],
     deps = [
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/strings:str_format",
         "@com_google_absl//absl/types:optional",
@@ -191,6 +193,7 @@ cc_library(
     hdrs = ["xla_computation.h"],
     visibility = ["//visibility:public"],
     deps = [
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc
index eef2844e0df6aaf509881535f41493673fbeeee5..74b76f929949d3300a5d0ff45d5fa4cd9f162642 100644
--- a/tensorflow/compiler/xla/client/client.cc
+++ b/tensorflow/compiler/xla/client/client.cc
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/memory/memory.h"
 #include "absl/strings/str_cat.h"
+#include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/execution_options_util.h"
@@ -42,7 +43,7 @@ StatusOr<Literal> Client::Transfer(const GlobalData& data,
   TransferToClientRequest request;
   *request.mutable_data() = data.handle();
   if (shape_with_layout != nullptr) {
-    *request.mutable_shape_with_layout() = *shape_with_layout;
+    *request.mutable_shape_with_layout() = shape_with_layout->ToProto();
   }
   TransferToClientResponse response;
 
@@ -123,7 +124,7 @@ StatusOr<Literal> Client::TransferFromOutfeed(
   }
   request.set_replica_id(replica_id);
   if (shape_with_layout != nullptr) {
-    *request.mutable_shape_with_layout() = *shape_with_layout;
+    *request.mutable_shape_with_layout() = shape_with_layout->ToProto();
   }
   TransferFromOutfeedResponse response;
 
@@ -170,11 +171,14 @@ StatusOr<Literal> Client::ExecuteAndTransfer(
       std::unique_ptr<GlobalData> data,
       Execute(computation, arguments, execution_options, execution_profile));
 
-  const Shape* shape_with_output_layout = nullptr;
+  absl::optional<Shape> shape_with_output_layout;
   if (execution_options && execution_options->has_shape_with_output_layout()) {
-    shape_with_output_layout = &execution_options->shape_with_output_layout();
+    shape_with_output_layout =
+        Shape(execution_options->shape_with_output_layout());
   }
-  return Transfer(*data, shape_with_output_layout);
+  return Transfer(*data, shape_with_output_layout.has_value()
+                             ? &(*shape_with_output_layout)
+                             : nullptr);
 }
 
 StatusOr<Literal> Client::ComputeConstant(const XlaComputation& computation,
@@ -229,7 +233,7 @@ StatusOr<ExecutionHandle> Client::Compile(
 
   // The argument shapes affect how the computation is compiled.
   for (const auto& arg_shape : argument_shapes) {
-    *request.add_input_shape_with_layout() = arg_shape;
+    *request.add_input_shape_with_layout() = arg_shape.ToProto();
   }
 
   CompileResponse response;
@@ -458,7 +462,7 @@ StatusOr<Shape> Client::GetShape(const GlobalData& data) {
     return s;
   }
 
-  return response.shape();
+  return Shape(response.shape());
 }
 
 StatusOr<string> Client::ExecutionStatsAsString(
diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc
index 0f1745366b7c33e573aff2e66d85431b01488c49..1f594e551af381d7537e947892cbf7e0b5b3b861 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.cc
+++ b/tensorflow/compiler/xla/client/executable_build_options.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/executable_build_options.h"
 
 #include "absl/strings/str_format.h"
+#include "tensorflow/compiler/xla/debug_options_flags.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 
 namespace xla {
@@ -39,6 +40,13 @@ ExecutableBuildOptions& ExecutableBuildOptions::set_device_ordinal(
 
 int ExecutableBuildOptions::device_ordinal() const { return device_ordinal_; }
 
+DebugOptions* ExecutableBuildOptions::mutable_debug_options() {
+  if (!has_debug_options()) {
+    debug_options_ = GetDebugOptionsFromFlags();
+  }
+  return &debug_options_.value();
+}
+
 ExecutableBuildOptions& ExecutableBuildOptions::set_result_layout(
     const Shape& shape_with_layout) {
   result_layout_set_ = true;
@@ -55,68 +63,10 @@ string ExecutableBuildOptions::ToString() const {
   if (result_layout_set_) {
     result_layout = ShapeUtil::HumanStringWithLayout(result_layout_);
   }
-  string generate_hlo_graph = "nullopt";
-  if (generate_hlo_graph_.has_value()) {
-    generate_hlo_graph = generate_hlo_graph_.value();
-  }
   return absl::StrFormat(
       "ExecutableBuildOptions{device_ordinal=%d, result_layout=%s, "
       "generate_hlo_graph=%s}",
-      device_ordinal_, result_layout, generate_hlo_graph);
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_generate_hlo_graph(
-    string regex) {
-  generate_hlo_graph_ = std::move(regex);
-  return *this;
-}
-
-const absl::optional<string>& ExecutableBuildOptions::generate_hlo_graph()
-    const {
-  return generate_hlo_graph_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_dump_optimized_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_optimized_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_optimized_hlo_proto_to() const {
-  return dump_optimized_hlo_proto_to_;
-}
-
-ExecutableBuildOptions&
-ExecutableBuildOptions::set_dump_unoptimized_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_unoptimized_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_unoptimized_hlo_proto_to() const {
-  return dump_unoptimized_hlo_proto_to_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_dump_per_pass_hlo_proto_to(
-    absl::string_view dirpath) {
-  dump_per_pass_hlo_proto_to_ = string(dirpath);
-  return *this;
-}
-
-const absl::optional<string>&
-ExecutableBuildOptions::dump_per_pass_hlo_proto_to() const {
-  return dump_per_pass_hlo_proto_to_;
-}
-
-ExecutableBuildOptions& ExecutableBuildOptions::set_hlo_profile(bool enabled) {
-  hlo_profile_ = enabled;
-  return *this;
-}
-
-absl::optional<bool> ExecutableBuildOptions::hlo_profile() const {
-  return hlo_profile_;
+      device_ordinal_, result_layout, debug_options().xla_generate_hlo_graph());
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h
index 93334db88bc24f2ffbf3c7a57ee45ef238286739..a58090253bfac7779e4b61bc7231a0f0d945cc00 100644
--- a/tensorflow/compiler/xla/client/executable_build_options.h
+++ b/tensorflow/compiler/xla/client/executable_build_options.h
@@ -19,7 +19,9 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "absl/types/optional.h"
 #include "tensorflow/compiler/xla/service/device_memory_allocator.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla.pb.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
 namespace xla {
@@ -44,6 +46,12 @@ class ExecutableBuildOptions {
   ExecutableBuildOptions& set_result_layout(const Shape& shape_with_layout);
   const Shape* result_layout() const;
 
+  // Expose access to the XLA debug options which will be passed to the
+  // compilation process.
+  bool has_debug_options() const { return debug_options_.has_value(); }
+  const DebugOptions& debug_options() const { return *debug_options_; }
+  DebugOptions* mutable_debug_options();
+
   // If set, this specifies an allocator that can be used to allocate temporary
   // space on the device during compilation.  For example, the compiler might
   // want to run various algorithms on the device and pick the fastest one -- it
@@ -55,56 +63,16 @@ class ExecutableBuildOptions {
       DeviceMemoryAllocator* allocator);
   DeviceMemoryAllocator* device_allocator() const;
 
-  // If set, specifies a regexp of HLO graphs to dump (as in DebugOptions).
-  ExecutableBuildOptions& set_generate_hlo_graph(string regex);
-  const absl::optional<string>& generate_hlo_graph() const;
-
-  // If set, specifies a dirpath to dump the end-of-optimization-pipeline HLO
-  // protobuf to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_optimized_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_optimized_hlo_proto_to() const;
-
-  // If set, specifies a dirpath to dump the start-of-optimization-pipeline HLO
-  // protobuf to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_unoptimized_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_unoptimized_hlo_proto_to() const;
-
-  // If set, specifies a dirpath to dump the per-pass-in-pipeline HLO protobufs
-  // to (as in DebugOptions).
-  ExecutableBuildOptions& set_dump_per_pass_hlo_proto_to(
-      absl::string_view dirpath);
-  const absl::optional<string>& dump_per_pass_hlo_proto_to() const;
-
-  // If true, specifies that we should record an HLO profile during execution
-  // and log it after execution (as in DebugOptions). If nullopt the default is
-  // used.
-  ExecutableBuildOptions& set_hlo_profile(bool enabled);
-  absl::optional<bool> hlo_profile() const;
-
-  void add_disabled_hlo_pass(absl::string_view pass_name) {
-    disabled_hlo_passes_.push_back(std::string(pass_name));
-  }
-  const absl::Span<const std::string> disabled_hlo_passes() const {
-    return disabled_hlo_passes_;
-  }
-
   // Returns a string representation of the build options, suitable for
   // debugging.
   string ToString() const;
 
  private:
-  absl::optional<bool> hlo_profile_;
   int device_ordinal_ = -1;
   Shape result_layout_;
   bool result_layout_set_ = false;
-  absl::optional<string> generate_hlo_graph_;
-  absl::optional<string> dump_optimized_hlo_proto_to_;
-  absl::optional<string> dump_unoptimized_hlo_proto_to_;
-  absl::optional<string> dump_per_pass_hlo_proto_to_;
+  absl::optional<DebugOptions> debug_options_;
   DeviceMemoryAllocator* device_allocator_ = nullptr;
-  std::vector<std::string> disabled_hlo_passes_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD
index f833ddcd3235e08e2d0d3c0b9921e96ef871c89e..41db8de29ff0085a30847ff41db4ffbfc774e2a1 100644
--- a/tensorflow/compiler/xla/client/lib/BUILD
+++ b/tensorflow/compiler/xla/client/lib/BUILD
@@ -104,13 +104,17 @@ xla_test(
 )
 
 cc_library(
-    name = "numeric",
-    srcs = ["numeric.cc"],
-    hdrs = ["numeric.h"],
+    name = "matrix",
+    srcs = ["matrix.cc"],
+    hdrs = ["matrix.h"],
     deps = [
         ":arithmetic",
         ":constants",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
         "@com_google_absl//absl/types:span",
@@ -118,11 +122,12 @@ cc_library(
 )
 
 xla_test(
-    name = "numeric_test",
-    srcs = ["numeric_test.cc"],
+    name = "matrix_test",
+    srcs = ["matrix_test.cc"],
     tags = ["enable_for_xla_interpreter"],
     deps = [
-        ":numeric",
+        ":matrix",
+        ":slicing",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
@@ -164,7 +169,6 @@ cc_library(
     deps = [
         ":constants",
         ":math",
-        ":numeric",
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
@@ -173,13 +177,46 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "slicing",
+    srcs = ["slicing.cc"],
+    hdrs = ["slicing.h"],
+    deps = [
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+xla_test(
+    name = "slicing_test",
+    srcs = ["slicing_test.cc"],
+    tags = ["enable_for_xla_interpreter"],
+    deps = [
+        ":slicing",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+    ],
+)
+
 cc_library(
     name = "sorting",
     srcs = ["sorting.cc"],
     hdrs = ["sorting.h"],
     deps = [
-        ":numeric",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla/client:xla_builder",
     ],
@@ -188,10 +225,6 @@ cc_library(
 xla_test(
     name = "sorting_test",
     srcs = ["sorting_test.cc"],
-    blacklisted_backends = [
-        "cpu",
-        "gpu",
-    ],
     tags = ["enable_for_xla_interpreter"],
     deps = [
         ":sorting",
@@ -225,3 +258,48 @@ cc_library(
         "@com_google_absl//absl/strings",
     ],
 )
+
+cc_library(
+    name = "triangular_solve",
+    srcs = ["triangular_solve.cc"],
+    hdrs = ["triangular_solve.h"],
+    deps = [
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/client/lib:constants",
+        "//tensorflow/compiler/xla/client/lib:math",
+        "//tensorflow/compiler/xla/client/lib:matrix",
+        "//tensorflow/compiler/xla/client/lib:slicing",
+        "//tensorflow/core:lib",
+    ],
+)
+
+xla_test(
+    name = "triangular_solve_test",
+    srcs = ["triangular_solve_test.cc"],
+    tags = ["noasan"],  # sometimes times out, http://b/78650012
+    deps = [
+        ":triangular_solve",
+        "//tensorflow/compiler/xla:array2d",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:global_data",
+        "//tensorflow/compiler/xla/client:local_client",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc
index 08a887a6e4660cb2528f0ec7244b7ccc540808d2..36fdda39b4124b9100c6054160f9c17bdf787d6f 100644
--- a/tensorflow/compiler/xla/client/lib/math.cc
+++ b/tensorflow/compiler/xla/client/lib/math.cc
@@ -268,17 +268,16 @@ XlaOp Digamma(XlaOp input) {
 // Implements Banker's rounding: numbers that are equidistant between two
 // integers are rounded towards even.
 XlaOp RoundToEven(XlaOp x) {
-  auto half = xla::ScalarLike(x, 0.5);
-  auto one = xla::ScalarLike(x, 1.0);
-  auto two = xla::ScalarLike(x, 2.0);
+  auto half = ScalarLike(x, 0.5);
+  auto one = ScalarLike(x, 1.0);
+  auto two = ScalarLike(x, 2.0);
 
-  auto round_val = xla::Floor(x);
+  auto round_val = Floor(x);
   auto fraction = x - round_val;
-  auto nearest_even_int = round_val - two * xla::Floor(half * x);
-  auto is_odd = xla::Eq(nearest_even_int, one);
-  return xla::Select(xla::Or(xla::Gt(fraction, half),
-                             xla::And(xla::Eq(fraction, half), is_odd)),
-                     round_val + one, round_val);
+  auto nearest_even_int = round_val - two * Floor(half * x);
+  auto is_odd = Eq(nearest_even_int, one);
+  return Select(Or(Gt(fraction, half), And(Eq(fraction, half), is_odd)),
+                round_val + one, round_val);
 }
 
 // Trigonometric functions.
@@ -320,4 +319,13 @@ XlaOp Cosh(XlaOp x) { return (Exp(x) + Exp(-x)) * ScalarLike(x, 0.5); }
 
 XlaOp Sinh(XlaOp x) { return (Exp(x) - Exp(-x)) * ScalarLike(x, 0.5); }
 
+XlaOp MaybeConjugate(XlaOp x, bool conjugate) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    auto perform_conj = shape.element_type() == C64 && conjugate;
+    return perform_conj ? Conj(x) : x;
+  });
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/math.h b/tensorflow/compiler/xla/client/lib/math.h
index 3f06d04b9ae98b3aa75e68cd07810b2b4c24d280..17612bf9fdc0f1eabb338671c93c025c5b268872 100644
--- a/tensorflow/compiler/xla/client/lib/math.h
+++ b/tensorflow/compiler/xla/client/lib/math.h
@@ -86,6 +86,10 @@ XlaOp Cosh(XlaOp x);
 // Computes the hyperbolic sine of 'x'.
 XlaOp Sinh(XlaOp x);
 
+// Applies a complex conjugation operation if `a` is complex and `conjugate`
+// is true, otherwise returns its argument.
+xla::XlaOp MaybeConjugate(xla::XlaOp x, bool conjugate);
+
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATH_H_
diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ffd744d190885b8e3f4149a48a706498b3787618
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/matrix.cc
@@ -0,0 +1,185 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include <numeric>
+#include <vector>
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
+#include "tensorflow/compiler/xla/client/lib/constants.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/util.h"
+
+namespace xla {
+
+XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
+                     int64 n) {
+  auto a = Iota(builder, type, m);
+  auto b = Iota(builder, type, n);
+  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
+  return ConvertElementType(indicator, type);
+}
+
+XlaOp GetMatrixDiagonal(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    auto mask = Broadcast(indicator, major_dims);
+
+    // TPUs don't support S64 add reduction at the moment. But fortunately
+    // OR-reductions work just as well for integers.
+    XlaComputation reducer =
+        primitive_util::IsIntegralType(shape.element_type())
+            ? CreateScalarOrComputation(shape.element_type(), builder)
+            : CreateScalarAddComputation(shape.element_type(), builder);
+
+    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
+                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
+  });
+}
+
+XlaOp Triangle(XlaOp x, bool lower) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    const int64 m = shape.dimensions(n_dims - 2);
+    const int64 n = shape.dimensions(n_dims - 1);
+    absl::Span<const int64> major_dims =
+        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
+    auto a = Iota(builder, U32, n);
+    auto b = Iota(builder, U32, m);
+    XlaOp indicator;
+    if (lower) {
+      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    } else {
+      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
+    }
+    auto mask = Broadcast(indicator, major_dims);
+
+    return Select(mask, x, Zeros(builder, shape));
+  });
+}
+
+XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
+
+XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
+
+XlaOp BatchDot(XlaOp x, XlaOp y, PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape x_shape, builder->GetShape(x));
+    TF_ASSIGN_OR_RETURN(Shape y_shape, builder->GetShape(y));
+
+    // Check that both tensors have the same number of dimensions. There must be
+    // at least two (the batch dimensions can be empty).
+    if (ShapeUtil::Rank(x_shape) != ShapeUtil::Rank(y_shape)) {
+      return InvalidArgument(
+          "Arguments to BatchDot have different ranks: %s vs. %s",
+          ShapeUtil::HumanString(x_shape), ShapeUtil::HumanString(y_shape));
+    }
+    const int ndims = ShapeUtil::Rank(x_shape);
+    if (ndims < 2) {
+      return InvalidArgument(
+          "Arguments to BatchDot must have rank >= 2: got %d", ndims);
+    }
+
+    // The batch dimensions must be equal and the matrix dimensions must be
+    // valid.
+    std::vector<int64> batch_dimension_numbers;
+    for (int i = 0; i < ndims - 2; ++i) {
+      if (x_shape.dimensions(i) != y_shape.dimensions(i)) {
+        return InvalidArgument(
+            "Dimension %d of inputs to BatchDot must be equal: shapes %s vs %s",
+            i, ShapeUtil::HumanString(x_shape),
+            ShapeUtil::HumanString(y_shape));
+      }
+      batch_dimension_numbers.push_back(i);
+    }
+
+    int x_inner_dim = ndims - 1;
+    int y_inner_dim = ndims - 2;
+    if (x_shape.dimensions(x_inner_dim) != y_shape.dimensions(y_inner_dim)) {
+      return InvalidArgument(
+          "Dimensions %d and %d of arguments to BatchDot must be equal: "
+          "shapes %s vs %s",
+          x_inner_dim, y_inner_dim, ShapeUtil::HumanString(x_shape),
+          ShapeUtil::HumanString(y_shape));
+    }
+
+    // Check for zero lhs/rhs dim size.
+    if (ShapeUtil::IsZeroElementArray(x_shape) ||
+        ShapeUtil::IsZeroElementArray(y_shape)) {
+      std::vector<int64> dimensions(batch_dimension_numbers.size());
+      for (int i = 0; i < batch_dimension_numbers.size(); ++i) {
+        dimensions[i] = x_shape.dimensions(batch_dimension_numbers[i]);
+      }
+      int x_outer_dim = ndims - 2;
+      int y_outer_dim = ndims - 1;
+      dimensions.push_back(x_shape.dimensions(x_outer_dim));
+      dimensions.push_back(y_shape.dimensions(y_outer_dim));
+      return Broadcast(
+          ConstantLiteral(builder, LiteralUtil::Zero(x_shape.element_type())),
+          dimensions);
+    }
+
+    PrecisionConfig precision_proto;
+    precision_proto.add_operand_precision(precision);
+    precision_proto.add_operand_precision(precision);
+
+    DotDimensionNumbers dot_dnums;
+    dot_dnums.add_lhs_contracting_dimensions(x_inner_dim);
+    dot_dnums.add_rhs_contracting_dimensions(y_inner_dim);
+    for (auto batch_dimension_number : batch_dimension_numbers) {
+      dot_dnums.add_lhs_batch_dimensions(batch_dimension_number);
+      dot_dnums.add_rhs_batch_dimensions(batch_dimension_number);
+    }
+
+    return DotGeneral(x, y, dot_dnums, &precision_proto);
+  });
+}
+
+XlaOp TransposeInMinorDims(XlaOp x) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_dims >= 2);
+    std::vector<int64> permutation(n_dims);
+    std::iota(permutation.begin(), permutation.end(), 0);
+    std::swap(permutation[n_dims - 1], permutation[n_dims - 2]);
+    return Transpose(x, permutation);
+  });
+}
+
+XlaOp MaybeTransposeInMinorDims(XlaOp x, bool transpose) {
+  return transpose ? TransposeInMinorDims(x) : x;
+}
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.h b/tensorflow/compiler/xla/client/lib/matrix.h
similarity index 56%
rename from tensorflow/compiler/xla/client/lib/numeric.h
rename to tensorflow/compiler/xla/client/lib/matrix.h
index efd8cdc25724198633e0bf1c48c4e7d9e4b4c9e1..8856f99c7a0fee8f315aac11fab392cf5536f57b 100644
--- a/tensorflow/compiler/xla/client/lib/numeric.h
+++ b/tensorflow/compiler/xla/client/lib/matrix.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
-#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -22,9 +22,6 @@ limitations under the License.
 
 namespace xla {
 
-// Returns a rank 1 tensor of `type` containing values [0, 1, 2, ...].
-XlaOp Iota(XlaBuilder* builder, PrimitiveType type, int64 size);
-
 // Returns an m x n matrix with 1s on the diagonal elements, zeros everywhere
 // else.
 XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m, int64 n);
@@ -43,6 +40,34 @@ XlaOp UpperTriangle(XlaOp x);
 // Get the lower triangle part of the last two dimensions
 XlaOp LowerTriangle(XlaOp x);
 
+// Multiplies slices of two tensors in batches.
+
+// Multiplies all slices of `Tensor` `x` and `y` (each slice can be
+// viewed as an element of a batch), and arranges the individual results
+// in a single output tensor of the same batch size.
+//
+// The input tensors `x` and `y` are 2-D or higher with shape `[..., r_x, c_x]`
+// and `[..., r_y, c_y]`.
+//
+// The output tensor is 2-D or higher with shape `[..., r_o, c_o]`, where:
+//
+//     r_o = c_x if transpose_x else r_x
+//     c_o = r_y if transpose_y else c_y
+//
+// It is computed as:
+//
+//     output[..., :, :] = matrix(x[..., :, :]) * matrix(y[..., :, :])
+xla::XlaOp BatchDot(
+    xla::XlaOp x, xla::XlaOp y,
+    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::DEFAULT);
+
+// Transposes a stack of matrices `x` by swapping the last two dimensions.
+xla::XlaOp TransposeInMinorDims(xla::XlaOp x);
+
+// Transposes `x` in its minor dimensions if `transpose` is true, otherwise
+// returns `x` unchanged.
+xla::XlaOp MaybeTransposeInMinorDims(xla::XlaOp x, bool transpose);
+
 }  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_NUMERIC_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_MATRIX_H_
diff --git a/tensorflow/compiler/xla/client/lib/numeric_test.cc b/tensorflow/compiler/xla/client/lib/matrix_test.cc
similarity index 53%
rename from tensorflow/compiler/xla/client/lib/numeric_test.cc
rename to tensorflow/compiler/xla/client/lib/matrix_test.cc
index 7d6aedd49462bd4f075f90d0b0f85c40f1191aa1..0593a7517ac125ca8dc5395cee76f6bc23232cd3 100644
--- a/tensorflow/compiler/xla/client/lib/numeric_test.cc
+++ b/tensorflow/compiler/xla/client/lib/matrix_test.cc
@@ -13,7 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -24,13 +26,13 @@ limitations under the License.
 namespace xla {
 namespace {
 
-class NumericTest : public ClientLibraryTestBase {
+class MatrixTest : public ClientLibraryTestBase {
  protected:
   template <typename T>
   void TestMatrixDiagonal();
 };
 
-XLA_TEST_F(NumericTest, Triangle) {
+XLA_TEST_F(MatrixTest, Triangle) {
   XlaBuilder builder(TestName());
   Array3D<int32> input(2, 3, 4);
   input.FillIota(0);
@@ -45,7 +47,7 @@ XLA_TEST_F(NumericTest, Triangle) {
 }
 
 template <typename T>
-void NumericTest::TestMatrixDiagonal() {
+void MatrixTest::TestMatrixDiagonal() {
   XlaBuilder builder("GetMatrixDiagonal");
   Array3D<T> input(2, 3, 4);
   input.FillIota(0);
@@ -58,11 +60,46 @@ void NumericTest::TestMatrixDiagonal() {
   ComputeAndCompareR2<T>(&builder, expected, {a_data.get()});
 }
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S32) { TestMatrixDiagonal<int32>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+
+XLA_TEST_F(MatrixTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+
+Array3D<float> BatchedAValsFull() {
+  return {{
+              {2, 0, 1, 2},
+              {3, 6, 0, 1},
+              {4, 7, 9, 0},
+              {5, 8, 10, 11},
+          },
+          {
+              {16, 24, 8, 12},
+              {24, 61, 82, 48},
+              {8, 82, 456, 106},
+              {12, 48, 106, 62},
+          }};
+}
+
+XLA_TEST_F(MatrixTest, RowBatchDot) {
+  XlaBuilder builder(TestName());
+
+  int n = 4;
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_S64) { TestMatrixDiagonal<int64>(); }
+  XlaOp a, row, index;
+  auto a_data =
+      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
+  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
+                                           "row", &builder, &row);
+  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
+  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
 
-XLA_TEST_F(NumericTest, GetMatrixDiagonal_F32) { TestMatrixDiagonal<float>(); }
+  auto l_index = DynamicSliceInMinorDims(
+      a, {index, ConstantR0<int32>(&builder, 0)}, {1, n});
+  BatchDot(l_index, TransposeInMinorDims(row));
 
+  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
+                             {a_data.get(), row_data.get(), index_data.get()});
+}
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/numeric.cc b/tensorflow/compiler/xla/client/lib/numeric.cc
deleted file mode 100644
index 377654220b5df4487e9e194361473d54ff46a54e..0000000000000000000000000000000000000000
--- a/tensorflow/compiler/xla/client/lib/numeric.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <numeric>
-#include <vector>
-
-#include "absl/types/span.h"
-#include "tensorflow/compiler/xla/client/lib/arithmetic.h"
-#include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
-
-namespace xla {
-
-XlaOp IdentityMatrix(XlaBuilder* builder, PrimitiveType type, int64 m,
-                     int64 n) {
-  auto a = Iota(builder, type, m);
-  auto b = Iota(builder, type, n);
-  auto indicator = Eq(a, Broadcast(b, {m}), /*broadcast_dimensions=*/{0});
-  return ConvertElementType(indicator, type);
-}
-
-XlaOp GetMatrixDiagonal(XlaOp x) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    auto indicator = Eq(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    auto mask = Broadcast(indicator, major_dims);
-
-    // TPUs don't support S64 add reduction at the moment. But fortunately
-    // OR-reductions work just as well for integers.
-    XlaComputation reducer =
-        primitive_util::IsIntegralType(shape.element_type())
-            ? CreateScalarOrComputation(shape.element_type(), builder)
-            : CreateScalarAddComputation(shape.element_type(), builder);
-
-    return Reduce(Select(mask, x, Zeros(builder, shape)), ScalarLike(x, 0),
-                  reducer, {m >= n ? n_dims - 2 : n_dims - 1});
-  });
-}
-
-XlaOp Triangle(XlaOp x, bool lower) {
-  XlaBuilder* builder = x.builder();
-  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
-    const int64 n_dims = ShapeUtil::Rank(shape);
-    TF_RET_CHECK(n_dims >= 2);
-    const int64 m = shape.dimensions(n_dims - 2);
-    const int64 n = shape.dimensions(n_dims - 1);
-    absl::Span<const int64> major_dims =
-        AsInt64Slice(shape.dimensions()).subspan(/*pos=*/0, /*len=*/n_dims - 2);
-    auto a = Iota(builder, U32, n);
-    auto b = Iota(builder, U32, m);
-    xla::XlaOp indicator;
-    if (lower) {
-      indicator = Ge(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    } else {
-      indicator = Le(b, Broadcast(a, {m}), /*broadcast_dimensions=*/{0});
-    }
-    auto mask = Broadcast(indicator, major_dims);
-
-    return Select(mask, x, Zeros(builder, shape));
-  });
-}
-
-XlaOp UpperTriangle(XlaOp x) { return Triangle(x, false); }
-
-XlaOp LowerTriangle(XlaOp x) { return Triangle(x, true); }
-
-}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc
index c6f68c8ee2f5198017c37abeb9551478f52a99f4..85b9e1827dcef5ed907d893277deb5a52f8f30e9 100644
--- a/tensorflow/compiler/xla/client/lib/prng.cc
+++ b/tensorflow/compiler/xla/client/lib/prng.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "absl/base/casts.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
 #include "tensorflow/compiler/xla/client/lib/math.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/util.h"
 
diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f8c7df3ff5189c817202eaf39adb572f7e232ec2
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.cc
@@ -0,0 +1,134 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
+
+namespace xla {
+
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_RET_CHECK(start.size() == end.size());
+    int64 n_minor_dims = start.size();
+
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - n_minor_dims);
+
+    // Prepends 0s in the major dim
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + major_dims.size());
+
+    // Prepends the shape of the major dims.
+    std::vector<int64> padded_end(n_dims);
+    std::copy(major_dims.begin(), major_dims.end(), padded_end.begin());
+    std::copy(end.begin(), end.end(), padded_end.begin() + major_dims.size());
+
+    std::vector<int64> strides(n_dims, 1);
+    return Slice(x, padded_start, padded_end, strides);
+  });
+}
+
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    // TODO(phawkins): make int64 work on all backends, remove the int32 cast.
+    std::vector<int32> start_as_int32(start.begin(), start.end());
+    auto start_constant = ConstantR1<int32>(builder, start_as_int32);
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    TF_ASSIGN_OR_RETURN(Shape start_constant_shape,
+                        builder->GetShape(start_constant));
+    const int64 start_length =
+        ShapeUtil::GetDimension(start_constant_shape, -1);
+    TF_RET_CHECK(start_length == n_dims);
+    return DynamicUpdateSlice(x, update, start_constant);
+  });
+}
+
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    const int64 n_minor_dims = start.size();
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    std::vector<int64> padded_start(n_dims, 0);
+    std::copy(start.begin(), start.end(),
+              padded_start.begin() + (n_dims - n_minor_dims));
+    return UpdateSlice(x, update, padded_start);
+  });
+}
+
+namespace {
+
+std::vector<int64> ConcatVectors(absl::Span<const int64> xs,
+                                 absl::Span<const int64> ys) {
+  std::vector<int64> output(xs.size() + ys.size());
+  std::copy(xs.begin(), xs.end(), output.begin());
+  std::copy(ys.begin(), ys.end(), output.begin() + xs.size());
+  return output;
+}
+
+XlaOp PrependZerosInMajorDims(XlaOp x, absl::Span<const XlaOp> starts) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    auto zero = Reshape(ConstantR0<int32>(builder, 0), {1});
+    std::vector<XlaOp> padded_starts(n_dims, zero);
+    for (int i = 0; i < starts.size(); ++i) {
+      padded_starts[n_dims - starts.size() + i] = Reshape(starts[i], {1});
+    }
+    return ConcatInDim(builder, padded_starts, 0);
+  });
+}
+
+}  // namespace
+
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes) {
+  XlaBuilder* builder = x.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(x));
+    const int64 n_dims = ShapeUtil::Rank(shape);
+    int64 n_minor_dims = starts.size();
+    TF_RET_CHECK(n_minor_dims == sizes.size());
+    TF_RET_CHECK(n_minor_dims <= n_dims);
+    auto major_dims = AsInt64Slice(shape.dimensions())
+                          .subspan(
+                              /*pos=*/0,
+                              /*len=*/n_dims - sizes.size());
+    auto padded_starts = PrependZerosInMajorDims(x, starts);
+    auto padded_sizes = ConcatVectors(major_dims, sizes);
+    return DynamicSlice(x, padded_starts, padded_sizes);
+  });
+}
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts) {
+  auto padded_starts = PrependZerosInMajorDims(x, starts);
+  return DynamicUpdateSlice(x, update, padded_starts);
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h
new file mode 100644
index 0000000000000000000000000000000000000000..6c482a38b5489c9fb17c3dca9ee3d2a1b8fd1890
--- /dev/null
+++ b/tensorflow/compiler/xla/client/lib/slicing.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/types.h"
+
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
+
+namespace xla {
+
+// Updates a slice of 'x', i.e.,
+// x[start[0], ..., start[n]] = update
+XlaOp UpdateSlice(XlaOp x, XlaOp update, absl::Span<const int64> start);
+
+// Performs a slice in the minor dimensions of a tensor.
+// x[..., start[0]:end[0], ..., start[n]:end[n]]
+XlaOp SliceInMinorDims(XlaOp x, absl::Span<const int64> start,
+                       absl::Span<const int64> end);
+
+// Updates a slice of 'x', where 'start' contains a list of minor dimensions:
+// x[..., start[0]:..., ..., start[n]:...] = update
+XlaOp UpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                             absl::Span<const int64> start);
+
+// Performs a dynamic slice in the minor dimensions of a tensor.
+XlaOp DynamicSliceInMinorDims(XlaOp x, absl::Span<const XlaOp> starts,
+                              absl::Span<const int64> sizes);
+
+XlaOp DynamicUpdateSliceInMinorDims(XlaOp x, XlaOp update,
+                                    absl::Span<const XlaOp> starts);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_SLICING_H_
diff --git a/tensorflow/compiler/tf2xla/lib/util_test.cc b/tensorflow/compiler/xla/client/lib/slicing_test.cc
similarity index 67%
rename from tensorflow/compiler/tf2xla/lib/util_test.cc
rename to tensorflow/compiler/xla/client/lib/slicing_test.cc
index 442fe92c34ca26cb1a854cc90da8dc034bca79bb..8d362119e01006555db0f82d02626175936e1d05 100644
--- a/tensorflow/compiler/tf2xla/lib/util_test.cc
+++ b/tensorflow/compiler/xla/client/lib/slicing_test.cc
@@ -13,28 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/util.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 
-#include <memory>
-#include <numeric>
-#include <vector>
-
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/xla/array2d.h"
-#include "tensorflow/compiler/xla/literal.h"
-#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
-#include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
 #include "tensorflow/compiler/xla/types.h"
-#include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace tensorflow {
+namespace xla {
 namespace {
 
-using UtilTest = xla::ClientLibraryTestBase;
-using UtilLeftLookingTest = xla::ClientLibraryTestBase;
+using SlicingTest = xla::ClientLibraryTestBase;
 
 xla::Array2D<float> BValsRight() {
   return {{1, 2, 3, 4}, {5, 6, 7, 8}, {9, 10, 11, 12}};
@@ -63,7 +54,7 @@ xla::Array3D<float> BatchedAValsFull() {
           }};
 }
 
-XLA_TEST_F(UtilTest, Simple2dLookup) {
+XLA_TEST_F(SlicingTest, Simple2dLookup) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, x, y;
@@ -77,7 +68,7 @@ XLA_TEST_F(UtilTest, Simple2dLookup) {
                              xla::ErrorSpec(1e-2, 1e-2));
 }
 
-XLA_TEST_F(UtilTest, Simple3dLookup) {
+XLA_TEST_F(SlicingTest, Simple3dLookup) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, index;
@@ -92,7 +83,7 @@ XLA_TEST_F(UtilTest, Simple3dLookup) {
                              {a_data.get(), index_data.get()});
 }
 
-XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
+XLA_TEST_F(SlicingTest, SimpleSliceUpdate) {
   xla::XlaBuilder builder(TestName());
 
   xla::XlaOp a, b, x, y;
@@ -111,26 +102,5 @@ XLA_TEST_F(UtilTest, SimpleSliceUpdate) {
       {a_data.get(), b_data.get(), x_data.get(), y_data.get()});
 }
 
-XLA_TEST_F(UtilTest, RowBatchDot) {
-  xla::XlaBuilder builder(TestName());
-
-  int n = 4;
-
-  xla::XlaOp a, row, index;
-  auto a_data =
-      CreateR3Parameter<float>(BatchedAValsFull(), 0, "a", &builder, &a);
-  auto row_data = CreateR3Parameter<float>({{{9, 1, 0, 0}}, {{2, 4, 0, 0}}}, 1,
-                                           "row", &builder, &row);
-  // Select {{3, 6, 0, 1}, {24, 61,  82,  48}} out of BatchedAValsFull().
-  auto index_data = CreateR0Parameter<int>(1, 2, "index", &builder, &index);
-
-  auto l_index = DynamicSliceInMinorDims(
-      a, {index, xla::ConstantR0<int32>(&builder, 0)}, {1, n});
-  BatchDot(l_index, row, /*transpose_x=*/false, /*transpose_y=*/true);
-
-  ComputeAndCompareR3<float>(&builder, {{{33}}, {{292}}},
-                             {a_data.get(), row_data.get(), index_data.get()});
-}
-
 }  // namespace
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc
index 0475fd9c94f6e390b5169cfe2cbba8eae28ddc18..e8553a08bb014e790822a14e128686b60b8d6b7c 100644
--- a/tensorflow/compiler/xla/client/lib/sorting.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
 
@@ -23,13 +25,12 @@ XlaOp TopK(XlaOp input, int64 k) {
   return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(Shape input_shape, builder->GetShape(input));
     int last_dim = input_shape.dimensions_size() - 1;
-    int last_dim_size = input_shape.dimensions(last_dim);
 
-    XlaOp iota_s32 = Iota(builder, S32, last_dim_size);
+    Shape iota_shape =
+        ShapeUtil::MakeShape(S32, AsInt64Slice(input_shape.dimensions()));
+    XlaOp iota_s32 = Iota(builder, iota_shape, last_dim);
     auto input_dims = input_shape.dimensions();
-    std::vector<int64> broadcast_dims(input_dims.begin(), input_dims.end() - 1);
-    XlaOp broadcast_s32 = Broadcast(iota_s32, broadcast_dims);
-    XlaOp sort_result = Sort(Neg(input), {broadcast_s32});
+    XlaOp sort_result = Sort(Neg(input), {iota_s32});
     std::vector<int64> start_indices(input_shape.dimensions_size(), 0);
     std::vector<int64> limit_indices(input_dims.begin(), input_dims.end());
     limit_indices[last_dim] = k;
diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc
index fef98c9923096e21a755c6d730de2c7c10852b2d..27ff36c7491ab8397d46f3a49493ff2b904deb2d 100644
--- a/tensorflow/compiler/xla/client/lib/sorting_test.cc
+++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc
@@ -14,6 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/client/lib/sorting.h"
+
+#include <limits>
+
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/client_library_test_base.h"
@@ -41,6 +44,28 @@ XLA_TEST_F(SortingTest, TopK3From8Indices) {
   ComputeAndCompareR1<int>(&builder, {0, 1, 2}, {});
 }
 
+// TODO(b/119930279): enable this test.
+XLA_TEST_F(SortingTest, DISABLED_TopKFullSortMinInt) {
+  XlaBuilder builder(TestName());
+  auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
+                                          std::numeric_limits<int>::min() + 1,
+                                          std::numeric_limits<int>::max()});
+  xla::GetTupleElement(xla::TopK(x_rev, 3), 1);
+  ComputeAndCompareR1<int>(&builder, {2, 1, 0}, {});
+}
+
+XLA_TEST_F(SortingTest, NOT_TopKFullSortMinInt) {
+  XlaBuilder builder(TestName());
+  auto x_rev = ConstantR1<int>(&builder, {std::numeric_limits<int>::min(),
+                                          std::numeric_limits<int>::min() + 1,
+                                          std::numeric_limits<int>::max()});
+  xla::GetTupleElement(xla::TopK(x_rev, 3), 1);
+  // TopK currently negates the keys, which doesn't work correctly for
+  // std::numeric_limits<int>::min(). Therefore, it will sort this key to the
+  // front instead of to the back.
+  ComputeAndCompareR1<int>(&builder, {0, 2, 1}, {});
+}
+
 XLA_TEST_F(SortingTest, TopKFullSort) {
   XlaBuilder builder(TestName());
   const int kSize = 16;
@@ -56,5 +81,13 @@ XLA_TEST_F(SortingTest, TopKFullSort) {
   ComputeAndCompareR1<float>(&builder, inputs, {});
 }
 
+XLA_TEST_F(SortingTest, TopKFullSortWithDuplicates) {
+  XlaBuilder builder(TestName());
+  XlaOp a;
+  auto a_data = CreateR1Parameter<int>({1, 1, 2, 2, 1}, 0, "a", &builder, &a);
+  xla::GetTupleElement(xla::TopK(a, 5), 1);
+  ComputeAndCompareR1<int>(&builder, {2, 3, 0, 1, 4}, {a_data.get()});
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc
index a44681f586278bf03f3fb2b8c812936cbf3ad47b..a95bbf2c8c860914877d3195b97342097dafc725 100644
--- a/tensorflow/compiler/xla/client/lib/testing.cc
+++ b/tensorflow/compiler/xla/client/lib/testing.cc
@@ -66,7 +66,7 @@ std::unique_ptr<GlobalData> MakeFakeDataViaDeviceOrDie(const Shape& shape,
   XlaComputation computation = b.Build().ConsumeValueOrDie();
 
   auto execution_options = CreateDefaultExecutionOptions();
-  *execution_options.mutable_shape_with_output_layout() = shape;
+  *execution_options.mutable_shape_with_output_layout() = shape.ToProto();
   return client->Execute(computation, /*arguments=*/{}, &execution_options)
       .ConsumeValueOrDie();
 }
@@ -98,8 +98,8 @@ std::vector<std::unique_ptr<GlobalData>> MakeFakeArgumentsOrDie(
   auto program_shape = computation.proto().host_program_shape();
 
   std::vector<std::unique_ptr<GlobalData>> results;
-  for (const Shape& shape : program_shape.parameters()) {
-    results.push_back(MakeFakeDataOrDie(shape, client));
+  for (const ShapeProto& shape : program_shape.parameters()) {
+    results.push_back(MakeFakeDataOrDie(Shape(shape), client));
   }
   return results;
 }
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
similarity index 62%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve.cc
rename to tensorflow/compiler/xla/client/lib/triangular_solve.cc
index 6524c2a9b1ada632d80edd234272760c2b545cc4..c5a1d34cc66e6f8c1a832f8a8437163b846a5431 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.cc
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 #include <memory>
 #include <vector>
 
-#include "tensorflow/compiler/tf2xla/lib/batch_dot.h"
-#include "tensorflow/compiler/tf2xla/lib/util.h"
 #include "tensorflow/compiler/xla/client/lib/constants.h"
-#include "tensorflow/compiler/xla/client/lib/numeric.h"
+#include "tensorflow/compiler/xla/client/lib/math.h"
+#include "tensorflow/compiler/xla/client/lib/matrix.h"
+#include "tensorflow/compiler/xla/client/lib/slicing.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
@@ -29,21 +29,20 @@ limitations under the License.
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/util.h"
-#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/math/math_util.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Get the diagonal blocks of the coefficient matrix
-xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(a));
-    int ndims = xla::ShapeUtil::Rank(shape);
-    int64 n = xla::ShapeUtil::GetDimension(shape, -1);
+XlaOp DiagonalBlocks(XlaOp a, int64 block_size) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(a));
+    int ndims = ShapeUtil::Rank(shape);
+    int64 n = ShapeUtil::GetDimension(shape, -1);
     int64 num_blocks = n / block_size;
 
-    xla::XlaOp diag_blocks;
+    XlaOp diag_blocks;
 
     // If the coefficient matrix is exactly the block size, we just add a
     // singleton dimension i.e. [..., n, n] -> [..., 1, n, n]
@@ -58,13 +57,13 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
     if (n > block_size) {
       // Construct the starting indices of the diagonal blocks
       auto start_indices =
-          Transpose(Broadcast(Mul(Iota(builder, xla::S32, num_blocks),
-                                  xla::ConstantR0<int32>(builder, block_size)),
+          Transpose(Broadcast(Mul(Iota(builder, S32, num_blocks),
+                                  ConstantR0<int32>(builder, block_size)),
                               /*broadcast_sizes=*/{2}),
                     /*permutation=*/{1, 0});
 
       // Gather the diagonal blocks
-      xla::GatherDimensionNumbers dim_numbers;
+      GatherDimensionNumbers dim_numbers;
       dim_numbers.add_offset_dims(ndims - 1);
       dim_numbers.add_offset_dims(ndims);
       dim_numbers.add_start_index_map(ndims - 2);
@@ -80,7 +79,7 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
       // Pad with zeros
       auto last_blocks =
           SliceInMinorDims(a, {n - n % block_size, n - n % block_size}, {n, n});
-      xla::PaddingConfig config = xla::MakeNoPaddingConfig(ndims);
+      PaddingConfig config = MakeNoPaddingConfig(ndims);
       int64 padding = block_size - n % block_size;
       config.mutable_dimensions(ndims - 1)->set_edge_padding_high(padding);
       config.mutable_dimensions(ndims - 2)->set_edge_padding_high(padding);
@@ -89,9 +88,8 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
 
       // Add a singleton dimension
       // i.e. [..., block_size, block_size] -> [..., 1, block_size, block_size]
-      TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                          builder->GetShape(last_blocks));
-      auto shape_dims = xla::AsInt64Slice(blocks_shape.dimensions());
+      TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(last_blocks));
+      auto shape_dims = AsInt64Slice(blocks_shape.dimensions());
       auto last_blocks_dims = std::vector<int64>(ndims);
       std::copy(shape_dims.begin(), shape_dims.end(), last_blocks_dims.begin());
       last_blocks_dims.insert(last_blocks_dims.end() - 2, 1);
@@ -100,7 +98,7 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
       // Concatenate with the other blocks if necessary
       if (n > block_size) {
         diag_blocks =
-            xla::ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
+            ConcatInDim(builder, {diag_blocks, last_blocks}, ndims - 2);
       } else {
         diag_blocks = last_blocks;
       }
@@ -110,16 +108,16 @@ xla::XlaOp DiagonalBlocks(xla::XlaOp a, int64 block_size) {
   });
 }
 
-xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
-                                bool transpose_a, bool conjugate_a,
-                                xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = diag_blocks.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
+XlaOp InvertDiagonalBlocks(XlaOp diag_blocks, bool lower, bool transpose_a,
+                           bool conjugate_a,
+                           PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = diag_blocks.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     // Input is a batch of square lower triangular square matrices. Its shape is
     // (..., size, size). We resize this to (num_blocks, size, size).
-    TF_ASSIGN_OR_RETURN(xla::Shape shape, builder->GetShape(diag_blocks));
-    int64 block_size = xla::ShapeUtil::GetDimension(shape, -1);
-    int64 num_blocks = xla::ShapeUtil::ElementsIn(shape) /
+    TF_ASSIGN_OR_RETURN(Shape shape, builder->GetShape(diag_blocks));
+    int64 block_size = ShapeUtil::GetDimension(shape, -1);
+    int64 num_blocks = ShapeUtil::ElementsIn(shape) /
                        tensorflow::MathUtil::IPow(block_size, 2);
     diag_blocks = Reshape(diag_blocks, {num_blocks, block_size, block_size});
 
@@ -131,9 +129,9 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
     // zero (which can happen if the last block was padded) otherwise it will
     // introduce nans which will propagate
     auto diags = GetMatrixDiagonal(diag_blocks);
-    TF_ASSIGN_OR_RETURN(xla::Shape diags_shape, builder->GetShape(diags));
+    TF_ASSIGN_OR_RETURN(Shape diags_shape, builder->GetShape(diags));
     auto one = ScalarLike(diags, 1);
-    auto ones = Broadcast(one, xla::AsInt64Slice(diags_shape.dimensions()));
+    auto ones = Broadcast(one, AsInt64Slice(diags_shape.dimensions()));
     diags = Select(Eq(diags, Zero(builder, shape.element_type())), ones, diags);
     auto scaled_diag_blocks = Div(diag_blocks, diags, {0, 2});
 
@@ -159,40 +157,40 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
     auto start_index = (lower) ? 0 : block_size - 1;
     auto output_block = DynamicUpdateSlice(
         neg_identity, pos_one,
-        /*start_indices=*/xla::ConstantR1<int>(builder, 2, start_index));
+        /*start_indices=*/ConstantR1<int>(builder, 2, start_index));
 
     // Broadcast diag([1, -1, -1, ...]) to every block
-    xla::XlaOp output = Broadcast(output_block,
-                                  /*broadcast_sizes=*/{num_blocks});
+    XlaOp output = Broadcast(output_block,
+                             /*broadcast_sizes=*/{num_blocks});
 
     // Now we construct a loop that performs matrix-vector multiplications
     // inverting the blocks one row at a time
-    std::vector<xla::Shape> tuple_shapes = {
+    std::vector<Shape> tuple_shapes = {
         // The loop iteration counter is a scalar, incremented each iteration.
-        xla::ShapeUtil::MakeShape(xla::S32, {}),
+        ShapeUtil::MakeShape(S32, {}),
         // The output has the shape of A, with one row updated each iteration.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size}),
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size}),
         // The input is a loop invariant.
-        xla::ShapeUtil::MakeShape(shape.element_type(),
-                                  {num_blocks, block_size, block_size})};
-    xla::Shape tuple_shape = xla::ShapeUtil::MakeTupleShape(tuple_shapes);
+        ShapeUtil::MakeShape(shape.element_type(),
+                             {num_blocks, block_size, block_size})};
+    Shape tuple_shape = ShapeUtil::MakeTupleShape(tuple_shapes);
 
-    auto init_i = One(builder, xla::S32);
-    auto init = xla::Tuple(builder, {init_i, output, scaled_diag_blocks});
+    auto init_i = One(builder, S32);
+    auto init = Tuple(builder, {init_i, output, scaled_diag_blocks});
 
     // Construct the loop condition function.
-    std::unique_ptr<xla::XlaBuilder> condb =
+    std::unique_ptr<XlaBuilder> condb =
         builder->CreateSubBuilder("InvertDiagCond");
     {
       auto i = GetTupleElement(
           Parameter(condb.get(), 0, tuple_shape, "InvertDiagCondTuple"), 0);
-      Lt(i, xla::ConstantR0<int32>(condb.get(), block_size));
+      Lt(i, ConstantR0<int32>(condb.get(), block_size));
     }
     TF_ASSIGN_OR_RETURN(auto cond, condb->Build());
 
     // Construct the loop body function.
-    std::unique_ptr<xla::XlaBuilder> bodyb =
+    std::unique_ptr<XlaBuilder> bodyb =
         builder->CreateSubBuilder("InvertDiagBody");
     {
       auto input_tuple =
@@ -202,21 +200,21 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
       auto body_out = GetTupleElement(input_tuple, 1);
       auto body_input = GetTupleElement(input_tuple, 2);
 
-      auto zero = xla::ConstantR1<int32>(bodyb.get(), 1, 0);
+      auto zero = ConstantR1<int32>(bodyb.get(), 1, 0);
       auto j = (lower) ? i : ScalarLike(i, block_size - 1) - i;
       auto start_indices =
-          xla::ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
+          ConcatInDim(bodyb.get(), {zero, Reshape(j, {1}), zero}, 0);
       auto input_row =
           DynamicSlice(body_input, start_indices,
                        /*slice_sizes=*/{num_blocks, 1, block_size});
 
       // We want -L21 L11^{-1}
-      xla::DotDimensionNumbers dnums;
+      DotDimensionNumbers dnums;
       dnums.add_lhs_batch_dimensions(0);
       dnums.add_rhs_batch_dimensions(0);
       dnums.add_lhs_contracting_dimensions(2);
       dnums.add_rhs_contracting_dimensions(1);
-      xla::PrecisionConfig precision_proto;
+      PrecisionConfig precision_proto;
       precision_proto.add_operand_precision(precision);
       precision_proto.add_operand_precision(precision);
       auto update = -DotGeneral(input_row, body_out, dnums, &precision_proto);
@@ -224,7 +222,7 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
       body_out = DynamicUpdateSlice(body_out, update, start_indices);
 
       auto next_i = i + ScalarLike(i, 1);
-      xla::Tuple(bodyb.get(), {next_i, body_out, body_input});
+      Tuple(bodyb.get(), {next_i, body_out, body_input});
     }
     TF_ASSIGN_OR_RETURN(auto body, bodyb->Build());
 
@@ -238,27 +236,26 @@ xla::XlaOp InvertDiagonalBlocks(xla::XlaOp diag_blocks, bool lower,
                           /*broadcast_dimensions=*/{0, 1});
 
     // Reshape back to original batch major dimensions
-    return Reshape(inv_diag_blocks, xla::AsInt64Slice(shape.dimensions()));
+    return Reshape(inv_diag_blocks, AsInt64Slice(shape.dimensions()));
   });
 }
 
-xla::XlaOp SolveWithInvertedDiagonalBlocks(
-    xla::XlaOp a, xla::XlaOp b, xla::XlaOp inv_diag_blocks, bool left_side,
-    bool lower, bool transpose_a, bool conjugate_a,
-    xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape blocks_shape,
-                        builder->GetShape(inv_diag_blocks));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    int64 block_size = xla::ShapeUtil::GetDimension(blocks_shape, -1);
-
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    int64 ndims = xla::ShapeUtil::Rank(a_shape);
-    int64 n = xla::ShapeUtil::GetDimension(a_shape, -1);
+XlaOp SolveWithInvertedDiagonalBlocks(XlaOp a, XlaOp b, XlaOp inv_diag_blocks,
+                                      bool left_side, bool lower,
+                                      bool transpose_a, bool conjugate_a,
+                                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape blocks_shape, builder->GetShape(inv_diag_blocks));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    int64 block_size = ShapeUtil::GetDimension(blocks_shape, -1);
+
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    int64 ndims = ShapeUtil::Rank(a_shape);
+    int64 n = ShapeUtil::GetDimension(a_shape, -1);
     int64 num_blocks = n / block_size + (n % block_size != 0);
     int64 m_dim = (left_side) ? -1 : -2;
-    int64 m = xla::ShapeUtil::GetDimension(b_shape, m_dim);
+    int64 m = ShapeUtil::GetDimension(b_shape, m_dim);
 
     // Initialize the solution
     auto x = ZerosLike(b);
@@ -294,7 +291,7 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
       }
       auto b_row = SliceInMinorDims(b, start, end);
 
-      xla::XlaOp remainder;
+      XlaOp remainder;
       if (i == 0) {
         remainder = b_row;
       } else {
@@ -311,29 +308,27 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
         auto a_row =
             MaybeConjugate(SliceInMinorDims(a, start, end), conjugate_a);
         if (left_side) {
-          remainder = b_row - BatchDot(a_row, x, transpose_a, false,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
+          remainder =
+              b_row - BatchDot(MaybeTransposeInMinorDims(a_row, transpose_a), x,
+                               precision);
         } else {
-          remainder = b_row - BatchDot(x, a_row, false, transpose_a,
-                                       /*conjugate_x=*/false,
-                                       /*conjugate_y=*/false, precision);
+          remainder =
+              b_row - BatchDot(x, MaybeTransposeInMinorDims(a_row, transpose_a),
+                               precision);
         }
       }
 
-      xla::XlaOp x_update;
-      auto zero = Zero(builder, xla::S32);
-      auto start_index =
-          xla::ConstantR0WithType(builder, xla::S32, j * block_size);
-      std::vector<xla::XlaOp> update_starts = {start_index, zero};
+      XlaOp x_update;
+      auto zero = Zero(builder, S32);
+      auto start_index = ConstantR0WithType(builder, S32, j * block_size);
+      std::vector<XlaOp> update_starts = {start_index, zero};
       if (left_side) {
-        x_update =
-            BatchDot(inv_block, remainder, transpose_a, false,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+        x_update = BatchDot(MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            remainder, precision);
       } else {
-        x_update =
-            BatchDot(remainder, inv_block, false, transpose_a,
-                     /*conjugate_x=*/false, /*conjugate_y=*/false, precision);
+        x_update = BatchDot(remainder,
+                            MaybeTransposeInMinorDims(inv_block, transpose_a),
+                            precision);
         std::swap(update_starts[0], update_starts[1]);
       }
       x = DynamicUpdateSliceInMinorDims(x, x_update, /*starts=*/update_starts);
@@ -343,24 +338,24 @@ xla::XlaOp SolveWithInvertedDiagonalBlocks(
   });
 }
 
-xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
-                           bool lower, bool transpose_a, bool conjugate_a,
-                           int64 block_size,
-                           xla::PrecisionConfig::Precision precision) {
-  xla::XlaBuilder* builder = a.builder();
-  return builder->ReportErrorOrReturn([&]() -> xla::StatusOr<xla::XlaOp> {
-    TF_ASSIGN_OR_RETURN(xla::Shape a_shape, builder->GetShape(a));
-    TF_ASSIGN_OR_RETURN(xla::Shape b_shape, builder->GetShape(b));
-    if (xla::ShapeUtil::Rank(a_shape) != xla::ShapeUtil::Rank(b_shape)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have different ranks: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs. ",
-          xla::ShapeUtil::HumanString(b_shape));
+XlaOp TriangularSolve(XlaOp a, XlaOp b, bool left_side, bool lower,
+                      bool transpose_a, bool conjugate_a, int64 block_size,
+                      PrecisionConfig::Precision precision) {
+  XlaBuilder* builder = a.builder();
+  return builder->ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
+    TF_ASSIGN_OR_RETURN(Shape a_shape, builder->GetShape(a));
+    TF_ASSIGN_OR_RETURN(Shape b_shape, builder->GetShape(b));
+    if (ShapeUtil::Rank(a_shape) != ShapeUtil::Rank(b_shape)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have shapes with different ranks: "
+          "%s vs. %s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
-    const int64 ndims = xla::ShapeUtil::Rank(a_shape);
+    const int64 ndims = ShapeUtil::Rank(a_shape);
     if (ndims < 2) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve must have rank >= 2: ", ndims);
+      return InvalidArgument(
+          "Arguments to TriangularSolve was rank %d but must have rank >= 2.",
+          ndims);
     }
     // The batch dimensions must be equal.
     std::vector<int64> batch_dimensions;
@@ -368,32 +363,33 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
       int64 a_size = a_shape.dimensions(i);
       int64 b_size = b_shape.dimensions(i);
       if (a_size != b_size) {
-        return errors::InvalidArgument(
-            "Batch dimensions of arguments to TriangularSolve must be equal: ",
-            xla::ShapeUtil::HumanString(a_shape), " vs ",
-            xla::ShapeUtil::HumanString(b_shape));
+        return InvalidArgument(
+            "Batch dimensions of arguments to TriangularSolve must be equal; "
+            "shapes were %s and %s.",
+            ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
       }
       batch_dimensions.push_back(a_size);
     }
 
-    if (xla::ShapeUtil::GetDimension(a_shape, -1) !=
-        xla::ShapeUtil::GetDimension(a_shape, -2)) {
-      return errors::InvalidArgument(
-          "The 'a' arguments to TriangularSolve must be square matrices: ",
-          xla::ShapeUtil::HumanString(a_shape));
+    if (ShapeUtil::GetDimension(a_shape, -1) !=
+        ShapeUtil::GetDimension(a_shape, -2)) {
+      return InvalidArgument(
+          "The 'a' argument to TriangularSolve must be a batched square matrix;"
+          " shape was: %s",
+          ShapeUtil::HumanString(a_shape));
     }
-    const int64 m = xla::ShapeUtil::GetDimension(b_shape, -2);
-    const int64 n = xla::ShapeUtil::GetDimension(b_shape, -1);
-    if ((left_side ? m : n) != xla::ShapeUtil::GetDimension(a_shape, -1)) {
-      return errors::InvalidArgument(
-          "Arguments to TriangularSolve have incompatible matrix shapes: ",
-          xla::ShapeUtil::HumanString(a_shape), " vs ",
-          xla::ShapeUtil::HumanString(b_shape));
+    const int64 m = ShapeUtil::GetDimension(b_shape, -2);
+    const int64 n = ShapeUtil::GetDimension(b_shape, -1);
+    if ((left_side ? m : n) != ShapeUtil::GetDimension(a_shape, -1)) {
+      return InvalidArgument(
+          "Arguments to TriangularSolve have incompatible matrix shapes %s and "
+          "%s",
+          ShapeUtil::HumanString(a_shape), ShapeUtil::HumanString(b_shape));
     }
 
     if (block_size < 1) {
-      return errors::InvalidArgument(
-          "block_size argument to TriangularSolve must be >= 1; got ",
+      return InvalidArgument(
+          "block_size argument to TriangularSolve must be >= 1; got %d",
           block_size);
     }
 
@@ -413,4 +409,4 @@ xla::XlaOp TriangularSolve(xla::XlaOp a, xla::XlaOp b, bool left_side,
   });
 }
 
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve.h b/tensorflow/compiler/xla/client/lib/triangular_solve.h
similarity index 88%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve.h
rename to tensorflow/compiler/xla/client/lib/triangular_solve.h
index 2303234f361e54cd2a0ad495cb03b371bed76877..50a3b30ebd1c15eb6d2ace4e351cb41f21db7093 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve.h
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve.h
@@ -13,13 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
-#define TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
+#define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
 
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
-namespace tensorflow {
+namespace xla {
 
 // Solves systems of linear equations with lower or upper triangular coefficient
 // matrices by forward- or back-substitution. Broadcasting along leading
@@ -57,11 +57,11 @@ namespace tensorflow {
 //
 // Uses a blocked algorithm if `block_size` is > 1; if block_size == 1 then no
 // blocking is used.
-xla::XlaOp TriangularSolve(
-    xla::XlaOp a, xla::XlaOp b, bool left_side, bool lower, bool transpose_a,
+XlaOp TriangularSolve(
+    XlaOp a, XlaOp b, bool left_side, bool lower, bool transpose_a,
     bool conjugate_a, int64 block_size = 128,
-    xla::PrecisionConfig::Precision precision = xla::PrecisionConfig::HIGHEST);
+    PrecisionConfig::Precision precision = PrecisionConfig::HIGHEST);
 
-}  // namespace tensorflow
+}  // namespace xla
 
-#endif  // TENSORFLOW_COMPILER_TF2XLA_LIB_TRIANGULAR_SOLVE_H_
+#endif  // TENSORFLOW_COMPILER_XLA_CLIENT_LIB_TRIANGULAR_SOLVE_H_
diff --git a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
similarity index 99%
rename from tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
rename to tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
index aeebf16028d40189203cdfd815f06a339ee72902..f6a70d64a788d95a456774ccbbcf67f2e5cac98b 100644
--- a/tensorflow/compiler/tf2xla/lib/triangular_solve_test.cc
+++ b/tensorflow/compiler/xla/client/lib/triangular_solve_test.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#include "tensorflow/compiler/tf2xla/lib/triangular_solve.h"
+#include "tensorflow/compiler/xla/client/lib/triangular_solve.h"
 
 #include <memory>
 #include <numeric>
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace tensorflow {
+namespace xla {
 namespace {
 
 using TriangularSolveTest = xla::ClientLibraryTestBase;
@@ -330,4 +330,4 @@ XLA_TEST_F(TriangularSolveTest, SimpleLeftUpperTransposeNoconjugate) {
 }
 
 }  // namespace
-}  // namespace tensorflow
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc
index f96b6c9c261a9686fb647e3da0dcc933cd1f70df..049cd15738a619294b19d5cf74ca514d7b4a00ad 100644
--- a/tensorflow/compiler/xla/client/local_client.cc
+++ b/tensorflow/compiler/xla/client/local_client.cc
@@ -71,9 +71,9 @@ Status LocalExecutable::ValidateExecutionOptions(
           "parameter "
           "%d: want %s, got %s",
           i,
-          ShapeUtil::HumanString(
+          ShapeUtil::HumanStringWithLayout(
               computation_layout.parameter_layout(i).shape()),
-          ShapeUtil::HumanString(arguments[i]->on_host_shape()));
+          ShapeUtil::HumanStringWithLayout(arguments[i]->on_host_shape()));
     }
   }
 
@@ -310,4 +310,28 @@ StatusOr<int> LocalClient::ReplicaNumberToDeviceOrdinal(int replica_number) {
   return local_service_->ReplicaNumberToDeviceOrdinal(replica_number);
 }
 
+StatusOr<TransferToServerResponse> LocalClient::TransferToLocalServer(
+    const ::xla::BorrowingLiteral& literal, int device_oridinal) {
+  const ::xla::Shape& shape = literal.shape();
+
+  TF_ASSIGN_OR_RETURN(
+      ::xla::ScopedShapedBuffer shaped_buffer,
+      backend().transfer_manager()->AllocateScopedShapedBuffer(
+          shape, backend().memory_allocator(), device_oridinal));
+  TF_ASSIGN_OR_RETURN(auto stream,
+                      mutable_backend()->BorrowStream(device_oridinal));
+  TF_RETURN_IF_ERROR(backend().transfer_manager()->TransferLiteralToDevice(
+      stream.get(), literal, shaped_buffer));
+  std::vector<::xla::ScopedShapedBuffer> replicated_buffer;
+  replicated_buffer.emplace_back(std::move(shaped_buffer));
+  ::xla::TransferToServerResponse result;
+  TF_ASSIGN_OR_RETURN(*result.mutable_data(),
+                      local_service_->RegisterReplicatedBuffers(
+                          std::move(replicated_buffer),
+                          absl::StrCat("TransferToServer literal of shape ",
+                                       ::xla::ShapeUtil::HumanString(shape))));
+
+  return result;
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h
index e49451ca9708ab506d11af5f9855db245674864c..ddb36680e8b185b053368baffa6f1d5cac50dc07 100644
--- a/tensorflow/compiler/xla/client/local_client.h
+++ b/tensorflow/compiler/xla/client/local_client.h
@@ -129,6 +129,10 @@ class LocalClient : public Client {
       const Literal& literal, int device_ordinal,
       DeviceMemoryAllocator* allocator = nullptr);
 
+  // Transfer the BorrowingLiteral to the device with the given ordinal.
+  StatusOr<TransferToServerResponse> TransferToLocalServer(
+      const ::xla::BorrowingLiteral& literal, int device_oridinal);
+
   // Copy the data from the device contained in the given ShapedBuffer and
   // return as a Literal.
   StatusOr<Literal> ShapedBufferToLiteral(const ShapedBuffer& shaped_buffer);
diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc
index 176802b33ef824a1f898255a19e44def3c1fc982..fb9ea6ec3fc41d5e04ca125798a8199350470a44 100644
--- a/tensorflow/compiler/xla/client/sharding_builder.cc
+++ b/tensorflow/compiler/xla/client/sharding_builder.cc
@@ -36,7 +36,7 @@ OpSharding Tile(const Shape& tile_shape,
                 const TileAssignment& tile_assignment) {
   OpSharding result;
   result.set_type(OpSharding::Type::OpSharding_Type_OTHER);
-  *result.mutable_tile_shape() = tile_shape;
+  *result.mutable_tile_shape() = tile_shape.ToProto();
   for (int64 dim : tile_assignment.dimensions()) {
     result.add_tile_assignment_dimensions(dim);
   }
@@ -52,7 +52,7 @@ OpSharding Tile1D(const Shape& tile_shape, int64 num_tiles) {
 
   CHECK_EQ(ShapeUtil::Rank(tile_shape), 1);
   std::vector<int64> dimensions(1, num_tiles);
-  *result.mutable_tile_shape() = tile_shape;
+  *result.mutable_tile_shape() = tile_shape.ToProto();
   auto& tile_dimension =
       (*result.mutable_tile_shape()->mutable_dimensions())[0];
   tile_dimension = CeilOfRatio(static_cast<int64>(tile_dimension), num_tiles);
diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc
index 0a587725d20507555382ef0657bdc08369a7fbac..60df2ec3959216b0564846ad47c21c5bcc01ea57 100644
--- a/tensorflow/compiler/xla/client/xla_builder.cc
+++ b/tensorflow/compiler/xla/client/xla_builder.cc
@@ -102,7 +102,7 @@ StatusOr<Shape> XlaBuilder::GetShape(const XlaOp& op) const {
   TF_RETURN_IF_ERROR(first_error_);
 
   TF_ASSIGN_OR_RETURN(auto instr, LookUpInstruction(op));
-  return instr->shape();
+  return Shape(instr->shape());
 }
 
 StatusOr<std::vector<Shape>> XlaBuilder::GetOperandShapes(
@@ -155,7 +155,7 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64 root_id) const {
 
   ProgramShape program_shape;
 
-  *program_shape.mutable_result() = root_proto->shape();
+  *program_shape.mutable_result() = Shape(root_proto->shape());
 
   // Check that the parameter numbers are continuous from 0, and add parameter
   // shapes and names to the program shape.
@@ -172,7 +172,7 @@ StatusOr<ProgramShape> XlaBuilder::GetProgramShape(int64 root_id) const {
       const int64 index = instr.parameter_number();
       TF_RET_CHECK(index >= 0 && index < param_count)
           << "invalid parameter number: " << index;
-      *program_shape.mutable_parameters(index) = instr.shape();
+      *program_shape.mutable_parameters(index) = Shape(instr.shape());
       *program_shape.mutable_parameter_names(index) = instr.name();
     }
   }
@@ -239,6 +239,19 @@ void XlaBuilder::IsConstantVisitor(const int64 op_handle,
   visited->insert(op_handle);
 }
 
+Status XlaBuilder::SetDynamicBinding(int64 dynamic_size_param_num,
+                                     ShapeIndex dynamic_size_param_index,
+                                     int64 target_param_num,
+                                     ShapeIndex target_param_index,
+                                     int64 target_dim_num) {
+  TF_RETURN_IF_ERROR(dynamic_parameter_binding_.Bind(
+      DynamicParameterBinding::DynamicParameter{dynamic_size_param_num,
+                                                dynamic_size_param_index},
+      DynamicParameterBinding::DynamicDimension{
+          target_param_num, target_param_index, target_dim_num}));
+  return Status::OK();
+}
+
 XlaComputation XlaBuilder::BuildAndNoteError() {
   DCHECK(parent_builder_ != nullptr);
   auto build_status = Build();
@@ -275,7 +288,8 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
 
   HloComputationProto entry;
   SetProtoIdAndName(&entry, name_, kNameSeparator, GetNextId());
-  TF_ASSIGN_OR_RETURN(*entry.mutable_program_shape(), GetProgramShape(root_id));
+  TF_ASSIGN_OR_RETURN(ProgramShape program_shape, GetProgramShape(root_id));
+  *entry.mutable_program_shape() = program_shape.ToProto();
   entry.set_root_id(root_id);
 
   for (auto& instruction : instructions_) {
@@ -297,6 +311,9 @@ StatusOr<XlaComputation> XlaBuilder::Build(int64 root_id) {
   }
   module->add_computations()->Swap(&entry);
 
+  *(module->mutable_dynamic_parameter_binding()) =
+      dynamic_parameter_binding_.ToProto();
+
   // Clear data held by this builder.
   this->instructions_.clear();
   this->handle_to_index_.clear();
@@ -312,7 +329,7 @@ StatusOr<XlaOp> XlaBuilder::InDimBroadcast(
   TF_RETURN_IF_ERROR(first_error_);
 
   HloInstructionProto instr;
-  *instr.mutable_shape() = shape;
+  *instr.mutable_shape() = shape.ToProto();
   for (int64 dim : broadcast_dimensions) {
     instr.add_dimensions(dim);
   }
@@ -363,8 +380,9 @@ XlaOp XlaBuilder::UnaryOp(HloOpcode unop, const XlaOp& operand) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferUnaryOpShape(unop, operand_shape));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), unop, {operand});
   });
 }
@@ -375,9 +393,10 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferBinaryOpShape(
                             binop, lhs_shape, rhs_shape, broadcast_dimensions));
+    *instr.mutable_shape() = shape.ToProto();
 
     const int64 lhs_rank = ShapeUtil::Rank(lhs_shape);
     const int64 rhs_rank = ShapeUtil::Rank(rhs_shape);
@@ -391,7 +410,7 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
       const Shape& from_shape = should_broadcast_lhs ? lhs_shape : rhs_shape;
 
       std::vector<int64> to_size;
-      for (int64 size : instr.shape().dimensions()) {
+      for (int64 size : shape.dimensions()) {
         to_size.push_back(size);
       }
       for (int64 from_dim = 0; from_dim < ShapeUtil::Rank(from_shape);
@@ -411,14 +430,14 @@ XlaOp XlaBuilder::BinaryOp(HloOpcode binop, const XlaOp& lhs, const XlaOp& rhs,
     }
 
     TF_ASSIGN_OR_RETURN(Shape updated_lhs_shape, GetShape(updated_lhs));
-    if (!ShapeUtil::SameDimensions(instr.shape(), updated_lhs_shape)) {
+    if (!ShapeUtil::SameDimensions(shape, updated_lhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_lhs,
-                          AddBroadcastSequence(instr.shape(), updated_lhs));
+                          AddBroadcastSequence(shape, updated_lhs));
     }
     TF_ASSIGN_OR_RETURN(Shape updated_rhs_shape, GetShape(updated_rhs));
-    if (!ShapeUtil::SameDimensions(instr.shape(), updated_rhs_shape)) {
+    if (!ShapeUtil::SameDimensions(shape, updated_rhs_shape)) {
       TF_ASSIGN_OR_RETURN(updated_rhs,
-                          AddBroadcastSequence(instr.shape(), updated_rhs));
+                          AddBroadcastSequence(shape, updated_rhs));
     }
 
     return AddInstruction(std::move(instr), binop, {updated_lhs, updated_rhs});
@@ -432,30 +451,28 @@ XlaOp XlaBuilder::TernaryOp(HloOpcode triop, const XlaOp& lhs, const XlaOp& rhs,
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
     TF_ASSIGN_OR_RETURN(const Shape& ehs_shape, GetShape(ehs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferTernaryOpShape(
-                            triop, lhs_shape, rhs_shape, ehs_shape));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferTernaryOpShape(triop, lhs_shape,
+                                                         rhs_shape, ehs_shape));
+    *instr.mutable_shape() = shape.ToProto();
     XlaOp updated_lhs = lhs;
     XlaOp updated_rhs = rhs;
     XlaOp updated_ehs = ehs;
-    if (!ShapeUtil::IsTuple(instr.shape())) {
+    if (!ShapeUtil::IsTuple(shape)) {
       if (!ShapeUtil::IsTuple(lhs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), lhs_shape)) {
+          !ShapeUtil::SameDimensions(shape, lhs_shape)) {
         // lhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_lhs,
-                            AddBroadcastSequence(instr.shape(), lhs));
+        TF_ASSIGN_OR_RETURN(updated_lhs, AddBroadcastSequence(shape, lhs));
       }
       if (!ShapeUtil::IsTuple(rhs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), rhs_shape)) {
+          !ShapeUtil::SameDimensions(shape, rhs_shape)) {
         // rhs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_rhs,
-                            AddBroadcastSequence(instr.shape(), rhs));
+        TF_ASSIGN_OR_RETURN(updated_rhs, AddBroadcastSequence(shape, rhs));
       }
       if (!ShapeUtil::IsTuple(ehs_shape) &&
-          !ShapeUtil::SameDimensions(instr.shape(), ehs_shape)) {
+          !ShapeUtil::SameDimensions(shape, ehs_shape)) {
         // ehs is being implicitly broadcasted. Change to explicit.
-        TF_ASSIGN_OR_RETURN(updated_ehs,
-                            AddBroadcastSequence(instr.shape(), ehs));
+        TF_ASSIGN_OR_RETURN(updated_ehs, AddBroadcastSequence(shape, ehs));
       }
     }
     return AddInstruction(std::move(instr), triop,
@@ -476,7 +493,7 @@ XlaOp XlaBuilder::Mul(const XlaOp& lhs, const XlaOp& rhs,
 XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = literal.shape();
+    *instr.mutable_shape() = literal.shape().ToProto();
     *instr.mutable_literal() = literal.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConstant);
   });
@@ -485,7 +502,7 @@ XlaOp XlaBuilder::ConstantLiteral(const LiteralSlice& literal) {
 XlaOp XlaBuilder::Iota(const Shape& shape, int64 iota_dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     instr.add_dimensions(iota_dimension);
     return AddInstruction(std::move(instr), HloOpcode::kIota);
   });
@@ -505,10 +522,10 @@ XlaOp XlaBuilder::Call(const XlaComputation& computation,
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferCallShape(operand_shape_ptrs,
-                                       /*to_apply=*/called_program_shape));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCallShape(
+                                         operand_shape_ptrs,
+                                         /*to_apply=*/called_program_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(computation, &instr);
 
@@ -526,7 +543,7 @@ XlaOp XlaBuilder::Parameter(int64 parameter_number, const Shape& shape,
     }
     instr.set_parameter_number(parameter_number);
     instr.set_name(name);
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kParameter);
   });
 }
@@ -556,27 +573,35 @@ XlaOp XlaBuilder::Broadcast(const XlaOp& operand,
 }
 
 XlaOp XlaBuilder::BroadcastInDim(
-    const XlaOp& operand, const Shape& shape,
+    const XlaOp& operand, const absl::Span<const int64> out_dim_size,
     const absl::Span<const int64> broadcast_dimensions) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(operand_shape, shape,
-                                                           broadcast_dimensions)
+    // Output shape, in the case of degenerate broadcast, the out_dim_size is
+    // not necessarily the same as the dimension sizes of the output shape.
+    const auto& output_shape =
+        ShapeUtil::MakeShape(operand_shape.element_type(), out_dim_size);
+
+    TF_RETURN_IF_ERROR(ShapeInference::InferBroadcastShape(
+                           operand_shape, output_shape, broadcast_dimensions)
                            .status());
-    std::vector<int64> in_dim_size(ShapeUtil::Rank(shape));
-    absl::c_copy(shape.dimensions(), in_dim_size.begin());
+    std::vector<int64> in_dim_size(out_dim_size.begin(), out_dim_size.end());
     for (int i = 0; i < broadcast_dimensions.size(); i++) {
       in_dim_size[broadcast_dimensions[i]] = operand_shape.dimensions(i);
     }
     const auto& in_dim_shape =
-        ShapeUtil::MakeShape(shape.element_type(), in_dim_size);
+        ShapeUtil::MakeShape(operand_shape.element_type(), in_dim_size);
     TF_ASSIGN_OR_RETURN(
         XlaOp in_dim_broadcast,
         InDimBroadcast(in_dim_shape, operand, broadcast_dimensions));
-    if (ShapeUtil::Equal(in_dim_shape, shape)) {
+
+    // If broadcast is not degenerate, return broadcasted result.
+    if (ShapeUtil::Equal(in_dim_shape, output_shape)) {
       return in_dim_broadcast;
     }
-    return AddBroadcastSequence(shape, in_dim_broadcast);
+
+    // Otherwise handle degenerate broadcast case.
+    return AddBroadcastSequence(output_shape, in_dim_broadcast);
   });
 }
 
@@ -584,7 +609,7 @@ StatusOr<XlaOp> XlaBuilder::Reshape(const Shape& shape, const XlaOp& operand) {
   TF_RETURN_IF_ERROR(first_error_);
 
   HloInstructionProto instr;
-  *instr.mutable_shape() = shape;
+  *instr.mutable_shape() = shape.ToProto();
   return AddInstruction(std::move(instr), HloOpcode::kReshape, {operand});
 }
 
@@ -596,9 +621,9 @@ XlaOp XlaBuilder::Slice(const XlaOp& operand,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferSliceShape(operand_shape, start_indices,
-                                        limit_indices, strides));
+        Shape shape, ShapeInference::InferSliceShape(
+                         operand_shape, start_indices, limit_indices, strides));
+    *instr.mutable_shape() = shape.ToProto();
     for (int i = 0; i < start_indices.size(); i++) {
       auto* slice_config = instr.add_slice_dimensions();
       slice_config->set_start(start_indices[i]);
@@ -633,9 +658,10 @@ XlaOp XlaBuilder::DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicSliceShape(
                             operand_shape, start_indices_shape, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (int64 size : slice_sizes) {
       instr.add_dynamic_slice_sizes(size);
@@ -655,9 +681,10 @@ XlaOp XlaBuilder::DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
     TF_ASSIGN_OR_RETURN(const Shape& update_shape, GetShape(update));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDynamicUpdateSliceShape(
                             operand_shape, update_shape, start_indices_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     return AddInstruction(std::move(instr), HloOpcode::kDynamicUpdateSlice,
                           {operand, update, start_indices});
@@ -673,9 +700,9 @@ XlaOp XlaBuilder::ConcatInDim(absl::Span<const XlaOp> operands,
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(operands));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConcatOpShape(operand_shape_ptrs, dimension));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConcatOpShape(
+                                         operand_shape_ptrs, dimension));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.add_dimensions(dimension);
 
@@ -692,10 +719,9 @@ XlaOp XlaBuilder::Pad(const XlaOp& operand, const XlaOp& padding_value,
     TF_ASSIGN_OR_RETURN(const Shape& padding_value_shape,
                         GetShape(padding_value));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferPadShape(operand_shape, padding_value_shape,
-                                      padding_config));
-
+        Shape shape, ShapeInference::InferPadShape(
+                         operand_shape, padding_value_shape, padding_config));
+    *instr.mutable_shape() = shape.ToProto();
     *instr.mutable_padding_config() = padding_config;
 
     return AddInstruction(std::move(instr), HloOpcode::kPad,
@@ -708,7 +734,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           absl::Span<const int64> new_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(const Shape& shape,
+    TF_ASSIGN_OR_RETURN(const Shape shape,
                         ShapeInference::InferReshapeShape(
                             operand_shape, dimensions, new_sizes));
     XlaOp transposed = IsIdentityPermutation(dimensions)
@@ -721,7 +747,7 @@ XlaOp XlaBuilder::Reshape(const XlaOp& operand,
 XlaOp XlaBuilder::Reshape(const XlaOp& operand,
                           absl::Span<const int64> new_sizes) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
-    TF_ASSIGN_OR_RETURN(auto shape, GetShape(operand));
+    TF_ASSIGN_OR_RETURN(Shape shape, GetShape(operand));
     std::vector<int64> dimensions(shape.dimensions_size());
     std::iota(dimensions.begin(), dimensions.end(), 0);
     return Reshape(operand, dimensions, new_sizes);
@@ -771,7 +797,7 @@ XlaOp XlaBuilder::Collapse(const XlaOp& operand,
 void XlaBuilder::Trace(const string& tag, const XlaOp& operand) {
   ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeNil();
+    *instr.mutable_shape() = ShapeUtil::MakeNil().ToProto();
     *instr.mutable_literal() = LiteralUtil::CreateR1U8(tag).ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kTrace, {operand});
   });
@@ -797,9 +823,10 @@ XlaOp XlaBuilder::Tuple(absl::Span<const XlaOp> elements) {
     TF_ASSIGN_OR_RETURN(const auto& operand_shapes, GetOperandShapes(elements));
     absl::c_transform(operand_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(const Shape shape,
                         ShapeInference::InferVariadicOpShape(
                             HloOpcode::kTuple, operand_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kTuple, elements);
   });
 }
@@ -814,7 +841,7 @@ XlaOp XlaBuilder::GetTupleElement(const XlaOp& tuple_data, int64 index) {
           ShapeUtil::HumanString(tuple_shape));
     }
     *instr.mutable_shape() =
-        ShapeUtil::GetTupleElementShape(tuple_shape, index);
+        ShapeUtil::GetTupleElementShape(tuple_shape, index).ToProto();
 
     instr.set_tuple_index(index);
 
@@ -873,9 +900,10 @@ XlaOp XlaBuilder::DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& lhs_shape, GetShape(lhs));
     TF_ASSIGN_OR_RETURN(const Shape& rhs_shape, GetShape(rhs));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferDotOpShape(lhs_shape, rhs_shape,
                                                         dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
     *instr.mutable_dot_dimension_numbers() = dimension_numbers;
     if (precision_config != nullptr) {
       *instr.mutable_precision_config() = *precision_config;
@@ -1017,10 +1045,11 @@ XlaOp XlaBuilder::ConvGeneralDilated(
                         MakeWindow(window_dimensions, window_strides, padding,
                                    lhs_dilation, rhs_dilation));
 
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferConvolveShape(
                             lhs_shape, rhs_shape, feature_group_count,
                             instr.window(), dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_convolution_dimension_numbers() = dimension_numbers;
     instr.set_feature_group_count(feature_group_count);
@@ -1093,10 +1122,9 @@ XlaOp XlaBuilder::Fft(const XlaOp& operand, const FftType fft_type,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferFftShape(operand_shape, fft_type, fft_length));
-
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferFftShape(
+                                         operand_shape, fft_type, fft_length));
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_fft_type(fft_type);
     for (int64 i : fft_length) {
       instr.add_fft_length(i);
@@ -1114,7 +1142,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     }
     const Shape infeed_instruction_shape =
         ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
-    *instr.mutable_shape() = infeed_instruction_shape;
+    *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
     if (ShapeUtil::IsArray(shape) && sharding() &&
@@ -1135,7 +1163,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     XlaOp token;
     auto make_token = [&]() {
       HloInstructionProto token_instr;
-      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+      *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
       return AddInstruction(std::move(token_instr), HloOpcode::kAfterAll, {});
     };
     if (sharding()) {
@@ -1174,7 +1202,7 @@ XlaOp XlaBuilder::Infeed(const Shape& shape, const string& config) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto infeed_data;
-    *infeed_data.mutable_shape() = shape;
+    *infeed_data.mutable_shape() = shape.ToProto();
     infeed_data.set_tuple_index(0);
     return AddInstruction(std::move(infeed_data), HloOpcode::kGetTupleElement,
                           {infeed});
@@ -1190,7 +1218,7 @@ XlaOp XlaBuilder::InfeedWithToken(const XlaOp& token, const Shape& shape,
     }
     const Shape infeed_instruction_shape =
         ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
-    *instr.mutable_shape() = infeed_instruction_shape;
+    *instr.mutable_shape() = infeed_instruction_shape.ToProto();
     instr.set_infeed_config(config);
 
     if (ShapeUtil::IsArray(shape) && sharding() &&
@@ -1215,7 +1243,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
   ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
 
     // Check and set outfeed shape.
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
@@ -1228,14 +1256,14 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
           ShapeUtil::HumanStringWithLayout(shape_with_layout),
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
-    *instr.mutable_outfeed_shape() = shape_with_layout;
+    *instr.mutable_outfeed_shape() = shape_with_layout.ToProto();
 
     instr.set_outfeed_config(outfeed_config);
 
     // Outfeed takes a token as its second operand. Generate the token to pass
     // to the outfeed.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -1249,7 +1277,7 @@ void XlaBuilder::Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto tuple_instr;
-    *tuple_instr.mutable_shape() = ShapeUtil::MakeNil();
+    *tuple_instr.mutable_shape() = ShapeUtil::MakeNil().ToProto();
 
     // The dummy tuple should have no sharding.
     {
@@ -1268,7 +1296,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
 
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
 
     // Check and set outfeed shape.
     if (!LayoutUtil::HasLayout(shape_with_layout)) {
@@ -1281,7 +1309,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
           ShapeUtil::HumanStringWithLayout(shape_with_layout),
           ShapeUtil::HumanStringWithLayout(operand_shape));
     }
-    *instr.mutable_outfeed_shape() = shape_with_layout;
+    *instr.mutable_outfeed_shape() = shape_with_layout.ToProto();
 
     instr.set_outfeed_config(outfeed_config);
 
@@ -1293,7 +1321,7 @@ XlaOp XlaBuilder::OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
 XlaOp XlaBuilder::CreateToken() {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kAfterAll);
   });
 }
@@ -1303,8 +1331,17 @@ XlaOp XlaBuilder::AfterAll(absl::Span<const XlaOp> tokens) {
     if (tokens.empty()) {
       return InvalidArgument("AfterAll requires at least one operand");
     }
+    for (int i = 0; i < tokens.size(); ++i) {
+      const XlaOp& operand = tokens[i];
+      TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
+      if (!ShapeUtil::IsToken(operand_shape)) {
+        return InvalidArgument(
+            "All operands to AfterAll must be tokens; operand %d has shape %s",
+            i, ShapeUtil::HumanString(operand_shape));
+      }
+    }
     HloInstructionProto instr;
-    *instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kAfterAll, tokens);
   });
 }
@@ -1321,7 +1358,7 @@ XlaOp XlaBuilder::CustomCall(
           "are reserved for internal use.",
           call_target_name);
     }
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_custom_call_target(call_target_name);
     instr.set_custom_call_opaque(opaque);
     if (operand_shapes_with_layout.has_value()) {
@@ -1345,7 +1382,7 @@ XlaOp XlaBuilder::CustomCall(
               "constrained layout.",
               operand_num);
         }
-        *instr.add_operand_shapes_with_layout() = operand_shape;
+        *instr.add_operand_shapes_with_layout() = operand_shape.ToProto();
         ++operand_num;
       }
     }
@@ -1499,9 +1536,9 @@ XlaOp XlaBuilder::Transpose(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferTransposeShape(operand_shape, permutation));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferTransposeShape(
+                                         operand_shape, permutation));
+    *instr.mutable_shape() = shape.ToProto();
     for (int64 dim : permutation) {
       instr.add_dimensions(dim);
     }
@@ -1514,9 +1551,9 @@ XlaOp XlaBuilder::Rev(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferReverseShape(operand_shape, dimensions));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReverseShape(
+                                         operand_shape, dimensions));
+    *instr.mutable_shape() = shape.ToProto();
     for (int64 dim : dimensions) {
       instr.add_dimensions(dim);
     }
@@ -1535,9 +1572,9 @@ XlaOp XlaBuilder::Sort(const XlaOp& keys, absl::Span<const XlaOp> values,
                         GetOperandShapes(values));
     absl::c_transform(values_shapes, std::back_inserter(operand_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferVariadicOpShape(
-                            HloOpcode::kSort, operand_shape_ptrs));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferVariadicOpShape(
+                                         HloOpcode::kSort, operand_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     if (dimension == -1) {
       TF_ASSIGN_OR_RETURN(const Shape& keys_shape, GetShape(keys));
       dimension = ShapeUtil::Rank(keys_shape) - 1;
@@ -1559,9 +1596,9 @@ XlaOp XlaBuilder::ConvertElementType(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape(
+                                         operand_shape, new_element_type));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kConvert, {operand});
   });
 }
@@ -1571,9 +1608,9 @@ XlaOp XlaBuilder::BitcastConvertType(const XlaOp& operand,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferConvertShape(operand_shape, new_element_type));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferConvertShape(
+                                         operand_shape, new_element_type));
+    *instr.mutable_shape() = shape.ToProto();
     return AddInstruction(std::move(instr), HloOpcode::kBitcastConvert,
                           {operand});
   });
@@ -1605,11 +1642,11 @@ XlaOp XlaBuilder::Map(absl::Span<const XlaOp> operands,
     TF_ASSIGN_OR_RETURN(const ProgramShape& called_program_shape,
                         computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferMapShape(operand_shape_ptrs, called_program_shape,
-                                      dimensions));
+        Shape shape, ShapeInference::InferMapShape(
+                         operand_shape_ptrs, called_program_shape, dimensions));
+    *instr.mutable_shape() = shape.ToProto();
 
-    const Shape& output_shape = instr.shape();
+    Shape output_shape(instr.shape());
     const int64 output_rank = ShapeUtil::Rank(output_shape);
     AddCalledComputation(computation, &instr);
     std::vector<XlaOp> new_operands(operands.begin(), operands.end());
@@ -1652,7 +1689,7 @@ XlaOp XlaBuilder::RngOp(RandomDistribution distribution,
     }
 
     TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
-    *instr.mutable_shape() = shape;
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_distribution(distribution);
 
@@ -1680,10 +1717,10 @@ XlaOp XlaBuilder::While(const XlaComputation& condition,
     TF_ASSIGN_OR_RETURN(const auto& condition_program_shape,
                         condition.GetProgramShape());
     TF_ASSIGN_OR_RETURN(const Shape& init_shape, GetShape(init));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferWhileShape(condition_program_shape,
-                                        body_program_shape, init_shape));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferWhileShape(
+                                         condition_program_shape,
+                                         body_program_shape, init_shape));
+    *instr.mutable_shape() = shape.ToProto();
     // Body comes before condition computation in the vector.
     AddCalledComputation(body, &instr);
     AddCalledComputation(condition, &instr);
@@ -1700,10 +1737,10 @@ XlaOp XlaBuilder::Gather(const XlaOp& input, const XlaOp& start_indices,
     TF_ASSIGN_OR_RETURN(const Shape& input_shape, GetShape(input));
     TF_ASSIGN_OR_RETURN(const Shape& start_indices_shape,
                         GetShape(start_indices));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferGatherShape(input_shape, start_indices_shape,
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGatherShape(
+                                         input_shape, start_indices_shape,
                                          dimension_numbers, slice_sizes));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_gather_dimension_numbers() = dimension_numbers;
     for (int64 bound : slice_sizes) {
@@ -1728,10 +1765,11 @@ XlaOp XlaBuilder::Scatter(const XlaOp& input, const XlaOp& scatter_indices,
     TF_ASSIGN_OR_RETURN(const Shape& updates_shape, GetShape(updates));
     TF_ASSIGN_OR_RETURN(const ProgramShape& to_apply_shape,
                         update_computation.GetProgramShape());
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferScatterShape(
                             input_shape, scatter_indices_shape, updates_shape,
                             to_apply_shape, dimension_numbers));
+    *instr.mutable_shape() = shape.ToProto();
 
     *instr.mutable_scatter_dimension_numbers() = dimension_numbers;
 
@@ -1758,10 +1796,11 @@ XlaOp XlaBuilder::Conditional(const XlaOp& predicate, const XlaOp& true_operand,
     TF_ASSIGN_OR_RETURN(const ProgramShape& false_computation_shape,
                         false_computation.GetProgramShape());
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferConditionalShape(
             predicate_shape, true_operand_shape, false_operand_shape,
             true_computation_shape, false_computation_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     // The index of true_computation must be 0 and that of false computation
     // must be 1.
@@ -1803,9 +1842,10 @@ XlaOp XlaBuilder::Reduce(absl::Span<const XlaOp> operands,
                       [](const Shape& shape) { return &shape; });
 
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferReduceShape(
             operand_shape_ptrs, dimensions_to_reduce, called_program_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (int64 dim : dimensions_to_reduce) {
       instr.add_dimensions(dim);
@@ -1868,10 +1908,10 @@ XlaOp XlaBuilder::ReduceWindowWithGeneralPadding(
                         MakeWindow(window_dimensions, window_strides, padding,
                                    /*lhs_dilation=*/base_dilations,
                                    /*rhs_dilation=*/window_dilations));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferReduceWindowShape(operand_shape, init_shape,
-                                               instr.window(), to_apply_shape));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferReduceWindowShape(
+                                         operand_shape, init_shape,
+                                         instr.window(), to_apply_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(computation, &instr);
     return AddInstruction(std::move(instr), HloOpcode::kReduceWindow,
@@ -1889,9 +1929,10 @@ XlaOp XlaBuilder::BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& scale_shape, GetShape(scale));
     TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferBatchNormTrainingShape(
             operand_shape, scale_shape, offset_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1913,10 +1954,11 @@ XlaOp XlaBuilder::BatchNormInference(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& offset_shape, GetShape(offset));
     TF_ASSIGN_OR_RETURN(const Shape& mean_shape, GetShape(mean));
     TF_ASSIGN_OR_RETURN(const Shape& variance_shape, GetShape(variance));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
-                        ShapeInference::InferBatchNormInferenceShape(
-                            operand_shape, scale_shape, offset_shape,
-                            mean_shape, variance_shape, feature_index));
+    TF_ASSIGN_OR_RETURN(
+        Shape shape, ShapeInference::InferBatchNormInferenceShape(
+                         operand_shape, scale_shape, offset_shape, mean_shape,
+                         variance_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1938,10 +1980,11 @@ XlaOp XlaBuilder::BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
     TF_ASSIGN_OR_RETURN(const Shape& batch_mean_shape, GetShape(batch_mean));
     TF_ASSIGN_OR_RETURN(const Shape& batch_var_shape, GetShape(batch_var));
     TF_ASSIGN_OR_RETURN(const Shape& grad_output_shape, GetShape(grad_output));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferBatchNormGradShape(
                             operand_shape, scale_shape, batch_mean_shape,
                             batch_var_shape, grad_output_shape, feature_index));
+    *instr.mutable_shape() = shape.ToProto();
 
     instr.set_epsilon(epsilon);
     instr.set_feature_index(feature_index);
@@ -1972,9 +2015,9 @@ XlaOp XlaBuilder::CrossReplicaSum(
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferCrossReplicaSumShape({&operand_shape}));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferCrossReplicaSumShape(
+                                         {&operand_shape}));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (const ReplicaGroup& group : replica_groups) {
       *instr.add_replica_groups() = group;
@@ -2027,8 +2070,8 @@ XlaOp XlaBuilder::AllToAll(const XlaOp& operand, int64 split_dimension,
     absl::c_transform(slice_shapes, std::back_inserter(slice_shape_ptrs),
                       [](const Shape& shape) { return &shape; });
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
+        Shape shape, ShapeInference::InferAllToAllTupleShape(slice_shape_ptrs));
+    *instr.mutable_shape() = shape.ToProto();
     for (const ReplicaGroup& group : replica_groups) {
       *instr.add_replica_groups() = group;
     }
@@ -2053,8 +2096,9 @@ XlaOp XlaBuilder::CollectivePermute(
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
+        Shape shape,
         ShapeInference::InferCollectivePermuteShape(operand_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     for (const auto& pair : source_target_pairs) {
       auto* proto_pair = instr.add_source_target_pairs();
@@ -2103,10 +2147,11 @@ XlaOp XlaBuilder::SelectAndScatterWithGeneralPadding(
     TF_ASSIGN_OR_RETURN(*instr.mutable_window(),
                         MakeWindow(window_dimensions, window_strides, padding,
                                    /*lhs_dilation=*/{}, /*rhs_dilation=*/{}));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferSelectAndScatterShape(
                             operand_shape, select_shape, instr.window(),
                             source_shape, init_shape, scatter_shape));
+    *instr.mutable_shape() = shape.ToProto();
 
     AddCalledComputation(select, &instr);
     AddCalledComputation(scatter, &instr);
@@ -2121,9 +2166,10 @@ XlaOp XlaBuilder::ReducePrecision(const XlaOp& operand, const int exponent_bits,
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const Shape& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(*instr.mutable_shape(),
+    TF_ASSIGN_OR_RETURN(Shape shape,
                         ShapeInference::InferReducePrecisionShape(
                             operand_shape, exponent_bits, mantissa_bits));
+    *instr.mutable_shape() = shape.ToProto();
     instr.set_exponent_bits(exponent_bits);
     instr.set_mantissa_bits(mantissa_bits);
     return AddInstruction(std::move(instr), HloOpcode::kReducePrecision,
@@ -2138,7 +2184,7 @@ void XlaBuilder::Send(const XlaOp& operand, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -2157,15 +2203,17 @@ XlaOp XlaBuilder::SendWithToken(const XlaOp& operand, const XlaOp& token,
     // token}.
     HloInstructionProto send_instr;
     TF_ASSIGN_OR_RETURN(const Shape& shape, GetShape(operand));
-    *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *send_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     send_instr.set_channel_id(handle.handle());
     TF_ASSIGN_OR_RETURN(XlaOp send,
                         AddInstruction(std::move(send_instr), HloOpcode::kSend,
                                        {operand, token}));
 
     HloInstructionProto send_done_instr;
-    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     send_done_instr.set_channel_id(handle.handle());
     return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
                           {send});
@@ -2179,7 +2227,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto token_instr;
-    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *token_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     TF_ASSIGN_OR_RETURN(XlaOp token, AddInstruction(std::move(token_instr),
                                                     HloOpcode::kAfterAll, {}));
 
@@ -2190,7 +2238,7 @@ XlaOp XlaBuilder::Recv(const Shape& shape, const ChannelHandle& handle) {
     // TODO(b/80000000): Remove this when clients have been updated to handle
     // tokens.
     HloInstructionProto recv_data;
-    *recv_data.mutable_shape() = shape;
+    *recv_data.mutable_shape() = shape.ToProto();
     recv_data.set_tuple_index(0);
     return AddInstruction(std::move(recv_data), HloOpcode::kGetTupleElement,
                           {recv});
@@ -2207,15 +2255,18 @@ XlaOp XlaBuilder::RecvWithToken(const XlaOp& token, const Shape& shape,
     // Recv instruction produces a tuple of {receive buffer, U32 context,
     // token}.
     HloInstructionProto recv_instr;
-    *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *recv_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_instr.set_channel_id(handle.handle());
     TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr),
                                                    HloOpcode::kRecv, {token}));
 
     HloInstructionProto recv_done_instr;
     *recv_done_instr.mutable_shape() =
-        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_done_instr.set_channel_id(handle.handle());
     return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
                           {recv});
@@ -2249,9 +2300,11 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
     // Send instruction produces a tuple of {aliased operand, U32 context,
     // token}.
     HloInstructionProto send_instr;
-    *send_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape_with_layout, ShapeUtil::MakeShape(U32, {}),
-         ShapeUtil::MakeTokenShape()});
+    *send_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape({shape_with_layout,
+                                   ShapeUtil::MakeShape(U32, {}),
+                                   ShapeUtil::MakeTokenShape()})
+            .ToProto();
     send_instr.set_channel_id(handle.handle());
     send_instr.set_is_host_transfer(true);
     TF_ASSIGN_OR_RETURN(XlaOp send,
@@ -2259,7 +2312,7 @@ XlaOp XlaBuilder::SendToHost(const XlaOp& operand, const XlaOp& token,
                                        {operand, token}));
 
     HloInstructionProto send_done_instr;
-    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape();
+    *send_done_instr.mutable_shape() = ShapeUtil::MakeTokenShape().ToProto();
     send_done_instr.set_channel_id(handle.handle());
     send_done_instr.set_is_host_transfer(true);
     return AddInstruction(std::move(send_done_instr), HloOpcode::kSendDone,
@@ -2288,8 +2341,10 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
     // Recv instruction produces a tuple of {receive buffer, U32 context,
     // token}.
     HloInstructionProto recv_instr;
-    *recv_instr.mutable_shape() = ShapeUtil::MakeTupleShape(
-        {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()});
+    *recv_instr.mutable_shape() =
+        ShapeUtil::MakeTupleShape(
+            {shape, ShapeUtil::MakeShape(U32, {}), ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_instr.set_channel_id(handle.handle());
     recv_instr.set_is_host_transfer(true);
     TF_ASSIGN_OR_RETURN(XlaOp recv, AddInstruction(std::move(recv_instr),
@@ -2297,7 +2352,8 @@ XlaOp XlaBuilder::RecvFromHost(const XlaOp& token, const Shape& shape,
 
     HloInstructionProto recv_done_instr;
     *recv_done_instr.mutable_shape() =
-        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()});
+        ShapeUtil::MakeTupleShape({shape, ShapeUtil::MakeTokenShape()})
+            .ToProto();
     recv_done_instr.set_channel_id(handle.handle());
     recv_done_instr.set_is_host_transfer(true);
     return AddInstruction(std::move(recv_done_instr), HloOpcode::kRecvDone,
@@ -2309,9 +2365,9 @@ XlaOp XlaBuilder::GetDimensionSize(const XlaOp& operand, int64 dimension) {
   return ReportErrorOrReturn([&]() -> StatusOr<XlaOp> {
     HloInstructionProto instr;
     TF_ASSIGN_OR_RETURN(const auto& operand_shape, GetShape(operand));
-    TF_ASSIGN_OR_RETURN(
-        *instr.mutable_shape(),
-        ShapeInference::InferGetDimensionSizeShape(operand_shape, dimension));
+    TF_ASSIGN_OR_RETURN(Shape shape, ShapeInference::InferGetDimensionSizeShape(
+                                         operand_shape, dimension));
+    *instr.mutable_shape() = shape.ToProto();
     instr.add_dimensions(dimension);
     return AddInstruction(std::move(instr), HloOpcode::kGetDimensionSize,
                           {operand});
@@ -2356,7 +2412,7 @@ StatusOr<XlaComputation> XlaBuilder::BuildConstantSubGraph(
   SetProtoIdAndName(&entry, StrCat(name_, "_compute_constant"), kNameSeparator,
                     GetNextId());
   entry.set_root_id(root->id());
-  ProgramShape* program_shape = entry.mutable_program_shape();
+  ProgramShapeProto* program_shape = entry.mutable_program_shape();
   *program_shape->mutable_result() = root->shape();
 
   // We use std::set to keep the instruction ids in ascending order (which is
@@ -2617,9 +2673,10 @@ XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes) {
   return operand.builder()->Broadcast(operand, broadcast_sizes);
 }
 
-XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+XlaOp BroadcastInDim(const XlaOp& operand,
+                     const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions) {
-  return operand.builder()->BroadcastInDim(operand, shape,
+  return operand.builder()->BroadcastInDim(operand, out_dim_size,
                                            broadcast_dimensions);
 }
 
diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h
index 68314a026eab0db3eaf321f0fa53c016d79882ba..098efb60f9bdca8306ff771a505f4a225dea9f7d 100644
--- a/tensorflow/compiler/xla/client/xla_builder.h
+++ b/tensorflow/compiler/xla/client/xla_builder.h
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/client/xla_computation.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/shape_util.h"
@@ -263,35 +264,30 @@ class XlaBuilder {
   // evaluating the computation.
   StatusOr<bool> IsConstant(const XlaOp& operand) const;
 
+  // Sets up binding which indicates that the `target_dim_num` in the subshape
+  // `target_param_index` of parameter `target_param_num` is a dynamic dimension
+  // and its real dynamic size is represented by `dynamic_param_index` in
+  // parameter `dynamic_param_num`.
+  //
+  // TODO(b/119520625): Remove this API once we have more dynamic shape infra
+  // ready.
+  Status SetDynamicBinding(int64 dynamic_size_param_num,
+                           ShapeIndex dynamic_size_param_index,
+                           int64 target_param_num,
+                           ShapeIndex target_param_index, int64 target_dim_num);
+
  private:
   // Build helper which takes the id of the root operation..
   StatusOr<XlaComputation> Build(int64 root_id);
 
-  // Enqueues a "retrieve parameter value" instruction for a parameter that was
-  // passed to the computation.
+  // Description for the methods below can be found in the corresponding public
+  // functions section in this file.
+
   XlaOp Parameter(int64 parameter_number, const Shape& shape,
                   const string& name);
 
-  // Enqueues a constant with the value of the given literal onto the
-  // computation.
   XlaOp ConstantLiteral(const LiteralSlice& literal);
 
-  // Enqueues a constant onto the computation. Methods are templated on the
-  // native host type (NativeT) which corresponds to a specific XLA
-  // PrimitiveType as given in the following table:
-  //
-  //  Native Type   PrimitiveType
-  // -----------------------------
-  //   bool           PRED
-  //   int32          S32
-  //   int64          S64
-  //   uint32         U32
-  //   uint64         U64
-  //   float          F32
-  //   double         F64
-  //
-  // Note: not all primitive types defined in xla_data.proto have a
-  // corresponding native type yet.
   template <typename NativeT>
   XlaOp ConstantR0(NativeT value);
   template <typename NativeT>
@@ -321,181 +317,79 @@ class XlaBuilder {
   template <typename NativeT>
   XlaOp ConstantR4FromArray4D(const Array4D<NativeT>& values);
 
-  // Enqueues a rank one constant (vector) onto the computation. The vector has
-  // size 'length' and every element has the value 'value'.
   template <typename NativeT>
   XlaOp ConstantR1(int64 length, NativeT value);
 
-  // Adds dimensions to an array by duplicating the data in the array.
-  //
-  // The new dimensions are inserted on the left, i.e. if
-  // broadcast_sizes has values {a0, ..., aN} and the operand shape
-  // has dimensions {b0, ..., bM} then the shape of the output has
-  // dimensions {a0, ..., aN, b0, ..., bM}.
-  //
-  // The new dimensions index into copies of the operand, i.e.
-  //
-  //   output[i0, ..., iN, j0, ..., jM] = operand[j0, ..., jM]
   XlaOp Broadcast(const XlaOp& operand,
                   absl::Span<const int64> broadcast_sizes);
 
-  XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+  XlaOp BroadcastInDim(const XlaOp& operand,
+                       const absl::Span<const int64> out_dim_size,
                        const absl::Span<const int64> broadcast_dimensions);
 
-  // Enqueues a pad operation onto the computation that pads the given value on
-  // the edges as well as between the elements of the input. padding_config
-  // specifies the padding amount for each dimension.
   XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
             const PaddingConfig& padding_config);
 
-  // Enqueues an operation onto the computation that flattens the operand based
-  // on the dimension order (major/slowest-varying to minor/fastest-varying)
-  // given, followed by reshaping it into the shape with the given dimension
-  // sizes (also major to minor). Conceptually, this is a limited form of
-  // "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> dimensions,
                 absl::Span<const int64> new_sizes);
 
-  // Enqueues an operation onto the computation that collapses the operand, from
-  // first to last dimension (C order), then reshapes it to the given dimension
-  // sizes. Conceptually, this is a limited form of "shape casting".
   XlaOp Reshape(const XlaOp& operand, absl::Span<const int64> new_sizes);
 
-  // Wrapper for Reshape.
-  // Enqueues an operation to collapse the provided dimensions; e.g. an
-  // operand with dimensions {x=256, y=2, z=2, p=32} can be collapsed to
-  // {x=1024, y=32} by collapsing dims {0, 1, 2}. Collapsing dimensions must
-  // be a consecutive, in-order subsequence of the operand dimensions.
-  //
-  // Note that collapsing a single dimension does nothing:
-  //
-  //    {256} collapsing {0} => {256}
-  //    {1} collapsing {0} => {1}
-  //
-  // Collapsing multiple dimensions produces a single result dimension:
-  //
-  //    {256, 2} collapsing {0,1} => {512}
-  //    {256, 2, 3} collapsing {0,1} => {512, 3}
-  //
-  // This could potentially cause data to be moved -- it provides a more
-  // structured form of reshaping than an arbitrary Reshape operation.
   XlaOp Collapse(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a slice operation onto the computation that slices the operand
-  // from the start indices to the limit indices; e.g.
-  //
-  //        x
-  //   [ 0 1 2 3 ]
-  // y [ 4 5 6 7 ] => slice(start={1, 1}, limit={2, 3}) => [ 5 6 ]
-  //   [ 8 9 a b ]
-  //
-  // Note that "limit" means up-to-but-not-including; i.e. [start, limit) in 1D
-  // range notation.
-  // The strides parameter determines the stride over the slice
   XlaOp Slice(const XlaOp& operand, absl::Span<const int64> start_indices,
               absl::Span<const int64> limit_indices,
               absl::Span<const int64> strides);
 
-  // Enqueues a slice operation in a given dimension, taking all other
-  // dimensions as they are; e.g. if dimno is 1 from start_index 2 to
-  // limit_index 4 by 1, and the shape is f32[7,8,9], this call is short-hand
-  // for:
-  //
-  //  array[:, 2:4:1, :]
   XlaOp SliceInDim(const XlaOp& operand, int64 start_index, int64 limit_index,
                    int64 stride, int64 dimno);
 
-  // Enqueues a slice operation onto the computation that slices the 'operand'
-  // from dynamic start indices which are passed in 'start_indices'.
-  // The size of the slice in each dimension is passed in 'slice_sizes',
-  // which specify the end point of exclusive slice intervals in each
-  // dimension [start, start + size).
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo input dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicSlice(const XlaOp& operand, const XlaOp& start_indices,
                      absl::Span<const int64> slice_sizes);
 
-  // Enqueues a dynamic update slice operation onto the computation, which
-  // updates a slice of 'operand' with 'update' at dynamic 'start_indices'.
-  // The shape of 'update' determines the shape of the slice of 'operand'
-  // which is updated.
-  // The indices specified in 'start_indices' specify the offset of the slice
-  // of 'operand' which is updated.
-  //
-  //               update = {10, 11} // calculated at runtime.
-  //   [1 2 3]     start  = {1, 1}   // calculated at runtime.  [1 2  3 ]
-  //   [4 5 6]  => DynamicUpdateslice(data, update, start)   => [4 10 11]
-  //   [7 8 9]                                                  [7 8  9 ]
-  //
-  // The shape of 'start_indices' must be rank == 1, with dimension size
-  // equal to the rank of the 'operand'.
-  // Slice index calculations are computed modulo update dimension sizes to
-  // prevent dynamic start indices from generating out-of-bound array accesses.
   XlaOp DynamicUpdateSlice(const XlaOp& operand, const XlaOp& update,
                            const XlaOp& start_indices);
 
-  // Enqueues a concatenate instruction onto the computation. 'operands' must
-  // have >= 1 entry.
   XlaOp ConcatInDim(absl::Span<const XlaOp> operands, int64 dimension);
 
-  // Enqueue a tracing operation onto the computation; the computation will emit
-  // a logging message with the operand.
   void Trace(const string& tag, const XlaOp& operand);
 
-  // Enqueues a conditional-move-like select operation onto the computation;
-  // predicated on pred, selects between on_true and on_false.
   XlaOp Select(const XlaOp& pred, const XlaOp& on_true, const XlaOp& on_false);
 
-  // Enqueues a tuple-creation instruction onto the computation.
   XlaOp Tuple(absl::Span<const XlaOp> elements);
 
-  // Enqueues a tuple-element-get instruction onto the computation.
   XlaOp GetTupleElement(const XlaOp& tuple_data, int64 index);
 
-  // Enqueues an equal-to comparison instruction onto the computation.
   XlaOp Eq(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a not-equal comparison instruction onto the computation.
   XlaOp Ne(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-or-equal comparison instruction onto the computation.
   XlaOp Ge(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a greater-than comparison instruction onto the computation.
   XlaOp Gt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-than comparison instruction onto the computation.
   XlaOp Lt(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a less-or-equal comparison instruction onto the computation.
   XlaOp Le(const XlaOp& lhs, const XlaOp& rhs,
            absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a dot instruction onto the computation.
   XlaOp Dot(const XlaOp& lhs, const XlaOp& rhs,
             const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a general dot instruction onto the computation.
   XlaOp DotGeneral(const XlaOp& lhs, const XlaOp& rhs,
                    const DotDimensionNumbers& dimension_numbers,
                    const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, which uses the
-  // default convolution dimension numbers.
   XlaOp Conv(const XlaOp& lhs, const XlaOp& rhs,
              absl::Span<const int64> window_strides, Padding padding,
              int64 feature_group_count = 1,
              const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration in the format returned by MakePadding().
   XlaOp ConvWithGeneralPadding(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides,
@@ -503,8 +397,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided dimension numbers configuration.
   XlaOp ConvWithGeneralDimensions(
       const XlaOp& lhs, const XlaOp& rhs,
       absl::Span<const int64> window_strides, Padding padding,
@@ -512,8 +404,6 @@ class XlaBuilder {
       int64 feature_group_count = 1,
       const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration as well as the dimension numbers.
   XlaOp ConvGeneral(const XlaOp& lhs, const XlaOp& rhs,
                     absl::Span<const int64> window_strides,
                     absl::Span<const std::pair<int64, int64>> padding,
@@ -521,8 +411,6 @@ class XlaBuilder {
                     int64 feature_group_count = 1,
                     const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues a convolution instruction onto the computation, with the caller
-  // provided padding configuration, dilation factors and dimension numbers.
   XlaOp ConvGeneralDilated(const XlaOp& lhs, const XlaOp& rhs,
                            absl::Span<const int64> window_strides,
                            absl::Span<const std::pair<int64, int64>> padding,
@@ -532,80 +420,53 @@ class XlaBuilder {
                            int64 feature_group_count = 1,
                            const PrecisionConfig* precision_config = nullptr);
 
-  // Enqueues an FFT instruction onto the computation, of the given type and
-  // with the given FFT length.
   XlaOp Fft(const XlaOp& operand, FftType fft_type,
             absl::Span<const int64> fft_length);
 
-  // Enqueues an infeed instruction onto the computation, which writes data of
-  // the given shape to the infeed buffer of the device.
   XlaOp Infeed(const Shape& shape, const string& config = "");
   XlaOp InfeedWithToken(const XlaOp& token, const Shape& shape,
                         const string& config = "");
 
-  // Enqueues an outfeed instruction onto the computation. This instruction
-  // generates outgoing data transfers for the given data.
-  //
-  // shape_with_layout communicates the laid out shape that we want to outfeed
-  // -- if !ShapeUtil::Compatible(GetShape(operand), shape_with_layout) an error
-  // will occur.
   void Outfeed(const XlaOp& operand, const Shape& shape_with_layout,
                const string& outfeed_config);
   XlaOp OutfeedWithToken(const XlaOp& operand, const XlaOp& token,
                          const Shape& shape_with_layout,
                          const string& outfeed_config);
 
-  // Enqueues a call instruction onto the computation.
   XlaOp Call(const XlaComputation& computation,
              absl::Span<const XlaOp> operands);
 
-  // Enqueues a custom call instruction onto the computation.
   XlaOp CustomCall(
       const string& call_target_name, absl::Span<const XlaOp> operands,
       const Shape& shape_with_layout, const string& opaque,
       absl::optional<absl::Span<const Shape>> operand_shapes_with_layout);
 
-  // The following methods enqueue element-wise binary arithmetic operations
-  // onto the computation. The shapes of the operands have to match unless one
-  // of the operands is a scalar, or an explicit broadcast dimension is given
-  // (see g3doc for more details).
-
-  // Enqueues a complex compose instruction onto the computation.
   XlaOp Complex(const XlaOp& real, const XlaOp& imag,
                 absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a complex conjugate instruction onto the computation.
   XlaOp Conj(const XlaOp& operand);
 
-  // Enqueues an add instruction onto the computation.
   XlaOp Add(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a subtract instruction onto the computation.
   XlaOp Sub(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a multiply instruction onto the computation.
   XlaOp Mul(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a divide instruction onto the computation.
   XlaOp Div(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a remainder instruction onto the computation.
   XlaOp Rem(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a max instruction onto the computation.
   XlaOp Max(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues a min instruction onto the computation.
   XlaOp Min(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Element-wise logical operators
   XlaOp And(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
@@ -624,32 +485,23 @@ class XlaBuilder {
   XlaOp ShiftRightLogical(const XlaOp& lhs, const XlaOp& rhs,
                           absl::Span<const int64> broadcast_dimensions = {});
 
-  // Reduces an array among the provided dimensions, given "computation" as a
-  // reduction operator.
   XlaOp Reduce(const XlaOp& operand, const XlaOp& init_value,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Reduces several arrays simultaneously among the provided dimensions, given
-  // "computation" as a reduction operator.
   XlaOp Reduce(absl::Span<const XlaOp> operands,
                absl::Span<const XlaOp> init_values,
                const XlaComputation& computation,
                absl::Span<const int64> dimensions_to_reduce);
 
-  // Convenience wrapper around the above that reduces all the dimensions in the
-  // operand shape.
   XlaOp ReduceAll(const XlaOp& operand, const XlaOp& init_value,
                   const XlaComputation& computation);
 
-  // Enqueues a windowed reduce instruction onto the computation.
   XlaOp ReduceWindow(const XlaOp& operand, const XlaOp& init_value,
                      const XlaComputation& computation,
                      absl::Span<const int64> window_dimensions,
                      absl::Span<const int64> window_strides, Padding padding);
 
-  // As ReduceWindow(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp ReduceWindowWithGeneralPadding(
       const XlaOp& operand, const XlaOp& init_value,
       const XlaComputation& computation,
@@ -659,48 +511,22 @@ class XlaBuilder {
       absl::Span<const int64> window_dilations,
       absl::Span<const std::pair<int64, int64>> padding);
 
-  // Returns the sum of the operand value within each subgroup of replicas. All
-  // replicas supply one input to the sum and all replicas receive the resulting
-  // sum for each subgroup.
   XlaOp CrossReplicaSum(const XlaOp& operand,
                         absl::Span<const ReplicaGroup> replica_groups = {});
 
-  // Enqueues an operation that do an AllReduce of the operand cross cores. Here
-  // AllReduce means doing a reduction on the input operand cross cores and then
-  // broadcasting the reduction result to those cores. The reduction function is
-  // defined by `computation`, which should be a commutative computation on
-  // scalars, e.g., add, min, or max. The way that AllReduce is applied is
-  // configured by:
-  //
-  // - `replica_groups`: each ReplicaGroup contains a list of replica id. If
-  // empty, all replicas belong to one group. Allreduce will be applied within
-  // subgroups. For example, we have 4 replicas, then
-  // replica_groups={{0,2},{1,3}} means, replica 0 and 2 are in subgroup 0,
-  // replica 1 and 3 are in subgroup 1.
-  //
-  // - `channel_id`: for Allreduce nodes from different modules, if they have
-  // the same channel_id, they will be 'Allreduce'd. If empty, Allreduce will
-  // not be applied cross modules.
-  //
-  // TODO(b/117564385): Rename this to AllReduce when it's ready to use.
   XlaOp CrossReplicaSum(
       const XlaOp& operand, const XlaComputation& computation,
       absl::Span<const ReplicaGroup> replica_groups = {},
       const absl::optional<ChannelHandle>& channel_id = absl::nullopt);
 
-  // Enqueues an operation that do an Alltoall of the operand cross cores.
   XlaOp AllToAll(const XlaOp& operand, int64 split_dimension,
                  int64 concat_dimension, int64 split_count,
                  const std::vector<ReplicaGroup>& replica_groups);
 
-  // Enqueues an operation that do an CollectivePermute of the operand cross
-  // cores.
   XlaOp CollectivePermute(
       const XlaOp& operand,
       const std::vector<std::pair<int64, int64>>& source_target_pairs);
 
-  // Enqueues an operation that scatters the `source` array to the selected
-  // indices of each window.
   XlaOp SelectAndScatter(const XlaOp& operand, const XlaComputation& select,
                          absl::Span<const int64> window_dimensions,
                          absl::Span<const int64> window_strides,
@@ -708,8 +534,6 @@ class XlaBuilder {
                          const XlaOp& init_value,
                          const XlaComputation& scatter);
 
-  // As SelectAndScatter(), but the padding is given in the format
-  // returned by MakePadding().
   XlaOp SelectAndScatterWithGeneralPadding(
       const XlaOp& operand, const XlaComputation& select,
       absl::Span<const int64> window_dimensions,
@@ -717,217 +541,119 @@ class XlaBuilder {
       absl::Span<const std::pair<int64, int64>> padding, const XlaOp& source,
       const XlaOp& init_value, const XlaComputation& scatter);
 
-  // Enqueues an abs instruction onto the computation.
   XlaOp Abs(const XlaOp& operand);
 
-  // Enqueues a atan2 instruction onto the computation.
   XlaOp Atan2(const XlaOp& y, const XlaOp& x,
               absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an exp instruction onto the computation.
   XlaOp Exp(const XlaOp& operand);
 
-  // Enqueues an expm1 instruction onto the computation.
   XlaOp Expm1(const XlaOp& operand);
 
-  // Enqueues a floor instruction onto the computation.
   XlaOp Floor(const XlaOp& operand);
 
-  // Enqueues a ceil instruction onto the computation.
   XlaOp Ceil(const XlaOp& operand);
 
-  // Enqueues a round instruction onto the computation, rounding to nearest even
-  // with half-way cases rounding away from zero.
   XlaOp Round(const XlaOp& operand);
 
-  // Enqueues an log instruction (natural logarithm) onto the computation.
   XlaOp Log(const XlaOp& operand);
 
-  // Enqueues an log1p instruction (log(x+1)) onto the computation.
   XlaOp Log1p(const XlaOp& operand);
 
-  // Enqueues a sign instruction onto the computation.
   XlaOp Sign(const XlaOp& operand);
 
-  // Enqueues a count leading zeros instruction onto the computation.
   XlaOp Clz(const XlaOp& operand);
 
-  // Enqueues a cosine instruction onto the computation.
   XlaOp Cos(const XlaOp& operand);
 
-  // Enqueues a sine instruction onto the computation.
   XlaOp Sin(const XlaOp& operand);
 
-  // Enqueues a tanh instruction onto the computation.
   XlaOp Tanh(const XlaOp& operand);
 
-  // Enqueues a real-part instruction onto the computation.
   XlaOp Real(const XlaOp& operand);
 
-  // Enqueues an imaginary-part instruction onto the computation.
   XlaOp Imag(const XlaOp& operand);
 
-  // Enqueues a lhs^rhs computation onto the computation.
   XlaOp Pow(const XlaOp& lhs, const XlaOp& rhs,
             absl::Span<const int64> broadcast_dimensions = {});
 
-  // Enqueues an operator that tests if the operand's values are finite, i.e.,
-  // not Inf or NaN. Defined only for floating-point types. Returns an array of
-  // booleans with the same shape where entries are true iff the corresponding
-  // entry was NaN.
   XlaOp IsFinite(const XlaOp& operand);
 
-  // Enqueues an iota operation onto the computation.
   XlaOp Iota(const Shape& shape, int64 iota_dimension);
 
-  // Enqueues a rank-1 iota operation onto the computation.
   XlaOp Iota(PrimitiveType type, int64 size);
 
-  // Enqueues a convert instruction onto the computation that changes the
-  // element type of the operand array to primitive_type.
   XlaOp ConvertElementType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a no-op instruction onto the computation that changes
-  // the element type of the operand array to primitive_type. The
-  // bit-widths of the source and destination element types must be
-  // identical.
   XlaOp BitcastConvertType(const XlaOp& operand,
                            PrimitiveType new_element_type);
 
-  // Enqueues a negate instruction onto the computation.
   XlaOp Neg(const XlaOp& operand);
 
-  // Enqueues a transpose instruction onto the computation.
   XlaOp Transpose(const XlaOp& operand, absl::Span<const int64> permutation);
 
-  // Enqueues a reverse instruction onto the computation. The order of the
-  // elements in the given dimensions is reversed (i.e., the element at index i
-  // is moved to index dimension_size - 1 - i).
   XlaOp Rev(const XlaOp& operand, absl::Span<const int64> dimensions);
 
-  // Enqueues a sort (as increasing order) instruction onto the computation.
-  // If only keys are provided:
-  // * If the keys are an rank-1 tensor (an array), the result is a sorted array
-  // of keys, in ascending order.
-  // * If the keys have higher rank, the keys are sorted along the provided
-  // dimension. For example, for a rank-2 tensor (a matrix) of keys, a dimension
-  // value of 0 will indepenently sort every column, and a dimension value of 1
-  // will independently sort each row. If no dimension number is provided, then
-  // the last dimension is chosen by default.
-  //
-  // If both keys and values are provided:
-  // * The keys and all values must be tensors with the same dimensions. The
-  // element types of the tensors may be different.
-  // * The result is a tuple that consists of a sorted tensor of keys (along the
-  // provided dimension, as above) as the first element, and tensors with their
-  // corresponding values as the other elements.
   XlaOp Sort(const XlaOp& keys, absl::Span<const XlaOp> values = {},
              int64 dimension = -1);
 
-  // Enqueues a clamp instruction onto the computation.
   XlaOp Clamp(const XlaOp& min, const XlaOp& operand, const XlaOp& max);
 
-  // Enqueues a map instruction onto the computation.
   XlaOp Map(absl::Span<const XlaOp> operands, const XlaComputation& computation,
             absl::Span<const int64> dimensions,
             absl::Span<const XlaOp> static_operands = {});
 
-  // Enqueues a N(mu, sigma) random number generation instruction onto the
-  // computation.
   XlaOp RngNormal(const XlaOp& mu, const XlaOp& sigma, const Shape& shape);
 
-  // Enqueues a U(a, b) random number generation instruction onto the
-  // computation. Returns values in the semi-open interval [a, b).
   XlaOp RngUniform(const XlaOp& a, const XlaOp& b, const Shape& shape);
 
-  // Enqueues a while node onto the computation.
   XlaOp While(const XlaComputation& condition, const XlaComputation& body,
               const XlaOp& init);
 
-  // Enqueues a conditional node onto the computation.
   XlaOp Conditional(const XlaOp& predicate, const XlaOp& true_operand,
                     const XlaComputation& true_computation,
                     const XlaOp& false_operand,
                     const XlaComputation& false_computation);
 
-  // Enqueues a ReducePrecision node onto the computation.
   XlaOp ReducePrecision(const XlaOp& operand, const int exponent_bits,
                         const int mantissa_bits);
 
-  // Enqueues a Gather node onto the computation.
   XlaOp Gather(const XlaOp& input, const XlaOp& start_indices,
                const GatherDimensionNumbers& dimension_numbers,
                absl::Span<const int64> slice_sizes);
 
-  // Enqueues a Scatter node onto the computation.
   XlaOp Scatter(const XlaOp& input, const XlaOp& scatter_indices,
                 const XlaOp& updates, const XlaComputation& update_computation,
                 const ScatterDimensionNumbers& dimension_numbers);
 
-  // Enqueues a Send node onto the computation for device-to-device
-  // communication, to send the given operand to a Recv instruction that shares
-  // the same channel handle.
   void Send(const XlaOp& operand, const ChannelHandle& handle);
   XlaOp SendWithToken(const XlaOp& operand, const XlaOp& token,
                       const ChannelHandle& handle);
 
-  // Enqueues a Send node which sends data to the host.
   XlaOp SendToHost(const XlaOp& operand, const XlaOp& token,
                    const Shape& shape_with_layout, const ChannelHandle& handle);
 
-  // Enqueues a Recv node which receives data from the host.
   XlaOp RecvFromHost(const XlaOp& token, const Shape& shape,
                      const ChannelHandle& handle);
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp CreateToken();
 
-  // Enqueues an AfterAll operation with no operands producing a token-shaped
-  // value.
   XlaOp AfterAll(absl::Span<const XlaOp> tokens);
 
-  // Enqueues a Recv node onto the computation. The data comes from a Send
-  // instruction that shares the same channel handle and its shape must
-  // be the same as the given shape.
   XlaOp Recv(const Shape& shape, const ChannelHandle& handle);
   XlaOp RecvWithToken(const XlaOp& token, const Shape& shape,
                       const ChannelHandle& handle);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // Returns a tuple (normalized, batch_mean, batch_var) where `normalized`
-  // is the normalized result and batch_mean and batch_var are the mean and
-  // variance, respectively, across batch for the operand.
   XlaOp BatchNormTraining(const XlaOp& operand, const XlaOp& scale,
                           const XlaOp& offset, float epsilon,
                           int64 feature_index);
 
-  // Normalizes operand across spatial and batch dimensions for each feature.
-  //
-  // `BatchNormInference` is equivalent to calling `BatchNormTraining` without
-  // computing `mean` and `variance` for each batch inside the operation. It
-  // uses the input `mean` and `variance` instead as estimated values. The
-  // purpose of this op is to reduce latency in inference, hence the name
-  // `BatchNormInference`.
-  //
-  // The output has the same shape as `operand`, and contains the normalized
-  // values for each batch.
   XlaOp BatchNormInference(const XlaOp& operand, const XlaOp& scale,
                            const XlaOp& offset, const XlaOp& mean,
                            const XlaOp& variance, float epsilon,
                            int64 feature_index);
 
-  // Calculates the gradients of a batch norm op.
-  //
-  // The inputs `batch_mean` and `batch_var` represent the mean and variance
-  // across the batch.
-  //
-  // Returns a tuple of three elements:
-  //   - grad_operand: Gradient with respect to input `operand`
-  //   - grad_offset: Gradient with respect to input `offset`
-  //   - grad_scale: Gradient with respect to input `scale`
   XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
                       const XlaOp& batch_mean, const XlaOp& batch_var,
                       const XlaOp& grad_output, float epsilon,
@@ -1019,6 +745,9 @@ class XlaBuilder {
   // The instructions of this computation.
   std::vector<HloInstructionProto> instructions_;
 
+  // Dynamic parameter configuration of this computation.
+  DynamicParameterBinding dynamic_parameter_binding_;
+
   // A map from XlaOp::Handle to the index in the instructions_ vector where the
   // instruction is held.
   absl::flat_hash_map<int64, int64> handle_to_index_;
@@ -1096,7 +825,7 @@ class XlaBuilder {
                          absl::Span<const int64> broadcast_sizes);
 
   friend XlaOp BroadcastInDim(
-      const XlaOp& operand, const Shape& shape,
+      const XlaOp& operand, const absl::Span<const int64> out_dim_size,
       const absl::Span<const int64> broadcast_dimensions);
 
   friend XlaOp Pad(const XlaOp& operand, const XlaOp& padding_value,
@@ -1393,6 +1122,7 @@ class XlaScopedShardingAssignment {
 // Free functions for building XlaOps. The intention is that these will
 // become the public API for building XlaOps rather than calling methods on
 // XlaBuilder directly.
+//
 
 // Enqueues a "retrieve parameter value" instruction for a parameter that was
 // passed to the computation.
@@ -1488,7 +1218,8 @@ XlaOp Broadcast(const XlaOp& operand, absl::Span<const int64> broadcast_sizes);
 //   will generate output
 //   {{1 , 1},
 //    {2 , 2}}
-XlaOp BroadcastInDim(const XlaOp& operand, const Shape& shape,
+XlaOp BroadcastInDim(const XlaOp& operand,
+                     const absl::Span<const int64> out_dim_size,
                      const absl::Span<const int64> broadcast_dimensions);
 
 // Enqueues a pad operation onto the computation that pads the given value on
@@ -2138,6 +1869,7 @@ XlaOp BatchNormGrad(const XlaOp& operand, const XlaOp& scale,
 XlaOp GetDimensionSize(const XlaOp& operand, int64 dimension);
 
 // Implementation details below this point.
+//
 
 template <typename NativeT>
 XlaOp XlaBuilder::ConstantR0(NativeT value) {
diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc
index 8aa85c3cd63c9b0aeb55d2cebbb989b6432ac959..b3f5be300d3f15397ad33858a6a9cab5f6029688 100644
--- a/tensorflow/compiler/xla/client/xla_builder_test.cc
+++ b/tensorflow/compiler/xla/client/xla_builder_test.cc
@@ -267,7 +267,7 @@ TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) {
 TEST_F(XlaBuilderTest, BroadcastInDim) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 3}), "x");
-  BroadcastInDim(x, ShapeUtil::MakeShape(F32, {2, 4, 3}),
+  BroadcastInDim(x, {2, 4, 3},
                  /*broadcast_dimensions=*/{0, 2});
   TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
   auto root = module->entry_computation()->root_instruction();
@@ -277,7 +277,7 @@ TEST_F(XlaBuilderTest, BroadcastInDim) {
 TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) {
   XlaBuilder b(TestName());
   auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {2, 1, 4}), "x");
-  BroadcastInDim(x, ShapeUtil::MakeShape(F32, {2, 3, 4}),
+  BroadcastInDim(x, {2, 3, 4},
                  /*broadcast_dimensions=*/{0, 1, 2});
   TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b));
   EXPECT_THAT(module->entry_computation()->root_instruction(),
@@ -446,5 +446,14 @@ TEST_F(XlaBuilderTest, ProtoMatches) {
   EXPECT_EQ(c0_string, c1_string);
 }
 
+TEST_F(XlaBuilderTest, AfterAllWithNonTokenOperands) {
+  XlaBuilder b(TestName());
+  AfterAll(&b, {CreateToken(&b), ConstantR0<float>(&b, 1.0)});
+  Status status = b.Build().status();
+  ASSERT_IS_NOT_OK(status);
+  EXPECT_THAT(status.error_message(),
+              ::testing::HasSubstr("All operands to AfterAll must be tokens"));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/client/xla_computation.cc b/tensorflow/compiler/xla/client/xla_computation.cc
index c9870b65b91c1ebd7d44143faf215a2d5c2a2fc5..f317892c12529b2ee8a81788f6bbcae3b3d6489d 100644
--- a/tensorflow/compiler/xla/client/xla_computation.cc
+++ b/tensorflow/compiler/xla/client/xla_computation.cc
@@ -25,7 +25,7 @@ namespace xla {
 
 StatusOr<ProgramShape> XlaComputation::GetProgramShape() const {
   TF_RET_CHECK(proto_.has_host_program_shape());
-  return proto_.host_program_shape();
+  return ProgramShape(proto_.host_program_shape());
 }
 
 StatusOr<std::unique_ptr<HloSnapshot>> XlaComputation::Snapshot() const {
diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h
index 71598ef8b296a760b0ee818fce0a59aed5cfc6b4..3ccbfb28bd0c5939ee40878e9cc298688882ac62 100644
--- a/tensorflow/compiler/xla/client/xla_computation.h
+++ b/tensorflow/compiler/xla/client/xla_computation.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include <utility>
 
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc
index 033887d7c11bb530d70f0653f26c61bcbfe1e321..20609cad58d920c0c272899c41efeb99d23cd490 100644
--- a/tensorflow/compiler/xla/debug_options_flags.cc
+++ b/tensorflow/compiler/xla/debug_options_flags.cc
@@ -54,7 +54,7 @@ void SetDebugOptionsDefaults(DebugOptions* flags) {
   // TODO(jlebar): Disable fastmath once doing so is not a performance
   // regression.
   flags->set_xla_cpu_enable_fast_math(true);
-  flags->set_xla_gpu_enable_fast_math(true);
+  flags->set_xla_gpu_enable_fast_min_max(true);
 
   flags->set_xla_force_host_platform_device_count(1);
 }
@@ -160,11 +160,11 @@ void AllocateFlags() {
           "Enable unsafe fast-math optimizations in the CPU compiler; "
           "this may produce faster code at the expense of some accuracy."),
       tensorflow::Flag(
-          "xla_gpu_enable_fast_math",
-          bool_setter_for(&DebugOptions::set_xla_cpu_enable_fast_math),
-          flag_values->xla_cpu_enable_fast_math(),
-          "Enable unsafe fast-math optimizations in the GPU compiler; "
-          "this may produce faster code at the expense of some accuracy."),
+          "xla_gpu_enable_fast_min_max",
+          bool_setter_for(&DebugOptions::set_xla_gpu_enable_fast_min_max),
+          flag_values->xla_gpu_enable_fast_min_max(),
+          "Enable fast floating point min/max lowering that does not propagate "
+          "NaNs."),
       tensorflow::Flag(
           "xla_llvm_enable_alias_scope_metadata",
           bool_setter_for(
@@ -334,8 +334,14 @@ void AllocateFlags() {
           "overhead from context switching but we let the user override this "
           "behavior to help run tests on the host that run models in parallel "
           "across multiple devices."),
+      tensorflow::Flag(
+          "xla_gpu_disable_ptxas_optimizations",
+          bool_setter_for(
+              &DebugOptions::set_xla_gpu_disable_ptxas_optimizations),
+          flag_values->xla_gpu_disable_ptxas_optimizations(),
+          "In XLA:GPU run ptxas in -O0 (default is -O3)."),
   });
-  ParseFlagsFromEnv(*flag_objects);
+  ParseFlagsFromEnvAndDieIfUnknown("XLA_FLAGS", *flag_objects);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
index fb135f5ceda67ce6c001de15b8f3f084ca164826..1fea816a803bfb75b9721393cef8c4dfc249268d 100644
--- a/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
+++ b/tensorflow/compiler/xla/experimental/xla_sharding/xla_sharding.py
@@ -18,12 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math
-
 import numpy as _np  # Avoids becoming a part of public Tensorflow API.
 
 from tensorflow.compiler.xla import xla_data_pb2
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.core.framework import attr_value_pb2
 
 
@@ -64,22 +61,18 @@ class Sharding(object):
             tile_assignment_devices=[core]))
 
   @classmethod
-  def tile(cls, tile_shape, tile_assignment):
+  def tile(cls, tile_assignment):
     """Returns a Tiled sharding attribute.
 
     This causes an op to be partially computed on multiple cores in the
     XLA device.
 
     Args:
-      tile_shape: A xla_shape.Shape describing the tile shape that each core
-        will compute.
-        The tile shape does not need to be divisible by the tile assignment.
       tile_assignment: An np.ndarray describing the topology of the tiling and
         which device will compute which part of the topology.
 
     Raises:
-      TypeError: tile_assignment was not of np.array type or tile_shape was
-         not of xla_shape.Shape type.
+      TypeError: tile_assignment was not of np.array type.
 
     TODO(jmolloy): This concept is nefarious and is not
     something we really want to expose to users (especially as the
@@ -87,14 +80,11 @@ class Sharding(object):
     """
     if not isinstance(tile_assignment, _np.ndarray):
       raise TypeError('Tile assignment must be of type np.ndarray')
-    if not isinstance(tile_shape, xla_shape.Shape):
-      raise TypeError('Tile shape must be of type xla_shape.Shape')
     dims = list(tile_assignment.shape)
     flattened_devices = tile_assignment.reshape(-1, order='C')
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape.message,
             tile_assignment_dimensions=dims,
             tile_assignment_devices=list(flattened_devices)))
 
@@ -118,14 +108,8 @@ class Sharding(object):
     shape = tensor.shape.as_list()
     if shape[split_dimension] < num_devices:
       raise ValueError('Split dimension was smaller than the required number '
-                       'of splits: shape=%r, dimension=%r, num_devices=%r',
-                       shape, split_dimension, num_devices)
-
-    tile_shape = shape
-    tile_shape[split_dimension] = int(
-        math.ceil(tile_shape[split_dimension] / num_devices))
-    tile_shape_proto = xla_data_pb2.Shape(
-        element_type=xla_data_pb2.F32, dimensions=tile_shape)
+                       'of splits: shape=%r, dimension=%r, num_devices=%r' %
+                       (shape, split_dimension, num_devices))
 
     tile_assignment_dims = [1] * len(shape)
     tile_assignment_dims[split_dimension] = num_devices
@@ -133,7 +117,6 @@ class Sharding(object):
     return Sharding(
         proto=xla_data_pb2.OpSharding(
             type=xla_data_pb2.OpSharding.OTHER,
-            tile_shape=tile_shape_proto,
             tile_assignment_dimensions=tile_assignment_dims,
             tile_assignment_devices=range(num_devices)))
 
@@ -149,7 +132,6 @@ class Sharding(object):
           type=xla_data_pb2.OpSharding.TUPLE, tuple_shardings=tuple_shardings)
     else:
       proto = self._proto
-
     attr_value = attr_value_pb2.AttrValue(s=proto.SerializeToString())
     # TODO(jmolloy): This need to be seriously revisited before declaring this
     # API available for public use.
@@ -194,8 +176,8 @@ def assign_device(tensor, device):
   return tensor
 
 
-def tile(tensor, tile_shape, tile_assignment):
-  Sharding.tile(tile_shape, tile_assignment).apply_to_tensor(tensor)
+def tile(tensor, tile_assignment):
+  Sharding.tile(tile_assignment).apply_to_tensor(tensor)
   return tensor
 
 
diff --git a/tensorflow/compiler/xla/g3doc/_book.yaml b/tensorflow/compiler/xla/g3doc/_book.yaml
index bcfbcc3a22f50c748c388d17fbcd7defd27846d0..267701e9c0e42a21d2cda6238520f6a9692e7e76 100644
--- a/tensorflow/compiler/xla/g3doc/_book.yaml
+++ b/tensorflow/compiler/xla/g3doc/_book.yaml
@@ -3,15 +3,15 @@ upper_tabs:
 - include: /_upper_tabs_left.yaml
 - include: /api_docs/_upper_tabs_api.yaml
 # Dropdown menu
-- name: Ecosystem
-  path: /ecosystem
+- name: Resources
+  path: /resources
   is_default: true
   menu:
-  - include: /ecosystem/_menu_toc.yaml
+  - include: /resources/_menu_toc.yaml
   lower_tabs:
     # Subsite tabs
     other:
-    - name: Guide
+    - name: Guide & Tutorials
       contents:
       - title: XLA overview
         path: /xla/overview
@@ -27,3 +27,9 @@ upper_tabs:
         path: /xla/shapes
       - title: Using AOT compilation
         path: /xla/tfcompile
+      - heading: Tutorials
+      - title: XLA compile API
+        path: /xla/tutorials/xla_compile
+        status: experimental
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/compiler/xla/g3doc/_index.yaml b/tensorflow/compiler/xla/g3doc/_index.yaml
index 7934cd11ba22d3f47e172726f54ce51d15eb2cad..858de427119bfcfa82d0b1158776bf269129fd92 100644
--- a/tensorflow/compiler/xla/g3doc/_index.yaml
+++ b/tensorflow/compiler/xla/g3doc/_index.yaml
@@ -17,7 +17,7 @@ landing_page:
   - classname: devsite-landing-row-cards
     items:
     - heading: XLA - TensorFlow, compiled
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
+      image_path: /resources/images/tf-logo-card-16x9.png
       path: https://developers.googleblog.com/2017/03/xla-tensorflow-compiled.html
       buttons:
       - label: Read on Google Developers blog
@@ -28,7 +28,7 @@ landing_page:
       - label: Watch the video
         path: https://www.youtube.com/watch?v=kAOanJczHA0
     - heading: XLA on GitHub
-      image_path: /ecosystem/images/github-card-16x9.png
+      image_path: /resources/images/github-card-16x9.png
       path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/compiler/xla
       buttons:
       - label: View on GitHub
diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png
new file mode 100644
index 0000000000000000000000000000000000000000..00cefe4c7806c1c09dd51499375e720bfb0baac6
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure1.png differ
diff --git a/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png
new file mode 100644
index 0000000000000000000000000000000000000000..6439c6e40272ae6b2954e9d7f3de2df470a2b36d
Binary files /dev/null and b/tensorflow/compiler/xla/g3doc/images/xla_array_layout_figure2.png differ
diff --git a/tensorflow/compiler/xla/g3doc/jit.md b/tensorflow/compiler/xla/g3doc/jit.md
index ded1e582b24c7a45acc6b61ba9c018fa2a1e7db7..85fa16ccc7f48a3dce840564e79097c9e136767f 100644
--- a/tensorflow/compiler/xla/g3doc/jit.md
+++ b/tensorflow/compiler/xla/g3doc/jit.md
@@ -86,7 +86,7 @@ on uncompilable operator, xla.compile() returns an explicit error. This is
 useful if you want more predictable behaviors from XLA compilation.
 
 Please see
-[xla.compile() tutorial Colab](https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb)
+[xla.compile() tutorial Colab](./tutorials/xla_compile.ipynb)
 for how to use it.
 
 ### Placing operators on XLA devices
@@ -144,7 +144,7 @@ Execute the python script to train the model with XLA and turn on a debugging
 feature of XLA via an environmental variable that outputs the XLA graph.
 
 ```shell
-TF_XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
+XLA_FLAGS="--xla_hlo_graph_path=/tmp --xla_generate_hlo_graph=.*" python mnist_softmax_xla.py
 ```
 
 Open the timeline file created (`timeline.ctf.json`).  The rendered timeline
diff --git a/tensorflow/compiler/xla/g3doc/layout_with_tiling.md b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..5e990851af7495ebd4417e44f1d955fcc14dadf1
--- /dev/null
+++ b/tensorflow/compiler/xla/g3doc/layout_with_tiling.md
@@ -0,0 +1,159 @@
+# Tiled layout
+
+*Note: This doc describes how tiled layout is intended to work. Tiling is being
+implemented, but this is an early effort and it is currently not even guaranteed
+to get an Unimplemented error if one tries to use tiling - it may be just
+silently ignored.*
+
+<center> ![](images/xla_array_layout_figure1.png)
+
+Figure 1 </center>
+
+Figure 1 shows how an array F32[3,5] is laid out in memory with 2x2 tiling. A
+shape with this layout is written as F32[3,5]{1,0:(2,2)}, where 1,0 relates to
+the physical order of dimensions (minor_to_major field in Layout) while (2,2)
+after the colon indicates tiling of the physical dimensions by a 2x2 tile.
+
+Intuitively tiles are laid out to cover the shape and then within each tile,
+elements are then laid out without tiling, as in the example above, where the
+right part of the example shows the layout in memory, including the white
+padding elements that are added in order to have complete 2x2 tiles even though
+the original array bounds are not even.
+
+The extra elements in the padding are not required to contain any particular
+value.
+
+## Linear index formulas for tiling given a shape and a tile
+
+Without tiling, an element e=(e<sub>n</sub>, e<sub>n-1</sub>, ... ,
+e<sub>1</sub>) in an array with array bounds d=(d<sub>n</sub>, d<sub>n-1</sub>,
+... , d<sub>1</sub>) (d1 is the most minor dimension) is laid out by major to
+minor order at position:
+
+&nbsp;&nbsp; linear_index(e, d) \
+= linear_index((e<sub>n</sub>, e<sub>n-1</sub>, ... , e<sub>1</sub>),
+(d<sub>n</sub>, d<sub>n-1</sub>, ... , d<sub>1</sub>)) \
+= e<sub>n</sub>d<sub>n-1</sub>...d<sub>1</sub> +
+e<sub>n-1</sub>d<sub>n-2</sub>...d<sub>1</sub> + ... + e<sub>1</sub>
+
+For simplicity of notation in this document we assume a tile has the same number
+of dimensions as the array. In XLA's implementation of tiling, this is
+generalized to tilings with fewer dimensions by leaving the initial most-major
+dimensions unchanged and applying the tiling only to the most minor dimensions,
+so that the tiling that is specified mentions a suffix of the physical
+dimensions of the shape being tiled.
+
+When tiling of size (t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>) is
+used, an element in the array with indices (e<sub>n</sub>, e<sub>n-1</sub>, ...
+, e<sub>1</sub>) is mapped to this position in the final layout:
+
+&nbsp;&nbsp; linear_index_with_tile(e, d, t) \
+= linear_index((⌊e/t⌋, e mod t), (⌈d/t⌉, t)) &nbsp; &nbsp; (arithmetic is
+elementwise, (a,b) is concatenation) \
+= linear_index((⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>1</sub>/t<sub>1</sub>⌋, e<sub>n</sub> mod t<sub>n</sub>, ... ,
+e<sub>1</sub> mod t<sub>1</sub>), (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>n</sub>, t<sub>n-1</sub>, ... ,
+t<sub>1</sub>)) \
+= linear_index((⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>1</sub>/t<sub>1</sub>⌋), (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉))∙t<sub>n</sub>t<sub>n-1</sub>...t<sub>1</sub> +
+linear_index((e<sub>n</sub> mod t<sub>n</sub>, ... , e<sub>1</sub> mod
+t<sub>1</sub>), (t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>))
+
+The layout can be thought of as having two parts:
+(⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... , ⌊e<sub>1</sub>/t<sub>1</sub>⌋), which
+corresponds to a tile index in an array of tiles of size
+(⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... , ⌈d<sub>1</sub>/t<sub>1</sub>⌉), and
+(e<sub>n</sub> mod t<sub>n</sub>, ... , e<sub>1</sub> mod t<sub>1</sub>), which
+corresponds to a within-tile index. The ceil function appears in
+⌈d<sub>i</sub>/t<sub>i</sub>⌉ because if tiles overrun the bounds of the larger
+array, padding is inserted as in Figure 1. Both the tiles and elements within
+tiles are laid out recursively without tiling.
+
+For the example in Figure 1, element (2,3) has tile index (1,1), and within-tile
+index (0,1), for a combined coordinate vector of (1, 1, 0, 1). The tile indices
+have bounds (2, 3) and the tile itself is (2, 2) for a combined vector of (2, 3,
+2, 2). The linear index with tile for the element with index (2, 3) in the
+logical shape is then
+
+&nbsp;&nbsp; linear_index_with_tile((2,3), (3,5), (2,2)) \
+= linear_index((1,1,0,1), (2,3,2,2)) \
+= linear_index((1,1), (2,3)) ∙ 2 ∙ 2 + linear_index((0,1), (2,2)) \
+= (1 ∙ 3 + 1) ∙ 2 ∙ 2 + (0 ∙ 2 + 1) \
+= 17.
+
+# Tiling as pad-reshape-transpose
+
+Tiling-based layout operates as follows: \
+Consider an array of dimensions (d<sub>n</sub>, d<sub>n-1</sub>, ... , d1) (d1
+is the most minor dimension). When it’s laid out with tiling of size
+(t<sub>n</sub>, t<sub>n-1</sub>, ... , t<sub>1</sub>) (t<sub>1</sub> is the most
+minor dimension), that tiling can be described in terms of pad-reshape-transpose
+in the following way.
+
+1.  The array is padded to (⌈d<sub>n</sub>/t<sub>n</sub>⌉∙t<sub>n</sub>, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉∙t<sub>1</sub>).
+2.  Each dimension i is broken into (⌈d<sub>i</sub>/t</sub>i</sub>⌉,
+    t<sub>i</sub>), i.e. the array is reshaped to \
+    &nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, t<sub>n</sub>, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>1</sub>). \
+    There is no physical layout change in this reshape by itself, so this
+    reshape is a bitcast. If one is not explicitly thinking of a tiling, this
+    reshape could express any shape with the same number of elements as the
+    padded shape - the example here is of how to express a tile in this way.
+3.  A transpose happens by moving t<sub>n</sub>, ... , t<sub>1</sub> to the most
+    minor dimensions while keeping their relative order, so that the order of
+    dimensions from most major to most minor becomes \
+    &nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+    ⌈d<sub>1</sub>/t<sub>1</sub>⌉, t<sub>n</sub>, ... , t<sub>1</sub>).
+
+The final shape has the prefix \
+&nbsp; &nbsp; (⌈d<sub>n</sub>/t<sub>n</sub>⌉, ... ,
+⌈d<sub>1</sub>/t<sub>1</sub>⌉), which describes the number of tiles in each
+dimension. An element in the array (e<sub>n</sub>, ... , e<sub>1</sub>) is
+mapped to this element in the final shape: \
+&nbsp; &nbsp; (⌊e<sub>n</sub>/t<sub>n</sub>⌋, ... ,
+⌊e<sub>0</sub>/t<sub>0</sub>⌋, e<sub>n</sub> mod t<sub>n</sub>, ... ,
+e<sub>1</sub> mod t<sub>1</sub>). It is easy to see that the linear index of the
+element follows the formula above as expected.
+
+# Repeated tiling
+
+XLA's tiling becomes even more flexible by applying it repeatedly.
+
+<center> ![](images/xla_array_layout_figure2.png)
+
+Figure 2 </center>
+
+Figure 2 shows how an array of size 4x8 is tiled by two levels of tiling (first
+2x4 then 2x1). We represent this repeated tiling as (2,4)(2,1). Each color
+indicates a 2x4 tile and each red border box is a 2x1 tile. The numbers
+indicates the linear index in memory of that element in the tiled format. This
+format matches the format used for BF16 on TPU, except that the initial tile is
+bigger, namely the tiling is (8,128)(2,1), where the purpose of the second
+tiling by 2x1 is to collect together two 16 bit values to form one 32 bit value
+in a way that aligns with the architecture of a TPU.
+
+Note that a second or later tile can refer to both the minor within-tile
+dimensions, which just rearranges data within the tile, as in this example with
+(8,128)(2,1), but can also refer to the major cross-tile dimensions from the
+prior tiling.
+
+# Combining dimensions using tiles
+
+XLA's tiling also supports combining dimensions. For example, it can combine
+dimensions in F32[2,7,8,11,10]{4,3,2,1,0} into F32[112,110]{1,0} first before
+tiling it with (2,3). The tile used is (&lowast;,&lowast;,2,&lowast;,3). Here an
+asterisk in a tile implies taking that dimension and combining it with the next
+more minor dimension. Multiple adjacent dimensions can be subsumed together into
+one dimension. A subsumed dimension is represented by a tile value of -1 in that
+dimension of the tile, which is not otherwise valid in a tile as a dimension
+size.
+
+More precisely, if dimension i of the shape is eliminated via an asterisk in the
+tile, then before the prior definition of tiling is applied, that dimension is
+removed from both the shape being tiled and the tile vector, and what was
+dimension i-1 of the shape has its array bound increased from d<sub>i-1</sub> to
+d<sub>i</sub>d<sub>i-1</sub>. This step is repeated for each asterisk in the
+tile vector.
diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md
index 73a9db75f6bf090bba5c3534f14d8ebfa421b5bb..d888b1f23f36f33ef94ef0e22374e0c796e47a89 100644
--- a/tensorflow/compiler/xla/g3doc/operation_semantics.md
+++ b/tensorflow/compiler/xla/g3doc/operation_semantics.md
@@ -13,6 +13,22 @@ arbitrary-dimensional array. For convenience, special cases have more specific
 and familiar names; for example a *vector* is a 1-dimensional array and a
 *matrix* is a 2-dimensional array.
 
+## AfterAll
+
+See also
+[`XlaBuilder::AfterAll`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+AfterAll takes a variadic number of tokens and produces a single token. Tokens
+are primitive types which can be threaded between side-effecting operations to
+enforce ordering. `AfterAll` can be used as a join of tokens for ordering a
+operation after a set operations.
+
+<b> `AfterAll(operands)` </b>
+
+Arguments  | Type    | Semantics
+---------- | ------- | -------------------------
+`operands` | `XlaOp` | variadic number of tokens
+
 ## AllToAll
 
 See also
@@ -402,6 +418,33 @@ then v12 == f32[8x3] {{10, 11, 12},
 
 ```
 
+## CollectivePermute
+
+See also
+[`XlaBuilder::CollectivePermute`](https://www.tensorflow.org/code/tensorflow/compiler/xla/client/xla_builder.h).
+
+CollectivePermute is a collective operation that sends and receives data cross
+replicas.
+
+<b> `CollectivePermute(operand, source_target_pairs)` </b>
+
+| Arguments             | Type                    | Semantics                  |
+| --------------------- | ----------------------- | -------------------------- |
+| `operand`             | `XlaOp`                 | n dimensional input array  |
+| `source_target_pairs` | `<int64, int64>` vector | A list of                  |
+:                       :                         : (source_replica_id,        :
+:                       :                         : target_replica_id) pairs.  :
+:                       :                         : For each pair, the operand :
+:                       :                         : is sent from source        :
+:                       :                         : replica to target replica. :
+
+Note that there are the following restrictions on the `source_target_pair`:
+
+-   Any two pairs should not have the same target replica id, and they should
+    not have the same source replica id.
+-   If a replica id is not a target in any pair, then the output on that replica
+    is a tensor consists of 0(s) with the same shape as the input.
+
 ## Concatenate
 
 See also
@@ -1423,10 +1466,11 @@ Builds a constant literal on device rather than a potentially large host
 transfer. Creates a rank 1 array of values starting at zero and incrementing by
 one.
 
-Arguments | Type            | Semantics
---------- | --------------- | ------------------------------------
-`type`    | `PrimitiveType` | type U
-`size`    | `int64`         | The number of elements in the array.
+Arguments        | Type            | Semantics
+---------------- | --------------- | ------------------------------------
+`type`           | `PrimitiveType` | type U
+`size`           | `int64`         | The number of elements in the array.
+`iota_dimension` | `int64`         | The dimension to increment along.
 
 ## Map
 
@@ -1780,8 +1824,9 @@ XlaBuilder builder(client_, "reduce_window_2x3");
 auto shape = ShapeUtil::MakeShape(F32, {4, 6});
 auto input = builder.Parameter(0, shape, "input");
 builder.ReduceWindow(
-    input, *max,
+    input,
     /*init_val=*/builder.ConstantLiteral(LiteralUtil::MinValue(F32)),
+    *max,
     /*window_dimensions=*/{2, 3},
     /*window_stride_dimensions=*/{2, 3},
     Padding::kValid);
diff --git a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
index a83e3f78598e7c0afaada43b8ae1ba71ad4839d6..2a83092805be5efdd7b9ab54449b2bcc6a2ec481 100644
--- a/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
+++ b/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb
@@ -1,25 +1,38 @@
 {
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "colab": {
+      "name": "The XLA compile API",
+      "version": "0.3.2",
+      "provenance": [],
+      "collapsed_sections": [],
+      "toc_visible": true
+    },
+    "kernelspec": {
+      "name": "python3",
+      "display_name": "Python 3"
+    }
+  },
   "cells": [
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "f4TSNCvpENrW"
       },
+      "cell_type": "markdown",
       "source": [
         "##### Copyright 2018 The TensorFlow Authors."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
         "cellView": "form",
-        "colab": {},
         "colab_type": "code",
-        "id": "vamNSA0vEP-m"
+        "id": "vamNSA0vEP-m",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "#@title Licensed under the Apache License, Version 2.0 (the \"License\");\n",
         "# you may not use this file except in compliance with the License.\n",
@@ -32,139 +45,84 @@
         "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
         "# See the License for the specific language governing permissions and\n",
         "# limitations under the License."
-      ]
-    },
-    {
-      "cell_type": "code",
+      ],
       "execution_count": 0,
-      "metadata": {
-        "cellView": "form",
-        "colab": {},
-        "colab_type": "code",
-        "id": "xD_ydfejEV7H"
-      },
-      "outputs": [],
-      "source": [
-        "#@title MIT License\n",
-        "#\n",
-        "# Copyright (c) 2017 François Chollet\n",
-        "#\n",
-        "# Permission is hereby granted, free of charge, to any person obtaining a\n",
-        "# copy of this software and associated documentation files (the \"Software\"),\n",
-        "# to deal in the Software without restriction, including without limitation\n",
-        "# the rights to use, copy, modify, merge, publish, distribute, sublicense,\n",
-        "# and/or sell copies of the Software, and to permit persons to whom the\n",
-        "# Software is furnished to do so, subject to the following conditions:\n",
-        "#\n",
-        "# The above copyright notice and this permission notice shall be included in\n",
-        "# all copies or substantial portions of the Software.\n",
-        "#\n",
-        "# THE SOFTWARE IS PROVIDED \"AS IS\", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR\n",
-        "# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,\n",
-        "# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL\n",
-        "# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER\n",
-        "# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING\n",
-        "# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER\n",
-        "# DEALINGS IN THE SOFTWARE."
-      ]
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "e1oSi4lHFt3z"
       },
+      "cell_type": "markdown",
       "source": [
-        "# Welcome to `xla.compile()` tutorial"
+        "# The XLA compile API"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "b7noD9NjFRL-"
       },
+      "cell_type": "markdown",
       "source": [
-        "\u003ctable class=\"tfo-notebook-buttons\" align=\"left\"\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://www.tensorflow.org/xla/jit#turning_on_jit_compilation\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" /\u003eView on TensorFlow.org\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://colab.sandbox.google.com/github/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "  \u003ctd\u003e\n",
-        "    \u003ca target=\"_blank\" href=\"https://github.com/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
-        "  \u003c/td\u003e\n",
-        "\u003c/table\u003e"
+        "<table class=\"tfo-notebook-buttons\" align=\"left\">\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://www.tensorflow.org/xla/tutorials/xla_compile\"><img src=\"https://www.tensorflow.org/images/tf_logo_32px.png\" />View on TensorFlow.org</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+        "  </td>\n",
+        "  <td>\n",
+        "    <a target=\"_blank\" href=\"https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/g3doc/tutorials/xla_compile.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+        "  </td>\n",
+        "</table>"
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "v9YbsuLZaBXy"
       },
+      "cell_type": "markdown",
       "source": [
-        "xla.compile() is a new experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/).\n",
         "\n",
-        "Please run all code blocks in order."
+        "\n",
+        "Import TensorFlow and the XLA library. XLA contains `xla.compile()`, an experimental API that compiles part or all of a model with [XLA](https://www.tensorflow.org/extend/xla/)."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "45kUPj5ZFrRa"
-      },
-      "outputs": [],
-      "source": [
-        "import tensorflow as tf"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "9NMQFjroSMns"
+        "id": "45kUPj5ZFrRa",
+        "colab": {}
       },
-      "source": [
-        "Imports XLA library, which includes xla.compile() experimental API."
-      ]
-    },
-    {
       "cell_type": "code",
-      "execution_count": 0,
-      "metadata": {
-        "colab": {},
-        "colab_type": "code",
-        "id": "-Uggy03rSGJm"
-      },
-      "outputs": [],
       "source": [
+        "import tensorflow as tf\n",
+        "\n",
         "from tensorflow.contrib.compiler import xla"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "GZVNiRmTDV-5"
       },
+      "cell_type": "markdown",
       "source": [
-        "Define some necessary constants and prepare MNIST dataset."
+        "Define some necessary constants and prepare the MNIST dataset."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "f37TSEGvGX4_"
+        "id": "f37TSEGvGX4_",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Size of each input image, 28 x 28 pixels\n",
         "IMAGE_SIZE = 28 * 28\n",
@@ -174,17 +132,17 @@
         "TRAIN_BATCH_SIZE = 100\n",
         "# Number of training steps to run\n",
         "TRAIN_STEPS = 1000"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "TiVXchblG5hK"
+        "id": "TiVXchblG5hK",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Loads MNIST dataset.\n",
         "train, test = tf.keras.datasets.mnist.load_data()\n",
@@ -195,16 +153,18 @@
         "images, labels = iterator.get_next()\n",
         "images = tf.reshape(images, [-1, IMAGE_SIZE])\n",
         "images, labels = tf.cast(images, tf.float32), tf.cast(labels, tf.int64)"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "x_ZehpZP-SfS"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Defines build_mnist_model function to construct model\n",
+        "# Define the model constructing function\n",
         "\n",
         "Following code block contains a function that constructs a simple model with one dense layer, including both forward and backward propagation.\n",
         "\n",
@@ -212,14 +172,12 @@
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "ZbhJl_WvGa3g"
+        "id": "ZbhJl_WvGa3g",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "def build_mnist_model(x, y_):\n",
         "  y = tf.keras.layers.Dense(NUM_CLASSES).apply(x)\n",
@@ -228,47 +186,41 @@
         "  train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)\n",
         "\n",
         "  return y, train_step"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "7Jh3lyQHDfM9"
       },
-      "source": [
-        "## Uses xla.compile with build_mnist_model function to enable XLA"
-      ]
-    },
-    {
       "cell_type": "markdown",
-      "metadata": {
-        "colab_type": "text",
-        "id": "EtDwez_1gjzv"
-      },
       "source": [
-        "Following code block wraps the model with xla.compile(), which allows the target function with provided inputs to be executed by XLA."
+        "# Enable XLA\n",
+        "\n",
+        "Use `xla.compile` with the `build_mnist_model` function to enable XLA. Following code block wraps the model with `xla.compile()`, which allows the target function with provided inputs to be executed by XLA."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "kYpCXCdRHNuN"
+        "id": "kYpCXCdRHNuN",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "[y] = xla.compile(build_mnist_model, inputs=[images, labels])"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "4giQh62IrZGF"
       },
+      "cell_type": "markdown",
       "source": [
         "When compiling the graph, XLA replaces all the graph nodes constructed in the target function with a few XLA ops.\n",
         "\n",
@@ -293,62 +245,62 @@
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "TPGas4jjFLZl"
       },
+      "cell_type": "markdown",
       "source": [
         "If you were to print the constructed graph now, you will see that it is not much different from a normal Tensorflow graph and you won't be able to find XLA ops mentioned before. This is because the actual compilation happens later when you try to execute the graph with `sess.run()`.  At that time, Tensorflow triggers a series of graph rewrite passes that actually generate XLA ops, which compiles and executes computation when all inputs are ready."
       ]
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "EZD1m_n1DxAF"
       },
+      "cell_type": "markdown",
       "source": [
-        "## Trains and tests the model"
+        "# Train and test the model"
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "qe28bAHNHUG2"
+        "id": "qe28bAHNHUG2",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Creates session and initialize all variables.\n",
         "# xla.compile() doesn't work with Keras model.fit() API or TF eager mode yet.\n",
         "sess = tf.Session()\n",
         "sess.run(tf.global_variables_initializer())"
-      ]
+      ],
+      "execution_count": 0,
+      "outputs": []
     },
     {
-      "cell_type": "markdown",
       "metadata": {
         "colab_type": "text",
         "id": "qgsKmz3n2UiW"
       },
+      "cell_type": "markdown",
       "source": [
-        "Following code block trains model.\n",
-        "\n",
-        "Note that evaluating `y` also triggers its control dependency node `train_step`, which updates model variables."
+        "Following code block trains model. Evaluating `y` also triggers its control dependency node `train_step`, which updates model variables."
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "_GxF6jTRHVuA"
+        "id": "_GxF6jTRHVuA",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "fbf299ca-02d5-4e95-f9fe-8f3c0432d132"
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Feeds training dataset\n",
         "sess.run(iterator.make_initializer(train_ds))\n",
@@ -356,18 +308,31 @@
         "# Runs TRAIN_STEPS steps\n",
         "for i in range(TRAIN_STEPS):\n",
         "  sess.run(y)\n",
+        "\n",
         "print(\"Model trained for %s steps.\" % TRAIN_STEPS)"
+      ],
+      "execution_count": 21,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Model trained for 1000 steps.\n"
+          ],
+          "name": "stdout"
+        }
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "dHlQlRSRHXD1"
+        "id": "dHlQlRSRHXD1",
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 34
+        },
+        "outputId": "9c3677a2-ec84-406f-9d2c-d722844f3093"
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Tests trained model\n",
         "\n",
@@ -378,35 +343,31 @@
         "correct_prediction = tf.equal(tf.argmax(y, 1), labels)\n",
         "accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))\n",
         "print(\"Prediction accuracy after training: %s\" % sess.run(accuracy))"
+      ],
+      "execution_count": 22,
+      "outputs": [
+        {
+          "output_type": "stream",
+          "text": [
+            "Prediction accuracy after training: 0.91\n"
+          ],
+          "name": "stdout"
+        }
       ]
     },
     {
-      "cell_type": "code",
-      "execution_count": 0,
       "metadata": {
-        "colab": {},
         "colab_type": "code",
-        "id": "ynJQIuzjHYOb"
+        "id": "ynJQIuzjHYOb",
+        "colab": {}
       },
-      "outputs": [],
+      "cell_type": "code",
       "source": [
         "# Cleans up session\n",
         "sess.close()"
-      ]
-    }
-  ],
-  "metadata": {
-    "colab": {
-      "collapsed_sections": [],
-      "name": "xla.compile() Tutorial",
-      "provenance": [],
-      "version": "0.3.2"
-    },
-    "kernelspec": {
-      "display_name": "Python 2",
-      "name": "python2"
+      ],
+      "execution_count": 0,
+      "outputs": []
     }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 0
-}
+  ]
+}
\ No newline at end of file
diff --git a/tensorflow/compiler/xla/index_util.h b/tensorflow/compiler/xla/index_util.h
index 458bdaf2f89819d2fbd8518150d11b42ce9f9c6e..d76f61eb62c0fc89d6bc3ca2033e8c7170f30e78 100644
--- a/tensorflow/compiler/xla/index_util.h
+++ b/tensorflow/compiler/xla/index_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/macros.h"
diff --git a/tensorflow/compiler/xla/layout_util.cc b/tensorflow/compiler/xla/layout_util.cc
index 2398470dd49955f154dcb32edae6f3b9f961f89d..dbb81381acde645f08639737b6e7b6f6ad971f9b 100644
--- a/tensorflow/compiler/xla/layout_util.cc
+++ b/tensorflow/compiler/xla/layout_util.cc
@@ -460,6 +460,13 @@ std::ostream& operator<<(std::ostream& out, const Layout& layout) {
   }
   hash_value = Hash64Combine(hash_value, layout.max_sparse_elements());
 
+  for (Tile tile : layout.tiles()) {
+    for (int64 tile_dim : tile.dimensions()) {
+      hash_value = Hash64Combine(hash_value, hash<int64>()(tile_dim));
+    }
+  }
+  hash_value = Hash64Combine(hash_value, layout.element_size_in_bits());
+
   return hash_value;
 }
 
diff --git a/tensorflow/compiler/xla/layout_util.h b/tensorflow/compiler/xla/layout_util.h
index 6e0390763da15167b85597462f3e21b8e1eaf732..6c298e57252449ce3f1f9055436e918f2d9f17f1 100644
--- a/tensorflow/compiler/xla/layout_util.h
+++ b/tensorflow/compiler/xla/layout_util.h
@@ -21,6 +21,7 @@ limitations under the License.
 #include <string>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/literal.cc b/tensorflow/compiler/xla/literal.cc
index cb00a0ab16df851ccbd4bba960b92ea83157867d..8f480c1f1079b4e1a5be53958ebdf6e004ad9ebe 100644
--- a/tensorflow/compiler/xla/literal.cc
+++ b/tensorflow/compiler/xla/literal.cc
@@ -27,6 +27,7 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/index_util.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -62,6 +63,14 @@ void ConvertEndianShort(char* bytes, int64 size) {
   }
 }
 
+// Since Eigen::half doesn't satisfy the absl::bit_cast contract, we need to be
+// able to transparently access the raw 16-bit value contained within.
+template <typename T>
+T GetRawValue(T val) {
+  return val;
+}
+uint16 GetRawValue(Eigen::half val) { return val.x; }
+
 }  // namespace
 
 LiteralBase::~LiteralBase() {}
@@ -283,16 +292,17 @@ Status MutableLiteralBase::CopyElementFrom(const LiteralSlice& src_literal,
   if (!proto.has_shape()) {
     return InvalidArgument("LiteralProto has no shape");
   }
-  if (ShapeUtil::HasPrimitiveType(proto.shape(), OPAQUE)) {
+  Shape shape(proto.shape());
+  if (ShapeUtil::HasPrimitiveType(shape, OPAQUE)) {
     return InvalidArgument("Literal shape cannot include OPAQUE sub-shape");
   }
-  if (!LayoutUtil::HasLayout(proto.shape())) {
+  if (!LayoutUtil::HasLayout(shape)) {
     return InvalidArgument("LiteralProto has no layout");
   }
 
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
-  Literal literal(proto.shape());
+  Literal literal(shape);
 
   TF_RETURN_IF_ERROR(literal.root_piece_->ForEachMutableSubpieceWithStatus(
       [&](const ShapeIndex& index, Piece* piece) {
@@ -1012,166 +1022,143 @@ void LiteralBase::Piece::SortSparseElementsInternal() {
 
 namespace {
 
-void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
-                    bool print_layout, std::vector<string>* pieces) {
-  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
-  CHECK(LayoutUtil::HasLayout(literal.shape()));
-  CHECK(LayoutUtil::HasLayout(subshape));
+string ShapeToString(bool print_layout, const Shape& shape) {
+  return print_layout ? ShapeUtil::HumanStringWithLayout(shape)
+                      : ShapeUtil::HumanString(shape);
+}
 
-  auto shape_to_string = [print_layout](const Shape& shape) {
-    if (print_layout) {
-      return ShapeUtil::HumanStringWithLayout(shape);
-    } else {
-      return ShapeUtil::HumanString(shape);
-    }
-  };
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces);
 
-  // TODO(b/32894291): refactor this code to reduce code duplication.
-  if (ShapeUtil::IsTuple(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" (\n");
-    std::vector<string> tuple_pieces;
-    for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
-      ShapeIndex element_index = shape_index;
-      element_index.push_back(i);
-      std::vector<string> element_pieces;
-      ToStringHelper(literal, element_index, print_layout, &element_pieces);
-      tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+void TupleToStringHelper(const LiteralBase& literal,
+                         const ShapeIndex& shape_index, bool print_layout,
+                         std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back(" (\n");
+  std::vector<string> tuple_pieces;
+  for (int i = 0; i < ShapeUtil::TupleElementCount(subshape); ++i) {
+    ShapeIndex element_index = shape_index;
+    element_index.push_back(i);
+    std::vector<string> element_pieces;
+    ToStringHelper(literal, element_index, print_layout, &element_pieces);
+    tuple_pieces.push_back(absl::StrJoin(element_pieces, ""));
+  }
+  pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
+  pieces->push_back("\n)");
+}
+
+void SparseArrayToStringHelper(const LiteralBase& literal,
+                               const Shape& subshape, bool print_layout,
+                               std::vector<string>* pieces) {
+  pieces->push_back(ShapeToString(print_layout, subshape));
+  pieces->push_back("{");
+  int64 rank = ShapeUtil::Rank(subshape);
+  int64 num_elements = literal.sparse_element_count();
+  for (int64 i = 0; i < num_elements; ++i) {
+    if (i > 0) {
+      pieces->push_back(", ");
     }
-    pieces->push_back(absl::StrJoin(tuple_pieces, ",\n"));
-    pieces->push_back("\n)");
-    return;
-  }
-
-  if (ShapeUtil::IsToken(subshape)) {
-    pieces->push_back("token");
-    return;
-  }
-
-  if (LayoutUtil::IsSparseArray(subshape)) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back("{");
-    int64 rank = ShapeUtil::Rank(subshape);
-    int64 num_elements = literal.sparse_element_count();
-    for (int64 i = 0; i < num_elements; ++i) {
-      if (i > 0) {
-        pieces->push_back(", ");
-      }
-      if (rank == 1) {
-        pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
-        pieces->push_back(": ");
-      } else {
-        pieces->push_back("[");
-        pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
-        pieces->push_back("]: ");
-      }
-      pieces->push_back(literal.GetSparseElementAsString(i));
+    if (rank == 1) {
+      pieces->push_back(StrCat(literal.GetSparseIndex(i)[0]));
+      pieces->push_back(": ");
+    } else {
+      pieces->push_back("[");
+      pieces->push_back(absl::StrJoin(literal.GetSparseIndex(i), ", "));
+      pieces->push_back("]: ");
     }
-    pieces->push_back("}");
-    return;
+    pieces->push_back(literal.GetSparseElementAsString(i));
   }
+  pieces->push_back("}");
+}
 
-  CHECK(LayoutUtil::IsDenseArray(subshape));
-
-  auto element_to_string = [&](absl::Span<const int64> indices) -> string {
-    PrimitiveType element_type = subshape.element_type();
-    // We display predicates as 0s and 1s so that the string is more dense.
-    string elem = element_type == PRED
-                      ? literal.Get<bool>(indices, shape_index) ? "1" : "0"
-                      : literal.GetAsString(indices, shape_index);
-    return ((!indices.empty() && indices.back() > 0) ? ", " : "") + elem;
-  };
+void DenseArrayToStringHelper(const LiteralBase& literal,
+                              const ShapeIndex& shape_index, bool print_layout,
+                              std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  int64 rank = ShapeUtil::Rank(subshape);
+
+  std::function<void(absl::Span<const int64> dimensions, std::vector<int64>*)>
+      to_string_recursive = [&](absl::Span<const int64> dimensions,
+                                std::vector<int64>* accum_indices) {
+        // dimensions.size() decreases by 1 at each recursive call,
+        // and accum_indices->size() increases by 1.
+        // Their sum is equal to the rank of the tensor.
+        CHECK_EQ(rank, dimensions.size() + accum_indices->size());
+
+        auto brace_to_string = [&](string brace) -> string {
+          // Handle 1D tensor
+          if (rank == 1) {
+            return brace;
+          }
+          // Handle the innermost tensor of a 2D+ tensor.
+          if (dimensions.size() == 1 && brace == "{") {
+            return StrCat("  ", brace, dimensions[0] <= 1 ? "" : " ");
+          }
+          if (dimensions.size() == 1 && brace == "}") {
+            return StrCat(dimensions[0] <= 1 ? "" : " ", brace);
+          }
+          // Handle the non-innermost tensors of a 2D+ tensor.
+          if (brace == "{") {
+            if (rank > 3 && !accum_indices->empty() &&
+                accum_indices->size() < rank) {
+              int index = accum_indices->size() - 1;
+              int value = accum_indices->back();
+              return StrCat(brace, " /*i", index, "=", value, "*/\n");
+            }
+            return StrCat(brace, "\n");
+          }
+          return StrCat("\n", brace);
+        };
 
-  if (ShapeUtil::Rank(subshape) == 0) {
-    pieces->push_back(literal.GetAsString({}, shape_index));
-  } else if (ShapeUtil::Rank(subshape) == 1) {
-    pieces->push_back("{");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(element_to_string({i0}));
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 2) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back("  { ");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(element_to_string({i0, i1}));
-      }
-      pieces->push_back(" ");
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "}\n" : "},\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 3) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(i0 > 0 ? ",\n{" : "{");
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(i1 > 0 ? ",\n  { " : " { ");
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(element_to_string({i0, i1, i2}));
-        }
-        pieces->push_back(" }");
-      }
-      pieces->push_back(" }");
-    }
-    pieces->push_back("\n}");
-  } else if (ShapeUtil::Rank(subshape) == 4) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back("      {");
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back(element_to_string({i0, i1, i2, i3}));
+        if (dimensions.empty()) {
+          // Display predicates as 0s and 1s so that the string is more dense.
+          string elem;
+          if (subshape.element_type() == PRED && rank > 0) {
+            elem = literal.Get<bool>(*accum_indices, shape_index) ? "1" : "0";
+          } else {
+            elem = literal.GetAsString(*accum_indices, shape_index);
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "}\n" : "},\n");
-        }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
-  } else if (ShapeUtil::Rank(subshape) == 5) {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {\n");
-    for (int64 i0 = 0; i0 < subshape.dimensions(0); ++i0) {
-      pieces->push_back(StrFormat("  {  /*i0=%d*/\n", i0));
-      for (int64 i1 = 0; i1 < subshape.dimensions(1); ++i1) {
-        pieces->push_back(StrFormat("    {  /*i1=%d*/\n", i1));
-        for (int64 i2 = 0; i2 < subshape.dimensions(2); ++i2) {
-          pieces->push_back(StrFormat("      {  /*i2=%d*/\n", i2));
-          for (int64 i3 = 0; i3 < subshape.dimensions(3); ++i3) {
-            pieces->push_back("        {");
-            for (int64 i4 = 0; i4 < subshape.dimensions(4); ++i4) {
-              pieces->push_back(element_to_string({i0, i1, i2, i3, i4}));
+          pieces->push_back(elem);
+        } else {
+          pieces->push_back(brace_to_string("{"));
+          for (int i = 0; i < dimensions[0]; ++i) {
+            std::vector<int64> cloned_indices(*accum_indices);
+            cloned_indices.push_back(i);
+            to_string_recursive(dimensions.subspan(1), &cloned_indices);
+            if (i < dimensions[0] - 1) {
+              pieces->push_back(",");
+              pieces->push_back(dimensions.size() > 1 ? "\n" : " ");
             }
-            pieces->push_back(i3 == subshape.dimensions(3) - 1 ? "}\n"
-                                                               : "},\n");
           }
-          pieces->push_back(i2 == subshape.dimensions(2) - 1 ? "      }\n"
-                                                             : "      },\n");
+          pieces->push_back(brace_to_string("}"));
         }
-        pieces->push_back(i1 == subshape.dimensions(1) - 1 ? "    }\n"
-                                                           : "    },\n");
-      }
-      pieces->push_back(i0 == subshape.dimensions(0) - 1 ? "  }\n" : "  },\n");
-    }
-    pieces->push_back("}");
+      };
+
+  if (rank > 1) {
+    pieces->push_back(ShapeToString(print_layout, subshape));
+    pieces->push_back(" ");
+  }
+  std::vector<int64> indices = {};
+  std::vector<int64> dimensions(subshape.dimensions().begin(),
+                                subshape.dimensions().end());
+  to_string_recursive(dimensions, &indices);
+}
+
+void ToStringHelper(const LiteralBase& literal, const ShapeIndex& shape_index,
+                    bool print_layout, std::vector<string>* pieces) {
+  const Shape& subshape = ShapeUtil::GetSubshape(literal.shape(), shape_index);
+  CHECK(LayoutUtil::HasLayout(literal.shape()));
+  CHECK(LayoutUtil::HasLayout(subshape));
+  if (ShapeUtil::IsTuple(subshape)) {
+    TupleToStringHelper(literal, shape_index, print_layout, pieces);
+  } else if (ShapeUtil::IsToken(subshape)) {
+    pieces->push_back("token");
+  } else if (LayoutUtil::IsSparseArray(subshape)) {
+    SparseArrayToStringHelper(literal, subshape, print_layout, pieces);
   } else {
-    pieces->push_back(shape_to_string(subshape));
-    pieces->push_back(" {");
-    literal.EachCellAsString(
-        [&](absl::Span<const int64> indices, const string& value) {
-          pieces->push_back(" ");
-          pieces->push_back(value);
-        });
-    pieces->push_back("}");
+    CHECK(LayoutUtil::IsDenseArray(subshape));
+    DenseArrayToStringHelper(literal, shape_index, print_layout, pieces);
   }
 }
 
@@ -1228,16 +1215,32 @@ Literal ConvertBetweenNativeTypes(const LiteralBase& src_literal) {
 }
 
 template <typename NativeSrcT, typename NativeDestT>
-typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT)),
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(NativeDestT) &&
+                         !std::is_same<NativeDestT, Eigen::half>::value),
                         Literal>::type
 BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
   auto converter = [](NativeSrcT src) {
-    return absl::bit_cast<NativeDestT>(src);
+    return absl::bit_cast<NativeDestT>(GetRawValue(src));
   };
   return ConvertBetweenNativeTypesWithConverter<NativeSrcT, NativeDestT>(
       src_literal, converter);
 }
 
+template <typename NativeSrcT, typename NativeDestT>
+typename std::enable_if<(sizeof(NativeSrcT) == sizeof(Eigen::half) &&
+                         std::is_same<NativeDestT, Eigen::half>::value),
+                        Literal>::type
+BitcastBetweenNativeTypes(const LiteralBase& src_literal) {
+  // Eigen::half doesn't satisfy the absl::bit_cast contract, so explicitly
+  // cast to unsigned short and then use raw_uint16_to_half.
+  auto converter = [](NativeSrcT src) {
+    return Eigen::half_impl::raw_uint16_to_half(
+        absl::bit_cast<uint16>(GetRawValue(src)));
+  };
+  return ConvertBetweenNativeTypesWithConverter<NativeSrcT, Eigen::half>(
+      src_literal, converter);
+}
+
 // This template specialization is here to make the compiler happy. bit_cast has
 // a static check that the types are the same size. This specialization should
 // never be used because the source and destination types are checked for
@@ -1792,7 +1795,7 @@ void CopyToRepeatedField(RepeatedFieldT* dest,
 }  // namespace
 
 void LiteralBase::Piece::WriteToProto(LiteralProto* proto) const {
-  *proto->mutable_shape() = subshape();
+  *proto->mutable_shape() = subshape().ToProto();
   switch (subshape().element_type()) {
     case PRED:
       CopyToRepeatedField(proto->mutable_preds(), data<bool>());
@@ -1898,8 +1901,9 @@ Status LiteralBase::Piece::CopyFromProto(const LiteralProto& proto) {
   // These conditions should have been checked in
   // MutableLiteralBase::CreateFromProto.
   TF_RET_CHECK(proto.has_shape());
-  TF_RET_CHECK(LayoutUtil::HasLayout(proto.shape()));
-  TF_RET_CHECK(ShapeUtil::Equal(proto.shape(), subshape()));
+  Shape shape(proto.shape());
+  TF_RET_CHECK(LayoutUtil::HasLayout(shape));
+  TF_RET_CHECK(ShapeUtil::Equal(shape, subshape()));
 
   if (LayoutUtil::IsSparseArray(subshape())) {
     // Compute the number of elements (indices) in the sparse shape and reserve
diff --git a/tensorflow/compiler/xla/literal.h b/tensorflow/compiler/xla/literal.h
index e791048b4d9f5dcf877e05e3b5cf16eb37c07dbc..fa9a71af4ceb998a7a289443cbef70eb52cb1a11 100644
--- a/tensorflow/compiler/xla/literal.h
+++ b/tensorflow/compiler/xla/literal.h
@@ -301,7 +301,7 @@ class LiteralBase {
   //
   // Note: It's an antipattern to use this method then immediately call
   // MutableLiteralBase::Populate on the result (since that results in zero
-  // initialization, then reinitialization. Conside if a call to
+  // initialization, then reinitialization. Consider if a call to
   // absl::make_unique<Literal>(shape), followed by the call to
   // MutableLiteralBase::Populate can be used instead.
   static Literal CreateFromShape(const Shape& shape);
diff --git a/tensorflow/compiler/xla/literal_test.cc b/tensorflow/compiler/xla/literal_test.cc
index 8cec37897a94472d61d2346cf4cab03c45033800..49363ad802ddb9520f89b53257216bc7ddaf8ff5 100644
--- a/tensorflow/compiler/xla/literal_test.cc
+++ b/tensorflow/compiler/xla/literal_test.cc
@@ -150,12 +150,58 @@ TEST_F(LiteralUtilTest, R3ToString) {
   const auto literal =
       LiteralUtil::CreateR3({{{1}, {2}}, {{3}, {4}}, {{5}, {6}}});
   const string expected = R"(s32[3,2,1] {
-{ { 1 },
-  { 2 } },
-{ { 3 },
-  { 4 } },
-{ { 5 },
-  { 6 } }
+{
+  {1},
+  {2}
+},
+{
+  {3},
+  {4}
+},
+{
+  {5},
+  {6}
+}
+})";
+  EXPECT_EQ(expected, literal.ToString());
+}
+
+TEST_F(LiteralUtilTest, R6ToString) {
+  const auto literal =
+      LiteralUtil::CreateFromDimensions(S32, {2, 2, 1, 1, 1, 2});
+  const string expected = R"(s32[2,2,1,1,1,2] {
+{ /*i0=0*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+},
+{ /*i1=1*/
+{ /*i2=0*/
+{ /*i3=0*/
+  { 0, 0 }
+}
+}
+}
+}
 })";
   EXPECT_EQ(expected, literal.ToString());
 }
@@ -190,12 +236,16 @@ TEST_F(LiteralUtilTest, CreateR3FromArray3d) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[2,3,2] {
-{ { 1, 2 },
+{
+  { 1, 2 },
   { 3, 4 },
-  { 5, 6 } },
-{ { 7, 8 },
+  { 5, 6 }
+},
+{
+  { 7, 8 },
   { 9, 10 },
-  { 11, 12 } }
+  { 11, 12 }
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -247,18 +297,18 @@ TEST_F(LiteralUtilTest, LiteralR4F32ProjectedStringifies) {
   EXPECT_THAT(literal.shape().dimensions(), ElementsAre(1, 2, 3, 2));
   string result = literal.ToString();
   const string expected = R"(f32[1,2,3,2] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    },
-    {  /*i1=1*/
-      {1, 2},
-      {1001, 1002},
-      {2001, 2002}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+},
+{ /*i1=1*/
+  { 1, 2 },
+  { 1001, 1002 },
+  { 2001, 2002 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -268,30 +318,30 @@ TEST_F(LiteralUtilTest, LiteralR4F32Stringifies) {
               ElementsAre(2, 2, 3, 3));
   string result = literal_r4_2x2x3x3_dim0major_.ToString();
   const string expected = R"(f32[2,2,3,3] {
-  {  /*i0=0*/
-    {  /*i1=0*/
-      {1, 2, 3},
-      {4, 5, 6},
-      {7, 8, 9}
-    },
-    {  /*i1=1*/
-      {11, 12, 13},
-      {14, 15, 16},
-      {17, 18, 19}
-    }
-  },
-  {  /*i0=1*/
-    {  /*i1=0*/
-      {101, 102, 103},
-      {104, 105, 106},
-      {107, 108, 109}
-    },
-    {  /*i1=1*/
-      {201, 202, 203},
-      {204, 205, 206},
-      {207, 208, 209}
-    }
-  }
+{ /*i0=0*/
+{ /*i1=0*/
+  { 1, 2, 3 },
+  { 4, 5, 6 },
+  { 7, 8, 9 }
+},
+{ /*i1=1*/
+  { 11, 12, 13 },
+  { 14, 15, 16 },
+  { 17, 18, 19 }
+}
+},
+{ /*i0=1*/
+{ /*i1=0*/
+  { 101, 102, 103 },
+  { 104, 105, 106 },
+  { 107, 108, 109 }
+},
+{ /*i1=1*/
+  { 201, 202, 203 },
+  { 204, 205, 206 },
+  { 207, 208, 209 }
+}
+}
 })";
   EXPECT_EQ(expected, result);
 }
@@ -1327,13 +1377,26 @@ TEST_F(LiteralUtilTest, BitcastConvertBetweenInvalidTypes) {
       absl::StrContains(status.error_message(), "bit widths are different"));
 }
 
+// Sets the layout of the given ShapeProto to the default.
+void SetDefaultLayoutOnProto(ShapeProto* shape_proto) {
+  CHECK(ShapeUtil::IsArrayPrimitiveType(shape_proto->element_type()));
+  shape_proto->mutable_layout()->set_format(DENSE);
+  auto* minor_to_major =
+      shape_proto->mutable_layout()->mutable_minor_to_major();
+  minor_to_major->Resize(shape_proto->dimensions_size(), 0);
+  const int64 size = minor_to_major->size();
+  for (int64 i = 0; i < size; ++i) {
+    minor_to_major->Set(i, size - 1 - i);
+  }
+}
+
 TEST_F(LiteralUtilTest, CopyFromProto_Bool) {
   LiteralProto p;
   p.mutable_shape()->set_element_type(PRED);
   for (int len = 0; len < 25; ++len) {
     p.mutable_shape()->clear_dimensions();
     p.mutable_shape()->add_dimensions(len);
-    LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+    SetDefaultLayoutOnProto(p.mutable_shape());
     p.clear_preds();
     for (int i = 0; i < len; ++i) {
       p.add_preds((i % 2) == (len % 2));
@@ -1359,7 +1422,7 @@ TEST_F(LiteralUtilTest, ToProto_f16) {
   EXPECT_EQ(4, m.data<half>().size());
 
   LiteralProto p = m.ToProto();
-  EXPECT_EQ(4, ShapeUtil::ElementsIn(p.shape()));
+  EXPECT_EQ(4, ShapeUtil::ElementsIn(Shape(p.shape())));
   EXPECT_EQ(8, p.f16s().size());
   const char* d = p.f16s().data();
   EXPECT_EQ(d[0], 0);
@@ -1382,7 +1445,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_f16) {
   p.mutable_shape()->set_element_type(F16);
   p.mutable_shape()->clear_dimensions();
   p.mutable_shape()->add_dimensions(4);
-  LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+  SetDefaultLayoutOnProto(p.mutable_shape());
   p.clear_f16s();
   p.set_f16s(half_vals, 8);
   TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p));
@@ -1404,7 +1467,7 @@ TEST_F(LiteralUtilTest, CopyFromProto_u16) {
   p.mutable_shape()->set_element_type(U16);
   p.mutable_shape()->clear_dimensions();
   p.mutable_shape()->add_dimensions(4);
-  LayoutUtil::SetToDefaultLayout(p.mutable_shape());
+  SetDefaultLayoutOnProto(p.mutable_shape());
   p.clear_u16s();
   p.set_u16s(uint16_vals, 8);
   TF_ASSERT_OK_AND_ASSIGN(Literal literal, Literal::CreateFromProto(p));
@@ -1537,9 +1600,9 @@ TEST_F(LiteralUtilTest, DecomposeTuple) {
   Literal nested_tuple = LiteralUtil::MakeTuple(
       {&tuple_elements[0], &tuple_elements[1], &nil_literal});
 
-  EXPECT_FALSE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
   std::vector<Literal> elements = nested_tuple.DecomposeTuple();
-  EXPECT_TRUE(ShapeUtil::IsNil(nested_tuple.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(nested_tuple.shape()));
 
   ASSERT_EQ(elements.size(), 3);
 
@@ -1590,7 +1653,7 @@ TEST_F(LiteralUtilTest, MoveIntoTuple) {
   EXPECT_EQ(literal.Get<double>({1}, /*shape_index=*/{2, 1}), 44.0);
 
   for (const Literal& element : elements) {
-    EXPECT_TRUE(ShapeUtil::IsNil(element.shape()));
+    EXPECT_TRUE(ShapeUtil::IsEmptyTuple(element.shape()));
   }
 }
 
@@ -1706,7 +1769,7 @@ TEST_F(LiteralUtilTest, ProtoRoundTrip) {
 TEST_F(LiteralUtilTest, InvalidProtoNoValues) {
   // Proto contains a shape, but no values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto();
   Status status = Literal::CreateFromProto(proto).status();
   ASSERT_FALSE(status.ok());
   EXPECT_THAT(status.error_message(),
@@ -1727,7 +1790,7 @@ TEST_F(LiteralUtilTest, InvalidProtoNoShape) {
 TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) {
   // Proto contains values in wrong container.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {3}).ToProto();
   proto.add_preds(false);
   proto.add_preds(true);
   proto.add_preds(false);
@@ -1740,7 +1803,7 @@ TEST_F(LiteralUtilTest, InvalidProtoWrongContainer) {
 TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) {
   // Proto contains too few values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(F32, {42, 2}).ToProto();
   proto.add_f32s(1.0);
   proto.add_f32s(2.0);
   proto.add_f32s(3.0);
@@ -1753,7 +1816,7 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewValues) {
 TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) {
   // Proto contains too many values.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2});
+  *proto.mutable_shape() = ShapeUtil::MakeShape(S32, {2}).ToProto();
   proto.add_s32s(42);
   proto.add_s32s(-10);
   proto.add_s32s(100);
@@ -1766,8 +1829,8 @@ TEST_F(LiteralUtilTest, InvalidProtoTooManyValues) {
 TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) {
   // Proto shape missing layout.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2});
-  LayoutUtil::ClearLayout(proto.mutable_shape());
+  *proto.mutable_shape() = ShapeUtil::MakeShape(PRED, {2, 2}).ToProto();
+  proto.mutable_shape()->clear_layout();
   proto.add_preds(true);
   proto.add_preds(false);
   proto.add_preds(true);
@@ -1780,11 +1843,13 @@ TEST_F(LiteralUtilTest, InvalidProtoMissingLayout) {
 TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) {
   // Proto has the too few tuple elements.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
-      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  *proto.mutable_shape() =
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})})
+          .ToProto();
   LiteralProto* element0 = proto.add_tuple_literals();
   *element0->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto();
   element0->add_preds(false);
   element0->add_preds(true);
 
@@ -1796,19 +1861,21 @@ TEST_F(LiteralUtilTest, InvalidProtoTooFewTupleElements) {
 TEST_F(LiteralUtilTest, InvalidProtoTooManyTupleElements) {
   // Proto has the too many tuple elements.
   LiteralProto proto;
-  *proto.mutable_shape() = ShapeUtil::MakeTupleShape(
-      {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})});
+  *proto.mutable_shape() =
+      ShapeUtil::MakeTupleShape(
+          {ShapeUtil::MakeShape(PRED, {2}), ShapeUtil::MakeShape(F32, {})})
+          .ToProto();
   LiteralProto* element0 = proto.add_tuple_literals();
   *element0->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 0).ToProto();
   element0->add_preds(false);
   element0->add_preds(true);
   LiteralProto* element1 = proto.add_tuple_literals();
   *element1->mutable_shape() =
-      ShapeUtil::GetTupleElementShape(proto.shape(), 1);
+      ShapeUtil::GetTupleElementShape(Shape(proto.shape()), 1).ToProto();
   element1->add_f32s(42.0);
   LiteralProto* element2 = proto.add_tuple_literals();
-  *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {});
+  *element2->mutable_shape() = ShapeUtil::MakeShape(F32, {}).ToProto();
   element2->add_f32s(123.0);
 
   Status status = Literal::CreateFromProto(proto).status();
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.cc b/tensorflow/compiler/xla/parse_flags_from_env.cc
index 40481331b6992103e10e3fe635a030d3bdffebc9..5b568888d14f21c1330556d017eafba6c8dd2228 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env.cc
@@ -13,15 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from an environtment variable, or a file named by the environment
-// variable.
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or a file named by the
+// environment variable.
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <memory>
+#include <unordered_map>
 #include <vector>
 
+#include "absl/strings/str_format.h"
+#include "absl/strings/str_join.h"
+#include "absl/types/span.h"
 #include "tensorflow/compiler/xla/parse_flags_from_env.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/logging.h"
@@ -32,7 +37,6 @@ limitations under the License.
 
 namespace xla {
 
-static const char kEnvVar[] = "TF_XLA_FLAGS";  // environment variable queried
 static const char kWS[] = " \t\r\n";           // whitespace
 
 // The following struct represents an argv[]-style array, parsed
@@ -42,12 +46,20 @@ static const char kWS[] = " \t\r\n";           // whitespace
 // constructor/destructor collisions with other "private" types
 // in the same named namespace.
 namespace {
+
+// Functor which deletes objects by calling `free`.  Necessary to free strdup'ed
+// strings created by AppendToEnvArgv.
+struct FreeDeleter {
+  void operator()(char* ptr) { free(ptr); }
+};
+
 struct EnvArgv {
   EnvArgv() : initialized(false), argc(0) {}
   bool initialized;         // whether the other fields have been set.
   int argc;                 // elements used in argv[]
   std::vector<char*> argv;  // flag arguments parsed from environment string.
-  std::vector<char*> argv_save;  // saved values from argv[] to avoid leaks
+  // saved values from argv[] to avoid leaks
+  std::vector<std::unique_ptr<char, FreeDeleter>> argv_save;
 };
 }  // anonymous namespace
 
@@ -63,7 +75,7 @@ static void AppendToEnvArgv(const char* s0, size_t s0len, const char* s1,
     string s = string(s0, s0len) + string(s1, s1len);
     char* str = strdup(s.c_str());
     a->argv.push_back(str);
-    a->argv_save.push_back(str);
+    a->argv_save.emplace_back(str);
     a->argc++;
   }
 }
@@ -127,14 +139,14 @@ static void ParseArgvFromString(const string& flag_str, EnvArgv* a) {
   }
 }
 
-// Call ParseArgvFromString(..., a) on a string derived from the setting of an
-// environment variable kEnvVar, or a file it points to.
-static void SetArgvFromEnv(EnvArgv* a) {
+// Call ParseArgvFromString(..., a) on a string derived from the setting of the
+// environment variable `envvar`, or a file it points to.
+static void SetArgvFromEnv(absl::string_view envvar, EnvArgv* a) {
   if (!a->initialized) {
     static const char kDummyArgv[] = "<argv[0]>";
     AppendToEnvArgv(kDummyArgv, strlen(kDummyArgv), nullptr, 0,
                     a);  // dummy argv[0]
-    const char* env = getenv(kEnvVar);
+    const char* env = getenv(string(envvar).c_str());
     if (env == nullptr || env[0] == '\0') {
       // nothing
     } else if (env[strspn(env, kWS)] == '-') {  // flags in env var value
@@ -157,48 +169,66 @@ static void SetArgvFromEnv(EnvArgv* a) {
   }
 }
 
-// The simulated argv[] parsed from the environment.
-static EnvArgv* env_argv;
+// The simulated argv[] parsed from the environment, one for each different
+// environment variable we've seen.
+static std::unordered_map<string, EnvArgv>& EnvArgvs() {
+  static auto* env_argvs = new std::unordered_map<string, EnvArgv>();
+  return *env_argvs;
+}
 
-// Used to protect accesses to env_argv.
+// Used to protect accesses to env_argvs.
 static tensorflow::mutex env_argv_mu(tensorflow::LINKER_INITIALIZED);
 
-// Call Flags::Parse(argc, argv, flag_list) against any as yet unrecognized
-// flags passed in from the environment.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  SetArgvFromEnv(env_argv);  // a no-op if already initialized
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  auto* env_argv = &EnvArgvs()[string(envvar)];
+  SetArgvFromEnv(envvar, env_argv);  // a no-op if already initialized
   bool result =
       tensorflow::Flags::Parse(&env_argv->argc, &env_argv->argv[0], flag_list);
-  env_argv_mu.unlock();
+
+  // There's always at least one unparsed argc, namely the fake argv[0].
+  if (result && env_argv->argc != 1) {
+    // Skip the first argv, which is the fake argv[0].
+    auto unknown_flags = absl::MakeSpan(env_argv->argv);
+    unknown_flags.remove_prefix(1);
+
+    // Some flags are set on XLA_FLAGS, others on TF_XLA_FLAGS.  If we find an
+    // unrecognized flag, suggest the alternative.
+    string alternate_envvar;
+    if (envvar == "TF_XLA_FLAGS") {
+      alternate_envvar = "XLA_FLAGS";
+    } else if (envvar == "XLA_FLAGS") {
+      alternate_envvar = "TF_XLA_FLAGS";
+    }
+    string did_you_mean;
+    if (!alternate_envvar.empty()) {
+      did_you_mean = absl::StrFormat(
+          "\nPerhaps you meant to specify these on the %s envvar?",
+          alternate_envvar);
+    }
+
+    LOG(FATAL) << "Unknown flag" << (unknown_flags.size() > 1 ? "s" : "")
+               << " in " << envvar << ": " << absl::StrJoin(unknown_flags, " ")
+               << did_you_mean;
+    return false;
+  }
   return result;
 }
 
 // Testing only.
-// Reset the env_argv struct so that subsequent calls to ParseFlagsFromEnv()
-// will parse the environment variable (or the file it points to) anew, and set
-// *pargc, and *pargv to point to the internal locations of the argc and argv
-// constructed from the environment.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv) {
-  env_argv_mu.lock();
-  if (env_argv == nullptr) {
-    env_argv = new EnvArgv;
-  }
-  if (!env_argv->argv_save.empty()) {
-    for (int i = 0; env_argv->argv_save[i] != nullptr; i++) {
-      free(env_argv->argv_save[i]);
-    }
-  }
-  env_argv->initialized = false;
-  env_argv->argc = 0;
-  env_argv->argv.clear();
-  env_argv->argv_save.clear();
-  env_argv_mu.unlock();
-  *pargc = &env_argv->argc;
-  *pargv = &env_argv->argv;
+//
+// Resets the env_argv struct so that subsequent calls to
+// ParseFlagsFromEnvAndDieIfUnknown() will parse the environment variable (or
+// the file it points to) anew, and set *pargc, and *pargv to point to the
+// internal locations of the argc and argv constructed from the environment.
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv) {
+  tensorflow::mutex_lock lock(env_argv_mu);
+  EnvArgvs().erase(string(envvar));
+  auto& env_argv = EnvArgvs()[string(envvar)];
+  *pargc = &env_argv.argc;
+  *pargv = &env_argv.argv;
 }
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/parse_flags_from_env.h b/tensorflow/compiler/xla/parse_flags_from_env.h
index fe86ee687f8482aaffc2ebe04a723d9a22f2cce6..76940a4299ac50138222333ff250a264cc941288 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env.h
+++ b/tensorflow/compiler/xla/parse_flags_from_env.h
@@ -16,48 +16,58 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
 #define TENSORFLOW_COMPILER_XLA_PARSE_FLAGS_FROM_ENV_H_
 
-// This module exports ParseFlagsFromEnv(), which allows other modules to parse
-// flags from the environtment variable TF_XLA_FLAGS, or (if the first
+// This module exports ParseFlagsFromEnvAndDieIfUnknown(), which allows other
+// modules to parse flags from an environtment variable, or (if the first
 // non-whitespace in the variable value is not '-'), a file named by that
-// environment variable.  The accepted syntax is that flags arguments are of
-// the form --flag=value or (for boolean flags) --flag, and are whitespace
-// separated.  The <value> may be one of:
-// - <non-whitespace, non-nul not starting with single-quote or double-quote>
-//   in which case the effective value is the string itself
-// - <single-quote><characters string not containing nul or
-//   single-quote><single_quote> in which case the effective value is the
-//   string with the single-quotes removed
-// - <double-quote><character string not containing nul or unesecaped
-//   double-quote><double_quote> in which case the effective value if the
-//   string with the double-quotes removed, and escaped sequences of
-//   <backslash><char> replaced by <char>.
+// environment variable.
+//
+// The accepted syntax is that flags arguments are of the form --flag=value or
+// (for boolean flags) --flag, and are whitespace separated.  The <value> may be
+// one of:
+//
+//  - <non-whitespace, non-nul not starting with single-quote or double-quote>
+//    in which case the effective value is the string itself
+//  - <single-quote><characters string not containing nul or
+//    single-quote><single_quote> in which case the effective value is the
+//    string with the single-quotes removed
+//  - <double-quote><character string not containing nul or unesecaped
+//    double-quote><double_quote> in which case the effective value if the
+//    string with the double-quotes removed, and escaped sequences of
+//    <backslash><char> replaced by <char>.
 //
 // Flags values inconsistent with the type of the flag will be rejected by the
 // flag parser.
 //
 // Examples:
-//    TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
 //
-//    TF_XLA_FLAGS=/tmp/flagfile
+//  - TF_XLA_FLAGS="--foo=bar  --wombat='value with a space'"
+//  - TF_XLA_FLAGS=/tmp/flagfile
+//
 // where /tmp/flagfile might contain
-//    --some_flag="This is a string containing a \" and a '."
-//    --another_flag=wombats
+//
+//  --some_flag="This is a string containing a \" and a '."
+//  --another_flag=wombats
 
 #include <vector>
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
 
-// Call tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
-// unrecognized flags passed in from the environment, and return its
-// return value.
-bool ParseFlagsFromEnv(const std::vector<tensorflow::Flag>& flag_list);
+// Calls tensorflow::Flags::Parse(argc, argv, flag_list) against any as yet
+// unrecognized flags passed in the environment variable `envvar`, and returns
+// its return value.
+//
+// Raises a fatal error if any flags in `envvar` were not recognized.
+bool ParseFlagsFromEnvAndDieIfUnknown(
+    absl::string_view envvar, const std::vector<tensorflow::Flag>& flag_list);
 
 // Used only for testing.  Not to be used by clients.
-void ResetFlagsFromEnvForTesting(int** pargc, std::vector<char*>** pargv);
+void ResetFlagsFromEnvForTesting(absl::string_view envvar, int** pargc,
+                                 std::vector<char*>** pargv);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/parse_flags_from_env_test.cc b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
index edd6538402d6ceee292ca6a265f490be9709d3ae..3465552ebbf52140fb954b247d99d3c6afe7fcde 100644
--- a/tensorflow/compiler/xla/parse_flags_from_env_test.cc
+++ b/tensorflow/compiler/xla/parse_flags_from_env_test.cc
@@ -37,20 +37,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
   // Initialize module under test.
   int* pargc;
   std::vector<char*>* pargv;
-  ResetFlagsFromEnvForTesting(&pargc, &pargv);
-
-  // Ensure that environment variable can be parsed when
-  // no flags are expected.
-  std::vector<tensorflow::Flag> empty_flag_list;
-  bool parsed_ok = ParseFlagsFromEnv(empty_flag_list);
-  CHECK(parsed_ok) << msg;
-  const std::vector<char*>& argv_first = *pargv;
-  CHECK_NE(argv_first[0], nullptr) << msg;
-  int i = 0;
-  while (argv_first[i] != nullptr) {
-    i++;
-  }
-  CHECK_EQ(i, *pargc) << msg;
+  ResetFlagsFromEnvForTesting("TF_XLA_FLAGS", &pargc, &pargv);
 
   // Check that actual flags can be parsed.
   bool simple = false;
@@ -65,7 +52,7 @@ static void TestParseFlagsFromEnv(const char* msg) {
       tensorflow::Flag("single_quoted", &single_quoted, ""),
       tensorflow::Flag("double_quoted", &double_quoted, ""),
   };
-  parsed_ok = ParseFlagsFromEnv(flag_list);
+  bool parsed_ok = ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   CHECK_EQ(*pargc, 1) << msg;
   const std::vector<char*>& argv_second = *pargv;
   CHECK_NE(argv_second[0], nullptr) << msg;
@@ -171,7 +158,8 @@ int main(int argc, char* argv[]) {
       tensorflow::Flag("int_flag", &int_flag, "An integer flag to test with"),
   };
   xla::string usage = tensorflow::Flags::Usage(argv[0], flag_list);
-  bool parse_ok = xla::ParseFlagsFromEnv(flag_list);
+  bool parse_ok =
+      xla::ParseFlagsFromEnvAndDieIfUnknown("TF_XLA_FLAGS", flag_list);
   if (!parse_ok) {
     LOG(QFATAL) << "can't parse from environment\n" << usage;
   }
diff --git a/tensorflow/compiler/xla/protobuf_util.cc b/tensorflow/compiler/xla/protobuf_util.cc
index b507a2ef79f1d7e9ae632744675dddf574490805..ac342bf40fbc0052acbb09a346b9d062561ed06b 100644
--- a/tensorflow/compiler/xla/protobuf_util.cc
+++ b/tensorflow/compiler/xla/protobuf_util.cc
@@ -40,16 +40,6 @@ bool ProtobufEquals(const tensorflow::protobuf::Message& m1,
 
 namespace {
 
-string SanitizeFilename(const string& file_name) {
-  string safe_file_name = file_name;
-  for (char& c : safe_file_name) {
-    if (c == '/' || c == '\\') {
-      c = '_';
-    }
-  }
-  return safe_file_name;
-}
-
 std::pair<tensorflow::mutex*, std::vector<std::function<string(string)>>*>
 GetDirectoryExpanders() {
   static auto* mutex = new tensorflow::mutex;
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.cc b/tensorflow/compiler/xla/python/local_computation_builder.cc
index 4d2a37cfac3e0e89d189f168031e5db44ca5d410..6e2ee866321a070d55a7221c7c68024ceaa93448 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.cc
+++ b/tensorflow/compiler/xla/python/local_computation_builder.cc
@@ -148,14 +148,19 @@ static StatusOr<ScopedShapedBuffer> ToBuffer(LocalClient* client,
 
 /* static */
 StatusOr<LocalShapedBuffer*> LocalShapedBuffer::FromLiteral(
-    const Literal& argument, const absl::optional<Shape>& shape_with_layout) {
+    const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+    int replica_number) {
   LocalClient* client = GetOrCreateLocalClient();
+  TF_ASSIGN_OR_RETURN(int device_ordinal,
+                      client->ReplicaNumberToDeviceOrdinal(replica_number));
+  VLOG(1) << "Creating shaped buffer from literal on replica/ordinal: "
+          << replica_number << "/" << device_ordinal;
   StatusOr<ScopedShapedBuffer> buf = [&] {
     if (shape_with_layout) {
       Literal relaid = argument.Relayout(shape_with_layout.value());
-      return ToBuffer(client, /*device_ordinal=*/0, relaid);
+      return ToBuffer(client, device_ordinal, relaid);
     }
-    return ToBuffer(client, /*device_ordinal=*/0, argument);
+    return ToBuffer(client, device_ordinal, argument);
   }();
   TF_RETURN_IF_ERROR(buf.status());
   return new LocalShapedBuffer(std::move(buf).ValueOrDie());
@@ -312,67 +317,127 @@ CompiledLocalComputation::CompiledLocalComputation(
 StatusOr<LocalShapedBuffer*> CompiledLocalComputation::Execute(
     absl::Span<LocalShapedBuffer* const> argument_handles) {
   LocalClient* client = GetOrCreateLocalClient();
+  StatusOr<int> device_ordinal_status = client->ReplicaNumberToDeviceOrdinal(0);
+  StatusOr<ScopedShapedBuffer> result_buffer_status;
+  if (!device_ordinal_status.ok()) {
+    result_buffer_status = device_ordinal_status.status();
+  } else {
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica 0 mapped to device ordinal for execution: "
+            << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles.size());
+    for (auto& handle : argument_handles) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(1, /*computation_count=*/1)
+            .ConsumeValueOrDie();
+
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
 
-  VLOG(1) << "Execution requested with " << GetReplicaCount() << " replicas.";
+    result_buffer_status = executable_->Run(argument_buffers, options);
+  }
+
+  if (!result_buffer_status.ok()) {
+    return InternalError(
+        "Failed running replica 0 (other replicas may have failed as well): "
+        "%s.",
+        result_buffer_status.status().ToString());
+  }
+  return new LocalShapedBuffer(std::move(result_buffer_status).ValueOrDie());
+}
+
+StatusOr<LocalShapedBufferTuple*> CompiledLocalComputation::ExecutePerReplica(
+    absl::Span<const std::vector<LocalShapedBuffer*>> argument_handles) {
+  LocalClient* client = GetOrCreateLocalClient();
+  const int num_replicas = GetReplicaCount();
+
+  if (argument_handles.size() != num_replicas) {
+    return InvalidArgument(
+        "Attempted to execute with %d replicas when replica count is %d",
+        argument_handles.size(), num_replicas);
+  }
+
+  VLOG(1) << "Executing with " << num_replicas << " replicas.";
 
   // Each replica populates a StatusOr result, but only the output value of
   // replica zero is returned.
-  std::vector<StatusOr<ScopedShapedBuffer>> results(GetReplicaCount());
-  {
+  std::vector<StatusOr<ScopedShapedBuffer>> results(num_replicas);
+  auto execute = [this, client, num_replicas, &argument_handles,
+                  &results](int replica) {
+    StatusOr<int> device_ordinal_status =
+        client->ReplicaNumberToDeviceOrdinal(replica);
+    if (!device_ordinal_status.ok()) {
+      results[replica] = device_ordinal_status.status();
+      return;
+    }
+    const int device_ordinal = device_ordinal_status.ValueOrDie();
+    VLOG(3) << "Replica " << replica
+            << " mapped to device ordinal for execution: " << device_ordinal;
+
+    std::vector<const ShapedBuffer*> argument_buffers;
+    argument_buffers.reserve(argument_handles[replica].size());
+    for (auto& handle : argument_handles[replica]) {
+      argument_buffers.push_back(handle->shaped_buffer());
+    }
+
+    DeviceAssignment device_assignment =
+        client->backend()
+            .computation_placer()
+            ->AssignDevices(num_replicas, /*computation_count=*/1)
+            .ConsumeValueOrDie();
+
+    ExecutableRunOptions options;
+    options.set_device_ordinal(device_ordinal);
+    options.set_allocator(client->backend().memory_allocator());
+    options.set_intra_op_thread_pool(
+        client->backend().eigen_intra_op_thread_pool_device());
+    options.set_device_assignment(&device_assignment);
+    StatusOr<ScopedShapedBuffer> result_buffer_status =
+        executable_->Run(argument_buffers, options);
+
+    results[replica] = std::move(result_buffer_status);
+  };
+
+  if (num_replicas == 1) {
+    // Fast-path if there is only one replica — run the computation on the
+    // current thread.
+    execute(0);
+  } else {
+    // TODO(phawkins): don't recreate the threadpool for each execution.
     tensorflow::thread::ThreadPool pool(tensorflow::Env::Default(), "xlarun",
-                                        GetReplicaCount());
-
-    for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-      pool.Schedule(
-          [this, client, replica, &argument_handles, &results] {
-            StatusOr<int> device_ordinal_status =
-                client->ReplicaNumberToDeviceOrdinal(replica);
-            if (!device_ordinal_status.ok()) {
-              results[replica] = device_ordinal_status.status();
-              return;
-            }
-            const int device_ordinal = device_ordinal_status.ValueOrDie();
-            VLOG(3) << "Replica " << replica
-                    << " mapped to device ordinal for execution: "
-                    << device_ordinal;
-
-            std::vector<const ShapedBuffer*> argument_buffers;
-            argument_buffers.reserve(argument_handles.size());
-            for (auto& handle : argument_handles) {
-              argument_buffers.push_back(handle->shaped_buffer());
-            }
-
-            DeviceAssignment device_assignment =
-                client->backend()
-                    .computation_placer()
-                    ->AssignDevices(GetReplicaCount(), /*computation_count=*/1)
-                    .ConsumeValueOrDie();
-
-            ExecutableRunOptions options;
-            options.set_device_ordinal(device_ordinal);
-            options.set_allocator(client->backend().memory_allocator());
-            options.set_intra_op_thread_pool(
-                client->backend().eigen_intra_op_thread_pool_device());
-            options.set_device_assignment(&device_assignment);
-            StatusOr<ScopedShapedBuffer> result_buffer_status =
-                executable_->Run(argument_buffers, options);
-
-            results[replica] = std::move(result_buffer_status);
-          });
+                                        num_replicas - 1);
+
+    for (int replica = 0; replica < num_replicas - 1; ++replica) {
+      pool.Schedule([&execute, replica] { execute(replica); });
     }
+    execute(num_replicas - 1);
   }
 
-  for (int replica = 0; replica < GetReplicaCount(); ++replica) {
-    const auto& statusor = results[replica];
+  std::vector<LocalShapedBuffer*> wrapped_results(num_replicas);
+  for (int replica = 0; replica < num_replicas; ++replica) {
+    auto& statusor = results[replica];
     if (!statusor.ok()) {
       return InternalError(
           "Failed running replica %d (other replicas may have failed as well): "
           "%s.",
           replica, statusor.status().ToString());
     }
+    wrapped_results[replica] =
+        new LocalShapedBuffer(std::move(statusor).ValueOrDie());
   }
 
-  return new LocalShapedBuffer(std::move(results[0]).ValueOrDie());
+  return new LocalShapedBufferTuple(std::move(wrapped_results));
 }
 
 static StatusOr<Shape> GetReturnValueShape(const XlaComputation& computation) {
@@ -487,12 +552,13 @@ StatusOr<CompiledXrtComputation*> LocalComputation::CompileForXrt(
 
   xrt::XLAComputation c;
   auto config = c.mutable_config();
-  auto shapes = config->mutable_program_shape();
+  ProgramShape shapes;
   for (auto& shape : argument_shapes) {
-    *shapes->add_parameters() = shape;
+    *shapes.add_parameters() = shape;
   }
-  TF_ASSIGN_OR_RETURN(*shapes->mutable_result(), GetReturnValueShape());
-  LayoutUtil::SetToDefaultLayout(shapes);
+  TF_ASSIGN_OR_RETURN(*shapes.mutable_result(), GetReturnValueShape());
+  LayoutUtil::SetToDefaultLayout(&shapes);
+  *config->mutable_program_shape() = shapes.ToProto();
   auto snapshot = computation().Snapshot().ValueOrDie();
   *c.mutable_hlo_snapshot() = *snapshot;
 
@@ -584,9 +650,9 @@ LocalOp LocalComputationBuilder::Broadcast(
 }
 
 LocalOp LocalComputationBuilder::BroadcastInDim(
-    const LocalOp& operand, const Shape& shape,
+    const LocalOp& operand, absl::Span<const int64> out_dim_sizes,
     absl::Span<const int64> broadcast_dimensions) {
-  return xla::BroadcastInDim(operand.op(), shape, broadcast_dimensions);
+  return xla::BroadcastInDim(operand.op(), out_dim_sizes, broadcast_dimensions);
 }
 
 LocalOp LocalComputationBuilder::Pad(const LocalOp& operand,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.h b/tensorflow/compiler/xla/python/local_computation_builder.h
index 9e617c48bdc5ae4b37c1a1db9a1876bb4c0a6f0d..149e44570df5c6a3df88bbe2ffa779be47842d82 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.h
+++ b/tensorflow/compiler/xla/python/local_computation_builder.h
@@ -71,7 +71,8 @@ StatusOr<Literal> TransferFromOutfeedLocalReplica(const Shape& shape,
 class LocalShapedBuffer {
  public:
   static StatusOr<LocalShapedBuffer*> FromLiteral(
-      const Literal& argument, const absl::optional<Shape>& shape_with_layout);
+      const Literal& argument, const absl::optional<Shape>& shape_with_layout,
+      int replica_number);
 
   LocalShapedBuffer(ScopedShapedBuffer shaped_buffer);
   StatusOr<Literal> ToLiteral() const;
@@ -175,6 +176,12 @@ class CompiledLocalComputation {
   StatusOr<LocalShapedBuffer*> Execute(
       absl::Span<LocalShapedBuffer* const> argument_handles);
 
+  // Execute on many replicas. Takes a sequence of argument lists (one argument
+  // list per replica) and returns a tuple of results (one result per replica).
+  // The number of argument lists must be equal to the replica count.
+  StatusOr<LocalShapedBufferTuple*> ExecutePerReplica(
+      absl::Span<const std::vector<LocalShapedBuffer*> > argument_handles);
+
  private:
   std::unique_ptr<LocalExecutable> executable_;
 };
@@ -282,7 +289,8 @@ class LocalComputationBuilder {
   LocalOp Broadcast(const LocalOp& operand,
                     absl::Span<const int64> broadcast_sizes);
 
-  LocalOp BroadcastInDim(const LocalOp& operand, const Shape& shape,
+  LocalOp BroadcastInDim(const LocalOp& operand,
+                         absl::Span<const int64> out_dim_sizes,
                          absl::Span<const int64> broadcast_dimensions);
 
   LocalOp Pad(const LocalOp& operand, const LocalOp& padding_value,
diff --git a/tensorflow/compiler/xla/python/local_computation_builder.i b/tensorflow/compiler/xla/python/local_computation_builder.i
index feabfdb889ca055550c5d1e1c05ca47c1b0bd166..d23d693c1e5bde43b52959e4397aa311268411bb 100644
--- a/tensorflow/compiler/xla/python/local_computation_builder.i
+++ b/tensorflow/compiler/xla/python/local_computation_builder.i
@@ -363,6 +363,37 @@ tensorflow::ImportNumpy();
   $1 = temps;
 }
 
+%typemap(in) absl::Span<const std::vector<xla::swig::LocalShapedBuffer*> >
+    (std::vector<std::vector<LocalShapedBuffer*> > temps) {
+  if (!PySequence_Check($input)) {
+    PyErr_SetString(PyExc_TypeError, "Argument is not a sequence");
+    SWIG_fail;
+  }
+  const int size = PySequence_Size($input);
+  temps.reserve(size);
+  for (int i = 0; i < size; ++i) {
+    PyObject* o = PySequence_GetItem($input, i);
+    std::vector<LocalShapedBuffer*> vec;
+    const int vec_size = PySequence_Size(o);
+    vec.reserve(vec_size);
+    for (int j = 0; j < vec_size; ++j) {
+      PyObject* vec_elt = PySequence_GetItem(o, j);
+      LocalShapedBuffer* lsbp;
+      if ((SWIG_ConvertPtr(vec_elt, (void**) &lsbp, $descriptor(xla::swig::LocalShapedBuffer*),
+                           SWIG_POINTER_EXCEPTION)) == -1) {
+        Py_DECREF(vec_elt);
+        Py_DECREF(o);
+        SWIG_fail;
+      }
+      vec.push_back(lsbp);
+      Py_DECREF(vec_elt);
+    }
+    temps.push_back(vec);
+    Py_DECREF(o);
+  }
+  $1 = temps;
+}
+
 %typemap(in) absl::Span<xla::swig::XrtAllocation* const>
     (std::vector<XrtAllocation*> temps) {
   if (!PySequence_Check($input)) {
@@ -921,22 +952,22 @@ tensorflow::ImportNumpy();
     $1 = NULL;
   } else {
     if (!HandleStringAttribute($input, "generate_hlo_graph", [&](string s) {
-      build_options.set_generate_hlo_graph(std::move(s));
+      build_options.mutable_debug_options()->set_xla_generate_hlo_graph(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_optimized_hlo_proto_to", [&](string s) {
-      build_options.set_dump_optimized_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_optimized_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_unoptimized_hlo_proto_to", [&](string s) {
-      build_options.set_dump_unoptimized_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_unoptimized_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
     if (!HandleStringAttribute($input, "dump_per_pass_hlo_proto_to", [&](string s) {
-      build_options.set_dump_per_pass_hlo_proto_to(std::move(s));
+      build_options.mutable_debug_options()->set_xla_dump_per_pass_hlo_proto_to(std::move(s));
     })) {
       return nullptr;
     }
@@ -950,7 +981,7 @@ tensorflow::ImportNumpy();
         PyErr_SetString(PyExc_TypeError, "ExecutableBuildOptions.hlo_profile must be a bool or None.");
         SWIG_fail;
       }
-      build_options.set_hlo_profile(o == Py_True);
+      build_options.mutable_debug_options()->set_xla_hlo_profile(o == Py_True);
     }
     Py_DECREF(o);
 
@@ -992,11 +1023,13 @@ tensorflow::ImportNumpy();
 %unignore xla::swig::XrtAllocation;
 %unignore xla::swig::XrtAllocation::FromLiteral;
 %unignore xla::swig::XrtAllocation::ToLiteral;
+%unignore xla::swig::XrtAllocation::shape;
 %unignore xla::swig::XrtAllocationTuple;
 %unignore xla::swig::XrtAllocationTuple::Release;
 %unignore xla::swig::XrtAllocationTuple::size;
 %unignore xla::swig::CompiledLocalComputation;
 %unignore xla::swig::CompiledLocalComputation::Execute;
+%unignore xla::swig::CompiledLocalComputation::ExecutePerReplica;
 %unignore xla::swig::CompiledXrtComputation;
 %unignore xla::swig::CompiledXrtComputation::Execute;
 %unignore xla::swig::LocalComputation;
diff --git a/tensorflow/compiler/xla/python/xla_client.py b/tensorflow/compiler/xla/python/xla_client.py
index 92b0685dbba195405d78867776fe43b5f6c60f4c..c91a2aaf56dfe2127168628c78e0c4b868a28055 100644
--- a/tensorflow/compiler/xla/python/xla_client.py
+++ b/tensorflow/compiler/xla/python/xla_client.py
@@ -26,6 +26,9 @@ import os
 
 import numpy as np
 
+import six
+from six.moves import xrange
+
 from tensorflow.compiler.xla import xla_data_pb2
 from tensorflow.compiler.xla.python import pywrap_xla as c_api
 from tensorflow.compiler.xla.service import hlo_pb2
@@ -75,6 +78,13 @@ def CurrentSourceInfoMetadata(op_type=None, op_name=None, skip_frames=1):
       source_line=lineno)
 
 
+def _maybe_encode_string(s):
+  if six.PY3:
+    return s.encode('utf-8')
+  else:
+    return s
+
+
 class PaddingType(enum.Enum):
   VALID = 1
   SAME = 2
@@ -212,23 +222,33 @@ class LocalBuffer(object):
   means the referent is in device memory.
   """
 
-  def __init__(self, c_buffer, backend):
+  def __init__(self, c_buffer, backend, replica):
     self.c_buffer = c_buffer
     self._backend = backend
+    self._replica = replica
     if backend.backend_type == BackendType.XRT:
       self._delete = c_api.DeleteXrtAllocation
     else:
       self._delete = c_api.DeleteLocalShapedBuffer
 
   @staticmethod
-  def from_pyval(pyval, backend=XLA_LOCAL_BACKEND):
+  def from_pyval(pyval, replica=0, backend=XLA_LOCAL_BACKEND):
     """Allocate and copy to XLA the given python value."""
     pyval = require_numpy_array_layout(pyval)
+    num_replicas = get_replica_count()
+    if not 0 <= replica < num_replicas:
+      raise ValueError(
+          'Attempt to place buffer on replica {} when the replica count is {}'
+          .format(replica, num_replicas))
     if backend.backend_type == BackendType.XRT:
-      cbuf = c_api.XrtAllocation.FromLiteral(pyval, backend.target)
+      if replica != 0:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
+      cbuf = c_api.XrtAllocation.FromLiteral(
+          pyval, _maybe_encode_string(backend.target))
     else:
-      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None)
-    return LocalBuffer(cbuf, backend)
+      cbuf = c_api.LocalShapedBuffer.FromLiteral(pyval, None, replica)
+    return LocalBuffer(cbuf, backend, replica)
 
   def to_py(self):
     return self.c_buffer.ToLiteral()
@@ -236,6 +256,9 @@ class LocalBuffer(object):
   def shape(self):
     return _wrap_shape(self.c_buffer.shape())
 
+  def replica(self):
+    return self._replica
+
   def delete(self):
     if self.c_buffer is not None:
       self._delete(self.c_buffer)
@@ -245,14 +268,15 @@ class LocalBuffer(object):
     """Assuming a tuple buffer, unpack it into constituent tuple elements."""
     assert self.c_buffer is not None
     if self._backend.backend_type == BackendType.XRT:
-      result = c_api.DestructureXrtAllocationTuple(self.c_buffer,
-                                                   self._backend.target)
+      result = c_api.DestructureXrtAllocationTuple(
+          self.c_buffer, _maybe_encode_string(self._backend.target))
     else:
       result = c_api.DestructureLocalShapedBufferTuple(self.c_buffer)
     self.delete()
     size = result.size()
     destructured = tuple(
-        LocalBuffer(result.Release(i), backend=self._backend)
+        LocalBuffer(
+            result.Release(i), replica=self._replica, backend=self._backend)
         for i in xrange(size))
     return destructured
 
@@ -322,6 +346,9 @@ class Shape(object):
   def __ne__(self, other):
     return not self == other
 
+  def __hash__(self):
+    return hash((self._dtype, self._dimensions, self._minor_to_major))
+
   def __repr__(self):
     return ('xla_client.Shape(_dtype={!r}, _dimensions={!r}, '
             '_is_tuple={!r}, _minor_to_major={!r})').format(
@@ -541,10 +568,13 @@ class LocalComputation(object):
       ]
       result_shape = result_shape.map_leaves(layout_fn)
 
+    argument_shapes = list(argument_shapes)
+
     compile_options = compile_options or CompileOptions()
     compile_options.result_shape = result_shape
     if self._backend.backend_type == BackendType.XRT:
-      c = self.computation.CompileForXrt(argument_shapes, self._backend.target)
+      c = self.computation.CompileForXrt(
+          argument_shapes, _maybe_encode_string(self._backend.target))
     else:
       c = self.computation.Compile(argument_shapes, compile_options)
     return LocalComputation(c, is_compiled=True, backend=self._backend)
@@ -558,23 +588,87 @@ class LocalComputation(object):
         compile_options=compile_options,
         layout_fn=layout_fn)
 
-  def Execute(self, arguments=()):
-    """Execute with LocalBuffer arguments and return value."""
+  def GetReturnValueShape(self):
+    return _wrap_shape(self._c_computation.GetReturnValueShape())
+
+  def Execute(self, arguments=(), check_for_deleted_args=True):
+    """Execute on one replica with LocalBuffer arguments and return value."""
+    if check_for_deleted_args and any(arg.is_deleted() for arg in arguments):
+      raise ValueError('Executing with deleted local buffer argument')
+    raw_args = [arg.c_buffer for arg in arguments]
+    output_buffer = self._c_computation.Execute(raw_args)
+    return LocalBuffer(output_buffer, backend=self._backend, replica=0)
+
+  def ExecutePerReplica(self, arguments=None):
+    """Execute on many replicas with LocalBuffer arguments and return value.
+
+    Args:
+      arguments: A sequence of sequences of LocalBuffers. The i'th inner
+        sequence comprises the arguments for execution on the i'th replica.
+
+    Returns:
+      A list of the computation's outputs on each replica, as a LocalBuffer. If
+      a shallow sequence of arguments was passed in for `arguments`, then the
+      sole, zero'th replica's output is returned instead, as a LocalBuffer.
+    """
     if not self._is_compiled:
       raise ValueError('Cannot execute an uncompiled local XLA computation.')
-    arguments = tuple(arguments)
-    if any(arg.is_deleted() for arg in arguments):
-      raise ValueError('Executing with deleted local buffer argument')
-    return LocalBuffer(
-        self._c_computation.Execute([arg.c_buffer for arg in arguments]),
-        backend=self._backend)
+    if arguments is None:
+      arguments = ((),) * get_replica_count()
+    else:
+      arguments = [list(replica_args) for replica_args in arguments]
+
+    # Check arguments
+    for replica, replica_args in enumerate(arguments):
+      for arg in replica_args:
+        if arg.is_deleted():
+          raise ValueError('Executing with deleted local buffer argument')
+        if arg.replica() != replica:
+          raise ValueError(
+              'Executing on replica {} with argument from replica {}'.format(
+                  replica, arg.replica()))
+
+    # Pull out argument buffer handles
+    stripped_args = [
+        [arg.c_buffer for arg in replica_args] for replica_args in arguments
+    ]
+
+    # Execute
+    if self._backend.backend_type == BackendType.XRT:
+      if len(stripped_args) > 1:
+        raise NotImplementedError(
+            'Multi-replica execution is not yet supported via the XRT backend.')
+      output_buffers = [self._c_computation.Execute(stripped_args[0])]
+    else:
+      output_buffer_tup = self._c_computation.ExecutePerReplica(stripped_args)
+      size = output_buffer_tup.size()
+      output_buffers = [output_buffer_tup.Release(i) for i in xrange(size)]
+
+    # Wrap output handles in LocalBuffer instances
+    return tuple(
+        LocalBuffer(output_buffer, backend=self._backend, replica=replica)
+        for replica, output_buffer in enumerate(output_buffers))
 
   def ExecuteWithPythonValues(self, arguments=()):
-    """Execute with Python values as arguments and return value."""
-    arguments = tuple(
-        LocalBuffer.from_pyval(arg, backend=self._backend) for arg in arguments)
+    """Execute on one replica with Python values as arguments and output."""
+
+    def put(arg):
+      return LocalBuffer.from_pyval(arg, backend=self._backend)
+
+    arguments = [put(arg) for arg in arguments]
     return self.Execute(arguments).to_py()
 
+  def ExecuteWithPythonValuesPerReplica(self, arguments):
+    """Execute on many replicas with Python values as arguments and output."""
+
+    def put(arg, replica):
+      return LocalBuffer.from_pyval(arg, replica, backend=self._backend)
+
+    arguments = [[put(arg, replica)
+                  for arg in replica_args]
+                 for replica, replica_args in enumerate(arguments)]
+    return [out.to_py() for out in self.ExecutePerReplica(arguments)]
+
   def __del__(self):
     self._delete(self._c_computation)
 
@@ -761,8 +855,7 @@ class ComputationBuilder(object):
     Returns:
       A LocalOp representing the added broadcast-in-dimensions op.
     """
-    xla_shape = Shape.array_shape(self.GetShape(operand).element_type(), shape)
-    return self._client.BroadcastInDim(operand, xla_shape, broadcast_dimensions)
+    return self._client.BroadcastInDim(operand, shape, broadcast_dimensions)
 
   def Concatenate(self, operands, dimension):
     """Enqueues a concatenate operation onto the computation.
@@ -1380,6 +1473,7 @@ def initialize_platform_name(platform_name):
   Raises:
     A runtime exception if the XLA service has already been initialized.
   """
+  platform_name = _maybe_encode_string(platform_name)
   c_api.InitializePlatformName(platform_name)
 
 
diff --git a/tensorflow/compiler/xla/python_api/xla_shape.py b/tensorflow/compiler/xla/python_api/xla_shape.py
index f158f6b2410352432445f669155aff0af5526abf..95b2bf300ec67e9f034f77450416544cb088ae55 100644
--- a/tensorflow/compiler/xla/python_api/xla_shape.py
+++ b/tensorflow/compiler/xla/python_api/xla_shape.py
@@ -25,9 +25,10 @@ from tensorflow.compiler.xla.python_api import types
 
 
 class Shape(object):
-  """Wraps a xla_data_pb2.Shape message with a convenient Python type.
+  """Wraps a xla_data_pb2.ShapeProto message with a convenient Python type.
 
-  Provides direct access to the underlying xla_data_pb2.Shape message in the
+  Provides direct access to the underlying xla_data_pb2.ShapeProto message in
+  the
   message attribute, along with accessor wrappers to the message's fields.
   Avoid direct access to .message unless interacting directly with protobuf APIs
   like CopyFrom. In other words, prefer hauling the shape around in a Shape, and
@@ -48,7 +49,7 @@ class Shape(object):
     Raises:
       ValueError: if element_type is TUPLE but dimensions are not Shape objects.
     """
-    self.message = xla_data_pb2.Shape()
+    self.message = xla_data_pb2.ShapeProto()
     self.message.element_type = element_type
     if element_type == xla_data_pb2.TUPLE:
       if not all(isinstance(subshape, Shape) for subshape in dimensions):
diff --git a/tensorflow/compiler/xla/rpc/BUILD b/tensorflow/compiler/xla/rpc/BUILD
index 3abb3855a42b8b5222115262448d359da3a80e87..26affbcceb33110baf41d507173e56f8b1c8c9eb 100644
--- a/tensorflow/compiler/xla/rpc/BUILD
+++ b/tensorflow/compiler/xla/rpc/BUILD
@@ -16,7 +16,6 @@ xla_proto_library(
     use_grpc_plugin = True,
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/compiler/xla:xla_proto",
     ],
 )
diff --git a/tensorflow/compiler/xla/rpc/xla_service.proto b/tensorflow/compiler/xla/rpc/xla_service.proto
index e4f332cda22cc5b889bf73f06913b96d6091dc81..0ff8adc2acbe5fd21e85027dd63bfb14f5672a7d 100644
--- a/tensorflow/compiler/xla/rpc/xla_service.proto
+++ b/tensorflow/compiler/xla/rpc/xla_service.proto
@@ -43,7 +43,6 @@ limitations under the License.
 syntax = "proto3";
 
 import "tensorflow/compiler/xla/xla.proto";
-import "tensorflow/compiler/xla/xla_data.proto";
 
 package xla;
 
diff --git a/tensorflow/compiler/xla/service/BUILD b/tensorflow/compiler/xla/service/BUILD
index 19b5c1ca25debf80c7e712854b47384937697d3d..4c21ae2a427477caa86fb4130616c38eb3bcf006 100644
--- a/tensorflow/compiler/xla/service/BUILD
+++ b/tensorflow/compiler/xla/service/BUILD
@@ -281,10 +281,12 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",  # fixdeps: keep
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/memory",
+        "@com_google_absl//absl/strings:str_format",
     ],
 )
 
@@ -292,6 +294,7 @@ cc_library(
     name = "hlo",
     srcs = [
         "dfs_hlo_visitor.cc",
+        "dynamic_parameter_binding.cc",
         "hlo_computation.cc",
         "hlo_input_output_alias_config.cc",
         "hlo_instruction.cc",
@@ -305,6 +308,7 @@ cc_library(
     hdrs = [
         "dfs_hlo_visitor.h",
         "dfs_hlo_visitor_with_default.h",
+        "dynamic_parameter_binding.h",
         "hlo_clone_context.h",
         "hlo_computation.h",
         "hlo_domain_metadata.h",
@@ -350,6 +354,25 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "dynamic_parameter_binding_test",
+    srcs = ["dynamic_parameter_binding_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_dce",
+        ":hlo_memory_scheduler",
+        ":hlo_ordering",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
 tf_cc_test(
     name = "dfs_hlo_visitor_with_default_test",
     srcs = ["dfs_hlo_visitor_with_default_test.cc"],
@@ -387,9 +410,36 @@ tf_cc_test(
         ":hlo",
         ":pattern_matcher",
         "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:test",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+cc_library(
+    name = "pattern_matcher_gmock",
+    testonly = 1,
+    hdrs = ["pattern_matcher_gmock.h"],
+    deps = [
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/core:test",
+    ],
+)
+
+tf_cc_test(
+    name = "pattern_matcher_gmock_test",
+    srcs = ["pattern_matcher_gmock_test.cc"],
+    deps = [
+        ":hlo",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:test",
     ],
 )
 
@@ -403,6 +453,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/types:span",
     ],
@@ -1336,6 +1387,7 @@ cc_library(
         ":hlo",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -1539,7 +1591,10 @@ tf_cc_test(
         ":hlo",
         ":hlo_casting_utils",
         ":hlo_matchers",
+        ":hlo_parser",
         ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -1707,7 +1762,9 @@ cc_library(
         ":hlo",
         ":hlo_pass",
         ":hlo_query",
+        ":pattern_matcher",
         ":while_loop_analysis",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -1720,9 +1777,14 @@ tf_cc_test(
     name = "while_loop_simplifier_test",
     srcs = ["while_loop_simplifier_test.cc"],
     deps = [
+        ":algebraic_simplifier",
         ":hlo",
+        ":hlo_cse",
         ":hlo_dce",
         ":hlo_matchers",
+        ":hlo_pass",
+        ":hlo_pass_pipeline",
+        ":tuple_simplifier",
         ":while_loop_simplifier",
         "//tensorflow/compiler/xla:test",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
@@ -1848,6 +1910,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "dynamic_dimension_inference",
+    srcs = ["dynamic_dimension_inference.cc"],
+    hdrs = ["dynamic_dimension_inference.h"],
+    deps = [
+        ":hlo",
+        "//tensorflow/compiler/xla:status",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/types:span",
+    ],
+)
+
+tf_cc_test(
+    name = "dynamic_dimension_inference_test",
+    srcs = ["dynamic_dimension_inference_test.cc"],
+    deps = [
+        ":dynamic_dimension_inference",
+        "//tensorflow/compiler/xla:debug_options_flags",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla:test_helpers",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/client:xla_builder",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_matchers",
+        "//tensorflow/compiler/xla/service:hlo_runner",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "reshape_mover_test",
     srcs = ["reshape_mover_test.cc"],
@@ -2005,7 +2102,8 @@ tf_cc_test(
     srcs = ["hlo_computation_test.cc"],
     deps = [
         ":hlo",
-        ":hlo_matchers",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -2347,6 +2445,7 @@ cc_library(
         "//tensorflow/compiler/xla:util",
         "//tensorflow/compiler/xla:xla_data_proto",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -2598,8 +2697,9 @@ tf_cc_test(
         ":algebraic_simplifier",
         ":computation_layout",
         ":hlo",
-        ":hlo_matchers",
         ":layout_assignment",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_layout",
         "//tensorflow/compiler/xla:shape_util",
@@ -2610,6 +2710,7 @@ tf_cc_test(
         "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "@com_google_absl//absl/types:span",
@@ -2744,6 +2845,8 @@ tf_cc_test(
         ":hlo_matchers",
         ":hlo_parser",
         ":hlo_pass",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:test",
@@ -2855,6 +2958,46 @@ tf_cc_test(
     ],
 )
 
+cc_library(
+    name = "hlo_get_dimension_size_rewriter",
+    srcs = ["hlo_get_dimension_size_rewriter.cc"],
+    hdrs = ["hlo_get_dimension_size_rewriter.h"],
+    deps = [
+        ":hlo",
+        ":hlo_pass",
+        ":shape_inference",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/core:lib",
+        "@com_google_absl//absl/algorithm:container",
+    ],
+)
+
+tf_cc_test(
+    name = "hlo_get_dimension_size_rewriter_test",
+    srcs = ["hlo_get_dimension_size_rewriter_test.cc"],
+    deps = [
+        ":hlo",
+        ":hlo_get_dimension_size_rewriter",
+        ":hlo_matchers",
+        ":hlo_parser",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:shape_util",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:util",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:literal_test_util",
+        "//tensorflow/compiler/xla/tests:test_utils",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 cc_library(
     name = "device_memory_allocator",
     srcs = [
@@ -2913,6 +3056,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/strings",
         "@llvm//:core",
         "@llvm//:transform_utils",
@@ -3026,6 +3170,7 @@ cc_library(
         ":hlo_casting_utils",
         ":hlo_execution_profile",
         ":hlo_tfgraph_builder",
+        ":pattern_matcher",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:types",
@@ -3318,9 +3463,9 @@ cc_library(
         ":tuple_util",
         ":while_loop_analysis",
         ":while_util",
+        "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:statusor",
         "//tensorflow/compiler/xla:util",
-        "//tensorflow/core:lib",
         "@com_google_absl//absl/algorithm:container",
         "@com_google_absl//absl/container:flat_hash_map",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -3463,6 +3608,8 @@ tf_cc_test(
         ":hlo_casting_utils",
         ":hlo_matchers",
         ":hlo_parser",
+        ":pattern_matcher",
+        ":pattern_matcher_gmock",
         "//tensorflow/compiler/xla:test_helpers",
         "//tensorflow/compiler/xla:window_util",
         "//tensorflow/core:lib",
@@ -3513,6 +3660,41 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "ar_crs_combiner",
+    srcs = ["ar_crs_combiner.cc"],
+    hdrs = ["ar_crs_combiner.h"],
+    deps = [
+        ":call_graph",
+        ":pattern_matcher",
+        "//tensorflow/compiler/xla:literal",
+        "//tensorflow/compiler/xla:literal_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
+        "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla/service:hlo",
+        "//tensorflow/compiler/xla/service:hlo_pass",
+        "@com_google_absl//absl/container:flat_hash_map",
+        "@com_google_absl//absl/strings",
+    ],
+)
+
+tf_cc_test(
+    name = "ar_crs_combiner_test",
+    srcs = ["ar_crs_combiner_test.cc"],
+    deps = [
+        ":ar_crs_combiner",
+        ":hlo",
+        ":hlo_matchers",
+        "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+    ],
+)
+
 tf_cc_test(
     name = "map_inliner_test",
     srcs = ["map_inliner_test.cc"],
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.cc b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
index 89e62bd2f0dc02d2d0947ae47e3bb0c9955f103e..985c5af1c4d89425dd6693585e42e22510fe21f8 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
 
 #include <algorithm>
+#include <cmath>
 #include <iterator>
 #include <memory>
 #include <numeric>
@@ -68,6 +69,45 @@ bool IsAll(const HloInstruction* op, int8 value) {
   }
 }
 
+// Checks whether `op` is a floating-point constant or broadcast of a constant
+// of the form +/- 2^k for some integer k positive, negative, or zero.  Such
+// values are interesting because multiplying by a power of 2 just moves the
+// exponent.
+bool IsAllFpConstantPowerOf2(const HloInstruction* op) {
+  // Unwrap the broadcast if necessary.
+  const HloInstruction* c;
+  if (!Match(op, m::ConstantEffectiveScalar(&c)) &&
+      !Match(op, m::Broadcast(m::Constant(&c).WithShape(
+                     m::Shape().IsEffectiveScalar())))) {
+    return false;
+  }
+  auto val = [&]() -> absl::optional<double> {
+    switch (c->shape().element_type()) {
+      case BF16:
+        return static_cast<double>(c->literal().GetFirstElement<bfloat16>());
+      case F16:
+        return static_cast<double>(c->literal().GetFirstElement<Eigen::half>());
+      case F32:
+        return c->literal().GetFirstElement<float>();
+      case F64:
+        return c->literal().GetFirstElement<double>();
+      default:
+        // Cowardly refuse to consider complex types.
+        return absl::nullopt;
+    }
+  }();
+  if (!val) {
+    return false;
+  }
+
+  int exp;
+  double mantissa = std::frexp(*val, &exp);
+  // frexp returns a value in the range (-1; -0.5] U [0.5, 1).  A return value
+  // of +/-0.5 therefore indicates that the floating point value is a power of
+  // 2.
+  return mantissa == 0.5 || mantissa == -0.5;
+}
+
 // Returns whether the given transpose produces a result which is bit-wise
 // identical to its operand and thus may be replaced with a bitcast.
 bool TransposeIsBitcast(const HloInstruction* transpose) {
@@ -84,7 +124,8 @@ bool TransposeIsBitcast(const HloInstruction* transpose) {
 // reshape may still be a bitcast. For example, a reshape from [28x28] to [784].
 bool ReshapeOrCopyIsBitcast(
     const HloInstruction* instr,
-    const AlgebraicSimplifier::ValidBitcastCallback& valid_bitcast_callback) {
+    const AlgebraicSimplifierOptions::ValidBitcastCallback&
+        valid_bitcast_callback) {
   CHECK(HloOpcode::kReshape == instr->opcode() ||
         HloOpcode::kCopy == instr->opcode());
 
@@ -95,6 +136,11 @@ bool ReshapeOrCopyIsBitcast(
          valid_bitcast_callback(operand->shape(), instr->shape());
 }
 
+bool IsUnstridedSlice(const HloInstruction* hlo) {
+  return absl::c_all_of(hlo->slice_strides(),
+                        [](int64 stride) { return stride == 1; });
+}
+
 // AlgebraicSimplifierVisitor traverses the HLO computation and reduces certain
 // algebraic expressions to simplified forms. Note: This only supports
 // simplifications that simply look at the operands of an instruction. For the
@@ -180,21 +226,13 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   const bool changed() const { return changed_; }
 
   // Runs the visitor on a computation.
-  static bool Run(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification);
+  static bool Run(HloComputation* computation,
+                  const AlgebraicSimplifierOptions& options);
 
  private:
-  explicit AlgebraicSimplifierVisitor(
-      HloComputation* computation, bool is_layout_sensitive,
-      AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-      bool enable_dot_strength_reduction, bool enable_conv_simplification)
-      : computation_(computation),
-        is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  explicit AlgebraicSimplifierVisitor(HloComputation* computation,
+                                      const AlgebraicSimplifierOptions& options)
+      : computation_(computation), options_(options) {}
 
   // Transforms Dots where at least one input is a vector or has a degenerate
   // dimension and converts it into a multiply and reduce. This should enable
@@ -233,10 +271,10 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
                                      HloInstruction* new_instruction);
 
   // Returns whether the shape of the output of the given instructions are the
-  // same for the purposes of simplification. If is_layout_sensitive_ is true,
-  // then this tests shape equality including layout (ShapeUtil::Equal). If
-  // is_layout_sensitive_ is false, then the tests shape compatibility
-  // (ShapeUtil::Compatible).
+  // same for the purposes of simplification. If options_.is_layout_sensitive()
+  // is true, then this tests shape equality including layout
+  // (ShapeUtil::Equal). If options_.is_layout_sensitive() is false, then the
+  // tests shape compatibility (ShapeUtil::Compatible).
   bool SameShape(const HloInstruction* lhs, const HloInstruction* rhs) const;
 
   // Returns whether it was possible to transform `root` to a clamp instruction.
@@ -325,22 +363,12 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
   // traversing.
   HloComputation* computation_;
 
+  // The backend-specific options selected for the algebraic simplifier.
+  const AlgebraicSimplifierOptions& options_;
+
   // Whether algebraic simplification has occurred.
   bool changed_ = false;
 
-  // Whether layout is considered during transformation.
-  bool is_layout_sensitive_;
-
-  // Callback used to determine if a bitcast is possible.
-  AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback_;
-
-  // Disable dot strength reduction on platforms where it causes a slowdown.
-  bool enable_dot_strength_reduction_;
-
-  // Disable convolution -> dot simplification on platforms where it causes a
-  // slowdown.
-  bool enable_conv_simplification_;
-
   // Cached computation for adding two scalar F32.
   HloComputation* scalar_add_computation_ = nullptr;
 };
@@ -348,19 +376,15 @@ class AlgebraicSimplifierVisitor : public DfsHloVisitorWithDefault {
 }  // namespace
 
 bool AlgebraicSimplifierVisitor::Run(
-    HloComputation* computation, bool is_layout_sensitive,
-    AlgebraicSimplifier::ValidBitcastCallback valid_bitcast_callback,
-    bool enable_dot_strength_reduction, bool enable_conv_simplification) {
-  AlgebraicSimplifierVisitor visitor(
-      computation, is_layout_sensitive, std::move(valid_bitcast_callback),
-      enable_dot_strength_reduction, enable_conv_simplification);
+    HloComputation* computation, const AlgebraicSimplifierOptions& options) {
+  AlgebraicSimplifierVisitor visitor(computation, options);
   TF_CHECK_OK(computation->Accept(&visitor));
   return visitor.changed_;
 }
 
 bool AlgebraicSimplifierVisitor::SameShape(const HloInstruction* lhs,
                                            const HloInstruction* rhs) const {
-  if (is_layout_sensitive_) {
+  if (options_.is_layout_sensitive()) {
     return ShapeUtil::Equal(lhs->shape(), rhs->shape());
   } else {
     return ShapeUtil::Compatible(lhs->shape(), rhs->shape());
@@ -431,6 +455,40 @@ Status AlgebraicSimplifierVisitor::HandleAdd(HloInstruction* add) {
                                           sum_of_constants));
   }
 
+  // A*C + B*C => (A+B)*C
+  //
+  //  - If A, B, and C are integers, do this unconditionally. Proof of
+  //    correctness: https://rise4fun.com/Alive/u9X.
+  //
+  //  - If A, B, and C are floating point, do this if C is a scalar constant or
+  //    broadcast of scalar constant and is equal to +/- 2^k for some (possibly
+  //    negative) integer k.
+  //
+  //    Multiplying by a power of 2 just moves the exponent, so our answer is
+  //    exact modulo rounding of intermediate results so long as
+  //
+  //     - none of the three products has an exponent which underflows (so the
+  //       result is 0 or denormal), and
+  //     - none of the three products overflows to inf.
+  //
+  //    Proof: See algebraic_simplifier_proof_distributive_property.py.
+  //
+  //    We deem these differences in rounding, underflow, and overflow
+  //    acceptable in the ML context.
+  HloInstruction *b, *c;
+  if (((Match(lhs, m::Multiply(m::Op(&a), m::Op(&c))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b)))) ||
+       (Match(lhs, m::Multiply(m::Op(&c), m::Op(&a))) &&
+        Match(rhs, m::MultiplyAnyOrder(m::Op().Is(c), m::Op(&b))))) &&
+      (ShapeUtil::ElementIsIntegral(add->shape()) ||
+       IsAllFpConstantPowerOf2(c))) {
+    return ReplaceWithNewInstruction(
+        add, HloInstruction::CreateBinary(
+                 add->shape(), HloOpcode::kMultiply,
+                 computation_->AddInstruction(HloInstruction::CreateBinary(
+                     add->shape(), HloOpcode::kAdd, a, b)),
+                 c));
+  }
   return Status::OK();
 }
 
@@ -504,8 +562,8 @@ Status AlgebraicSimplifierVisitor::HandleCopy(HloInstruction* copy) {
     return Status::OK();
   }
 
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(copy, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(copy, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(copy);
   }
 
@@ -541,7 +599,74 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
     VLOG(10) << "trying to replace " << concatenate->ToString() << " with "
              << replacement->ToString();
     ReplaceInstructionIfSameShape(concatenate, replacement);
-  } else if (operands.size() == 2) {
+    return Status::OK();
+  }
+
+  // Check if we can merge "adjacent" slice operands which take slices from the
+  // same other op. For simplicity we only merge unstrided slices.
+  int64 concatenate_dimension = concatenate->concatenate_dimension();
+  for (int64 i = 0; i < operands.size(); ++i) {
+    if (operands[i]->opcode() != HloOpcode::kSlice ||
+        !IsUnstridedSlice(operands[i])) {
+      continue;
+    }
+    int64 slice_end = operands[i]->slice_limits(concatenate_dimension);
+    HloInstruction* slice_operand = operands[i]->mutable_operand(0);
+    int64 j = i + 1;
+    while (j < operands.size() && operands[j]->opcode() == HloOpcode::kSlice &&
+           IsUnstridedSlice(operands[j]) &&
+           operands[j]->operand(0) == slice_operand &&
+           operands[j]->slice_starts(concatenate_dimension) == slice_end) {
+      // Check that all the slice_start values are the same in all other
+      // dimensions. This implies that the slice_limit values are also the same,
+      // because operands of concatenate need to have the same shape, and we
+      // already checked that the slices are unstrided.
+      bool same_other_starts = true;
+      for (int64 k = 0; k < operands[j]->slice_starts().size(); ++k) {
+        if (k == concatenate_dimension) {
+          continue;
+        }
+        if (operands[i]->slice_starts(k) != operands[j]->slice_starts(k)) {
+          same_other_starts = false;
+          break;
+        }
+      }
+      if (!same_other_starts) {
+        break;
+      }
+      slice_end = operands[j]->slice_limits(concatenate_dimension);
+      ++j;
+    }
+    if (j - i > 1) {
+      Shape new_slice_shape = operands[i]->shape();
+      new_slice_shape.set_dimensions(
+          concatenate_dimension,
+          slice_end - operands[i]->slice_starts(concatenate_dimension));
+      auto new_limit_indices = operands[i]->slice_limits();
+      new_limit_indices[concatenate_dimension] = slice_end;
+      auto new_slice_op =
+          computation_->AddInstruction(HloInstruction::CreateSlice(
+              new_slice_shape, slice_operand,
+              /*start_indices=*/operands[i]->slice_starts(),
+              /*limit_indices=*/new_limit_indices,
+              /*strides=*/operands[i]->slice_strides()));
+      std::vector<HloInstruction*> new_operands;
+      for (int64 k = 0; k < i; ++k) {
+        new_operands.push_back(operands[k]);
+      }
+      new_operands.push_back(new_slice_op);
+      for (int64 k = j; k < operands.size(); ++k) {
+        new_operands.push_back(operands[k]);
+      }
+      auto replacement =
+          computation_->AddInstruction(concatenate->CloneWithNewOperands(
+              concatenate->shape(), new_operands));
+      ReplaceInstructionIfSameShape(concatenate, replacement);
+      return Status::OK();
+    }
+  }
+
+  if (operands.size() == 2) {
     // A binary concat with a broadcasted scalar as an operand can be converted
     // into a pad which is simpler to fold into other operations.
     bool is_effective_low_pad = Match(
@@ -557,7 +682,7 @@ Status AlgebraicSimplifierVisitor::HandleConcatenate(
       padding_config_dim->set_edge_padding_high(0);
       padding_config_dim->set_edge_padding_low(0);
       padding_config_dim->set_interior_padding(0);
-      if (dim == concatenate->concatenate_dimension()) {
+      if (dim == concatenate_dimension) {
         if (is_effective_low_pad) {
           padding_config_dim->set_edge_padding_low(
               operands[0]->shape().dimensions(dim));
@@ -1215,7 +1340,8 @@ Status AlgebraicSimplifierVisitor::HandleDot(HloInstruction* dot) {
     return ReplaceInstruction(dot, dot_of_gather_optimized);
   }
 
-  if (enable_dot_strength_reduction_ && !is_layout_sensitive_) {
+  if (options_.enable_dot_strength_reduction() &&
+      !options_.is_layout_sensitive()) {
     TF_ASSIGN_OR_RETURN(bool did_strength_reduction,
                         HandleDotStrengthReduction(dot));
     if (did_strength_reduction) {
@@ -1619,6 +1745,27 @@ Status AlgebraicSimplifierVisitor::HandlePad(HloInstruction* pad) {
         pad, HloInstruction::CreateBroadcast(pad->shape(),
                                              pad->mutable_operand(1), {}));
   }
+
+  // Interior padding on one sized dimensions have no effect. As a result it
+  // makes other simplifications possible if there is no interior padding.
+  if (HasInteriorPadding(pad->padding_config())) {
+    PaddingConfig padding_config = pad->padding_config();
+    bool cleared_interior_padding = false;
+    for (int64 i = 0; i < ShapeUtil::Rank(pad->shape()); ++i) {
+      if (padding_config.dimensions(i).interior_padding() > 0 &&
+          pad->operand(0)->shape().dimensions(i) == 1) {
+        cleared_interior_padding = true;
+        padding_config.mutable_dimensions(i)->set_interior_padding(0);
+      }
+    }
+    if (cleared_interior_padding) {
+      return ReplaceWithNewInstruction(
+          pad,
+          HloInstruction::CreatePad(pad->shape(), pad->mutable_operand(0),
+                                    pad->mutable_operand(1), padding_config));
+    }
+  }
+
   // Eliminate nop pads (padding all zero), and replace a pad with negative
   // padding with a pad with non-negative padding followed by a slice.
   bool all_zero = true;
@@ -1910,8 +2057,8 @@ Status AlgebraicSimplifierVisitor::HandleReshape(HloInstruction* reshape) {
   }
 
   // Make this a bitcast if possible.
-  if (is_layout_sensitive_ &&
-      ReshapeOrCopyIsBitcast(reshape, valid_bitcast_callback_)) {
+  if (options_.is_layout_sensitive() &&
+      ReshapeOrCopyIsBitcast(reshape, options_.valid_bitcast_callback())) {
     ReplaceWithBitcast(reshape);
     return Status::OK();
   }
@@ -2030,11 +2177,6 @@ StatusOr<bool> AlgebraicSimplifierVisitor::TrySimplifyScalarSlice(
   return false;
 }
 
-bool IsUnstridedSlice(const HloInstruction* hlo) {
-  return absl::c_all_of(hlo->slice_strides(),
-                        [](int64 stride) { return stride == 1; });
-}
-
 StatusOr<bool> AlgebraicSimplifierVisitor::TryToReorderSliceAndReshape(
     HloInstruction* slice) {
   CHECK_EQ(slice->opcode(), HloOpcode::kSlice);
@@ -2501,6 +2643,108 @@ Status AlgebraicSimplifierVisitor::HandleSort(HloInstruction* sort) {
     return ReplaceWithNewInstruction(
         sort, HloInstruction::CreateTuple(sort->operands()));
   }
+  if (!options_.enable_permutation_sort_replacement()) {
+    return Status::OK();
+  }
+  // Check if we are sorting a permutation. In that case, we know that the keys
+  // will be sorted to the identity permutation, and we can represent the
+  // changes to the 'values' parameter as a scatter.
+  if (sort->operand_count() == 2 &&
+      operand->opcode() == HloOpcode::kGetTupleElement) {
+    const HloInstruction* other_sort = operand->operand(0);
+    // Check whether the 'values' parameter is the result of another sort with
+    // the same sort dimension.
+    if (other_sort->opcode() == HloOpcode::kSort &&
+        other_sort->operand_count() >= 2 &&
+        other_sort->dimensions(0) == dimension_to_sort &&
+        other_sort->operand(operand->tuple_index())->opcode() ==
+            HloOpcode::kIota) {
+      auto* iota =
+          Cast<HloIotaInstruction>(other_sort->operand(operand->tuple_index()));
+      // The sort operand needs to be an integral iota, and the iota dimension
+      // needs to be the dimension that was sorted.
+      if (iota->iota_dimension() == dimension_to_sort &&
+          ShapeUtil::ElementIsIntegral(iota->shape())) {
+        // We use the following construction method for a Scatter that applies
+        // the permutation from 'keys' to the 'values' parameter.
+        // - Take the "keys" parameter of the second sort and reshape it to have
+        //   another "1" dimension at the end.
+        // - Concatenate it with iotas of the same extended shape with all
+        //   different iota_dimensions except the dimension_to_sort in the order
+        //   of iota_dimensions/dimension_to_sort, so e.g. with rank 3 and
+        //   dimension_to_sort = 1, we would have concatenate of (iota with
+        //   iota_dimension=0, keys, iota with iota_dimension = 2)
+        // - Use this as the indices parameter of scatter, and set updates
+        //   of the scatter to be a reshaped 'values' parameter of sort (adding
+        //   'rank' many 1 dimensions at the end).
+        int64 rank = ShapeUtil::Rank(operand->shape());
+        Shape extended_shape = operand->shape();
+        extended_shape.add_dimensions(1);
+        extended_shape.mutable_layout()->add_minor_to_major(rank);
+        auto reshaped_permutation = computation_->AddInstruction(
+            HloInstruction::CreateReshape(extended_shape, operand));
+        std::vector<HloInstruction*> concat_operands;
+        for (int64 i = 0; i < rank; ++i) {
+          if (i == dimension_to_sort) {
+            concat_operands.push_back(reshaped_permutation);
+          } else {
+            concat_operands.push_back(computation_->AddInstruction(
+                HloInstruction::CreateIota(extended_shape, i)));
+          }
+        }
+        Shape concat_shape = operand->shape();
+        concat_shape.add_dimensions(rank);
+        concat_shape.mutable_layout()->add_minor_to_major(rank);
+        auto scatter_indices =
+            rank > 1 ? computation_->AddInstruction(
+                           HloInstruction::CreateConcatenate(
+                               concat_shape, concat_operands, rank))
+                     : reshaped_permutation;
+
+        // We don't care about the operand, it will be completely overridden by
+        // the updates.
+        auto scatter_operand = computation_->AddInstruction(
+            HloInstruction::CreateIota(sort->operand(1)->shape(), 0));
+
+        // Construct the updates operand of scatter.
+        Shape update_shape = sort->operand(1)->shape();
+        for (int64 i = 0; i < rank; ++i) {
+          update_shape.add_dimensions(1);
+          update_shape.mutable_layout()->add_minor_to_major(rank + i);
+        }
+        auto scatter_updates =
+            computation_->AddInstruction(HloInstruction::CreateReshape(
+                update_shape, sort->mutable_operand(1)));
+
+        // Construct the updates computation, which simply replaces the operand
+        // values with the update values.
+        HloComputation::Builder b("update_replace_computation");
+        Shape scalar_shape = ShapeUtil::MakeShape(S32, {});
+        b.AddInstruction(
+            HloInstruction::CreateParameter(0, scalar_shape, "scalar_lhs"));
+        auto scalar_rhs = b.AddInstruction(
+            HloInstruction::CreateParameter(1, scalar_shape, "scalar_rhs"));
+        auto update_replace_computation =
+            computation_->parent()->AddEmbeddedComputation(b.Build(scalar_rhs));
+
+        ScatterDimensionNumbers dim_numbers;
+        dim_numbers.set_index_vector_dim(rank);
+        for (int64 i = 0; i < rank; ++i) {
+          dim_numbers.add_update_window_dims(rank + i);
+          dim_numbers.add_scatter_dims_to_operand_dims(i);
+        }
+        auto scatter =
+            computation_->AddInstruction(HloInstruction::CreateScatter(
+                sort->operand(1)->shape(), scatter_operand, scatter_indices,
+                scatter_updates, update_replace_computation, dim_numbers));
+        return ReplaceWithNewInstruction(
+            sort, HloInstruction::CreateTuple(
+                      {computation_->AddInstruction(HloInstruction::CreateIota(
+                           operand->shape(), dimension_to_sort)),
+                       scatter}));
+      }
+    }
+  }
   return Status::OK();
 }
 
@@ -2525,7 +2769,7 @@ Status AlgebraicSimplifierVisitor::HandleTranspose(HloInstruction* transpose) {
     return ReplaceInstruction(transpose, operand);
   }
 
-  if (is_layout_sensitive_ && TransposeIsBitcast(transpose)) {
+  if (options_.is_layout_sensitive() && TransposeIsBitcast(transpose)) {
     ReplaceWithBitcast(transpose);
     return Status::OK();
   }
@@ -2674,13 +2918,13 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   const ConvolutionDimensionNumbers& dnums =
       convolution->convolution_dimension_numbers();
 
-  if (!enable_conv_simplification_) {
+  if (!options_.enable_conv_simplification()) {
     return false;
   }
 
   // TODO(b/31337498): For now, we cowardly refuse to do this optimization in
   // layout-insensitive mode, for fear of adding nontrivial reshapes.
-  if (!is_layout_sensitive_) {
+  if (!options_.is_layout_sensitive()) {
     return false;
   }
 
@@ -2770,9 +3014,9 @@ StatusOr<bool> AlgebraicSimplifierVisitor::SimplifyConvToDot(
   // We cannot insert bitcasts if the layouts will not be compatible.
   // TODO(b/33178038): Consider inserting a transpose if a bitcast would be
   // invalid.
-  if (!valid_bitcast_callback_(input_shape, new_input_shape) ||
-      !valid_bitcast_callback_(filter_shape, new_filter_shape) ||
-      !valid_bitcast_callback_(dot_output_shape, convolution_shape)) {
+  if (!options_.valid_bitcast_callback()(input_shape, new_input_shape) ||
+      !options_.valid_bitcast_callback()(filter_shape, new_filter_shape) ||
+      !options_.valid_bitcast_callback()(dot_output_shape, convolution_shape)) {
     return false;
   }
 
@@ -2878,9 +3122,7 @@ StatusOr<bool> AlgebraicSimplifier::Run(HloModule* module) {
                  "AlgebraicSimplifier::Run(), before:\n" + module->ToString());
   bool changed = false;
   for (auto* comp : module->MakeNonfusionComputations()) {
-    if (AlgebraicSimplifierVisitor::Run(
-            comp, is_layout_sensitive_, valid_bitcast_callback_,
-            enable_dot_strength_reduction_, enable_conv_simplification_)) {
+    if (AlgebraicSimplifierVisitor::Run(comp, options_)) {
       changed = true;
     }
   }
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier.h b/tensorflow/compiler/xla/service/algebraic_simplifier.h
index 9f8d0ee88bdebcf17310cd0407b1b99e4b0a7b5f..d2775b9fafa7e4c625f5d181114e80e7369f9c78 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier.h
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier.h
@@ -23,8 +23,7 @@ limitations under the License.
 
 namespace xla {
 
-// A pass which performs algebraic simplifications.
-class AlgebraicSimplifier : public HloModulePass {
+class AlgebraicSimplifierOptions {
  public:
   // Given shapes 'from_shape' and 'to_shape', determines if it is valid to
   // bitcast from 'from_shape' to 'to_shape' after considering platform
@@ -34,18 +33,63 @@ class AlgebraicSimplifier : public HloModulePass {
   using ValidBitcastCallback =
       std::function<bool(const Shape& from_shape, const Shape& to_shape)>;
 
+  explicit AlgebraicSimplifierOptions(
+      ValidBitcastCallback valid_bitcast_callback)
+      : valid_bitcast_callback_(std::move(valid_bitcast_callback)) {}
+  // If valid_bitcast_callback returns true, then the pass will replace reshapes
+  // and transposes with bitcasts.
+  const ValidBitcastCallback& valid_bitcast_callback() const {
+    return valid_bitcast_callback_;
+  }
+
+  // If is_layout_sensitive is true, then the simplifier preserves layout during
+  // transformation. Otherwise, layout is ignored.
+  void set_is_layout_sensitive(bool is_layout_sensitive) {
+    is_layout_sensitive_ = is_layout_sensitive;
+  }
+  bool is_layout_sensitive() const { return is_layout_sensitive_; }
+
+  // Enable dot simplification on platforms where it is profitable.
+  void set_enable_dot_strength_reduction(bool enable_dot_strength_reduction) {
+    enable_dot_strength_reduction_ = enable_dot_strength_reduction;
+  }
+  bool enable_dot_strength_reduction() const {
+    return enable_dot_strength_reduction_;
+  }
+
+  // Enable convolution simplification on platforms where it is profitable.
+  void set_enable_conv_simplification(bool enable_conv_simplification) {
+    enable_conv_simplification_ = enable_conv_simplification;
+  }
+  bool enable_conv_simplification() const {
+    return enable_conv_simplification_;
+  }
+
+  // If enable_permutation_sort_replacement is true, a sort op that is known to
+  // sort a permutation will be replaced with a scatter op.
+  void set_enable_permutation_sort_replacement(
+      bool enable_permutation_sort_replacement) {
+    enable_permutation_sort_replacement_ = enable_permutation_sort_replacement;
+  }
+  bool enable_permutation_sort_replacement() const {
+    return enable_permutation_sort_replacement_;
+  }
+
+ private:
+  ValidBitcastCallback valid_bitcast_callback_;
+  bool is_layout_sensitive_{false};
+  bool enable_dot_strength_reduction_{true};
+  bool enable_conv_simplification_{true};
+  bool enable_permutation_sort_replacement_{false};
+};
+
+// A pass which performs algebraic simplifications.
+class AlgebraicSimplifier : public HloModulePass {
+ public:
   // If is_layout_sensitive is true, then the simplifier preserves layout during
-  // transformation. Otherwise, layout is ignored. If valid_bitcast_callback
-  // returns true, then the pass will replace reshapes and transposes with
-  // bitcasts.
-  AlgebraicSimplifier(bool is_layout_sensitive,
-                      ValidBitcastCallback valid_bitcast_callback,
-                      bool enable_dot_strength_reduction = true,
-                      bool enable_conv_simplification = true)
-      : is_layout_sensitive_(is_layout_sensitive),
-        valid_bitcast_callback_(std::move(valid_bitcast_callback)),
-        enable_dot_strength_reduction_(enable_dot_strength_reduction),
-        enable_conv_simplification_(enable_conv_simplification) {}
+  // transformation. Otherwise, layout is ignored.
+  explicit AlgebraicSimplifier(const AlgebraicSimplifierOptions& options)
+      : options_(options) {}
   ~AlgebraicSimplifier() override = default;
   absl::string_view name() const override { return "algsimp"; }
 
@@ -54,14 +98,7 @@ class AlgebraicSimplifier : public HloModulePass {
   StatusOr<bool> Run(HloModule* module) override;
 
  private:
-  bool is_layout_sensitive_;
-  ValidBitcastCallback valid_bitcast_callback_;
-
-  // Enable dot simplification on platforms where it is profitable.
-  bool enable_dot_strength_reduction_;
-
-  // Enable convolution simplification on platforms where it is profitable.
-  bool enable_conv_simplification_;
+  AlgebraicSimplifierOptions options_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
new file mode 100644
index 0000000000000000000000000000000000000000..5da13da041b4ded813876af7ca379025187545ab
--- /dev/null
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_proof_distributive_property.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Proof that transforming (A*C)+(B*C) <=> (A+B)*C is "safe" if C=2^k.
+
+Specifically, for all floating-point values A, B, and C, if
+
+ - C is equal to +/- 2^k for some (possibly negative) integer k, and
+ - A, B, C, A*C, B*C, and A+B are not subnormal, zero, or inf,
+
+then there exists a rounding mode rm in [RTZ, RNE] such that
+
+ (A*C) + (B*C) == (A+B) * C  (computed with rounding mode rm).
+
+Informally, this means that the equivalence holds for powers of 2 C, modulo
+flushing to zero or inf, and modulo rounding of intermediate results.
+
+Requires z3 python bindings; try `pip install z3-solver`.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import z3
+
+# We do float16 because it lets the solver run much faster.  These results
+# should generalize to fp32 and fp64, and you can verify this by changing the
+# value of FLOAT_TY (and then waiting a while).
+FLOAT_TY = z3.Float16
+
+a = z3.FP("a", FLOAT_TY())
+b = z3.FP("b", FLOAT_TY())
+c = z3.FP("c", FLOAT_TY())
+
+s = z3.Solver()
+
+# C must be a power of 2, i.e. significand bits must all be 0.
+s.add(z3.Extract(FLOAT_TY().sbits() - 1, 0, z3.fpToIEEEBV(c)) == 0)
+
+for rm in [z3.RTZ(), z3.RNE()]:
+  z3.set_default_rounding_mode(rm)
+  before = a * c + b * c
+  after = (a + b) * c
+
+  # Check that before == after, allowing that 0 == -0.
+  s.add(
+      z3.Not(
+          z3.Or(
+              before == after,  #
+              z3.And(z3.fpIsZero(before), z3.fpIsZero(after)))))
+
+  for x in [
+      (a * c),
+      (b * c),
+      (a + b),
+  ]:
+    s.add(z3.Not(z3.fpIsSubnormal(x)))
+    s.add(z3.Not(z3.fpIsZero(x)))
+    s.add(z3.Not(z3.fpIsInf(x)))
+
+if s.check() == z3.sat:
+  m = s.model()
+  print("Counterexample found!")
+  print(m)
+  print("a*c:       ", z3.simplify(m[a] * m[c]))
+  print("b*c:       ", z3.simplify(m[b] * m[c]))
+  print("a+b:       ", z3.simplify(m[a] + m[b]))
+  print("a*c + b*c: ", z3.simplify(m[a] * m[c] + m[b] * m[c]))
+  print("(a+b) * c: ", z3.simplify((m[a] + m[b]) * m[c]))
+else:
+  print("Proved!")
diff --git a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
index e4c4da1b0e7aef0e3476e4d232e410da25794e13..14ce519b6a0fd221070006d336d23bddeb6cd621 100644
--- a/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/algebraic_simplifier_test.cc
@@ -27,9 +27,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
@@ -42,18 +44,20 @@ namespace xla {
 namespace {
 
 using ::testing::ElementsAre;
+namespace m = match;
 
-namespace op = xla::testing::opcode_matchers;
-
-AlgebraicSimplifier::ValidBitcastCallback bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback bitcasting_callback() {
   return [](const Shape&, const Shape&) { return true; };
 }
 
-AlgebraicSimplifier::ValidBitcastCallback non_bitcasting_callback() {
+AlgebraicSimplifierOptions::ValidBitcastCallback non_bitcasting_callback() {
   return [](const Shape&, const Shape&) { return false; };
 }
 
-class AlgebraicSimplifierTest : public HloTestBase {};
+class AlgebraicSimplifierTest : public HloTestBase {
+ protected:
+  AlgebraicSimplifierOptions default_options_{non_bitcasting_callback()};
+};
 
 // Test that A + 0 is simplified to A
 TEST_F(AlgebraicSimplifierTest, AddZero) {
@@ -70,13 +74,134 @@ TEST_F(AlgebraicSimplifierTest, AddZero) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
 }
 
+TEST_F(AlgebraicSimplifierTest, FactorIntegerAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = s32[8] parameter(0)
+      p1 = s32[8] parameter(1)
+      p2 = s32[8] parameter(2)
+      x = s32[8] multiply(p0, p2)
+      y = s32[8] multiply(p1, p2)
+      ROOT sum = s32[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  EXPECT_THAT(
+      m->entry_computation()->root_instruction(),
+      GmockMatch(m::MultiplyAnyOrder(
+          m::AddAnyOrder(m::Parameter(0), m::Parameter(1)), m::Parameter(2))));
+}
+
+// A*C + B*C => (A+B)*C if C is a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAddition) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.125)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::ConstantScalar(0.125))));
+}
+
+// A*C + B*C => (A+B)*C if C is a broadcast of a floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionWithBroadcast) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[4] parameter(0)
+      p1 = f32[4] parameter(1)
+      c = f32[] constant(0.125)
+      b = f32[4] broadcast(c), dimensions={}
+      x = f32[4] multiply(p0, b)
+      y = f32[4] multiply(p1, b)
+      ROOT sum = f32[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if C is not a
+// floating-point power of 2.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionNotPowerOf2) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = f32[] parameter(0)
+      p1 = f32[] parameter(1)
+      c = f32[] constant(0.3)
+      x = f32[] multiply(p0, c)
+      y = f32[] multiply(p1, c)
+      ROOT sum = f32[] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification should not happen if A, B, and C are
+// complex numbers.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionComplex) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = c64[8] parameter(0)
+      p1 = c64[8] parameter(1)
+      p2 = c64[8] parameter(2)
+      x = c64[8] multiply(p0, p2)
+      y = c64[8] multiply(p1, p2)
+      ROOT sum = c64[8] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  EXPECT_FALSE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+}
+
+// A*C + B*C => (A+B)*C simplification is OK if A, B, and C are complex.
+TEST_F(AlgebraicSimplifierTest, FactorFpAdditionBfloat16) {
+  const char* kModuleStr = R"(
+    HloModule m
+    test {
+      p0 = bf16[4] parameter(0)
+      p1 = bf16[4] parameter(1)
+      c = bf16[] constant(0.125)
+      b = bf16[4] broadcast(c), dimensions={}
+      x = bf16[4] multiply(p0, b)
+      y = bf16[4] multiply(p1, b)
+      ROOT sum = bf16[4] add(x, y)
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto m, ParseAndReturnVerifiedModule(kModuleStr));
+  ASSERT_TRUE(AlgebraicSimplifier(default_options_).Run(m.get()).ValueOrDie());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::MultiplyAnyOrder(
+                  m::AddAnyOrder(m::Parameter(0), m::Parameter(1)),
+                  m::Broadcast(m::ConstantScalar(0.125)))));
+}
+
 // Test that A * 0 is simplified to 0
 TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto m = CreateNewVerifiedModule();
@@ -92,8 +217,7 @@ TEST_F(AlgebraicSimplifierTest, MulZero) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMultiply);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), zero);
 }
@@ -115,8 +239,7 @@ TEST_F(AlgebraicSimplifierTest, SelectTrue) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param0);
 }
@@ -138,8 +261,7 @@ TEST_F(AlgebraicSimplifierTest, SelectFalse) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param1);
 }
@@ -159,8 +281,7 @@ TEST_F(AlgebraicSimplifierTest, SelectIdentical) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSelect);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_EQ(computation->root_instruction(), param1);
 }
@@ -196,11 +317,10 @@ TEST_F(AlgebraicSimplifierTest, TwoReducesToOne) {
   builder.AddInstruction(HloInstruction::CreateReduce(r1f32, reduce0, zero,
                                                       dims1, add_computation));
   m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   HloInstruction* root = m->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reduce(param, zero));
+  EXPECT_THAT(root, GmockMatch(m::Reduce(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_EQ(root->dimensions(), std::vector<int64>({0, 2, 3}));
 }
 
@@ -219,11 +339,10 @@ TEST_F(AlgebraicSimplifierTest, AddConstOnLHS) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Constant())));
 }
 
 // Test that [(A + C1) + C2] => [A + (C1 + C2)] for constants C1 and C2.
@@ -246,11 +365,12 @@ TEST_F(AlgebraicSimplifierTest, AddReassociateMergeConstants) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Add(constant1, constant2)));
+  EXPECT_THAT(root, GmockMatch(m::Add(
+                        m::Op().Is(param0),
+                        m::Add(m::Op().Is(constant1), m::Op().Is(constant2)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
@@ -269,8 +389,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR0Operand) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -306,11 +425,11 @@ TEST_F(AlgebraicSimplifierTest, InlineTrivialMap) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kMap);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Broadcast(zero)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Broadcast(m::Op().Is(zero)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
@@ -329,8 +448,7 @@ TEST_F(AlgebraicSimplifierTest, AddBroadcastZeroR1Operand) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAdd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -344,12 +462,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(3.14f, root->operand(0)->literal().GetFirstElement<float>());
 }
 
@@ -361,12 +478,11 @@ TEST_F(AlgebraicSimplifierTest, ConstantNotToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
@@ -377,12 +493,11 @@ TEST_F(AlgebraicSimplifierTest, IotaToBroadcast) {
 
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
 }
 
 // Test that A - 0 is simplified to A
@@ -400,8 +515,7 @@ TEST_F(AlgebraicSimplifierTest, SubZero) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -422,11 +536,11 @@ TEST_F(AlgebraicSimplifierTest, SubConstCanonicalization) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kSubtract);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param0, op::Negate(constant)));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0),
+                                      m::Negate(m::Op().Is(constant)))));
 }
 
 // Test that (A/B)/C is simplified to A/(B*C).
@@ -448,14 +562,16 @@ TEST_F(AlgebraicSimplifierTest, LhsDivOfDiv) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Divide(param0, param1), param2));
+              GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                                   m::Parameter(2))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Multiply(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/(B/C) is simplified to (A*C)/B.
@@ -476,15 +592,18 @@ TEST_F(AlgebraicSimplifierTest, RhsDivOfDiv) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Divide(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Divide(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Multiply(param0, param2), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(2)),
+                           m::Parameter(1))));
 }
 
 // Test that (A/B)/(C/D) is simplified to (A*D)/(B*C).
@@ -511,15 +630,16 @@ TEST_F(AlgebraicSimplifierTest, DivOfDivAndDiv) {
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Divide(param0, param1), op::Divide(param2, param3)));
+      GmockMatch(m::Divide(m::Divide(m::Parameter(0), m::Parameter(1)),
+                           m::Divide(m::Parameter(2), m::Parameter(3)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Divide(op::Multiply(param0, param3), op::Multiply(param1, param2)));
+      GmockMatch(m::Divide(m::Multiply(m::Parameter(0), m::Parameter(3)),
+                           m::Multiply(m::Parameter(1), m::Parameter(2)))));
 }
 
 // Test that A/exp(B) is simplified to A*exp(-B).
@@ -539,14 +659,14 @@ TEST_F(AlgebraicSimplifierTest, DivOfExp) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Exp(param1)));
+              GmockMatch(m::Divide(m::Parameter(0), m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Exp(op::Negate(param1))));
+              GmockMatch(m::Multiply(m::Parameter(0),
+                                     m::Exp(m::Negate(m::Parameter(1))))));
 }
 
 // Test that A/pow(B,C) is simplified to A*pow(B,-C).
@@ -567,15 +687,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfPower) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // Test that broadcasting is done on the right step when simplifying A/pow(B,C)
@@ -597,15 +720,18 @@ TEST_F(AlgebraicSimplifierTest, DivOfBroadcastingPower) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(param0, op::Power(param1, param2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Parameter(0),
+                           m::Power(m::Parameter(1), m::Parameter(2)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   ASSERT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Power(param1, op::Negate(param2))));
+              GmockMatch(m::Multiply(
+                  m::Parameter(0),
+                  m::Power(m::Parameter(1), m::Negate(m::Parameter(2))))));
 }
 
 // A / Const => A * InvertedConst
@@ -623,12 +749,11 @@ TEST_F(AlgebraicSimplifierTest, DivideByConstant) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(param0, op::Constant()));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Constant())));
 }
 
 // pow(pow(A, X), Y) => pow(A, X*Y)
@@ -648,11 +773,12 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPower) {
                                                       inner_power, exp2));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(),
-              op::Power(base, op::Multiply(exp1, exp2)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Power(m::Op().Is(base),
+                          m::Multiply(m::Op().Is(exp1), m::Op().Is(exp2)))));
 }
 
 // Don't simplify pow(pow(A, X), Y) => pow(A, X*Y) if X and Y are complex
@@ -673,8 +799,7 @@ TEST_F(AlgebraicSimplifierTest, PowerOfPowerComplex) {
                                                       inner_power, exp2));
 
   m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 }
 
@@ -693,8 +818,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneScalar) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -715,8 +839,7 @@ TEST_F(AlgebraicSimplifierTest, DivOneArray) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, div);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -740,8 +863,7 @@ TEST_F(AlgebraicSimplifierTest, ComplexOfRealImagC) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, cplx);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -765,8 +887,7 @@ TEST_F(AlgebraicSimplifierTest, RealOfComplex) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, real);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -790,8 +911,7 @@ TEST_F(AlgebraicSimplifierTest, ImagOfComplex) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, imag);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param1);
@@ -818,11 +938,10 @@ TEST_F(AlgebraicSimplifierTest, SelectMakeTuple) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, add);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Add(param1, param2));
+  EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(1), m::Parameter(2))));
 }
 
 // Test that exp(A)/exp(B) is simplified to exp(A-B)
@@ -843,15 +962,16 @@ TEST_F(AlgebraicSimplifierTest, ExpDiv) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Divide(op::Exp(param0), op::Exp(param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Divide(m::Exp(m::Parameter(0)), m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Subtract(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Subtract(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that exp(A)*exp(B) is simplified to exp(A+B)
@@ -873,14 +993,14 @@ TEST_F(AlgebraicSimplifierTest, ExpMul) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Exp(param0), op::Exp(param1)));
+              GmockMatch(m::Multiply(m::Exp(m::Parameter(0)),
+                                     m::Exp(m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Add(param0, param1)));
+              GmockMatch(m::Exp(m::Add(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that pow(exp(A), B) is simplified to exp(A*B)
@@ -900,14 +1020,14 @@ TEST_F(AlgebraicSimplifierTest, PowExp) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Power(op::Exp(param0), param1));
+              GmockMatch(m::Power(m::Exp(m::Parameter(0)), m::Parameter(1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Exp(op::Multiply(param0, param1)));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Exp(m::Multiply(m::Parameter(0), m::Parameter(1)))));
 }
 
 // Test that ln(pow(A, B)) is simplified to ln(A)*B
@@ -927,14 +1047,14 @@ TEST_F(AlgebraicSimplifierTest, LnPow) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Power(param0, param1)));
+              GmockMatch(m::Log(m::Power(m::Parameter(0), m::Parameter(1)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Log(param0), param1));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Multiply(m::Log(m::Parameter(0)), m::Parameter(1))));
 }
 
 // Test that ln(exp(A)) is simplified to A
@@ -951,10 +1071,10 @@ TEST_F(AlgebraicSimplifierTest, LnExp) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Log(op::Exp(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Log(m::Exp(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
@@ -981,13 +1101,14 @@ TEST_F(AlgebraicSimplifierTest, LnExpDiv) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Log(op::Divide(op::Exp(param0), op::Exp(param1))));
+              GmockMatch(m::Log(m::Divide(m::Exp(m::Parameter(0)),
+                                          m::Exp(m::Parameter(1))))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Subtract(param0, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Subtract(m::Parameter(0), m::Parameter(1))));
 }
 
 // Test that pow(A, 0) where A is a scalar is simplified to the scalar
@@ -1005,14 +1126,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Scalar) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_EQ(root->literal().GetFirstElement<float>(), 1);
 }
 
@@ -1030,14 +1151,14 @@ TEST_F(AlgebraicSimplifierTest, Pow0Vector) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast());
+  EXPECT_THAT(root, GmockMatch(m::Broadcast()));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), r1f32))
       << ShapeUtil::HumanString(root->shape());
   EXPECT_EQ(root->dimensions().size(), 0);
@@ -1059,10 +1180,10 @@ TEST_F(AlgebraicSimplifierTest, Pow1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(one))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), param0);
@@ -1082,13 +1203,14 @@ TEST_F(AlgebraicSimplifierTest, Pow2) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, two));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(two))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Multiply(param0, param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(0))));
 }
 
 // Test that pow(A, -1) is simplified to 1/A.
@@ -1105,14 +1227,14 @@ TEST_F(AlgebraicSimplifierTest, PowNegative1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Power(param0, negative_one));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Power(m::Parameter(0), m::Op().Is(negative_one))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Divide(op::Broadcast(), param0));
+  EXPECT_THAT(root, GmockMatch(m::Divide(m::Broadcast(), m::Parameter(0))));
   EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kBroadcast);
   EXPECT_EQ(root->operand(0)->operand(0)->literal().GetFirstElement<float>(),
             1);
@@ -1153,13 +1275,12 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedConvolution) {
       ShapeUtil::MakeShape(F32, {3, 3, 3}), lhs, rhs, /*feature_group_count=*/1,
       window, dnums, DefaultPrecisionConfig(2)));
   m->AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Convolution(lhs, rhs));
+              GmockMatch(m::Convolution(m::Op().Is(lhs), m::Op().Is(rhs))));
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
@@ -1196,13 +1317,12 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedReduceWindow) {
           HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f))),
       window, add_computation));
   m->AddEntryComputation(builder.Build());
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::ReduceWindow(param, op::Constant()));
+              GmockMatch(m::ReduceWindow(m::Parameter(0), m::Constant())));
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
@@ -1225,12 +1345,11 @@ TEST_F(AlgebraicSimplifierTest, ZeroSizedPad) {
       padding));
   m->AddEntryComputation(builder.Build());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Pad(param, op::Constant()));
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
+              GmockMatch(m::Pad(m::Parameter(0), m::Constant())));
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Broadcast(op::Constant()));
+              GmockMatch(m::Broadcast(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
@@ -1251,10 +1370,9 @@ TEST_F(AlgebraicSimplifierTest, ReshapeBroadcast) {
   m->AddEntryComputation(std::move(computation));
 
   EXPECT_THAT(m->entry_computation()->root_instruction(),
-              op::Reshape(op::Broadcast(op::Reshape(op))));
+              GmockMatch(m::Reshape(m::Broadcast(m::Reshape(m::Op().Is(op))))));
 
-  HloPassFix<AlgebraicSimplifier> simplifier(/*is_layout_sensitive=*/false,
-                                             non_bitcasting_callback());
+  HloPassFix<AlgebraicSimplifier> simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(m->entry_computation()->root_instruction(), op);
@@ -1271,10 +1389,10 @@ TEST_F(AlgebraicSimplifierTest, ConvertBetweenSameType) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert(m::Op().Is(input))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), input);
@@ -1292,10 +1410,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveCopy) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1314,19 +1432,24 @@ TEST_F(AlgebraicSimplifierTest, CopyEqualsBitcast) {
   *copy->mutable_shape()->mutable_layout() =
       LayoutUtil::MakeLayout({1, 2, 0, 3});
   auto computation = m->AddEntryComputation(builder.Build());
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier1(/*is_layout_sensitive=*/true,
-                                  non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier1(options);
   ASSERT_FALSE(simplifier1.Run(m.get()).ValueOrDie());
   // Verify that the copy is not replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier2(/*is_layout_sensitive=*/true,
-                                  bitcasting_callback());
+  AlgebraicSimplifierOptions options2(bitcasting_callback());
+  options2.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier2(options2);
   ASSERT_TRUE(simplifier2.Run(m.get()).ValueOrDie());
   // Verify that the copy is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 // Test that unary concatenates are removed.
@@ -1341,10 +1464,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveUnaryConcatenate) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Concatenate(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param0);
@@ -1371,16 +1494,17 @@ TEST_F(AlgebraicSimplifierTest, RemoveEmptyConcatenateOperands) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(
-      computation->root_instruction(),
-      op::Concatenate(empty_literal, param0, param0, empty_slice, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Concatenate(
+                  m::Op().Is(empty_literal), m::Parameter(0), m::Parameter(0),
+                  m::Op().Is(empty_slice), m::Parameter(1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(param0, param0, param1));
+              GmockMatch(m::Concatenate(m::Parameter(0), m::Parameter(0),
+                                        m::Parameter(1))));
 }
 
 // Test that reduce of concat is simplified.
@@ -1423,14 +1547,14 @@ TEST_F(AlgebraicSimplifierTest, SimplifyReduceOfConcat) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Map(op::Map(op::Reduce(param0, zero), op::Reduce(param1, zero)),
-              op::Reduce(param2, zero)));
+      GmockMatch(m::Map(m::Map(m::Reduce(m::Parameter(0), m::Op().Is(zero)),
+                               m::Reduce(m::Parameter(1), m::Op().Is(zero))),
+                        m::Reduce(m::Parameter(2), m::Op().Is(zero)))));
 }
 
 // Test a concatenate with only empty operands is removed.
@@ -1453,10 +1577,10 @@ TEST_F(AlgebraicSimplifierTest, OnlyEmptyConcatenateOperands) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Concatenate(empty_literal, empty_slice));
+              GmockMatch(m::Concatenate(m::Op().Is(empty_literal),
+                                        m::Op().Is(empty_slice))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_EQ(computation->root_instruction(), empty_literal);
@@ -1479,10 +1603,80 @@ TEST_F(AlgebraicSimplifierTest, ConcatenateOfBroadcastBecomesPad) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param0, param1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Parameter(1))));
+}
+
+TEST_F(AlgebraicSimplifierTest, SimplifyConcatenateOfSlices) {
+  auto m = CreateNewVerifiedModule();
+  Shape r2f32 = ShapeUtil::MakeShape(F32, {100, 99});
+  Shape concat_shape = ShapeUtil::MakeShape(F32, {50, 80});
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param0 = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, r2f32, "param0"));
+  HloInstruction* param1 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, r2f32, "param1"));
+
+  HloInstruction* slice0 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{0, 0},
+      /*limit_indices=*/{50, 10}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice0' and 'slice1' because of different start indices in
+  // dimension 0.
+  HloInstruction* slice1 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 10},
+      /*limit_indices=*/{100, 20}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice1' and 'slice2' because of stride in dimension 2.
+  HloInstruction* slice2 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 20},
+      /*limit_indices=*/{100, 40}, /*strides=*/{1, 2}));
+
+  // Cannot merge 'slice2' and 'slice3' because of stride in dimension 2.
+  HloInstruction* slice3 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 40},
+      /*limit_indices=*/{100, 50}, /*strides=*/{1, 1}));
+
+  // Can merge 'slice3' and 'slice4'.
+  HloInstruction* slice4 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 50},
+      /*limit_indices=*/{100, 60}, /*strides=*/{1, 1}));
+
+  // Can merge 'slice4' and 'slice5'.
+  HloInstruction* slice5 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 60},
+      /*limit_indices=*/{100, 70}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice5' and 'slice6' because of overlap.
+  HloInstruction* slice6 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param0, /*start_indices=*/{50, 69},
+      /*limit_indices=*/{100, 79}, /*strides=*/{1, 1}));
+
+  // Cannot merge 'slice6' and 'slice7' because of slicing from a different
+  // parameter.
+  HloInstruction* slice7 = builder.AddInstruction(HloInstruction::CreateSlice(
+      ShapeUtil::MakeShape(F32, {50, 10}), param1, /*start_indices=*/{50, 79},
+      /*limit_indices=*/{100, 89}, /*strides=*/{1, 1}));
+
+  builder.AddInstruction(HloInstruction::CreateConcatenate(
+      concat_shape,
+      {slice0, slice1, slice2, slice3, slice4, slice5, slice6, slice7}, 1));
+  auto computation = m->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(default_options_);
+  ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
+  auto s = m::Slice(m::Parameter(0));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Concatenate(s, s, s, s, s, m::Slice(m::Parameter(1)))));
+  // The operand 3 should be a merge of 'slice3', 'slice4' and 'slice5', so its
+  // shape should have dimensions {50, 30}.
+  EXPECT_TRUE(
+      ShapeUtil::Equal(computation->root_instruction()->operand(3)->shape(),
+                       ShapeUtil::MakeShape(F32, {50, 30})));
+  EXPECT_EQ(computation->root_instruction()->operand(3)->slice_starts(1), 40);
 }
 
 // Test that a simplification which changes layouts is not performed if layout
@@ -1502,14 +1696,17 @@ TEST_F(AlgebraicSimplifierTest, CopyWithDifferentLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({1, 0});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has not been removed.
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 // Test that a simplification which preserves layouts is performed if layout
@@ -1529,10 +1726,12 @@ TEST_F(AlgebraicSimplifierTest, CopyWithSameLayout) {
   *param0->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
   *copy->mutable_shape()->mutable_layout() = LayoutUtil::MakeLayout({0, 1});
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Copy has been removed.
@@ -1557,14 +1756,17 @@ TEST_F(AlgebraicSimplifierTest, NoBitcastAdded) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   // Reshape is not replaced with a bitcast.
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 // Test transforming reshapes and transposes of rng.
@@ -1588,13 +1790,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapeOfTransposeOfRngToRng) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  // Verify that that reshape(transpose(rng)) is replace by a single rng of the
+  // Verify that reshape(transpose(rng)) is replace by a single rng of the
   // same shape as the reshape.
-  EXPECT_THAT(computation->root_instruction(), op::Rng());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Rng()));
   EXPECT_TRUE(ShapeUtil::Equal(computation->root_instruction()->shape(),
                                reshape_shape));
 }
@@ -1636,17 +1838,20 @@ TEST_F(AlgebraicSimplifierTest, ReshapeReplacedWithBitcast) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(transformable_reshape, dimensions_wrong_reshape,
-                        layout_wrong_reshape));
+              GmockMatch(m::Tuple(m::Op().Is(transformable_reshape),
+                                  m::Op().Is(dimensions_wrong_reshape),
+                                  m::Op().Is(layout_wrong_reshape))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   simplifier.Run(m.get()).ValueOrDie();
 
   // Verify that only the first reshape is replaced.
   EXPECT_THAT(
       computation->root_instruction(),
-      op::Tuple(op::Bitcast(), dimensions_wrong_reshape, layout_wrong_reshape));
+      GmockMatch(m::Tuple(m::Bitcast(), m::Op().Is(dimensions_wrong_reshape),
+                          m::Op().Is(layout_wrong_reshape))));
 }
 
 // Regression test for a bug where if we failed to sink a reshape, we'd set the
@@ -1667,8 +1872,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkReshapeDoesntAffectChangedBit) {
   builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(F32, {4}), add));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1692,8 +1897,8 @@ TEST_F(AlgebraicSimplifierTest, FailureToSinkBroadcastDoesntAffectChangedBit) {
       HloInstruction::CreateBroadcast(ShapeUtil::MakeShape(F32, {2, 2, 2}), add,
                                       /*broadcast_dimensions=*/{0, 1}));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifier simplifier(
+      (AlgebraicSimplifierOptions(bitcasting_callback())));
   m->AddEntryComputation(builder.Build());
   EXPECT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
@@ -1715,14 +1920,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast1) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
@@ -1742,14 +1950,17 @@ TEST_F(AlgebraicSimplifierTest, TransposeEqualsBitcast2) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   // Verify that the reshape is replaced.
-  EXPECT_THAT(computation->root_instruction(), op::Bitcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Bitcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
@@ -1769,13 +1980,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapesMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Reshape(param0)));
+              GmockMatch(m::Reshape(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
@@ -1796,13 +2007,16 @@ TEST_F(AlgebraicSimplifierTest, CopiesMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(op::Copy(param0)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Copy(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_is_layout_sensitive(true);
+  AlgebraicSimplifier simplifier(options);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Copy(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Copy(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
@@ -1821,13 +2035,14 @@ TEST_F(AlgebraicSimplifierTest, TransposesMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(transpose1));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Op().Is(transpose1))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Transpose(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Transpose(m::Parameter(0))));
   EXPECT_EQ(std::vector<int64>({2, 1, 0}),
             computation->root_instruction()->dimensions());
 }
@@ -1846,13 +2061,13 @@ TEST_F(AlgebraicSimplifierTest, ReshapeAndBroadcastMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Broadcast(op::Reshape(param0)));
+              GmockMatch(m::Broadcast(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 // Test merging broadcast and reshape.
@@ -1869,13 +2084,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshapeMerged) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param0)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param0));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
@@ -1891,14 +2106,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x1_3) {
   auto computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
@@ -1914,13 +2128,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4_6x1x1x4) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(computation->root_instruction()->dimensions(),
               ::testing::ElementsAre(3));
 }
@@ -1938,13 +2152,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_1_3x2x1_6x1x1x1) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Broadcast(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Parameter(0))));
   const std::vector<int64> broadcast_dims =
       computation->root_instruction()->dimensions();
   EXPECT_EQ(1, broadcast_dims.size());
@@ -1964,14 +2178,13 @@ TEST_F(AlgebraicSimplifierTest, BroadcastAndReshape_4_3x2x4x2_6x8) {
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(),
-              op::Reshape(op::Broadcast(param)));
+              GmockMatch(m::Reshape(m::Broadcast(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
@@ -1984,13 +2197,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshapeMerged) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
 }
@@ -2004,14 +2217,13 @@ TEST_F(AlgebraicSimplifierTest, IotaEffectiveScalar) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
   auto root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_EQ(0.0f, root->operand(0)->literal().GetFirstElement<float>());
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), result_shape));
@@ -2027,13 +2239,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2_6) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
@@ -2046,13 +2259,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4_6x1x1x4) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(computation->root_instruction())
                 ->iota_dimension(),
             3);
@@ -2068,13 +2281,13 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_1_3x2x2_6x1x1x2) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Iota());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Iota()));
   const int64 iota_dim =
       Cast<HloIotaInstruction>(computation->root_instruction())
           ->iota_dimension();
@@ -2091,13 +2304,14 @@ TEST_F(AlgebraicSimplifierTest, IotaAndReshape_4_3x2x4x2_6x8) {
 
   HloComputation* computation = m->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   EXPECT_FALSE(simplifier.Run(m.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Iota()));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Iota())));
 }
 
 TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
@@ -2120,10 +2334,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopPad) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
@@ -2153,8 +2367,7 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
   auto has_negative_padding = [](const HloInstruction* pad) {
     for (auto& padding_dimension : pad->padding_config().dimensions()) {
@@ -2166,16 +2379,54 @@ TEST_F(AlgebraicSimplifierTest, NegativePadding) {
     return false;
   };
 
-  EXPECT_THAT(computation->root_instruction(), op::Pad(param, zero));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
   EXPECT_TRUE(has_negative_padding(pad));
 
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Pad(param, zero)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Pad(m::Parameter(0), m::Op().Is(zero)))));
   EXPECT_FALSE(
       has_negative_padding(computation->root_instruction()->operand(0)));
 }
 
+TEST_F(AlgebraicSimplifierTest, TrivialInteriorPadding) {
+  // Verify that a pad instruction with interior padding on one-sized
+  // dimensions, removes the interior padding.
+  HloComputation::Builder builder(TestName());
+  HloInstruction* param =
+      builder.AddInstruction(HloInstruction::CreateParameter(
+          0, ShapeUtil::MakeShape(F32, {2, 1}), "param"));
+  HloInstruction* zero = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0f)));
+  PaddingConfig padding;
+  for (int i = 0; i < 2; ++i) {
+    auto dimension = padding.add_dimensions();
+    dimension->set_edge_padding_low(3);
+    dimension->set_edge_padding_high(3);
+    dimension->set_interior_padding(i * 3);
+  }
+  HloInstruction* pad = builder.AddInstruction(HloInstruction::CreatePad(
+      ShapeUtil::MakeShape(F32, {8, 7}), param, zero, padding));
+
+  auto module = CreateNewVerifiedModule();
+  HloComputation* computation = module->AddEntryComputation(builder.Build());
+
+  AlgebraicSimplifier simplifier(default_options_);
+
+  ASSERT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
+  ASSERT_TRUE(HasInteriorPadding(pad->padding_config()));
+
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Pad(m::Parameter(0), m::Op().Is(zero))));
+  EXPECT_FALSE(
+      HasInteriorPadding(computation->root_instruction()->padding_config()));
+}
+
 TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   HloComputation::Builder builder(TestName());
   HloInstruction* param =
@@ -2187,10 +2438,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopReshape) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
@@ -2210,10 +2461,10 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   EXPECT_THAT(computation->root_instruction(), param);
@@ -2239,13 +2490,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfSliceToSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Slice(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Slice(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(param));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Parameter(0))));
   EXPECT_EQ(computation->root_instruction()->slice_starts(0), 3);
   EXPECT_EQ(computation->root_instruction()->slice_starts(1), 5);
   EXPECT_EQ(computation->root_instruction()->slice_limits(0), dim0 - 2);
@@ -2271,13 +2523,14 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeToReshapeOfSlice) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
-  EXPECT_THAT(computation->root_instruction(), op::Reshape(op::Slice(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Reshape(m::Slice(m::Parameter(0)))));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
@@ -2296,10 +2549,10 @@ TEST_F(AlgebraicSimplifierTest, SliceOfReshapeUnchanged) {
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Slice(op::Reshape(param)));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Slice(m::Reshape(m::Parameter(0)))));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
@@ -2312,12 +2565,84 @@ TEST_F(AlgebraicSimplifierTest, RemoveNoopSort) {
   builder.AddInstruction(HloInstruction::CreateSort(keys_shape, 0, keys));
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), keys);
 }
 
+TEST_F(AlgebraicSimplifierTest, ReplacePermutationSortWithScatter) {
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
+  auto root = module->entry_computation()->root_instruction();
+  EXPECT_THAT(root,
+              GmockMatch(m::Tuple(
+                  m::Iota(),
+                  m::Scatter(m::Iota(), m::Concatenate(m::Iota(), m::Reshape()),
+                             m::Reshape()))));
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortIfNonIntegral) {
+  // Same as ReplacePermutationSortWithScatter except that the iota has F32
+  // type.
+  const char* hlo_string = R"(
+    HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = f32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = f32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (f32[64,8732]{1,0}, f32[64,8732]{1,0}) sort(gte, values), dimensions={1}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
+TEST_F(AlgebraicSimplifierTest, DontReplacePermutationSortWrongDimensions) {
+  // Same as ReplacePermutationSortWithScatter except that the sort dimensions
+  // don't match.
+  const char* hlo_string = R"(
+   HloModule permutation_sort
+
+    ENTRY sort_computation {
+      keys = f32[64,8732]{1,0} parameter(0)
+      values = s32[64,8732]{1,0} iota(), iota_dimension=1
+      sort = (f32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(keys, values), dimensions={1}
+      gte = s32[64,8732]{1,0} get-tuple-element(sort), index=1
+      ROOT sort2 = (s32[64,8732]{1,0}, s32[64,8732]{1,0}) sort(gte, values), dimensions={0}
+    }
+  )";
+  TF_ASSERT_OK_AND_ASSIGN(auto module,
+                          ParseAndReturnVerifiedModule(hlo_string));
+
+  AlgebraicSimplifierOptions options(non_bitcasting_callback());
+  options.set_enable_permutation_sort_replacement(true);
+  AlgebraicSimplifier simplifier(options);
+  EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
+}
+
 TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
   auto builder = HloComputation::Builder(TestName());
 
@@ -2334,11 +2659,11 @@ TEST_F(AlgebraicSimplifierTest, ReplaceEffectiveScalarKeyValueSortWithTuple) {
       keys, {values0, values1}));
   auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(keys, values0, values1));
+              GmockMatch(m::Tuple(m::Op().Is(keys), m::Op().Is(values0),
+                                  m::Op().Is(values1))));
 }
 
 // Test that A && True is simplified to A
@@ -2356,8 +2681,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -2378,8 +2702,7 @@ TEST_F(AlgebraicSimplifierTest, AndTrue2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -2400,8 +2723,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, const_false);
@@ -2422,8 +2744,7 @@ TEST_F(AlgebraicSimplifierTest, AndFalse2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kAnd);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, const_false);
@@ -2444,8 +2765,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kOr);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, const_true);
@@ -2466,8 +2786,7 @@ TEST_F(AlgebraicSimplifierTest, OrTrue2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kOr);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, const_true);
@@ -2488,8 +2807,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kOr);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -2510,8 +2828,7 @@ TEST_F(AlgebraicSimplifierTest, OrFalse2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kOr);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
   EXPECT_EQ(root, param0);
@@ -2641,15 +2958,15 @@ TEST_P(ConvInputPaddingTest, DoTest) {
   auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
     ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
     ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrCat("size=3x3 ", testcase.expected_conv_window));
   }
@@ -2759,15 +3076,15 @@ TEST_P(ConvFilterPaddingTest, DoIt) {
   auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   if (testcase.expected_conv_window.empty()) {
     ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
   } else {
     ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
     auto* conv = module->entry_computation()->root_instruction();
     SCOPED_TRACE(module->ToString());
-    ASSERT_THAT(conv, op::Convolution(op::Parameter(), op::Parameter()));
+    ASSERT_THAT(conv,
+                GmockMatch(m::Convolution(m::Parameter(), m::Parameter())));
     EXPECT_EQ(window_util::ToString(conv->window()),
               absl::StrFormat("size=%dx%d %s",
                               conv->operand(1)->shape().dimensions(2),
@@ -2908,8 +3225,9 @@ TEST_F(AlgebraicSimplifierTest, ConvertConvToMatmul) {
     auto module = CreateNewUnverifiedModule();
     auto* computation = module->AddEntryComputation(b.Build());
 
-    AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/true,
-                                   bitcasting_callback());
+    AlgebraicSimplifierOptions simplifier_options(bitcasting_callback());
+    simplifier_options.set_is_layout_sensitive(true);
+    AlgebraicSimplifier simplifier(simplifier_options);
     if (!simplifier.Run(module.get()).ValueOrDie()) {
       return "NO_CHANGE";
     }
@@ -3032,17 +3350,15 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToSlice) {
   EXPECT_EQ(root, slice);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
 
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
   ASSERT_FALSE(simplifier.Run(module.get()).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(scalar_param));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), slice_shape));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(scalar_param))
+                             .WithShapeEqualTo(&slice_shape)));
 }
 
 // Test that reshape(transpose(broadcast(/*scalar value*/))) simplifies to a
@@ -3071,13 +3387,11 @@ TEST_F(AlgebraicSimplifierTest, ScalarBroadcastToTransposeReshape) {
   EXPECT_EQ(root, reshape);
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
-
-  root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(forty_two));
-  EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reshape_shape));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Broadcast(m::Op().Is(forty_two))
+                             .WithShapeEqualTo(&reshape_shape)));
 }
 
 // Test that ReduceWindow(Pad(op, x), y) can simplify to ReduceWindow(op, x).
@@ -3138,8 +3452,7 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -3147,7 +3460,8 @@ TEST_F(AlgebraicSimplifierTest, FoldPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(operand, op::Constant()));
+  EXPECT_THAT(root,
+              GmockMatch(m::ReduceWindow(m::Op().Is(operand), m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -3224,8 +3538,7 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
   auto computation = module->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root, reduce_window);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   // Running simplification again should not result in any further changes.
@@ -3233,7 +3546,8 @@ TEST_F(AlgebraicSimplifierTest, FoldConvertedPadIntoReduceWindow) {
 
   // Verify the result
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::ReduceWindow(op::Convert(parameter), op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::ReduceWindow(m::Convert(m::Parameter(0)),
+                                               m::Constant())));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), reduce_window_shape))
       << ShapeUtil::HumanString(root->shape()) << " vs "
       << ShapeUtil::HumanString(reduce_window_shape);
@@ -3258,8 +3572,7 @@ TEST_F(AlgebraicSimplifierTest, ReversalOfTrivialDimensionsToBitcast) {
   auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(module.get()).ValueOrDie());
 
   HloInstruction* root = computation->root_instruction();
@@ -3295,8 +3608,7 @@ TEST_F(AlgebraicSimplifierTest, IteratorInvalidation) {
 
   m->AddEmbeddedComputation(std::move(dot_computation));
   m->AddEntryComputation(call_builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
 }
 
@@ -3313,11 +3625,10 @@ TEST_F(AlgebraicSimplifierTest, ConstantTupleBecomesTupleOfConstants) {
 
   auto computation = m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::Tuple(op::Constant(), op::Constant()));
+              GmockMatch(m::Tuple(m::Constant(), m::Constant())));
 }
 
 // A dynamic-slice is trivial if its start indices are all zeroes and the size
@@ -3337,10 +3648,9 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicSlice) {
       /*slice_sizes=*/{10, 100, 1000}));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
-  EXPECT_THAT(computation->root_instruction(), op::Parameter());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Parameter()));
 }
 
 // A dynamic-update-slice is trivial if its start indices are all zeroes and the
@@ -3371,11 +3681,10 @@ TEST_F(AlgebraicSimplifierTest, TrivialDynamicUpdateSlice) {
           3, ShapeUtil::MakeShape(U32, {3}), "update_indices"))));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(),
-              op::DynamicSlice(op::Parameter(), op::Parameter()));
+              GmockMatch(m::DynamicSlice(m::Parameter(), m::Parameter())));
 }
 
 // Test that two consecutive broadcasts can be merged to one.
@@ -3394,11 +3703,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Constant())));
   EXPECT_THAT(root->dimensions(), ElementsAre(2));
 }
 
@@ -3421,11 +3729,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcasts2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Broadcast(op::Parameter(0)));
+  EXPECT_THAT(root, GmockMatch(m::Broadcast(m::Parameter(0))));
   EXPECT_THAT(root->dimensions(), ElementsAre(1, 3));
 }
 
@@ -3442,11 +3749,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
@@ -3464,11 +3770,10 @@ TEST_F(AlgebraicSimplifierTest, MergeBroadcastAndIota2) {
   auto computation = m->AddEntryComputation(builder.Build());
   HloInstruction* root = computation->root_instruction();
   EXPECT_EQ(root->opcode(), HloOpcode::kBroadcast);
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   root = computation->root_instruction();
-  EXPECT_THAT(root, op::Iota());
+  EXPECT_THAT(root, GmockMatch(m::Iota()));
   EXPECT_EQ(Cast<HloIotaInstruction>(root)->iota_dimension(), 2);
 }
 
@@ -3486,11 +3791,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadLow) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reshape(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
@@ -3507,11 +3812,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadHigh) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Reshape(op::Constant()));
+  EXPECT_THAT(root, GmockMatch(m::Reshape(m::Constant())));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
@@ -3528,8 +3833,8 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidNonScalar) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_FALSE(simplifier.Run(module.get()).ValueOrDie());
 }
 
@@ -3547,11 +3852,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfPadMidScalar) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter());
+  EXPECT_THAT(root, GmockMatch(m::Parameter()));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
@@ -3569,11 +3874,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(1));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(1)));
 }
 
 TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
@@ -3591,11 +3896,11 @@ TEST_F(AlgebraicSimplifierTest, SliceOfConcatNonScalarInput) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Slice(op::Parameter(2)));
+  EXPECT_THAT(root, GmockMatch(m::Slice(m::Parameter(2))));
   EXPECT_EQ(root->slice_starts(0), 1);
   EXPECT_EQ(root->slice_limits(0), 2);
 }
@@ -3613,11 +3918,11 @@ TEST_F(AlgebraicSimplifierTest, NegateNegate) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(0));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
 }
 
 TEST_F(AlgebraicSimplifierTest, NotNot) {
@@ -3633,11 +3938,11 @@ TEST_F(AlgebraicSimplifierTest, NotNot) {
   TF_ASSERT_OK_AND_ASSIGN(auto module,
                           ParseAndReturnVerifiedModule(hlo_string));
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 bitcasting_callback());
+  AlgebraicSimplifierOptions options(bitcasting_callback());
+  AlgebraicSimplifier simplifier(options);
   EXPECT_TRUE(simplifier.Run(module.get()).ValueOrDie());
   auto root = module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Parameter(0));
+  EXPECT_THAT(root, GmockMatch(m::Parameter(0)));
 }
 
 struct PadReduceWindowEffectiveBroadcastCase {
@@ -3733,8 +4038,7 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
       output_shape, pad, zero, window, add_computation));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
@@ -3742,10 +4046,10 @@ TEST_P(PadReduceWindowEffectiveBroadcastTest, DoIt) {
       ShapeUtil::Equal(computation->root_instruction()->shape(), output_shape));
 
   if (param.should_become_broadcast) {
-    EXPECT_THAT(computation->root_instruction(), op::Broadcast(::testing::_));
+    EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Broadcast()));
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::ReduceWindow(::testing::_, zero));
+                GmockMatch(m::ReduceWindow(m::Op(), m::Op().Is(zero))));
   }
 }
 
@@ -3815,8 +4119,7 @@ TEST_P(DotStrengthReductionTest, DotStrengthReduction) {
   builder.AddInstruction(HloInstruction::CreateDot(
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
   auto computation = module->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool changed, simplifier.Run(module.get()));
   const bool dot_should_be_transformed = m == 1 || k == 1 || n == 1;
   const bool computation_should_be_modified =
@@ -3845,7 +4148,7 @@ struct DotOfConcatTestSpec {
 };
 
 class DotOfConcatSimplificationTest
-    : public HloTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfConcatTestSpec> {};
 
 // Test that we transform
@@ -3893,19 +4196,19 @@ TEST_P(DotOfConcatSimplificationTest, ConstantLHS) {
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
 
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Slice(op::Constant()), op::Parameter(0));
-  auto match_dot_1 = op::Dot(op::Slice(op::Constant()), op::Parameter(1));
-  auto match_dot_2 = op::Dot(op::Slice(op::Constant()), op::Parameter(2));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2));
+  auto match_dot_0 = m::Dot(m::Slice(m::Constant()), m::Parameter(0));
+  auto match_dot_1 = m::Dot(m::Slice(m::Constant()), m::Parameter(1));
+  auto match_dot_2 = m::Dot(m::Slice(m::Constant()), m::Parameter(2));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2)));
 }
 
 // Test that we transform
@@ -3958,20 +4261,20 @@ TEST_P(DotOfConcatSimplificationTest, ConstantRHS) {
       dot_shape, lhs, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
       ShapeUtil::Equal(computation->root_instruction()->shape(), dot_shape));
 
-  auto match_dot_0 = op::Dot(op::Parameter(0), op::Slice(op::Constant()));
-  auto match_dot_1 = op::Dot(op::Parameter(1), op::Slice(op::Constant()));
-  auto match_dot_2 = op::Dot(op::Parameter(2), op::Slice(op::Constant()));
-  auto match_dot_3 = op::Dot(op::Parameter(3), op::Slice(op::Constant()));
-  EXPECT_THAT(computation->root_instruction(),
-              op::Add(op::Add(op::Add(match_dot_0, match_dot_1), match_dot_2),
-                      match_dot_3));
+  auto match_dot_0 = m::Dot(m::Parameter(0), m::Slice(m::Constant()));
+  auto match_dot_1 = m::Dot(m::Parameter(1), m::Slice(m::Constant()));
+  auto match_dot_2 = m::Dot(m::Parameter(2), m::Slice(m::Constant()));
+  auto match_dot_3 = m::Dot(m::Parameter(3), m::Slice(m::Constant()));
+  EXPECT_THAT(
+      computation->root_instruction(),
+      GmockMatch(m::Add(m::Add(m::Add(match_dot_0, match_dot_1), match_dot_2),
+                        match_dot_3)));
 }
 
 DotOfConcatTestSpec kDotOfConcatTestSpecs[] = {
@@ -4000,8 +4303,7 @@ TEST_F(AlgebraicSimplifierTest, DynamicUpdateSliceZeroUpdate) {
   const HloComputation* const computation =
       m->AddEntryComputation(builder.Build());
 
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   ASSERT_TRUE(simplifier.Run(m.get()).ValueOrDie());
   EXPECT_THAT(computation->root_instruction(), operand);
 }
@@ -4021,7 +4323,7 @@ struct DotOfGatherTestSpec {
 };
 
 class DotOfGatherSimplificationTest
-    : public HloTestBase,
+    : public AlgebraicSimplifierTest,
       public ::testing::WithParamInterface<DotOfGatherTestSpec> {};
 
 // input: dot(DS(ctA), ctB))
@@ -4078,8 +4380,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
       dot_shape, ds, rhs, dot_dnums, DefaultPrecisionConfig(2)));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
@@ -4090,8 +4391,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantRHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
@@ -4149,8 +4450,7 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
       dot_shape, lhs, ds, dot_dnums, DefaultPrecisionConfig(2)));
 
   auto computation = m->AddEntryComputation(builder.Build());
-  AlgebraicSimplifier simplifier(/*is_layout_sensitive=*/false,
-                                 non_bitcasting_callback());
+  AlgebraicSimplifier simplifier(default_options_);
   TF_ASSERT_OK_AND_ASSIGN(bool run_successful, simplifier.Run(m.get()));
   ASSERT_TRUE(run_successful);
   EXPECT_TRUE(
@@ -4161,8 +4461,8 @@ TEST_P(DotOfGatherSimplificationTest, ConstantLHS) {
               HloOpcode::kDynamicSlice);
   } else {
     EXPECT_THAT(computation->root_instruction(),
-                op::DynamicSlice(op::Dot(op::Constant(), op::Constant()),
-                                 op::Concatenate()));
+                GmockMatch(m::DynamicSlice(m::Dot(m::Constant(), m::Constant()),
+                                           m::Concatenate())));
   }
 }
 
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.cc b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
new file mode 100644
index 0000000000000000000000000000000000000000..362bc44a1cf377b51c5519c6ab5e0d9628e80e58
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.cc
@@ -0,0 +1,285 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+
+namespace {
+
+namespace m = match;
+
+// If the argument instruction is a CRS in the sequence
+// AR -> Convert -> Add -> CRS
+// then return the AR in the sequence.
+// TODO(b/117554291): Rewrite this to recognize more general patterns,
+// not just the specific one of AR -> Add -> Convert -> CRS.
+absl::optional<HloInstruction*> MatchesArCrsPattern(
+    HloInstruction* instruction) {
+  HloInstruction *ar, *convert, *add, *crs;
+  if (Match(instruction,
+            m::CrossReplicaSum(
+                &crs, m::Add(&add, m::Op(),
+                             m::Convert(&convert,
+                                        m::CrossReplicaSum(&ar, m::Op()))))) &&
+      ar->users().size() == 1 && ar->shape().element_type() == BF16 &&
+      convert->shape().element_type() == F32 && !crs->all_reduce_id()) {
+    return ar;
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+}  // namespace
+
+absl::optional<HloInstruction*> ArCrsCombiner::WhileFromBodyParameter(
+    HloInstruction* instruction) {
+  CHECK_EQ(HloOpcode::kParameter, instruction->opcode());
+  HloComputation* computation = instruction->parent();
+  auto caller_instructions = call_graph_->GetComputationCallers(computation);
+  if (caller_instructions.size() == 1) {
+    auto caller_instruction = caller_instructions[0];
+    if (caller_instruction->opcode() == HloOpcode::kWhile) {
+      return caller_instruction;
+    }
+  }
+  return absl::optional<HloInstruction*>();
+}
+
+std::vector<HloInstruction*> ArCrsCombiner::GetAllTuples(
+    HloInstruction* instruction) {
+  if (instruction->opcode() == HloOpcode::kTuple) {
+    return {instruction};
+  }
+  if (instruction->opcode() == HloOpcode::kDomain) {
+    return GetAllTuples(instruction->operands()[0]);
+  }
+  if (instruction->opcode() == HloOpcode::kParameter) {
+    auto maybe_while = WhileFromBodyParameter(instruction);
+    if (!maybe_while) {
+      return {};
+    }
+    auto while_instr = *maybe_while;
+    auto init_tuples = GetAllTuples(while_instr->while_init());
+    auto body_tuples =
+        GetAllTuples(while_instr->while_body()->root_instruction());
+    if (init_tuples.empty() || body_tuples.empty()) {
+      return {};
+    }
+    init_tuples.insert(init_tuples.end(), body_tuples.begin(),
+                       body_tuples.end());
+    return init_tuples;
+  }
+  if (instruction->opcode() == HloOpcode::kGetTupleElement) {
+    std::vector<HloInstruction*> result_tuples;
+    for (auto tuple : GetAllTuples(instruction->operands()[0])) {
+      auto tmp_tuples =
+          GetAllTuples(tuple->mutable_operand(instruction->tuple_index()));
+      if (tmp_tuples.empty()) {
+        return {};
+      }
+      result_tuples.insert(result_tuples.end(), tmp_tuples.begin(),
+                           tmp_tuples.end());
+    }
+    return result_tuples;
+  }
+  return {};
+}
+
+bool ArCrsCombiner::TupleElementsComputeSameValue(
+    HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  auto tuples = GetAllTuples(tuple_shaped_instruction);
+  if (tuples.empty()) {
+    return false;
+  }
+  for (auto tuple : tuples) {
+    CHECK_EQ(tuple->opcode(), HloOpcode::kTuple);
+    if (!InstructionsComputeSameValue(tuple->mutable_operand(i1),
+                                      tuple->mutable_operand(i2),
+                                      visited_pairs)) {
+      return false;
+    }
+  }
+  return true;
+}
+
+/* static */
+bool ArCrsCombiner::TestInstructionsComputeSameValue(HloInstruction* i1,
+                                                     HloInstruction* i2) {
+  ArCrsCombiner combiner(/*num_spatial_partitions=*/2);
+  auto module = i1->parent()->parent();
+  CHECK_EQ(module, i2->parent()->parent());
+  combiner.call_graph_ = CallGraph::Build(module);
+  absl::flat_hash_map<int64, int64> visited_pairs;
+  return combiner.InstructionsComputeSameValue(i1, i2, &visited_pairs);
+}
+
+bool ArCrsCombiner::InstructionsComputeSameValue(
+    HloInstruction* i1, HloInstruction* i2,
+    absl::flat_hash_map<int64, int64>* visited_pairs) {
+  if (i1 == i2) {
+    return true;
+  }
+  auto uid1 = i1->unique_id();
+  auto uid2 = i2->unique_id();
+  auto min_uid = std::min(uid1, uid2);
+  auto max_uid = std::max(uid1, uid2);
+  auto it = visited_pairs->find(min_uid);
+  if (it != visited_pairs->end() && max_uid == it->second) {
+    return true;
+  }
+  auto opcode1 = i1->opcode();
+  auto operands1 = i1->operands();
+  if (opcode1 != i2->opcode() || operands1.size() != i2->operands().size()) {
+    return false;
+  }
+  visited_pairs->emplace(min_uid, max_uid);
+  for (int i = 0; i < operands1.size(); ++i) {
+    auto operand1 = operands1[i];
+    auto operand2 = i2->operands()[i];
+    if (!InstructionsComputeSameValue(operand1, operand2, visited_pairs)) {
+      return false;
+    }
+  }
+  if (opcode1 == HloOpcode::kParameter) {
+    // In the general case, we don't try to prove equality of parameters.
+    // We only try in the context of get-tuple-element
+    // (see TupleElementsComputeSameValue).
+    return false;
+  }
+  if (opcode1 == HloOpcode::kGetTupleElement) {
+    return i1->tuple_index() == i2->tuple_index() ||
+           TupleElementsComputeSameValue(operands1[0], i1->tuple_index(),
+                                         i2->tuple_index(), visited_pairs);
+  }
+  // Don't check that the operands are identical, because Identical can
+  // return false for instructions that compute the same value but are not
+  // identical, which we don't want. We have checked the arguments with
+  // InstructionsComputeSameValue earlier.
+  auto eq_instructions = [](const HloInstruction* i1,
+                            const HloInstruction* i2) -> bool { return true; };
+  auto eq_computations = [](const HloComputation* a, const HloComputation* b) {
+    return *a == *b;
+  };
+  return i1->Identical(*i2, eq_instructions, eq_computations,
+                       /*layout_sensitive=*/false);
+}
+
+void ArCrsCombiner::GroupAllReducesById(HloModule* module) {
+  for (HloComputation* computation : module->MakeNonfusionComputations()) {
+    for (HloInstruction* instruction : computation->instructions()) {
+      auto ar = MatchesArCrsPattern(instruction);
+      if (ar) {
+        all_reduce_map_[*((*ar)->all_reduce_id())].push_back(*ar);
+      }
+    }
+  }
+}
+
+void ArCrsCombiner::KeepProvablyEqualInstructionGroups() {
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    CHECK_EQ(instruction_vec.size(), num_spatial_partitions_);
+
+    auto instr_0 = instruction_vec[0];
+    auto add_0 = instr_0->users()[0]->users()[0];
+    CHECK_EQ(HloOpcode::kAdd, add_0->opcode());
+
+    for (int i = 1; i < instruction_vec.size(); ++i) {
+      auto instr_i = instruction_vec[i];
+      auto add_i = instr_i->users()[0]->users()[0];
+      CHECK_EQ(HloOpcode::kAdd, add_i->opcode());
+      absl::flat_hash_map<int64, int64> visited_pairs;
+      if (!InstructionsComputeSameValue(add_0, add_i, &visited_pairs)) {
+        all_reduce_map_.erase(it.first);
+      }
+    }
+  }
+}
+
+StatusOr<bool> ArCrsCombiner::RewriteGraph() {
+  if (all_reduce_map_.empty()) {
+    return false;
+  }
+
+  auto computation_is_addition = [](HloComputation* c) {
+    return c->instruction_count() == 3 &&
+           Match(c->root_instruction(), m::Add(m::Parameter(), m::Parameter()));
+  };
+
+  for (auto it : all_reduce_map_) {
+    auto instruction_vec = it.second;
+    for (auto all_reduce : instruction_vec) {
+      auto parent_computation = all_reduce->parent();
+      auto convert = all_reduce->users()[0];
+      auto add = convert->users()[0];
+      auto crs = add->users()[0];
+
+      if (!computation_is_addition(all_reduce->called_computations()[0]) ||
+          !computation_is_addition(crs->called_computations()[0])) {
+        continue;
+      }
+      HloInstruction* other_summand = (add->operands()[0] == convert)
+                                          ? add->operands()[1]
+                                          : add->operands()[0];
+      // To move the AR past the addition, we need to divide other_summand by
+      // the number of spatial partitions.
+      CHECK_EQ(all_reduce->user_count(), 1);
+      TF_CHECK_OK(
+          all_reduce->ReplaceAllUsesWith(all_reduce->mutable_operand(0)));
+      auto shape = other_summand->shape();
+      Literal lit(shape);
+      lit.PopulateWithValue<float>(num_spatial_partitions_);
+      auto divisor = parent_computation->AddInstruction(
+          HloInstruction::CreateConstant(lit.Clone()));
+      auto division =
+          parent_computation->AddInstruction(HloInstruction::CreateBinary(
+              shape, HloOpcode::kDivide, other_summand, divisor));
+      TF_CHECK_OK(other_summand->ReplaceUseWith(add, division));
+      // The AllReduce and the CRS are combined to an all-core AllReduce.
+      crs->set_all_reduce_id(all_reduce->all_reduce_id());
+      TF_CHECK_OK(parent_computation->RemoveInstruction(all_reduce));
+    }
+  }
+
+  return true;
+}
+
+StatusOr<bool> ArCrsCombiner::Run(HloModule* module) {
+  call_graph_ = CallGraph::Build(module);
+
+  GroupAllReducesById(module);
+
+  KeepProvablyEqualInstructionGroups();
+
+  return RewriteGraph();
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner.h b/tensorflow/compiler/xla/service/ar_crs_combiner.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6a7ef76ec3b76972d1b2c7fb548cecfb9423160
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner.h
@@ -0,0 +1,88 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+#include "tensorflow/compiler/xla/service/call_graph.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+#include "tensorflow/compiler/xla/statusor.h"
+
+namespace xla {
+
+// Combine an AllReduce and a CrossReplicaSum when they are close to each other
+// in the graph, to use an efficient CrossReplicaSum implementation that
+// fully utilizes the interconnect bandwidth.
+class ArCrsCombiner : public HloModulePass {
+ public:
+  ArCrsCombiner(int num_spatial_partitions)
+      : num_spatial_partitions_(num_spatial_partitions) {}
+  absl::string_view name() const override { return "ar-crs-combiner"; }
+  StatusOr<bool> Run(HloModule* module) override;
+
+  // Helper method to allow testing of InstructionsComputeSameValue.
+  static bool TestInstructionsComputeSameValue(HloInstruction* i1,
+                                               HloInstruction* i2);
+
+ private:
+  // If the passed instruction is a while parameter, and the while body is only
+  // called by a single while instruction, return the while instruction.
+  absl::optional<HloInstruction*> WhileFromBodyParameter(
+      HloInstruction* instruction);
+
+  // Returns a vector of tuple instructions.
+  // If all instructions that flow to "instruction" are tuples, return them.
+  // Otherwise, return an empty vector.
+  std::vector<HloInstruction*> GetAllTuples(HloInstruction* instruction);
+
+  // Checks whether two different elements in the same tuple compute the same
+  // value.
+  bool TupleElementsComputeSameValue(
+      HloInstruction* tuple_shaped_instruction, int64 i1, int64 i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Returns whether the instructions i1 and i2 can be shown to evaluate to the
+  // same value. Handling WHILE requires recursion, which may cause us to visit
+  // the same instruction again. To avoid infinite loops, we pass a cache of
+  // visited instruction pairs.
+  bool InstructionsComputeSameValue(
+      HloInstruction* i1, HloInstruction* i2,
+      absl::flat_hash_map<int64, int64>* visited_pairs);
+
+  // Populates all_reduce_map_.
+  void GroupAllReducesById(HloModule* module);
+
+  // Looks at each AllReduce group in all_reduce_map_, and keeps only the
+  // groups for which it's safe to move the AllReduce later in the HLO graph.
+  void KeepProvablyEqualInstructionGroups();
+
+  // Performs the graph rewrite that eliminates the early AllReduce and turns
+  // the later CRS into an AllReduce.
+  StatusOr<bool> RewriteGraph();
+
+  int num_spatial_partitions_;
+
+  // Map from all-reduce ids to the all reduce instructions.
+  absl::flat_hash_map<int64, std::vector<HloInstruction*>> all_reduce_map_;
+
+  std::unique_ptr<CallGraph> call_graph_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_AR_CRS_COMBINER_H_
diff --git a/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10171835d83c75fef091a34b8fe102d263211307
--- /dev/null
+++ b/tensorflow/compiler/xla/service/ar_crs_combiner_test.cc
@@ -0,0 +1,496 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/ar_crs_combiner.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class ArCrsCombinerTest : public HloTestBase {};
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(
+      i1, module->entry_computation()->parameter_instruction(0)));
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %x)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestBasecase3) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (x: f32[], y: f32[]) -> (f32[], f32[]) {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %tuple = (f32[], f32[]) tuple(%x, %y)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestNumOperands) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> ((f32[2,2]), (f32[2,2], f32[2,2])) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple1 = (f32[2,2]) tuple(%constant.f32)
+  %tuple2 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %tuple = ((f32[2,2]), (f32[2,2], f32[2,2])) tuple(%tuple1, %tuple2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestSliceIndicesDontMatch) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2]) -> (f32[1], f32[1]) {
+  %p = f32[2] parameter(0)
+  %slice.1 = f32[1] slice(f32[2] %p), slice={[0:1]}
+  %slice.2 = f32[1] slice(f32[2] %p), slice={[1:2]}
+  ROOT %tuple = (f32[1], f32[1]) tuple(%slice.1, %slice.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementSameIndex) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex1) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestTupleElementDifferentIndex2) {
+  const char* module_str = R"(
+HloModule foobar
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{2, 3}, {4, 5}})
+  %tuple.1 = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%tuple.1), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%tuple.1), index=1
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%get-tuple-element.1, %get-tuple-element.2)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_tuple = module->entry_computation()->root_instruction();
+  auto i1 = root_tuple->operands()[0];
+  auto i2 = root_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile1) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_TRUE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile2) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {7, 8}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32.1, %constant.f32.2)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0];
+  auto i2 = body_tuple->operands()[1];
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, SameValueTestWhile3) {
+  const char* module_str = R"(
+HloModule foobar
+
+%condition (x: (f32[2,2], f32[2,2])) -> pred[] {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.0 = s32[] constant(0)
+  %constant.1 = s32[] constant(1)
+  ROOT %greater-than = pred[] greater-than(s32[] %constant.1, s32[] %constant.0)
+}
+
+%body (x: (f32[2,2], f32[2,2])) -> (f32[2,2], f32[2,2]) {
+  %x = (f32[2,2], f32[2,2]) parameter(0)
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {1, 2}})
+  %get-tuple-element.1 = f32[2,2] get-tuple-element(%x), index=0
+  %get-tuple-element.2 = f32[2,2] get-tuple-element(%x), index=1
+  %add.1 = f32[2,2] add(%get-tuple-element.1, %constant.f32.1)
+  %add.2 = f32[2,2] add(%get-tuple-element.2, %constant.f32.2)
+  ROOT %tuple = (f32[2,2], f32[2,2]) tuple(%add.1, %add.2)
+}
+
+ENTRY %WhileLoop () -> (f32[2,2], f32[2,2]) {
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+  %init.tuple = (f32[2,2], f32[2,2]) tuple(%constant.f32, %constant.f32)
+  ROOT %while = (f32[2,2], f32[2,2]) while(%init.tuple), condition=%condition, body=%body
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto root_while = module->entry_computation()->root_instruction();
+  auto body_tuple = root_while->while_body()->root_instruction();
+  auto i1 = body_tuple->operands()[0]->operands()[0];  // %get-tuple-element.1
+  auto i2 = body_tuple->operands()[1]->operands()[0];  // %get-tuple-element.2
+  EXPECT_FALSE(ArCrsCombiner::TestInstructionsComputeSameValue(i1, i2));
+}
+
+TEST_F(ArCrsCombinerTest, RewritePatternArConvertAddCrs) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  auto crs_before =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_before = crs_before->replica_groups();
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_TRUE(changed);
+  EXPECT_THAT(
+      module->entry_computation()->root_instruction(),
+      op::Tuple(
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert())),
+          op::CrossReplicaSum(op::Add(
+              op::Divide(op::Constant(), op::Constant()), op::Convert()))));
+  auto crs_after =
+      module->entry_computation()->root_instruction()->operands()[0];
+  auto replica_groups_after = crs_after->replica_groups();
+  ASSERT_EQ(replica_groups_before.size(), replica_groups_after.size());
+  for (int i = 0; i < replica_groups_before.size(); ++i) {
+    // Somewhat verbose way to compare the replica_ids, because EqualsProto
+    // is not available in the open-source build.
+    auto group_before = replica_groups_before[i];
+    std::vector<int64> ids_before(group_before.replica_ids().begin(),
+                                  group_before.replica_ids().end());
+    auto group_after = replica_groups_after[i];
+    std::vector<int64> ids_after(group_after.replica_ids().begin(),
+                                 group_after.replica_ids().end());
+    EXPECT_EQ(ids_before, ids_after);
+  }
+}
+
+TEST_F(ArCrsCombinerTest, OtherSummandNotTheSameDontRewrite) {
+  const char* module_str = R"(
+HloModule foobar
+
+%binary_add (a: bf16[], b: bf16[]) -> bf16[] {
+  %a = bf16[] parameter(0)
+  %b = bf16[] parameter(1)
+  ROOT %add = bf16[] add(%a, %b)
+}
+
+%sum.f32 (x: f32[], y: f32[]) -> f32[] {
+  %x = f32[] parameter(0)
+  %y = f32[] parameter(1)
+  ROOT %add = f32[] add(%x, %y)
+}
+
+ENTRY %entrycomp (p: f32[2,2]) -> (f32[2,2], f32[2,2]) {
+  %p = f32[2,2] parameter(0)
+  %constant.bf16 = bf16[2,2] constant(bf16[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.1 = f32[2,2] constant(f32[2,2] {{1, 2}, {3, 4}})
+  %constant.f32.2 = f32[2,2] constant(f32[2,2] {{3, 4}, {5, 6}})
+
+  %cross-replica-sum.ar.1 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=0}
+  %convert.1 = f32[2,2]
+      convert(%cross-replica-sum.ar.1),
+      sharding={maximal device=0}
+  %add.1 = f32[2,2]
+      add(%constant.f32.1, %convert.1),
+      sharding={maximal device=0}
+  %cross-replica-sum.1 = f32[2,2]
+      cross-replica-sum(%add.1),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=0}
+
+  %cross-replica-sum.ar.2 = bf16[2,2]
+      cross-replica-sum(%constant.bf16),
+      replica_groups={{0},{1}},
+      all_reduce_id=1,
+      to_apply=%binary_add,
+      sharding={maximal device=1}
+  %convert.2 = f32[2,2]
+      convert(%cross-replica-sum.ar.2),
+      sharding={maximal device=1}
+  %add.2 = f32[2,2]
+      add(%constant.f32.2, %convert.2),
+      sharding={maximal device=1}
+  %cross-replica-sum.2 = f32[2,2]
+      cross-replica-sum(%add.2),
+      replica_groups={{0,1}},
+      to_apply=%sum.f32,
+      sharding={maximal device=1}
+
+  ROOT %tuple = (f32[2,2], f32[2,2])
+      tuple(%cross-replica-sum.1, %cross-replica-sum.2),
+      sharding={{maximal device=0}, {maximal device=1}}
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_str));
+  ArCrsCombiner combiner(2);
+  auto changed = combiner.Run(module.get()).ValueOrDie();
+  EXPECT_FALSE(changed);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander.cc b/tensorflow/compiler/xla/service/batchnorm_expander.cc
index f70f6ddfec69c0113a1afe2073a2392098f49456..0e6ca1871b379a2f55b92207133822fc6258b007 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander.cc
@@ -107,19 +107,37 @@ class BatchNormExpanderVisitor : public DfsHloVisitorWithDefault {
   }
 
   std::unique_ptr<HloInstruction> Mean(
-      int64 element_count, HloInstruction* operand,
+      HloInstruction* element_count, HloInstruction* operand,
       const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
           add_instruction) {
-    HloInstruction* elem_count_recip =
-        add_instruction(HloInstruction::CreateBroadcast(
-            operand->shape(),
-            add_instruction(HloInstruction::CreateConvert(
-                ShapeUtil::MakeShape(operand->shape().element_type(), {}),
-                add_instruction(HloInstruction::CreateConstant(
-                    LiteralUtil::CreateR0<float>(1.0 / element_count))))),
-            {}));
-    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kMultiply,
-                                        operand, elem_count_recip);
+    auto broadcast = add_instruction(
+        HloInstruction::CreateBroadcast(operand->shape(), element_count, {}));
+    return HloInstruction::CreateBinary(operand->shape(), HloOpcode::kDivide,
+                                        operand, broadcast);
+  }
+
+  std::unique_ptr<HloInstruction> DynamicElementCountPerFeature(
+      HloInstruction* operand, int64 feature_index,
+      const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
+          add_instruction) {
+    auto elements_per_feature_u32 = add_instruction(
+        HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(1)));
+
+    for (int64 i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+      if (i == feature_index) {
+        continue;
+      }
+      auto dynamic_dimension_size =
+          add_instruction(HloInstruction::CreateGetDimensionSize(
+              ShapeUtil::MakeShape(U32, {}), operand, i));
+      elements_per_feature_u32 = add_instruction(HloInstruction::CreateBinary(
+          ShapeUtil::MakeShape(U32, {}), HloOpcode::kMultiply,
+          dynamic_dimension_size, elements_per_feature_u32));
+    }
+
+    return HloInstruction::CreateConvert(
+        ShapeUtil::MakeShape(operand->shape().element_type(), {}),
+        elements_per_feature_u32);
   }
 
   // Replaces the existing HLO instruction old_instruction, with
@@ -195,9 +213,6 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
   const Shape operand_shape = operand->shape();
   PrimitiveType ptype = operand_shape.element_type();
   int64 feature_index = batch_norm->feature_index();
-  const int64 feature_count = operand_shape.dimensions(feature_index);
-  const int64 size_in_elements = ShapeUtil::ElementsIn(operand_shape);
-  int64 elements_per_feature_int64 = size_in_elements / feature_count;
 
   HloInstruction* scale = batch_norm->mutable_operand(1);
   HloInstruction* offset = batch_norm->mutable_operand(2);
@@ -220,6 +235,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
     }
   }
 
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(operand, feature_index, add));
+
   auto scale_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, scale, {feature_index}));
 
@@ -243,13 +261,13 @@ Status BatchNormExpanderVisitor::HandleBatchNormTraining(
       add_reduce_computation));
 
   // E[X].
-  auto mean = add(Mean(elements_per_feature_int64, sum, add));
+  auto mean = add(Mean(elements_per_feature, sum, add));
 
   auto mean_broadcasted = add(
       HloInstruction::CreateBroadcast(operand_shape, mean, {feature_index}));
 
   // E[X^2].
-  auto square_mean = add(Mean(elements_per_feature_int64, squared_sum, add));
+  auto square_mean = add(Mean(elements_per_feature, squared_sum, add));
 
   // E^2[X].
   auto mean_square =
@@ -458,9 +476,8 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
 
   int64 feature_index = batch_norm->feature_index();
 
-  const int64 size_in_elements = ShapeUtil::ElementsIn(activation_shape);
-  const int64 feature_count = activation_shape.dimensions(feature_index);
-  const int64 elements_per_feature_int64 = size_in_elements / feature_count;
+  auto elements_per_feature =
+      add(DynamicElementCountPerFeature(activation, feature_index, add));
 
   auto zero_literal = LiteralUtil::CreateR0(0.0f);
   TF_ASSIGN_OR_RETURN(zero_literal, zero_literal.Convert(ptype));
@@ -553,15 +570,9 @@ Status BatchNormExpanderVisitor::HandleBatchNormGrad(
       add_binary(activation_shape, HloOpcode::kMultiply, scale_broadcasted,
                  rsqrt_var_add_epsilon_broadcasted);
 
-  scale_times_rsqrt_var_add_epsilon = add(
-      Mean(elements_per_feature_int64, scale_times_rsqrt_var_add_epsilon, add));
+  scale_times_rsqrt_var_add_epsilon =
+      add(Mean(elements_per_feature, scale_times_rsqrt_var_add_epsilon, add));
 
-  auto elements_per_feature_literal =
-      LiteralUtil::CreateR0<float>(elements_per_feature_int64);
-  TF_ASSIGN_OR_RETURN(elements_per_feature_literal,
-                      elements_per_feature_literal.Convert(ptype));
-  auto elements_per_feature = add(
-      HloInstruction::CreateConstant(std::move(elements_per_feature_literal)));
   auto i1 = add_binary(activation_shape, HloOpcode::kMultiply, grad_output,
                        add(HloInstruction::CreateBroadcast(
                            activation_shape, elements_per_feature, {})));
diff --git a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
index 08cf8026177d77ff98cca5e5d168ac3194936b35..8e8fbbd935b154e5a77d68e60d861601d740bf03 100644
--- a/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
+++ b/tensorflow/compiler/xla/service/batchnorm_expander_test.cc
@@ -36,7 +36,21 @@ limitations under the License.
 namespace xla {
 namespace {
 
-using BatchNormExpanderTest = HloTestBase;
+class BatchNormExpanderTest : public HloTestBase {
+ protected:
+  // BatchNorm should have a dynamic sized dividor for mean operations.
+  int64 CountGetDimensionSize(const HloModule& module) {
+    int64 count = 0;
+    for (HloComputation* comp : module.computations()) {
+      for (HloInstruction* inst : comp->instructions()) {
+        if (inst->opcode() == HloOpcode::kGetDimensionSize) {
+          count++;
+        }
+      }
+    }
+    return count;
+  }
+};
 
 // Test that we expand BatchNormTraining.
 TEST_F(BatchNormExpanderTest, BatchNormTraining) {
@@ -68,6 +82,7 @@ TEST_F(BatchNormExpanderTest, BatchNormTraining) {
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
@@ -110,6 +125,7 @@ TEST_F(BatchNormExpanderTest, BatchNormGrad) {
                              /*rewrite_grad_op=*/true);
   ASSERT_TRUE(rewriter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
+  EXPECT_EQ(CountGetDimensionSize(*module), 3);
   // Make sure this operation is expanded.
   EXPECT_EQ(root->opcode(), HloOpcode::kTuple);
 }
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.cc b/tensorflow/compiler/xla/service/buffer_assignment.cc
index 40c012a5e4214f00dbeaca4e8cbfaa668089c6e8..8d7c62447852fd946440c41389300a92377c471f 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment.cc
@@ -746,8 +746,7 @@ StatusOr<std::unique_ptr<BufferAssignment>> BufferAssigner::Run(
     LogicalBuffer::AlignmentFunction color_alignment,
     bool allow_input_output_aliasing, bool allocate_buffers_for_constants,
     BufferLiveness::Colorer colorer, ReuseAllocationFunction reuse_checker) {
-  BufferAssigner assigner(allow_input_output_aliasing,
-                          allocate_buffers_for_constants, std::move(colorer),
+  BufferAssigner assigner(allocate_buffers_for_constants, std::move(colorer),
                           std::move(reuse_checker));
   return assigner.CreateAssignment(module, std::move(hlo_ordering),
                                    std::move(buffer_size),
@@ -1434,33 +1433,40 @@ BufferAssigner::MergeColocatedBufferSets(
            computation == module->entry_computation();
   };
 
+  std::vector<bool> set_can_be_merged(colocated_buffer_sets.size(), true);
+
+  // Do not merge if one of the sets includes live outs, entry parameters or
+  // constants.
+  //
+  // Buffer liveness does not report the correct live range for entry
+  // parameter and live out buffers so we have to special case them here.  On
+  // backends that support constant buffer allocations, constant buffers are
+  // assigned globals in readonly storage so we can't merge colocated buffer
+  // sets containing constants with colocated buffer sets containing writing
+  // instructions or other constants.
+  //
+  // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
+  // the caller of the executable so we can't write to entry parameters
+  // either, and the argument for not merging constants also applies to entry
+  // parameters.
+  for (int64 i = 0; i < colocated_buffer_sets.size(); ++i) {
+    for (auto& buffer : colocated_buffer_sets[i]) {
+      if (buffer_liveness.MaybeLiveOut(*buffer) ||
+          is_entry_parameter(*buffer) ||
+          buffer->instruction()->opcode() == HloOpcode::kConstant) {
+        set_can_be_merged[i] = false;
+        break;
+      }
+    }
+  }
+
   // Returns true if the two colocated buffer sets (specified by their indices
   // into the colocated_buffer_sets) can be merged into a single set.
   auto cannot_merge_buffer_sets = [&colocated_buffer_sets, &buffer_liveness,
                                    &buffer_size,
-                                   &is_entry_parameter](int64 i, int64 j) {
-    // Do not merge if one of the sets includes live outs, entry parameters or
-    // constants.
-    //
-    // Buffer liveness does not report the correct live range for entry
-    // parameter and live out buffers so we have to special case them here.  On
-    // backends that support constant buffer allocations, constant buffers are
-    // assigned globals in readonly storage so we can't merge colocated buffer
-    // sets containing constants with colocated buffer sets containing writing
-    // instructions or other constants.
-    //
-    // Moreover (on the CPU/GPU backends) the entry parameter buffers belong to
-    // the caller of the executable so we can't write to entry parameters
-    // either, and the argument for not merging constants also applies to entry
-    // parameters.
-    for (int64 key : {i, j}) {
-      for (auto& buffer : colocated_buffer_sets[key]) {
-        if (buffer_liveness.MaybeLiveOut(*buffer) ||
-            is_entry_parameter(*buffer) ||
-            buffer->instruction()->opcode() == HloOpcode::kConstant) {
-          return true;
-        }
-      }
+                                   &set_can_be_merged](int64 i, int64 j) {
+    if (!set_can_be_merged[i] || !set_can_be_merged[j]) {
+      return true;
     }
 
     // Colocated sets satisfy the invariant that all buffers within a set have
diff --git a/tensorflow/compiler/xla/service/buffer_assignment.h b/tensorflow/compiler/xla/service/buffer_assignment.h
index d8e1612b899f10a5793f9c65c59a41024dfdddd1..0a9fdede803e84ca42472259084615c031b206eb 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment.h
+++ b/tensorflow/compiler/xla/service/buffer_assignment.h
@@ -545,12 +545,10 @@ class BufferAssigner {
       ReuseAllocationFunction reuse_checker = nullptr);
 
  private:
-  BufferAssigner(bool allow_input_output_aliasing,
-                 bool allocate_buffers_for_constants,
+  BufferAssigner(bool allocate_buffers_for_constants,
                  BufferLiveness::Colorer colorer,
                  ReuseAllocationFunction reuse_checker)
-      : allow_input_output_aliasing_(allow_input_output_aliasing),
-        allocate_buffers_for_constants_(allocate_buffers_for_constants),
+      : allocate_buffers_for_constants_(allocate_buffers_for_constants),
         colorer_(colorer),
         reuse_checker_(reuse_checker) {}
   virtual ~BufferAssigner() = default;
@@ -640,10 +638,6 @@ class BufferAssigner {
                       LogicalBuffer::Color::Hasher>
   SplitBuffersByColor(const absl::flat_hash_set<const LogicalBuffer*>& buffers);
 
-  // If true, buffer assignments assumes that input parameter buffers and output
-  // buffers can be shared if their sizes match.
-  bool allow_input_output_aliasing_;
-
   // If true, allocate buffers for constant instructions.
   bool allocate_buffers_for_constants_;
 
diff --git a/tensorflow/compiler/xla/service/buffer_assignment_test.cc b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
index b1fc50cb1881241a0a53b024b06342308cabdd62..8f482e6ba8c3e71c9980be5e6947ea61f3b4ef29 100644
--- a/tensorflow/compiler/xla/service/buffer_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_assignment_test.cc
@@ -137,8 +137,7 @@ class BufferAssignmentTest : public HloTestBase {
   }
 
   std::unique_ptr<BufferAssignment> RunBufferAssignmentWithInstructionSequence(
-      HloModule* module,
-      absl::Span<const HloInstruction* const> instruction_sequence,
+      HloModule* module, absl::Span<HloInstruction* const> instruction_sequence,
       int64 alignment = 1) {
     HloSchedule schedule(module);
     schedule.set_sequence(module->entry_computation(), instruction_sequence);
@@ -1853,7 +1852,7 @@ class WhileBufferAssignmentTest : public HloTestBase {
   std::unique_ptr<BufferAssignment> RunBufferAssignment(HloModule* module,
                                                         int64 alignment = 1) {
     HloSchedule schedule =
-        ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+        ScheduleModule(module, ByteSizeOf).ConsumeValueOrDie();
     return BufferAssigner::Run(
                module, absl::make_unique<SequentialHloOrdering>(schedule),
                ByteSizeOf,
@@ -2162,7 +2161,7 @@ TEST_F(WhileBufferAssignmentTest, ColocatedBuffers) {
   // nodes are traversed during BufferAssignment.
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -2391,15 +2390,16 @@ TEST_F(WhileBufferAssignmentTest, WhileLoopsInterferingResultRange) {
   RunCopyInsertion(module.get());
 
   HloSchedule schedule =
-      ScheduleModule(*module, ByteSizeOf).ConsumeValueOrDie();
+      ScheduleModule(module.get(), ByteSizeOf).ConsumeValueOrDie();
 
   // To trigger b/38494731, we want a specific Hlo schedule for the
   // root computation, so we overwrite that entry with a manually
   // crafted sequence.
-  schedule.set_sequence(module->entry_computation(),
-                        {input1, weights1, one, output1, while1->operand(0),
-                         while1, input0, weights0, zero, output0,
-                         while0->operand(0), while0, gte0, gte1, root_add});
+  schedule.set_sequence(
+      module->entry_computation(),
+      {input1, weights1, one, output1, while1->mutable_operand(0), while1,
+       input0, weights0, zero, output0, while0->mutable_operand(0), while0,
+       gte0, gte1, root_add});
 
   // If this ASSERT fails, we constructed a bogus sequence above and this test
   // itself is buggy.
diff --git a/tensorflow/compiler/xla/service/buffer_liveness_test.cc b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
index aeee543e8435200915ab992e2aa146a3c17646d5..40825a78716b1c0b9fb0121787977d275891c0f8 100644
--- a/tensorflow/compiler/xla/service/buffer_liveness_test.cc
+++ b/tensorflow/compiler/xla/service/buffer_liveness_test.cc
@@ -117,7 +117,7 @@ TEST_F(BufferLivenessTest, ElementwiseChain) {
   auto log = builder.AddInstruction(
       HloInstruction::CreateUnary(vec_, HloOpcode::kLog, exp));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -164,7 +164,7 @@ TEST_F(BufferLivenessTest, MultipleEntryParameters_Sequential) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* entry = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -213,7 +213,7 @@ TEST_F(BufferLivenessTest, NonElementwiseOperand) {
   auto reverse =
       builder.AddInstruction(HloInstruction::CreateReverse(vec_, negate, {0}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -247,7 +247,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffers) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -289,7 +289,7 @@ TEST_F(BufferLivenessTest, OverlappedBuffersSequentialOrder) {
   auto add = builder.AddInstruction(
       HloInstruction::CreateBinary(vec_, HloOpcode::kAdd, negate, exp));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   HloSchedule schedule(module.get());
@@ -336,7 +336,7 @@ TEST_F(BufferLivenessTest, RootInstructionIsNotLastInSequentialOrder) {
       HloInstruction::CreateSend(recv_done, token, /*channel_id=*/1));
   auto send_done = builder.AddInstruction(HloInstruction::CreateSendDone(send));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build(add));
 
   HloSchedule schedule(module.get());
@@ -373,7 +373,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
   auto outer_tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({inner_tuple, exp}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -393,7 +393,7 @@ TEST_F(BufferLivenessTest, TupleLiveOut) {
 
 TEST_F(BufferLivenessTest, EmbeddedComputation) {
   // Test MaybeLiveOut and MayInterfere for embedded computation.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   auto embedded_builder = HloComputation::Builder(TestName() + "_embedded");
   auto embedded_param = embedded_builder.AddInstruction(
@@ -450,7 +450,7 @@ TEST_F(BufferLivenessTest, TupleConstantLiveOut) {
   builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       inner_tuple0.shape(), tuple_constant, 0));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto liveness =
@@ -576,7 +576,7 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
   auto tuple_root =
       builder.AddInstruction(HloInstruction::CreateTuple({add0, add1}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(BuildDummyComputation());
   module->AddEmbeddedComputation(builder.Build());
 
@@ -611,8 +611,8 @@ TEST_F(BufferLivenessTest, DependentTupleElements) {
 class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
  protected:
   // Builds and runs a computation (see test case computation graphs below).
-  std::unique_ptr<HloModule> BuildModule(const bool update_uses_tuple_element1,
-                                         const bool fuse_gte0) {
+  std::unique_ptr<VerifiedHloModule> BuildModule(
+      const bool update_uses_tuple_element1, const bool fuse_gte0) {
     auto builder = HloComputation::Builder(TestName());
     // Create param0 Tuple.
     Shape data_shape = ShapeUtil::MakeShape(F32, {8});
@@ -646,7 +646,7 @@ class FusedDynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewUnverifiedModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
     auto* computation = module->entry_computation();
     // Create fusion instruction based on number of tuple element 1 users.
@@ -802,7 +802,7 @@ class DynamicUpdateSliceLivenessTest : public BufferLivenessTest {
     auto tuple_root = builder.AddInstruction(
         HloInstruction::CreateTuple({gte0, dynamic_update_slice}));
     // Build module and get reference to entry computation.
-    auto module = CreateNewUnverifiedModule();
+    auto module = CreateNewVerifiedModule();
     module->AddEntryComputation(BuildDummyComputation());
     module->AddEmbeddedComputation(builder.Build());
     // Run BufferLiveness on 'module'.
diff --git a/tensorflow/compiler/xla/service/call_graph.cc b/tensorflow/compiler/xla/service/call_graph.cc
index bdd5069632e84fe6c67ca129f726432479ac1b35..7987343bfaf1069fd550909d127e4b11f2124701 100644
--- a/tensorflow/compiler/xla/service/call_graph.cc
+++ b/tensorflow/compiler/xla/service/call_graph.cc
@@ -325,6 +325,15 @@ bool CallGraph::IsFlattened() const {
   return true;
 }
 
+std::vector<HloInstruction*> CallGraph::GetComputationCallers(
+    HloComputation* c) {
+  std::vector<HloInstruction*> callers;
+  for (auto callsite : GetNode(c).caller_callsites()) {
+    callers.push_back(callsite.instruction());
+  }
+  return callers;
+}
+
 std::pair<HloInstruction*, HloInstruction*>
 CallGraph::NearestAncestorsInSameComputation(HloInstruction* a,
                                              HloInstruction* b) const {
diff --git a/tensorflow/compiler/xla/service/call_graph.h b/tensorflow/compiler/xla/service/call_graph.h
index cb56f4789d06ac33acdaadc8b619b9e37f683d58..05c7c998738f861ee804d1ec87bfa5fb17ddfb74 100644
--- a/tensorflow/compiler/xla/service/call_graph.h
+++ b/tensorflow/compiler/xla/service/call_graph.h
@@ -236,6 +236,10 @@ class CallGraph {
   // FlattenCallGraph.
   bool IsFlattened() const;
 
+  // Returns a vector of instructions calling the passed computation.
+  // (Often a vector of size 1.)
+  std::vector<HloInstruction*> GetComputationCallers(HloComputation* c);
+
   string ToString() const;
 
  private:
diff --git a/tensorflow/compiler/xla/service/compile_only_service.cc b/tensorflow/compiler/xla/service/compile_only_service.cc
index 67132274c0dcbfda831c79836d052bb51b753ec7..1965925fa7f6d50b1d7af918bc3468d4b4d5d0a2 100644
--- a/tensorflow/compiler/xla/service/compile_only_service.cc
+++ b/tensorflow/compiler/xla/service/compile_only_service.cc
@@ -86,15 +86,15 @@ CompileOnlyService::CompileAheadOfTime(
           Executable::DumpToDirectory(per_host_path, filename, hlo_snapshot));
     }
 
-    const auto& program_shape = instance.computation.host_program_shape();
     ExecutionOptions execution_options;
     *execution_options.mutable_debug_options() = debug_options;
     *execution_options.mutable_shape_with_output_layout() =
-        *instance.result_layout;
+        instance.result_layout->ToProto();
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(program_shape, instance.argument_layouts,
-                           &execution_options));
+        CreateModuleConfig(
+            ProgramShape(instance.computation.host_program_shape()),
+            instance.argument_layouts, &execution_options));
 
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModule> hlo_module,
diff --git a/tensorflow/compiler/xla/service/computation_placer.h b/tensorflow/compiler/xla/service/computation_placer.h
index c899ffb9dc562426ef14c0d414469c04debeec70..844b42a38d7539cccd5c4e30071c0ea6693e3bba 100644
--- a/tensorflow/compiler/xla/service/computation_placer.h
+++ b/tensorflow/compiler/xla/service/computation_placer.h
@@ -105,8 +105,6 @@ class ComputationPlacer {
   // Map from platform kind to computation placer singleton.
   static std::map<se::Platform::Id, State>* GetPlatformComputationPlacers();
 
-  se::Platform::Id platform_id_;
-
   TF_DISALLOW_COPY_AND_ASSIGN(ComputationPlacer);
 };
 
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
index 7f7f1503a099b3a67ed22cb5978c01da6cf8ba88..95c7724c3c93507ae61a984301ecfc0111bef192 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter.cc
@@ -142,16 +142,16 @@ std::vector<int32> GetMaskIds(int64 group_size, int64 group_count) {
 // Finally we use the Eq op of these two broadcasted constants and get the
 // desired mask.
 HloInstruction* GetExpandedFilterMask(
-    const Shape& filter_shape, int64 input_feature_dim,
-    int64 output_feature_dim, int64 group_count,
+    const Shape& filter_shape, int64 kernel_input_feature_dim,
+    int64 kernel_output_feature_dim, int64 group_count,
     const std::function<HloInstruction*(std::unique_ptr<HloInstruction>)>&
         add_instruction) {
   Shape expanded_filter_shape =
-      ExpandedFilterShape(filter_shape, group_count, input_feature_dim);
+      ExpandedFilterShape(filter_shape, group_count, kernel_input_feature_dim);
   Shape mask_shape = ShapeUtil::MakeShape(
       S32, AsInt64Slice(expanded_filter_shape.dimensions()));
-  int64 output_feature = filter_shape.dimensions(output_feature_dim);
-  int64 group_size = filter_shape.dimensions(input_feature_dim);
+  int64 output_feature = filter_shape.dimensions(kernel_output_feature_dim);
+  int64 group_size = filter_shape.dimensions(kernel_input_feature_dim);
 
   // Create a 'input_feature' sized linspace and 'output_feature' sized linspace
   // that will be broadcasted into perpendicular dimensions and compared.
@@ -159,15 +159,14 @@ HloInstruction* GetExpandedFilterMask(
       GetMaskIds(group_size, group_count);
   const std::vector<int32> output_feature_filter_mask =
       GetMaskIds(output_feature / group_count, group_count);
-
   auto mask1 = add_instruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>(input_feature_filter_mask)));
-  auto broadcasted_mask1 = add_instruction(
-      HloInstruction::CreateBroadcast(mask_shape, mask1, {input_feature_dim}));
+  auto broadcasted_mask1 = add_instruction(HloInstruction::CreateBroadcast(
+      mask_shape, mask1, {kernel_input_feature_dim}));
   auto mask2 = add_instruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<int32>(output_feature_filter_mask)));
-  auto broadcasted_mask2 = add_instruction(
-      HloInstruction::CreateBroadcast(mask_shape, mask2, {output_feature_dim}));
+  auto broadcasted_mask2 = add_instruction(HloInstruction::CreateBroadcast(
+      mask_shape, mask2, {kernel_output_feature_dim}));
 
   // Compare the broadcasted output feature linspace to the input feature
   // linspace to create a diagonal predicate.
@@ -189,91 +188,203 @@ Status ConvolutionVisitor::HandleConvolution(HloInstruction* convolution) {
   };
 
   auto dim_numbers = convolution->convolution_dimension_numbers();
-  int64 input_feature_dim = dim_numbers.kernel_input_feature_dimension();
-  int64 group_size = filter->shape().dimensions(input_feature_dim);
-  int64 output_feature_dim = dim_numbers.kernel_output_feature_dimension();
-  auto expanded_filter_shape =
-      ExpandedFilterShape(filter->shape(), group_count, input_feature_dim);
-  HloInstruction* filter_mask = GetExpandedFilterMask(
-      filter->shape(), input_feature_dim, output_feature_dim, group_count, add);
+  int64 kernel_input_feature_dim = dim_numbers.kernel_input_feature_dimension();
+  int64 group_size = filter->shape().dimensions(kernel_input_feature_dim);
+  int64 kernel_output_feature_dim =
+      dim_numbers.kernel_output_feature_dimension();
+  auto expanded_filter_shape = ExpandedFilterShape(filter->shape(), group_count,
+                                                   kernel_input_feature_dim);
+  HloInstruction* filter_mask =
+      GetExpandedFilterMask(filter->shape(), kernel_input_feature_dim,
+                            kernel_output_feature_dim, group_count, add);
   HloInstruction* expanded_filter;
 
   if (group_size == 1) {
     bool depthwise_separable =
-        (group_count == filter->shape().dimensions(output_feature_dim));
+        (group_count == filter->shape().dimensions(kernel_output_feature_dim));
     // If the code generator handles depthwise separable convolutions
     // inherently, then no filter expansion is needed.
     if (!filter_expansion_ && depthwise_separable) {
-      const int64 old_kernel_input_feature_dimension =
-          dim_numbers.kernel_input_feature_dimension();
-      const int64 old_kernel_output_feature_dimension =
-          dim_numbers.kernel_output_feature_dimension();
-
-      // For depthwise convolutions, we want the kernel input feature dimension
-      // to be smaller than the output feature dimension. If that's not the
-      // case, we swap the dimensions.
-      if (old_kernel_input_feature_dimension >
-          old_kernel_output_feature_dimension) {
-        Shape reshaped_filter_shape = filter->shape();
-        auto& dimensions = *reshaped_filter_shape.mutable_dimensions();
-        std::swap(dimensions[old_kernel_input_feature_dimension],
-                  dimensions[old_kernel_output_feature_dimension]);
-
-        auto reshaped_filter =
-            add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
-
-        dim_numbers.set_kernel_input_feature_dimension(
-            old_kernel_output_feature_dimension);
-
-        dim_numbers.set_kernel_output_feature_dimension(
-            old_kernel_input_feature_dimension);
-
-        auto new_convolution = HloInstruction::CreateConvolve(
-            convolution->shape(), convolution->mutable_operand(0),
-            reshaped_filter, group_count, convolution->window(), dim_numbers,
-            convolution->precision_config());
-
-        TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-            convolution, std::move(new_convolution)));
-      }
       return Status::OK();
     }
     // We want to repeat 'filter' in the 'input_feature_dim' dimension
     // 'group_count' times.
     Shape reshaped_filter_shape =
-        ShapeUtil::DeleteDimension(input_feature_dim, filter->shape());
+        ShapeUtil::DeleteDimension(kernel_input_feature_dim, filter->shape());
     auto reshaped_filter =
         add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
     std::vector<int64> broadcast_dims;
     for (int64 i = 0; i < filter->shape().dimensions_size(); ++i) {
-      if (i == input_feature_dim) {
+      if (i == kernel_input_feature_dim) {
         continue;
       }
       broadcast_dims.push_back(i);
     }
     expanded_filter = add(HloInstruction::CreateBroadcast(
         expanded_filter_shape, reshaped_filter, broadcast_dims));
+
+    auto zero = add(HloInstruction::CreateConstant(
+        LiteralUtil::Zero(expanded_filter_shape.element_type())));
+    auto zero_filter =
+        add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
+    auto new_filter = add(HloInstruction::CreateTernary(
+        expanded_filter_shape, HloOpcode::kSelect, filter_mask, expanded_filter,
+        zero_filter));
+
+    auto new_convolution = HloInstruction::CreateConvolve(
+        convolution->shape(), convolution->mutable_operand(0), new_filter,
+        /*feature_group_count=*/1, convolution->window(), dim_numbers,
+        convolution->precision_config());
+    TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+        convolution, std::move(new_convolution)));
   } else {
-    // We could possibly also use reshape, broadcast, reshape instead of concat
-    // here, but it would require more complex code, and for depthwise
-    // convolution we would never end up in this branch.
-    std::vector<HloInstruction*> concat_operands(group_count, filter);
-    expanded_filter = add(HloInstruction::CreateConcatenate(
-        expanded_filter_shape, concat_operands, input_feature_dim));
+    int64 activation_input_feature_dim = dim_numbers.input_feature_dimension();
+
+    int64 output_feature =
+        filter->shape().dimensions(kernel_output_feature_dim);
+
+    // If group_count == output_feature, then we map those grouped convolutions
+    // onto depthwise convolution. This is done by adding an additional spatial
+    // dimension to the activations, kernel, and the output.
+    // E.g., we would turn
+    // [2, 12]{B, IF} conv [3, 4]{IF, OF} into
+    // [3, 2, 4]{S, B, IF} depth conv [3, 1, 4]{S, IF, OF}, where S is the
+    // additional spatial dimension. The generated convolution output will be
+    // [1, 2, 4]{S, B, OF} and then reshape the output back to [2, 4] {B, OF}.
+
+    if (group_count == output_feature && !filter_expansion_) {
+      auto filter = convolution->mutable_operand(1);
+      auto activation = convolution->mutable_operand(0);
+
+      // Add spatial dimension to the activation, and reshape.
+      Shape reshaped_activation_shape = activation->shape();
+      ShapeUtil::AppendMajorDimension(group_size, &reshaped_activation_shape);
+
+      int64 new_spatial_dim = reshaped_activation_shape.dimensions().size() - 1;
+
+      reshaped_activation_shape.set_dimensions(activation_input_feature_dim,
+                                               group_count);
+      activation = add(
+          HloInstruction::CreateReshape(reshaped_activation_shape, activation));
+
+      // Add spatial dimension to the filter, and reshape.
+      Shape reshaped_filter_shape = filter->shape();
+      ShapeUtil::AppendMajorDimension(1, &reshaped_filter_shape);
+
+      filter =
+          add(HloInstruction::CreateReshape(reshaped_filter_shape, filter));
+
+      Shape new_output_shape = convolution->shape();
+      ShapeUtil::AppendMajorDimension(1, &new_output_shape);
+
+      // Edit convolution dimension numbers. Note that kernel_input_feature_dim
+      // now becomes a spatial dimension, and the newly added dimension of size
+      // 1 is the new kernel_input_feature_dim.
+      dim_numbers.add_input_spatial_dimensions(new_spatial_dim);
+      dim_numbers.add_kernel_spatial_dimensions(kernel_input_feature_dim);
+      dim_numbers.set_kernel_input_feature_dimension(new_spatial_dim);
+      dim_numbers.add_output_spatial_dimensions(new_spatial_dim);
+
+      // Add window for the new spatial dimension.
+      Window new_window = convolution->window();
+      auto* dim = new_window.add_dimensions();
+      dim->set_window_dilation(1);
+      dim->set_base_dilation(1);
+      dim->set_stride(1);
+      dim->set_size(group_size);
+
+      auto new_convolution = add(HloInstruction::CreateConvolve(
+          new_output_shape, activation, filter, group_count, new_window,
+          dim_numbers, convolution->precision_config()));
+
+      // Delete the extra spatial dimension, and reshape.
+      Shape reshaped_convolution_shape =
+          ShapeUtil::DeleteDimension(new_spatial_dim, new_convolution->shape());
+      auto reshaped_convolution = HloInstruction::CreateReshape(
+          reshaped_convolution_shape, new_convolution);
+
+      TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+          convolution, std::move(reshaped_convolution)));
+
+    } else {
+      // The filter expansion mechanism adds zeroes in the kernel.
+      // For an OF = 12, IF = 6, and kernel IF = 2, the expanded filter mask
+      // would look like (IF on the Y-axis, OF on the X-axis)
+      // 1 1 1 1 0 0 0 0 0 0 0 0
+      // 1 1 1 1 0 0 0 0 0 0 0 0
+      // 0 0 0 0 1 1 1 1 0 0 0 0
+      // 0 0 0 0 1 1 1 1 0 0 0 0
+      // 0 0 0 0 0 0 0 0 1 1 1 1
+      // 0 0 0 0 0 0 0 0 1 1 1 1
+      //
+      // Instead of convolving the above with the input, we instead slice the
+      // kernel into three kernels, each containing islands of 1s from the
+      // filter above. We also slice the activations in the IF dimension with
+      // each slice of size = group_size. For each slice, we perform
+      // convolutions, and concatenate the generated outputs in the output OF
+      // dimension.
+
+      std::vector<HloInstruction*> sliced_convolutions;
+      auto activation = convolution->mutable_operand(0);
+      std::vector<int64> slice_strides(filter->shape().dimensions_size(), 1);
+      std::vector<int64> filter_slice_starts(filter->shape().dimensions_size(),
+                                             0);
+      std::vector<int64> filter_slice_limits(
+          filter->shape().dimensions().begin(),
+          filter->shape().dimensions().end());
+      std::vector<int64> activation_slice_starts(
+          activation->shape().dimensions_size(), 0);
+      std::vector<int64> activation_slice_limits(
+          activation->shape().dimensions().begin(),
+          activation->shape().dimensions().end());
+
+      int64 output_feature =
+          filter->shape().dimensions(kernel_output_feature_dim);
+      auto output_feature_dim = dim_numbers.output_feature_dimension();
+      int64 filter_slice_width = output_feature / group_count;
+
+      int64 activation_input_feature_dim =
+          dim_numbers.input_feature_dimension();
+
+      for (int64 i = 0; i < group_count; i++) {
+        filter_slice_starts[kernel_output_feature_dim] = i * filter_slice_width;
+        filter_slice_limits[kernel_output_feature_dim] =
+            (i + 1) * filter_slice_width;
+        auto filter_sliced_shape = filter->shape();
+        filter_sliced_shape.set_dimensions(kernel_output_feature_dim,
+                                           filter_slice_width);
+        auto filter_slice = add(HloInstruction::CreateSlice(
+            filter_sliced_shape, filter, filter_slice_starts,
+            filter_slice_limits, slice_strides));
+
+        activation_slice_starts[activation_input_feature_dim] = i * group_size;
+        activation_slice_limits[activation_input_feature_dim] =
+            (i + 1) * group_size;
+        auto activation_sliced_shape = activation->shape();
+        activation_sliced_shape.set_dimensions(activation_input_feature_dim,
+                                               group_size);
+        auto activation_slice = add(HloInstruction::CreateSlice(
+            activation_sliced_shape, activation, activation_slice_starts,
+            activation_slice_limits, slice_strides));
+
+        auto conv_slice_shape = convolution->shape();
+        conv_slice_shape.set_dimensions(output_feature_dim, filter_slice_width);
+
+        auto new_convolution = add(HloInstruction::CreateConvolve(
+            conv_slice_shape, activation_slice, filter_slice,
+            /*feature_group_count=*/1, convolution->window(), dim_numbers,
+            convolution->precision_config()));
+
+        sliced_convolutions.push_back(new_convolution);
+      }
+
+      auto new_conv = HloInstruction::CreateConcatenate(
+          convolution->shape(), sliced_convolutions, output_feature_dim);
+      TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
+          convolution, std::move(new_conv)));
+    }
   }
-  auto zero = add(HloInstruction::CreateConstant(
-      LiteralUtil::Zero(expanded_filter_shape.element_type())));
-  auto zero_filter =
-      add(HloInstruction::CreateBroadcast(expanded_filter_shape, zero, {}));
-  auto new_filter = add(
-      HloInstruction::CreateTernary(expanded_filter_shape, HloOpcode::kSelect,
-                                    filter_mask, expanded_filter, zero_filter));
-  auto new_convolution = HloInstruction::CreateConvolve(
-      convolution->shape(), convolution->mutable_operand(0), new_filter,
-      /*feature_group_count=*/1, convolution->window(), dim_numbers,
-      convolution->precision_config());
-  TF_RETURN_IF_ERROR(computation_->ReplaceWithNewInstruction(
-      convolution, std::move(new_convolution)));
+
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
index 28373ebf636c7b6b3059dcf6cd931901ebc87fc2..e6bf2143a21bd5001d3530fe8727c88504be1d43 100644
--- a/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
+++ b/tensorflow/compiler/xla/service/convolution_feature_group_converter_test.cc
@@ -82,18 +82,14 @@ ENTRY %Convolve1D1Window_0.v3 (input: f32[1,2,4], filter: f32[1,2,2]) -> f32[1,2
   ConvolutionFeatureGroupConverter converter;
   ASSERT_TRUE(converter.Run(module.get()).ValueOrDie());
   root = computation->root_instruction();
-  // Make sure the convolution is converted to one with feature_group_count = 1.
-  EXPECT_EQ(root->opcode(), HloOpcode::kConvolution);
-  EXPECT_EQ(root->feature_group_count(), 1);
-  // Verify that the filter operand has been replaced.
-  EXPECT_THAT(root->operand(1),
-              op::Select(op::Eq(op::Broadcast(op::Constant()),
-                                op::Broadcast(op::Constant())),
-                         // We expect to see Concatenate here instead of
-                         // Broadcast, because feature_group_count < input
-                         // feature dimension.
-                         op::Concatenate(op::Parameter(), op::Parameter()),
-                         op::Broadcast(op::Constant())));
+  // Make sure the convolution is replaced with a concatenate.
+  EXPECT_EQ(root->opcode(), HloOpcode::kConcatenate);
+  // And the operands of the concatenate are convolutions, each with a feature
+  // group count = 1.
+  EXPECT_EQ(root->operand(0)->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->operand(1)->opcode(), HloOpcode::kConvolution);
+  EXPECT_EQ(root->operand(0)->feature_group_count(), 1);
+  EXPECT_EQ(root->operand(1)->feature_group_count(), 1);
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/copy_insertion.cc b/tensorflow/compiler/xla/service/copy_insertion.cc
index 4e547d925f62dce1d2dd23a39a28ca8c23ba9f2f..df6059663876dfde71f4c75d3931b3d2de72c1df 100644
--- a/tensorflow/compiler/xla/service/copy_insertion.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion.cc
@@ -442,7 +442,6 @@ class CopyRemover {
               const HloOrdering& ordering, HloModule* module)
       : module_(module),
         alias_analysis_(alias_analysis),
-        ordering_(ordering),
         buffer_value_tracker_(*module, alias_analysis, ordering) {}
 
   // Try to elide the given copy. The copy is elided if the instruction is not
@@ -1003,7 +1002,6 @@ class CopyRemover {
 
   HloModule* module_;
   const HloAliasAnalysis& alias_analysis_;
-  const HloOrdering& ordering_;
 
   // Object tracking the HLO values contained in each HLO buffer.
   BufferValueTracker buffer_value_tracker_;
diff --git a/tensorflow/compiler/xla/service/copy_insertion_test.cc b/tensorflow/compiler/xla/service/copy_insertion_test.cc
index 7446bc7cc11553984dcf1cea00c58072d2cbf0f0..e4e9d7ba05c115be9dd0eb53ebd7de208d514efb 100644
--- a/tensorflow/compiler/xla/service/copy_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/copy_insertion_test.cc
@@ -94,7 +94,7 @@ TEST_F(CopyInsertionTest, SingleParameter) {
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -114,7 +114,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(tuple));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -127,7 +127,7 @@ TEST_F(CopyInsertionTest, SingleConstant) {
 TEST_F(CopyInsertionTest, ExistingCopiesNotRemoved) {
   // Verify that kCopy instructions which change layout and exist before
   // copy-insertion remain in the graph after copy-insertion.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   auto builder = HloComputation::Builder(TestName());
   HloInstruction* constant =
@@ -181,7 +181,7 @@ TEST_F(CopyInsertionTest, MultipleConstantsAndParameters) {
 
   builder.AddInstruction(HloInstruction::CreateTuple({constant2, x, add}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   InsertCopies(module.get());
@@ -217,7 +217,7 @@ TEST_F(CopyInsertionTest, AmbiguousPointsToSet) {
   EXPECT_THAT(constant2->users(), UnorderedElementsAre(tuple1, tuple2));
   EXPECT_THAT(constant3->users(), UnorderedElementsAre(tuple2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloInstruction* old_root = module->entry_computation()->root_instruction();
@@ -238,7 +238,7 @@ TEST_F(CopyInsertionTest, BitcastParameter) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -261,7 +261,7 @@ TEST_F(CopyInsertionTest, BitcastConstant) {
   HloInstruction* bitcast = builder.AddInstruction(HloInstruction::CreateUnary(
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, constant));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(constant->users(), UnorderedElementsAre(bitcast));
@@ -283,7 +283,7 @@ TEST_F(CopyInsertionTest, BitcastTupleElementParameter) {
       ShapeUtil::MakeShape(F32, {2, 2}), HloOpcode::kBitcast, x));
   builder.AddInstruction(HloInstruction::CreateTuple({bitcast}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(x->users(), UnorderedElementsAre(bitcast));
@@ -310,7 +310,7 @@ TEST_F(CopyInsertionTest, NestedTupleParameter) {
            ShapeUtil::MakeShape(F32, {42})}),
       "param0"));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(HloOpcode::kParameter,
@@ -351,7 +351,7 @@ TEST_F(CopyInsertionTest, ElementOfNestedTupleParameter) {
   auto gte = builder.AddInstruction(HloInstruction::CreateGetTupleElement(
       ShapeUtil::GetSubshape(param->shape(), {0}), param, 0));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -388,7 +388,7 @@ TEST_F(CopyInsertionTest, AmbiguousTopLevelRoot) {
       builder.AddInstruction(HloInstruction::CreateGetTupleElement(
           ShapeUtil::GetSubshape(select->shape(), {0}), select, 0));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(gte, module->entry_computation()->root_instruction());
@@ -1295,7 +1295,7 @@ TEST_F(WhileCopyInsertionTest, InitPointsToNonDistinctUsedByTwoWhileLoops) {
 TEST_F(CopyInsertionTest, SwizzlingWhile) {
   // Test a while instruction with a body which permutes its tuple parameter
   // elements.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1362,7 +1362,7 @@ TEST_F(CopyInsertionTest, CrossingParameters) {
   //   |  / \ |
   //   | /   \|
   //  (p1  ,  p0)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1395,7 +1395,7 @@ TEST_F(CopyInsertionTest, ParametersAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1428,7 +1428,7 @@ TEST_F(CopyInsertionTest, ParameterWithNoAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1461,7 +1461,7 @@ TEST_F(CopyInsertionTest, ParameterWithPartialAliasing) {
   //   |      |
   //   |      |
   //  (p0 ,  p1)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1496,7 +1496,7 @@ TEST_F(CopyInsertionTest, ParameterAndParallelOpsWithPartialAliasing) {
   //   |    |      |
   //   |    |      |
   //   +-- (p0 ,  p1)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1534,7 +1534,7 @@ TEST_F(CopyInsertionTest, ParameterAndOpsWithPartialAliasing) {
   //   |    Add----+
   //   |    |      |
   //   +-- (p0 ,  p1)
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape tuple_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1569,7 +1569,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileWithOneOp) {
   // the operation (instruction) on the element makes the live range of the
   // respective input and output elements different than if the instruction were
   // not there (as in the SwizzlingWhile test above).
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1632,7 +1632,7 @@ TEST_F(CopyInsertionTest, SwizzlingWhileSharedInput) {
   // the while body is a single constant (both loop state elements are the same
   // constant). This means no copies are necessary because both loop state
   // elements are the same so interchanging them is a no-op.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape loop_state_shape =
       ShapeUtil::MakeTupleShape({scalar_shape_, scalar_shape_});
 
@@ -1693,7 +1693,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
   const Shape loop_state_shape = ShapeUtil::MakeTupleShape(
       {element_shape, element_shape, element_shape, element_shape});
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, element_shape, "param_0"));
@@ -1783,7 +1783,7 @@ TEST_F(CopyInsertionTest, SequentialWhiles) {
 TEST_F(CopyInsertionTest, WhileBodyWithConstantRoot) {
   // Test a while body and condition which are each simply a constant (root of
   // computation is a constant). The body constant should be copied.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
   auto param_0 = builder.AddInstruction(
       HloInstruction::CreateParameter(0, scalar_shape_, "param_0"));
diff --git a/tensorflow/compiler/xla/service/cpu/BUILD b/tensorflow/compiler/xla/service/cpu/BUILD
index 2763d18121a0c1328ea0c11d825476923ae2b15d..ce4c2a9cc69240b9565b35a3f2504d7fc9373917 100644
--- a/tensorflow/compiler/xla/service/cpu/BUILD
+++ b/tensorflow/compiler/xla/service/cpu/BUILD
@@ -96,6 +96,7 @@ cc_library(
         "@com_google_absl//absl/types:span",
         "//tensorflow/compiler/tf2xla:cpu_function_runtime",
         "//tensorflow/compiler/xla/service:map_inliner",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:scatter_expander",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:protobuf_util",
diff --git a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
index 73b03440cbb936017257b8a92f16dcc25d41e21c..796a7cf94d02b0ad42366387a9d3f8d589b8840a 100644
--- a/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
+++ b/tensorflow/compiler/xla/service/cpu/compiler_functor.cc
@@ -61,19 +61,6 @@ Disabling these as a starting point.
 // TODO(b/64227304) Creating a custom pass pipeline will replace this.
 
 namespace {
-class FilteredFunctionPassManager : public llvm::legacy::FunctionPassManager {
- public:
-  FilteredFunctionPassManager(llvm::Module* m, bool disable_expensive_passes)
-      : llvm::legacy::FunctionPassManager(m),
-        disable_expensive_passes_(disable_expensive_passes) {}
-  void add(llvm::Pass* p) override {
-    llvm::legacy::FunctionPassManager::add(p);
-  }
-
- private:
-  bool disable_expensive_passes_;
-};
-
 class FilteredPassManager : public llvm::legacy::PassManager {
  public:
   explicit FilteredPassManager(bool disable_expensive_passes)
@@ -96,8 +83,7 @@ class FilteredPassManager : public llvm::legacy::PassManager {
 std::unique_ptr<llvm::MemoryBuffer> CompilerFunctor::operator()(
     llvm::Module& module) const {
   FilteredPassManager module_passes(disable_expensive_passes_);
-  FilteredFunctionPassManager function_passes(&module,
-                                              disable_expensive_passes_);
+  llvm::legacy::FunctionPassManager function_passes(&module);
 
   VLOG(2) << "IR before optimizations";
   XLA_VLOG_LINES(2, llvm_ir::DumpModuleToString(module));
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
index 4ce5a8a29255a763c83941efb6de9b7c652cedb4..6374822c81bf42fd12829f57cf93c19457128219 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_compiler.cc
@@ -76,6 +76,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
@@ -268,10 +269,11 @@ Status CpuCompiler::RunHloPassesThroughLayoutAssn(
         /*rewrite_training_op=*/true,
         /*rewrite_inference_op=*/true,
         /*rewrite_grad_op=*/true);
-    pass.AddPass<AlgebraicSimplifier>(
-        /*is_layout_sensitive=*/false,
-        [](const Shape&, const Shape&) { return false; },
-        /*enable_dot_strength_reduction=*/false);
+    pipeline.AddPass<HloGetDimensionSizeRewriter>();
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return false; });
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<AlgebraicSimplifier>(options);
     pass.AddPass<HloDCE>();
 
     // BatchNormExpander can create zero-sized ops, so zero-sized HLO
@@ -334,10 +336,11 @@ Status CpuCompiler::RunHloPassesAfterLayoutAssn(
     pass.AddInvariantChecker<HloVerifier>(
         /*layout_sensitive=*/true,
         /*allow_mixed_precision=*/false);
-    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
-        [](const Shape&, const Shape&) { return true; },
-        /*enable_dot_strength_reduction=*/false);
+    AlgebraicSimplifierOptions options(
+        [](const Shape&, const Shape&) { return true; });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_dot_strength_reduction(false);
+    pass.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
     pass.AddPass<HloDCE>();
     pass.AddPass<HloCSE>(/*is_layout_sensitive=*/true);
   }
@@ -587,9 +590,9 @@ StatusOr<std::unique_ptr<Executable>> CpuCompiler::RunBackend(
   // Select an order for emitting the HLO instructions for each
   // computation. Using this sequence enables tighter buffer liveness analysis
   // and reduced memory usage (as compared to using DependencyHloOrdering).
-  TF_ASSIGN_OR_RETURN(
-      HloSchedule schedule,
-      ScheduleModule(*module, BufferSizeBytesFunction(), DFSMemoryScheduler));
+  TF_ASSIGN_OR_RETURN(HloSchedule schedule,
+                      ScheduleModule(module.get(), BufferSizeBytesFunction(),
+                                     DFSMemoryScheduler));
 
   // Run buffer allocation on the HLO graph.
   TF_ASSIGN_OR_RETURN(
@@ -779,7 +782,7 @@ CpuCompiler::CompileAheadOfTime(std::unique_ptr<HloModuleGroup> module_group,
     XLA_VLOG_LINES(2, module->ToString());
 
     TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                        ScheduleModule(*module, BufferSizeBytesFunction()));
+                        ScheduleModule(module, BufferSizeBytesFunction()));
 
     // Run buffer analysis on the HLO graph. This analysis figures out which
     // temporary buffers are required to run the computation.
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
index 29abf38e439d919ff93629ed992cb3ff93a929bd..818b2b0d0db2893e11fa46c7867e6c74bbbb6905 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.cc
@@ -51,8 +51,7 @@ namespace cpu {
 CpuExecutable::CpuExecutable(
     std::unique_ptr<SimpleOrcJIT> jit,
     std::unique_ptr<const BufferAssignment> assignment,
-    std::unique_ptr<const HloModule> hlo_module,
-    const string& entry_function_name,
+    std::unique_ptr<HloModule> hlo_module, const string& entry_function_name,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
     : Executable(std::move(hlo_module), std::move(hlo_profile_printer_data),
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_executable.h b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
index 3c3c047bfe8ee0d1ad90ede2432a86264f47870b..3b91b15ba9b5603b50f78f489e9a3fdad354c083 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_executable.h
+++ b/tensorflow/compiler/xla/service/cpu/cpu_executable.h
@@ -49,7 +49,7 @@ class CpuExecutable : public Executable {
  public:
   CpuExecutable(std::unique_ptr<SimpleOrcJIT> jit,
                 std::unique_ptr<const BufferAssignment> assignment,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 const string& entry_function_name,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
index f9cd61bea3dc86cadff99d4a90eca44c16520823..6f79ad7c1468f27c74d84770ec6358fbcd1c1f09 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion.cc
@@ -48,10 +48,15 @@ bool IsMatrixVectorDot(const HloInstruction* hlo) {
          (hlo_shape.dimensions(0) == 1 || hlo_shape.dimensions(1) == 1);
 }
 
+bool HasExactlyOneUse(const HloInstruction& hlo_instr) {
+  return hlo_instr.user_count() == 1 &&
+         absl::c_count(hlo_instr.users().front()->operands(), &hlo_instr) == 1;
+}
+
 bool CanBeOutputFused(const HloInstruction* producer,
                       const HloInstruction* consumer) {
   return consumer->opcode() == HloOpcode::kAdd && IsMatrixVectorDot(producer) &&
-         producer->user_count() == 1;
+         HasExactlyOneUse(*producer) == 1;
 }
 
 bool CanBeOutputFusedIntoSomeOperand(const HloInstruction* consumer) {
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
index c95a514ca04bee1fb4c03ee21510eb8da3122081..527df0bd1c23bba74f32226e5622fed32f7dcf84 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_instruction_fusion_test.cc
@@ -321,7 +321,7 @@ TEST_F(OpcodeFusionTest, Exponential_Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -370,7 +370,7 @@ TEST_F(OpcodeFusionTest, Broadcast_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, broadcast1));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -410,7 +410,7 @@ TEST_F(OpcodeFusionTest, Exponential_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, exp1));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -429,7 +429,7 @@ TEST_F(OpcodeFusionTest, Reshape_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(result_shape, HloOpcode::kNegate, reshape1));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -447,7 +447,7 @@ TEST_F(OpcodeFusionTest, Reverse_Negate) {
   builder.AddInstruction(
       HloInstruction::CreateUnary(param_shape, HloOpcode::kNegate, reverse1));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -489,7 +489,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       result_shape, HloOpcode::kNegate, transpose2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   RunFusionAndCheckOpcodesWereFused(
@@ -498,7 +498,7 @@ TEST_F(OpcodeFusionTest, Exponential_Transpose_Negate) {
 }
 
 TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -517,7 +517,7 @@ TEST_F(OpcodeFusionTest, UnaryMapOfExp) {
 }
 
 TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {3, 4});
@@ -542,7 +542,7 @@ TEST_F(OpcodeFusionTest, BinaryMapOfExps) {
 }
 
 TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   HloComputation::Builder builder(TestName());
   Shape full_shape = ShapeUtil::MakeShape(F32, {10, 100, 1000});
@@ -573,7 +573,7 @@ TEST_F(OpcodeFusionTest, DynamicSliceWithDynamicUpdateSlice) {
 }
 
 TEST_F(OpcodeFusionTest, MessOfFusibleNodes) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   Shape full_shape = ShapeUtil::MakeShape(F32, {4, 100, 10, 100, 50});
@@ -712,7 +712,7 @@ void CreateComputationForDotAddOutputFusionTest(const string& test_name,
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/1,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -725,7 +725,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_1x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/false);
@@ -738,7 +738,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/19,
                                              /*add_extra_use_for_dot=*/false);
@@ -751,7 +751,7 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x19) {
 }
 
 TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   CreateComputationForDotAddOutputFusionTest(TestName(), module.get(), /*m=*/19,
                                              /*k=*/50, /*n=*/1,
                                              /*add_extra_use_for_dot=*/true);
@@ -763,6 +763,28 @@ TEST_F(OpcodeFusionTest, DotAddOutputFusion_19x50x1_multi_use) {
               Not(op::Fusion()));
 }
 
+TEST_F(InstructionFusionTest,
+       DotOperationFusion_DontOutputFuseDuplicateOperands) {
+  absl::string_view module_string = R"(
+HloModule module
+
+ENTRY main {
+  a = f32[50,60]{1,0} parameter(0)
+  b = f32[60,1]{1,0} parameter(1)
+  c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT d = f32[50,1]{1,0} add(c, c)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_string));
+  TF_ASSERT_OK_AND_ASSIGN(bool fused_something,
+                          CpuInstructionFusion().Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 struct GatherLoopFusionTestSpec {
   string test_name;
   string hlo_computation_text;
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
index 2cd52e4a18a4524365393db5f658a982d83a7632..6c61b64758ede160e2d50e4429590a789ec253c3 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_layout_assignment_test.cc
@@ -73,7 +73,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensor) {
   auto result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -114,7 +114,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor0) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       result_shape, HloOpcode::kAdd, dot_a_result, dot_b_result));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -158,7 +158,7 @@ TEST_F(CpuLayoutAssignmentTest, MultipleDotsWithSameConstantRhsTensor1) {
   auto tuple_result = builder.AddInstruction(
       HloInstruction::CreateTuple({dot_a_result, dot_b_result}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -192,7 +192,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantLhsTensor) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -232,7 +232,7 @@ TEST_F(CpuLayoutAssignmentTest, DotWithConstantRhsTensorThroughGTE) {
   auto dot_result = builder.AddInstruction(
       CreateCanonicalDot(result_shape, dot_lhs, dot_rhs));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   ComputationLayout computation_layout(computation->ComputeProgramShape());
@@ -353,7 +353,7 @@ static void AssertCorrectLayoutForDotOutputFusion(
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -365,7 +365,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/1, /*k=*/50, /*n=*/19,
@@ -377,7 +377,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_1x50x19_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -389,7 +389,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/1,
@@ -401,7 +401,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x1_dot_idx_1) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
@@ -413,7 +413,7 @@ TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_0) {
 }
 
 TEST_F(CpuLayoutAssignmentTest, DotOutputFusion_19x50x19_dot_idx_1) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+  std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
   TF_ASSERT_OK_AND_ASSIGN(
       DotOutputFusionLayoutAssignmentResult layout_assignment_result,
       RunDotOutputFusion(module.get(), TestName(), /*m=*/19, /*k=*/50, /*n=*/19,
diff --git a/tensorflow/compiler/xla/service/cpu/cpu_options.cc b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
index b8ace5702688096822573c7afae234cbcbe77b28..92debb83e33b1400a59e5eef0f90971392ab7b22 100644
--- a/tensorflow/compiler/xla/service/cpu/cpu_options.cc
+++ b/tensorflow/compiler/xla/service/cpu/cpu_options.cc
@@ -22,7 +22,6 @@ limitations under the License.
 namespace {
 
 const char* const kXlaOptimizeForSizeCpuOption = "xla_cpu_optimize_for_size";
-const char* const kXlaDisableVectorizedReduce = "xla_disable_vectorized_reduce";
 const char* const kLlvmIrDotTilingFactor = "xla_llvm_dot_tiling_factor";
 const char* const kXlaEnableExperimentalLlvmIrGemm =
     "xla_enable_experimental_llvm_ir_gemm";
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
index 620c45fa391e69ef88269d44709404e6f71b30cb..4032c2da2f33ee61da8771ae6225a14172cbe6e8 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.cc
@@ -111,7 +111,7 @@ IrEmitter::IrEmitter(
 StatusOr<llvm::Function*> IrEmitter::EmitComputation(
     HloComputation* computation, const string& function_name_prefix,
     bool is_top_level_computation,
-    const std::vector<const HloInstruction*>* instruction_order) {
+    const std::vector<HloInstruction*>* instruction_order) {
   string function_name = name_uniquer_.GetUniqueName(function_name_prefix);
   VLOG(2) << "Emitting IR for CPU function [" << function_name_prefix
           << "]; ordered? " << (instruction_order != nullptr);
@@ -140,7 +140,7 @@ StatusOr<llvm::Function*> IrEmitter::EmitComputation(
   // readcyclecounter if it is unavailable.
   bool use_rdtscp = arch_type_ == llvm::Triple::ArchType::x86 ||
                     arch_type_ == llvm::Triple::ArchType::x86_64;
-  profiling_state_ = ProfilingState(use_rdtscp, GetProfileCountersArgument());
+  profiling_state_ = ProfilingState(use_rdtscp);
   if (instruction_order == nullptr) {
     TF_RETURN_IF_ERROR(computation->Accept(this));
   } else {
@@ -1379,33 +1379,6 @@ Status IrEmitter::HandleCrossReplicaSum(HloInstruction* crs) {
   return Status::OK();
 }
 
-// Fills up the free variables in 'index_with_free_var' with values from
-// 'filler_index'. The size of free variables must be the same as the
-// size of 'filler_index'.
-//
-// This is often used after dimension reduction, where
-// 'index_with_free_var' has one or more dimensions reduced, which serves as
-// free variables (represented as nullptr). For example, if we have a 4
-// dimensional input and index for the dimension being reduced is
-// 2 (third dimension), we will have an index like [i, j, NULL, k]
-// after reduced dimension.
-//
-// Here we fill up that free variable by 'filler_index', which contains
-// the value in the reduced dimension.
-static llvm_ir::IrArray::Index FillReducedDimensionIndex(
-    llvm_ir::IrArray::Index index_with_free_var,
-    llvm_ir::IrArray::Index filler_index) {
-  llvm_ir::IrArray::Index::const_iterator it = filler_index.begin();
-
-  for (size_t i = 0; i < index_with_free_var.size(); ++i) {
-    if (index_with_free_var[i] == nullptr) {
-      index_with_free_var[i] = *it++;
-    }
-  }
-  CHECK(filler_index.end() == it);
-  return index_with_free_var;
-}
-
 Status IrEmitter::HandleParameter(HloInstruction* parameter) {
   VLOG(2) << "HandleParameter: " << parameter->ToString();
   return EmitTargetAddressForOp(parameter);
@@ -2194,14 +2167,6 @@ Status IrEmitter::HandlePad(HloInstruction* pad) {
   return Status::OK();
 }
 
-// If `hlo` is a Transpose, returns its operand; otherwise returns `hlo` itself.
-static const HloInstruction* StripTranspose(const HloInstruction& hlo) {
-  if (hlo.IsRank2Transpose()) {
-    return hlo.operand(0);
-  }
-  return &hlo;
-}
-
 Status IrEmitter::HandleFusion(HloInstruction* fusion) {
   auto* root = fusion->fused_expression_root();
   if (llvm_ir::CanEmitFusedDynamicUpdateSliceInPlace(fusion, assignment_)) {
@@ -2600,10 +2565,17 @@ Status IrEmitter::HandleConditional(HloInstruction* conditional) {
   return Status::OK();
 }
 
-Status IrEmitter::HandleAfterAll(HloInstruction* gen_token) {
-  TF_RET_CHECK(ByteSizeOf(gen_token->shape()) == 0);
+Status IrEmitter::HandleAfterAll(HloInstruction* after_all) {
+  TF_RET_CHECK(ByteSizeOf(after_all->shape()) == 0);
   // No code to generate, but we need to emit an address for book-keeping.
-  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(gen_token));
+  TF_RETURN_IF_ERROR(EmitTargetAddressForOp(after_all));
+  return Status::OK();
+}
+
+Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) {
+  // AddDedendency just forwards its zero-th operand.
+  emitted_value_[add_dependency] =
+      GetEmittedValueFor(add_dependency->operand(0));
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/cpu/ir_emitter.h b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
index 136b88ff75ea8a5f48b42d3476219f18f5ecb39a..559a8162a2d53f28ea6817653503c216af90a610 100644
--- a/tensorflow/compiler/xla/service/cpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/cpu/ir_emitter.h
@@ -101,7 +101,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   StatusOr<llvm::Function*> EmitComputation(
       HloComputation* computation, const string& function_name_prefix,
       bool is_top_level_computation,
-      const std::vector<const HloInstruction*>* instruction_order);
+      const std::vector<HloInstruction*>* instruction_order);
 
   llvm::IRBuilder<>* b() { return &b_; }
 
@@ -159,7 +159,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleConcatenate(HloInstruction* concatenate) override;
   Status HandleConditional(HloInstruction* conditional) override;
   Status HandleScatter(HloInstruction* scatter) override;
-  Status HandleAfterAll(HloInstruction* gen_token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
   Status HandleRng(HloInstruction* rng) override;
   Status FinishVisit(HloInstruction* root) override;
 
@@ -467,9 +468,8 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   // profiling a computation.
   class ProfilingState {
    public:
-    ProfilingState() : use_rdtscp_(false), prof_counters_(nullptr) {}
-    ProfilingState(bool use_rdtscp, llvm::Value* prof_counters)
-        : use_rdtscp_(use_rdtscp), prof_counters_(prof_counters) {}
+    ProfilingState() : use_rdtscp_(false) {}
+    explicit ProfilingState(bool use_rdtscp) : use_rdtscp_(use_rdtscp) {}
 
     // Record the cycle counter before an HLO executes.
     void RecordCycleStart(llvm::IRBuilder<>* b, HloInstruction* hlo);
@@ -494,9 +494,6 @@ class IrEmitter : public DfsHloVisitorWithDefault,
     // intrinsic?
     bool use_rdtscp_;
 
-    // The argument which corresponds to the profile counter buffer.
-    llvm::Value* prof_counters_;
-
     // The first read cycle counter in the program.
     llvm::Value* first_read_cycle_start_ = nullptr;
 
diff --git a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
index 669eeb95f3299623a7556bfbb8045fd77f5d0745..722aa3120ef4d8c957873ac58c361f19632dde1f 100644
--- a/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
+++ b/tensorflow/compiler/xla/service/cpu/runtime_key_value_sort.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include <algorithm>
 #include <cmath>
 #include <cstring>
+#include <limits>
 #include <memory>
 #include <string>
 #include <utility>
@@ -41,61 +42,60 @@ void KeyValueSort(std::pair<KeyType, int64>* row_to_sort, int64 num_elements) {
   std::sort(row_to_sort, row_to_sort + num_elements);
 }
 
-// For floating point numbers, we want a total order comparator. -NaN and NaN
-// should appear at the beginning and end of the ordering, and -0.0 should
-// appear before 0.0. Also we want to have a stable sort, so if the keys are the
-// same, we compare the index values.
-template <typename KeyType>
-bool LessThan(KeyType lhs, int64 lhs_index, KeyType rhs, int64 rhs_index) {
-  bool lhs_is_negative = std::signbit(lhs);
-  bool rhs_is_negative = std::signbit(rhs);
-  // If the signs are different, we can just compare the signs.
-  if (lhs_is_negative != rhs_is_negative) {
-    return lhs_is_negative && !rhs_is_negative;
-  }
-  bool lhs_nan = std::isnan(lhs);
-  bool rhs_nan = std::isnan(rhs);
-  // Exactly one number is nan?
-  if (lhs_nan != rhs_nan) {
-    if (lhs_nan) {
-      return lhs_is_negative;
-    }
-    return !rhs_is_negative;
+// We would like a total order of floating point numbers so that the
+// sort has a predictable behavior in the presence of NaNs. Rather
+// than using floating point comparison, we use the following trick:
+// If f is a float, and
+// x = bit_cast<int32>(f);
+// y = x < 0 ? 0x7FFFFFFF - x : x;
+// then y is ordered as an int32 such that finite values have the
+// obvious order, -0 is ordered before 0, and -NaN and NaN appear at
+// the beginning and end of the ordering.
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+CastType Convert(KeyType value) {
+  CastType casted_value;
+  memcpy(&casted_value, &value, sizeof(CastType));
+  if (casted_value < 0) {
+    return static_cast<UnsignedCastType>(std::numeric_limits<CastType>::max()) -
+           casted_value;
   }
-  if (lhs != rhs) {
-    return lhs < rhs;
-  }
-  return lhs_index < rhs_index;
+  return casted_value;
+}
+
+template <typename CastType, typename UnsignedCastType, typename KeyType>
+bool LessThan(KeyType lhs, KeyType rhs) {
+  return Convert<CastType, UnsignedCastType>(lhs) <
+         Convert<CastType, UnsignedCastType>(rhs);
 }
 
 template <>
 void KeyValueSort(std::pair<double, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<double, int64>& lhs,
-               const std::pair<double, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<double, int64>& lhs,
+                      const std::pair<double, int64>& rhs) -> bool {
+                     return LessThan<int64, uint64>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<float, int64>* row_to_sort, int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<float, int64>& lhs,
-               const std::pair<float, int64>& rhs) -> bool {
-              return LessThan(lhs.first, lhs.second, rhs.first, rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<float, int64>& lhs,
+                      const std::pair<float, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(lhs.first, rhs.first);
+                   });
 }
 
 template <>
 void KeyValueSort(std::pair<Eigen::half, int64>* row_to_sort,
                   int64 num_elements) {
-  std::sort(row_to_sort, row_to_sort + num_elements,
-            [](const std::pair<Eigen::half, int64>& lhs,
-               const std::pair<Eigen::half, int64>& rhs) -> bool {
-              return LessThan(
-                  Eigen::half_impl::half_to_float(lhs.first), lhs.second,
-                  Eigen::half_impl::half_to_float(rhs.first), rhs.second);
-            });
+  std::stable_sort(row_to_sort, row_to_sort + num_elements,
+                   [](const std::pair<Eigen::half, int64>& lhs,
+                      const std::pair<Eigen::half, int64>& rhs) -> bool {
+                     return LessThan<int32, uint32>(
+                         Eigen::half_impl::half_to_float(lhs.first),
+                         Eigen::half_impl::half_to_float(rhs.first));
+                   });
 }
 
 template <typename KeyType>
diff --git a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
index f77641eb7da71117092730c1fd5090c61c939813..efccadedf27181a4cddf4f1dc3610f7c6db1d821 100644
--- a/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
+++ b/tensorflow/compiler/xla/service/cpu/simple_orc_jit.cc
@@ -128,8 +128,18 @@ SimpleOrcJIT::SimpleOrcJIT(const llvm::TargetOptions& target_options,
 }
 
 llvm::JITSymbol SimpleOrcJIT::ResolveRuntimeSymbol(const std::string& name) {
-  void* func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+  void* func_addr = nullptr;
+  if (name.size() > 1 && name.front() == data_layout_.getGlobalPrefix()) {
+    // On Mac OS X, 'name' may have a leading underscore prefix, even though the
+    // registered name may not.
+    std::string stripped_name(name.begin() + 1, name.end());
+    func_addr = CustomCallTargetRegistry::Global()->Lookup(stripped_name);
+  } else {
+    func_addr = CustomCallTargetRegistry::Global()->Lookup(name);
+  }
+
   if (func_addr == nullptr) {
+    VLOG(2) << "Unable to resolve runtime symbol: " << name;
     return nullptr;
   }
   llvm::JITEvaluatedSymbol symbol_info(reinterpret_cast<uint64_t>(func_addr),
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
index 691b3c7bee26e84edbef18a4ac10a9cafd29c61a..f8f5f392da8ab3348e63185aecf7b639daacaa42 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_eigen_dot_operation_test.cc
@@ -50,7 +50,7 @@ class CpuEigenDotOperationTest
         /*entry_point_name=*/"entry",
         /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-    auto hlo_module = CreateNewUnverifiedModule();
+    auto hlo_module = CreateNewVerifiedModule();
     hlo_module->AddEntryComputation(std::move(entry_computation));
 
     CompileAheadOfTimeAndVerifyIr(std::move(hlo_module), options,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
index d201a151d7a9edb86a0de15819ea99f95a9c4d28..e30f95311fce229f9c559d3bb40142151e8bf3e3 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_external_constants_test.cc
@@ -46,7 +46,7 @@ class CpuExternalConstantsTest : public CpuCodegenTest {
     builder.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, constant));
 
-    std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
+    std::unique_ptr<HloModule> module = CreateNewVerifiedModule();
     module->AddEntryComputation(builder.Build());
 
     CompileAndVerifyIr(std::move(module), filecheck_pattern,
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
index 773336c7a92f808f0c6370c7353e780b1471470f..9b10c49f4f547edfb2164f98c49cceb031148bdc 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_intrinsic_test.cc
@@ -91,7 +91,7 @@ TEST_P(CpuUnaryIntrinsicTest, DoIt) {
       /*entry_point_name=*/"entry",
       /*relocation_model=*/CpuAotCompilationOptions::RelocationModel::Static};
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   string check_lines{spec.check_lines.data(), spec.check_lines.size()};
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
index 3b87683ffffefd2aa24dd234cc072425bef00a24..fa0e09ff6b5694c0e97963b83c6e541b858a1376 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_literal_caching_test.cc
@@ -63,7 +63,7 @@ CHECK-NOT: private constant [48 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
@@ -104,14 +104,14 @@ ENTRY main {
 )";
 
   string filecheck_pattern = R"(
-CHECK: private constant [4 x i8]
-CHECK: private constant [8 x i8]
+CHECK-DAG: private constant [4 x i8]
+CHECK-DAG: private constant [8 x i8]
 CHECK-NOT: private constant [4 x i8]
 CHECK-NOT: private constant [8 x i8]
 )";
 
   TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
-                          ParseHloString(hlo_text));
+                          ParseAndReturnVerifiedModule(hlo_text));
 
   CpuAotCompilationOptions options{
       /*triple=*/"x86_64-pc-linux", /*cpu_name=*/"", /*features=*/"",
diff --git a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
index f5419b7063bea6d1f5d24fde0a22e829413b8d93..a7702c2aeeaff8a46a2c4f2785ccb873ea2c08e5 100644
--- a/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/cpu/tests/cpu_noalias_test.cc
@@ -56,7 +56,7 @@ TEST_F(CpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // Now that we have an HLO module, build an llvm_ir::AliasAnalysis for it.
diff --git a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
index 990ff94ba2338cb663b655ca3106bda83ab718a3..70008947f371d25e95d02839c30ba822fce7a292 100644
--- a/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
+++ b/tensorflow/compiler/xla/service/cpu/xfeed_manager.h
@@ -23,6 +23,7 @@ limitations under the License.
 #include <deque>
 
 #include "absl/types/span.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
index d6371283221b63b30f968929fe2807eae3f22df0..e84bf00153aa28df29d8df486b92654feab4afbf 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor.h
@@ -251,6 +251,7 @@ class DfsHloVisitorBase {
 
   virtual Status HandleBatchNormGrad(HloInstructionPtr hlo) = 0;
 
+  virtual Status HandleAddDependency(HloInstructionPtr add_dependency) = 0;
   virtual Status HandleAfterAll(HloInstructionPtr token) = 0;
 
   // Invoked to inform the visitor that the traversal has completed, and that
diff --git a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
index e57184f639f4f2c618b980a5082381f4b9c28b19..80ea5be298aea44a0f424398da74c4e478f10346 100644
--- a/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
+++ b/tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h
@@ -206,6 +206,9 @@ class DfsHloVisitorWithDefaultBase
   Status HandleGetDimensionSize(HloInstructionPtr get_size) override {
     return DefaultAction(get_size);
   }
+  Status HandleAddDependency(HloInstructionPtr add_dependency) override {
+    return DefaultAction(add_dependency);
+  }
 
   // Invoked to inform the visitor that the traversal has completed, and that
   // the root was "root".
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6d0472689bf48092ceef2e9792c1358687d707ec
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.cc
@@ -0,0 +1,459 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+#include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+namespace {
+bool IsTrivialWindowDimension(const WindowDimension& window_dimension) {
+  return window_dimension.size() == 1 && window_dimension.stride() == 1 &&
+         window_dimension.padding_low() == 0 &&
+         window_dimension.padding_high() == 0 &&
+         window_dimension.window_dilation() == 1 &&
+         window_dimension.base_dilation() == 1;
+}
+}  // namespace
+
+class DynamicDimensionInferenceVisitor : public DfsHloVisitorWithDefault {
+ public:
+  explicit DynamicDimensionInferenceVisitor(
+      const DynamicParameterBinding& param_bindings,
+      DynamicDimensionInference* parent)
+      : param_bindings_(param_bindings), parent_(parent) {}
+
+  Status DefaultAction(HloInstruction* hlo) override;
+
+  static Status Run(HloComputation* computation,
+                    const DynamicParameterBinding& param_bindings,
+                    DynamicDimensionInference* parent) {
+    DynamicDimensionInferenceVisitor visitor(param_bindings, parent);
+    return computation->Accept(&visitor);
+  }
+
+  Status HandleParameter(HloInstruction* hlo) override;
+
+  Status HandleReduce(HloInstruction* hlo) override;
+
+  Status HandleDot(HloInstruction* hlo) override;
+
+  Status HandleTranspose(HloInstruction* hlo) override;
+
+  Status HandleReshape(HloInstruction* hlo) override;
+
+  Status HandlePad(HloInstruction* hlo) override;
+
+  Status HandleBroadcast(HloInstruction* hlo) override;
+
+  Status HandleGetDimensionSize(HloInstruction* hlo) override;
+
+  Status HandleSelect(HloInstruction* hlo) override;
+
+  Status HandleConvolution(HloInstruction* hlo) override;
+
+  Status HandleReduceWindow(HloInstruction* hlo) override;
+
+  Status HandleSelectAndScatter(HloInstruction* hlo) override;
+
+  Status HandleGetTupleElement(HloInstruction* hlo) override;
+
+  Status HandleElementwiseUnary(HloInstruction* hlo) override;
+
+  Status HandleElementwiseBinary(HloInstruction* hlo) override;
+
+ private:
+  using OperandDynamicDimensionFn = std::function<Status(
+      HloInstruction* operand, ShapeIndex index, int64 dimension,
+      int64 operand_index, HloInstruction* dynamic_size)>;
+
+  Status ForEachOperandDynamicDimension(HloInstruction* inst,
+                                        const OperandDynamicDimensionFn&);
+
+  // Pass through a dynamic dimension from the input to the output with the same
+  // value and index in the shape. This is a helper function to handle trivial
+  // instructions like elementwise operations.
+  Status PassThroughDynamicDimension(HloInstruction*);
+
+  // The dynamic parameter bindings of this computation.
+  const DynamicParameterBinding& param_bindings_;
+
+  // A pointer to DynamicDimensionInference, used to update the dynamic mapping.
+  DynamicDimensionInference* parent_;
+};
+
+Status DynamicDimensionInferenceVisitor::DefaultAction(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        return UnimplementedStrCat(
+            "Asked to propagate a dynamic dimension from hlo ",
+            operand->ToString(), "@", index.ToString(), "@", dimension,
+            " to hlo ", hlo->ToString(), ", which is not implemented.");
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetTupleElement(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (hlo->tuple_index() == index[0]) {
+          ShapeIndex new_index =
+              ShapeIndexView(index).ConsumeFront().ToShapeIndex();
+          parent_->SetDynamicSize(hlo, new_index, dimension, dynamic_size);
+        }
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleBroadcast(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        int64 broadcast_dim = hlo->dimensions(dimension);
+        parent_->SetDynamicSize(hlo, index, broadcast_dim, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandlePad(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        if (operand_index != 0) {
+          return Unimplemented(
+              "Dynamic dimension on padding value is not supported");
+        }
+        const PaddingConfig_PaddingConfigDimension& padding_config =
+            hlo->padding_config().dimensions(dimension);
+        if (padding_config.interior_padding() == 0 &&
+            padding_config.edge_padding_low() == 0 &&
+            padding_config.edge_padding_high() == 0) {
+          parent_->SetDynamicSize(hlo, {}, dimension, dynamic_size);
+          return Status::OK();
+        } else {
+          return Unimplemented(
+              "Dynamic dimension propagation on padding dimension is not "
+              "supported.");
+        }
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduce(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce = hlo;
+        int64 operand_count = reduce->operand_count();
+        CHECK_EQ(operand_count % 2, 0);
+        if (operand_index >= operand_count / 2) {
+          // Init values doesn't have dynamic size.
+          return Status::OK();
+        }
+        if ((absl::c_count(reduce->dimensions(), dimension) != 0)) {
+          // Dimension is to be reduce, stop tracing.
+          return Status::OK();
+        }
+
+        // Find out the new dynamic dimension after reduce.
+        int64 dimensions_not_reduced_count = 0;
+        for (int i = 0; i < ShapeUtil::Rank(operand->shape()); ++i) {
+          if (dimension == i) {
+            parent_->SetDynamicSize(reduce, {}, dimensions_not_reduced_count,
+                                    dynamic_size);
+
+            return Status::OK();
+          }
+          if (absl::c_count(reduce->dimensions(), i) == 0) {
+            dimensions_not_reduced_count++;
+          }
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleDot(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* dot = hlo;
+        const DotDimensionNumbers& dimension_numbers =
+            dot->dot_dimension_numbers();
+        // A map from the operand dimensions to result dimension.
+        absl::flat_hash_map<int64, int64> result_dim_mapping;
+        int64 current_result_dims = 0;
+        std::unordered_set<int64> batch_dims(
+            dimension_numbers.rhs_batch_dimensions().begin(),
+            dimension_numbers.rhs_batch_dimensions().end());
+
+        for (int64 i : dimension_numbers.rhs_batch_dimensions()) {
+          result_dim_mapping[i] = current_result_dims++;
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(0)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.lhs_contracting_dimensions(), i)) {
+            if (operand_index == 0) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        for (int64 i = 0; i < ShapeUtil::Rank(dot->operand(1)->shape()); i++) {
+          if (!absl::c_linear_search(
+                  dimension_numbers.rhs_contracting_dimensions(), i) &&
+              !absl::c_linear_search(dimension_numbers.rhs_batch_dimensions(),
+                                     i)) {
+            if (operand_index == 1) {
+              result_dim_mapping[i] = current_result_dims;
+            }
+            current_result_dims++;
+          }
+        }
+
+        // Check if the operand dim is in the result shape. If so, add another
+        // work item to trace that dimension.
+        auto iter = result_dim_mapping.find(dimension);
+        if (iter != result_dim_mapping.end()) {
+          parent_->SetDynamicSize(dot, {}, iter->second, dynamic_size);
+        }
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleTranspose(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, {}, hlo->dimensions()[dimension],
+                                dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleConvolution(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* conv = hlo;
+        const ConvolutionDimensionNumbers& dimension_numbers =
+            conv->convolution_dimension_numbers();
+
+        if (operand_index == 0) {
+          if (dimension == dimension_numbers.input_batch_dimension()) {
+            parent_->SetDynamicSize(conv, {},
+                                    dimension_numbers.output_batch_dimension(),
+                                    dynamic_size);
+            return Status::OK();
+          }
+
+          if (dimension == dimension_numbers.input_feature_dimension()) {
+            return Status::OK();
+          }
+        } else {
+          if (dimension == dimension_numbers.kernel_input_feature_dimension()) {
+            return Status::OK();
+          }
+        }
+
+        return Unimplemented("Dynamic Spatial Convolution is not supported: %s",
+                             conv->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleGetDimensionSize(
+    HloInstruction*) {
+  // Dynamic dimension doesn't propagate through GetDimensionSize:
+  //
+  //   Input: F32[x, y, z]
+  //     |
+  //   GetDimensionSize(1): U32[]
+  //
+  // The returned value is a scalar, which doesn't have any dynamic dimension in
+  // the shape (although the value contains the real size of the dynamic
+  // dimension of the input).
+  return Status::OK();
+}
+
+Status DynamicDimensionInferenceVisitor::PassThroughDynamicDimension(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        parent_->SetDynamicSize(hlo, index, dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseUnary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelect(HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleElementwiseBinary(
+    HloInstruction* hlo) {
+  return PassThroughDynamicDimension(hlo);
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReshape(HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reshape = hlo;
+        std::vector<std::pair<int64, int64>> unmodified_dims =
+            ShapeUtil::DimensionsUnmodifiedByReshape(operand->shape(),
+                                                     reshape->shape());
+        for (auto& unmodified : unmodified_dims) {
+          if (unmodified.first == dimension) {
+            parent_->SetDynamicSize(reshape, {}, unmodified.second,
+                                    dynamic_size);
+            return Status::OK();
+          }
+        }
+        return Unimplemented(
+            "Dynamic Reshape on modified dimensions is yet not supported: %s",
+            reshape->ToString());
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleReduceWindow(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* reduce_window = hlo;
+        const WindowDimension& window_dimension =
+            reduce_window->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial reduce window is not supported: %s",
+              reduce_window->ToString());
+        }
+
+        parent_->SetDynamicSize(reduce_window, {}, dimension, dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleSelectAndScatter(
+    HloInstruction* hlo) {
+  return ForEachOperandDynamicDimension(
+      hlo, [&](HloInstruction* operand, ShapeIndex index, int64 dimension,
+               int64 operand_index, HloInstruction* dynamic_size) {
+        HloInstruction* select_and_scatter = hlo;
+        const WindowDimension& window_dimension =
+            select_and_scatter->window().dimensions(dimension);
+
+        if (!IsTrivialWindowDimension(window_dimension)) {
+          return Unimplemented(
+              "Dynamic Spatial select and scatter is not supported: %s",
+              select_and_scatter->ToString());
+        }
+
+        parent_->SetDynamicSize(select_and_scatter, {}, dimension,
+                                dynamic_size);
+
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::HandleParameter(HloInstruction* hlo) {
+  return param_bindings_.ForEachBinding(
+      [&](const DynamicParameterBinding::DynamicParameter& dynamic_parameter,
+          const DynamicParameterBinding::DynamicDimension& dynamic_dimension) {
+        if (dynamic_dimension.parameter_num != hlo->parameter_number()) {
+          return Status::OK();
+        }
+        HloComputation* computation = hlo->parent();
+        HloInstruction* target_parameter =
+            computation->parameter_instruction(dynamic_dimension.parameter_num);
+
+        HloInstruction* dynamic_size =
+            computation->parameter_instruction(dynamic_parameter.parameter_num);
+        for (int64 i : dynamic_parameter.parameter_index) {
+          dynamic_size =
+              computation->AddInstruction(HloInstruction::CreateGetTupleElement(
+                  ShapeUtil::GetSubshape(dynamic_size->shape(), {i}),
+                  dynamic_size, i));
+        }
+
+        parent_->SetDynamicSize(target_parameter,
+                                dynamic_dimension.parameter_index,
+                                dynamic_dimension.dimension, dynamic_size);
+        return Status::OK();
+      });
+}
+
+Status DynamicDimensionInferenceVisitor::ForEachOperandDynamicDimension(
+    HloInstruction* inst, const OperandDynamicDimensionFn& fn) {
+  for (int64 operand_index = 0; operand_index < inst->operand_count();
+       ++operand_index) {
+    auto iter =
+        parent_->per_hlo_dynamic_dimensions_.find(inst->operand(operand_index));
+    if (iter != parent_->per_hlo_dynamic_dimensions_.end()) {
+      for (auto& dynamic_dimension : iter->second) {
+        HloInstruction* dynamic_size = parent_->GetDynamicSize(
+            dynamic_dimension.inst, dynamic_dimension.index,
+            dynamic_dimension.dim);
+        TF_RETURN_IF_ERROR(fn(dynamic_dimension.inst, dynamic_dimension.index,
+                              dynamic_dimension.dim, operand_index,
+                              dynamic_size));
+      }
+    }
+  }
+  return Status::OK();
+}
+
+/* static */
+StatusOr<DynamicDimensionInference> DynamicDimensionInference::Run(
+    HloModule* module) {
+  VLOG(0) << "Param Config " << module->dynamic_parameter_binding().ToString();
+  DynamicDimensionInference inference(module);
+  TF_RETURN_IF_ERROR(inference.AnalyzeDynamicDimensions());
+  return inference;
+}
+
+DynamicDimensionInference::DynamicDimensionInference(HloModule* module)
+    : module_(module) {}
+
+Status DynamicDimensionInference::AnalyzeDynamicDimensions() {
+  return DynamicDimensionInferenceVisitor::Run(
+      module_->entry_computation(), module_->dynamic_parameter_binding(), this);
+}
+
+HloInstruction* DynamicDimensionInference::GetDynamicSize(
+    HloInstruction* inst, const ShapeIndex& index, int64 dim) const {
+  auto iter = dynamic_mapping_.find(DynamicDimension{inst, index, dim});
+  if (iter != dynamic_mapping_.end()) {
+    return iter->second;
+  }
+  return nullptr;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference.h b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..164d15bf111a92e3da957f609b54ee0662ef18b1
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference.h
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/span.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/status.h"
+#include "tensorflow/compiler/xla/statusor.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/macros.h"
+
+namespace xla {
+
+// DynamicDimensionInference analyzes each HLO instruction in a graph and
+// inferences which dimensions are dynamic and which scalar instructions
+// represent the runtime real size of those dynamic dimensions.
+class DynamicDimensionInference {
+ public:
+  static StatusOr<DynamicDimensionInference> Run(HloModule* module);
+
+  string ToString() const;
+
+  // If the dimension `dim` of instruction `inst` at `index` has a dynamic size,
+  // returns a scalar HloInstruction that represents the runtime size of that
+  // dimension. Otherwise returns nullptr.
+  HloInstruction* GetDynamicSize(HloInstruction* inst, const ShapeIndex& index,
+                                 int64 dim) const;
+
+  friend class DynamicDimensionInferenceVisitor;
+
+ private:
+  explicit DynamicDimensionInference(HloModule* module);
+
+  // DynamicDimension is used as a key in the dynamic key-value mapping. It
+  // unambiguously represents a dynamic dimension of a instruction at a given
+  // index.
+  struct DynamicDimension {
+    // HloInstruction that holds the dimension.
+    HloInstruction* inst;
+    // Subshape of the instruction that holds the dimension.
+    ShapeIndex index;
+    // The dimension number of the dynamic dimension at given index of a given
+    // instruction.
+    int64 dim;
+
+    // Artifacts needed to make this struct able to be used as a `key` in absl
+    // maps. "friend" keywords are added so these functions can be found through
+    // ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.inst, m.index, m.dim);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.inst == rhs.inst && lhs.index == rhs.index &&
+             lhs.dim == rhs.dim;
+    }
+  };
+
+  // Update the dynamic mapping so that we know dimension `dim` of instruction
+  // `inst` at `index` has a dynamic size, and its runtime size is represented
+  // by a scalar instruction `size`.
+  void SetDynamicSize(HloInstruction* inst, const ShapeIndex& index, int64 dim,
+                      HloInstruction* size) {
+    dynamic_mapping_.try_emplace(DynamicDimension{inst, index, dim}, size);
+    auto iter = per_hlo_dynamic_dimensions_.try_emplace(inst);
+    iter.first->second.emplace(DynamicDimension{inst, index, dim});
+  }
+
+  // AnalyzeDynamicDimensions starts the analysis of the dynamic dimensions in
+  // module_.
+  Status AnalyzeDynamicDimensions();
+
+  // HloModule being analyzed.
+  HloModule* module_;
+
+  // dynamic_mapping_ holds the result of the analysis. It maps a dynamic
+  // dimension to a scalar HloInstruction that represents the real dynamic size
+  // of the dynamic dimension.
+  using DynamicMapping = absl::flat_hash_map<DynamicDimension, HloInstruction*>;
+  DynamicMapping dynamic_mapping_;
+
+  using PerHloDynamicDimensions =
+      absl::flat_hash_map<HloInstruction*,
+                          absl::flat_hash_set<DynamicDimension>>;
+  PerHloDynamicDimensions per_hlo_dynamic_dimensions_;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_DIMENSION_INFERENCE_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea9ebed45d99797ce4f80376ec3d0b758da3ca17
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_dimension_inference_test.cc
@@ -0,0 +1,535 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_dimension_inference.h"
+
+#include "tensorflow/compiler/xla/client/xla_builder.h"
+#include "tensorflow/compiler/xla/literal.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_runner.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace op = xla::testing::opcode_matchers;
+
+namespace xla {
+namespace {
+
+class DynamicDimensionInferenceTest : public HloTestBase {
+ protected:
+  DynamicDimensionInferenceTest() : HloTestBase() {
+    module_ = CreateNewVerifiedModule();
+  }
+
+  Status RunInference() {
+    hlo_graph_dumper::MaybeDumpHloModule(*module_, "Before alias analysis");
+    TF_ASSIGN_OR_RETURN(DynamicDimensionInference inference,
+                        DynamicDimensionInference::Run(module_.get()));
+
+    inference_ = absl::make_unique<DynamicDimensionInference>(inference);
+    return Status::OK();
+  }
+
+  HloComputation* GetAdd() {
+    auto embedded_builder = HloComputation::Builder("add");
+    auto lhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        0, ShapeUtil::MakeShape(F32, {}), "lhs"));
+    auto rhs = embedded_builder.AddInstruction(HloInstruction::CreateParameter(
+        1, ShapeUtil::MakeShape(F32, {}), "rhs"));
+    embedded_builder.AddInstruction(
+        HloInstruction::CreateBinary(lhs->shape(), HloOpcode::kAdd, lhs, rhs));
+    return module_->AddEmbeddedComputation(embedded_builder.Build());
+  }
+
+  std::unique_ptr<HloModule> module_;
+  std::unique_ptr<DynamicDimensionInference> inference_;
+  const Shape scalar_shape_ = ShapeUtil::MakeShape(S32, {});
+};
+
+TEST_F(DynamicDimensionInferenceTest, ParamTest) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "param"));
+  auto param2 = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 1), param2);
+  EXPECT_EQ(inference_->GetDynamicSize(param, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(param2, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ParamTestTuple) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, GetTupleElement) {
+  // When data flows through GTE, the dynamic dimension size keeps the
+  // same, and the index has its front popped.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto param = builder.AddInstruction(HloInstruction::CreateParameter(
+      0, ShapeUtil::MakeTupleShape({input_shape, scalar_shape_}), "param"));
+
+  auto gte = builder.AddInstruction(
+      HloInstruction::CreateGetTupleElement(input_shape, param, 0));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{0, {1}},
+      DynamicParameterBinding::DynamicDimension{0, {0}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_THAT(inference_->GetDynamicSize(param, {0}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_THAT(inference_->GetDynamicSize(gte, {}, 1),
+              op::GetTupleElement(param, 1));
+
+  EXPECT_EQ(inference_->GetDynamicSize(param, {0}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ElementwiseTest) {
+  // When data flows through elementwise, the dynamic dimension size keeps the
+  // same.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto* negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  module_->AddEntryComputation(builder.Build());
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(negate, {}, 1), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestI) {
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(HloInstruction::CreateReduce(
+      reduce_shape, negate, init, {0, 2}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceTestII) {
+  // Same as ReduceTestI, but only reduce one dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 2});
+  auto reduce_shape = ShapeUtil::MakeShape(F32, {1, 2});
+
+  auto data_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(0, input_shape, "data_param"));
+  auto size_param = builder.AddInstruction(
+      HloInstruction::CreateParameter(1, scalar_shape_, "size_param"));
+
+  auto negate = builder.AddInstruction(
+      HloInstruction::CreateUnary(input_shape, HloOpcode::kNegate, data_param));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto reduce = builder.AddInstruction(
+      HloInstruction::CreateReduce(reduce_shape, negate, init, {1}, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reduce, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, DotTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto xz_shape = ShapeUtil::MakeShape(F32, {xdim, zdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  DotDimensionNumbers dot_dnums;
+  dot_dnums.add_lhs_contracting_dimensions(1);
+  dot_dnums.add_rhs_contracting_dimensions(0);
+  auto dot = builder.AddInstruction(
+      HloInstruction::CreateDot(xz_shape, a_param, b_param, dot_dnums,
+                                HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 0), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(dot, {}, 1), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ConvolutionTest) {
+  auto builder = HloComputation::Builder(TestName());
+  constexpr int xdim = 3;
+  constexpr int ydim = 2;
+  constexpr int zdim = 1;
+  auto xy_shape = ShapeUtil::MakeShape(F32, {xdim, ydim});
+  auto yz_shape = ShapeUtil::MakeShape(F32, {ydim, zdim});
+  auto zx_shape = ShapeUtil::MakeShape(F32, {zdim, xdim});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, xy_shape, "A"));
+  auto* b_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, yz_shape, "B"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+
+  auto dnums = XlaBuilder::CreateDefaultConvDimensionNumbers(0);
+
+  dnums.set_kernel_input_feature_dimension(0);
+  dnums.set_kernel_output_feature_dimension(1);
+  dnums.set_input_batch_dimension(0);
+  dnums.set_output_batch_dimension(1);
+  dnums.set_output_feature_dimension(0);
+
+  Window window;
+
+  auto* conv = builder.AddInstruction(HloInstruction::CreateConvolve(
+      zx_shape, a_param, b_param, /*feature_group_count=*/1, window, dnums,
+      HloTestBase::DefaultPrecisionConfig(2)));
+
+  module_->AddEntryComputation(builder.Build());
+
+  // Set up dynamic parameter binding for non-contracting dimension.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  // Set up binding for contracting dimensions.
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(conv, {}, 0), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, TransposeTest) {
+  // Test the ability to trace unmodified dimensions
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {1, 2, 3});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 1});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param_1 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+  auto* size_param_2 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/2, scalar_shape_, "size_param"));
+  auto* size_param_3 = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/3, scalar_shape_, "size_param"));
+
+  auto* transpose = builder.AddInstruction(
+      HloInstruction::CreateTranspose(output_shape, a_param, {2, 1, 0}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{2, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{3, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 0), size_param_3);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 1), size_param_2);
+  EXPECT_EQ(inference_->GetDynamicSize(transpose, {}, 2), size_param_1);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTest) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* reshape = builder.AddInstruction(
+      HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 2}));
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 3}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 2), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 3), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 4), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(reshape, {}, 5), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReshapeTestUnimplemented) {
+  // Test the ability to trace unmodified reshape dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 3, 4, 5, 6});
+  auto output_shape = ShapeUtil::MakeShape(F32, {6, 4, 1, 5, 2, 3});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+
+  builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  builder.AddInstruction(HloInstruction::CreateReshape(output_shape, a_param));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 1}));
+
+  Status status = RunInference();
+  EXPECT_EQ(status.code(), tensorflow::error::UNIMPLEMENTED);
+}
+
+TEST_F(DynamicDimensionInferenceTest, BroadcastTest) {
+  // Test the ability to trace broadcast dimension.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2});
+  auto output_shape = ShapeUtil::MakeShape(F32, {3, 2, 4});
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto* broadcast = builder.AddInstruction(
+      HloInstruction::CreateBroadcast(output_shape, a_param, {1}));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 0), nullptr);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 1), size_param);
+  EXPECT_EQ(inference_->GetDynamicSize(broadcast, {}, 2), nullptr);
+}
+
+TEST_F(DynamicDimensionInferenceTest, ReduceWindowBatchTest) {
+  // Test the ability to trace reduce window batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+TEST_F(DynamicDimensionInferenceTest, SelectAndScatterTest) {
+  // Test the ability to trace select and scatter batch dimensions.
+  auto builder = HloComputation::Builder(TestName());
+  auto input_shape = ShapeUtil::MakeShape(F32, {2, 4, 4});
+  auto output_shape = ShapeUtil::MakeShape(F32, {2, 2, 2});
+
+  Window window;
+  // First dimension is unchanged.
+  WindowDimension* batch_dim = window.add_dimensions();
+  batch_dim->set_size(1);
+  batch_dim->set_stride(1);
+  batch_dim->set_padding_low(0);
+  batch_dim->set_padding_high(0);
+  batch_dim->set_window_dilation(1);
+  batch_dim->set_base_dilation(1);
+
+  // Second and third dimension are reduced.
+  for (int64 i = 0; i < 2; ++i) {
+    WindowDimension* dim = window.add_dimensions();
+    dim->set_size(2);
+    dim->set_stride(2);
+    dim->set_padding_low(0);
+    dim->set_padding_high(0);
+    dim->set_window_dilation(1);
+    dim->set_base_dilation(1);
+  }
+
+  auto* a_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/0, input_shape, "A"));
+  auto* size_param = builder.AddInstruction(HloInstruction::CreateParameter(
+      /*parameter_number=*/1, scalar_shape_, "size_param"));
+
+  auto init = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(0.0)));
+
+  auto* reduce_window =
+      builder.AddInstruction(HloInstruction::CreateReduceWindow(
+          output_shape, a_param, init, window, GetAdd()));
+
+  module_->AddEntryComputation(builder.Build());
+
+  TF_CHECK_OK(module_->dynamic_parameter_binding().Bind(
+      DynamicParameterBinding::DynamicParameter{1, {}},
+      DynamicParameterBinding::DynamicDimension{0, {}, 0}));
+
+  TF_ASSERT_OK(RunInference());
+  EXPECT_EQ(inference_->GetDynamicSize(reduce_window, {}, 0), size_param);
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c8bfc8905064bcd7b68fe259fbcc1546ff083dbd
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.cc
@@ -0,0 +1,138 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+
+namespace xla {
+
+Status DynamicParameterBinding::Bind(
+    const DynamicParameter& dynamic_parameter,
+    const DynamicDimension& dynamic_dimension) {
+  auto result = bindings_.emplace(dynamic_dimension, dynamic_parameter);
+  TF_RET_CHECK(result.second);
+  return Status::OK();
+}
+
+absl::optional<DynamicParameterBinding::DynamicParameter>
+DynamicParameterBinding::GetBinding(const DynamicDimension& dynamic_dimension) {
+  auto param_iter = bindings_.find(dynamic_dimension);
+  if (param_iter == bindings_.end()) {
+    return absl::nullopt;
+  }
+  return param_iter->second;
+}
+
+DynamicParameterBindingProto DynamicParameterBinding::ToProto() const {
+  DynamicParameterBindingProto result;
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    DynamicParameterBindingProto::Binding binding_proto;
+    binding_proto.set_dynamic_param_num(dynamic_param.parameter_num);
+    for (int64 i : dynamic_param.parameter_index) {
+      binding_proto.add_dynamic_param_index(i);
+    }
+
+    binding_proto.set_target_param_num(dynamic_dimension.parameter_num);
+
+    for (int64 i : dynamic_dimension.parameter_index) {
+      binding_proto.add_target_param_index(i);
+    }
+
+    binding_proto.set_target_param_dim_num(dynamic_dimension.dimension);
+    result.add_entries()->Swap(&binding_proto);
+  }
+  return result;
+}
+
+StatusOr<DynamicParameterBinding> DynamicParameterBinding::CreateFromProto(
+    const DynamicParameterBindingProto& proto) {
+  DynamicParameterBinding result;
+  for (const DynamicParameterBindingProto::Binding& binding : proto.entries()) {
+    int64 dynamic_param_num = binding.dynamic_param_num();
+    ShapeIndex dynamic_param_index(binding.dynamic_param_index().begin(),
+                                   binding.dynamic_param_index().end());
+    int64 target_param_num = binding.target_param_num();
+    ShapeIndex target_param_index(binding.target_param_index().begin(),
+                                  binding.target_param_index().end());
+    int64 target_dim_num = binding.target_param_num();
+
+    TF_RETURN_IF_ERROR(
+        result.Bind(DynamicParameter{dynamic_param_num, dynamic_param_index},
+                    DynamicDimension{target_param_num, target_param_index,
+                                     target_dim_num}));
+  }
+
+  return result;
+}
+
+string DynamicParameterBinding::ToString() const {
+  std::vector<string> pieces;
+  pieces.push_back("DynamicParameterBinding: ");
+  for (const auto& binding : bindings_) {
+    const DynamicDimension& dynamic_dimension = binding.first;
+    const DynamicParameter& dynamic_param = binding.second;
+    pieces.push_back(absl::StrFormat(
+        " -- Input param number %lld at %s has dim %lld as dynamic"
+        " dimension, which is represented by param number %lld at "
+        "%s",
+        dynamic_dimension.parameter_num,
+        dynamic_dimension.parameter_index.ToString(),
+        dynamic_dimension.dimension, dynamic_param.parameter_num,
+        dynamic_param.parameter_index.ToString()));
+  }
+  return absl::StrJoin(pieces, "\n");
+}
+
+Status DynamicParameterBinding::ForEachBinding(BindingFn fn) const {
+  for (const auto& binding : bindings_) {
+    TF_RETURN_IF_ERROR(fn(binding.second, binding.first));
+  }
+  return Status::OK();
+}
+
+Status DynamicParameterBinding::Verify(const HloModule& module) const {
+  const HloComputation* entry = module.entry_computation();
+  return ForEachBinding([&](const DynamicParameter& dynamic_parameter,
+                            const DynamicDimension& dynamic_dimension)
+                            -> Status {
+    TF_RET_CHECK(dynamic_parameter.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(dynamic_dimension.parameter_num < entry->num_parameters());
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_parameter.parameter_num)->shape(),
+        dynamic_parameter.parameter_index));
+    TF_RET_CHECK(ShapeUtil::IndexIsValid(
+        entry->parameter_instruction(dynamic_dimension.parameter_num)->shape(),
+        dynamic_dimension.parameter_index));
+    TF_RET_CHECK(
+        dynamic_dimension.dimension <
+        ShapeUtil::Rank(ShapeUtil::GetSubshape(
+            entry->parameter_instruction(dynamic_dimension.parameter_num)
+                ->shape(),
+            dynamic_dimension.parameter_index)));
+    return Status::OK();
+  });
+}
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding) {
+  out << binding.ToString();
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding.h b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
new file mode 100644
index 0000000000000000000000000000000000000000..dd474d8eed1b2c30ddb8f624a864198c74eacaba
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding.h
@@ -0,0 +1,125 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
+
+#include <utility>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/service/hlo.pb.h"
+#include "tensorflow/compiler/xla/shape_tree.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+class HloModule;
+// We currently use an explicit API that takes an extra parameter to indicate
+// the runtime size of a dynamic dimension. DynamicParameterBinding indicates
+// the relationship between parameter: We can have a dynamic parameter that
+// points to another target parameter to indicate that the target parameter is
+// dynamic.
+//
+//
+// TODO(b/119520625): Remove this API once we have more dynamic shape infra
+// ready.
+class DynamicParameterBinding {
+ public:
+  // DynamicParameter represents a special parameter that is used to represent
+  // the runtime size of a dimension of another parameter. A dynamic parameter
+  // has to be a scalar value.
+  struct DynamicParameter {
+    // The parameter number of dynamic parameter.
+    int64 parameter_num;
+    // The index of the parameter.
+    ShapeIndex parameter_index;
+  };
+
+  // DynamicDimension represents a dimension whose size is determined at
+  // runtime. A DynamicDimension's runtime size is determined by the binded
+  // DynamicParameter using `DynamicParameterBinding::Bind` method.
+  struct DynamicDimension {
+    // The parameter number of dynamic dimension.
+    int64 parameter_num;
+    // The subshape index of the parameter.
+    ShapeIndex parameter_index;
+    // The dimension number in the subshape.
+    int64 dimension;
+
+    // "friend" keyword are added so these functions can be found by ADL.
+    template <typename H>
+    friend H AbslHashValue(H h, const DynamicDimension& m) {
+      return H::combine(std::move(h), m.parameter_num, m.parameter_index,
+                        m.dimension);
+    }
+
+    friend bool operator==(const DynamicDimension& lhs,
+                           const DynamicDimension& rhs) {
+      return lhs.parameter_num == rhs.parameter_num &&
+             lhs.parameter_index == rhs.parameter_index &&
+             lhs.dimension == rhs.dimension;
+    }
+  };
+
+  DynamicParameterBinding() = default;
+
+  virtual ~DynamicParameterBinding() = default;
+
+  // Adds binding which indicates that the dimension indicated by
+  // `dynamic_dimension` is dynamic, and its runtime size is represented by
+  // `dynamic_parameter`.
+  Status Bind(const DynamicParameter& dynamic_parameter,
+              const DynamicDimension& dynamic_dimension);
+
+  // Returns the parameter and the index representing the runtime size of
+  // dimension `dim_num` of parameter `param_num` at `param_index`.
+  //
+  // Returns nullopt if the binding is not set.
+  absl::optional<DynamicParameter> GetBinding(
+      const DynamicDimension& dynamic_dimension);
+
+  using BindingFn =
+      std::function<Status(const DynamicParameter& dynamic_parameter,
+                           const DynamicDimension& dynamic_dimension)>;
+
+  // Iterate through each binding.
+  Status ForEachBinding(BindingFn fn) const;
+
+  DynamicParameterBindingProto ToProto() const;
+
+  static StatusOr<DynamicParameterBinding> CreateFromProto(
+      const DynamicParameterBindingProto& proto);
+
+  string ToString() const;
+
+  // Verifies that the given binding is valid for the given module.
+  // Specifically, the binding's parameter and parameter size should be valid.
+  Status Verify(const HloModule& module) const;
+
+ private:
+  // Keeps track of mappings from DynamicDimension to DynamicParameter. The
+  // direction of is chosen so that we can easily query if a dimension is
+  // dynamic and which dynamic parameter represents the real size of that
+  // dimension.
+  absl::flat_hash_map<DynamicDimension, DynamicParameter> bindings_;
+};
+
+std::ostream& operator<<(std::ostream& out,
+                         const DynamicParameterBinding& binding);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_DYNAMIC_PARAMETER_BINDING_H_
diff --git a/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..83a6d83dffde7995bd8e43917d13c5fd2705ba6f
--- /dev/null
+++ b/tensorflow/compiler/xla/service/dynamic_parameter_binding_test.cc
@@ -0,0 +1,153 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
+
+#include <memory>
+#include <string>
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_dce.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_memory_scheduler.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_ordering.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+
+namespace xla {
+namespace {
+class DynamicParameterBindingTest : public HloTestBase {};
+
+TEST_F(DynamicParameterBindingTest, SimpleBinding) {
+  // 'b' is a dynamic shape; 'a' represents the real size of b's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  a = f32[] parameter(0)
+  b = f32[10] parameter(1)
+  ROOT root = (f32[], f32[10]) tuple(%a, %b)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {}},
+                   DynamicParameterBinding::DynamicDimension{1, {}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/1,
+                                                    /*parameter_index=*/{},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBinding) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's first
+  // dimension.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+TEST_F(DynamicParameterBindingTest, TupleBindingWithMultiDimension) {
+  // 'gte2' is a dynamic shape; 'gte1' represents the real size of gte2's both
+  // dimensions.
+  const string module_str = R"(
+HloModule TEST
+
+ENTRY main {
+  param = (f32[], f32[10, 10]) parameter(0)
+  gte1 = f32[] get-tuple-element(%param), index=0
+  gte2 = f32[10, 10] get-tuple-element(%param), index=1
+  ROOT root = (f32[], f32[10, 10]) tuple(%gte1, %gte2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(module_str));
+
+  DynamicParameterBinding binding;
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 0}));
+
+  TF_EXPECT_OK(
+      binding.Bind(DynamicParameterBinding::DynamicParameter{0, {0}},
+                   DynamicParameterBinding::DynamicDimension{0, {1}, 1}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+
+  EXPECT_TRUE(param);
+  EXPECT_EQ(param->parameter_num, 0);
+  EXPECT_EQ(param->parameter_index, ShapeIndex({0}));
+
+  absl::optional<DynamicParameterBinding::DynamicParameter> param2 =
+      binding.GetBinding(
+          DynamicParameterBinding::DynamicDimension{/*parameter_num=*/0,
+                                                    /*parameter_index=*/{1},
+                                                    /*dimension=*/0});
+  EXPECT_TRUE(param2);
+  EXPECT_EQ(param2->parameter_num, 0);
+  EXPECT_EQ(param2->parameter_index, ShapeIndex({0}));
+
+  TF_EXPECT_OK(binding.Verify(*module));
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
index f98c943669be8c14d245896b91cee3eee1e47429..6f1f95f2e9082649b6ca9cc0da5c238e15b77c10 100644
--- a/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/elemental_ir_emitter.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 // IWYU pragma: no_include "llvm/IR/Intrinsics.gen.inc"
 #include "absl/algorithm/container.h"
+#include "absl/container/flat_hash_map.h"
 #include "absl/strings/str_cat.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Instructions.h"
@@ -1671,26 +1672,66 @@ StatusOr<llvm::Value*> ElementalIrEmitter::EmitElementalConcatenate(
 
   b_->SetInsertPoint(init_block);
 
+  // Assign a unique id for each *different* operand, and count how often each
+  // operand is used. If all operands are different, the usage count will be 1
+  // for each operand.
+  absl::flat_hash_map<const HloInstruction*, int64> to_unique_operand_id;
+  std::vector<int64> operand_usage_count;
+  for (const auto* operand : hlo->operands()) {
+    if (to_unique_operand_id.contains(operand)) {
+      ++operand_usage_count[to_unique_operand_id[operand]];
+    } else {
+      int64 unique_operand_id = to_unique_operand_id.size();
+      to_unique_operand_id[operand] = unique_operand_id;
+      operand_usage_count.push_back(1);
+    }
+  }
+
+  // To avoid that we emit the same operand more than once, we create one basic
+  // block for each *different* operand with a PHI node for the different source
+  // index inputs.
+  std::vector<llvm::BasicBlock*> emit_operand_blocks(
+      to_unique_operand_id.size(), nullptr);
+  std::vector<llvm::PHINode*> source_index_phis(to_unique_operand_id.size(),
+                                                nullptr);
+  for (const auto* operand : hlo->operands()) {
+    int64 operand_id = to_unique_operand_id[operand];
+    if (emit_operand_blocks[operand_id] != nullptr) {
+      continue;
+    }
+
+    emit_operand_blocks[operand_id] = llvm_ir::CreateBasicBlock(
+        exit_block, StrCat("concat_index_from_operand_id", operand_id), b_);
+    auto saved_insert_point = b_->GetInsertPoint();
+    llvm_ir::SetToFirstInsertPoint(emit_operand_blocks[operand_id], b_);
+    source_index_phis[operand_id] =
+        PHI(source_index.GetType(), operand_usage_count[operand_id]);
+    auto operand_index = source_index;
+    operand_index[concat_dim] = source_index_phis[operand_id];
+
+    // Create the terminator of the block before calling operand generators,
+    // because they require non-degenerate basic blocks.
+    b_->SetInsertPoint(llvm::BranchInst::Create(
+        exit_block, /*InsertAtEnd=*/emit_operand_blocks[operand_id]));
+    TF_ASSIGN_OR_RETURN(llvm::Value * value,
+                        operand_to_generator.at(operand)(operand_index));
+    output->addIncoming(value, b_->GetInsertBlock());
+    b_->SetInsertPoint(init_block, saved_insert_point);
+  }
+
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
        ++operand_idx) {
     const HloInstruction* operand = hlo->operand(operand_idx);
-    auto true_block = llvm_ir::CreateBasicBlock(
-        exit_block, StrCat("concat_index_from_operand", operand_idx), b_);
     auto false_block = llvm_ir::CreateBasicBlock(
         exit_block, StrCat("concat_index_not_from_operand", operand_idx), b_);
     auto concat_dim_size =
         llvm::ConstantInt::get(source_index[concat_dim]->getType(),
                                operand->shape().dimensions(concat_dim));
-    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size), true_block,
-           false_block);
-
-    // Create the terminator of the true block before calling operand
-    // generators, because they require non-degenerate basic blocks.
-    b_->SetInsertPoint(
-        llvm::BranchInst::Create(exit_block, /*InsertAtEnd=*/true_block));
-    TF_ASSIGN_OR_RETURN(llvm::Value * value,
-                        operand_to_generator.at(operand)(source_index));
-    output->addIncoming(value, b_->GetInsertBlock());
+    int64 operand_id = to_unique_operand_id[operand];
+    source_index_phis[operand_id]->addIncoming(source_index[concat_dim],
+                                               b_->GetInsertBlock());
+    CondBr(ICmpULT(source_index[concat_dim], concat_dim_size),
+           emit_operand_blocks[operand_id], false_block);
 
     // Subtract the size of the concat dimension of the current operand
     // from the source index.
@@ -2204,13 +2245,15 @@ llvm_ir::ElementGenerator ElementalIrEmitter::MakeElementGenerator(
                 : iota->shape();
         PrimitiveType component_element_type = component_shape.element_type();
         llvm::Value* iota_result;
-        if (ShapeUtil::ElementIsIntegral(component_shape)) {
+        if (primitive_util::IsIntegralType(component_element_type) ||
+            component_element_type == PRED) {
           iota_result = b_->CreateIntCast(
               elem_index_linear,
               llvm_ir::PrimitiveTypeToIrType(component_element_type, module_),
               /*isSigned=*/false);
         } else {
-          TF_RET_CHECK(ShapeUtil::ElementIsFloating(component_shape))
+          TF_RET_CHECK(
+              primitive_util::IsFloatingPointType(component_element_type))
               << component_element_type;
           llvm::Type* float_ir_type;
           if (component_element_type == BF16) {
diff --git a/tensorflow/compiler/xla/service/executable.h b/tensorflow/compiler/xla/service/executable.h
index 45f620f3f33eee41eefa9ddfdfb166a5ba76caef..b34bca55a48b113c325dbf28c03f7a0f5b71f658 100644
--- a/tensorflow/compiler/xla/service/executable.h
+++ b/tensorflow/compiler/xla/service/executable.h
@@ -61,7 +61,7 @@ struct ExecutionOutput {
 class Executable {
  public:
   explicit Executable(
-      std::unique_ptr<const HloModule> hlo_module,
+      std::unique_ptr<HloModule> hlo_module,
       std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
       std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
       : hlo_module_(std::move(hlo_module)),
@@ -162,7 +162,7 @@ class Executable {
     return hlo_profile_printer_data_ != nullptr;
   }
 
-  const HloModule& module() const { return *hlo_module_; }
+  HloModule& module() const { return *hlo_module_; }
 
   const bool has_module() const { return hlo_module_ != nullptr; }
 
@@ -199,7 +199,7 @@ class Executable {
   // HloModule this was compiled from. BufferAssignment keeps pointers to
   // HloInstructions owned by the HloModule so we need to keep the HloModule
   // around.
-  const std::unique_ptr<const HloModule> hlo_module_;
+  const std::unique_ptr<HloModule> hlo_module_;
 
   // HloSnapshot this was compiled from. Null if not dumping executions.
   std::unique_ptr<HloSnapshot> hlo_snapshot_;
diff --git a/tensorflow/compiler/xla/service/gpu/BUILD b/tensorflow/compiler/xla/service/gpu/BUILD
index b1629616acd2bb715d5aa1a89286a38a45417d2c..bfd1b6cb1492f5cb709e2ecefe73782094e26f5e 100644
--- a/tensorflow/compiler/xla/service/gpu/BUILD
+++ b/tensorflow/compiler/xla/service/gpu/BUILD
@@ -701,6 +701,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_cse",
         "//tensorflow/compiler/xla/service:hlo_dce",
         "//tensorflow/compiler/xla/service:hlo_element_type_converter",
+        "//tensorflow/compiler/xla/service:hlo_get_dimension_size_rewriter",
         "//tensorflow/compiler/xla/service:hlo_pass",
         "//tensorflow/compiler/xla/service:hlo_pass_pipeline",
         "//tensorflow/compiler/xla/service:hlo_proto",
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
index d7829045cc127deaa4c2c9b705dca5285d704af2..3a09d4d4716950a09d65dd093272482d55ac5c27 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_padding_legalization.cc
@@ -43,13 +43,14 @@ bool IsForwardConvolutionCanonical(const HloInstruction& conv) {
 // dilation), returns kPad and/or kSlice instructions that explicitly apply the
 // padding; otherwise returns the original input operand. When there is both
 // positive padding (including dilation) and negative padding, we insert both
-// kPad and kSlice.
+// kPad and kSlice. Modifies 'conv_window' accordingly if any padding was moved
+// into a kPad or kSlice op.
 HloInstruction* MaybePaddedAndSlicedInput(
-    const Window& conv_window, const ConvolutionDimensionNumbers& conv_dnums,
+    Window* conv_window, const ConvolutionDimensionNumbers& conv_dnums,
     HloInstruction* input) {
   HloComputation* computation = input->parent();
-  if (!window_util::HasSymmetricPadding(conv_window) ||
-      window_util::HasBaseDilation(conv_window)) {
+  if (!window_util::HasSymmetricPadding(*conv_window) ||
+      window_util::HasBaseDilation(*conv_window)) {
     // If padding is uneven or has dilation, we insert a kPad instruction that
     // applies positive padding and dilation.
     //
@@ -62,12 +63,21 @@ HloInstruction* MaybePaddedAndSlicedInput(
         MakeNoPaddingConfig(input->shape().dimensions_size());
     for (size_t i = 0; i < conv_dnums.input_spatial_dimensions().size(); ++i) {
       int64 dim = conv_dnums.input_spatial_dimensions(i);
-      padding_config.mutable_dimensions(dim)->set_edge_padding_low(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_low()));
-      padding_config.mutable_dimensions(dim)->set_edge_padding_high(
-          std::max<int64>(0LL, conv_window.dimensions(i).padding_high()));
-      padding_config.mutable_dimensions(dim)->set_interior_padding(
-          conv_window.dimensions(i).base_dilation() - 1);
+      if (conv_window->dimensions(i).padding_low() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_low(
+            conv_window->dimensions(i).padding_low());
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() > 0) {
+        padding_config.mutable_dimensions(dim)->set_edge_padding_high(
+            conv_window->dimensions(i).padding_high());
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
+      if (conv_window->dimensions(i).base_dilation() != 1) {
+        padding_config.mutable_dimensions(dim)->set_interior_padding(
+            conv_window->dimensions(i).base_dilation() - 1);
+        conv_window->mutable_dimensions(i)->set_base_dilation(1);
+      }
     }
     PrimitiveType element_type = input->shape().element_type();
     HloInstruction* padding = computation->AddInstruction(
@@ -75,7 +85,7 @@ HloInstruction* MaybePaddedAndSlicedInput(
     input = MakePadHlo(input, padding, padding_config).ValueOrDie();
   }
 
-  if (window_util::HasNegativePadding(conv_window)) {
+  if (window_util::HasNegativePadding(*conv_window)) {
     // If the window has negative padding, insert a kSlice that explicitly
     // applies negative padding.
     //
@@ -89,10 +99,14 @@ HloInstruction* MaybePaddedAndSlicedInput(
       int64 dim = conv_dnums.input_spatial_dimensions(i);
       // If dimension "dim" has negative padding, increase the start index or
       // decrement the limit index by the amount of negative padding.
-      start_indices[dim] +=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_low());
-      limit_indices[dim] -=
-          std::max<int64>(0LL, -conv_window.dimensions(i).padding_high());
+      if (conv_window->dimensions(i).padding_low() < 0) {
+        start_indices[dim] += -conv_window->dimensions(i).padding_low();
+        conv_window->mutable_dimensions(i)->set_padding_low(0);
+      }
+      if (conv_window->dimensions(i).padding_high() < 0) {
+        limit_indices[dim] -= -conv_window->dimensions(i).padding_high();
+        conv_window->mutable_dimensions(i)->set_padding_high(0);
+      }
     }
 
     input =
@@ -140,25 +154,22 @@ bool CudnnConvPaddingLegalization::CanonicalizeForwardConvolution(
 
   // Insert slices and/or pads between the convolution and its input and/or
   // kernel operand.
+  Window new_conv_window = conv->window();
   HloInstruction* new_input = MaybePaddedAndSlicedInput(
-      conv->window(), conv->convolution_dimension_numbers(),
+      &new_conv_window, conv->convolution_dimension_numbers(),
       conv->mutable_operand(0));
   HloInstruction* new_kernel =
-      MaybePaddedKernel(conv->window(), conv->convolution_dimension_numbers(),
+      MaybePaddedKernel(new_conv_window, conv->convolution_dimension_numbers(),
                         conv->mutable_operand(1));
 
-  // Remove the padding from convolution's window field. These paddings are
-  // made explicit with the inserted pads.
-  Window new_conv_window = conv->window();
+  // Remove the window dilation from convolution's window field. These paddings
+  // are made explicit with the pads inserted by MaybePaddedKernel().
   for (size_t i = 0; i < new_conv_window.dimensions_size(); ++i) {
     WindowDimension* dim = new_conv_window.mutable_dimensions(i);
 
     // The size of the kernel may have changed so update the Window to match.
     dim->set_size(new_kernel->shape().dimensions(
         conv->convolution_dimension_numbers().kernel_spatial_dimensions(i)));
-    dim->set_padding_low(0);
-    dim->set_padding_high(0);
-    dim->set_base_dilation(1);
     dim->set_window_dilation(1);
   }
 
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
index 4ce877f62a55c960765314670288ee626c5fc15b..e81850db69edced29ea31bb2a526b0503bf8a453 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_rewriter.cc
@@ -77,7 +77,11 @@ bool CanImplementAsCudnnForwardConv(HloInstruction* conv) {
     return false;
   }
 
-  if (window_util::HasWindowReversal(conv->window())) {
+  // CuDNN can perform either cross correlation (no reversal),
+  // or convolution (all dimensions reversed).
+  if (dnums.input_spatial_dimensions_size() == 2
+          ? !window_util::AllOrNoneReversed(conv->window())
+          : window_util::HasWindowReversal(conv->window())) {
     return false;
   }
   return true;
diff --git a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
index 492d290bf4a27a91fa14dea95ac62d90bc1fa28a..3425e1b4942aaf1011ba1bf1c50dd7e79c1f9807 100644
--- a/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
+++ b/tensorflow/compiler/xla/service/gpu/cudnn_conv_runner.cc
@@ -138,6 +138,7 @@ Status RunCudnnConvImpl(CudnnConvParams params,
 
   const int num_dimensions = window.dimensions_size();
   CHECK_LE(num_dimensions, 3);
+  CHECK_GE(num_dimensions, 1);
   // cuDNN does not support 1D convolutions. We therefore express 1D
   // convolutions as 2D convolutions where the first spatial dimension is 1.
   // This matches the behavior of TF (see definition of conv1d in
@@ -148,10 +149,15 @@ Status RunCudnnConvImpl(CudnnConvParams params,
            output_shape.element_type())
       << ShapeUtil::HumanString(output_shape);
 
+  // If one dimension is reversed, we need to have all dimensions reversed (so
+  // we're doing convolution not cross correlation).
+  const bool dims_reversed = window.dimensions()[0].window_reversal();
+
   CHECK_EQ(num_dimensions, dnums.input_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.kernel_spatial_dimensions_size());
   CHECK_EQ(num_dimensions, dnums.output_spatial_dimensions_size());
   for (const WindowDimension& dim : window.dimensions()) {
+    CHECK_EQ(dims_reversed, dim.window_reversal());
     CHECK_EQ(dim.padding_low(), dim.padding_high());
     CHECK_EQ(dim.base_dilation(), 1)
         << "cudnn does not support base dilation; it "
@@ -198,6 +204,7 @@ Status RunCudnnConvImpl(CudnnConvParams params,
 
   ConvolutionDescriptor convolution_descriptor(effective_num_dimensions);
   convolution_descriptor.set_group_count(feature_group_count);
+  convolution_descriptor.set_convolution_not_crosscorr(dims_reversed);
   for (int dim = 0; dim < num_dimensions; ++dim) {
     convolution_descriptor
         .set_zero_padding(
@@ -363,14 +370,12 @@ StatusOr<CudnnConvParams> GetCudnnConvParams(
       params.output_shape = &conv_result_shape;
       params.fusion.emplace();
       auto& fusion = *params.fusion;
-      if (backend_config.activation_mode() <
-          static_cast<int64>(se::dnn::ActivationMode::kNumActivationModes)) {
-        fusion.mode = static_cast<se::dnn::ActivationMode>(
-            backend_config.activation_mode());
-      } else {
+      if (!se::dnn::ActivationMode_IsValid(backend_config.activation_mode())) {
         return InternalError("Bad activation mode: %s",
                              backend_config.ShortDebugString());
       }
+      fusion.mode = static_cast<se::dnn::ActivationMode>(
+          backend_config.activation_mode());
       fusion.side_input_scale = backend_config.side_input_scale();
       params.input_buf = operand_buffers[0];
       params.filter_buf = operand_buffers[1];
diff --git a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
index 6dcdaf1cfe06e446deed847aaf29088a7ed10e13..2ab754a471070d5f90a3eaebd0600ff180d2fe5d 100644
--- a/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/elemental_ir_emitter.cc
@@ -161,6 +161,16 @@ StatusOr<llvm::Value*> GpuElementalIrEmitter::EmitFloatBinaryOp(
   PrimitiveType lhs_input_type = op->operand(0)->shape().element_type();
   PrimitiveType rhs_input_type = op->operand(1)->shape().element_type();
   PrimitiveType output_type = op->shape().element_type();
+  HloOpcode opcode = op->opcode();
+
+  if (hlo_module_config_.debug_options().xla_gpu_enable_fast_min_max() &&
+      (opcode == HloOpcode::kMaximum || opcode == HloOpcode::kMinimum)) {
+    return llvm_ir::EmitCallToIntrinsic(
+        opcode == HloOpcode::kMaximum ? llvm::Intrinsic::maxnum
+                                      : llvm::Intrinsic::minnum,
+        {lhs_value, rhs_value}, {lhs_value->getType()}, b_);
+  }
+
   switch (op->opcode()) {
     case HloOpcode::kRemainder: {
       return EmitLibdeviceMathCall("__nv_fmod", {lhs_value, rhs_value},
diff --git a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
index 30c1f9088968305ad0207164ecb07ba13cc89ee6..470457935acacb8940af241dadb393d770786939 100644
--- a/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
+++ b/tensorflow/compiler/xla/service/gpu/fusion_merger.cc
@@ -229,7 +229,7 @@ Status FusionInstructionMerger::HandleFusion(HloInstruction* fusion) {
   if (!absl::c_all_of(fusion->users(), [&](const HloInstruction* user) {
         return user->opcode() == HloOpcode::kFusion &&
                (user->fusion_kind() == HloInstruction::FusionKind::kLoop ||
-                (user->fusion_kind() == HloInstruction::FusionKind::kInput &&
+                (IsReduceInputFusion(*user) &&
                  LayoutsAreReduceInputFusionFriendly(*fusion, *user)));
       })) {
     VLOG(3) << "Not merging " << fusion->name()
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
index 57426327822d95a42f407ed7488f35acfd3623d2..ae2e718db29803a085401969a7d9b09abf690a6c 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.cc
@@ -51,7 +51,7 @@ GpuExecutable::GpuExecutable(
     const string& ptx, const std::vector<uint8>& cubin,
     std::pair<int, int> compute_capability,
     std::unique_ptr<const ThunkSchedule> thunk_schedule,
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<const BufferAssignment> assignment,
     std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
     std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map)
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_executable.h b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
index 0e276282e40fba0ae4881a51dad0c7c9e8d1c081..2b3c77f5b82aa94f44d8de56caf0f4d31c05e0cb 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_executable.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_executable.h
@@ -54,7 +54,7 @@ class GpuExecutable : public Executable {
   GpuExecutable(const string& ptx, const std::vector<uint8>& cubin,
                 std::pair<int, int> compute_capability,
                 std::unique_ptr<const ThunkSchedule> thunk_schedule,
-                std::unique_ptr<const HloModule> hlo_module,
+                std::unique_ptr<HloModule> hlo_module,
                 std::unique_ptr<const BufferAssignment> assignment,
                 std::unique_ptr<HloProfilePrinterData> hlo_profile_printer_data,
                 std::unique_ptr<HloProfileIndexMap> hlo_profile_index_map);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
index 2d31fd5570c468b0c42fa308535fd335f3588a79..452e763a8eaadc805cd3a3859a68e2a31598fd36 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.cc
@@ -55,7 +55,7 @@ bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
   });
 }
 
-bool IsInputFusibleReduction(const HloInstruction& instr) {
+bool IsReduceInputFusion(const HloInstruction& instr) {
   if (instr.IsMultiOutputFusion()) {
     for (const HloInstruction* operand :
          instr.fused_expression_root()->operands()) {
@@ -67,17 +67,70 @@ bool IsInputFusibleReduction(const HloInstruction& instr) {
         return true;
       }
     }
-    return false;
-  } else if (instr.opcode() == HloOpcode::kFusion) {
-    if (IsReductionToVector(*instr.fused_expression_root())) {
-      CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
-          << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
-          << instr.ToString();
-      return true;
+  } else if (instr.opcode() == HloOpcode::kFusion &&
+             IsReductionToVector(*instr.fused_expression_root())) {
+    CHECK(instr.fusion_kind() == HloInstruction::FusionKind::kInput)
+        << " Fusion rooted at reduction-to-vector op must be of kind kInput: "
+        << instr.ToString();
+    return true;
+  }
+  return false;
+}
+
+bool IsInputFusibleReduction(const HloInstruction& instr) {
+  return IsReduceInputFusion(instr) || IsReductionToVector(instr);
+}
+
+bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
+                                          const HloInstruction& instr2) {
+  // Returns the instructions that determines the emitter used for lowering,
+  // sometimes referred to as "the real hero".
+  auto get_real_hero =
+      [&](const HloInstruction* instr) -> const HloInstruction* {
+    if (instr->opcode() == HloOpcode::kFusion) {
+      auto fused_expression_root = instr->fused_expression_root();
+      if (instr->IsMultiOutputFusion()) {
+        // If possible, we want to pick a reduction-to-vector operand of the
+        // fusion root, because it has the most constraints.
+        for (const auto* inst : fused_expression_root->operands()) {
+          if (IsReductionToVector(*inst)) {
+            return inst;
+          }
+        }
+        return fused_expression_root->operands()[0];
+      }
+      return fused_expression_root;
     }
+    return instr;
+  };
+
+  // Multi-output fusion kernels share a common parallel loop. The loop
+  // dimenstions are determined by instruction shapes.
+  auto get_loop_shape = [&](const HloInstruction* element_instr) {
+    // Special-case reduction-to-vector ops: The loop dimensions are determined
+    // by the shape of the first operand.
+    if (IsReductionToVector(*element_instr)) {
+      return element_instr->operand(0)->shape();
+    }
+    return element_instr->shape();
+  };
+
+  // All shapes of the root tuple of multi-output fusions should agree, i.e. all
+  // root ops should have equal output shapes. An exception are
+  // reduction-to-vector ops. Here the input shapes of the reduction (first
+  // operand shape) and the reduction dimensions need to match.
+  auto* instr_1 = get_real_hero(&instr1);
+  auto* instr_2 = get_real_hero(&instr2);
+  // TODO(tjoerg): Relax the shape constraint. The datatype does not matter.
+  if (IsReductionToVector(*instr_1) && IsReductionToVector(*instr_2) &&
+      (!ShapeUtil::Equal(instr_1->shape(), instr_2->shape()) ||
+       instr_1->dimensions() != instr_2->dimensions())) {
     return false;
   }
-  return IsReductionToVector(instr);
+  // The elementwise output shapes must be the same (including layout).
+  // TODO(tjoerg): Further relax the constraint. The datatype does not matter.
+  return ShapeUtil::EqualIgnoringFpPrecision(get_loop_shape(instr_1),
+                                             get_loop_shape(instr_2));
 }
 
 }  // namespace gpu
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
index f7c24a0d5bbfcc61389ea19ae7f769671e4e974d..e9d7ba1c4cfa865532a0d06c2ed883a2fea4e2cd 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible.h
@@ -33,16 +33,29 @@ namespace gpu {
 bool LayoutsAreReduceInputFusionFriendly(const HloInstruction& producer,
                                          const HloInstruction& reduce);
 
-// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
-// is either an unfused reduction-to-vector op, an input fusion rooted at a
-// reduction-to-vector op, or a multi-output input fusion with at least one
-// reduction-to-vector op root.
 // Note that reduction ops are lowered in different ways. Reduce input fusions
 // are lowered by IrEmitterUnnested::EmitReductionToVector and must be rooted at
 // reduction-to-vector ops. Other reduction ops are lowered by
 // GpuElementalIrEmitter and fused like elementwise ops.
+
+// Whether `instr` is an input fusion rooted at a reduction-to-vector op or a
+// multi-output input fusion with at least one reduction-to-vector op root.
+bool IsReduceInputFusion(const HloInstruction& instr);
+
+// Whether `instr` is fusible as root of a reduce input fusions, i.e. `instr`
+// is either an unfused reduction-to-vector op or a reduce input fusion.
 bool IsInputFusibleReduction(const HloInstruction& instr);
 
+// Whether instruction shapes are compatible for multi-output fusion, i.e.
+// whether the emitters support lowering the resulting fusion.
+// This function works for both, sibling and producer-conumser multi-output
+// fusion.
+// So far, multi-output fusion is supported for loop fusions and reduce
+// input fusions only. It is up to the caller to ensure the instructions
+// themselves are fusible!
+bool ShapesCompatibleForMultiOutputFusion(const HloInstruction& instr1,
+                                          const HloInstruction& instr2);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
index d91b7bc61fda5a07c163a07ec0e1644d2ad9db49..15d4ee206ce8debcb8a5dbc6ec65d29ba257d302 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_fusible_test.cc
@@ -178,7 +178,7 @@ TEST_F(GpuFusibleTest,
   EXPECT_TRUE(LayoutsAreReduceInputFusionFriendly(*loop_fusion, *reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ReductionToVector) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -191,10 +191,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ReductionToVector) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_ElementalReduction) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     ENTRY entry {
       c0 = f32[] parameter(0)
@@ -207,10 +208,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_ElementalReduction) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kReduce);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -225,10 +227,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_SingleOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -243,10 +246,11 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_SingleOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputInputReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -263,11 +267,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputInputReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputInputReduceFusionWithExtraOutputs) {
+       IsReduceInputFusion_MultiOutputInputReduceFusionWithExtraOutputs) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -284,10 +289,11 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_TRUE(IsReduceInputFusion(*reduce));
   EXPECT_TRUE(IsInputFusibleReduction(*reduce));
 }
 
-TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
+TEST_F(GpuFusibleTest, IsReduceInputFusion_MultiOutputLoopReduceFusion) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -304,11 +310,12 @@ TEST_F(GpuFusibleTest, IsInputFusibleReduction_MultiOutputLoopReduceFusion) {
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
 TEST_F(GpuFusibleTest,
-       IsInputFusibleReduction_MultiOutputLoopFusionReduceAndElementwiseOp) {
+       IsReduceInputFusion_MultiOutputLoopFusionReduceAndElementwiseOp) {
   auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
     fused_reduction {
       c0 = f32[] parameter(0)
@@ -325,8 +332,304 @@ TEST_F(GpuFusibleTest,
   const HloInstruction* reduce =
       module->entry_computation()->root_instruction();
   ASSERT_EQ(reduce->opcode(), HloOpcode::kFusion);
+  EXPECT_FALSE(IsReduceInputFusion(*reduce));
   EXPECT_FALSE(IsInputFusibleReduction(*reduce));
 }
 
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_LoopFusions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[6400]{0} parameter(0)
+      const.2 = f32[] constant(1)
+      ROOT div = f32[6400]{0} divide(p0.2, const.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_IgnoreFpPrecision) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[6400]{0} parameter(0)
+      ROOT convert = f16[6400]{0} convert(p0.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Reduce) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(0)
+      reduce = f32[] reduce(p0, const.2), dimensions={0}, to_apply=scalar_add
+      ROOT root = (f32[6400]{0}, f32[]) tuple(fusion.1, reduce)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *reduce));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_Elementwise) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[6400]{0} parameter(0)
+      ROOT mul = f32[6400]{0} multiply(p0.1, p0.1)
+    }
+
+    ENTRY entry {
+      p0 = f32[6400]{0} parameter(0)
+      fusion.1 = f32[6400]{0} fusion(p0), kind=kLoop, calls=fused_computation_1
+      const.2 = f32[] constant(1)
+      div = f32[6400]{0} divide(p0, const.2)
+      ROOT root = (f32[6400]{0}, f32[6400]{0}) tuple(fusion.1, div)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* div =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion, *div));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_MultiOutputLoopFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_computation_1 {
+      p0.1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      mul = f32[8,1,5,16,1,1]{5,4,3,2,1,0} multiply(p0.1, p0.1)
+      exp = f32[8,1,5,16,1,1]{5,4,3,2,1,0} exponential(p0.1)
+      ROOT tuple = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(mul, exp)
+    }
+
+    fused_computation_2 {
+      p0.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      const.2 = f32[] constant(0)
+      ROOT add = f32[8,1,5,16,1,1]{5,4,3,2,1,0} add(p0.2, const.2)
+    }
+
+    ENTRY entry {
+      p0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} parameter(0)
+      fusion.1 = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) fusion(p0), kind=kLoop, calls=fused_computation_1
+      fusion.2 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} fusion(p0), kind=kLoop, calls=fused_computation_2
+      gte0 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=0
+      gte1 = f32[8,1,5,16,1,1]{5,4,3,2,1,0} get-tuple-element(fusion.1), index=1
+      ROOT root = (f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}, f32[8,1,5,16,1,1]{5,4,3,2,1,0}) tuple(gte0, gte1, fusion.2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1)->operand(0);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_UnfusedOps) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      exp = f32[2,2,2]{2,1,0} exponential(p0)
+      reduce = f32[2,2]{1,0} reduce(exp, c0), dimensions={2}, to_apply=scalar_add
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* exp =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_DifferentLayouts) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{0,1,2} parameter(1)
+      c0 = f32[] constant(0)
+      exp = f32[2,2,2]{2,1,0} exponential(p0)
+      reduce = f32[2,2]{0,1} reduce(p1, c0), dimensions={2}, to_apply=scalar_add
+      ROOT root = (f32[2,2]{0,1}, f32[2,2,2]{2,1,0}) tuple(reduce, exp)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* reduce =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* exp =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*reduce, *exp));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_MultiOutputReduceFusion) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_select {
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      c0 = f32[] constant(0)
+      broadcast = f32[2,2,2]{2,1,0} broadcast(f32[] c0), dimensions={}
+      greater-than = pred[2,2,2]{2,1,0} greater-than(f32[2,2,2]{2,1,0} p1.1, f32[2,2,2]{2,1,0} broadcast)
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      ROOT select = f32[2,2,2]{2,1,0} select(pred[2,2,2]{2,1,0} greater-than, f32[2,2,2]{2,1,0} p0.1, f32[2,2,2]{2,1,0} broadcast)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      c1 = f32[] constant(0)
+      r1 = f32[2,2]{1,0} reduce(p0.2, c1), dimensions={2}, to_apply=scalar_add
+      mul = f32[2,2,2]{2,1,0} multiply(p0.2, p0.2)
+      r2 = f32[2,2]{1,0} reduce(mul, c1), dimensions={2}, to_apply=scalar_add
+      ROOT tuple = (f32[2,2]{1,0}, f32[2,2]{1,0}) tuple(r1, r2)
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      select = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_select
+      fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(select), kind=kInput, calls=fused_reduce
+      gte0 = f32[2,2]{1,0} get-tuple-element(fusion), index=0
+      gte1 = f32[2,2]{1,0} get-tuple-element(fusion), index=1
+      ROOT root = (f32[2,2]{1,0}, f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(gte1, gte1, select)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0)->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1)->operand(0);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest, ShapesCompatibleForMultiOutputFusion_ReduceFusions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_reduce_1 {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add
+    }
+
+    fused_reduce_2 {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={0}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1
+      reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_TRUE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_DifferentReduceDimensions) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_reduce_1 {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      c0 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} p0.1, f32[] c0), dimensions={0}, to_apply=scalar_add
+    }
+
+    fused_reduce_2 {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={2}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      reduce_1 = f32[2,2]{1,0} fusion(p0), kind=kLoop, calls=fused_reduce_1
+      reduce_2 = f32[2,2]{1,0} fusion(p1), kind=kLoop, calls=fused_reduce_2
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(reduce_1, reduce_2)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
+TEST_F(GpuFusibleTest,
+       ShapesCompatibleForMultiOutputFusion_NoReductionToVector) {
+  auto module = ParseHloString(absl::StrCat(kModulePrefix, R"(
+    fused_element_wise {
+      p0.1 = f32[2,2,2]{2,1,0} parameter(0)
+      p1.1 = f32[2,2,2]{2,1,0} parameter(1)
+      ROOT add = f32[2,2,2]{2,1,0} add(p0.1, p1.1)
+    }
+
+    fused_reduce {
+      p0.2 = f32[2,2,2]{2,1,0} parameter(0)
+      mul = f32[2,2,2]{2,1,0} multiply(f32[2,2,2]{2,1,0} p0.2, f32[2,2,2]{2,1,0} p0.2)
+      c1 = f32[] constant(0)
+      // Note that reduce is not a reduction-to-vector.
+      ROOT reduce = f32[2,2]{1,0} reduce(f32[2,2,2]{2,1,0} mul, f32[] c1), dimensions={1}, to_apply=scalar_add
+    }
+
+    ENTRY reduce {
+      p0 = f32[2,2,2]{2,1,0} parameter(0)
+      p1 = f32[2,2,2]{2,1,0} parameter(1)
+      element_wise = f32[2,2,2]{2,1,0} fusion(p0, p1), kind=kLoop, calls=fused_element_wise
+      fusion = (f32[2,2]{1,0}, f32[2,2]{1,0}) fusion(element_wise), kind=kLoop, calls=fused_reduce
+      ROOT root = (f32[2,2]{1,0}, f32[2,2,2]{2,1,0}) tuple(fusion, element_wise)
+    })"))
+                    .ValueOrDie();
+  const HloInstruction* fusion_1 =
+      module->entry_computation()->root_instruction()->operand(0);
+  const HloInstruction* fusion_2 =
+      module->entry_computation()->root_instruction()->operand(1);
+  EXPECT_FALSE(ShapesCompatibleForMultiOutputFusion(*fusion_1, *fusion_2));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
index 91609c730b6c0d666eb607fb42b918c0f8f250e5..1126943624a3771433ecac591545d335c1890115 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.cc
@@ -37,7 +37,7 @@ class GpuHloOrdering : public PredecessorHloOrdering {
  public:
   GpuHloOrdering(const HloModule* module,
                  const StreamAssignment& stream_assignment,
-                 const std::vector<const HloInstruction*>& thunk_launch_order);
+                 const std::vector<HloInstruction*>& thunk_launch_order);
   ~GpuHloOrdering() override = default;
 
   // Only the entry computation can possibly be sequentially ordered, and only
@@ -56,7 +56,7 @@ class GpuHloOrdering : public PredecessorHloOrdering {
 
 GpuHloOrdering::GpuHloOrdering(
     const HloModule* module, const StreamAssignment& stream_assignment,
-    const std::vector<const HloInstruction*>& thunk_launch_order)
+    const std::vector<HloInstruction*>& thunk_launch_order)
     : PredecessorHloOrdering(module) {
   // The entry computation has a total order when there's only one stream.
   if (stream_assignment.StreamCount() == 1) {
@@ -150,7 +150,7 @@ GpuHloOrdering::GpuHloOrdering(
 // However, if the total order is A,B,D,C,E, then C and E can run
 // concurrently.
 void BFSLaunchOrder(const HloComputation* computation,
-                    std::vector<const HloInstruction*>* launch_order) {
+                    std::vector<HloInstruction*>* launch_order) {
   // This topological sort uses two data structures:
   // 1. `incoming_edge_count` which keeps track of the number of incoming
   // edges to each HLO;
@@ -158,9 +158,9 @@ void BFSLaunchOrder(const HloComputation* computation,
   //
   // The sorting algorithm repeatedly pops the top from the queue and deletes
   // that HLO from the graph, making more HLOs incoming-edge free.
-  std::deque<const HloInstruction*> queue;
+  std::deque<HloInstruction*> queue;
   std::unordered_map<const HloInstruction*, int64> incoming_edge_count;
-  for (const auto& hlo : computation->instructions()) {
+  for (auto* hlo : computation->instructions()) {
     if (hlo->operand_count() == 0) {
       queue.push_back(hlo);
     } else {
@@ -172,10 +172,10 @@ void BFSLaunchOrder(const HloComputation* computation,
   }
 
   while (!queue.empty()) {
-    const HloInstruction* x = queue.front();
+    HloInstruction* x = queue.front();
     queue.pop_front();
     launch_order->push_back(x);
-    for (const HloInstruction* y : x->users()) {
+    for (HloInstruction* y : x->users()) {
       --incoming_edge_count[y];
       if (incoming_edge_count[y] == 0) {
         queue.push_back(y);
@@ -195,14 +195,14 @@ StatusOr<std::unique_ptr<GpuHloSchedule>> GpuHloSchedule::Build(
   std::unique_ptr<GpuHloSchedule> schedule(new GpuHloSchedule);
 
   // Initialize thunk_launch_order_, the total order of thunk launches.
-  const HloComputation* entry_computation = module.entry_computation();
+  HloComputation* entry_computation = module.entry_computation();
   if (stream_assignment.StreamCount() == 1) {
     // All kernels are launched on a single stream, so there's no loss of
     // concurrency by optimizing for minimal memory usage.
     TF_ASSIGN_OR_RETURN(
         HloInstructionSequence sequence,
         ScheduleComputation(
-            *entry_computation, [pointer_size](const BufferValue& buffer) {
+            entry_computation, [pointer_size](const BufferValue& buffer) {
               return ShapeUtil::ByteSizeOf(buffer.shape(), pointer_size);
             }));
     schedule->thunk_launch_order_ = sequence.instructions();
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
index 07a7fc67aa555845c3de57e574ab582403ec0490..7f224ffe4f03f8f05b0f1907628d99d9df387770 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule.h
@@ -46,7 +46,7 @@ class GpuHloSchedule {
 
   // Returns the total order of thunk launches, represented in terms of HLO
   // instructions.
-  const std::vector<const HloInstruction*>& ThunkLaunchOrder() const {
+  const std::vector<HloInstruction*>& ThunkLaunchOrder() const {
     return thunk_launch_order_;
   }
 
@@ -60,7 +60,7 @@ class GpuHloSchedule {
  private:
   GpuHloSchedule();
 
-  std::vector<const HloInstruction*> thunk_launch_order_;
+  std::vector<HloInstruction*> thunk_launch_order_;
   std::unique_ptr<HloOrdering> hlo_ordering_;
 };
 
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
index 6d3aed15ebe7d925eda00a72177a03a2264a640c..91db7151f22fd75b20244878bee86d65acd1d304 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_hlo_schedule_test.cc
@@ -33,7 +33,7 @@ namespace gpu {
 
 class GpuHloScheduleTest : public HloTestBase {
  protected:
-  using HloVec = std::vector<const HloInstruction*>;
+  using HloVec = std::vector<HloInstruction*>;
 
   // Pre-canned shapes.
   Shape f32_2x2_ = ShapeUtil::MakeShape(F32, {2, 2});
@@ -44,7 +44,7 @@ class GpuHloScheduleTest : public HloTestBase {
         .ConsumeValueOrDie();
   }
 
-  std::unique_ptr<HloModule> CreateNewUnverifiedModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -79,7 +79,7 @@ TEST_F(GpuHloScheduleTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -139,7 +139,7 @@ TEST_F(GpuHloScheduleTest, SequentialAdd) {
   HloInstruction* add3 = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, add1, add2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add3));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -209,7 +209,7 @@ TEST_F(GpuHloScheduleTest, ConcurrentMatMul) {
   HloInstruction* add =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, dot2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
@@ -288,7 +288,7 @@ TEST_F(GpuHloScheduleTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> streams = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
index 1c0a23fa3eb38961d420aff05e412c3b4d8524e7..f59da2caa18646676297e66dd329c66fb5fddf1b 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment.cc
@@ -65,8 +65,8 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
 
   VLOG(2) << "Using heuristic to figure out layouts for " << instr->ToString();
 
-  // Empirically we've found with Volta and cudnn 7 that backward-input convs
-  // with stride are significantly faster with NCHW layouts.
+  // Empirically we've found with Volta and cudnn <= 7.3 that backward-input
+  // convs with stride are significantly faster with NCHW layouts.
   //
   // We could have used a mixed layout combination, e.g. (NHWC, NCHW, NCHW),
   // which on paper gives good performance. However, there are two observations:
@@ -75,11 +75,17 @@ HeuristicLayoutAssignment(const HloInstruction* instr,
   // * we've also observed that for mixed layouts, cuDNN transposes data back
   //   and forth from a different layout combination. If we end up with
   //   transposes anyway, we prefer to have them in XLA, as they can be fused.
-  // TODO(timshen): Figure out the exact condition. This may be achieved by
-  // auto-tuning layouts offline.
-  if (instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
-      window_util::HasStride(instr->window())) {
-    return kAllNCHW;
+  if (auto* dnn = stream_executor->AsDnn()) {
+    auto version_status = dnn->GetVersion();
+    if (version_status.ok()) {
+      auto version = version_status.ConsumeValueOrDie();
+      if (std::make_tuple(version.major_version(), version.minor_version()) <=
+              std::make_tuple(7, 3) &&
+          instr->custom_call_target() == kCudnnConvBackwardInputCallTarget &&
+          window_util::HasStride(instr->window())) {
+        return kAllNCHW;
+      }
+    }
   }
 
   // For other Volta f16 convolutions, use NHWC.
diff --git a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
index 8cc76c872c61634ca4344d8a8cdf8c6a75aea2ac..2ffc8bfb49b205dced0d540ba72426e72d95e596 100644
--- a/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/gpu_layout_assignment_test.cc
@@ -61,7 +61,7 @@ TEST_F(LayoutAssignmentTest, Elementwise) {
             HloInstruction::CreateParameter(1, ashape, "y"));
         auto add = builder.AddInstruction(
             HloInstruction::CreateBinary(ashape, HloOpcode::kAdd, x, y));
-        auto module = CreateNewUnverifiedModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(add));
 
@@ -148,7 +148,7 @@ TEST_F(LayoutAssignmentTest, BatchNormInference) {
           {operand, scale, offset, mean, variance, epsilon, feature_index},
           kCudnnBatchNormForwardInferenceCallTarget));
 
-      auto module = CreateNewUnverifiedModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -217,7 +217,7 @@ TEST_F(LayoutAssignmentTest, BatchNormTraining) {
           batchnorm_shape, {operand, scale, offset, epsilon, feature_index},
           kCudnnBatchNormForwardTrainingCallTarget));
 
-      auto module = CreateNewUnverifiedModule();
+      auto module = CreateNewVerifiedModule();
       HloComputation* computation =
           module->AddEntryComputation(builder.Build(batchnorm));
 
@@ -298,7 +298,7 @@ TEST_F(LayoutAssignmentTest, BatchNormGrad) {
                  feature_index},
                 kCudnnBatchNormBackwardCallTarget));
 
-        auto module = CreateNewUnverifiedModule();
+        auto module = CreateNewVerifiedModule();
         HloComputation* computation =
             module->AddEntryComputation(builder.Build(batchnorm));
 
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
index 43f43b50e4a6478f343088194871cc9d380bd2d2..6151dd8ff4c92bb81bd756c68cc9377633c8c9d5 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion.cc
@@ -80,7 +80,7 @@ bool IsIEEEFloatingPointScalarConstant(const HloInstruction* constant) {
 // This function limits the maximum number of operands to a fusion.
 //
 // There's a cap on how many parameters we can pass to a CUDA kernel, but
-// exactly what that limit is is hazy, as it depends on (among other things) how
+// exactly what that limit is hazy, as it depends on (among other things) how
 // much GPU constant memory is in use for other purposes.
 //
 // Moreover, we don't even know at the point that we're running fusion how many
@@ -181,7 +181,8 @@ bool GpuInstructionFusion::ShouldFuse(HloInstruction* consumer,
         return true;
       }
     } else if (consumer->operand_count() == 2 &&
-               consumer->opcode() == HloOpcode::kAdd) {
+               consumer->opcode() == HloOpcode::kAdd &&
+               consumer->operand(other_operand_index) != producer) {
       // Fuse a bias add into the output of the dot.
       return true;
     }
diff --git a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
index fb77bc4b8eb497d09014da96769b52aa606510af..688604cd36e5a45debf855aacd29d05ecda92341 100644
--- a/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/instruction_fusion_test.cc
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfDotUnfused) {
   auto reshape2 = builder.AddInstruction(HloInstruction::CreateReshape(
       ShapeUtil::MakeShape(S32, {1, 1, 1}), dot1));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -134,7 +134,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfDotUnfused) {
   auto transpose2 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {1, 1}), dot1, {0, 1}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose2, computation->root_instruction());
   EXPECT_FALSE(GpuInstructionFusion(/*may_duplicate=*/true)
@@ -358,6 +358,29 @@ TEST_F(InstructionFusionTest, DotOutputFusionBiasAdd) {
                       op::Parameter()));
 }
 
+TEST_F(InstructionFusionTest,
+       DotOperationFusion_DontOutputFuseDuplicateOperands) {
+  absl::string_view module_string = R"(
+HloModule module
+
+ENTRY main {
+  a = f32[50,60]{1,0} parameter(0)
+  b = f32[60,1]{1,0} parameter(1)
+  c = f32[50,1]{1,0} dot(a, b), lhs_contracting_dims={1}, rhs_contracting_dims={0}
+  ROOT d = f32[50,1]{1,0} add(c, c)
+}
+)";
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseAndReturnVerifiedModule(module_string));
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool fused_something,
+      GpuInstructionFusion(/*may_duplicate=*/false).Run(module.get()));
+  EXPECT_FALSE(fused_something);
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              Not(op::Fusion()));
+}
+
 // Compute sum(1/p0), where p0 has type f32, twice.  Check that the division is
 // duplicated and fused into both reduces.
 TEST_F(InstructionFusionTest, FloatingPointDivIsCheap) {
@@ -723,7 +746,7 @@ TEST_F(InstructionFusionTest, AvoidsLargeFusion) {
     sum = b.AddInstruction(
         HloInstruction::CreateBinary(shape, HloOpcode::kAdd, sum, param));
   }
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(b.Build());
   EXPECT_TRUE(GpuInstructionFusion(/*may_duplicate=*/true)
                   .Run(module.get())
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
index 42fb38dffae31b0f4322216545027e067cab250d..33e41a2782b5932430eea621d3cea2c6634f292f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.cc
@@ -268,5 +268,17 @@ string CudnnConvKindToString(CudnnConvKind kind) {
   }
 }
 
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b) {
+  return b->CreateAnd(
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b)),
+      b->CreateICmpEQ(
+          b->getInt32(0),
+          llvm_ir::EmitCallToIntrinsic(
+              llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b)));
+}
+
 }  // namespace gpu
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
index f373d4a8393a047aba599b0fae954e98a740161e..ebf4d926b7a280e10b09a2532caba7ad6ab3ceb2 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emission_utils.h
@@ -155,6 +155,10 @@ llvm::Value* EmitPrintf(absl::string_view fmt,
 llvm::Value* EmitFullWarpShuffleDown(llvm::Value* value, llvm::Value* offset,
                                      llvm::IRBuilder<>* builder);
 
+// Emits code that determines whether the current thread is thread 0 within
+// block 0 of the kernel.
+llvm::Value* IsBlock0Thread0(llvm::IRBuilder<>* b);
+
 }  // namespace gpu
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
index 7fcdd805ed32004a96ecc0da7de1d89bcf1b6229..6693f66d62d8b04d1b78e001fdb515b34539c67f 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.cc
@@ -63,9 +63,6 @@ IrEmitter::IrEmitter(const HloModuleConfig& hlo_module_config,
                 &ir_emitter_context->buffer_assignment(), &b_, module_,
                 is_nested),
       hlo_module_config_(hlo_module_config) {
-  b_.setFastMathFlags(llvm_ir::GetFastMathFlags(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math()));
 }
 
 Status IrEmitter::DefaultAction(HloInstruction* hlo) {
@@ -97,6 +94,18 @@ Status IrEmitter::HandleBitcast(HloInstruction* bitcast) {
   return Status::OK();
 }
 
+Status IrEmitter::HandleAddDependency(HloInstruction* add_dependency) {
+  VLOG(2) << "HandleAddDependency: " << add_dependency->ToString();
+  const HloInstruction* operand = add_dependency->operand(0);
+  // Add_Dependency is a no-op, but we still want to bind it to an llvm::Value
+  // sometimes, e.g., when it's operand is a constant or a bitcast of a
+  // constant.
+  if (bindings_.BoundToIrValue(*operand)) {
+    bindings_.BindHloToIrValue(*add_dependency, GetBasePointer(*operand));
+  }
+  return Status::OK();
+}
+
 Status IrEmitter::HandleGetTupleElement(HloInstruction* get_tuple_element) {
   auto operand = get_tuple_element->operand(0);
   CHECK(bindings_.BoundToIrValue(*operand));
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter.h b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
index 56c3f452006f9e2d5c37cc3b54701b2367abfa14..2da46c016935d0e927879bbfb0d05cfc4899d818 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter.h
@@ -100,6 +100,7 @@ class IrEmitter : public DfsHloVisitorWithDefault,
   Status HandleBatchNormInference(HloInstruction* batch_norm) override;
   Status HandleBatchNormTraining(HloInstruction* batch_norm) override;
   Status HandleBatchNormGrad(HloInstruction* batch_norm) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status FinishVisit(HloInstruction* root) override { return Status::OK(); }
 
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
index 87b6cd640acc41074c40e1d397b9334b76029fd5..fb040aff30d48bf5817946ce53d37bc6685941e4 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.cc
@@ -65,11 +65,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/buffer_assignment_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/dynamic_update_slice_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/fused_ir_emitter.h"
-#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/llvm_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/sort_util.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/tuple_ops.h"
@@ -88,6 +88,8 @@ limitations under the License.
 namespace xla {
 namespace gpu {
 
+using llvm_ir::KernelMappingScheme;
+
 namespace {
 
 using absl::InlinedVector;
@@ -1188,7 +1190,7 @@ Status IrEmitterUnnested::EmitColumnReduction(
       .EmitLoop(IrName(reduce), index_ty);
 }
 
-static std::pair<int64, int64> ComputeTilingSchemeForReduction(
+static std::pair<int64, int64> ComputeKernelMappingSchemeForReduction(
     int64 depth, int64 width, int64 kWarpSize) {
   constexpr int64 kTargetNumElementsPerThread = 64;
   int64 x_tile_size = kTargetNumElementsPerThread;
@@ -1322,7 +1324,7 @@ Status IrEmitterUnnested::EmitRowReduction(
   int64 x_tile_size;
   int64 z_tile_size;
   std::tie(x_tile_size, z_tile_size) =
-      ComputeTilingSchemeForReduction(depth, width, kWarpSize);
+      ComputeKernelMappingSchemeForReduction(depth, width, kWarpSize);
 
   // Round the width in tiles up to the nearest multiple of kWarpSize, so that
   // the use of shfl_down is valid.
@@ -2171,7 +2173,18 @@ Status IrEmitterUnnested::HandleSelect(HloInstruction* select) {
 Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
   std::vector<std::unique_ptr<Thunk>> thunks;
   Shape keys_shape = sort->operand(0)->shape();
+  int64 dimension_to_sort = sort->dimensions(0);
+  // In case there is a 'values' parameter that is a iota, we take note and use
+  // it later to ensure a stable sort. Otherwise, we don't guarantee a stable
+  // sort.
+  int64 iota_values_parameter_index = -1;
   for (int64 i = 0; i < sort->operand_count(); ++i) {
+    if (i > 0 && sort->operand(i)->opcode() == HloOpcode::kIota &&
+        ShapeUtil::ElementIsIntegral(sort->operand(i)->shape()) &&
+        Cast<HloIotaInstruction>(sort->operand(i))->iota_dimension() ==
+            dimension_to_sort) {
+      iota_values_parameter_index = i;
+    }
     ShapeIndex shape_index =
         sort->operand_count() > 1 ? ShapeIndex({i}) : ShapeIndex({});
     // We assume that the layout of all involved operands and outputs is the
@@ -2196,7 +2209,6 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
     }
   }
 
-  int64 dimension_to_sort = sort->dimensions(0);
   uint64 dimension_to_sort_bound = keys_shape.dimensions(dimension_to_sort);
   int64 num_stages = tensorflow::Log2Ceiling(dimension_to_sort_bound);
   CHECK_GE(1ULL << num_stages, dimension_to_sort_bound);
@@ -2298,8 +2310,9 @@ Status IrEmitterUnnested::HandleSort(HloInstruction* sort) {
       }
     }
     return llvm_ir::EmitSortInPlace(
-        dimension_to_sort, keys_array, values_arrays, IrName(sort), xor_masks,
-        &b_, launch_dimensions,
+        dimension_to_sort, keys_array, values_arrays,
+        iota_values_parameter_index, IrName(sort), xor_masks, &b_,
+        launch_dimensions,
         xor_masks.size() > 1 ? num_iterations_in_sort_dim
                              : standard_num_iterations_in_sort_dim,
         kTileSize);
@@ -2385,7 +2398,7 @@ Status IrEmitterUnnested::HandleCrossReplicaSum(HloInstruction* crs) {
   return Status::OK();
 }
 
-Status IrEmitterUnnested::HandleAfterAll(HloInstruction* gen_token) {
+Status IrEmitterUnnested::HandleAfterAll(HloInstruction* after_all) {
   return Status::OK();
 }
 
@@ -3103,8 +3116,18 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
             GetIndexTypeForKernel(&hlo, launch_dimensions.launch_bound(), &b_));
   }
 
-  // For multioutput fusion, we need to emit each operand and the root.
+  // Emit the tuple pointers in one thread.  We could do this at any point in
+  // the kernel, but we do it at the beginning in the hopes of reducing register
+  // pressure, since we touch threadIdx.x and blockIdx.x at the beginning of the
+  // kernel *anyway*.
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(hlo);
+  TF_RETURN_IF_ERROR(
+      KernelSupportLibrary(&b_).If("emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+        llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
+        return Status::OK();
+      }));
+
+  // For multioutput fusion, we need to emit each operand and the root.
   TF_RETURN_IF_ERROR(
       ParallelLoopEmitter(element_generator, output_arrays, launch_dimensions,
                           &b_, unroll_factor)
@@ -3113,8 +3136,6 @@ Status IrEmitterUnnested::EmitTargetElementLoopInThunk(
                         &hlo, launch_dimensions.launch_bound(), &b_)));
 
   b_.SetInsertPoint(b_.GetInsertBlock()->getTerminator());
-  llvm_ir::EmitTuple(GetIrArray(hlo, hlo), output_arrays, &b_, module_);
-
   return Status::OK();
 }
 
@@ -3146,31 +3167,6 @@ std::vector<IrArray> IrEmitterUnnested::ConstructIrArrayForInputs(
   return param_arrays;
 }
 
-int IrEmitterUnnested::ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-    const HloInstruction& hlo, const std::vector<IrArray>& output_arrays,
-    absl::Span<const int64> reduced_output_dims,
-    std::vector<Shape>* output_reduced_shapes,
-    std::vector<IrArray>* output_in_reduced_shape_arrays) {
-  int64 num_outputs = 1;
-  if (hlo.IsMultiOutputFusion()) {
-    num_outputs = ShapeUtil::TupleElementCount(hlo.shape());
-    output_in_reduced_shape_arrays->reserve(num_outputs);
-    output_reduced_shapes->reserve(num_outputs);
-    for (int64 i = 0; i < num_outputs; ++i) {
-      output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-          ShapeUtil::GetSubshape(hlo.shape(), {i}).element_type(),
-          reduced_output_dims));
-      output_in_reduced_shape_arrays->push_back(
-          output_arrays[i].CastToShape((*output_reduced_shapes)[i], &b_));
-    }
-  } else {
-    output_reduced_shapes->push_back(ShapeUtil::MakeShapeWithDescendingLayout(
-        hlo.shape().element_type(), reduced_output_dims));
-    output_in_reduced_shape_arrays->push_back(
-        output_arrays[0].CastToShape((*output_reduced_shapes)[0], &b_));
-  }
-  return num_outputs;
-}
 
 int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
     const HloInstruction& hlo, const std::vector<IrArray>& param_arrays,
@@ -3199,335 +3195,508 @@ int IrEmitterUnnested::ConstructInputReducedShapeAndCastInputIrArrayToShape(
 
 namespace {
 
-// Reads thread_idx.x and converts it to a (y,x) coordinate, assuming that the
-// thread lives within a square tile of size tile_size (so thread blocks are of
-// size tile_size * tile_size).
-std::tuple<llvm::Value*, llvm::Value*> CalculateYXCoordinateWithinTile(
-    llvm::IRBuilder<>* builder, llvm::Value* tile_size,
-    int64 threads_per_tile) {
-  // Calculate the starting element coordinate within a tile for the current
-  // thread, (y, x) from thread_id.
-  llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, threads_per_tile,
-                            llvm::cast<llvm::Instruction>(thread_id));
-  thread_id = builder->CreateIntCast(thread_id, tile_size->getType(),
-                                     /*isSigned=*/true, "thread.id.x");
-  auto x = builder->CreateURem(thread_id, tile_size);
-  auto y = builder->CreateUDiv(thread_id, tile_size);
-  return std::make_tuple(y, x);
-}
-
-// Reads block_idx.x, casts it to type index_ty, and adds the assumption that
-// it's in the range [0, num_blocks].
-llvm::Value* GetBlockIdx(llvm::IRBuilder<>* builder, llvm::Type* index_ty,
-                         int64 num_blocks) {
-  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
-      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, builder);
-  llvm_ir::AddRangeMetadata(0, num_blocks,
-                            llvm::cast<llvm::Instruction>(block_id));
-  return builder->CreateIntCast(block_id, index_ty, /*isSigned=*/true,
-                                "block.id.x");
-}
-
-// Emits code to process up to (tile_size/num_rows) elements in a tile, given
-// `emit_elem_function` is the function to emit code to process one element, `y`
-// and `x` are the coordinates for the first element to process, and `index` is
-// the index for the origin of the tile. Emits bounds check to ensure that each
-// processed element is within the boundary defined by `tile_width` and
-// `tile_height`.
+void EmitFullTile(const KernelMappingScheme* mapping_scheme,
+                  const IrArray::Index& tile_origin_index,
+                  llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
+                  llvm::Type* index_ty,
+                  const std::function<void(const IrArray::Index&, llvm::Value*,
+                                           llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
+  for (int64 i = 0; i < tile_size_y; i += num_threads_y) {
+    IrArray::Index source_idx_y =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, i),
+                                         KernelMappingScheme::DimY, builder);
+    llvm::Value* y_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, i), y);
+    for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+      IrArray::Index source_idx =
+          source_idx_y.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                      KernelMappingScheme::DimX, builder);
+      llvm::Value* x_loc =
+          builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+      emit_elem_function(source_idx, y_loc, x_loc);
+    }
+  }
+}
+
+void EmitPartialTile(
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    llvm::Type* index_ty,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 num_threads_x = mapping_scheme->GetNumberOfThreadsForDimensionX();
+  int64 num_threads_y = mapping_scheme->GetNumberOfThreadsForDimensionY();
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+
+  for (int64 j = 0; j < tile_size_x; j += num_threads_x) {
+    IrArray::Index source_idx =
+        tile_origin_index.AddOffsetToDim(llvm::ConstantInt::get(index_ty, j),
+                                         KernelMappingScheme::DimX, builder);
+    llvm::Value* x_loc =
+        builder->CreateAdd(llvm::ConstantInt::get(index_ty, j), x);
+
+    ksl->IfReturnVoid(
+        "x_in_tile", builder->CreateICmpULT(x_loc, tile_width), [&] {
+          // tile_height_bound =
+          //   ceil(tile_height / num_threads_y) * num_threads_y
+          llvm::Value* ceiling_of_ratio = builder->CreateUDiv(
+              builder->CreateAdd(tile_height, llvm::ConstantInt::get(
+                                                  index_ty, num_threads_y - 1)),
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          llvm::Value* tile_height_bound = builder->CreateMul(
+              ceiling_of_ratio,
+              llvm::ConstantInt::get(index_ty, num_threads_y));
+          ksl->ForReturnVoid(
+              loop_name, /*start=*/llvm::ConstantInt::get(index_ty, 0),
+              /*end=*/tile_height_bound,
+              /*step=*/llvm::ConstantInt::get(index_ty, num_threads_y),
+              [&](llvm::Value* y_indvar) {
+                llvm::Value* y_loc = builder->CreateAdd(y_indvar, y);
+                ksl->IfReturnVoid(
+                    "y_in_tile", builder->CreateICmpULT(y_loc, tile_height),
+                    [&] {
+                      emit_elem_function(
+                          source_idx.AddOffsetToDim(
+                              y_indvar, KernelMappingScheme::DimY, builder),
+                          y_loc, x_loc);
+                    });
+              });
+        });
+  }
+}
+
+// Emits code to process up to
+// (tile_size_x/num_threads_x * tile_size_y/num_threads_y) elements in a tile,
+// given `emit_elem_function` is the function to emit code to process one
+// element, `y` and `x` are the intra-tile coordinates for the first element
+// to process, and `index` is the index for the origin of the tile. Information
+// about tile_size_x/y and num_threads_x/y are stored in `mapping_scheme`. Emits
+// bounds check to ensure that each processed element is within the boundary
+// defined by `tile_width` and `tile_height`.
 void EmitTiledElementalCodeWithBoundsCheck(
-    int64 tile_size, int64 num_rows, const IrArray::Index& index,
-    const string& loop_name, KernelSupportLibrary* ksl,
-    llvm::IRBuilder<>* builder, llvm::Value* y, llvm::Value* x,
-    llvm::Value* tile_width, llvm::Value* tile_height,
-    const std::function<void(const IrArray::Index&, llvm::Value*)>&
-        emit_elem_function) {
+    const KernelMappingScheme* mapping_scheme,
+    const IrArray::Index& tile_origin_index, const string& loop_name,
+    KernelSupportLibrary* ksl, llvm::IRBuilder<>* builder, llvm::Value* y,
+    llvm::Value* x, llvm::Value* tile_height, llvm::Value* tile_width,
+    const std::function<void(const IrArray::Index&, llvm::Value*,
+                             llvm::Value*)>& emit_elem_function) {
+  int64 tile_size_x = mapping_scheme->GetTileSizeForDimensionX();
+  int64 tile_size_y = mapping_scheme->GetTileSizeForDimensionY();
   llvm::Type* index_ty = tile_width->getType();
-  // Emits a constant value with index type.
-  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
-    return llvm::ConstantInt::get(index_ty, c);
-  };
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = builder->CreateAdd(index[dim], addend);
-    return index;
-  };
 
-  auto emit_full_tile = [&] {
-    for (int64 i = 0; i < tile_size; i += num_rows) {
-      auto source_idx = offset_dim(index, index_typed_constant(i), /*dim=*/1);
-      auto y_loc = builder->CreateAdd(index_typed_constant(i), y);
-      emit_elem_function(source_idx, y_loc);
-    }
-  };
-
-  auto emit_last_row = [&] {
-    ksl->IfReturnVoid("x_in_tile", builder->CreateICmpULT(x, tile_width), [&] {
-      // tile_height_upper_bound =
-      //   ceil(tile_height / num_rows) * num_rows
-      auto tile_height_upper_bound = builder->CreateMul(
-          builder->CreateUDiv(
-              builder->CreateAdd(tile_height,
-                                 index_typed_constant(num_rows - 1)),
-              index_typed_constant(num_rows)),
-          index_typed_constant(num_rows));
-      ksl->ForReturnVoid(
-          loop_name, /*start=*/index_typed_constant(0),
-          /*end=*/tile_height_upper_bound,
-          /*step=*/index_typed_constant(num_rows), [&](llvm::Value* y_indvar) {
-            auto y_loc = builder->CreateAdd(y_indvar, y);
-            ksl->IfReturnVoid(
-                "y_in_tile", builder->CreateICmpULT(y_loc, tile_height), [&] {
-                  emit_elem_function(offset_dim(index, y_indvar, /*dim=*/1),
-                                     y_loc);
-                });
-          });
-    });
-  };
   ksl->IfReturnVoid(
       "full_tile",
       builder->CreateAnd(
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_width),
-          builder->CreateICmpEQ(index_typed_constant(tile_size), tile_height)),
-      emit_full_tile, emit_last_row);
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_x),
+                                tile_width),
+          builder->CreateICmpEQ(llvm::ConstantInt::get(index_ty, tile_size_y),
+                                tile_height)),
+      [&] {
+        EmitFullTile(mapping_scheme, tile_origin_index, builder, y, x, index_ty,
+                     emit_elem_function);
+      },
+      [&] {
+        EmitPartialTile(mapping_scheme, tile_origin_index, loop_name, ksl,
+                        builder, y, x, tile_height, tile_width, index_ty,
+                        emit_elem_function);
+      });
 }
 }  // namespace
 
-// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
-// algorithm to improve the memory access patterns for the input parameters
-// which have a shape that is a 0-2-1 transpose of the output tensors.
+// Emits code to process a tensor element in a tile for the given kCopy HLO that
+// performs a 0-2-1 transpose.
 //
-// For the purpose of tiling, the output tensors have a logical shape of three
-// components 0-2-1 while the relevant input parameters have a logical shape of
-// three components 0-1-2 in the order major to minor. The x- and y- dimensions
-// of the tensors are tiled in square tiles of edge length `kTileSize`. Each
-// thread block of `kTileSize` x `kNumRows` threads transposes one tile: each
-// thread copies kTileSize/kNumRows elements from the input to a shared memory
-// tile, then the otherwise "regular hlo kernel" reads from the shared memory
-// instead of the original input.
+// index: The index for the first output element in the normalized tensor. The
+//   normalized tensor is the resulting tensor after collapsing contiguous
+//   dimensions that play the same role in the transpose.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+// kernel_info: Other information to support the kernel code generation.
+void IrEmitterUnnested::EmitTileElementForCopy(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
+  // TODO(jlebar): Add AA metadata to this load.
+  llvm::Instruction* load_from_shmem_buffer =
+      Load(GEP(tiled_param_info->GetBufferForParameter(0),
+               {b_.getInt64(0), x_loc, y_loc}),
+           "output_element");
+  llvm_ir::IrArray output_array = GetIrArray(*hlo, *hlo);
+  Shape output_reduced_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      hlo->shape().element_type(),
+      kernel_info->GetKernelMappingScheme()->GetDimensionsInElements());
+  // When the output_reduced_shape is a 0-2-1 transpose of the input shape,
+  // the 0-2-1 transpose is achieved through EmitWriteArrayElement.
+  output_array.CastToShape(output_reduced_shape, &b_)
+      .EmitWriteArrayElement(index, load_from_shmem_buffer, &b_);
+}
+
+// Emits code to process a tensor element in a tile for the given kLoop fusion
+// HLO containing parameters that are 0-2-1 transpose of its outputs.
 //
-// This is similar to the following CUDA algorithm in TensorFlow:
-// https://goo.gl/MStRV6.
-//
-// `kTileSize` should usually be same as warp size. We currently choose 32 for
-// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
-//
-// TODO(b/33320379): Here each block transposes 1 tile. It may be more efficient
-// to launch fewer blocks so each transposes many tiles.
-LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
-    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
-    absl::Span<const int64> tiled_param_ids) {
-  // Parameters for the tiling algorithm.
-  constexpr int64 kTileSize = 32;
-  constexpr int64 kNumRows = 4;
-  constexpr int64 kThreadsPerTile = kTileSize * kNumRows;
-
-  // Construct IrArrays for the inputs and outputs.
+// index: The index for the first output element in the normalized tensor, that
+//   is the resulting tensor after collapsing contiguous dimensions that play
+//   the same role in the transpose.
+// kernel_info: Other information to support the kernel code generation.
+// y_loc: The y coordinate within a tile.
+// x_loc: The x coordinate within a tile.
+void IrEmitterUnnested::EmitTileElementForFusion(
+    HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+    const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+    llvm::Value* x_loc) {
+  llvm_ir::TiledParameterInfo* tiled_param_info =
+      kernel_info->GetTiledParameterInfo();
   std::vector<IrArray> output_arrays = ConstructIrArrayForOutputs(*hlo);
-  int64 num_outputs = output_arrays.size();
-  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*hlo);
-  int64 num_params = param_arrays.size();
+  GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
+                                     GetNestedComputer());
+  FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
+                               &elem_emitter);
+  tiled_param_info->set_y(y_loc);
+  tiled_param_info->set_x(x_loc);
+  fused_emitter.SetTiledParameterInfo(tiled_param_info);
+  TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
+  IrArray::Index untiled_index =
+      kernel_info->GetKernelMappingScheme()->GetUnnormalizedIndex(
+          index, output_arrays[0].GetShape());
+  const llvm_ir::ElementGenerator& output_generator =
+      fused_emitter.GetRootGenerator();
+  llvm::Value* output_value = output_generator(untiled_index).ValueOrDie();
+  if (hlo->IsMultiOutputFusion()) {
+    DCHECK(output_value->getType()->isStructTy());
+    DCHECK_EQ(output_value->getType()->getStructNumElements(),
+              output_arrays.size());
+    for (int64 i = 0; i < output_arrays.size(); ++i) {
+      output_arrays[i].EmitWriteArrayElement(
+          untiled_index, ExtractValue(output_value, i), &b_);
+    }
+  } else {
+    output_arrays[0].EmitWriteArrayElement(untiled_index, output_value, &b_);
+  }
+}
+
+// Emits a block of tiles, given a function object to emit one tile.
+void IrEmitterUnnested::EmitBlock(const TileGenerator& emit_one_tile,
+                                  const KernelCodegenInfo* kernel_info,
+                                  KernelSupportLibrary& ksl,
+                                  llvm::Type* index_ty) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+  absl::Span<const int64> dims_in_tile = mapping_scheme->GetDimensionsInTiles();
+  absl::Span<const int64> dims_in_block =
+      mapping_scheme->GetDimensionsInBlocks();
+  absl::Span<const int64> block_sizes = mapping_scheme->GetBlockSizes();
+  auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
+    return llvm::ConstantInt::get(index_ty, c);
+  };
+
+  // Emit all the tiles for a given dimension in a tile block.
+  auto emit_tiles_for_block_dim =
+      [&](const string& loop_name, const IrArray::Index& starting_tile,
+          int dim_id,
+          const std::function<void(const IrArray::Index& tile_index)>
+              emit_next_block_dim) {
+        if (block_sizes[dim_id] == 1) {
+          emit_next_block_dim(starting_tile);
+        } else {
+          llvm::Value* starting_tile_index_for_dim = starting_tile[dim_id];
+          llvm::Value* block_size_for_dim =
+              index_typed_constant(block_sizes[dim_id]);
+          llvm::Value* block_id_for_dim =
+              b_.CreateUDiv(starting_tile_index_for_dim, block_size_for_dim);
+          llvm::Value* last_block_for_dim =
+              index_typed_constant(dims_in_block[dim_id] - 1);
+          llvm::Value* last_block_size_for_dim = index_typed_constant(
+              dims_in_tile[dim_id] -
+              (dims_in_block[dim_id] - 1) * block_sizes[dim_id]);
+          llvm::Value* num_tiles_in_block =
+              Select(ICmpEQ(last_block_for_dim, block_id_for_dim),
+                     last_block_size_for_dim, block_size_for_dim);
+
+          ksl.ForReturnVoid(
+              loop_name,
+              /*start=*/index_typed_constant(0),
+              /*end=*/num_tiles_in_block,
+              /*step=*/1, [&](llvm::Value* block_dim_induction_var) {
+                IrArray::Index tile_index = starting_tile.AddOffsetToDim(
+                    block_dim_induction_var, dim_id, &b_);
+                emit_next_block_dim(tile_index);
+              });
+        }
+      };
+
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  const bool block_contains_multi_tiles =
+      mapping_scheme->GetNumberOfTilesInOneBlock() > 1;
+
+  // Emit the tile with a given tile_index, by calculating the tight bounds for
+  // each dimension of the tile and then calling emit_one_tile.
+  auto emit_one_tile_for_tile_index = [&](const IrArray::Index& tile_index) {
+    std::vector<llvm::Value*> output_tile_bounds(3);
+    for (int i = KernelMappingScheme::DimY; i < KernelMappingScheme::DimTot;
+         ++i) {
+      int64 tile_size_for_dim = mapping_scheme->GetTileSizeForDimension(i);
+      // Only last row or column may not have full size.
+      llvm::Value* is_last_row =
+          ICmpEQ(tile_index[i], index_typed_constant(dims_in_tile[i] - 1));
+      int64 partial_row_size =
+          reduced_dims[i] - (dims_in_tile[i] - 1) * tile_size_for_dim;
+      output_tile_bounds[i] =
+          Select(is_last_row, index_typed_constant(partial_row_size),
+                 index_typed_constant(tile_size_for_dim), "tile_bound");
+    }
+
+    IrArray::Index tile_origin =
+        mapping_scheme->GetElementIndexForTileOrigin(tile_index);
+    emit_one_tile(tile_origin, output_tile_bounds, block_contains_multi_tiles);
+  };
 
+  const IrArray::Index starting_block =
+      mapping_scheme->EmitBlockIndex(index_ty);
+  const IrArray::Index starting_tile_for_dim_z =
+      mapping_scheme->GetTileIndexForBlockOrigin(starting_block);
+
+  // Emit the three dimensional block of tiles.
+  emit_tiles_for_block_dim(
+      "block_dim_z", starting_tile_for_dim_z, KernelMappingScheme::DimZ,
+      [&](const IrArray::Index& starting_tile_for_dim_y) {
+        emit_tiles_for_block_dim(
+            "block_dim_y", starting_tile_for_dim_y, KernelMappingScheme::DimY,
+            [&](const IrArray::Index& starting_tile_for_dim_x) {
+              emit_tiles_for_block_dim("block_dim_x", starting_tile_for_dim_x,
+                                       KernelMappingScheme::DimX,
+                                       emit_one_tile_for_tile_index);
+            });
+      });
+}
+
+// Emits a kernel for the hlo instruction using the given kernel mapping scheme.
+//
+// unnested_hlo: The unnested hlo instruction for which the kernel is generated.
+//   Currently, these hlo instructions are supported: kLoop fusion, kCopy.
+// tiled_param_ids: The IDs for the parameters that are 0-2-1 transpose of
+//   other tensors with the same dimensions and need to be tiled and tranposed.
+// mapping_scheme: The tiling scheme to use.
+// kernel_generator: Contains function objects for code generation, such as
+//   element generator, block prologue and epilogue generators.
+// kernel_info: Represent other information to support the code generation
+//   of the tiled kernel for the hlo.
+LaunchDimensions IrEmitterUnnested::EmitKernel(
+    HloInstruction* unnested_hlo, absl::Span<const int64> tiled_param_ids,
+    const KernelCodeGenerator& kernel_generator,
+    KernelCodegenInfo* kernel_info) {
+  KernelMappingScheme* mapping_scheme = kernel_info->GetKernelMappingScheme();
+
+  std::vector<IrArray> param_arrays = ConstructIrArrayForInputs(*unnested_hlo);
+  int64 num_params = param_arrays.size();
   // Allocate shared memory buffers to store the tiled inputs.
   std::vector<llvm::Value*> param_shmem_buffers(num_params, nullptr);
   for (int64 id : tiled_param_ids) {
-    const HloInstruction* param = hlo->operand(id);
-    // Add 1 to the minor dimension to reduce shared memory bank conflicts.
-    llvm::Type* tile_type = llvm::ArrayType::get(
-        llvm::ArrayType::get(llvm_ir::PrimitiveTypeToIrType(
-                                 param->shape().element_type(), module_),
-                             kTileSize + 1),
-        kTileSize);
-    auto* tile_base_ptr = llvm_ir::AllocateSharedMemoryTile(
-        b_.GetInsertBlock()->getParent()->getParent(), tile_type,
-        IrName(hlo, StrCat("tile", id)));
-    param_shmem_buffers[id] = tile_base_ptr;
+    const HloInstruction* param = unnested_hlo->operand(id);
+    param_shmem_buffers[id] =
+        mapping_scheme->GetSharedMemoryBufferForElementType(
+            llvm_ir::PrimitiveTypeToIrType(param->shape().element_type(),
+                                           module_),
+            IrName(unnested_hlo, StrCat("tile", id)));
     VLOG(3) << "Added shmem buffer for parameter " << id << ": "
-            << llvm_ir::DumpToString(*tile_base_ptr);
+            << llvm_ir::DumpToString(*param_shmem_buffers[id]);
   }
 
-  // The 0-2-1 shape of the tiling scheme is the reduced shape of the HLO result
-  // for the purpose of tiling. Calculate the logical output dimensions in the
-  // tile from the reduced output dimensions.
-  std::vector<int64> output_dims_in_tiles = std::vector<int64>(
-      reduced_output_dims.begin(), reduced_output_dims.end());
-  CHECK_EQ(output_dims_in_tiles.size(), 3);
-  for (int i = 1; i < 3; ++i) {
-    output_dims_in_tiles[i] =
-        CeilOfRatio<int64>(output_dims_in_tiles[i], kTileSize);
-  }
-  const int64 num_tiles =
-      absl::c_accumulate(output_dims_in_tiles, 1, std::multiplies<int64>());
-  LaunchDimensions launch_dimensions(num_tiles, kThreadsPerTile);
-
-  llvm::Type* index_ty =
-      GetIndexTypeForKernel(hlo, launch_dimensions.launch_bound(), &b_);
+  CHECK_EQ(mapping_scheme->GetThreadsPerTile() % kWarpSize, 0);
+  LaunchDimensions launch_dimensions = LaunchDimensions(
+      mapping_scheme->GetNumberOfBlocks(), mapping_scheme->GetThreadsPerTile());
+  llvm::Type* index_ty = GetIndexTypeForKernel(
+      unnested_hlo, launch_dimensions.launch_bound(), &b_);
   auto index_typed_constant = [&](uint64 c) -> llvm::Constant* {
     return llvm::ConstantInt::get(index_ty, c);
   };
 
-  // Cast each output IrArray to its corresponding reduced shape and keep the
-  // reduced shape live during IR emission.
-  std::vector<IrArray> output_in_reduced_shape_arrays;
-  std::vector<Shape> output_reduced_shapes;
-  CHECK_EQ(ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-               *hlo, output_arrays, reduced_output_dims, &output_reduced_shapes,
-               &output_in_reduced_shape_arrays),
-           num_outputs);
+  // For multioutput fusion, one thread needs to output a tuple with pointers to
+  // all the individual outputs.  We could do this at any point in the kernel,
+  // but we do it at the beginning in the hopes of reducing register pressure,
+  // since we touch threadIdx.x and blockIdx.x at the beginning of the kernel
+  // *anyway*.
+  if (unnested_hlo->IsMultiOutputFusion()) {
+    TF_CHECK_OK(KernelSupportLibrary(&b_).If(
+        "emit_mof_tuple", IsBlock0Thread0(&b_), [&] {
+          llvm_ir::EmitTuple(GetIrArray(*unnested_hlo, *unnested_hlo),
+                             ConstructIrArrayForOutputs(*unnested_hlo), &b_,
+                             module_);
+          return Status::OK();
+        }));
+  }
 
   // For each tiled parameter, cast its input IrArray to the corresponding
   // reduced shape and keep the reduced shape live during IR emission.
   std::vector<IrArray> param_in_reduced_shape_arrays;
   std::vector<Shape> param_reduced_shapes;
-  CHECK_EQ(ConstructInputReducedShapeAndCastInputIrArrayToShape(
-               *hlo, param_arrays, param_shmem_buffers, reduced_output_dims,
-               &param_reduced_shapes, &param_in_reduced_shape_arrays),
-           num_params);
+  absl::Span<const int64> reduced_dims =
+      mapping_scheme->GetDimensionsInElements();
+  int num_shapes = ConstructInputReducedShapeAndCastInputIrArrayToShape(
+      *unnested_hlo, param_arrays, param_shmem_buffers, reduced_dims,
+      &param_reduced_shapes, &param_in_reduced_shape_arrays);
+  DCHECK_EQ(num_shapes, num_params);
 
   // Calculate the starting element coordinate within a tile for the current
   // thread, (y, x) from thread_id.
   llvm::Value* x;
   llvm::Value* y;
-  std::tie(y, x) = CalculateYXCoordinateWithinTile(
-      &b_, index_typed_constant(kTileSize), kThreadsPerTile);
-
-  // Calculate the index for the current output tile from block_id.
-  const IrArray::Index output_tile_index(
-      GetBlockIdx(&b_, index_ty, num_tiles),
-      ShapeUtil::MakeShapeWithDescendingLayout(PRED /*arbitrary*/,
-                                               output_dims_in_tiles),
-      &b_);
-
-  // Output tile origin is the index for the first element of the current output
-  // tile.
-  const IrArray::Index output_tile_origin = [&] {
-    IrArray::Index index = output_tile_index;
-    for (int i = 1; i < 3; ++i) {
-      index[i] = Mul(output_tile_index[i], index_typed_constant(kTileSize),
-                     "tile_origin." + std::to_string(i));
-    }
-    return index;
-  }();
-
-  // Calculate the input tile origin from the output tile origin.
-  const IrArray::Index input_tile_origin(
-      Permute({0, 2, 1}, output_tile_origin.multidim()));
+  std::tie(y, x) = mapping_scheme->EmitThreadYXCoordinate(index_ty);
 
-  // Calculate the current output tile bounds in each of the logical dimensions.
-  std::vector<llvm::Value*> output_tile_bounds(3);
-  for (int i = 1; i < 3; ++i) {
-    // Only last row or column may not have full size.
-    output_tile_bounds[i] =
-        Select(ICmpEQ(output_tile_index[i],
-                      index_typed_constant(output_dims_in_tiles[i] - 1)),
-               index_typed_constant(reduced_output_dims[i] -
-                                    (output_dims_in_tiles[i] - 1) * kTileSize),
-               index_typed_constant(kTileSize), "kTileSize");
-  }
+  kernel_info->SetLaneId(
+      mapping_scheme->GetNumberOfThreadsForDimensionX() == kWarpSize ? x
+                                                                     : nullptr);
 
   KernelSupportLibrary ksl(&b_, llvm_ir::UnrollMode::kDefaultUnroll);
-
   // Curry a few parameters to EmitTiledElementalCodeWithBoundsCheck.
   auto emit_tiled_elemental_code_with_bounds_check =
       [&](const IrArray::Index& index, const string& loop_name,
-          llvm::Value* tile_width, llvm::Value* tile_height,
-          const std::function<void(const IrArray::Index&, llvm::Value*)>&
-              emit_elem_function) {
-        EmitTiledElementalCodeWithBoundsCheck(
-            kTileSize, kNumRows, index, loop_name, &ksl, &b_, y, x, tile_width,
-            tile_height, emit_elem_function);
+          llvm::Value* tile_height, llvm::Value* tile_width,
+          const std::function<void(const IrArray::Index&, llvm::Value*,
+                                   llvm::Value*)>& emit_elem_function) {
+        EmitTiledElementalCodeWithBoundsCheck(mapping_scheme, index, loop_name,
+                                              &ksl, &b_, y, x, tile_height,
+                                              tile_width, emit_elem_function);
       };
 
-  // Adds `addend` to the given `dim` of `index`.
-  auto offset_dim = [&](IrArray::Index index, llvm::Value* addend, int64 dim) {
-    index[dim] = Add(index[dim], addend);
-    return index;
-  };
-  const IrArray::Index input_index =
-      offset_dim(offset_dim(input_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
-
-  // Copy input parameter values to shared memory buffers:
-  // tile[y, x] = input[index]
-  emit_tiled_elemental_code_with_bounds_check(
-      input_index, "input", output_tile_bounds[1], output_tile_bounds[2],
-      [&](const IrArray::Index& index, llvm::Value* y_loc) {
-        for (int64 id : tiled_param_ids) {
-          IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
-          llvm::Value* shmem_buffer = param_shmem_buffers[id];
-          // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
-          // global variables, so LLVM can't infer much about it.
-          Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
-                                                            "input_element"),
-                GEP(shmem_buffer, {index_typed_constant(0), y_loc, x}));
-        }
-      });
+  auto emit_one_tile = [&](const IrArray::Index& output_tile_origin,
+                           absl::Span<llvm::Value* const> output_tile_bounds,
+                           bool block_contains_multi_tiles) {
+    // Calculate the input tile origin from the output tile origin.
+    const IrArray::Index input_tile_origin(
+        Permute({0, 2, 1}, output_tile_origin.multidim()));
+
+    const IrArray::Index input_index =
+        input_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
+
+    // Copy input parameter values to shared memory buffers:
+    // tile[y, x] = input[index]
+    // Note that tile_width and tile_height are flipped here because we are
+    // reading a transposed tile.
+    emit_tiled_elemental_code_with_bounds_check(
+        input_index, "input", output_tile_bounds[2], output_tile_bounds[1],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          for (int64 id : tiled_param_ids) {
+            IrArray& input_in_logical_shape = param_in_reduced_shape_arrays[id];
+            llvm::Value* shmem_buffer = param_shmem_buffers[id];
+            // TODO(jlebar): Add AA metadata to this store.  Tile buffers are
+            // global variables, so LLVM can't infer much about it.
+            Store(input_in_logical_shape.EmitReadArrayElement(index, &b_,
+                                                              "input_element"),
+                  GEP(shmem_buffer, {index_typed_constant(0), y_loc, x_loc}));
+          }
+        });
 
-  // Wait for all threads to reach this point, lest we copy a value from tile to
-  // output before the other thread copies it from input to tile.
-  // This is `__syncthreads` in CUDA.
-  llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    // If shared memory transpose is needed, wait for all threads to reach this
+    // point, lest we copy a value from tile to output before the other thread
+    // copies it from input to tile. This is `__syncthreads` in CUDA.
+    if (!tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
 
-  llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    llvm_ir::TiledParameterInfo tiled_param_info(param_shmem_buffers, y, x);
+    kernel_info->SetTiledParamInfo(&tiled_param_info);
 
-  const IrArray::Index output_index =
-      offset_dim(offset_dim(output_tile_origin, x, /*dim=*/2), y, /*dim=*/1);
+    const IrArray::Index output_index =
+        output_tile_origin.AddOffsetToDim(x, KernelMappingScheme::DimX, &b_)
+            .AddOffsetToDim(y, KernelMappingScheme::DimY, &b_);
 
-  // Write to output[index] by emitting code like normal, except that values for
-  // the tiled parameters are read from the shmem buffers.
-  if (hlo->opcode() == HloOpcode::kCopy) {
-    emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          // TODO(jlebar): Add AA metadata to this load.
-          llvm::Instruction* load_from_shmem_buffer =
-              Load(GEP(param_shmem_buffers[0], {b_.getInt64(0), x, y_loc}),
-                   "output_element");
-          output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-              index, load_from_shmem_buffer, &b_);
-        });
-  } else {
-    CHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
+    // Write to output[index] by emitting code like normal, except that values
+    // for the tiled parameters are read from the shmem buffers.
     emit_tiled_elemental_code_with_bounds_check(
-        output_index, "output", output_tile_bounds[2], output_tile_bounds[1],
-        [&](const IrArray::Index& index, llvm::Value* y_loc) {
-          GpuElementalIrEmitter elem_emitter(hlo_module_config_, module_, &b_,
-                                             GetNestedComputer());
-          FusedIrEmitter fused_emitter(GetGeneratorForOperandIrArrays(hlo),
-                                       &elem_emitter);
-          tiled_param_info.set_y(y_loc);
-          fused_emitter.SetTiledParameterInfo(&tiled_param_info);
-          TF_CHECK_OK(hlo->fused_expression_root()->Accept(&fused_emitter));
-          IrArray::Index untiled_index = llvm_ir::GetUnreducedOutputIndex(
-              index, output_reduced_shapes[0], output_arrays[0].GetShape(),
-              &b_);
-          const llvm_ir::ElementGenerator& output_generator =
-              fused_emitter.GetRootGenerator();
-          llvm::Value* output_value =
-              output_generator(untiled_index).ValueOrDie();
-          if (hlo->IsMultiOutputFusion()) {
-            CHECK(output_value->getType()->isStructTy());
-            CHECK_EQ(output_value->getType()->getStructNumElements(),
-                     output_in_reduced_shape_arrays.size());
-            for (int64 i = 0; i < output_in_reduced_shape_arrays.size(); ++i) {
-              output_in_reduced_shape_arrays[i].EmitWriteArrayElement(
-                  index, ExtractValue(output_value, i), &b_);
-            }
-          } else {
-            output_in_reduced_shape_arrays[0].EmitWriteArrayElement(
-                index, output_value, &b_);
-          }
+        output_index, "output", output_tile_bounds[1], output_tile_bounds[2],
+        [&](const IrArray::Index& index, llvm::Value* y_loc,
+            llvm::Value* x_loc) {
+          kernel_generator.GetTileElementGenerator()(unnested_hlo, index,
+                                                     kernel_info, y_loc, x_loc);
         });
+    // If a tile block contains multiple tiles and shared memory buffers are
+    // used, we need to wait for all threads to finish using the shared memory
+    // buffer for the current tile before we move on to process the next tile
+    // and overwrite the shared memory buffers.
+    if (block_contains_multi_tiles && !tiled_param_ids.empty()) {
+      llvm_ir::EmitCallToIntrinsic(llvm::Intrinsic::nvvm_barrier0, {}, {}, &b_);
+    }
+  };
+
+  const BlockPrologueGenerator& block_prologue_generator =
+      kernel_generator.GetBlockPrologueGenerator();
+  if (block_prologue_generator) {
+    block_prologue_generator(unnested_hlo, kernel_info);
   }
 
-  // For multioutput fusion, emit a tuple with all the individual outputs.
-  if (hlo->IsMultiOutputFusion()) {
-    llvm_ir::EmitTuple(GetIrArray(*hlo, *hlo), output_arrays, &b_, module_);
+  EmitBlock(std::move(emit_one_tile), kernel_info, ksl, index_ty);
+
+  const BlockEpilogueGenerator& block_epilogue_generator =
+      kernel_generator.GetBlockEpilogueGenerator();
+  if (block_epilogue_generator) {
+    block_epilogue_generator(unnested_hlo, kernel_info);
   }
 
   return launch_dimensions;
 }
 
+// Emits a kernel for the given hlo instruction using a tiled 0-2-1 transpose
+// algorithm to improve the memory access patterns for the input parameters
+// with a shape that is a 0-2-1 transpose of the output tensor shape.
+//
+// For the purpose of tiling, the output tensors have a logical shape of three
+// components 0-2-1 while the relevant input parameters have a logical shape
+// of three components 0-1-2 in the order major to minor. The x- and y-
+// dimensions of the tensors are tiled in square tiles with an edge length
+// `kTileSize`. Each thread block of `kTileSize` x `kNumRows` threads
+// transposes one tile: each thread copies kTileSize/kNumRows elements from
+// the input to a shared memory tile, then the otherwise "regular HLO kernel"
+// reads from the shared memory instead of the original input.
+//
+// This is similar to the following CUDA algorithm in TensorFlow:
+// https://goo.gl/MStRV6.
+//
+// `kTileSize` should usually be same as warp size. We currently choose 32 for
+// `kTileSize` and 4 for `kNumRows`. The CUDA algorithm uses 8 for `kNumRows`.
+//
+// TODO(b/33320379): Here each block transposes 1 tile. It may be more
+// efficient to launch fewer blocks so each transposes many tiles.
+LaunchDimensions IrEmitterUnnested::EmitHlo021Tile(
+    HloInstruction* hlo, absl::Span<const int64> reduced_output_dims,
+    absl::Span<const int64> tiled_param_ids) {
+  constexpr int kNumRows = 4;
+  KernelMappingScheme mapping_scheme(
+      reduced_output_dims, /*tile_size_y=*/kWarpSize,
+      /*tile_size_x=*/kWarpSize, /*req_block_sizes=*/{1, 1, 1},
+      /*num_threads_y=*/kNumRows,
+      /*num_threads_x=*/kWarpSize, &b_);
+  TileElementGenerator element_generator;
+  if (hlo->opcode() == HloOpcode::kCopy) {
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForCopy(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  } else {
+    DCHECK_EQ(hlo->opcode(), HloOpcode::kFusion);
+    element_generator = [&](HloInstruction* hlo,
+                            const llvm_ir::IrArray::Index& index,
+                            const KernelCodegenInfo* kernel_info,
+                            llvm::Value* y_loc, llvm::Value* x_loc) {
+      EmitTileElementForFusion(hlo, index, kernel_info, y_loc, x_loc);
+    };
+  }
+  KernelCodegenInfo kernel_info(&mapping_scheme);
+  KernelCodeGenerator kernel_generator(std::move(element_generator));
+  return EmitKernel(hlo, tiled_param_ids, kernel_generator, &kernel_info);
+}
+
 namespace {
 // Returns true to indicate it is safe to use the tile based shared memory
 // transpose implementation to implement the kernel for the instruction.
@@ -3562,8 +3731,8 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
                                   ? ShapeUtil::GetSubshape(hlo->shape(), {0})
                                   : hlo->shape();
 
-  // If the output_shape is reduced to 021 shape, find all the parameters of the
-  // hlo that are in the corresponding 012 shape.
+  // If the output_shape is reduced to 021 shape, find all the parameters of
+  // the HLO that are in the corresponding 012 shape.
   std::vector<int64> params_012;
   optional<std::vector<int64>> reduced_dims_021;
   for (int64 operand_idx = 0; operand_idx < hlo->operand_count();
@@ -3600,9 +3769,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   }
 
   // Each of our shared memory tiles has 32*33 elements (so ~4kb, if the
-  // elements are of size 4 bytes), and CUDA has an architectural limit of 48kb
-  // shared memory per SM.  (This is increased to 96kb in Volta, but we don't
-  // use this, in part because it eats into our L1 cache space.)
+  // elements are of size 4 bytes), and CUDA has an architectural limit of
+  // 48kb shared memory per SM.  (This is increased to 96kb in Volta, but we
+  // don't use this, in part because it eats into our L1 cache space.)
   //
   // For correctness we need to ensure that we don't make more than 48kb worth
   // of shmem tiles per block.  And for performance, we'd probably like to use
@@ -3610,9 +3779,9 @@ bool IrEmitterUnnested::CheckAndEmitHloWithTile021(HloInstruction* hlo) {
   // gpu core.
   //
   // We say without benchmarks that we want at least 3 threads/block,
-  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We choose
-  // which params get the shmem transpose treatment arbitrarily; it's not clear
-  // if there's a Right Choice.
+  // corresponding to 3 shmem tiles if the elements are 32 bits wide.  We
+  // choose which params get the shmem transpose treatment arbitrarily; it's
+  // not clear if there's a Right Choice.
   //
   // This is only sound if tiled transposes are the only place where we use
   // shared memory in fusions.  If in the future other fusible ops use shared
@@ -3666,10 +3835,10 @@ Status IrEmitterUnnested::EmitConstantGlobals() {
     }
 
     // These globals will be looked up by name by GpuExecutable so we need to
-    // give them an external linkage.  Not all of their uses are visible in the
-    // LLVM IR (e.g. TupleThunk) so we can't give then a linkage that merely
-    // preserves their names (like available_externally), we also need to ensure
-    // that they stick around even if they're "unused".
+    // give them an external linkage.  Not all of their uses are visible in
+    // the LLVM IR (e.g. TupleThunk) so we can't give then a linkage that
+    // merely preserves their names (like available_externally), we also need
+    // to ensure that they stick around even if they're "unused".
     //
     // We may have to be more more clever here in the future if we notice that
     // we're keeping around too many globals because of their linkage.
diff --git a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
index 334c0b3c20b0888fa9b167a8979221f0184a82e7..e09ed657a812be6ab4859a0e365a51c45a37bfed 100644
--- a/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
+++ b/tensorflow/compiler/xla/service/gpu/ir_emitter_unnested.h
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/gpu/ir_emitter.h"
 #include "tensorflow/compiler/xla/service/gpu/sequential_thunk.h"
 #include "tensorflow/compiler/xla/service/gpu/thunk.h"
+#include "tensorflow/compiler/xla/service/llvm_ir/kernel_support_library.h"
 #include "tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h"
 
 namespace xla {
@@ -47,6 +48,94 @@ namespace gpu {
 //
 class IrEmitterUnnested : public IrEmitter {
  public:
+  // Parameter block_contains_multi_tiles indicates whether a tile block
+  // consists of multiple tiles or not. If the tile block contains only one
+  // tile, there is no need to use atomic operation to accumulate a local result
+  // to a global result to implement reduction.
+  using TileGenerator =
+      std::function<void(const llvm_ir::IrArray::Index& output_tile_origin,
+                         absl::Span<llvm::Value* const> output_tile_bounds,
+                         bool block_contains_multi_tiles)>;
+  // KernelCodegenInfo records the common information to support the code
+  // generation for a kernel to process tensor elements by blocks. A block of
+  // tensor elements may contain one or multiple tiles. The code generators that
+  // generate code for tile elements or block prologue/epilogue refer to this
+  // class in their prototypes. If the implementations of such code generators
+  // require other information that are specific to the HLO instructions, the
+  // implementations need to define and use derived classes of this class.
+  class KernelCodegenInfo {
+   public:
+    explicit KernelCodegenInfo(llvm_ir::KernelMappingScheme* mapping_scheme)
+        : mapping_scheme_(mapping_scheme),
+          tiled_param_info_(nullptr),
+          lane_id_(nullptr) {}
+
+    void SetLaneId(llvm::Value* v) { lane_id_ = v; }
+    void SetTiledParamInfo(llvm_ir::TiledParameterInfo* tiled_param_info) {
+      CHECK_EQ(tiled_param_info_, nullptr);
+      tiled_param_info_ = tiled_param_info;
+    }
+
+    llvm::Value* GetLaneId() const { return lane_id_; }
+    llvm_ir::KernelMappingScheme* GetKernelMappingScheme() const {
+      return mapping_scheme_;
+    }
+    llvm_ir::TiledParameterInfo* GetTiledParameterInfo() const {
+      return tiled_param_info_;
+    }
+
+   private:
+    llvm_ir::KernelMappingScheme* mapping_scheme_;
+    llvm_ir::TiledParameterInfo* tiled_param_info_;
+    llvm::Value* lane_id_;
+  };
+
+  // A function object to prepare for the code generation for a tile block.
+  using BlockPrologueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to finalize the code generation for a tile block.
+  using BlockEpilogueGenerator =
+      std::function<void(HloInstruction* hlo, KernelCodegenInfo* kernel_info)>;
+  // A function object to generate code to process one element in a tile.
+  //
+  // hlo: the instruction for which the code is generated for.
+  // index: the index for the first output element of the current thread.
+  // y_loc: The y coordinate within a tile.
+  // x_loc: The x coordinate within a tile.
+  // kernel_info: Other information to support the kernel code generation.
+  using TileElementGenerator = std::function<void(
+      HloInstruction* hlo, const llvm_ir::IrArray::Index& index,
+      const KernelCodegenInfo* kernel_info, llvm::Value* y_loc,
+      llvm::Value* x_loc)>;
+
+  // KernelCodeGenerator records the code generator objects that generate code
+  // for tile elements or tile block prologue/epilogue.
+  class KernelCodeGenerator {
+   public:
+    explicit KernelCodeGenerator(
+        TileElementGenerator tile_element_generator,
+        BlockPrologueGenerator block_prologue_generator = {},
+        BlockEpilogueGenerator block_epilogue_generator = {})
+        : tile_element_generator_(std::move(tile_element_generator)),
+          block_prologue_generator_(std::move(block_prologue_generator)),
+          block_epilogue_generator_(std::move(block_epilogue_generator)) {}
+
+    const TileElementGenerator& GetTileElementGenerator() const {
+      return tile_element_generator_;
+    }
+    const BlockPrologueGenerator& GetBlockPrologueGenerator() const {
+      return block_prologue_generator_;
+    }
+    const BlockEpilogueGenerator& GetBlockEpilogueGenerator() const {
+      return block_epilogue_generator_;
+    }
+
+   private:
+    TileElementGenerator tile_element_generator_;
+    BlockPrologueGenerator block_prologue_generator_;
+    BlockEpilogueGenerator block_epilogue_generator_;
+  };
+
   IrEmitterUnnested(const HloModuleConfig& hlo_module_config,
                     const HloComputation* hlo_computation,
                     IrEmitterContext* ir_emitter_context);
@@ -82,7 +171,7 @@ class IrEmitterUnnested : public IrEmitter {
   Status HandleSort(HloInstruction* sort) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
   Status HandleCrossReplicaSum(HloInstruction* crs) override;
-  Status HandleAfterAll(HloInstruction* gen_token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
 
   Status EmitTargetElementLoop(
       const HloInstruction& hlo,
@@ -205,22 +294,32 @@ class IrEmitterUnnested : public IrEmitter {
   LaunchDimensions EmitHlo021Tile(HloInstruction* hlo,
                                   absl::Span<const int64> reduced_output_dims,
                                   absl::Span<const int64> tiled_param_ids);
+  // Emits a kernel for an unnested HLO instruction.
+  LaunchDimensions EmitKernel(HloInstruction* unnested_hlo,
+                              absl::Span<const int64> param_ids,
+                              const KernelCodeGenerator& kernel_generator,
+                              KernelCodegenInfo* kernel_info);
+  void EmitBlock(const TileGenerator& emit_one_tile,
+                 const KernelCodegenInfo* kernel_info,
+                 KernelSupportLibrary& ksl, llvm::Type* index_ty);
+  // Emits code to process a tensor element in a tile for the given kCopy HLO
+  // that performs a 0-2-1 transpose.
+  void EmitTileElementForCopy(HloInstruction* hlo,
+                              const llvm_ir::IrArray::Index& index,
+                              const KernelCodegenInfo* kernel_info,
+                              llvm::Value* y_loc, llvm::Value* x_loc);
+  // Emits code to process a tensor element in a tile for the given kLoop fusion
+  // HLO containing parameters that are 0-2-1 transpose of its outputs.
+  void EmitTileElementForFusion(HloInstruction* hlo,
+                                const llvm_ir::IrArray::Index& index,
+                                const KernelCodegenInfo* kernel_info,
+                                llvm::Value* y_loc, llvm::Value* x_loc);
 
   // Generates the IrArray for each input of an hlo and returns a vector that
   // constains such IrArrays.
   std::vector<llvm_ir::IrArray> ConstructIrArrayForInputs(
       const HloInstruction& hlo);
 
-  // For each output of the `hlo` instruction, constructs the reduced shape for
-  // the output with the given `reduced_output_dims` and cast the original
-  // output IrArray element in `output_arrays` to the reduced shape. Returns
-  // the number of outputs.
-  int ConstructOutputReducedShapeAndCastOutputIrArrayToShape(
-      const HloInstruction& hlo,
-      const std::vector<llvm_ir::IrArray>& output_arrays,
-      absl::Span<const int64> reduced_output_dims,
-      std::vector<Shape>* output_reduced_shapes,
-      std::vector<llvm_ir::IrArray>* output_in_reduced_shape_arrays);
   // For each input of the `hlo` instruction, checks its value in
   // `param_buffers` to find out whether the input has a reduced shape. If the
   // input has a reduced shape, constructs the reduced shape for the input and
diff --git a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
index 8751e3a9c2a4c8da46d3ecd8437629450d4a2ba2..bd53b90b42d8e657a3ee58e7ca03fb60522aae28 100644
--- a/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
+++ b/tensorflow/compiler/xla/service/gpu/llvm_gpu_backend/nvptx_backend_lib.cc
@@ -177,13 +177,6 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
 
   TargetOptions target_options = InitTargetOptionsFromCodeGenFlags();
-  llvm_ir::SetTargetOptions(
-      /*fast_math_enabled=*/hlo_module_config.debug_options()
-          .xla_gpu_enable_fast_math(),
-      &target_options);
-
-  // Enable FMA synthesis.
-  target_options.AllowFPOpFusion = FPOpFusion::Fast;
 
   // Set the verbose assembly options.
   target_options.MCOptions.AsmVerbose = false;
@@ -206,8 +199,7 @@ std::unique_ptr<llvm::TargetMachine> GetTargetMachine(
   }
   return absl::WrapUnique(target->createTargetMachine(
       triple.str(), llvm_ir::AsStringRef(cpu_name), "+ptx60", target_options,
-      Optional<Reloc::Model>(RelocModel), Optional<CodeModel::Model>(CMModel),
-      codegen_opt_level));
+      getRelocModel(), getCodeModel(), codegen_opt_level));
 }
 
 // Adds the standard LLVM optimization passes, based on the speed optimization
@@ -401,8 +393,16 @@ StatusOr<string> CompileModuleToPtx(llvm::Module* module,
   int32 opt_level =
       hlo_module_config.debug_options().xla_backend_optimization_level();
 
-  CHECK_GE(opt_level, 2)
-      << "The XLA GPU backend doesn't support unoptimized code generation";
+  if (opt_level < 2) {
+    LOG(ERROR) << std::string(80, '*');
+    LOG(ERROR) << "The XLA GPU backend doesn't support unoptimized code "
+                  "generation but ";
+    LOG(ERROR) << "--xla_backend_optimization_level is set to " << opt_level
+               << "!";
+    LOG(ERROR) << "(Supported configuration is "
+                  "--xla_backend_optimization_level >= 2.)";
+    LOG(ERROR) << std::string(80, '*');
+  }
 
   AddOptimizationPasses(opt_level,
                         /*size_level=*/0, target_machine.get(), &module_passes,
@@ -453,18 +453,21 @@ void GPUBackendInit(const HloModuleConfig& hlo_module_config) {
   // * 3-6 gives similar results as 2;
   // * >6 start hurting the performance of at least dot product kernels.
   //
-  // TODO(jingyue): The current threshold only considers the numbr of IR
+  // TODO(jingyue): The current threshold only considers the number of IR
   // instructions which do not accurately reflect the true cost. We need a
   // better cost model.
   FeedLLVMWithFlags({"-bonus-inst-threshold=2"});
-  // TODO(b/22073864): Increase limit when scan memory dependency.
-  // This helps to reduce more redundant load instructions.
+  // Increase limit when scanning memory dependencies.  This helps to reduce
+  // more redundant load instructions.
   //
   // The specific value is currently large enough for s3d in shoc benchmark,
   // which contains a lot of load instructions and many arithmetic instructions
   // between those loads.
   FeedLLVMWithFlags({"-memdep-block-scan-limit=500"});
 
+  // Use div.approx -- it matters for some float-division heavy benchmarks.
+  FeedLLVMWithFlags({"-nvptx-prec-divf32=0"});
+
   llvm_ir::InitializeLLVMCommandLineOptions(hlo_module_config);
 
   // Initialize the NVPTX target; it's the only target we link with, so call its
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
index d9b06828e2b5d334873c88cb49c2e0d5675bb5fe..01fddcede64d1bb02ab89db5fc9524893c2d47a4 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion.cc
@@ -41,50 +41,7 @@ GpuMultiOutputFusion::GpuMultiOutputFusion() : MultiOutputFusion(INT64_MAX) {}
 
 bool GpuMultiOutputFusion::ShapesCompatibleForFusion(HloInstruction* instr1,
                                                      HloInstruction* instr2) {
-  auto get_element_instr =
-      [&](const HloInstruction* instr) -> const HloInstruction* {
-    const HloInstruction* element_instr = instr;
-    if (instr->opcode() == HloOpcode::kFusion) {
-      auto fused_expression_root = instr->fused_expression_root();
-      if (instr->IsMultiOutputFusion()) {
-        // If possible, we want to pick a reduce operand of the fusion root,
-        // because it has the most constraints.
-        for (const auto* inst : fused_expression_root->operands()) {
-          if (IsReductionToVector(*inst)) {
-            return inst;
-          }
-        }
-        return fused_expression_root->operands()[0];
-      } else {
-        element_instr = fused_expression_root;
-      }
-    }
-    return element_instr;
-  };
-
-  auto get_element_shape = [&](const HloInstruction* element_instr) {
-    // Special handling of kReduce instructions -- the fusion
-    // applies to the first operand.
-    if (IsReductionToVector(*element_instr)) {
-      return element_instr->operand(0)->shape();
-    }
-    return element_instr->shape();
-  };
-
-  // The shapes in all tuple operands should agree, unless it is a reduce.
-  // In that case, the operand of the reduce needs to have the same shape
-  // as the other tuple operands, but also we need to compare the output
-  // shapes of the reduces.
-  auto* element_instr_1 = get_element_instr(instr1);
-  auto* element_instr_2 = get_element_instr(instr2);
-  if (element_instr_1->opcode() == HloOpcode::kReduce &&
-      element_instr_2->opcode() == HloOpcode::kReduce &&
-      !ShapeUtil::Equal(element_instr_1->shape(), element_instr_2->shape())) {
-    return false;
-  }
-  // The elementwise output shapes must be the same (including layout).
-  return ShapeUtil::EqualIgnoringFpPrecision(
-      get_element_shape(element_instr_1), get_element_shape(element_instr_2));
+  return ShapesCompatibleForMultiOutputFusion(*instr1, *instr2);
 }
 
 bool GpuMultiOutputFusion::IsFusible(HloInstruction* instr) {
@@ -205,7 +162,7 @@ bool GpuMultiOutputFusion::DoProducerConsumerMultiOutputFusion() {
         VLOG(3) << producer->name() << " is not a loop fusion.";
         continue;
       }
-      if (!ShapesCompatibleForFusion(producer, consumer)) {
+      if (!ShapesCompatibleForMultiOutputFusion(*producer, *consumer)) {
         VLOG(3) << producer->name() << " has an incompatible shape.";
         continue;
       }
diff --git a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
index dc221f22a74f0875e08e01890ce8ac8fe072cd9d..d16c87ba5c63aa582753fe949e9e39ee2d8b81e5 100644
--- a/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/multi_output_fusion_test.cc
@@ -580,7 +580,7 @@ TEST_F(MultiOutputFusionTest, AvoidsLargeFusion) {
   //   ...
   // where each of the (pi * pj)'s is represented as a fusion node so that
   // multi-output fusion will pay attention to it.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder b(TestName());
   Shape shape = ShapeUtil::MakeShape(F32, {10, 100});
 
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
index de04ed85c30717f5be7c5485ff3b68270c8ec188..f3e17d888242a36c268dcbfa0d6530f80cedceb0 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.cc
@@ -67,6 +67,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_element_type_converter.h"
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_pipeline.h"
@@ -173,13 +174,16 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
           /*rewrite_inference_op=*/true,
           /*rewrite_grad_op=*/true);
 
+      pipeline.AddPass<HloGetDimensionSizeRewriter>();
+
       // BatchNormExpander can create zero-sized ops, so zero-sized HLO
       // elimination has to come after that pass.
       pipeline.AddPass<ZeroSizedHloElimination>();
 
-      pass.AddPass<AlgebraicSimplifier>(
-          /*is_layout_sensitive=*/false,
+      AlgebraicSimplifierOptions options(
           [](const Shape&, const Shape&) { return false; });
+      options.set_enable_permutation_sort_replacement(true);
+      pass.AddPass<AlgebraicSimplifier>(options);
       pass.AddPass<TupleSimplifier>();
       pass.AddPass<WhileLoopConstantSinking>();
       pass.AddPass<WhileLoopSimplifier>();
@@ -248,11 +252,13 @@ Status OptimizeHloModule(HloModule* hlo_module, se::StreamExecutor* stream_exec,
 
     // The LayoutAssignment pass may leave behind kCopy instructions which are
     // duplicate or NOPs, so remove them with algebraic simplification and CSE.
-    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(
-        /*is_layout_sensitive=*/true,
+    AlgebraicSimplifierOptions options(
         /*valid_bitcast_callback=*/[](const Shape&, const Shape&) {
           return true;
         });
+    options.set_is_layout_sensitive(true);
+    options.set_enable_permutation_sort_replacement(true);
+    pipeline.AddPass<HloPassFix<AlgebraicSimplifier>>(options);
 
     // Choose the fastest algorithm for each conv.
     //
@@ -473,7 +479,8 @@ void WarnIfBadDriverJITVersion() {
 // Compiles the given PTX string using ptxas and returns the resulting machine
 // code (i.e. a cubin) as a byte array.
 StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
-                                        int cc_minor) {
+                                        int cc_minor,
+                                        bool disable_ptx_optimizations) {
   tracing::ScopedActivity activity("Compile PTX", /*is_expensive=*/true);
   const string ptxas_path =
       tensorflow::io::JoinPath(tensorflow::CudaRoot(), "bin", "ptxas");
@@ -513,6 +520,9 @@ StatusOr<std::vector<uint8>> CompilePtx(const string& ptx, int cc_major,
   if (VLOG_IS_ON(2)) {
     ptxas_args.push_back("-v");
   }
+  if (disable_ptx_optimizations) {
+    ptxas_args.push_back("-O0");
+  }
   ptxas_info_dumper.SetProgram(ptxas_path, ptxas_args);
   ptxas_info_dumper.SetChannelAction(tensorflow::CHAN_STDERR,
                                      tensorflow::ACTION_PIPE);
@@ -733,8 +743,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
     }
   }
 
-  const std::vector<uint8> cubin =
-      CompilePtxOrGetCachedResult(ptx, cc_major, cc_minor);
+  const std::vector<uint8> cubin = CompilePtxOrGetCachedResult(
+      ptx, cc_major, cc_minor,
+      module->config().debug_options().xla_gpu_disable_ptxas_optimizations());
 
   auto thunk_schedule = absl::make_unique<ThunkSchedule>(
       ir_emitter.ConsumeThunkSequence(), std::move(stream_assignment),
@@ -766,9 +777,9 @@ StatusOr<std::unique_ptr<Executable>> NVPTXCompiler::RunBackend(
   return std::unique_ptr<Executable>(gpu_executable);
 }
 
-std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
-                                                              int cc_major,
-                                                              int cc_minor) {
+std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(
+    const string& ptx, int cc_major, int cc_minor,
+    bool disable_ptx_optimizations) {
   XLA_SCOPED_LOGGING_TIMER("NVPTXCompiler::CompilePtxOrGetCachedResult");
   tracing::ScopedActivity activity("PTX->CUBIN", /*is_expensive=*/true);
   bool inserted;
@@ -796,8 +807,8 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
     if (inserted) {
       CHECK(!cache_value->compilation_done);
       if (!ptx.empty()) {
-        StatusOr<std::vector<uint8>> maybe_cubin =
-            CompilePtx(*cache_ptx, cc_major, cc_minor);
+        StatusOr<std::vector<uint8>> maybe_cubin = CompilePtx(
+            *cache_ptx, cc_major, cc_minor, disable_ptx_optimizations);
         if (maybe_cubin.ok()) {
           cache_value->cubin_data = std::move(maybe_cubin).ValueOrDie();
           VLOG(2) << "Compiled PTX size:" << ptx.size()
@@ -810,7 +821,7 @@ std::vector<uint8> NVPTXCompiler::CompilePtxOrGetCachedResult(const string& ptx,
             // binaries are not available. We don't want to spam logs with
             // identical warnings in this case.
 
-            // TODO(zhengxq): we should implement a LOG_FIRST_N and LOG_EVERY_N
+            // TODO(jlebar): we should implement a LOG_FIRST_N and LOG_EVERY_N
             // for more general usage.
             static std::atomic<bool> warning_done(false);
             log_warning = !warning_done.exchange(true);
diff --git a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
index f79ae2990ae7d6e6985b15727a72358289121aa9..be5e31a50112686841e6f18b76f382a56e61bafc 100644
--- a/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
+++ b/tensorflow/compiler/xla/service/gpu/nvptx_compiler.h
@@ -97,8 +97,9 @@ class NVPTXCompiler : public LLVMCompiler {
 
   // Tries to compile the given ptx string to cubin.  Returns a vector with the
   // compiled cubin.  If compilation was unsuccessful, returns an empty vector.
-  std::vector<uint8> CompilePtxOrGetCachedResult(const string& ptx,
-                                                 int cc_major, int cc_minor);
+  std::vector<uint8> CompilePtxOrGetCachedResult(
+      const string& ptx, int cc_major, int cc_minor,
+      bool disable_ptx_optimizations);
 
   // The compilation_cache_ map is a cache from {ptx string, cc_major, cc_minor}
   // -> cubin so we don't recompile the same ptx twice.  This is important for
diff --git a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
index f2ef11e1e6ac2405ac2a35fec7b79add9d2b6c17..31a5d7a8c04e9863830e2026fc73cd7ded8c322e 100644
--- a/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/stream_assignment_test.cc
@@ -30,7 +30,7 @@ namespace gpu {
 
 class StreamAssignmentTest : public HloTestBase {
  protected:
-  std::unique_ptr<HloModule> CreateNewUnverifiedModule() {
+  std::unique_ptr<HloModule> CreateNewVerifiedModule() {
     HloModuleConfig config;
     auto debug_options = GetDebugOptionsForTest();
     debug_options.set_xla_gpu_disable_multi_streaming(false);
@@ -55,7 +55,7 @@ TEST_F(StreamAssignmentTest, SequentialMatMul) {
   HloInstruction* dot2 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, dot1, z));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(dot2));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -76,7 +76,7 @@ TEST_F(StreamAssignmentTest, ConcurrentMatMul) {
   HloInstruction* add = builder.AddInstruction(
       HloInstruction::CreateBinary(f32_2x2_, HloOpcode::kAdd, dot1, dot2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(add));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
@@ -120,7 +120,7 @@ TEST_F(StreamAssignmentTest, LatticeMatMul) {
   HloInstruction* d40 =
       builder.AddInstruction(CreateCanonicalDot(f32_2x2_, d30, d31));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build(d40));
 
   std::unique_ptr<StreamAssignment> assignment = AssignStreams(*module);
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
index d2f30ae7bc4f65675f10a2f87ba934cf308f663a..d917320e36363c4fa7e4c0055e8f3345cbc610a2 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_codegen_test.h
@@ -26,7 +26,7 @@ namespace gpu {
 // Tests that verify IR or PTX emitted by the GPU backend is as expected.
 class GpuCodegenTest : public LlvmIrGenTestBase {
  protected:
-  // Like HloTestBase::CreateNewUnverifiedModule(), with a flag for configuring
+  // Like HloTestBase::CreateNewVerifiedModule(), with a flag for configuring
   // the ftz option.
   std::unique_ptr<HloModule> CreateNewUnverifiedModuleWithFTZ(bool ftz);
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
index 268b48a1cadeef911dfda7e827ae0cd154040be8..a1ed8499040359fe7265a7317b0577a990a2234c 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_copy_test.cc
@@ -46,7 +46,7 @@ TEST_F(GpuCopyTest, UseMemcpy) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   // There should not be any kernel prefixed "copy".
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
index d0ccd8619bde9ddd560989380b403efed5c5f42c..5e524faab18947f5793dc2ae34e9329a446d4235 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ftz_test.cc
@@ -75,16 +75,16 @@ class GpuFtzDisabledTest : public GpuFtzTest {
 // Check that we emit mul.ftz.f32 when in ftz mode, and plain mul.f32 otherwise.
 TEST_F(GpuFtzEnabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.f32
-    CHECK: mul.ftz.f32
-    CHECK-NOT: mul.f32
+    CHECK-NOT: mul.rn.f32
+    CHECK: mul.rn.ftz.f32
+    CHECK-NOT: mul.rn.f32
   )");
 }
 TEST_F(GpuFtzDisabledTest, MultiplyFtz) {
   CompileAndVerifyPtx(CreateBinaryOpModule(HloOpcode::kMultiply), R"(
-    CHECK-NOT: mul.ftz.f32
-    CHECK: mul.f32
-    CHECK-NOT: mul.ftz.f32
+    CHECK-NOT: mul.rn.ftz.f32
+    CHECK: mul.rn.f32
+    CHECK-NOT: mul.rn.ftz.f32
   )");
 }
 
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
index da8e513a2c3b61eb9f780ac628e4befeb918b939..6814be779e0b02c38e3bc7008f036b845d88cb6f 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_index_test.cc
@@ -51,7 +51,7 @@ TEST_F(GpuIndexTest, CompatibleUseLinearIndex) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {5, 7, 2}), HloOpcode::kGe, param_x, param_y));
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   // Check the optimized IR as the unoptimized IR contains dead udiv and urem.
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
index ea1fee040dd536bcd1c4f8c5dd4f3aaa8dca32f9..3019215c015a4e0aa094a62424d650ced0de2a0e 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_ldg_test.cc
@@ -48,7 +48,7 @@ TEST_F(GpuLdgTest, LdgForParamRead) {
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, param, param));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -73,7 +73,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
   builder.AddInstruction(HloInstruction::CreateTuple({add, square}));
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyPtx(std::move(hlo_module), R"(
@@ -95,7 +95,7 @@ TEST_F(GpuLdgTest, LdgForNonParamRead) {
 // reduce in the foreseeable future.  But if that turns out to be wrong, I give
 // you, future reader, permission to delete this test.
 TEST_F(GpuLdgTest, NoLdgWhenSharingBuffer) {
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation* reduce_computation;
diff --git a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
index 14285459b5a7fc0325dc5af80e57bef4ee4b7299..ca0a78034d7dc83d17ad72202914d95f37ac122b 100644
--- a/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/tests/gpu_noalias_test.cc
@@ -47,7 +47,7 @@ TEST_F(GpuNoAliasTest, Concat) {
 
   std::unique_ptr<HloComputation> computation = builder.Build();
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(std::move(computation));
 
   CompileAndVerifyIr(std::move(hlo_module),
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
index 141f3219387940a08ef22cbcc0be0971a14c2cd6..6b2d76764a077dc6cfa3f9ddc6e525ab330323be 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.cc
@@ -45,7 +45,7 @@ void ThunkSchedule::AddDependenciesOnTransitiveOperands(
 ThunkSchedule::ThunkSchedule(
     std::unique_ptr<ThunkSequence> thunks,
     std::unique_ptr<StreamAssignment> stream_assignment,
-    const std::vector<const HloInstruction*>& hlo_total_order)
+    const std::vector<HloInstruction*>& hlo_total_order)
     : thunks_(std::move(thunks)),
       stream_assignment_(std::move(stream_assignment)) {
   std::unordered_map<const HloInstruction*, Thunk*> hlo_to_thunk;
@@ -53,7 +53,7 @@ ThunkSchedule::ThunkSchedule(
     InsertOrDie(&hlo_to_thunk, thunk->hlo_instruction(), thunk.get());
   }
 
-  for (const HloInstruction* hlo : hlo_total_order) {
+  for (HloInstruction* hlo : hlo_total_order) {
     if (hlo_to_thunk.count(hlo)) {
       thunk_total_order_.push_back(FindOrDie(hlo_to_thunk, hlo));
     }
diff --git a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
index d3352994f845a535233612a17e19107511ce0622..43b628a1baf0e79a3197f3cfad3547991642eaed 100644
--- a/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
+++ b/tensorflow/compiler/xla/service/gpu/thunk_schedule.h
@@ -46,7 +46,7 @@ class ThunkSchedule {
  public:
   ThunkSchedule(std::unique_ptr<ThunkSequence> thunks,
                 std::unique_ptr<StreamAssignment> stream_assignment,
-                const std::vector<const HloInstruction*>& hlo_total_order);
+                const std::vector<HloInstruction*>& hlo_total_order);
 
   // Returns the total order of executing all the thunks.
   const std::vector<Thunk*>& TotalOrder() const { return thunk_total_order_; }
diff --git a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
index c7f51127649664189050e2128ae1e56547358c23..2dce7749bbd8da2673ae607eee3d731d9917e8fe 100644
--- a/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
+++ b/tensorflow/compiler/xla/service/gpu/while_transformer_test.cc
@@ -29,7 +29,7 @@ namespace {
 class WhileTransformerTest : public HloTestBase {
  protected:
   WhileTransformerTest()
-      : module_(CreateNewUnverifiedModule()),
+      : module_(CreateNewVerifiedModule()),
         induction_variable_shape_(ShapeUtil::MakeShape(S32, {})),
         data_shape_(ShapeUtil::MakeShape(F32, {8})),
         condition_result_shape_(ShapeUtil::MakeShape(PRED, {})) {}
diff --git a/tensorflow/compiler/xla/service/heap_simulator_test.cc b/tensorflow/compiler/xla/service/heap_simulator_test.cc
index fad3215fc81e1012ddaa5a6458bc98f90ab97f18..dc40b9446ad1bffcb757543e52fc9ab20de6d52e 100644
--- a/tensorflow/compiler/xla/service/heap_simulator_test.cc
+++ b/tensorflow/compiler/xla/service/heap_simulator_test.cc
@@ -258,7 +258,7 @@ class HeapSimulatorTracker {
   // Constructor for testing a single entry computation.
   HeapSimulatorTracker(
       const string& name, std::unique_ptr<HloComputation> computation,
-      const std::vector<const HloInstruction*>& instruction_sequence) {
+      const std::vector<HloInstruction*>& instruction_sequence) {
     HloModuleConfig config;
     module_ = absl::make_unique<HloModule>(name, config);
     module_->AddEntryComputation(std::move(computation));
@@ -286,7 +286,7 @@ class HeapSimulatorTracker {
   // Similar to the single entry computation constructor above, but runs the
   // simulation over the entire module.
   void RunWholeModule(
-      const std::vector<const HloInstruction*>& full_module_sequence) {
+      const std::vector<HloInstruction*>& full_module_sequence) {
     points_to_analysis_ =
         TuplePointsToAnalysis::Run(module_.get()).ConsumeValueOrDie();
 
@@ -294,7 +294,7 @@ class HeapSimulatorTracker {
     HloSchedule schedule(module_.get());
     absl::flat_hash_map<const HloInstruction*, int> reverse_position;
     for (int i = 0; i < full_module_sequence.size(); ++i) {
-      const HloInstruction* instruction = full_module_sequence[i];
+      HloInstruction* instruction = full_module_sequence[i];
       schedule.GetOrCreateSequence(instruction->parent())
           .push_back(instruction);
       reverse_position[instruction] = full_module_sequence.size() - i;
diff --git a/tensorflow/compiler/xla/service/hlo.proto b/tensorflow/compiler/xla/service/hlo.proto
index dbab62f847e8ca5e0b46dfd4162a0f4222640252..414c63271245315f037d04924c9291a9cd5b7a77 100644
--- a/tensorflow/compiler/xla/service/hlo.proto
+++ b/tensorflow/compiler/xla/service/hlo.proto
@@ -51,7 +51,7 @@ message HloInstructionProto {
 
   string name = 1;
   string opcode = 2;
-  xla.Shape shape = 3;
+  xla.ShapeProto shape = 3;
 
   xla.OpMetadata metadata = 7;
 
@@ -132,7 +132,7 @@ message HloInstructionProto {
   string custom_call_opaque = 53;
 
   // Shape of outfeed request.
-  xla.Shape outfeed_shape = 29;
+  xla.ShapeProto outfeed_shape = 29;
 
   // Describes the dimension numbers used for a dot operation
   xla.DotDimensionNumbers dot_dimension_numbers = 30;
@@ -190,7 +190,7 @@ message HloInstructionProto {
   // 'operand_shapes_with_layout' must contain a shape with layout for each
   // operand.
   bool constrain_layout = 56;
-  repeated Shape operand_shapes_with_layout = 57;
+  repeated xla.ShapeProto operand_shapes_with_layout = 57;
 }
 
 // Serialization of HloComputation.
@@ -205,7 +205,8 @@ message HloComputationProto {
   repeated HloInstructionProto instructions = 2;
 
   // The program shape (with layout) of this computation.
-  xla.ProgramShape program_shape = 4;
+
+  xla.ProgramShapeProto program_shape = 4;
 
   // The id of this computation.
   int64 id = 5;
@@ -251,6 +252,41 @@ message HloInputOutputAliasProto {
   repeated AliasEntryProto entries = 1;
 }
 
+message DynamicParameterBindingProto {
+  // A list of bindings which indicates that the `target_dim_num` in
+  // the subshape `target_param_index` of parameter `target_param_num`
+  // is a dynamic dimension and its real dynamic size is represented
+  // by `dynamic_param_index` in parameter `dynamic_param_num`.
+  //
+  // As an example, imagine we have a program:
+  //
+  // ENTRY main {
+  //   a = f32[] parameter(0)
+  //   b = f32[10] parameter(1)
+  //   ROOT root = (f32[], f32[10]) tuple(%a, %b)
+  // }
+  //
+  // Let's say 'b' (param index 1) is a dynamic shape whose input has
+  // an upperbound of 10 and real size is determined at runtime.'a'
+  // represents the real size of b's first dimension.
+  //
+  // In this case, the fields are set in the following way:
+  // dynamic_param_num = 1
+  // dynamic_param_index = {}
+  // target_param_num = 0
+  // target_param_index = {}
+  // target_param_dim = 0
+  message Binding {
+    int64 dynamic_param_num = 1;
+    repeated int64 dynamic_param_index = 2;
+    int64 target_param_num = 3;
+    repeated int64 target_param_index = 4;
+    int64 target_param_dim_num = 5;
+  }
+
+  repeated Binding entries = 1;
+}
+
 // Serialization of HloModule.
 message HloModuleProto {
   string name = 1;
@@ -262,7 +298,7 @@ message HloModuleProto {
   repeated HloComputationProto computations = 3;
 
   // The host program shape (with layout) of the entry computation.
-  xla.ProgramShape host_program_shape = 4;
+  xla.ProgramShapeProto host_program_shape = 4;
 
   // The id of this module.
   int64 id = 5;
@@ -272,6 +308,8 @@ message HloModuleProto {
 
   // Describes alias information between inputs and outputs.
   HloInputOutputAliasProto input_output_alias = 8;
+
+  DynamicParameterBindingProto dynamic_parameter_binding = 9;
 }
 
 // Serialization of LogicalBuffer.
diff --git a/tensorflow/compiler/xla/service/hlo_computation.cc b/tensorflow/compiler/xla/service/hlo_computation.cc
index 0c20d207ddbca82e2f87800d331d1bace39a512e..ff122b529bdcdcc69d2245136e19101902dbf957 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation.cc
@@ -499,7 +499,7 @@ HloComputationProto HloComputation::ToProto() const {
     proto.add_instructions()->Swap(&instruction_proto);
   }
   proto.set_root_id(root_instruction()->unique_id());
-  *proto.mutable_program_shape() = ComputeProgramShape();
+  *proto.mutable_program_shape() = ComputeProgramShape().ToProto();
   return proto;
 }
 
@@ -711,6 +711,8 @@ bool HloComputation::operator==(const HloComputation& other) const {
   return eq(root_instruction(), other.root_instruction());
 }
 
+uint64 HloComputation::Hash() const { return root_instruction()->Hash(); }
+
 Status HloComputation::ReplaceWithNewInstruction(
     HloInstruction* old_instruction,
     std::unique_ptr<HloInstruction> new_instruction) {
@@ -795,7 +797,7 @@ Status HloComputation::AcceptWithOperandOrder(
 template <typename HloInstructionPtr>
 Status HloComputation::AcceptOrdered(
     DfsHloVisitorBase<HloInstructionPtr>* visitor,
-    const std::vector<const HloInstruction*>& order) const {
+    const std::vector<HloInstruction*>& order) const {
   VLOG(3) << "Accepting visitor with order.";
   for (HloInstruction* root : CollectUnreachableRoots()) {
     TF_RET_CHECK(std::find(order.begin(), order.end(), root) != order.end())
@@ -825,9 +827,9 @@ Status HloComputation::AcceptOrdered(
 
 // Explicit instantiations.
 template Status HloComputation::AcceptOrdered(
-    DfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    DfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 template Status HloComputation::AcceptOrdered(
-    ConstDfsHloVisitor*, const std::vector<const HloInstruction*>&) const;
+    ConstDfsHloVisitor*, const std::vector<HloInstruction*>&) const;
 
 Status HloComputation::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
diff --git a/tensorflow/compiler/xla/service/hlo_computation.h b/tensorflow/compiler/xla/service/hlo_computation.h
index fc7d2035e5bd0b99fa9e7a026430172f686019d4..c584e4c7ca5770533f28352b0df9dadd9dbe1860 100644
--- a/tensorflow/compiler/xla/service/hlo_computation.h
+++ b/tensorflow/compiler/xla/service/hlo_computation.h
@@ -264,6 +264,12 @@ class HloComputation {
   // Return whether `*this` and `other` are functionally equivalent.
   bool operator==(const HloComputation& other) const;
 
+  // Generates a hash value of an HLO computation. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO computations,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const;
+
   // Replaces old instruction with newly created instruction. Removes old
   // instruction from computation. Updates uses and root instruction.
   Status ReplaceWithNewInstruction(
@@ -301,7 +307,7 @@ class HloComputation {
   // be a topological sort of all instructions in the computation.
   template <typename HloInstructionPtr>
   Status AcceptOrdered(DfsHloVisitorBase<HloInstructionPtr>* visitor,
-                       const std::vector<const HloInstruction*>& order) const;
+                       const std::vector<HloInstruction*>& order) const;
 
   // Same as Accept() above, but the visitor is given as a function.
   Status Accept(const std::function<Status(HloInstruction*)>& visitor_func);
diff --git a/tensorflow/compiler/xla/service/hlo_computation_test.cc b/tensorflow/compiler/xla/service/hlo_computation_test.cc
index 1e7a6e197f5b6c3070b7cad2c14f62521290a4c9..0361c87428f6e4c031d95492a5bc782ad388e5b5 100644
--- a/tensorflow/compiler/xla/service/hlo_computation_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_computation_test.cc
@@ -20,19 +20,19 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/dfs_hlo_visitor_with_default.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/test_helpers.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 
 namespace {
 
+namespace m = match;
 using ::testing::ElementsAre;
 using ::testing::UnorderedElementsAre;
 
@@ -65,7 +65,7 @@ class HloComputationTest : public HloTestBase {
 };
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEntryComputation(CreateNegateComputation());
   EXPECT_TRUE(negate_computation->MakeEmbeddedComputationsList().empty());
@@ -73,7 +73,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsEmpty) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
   // Create computation which calls one other computation.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map_computation =
@@ -85,7 +85,7 @@ TEST_F(HloComputationTest, GetEmbeddedComputationsOneComputation) {
 
 TEST_F(HloComputationTest, GetEmbeddedComputationsDiamond) {
   // Create computations with a diamond-shaped callgraph.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto negate_computation =
       module->AddEmbeddedComputation(CreateNegateComputation());
   auto map1_computation =
@@ -119,7 +119,7 @@ TEST_F(HloComputationTest, PostOrderSingleton) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(), ElementsAre(constant));
 }
@@ -134,7 +134,7 @@ TEST_F(HloComputationTest, PostOrderSimple) {
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
   auto negate2 = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, negate1));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               ElementsAre(constant, negate1, negate2));
@@ -170,7 +170,7 @@ TEST_F(HloComputationTest, PostOrderDisconnectedInstructions) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
   auto constant4 = builder.AddInstruction(
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0f)));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_THAT(computation->MakeInstructionPostOrder(),
               UnorderedElementsAre(constant1, constant2, constant3, constant4));
@@ -192,7 +192,7 @@ TEST_F(HloComputationTest, PostOrderWithMultipleRoots) {
       r0f32_, HloOpcode::kAdd, constant2, constant3));
   auto add3 = builder.AddInstruction(HloInstruction::CreateBinary(
       r0f32_, HloOpcode::kAdd, constant1, constant3));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto post_order = computation->MakeInstructionPostOrder();
   EXPECT_EQ(6, post_order.size());
@@ -217,7 +217,7 @@ TEST_F(HloComputationTest, VisitWithMultipleRoots) {
                                                       constant2, constant3));
   builder.AddInstruction(HloInstruction::CreateBinary(r0f32_, HloOpcode::kAdd,
                                                       constant1, constant3));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   // Visitor which keeps track of which instructions have been visited.
   class TestVisitor : public DfsHloVisitorWithDefault {
@@ -257,11 +257,11 @@ TEST_F(HloComputationTest, DeepCopyArray) {
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(HloInstruction::CreateConstant(
       LiteralUtil::CreateR1<float>({1.0, 2.0, 3.0})));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(constant).ValueOrDie();
 
-  EXPECT_THAT(copy, op::Copy(constant));
+  EXPECT_THAT(copy, GmockMatch(m::Copy(m::Op().Is(constant))));
 }
 
 TEST_F(HloComputationTest, DeepCopyTuple) {
@@ -274,12 +274,13 @@ TEST_F(HloComputationTest, DeepCopyTuple) {
   auto tuple = builder.AddInstruction(
       HloInstruction::CreateTuple({constant1, constant2}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto tuple_copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
-  EXPECT_THAT(tuple_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                    op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(tuple_copy, GmockMatch(m::Tuple(
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                              m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
   EXPECT_EQ(0, tuple_copy->operand(0)->operand(0)->tuple_index());
   EXPECT_EQ(1, tuple_copy->operand(1)->operand(0)->tuple_index());
 }
@@ -297,7 +298,7 @@ TEST_F(HloComputationTest, DeepCopyArrayAtIndices) {
     ShapeTree<bool> indices_to_copy(constant->shape(), /*init_value=*/true);
     EXPECT_THAT(computation->DeepCopyInstruction(constant, &indices_to_copy)
                     .ValueOrDie(),
-                op::Copy(constant));
+                GmockMatch(m::Copy(m::Op().Is(constant))));
   }
 
   {
@@ -330,10 +331,11 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::Copy(op::GetTupleElement(tuple))));
-    EXPECT_THAT(deep_copy, op::Tuple(copies_added.element({0}),
-                                     copies_added.element({1})));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({0})),
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple)))
+                                   .Is(copies_added.element({1})))));
   }
 
   {
@@ -346,8 +348,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::GetTupleElement(tuple),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy,
+                GmockMatch(m::Tuple(m::GetTupleElement(m::Op().Is(tuple)),
+                                    m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) == nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -363,8 +366,9 @@ TEST_F(HloComputationTest, DeepCopyTupleAtIndices) {
         computation->DeepCopyInstruction(tuple, &indices_to_copy, &copies_added)
             .ValueOrDie();
 
-    EXPECT_THAT(deep_copy, op::Tuple(op::Copy(op::GetTupleElement(tuple)),
-                                     op::GetTupleElement(tuple)));
+    EXPECT_THAT(deep_copy, GmockMatch(m::Tuple(
+                               m::Copy(m::GetTupleElement(m::Op().Is(tuple))),
+                               m::GetTupleElement(m::Op().Is(tuple)))));
     EXPECT_TRUE(copies_added.element({}) == nullptr);
     EXPECT_TRUE(copies_added.element({0}) != nullptr);
     EXPECT_TRUE(copies_added.element({1}) == nullptr);
@@ -376,12 +380,12 @@ TEST_F(HloComputationTest, DeepCopyToken) {
   // copied.
   auto builder = HloComputation::Builder(TestName());
   auto token = builder.AddInstruction(HloInstruction::CreateToken());
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(token).ValueOrDie();
 
   // No copy should be added.
-  EXPECT_THAT(copy, op::AfterAll());
+  EXPECT_THAT(copy, GmockMatch(m::AfterAll()));
 }
 
 TEST_F(HloComputationTest, DeepCopyTokenTuple) {
@@ -393,14 +397,15 @@ TEST_F(HloComputationTest, DeepCopyTokenTuple) {
       HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(42.0)));
   auto tuple =
       builder.AddInstruction(HloInstruction::CreateTuple({token, constant}));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   auto copy = computation->DeepCopyInstruction(tuple).ValueOrDie();
 
   // Only the array (second tuple element) should be copied. The token is passed
   // through transparently.
-  EXPECT_THAT(copy, op::Tuple(op::GetTupleElement(tuple),
-                              op::Copy(op::GetTupleElement(tuple))));
+  EXPECT_THAT(copy, GmockMatch(m::Tuple(
+                        m::GetTupleElement(m::Op().Is(tuple)),
+                        m::Copy(m::GetTupleElement(m::Op().Is(tuple))))));
 }
 
 TEST_F(HloComputationTest, CycleDetection) {
@@ -440,16 +445,18 @@ TEST_F(HloComputationTest, RemoveInstructionWithDuplicateOperand) {
       r0f32_, HloOpcode::kAdd, dead_negate, dead_negate));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, constant));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(4, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 
   ASSERT_IS_OK(computation->RemoveInstructionAndUnusedOperands(dead_add));
 
   EXPECT_EQ(2, computation->instruction_count());
-  EXPECT_THAT(computation->root_instruction(), op::Negate(constant));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Negate(m::Op().Is(constant))));
   EXPECT_EQ(negate, computation->root_instruction());
 }
 
@@ -466,7 +473,7 @@ TEST_F(HloComputationTest, CloneWithControlDependency) {
       HloInstruction::CreateParameter(0, r0f32_, "param0"));
   auto negate = builder.AddInstruction(
       HloInstruction::CreateUnary(r0f32_, HloOpcode::kNegate, param));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation =
       module->AddEntryComputation(builder.Build(/*root_instruction=*/add));
 
@@ -505,7 +512,7 @@ TEST_F(HloComputationTest, Stringification) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
@@ -540,7 +547,7 @@ TEST_F(HloComputationTest, StringificationIndent) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options =
@@ -576,7 +583,7 @@ TEST_F(HloComputationTest, StringificationCanonical) {
       2, PrecisionConfig::DEFAULT);
   builder.AddInstruction(
       HloInstruction::CreateDot(sout, x, reshape, dot_dnums, precision_config));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto options = HloPrintOptions().set_print_metadata(false);
diff --git a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
index d12f920722e20a3390a99f74c8a10c7c9e3fdf6c..4f81dc94e577a63c09ae4019e5e8158252c712ce 100644
--- a/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_constant_folding_test.cc
@@ -22,21 +22,22 @@ limitations under the License.
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_pass_fix.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
 #include "tensorflow/compiler/xla/types.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 namespace {
 
+namespace m = xla::match;
+
 using HloConstantFoldingTest = HloTestBase;
 
 TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
@@ -49,13 +50,14 @@ TEST_F(HloConstantFoldingTest, ConvertF32ToS64) {
   auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
   TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<int64>(),
             42);
 }
@@ -70,13 +72,14 @@ TEST_F(HloConstantFoldingTest, ConvertS64ToF32) {
   auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
   TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().GetFirstElement<float>(),
             42.0f);
 }
@@ -91,13 +94,14 @@ TEST_F(HloConstantFoldingTest, ConvertF32ArrayToS64Array) {
   auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
-  EXPECT_THAT(computation->root_instruction(), op::Convert(input));
+  EXPECT_THAT(computation->root_instruction(),
+              GmockMatch(m::Convert().WithOperand(0, m::Op().Is(input))));
 
   HloConstantFolding const_folder;
   TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(module.get()));
   EXPECT_TRUE(result);
 
-  EXPECT_THAT(computation->root_instruction(), op::Constant());
+  EXPECT_THAT(computation->root_instruction(), GmockMatch(m::Constant()));
   EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({0}), 42);
   EXPECT_EQ(computation->root_instruction()->literal().Get<int64>({1}), 19);
 }
@@ -138,7 +142,7 @@ TEST_F(HloConstantFoldingTest, Concatenate) {
     EXPECT_TRUE(result);
 
     HloInstruction* root = computation->root_instruction();
-    EXPECT_THAT(root, op::Constant());
+    EXPECT_THAT(root, GmockMatch(m::Constant()));
     EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
   }
 }
@@ -165,7 +169,7 @@ TEST_F(HloConstantFoldingTest, Slice) {
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_TRUE(ShapeUtil::Equal(root->shape(), shape));
 }
 
@@ -190,7 +194,7 @@ TEST_F(HloConstantFoldingTest, TransposeConstantFold) {
   EXPECT_TRUE(result);
 
   HloInstruction* root = computation->root_instruction();
-  EXPECT_THAT(root, op::Constant());
+  EXPECT_THAT(root, GmockMatch(m::Constant()));
   EXPECT_TRUE(ShapeUtil::Compatible(root->shape(), shape));
 
   using NativeT = typename primitive_util::PrimitiveTypeToNative<F32>::type;
@@ -240,7 +244,8 @@ TEST_F(HloConstantFoldingTest, ConstantFoldReduceNoLayout) {
   TF_ASSERT_OK_AND_ASSIGN(bool result, const_folder.Run(m.get()));
   EXPECT_FALSE(result);
 
-  EXPECT_THAT(m->entry_computation()->root_instruction(), op::Reduce());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              GmockMatch(m::Reduce()));
 }
 
 const char* const kConstantFoldLargePad = R"(
@@ -260,7 +265,7 @@ TEST_F(HloConstantFoldingTest, DoesNotFoldLargePad) {
   EXPECT_FALSE(result);
 
   EXPECT_THAT(module->entry_computation()->root_instruction(),
-              op::Pad(op::Constant(), op::Constant()));
+              GmockMatch(m::Pad(m::Constant(), m::Constant())));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
index fdfb38b858c32ba5b092ec2db84d4bac487c3e78..df7d3826dbad1f264a5dc53312c062900155b0f6 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.cc
@@ -419,6 +419,21 @@ Status HloCostAnalysis::HandleTranspose(const HloInstruction*) {
 }
 
 Status HloCostAnalysis::HandleAfterAll(const HloInstruction*) {
+  // This instruction is used to enforce ordering at compile time. No code is
+  // emitted.
+  current_should_compute_bottleneck_time_ = false;
+  current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
+  return Status::OK();
+}
+
+Status HloCostAnalysis::HandleAddDependency(
+    const HloInstruction* add_dependency) {
+  // This instruction is used to enforce ordering at compile time. No code is
+  // emitted.
+  current_should_compute_bottleneck_time_ = false;
+  current_properties_[kBytesAccessedKey] = 0;
+  current_properties_[kOptimalSecondsKey] = 0;
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis.h b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
index 8ced9d776e150ac587e9ac3ed0beffbc38dc5503..33983119c9b00a248c0e8dcc5815c6367192dca3 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis.h
@@ -101,6 +101,7 @@ class HloCostAnalysis : public ConstDfsHloVisitor {
   Status HandleBroadcast(const HloInstruction* broadcast) override;
   Status HandlePad(const HloInstruction* pad) override;
   Status HandleReshape(const HloInstruction* reshape) override;
+  Status HandleAddDependency(const HloInstruction* add_dependency) override;
   Status HandleAfterAll(const HloInstruction* token) override;
   Status HandleTranspose(const HloInstruction* transpose) override;
   Status HandleWhile(const HloInstruction* xla_while) override;
diff --git a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
index 6a15b3440c6f9bd2cac5ea10a0883330260b89e5..ff32faf298dd1f04c5b769f2a88f76a7a1e18ae7 100644
--- a/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_cost_analysis_test.cc
@@ -387,7 +387,7 @@ TEST_F(FusionCostAnalysis, LoopFusion) {
         HloInstruction::CreateBinary(r2f32, HloOpcode::kSubtract, mul, clamp));
     auto tuple = HloInstruction::CreateTuple({sub, sub, mul, c1});
 
-    auto module = CreateNewUnverifiedModule();
+    auto module = CreateNewVerifiedModule();
     auto* computation = module->AddEntryComputation(builder.Build());
     auto* fusion = computation->CreateFusionInstruction(
         {sub, mul, exp, clamp, add}, HloInstruction::FusionKind::kLoop);
@@ -429,7 +429,7 @@ TEST_F(FusionCostAnalysis, NoLayout) {
   auto add = builder.AddInstruction(HloInstruction::CreateBinary(
       shape_with_layout, HloOpcode::kAdd, c1, broadcast));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
   auto* fusion = computation->CreateFusionInstruction(
       {add, broadcast}, HloInstruction::FusionKind::kLoop);
@@ -472,7 +472,7 @@ TEST_F(DomainCostAnalysis, DomainCost) {
   auto domain = builder.AddInstruction(
       HloInstruction::CreateDomain(tuple->shape(), tuple, nullptr, nullptr));
 
-  auto hlo_module = CreateNewUnverifiedModule();
+  auto hlo_module = CreateNewVerifiedModule();
   hlo_module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(hlo_module->entry_computation()->root_instruction(), domain);
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
index 5dcf6bc985ff18fa6fc1ab5a5692914b4597d065..3ed3d3c11c71dc534f193ba3ffb556b0eb0c80e4 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.cc
@@ -466,6 +466,21 @@ bool HloDataflowAnalysis::UpdateDomainValueSet(HloInstruction* domain) {
   return changed;
 }
 
+bool HloDataflowAnalysis::UpdateAddDependencyValueSet(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand.
+  CHECK_EQ(add_dependency->opcode(), HloOpcode::kAddDependency);
+  const InstructionValueSet& operand_set =
+      GetInstructionValueSet(add_dependency->operand(0));
+  InstructionValueSet& add_dependency_set =
+      GetInstructionValueSet(add_dependency);
+  if (operand_set != add_dependency_set) {
+    add_dependency_set = operand_set;
+    return true;
+  }
+  return false;
+}
+
 bool HloDataflowAnalysis::UpdateGetTupleElementValueSet(HloInstruction* gte) {
   CHECK_EQ(gte->opcode(), HloOpcode::kGetTupleElement);
   bool changed = false;
@@ -622,6 +637,8 @@ bool HloDataflowAnalysis::UpdateInstructionValueSet(
     HloInstruction* instruction) {
   // Recompute from operands.
   switch (instruction->opcode()) {
+    case HloOpcode::kAddDependency:
+      return UpdateAddDependencyValueSet(instruction);
     case HloOpcode::kBitcast:
       return UpdateBitcastValueSet(instruction);
     case HloOpcode::kDomain:
@@ -795,6 +812,7 @@ Status HloDataflowAnalysis::InitializeInstructionValueSets() {
             define_all_values();
           }
           break;
+        case HloOpcode::kAddDependency:
         case HloOpcode::kWhile:
         case HloOpcode::kCall:
         case HloOpcode::kConditional:
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
index abac398c04fc4c418d8814a0097db4434bc1cd9c..ece17fc4c3ea0261474df5d53c088dd05016e1e4 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis.h
@@ -193,6 +193,7 @@ class HloDataflowAnalysis {
   bool UpdateSendValueSet(HloInstruction* send);
   bool UpdateTupleValueSet(HloInstruction* tuple);
   bool UpdateWhileValueSet(HloInstruction* xla_while);
+  bool UpdateAddDependencyValueSet(HloInstruction* add_dependency);
 
   // Propagate the dataflow through the module.
   void Propagate();
diff --git a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
index 6422346c1011b95bb511a1fcdfee5c84647f0571..f7a1f19a6f52befd58a405d0e406d7d0d37a8e57 100644
--- a/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dataflow_analysis_test.cc
@@ -43,7 +43,7 @@ using ::testing::UnorderedElementsAre;
 class HloDataflowAnalysisTest : public HloTestBase,
                                 public ::testing::WithParamInterface<bool> {
  protected:
-  HloDataflowAnalysisTest() : module_(CreateNewUnverifiedModule()) {}
+  HloDataflowAnalysisTest() : module_(CreateNewVerifiedModule()) {}
 
   // Run dataflow analysis on the member module. For convenience returns a
   // reference to the generated analysis stored in analysis_.
@@ -1877,6 +1877,30 @@ TEST_P(HloDataflowAnalysisTest, NestedConditionals) {
   }
 }
 
+TEST_P(HloDataflowAnalysisTest, AddDependency) {
+  string module_string = R"(
+HloModule AddDependency
+ENTRY %AddDependency (p: f32[3]) -> f32[3] {
+  %p = f32[3] parameter(0)
+  %token = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %p, token[] %token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloDataflowAnalysis> analysis,
+                          HloDataflowAnalysis::Run(*module));
+  const HloInstruction* root = module->entry_computation()->root_instruction();
+  EXPECT_EQ(root->opcode(), HloOpcode::kAddDependency);
+
+  // The after-all and parameter should define a value. Add-dependency should
+  // not.
+  EXPECT_EQ(analysis->values().size(), 2);
+  EXPECT_FALSE(analysis->ValueIsDefinedAt(root));
+}
+
 INSTANTIATE_TEST_CASE_P(HloDataflowAnalysisInstantiation,
                         HloDataflowAnalysisTest,
                         ::testing::Values(false, true));
diff --git a/tensorflow/compiler/xla/service/hlo_dce_test.cc b/tensorflow/compiler/xla/service/hlo_dce_test.cc
index 6c8095d39774b247e136442c92c8ecf17432701c..1fa4259a3e42286cbc911907eea563e6ca6f8611 100644
--- a/tensorflow/compiler/xla/service/hlo_dce_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_dce_test.cc
@@ -59,7 +59,7 @@ TEST_F(HloDceTest, NoDeadCode) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(3, computation->instruction_count());
@@ -110,7 +110,7 @@ TEST_F(HloDceTest, DeadParameters) {
   builder.AddInstruction(HloInstruction::CreateUnary(
       live_param->shape(), HloOpcode::kNegate, live_param));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   EXPECT_EQ(5, computation->instruction_count());
@@ -150,7 +150,7 @@ TEST_F(HloDceTest, ControlDependencies) {
   builder.AddInstruction(HloInstruction::CreateBinary(
       constant1->shape(), HloOpcode::kAdd, constant1, constant2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Add a control dependency between two instructions.
@@ -175,7 +175,7 @@ TEST_F(HloDceTest, ControlDependencies) {
 
 // Tests that a dead call instruction is removed.
 TEST_F(HloDceTest, DeadInstructionWithCalledComputation) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   Shape shape = ShapeUtil::MakeShape(F32, {});
 
   // Called computation for the call instruction.
@@ -323,7 +323,7 @@ TEST_F(HloDceTest, CalledComputationWithNestedSideEffect) {
 }
 
 TEST_F(HloDceTest, RemoveDeadSubcomputation) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   HloComputation::Builder builder(TestName());
 
   HloComputation::Builder subcomp_builder("reduction_subcomp");
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.cc b/tensorflow/compiler/xla/service/hlo_evaluator.cc
index 7fcafafc097a623686ca98a7cb3c6256c7904f6d..3a7652a8dc856b23c8988c4676916c8199e78860 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_query.h"
 #include "tensorflow/compiler/xla/service/shape_inference.h"
 #include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -396,6 +397,16 @@ StatusOr<Literal> HloEvaluator::EvaluateDotOp(
   return Evaluate(cloned_instruction.get());
 }
 
+Status HloEvaluator::HandleBitcast(HloInstruction* bitcast) {
+  const Literal& operand_literal = GetEvaluatedLiteralFor(bitcast->operand(0));
+  Literal result(bitcast->shape());
+  TF_RET_CHECK(operand_literal.size_bytes() == result.size_bytes());
+  memcpy(result.untyped_data(), operand_literal.untyped_data(),
+         operand_literal.size_bytes());
+  evaluated_[bitcast] = std::move(result);
+  return Status::OK();
+}
+
 Status HloEvaluator::HandleParameter(HloInstruction* parameter) {
   CHECK_LT(parameter->parameter_number(), arg_literals_.size());
   const Literal* input_literal = arg_literals_[parameter->parameter_number()];
@@ -1046,8 +1057,15 @@ Status HloEvaluator::HandleBroadcast(HloInstruction* broadcast) {
   return Status::OK();
 }
 
-Status HloEvaluator::HandleAfterAll(HloInstruction* token) {
-  evaluated_[token] = LiteralUtil::CreateToken();
+Status HloEvaluator::HandleAfterAll(HloInstruction* after_all) {
+  evaluated_[after_all] = LiteralUtil::CreateToken();
+  return Status::OK();
+}
+
+Status HloEvaluator::HandleAddDependency(HloInstruction* add_dependency) {
+  // AddDedendency just forwards its zero-th operand.
+  evaluated_[add_dependency] =
+      GetEvaluatedLiteralFor(add_dependency->operand(0)).Clone();
   return Status::OK();
 }
 
@@ -1279,10 +1297,10 @@ StatusOr<Literal> EvaluateSortInternal(HloInstruction* sort,
           key_value_vector.push_back(
               std::make_pair(keys_data[i], values_data[i]));
         }
-        std::sort(key_value_vector.begin(), key_value_vector.end(),
-                  [](const kv_pair& a, const kv_pair& b) {
-                    return SafeLess<KeyType>(a.first, b.first);
-                  });
+        std::stable_sort(key_value_vector.begin(), key_value_vector.end(),
+                         [](const kv_pair& a, const kv_pair& b) {
+                           return SafeLess<KeyType>(a.first, b.first);
+                         });
         std::vector<KeyType> result_keys;
         // We use a InlinedVector here because we need to convert it to an
         // absl::Span later, and this would not work with std::vector<bool>.
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator.h b/tensorflow/compiler/xla/service/hlo_evaluator.h
index 07f8d0aad4af0b07303b4e485b3630cc75bcb519..45ed8131dc6b71f706fce45d65b206363dd79ac3 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator.h
@@ -144,6 +144,8 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
   // Operations that are type-agnostic or always return a specific type, such as
   // HandleIsFinite where boolean is always returned.
   //
+  Status HandleBitcast(HloInstruction* bitcast) override;
+
   Status HandleParameter(HloInstruction* parameter) override;
 
   Status HandleConstant(HloInstruction* constant) override;
@@ -180,7 +182,9 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
 
   Status HandleBroadcast(HloInstruction* broadcast) override;
 
-  Status HandleAfterAll(HloInstruction* token) override;
+  Status HandleAfterAll(HloInstruction* after_all) override;
+
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status HandleSort(HloInstruction* sort) override;
 
@@ -221,16 +225,7 @@ class HloEvaluator : public DfsHloVisitorWithDefault {
       const Literal& operand_literal) {
     const auto shape = instruction->shape();
     const auto* operand = instruction->operand(0);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast is
-    // removed.
-    if (!ShapeUtil::SameDimensions(shape, operand->shape())) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s",
-          ShapeUtil::HumanString(shape),
-          ShapeUtil::HumanString(operand->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, operand->shape()));
 
     Literal result(shape);
     TF_RETURN_IF_ERROR(
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
index d95b6ad04f2c446b423a3aaef4de333ed2968883..4eaaab20ea0add17d9b49b1b2b97991af0438dcc 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_test.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <vector>
 
 #include "absl/memory/memory.h"
+#include "absl/strings/str_format.h"
 #include "tensorflow/compiler/xla/client/xla_builder.h"
 #include "tensorflow/compiler/xla/literal.h"
 #include "tensorflow/compiler/xla/reference_util.h"
@@ -35,6 +36,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/util.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
@@ -2765,6 +2767,33 @@ ENTRY main {
   EXPECT_TRUE(LiteralTestUtil::Equal(arg, actual));
 }
 
+TEST_P(HloEvaluatorTest, Bitcast) {
+  // Regression test for b/114735354.
+  constexpr absl::string_view hlo_text_base = R"(
+HloModule Bitcast
+
+ENTRY main {
+  param = %s[32,121]{1,0} parameter(0)
+  ROOT bitcast = %s[121,32,1]{0,1,2} bitcast(%s[32,121]{1,0} param)
+}
+)";
+  string hlo_text;
+  if (use_bfloat16_) {
+    hlo_text = absl::StrFormat(hlo_text_base, "bf16", "bf16", "bf16");
+  } else {
+    hlo_text = absl::StrFormat(hlo_text_base, "f32", "f32", "f32");
+  }
+  TF_ASSERT_OK_AND_ASSIGN(m_, ParseAndReturnVerifiedModule(hlo_text));
+  auto args = MakeFakeArguments(m_.get()).ConsumeValueOrDie();
+  Literal actual = Evaluate({&args[0]});
+  if (use_bfloat16_) {
+    EXPECT_TRUE(
+        absl::c_equal(args[0].data<bfloat16>(), actual.data<bfloat16>()));
+  } else {
+    EXPECT_TRUE(absl::c_equal(args[0].data<float>(), actual.data<float>()));
+  }
+}
+
 INSTANTIATE_TEST_CASE_P(HloEvaluatorTest_Instantiation, HloEvaluatorTest,
                         ::testing::ValuesIn(use_bf16_params));
 
diff --git a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
index ebed875eb4954bc9a9da3f182005fa3d44326493..b87fc3e34012e75ee07bff6c1e113dce404f83cb 100644
--- a/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
+++ b/tensorflow/compiler/xla/service/hlo_evaluator_typed_visitor.h
@@ -161,9 +161,6 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
                          HloOpcodeString(hlo_instruction->opcode()));
   }
 
-  // TODO(b/35950897): many of the stl functions used in the handlers are not
-  // overloaded for every XLA primitive type.
-
   template <typename NativeT,
             typename std::enable_if<std::is_unsigned<NativeT>::value>::type* =
                 nullptr>
@@ -596,7 +593,7 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     return Status::OK();
   }
 
-  Status HandleDivide(HloInstruction* divide) {
+  Status HandleDivide(HloInstruction* divide) override {
     return HandleDivide<ElementwiseT>(divide);
   }
 
@@ -1556,10 +1553,10 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
           const auto& row_data = row_to_sort.data<NativeT>();
 
           std::vector<NativeT> result_data(row_data.begin(), row_data.end());
-          std::sort(result_data.begin(), result_data.end(),
-                    [](const NativeT& a, const NativeT& b) {
-                      return SafeLess<NativeT>(a, b);
-                    });
+          std::stable_sort(result_data.begin(), result_data.end(),
+                           [](const NativeT& a, const NativeT& b) {
+                             return SafeLess<NativeT>(a, b);
+                           });
           Literal sorted_row(ShapeUtil::MakeShape(keys->shape().element_type(),
                                                   {sort_dim_elements}));
           sorted_row.PopulateR1(absl::Span<const NativeT>(result_data));
@@ -2546,12 +2543,14 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
 
   template <typename NativeT,
             typename std::enable_if<
-                std::is_same<NativeT, float>::value ||
-                std::is_same<NativeT, int32>::value ||
-                std::is_same<NativeT, uint32>::value>::type* = nullptr>
+                std::is_integral<NativeT>::value ||
+                std::is_floating_point<NativeT>::value>::type* = nullptr>
   Status HandleIota(HloInstruction* instruction) {
     auto* iota = Cast<HloIotaInstruction>(instruction);
-    std::vector<NativeT> data(iota->shape().dimensions(iota->iota_dimension()));
+    // Avoid using std::vector since std::vector<bool> does not convert to
+    // absl::Span<bool>.
+    absl::InlinedVector<NativeT, 1> data(
+        iota->shape().dimensions(iota->iota_dimension()));
     std::iota(data.begin(), data.end(), 0);
     auto result = LiteralUtil::CreateR1<NativeT>(data);
 
@@ -2568,9 +2567,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
   }
   template <typename NativeT,
             typename std::enable_if<
-                !(std::is_same<NativeT, float>::value ||
-                  std::is_same<NativeT, int32>::value ||
-                  std::is_same<NativeT, uint32>::value)>::type* = nullptr>
+                !(std::is_integral<NativeT>::value ||
+                  std::is_floating_point<NativeT>::value)>::type* = nullptr>
   Status HandleIota(HloInstruction* iota) {
     return InvalidArgument("Unsupported type for iota");
   }
@@ -2722,17 +2720,8 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto shape = instruction->shape();
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit broadcast
-    // is removed.
-    if (!(ShapeUtil::SameDimensions(shape, rhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
@@ -2756,19 +2745,9 @@ class HloEvaluatorTypedVisitor : public DfsHloVisitorWithDefault {
     const auto* lhs = instruction->operand(0);
     const auto* rhs = instruction->operand(1);
     const auto* ehs = instruction->operand(2);
-
-    // TODO(b/35950897, b/27796129): add DCHECK back once implicit
-    // broadcast is removed.
-    if (!(ShapeUtil::SameDimensions(shape, lhs->shape()) &&
-          ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()) &&
-          ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()))) {
-      return Unimplemented(
-          "Implicit broadcasting is currently unsupported in HLO evaluator "
-          "Shape Mismatch: %s vs %s vs %s vs %s: ",
-          ShapeUtil::HumanString(shape), ShapeUtil::HumanString(lhs->shape()),
-          ShapeUtil::HumanString(rhs->shape()),
-          ShapeUtil::HumanString(ehs->shape()));
-    }
+    TF_RET_CHECK(ShapeUtil::SameDimensions(shape, lhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(lhs->shape(), rhs->shape()));
+    TF_RET_CHECK(ShapeUtil::SameDimensions(rhs->shape(), ehs->shape()));
 
     const Literal& lhs_literal = parent_->GetEvaluatedLiteralFor(lhs);
     const Literal& rhs_literal = parent_->GetEvaluatedLiteralFor(rhs);
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c919dbd82d3668c477bf37074f1d56f8cb7d9506
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.cc
@@ -0,0 +1,61 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "absl/algorithm/container.h"
+#include "tensorflow/compiler/xla/literal_util.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/shape_inference.h"
+
+namespace xla {
+
+namespace {
+
+StatusOr<bool> ReplaceGetSize(HloInstruction* instr) {
+  if (instr->opcode() != HloOpcode::kGetDimensionSize) {
+    return false;
+  }
+  HloComputation* computation = instr->parent();
+
+  TF_ASSIGN_OR_RETURN(auto legal_shape,
+                      ShapeInference::InferGetDimensionSizeShape(
+                          instr->operand(0)->shape(), instr->dimension()));
+  TF_RET_CHECK(ShapeUtil::Equal(instr->shape(), legal_shape));
+  TF_RET_CHECK(ShapeUtil::HasPrimitiveType(instr->shape(), U32));
+  uint32 size = instr->operand(0)->shape().dimensions(instr->dimension());
+  HloInstruction* new_instr = computation->AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<uint32>(size)));
+  TF_RETURN_IF_ERROR(instr->ReplaceAllUsesWith(new_instr));
+  return true;
+}
+
+}  // namespace
+
+StatusOr<bool> HloGetDimensionSizeRewriter::Run(HloModule* module) {
+  bool changed = false;
+  HloProto proto;
+  *proto.mutable_hlo_module() = module->ToProto();
+  for (auto* computation : module->computations()) {
+    for (auto instruction : computation->instructions()) {
+      TF_ASSIGN_OR_RETURN(bool replaced, ReplaceGetSize(instruction));
+      changed = changed || replaced;
+    }
+  }
+  return changed;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..30f44c23a835b3bcc935caaa917e040e07c4e703
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
+
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_pass_interface.h"
+
+namespace xla {
+
+// Pass to replace a kGetDimensionSize instruction with a constant instruction.
+class HloGetDimensionSizeRewriter : public HloModulePass {
+ public:
+  absl::string_view name() const override {
+    return "hlo-get-dimension-size-rewriter";
+  }
+
+  StatusOr<bool> Run(HloModule* module) override;
+};
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_HLO_GET_DIMENSION_SIZE_REWRITER_H_
diff --git a/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a86aebdd5b64240e6e07d8e8050c0c8681cce765
--- /dev/null
+++ b/tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter_test.cc
@@ -0,0 +1,83 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/hlo_get_dimension_size_rewriter.h"
+
+#include "tensorflow/compiler/xla/service/hlo_computation.h"
+#include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/hlo_module.h"
+#include "tensorflow/compiler/xla/service/hlo_opcode.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/literal_test_util.h"
+#include "tensorflow/compiler/xla/tests/test_utils.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+namespace {
+
+namespace op = xla::testing::opcode_matchers;
+
+class HloGetDimensionSizeRewriterTest : public HloTestBase {
+ protected:
+  HloGetDimensionSizeRewriterTest() {}
+};
+
+TEST_F(HloGetDimensionSizeRewriterTest, Ok) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3,4] parameter(0)
+  size0 = u32[] get-dimension-size(p), dimensions={0}
+  size1 = u32[] get-dimension-size(p), dimensions={1}
+  ROOT mul = u32[] multiply(size0, size1)
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_TRUE(pass.Run(module.get()).ValueOrDie());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              op::Multiply(op::Constant(), op::Constant()));
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalType) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = s32[3]{0} parameter(0)
+  ROOT gds = s64[] get-dimension-size(p), dimensions={0}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+TEST_F(HloGetDimensionSizeRewriterTest, IllegalDimension) {
+  auto module = ParseHloString(R"(
+HloModule _
+ENTRY gds {
+  p = f32[2,5] parameter(0)
+  ROOT gds = u32[] get-dimension-size(p), dimensions={2}
+})")
+                    .ValueOrDie();
+  HloGetDimensionSizeRewriter pass;
+  EXPECT_FALSE(pass.Run(module.get()).ok());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
index 05cc1593e4ef4fc52b94e0536628645b1fa2abbc..302eca656be53a3cec86ddbf05a7fa3925c5185b 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include <deque>
 #include <map>
 #include <memory>
+#include <queue>
 #include <string>
 #include <tuple>
 #include <unordered_map>
@@ -38,6 +39,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_tfgraph_builder.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/window_util.h"
@@ -111,11 +113,6 @@ class NodeFilter {
            result == kSomeUsersOmitted;
   }
 
-  bool ShowFusionSubcomputation(const HloInstruction* instr) const {
-    CHECK_EQ(instr->opcode(), HloOpcode::kFusion);
-    return Show(instr) && !SomeOrAllOperandsOmitted(instr);
-  }
-
  private:
   std::function<NodeFilterResult(const HloInstruction* instr)> filter_;
 };
@@ -240,34 +237,28 @@ string HtmlLikeStringSanitize(absl::string_view s) {
 // it to a short string lets us tell the user what the subcomputation is without
 // drawing it as a graph.
 optional<string> MatchTrivialComputation(const HloComputation* computation) {
+  namespace m = match;
+
   if (computation->instruction_count() != 3) {
     return nullopt;
   }
-
   HloInstruction* root = computation->root_instruction();
-  if (root->operand_count() != 2) {
-    return nullopt;
-  }
-
-  // Check that both of the operands to the root are parameters.
-  const HloInstruction* operand0 = root->operand(0);
-  const HloInstruction* operand1 = root->operand(1);
-  if (operand0->opcode() != HloOpcode::kParameter ||
-      operand1->opcode() != HloOpcode::kParameter) {
-    return nullopt;
-  }
-
-  // Check that the two operands of root are param0 and param1.  All of the
-  // opcodes we recognize are commutative, so we're OK with either order.
-  auto n0 = operand0->parameter_number();
-  auto n1 = operand1->parameter_number();
-  if (!(n0 == 0 && n1 == 1) && !(n1 == 0 && n0 == 1)) {
+  const HloInstruction *param0, *param1;
+  if (!Match(root, m::Op()
+                       .WithNumOperands(2)
+                       .WithShape(m::Shape().IsEffectiveScalar())
+                       .WithBinaryOperandsAnyOrder(
+                           m::Parameter(&param0, 0)
+                               .WithShape(m::Shape().IsEffectiveScalar()),
+                           m::Parameter(&param1, 1)
+                               .WithShape(m::Shape().IsEffectiveScalar())))) {
     return nullopt;
   }
 
-  // If the params are reversed, check that the operation being performed is
-  // commutative.
-  if (n0 == 1) {
+  // If the params are reversed (i.e. operand0 is param1 and operand1 is
+  // param0), check that the operation being performed is commutative.
+  if (root->operand(0) == param1) {
+    CHECK_EQ(root->operand(1), param0);
     switch (root->opcode()) {
       case HloOpcode::kLe:
       case HloOpcode::kGe:
@@ -279,13 +270,6 @@ optional<string> MatchTrivialComputation(const HloComputation* computation) {
     }
   }
 
-  // Check that the root and params are all effective scalars.
-  if (!ShapeUtil::IsEffectiveScalar(root->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand0->shape()) ||
-      !ShapeUtil::IsEffectiveScalar(operand1->shape())) {
-    return nullopt;
-  }
-
   // If we recognize the root's opcode, we've successfully pattern-matched!
   switch (root->opcode()) {
     case HloOpcode::kAdd:
@@ -578,7 +562,7 @@ bool HloDotDumper::ShouldShowSubcomputation(const HloComputation* subcomp) {
 
   // Show the subcomputation if we're showing any of its members.
   return std::any_of(
-      computation_->instructions().begin(), computation_->instructions().end(),
+      subcomp->instructions().begin(), subcomp->instructions().end(),
       [&](const HloInstruction* instr) { return filter_.Show(instr); });
 }
 
@@ -987,6 +971,7 @@ ColorScheme HloDotDumper::GetInstructionColor(const HloInstruction* instr) {
     case HloOpcode::kGetTupleElement:
     case HloOpcode::kTrace:
     case HloOpcode::kAfterAll:
+    case HloOpcode::kAddDependency:
     case HloOpcode::kTuple:
       return kWhite;
     case HloOpcode::kBroadcast:
@@ -1267,12 +1252,12 @@ const HloInstruction* HloDotDumper::GetNodeForEdge(
 
 class GraphRendererRegistry {
  public:
-  void AddRenderer(GraphRendererInterface* graph_renderer) {
+  void SetRenderer(std::shared_ptr<GraphRendererInterface> graph_renderer) {
     tensorflow::mutex_lock lock(mu_);
     graph_renderer_ = graph_renderer;
   }
 
-  GraphRendererInterface* GetDefaultRenderer() {
+  std::shared_ptr<GraphRendererInterface> GetDefaultRenderer() {
     tensorflow::mutex_lock lock(mu_);
     return graph_renderer_;
   }
@@ -1284,20 +1269,21 @@ class GraphRendererRegistry {
 
  private:
   tensorflow::mutex mu_;
-  GraphRendererInterface* graph_renderer_ = nullptr;
+  std::shared_ptr<GraphRendererInterface> graph_renderer_ GUARDED_BY(mu_);
 };
 
 }  // namespace
 
-Registrar::Registrar(GraphRendererInterface* dumper) {
-  GraphRendererRegistry::Default()->AddRenderer(dumper);
+Registrar::Registrar(std::shared_ptr<GraphRendererInterface> dumper) {
+  GraphRendererRegistry::Default()->SetRenderer(dumper);
 }
 
 namespace {
 
 // Gets a NodeFilter that includes roughly all instructions whose distance from
 // root is <= radius.
-NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
+NodeFilter MakeNodeRadiusAroundFilter(const HloInstruction* root,
+                                      int64 radius) {
   // First, find the neighborhood of nodes with distance from root <= radius.
   // These nodes are our initial set of "normal" nodes.
   std::unordered_map<const HloInstruction*, NodeFilterResult> nodes;
@@ -1404,6 +1390,56 @@ NodeFilter MakeNodeFilter(const HloInstruction* root, int64 radius) {
   });
 }
 
+// Gets a node filter that includes nodes on all paths from `from` to `to`.  If
+// the all-paths set contains more than max_nodes elements, includes the nodes
+// on the shortest paths and sets hit_limit to true.
+NodeFilter MakeNodeFromToFilter(const HloInstruction* from,
+                                const HloInstruction* to, int64 max_nodes,
+                                bool* hit_limit) {
+  *hit_limit = false;
+
+  // Elements in the queue are paths through the graph.
+  std::deque<std::vector<const HloInstruction*>> queue;
+  queue.push_front({from});
+
+  // Compute the set of nodes we want to show using a slightly-modified
+  // Djikstra's algorithm.  The only real difference is, rather than stopping
+  // when we find a (shortest) path, we continue until we've found max_nodes
+  // nodes on some path.
+  std::unordered_set<const HloInstruction*> visited;
+  std::unordered_set<const HloInstruction*> to_display = {from, to};
+  while (!queue.empty() && to_display.size() < max_nodes) {
+    std::vector<const HloInstruction*> path = std::move(queue.front());
+    queue.pop_front();
+    if (!visited.insert(path.back()).second) {
+      continue;
+    }
+
+    for (const auto* user : path.back()->users()) {
+      if (user == to) {
+        auto it = path.begin();
+        for (; it != path.end() && to_display.size() < max_nodes; ++it) {
+          to_display.insert(*it);
+        }
+        if (it != path.end()) {
+          *hit_limit = true;
+        }
+      } else if (!visited.count(user)) {
+        auto new_path = path;
+        new_path.push_back(user);
+        queue.push_back(std::move(new_path));
+      }
+    }
+  }
+
+  return NodeFilter([=](const HloInstruction* instr) {
+    if (instr == from || instr == to) {
+      return kHighlightNode;
+    }
+    return to_display.count(instr) ? kNormalNode : kHideNode;
+  });
+}
+
 string SaveGraph(const string& graph,
                  GraphRendererInterface::GraphKind graph_kind,
                  const string& dest_path) {
@@ -1483,7 +1519,7 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   auto debug_options = node.GetModule()->config().debug_options();
   string label =
       StrCat("Neighborhood of ", radius, " nodes around ", node.name());
-  NodeFilter filter = MakeNodeFilter(&node, radius);
+  NodeFilter filter = MakeNodeRadiusAroundFilter(&node, radius);
   string graph =
       HloDotDumper(node.parent(), label, debug_options, show_backend_config,
                    /*profile=*/nullptr, filter)
@@ -1491,6 +1527,29 @@ string DumpNeighborhoodAround(const HloInstruction& node, int radius,
   return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
 }
 
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config) {
+  CHECK_EQ(from.parent(), to.parent()) << "Nodes must be in same computation!";
+  auto debug_options = from.GetModule()->config().debug_options();
+
+  bool hit_limit = false;
+  NodeFilter filter = MakeNodeFromToFilter(&from, &to, max_nodes, &hit_limit);
+  string label;
+  if (!hit_limit) {
+    label = StrCat("All paths from ", from.name(), " to ", to.name());
+  } else {
+    label = StrCat(max_nodes, " nodes on the shortest paths from ", from.name(),
+                   " to ", to.name(),
+                   "<br/><br/>***SHOWING ONLY A SUBSET OF ALL PATHS BETWEEN "
+                   "NODES***<br/><br/>");
+  }
+  string graph =
+      HloDotDumper(from.parent(), label, debug_options, show_backend_config,
+                   /*profile=*/nullptr, filter)
+          .Dump();
+  return ExportGraph(graph, GraphRendererInterface::DOT_GRAPH, debug_options);
+}
+
 void DumpText(const HloModule& module, const string& label,
               const string& directory_path, bool do_prefix) {
   Env* env = Env::Default();
diff --git a/tensorflow/compiler/xla/service/hlo_graph_dumper.h b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
index 0b11f34abb7f0d937a24d11f4dc5d2d6a0aae6e7..de1eefab776f9c3d2c73959a5cd267e938a78a32 100644
--- a/tensorflow/compiler/xla/service/hlo_graph_dumper.h
+++ b/tensorflow/compiler/xla/service/hlo_graph_dumper.h
@@ -66,6 +66,12 @@ string DumpGraph(const HloComputation& computation, const string& label,
 string DumpNeighborhoodAround(const HloInstruction& node, int radius,
                               bool show_backend_config = false);
 
+// Dumps nodes on any of the paths from `from` to `to`.  If there are more than
+// max_nodes on all paths, restricts to the max_nodes nodes on the shortest
+// paths.
+string DumpAllPathsFromTo(const HloInstruction& from, const HloInstruction& to,
+                          int64 max_nodes, bool show_backend_config = false);
+
 // Dumps the HloModule::ToString() as a file into the provided directory path
 // suffixed with the provided label.
 //
@@ -87,13 +93,13 @@ void DumpText(const HloModule& module, const string& label,
 // Class that registers a graph renderer.
 class Registrar {
  public:
-  Registrar(GraphRendererInterface* dumper);
+  Registrar(std::shared_ptr<GraphRendererInterface> dumper);
 };
 
-#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...)   \
-  static ::xla::hlo_graph_dumper::Registrar                       \
-      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(new factory, \
-                                                     ##__VA_ARGS__)
+#define XLA_INTERNAL_REGISTER_GRAPH_RENDERER(factory, ctr, ...) \
+  static ::xla::hlo_graph_dumper::Registrar                     \
+      XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr)(           \
+          std::make_shared<factory>(), ##__VA_ARGS__)
 
 // __COUNTER__ must go through another macro to be properly expanded
 #define XLA_INTERNAL_REGISTER_GRAPH_RENDERER_NAME(ctr) ___##ctr##__object_
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.cc b/tensorflow/compiler/xla/service/hlo_instruction.cc
index 26786ee950b5421f79fc089d65f1395aae65d335..21b1dbc1676cccd2fe5b331a1f9d6ff5e3a73fcd 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.cc
+++ b/tensorflow/compiler/xla/service/hlo_instruction.cc
@@ -93,7 +93,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       [&computation_map](int64 id) { return computation_map.contains(id); }))
       << proto.name() << " instruction references invalid computation id(s)";
 
-  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(proto.shape()));
+  Shape shape(proto.shape());
+  TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape));
 
   switch (opcode) {
     // Ops migrated to subclasses.
@@ -101,23 +102,23 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 3)
           << "BatchNormTraining instruction should have 3 operands but sees "
           << proto.operand_ids_size();
-      instruction = CreateBatchNormTraining(
-          proto.shape(), operands(0), operands(1), operands(2), proto.epsilon(),
-          proto.feature_index());
+      instruction =
+          CreateBatchNormTraining(shape, operands(0), operands(1), operands(2),
+                                  proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormInference:
       TF_RET_CHECK(proto.operand_ids_size() == 5)
           << "BatchNormInference instruction should have 5 operands but sees "
           << proto.operand_ids_size();
       instruction = CreateBatchNormInference(
-          proto.shape(), operands(0), operands(1), operands(2), operands(3),
+          shape, operands(0), operands(1), operands(2), operands(3),
           operands(4), proto.epsilon(), proto.feature_index());
       break;
     case HloOpcode::kBatchNormGrad:
       TF_RET_CHECK(proto.operand_ids_size() == 5)
           << "BatchNormGrad instruction should have 5 operands but sees "
           << proto.operand_ids_size();
-      instruction = CreateBatchNormGrad(proto.shape(), operands(0), operands(1),
+      instruction = CreateBatchNormGrad(shape, operands(0), operands(1),
                                         operands(2), operands(3), operands(4),
                                         proto.epsilon(), proto.feature_index());
       break;
@@ -127,7 +128,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       std::vector<int64> fft_length(proto.fft_length().begin(),
                                     proto.fft_length().end());
-      instruction = CreateFft(proto.shape(), operands(0), proto.fft_type(),
+      instruction = CreateFft(shape, operands(0), proto.fft_type(),
                               absl::Span<const int64>(fft_length));
       break;
     }
@@ -148,7 +149,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Recv instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateRecv(proto.shape().tuple_shapes(0), operands(0),
+      instruction = CreateRecv(shape.tuple_shapes(0), operands(0),
                                proto.channel_id(), proto.is_host_transfer());
       break;
     case HloOpcode::kRecvDone:
@@ -161,7 +162,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Reverse instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateReverse(proto.shape(), operands(0),
+      instruction = CreateReverse(shape, operands(0),
                                   std::vector<int64>(proto.dimensions().begin(),
                                                      proto.dimensions().end()));
       break;
@@ -170,7 +171,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Concatenate instruction should have 1 dimension but sees "
           << proto.dimensions_size();
       instruction =
-          CreateConcatenate(proto.shape(), all_operands(), proto.dimensions(0));
+          CreateConcatenate(shape, all_operands(), proto.dimensions(0));
       break;
     case HloOpcode::kReduce:
       TF_RET_CHECK(proto.operand_ids_size() % 2 == 0)
@@ -188,7 +189,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
             absl::MakeSpan(reduce_operands)
                 .subspan(reduce_operands.size() / 2, reduce_operands.size());
         instruction =
-            CreateReduce(proto.shape(), inputs, init_values,
+            CreateReduce(shape, inputs, init_values,
                          std::vector<int64>(proto.dimensions().begin(),
                                             proto.dimensions().end()),
                          computations(0));
@@ -203,7 +204,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       auto sort_operands = all_operands();
       HloInstruction* keys = sort_operands[0];
       instruction = CreateSort(
-          proto.shape(), proto.dimensions(0), keys,
+          shape, proto.dimensions(0), keys,
           absl::Span<HloInstruction* const>(sort_operands).subspan(1));
       break;
     }
@@ -212,7 +213,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Transpose instruction should have 1 operand but sees "
           << proto.operand_ids_size();
       instruction =
-          CreateTranspose(proto.shape(), operands(0),
+          CreateTranspose(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
@@ -221,7 +222,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Broadcast instruction should have 1 operand but sees "
           << proto.operand_ids_size();
       instruction =
-          CreateBroadcast(proto.shape(), operands(0),
+          CreateBroadcast(shape, operands(0),
                           std::vector<int64>(proto.dimensions().begin(),
                                              proto.dimensions().end()));
       break;
@@ -229,7 +230,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "Map instruction should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateMap(proto.shape(), all_operands(), computations(0));
+      instruction = CreateMap(shape, all_operands(), computations(0));
       break;
     case HloOpcode::kSlice: {
       TF_RET_CHECK(proto.operand_ids_size() == 1)
@@ -242,8 +243,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         slice_limits.push_back(slice_dimensions.limit());
         slice_strides.push_back(slice_dimensions.stride());
       }
-      instruction = CreateSlice(proto.shape(), operands(0), slice_starts,
-                                slice_limits, slice_strides);
+      instruction = CreateSlice(shape, operands(0), slice_starts, slice_limits,
+                                slice_strides);
       break;
     }
     case HloOpcode::kConstant: {
@@ -253,7 +254,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
                             Literal::CreateFromProto(proto.literal()));
         instruction = CreateConstant(std::move(literal));
       } else {
-        instruction = absl::make_unique<HloConstantInstruction>(proto.shape());
+        instruction = absl::make_unique<HloConstantInstruction>(shape);
       }
       break;
     }
@@ -284,55 +285,54 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           tensorflow::gtl::FindPtrOrNull(computation_map, fusion_id);
       TF_RET_CHECK(fused_computation != nullptr)
           << "No fusion computation with id " << fusion_id;
-      instruction = CreateFusion(proto.shape(), fusion_kind, all_operands(),
-                                 fused_computation);
+      instruction =
+          CreateFusion(shape, fusion_kind, all_operands(), fused_computation);
       break;
     }
     case HloOpcode::kRng:
-      instruction =
-          CreateRng(proto.shape(), proto.distribution(), all_operands());
+      instruction = CreateRng(shape, proto.distribution(), all_operands());
       break;
     case HloOpcode::kParameter:
-      instruction = CreateParameter(proto.parameter_number(), proto.shape(),
-                                    proto.name());
+      instruction =
+          CreateParameter(proto.parameter_number(), shape, proto.name());
       break;
     case HloOpcode::kGetTupleElement:
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "GetTupleElement instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction = CreateGetTupleElement(proto.shape(), operands(0),
-                                          proto.tuple_index());
+      instruction =
+          CreateGetTupleElement(shape, operands(0), proto.tuple_index());
       break;
     case HloOpcode::kReducePrecision:
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "ReducePrecision instruction should have 1 operand but sees "
           << proto.operand_ids_size();
-      instruction =
-          CreateReducePrecision(proto.shape(), operands(0),
-                                proto.exponent_bits(), proto.mantissa_bits());
+      instruction = CreateReducePrecision(
+          shape, operands(0), proto.exponent_bits(), proto.mantissa_bits());
       break;
     case HloOpcode::kInfeed: {
-      TF_RET_CHECK(ShapeUtil::IsTuple(proto.shape()) &&
-                   (ShapeUtil::TupleElementCount(proto.shape()) == 2))
+      TF_RET_CHECK(ShapeUtil::IsTuple(shape) &&
+                   (ShapeUtil::TupleElementCount(shape) == 2))
           << "Infeed should have a tuple shape with 2 operands, but has: "
-          << proto.shape();
-      const Shape& data_shape =
-          ShapeUtil::GetTupleElementShape(proto.shape(), 0);
+          << shape;
+      const Shape& data_shape = ShapeUtil::GetTupleElementShape(shape, 0);
       TF_RET_CHECK(proto.operand_ids_size() == 1)
           << "Infeed instruction should have 1 operand but sees "
           << proto.operand_ids_size();
       instruction =
           CreateInfeed(data_shape, operands(0), proto.infeed_config());
     } break;
-    case HloOpcode::kOutfeed:
+    case HloOpcode::kOutfeed: {
       TF_RET_CHECK(proto.operand_ids_size() == 2)
           << "Outfeed instruction should have 2 operands but sees "
           << proto.operand_ids_size();
+      Shape outfeed_shape(proto.outfeed_shape());
       TF_RETURN_IF_ERROR(
-          ShapeUtil::ValidateShapeWithOptionalLayout(proto.outfeed_shape()));
-      instruction = CreateOutfeed(proto.outfeed_shape(), operands(0),
-                                  operands(1), proto.outfeed_config());
+          ShapeUtil::ValidateShapeWithOptionalLayout(outfeed_shape));
+      instruction = CreateOutfeed(outfeed_shape, operands(0), operands(1),
+                                  proto.outfeed_config());
       break;
+    }
     case HloOpcode::kCrossReplicaSum: {
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "CrossReplicaSum should have 1 called computation but sees "
@@ -342,7 +342,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         all_reduce_id = proto.all_reduce_id();
       }
       instruction = CreateCrossReplicaSum(
-          proto.shape(), all_operands(), computations(0),
+          shape, all_operands(), computations(0),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
                                     proto.replica_groups().end()),
@@ -352,7 +352,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     }
     case HloOpcode::kAllToAll: {
       instruction = CreateAllToAll(
-          proto.shape(), all_operands(),
+          shape, all_operands(),
           /*replica_groups=*/
           std::vector<ReplicaGroup>(proto.replica_groups().begin(),
                                     proto.replica_groups().end()));
@@ -368,8 +368,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         source_target_pairs[i].first = proto.source_target_pairs(i).source();
         source_target_pairs[i].second = proto.source_target_pairs(i).target();
       }
-      instruction = CreateCollectivePermute(proto.shape(), operands(0),
-                                            source_target_pairs);
+      instruction =
+          CreateCollectivePermute(shape, operands(0), source_target_pairs);
       break;
     }
     case HloOpcode::kConvolution: {
@@ -382,7 +382,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
       instruction = CreateConvolve(
-          proto.shape(), operands(0), operands(1),
+          shape, operands(0), operands(1),
           std::max<int64>(proto.feature_group_count(), 1), proto.window(),
           proto.convolution_dimension_numbers(), precision_config);
       break;
@@ -394,7 +394,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 1)
           << "ReduceWindow should have 1 called computation but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateReduceWindow(proto.shape(), operands(0), operands(1),
+      instruction = CreateReduceWindow(shape, operands(0), operands(1),
                                        proto.window(), computations(0));
       break;
     case HloOpcode::kSelectAndScatter:
@@ -404,9 +404,9 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       TF_RET_CHECK(proto.called_computation_ids_size() == 2)
           << "SelectAndScatter should have 2 called computations but sees "
           << proto.called_computation_ids_size();
-      instruction = CreateSelectAndScatter(
-          proto.shape(), operands(0), computations(0), proto.window(),
-          operands(1), operands(2), computations(1));
+      instruction = CreateSelectAndScatter(shape, operands(0), computations(0),
+                                           proto.window(), operands(1),
+                                           operands(2), computations(1));
       break;
     case HloOpcode::kCustomCall:
       if (proto.constrain_layout()) {
@@ -414,16 +414,17 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         // vector of pointers essentially) so create a vector of shapes to pass
         // in.
         std::vector<Shape> operand_shapes;
-        for (const Shape& shape : proto.operand_shapes_with_layout()) {
-          operand_shapes.push_back(shape);
+        for (const ShapeProto& shape_proto :
+             proto.operand_shapes_with_layout()) {
+          operand_shapes.emplace_back(shape_proto);
         }
-        instruction = CreateCustomCall(
-            proto.shape(), all_operands(), proto.custom_call_target(),
-            operand_shapes, proto.custom_call_opaque());
+        instruction =
+            CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
+                             operand_shapes, proto.custom_call_opaque());
       } else {
-        instruction = CreateCustomCall(proto.shape(), all_operands(),
-                                       proto.custom_call_target(),
-                                       proto.custom_call_opaque());
+        instruction =
+            CreateCustomCall(shape, all_operands(), proto.custom_call_target(),
+                             proto.custom_call_opaque());
       }
       if (proto.has_window()) {
         static_cast<HloCustomCallInstruction*>(instruction.get())
@@ -443,8 +444,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << "Pad instruction should have 2 operands but sees "
           << proto.operand_ids_size();
       TF_RET_CHECK(proto.has_padding_config());
-      instruction = CreatePad(proto.shape(), operands(0), operands(1),
-                              proto.padding_config());
+      instruction =
+          CreatePad(shape, operands(0), operands(1), proto.padding_config());
       break;
     case HloOpcode::kDynamicSlice: {
       TF_RET_CHECK(proto.operand_ids_size() == 2)
@@ -452,8 +453,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
           << proto.operand_ids_size();
       std::vector<int64> slice_sizes(proto.dynamic_slice_sizes_size());
       absl::c_copy(proto.dynamic_slice_sizes(), slice_sizes.begin());
-      instruction = CreateDynamicSlice(proto.shape(), operands(0), operands(1),
-                                       slice_sizes);
+      instruction =
+          CreateDynamicSlice(shape, operands(0), operands(1), slice_sizes);
       break;
     }
     case HloOpcode::kGather: {
@@ -469,7 +470,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       for (int64 bound : proto.gather_slice_sizes()) {
         gather_slice_sizes.push_back(bound);
       }
-      instruction = CreateGather(proto.shape(), operands(0), operands(1),
+      instruction = CreateGather(shape, operands(0), operands(1),
                                  *gather_dimension_numbers, gather_slice_sizes);
       break;
     }
@@ -485,16 +486,15 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       auto scatter_dimension_numbers =
           absl::make_unique<ScatterDimensionNumbers>(
               proto.scatter_dimension_numbers());
-      instruction =
-          CreateScatter(proto.shape(), operands(0), operands(1), operands(2),
-                        computations(0), *scatter_dimension_numbers);
+      instruction = CreateScatter(shape, operands(0), operands(1), operands(2),
+                                  computations(0), *scatter_dimension_numbers);
       break;
     }
     case HloOpcode::kIota:
       TF_RET_CHECK(proto.dimensions_size() == 1)
           << "Iota instruction should have 1 dimension but sees "
           << proto.dimensions_size();
-      instruction = CreateIota(proto.shape(), proto.dimensions(0));
+      instruction = CreateIota(shape, proto.dimensions(0));
       break;
     case HloOpcode::kDot: {
       TF_RET_CHECK(proto.has_dot_dimension_numbers())
@@ -506,8 +506,8 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
       precision_config.mutable_operand_precision()->Resize(
           proto.operand_ids_size(), PrecisionConfig::DEFAULT);
       instruction = absl::make_unique<HloDotInstruction>(
-          proto.shape(), operands(0), operands(1),
-          proto.dot_dimension_numbers(), precision_config);
+          shape, operands(0), operands(1), proto.dot_dimension_numbers(),
+          precision_config);
       break;
     }
     case HloOpcode::kDomain: {
@@ -529,7 +529,7 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
         exit_hlo_sharding = std::make_shared<const HloSharding>(sharding);
       }
       instruction = absl::make_unique<HloDomainInstruction>(
-          proto.shape(), operands(0),
+          shape, operands(0),
           absl::make_unique<ShardingMetadata>(entry_hlo_sharding),
           absl::make_unique<ShardingMetadata>(exit_hlo_sharding));
       break;
@@ -537,11 +537,11 @@ StatusOr<std::unique_ptr<HloInstruction>> HloInstruction::CreateFromProto(
     case HloOpcode::kGetDimensionSize:
       TF_RET_CHECK(proto.operand_ids_size() == 1);
       TF_RET_CHECK(proto.dimensions_size() == 1);
-      instruction = CreateGetDimensionSize(proto.shape(), operands(0),
-                                           proto.dimensions(0));
+      instruction =
+          CreateGetDimensionSize(shape, operands(0), proto.dimensions(0));
       break;
     default: {
-      instruction = absl::WrapUnique(new HloInstruction(opcode, proto.shape()));
+      instruction = absl::WrapUnique(new HloInstruction(opcode, shape));
       for (const int64 operand_id : proto.operand_ids()) {
         instruction->AppendOperand(instruction_map.at(operand_id));
       }
@@ -855,6 +855,16 @@ HloInstruction::CreateCollectivePermute(
       new HloInstruction(HloOpcode::kAfterAll, ShapeUtil::MakeTokenShape()));
 }
 
+/* static */ std::unique_ptr<HloInstruction>
+HloInstruction::CreateAddDependency(HloInstruction* data_operand,
+                                    HloInstruction* token_operand) {
+  auto instruction = absl::WrapUnique(
+      new HloInstruction(HloOpcode::kAddDependency, data_operand->shape()));
+  instruction->AppendOperand(data_operand);
+  instruction->AppendOperand(token_operand);
+  return instruction;
+}
+
 /* static */ std::unique_ptr<HloInstruction> HloInstruction::CreateWhile(
     const Shape& shape, HloComputation* condition, HloComputation* body,
     HloInstruction* init) {
@@ -1394,6 +1404,10 @@ std::unique_ptr<HloInstruction> HloInstruction::CloneWithNewOperands(
         clone = CreateAfterAll(new_operands);
       }
       break;
+    case HloOpcode::kAddDependency:
+      CHECK_EQ(new_operands.size(), 2);
+      clone = CreateAddDependency(new_operands[0], new_operands[1]);
+      break;
   }
   // SetupDerivedInstruction will setup the precision_config_ field.
   SetupDerivedInstruction(clone.get());
@@ -1680,6 +1694,7 @@ bool HloInstruction::IdenticalSlowPath(
 
     // This opcode has complex or special behavior so just return false.
     case HloOpcode::kAfterAll:
+    case HloOpcode::kAddDependency:
       return false;
 
     // Remaining instructions with special values.
@@ -1745,6 +1760,26 @@ bool HloInstruction::IdenticalSlowPath(
   return false;
 }
 
+uint64 HloInstruction::Hash() const {
+  using tensorflow::Hash64Combine;
+
+  uint64 hash_value = Hash64Combine(0, static_cast<uint64>(opcode()));
+  hash_value = Hash64Combine(hash_value, ShapeUtil::Hash(shape()));
+
+  if (!IsCrossModuleAllReduce()) {
+    if (!operands().empty()) {
+      for (size_t i = 0; i < operands().size(); ++i) {
+        hash_value = Hash64Combine(hash_value, operand(i)->Hash());
+      }
+    }
+  }
+
+  hash_value = Hash64Combine(hash_value, InnerHash());
+  return hash_value;
+}
+
+uint64 HloInstruction::InnerHash() const { return 13; }
+
 void HloInstruction::RemoveUser(HloInstruction* user) {
   auto set_it = user_set_.find(user);
   CHECK(set_it != user_set_.end());
@@ -1900,6 +1935,11 @@ void HloInstruction::set_while_body(HloComputation* computation) {
   called_computations_[kBodyComputationIndex] = computation;
 }
 
+HloInstruction* HloInstruction::while_init() const {
+  CHECK_EQ(HloOpcode::kWhile, opcode_);
+  return operands_[0];
+}
+
 HloComputation* HloInstruction::true_computation() const {
   CHECK_EQ(HloOpcode::kConditional, opcode_);
   return called_computations_[kTrueComputationIndex];
@@ -2214,7 +2254,7 @@ HloInstructionProto HloInstruction::ToProto() const {
   proto.set_id(unique_id_);
   proto.set_name(name_);
   proto.set_opcode(HloOpcodeString(opcode_));
-  *proto.mutable_shape() = shape_;
+  *proto.mutable_shape() = shape_.ToProto();
   for (const HloInstruction* operand : operands_) {
     proto.add_operand_ids(operand->unique_id());
   }
@@ -2462,6 +2502,8 @@ Status HloInstruction::Visit(DfsHloVisitorBase<HloInstructionPtr>* visitor) {
       return visitor->HandleDomain(this);
     case HloOpcode::kAfterAll:
       return visitor->HandleAfterAll(this);
+    case HloOpcode::kAddDependency:
+      return visitor->HandleAddDependency(this);
     case HloOpcode::kIota:
       return visitor->HandleIota(this);
     case HloOpcode::kGetDimensionSize:
@@ -2623,36 +2665,6 @@ Status HloInstruction::AcceptWithOperandOrder(
   return Status::OK();
 }
 
-namespace {
-
-// Returns true if the given order is a topological sort of the instructions
-// it contains.
-bool OrderIsTopologicalSort(const std::vector<const HloInstruction*>& order) {
-  // Create a map from instruction to its position in 'order'.
-  std::unordered_map<const HloInstruction*, int> order_position;
-  for (int i = 0; i < order.size(); i++) {
-    if (!order_position.insert({order[i], i}).second) {
-      // Instruction order[i] is duplicated in the order.
-      return false;
-    }
-  }
-  // Verify that the operand of each instruction in the order is also in the
-  // order *and* the operand's position is earlier (defs are before uses for
-  // all ops).
-  for (auto* instruction : order) {
-    for (auto* operand : instruction->operands()) {
-      if (!ContainsKey(order_position, operand) ||
-          order_position.at(operand) >= order_position.at(instruction)) {
-        return false;
-      }
-    }
-  }
-
-  return true;
-}
-
-}  // namespace
-
 Status HloInstruction::Accept(
     const std::function<Status(HloInstruction*)>& visitor_func) {
   FunctionVisitor visitor(visitor_func);
@@ -3022,6 +3034,16 @@ const PrecisionConfig& HloInstruction::precision_config() const {
   LOG(FATAL) << "Unimplemented method.";
 }
 
+PrecisionConfig* HloInstruction::mutable_precision_config() {
+  if (auto* convolution = DynCast<HloConvolutionInstruction>(this)) {
+    return convolution->mutable_precision_config();
+  }
+  if (auto* dot = DynCast<HloDotInstruction>(this)) {
+    return dot->mutable_precision_config();
+  }
+  LOG(FATAL) << "Unimplemented method.";
+}
+
 HloModule* HloInstruction::GetModule() const {
   if (parent_) {
     return parent_->parent();
@@ -3064,6 +3086,10 @@ int64 HloInstruction::concatenate_dimension() const {
   return Cast<HloConcatenateInstruction>(this)->concatenate_dimension();
 }
 
+int64 HloInstruction::dimension() const {
+  return Cast<HloGetDimensionSizeInstruction>(this)->dimension();
+}
+
 bool HloInstruction::IsRank2Transpose() const {
   auto transpose = DynCast<HloTransposeInstruction>(this);
   return transpose != nullptr && transpose->IsRank2Transpose();
@@ -3243,6 +3269,11 @@ absl::optional<int64> HloInstruction::all_reduce_id() const {
   return Cast<HloAllReduceInstruction>(this)->all_reduce_id();
 }
 
+void HloInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  return Cast<HloAllReduceInstruction>(this)->set_all_reduce_id(all_reduce_id);
+}
+
 const ConvolutionDimensionNumbers&
 HloInstruction::convolution_dimension_numbers() const {
   if (auto convolution = DynCast<HloConvolutionInstruction>(this)) {
diff --git a/tensorflow/compiler/xla/service/hlo_instruction.h b/tensorflow/compiler/xla/service/hlo_instruction.h
index 818d4ede0f30f06d390daa70c508c6be6bbc38ce..a54716217d6bbc5c0601f5d9ff7bf4072a6b30f5 100644
--- a/tensorflow/compiler/xla/service/hlo_instruction.h
+++ b/tensorflow/compiler/xla/service/hlo_instruction.h
@@ -770,6 +770,9 @@ class HloInstruction {
   static std::unique_ptr<HloInstruction> CreateGetDimensionSize(
       const Shape& shape, HloInstruction* operand, int64 dimension);
 
+  static std::unique_ptr<HloInstruction> CreateAddDependency(
+      HloInstruction* data_operand, HloInstruction* token_operand);
+
   // Returns the opcode for this instruction.
   HloOpcode opcode() const { return opcode_; }
 
@@ -883,11 +886,15 @@ class HloInstruction {
       return false;
     }
 
-    // Use an explicit loop rather than ContainerEquals, because copying around
-    // std::functions may be too expensive in some cases.
-    for (size_t i = 0; i < operands().size(); ++i) {
-      if (!eq_operands(operand(i), other.operand(i))) {
-        return false;
+    // Two AllReduces are Identical if they have the same all_reduce_id.
+    // Their operands don't have to be Identical.
+    if (!IsCrossModuleAllReduce()) {
+      // Use an explicit loop rather than ContainerEquals, because copying
+      // around std::functions may be too expensive in some cases.
+      for (size_t i = 0; i < operands().size(); ++i) {
+        if (!eq_operands(operand(i), other.operand(i))) {
+          return false;
+        }
       }
     }
 
@@ -898,6 +905,12 @@ class HloInstruction {
     return IdenticalSlowPath(other, eq_computations);
   }
 
+  // Generates a hash value of an HLO instruction. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO instructions,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const;
+
   // Returns whether the instruction has a constant operand.
   bool HasConstantOperand() const;
 
@@ -997,6 +1010,8 @@ class HloInstruction {
   void set_while_condition(HloComputation* while_condition);
   void set_while_body(HloComputation* while_body);
 
+  HloInstruction* while_init() const;
+
   // Gets/sets the true and false HloComputation for Conditional. The setters
   // should only be called by HloModule or HloComputation methods.
   //
@@ -1257,6 +1272,7 @@ class HloInstruction {
   // superior.
   // Precondition: opcode must be kConvolution or kDot.
   const PrecisionConfig& precision_config() const;
+  PrecisionConfig* mutable_precision_config();
 
   // Sets the debug metadata for this instruction.
   void set_metadata(const OpMetadata& metadata) { metadata_ = metadata; }
@@ -1317,6 +1333,9 @@ class HloInstruction {
   // Delegates to HloConcatenateInstruction::concatenate_dimension.
   int64 concatenate_dimension() const;
 
+  // Delegates to HloGetDimensionSizeInstruction::dimension.
+  int64 dimension() const;
+
   // Returns whether this instruction does a rank-2 transposition.
   bool IsRank2Transpose() const;
 
@@ -1435,6 +1454,7 @@ class HloInstruction {
 
   // Delegates to HloAllReduceInstruction::all_reduce_id.
   absl::optional<int64> all_reduce_id() const;
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns data on the window in a windowed operation such as
   // convolution.
@@ -1599,6 +1619,10 @@ class HloInstruction {
       const std::function<bool(const HloComputation*, const HloComputation*)>&
           eq_computations) const;
 
+  // Generates a hash value specific to a particular type of an instruction.
+  // This function typically considers the inner root instruction.
+  virtual uint64 InnerHash() const;
+
   // Creates an n-ary elementwise operation.
   static std::unique_ptr<HloInstruction> CreateNary(
       const Shape& shape, HloOpcode opcode,
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.cc b/tensorflow/compiler/xla/service/hlo_instructions.cc
index 4c765aa375cd788612d144484df041dd6cd989f4..1ea02cf9c03866a598bec0e5356f0eb31ad27755 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.cc
+++ b/tensorflow/compiler/xla/service/hlo_instructions.cc
@@ -370,6 +370,11 @@ HloAllReduceInstruction::HloAllReduceInstruction(
   AppendComputation(reduce_computation);
 }
 
+void HloAllReduceInstruction::set_all_reduce_id(
+    const absl::optional<int64>& all_reduce_id) {
+  all_reduce_id_ = all_reduce_id;
+}
+
 HloInstructionProto HloAllReduceInstruction::ToProto() const {
   HloInstructionProto proto = HloCollectiveInstruction::ToProto();
   // Proto3 is so sad.
@@ -1367,6 +1372,10 @@ bool HloFusionInstruction::IdenticalSlowPath(
                          other.fused_instructions_computation());
 }
 
+uint64 HloFusionInstruction::InnerHash() const {
+  return fused_instructions_computation()->Hash();
+}
+
 std::unique_ptr<HloInstruction> HloFusionInstruction::CloneWithNewOperandsImpl(
     const Shape& shape, absl::Span<HloInstruction* const> new_operands,
     HloCloneContext* context) const {
@@ -1610,7 +1619,7 @@ HloOutfeedInstruction::HloOutfeedInstruction(const Shape& outfeed_shape,
 HloInstructionProto HloOutfeedInstruction::ToProto() const {
   HloInstructionProto proto = HloInstruction::ToProto();
   proto.set_outfeed_config(outfeed_config());
-  *proto.mutable_outfeed_shape() = outfeed_shape();
+  *proto.mutable_outfeed_shape() = outfeed_shape().ToProto();
   return proto;
 }
 
@@ -1862,7 +1871,7 @@ HloInstructionProto HloCustomCallInstruction::ToProto() const {
   if (layout_constrained()) {
     proto.set_constrain_layout(true);
     for (const Shape& shape : operand_shapes_with_layout_) {
-      *proto.add_operand_shapes_with_layout() = shape;
+      *proto.add_operand_shapes_with_layout() = shape.ToProto();
     }
   }
   return proto;
diff --git a/tensorflow/compiler/xla/service/hlo_instructions.h b/tensorflow/compiler/xla/service/hlo_instructions.h
index d43a8973ccff697c27462b611446215df71973a5..b5c28137a145667a977d39c9d3c40c6d36a8436e 100644
--- a/tensorflow/compiler/xla/service/hlo_instructions.h
+++ b/tensorflow/compiler/xla/service/hlo_instructions.h
@@ -252,6 +252,7 @@ class HloAllReduceInstruction : public HloCollectiveInstruction {
   }
 
   absl::optional<int64> all_reduce_id() const { return all_reduce_id_; }
+  void set_all_reduce_id(const absl::optional<int64>& all_reduce_id);
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
@@ -742,6 +743,8 @@ class HloFusionInstruction : public HloInstruction {
       const HloInstruction& other,
       const std::function<bool(const HloComputation*, const HloComputation*)>&
           eq_computations) const override;
+  uint64 InnerHash() const override;
+
   // Implementation for non-common logic of CloneWithNewOperands.
   std::unique_ptr<HloInstruction> CloneWithNewOperandsImpl(
       const Shape& shape, absl::Span<HloInstruction* const> new_operands,
@@ -954,6 +957,7 @@ class HloConvolutionInstruction : public HloInstruction {
   // information but it is presumed that the alternate lowering is strictly
   // superior.
   const PrecisionConfig& precision_config() const { return precision_config_; }
+  PrecisionConfig* mutable_precision_config() { return &precision_config_; }
 
   string ToCategory() const override;
   // Returns a serialized representation of this instruction.
@@ -1325,6 +1329,7 @@ class HloDotInstruction : public HloInstruction {
   // information but it is presumed that the alternate lowering is strictly
   // superior.
   const PrecisionConfig& precision_config() const { return precision_config_; }
+  PrecisionConfig* mutable_precision_config() { return &precision_config_; }
 
   // Returns a serialized representation of this instruction.
   HloInstructionProto ToProto() const override;
diff --git a/tensorflow/compiler/xla/service/hlo_lexer.h b/tensorflow/compiler/xla/service/hlo_lexer.h
index 3e2f8bcd52f9043f161197756a2060b28dded1d9..d6a2b292a3916b2ff85f278cf5cb9f1567df88fa 100644
--- a/tensorflow/compiler/xla/service/hlo_lexer.h
+++ b/tensorflow/compiler/xla/service/hlo_lexer.h
@@ -20,6 +20,7 @@ limitations under the License.
 
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_token.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.cc b/tensorflow/compiler/xla/service/hlo_matchers.cc
index 5269cad94d35be3dd1c009588bbe422ff1533364..d28e79d41ad5d58a8881cfb80d488684af26564f 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.cc
+++ b/tensorflow/compiler/xla/service/hlo_matchers.cc
@@ -237,8 +237,4 @@ void PrintTo(const HloInstruction* inst, ::std::ostream* os) {
   *os << (inst ? inst->ToString() : "nullptr");
 }
 
-void PrintTo(HloInstruction* inst, ::std::ostream* os) {
-  PrintTo(const_cast<const HloInstruction*>(inst), os);
-}
-
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_matchers.h b/tensorflow/compiler/xla/service/hlo_matchers.h
index 170ec93a334903cdc314f1950675ef30bc4cda5a..235efb19ce4ed28a5cd9fe5ca52ae5d8e9e5ba3d 100644
--- a/tensorflow/compiler/xla/service/hlo_matchers.h
+++ b/tensorflow/compiler/xla/service/hlo_matchers.h
@@ -385,7 +385,6 @@ std::vector<const HloInstruction*> Pointers(const Container& container) {
 // Tell GMock to print HloInstruction* by value, so error messages are nice.
 // Has to be in the same namespace as 'HloInstruction'.
 void PrintTo(const HloInstruction* inst, ::std::ostream* os);
-void PrintTo(HloInstruction* inst, ::std::ostream* os);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
index 234fcd266aa09e193849ffb4526599114dfe22fe..d2740bcce26f04c5d7c8b64cfdaea53e3c697855 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.cc
@@ -73,7 +73,7 @@ class ListScheduler {
   // Construct and return a memory-minimizing sequence of HLO instructions
   // containing the given HLO computation.
   static StatusOr<HloInstructionSequence> Run(
-      const HloComputation& computation,
+      HloComputation* computation,
       const TuplePointsToAnalysis& points_to_analysis,
       const LogicalBuffer::SizeFunction& size_function,
       const absl::flat_hash_map<const HloComputation*, int64>&
@@ -98,7 +98,7 @@ class ListScheduler {
   // comparison operators.
   using Priority = std::pair<int64, int64>;
 
-  ListScheduler(const HloComputation& computation,
+  ListScheduler(HloComputation* computation,
                 const TuplePointsToAnalysis& points_to_analysis,
                 const LogicalBuffer::SizeFunction& size_function,
                 const absl::flat_hash_map<const HloComputation*, int64>&
@@ -111,7 +111,7 @@ class ListScheduler {
     // instruction. An HLO instruction "uses" a LogicalBuffer if the
     // LogicalBuffer is in an operand of the instruction as indicated by
     // points-to analysis.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       absl::flat_hash_set<const LogicalBuffer*> instr_uses;
       for (auto* operand : instruction->operands()) {
         points_to_analysis.GetPointsToSet(operand).ForEachElement(
@@ -126,13 +126,13 @@ class ListScheduler {
 
     // Create map containing the number of unscheduled uses (hlo instructions)
     // of each logical buffer.
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (auto* buffer :
            points_to_analysis.GetBuffersDefinedByInstruction(instruction)) {
         unscheduled_use_count_[buffer] = 0;
       }
     }
-    for (auto* instruction : computation.instructions()) {
+    for (auto* instruction : computation->instructions()) {
       for (const LogicalBuffer* buffer : buffer_uses_.at(instruction)) {
         ++unscheduled_use_count_[buffer];
       }
@@ -141,7 +141,7 @@ class ListScheduler {
     // Buffers live out of the computation have an implicit use at the end of
     // the computation.
     for (const LogicalBuffer* live_out_buffer :
-         points_to_analysis.GetPointsToSet(computation.root_instruction())
+         points_to_analysis.GetPointsToSet(computation->root_instruction())
              .CreateFlattenedSet()) {
       ++unscheduled_use_count_[live_out_buffer];
     }
@@ -157,7 +157,7 @@ class ListScheduler {
   // HloInstruction, plus some cached metadata, saved for the purposes of making
   // BytesFreedIfScheduled fast.
   struct ReadyListEntry {
-    const HloInstruction* instruction;
+    HloInstruction* instruction;
 
     // The total size of all buffers defined by this instruction.
     int64 bytes_defined;
@@ -171,7 +171,7 @@ class ListScheduler {
   };
 
   // Creates a ReadyListEntry for the given instruction.
-  ReadyListEntry MakeReadyListEntry(const HloInstruction* instruction) {
+  ReadyListEntry MakeReadyListEntry(HloInstruction* instruction) {
     ReadyListEntry entry;
     entry.instruction = instruction;
 
@@ -250,13 +250,13 @@ class ListScheduler {
     // Populate the ready list with instructions which have no operands or
     // control predecessors.
     absl::flat_hash_map<const HloInstruction*, int64> unscheduled_pred_count;
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       // TODO(b/34466113): Replace this and above with successors() or
       // predecessors() when these methods are added to HloInstruction.
-      for (const HloInstruction* user : instruction->users()) {
+      for (HloInstruction* user : instruction->users()) {
         unscheduled_pred_count[user]++;
       }
-      for (const HloInstruction* succ : instruction->control_successors()) {
+      for (HloInstruction* succ : instruction->control_successors()) {
         unscheduled_pred_count[succ]++;
       }
     }
@@ -275,7 +275,7 @@ class ListScheduler {
       ready_instructions[inst] = it;
     };
 
-    for (auto* instruction : computation_.instructions()) {
+    for (auto* instruction : computation_->instructions()) {
       if (instruction->operands().empty() &&
           instruction->control_predecessors().empty()) {
         add_to_ready_queue(instruction);
@@ -287,7 +287,7 @@ class ListScheduler {
       // schedule.
       auto best_it = ready_queue.end();
       --best_it;
-      const HloInstruction* best = best_it->second.instruction;
+      HloInstruction* best = best_it->second.instruction;
       VLOG(2) << "Schedule instruction: " << best->ToShortString()
               << " Bytes freed: " << best_it->first.first;
       ready_queue.erase(best_it);
@@ -348,13 +348,13 @@ class ListScheduler {
         }
       }
     }
-    CHECK_EQ(schedule.size(), computation_.instruction_count());
-    CHECK_EQ(scheduled_instructions_.size(), computation_.instruction_count());
+    CHECK_EQ(schedule.size(), computation_->instruction_count());
+    CHECK_EQ(scheduled_instructions_.size(), computation_->instruction_count());
 
     return schedule;
   }
 
-  const HloComputation& computation_;
+  HloComputation* computation_;
   const TuplePointsToAnalysis& points_to_analysis_;
   const LogicalBuffer::SizeFunction& size_function_;
   // Computations are analyzed in post-order. When scheduling an instruction
@@ -386,13 +386,13 @@ int64 SumLogicalBufferSizes(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputationHelper(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  VLOG(2) << "Computation: " << computation.name();
+  VLOG(2) << "Computation: " << computation->name();
   if (algorithm) {
     return algorithm(computation, points_to_analysis, size_function,
                      memory_by_computation);
@@ -404,17 +404,17 @@ StatusOr<HloInstructionSequence> ScheduleComputationHelper(
 }  // namespace
 
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
   // These variables are a hack to prevent overflows.
   int64 cumulative_total_size = 0;
-  int64 total_hlos = computation.parent()->instruction_count();
+  int64 total_hlos = computation->parent()->instruction_count();
   absl::flat_hash_map<const HloInstruction*, int64> extra_users;
   absl::flat_hash_map<const HloInstruction*, int64> total_sizes;
-  for (const HloInstruction* hlo : computation.MakeInstructionPostOrder()) {
+  for (const HloInstruction* hlo : computation->MakeInstructionPostOrder()) {
     if (ListScheduler::IgnoreInstruction(*hlo)) {
       extra_users[hlo] = 0;
       total_sizes[hlo] = 0;
@@ -448,8 +448,8 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     total_sizes[hlo] = std::min(total_sizes[hlo], cumulative_total_size);
     extra_users[hlo] = std::min(extra_users[hlo], total_hlos);
   }
-  CHECK_EQ(extra_users.size(), computation.instruction_count());
-  CHECK_EQ(total_sizes.size(), computation.instruction_count());
+  CHECK_EQ(extra_users.size(), computation->instruction_count());
+  CHECK_EQ(total_sizes.size(), computation->instruction_count());
 
   // Construct a total order based on DFS post-order, visiting operands in
   // decreasing cumulative extra user order, and next by cumulative size, with a
@@ -459,7 +459,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
     sequence.push_back(hlo);
     return Status::OK();
   });
-  TF_RETURN_IF_ERROR(computation.AcceptWithOperandOrder(
+  TF_RETURN_IF_ERROR(computation->AcceptWithOperandOrder(
       &visitor, [&extra_users, &total_sizes](const HloInstruction* a,
                                              const HloInstruction* b) {
         if (extra_users[a] != extra_users[b]) {
@@ -470,12 +470,12 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
         }
         return a->name() < b->name();
       }));
-  CHECK_EQ(sequence.size(), computation.instruction_count());
+  CHECK_EQ(sequence.size(), computation->instruction_count());
   return sequence;
 }  // namespace xla
 
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -485,16 +485,16 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 }
 
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
         memory_by_computation) {
-  return HloInstructionSequence(computation.MakeInstructionPostOrder());
+  return HloInstructionSequence(computation->MakeInstructionPostOrder());
 }
 
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -513,7 +513,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                           memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 list_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, list_sequence, points_to_analysis,
+                          *computation, list_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory list sequence: " << HumanReadableNumBytes(list_memory);
 
@@ -522,7 +522,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                          size_function, memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 dfs_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, dfs_sequence, points_to_analysis,
+                          *computation, dfs_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory dfs sequence: " << HumanReadableNumBytes(dfs_memory);
 
@@ -532,7 +532,7 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
                                memory_by_computation));
   TF_ASSIGN_OR_RETURN(const int64 post_order_memory,
                       HeapSimulator::MinimumMemoryForComputation(
-                          computation, post_order_sequence, points_to_analysis,
+                          *computation, post_order_sequence, points_to_analysis,
                           size_function, &memory_by_computation));
   VLOG(2) << "Min-memory post order sequence: "
           << HumanReadableNumBytes(post_order_memory);
@@ -555,17 +555,17 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 }
 
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm) {
-  HloSchedule schedule(&module);
+  HloSchedule schedule(module);
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(&module));
+                      TuplePointsToAnalysis::Run(module));
   absl::flat_hash_map<const HloComputation*, int64> memory_by_computation;
-  for (const auto* computation : module.MakeComputationPostOrder()) {
+  for (auto* computation : module->MakeComputationPostOrder()) {
     if (!computation->IsFusionComputation()) {
       TF_ASSIGN_OR_RETURN(HloInstructionSequence computation_sequence,
                           ScheduleComputationHelper(
-                              *computation, *points_to_analysis, size_function,
+                              computation, *points_to_analysis, size_function,
                               algorithm, memory_by_computation));
       memory_by_computation[computation] =
           HeapSimulator::MinimumMemoryForComputation(
@@ -583,11 +583,11 @@ StatusOr<HloSchedule> ScheduleModule(
 }
 
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function) {
-  CHECK(!computation.IsFusionComputation());
+  CHECK(!computation->IsFusionComputation());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<TuplePointsToAnalysis> points_to_analysis,
-                      TuplePointsToAnalysis::Run(computation.parent()));
+                      TuplePointsToAnalysis::Run(computation->parent()));
   absl::flat_hash_map<const HloComputation*, int64> empty_map;
   return ScheduleComputationHelper(computation, *points_to_analysis,
                                    size_function, nullptr, empty_map);
@@ -600,7 +600,7 @@ HloMemoryScheduler::HloMemoryScheduler(
 
 StatusOr<bool> HloMemoryScheduler::Run(HloModule* module) {
   TF_ASSIGN_OR_RETURN(HloSchedule schedule,
-                      ScheduleModule(*module, size_function_, algorithm_));
+                      ScheduleModule(module, size_function_, algorithm_));
   TF_RETURN_IF_ERROR(module->set_schedule(std::move(schedule)));
   return true;
 }
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
index cca5dc493989811a0bb9790c3237e5468a3f2d67..7227bfb27c74758d2b79e404afc9eb97a1ca894d 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler.h
@@ -36,14 +36,14 @@ namespace xla {
 // that describes buffer aliasing, together with a target-specific size function
 // that maps a tensor's logical size to its padded size.
 typedef std::function<StatusOr<HloInstructionSequence>(
-    const HloComputation&, const TuplePointsToAnalysis&,
+    HloComputation*, const TuplePointsToAnalysis&,
     const LogicalBuffer::SizeFunction&,
     const absl::flat_hash_map<const HloComputation*, int64>&)>
     MemorySchedulerAlgorithm;
 
 // List scheduler
 StatusOr<HloInstructionSequence> ListMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -51,7 +51,7 @@ StatusOr<HloInstructionSequence> ListMemoryScheduler(
 
 // DFS-order scheduler
 StatusOr<HloInstructionSequence> DFSMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -59,7 +59,7 @@ StatusOr<HloInstructionSequence> DFSMemoryScheduler(
 
 // Naive Post Order scheduler
 StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -69,7 +69,7 @@ StatusOr<HloInstructionSequence> PostOrderMemoryScheduler(
 // and the DFS scheduler, and chooses whichever returns a lower min-memory,
 // not accounting for fragmentation.
 StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
-    const HloComputation& computation,
+    HloComputation* computation,
     const TuplePointsToAnalysis& points_to_analysis,
     const LogicalBuffer::SizeFunction& size_function,
     const absl::flat_hash_map<const HloComputation*, int64>&
@@ -79,13 +79,13 @@ StatusOr<HloInstructionSequence> DefaultMemoryScheduler(
 // the computation. size_function is the function returning the number of bytes
 // required for a LogicalBuffer.
 StatusOr<HloSchedule> ScheduleModule(
-    const HloModule& module, const LogicalBuffer::SizeFunction& size_function,
+    HloModule* module, const LogicalBuffer::SizeFunction& size_function,
     const MemorySchedulerAlgorithm& algorithm = {});
 
 // Computes the schedule for a single computation.
 // Currently only used by the GPU backend.
 StatusOr<HloInstructionSequence> ScheduleComputation(
-    const HloComputation& computation,
+    HloComputation* computation,
     const LogicalBuffer::SizeFunction& size_function);
 
 // A pass which schedules the HLO instructions in a module. The HloModule's
diff --git a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
index 984a6266abb28f154a015e79645317e4e246fd0b..bc0d7e2bc00eab014f2660c95a51b966642eaee9 100644
--- a/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_memory_scheduler_test.cc
@@ -65,7 +65,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   auto sub = builder.AddInstruction(
       HloInstruction::CreateBinary(vec, HloOpcode::kSubtract, add, negate));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloMemoryScheduler scheduler([](const BufferValue& buffer) {
@@ -78,7 +78,7 @@ TEST_F(HloSchedulingTest, LastUseScheduledFirst) {
   TF_ASSERT_OK(module->schedule().Verify());
 
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       module->schedule().sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -124,9 +124,9 @@ ENTRY root {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
-  const std::vector<const HloInstruction*>& sequence =
+  const std::vector<HloInstruction*>& sequence =
       schedule.sequence(module->entry_computation()).instructions();
   EXPECT_EQ(module->entry_computation()->instruction_count(), sequence.size());
 
@@ -172,15 +172,16 @@ TEST_F(HloSchedulingTest, TuplesAreAccountedCorrectly) {
   builder.AddInstruction(HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd,
                                                       tuple_elm, abs_abs2));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), TUPLE_SIZE);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(),
+                                                             TUPLE_SIZE);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -218,19 +219,19 @@ TEST_F(HloSchedulingTest, MultiOutputFusionAccountedCorrectly) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(r1f32, HloOpcode::kAdd, tuple_elm, exp));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto* computation = module->AddEntryComputation(builder.Build());
 
   auto fusion = computation->CreateFusionInstruction(
       {tuple, mul, add}, HloInstruction::FusionKind::kLoop);
 
   TF_ASSERT_OK_AND_ASSIGN(HloSchedule schedule,
-                          ScheduleModule(*module,
-                                         [](const BufferValue& buffer) {
-                                           return ShapeUtil::ByteSizeOf(
-                                               buffer.shape(), 2);
-                                         },
-                                         ListMemoryScheduler));
+                          ScheduleModule(
+                              module.get(),
+                              [](const BufferValue& buffer) {
+                                return ShapeUtil::ByteSizeOf(buffer.shape(), 2);
+                              },
+                              ListMemoryScheduler));
 
   // Verify that all instructions are in the sequence.
   EXPECT_EQ(module->entry_computation()->instruction_count(),
@@ -252,7 +253,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
       HloInstruction::CreateParameter(0, r1f32, "cond_param"));
   HloInstruction* zero_vector =
       cond_builder.AddInstruction(HloInstruction::CreateConstant(
-          LiteralUtil::CreateR2<float>({{0, 0, 0, 0}})));
+          LiteralUtil::CreateR1<float>({0, 0, 0, 0})));
   cond_builder.AddInstruction(HloInstruction::CreateBinary(
       ShapeUtil::MakeShape(PRED, {}), HloOpcode::kNe, cond_param, zero_vector));
   auto cond_computation = module->AddEmbeddedComputation(cond_builder.Build());
@@ -284,7 +285,7 @@ TEST_F(HloSchedulingTest, HeapSimulatorAccountsForSubcomputations) {
   };
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, size_fn, ListMemoryScheduler));
+      ScheduleModule(module.get(), size_fn, ListMemoryScheduler));
   // Verify that all instructions are in the sequence.
   auto entry_computation = module->entry_computation();
   EXPECT_EQ(module->entry_computation()->instruction_count(),
diff --git a/tensorflow/compiler/xla/service/hlo_module.cc b/tensorflow/compiler/xla/service/hlo_module.cc
index 14bf17f4be16f8cf820753bc9f0473029834f1f8..fe8371384c0fa3900a9022f101ff0b296439cf16 100644
--- a/tensorflow/compiler/xla/service/hlo_module.cc
+++ b/tensorflow/compiler/xla/service/hlo_module.cc
@@ -240,8 +240,10 @@ HloModuleProto HloModule::ToProto() const {
     *proto.mutable_schedule() = schedule().ToProto().ValueOrDie();
   }
   *proto.mutable_host_program_shape() =
-      entry_computation_layout().ComputeProgramShape();
+      entry_computation_layout().ComputeProgramShape().ToProto();
   *proto.mutable_input_output_alias() = input_output_alias_config().ToProto();
+  *proto.mutable_dynamic_parameter_binding() =
+      dynamic_parameter_binding().ToProto();
   return proto;
 }
 
@@ -255,7 +257,7 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
   // the entry parameters and root.
   TF_RET_CHECK(proto.has_host_program_shape())
       << "No program shape found in the proto";
-  const auto& expected_program_shape = proto.host_program_shape();
+  ProgramShape expected_program_shape(proto.host_program_shape());
   TF_RET_CHECK(expected_program_shape.parameters_size() ==
                module_config.entry_computation_layout().parameter_count());
   for (int i = 0; i < expected_program_shape.parameters_size(); ++i) {
@@ -325,6 +327,10 @@ StatusOr<std::unique_ptr<HloModule>> HloModule::CreateFromProto(
 
   // Because we didn't uniquify the names or the ids, double-check that the
   // instruction and computation names and ids are unique from the proto.
+  TF_ASSIGN_OR_RETURN(module->dynamic_parameter_binding_,
+                      DynamicParameterBinding::CreateFromProto(
+                          proto.dynamic_parameter_binding()));
+
   absl::flat_hash_set<string> computation_names;
   absl::flat_hash_set<string> instruction_names;
   absl::flat_hash_set<int> computation_ids;
@@ -363,9 +369,9 @@ StatusOr<HloModuleConfig> HloModule::CreateModuleConfigFromProto(
     const HloModuleProto& module, const DebugOptions& debug_options) {
   TF_RET_CHECK(module.has_host_program_shape())
       << "No program shape found in the proto";
-  const auto& program_shape = module.host_program_shape();
+  ProgramShape program_shape(module.host_program_shape());
 
-  HloModuleConfig module_config(program_shape);
+  HloModuleConfig module_config(ProgramShape{program_shape});
   module_config.set_debug_options(debug_options);
 
   // The module config is constructed with default layouts regardless of what is
diff --git a/tensorflow/compiler/xla/service/hlo_module.h b/tensorflow/compiler/xla/service/hlo_module.h
index 8a1f999e3ab076b87a651a915f4de93320e7067f..7b9cbf9a53a2201b1312405bbd7ed2b88f65c9be 100644
--- a/tensorflow/compiler/xla/service/hlo_module.h
+++ b/tensorflow/compiler/xla/service/hlo_module.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/iterator_util.h"
+#include "tensorflow/compiler/xla/service/dynamic_parameter_binding.h"
 #include "tensorflow/compiler/xla/service/hlo.pb.h"
 #include "tensorflow/compiler/xla/service/hlo_clone_context.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
@@ -103,11 +104,7 @@ class HloModule {
                                        HloCloneContext* context = nullptr);
 
   // Return a pointer to the entry computation of the module.
-  const HloComputation* entry_computation() const {
-    CHECK_NE(nullptr, entry_computation_);
-    return entry_computation_;
-  }
-  HloComputation* entry_computation() {
+  HloComputation* entry_computation() const {
     CHECK_NE(nullptr, entry_computation_);
     return entry_computation_;
   }
@@ -135,6 +132,12 @@ class HloModule {
     return config_.entry_computation_layout();
   }
 
+  // Generates a hash value of an HLO module. Hash considers
+  // information on opcode, shape, operands, and typically a root instruction.
+  // This function returns the same hash value for equivalent HLO modules,
+  // with respect to HloInstruction::Identical() method.
+  uint64 Hash() const { return entry_computation()->Hash(); }
+
   // Gets the computations in this module.
   //
   // Returns a view of HloComputation*s, so you can iterate over this in the
@@ -232,6 +235,16 @@ class HloModule {
     return input_output_alias_config_;
   }
 
+  // DynamicParameterBinding holds the list of bindings that indicates which
+  // parameter dimensions are dynamic and which parameters represent their
+  // runtime value.
+  DynamicParameterBinding& dynamic_parameter_binding() {
+    return dynamic_parameter_binding_;
+  }
+  const DynamicParameterBinding& dynamic_parameter_binding() const {
+    return dynamic_parameter_binding_;
+  }
+
   // Returns an id that is unique to this module across all modules created over
   // the lifetime of this process.
   int unique_id() const { return unique_id_; }
@@ -285,6 +298,9 @@ class HloModule {
   // alias_config indicates the alias information of input/output buffers that
   // are expected from the module.
   HloInputOutputAliasConfig input_output_alias_config_;
+
+  // Bindings for dynamic parameter mapping.
+  DynamicParameterBinding dynamic_parameter_binding_;
 };
 
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/hlo_module_test.cc b/tensorflow/compiler/xla/service/hlo_module_test.cc
index 3ae67e4e5ee90ca182c7c3d97a67d070431ce851..620cb7e01ad1a060915f5b73474f6950ab18122a 100644
--- a/tensorflow/compiler/xla/service/hlo_module_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_module_test.cc
@@ -63,7 +63,7 @@ class HloModuleTest : public HloTestBase {
 
 TEST_F(HloModuleTest, OneComputationPostOrder) {
   // Create a module with a single computation.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(CreateConstantComputation());
 
   EXPECT_THAT(module->MakeComputationPostOrder(),
@@ -72,7 +72,7 @@ TEST_F(HloModuleTest, OneComputationPostOrder) {
 
 TEST_F(HloModuleTest, TwoComputationsPostOrder) {
   // Create a module with two unconnected computations.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 = module->AddEntryComputation(CreateConstantComputation());
   auto computation2 =
       module->AddEmbeddedComputation(CreateConstantComputation());
@@ -88,7 +88,7 @@ TEST_F(HloModuleTest, TwoComputationsPostOrder) {
 
 TEST_F(HloModuleTest, CloneTest) {
   // Create and copy a module with a diamond call graph of computations.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -111,7 +111,7 @@ TEST_F(HloModuleTest, CloneTest) {
 }
 
 TEST_F(HloModuleTest, CloneHasFusion) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
 
   // Create the fused computation.
   HloComputation* fused_computation;
@@ -154,7 +154,7 @@ TEST_F(HloModuleTest, CloneHasFusion) {
 
 TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
   // Create a module with a diamond call graph of computations.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation1 =
       module->AddEmbeddedComputation(CreateConstantComputation());
   auto computation2 =
@@ -174,7 +174,7 @@ TEST_F(HloModuleTest, DiamondComputationsPostOrder) {
 
 TEST_F(HloModuleTest, LargeConstantToString) {
   // Create a module with a single computation.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder("Constant");
   std::vector<float> values(16, 42.0);
   builder.AddInstruction(
@@ -194,8 +194,8 @@ TEST_F(HloModuleTest, LargeConstantToString) {
 }
 
 TEST_F(HloModuleTest, UniqueModuleId) {
-  auto module_a = CreateNewUnverifiedModule();
-  auto module_b = CreateNewUnverifiedModule();
+  auto module_a = CreateNewVerifiedModule();
+  auto module_b = CreateNewVerifiedModule();
   EXPECT_NE(module_a->unique_id(), module_b->unique_id());
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_opcode.h b/tensorflow/compiler/xla/service/hlo_opcode.h
index 70c7d70b41c5c7bc94d1fac83c0fcf71f155b5f0..127cfd165a5d8229cac3035f56a66f1bcfa734f3 100644
--- a/tensorflow/compiler/xla/service/hlo_opcode.h
+++ b/tensorflow/compiler/xla/service/hlo_opcode.h
@@ -47,6 +47,8 @@ namespace xla {
 #define HLO_OPCODE_LIST(V)                                   \
   V(kAbs, "abs")                                             \
   V(kAdd, "add")                                             \
+  V(kAddDependency, "add-dependency")                        \
+  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
   V(kAllToAll, "all-to-all")                                 \
   V(kAtan2, "atan2")                                         \
   V(kBatchNormGrad, "batch-norm-grad")                       \
@@ -84,7 +86,6 @@ namespace xla {
   V(kGather, "gather")                                       \
   V(kGe, "greater-than-or-equal-to", kHloOpcodeIsComparison) \
   V(kGetDimensionSize, "get-dimension-size")                 \
-  V(kAfterAll, "after-all", kHloOpcodeIsVariadic)            \
   V(kGetTupleElement, "get-tuple-element")                   \
   V(kGt, "greater-than", kHloOpcodeIsComparison)             \
   V(kImag, "imag")                                           \
diff --git a/tensorflow/compiler/xla/service/hlo_ordering.cc b/tensorflow/compiler/xla/service/hlo_ordering.cc
index f5f99bece18cc637365118ddcd1273da05f4e1b6..ca6a154809be46d6a0305c29e2b89219de408019 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering.cc
@@ -356,8 +356,7 @@ void SequentialHloOrdering::Initialize() {
   // Create a map from instruction to its order position.
   TF_DCHECK_OK(schedule_.Verify());
   for (const auto& computation_sequence : schedule_.sequences()) {
-    const std::vector<const HloInstruction*>& order =
-        computation_sequence.second.instructions();
+    const auto& order = computation_sequence.second.instructions();
     for (int i = 0; i < order.size(); ++i) {
       InsertOrDie(&order_position_, order[i], i);
     }
diff --git a/tensorflow/compiler/xla/service/hlo_ordering_test.cc b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
index 2ab8aa57f6ed4586c3376ee7c44126c0ed19ea0b..3ca77e60cd5275c22eb0e338cd5437fc44b49958 100644
--- a/tensorflow/compiler/xla/service/hlo_ordering_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_ordering_test.cc
@@ -53,7 +53,7 @@ TEST_F(HloOrderingTest, InstructionsInDifferentComputations) {
   //   %c = Constant(42.0f)
   //
   // This results in a diamond-shaped callgraph.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder_c = HloComputation::Builder("C");
@@ -126,7 +126,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
   //   %constant = Constant(1.0)
   //   return While(%constant, body, condition)
   //
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -176,7 +176,7 @@ TEST_F(HloOrderingTest, InstructionsInWhileComputations) {
 
 TEST_F(HloOrderingTest, ParametersDefinedBeforeOthers) {
   // Entry parameter should always be defined before other instruction.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
   auto builder = HloComputation::Builder(TestName());
   auto constant = builder.AddInstruction(
@@ -209,7 +209,7 @@ TEST_F(HloOrderingTest, ValuesInWhileComputations) {
   //   %while = While(%constant, body, condition)
   //   %add = Add(%constant, %while)
   //
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto body_builder = HloComputation::Builder("body");
@@ -407,7 +407,7 @@ TEST_F(HloOrderingTest,
   //   %dead = Constant(123.0)
   //
   // %root should interfere with %dead.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto builder = HloComputation::Builder(TestName());
@@ -455,7 +455,7 @@ TEST_F(HloOrderingTest,
   //   ROOT %call = call({%c}), subcomputation
   //
   // %root should interfere with %dead.
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   const Shape scalar_shape = ShapeUtil::MakeShape(xla::F32, {});
 
   auto subbuilder = HloComputation::Builder(TestName() + ".sub");
diff --git a/tensorflow/compiler/xla/service/hlo_parser.cc b/tensorflow/compiler/xla/service/hlo_parser.cc
index 4390145c6bd7484987b2851ef92336defffb388b..9b5bb5d0bd6af104ef62eaa5d3e53cedbe0213d3 100644
--- a/tensorflow/compiler/xla/service/hlo_parser.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser.cc
@@ -47,11 +47,11 @@ const double kF16max = 65504;
 
 // Creates and returns a schedule created using the order of the instructions in
 // the HloComputation::instructions() vectors in the module.
-HloSchedule ScheduleFromInstructionOrder(const HloModule* module) {
+HloSchedule ScheduleFromInstructionOrder(HloModule* module) {
   HloSchedule schedule(module);
-  for (const HloComputation* computation : module->computations()) {
+  for (HloComputation* computation : module->computations()) {
     if (!computation->IsFusionComputation()) {
-      for (const HloInstruction* instruction : computation->instructions()) {
+      for (HloInstruction* instruction : computation->instructions()) {
         schedule.GetOrCreateSequence(computation).push_back(instruction);
       }
     }
@@ -850,6 +850,15 @@ bool HloParser::ParseInstructionRhs(HloComputation::Builder* builder,
       }
       break;
     }
+    case HloOpcode::kAddDependency: {
+      if (!ParseOperands(&operands, /*expected_size=*/2) ||
+          !ParseAttributes(attrs)) {
+        return false;
+      }
+      instruction = builder->AddInstruction(
+          HloInstruction::CreateAddDependency(operands[0], operands[1]));
+      break;
+    }
     case HloOpcode::kSort: {
       optional<std::vector<tensorflow::int64>> dimensions;
       attrs["dimensions"] = {/*required=*/true, AttrTy::kBracedInt64List,
diff --git a/tensorflow/compiler/xla/service/hlo_parser_test.cc b/tensorflow/compiler/xla/service/hlo_parser_test.cc
index c59bdc0a0b372d829ee61f0a048b7704498e0d0e..ab71f011ac9d77d00ddfb41aca7a224d26d416b7 100644
--- a/tensorflow/compiler/xla/service/hlo_parser_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_parser_test.cc
@@ -21,7 +21,8 @@ limitations under the License.
 #include "absl/strings/string_view.h"
 #include "tensorflow/compiler/xla/service/hlo_casting_utils.h"
 #include "tensorflow/compiler/xla/service/hlo_instructions.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/window_util.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/platform/test.h"
@@ -29,7 +30,7 @@ limitations under the License.
 namespace xla {
 namespace {
 
-namespace op = ::xla::testing::opcode_matchers;
+namespace m = ::xla::match;
 using absl::string_view;
 
 struct TestData {
@@ -195,7 +196,7 @@ ENTRY %add_constants () -> f32[] {
 R"(HloModule TupleConstant_module
 
 ENTRY %TupleConstant.v1 () -> (f32[2,1], f32[2]) {
-  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { { 1 }, { 2 } }, {2, 42} ))
+  ROOT %constant = (f32[2,1]{1,0}, f32[2]{0}) constant((f32[2,1], f32[2]) ( f32[2,1] { {1}, {2} }, {2, 42} ))
 }
 
 )"
@@ -587,7 +588,7 @@ ENTRY %DynamicUpdateSlice.v4 (input: s32[1,1,25,1], update: s32[1,1,2,1], start_
 R"(HloModule BasicTraining_module
 
 ENTRY %BasicTraining.v4 () -> (f32[2,2,1,2], f32[2], f32[2]) {
-  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ {1, 2} }, { /*i1=1*/ {3, 4} } }, { /*i0=1*/ { /*i1=0*/ {5, 6} }, { /*i1=1*/ {7, 8} } } })
+  %constant = f32[2,2,1,2]{3,2,1,0} constant(f32[2,2,1,2] { { /*i0=0*/ { /*i1=0*/ { 1, 2 } }, { /*i1=1*/ { 3, 4 } } }, { /*i0=1*/ { /*i1=0*/ { 5, 6 } }, { /*i1=1*/ { 7, 8 } } } })
   %constant.1 = f32[2]{0} constant({2, 3})
   %constant.2 = f32[2]{0} constant({1, 2})
   ROOT %batch-norm-training = (f32[2,2,1,2]{3,2,1,0}, f32[2]{0}, f32[2]{0}) batch-norm-training(f32[2,2,1,2]{3,2,1,0} %constant, f32[2]{0} %constant.1, f32[2]{0} %constant.2), epsilon=0.001, feature_index=3
@@ -1241,7 +1242,38 @@ ENTRY Sort {
 }
 
 )"
+    },
+// AfterAll with multiple operands
+{
+"AfterAllWithMultipleOperands",
+R"(HloModule AfterAllWithMultipleOperands
+
+ENTRY AfterAllWithMultipleOperands {
+  p0 = f32[] parameter(0)
+  token0 = token[] after-all()
+  token1 = token[] after-all()
+  ROOT after-all = token[] after-all(p0, token0, token1)
 }
+
+)"
+},
+// AddDependency
+// A dependency chain is created from 'neg' to 'exp' using tokens.
+{
+"AddDependency",
+R"(HloModule AddDependency
+
+ENTRY AddDependency {
+  p = f32[] parameter(0)
+  neg = f32[] negate(p)
+  token = token[] after-all(neg)
+  p_after_token = f32[] add-dependency(p, token)
+  exp = f32[] exponential(p_after_token)
+  ROOT sum = f32[] add(neg, exp)
+}
+
+)"
+},
 });
   // clang-format on
 }
@@ -1862,7 +1894,8 @@ ENTRY ReduceR3ToR2 {
 )";
   TF_ASSERT_OK_AND_ASSIGN(auto module, ParseHloString(original));
   ASSERT_NE(module->entry_computation(), nullptr);
-  EXPECT_THAT(module->entry_computation()->root_instruction(), op::Reduce());
+  EXPECT_THAT(module->entry_computation()->root_instruction(),
+              GmockMatch(m::Reduce()));
 }
 
 TEST_F(HloParserTest, ParseSharding) {
@@ -1922,7 +1955,7 @@ TEST(HloParserSingleOpTest, SingleOp) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, SingleOpNoShapeProducesError) {
@@ -1950,7 +1983,7 @@ TEST(HloParserSingleOpTest, SingleOpNoNames) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, CanonicalOp) {
@@ -1959,7 +1992,7 @@ TEST(HloParserSingleOpTest, CanonicalOp) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Multiply(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Multiply(m::Parameter(0), m::Parameter(1))));
   EXPECT_EQ(
       computation->root_instruction()->ToString(HloPrintOptions::Canonical()),
       text);
@@ -2013,7 +2046,11 @@ TEST(HloParserSingleOpTest, SingleOpWithNested) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Fusion(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Op()
+                             .WithOpcode(HloOpcode::kFusion)
+                             .WithNumOperands(2)
+                             .WithOperand(0, m::Parameter(0))
+                             .WithOperand(1, m::Parameter(1))));
 }
 
 TEST(HloParserSingleOpTest, SingleOpWithNested_DoesNotExist) {
@@ -2057,7 +2094,7 @@ TEST(HloParserSingleOpTest, ConvolutionTrivialFeatureGroupCount) {
   const HloComputation* computation = module->entry_computation();
   ASSERT_NE(computation, nullptr);
   EXPECT_THAT(computation->root_instruction(),
-              op::Convolution(op::Parameter(0), op::Parameter(1)));
+              GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1))));
   auto* convolution =
       Cast<HloConvolutionInstruction>(computation->root_instruction());
   EXPECT_EQ(convolution->feature_group_count(), 1);
@@ -2121,8 +2158,10 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
       module->schedule().is_computation_scheduled(module->entry_computation()));
   EXPECT_THAT(
       module->schedule().sequence(module->entry_computation()).instructions(),
-      ::testing::ElementsAre(op::Parameter(), op::Broadcast(), op::Parameter(),
-                             op::Multiply(), op::Parameter(), op::Add()));
+      ::testing::ElementsAre(
+          GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Multiply()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Add())));
 }
 
 TEST_F(HloParserTest, IsScheduledIsTrueDifferentOrder) {
@@ -2148,8 +2187,10 @@ ENTRY %axpy.v5 (alpha: f32[], x: f32[2,4], y: f32[2,4]) -> f32[2,4] {
       module->schedule().is_computation_scheduled(module->entry_computation()));
   EXPECT_THAT(
       module->schedule().sequence(module->entry_computation()).instructions(),
-      ::testing::ElementsAre(op::Parameter(), op::Parameter(), op::Parameter(),
-                             op::Broadcast(), op::Multiply(), op::Add()));
+      ::testing::ElementsAre(
+          GmockMatch(m::Parameter()), GmockMatch(m::Parameter()),
+          GmockMatch(m::Parameter()), GmockMatch(m::Broadcast()),
+          GmockMatch(m::Multiply()), GmockMatch(m::Add())));
 }
 
 TEST_F(HloParserTest, CustomCallWrongNumberofOperandConstraints) {
diff --git a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
index 312b5d020c398feb7738d14a9cfa0928d5178948..51177f24f5ee702be96fc8b4530ed38a5798109f 100644
--- a/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
+++ b/tensorflow/compiler/xla/service/hlo_pass_pipeline.cc
@@ -113,7 +113,7 @@ void HloPassPipeline::MaybeDumpHlo(const HloModule& module,
   }
 
   const string message =
-      StrCat("after ", after_pass_name, ", before ", before_pass_name);
+      absl::StrCat("after ", after_pass_name, ", before ", before_pass_name);
   hlo_graph_dumper::MaybeDumpHloModule(module, message);
   VLOG(3) << "HLO " << message << ":";
   VLOG(3) << module.entry_computation_layout().ToString();
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.cc b/tensorflow/compiler/xla/service/hlo_proto_util.cc
index cf33668f5bfa64a7843efc76e9f6768d18533240..981d06ce101644ecce587c4bd2f7a12c8edf6548 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.cc
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.cc
@@ -48,7 +48,7 @@ StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
   return std::move(module);
 }
 
-StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+StatusOr<std::vector<const ShapeProto*>> EntryComputationParameterShapes(
     const HloProto& hlo_proto) {
   if (!hlo_proto.has_hlo_module()) {
     return NotFound("HloProto missing HloModuleProto.");
@@ -57,15 +57,16 @@ StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
     return NotFound("HloProto missing program shape.");
   }
 
-  std::vector<const Shape*> parameter_shapes;
+  std::vector<const ShapeProto*> parameter_shapes;
   const auto& program_shape = hlo_proto.hlo_module().host_program_shape();
-  for (const Shape& shape : program_shape.parameters()) {
+  for (const ShapeProto& shape : program_shape.parameters()) {
     parameter_shapes.push_back(&shape);
   }
   return parameter_shapes;
 }
 
-StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto) {
+StatusOr<const ShapeProto*> EntryComputationOutputShape(
+    const HloProto& hlo_proto) {
   if (!hlo_proto.has_hlo_module()) {
     return NotFound("HloProto missing HloModuleProto.");
   }
diff --git a/tensorflow/compiler/xla/service/hlo_proto_util.h b/tensorflow/compiler/xla/service/hlo_proto_util.h
index 1db82dd6fcaa5d7fe7d65894c1021105f0b26266..31ea2aaffd9cdb76d21edbd0d4a03aa5f865f4f0 100644
--- a/tensorflow/compiler/xla/service/hlo_proto_util.h
+++ b/tensorflow/compiler/xla/service/hlo_proto_util.h
@@ -43,12 +43,13 @@ StatusOr<std::unique_ptr<HloModule>> CreateModuleFromProto(
 
 // Returns the shapes of the parameters of the entry computation. Shape pointers
 // refer to shapes inside of the given HloProto.
-StatusOr<std::vector<const Shape*>> EntryComputationParameterShapes(
+StatusOr<std::vector<const ShapeProto*>> EntryComputationParameterShapes(
     const HloProto& hlo_proto);
 
 // Returns the shape of the output of the entry computation. The shape pointer
 // refers to the output shape inside of the given HloProto.
-StatusOr<const Shape*> EntryComputationOutputShape(const HloProto& hlo_proto);
+StatusOr<const ShapeProto*> EntryComputationOutputShape(
+    const HloProto& hlo_proto);
 
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.cc b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
index 49e46ecd00ee4370f3e93746348373b79febed3d..48add75523f02005c70bc6baf69a6b7d5aa4f7ef 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.cc
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.cc
@@ -130,10 +130,10 @@ using ItemList = absl::InlinedVector<Item*, 3>;
 // before arbitrary elements.
 class InstructionList {
  public:
-  explicit InstructionList(const std::vector<const HloInstruction*>& order) {
+  explicit InstructionList(const HloInstructionSequence& order) {
     int64 position = 0;
     Item* last = nullptr;
-    for (const HloInstruction* inst : order) {
+    for (HloInstruction* inst : order.instructions()) {
       // Add a new item to the linked list.
       Item* item = new Item;
       item->next = nullptr;
@@ -151,7 +151,7 @@ class InstructionList {
       // to be monotonically increasing through the list, and so is still useful
       // for quickly(-ish) determining the order of arbitrary instructions in
       // the list.
-      item->instruction = const_cast<HloInstruction*>(inst);
+      item->instruction = inst;
       item->position = position;
       position++;
 
@@ -927,7 +927,7 @@ Item* PickRematerializationCandidate(
 
 StatusOr<int64> HloRematerialization::ComputePeakMemory(
     const HloComputation* computation,
-    const std::vector<const HloInstruction*>& order) const {
+    const HloInstructionSequence& order) const {
   InstructionList instruction_list(order);
   MemoryUsageTracker tracker(computation, size_function_, *points_to_analysis_,
                              instruction_list);
@@ -971,8 +971,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
           << HumanReadableNumBytes(computation_peak_memory_.at(computation));
   CHECK(!ContainsKey(rematerialized_computations_, computation));
 
-  InstructionList instruction_list(
-      schedule->sequence(computation).instructions());
+  InstructionList instruction_list(schedule->sequence(computation));
   MemoryUsageTracker memory_tracker(computation, size_function_,
                                     *points_to_analysis_, instruction_list);
   bool changed = false;
@@ -1184,7 +1183,7 @@ StatusOr<bool> HloRematerialization::RematerializeComputation(
   sequence.clear();
   for (auto* item = instruction_list.first(); item != nullptr;
        item = instruction_list.next(item)) {
-    const HloInstruction* instruction = item->instruction;
+    HloInstruction* instruction = item->instruction;
     sequence.push_back(instruction);
   }
   rematerialized_computations_.insert(computation);
@@ -1235,10 +1234,8 @@ StatusOr<bool> HloRematerialization::Run(HloModule* module) {
         if (node.context() == CallContext::kSequential) {
           TF_ASSIGN_OR_RETURN(
               computation_peak_memory_[node.computation()],
-              ComputePeakMemory(node.computation(),
-                                module->schedule()
-                                    .sequence(node.computation())
-                                    .instructions()));
+              ComputePeakMemory(node.computation(), module->schedule().sequence(
+                                                        node.computation())));
         }
         return Status::OK();
       },
diff --git a/tensorflow/compiler/xla/service/hlo_rematerialization.h b/tensorflow/compiler/xla/service/hlo_rematerialization.h
index 70d83c04f07ca7fd0139f586869e8fe688f958f4..a07d348041b72bba45c6fd1f726f2a0065d01e53 100644
--- a/tensorflow/compiler/xla/service/hlo_rematerialization.h
+++ b/tensorflow/compiler/xla/service/hlo_rematerialization.h
@@ -87,9 +87,8 @@ class HloRematerialization : public HloModulePass {
   // peak memory is the maximum total size of all live HLO instruction values at
   // any program point. 'order' is the order in which the HLO instructions will
   // be emitted which is used to determine lifespans of HLO values.
-  StatusOr<int64> ComputePeakMemory(
-      const HloComputation* computation,
-      const std::vector<const HloInstruction*>& order) const;
+  StatusOr<int64> ComputePeakMemory(const HloComputation* computation,
+                                    const HloInstructionSequence& order) const;
 
   // Returns the peak memory usage of the called computations for the given
   // instruction. Zero is returned if the instruction calls no computations.
diff --git a/tensorflow/compiler/xla/service/hlo_runner.cc b/tensorflow/compiler/xla/service/hlo_runner.cc
index 3f0ca342b4c84216ddd5ee553848360d8bd1ff0b..5a9b820a9d7f58695383b21c9e2126cf98970c83 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.cc
+++ b/tensorflow/compiler/xla/service/hlo_runner.cc
@@ -205,6 +205,40 @@ StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
       /*profile=*/profile);
 }
 
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const ShapedBuffer* const> arguments,
+    ExecutionProfile* profile) {
+  // Get service run options.
+  se::Stream stream(backend().default_stream_executor());
+  stream.Init();
+  ServiceExecutableRunOptions service_run_options =
+      GetServiceRunOptionsForDevice(backend().default_device_ordinal(), &stream,
+                                    nullptr);
+
+  TF_ASSIGN_OR_RETURN(
+      ScopedShapedBuffer retval,
+      executable->ExecuteOnStreamWrapper(&service_run_options,
+                                         /*profile=*/profile, arguments));
+  TF_RETURN_IF_ERROR(stream.BlockHostUntilDone());
+  return std::move(retval);
+}
+
+StatusOr<ScopedShapedBuffer> HloRunner::ExecuteWithDeviceBuffers(
+    std::unique_ptr<Executable> executable,
+    const absl::Span<const ScopedShapedBuffer> arguments,
+    ExecutionProfile* profile) {
+  std::vector<const ShapedBuffer*> argument_pointers;
+  argument_pointers.reserve(arguments.size());
+  for (const auto& argument : arguments) {
+    argument_pointers.push_back(&argument);
+  }
+  return ExecuteWithDeviceBuffers(
+      /*executable=*/std::move(executable),
+      /*arguments=*/argument_pointers,
+      /*profile=*/profile);
+}
+
 StatusOr<std::vector<Literal>> HloRunner::ExecuteReplicated(
     std::unique_ptr<HloModule> module,
     const ReplicatedExecuteOptions& options) {
diff --git a/tensorflow/compiler/xla/service/hlo_runner.h b/tensorflow/compiler/xla/service/hlo_runner.h
index 2e934bf66ae43ea412f242030b874dddb6d3722d..bb792cf8c9825ff67ca33bbcf2c3c32b1a0ecb85 100644
--- a/tensorflow/compiler/xla/service/hlo_runner.h
+++ b/tensorflow/compiler/xla/service/hlo_runner.h
@@ -136,6 +136,21 @@ class HloRunner {
       const absl::Span<const ScopedShapedBuffer> arguments,
       bool run_hlo_passes = true, ExecutionProfile* profile = nullptr);
 
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<Executable> executable,
+      const absl::Span<const ShapedBuffer* const> arguments,
+      ExecutionProfile* profile = nullptr);
+
+  StatusOr<ScopedShapedBuffer> ExecuteWithDeviceBuffers(
+      std::unique_ptr<Executable> executable,
+      const absl::Span<const ScopedShapedBuffer> arguments,
+      ExecutionProfile* profile = nullptr);
+
+  // Creates an executable object given an HLO module. If run_hlo_passes is
+  // true, the HLO passes will be run as part of compilation.
+  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
+      std::unique_ptr<HloModule> module, bool run_hlo_passes);
+
   // Executes a given HLO module into a set of replicas, and returns a map
   // with the replica number as key, and the corresponding returned literal as
   // value.
@@ -152,11 +167,6 @@ class HloRunner {
   const Backend& backend() const;
 
  private:
-  // Creates an executable object given an HLO module. If run_hlo_passes is
-  // true, the HLO passes will be run before.
-  StatusOr<std::unique_ptr<Executable>> CreateExecutable(
-      std::unique_ptr<HloModule> module, bool run_hlo_passes);
-
   // Creates a ServiceExecutableRunOptions object to configure a run on device,
   // using the provided stream object. If device_assignment is not nullptr, it
   // will be used to configure the replication parameters. Replicated executions
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.cc b/tensorflow/compiler/xla/service/hlo_schedule.cc
index a5780b7551a43f2b64f2ac61ef1bf6ce9e07eb16..8f6eb974c5179b420c8f961393ca923e0a3b3530 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule.cc
@@ -46,8 +46,8 @@ namespace xla {
         << "No computation exists in HLO module with id " << computation_id;
     const HloComputation* computation = comp_it->second;
 
-    absl::flat_hash_map<int64, const HloInstruction*> id_to_instruction;
-    for (const HloInstruction* instruction : computation->instructions()) {
+    absl::flat_hash_map<int64, HloInstruction*> id_to_instruction;
+    for (HloInstruction* instruction : computation->instructions()) {
       id_to_instruction[instruction->unique_id()] = instruction;
     }
 
@@ -81,9 +81,8 @@ StatusOr<HloScheduleProto> HloSchedule::ToProto() const {
   return std::move(proto);
 }
 
-void HloSchedule::set_sequence(
-    const HloComputation* computation,
-    absl::Span<const HloInstruction* const> sequence) {
+void HloSchedule::set_sequence(const HloComputation* computation,
+                               absl::Span<HloInstruction* const> sequence) {
   set_sequence(computation, HloInstructionSequence(sequence));
 }
 
@@ -114,8 +113,8 @@ Status HloSchedule::UpdateComputationSchedule(
     const HloComputation* computation) {
   // Map from unique ID to HloInstruction pointer for instructions in the
   // computation.
-  absl::flat_hash_map<int, const HloInstruction*> id_to_instruction;
-  for (const HloInstruction* instruction : computation->instructions()) {
+  absl::flat_hash_map<int, HloInstruction*> id_to_instruction;
+  for (HloInstruction* instruction : computation->instructions()) {
     InsertOrDie(&id_to_instruction, instruction->unique_id(), instruction);
   }
 
@@ -128,7 +127,7 @@ Status HloSchedule::UpdateComputationSchedule(
   // Map from HloInstruction X to newly added instructions (instruction is in
   // computation, but not in schedule) which use X. If an instruction is not in
   // the map, then it has no users which are newly added instructions.
-  absl::flat_hash_map<const HloInstruction*, std::vector<const HloInstruction*>>
+  absl::flat_hash_map<const HloInstruction*, std::vector<HloInstruction*>>
       new_instruction_uses;
 
   // For each newly added instruction, this is the count of the instruction's
@@ -138,9 +137,9 @@ Status HloSchedule::UpdateComputationSchedule(
 
   // Create a worklist of newly added instructions which are ready to be added
   // to the schedule. Initialize worklist with those that have zero operands.
-  std::queue<const HloInstruction*> worklist;
+  std::queue<HloInstruction*> worklist;
 
-  for (const HloInstruction* instruction : computation->instructions()) {
+  for (HloInstruction* instruction : computation->instructions()) {
     if (ids_in_schedule.count(instruction->unique_id()) == 0) {
       // This is a newly added instruction which is not in the schedule.
       if (instruction->operands().empty()) {
@@ -161,17 +160,17 @@ Status HloSchedule::UpdateComputationSchedule(
   // Lambda which schedules all instructions on the worklist.
   auto schedule_worklist = [&]() {
     while (!worklist.empty()) {
-      const HloInstruction* instruction = worklist.front();
+      HloInstruction* instruction = worklist.front();
       worklist.pop();
       new_sequence.push_back(instruction);
-      std::vector<const HloInstruction*>* new_users =
+      std::vector<HloInstruction*>* new_users =
           tensorflow::gtl::FindOrNull(new_instruction_uses, instruction);
       if (new_users != nullptr) {
         // This just-scheduled instruction has users which are newly added to
         // the module. Update the number of unscheduled operands and push the
         // newly added instruction to the worklist if it is ready to
         // schedule.
-        for (const HloInstruction* new_user : *new_users) {
+        for (HloInstruction* new_user : *new_users) {
           unscheduled_operand_count.at(new_user)--;
           CHECK_GE(unscheduled_operand_count.at(new_user), 0);
           if (unscheduled_operand_count.at(new_user) == 0) {
diff --git a/tensorflow/compiler/xla/service/hlo_schedule.h b/tensorflow/compiler/xla/service/hlo_schedule.h
index 0a714101ee587aa847fa674bbde5586287c51f33..486ddbf499de80c634bc497158cd79ca066cc866 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule.h
+++ b/tensorflow/compiler/xla/service/hlo_schedule.h
@@ -35,14 +35,14 @@ class HloInstructionSequence {
  public:
   HloInstructionSequence() = default;
   explicit HloInstructionSequence(
-      absl::Span<const HloInstruction* const> instructions) {
-    for (const HloInstruction* instruction : instructions) {
+      absl::Span<HloInstruction* const> instructions) {
+    for (HloInstruction* instruction : instructions) {
       push_back(instruction);
     }
   }
 
   // Adds the instruction to the end of the sequence.
-  void push_back(const HloInstruction* instruction) {
+  void push_back(HloInstruction* instruction) {
     instruction_sequence_.push_back(instruction);
     id_sequence_.push_back(instruction->unique_id());
   }
@@ -56,7 +56,7 @@ class HloInstructionSequence {
   int64 size() const { return instruction_sequence_.size(); }
 
   // Returns the sequence of HLO instructions.
-  const std::vector<const HloInstruction*>& instructions() const {
+  const std::vector<HloInstruction*>& instructions() const {
     return instruction_sequence_;
   }
 
@@ -65,7 +65,7 @@ class HloInstructionSequence {
 
  private:
   // The sequence as HloInstructions.
-  std::vector<const HloInstruction*> instruction_sequence_;
+  std::vector<HloInstruction*> instruction_sequence_;
 
   // The sequence of HLO instructions, represented by their unique IDs. The
   // sequence is stored as both HloInstructions and unique IDs because the
@@ -98,7 +98,7 @@ class HloSchedule {
 
   // Sets the sequence for the given computation to the given sequence.
   void set_sequence(const HloComputation* computation,
-                    absl::Span<const HloInstruction* const> sequence);
+                    absl::Span<HloInstruction* const> sequence);
   void set_sequence(const HloComputation* computation,
                     HloInstructionSequence sequence);
 
diff --git a/tensorflow/compiler/xla/service/hlo_schedule_test.cc b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
index 1424569ac1f62e4b965876141f1eb40be4f15bea..0e56e6f760e35ddcb45c6f58771d78405a09acfe 100644
--- a/tensorflow/compiler/xla/service/hlo_schedule_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_schedule_test.cc
@@ -56,10 +56,10 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
-  const std::vector<const HloInstruction*>& entry_schedule =
+  const auto& entry_schedule =
       schedule.sequence(module->entry_computation()).instructions();
 
   EXPECT_EQ(entry_schedule.size(), 6);
@@ -90,7 +90,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -139,7 +139,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -183,7 +183,7 @@ ENTRY main {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape());
       }));
 
@@ -244,7 +244,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
@@ -313,7 +313,7 @@ ENTRY %WhileLoop () -> s32[] {
                           ParseHloString(module_str));
   TF_ASSERT_OK_AND_ASSIGN(
       HloSchedule schedule,
-      ScheduleModule(*module, [](const BufferValue& buffer) {
+      ScheduleModule(module.get(), [](const BufferValue& buffer) {
         return ShapeUtil::ByteSizeOf(buffer.shape(),
                                      /*pointer_size=*/sizeof(void*));
       }));
diff --git a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
index 88329c899794a6e0f5102d181d6161fe17f89932..f5061304456e04ab40448861343ef201c9450dcf 100644
--- a/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
+++ b/tensorflow/compiler/xla/service/hlo_sharding_metadata.cc
@@ -253,7 +253,7 @@ StatusOr<bool> ApplyShardingFromUsers(HloInstruction* instruction,
       instruction->shape(), HloSharding::AssignDevice(kUnassignedDevice));
   for (HloInstruction* user : instruction->users()) {
     if (user->opcode() == HloOpcode::kDomain &&
-        domain.exit_domains.count(const_cast<HloInstruction*>(user)) > 0) {
+        domain.exit_domains.count(user) > 0) {
       // If a user is a domain and it is registered in the domain exits, then
       // the instruction sharding is taken directly from the domain, and no
       // further users need to be visited.
diff --git a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
index 11994d99c93e9d51691e482a3e3233b06fb0d060..c1073911ea9dc3811c195e27bcbae9b00929ad17 100644
--- a/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_subcomputation_unification_test.cc
@@ -66,7 +66,7 @@ class HloSubcomputationUnificationTest : public HloTestBase {
 };
 
 TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -103,7 +103,7 @@ TEST_F(HloSubcomputationUnificationTest, UnifyIdentities) {
 }
 
 TEST_F(HloSubcomputationUnificationTest, UnifyAdditions) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto builder = HloComputation::Builder(TestName());
 
   auto callee1 =
@@ -184,7 +184,7 @@ TEST_F(HloSubcomputationUnificationTest, DifferentParameterShapes) {
 // Regression test for b/31466798. Checks that entry_computation is still valid
 // after unification.
 TEST_F(HloSubcomputationUnificationTest, TwoIdenticalComputations) {
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   for (int i = 0; i < 2; ++i) {
     HloComputation::Builder builder("pow");
     auto x =
diff --git a/tensorflow/compiler/xla/service/hlo_value.h b/tensorflow/compiler/xla/service/hlo_value.h
index b6670d409b92e8be42f5cdb40fba8d662ae83958..1f01b0bb365450a933da9cc443db5223c06903f0 100644
--- a/tensorflow/compiler/xla/service/hlo_value.h
+++ b/tensorflow/compiler/xla/service/hlo_value.h
@@ -166,9 +166,6 @@ class HloValue : public BufferValue {
 
   // Whether this value is live out of the HLO module.
   bool live_out_of_module_ = false;
-
-  // Whether this value is live out of its computation.
-  bool live_out_of_computation_ = false;
 };
 
 std::ostream& operator<<(std::ostream& out, const HloValue& hlo_value);
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.cc b/tensorflow/compiler/xla/service/hlo_verifier.cc
index 27fd685a69a0bbd95b1d8d266ce6177a6c557f55..77db7b098a38ff4efdcc7447935fae61561c9ff4 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier.cc
@@ -753,13 +753,19 @@ Status ShapeVerifier::HandleAfterAll(HloInstruction* token) {
   for (const HloInstruction* operand : token->operands()) {
     operand_shapes.push_back(&operand->shape());
   }
-  return CheckShape(token, ShapeInference::InferAfterAllShape(operand_shapes));
+  return CheckShape(token, ShapeUtil::MakeTokenShape());
+}
+
+Status ShapeVerifier::HandleAddDependency(HloInstruction* add_dependency) {
+  TF_RETURN_IF_ERROR(CheckOperandCount(add_dependency, 2));
+  TF_RETURN_IF_ERROR(CheckIsTokenOperand(add_dependency, 1));
+  return CheckShape(add_dependency, add_dependency->operand(0)->shape());
 }
 
 Status ShapeVerifier::HandleGetDimensionSize(HloInstruction* get_size) {
-  return CheckShape(
-      get_size, ShapeInference::InferGetDimensionSizeShape(
-                    get_size->operand(0)->shape(), get_size->dimensions(0)));
+  return CheckShape(get_size,
+                    ShapeInference::InferGetDimensionSizeShape(
+                        get_size->operand(0)->shape(), get_size->dimension()));
 }
 
 Status ShapeVerifier::CheckShape(const HloInstruction* instruction,
@@ -1373,9 +1379,8 @@ class InstructionVerifier : public DfsHloVisitorWithDefault {
           const Layout& operand_layout = operand_shape.layout();
           TF_RET_CHECK(LayoutUtil::Equal(result_layout, operand_layout))
               << "Instruction shouldn't change layouts "
-              << instruction->ToString() << " From "
-              << ShapeUtil::HumanString(result_shape) << " To "
-              << ShapeUtil::HumanString(operand_shape);
+              << instruction->ToString() << " From " << result_shape << " To "
+              << operand_shape;
         }
       }
     }
@@ -1426,6 +1431,8 @@ StatusOr<bool> HloVerifier::Run(HloModule* module) {
         return target_metadata_->ShapeSize(shape);
       }));
 
+  TF_RETURN_IF_ERROR(module->dynamic_parameter_binding().Verify(*module));
+
   return false;
 }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier.h b/tensorflow/compiler/xla/service/hlo_verifier.h
index 9fbfd6a21c1f1148801000169046fbcbb37934fe..e4d0c3d6957885f1d719fedb5a900de601e397f8 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier.h
+++ b/tensorflow/compiler/xla/service/hlo_verifier.h
@@ -95,6 +95,7 @@ class ShapeVerifier : public DfsHloVisitor {
   Status HandleScatter(HloInstruction* scatter) override;
   Status HandleAfterAll(HloInstruction* token) override;
   Status HandleGetDimensionSize(HloInstruction* get_size) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   Status FinishVisit(HloInstruction*) override { return Status::OK(); }
 
diff --git a/tensorflow/compiler/xla/service/hlo_verifier_test.cc b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
index 5ddfe0a944f04f070f9bdb81697425ee417ac15a..4bc557e4e62e7df4e25fda86fe417e84129b464c 100644
--- a/tensorflow/compiler/xla/service/hlo_verifier_test.cc
+++ b/tensorflow/compiler/xla/service/hlo_verifier_test.cc
@@ -35,6 +35,10 @@ namespace {
 
 using ::testing::HasSubstr;
 
+std::unique_ptr<HloModule> CreateUnverifiedModule() {
+  return absl::make_unique<HloModule>("module", HloModuleConfig());
+}
+
 // This class cannot be converted to use HloTestBase. It explicitly
 // uses HloTestBase to create and test malformed HLOs.
 class HloVerifierTest : public HloTestBase {
@@ -66,7 +70,7 @@ TEST_F(HloVerifierTest, NullInstructionParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -85,7 +89,7 @@ TEST_F(HloVerifierTest, NullComputationParent) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   HloComputation* computation = module->AddEntryComputation(builder.Build());
 
   TF_ASSERT_OK(verifier().Run(module.get()).status());
@@ -104,7 +108,7 @@ TEST_F(HloVerifierTest, DifferentOperandParents) {
       HloInstruction::CreateParameter(0, scalar_shape, "param"));
   HloInstruction* negate = builder.AddInstruction(
       HloInstruction::CreateUnary(scalar_shape, HloOpcode::kNegate, param));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   HloComputation::Builder emb_builder(TestName());
@@ -138,7 +142,7 @@ TEST_F(HloVerifierTest, ResetsShapeVerifierState) {
   builder.AddInstruction(
       HloInstruction::CreateBinary(s2, HloOpcode::kMultiply, add, add));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Run the verifier twice.  It should fail both times, because it shouldn't
@@ -303,7 +307,7 @@ TEST_F(HloVerifierTest, NegativeInteriorPaddingNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32))),
       padding_config));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   auto status = verifier().Run(module.get()).status();
@@ -327,7 +331,7 @@ TEST_F(HloVerifierTest, PadNegativeInteriorDilationNotAllowed) {
           HloInstruction::CreateConstant(LiteralUtil::Zero(F32).Clone())),
       padding_config));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateUnverifiedModule();
   module->AddEntryComputation(builder.Build());
 
   EXPECT_THAT(verifier().Run(module.get()).status().error_message(),
diff --git a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
index 20cc18f981574adf1d95c9f1f87c95634238db06..98246d5403e4aebc2f4d81e52145706355ddd9a9 100644
--- a/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/indexed_array_analysis_test.cc
@@ -481,8 +481,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,1,6] s32[2,1,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } },
-    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ {1, 2, 3, 4, 5, 6} } } } })
+    { /*i0=0*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } },
+    { /*i0=1*/ { /*i1=0*/ { /*i2=0*/ { 1, 2, 3, 4, 5, 6 } } } } })
   (reshape %indices to s32[])
   0->[])
 )";
@@ -512,8 +512,8 @@ ENTRY main {
   const char* expected_root_expression = R"(
 (scalar-indexed-const
   (constant s32[2,1,1,6] s32[2,1,1,6] {
-    { /*i0=0*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } },
-    { /*i0=1*/ { /*i1=0*/ {1, 2, 3, 4, 5, 6} } } })
+    { /*i0=0*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } },
+    { /*i0=1*/ { /*i1=0*/ { 1, 2, 3, 4, 5, 6 } } } })
   (reshape %indices to s32[5])
   0->[2])
 )";
diff --git a/tensorflow/compiler/xla/service/instruction_fusion.cc b/tensorflow/compiler/xla/service/instruction_fusion.cc
index 7f2d7e7cffc6debaaf9b64fffc5a8a7037ecdaa3..7559ed1bab84b21a4d51bc38db999900befcfad7 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion.cc
@@ -103,7 +103,6 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kShiftRightLogical:
     case HloOpcode::kSlice:
     case HloOpcode::kSubtract:
-    case HloOpcode::kAfterAll:
     case HloOpcode::kTranspose:
     case HloOpcode::kTuple:
     case HloOpcode::kTupleSelect:
@@ -116,7 +115,10 @@ bool IsAlwaysDuplicable(const HloInstruction& instruction) {
     case HloOpcode::kSin:
       return ShapeUtil::ElementIsComplex(instruction.shape());
 
-    // Expensive instructions.
+    // Expensive instructions or unusual instructions for which fusion is
+    // nonsensical.
+    case HloOpcode::kAddDependency:
+    case HloOpcode::kAfterAll:
     case HloOpcode::kAtan2:
     case HloOpcode::kBatchNormGrad:
     case HloOpcode::kBatchNormInference:
@@ -455,8 +457,13 @@ StatusOr<bool> InstructionFusion::Run(HloModule* module) {
     computation_ = computation;
     reachability_ = HloReachabilityMap::Build(computation_);
 
-    HloInstructionSet do_not_duplicate =
-        ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
+    HloInstructionSet do_not_duplicate;
+    // If we allow duplications, we need to compute which instructions we do not
+    // want to duplicate based on a global analysis of the graph.
+    if (may_duplicate_) {
+      do_not_duplicate =
+          ComputeGloballyUnfusible(computation_->MakeInstructionPostOrder());
+    }
     auto fusion_queue = GetFusionQueue(computation_);
 
     // Instruction fusion effectively fuses edges in the computation graph
@@ -564,8 +571,8 @@ HloInstruction* InstructionFusion::FuseIntoMultiOutput(
 bool InstructionFusion::MultiOutputFusionCreatesCycle(
     HloInstruction* producer, HloInstruction* consumer) {
   auto is_reachable = [&](const HloInstruction* a, const HloInstruction* b) {
-    // A consumer operand may have been multii-output fused into a parallel
-    // consumer and thus be missing  from the oridinal reachability map.
+    // A consumer operand may have been multi-output fused into a parallel
+    // consumer and thus be missing from the original reachability map.
     if (!reachability_->IsPresent(a) || !reachability_->IsPresent(b)) {
       reachability_ = HloReachabilityMap::Build(consumer->parent());
     }
diff --git a/tensorflow/compiler/xla/service/instruction_fusion_test.cc b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
index 39904bd54b09a916d3e26e90c62cd6a202f9588d..58b7135cea7419f13d60ed510ecf7a88126aee48 100644
--- a/tensorflow/compiler/xla/service/instruction_fusion_test.cc
+++ b/tensorflow/compiler/xla/service/instruction_fusion_test.cc
@@ -117,7 +117,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -133,7 +133,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastSimpleReshapeOfParameterUnfused) {
   auto reshape1 = builder.AddInstruction(
       HloInstruction::CreateReshape(ShapeUtil::MakeShape(S32, {1, 1}), param0));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(reshape1, computation->root_instruction());
   EXPECT_FALSE(
@@ -149,7 +149,7 @@ TEST_F(InstructionFusionTest, PotentialBitcastTransposeOfParameterUnfused) {
   auto transpose1 = builder.AddInstruction(HloInstruction::CreateTranspose(
       ShapeUtil::MakeShape(S32, {}), param0, {}));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
   EXPECT_EQ(transpose1, computation->root_instruction());
   EXPECT_FALSE(
@@ -394,6 +394,56 @@ TEST_F(InstructionFusionTest, AllowEffectiveUnaryDuplication) {
           .ValueOrDie());
 }
 
+TEST_F(InstructionFusionTest, FuseDiamondGraphsNoDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/false)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
+TEST_F(InstructionFusionTest, FuseDiamondGraphsAllowDuplication) {
+  auto module = ParseHloString(R"(
+  HloModule test_module
+  ENTRY Test {
+    p0 = f32[100] parameter(0)
+    p1 = f32[100] parameter(1)
+    add = f32[100] add(p0, p1)
+    slice1 = f32[99] slice(add), slice={[0:99:1]}
+    slice2 = f32[99] slice(add), slice={[1:100:1]}
+    ROOT add2 = f32[99] add(slice1, slice2)
+  })")
+                    .ValueOrDie();
+  EXPECT_TRUE(
+      InstructionFusion(InstructionFusion::IsExpensive, /*may_duplicate=*/true)
+          .Run(module.get())
+          .ValueOrDie())
+      << module->ToString();
+
+  HloInstruction* root = module->entry_computation()->root_instruction();
+  // 'add' would originally need to be duplicated if fused. However after its
+  // two users 'slice1' and 'slice2' are fused into 'add2', 'add' has only one
+  // user and can now be also fused.
+  EXPECT_THAT(root, op::Fusion(op::Parameter(), op::Parameter()));
+}
+
 TEST_F(InstructionFusionTest,
        WideningConvertsAreAlwaysDuplicableIntoConsumers) {
   auto module = ParseHloString(R"(
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.cc b/tensorflow/compiler/xla/service/interpreter/executable.cc
index a06d6113e84630df14ff68280c248cccb9afaf06..de9204011ce5ba8a9fc2871c6bd7120b6ed371b5 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executable.cc
@@ -37,7 +37,7 @@ namespace xla {
 namespace interpreter {
 
 InterpreterExecutable::InterpreterExecutable(
-    std::unique_ptr<const HloModule> hlo_module,
+    std::unique_ptr<HloModule> hlo_module,
     std::unique_ptr<HloEvaluator> evaluator)
     : Executable(std::move(hlo_module), /*hlo_profile_printer=*/nullptr,
                  /*hlo_profile_index_map=*/nullptr),
@@ -85,6 +85,7 @@ StatusOr<ScopedShapedBuffer> InterpreterExecutable::ExecuteOnStream(
   Literal result_literal;
   {
     tensorflow::mutex_lock lock(evaluator_lock_);
+    evaluator_->ResetVisitStates();
     TF_ASSIGN_OR_RETURN(result_literal, evaluator_->Evaluate<Literal>(
                                             *computation, arg_literals));
   }
diff --git a/tensorflow/compiler/xla/service/interpreter/executable.h b/tensorflow/compiler/xla/service/interpreter/executable.h
index 3b1ebce0c75457d65e6834c809fe488a9c4a159a..bda13d376360306c81230e41b01cefc6caff230d 100644
--- a/tensorflow/compiler/xla/service/interpreter/executable.h
+++ b/tensorflow/compiler/xla/service/interpreter/executable.h
@@ -42,7 +42,7 @@ namespace interpreter {
 // buffer allocation. Refer to interpreter/README.md for more.
 class InterpreterExecutable : public Executable {
  public:
-  InterpreterExecutable(std::unique_ptr<const HloModule> hlo_module,
+  InterpreterExecutable(std::unique_ptr<HloModule> hlo_module,
                         std::unique_ptr<HloEvaluator> evaluator);
   ~InterpreterExecutable() override;
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.cc b/tensorflow/compiler/xla/service/interpreter/executor.cc
index 4fb67bd0b72fc591c1ffa76ebb0513bf14ed3737..e3e5fa71543baa309b3a68888b1b9bdfd43cfbd5 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.cc
+++ b/tensorflow/compiler/xla/service/interpreter/executor.cc
@@ -78,9 +78,14 @@ port::Status XlaInterpreterExecutor::SynchronousMemcpy(
   return port::Status::OK();
 }
 
-bool XlaInterpreterExecutor::HostCallback(Stream *stream,
-                                          std::function<void()> callback) {
-  AsExecutorStream(stream)->EnqueueTask(callback);
+bool XlaInterpreterExecutor::HostCallback(
+    Stream *stream, std::function<port::Status()> callback) {
+  AsExecutorStream(stream)->EnqueueTask([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return true;
 }
 
diff --git a/tensorflow/compiler/xla/service/interpreter/executor.h b/tensorflow/compiler/xla/service/interpreter/executor.h
index fbb99457847dca69a1901006d5d8ff713882f918..400c30515464ed5b00251fba303fef303a26b97b 100644
--- a/tensorflow/compiler/xla/service/interpreter/executor.h
+++ b/tensorflow/compiler/xla/service/interpreter/executor.h
@@ -125,7 +125,8 @@ class XlaInterpreterExecutor : public internal::StreamExecutorInterface {
     return port::Status{port::error::UNIMPLEMENTED, ""};
   }
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   port::Status AllocateEvent(Event *event) override {
     return port::Status{port::error::UNIMPLEMENTED, ""};
diff --git a/tensorflow/compiler/xla/service/layout_assignment.cc b/tensorflow/compiler/xla/service/layout_assignment.cc
index a90411922205c0006159ff99f35a70138b1bee4f..eddef850cf5250b85b564c1e6c92d1cc8ecd1a43 100644
--- a/tensorflow/compiler/xla/service/layout_assignment.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment.cc
@@ -2000,6 +2000,7 @@ bool LayoutAssignment::InstructionCanChangeLayout(
   switch (instruction->opcode()) {
     case HloOpcode::kAbs:
     case HloOpcode::kAdd:
+    case HloOpcode::kAddDependency:
     case HloOpcode::kAnd:
     case HloOpcode::kAtan2:
     case HloOpcode::kBitcastConvert:
diff --git a/tensorflow/compiler/xla/service/layout_assignment_test.cc b/tensorflow/compiler/xla/service/layout_assignment_test.cc
index 2400b7bb7c409a4dcb33e6e8f4b409738510f3d6..5c661bfacb08fe27f3cbdc1fb9db083315166008 100644
--- a/tensorflow/compiler/xla/service/layout_assignment_test.cc
+++ b/tensorflow/compiler/xla/service/layout_assignment_test.cc
@@ -27,10 +27,11 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/computation_layout.h"
 #include "tensorflow/compiler/xla/service/hlo_computation.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
-#include "tensorflow/compiler/xla/service/hlo_matchers.h"
 #include "tensorflow/compiler/xla/service/hlo_module.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/test.h"
@@ -42,11 +43,10 @@ limitations under the License.
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 
-namespace op = xla::testing::opcode_matchers;
-
 namespace xla {
 namespace {
 
+namespace m = xla::match;
 using ::testing::ElementsAre;
 
 class LayoutAssignmentTest : public HloTestBase {
@@ -328,11 +328,10 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
   //  %tuple.1 = Tuple(%copy) layout=({0,1})
   //  %tuple.2 = Tuple(%tuple.0, %tuple.1) layout=(({1,0}), ({0,1}))
   //
-  EXPECT_TRUE(
-      AlgebraicSimplifier(/*is_layout_sensitive=*/true,
-                          [](const Shape&, const Shape&) { return false; })
-          .Run(m.get())
-          .ValueOrDie());
+  AlgebraicSimplifierOptions options(
+      [](const Shape&, const Shape&) { return false; });
+  options.set_is_layout_sensitive(true);
+  EXPECT_TRUE(AlgebraicSimplifier(options).Run(m.get()).ValueOrDie());
   HloInstruction* root = m->entry_computation()->root_instruction();
   // Verify layout of the root and the root's operands.
   EXPECT_TRUE(ShapeUtil::Equal(result_shape, root->shape()));
@@ -343,7 +342,8 @@ TEST_F(LayoutAssignmentTest, ConflictingLayoutTuple) {
 
   // Verify the structure of the HLO graph.
   EXPECT_THAT(root,
-              op::Tuple(op::Tuple(constant), op::Tuple(op::Copy(constant))));
+              GmockMatch(m::Tuple(m::Tuple(m::Op().Is(constant)),
+                                  m::Tuple(m::Copy(m::Op().Is(constant))))));
 }
 
 TEST_F(LayoutAssignmentTest, ElementwiseAndReshape) {
@@ -947,9 +947,11 @@ TEST_F(LayoutAssignmentTest, CopySliceOperandToAvoidImplicitLayoutChange) {
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0});
-  EXPECT_THAT(root, op::Add(op::Parameter(),
-                            op::Slice(AllOf(op::Copy(op::Parameter(1)),
-                                            op::ShapeWithLayout(shape_copy)))));
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Add(
+          m::Parameter(),
+          m::Slice(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
@@ -977,10 +979,11 @@ TEST_F(LayoutAssignmentTest, CopyDSliceOperandToAvoidImplicitLayoutChange) {
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {1, 0});
   EXPECT_THAT(root,
-              op::Add(op::Parameter(),
-                      op::DynamicSlice(AllOf(op::Copy(op::Parameter(1)),
-                                             op::ShapeWithLayout(shape_copy)),
-                                       op::Parameter(2))));
+              GmockMatch(m::Add(
+                  m::Parameter(),
+                  m::DynamicSlice(
+                      m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
+                      m::Parameter(2)))));
 }
 
 TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
@@ -1008,11 +1011,12 @@ TEST_F(LayoutAssignmentTest, CopyConcatOperandToAvoidImplicitLayoutChange) {
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {3, 5}, {1, 0});
-  EXPECT_THAT(root,
-              op::Add(op::Parameter(),
-                      op::Concatenate(AllOf(op::Copy(op::Parameter(1)),
-                                            op::ShapeWithLayout(shape_copy)),
-                                      op::Parameter(2))));
+  EXPECT_THAT(
+      root,
+      GmockMatch(m::Add(
+          m::Parameter(),
+          m::Concatenate(m::Copy(m::Parameter(1)).WithShapeEqualTo(&shape_copy),
+                         m::Parameter(2)))));
 }
 
 TEST_F(LayoutAssignmentTest,
@@ -1039,7 +1043,8 @@ TEST_F(LayoutAssignmentTest,
           .ConsumeValueOrDie();
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
-  EXPECT_THAT(root, op::Convolution(op::Parameter(0), op::Parameter(1)));
+  EXPECT_THAT(root,
+              GmockMatch(m::Convolution(m::Parameter(0), m::Parameter(1))));
 }
 
 TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
@@ -1063,8 +1068,9 @@ TEST_F(LayoutAssignmentTest, PropagatingLayoutFromResultToOperand) {
   HloInstruction* root =
       compiled_module->entry_computation()->root_instruction();
   Shape shape_copy = ShapeUtil::MakeShapeWithLayout(F32, {4, 5}, {0, 1});
-  EXPECT_THAT(root, op::Slice(AllOf(op::Copy(op::Parameter(0)),
-                                    op::ShapeWithLayout(shape_copy))));
+  EXPECT_THAT(root,
+              GmockMatch(m::Slice(
+                  m::Copy(m::Parameter(0)).WithShapeEqualTo(&shape_copy))));
 }
 
 TEST_F(LayoutAssignmentTest, TupleCopyOnLayoutMismatch) {
@@ -1150,7 +1156,7 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] {
     AssignLayouts(m.get(), &computation_layout);
 
     HloInstruction* root = m->entry_computation()->root_instruction();
-    ASSERT_THAT(root, op::CustomCall(op::Parameter()));
+    ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter())));
     ExpectLayoutIs(root->shape(), {3, 2, 0, 1});
     ExpectLayoutIs(root->operand(0)->shape(), {0, 2, 1});
   }
@@ -1166,7 +1172,7 @@ ENTRY %CustomCallWithNotLayoutConstrained (p: f32[42,2,3]) -> f32[1,2,3,4] {
     AssignLayouts(m.get(), &computation_layout);
 
     HloInstruction* root = m->entry_computation()->root_instruction();
-    ASSERT_THAT(root, op::CustomCall(op::Parameter()));
+    ASSERT_THAT(root, GmockMatch(m::CustomCall(m::Parameter())));
     ExpectLayoutIs(root->shape(), {0, 2, 3, 1});
     ExpectLayoutIs(root->operand(0)->shape(), {0, 1, 2});
   }
@@ -1197,7 +1203,7 @@ ENTRY %CustomCallWithLayoutConstraints (p0: f32[4,4], p1: f32[2,3]) -> f32[1,2,3
   // The custom call should be partially encapsulated in kCopy instructions
   // because of the layout mismatches.
   ASSERT_THAT(m->entry_computation()->root_instruction(),
-              op::Copy(op::CustomCall(op::Copy(), op::Parameter())));
+              GmockMatch(m::Copy(m::CustomCall(m::Copy(), m::Parameter()))));
 
   const HloInstruction* custom_call =
       m->entry_computation()->root_instruction()->operand(0);
@@ -1223,7 +1229,7 @@ ENTRY %CustomCallLayoutConstrainedZeroOperands () -> f32[1,2,3,4] {
   AssignLayouts(m.get(), &computation_layout);
 
   ASSERT_THAT(m->entry_computation()->root_instruction(),
-              op::Copy(op::CustomCall()));
+              GmockMatch(m::Copy(m::CustomCall())));
 
   const HloInstruction* custom_call =
       m->entry_computation()->root_instruction()->operand(0);
@@ -1257,7 +1263,7 @@ ENTRY %CustomCallLayoutConstrainedTupleOperand (p0: f32[4,4], p1: f32[2,3]) -> f
   ExpectLayoutIs(root->shape(), {2, 1, 0, 3});
 
   ASSERT_THAT(m->entry_computation()->root_instruction(),
-              op::Copy(op::CustomCall(op::Tuple())));
+              GmockMatch(m::Copy(m::CustomCall(m::Tuple()))));
 
   const HloInstruction* custom_call =
       m->entry_computation()->root_instruction()->operand(0);
diff --git a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
index f4b05f29c38529b3cce81b4c8ee6fae5c00cafcc..d6d84994ee147f4b8c1a333b0eaccdf6e0a2219b 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/ir_array.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Value.h"
 #include "tensorflow/compiler/xla/map_util.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/types.h"
 #include "tensorflow/compiler/xla/xla_data.pb.h"
 #include "tensorflow/core/platform/logging.h"
@@ -108,6 +109,14 @@ class IrArray {
     Index(absl::Span<llvm::Value* const> multidim, llvm::Value* linear,
           const Shape& shape);
 
+    // Returns an index that adds `addend` to the given `dim` of the object.
+    Index AddOffsetToDim(llvm::Value* addend, int64 dim,
+                         llvm::IRBuilder<>* b) const {
+      IrArray::Index index = *this;
+      index[dim] = b->CreateAdd(index[dim], addend);
+      return index;
+    }
+
     const std::vector<llvm::Value*>& multidim() const { return multidim_; }
     llvm::Value* linear() const { return linear_; }
 
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
index e5fbdbd51b8a9aa14decadedd1eeb3bdbf831738..c26711e526c9b89cdedcb6aed9f93d41dd25dc83 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.cc
@@ -52,6 +52,29 @@ Shape MergeDimensions(absl::Span<const size_t> segs, const Shape& shape) {
   return ShapeUtil::MakeShapeWithDescendingLayout(shape.element_type(),
                                                   dimensions);
 }
+
+// Given an index for a shape, return the equivalent new index if the shape is
+// reshaped to another shape.
+IrArray::Index GetReshapedIndex(const IrArray::Index& index, const Shape& shape,
+                                const Shape& reshaped_shape,
+                                llvm::IRBuilder<>* b) {
+  auto bounds = shape.dimensions();
+  auto minor_to_major = shape.layout().minor_to_major();
+  llvm::Value* linear_index = index.GetConstantWithIndexType(0);
+  int64 multiplier = 1;
+  for (int i = 0; i < index.size(); ++i) {
+    int64 dim = minor_to_major[i];
+    llvm::Value* addend = b->CreateMul(
+        index[dim], index.GetConstantWithIndexType(multiplier), "linearizing",
+        /*HasNUW=*/true, /*HasNSW=*/true);
+    linear_index = b->CreateAdd(linear_index, addend, "",
+                                /*HasNUW=*/true, /*HasNSW=*/true);
+    multiplier *= bounds[dim];
+  }
+
+  return IrArray::Index(linear_index, reshaped_shape, b);
+}
+
 }  // namespace
 
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
@@ -60,28 +83,30 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
     return absl::nullopt;
   }
 
-  std::vector<int64> perm(a.dimensions().size());
-  {
-    auto layout_a_orig = LayoutUtil::MinorToMajor(a);
-    std::vector<int64> layout_a(layout_a_orig.rbegin(), layout_a_orig.rend());
-    auto layout_b_orig = LayoutUtil::MinorToMajor(b);
-    std::vector<int64> layout_b(layout_b_orig.rbegin(), layout_b_orig.rend());
-    for (size_t i = 0; i < perm.size(); ++i) {
-      perm[i] = PositionInContainer(layout_b, layout_a[i]);
-    }
+  std::vector<int64> permutation(a.dimensions().size());
+  absl::Span<const int64> minor_to_major_a = LayoutUtil::MinorToMajor(a);
+  std::vector<int64> major_to_minor_a(minor_to_major_a.rbegin(),
+                                      minor_to_major_a.rend());
+  absl::Span<const int64> minor_to_major_b = LayoutUtil::MinorToMajor(b);
+  std::vector<int64> major_to_minor_b(minor_to_major_b.rbegin(),
+                                      minor_to_major_b.rend());
+  for (size_t i = 0; i < permutation.size(); ++i) {
+    permutation[i] = PositionInContainer(major_to_minor_b, major_to_minor_a[i]);
   }
-  auto segs = ConsecutiveSegments(perm);
-  if ((3 == segs.size() && 0 == perm[0]) || 2 == segs.size()) {
-    Shape norm_a =
+
+  std::vector<size_t> segments = ConsecutiveSegments(permutation);
+  if ((3 == segments.size() && 0 == permutation[0]) || 2 == segments.size()) {
+    Shape descending_layout_shape =
         ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(a);
-    Shape reduced_a = MergeDimensions(segs, norm_a);
-    auto reduced_a_dims = reduced_a.dimensions();
+    Shape normalized_shape = MergeDimensions(segments, descending_layout_shape);
+    absl::Span<const int64> normalized_dims =
+        AsInt64Slice(normalized_shape.dimensions());
     std::vector<int64> dims_021;
-    if (2 == segs.size()) {
+    if (2 == segments.size()) {
       // The logical component-0 is of size one.
-      dims_021 = {1, reduced_a_dims[1], reduced_a_dims[0]};
+      dims_021 = {1, normalized_dims[1], normalized_dims[0]};
     } else {
-      dims_021 = {reduced_a_dims[0], reduced_a_dims[2], reduced_a_dims[1]};
+      dims_021 = {normalized_dims[0], normalized_dims[2], normalized_dims[1]};
     }
 
     return dims_021;
@@ -90,27 +115,117 @@ absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
   return absl::nullopt;
 }
 
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b) {
-  auto bounds = reduced_output_shape.dimensions();
-  auto minor_to_major = reduced_output_shape.layout().minor_to_major();
-  llvm::Value* linear_index = reduced_output_index.GetConstantWithIndexType(0);
-  int64 multiplier = 1;
-  for (int i = 0; i < reduced_output_index.size(); ++i) {
-    int64 dim = minor_to_major[i];
-    llvm::Value* addend =
-        b->CreateMul(reduced_output_index[dim],
-                     reduced_output_index.GetConstantWithIndexType(multiplier),
-                     "linearizing",
-                     /*HasNUW=*/true, /*HasNSW=*/true);
-    linear_index = b->CreateAdd(linear_index, addend, "",
-                                /*HasNUW=*/true, /*HasNSW=*/true);
-    multiplier *= bounds[dim];
+KernelMappingScheme::KernelMappingScheme(
+    absl::Span<const int64> dims_in_elems, int64 tile_size_y, int64 tile_size_x,
+    absl::Span<const int64> req_block_sizes, int64 num_threads_y,
+    int64 num_threads_x, llvm::IRBuilder<>* b)
+    : b_(b),
+      dims_in_elems_(dims_in_elems),
+      tile_sizes_{1, tile_size_y, tile_size_x},
+      num_threads_x_(num_threads_x),
+      num_threads_y_(num_threads_y) {
+  DCHECK_EQ(dims_in_elems_.size(), 3);
+  DCHECK_EQ(req_block_sizes.size(), 3);
+
+  DCHECK_EQ(tile_size_y % num_threads_y_, 0);
+  DCHECK_EQ(tile_size_x % num_threads_x_, 0);
+
+  dims_in_tiles_ = ElementWiseCeilOfRatio<int64>(dims_in_elems_, tile_sizes_);
+  block_sizes_.reserve(req_block_sizes.size());
+  absl::c_transform(req_block_sizes, dims_in_tiles_,
+                    std::back_inserter(block_sizes_),
+                    [](const int64 requested_size, const int64 max_size) {
+                      return std::min(requested_size, max_size);
+                    });
+  dims_in_blocks_ = ElementWiseCeilOfRatio<int64>(dims_in_tiles_, block_sizes_);
+
+  VLOG(10) << "dims_in_elems_ = [" << absl::StrJoin(dims_in_elems_, ",") << "]";
+  VLOG(10) << "dims_in_tiles_ = [" << absl::StrJoin(dims_in_tiles_, ",") << "]";
+  VLOG(10) << "dims_in_blocks_ = [" << absl::StrJoin(dims_in_blocks_, ",")
+           << "]";
+}
+
+IrArray::Index KernelMappingScheme::GetUnnormalizedIndex(
+    const IrArray::Index& normalized_shape_index,
+    const Shape& unnormalized_shape) {
+  DCHECK_EQ(normalized_shape_index.size(), dims_in_elems_.size());
+  Shape output_shape = ShapeUtil::MakeShapeWithDescendingLayout(
+      unnormalized_shape.element_type(), GetDimensionsInElements());
+  return GetReshapedIndex(normalized_shape_index, output_shape,
+                          unnormalized_shape, b_);
+}
+
+IrArray::Index KernelMappingScheme::EmitBlockIndex(llvm::Type* index_ty) {
+  llvm::Value* block_id = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_ctaid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetNumberOfBlocks(),
+                            llvm::cast<llvm::Instruction>(block_id));
+  llvm::Value* linear_block_id =
+      b_->CreateIntCast(block_id, index_ty, /*isSigned=*/true, "block.id.x");
+  return IrArray::Index(linear_block_id,
+                        ShapeUtil::MakeShapeWithDescendingLayout(
+                            PRED /*arbitrary*/, dims_in_blocks_),
+                        b_);
+}
+
+IrArray::Index KernelMappingScheme::GetTileIndexForBlockOrigin(
+    const IrArray::Index& block_index) {
+  IrArray::Index tile_index = block_index;
+  for (int i = 0; i < block_sizes_.size(); ++i) {
+    tile_index[i] = b_->CreateMul(
+        block_index[i],
+        llvm::ConstantInt::get(block_index[i]->getType(), block_sizes_[i]),
+        "block_origin." + std::to_string(i));
+  }
+  return tile_index;
+}
+
+IrArray::Index KernelMappingScheme::GetElementIndexForTileOrigin(
+    const IrArray::Index& tile_index) {
+  IrArray::Index elem_index = tile_index;
+  for (int i = DimY; i < DimTot; ++i) {
+    elem_index[i] =
+        b_->CreateMul(tile_index[i],
+                      llvm::ConstantInt::get(tile_index[i]->getType(),
+                                             GetTileSizeForDimension(i)),
+                      "tile_origin." + std::to_string(i));
   }
+  return elem_index;
+}
+
+llvm::GlobalVariable* KernelMappingScheme::GetSharedMemoryBufferForElementType(
+    llvm::Type* elem_ty, absl::string_view buffer_name) {
+  // If shared memory tranpose is needed, we use square tiles.
+  CHECK_EQ(GetTileSizeForDimensionX(), GetTileSizeForDimensionY());
+
+  // For Nvidia GPUs, the warp size is 32 threads and the shared memory bank is
+  // organized into 32-way. We usually use the warp size or a multiplier or a
+  // the warp size as the size for tiling. This may cause all elements in the
+  // same column of a tile use the same memory bank and therefore shared memory
+  // bank conflicts. Adding 1 to the minor dimension of the shared memory buffer
+  // can reduce such shared memory bank conflicts.
+  llvm::Type* buffer_type = llvm::ArrayType::get(
+      llvm::ArrayType::get(elem_ty, GetTileSizeForDimension(DimX) + 1),
+      GetTileSizeForDimension(DimY));
+  return llvm_ir::AllocateSharedMemoryTile(b_->GetInsertBlock()->getModule(),
+                                           buffer_type, buffer_name);
+}
 
-  return IrArray::Index(linear_index, unreduced_output_shape, b);
+std::tuple<llvm::Value*, llvm::Value*>
+KernelMappingScheme::EmitThreadYXCoordinate(llvm::Type* index_ty) {
+  // Calculate (y, x) coordinate of the thread in the 2D view of thread block
+  // defined by (num_thread_y, num_thread_x) from thread_id.
+  llvm::CallInst* thread_id_raw = llvm_ir::EmitCallToIntrinsic(
+      llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b_);
+  llvm_ir::AddRangeMetadata(0, GetThreadsPerTile(), thread_id_raw);
+  llvm::Value* thread_id_int =
+      b_->CreateIntCast(thread_id_raw, index_ty,
+                        /*isSigned=*/true, "thread.id.x");
+  llvm::Value* num_thread_x =
+      llvm::ConstantInt::get(index_ty, GetNumberOfThreadsForDimensionX());
+  llvm::Value* x = b_->CreateURem(thread_id_int, num_thread_x);
+  llvm::Value* y = b_->CreateUDiv(thread_id_int, num_thread_x);
+  return std::make_tuple(y, x);
 }
 
 }  // namespace llvm_ir
diff --git a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
index 5ea05b3188a1c0881e4c0c41625d530aff1b1205..06002d57b0d7daa07f903feebe67a60a083c0e7c 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/kernel_tiling.h
@@ -28,23 +28,160 @@ namespace llvm_ir {
 // If a shape can be viewed as three logical components 0-1-2 in the order of
 // major to minor, a 0-2-1-transpose changes the order of such logical
 // components to 0-2-1. We call the shape being transposed the input shape and
-// the transposed shape the output shape. The logical view of the input and
-// output shapes for the transpose are called the 0-1-2 shape or reduced input
-// shape and the 0-2-1 shape or the reduced output shape respectively. The
-// original input and output shapes are called the unreduced input and output
-// shapes.
-
+// the transposed shape the output shape. The logical view of the input/output
+// shapes for the transpose are called the 0-1-2/0-2-1 shapes or the normalized
+// shapes. The original input/output shapes are called unnormalized shapes.
+//
 // If `b` is a 0-2-1 transpose of `a` in 0-1-2, return the dimensions for the
-// reduced shape of `b` or the 0-2-1 shape.
+// normalized shape of `b` or the 0-2-1 shape.
 absl::optional<std::vector<int64> > FindTranspose021(const Shape& a,
                                                      const Shape& b);
 
-// Return the unreduced output index corresponding to the given reduced output
-// index.
-IrArray::Index GetUnreducedOutputIndex(
-    const IrArray::Index& reduced_output_index,
-    const Shape& reduced_output_shape, const Shape& unreduced_output_shape,
-    llvm::IRBuilder<>* b);
+// A tile is a spatial subdivision of a tensor. We group tensor elements into
+// tiles so that we can launch kernels to process the tensor elements in blocks
+// of tiles.
+//
+// A kernel mapping scheme describes a method to partition the tensors accessed
+// by an unnested HLO instruction into tiles and blocks of tiles, and the
+// associated information to use hardware threads to process the tensor elements
+// in blocks of tiles.
+//
+// Currently, there are two main use cases for a tiling scheme. First, we
+// implement kernels with 0-2-1 memory transpose using shared memory to improve
+// memory access pattern. Second, we implement reduction to contiguous
+// dimensions in layout, with or without memory tranpsose, to achieve better
+// memory access pattern as well as to reduce the need numbers of executed
+// expensive instructions, such as thread synchronization related instructions
+// and atomic operations. For both use cases, we can apply a normalization to
+// the original tensors, to collapse contiguous dimensions for the same purpose
+// and produce normlized three dimensional tensors. For this reason, the tiling
+// scheme class only needs to handle normalized three dimensional tensors and
+// two dimensional tiles.
+//
+// The current implementation of the class is somewhat NVIDIA GPU oriented. This
+// situation can be improved when there is a need though. The idea of 0-2-1
+// transpose using shared memory can be found in the following CUDA algorithm in
+// TensorFlow: https://goo.gl/MStRV6.
+//
+// We use a thread block to process a tile because we want to use the HW thread
+// block synchronization primitives to synchronize the processing of all the
+// elements in the same tile. A thread block can be viewed as a two dimensional
+// array of threads, described by the number of threads for the Y and X
+// dimensions. A thread block (num_threads_y, num_threads_x) processes a tile of
+// (tile_size_y, tile_size_x) as follows: each thread in the thread block
+// processes one element in the tile so that all the threads in the thread block
+// together process a subdivision of the tile that has the same dimension as the
+// thread block array. Then the thread block moves on to process the next
+// subdivision of the tile until the whole tile is processed. Therefore, each
+// thread in the thread block processes
+// tile_size_x/num_threads_x * tile_size_y/num_threads_y elements in a tile.
+//
+// There are situations where we want a thread block to process multiple
+// tiles. We can't group those tiles into a bigger tiles because we limit a tile
+// to a two dimensional spatial subdivision of a tensor. For example, when we
+// use tiling to implement reduction with tranpose, we want the partial sum
+// produced by each thread to accumulate values for more elements before using
+// shlf_down and atomic_add instructions for further reduction, to amortize the
+// cost of such expensive instructions. The concept of tile block is introduced
+// for this purpose. A tile block is a three dimensional array of tiles, of
+// which some dimensions may be degenerated to only one tile.
+class KernelMappingScheme {
+ public:
+  enum { DimZ = 0, DimY, DimX, DimTot };
+
+ public:
+  // dims_in_elems: the normalized tensor dimensions.
+  // req_block_sizes: the requested block size in number of tiles for each
+  //   dimension. The actual block size is set to min(req_block_size,
+  //   dims_in_number_of_blocks).
+  explicit KernelMappingScheme(absl::Span<const int64> dims_in_elems,
+                               int64 tile_size_y, int64 tile_size_x,
+                               absl::Span<const int64> req_block_sizes,
+                               int64 num_threads_y, int64 num_threads_x,
+                               llvm::IRBuilder<>* b);
+
+  absl::Span<const int64> GetDimensionsInElements() const {
+    return dims_in_elems_;
+  }
+  absl::Span<const int64> GetDimensionsInTiles() const {
+    return dims_in_tiles_;
+  }
+  absl::Span<const int64> GetDimensionsInBlocks() const {
+    return dims_in_blocks_;
+  }
+
+  int64 GetNumberOfTilesInTotal() const {
+    return absl::c_accumulate(dims_in_tiles_, 1LL, std::multiplies<int64>());
+  }
+  int64 GetNumberOfTilesInOneBlock() const {
+    return absl::c_accumulate(block_sizes_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetNumberOfBlocks() const {
+    return absl::c_accumulate(dims_in_blocks_, 1, std::multiplies<int64>());
+  }
+
+  int64 GetTileSizeForDimension(int d) const {
+    DCHECK(d >= DimZ && d <= DimX);
+    return tile_sizes_[d];
+  }
+  int64 GetTileSizeForDimensionX() const {
+    return GetTileSizeForDimension(DimX);
+  }
+  int64 GetTileSizeForDimensionY() const {
+    return GetTileSizeForDimension(DimY);
+  }
+
+  absl::Span<const int64> GetBlockSizes() const { return block_sizes_; }
+
+  int64 GetNumberOfThreadsForDimensionX() const { return num_threads_x_; }
+  int64 GetNumberOfThreadsForDimensionY() const { return num_threads_y_; }
+
+  int64 GetThreadsPerTile() const {
+    return GetNumberOfThreadsForDimensionX() *
+           GetNumberOfThreadsForDimensionY();
+  }
+
+  IrArray::Index EmitBlockIndex(llvm::Type* index_ty);
+  // Returns the index for the first tile in the block with the given block
+  // index.
+  IrArray::Index GetTileIndexForBlockOrigin(const IrArray::Index& block_index);
+  // Returns the index for the first element in the tile with the given tile
+  // index.
+  IrArray::Index GetElementIndexForTileOrigin(const IrArray::Index& tile_index);
+
+  std::tuple<llvm::Value*, llvm::Value*> EmitThreadYXCoordinate(
+      llvm::Type* index_ty);
+
+  IrArray::Index GetUnnormalizedIndex(
+      const IrArray::Index& normalized_shape_index,
+      const Shape& unnormalized_shape);
+
+  llvm::GlobalVariable* GetSharedMemoryBufferForElementType(
+      llvm::Type* elem_ty, absl::string_view buffer_name);
+
+ private:
+  llvm::IRBuilder<>* b_;
+  // The number of elements in each dimension.
+  absl::Span<const int64> dims_in_elems_;
+
+  // The number of elements for each dimension of a tile.
+  std::vector<int64> tile_sizes_;
+  // The number of tiles in each dimension. It is computed from dims_in_elem_
+  // and tile_sizes_.
+  std::vector<int64> dims_in_tiles_;
+
+  // The number of tiles for each dimension of a tile block.
+  std::vector<int64> block_sizes_;
+  // The number of blocks in each dimension of a tile block. It is computed from
+  // dims_in_tile_ and block_sizes_.
+  std::vector<int64> dims_in_blocks_;
+
+  // Number of threads used to process elements in the X direction of a tile.
+  int64 num_threads_x_;
+  // Number of threads used to process elements in the Y direction of a tile.
+  int64 num_threads_y_;
+};
 
 // A class to represent information for tiled parameters to support IR emission
 // for 021 transpose.
diff --git a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
index df78726166eea953b57e72a5a5fc81ee246aca34..ceea24685af566e02340664f0a40c398c62b5ab0 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/llvm_util.cc
@@ -244,10 +244,11 @@ StatusOr<llvm::Value*> EncodeSelfDescribingShapeConstant(const Shape& shape,
 
 StatusOr<Shape> DecodeSelfDescribingShapeConstant(const void* shape_ptr,
                                                   int32 size_bytes) {
-  Shape shape;
-  TF_RET_CHECK(shape.ParseFromArray(shape_ptr, size_bytes));
+  ShapeProto shape_proto;
+  TF_RET_CHECK(shape_proto.ParseFromArray(shape_ptr, size_bytes));
+  Shape shape(shape_proto);
   TF_RETURN_IF_ERROR(ShapeUtil::ValidateShape(shape));
-  return shape;
+  return std::move(shape);
 }
 
 llvm::Constant* ConvertLiteralToIrConstant(const Literal& literal,
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
index fd16af67fe99b4f440ad962b4b648a3b22c41dc6..e22c2173c271fc9571be1ddb0759d2b31562dc98 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.cc
@@ -47,7 +47,8 @@ namespace {
 // Adds the inner comparison loop body where we compare elements.
 void EmitCompareLoopBody(
     int64 iteration_bound, PrimitiveType key_type, int64 num_values,
-    llvm::Value* element_pair_index, int64 xor_mask, llvm::Type* index_type,
+    int64 iota_values_parameter_index, llvm::Value* element_pair_index,
+    int64 xor_mask, llvm::Type* index_type,
     std::function<llvm::Value*(int64 operand, llvm::Value* index)> read_element,
     std::function<void(int64 operand, llvm::Value* index, llvm::Value* value)>
         write_element,
@@ -139,34 +140,42 @@ void EmitCompareLoopBody(
       is_signed_comparison = false;
     }
     // If key2 < key1
-    ksl.IfReturnVoid(
-        "is_smaller_than",
+    auto is_smaller_than =
         b->CreateICmp(is_signed_comparison ? llvm::ICmpInst::ICMP_SLT
                                            : llvm::ICmpInst::ICMP_ULT,
-                      compare_key2, compare_key1),
-        [&]() {
-          // Swap key1 with key2.
-          write_element(0, current_keys_index, key2);
-          write_element(0, compare_keys_index, key1);
-          for (int64 i = 1; i <= num_values; ++i) {
-            // Also swap the values.
-            auto value1 = read_element(i, current_keys_index);
-            auto value2 = read_element(i, compare_keys_index);
-            write_element(i, current_keys_index, value2);
-            write_element(i, compare_keys_index, value1);
-          }
-        });
+                      compare_key2, compare_key1);
+    if (iota_values_parameter_index >= 0) {
+      auto keys_equal = b->CreateICmpEQ(compare_key1, compare_key2);
+      auto key_index1 =
+          read_element(iota_values_parameter_index, current_keys_index);
+      auto key_index2 =
+          read_element(iota_values_parameter_index, compare_keys_index);
+      auto index_is_smaller_than =
+          b->CreateICmp(llvm::ICmpInst::ICMP_ULT, key_index2, key_index1);
+      is_smaller_than = b->CreateOr(
+          is_smaller_than, b->CreateAnd(keys_equal, index_is_smaller_than));
+    }
+    ksl.IfReturnVoid("is_smaller_than", is_smaller_than, [&]() {
+      // Swap key1 with key2.
+      write_element(0, current_keys_index, key2);
+      write_element(0, compare_keys_index, key1);
+      for (int64 i = 1; i <= num_values; ++i) {
+        // Also swap the values.
+        auto value1 = read_element(i, current_keys_index);
+        auto value2 = read_element(i, compare_keys_index);
+        write_element(i, current_keys_index, value2);
+        write_element(i, compare_keys_index, value1);
+      }
+    });
   });
 }
 
-void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index,
-                          int64 dimension_to_sort,
-                          int64 dimension_to_sort_bound,
-                          PrimitiveType keys_type,
-                          absl::Span<const int64> xor_masks,
-                          const std::vector<IrArray>& params,
-                          const std::vector<llvm::Value*>& param_shmem_buffers,
-                          int64 tile_size, llvm::IRBuilder<>* b) {
+void EmitTiledCompareLoop(
+    const IrArray::Index& tiled_keys_index, int64 dimension_to_sort,
+    int64 dimension_to_sort_bound, PrimitiveType keys_type,
+    absl::Span<const int64> xor_masks, const std::vector<IrArray>& params,
+    const std::vector<llvm::Value*>& param_shmem_buffers,
+    int64 iota_values_parameter_index, int64 tile_size, llvm::IRBuilder<>* b) {
   KernelSupportLibrary ksl(b);
   llvm::Value* thread_id = llvm_ir::EmitCallToIntrinsic(
       llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x, {}, {}, b);
@@ -253,20 +262,22 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index,
                   RoundDownToNearest(dimension_to_sort_bound, tile_size))),
           [&]() {
             EmitCompareLoopBody(dimension_to_sort_bound % tile_size, keys_type,
-                                params.size() - 1, element_pair_index, xor_mask,
+                                params.size() - 1, iota_values_parameter_index,
+                                element_pair_index, xor_mask,
                                 tiled_keys_index.GetType(), read_element,
                                 write_element, b);
           },
           [&]() {
-            EmitCompareLoopBody(
-                tile_size, keys_type, params.size() - 1, element_pair_index,
-                xor_mask, tiled_keys_index.GetType(), read_element,
-                write_element, b, /*needs_bounds_checks=*/false);
+            EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
+                                iota_values_parameter_index, element_pair_index,
+                                xor_mask, tiled_keys_index.GetType(),
+                                read_element, write_element, b,
+                                /*needs_bounds_checks=*/false);
           });
     } else {
       EmitCompareLoopBody(tile_size, keys_type, params.size() - 1,
-                          element_pair_index, xor_mask,
-                          tiled_keys_index.GetType(), read_element,
+                          iota_values_parameter_index, element_pair_index,
+                          xor_mask, tiled_keys_index.GetType(), read_element,
                           write_element, b, /*needs_bounds_checks=*/false);
     }
     // Wait until all comparisons have happened.
@@ -296,6 +307,7 @@ void EmitTiledCompareLoop(const IrArray::Index& tiled_keys_index,
 
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
                        const std::vector<IrArray>& values_arrays,
+                       int64 iota_values_parameter_index,
                        absl::string_view name,
                        absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
                        const gpu::LaunchDimensions& launch_dimensions,
@@ -367,8 +379,8 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
     if (xor_masks.size() > 1) {
       EmitTiledCompareLoop(keys_index, dimension_to_sort,
                            dimension_to_sort_bound, keys_shape.element_type(),
-                           xor_masks, params, param_shmem_buffers, tile_size,
-                           b);
+                           xor_masks, params, param_shmem_buffers,
+                           iota_values_parameter_index, tile_size, b);
     } else {
       auto read_element = [&](int64 operand, llvm::Value* index) {
         keys_index[dimension_to_sort] = index;
@@ -380,9 +392,10 @@ Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
         params[operand].EmitWriteArrayElement(keys_index, value, b);
       };
       EmitCompareLoopBody(dimension_to_sort_bound, keys_shape.element_type(),
-                          values_arrays.size(), tiles_index[rank - 1],
-                          xor_masks[0], tiles_index.GetType(), read_element,
-                          write_element, b);
+                          values_arrays.size(), iota_values_parameter_index,
+                          tiles_index[rank - 1], xor_masks[0],
+                          tiles_index.GetType(), read_element, write_element,
+                          b);
     }
     return Status::OK();
   };
diff --git a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
index 556a217322d373ffd5e816dcf35888b546806633..685f9383acba416f51681270e4037d56abb4b6ea 100644
--- a/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
+++ b/tensorflow/compiler/xla/service/llvm_ir/sort_util.h
@@ -31,9 +31,12 @@ namespace llvm_ir {
 // Emits llvm IR to do pairwise comparisons/swaps in the 'dimension_to_sort'
 // dimension of 'keys_array'. All other dimensions are kept as-is. This
 // implements the inner loop of BitonicSort. It is assumed that 'xor_masks'
-// contains only powers of 2, or values 2^k - 1 (k > 0).
+// contains only powers of 2, or values 2^k - 1 (k > 0). If
+// 'iota_values_parameter_index' is >= 0, it points at a 'values_arrays' operand
+// that is a iota and can be used to make the sorting stable.
 Status EmitSortInPlace(int64 dimension_to_sort, const IrArray& keys_array,
                        const std::vector<IrArray>& values_arrays,
+                       int64 iota_values_parameter_index,
                        absl::string_view name,
                        absl::Span<const int64> xor_masks, llvm::IRBuilder<>* b,
                        const gpu::LaunchDimensions& launch_dimensions,
diff --git a/tensorflow/compiler/xla/service/local_service.cc b/tensorflow/compiler/xla/service/local_service.cc
index cca37556173bb95ef062b59ab0a4bf9ca7c496fe..6c89700983363fec46c41b5430c6eab6b366a1b6 100644
--- a/tensorflow/compiler/xla/service/local_service.cc
+++ b/tensorflow/compiler/xla/service/local_service.cc
@@ -96,44 +96,18 @@ ExecutionOptions CreateExecutionOptions(
     const ExecutableBuildOptions& build_options,
     const ProgramShape* program_shape) {
   ExecutionOptions execution_options = CreateDefaultExecutionOptions();
-  if (build_options.hlo_profile().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_hlo_profile(
-        *build_options.hlo_profile());
-  }
-  if (build_options.generate_hlo_graph().has_value()) {
-    execution_options.mutable_debug_options()->set_xla_generate_hlo_graph(
-        build_options.generate_hlo_graph().value());
-  }
-  if (build_options.dump_optimized_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_optimized_hlo_proto_to(
-            build_options.dump_optimized_hlo_proto_to().value());
-  }
-  if (build_options.dump_unoptimized_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_unoptimized_hlo_proto_to(
-            build_options.dump_unoptimized_hlo_proto_to().value());
-  }
-  if (build_options.dump_per_pass_hlo_proto_to().has_value()) {
-    execution_options.mutable_debug_options()
-        ->set_xla_dump_per_pass_hlo_proto_to(
-            build_options.dump_per_pass_hlo_proto_to().value());
+  if (build_options.has_debug_options()) {
+    *execution_options.mutable_debug_options() = build_options.debug_options();
   }
   if (build_options.result_layout() != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *build_options.result_layout();
+        build_options.result_layout()->ToProto();
   } else {
+    Shape result_shape(program_shape->result());
+    LayoutUtil::SetToDefaultLayout(&result_shape);
     *execution_options.mutable_shape_with_output_layout() =
-        program_shape->result();
-    LayoutUtil::SetToDefaultLayout(
-        execution_options.mutable_shape_with_output_layout());
+        result_shape.ToProto();
   }
-
-  for (const std::string& disabled_pass : build_options.disabled_hlo_passes()) {
-    execution_options.mutable_debug_options()->add_xla_disable_hlo_passes(
-        disabled_pass);
-  }
-
   return execution_options;
 }
 
@@ -145,7 +119,7 @@ StatusOr<std::unique_ptr<Executable>> LocalService::CompileExecutable(
     const ExecutableBuildOptions& build_options) {
   const HloModuleProto& proto = computation.proto();
   TF_RET_CHECK(proto.has_host_program_shape());
-  const ProgramShape& program_shape = proto.host_program_shape();
+  ProgramShape program_shape(proto.host_program_shape());
 
   // Validate incoming layouts.
   if (argument_layouts.size() != program_shape.parameters_size()) {
@@ -220,4 +194,10 @@ StatusOr<const ShapedBuffer*> LocalService::GlobalDataToShapedBuffer(
   return buffers[replica_number];
 }
 
+StatusOr<GlobalDataHandle> LocalService::RegisterReplicatedBuffers(
+    std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag) {
+  return allocation_tracker_.RegisterReplicatedBuffers(
+      std::move(replicated_buffers), tag);
+}
+
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/local_service.h b/tensorflow/compiler/xla/service/local_service.h
index 3b4f0b50832d6d2b64528ffb63eb5c7375396aec..f56ba32b04b9bf3aba75654bdb98887ad22e6791 100644
--- a/tensorflow/compiler/xla/service/local_service.h
+++ b/tensorflow/compiler/xla/service/local_service.h
@@ -63,6 +63,11 @@ class LocalService : public Service {
   StatusOr<const ShapedBuffer*> GlobalDataToShapedBuffer(
       const GlobalDataHandle& data, int replica_number);
 
+  // Registers a vector of shaped buffers of device memory, one per replica, and
+  // returns a corresponding handle that can be used for talking to XLA clients.
+  StatusOr<GlobalDataHandle> RegisterReplicatedBuffers(
+      std::vector<ScopedShapedBuffer> replicated_buffers, const string& tag);
+
  private:
   explicit LocalService(const ServiceOptions& options,
                         std::unique_ptr<Backend> backend);
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
index ec52a24d782a44fda961feab3230886072e755c7..972a5b9ced0d84387ef8308efe2a7aff7317d047 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.cc
@@ -113,6 +113,13 @@ Status LogicalBufferAnalysis::HandleGetTupleElement(HloInstruction*) {
   return Status::OK();
 }
 
+Status LogicalBufferAnalysis::HandleAddDependency(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand and does not
+  // create buffers.
+  return Status::OK();
+}
+
 Status LogicalBufferAnalysis::HandleCopy(HloInstruction* copy) {
   // The top-level buffer (index={}) for kCopy is newly created, but all other
   // buffers (in the case of a tuple shape) come from the operand
diff --git a/tensorflow/compiler/xla/service/logical_buffer_analysis.h b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
index 81f524d84a8091e1fff13dc7c55b401143a02753..7ffca943d0f7805ad4420343fcdbf860415c4c40 100644
--- a/tensorflow/compiler/xla/service/logical_buffer_analysis.h
+++ b/tensorflow/compiler/xla/service/logical_buffer_analysis.h
@@ -64,6 +64,7 @@ class LogicalBufferAnalysis : public DfsHloVisitorWithDefault {
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   // A map from the buffer ID to the logical buffer
   std::vector<std::unique_ptr<LogicalBuffer>> logical_buffers_;
diff --git a/tensorflow/compiler/xla/service/pattern_matcher.h b/tensorflow/compiler/xla/service/pattern_matcher.h
index 6152cdc6099a182f1ed98f9501613e0aa123cdbb..c35f72699bfe90f7b8021916c0f81d5e1926ff4c 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher.h
+++ b/tensorflow/compiler/xla/service/pattern_matcher.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 #define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
 
+#include "absl/strings/str_replace.h"
 #include "absl/strings/string_view.h"
 #include "absl/utility/utility.h"
 #include "tensorflow/compiler/xla/layout_util.h"
@@ -44,32 +45,48 @@ namespace xla {
 //
 // This pattern will match Add instructions whose first operand is a constant.
 //
-// Each pattern type has the following modifiers:
+// Each pattern type has the following modifiers, which are described where
+// nontrivial.
 //
 //   Op():
-//     - WithName: match operations with the given name
-//     - WithOpcode: match operations with the given opcode
-//     - WithShape: match operations whose shape matches the given pattern
-//     - WithOperand: match operations whose operand matches the given pattern
+//     - Is: is the given HloInstruction* (i.e. pointer equality)
+//     - WithName
+//     - WithOpcode
+//     - WithoutOpcode: anything other than the given opcode
+//     - WithShape: instr's shape matches the given pattern
+//     - WithShapeEqualTo: instr's shape is equal to the given Shape
+//     - WithShapeCompatibleTo: instr's shape is compatible with the given Shape
+//     - WithNumOperands
+//     - WithOperand: operand at the given index matches the given pattern
+//     - IsConstant
+//     - IsNonConstant
+//     - IsConstantScalar/IsEffectiveConstantScalar: Optionally accepts a value,
+//       e.g. IsConstantScalar() or IsConstantScalar(42).
+//     - WithFusionKind
+//     - WithTupleIndex: get-tuple-element operations with the given tuple index
+//     - WithOneUse: Instruction is used as an operand exactly once.
+//     - WithOneUser: Instruction is used by exactly one other instruction, but
+//       is possibly used more than once as an operand (e.g. multiply(x,x)).
 //
 //   Shape():
-//     - EqualTo: matches shapes that are equal to the argument
-//     - CompatibleTo: matches shapes that are compatible to the argument
-//     - IsScalar/IsArray/IsTuple: matches scalar/array/tuple shapes
-//     - IsDenseArray/IsSparseArray: matches arrays with dense/sparse format
-//     - WithLayout: match shapes whose layout matches the given pattern
-//     - WithLayoutEqualTo: matches shapes whose layouts equal the argument
-//     - WithSubshape: matches tuple shapes whose subshape matches the given
-//       pattern
-//     - WithSubshapeEqualTo: matches shapes with a subshape equal the argument
-//     - WithElementType: matches array/scalar shapes with the given element
-//       type
-//     - WithRank: matches array/scalar types with the given rank
+//     - EqualTo
+//     - CompatibleTo
+//     - IsScalar/IsEffectiveScalar/IsArray/IsTuple
+//     - IsDenseArray/IsSparseArray
+//     - WithLayout: layout shape's layout matches the given pattern (e.g.
+//       Layout().WithDenseFormat())
+//     - WithLayoutEqualTo: shape's layout equals the argument (i.e. another
+//       Layout, but not the result of Layout().foo())
+//     - WithSubshape: shape is a tuple whose subshape matches the given pattern
+//       (e.g. Shape().IsScalar()).
+//     - WithSubshapeEqualTo: shape is a tuple with a subshape equal to the arg
+//       (i.e. another Shape, but not the result of Shape().foo())
+//     - WithElementType: shape is an array/scalar with the given elem type
+//     - WithRank: shape is an array/scalar with the given rank
 //
 //  Layout():
-//     - EqualTo: matches layouts that are equal to the argument
-//     - WithDenseFormat/WithSparseFormat: matches layouts with dense/sparse
-//       format
+//     - EqualTo
+//     - WithDenseFormat/WithSparseFormat
 //
 // Op(), Shape(), and Layout() may be passed an argument of type
 // HloInstruction**, Shape**, or Layout**, respectively, or const versions of
@@ -82,53 +99,55 @@ namespace xla {
 //   CHECK(Match(foo,
 //               match::Op().WithOperand(0, match::Op(&matched_operand))));
 //
-// Helpers are provided for common nullary, unary, binary, and ternary
-// instructions. These helpers can be called with no arguments, in which case
-// they will match any instruction matching the opcode. They may also be called
-// with matches for the operands and with an optional capture. (The capture must
-// be the first argument.) Some examples of these helpers and their equivalents
-// are provided below.
-//
+// Helpers are provided for most HLO instructions. These helpers can be called
+// with no arguments, in which case they will match any instruction matching the
+// opcode. They may also be called with matches for the operands and with an
+// optional capture. (The capture must be the first argument.) Some examples of
+// these helpers and their equivalents are provided below.
+
 // Example nullary instruction:
-//   Param()                        == Op().WithOpcode(HloOpcode::kParam)
-//   Param(&a)                      == Op(&a).WithOpcode(HloOpcode::kParam)
+//   Parameter()                    == Op().WithOpcode(HloOpcode::kParameter)
+//   Parameter(&a)                  == Op(&a).WithOpcode(HloOpcode::kParameter)
 //
 // Example unary instruction:
-//   Abs()                             == Op().WithOpcode(HloOpcode::kAbs)
-//   Abs(Op(&a))                       == Op().WithOpcode(HloOpcode::kAbs)
-//                                            .WithOperand(0, Op(&a)))
-//   Abs(&a, Op(&b))                   == Op(&a).WithOpcode(HloOpcode::kAbs)
-//                                              .WithOperand(0, Op(&b))
+//   Abs()                          == Op().WithOpcode(HloOpcode::kAbs)
+//   Abs(Op(&a))                    == Op().WithOpcode(HloOpcode::kAbs)
+//                                         .WithOperand(0, Op(&a)))
+//   Abs(&a, Op(&b))                == Op(&a).WithOpcode(HloOpcode::kAbs)
+//                                           .WithOperand(0, Op(&b))
+//
+// Commutative binary instructions have a special form that accepts either order
+// of args, e.g.:
+//
+//   AddAnyOrder(Parameter(1), Abs()) ==
+//     Op().WithOpcode(HloOpcode::kAdd)
+//         .WithBinaryOperandsAnyOrder(Op().WithParameterNum(1), Abs());
 //
-// Example binary instruction:
-//   Add()                             == Op().WithOpcode(HloOpcode::kAdd)
-//   Add(Op(&a), Op(&b))               == Op().WithOpcode(HloOpcode::kAdd)
-//                                            .WithOperand(0, Op(&a))
-//                                            .WithOperand(1, Op(&b))
-//   Add(&a, Op(&b), Op(&c))           == Op(&a).WithOpcode(HloOpcode::kAdd)
-//                                              .WithOperand(0, Op(&b))
-//                                              .WithOperand(1, Op(&c))
+//   MultiplyAnyOrder(&a, Parameter(), Abs())  // Captures the mul in `a`.
 //
-// Example ternary instruction:
-//   Clamp()                           == Op().WithOpcode(HloOpcode::kClamp)
-//   Clamp(Op(&a), Op(&b), Op(&c))     == Op().WithOpcode(HloOpcode::kClamp)
-//                                            .WithOperand(0, Op(&a))
-//                                            .WithOperand(1, Op(&b))
-//                                            .WithOperand(2, Op(&c))
-//   Clamp(&a, Op(&b), Op(&c), Op(&d)) == Op(&a).WithOpcode(HloOpcode::kClamp)
-//                                              .WithOperand(0, Op(&b))
-//                                              .WithOperand(1, Op(&c))
-//                                              .WithOperand(2, Op(&d))
+// The following additional helpers are provided.  In all cases, `&a` is
+// optional.
 //
+//   ConstantScalar(&a)               == Op(&a).IsConstantScalar();
+//   ConstantScalar(&a, v)            == Op(&a).IsConstantScalar(v);
+//   ConstantEffectiveScalar(&a)      == Op(&a).IsConstantEffectiveScalar();
+//   ConstantEffectiveScalar(&a, v)   == Op(&a).IsConstantEffectiveScalar(&a, v)
+//   NonConstant(&a)                  == Op(&a).IsNonConstant()
+//   GetTupleElement(&a, b, index)    == Op(&a).WithTupleIndex(index)
+//                                             .WithOperand(0, b);
+//   Parameter(&a, n)                 == Op(&a).WithParameterNum(n);
 
 struct MatchOption {
   // If true, actually capture matched item into the user pointer.
   bool capture;
+
+  // An explanation for why we failed to match is streamed here, if not-null.
+  std::ostream* explain_os;
 };
 
 template <typename Value, typename Pattern>
 bool Match(Value* value, const Pattern& pattern,
-           MatchOption option = {/*.capture=*/true}) {
+           MatchOption option = {/*.capture=*/true, /*.explain_os=*/nullptr}) {
   if (option.capture) {
     auto new_option = option;
     new_option.capture = false;
@@ -143,6 +162,77 @@ namespace match {
 
 namespace detail {
 
+// Macro for streaming to option.explain_os if it's not null.
+//
+//   EXPLAIN << "value of foo(): " << foo()
+//
+#pragma push_macro("EXPLAIN")
+#define EXPLAIN \
+  if (option.explain_os) *option.explain_os
+
+// kIndentInc is the additional number of spaces that we indent by when we
+// increase the indent "by one".
+enum {
+  kIndentInc = 2,
+};
+
+// Writes a newline and then `indent` spaces.
+//
+// We follow an unintuitive convention in this file's pretty-printers: Indents
+// are performed by the caller, not the callee.  For example, if you want to
+// print
+//
+//   foo:
+//    - bar
+//
+// you'd do:
+//
+//  Foo::DescribeTo(std::ostream* os, int64 indent) {
+//    *os << "foo:";
+//    Indent(os, indent)  // Create a newline at the *current* indent level.
+//    *os << " - ";
+//    bar.DescribeTo(os, indent + 3);  // + 3 because strlen(" * ") == 3.
+//  }
+//
+//  Bar::DescribeTo(std::ostream* os, int64 indent) { *os << "bar"; }
+//
+// Notice that Bar::DescribeTo() does not call Indent; the indenting is
+// performed by Foo.  This convention allows the caller to decide whether a
+// matcher is preceded by a newline, which is important e.g. for the AllOf
+// matcher.
+//
+// (Incidentally, indenting in Match's explanations is handled differently.
+// Indents are a common case in DescribeTo [we're printing a whole tree], but
+// they're a special case in Match [we're printing only a path through the tree
+// that encounters a failing node]. Indents in Match only appear when we
+// encounter a failing disjunction, so we just handle them as a special case
+// there.)
+inline void Indent(std::ostream* os, int64 indent) {
+  *os << "\n";
+  for (int64 i = 0; i < indent; ++i) {
+    *os << " ";
+  }
+}
+
+// SFINAE template that determines whether T declares a static member
+// kIsTrivialMatcher.
+//
+// Trivial matchers get special treatment.  For example, when printing
+// a conjunction of matchers, we don't print "and" after a trivial matcher. This
+// yields e.g.
+//    "a shape compatible with f32[1,2]"
+// rather than
+//    "a shape AND compatible with f32[1,2]"
+template <typename T, typename Dummy = void>
+struct IsTrivialMatcher {
+  static constexpr bool value = false;
+};
+template <typename T>
+struct IsTrivialMatcher<T,
+                        typename std::enable_if<T::kIsTrivialMatcher>::type> {
+  static constexpr bool value = true;
+};
+
 template <typename Item, typename... Patterns>
 class AllOfPattern {
  public:
@@ -162,10 +252,19 @@ class AllOfPattern {
     return matched;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    DescribeToImpl(os, std::integral_constant<size_t, 0>(), indent);
+  }
+
+  // Accessor for patterns_.  Please don't use this outside of this file.
+  const std::tuple<Patterns...>& patterns() const { return patterns_; }
+
  private:
   template <typename ItemType, size_t index>
   bool MatchImpl(ItemType* item, MatchOption option,
                  std::integral_constant<size_t, index>) const {
+    // We don't need to do any EXPLAINing here; it's all correctly handled by
+    // our sub-matchers (if any fail).
     return std::get<index>(patterns_).Match(item, option) &&
            MatchImpl(item, option, std::integral_constant<size_t, index + 1>());
   }
@@ -176,6 +275,73 @@ class AllOfPattern {
     return true;
   }
 
+  // Pretty-printing a conjunction has some special cases to make it easy to
+  // read in the simple (common) case.
+  //
+  // If sizeof...(Patterns) == 1, prints as e.g.
+  //
+  //   a shape
+  //
+  // If sizeof...(Patterns) == 2 and patterns_[0] is a trivial matcher (e.g. "a
+  // shape") prints as
+  //
+  //   a shape compatible with f32[1,2]
+  //
+  // If sizeof...(Patterns) > 2 and patterns_[0] is a trivial matcher, prints as
+  //
+  //   a shape:
+  //    * compatible with f32[1,2] AND
+  //    * that represents a scalar
+  //
+  // Otherwise prints as:
+  //
+  //   all of:
+  //    * foo AND
+  //    * bar
+  //
+  template <size_t index>
+  void DescribeToImpl(std::ostream* os, std::integral_constant<size_t, index>,
+                      int64 indent) const {
+    constexpr bool first_is_trivial =
+        IsTrivialMatcher<typename std::remove_reference<decltype(
+            std::get<0>(patterns_))>::type>::value;
+    constexpr bool is_last = index == sizeof...(Patterns) - 1;
+    const auto& submatcher = std::get<index>(patterns_);
+
+    auto print_bulleted_item = [&] {
+      *os << " * ";
+      submatcher.DescribeTo(os, indent + 3);
+      if (!is_last) {
+        *os << " AND";
+        Indent(os, indent);
+      }
+    };
+
+    if (index == 0) {
+      if (first_is_trivial || is_last) {
+        submatcher.DescribeTo(os, indent + kIndentInc);
+        if (sizeof...(Patterns) > 2) {
+          *os << ":";
+          Indent(os, indent);
+        }
+      } else {
+        *os << "all of:";
+        Indent(os, indent);
+        print_bulleted_item();
+      }
+    } else if (first_is_trivial && index == 1 && sizeof...(Patterns) == 2) {
+      *os << " ";
+      submatcher.DescribeTo(os, indent);
+    } else {
+      print_bulleted_item();
+    }
+    DescribeToImpl(os, std::integral_constant<size_t, index + 1>(), indent);
+  }
+
+  void DescribeToImpl(std::ostream* os,
+                      std::integral_constant<size_t, sizeof...(Patterns)>,
+                      int64 indent) const {}
+
   std::tuple<Patterns...> patterns_;
 };
 
@@ -183,10 +349,6 @@ class AllOfPattern {
 
 // Returns a pattern that represents the conjunction of all input patterns. All
 // patterns need to match in order to have the AllOf pattern match.
-//
-// TODO(timshen): Currently AllOf is still nested, e.g. AllOf<AllOf<A>, B> is
-// not AllOf<A, B>. We might want to flatten the AllOf type structure if the
-// C++ compile error message gets annoying.
 template <typename Item, typename... Patterns>
 detail::AllOfPattern<typename std::remove_const<Item>::type, Patterns...> AllOf(
     const Patterns&... patterns) {
@@ -194,6 +356,25 @@ detail::AllOfPattern<typename std::remove_const<Item>::type, Patterns...> AllOf(
                               Patterns...>(patterns...);
 }
 
+// AllOf<AllOf<A, B...>, X, Y, ...> => AllOf<A, B, ..., X, Y, ...>.
+//
+// This transformation is necessary for good pretty-printing.
+template <typename Item, typename... InnerPs, typename... OuterPs>
+detail::AllOfPattern<typename std::remove_const<Item>::type, InnerPs...,
+                     OuterPs...>
+AllOf(const detail::AllOfPattern<Item, InnerPs...>& inner_p,
+      const OuterPs&... outer_ps) {
+  // Invoke constructor of AllOfPattern<Item, InnerPs..., OuterPs...>.
+  auto make_all_of = [](const InnerPs&... inner_ps,
+                        const OuterPs&... outer_ps) {
+    return detail::AllOfPattern<typename std::remove_const<Item>::type,
+                                InnerPs..., OuterPs...>(inner_ps...,
+                                                        outer_ps...);
+  };
+  return absl::apply(make_all_of, std::tuple_cat(inner_p.patterns(),
+                                                 std::make_tuple(outer_ps...)));
+}
+
 namespace detail {
 
 template <typename LayoutType, typename Impl>
@@ -204,8 +385,18 @@ class LayoutPattern;
 class LayoutPatternBaseImpl {
  public:
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return layout != nullptr;
+    if (layout == nullptr) {
+      EXPLAIN << "Layout is null";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "a layout";
   }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // A LayoutPattern implementation that matches only if the layout equals a
@@ -216,7 +407,17 @@ class LayoutPatternEqualImpl {
       : layout_(layout) {}
 
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return LayoutUtil::Equal(*layout_, *layout);
+    if (!LayoutUtil::Equal(*layout_, *layout)) {
+      EXPLAIN << "Layout " << LayoutUtil::HumanString(*layout)
+              << " is not equal to expected "
+              << LayoutUtil::HumanString(*layout_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "equal to " << LayoutUtil::HumanString(*layout_);
   }
 
  private:
@@ -230,7 +431,16 @@ class LayoutPatternFormatImpl {
   explicit constexpr LayoutPatternFormatImpl(Format format) : format_(format) {}
 
   bool Match(const ::xla::Layout* layout, MatchOption option) const {
-    return layout->format() == format_;
+    if (layout->format() != format_) {
+      EXPLAIN << "Layout has format " << Format_Name(layout->format())
+              << " but expected " << Format_Name(format_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with format " << Format_Name(format_);
   }
 
  private:
@@ -242,11 +452,13 @@ template <typename LayoutType, typename Impl>
 class LayoutPattern {
  private:
   template <typename NewImpl>
-  LayoutPattern<LayoutType, AllOfPattern<::xla::Layout, Impl, NewImpl>>
-  AppendImpl(NewImpl new_impl) const {
-    return LayoutPattern<LayoutType,
-                         AllOfPattern<::xla::Layout, Impl, NewImpl>>(
-        AllOf<Layout>(impl_, std::move(new_impl)), matched_layout_);
+  auto AppendImpl(NewImpl new_impl) const
+      -> LayoutPattern<LayoutType,
+                       decltype(AllOf<Layout>(std::declval<Impl>(),
+                                              std::move(new_impl)))> {
+    auto new_allof = AllOf<Layout>(impl_, std::move(new_impl));
+    return LayoutPattern<LayoutType, decltype(new_allof)>(std::move(new_allof),
+                                                          matched_layout_);
   }
 
  public:
@@ -276,6 +488,10 @@ class LayoutPattern {
     return false;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    impl_.DescribeTo(os, indent);
+  }
+
   // Modifies the pattern to match only if the layout equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr auto EqualTo(const ::xla::Layout* layout) const
@@ -306,19 +522,48 @@ class AnyOfPattern {
   explicit AnyOfPattern(const Patterns&... patterns) : patterns_(patterns...) {}
 
   bool Match(const Item* item, MatchOption option) const {
-    return MatchImpl(item, option, std::integral_constant<size_t, 0>());
+    return MatchImpl(item, option);
   }
 
   bool Match(Item* item, MatchOption option) const {
-    return MatchImpl(item, option, std::integral_constant<size_t, 0>());
+    return MatchImpl(item, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "any of:";
+    Indent(os, indent);
+    DescribeToImpl(os, std::integral_constant<size_t, 0>(), indent);
   }
 
  private:
+  template <typename ItemType>
+  bool MatchImpl(ItemType* item, MatchOption option) const {
+    // If we're generating an explanation, buffer it until we know we failed.
+    absl::optional<std::stringstream> explanation;
+    MatchOption new_option = option;
+    if (option.explain_os) {
+      new_option.explain_os = &explanation.emplace();
+    }
+    bool rv = MatchRecursiveImpl(item, new_option,
+                                 std::integral_constant<size_t, 0>());
+    if (!rv && option.explain_os) {
+      EXPLAIN << "None of the following matchers succeeded:";
+      EXPLAIN << explanation->str();
+    }
+    return rv;
+  }
+
   template <typename ItemType, size_t index>
-  bool MatchImpl(ItemType* item, MatchOption option,
-                 std::integral_constant<size_t, index>) const {
+  bool MatchRecursiveImpl(ItemType* item, MatchOption option,
+                          std::integral_constant<size_t, index>) const {
     auto new_option = option;
     new_option.capture = false;
+
+    absl::optional<std::stringstream> explanation;
+    if (option.explain_os) {
+      new_option.explain_os = &explanation.emplace();
+    }
+
     // Try to match the sub-pattern without capturing behavior.
     if (std::get<index>(patterns_).Match(item, new_option)) {
       // Capture the branch.
@@ -337,20 +582,46 @@ class AnyOfPattern {
         // AnyOf will be a runtime number indicate which sub-pattern is matched.
         // Then we run another pass to do captures only with the help of the
         // trace.
-        bool ret = std::get<index>(patterns_).Match(item, option);
-        DCHECK(ret);
+        bool matched = std::get<index>(patterns_).Match(item, option);
+        DCHECK(matched);
       }
       return true;
     }
-    return MatchImpl(item, option, std::integral_constant<size_t, index + 1>());
+    if (option.explain_os) {
+      EXPLAIN << "\nMatcher #" << index + 1;
+      EXPLAIN << "\n - ";
+      std::get<index>(patterns_).DescribeTo(option.explain_os, /*indent=*/3);
+      EXPLAIN << "\nfailed with";
+      EXPLAIN << "\n - ";
+      EXPLAIN << absl::StrReplaceAll(explanation->str(), {{"\n", "\n   "}});
+    }
+    return MatchRecursiveImpl(item, option,
+                              std::integral_constant<size_t, index + 1>());
   }
 
   template <typename ItemType>
-  bool MatchImpl(ItemType* item, MatchOption option,
-                 std::integral_constant<size_t, sizeof...(Patterns)>) const {
+  bool MatchRecursiveImpl(
+      ItemType* item, MatchOption option,
+      std::integral_constant<size_t, sizeof...(Patterns)>) const {
     return false;
   }
 
+  template <size_t index>
+  void DescribeToImpl(std::ostream* os, std::integral_constant<size_t, index>,
+                      int64 indent) const {
+    *os << " - ";
+    std::get<index>(patterns_).DescribeTo(os, indent + 3);
+    if (index != sizeof...(Patterns) - 1) {
+      *os << " OR";
+      Indent(os, indent);
+    }
+    DescribeToImpl(os, std::integral_constant<size_t, index + 1>(), indent);
+  }
+
+  void DescribeToImpl(std::ostream* os,
+                      std::integral_constant<size_t, sizeof...(Patterns)>,
+                      int64 indent) const {}
+
   std::tuple<Patterns...> patterns_;
 };
 
@@ -395,8 +666,17 @@ class ShapePattern;
 class ShapePatternBaseImpl {
  public:
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
+    if (shape == nullptr) {
+      EXPLAIN << "Shape is null";
+    }
     return shape != nullptr;
   }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "a shape";
+  }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // A ShapePattern implementation that matches only if the shape equals a Shape
@@ -407,7 +687,16 @@ class ShapePatternEqualImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Equal(*shape_, *shape);
+    if (!ShapeUtil::Equal(*shape_, *shape)) {
+      EXPLAIN << "Shape not equal to "
+              << ShapeUtil::HumanStringWithLayout(*shape_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "equal to " << ShapeUtil::HumanStringWithLayout(*shape_);
   }
 
  private:
@@ -422,7 +711,16 @@ class ShapePatternCompatibleImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Compatible(*shape_, *shape);
+    if (!ShapeUtil::Compatible(*shape_, *shape)) {
+      EXPLAIN << "Shape not compatible with "
+              << ShapeUtil::HumanString(*shape_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "compatible with " << ShapeUtil::HumanString(*shape_);
   }
 
  private:
@@ -437,7 +735,16 @@ class ShapePatternElementTypeImpl {
       : element_type_(element_type) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return shape->element_type() == element_type_;
+    if (shape->element_type() != element_type_) {
+      EXPLAIN << "Shape does not have element type "
+              << PrimitiveType_Name(element_type_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with element type " << PrimitiveType_Name(element_type_);
   }
 
  private:
@@ -450,7 +757,15 @@ class ShapePatternIsScalarImpl {
   explicit constexpr ShapePatternIsScalarImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsScalar(*shape);
+    if (!ShapeUtil::IsScalar(*shape)) {
+      EXPLAIN << "Shape is not a scalar";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents a scalar";
   }
 };
 
@@ -460,7 +775,15 @@ class ShapePatternIsArrayImpl {
   explicit constexpr ShapePatternIsArrayImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsArray(*shape);
+    if (!ShapeUtil::IsArray(*shape)) {
+      EXPLAIN << "Shape is not an array";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents an array";
   }
 };
 
@@ -470,7 +793,34 @@ class ShapePatternIsTupleImpl {
   explicit constexpr ShapePatternIsTupleImpl() {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IsTuple(*shape);
+    if (!ShapeUtil::IsTuple(*shape)) {
+      EXPLAIN << "Shape is not a tuple";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that represents a tuple";
+  }
+};
+
+// A ShapePattern implementation that matches only if the shape is an effective
+// scalar.
+class ShapePatternEffectiveScalarImpl {
+ public:
+  explicit constexpr ShapePatternEffectiveScalarImpl() {}
+
+  bool Match(const ::xla::Shape* shape, MatchOption option) const {
+    if (!ShapeUtil::IsEffectiveScalar(*shape)) {
+      EXPLAIN << "Shape is not an effective scalar";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "that is an effective scalar";
   }
 };
 
@@ -481,7 +831,23 @@ class ShapePatternRankImpl {
   explicit constexpr ShapePatternRankImpl(int64 rank) : rank_(rank) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::Rank(*shape) == rank_;
+    if (ShapeUtil::Rank(*shape) != rank_) {
+      if (rank_ == 0) {
+        EXPLAIN << "Shape is not a scalar";
+      } else {
+        EXPLAIN << "Shape does not have rank " << rank_;
+      }
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    if (rank_ == 0) {
+      *os << "that is a scalar";
+    } else {
+      *os << "that has " << rank_ << " dimension" << (rank_ != 1 ? "s" : "");
+    }
   }
 
  private:
@@ -503,8 +869,21 @@ class ShapePatternLayoutImpl {
   }
 
   bool Match(Shape* shape, MatchOption option) const {
-    return LayoutUtil::HasLayout(*shape) &&
-           layout_.Match(shape->mutable_layout(), option);
+    if (!LayoutUtil::HasLayout(*shape)) {
+      EXPLAIN << "Shape does not have a layout";
+      return false;
+    }
+    if (!layout_.Match(shape->mutable_layout(), option)) {
+      EXPLAIN << "\nin layout";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with";
+    Indent(os, indent + kIndentInc);
+    layout_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
@@ -522,17 +901,40 @@ class ShapePatternSubshapeImpl {
       : index_(index), subshape_(subshape) {}
 
   bool Match(const ::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IndexIsValid(*shape, index_) &&
-           subshape_.Match(&ShapeUtil::GetSubshape(*shape, index_), option);
+    return MatchImpl(shape, option);
   }
 
   bool Match(::xla::Shape* shape, MatchOption option) const {
-    return ShapeUtil::IndexIsValid(*shape, index_) &&
-           subshape_.Match(ShapeUtil::GetMutableSubshape(shape, index_),
-                           option);
+    return MatchImpl(shape, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with subshape at index " << index_.ToString() << " which is";
+    Indent(os, indent + kIndentInc);
+    subshape_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
+  Shape* GetSubshape(Shape* shape) const {
+    return ShapeUtil::GetMutableSubshape(shape, index_);
+  }
+  const Shape* GetSubshape(const Shape* shape) const {
+    return &ShapeUtil::GetSubshape(*shape, index_);
+  }
+
+  template <typename ShapeType>
+  bool MatchImpl(ShapeType* shape, MatchOption option) const {
+    if (!ShapeUtil::IndexIsValid(*shape, index_)) {
+      EXPLAIN << "No subshape at " << index_.ToString();
+      return false;
+    }
+    if (!subshape_.Match(GetSubshape(shape), option)) {
+      EXPLAIN << "\nin subshape at " << index_.ToString();
+      return false;
+    }
+    return true;
+  }
+
   ShapeIndexView index_;
   ShapePattern<SubshapeType, SubshapeImpl> subshape_;
 };
@@ -542,10 +944,12 @@ template <typename ShapeType, typename Impl>
 class ShapePattern {
  private:
   template <typename NewImpl>
-  ShapePattern<ShapeType, AllOfPattern<::xla::Shape, Impl, NewImpl>> AppendImpl(
-      NewImpl new_impl) const {
-    return ShapePattern<ShapeType, AllOfPattern<::xla::Shape, Impl, NewImpl>>(
-        AllOf<Shape>(impl_, std::move(new_impl)), matched_shape_);
+  auto AppendImpl(NewImpl new_impl) const
+      -> ShapePattern<ShapeType, decltype(AllOf<Shape>(std::declval<Impl>(),
+                                                       std::move(new_impl)))> {
+    auto new_all_of = AllOf<Shape>(impl_, std::move(new_impl));
+    return ShapePattern<ShapeType, decltype(new_all_of)>(std::move(new_all_of),
+                                                         matched_shape_);
   }
 
  public:
@@ -560,6 +964,11 @@ class ShapePattern {
       }
       return true;
     }
+    if (shape) {
+      EXPLAIN << "\nin "
+              << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape)
+                                      : ShapeUtil::HumanString(*shape));
+    }
     return false;
   }
 
@@ -571,9 +980,16 @@ class ShapePattern {
       }
       return true;
     }
+    EXPLAIN << "\nin "
+            << (shape->has_layout() ? ShapeUtil::HumanStringWithLayout(*shape)
+                                    : ShapeUtil::HumanString(*shape));
     return false;
   }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    return impl_.DescribeTo(os, indent);
+  }
+
   // Modifies the pattern to match only if the shape equals the given proto.
   // The layout must outlive the returned pattern.
   constexpr auto EqualTo(const ::xla::Shape* shape) const
@@ -612,6 +1028,11 @@ class ShapePattern {
     return AppendImpl(ShapePatternIsTupleImpl());
   }
 
+  constexpr auto IsEffectiveScalar() const
+      -> decltype(this->AppendImpl(ShapePatternEffectiveScalarImpl())) {
+    return AppendImpl(ShapePatternEffectiveScalarImpl());
+  }
+
   // Modifies the pattern to match only if the shape has the given rank.
   constexpr auto WithRank(int64 rank) const
       -> decltype(this->AppendImpl(ShapePatternRankImpl(rank))) {
@@ -706,6 +1127,22 @@ Shape(::xla::Shape** matched_shape) {
 
 namespace detail {
 
+// Overloads to get a const or non-const operand out of an instruction.
+inline HloInstruction* HloOperand(HloInstruction* instr, int64 idx) {
+  return instr->mutable_operand(idx);
+}
+inline const HloInstruction* HloOperand(const HloInstruction* instr,
+                                        int64 idx) {
+  return instr->operand(idx);
+}
+
+// Pretty-printer for HloInstruction.  Sort of like ToShortString, but with
+// fewer %s and more shapes.
+inline string InstToString(const HloInstruction* inst) {
+  return inst->ToString(
+      HloPrintOptions().set_print_metadata(false).set_print_percent(false));
+}
+
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern;
 
@@ -714,8 +1151,18 @@ class HloInstructionPattern;
 class HloInstructionPatternBaseImpl {
  public:
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst != nullptr;
+    if (inst == nullptr) {
+      EXPLAIN << "HloInstruction* is null";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "an HloInstruction";
   }
+
+  static constexpr bool kIsTrivialMatcher = true;
 };
 
 // An HloInstructionPattern implementation that matches only if the instruction
@@ -726,13 +1173,44 @@ class HloInstructionPatternNameImpl {
       : name_(name) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->name() == name_;
+    if (inst->name() != name_) {
+      EXPLAIN << "HloInstruction not named \"" << name_ << "\"";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "named \"" << name_ << "\"";
   }
 
  private:
   absl::string_view name_;
 };
 
+// An HloInstructionPattern implementation that matches only if the instruction
+// equals a particular pointer.
+class HloInstructionIsImpl {
+ public:
+  explicit HloInstructionIsImpl(const HloInstruction* inst) : inst_(inst) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    if (inst != inst_) {
+      EXPLAIN << "HloInstruction " << inst << " is not " << inst_ << " ("
+              << InstToString(inst_) << ")";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is " << inst_ << " (" << InstToString(inst_) << ")";
+  }
+
+ private:
+  const HloInstruction* inst_;
+};
+
 // An HloInstructionPattern implementation that matches only if the instruction
 // has a given opcode.
 class HloInstructionPatternOpcodeImpl {
@@ -742,7 +1220,25 @@ class HloInstructionPatternOpcodeImpl {
       : opcode_(opcode), invert_(invert) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return (invert_ ^ (inst->opcode() == opcode_));
+    if (invert_ && inst->opcode() == opcode_) {
+      EXPLAIN << "HloInstruction has opcode " << HloOpcodeString(opcode_)
+              << ", expected anything else";
+      return false;
+    }
+    if (!invert_ && inst->opcode() != opcode_) {
+      EXPLAIN << "HloInstruction doesn't have opcode "
+              << HloOpcodeString(opcode_);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    if (!invert_) {
+      *os << "with opcode " << HloOpcodeString(opcode_);
+    } else {
+      *os << "with any opcode other than " << HloOpcodeString(opcode_);
+    }
   }
 
  private:
@@ -757,8 +1253,17 @@ class HloInstructionPatternNumOperandsImpl {
   explicit constexpr HloInstructionPatternNumOperandsImpl(int64 num_operands)
       : num_operands_(num_operands) {}
 
-  bool Match(const ::xla::HloInstruction* inst, MatchOption /*option*/) const {
-    return inst->operand_count() == num_operands_;
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    if (inst->operand_count() != num_operands_) {
+      EXPLAIN << "HloInstruction doesn't have " << num_operands_ << " operands";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with " << num_operands_ << " operand"
+        << (num_operands_ != 1 ? "s" : "");
   }
 
  private:
@@ -775,11 +1280,25 @@ class HloInstructionPatternShapeImpl {
       : shape_(shape) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return shape_.Match(&inst->shape(), option);
+    if (!shape_.Match(&inst->shape(), option)) {
+      EXPLAIN << "\nin output shape";
+      return false;
+    }
+    return true;
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return shape_.Match(inst->mutable_shape(), option);
+    if (!shape_.Match(inst->mutable_shape(), option)) {
+      EXPLAIN << "\nin output shape";
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "outputting";
+    Indent(os, indent + kIndentInc);
+    shape_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
@@ -797,20 +1316,197 @@ class HloInstructionPatternOperandImpl {
       : operand_index_(operand_index), operand_(operand) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return operand_index_ < inst->operand_count() &&
-           operand_.Match(inst->operand(operand_index_), option);
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return operand_index_ < inst->operand_count() &&
-           operand_.Match(inst->mutable_operand(operand_index_), option);
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with operand " << operand_index_ << " which is:";
+    Indent(os, indent + kIndentInc);
+    operand_.DescribeTo(os, indent + kIndentInc);
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (operand_index_ >= inst->operand_count()) {
+      EXPLAIN << "desired operand index " << operand_index_
+              << " is out of bounds";
+      return false;
+    }
+    if (!operand_.Match(HloOperand(inst, operand_index_), option)) {
+      EXPLAIN << "\nin operand " << operand_index_;
+      return false;
+    }
+    return true;
+  }
+
   int64 operand_index_;
   HloInstructionPattern<OperandType, OperandImpl> operand_;
 };
 
+// Matches a binary instruction whose operands come in any order.
+template <typename OperandType1, typename OperandImpl1, typename OperandType2,
+          typename OperandImpl2>
+class HloInstructionPatternBinaryOperandsAnyOrderImpl {
+ public:
+  explicit constexpr HloInstructionPatternBinaryOperandsAnyOrderImpl(
+      const HloInstructionPattern<OperandType1, OperandImpl1>& op1,
+      const HloInstructionPattern<OperandType2, OperandImpl2>& op2)
+      : op1_(op1), op2_(op2) {}
+
+  bool Match(HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with two operands in either order:";
+    Indent(os, indent);
+    *os << " - ";
+    op1_.DescribeTo(os, indent + 3);
+    Indent(os, indent);
+    *os << " - ";
+    op2_.DescribeTo(os, indent + 3);
+  }
+
+ private:
+  HloInstruction* operand(HloInstruction* inst, int64 idx) const {
+    return inst->mutable_operand(idx);
+  }
+  const HloInstruction* operand(const HloInstruction* inst, int64 idx) const {
+    return inst->operand(idx);
+  }
+
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    // We could implement this using AnyOf and AllOf matchers, but the templates
+    // get pretty difficult to debug, since any compile error herein becomes
+    // not-an-error via SFINAE.  Also this way lets us give better messages on
+    // failure.
+    if (inst->operand_count() != 2) {
+      EXPLAIN << "HloInstruction did not have two operands";
+      return false;
+    }
+
+    // If we're not generating explanations, this is pretty simple.
+    if (!option.explain_os) {
+      auto try_match = [&](int64 idx1, int64 idx2) {
+        MatchOption new_option = option;
+        new_option.capture = false;
+        if (op1_.Match(operand(inst, idx1), new_option) &&
+            op2_.Match(operand(inst, idx2), new_option)) {
+          if (option.capture) {
+            bool matched = op1_.Match(operand(inst, idx1), option) &&
+                           op2_.Match(operand(inst, idx2), option);
+            DCHECK(matched);
+          }
+          return true;
+        }
+        return false;
+      };
+      return try_match(0, 1) || try_match(1, 0);
+    }
+
+    // If we are generating explanations, we have some work to do in order to
+    // generate a helpful error.
+    //
+    // First, try all four operand/matcher combinations, recording the
+    // failure explanations separately from option.explain_os. matches[i][j]
+    // tells us if matcher_i matches operand j.
+    bool matches[/*matcher*/ 2][/*operand*/ 2];
+    std::stringstream explanations[/*matcher*/ 2][/*operand*/ 2];
+    for (int i = 0; i < 2; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        MatchOption new_option = option;
+        new_option.capture = false;
+        new_option.explain_os = &explanations[i][j];
+        matches[i][j] = i == 0 ? op1_.Match(operand(inst, j), new_option)
+                               : op2_.Match(operand(inst, j), new_option);
+      }
+    }
+
+    // Check if the match succeeded.
+    for (int i = 0; i < 2; ++i) {
+      if (matches[0][i] && matches[1][(i + 1) % 2]) {
+        // Rerun the matches with capture enabled if necessary.
+        if (option.capture) {
+          auto* operand1 = operand(inst, i);
+          auto* operand2 = operand(inst, (i + 1) % 2);
+          bool matched =
+              op1_.Match(operand1, option) && op2_.Match(operand2, option);
+          DCHECK(matched);
+        }
+        return true;
+      }
+    }
+
+    auto describe_matcher = [&](int matcher_idx) {
+      EXPLAIN << "\n - ";
+      if (matcher_idx == 0) {
+        op1_.DescribeTo(option.explain_os, /*indent=*/3);
+      } else {
+        CHECK_EQ(matcher_idx, 1);
+        op2_.DescribeTo(option.explain_os, /*indent=*/3);
+      }
+      for (int i = 0; i < 2; ++i) {
+        if (matches[matcher_idx][/*operand*/ i]) {
+          continue;
+        }
+        EXPLAIN << "\ndoes not match " << (i == 0 ? "LHS" : "RHS") << ":\n";
+        EXPLAIN << " - ";
+        EXPLAIN << absl::StrReplaceAll(
+            explanations[matcher_idx][/*operand*/ i].str(), {{"\n", "\n   "}});
+      }
+    };
+
+    // If we failed to match, one of the following is true:
+    //  1. op1 (op2) matches neither LHS nor RHS, or
+    //  2. op1 and op2 both match LHS (RHS), but neither matches RHS (LHS).
+    // We print different explanations depending on which case we're in.
+
+    // Case 1.
+    bool wrote_explanation = false;
+    for (int i = 0; !wrote_explanation && i < 2; ++i) {
+      if (!matches[i][0] && !matches[i][1]) {
+        EXPLAIN << "HloInstruction's operands (ignoring order) did not match "
+                << (i == 0 ? "first" : "second") << " matcher.  Specifically,";
+        describe_matcher(i);
+        wrote_explanation = true;
+      }
+    }
+
+    // Case 2.
+    for (int i = 0; !wrote_explanation && i < 2; ++i) {
+      if (matches[/*matcher*/ 0][/*operand*/ i] &&
+          matches[/*matcher*/ 1][/*operand*/ i]) {
+        CHECK(!matches[0][(i + 1) % 2]);
+        CHECK(!matches[1][(i + 1) % 2]);
+        CHECK(!wrote_explanation);
+        EXPLAIN << "HloInstruction's " << (i == 1 ? "LHS" : "RHS")
+                << " operand did not match either of the two matchers.  "
+                   "Specifically,";
+        describe_matcher(0);
+        EXPLAIN << "\nand";
+        describe_matcher(1);
+        wrote_explanation = true;
+      }
+    }
+
+    CHECK(wrote_explanation);
+    return false;
+  }
+
+  HloInstructionPattern<OperandType1, OperandImpl1> op1_;
+  HloInstructionPattern<OperandType2, OperandImpl2> op2_;
+};
+
 // An HloInstructionPattern implementation that matches only if the instruction
 // is a fusion node with a particular kind.
 class HloInstructionPatternFusionKindImpl {
@@ -820,14 +1516,32 @@ class HloInstructionPatternFusionKindImpl {
       : kind_(kind) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_;
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kFusion && inst->fusion_kind() == kind_;
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "with fusion kind " << ToString(kind_);
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kFusion) {
+      EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_)
+              << "; it's not a fusion";
+      return false;
+    }
+    if (inst->fusion_kind() != kind_) {
+      EXPLAIN << "HloInstruction does not have fusion kind " << ToString(kind_);
+      return false;
+    }
+    return true;
+  }
+
   ::xla::HloInstruction::FusionKind kind_;
 };
 
@@ -839,47 +1553,211 @@ class HloInstructionPatternTupleIndexImpl {
       : tuple_index_(tuple_index) {}
 
   bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kGetTupleElement &&
-           inst->tuple_index() == tuple_index_;
+    return MatchImpl(inst, option);
   }
 
   bool Match(::xla::HloInstruction* inst, MatchOption option) const {
-    return inst->opcode() == HloOpcode::kGetTupleElement &&
-           inst->tuple_index() == tuple_index_;
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is a GTE with index " << tuple_index_;
   }
 
  private:
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kGetTupleElement) {
+      EXPLAIN << "HloInstruction is not a GTE with index " << tuple_index_
+              << "; it's not a GTE at all";
+      return false;
+    }
+    if (inst->tuple_index() != tuple_index_) {
+      EXPLAIN << "HloInstruction is not a GTE with index " << tuple_index_;
+      return false;
+    }
+    return true;
+  }
+
   int64 tuple_index_;
 };
 
-template <typename ItemType, typename Predicate>
-class HloPredicatePatternImpl {
+class HloInstructionPatternParameterNumImpl {
  public:
-  explicit HloPredicatePatternImpl(Predicate pred) : pred_(std::move(pred)) {}
+  explicit constexpr HloInstructionPatternParameterNumImpl(int64 parameter_num)
+      : parameter_num_(parameter_num) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
 
-  bool Match(const ItemType* item, MatchOption option) const {
-    return pred_(item);
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
   }
 
-  bool Match(ItemType* item, MatchOption option) const { return pred_(item); }
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is parameter " << parameter_num_;
+  }
 
  private:
-  Predicate pred_;
+  template <typename HloInstructionType>
+  bool MatchImpl(HloInstructionType* inst, MatchOption option) const {
+    if (inst->opcode() != HloOpcode::kParameter ||
+        inst->parameter_number() != parameter_num_) {
+      EXPLAIN << "HloInstruction is not parameter " << parameter_num_;
+      return false;
+    }
+    return true;
+  }
+
+  int64 parameter_num_;
 };
 
-struct PatternFriend;
+// Superclass that contains common code used by Op::WithOneUse() and
+// Op::WithOneUser().
+class HloInstructionPatternOneUseOrUserImpl {
+ protected:
+  bool MatchOneUser(const HloInstruction* inst, MatchOption option) const {
+    if (inst->user_count() != 1) {
+      EXPLAIN << "HloInstruction has " << inst->user_count()
+              << " users, but expected exactly one.";
+      if (inst->user_count() > 1) {
+        EXPLAIN << "\nAll users:";
+        for (const HloInstruction* user : inst->users()) {
+          EXPLAIN << "\n - " << InstToString(user);
+        }
+      }
+      return false;
+    }
+    return true;
+  }
+};
+
+class HloInstructionPatternOneUseImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    if (!MatchOneUser(inst, option)) {
+      return false;
+    }
+
+    int64 use_count = absl::c_count_if(
+        inst->users()[0]->operands(),
+        [&](const HloInstruction* operand) { return operand == inst; });
+    if (use_count != 1) {
+      EXPLAIN << "HloInstruction is used " << use_count
+              << " times by its user, but is expected to be used just once: "
+              << InstToString(inst->users()[0]);
+      return false;
+    }
+    return true;
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one use";
+  }
+};
+
+class HloInstructionPatternOneUserImpl
+    : public HloInstructionPatternOneUseOrUserImpl {
+ public:
+  bool Match(const HloInstruction* inst, MatchOption option) const {
+    return MatchOneUser(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which has exactly one user (but possibly is used multiple times by "
+           "that instruction)";
+  }
+};
+
+// Matches a constant scalar or effective scalar, optionally with a given value.
+template <typename ScalarTy>
+class HloConstantScalarImpl {
+ public:
+  explicit constexpr HloConstantScalarImpl(bool match_effective_scalar)
+      : val_(absl::nullopt), match_effective_scalar_(match_effective_scalar) {}
+
+  constexpr HloConstantScalarImpl(ScalarTy val, bool match_effective_scalar)
+      : val_(val), match_effective_scalar_(match_effective_scalar) {}
+
+  bool Match(const ::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  bool Match(::xla::HloInstruction* inst, MatchOption option) const {
+    return MatchImpl(inst, option);
+  }
+
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    *os << "which is a constant "
+        << (match_effective_scalar_ ? "effective " : "") << "scalar";
+    if (val_.has_value()) {
+      *os << " with value " << *val_;
+    }
+  }
+
+ private:
+  template <typename InstTy>
+  bool MatchImpl(InstTy* inst, MatchOption option) const {
+    const auto* const_inst = DynCast<HloConstantInstruction>(inst);
+    if (!const_inst) {
+      EXPLAIN << "HloInstruction is not a constant";
+      return false;
+    }
+    if (match_effective_scalar_ &&
+        !ShapeUtil::IsEffectiveScalar(inst->shape())) {
+      EXPLAIN << "HloInstruction is not an effective scalar";
+      return false;
+    }
+    if (!match_effective_scalar_ && !ShapeUtil::IsScalar(inst->shape())) {
+      EXPLAIN << "HloInstruction is not a scalar";
+      return false;
+    }
+    if (!val_.has_value()) {
+      return true;
+    }
+
+    // Check that literal == static_cast<LitearlTy>(val) and
+    // val == static_cast<ValTy>(literal).  This is sufficient to ensure that
+    // the two constant scalars are actually "equal".
+    auto val_literal = LiteralUtil::CreateR0(*val_);
+    auto literal_r0_or = const_inst->literal().Reshape({});
+    auto val_as_literal_ty_or =
+        val_literal.Convert(const_inst->shape().element_type());
+    if (!literal_r0_or.ok() || !val_as_literal_ty_or.ok()) {
+      EXPLAIN << "could not construct relevant Literals (how did this happen?)";
+      return false;
+    }
+    auto literal_r0 = std::move(literal_r0_or).ValueOrDie();
+    auto val_as_literal_ty = std::move(val_as_literal_ty_or).ValueOrDie();
+    auto literal_r0_as_val_ty_or =
+        literal_r0.Convert(val_literal.shape().element_type());
+    bool rv = literal_r0_as_val_ty_or.ok() &&  //
+              literal_r0_as_val_ty_or.ValueOrDie() == val_literal &&
+              literal_r0 == val_as_literal_ty;
+    if (!rv) {
+      EXPLAIN << "HloInstruction's constant value " << literal_r0.ToString()
+              << " did not match expected value " << *val_;
+    }
+    return rv;
+  }
+
+  absl::optional<ScalarTy> val_;
+  bool match_effective_scalar_;
+};
 
 // A pattern that matches HloInstructions.
 template <typename HloInstructionType, typename Impl>
 class HloInstructionPattern {
  private:
   template <typename NewImpl>
-  HloInstructionPattern<HloInstructionType,
-                        AllOfPattern<::xla::HloInstruction, Impl, NewImpl>>
-  AppendImpl(NewImpl new_impl) const {
-    return HloInstructionPattern<
-        HloInstructionType, AllOfPattern<::xla::HloInstruction, Impl, NewImpl>>(
-        AllOf<HloInstruction>(impl_, std::move(new_impl)), matched_inst_);
+  auto AppendImpl(NewImpl new_impl) const -> HloInstructionPattern<
+      HloInstructionType, decltype(AllOf<HloInstruction>(
+                              std::declval<Impl>(), std::move(new_impl)))> {
+    auto new_allof = AllOf<HloInstruction>(impl_, std::move(new_impl));
+    return HloInstructionPattern<HloInstructionType, decltype(new_allof)>(
+        std::move(new_allof), matched_inst_);
   }
 
  public:
@@ -895,6 +1773,9 @@ class HloInstructionPattern {
       }
       return true;
     }
+    if (inst != nullptr) {
+      EXPLAIN << "\nin " << InstToString(inst);
+    }
     return false;
   }
 
@@ -906,6 +1787,7 @@ class HloInstructionPattern {
       }
       return true;
     }
+    EXPLAIN << "\nin " << InstToString(inst);
     return false;
   }
 
@@ -935,12 +1817,47 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternOpcodeImpl(opcode, true));
   }
 
+  constexpr auto Is(const HloInstruction* instr) const
+      -> decltype(this->AppendImpl(HloInstructionIsImpl(instr))) {
+    return AppendImpl(HloInstructionIsImpl(instr));
+  }
+
   // Modifies the pattern to match only if the instruction is a constant.
   constexpr auto IsConstant() const
       -> decltype(this->WithOpcode(HloOpcode::kConstant)) {
     return WithOpcode(HloOpcode::kConstant);
   }
 
+  constexpr auto IsConstantScalar() const -> decltype(this->AppendImpl(
+      HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/false))) {
+    return AppendImpl(
+        HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/false));
+  }
+
+  // This does not check that T has the same type as the instruction, so e.g.
+  // IsConstantScalar(1.0) may match a constant of shape int32[].
+  template <typename ScalarTy>
+  constexpr auto IsConstantScalar(const ScalarTy& val) const
+      -> decltype(this->AppendImpl(HloConstantScalarImpl<ScalarTy>(
+          val, /*match_effective_scalar=*/false))) {
+    return AppendImpl(
+        HloConstantScalarImpl<ScalarTy>(val, /*match_effective_scalar=*/false));
+  }
+
+  constexpr auto IsConstantEffectiveScalar() const -> decltype(this->AppendImpl(
+      HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/true))) {
+    return AppendImpl(
+        HloConstantScalarImpl</*Dummy*/ int>(/*match_effective_scalar=*/true));
+  }
+
+  template <typename ScalarTy>
+  constexpr auto IsConstantEffectiveScalar(const ScalarTy& val) const
+      -> decltype(this->AppendImpl(HloConstantScalarImpl<ScalarTy>(
+          val, /*match_effective_scalar=*/true))) {
+    return AppendImpl(
+        HloConstantScalarImpl<ScalarTy>(val, /*match_effective_scalar=*/true));
+  }
+
   // Modifies the pattern to match only if the instruction is not a constant.
   constexpr auto IsNonConstant() const
       -> decltype(this->WithoutOpcode(HloOpcode::kConstant)) {
@@ -957,6 +1874,22 @@ class HloInstructionPattern {
         HloInstructionPatternShapeImpl<ShapeType, ShapeImpl>(shape));
   }
 
+  // Make this a templated function to work around gcc 4.9.4 template infinite
+  // recursion bug.
+  template <typename Dummy = void>
+  constexpr auto WithShapeEqualTo(const ::xla::Shape* shape)
+      -> decltype(this->WithShape(Shape().EqualTo(shape))) {
+    return WithShape(Shape().EqualTo(shape));
+  }
+
+  // Make this a templated function to work around gcc 4.9.4 template infinite
+  // recursion bug.
+  template <typename Dummy = void>
+  constexpr auto WithShapeCompatibleTo(const ::xla::Shape* shape)
+      -> decltype(this->WithShape(Shape().CompatibleTo(shape))) {
+    return WithShape(Shape().CompatibleTo(shape));
+  }
+
   // Modifies the pattern to match only if the instruction has an operand that
   // matches the given pattern.
   template <typename OperandType, typename OperandImpl>
@@ -971,6 +1904,20 @@ class HloInstructionPattern {
             operand_index, operand));
   }
 
+  template <typename OperandType1, typename OperandImpl1, typename OperandType2,
+            typename OperandImpl2>
+  constexpr auto WithBinaryOperandsAnyOrder(
+      const HloInstructionPattern<OperandType1, OperandImpl1>& op1,
+      const HloInstructionPattern<OperandType2, OperandImpl2>& op2) const
+      -> decltype(this->AppendImpl(
+          HloInstructionPatternBinaryOperandsAnyOrderImpl<
+              OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1,
+                                                                      op2))) {
+    return AppendImpl(
+        HloInstructionPatternBinaryOperandsAnyOrderImpl<
+            OperandType1, OperandImpl1, OperandType2, OperandImpl2>(op1, op2));
+  }
+
   // Modifies the pattern to match only if the instruction is a fusion node with
   // the given kind.
   constexpr auto WithFusionKind(HloInstruction::FusionKind kind) const
@@ -985,17 +1932,34 @@ class HloInstructionPattern {
     return AppendImpl(HloInstructionPatternTupleIndexImpl(tuple_index));
   }
 
- private:
-  template <typename Predicate>
-  constexpr auto WithPredicate(Predicate pred) const -> decltype(
-      this->AppendImpl(HloPredicatePatternImpl<HloInstruction, Predicate>(
-          std::move(pred)))) {
-    return AppendImpl(
-        HloPredicatePatternImpl<HloInstruction, Predicate>(std::move(pred)));
+  // Modifies the pattern to match only if the instruction is a parameter
+  // with the given parameter number.
+  constexpr auto WithParameterNum(int64 parameter_num) const -> decltype(
+      this->AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num))) {
+    return AppendImpl(HloInstructionPatternParameterNumImpl(parameter_num));
   }
 
-  friend struct PatternFriend;
+  // Modifies the pattern to match if the instruction is used exactly once.
+  // Does not match if the instruction is used twice by the same user (e.g.
+  // multiply(x,x)).
+  constexpr auto WithOneUse() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUseImpl())) {
+    return AppendImpl(HloInstructionPatternOneUseImpl());
+  }
+
+  // Modifies the pattern to match if the instruction is used by exactly one
+  // other instruction.  Will match if the instruction is used twice, so long as
+  // it's by the same user (e.g.  multiply(x,x)).
+  constexpr auto WithOneUser() const
+      -> decltype(this->AppendImpl(HloInstructionPatternOneUserImpl())) {
+    return AppendImpl(HloInstructionPatternOneUserImpl());
+  }
 
+  void DescribeTo(std::ostream* os, int64 indent = 0) const {
+    impl_.DescribeTo(os, indent);
+  }
+
+ private:
   Impl impl_;
   HloInstructionType** matched_inst_;
 };
@@ -1036,6 +2000,7 @@ Op(::xla::HloInstruction** matched_inst) {
 XLA_NULLOP_PATTERN(Constant)
 XLA_NULLOP_PATTERN(Parameter)
 XLA_NULLOP_PATTERN(Iota)
+XLA_NULLOP_PATTERN(Rng)
 #undef XLA_NULLOP_PATTERN
 
 // Helpers for unary instructions.
@@ -1067,8 +2032,10 @@ XLA_UNOP_PATTERN(RoundNearestAfz)
 XLA_UNOP_PATTERN(Bitcast)
 XLA_UNOP_PATTERN(Broadcast)
 XLA_UNOP_PATTERN(Ceil)
+XLA_UNOP_PATTERN(Convert)
 XLA_UNOP_PATTERN(Copy)
 XLA_UNOP_PATTERN(Cos)
+XLA_UNOP_PATTERN(CrossReplicaSum)
 XLA_UNOP_PATTERN(Exp)
 XLA_UNOP_PATTERN(Fft)
 XLA_UNOP_PATTERN(Floor)
@@ -1088,6 +2055,7 @@ XLA_UNOP_PATTERN(Reverse)
 XLA_UNOP_PATTERN(SendDone)
 XLA_UNOP_PATTERN(Sign)
 XLA_UNOP_PATTERN(Sin)
+XLA_UNOP_PATTERN(Slice)
 XLA_UNOP_PATTERN(Sort)
 XLA_UNOP_PATTERN(Tanh)
 XLA_UNOP_PATTERN(Transpose)
@@ -1125,25 +2093,32 @@ XLA_UNOP_PATTERN(Transpose)
 #define XLA_COMMUTATIVE_BINOP_PATTERN(NAME)                                 \
   XLA_BINOP_PATTERN(NAME)                                                   \
                                                                             \
-  template <typename Lhs, typename Rhs>                                     \
-  inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
-      ->decltype(AnyOf<HloInstruction>(NAME(lhs, rhs), NAME(rhs, lhs))) {   \
-    return AnyOf<HloInstruction>(NAME(lhs, rhs), NAME(rhs, lhs));           \
-  }                                                                         \
-                                                                            \
   template <typename HloInstructionType, typename Lhs, typename Rhs>        \
   inline auto NAME##AnyOrder(HloInstructionType** matched_inst, Lhs&& lhs,  \
                              Rhs&& rhs)                                     \
-      ->decltype(AnyOf<HloInstructionType>(NAME(matched_inst, lhs, rhs),    \
-                                           NAME(matched_inst, rhs, lhs))) { \
-    return AnyOf<HloInstructionType>(NAME(matched_inst, lhs, rhs),          \
-                                     NAME(matched_inst, rhs, lhs));         \
+      ->decltype(Op(matched_inst)                                           \
+                     .WithOpcode(HloOpcode::k##NAME)                        \
+                     .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),    \
+                                                 std::forward<Rhs>(rhs))) { \
+    return Op(matched_inst)                                                 \
+        .WithOpcode(HloOpcode::k##NAME)                                     \
+        .WithBinaryOperandsAnyOrder(std::forward<Lhs>(lhs),                 \
+                                    std::forward<Rhs>(rhs));                \
+  }                                                                         \
+  template <typename Lhs, typename Rhs>                                     \
+  inline auto NAME##AnyOrder(Lhs&& lhs, Rhs&& rhs)                          \
+      ->decltype(NAME##AnyOrder<const HloInstruction>(                      \
+          nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs))) {       \
+    return NAME##AnyOrder<const HloInstruction>(                            \
+        nullptr, std::forward<Lhs>(lhs), std::forward<Rhs>(rhs));           \
   }
 XLA_COMMUTATIVE_BINOP_PATTERN(Add)
 XLA_BINOP_PATTERN(Atan2)
 XLA_BINOP_PATTERN(Divide)
 XLA_BINOP_PATTERN(Complex)
+XLA_BINOP_PATTERN(Convolution)
 XLA_BINOP_PATTERN(Dot)
+XLA_BINOP_PATTERN(DynamicSlice)
 XLA_COMMUTATIVE_BINOP_PATTERN(Eq)
 XLA_BINOP_PATTERN(Gather)
 XLA_BINOP_PATTERN(Ge)
@@ -1155,7 +2130,9 @@ XLA_COMMUTATIVE_BINOP_PATTERN(Minimum)
 XLA_COMMUTATIVE_BINOP_PATTERN(Multiply)
 XLA_COMMUTATIVE_BINOP_PATTERN(Ne)
 XLA_BINOP_PATTERN(Outfeed)
+XLA_BINOP_PATTERN(Pad)
 XLA_BINOP_PATTERN(Power)
+XLA_BINOP_PATTERN(ReduceWindow)
 XLA_BINOP_PATTERN(Remainder)
 XLA_BINOP_PATTERN(Send)
 XLA_BINOP_PATTERN(Subtract)
@@ -1202,6 +2179,7 @@ XLA_BINOP_PATTERN(ShiftRightLogical)
         .WithOperand(2, std::forward<Arg2>(arg2));                     \
   }
 XLA_TERNOP_PATTERN(Clamp);
+XLA_TERNOP_PATTERN(Scatter);
 XLA_TERNOP_PATTERN(Select);
 #undef XLA_TERNOP_PATTERN
 
@@ -1254,32 +2232,12 @@ inline auto WithOperands(Matcher&& m, int64 operand_num, FirstArg&& first_arg,
 
 // We could implement all ops as "variadic" ops, but it would make the
 // already-bad compile errors even worse.
+XLA_VARIADIC_OP_PATTERN(AfterAll);
 XLA_VARIADIC_OP_PATTERN(Concatenate);
+XLA_VARIADIC_OP_PATTERN(CustomCall);
+XLA_VARIADIC_OP_PATTERN(Map)
 XLA_VARIADIC_OP_PATTERN(Reduce);
-
-namespace detail {
-struct PatternFriend {
-  template <typename T>
-  static auto ConstantScalar(T constant) -> decltype(
-      Constant()
-          .WithShape(match::Shape().IsScalar())
-          .WithPredicate(
-              std::declval<std::function<bool(const HloInstruction*)>>())) {
-    std::function<bool(const HloInstruction*)> pred =
-        [constant](const HloInstruction* instr) {
-          const auto& literal = Cast<HloConstantInstruction>(instr)->literal();
-          auto status_or_const = LiteralUtil::CreateR0(constant).Convert(
-              literal.shape().element_type());
-          return status_or_const.ok() &&
-                 literal == status_or_const.ConsumeValueOrDie();
-        };
-
-    return Constant()
-        .WithShape(match::Shape().IsScalar())
-        .WithPredicate(std::move(pred));
-  }
-};
-}  // namespace detail
+XLA_VARIADIC_OP_PATTERN(Tuple);
 
 // Helpers for matching non-constant instructions.
 inline auto NonConstant() -> decltype(Op().IsNonConstant()) {
@@ -1318,14 +2276,71 @@ inline auto GetTupleElement(HloInstructionType** matched_inst, Arg&& arg,
       .WithTupleIndex(tuple_index);
 }
 
-template <typename T>
-inline auto ConstantScalar(T constant)
-    -> decltype(detail::PatternFriend::ConstantScalar(constant)) {
-  return detail::PatternFriend::ConstantScalar(constant);
+// Add overloads for Parameter which take an int64 specifying the parameter
+// number.
+inline auto Parameter(int64 parameter_num) -> decltype(
+    Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num)) {
+  return Op().WithOpcode(HloOpcode::kParameter).WithParameterNum(parameter_num);
+}
+template <typename HloInstructionType>
+inline auto Parameter(HloInstructionType** matched_inst, int64 parameter_num)
+    -> decltype(Op(matched_inst)
+                    .WithOpcode(HloOpcode::kParameter)
+                    .WithParameterNum(parameter_num)) {
+  return Op(matched_inst)
+      .WithOpcode(HloOpcode::kParameter)
+      .WithParameterNum(parameter_num);
+}
+
+inline auto ConstantScalar() -> decltype(Op().IsConstantScalar()) {
+  return Op().IsConstantScalar();
+}
+
+template <typename HloInstructionType>
+inline auto ConstantScalar(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsConstantScalar()) {
+  return Op(matched_inst).IsConstantScalar();
+}
+
+template <typename ScalarTy>
+inline auto ConstantScalar(ScalarTy val)
+    -> decltype(Op().IsConstantScalar(val)) {
+  return Op().IsConstantScalar(val);
+}
+
+template <typename HloInstructionType, typename ScalarTy>
+inline auto ConstantScalar(HloInstructionType** matched_inst, ScalarTy val)
+    -> decltype(Op(matched_inst).IsConstantScalar(val)) {
+  return Op(matched_inst).IsConstantScalar(val);
+}
+
+inline auto ConstantEffectiveScalar() -> decltype(Op().IsConstantScalar()) {
+  return Op().IsConstantEffectiveScalar();
+}
+
+template <typename HloInstructionType>
+inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst)
+    -> decltype(Op(matched_inst).IsConstantScalar()) {
+  return Op(matched_inst).IsConstantEffectiveScalar();
+}
+
+template <typename ScalarTy>
+inline auto ConstantEffectiveScalar(ScalarTy val)
+    -> decltype(Op().IsConstantEffectiveScalar(val)) {
+  return Op().IsConstantEffectiveScalar(val);
+}
+
+template <typename HloInstructionType, typename ScalarTy>
+inline auto ConstantEffectiveScalar(HloInstructionType** matched_inst,
+                                    ScalarTy val)
+    -> decltype(Op(matched_inst).IsConstantEffectiveScalar(val)) {
+  return Op(matched_inst).IsConstantEffectiveScalar(val);
 }
 
 }  // namespace match
 
 }  // namespace xla
 
+#undef EXPLAIN
+#pragma pop_macro("EXPLAIN")
 #endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock.h b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h
new file mode 100644
index 0000000000000000000000000000000000000000..8fe2d10a11b5b2d26ee222c63e0db2d55e361d12
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock.h
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
+#define TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
+
+#include <ostream>
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+
+namespace pattern_matcher_gmock_detail {
+template <typename Pattern>
+class GmockMatcher {
+ public:
+  explicit GmockMatcher(Pattern p) : pattern_(std::move(p)) {}
+
+  // In service of better error messages, list out the overloads explicitly
+  // rather than just using a template.  gMock's polymorphism plus
+  // pattern_matcher yields some pretty gnarly stuff.
+  bool MatchAndExplain(const Layout& l,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&l, listener);
+  }
+  bool MatchAndExplain(const Layout* l,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(l, listener);
+  }
+
+  bool MatchAndExplain(const Shape& s,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&s, listener);
+  }
+  bool MatchAndExplain(const Shape* s,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(s, listener);
+  }
+
+  bool MatchAndExplain(const HloInstruction& instr,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(&instr, listener);
+  }
+  bool MatchAndExplain(const HloInstruction* instr,
+                       ::testing::MatchResultListener* listener) const {
+    return MatchAndExplainImpl(instr, listener);
+  }
+
+  void DescribeTo(std::ostream* os) const { pattern_.DescribeTo(os); }
+
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << "is NOT: ";
+    DescribeTo(os);
+  }
+
+ private:
+  template <typename T>
+  bool MatchAndExplainImpl(const T* t,
+                           ::testing::MatchResultListener* listener) const {
+    MatchOption options{/*.capture=*/true, /*.explain_os=*/listener->stream()};
+    return Match(t, pattern_, options);
+  }
+
+  Pattern pattern_;
+};
+}  // namespace pattern_matcher_gmock_detail
+
+template <typename Pattern>
+::testing::PolymorphicMatcher<
+    pattern_matcher_gmock_detail::GmockMatcher<Pattern>>
+GmockMatch(Pattern&& p) {
+  return ::testing::MakePolymorphicMatcher(
+      pattern_matcher_gmock_detail::GmockMatcher<Pattern>(
+          std::forward<Pattern>(p)));
+}
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SERVICE_PATTERN_MATCHER_GMOCK_H_
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9ca2fb05c1f7ef093c58237cf21fbc7c813a592a
--- /dev/null
+++ b/tensorflow/compiler/xla/service/pattern_matcher_gmock_test.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace xla {
+namespace {
+
+namespace m = ::xla::match;
+using ::testing::Eq;
+using ::testing::Not;
+
+template <typename MatchedTy>
+string Describe(const ::testing::Matcher<MatchedTy>& m) {
+  std::stringstream ss;
+  m.DescribeTo(&ss);
+  return ss.str();
+}
+
+template <typename MatchedTy>
+string Explain(
+    const MatchedTy& val,
+    const ::testing::Matcher<typename std::remove_cv<MatchedTy>::type>& m) {
+  ::testing::StringMatchResultListener listener;
+  EXPECT_THAT(val, ::testing::Not(m));  // For the error message.
+  EXPECT_FALSE(m.MatchAndExplain(val, &listener));
+  return listener.str();
+}
+
+// This file tests the GmockMatch function.  The actual explanation and
+// description returned by matchers is tested in pattern_matchers_test.
+TEST(PatternMatcherGmock, MatchShape) {
+  Shape s = ShapeUtil::MakeShape(F32, {10, 100});
+  // You can pass const Shape& or a const Shape*.
+  EXPECT_THAT(s, GmockMatch(m::Shape()));
+  EXPECT_THAT(&s, Not(GmockMatch(m::Shape().WithElementType(F16))));
+  EXPECT_THAT(Describe<Shape>(GmockMatch(m::Shape().IsArray())),
+              "a shape that represents an array");
+}
+
+TEST(PatternMatcherGmock, MatchLayout) {
+  Layout l = LayoutUtil::MakeLayout({0, 1});
+  EXPECT_THAT(l, GmockMatch(m::Layout()));
+  EXPECT_THAT(&l, Not(GmockMatch(m::Layout().WithSparseFormat())));
+  EXPECT_THAT(Describe<Layout>(GmockMatch(m::Layout().WithSparseFormat())),
+              "a layout with format SPARSE");
+}
+
+TEST(PatternMatchGmock, MatchInstruction) {
+  auto instr =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {42}), "p");
+  EXPECT_THAT(instr.get(), GmockMatch(m::Parameter()));
+  EXPECT_THAT(*instr, GmockMatch(m::Parameter(0)));
+  EXPECT_THAT(*instr, Not(GmockMatch(m::Parameter(1))));
+  EXPECT_THAT(Describe<HloInstruction*>(GmockMatch(m::Parameter())),
+              "an HloInstruction with opcode parameter");
+}
+
+}  // anonymous namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/service/pattern_matcher_test.cc b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
index 3f74273517aebfd6f2700a9275b92765e29f21cc..186ef0c7911a2724df810780e018f52586e3e6a8 100644
--- a/tensorflow/compiler/xla/service/pattern_matcher_test.cc
+++ b/tensorflow/compiler/xla/service/pattern_matcher_test.cc
@@ -14,14 +14,18 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/compiler/xla/service/pattern_matcher.h"
+#include "absl/strings/str_cat.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_opcode.h"
 #include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/core/platform/test.h"
 
 namespace xla {
 namespace {
 
+namespace m = match;
+
 TEST(PatternMatcherTest, AddOp) {
   constexpr char kModuleStr[] = R"(HloModule two_plus_two_module
     ENTRY %two_plus_two_computation () -> f32[] {
@@ -229,23 +233,74 @@ TEST(PatternMatcherTest, AnyOf) {
 }
 
 TEST(PatternMatcherTest, ConstantScalar) {
-  constexpr char kModuleStr[] = R"(
-    HloModule test_module ENTRY test { ROOT constant = f16[] constant(42) })";
-  TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
-  auto* root = hlo_module->entry_computation()->root_instruction();
-
-  EXPECT_TRUE(Match(root, match::ConstantScalar(42)));
-  EXPECT_FALSE(Match(root, match::ConstantScalar(41)));
-  EXPECT_FALSE(Match(root, match::ConstantScalar(0)));
-}
+  using match::ConstantEffectiveScalar;
+  using match::ConstantScalar;
+  using match::Op;
+  using match::Tuple;
 
-TEST(PatternMatcherTest, NoMatchConstantScalar) {
   constexpr char kModuleStr[] = R"(
-    HloModule test_module ENTRY test { ROOT v = f16[] parameter(0) })";
+    HloModule test_module
+    ENTRY test {
+      a = s32[] constant(1)
+      b = s32[1,1] constant(s32[1,1]{{2}})
+      c = s32[1,2] constant(s32[1,2]{{2,2}})
+      d = f32[] constant(1)
+      e = f32[] constant(1.25)
+      ROOT tuple = (s32[], s32[1,1], s32[1,2], f32[], f32[]) tuple(a,b,c,d,e)
+    })";
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
   auto* root = hlo_module->entry_computation()->root_instruction();
 
-  EXPECT_FALSE(Match(root, match::ConstantScalar(42)));
+  const HloInstruction* a = root->operand(0);
+  const HloInstruction* b = root->operand(1);
+  const HloInstruction* c = root->operand(2);
+  const HloInstruction* d = root->operand(3);
+  const HloInstruction* e = root->operand(4);
+  EXPECT_TRUE(Match(a, ConstantScalar()));
+  EXPECT_TRUE(Match(a, ConstantScalar(1)));
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar()));
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(1)));
+  EXPECT_FALSE(Match(a, ConstantScalar(2)));
+  EXPECT_FALSE(Match(a, ConstantScalar(2.01)));
+  EXPECT_FALSE(Match(a, ConstantEffectiveScalar(2)));
+  EXPECT_FALSE(Match(a, ConstantEffectiveScalar(1.01)));
+
+  EXPECT_FALSE(Match(b, ConstantScalar()));
+  EXPECT_FALSE(Match(b, ConstantScalar(2)));
+  EXPECT_TRUE(Match(b, ConstantEffectiveScalar()));
+  EXPECT_TRUE(Match(b, ConstantEffectiveScalar(2)));
+
+  EXPECT_FALSE(Match(c, ConstantScalar()));
+  EXPECT_FALSE(Match(c, ConstantScalar(2)));
+  EXPECT_FALSE(Match(c, ConstantEffectiveScalar()));
+  EXPECT_FALSE(Match(c, ConstantEffectiveScalar(2)));
+
+  EXPECT_TRUE(Match(d, ConstantScalar(1)));
+  EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1)));
+  EXPECT_TRUE(Match(d, ConstantScalar(1.0)));
+  EXPECT_TRUE(Match(d, ConstantEffectiveScalar(1.0)));
+
+  EXPECT_TRUE(Match(e, ConstantScalar(1.25f)));
+  EXPECT_TRUE(Match(e, ConstantScalar(1.25)));
+  EXPECT_TRUE(Match(e, ConstantEffectiveScalar(1.25)));
+  EXPECT_FALSE(Match(e, ConstantScalar(1)));
+  EXPECT_FALSE(Match(e, ConstantEffectiveScalar(1)));
+
+  const HloInstruction* instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantScalar(&instr)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantScalar(&instr, 1)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr)));
+  EXPECT_EQ(instr, a);
+
+  instr = nullptr;
+  EXPECT_TRUE(Match(a, ConstantEffectiveScalar(&instr, 1)));
+  EXPECT_EQ(instr, a);
 }
 
 TEST(PatternMatcherTest, MultiplyAnyOrder) {
@@ -267,6 +322,15 @@ TEST(PatternMatcherTest, MultiplyAnyOrder) {
       root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52))));
   EXPECT_TRUE(Match(
       root, MultiplyAnyOrder(&instr, ConstantScalar(52), ConstantScalar(42))));
+
+  // Check that MultiplyAnyOrder exposes the same API as Op(), so we can call
+  // e.g. IsNonConstant() on it.
+  EXPECT_TRUE(Match(
+      root, MultiplyAnyOrder(&instr, ConstantScalar(42), ConstantScalar(52))
+                .IsNonConstant()));
+  EXPECT_TRUE(
+      Match(root, MultiplyAnyOrder(ConstantScalar(42), ConstantScalar(52))
+                      .IsNonConstant()));
 }
 
 TEST(PatternMatcherTest, AnyOfShortCircuit) {
@@ -315,14 +379,22 @@ TEST(PatternMatcherTest, AllOf) {
   TF_ASSERT_OK_AND_ASSIGN(auto hlo_module, ParseHloString(kModuleStr));
   auto* root = hlo_module->entry_computation()->root_instruction();
 
+  auto f16_scalar = ShapeUtil::MakeShape(F16, {});
+  auto f16_pattern = Constant().WithShapeEqualTo(&f16_scalar);
+  auto f16_compatible_pattern = Constant().WithShapeCompatibleTo(&f16_scalar);
   auto scalar_pattern = Constant().WithShape(match::Shape().IsScalar());
-  auto f16_pattern = Constant().WithShape(match::Shape().WithElementType(F16));
   ASSERT_TRUE(Match(root, scalar_pattern));
   ASSERT_TRUE(Match(root, f16_pattern));
-  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(scalar_pattern, f16_pattern)));
-  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(f16_pattern, scalar_pattern)));
+  ASSERT_TRUE(Match(root, f16_compatible_pattern));
+  EXPECT_TRUE(Match(root, AllOf<HloInstruction>(scalar_pattern, f16_pattern,
+                                                f16_compatible_pattern)));
+  EXPECT_TRUE(
+      Match(root, AllOf<HloInstruction>(f16_pattern, f16_compatible_pattern,
+                                        scalar_pattern)));
   EXPECT_FALSE(
       Match(root, AllOf<HloInstruction>(Broadcast(Op()), f16_pattern)));
+  EXPECT_FALSE(Match(
+      root, AllOf<HloInstruction>(Broadcast(Op()), f16_compatible_pattern)));
   EXPECT_FALSE(
       Match(root, AllOf<HloInstruction>(Broadcast(Op()), scalar_pattern)));
 }
@@ -431,5 +503,433 @@ TEST(PatternMatcherTest, TestConcat) {
                         Reshape(ConstantScalar(4)))));
 }
 
+template <typename Pattern>
+string Description(const Pattern& pattern) {
+  std::stringstream ss;
+  pattern.DescribeTo(&ss);
+  return ss.str();
+}
+
+template <typename Elem, typename Pattern>
+string Explanation(Elem* elem, const Pattern& pattern) {
+  std::stringstream ss;
+  MatchOption options{/*.capture=*/true, /*.explain_os=*/&ss};
+  Match(elem, pattern, options);
+  return ss.str();
+}
+template <typename Elem, typename Pattern>
+string Explanation(const std::unique_ptr<Elem>& elem, const Pattern& pattern) {
+  return Explanation(elem.get(), pattern);
+}
+template <typename Elem, typename Pattern>
+string Explanation(const Elem& elem, const Pattern& pattern) {
+  return Explanation(&elem, pattern);
+}
+
+// Helper macro for checking a pattern's description and the explanation printed
+// when attempting to match (and presumably failing) on a given object.
+//
+// We use a macro rather than a function because we want good line numbers in
+// errors.  We use this rather than writing a helper that returns a pair of
+// (description, explanation) and doing something like
+//
+//   EXPECT_THAT(DescAndExplanation(...), ::testing::Pair(..., ...));
+//
+// because EXPECT_EQ prints a unified diff if multiline string comparison fails,
+// while EXPECT_THAT does not.  This unified diff makes the errors much easier
+// to read.
+#define EXPECT_DESC_AND_EXPLANATION(elem, pattern, expected_desc,    \
+                                    expected_explanation)            \
+  do {                                                               \
+    EXPECT_EQ(Description(pattern), (expected_desc));                \
+    EXPECT_EQ(Explanation((elem), (pattern)), expected_explanation); \
+  } while (0)
+
+TEST(PatternMatcherTest, LayoutDescribeToAndExplain) {
+  auto layout = LayoutUtil::MakeLayout({1, 2});
+  auto layout2 = LayoutUtil::MakeLayout({2, 2});
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const Layout*>(nullptr), m::Layout(),
+                              "a layout", "Layout is null");
+  EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().EqualTo(&layout),
+                              "a layout equal to {1,2}",
+                              "Layout {2,2} is not equal to expected {1,2}");
+  EXPECT_DESC_AND_EXPLANATION(layout2, m::Layout().WithSparseFormat(),
+                              "a layout with format SPARSE",
+                              "Layout has format DENSE but expected SPARSE");
+  EXPECT_DESC_AND_EXPLANATION(layout,
+                              m::Layout().EqualTo(&layout).WithSparseFormat(),
+                              "a layout:\n"
+                              " * equal to {1,2} AND\n"
+                              " * with format SPARSE",
+                              "Layout has format DENSE but expected SPARSE");
+}
+
+TEST(PatternMatcherTest, ShapeDescribeToAndExplain) {
+  auto shape = ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {0, 1});
+  auto layout = shape.layout();
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const Shape*>(nullptr), m::Shape(),
+                              "a shape", "Shape is null");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}),
+      m::Shape().EqualTo(&shape), "a shape equal to f32[1,2]{0,1}",
+      "Shape not equal to f32[1,2]{0,1}\n"
+      "in f32[1,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeShape(F32, {2, 2}),
+                              m::Shape().CompatibleTo(&shape),
+                              "a shape compatible with f32[1,2]",
+                              "Shape not compatible with f32[1,2]\n"
+                              "in f32[2,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithElementType(F16),
+                              "a shape with element type F16",
+                              "Shape does not have element type F16\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsScalar(),
+                              "a shape that represents a scalar",
+                              "Shape is not a scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(), m::Shape().IsArray(),
+                              "a shape that represents an array",
+                              "Shape is not an array\n"
+                              "in ()");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsTuple(),
+                              "a shape that represents a tuple",
+                              "Shape is not a tuple\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().IsEffectiveScalar(),
+                              "a shape that is an effective scalar",
+                              "Shape is not an effective scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(42),
+                              "a shape that has 42 dimensions",
+                              "Shape does not have rank 42\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(0),
+                              "a shape that is a scalar",
+                              "Shape is not a scalar\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape, m::Shape().WithRank(1).IsArray(),
+                              "a shape:\n"
+                              " * that has 1 dimension AND\n"
+                              " * that represents an array",
+                              "Shape does not have rank 1\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(ShapeUtil::MakeNil(),
+                              m::Shape().IsArray().WithRank(1),
+                              "a shape:\n"
+                              " * that represents an array AND\n"
+                              " * that has 1 dimension",
+                              "Shape is not an array\n"
+                              "in ()");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeShapeWithLayout(F32, {1, 2}, {1, 0}),
+      m::Shape().WithLayoutEqualTo(&layout),
+      "a shape with\n  a layout equal to {0,1}",
+      "Layout {1,0} is not equal to expected {0,1}\n"
+      "in f32[1,2]{1,0}");
+  EXPECT_DESC_AND_EXPLANATION(
+      shape, m::Shape().WithLayout(m::Layout().WithSparseFormat()),
+      "a shape with\n  a layout with format SPARSE",
+      "Layout has format DENSE but expected SPARSE\n"
+      "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(shape,
+                              m::Shape().WithSubshapeEqualTo({10}, &shape),
+                              "a shape with subshape at index {10} which is\n"
+                              "  a shape equal to f32[1,2]{0,1}",
+                              "No subshape at {10}\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}),
+      m::Shape().WithSubshapeEqualTo({0}, &shape),
+      "a shape with subshape at index {0} which is\n"
+      "  a shape equal to f32[1,2]{0,1}",
+      "Shape not equal to f32[1,2]{0,1}\n"
+      "in f32[2,2]{1,0}\n"
+      "in subshape at {0}\n"
+      "in (f32[2,2])");
+  EXPECT_DESC_AND_EXPLANATION(shape,
+                              m::Shape().WithSubshapeCompatibleTo({10}, &shape),
+                              "a shape with subshape at index {10} which is\n"
+                              "  a shape compatible with f32[1,2]",
+                              "No subshape at {10}\n"
+                              "in f32[1,2]{0,1}");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {2, 2})}),
+      m::Shape().WithSubshapeCompatibleTo({0}, &shape),
+      "a shape with subshape at index {0} which is\n"
+      "  a shape compatible with f32[1,2]",
+      "Shape not compatible with f32[1,2]\n"
+      "in f32[2,2]{1,0}\n"
+      "in subshape at {0}\n"
+      "in (f32[2,2])");
+  EXPECT_DESC_AND_EXPLANATION(
+      ShapeUtil::MakeTupleShape({ShapeUtil::MakeTupleShape({shape})}),
+      m::Shape().WithSubshape({0, 0}, m::Shape().IsScalar()),
+      "a shape with subshape at index {0,0} which is\n"
+      "  a shape that represents a scalar",
+      "Shape is not a scalar\n"
+      "in f32[1,2]{0,1}\n"
+      "in subshape at {0,0}\n"
+      "in ((f32[1,2]))");
+}
+
+std::unique_ptr<HloInstruction> SetName(absl::string_view name,
+                                        std::unique_ptr<HloInstruction> instr) {
+  instr->SetAndSanitizeName(string(name));
+  return instr;
+}
+
+TEST(PatternMatcherTest, HloInstructionDescribeToAndExplain) {
+  std::unique_ptr<HloInstruction> iota =
+      SetName("i", HloInstruction::CreateIota(ShapeUtil::MakeShape(S32, {42}),
+                                              /*iota_dimension=*/0));
+  std::unique_ptr<HloInstruction> constant =
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
+
+  EXPECT_DESC_AND_EXPLANATION(static_cast<const HloInstruction*>(nullptr),
+                              m::Op(), "an HloInstruction",
+                              "HloInstruction* is null");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithName("foo"),
+                              "an HloInstruction named \"foo\"",
+                              "HloInstruction not named \"foo\"\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithOpcode(HloOpcode::kAdd),
+                              "an HloInstruction with opcode add",
+                              "HloInstruction doesn't have opcode add\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      constant, m::Op().IsNonConstant(),
+      "an HloInstruction with any opcode other than constant",
+      "HloInstruction has opcode constant, expected anything else\n"
+      "in c = s32[] constant(0)");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithNumOperands(42),
+                              "an HloInstruction with 42 operands",
+                              "HloInstruction doesn't have 42 operands\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().WithShape(m::Shape().IsTuple()),
+                              "an HloInstruction outputting\n"
+                              "  a shape that represents a tuple",
+                              "Shape is not a tuple\n"
+                              "in s32[42]{0}\n"
+                              "in output shape\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithOperand(2, m::Op().WithOpcode(HloOpcode::kAdd)),
+      "an HloInstruction with operand 2 which is:\n"
+      "  an HloInstruction with opcode add",
+      "desired operand index 2 is out of bounds\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a", HloInstruction::CreateBinary(ShapeUtil::MakeShape(S32, {}),
+                                                HloOpcode::kAdd, constant.get(),
+                                                constant.get())),
+      m::Op().WithOperand(1, m::Op().IsNonConstant()),
+      "an HloInstruction with operand 1 which is:\n"
+      "  an HloInstruction with any opcode other than constant",
+      "HloInstruction has opcode constant, expected anything else\n"
+      "in c = s32[] constant(0)\n"
+      "in operand 1\n"
+      "in a = s32[] add(s32[] c, s32[] c)");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithFusionKind(HloInstruction::FusionKind::kLoop),
+      "an HloInstruction with fusion kind kLoop",
+      "HloInstruction does not have fusion kind kLoop; it's not a fusion\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      iota, m::Op().WithTupleIndex(42),
+      "an HloInstruction which is a GTE with index 42",
+      "HloInstruction is not a GTE with index 42; it's not a GTE at all\n"
+      "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(iota, m::Op().IsConstantScalar(),
+                              "an HloInstruction which is a constant scalar",
+                              "HloInstruction is not a constant\n"
+                              "in i = s32[42]{0} iota(), iota_dimension=0");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(
+                       LiteralUtil::CreateR1<int>({1, 2}))),
+      m::Op().IsConstantEffectiveScalar(),
+      "an HloInstruction which is a constant effective scalar",
+      "HloInstruction is not an effective scalar\n"
+      "in c = s32[2]{0} constant({1, 2})");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(10))),
+      m::Op().IsConstantScalar(42),
+      "an HloInstruction which is a constant scalar with value 42",
+      "HloInstruction's constant value 10 did not match expected value 42\n"
+      "in c = s32[] constant(10)");
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(2.25))),
+      m::Op().IsConstantEffectiveScalar(1.25),
+      "an HloInstruction which is a constant effective scalar with value 1.25",
+      "HloInstruction's constant value 2.25 did not match expected value 1.25\n"
+      "in c = f64[] constant(2.25)");
+  EXPECT_DESC_AND_EXPLANATION(
+      constant, m::Op().Is(iota.get()),
+      absl::StrCat("an HloInstruction which is 0x", absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)"),
+      absl::StrCat("HloInstruction 0x", absl::Hex(constant.get()), " is not 0x",
+                   absl::Hex(iota.get()),
+                   " (i = s32[42]{0} iota(), iota_dimension=0)\n"
+                   "in c = s32[] constant(0)"));
+}
+
+TEST(PatternMatcherTest, HloInstructionMatcherAnyOrderDescribeTo) {
+  auto scalar_s32 = ShapeUtil::MakeShape(S32, {});
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a", HloInstruction::CreateBinary(
+                       scalar_s32, HloOpcode::kAdd,
+                       SetName("b", HloInstruction::CreateConstant(
+                                        LiteralUtil::CreateR0(0)))
+                           .get(),
+                       SetName("c", HloInstruction::CreateConstant(
+                                        LiteralUtil::CreateR0(0)))
+                           .get())),
+      m::AddAnyOrder(m::Op().WithName("b"), m::Op().WithName("bar")),
+      "an HloInstruction:\n"
+      " * with opcode add AND\n"
+      " * with two operands in either order:\n"
+      "    - an HloInstruction named \"b\"\n"
+      "    - an HloInstruction named \"bar\"",
+      "HloInstruction's operands (ignoring order) did not match second "
+      "matcher.  Specifically,\n"
+      " - an HloInstruction named \"bar\"\n"
+      "does not match LHS:\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in b = s32[] constant(0)\n"
+      "does not match RHS:\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in c = s32[] constant(0)\n"
+      "in a = s32[] add(s32[] b, s32[] c)");
+
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("a",
+              HloInstruction::CreateBinary(
+                  scalar_s32, HloOpcode::kAdd,
+                  HloInstruction::CreateParameter(0, scalar_s32, "p").get(),
+                  SetName("c", HloInstruction::CreateConstant(
+                                   LiteralUtil::CreateR0(0)))
+                      .get())),
+      m::AddAnyOrder(m::Op().IsConstantScalar(), m::Op().IsConstant()),
+      "an HloInstruction:\n"
+      " * with opcode add AND\n"
+      " * with two operands in either order:\n"
+      "    - an HloInstruction which is a constant scalar\n"
+      "    - an HloInstruction with opcode constant",
+      "HloInstruction's LHS operand did not match either of the two matchers.  "
+      "Specifically,\n"
+      " - an HloInstruction which is a constant scalar\n"
+      "does not match LHS:\n"
+      " - HloInstruction is not a constant\n"
+      "   in p = s32[] parameter(0)\n"
+      "and\n"
+      " - an HloInstruction with opcode constant\n"
+      "does not match LHS:\n"
+      " - HloInstruction doesn't have opcode constant\n"
+      "   in p = s32[] parameter(0)\n"
+      "in a = s32[] add(s32[] p, s32[] c)");
+}
+
+TEST(PatternMatcherTest, AnyOfMatcherDescribeToAndExplain) {
+  EXPECT_DESC_AND_EXPLANATION(
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0))),
+      m::AnyOf<HloInstruction>(m::Op().WithName("foo"),
+                               m::Op().WithName("bar")),
+      "any of:\n"
+      " - an HloInstruction named \"foo\" OR\n"
+      " - an HloInstruction named \"bar\"",
+      "None of the following matchers succeeded:\n"
+      "Matcher #1\n"
+      " - an HloInstruction named \"foo\"\n"
+      "failed with\n"
+      " - HloInstruction not named \"foo\"\n"
+      "   in c = s32[] constant(0)\n"
+      "Matcher #2\n"
+      " - an HloInstruction named \"bar\"\n"
+      "failed with\n"
+      " - HloInstruction not named \"bar\"\n"
+      "   in c = s32[] constant(0)");
+}
+
+TEST(PatternMatcherTest, Parameter) {
+  auto param =
+      HloInstruction::CreateParameter(1, ShapeUtil::MakeShape(F32, {}), "p1");
+  auto non_param =
+      SetName("c", HloInstruction::CreateConstant(LiteralUtil::CreateR0(0)));
+  EXPECT_FALSE(Match(param.get(), m::Parameter(0)));
+  EXPECT_TRUE(Match(param.get(), m::Parameter()));
+  EXPECT_TRUE(Match(param.get(), m::Parameter(1)));
+  EXPECT_FALSE(Match(non_param.get(), m::Parameter()));
+  EXPECT_FALSE(Match(non_param.get(), m::Parameter(1)));
+
+  EXPECT_DESC_AND_EXPLANATION(non_param, m::Parameter(1),
+                              "an HloInstruction:\n"
+                              " * with opcode parameter AND\n"
+                              " * which is parameter 1",
+                              "HloInstruction doesn't have opcode parameter\n"
+                              "in c = s32[] constant(0)");
+  EXPECT_EQ(Explanation(HloInstruction::CreateParameter(
+                            0, ShapeUtil::MakeShape(F32, {}), "p0"),
+                        m::Parameter(1)),
+            "HloInstruction is not parameter 1\n"
+            "in p0 = f32[] parameter(0)");
+}
+
+TEST(PatternMatcherTest, OneUseAndOneUser) {
+  auto param =
+      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUse(),
+      "an HloInstruction which has exactly one use",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_DESC_AND_EXPLANATION(
+      param, m::Op().WithOneUser(),
+      "an HloInstruction which has exactly one user (but possibly is used "
+      "multiple times by that instruction)",
+      "HloInstruction has 0 users, but expected exactly one.\n"
+      "in p0 = f32[] parameter(0)");
+
+  {
+    auto reshape =
+        SetName("r", HloInstruction::CreateReshape(
+                         ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+
+    auto reshape1 =
+        SetName("r1", HloInstruction::CreateReshape(
+                          ShapeUtil::MakeShape(F32, {1}), param.get()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+    EXPECT_FALSE(Match(param.get(), m::Op().WithOneUser()));
+
+    const char* kMultipleUserExplanation =
+        "HloInstruction has 2 users, but expected exactly one.\n"
+        "All users:\n"
+        " - r = f32[1]{0} reshape(f32[] p0)\n"
+        " - r1 = f32[1]{0} reshape(f32[] p0)\n"
+        "in p0 = f32[] parameter(0)";
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+              kMultipleUserExplanation);
+    EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUser()),
+              kMultipleUserExplanation);
+  }
+
+  auto add = SetName("add", HloInstruction::CreateBinary(
+                                ShapeUtil::MakeShape(F32, {}), HloOpcode::kAdd,
+                                param.get(), param.get()));
+  EXPECT_TRUE(Match(param.get(), m::Op().WithOneUser()));
+  EXPECT_FALSE(Match(param.get(), m::Op().WithOneUse()));
+  EXPECT_EQ(Explanation(param.get(), m::Op().WithOneUse()),
+            "HloInstruction is used 2 times by its user, but is expected to be "
+            "used just once: add = f32[] add(f32[] p0, f32[] p0)\n"
+            "in p0 = f32[] parameter(0)");
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
index 16fa80d53e7dc3456b0dade8b92cf101b3e0a33d..efeec96571455d8a9e4b7837dd7286392c12f1a3 100644
--- a/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
+++ b/tensorflow/compiler/xla/service/reduce_precision_insertion_test.cc
@@ -54,7 +54,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -81,7 +81,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeUnaryScalarInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -111,7 +111,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeBinaryInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a, b));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -140,7 +140,7 @@ TEST_F(ReducePrecisionInsertionTest, BeforeZeroInputInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -173,7 +173,7 @@ TEST_F(ReducePrecisionInsertionTest, AvoidAddingDuplicateInstructions) {
   HloInstruction* d = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, b, c));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -205,7 +205,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterRootInstruction) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, a));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -242,7 +242,7 @@ TEST_F(ReducePrecisionInsertionTest, AfterNonRootInstruction) {
   HloInstruction* c = builder.AddInstruction(
       HloInstruction::CreateBinary(shape, HloOpcode::kAdd, a_cos, b_cos));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -295,7 +295,7 @@ TEST_F(ReducePrecisionInsertionTest, ShouldReduceOutputPrecisionIsFalse) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -321,7 +321,7 @@ TEST_F(ReducePrecisionInsertionTest, InsertionIsNotRecursive) {
   HloInstruction* b = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, a, 8, 23));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected state before adding ops.
@@ -348,7 +348,7 @@ TEST_F(ReducePrecisionInsertionTest, SkipRedundantReducePrecisionAfter) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 5, 10));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -376,7 +376,7 @@ TEST_F(ReducePrecisionInsertionTest, AddNonRedundantReducePrecision) {
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateReducePrecision(shape, x, 8, 23));
 
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Confirm expected graph before adding ops.
@@ -402,7 +402,7 @@ TEST_F(ReducePrecisionInsertionTest, IgnoreOpsInsideFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -438,7 +438,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInHeadOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
@@ -485,7 +485,7 @@ TEST_F(ReducePrecisionInsertionTest, OpGetsInsertedInTailOfFusionNode) {
       builder.AddInstruction(HloInstruction::CreateParameter(0, shape, "x"));
   HloInstruction* y = builder.AddInstruction(
       HloInstruction::CreateUnary(shape, HloOpcode::kCos, x));
-  auto module = CreateNewUnverifiedModule();
+  auto module = CreateNewVerifiedModule();
   auto computation = module->AddEntryComputation(builder.Build());
 
   // Manually fuse the kCos operation into a fusion operation.
diff --git a/tensorflow/compiler/xla/service/service.cc b/tensorflow/compiler/xla/service/service.cc
index 75f7413b3c303da620c2815c83e03324148c0961..5ec7fe2adedac2fc3d8a7588e853dba90e99006f 100644
--- a/tensorflow/compiler/xla/service/service.cc
+++ b/tensorflow/compiler/xla/service/service.cc
@@ -41,6 +41,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/source_map_util.h"
 #include "tensorflow/compiler/xla/service/stream_pool.h"
 #include "tensorflow/compiler/xla/service/transfer_manager.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/shape_layout.h"
 #include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/status_macros.h"
@@ -275,8 +276,8 @@ StatusOr<std::unique_ptr<HloModuleConfig>> Service::CreateModuleConfig(
   }
   if (execution_options != nullptr &&
       execution_options->has_shape_with_output_layout()) {
-    const auto& shape_with_output_layout =
-        execution_options->shape_with_output_layout();
+    const Shape shape_with_output_layout(
+        execution_options->shape_with_output_layout());
     TF_RETURN_IF_ERROR(
         ValidateResultShape(shape_with_output_layout, program_shape.result()));
     TF_RETURN_IF_ERROR(
@@ -658,9 +659,9 @@ Status Service::ExecuteGraphParallel(const ExecuteGraphParallelRequest* arg,
     // replica 0.
     TF_ASSIGN_OR_RETURN(
         std::unique_ptr<HloModuleConfig> module_config,
-        CreateModuleConfig(request.computation().host_program_shape(),
-                           replicated_arguments.front(),
-                           request.execution_options()));
+        CreateModuleConfig(
+            ProgramShape{request.computation().host_program_shape()},
+            replicated_arguments.front(), request.execution_options()));
     VLOG(3)
         << "ExecuteGraphParallel created HloModuleConfig computation layout: "
         << module_config->entry_computation_layout().ToString();
@@ -745,9 +746,9 @@ Status Service::GetDeviceHandles(const GetDeviceHandlesRequest* arg,
   }
   if (available_device_count < arg->device_count() * replica_count) {
     return ResourceExhausted(
-        "Requested device count (%d) exceeds the number of available devices "
-        "on the target (%d)",
-        arg->device_count(), available_device_count);
+        "Requested logical device count (%d) with replica count (%d) exceeds "
+        "the number of available physical devices on the target (%d)",
+        arg->device_count(), replica_count, available_device_count);
   }
 
   for (int64 i = 0; i < arg->device_count(); ++i) {
@@ -818,14 +819,17 @@ Status Service::Compile(const CompileRequest* arg, CompileResponse* result) {
         "The compile request does not support multiple device handles.");
   }
 
-  std::vector<const Shape*> argument_shapes;
-  absl::c_transform(arg->input_shape_with_layout(),
-                    std::back_inserter(argument_shapes),
-                    [](const Shape& shape) { return &shape; });
+  std::vector<Shape> argument_shapes;
+  argument_shapes.reserve(arg->input_shape_with_layout_size());
+  std::vector<const Shape*> argument_shape_ptrs;
+  for (const ShapeProto& shape_proto : arg->input_shape_with_layout()) {
+    argument_shapes.push_back(Shape(shape_proto));
+    argument_shape_ptrs.push_back(&argument_shapes.back());
+  }
   TF_ASSIGN_OR_RETURN(
       std::unique_ptr<HloModuleConfig> module_config,
-      CreateModuleConfig(arg->computation().host_program_shape(),
-                         argument_shapes, &arg->execution_options()));
+      CreateModuleConfig(ProgramShape{arg->computation().host_program_shape()},
+                         argument_shape_ptrs, &arg->execution_options()));
   VLOG(3) << "Compile created HloModuleConfig computation layout: "
           << module_config->entry_computation_layout().ToString();
 
@@ -930,14 +934,14 @@ Status Service::TransferToClient(const TransferToClientRequest* arg,
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* shaped_buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
 
-  const Shape* return_shape;
+  Shape return_shape;
   if (arg->has_shape_with_layout()) {
-    if (!LayoutUtil::HasLayout(arg->shape_with_layout())) {
+    return_shape = Shape(arg->shape_with_layout());
+    if (!LayoutUtil::HasLayout(return_shape)) {
       return InvalidArgument("shape_with_layout must have layout if present.");
     }
-    return_shape = &arg->shape_with_layout();
   } else {
-    return_shape = &shaped_buffer->on_host_shape();
+    return_shape = Shape(shaped_buffer->on_host_shape());
   }
 
   TF_ASSIGN_OR_RETURN(auto stream, execute_backend_->BorrowStream(
@@ -948,30 +952,15 @@ Status Service::TransferToClient(const TransferToClientRequest* arg,
       execute_backend_->transfer_manager()->TransferLiteralFromDevice(
           stream.get(), *shaped_buffer));
 
-  if (LayoutUtil::LayoutsInShapesEqual(*return_shape, result_literal.shape())) {
+  if (LayoutUtil::LayoutsInShapesEqual(return_shape, result_literal.shape())) {
     *result->mutable_literal() = result_literal.ToProto();
   } else {
     *result->mutable_literal() =
-        result_literal.Relayout(*return_shape).ToProto();
+        result_literal.Relayout(return_shape).ToProto();
   }
   return Status::OK();
 }
 
-namespace {
-
-// Creates a clone of the given shaped buffer with the given device ordinal. The
-// shape and DeviceMemoryBase values of the clone are identical to the original.
-std::unique_ptr<ShapedBuffer> CloneShapedBufferOnDevice(
-    const ShapedBuffer& shaped_buffer, int device_ordinal) {
-  auto clone = absl::make_unique<ShapedBuffer>(
-      shaped_buffer.on_host_shape(), shaped_buffer.on_device_shape(),
-      shaped_buffer.platform(), device_ordinal);
-  clone->buffers() = shaped_buffer.buffers();
-  return clone;
-}
-
-}  // namespace
-
 Status Service::TransferToServer(const TransferToServerRequest* arg,
                                  TransferToServerResponse* result) {
   TF_ASSIGN_OR_RETURN(Literal literal,
@@ -1060,11 +1049,11 @@ Status Service::TransferFromOutfeed(const TransferFromOutfeedRequest* arg,
     executor = replicas[arg->replica_id()];
   }
 
-  auto literal = Literal::CreateFromShape(arg->shape_with_layout());
+  auto literal = Literal::CreateFromShape(Shape(arg->shape_with_layout()));
 
   TF_RETURN_IF_ERROR(
       execute_backend_->transfer_manager()->TransferLiteralFromOutfeed(
-          executor, arg->shape_with_layout(), literal));
+          executor, Shape(arg->shape_with_layout()), literal));
   *result->mutable_literal() = literal.ToProto();
   return Status::OK();
 }
@@ -1087,7 +1076,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
         "constant computation may not depend on any parameters.");
   }
 
-  ProgramShape program_shape = arg->computation().host_program_shape();
+  ProgramShape program_shape(arg->computation().host_program_shape());
   TF_DCHECK_OK(ShapeUtil::ValidateShape(program_shape.result()));
   if (arg->has_output_layout()) {
     TF_RETURN_IF_ERROR(LayoutUtil::ValidateLayoutForShape(
@@ -1118,7 +1107,7 @@ Status Service::ComputeConstantGraph(const ComputeConstantGraphRequest* arg,
 Status Service::GetShape(const GetShapeRequest* arg, GetShapeResponse* result) {
   TF_ASSIGN_OR_RETURN(const ShapedBuffer* buffer,
                       allocation_tracker_.ResolveForReplica(arg->data(), 0));
-  *result->mutable_shape() = buffer->on_host_shape();
+  *result->mutable_shape() = buffer->on_host_shape().ToProto();
   return Status::OK();
 }
 
@@ -1131,7 +1120,7 @@ Status Service::GetComputationGraphStats(
     return InvalidArgument("Program shape may not be empty.");
   }
 
-  HloModuleConfig config(arg->computation().host_program_shape());
+  HloModuleConfig config(ProgramShape{arg->computation().host_program_shape()});
   config.set_debug_options(arg->debug_options());
   TF_ASSIGN_OR_RETURN(std::unique_ptr<HloModule> module,
                       CreateModuleFromProto(arg->computation(), config));
diff --git a/tensorflow/compiler/xla/service/shape_inference.cc b/tensorflow/compiler/xla/service/shape_inference.cc
index 61a60ef9efa72f53fa2c6730ca297ddfe01c56ba..7e7282a737041458aed39b0054f901c23aa87d7a 100644
--- a/tensorflow/compiler/xla/service/shape_inference.cc
+++ b/tensorflow/compiler/xla/service/shape_inference.cc
@@ -391,17 +391,6 @@ StatusOr<Shape> InferWindowOutputShape(const Shape& base_shape,
   return ShapeUtil::MakeShape(element_type, new_dimensions);
 }
 
-/* static */ StatusOr<Shape> ShapeInference::InferAfterAllShape(
-    absl::Span<const Shape* const> arg_shapes) {
-  for (const Shape* arg_shape : arg_shapes) {
-    if (arg_shape->element_type() != TOKEN) {
-      return InvalidArgument(
-          "Operands of token instructions must be TOKEN types.");
-    }
-  }
-  return ShapeUtil::MakeTokenShape();
-}
-
 /* static */ StatusOr<Shape> ShapeInference::InferConvertShape(
     const Shape& operand_shape, PrimitiveType new_element_type) {
   auto old_element_type = operand_shape.element_type();
@@ -1029,7 +1018,7 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
   switch (opcode) {
     case HloOpcode::kTuple: {
       Shape result = ShapeUtil::MakeTupleShape({});
-      result.mutable_tuple_shapes()->Reserve(operand_shapes.size());
+      result.mutable_tuple_shapes()->reserve(operand_shapes.size());
       for (const Shape* shape : operand_shapes) {
         ShapeUtil::AppendShapeToTuple(*shape, &result);
       }
@@ -2038,7 +2027,16 @@ ShapeInference::InferDegenerateDimensionBroadcastShape(HloOpcode operation,
                            dimension);
   }
 
-  return ShapeUtil::MakeShape(S64, {});
+  // TODO(b/119580730): Remove this restriction when very large dimension size
+  // is needed.
+  if (shape.dimensions(dimension) > std::numeric_limits<uint32>::max()) {
+    return InvalidArgument(
+        "GetDimensionSize's input shape is %s, the %dth dimension exceeds the "
+        "UINT_MAX limit.",
+        ShapeUtil::HumanString(shape), dimension);
+  }
+
+  return ShapeUtil::MakeShape(U32, {});
 }
 
 /* static */ StatusOr<Shape> ShapeInference::InferSliceShape(
diff --git a/tensorflow/compiler/xla/service/shape_inference.h b/tensorflow/compiler/xla/service/shape_inference.h
index 31ef4b2e41078f87731a1eff58e37409a6004ba4..d94385a04d50baff8156570a09620fd458547936 100644
--- a/tensorflow/compiler/xla/service/shape_inference.h
+++ b/tensorflow/compiler/xla/service/shape_inference.h
@@ -232,13 +232,6 @@ class ShapeInference {
   static StatusOr<Shape> InferConcatOpShape(
       absl::Span<const Shape* const> arg_shapes, int64 dimension);
 
-  // Infers the shape produced by a kAfterAll. Trivially this shape is always a
-  // TOKEN shape. However, ShapeInference serves two purposes: inferring shapes
-  // and checking operand shapes. This method verifies that the operand shapes
-  // are all TOKENs.
-  static StatusOr<Shape> InferAfterAllShape(
-      absl::Span<const Shape* const> arg_shapes);
-
   // Helper that validates the given operand shape can be converted to the
   // target output_shape via a convert instruction -- the requirement is that
   // the shape is identical except for the element type.
diff --git a/tensorflow/compiler/xla/service/transpose_folding_test.cc b/tensorflow/compiler/xla/service/transpose_folding_test.cc
index 7a565bf076847a4a5f7c98635785c80d86df152d..17cdaa74fc328d156292f5af828d4222a9a01f1f 100644
--- a/tensorflow/compiler/xla/service/transpose_folding_test.cc
+++ b/tensorflow/compiler/xla/service/transpose_folding_test.cc
@@ -172,7 +172,7 @@ TEST_F(TransposeFoldingTest, FuseDotWithConstantOperands) {
   HloInstruction* mul = builder.AddInstruction(HloInstruction::CreateBinary(
       add->shape(), HloOpcode::kMultiply, add, sub));
 
-  auto module = CreateNewUnverifiedModule("fuse_with_constant_operands");
+  auto module = CreateNewVerifiedModule("fuse_with_constant_operands");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(mul));
   HloInstruction* call = module->OutlineExpressionFromComputation(
@@ -247,7 +247,7 @@ TEST_F(TransposeFoldingTest, FoldConvDimSwapTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewUnverifiedModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -302,7 +302,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeRhs) {
       conv_shape.ValueOrDie(), x, transpose_y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewUnverifiedModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -362,7 +362,7 @@ TEST_F(TransposeFoldingTest, FoldConvTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewUnverifiedModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
@@ -428,7 +428,7 @@ TEST_F(TransposeFoldingTest, FoldConvComplexTransposeLhs) {
       conv_shape.ValueOrDie(), transpose_x, y,
       /*feature_group_count=*/1, window, dnums, DefaultPrecisionConfig(2)));
 
-  auto module = CreateNewUnverifiedModule("test_module");
+  auto module = CreateNewVerifiedModule("test_module");
   HloComputation* entry_computation =
       module->AddEntryComputation(builder.Build(conv));
   FoldTranspose(module.get());
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
index 96f3055c98e0611dfe25517cb490014a6d1f7c76..50d51eaeb762e208004c1dae3dcc27503f3f94e9 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.cc
@@ -280,6 +280,13 @@ Status TuplePointsToAnalysis::HandleDomain(HloInstruction* domain) {
   return Status::OK();
 }
 
+Status TuplePointsToAnalysis::HandleAddDependency(
+    HloInstruction* add_dependency) {
+  // AddDependency just forwards the value of its zero-th operand.
+  CreateCopiedPointsToSet(add_dependency, add_dependency->operand(0));
+  return Status::OK();
+}
+
 Status TuplePointsToAnalysis::HandleRecvDone(HloInstruction* recv_done) {
   // RecvDone aliases its input (Recv) tuple element {0} to element {0} of its
   // output. The other indices ({} and {1}) define their own buffers.
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
index bcfcb388f95b0bedb35a8c399e804034816867b3..0a1d5649d6d69fea12263e6986ce76af62615ec7 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis.h
@@ -252,6 +252,7 @@ class TuplePointsToAnalysis : public DfsHloVisitorWithDefault {
   Status HandleRecvDone(HloInstruction* recv_done) override;
   Status HandleSend(HloInstruction* send) override;
   Status HandleTupleSelect(HloInstruction* tuple_select) override;
+  Status HandleAddDependency(HloInstruction* add_dependency) override;
 
   string ToString() const;
 
diff --git a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
index 10ef2d38fa21c3e93c270535bc99b2f76435337d..561762b5d424ed5f537665be9d67a81dc8bdd56e 100644
--- a/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
+++ b/tensorflow/compiler/xla/service/tuple_points_to_analysis_test.cc
@@ -264,6 +264,22 @@ TEST_F(TuplePointsToAnalysisTest, GetTupleElement) {
               UnorderedElementsAre(inner_tuple));
 }
 
+TEST_F(TuplePointsToAnalysisTest, AddDependency) {
+  auto builder = HloComputation::Builder(TestName());
+  auto constant = builder.AddInstruction(
+      HloInstruction::CreateConstant(LiteralUtil::CreateR0<float>(1.0)));
+  auto token = builder.AddInstruction(HloInstruction::CreateToken());
+  auto add_dependency = builder.AddInstruction(
+      HloInstruction::CreateAddDependency(constant, token));
+  BuildModuleAndRunAnalysis(builder.Build());
+
+  auto& points_to_set = points_to_analysis_->GetPointsToSet(add_dependency);
+  EXPECT_EQ(1, points_to_set.size());
+  EXPECT_FALSE(points_to_set.IsAmbiguous());
+  EXPECT_TRUE(points_to_set.IsDistinct());
+  ExpectHasTopLevelBuffers(points_to_set.CreateFlattenedSet(), {constant});
+}
+
 TEST_F(TuplePointsToAnalysisTest, DuplicatedElement) {
   // Create a tuple which contains duplicate elements.
   auto builder = HloComputation::Builder(TestName());
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
index b7c28bfac7889b788645360366d1419eb80e64de..41011176ffa91e885bc58364d1fb19617d3518ad 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.cc
@@ -21,6 +21,7 @@ limitations under the License.
 #include "tensorflow/compiler/xla/service/tuple_util.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 #include "tensorflow/compiler/xla/service/while_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
 #include "tensorflow/compiler/xla/util.h"
 
 namespace xla {
@@ -207,6 +208,37 @@ WhileLoopInvariantCodeMotion::TryHoistingInvariantInstructionsFromWhileBody(
       continue;
     }
 
+    if (!hoist_size_inflating_ops_) {
+      // Check that hoisting the instruction doesn't cause a significant memory
+      // blow-up. LICM extends the live-range of the output of the hoisted
+      // instruction to be the entire while loop, which may be problematic on
+      // platforms where memory is limited. This can be especially harmful if
+      // the instruction has a significantly larger output than its input, e.g.
+      // kIota, kBroadcast or kConstant.
+      int64 input_size = 0, output_size = 0;
+
+      for (auto* operand : instruction->operands()) {
+        ShapeUtil::ForEachSubshape(
+            operand->shape(),
+            [&input_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+              if (ShapeUtil::IsArray(subshape)) {
+                input_size += ShapeUtil::ByteSizeOfElements(subshape);
+              }
+            });
+      }
+      ShapeUtil::ForEachSubshape(
+          instruction->shape(),
+          [&output_size](const Shape& subshape, const ShapeIndex& /*index*/) {
+            if (ShapeUtil::IsArray(subshape)) {
+              output_size += ShapeUtil::ByteSizeOfElements(subshape);
+            }
+          });
+
+      if (output_size > input_size) {
+        continue;
+      }
+    }
+
     auto is_invariant = [&](HloInstruction* op) {
       return hoisted_instructions.find(op) != hoisted_instructions.end() ||
              unhoisted_invariant_instructions.count(op) ||
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
index 3031899f71e0fd77f20448d9d7489798af01615c..bd6232dc0a988775a0490abbf6125daad8476295 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion.h
@@ -34,8 +34,14 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
   // Setting `hoist_constants` to false can be help if LICM is run in the mid
   // level HLO pipeline because hoisting constants out of while loop bodies can
   // break optimizations like constant folding.
-  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false)
-      : hoist_constants_(hoist_constants) {}
+  // Setting `hoist_size_inflating_ops` to false will forbid hoisting
+  // instructions where the size of the output(s) is larger than the size of the
+  // input(s). This is useful on platforms on which it's important to prevent
+  // blow-ups in memory size.
+  explicit WhileLoopInvariantCodeMotion(bool hoist_constants = false,
+                                        bool hoist_size_inflating_ops = true)
+      : hoist_constants_(hoist_constants),
+        hoist_size_inflating_ops_(hoist_size_inflating_ops) {}
   ~WhileLoopInvariantCodeMotion() override = default;
 
   absl::string_view name() const override {
@@ -49,6 +55,7 @@ class WhileLoopInvariantCodeMotion : public HloModulePass {
       HloInstruction* while_instr);
 
   bool hoist_constants_;
+  bool hoist_size_inflating_ops_;
 };
 }  // namespace xla
 
diff --git a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
index 046ccb2d3f29c2141ade5275d043875e3e278582..8e7c4bc8828552e197b41f874c070d496b85a382 100644
--- a/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_invariant_code_motion_test.cc
@@ -570,5 +570,59 @@ TEST_F(WhileLoopInvariantCodeMotionTest, DoNotHoistOutOfSingleIteration) {
   EXPECT_FALSE(simplified_loop);
 }
 
+const char* const kInflatingTestCase = R"(
+HloModule ModuleWithWhile
+
+mul {
+  lhs = f32[] parameter(0)
+  rhs = f32[] parameter(1)
+  ROOT mul = f32[] multiply(lhs, rhs)
+}
+
+body {
+  p_body = (f32[]) parameter(0)
+  iota = f32[1024, 1024] iota(), iota_dimension=0
+  add = f32[1024, 1024] add(iota, iota)
+  constant = f32[] constant(1.0)
+  reduce = f32[] reduce(f32[1024, 1024] add, f32[] constant), dimensions={0,1}, to_apply=mul
+  ROOT root = (f32[]) tuple(reduce)
+}
+
+condition {
+  p_cond = (f32[]) parameter(0)
+  ROOT result = pred[] constant(true)
+}
+
+ENTRY entry {
+  param = f32[] parameter(0)
+  while_init = (f32[]) tuple(param)
+  ROOT while = (f32[]) while(while_init), condition=condition, body=body
+}
+)";
+
+TEST_F(WhileLoopInvariantCodeMotionTest, HoistsInflatingByDefault) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true).Run(m.get()));
+  EXPECT_TRUE(simplified_loop);
+
+  HloComputation* while_body = m->GetComputationWithName("wide.body");
+  ASSERT_NE(while_body, nullptr);
+  EXPECT_THAT(while_body->instructions(), Not(Contains(op::Iota())));
+}
+
+TEST_F(WhileLoopInvariantCodeMotionTest, NoHoistInflating) {
+  auto m = ParseAndReturnVerifiedModule(kInflatingTestCase).ValueOrDie();
+
+  TF_ASSERT_OK_AND_ASSIGN(
+      bool simplified_loop,
+      WhileLoopInvariantCodeMotion(/*hoist_constants=*/true,
+                                   /*hoist_size_inflating_ops=*/false)
+          .Run(m.get()));
+  EXPECT_FALSE(simplified_loop);
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier.cc b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
index 6f924a29d8a3ac60abe98efd2e82ae7343c7de47..d30f67dd8110b88166fe807762fb653190ec00bc 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier.cc
@@ -19,13 +19,17 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_join.h"
 #include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/primitive_util.h"
 #include "tensorflow/compiler/xla/service/call_inliner.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
+#include "tensorflow/compiler/xla/service/hlo_instructions.h"
 #include "tensorflow/compiler/xla/service/hlo_query.h"
+#include "tensorflow/compiler/xla/service/pattern_matcher.h"
 #include "tensorflow/compiler/xla/service/while_loop_analysis.h"
 
 namespace xla {
 
+namespace m = match;
 using absl::optional;
 using hlo_query::ContainsInstrWithOpcode;
 
@@ -302,6 +306,147 @@ static StatusOr<bool> TryRemoveDeadWhileParams(HloInstruction* while_op) {
   return true;
 }
 
+// Removes each loop parameter (i.e. member of the while loop tuple) that is a
+// constant and is the same in the while loop body and the while loop init.
+static StatusOr<bool> TryRemoveConstantParams(HloInstruction* while_op) {
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return false;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+
+  absl::flat_hash_set<int64> constant_tuple_indices;
+  const auto& while_shape = while_init->shape();
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    auto* init_elem = while_init->operand(i);
+    auto* body_elem = while_body_root->operand(i);
+    if (init_elem->opcode() == HloOpcode::kConstant &&
+        body_elem->opcode() == HloOpcode::kConstant &&
+        init_elem->literal() == body_elem->literal()) {
+      constant_tuple_indices.insert(i);
+    }
+  }
+
+  if (constant_tuple_indices.empty()) {
+    return false;
+  }
+
+  // OK, we found some constant elements of the while parameter!  Eliminate
+  // them.
+  std::vector<Shape> new_while_shape_elems;
+  for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+    if (!constant_tuple_indices.count(i)) {
+      new_while_shape_elems.push_back(while_shape.tuple_shapes(i));
+    }
+  }
+  Shape new_while_shape = ShapeUtil::MakeTupleShape(new_while_shape_elems);
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  // Returns a new tuple without the elements of constant_tuple_indices.
+  auto remove_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (!constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, i)));
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  auto add_constant_elems = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+
+    std::vector<HloInstruction*> tuple_elems;
+    int64 j = 0;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      if (constant_tuple_indices.count(i)) {
+        tuple_elems.push_back(while_init->mutable_operand(i));
+      } else {
+        tuple_elems.push_back(
+            add_new_instr(HloInstruction::CreateGetTupleElement(
+                while_shape.tuple_shapes(i), instr, j)));
+        ++j;
+      }
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Special case: constant_tuple_indices covers the whole while parameter, so
+  // the new while shape is the empty tuple.  In this case, the value of the
+  // while loop is simply equal to the value of `init`.
+  //
+  // It's unfortunate to special-case this, but it's simpler than the
+  // alternative.  The problem is that if our while parameter has no
+  // non-constant elems, the tuple returned by `add_constant_elems` won't depend
+  // on instr (the loop body/cond parameter), and therefore
+  // CloneWithReplacementPairs will *leave the parameter out entirely*, creating
+  // invalid HLO.
+  if (ShapeUtil::IsEmptyTuple(new_while_shape)) {
+    TF_RETURN_IF_ERROR(computation->ReplaceInstruction(while_op, while_init));
+    return true;
+  }
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  std::unique_ptr<HloComputation> new_while_body =
+      while_body->CloneWithReplacementPairs(
+          {
+              while_body->parameter_instruction(0),
+              add_constant_elems(add_new_instr(HloInstruction::CreateParameter(
+                  0, new_while_shape,
+                  while_cond->parameter_instruction(0)->name()))),
+          },
+          {
+              while_body->root_instruction(),
+              remove_constant_elems(
+                  add_new_instr(while_body->root_instruction()->Clone())),
+          });
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op,
+      add_constant_elems(
+          computation->AddInstruction(HloInstruction::CreateWhile(
+              new_while_shape,
+              module->AddEmbeddedComputation(std::move(new_while_cond)),
+              module->AddEmbeddedComputation(std::move(new_while_body)),
+              add_new_instr(remove_constant_elems(while_init)))))));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return true;
+}
+
 // Tries to remove a while loop from the graph.
 //
 //  - Loops with trip count of 0 can be replaced by the loop's "init" value.
@@ -381,16 +526,14 @@ static StatusOr<bool> TryPropagateConstant(HloInstruction* while_op) {
   // performance by forcing us to copy constants.
   absl::flat_hash_map<int, const HloInstruction*> index_to_constant;
   for (int i = 0; i < root_operands.size(); i++) {
-    HloInstruction* instr = root_operands[i];
-    if (instr->opcode() == HloOpcode::kGetTupleElement &&
-        instr->tuple_index() == i && instr->operand(0) == while_body_param &&
-        ShapeUtil::IsScalar(instr->shape())) {
-      auto tuple_element = while_init->operand(i);
-      if (tuple_element->IsConstant()) {
-        VLOG(3) << "Found loop invariant tuple element " << i << " "
-                << tuple_element->ToString();
-        index_to_constant[i] = tuple_element;
-      }
+    const HloInstruction* init_tuple_elem = nullptr;
+    if (Match(root_operands[i],
+              m::GetTupleElement(m::Op().Is(while_body_param), i)
+                  .WithShape(m::Shape().IsScalar())) &&
+        Match(while_init->operand(i), m::Constant(&init_tuple_elem))) {
+      VLOG(3) << "Found loop invariant tuple element " << i << " "
+              << init_tuple_elem->ToString();
+      index_to_constant[i] = init_tuple_elem;
     }
   }
 
@@ -519,14 +662,6 @@ static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
     return false;
   }
 
-  // Cowardly refuse to perform this optimization in the presence of kDomain
-  // instructions, which may reference other instructions in the loop and
-  // therefore make this complicated.
-  if (ContainsInstrWithOpcode(while_body, {HloOpcode::kDomain}) ||
-      ContainsInstrWithOpcode(while_cond, {HloOpcode::kDomain})) {
-    return false;
-  }
-
   std::vector<Shape> flattened_shape_elems;
   ShapeUtil::ForEachSubshape(while_shape,
                              [&](const Shape& s, const ShapeIndex& /*index*/) {
@@ -605,6 +740,243 @@ static StatusOr<bool> TryFlattenNestedTuples(HloInstruction* while_op) {
   return true;
 }
 
+// Tries to merge loop induction variables of a given type.
+//
+// In this pass we're only concerned with elements of the loop's tuple that
+// are effective-scalars of type `elem_ty`.  Some terminology:
+//
+//  - The trip counter is the first element of the loop's tuple that starts at
+//    0 and does x++ on each iteration.
+//
+//  - An induction variable is an element of the loop's tuple that is not the
+//    trip counter and does `x += <constant>` on each iteration of the loop.
+//    Negative constants are OK.
+//
+// This pass adds a trip counter if one isn't already present, then replaces
+// each induction variable with
+//
+//   <initial_value> + <trip_count> * <constant>.
+//
+// This reduces the number of scalar operations in the loop, which is important
+// e.g. on GPUs, where each scalar operation is nontrivially expensive because
+// it's a separate kernel launch.
+//
+// Returns the new loop if a change was made, or null if no change was made.
+// Note that the new loop is not a valid replacement for the old loop; it may
+// need to be wrapped in a tuple that changes its shape.  We return the loop
+// itself so that you can call TryMergeInductionVariables in a loop, once for
+// each integral type elem_ty.
+static StatusOr<HloInstruction*> TryMergeInductionVariables(
+    HloInstruction* while_op, PrimitiveType elem_ty) {
+  CHECK(primitive_util::IsIntegralType(elem_ty)) << PrimitiveType_Name(elem_ty);
+  HloModule* module = while_op->GetModule();
+  HloComputation* computation = while_op->parent();
+  auto* while_init = while_op->mutable_operand(0);
+  auto* while_body = while_op->while_body();
+  auto* while_cond = while_op->while_condition();
+  auto* while_body_root = while_body->root_instruction();
+  if (while_init->opcode() != HloOpcode::kTuple ||
+      while_body_root->opcode() != HloOpcode::kTuple) {
+    return nullptr;
+  }
+
+  TF_RET_CHECK(while_cond->num_parameters() == 1);
+  TF_RET_CHECK(while_body->num_parameters() == 1);
+  TF_RET_CHECK(
+      ShapeUtil::Compatible(while_init->shape(), while_body_root->shape()));
+  Shape while_shape = while_init->shape();
+
+  // The tuple index of the trip counter, if one is present.
+  absl::optional<int64> trip_counter;
+  // Maps the tuple index of each induction variable to its constant increment.
+  absl::flat_hash_map<int64, const HloConstantInstruction*> induction_vars;
+  for (int64 i = 0; i < while_body_root->operand_count(); ++i) {
+    HloInstruction* constant;
+    if (!Match(while_body_root->mutable_operand(i),
+               m::AddAnyOrder(m::GetTupleElement(m::Parameter(), i),
+                              m::ConstantScalar(&constant))
+                   .WithShape(m::Shape().WithElementType(elem_ty)))) {
+      continue;
+    }
+    if (!trip_counter && constant->literal().IsAll(1) &&
+        while_init->operand(i)->IsConstant() &&
+        while_init->operand(i)->literal().IsAll(0)) {
+      VLOG(10) << "Found existing trip counter at index " << i;
+      trip_counter = i;
+    } else {
+      VLOG(10) << "Found induction variable at index " << i;
+      induction_vars.emplace(i, Cast<HloConstantInstruction>(constant));
+    }
+  }
+
+  // There's only something to simplify if we can either:
+  //
+  //  - combine one or more induction vars with an existing trip counter, or
+  //  - replace two or more induction variables with a new trip counter.
+  //
+  // Put another way, there's only something to simplify if the number of
+  // induction vars plus the number of existing trip counters (0 or 1) is >= 2.
+  if (induction_vars.size() + (trip_counter.has_value() ? 1 : 0) < 2) {
+    return nullptr;
+  }
+
+  // OK, we're going to do the transformation!  Set up some helpers.
+
+  // `new_instrs` holds instructions created outside of a computation for
+  // cloning.  Elements added here just need to live until the end of the
+  // relevant CloneWithReplacement call.
+  std::vector<std::unique_ptr<HloInstruction>> new_instrs;
+  auto add_new_instr = [&](std::unique_ptr<HloInstruction> instr) {
+    new_instrs.push_back(std::move(instr));
+    return new_instrs.back().get();
+  };
+
+  auto add_binary_op = [&](const Shape& shape, HloOpcode opcode,
+                           HloInstruction* lhs, HloInstruction* rhs) {
+    // Reshape lhs/rhs to the output shape if necessary.  This deals with the
+    // fact that induction variables need only be effective scalars, not true
+    // scalars.
+    if (!ShapeUtil::Compatible(shape, lhs->shape())) {
+      lhs = add_new_instr(HloInstruction::CreateReshape(shape, lhs));
+    }
+    if (!ShapeUtil::Compatible(shape, rhs->shape())) {
+      rhs = add_new_instr(HloInstruction::CreateReshape(shape, rhs));
+    }
+    return add_new_instr(HloInstruction::CreateBinary(shape, opcode, lhs, rhs));
+  };
+
+  auto add_gte = [&](HloInstruction* src, int64 idx) {
+    return add_new_instr(HloInstruction::CreateGetTupleElement(
+        src->shape().tuple_shapes(idx), src, idx));
+  };
+
+  // Our new while loop will have the same shape as the old while loop, except
+  // we'll add a trip counter to the end if it wasn't originally present.
+  Shape new_while_shape = while_shape;
+  bool added_trip_counter = false;
+  if (!trip_counter) {
+    VLOG(10) << "Adding new trip counter to end of loop's tuple.";
+    trip_counter = new_while_shape.tuple_shapes_size();
+    *new_while_shape.add_tuple_shapes() =
+        ShapeUtil::MakeShape(elem_ty, /*dimensions=*/{});
+    added_trip_counter = true;
+  }
+
+  // Converts `instr` into a tuple of the "old" form -- that is, to a tuple with
+  // shape `while_body->shape()` and where the induction variables are "reified"
+  // (i.e. they have value <init> + <counter> * <constant>).
+  auto convert_to_old_form = [&](HloInstruction* instr) {
+    CHECK(ShapeUtil::Compatible(instr->shape(), new_while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      const auto& elem_shape = while_shape.tuple_shapes(i);
+      if (!induction_vars.count(i)) {
+        tuple_elems.push_back(add_gte(instr, i));
+        continue;
+      }
+      tuple_elems.push_back(add_binary_op(
+          elem_shape, HloOpcode::kAdd, add_gte(instr, i),
+          add_binary_op(elem_shape, HloOpcode::kMultiply,
+                        add_gte(instr, *trip_counter),
+                        add_new_instr(induction_vars.at(i)->Clone()))));
+    }
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Converts `root` into a tuple of the "new" form -- that is, to a tuple with
+  // shape `new_while_shape` and where the induction variables (but not trip
+  // counters) are replaced with their unchanging <loop_body_param> values.
+  auto convert_to_new_form = [&](HloInstruction* old_root,
+                                 HloParameterInstruction* loop_body_param) {
+    CHECK(ShapeUtil::Compatible(old_root->shape(), while_shape));
+    std::vector<HloInstruction*> tuple_elems;
+
+    // In the new form, induction variables come from `init`, everything else
+    // (including the trip counter if it's not one we created ourselves) comes
+    // from the `root` tuple unmodified.
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(
+          add_gte((induction_vars.count(i) ? loop_body_param : old_root), i));
+    }
+    // If we created a trip counter ourselves, add 1 to it in the next
+    // iteration.
+    if (added_trip_counter) {
+      tuple_elems.push_back(add_binary_op(
+          new_while_shape.tuple_shapes(*trip_counter), HloOpcode::kAdd,
+          add_gte(loop_body_param, *trip_counter),
+          add_new_instr(
+              HloInstruction::CreateConstant(LiteralUtil::One(elem_ty)))));
+    }
+
+    return HloInstruction::CreateTuple(tuple_elems);
+  };
+
+  // Creates a new init tuple, which is the same as the old init tuple except if
+  // we added a trip counter, it's set to 0.
+  auto get_new_while_init = [&](HloInstruction* init) {
+    CHECK(ShapeUtil::Compatible(init->shape(), while_shape));
+    if (!added_trip_counter) {
+      return init;
+    }
+    std::vector<HloInstruction*> tuple_elems;
+    for (int64 i = 0; i < while_shape.tuple_shapes_size(); ++i) {
+      tuple_elems.push_back(add_gte(init, i));
+    }
+    tuple_elems.push_back(add_new_instr(
+        HloInstruction::CreateConstant(LiteralUtil::Zero(elem_ty))));
+    return add_new_instr(HloInstruction::CreateTuple(tuple_elems));
+  };
+
+  std::unique_ptr<HloComputation> new_while_cond =
+      while_cond->CloneWithReplacementPairs({
+          while_cond->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_cond->parameter_instruction(0)->name()))),
+      });
+
+  // Creating the new while body proceeds in two steps.  First we convert the
+  // users of the parameter to the old form.  Then as a second
+  // CloneWithReplacement operation we convert the root to the new form.  We
+  // have to do this in two steps because the new root needs to use the new
+  // param0, and during the first clone operation, only the *old-form* param0 is
+  // accessible.
+  //
+  // We have to add temp_new_while_body to the module because cloning a
+  // computation touches the module (to get its NameUniquer).
+  HloComputation* temp_new_while_body =
+      module->AddEmbeddedComputation(while_body->CloneWithReplacementPairs({
+          while_body->parameter_instruction(0),
+          convert_to_old_form(add_new_instr(HloInstruction::CreateParameter(
+              0, new_while_shape,
+              while_body->parameter_instruction(0)->name()))),
+      }));
+  std::unique_ptr<HloComputation> new_while_body =
+      temp_new_while_body->CloneWithReplacementPairs({
+          temp_new_while_body->root_instruction(),
+          convert_to_new_form(
+              add_new_instr(temp_new_while_body->root_instruction()->Clone()),
+              Cast<HloParameterInstruction>(
+                  temp_new_while_body->parameter_instruction(0))),
+      });
+  TF_RETURN_IF_ERROR(module->RemoveEmbeddedComputation(temp_new_while_body));
+
+  // Create the final while loop, and add any new instructions created to
+  // `computation`.
+  new_instrs.clear();
+  auto* new_while = computation->AddInstruction(HloInstruction::CreateWhile(
+      new_while_shape,
+      module->AddEmbeddedComputation(std::move(new_while_cond)),
+      module->AddEmbeddedComputation(std::move(new_while_body)),
+      get_new_while_init(while_init)));
+  TF_RETURN_IF_ERROR(computation->ReplaceWithNewInstruction(
+      while_op, convert_to_old_form(new_while)));
+  for (auto& instr : new_instrs) {
+    computation->AddInstruction(std::move(instr));
+  }
+  return new_while;
+}
+
 StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
   XLA_VLOG_LINES(3,
                  "WhileLoopSimplifier::Run(), before:\n" + module->ToString());
@@ -650,19 +1022,50 @@ StatusOr<bool> WhileLoopSimplifier::Run(HloModule* module) {
       continue;
     }
 
+    // TODO(b/119281462): Cowardly refuse to perform any of the following
+    // optimizations in the presence of kDomain instructions.  It seems that
+    // modifying a while loop's tuple doesn't work when kDomain is present.
+    if (ContainsInstrWithOpcode(while_op->while_body(), {HloOpcode::kDomain}) ||
+        ContainsInstrWithOpcode(while_op->while_condition(),
+                                {HloOpcode::kDomain})) {
+      continue;
+    }
+
+    // Each of the optimizations below modifies the while loop itself if it's
+    // successful, meaning that `while_op` is no longer valid after one of these
+    // transformations returns true.
+
     TF_ASSIGN_OR_RETURN(result, TryFlattenNestedTuples(while_op));
     changed |= result;
     if (result) {
-      // Successfully flattening nested tuples results in us cloning and
-      // replacing the while loop, meaning that `while_op` is no longer valid.
       continue;
     }
 
     TF_ASSIGN_OR_RETURN(result, TryRemoveDeadWhileParams(while_op));
     changed |= result;
     if (result) {
-      // Successfully removing dead while params results in us cloning and
-      // replacing the while loop, meaning that `while_op` is no longer valid.
+      continue;
+    }
+
+    TF_ASSIGN_OR_RETURN(result, TryRemoveConstantParams(while_op));
+    changed |= result;
+    if (result) {
+      continue;
+    }
+
+    bool merged_induction_vars = false;
+    // Notably missing from this list are S16 and U16.  These don't currently
+    // work because S/U16 literals are not implemented.
+    for (auto elem_ty : {S8, U8, S32, U32, S64, U64}) {
+      TF_ASSIGN_OR_RETURN(auto* new_while_op,
+                          TryMergeInductionVariables(while_op, elem_ty));
+      if (new_while_op) {
+        while_op = new_while_op;
+        changed = true;
+        merged_induction_vars = true;
+      }
+    }
+    if (merged_induction_vars) {
       continue;
     }
   }
diff --git a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
index 05005e0b262a50cd40e004deac4c450a2e257308..4950e8269e9cf0723d717bd1734518d104c0c9f2 100644
--- a/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
+++ b/tensorflow/compiler/xla/service/while_loop_simplifier_test.cc
@@ -17,9 +17,12 @@ limitations under the License.
 
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_replace.h"
+#include "tensorflow/compiler/xla/service/algebraic_simplifier.h"
+#include "tensorflow/compiler/xla/service/hlo_cse.h"
 #include "tensorflow/compiler/xla/service/hlo_dce.h"
 #include "tensorflow/compiler/xla/service/hlo_instruction.h"
 #include "tensorflow/compiler/xla/service/hlo_matchers.h"
+#include "tensorflow/compiler/xla/service/tuple_simplifier.h"
 #include "tensorflow/compiler/xla/test.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
@@ -27,8 +30,17 @@ limitations under the License.
 namespace xla {
 namespace {
 
+using ::testing::_;
 namespace op = xla::testing::opcode_matchers;
 
+// Returns the first kWhile instruction within m's entry computation.
+HloInstruction* FindFirstWhile(HloModule* m) {
+  const auto& instrs = m->entry_computation()->instructions();
+  return *absl::c_find_if(instrs, [](const HloInstruction* instr) {
+    return instr->opcode() == HloOpcode::kWhile;
+  });
+}
+
 class WhileLoopSimplifierTest : public HloTestBase {
  protected:
   // Makes an HloModule that contains a loop with `num_iters` iteration.
@@ -540,11 +552,7 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
   // it easy to find.
   EXPECT_TRUE(HloDCE().Run(m.get()).ok());
 
-  const auto& instrs = m->entry_computation()->instructions();
-  HloInstruction* new_while =
-      *absl::c_find_if(instrs, [](const HloInstruction* instr) {
-        return instr->opcode() == HloOpcode::kWhile;
-      });
+  HloInstruction* new_while = FindFirstWhile(m.get());
   Shape flat_tuple =
       ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3], s32[4])")
           .ValueOrDie();
@@ -563,5 +571,177 @@ TEST_F(WhileLoopSimplifierTest, FlattenNestedTuple) {
           .ValueOrDie()));
 }
 
+// Edge-case: All elements of the loop carry are constants which can be removed,
+// leaving us with a nullary loop.  This is a special case, we just replace the
+// loop with its init.
+TEST_F(WhileLoopSimplifierTest, OnlyConstantsInLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1]) parameter(0)
+    a = s32[1] constant({0})
+    ROOT tuple = (s32[1]) tuple(a)
+  }
+  Cond {
+    param = (s32[1]) parameter(0)
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    a = s32[1] constant({0})
+    init = (s32[1]) tuple(a)
+    ROOT while = (s32[1]) while(init), condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(op::Constant()));
+}
+
+TEST_F(WhileLoopSimplifierTest, RemoveConstantFromLoopCarry) {
+  const string hlo_string = R"(
+  HloModule Test
+  Body {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    a = s32[1] get-tuple-element(param), index=0
+    a.1 = s32[1] add(a, a)
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({10,10,10})
+    ROOT tuple = (s32[1], s32[2], s32[3]) tuple(a.1, b, c)
+  }
+  Cond {
+    param = (s32[1], s32[2], s32[3]) parameter(0)
+    /* Use each tuple element.  The verifier will then ensure that if any of
+     * these get modified, they're replaced with values of the correct shape. */
+    a = s32[1] get-tuple-element(param), index=0
+    b = s32[2] get-tuple-element(param), index=1
+    c = s32[3] get-tuple-element(param), index=2
+    ROOT cond = pred[] constant(true)
+  }
+  ENTRY Loop {
+    /* Only `b` should be simplified away.  `a` is not a constant within the
+     * loop, and `c`'s value changes depending on whether we run 0 or 1
+     * iterations of the loop. */
+    a = s32[1] constant({0})
+    b = s32[2] constant({1,1})
+    c = s32[3] constant({2,2,2})
+    init = (s32[1], s32[2], s32[3]) tuple(a,b,c)
+    ROOT while = (s32[1], s32[2], s32[3]) while(init),
+      condition=Cond, body=Body
+  })";
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  // Run the tuple simplifier to make the resulting HLO a bit easier to check.
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[1], s32[3])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      m->entry_computation()->root_instruction()->shape(),
+      ShapeUtil::ParseShapeString("(s32[1], s32[2], s32[3])").ValueOrDie()));
+  EXPECT_THAT(m->entry_computation()->root_instruction(),
+              op::Tuple(_, op::Constant(), _));
+}
+
+const char* const kSimpleMergeInductionVariablesModule = R"(
+  HloModule Test
+  Body {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+
+    a = TYPE[] get-tuple-element(param), index=0
+    one = TYPE[] constant(1)
+    a1 = TYPE[] add(a, one)
+
+    b = TYPE[] get-tuple-element(param), index=1
+    negone = TYPE[] constant(-1)
+    b1 = TYPE[] add(b, negone)
+
+    c = TYPE[] add(a, b)
+
+    ROOT tuple = (TYPE[], TYPE[], TYPE[]) tuple(a1,b1,c)
+  }
+  Cond {
+    param = (TYPE[], TYPE[], TYPE[]) parameter(0)
+    a = TYPE[] get-tuple-element(param), index=0
+    b = TYPE[] get-tuple-element(param), index=1
+    sum = TYPE[] power(a, b)
+    ten = TYPE[] constant(10)
+    ROOT cond = pred[] less-than(sum, ten)
+  }
+  ENTRY Loop {
+    a = TYPE[] constant(10)
+    b = TYPE[] constant(100)
+    c = TYPE[] constant(0)
+    init = (TYPE[], TYPE[], TYPE[]) tuple(a,b,c)
+    while = (TYPE[], TYPE[], TYPE[]) while(init), condition=Cond, body=Body
+
+    a1 = TYPE[] get-tuple-element(while), index=0
+    b1 = TYPE[] get-tuple-element(while), index=1
+    ROOT sum = TYPE[] add(a1, b1)
+  })";
+
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_Simple) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s32"}});
+
+  auto m = ParseAndReturnVerifiedModule(hlo_string).ValueOrDie();
+  EXPECT_TRUE(WhileLoopSimplifier().Run(m.get()).ValueOrDie());
+  // DCE away the old loop so there's just one while loop in the module, making
+  // it easy to find, and run the tuple simplifier to make the resulting HLO
+  // easier to check.
+  EXPECT_TRUE(HloDCE().Run(m.get()).ok());
+  EXPECT_TRUE(TupleSimplifier().Run(m.get()).ok());
+
+  HloInstruction* new_while = FindFirstWhile(m.get());
+  // We should have added a new loop counter for s32[] to the end of the tuple.
+  SCOPED_TRACE(m->ToString());
+  Shape new_while_shape =
+      ShapeUtil::ParseShapeString("(s32[], s32[], s32[], s32[])").ValueOrDie();
+  EXPECT_TRUE(ShapeUtil::Equal(new_while->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->root_instruction()->shape(), new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_body()->parameter_instruction(0)->shape(),
+      new_while_shape));
+  EXPECT_TRUE(ShapeUtil::Equal(
+      new_while->while_condition()->parameter_instruction(0)->shape(),
+      new_while_shape));
+
+  EXPECT_THAT(new_while->while_body()->root_instruction(),
+              op::Tuple(op::GetTupleElement(op::Parameter(), 0),
+                        op::GetTupleElement(op::Parameter(), 1), op::Add(),
+                        op::Add(op::GetTupleElement(op::Parameter(), 3),
+                                op::Constant())));
+  EXPECT_THAT(new_while->while_condition()->root_instruction(),
+              op::Lt(op::Power(op::Add(), op::Add()), op::Constant()));
+}
+
+// We shouldn't merge S16 induction variables; we can't create constants of this
+// type because S16 literals are not implemented.
+TEST_F(WhileLoopSimplifierTest, MergeInductionVariables_SkipS16) {
+  string hlo_string = absl::StrReplaceAll(kSimpleMergeInductionVariablesModule,
+                                          {{"TYPE", "s16"}});
+  EXPECT_FALSE(
+      WhileLoopSimplifier()
+          .Run(ParseAndReturnVerifiedModule(hlo_string).ValueOrDie().get())
+          .ValueOrDie());
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.cc b/tensorflow/compiler/xla/shape.cc
new file mode 100644
index 0000000000000000000000000000000000000000..746ab9e9977b1b10cdb0cb57197027d65bd50f55
--- /dev/null
+++ b/tensorflow/compiler/xla/shape.cc
@@ -0,0 +1,107 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/shape.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+
+namespace xla {
+
+Shape::Shape(const ShapeProto& shape_proto) {
+  set_element_type(shape_proto.element_type());
+  dimensions_.reserve(shape_proto.dimensions_size());
+  for (const int64 dimension : shape_proto.dimensions()) {
+    add_dimensions(dimension);
+  }
+  tuple_shapes_.reserve(shape_proto.tuple_shapes_size());
+  for (const ShapeProto& element_shape : shape_proto.tuple_shapes()) {
+    *add_tuple_shapes() = Shape(element_shape);
+  }
+  if (shape_proto.has_layout()) {
+    *mutable_layout() = shape_proto.layout();
+  }
+}
+
+ShapeProto Shape::ToProto() const {
+  ShapeProto proto;
+  proto.set_element_type(element_type_);
+  proto.mutable_dimensions()->Reserve(dimensions_size());
+  for (const int64 dimension : dimensions()) {
+    proto.add_dimensions(dimension);
+  }
+  proto.mutable_tuple_shapes()->Reserve(tuple_shapes_size());
+  for (const Shape& shape : tuple_shapes()) {
+    *proto.add_tuple_shapes() = shape.ToProto();
+  }
+  if (has_layout()) {
+    *proto.mutable_layout() = layout();
+  }
+  return proto;
+}
+
+string Shape::ToString(bool print_layout) const {
+  if (print_layout) {
+    return ShapeUtil::HumanStringWithLayout(*this);
+  } else {
+    return ShapeUtil::HumanString(*this);
+  }
+}
+
+std::ostream& operator<<(std::ostream& out, const Shape& shape) {
+  out << shape.ToString(/*print_layout=*/true);
+  return out;
+}
+
+ProgramShape::ProgramShape(const ProgramShapeProto& program_shape_proto) {
+  for (const ShapeProto& shape_proto : program_shape_proto.parameters()) {
+    *add_parameters() = Shape(shape_proto);
+  }
+  *mutable_result() = Shape(program_shape_proto.result());
+  for (const string& name : program_shape_proto.parameter_names()) {
+    add_parameter_names(name);
+  }
+}
+
+ProgramShapeProto ProgramShape::ToProto() const {
+  ProgramShapeProto proto;
+  for (const Shape& shape : parameters()) {
+    *proto.add_parameters() = shape.ToProto();
+  }
+  *proto.mutable_result() = result().ToProto();
+  for (const string& name : parameter_names()) {
+    proto.add_parameter_names(name);
+  }
+  return proto;
+}
+
+string ProgramShape::ToString() const {
+  std::vector<string> parameter_strings(parameters_size());
+  for (int i = 0; i < parameters_size(); ++i) {
+    parameter_strings[i] = absl::StrCat(
+        i < parameter_names_size() ? parameter_names(i) : "(unknown)", ": ",
+        ShapeUtil::HumanString(parameters(i)));
+  }
+  return absl::StrCat("(", absl::StrJoin(parameter_strings, ", "), ") -> ",
+                      ShapeUtil::HumanString(result()));
+}
+
+std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape) {
+  out << program_shape.ToString() << "\n";
+  return out;
+}
+
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape.h b/tensorflow/compiler/xla/shape.h
new file mode 100644
index 0000000000000000000000000000000000000000..7f6b14ab4286c696dce64d2250a3fe8a57e4865b
--- /dev/null
+++ b/tensorflow/compiler/xla/shape.h
@@ -0,0 +1,204 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_COMPILER_XLA_SHAPE_H_
+#define TENSORFLOW_COMPILER_XLA_SHAPE_H_
+
+#include <string>
+#include <vector>
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+#include "tensorflow/core/platform/types.h"
+
+namespace xla {
+
+// A shape describes the number of dimensions in a array, the bounds of each
+// dimension, and the primitive component type. For tuples, shape describes the
+// structure (number of elements and nesting).
+class Shape {
+ public:
+  Shape() = default;
+
+  // Construct a shape from a ShapeProto.
+  explicit Shape(const ShapeProto& shape_proto);
+
+  // Returns a ShapeProto representation of the Shape.
+  ShapeProto ToProto() const;
+
+  // Returns a human-readable string that represents the given shape, with or
+  // without layout. e.g. "F32[42,12] {0, 1}" or "F32[64]".
+  string ToString(bool print_layout = false) const;
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message ShapeProto. This enabled easy migration of this data structure
+  // from a proto to a proper C++ class.
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing the primitive type.
+  PrimitiveType element_type() const { return element_type_; }
+  void set_element_type(PrimitiveType value) { element_type_ = value; }
+
+  // Methods for accessing the dimensions array.
+  int dimensions_size() const { return dimensions_.size(); }
+  int64 dimensions(int index) const { return dimensions_.at(index); }
+  void set_dimensions(int index, int64 value) { dimensions_.at(index) = value; }
+  void add_dimensions(int64 value) { dimensions_.push_back(value); }
+  void clear_dimensions() { dimensions_.clear(); }
+  const std::vector<int64>& dimensions() const { return dimensions_; }
+  std::vector<int64>* mutable_dimensions() { return &dimensions_; }
+
+  // Methods for accessing the tuple subshapes. This field only non-empty for
+  // tuple shapes.
+  int tuple_shapes_size() const { return tuple_shapes_.size(); }
+  const Shape& tuple_shapes(int index) const { return tuple_shapes_.at(index); }
+  Shape* mutable_tuple_shapes(int index) { return &tuple_shapes_.at(index); }
+  Shape* add_tuple_shapes() {
+    tuple_shapes_.push_back(Shape());
+    return &tuple_shapes_.back();
+  }
+  void clear_tuple_shapes() { tuple_shapes_.clear(); }
+  const std::vector<Shape>& tuple_shapes() const { return tuple_shapes_; }
+  std::vector<Shape>* mutable_tuple_shapes() { return &tuple_shapes_; }
+
+  // Methods for accessing the layout field.
+  bool has_layout() const { return layout_.has_value(); }
+  const Layout& layout() const {
+    if (layout_.has_value()) {
+      return *layout_;
+    } else {
+      return Layout::default_instance();
+    }
+  }
+  Layout* mutable_layout() {
+    if (!layout_.has_value()) {
+      layout_ = Layout();
+    }
+    return &layout_.value();
+  }
+  void clear_layout() { layout_.reset(); }
+
+  void Swap(Shape* other) {
+    using std::swap;
+    swap(*this, *other);
+  }
+
+  void Clear() {
+    element_type_ = PRIMITIVE_TYPE_INVALID;
+    dimensions_.clear();
+    tuple_shapes_.clear();
+    layout_.reset();
+  }
+
+  string SerializeAsString() const { return ToProto().SerializeAsString(); }
+  string ShortDebugString() const { return ToProto().ShortDebugString(); }
+  string DebugString() const { return ToProto().DebugString(); }
+
+ public:
+  // The element type of this shape (tuple, array, etc).
+  PrimitiveType element_type_ = PRIMITIVE_TYPE_INVALID;
+
+  // The array bounds of the dimensions. This is nonempty only for array shapes.
+  std::vector<int64> dimensions_;
+
+  // The tuple element subshapes. This is nonempty only for tuple shapes.
+  std::vector<Shape> tuple_shapes_;
+
+  // The array layout of the shape. This is present only for array shapes.
+  absl::optional<Layout> layout_;
+};
+
+// Shape of the parameters and output of an XLA computation. This is analogous
+// to a traditional function signature.
+class ProgramShape {
+ public:
+  ProgramShape() = default;
+
+  // Creates a ProgramShape from a ProgramShapeProto protobuf.
+  explicit ProgramShape(const ProgramShapeProto& program_shape_proto);
+
+  // Returns a proto representation of the object.
+  ProgramShapeProto ToProto() const;
+
+  string ToString() const;
+
+  // The following methods mirror the protobuf generated code interface for the
+  // message ProgramShapeProto. This enabled easy migration of this data
+  // structure from a proto to a proper C++ class.
+  // TODO(b/29771030): Replace or augment these methods with a more ergonomic
+  // interface.
+
+  // Methods for accessing and manipulating the Shape of the parameters.
+  int parameters_size() const { return parameters_.size(); }
+  const Shape& parameters(int index) const { return parameters_.at(index); }
+  Shape* mutable_parameters(int index) { return &parameters_.at(index); }
+  Shape* add_parameters() {
+    parameters_.emplace_back();
+    return &parameters_.back();
+  }
+  void clear_parameters() { parameters_.clear(); }
+  const std::vector<Shape>& parameters() const { return parameters_; }
+  std::vector<Shape>* mutable_parameters() { return &parameters_; }
+
+  // Methods for accessing and manipulating the Shape of the result.
+  const Shape& result() const { return result_; }
+  Shape* mutable_result() { return &result_; }
+
+  // Methods for accessing and manipulating the names of the parameters.
+  int parameter_names_size() const { return parameter_names_.size(); }
+  const string& parameter_names(int index) const {
+    return parameter_names_.at(index);
+  }
+  void set_parameter_names(int index, const string& value) {
+    parameter_names_.at(index) = value;
+  }
+  string* mutable_parameter_names(int index) {
+    return &parameter_names_.at(index);
+  }
+  void add_parameter_names(const string& value) {
+    parameter_names_.push_back(value);
+  }
+  string* add_parameter_names() {
+    parameter_names_.push_back("");
+    return &parameter_names_.back();
+  }
+  void clear_parameter_names() { parameter_names_.clear(); }
+  const std::vector<string>& parameter_names() const {
+    return parameter_names_;
+  }
+  std::vector<string>* mutable_parameter_names() { return &parameter_names_; }
+
+  string ShortDebugString() const { return ToProto().ShortDebugString(); }
+  string DebugString() const { return ToProto().DebugString(); }
+
+ private:
+  // The shapes of the parameters of the computation represented by this object.
+  std::vector<Shape> parameters_;
+
+  // The names of the parameters of the computation represented by this object.
+  std::vector<string> parameter_names_;
+
+  // The shape of the result of the computation represented by this object.
+  Shape result_;
+};
+
+std::ostream& operator<<(std::ostream& out, const Shape& shape);
+std::ostream& operator<<(std::ostream& out, const ProgramShape& program_shape);
+
+}  // namespace xla
+
+#endif  // TENSORFLOW_COMPILER_XLA_SHAPE_H_
diff --git a/tensorflow/compiler/xla/shape_test.cc b/tensorflow/compiler/xla/shape_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e396897eeebc2e7bdc2dc49300c8906710608b05
--- /dev/null
+++ b/tensorflow/compiler/xla/shape_test.cc
@@ -0,0 +1,149 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xla/shape.h"
+
+#include <numeric>
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_join.h"
+#include "tensorflow/compiler/xla/layout_util.h"
+#include "tensorflow/compiler/xla/shape_util.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/test_helpers.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/compiler/xla/util.h"
+#include "tensorflow/compiler/xla/xla_data.pb.h"
+
+namespace xla {
+namespace {
+
+class ShapeTest : public ::testing::Test {
+ protected:
+  const Shape opaque_ = ShapeUtil::MakeOpaqueShape();
+  const Shape token_ = ShapeUtil::MakeTokenShape();
+  const Shape scalar_ = ShapeUtil::MakeShape(F32, {});
+  const Shape matrix_ = ShapeUtil::MakeShape(U32, {1, 2});
+  const Shape matrix2_ = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
+  const Shape tuple_ =
+      ShapeUtil::MakeTupleShape({opaque_, scalar_, matrix_, matrix2_});
+  const Shape nested_tuple_ =
+      ShapeUtil::MakeTupleShape({tuple_, matrix_, token_});
+};
+
+TEST_F(ShapeTest, ShapeToFromProto) {
+  for (const Shape& shape :
+       {opaque_, token_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_}) {
+    Shape shape_copy(shape.ToProto());
+    EXPECT_TRUE(ShapeUtil::Equal(shape, shape_copy))
+        << shape << " != " << shape_copy;
+  }
+}
+
+TEST_F(ShapeTest, ShapeToString) {
+  EXPECT_EQ("opaque[]", opaque_.ToString());
+  EXPECT_EQ("token[]", token_.ToString());
+  EXPECT_EQ("f32[]", scalar_.ToString());
+  EXPECT_EQ("u32[1,2]", matrix_.ToString());
+  EXPECT_EQ("s32[3,4]", matrix2_.ToString());
+  EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])", tuple_.ToString());
+  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+            nested_tuple_.ToString());
+
+  EXPECT_EQ("opaque[]", opaque_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("f32[]", scalar_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("u32[1,2]{1,0}", matrix_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("s32[3,4]{0,1}", matrix2_.ToString(/*print_layout=*/true));
+  EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
+            tuple_.ToString(/*print_layout=*/true));
+  EXPECT_EQ(
+      "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, "
+      "token[])",
+      nested_tuple_.ToString(/*print_layout=*/true));
+}
+
+TEST_F(ShapeTest, ProgramShapeToFromProto) {
+  ProgramShape program_shape;
+  *program_shape.add_parameters() = ShapeUtil::MakeShape(F32, {1, 2, 3});
+  *program_shape.add_parameters() = ShapeUtil::MakeTokenShape();
+  *program_shape.add_parameters() = ShapeUtil::MakeShape(S64, {});
+  *program_shape.add_parameters() = ShapeUtil::MakeTupleShape(
+      {ShapeUtil::MakeShape(S32, {}),
+       ShapeUtil::MakeTupleShape({ShapeUtil::MakeTokenShape()}),
+       ShapeUtil::MakeShape(F32, {42, 42})});
+
+  *program_shape.mutable_result() = ShapeUtil::MakeShape(F32, {7});
+
+  program_shape.add_parameter_names("foo");
+  program_shape.add_parameter_names("bar");
+  program_shape.add_parameter_names("baz");
+  program_shape.add_parameter_names("qux qux");
+
+  // Create a copy of the program shape by round-tripping through a proto.
+  ProgramShape program_shape_copy(program_shape.ToProto());
+  ASSERT_EQ(program_shape.parameters_size(),
+            program_shape_copy.parameters_size());
+  for (int i = 0; i < program_shape.parameters_size(); ++i) {
+    EXPECT_TRUE(ShapeUtil::Equal(program_shape.parameters(i),
+                                 program_shape_copy.parameters(i)));
+  }
+
+  EXPECT_TRUE(
+      ShapeUtil::Equal(program_shape.result(), program_shape_copy.result()));
+
+  ASSERT_EQ(program_shape.parameter_names_size(),
+            program_shape_copy.parameter_names_size());
+  for (int i = 0; i < program_shape.parameter_names_size(); ++i) {
+    EXPECT_EQ(program_shape.parameter_names(i),
+              program_shape_copy.parameter_names(i));
+  }
+}
+
+TEST_F(ShapeTest, ProgramShapeToString) {
+  ProgramShape prog = ShapeUtil::MakeProgramShape(
+      {opaque_, scalar_, matrix_, matrix2_, tuple_, nested_tuple_},
+      nested_tuple_);
+  EXPECT_EQ(
+      "((unknown): opaque[], "
+      "(unknown): f32[], "
+      "(unknown): u32[1,2], "
+      "(unknown): s32[3,4], "
+      "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), "
+      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+      prog.ToString());
+
+  prog.add_parameter_names("arg0");
+  prog.add_parameter_names("scalar");
+  prog.add_parameter_names("matrix");
+  prog.add_parameter_names("matrix2");
+  prog.add_parameter_names("tuple");
+  prog.add_parameter_names("nested_tuple");
+  EXPECT_EQ(
+      "(arg0: opaque[], "
+      "scalar: f32[], "
+      "matrix: u32[1,2], "
+      "matrix2: s32[3,4], "
+      "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), "
+      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], "
+      "token[])) "
+      "-> "
+      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
+      prog.ToString());
+}
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/shape_tree.h b/tensorflow/compiler/xla/shape_tree.h
index df610102b4c7fa08c0b7030124939009130f89f4..7bf97729165bef98fabc29040e02203eee68a53c 100644
--- a/tensorflow/compiler/xla/shape_tree.h
+++ b/tensorflow/compiler/xla/shape_tree.h
@@ -667,12 +667,11 @@ void ShapeTree<T>::CopySubtreeFrom(const ShapeTree<T>& other,
 template <typename T>
 bool ShapeTree<T>::operator==(const ShapeTree<T>& other) const {
   bool equal = true;
-  ForEachElement(
-      [this, &other, &equal](const ShapeIndex& index, const T& data) {
-        if (data != other.element(index)) {
-          equal = false;
-        }
-      });
+  ForEachElement([&other, &equal](const ShapeIndex& index, const T& data) {
+    if (data != other.element(index)) {
+      equal = false;
+    }
+  });
   return equal;
 }
 
diff --git a/tensorflow/compiler/xla/shape_tree_test.cc b/tensorflow/compiler/xla/shape_tree_test.cc
index c8ff55e7845785d9292516b823fb591cc28cbfad..2b6c484bc4f205be0180403eeac2dd391029b110 100644
--- a/tensorflow/compiler/xla/shape_tree_test.cc
+++ b/tensorflow/compiler/xla/shape_tree_test.cc
@@ -52,10 +52,10 @@ class ShapeTreeTest : public ::testing::Test {
 
 TEST_F(ShapeTreeTest, DefaultConstructor) {
   ShapeTree<int> int_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(int_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(int_tree.shape()));
 
   ShapeTree<bool> bool_tree;
-  EXPECT_TRUE(ShapeUtil::IsNil(bool_tree.shape()));
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(bool_tree.shape()));
 }
 
 void ShapeTreeTest::TestShapeConstructor(const Shape& shape,
diff --git a/tensorflow/compiler/xla/shape_util.cc b/tensorflow/compiler/xla/shape_util.cc
index d0c35d8dee46a1e0a5e343e0506a14ca1ce38bfd..a4d4e1e53e727bdf7822cacaa4559fcae59d4eae 100644
--- a/tensorflow/compiler/xla/shape_util.cc
+++ b/tensorflow/compiler/xla/shape_util.cc
@@ -79,14 +79,14 @@ bool ShapeIndexView::StartsWith(ShapeIndexView prefix) const {
          indices_.subspan(0, prefix.size()) == prefix.indices_;
 }
 
-namespace {
-
-// Returns whether the given primitive type corresponds to an array shape.
-bool IsArrayPrimitiveType(PrimitiveType primitive_type) {
+/* static */ bool ShapeUtil::IsArrayPrimitiveType(
+    PrimitiveType primitive_type) {
   return primitive_type != PRIMITIVE_TYPE_INVALID && primitive_type != TUPLE &&
          primitive_type != OPAQUE && primitive_type != TOKEN;
 }
 
+namespace {
+
 // Recursive helper for comparing the equality of two shapes. Returns true if
 // the shapes are the same. If compare_layouts is true, then layouts must also
 // match.
@@ -121,6 +121,23 @@ bool CompareShapes(const Shape& lhs, const Shape& rhs, bool compare_layouts,
         VLOG(3) << "CompareShapes: lhs layout != rhs layout";
         return false;
       }
+
+      const auto& lhs_tiles = lhs.layout().tiles();
+      const auto& rhs_tiles = rhs.layout().tiles();
+      if (lhs_tiles.size() != rhs_tiles.size()) {
+        return false;
+      }
+      for (int64 i = 0; i < lhs_tiles.size(); i++) {
+        if (!absl::c_equal(lhs_tiles[i].dimensions(),
+                           rhs_tiles[i].dimensions())) {
+          return false;
+        }
+      }
+
+      if (lhs.layout().element_size_in_bits() !=
+          rhs.layout().element_size_in_bits()) {
+        return false;
+      }
     }
   }
 
@@ -203,7 +220,7 @@ StatusOr<Shape> MakeShapeWithLayoutInternal(
 /* static */ ProgramShape ShapeUtil::MakeProgramShape(
     std::initializer_list<Shape> parameters, Shape result) {
   ProgramShape program_shape;
-  for (const auto& shape : parameters) {
+  for (const Shape& shape : parameters) {
     *program_shape.add_parameters() = shape;
   }
   *program_shape.mutable_result() = std::move(result);
@@ -272,7 +289,7 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
 /* static */ Shape ShapeUtil::MakeTupleShape(absl::Span<const Shape> shapes) {
   Shape result;
   result.set_element_type(TUPLE);
-  result.mutable_tuple_shapes()->Reserve(shapes.size());
+  result.mutable_tuple_shapes()->reserve(shapes.size());
   for (const auto& shape : shapes) {
     AppendShapeToTuple(shape, &result);
   }
@@ -372,10 +389,6 @@ ShapeUtil::MakeShapeWithDescendingLayoutAndSamePhysicalLayout(
   return IsTuple(shape) && TupleElementCount(shape) == 0;
 }
 
-/* static */ bool ShapeUtil::IsNil(const Shape& shape) {
-  return IsEmptyTuple(shape);
-}
-
 /* static */ int64 ShapeUtil::TupleElementCount(const Shape& shape) {
   CHECK(IsTuple(shape)) << HumanString(shape);
   return shape.tuple_shapes_size();
@@ -571,7 +584,7 @@ namespace {
 // Parses shapes with simple recursive descent structure -- consumes from the
 // front of s and passes that view recursively as required.
 StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
-  *s = StripLeadingAsciiWhitespace(*s);
+  *s = absl::StripLeadingAsciiWhitespace(*s);
 
   if (absl::ConsumePrefix(s, "(")) {  // Tuple.
     std::vector<Shape> shapes;
@@ -584,7 +597,7 @@ StatusOr<Shape> ParseShapeStringInternal(absl::string_view* s) {
       }
       shapes.emplace_back();
       TF_ASSIGN_OR_RETURN(shapes.back(), ParseShapeStringInternal(s));
-      *s = StripLeadingAsciiWhitespace(*s);
+      *s = absl::StripLeadingAsciiWhitespace(*s);
       must_end = !absl::ConsumePrefix(s, ",");
     }
     return ShapeUtil::MakeTupleShape(shapes);
@@ -1155,7 +1168,7 @@ Status ForEachMutableSubshapeHelper(
   // Let the argument `permutation` be P.  This is a permutation over `shape`'s
   // dimensions, so our return value will be a shape with dims P.I = P.  Our
   // goal is to construct a layout permutation L* that we can apply to P such
-  // that that the physical dimension ordering of the returned shape is the same
+  // that the physical dimension ordering of the returned shape is the same
   // as that of the original shape, namely L'.
   //
   // Our returned shape has dims P and layout L*, so its in-memory layout is
@@ -1600,7 +1613,8 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
 /* static */ Shape ShapeUtil::DeleteDimension(int64 dim_to_delete,
                                               Shape shape) {
   CHECK(IsArray(shape));
-  shape.mutable_dimensions()->erase(shape.dimensions().begin() + dim_to_delete);
+  shape.mutable_dimensions()->erase(shape.mutable_dimensions()->begin() +
+                                    dim_to_delete);
   if (LayoutUtil::HasLayout(shape)) {
     Layout* layout = shape.mutable_layout();
     layout->set_format(DENSE);
@@ -1634,11 +1648,6 @@ ShapeUtil::DimensionsUnmodifiedByReshape(const Shape& input_shape,
   return shape;
 }
 
-std::ostream& operator<<(std::ostream& out, const Shape& shape) {
-  out << ShapeUtil::HumanStringWithLayout(shape);
-  return out;
-}
-
 /*static*/ size_t ShapeUtil::Hash(const Shape& shape) {
   using tensorflow::hash;
   using tensorflow::Hash64Combine;
diff --git a/tensorflow/compiler/xla/shape_util.h b/tensorflow/compiler/xla/shape_util.h
index a7a3026cf3f3a53d34d389212738ca584a19db1d..84a27f662a57ba274562e2e9be57b7e971c9b477 100644
--- a/tensorflow/compiler/xla/shape_util.h
+++ b/tensorflow/compiler/xla/shape_util.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/layout_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
+#include "tensorflow/compiler/xla/shape.h"
 #include "tensorflow/compiler/xla/status_macros.h"
 #include "tensorflow/compiler/xla/statusor.h"
 #include "tensorflow/compiler/xla/types.h"
@@ -37,6 +38,7 @@ limitations under the License.
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace xla {
@@ -100,6 +102,11 @@ class ShapeIndex {
 
   string ToString() const;
 
+  template <typename H>
+  friend H AbslHashValue(H h, const ShapeIndex& index) {
+    return H::combine(std::move(h), index.indices_);
+  }
+
  private:
   container_type indices_;
 };
@@ -461,6 +468,9 @@ class ShapeUtil {
   // arrays.
   static bool IsArray(const Shape& shape);
 
+  // Returns whether the given primitive type corresponds to an array shape.
+  static bool IsArrayPrimitiveType(PrimitiveType primitive_type);
+
   // Returns whether the shape is a tuple with at least one element which is
   // also a tuple.
   static bool IsNestedTuple(const Shape& shape);
@@ -468,9 +478,6 @@ class ShapeUtil {
   // Returns true if shape is an empty tuple.
   static bool IsEmptyTuple(const Shape& shape);
 
-  // Returns true if shape is the nil shape (an empty tuple).
-  static bool IsNil(const Shape& shape);
-
   // Returns the number of elements in the given tuple shape.
   // Precondition: IsTuple(shape)
   static int64 TupleElementCount(const Shape& shape);
@@ -754,10 +761,18 @@ class ShapeUtil {
       pool.emplace(tensorflow::Env::Default(), "foreach", kNumThreads);
     }
 
+    tensorflow::mutex mu;
+    Status status;  // Guarded by mu
+
     while (n < rank) {
       if (pool != absl::nullopt) {
-        pool->Schedule(
-            [indexes, &visitor_function] { visitor_function(indexes); });
+        pool->Schedule([indexes, &visitor_function, &mu, &status] {
+          StatusOr<bool> result = visitor_function(indexes);
+          if (!result.ok()) {
+            tensorflow::mutex_lock lock(mu);
+            status = status.ok() ? result.status() : status;
+          }
+        });
       } else {
         TF_ASSIGN_OR_RETURN(bool should_continue, visitor_function(indexes));
         if (!should_continue) {
@@ -775,14 +790,14 @@ class ShapeUtil {
       }
     }
 
-    return Status::OK();
+    // Waits for the scheduled work to complete.
+    pool.reset();
+    return status;
   }
 
   TF_DISALLOW_COPY_AND_ASSIGN(ShapeUtil);
 };
 
-std::ostream& operator<<(std::ostream& out, const Shape& shape);
-
 }  // namespace xla
 
 #endif  // TENSORFLOW_COMPILER_XLA_SHAPE_UTIL_H_
diff --git a/tensorflow/compiler/xla/shape_util_test.cc b/tensorflow/compiler/xla/shape_util_test.cc
index 0c647369a37e70f93abe1732963d2ddc7730c214..60bdbe302045e6f3b4bae500c50bc68fb217525d 100644
--- a/tensorflow/compiler/xla/shape_util_test.cc
+++ b/tensorflow/compiler/xla/shape_util_test.cc
@@ -376,12 +376,12 @@ TEST(ShapeUtilTest, ByteSizeOfWithoutPadding) {
 }
 
 TEST(ShapeUtilTest, NilShape) {
-  EXPECT_TRUE(ShapeUtil::IsNil(ShapeUtil::MakeNil()));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {1, 2, 3})));
-  EXPECT_FALSE(ShapeUtil::IsNil(ShapeUtil::MakeShape(F32, {0, 1})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_TRUE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeNil()));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {1, 2, 3})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(ShapeUtil::MakeShape(F32, {0, 1})));
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(S32, {})})));
-  EXPECT_FALSE(ShapeUtil::IsNil(
+  EXPECT_FALSE(ShapeUtil::IsEmptyTuple(
       ShapeUtil::MakeTupleShape({ShapeUtil::MakeShape(F32, {0})})));
 }
 
@@ -546,68 +546,6 @@ TEST(ShapeUtilTest, IsLeafIndex) {
   EXPECT_TRUE(ShapeUtil::IsLeafIndex(nested_tuple_shape, {1, 1}));
 }
 
-TEST(ShapeUtilTest, HumanString) {
-  Shape opaque = ShapeUtil::MakeOpaqueShape();
-  Shape token = ShapeUtil::MakeTokenShape();
-  Shape scalar = ShapeUtil::MakeShape(F32, {});
-  Shape matrix = ShapeUtil::MakeShape(U32, {1, 2});
-  Shape matrix2 = ShapeUtil::MakeShapeWithLayout(S32, {3, 4}, {0, 1});
-  Shape tuple = ShapeUtil::MakeTupleShape({opaque, scalar, matrix, matrix2});
-  Shape nested_tuple = ShapeUtil::MakeTupleShape({tuple, matrix, token});
-
-  EXPECT_EQ("opaque[]", ShapeUtil::HumanString(opaque));
-  EXPECT_EQ("token[]", ShapeUtil::HumanString(token));
-  EXPECT_EQ("f32[]", ShapeUtil::HumanString(scalar));
-  EXPECT_EQ("u32[1,2]", ShapeUtil::HumanString(matrix));
-  EXPECT_EQ("s32[3,4]", ShapeUtil::HumanString(matrix2));
-  EXPECT_EQ("(opaque[], f32[], u32[1,2], s32[3,4])",
-            ShapeUtil::HumanString(tuple));
-  EXPECT_EQ("((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-            ShapeUtil::HumanString(nested_tuple));
-
-  EXPECT_EQ("opaque[]", ShapeUtil::HumanStringWithLayout(opaque));
-  EXPECT_EQ("f32[]", ShapeUtil::HumanStringWithLayout(scalar));
-  EXPECT_EQ("u32[1,2]{1,0}", ShapeUtil::HumanStringWithLayout(matrix));
-  EXPECT_EQ("s32[3,4]{0,1}", ShapeUtil::HumanStringWithLayout(matrix2));
-  EXPECT_EQ("(opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1})",
-            ShapeUtil::HumanStringWithLayout(tuple));
-  EXPECT_EQ(
-      "((opaque[], f32[], u32[1,2]{1,0}, s32[3,4]{0,1}), u32[1,2]{1,0}, "
-      "token[])",
-      ShapeUtil::HumanStringWithLayout(nested_tuple));
-
-  ProgramShape prog = ShapeUtil::MakeProgramShape(
-      {opaque, scalar, matrix, matrix2, tuple, nested_tuple}, nested_tuple);
-  EXPECT_EQ(
-      "((unknown): opaque[], "
-      "(unknown): f32[], "
-      "(unknown): u32[1,2], "
-      "(unknown): s32[3,4], "
-      "(unknown): (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "(unknown): ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])) "
-      "-> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-      ShapeUtil::HumanString(prog));
-
-  prog.add_parameter_names("arg0");
-  prog.add_parameter_names("scalar");
-  prog.add_parameter_names("matrix");
-  prog.add_parameter_names("matrix2");
-  prog.add_parameter_names("tuple");
-  prog.add_parameter_names("nested_tuple");
-  EXPECT_EQ(
-      "(arg0: opaque[], "
-      "scalar: f32[], "
-      "matrix: u32[1,2], "
-      "matrix2: s32[3,4], "
-      "tuple: (opaque[], f32[], u32[1,2], s32[3,4]), "
-      "nested_tuple: ((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], "
-      "token[])) "
-      "-> "
-      "((opaque[], f32[], u32[1,2], s32[3,4]), u32[1,2], token[])",
-      ShapeUtil::HumanString(prog));
-}
-
 TEST(ShapeUtilTest, ForEachSubshapeArray) {
   const Shape shape = ShapeUtil::MakeShape(F32, {2, 3});
   int calls = 0;
diff --git a/tensorflow/compiler/xla/tests/BUILD b/tensorflow/compiler/xla/tests/BUILD
index db34d34f969311543d988ec6c3b8ee2af5b07e8e..5a7a4faa7e89b27fb537f20d94c21cb4a76e000d 100644
--- a/tensorflow/compiler/xla/tests/BUILD
+++ b/tensorflow/compiler/xla/tests/BUILD
@@ -79,6 +79,7 @@ cc_library(
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/service:transfer_manager",
         "//tensorflow/core:lib",
+        "@com_google_absl//absl/base",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:span",
     ],
@@ -135,6 +136,7 @@ cc_library(
         "//tensorflow/core:stream_executor_no_cuda",
         "//tensorflow/core:test",
         "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/base:core_headers",
         "@com_google_absl//absl/memory",
         "@com_google_absl//absl/types:optional",
         "@com_google_absl//absl/types:span",
@@ -297,6 +299,52 @@ xla_test(
     ],
 )
 
+xla_test(
+    name = "conv_depthwise_test",
+    timeout = "long",
+    srcs = ["conv_depthwise_test.cc"],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
+xla_test(
+    name = "grouped_convolution_test",
+    timeout = "long",
+    srcs = ["grouped_convolution_test.cc"],
+    blacklisted_backends = [
+        # disabled because of a break b/119590850.
+        "gpu",
+        # disabled because it times out.
+        "cpu",
+    ],
+    shard_count = 50,
+    deps = [
+        "//tensorflow/compiler/xla:execution_options_util",
+        "//tensorflow/compiler/xla:status_macros",
+        "//tensorflow/compiler/xla:test",
+        "//tensorflow/compiler/xla/client:xla_computation",
+        "//tensorflow/compiler/xla/service:bfloat16_normalization",
+        "//tensorflow/compiler/xla/service:despecializer",
+        "//tensorflow/compiler/xla/service:hlo_parser",
+        "//tensorflow/compiler/xla/tests:client_library_test_base",
+        "//tensorflow/compiler/xla/tests:hlo_test_base",
+        "//tensorflow/compiler/xla/tests:xla_internal_test_main",
+        "@com_google_absl//absl/types:optional",
+    ],
+)
+
 xla_test(
     name = "check_execution_arity_test",
     srcs = ["check_execution_arity_test.cc"],
@@ -1265,6 +1313,7 @@ xla_test(
         "enable_for_xla_interpreter",
     ],
     deps = [
+        "//tensorflow/compiler/xla/service:hlo_parser",
         "//tensorflow/compiler/xla/service:hlo_verifier",
         "//tensorflow/compiler/xla/tests:hlo_test_base",
         "//tensorflow/compiler/xla/tests:xla_internal_test_main",
@@ -1865,6 +1914,7 @@ xla_test(
 xla_test(
     name = "multioutput_fusion_test",
     srcs = ["multioutput_fusion_test.cc"],
+    backends = ["gpu"],
     deps = [
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
diff --git a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
index 2180b22cb3bc2e1cdd484098bafd14315d1fa142..915b456b52215f8d6a9eb6c5b933f3502f1d3d2c 100644
--- a/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
+++ b/tensorflow/compiler/xla/tests/array_elementwise_ops_test.cc
@@ -329,13 +329,13 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
   Literal b_literal = LiteralUtil::CreateR1<float>({b_values});
   std::unique_ptr<GlobalData> b_data =
       client_->TransferToServer(b_literal).ConsumeValueOrDie();
-  auto b_constant = Parameter(&builder, 1, a_literal.shape(), "b_param");
-  auto b_param = ConstantR1<float>(&builder, b_values);
+  auto b_param = Parameter(&builder, 1, a_literal.shape(), "b_param");
+  auto b_constant = ConstantR1<float>(&builder, b_values);
 
-  auto sum1 = Add(a_constant, b_constant);
-  auto sum2 = Add(a_constant, b_param);
-  auto sum3 = Add(a_param, b_constant);
-  auto sum4 = Add(a_param, b_param);
+  auto sum1 = Add(a_constant, b_param);
+  auto sum2 = Add(a_constant, b_constant);
+  auto sum3 = Add(a_param, b_param);
+  auto sum4 = Add(a_param, b_constant);
 
   auto sum = Add(sum1, sum2);
   sum = Add(sum, sum3);
@@ -350,6 +350,44 @@ TEST_P(ArrayElementwiseOpTestParamCount, AddManyValues) {
                              error_spec_);
 }
 
+// TODO(b/119692968): This test runs OOM on the GPU and CPU backend.
+XLA_TEST_F(ArrayElementwiseOpTest,
+           DISABLED_ON_GPU(DISABLED_ON_CPU(DeeplyNestedAddWithSlices))) {
+  XlaBuilder builder(TestName());
+  std::vector<float> values(30, 0.0);
+  auto a_literal = LiteralUtil::CreateR1<float>(values);
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b_literal = LiteralUtil::CreateR1<float>(values);
+  auto b = Parameter(&builder, 1, b_literal.shape(), "x");
+
+  // Construct a sequence of diamond-shaped gadgets like this:
+  //
+  //      add
+  //    /    \
+  //  slice  slice
+  //     \   /
+  //      add
+  //
+  // Each 'left' slice removes the last element, each 'right' slice removes the
+  // first element. In this way, we index into the add with different
+  // multi-dimensional index arrays, which defeats the caching we use to avoid
+  // exponential compile time.
+  std::function<XlaOp(int64)> generate_recursive =
+      [&](int64 slice_size) -> XlaOp {
+    if (slice_size == values.size()) {
+      return Add(a, b);
+    }
+    XlaOp param = generate_recursive(slice_size + 1);
+    auto slice1 = Slice(param, {0}, {slice_size}, {1});
+    auto slice2 = Slice(param, {1}, {slice_size + 1}, {1});
+    return Add(slice1, slice2);
+  };
+  generate_recursive(1);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  auto b_data = client_->TransferToServer(b_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, {0.0}, {a_data.get(), b_data.get()});
+}
+
 XLA_TEST_F(ArrayElementwiseOpTest, SubTwoConstantF32s) {
   XlaBuilder builder(TestName());
   auto a = ConstantR1<float>(&builder, {-2.5f, 3.14f, 2.25f, -10.0f, 6.0f});
@@ -2744,12 +2782,16 @@ XLA_TEST_F(ArrayElementwiseOpTest, CompareGtR3F32sWithDegenerateDim2) {
   Array3D<int> expected_3d(
       {{{0, 1}, {0, 0}, {0, 0}}, {{0, 1}, {1, 0}, {0, 1}}});
   const string expected = R"(pred[2,3,2] {
-{ { 0, 1 },
+{
+  { 0, 1 },
   { 0, 0 },
-  { 0, 0 } },
-{ { 0, 1 },
+  { 0, 0 }
+},
+{
+  { 0, 1 },
   { 1, 0 },
-  { 0, 1 } }
+  { 0, 1 }
+}
 })";
   EXPECT_EQ(expected, ExecuteToString(&builder, {}));
 }
diff --git a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
index dde19fb65d65064c9452a6ac49c70e20cf113336..702fb32adfc8a0ded26845c92245776a79777c34 100644
--- a/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
+++ b/tensorflow/compiler/xla/tests/broadcast_simple_test.cc
@@ -161,8 +161,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {2, 2}), {1});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {2, 2}, {1});
 
   Array2D<float> expected(2, 2);
   expected(0, 0) = 1;
@@ -175,8 +174,7 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsUsual) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {2, 2}), {0});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {2, 2}, {0});
 
   Array2D<float> expected(2, 2);
   expected(0, 0) = 1;
@@ -189,8 +187,8 @@ XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsTranspose) {
 
 XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}),
-                 ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 1});
+  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2},
+                 {0, 1});
 
   Array3D<float> expected(2, 2, 2);
   expected(0, 0, 0) = 1.0;
@@ -207,8 +205,8 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDims) {
 
 XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}),
-                 ShapeUtil::MakeShape(F32, {2, 2, 2}), {0, 2});
+  BroadcastInDim(ConstantR2<float>(&b, {{1.0, 5.0}, {2.0, 6.0}}), {2, 2, 2},
+                 {0, 2});
 
   Array3D<float> expected(2, 2, 2);
   expected(0, 0, 0) = 1.0;
@@ -225,8 +223,7 @@ XLA_TEST_F(BroadcastSimpleTest, 2DTo3D_WithDimsNotPossibleWithBroadCast) {
 
 XLA_TEST_F(BroadcastSimpleTest, 1DTo2D_WithDimsNotPossibleWithBroadCast) {
   XlaBuilder b(TestName());
-  BroadcastInDim(ConstantR1<float>(&b, {1, 2}),
-                 ShapeUtil::MakeShape(F32, {3, 2}), {1});
+  BroadcastInDim(ConstantR1<float>(&b, {1, 2}), {3, 2}, {1});
 
   Array2D<float> expected(3, 2);
   expected(0, 0) = 1;
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.cc b/tensorflow/compiler/xla/tests/client_library_test_base.cc
index b98572e24c831c1ff746904302cacccb20056207..12c029983336cc9aed0fde4ce6881c9a00a9869e 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.cc
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.cc
@@ -107,7 +107,7 @@ StatusOr<Literal> ClientLibraryTestBase::ExecuteAndTransfer(
   ExecutionOptions execution_options = execution_options_;
   if (shape_with_output_layout != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *shape_with_output_layout;
+        shape_with_output_layout->ToProto();
   }
   return client_->ExecuteAndTransfer(computation, arguments,
                                      &execution_options);
@@ -127,7 +127,7 @@ StatusOr<Literal> ClientLibraryTestBase::ExecuteAndTransferReference(
   ExecutionOptions execution_options = execution_options_;
   if (shape_with_output_layout != nullptr) {
     *execution_options.mutable_shape_with_output_layout() =
-        *shape_with_output_layout;
+        shape_with_output_layout->ToProto();
   }
   execution_options.clear_device_handles();
   return ref_client_->ExecuteAndTransfer(computation, arguments,
diff --git a/tensorflow/compiler/xla/tests/client_library_test_base.h b/tensorflow/compiler/xla/tests/client_library_test_base.h
index 34148e5886d3806b19fc5bee90806c5678df345e..65a23dd883594b9bf9c37494a37e9be39b197788 100644
--- a/tensorflow/compiler/xla/tests/client_library_test_base.h
+++ b/tensorflow/compiler/xla/tests/client_library_test_base.h
@@ -76,7 +76,7 @@ class ClientLibraryTestBase : public ::testing::Test {
   void SetFastMathDisabled(bool disabled) {
     auto* opts = execution_options_.mutable_debug_options();
     opts->set_xla_cpu_enable_fast_math(!disabled);
-    opts->set_xla_gpu_enable_fast_math(!disabled);
+    opts->set_xla_gpu_enable_fast_min_max(!disabled);
   }
 
   void SetSeed(uint64 seed) { execution_options_.set_seed(seed); }
diff --git a/tensorflow/compiler/xla/tests/client_test.cc b/tensorflow/compiler/xla/tests/client_test.cc
index 6f2ca84bb646e88af221ab80b727911ff7d990eb..363dee74b2755a6bdc3c5a5164a85378581c21d2 100644
--- a/tensorflow/compiler/xla/tests/client_test.cc
+++ b/tensorflow/compiler/xla/tests/client_test.cc
@@ -50,7 +50,8 @@ XLA_TEST_F(ClientTest, ExecuteWithLayout) {
       ExecutionOptions execution_options = execution_options_;
       *execution_options.mutable_shape_with_output_layout() =
           ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
-                                         execute_layout);
+                                         execute_layout)
+              .ToProto();
       TF_ASSERT_OK_AND_ASSIGN(
           std::unique_ptr<GlobalData> data,
           client_->Execute(computation, {}, &execution_options));
@@ -84,7 +85,8 @@ XLA_TEST_F(ClientTest, ExecuteWithTupleLayout) {
           {ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
                                           /*minor_to_major=*/{0, 1}),
            ShapeUtil::MakeShapeWithLayout(S32, /*dimensions=*/{2, 2},
-                                          /*minor_to_major=*/{1, 0})});
+                                          /*minor_to_major=*/{1, 0})})
+          .ToProto();
 
   TF_ASSERT_OK_AND_ASSIGN(
       auto result,
diff --git a/tensorflow/compiler/xla/tests/concat_test.cc b/tensorflow/compiler/xla/tests/concat_test.cc
index 9811a015e91d866d6f4de6ebb6dac536ed6c7e06..4f5b525a34252db9e967a55af0d1bf39a2dd830e 100644
--- a/tensorflow/compiler/xla/tests/concat_test.cc
+++ b/tensorflow/compiler/xla/tests/concat_test.cc
@@ -492,6 +492,32 @@ XLA_TEST_F(ConcatTest, ConcatR3WeirdDims) {
   ComputeAndCompareR3<float>(&builder, expected, {p0.get(), p1.get()});
 }
 
+XLA_TEST_F(ConcatTest, ConcatDeeplyNested) {
+  XlaBuilder builder(TestName());
+  auto a_literal = LiteralUtil::CreateR1<float>({256.0});
+  auto a = Parameter(&builder, 0, a_literal.shape(), "x");
+  auto b = ConcatInDim(&builder, {a, a}, 0);
+  auto c = ConcatInDim(&builder, {b, b}, 0);
+  auto d = ConcatInDim(&builder, {c, c}, 0);
+  auto e = ConcatInDim(&builder, {d, d}, 0);
+  auto f = ConcatInDim(&builder, {e, e}, 0);
+  auto g = ConcatInDim(&builder, {f, f}, 0);
+  auto h = ConcatInDim(&builder, {g, g}, 0);
+  auto i = ConcatInDim(&builder, {h, h}, 0);
+  auto j = ConcatInDim(&builder, {i, i}, 0);
+  auto k = ConcatInDim(&builder, {j, j}, 0);
+  auto l = ConcatInDim(&builder, {k, k}, 0);
+  auto m = ConcatInDim(&builder, {l, l}, 0);
+  auto n = ConcatInDim(&builder, {m, m}, 0);
+  auto o = ConcatInDim(&builder, {n, n}, 0);
+  auto p = ConcatInDim(&builder, {o, o}, 0);
+  auto q = ConcatInDim(&builder, {p, p}, 0);
+  ConcatInDim(&builder, {q, q}, 0);
+  std::vector<float> expected(131072, 256.0);
+  auto a_data = client_->TransferToServer(a_literal).ConsumeValueOrDie();
+  ComputeAndCompareR1<float>(&builder, expected, {a_data.get()});
+}
+
 // Describes a binary rank-2 concatenation test.
 struct R2BinarySpec {
   int64 lhs_dim0;
diff --git a/tensorflow/compiler/xla/tests/conv_depthwise_test.cc b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..627a17a0ca114085240dbaf28211bb3511cf0cab
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/conv_depthwise_test.cc
@@ -0,0 +1,234 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct DepthwiseConvolution2DSpec {
+  int64 output_feature, window, stride, pad, lhs_dilate;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class DepthwiseConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<DepthwiseConvolution2DSpec, bool>> {};
+
+static std::vector<DepthwiseConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<DepthwiseConvolution2DSpec> config_set;
+  std::vector<std::vector<int64>> config_options = {
+      {128, 6, 3, 64},  {256, 5, 3, 256}, {256, 5, 2, 144}, {144, 5, 3, 64},
+      {144, 5, 2, 256}, {8, 48, 17, 8},   {128, 20, 6, 64}, {64, 14, 12, 172},
+      {16, 9, 4, 16},   {128, 1, 2, 144}, {256, 1, 2, 64}};
+
+  for (auto option : config_options) {
+    int64 feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    DepthwiseConvolution2DSpec config;
+    config.output_feature = feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size, feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, 1, feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, feature};
+    } else if (feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = feature / 32;
+      config.output_dims = {batch, feature / 32,
+                            activation_size - kernel_size + 1, feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string DepthwiseConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<DepthwiseConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextDepthwiseConvolution2D(
+    const DepthwiseConvolution2DSpec& spec, bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.output_feature);
+
+  } else if (spec.stride == -1) {
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.output_feature);
+  } else {
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.output_feature);
+  }
+}
+
+XLA_TEST_P(DepthwiseConvolution2DTest, DoIt) {
+  const DepthwiseConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text =
+      BuildHloTextDepthwiseConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    DepthwiseConvolution2DTestWithRandomIndices, DepthwiseConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    DepthwiseConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/convolution_test.cc b/tensorflow/compiler/xla/tests/convolution_test.cc
index 211d004ec8c0a04b17c2454995880c0b565d3d4d..4a58a1ed66c438d1dd9561f4eb029b38d8c6cbdd 100644
--- a/tensorflow/compiler/xla/tests/convolution_test.cc
+++ b/tensorflow/compiler/xla/tests/convolution_test.cc
@@ -721,23 +721,573 @@ class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid : public ConvolutionTest {
     ComputeAndCompareLiteral(&builder, expected_r4,
                              {input_literal.get(), filter_literal.get()},
                              error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Input_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {256, 4, 4, 512};
+    std::vector<int64> filter_dims = {3, 3, 1, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/512);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(2048 * 256, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 =
+        expected_r1.Reshape({256, 2, 2, 512}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_256x4x4x512_3x3x1x512_Depthwise_Both_Batch_in_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 5};
+    std::vector<int64> filter_dims = {3, 3, 1, 5};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/5);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(6864),  static_cast<T>(7296),  static_cast<T>(7746),
+         static_cast<T>(8214),  static_cast<T>(8700),  static_cast<T>(7809),
+         static_cast<T>(8286),  static_cast<T>(8781),  static_cast<T>(9294),
+         static_cast<T>(9825),  static_cast<T>(10644), static_cast<T>(11256),
+         static_cast<T>(11886), static_cast<T>(12534), static_cast<T>(13200),
+         static_cast<T>(11589), static_cast<T>(12246), static_cast<T>(12921),
+         static_cast<T>(13614), static_cast<T>(14325)});
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 5}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(
+    Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+    TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x5_3x3x1x5_Depthwise_Valid_Output_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({3, 0, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Input_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 160};
+    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/160);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+    auto input_r4_relaid =
+        input_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(640, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4_relaid =
+        expected_r4.Relayout(LayoutUtil::MakeLayout({0, 3, 2, 1}));
+
+    auto input_literal =
+        client_->TransferToServer(input_r4_relaid).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4_relaid,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_, &expected_r4_relaid.shape());
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Dephtwise_Both_Batch_In_Lanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 4, 4, 1024};
+    std::vector<int64> filter_dims = {3, 3, 1, 1024};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
 
-    auto filter_r = filter_r1.Reshape(filter_dims);
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/1024);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
+                               static_cast<T>(1));
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
+                                static_cast<T>(2));
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    std::vector<T> output_elems(4096, static_cast<T>(18));
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
+    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x4x4x512_3x3x1x512_Depthwise_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) {
   this->RunTest();
 }
 
 template <typename T>
-class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
+class Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
-    std::vector<int64> input_dims = {1, 4, 4, 160};
-    std::vector<int64> filter_dims = {3, 3, 1, 160};
+    std::vector<int64> input_dims = {1, 2, 2, 6};
+    std::vector<int64> filter_dims = {2, 2, 2, 12};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
     {
@@ -760,23 +1310,89 @@ class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
       dnums.set_kernel_output_feature_dimension(3);
 
       ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
-                                /*feature_group_count=*/160);
+                                /*feature_group_count=*/3);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 = LiteralUtil::CreateR1<T>(
+        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
+         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
+         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
+         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x6_2x2x2x12_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 1024};
+    std::vector<int64> filter_dims = {2, 2, 128, 512};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/8);
     }
 
     std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
                                static_cast<T>(1));
+
     auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
     auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
 
     std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
                                 static_cast<T>(2));
+
     auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
     auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
 
-    std::vector<T> output_elems(640, static_cast<T>(18));
-
+    std::vector<T> output_elems(512, static_cast<T>(1024));
     auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
-    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 160}).ConsumeValueOrDie();
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 512}).ConsumeValueOrDie();
 
     auto input_literal =
         client_->TransferToServer(input_r4).ConsumeValueOrDie();
@@ -786,24 +1402,21 @@ class Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid : public ConvolutionTest {
     ComputeAndCompareLiteral(&builder, expected_r4,
                              {input_literal.get(), filter_literal.get()},
                              error_spec_);
-
-    auto filter_r = filter_r1.Reshape(filter_dims);
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x4x4x160_3x3x1x160_Depthwise_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x512_Grouped_Valid, Types) {
   this->RunTest();
 }
 
 template <typename T>
-class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
-    : public ConvolutionTest {
+class Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
-    std::vector<int64> input_dims = {1, 4, 4, 1024};
-    std::vector<int64> filter_dims = {3, 3, 1, 1024};
+    std::vector<int64> input_dims = {1, 2, 2, 1024};
+    std::vector<int64> filter_dims = {2, 2, 128, 8};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
     {
@@ -826,23 +1439,24 @@ class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
       dnums.set_kernel_output_feature_dimension(3);
 
       ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
-                                /*feature_group_count=*/1024);
+                                /*feature_group_count=*/8);
     }
 
     std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape),
                                static_cast<T>(1));
+
     auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
     auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
 
     std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape),
                                 static_cast<T>(2));
+
     auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
     auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
 
-    std::vector<T> output_elems(4096, static_cast<T>(18));
-
+    std::vector<T> output_elems(8, static_cast<T>(1024));
     auto expected_r1 = LiteralUtil::CreateR1<T>(output_elems);
-    auto expected_r4 = expected_r1.Reshape({1, 2, 2, 1024}).ConsumeValueOrDie();
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 8}).ConsumeValueOrDie();
 
     auto input_literal =
         client_->TransferToServer(input_r4).ConsumeValueOrDie();
@@ -852,23 +1466,21 @@ class Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid
     ComputeAndCompareLiteral(&builder, expected_r4,
                              {input_literal.get(), filter_literal.get()},
                              error_spec_);
-
-    auto filter_r = filter_r1.Reshape(filter_dims);
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x4x4x1024_3x3x1x1024_Depthwise_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x1024_2x2x128x8_Grouped_Valid, Types) {
   this->RunTest();
 }
 
 template <typename T>
-class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
+class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid : public ConvolutionTest {
  public:
   void RunTest() {
     XlaBuilder builder(TestName());
-    std::vector<int64> input_dims = {1, 2, 2, 6};
-    std::vector<int64> filter_dims = {2, 2, 2, 12};
+    std::vector<int64> input_dims = {1, 2, 2, 12};
+    std::vector<int64> filter_dims = {2, 2, 3, 4};
     Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
     Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
     {
@@ -891,7 +1503,7 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
       dnums.set_kernel_output_feature_dimension(3);
 
       ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
-                                /*feature_group_count=*/3);
+                                /*feature_group_count=*/4);
     }
 
     std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
@@ -904,12 +1516,140 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
     auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
     auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
 
+    auto expected_r1 =
+        LiteralUtil::CreateR1<T>({static_cast<T>(7712), static_cast<T>(8816),
+                                  static_cast<T>(9992), static_cast<T>(11240)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid, Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes
+    : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 2, 2, 12};
+    std::vector<int64> filter_dims = {2, 2, 4, 3};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(3);
+      dnums.set_kernel_output_feature_dimension(2);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/4);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+    auto filter_r4_relaid =
+        filter_r4.Relayout(LayoutUtil::MakeLayout({3, 2, 1, 0}));
     auto expected_r1 = LiteralUtil::CreateR1<T>(
-        {static_cast<T>(5076), static_cast<T>(5160), static_cast<T>(5244),
-         static_cast<T>(5328), static_cast<T>(6164), static_cast<T>(6264),
-         static_cast<T>(6364), static_cast<T>(6464), static_cast<T>(7380),
-         static_cast<T>(7496), static_cast<T>(7612), static_cast<T>(7728)});
-    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 12}).ConsumeValueOrDie();
+        {static_cast<T>(6968), static_cast<T>(8516), static_cast<T>(10280),
+         static_cast<T>(12260)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
+
+    auto input_literal =
+        client_->TransferToServer(input_r4).ConsumeValueOrDie();
+    auto filter_literal =
+        client_->TransferToServer(filter_r4_relaid).ConsumeValueOrDie();
+
+    ComputeAndCompareLiteral(&builder, expected_r4,
+                             {input_literal.get(), filter_literal.get()},
+                             error_spec_);
+  }
+};
+
+TYPED_TEST_CASE(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes,
+                TestTypes);
+TYPED_TEST(Convolve2D_1x2x2x12_2x2x3x4_Grouped_Valid_Filter_OF_In_Sublanes,
+           Types) {
+  this->RunTest();
+}
+
+template <typename T>
+class Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid : public ConvolutionTest {
+ public:
+  void RunTest() {
+    XlaBuilder builder(TestName());
+    std::vector<int64> input_dims = {1, 1, 1, 12};
+    std::vector<int64> filter_dims = {1, 1, 3, 4};
+    Shape input_shape = ShapeUtil::MakeShapeWithType<T>(input_dims);
+    Shape filter_shape = ShapeUtil::MakeShapeWithType<T>(filter_dims);
+    {
+      auto input = Parameter(&builder, 0, input_shape, "input");
+      auto filter = Parameter(&builder, 1, filter_shape, "filter");
+
+      // Tensorflow dimension numbers for 2D convolution.
+      ConvolutionDimensionNumbers dnums;
+      dnums.set_input_batch_dimension(0);
+      dnums.set_output_batch_dimension(0);
+      dnums.add_input_spatial_dimensions(1);
+      dnums.add_output_spatial_dimensions(1);
+      dnums.add_input_spatial_dimensions(2);
+      dnums.add_output_spatial_dimensions(2);
+      dnums.set_input_feature_dimension(3);
+      dnums.set_output_feature_dimension(3);
+      dnums.add_kernel_spatial_dimensions(0);
+      dnums.add_kernel_spatial_dimensions(1);
+      dnums.set_kernel_input_feature_dimension(2);
+      dnums.set_kernel_output_feature_dimension(3);
+
+      ConvWithGeneralDimensions(input, filter, {1, 1}, Padding::kValid, dnums,
+                                /*feature_group_count=*/4);
+    }
+
+    std::vector<T> input_elems(ShapeUtil::ElementsIn(input_shape));
+    iota_int_init_value(input_elems, 1);
+    auto input_r1 = LiteralUtil::CreateR1<T>(input_elems);
+    auto input_r4 = input_r1.Reshape(input_dims).ConsumeValueOrDie();
+
+    std::vector<T> filter_elems(ShapeUtil::ElementsIn(filter_shape));
+    iota_int_init_value(filter_elems, 1);
+    auto filter_r1 = LiteralUtil::CreateR1<T>(filter_elems);
+    auto filter_r4 = filter_r1.Reshape(filter_dims).ConsumeValueOrDie();
+
+    auto expected_r1 =
+        LiteralUtil::CreateR1<T>({static_cast<T>(38), static_cast<T>(98),
+                                  static_cast<T>(176), static_cast<T>(272)});
+    auto expected_r4 = expected_r1.Reshape({1, 1, 1, 4}).ConsumeValueOrDie();
 
     auto input_literal =
         client_->TransferToServer(input_r4).ConsumeValueOrDie();
@@ -922,8 +1662,8 @@ class Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid : public ConvolutionTest {
   }
 };
 
-TYPED_TEST_CASE(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, TestTypes);
-TYPED_TEST(Convolve2D_1x2x2x6_2x2x1x12_Grouped_Valid, Types) {
+TYPED_TEST_CASE(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, TestTypes);
+TYPED_TEST(Convolve2D_1x1x1x12_1x1x3x4_Grouped_Valid, Types) {
   this->RunTest();
 }
 
@@ -1217,6 +1957,18 @@ ENTRY Test {
   EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
 }
 
+XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF32ForwardReversed)) {
+  constexpr char kHlo[] = R"(
+HloModule TestModule
+
+ENTRY Test {
+  %arg0 = f32[3,56,56,16] parameter(0)
+  %arg1 = f32[3,3,3,32] parameter(1)
+  ROOT %conv = f32[54,54,16,32] convolution(%arg0, %arg1), window={size=3x3 rhs_reversal=1x1}, dim_labels=f01b_i01o->01bf
+})";
+  EXPECT_TRUE(RunAndCompare(kHlo, ErrorSpec{0.001}));
+}
+
 XLA_TEST_F(ConvolutionHloTest, DISABLED_ON_CPU(ConvolveF64BackwardFilter)) {
   constexpr char kHlo[] = R"(
 HloModule TestModule
diff --git a/tensorflow/compiler/xla/tests/dot_operation_test.cc b/tensorflow/compiler/xla/tests/dot_operation_test.cc
index 6c0847a875798870b4362a99ac2ab65d99f9f3e6..c5d8b663f4abe77e05ec213d2e4e075c260a8655 100644
--- a/tensorflow/compiler/xla/tests/dot_operation_test.cc
+++ b/tensorflow/compiler/xla/tests/dot_operation_test.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/compiler/xla/tests/test_utils.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace xla {
 namespace {
@@ -637,6 +636,76 @@ XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMul) {
       {x_data.get(), y_data.get()}, this->error_spec_);
 }
 
+#ifndef XLA_TEST_BACKEND_CPU
+// TODO(b/74459949): failed on CPU on 2018-10-29.
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR3LhsR2Rhs) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto x =
+      Parameter(&builder, 0, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "x");
+  auto y = Parameter(&builder, 1, ShapeUtil::MakeShapeWithType<T>({2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  DotGeneral(x, y, dnums);
+
+  auto x_data =
+      this->client_
+          ->TransferToServer(LiteralUtil::CreateR3FromArray3D<T>(
+              {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}))
+          .ConsumeValueOrDie();
+
+  auto y_data = this->client_
+                    ->TransferToServer(LiteralUtil::CreateR2FromArray2D<T>(
+                        {{1.0f, 0.0f}, {0.0f, 1.0f}}))
+                    .ConsumeValueOrDie();
+
+  this->template ComputeAndCompareR2<T>(
+      &builder,
+      /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()},
+      this->error_spec_);
+}
+
+// TODO(b/74459949): failed on CPU on 2018-10-29.
+XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulR2LhsR3Rhs) {
+  using T = TypeParam;
+
+  XlaBuilder builder(this->TestName());
+  auto x = Parameter(&builder, 0, ShapeUtil::MakeShapeWithType<T>({2, 2}), "x");
+  auto y =
+      Parameter(&builder, 1, ShapeUtil::MakeShapeWithType<T>({2, 2, 2}), "y");
+
+  DotDimensionNumbers dnums;
+  dnums.add_lhs_contracting_dimensions(1);
+  dnums.add_rhs_contracting_dimensions(1);
+  dnums.add_lhs_batch_dimensions(0);
+  dnums.add_rhs_batch_dimensions(0);
+
+  DotGeneral(x, y, dnums);
+
+  auto x_data = this->client_
+                    ->TransferToServer(LiteralUtil::CreateR2FromArray2D<T>(
+                        {{1.0f, 0.0f}, {0.0f, 1.0f}}))
+                    .ConsumeValueOrDie();
+
+  auto y_data =
+      this->client_
+          ->TransferToServer(LiteralUtil::CreateR3FromArray3D<T>(
+              {{{1.0f, 2.0f}, {3.0f, 4.0f}}, {{5.0f, 6.0f}, {7.0f, 8.0f}}}))
+          .ConsumeValueOrDie();
+
+  this->template ComputeAndCompareR2<T>(
+      &builder,
+      /*expected=*/{{1.0f, 2.0f}, {7.0f, 8.0f}}, {x_data.get(), y_data.get()},
+      this->error_spec_);
+}
+#endif  // XLA_TEST_BACKEND_CPU
+
 XLA_TYPED_TEST(DotOperationTest_F16F32F64CF64, GeneralMatMulMultipleBatch) {
   using T = TypeParam;
 
diff --git a/tensorflow/compiler/xla/tests/grouped_convolution_test.cc b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f7049910e70c4e591636a47c1b6ba72cf2c234f
--- /dev/null
+++ b/tensorflow/compiler/xla/tests/grouped_convolution_test.cc
@@ -0,0 +1,245 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "absl/types/optional.h"
+#include "tensorflow/compiler/xla/client/xla_computation.h"
+#include "tensorflow/compiler/xla/execution_options_util.h"
+#include "tensorflow/compiler/xla/service/bfloat16_normalization.h"
+#include "tensorflow/compiler/xla/service/despecializer.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
+#include "tensorflow/compiler/xla/status_macros.h"
+#include "tensorflow/compiler/xla/test.h"
+#include "tensorflow/compiler/xla/tests/client_library_test_base.h"
+#include "tensorflow/compiler/xla/tests/hlo_test_base.h"
+#include "tensorflow/compiler/xla/tests/test_macros.h"
+
+namespace xla {
+namespace {
+
+string GetFloatDataType(bool use_bfloat16) {
+  return use_bfloat16 ? "bf16" : "f32";
+}
+
+struct GroupedConvolution2DSpec {
+  int64 input_feature, output_feature, window, stride, pad, lhs_dilate;
+  int64 group_size, group_count;
+  std::vector<int64> activation_dims;
+  std::vector<int64> activation_layout;
+  std::vector<int64> kernel_dims;
+  std::vector<int64> kernel_layout;
+  std::vector<int64> output_dims;
+  std::vector<int64> output_layout;
+};
+
+class GroupedConvolution2DTest
+    : public HloTestBase,
+      public ::testing::WithParamInterface<
+          ::testing::tuple<GroupedConvolution2DSpec, bool>> {};
+
+static std::vector<GroupedConvolution2DSpec> GetConv2DTestCases() {
+  std::vector<GroupedConvolution2DSpec> config_set;
+  // Add to this set if you want a new test configuration.
+  // Rule : the penultimate number must be divisible by the last number.
+  std::vector<std::vector<int64>> config_options = {{8, 2, 2, 1, 1024, 128},
+                                                    {512, 3, 3, 144, 1024, 16},
+                                                    {256, 3, 3, 129, 512, 64},
+                                                    {64, 1, 2, 127, 32, 8},
+                                                    {256, 3, 3, 256, 1024, 4}};
+
+  for (auto option : config_options) {
+    int64 output_feature = option[0];
+    int64 activation_size = option[1];
+    int64 kernel_size = option[2];
+    int64 batch = option[3];
+    int64 input_feature = option[4];
+    int64 group_size = option[5];
+
+    std::vector<int64> kernel_layout = {3, 2, 1, 0};
+    GroupedConvolution2DSpec config;
+    config.group_size = group_size;
+    config.group_count = input_feature / group_size;
+    config.output_feature = output_feature;
+    config.window = kernel_size;
+
+    config.activation_dims = {batch, activation_size, activation_size,
+                              input_feature};
+    config.activation_layout = {3, 0, 2, 1};
+
+    config.kernel_dims = {kernel_size, kernel_size, group_size, output_feature};
+    config.kernel_layout = {3, 2, 1, 0};
+
+    if (activation_size == 1 && kernel_size == 2) {
+      // Test for outer dim.
+      config.output_dims = {batch, activation_size + kernel_size - 1,
+                            activation_size + kernel_size, output_feature};
+    } else if (output_feature == 256) {
+      // Restrict dilation-based tests only to one feature configuration.
+      config.stride = activation_size - 1;
+      config.pad = 0;
+      config.lhs_dilate = output_feature / 32;
+      config.output_dims = {batch, output_feature / 32,
+                            activation_size - kernel_size + 1, output_feature};
+    } else {
+      config.stride = config.pad = config.lhs_dilate = -1;
+      config.output_dims = {batch, activation_size - kernel_size + 1,
+                            activation_size - kernel_size + 1, output_feature};
+    }
+
+    // Try this layout for all kernel shapes.
+    config.output_layout = {3, 0, 2, 1};
+    config_set.push_back(config);
+
+    // Try other layouts only for certain kernel shapes.
+    if (kernel_size % 2 == 0) {
+      config.activation_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.output_layout = {0, 3, 2, 1};
+      config_set.push_back(config);
+
+      config.activation_layout = {3, 0, 2, 1};
+      config_set.push_back(config);
+    }
+  }
+
+  return config_set;
+}
+
+string GroupedConvolution2DTestDataToString(
+    const ::testing::TestParamInfo<
+        ::testing::tuple<GroupedConvolution2DSpec, bool>>& data) {
+  const auto& spec = ::testing::get<0>(data.param);
+  const string data_type = GetFloatDataType(::testing::get<1>(data.param));
+  string str = absl::StrCat(
+      "activation_dims_", absl::StrJoin(spec.activation_dims, "x"),
+      "_activation_layout_", absl::StrJoin(spec.activation_layout, "_"),
+      "_kernel_dims_", absl::StrJoin(spec.kernel_dims, "x"), "_kernel_layout_",
+      absl::StrJoin(spec.kernel_layout, "_"), "_output_dims_",
+      absl::StrJoin(spec.output_dims, "x"), "_output_layout_",
+      absl::StrJoin(spec.output_layout, "_"), data_type);
+  // -1 indicates non-existence.
+  if (spec.stride != -1) {
+    absl::StrAppend(&str, "_lhs_dilation_", spec.lhs_dilate, "x1");
+  }
+
+  // Test names are not allowed to contain the '-' character.
+  absl::c_replace(str, '-', 'n');
+  return str;
+}
+
+string BuildHloTextGroupedConvolution2D(const GroupedConvolution2DSpec& spec,
+                                        bool use_bfloat16) {
+  const string data_type = GetFloatDataType(use_bfloat16);
+  if (spec.activation_dims[1] == 1 && spec.kernel_dims[1] == 2) {
+    // Check for outer dim.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d  pad=1_1x%d_%d rhs_dilate=1x%d}, dim_labels=b01f_01io->b01f,
+          feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.window, spec.window, spec.window, spec.group_count);
+
+  } else if (spec.stride == -1) {
+    // Check for basic, non-dilated cases.
+    return absl::StrFormat(
+        R"(
+      HloModule TensorFlowDepthwiseConv
+
+      ENTRY main {
+        activation = %s[%s]{%s} parameter(0)
+        kernel = %s[%s]{%s} parameter(1)
+        ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+            window={size=%dx%d}, dim_labels=b01f_01io->b01f,
+            feature_group_count=%d
+      }
+      )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.group_count);
+  } else {
+    // Check for base dilations.
+    return absl::StrFormat(
+        R"(
+    HloModule TensorFlowDepthwiseConv
+
+    ENTRY main {
+      activation = %s[%s]{%s} parameter(0)
+      kernel = %s[%s]{%s} parameter(1)
+      ROOT conv = %s[%s]{%s} convolution(%s[%s]{%s} activation, %s[%s]{%s} kernel),
+          window={size=%dx%d stride=%dx1 pad=%d_%dx0_0 lhs_dilate=%dx1}, 
+          dim_labels=b01f_01io->b01f, feature_group_count=%d
+    }
+    )",
+        data_type, absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), data_type,
+        absl::StrJoin(spec.output_dims, ","),
+        absl::StrJoin(spec.output_layout, ","), data_type,
+        absl::StrJoin(spec.activation_dims, ","),
+        absl::StrJoin(spec.activation_layout, ","), data_type,
+        absl::StrJoin(spec.kernel_dims, ","),
+        absl::StrJoin(spec.kernel_layout, ","), spec.window, spec.window,
+        spec.stride, 0, 0, spec.lhs_dilate, spec.group_count);
+  }
+}
+
+XLA_TEST_P(GroupedConvolution2DTest, DoIt) {
+  const GroupedConvolution2DSpec& spec = ::testing::get<0>(GetParam());
+  bool use_bfloat16 = ::testing::get<1>(GetParam());
+  const string hlo_text = BuildHloTextGroupedConvolution2D(spec, use_bfloat16);
+
+  EXPECT_TRUE(RunAndCompare(hlo_text, ErrorSpec{0.01, 0.01},
+                            [](HloModule* module) -> Status {
+                              BFloat16MixedPrecisionRemoval remover;
+                              TF_RETURN_IF_ERROR(remover.Run(module).status());
+                              Despecializer despecializer;
+                              return despecializer.Run(module).status();
+                            }));
+}
+
+INSTANTIATE_TEST_CASE_P(
+    GroupedConvolution2DTestWithRandomIndices, GroupedConvolution2DTest,
+    ::testing::Combine(::testing::ValuesIn(GetConv2DTestCases()),
+                       ::testing::Bool()),
+    GroupedConvolution2DTestDataToString);
+
+}  // namespace
+}  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.cc b/tensorflow/compiler/xla/tests/hlo_test_base.cc
index d8fa00272f8f19ab843fd32a66fd6d6842997bdb..989a7c705a8254f99e5cc0e97dfde5942f146964 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.cc
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.cc
@@ -99,6 +99,8 @@ void VerifiedHloModule::VerifyOrAddFailure(const string& message) {
     ADD_FAILURE() << "HloVerifier failed on module " << name()
                   << (message.empty() ? "" : absl::StrCat(" (", message, ")"))
                   << ": " << status;
+    LOG(ERROR) << "Contents of bad module:";
+    XLA_LOG_LINES(tensorflow::ERROR, ToString());
   }
 }
 
@@ -140,14 +142,6 @@ std::unique_ptr<VerifiedHloModule> HloTestBase::CreateNewVerifiedModule(
       allow_mixed_precision_in_hlo_verifier_);
 }
 
-StatusOr<std::unique_ptr<HloModule>>
-HloTestBase::ParseAndReturnUnverifiedModule(absl::string_view hlo_text,
-                                            const HloModuleConfig& config) {
-  auto module = absl::make_unique<HloModule>(TestName(), config);
-  TF_RETURN_IF_ERROR(ParseHloString(hlo_text, module.get()));
-  return std::move(module);
-}
-
 StatusOr<std::unique_ptr<VerifiedHloModule>>
 HloTestBase::ParseAndReturnVerifiedModule(absl::string_view hlo_text,
                                           const HloModuleConfig& config) {
diff --git a/tensorflow/compiler/xla/tests/hlo_test_base.h b/tensorflow/compiler/xla/tests/hlo_test_base.h
index 366726d90b4752b6d53dc2133c8b0b5bbafce086..1d1e7f437296a7493ef7da07039fcf6d273f35bc 100644
--- a/tensorflow/compiler/xla/tests/hlo_test_base.h
+++ b/tensorflow/compiler/xla/tests/hlo_test_base.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include <string>
 #include <vector>
 
+#include "absl/base/macros.h"
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
 #include "tensorflow/compiler/xla/service/backend.h"
@@ -100,6 +101,7 @@ class HloTestBase : public ::testing::Test {
   //
   // This returns a vanilla HloModule that doesn't run the HLO verifier on
   // destruction.
+  ABSL_DEPRECATED("Use CreateNewVerifiedModule instead.")
   std::unique_ptr<HloModule> CreateNewUnverifiedModule(
       const string& name = TestName());
 
@@ -108,12 +110,6 @@ class HloTestBase : public ::testing::Test {
   std::unique_ptr<VerifiedHloModule> CreateNewVerifiedModule(
       const string& name = TestName());
 
-  // Parses the given string and returns module as a vanilla, unverified
-  // HloModule.
-  StatusOr<std::unique_ptr<HloModule>> ParseAndReturnUnverifiedModule(
-      absl::string_view hlo_text,
-      const HloModuleConfig& config = HloModuleConfig());
-
   // Parses the given string and returns module as a VerifiedHloModule.
   StatusOr<std::unique_ptr<VerifiedHloModule>> ParseAndReturnVerifiedModule(
       absl::string_view hlo_text,
diff --git a/tensorflow/compiler/xla/tests/iota_test.cc b/tensorflow/compiler/xla/tests/iota_test.cc
index 310f3495922250d68aa463fcbb24ef0b04603d09..65205f53ddc582ae477d67705f161fef1e31b857 100644
--- a/tensorflow/compiler/xla/tests/iota_test.cc
+++ b/tensorflow/compiler/xla/tests/iota_test.cc
@@ -113,5 +113,26 @@ INSTANTIATE_TEST_CASE_P(IotaR3TestInstantiation, IotaR3Test,
                                                             /*step=*/10),
                                            ::testing::Values(0, 1, 2)));
 
+class IotaR3PredTest : public ClientLibraryTestBase,
+                       public ::testing::WithParamInterface<int> {};
+
+TEST_P(IotaR3PredTest, DoIt) {
+  const auto element_type = PRED;
+  const int64 num_elements = 2;
+  const int64 iota_dim = GetParam();
+  XlaBuilder builder(TestName() + "_" + PrimitiveType_Name(element_type));
+  std::vector<int64> dimensions = {42, 19};
+  dimensions.insert(dimensions.begin() + iota_dim, num_elements);
+  Iota(&builder, ShapeUtil::MakeShape(element_type, dimensions), iota_dim);
+  if (primitive_util::IsFloatingPointType(element_type)) {
+    ComputeAndCompare(&builder, {}, ErrorSpec{0.0001});
+  } else {
+    ComputeAndCompare(&builder, {});
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(IotaR3PredTestInstantiation, IotaR3PredTest,
+                        ::testing::Values(0, 1, 2));
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/replay_test.cc b/tensorflow/compiler/xla/tests/replay_test.cc
index 5cf87e565bf493167f5173588e7afa3b96282488..34c7dc7c46427b2d18ea21fc286ee03175f70800 100644
--- a/tensorflow/compiler/xla/tests/replay_test.cc
+++ b/tensorflow/compiler/xla/tests/replay_test.cc
@@ -55,7 +55,8 @@ TEST_F(ReplayTest, TwoPlusTwoReplay) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   Literal literal =
@@ -87,7 +88,8 @@ XLA_TEST_F(ReplayTest, XPlusYReplayWithParameters) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   std::unique_ptr<GlobalData> x_data =
@@ -133,7 +135,8 @@ TEST_F(ReplayTest, MapPlusTwoOverR1) {
       client_->GetComputationShape(computation).ConsumeValueOrDie();
   std::unique_ptr<ProgramShape> replayed_shape =
       client_->GetComputationShape(replayed).ConsumeValueOrDie();
-  ASSERT_TRUE(protobuf_util::ProtobufEquals(*original_shape, *replayed_shape));
+  ASSERT_TRUE(protobuf_util::ProtobufEquals(original_shape->ToProto(),
+                                            replayed_shape->ToProto()));
 
   // Run it.
   Literal literal =
diff --git a/tensorflow/compiler/xla/tests/reshape_test.cc b/tensorflow/compiler/xla/tests/reshape_test.cc
index dedc95b5ae8315185a35f786af42aad53bd7ad96..298136002e9ef47188e0bae95af3f596596e6062 100644
--- a/tensorflow/compiler/xla/tests/reshape_test.cc
+++ b/tensorflow/compiler/xla/tests/reshape_test.cc
@@ -618,7 +618,8 @@ XLA_TEST_P(ReshapeTest, R4Dim0MinorLayoutToR2Dim0MajorLayout) {
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {2, 8},
-                                     {1, 0});
+                                     {1, 0})
+          .ToProto();
   Literal actual =
       client_
           ->ExecuteAndTransfer(computation, {input.get()}, &execution_options)
@@ -767,7 +768,8 @@ XLA_TEST_P(ReshapeTest, NoopReshape) {
   ExecutionOptions execution_options = execution_options_;
   *execution_options.mutable_shape_with_output_layout() =
       ShapeUtil::MakeShapeWithLayout(use_bfloat16() ? BF16 : F32, {7, 2, 3, 5},
-                                     {2, 3, 0, 1});
+                                     {2, 3, 0, 1})
+          .ToProto();
   Literal output_literal =
       client_
           ->ExecuteAndTransfer(computation, {input_data.get()},
diff --git a/tensorflow/compiler/xla/tests/scatter_test.cc b/tensorflow/compiler/xla/tests/scatter_test.cc
index 7e1f4aa0eb4801876d9bdbac6a4d7f1d09f81ba8..32de0fdf78f9c442e17c55e1b951e39122dac5ef 100644
--- a/tensorflow/compiler/xla/tests/scatter_test.cc
+++ b/tensorflow/compiler/xla/tests/scatter_test.cc
@@ -129,6 +129,42 @@ ENTRY main {
   RunTest(hlo_text, &operand, &scatter_indices, &updates);
 }
 
+XLA_TEST_F(ScatterTest, TensorFlowScatterV2_InversePermutation) {
+  const char* hlo_text = R"(
+HloModule TensorFlowScatterV2
+
+update_s32 (lhs: s32[], rhs: s32[]) -> s32[] {
+  lhs = s32[] parameter(0)
+  ROOT rhs = s32[] parameter(1)
+}
+
+ENTRY main {
+  permutation = s32[3,4] parameter(0)
+  reshape = s32[3,4,1] reshape(permutation)
+  operand = s32[3,4] iota(), iota_dimension=1
+  updates = s32[3,4,1,1] iota(), iota_dimension=1
+  iota = s32[3,4,1] iota(), iota_dimension=0
+  indices = s32[3,4,2] concatenate(iota, reshape), dimensions={2}
+  ROOT scatter = s32[3,4] scatter(operand, indices, updates),
+      to_apply=update_s32,
+      update_window_dims={2,3},
+      inserted_window_dims={},
+      scatter_dims_to_operand_dims={0,1},
+      index_vector_dim=2
+}
+)";
+  Literal permutation =
+      LiteralUtil::CreateR2<int32>({{1, 3, 2, 0}, {3, 0, 2, 1}, {2, 3, 1, 0}});
+  HloModuleConfig config;
+  config.set_debug_options(GetDebugOptionsForTest());
+  TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr<HloModule> module,
+                          ParseHloString(hlo_text, config));
+  auto actual = ExecuteAndTransfer(std::move(module), {&permutation});
+  Literal expected =
+      LiteralUtil::CreateR2<int32>({{3, 0, 2, 1}, {1, 3, 2, 0}, {3, 2, 0, 1}});
+  EXPECT_TRUE(LiteralTestUtil::Equal(expected, actual));
+}
+
 XLA_TEST_F(ScatterTest, SimpleR4) {
   const char* hlo_text = R"(
 HloModule SimpleR4
diff --git a/tensorflow/compiler/xla/tests/test_utils.cc b/tensorflow/compiler/xla/tests/test_utils.cc
index 2f18036ff4c5b0bfa28723fb181c33fa6995eb80..eafa48ed7b8cf2bd67fe767ad36082661dbbd66e 100644
--- a/tensorflow/compiler/xla/tests/test_utils.cc
+++ b/tensorflow/compiler/xla/tests/test_utils.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 
+#include "absl/base/casts.h"
 #include "absl/memory/memory.h"
 #include "tensorflow/compiler/xla/literal_util.h"
 #include "tensorflow/compiler/xla/primitive_util.h"
@@ -28,65 +29,113 @@ namespace xla {
 namespace {
 
 template <typename FloatT, typename GeneratorT>
-void PopulateWithRandomFloatingPointDataImpl(Literal* literal,
-                                             std::minstd_rand0* engine,
-                                             bool no_duplicates) {
+void PopulateWithRandomFloatingPointData(Literal* literal,
+                                         std::minstd_rand0* engine) {
+  std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
+  for (FloatT& value : literal->data<FloatT>()) {
+    value = static_cast<FloatT>(generator(*engine));
+  }
+}
+
+template <typename FloatT>
+void PopulateWithIntNext(Literal* literal);
+
+template <>
+void PopulateWithIntNext<half>(Literal* literal) {
+  // Duplicates may be generated if we don't have enough bits.
+  uint16 next_value = 0;
+  for (half& value : literal->data<half>()) {
+    // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into
+    // the sign bit. We could be less wasteful, but this is best-effort anyway.
+    uint16 exponent_msb = next_value & 0x4000;
+    value.x = (next_value & 0xBFFF) | (exponent_msb << 1);
+    next_value++;
+  }
+}
+
+template <>
+void PopulateWithIntNext<bfloat16>(Literal* literal) {
+  // Duplicates may be generated if we don't have enough bits.
+  // Start at 0x80 rather than 0 to avoid denormals.
+  uint16 next_value = 0x80;
+  for (bfloat16& value : literal->data<bfloat16>()) {
+    // Zero-out the MSB of the exponent to avoid Infs and NaNs, and put it into
+    // the sign bit. We could be less wasteful, but this is best-effort anyway.
+    uint16 exponent_msb = next_value & 0x4000;
+    value.value = (next_value & 0xBFFF) | (exponent_msb << 1);
+    next_value++;
+  }
+}
+
+template <typename FloatT>
+void PopulateWithNextAfter(Literal* literal) {
+  // Duplicates may be generated if the number of elements in the literal
+  // exceeds the number of positive values supported by the type.
+  float next_value = std::numeric_limits<float>::min();
+  for (float& value : literal->data<float>()) {
+    value = next_value;
+    next_value = std::nextafter(next_value, std::numeric_limits<float>::max());
+  }
+}
+
+template <typename FloatT,
+          typename std::enable_if<std::is_same<bfloat16, FloatT>::value ||
+                                      std::is_same<half, FloatT>::value,
+                                  int>::type = 0>
+void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) {
+  PopulateWithIntNext<FloatT>(literal);
+  std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+               *engine);
+}
+
+template <typename FloatT,
+          typename std::enable_if<!std::is_same<bfloat16, FloatT>::value &&
+                                      !std::is_same<half, FloatT>::value,
+                                  int>::type = 0>
+void PopulateWithNoDuplicateData(Literal* literal, std::minstd_rand0* engine) {
+  PopulateWithNextAfter<FloatT>(literal);
+  std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
+               *engine);
+}
+
+template <typename FloatT>
+void PopulateWithFloatingPointData(Literal* literal, std::minstd_rand0* engine,
+                                   bool no_duplicates) {
   CHECK(engine != nullptr);
   CHECK_EQ(literal->shape().element_type(),
            primitive_util::NativeToPrimitiveType<FloatT>());
   if (no_duplicates) {
-    // Duplicates may be generated if the number of elements in the literal
-    // exceeds the number of positive values supported by the type.
-    FloatT next_value = std::numeric_limits<FloatT>::min();
-    for (FloatT& value : literal->data<FloatT>()) {
-      value = next_value;
-      next_value =
-          std::nextafter(next_value, std::numeric_limits<FloatT>::max());
-    }
-    std::shuffle(literal->data<FloatT>().begin(), literal->data<FloatT>().end(),
-                 *engine);
+    PopulateWithNoDuplicateData<FloatT>(literal, engine);
   } else {
-    std::uniform_real_distribution<GeneratorT> generator(-0.1f, 0.2f);
-    for (FloatT& value : literal->data<FloatT>()) {
-      value = static_cast<FloatT>(generator(*engine));
-    }
+    PopulateWithRandomFloatingPointData<FloatT, FloatT>(literal, engine);
   }
 }
 
-template <typename FloatT>
-void PopulateWithRandomFloatingPointData(Literal* literal,
+template <>
+void PopulateWithFloatingPointData<half>(Literal* literal,
                                          std::minstd_rand0* engine,
                                          bool no_duplicates) {
   CHECK(engine != nullptr);
-  PopulateWithRandomFloatingPointDataImpl<FloatT, FloatT>(literal, engine,
-                                                          no_duplicates);
-}
-
-template <>
-void PopulateWithRandomFloatingPointData<half>(Literal* literal,
-                                               std::minstd_rand0* engine,
-                                               bool no_duplicates) {
-  // no_duplicates is ignored for half types. Unique values can only be
-  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
-  // best-effort anyway.
-  CHECK(engine != nullptr);
-  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
-  for (half& value : literal->data<half>()) {
-    value = static_cast<half>(generator(*engine));
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<half>());
+  if (no_duplicates) {
+    PopulateWithNoDuplicateData<half>(literal, engine);
+  } else {
+    PopulateWithRandomFloatingPointData<half, float>(literal, engine);
   }
 }
 
 template <>
-void PopulateWithRandomFloatingPointData<bfloat16>(Literal* literal,
-                                                   std::minstd_rand0* engine,
-                                                   bool no_duplicates) {
-  // no_duplicates is ignored for bfloat types. Unique values can only be
-  // generated for arrays with fewer than ~2**16 elements and no_duplicates is
-  // best-effort anyway.
+void PopulateWithFloatingPointData<bfloat16>(Literal* literal,
+                                             std::minstd_rand0* engine,
+                                             bool no_duplicates) {
   CHECK(engine != nullptr);
-  std::uniform_real_distribution<float> generator(-0.1f, 0.2f);
-  for (bfloat16& value : literal->data<bfloat16>()) {
-    value = static_cast<bfloat16>(generator(*engine));
+  CHECK_EQ(literal->shape().element_type(),
+           primitive_util::NativeToPrimitiveType<bfloat16>());
+  if (no_duplicates) {
+    PopulateWithNoDuplicateData<bfloat16>(literal, engine);
+  } else {
+    PopulateWithRandomFloatingPointData<bfloat16, float>(literal, engine);
   }
 }
 
@@ -135,20 +184,16 @@ StatusOr<Literal> MakeFakeLiteralInternal(const Shape& shape,
   Literal literal(shape);
   switch (shape.element_type()) {
     case BF16:
-      PopulateWithRandomFloatingPointData<bfloat16>(&literal, engine,
-                                                    no_duplicates);
+      PopulateWithFloatingPointData<bfloat16>(&literal, engine, no_duplicates);
       break;
     case F16:
-      PopulateWithRandomFloatingPointData<half>(&literal, engine,
-                                                no_duplicates);
+      PopulateWithFloatingPointData<half>(&literal, engine, no_duplicates);
       break;
     case F32:
-      PopulateWithRandomFloatingPointData<float>(&literal, engine,
-                                                 no_duplicates);
+      PopulateWithFloatingPointData<float>(&literal, engine, no_duplicates);
       break;
     case F64:
-      PopulateWithRandomFloatingPointData<double>(&literal, engine,
-                                                  no_duplicates);
+      PopulateWithFloatingPointData<double>(&literal, engine, no_duplicates);
       break;
     case S8:
       PopulateWithRandomIntegralData<int8>(&literal, engine, no_duplicates);
diff --git a/tensorflow/compiler/xla/tests/test_utils_test.cc b/tensorflow/compiler/xla/tests/test_utils_test.cc
index e066b3f4f224e80dab1b69c12fe76855d2967401..e8f5d7a9a79ebddea3cb989dbe8eab90b630d5e7 100644
--- a/tensorflow/compiler/xla/tests/test_utils_test.cc
+++ b/tensorflow/compiler/xla/tests/test_utils_test.cc
@@ -175,5 +175,28 @@ ENTRY %sort.148.1589 (parameter.0: s32[1048576], parameter.1: s32[1048576]) -> (
   }
 }
 
+XLA_TEST_F(TestUtilsTest, NoDuplicatesBfloat16) {
+  // Inputs which are sort keys in key/value sorts should have no duplicates.
+  auto module = ParseHloString(R"(
+HloModule sort, is_scheduled=true
+
+ENTRY %sort. (parameter.0: bf16[2,1452], parameter.1: s32[2,1452]) -> (bf16[2,1452], s32[2,1452]) {
+  %parameter.0 = bf16[2,1452]{1,0} parameter(0)
+  %parameter.1 = s32[2,1452]{1,0} parameter(1)
+  ROOT %sort = (bf16[2,1452]{1,0}, s32[2,1452]{1,0}) sort(bf16[2,1452]{1,0} %parameter.0, s32[2,1452]{1,0} %parameter.1), dimensions={1}
+}
+)")
+                    .ValueOrDie();
+  TF_ASSERT_OK_AND_ASSIGN(std::vector<Literal> args,
+                          MakeFakeArguments(module.get()));
+  ASSERT_EQ(args.size(), 2);
+  const Literal& key_arg = args[0];
+
+  absl::flat_hash_set<uint16> key_set;
+  for (const bfloat16& value : key_arg.data<bfloat16>()) {
+    EXPECT_TRUE(key_set.insert(absl::bit_cast<uint16>(value)).second);
+  }
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/token_hlo_test.cc b/tensorflow/compiler/xla/tests/token_hlo_test.cc
index a2b7c26331b3cc89ed0413efe8eb31c2b9e37038..601c6b06938fef1f1ae809b33209ae59b24c70a2 100644
--- a/tensorflow/compiler/xla/tests/token_hlo_test.cc
+++ b/tensorflow/compiler/xla/tests/token_hlo_test.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #include <array>
 
 #include "absl/strings/str_cat.h"
+#include "tensorflow/compiler/xla/service/hlo_parser.h"
 #include "tensorflow/compiler/xla/service/hlo_verifier.h"
 #include "tensorflow/compiler/xla/tests/hlo_test_base.h"
 #include "tensorflow/compiler/xla/tests/test_macros.h"
@@ -108,26 +109,6 @@ XLA_TEST_F(TokenHloTest, InvalidTupleTokenShapedEntryParameter) {
       ::testing::HasSubstr("Entry parameter 0 is or contains a token shape"));
 }
 
-XLA_TEST_F(TokenHloTest, InvalidOperandToTokenInstruction) {
-  std::unique_ptr<HloModule> module = CreateNewUnverifiedModule();
-  auto builder = HloComputation::Builder(TestName());
-  auto param = builder.AddInstruction(
-      HloInstruction::CreateParameter(0, ShapeUtil::MakeShape(F32, {}), "p0"));
-  builder.AddInstruction(HloInstruction::CreateAfterAll({param}));
-  builder.AddInstruction(
-      HloInstruction::CreateConstant(LiteralUtil::CreateR0<int32>(123)));
-  module->AddEntryComputation(builder.Build());
-
-  Status status =
-      HloVerifier(/*layout_sensitive=*/false, /*allow_mixed_precision=*/false)
-          .Run(module.get())
-          .status();
-  ASSERT_IS_NOT_OK(status);
-  EXPECT_THAT(status.error_message(),
-              ::testing::HasSubstr(
-                  "Operands of token instructions must be TOKEN types"));
-}
-
 XLA_TEST_F(TokenHloTest, TokenInWhileLoop) {
   // Thread a token around a while loop. Token is created and consumed by a
   // AfterAll instruction in the while body.
@@ -220,5 +201,95 @@ ENTRY %TokenInConditional (param.3: pred[]) -> s32[] {
   }
 }
 
+XLA_TEST_F(TokenHloTest, AddDependency) {
+  string module_string = R"(
+HloModule AddDependency, is_scheduled=true
+
+// Computes (p0 + 42) * (-p1)
+// where there is a dependency from the add to the negation using a token
+// with after-all and add-dependency instructions.
+ENTRY %AddDependency (p0: f32[], p1: f32[]) -> f32[] {
+  %p0 = f32[] parameter(0)
+  %p1 = f32[] parameter(1)
+
+  %forty_two = f32[] constant(42.0)
+  %add = f32[] add(f32[] %p0, f32[] %forty_two)
+  %token = token[] after-all(f32[] %add)
+  %p1_after_token = f32[] add-dependency(f32[] %p1, token[] %token)
+  %neg = f32[] negate(f32[] %p1_after_token)
+  ROOT %product = f32[] multiply(f32[] %add, f32[] %neg)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR0<float>(10.0);
+  auto p1 = LiteralUtil::CreateR0<float>(3.0);
+  auto expected = LiteralUtil::CreateR0<float>(-156.0);
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1}));
+}
+
+XLA_TEST_F(TokenHloTest, AddDependencyOfConstant) {
+  string module_string = R"(
+HloModule AddDependencyOfConstant, is_scheduled=true
+
+ENTRY %AddDependency (p0: f32[]) -> f32[] {
+  %p0 = f32[] parameter(0)
+  %forty_two = f32[] constant(42.0)
+  %token = token[] after-all(f32[] %p0)
+  %forty_two_after_token = f32[] add-dependency(f32[] %forty_two, token[] %token)
+  ROOT %product = f32[] multiply(f32[] %p0, f32[] %forty_two_after_token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR0<float>(10.0);
+  auto expected = LiteralUtil::CreateR0<float>(420.0);
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0}));
+}
+
+XLA_TEST_F(TokenHloTest, AddDependencyAsRoot) {
+  string module_string = R"(
+HloModule AddDependencyAsRoot, is_scheduled=true
+ENTRY %AddDependency (p: f32[3]) -> f32[3] {
+  %p = f32[3] parameter(0)
+  %neg = f32[3] negate(f32[3] %p)
+  %token = token[] after-all()
+  ROOT %add_dep = f32[3] add-dependency(f32[3] %neg, token[] %token)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto input = LiteralUtil::CreateR1<float>({1.0, 3.0, 7.0});
+  auto expected = LiteralUtil::CreateR1<float>({-1.0, -3.0, -7.0});
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&input}));
+}
+
+XLA_TEST_F(TokenHloTest, TupleShapedAddDependency) {
+  string module_string = R"(
+HloModule TupleShapedAddDependency, is_scheduled=true
+ENTRY %TupleShapedAddDependency (p0: f32[3], p1: f32[3]) -> f32[3] {
+  %p0 = f32[3] parameter(0)
+  %p1 = f32[3] parameter(1)
+  %forty_two = f32[] constant(42.0)
+  %token = token[] after-all()
+  %tuple = (f32[3], token[], f32[3], f32[]) tuple(f32[3] %p0, token[] %token, f32[3] %p1, f32[] %forty_two)
+  %add_dep = (f32[3], token[], f32[3], f32[]) add-dependency((f32[3], token[], f32[3], f32[]) %tuple, token[] %token)
+  %elem0 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=0
+  %elem2 = f32[3] get-tuple-element((f32[3], token[], f32[3], f32[]) %add_dep), index=2
+  ROOT %diff = f32[3] subtract(f32[3] %elem0, f32[3] %elem2)
+}
+)";
+  TF_ASSERT_OK_AND_ASSIGN(
+      std::unique_ptr<HloModule> module,
+      ParseHloString(module_string, GetModuleConfigForTest()));
+  auto p0 = LiteralUtil::CreateR1<float>({3.0, 3.0, 47.0});
+  auto p1 = LiteralUtil::CreateR1<float>({1.0, -2.0, 2.0});
+  auto expected = LiteralUtil::CreateR1<float>({2.0, 5.0, 45.0});
+  EXPECT_EQ(expected, ExecuteNoHloPasses(std::move(module), {&p0, &p1}));
+}
+
 }  // namespace
 }  // namespace xla
diff --git a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
index ca036f1ae0d5e31a3f83d9d31c80e070c2a666df..e57d072a0632b492b8b6e34439f4e80332b843b6 100644
--- a/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
+++ b/tensorflow/compiler/xla/tests/xla_hlo_profile_test.cc
@@ -157,10 +157,12 @@ void ExecuteAndFetchProfile(string* profile_output, LocalClient* client,
   TF_ASSERT_OK(transfer_manager->TransferLiteralToDevice(
       stream_ptr.get(), Literal::CreateFromShape(rhs_arg_shape), rhs_arg));
 
+  ExecutableBuildOptions build_options;
+  build_options.mutable_debug_options()->set_xla_hlo_profile(true);
   TF_ASSERT_OK_AND_ASSIGN(
       std::unique_ptr<LocalExecutable> local_executable,
       client->Compile(computation, {&lhs_arg_shape, &rhs_arg_shape},
-                      ExecutableBuildOptions().set_hlo_profile(true)));
+                      build_options));
 
   Executable* executable = local_executable->executable();
   HloExecutionProfile hlo_execution_profile(
@@ -208,7 +210,7 @@ XLA_TEST_F(HloProfileTest, ProfileSingleComputation) {
   string profile_output;
   ExecuteAndFetchProfile(&profile_output, client, computation, lhs_shape,
                          rhs_shape);
-
+  VLOG(4) << "Profile Output:\n" << profile_output;
   std::vector<string> profile_output_lines =
       absl::StrSplit(profile_output, '\n');
 
diff --git a/tensorflow/compiler/xla/tools/replay_computation.cc b/tensorflow/compiler/xla/tools/replay_computation.cc
index 47be9f5adf1063463d7678579a7f394684aaf357..ff2c3399928c0e6339304323c4f93e212933a340 100644
--- a/tensorflow/compiler/xla/tools/replay_computation.cc
+++ b/tensorflow/compiler/xla/tools/replay_computation.cc
@@ -82,13 +82,17 @@ struct Options {
 std::unique_ptr<LocalExecutable> CompileExecutable(const HloSnapshot& module,
                                                    LocalClient* client) {
   XlaComputation computation(module.hlo().hlo_module());
-  std::vector<const Shape*> argument_layouts;
-  for (const auto& param :
+  std::vector<Shape> argument_layouts;
+  argument_layouts.reserve(
+      computation.proto().host_program_shape().parameters_size());
+  std::vector<const Shape*> argument_layout_ptrs;
+  for (const ShapeProto& param :
        computation.proto().host_program_shape().parameters()) {
-    argument_layouts.push_back(&param);
+    argument_layouts.push_back(Shape(param));
+    argument_layout_ptrs.push_back(&argument_layouts.back());
   }
   return client
-      ->Compile(computation, argument_layouts, ExecutableBuildOptions())
+      ->Compile(computation, argument_layout_ptrs, ExecutableBuildOptions())
       .ValueOrDie();
 }
 
@@ -149,7 +153,7 @@ StatusOr<Literal> ReplayComputation(const HloSnapshot& module,
               << "--generate_fake_infeed only works if the model has 0 or 1 "
                  "infeed ops, but this one has >= 2.";
           provide_infeed = true;
-          infeed_shape = instruction.shape();
+          infeed_shape = Shape(instruction.shape());
           LOG(INFO) << "Generating fake infeed shape for inferred shape: "
                     << ShapeUtil::HumanString(infeed_shape);
         }
@@ -315,9 +319,10 @@ int RealMain(absl::Span<char* const> args, const Options& opts) {
       if (snapshot.has_result()) {
         Literal literal =
             Literal::CreateFromProto(snapshot.result()).ConsumeValueOrDie();
-        fprintf(stdout, "was %s:%s\n",
-                ShapeUtil::HumanString(snapshot.result().shape()).c_str(),
-                literal.ToString().c_str());
+        fprintf(
+            stdout, "was %s:%s\n",
+            ShapeUtil::HumanString(Shape(snapshot.result().shape())).c_str(),
+            literal.ToString().c_str());
       }
     }
   }
diff --git a/tensorflow/compiler/xla/util.h b/tensorflow/compiler/xla/util.h
index 8ce741647414a1fa75e6d706ec1e719ace7b7cc8..6722641e9d2c177440361e6f0d1f6c0804eb7cda 100644
--- a/tensorflow/compiler/xla/util.h
+++ b/tensorflow/compiler/xla/util.h
@@ -152,6 +152,13 @@ static inline absl::Span<const int64> AsInt64Slice(
                                  slice.size());
 }
 
+// TODO(b/29771030): This nop overload was added to simplify the migration of
+// Shape from a proto to a C++ class. Remove after class has been migrated.
+static inline absl::Span<const int64> AsInt64Slice(
+    absl::Span<const int64> slice) {
+  return slice;
+}
+
 // As above, but for uint64 types.
 static inline absl::Span<const uint64> AsUInt64Slice(
     const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_uint64>& v) {
@@ -387,6 +394,19 @@ T CeilOfRatio(T dividend, T divisor) {
   return tensorflow::MathUtil::CeilOfRatio<T>(dividend, divisor);
 }
 
+template <typename T>
+std::vector<T> ElementWiseCeilOfRatio(absl::Span<const T> dividends,
+                                      absl::Span<const T> divisors) {
+  std::vector<T> ceil_of_ratios;
+  CHECK_EQ(dividends.size(), divisors.size());
+  ceil_of_ratios.reserve(dividends.size());
+  absl::c_transform(dividends, divisors, std::back_inserter(ceil_of_ratios),
+                    [](const T dividend, const T divisor) {
+                      return CeilOfRatio<T>(dividend, divisor);
+                    });
+  return ceil_of_ratios;
+}
+
 // Rounds the value up to a multiple of the divisor by first calling CeilOfRatio
 // then multiplying by the divisor. For example: RoundUpToNearest(13, 8) => 16
 template <typename T>
diff --git a/tensorflow/compiler/xla/window_util.cc b/tensorflow/compiler/xla/window_util.cc
index 8ea8dbab2574ca1e24271e7c1c7762d4a6b6a8de..51c73b3d17e4c32d9a8a14d3055ab56f02922af3 100644
--- a/tensorflow/compiler/xla/window_util.cc
+++ b/tensorflow/compiler/xla/window_util.cc
@@ -185,6 +185,17 @@ bool HasWindowReversal(const Window& window) {
   return false;
 }
 
+bool AllOrNoneReversed(const Window& window) {
+  if (window.dimensions().empty()) {
+    return true;
+  }
+  bool reversed = window.dimensions()[0].window_reversal();
+  return std::all_of(window.dimensions().begin(), window.dimensions().end(),
+                     [&](const WindowDimension& dim) {
+                       return dim.window_reversal() == reversed;
+                     });
+}
+
 bool HasDilation(const Window& window) {
   return HasBaseDilation(window) || HasWindowDilation(window);
 }
diff --git a/tensorflow/compiler/xla/window_util.h b/tensorflow/compiler/xla/window_util.h
index 1fb9e855fc16f334eb0e83dfd27b307b2149628f..099d7ecdd5c732ffc8c6ff6370288a2fc4144fa2 100644
--- a/tensorflow/compiler/xla/window_util.h
+++ b/tensorflow/compiler/xla/window_util.h
@@ -56,6 +56,7 @@ bool HasWindowDilation(const Window& window);
 bool HasDilation(const Window& window);
 
 bool HasWindowReversal(const Window& window);
+bool AllOrNoneReversed(const Window& window);
 
 // Returns true if the given logical dimension is inactive in the sense that it
 // has window bound 1, no striding and no padding.
diff --git a/tensorflow/compiler/xla/xla.proto b/tensorflow/compiler/xla/xla.proto
index 28df3b03f398841460189910bc3a5096dfb0d367..a37eac7fe441d91aa71e1b6fd7b84099fee2215b 100644
--- a/tensorflow/compiler/xla/xla.proto
+++ b/tensorflow/compiler/xla/xla.proto
@@ -193,7 +193,11 @@ message DebugOptions {
   //  - Assuming that operations never produce or consume NaN or +/- Inf.
   //  - Assuming that +0 and -0 are indistinguishable.
   bool xla_cpu_enable_fast_math = 99;
-  bool xla_gpu_enable_fast_math = 100;
+
+  // When true we lower the Minimum and Maximum hlos in the GPU backend such
+  // that Min(NotNaN, NaN) = Min(NaN, NotNaN) = NotNaN.  In other words, if flag
+  // this is true we don't propagate NaNs through Min and Max.
+  bool xla_gpu_enable_fast_min_max = 100;
 
   // Crashes the program when any kind of verification fails, instead of just
   // logging the failures. One example is cross checking of convolution results
@@ -209,6 +213,9 @@ message DebugOptions {
   // the host that run models in parallel across multiple devices.
   int32 xla_force_host_platform_device_count = 102;
 
+  // If set to true XLA:GPU invokes `ptxas` with -O0 (default is -O3).
+  bool xla_gpu_disable_ptxas_optimizations = 103;
+
   // Extra options to pass to the compilation backend (e.g. LLVM); specific
   // interpretation of these values is left to the backend.
   map<string, string> xla_backend_extra_options = 500;
@@ -224,7 +231,7 @@ message ExecutionOptions {
   // may be faster when using this layout.
   //
   // We use a Shape here to accommodate computations that return a tuple.
-  Shape shape_with_output_layout = 2;
+  ShapeProto shape_with_output_layout = 2;
 
   // Used to seed random-number generators used in this computation.  If this is
   // 0, we generate a seed ourselves.
@@ -253,7 +260,7 @@ message TransferToClientRequest {
 
   // This optional field directs the service to return the literal in this
   // layout. A shape is used to hold the layout to accommodate tuples.
-  Shape shape_with_layout = 2;
+  ShapeProto shape_with_layout = 2;
 }
 
 message TransferToClientResponse {
@@ -281,7 +288,7 @@ message TransferToInfeedResponse {
 message TransferFromOutfeedRequest {
   // This optional field directs the service to return the literal in this
   // layout. A shape is used to hold the layout to accommodate tuples.
-  Shape shape_with_layout = 1;
+  ShapeProto shape_with_layout = 1;
 
   int64 replica_id = 2;
   DeviceHandle device_handle = 3;
@@ -332,7 +339,7 @@ message CompileRequest {
   // The layouts of the input arguments. If not set, the default layout will be
   // used. Although the real arguments are not needed in compilation, the
   // layouts of the arguments can affect the compilation.
-  repeated Shape input_shape_with_layout = 3;
+  repeated ShapeProto input_shape_with_layout = 3;
 }
 
 message CompileResponse {
@@ -406,7 +413,7 @@ message LoadDataRequest {
   string columnio_field = 2;
 
   // Individual element shape, excluding rows.
-  Shape element_shape = 3;
+  ShapeProto element_shape = 3;
 
   // Warning: ColumnIO does not support random-access, so use offset with
   // caution in performance-critical scenarios.
@@ -422,7 +429,7 @@ message LoadDataRequest {
 
 message LoadDataResponse {
   GlobalDataHandle data = 1;
-  Shape data_shape = 2;
+  ShapeProto data_shape = 2;
   int64 available_rows = 3;
   int64 rows_loaded = 4;
   int64 nanoseconds = 5;
@@ -433,7 +440,7 @@ message GetShapeRequest {
 }
 
 message GetShapeResponse {
-  Shape shape = 1;
+  ShapeProto shape = 1;
 }
 
 message UnpackRequest {
diff --git a/tensorflow/compiler/xla/xla_data.proto b/tensorflow/compiler/xla/xla_data.proto
index 683ccc40f162ead3a248aee83d9abf3086a1ac93..85ec83437a10d973687a7fb84285c2e2541a53c7 100644
--- a/tensorflow/compiler/xla/xla_data.proto
+++ b/tensorflow/compiler/xla/xla_data.proto
@@ -108,6 +108,16 @@ enum Format {
   SPARSE = 2;
 }
 
+// Describes a tile used in tiling-based layout. Refer to
+// g3doc/layout_with_tiling.md for details about tiling-based layout.
+message Tile {
+  // Number of elements in each dimension of the tile. It's ordered from the
+  // most major dimension of the tile to the most minor dimension of the tile.
+  // The dimensions correspond to a suffix of the dimensions of the shape being
+  // tiled.
+  repeated int64 dimensions = 1;
+}
+
 // A layout describes how the array is placed in (1D) memory space.  This
 // includes the minor-to-major ordering of dimensions within a shape.
 //
@@ -138,6 +148,20 @@ message Layout {
   // memory.  This field must be unset unless the format is SPARSE.
   int64 max_sparse_elements = 5;
 
+  // A sequence of tiles, starting from the tile that's applied first to the
+  // Shape.
+  //
+  // TODO(b/119839262): implement tiling in each backend or add Unimplemented
+  // error.
+  repeated Tile tiles = 6;
+
+  // Bit size of each element. If the size is bigger than what the element
+  // type requires, the value is stored in the least significant
+  // bits and the additional most significant bits are filled with 0's.
+  //
+  // TODO(b/119839262): implement in each backend or add Unimplemented error.
+  int64 element_size_in_bits = 7;
+
   // Important: if any field is added, be sure to modify ShapeUtil::Equal() and
   // LayoutUtil::Hash appropriately to account for the new field.
 }
@@ -154,7 +178,7 @@ message Layout {
 // See the XLA documentation for more information on shapes and layouts.
 //
 // LINT.IfChange
-message Shape {
+message ShapeProto {
   reserved 1;
   reserved "rank";
 
@@ -169,7 +193,7 @@ message Shape {
   repeated int64 dimensions = 3;
 
   // For tuples only, the shapes of constitutent shapes in the tuple sequence.
-  repeated Shape tuple_shapes = 4;
+  repeated ShapeProto tuple_shapes = 4;
 
   // The layout used to back this shape.
   Layout layout = 5;
@@ -183,9 +207,9 @@ message Shape {
 
 // Shape of the parameters and output of a computation (like a traditional
 // function signature).
-message ProgramShape {
-  repeated Shape parameters = 1;
-  Shape result = 2;
+message ProgramShapeProto {
+  repeated ShapeProto parameters = 1;
+  ShapeProto result = 2;
   repeated string parameter_names = 3;
 }
 
@@ -320,7 +344,7 @@ message DeviceAssignmentProto {
 // Transfers to/from the client are encoded in literal form, and the structure
 // of the repeated fields is implied by the shape.
 message LiteralProto {
-  Shape shape = 1;
+  ShapeProto shape = 1;
   repeated bool preds = 2;
   bytes s8s = 15;
   bytes u8s = 3;
@@ -521,7 +545,7 @@ message OpSharding {
   }
   Type type = 1;
   // The shape of the sharded tile.
-  Shape tile_shape = 2;
+  ShapeProto tile_shape = 2;
   // The shape of the tile assignment tensor - this must be the same rank as
   // tile_shape and the product of its dimensions must equal
   // tile_assignment_devices.size().
diff --git a/tensorflow/compiler/xrt/BUILD b/tensorflow/compiler/xrt/BUILD
index 2ff97914f862e0ec30fc54602ec5fee2a0a5ebca..2dae746d034a1bf52e84de74dfb0c6e23aaed4d1 100644
--- a/tensorflow/compiler/xrt/BUILD
+++ b/tensorflow/compiler/xrt/BUILD
@@ -22,6 +22,7 @@ xla_proto_library(
     deps = [
         "//tensorflow/compiler/tf2xla:host_compute_metadata_proto",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/service:hlo_proto",
     ],
 )
@@ -32,20 +33,25 @@ cc_library(
         "xrt_compilation_cache.cc",
         "xrt_device.cc",
         "xrt_state.cc",
+        "xrt_util.cc",
     ],
     hdrs = [
         "xrt_compilation_cache.h",
         "xrt_device.h",
         "xrt_state.h",
+        "xrt_util.h",
     ],
     deps = [
         "//tensorflow/compiler/jit:xla_device",
         "//tensorflow/compiler/tf2xla:xla_compiler",
+        "//tensorflow/compiler/xla:debug_options_flags",
         "//tensorflow/compiler/xla:literal",
         "//tensorflow/compiler/xla:shape_util",
         "//tensorflow/compiler/xla:status_macros",
         "//tensorflow/compiler/xla:statusor",
+        "//tensorflow/compiler/xla:types",
         "//tensorflow/compiler/xla:xla_data_proto",
+        "//tensorflow/compiler/xla:xla_proto",
         "//tensorflow/compiler/xla/client:local_client",
         "//tensorflow/compiler/xla/service:backend",
         "//tensorflow/compiler/xla/service:device_memory_allocator",
diff --git a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
index dc62cf7a6b24e373374b458d2e4722e79500fb93..2ccdf0f02d840600d5e0649c4805e3672d4a1286 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_compile_ops.cc
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt.pb.h"
 #include "tensorflow/compiler/xrt/xrt_compilation_cache.h"
 #include "tensorflow/compiler/xrt/xrt_device.h"
+#include "tensorflow/compiler/xrt/xrt_util.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -108,19 +109,26 @@ Status XRTCompileOp::Compile(OpKernelContext* ctx,
   TF_ASSIGN_OR_RETURN(xla::XlaComputation computation,
                       client->LoadSnapshot(computation_proto.hlo_snapshot()));
 
-  std::vector<const xla::Shape*> argument_layouts(
+  std::vector<xla::Shape> argument_layouts(
+      config.program_shape().parameters_size());
+  std::vector<const xla::Shape*> argument_layout_ptrs(
       config.program_shape().parameters_size());
   for (int i = 0; i < config.program_shape().parameters_size(); ++i) {
-    argument_layouts[i] = &config.program_shape().parameters(i);
+    argument_layouts[i] = xla::Shape(config.program_shape().parameters(i));
+    argument_layout_ptrs[i] = &argument_layouts[i];
   }
   xla::ExecutableBuildOptions build_options;
   build_options.set_device_ordinal(client->default_device_ordinal());
-  build_options.set_result_layout(config.program_shape().result());
+  build_options.set_result_layout(xla::Shape(config.program_shape().result()));
   build_options.set_device_allocator(device_ref.backend()->memory_allocator());
+  if (config.has_debug_options()) {
+    *build_options.mutable_debug_options() =
+        BuildXlaDebugOptions(config.debug_options());
+  }
 
   VLOG(1) << "Building executable";
   auto compile_result =
-      client->Compile(computation, argument_layouts, build_options);
+      client->Compile(computation, argument_layout_ptrs, build_options);
   if (!compile_result.ok()) {
     return compile_result.status();
   }
@@ -174,11 +182,12 @@ void XRTCompileOp::Compute(OpKernelContext* ctx) {
   ctx->set_output(0, handle_output);
 
   xla::LocalExecutable* executable = entry->get().get_executable();
-  xla::ProgramShape program_shape = executable->executable()
-                                        ->module()
-                                        .config()
-                                        .entry_computation_layout()
-                                        .ComputeProgramShape();
+  xla::ProgramShapeProto program_shape = executable->executable()
+                                             ->module()
+                                             .config()
+                                             .entry_computation_layout()
+                                             .ComputeProgramShape()
+                                             .ToProto();
   Tensor program_shape_output(DT_STRING, TensorShape({1}));
   program_shape_output.vec<string>()(0) = program_shape.SerializeAsString();
   ctx->set_output(1, program_shape_output);
diff --git a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
index 8c6191ddc06ea7d85f5fd21a7d4058c669ffdeb2..751329eefc33f3372335c805233dafabbf42bf36 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_execute_op.cc
@@ -228,14 +228,35 @@ Status XRTExecuteOp::DoWork(OpKernelContext* context) {
   TF_RETURN_IF_ERROR(XRTTupleAllocation::CreateFromBuffer(
       shaped_buffer, device_ref.backend(), device_ref.device_ordinal(),
       &output_tuple));
-
-  Tensor* output_tensor;
-  TF_RETURN_IF_ERROR(
-      context->allocate_output(0, TensorShape({}), &output_tensor));
-  int64 key;
-  TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
-  output_tensor->scalar<int64>()() = key;
-
+  if (config_proto.return_exploded_tuple() &&
+      xla::ShapeUtil::IsTuple(output_tuple->on_device_shape())) {
+    int64 tuple_element_count =
+        xla::ShapeUtil::TupleElementCount(output_tuple->on_device_shape());
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(context->allocate_output(
+        0, TensorShape({tuple_element_count}), &output_tensor));
+
+    for (int64 i = 0; i < tuple_element_count; ++i) {
+      xla::ShapeIndex shape_index;
+      shape_index.push_back(i);
+
+      XRTTupleAllocation* suballocation;
+      TF_RETURN_IF_ERROR(XRTTupleAllocation::MakeSubBuffer(
+          output_tuple, shape_index, &suballocation,
+          /*alias_parent_allocation=*/false));
+      int64 key;
+      TF_RETURN_IF_ERROR(suballocation->Intern(rm, &key));
+      output_tensor->vec<int64>()(i) = key;
+    }
+    output_tuple->Unref();
+  } else {
+    Tensor* output_tensor;
+    TF_RETURN_IF_ERROR(
+        context->allocate_output(0, TensorShape({}), &output_tensor));
+    int64 key;
+    TF_RETURN_IF_ERROR(output_tuple->Intern(rm, &key));
+    output_tensor->scalar<int64>()() = key;
+  }
   return Status::OK();
 }
 
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
index ffea592491d43788b876a51866dc8a6611e8c734..3258286c10665225aab917107ffa614459c53f3d 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.cc
@@ -87,6 +87,19 @@ REGISTER_KERNEL_BUILDER(Name("XRTReadLiteral")
                             .HostMemory("literal"),
                         XRTReadLiteralOp<false, XRTGenericDeviceAccessor>);
 
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_GPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+REGISTER_KERNEL_BUILDER(Name("XRTWriteLiteral")
+                            .Device(DEVICE_XLA_CPU)
+                            .HostMemory("handle")
+                            .HostMemory("literal")
+                            .HostMemory("output_handle"),
+                        XRTWriteLiteralOp<XRTGenericDeviceAccessor>);
+
 REGISTER_KERNEL_BUILDER(Name("XRTReadLiteralAndRelease")
                             .Device(DEVICE_XLA_GPU)
                             .HostMemory("handle")
diff --git a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
index 54b06558adcd8ef1f8f1bee52d210d558801afea..26a58fa42d8b730b365b11d2e5608e9945497763 100644
--- a/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
+++ b/tensorflow/compiler/xrt/kernels/xrt_state_ops.h
@@ -393,6 +393,56 @@ class XRTReadLiteralOp : public OpKernel {
   }
 };
 
+// Op that writes a new literal value into device-resident memory.
+template <class DeviceAccessor>
+class XRTWriteLiteralOp : public OpKernel {
+ public:
+  explicit XRTWriteLiteralOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  ~XRTWriteLiteralOp() override = default;
+  XRTWriteLiteralOp(const XRTWriteLiteralOp&) = delete;
+  XRTWriteLiteralOp& operator=(const XRTWriteLiteralOp&) = delete;
+
+  void Compute(OpKernelContext* ctx) override {
+    VLOG(1) << "XRTWriteLiteralOp::Compute";
+
+    const Tensor& handle_tensor = ctx->input(0);
+    OP_REQUIRES(
+        ctx, TensorShapeUtils::IsScalar(handle_tensor.shape()),
+        errors::Internal("computation input should be an int64 scalar"));
+    int64 allocation_handle = handle_tensor.scalar<int64>()();
+
+    const Tensor& literal_info = ctx->input(1);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(literal_info.shape()),
+                errors::Internal("literal input should be a string scalar"));
+    xla::LiteralProto literal_proto;
+    OP_REQUIRES(ctx,
+                literal_proto.ParseFromString(literal_info.scalar<string>()()),
+                errors::InvalidArgument(
+                    "Unable to parse allocation input to LiteralProto"));
+    xla::Literal literal;
+    OP_REQUIRES_OK(ctx, XRTStateHelpers::MakeLiteral(literal_proto, &literal));
+
+    ResourceMgr* rm;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::GetResourceManager(ctx, &rm));
+
+    XRTTupleAllocation* allocation;
+    OP_REQUIRES_OK(
+        ctx, XRTTupleAllocation::Lookup(rm, allocation_handle, &allocation));
+    core::ScopedUnref allocation_unref(allocation);
+    // We are guaranteed that the underlying device object won't be deleted out
+    // from under us, while the ScopedRef is live.
+    typename DeviceAccessor::ScopedRef device_ref;
+    OP_REQUIRES_OK(ctx, DeviceAccessor::InitScopedRef(
+                            ctx, allocation->device_ordinal(), &device_ref));
+    OP_REQUIRES_OK(ctx,
+                   allocation->WriteLiteral(device_ref.backend(), literal));
+
+    Tensor output(DT_INT64, TensorShape({}));
+    output.scalar<int64>()() = allocation_handle;
+    ctx->set_output(0, output);
+  }
+};
+
 // Op that discards a handle to device memory.
 template <class DeviceAccessor>
 class XRTReleaseAllocationOp : public OpKernel {
diff --git a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
index 07d025ce343f229097b557d33ad41bf9612b0696..a3d63106fa14674a9f5887ccfd908ce17dbc6384 100644
--- a/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
+++ b/tensorflow/compiler/xrt/ops/xrt_state_ops.cc
@@ -95,6 +95,20 @@ Copies an allocated tuple from device memory and returns it as a literal.
 'literal' is a serialized xla::LiteralProto proto.
 )");
 
+REGISTER_OP("XRTWriteLiteral")
+    .Input("handle: int64")
+    .Input("literal: string")
+    .Output("output_handle: int64")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape)
+    .Doc(
+        R"(
+Copies the input literal into the device memory pointed to by handle.
+Returns the handle itself.
+
+'handle' is the id returned from the Op that produced the on-device allocation.
+'literal' is a serialized xla::LiteralProto proto to be written to device memory.
+)");
+
 REGISTER_OP("XRTReadLiteralAndRelease")
     .Input("handle: int64")
     .Output("literal: string")
diff --git a/tensorflow/compiler/xrt/tests/raw_api_test.cc b/tensorflow/compiler/xrt/tests/raw_api_test.cc
index 25464b5554d21f4b936f3f4a442fd174a8b56a8b..abaa17e50e3f5e47a45f5a8a45fa2090d3efee39 100644
--- a/tensorflow/compiler/xrt/tests/raw_api_test.cc
+++ b/tensorflow/compiler/xrt/tests/raw_api_test.cc
@@ -102,7 +102,7 @@ bool CompareLiteralProtos(const xla::LiteralProto& a,
   auto l_b = xla::Literal::CreateFromProto(b).ValueOrDie();
   bool equal = l_a == l_b;
   if (!equal) {
-    LOG(INFO) << "LiteralProtos don't match " << a.DebugString()
+    LOG(INFO) << "LiteralProtos don't match: " << a.DebugString()
               << " != " << b.DebugString();
   }
   return equal;
@@ -175,6 +175,18 @@ xla::XlaComputation AddAndTuple() {
   return builder.Build().ValueOrDie();
 }
 
+xla::XlaComputation AddAndSubTuple() {
+  xla::XlaBuilder builder("AddAndSubTuple");
+  auto p0 = xla::Parameter(&builder, 0, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P0");
+  auto p1 = xla::Parameter(&builder, 1, xla::ShapeUtil::MakeShape(xla::F32, {}),
+                           "P1");
+  auto sum = xla::Add(p0, p1);
+  auto sub = xla::Sub(p0, p1);
+  xla::Tuple(&builder, {sum, sub});
+  return builder.Build().ValueOrDie();
+}
+
 void StoreComputationSnapshot(const xla::XlaComputation& computation,
                               xla::HloSnapshot* dst) {
   auto snapshot = computation.Snapshot().ValueOrDie();
@@ -203,6 +215,56 @@ xla::ProgramShape XlaCompiledProgramShape(
       ->ComputeProgramShape();
 }
 
+TEST(RawApiTest, AllocAndRewrite) {
+  xrt::XLAAllocation alloc;
+  alloc.set_device_ordinal(0);
+  *alloc.mutable_value() =
+      xla::LiteralUtil::CreateR2({{4, 5}, {6, 7}}).ToProto();
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto value =
+      ops::Const(root.WithDevice("/device:CPU:0"), alloc.SerializeAsString());
+  auto handle = ops::XRTAllocate(root, value);
+  auto read_back = ops::XRTReadLiteral(root, handle);
+  TF_ASSERT_OK(root.status());
+
+  tensorflow::ClientSession session(root);
+  std::vector<tensorflow::Tensor> outputs;
+  TF_EXPECT_OK(session.Run({read_back, handle}, &outputs));
+  EXPECT_EQ(outputs.size(), 2);
+
+  int64 allocation_handle = outputs[1].scalar<int64>()();
+  xla::LiteralProto response;
+  EXPECT_TRUE(response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(alloc.value(), response));
+  outputs.clear();
+
+  xla::LiteralProto new_literal =
+      xla::LiteralUtil::CreateR2({{9, 2}, {4, 1}}).ToProto();
+  auto new_value = ops::Const(root.WithDevice("/device:CPU:0"),
+                              new_literal.SerializeAsString());
+  auto write_op =
+      ops::XRTWriteLiteral(root, Input(allocation_handle), new_value);
+  TF_ASSERT_OK(root.status());
+  TF_EXPECT_OK(session.Run({write_op}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+  EXPECT_EQ(allocation_handle, outputs[0].scalar<int64>()());
+  outputs.clear();
+
+  auto read_after_write = ops::XRTReadLiteral(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run({read_after_write}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  xla::LiteralProto new_response;
+  EXPECT_TRUE(new_response.ParseFromString(outputs[0].scalar<string>()()));
+  EXPECT_TRUE(CompareLiteralProtos(new_literal, new_response));
+
+  auto release =
+      ops::XRTReleaseAllocationHandle(root, Input(allocation_handle));
+  TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(), {}, {release},
+                           &outputs));
+}
+
 TEST(RawApiTest, ReadAndWriteState) {
   xrt::XLAAllocation alloc;
   alloc.set_device_ordinal(0);
@@ -375,9 +437,12 @@ TEST(RawApiTest, CompileAndExecute) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
   StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -411,7 +476,7 @@ TEST(RawApiTest, CompileAndExecute) {
   auto expected = xla::LiteralUtil::CreateR1<float>({27.0f, 21.0f});
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 
-  xla::ProgramShape program_shape;
+  xla::ProgramShapeProto program_shape;
   EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
   EXPECT_EQ(program_shape.parameters_size(), 2);
 }
@@ -427,9 +492,12 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {2});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
   StoreComputationSnapshot(AddAndScale(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -465,7 +533,7 @@ TEST(RawApiTest, CompileAndExecuteWithArgumentVector) {
   auto expected = xla::LiteralUtil::CreateR1<float>({27.0f, 21.0f});
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 
-  xla::ProgramShape program_shape;
+  xla::ProgramShapeProto program_shape;
   EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
   EXPECT_EQ(program_shape.parameters_size(), 2);
 }
@@ -494,8 +562,8 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = param_shape;
-  *shapes->mutable_result() = result_shape;
+  *shapes->add_parameters() = param_shape.ToProto();
+  *shapes->mutable_result() = result_shape.ToProto();
   StoreComputationSnapshot(xla_computation, c.mutable_hlo_snapshot());
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
@@ -510,8 +578,9 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) {
   TF_EXPECT_OK(session.Run(tensorflow::ClientSession::FeedType(),
                            {c_handle.program_shape}, {release}, &outputs));
 
-  xla::ProgramShape program_shape;
-  EXPECT_TRUE(program_shape.ParseFromString(outputs[0].vec<string>()(0)));
+  xla::ProgramShapeProto program_shape_proto;
+  EXPECT_TRUE(program_shape_proto.ParseFromString(outputs[0].vec<string>()(0)));
+  xla::ProgramShape program_shape(program_shape_proto);
   EXPECT_EQ(program_shape.parameters_size(), 1);
 
   VLOG(2) << "Param: "
@@ -520,7 +589,7 @@ TEST(RawApiTest, CompileWithXlaReturnShapes) {
           << xla::ShapeUtil::HumanStringWithLayout(program_shape.result());
 
   xla::ProgramShape xla_program_shape =
-      XlaCompiledProgramShape(xla_computation, *shapes);
+      XlaCompiledProgramShape(xla_computation, xla::ProgramShape(*shapes));
   EXPECT_TRUE(xla::LayoutUtil::Equal(
       xla::ShapeUtil::GetSubshape(program_shape.parameters(0), {0}).layout(),
       xla::ShapeUtil::GetSubshape(xla_program_shape.parameters(0), {0})
@@ -547,11 +616,11 @@ TEST(RawApiTest, DotGeneralWithLayoutTest) {
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
   *shapes->add_parameters() =
-      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1});
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 2}, {0, 1}).ToProto();
   *shapes->add_parameters() =
-      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1});
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto();
   *shapes->mutable_result() =
-      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1});
+      xla::ShapeUtil::MakeShapeWithLayout(xla::F32, {2, 1}, {0, 1}).ToProto();
   StoreComputationSnapshot(Dot(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -592,7 +661,7 @@ TEST(RawApiTest, CompileAndExecuteZeroArg) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {});
+  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
 
   xrt::XRTExecutionConfig e;
   e.set_release_input_handles(true);
@@ -632,10 +701,13 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::F32, {2})});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
   StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -671,14 +743,81 @@ TEST(RawApiTest, CompileAndExecuteReturnTuple) {
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 }
 
+TEST(RawApiTest, CompileAndExecuteReturnExplodedTuple) {
+  xrt::XLAAllocation p0;
+  p0.set_device_ordinal(0);
+  *p0.mutable_value() = xla::LiteralUtil::CreateR0<float>(12.0f).ToProto();
+
+  xrt::XLAAllocation p1;
+  p1.set_device_ordinal(0);
+  *p1.mutable_value() = xla::LiteralUtil::CreateR0<float>(3.0f).ToProto();
+
+  xrt::XLAComputation c;
+  auto config = c.mutable_config();
+  auto shapes = config->mutable_program_shape();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {}),
+                                      xla::ShapeUtil::MakeShape(xla::F32, {})})
+          .ToProto();
+  StoreComputationSnapshot(AddAndSubTuple(), c.mutable_hlo_snapshot());
+
+  xrt::XRTExecutionConfig e;
+  e.set_release_input_handles(true);
+  e.set_release_compilation_handle(true);
+  e.set_return_exploded_tuple(true);
+
+  Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
+  auto e_config =
+      ops::Const(root.WithDevice("/device:CPU:0"), e.SerializeAsString());
+  auto computation =
+      ops::Const(root.WithDevice("/device:CPU:0"), c.SerializeAsString());
+  auto c_handle = ops::XRTCompile(root, computation);
+  auto p0_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p0.SerializeAsString());
+  auto p0_handle = ops::XRTAllocate(root, p0_value);
+  auto p1_value =
+      ops::Const(root.WithDevice("/device:CPU:0"), p1.SerializeAsString());
+  auto p1_handle = ops::XRTAllocate(root, p1_value);
+  auto result = ops::XRTExecute(root, c_handle.handle, e_config,
+                                {Output(p0_handle), Output(p1_handle)});
+  TF_ASSERT_OK(root.status());
+
+  ClientSession session(root);
+  std::vector<Tensor> outputs;
+  TF_EXPECT_OK(session.Run({result}, &outputs));
+  EXPECT_EQ(outputs.size(), 1);
+
+  auto handles_vec = outputs.front().vec<int64>();
+  EXPECT_EQ(handles_vec.size(), 2);
+
+  const float kResults[2] = {15.0f, 9.0f};
+  for (int64 i = 0; i < handles_vec.size(); ++i) {
+    auto read_back = ops::XRTReadLiteralAndRelease(root, Input(handles_vec(i)));
+    std::vector<Tensor> voutputs;
+    TF_EXPECT_OK(session.Run({read_back}, &voutputs));
+    EXPECT_EQ(voutputs.size(), 1);
+
+    xla::LiteralProto response;
+    EXPECT_TRUE(response.ParseFromString(voutputs[0].scalar<string>()()));
+
+    auto expected = xla::LiteralUtil::CreateR0<float>(kResults[i]);
+    EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
+  }
+}
+
 TEST(RawApiTest, LeakCompilationReference) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::F32, {2});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeTupleShape(
-      {xla::ShapeUtil::MakeShape(xla::F32, {2})});
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->add_parameters() =
+      xla::ShapeUtil::MakeShape(xla::F32, {2}).ToProto();
+  *shapes->mutable_result() =
+      xla::ShapeUtil::MakeTupleShape({xla::ShapeUtil::MakeShape(xla::F32, {2})})
+          .ToProto();
   StoreComputationSnapshot(AddAndTuple(), c.mutable_hlo_snapshot());
 
   Scope root = Scope::NewRootScope().WithDevice(DeviceFromFlag());
@@ -703,9 +842,9 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   xrt::XLAComputation c;
   auto config = c.mutable_config();
   auto shapes = config->mutable_program_shape();
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {});
-  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {});
-  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {});
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
+  *shapes->add_parameters() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
+  *shapes->mutable_result() = xla::ShapeUtil::MakeShape(xla::S64, {}).ToProto();
   StoreComputationSnapshot(AddS64(), c.mutable_hlo_snapshot());
 
   xrt::XRTExecutionConfig e;
@@ -739,11 +878,11 @@ TEST(RawApiTest, CompileAndExecuteWithS64Argument) {
   auto expected = xla::LiteralUtil::CreateR0<int64>(15123899);
   EXPECT_TRUE(CompareLiteralToLiteralProto(expected, response));
 
-  xla::ProgramShape program_shape;
+  xla::ProgramShapeProto program_shape;
   EXPECT_TRUE(program_shape.ParseFromString(outputs[1].vec<string>()(0)));
   EXPECT_EQ(program_shape.parameters_size(), 2);
-  EXPECT_TRUE(
-      xla::ShapeUtil::HasPrimitiveType(program_shape.result(), xla::S64));
+  EXPECT_TRUE(xla::ShapeUtil::HasPrimitiveType(
+      xla::Shape(program_shape.result()), xla::S64));
 }
 
 }  // namespace
diff --git a/tensorflow/compiler/xrt/xrt.proto b/tensorflow/compiler/xrt/xrt.proto
index 6ab77fbaaf0cbe23503ebc71775f52af01e41a74..378bb9246f27b8106310d565435404d7ac260a87 100644
--- a/tensorflow/compiler/xrt/xrt.proto
+++ b/tensorflow/compiler/xrt/xrt.proto
@@ -3,6 +3,7 @@ syntax = "proto3";
 package xrt;
 
 import "tensorflow/compiler/tf2xla/host_compute_metadata.proto";
+import "tensorflow/compiler/xla/xla.proto";
 import "tensorflow/compiler/xla/xla_data.proto";
 import "tensorflow/compiler/xla/service/hlo.proto";
 
@@ -36,16 +37,18 @@ message XLAComputationConfig {
   tensorflow.tf2xla.HostComputeMetadata host_compute_metadata = 3;
 
   // The arg/result shapes for the whole computation.
-  xla.ProgramShape program_shape = 4;
+  xla.ProgramShapeProto program_shape = 4;
   // The arg/result shapes for each core of a model-parallel
   // computation. per_core_args_and_result_shapes is optional for a
   // single-core computation.
-  repeated xla.ProgramShape per_core_program_shape = 5;
+  repeated xla.ProgramShapeProto per_core_program_shape = 5;
   // Describes how replicated computation instances should be assigned to
   // devices. There are num_cores_per_replica computations, and each one will be
   // sent and executed to the set of replica device numbers described in the
   // DeviceAssignment proto.
   DeviceAssignment device_assignment = 6;
+  // The debugging options to be passed to the XLA compilation process.
+  xla.DebugOptions debug_options = 7;
 }
 
 // Options and XLA computation for a compilation.
@@ -98,4 +101,8 @@ message XRTExecutionConfig {
   bool release_input_handles = 5;
   // If true, release the handle to the computation after running.
   bool release_compilation_handle = 6;
+  // If set to true, and the result shape is a tuple, then instead of returning
+  // a single tuple allocation the execution will return a vector of
+  // allocations, one for each of the first-level elements of the result tuple.
+  bool return_exploded_tuple = 7;
 }
diff --git a/tensorflow/compiler/xrt/xrt_state.cc b/tensorflow/compiler/xrt/xrt_state.cc
index 3a99820d7aa9e9546cc95385fd98c05f28988e9e..31603e044d17baa3ae0ae583f61837811bb12495 100644
--- a/tensorflow/compiler/xrt/xrt_state.cc
+++ b/tensorflow/compiler/xrt/xrt_state.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/compiler/xrt/xrt_state.h"
 
 #include <stdint.h>
+#include <map>
 #include <memory>
 #include <string>
 #include <utility>
@@ -34,6 +35,7 @@ limitations under the License.
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/stream_executor/stream_executor.h"
 
@@ -41,6 +43,34 @@ namespace tensorflow {
 
 namespace {
 
+class BufferAllocStats {
+ public:
+  struct Stats {
+    int64 count = 0;
+    int64 size = 0;
+  };
+
+  Stats ReportAlloc(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count += 1;
+    device_stats->size += msize;
+    return *device_stats;
+  }
+
+  Stats ReportFree(int64 device, int64 msize) {
+    mutex_lock lock(lock_);
+    Stats* device_stats = &stats_[device];
+    device_stats->count -= 1;
+    device_stats->size -= msize;
+    return *device_stats;
+  }
+
+ private:
+  mutable mutex lock_;
+  std::map<int64, Stats> stats_;
+};
+
 const char* kTupleContainer = "tuples";
 
 int64 get_uid() {
@@ -48,6 +78,11 @@ int64 get_uid() {
   return static_cast<int64>(unsigned_rand);
 }
 
+BufferAllocStats* GetAllocStats() {
+  static BufferAllocStats* stats = new BufferAllocStats();
+  return stats;
+}
+
 Status AllocateScopedShapedBuffer(
     xla::Backend* backend, int device_ordinal, const xla::Shape& shape,
     std::unique_ptr<xla::ScopedShapedBuffer>* buffer) {
@@ -100,9 +135,19 @@ XRTBufferAllocation::XRTBufferAllocation(const se::DeviceMemoryBase& allocation,
                                          xla::DeviceMemoryAllocator* allocator)
     : allocation_(allocation),
       device_ordinal_(device_ordinal),
-      allocator_(allocator) {}
+      allocator_(allocator) {
+  if (VLOG_IS_ON(2)) {
+    auto stats =
+        GetAllocStats()->ReportAlloc(device_ordinal_, allocation_.size());
+    LOG(INFO) << "XRT Allocation Stats: device=" << device_ordinal_
+              << " count=" << stats.count << " size=" << stats.size;
+  }
+}
 
 XRTBufferAllocation::~XRTBufferAllocation() {
+  if (VLOG_IS_ON(2)) {
+    GetAllocStats()->ReportFree(device_ordinal_, allocation_.size());
+  }
   // Deallocate explicitly allows allocation_ to be null.
   Status s = allocator_->Deallocate(device_ordinal_, allocation_);
   // Nothing to do but check fail here if memory datastructures are corrupted.
@@ -183,6 +228,20 @@ Status XRTTupleAllocation::ToLiteral(xla::Backend* backend, int device_ordinal,
   return Status::OK();
 }
 
+Status XRTTupleAllocation::WriteLiteral(xla::Backend* backend,
+                                        const xla::Literal& literal) {
+  if (!xla::ShapeUtil::Equal(literal.shape(), on_host_shape())) {
+    return errors::InvalidArgument(
+        "New literal shape not matching the existing one: literal=",
+        xla::ShapeUtil::HumanStringWithLayout(literal.shape()),
+        " device=", xla::ShapeUtil::HumanStringWithLayout(on_host_shape()));
+  }
+  auto transfer_manager = backend->transfer_manager();
+  TF_ASSIGN_OR_RETURN(auto stream, backend->BorrowStream(device_ordinal()));
+  return transfer_manager->TransferLiteralToDevice(stream.get(), literal,
+                                                   ToShapedBuffer());
+}
+
 void XRTTupleAllocation::DiscardAllocation(
     const xla::ShapeIndex& buffer_index) {
   buffers_.element(buffer_index)->DiscardAllocation();
diff --git a/tensorflow/compiler/xrt/xrt_state.h b/tensorflow/compiler/xrt/xrt_state.h
index 73b5584e38f781343fe6793af7ad28232fbfc184..3664c0cd4e6ad26945ae1012208fdb006164a066 100644
--- a/tensorflow/compiler/xrt/xrt_state.h
+++ b/tensorflow/compiler/xrt/xrt_state.h
@@ -137,6 +137,9 @@ class XRTTupleAllocation : public ResourceBase {
   Status ToLiteral(xla::Backend* backend, int device_ordinal,
                    xla::Literal* literal);
 
+  // Write a new literal value to the allocation.
+  Status WriteLiteral(xla::Backend* backend, const xla::Literal& literal);
+
   // True if none of the buffers in the allocation are aliased by any other live
   // handle.
   bool IsExclusiveOwner();
diff --git a/tensorflow/compiler/xrt/xrt_util.cc b/tensorflow/compiler/xrt/xrt_util.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ef8bedc7324696cd255c72a851f0f2410e03848
--- /dev/null
+++ b/tensorflow/compiler/xrt/xrt_util.cc
@@ -0,0 +1,76 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/compiler/xrt/xrt_util.h"
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "tensorflow/compiler/xla/debug_options_flags.h"
+#include "tensorflow/compiler/xla/types.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+namespace {
+
+bool DebugOptionsPassThroughEnabled() {
+  const char* env = getenv("TF_XLA_DEBUG_OPTIONS_PASSTHROUGH");
+  bool enabled =
+      env != nullptr && (strcmp(env, "1") == 0 || strcmp(env, "true") == 0);
+  if (enabled) {
+    LOG(WARNING) << "Passing through XLA debug options!";
+  } else {
+    LOG(WARNING) << "TF_XLA_DEBUG_OPTIONS_PASSTHROUGH not set, not all options "
+                    "will be retained";
+  }
+  return enabled;
+}
+
+string SafeDebugPath(const string& path) {
+  if (path.empty() || path.compare(0, 5, "gs://") == 0 ||
+      path.compare(0, 11, "bigstore://") == 0) {
+    return path;
+  }
+  LOG(WARNING) << "Invalid config path (will be dropped): " << path;
+  return string();
+}
+
+}  // namespace
+
+xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options) {
+  static const bool options_passthrough = DebugOptionsPassThroughEnabled();
+  if (options_passthrough) {
+    return ref_options;
+  }
+  xla::DebugOptions options = xla::GetDebugOptionsFromFlags();
+  options.set_xla_generate_hlo_text_to(
+      SafeDebugPath(ref_options.xla_generate_hlo_text_to()));
+  options.set_xla_dump_optimized_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_optimized_hlo_proto_to()));
+  options.set_xla_dump_computations_to(
+      SafeDebugPath(ref_options.xla_dump_computations_to()));
+  options.set_xla_dump_executions_to(
+      SafeDebugPath(ref_options.xla_dump_executions_to()));
+  for (auto& pass : ref_options.xla_disable_hlo_passes()) {
+    options.add_xla_disable_hlo_passes(pass);
+  }
+  options.set_xla_dump_unoptimized_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_unoptimized_hlo_proto_to()));
+  options.set_xla_dump_per_pass_hlo_proto_to(
+      SafeDebugPath(ref_options.xla_dump_per_pass_hlo_proto_to()));
+  return options;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/compiler/xrt/xrt_util.h b/tensorflow/compiler/xrt/xrt_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..d9c05a7f3406313f99ae214d67b34e8e7de8be3e
--- /dev/null
+++ b/tensorflow/compiler/xrt/xrt_util.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Utility functions in support of the XRT API.
+
+#ifndef TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
+#define TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
+
+#include "tensorflow/compiler/xla/xla.pb.h"
+
+namespace tensorflow {
+
+// Filters the debug options provided as argument according to the value of the
+// TF_XLA_DEBUG_OPTIONS_PASSTHROUGH environment variable. If such variable is
+// set to "1" or "true", the debug options will be returned as is. Otherwise
+// only a subset of them will be set in the returned ones, and all the paths
+// contained in it, will be limited to gs:// and bigstore:// ones.
+xla::DebugOptions BuildXlaDebugOptions(const xla::DebugOptions& ref_options);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_COMPILER_XRT_XRT_UTIL_H_
diff --git a/tensorflow/contrib/all_reduce/BUILD b/tensorflow/contrib/all_reduce/BUILD
index a513aa1e7c49d64a860c740fffde156fb5bcbcf3..f6c6560c1c354ed8a36b98b1f564835eb9958e55 100644
--- a/tensorflow/contrib/all_reduce/BUILD
+++ b/tensorflow/contrib/all_reduce/BUILD
@@ -9,8 +9,6 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
-load("//tensorflow:tensorflow.bzl", "tf_py_test")
-
 py_library(
     name = "all_reduce_py",
     srcs = ["__init__.py"],
@@ -29,29 +27,6 @@ py_library(
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
     deps = [
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nccl_ops",
-    ],
-)
-
-tf_py_test(
-    name = "all_reduce_test",
-    srcs = ["python/all_reduce_test.py"],
-    additional_deps = [
-        ":all_reduce",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:state_ops",
+        "//tensorflow/python/distribute:all_reduce",
     ],
 )
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce.py b/tensorflow/contrib/all_reduce/python/all_reduce.py
index 25f4b4b8d341331db79321338a88cabfe325eea5..238cdaf8a79812df3f043d9d070bbcfd443f6e1e 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce.py
+++ b/tensorflow/contrib/all_reduce/python/all_reduce.py
@@ -18,842 +18,5 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import math
-
-from tensorflow.python.framework import device as device_lib
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nccl_ops
-
-
-def _flatten_tensors(tensors):
-  """Check tensors for isomorphism and flatten.
-
-  Args:
-    tensors: list of T `tf.Tensor` which must all have the same shape.
-
-  Returns:
-    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
-    shape: the original shape of each element of input tensors
-
-  Raises:
-    ValueError: tensors are empty or non-isomorphic or have unknown shape.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  for tensor in tensors:
-    shape = shape.merge_with(tensor.shape)
-  if not shape.is_fully_defined():
-    raise ValueError("Tensors must have statically known shape.")
-  if len(shape) != 1:
-    reshaped = []
-    for t in tensors:
-      with ops.colocate_with(t):
-        reshaped.append(array_ops.reshape(t, [-1]))
-    tensors = reshaped
-  return tensors, shape
-
-
-def _reshape_tensors(tensors, shape):
-  """Reshape tensors flattened by _flatten_tensors.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    shape: list of integers describing the desired shape.  Product of
-      the elements must equal the length of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the reshaped inputs.
-  """
-  reshaped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      reshaped.append(array_ops.reshape(t, shape))
-  return reshaped
-
-
-def _padded_split(tensor, pieces):
-  """Like split for 1D tensors but pads-out case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      thin input tensor, in order.  The final tensor may
-      be zero-padded on the end to make its size equal to those of all
-      of the other tensors.
-
-  Raises:
-    ValueError: The input tensor is not 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape.dims[0].value
-  with ops.colocate_with(tensor):
-    if tensor_len % pieces != 0:
-      # pad to an even length
-      chunk_size = 1 + tensor_len // pieces
-      if pieces > tensor_len:
-        # This is an edge case that should not come up in practice,
-        # i.e. a different reduction algorithm would be better,
-        # but we'll make it work just for completeness.
-        pad_len = pieces - tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      elif (pieces - 1) * chunk_size >= tensor_len:
-        # Another edge case of limited real interest.
-        pad_len = (pieces * chunk_size) % tensor_len
-        extended_whole = array_ops.concat(
-            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        parts = array_ops.split(extended_whole, pieces)
-        return parts, pad_len
-      else:
-        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
-        pad_len = chunk_size - last_chunk_size
-        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-        parts = array_ops.split(tensor, piece_lens)
-        parts[-1] = array_ops.concat(
-            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
-        return parts, pad_len
-    else:
-      return array_ops.split(tensor, pieces), 0
-
-
-def _strip_padding(tensors, pad_len):
-  """Strip the suffix padding added by _padded_split.
-
-  Args:
-    tensors: list of T `tf.Tensor` of identical length 1D tensors.
-    pad_len: number of elements to be stripped from the end of each tensor.
-
-  Returns:
-    list of T `tf.Tensor` which are the stripped inputs.
-
-  Raises:
-    ValueError: tensors must be a non-empty list of 1D tensors, and
-      each must be longer than pad_len.
-  """
-  if not tensors:
-    raise ValueError("tensors cannot be empty")
-  shape = tensors[0].shape
-  if len(shape) > 1:
-    raise ValueError("tensors must be 1D")
-  prefix_len = int(shape[0] - pad_len)
-  if prefix_len < 0:
-    raise ValueError("pad_len longer than tensor")
-  stripped = []
-  for t in tensors:
-    with ops.colocate_with(t):
-      stripped.append(array_ops.slice(t, [0], [prefix_len]))
-  return stripped
-
-
-def _ragged_split(tensor, pieces):
-  """Like split for 1D tensors but allows case where len % pieces != 0.
-
-  Args:
-    tensor: T `tf.Tensor` that must be 1D.
-    pieces: a positive integer specifying the number of pieces into which
-      tensor should be split.
-
-  Returns:
-    list of T `tf.Tensor` of length pieces, which hold the values of
-      the input tensor, in order.  The final tensor may be shorter
-      than the others, which will all be of equal length.
-
-  Raises:
-    ValueError: input tensor must be 1D.
-  """
-  shape = tensor.shape
-  if 1 != len(shape):
-    raise ValueError("input tensor must be 1D")
-  tensor_len = shape.dims[0].value
-  chunk_size = tensor_len // pieces
-  with ops.colocate_with(tensor):
-    if tensor_len != (pieces * chunk_size):
-      # last piece will be short
-      assert pieces > 1
-      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
-      assert last_chunk_size > 0
-      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
-      return array_ops.split(tensor, piece_lens)
-    else:
-      return array_ops.split(tensor, pieces)
-
-
-def _ring_permutations(num_workers, num_subchunks, gpu_perm):
-  """"Generate an array of device index arrays, one for each subchunk.
-
-  In the basic ring reduction algorithm there are size(T)/num_devices
-  data chunks and each device process one chunk per tick, i.e. sending
-  one chunk and receiving one chunk.  The idea of subchunking is that
-  each device processes num_subchunks smaller data regions per tick,
-  and the ring rank permutation is different for each subchunk index
-  so that a device is potentially sending to and receiving from
-  num_subchunks different other devices at each tick.  Where multiple
-  independent data channels exist between devices, this strategy
-  supplies a method of using them in parallel.
-
-  Args:
-    num_workers: number of worker tasks
-    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
-    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
-      ring order of GPUs at each worker.  Other permutations will be generated
-      by rotating this array and splicing together per-worker instances.
-
-  Raises:
-    ValueError: the number of subchunks may not exceed the number of GPUs.
-
-  Returns:
-    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-        preceding device in the permutation for that subchunk.  The
-        device index of GPU i at worker j is i + (j * num_gpus).
-    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
-       local rank of device d in the permutation for that subchunk.
-  """
-  num_gpus = len(gpu_perm)
-  devices = num_workers * num_gpus
-  if devices == 0:
-    return [], []
-  if num_subchunks > num_gpus:
-    raise ValueError(
-        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
-  rotation_interval = max(1, int(num_gpus / num_subchunks))
-  perms_by_s = []
-  for s in range(0, num_subchunks):
-    full_order = []
-    offset = s * rotation_interval
-    for w in range(0, num_workers):
-      default_order = [(w * num_gpus) + i for i in gpu_perm]
-      dev_order = default_order[offset:] + default_order[:offset]
-      full_order += dev_order
-    perms_by_s.append(full_order)
-  pred_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  rank_by_s_d = [[-1 for d in range(0, devices)]
-                 for s in range(0, num_subchunks)]
-  for s in range(0, num_subchunks):
-    for d in range(0, devices):
-      for t in range(0, devices):
-        if d == perms_by_s[s][t]:
-          rank_by_s_d[s][d] = t
-          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
-          break
-  return (pred_by_s_d, rank_by_s_d)
-
-
-def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
-                          gpu_perm, red_op, un_op=None):
-  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` objects, which must all
-      have the same shape and type.
-    num_workers: number of worker tasks spanned by input_tensors.
-    num_subchunks: number of subchunks each device should process in one tick.
-    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
-      each worker.  All workers must have the same number of
-      GPUs with the same rank ordering.  If NVLINK is available, this should
-      be a ring order supported by NVLINK edges.
-    red_op: a binary operator for elementwise reduction.
-    un_op: an optional unary operator to apply to fully reduced values.
-
-  Raises:
-    ValueError: empty input_tensors or they don't all have same
-    size.
-
-  Returns:
-    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
-  """
-  if len(input_tensors) < 2:
-    raise ValueError("input_tensors must be length 2 or longer")
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
-      num_workers, num_subchunks, gpu_perm)
-  chunks_by_dev, pad_len = _build_ring_gather(
-      input_tensors, devices,
-      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
-  if un_op:
-    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
-  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                                       chunks_by_dev)
-  if pad_len > 0:
-    output_tensors = _strip_padding(output_tensors, pad_len)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_ring_gather(input_tensors, devices, num_subchunks,
-                       pred_by_s_d, rank_by_s_d, red_op):
-  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
-
-  Args:
-    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
-      shape and type.
-    devices: array of device name strings
-    num_subchunks: number of subchunks each device should process in one tick.
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    red_op: a binary operator for elementwise reduction
-
-  Raises:
-    ValueError: tensors must all be one dimensional.
-
-  Returns:
-    list of list of T `tf.Tensor` of (partially) reduced values where
-    exactly num_subchunks chunks at each device are fully reduced.
-  """
-  num_devices = len(input_tensors)
-  if num_devices == 0:
-    return []
-  if num_devices == 1:
-    return input_tensors
-  shape = input_tensors[0].shape
-  if 1 != len(shape):
-    raise ValueError("input tensors must be 1D")
-  num_chunks = num_devices * num_subchunks
-  num_ticks = num_devices - 1
-  # Initialize chunks_by_dev with splits of the input tensors.
-  chunks_by_dev = []
-  split_pad_len = 0
-  for d in range(0, num_devices):
-    with ops.device(devices[d]):
-      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
-      chunks_by_dev.append(splits)
-  # Reduction phase
-  for tick in range(0, num_ticks):
-    # One new partial reduction for every chunk
-    new_partial_reductions = [None for _ in range(0, num_chunks)]
-    # Compute reductions with respect to last tick's values
-    for d in range(0, num_devices):
-      with ops.device(devices[d]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (2 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          new_partial_reductions[chunk_index] = red_op(
-              chunks_by_dev[pred_dev][chunk_index],
-              chunks_by_dev[d][chunk_index])
-    # Update chunks_by_dev with the new values at the end of the tick.
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (2 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
-  return chunks_by_dev, split_pad_len
-
-
-def _apply_unary_to_chunks(f, chunks_by_dev):
-  """Apply a unary op to each tensor in chunks_by_dev, on same device.
-
-  Args:
-    f: a unary function over T `tf.Tensor`.
-    chunks_by_dev: list of lists of T `tf.Tensor`.
-
-  Returns:
-    new list of lists of T `tf.Tensor` with the same structure as
-    chunks_by_dev containing the derived tensors.
-  """
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append([f(t) for t in x])
-  return output
-
-
-def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
-                        chunks_by_dev):
-  """Construct subgraph for second (scatter) pass of ring all-reduce.
-
-  Args:
-    pred_by_s_d: as produced by _ring_permutations
-    rank_by_s_d: as produced by _ring_permutations
-    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
-      (device, chunk)
-
-  Raises:
-    ValueError: chunks_by_dev is not well-formed
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device corresponding to the outer dimension of chunks_by_dev.
-  """
-  num_devices = len(chunks_by_dev)
-  num_chunks = len(chunks_by_dev[0])
-  if 0 != num_chunks % num_devices:
-    raise ValueError(
-        "Expect number of chunks per device to be divisible by num_devices")
-  num_subchunks = int(num_chunks / num_devices)
-  num_ticks = num_devices - 1
-  for tick in range(0, num_ticks):
-    passed_values = [None for _ in range(0, num_chunks)]
-    for d in range(0, num_devices):
-      with ops.colocate_with(chunks_by_dev[d][0]):
-        for s in range(0, num_subchunks):
-          rank = rank_by_s_d[s][d]
-          seg_index = (rank + num_devices - (1 + tick)) % num_devices
-          pred_dev = pred_by_s_d[s][d]
-          chunk_index = (seg_index * num_subchunks) + s
-          passed_values[chunk_index] = array_ops.identity(
-              chunks_by_dev[pred_dev][chunk_index])
-    for d in range(0, num_devices):
-      for s in range(0, num_subchunks):
-        rank = rank_by_s_d[s][d]
-        seg_index = (rank + num_devices - (1 + tick)) % num_devices
-        chunk_index = (seg_index * num_subchunks) + s
-        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
-  # Join chunks at each device.
-  output = []
-  for x in chunks_by_dev:
-    with ops.colocate_with(x[0]):
-      output.append(array_ops.concat(x, 0))
-  return output
-
-
-def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
-  """Construct a subgraph for recursive halving-doubling all-reduce.
-
-  The recursive halving-doubling algorithm is described in
-  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
-
-  The concept is to arrange the participating n devices in
-  a linear sequence where devices exchange data pairwise
-  with one other device in each round.  During the gather
-  phase there are lg(n) rounds where devices exchange
-  increasingly smaller sub-tensors with another device
-  at increasingly greater distances, until at the top
-  each device has 1/n of the fully reduced values.  During the
-  scatter phase each device exchanges its fully reduced
-  sub-tensor (which doubles in length at each round)
-  with one other device at increasingly smaller distances
-  until each device has all of the fully reduced values.
-
-  Note: this preliminary version requires that len(input_tensors) be a
-    power of 2.  TODO(tucker): relax this restriction.  Also, the
-    number of elements in each tensor must be divisible by 2^h where h
-    is the number of hops in each phase.  This will also be relaxed in
-    the future with edge-case specific logic.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    red_op: a binary elementwise reduction Op.
-    un_op: an optional unary elementwise Op to apply to reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors, one
-    at each device of input_tensors.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  devices = [t.device for t in input_tensors]
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
-  if un_op:
-    reduced_shards = [un_op(t) for t in reduced_shards]
-  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_recursive_hd_gather(input_tensors, devices, red_op):
-  """Construct the gather phase of recursive halving-doubling all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
-    devices: a list of strings naming the devices hosting input_tensors,
-      which will also be used to host the (partial) reduction values.
-    red_op: a binary elementwise reduction Op.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensor shards.
-
-  Raises:
-    ValueError: num_devices not a power of 2, or tensor len not divisible
-    by 2 the proper number of times.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  if num_devices != (2 ** num_hops):
-    raise ValueError("num_devices must be a power of 2")
-  chunks = input_tensors
-  for h in range(0, num_hops):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_dev = devices[d]
-      right_dev = devices[d + span]
-      left_split = array_ops.split(chunks[d], 2)
-      right_split = array_ops.split(chunks[d+span], 2)
-      with ops.device(left_dev):
-        new_chunks[d] = red_op(left_split[0], right_split[0])
-      with ops.device(right_dev):
-        new_chunks[d + span] = red_op(left_split[1], right_split[1])
-    chunks = new_chunks
-  return chunks
-
-
-def _build_recursive_hd_scatter(input_tensors, devices):
-  """Construct the scatter phase of recursive halving-doublng all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
-    devices: a list of strings naming the devices on which the reconstituted
-      full tensors should be placed.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  num_devices = len(devices)
-  num_hops = int(math.log(num_devices, 2))
-  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
-  chunks = input_tensors
-  for h in reversed(range(0, num_hops)):
-    span = 2 ** h
-    group_size = span * 2
-    new_chunks = [[] for _ in devices]
-    for d in range(0, num_devices):
-      if (d % group_size) >= (group_size / 2):
-        # skip right half of a pair
-        continue
-      left_idx = d
-      right_idx = d + span
-      left_dev = devices[left_idx]
-      right_dev = devices[right_idx]
-      with ops.device(left_dev):
-        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
-                                                 chunks[right_idx]], 0)
-      with ops.device(right_dev):
-        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
-                                                  chunks[right_idx]], 0)
-    chunks = new_chunks
-  return chunks
-
-
-def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct a subgraph for shuffle all-reduce.
-
-  Shuffle reduce is essentially the algorithm implemented when using
-  parameter servers.  Suppose tensor length is n, there are d devices
-  and g gather shards.  Each device sends a n/g length sub-tensor to
-  each gather shard.  The gather shards perform a reduction across d
-  fragments, then broadcast the result back to each device.  The
-  devices then join the g fully reduced fragments they receive from
-  the shards.  The gather shards could perform d-1 pairwise
-  reductions, or one d-way reduction.  The first is better where
-  reduction Op time is low compared to transmission time, the second
-  better in the other case.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: an n-array elementwise reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced tensors.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  dst_devices = [t.device for t in input_tensors]
-  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
-                                         red_op, un_op)
-  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
-  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
-
-  Args:
-    input_tensors: list of T @(tf.Tensor} values to be reduced.
-    gather_devices: list of names of devices on which reduction shards
-      should be placed.
-    red_op: the binary reduction Op
-    un_op: optional elementwise unary Op to be applied to fully-reduced values.
-
-  Returns:
-    list of T `tf.Tensor` which are the fully reduced shards.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  num_source_devices = len(input_tensors)
-  num_gather_devices = len(gather_devices)
-  shape = input_tensors[0].shape
-  if len(shape) != 1:
-    raise ValueError("input_tensors must be 1D")
-  shards_by_source = []
-  for d in range(0, num_source_devices):
-    with ops.colocate_with(input_tensors[d]):
-      shards_by_source.append(
-          _ragged_split(input_tensors[d], num_gather_devices))
-  reduced_shards = []
-  for d in range(0, num_gather_devices):
-    with ops.device(gather_devices[d]):
-      values = [s[d] for s in shards_by_source]
-      red_shard = red_op(values)
-      if un_op:
-        red_shard = un_op(red_shard)
-      reduced_shards.append(red_shard)
-  return reduced_shards
-
-
-def _build_shuffle_scatter(reduced_shards, dst_devices):
-  """Build the scatter phase of shuffle all-reduce.
-
-  Args:
-    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
-    dst_devices: list of names of devices at which the fully-reduced value
-      should be reconstituted.
-
-  Returns:
-    list of T `tf.Tensor` scattered tensors.
-  """
-  num_devices = len(dst_devices)
-  out_tensors = []
-  for d in range(0, num_devices):
-    with ops.device(dst_devices[d]):
-      out_tensors.append(array_ops.concat(reduced_shards, 0))
-  return out_tensors
-
-
-def _split_by_task(devices, values):
-  """Partition devices and values by common task.
-
-  Args:
-    devices: list of device name strings
-    values: list of T `tf.tensor` of same length as devices.
-
-  Returns:
-    (per_task_devices, per_task_values) where both values are
-    lists of lists with isomorphic structure: the outer list is
-    indexed by task, and the inner list has length of the number
-    of values belonging to that task.  per_task_devices contains
-    the specific devices to which the values are local, and
-    per_task_values contains the corresponding values.
-
-  Raises:
-    ValueError: devices must be same length as values.
-  """
-  num_devices = len(devices)
-  if num_devices != len(values):
-    raise ValueError("len(devices) must equal len(values)")
-  per_task_devices = collections.OrderedDict()
-  per_task_values = collections.OrderedDict()
-  for d in range(num_devices):
-    d_spec = device_lib.DeviceSpec.from_string(devices[d])
-    if not hasattr(d_spec, "task") or d_spec.task is None:
-      assert False, "failed to parse device %s" % devices[d]
-    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
-    if index not in per_task_devices:
-      per_task_devices[index] = []
-      per_task_values[index] = []
-    per_task_devices[index].append(devices[d])
-    per_task_values[index].append(values[d])
-
-  return (list(per_task_devices.values()), list(per_task_values.values()))
-
-
-def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
-  """Build a subgraph that does one full all-reduce, using NCCL.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.  Must be one of
-      {tf.add}
-    un_op: optional unary elementwise Op to apply to fully-reduce values.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: red_op not supported.
-  """
-  if red_op == math_ops.add:
-    output_tensors = nccl_ops.all_sum(input_tensors)
-  else:
-    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
-  if un_op:
-    un_op_wrapped = []
-    for t in output_tensors:
-      with ops.colocate_with(t):
-        un_op_wrapped.append(un_op(t))
-    output_tensors = un_op_wrapped
-  return output_tensors
-
-
-def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
-  """Construct a subgraph for NCCL hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = [None for w in range(0, num_workers)]
-  up_devices = up_values[:]
-  down_values = up_values[:]
-  # First stage: reduce within each worker using NCCL
-  for w in range(0, num_workers):
-    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
-    # NOTE: these reductions will not run to completion unless
-    # every output value is used.  Since we only need one, we
-    # need to put control dependencies on the rest.
-    with ops.control_dependencies(worker_values):
-      with ops.device(worker_values[0].device):
-        up_values[w] = array_ops.identity(worker_values[0])
-      up_devices[w] = per_worker_devices[w][0]
-  # Second stage: Apply upper_level_f to reduce across first device at
-  # each worker
-  level_2_output = upper_level_f(up_values)
-  # Third stage: propagate within each worker using NCCL Broadcast
-  for w in range(0, num_workers):
-    dst_tensors = []
-    with ops.device(per_worker_devices[w][0]):
-      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
-    for d in per_worker_devices[w]:
-      with ops.device(d):
-        dst_tensors.append(array_ops.identity(broadcast_src))
-    down_values[w] = dst_tensors
-  output_tensors = [v for sublist in down_values for v in sublist]
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def _reduce_non_singleton(input_tensors, red_f, un_op):
-  """If input_tensors has more than one element apply red_f, else apply un_op."""
-  if len(input_tensors) > 1:
-    return red_f(input_tensors)
-  else:
-    if not un_op:
-      return input_tensors
-    output_tensors = []
-    for t in input_tensors:
-      with ops.colocate_with(t):
-        output_tensors.append(un_op(t))
-    return output_tensors
-
-
-def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Ring across workers."""
-  def upper_builder(y):
-    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
-  def upper_level_f(x):
-    return _reduce_non_singleton(x, upper_builder, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
-  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
-
-
-def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
-                            shuffle_red_op, un_op=None):
-  """Construct hybrid of NCCL within workers, Shuffle across workers."""
-  upper_level_f = lambda x: build_shuffle_all_reduce(x, gather_devices,
-                                                     shuffle_red_op, un_op)
-  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
-
-
-def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
-  """Construct a subgraph for Shuffle hybrid all-reduce.
-
-  Args:
-    input_tensors: list of T `tf.Tensor` of same-shape and type values to
-      be reduced.
-    gather_devices: list of device names on which to host gather shards.
-    red_op: binary elementwise reduction operator.
-    upper_level_f: function for reducing one value per worker, across
-      workers.
-
-  Returns:
-    list of T `tf.Tensor` of reduced values.
-
-  Raises:
-    ValueError: inputs not well-formed.
-  """
-  input_tensors, shape = _flatten_tensors(input_tensors)
-  # First stage, reduce across each worker using gather_devices.
-  devices = [t.device for t in input_tensors]
-  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
-  num_workers = len(per_worker_devices)
-  up_values = []
-  if len(gather_devices) != num_workers:
-    raise ValueError("For shuffle hybrid, gather_devices must contain one "
-                     "device per worker. ")
-  for w in range(0, num_workers):
-    reduced_shards = _build_shuffle_gather(
-        per_worker_values[w], [gather_devices[w]], red_op)
-    up_values.append(reduced_shards[0])
-  # Second stage, apply upper_level_f.
-  level_2_output = upper_level_f(up_values)
-  # Third stage, apply shuffle scatter at each worker.
-  output_tensors = []
-  for w in range(0, num_workers):
-    output_tensors += _build_shuffle_scatter(
-        [level_2_output[w]], per_worker_devices[w])
-  if len(shape) != 1:
-    output_tensors = _reshape_tensors(output_tensors, shape)
-  return output_tensors
-
-
-def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
-                            red_n_op, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Ring across workers."""
-  def upper_builder(tensors):
-    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
-                                 red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, gather_devices, red_n_op, upper_level_f)
-
-
-def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
-                               second_gather_devices, red_op, un_op=None):
-  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
-  def upper_builder(tensors):
-    return build_shuffle_all_reduce(tensors, second_gather_devices,
-                                    red_op, un_op)
-  def upper_level_f(tensors):
-    return _reduce_non_singleton(tensors, upper_builder, un_op)
-  return _build_shuffle_hybrid(
-      input_tensors, first_gather_devices, red_op, upper_level_f)
+# pylint: disable=unused-import,wildcard-import
+from tensorflow.python.distribute.all_reduce import *
diff --git a/tensorflow/contrib/android/cmake/build.gradle b/tensorflow/contrib/android/cmake/build.gradle
index 17a57b99fd6c9efc09bda0ce1249b1f51bd5af5c..ddec08894f34f96b080610f1d27a6a436f7ffa91 100644
--- a/tensorflow/contrib/android/cmake/build.gradle
+++ b/tensorflow/contrib/android/cmake/build.gradle
@@ -22,8 +22,8 @@ android {
         }
         externalNativeBuild {
             cmake {
-                arguments '-DANDROID_TOOLCHAIN=gcc',
-                          '-DANDROID_STL=gnustl_static'
+                arguments '-DANDROID_TOOLCHAIN=clang',
+                          '-DANDROID_STL=c++_static'
             }
         }
     }
@@ -70,7 +70,7 @@ if (ndkDir == null || ndkDir == "") {
     ndkDir = System.getenv('ANDROID_NDK_HOME')
 }
 
-if(! Os.isFamily(Os.FAMILY_WINDOWS)) {
+if (!Os.isFamily(Os.FAMILY_WINDOWS)) {
     // This script is for non-Windows OS. For Windows OS, MANUALLY build
     // (or copy the built) libs/headers to the
     //    ${TENSORFLOW_ROOT_DIR}/tensorflow/contrib/makefile/gen
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/BUILD b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..6d2d70c99b4cc804f2c8bf57afdc8c11f1f73516
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/BUILD
@@ -0,0 +1,36 @@
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/tools/test:performance.bzl", "tf_py_logged_benchmark")
+
+py_library(
+    name = "benchmark_base",
+    srcs = [
+        "benchmark_base.py",
+    ],
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
+py_test(
+    name = "cartpole_benchmark",
+    size = "enormous",
+    srcs = ["cartpole_benchmark.py"],
+    tags = [
+        "local",
+        "manual",
+        "no_oss",
+        "notap",
+        "nozapfhahn",
+    ],
+    deps = [
+        ":benchmark_base",
+        # Note: required gym dependency may need to be added here.
+    ],
+)
+
+tf_py_logged_benchmark(
+    name = "cartpole_logged_benchmark",
+    target = "//tensorflow/contrib/autograph/examples/benchmarks:cartpole_benchmark",
+)
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..93c694849c4dc3faca71e7f9d8614649a7784f99
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/benchmark_base.py
@@ -0,0 +1,62 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Common benchmarking code.
+
+See https://www.tensorflow.org/community/benchmarks for usage.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+import tensorflow as tf
+
+
+class ReportingBenchmark(tf.test.Benchmark):
+  """Base class for a benchmark that reports general performance metrics.
+
+  Subclasses only need to call one of the _profile methods, and optionally
+  report_results.
+  """
+
+  def time_execution(self, name, target, iters, warm_up_iters=5):
+    for _ in range(warm_up_iters):
+      target()
+
+    all_times = []
+    for _ in range(iters):
+      iter_time = time.time()
+      target()
+      all_times.append(time.time() - iter_time)
+
+    avg_time = np.average(all_times)
+
+    extras = dict()
+    extras['all_times'] = all_times
+
+    if isinstance(name, tuple):
+      extras['name'] = name
+      name = '_'.join(str(piece) for piece in name)
+
+    self.report_benchmark(
+        iters=iters, wall_time=avg_time, name=name, extras=extras)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f553be58e94f11e45f0697558348fbbd26bfb91
--- /dev/null
+++ b/tensorflow/contrib/autograph/examples/benchmarks/cartpole_benchmark.py
@@ -0,0 +1,492 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""A basic RL cartpole benchmark.
+
+The RL model uses the OpenAI Gym environment to train a simple network using
+the policy gradients method. The training scales the gradients for each step
+by the episode's cumulative discounted reward and averages these gradients over
+a fixed number of games before applying the optimization step.
+
+For benchmarking purposes, we replace the OpenAI Gym environment to a fake
+that returns random actions and rewards and never ends the episode. This way
+the benchmarks compare the same amount of computation at each step.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gym
+import numpy as np
+import tensorflow as tf
+
+from tensorflow.contrib import eager
+from tensorflow.contrib.autograph.examples.benchmarks import benchmark_base
+from tensorflow.python import autograph as ag
+from tensorflow.python.eager import context
+
+#
+# AutoGraph implementation
+#
+
+
+@ag.convert()
+def graph_append_discounted_rewards(destination, rewards, discount_rate):
+  """Discounts episode rewards and appends them to destination."""
+  ag.set_element_type(rewards, tf.float32)
+
+  cdr = 0.0
+  reverse_discounted = []
+  ag.set_element_type(reverse_discounted, tf.float32)
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    cdr.set_shape(())
+    reverse_discounted.append(cdr)
+
+  retval = destination
+  # Note: AutoGraph doesn't yet support reversed() so we use a loop instead.
+  for i in range(len(reverse_discounted) - 1, -1, -1):
+    retval.append(reverse_discounted[i])
+
+  return retval
+
+
+class GraphPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(GraphPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  # TODO(mdan): Move this method out of the class.
+  @ag.convert()
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    var_list = tf.trainable_variables()
+    grad_list = [
+        tf.TensorArray(tf.float32, 0, dynamic_size=True) for _ in var_list
+    ]
+
+    step_counts = []
+    discounted_rewards = []
+    ag.set_element_type(discounted_rewards, tf.float32)
+    ag.set_element_type(step_counts, tf.int32)
+
+    # Note: we use a shared object, cart_pole_env here. Because calls to the
+    # object's method are made through py_func, TensorFlow cannot detect its
+    # data dependencies. Hence we must manually synchronize access to it
+    # and ensure the control dependencies are set in such a way that
+    # calls to reset(), take_one_step, etc. are made in the correct order.
+    sync_counter = tf.constant(0)
+
+    for _ in tf.range(num_games):
+      with tf.control_dependencies([sync_counter]):
+        obs = cart_pole_env.reset()
+        with tf.control_dependencies([obs]):
+          sync_counter += 1
+
+        game_rewards = []
+        ag.set_element_type(game_rewards, tf.float32)
+
+        for step in tf.range(max_steps_per_game):
+          logits, actions = self(obs)  # pylint:disable=not-callable
+          logits = tf.reshape(logits, ())
+          actions = tf.reshape(actions, ())
+
+          labels = 1.0 - tf.cast(actions, tf.float32)
+          loss = tf.nn.sigmoid_cross_entropy_with_logits(
+              labels=labels, logits=logits)
+          grads = tf.gradients(loss, var_list)
+
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+          with tf.control_dependencies([sync_counter]):
+            obs, reward, done = cart_pole_env.step(actions)
+            with tf.control_dependencies([obs]):
+              sync_counter += 1
+            obs = tf.reshape(obs, (1, 4))
+
+          game_rewards.append(reward)
+          if reward < 0.1 or done:
+            step_counts.append(step + 1)
+            break
+
+        discounted_rewards = graph_append_discounted_rewards(
+            discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = ag.stack(discounted_rewards)
+    discounted_rewards.set_shape((None,))
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = ag.stack(grad_list[i])
+
+      # This block just adjusts the shapes to match for multiplication.
+      r = normalized_rewards
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+      if r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return ag.stack(step_counts)
+
+
+@ag.convert()
+def graph_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  i = tf.constant(0)
+  mean_steps_per_iteration = []
+  ag.set_element_type(mean_steps_per_iteration, tf.int32)
+
+  while i < iterations:
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+    i += 1
+
+  return ag.stack(mean_steps_per_iteration)
+
+
+class GraphGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    obs = ag.utils.wrap_py_func(self.env.reset, tf.float64, ())
+    obs = tf.reshape(obs, (1, 4))
+    obs = tf.cast(obs, tf.float32)
+    return obs
+
+  def step(self, actions):
+
+    def take_one_step(actions):
+      obs, reward, done, _ = self.env.step(actions)
+      obs = obs.astype(np.float32)
+      reward = np.float32(reward)
+      return obs, reward, done
+
+    return ag.utils.wrap_py_func(take_one_step,
+                                 (tf.float32, tf.float32, tf.bool), (actions,))
+
+
+class GraphRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return tf.random.normal((1, 4))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = tf.random.normal((1, 4))
+      fixed_reward = tf.constant(0.001)
+      done = tf.constant(False)
+      return random_obs, fixed_reward, done
+
+
+#
+# Eager implementation
+#
+
+
+def eager_append_discounted_rewards(discounted_rewards, rewards, discount_rate):
+  cdr = 0.0
+  reverse_discounted = []
+
+  for i in range(len(rewards) - 1, -1, -1):
+    cdr = cdr * discount_rate + rewards[i]
+    reverse_discounted.append(cdr)
+
+  discounted_rewards.extend(reversed(reverse_discounted))
+  return discounted_rewards
+
+
+class EagerPolicyNetwork(tf.keras.Model):
+  """Policy network for the cart-pole reinforcement learning problem.
+
+  The forward path of the network takes an observation from the cart-pole
+  environment (length-4 vector) and outputs an action.
+  """
+
+  def __init__(self, hidden_size):
+    super(EagerPolicyNetwork, self).__init__()
+    self._hidden_layer = tf.keras.layers.Dense(
+        hidden_size, activation=tf.nn.elu)
+    self._output_layer = tf.keras.layers.Dense(1)
+
+  def call(self, inputs):
+    """Calculates logits and action.
+
+    Args:
+      inputs: Observations from a step in the cart-pole environment, of shape
+        `(batch_size, input_size)`
+
+    Returns:
+      logits: the logits output by the output layer. This can be viewed as the
+        likelihood vales of choosing the left (0) action. Shape:
+        `(batch_size, 1)`.
+      actions: randomly selected actions ({0, 1}) based on the logits. Shape:
+        `(batch_size, 1)`.
+    """
+    hidden = self._hidden_layer(inputs)
+    logits = self._output_layer(hidden)
+
+    left_prob = tf.nn.sigmoid(logits)
+    action_probs = tf.concat([left_prob, 1.0 - left_prob], 1)
+
+    self._grad_fn = eager.implicit_gradients(
+        self._get_cross_entropy_and_save_actions)
+
+    actions = tf.multinomial(tf.log(action_probs), 1)
+    return logits, actions
+
+  def _get_cross_entropy_and_save_actions(self, inputs):
+    logits, actions = self(inputs)  # pylint:disable=not-callable
+    self._current_actions = actions
+    labels = 1.0 - tf.cast(actions, tf.float32)
+    return tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
+
+  def train(self, cart_pole_env, optimizer, discount_rate, num_games,
+            max_steps_per_game):
+    grad_list = None
+
+    step_counts = []
+    discounted_rewards = []
+
+    for _ in range(num_games):
+      obs = cart_pole_env.reset()
+
+      game_rewards = []
+
+      for step in range(max_steps_per_game):
+        grads_and_vars = self._grad_fn(tf.constant([obs], dtype=tf.float32))
+        grads, var_list = zip(*grads_and_vars)
+        actions = self._current_actions.numpy()[0][0]
+
+        if grad_list is None:
+          grad_list = [[g] for g in grads]
+        else:
+          for i in range(len(grads)):
+            grad_list[i].append(grads[i])
+
+        obs, reward, done = cart_pole_env.step(actions)
+
+        game_rewards.append(reward)
+        if reward < 0.1 or done:
+          step_counts.append(step + 1)
+          break
+
+      discounted_rewards = eager_append_discounted_rewards(
+          discounted_rewards, game_rewards, discount_rate)
+
+    discounted_rewards = tf.stack(discounted_rewards)
+    mean, variance = tf.nn.moments(discounted_rewards, [0])
+    normalized_rewards = (discounted_rewards - mean) / tf.sqrt(variance)
+
+    for i in range(len(grad_list)):
+      g = tf.stack(grad_list[i])
+
+      r = normalized_rewards
+      while r.shape.ndims < g.shape.ndims:
+        r = tf.expand_dims(r, -1)
+
+      grad_list[i] = tf.reduce_mean(g * r, axis=0)
+
+    optimizer.apply_gradients(
+        zip(grad_list, var_list), global_step=tf.train.get_global_step())
+
+    return tf.stack(step_counts)
+
+
+def eager_train_model(policy_network, cart_pole_env, optimizer, iterations):
+  """Trains the policy network for a given number of iterations."""
+  mean_steps_per_iteration = []
+
+  for _ in range(iterations):
+    steps_per_game = policy_network.train(
+        cart_pole_env,
+        optimizer,
+        discount_rate=0.95,
+        num_games=20,
+        max_steps_per_game=200)
+    mean_steps_per_iteration.append(tf.reduce_mean(steps_per_game))
+
+  return mean_steps_per_iteration
+
+
+class EagerGymCartpoleEnv(object):
+  """An env backed by OpenAI Gym's CartPole environment.
+
+  Used to confirm a functional model only.
+  """
+
+  def __init__(self):
+    cart_pole_env = gym.make('CartPole-v1')
+    cart_pole_env.seed(0)
+    cart_pole_env.reset()
+    self.env = cart_pole_env
+
+  def reset(self):
+    return self.env.reset()
+
+  def step(self, actions):
+    obs, reward, done, _ = self.env.step(actions)
+    return obs, reward, done
+
+
+class EagerRandomCartpoleEnv(object):
+  """An environment that returns random actions and never finishes.
+
+  Used during benchmarking, it will cause training to run a constant number of
+  steps.
+  """
+
+  def reset(self):
+    return np.random.normal(size=(4,))
+
+  def step(self, actions):
+    with tf.control_dependencies([actions]):
+      random_obs = np.random.normal(size=(4,))
+      fixed_reward = 0.001
+      done = False
+      return random_obs, fixed_reward, done
+
+
+def graph_demo_training():
+  """Not used in the benchmark. Used to confirm a functional model."""
+  with tf.Graph().as_default():
+    tf.set_random_seed(0)
+
+    network = GraphPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = GraphGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    train_ops = graph_train_model(network, env, opt, iterations=5)
+
+    with tf.Session() as sess:
+      sess.run(tf.global_variables_initializer())
+      sess.run(tf.local_variables_initializer())
+      steps_per_iteration = sess.run(train_ops)
+      for i, steps in enumerate(steps_per_iteration):
+        print('Step {} iterations: {}'.format(i, steps))
+
+
+def eager_demo_training():
+  with context.eager_mode():
+    network = EagerPolicyNetwork(hidden_size=5)
+    network.build((1, 4))
+    env = EagerGymCartpoleEnv()
+    opt = tf.train.AdamOptimizer(0.05)
+
+    steps_per_iteration = eager_train_model(network, env, opt, iterations=5)
+    for i, steps in enumerate(steps_per_iteration):
+      print('Step {} iterations: {}'.format(i, steps))
+
+
+class RLCartPoleBenchmark(benchmark_base.ReportingBenchmark):
+  """Actual benchmark.
+
+  Trains the RL agent a fixed number of times, on random environments that
+  result in constant number of steps.
+  """
+
+  def benchmark_cartpole(self):
+
+    def train_session(sess, ops):
+      return lambda: sess.run(ops)
+
+    def train_eager(network, env, opt):
+      return lambda: eager_train_model(network, env, opt, iterations=10)
+
+    for model_size in (10, 100, 1000):
+      with tf.Graph().as_default():
+        network = GraphPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = GraphRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+        train_ops = graph_train_model(network, env, opt, iterations=10)
+
+        with tf.Session() as sess:
+          sess.run(tf.global_variables_initializer())
+          sess.run(tf.local_variables_initializer())
+
+          self.time_execution(('cartpole', 'autograph', model_size),
+                              train_session(sess, train_ops), 20)
+
+      with context.eager_mode():
+        network = EagerPolicyNetwork(hidden_size=model_size)
+        network.build((1, 4))
+        env = EagerRandomCartpoleEnv()
+        opt = tf.train.AdamOptimizer(0.05)
+
+        self.time_execution(('cartpole', 'eager', model_size),
+                            train_eager(network, env, opt), 20)
+
+
+if __name__ == '__main__':
+  tf.test.main()
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops.py b/tensorflow/contrib/batching/python/ops/batch_ops.py
index 55faad983f2bcf2f3fa633669bd371608e2e925b..3e4d0dc1cec76b068c1c846eb476eec615e4f613 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops.py
@@ -18,8 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import function
+from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.ops import gen_batch_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
@@ -101,12 +102,15 @@ def batch_function(num_batch_threads,
   def decorator(fn):  # pylint: disable=missing-docstring
 
     def decorated(*args):  # pylint: disable=missing-docstring
-      types = [arg.dtype for arg in args]
 
-      @function.Defun(*types)
+      @function.defun()
       def computation(*computation_args):
         return fn(*computation_args)
 
+      computation = computation.get_concrete_function(
+          *[tensor_spec.TensorSpec(dtype=x.dtype, shape=x.shape, name=str(i))
+            for i, x in enumerate(args)])
+
       with ops.name_scope("batch") as name:
         for a in args:
           if not isinstance(a, ops.Tensor):
@@ -123,7 +127,7 @@ def batch_function(num_batch_threads,
             f=computation,
             in_tensors=list(args),
             captured_tensors=computation.captured_inputs,
-            Tout=[o.type for o in computation.definition.signature.output_arg])
+            Tout=[o.dtype for o in computation.outputs])
 
     return decorated
 
diff --git a/tensorflow/contrib/batching/python/ops/batch_ops_test.py b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
index 01ee8703a93836d607ee9b765c51c79fe3bb974f..9109b9c1c91cefa4c52bad49de23336a6e05e1ef 100644
--- a/tensorflow/contrib/batching/python/ops/batch_ops_test.py
+++ b/tensorflow/contrib/batching/python/ops/batch_ops_test.py
@@ -219,6 +219,7 @@ class BatchOpsTest(test.TestCase):
 
       @batch_ops.batch_function(1, 10, 100000)
       def computation(in_t):
+        self.assertTrue(in_t.shape is not None)
         return in_t + 1
 
       inp = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
diff --git a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
index 13215ffabf3a956d3f83697f867457b2fa72e7c9..8b6ed9f041b89a0da02a505ec261bca82b094f74 100644
--- a/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
+++ b/tensorflow/contrib/bayesflow/python/kernel_tests/monte_carlo_test.py
@@ -81,7 +81,7 @@ class ExpectationImportanceSampleTest(test.TestCase):
       # Compute E_p[X_1 * X_2 > 0], with X_i the ith component of X ~ p(x).
       # Should equal 1/2 because p is a spherical Gaussian centered at (0, 0).
       def indicator(x):
-        x1_times_x2 = math_ops.reduce_prod(x, reduction_indices=[-1])
+        x1_times_x2 = math_ops.reduce_prod(x, axis=[-1])
         return 0.5 * (math_ops.sign(x1_times_x2) + 1.0)
 
       prob = mc.expectation_importance_sampler(
diff --git a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
index 18d40fc1dff8e7c9aefffbe3ceba770598a42096..e83a54851195708eb7e6412b7400236f4bc06e6b 100644
--- a/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
+++ b/tensorflow/contrib/bayesflow/python/ops/monte_carlo_impl.py
@@ -353,12 +353,12 @@ def expectation(f, samples, log_prob=None, use_reparametrization=True,
 
 def _sample_mean(values):
   """Mean over sample indices.  In this module this is always [0]."""
-  return math_ops.reduce_mean(values, reduction_indices=[0])
+  return math_ops.reduce_mean(values, axis=[0])
 
 
 def _sample_max(values):
   """Max over sample indices.  In this module this is always [0]."""
-  return math_ops.reduce_max(values, reduction_indices=[0])
+  return math_ops.reduce_max(values, axis=[0])
 
 
 def _get_samples(dist, z, n, seed):
diff --git a/tensorflow/contrib/bigtable/README.md b/tensorflow/contrib/bigtable/README.md
index 2c44abed5e1955cc666273e97e6b2378766f13d2..79052bee35c7895cb4048b10c1f73acb036d1587 100644
--- a/tensorflow/contrib/bigtable/README.md
+++ b/tensorflow/contrib/bigtable/README.md
@@ -51,25 +51,18 @@ BIGTABLE_TABLE_NAME = '<FILL_ME_IN>'
 PREFIX = 'train-'
 
 def main():
+  tf.enable_eager_execution()
+
   client = tf.contrib.cloud.BigtableClient(GCP_PROJECT_ID, BIGTABLE_INSTANCE_ID)
   table = client.table(BIGTABLE_TABLE_NAME)
   dataset = table.keys_by_prefix_dataset(PREFIX)
-  iterator = dataset.make_initializable_iterator()
-  get_next_op = iterator.get_next()
 
-  with tf.Session() as sess:
-    print('Initializing the iterator.')
-    sess.run(iterator.initializer)
-    print('Retrieving rows:')
-    row_index = 0
-    while True:
-      try:
-        row_key = sess.run(get_next_op)
-        print('Row key %d: %s' % (row_index, row_key))
-        row_index += 1
-      except tf.errors.OutOfRangeError:
-        print('Finished reading data!')
-        break
+  print('Retrieving rows:')
+  row_index = 0
+  for row_key in dataset:
+    print('Row key %d: %s' % (row_index, row_key))
+    row_index += 1
+  print('Finished reading data!')
 
 if __name__ == '__main__':
   main()
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
index f083ce6f44b3c2a83d9b5d3235056eb94c4be4a8..e95dc577184f7e81d942755b41065f52131ce9f6 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.cc
@@ -366,6 +366,39 @@ BigtableTestClient::MutateRows(
   return MakeUnique<MutateRowsResponse>(request.entries_size());
 }
 
+std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+    google::bigtable::v2::MutateRowResponse>>
+BigtableTestClient::AsyncMutateRow(
+    grpc::ClientContext* context,
+    google::bigtable::v2::MutateRowRequest const& request,
+    grpc::CompletionQueue* cq) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
+std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+    ::google::bigtable::v2::SampleRowKeysResponse>>
+BigtableTestClient::AsyncSampleRowKeys(
+    ::grpc::ClientContext* context,
+    const ::google::bigtable::v2::SampleRowKeysRequest& request,
+    ::grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
+std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+    ::google::bigtable::v2::MutateRowsResponse>>
+BigtableTestClient::AsyncMutateRows(
+    ::grpc::ClientContext* context,
+    const ::google::bigtable::v2::MutateRowsRequest& request,
+    ::grpc::CompletionQueue* cq, void* tag) {
+  LOG(WARNING) << "Call to InMemoryDataClient::" << __func__
+               << "(); this will likely cause a crash!";
+  return nullptr;
+}
+
 std::shared_ptr<grpc::Channel> BigtableTestClient::Channel() {
   LOG(WARNING) << "Call to InMemoryDataClient::Channel(); this will likely "
                   "cause a crash!";
diff --git a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
index dac2b16a216d26f02684c7401ed2ddaa4b7baddb..c4a1f06bc504c3565c7bb09b42e48e7fbddb9cc6 100644
--- a/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
+++ b/tensorflow/contrib/bigtable/kernels/test_kernels/bigtable_test_client.h
@@ -61,6 +61,25 @@ class BigtableTestClient : public ::google::cloud::bigtable::DataClient {
   MutateRows(grpc::ClientContext* context,
              google::bigtable::v2::MutateRowsRequest const& request) override;
 
+  std::unique_ptr<grpc::ClientAsyncResponseReaderInterface<
+      google::bigtable::v2::MutateRowResponse>>
+  AsyncMutateRow(grpc::ClientContext* context,
+                 google::bigtable::v2::MutateRowRequest const& request,
+                 grpc::CompletionQueue* cq) override;
+
+  std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+      ::google::bigtable::v2::SampleRowKeysResponse>>
+  AsyncSampleRowKeys(
+      ::grpc::ClientContext* context,
+      const ::google::bigtable::v2::SampleRowKeysRequest& request,
+      ::grpc::CompletionQueue* cq, void* tag) override;
+
+  std::unique_ptr<::grpc::ClientAsyncReaderInterface<
+      ::google::bigtable::v2::MutateRowsResponse>>
+  AsyncMutateRows(::grpc::ClientContext* context,
+                  const ::google::bigtable::v2::MutateRowsRequest& request,
+                  ::grpc::CompletionQueue* cq, void* tag) override;
+
   std::shared_ptr<grpc::Channel> Channel() override;
 
  private:
diff --git a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
index 316da9ebe152ef52c7e7f846cf8c3eb1555ee8a6..197f5578eb010bee5a3aad7c05446393193f99e2 100644
--- a/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
+++ b/tensorflow/contrib/bigtable/python/kernel_tests/bigtable_ops_test.py
@@ -57,7 +57,7 @@ class BigtableOpsTest(test.TestCase):
     sess.run(write_op)
 
   def runReadKeyTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected = list(self.COMMON_ROW_KEYS)
     expected.reverse()
@@ -78,7 +78,7 @@ class BigtableOpsTest(test.TestCase):
     self.runReadKeyTest(self._table.keys_by_range_dataset("r1", "r4"))
 
   def runScanTest(self, read_ds):
-    itr = read_ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(read_ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_keys.reverse()
@@ -120,7 +120,7 @@ class BigtableOpsTest(test.TestCase):
   def testLookup(self):
     ds = self._table.keys_by_prefix_dataset("r")
     ds = ds.apply(self._table.lookup_columns(cf1="c1"))
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_keys = list(self.COMMON_ROW_KEYS)
     expected_values = list(self.COMMON_VALUES)
@@ -141,7 +141,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testSampleKeys(self):
     ds = self._table.sample_keys()
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     expected_key = self.COMMON_ROW_KEYS[0]
     with self.cached_session() as sess:
@@ -161,7 +161,7 @@ class BigtableOpsTest(test.TestCase):
         sess.run(n)
 
   def runSampleKeyPairsTest(self, ds, expected_key_pairs):
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -218,7 +218,7 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndStartKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="r1", end="")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
@@ -226,14 +226,14 @@ class BigtableOpsTest(test.TestCase):
   def testSampleKeyPairsPrefixAndEndKey(self):
     ds = bigtable_api._BigtableSampleKeyPairsDataset(
         self._table, prefix="r", start="", end="r3")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     with self.cached_session() as sess:
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(itr.initializer)
 
   def testParallelScanPrefix(self):
     ds = self._table.parallel_scan_prefix(prefix="r", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
@@ -251,7 +251,7 @@ class BigtableOpsTest(test.TestCase):
 
   def testParallelScanRange(self):
     ds = self._table.parallel_scan_range(start="r1", end="r4", cf1="c1")
-    itr = ds.make_initializable_iterator()
+    itr = dataset_ops.make_initializable_iterator(ds)
     n = itr.get_next()
     with self.cached_session() as sess:
       self._writeCommonValues(sess)
diff --git a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
index 7c87b0daeb09950cc44c51f49c16534d413f0376..b6cdc7aab0320fe5f457288ada03a46e18a694cc 100644
--- a/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
+++ b/tensorflow/contrib/bigtable/python/ops/bigtable_api.py
@@ -35,8 +35,8 @@ from tensorflow.contrib.util import loader
 from tensorflow.python.data.experimental.ops import interleave_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.platform import resource_loader
 
@@ -111,8 +111,7 @@ class BigtableClient(object):
 
 
 class BigtableTable(object):
-  """BigtableTable is the entrypoint for reading and writing data in Cloud
-  Bigtable.
+  """Entry point for reading and writing data in Cloud Bigtable.
 
   This BigtableTable class is the Python representation of the Cloud Bigtable
   table within TensorFlow. Methods on this class allow data to be read from and
@@ -222,7 +221,7 @@ class BigtableTable(object):
       A `tf.data.Dataset`. containing `tf.string` Tensors corresponding to all
       of the row keys matching that prefix.
     """
-    return _BigtablePrefixKeyDataset(self, prefix)
+    return dataset_ops.DatasetV1Adapter(_BigtablePrefixKeyDataset(self, prefix))
 
   def sample_keys(self):
     """Retrieves a sampling of row keys from the Bigtable table.
@@ -234,7 +233,7 @@ class BigtableTable(object):
     Returns:
       A `tf.data.Dataset` returning string row keys.
     """
-    return _BigtableSampleKeysDataset(self)
+    return dataset_ops.DatasetV1Adapter(_BigtableSampleKeysDataset(self))
 
   def scan_prefix(self, prefix, probability=None, columns=None, **kwargs):
     """Retrieves row (including values) from the Bigtable service.
@@ -279,7 +278,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, prefix, "", "", normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, prefix, "", "", normalized, probability))
 
   def scan_range(self, start, end, probability=None, columns=None, **kwargs):
     """Retrieves rows (including values) from the Bigtable service.
@@ -324,7 +324,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    return _BigtableScanDataset(self, "", start, end, normalized, probability)
+    return dataset_ops.DatasetV1Adapter(
+        _BigtableScanDataset(self, "", start, end, normalized, probability))
 
   def parallel_scan_prefix(self,
                            prefix,
@@ -380,7 +381,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, prefix, "", "")
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, prefix, "", ""))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -442,7 +444,8 @@ class BigtableTable(object):
     """
     probability = _normalize_probability(probability)
     normalized = _normalize_columns(columns, kwargs)
-    ds = _BigtableSampleKeyPairsDataset(self, "", start, end)
+    ds = dataset_ops.DatasetV1Adapter(
+        _BigtableSampleKeyPairsDataset(self, "", start, end))
     return self._make_parallel_scan_dataset(ds, num_parallel_scans, probability,
                                             normalized)
 
@@ -589,16 +592,8 @@ class _BigtableKeyDataset(dataset_ops.DatasetSource):
     self._table = table
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class _BigtablePrefixKeyDataset(_BigtableKeyDataset):
@@ -658,16 +653,9 @@ class _BigtableLookupDataset(dataset_ops.DatasetSource):
     self._columns = [i[1] for i in normalized]
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -693,16 +681,9 @@ class _BigtableScanDataset(dataset_ops.DatasetSource):
     self._num_outputs = len(normalized) + 1  # 1 for row key
 
   @property
-  def output_classes(self):
-    return tuple([ops.Tensor] * self._num_outputs)
-
-  @property
-  def output_shapes(self):
-    return tuple([tensor_shape.TensorShape([])] * self._num_outputs)
-
-  @property
-  def output_types(self):
-    return tuple([dtypes.string] * self._num_outputs)
+  def _element_structure(self):
+    return structure.NestedStructure(tuple(
+        [structure.TensorStructure(dtypes.string, [])] * self._num_outputs))
 
   def _as_variant_tensor(self):
     return gen_bigtable_ops.bigtable_scan_dataset(
@@ -726,16 +707,10 @@ class _BigtableSampleKeyPairsDataset(dataset_ops.DatasetSource):
     self._end = end
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor)
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
-
-  @property
-  def output_types(self):
-    return (dtypes.string, dtypes.string)
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
index 14b6fc4ac26f74f54628ae37ad6437c7d3e8caba..d3b23d949ee2c7674c3918d39e8b71d76eefcfec 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/BUILD
@@ -132,6 +132,7 @@ py_library(
     srcs = ["estimator.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":custom_loss_head",
         ":estimator_utils",
         ":model",
         "//tensorflow/contrib/boosted_trees:losses",
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
index a3df272e6924792128fc38fd153b9527b58b486e..b314b4d74df882a421d9a2ecce2629a63d5c5248 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/custom_export_strategy.py
@@ -41,7 +41,8 @@ def make_custom_export_strategy(name,
                                 convert_fn,
                                 feature_columns,
                                 export_input_fn,
-                                use_core_columns=False):
+                                use_core_columns=False,
+                                feature_engineering_fn=None):
   """Makes custom exporter of GTFlow tree format.
 
   Args:
@@ -52,6 +53,7 @@ def make_custom_export_strategy(name,
     export_input_fn: A function that takes no arguments and returns an
       `InputFnOps`.
     use_core_columns: A boolean, whether core feature columns were used.
+    feature_engineering_fn: Feature eng function to be called on the input.
 
   Returns:
     An `ExportStrategy`.
@@ -59,9 +61,12 @@ def make_custom_export_strategy(name,
   base_strategy = saved_model_export_utils.make_export_strategy(
       serving_input_fn=export_input_fn, strip_default_attrs=True)
   input_fn = export_input_fn()
+  features = input_fn.features
+  if feature_engineering_fn is not None:
+    features, _ = feature_engineering_fn(features, labels=None)
   (sorted_feature_names, dense_floats, sparse_float_indices, _, _,
    sparse_int_indices, _, _) = gbdt_batch.extract_features(
-       input_fn.features, feature_columns, use_core_columns)
+       features, feature_columns, use_core_columns)
 
   def export_fn(estimator, export_dir, checkpoint_path=None, eval_result=None):
     """A wrapper to export to SavedModel, and convert it to other formats."""
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
index ca73e4af2fbd0a383d02fa7111f59161701661df..358404cd946bbc56d2f7228be8fe4223749c850b 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/dnn_tree_combined_estimator.py
@@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import estimator
 from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
-from tensorflow.python.feature_column import feature_column as feature_column_lib
+from tensorflow.python.feature_column import feature_column_lib
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
index 38d19976ef38a295a172e935f70bdae3c67f01e2..a178820841c4c8bcb7f5742babdb6d0f4825de31 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.contrib.boosted_trees.estimator_batch import model
 from tensorflow.contrib.boosted_trees.python.utils import losses
 from tensorflow.contrib.learn.python.learn.estimators import estimator
@@ -26,7 +28,8 @@ from tensorflow.python.estimator.canned import head as core_head_lib
 from tensorflow.python.estimator import estimator as core_estimator
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.losses import losses as core_losses
-
+from tensorflow.contrib.boosted_trees.estimator_batch import custom_loss_head
+from tensorflow.python.ops import array_ops
 
 # ================== Old estimator interface===================================
 # The estimators below were designed for old feature columns and old estimator
@@ -414,6 +417,108 @@ class GradientBoostedDecisionTreeRanker(estimator.Estimator):
         config=config,
         feature_engineering_fn=feature_engineering_fn)
 
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class GradientBoostedDecisionTreeQuantileRegressor(estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               use_core_libs=False,
+               output_leaf_index=False,
+               override_global_step_value=None,
+               num_quantiles=100):
+    """Initializes a GradientBoostedDecisionTreeQuantileRegressor instance.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      use_core_libs: Whether feature columns and loss are from the core (as
+        opposed to contrib) version of tensorflow.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      override_global_step_value: If after the training is done, global step
+        value must be reset to this value. This should be used to reset global
+        step to a number > number of steps used to train the current ensemble.
+        For example, the usual way is to train a number of trees and set a very
+        large number of training steps. When the training is done (number of
+        trees were trained), this parameter can be used to set the global step
+        to a large value, making it look like that number of training steps ran.
+        If None, no override of global step will happen.
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _quantile_regression_head(quantile):
+      # Use quantile regression.
+      head = custom_loss_head.CustomLossHead(
+          loss_fn=functools.partial(
+              losses.per_example_quantile_regression_loss, quantile=quantile),
+          link_fn=array_ops.identity,
+          logit_dimension=label_dimension)
+      return head
+
+    learner_config.num_classes = max(2, label_dimension)
+
+    super(GradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=model.model_builder,
+        params={
+            'head': _quantile_regression_head(quantiles[0]),
+            'feature_columns': feature_columns,
+            'learner_config': learner_config,
+            'num_trees': num_trees,
+            'weight_column_name': weight_column_name,
+            'examples_per_layer': examples_per_layer,
+            'logits_modifier_function': logits_modifier_function,
+            'center_bias': center_bias,
+            'use_core_libs': use_core_libs,
+            'output_leaf_index': False,
+            'override_global_step_value': override_global_step_value,
+            'num_quantiles': num_quantiles,
+        },
+        model_dir=model_dir,
+        config=config,
+        feature_engineering_fn=feature_engineering_fn)
+
 # ================== New Estimator interface===================================
 # The estimators below use new core Estimator interface and must be used with
 # new feature columns and heads.
@@ -437,12 +542,42 @@ def core_multiclass_head(
 
   # pylint:disable=protected-access
   head_fn = core_head_lib._multi_class_head_with_softmax_cross_entropy_loss(
-      n_classes=n_classes, loss_fn=loss_fn, loss_reduction=loss_reduction)
+      n_classes=n_classes,
+      loss_fn=loss_fn,
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
   # pylint:enable=protected-access
 
   return head_fn
 
 
+# For quantile regression, use this head with Core..Estimator, or use
+# Core..QuantileRegressor directly,
+def core_quantile_regression_head(
+    quantiles,
+    label_dimension=1,
+    weight_column=None,
+    loss_reduction=core_losses.Reduction.SUM_OVER_NONZERO_WEIGHTS):
+  """Core head for quantile regression problems."""
+
+  def loss_fn(labels, logits):
+    result = losses.per_example_quantile_regression_loss(
+        labels=labels,
+        predictions=logits,
+        weights=weight_column,
+        quantile=quantiles)
+    return result[0]
+
+  # pylint:disable=protected-access
+  head_fn = core_head_lib._regression_head(
+      label_dimension=label_dimension,
+      loss_fn=loss_fn,
+      loss_reduction=loss_reduction,
+      weight_column=weight_column)
+  # pylint:enable=protected-access
+  return head_fn
+
+
 class CoreGradientBoostedDecisionTreeEstimator(core_estimator.Estimator):
   """An estimator using gradient boosted decision trees.
 
@@ -606,3 +741,104 @@ class CoreGradientBoostedDecisionTreeRanker(core_estimator.Estimator):
 
     super(CoreGradientBoostedDecisionTreeRanker, self).__init__(
         model_fn=_model_fn, model_dir=model_dir, config=config)
+
+
+# When using this estimator, make sure to regularize the hessian (at least l2,
+# min_node_weight)!
+# TODO(nponomareva): extend to take multiple quantiles in one go.
+class CoreGradientBoostedDecisionTreeQuantileRegressor(
+    core_estimator.Estimator):
+  """An estimator that does quantile regression and returns quantile estimates.
+  """
+
+  def __init__(self,
+               learner_config,
+               examples_per_layer,
+               quantiles,
+               label_dimension=1,
+               num_trees=None,
+               feature_columns=None,
+               weight_column_name=None,
+               model_dir=None,
+               config=None,
+               label_keys=None,
+               feature_engineering_fn=None,
+               logits_modifier_function=None,
+               center_bias=True,
+               output_leaf_index=False,
+               num_quantiles=100):
+    """Initializes a core version of GradientBoostedDecisionTreeEstimator.
+
+    Args:
+      learner_config: A config for the learner.
+      examples_per_layer: Number of examples to accumulate before growing a
+        layer. It can also be a function that computes the number of examples
+        based on the depth of the layer that's being built.
+      quantiles: a list of quantiles for the loss, each between 0 and 1.
+      label_dimension: Dimension of regression label. This is the size
+        of the last dimension of the labels `Tensor` (typically, this has shape
+        `[batch_size, label_dimension]`). When label_dimension>1, it is
+        recommended to use multiclass strategy diagonal hessian or full hessian.
+      num_trees: An int, number of trees to build.
+      feature_columns: A list of feature columns.
+      weight_column_name: Name of the column for weights, or None if not
+        weighted.
+      model_dir: Directory for model exports, etc.
+      config: `RunConfig` object to configure the runtime settings.
+      label_keys: Optional list of strings with size `[n_classes]` defining the
+        label vocabulary. Only supported for `n_classes` > 2.
+      feature_engineering_fn: Feature engineering function. Takes features and
+        labels which are the output of `input_fn` and returns features and
+        labels which will be fed into the model.
+      logits_modifier_function: A modifier function for the logits.
+      center_bias: Whether a separate tree should be created for first fitting
+        the bias.
+      output_leaf_index: whether to output leaf indices along with predictions
+        during inference. The leaf node indexes are available in predictions
+        dict by the key 'leaf_index'. For example,
+        result_dict = classifier.predict(...)
+        for example_prediction_result in result_dict:
+          # access leaf index list by example_prediction_result["leaf_index"]
+          # which contains one leaf index per tree
+      num_quantiles: Number of quantiles to build for numeric feature values.
+    """
+    if len(quantiles) > 1:
+      raise ValueError('For now, just one quantile per estimator is supported')
+
+    def _model_fn(features, labels, mode, config):
+      return model.model_builder(
+          features=features,
+          labels=labels,
+          mode=mode,
+          config=config,
+          params={
+              'head':
+                  core_quantile_regression_head(
+                      quantiles[0], label_dimension=label_dimension),
+              'feature_columns':
+                  feature_columns,
+              'learner_config':
+                  learner_config,
+              'num_trees':
+                  num_trees,
+              'weight_column_name':
+                  weight_column_name,
+              'examples_per_layer':
+                  examples_per_layer,
+              'center_bias':
+                  center_bias,
+              'logits_modifier_function':
+                  logits_modifier_function,
+              'use_core_libs':
+                  True,
+              'output_leaf_index':
+                  output_leaf_index,
+              'override_global_step_value':
+                  None,
+              'num_quantiles':
+                  num_quantiles,
+          },
+          output_type=model.ModelBuilderOutputType.ESTIMATOR_SPEC)
+
+    super(CoreGradientBoostedDecisionTreeQuantileRegressor, self).__init__(
+        model_fn=_model_fn, model_dir=model_dir, config=config)
diff --git a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
index c155128c0e4ccf928349ee6453baff4384222096..ee052ac60387d8f993e4942dd7dff39e191dd3a4 100644
--- a/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
+++ b/tensorflow/contrib/boosted_trees/estimator_batch/estimator_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.boosted_trees.proto import learner_pb2
 from tensorflow.contrib.layers.python.layers import feature_column as contrib_feature_column
 from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.python.estimator.canned import head as head_lib
+from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column_lib as core_feature_column
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -47,8 +48,8 @@ def _multiclass_train_input_fn():
   features = {
       "x": constant_op.constant([[2.], [1.], [1.], [5.], [3.5], [4.6], [3.5]])
   }
-  label = constant_op.constant(
-      [[1], [0], [0], [2], [2], [0], [1]], dtype=dtypes.int32)
+  label = constant_op.constant([[1], [0], [0], [2], [2], [0], [1]],
+                               dtype=dtypes.int32)
   return features, label
 
 
@@ -77,6 +78,59 @@ def _infer_ranking_train_input_fn():
   return features, None
 
 
+_QUANTILE_REGRESSION_SIZE = 1000
+
+
+def _quantile_regression_input_fns(two_dimension=False):
+  # The data generation is taken from
+  # http://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_quantile.html
+  np.random.seed(1)
+
+  def f(x):
+    """The function to predict."""
+    return x * np.sin(x)
+
+  def g(x):
+    """The function to predict."""
+    return x * np.cos(x)
+
+  #  Training data.
+  x = np.atleast_2d(np.random.uniform(0, 10.0,
+                                      size=_QUANTILE_REGRESSION_SIZE)).T
+  x = x.astype(np.float32)
+
+  # Labels.
+  if not two_dimension:
+    y = f(x).ravel()
+  else:
+    y = np.column_stack((f(x).ravel(), g(x).ravel()))
+
+  # Add random noise.
+  dy = 1.5 + 1.0 * np.random.random(y.shape)
+  noise = np.random.normal(0, dy)
+  y += noise
+  y_original = y.astype(np.float32)
+  if not two_dimension:
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+  train_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=None,
+      shuffle=True)
+
+  # Test on the training data to make sure the predictions are calibrated.
+  test_input_fn = numpy_io.numpy_input_fn(
+      x=x,
+      y=y,
+      batch_size=_QUANTILE_REGRESSION_SIZE,
+      num_epochs=1,
+      shuffle=False)
+
+  return train_input_fn, test_input_fn, y_original
+
+
 class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -341,6 +395,130 @@ class BoostedTreeEstimatorTest(test_util.TensorFlowTestCase):
     for prediction_dict in result_iter:
       self.assertTrue("classes" in prediction_dict)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+
+    # 95% percentile.
+    model_upper = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["scores"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.GradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        label_dimension=2,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.fit(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["scores"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
@@ -489,8 +667,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
 
     feature_columns = [
         core_feature_column.weighted_categorical_column(
-            categorical_column=core_feature_column.
-            categorical_column_with_vocabulary_list(
+            categorical_column=core_feature_column
+            .categorical_column_with_vocabulary_list(
                 key="word", vocabulary_list=["the", "cat", "dog"]),
             weight_feature_key="weight")
     ]
@@ -509,8 +687,8 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
         # Weights for the words are 5 - cat, 6- dog and 1 -the.
         features_dict["word"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
-            values=constant_op.constant(
-                ["the", "cat", "dog", "the"], dtype=dtypes.string),
+            values=constant_op.constant(["the", "cat", "dog", "the"],
+                                        dtype=dtypes.string),
             dense_shape=[4, 3])
         features_dict["weight"] = sparse_tensor.SparseTensor(
             indices=[[0, 0], [0, 1], [1, 0], [3, 0]],
@@ -534,6 +712,132 @@ class CoreGradientBoostedDecisionTreeEstimators(test_util.TensorFlowTestCase):
     est.evaluate(input_fn=input_fn, steps=1)
     est.predict(input_fn=input_fn)
 
+  # One dimensional quantile regression.
+  def testQuantileRegression(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns()
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 1)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    frac_below_upper = round(1. * np.count_nonzero(upper > y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper >= 0.92)
+    self.assertTrue(frac_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns()
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    frac_above_lower = round(1. * np.count_nonzero(lower < y) / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower >= 0.92)
+    self.assertTrue(frac_above_lower <= 0.98)
+
+  # Multi-dimensional quantile regression.
+  def testQuantileRegressionMultiDimLabel(self):
+    learner_config = learner_pb2.LearnerConfig()
+    learner_config.num_classes = 2
+    learner_config.constraints.max_tree_depth = 3
+    learner_config.growing_mode = learner_pb2.LearnerConfig.WHOLE_TREE
+    learner_config.constraints.min_node_weight = 1 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l2 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.l1 = 1.0 / _QUANTILE_REGRESSION_SIZE
+    learner_config.regularization.tree_complexity = (
+        1.0 / _QUANTILE_REGRESSION_SIZE)
+
+    train_input_fn, test_input_fn, y = _quantile_regression_input_fns(
+        two_dimension=True)
+    y = y.reshape(_QUANTILE_REGRESSION_SIZE, 2)
+
+    # 95% percentile.
+    model_upper = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.95],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_upper.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_upper.predict(input_fn=test_input_fn)
+    upper = []
+    for prediction_dict in result_iter:
+      upper.append(prediction_dict["predictions"])
+
+    count_below_upper = np.count_nonzero(upper > y, axis=0)
+    count_both_below_upper = np.count_nonzero(np.prod(upper > y, axis=1))
+    frac_below_upper_0 = round(1. * count_below_upper[0] / len(y), 3)
+    frac_below_upper_1 = round(1. * count_below_upper[1] / len(y), 3)
+    frac_both_below_upper = round(1. * count_both_below_upper / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_below_upper_0 >= 0.92)
+    self.assertTrue(frac_below_upper_0 <= 0.98)
+    self.assertTrue(frac_below_upper_1 >= 0.92)
+    self.assertTrue(frac_below_upper_1 <= 0.98)
+    self.assertTrue(frac_both_below_upper >= 0.92)
+    self.assertTrue(frac_both_below_upper <= 0.98)
+
+    train_input_fn, test_input_fn, _ = _quantile_regression_input_fns(
+        two_dimension=True)
+    model_lower = estimator.CoreGradientBoostedDecisionTreeQuantileRegressor(
+        quantiles=[0.05],
+        learner_config=learner_config,
+        num_trees=100,
+        label_dimension=2,
+        examples_per_layer=_QUANTILE_REGRESSION_SIZE,
+        center_bias=False)
+
+    model_lower.train(input_fn=train_input_fn, steps=1000)
+    result_iter = model_lower.predict(input_fn=test_input_fn)
+    lower = []
+    for prediction_dict in result_iter:
+      lower.append(prediction_dict["predictions"])
+
+    count_above_lower = np.count_nonzero(lower < y, axis=0)
+    count_both_aboce_lower = np.count_nonzero(np.prod(lower < y, axis=1))
+    frac_above_lower_0 = round(1. * count_above_lower[0] / len(y), 3)
+    frac_above_lower_1 = round(1. * count_above_lower[1] / len(y), 3)
+    frac_both_above_lower = round(1. * count_both_aboce_lower / len(y), 3)
+    # +/- 3%
+    self.assertTrue(frac_above_lower_0 >= 0.92)
+    self.assertTrue(frac_above_lower_0 <= 0.98)
+    self.assertTrue(frac_above_lower_1 >= 0.92)
+    self.assertTrue(frac_above_lower_1 <= 0.98)
+    self.assertTrue(frac_both_above_lower >= 0.92)
+    self.assertTrue(frac_both_above_lower <= 0.98)
+
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/contrib/boosted_trees/examples/boston.py b/tensorflow/contrib/boosted_trees/examples/boston.py
index 54c4ff059e3408d2cb8fc689a9ae877f57485f58..09b240a7006a8ef53eb95108b3adbfae728cf8fc 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston.py
@@ -90,13 +90,13 @@ def _make_experiment_fn(output_dir):
   (x_train, y_train), (x_test,
                        y_test) = tf.keras.datasets.boston_housing.load_data()
 
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_train},
       y=y_train,
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
diff --git a/tensorflow/contrib/boosted_trees/examples/boston_combined.py b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
index e04b56afbfd266dc13a5b0d78d171ea273415ee3..d640af354f55423b7c9706900359f5e64c459f39 100644
--- a/tensorflow/contrib/boosted_trees/examples/boston_combined.py
+++ b/tensorflow/contrib/boosted_trees/examples/boston_combined.py
@@ -80,13 +80,13 @@ def _make_experiment_fn(output_dir):
   (x_train, y_train), (x_test,
                        y_test) = tf.keras.datasets.boston_housing.load_data()
 
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_train},
       y=y_train,
       batch_size=FLAGS.batch_size,
       num_epochs=None,
       shuffle=True)
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": x_test}, y=y_test, num_epochs=1, shuffle=False)
 
   feature_columns = [
diff --git a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
index 8edb5d6c640611bbb90d7731b2fea4354e125563..6d78e27e8f69ea289b686af8402bd91967f997f4 100644
--- a/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
+++ b/tensorflow/contrib/boosted_trees/kernels/split_handler_ops.cc
@@ -834,8 +834,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
       root_gradient_stats *= normalizer_ratio;
       NodeStats root_stats = state->ComputeNodeStats(root_gradient_stats);
       int32 best_feature_idx = 0;
+      bool best_feature_updated = false;
       NodeStats best_right_node_stats(0);
       NodeStats best_left_node_stats(0);
+      CHECK(end_index - start_index >= 2)
+          << "Partition should have a non bias feature. Start index "
+          << start_index << " and end index " << end_index;
+
       for (int64 feature_idx = start_index + 1; feature_idx < end_index;
            ++feature_idx) {
         GradientStats left_gradient_stats(*gradients_t, *hessians_t,
@@ -845,11 +850,13 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
             root_gradient_stats - left_gradient_stats;
         NodeStats left_stats = state->ComputeNodeStats(left_gradient_stats);
         NodeStats right_stats = state->ComputeNodeStats(right_gradient_stats);
-        if (left_stats.gain + right_stats.gain > best_gain) {
+        if (!best_feature_updated ||
+            left_stats.gain + right_stats.gain > best_gain) {
           best_gain = left_stats.gain + right_stats.gain;
           best_left_node_stats = left_stats;
           best_right_node_stats = right_stats;
           best_feature_idx = feature_idx;
+          best_feature_updated = true;
         }
       }
       SplitInfo split_info;
@@ -864,7 +871,7 @@ class BuildCategoricalEqualitySplitsOp : public OpKernel {
           << feature_ids(best_feature_idx, 0) << ", "
           << feature_ids(best_feature_idx, 1)
           << "\nPartition IDS: " << partition_ids(start_index) << "  "
-          << partition_ids(best_feature_idx);
+          << partition_ids(best_feature_idx) << " and best gain " << best_gain;
       equality_split->set_feature_id(feature_ids(best_feature_idx, 0));
       auto* left_child = split_info.mutable_left_child();
       auto* right_child = split_info.mutable_right_child();
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
index 4da25298cb82093ac501997cc21c48265df06860..d26af58419752170bbc58bba757ac43349fc2cff 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler.py
@@ -119,7 +119,7 @@ class EqualitySplitHandler(base_split_handler.BaseSplitHandler):
 
     def not_active_inputs():
       return (constant_op.constant([], dtype=dtypes.int32),
-              constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+              constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]),
               empty_gradients, empty_hessians)
 
     def active_inputs():
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
index a2f708081a4b484d649b5d09b172c2c60db69aeb..386dc19fc7b9529993a9625fb1298f6eb9a70d87 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/categorical_split_handler_test.py
@@ -36,9 +36,9 @@ def get_empty_tensors(gradient_shape, hessian_shape):
   empty_hess_shape = [1] + hessian_shape.as_list()
   empty_grad_shape = [1] + gradient_shape.as_list()
 
-  empty_gradients = constant_op.constant(
+  empty_gradients = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_grad_shape)
-  empty_hessians = constant_op.constant(
+  empty_hessians = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_hess_shape)
 
   return empty_gradients, empty_hessians
@@ -486,8 +486,8 @@ class EqualitySplitHandlerTest(test_util.TensorFlowTestCase):
       gradients = array_ops.constant([0.2, -0.5, 1.2, 4.0])
       hessians = array_ops.constant([0.12, 0.07, 0.2, 0.13])
       partition_ids = [0, 0, 0, 1]
-      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
-      values = array_ops.constant([], dtype=dtypes.int64)
+      indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
+      values = constant_op.constant_v1([], dtype=dtypes.int64)
 
       gradient_shape = tensor_shape.scalar()
       hessian_shape = tensor_shape.scalar()
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
index 1fffbb5f660c681e1dde11a2aaf1d0f1cf79d1d0..0476bed2cd3f3ea5b47b10c51a819f17d6e37c74 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler.py
@@ -605,7 +605,7 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                             quantile_buckets, example_partition_ids, gradients,
                             hessians, weights, empty_gradients, empty_hessians):
   """Updates the state for dense split handler."""
-  empty_float = constant_op.constant([], dtype=dtypes.float32)
+  empty_float = constant_op.constant_v1([], dtype=dtypes.float32)
 
   quantile_values, quantile_weights = control_flow_ops.cond(
       is_active[1],  # For the next layer, this handler is inactive.
@@ -621,8 +621,8 @@ def dense_make_stats_update(is_active, are_buckets_ready, float_column,
     return (example_partition_ids, quantized_feature, gradients, hessians)
 
   def not_ready_inputs_fn():
-    return (constant_op.constant([], dtype=dtypes.int32),
-            constant_op.constant([[]], dtype=dtypes.int64, shape=[1, 2]),
+    return (constant_op.constant_v1([], dtype=dtypes.int32),
+            constant_op.constant_v1([[]], dtype=dtypes.int64, shape=[1, 2]),
             empty_gradients, empty_hessians)
 
   example_partition_ids, feature_ids, gradients, hessians = (
@@ -708,11 +708,11 @@ def sparse_make_stats_update(
 
   def quantiles_not_ready():
     """The subgraph for when the quantiles are not ready."""
-    return (constant_op.constant([], dtype=dtypes.int32),
-            constant_op.constant([], dtype=dtypes.int64, shape=[1, 2]),
+    return (constant_op.constant_v1([], dtype=dtypes.int32),
+            constant_op.constant_v1([], dtype=dtypes.int64, shape=[1, 2]),
             empty_gradients, empty_hessians)
 
-  empty_float = constant_op.constant([], dtype=dtypes.float32)
+  empty_float = constant_op.constant_v1([], dtype=dtypes.float32)
   handler_not_active = (constant_op.constant(
       [], dtype=dtypes.int64, shape=[0, 2]), empty_float,
                         constant_op.constant([0, 1], dtype=dtypes.int64),
diff --git a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
index 74b0ea6989c65e83e7a466107d624712a0e72d1b..4a1b528646e7d2139d7eabb0264b8d280f8da133 100644
--- a/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
+++ b/tensorflow/contrib/boosted_trees/lib/learner/batch/ordinal_split_handler_test.py
@@ -39,9 +39,9 @@ def get_empty_tensors(gradient_shape, hessian_shape):
   empty_hess_shape = [1] + hessian_shape.as_list()
   empty_grad_shape = [1] + gradient_shape.as_list()
 
-  empty_gradients = constant_op.constant(
+  empty_gradients = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_grad_shape)
-  empty_hessians = constant_op.constant(
+  empty_hessians = constant_op.constant_v1(
       [], dtype=dtypes.float32, shape=empty_hess_shape)
 
   return empty_gradients, empty_hessians
@@ -1476,9 +1476,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
 
   def testEmpty(self):
     with self.cached_session() as sess:
-      indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
+      indices = constant_op.constant_v1([], dtype=dtypes.int64, shape=[0, 2])
       # No values in this feature column in this mini-batch.
-      values = array_ops.constant([], dtype=dtypes.float32)
+      values = constant_op.constant_v1([], dtype=dtypes.float32)
       sparse_column = sparse_tensor.SparseTensor(indices, values, [4, 1])
 
       gradient_shape = tensor_shape.scalar()
@@ -1549,8 +1549,9 @@ class SparseSplitHandlerTest(test_util.TensorFlowTestCase):
       sparse_column = array_ops.sparse_placeholder(dtypes.float32)
 
       # We have two batches - at first, a sparse feature is empty.
-      empty_indices = array_ops.constant([], dtype=dtypes.int64, shape=[0, 2])
-      empty_values = array_ops.constant([], dtype=dtypes.float32)
+      empty_indices = constant_op.constant_v1([], dtype=dtypes.int64,
+                                              shape=[0, 2])
+      empty_values = constant_op.constant_v1([], dtype=dtypes.float32)
       empty_sparse_column = sparse_tensor.SparseTensor(empty_indices,
                                                        empty_values, [4, 2])
       empty_sparse_column = empty_sparse_column.eval(session=sess)
diff --git a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
index ab5713fbe26ab76eac923035e9feecc2ec51f492..9fdc2fc0c2c7b85502f7a3f9ae7c85cf05d5916c 100644
--- a/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
+++ b/tensorflow/contrib/boosted_trees/python/training/functions/gbdt_batch.py
@@ -897,9 +897,9 @@ class GradientBoostedDecisionTreeModel(object):
     empty_hess_shape = [1] + self._hessian_shape.as_list()
     empty_grad_shape = [1] + self._gradient_shape.as_list()
 
-    empty_gradients = constant_op.constant(
+    empty_gradients = constant_op.constant_v1(
         [], dtype=dtypes.float32, shape=empty_grad_shape)
-    empty_hessians = constant_op.constant(
+    empty_hessians = constant_op.constant_v1(
         [], dtype=dtypes.float32, shape=empty_hess_shape)
 
     active_handlers = array_ops.unstack(active_handlers, axis=0)
@@ -1257,13 +1257,12 @@ class GradientBoostedDecisionTreeModel(object):
   def _get_replica_device_setter(self, worker_device):
     """Creates a replica device setter."""
     ps_tasks = self._num_ps_replicas
-    ps_ops = [
-        "Variable",
-        "VariableV2",
+    ps_ops = list(device_setter.STANDARD_PS_OPS)
+    ps_ops.extend([
         "DecisionTreeEnsembleResourceHandleOp",
         "StatsAccumulatorScalarResourceHandleOp",
         "StatsAccumulatorTensorResourceHandleOp",
-    ]
+    ])
     ps_strategy = _OpRoundRobinStrategy(ps_ops, ps_tasks)
     return device_setter.replica_device_setter(
         worker_device=worker_device,
diff --git a/tensorflow/contrib/boosted_trees/python/utils/losses.py b/tensorflow/contrib/boosted_trees/python/utils/losses.py
index b5ebaf1999519f65110e8164fa20bace5ecc3ef6..220e981618b7c0bfb1e4e98c087d83b451b9b3cf 100644
--- a/tensorflow/contrib/boosted_trees/python/utils/losses.py
+++ b/tensorflow/contrib/boosted_trees/python/utils/losses.py
@@ -48,6 +48,47 @@ def per_example_logistic_loss(labels, weights, predictions):
       labels=labels, logits=predictions)
   return unweighted_loss * weights, control_flow_ops.no_op()
 
+# MUST USE WITH HESSIAN REGULARIZATION,
+# This loss can have zero hessian, so it must be used with l2 or min_node_weight
+# regularization.
+# An example config is
+# learner_config.constraints.min_node_weight = 1 / num_examples_per_layer
+# learner_config.regularization.l2 = 1.0 / num_examples_per_layer
+# TODO(nponomareva): make it multidimensional so we can estimate several
+# quantiles at once.
+def per_example_quantile_regression_loss(labels, weights, predictions,
+                                         quantile):
+  """Smoothed loss for quantile regression.
+
+  The standard quantile regression loss is quantile*(y-y') when y>y' and
+  (quantile-1)*(y-y') otherwise, y' is a prediction, y is a label. The impl
+  below is this loss but squared in the region where the loss value < 1.
+
+  Args:
+    labels: Rank 2 (N, D) tensor of per-example labels.
+    weights: Rank 2 (N, 1) tensor of per-example weights.
+    predictions: Rank 2 (N, D) tensor of per-example predictions.
+    quantile: The quantile to use.
+
+  Returns:
+    loss: A Rank 2 (N, 1) tensor of per-example quantile loss.
+    update_op: An update operation to update the loss's internal state.
+  """
+  labels = math_ops.to_float(labels)
+  error = labels - predictions
+  square_loss_right = array_ops.where(error * quantile < 1.0,
+                                      math_ops.square(quantile * error),
+                                      quantile * error)
+  square_loss_left = array_ops.where(error * (quantile - 1) < 1,
+                                     math_ops.square((quantile - 1) * error),
+                                     (quantile - 1) * error)
+
+  unweighted_loss = array_ops.where(error > 0, square_loss_right,
+                                    square_loss_left)
+  if weights is None:
+    return unweighted_loss, control_flow_ops.no_op()
+  else:
+    return unweighted_loss * weights, control_flow_ops.no_op()
 
 # This is classical form of Maximum entropy loss, that is twice differentiable
 # (sparse_softmax_cross_entropy which is what we go for is not twice
@@ -78,8 +119,7 @@ def per_example_maxent_loss(labels, weights, logits, num_classes, eps=1e-15):
     labels = array_ops.expand_dims(labels, 1)
   # Labels are indices of classes, convert them to one hot encodings.
   target_one_hot = array_ops.one_hot(indices=labels, depth=num_classes)
-  labels = math_ops.reduce_sum(
-      input_tensor=target_one_hot, reduction_indices=[1])
+  labels = math_ops.reduce_sum(input_tensor=target_one_hot, axis=[1])
   labels = math_ops.to_float(labels)
 
   # Calculate softmax probabilities for each class.
diff --git a/tensorflow/contrib/checkpoint/python/containers.py b/tensorflow/contrib/checkpoint/python/containers.py
index 242c1e8ba45e0b2f6f9a1a51695b824546382666..5418e2605b724edb60878e250d2c50fcc6ff5633 100644
--- a/tensorflow/contrib/checkpoint/python/containers.py
+++ b/tensorflow/contrib/checkpoint/python/containers.py
@@ -46,6 +46,10 @@ class UniqueNameTracker(data_structures.CheckpointableDataStructure):
     self._maybe_initialize_checkpointable()
     self._name_counts = {}
 
+  @property
+  def _values(self):
+    return [dep.ref for dep in self._checkpoint_dependencies]
+
   def track(self, checkpointable, base_name):
     """Add a dependency on `checkpointable`.
 
diff --git a/tensorflow/contrib/cluster_resolver/BUILD b/tensorflow/contrib/cluster_resolver/BUILD
index 9e1867ea9d0c72596f5cc848b25331d79fa84c24..f944b7f88438ff257a44581170ead16640540e69 100644
--- a/tensorflow/contrib/cluster_resolver/BUILD
+++ b/tensorflow/contrib/cluster_resolver/BUILD
@@ -21,173 +21,25 @@ py_library(
 
 py_library(
     name = "cluster_resolver_py",
-    srcs = [
+    srcs = glob([
         "__init__.py",
-        "python/training/__init__.py",
-    ],
+        "python/training/*.py",
+    ]),
     srcs_version = "PY2AND3",
     visibility = ["//visibility:public"],
-    deps = [
-        ":base_cluster_resolver_py",
-        ":gce_cluster_resolver_py",
-        ":kubernetes_cluster_resolver_py",
-        ":slurm_cluster_resolver_py",
-        ":tfconfig_cluster_resolver_py",
-        ":tpu_cluster_resolver_py",
-        "//tensorflow/python:util",
-    ],
-)
-
-py_library(
-    name = "base_cluster_resolver_py",
-    srcs = ["python/training/cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "gce_cluster_resolver_py",
-    srcs = ["python/training/gce_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "tfconfig_cluster_resolver_py",
-    srcs = ["python/training/tfconfig_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "tpu_cluster_resolver_py",
-    srcs = ["python/training/tpu_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "slurm_cluster_resolver_py",
-    srcs = ["python/training/slurm_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-py_library(
-    name = "kubernetes_cluster_resolver_py",
-    srcs = ["python/training/kubernetes_cluster_resolver.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":base_cluster_resolver_py",
-        "//tensorflow/python:training",
-    ],
-)
-
-tf_py_test(
-    name = "base_cluster_resolver_py_test",
-    srcs = ["python/training/cluster_resolver_test.py"],
-    additional_deps = [
-        ":cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    main = "python/training/cluster_resolver_test.py",
-)
-
-tf_py_test(
-    name = "gce_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/gce_cluster_resolver_test.py"],
-    additional_deps = [
-        ":cluster_resolver_py",
-        ":gce_cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    main = "python/training/gce_cluster_resolver_test.py",
-)
-
-tf_py_test(
-    name = "tfconfig_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/tfconfig_cluster_resolver_test.py"],
-    additional_deps = [
-        ":tfconfig_cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    grpc_enabled = True,
-    main = "python/training/tfconfig_cluster_resolver_test.py",
-)
-
-tf_py_test(
-    name = "tpu_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/tpu_cluster_resolver_test.py"],
-    additional_deps = [
-        ":tpu_cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    grpc_enabled = True,
-    main = "python/training/tpu_cluster_resolver_test.py",
-)
-
-tf_py_test(
-    name = "slurm_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/slurm_cluster_resolver_test.py"],
-    additional_deps = [
-        ":cluster_resolver_py",
-        ":slurm_cluster_resolver_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:training",
-    ],
-    main = "python/training/slurm_cluster_resolver_test.py",
-    tags = [],
+    deps = ["//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib"],
 )
 
 tf_py_test(
-    name = "kubernetes_cluster_resolver_py_test",
-    size = "small",
-    srcs = ["python/training/kubernetes_cluster_resolver_test.py"],
+    name = "cluster_resolver_initialization_test",
+    srcs = ["cluster_resolver_initialization_test.py"],
     additional_deps = [
         ":cluster_resolver_py",
-        ":kubernetes_cluster_resolver_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:training",
     ],
-    main = "python/training/kubernetes_cluster_resolver_test.py",
+    main = "cluster_resolver_initialization_test.py",
 )
diff --git a/tensorflow/contrib/cluster_resolver/__init__.py b/tensorflow/contrib/cluster_resolver/__init__.py
index fd1263fe81ae826d5edfa8752460fb78fe52b32a..390b3e7550b3d991269bb84707c3500f2fa33290 100644
--- a/tensorflow/contrib/cluster_resolver/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/__init__.py
@@ -20,12 +20,14 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=wildcard-import,unused-import
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.slurm_cluster_resolver import SlurmClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
 # pylint: enable=wildcard-import,unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
@@ -35,6 +37,8 @@ _allowed_symbols = [
     'SimpleClusterResolver',
     'UnionClusterResolver',
     'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
     'TPUClusterResolver',
     'SlurmClusterResolver',
 ]
diff --git a/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..01ff1478c694cf0901aeed48b6e0f873d8abe65e
--- /dev/null
+++ b/tensorflow/contrib/cluster_resolver/cluster_resolver_initialization_test.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests to ensure ClusterResolvers are usable via the old contrib path."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.cluster_resolver import SimpleClusterResolver
+from tensorflow.contrib.cluster_resolver.python.training import cluster_resolver
+from tensorflow.contrib.cluster_resolver.python.training import UnionClusterResolver
+from tensorflow.python.platform import test
+from tensorflow.python.training import server_lib
+
+
+class ClusterResolverInitializationTest(test.TestCase):
+
+  def testCreateSimpleClusterResolverFromLib(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    cluster_resolver.SimpleClusterResolver(base_cluster_spec)
+
+  def testCreateSimpleClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    SimpleClusterResolver(base_cluster_spec)
+
+  def testCreateUnionClusterResolver(self):
+    base_cluster_spec = server_lib.ClusterSpec({
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+    })
+    simple_cr = SimpleClusterResolver(base_cluster_spec)
+    UnionClusterResolver(simple_cr)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/cluster_resolver/python/training/__init__.py b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
index 6d9120a3b96e1960a438772e282ef653b364b7eb..10d93549ebbd4f7e900796d0516b0af1744224af 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/__init__.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/__init__.py
@@ -18,11 +18,36 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.kubernetes_cluster_resolver import KubernetesClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.slurm_cluster_resolver import SlurmClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.tfconfig_cluster_resolver import TFConfigClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'cluster_resolver',
+    'gce_cluster_resolver',
+    'kubernetes_cluster_resolver',
+    'slurm_cluster_resolver',
+    'tfconfig_cluster_resolver',
+    'tpu_cluster_resolver',
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
+    'TPUClusterResolver',
+    'SlurmClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
index 40b1e667ee6039b44b1a442d41dc28dfcbad6dc6..99840fb5166dd739b3bee06a926e06b534011d1f 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,333 +12,29 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Cluster Resolvers are used for dynamic cluster IP/hostname resolution."""
+"""Stub file for ClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import abc
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-import six
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+# pylint: enable=unused-import
 
-from tensorflow.python.training.server_lib import ClusterSpec
+from tensorflow.python.util.all_util import remove_undocumented
 
+_allowed_symbols = [
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+]
 
-def _format_master_url(master, rpc_layer=None):
-  if rpc_layer:
-    return '%s://%s' % (rpc_layer, master)
-  else:
-    return master
+remove_undocumented(__name__, _allowed_symbols)
 
-
-@six.add_metaclass(abc.ABCMeta)
-class ClusterResolver(object):
-  """Abstract class for all implementations of ClusterResolvers.
-
-  This defines the skeleton for all implementations of ClusterResolvers.
-  ClusterResolvers are a way for TensorFlow to communicate with various cluster
-  management systems (e.g. GCE, AWS, etc...).
-
-  By letting TensorFlow communicate with these systems, we will be able to
-  automatically discover and resolve IP addresses for various TensorFlow
-  workers. This will eventually allow us to automatically recover from
-  underlying machine failures and scale TensorFlow worker clusters up and down.
-  """
-
-  @abc.abstractmethod
-  def cluster_spec(self):
-    """Retrieve the current state of the cluster and returns a ClusterSpec.
-
-    Returns:
-      A ClusterSpec representing the state of the cluster at the moment this
-      function is called.
-
-    Implementors of this function must take care in ensuring that the
-    ClusterSpec returned is up-to-date at the time of calling this function.
-    This usually means retrieving the information from the underlying cluster
-    management system every time this function is invoked and reconstructing
-    a cluster_spec, rather than attempting to cache anything.
-    """
-    raise NotImplementedError(
-        'cluster_spec is not implemented for {}.'.format(self))
-
-  @abc.abstractmethod
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
-    """Retrieves the name or URL of the session master.
-
-    Args:
-      task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
-      rpc_layer: (Optional) The RPC protocol for the given cluster.
-
-    Returns:
-      The name or URL of the session master.
-
-    Implementors of this function must take care in ensuring that the master
-    returned is up-to-date at the time to calling this function. This usually
-    means retrieving the master every time this function is invoked.
-    """
-    raise NotImplementedError('master is not implemented for {}.'.format(self))
-
-
-class SimpleClusterResolver(ClusterResolver):
-  """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
-
-  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
-               environment='', num_accelerators_per_worker=0,
-               rpc_layer=None):
-    """Creates a SimpleClusterResolver from a ClusterSpec."""
-    super(SimpleClusterResolver, self).__init__()
-
-    self._task_type = task_type
-    self._task_index = task_index
-    self._environment = environment
-    self._num_accelerators_per_worker = num_accelerators_per_worker
-    self._rpc_layer = rpc_layer
-
-    if not isinstance(cluster_spec, ClusterSpec):
-      raise TypeError('cluster_spec must be a ClusterSpec.')
-    self._cluster_spec = cluster_spec
-
-    if not isinstance(master, str):
-      raise TypeError('master must be a string.')
-    self._master = master
-
-  def cluster_spec(self):
-    """Returns the ClusterSpec passed into the constructor."""
-    return self._cluster_spec
-
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
-    """Returns the master address to use when creating a session.
-
-    Args:
-      task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
-      rpc_layer: (Optional) The RPC used by distributed TensorFlow.
-
-    Returns:
-      The name or URL of the session master.
-
-    If a task_type and task_index is given, this will override the `master`
-    string passed into the initialization function.
-    """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
-    else:
-      master = self._master
-
-    return _format_master_url(master, rpc_layer or self._rpc_layer)
-
-  @property
-  def task_type(self):
-    return self._task_type
-
-  @property
-  def task_index(self):
-    return self._task_index
-
-  @task_type.setter
-  def task_type(self, task_type):
-    self._task_type = task_type
-
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
-
-  @property
-  def environment(self):
-    return self._environment
-
-  def num_accelerators_per_worker(self, session_config=None):
-    """Returns the number of accelerator cores per worker.
-
-    Args:
-      session_config: Unused. The SimpleClusterResolver does not do automatic
-        detection of accelerators, so a TensorFlow session will never be
-        created, and thus a `session_config` is never necessary here, and will
-        be ignored.
-    """
-    del session_config
-    return self._num_accelerators_per_worker
-
-  @property
-  def rpc_layer(self):
-    return self._rpc_layer
-
-  @rpc_layer.setter
-  def rpc_layer(self, rpc_layer):
-    self._rpc_layer = rpc_layer
-
-
-class UnionClusterResolver(ClusterResolver):
-  """Performs a union on underlying ClusterResolvers.
-
-  This class performs a union given two or more existing ClusterResolvers. It
-  merges the underlying ClusterResolvers, and returns one unified ClusterSpec
-  when cluster_spec is called. The details of the merge function is
-  documented in the cluster_spec function.
-
-  For additional Cluster Resolver properties such as task type, task index,
-  rpc layer, environment, etc..., we will return the value from the first
-  ClusterResolver in the union.
-  """
-
-  def __init__(self, *args, **kwargs):
-    """Initializes a UnionClusterResolver with other ClusterResolvers.
-
-    Args:
-      *args: `ClusterResolver` objects to be unionized.
-      **kwargs:
-        rpc_layer - (Optional) Override value for the RPC layer used by
-          TensorFlow.
-        task_type - (Optional) Override value for the current task type.
-        task_index - (Optional) Override value for the current task index.
-
-    Raises:
-      TypeError: If any argument is not a subclass of `ClusterResolvers`.
-      ValueError: If there are no arguments passed.
-    """
-    super(UnionClusterResolver, self).__init__()
-
-    self._rpc_layer = kwargs.pop('rpc_layer', None)
-    self._task_type = kwargs.pop('task_type', None)
-    self._task_index = kwargs.pop('task_index', None)
-
-    if kwargs:
-      raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
-
-    if not args:
-      raise ValueError('At least one ClusterResolver is required.')
-
-    for cluster_resolver in args:
-      if not isinstance(cluster_resolver, ClusterResolver):
-        raise TypeError('All arguments must be a sub-class of '
-                        '`ClusterResolver.`')
-    self._cluster_resolvers = args
-
-  def cluster_spec(self):
-    """Returns a union of all the ClusterSpecs from the ClusterResolvers.
-
-    Returns:
-      A ClusterSpec containing host information merged from all the underlying
-      ClusterResolvers.
-
-    Raises:
-      KeyError: If there are conflicting keys detected when merging two or
-      more dictionaries, this exception is raised.
-
-    Note: If there are multiple ClusterResolvers exposing ClusterSpecs with the
-    same job name, we will merge the list/dict of workers.
-
-    If *all* underlying ClusterSpecs expose the set of workers as lists, we will
-    concatenate the lists of workers, starting with the list of workers from
-    the first ClusterResolver passed into the constructor.
-
-    If *any* of the ClusterSpecs expose the set of workers as a dict, we will
-    treat all the sets of workers as dicts (even if they are returned as lists)
-    and will only merge them into a dict if there is no conflicting keys. If
-    there is a conflicting key, we will raise a `KeyError`.
-    """
-
-    merged_cluster = {}
-
-    # We figure out whether it is all lists for a particular job, or whether
-    # there are dicts inside.
-    for cluster_resolver in self._cluster_resolvers:
-      cluster_spec = cluster_resolver.cluster_spec()
-      cluster_dict = cluster_spec.as_dict()
-
-      for job_name, tasks in cluster_dict.items():
-        if job_name in merged_cluster:
-          # If we see a dict, then we write a dict out regardless.
-          if isinstance(tasks, dict):
-            merged_cluster[job_name] = {}
-        else:
-          # We take whichever type is present.
-          if isinstance(tasks, list):
-            merged_cluster[job_name] = []
-          else:
-            merged_cluster[job_name] = {}
-
-    # We then do the merge as appropriate in merged_cluster[job].
-    for cluster_resolver in self._cluster_resolvers:
-      cluster_spec = cluster_resolver.cluster_spec()
-      cluster_dict = cluster_spec.as_dict()
-
-      for job_name, tasks in cluster_dict.items():
-        if isinstance(merged_cluster[job_name], list):
-          # We all have lists, we can just concatenate and be done.
-          merged_cluster[job_name].extend(tasks)
-        else:
-          if isinstance(tasks, list):
-            # We convert to a dictionary if the type is a list.
-            task_dict = dict(zip(range(0, len(tasks)), tasks))
-          else:
-            # We can simply make a copy (for update) and be done.
-            task_dict = tasks.copy()
-
-          # We detect if there are duplicates, and raise an error if so.
-          task_keys = set(task_dict)
-          merged_keys = set(merged_cluster[job_name].keys())
-          intersected_keys = task_keys.intersection(merged_keys)
-          if intersected_keys:
-            raise KeyError('Duplicate keys detected when merging two '
-                           'ClusterSpecs: %s' % repr(intersected_keys))
-
-          # We do the merge after all the processing.
-          merged_cluster[job_name].update(task_dict)
-
-    return ClusterSpec(merged_cluster)
-
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
-    """Returns the master address to use when creating a session.
-
-    This usually returns the master from the first ClusterResolver passed in,
-    but you can override this by specifying the task_type and task_index.
-
-    Args:
-      task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
-      rpc_layer: (Optional) The RPC protocol for the given cluster.
-
-    Returns:
-      The name or URL of the session master.
-    """
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
-      return _format_master_url(master, rpc_layer or self._rpc_layer)
-
-    return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
-
-  @property
-  def task_type(self):
-    return self._task_type or self._cluster_resolvers[0].task_type
-
-  @property
-  def task_index(self):
-    return self._task_index or self._cluster_resolvers[0].task_index
-
-  @task_type.setter
-  def task_type(self, task_type):
-    self._task_type = task_type
-
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
-
-  @property
-  def environment(self):
-    return self._cluster_resolvers[0].environment
-
-  def num_accelerators_per_worker(self, session_config=None):
-    return self._cluster_resolvers[0].num_accelerators_per_worker(
-        session_config)
-
-  @property
-  def rpc_layer(self):
-    return self._rpc_layer or self._cluster_resolvers[0].rpc_layer
-
-  @rpc_layer.setter
-  def rpc_layer(self, rpc_layer):
-    self._rpc_layer = rpc_layer
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
index 195b68959b6d21ef674438a4a23a4dd07f45faa7..55e61155c683c928efab9bb018868faec3e3df8c 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,197 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for GCE Instance Groups."""
+"""Stub file for GceClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training.server_lib import ClusterSpec
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+# pylint: enable=unused-import
 
-_GOOGLE_API_CLIENT_INSTALLED = True
-try:
-  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
-  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
-except ImportError:
-  _GOOGLE_API_CLIENT_INSTALLED = False
+from tensorflow.python.util.all_util import remove_undocumented
 
+_allowed_symbols = [
+    'GceClusterResolver',
+]
 
-def _format_master_url(master, rpc_layer=None):
-  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
-
-
-class GceClusterResolver(ClusterResolver):
-  """Cluster Resolver for Google Compute Engine.
-
-  This is an implementation of cluster resolvers for the Google Compute Engine
-  instance group platform. By specifying a project, zone, and instance group,
-  this will retrieve the IP address of all the instances within the instance
-  group and return a Cluster Resolver object suitable for use for distributed
-  TensorFlow.
-  """
-
-  def __init__(self,
-               project,
-               zone,
-               instance_group,
-               port,
-               task_type='worker',
-               task_index=0,
-               rpc_layer='grpc',
-               num_accelerators_per_worker=0,
-               credentials='default',
-               service=None):
-    """Creates a new GceClusterResolver object.
-
-    This takes in a few parameters and creates a GceClusterResolver project. It
-    will then use these parameters to query the GCE API for the IP addresses of
-    each instance in the instance group.
-
-    Args:
-      project: Name of the GCE project.
-      zone: Zone of the GCE instance group.
-      instance_group: Name of the GCE instance group.
-      port: Port of the listening TensorFlow server (default: 8470)
-      task_type: Name of the TensorFlow job this GCE instance group of VM
-        instances belong to.
-      task_index: The task index for this particular VM, within the GCE
-        instance group. In particular, every single instance should be assigned
-        a unique ordinal index within an instance group manually so that they
-        can be distinguished from each other.
-      rpc_layer: The RPC layer TensorFlow should use to communicate across
-        instances.
-      num_accelerators_per_worker: Number of accelerators (GPUs) present per
-        instance.
-      credentials: GCE Credentials. If nothing is specified, this defaults to
-        GoogleCredentials.get_application_default().
-      service: The GCE API object returned by the googleapiclient.discovery
-        function. (Default: discovery.build('compute', 'v1')). If you specify a
-        custom service object, then the credentials parameter will be ignored.
-
-    Raises:
-      ImportError: If the googleapiclient is not installed.
-    """
-    self._project = project
-    self._zone = zone
-    self._instance_group = instance_group
-    self._task_type = task_type
-    self._task_index = task_index
-    self._rpc_layer = rpc_layer
-    self._port = port
-    self._credentials = credentials
-
-    if credentials == 'default':
-      if _GOOGLE_API_CLIENT_INSTALLED:
-        self._credentials = GoogleCredentials.get_application_default()
-
-    if service is None:
-      if not _GOOGLE_API_CLIENT_INSTALLED:
-        raise ImportError('googleapiclient must be installed before using the '
-                          'GCE cluster resolver')
-      self._service = discovery.build(
-          'compute', 'v1',
-          credentials=self._credentials)
-    else:
-      self._service = service
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest instance group info.
-
-    This returns a ClusterSpec object for use based on information from the
-    specified instance group. We will retrieve the information from the GCE APIs
-    every time this method is called.
-
-    Returns:
-      A ClusterSpec containing host information retrieved from GCE.
-    """
-    request_body = {'instanceState': 'RUNNING'}
-    request = self._service.instanceGroups().listInstances(
-        project=self._project,
-        zone=self._zone,
-        instanceGroups=self._instance_group,
-        body=request_body,
-        orderBy='name')
-
-    worker_list = []
-
-    while request is not None:
-      response = request.execute()
-
-      items = response['items']
-      for instance in items:
-        instance_name = instance['instance'].split('/')[-1]
-
-        instance_request = self._service.instances().get(
-            project=self._project,
-            zone=self._zone,
-            instance=instance_name)
-
-        if instance_request is not None:
-          instance_details = instance_request.execute()
-          ip_address = instance_details['networkInterfaces'][0]['networkIP']
-          instance_url = '%s:%s' % (ip_address, self._port)
-          worker_list.append(instance_url)
-
-      request = self._service.instanceGroups().listInstances_next(
-          previous_request=request,
-          previous_response=response)
-
-    worker_list.sort()
-    return ClusterSpec({self._task_type: worker_list})
-
-  def master(self, task_type=None, task_index=None, rpc_layer=None):
-    task_type = task_type if task_type is not None else self._task_type
-    task_index = task_index if task_index is not None else self._task_index
-
-    if task_type is not None and task_index is not None:
-      master = self.cluster_spec().task_address(task_type, task_index)
-      if rpc_layer or self._rpc_layer:
-        return '%s://%s' % (rpc_layer or self._rpc_layer, master)
-      else:
-        return master
-
-    return ''
-
-  @property
-  def task_type(self):
-    return self._task_type
-
-  @property
-  def task_index(self):
-    return self._task_index
-
-  @task_type.setter
-  def task_type(self, task_type):
-    raise RuntimeError(
-        'You cannot reset the task_type of the GceClusterResolver after it has '
-        'been created.')
-
-  @task_index.setter
-  def task_index(self, task_index):
-    self._task_index = task_index
-
-  @property
-  def environment(self):
-    """Returns the current environment which TensorFlow is running in.
-
-    For users in the GCE environment, the environment property is always an
-    empty string, and Google users will not use this ClusterResolver for running
-    on internal systems.
-    """
-    return ''
-
-  @property
-  def rpc_layer(self):
-    return self._rpc_layer
-
-  @rpc_layer.setter
-  def rpc_layer(self, rpc_layer):
-    self._rpc_layer = rpc_layer
-
-  def num_accelerators_per_worker(self, session_config=None):
-    del session_config  # Unused, since this is set manually in __init__.
-    return self._num_accelerators_per_worker
-
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
index ddae64839f01b4f67fe4c0c0bc00199bb2e037aa..a8eaf33629a6299d5da5f8a930e0cad7d07044e8 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver.py
@@ -12,121 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for Kubernetes."""
+"""Stub file for KubernetesClusterResolver for backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training import server_lib
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-_KUBERNETES_API_CLIENT_INSTALLED = True
-try:
-  from kubernetes import client as k8sclient  # pylint: disable=g-import-not-at-top
-  from kubernetes import config as k8sconfig  # pylint: disable=g-import-not-at-top
-except ImportError:
-  _KUBERNETES_API_CLIENT_INSTALLED = False
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+# pylint: enable=unused-import
 
+from tensorflow.python.util.all_util import remove_undocumented
 
-class KubernetesClusterResolver(ClusterResolver):
-  """Cluster Resolver for Kubernetes.
+_allowed_symbols = [
+    'KubernetesClusterResolver',
+]
 
-  This is an implementation of cluster resolvers for Kubernetes. When given the
-  the Kubernetes namespace and label selector for pods, we will retrieve the
-  pod IP addresses of all running pods matching the selector, and return a
-  ClusterSpec based on that information.
-  """
+remove_undocumented(__name__, _allowed_symbols)
 
-  def __init__(self,
-               job_to_label_mapping=None,
-               tf_server_port=8470,
-               override_client=None):
-    """Initializes a new KubernetesClusterResolver.
-
-    This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver
-    will attempt to talk to the Kubernetes master to retrieve all the instances
-    of pods matching a label selector.
-
-    Args:
-      job_to_label_mapping: A mapping of TensorFlow jobs to label selectors.
-        This allows users to specify many TensorFlow jobs in one Cluster
-        Resolver, and each job can have pods belong with different label
-        selectors. For example, a sample mapping might be
-        ```
-        {'worker': ['job-name=worker-cluster-a', 'job-name=worker-cluster-b'],
-         'ps': ['job-name=ps-1', 'job-name=ps-2']}
-        ```
-      tf_server_port: The port the TensorFlow server is listening on.
-      override_client: The Kubernetes client (usually automatically retrieved
-        using `from kubernetes import client as k8sclient`). If you pass this
-        in, you are responsible for setting Kubernetes credentials manually.
-
-    Raises:
-      ImportError: If the Kubernetes Python client is not installed and no
-        `override_client` is passed in.
-    """
-    if _KUBERNETES_API_CLIENT_INSTALLED:
-      k8sconfig.load_kube_config()
-
-    if not job_to_label_mapping:
-      job_to_label_mapping = {'worker': ['job-name=tensorflow']}
-
-    if not override_client and not _KUBERNETES_API_CLIENT_INSTALLED:
-      raise ImportError('The Kubernetes Python client must be installed before'
-                        'using the Kubernetes Cluster Resolver. To install the'
-                        'Kubernetes Python client, run `pip install '
-                        'kubernetes` on your command line.')
-
-    self._job_to_label_mapping = job_to_label_mapping
-    self._tf_server_port = tf_server_port
-    self._override_client = override_client
-
-  def master(self):
-    # TODO(frankchn): Figure out a standard way to pass in the current task type
-    # and task id via Kubernetes.
-    pass
-
-  def get_master(self):
-    return self.master()
-
-  def get_job_name(self):
-    return self._job_name
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest info from Kubernetes.
-
-    We retrieve the information from the Kubernetes master every time this
-    method is called.
-
-    Returns:
-      A ClusterSpec containing host information returned from Kubernetes.
-
-    Raises:
-      RuntimeError: If any of the pods returned by the master is not in the
-        `Running` phase.
-    """
-    if not self._override_client:
-      k8sconfig.load_kube_config()
-
-    client = self._override_client or k8sclient.CoreV1Api()
-    cluster_map = {}
-
-    for tf_job in self._job_to_label_mapping:
-      all_pods = []
-      for selector in self._job_to_label_mapping[tf_job]:
-        ret = client.list_pod_for_all_namespaces(label_selector=selector)
-        selected_pods = []
-
-        # Sort the list by the name to make sure it doesn't change call to call.
-        for pod in sorted(ret.items, key=lambda x: x.metadata.name):
-          if pod.status.phase == 'Running':
-            selected_pods.append(
-                '%s:%s' % (pod.status.host_ip, self._tf_server_port))
-          else:
-            raise RuntimeError('Pod "%s" is not running; phase: "%s"' %
-                               (pod.metadata.name, pod.status.phase))
-        all_pods.extend(selected_pods)
-      cluster_map[tf_job] = all_pods
-
-    return server_lib.ClusterSpec(cluster_map)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
index dabe2fe1d39db14c60e5437d636144f18c384cf1..fcd2a846eeb1be7ad4b5a98b067a125afbbebc7d 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver.py
@@ -12,185 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for Slurm workload manager."""
+"""Stub file for SlurmClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections
-import os
-import subprocess
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training.server_lib import ClusterSpec
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+# pylint: enable=unused-import
 
+from tensorflow.python.util.all_util import remove_undocumented
 
-class SlurmClusterResolver(ClusterResolver):
-  """Cluster Resolver for system with Slurm workload manager.
+_allowed_symbols = [
+    'SlurmClusterResolver',
+]
 
-  This is an implementation of cluster resolvers for Slurm clusters. This allows
-  the specification of jobs and task counts, number of tasks per node, number of
-  GPUs on each node and number of GPUs for each task, It retrieves system
-  attributes by Slurm environment variables, resolves allocated computing node
-  names, construct a cluster and return a Cluster Resolver object which an be
-  use for distributed TensorFlow.
-  """
-
-  def _resolve_hostnames(self):
-    """Resolve host names of nodes allocated in current jobs.
-
-    Returns:
-      A list of node names as strings.
-    """
-    hostlist = (subprocess.check_output(['scontrol', 'show', 'hostname']).
-                decode('utf-8').strip().split('\n'))
-    return hostlist
-
-  def __init__(self,
-               jobs,
-               port_base=8888,
-               gpus_per_node=1,
-               gpus_per_task=1,
-               tasks_per_node=None,
-               auto_set_gpu=True):
-    """Creates a new SlurmClusterResolver object.
-
-    This takes in parameters and creates a SlurmClusterResolver object. It uses
-    those parameters to check which nodes will processes reside and resolves
-    their hostnames. With the number of the GPUs on each node and number of GPUs
-    for each task it offsets the port number for each processes and allocate
-    GPUs to tasks by setting environment variables. The resolver currently
-    supports homogeneous tasks and default Slurm process allocation.
-
-    Args:
-      jobs: Dictionary with job names as key and number of tasks in the job as
-        value
-      port_base: The first port number to start with for processes on a node.
-      gpus_per_node: Number of GPUs available on each node.
-      gpus_per_task: Number of GPUs to be used for each task.
-      tasks_per_node: Number of tasks to run on each node, if not set defaults
-        to Slurm's output environment variable SLURM_NTASKS_PER_NODE.
-      auto_set_gpu: Set the visible CUDA devices automatically while resolving
-        the cluster by setting CUDA_VISIBLE_DEVICES environment variable.
-        Defaults to True.
-
-    Returns:
-      A ClusterResolver object which can be used with distributed TensorFlow.
-
-    Raises:
-      RuntimeError: If requested more GPUs per node then available or requested
-      more tasks then assigned tasks.
-    """
-
-    # check if launched by mpirun
-    if 'OMPI_COMM_WORLD_RANK' in os.environ:
-      self._rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
-      num_tasks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
-    else:
-      self._rank = int(os.environ['SLURM_PROCID'])
-      num_tasks = int(os.environ['SLURM_NTASKS'])
-
-    self._jobs = collections.OrderedDict(sorted(jobs.items()))
-    self._port_base = port_base
-
-    # user specification overrides SLURM specification
-    if tasks_per_node is not None:
-      self._tasks_per_node = tasks_per_node
-    elif tasks_per_node is None and 'SLURM_NTASKS_PER_NODE' in os.environ:
-      self._tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE'])
-    else:
-      raise RuntimeError('Neither `tasks_per_node` or '
-                         'SLURM_NTASKS_PER_NODE is set.')
-
-    self._gpus_per_node = gpus_per_node
-    self._gpus_per_task = gpus_per_task
-
-    self._auto_set_gpu = auto_set_gpu
-    self._job_name = None
-    self._task_index = None
-
-    self._gpu_allocation = []
-    self._cluster_allocation = {}
-
-    if self._tasks_per_node * self._gpus_per_task > self._gpus_per_node:
-      raise RuntimeError('Requested more GPUs per node then available.')
-
-    if sum(self._jobs.values()) != num_tasks:
-      raise RuntimeError('Requested more tasks then assigned tasks.')
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest instance group info.
-
-    This returns a ClusterSpec object for use based on information from the
-    specified initialization parameters and Slurm environment variables. The
-    cluster specification is resolved each time this function is called. The
-    resolver extract hostnames of nodes by scontrol and pack tasks in that
-    order until a node a has number of tasks that is equal to specification.
-    GPUs on nodes are allocated to tasks by specification through setting
-    CUDA_VISIBLE_DEVICES environment variable.
-
-    Returns:
-      A ClusterSpec containing host information retrieved from Slurm's
-        environment variables.
-    """
-    hostlist = self._resolve_hostnames()
-
-    task_list = []
-    self._gpu_allocation = []
-    self._cluster_allocation = {}
-
-    for host in hostlist:
-      for port_offset, gpu_offset in zip(
-          range(self._tasks_per_node),
-          range(0, self._gpus_per_node, self._gpus_per_task)):
-
-        host_addr = '%s:%d' % (host, self._port_base + port_offset)
-        task_list.append(host_addr)
-        gpu_id_list = []
-
-        for gpu_id in range(gpu_offset, gpu_offset + self._gpus_per_task):
-          gpu_id_list.append(str(gpu_id))
-
-        self._gpu_allocation.append(','.join(gpu_id_list))
-
-    cluster_rank_offset_start = 0
-    cluster_rank_offset_end = 0
-
-    for job_name, num_tasks in self._jobs.items():
-      cluster_rank_offset_end = cluster_rank_offset_start + num_tasks
-
-      self._cluster_allocation[job_name] = \
-        task_list[cluster_rank_offset_start:cluster_rank_offset_end]
-
-      if self._rank >= cluster_rank_offset_start and \
-          self._rank < cluster_rank_offset_end:
-
-        self._job_name = job_name
-        self._task_index = self._rank - cluster_rank_offset_start
-
-      cluster_rank_offset_start = cluster_rank_offset_end
-
-    if self._auto_set_gpu is True:
-      os.environ['CUDA_VISIBLE_DEVICES'] = self._gpu_allocation[self._rank]
-
-    return ClusterSpec(self._cluster_allocation)
-
-  def get_task_info(self):
-    """Returns job name and task_index for the process which calls this.
-
-    This returns the job name and task index for the process which calls this
-    function according to its rank and cluster specification. The job name and
-    task index are set after a cluster is constructed by cluster_spec otherwise
-    defaults to None.
-
-    Returns:
-      A string specifying job name the process belongs to and an integner
-        specifying the task index the process belongs to in that job.
-    """
-    return self._job_name, self._task_index
-
-  def master(self, task_type=None, task_index=None):
-    if task_type and task_index:
-      return self.cluster_spec().task_address(task_type, task_index)
-    return self._cluster_allocation[str(self._job_name)][self._task_index]
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
index 7bbd189d03d9c96914d11948941916739f10d18f..9db7f47dcb49c499719b9002b1d2d6c4837a7bd2 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver.py
@@ -12,81 +12,25 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for TF_CONFIG Environment Variables."""
-
+"""Stub file for TFConfigClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import json
-import os
-
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training.server_lib import ClusterSpec
-
-_TF_CONFIG_ENV = 'TF_CONFIG'
-_SESSION_MASTER_KEY = 'session_master'
-
-
-class TFConfigClusterResolver(ClusterResolver):
-  """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
-
-  def _load_tf_config(self):
-    return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec based on the TF_CONFIG environment variable.
-
-    Returns:
-      A ClusterSpec with information from the TF_CONFIG environment variable.
-    """
-    tf_config = self._load_tf_config()
-    if 'cluster' not in tf_config:
-      return ClusterSpec({})
-    return ClusterSpec(tf_config['cluster'])
-
-  def master(self, task_type=None, task_index=0):
-    """Returns the master address to use when creating a TensorFlow session.
-
-    Args:
-      task_type: (String, optional) Overrides and sets the task_type of the
-        master.
-      task_index: (Integer, optional) Overrides and sets the task id of the
-        master.
-
-    Returns:
-      The address of the master.
-
-    Raises:
-      RuntimeError: If the task_type or task_id is not specified and the
-        `TF_CONFIG` environment variable does not contain a task section.
-    """
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-    # If `session_master` is set, just use that.
-    tf_config = self._load_tf_config()
-    if _SESSION_MASTER_KEY in tf_config:
-      return tf_config[_SESSION_MASTER_KEY]
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+# pylint: enable=unused-import
 
-    if 'rpc_layer' in tf_config:
-      rpclayer = '%s://' % tf_config['rpc_layer']
-    else:
-      rpclayer = ''
+from tensorflow.python.util.all_util import remove_undocumented
 
-    # Return an empty string if we are the only job in the ClusterSpec.
-    cluster_spec = self.cluster_spec()
-    if (not cluster_spec.jobs or
-        (len(cluster_spec.jobs) == 1 and
-         len(cluster_spec.job_tasks(cluster_spec.jobs[0])) == 1)):
-      return ''
+_allowed_symbols = [
+    'TFConfigClusterResolver',
+]
 
-    # We try to auto-detect the task type and id, but uses the user-supplied one
-    # where available
-    if not task_type:
-      if 'task' not in tf_config:
-        raise RuntimeError('You must either specify a `task_type`, or your '
-                           'TF_CONFIG must contain a `task` section.')
-      task_type = tf_config['task']['type']
-      task_index = tf_config['task']['index']
+remove_undocumented(__name__, _allowed_symbols)
 
-    return rpclayer + cluster_spec.task_address(task_type, task_index)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
index 1f6803a9ff9a7a1e72ee691afd7e22bb4d85475c..3a1eaccd06e574babbe9a3232dacd1d66f3a4648 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
+++ b/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver.py
@@ -1,4 +1,4 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,341 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Implementation of Cluster Resolvers for Cloud TPUs."""
+"""Stub file for TPUClusterResolver to maintain backwards compatibility."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import os
+# This file (and all files in this directory in general) is a backwards
+# compatibility shim that exists to re-export ClusterResolvers such that
+# existing OSS code will not be broken.
 
-from six.moves.urllib.request import Request
-from six.moves.urllib.request import urlopen
+# pylint: disable=unused-import
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+# pylint: enable=unused-import
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import ClusterResolver
-from tensorflow.python.training import server_lib
-from tensorflow.python.util import compat
+from tensorflow.python.util.all_util import remove_undocumented
 
-_GOOGLE_API_CLIENT_INSTALLED = True
-try:
-  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
-  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
-except ImportError:
-  _GOOGLE_API_CLIENT_INSTALLED = False
+_allowed_symbols = [
+    'TPUClusterResolver',
+]
 
-
-_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
-_ENDPOINTS_SEPARATOR = ','
-_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
-_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
-
-
-class TPUClusterResolver(ClusterResolver):
-  """Cluster Resolver for Google Cloud TPUs.
-
-  This is an implementation of cluster resolvers for the Google Cloud TPU
-  service. As Cloud TPUs are in alpha, you will need to specify a API definition
-  file for this to consume, in addition to a list of Cloud TPUs in your Google
-  Cloud Platform project.
-  """
-
-  def _tpuService(self):
-    """Creates a new Cloud TPU API object.
-
-    This works around an issue where the underlying HTTP connection sometimes
-    times out when the script has been running for too long. Other methods in
-    this object calls this method to get a new API object whenever they need
-    to communicate with the Cloud API.
-
-    Returns:
-      A Google Cloud TPU API object.
-    """
-    if self._service:
-      return self._service
-
-    credentials = self._credentials
-    if credentials is None or credentials == 'default':
-      credentials = GoogleCredentials.get_application_default()
-
-    if self._discovery_url:
-      return discovery.build(
-          'tpu', 'v1alpha1',
-          credentials=credentials,
-          discoveryServiceUrl=self._discovery_url)
-    else:
-      return discovery.build(
-          'tpu', 'v1alpha1',
-          credentials=credentials)
-
-  def _requestComputeMetadata(self, path):
-    req = Request('http://metadata/computeMetadata/v1/%s' % path,
-                  headers={'Metadata-Flavor': 'Google'})
-    resp = urlopen(req)
-    return compat.as_bytes(resp.read())
-
-  def _shouldResolve(self):
-    if (self._tpu == compat.as_bytes('') or
-        self._tpu == compat.as_bytes('local') or
-        self._tpu.startswith(compat.as_bytes('/bns')) or
-        self._tpu.startswith(compat.as_bytes('localhost:')) or
-        self._tpu.startswith(compat.as_bytes('grpc://'))):
-      return False
-    return True
-
-  @staticmethod
-  def _inGke():
-    """When running in GKE, the environment variable will be set."""
-    return _GKE_ENV_VARIABLE in os.environ
-
-  @staticmethod
-  def _gkeEndpoints():
-    return os.environ[_GKE_ENV_VARIABLE]
-
-  @staticmethod
-  def _envVarFallback():
-    if _DEFAULT_ENV_VARIABLE in os.environ:
-      return os.environ[_DEFAULT_ENV_VARIABLE]
-    return None
-
-  @staticmethod
-  def _environmentDiscoveryUrl():
-    return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
-
-  def __init__(self,
-               tpu=None,
-               zone=None,
-               project=None,
-               job_name='worker',
-               coordinator_name=None,
-               coordinator_address=None,
-               credentials='default',
-               service=None,
-               discovery_url=None):
-    """Creates a new TPUClusterResolver object.
-
-    The ClusterResolver will then use the parameters to query the Cloud TPU APIs
-    for the IP addresses and ports of each Cloud TPU listed.
-
-    Args:
-      tpu: Either a string, or a list of strings corresponding to the TPUs to
-        use. If the single string is the empty string, the string 'local', or a
-        string that begins with 'grpc://' or '/bns', then it is assumed to not
-        correspond with a Cloud TPU and will instead be passed as the session
-        master and no ClusterSpec propagation will be done.
-      zone: Zone where the TPUs are located. If omitted or empty, we will assume
-        that the zone of the TPU is the same as the zone of the GCE VM, which we
-        will try to discover from the GCE metadata service.
-      project: Name of the GCP project containing Cloud TPUs. If omitted or
-        empty, we will try to discover the project name of the GCE VM from the
-        GCE metadata service.
-      job_name: Name of the TensorFlow job the TPUs belong to.
-      coordinator_name: The name to use for the coordinator. Set to None if the
-        coordinator should not be included in the computed ClusterSpec.
-      coordinator_address: The address of the coordinator (typically an ip:port
-        pair). If set to None, a TF server will be started. If coordinator_name
-        is None, a TF server will not be started even if coordinator_address is
-        None.
-      credentials: GCE Credentials. If None, then we use default credentials
-        from the oauth2client
-      service: The GCE API object returned by the googleapiclient.discovery
-        function. If you specify a custom service object, then the credentials
-        parameter will be ignored.
-      discovery_url: A URL template that points to the location of
-        the discovery service. It should have two parameters {api} and
-        {apiVersion} that when filled in produce an absolute URL to the
-        discovery document for that service. The environment variable
-        'TPU_API_DISCOVERY_URL' will override this.
-
-    Raises:
-      ImportError: If the googleapiclient is not installed.
-      ValueError: If no TPUs are specified.
-    """
-    if isinstance(tpu, list):
-      if not tpu:
-        raise ValueError('At least one TPU must be specified.')
-      if len(tpu) != 1:
-        raise NotImplementedError(
-            'Using multiple TPUs in a single session is not yet implemented')
-      tpu = tpu[0]
-
-    in_gke = self._inGke()
-    # When using GKE with Cloud TPUs, the env variable will be set.
-    if tpu is None:
-      if in_gke:
-        tpu = self._gkeEndpoints()
-      else:
-        tpu = self._envVarFallback()
-
-    if tpu is None:
-      raise ValueError('Please provide a TPU Name to connect to.')
-
-    self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
-    self._job_name = job_name
-
-    # Whether we should actually attempt to contact Cloud APIs
-    should_resolve = self._shouldResolve()
-
-    # We error out if we are in a non-Cloud environment which cannot talk to the
-    # Cloud APIs using the standard class and a special object is not passed in.
-    self._service = service
-    if (self._service is None and should_resolve and
-        not _GOOGLE_API_CLIENT_INSTALLED):
-      raise ImportError('googleapiclient and oauth2client must be installed '
-                        'before using the TPU cluster resolver. Execute: '
-                        '`pip install --upgrade google-api-python-client` '
-                        'and `pip install --upgrade oauth2client` to '
-                        'install with pip.')
-
-    # We save user-passed credentials, unless the user didn't pass in anything.
-    self._credentials = credentials
-    if (credentials == 'default' and should_resolve and
-        _GOOGLE_API_CLIENT_INSTALLED):
-      self._credentials = None
-
-    # Automatically detect project and zone if unspecified.
-    if not project and should_resolve:
-      project = compat.as_str(
-          self._requestComputeMetadata('project/project-id'))
-    if not zone and should_resolve:
-      zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
-      zone = zone_path.split('/')[-1]
-    self._project = project
-    self._zone = zone
-
-    self._discovery_url = self._environmentDiscoveryUrl() or discovery_url
-
-    self._coordinator_name = coordinator_name
-    if (coordinator_name and not coordinator_address and
-        (should_resolve or in_gke)):
-      self._start_local_server()
-    else:
-      self._coordinator_address = coordinator_address
-
-  def master(self, task_type=None, task_index=None):
-    """Get the Master string to be used for the session.
-
-    In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
-    first instance in the ClusterSpec returned by the cluster_spec function.
-
-    If a non-TPU name is used when constructing a TPUClusterResolver, that will
-    be returned instead (e.g. If the tpus argument's value when constructing
-    this TPUClusterResolver was 'grpc://10.240.1.2:8470',
-    'grpc://10.240.1.2:8470' will be returned).
-
-    Args:
-      task_type: (Optional) The type of the TensorFlow task of the master.
-      task_index: (Optional) The index of the TensorFlow task of the master.
-
-    Returns:
-      string, the connection string to use when creating a session.
-
-    Raises:
-      ValueError: If none of the TPUs specified exists.
-    """
-    if not self._shouldResolve():
-      return self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
-
-    cluster_spec = self.cluster_spec()
-    if task_type and task_index:
-      return cluster_spec.task_address(task_type, task_index)
-
-    job_tasks = cluster_spec.job_tasks(self._job_name)
-    if not job_tasks:
-      raise ValueError('No TPUs exists with the specified names exist.')
-
-    return 'grpc://' + job_tasks[0]
-
-  def get_master(self):
-    return self.master()
-
-  def get_job_name(self):
-    if self._shouldResolve():
-      return self._job_name
-
-  def cluster_spec(self):
-    """Returns a ClusterSpec object based on the latest TPU information.
-
-    We retrieve the information from the GCE APIs every time this method is
-    called.
-
-    Returns:
-      A ClusterSpec containing host information returned from Cloud TPUs.
-
-    Raises:
-      RuntimeError: If the provided TPU is not healthy.
-    """
-    ############################################################################
-    # There are 5 potential cases this code must handle:
-    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
-    #      a. Create a ClusterSpec that includes the coordinator job
-    #      b. Create a ClusterSpec without the coordinator job.
-    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
-    #     tasks and
-    #      a. Create a ClusterSpec with the coordinator
-    #      b. Create a ClusterSpec without the coordinator
-    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
-    ############################################################################
-
-    if self._shouldResolve():
-      # Case 1.
-      full_name = 'projects/%s/locations/%s/nodes/%s' % (
-          self._project, self._zone, compat.as_text(self._tpu))
-      service = self._tpuService()
-      request = service.projects().locations().nodes().get(name=full_name)
-      response = request.execute()
-
-      if 'state' in response and response['state'] != 'READY':
-        raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
-                           (compat.as_text(self._tpu), response['state']))
-
-      if 'health' in response and response['health'] != 'HEALTHY':
-        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
-                           (compat.as_text(self._tpu), response['health']))
-
-      if 'networkEndpoints' in response:
-        worker_list = [
-            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
-            for endpoint in response['networkEndpoints']
-        ]
-      else:
-        # Fall back to the deprecated response format
-        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
-        worker_list = [instance_url]
-
-      cluster_spec = {self._job_name: worker_list}
-    else:
-      if not self._tpu.startswith(compat.as_bytes('grpc://')):
-        # Case 3.
-        return None
-      # Case 2.
-      cluster_spec = {
-          self._job_name: [
-              x[len(compat.as_bytes('grpc://')):]
-              for x in self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))
-          ]
-      }
-
-    if self._coordinator_address:
-      # {1, 2}.a
-      cluster_spec[self._coordinator_name] = [self._coordinator_address]
-
-    return server_lib.ClusterSpec(cluster_spec)
-
-  def _start_local_server(self):
-    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
-    self._server = server_lib.Server(
-        {
-            'local': ['0.0.0.0:0']
-        }, protocol='grpc', config=None, start=True)
-    # self._server.target is of the form: grpc://ipaddress:port
-    target = compat.as_bytes(self._server.target)
-    splits = target.split(compat.as_bytes(':'))
-    assert len(splits) == 3, self._server.target
-    assert splits[0] == compat.as_bytes('grpc'), self._server.target
-    self._coordinator_port = compat.as_text(splits[2])
-    self._coordinator_address = '%s:%s' % (
-        address, compat.as_text(self._coordinator_port))
-
-  def __deepcopy__(self, memo):
-    # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy.
-    return self
+remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/cmake/README.md b/tensorflow/contrib/cmake/README.md
index fca24b16043524c7651c7b7a3a83cac1bfdd53fb..df8b48dfc46124d3b9454d92ffb70dbcf1bc4217 100644
--- a/tensorflow/contrib/cmake/README.md
+++ b/tensorflow/contrib/cmake/README.md
@@ -5,10 +5,10 @@ CMAKE build is deprecated for TensorFlow. Please use `bazel` to build TF for all
 platforms. For details, see the
 [TensorFlow install guide](https://www.tensorflow.org/install/).
 
-This directory contains CMake files for building TensorFlow on Microsoft
-Windows and Linux. [CMake](https://cmake.org) is a cross-platform tool that can
-generate build scripts for multiple build systems, including Microsoft
-Visual Studio and GCC. "The method has not been tested on Mac OS X.
+This directory contains CMake files for building TensorFlow on Microsoft Windows
+and Linux. [CMake](https://cmake.org) is a cross-platform tool that can generate
+build scripts for multiple build systems, including Microsoft Visual Studio and
+GCC. "The method has not been tested on Mac OS X.
 
 **N.B.** We provide Linux build instructions primarily for the purpose of
 testing the build. We recommend using the standard Bazel-based build on
@@ -17,13 +17,17 @@ Linux.
 Current Status
 --------------
 
-CMake can be used to build TensorFlow on all platforms. See the [getting started documentation](https://www.tensorflow.org/install/install_windows)
-for instructions on how to install a pre-built TensorFlow package on Windows and Linux. The procedure in MacOS is similar to the Linux build.
+CMake can be used to build TensorFlow on all platforms. See the
+[getting started documentation](https://www.tensorflow.org/install/install_windows)
+for instructions on how to install a pre-built TensorFlow package on Windows and
+Linux. The procedure in MacOS is similar to the Linux build.
 
 ### Current known limitations
-* It is not possible to load a custom Op library.
-* GCS file system is not supported.
-* Debug build is not available since Python for Windows is no longer distributed with a debug library.
+
+*   It is not possible to load a custom Op library.
+*   GCS file system is not supported.
+*   Debug build is not available since Python for Windows is no longer
+    distributed with a debug library.
 
 ## Building with CMake
 
@@ -33,77 +37,88 @@ bindings.
 
 ### Prerequisites
 
-* CMake version 3.5 or later.
+*   CMake version 3.5 or later.
+
+*   [Git](https://git-scm.com)
+
+*   [SWIG](http://www.swig.org/download.html)
+
+*   [Perl](https://www.perl.org/get.html) (optional, for SSL support build)
+
+*   [Go](https://golang.org/) (optional, for SSL support build)
 
-* [Git](https://git-scm.com)
+*   [NASM](http://www.nasm.us/)/[YASM](http://yasm.tortall.net/) (optional, for
+    SSL support build)
 
-* [SWIG](http://www.swig.org/download.html)
+*   Additional pre-requisites for Microsoft Windows:
 
-* [Perl](https://www.perl.org/get.html) (optional, for SSL support build)
+    -   Visual Studio 2015 (latest version of MSVC 2017 is not supported by CUDA
+        yet, try it on your own risk)
 
-* [Go](https://golang.org/) (optional, for SSL support build)
+    -   Python 3.5
 
-* [NASM](http://www.nasm.us/)/[YASM](http://yasm.tortall.net/) (optional, for SSL support build)
+*   Additional prerequisites for Linux:
 
-* Additional pre-requisites for Microsoft Windows:
-  - Visual Studio 2015 (latest version of MSVC 2017 is not supported by CUDA yet, try it on your own risk)
-  
-  - Python 3.5
+    -   Python 2.7 or later
+    -   [Docker](https://www.docker.com/) (for automated testing)
 
-* Additional prerequisites for Linux:
-  - Python 2.7 or later
-  - [Docker](https://www.docker.com/) (for automated testing)
+*   Python dependencies:
 
-* Python dependencies:
-  - wheel
-  - NumPy 1.11.0 or later
+    -   wheel
+    -   NumPy 1.11.0 or later
 
 ### Known-good configurations
 
-* Microsoft Windows 10
-  - Microsoft Visual Studio Enterprise/ Community 2015 with Visual C++ 2015
-  - [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
-  - [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
-  - [swigwin-3.0.10](http://www.swig.org/download.html)
-  - [NVidia CUDA Toolkit 9.0](https://developer.nvidia.com/cuda-downloads)
-  - [NVidia CUDNN 7](https://developer.nvidia.com/cudnn)
-  - [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
+*   Microsoft Windows 10
 
-* Ubuntu 14.04
-  - Makefile generator
-  - Docker 1.9.1 (for automated testing)
+    -   Microsoft Visual Studio Enterprise/ Community 2015 with Visual C++ 2015
+    -   [Anaconda 4.1.1 (Python 3.5 64-bit)](https://www.anaconda.com/download/)
+    -   [Git for Windows version 2.9.2.windows.1](https://git-scm.com/download/win)
+    -   [swigwin-3.0.10](http://www.swig.org/download.html)
+    -   [NVidia CUDA Toolkit 9.0](https://developer.nvidia.com/cuda-downloads)
+    -   [NVidia CUDNN 7](https://developer.nvidia.com/cudnn)
+    -   [CMake 3.6](https://cmake.org/files/v3.6/cmake-3.6.3-win64-x64.msi)
+
+*   Ubuntu 14.04
+
+    -   Makefile generator
+    -   Docker 1.9.1 (for automated testing)
 
 ### Current known limitations
-  - The Python package supports **Python 3.5/3.6 only**, because these are the only
-    versions for which standard Python binaries exist and those binaries are
-    compatible with the TensorFlow runtime. (On Windows, the standard Python
+
+-   The Python package supports **Python 3.5/3.6 only**, because these are the
+    only versions for which standard Python binaries exist and those binaries
+    are compatible with the TensorFlow runtime. (On Windows, the standard Python
     binaries for versions earlier than 3.5 were compiled with older compilers
     that do not have all of the features (e.g. C++11 support) needed to compile
-    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7
-    on Windows, but have not yet committed to supporting that configuration.)
-
-  - The following Python APIs are not currently implemented:
-    * Loading custom op libraries via `tf.load_op_library()`. In order to use your
-      custom op, please put the source code under the tensorflow/core/user_ops
-      directory, and a shape function is required (not optional) for each op.
-    * Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
-      functional.
-
-  - The `tf.contrib` libraries are not currently included in the PIP package.
-
-  - The following operations are not currently implemented:
-    * `DepthwiseConv2dNative`
-    * `Digamma`
-    * `Erf`
-    * `Erfc`
-    * `Igamma`
-    * `Igammac`
-    * `ImmutableConst`
-    * `Lgamma`
-    * `Polygamma`
-    * `Zeta`
-
-  - Google Cloud Storage support is not currently implemented. The GCS library
+    TensorFlow. We welcome patches for making TensorFlow work with Python 2.7 on
+    Windows, but have not yet committed to supporting that configuration.)
+
+-   The following Python APIs are not currently implemented:
+
+    *   Loading custom op libraries via `tf.load_op_library()`. In order to use
+        your custom op, please put the source code under the
+        tensorflow/core/user_ops directory, and a shape function is required
+        (not optional) for each op.
+    *   Path manipulation functions (such as `tf.gfile.ListDirectory()`) are not
+        functional.
+
+-   The `tf.contrib` libraries are not currently included in the PIP package.
+
+-   The following operations are not currently implemented:
+
+    *   `DepthwiseConv2dNative`
+    *   `Digamma`
+    *   `Erf`
+    *   `Erfc`
+    *   `Igamma`
+    *   `Igammac`
+    *   `ImmutableConst`
+    *   `Lgamma`
+    *   `Polygamma`
+    *   `Zeta`
+
+-   Google Cloud Storage support is not currently implemented. The GCS library
     currently depends on `libcurl` and `boringssl`, and the Windows version
     could use standard Windows APIs for making HTTP requests and cryptography
     (for OAuth). Contributions are welcome for this feature.
@@ -112,97 +127,145 @@ We are actively working on improving CMake and Windows support, and addressing
 these limitations. We would appreciate pull requests that implement missing
 ops or APIs.
 
-CMake GUI build (all platforms)
-==================================
-Install from CMake GUI would be a convenient way to generate C++ build projects. The software supports Windows, MacOS and Linux, while the posix platform provides an extra ccmake binary to run command line GUI. Both working principal of cmake, ccmake and cmake-gui are the same, the only difference is by providing suitable interface for project configuration and dependency setting.
-
-0. Pre-buid checklist:
-    The following binary/libraries should be setted in system path, otherwise you need to set manualy via cmake.
-    * Compiler (GCC for Linux, MSVC for Windows)
-    * Make sure compiler directory has been set to system path
-    * CUDA 9.0 (GPU build)
-    * CUDNN (GPU build)
-    * NCCL (GPU build on Linux)
-    * SWIG (python binding) 
-    * Perl (required if you need ssl support, optional)
-    * Go (required if you need ssl support, optional)
-    * NASM/YASM (required by grpc for ssl support, optional)
-1. Start CMake GUI
-2. Click on `Browse Source` and direct to the the folder `<tensorflow-source>/tensorflow/contrib/cmake`
-3. Click on `Browse Build` and spectify a location that you want tensorflow to be build
-4. Click on `Configure`, a new window will be prompted out, specify the generator mode for the project generation. For Windows, choose `Visual Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then press `Finish`. Wait for a moment, the default project dependecy would automatically generate.
-5. There are a few options that you can customize your own build. **The setting here is crucial for a sucessful build, please check all items carefully.**
-    * `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
-    * `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you to test build (optional)
-    * `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't affect tensorflow function, turn it to `off` if you want a slim build. (optional)
-    * `tensorflow_BUILD_PYTHON_BINDING` is default to be `on`. Set to `off` if you don't need python interaface. If SWIG is not in system path, you need set it manually. (optional)
-    * `tensorflow_BUILD_SHARED_LIB` is default to be `off`. Set to `on` if you want the c++ interface. (optional)
-    * `tensorflow_ENABLE_GPU` is default to be `off`. Set to `on` if you want GPU support. It will search CUDA and CUDNN dependecies if you have set them to system path, otherwise CMake would prompt error and request you to set it manually. (optional)
-    * `tensorflow_ENABLE_GRPC_SUPPORT` is default to be `on`. For Linux build, this option must always be `on`. This need to be `on` for a gpu build. Reminded that Perl, Go and NASM/YASM are required for this option if you want to build grpc with offical SSL support.
-    * `tensorflow_ENABLE_POSITION_INDEPENDENT_CODE` should always be `on`
-    * `tensorflow_ENABLE_SNAPPY_SUPPORT` should always be `on` 
-    * `tensorflow_OPTIMIZE_FOR_NATIVE_ARCH` should always be `on`
-    * `CMAKE_INSTALL_PREFIX` is the location where the final package will be installed. You may change it to your own preferred path (optional)
-
-6. After changing the configuration in step 5, press `Configure` again
-7. If not error is found, press `Generate`
+# CMake GUI build (all platforms)
+
+Install from CMake GUI would be a convenient way to generate C++ build projects.
+The software supports Windows, MacOS and Linux, while the posix platform
+provides an extra ccmake binary to run command line GUI. Both working principal
+of cmake, ccmake and cmake-gui are the same, the only difference is by providing
+suitable interface for project configuration and dependency setting.
+
+1.  Pre-buid checklist: The following binary/libraries should be setted in
+    system path, otherwise you need to set manualy via cmake.
+    *   Compiler (GCC for Linux, MSVC for Windows)
+    *   Make sure compiler directory has been set to system path
+    *   CUDA 9.0 (GPU build)
+    *   CUDNN (GPU build)
+    *   NCCL (GPU build on Linux)
+    *   SWIG (python binding)
+    *   Perl (required if you need ssl support, optional)
+    *   Go (required if you need ssl support, optional)
+    *   NASM/YASM (required by grpc for ssl support, optional)
+2.  Start CMake GUI
+3.  Click on `Browse Source` and direct to the the folder
+    `<tensorflow-source>/tensorflow/contrib/cmake`
+4.  Click on `Browse Build` and spectify a location that you want tensorflow to
+    be build
+5.  Click on `Configure`, a new window will be prompted out, specify the
+    generator mode for the project generation. For Windows, choose `Visual
+    Studio <version> <year> Win64`, for Linux, choose `Unix Makefiles`, then
+    press `Finish`. Wait for a moment, the default project dependecy would
+    automatically generate.
+6.  There are a few options that you can customize your own build. **The setting
+    here is crucial for a sucessful build, please check all items carefully.**
+
+    *   `tensorflow_BUILD_ALL_KERNELS` should alway be `on`
+    *   `tensorflow_BUILD_CC_EXAMPLE` is default to be `on`. This can help you
+        to test build (optional)
+    *   `tensorflow_BUILD_CONTRIB_KERNELS` is default to be `on`, but it won't
+        affect tensorflow function, turn it to `off` if you want a slim build.
+        (optional)
+    *   `tensorflow_BUILD_PYTHON_BINDING` is default to be `on`. Set to `off` if
+        you don't need python interaface. If SWIG is not in system path, you
+        need set it manually. (optional)
+    *   `tensorflow_BUILD_SHARED_LIB` is default to be `off`. Set to `on` if you
+        want the c++ interface. (optional)
+    *   `tensorflow_ENABLE_GPU` is default to be `off`. Set to `on` if you want
+        GPU support. It will search CUDA and CUDNN dependecies if you have set
+        them to system path, otherwise CMake would prompt error and request you
+        to set it manually. (optional)
+    *   `tensorflow_ENABLE_GRPC_SUPPORT` is default to be `on`. For Linux build,
+        this option must always be `on`. This need to be `on` for a gpu build.
+        Reminded that Perl, Go and NASM/YASM are required for this option if you
+        want to build grpc with offical SSL support.
+    *   `tensorflow_ENABLE_POSITION_INDEPENDENT_CODE` should always be `on`
+    *   `tensorflow_ENABLE_SNAPPY_SUPPORT` should always be `on`
+    *   `tensorflow_OPTIMIZE_FOR_NATIVE_ARCH` should always be `on`
+    *   `CMAKE_INSTALL_PREFIX` is the location where the final package will be
+        installed. You may change it to your own preferred path (optional)
+
+7.  After changing the configuration in step 5, press `Configure` again
+
+8.  If not error is found, press `Generate`
 
 #### Windows
 
-1. Open `tensorflow.sln` in the build folder (Windows). Change build type from `Debug` to `Release`. Choose `Build`->`Build Solution`. This may take more than hours of compilation. If everything is alright, the output window would show no error.
+1.  Open `tensorflow.sln` in the build folder (Windows). Change build type from
+    `Debug` to `Release`. Choose `Build`->`Build Solution`. This may take more
+    than hours of compilation. If everything is alright, the output window would
+    show no error.
 
     ##### Python
 
-    In solution explorer, right click on `tf_python_build_pip_package` -> `build`. It will generate the wheel file in `<tensorflow-build>/tf_python/dist`. Install with following command:
+    In solution explorer, right click on `tf_python_build_pip_package` ->
+    `build`. It will generate the wheel file in
+    `<tensorflow-build>/tf_python/dist`. Install with following command:
 
-     ```pip install --upgrade tensorflow-<config>.whl```
+    `pip install --upgrade tensorflow-<config>.whl`
 
-    ***The wheel name varies depends on you config. Change to your own wheel filename.***
+    ***The wheel name varies depends on you config. Change to your own wheel
+    filename.***
 
-    Reminded that some pip installation requires administrator right command prompt.
+    Reminded that some pip installation requires administrator right command
+    prompt.
 
     ##### C++
 
-    You can directly use the build folder tree for C++ interface with cmake. If you want to do installation for api releasing, right click on `Install` -> `build`. The headers and library will be installed in the directory specify by `CMAKE_INSTALL_PREFIX` during configuration.
+    You can directly use the build folder tree for C++ interface with cmake. If
+    you want to do installation for api releasing, right click on `Install` ->
+    `build`. The headers and library will be installed in the directory specify
+    by `CMAKE_INSTALL_PREFIX` during configuration.
 
-2. For smaller RAM computer, it is noticed that out of heap space error appears. Change to command prompt build is an alternative to do step 1. 
+1.  For smaller RAM computer, it is noticed that out of heap space error
+    appears. Change to command prompt build is an alternative to do step 1.
 
-    Open `VS2015 x64 Native Tools Command Prompt`. You can open it by press `Start`, then type the binary name. Use `VS2017 x64 Native Tools Command Prompt` if you are using MSVC 2017.
+    Open `VS2015 x64 Native Tools Command Prompt`. You can open it by press
+    `Start`, then type the binary name. Use `VS2017 x64 Native Tools Command
+    Prompt` if you are using MSVC 2017.
 
     ##### Python
 
     Directly build python wheel package by following command:
 
-    ```MSBuild /p:Configuration=Release <path-to-tf_python_build_pip_package.vcxproj>```
+    `MSBuild /p:Configuration=Release
+    <path-to-tf_python_build_pip_package.vcxproj>`
 
-    Remember to change `<path-to-tf_python_build_pip_package.vcxproj>` to the actual path of the file, it can be found at the root of build directory
+    Remember to change `<path-to-tf_python_build_pip_package.vcxproj>` to the
+    actual path of the file, it can be found at the root of build directory
 
     Install the wheel file generated as instructed by step 1.
 
     ##### C++ interface
-    Build from VS native toolchain with following command:
-    ```MSBuild /p:Configuration=Release <path-to-ALL_BUILD.vcxproj>```
 
-    Headers are discretely located in the build folders. Tensorflow library can be found at `<path-to-build>/Release`, namely `tensorflow.dll` and `tensorflow.lib`.
+    Build from VS native toolchain with following command: `MSBuild
+    /p:Configuration=Release <path-to-ALL_BUILD.vcxproj>`
+
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>/Release`, namely `tensorflow.dll` and
+    `tensorflow.lib`.
 
-    * Build to install for api release (optional):
-    ```MSBuild /p:Configuration=Release <path-to-INSTALL.vcxproj>```
+    *   Build to install for api release (optional): `MSBuild
+        /p:Configuration=Release <path-to-INSTALL.vcxproj>`
 
-    Remember to change `<path-to-ALL_BUILD.vcxproj>` and `<path-to-INSTALL.vcxproj>` to the actual path of the file, it can be found at the root of build directory.
+    Remember to change `<path-to-ALL_BUILD.vcxproj>` and
+    `<path-to-INSTALL.vcxproj>` to the actual path of the file, it can be found
+    at the root of build directory.
 
 #### Linux/MacOS (command line GNU build)
 
-1. Open the terminal, change working directory to the one specified in step 3.
+1.  Open the terminal, change working directory to the one specified in step 3.
 
-2. Type the following command:
+2.  Type the following command:
 
-    ```make -sj<number-of-threads> all```
+    `make -sj<number-of-threads> all`
 
     ##### Python
 
-    **Important Note** CMake generated python wheel for Linux/MacOs is currently under development. Please use bazel build.
+    **Important Note** CMake generated python wheel for Linux/MacOs is currently
+    under development. Please use bazel build.
 
-    Follow code is an expected Linux/MacOS python package build after development work is completed.
+    Follow code is an expected Linux/MacOS python package build after
+    development work is completed.
 
     ```
     make -sj<number-of-threads> tf_python_build_pip_package
@@ -212,52 +275,63 @@ Install from CMake GUI would be a convenient way to generate C++ build projects.
 
     ##### C++ interface
 
-    ```make -sj<number-of-threads> install```
+    `make -sj<number-of-threads> install`
 
-    Where `<number-of-threads>` is the threads used for the compilation, change to any integer less or equal to your computer's maxiumum thread number.
+    Where `<number-of-threads>` is the threads used for the compilation, change
+    to any integer less or equal to your computer's maxiumum thread number.
 
-     Headers are discretely located in the build folders. Tensorflow library can be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or `tensorflow.dylib` (MacOS).
+    Headers are discretely located in the build folders. Tensorflow library can
+    be found at `<path-to-build>`, namely `tensorflow.so` (Linux) or
+    `tensorflow.dylib` (MacOS).
 
 #### Start a Tensorflow C++ project with CMake
-Here we assume that you have basic knowledge on gathering dependency with `CMakeLists.txt`. Here we introduce how the C++ api works with [official hello world tutorial](https://www.tensorflow.org/api_guides/cc/guide).
 
-1. Create a new working directory and create a new text file named `CMakeLists.txt` and the c++ file `main.cxx`
-2. Fill in the `main.cxx` with the code provided in [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
-3. Fill in the `CMakeLists.txt` with following code:
-    ``` cmake
-    cmake_minimum_required (VERSION 2.6)
-    project (tf_hello)
+Here we assume that you have basic knowledge on gathering dependency with
+`CMakeLists.txt`. Here we introduce how the C++ api works with
+[official hello world tutorial](https://www.tensorflow.org/api_guides/cc/guide).
+
+1.  Create a new working directory and create a new text file named
+    `CMakeLists.txt` and the c++ file `main.cxx`
+2.  Fill in the `main.cxx` with the code provided in
+    [official c++ api basic](https://www.tensorflow.org/api_guides/cc/guide).
+3.  Fill in the `CMakeLists.txt` with following code: ``` cmake
+    cmake_minimum_required (VERSION 2.6) project (tf_hello)
 
     # Tensorflow
+
     find_package(Tensorflow REQUIRED)
     include_directories(${TENSORFLOW_INCLUDE_DIRS})
 
     # compiler setting required by tensorflow, to be tested on all compilers
+
     # currently only tested on MSVC and GCC
-    if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) 
-      add_definitions(-DCOMPILER_MSVC)
-    elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU)
-      if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "3")
-        add_definitions(-DCOMPILER_GCC3)
-      else()
-        add_definitions(-D__GNUC__)
-      endif()
-    else()
-      message(ERROR " compiler ${CMAKE_CXX_COMPILER_ID} not supported by this CMakeList.txt, under development")
-    endif()
-
-    add_executable(tf_hello main.cxx)
-    target_link_libraries(tf_hello ${TENSORFLOW_LIBRARIES})
-    ```
-4. Configure the folder with cmake-gui, an error should be prompted out, requesting you to locate the folder containing `TensorflowConfig.cmake`. This file can be found at `<tensorflow-build>` or `<tensorflow-intall>` (for those have build install in previous steps).
 
-5. Configure again, generate the project.
-6. Compile the project with `Release` config (Windows). For Linux users, just compile the project.
-7. Copy the `tensorflow.dll`(Windows)/`tensorflow.so`(Linux) from build directory to the build folder containing `tf_hello` binary.
-8. Run `tf_hello` binary
+    if (${CMAKE_CXX_COMPILER_ID} STREQUAL MSVC) add_definitions(-DCOMPILER_MSVC)
+    elseif (${CMAKE_CXX_COMPILER_ID} STREQUAL GNU) if
+    (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS "3")
+    add_definitions(-DCOMPILER_GCC3) else() add_definitions(-D__GNUC__) endif()
+    else() message(ERROR " compiler ${CMAKE_CXX_COMPILER_ID} not supported by
+    this CMakeList.txt, under development") endif()
+
+    add_executable(tf_hello main.cxx) target_link_libraries(tf_hello
+    ${TENSORFLOW_LIBRARIES}) ```
+
+4.  Configure the folder with cmake-gui, an error should be prompted out,
+    requesting you to locate the folder containing `TensorflowConfig.cmake`.
+    This file can be found at `<tensorflow-build>` or `<tensorflow-intall>` (for
+    those have build install in previous steps).
+
+5.  Configure again, generate the project.
+
+6.  Compile the project with `Release` config (Windows). For Linux users, just
+    compile the project.
+
+7.  Copy the `tensorflow.dll`(Windows)/`tensorflow.so`(Linux) from build
+    directory to the build folder containing `tf_hello` binary.
+
+8.  Run `tf_hello` binary
 
-Step-by-step Windows build (command prompt)
-==========================
+# Step-by-step Windows build (command prompt)
 
 1.  Install the prerequisites detailed above, and set up your environment.
 
@@ -443,4 +517,4 @@ $ cd tensorflow
 $ tensorflow/tools/ci_build/ci_build.sh CMAKE tensorflow/tools/ci_build/builds/cmake.sh
 ```
 
-That's it. Dependencies included.
\ No newline at end of file
+That's it. Dependencies included.
diff --git a/tensorflow/contrib/cmake/external/abseil_cpp.cmake b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
index eefa7d3f039295ab595b4233fab51e7733dd6236..b85fd48f0f34df93d9eaa31251ebe05c78b34a9e 100644
--- a/tensorflow/contrib/cmake/external/abseil_cpp.cmake
+++ b/tensorflow/contrib/cmake/external/abseil_cpp.cmake
@@ -31,8 +31,8 @@ if (systemlib_ABSEIL_CPP)
   message(STATUS "  abseil_cpp includes: ${ABSEIL_CPP_INCLUDE_DIR}")
   message(STATUS "  abseil_cpp libraries: ${ABSEIL_CPP_LIBRARIES}")
 
-  add_custom_target(abseil_cpp_build)
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
+  add_custom_target(abseil_cpp)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
 
 else (systemlib_ABSEIL_CPP)
 
@@ -79,14 +79,11 @@ else (systemlib_ABSEIL_CPP)
         ${abseil_cpp_BUILD}/absl/types/libabsl_bad_optional_access.a)
   endif()
 
-  ExternalProject_Add(abseil_cpp_build
+  ExternalProject_Add(abseil_cpp
       PREFIX abseil_cpp
       GIT_REPOSITORY ${abseil_cpp_URL}
       DOWNLOAD_DIR "${DOWNLOAD_LOCATION}"
-      BUILD_IN_SOURCE 1
       BUILD_BYPRODUCTS ${abseil_cpp_STATIC_LIBRARIES}
-      BUILD_COMMAND ${CMAKE_COMMAND} --build . --config Release
-      COMMAND ${CMAKE_COMMAND} --build . --config Release
       INSTALL_COMMAND ""
       CMAKE_CACHE_ARGS
           -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=${tensorflow_ENABLE_POSITION_INDEPENDENT_CODE}
@@ -99,6 +96,6 @@ else (systemlib_ABSEIL_CPP)
 
   list(APPEND tensorflow_EXTERNAL_LIBRARIES ${abseil_cpp_STATIC_LIBRARIES})
 
-  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp_build)
+  list(APPEND tensorflow_EXTERNAL_DEPENDENCIES abseil_cpp)
 
 endif (systemlib_ABSEIL_CPP)
\ No newline at end of file
diff --git a/tensorflow/contrib/cmake/external/grpc.cmake b/tensorflow/contrib/cmake/external/grpc.cmake
index eca4f3c8c8866ff60c4ee8332a2baaa972fe3b83..e570c09ecb5e64130ed6f3375a51d74850cc3989 100644
--- a/tensorflow/contrib/cmake/external/grpc.cmake
+++ b/tensorflow/contrib/cmake/external/grpc.cmake
@@ -17,7 +17,7 @@ include (ExternalProject)
 set(GRPC_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc/include)
 set(GRPC_URL https://github.com/grpc/grpc.git)
 set(GRPC_BUILD ${CMAKE_CURRENT_BINARY_DIR}/grpc/src/grpc)
-set(GRPC_TAG d184fa229d75d336aedea0041bd59cb93e7e267f)
+set(GRPC_TAG 69b6c047bc767b4d80e7af4d00ccb7c45b683dae)
 
 if(WIN32)
   # We use unsecure gRPC because boringssl does not build on windows
diff --git a/tensorflow/contrib/cmake/tf_shared_lib.cmake b/tensorflow/contrib/cmake/tf_shared_lib.cmake
index 48dbfb92e65b0ed456846f83ddd5eed4d74dfe67..62005dd113bfb80fbdf23afb6d4aa5f90a1e32de 100644
--- a/tensorflow/contrib/cmake/tf_shared_lib.cmake
+++ b/tensorflow/contrib/cmake/tf_shared_lib.cmake
@@ -213,6 +213,10 @@ install(DIRECTORY ${tensorflow_source_dir}/third_party/eigen3/
 # unsupported Eigen directory
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/eigen/src/eigen/unsupported/Eigen/
         DESTINATION include/unsupported/Eigen)
+# absl directory
+install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/abseil_cpp/src/abseil_cpp/absl/
+        DESTINATION include/absl
+        FILES_MATCHING PATTERN "*.h")
 # mkl
 if (tensorflow_ENABLE_MKL_SUPPORT)
     install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/mkl/src/mkl/include/
diff --git a/tensorflow/contrib/compiler/BUILD b/tensorflow/contrib/compiler/BUILD
index 1630f010ab60db258b976c7bddc22ff78dccf890..e4566437c60ebb2da039e61c171fbe954a7355c9 100644
--- a/tensorflow/contrib/compiler/BUILD
+++ b/tensorflow/contrib/compiler/BUILD
@@ -58,6 +58,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/compiler/jit:xla_ops_py",
+        "//tensorflow/compiler/jit/ops:xla_ops_grad",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
diff --git a/tensorflow/contrib/compiler/xla.py b/tensorflow/contrib/compiler/xla.py
index 335ac7946485f234d1af3d180283fc8daac50005..f867cd15b67dbd43650d8012b4299845af7200a8 100644
--- a/tensorflow/contrib/compiler/xla.py
+++ b/tensorflow/contrib/compiler/xla.py
@@ -23,6 +23,7 @@ import contextlib
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.jit.ops import xla_ops
+from tensorflow.compiler.jit.ops import xla_ops_grad  # pylint: disable=unused-import
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
index 41258edd90866ae9f644a02c42dfe2dc589da998..6926c0d03fe38ab2d62cc588950c7f5a49b2aba1 100644
--- a/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
+++ b/tensorflow/contrib/constrained_optimization/python/constrained_minimization_problem.py
@@ -74,8 +74,8 @@ class ConstrainedMinimizationProblem(object):
 
     if (constraints_shape.ndims is None or
         proxy_constraints_shape.ndims is None or
-        any([ii is None for ii in constraints_shape.as_list()]) or
-        any([ii is None for ii in proxy_constraints_shape.as_list()])):
+        any(ii is None for ii in constraints_shape.as_list()) or
+        any(ii is None for ii in proxy_constraints_shape.as_list())):
       raise ValueError(
           "constraints and proxy_constraints must have fully-known shapes")
     if constraints_shape != proxy_constraints_shape:
diff --git a/tensorflow/contrib/crf/python/ops/crf.py b/tensorflow/contrib/crf/python/ops/crf.py
index 656633f0bf21a4d46cb85547241ef0fd42807ed6..40e159b8fcbd1864284e208cb15d9ed96119f840 100644
--- a/tensorflow/contrib/crf/python/ops/crf.py
+++ b/tensorflow/contrib/crf/python/ops/crf.py
@@ -38,12 +38,12 @@ tf_unary_scores, tf_sequence_lengths, tf_transition_params, _ = session.run(
     [unary_scores, sequence_lengths, transition_params, train_op])
 for tf_unary_scores_, tf_sequence_length_ in zip(tf_unary_scores,
                                                  tf_sequence_lengths):
-# Remove padding.
-tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
+    # Remove padding.
+    tf_unary_scores_ = tf_unary_scores_[:tf_sequence_length_]
 
-# Compute the highest score and its tag sequence.
-tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
-    tf_unary_scores_, tf_transition_params)
+    # Compute the highest score and its tag sequence.
+    tf_viterbi_sequence, tf_viterbi_score = tf.contrib.crf.viterbi_decode(
+        tf_unary_scores_, tf_transition_params)
 """
 
 from __future__ import absolute_import
diff --git a/tensorflow/contrib/cudnn_rnn/BUILD b/tensorflow/contrib/cudnn_rnn/BUILD
index 670b54943277806c47bfd6c6bc9b345db4bb1448..8d35622e393e15a2f2dfea7c75ad2c9f48aa7150 100644
--- a/tensorflow/contrib/cudnn_rnn/BUILD
+++ b/tensorflow/contrib/cudnn_rnn/BUILD
@@ -42,10 +42,11 @@ tf_custom_op_py_library(
 
 cuda_py_test(
     name = "cudnn_rnn_ops_test",
-    size = "large",
+    size = "medium",
     srcs = ["python/kernel_tests/cudnn_rnn_ops_test.py"],
     additional_deps = [
         ":cudnn_rnn_py",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/contrib/rnn:rnn_py",
         "//tensorflow/python/ops/losses:losses",
@@ -61,7 +62,7 @@ cuda_py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
     ],
-    shard_count = 6,
+    shard_count = 2,
     tags = [
         "noasan",  # http://b/62067814
         "requires-gpu-sm35",
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
index ae839108ebec31b70b687e5ff3e99c7d5a9b560e..a268415f0e65206294431a537be18cadbe1a1e84 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_ops_test.py
@@ -18,24 +18,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import itertools
 import os
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.core.protobuf import saver_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import rnn
+from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
@@ -56,714 +62,989 @@ CUDNN_RNN_TANH_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_TANH_PARAMS_PER_LAYER
 CUDNN_RNN_RELU_PARAMS_PER_LAYER = cudnn_rnn_ops.CUDNN_RNN_RELU_PARAMS_PER_LAYER
 
 
-def _CreateModel(rnn_mode,
-                 num_layers,
-                 num_units,
-                 input_size,
-                 input_mode="linear_input",
-                 direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-                 dtype=dtypes.float32,
-                 dropout=0.):
-  del input_mode
-  if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM:
-    model_fn = cudnn_rnn_ops.CudnnLSTM
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU:
-    model_fn = cudnn_rnn_ops.CudnnGRU
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH:
-    model_fn = cudnn_rnn_ops.CudnnRNNTanh
-  elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU:
-    model_fn = cudnn_rnn_ops.CudnnRNNRelu
+def RunLSTM(sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers=1,
+            is_training=True,
+            dropout=0.,
+            num_dirs=True,
+            dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_c_op = variable_scope.get_variable(
+      "initial_c_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+
+  with variable_scope.variable_scope("test", initializer=initializer):
+    w = variable_scope.get_variable(
+        "rnn/lstm_cell/kernel",
+        shape=[input_size + num_units, num_units * 4],
+        dtype=dtype)
+    b = variable_scope.get_variable(
+        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)
+
+    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
+    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
+    outputs_op, state_tuple_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=rnn_cell_impl.LSTMStateTuple(
+            h=initial_h_op, c=initial_c_op),
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque([w, b])
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_initial_c_op = array_ops.expand_dims(initial_c_op, axis=0)
+  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      cu_initial_c_op,
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
+  # Remove the trivial 1st dimension.
+  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
+      c=array_ops.squeeze(cu_c_op, axis=0),
+      h=array_ops.squeeze(cu_h_op, axis=0))
+
+  if is_training:
+    (inp_grad_op, hgrad_op,
+     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op, initial_c_op, w, b])
+
+    (cu_inp_grad_op, cu_hgrad_op,
+     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
+         cu_outputs_op,
+         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+    # Remove the trivial 1st dimension
+    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    cu_wgrad_op = cu_wgrad_op[0]
+    cu_bgrad_op = cu_bgrad_op[0]
+    # cudnn lstm has 2 biases each gate. When converting to tf canonical format,
+    # the two biases are summed into one. Thus here bias gradient should be
+    # halved when comparing with tf lstm.
+    cu_bgrad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
+        outputs_op, state_tuple_op, inp_grad_op,
+        (hgrad_op, cgrad_op), wgrad_op, bgrad_op
+    ])
+    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
+     cu_bgrad) = sess.run([
+         cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
+         (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
+     ])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "state_grad: %s" % str(state_grad))
+    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
+            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
+            cu_bgrad)
   else:
-    raise ValueError("Invalid rnn_mode: %s" % rnn_mode)
-  return model_fn(
-      num_layers,
-      num_units,
-      input_size,
-      direction=direction,
-      dtype=dtype,
-      dropout=dropout)
-
-
-def _CreateParamsSavable(params,
-                         model,
-                         base_variable_scope=None,
-                         name="params_canonical"):
-  """Create a RNNParamsSaveable for the weight and bias parameters.
+    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
+    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op])
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
+    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
+  return outputs, cu_outputs, state_tuple, cu_state_tuple
+
+
+# Basic set of RNN configs to test. They can be further extended in relevant
+# test (e.g. adding num_dirs).
+NAMED_RNN_TESTCASES = ({
+    "testcase_name": "xsmall",
+    "num_units": 1,
+    "input_size": 1,
+    "batch_size": 1,
+    "time": 1,
+    "num_layers": 1,
+}, {
+    "testcase_name": "small",
+    "num_units": 4,
+    "input_size": 4,
+    "batch_size": 4,
+    "time": 4,
+    "num_layers": 1,
+}, {
+    "testcase_name": "medium",
+    "num_units": 128,
+    "input_size": 64,
+    "batch_size": 8,
+    "time": 16,
+    "num_layers": 1,
+}, {
+    "testcase_name": "large",
+    "num_units": 128,
+    "input_size": 128,
+    "batch_size": 16,
+    "time": 32,
+    "num_layers": 1,
+})
+
+
+def ExpandNamedTestCases(inputs, *remove_keys, **extra_configs):
+  """Expands testcase with new config dimensions.
+
+  Example:
+    inputs = (
+      {'testcase_name': 'test1', 'gender': 'male'}
+      {'testcase_name': 'test2', 'gender': 'female'}
+    )
+    remove_keys:  empty
+    extra_configs = {
+      'age': [40, 80]
+      'height': [5, 6]
+    }
+
+    Returns:
+      (
+        {'testcase_name': 'test1_age_40_height_5','gender': 'male', 'age':
+        40,'height': 5}
+        {'testcase_name': 'test1_age_40_height_6', 'gender': 'male', 'age': 40,
+        'height': 6}
+        {'testcase_name': 'test1_age_80_height_5', 'gender': 'male', 'age': 80,
+        'height': 5}
+        {'testcase_name': 'test1_age_80_height_6', 'gender': 'male', 'age': 80,
+        'height': 6}
+
+        {'testcase_name': 'test2_age_40_height_5', 'gender': 'female', 'age':
+        40,
+        'height': 5}
+        {'testcase_name': 'test2_age_40_height_6', 'gender': 'female', 'age':
+        40,
+        'height': 6}
+        {'testcase_name': 'test2_age_80_height_5', 'gender': 'female', 'age':
+        80,
+        'height': 5}
+        {'testcase_name': 'test2_age_80_height_6', 'gender': 'female', 'age':
+        80,
+        'height': 6}
+      )
 
   Args:
-    params: a Variable for weight and bias parameters.
-    model: a CudnnRNN model.
-    base_variable_scope: a string, prefix of names of saved variables.
-    name: a string, name of the RNNParamsSaveable object.
+    inputs: A list of dictionary, each being a testcase.
+    *remove_keys: A list of keys into testcase which are not needed in new
+      testcases.
+    **extra_configs: A dict of new test dimension and applicable values in that
+      dimension.
+
   Returns:
-    a RNNParamsSaveable object.
+    A list of dictionary with expanded test cases.
   """
-  if model._rnn_mode == CUDNN_LSTM:
-    fn = cudnn_rnn_ops.CudnnLSTMSaveable
-  elif model._rnn_mode == CUDNN_GRU:
-    fn = cudnn_rnn_ops.CudnnGRUSaveable
-  elif model._rnn_mode == CUDNN_RNN_TANH:
-    fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
-  elif model._rnn_mode == CUDNN_RNN_RELU:
-    fn = cudnn_rnn_ops.CudnnRNNReluSaveable
-  params_saveable = fn(
-      params,
-      model.num_layers,
-      model.num_units,
-      model.input_size,
-      model.input_mode,
-      model.direction,
-      scope=base_variable_scope,
-      name=name)
-  ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable)
-  return params_saveable
-
-
-def _MinLSTMParamSize(num_layers,
-                      num_units,
-                      input_size,
-                      direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION):
-  if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units
-    all_biases = 8 * num_layers * num_units
-    return first_layer_weights + higher_layer_weights + all_biases
-  elif direction == cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION:
-    first_layer_weights = 4 * num_units * (num_units + input_size)
-    higher_layer_weights = (num_layers - 1) * (
-        4 * 2 * num_units * num_units + 4 * num_units**2)
-    all_biases = 8 * num_layers * num_units
-    return 2 * (first_layer_weights + higher_layer_weights + all_biases)
-  else:
-    raise ValueError("%s direction is not supported.")
+  res = []
+  ordered_extra_configs = collections.OrderedDict(extra_configs)
+  keys = ordered_extra_configs.keys()
+  # A list of list of configs.
+  # The outer loop is iterating keys, the innner is values of one key.
+  combined_kv = [[(k, v) for v in ordered_extra_configs[k]] for k in keys]
+  logging.info("combined_kv: %s", combined_kv)
 
+  for inp in inputs:
+    # Each inp is a dict
+    for config in itertools.product(*combined_kv):
+      new_inp = dict(inp)
+      # config is a list in the form of [(k_i, v_j), (k_p, v_q), ...]
+      suffix = ["%s_%s" % (p[0], str(p[1])) for p in config]
+      suffix = "_".join(suffix)
+      new_inp["testcase_name"] += "_" + suffix
+      for k, v in config:
+        new_inp[k] = v
+      # Remove not used keys from the new test case.
+      if remove_keys:
+        if not isinstance(remove_keys, (list, tuple)):
+          remove_keys = [remove_keys]
+        for k in remove_keys:
+          new_inp.pop(k, None)
+      logging.info("new_inp: %s", new_inp)
+      res.append(new_inp)
+  # Dedup, necessary if `remove_keys` is set.
+  return [dict(t) for t in {tuple(d.items()) for d in res}]
 
-class CudnnRNNTestSaveRestore(TensorFlowTestCase):
 
-  def _CompareWeights(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-    for lw, rw in zip(lhs, rhs):
-      self.assertAllEqual(lw, rw)
+class CudnnLSTMTest(TensorFlowTestCase, parameterized.TestCase):
 
-  def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction):
-    self.assertEqual(len(lhs), len(rhs))
-    if rnn_mode == CUDNN_LSTM:
-      num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_GRU:
-      num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER
-    elif rnn_mode == CUDNN_RNN_TANH:
-      num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER
-    else:
-      num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER
-    num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2
-    num_params_per_layer *= num_dirs
-    self.assertEqual(num_params_per_layer * num_layers, len(lhs))
-
-    for i in range(num_layers):
-      layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer]
-      if direction == CUDNN_RNN_UNIDIRECTION:
-        self._CompareSingleLayerBiases(layer_lhs, layer_rhs)
-      else:
-        size = len(layer_lhs)
-        fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:]
-        fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:]
-        self._CompareSingleLayerBiases(fw_lhs, fw_rhs)
-        self._CompareSingleLayerBiases(bw_lhs, bw_rhs)
-
-  def _CompareSingleLayerBiases(self, lhs, rhs):
-    self.assertEqual(len(lhs), len(rhs))
-
-    lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:]
-    lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:]
-    self.assertEqual(len(lf_lhs), len(rt_lhs))
-    self.assertEqual(len(lf_rhs), len(rt_rhs))
-
-    sum_lhs, sum_rhs = [], []
-    for lf, rt in zip(lf_lhs, rt_lhs):
-      sum_lhs.append(lf + rt)
-    for lf, rt in zip(lf_rhs, rt_rhs):
-      sum_rhs.append(lf + rt)
-    self.assertEqual(len(sum_lhs), len(sum_rhs))
-    for lf, rt in zip(sum_lhs, sum_rhs):
-      self.assertAllEqual(lf, rt)
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad, cu_inp_grad,
+       state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunLSTM(
+           sess, num_units, input_size, batch_size, time, num_layers)
 
-  def _testSaveRestoreVariable(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      params = variables.VariableV1(
-          random_ops.random_uniform([params_size_t], dtype=dtype),
-          dtype=dtype,
-          validate_shape=False)
-      saveable = _CreateParamsSavable(params, model)
-      weights, biases = saveable.format_converter._opaque_to_cu_canonical(
-          saveable._variables)
-      reset_params = state_ops.assign(
-          params,
-          array_ops.zeros([params_size_t], dtype=dtype),
-          validate_shape=False)
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      for s, cu_s in zip(state_tuple, cu_state_tuple):
+        self.assertAllClose(s, cu_s, rtol=rtol, atol=atol)
+      for sg, cu_sg in zip(state_grad, cu_state_grad):
+        self.assertAllClose(sg, cu_sg, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      self.assertAllClose(bgrad, cu_bgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(wgrad, cu_wgrad, rtol=rtol, atol=atol)
 
-        weights_v, biases_v = sess.run([weights, biases])
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
 
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights_v_restored, biases_v_restored = sess.run([weights, biases])
-
-        self._CompareWeights(weights_v, weights_v_restored)
-        self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreTwoVariables(self, rnn_mode, direction, dtype):
-    num_layers = 2
-    num_units = 7
-    input_size = 3
-    with ops.Graph().as_default():
-      model = _CreateModel(
-          rnn_mode,
-          num_layers=num_layers,
-          num_units=num_units,
-          input_size=input_size,
-          direction=direction,
-          dtype=dtype)
-      random_seed.set_random_seed(1234)
-      params_size_t = model.params_size()
-      names = ["rnn_1", "rnn_2"]
-      param_vars = [
-          variables.VariableV1(
-              random_ops.random_uniform([params_size_t], dtype=dtype),
-              dtype=dtype,
-              validate_shape=False) for name in names
-      ]
-      saveables = []
-      for name, params in zip(names, param_vars):
-        saveables.append(_CreateParamsSavable(params, model, name, name))
-      weights1, biases1 = saveables[0].format_converter._opaque_to_cu_canonical(
-          saveables[0]._variables)
-      weights2, biases2 = saveables[1].format_converter._opaque_to_cu_canonical(
-          saveables[1]._variables)
-      reset_params = [
-          state_ops.assign(
-              params,
-              array_ops.zeros([params_size_t], dtype=dtype),
-              validate_shape=False) for params in param_vars
-      ]
-      save_path = os.path.join(self.get_temp_dir(),
-                               "save-restore-variable-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(use_gpu=True,
-                             graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-        weights1_v, biases1_v = sess.run([weights1, biases1])
-        weights2_v, biases2_v = sess.run([weights2, biases2])
-
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1])
-        weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2])
-
-        self._CompareWeights(weights1_v, weights1_v_restored)
-        self._CompareWeights(weights2_v, weights2_v_restored)
-        self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers,
-                            direction)
-        self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers,
-                            direction)
-
-  def _testSaveRestoreOutput(self, rnn_mode, direction, dtype):
-    with ops.Graph().as_default():
-      num_layers = 2
-      num_units = 7
-      input_size = 7
-      seq_length = 10
-      batch_size = 5
-      dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
-      model = _CreateModel(
-          rnn_mode,
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
+        num_units,
+        input_size,
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
           num_layers,
+          is_training=False)
+
+      self.assertAllClose(outputs, cu_outputs)
+      # h
+      self.assertAllClose(state_tuple.h, cu_state_tuple.h)
+      # c
+      self.assertAllClose(state_tuple.c, cu_state_tuple.c)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, state_tuple, cu_state_tuple) = RunLSTM(
+          sess,
           num_units,
           input_size,
-          direction=direction,
-          dtype=dtype)
-      params_size_t = model.params_size()
-      params = variables.VariableV1(
-          array_ops.ones([params_size_t], dtype=dtype),
-          validate_shape=False,
-          dtype=dtype)
-      _CreateParamsSavable(params, model)
-      save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test")
-      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
 
-      np.random.seed(1234)
-      has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-      input_data = constant_op.constant(
-          np.random.randn(seq_length, batch_size, input_size), dtype=dtype)
-      input_h = constant_op.constant(
-          np.random.randn(num_layers * dir_count, batch_size, num_units),
-          dtype=dtype)
-      if has_input_c:
-        input_c = constant_op.constant(
-            np.random.randn(num_layers * dir_count, batch_size, num_units),
-            dtype=dtype)
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            input_c=input_c,
-            params=params,
-            is_training=False)
-      else:
-        outputs = model(
-            input_data=input_data,
-            input_h=input_h,
-            params=params,
-            is_training=False)
-      total_sum = sum(map(math_ops.reduce_sum, outputs))
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        sess.run(variables.global_variables_initializer())
-        total_sum_v = sess.run(total_sum)
-        val = saver.save(sess, save_path)
-        self.assertEqual(save_path, val)
-      # Passing graph explicitly, otherwise an old sess would be reused.
-      with self.test_session(
-          use_gpu=True, graph=ops.get_default_graph()) as sess:
-        reset_params = state_ops.assign(
-            params,
-            array_ops.zeros([params_size_t], dtype=dtype),
-            validate_shape=False)
-        sess.run(reset_params)
-        saver.restore(sess, save_path)
-        total_sum_v_restored = sess.run(total_sum)
-        self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5)
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      # h
+      self.assertAllClose(
+          state_tuple.h, cu_state_tuple.h, rtol=rtol, atol=atol)
+      # c
+      self.assertAllClose(
+          state_tuple.c, cu_state_tuple.c, rtol=rtol, atol=atol)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSaveRestore(self):
-    rnn_modes = [
-        cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU,
-        cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    dtype_list = [dtypes.float32, dtypes.float64]
-    for rnn_mode, direction, dtype in itertools.product(rnn_modes, directions,
-                                                        dtype_list):
-      self._testSaveRestoreVariable(rnn_mode, direction, dtype)
-      self._testSaveRestoreTwoVariables(rnn_mode, direction, dtype)
-      self._testSaveRestoreOutput(rnn_mode, direction, dtype)
-
-
-class CudnnRNNTestParamsSize(TensorFlowTestCase):
-
-  def _testOneLSTMParamsSize(self, num_layers, num_units, input_size,
-                             direction):
-    logging.info("Testing one lstm param size with config: %s", locals())
-    min_params_size = _MinLSTMParamSize(num_layers, num_units, input_size,
-                                        direction)
-    model = _CreateModel(
-        cudnn_rnn_ops.CUDNN_LSTM,
-        num_layers,
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    # Hand-picked dropouts are used below (0. and 1.)
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_state_tuple) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_state_tuple2) = RunLSTM(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    # h
+    self.assertAllClose(cu_state_tuple.h, cu_state_tuple2.h)
+    # c
+    self.assertAllClose(cu_state_tuple.c, cu_state_tuple2.c)
+
+
+def RunGRU(sess,
+           num_units,
+           input_size,
+           batch_size,
+           time,
+           num_layers=1,
+           is_training=True,
+           dropout=0.,
+           num_dirs=True,
+           dtype=dtypes.float32):
+  # TODO(jamesqin): add multi-layer tests.
+  # TODO(jamesqin): add multi-dir tests
+  assert num_layers == 1
+  assert num_dirs == 1
+  if is_training and not np.isclose(dropout, 0):
+    raise ValueError("dropout can not be 0. when test training.")
+
+  # set graph level random seed and numpy random seed.
+  random_seed.set_random_seed(0)
+  np.random.seed(0)
+
+  inputs = variable_scope.get_variable(
+      "inputs",
+      initializer=np.random.rand(time, batch_size,
+                                 input_size).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+  initial_h_op = variable_scope.get_variable(
+      "initial_h_op",
+      initializer=np.random.rand(batch_size,
+                                 num_units).astype(dtype.as_numpy_dtype),
+      dtype=dtype)
+
+  initializer = init_ops.random_uniform_initializer(
+      -0.01, 0.01, dtype=dtype, seed=19980904)
+  with variable_scope.variable_scope("test", initializer=initializer):
+    gate_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/kernel",
+        shape=[input_size + num_units, num_units * 2],
+        dtype=dtype)
+    gate_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/gates/bias",
+        shape=[num_units * 2],
+        dtype=dtype)
+    candidate_inp_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
+        shape=[input_size, num_units],
+        dtype=dtype)
+    candidate_inp_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+    candidate_hid_kernel = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
+        shape=[num_units, num_units],
+        dtype=dtype)
+    candidate_hid_bias = variable_scope.get_variable(
+        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
+        shape=[num_units],
+        dtype=dtype)
+
+    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
+    outputs_op, h_op = rnn.dynamic_rnn(
+        cell,
+        inputs,
+        initial_state=initial_h_op,
+        dtype=dtype,
+        time_major=True,
+        scope=None)
+
+  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
+  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
+  # Convert to cudnn opaque param.
+  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+      num_layers, num_units, input_size)
+  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+
+  cu_initial_h_op = array_ops.expand_dims(initial_h_op, axis=0)
+  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
+      inputs,
+      cu_initial_h_op,
+      array_ops.zeros_like(cu_initial_h_op),  # not used
+      opaque_params,
+      dropout=dropout,
+      is_training=is_training,
+      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)
+
+  if is_training:
+    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
+     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
+         outputs_op, [inputs, initial_h_op] + ws + bs)
+
+    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
+        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
+    # Remove the trivial 1st dimension
+    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0)
+
+    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
+        opaque_grad_op)
+    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
+    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
+    # cudnn gru has 2 biases for reset and update gates. When converting to tf
+    # canonical format, the two biases are summed into one.  Thus here relevant
+    # bias gradient should be halved before comparing with tf gru.
+    cu_gb_grad_op *= 0.5
+
+  init_op = variables.global_variables_initializer()
+  sess.run(init_op)
+
+  if is_training:
+    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
+        outputs_op, h_op, inp_grad_op, hgrad_op,
+        (gk_grad_op, cik_grad_op, chk_grad_op),
+        (gb_grad_op, cib_grad_op, chb_grad_op)
+    ])
+    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run([
+        cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
+        (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
+        (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
+    ])
+    # Remove the trivial 1st dimension
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+    logging.vlog(1, "inp_grad: %s" % inp_grad)
+    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
+    logging.vlog(1, "hgrad: %s" % hgrad)
+    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
+    logging.vlog(1, "wgrad: %s" % str(wgrad))
+    logging.vlog(1, "bgrad: %s" % str(bgrad))
+    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
+    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
+    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
+  else:
+    outputs, h = sess.run([outputs_op, h_op])
+    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op])
+    # Remove the trivial 1st dimension.
+    cu_h = np.squeeze(cu_h, axis=0)
+
+    logging.vlog(1, "outputs: %s" % outputs)
+    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
+    logging.vlog(1, "h: %s" % h)
+    logging.vlog(1, "cu_h: %s" % h)
+  return outputs, cu_outputs, h, cu_h
+
+
+class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase):
+
+  def _test_training_helper(self,
+                            num_units,
+                            input_size,
+                            batch_size,
+                            time,
+                            num_layers,
+                            dtype,
+                            rtol=2e-6,
+                            atol=2e-6):
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
+       cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU(
+           sess, num_units, input_size, batch_size, time, num_layers)
+
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+      self.assertAllClose(hgrad, cu_hgrad, rtol=rtol, atol=atol)
+      self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol)
+      for bg, cu_bg in zip(bgrad, cu_bgrad):
+        self.assertAllClose(bg, cu_bg, rtol=rtol, atol=atol)
+      for wg, cu_wg in zip(wgrad, cu_wgrad):
+        self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(num_units, input_size, batch_size, time,
+                               num_layers, dtypes.float32)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_training_fp16(self, num_units, input_size, batch_size, time,
+                         num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_training_helper(
         num_units,
         input_size,
-        direction=direction)
-    params_size = model.params_size()
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size)
-      self.assertLessEqual(min_params_size, params_size_v)
+        batch_size,
+        time,
+        num_layers,
+        dtypes.float16,
+        rtol=5e-3,
+        atol=5e-4)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSize(self):
-    test_configs = [
-        [4, 200, 200],
-        [4, 200, 300],
-        [4, 200, 100],
-        [1, 100, 200],
-        [2, 200, 100],
-        [3, 200, 400],
-    ]
-    directions = [
-        cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
-        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION
-    ]
-    for (config, direction) in itertools.product(test_configs, directions):
-      num_layers, num_units, input_size = config
-      with ops.Graph().as_default():
-        self._testOneLSTMParamsSize(num_layers, num_units, input_size,
-                                    direction)
+  def test_inference(self, num_units, input_size, batch_size, time, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False)
+      self.assertAllClose(outputs, cu_outputs)
+      self.assertAllClose(h, cu_h)
 
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testLSTMParamsSizeShape(self):
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          constant_op.constant([4]), 200, 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      _ = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
-          cudnn_rnn_ops.CUDNN_LSTM,
-          4, constant_op.constant([200]), 200,
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      _ = model.params_size()
-    with self.assertRaisesRegexp(
-        ValueError, "Shape must be rank 0 but is rank 1"):
-      model = _CreateModel(
+  def test_inference_fp16(self, num_units, input_size, batch_size, time,
+                          num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      (outputs, cu_outputs, h, cu_h) = RunGRU(
+          sess,
+          num_units,
+          input_size,
+          batch_size,
+          time,
+          num_layers,
+          is_training=False,
+          dtype=dtypes.float16)
+
+      rtol, atol = 5e-3, 5e-4
+      self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol)
+      self.assertAllClose(h, cu_h, rtol=rtol, atol=atol)
+
+  @parameterized.named_parameters(*NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_inference_with_dropout(self, num_units, input_size, batch_size, time,
+                                  num_layers):
+    """Validates that dropout does not affect Cudnn Rnn inference."""
+    # Hand-picked dropouts are used below (0. and 1.)
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        # 1st time w/o dropout.
+        (_, cu_outputs, _, cu_h) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=0.)
+
+    with ops.Graph().as_default() as g:
+      with self.session(use_gpu=True, graph=g) as sess:
+        (_, cu_outputs2, _, cu_h2) = RunGRU(
+            sess,
+            num_units,
+            input_size,
+            batch_size,
+            time,
+            num_layers,
+            is_training=False,
+            dropout=1.)
+
+    self.assertAllClose(cu_outputs, cu_outputs2)
+    self.assertAllClose(cu_h[0], cu_h2[0])
+
+
+class CudnnParamsFormatConverterTest(TensorFlowTestCase,
+                                     parameterized.TestCase):
+  """Class for testing various format converters."""
+
+  def _test_lstm_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
+          num_layers, num_units, input_size, direction=direction)
+
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        w = constant_op.constant(
+            np.random.rand(input_size + num_units, 4 * num_units),
+            dtype=dtypes.float32)
+        b = constant_op.constant(
+            np.random.rand(4 * num_units), dtype=dtypes.float32)
+        ws.append(w)
+        bs.append(b)
+
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
           cudnn_rnn_ops.CUDNN_LSTM,
-          4, 200, constant_op.constant([200]),
-          direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
-      _ = model.params_size()
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
 
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
 
-class CudnnRNNTestInference(TensorFlowTestCase):
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
 
-  def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
-                              batch_size, seq_length, dir_count, dropout,
-                              expected, tolerance):
-    random_seed.set_random_seed(5678)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        input_mode="auto_select",
-        direction=(cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                   else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION),
-        dropout=dropout)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    params_size_t = model.params_size()
-    input_data = array_ops.ones([seq_length, batch_size, input_size])
-    input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-    params = variables.VariableV1(
-        array_ops.ones([params_size_t]), validate_shape=False)
-    if has_input_c:
-      input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units])
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params,
-          is_training=False)
-    else:
-      output, output_h = model(
-          input_data=input_data,
-          input_h=input_h,
-          params=params,
-          is_training=False)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      sess.run(variables.global_variables_initializer())
-      total_sum_v = sess.run([total_sum])
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = sum(x.size for x in ws) + np.sum(x.size for x in bs)
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
 
-      self.assertAllClose(
-          total_sum_v[0], expected, atol=tolerance, rtol=tolerance)
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_lstm(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
 
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def testSimpleInference(self):
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "expected": 231833.22,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "expected": 56000,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 4,
-                "num_units": 200,
-                "input_size": 200,
-                "batch_size": 20,
-                "seq_length": 10,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "expected": 130688,
-            "tolerance": 1e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 8,
-                "input_size": 4,
-                "batch_size": 4,
-                "seq_length": 2,
-                "dir_count": 1,
-            },
-        },
-    ]
-    # Cudnn scales result for dropout during training, therefore dropout has no
-    # impact for inference results.
-    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
-    # demonstrative of the dropout-invariant nature of CudnnRnn.)
-    dropouts = [0., 0.5, 1.]
-    for (config, dropout) in itertools.product(test_configs, dropouts):
-      rnn_mode = config["rnn_mode"]
-      expected = config["expected"]
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleInference(
-            rnn_mode, shape["num_layers"], shape["num_units"],
-            shape["input_size"], shape["batch_size"], shape["seq_length"],
-            shape["dir_count"], dropout, expected, tolerance)
-
-
-class CudnnRNNTestTraining(TensorFlowTestCase):
-
-  def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
-                             batch_size, seq_length, dir_count, dropout, dtype,
-                             delta, tolerance):
-    # Gradient checking runs two forward ops with almost the same input. Need to
-    # make sure the drop patterns across the two runs are the same.
-    logging.info("Training test with config: %s", locals())
-    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
-    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
-    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
-    random_seed.set_random_seed(5678)
-    direction = (cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1
-                 else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
-    model = _CreateModel(
-        rnn_mode,
-        num_layers,
-        num_units,
-        input_size,
-        direction=direction,
-        dtype=dtype,
-        dropout=dropout)
-    params_size_t = model.params_size()
-    input_data = variables.VariableV1(
-        random_ops.random_uniform(
-            [seq_length, batch_size, input_size], dtype=dtype),
-        dtype=dtype)
-    input_h = variables.VariableV1(
-        random_ops.random_uniform(
-            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-        dtype=dtype)
-    params = variables.VariableV1(
-        random_ops.random_uniform([params_size_t], dtype=dtype),
-        validate_shape=False,
-        dtype=dtype)
-    if has_input_c:
-      input_c = variables.VariableV1(
-          random_ops.random_uniform(
-              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
-          dtype=dtype)
-
-      output, output_h, output_c = model(
-          input_data=input_data,
-          input_h=input_h,
-          input_c=input_c,
-          params=params)
-    else:
-      output, output_h = model(
-          input_data=input_data, input_h=input_h, params=params)
-    output_sum = math_ops.reduce_sum(output)
-    output_h_sum = math_ops.reduce_sum(output_h)
-    total_sum = output_sum + output_h_sum
-    if has_input_c:
-      output_c_sum = math_ops.reduce_sum(output_c)
-      total_sum += output_c_sum
-
-    with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess:
-      params_size_v = sess.run(params_size_t)
-      inputs_and_shapes = [
-          (input_data, [seq_length, batch_size, input_size]),
-          (input_h, [num_layers * dir_count, batch_size, num_units]),
-          (params, [params_size_v]),
-      ]
-      if has_input_c:
-        inputs_and_shapes.append(
-            (input_c, [num_layers * dir_count, batch_size, num_units]),)
-      sess.run(variables.global_variables_initializer())
-      all_inputs = [entry[0] for entry in inputs_and_shapes]
-      all_shapes = [entry[1] for entry in inputs_and_shapes]
-
-      err = gradient_checker.compute_gradient_error(
-          all_inputs, all_shapes, total_sum, [1], delta=delta)
-
-      self.assertLess(err, tolerance)
-      os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state
+  def test_lstm_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_lstm_helper(num_units, input_size, num_layers,
+                           cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+  def _test_gru_helper(self, num_units, input_size, num_layers, direction):
+    with self.session(use_gpu=True) as sess:
+      random_seed.set_random_seed(0)
+      np.random.seed(0)
+
+      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
+      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
+          num_layers, num_units, input_size, direction=direction)
+
+      ws, bs = [], []
+      for _ in range(num_layers * num_dirs):
+        gate_kernel = constant_op.constant(
+            np.random.rand(input_size + num_units, num_units * 2),
+            dtype=dtypes.float32)
+        gate_bias = constant_op.constant(
+            np.random.rand(num_units * 2), dtype=dtypes.float32)
+        candidate_inp_kernel = constant_op.constant(
+            np.random.rand(input_size, num_units), dtype=dtypes.float32)
+        candidate_inp_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        candidate_hid_kernel = constant_op.constant(
+            np.random.rand(num_units, num_units), dtype=dtypes.float32)
+        candidate_hid_bias = constant_op.constant(
+            np.random.rand(num_units), dtype=dtypes.float32)
+        ws.extend([gate_kernel, candidate_inp_kernel, candidate_hid_kernel])
+        bs.extend([gate_bias, candidate_inp_bias, candidate_hid_bias])
 
+      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
+      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+          cudnn_rnn_ops.CUDNN_GRU,
+          num_layers,
+          num_units,
+          input_size,
+          direction=direction)
+
+      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)
+
+      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
+      # returns the original input.
+      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
+      for w, w_r in zip(ws, ws_r):
+        self.assertAllClose(w, w_r)
+      for b, b_r in zip(bs, bs_r):
+        self.assertAllClose(b, b_r)
+
+      # Test opaque_params size lower bound
+      opaque_params_size_v = sess.run(opaque_params_size)
+      min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs)
+      logging.info("min_parm_size: %d vs actual_opaque_param_size: %d",
+                   min_params_size, opaque_params_size_v)
+      self.assertLessEqual(min_params_size, opaque_params_size_v)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_gru(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION)
+
+  @parameterized.named_parameters((c["testcase_name"], c["num_units"],
+                                   c["input_size"], c["num_layers"])
+                                  for c in NAMED_RNN_TESTCASES)
   @unittest.skipUnless(test.is_built_with_cuda(),
                        "Test only applicable when running on GPUs")
-  def DISABLED_testSimpleTraining(self):
-    # TODO(jamesqin): fix b/117989214
-    test_configs = [
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float64,
-            "delta": 1e-4,
-            "tolerance": 5e-6,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-                "dir_count": 1,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
-            "dtype": dtypes.float32,
-            "tolerance": 1.5e-2,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
-            "dtype": dtypes.float32,
-            "tolerance": 4e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-3,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-        {
-            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
-            "dtype": dtypes.float32,
-            "tolerance": 5e-1,
-            "shape": {
-                "num_layers": 2,
-                "num_units": 3,
-                "input_size": 4,
-                "batch_size": 3,
-                "seq_length": 4,
-            },
-        },
-    ]
-    dropouts = [0., 0.5, 1.]
-    dir_counts = [1]
-    for config, dropout, dir_count in itertools.product(test_configs, dropouts,
-                                                        dir_counts):
-      rnn_mode = config["rnn_mode"]
-      dtype = config.get("dtype", dtypes.float32)
-      delta = config.get("delta", 1e-3)
-      tolerance = config["tolerance"]
-      shape = config["shape"]
-      with ops.Graph().as_default():
-        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
-                                    shape["num_units"], shape["input_size"],
-                                    shape["batch_size"], shape["seq_length"],
-                                    dir_count, dropout, dtype, delta, tolerance)
+  def test_gru_bidi(self, num_units, input_size, num_layers):
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    self._test_gru_helper(num_units, input_size, num_layers,
+                          cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
+
+
+class CudnnRnnSaveRestoreTest(TensorFlowTestCase, parameterized.TestCase):
+  """Class for testing various Cudnn Rnn SaveableObjects."""
+
+  def _create_opaque_param(self,
+                           rnn_mode,
+                           num_units,
+                           input_size,
+                           num_layers,
+                           direction,
+                           name=None):
+    param_size_t = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
+        rnn_mode, num_layers, num_units, input_size, direction=direction)
+    init_val = random_ops.random_uniform([param_size_t])
+    return variable_scope.get_variable(
+        name or "opaque_param", initializer=init_val, validate_shape=False)
+
+  def _create_saveable(self, opaque_param, rnn_mode, num_units, input_size,
+                       num_layers, direction):
+    if rnn_mode == CUDNN_LSTM:
+      fn = cudnn_rnn_ops.CudnnLSTMSaveable
+    elif rnn_mode == CUDNN_GRU:
+      fn = cudnn_rnn_ops.CudnnGRUSaveable
+    elif rnn_mode == CUDNN_RNN_TANH:
+      fn = cudnn_rnn_ops.CudnnRNNTanhSaveable
+    elif rnn_mode == CUDNN_RNN_RELU:
+      fn = cudnn_rnn_ops.CudnnRNNReluSaveable
+    saveable = fn(
+        opaque_param, num_layers, num_units, input_size, direction=direction)
+    return saveable
+
+  def _compare_weights(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lw, rw in zip(lhs, rhs):
+      self.assertAllEqual(lw, rw)
+
+  def _compare_biases(self, lhs, rhs):
+    self.assertLen(rhs, len(lhs))
+    for lf, rt in zip(lhs, rhs):
+      self.assertAllEqual(lf, rt)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_variable(self, rnn_mode, num_units, input_size,
+                                 num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_param = self._create_opaque_param(rnn_mode, num_units, input_size,
+                                               num_layers, direction)
+      saveable = self._create_saveable(opaque_param, rnn_mode, num_units,
+                                       input_size, num_layers, direction)
+      ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+      weights_op, biases_op = saveable.format_converter.opaque_to_tf_canonical(
+          saveable._variables)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_op = state_ops.assign(opaque_param,
+                                  array_ops.zeros_like(opaque_param))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      weights, biases = sess.run([weights_op, biases_op])
+
+      # Reset the opaque param value
+      sess.run(reset_op)
+      # Assert reset happened.
+      weights_z, biases_z = sess.run([weights_op, biases_op])
+      for w in weights_z:
+        self.assertAllClose(w, np.zeros_like(w))
+      for b in biases_z:
+        self.assertAllClose(b, np.zeros_like(b))
+
+      # Restore opaque param value from checkpoint.
+      saver.restore(sess, save_path)
+      weights_r, biases_r = sess.run([weights_op, biases_op])
+      self._compare_weights(weights, weights_r)
+      self._compare_biases(biases, biases_r)
+
+  @parameterized.named_parameters(
+      ExpandNamedTestCases(
+          NAMED_RNN_TESTCASES, "time", "batch_size", **{
+              "rnn_mode": [
+                  CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH
+              ],
+              "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
+          }))
+  @unittest.skipUnless(test.is_built_with_cuda(),
+                       "Test only applicable when running on GPUs")
+  def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size,
+                                        num_layers, direction):
+    # Verify the restored opaque param, once converted to tf_canonical format,
+    # is the same as the tf canonicals of the pre-restored param.
+    if not context.context().num_gpus():
+      self.skipTest("No GPUs found")
+    with self.session(use_gpu=True) as sess:
+      opaque_params = []
+      saveables = []
+      num_opaque_params = 2
+      for i in range(num_opaque_params):
+        opaque_params.append(
+            self._create_opaque_param(
+                rnn_mode,
+                num_units,
+                input_size,
+                num_layers,
+                direction,
+                name="opaque_param_%d" % i))
+        saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units,
+                                         input_size, num_layers, direction)
+        ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
+        saveables.append(saveable)
+
+      weights_ops, biases_ops = [], []
+      for i in range(num_opaque_params):
+        weights_op, biases_op = (
+            saveables[i].format_converter.opaque_to_tf_canonical(
+                saveables[i]._variables))
+        weights_ops.append(weights_op)
+        biases_ops.append(biases_op)
+
+      save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test")
+      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)
+
+      init_op = variables.global_variables_initializer()
+      reset_ops = []
+      for i in range(num_opaque_params):
+        reset_ops.append(
+            state_ops.assign(opaque_params[i],
+                             array_ops.zeros_like(opaque_params[i])))
+      sess.run(init_op)
+      self.assertEqual(save_path, saver.save(sess, save_path))
+
+      # Get the tf canonical vals before reset-restore
+      for i in range(num_opaque_params):
+        weights, biases = sess.run([weights_ops[i], biases_ops[i]])
+
+        # Reset the opaque param value
+        sess.run(reset_ops[i])
+
+        # Assert reset happened.
+        weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]])
+        for w in weights_z:
+          self.assertAllClose(w, np.zeros_like(w))
+        for b in biases_z:
+          self.assertAllClose(b, np.zeros_like(b))
+
+        # Restore opaque param value from checkpoint.
+        saver.restore(sess, save_path)
+        weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]])
+        self._compare_weights(weights, weights_r)
+        self._compare_biases(biases, biases_r)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
index 1954f6717bbebd803b0ec45992b43cf68f5d72a0..7e1b4062ce435f3ab4216e90b4f5fcbab984c1dc 100644
--- a/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
+++ b/tensorflow/contrib/cudnn_rnn/python/kernel_tests/cudnn_rnn_test.py
@@ -536,7 +536,9 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
       save_path = os.path.join(self.get_temp_dir(),
                                "save-restore-variable-test")
       saver = saver_lib.Saver()
-      weights, biases = model.rnn.saveable._OpaqueParamsToCanonical()
+      weights, biases = (
+          model.rnn.saveable.format_converter._opaque_to_cu_canonical(
+              model.rnn.saveable._variables))
       opaque_params = rnn.trainable_variables[0]
       # CudnnTestModel() creates CudnnOpaqueParamsSaveable that helps saver save
       # Cudnn vars in canonical format.
@@ -583,8 +585,12 @@ class CudnnRNNTestSaveRestore(test_util.TensorFlowTestCase):
             dtype=dtype)
       opaque_params = (model1.rnn.trainable_variables[0],
                        model2.rnn.trainable_variables[0])
-      weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical()
-      weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical()
+      saveable1 = model1.rnn.saveable
+      weights1, biases1 = saveable1.format_converter._opaque_to_cu_canonical(
+          saveable1._variables)
+      saveable2 = model1.rnn.saveable
+      weights2, biases2 = saveable2.format_converter._opaque_to_cu_canonical(
+          saveable2._variables)
       reset_params = [
           state_ops.assign(params,
                            array_ops.zeros_like(params, dtype=dtype))
@@ -1039,8 +1045,8 @@ class CudnnRNNTestParamsSize(test_util.TensorFlowTestCase):
 
     # Min param size estimate = sum(weights.size) + sum(biases.size)
     min_params_size = (
-        np.sum(list(map(np.prod, rnn.canonical_weight_shapes))) +
-        np.sum([sp[0] for sp in rnn.canonical_bias_shapes]))
+        sum(map(np.prod, rnn.canonical_weight_shapes)) +
+        sum(sp[0] for sp in rnn.canonical_bias_shapes))
 
     opaque_params = rnn.trainable_variables[0]
     with self.test_session(use_gpu=True, graph=ops.get_default_graph()):
diff --git a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
index 8bbcc7cd0397a5339a69e4e44528f0e56584043a..8e25637ed91a1559b321ea96efbfaa2910f67158 100644
--- a/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
+++ b/tensorflow/contrib/cudnn_rnn/python/layers/cudnn_rnn.py
@@ -21,6 +21,7 @@ from tensorflow.contrib.cudnn_rnn.python.ops import cudnn_rnn_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -322,7 +323,7 @@ class _CudnnRNN(base_layer.Layer):
       raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                        "should be defined. Found `None`.")
     self._input_size = input_shape[-1].value
-    self.input_spec = base_layer.InputSpec(ndim=3, axes={-1: self._input_size})
+    self.input_spec = input_spec.InputSpec(ndim=3, axes={-1: self._input_size})
 
     self._set_scope(None)
 
diff --git a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
index d06d0c6bdaa113089c4d4239a6d4ed216ddd01a8..1ce29b42d52ff67477161278ed11016c2e73041d 100644
--- a/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
+++ b/tensorflow/contrib/cudnn_rnn/python/ops/cudnn_rnn_ops.py
@@ -738,7 +738,7 @@ class CudnnOpaqueParamsSaveable(saver.BaseSaverBuilder.SaveableObject):
         self._variables, opaque_params, validate_shape=False)
 
   def _checkpointable_save(self, save_buffer):
-    weights, biases = self.format_converter.opaque_params_to_tf_canonical(
+    weights, biases = self.format_converter.opaque_to_tf_canonical(
         self._variables)
     for name, tensor in zip(self._param_names, weights + biases):
       save_buffer[name] = array_ops.identity(tensor)
diff --git a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
index 0456463a1928cf226010670b90a5d574579e0411..6c5f8c6b00975b3fba041271309a93cecd9f5057 100644
--- a/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/assert_element_shape_test.py
@@ -46,7 +46,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -88,7 +88,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -115,9 +115,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((3, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -142,7 +141,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
                      tensor_shape.TensorShape((3, 4)))
     self.assertEqual(actual_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -184,7 +183,7 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
     result = dataset.apply(batching.assert_element_shape(expected_shapes))
     self.assertEqual(expected_shapes, result.output_shapes)
 
-    iterator = result.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(result)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
@@ -211,9 +210,8 @@ class AssertElementShapeTest(test_base.DatasetTestBase):
 
     wrong_shapes = (tensor_shape.TensorShape(2),
                     tensor_shape.TensorShape((None, 10)))
-    iterator = (
-        dataset.apply(batching.assert_element_shape(wrong_shapes))
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset.apply(batching.assert_element_shape(wrong_shapes)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
index d2a72272db159755ac2d741bcdbce9ec646d928e..b9840b1ff1a3df5a05db0e64f436637220f49f80 100644
--- a/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/lmdb_dataset_op_test.py
@@ -23,6 +23,7 @@ import shutil
 
 from tensorflow.contrib.data.python.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -48,7 +49,7 @@ class LMDBDatasetTest(test_base.DatasetTestBase):
     num_repeats = 2
 
     dataset = readers.LMDBDataset(filenames).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
index c5a786232252432481566e3cde23e9310df172cc..2527706709fae8e459aca3489324d4db3c784be6 100644
--- a/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
+++ b/tensorflow/contrib/data/python/kernel_tests/slide_dataset_op_test.py
@@ -63,13 +63,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) ->
     # _SlideDataset(window_size, window_shift, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -127,13 +127,13 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
     # RepeatDataset(count) -> _SlideDataset(window_size, stride, window_stride).
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(count).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 stride=stride_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -173,12 +173,12 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     window_shift_t = array_ops.placeholder(dtypes.int64, shape=[])
     window_stride_t = array_ops.placeholder(dtypes.int64, shape=[])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(lambda x: x).repeat(count_t).apply(
             sliding.sliding_window_batch(
                 window_size=window_size_t,
                 window_shift=window_shift_t,
-                window_stride=window_stride_t)).make_initializable_iterator())
+                window_stride=window_stride_t)))
     init_op = iterator.initializer
 
     with self.cached_session() as sess:
@@ -204,9 +204,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -233,9 +233,9 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=array_ops.fill([math_ops.to_int32(i)], i),
           dense_shape=[i])
 
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).apply(
-        sliding.sliding_window_batch(
-            window_size=5, window_shift=3)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).apply(
+            sliding.sliding_window_batch(window_size=5, window_shift=3)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -265,11 +265,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return sparse_tensor.SparseTensorValue(
           indices=[[0]], values=(i * [1]), dense_shape=[1])
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(10).map(_sparse).apply(
             sliding.sliding_window_batch(window_size=4, window_shift=2)).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -305,11 +304,10 @@ class SlideDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       yield [4.0, 5.0, 6.0]
       yield [7.0, 8.0, 9.0, 10.0]
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(
             generator, dtypes.float32, output_shapes=[None]).apply(
-                sliding.sliding_window_batch(window_size=3, window_shift=1))
-        .make_initializable_iterator())
+                sliding.sliding_window_batch(window_size=3, window_shift=1)))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
diff --git a/tensorflow/contrib/data/python/ops/BUILD b/tensorflow/contrib/data/python/ops/BUILD
index 34dc2379d0cb38f8f6962fa42efe21b793bc8d65..0fb406f1167053a128646c5c692986b0ce016f1e 100644
--- a/tensorflow/contrib/data/python/ops/BUILD
+++ b/tensorflow/contrib/data/python/ops/BUILD
@@ -188,8 +188,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
diff --git a/tensorflow/contrib/data/python/ops/readers.py b/tensorflow/contrib/data/python/ops/readers.py
index 4601376dff47e161962e92678883039c4b88bab7..c0152156a1ba70297adb7054622b15ca04f859cd 100644
--- a/tensorflow/contrib/data/python/ops/readers.py
+++ b/tensorflow/contrib/data/python/ops/readers.py
@@ -21,10 +21,9 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util import deprecation
 
@@ -355,7 +354,7 @@ def read_batch_features(file_pattern,
       shuffle=randomize_input,
       num_epochs=num_epochs,
       shuffle_buffer_size=capacity)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   outputs = iterator.get_next()
   return outputs
 
@@ -379,15 +378,13 @@ class LMDBDataset(dataset_ops.DatasetSource):
     (key value) pairs sequentially.
     For example:
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.lmdb.LMDBDataset("/foo/bar.mdb")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
+
     # Prints the (key, value) pairs inside a lmdb file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
     Args:
       filenames: A `tf.string` tensor containing one or more filenames.
@@ -398,18 +395,10 @@ class LMDBDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_experimental_dataset_ops.experimental_lmdb_dataset(
-        self._filenames,
-        output_types=nest.flatten(self.output_types),
-        output_shapes=nest.flatten(self.output_shapes))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, **dataset_ops.flat_structure(self))
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/data/python/ops/sliding.py b/tensorflow/contrib/data/python/ops/sliding.py
index bcc383587c54bd89502313f9328bc06c49046a87..5c6ee6bfdc7167d14b292f8f763adafca4e3a72c 100644
--- a/tensorflow/contrib/data/python/ops/sliding.py
+++ b/tensorflow/contrib/data/python/ops/sliding.py
@@ -18,11 +18,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util import deprecation
 
 
@@ -40,8 +39,13 @@ class _SlideDataset(dataset_ops.UnaryDataset):
     self._window_shift = ops.convert_to_tensor(
         window_shift, dtype=dtypes.int64, name="window_shift")
 
+    input_structure = structure.convert_legacy_structure(
+        input_dataset.output_types, input_dataset.output_shapes,
+        input_dataset.output_classes)
+    self._structure = input_structure._batch(None)  # pylint: disable=protected-access
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.slide_dataset(
+    return ged_ops.experimental_sliding_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         window_size=self._window_size,
         window_shift=self._window_shift,
@@ -49,20 +53,8 @@ class _SlideDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @deprecation.deprecated_args(
diff --git a/tensorflow/contrib/distribute/BUILD b/tensorflow/contrib/distribute/BUILD
index a87a5624c88d1d0af10055261dad55937ed6aeb0..3ecd755d86f6be47910aebbdb46d335d165427d8 100644
--- a/tensorflow/contrib/distribute/BUILD
+++ b/tensorflow/contrib/distribute/BUILD
@@ -26,7 +26,6 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         "//tensorflow/contrib/distribute/python:collective_all_reduce_strategy",
-        "//tensorflow/contrib/distribute/python:cross_tower_ops",
         "//tensorflow/contrib/distribute/python:mirrored_strategy",
         "//tensorflow/contrib/distribute/python:monitor",
         "//tensorflow/contrib/distribute/python:one_device_strategy",
@@ -35,6 +34,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:distribute_config",
         "//tensorflow/python/distribute:distribute_coordinator",
     ],
diff --git a/tensorflow/contrib/distribute/README.md b/tensorflow/contrib/distribute/README.md
index a938f8629d8210b4b512338a040340f21d3ef594..8a8dc159ade6f2a4a9b5ec29055ea4848492b29f 100644
--- a/tensorflow/contrib/distribute/README.md
+++ b/tensorflow/contrib/distribute/README.md
@@ -134,7 +134,7 @@ def model_fn(features, labels, mode):
     return tf.estimator.EstimatorSpec(mode, loss=loss)
 
   if mode == tf.estimator.ModeKeys.TRAIN:
-    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss_fn())
+    train_op = tf.train.GradientDescentOptimizer(0.2).minimize(loss)
     return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
 ```
 
@@ -248,19 +248,17 @@ Let's use the same example for multi-worker. We'll start a cluster with 3
 workers doing synchronous all-reduce training. In the following code snippet, we
 start multi-worker training using `tf.estimator.train_and_evaluate`:
 
-
 ```python
 def model_main():
-  estimator = ...
   distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
       num_gpus_per_worker=2)
   config = tf.estimator.RunConfig(train_distribute=distribution)
+  estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
   train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
   eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
   tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
 ```
 
-
 **Note**: You don't have to set "TF\_CONFIG" manually if you use our provided
 Kubernetes template.
 
@@ -327,13 +325,13 @@ start training.
 On your laptop, you can run
 
 ```python
-estimator = ...
 distribution = tf.contrib.distribute.CollectiveAllReduceStrategy(
     num_gpus_per_worker=2)
 config = tf.estimator.RunConfig(
     experimental_distribute=tf.contrib.distribute.DistributeConfig(
         train_distribute=distribution,
         remote_cluster={"worker": ["host1:port", "host2:port", "host3:port"]}))
+estimator = tf.estimator.Estimator(model_fn=model_fn, config=config)
 train_spec = tf.estimator.TrainSpec(input_fn=input_fn)
 eval_spec = tf.estimator.EvalSpec(input_fn=eval_input_fn)
 tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
diff --git a/tensorflow/contrib/distribute/__init__.py b/tensorflow/contrib/distribute/__init__.py
index ab2f221dc6486666e914deb19dd56c7687606e2f..8ec73654e30e4967f318c558ba94301e84a206e4 100644
--- a/tensorflow/contrib/distribute/__init__.py
+++ b/tensorflow/contrib/distribute/__init__.py
@@ -25,13 +25,13 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,wildcard-import
 from tensorflow.contrib.distribute.python.collective_all_reduce_strategy import CollectiveAllReduceStrategy
-from tensorflow.contrib.distribute.python.cross_tower_ops import *
 from tensorflow.contrib.distribute.python.mirrored_strategy import MirroredStrategy
 from tensorflow.contrib.distribute.python.monitor import Monitor
 from tensorflow.contrib.distribute.python.one_device_strategy import OneDeviceStrategy
 from tensorflow.contrib.distribute.python.parameter_server_strategy import ParameterServerStrategy
 from tensorflow.contrib.distribute.python.step_fn import *
 from tensorflow.contrib.distribute.python.tpu_strategy import TPUStrategy
+from tensorflow.python.distribute.cross_device_ops import *
 from tensorflow.python.distribute.distribute_config import DistributeConfig
 from tensorflow.python.distribute.distribute_coordinator import run_standard_tensorflow_server
 from tensorflow.python.training.distribute import *
@@ -46,6 +46,7 @@ _allowed_symbols = [
     'CrossDeviceOps',
     'DistributeConfig',
     'DistributionStrategy',
+    'DistributionStrategyExtended',
     'MirroredStrategy',
     'Monitor',
     'MultiWorkerAllReduce',
@@ -62,6 +63,7 @@ _allowed_symbols = [
     'get_loss_reduction',
     'get_replica_context',
     'has_distribution_strategy',
+    'in_cross_replica_context',
     'require_replica_context',
     'run_standard_tensorflow_server',
     'UpdateContext',
diff --git a/tensorflow/contrib/distribute/python/BUILD b/tensorflow/contrib/distribute/python/BUILD
index 4094e52169aab0b46da4f62087ddac4f750039a4..4c9c35da5a36aa8149d15c8d1c25e4dfaa6a07c1 100644
--- a/tensorflow/contrib/distribute/python/BUILD
+++ b/tensorflow/contrib/distribute/python/BUILD
@@ -16,45 +16,26 @@ load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 # TODO(priyag): Figure out testonly issues that are preventing us from
 # including our tests in pip for now.
 
-py_library(
-    name = "values",
-    srcs = ["values.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        ":input_ops",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device_util",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:multi_device_iterator_ops",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/training/checkpointable:base",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
     name = "values_test",
     srcs = ["values_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python:errors",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:device_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python:device_util",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -68,25 +49,9 @@ py_library(
     srcs = ["mirrored_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":shared_variable_creator",
-        ":values",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:device_util",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:training",
-        "//tensorflow/python:util",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/distribute:multi_worker_util",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:tape",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:mirrored_strategy",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -95,16 +60,17 @@ py_library(
     srcs = ["parameter_server_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:cross_device_ops",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -116,7 +82,7 @@ cuda_py_test(
         ":combinations",
         ":multi_worker_test_base",
         ":parameter_server_strategy",
-        ":values",
+        ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
@@ -127,10 +93,12 @@ cuda_py_test(
         "//tensorflow/python:gradients",
         "//tensorflow/python:layers",
         "//tensorflow/python:session",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -145,12 +113,13 @@ py_library(
     srcs = ["one_device_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":values",
-        "//tensorflow/contrib/eager/python:datasets",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:distribute",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -161,16 +130,16 @@ py_library(
     srcs = ["collective_all_reduce_strategy.py"],
     visibility = ["//tensorflow:internal"],
     deps = [
-        ":cross_tower_ops",
-        ":cross_tower_utils",
         ":mirrored_strategy",
-        ":values",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:collective_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/distribute:multi_worker_util",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -187,11 +156,11 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:layers",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
@@ -212,10 +181,10 @@ py_library(
         ":tpu_strategy",
         "//tensorflow/contrib/cluster_resolver:cluster_resolver_pip",
         "//tensorflow/contrib/optimizer_v2:training",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:distribute_lib",
         "//tensorflow/python/eager:context",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -233,28 +202,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "mirrored_strategy_test",
-    srcs = ["mirrored_strategy_test.py"],
-    srcs_version = "PY2AND3",
-    tags = [
-        "no_pip",
-    ],
-    deps = [
-        ":mirrored_strategy",
-        ":multi_worker_test_base",
-        ":strategy_test_lib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:distribute",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
 py_test(
     name = "one_device_strategy_test",
     srcs = ["one_device_strategy_test.py"],
@@ -270,35 +217,32 @@ py_test(
     ],
 )
 
+# TODO(priyag): Rename this test to mirrored_strategy_test
 cuda_py_test(
     name = "mirrored_strategy_multigpu_test",
     srcs = ["mirrored_strategy_multigpu_test.py"],
     additional_deps = [
+        ":combinations",
         ":mirrored_strategy",
         ":multi_worker_test_base",
-        ":values",
         ":strategy_test_lib",
-        "//tensorflow/python:distribute",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:layers",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
+    shard_count = 5,
     tags = [
         "guitar",
-        "no_pip",
         "multi_and_single_gpu",
-        # Do not perform the extra analysis on this test, because it is already
-        # performed for the `:mirrored_strategy_test` target.
-        "no_oss",
-        "noasan",
-        "notap",
-        "notsan",
+        "no_pip",
     ],
 )
 
@@ -337,12 +281,15 @@ py_library(
     visibility = ["//tensorflow:internal"],
     deps = [
         ":one_device_strategy",
-        ":values",
         "//tensorflow/contrib/tpu:tpu_lib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/distribute:values",
     ],
 )
 
@@ -352,7 +299,6 @@ cuda_py_test(
     additional_deps = [
         ":collective_all_reduce_strategy",
         ":combinations",
-        ":cross_tower_utils",
         ":multi_worker_test_base",
         ":strategy_test_lib",
         "@absl_py//absl/testing:parameterized",
@@ -368,6 +314,7 @@ cuda_py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/estimator:estimator_py",
     ],
@@ -469,6 +416,7 @@ cuda_py_test(
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
         "no_pip",
+        "tf_integration_test",
     ],
 )
 
@@ -476,28 +424,18 @@ cuda_py_test(
     name = "keras_optimizer_v2_test",
     srcs = ["keras_optimizer_v2_test.py"],
     additional_deps = [
-        ":combinations",
-        "@absl_py//absl/testing:parameterized",
-        "//third_party/py/numpy",
-        "//tensorflow/contrib/optimizer_v2:training",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/eager:test",
-        "//tensorflow/python/estimator:estimator_py",
-        "//tensorflow/python/feature_column",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:summary",
+        ":keras_test_lib",
     ],
     tags = [
         "multi_and_single_gpu",
         "no_oss",  # http://b/119349471
         "no_pip",
+        "tf_integration_test",
     ],
 )
 
 cuda_py_test(
     name = "estimator_training_test",
-    size = "large",
     srcs = ["estimator_training_test.py"],
     additional_deps = [
         ":collective_all_reduce_strategy",
@@ -508,7 +446,9 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/contrib/optimizer_v2:training",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/distribute",
+        "//tensorflow/python/distribute:distribute_config",
+        "//tensorflow/python/distribute:distribute_coordinator",
+        "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/feature_column",
@@ -516,7 +456,7 @@ cuda_py_test(
         "//tensorflow/python:platform",
         "//tensorflow/python:summary",
     ],
-    shard_count = 5,
+    shard_count = 48,
     tags = [
         "multi_and_single_gpu",
         "no_pip",
@@ -524,6 +464,7 @@ cuda_py_test(
         "noasan",
         "nomsan",
         "notsan",
+        "no_oss",  # http://b/119349471
     ],
 )
 
@@ -599,52 +540,16 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "shared_variable_creator",
-    srcs = ["shared_variable_creator.py"],
-    visibility = ["//tensorflow:internal"],
-)
-
-py_test(
-    name = "shared_variable_creator_test",
-    srcs = ["shared_variable_creator_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":shared_variable_creator",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:test",
-    ],
-)
-
-py_library(
-    name = "cross_tower_utils",
-    srcs = ["cross_tower_utils.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":values",
-        "//tensorflow/contrib/all_reduce:all_reduce_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:collective_ops",
-        "//tensorflow/python:device",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:nccl_ops",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_utils_test",
-    srcs = ["cross_tower_utils_test.py"],
+    name = "cross_device_utils_test",
+    srcs = ["cross_device_utils_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_utils",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_utils",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -653,40 +558,20 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "cross_tower_ops",
-    srcs = ["cross_tower_ops.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":cross_tower_utils",
-        ":values",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:device_lib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variable_scope",
-        "//tensorflow/python/eager:context",
-        "@six_archive//:six",
-    ],
-)
-
 cuda_py_test(
-    name = "cross_tower_ops_test",
-    srcs = ["cross_tower_ops_test.py"],
+    name = "cross_device_ops_test",
+    srcs = ["cross_device_ops_test.py"],
     additional_deps = [
         ":combinations",
-        ":cross_tower_ops",
         ":multi_worker_test_base",
         ":mirrored_strategy",
-        ":values",
         "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python/distribute:cross_device_ops",
+        "//tensorflow/python/distribute:values",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:test",
     ],
@@ -696,37 +581,6 @@ cuda_py_test(
     ],
 )
 
-py_library(
-    name = "input_ops",
-    srcs = ["input_ops.py"],
-    visibility = ["//tensorflow:internal"],
-    deps = [
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python/data/util:nest",
-    ],
-)
-
-cuda_py_test(
-    name = "input_ops_test",
-    srcs = ["input_ops_test.py"],
-    additional_deps = [
-        ":input_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/contrib/data/python/ops:batching",
-        "//tensorflow/contrib/data/python/ops:interleave_ops",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python/data/ops:readers",
-        "//tensorflow/python:util",
-    ],
-    tags = [
-        "no_pip",
-    ],
-)
-
 py_library(
     name = "keras_test_lib",
     testonly = 1,
@@ -737,6 +591,7 @@ py_library(
         "//tensorflow/contrib/distribute/python:tpu_strategy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
+        "//tensorflow/python/eager:test",
         "//tensorflow/python/estimator:estimator_py",
         "//tensorflow/python/keras",
         "//third_party/py/numpy",
@@ -766,7 +621,6 @@ py_library(
     srcs = ["metrics_v1_test.py"],
     deps = [
         ":combinations",
-        "//tensorflow/contrib/data/python/ops:batching",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:metrics",
         "//tensorflow/python:variables",
diff --git a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
index d38bdb592a303d23871b48d80868917efc01dcd1..31bd0e996a247a2fc01405fb3b8172a40853d698 100644
--- a/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
+++ b/tensorflow/contrib/distribute/python/checkpoint_utils_test.py
@@ -43,7 +43,9 @@ class CheckpointUtilsWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       in_replica_mode=[True, False],
       mode=["graph"]))
   def testInitFromCheckpoint(self, distribution, in_replica_mode):
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
index efa99d1fc52e8facfaeb61f98b5e649a18f6a3cf..5c50a20490482856becedf7b1379d2a0583d9a11 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy.py
@@ -18,12 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
+import copy
+
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -32,7 +36,7 @@ from tensorflow.python.platform import tf_logging as logging
 
 
 # TODO(yuefengz): support in-graph replication.
-class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
+class CollectiveAllReduceStrategy(distribute_lib.DistributionStrategy):
   """Distribution strategy that uses collective ops for all-reduce.
 
   It is similar to the MirroredStrategy but it uses collective ops for
@@ -53,6 +57,17 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       num_gpus_per_worker: number of local GPUs or GPUs per worker, the default
         is 0 meaning CPU only.
     """
+    super(CollectiveAllReduceStrategy, self).__init__(
+        CollectiveAllReduceExtended(self, num_gpus_per_worker))
+
+
+class CollectiveAllReduceExtended(mirrored_strategy.MirroredExtended):
+  """Implementation of CollectiveAllReduceStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    distribute_lib.DistributionStrategyExtended.__init__(
+        self, container_strategy)
+    self._cross_device_ops = None
     self._num_gpus_per_worker = num_gpus_per_worker
     self._initialize_local_worker(num_gpus_per_worker)
 
@@ -62,19 +77,19 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     self._num_workers = 1
 
     if num_gpus_per_worker:
-      local_devices = [
+      local_devices = tuple(
           "/device:GPU:%d" % i for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = ["/device:CPU:0"]
+      local_devices = ("/device:CPU:0",)
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
-        devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
-            num_workers=1,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    self._initialize_local(local_devices)
+    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     self._cluster_spec = None
     self._task_type = None
@@ -89,13 +104,12 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     if task_type is None or task_id is None:
       raise ValueError("When `cluster_spec` is given, you must also specify "
                        "`task_type` and `task_id`")
-    if task_type not in ["chief", "worker"]:
+    if task_type not in ("chief", "worker"):
       raise ValueError(
           "Unrecognized task_type: %r, valid task types are: \"chief\", "
           "\"worker\"." % task_type)
     cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._num_workers = len(cluster_spec.as_dict().get("worker", [])) + len(
-        cluster_spec.as_dict().get("chief", []))
+    self._num_workers = multi_worker_util.worker_count(cluster_spec, task_type)
     if not self._num_workers:
       raise ValueError("No `worker` or `chief` tasks can be found in "
                        "`cluster_spec`.")
@@ -103,22 +117,21 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     self._is_chief = multi_worker_util.is_chief(cluster_spec, task_type,
                                                 task_id)
 
-    worker_device = "/job:%s/task:%d" % (task_type, task_id)
+    self._worker_device = "/job:%s/task:%d" % (task_type, task_id)
     if num_gpus_per_worker:
-      local_devices = [
-          "%s/device:GPU:%d" % (worker_device, i)
+      local_devices = tuple(
+          "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      local_devices = [worker_device]
+      local_devices = (self._worker_device,)
 
-    self._collective_keys = cross_tower_utils.CollectiveKeys()
-    super(CollectiveAllReduceStrategy, self).__init__(
-        devices=local_devices,
-        cross_tower_ops=cross_tower_ops_lib.CollectiveAllReduce(
-            num_workers=self._num_workers,
-            num_gpus_per_worker=num_gpus_per_worker,
-            collective_keys=self._collective_keys))
+    self._collective_keys = cross_device_utils.CollectiveKeys()
+    self._initialize_local(local_devices)
+    self._cross_tower_ops = cross_device_ops_lib.CollectiveAllReduce(
+        num_workers=self._num_workers,
+        num_gpus_per_worker=num_gpus_per_worker,
+        collective_keys=self._collective_keys)
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -202,17 +215,40 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return mirrored_strategy._create_mirrored_variable(
         devices, _real_mirrored_creator, *args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     # TODO(yuefengz): shard the dataset.
     return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._devices, True)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _make_dataset_iterator(self, dataset):
+    worker_device_pairs = [(self._worker_device, self._devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec is None:
+      input_pipeline_id = 0
+    else:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=self._num_workers,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+
+    return values.InputFunctionIterator(
+        input_fn, [(self._worker_device, self._devices)], [input_context])
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the object.
 
     Args:
@@ -232,13 +268,15 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
       self._initialize_multi_worker(self._num_gpus_per_worker, cluster_spec,
                                     task_type, task_id)
 
-    if not session_config:
-      return
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
 
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
     # Enable the scoped allocator optimization for CollectiveOps.  This
     # optimization converts many small all-reduces into fewer larger
     # all-reduces.
-    rewrite_options = session_config.graph_options.rewrite_options
+    rewrite_options = updated_config.graph_options.rewrite_options
     rewrite_options.scoped_allocator_optimization = (
         rewriter_config_pb2.RewriterConfig.ON)
     # We turn on ScopedAllocator only for CollectiveReduce op, i.e. enable_op =
@@ -248,7 +286,7 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     rewrite_options.scoped_allocator_opts.enable_op.append("CollectiveReduce")
 
     if not self._cluster_spec:
-      return
+      return updated_config
 
     assert self._task_type
     assert self._task_id is not None
@@ -256,26 +294,28 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     # Collective group leader is needed for collective ops to coordinate
     # workers.
     if "chief" in self._cluster_spec.jobs:
-      session_config.experimental.collective_group_leader = (
+      updated_config.experimental.collective_group_leader = (
           "/job:chief/replica:0/task:0")
     else:
       if "worker" not in self._cluster_spec.jobs:
         raise ValueError(
             "You must have `chief` or `worker` jobs in the `cluster_spec`.")
-      session_config.experimental.collective_group_leader = (
+      updated_config.experimental.collective_group_leader = (
           "/job:worker/replica:0/task:0")
 
     # The device filters prevent communication between workers.
-    del session_config.device_filters[:]
-    session_config.device_filters.append(
+    del updated_config.device_filters[:]
+    updated_config.device_filters.append(
         "/job:%s/task:%d" % (self._task_type, self._task_id))
 
+    return updated_config
+
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -287,6 +327,10 @@ class CollectiveAllReduceStrategy(mirrored_strategy.MirroredStrategy):
     return self._is_chief
 
   @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return len(self._devices) * self._num_workers
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
index e3d919dd0d482f49d9a934c879e9adad25c03f86..8a9e583f0afaac37a2057bae9b1ed79de43d68bc 100644
--- a/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/collective_all_reduce_strategy_test.py
@@ -23,13 +23,19 @@ import numpy as np
 
 from tensorflow.contrib.distribute.python import collective_all_reduce_strategy
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import multi_worker_test_base
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
@@ -51,9 +57,6 @@ class CollectiveAllReduceStrategyTestBase(
   collective_key_base = 0
 
   def setUp(self):
-    self._run_options = config_pb2.RunOptions()
-    self._run_options.experimental.collective_graph_key = 6
-
     # We use a different key_base for each test so that collective keys won't be
     # reused.
     # TODO(yuefengz, tucker): enable it to reuse collective keys in different
@@ -71,15 +74,16 @@ class CollectiveAllReduceStrategyTestBase(
           cluster_spec=self._cluster_spec,
           task_type=task_type,
           task_id=task_id)
-    collective_keys = cross_tower_utils.CollectiveKeys(
+    collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_start=num_gpus * 100 +
         CollectiveAllReduceStrategyTestBase.collective_key_base,
         instance_key_with_id_start=num_gpus * 10000 +
         CollectiveAllReduceStrategyTestBase.collective_key_base)
-    distribution._collective_keys = collective_keys
-    distribution._cross_tower_ops._collective_keys = collective_keys
+    distribution.extended._collective_keys = collective_keys
+    distribution.extended._inferred_cross_device_ops._collective_keys = (
+        collective_keys)
     if task_type and task_id is not None:
       return distribution, 'grpc://' + self._cluster_spec[task_type][
           task_id], session_config
@@ -93,7 +97,8 @@ class CollectiveAllReduceStrategyTestBase(
          self.cached_session(config=config,
                              target=master_target) as sess, \
          d.scope():
-      l = core.Dense(1, use_bias=False, name='gpu_%d' % d._num_gpus_per_worker)
+      l = core.Dense(1, use_bias=False,
+                     name='gpu_%d' % d.extended._num_gpus_per_worker)
 
       def loss_fn(x):
         y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
@@ -127,8 +132,8 @@ class CollectiveAllReduceStrategyTestBase(
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -136,14 +141,13 @@ class CollectiveAllReduceStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
-      sess.run(
-          variables.global_variables_initializer(), options=self._run_options)
+      sess.run(variables.global_variables_initializer())
 
       for i in range(10):
-        b, a = sess.run((before_out, after_out), options=self._run_options)
+        b, a = sess.run((before_out, after_out))
         if i == 0:
           before, = b
         after, = a
@@ -222,26 +226,54 @@ class CollectiveAllReduceStrategyTestBase(
         return array_ops.identity(x)
 
       x = distribution.call_for_each_replica(model_fn)
-      reduced_x = distribution.unwrap(
-          distribution.reduce(
-              variable_scope.VariableAggregation.MEAN, x,
-              destinations='/cpu:0'))[0]
+      reduced_x = distribution.reduce(reduce_util.ReduceOp.MEAN, x)
       x = distribution.unwrap(x)[0]
 
-      sess.run(
-          variables.global_variables_initializer(), options=self._run_options)
+      sess.run(variables.global_variables_initializer())
 
-      x_value, reduced_x_value = sess.run([x, reduced_x],
-                                          options=self._run_options)
+      x_value, reduced_x_value = sess.run([x, reduced_x])
       self.assertTrue(
           np.allclose(x_value, reduced_x_value, atol=1e-5),
           msg=('x_value = %r, reduced_x_value = %r' % (x_value,
                                                        reduced_x_value)))
     return np.allclose(x_value, reduced_x_value, atol=1e-5)
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_object(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class DistributedCollectiveAllReduceStrategyTest(
-    CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
+    CollectiveAllReduceStrategyTestBase,
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
@@ -269,7 +301,7 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testVariableInitialization(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_variable_initialization,
         self._cluster_spec,
@@ -279,10 +311,56 @@ class DistributedCollectiveAllReduceStrategyTest(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testComplexModel(self, num_gpus):
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._run_between_graph_clients(
         self._test_complex_model, self._cluster_spec, num_gpus=num_gpus)
 
+  # TODO(yuefengz): Update how we use num_gpus and required_gpus
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
+  def testMakeInputFnIterator(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    # We use CPU as the device when num_gpus = 0
+    devices_per_worker = max(1, num_gpus)
+    expected_values = [[i+j for j in range(devices_per_worker)]
+                       for i in range(0, 100, devices_per_worker)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=3*devices_per_worker,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
+  def testUpdateConfigProto(self):
+    distribution = collective_all_reduce_strategy.CollectiveAllReduceStrategy(
+        num_gpus_per_worker=2)
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+
+    config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
+    rewrite_options = config_proto.graph_options.rewrite_options
+    rewrite_options.scoped_allocator_opts.enable_op.append('to_be_removed')
+
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify group leader
+    self.assertEqual('/job:worker/replica:0/task:0',
+                     new_config.experimental.collective_group_leader)
+
+    # Verify device filters.
+    self.assertEqual(['/job:worker/task:1'], new_config.device_filters)
+
+    # Verify rewrite options.
+    new_rewrite_options = new_config.graph_options.rewrite_options
+    self.assertEqual(rewriter_config_pb2.RewriterConfig.ON,
+                     new_rewrite_options.scoped_allocator_optimization)
+    self.assertEqual(['CollectiveReduce'],
+                     new_rewrite_options.scoped_allocator_opts.enable_op)
+
 
 class DistributedCollectiveAllReduceStrategyTestWithChief(
     CollectiveAllReduceStrategyTestBase, parameterized.TestCase):
@@ -293,10 +371,6 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=0, has_chief=True)
 
-  def setUp(self):
-    super(DistributedCollectiveAllReduceStrategyTestWithChief, self).setUp()
-    self._run_options.experimental.collective_graph_key = 7
-
   @combinations.generate(
       combinations.combine(mode=['graph'], num_gpus=[0, 1, 2], required_gpus=1))
   def testMinimizeLossGraph(self, num_gpus):
@@ -323,20 +397,36 @@ class DistributedCollectiveAllReduceStrategyTestWithChief(
 
 
 class LocalCollectiveAllReduceStrategy(CollectiveAllReduceStrategyTestBase,
+                                       strategy_test_lib.DistributionTestBase,
                                        parameterized.TestCase):
 
   def testMinimizeLossGraph(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_minimize_loss_graph(None, None, num_gpus)
 
   def testComplexModel(self, num_gpus=2):
     # Collective ops doesn't support strategy with one device.
     if context.num_gpus() < num_gpus:
-      return
+      self.skipTest('Not enough GPUs')
     self._test_complex_model(None, None, num_gpus)
 
+  def testMakeInputFnIterator(self, num_gpus=2):
+    # Collective ops doesn't support strategy with one device.
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/combinations.py b/tensorflow/contrib/distribute/python/combinations.py
index a51371654031e32d084e2b0e8ae345bb2c166ae8..365ce5cdec79f1914f0c9ccdf59a7dc59e6f819e 100644
--- a/tensorflow/contrib/distribute/python/combinations.py
+++ b/tensorflow/contrib/distribute/python/combinations.py
@@ -53,11 +53,11 @@ from tensorflow.contrib.distribute.python import tpu_strategy as tpu_lib
 from tensorflow.contrib.optimizer_v2 import adagrad as adagrad_v2
 from tensorflow.contrib.optimizer_v2 import adam as adam_v2
 from tensorflow.contrib.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.training import adagrad
 from tensorflow.python.training import adam
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
 from tensorflow.python.util import tf_inspect
@@ -168,6 +168,8 @@ def _augment_with_special_arguments(test_method):
       if GPU_TEST:
         self.skipTest("Test that doesn't require GPUs.")
     elif context.num_gpus() < required_gpus:
+      # TODO(priyag): Consider allowing tests in graph mode using soft
+      # placement.
       self.skipTest(
           "{} GPUs are not available for this test. {} GPUs are available".
           format(required_gpus, context.num_gpus()))
@@ -190,7 +192,7 @@ def _augment_with_special_arguments(test_method):
         kwargs_to_pass[arg] = kwargs[arg]
 
     if mode == "eager":
-      with ops.Graph().as_default(), context.eager_mode():
+      with context.eager_mode():
         if distribution:
           kwargs_to_pass["distribution"] = distribution.strategy
         test_method(**kwargs_to_pass)
@@ -335,6 +337,13 @@ tpu_strategy_one_step = NamedDistribution(
     "TPUOneStep", lambda: tpu_lib.TPUStrategy(
         TPUClusterResolver(""), steps_per_run=1),
     required_tpu=True)
+mirrored_strategy_with_one_cpu = NamedDistribution(
+    "Mirrored1CPU",
+    lambda: mirrored_lib.MirroredStrategy(["/cpu:0"]))
+mirrored_strategy_with_one_gpu = NamedDistribution(
+    "Mirrored1GPU",
+    lambda: mirrored_lib.MirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
 mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
     "MirroredCPUAndGPU",
     lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/cpu:0"]),
@@ -343,6 +352,21 @@ mirrored_strategy_with_two_gpus = NamedDistribution(
     "Mirrored2GPUs",
     lambda: mirrored_lib.MirroredStrategy(["/gpu:0", "/gpu:1"]),
     required_gpus=2)
+core_mirrored_strategy_with_one_cpu = NamedDistribution(
+    "CoreMirrored1CPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/cpu:0"]))
+core_mirrored_strategy_with_one_gpu = NamedDistribution(
+    "CoreMirrored1GPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_gpu_and_cpu = NamedDistribution(
+    "CoreMirroredCPUAndGPU",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/cpu:0"]),
+    required_gpus=1)
+core_mirrored_strategy_with_two_gpus = NamedDistribution(
+    "CoreMirrored2GPUs",
+    lambda: mirrored_lib.CoreMirroredStrategy(["/gpu:0", "/gpu:1"]),
+    required_gpus=2)
 
 
 gradient_descent_optimizer_v1_fn = NamedObject(
@@ -373,8 +397,11 @@ def distributions_and_v1_optimizers():
   """A common set of combination with DistributionStrategies and Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v1)
 
@@ -383,7 +410,10 @@ def distributions_and_v2_optimizers():
   """DistributionStrategies and V2 Optimizers."""
   return combine(
       distribution=[
-          one_device_strategy, mirrored_strategy_with_gpu_and_cpu,
-          mirrored_strategy_with_two_gpus
+          one_device_strategy,
+          mirrored_strategy_with_gpu_and_cpu,
+          mirrored_strategy_with_two_gpus,
+          core_mirrored_strategy_with_gpu_and_cpu,
+          core_mirrored_strategy_with_two_gpus,
       ],
       optimizer_fn=optimizers_v2)
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
similarity index 79%
rename from tensorflow/contrib/distribute/python/cross_tower_ops_test.py
rename to tensorflow/contrib/distribute/python/cross_device_ops_test.py
index 3e274ba67ca6709a14f5391968f28b721e46b8a6..d6e9521c1c1115ffdbdcf375ad4017bacb962832 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_ops_test.py
@@ -24,24 +24,24 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import cross_tower_utils
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import device_util
 
 
 def _make_per_replica(values, devices, regroup=False):
-  devices = cross_tower_ops_lib.get_devices_from(devices)
+  devices = cross_device_ops_lib.get_devices_from(devices)
   assert len(values) == len(devices)
 
   # We simulate the result of regroup called on PerReplica which strips the
@@ -66,7 +66,7 @@ def _fake_mirrored(value, devices):
   All components of the returned Mirrored have the same objects, which is not
   true in reality.
   """
-  devices = cross_tower_ops_lib.get_devices_from(devices)
+  devices = cross_device_ops_lib.get_devices_from(devices)
   return value_lib.Mirrored(
       {d: v for d, v in zip(devices, [value] * len(devices))})
 
@@ -118,8 +118,8 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
           self.assertEqual(
               sess.run(list(left._index.values())), list(right._index.values()))
 
-  def _testReductionAndBroadcast(self, cross_tower_ops, distribution):
-    devices = distribution.worker_devices
+  def _testReductionAndBroadcast(self, cross_device_ops, distribution):
+    devices = distribution.extended.worker_devices
 
     values = [constant_op.constant(float(d)) for d in range(len(devices))]
     per_replica = _make_per_replica(values, devices)
@@ -132,35 +132,33 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     destination_mirrored = _fake_mirrored(1., devices)
     destination_different = _fake_mirrored(1., _cpu_device)
     destination_str = _cpu_device
-    destination_list = devices
 
     all_destinations = [
         destination_mirrored, destination_different, destination_str,
-        destination_list
     ]
 
     # test reduce()
     for destinations in all_destinations:
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
               per_replica,
               destinations=destinations),
           _fake_mirrored(mean, destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.MEAN,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.MEAN,
               per_replica_2,
               destinations=destinations),
           _fake_mirrored(mean_2, destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM, per_replica,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM, per_replica,
               destinations=destinations),
           _fake_mirrored(mean * len(devices), destinations))
       self._assert_values_equal(
-          cross_tower_ops.reduce(
-              vs.VariableAggregation.SUM,
+          cross_device_ops.reduce(
+              reduce_util.ReduceOp.SUM,
               per_replica_2,
               destinations=destinations),
           _fake_mirrored(mean_2 * len(devices), destinations))
@@ -168,16 +166,16 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     # test batch_reduce()
     for d1, d2 in itertools.product(all_destinations, all_destinations):
       self._assert_values_equal(
-          cross_tower_ops.batch_reduce(
-              vs.VariableAggregation.MEAN,
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.MEAN,
               [(per_replica, d1), (per_replica_2, d2)]),
           [
               _fake_mirrored(mean, d1),
               _fake_mirrored(mean_2, d2)
           ])
       self._assert_values_equal(
-          cross_tower_ops.batch_reduce(
-              vs.VariableAggregation.SUM,
+          cross_device_ops.batch_reduce(
+              reduce_util.ReduceOp.SUM,
               [(per_replica, d1), (per_replica_2, d2)]),
           [
               _fake_mirrored(mean * len(devices), d1),
@@ -187,7 +185,7 @@ class CrossDeviceOpsTestBase(test.TestCase, parameterized.TestCase):
     # test broadcast()
     for destinations in all_destinations:
       self._assert_values_equal(
-          cross_tower_ops.broadcast(constant_op.constant(1.), destinations),
+          cross_device_ops.broadcast(constant_op.constant(1.), destinations),
           _fake_mirrored(1., destinations))
 
 
@@ -196,62 +194,65 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
   # combinations module so that we can pass in devices instead of a distribution
   # strategy.
   reduction_to_one_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "DefaultReductionToOneDeviceCrossDeviceOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
           combinations.NamedObject(
               "ReductionToCPUDeviceCrossDeviceOps",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
                   reduce_to_device=_cpu_device)),
           combinations.NamedObject(
               "AccumulateNCrossDeviceOp",
-              cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+              cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
                   accumulation_fn=math_ops.accumulate_n)),
       ],
       distribution=[
           combinations.one_device_strategy,
           combinations.mirrored_strategy_with_gpu_and_cpu,
-          combinations.mirrored_strategy_with_two_gpus
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_two_gpus
       ],
       mode=["graph", "eager"])
   allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "AllReduce",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 1, 0, 0)),
           combinations.NamedObject(
               "HierarchicalCopy",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps(
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 8, 0, 0)),
           combinations.NamedObject(
               "AllReduceNoGradientRepacking",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
+              cross_device_ops_lib.AllReduceCrossDeviceOps("nccl", 0, 0, 0)),
           combinations.NamedObject(
               "HierarchicalCopyAggregateSmallTensors",
-              cross_tower_ops_lib.AllReduceCrossDeviceOps(
+              cross_device_ops_lib.AllReduceCrossDeviceOps(
                   "hierarchical_copy", 0, 100, 10))
       ],
-      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      distribution=[combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=["graph", "eager"])
 
   @combinations.generate(reduction_to_one_combinations + allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
     with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
 
   def testChooseAlgorithm(self):
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7],
                     [0, 5, 6, 7], [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
     self.assertEqual(result._num_packs, 8)
 
     # if there are only 4 devices
     device_links = [[1, 2, 3, 4], [0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "nccl")
     self.assertEqual(result._num_packs, 1)
 
@@ -259,16 +260,16 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     device_links = [[0, 1, 2, 3, 4], [0, 1, 2, 3, 5], [0, 1, 2, 3, 6],
                     [0, 1, 2, 3, 7], [0, 4, 5, 6, 7], [1, 4, 5, 6, 7],
                     [2, 4, 5, 6, 7], [3, 4, 5, 6, 7]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "hierarchical_copy")
     self.assertEqual(result._num_packs, 8)
 
     # if not dgx1-like links
     device_links = [[0, 2, 3, 5], [0, 1, 3, 6], [0, 1, 2, 7], [0, 5, 6, 7],
                     [1, 4, 6, 7], [2, 4, 5, 7], [3, 4, 5, 6], [1, 2, 3, 4]]
-    result = cross_tower_ops_lib._choose_all_reduce_algorithm(device_links)
-    self.assertIsInstance(result, cross_tower_ops_lib.AllReduceCrossDeviceOps)
+    result = cross_device_ops_lib._choose_all_reduce_algorithm(device_links)
+    self.assertIsInstance(result, cross_device_ops_lib.AllReduceCrossDeviceOps)
     self.assertEqual(result._all_reduce_alg, "nccl")
     self.assertEqual(result._num_packs, 1)
 
@@ -280,8 +281,8 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     t0 = _make_indexed_slices([[1., 2.]], [1], [5, 2], devices[0])
     t1 = _make_indexed_slices([[3., 4.], [5., 6.]], [1, 3], [5, 2], devices[1])
     per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
-    result = cross_tower_ops_lib._simple_reduce(
-        per_replica, devices[0], math_ops.add_n, vs.VariableAggregation.SUM)
+    result = cross_device_ops_lib._simple_reduce(
+        per_replica, devices[0], math_ops.add_n, reduce_util.ReduceOp.SUM)
 
     # Test that the result is semantically equal to both the concatenated
     # IndexedSlices with and without duplicate indices.
@@ -294,19 +295,19 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
 
   @combinations.generate(
       combinations.combine(
-          cross_tower_ops_instance=[
+          cross_device_ops_instance=[
               combinations.NamedObject(
                   "ReductionToOneDeviceCrossDeviceOps",
-                  cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
+                  cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps()),
               combinations.NamedObject(
                   "AllReduceCrossDeviceOps",
-                  cross_tower_ops_lib.AllReduceCrossDeviceOps())
+                  cross_device_ops_lib.AllReduceCrossDeviceOps())
           ],
-          aggregation=[vs.VariableAggregation.SUM, vs.VariableAggregation.MEAN],
+          reduce_op=[reduce_util.ReduceOp.SUM, reduce_util.ReduceOp.MEAN],
           batch_reduce=[True, False],
           mode=["graph", "eager"],
           required_gpus=1))
-  def testIndexedSlicesAllReduce(self, cross_tower_ops_instance, aggregation,
+  def testIndexedSlicesAllReduce(self, cross_device_ops_instance, reduce_op,
                                  batch_reduce):
     devices = ["/cpu:0", "/gpu:0"]
     dense_shape = [5, 2]
@@ -316,20 +317,20 @@ class SingleWorkerCrossDeviceOpsTest(CrossDeviceOpsTestBase):
     per_replica = value_lib.PerReplica({devices[0]: t0, devices[1]: t1})
 
     if batch_reduce:
-      result = cross_tower_ops_instance.batch_reduce(
-          aggregation, [(per_replica, devices)])
+      result = cross_device_ops_instance.batch_reduce(
+          reduce_op, [(per_replica, per_replica)])
     else:
-      result = cross_tower_ops_instance.reduce(
-          aggregation, per_replica, devices)
+      result = cross_device_ops_instance.reduce(
+          reduce_op, per_replica, per_replica)
 
     total_indices_with_dups = [1, 1, 3]
     total_indices_without_dups = [1, 3]
 
-    if aggregation == vs.VariableAggregation.SUM:
+    if reduce_op == reduce_util.ReduceOp.SUM:
       total_values_with_dups = [[1., 2.], [3., 4.], [5., 6.]]
       total_values_without_dups = [[4., 6.], [5., 6.]]
     else:
-      assert aggregation == vs.VariableAggregation.MEAN
+      assert reduce_op == reduce_util.ReduceOp.MEAN
       total_values_with_dups = [[0.5, 1.], [1.5, 2.], [2.5, 3.]]
       total_values_without_dups = [[2., 3.], [2.5, 3.]]
 
@@ -356,49 +357,63 @@ class MultiWorkerCrossDeviceOpsTest(multi_worker_test_base.MultiWorkerTestBase,
       "/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"
   ]
   multi_worker_allreduce_combinations = combinations.combine(
-      cross_tower_ops=[
+      cross_device_ops=[
           combinations.NamedObject(
               "MultiWorkerAllReduce",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 0, 0)),
           combinations.NamedObject(
               "MultiWorkerAllReducePack",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 1, 0, 0)),
           combinations.NamedObject(
               "MultiWorkerAllReduceAggregation",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, ("pscpu/pscpu", 2, -1), 0, 100, 10)),
           combinations.NamedObject(
               "MultiWorkerAllReduceMultipleSpecs",
-              cross_tower_ops_lib.MultiWorkerAllReduce(
+              cross_device_ops_lib.MultiWorkerAllReduce(
                   worker_devices, 2, [("pscpu/pscpu", 2, 100),
                                       ("xring", 2, -1)], 0, 0, 0)),
       ],
       distribution=[
           combinations.NamedDistribution(
               "MirroredCPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=0),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=0),
               required_gpus=0),
           combinations.NamedDistribution(
               "Mirrored1GPU",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=1),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=1),
               required_gpus=1),
           combinations.NamedDistribution(
               "Mirrored2GPUs",
-              lambda: mirrored_strategy.MirroredStrategy(num_gpus=2),
+              lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=2),
+              required_gpus=2),
+          # pylint: disable=g-long-lambda
+          combinations.NamedDistribution(
+              "CoreMirroredCPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:CPU:0"]),
+              required_gpus=0),
+          combinations.NamedDistribution(
+              "CoreMirrored1GPU",
+              lambda: mirrored_strategy.CoreMirroredStrategy(["/device:GPU:0"]),
+              required_gpus=1),
+          combinations.NamedDistribution(
+              "CoreMirrored2GPUs",
+              lambda: mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"]),
               required_gpus=2),
       ],
       mode=["graph"])
 
   @combinations.generate(multi_worker_allreduce_combinations)
-  def testReductionAndBroadcast(self, cross_tower_ops, distribution):
+  def testReductionAndBroadcast(self, cross_device_ops, distribution):
     distribution.configure(cluster_spec={
         "worker":
             ["/job:worker/replica:0/task:0", "/job:worker/replica:0/task:1"]
     })
     with distribution.scope():
-      self._testReductionAndBroadcast(cross_tower_ops, distribution)
+      self._testReductionAndBroadcast(cross_device_ops, distribution)
 
 
 class MultiWorkerCollectiveAllReduceTest(
@@ -419,7 +434,7 @@ class MultiWorkerCollectiveAllReduceTest(
     MultiWorkerCollectiveAllReduceTest.collective_key_base += 100000
 
   def _get_test_objects(self, task_type, task_id, num_gpus=0, local_mode=False):
-    collective_keys = cross_tower_utils.CollectiveKeys(
+    collective_keys = cross_device_utils.CollectiveKeys(
         group_key_start=10 * num_gpus +
         MultiWorkerCollectiveAllReduceTest.collective_key_base,
         instance_key_start=num_gpus * 100 +
@@ -427,7 +442,7 @@ class MultiWorkerCollectiveAllReduceTest(
         instance_key_with_id_start=num_gpus * 10000 +
         MultiWorkerCollectiveAllReduceTest.collective_key_base)
     if local_mode:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
           1, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = ["/device:GPU:%d" % i for i in range(num_gpus)]
@@ -435,7 +450,7 @@ class MultiWorkerCollectiveAllReduceTest(
         devices = ["/device:CPU:0"]
       return collective_all_reduce_ops, devices, ""
     else:
-      collective_all_reduce_ops = cross_tower_ops_lib.CollectiveAllReduce(
+      collective_all_reduce_ops = cross_device_ops_lib.CollectiveAllReduce(
           3, num_gpus, collective_keys=collective_keys)
       if num_gpus:
         devices = [
@@ -491,37 +506,35 @@ class MultiWorkerCollectiveAllReduceTest(
       destination_mirrored = _fake_mirrored(1., devices)
       destination_different = _fake_mirrored(1., _cpu_device)
       destination_str = _cpu_device
-      destination_list = devices
 
       all_destinations = [
-          destination_different, destination_mirrored, destination_str,
-          destination_list
+          destination_different, destination_mirrored, destination_str
       ]
 
       # test reduce()
       for destinations in all_destinations:
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
+                reduce_util.ReduceOp.MEAN,
                 per_replica,
                 destinations=destinations),
             _fake_mirrored(mean, destinations), sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.MEAN,
+                reduce_util.ReduceOp.MEAN,
                 per_replica_2,
                 destinations=destinations),
             _fake_mirrored(mean_2, destinations), sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
+                reduce_util.ReduceOp.SUM,
                 per_replica,
                 destinations=destinations),
             _fake_mirrored(mean * len(devices) * num_workers, destinations),
             sess)
         self._assert_values_equal(
             collective_all_reduce.reduce(
-                vs.VariableAggregation.SUM,
+                reduce_util.ReduceOp.SUM,
                 per_replica_2,
                 destinations=destinations),
             _fake_mirrored(mean_2 * len(devices) * num_workers, destinations),
@@ -530,7 +543,7 @@ class MultiWorkerCollectiveAllReduceTest(
       # test batch_reduce()
       for d1, d2 in itertools.product(all_destinations, all_destinations):
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.MEAN,
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.MEAN,
                                                [(per_replica, d1),
                                                 (per_replica_2, d2)]),
             [
@@ -538,7 +551,7 @@ class MultiWorkerCollectiveAllReduceTest(
                 _fake_mirrored(mean_2, d2)
             ], sess)
         self._assert_values_equal(
-            collective_all_reduce.batch_reduce(vs.VariableAggregation.SUM,
+            collective_all_reduce.batch_reduce(reduce_util.ReduceOp.SUM,
                                                [(per_replica, d1),
                                                 (per_replica_2, d2)]),
             [
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
similarity index 76%
rename from tensorflow/contrib/distribute/python/cross_tower_utils_test.py
rename to tensorflow/contrib/distribute/python/cross_device_utils_test.py
index a991156ca87fb666f9e47462ccf2bbbe305fe925..2303a31677afbd12a0b8e7eea3ecf7c7736c46ad 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils_test.py
+++ b/tensorflow/contrib/distribute/python/cross_device_utils_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for cross_tower_utils."""
+"""Tests for cross_device_utils."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,14 +21,14 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
-from tensorflow.python.training import device_util
 
 
 class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
@@ -43,7 +43,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t0 = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     t1 = constant_op.constant([[0., 0.], [5, 6], [7., 8.]])
     total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
     self._assert_values_equal(total, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -53,7 +53,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
     total = constant_op.constant([[1., 2.], [5, 6], [10., 12.]])
-    result = cross_tower_utils.aggregate_tensors_or_indexed_slices([t0, t1])
+    result = cross_device_utils.aggregate_tensors_or_indexed_slices([t0, t1])
     self.assertIsInstance(result, ops.IndexedSlices)
     self._assert_values_equal(total, result)
 
@@ -62,7 +62,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     n = 2
     expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
     self._assert_values_equal(expected, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -71,7 +71,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     n = 2
     expected = constant_op.constant([[0.5, 1.], [0, 0], [1.5, 2.]])
-    result = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(t, n)
+    result = cross_device_utils.divide_by_n_tensors_or_indexed_slices(t, n)
     self.assertIsInstance(result, ops.IndexedSlices)
     self._assert_values_equal(expected, result)
 
@@ -79,7 +79,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
   def testIsIndexedSlices(self):
     t = math_ops._as_indexed_slices(
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(t))
+    self.assertTrue(cross_device_utils.contains_indexed_slices(t))
 
   @test_util.run_in_graph_and_eager_modes
   def testContainsIndexedSlices_List(self):
@@ -87,7 +87,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices([t0, t1]))
+    self.assertTrue(cross_device_utils.contains_indexed_slices([t0, t1]))
 
   @test_util.run_in_graph_and_eager_modes
   def testContainsIndexedSlices_Tuple(self):
@@ -95,7 +95,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
         constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    self.assertTrue(cross_tower_utils.contains_indexed_slices((t0, t1)))
+    self.assertTrue(cross_device_utils.contains_indexed_slices((t0, t1)))
 
   @test_util.run_in_graph_and_eager_modes
   def testContainsIndexedSlices_PerReplica(self):
@@ -104,18 +104,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     t1 = math_ops._as_indexed_slices(
         constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
     per_replica = value_lib.PerReplica({"/gpu:0": t0, "/cpu:0": t1})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testContainsIndexedSlices_PerReplicaMapOutput(self):
-    t0 = math_ops._as_indexed_slices(
-        constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
-    t1 = math_ops._as_indexed_slices(
-        constant_op.constant([[0., 0.], [5, 6], [7., 8.]]))
-    per_replica = value_lib.PerReplica({
-        "/gpu:0": value_lib.MapOutput([t0]),
-        "/cpu:0": value_lib.MapOutput([t1])})
-    self.assertTrue(cross_tower_utils.contains_indexed_slices(per_replica))
+    self.assertTrue(cross_device_utils.contains_indexed_slices(per_replica))
 
   @combinations.generate(combinations.combine(
       mode=["graph", "eager"],
@@ -124,7 +113,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
     with ops.device("/cpu:0"):
       t = constant_op.constant([[1., 2.], [0, 0], [3., 4.]])
     destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         t, destination)
 
     self._assert_values_equal(t, result)
@@ -139,7 +128,7 @@ class IndexedSlicesUtilsTest(test.TestCase, parameterized.TestCase):
       t = math_ops._as_indexed_slices(
           constant_op.constant([[1., 2.], [0, 0], [3., 4.]]))
     destination = "/gpu:0"
-    result = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    result = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         t, destination)
 
     self.assertIsInstance(result, ops.IndexedSlices)
diff --git a/tensorflow/contrib/distribute/python/estimator_integration_test.py b/tensorflow/contrib/distribute/python/estimator_integration_test.py
index a1355c0b09e51c18cc4f8967dfc2c472d63593b9..e17085628ba6d1dfc79839fd824801723f07a518 100644
--- a/tensorflow/contrib/distribute/python/estimator_integration_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_integration_test.py
@@ -34,7 +34,7 @@ from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import gfile
 from tensorflow.python.summary.writer import writer_cache
@@ -63,7 +63,9 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
           distribution=[
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
           ],
           use_train_and_evaluate=[True, False]))
   def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
@@ -75,12 +77,12 @@ class DNNLinearCombinedClassifierIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=True)
     eval_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices),
+        batch_size=batch_size // distribution.num_replicas_in_sync,
         shuffle=False)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
diff --git a/tensorflow/contrib/distribute/python/estimator_training_test.py b/tensorflow/contrib/distribute/python/estimator_training_test.py
index 8f82b4c92aa4305af121855972df4947c963850d..b369a7fefe6f35cf5a9b64451419cf4f72a99471 100644
--- a/tensorflow/contrib/distribute/python/estimator_training_test.py
+++ b/tensorflow/contrib/distribute/python/estimator_training_test.py
@@ -24,7 +24,6 @@ import json
 import os
 import sys
 import tempfile
-import threading
 from absl.testing import parameterized
 import numpy as np
 
@@ -45,11 +44,13 @@ from tensorflow.python.estimator import training as estimator_training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export as export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary_iterator
 from tensorflow.python.summary.writer import writer_cache
+from tensorflow.python.training import session_manager
+
 
 BATCH_SIZE = 10
 LABEL_DIMENSION = 2
@@ -68,57 +69,19 @@ PS = dc._TaskType.PS
 original_run_std_server = dc._run_std_server
 
 
-class MockOsEnv(dict):
-
-  def __init__(self, *args):
-    self._thread_local = threading.local()
-    super(MockOsEnv, self).__init__(*args)
-
-  def get(self, key, default):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.get(self._thread_local.dict, key, default)
-    else:
-      return dict.get(self, key, default)
-
-  def __getitem__(self, key):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__getitem__(self._thread_local.dict, key)
-    else:
-      return dict.__getitem__(self, key)
-
-  def __setitem__(self, key, val):
-    if not hasattr(self._thread_local, "dict"):
-      self._thread_local.dict = dict()
-    if key == "TF_CONFIG":
-      return dict.__setitem__(self._thread_local.dict, key, val)
-    else:
-      return dict.__setitem__(self, key, val)
-
-
-class DistributeCoordinatorIntegrationTest(test.TestCase,
-                                           parameterized.TestCase):
+class DistributeCoordinatorIntegrationTest(
+    multi_worker_test_base.IndependentWorkerTestBase, parameterized.TestCase):
 
   @classmethod
   def setUpClass(cls):
     """Create a local cluster with 2 workers."""
+    super(DistributeCoordinatorIntegrationTest, cls).setUpClass()
     cls._cluster_spec = multi_worker_test_base.create_in_process_cluster(
         num_workers=3, num_ps=2, has_eval=True)
 
   def setUp(self):
     self._model_dir = tempfile.mkdtemp()
-    self._mock_os_env = MockOsEnv()
-    self._mock_context = test.mock.patch.object(os, "environ",
-                                                self._mock_os_env)
     super(DistributeCoordinatorIntegrationTest, self).setUp()
-    self._mock_context.__enter__()
-
-  def tearDown(self):
-    self._mock_context.__exit__(None, None, None)
-    super(DistributeCoordinatorIntegrationTest, self).tearDown()
 
   def dataset_input_fn(self, x, y, batch_size, shuffle):
 
@@ -141,8 +104,8 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   def _extract_loss_and_global_step(self, event_folder):
     """Returns the loss and global step in last event."""
     event_paths = glob.glob(os.path.join(event_folder, "events*"))
-    self.assertGreater(len(event_paths), 0,
-                       msg="Event file not found in dir %s" % event_folder)
+    self.assertNotEmpty(
+        event_paths, msg="Event file not found in dir %s" % event_folder)
 
     loss = None
     global_step_count = None
@@ -202,10 +165,10 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     train_input_fn = self.dataset_input_fn(
         x={"x": DATA},
         y=DATA,
-        batch_size=BATCH_SIZE // len(train_distribute.worker_devices),
+        batch_size=BATCH_SIZE // train_distribute.num_replicas_in_sync,
         shuffle=True)
     if eval_distribute:
-      eval_batch_size = BATCH_SIZE // len(eval_distribute.worker_devices)
+      eval_batch_size = BATCH_SIZE // eval_distribute.num_replicas_in_sync
     else:
       eval_batch_size = BATCH_SIZE
     eval_input_fn = self.dataset_input_fn(
@@ -285,27 +248,34 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     ])
     self.assertAllEqual((BATCH_SIZE, LABEL_DIMENSION), predicted_proba.shape)
 
+  def _get_strategy_object(self, strategy_cls):
+    if strategy_cls == mirrored_strategy.CoreMirroredStrategy:
+      return strategy_cls(mirrored_strategy.all_local_devices())
+    else:
+      return strategy_cls(num_gpus_per_worker=context.num_gpus())
+
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
           train_distribute_cls=[
               collective_all_reduce_strategy.CollectiveAllReduceStrategy,
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy
           ],
           eval_distribute_cls=[
-              None, mirrored_strategy.MirroredStrategy,
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy,
           ],
           required_gpus=[0, 1]))
   def test_complete_flow_standalone_client(self, train_distribute_cls,
                                            eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -322,20 +292,20 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           mode=["graph"],
           train_distribute_cls=[
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
           ],
           eval_distribute_cls=[
               None,
               mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
           ],
           required_gpus=[0, 1]))
   def test_estimator_standalone_client(self, train_distribute_cls,
                                        eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -355,47 +325,15 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
     self._barrier.wait()
     return ret
 
-  def _task_thread(self, train_distribute, eval_distribute, tf_config):
-    os.environ["TF_CONFIG"] = json.dumps(tf_config)
+  def _independent_worker_fn(
+      self,
+      train_distribute,
+      eval_distribute,
+  ):
     with test.mock.patch.object(dc, "_run_std_server",
                                 self._mock_run_std_server):
       self._complete_flow(train_distribute, eval_distribute)
 
-  def _run_task_in_thread(self, cluster_spec, task_type, task_id,
-                          train_distribute, eval_distribute):
-    if task_type:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    else:
-      tf_config = {
-          "cluster": cluster_spec,
-          "task": {
-              "type": task_type,
-              "index": task_id
-          }
-      }
-    t = threading.Thread(
-        target=self._task_thread,
-        args=(train_distribute, eval_distribute, tf_config))
-    t.start()
-    return t
-
-  def _run_multiple_tasks_in_threads(self, cluster_spec, train_distribute,
-                                     eval_distribute):
-    threads = {}
-    for task_type in cluster_spec.keys():
-      threads[task_type] = []
-      for task_id in range(len(cluster_spec[task_type])):
-        t = self._run_task_in_thread(cluster_spec, task_type, task_id,
-                                     train_distribute, eval_distribute)
-        threads[task_type].append(t)
-    return threads
-
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
@@ -405,21 +343,20 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
           ],
           eval_distribute_cls=[
               None, mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy,
               parameter_server_strategy.ParameterServerStrategy,
           ],
           required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_between_graph(
       self, train_distribute_cls, eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
-
     if (context.num_gpus() < 2 and eval_distribute_cls ==
         collective_all_reduce_strategy.CollectiveAllReduceStrategy):
       self.skipTest("`CollectiveAllReduceStrategy` needs at least two towers.")
 
+    train_distribute = self._get_strategy_object(train_distribute_cls)
+
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -435,8 +372,9 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
       # 3 workers and 1 evaluator.
       self._barrier = dc._Barrier(4)
 
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
     for task_type, ts in threads.items():
       if task_type == PS:
         continue
@@ -449,17 +387,22 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
   @combinations.generate(
       combinations.combine(
           mode=["graph"],
-          train_distribute_cls=[mirrored_strategy.MirroredStrategy],
-          eval_distribute_cls=[None, mirrored_strategy.MirroredStrategy],
+          train_distribute_cls=[
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
+          eval_distribute_cls=[
+              None,
+              mirrored_strategy.MirroredStrategy,
+              mirrored_strategy.CoreMirroredStrategy
+          ],
           required_gpus=[0, 1]))
   def test_complete_flow_indepedent_worker_in_graph(self, train_distribute_cls,
                                                     eval_distribute_cls):
-    train_distribute = train_distribute_cls(
-        num_gpus_per_worker=context.num_gpus())
+    train_distribute = self._get_strategy_object(train_distribute_cls)
 
     if eval_distribute_cls:
-      eval_distribute = eval_distribute_cls(
-          num_gpus_per_worker=context.num_gpus())
+      eval_distribute = self._get_strategy_object(eval_distribute_cls)
     else:
       eval_distribute = None
 
@@ -467,8 +410,9 @@ class DistributeCoordinatorIntegrationTest(test.TestCase,
         num_workers=3, num_ps=0, has_eval=True)
     # 3 workers and 1 evaluator.
     self._barrier = dc._Barrier(4)
-    threads = self._run_multiple_tasks_in_threads(
-        cluster_spec, train_distribute, eval_distribute)
+    threads = self.run_multiple_tasks_in_threads(self._independent_worker_fn,
+                                                 cluster_spec, train_distribute,
+                                                 eval_distribute)
     threads[WORKER][0].join()
     threads[EVALUATOR][0].join()
 
@@ -506,7 +450,8 @@ class RunConfigTest(test.TestCase):
         "os.environ", {"TF_CONFIG": json.dumps(TF_CONFIG_WITHOUT_TASK)}):
       run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
 
   def test_should_run_distribute_coordinator(self):
     """Tests that should_run_distribute_coordinator return a correct value."""
@@ -529,10 +474,12 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config_with_train_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
       config_with_eval_distribute = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertTrue(
         dc_training.should_run_distribute_coordinator(
             config_with_train_distribute))
@@ -545,26 +492,27 @@ class RunConfigTest(test.TestCase):
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy(num_gpus=2)))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy(
+                  ["/device:GPU:0", "/device:GPU:1"])))
     self.assertFalse(dc_training.should_run_distribute_coordinator(config))
 
   def test_init_run_config_duplicate_distribute(self):
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy(),
+          train_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              train_distribute=mirrored_strategy.MirroredStrategy()))
+              train_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
     with self.assertRaises(ValueError):
       run_config_lib.RunConfig(
-          eval_distribute=mirrored_strategy.MirroredStrategy(),
+          eval_distribute=mirrored_strategy.CoreMirroredStrategy(),
           experimental_distribute=DistributeConfig(
-              eval_distribute=mirrored_strategy.MirroredStrategy()))
+              eval_distribute=mirrored_strategy.CoreMirroredStrategy()))
 
   def test_init_run_config_none_distribute_coordinator_mode(self):
     # We don't use distribute coordinator for local training.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy())
+        train_distribute=mirrored_strategy.CoreMirroredStrategy())
     dc_training.init_run_config(config, {})
     self.assertIsNone(config._distribute_coordinator_mode)
 
@@ -572,7 +520,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_MASTER)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
       self.assertIsNone(config._distribute_coordinator_mode)
 
     # When `train_distribute` is not specified, don't use distribute
@@ -588,7 +536,7 @@ class RunConfigTest(test.TestCase):
     with test.mock.patch.dict("os.environ",
                               {"TF_CONFIG": json.dumps(TF_CONFIG_WITH_CHIEF)}):
       config = run_config_lib.RunConfig(
-          train_distribute=mirrored_strategy.MirroredStrategy())
+          train_distribute=mirrored_strategy.CoreMirroredStrategy())
     self.assertEqual(config._distribute_coordinator_mode,
                      dc.CoordinatorMode.INDEPENDENT_WORKER)
 
@@ -597,7 +545,7 @@ class RunConfigTest(test.TestCase):
     # `experimental.remote_cluster` is set use distribute coordinator with
     # STANDALONE_CLIENT mode.
     config = run_config_lib.RunConfig(
-        train_distribute=mirrored_strategy.MirroredStrategy(),
+        train_distribute=mirrored_strategy.CoreMirroredStrategy(),
         experimental_distribute=DistributeConfig(
             remote_cluster={"chief": ["fake_worker"]}))
     self.assertEqual(config._distribute_coordinator_mode,
@@ -605,5 +553,15 @@ class RunConfigTest(test.TestCase):
 
 
 if __name__ == "__main__":
+  # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+  orig_init = session_manager.SessionManager.__init__
+
+  def new_init(*args, **kwargs):
+    kwargs.pop("recovery_wait_secs", None)
+    kwargs["recovery_wait_secs"] = 0.5
+    orig_init(*args, **kwargs)
+
+  session_manager.SessionManager.__init__ = new_init
+
   with test.mock.patch.object(sys, "exit", os._exit):
     test.main()
diff --git a/tensorflow/contrib/distribute/python/examples/keras_mnist.py b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
index 0fd3acd045170c04ebdaa9c84d0cb7267a4bc68a..60fda996642464135fe1fb8c314bcf7f04d19362 100644
--- a/tensorflow/contrib/distribute/python/examples/keras_mnist.py
+++ b/tensorflow/contrib/distribute/python/examples/keras_mnist.py
@@ -20,6 +20,10 @@ from __future__ import print_function
 import tensorflow as tf
 
 
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+
+
 NUM_CLASSES = 10
 
 
@@ -102,18 +106,23 @@ def main(_):
   # Build the train and eval datasets from the MNIST data. Also return the
   # input shape which is constructed based on the `image_data_format`
   # i.e channels_first or channels_last.
+  tf.enable_eager_execution()
+
   train_ds, eval_ds, input_shape = get_input_datasets()
   model = get_model(input_shape)
 
   # Instantiate the MirroredStrategy object. If we don't specify `num_gpus` or
   # the `devices` argument then all the GPUs available on the machine are used.
-  strategy = tf.contrib.distribute.MirroredStrategy()
+  # TODO(priyag): Use `tf.distribute.MirroredStrategy` once available.
+  strategy = mirrored_strategy.MirroredStrategy(['/gpu:0', '/cpu:0'])
+
+  optimizer = rmsprop.RMSProp(learning_rate=0.001)
 
   # Compile the model by passing the distribution strategy object to the
   # `distribute` argument. `fit`, `evaluate` and `predict` will be distributed
   # based on the strategy instantiated.
   model.compile(loss=tf.keras.losses.categorical_crossentropy,
-                optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
+                optimizer=optimizer,
                 metrics=['accuracy'],
                 distribute=strategy)
 
diff --git a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
index 46a1cf41c55b371e87979ca625765e0531ac188b..6dfd85bcc4f3784e2744fd876a7190cc9581d96a 100644
--- a/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
+++ b/tensorflow/contrib/distribute/python/keras_optimizer_v2_test.py
@@ -25,18 +25,23 @@ import numpy as np
 import six
 
 from tensorflow.contrib.distribute.python import combinations
-from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.eager import context
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.estimator import training
 from tensorflow.python.estimator.canned import dnn_linear_combined
 from tensorflow.python.estimator.canned import prediction_keys
 from tensorflow.python.estimator.export import export
 from tensorflow.python.estimator.inputs import numpy_io
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
@@ -64,7 +69,9 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
           distribution=[
               combinations.one_device_strategy,
               combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus
+              combinations.mirrored_strategy_with_two_gpus,
+              combinations.core_mirrored_strategy_with_gpu_and_cpu,
+              combinations.core_mirrored_strategy_with_two_gpus
           ],
           use_train_and_evaluate=[True, False]))
   def test_complete_flow_with_mode(self, distribution, use_train_and_evaluate):
@@ -76,11 +83,11 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
     train_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices))
+        batch_size=batch_size // distribution.num_replicas_in_sync)
     eval_input_fn = self.dataset_input_fn(
         x={'x': data},
         y=data,
-        batch_size=batch_size // len(distribution.worker_devices))
+        batch_size=batch_size // distribution.num_replicas_in_sync)
     predict_input_fn = numpy_io.numpy_input_fn(
         x={'x': data}, batch_size=batch_size, shuffle=False)
 
@@ -136,44 +143,51 @@ class KerasOptimizerV2IntegrationTest(test.TestCase, parameterized.TestCase):
       shutil.rmtree(self._model_dir)
 
 
-class MirroredStrategyOptimizerV2Test(test.TestCase):
+def get_model():
+  x = keras.layers.Input(shape=(3,), name='input')
+  y = keras.layers.Dense(4, name='dense')(x)
+  model = keras.Model(x, y)
+  return model
 
-  def testKerasOptimizerWithUnequalInput(self):
-    if context.num_gpus() < 1:
-      self.skipTest('Not enough GPUs.')
 
-    def create_fn(device_id):
+class MirroredStrategyOptimizerV2Test(test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testKerasOptimizerWithUnequalInput(self, distribution):
+    def create_fn():
       var = variables.Variable(
           2.0, name='var', aggregation=variable_scope.VariableAggregation.SUM)
       # grad for cpu is 1, grad for gpu is 2, avg grad is 1.5.
-      loss = (device_id + 1) * var
+      loss = math_ops.cast(_replica_id() + 1, dtype=dtypes.float32) * var
       optimizer = adam.Adam(learning_rate=0.01, beta_1=0.2, beta_2=0.2)
       train_op = optimizer.minimize(loss, var_list=[var])
       m = optimizer.get_slot(var, 'm')
       v = optimizer.get_slot(var, 'v')
-      return (var, m, v, train_op, optimizer.iteration)
+      return (var, m, v, train_op, optimizer.iterations)
 
     devices = ['/device:GPU:0', '/device:CPU:0']
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      (var, m, v, op, counter) = dist.call_for_each_replica(
-          create_fn, args=[dist.worker_device_index])
+    with distribution.scope():
+      (var, m, v, op, counter) = distribution.call_for_each_replica(create_fn)
       self.evaluate(variables.global_variables_initializer())
       var_val = [2.0, 2.0, 2.0]
       self.assertAllClose(
           var_val,
           self.evaluate(
-              [dist.read_var(var),
+              [distribution.read_var(var),
                var.get(devices[0]),
                var.get(devices[1])]))
       self.assertAllClose([0, 0, 0],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
 
-      train_op = dist.unwrap(op)
+      train_op = distribution.unwrap(op)
       self.evaluate(train_op)
       # m(1) = beta1 * m(0) + (1-beta1) * grad = 0.2 * 0 + 0.8 * (1 + 2) / 2
       m_val = [1.2, 1.2, 1.2]
@@ -181,7 +195,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           m_val,
           self.evaluate(
-              [dist.read_var(m),
+              [distribution.read_var(m),
                m.get(devices[0]),
                m.get(devices[1])]))
       # v(1) = beta2 * v(0) + (1-beta2) * grad^2 = 0.2 * 0 + 0.8 * 2.25
@@ -189,7 +203,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           v_val,
           self.evaluate(
-              [dist.read_var(v),
+              [distribution.read_var(v),
                v.get(devices[0]),
                v.get(devices[1])]))
       # var(1) = var(0) - lr * m(1) * sqrt(1 - beta2) / sqrt(v(1)) / (1 - beta1)
@@ -198,12 +212,12 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           var_val,
           self.evaluate(
-              [dist.read_var(var),
+              [distribution.read_var(var),
                var.get(devices[0]),
                var.get(devices[1])]))
       self.assertAllClose([1, 1, 1],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
@@ -214,7 +228,7 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           m_val,
           self.evaluate(
-              [dist.read_var(m),
+              [distribution.read_var(m),
                m.get(devices[0]),
                m.get(devices[1])]))
       # v(2) = beta2 * v(1) + (1-beta2) * grad^2 = 0.2 * 1.8 + 0.8 * 2.25
@@ -222,16 +236,50 @@ class MirroredStrategyOptimizerV2Test(test.TestCase):
       self.assertAllClose(
           v_val,
           self.evaluate(
-              [dist.read_var(v),
+              [distribution.read_var(v),
                v.get(devices[0]),
                v.get(devices[1])]))
       self.assertAllClose([2, 2, 2],
                           self.evaluate([
-                              dist.read_var(counter),
+                              distribution.read_var(counter),
                               counter.get(devices[0]),
                               counter.get(devices[1])
                           ]))
 
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph']))
+  def testOptimizerWithKerasModelAndNumpyArrays(self, distribution):
+
+    with self.cached_session():
+      model = get_model()
+      optimizer = gradient_descent.SGD(0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
+
+      inputs = np.zeros((64, 3), dtype=np.float32)
+      targets = np.zeros((64, 4), dtype=np.float32)
+
+      model.fit(
+          inputs,
+          targets,
+          epochs=1,
+          batch_size=2,
+          verbose=0,
+          validation_data=(inputs, targets))
+      model.evaluate(inputs, targets)
+      model.predict(inputs)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/keras_test.py b/tensorflow/contrib/distribute/python/keras_test.py
index 0db5844e4c40e84c635b063523b95226241d07fb..683cc89bfbae9c877ea6794d311ffc00c96c6937 100644
--- a/tensorflow/contrib/distribute/python/keras_test.py
+++ b/tensorflow/contrib/distribute/python/keras_test.py
@@ -24,9 +24,10 @@ import numpy as np
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import tpu_strategy
-from tensorflow.contrib.distribute.python import values
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import test
 from tensorflow.python.estimator import keras as keras_lib
 from tensorflow.python.estimator import run_config as run_config_lib
 from tensorflow.python.framework import constant_op
@@ -35,14 +36,13 @@ from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.engine import distributed_training_utils
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_keras
 from tensorflow.python.ops.parsing_ops import gen_parsing_ops
 from tensorflow.python.platform import gfile
-from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import rmsprop
 
-
 _RANDOM_SEED = 1337
 _TRAIN_SIZE = 200
 _INPUT_SIZE = (10,)
@@ -165,7 +165,9 @@ def get_multi_inputs_multi_outputs_data():
   return (train_data, test_data)
 
 
-def batch_wrapper(dataset, batch_size, distribution):
+def batch_wrapper(dataset, batch_size, distribution, repeat=None):
+  if repeat:
+    dataset = dataset.repeat(repeat)
   # TPUs currently require fully defined input shapes, drop_remainder ensures
   # the input will have fully defined shapes.
   if isinstance(distribution, tpu_strategy.TPUStrategy):
@@ -212,13 +214,19 @@ def multi_input_output_model():
   return model
 
 
-def get_correctness_test_inputs(use_numpy, with_distribution,
+def get_correctness_test_inputs(use_numpy, use_validation_data,
+                                with_distribution,
                                 x_train, y_train, x_predict):
   """Generates the inputs for correctness check when enable Keras with DS."""
+  training_epochs = 2
   global_batch_size = 64
   batch_size = global_batch_size
   # TODO(b/118776054): Use global batch size for Keras/DS support.
-  if with_distribution:
+  use_per_core_batch_size = (
+      with_distribution and
+      not distributed_training_utils.global_batch_size_supported(
+          with_distribution))
+  if use_per_core_batch_size:
     batch_size //= with_distribution.num_replicas_in_sync
 
   if use_numpy:
@@ -226,19 +234,20 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
         'batch_size': batch_size,
         'x': x_train,
         'y': y_train,
-        'epochs': 1,
+        'epochs': training_epochs,
         'shuffle': False,
     }
-    eval_inputs = {
-        'batch_size': batch_size,
-        'x': x_train,
-        'y': y_train,
-    }
+
+    if use_validation_data:
+      eval_inputs = None
+      training_inputs['validation_data'] = (x_train, y_train)
+    else:
+      eval_inputs = {
+          'batch_size': batch_size,
+          'x': x_train,
+          'y': y_train,
+      }
     predict_inputs = {
-        # TODO(b/119318587): We should not require batch_size when distribution
-        # is enabled.
-        'batch_size': (len(x_predict) // with_distribution.num_replicas_in_sync
-                       if with_distribution else None),
         'x': np.array(x_predict, dtype=np.float32),
     }
   else:
@@ -246,30 +255,39 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
     # keras.fit/evaluate/predict. The batch size is part of the dataset.
     train_dataset = dataset_ops.Dataset.from_tensor_slices(
         (x_train, y_train))
-    x = batch_wrapper(train_dataset, batch_size, with_distribution)
+    x = batch_wrapper(
+        train_dataset, batch_size, with_distribution, repeat=training_epochs)
 
     training_inputs = {
         'batch_size': None,
         'x': x,
         'y': None,
-        'epochs': 1,
+        'epochs': training_epochs,
         'shuffle': False,
         'steps_per_epoch': len(x_train) // global_batch_size,
     }
-    eval_inputs = {
-        'batch_size': None,
-        'x': x,
-        'y': None,
-        'steps': 20,
-    }
+    if use_validation_data:
+      eval_inputs = None  # Remove the eval_inputs
+      eval_dataset = dataset_ops.Dataset.from_tensor_slices(
+          (x_train, y_train))
+      x = batch_wrapper(eval_dataset, batch_size, with_distribution)
+      training_inputs['validation_data'] = x
+      training_inputs['validation_steps'] = 5
+    else:
+      eval_inputs = {
+          'batch_size': None,
+          'x': x,
+          'y': None,
+          'steps': 20,
+      }
+
     predict_batch_size = len(x_predict)
-    if with_distribution:
+    if use_per_core_batch_size:
       predict_batch_size //= with_distribution.num_replicas_in_sync
     predict_dataset = dataset_ops.Dataset.from_tensor_slices(x_predict)
     predict_dataset = batch_wrapper(predict_dataset,
                                     predict_batch_size, with_distribution)
     predict_inputs = {
-        'batch_size': None,
         'steps': 1,
         'x': predict_dataset,
     }
@@ -277,47 +295,71 @@ def get_correctness_test_inputs(use_numpy, with_distribution,
   return training_inputs, eval_inputs, predict_inputs
 
 
-strategies = [combinations.default_strategy,
-              combinations.one_device_strategy,
-              combinations.mirrored_strategy_with_gpu_and_cpu,
-              combinations.mirrored_strategy_with_two_gpus,
-              combinations.tpu_strategy,  # steps_per_run=2
-              combinations.tpu_strategy_one_step]
+strategies_minus_tpu = [
+    combinations.default_strategy,
+    combinations.one_device_strategy,
+    combinations.mirrored_strategy_with_gpu_and_cpu,
+    combinations.mirrored_strategy_with_two_gpus,
+    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+    combinations.core_mirrored_strategy_with_two_gpus]
+
+tpu_strategies = [
+    combinations.tpu_strategy,  # steps_per_run=2
+    combinations.tpu_strategy_one_step]
 
 
 def strategy_minus_tpu_combinations():
   return combinations.combine(
-      distribution=[combinations.default_strategy,
-                    combinations.one_device_strategy,
-                    combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
-      mode=['graph'])
+      distribution=strategies_minus_tpu,
+      mode=['graph', 'eager'])
 
 
-def strategy_combinations():
+def tpu_strategy_combinations():
   return combinations.combine(
-      distribution=strategies,
+      distribution=tpu_strategies,
       mode=['graph'])
 
 
-def strategy_and_optimizer_combinations():
-  return combinations.combine(
-      distribution=strategies,
-      optimizer=[combinations.adagrad_optimizer_v1_fn,
-                 combinations.adam_optimizer_v1_fn,
-                 combinations.gradient_descent_optimizer_v1_fn,
-                 combinations.rmsprop_optimizer_v1_fn],
-      mode=['graph'])
+def all_strategy_combinations():
+  return strategy_minus_tpu_combinations() + tpu_strategy_combinations()
 
 
-def strategy_and_inputs():
+# TODO(priyag): Add v2 optimizers here.
+def strategy_and_optimizer_combinations():
+  return combinations.times(
+      all_strategy_combinations(),
+      combinations.combine(
+          optimizer=[combinations.adagrad_optimizer_v1_fn,
+                     combinations.adam_optimizer_v1_fn,
+                     combinations.gradient_descent_optimizer_v1_fn,
+                     combinations.rmsprop_optimizer_v1_fn]))
+
+
+def strategy_and_input_combinations():
+  return (
+      combinations.times(
+          combinations.combine(distribution=strategies_minus_tpu),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])
+          + combinations.combine(mode=['eager'],
+                                 use_numpy=[False],
+                                 use_validation_data=[False])) +
+      combinations.times(
+          combinations.combine(distribution=tpu_strategies),
+          combinations.combine(mode=['graph'],
+                               use_numpy=[True, False],
+                               use_validation_data=[True, False])))
+
+
+def strategy_for_numpy_input_combinations():
   return combinations.combine(
-      distribution=strategies,
-      use_numpy=[True, False],
+      distribution=strategies_minus_tpu + tpu_strategies,
       mode=['graph'])
 
 
-class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
+class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase,
+                                        parameterized.TestCase):
 
   def setUp(self):
     self._base_dir = os.path.join(self.get_temp_dir(),
@@ -325,17 +367,18 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     gfile.MakeDirs(self._base_dir)
     self._config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED, model_dir=self._base_dir)
-    self._dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
 
   def tearDown(self):
     writer_cache.FileWriterCache.clear()
     if os.path.isdir(self._base_dir):
       gfile.DeleteRecursively(self._base_dir)
 
-  def test_train_functional_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_functional_with_distribution_strategy(self, distribution):
     keras_model = simple_functional_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -343,8 +386,8 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist,
-                                      eval_distribute=dist)
+                                      train_distribute=distribution,
+                                      eval_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -358,9 +401,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_train_sequential_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_train_sequential_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -368,7 +414,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.01))
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(
           keras_model=keras_model, config=config)
@@ -382,7 +428,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
     writer_cache.FileWriterCache.clear()
     gfile.DeleteRecursively(self._config.model_dir)
 
-  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_multi_inputs_multi_outputs_with_input_fn_as_dict(self, distribution):
     train_data, test_data = get_multi_inputs_multi_outputs_data()
 
     def train_input_fn():
@@ -412,14 +463,14 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
                                                      output_dict)).batch(16)
 
     self.do_test_multi_inputs_multi_outputs_with_input_fn(
-        train_input_fn, eval_input_fn)
+        distribution, train_input_fn, eval_input_fn)
 
-  def do_test_multi_inputs_multi_outputs_with_input_fn(self, train_input_fn,
-                                                       eval_input_fn):
+  def do_test_multi_inputs_multi_outputs_with_input_fn(
+      self, distribution, train_input_fn, eval_input_fn):
     config = run_config_lib.RunConfig(
         tf_random_seed=_RANDOM_SEED,
         model_dir=self._base_dir,
-        train_distribute=self._dist)
+        train_distribute=distribution)
     with self.cached_session():
       model = multi_inputs_multi_outputs_model()
       est_keras = keras_lib.model_to_estimator(keras_model=model, config=config)
@@ -429,9 +480,12 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
       eval_results = est_keras.evaluate(input_fn=eval_input_fn, steps=1)
       self.assertLess(eval_results['loss'], baseline_eval_results['loss'])
 
-  def test_keras_optimizer_with_distribution_strategy(self):
-    dist = mirrored_strategy.MirroredStrategy(
-        devices=['/device:GPU:0', '/device:GPU:1'])
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph']))
+  def test_keras_optimizer_with_distribution_strategy(self, distribution):
     keras_model = simple_sequential_model()
     keras_model.compile(
         loss='categorical_crossentropy',
@@ -439,7 +493,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 
     config = run_config_lib.RunConfig(tf_random_seed=_RANDOM_SEED,
                                       model_dir=self._base_dir,
-                                      train_distribute=dist)
+                                      train_distribute=distribution)
     with self.cached_session():
       est_keras = keras_lib.model_to_estimator(keras_model=keras_model,
                                                config=config)
@@ -455,7 +509,7 @@ class TestEstimatorDistributionStrategy(test_util.TensorFlowTestCase):
 class TestDistributionStrategyWithNumpyArrays(test.TestCase,
                                               parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_creating_var_with_numpy_arrays(self, distribution):
     with self.cached_session():
       x = np.asarray(np.random.random((64, 3)), dtype=np.float32)
@@ -464,84 +518,135 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # Verify that the numpy value is copied to the variable.
       self.assertAllEqual(x, val)
 
-  def test_calculating_batch_params(self):
-    # This verifies that we calculate the number of steps when the batch size
-    # is specified.
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_no_steps_no_batch_size(self, distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
     with self.cached_session():
-      # 64 is the number of input samples.
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      # The number of replicas is equal to 3.
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0',
-                                                     '/device:GPU:1'])
-
-      with self.assertRaisesRegexp(ValueError, 'Please specify a batch_size '
-                                               'that is smaller than'):
-        # The batch size(128) is larger than the number of input
-        # samples(64).
-        distributed_training_utils.get_input_batch_params(inputs,
-                                                          128,
-                                                          strategy)
-
-      with self.assertRaisesRegexp(ValueError, 'is smaller than the number '
-                                               'of replicas'):
-        # The batch size(32) * num_replicas_in_sync(3) is 96 which is greater
-        # than the number of input samples(64).
-        distributed_training_utils.get_input_batch_params(inputs,
-                                                          32,
-                                                          strategy)
-
-      # The number of replicas now is equal to 2.
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      # 32 is the batch size per replica.
-      steps = distributed_training_utils.get_input_batch_params(inputs,
-                                                                32,
-                                                                strategy)
-      # The number of batches is the ratio of input samples(64) to
-      # batch size(32) which is 2. The number of steps(1) is the ratio of
-      # number of batches(2) to the number of replicas(2).
+      # Input samples of different sizes
+      input_20_samples = np.zeros((20, 3), dtype=np.float32)
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Default global batch size 32 for input with 64 samples run in 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
+      self.assertEqual(steps, 2)
+
+      # Computed global batch size 20 is lower than 32 if we pass less samples.
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_20_samples, steps=None, batch_size=None)
+      self.assertEqual(batch_size, 20 // replica_scale_factor)
       self.assertEqual(steps, 1)
 
-      # 16 is the batch size per replica.
-      steps = distributed_training_utils.get_input_batch_params(inputs,
-                                                                16,
-                                                                strategy)
-      # The number of batches is the ratio of input samples(64) to
-      # batch size(16) which is 4. The number of steps(2) is the ratio of
-      # number of batches(4) to the number of replicas(2).
+      #  Default global batch size 32 cannot be used with 63 samples.
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=None, batch_size=None)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_with_steps_no_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
+    with self.cached_session():
+      # Input samples of different sizes
+      input_63_samples = np.zeros((63, 3), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed global batch size is correct for number of specified 1 step
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=1, batch_size=None)
+      self.assertEqual(batch_size, 64 // replica_scale_factor)
+      self.assertEqual(steps, 1)
+
+      # Computed global batch size is correct for number of specified 2 steps
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=2, batch_size=None)
+      self.assertEqual(batch_size, 32 // replica_scale_factor)
       self.assertEqual(steps, 2)
 
-  def test_calculating_batch_size(self):
+      # All samples can not be consumed in specified number of steps
+      with self.assertRaisesRegexp(ValueError, 'not divisible by steps'):
+        distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=2, batch_size=None)
+
+      # This cases is different for different strategies due to the
+      # difference in supported batch size being global or per-replica.
+      if replica_scale_factor == 1:
+        # Computed global batch size is correct even if not sharadable
+        steps, batch_size = distributed_training_utils.get_input_params(
+            distribution, input_63_samples, steps=3, batch_size=None)
+        self.assertEqual(batch_size, 21)
+        self.assertEqual(steps, 3)
+      else:
+        # Computed global batch size can not be sharded across replicas
+        with self.assertRaisesRegexp(ValueError, 'could not be sharded evenly '
+                                     'across the sync replicas'):
+          distributed_training_utils.get_input_params(
+              distribution, input_63_samples, steps=1, batch_size=None)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_no_steps_with_batch_size(self,
+                                                             distribution):
+    # Calculate the per_replica_batch_size scaling factor for strategies
+    # that use per_core_batch_size
+    replica_scale_factor = 1.0
+    if not distributed_training_utils.global_batch_size_supported(distribution):
+      replica_scale_factor = distribution.num_replicas_in_sync
+
     with self.cached_session():
-      # 64 is the number of input samples.
-      inputs = np.zeros((64, 3), dtype=np.float32)
-      targets = np.zeros((64, 4), dtype=np.float32)
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=16)
+      self.assertEqual(batch_size, 16)
+      self.assertEqual(steps, 4 // replica_scale_factor)
+
+      # Computed steps is correct for specified batch size
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=None, batch_size=32)
+      self.assertEqual(batch_size, 32)
+      self.assertEqual(steps, 2 // replica_scale_factor)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=20)
+
+      # Number of samples is not divisible by the global batch size
+      with self.assertRaisesRegexp(ValueError, 'not divisible by batch size'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=None, batch_size=3)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
+  def test_calculating_input_params_with_steps_with_batch_size(self,
+                                                               distribution):
+    with self.cached_session():
+      input_64_samples = np.zeros((64, 3), dtype=np.float32)
 
-      model = get_model()
-      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
-      loss = 'mse'
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      strategy._require_static_shapes = True
-
-      model.compile(optimizer, loss, distribute=strategy)
-      iterator = model._distribution_standardize_user_data(inputs,
-                                                           targets,
-                                                           batch_size=None,
-                                                           check_steps=True,
-                                                           steps_name='steps',
-                                                           steps=3)
-
-      # The global batch size(21) across all replicas is the ratio of the input
-      # samples(64) to the steps(3).
-      # The batch size(10) per device is the ratio of the global batch size(21)
-      # to the number of replicas(2).
-      # The global batch size and batch size are rounded integer values.
-      self.assertEqual(10, distributed_training_utils.get_batch_dimension(
-          iterator._iterator))
-
-  @combinations.generate(strategy_combinations())
+      # No change to steps and batch size if both specified and feasible
+      steps, batch_size = distributed_training_utils.get_input_params(
+          distribution, input_64_samples, steps=5, batch_size=3)
+      self.assertEqual(batch_size, 3)
+      self.assertEqual(steps, 5)
+
+      # Number of samples is less than global batch size * steps
+      with self.assertRaisesRegexp(ValueError, 'less than samples required'):
+        distributed_training_utils.get_input_params(
+            distribution, input_64_samples, steps=10, batch_size=13)
+
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_numpy_arrays(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -572,7 +677,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_calling_model_with_nested_numpy_arrays(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
@@ -606,21 +711,22 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # with batch_size
       model.predict(inputs, batch_size=8)
 
-  @combinations.generate(strategy_minus_tpu_combinations())
+  @combinations.generate(combinations.combine(
+      distribution=strategies_minus_tpu, mode=['graph']))
   def test_numpy_with_sample_weights(self, distribution):
     model = get_model()
     optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     model.compile(optimizer, loss, distribute=distribution)
 
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
+    inputs = np.zeros((20, 3), np.float32)
+    targets = np.zeros((20, 4), np.float32)
+    sample_weights = np.ones((20), np.float32)
 
     model.fit(inputs, targets, sample_weight=sample_weights, epochs=1,
               steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(strategy_for_numpy_input_combinations())
   def test_flatten_predict_outputs(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
@@ -638,7 +744,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
       # `predict` a list that is equal in length to the number of model outputs.
       # In this test our model has two outputs and each element of `outs`
       # corresponds to all the samples of one of the model outputs.
-      self.assertEqual(2, len(outs))
+      self.assertLen(outs, 2)
       # Each of the output samples have a dimension of 7. We should process all
       # the available input samples(6).
       self.assertAllEqual([6, 7], outs[0].shape)
@@ -648,7 +754,7 @@ class TestDistributionStrategyWithNumpyArrays(test.TestCase,
 class TestDistributionStrategyWithDatasets(test.TestCase,
                                            parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_calling_model_on_same_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -667,7 +773,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
                 validation_data=dataset, validation_steps=2)
       model.predict(get_predict_dataset(distribution), steps=2)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_model_interleaved_eval_same_as_direct_eval(self, distribution):
     with self.cached_session():
       user_controlled_model = get_model()
@@ -710,16 +816,20 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
   # TODO(priyag): Enable this test for TPU. Currently tuples/dict don't work
   # as clone_model's input_tensors argument only seems to accept list and not
   # tuples or dict.
-  def test_fit_with_tuple_and_dict_dataset_inputs(self):
+
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_fit_with_tuple_and_dict_dataset_inputs(self, distribution):
     with self.cached_session():
       model = multi_input_output_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.001)
       loss = 'mse'
       metrics = ['mae', keras.metrics.CategoricalAccuracy()]
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
       input_a_np = np.random.random((10, 3))
       input_b_np = np.random.random((10, 5))
@@ -743,7 +853,7 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       model.fit(dataset_dict, epochs=1, steps_per_epoch=2, verbose=1)
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_fit_eval_and_predict_methods_on_dataset(self, distribution):
     with self.cached_session():
       model = get_model()
@@ -792,35 +902,48 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
     model.evaluate(dataset, steps=2, verbose=1)
     model.predict(dataset, steps=2)
 
-  def test_dataset_input_shape_validation(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_dataset_wrong_input_shape(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
       loss = 'mse'
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, distribute=strategy)
+      model.compile(optimizer, loss, distribute=distribution)
 
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3), dtype=np.float32)
+      # Wrong input shape
+      inputs = np.zeros((10, 5), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError,
+                                   'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-      # Wrong input shape
-      inputs = np.zeros((10, 5), dtype=np.float32)
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_dataset_no_batch_input_validation(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = rmsprop.RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3), dtype=np.float32)
       targets = np.zeros((10, 4), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
       dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
 
-      with self.assertRaisesRegexp(ValueError,
-                                   'expected input to have shape'):
+      with self.assertRaisesRegexp(ValueError, 'expected input to have shape'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
   @combinations.generate(combinations.combine(
@@ -842,7 +965,12 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       with self.assertRaisesRegexp(ValueError, 'requires fully defined shapes'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0)
 
-  def test_learning_phase_value(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_learning_phase_value(self, distribution):
     # TODO(anjalisridhar): Modify this test to use Lambdas since we can compare
     # meaningful values. Currently we don't pass the learning phase if the
     # Lambda layer uses the learning phase.
@@ -856,15 +984,17 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
       optimizer = gradient_descent.GradientDescentOptimizer(0.005)
       loss = 'mse'
       metrics = ['acc']
-      strategy = mirrored_strategy.MirroredStrategy(
-          ['/device:GPU:0', '/device:GPU:1'])
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      batch_size = 8
+      if isinstance(distribution, mirrored_strategy.CoreMirroredStrategy):
+        # CoreMirroredStrategy uses global batch size.
+        batch_size = 8 * distribution.num_replicas_in_sync
 
       inputs = np.ones((10, 1), dtype=np.float32)
       targets = np.ones((10, 1), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat().batch(8)
+      dataset = dataset.repeat().batch(batch_size)
       hist = model.fit(dataset, epochs=1, steps_per_epoch=20, verbose=1)
       self.assertAlmostEqual(hist.history['acc'][0], 0, 0)
 
@@ -875,24 +1005,51 @@ class TestDistributionStrategyWithDatasets(test.TestCase,
 
       inputs = np.ones((10, 1), dtype=np.float32)
       predict_dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
-      predict_dataset = predict_dataset.repeat().batch(5)
+
+      predict_dataset = predict_dataset.repeat().batch(batch_size)
       output = model.predict(predict_dataset, steps=10)
-      # `predict` runs for 10 steps and in each step you process 100 samples.
-      ref_output = np.ones((100, 1), dtype=np.float32)
+      # `predict` runs for 10 steps
+      ref_output = np.ones((160, 1), dtype=np.float32)
       self.assertArrayNear(output, ref_output, 1e-1)
 
+  @combinations.generate(strategy_minus_tpu_combinations())
+  def testOptimizerWithCallbacks(self, distribution):
+    with self.cached_session():
+      model = get_model()
+
+      optimizer = gradient_descent_keras.SGD(0.01)
+      loss = 'mse'
+      model.compile(optimizer, loss, distribute=distribution)
+
+      dataset = get_dataset(distribution)
+
+      def schedule(_):
+        return 0.001
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+                callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
+      grouped_models = distribution.unwrap(model._grouped_model)
+      with distribution.scope():
+        for m in grouped_models:
+          self.assertAllClose(0.001, keras.backend.get_value(
+              m.optimizer.lr), atol=1e-05, rtol=1e-05)
+
 
 class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
 
-  def test_validating_dataset_input_tensors_with_shape_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_shape_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2))
       b = constant_op.constant([[1, 2], [1, 2]], shape=(2, 2))
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor shape details from the error message
         # since the order of the device and the corresponding input tensor shape
         # is not deterministic over different runs.
@@ -901,17 +1058,21 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_validating_dataset_input_tensors_with_dtype_mismatch(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_validating_dataset_input_tensors_with_dtype_mismatch(self,
+                                                                distribution):
     with self.cached_session():
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:0',
-                                                     '/device:CPU:0'])
       a = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.int32)
       b = constant_op.constant([1, 2], shape=(1, 2), dtype=dtypes.float64)
       x = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': b})
       y = values.DistributedValues({'/device:CPU:0': a, '/device:GPU:0': a})
-      with strategy.scope():
+      with distribution.scope():
         # Removed device and input tensor dtype details from the error message
         # since the order of the device and the corresponding input tensor dtype
         # is not deterministic over different runs.
@@ -920,21 +1081,23 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                      'distributed tensor inputs '
                                      'DistributedValues:.+'):
           distributed_training_utils.validate_distributed_dataset_inputs(
-              strategy, x, y)
+              distribution, x, y)
 
-  def test_unsupported_features(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_unsupported_features(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       # Test with validation split
       with self.assertRaisesRegexp(
@@ -969,45 +1132,48 @@ class TestDistributionStrategyErrorCases(test.TestCase, parameterized.TestCase):
                                    'you should specify the `steps` argument'):
         model.predict(dataset, verbose=0)
 
-  def test_calling_with_unsupported_predefined_callbacks(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_gpu_and_cpu,
+          combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=['graph', 'eager']))
+  def test_calling_with_unsupported_predefined_callbacks(self, distribution):
     with self.cached_session():
       model = get_model()
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.001)
       loss = 'mse'
       metrics = ['mae']
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-      model.compile(optimizer, loss, metrics=metrics, distribute=strategy)
+      model.compile(optimizer, loss, metrics=metrics, distribute=distribution)
 
-      dataset = get_dataset(strategy)
+      dataset = get_dataset(distribution)
 
       def schedule(_):
         return 0.001
       with self.assertRaisesRegexp(ValueError,
-                                   'LearningRateScheduler callback is not '
-                                   'supported with DistributionStrategy.'):
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                   callbacks=[keras.callbacks.LearningRateScheduler(schedule)])
 
       with self.assertRaisesRegexp(ValueError,
-                                   'ReduceLROnPlateau callback is not '
-                                   'supported with DistributionStrategy.'):
+                                   'You must specify a Keras Optimizer V2 when '
+                                   'using'):
         model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
                   callbacks=[keras.callbacks.ReduceLROnPlateau()])
-      with self.assertRaisesRegexp(ValueError,
-                                   'histogram_freq in the TensorBoard callback '
-                                   'is not supported when using '
-                                   'DistributionStrategy.'):
-        model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-                  callbacks=[keras.callbacks.TensorBoard(histogram_freq=10)])
 
 
-class TestDistributionStrategyWithLossMasking(test.TestCase):
+class TestDistributionStrategyWithLossMasking(test.TestCase,
+                                              parameterized.TestCase):
 
   # TODO(priyag): Enable all strategies for this test. Currently it does not
   # work for TPU due to some invalid datatype.
-  def test_masking(self):
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_two_gpus,
+          combinations.core_mirrored_strategy_with_two_gpus],
+      mode=['graph', 'eager']))
+  def test_masking(self, distribution):
     with self.cached_session():
       np.random.seed(1337)
       x = np.array([[[1], [1]], [[0], [0]]])
@@ -1016,12 +1182,9 @@ class TestDistributionStrategyWithLossMasking(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      strategy = mirrored_strategy.MirroredStrategy(['/device:GPU:1',
-                                                     '/device:GPU:0'])
-
       model.compile(loss='mse',
                     optimizer=gradient_descent.GradientDescentOptimizer(0.01),
-                    distribute=strategy)
+                    distribute=distribution)
       y = np.array([[[1], [1]], [[1], [1]]])
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
@@ -1033,7 +1196,7 @@ class TestDistributionStrategyWithLossMasking(test.TestCase):
 class TestDistributionStrategyWithNormalizationLayer(
     test.TestCase, parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_batchnorm_correctness(self, distribution):
     with self.cached_session():
       model = keras.models.Sequential()
@@ -1065,7 +1228,7 @@ class TestDistributionStrategyWithNormalizationLayer(
 class TestDistributionStrategyCorrectness(test.TestCase,
                                           parameterized.TestCase):
 
-  @combinations.generate(strategy_combinations())
+  @combinations.generate(all_strategy_combinations())
   def test_metric_correctness(self, distribution):
     with self.cached_session():
       keras.backend.set_image_data_format('channels_last')
@@ -1088,21 +1251,63 @@ class TestDistributionStrategyCorrectness(test.TestCase,
           distribute=distribution)
 
       batch_size = 64
-      batch_size //= distribution.num_replicas_in_sync
+      if not distributed_training_utils.global_batch_size_supported(
+          distribution):
+        batch_size //= distribution.num_replicas_in_sync
       train_dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
       train_dataset = batch_wrapper(train_dataset, batch_size, distribution)
 
-      history = model.fit(x=train_dataset, epochs=1, steps_per_epoch=10)
-      self.assertEqual(history.history['binary_accuracy'], [1.0])
+      history = model.fit(x=train_dataset, epochs=2, steps_per_epoch=10)
+      self.assertEqual(history.history['binary_accuracy'], [1.0, 1.0])
 
-  @combinations.generate(strategy_and_inputs())
-  def test_correctness(self, distribution, use_numpy):
+  @combinations.generate(all_strategy_combinations())
+  def test_eval_metrics_correctness(self, distribution):
     with self.cached_session():
-      tolerance = 1e-5
+      model = keras.Sequential()
+      model.add(
+          keras.layers.Dense(
+              3, activation='relu', input_dim=4, kernel_initializer='ones'))
+      model.add(
+          keras.layers.Dense(
+              1, activation='sigmoid', kernel_initializer='ones'))
+      model.compile(
+          loss='mae',
+          metrics=['accuracy', keras.metrics.BinaryAccuracy()],
+          optimizer=gradient_descent.GradientDescentOptimizer(0.001),
+          distribute=distribution)
+
+      # verify correctness of stateful and stateless metrics.
+      x = np.ones((100, 4)).astype('float32')
+      y = np.ones((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 1.)
+      self.assertEqual(outs[2], 1.)
+
+      y = np.zeros((100, 1)).astype('float32')
+      dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat()
+      dataset = batch_wrapper(dataset, 4, distribution)
+      outs = model.evaluate(dataset, steps=10)
+      self.assertEqual(outs[1], 0.)
+      self.assertEqual(outs[2], 0.)
+
+  @combinations.generate(strategy_and_input_combinations())
+  def test_correctness(self, distribution, use_numpy, use_validation_data):
 
-      if isinstance(distribution, mirrored_strategy.MirroredStrategy):
-        # TODO(b/119257215): use the default one once the flakyness is fixed.
-        tolerance = 1e-4
+    with self.cached_session():
+      default_tolerance = 1e-5
+      tol_table = {}
+
+      if isinstance(distribution, (mirrored_strategy.MirroredStrategy,
+                                   mirrored_strategy.CoreMirroredStrategy)):
+        # TODO(b/119257215): Weights are not exactly the same, so use larger
+        # tolerance for now. Predict should be related to weights.
+        tol_table = {
+            'weights_1': 1e-4,
+            'weights_2': 1e-4,
+            'predict_result_1': 1e-4,
+        }
 
       keras.backend.set_image_data_format('channels_last')
       np.random.seed(_RANDOM_SEED)
@@ -1123,49 +1328,75 @@ class TestDistributionStrategyCorrectness(test.TestCase,
       # This is used to initialize the model for both the distribution and
       # non-distribution run. In addition, we add few non-linear layers to make
       # it non-trivial.
-      model = keras.Sequential()
-      model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
-      model.add(keras.layers.Dense(10, activation='relu'))
-      model.add(keras.layers.Dense(10, activation='relu'))
-      model.add(keras.layers.Dense(1))
+      def _create_model():
+        model = keras.Sequential()
+        model.add(keras.layers.Dense(10, activation='relu', input_shape=(1,)))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(10, activation='relu'))
+        model.add(keras.layers.Dense(1))
+        return model
+
+      model = _create_model()
       initial_weights = model.get_weights()
+      del model  # avoid accident usage.
 
-      def fit_and_predict(with_distribution=None):
+      def fit_eval_and_predict(with_distribution=None):
+        model = _create_model()
         # We have initialized the model to the same weight for the distribution
         # and non-distribution run.
         model.set_weights(initial_weights)
         model.compile(
             loss=keras.losses.mean_squared_error,
-            optimizer=gradient_descent.GradientDescentOptimizer(0.5),
+            optimizer=gradient_descent_keras.SGD(0.5),
+            metrics=['mse'],
             distribute=with_distribution)
 
         training_inputs, eval_inputs, predict_inputs = (
-            get_correctness_test_inputs(use_numpy, with_distribution,
+            get_correctness_test_inputs(use_numpy, use_validation_data,
+                                        with_distribution,
                                         x_train, y_train, x_predict))
 
-        model.fit(**training_inputs)
-        eval_result = model.evaluate(**eval_inputs)
-        weights = model.get_weights()
-        predict_result = model.predict(**predict_inputs)
+        result = {}
+        result['training_history_1'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_1'] = model.evaluate(**eval_inputs)
+
+        result['weights_1'] = model.get_weights()
+        result['predict_result_1'] = model.predict(**predict_inputs)
+
+        # Train and eval again to mimic user's flow.
+
+        result['training_history_2'] = model.fit(**training_inputs).history
+
+        if eval_inputs is not None:
+          result['eval_result_2'] = model.evaluate(**eval_inputs)
+
+        result['weights_2'] = model.get_weights()
 
-        return weights, eval_result, predict_result
+        return result
 
-      wts_with_ds, eval_with_ds, predict_with_ds = fit_and_predict(
-          with_distribution=distribution)
-      wts_without_ds, eval_without_ds, predict_without_ds = fit_and_predict(
-          with_distribution=None)
+      results_with_ds = fit_eval_and_predict(with_distribution=distribution)
+      results_without_ds = fit_eval_and_predict(with_distribution=None)
 
-      # Verify that the weights, eval results, predict outputs  are the same
-      # within some limits of tolerance.
-      self.assertAllClose(
-          wts_with_ds, wts_without_ds, atol=tolerance, rtol=tolerance)
-      self.assertAllClose(
-          eval_with_ds, eval_without_ds, atol=tolerance, rtol=tolerance)
-      self.assertAllClose(
-          predict_with_ds, predict_without_ds, atol=tolerance, rtol=tolerance)
+      # Verify that the weights, training history, eval results, predict outputs
+      # are the same within some limits of tolerance.
+      for key in results_with_ds:
+        if (key.startswith('training_history') and
+            isinstance(distribution, tpu_strategy.TPUStrategy) and
+            distribution.extended.steps_per_run > 1):
+          # TODO(b/119894254): Enable this test for all cases once the
+          # underlying bug is fixed.
+          continue
 
+        tolerance = tol_table.get(key, default_tolerance)
 
-# TODO(priyag): Add a test for TPUStrategy with steps_per_run > 1.
+        self.assertAllClose(
+            results_with_ds[key],
+            results_without_ds[key],
+            atol=tolerance,
+            rtol=tolerance,
+            msg='Fail to assert {}.'.format(key))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/metrics_v1_test.py b/tensorflow/contrib/distribute/python/metrics_v1_test.py
index c28ab416518799e239bff43def75e00b7c22ee73..8ac659abe96370b751ed1556cc699fe20788a0fd 100644
--- a/tensorflow/contrib/distribute/python/metrics_v1_test.py
+++ b/tensorflow/contrib/distribute/python/metrics_v1_test.py
@@ -72,14 +72,14 @@ def _regression_dataset_fn():
       "predictions": [1., .75, .25, 0.]}).repeat()
 
 
-# TODO(priyag): Add TPU Strategy to this once metrics aggregate correctly using
-# ReplicaLocalVariables on TPUs. Submit http://cl/208914352.
 def all_combinations():
   return combinations.combine(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       mode=["graph"])
 
 
@@ -100,18 +100,19 @@ class MetricsV1Test(test.TestCase, parameterized.TestCase):
       if isinstance(distribution, tpu_strategy.TPUStrategy):
         def step_fn(ctx, inputs):
           value, update = distribution.call_for_each_replica(
-              metric_fn, args=[inputs])
+              metric_fn, args=inputs)
           ctx.set_non_tensor_output(name="value", output=value)
           return distribution.group(update)
 
         ctx = distribution.run_steps_on_dataset(
-            step_fn, iterator, iterations=distribution.steps_per_run)
+            step_fn, iterator, iterations=distribution.extended.steps_per_run)
         update = ctx.run_op
         value = ctx.non_tensor_outputs["value"]
         # In each run, we run multiple steps, and each steps consumes as many
         # batches as number of replicas.
         batches_per_update = (
-            distribution.num_replicas_in_sync * distribution.steps_per_run)
+            distribution.num_replicas_in_sync *
+            distribution.extended.steps_per_run)
       else:
         value, update = distribution.call_for_each_replica(
             metric_fn, iterator.get_next())
diff --git a/tensorflow/contrib/distribute/python/minimize_loss_test.py b/tensorflow/contrib/distribute/python/minimize_loss_test.py
index c6562463edbf8e03d5771a5147dc227ddf438c40..f09483cb56b66fd4720ee71085203c14f1ccadc3 100644
--- a/tensorflow/contrib/distribute/python/minimize_loss_test.py
+++ b/tensorflow/contrib/distribute/python/minimize_loss_test.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python.single_loss_example import batchnorm_example
 from tensorflow.contrib.distribute.python.single_loss_example import minimize_loss_example
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
@@ -63,7 +64,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       model_fn, dataset_fn, layer = minimize_loss_example(
           optimizer_fn, use_bias=True, use_callable_loss=use_callable_loss)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
             distribution.call_for_each_replica(model_fn, args=inputs))
@@ -157,7 +158,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           use_callable_loss=True,
           create_optimizer_inside_model_fn=True)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
             distribution.call_for_each_replica(model_fn, args=inputs))
@@ -226,12 +227,12 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
           renorm=renorm,
           update_ops_in_replica_mode=not update_ops_in_cross_replica_mode)
 
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         fetches = distribution.unwrap(
             distribution.call_for_each_replica(model_fn, args=inputs))
         if update_ops_in_cross_replica_mode:
-          fetches += ops.get_collection(ops.GraphKeys.UPDATE_OPS)
+          fetches += tuple(ops.get_collection(ops.GraphKeys.UPDATE_OPS))
         return control_flow_ops.group(fetches)
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
@@ -285,7 +286,9 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
                   distribution=[
                       combinations.one_device_strategy,
                       combinations.mirrored_strategy_with_gpu_and_cpu,
-                      combinations.mirrored_strategy_with_two_gpus
+                      combinations.mirrored_strategy_with_two_gpus,
+                      combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                      combinations.core_mirrored_strategy_with_two_gpus
                   ]),
               combinations.combine(
                   mode=["graph"], use_callable_loss=[True, False]) +
@@ -321,10 +324,10 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         labels = dataset_ops.Dataset.from_tensors([[6.], [21.]])
         return dataset_ops.Dataset.zip((features, labels)).repeat()
 
-      def step_fn(ctx, x, y):
+      def step_fn(ctx, inputs):
         del ctx  # Unused
         return distribution.group(
-            distribution.call_for_each_replica(model_fn, args=(x, y)))
+            distribution.call_for_each_replica(model_fn, args=inputs))
 
       iterator = self._get_iterator(distribution.distribute_dataset(dataset_fn))
 
@@ -341,7 +344,7 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       run_step()
 
       v = all_vars[0]
-      self.assertTrue(all([v is vi for vi in all_vars[1:]]))
+      self.assertTrue(all(v is vi for vi in all_vars[1:]))
       weight = numpy.squeeze(self.evaluate(v))
       # Our model is:
       #   predict = x * w
@@ -402,21 +405,21 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
         train_op = optimizer.minimize(loss_fn)
         loss = loss_fn()
         output_context.set_last_step_output(
-            name="replica_loss_agg",
+            name="replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_non_tensor_output(key1, value1)
         return (train_op, loss)
 
-      def step_fn(output_context, *inputs):
+      def step_fn(output_context, inputs):
         (train_op, loss) = distribution.call_for_each_replica(
             model_fn, args=(output_context,) + inputs)
         output_context.set_last_step_output(
-            name="cross_replica_loss_agg",
+            name="cross_replica_loss_reduced",
             output=loss,
-            aggregation=variables_lib.VariableAggregation.MEAN)
+            reduce_op=reduce_util.ReduceOp.MEAN)
         output_context.set_last_step_output(
-            name="cross_replica_loss_noagg",
+            name="cross_replica_loss_not_reduced",
             output=loss)
         return distribution.group(train_op)
 
@@ -424,36 +427,36 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
 
       def run_step():
         initial_loss = lambda: constant_op.constant(1e7)
-        # Initial values corresponding to aggregated losses are just single
-        # tensors. But for non aggregated losses, we need to have initial
+        # Initial values corresponding to reduced losses are just single
+        # tensors. But for non reduced losses, we need to have initial
         # values that are of the same structure as non reduced losses. In
         # MirroredStrategy, this will be a list of losses, in TPUStrategy
         # it will be single tensor. Using `broadcast` followed by `unwrap`
         # gives us the desired initial value structure.
         initial_loop_values = {
-            "replica_loss_agg": initial_loss(),
-            "cross_replica_loss_agg": initial_loss(),
-            "cross_replica_loss_noagg":
+            "replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_reduced": initial_loss(),
+            "cross_replica_loss_not_reduced":
             distribution.unwrap(distribution.broadcast(initial_loss()))
         }
         ctx = distribution.run_steps_on_dataset(
             step_fn, iterator, iterations=2,
             initial_loop_values=initial_loop_values)
 
-        self.assertEqual({key1: [value1]}, ctx.non_tensor_outputs)
+        self.assertEqual({key1: (value1,)}, ctx.non_tensor_outputs)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["replica_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_replica_loss_agg"],
-            aggregated=True, distribution=distribution)
+            loss_output=ctx.last_step_outputs["cross_replica_loss_reduced"],
+            reduced=True, distribution=distribution)
         self._verify_loss_output(
             initial_loss(),
-            loss_output=ctx.last_step_outputs["cross_replica_loss_noagg"],
-            aggregated=False, distribution=distribution)
-        return (ctx.run_op, ctx.last_step_outputs["replica_loss_agg"])
+            loss_output=ctx.last_step_outputs["cross_replica_loss_not_reduced"],
+            reduced=False, distribution=distribution)
+        return (ctx.run_op, ctx.last_step_outputs["replica_loss_reduced"])
 
       self.evaluate(distribution.initialize())
       if not context.executing_eagerly():
@@ -478,18 +481,16 @@ class MinimizeLossStepTest(test.TestCase, parameterized.TestCase):
       error_is_not_increasing = all(y <= x for x, y in zip(error, error[1:]))
       self.assertTrue(error_is_not_increasing)
 
-  def _verify_loss_output(self, initial_loss, loss_output, aggregated,
+  def _verify_loss_output(self, initial_loss, loss_output, reduced,
                           distribution):
-    if not aggregated:
-      self.assertEqual(distribution.num_replicas_in_sync,
-                       len(distribution.unwrap(loss_output)))
-      loss_output = distribution.reduce(
-          aggregation=variables_lib.VariableAggregation.MEAN,
-          value=loss_output, destinations="/device:CPU:0")
-
-    unwrapped_output = distribution.unwrap(loss_output)
-    self.assertEqual(1, len(unwrapped_output))
-    loss_tensor = unwrapped_output[0]
+    if not reduced:
+      self.assertLen(distribution.unwrap(loss_output),
+                     distribution.num_replicas_in_sync)
+      loss_tensor = distribution.reduce(reduce_util.ReduceOp.MEAN, loss_output)
+    else:
+      unwrapped_output = distribution.unwrap(loss_output)
+      self.assertLen(unwrapped_output, 1)
+      loss_tensor = unwrapped_output[0]
     self.assertEqual(initial_loss.dtype, loss_tensor.dtype)
     self.assertEqual(initial_loss.shape, loss_tensor.shape)
 
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy.py b/tensorflow/contrib/distribute/python/mirrored_strategy.py
index e90c510aadb40555cacf60bcff5516e87e06b728..20f1a08d4261b931a9353738147fba7d7dff9225 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy.py
@@ -12,293 +12,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class MirroredStrategy implementing DistributionStrategy."""
+"""Contrib version of MirroredStrategy."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import contextlib
-from functools import partial
-import threading
+import functools
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import shared_variable_creator
-from tensorflow.contrib.distribute.python import values
-from tensorflow.python import pywrap_tensorflow
-from tensorflow.python.distribute import multi_worker_util
-from tensorflow.python.eager import context
-from tensorflow.python.eager import tape
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import device as tf_device
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import coordinator
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.util import nest
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import mirrored_strategy
+from tensorflow.python.distribute import values
 
 
-# TODO(josh11b): Replace asserts in this file with if ...: raise ...
-
-
-@contextlib.contextmanager
-def _enter_graph(g):
-  if context.executing_eagerly():
-    with g.as_default(), context.eager_mode():
-      yield
-  else:
-    with g.as_default():
-      yield
-
-
-def _cpu_device(device):
-  cpu_device = tf_device.DeviceSpec.from_string(device)
-  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
-  return cpu_device.to_string()
-
-
-class _RequestedStop(Exception):
-  pass
-
-
-# _call_for_each_replica and _reduce_non_distributed_value are not members of
-# MirroredStrategy so that they are generally not allowed to use anything
-# specific to MirroredStrategy and thus can be shared with other distribution
-# strategies.
-
-
-# TODO(yuefengz): maybe create a common class for those who need to call this
-# _call_for_each_replica.
-def _call_for_each_replica(distribution, fn, args, kwargs):
-  """Run `fn` in separate threads, once per replica/worker device.
-
-  Args:
-    distribution: the DistributionStrategy object.
-    fn: function to run (will be run once per device, each in its own thread).
-    args: positional arguments for `fn`
-    kwargs: keyword arguments for `fn`.
-
-  Returns:
-    Merged return value of `fn` across all replicas.
-
-  Raises:
-    RuntimeError: If fn() calls get_replica_context().merge_call() a different
-        number of times from the available devices.
-  """
-  # TODO(josh11b): Add this option once we add synchronization to variable
-  # creation. Until then, this is pretty unsafe to use.
-  run_concurrently = False
-  if not context.executing_eagerly():
-    # Needed for per-thread device, etc. contexts in graph mode.
-    ops.get_default_graph().switch_to_thread_local()
-
-  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
-
-  shared_variable_store = {}
-
-  # TODO(isaprykin): Create these threads once instead of during every run()
-  # call.
-  threads = []
-  for index, d in enumerate(distribution.worker_devices):
-    variable_creator_fn = shared_variable_creator.make_fn(
-        shared_variable_store, index)
-    t = MirroredStrategy._MirroredReplicaThread(  # pylint: disable=protected-access
-        distribution, coord, d, variable_creator_fn, fn,
-        *values.select_device(d, args), **values.select_device(d, kwargs))
-    threads.append(t)
-
-  for t in threads:
-    t.start()
-
-  # When `fn` starts `should_run` event is set on _MirroredReplicaThread
-  # (`MRT`) threads. The execution waits until
-  # `MRT.has_paused` is set, which indicates that either `fn` is
-  # complete or a `get_replica_context().merge_call()` is called.  If `fn` is
-  # complete, then `MRT.done` is set to True.  Otherwise, arguments
-  # of `get_replica_context().merge_call` from all paused threads are grouped
-  # and the `merge_fn` is performed.  Results of the
-  # `get_replica_context().merge_call` are then set to `MRT.merge_result`.
-  # Each such `get_replica_context().merge_call` call returns the
-  # `MRT.merge_result` for that thread when `MRT.should_run` event
-  # is reset again. Execution of `fn` resumes.
-
-  try:
-    with coord.stop_on_exception():
-      all_done = False
-      while not all_done and not coord.should_stop():
-        done = []
-        if run_concurrently:
-          for t in threads:
-            t.should_run.set()
-          for t in threads:
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        else:
-          for t in threads:
-            t.should_run.set()
-            t.has_paused.wait()
-            t.has_paused.clear()
-            if coord.should_stop():
-              return None
-            done.append(t.done)
-        if coord.should_stop():
-          return None
-        all_done = all(done)
-        if not all_done:
-          if any(done):
-            raise RuntimeError("Some replicas made a different number of "
-                               "replica_context().merge_call() calls.")
-          # get_replica_context().merge_call() case
-          merge_args = values.regroup({t.device: t.merge_args for t in threads})
-          merge_kwargs = values.regroup(
-              {t.device: t.merge_kwargs for t in threads})
-          # We capture the name_scope of the MRT when we call merge_fn
-          # to ensure that if we have opened a name scope in the MRT,
-          # it will be respected when executing the merge function. We only
-          # capture the name_scope from the first MRT and assume it is
-          # the same for all other MRTs.
-          mtt_captured_name_scope = threads[0].captured_name_scope
-          with ops.name_scope(mtt_captured_name_scope):
-            merge_result = threads[0].merge_fn(distribution, *merge_args,
-                                               **merge_kwargs)
-          for t in threads:
-            t.merge_result = values.select_device(t.device, merge_result)
-  finally:
-    for t in threads:
-      t.should_run.set()
-    coord.join(threads)
-
-  return values.regroup({t.device: t.main_result for t in threads})
-
-
-def _reduce_non_distributed_value(distribution, aggregation, value,
-                                  destinations):
-  """Reduce a non-DistributedValue `value` to `destinations`."""
-  if isinstance(value, values.DistributedValues):
-    raise ValueError("You are passing a `DistributedValue` to "
-                     "`_reduce_non_distributed_value`, which is not allowed.")
-
-  # If the same value is present on all replicas then the PerReplica value will
-  # be a single value. We also handle the case when `value` is a single value
-  # and equal to 0.
-  if value == 0:
-    return 0
-  # If the aggregation type is MEAN or ONLY_FIRST_REPLICA, then this
-  # essentially means that the same value should be on all destinations.
-  if aggregation in (
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA):
-    return value
-
-  cross_tower_ops_lib.validate_destinations(destinations)
-  # We do not support an aggregation type of SUM if the value is the same across
-  # all replicas. We call this as part of assign functions for MirroredVariables
-  # and summing up identical values across replicas is not clearly defined.
-  if (len(distribution.worker_devices) != 1 or
-      not cross_tower_ops_lib.check_destinations(destinations)):
-    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
-                     "the given aggregation %s." % (value, aggregation))
-  # TODO(anjalisridhar): Moves these methods to a device utility file?
-  devices = cross_tower_ops_lib.get_devices_from(destinations)
-  if len(devices) == 1:
-    with ops.device(devices[0]):
-      return array_ops.identity(value)
-  else:
-    value_updates = {}
-    for d in devices:
-      with ops.device(d):
-        value_updates[d] = array_ops.identity(value)
-    return values.Mirrored(value_updates)
-
-
-def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
-  # Figure out what collections this variable should be added to.
-  # We'll add the MirroredVariable to those collections instead.
-  collections = kwargs.pop("collections", None)
-  if collections is None:
-    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
-  kwargs["collections"] = []
-
-  # Get synchronization value
-  synchronization = kwargs.get("synchronization",
-                               variable_scope.VariableSynchronization.ON_WRITE)
-  if synchronization == variable_scope.VariableSynchronization.NONE:
-    raise ValueError("`NONE` variable synchronization mode is not "
-                     "supported with `Mirrored` distribution strategy. Please"
-                     " change the `synchronization` for variable: " +
-                     kwargs["name"])
-  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
-    # Variables that are to be synced on read are replica local.
-    is_replica_local = True
-    kwargs["trainable"] = False
-  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
-        synchronization == variable_scope.VariableSynchronization.AUTO):
-    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
-    is_replica_local = False
-  else:
-    raise ValueError("Invalid variable synchronization mode: " +
-                     synchronization + " for variable: " + kwargs["name"])
-
-  # Get aggregation value
-  aggregation = kwargs.pop("aggregation",
-                           variable_scope.VariableAggregation.NONE)
-  if aggregation not in (
-      variable_scope.VariableAggregation.NONE,
-      variable_scope.VariableAggregation.SUM,
-      variable_scope.VariableAggregation.MEAN,
-      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-  ):
-    raise ValueError("Invalid variable aggregation mode: " + aggregation +
-                     " for variable: " + kwargs["name"])
-
-  # Ignore user-specified caching device, not needed for mirrored variables.
-  kwargs.pop("caching_device", None)
-
-  # TODO(josh11b,apassos): It would be better if variable initialization
-  # was never recorded on the tape instead of having to do this manually
-  # here.
-  with tape.stop_recording():
-    index = real_mirrored_creator(devices, *args, **kwargs)
-
-    if is_replica_local:
-      result = values.ReplicaLocalVariable(
-          index, index[devices[0]], aggregation)
-    else:
-      result = values.MirroredVariable(index, index[devices[0]], aggregation)
-
-  # Add the wrapped variable to the requested collections.
-  # The handling of eager mode and the global step matches
-  # ResourceVariable._init_from_args().
-  if not context.executing_eagerly():
-    g = ops.get_default_graph()
-    # If "trainable" is True, next_creator() will add the member variables
-    # to the TRAINABLE_VARIABLES collection, so we manually remove
-    # them and replace with the MirroredVariable. We can't set
-    # "trainable" to False for next_creator() since that causes functions
-    # like implicit_gradients to skip those variables.
-    if kwargs.get("trainable", True):
-      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
-      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
-      for v in index.values():
-        if v in l:
-          l.remove(v)
-    g.add_to_collections(collections, result)
-  elif ops.GraphKeys.GLOBAL_STEP in collections:
-    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
-
-  return result
+# pylint: disable=protected-access,invalid-name
+_call_for_each_replica = mirrored_strategy._call_for_each_replica
+_reduce_non_distributed_value = mirrored_strategy._reduce_non_distributed_value
+_create_mirrored_variable = mirrored_strategy._create_mirrored_variable
+all_local_devices = mirrored_strategy.all_local_devices
+CoreMirroredStrategy = mirrored_strategy.MirroredStrategy
+CoreMirroredExtended = mirrored_strategy.MirroredExtended
+# pylint: enable=protected-access,invalid-name
 
 
 class MirroredStrategy(distribute_lib.DistributionStrategy):
   """Mirrors vars to distribute across multiple devices and machines.
 
+  *** contrib version ***
+
   This strategy uses one replica per device and sync replication for its
   multi-GPU version.
 
@@ -353,483 +95,66 @@ class MirroredStrategy(distribute_lib.DistributionStrategy):
                cross_device_ops=None,
                auto_shard_dataset=False,
                cross_tower_ops=None):
-    super(MirroredStrategy, self).__init__()
-
     assert not (cross_device_ops and cross_tower_ops)
-    self._cross_tower_ops = cross_device_ops or cross_tower_ops
-    self._auto_shard_dataset = auto_shard_dataset
-    # Remember num GPUs which might be needed by `configure` method.
     if num_gpus is not None and num_gpus_per_worker is not None:
       raise ValueError(
           "You cannot specify both `num_gpus` and `num_gpus_per_worker`.")
-    if num_gpus is not None:
-      self._num_gpus = num_gpus
-    else:
-      self._num_gpus = num_gpus_per_worker
-
-    self._initialize_local(self._num_gpus, devices)
-
-  def _initialize_local(self, num_gpus, devices):
-    """Initializes the object for local training."""
-    self._cluster_spec = None
-    # Convert `num_gpus` into `devices`, shouldn't specify both.
-    if devices is None:
-      if num_gpus is None:
-        num_gpus = context.num_gpus()
-      if num_gpus == 0:
-        devices = ["/device:CPU:0"]
-      else:
-        devices = ["/device:GPU:%d" % d for d in range(num_gpus)]
-    elif num_gpus is not None:
-      raise ValueError("Must only specify one of `devices` and `num_gpus`.")
-    self._num_gpus = num_gpus
-    # TODO(yuefengz): consider setting the default device.
-
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerReplica(
-        {d: i for i, d in enumerate(devices)})
-
-  def _initialize_multi_worker(self, num_gpus, cluster_spec):
-    """Initializes the object for multi-worker training."""
-    cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
-    self._cluster_spec = cluster_spec
-
-    self._workers = []
-    for job in ["chief", "worker"]:
-      for task in range(len(cluster_spec.as_dict().get(job, []))):
-        self._workers.append("/job:%s/task:%d" % (job, task))
-
     if num_gpus is None:
-      raise ValueError("`num_gpus` is required if `cluster_spec` is given.")
-    if num_gpus > 0:
-      self._worker_devices = [
-          (worker, [
-              device_util.canonicalize(worker + "/device:GPU:%d" % gpu)
-              for gpu in range(num_gpus)
-          ]) for worker in self._workers
-      ]
-    else:
-      self._worker_devices = [
-          (worker, [device_util.canonicalize(worker, "/device:CPU:0")])
-          for worker in self._workers
-      ]
-
-    devices = nest.flatten([l for _, l in self._worker_devices])
+      num_gpus = num_gpus_per_worker
+    extended = MirroredExtended(self, devices, num_gpus,
+                                cross_device_ops or cross_tower_ops,
+                                auto_shard_dataset)
+    super(MirroredStrategy, self).__init__(extended)
 
-    # Setting `_default_device` will add a device scope in the
-    # distribution.scope. We set the default device to the first worker. When
-    # users specify device under distribution.scope by
-    #   with tf.device("/cpu:0"):
-    #     ...
-    # their ops will end up on the cpu device of its first worker, e.g.
-    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
-    self._default_device = self._workers[0]
 
-    assert devices, "Must specify at least one device."
-    assert len(set(devices)) == len(devices), (
-        "No duplicates allowed in `devices` argument.")
-    # TODO(josh11b): Require at least 2 devices?
-    self._devices = [device_util.resolve(d) for d in devices]
-    self._canonical_device_set = set(self._devices)
-    self._device_index = values.PerReplica(
-        {d: i for i, d in enumerate(devices)})
+class MirroredExtended(CoreMirroredExtended):
+  """Implementation of (contrib) MirroredStrategy."""
 
-  def _create_variable(self, next_creator, *args, **kwargs):
-    """Create a mirrored variable. See `DistributionStrategy.scope`."""
-    colocate_with = kwargs.pop("colocate_with", None)
-    devices = self._get_devices_from(colocate_with)
-
-    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
-      index = {}
-      for i, d in enumerate(devices):
-        with ops.device(d):
-          if i > 0:
-            # Give replicas meaningful distinct names:
-            var0name = index[devices[0]].name.split(":")[0]
-            # We append a / to variable names created on replicas with id > 0 to
-            # ensure that we ignore the name scope and instead use the given
-            # name as the absolute name of the variable.
-            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
-            # Initialize replicas with the same value:
-            def initial_value_fn(device=d):
-              if context.executing_eagerly():
-                init_value = index[devices[0]].value()
-                return array_ops.identity(init_value)
-              else:
-                with ops.device(device):
-                  init_value = index[devices[0]].initial_value
-                  return array_ops.identity(init_value)
-            kwargs["initial_value"] = initial_value_fn
-          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-            # Don't record operations (e.g. other variable reads) during
-            # variable creation.
-            with tape.stop_recording():
-              v = next_creator(*args, **kwargs)
-          assert not isinstance(v, values.DistributedVariable)
-          index[d] = v
-      return index
-
-    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
-                                     **kwargs)
+  def __init__(self,
+               container_strategy,
+               devices=None,
+               num_gpus_per_worker=None,
+               cross_device_ops=None,
+               auto_shard_dataset=False):
+    if devices is None:
+      devices = mirrored_strategy.all_local_devices(num_gpus_per_worker)
+    elif num_gpus_per_worker is not None:
+      raise ValueError(
+          "Must only specify one of `devices` and `num_gpus_per_worker`.")
+    super(MirroredExtended, self).__init__(container_strategy, devices,
+                                           cross_device_ops)
+    self._auto_shard_dataset = auto_shard_dataset
 
-  def distribute_dataset(self, dataset_fn):
-    if self._cluster_spec:
-      return values.MultiWorkerDataset(
-          partial(self._call_dataset_fn, dataset_fn), self._worker_devices,
-          auto_shard=self._auto_shard_dataset)
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch.
+
+    This implementation is different than the one in
+    `tf.distribute.MirroredStrategy` for purposes of backward compatibility.
+    We treat the incoming dataset's batch size as per replica batch size.
+
+    Args:
+      dataset: `tf.data.Dataset` for input.
+    Returns:
+      An `InputIterator` which returns inputs for each step of the computation.
+    """
+    if self._local_mode:
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
     else:
+      worker_device_pairs = self._worker_devices
+    return values.DatasetIterator(dataset, worker_device_pairs)
+
+  def _distribute_dataset(self, dataset_fn):
+    if self._local_mode:
       return values.PerReplicaDataset(
           self._call_dataset_fn(dataset_fn), self._devices)
-
-  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
-    if initial_loop_values is None:
-      initial_loop_values = {}
-    initial_loop_values = nest.flatten(initial_loop_values)
-
-    ctx = values.MultiStepContext()
-    def body(i, *args):
-      """A wrapper around `fn` to create the while loop body."""
-      del args
-      fn_inputs = iterator.get_next()
-      if not isinstance(fn_inputs, tuple):
-        fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
-      for (name, output) in ctx.last_step_outputs.items():
-        # Convert all outputs to tensors, potentially from `DistributedValues`.
-        ctx.last_step_outputs[name] = self.unwrap(output)
-      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
-      with ops.control_dependencies([fn_result]):
-        return [i + 1] + flat_last_step_outputs
-
-    # We capture the control_flow_context at this point, before we run `fn`
-    # inside a while_loop. This is useful in cases where we might need to exit
-    # these contexts and get back to the outer context to do some things, for
-    # e.g. create an op which should be evaluated only once at the end of the
-    # loop on the host. One such usage is in creating metrics' value op.
-    self._outer_control_flow_context = (
-        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
-
-    cond = lambda i, *args: i < iterations
-    i = constant_op.constant(0)
-    loop_result = control_flow_ops.while_loop(
-        cond, body, [i] + initial_loop_values, name="",
-        parallel_iterations=1, back_prop=False, swap_memory=False,
-        return_same_structure=True)
-    del self._outer_control_flow_context
-
-    ctx.run_op = control_flow_ops.group(loop_result)
-
-    # Convert the last_step_outputs from a list to the original dict structure
-    # of last_step_outputs.
-    last_step_tensor_outputs = loop_result[1:]
-    last_step_tensor_outputs_dict = nest.pack_sequence_as(
-        ctx.last_step_outputs, last_step_tensor_outputs)
-
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
-      output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, wrap them in a Mirrored
-      # container, else in a PerReplica container.
-      if aggregation is variables_lib.VariableAggregation.NONE:
-        last_step_tensor_outputs_dict[name] = values.regroup(
-            {d: t for d, t in zip(self._devices, output)}, values.PerReplica)
-      else:
-        assert len(output) == 1
-        last_step_tensor_outputs_dict[name] = output[0]
-
-    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
-    return ctx
-
-  def _broadcast(self, tensor, destinations):
-    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
-    return self._get_cross_tower_ops().broadcast(tensor, destinations or
-                                                 self._devices)
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    return _call_for_each_replica(self, fn, args, kwargs)
-
-  def map(self, map_over, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    index = {}
-    for i, m in enumerate(map_over):
-      d = self._devices[i % len(self._devices)]
-      with ops.device(d):
-        l = index.get(d, [])
-        l.append(fn(m,
-                    *values.select_device_mirrored(d, args),
-                    **values.select_device_mirrored(d, kwargs)))
-        index[d] = l
-    # TODO(josh11b): Need a values.regroup equivalent that handles MapOutput
-    # in addition to PerReplica data.
-    return values.PerReplica({k: values.MapOutput(v) for k, v in index.items()})
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
-    del task_type, task_id
-
-    if session_config:
-      session_config.isolate_session_state = True
-
-    if cluster_spec:
-      self._initialize_multi_worker(self._num_gpus, cluster_spec)
-
-    if self._cross_tower_ops is None:
-      if self._cluster_spec:
-        # It currently cannot detect the toplogy of remote workers. So we
-        # hard-code the multi-worker all-reduce algorithm for now.
-        if len(self._workers) == 1:
-          # The default is "nccl".
-          self._cross_tower_ops = cross_tower_ops_lib.AllReduceCrossDeviceOps()
-        else:
-          # The default is hierarchical reduce and broadcast.
-          self._cross_tower_ops = cross_tower_ops_lib.MultiWorkerAllReduce(
-              self._workers, self._num_gpus)
-      else:
-        self._cross_tower_ops = cross_tower_ops_lib.choose_the_best(
-            self._devices, session_config=session_config)
-
-  def _get_cross_tower_ops(self):
-    if self._cross_tower_ops is None:
-      self._cross_tower_ops = (
-          cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps())
-    return self._cross_tower_ops
-
-  def _reduce(self, aggregation, value, destinations):
-    assert not isinstance(value, values.Mirrored)
-    if not isinstance(value, values.DistributedValues):
-      # This function handles reducing values that are not PerReplica or
-      # Mirrored values. For example, the same value could be present on all
-      # replicas in which case `value` would be a single value or value could
-      # be 0.
-      return _reduce_non_distributed_value(self, aggregation, value,
-                                           destinations)
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA:
-      value = value.get(self._devices[0])
-      if isinstance(value, (int, float)):
-        return value
-      return self.broadcast(value, destinations)
-    return self._get_cross_tower_ops().reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == variable_scope.VariableAggregation.ONLY_FIRST_REPLICA:
-      return [self.broadcast(v.get(self._devices[0]), d)
-              for v, d in value_destination_pairs]
-    return self._get_cross_tower_ops().batch_reduce(aggregation,
-                                                    value_destination_pairs)
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    # TODO(josh11b): In eager mode, use one thread per device.
-    assert isinstance(var, values.DistributedVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    updates = {}
-    for d, v in var._index.items():  # pylint: disable=protected-access
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        # If args and kwargs are not mirrored, the value is returned as is.
-        updates[d] = fn(v,
-                        *values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    assert isinstance(colocate_with, list)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    # TODO(josh11b): In eager mode, use one thread per device.
-    updates = {}
-    for d in colocate_with:
-      name = "update_%d" % self._device_index.get(d)
-      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
-        updates[d] = fn(*values.select_device_mirrored(d, args),
-                        **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  def read_var(self, replica_local_var):
-    """Read the aggregate value of a replica-local variable."""
-    if isinstance(replica_local_var, values.ReplicaLocalVariable):
-      return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
-    assert isinstance(replica_local_var, values.Mirrored)
-    return array_ops.identity(replica_local_var.get())
-
-  def _unwrap(self, val):
-    if isinstance(val, values.DistributedValues):
-      # Return in a deterministic order.
-      if set(val.devices) == self._canonical_device_set:
-        return [val.get(device=d) for d in self._devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
-
-  def value_container(self, val):
-    return values.value_container(val)
-
-  @property
-  def num_replicas(self):
-    return len(self._devices)
-
-  @property
-  def num_replicas_in_sync(self):
-    return len(self._devices)
-
-  def _worker_device_index(self):
-    return self._device_index
-
-  @property
-  def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._devices)
-
-  @property
-  def parameter_devices(self):
-    return list(self._devices)
-
-  @property
-  def between_graph(self):
-    return False
-
-  @property
-  def should_init(self):
-    return True
-
-  @property
-  def should_checkpoint(self):
-    return True
-
-  @property
-  def should_save_summary(self):
-    return True
-
-  def non_slot_devices(self, var_list):
-    del var_list
-    return list(self._devices)
-
-  def _get_devices_from(self, colocate_with=None):
-    if colocate_with is None:
-      return self._devices
     else:
-      return cross_tower_ops_lib.get_devices_from(colocate_with)
-
-  class _MirroredReplicaThread(threading.Thread):
-    """A thread that runs() a function on a device."""
-
-    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
-                 **kwargs):
-      super(MirroredStrategy._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
-      self.coord = coord
-      self.distribution = dist
-      self.device = device
-      self.replica_id = dist.worker_devices.index(device)
-      self.variable_creator_fn = variable_creator_fn
-      # State needed to run and return the results of `fn`.
-      self.main_fn = fn
-      self.main_args = args
-      self.main_kwargs = kwargs
-      self.main_result = None
-      self.done = False
-      # State needed to run the next merge_call() (if any) requested via
-      # ReplicaContext.
-      self.merge_fn = None
-      self.merge_args = None
-      self.merge_kwargs = None
-      self.merge_result = None
-      self.captured_name_scope = None
-      # We use a thread.Event for the main thread to signal when this
-      # thread should start running (`should_run`), and another for
-      # this thread to transfer control back to the main thread
-      # (`has_paused`, either when it gets to a
-      # `get_replica_context().merge_call` or when `fn` returns). In
-      # either case the event starts cleared, is signaled by calling
-      # set(). The receiving thread waits for the signal by calling
-      # wait() and then immediately clearing the event using clear().
-      self.should_run = threading.Event()
-      self.has_paused = threading.Event()
-      # These fields have to do with inheriting various contexts from the
-      # parent thread:
-      # pylint: disable=protected-access
-      self.context_mode = context.context()._eager_context.mode
-      if not context.context()._context_handle:
-        context.context()._initialize_handle_and_devices()
-      self.context_device_policy = (
-          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
-              context.context()._context_handle))
-      self.graph = ops.get_default_graph()
-      self._variable_creator_stack = self.graph._variable_creator_stack[:]
-      self._captured_var_scope = variable_scope.get_variable_scope()
-      # Adding a "/" at end lets us re-enter this scope later.
-      self._name_scope = self.graph.get_name_scope()
-      if self._name_scope:
-        self._name_scope += "/"
-      if self.replica_id > 0:
-        if not self._name_scope:
-          self._name_scope = ""
-        self._name_scope += "replica_%d/" % self.replica_id
-
-    def run(self):
-      # pylint: disable=protected-access
-      self.graph._variable_creator_stack = self._variable_creator_stack
-      self.should_run.wait()
-      self.should_run.clear()
-      try:
-        if self.coord.should_stop():
-          return
-        with self.coord.stop_on_exception(), \
-            context.context()._mode(self.context_mode), \
-            context.context().device_policy(self.context_device_policy), \
-            _enter_graph(self.graph), \
-            MirroredReplicaContext(self.distribution, self.replica_id), \
-            ops.device(self.device), \
-            ops.name_scope(self._name_scope), \
-            variable_scope.variable_scope(
-                self._captured_var_scope, reuse=self.replica_id > 0), \
-            variable_scope.variable_creator_scope(self.variable_creator_fn):
-          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
-          self.done = True
-      finally:
-        self.has_paused.set()
-
-
-class MirroredReplicaContext(distribute_lib.ReplicaContext):
-  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
-
-  Opened in `_MirroredReplicaThread`, to allow the user to invoke
-  `MirroredStrategy`'s specific implementation of `merge_call()`,
-  which works by delegating the function and its arguments to
-  the main thread (the one that invoked
-  `MirroredStrategy.call_for_each_replica()`).
-  """
-
-  def _merge_call(self, fn, args, kwargs):
-    """Delegate to the main thread to actually perform merge_call()."""
-    t = threading.current_thread()  # a _MirroredReplicaThread
-    t.merge_fn = fn
-    t.merge_args = args
-    t.merge_kwargs = kwargs
-    t.captured_name_scope = t.graph.get_name_scope()
-    # Adding a "/" at end lets us re-enter this scope later.
-    if t.captured_name_scope:
-      t.captured_name_scope += "/"
-    t.has_paused.set()
-    t.should_run.wait()
-    t.should_run.clear()
-    if t.coord.should_stop():
-      raise _RequestedStop()
-    return t.merge_result
-
-  @property
-  def device(self):
-    raise RuntimeError("Use .devices instead")
+      return values.MultiWorkerDataset(
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=self._auto_shard_dataset)
 
+  # TODO(priyag): Delete this once all strategies use global batch size.
   @property
-  def devices(self):
-    distribute_lib.require_replica_context(self)
-    return [self._distribution_strategy.worker_devices[self._replica_id]]
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
index 0dbf6ba0567a3637d3ebfca6df05804dd61e07c3..36be5c83f8bafb6c934d1d7682b5227b1f71c089 100644
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
+++ b/tensorflow/contrib/distribute/python/mirrored_strategy_multigpu_test.py
@@ -20,22 +20,27 @@ from __future__ import print_function
 
 import sys
 
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import mirrored_strategy
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import func_graph
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training as keras_training
 from tensorflow.python.keras.layers import core as keras_core
 from tensorflow.python.layers import core
@@ -46,8 +51,6 @@ from tensorflow.python.ops import rnn_cell_impl
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import optimizer as optimizer_lib
 from tensorflow.python.training import server_lib
@@ -56,253 +59,229 @@ from tensorflow.python.training import server_lib
 GPU_TEST = "test_gpu" in sys.argv[0]
 
 
-class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase):
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.mirrored_strategy_with_two_gpus,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_two_gpus],
+    mode=["graph", "eager"]))
+class MirroredTwoDeviceDistributionTest(strategy_test_lib.DistributionTestBase,
+                                        parameterized.TestCase):
 
-  def _get_distribution_strategy(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-      if context.num_gpus() > 1:
-        devices = ["/device:GPU:0", "/device:GPU:1"]
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-    return mirrored_strategy.MirroredStrategy(devices)
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
 
-  def testMinimizeLossEager(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
 
-  def testMinimizeLossGraph(self):
-    soft_placement = not GPU_TEST
-    print("testMinimizeLossGraph soft_placement:", soft_placement)
-    self._test_minimize_loss_graph(
-        self._get_distribution_strategy(), soft_placement=soft_placement)
-
-  def testMapReduce(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testReplicaId(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_replica_id(self._get_distribution_strategy())
-
-  def testNumReplicas(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self.assertEqual(2, self._get_distribution_strategy().num_replicas)
-
-  def testNumReplicasInSync(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self.assertEqual(2, self._get_distribution_strategy().
-                     num_replicas_in_sync)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testRunRegroupError(self):
-
-    def run_fn(device_id):
+  def testNumReplicasInSync(self, distribution):
+    self.assertEqual(2, distribution.num_replicas_in_sync)
+
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
+
+  def testRunRegroupError(self, distribution):
+    def run_fn():
+      replica_id = int(self.evaluate(_replica_id()))
       # Generates a list with different lengths on different devices.
       # Will fail in _regroup() (if more than one device).
-      return list(range(device_id))
-
-    dist = self._get_distribution_strategy()
-    with dist.scope(), self.assertRaises(AssertionError):
-      dist.call_for_each_replica(run_fn, args=(dist.worker_device_index,))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceToCpu(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    def run_fn(device_id):
-      return device_id
-
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_replica(
-          run_fn, args=(dist.worker_device_index,))
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
-          result,
-          destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(1, len(unwrapped))
-      expected = sum(range(len(dist.worker_devices)))
-      self.assertEqual(expected, self.evaluate(unwrapped[0]))
-
-  @test_util.run_in_graph_and_eager_modes
-  def testReduceOnlyFirstReplicaUpdates(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    def run_fn(device_id):
-      return constant_op.constant(3 + 5 * device_id)
-
-    dist = self._get_distribution_strategy()
-    with dist.scope():
-      result = dist.call_for_each_replica(
-          run_fn, args=(dist.worker_device_index,))
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.ONLY_FIRST_REPLICA,
-          result,
-          destinations="/device:CPU:0")
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(1, len(unwrapped))
-      self.assertEqual(3, self.evaluate(unwrapped[0]))
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testReduceToMultipleDestinations(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
-
-    devices = ["/device:GPU:0"]
-    if GPU_TEST:
-      self.assertGreater(context.num_gpus(), 0)
-    print(self.id().split(".")[-1], "devices:", ", ".join(devices))
-
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      reduced = dist.reduce(
-          variable_scope.VariableAggregation.SUM,
-          1.0,
-          destinations=["/device:CPU:0", "/device:GPU:0"])
-      unwrapped = dist.unwrap(reduced)
-      self.assertEqual(2, len(unwrapped))
-      self.assertEqual(1.0, self.evaluate(unwrapped[0]))
+      return list(range(replica_id))
+
+    with distribution.scope(), self.assertRaises(AssertionError):
+      distribution.extended.call_for_each_replica(run_fn)
+
+  def testReduceToCpu(self, distribution):
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(_replica_id)
+      reduced = distribution.reduce(reduce_util.ReduceOp.SUM, result)
+      expected = sum(range(distribution.num_replicas_in_sync))
+      self.assertEqual(expected, self.evaluate(reduced))
+
+  def testMakeInputFnIterator(self, distribution):
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=2,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = distribution.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(iterator, distribution.extended.worker_devices,
+                                 expected_values)
+
+  def testGlobalStepUpdate(self, distribution):
+    self._test_global_step_update(distribution)
+
+
+def one_device_combinations():
+  return combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_cpu,
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_cpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph", "eager"])
+
+
+class MirroredOneDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  @combinations.generate(one_device_combinations())
+  def testMinimizeLoss(self, distribution):
+    if context.executing_eagerly():
+      self._test_minimize_loss_eager(distribution)
+    else:
+      self._test_minimize_loss_graph(distribution)
+
+  @combinations.generate(one_device_combinations())
+  def testReplicaId(self, distribution):
+    self._test_replica_id(distribution)
+
+  @combinations.generate(one_device_combinations())
+  def testCallAndMergeExceptions(self, distribution):
+    self._test_call_and_merge_exceptions(distribution)
 
 
+class MirroredStrategyVariableCreatorStackTest(
+    test.TestCase, parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      distribution=[combinations.mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu],
+      mode=["graph"]))
+  def testCreatorStacksAreThreadLocal(self, distribution):
+    def model_fn():
+      replica_id_str = str(self.evaluate(_replica_id()))
+
+      def thread_creator_fn(next_creator, *args, **kwargs):
+        return next_creator(*args, **kwargs) + ":thread_" + replica_id_str
+
+      with variable_scope.variable_creator_scope(thread_creator_fn):
+        # Create a variable in this scope.
+        v = variable_scope.variable(1.0)
+
+        # This will pause the current thread, and execute the other thread.
+        ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
+
+    def main_thread_creator(next_creator, *args, **kwargs):
+      # We are not using the underlying next_creator for test purposes.
+      del next_creator, args, kwargs
+      return "main_thread"
+
+    with context.graph_mode(), \
+        distribution.scope(), \
+        variable_scope.variable_creator_scope(main_thread_creator):
+      result = distribution.extended.call_for_each_replica(model_fn)
+      result = distribution.unwrap(result)
+      expected = ("main_thread:thread_0", "main_thread:thread_1")
+      self.assertEqual(expected, result)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredStrategyVariableCreationTest(test.TestCase):
 
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
+  # TODO(priyag): Modify more tests to use this helper and check more
+  # properties.
+  def _test_mv_properties(self, var, name):
+    self.assertIsInstance(var, values.MirroredVariable)
+    self.assertEqual(name, var.name)
+    for d in var.devices:
+      self.assertEqual(d, var.get(d).device)
+
+  def testVariableInFuncGraph(self, distribution):
+    def model_fn():
+      v = variable_scope.variable(2.0, name="bar")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
+    with func_graph.FuncGraph("fg").as_default(), distribution.scope():
+      v1 = variable_scope.variable(1.0, name="foo")
+      v2 = distribution.extended.call_for_each_replica(model_fn)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSingleVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._test_mv_properties(v1, "foo:0")
+    self._test_mv_properties(v2, "bar:0")
 
+  def testSingleVariable(self, distribution):
     def model_fn():
       # This variable should be created only once across the threads because of
-      # special variable_creator functions used by `dist.call_for_each_replica`.
+      # special variable_creator functions used by
+      # `distribution.extended.call_for_each_replica`.
       v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testUnnamedVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self._test_mv_properties(result, "foo:0")
 
+  def testUnnamedVariable(self, distribution):
     def model_fn():
       v = variable_scope.variable(1.0)
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn)
-      self.assertIsInstance(result, values.MirroredVariable)
-      # Default name of "Variable" will be used.
-      self.assertEquals("Variable:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self._test_mv_properties(result, "Variable:0")
 
+  def testMultipleVariables(self, distribution):
     def model_fn():
       vs = []
       for i in range(5):
         vs.append(variable_scope.variable(1.0, name="foo" + str(i)))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for i, v in enumerate(result):
-        self.assertIsInstance(v, values.MirroredVariable)
-        self.assertEquals("foo" + str(i) + ":0", v.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testMultipleVariablesWithSameCanonicalName(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self._test_mv_properties(v, "foo" + str(i) + ":0")
 
+  def testMultipleVariablesWithSameCanonicalName(self, distribution):
     def model_fn():
       vs = []
       vs.append(variable_scope.variable(1.0, name="foo/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar"))
       vs.append(variable_scope.variable(1.0, name="foo_1/bar_1"))
       vs.append(variable_scope.variable(1.0, name="foo/bar_1"))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return vs
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn)
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       for v in result:
         self.assertIsInstance(v, values.MirroredVariable)
-      self.assertEquals(4, len(result))
-      self.assertEquals("foo/bar:0", result[0].name)
-      self.assertEquals("foo_1/bar:0", result[1].name)
-      self.assertEquals("foo_1/bar_1:0", result[2].name)
-      self.assertEquals("foo/bar_1:0", result[3].name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testVariableWithSameCanonicalNameAcrossThreads(self):
-    self._skip_eager_if_gpus_less_than(1)
-
-    def model_fn(device_id):
-      v = variable_scope.variable(1.0, name="foo_" + str(device_id))
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
-      return v
+      self.assertEqual(4, len(result))
+      self.assertEqual("foo/bar:0", result[0].name)
+      self.assertEqual("foo_1/bar:0", result[1].name)
+      self.assertEqual("foo_1/bar_1:0", result[2].name)
+      self.assertEqual("foo/bar_1:0", result[3].name)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
+  def testVariableWithSameCanonicalNameAcrossThreads(self, distribution):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
+      v = variable_scope.variable(1.0, name="foo_" + str(replica_id))
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-    with dist.scope():
-      result = dist.call_for_each_replica(
-          model_fn, args=(dist.worker_device_index,))
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
       self.assertIsInstance(result, values.MirroredVariable)
       # The resulting mirrored variable will use the name from the first device.
-      self.assertEquals("foo_0:0", result.name)
+      self.assertEqual("foo_0:0", result.name)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithLayers(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testWithLayers(self, distribution):
     def model_fn(features):
       with variable_scope.variable_scope("common"):
         layer1 = core.Dense(1)
@@ -310,17 +289,14 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         layer2 = core.Dense(1)
         layer2(features)
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         layer3 = core.Dense(1)
         layer3(features)
         return [(layer1.kernel, layer1.bias),
                 (layer2.kernel, layer2.bias),
                 (layer3.kernel, layer3.bias)]
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-    ds = dist.distribute_dataset(
+    ds = distribution.distribute_dataset(
         lambda: dataset_ops.Dataset.from_tensors([[1.]]).repeat(10))
     if context.executing_eagerly():
       iterator = ds.make_one_shot_iterator()
@@ -330,26 +306,23 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
     features = iterator.get_next()
 
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn, args=(features,))
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=(features,))
       suffixes = ["", "_1", "_2"]
       for (kernel, bias), suffix in zip(result, suffixes):
         self.assertIsInstance(kernel, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/kernel:0", kernel.name)
+        self.assertEqual("common/dense" + suffix + "/kernel:0", kernel.name)
         self.assertIsInstance(bias, values.MirroredVariable)
-        self.assertEquals("common/dense" + suffix + "/bias:0", bias.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("common/dense" + suffix + "/bias:0", bias.name)
 
+  def testWithVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.variable(1.0, name="var0", aggregation=None)
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.variable(1.0, name="var1")
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.variable(
             1.0,
             name="var2",
@@ -363,37 +336,31 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       v = variable_scope.variable(1.0, name="var-main0")
-      self.assertEquals("var-main0:0", v.name)
+      self.assertEqual("var-main0:0", v.name)
 
-      result = dist.call_for_each_replica(model_fn)
-      self.assertEquals(4, len(result))
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(4, len(result))
       v0, v1, v2, v3 = result
       self.assertIsInstance(v0, values.MirroredVariable)
-      self.assertEquals("var0:0", v0.name)
+      self.assertEqual("var0:0", v0.name)
       self.assertIsInstance(v1, values.MirroredVariable)
-      self.assertEquals("common/var1:0", v1.name)
+      self.assertEqual("common/var1:0", v1.name)
       self.assertIsInstance(v2, values.ReplicaLocalVariable)
-      self.assertEquals("common/var2:0", v2.name)
-      self.assertEquals(variable_scope.VariableAggregation.SUM, v2.aggregation)
+      self.assertEqual("common/var2:0", v2.name)
+      self.assertEqual(variable_scope.VariableAggregation.SUM, v2.aggregation)
       self.assertIsInstance(v3, values.MirroredVariable)
-      self.assertEquals("common/var3:0", v3.name)
-      self.assertEquals(variable_scope.VariableAggregation.MEAN, v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testWithGetVariableAndVariableScope(self):
-    self._skip_eager_if_gpus_less_than(1)
+      self.assertEqual("common/var3:0", v3.name)
+      self.assertEqual(variable_scope.VariableAggregation.MEAN, v3.aggregation)
 
+  def testWithGetVariableAndVariableScope(self, distribution):
     def model_fn():
       v0 = variable_scope.get_variable("var0", [1])
       with variable_scope.variable_scope("common"):
         v1 = variable_scope.get_variable("var1", [1])
         # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         v2 = variable_scope.get_variable(
             "var2", [1],
             synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -405,33 +372,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       return v0, v1, v2, v3
 
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+    with distribution.scope():
       with variable_scope.variable_scope("main"):
         v = variable_scope.get_variable("var-main0", [1])
-        self.assertEquals("main/var-main0:0", v.name)
+        self.assertEqual("main/var-main0:0", v.name)
 
-        result = dist.call_for_each_replica(model_fn)
-        self.assertEquals(4, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(4, len(result))
         v0, v1, v2, v3 = result
         self.assertIsInstance(v0, values.MirroredVariable)
-        self.assertEquals("main/var0:0", v0.name)
+        self.assertEqual("main/var0:0", v0.name)
         self.assertIsInstance(v1, values.MirroredVariable)
-        self.assertEquals("main/common/var1:0", v1.name)
+        self.assertEqual("main/common/var1:0", v1.name)
         self.assertIsInstance(v2, values.ReplicaLocalVariable)
-        self.assertEquals("main/common/var2:0", v2.name)
-        self.assertEquals(variable_scope.VariableAggregation.SUM,
-                          v2.aggregation)
+        self.assertEqual("main/common/var2:0", v2.name)
+        self.assertEqual(variable_scope.VariableAggregation.SUM,
+                         v2.aggregation)
         self.assertIsInstance(v3, values.MirroredVariable)
-        self.assertEquals("main/common/var3:0", v3.name)
-        self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                          v3.aggregation)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testOnlyFirstReplicaUpdatesVariables(self):
-    self._skip_eager_if_gpus_less_than(1)
+        self.assertEqual("main/common/var3:0", v3.name)
+        self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                         v3.aggregation)
 
+  def testOnlyFirstReplicaUpdatesVariables(self, distribution):
     def create_fn():
       aggregation = variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
       v0 = variable_scope.variable(
@@ -447,71 +409,73 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
       return v0, v1
 
     devices = ["/device:GPU:0", "/device:CPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
-      v0, v1 = dist.call_for_each_replica(create_fn)
+    with distribution.scope():
+      v0, v1 = distribution.extended.call_for_each_replica(create_fn)
       self.evaluate(v0.initializer)
       self.assertEqual(2.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0, self.evaluate(distribution.extended.read_var(v0)))
       self.evaluate(v1.initializer)
       self.assertEqual(3.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0, self.evaluate(distribution.extended.read_var(v1)))
+
+      def replica_id_plus_one():
+        return math_ops.cast(_replica_id() + 1, dtype=dtypes.float32)
 
       # Update using the assign_add member function.
-      def update_member_fn(device_id):
-        update0 = v0.assign_add(5.0 * (device_id + 1))
-        update1 = v1.assign_add(7.0 * (device_id + 1))
+      def update_member_fn():
+        update0 = v0.assign_add(5.0 * replica_id_plus_one())
+        update1 = v1.assign_add(7.0 * replica_id_plus_one())
         return update0, update1
 
-      update0a, update1a = dist.call_for_each_replica(
-          update_member_fn, args=(dist.worker_device_index,))
+      update0a, update1a = distribution.extended.call_for_each_replica(
+          update_member_fn)
 
       # Update "sync on read" variable.
-      self.evaluate(dist.group(update0a))
+      self.evaluate(distribution.group(update0a))
       self.assertEqual(2.0 + 5.0, self.evaluate(v0.get(devices[0])))
       # Writes are not synchronized for "sync on read" variables,
       # so device[1] can end up with a different value.
       self.assertEqual(2.0 + 2*5.0, self.evaluate(v0.get(devices[1])))
       # Always reads from device 0.
-      self.assertEqual(2.0 + 5.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1a))
+      self.evaluate(distribution.group(update1a))
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[0])))
       # Writes are synchronized for v1, only the argument to assign_add on
       # device[0] is used.
       self.assertEqual(3.0 + 7.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0, self.evaluate(dist.read_var(v1)))
+      self.assertEqual(3.0 + 7.0, self.evaluate(
+          distribution.extended.read_var(v1)))
 
       # Update using state_ops.assign_add global function.
-      def update_state_ops_fn(device_id):
-        update0 = state_ops.assign_add(v0, 11.0 * (device_id + 1))
-        update1 = state_ops.assign_add(v1, 13.0 * (device_id + 1))
+      def update_state_ops_fn():
+        update0 = state_ops.assign_add(v0, 11.0 * replica_id_plus_one())
+        update1 = state_ops.assign_add(v1, 13.0 * replica_id_plus_one())
         return update0, update1
 
-      update0b, update1b = dist.call_for_each_replica(
-          update_state_ops_fn, args=(dist.worker_device_index,))
-      self.evaluate(dist.group(update0b))
+      update0b, update1b = distribution.extended.call_for_each_replica(
+          update_state_ops_fn)
+      self.evaluate(distribution.group(update0b))
 
       # Update "sync on read" variable.
       self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(v0.get(devices[0])))
       self.assertEqual(2.0 + 2*5.0 + 2*11.0, self.evaluate(v0.get(devices[1])))
-      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(dist.read_var(v0)))
+      self.assertEqual(2.0 + 5.0 + 11.0, self.evaluate(
+          distribution.extended.read_var(v0)))
 
       # Update "sync on write" variable.
-      self.evaluate(dist.group(update1b))
+      self.evaluate(distribution.group(update1b))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[0])))
       self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(v1.get(devices[1])))
-      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(dist.read_var(v1)))
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+      self.assertEqual(3.0 + 7.0 + 13.0, self.evaluate(
+          distribution.extended.read_var(v1)))
+
+  def testNoneSynchronizationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -520,12 +484,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             "v", [1],
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNoneSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testNoneSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "`NONE` variable synchronization mode is not "
           "supported with `Mirrored` distribution strategy. Please change "
@@ -535,23 +495,15 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             name="v",
             synchronization=variable_scope.VariableSynchronization.NONE)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidSynchronizationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidSynchronizationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable synchronization mode: Invalid for "
           "variable: v"):
         variable_scope.variable(1.0, name="v", synchronization="Invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithGetVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithGetVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -560,12 +512,8 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testInvalidAggregationWithVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-    with dist.scope():
+  def testInvalidAggregationWithVariable(self, distribution):
+    with distribution.scope():
       with self.assertRaisesRegexp(
           ValueError, "Invalid variable aggregation mode: invalid for "
           "variable: v"):
@@ -575,55 +523,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
             synchronization=variable_scope.VariableSynchronization.ON_WRITE,
             aggregation="invalid")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testThreeDevices(self):
-    self._skip_eager_if_gpus_less_than(2)
-
-    def model_fn():
-      v = variable_scope.variable(1.0, name="foo")
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
-      return v
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"])
-
-    with dist.scope():
-      result = dist.call_for_each_replica(model_fn)
-      self.assertIsInstance(result, values.MirroredVariable)
-      self.assertEquals("foo:0", result.name)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testNonMatchingVariableCreation(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testNonMatchingVariableCreation(self, distribution):
     def model_fn(name):
       v = variable_scope.variable(1.0, name=name)
-      distribution_strategy_context.get_replica_context().merge_call(
-          lambda _: _)
+      ds_context.get_replica_context().merge_call(lambda _: _)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
+    with distribution.scope():
       names = values.DistributedValues({
           "/device:CPU:0": "foo",
           "/device:GPU:0": "bar"
       })
       with self.assertRaises(RuntimeError):
-        _ = dist.call_for_each_replica(model_fn, args=(names,))
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testReplicaLocalVariable(self):
-    self._skip_eager_if_gpus_less_than(1)
+        _ = distribution.extended.call_for_each_replica(model_fn, args=(names,))
 
+  def testReplicaLocalVariable(self, distribution):
     all_v_sum = {}
     all_v_mean = {}
     components_sum = {}
     components_mean = {}
 
-    def model_fn(device_id):
+    def model_fn():
+      replica_id = self.evaluate(_replica_id())
       v_sum = variable_scope.variable(
           1.0,
           synchronization=variable_scope.VariableSynchronization.ON_READ,
@@ -634,26 +555,22 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.MEAN)
       self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
       self.assertTrue(isinstance(v_mean, values.ReplicaLocalVariable))
-      updates = [v_sum.assign_add(2.0 + device_id),
-                 v_mean.assign(6.0 * device_id)]
-      all_v_sum[device_id] = v_sum
-      all_v_mean[device_id] = v_mean
+      updates = [v_sum.assign_add(2.0 + replica_id),
+                 v_mean.assign(6.0 * replica_id)]
+      all_v_sum[replica_id] = v_sum
+      all_v_mean[replica_id] = v_mean
       c_sum = v_sum.get()
       c_mean = v_mean.get()
-      components_sum[device_id] = c_sum
-      components_mean[device_id] = c_mean
+      components_sum[replica_id] = c_sum
+      components_mean[replica_id] = c_mean
       self.assertIsNot(v_sum, c_sum)
       self.assertIsNot(v_mean, c_mean)
       return updates, v_sum, v_mean, c_sum, c_mean
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
+    with distribution.scope():
       # Create "sum" and "mean" versions of ReplicaLocalVariables.
       ret_ops, ret_v_sum, ret_v_mean, regrouped_sum, regrouped_mean = (
-          dist.call_for_each_replica(
-              model_fn, args=(dist.worker_device_index,)))
+          distribution.extended.call_for_each_replica(model_fn))
       # Should see the same wrapping instance in all replicas.
       self.assertIs(all_v_sum[0], ret_v_sum)
       self.assertIs(all_v_mean[0], ret_v_mean)
@@ -668,10 +585,10 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
 
       # Apply updates
       self.evaluate(variables.global_variables_initializer())
-      self.evaluate([y for x in ret_ops for y in dist.unwrap(x)])
+      self.evaluate([y for x in ret_ops for y in distribution.unwrap(x)])
       expected_sum = 0.0
       expected_mean = 0.0
-      for i, d in enumerate(dist.worker_devices):
+      for i, d in enumerate(distribution.extended.worker_devices):
         # Should see different values on different devices.
         v_sum_value = self.evaluate(ret_v_sum.get(d).read_value())
         v_mean_value = self.evaluate(ret_v_mean.get(d).read_value())
@@ -681,69 +598,125 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
         expected = i * 6.0
         self.assertEqual(expected, v_mean_value)
         expected_mean += expected
-      expected_mean /= len(dist.worker_devices)
+      expected_mean /= len(distribution.extended.worker_devices)
 
       # Without get(device), should return the value you get by
       # applying the reduction across all replicas (whether you use
       # read_var(), get(), or nothing).
-      self.assertEqual(expected_sum, self.evaluate(dist.read_var(ret_v_sum)))
-      self.assertEqual(expected_mean, self.evaluate(dist.read_var(ret_v_mean)))
+      self.assertEqual(expected_sum, self.evaluate(
+          distribution.extended.read_var(ret_v_sum)))
+      self.assertEqual(expected_mean, self.evaluate(
+          distribution.extended.read_var(ret_v_mean)))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum.get()))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean.get()))
       self.assertEqual(expected_sum, self.evaluate(ret_v_sum))
       self.assertEqual(expected_mean, self.evaluate(ret_v_mean))
 
+  # TODO(priyag): Update this test to work in eager mode as well.
+  def testDynamicRnnVariables(self, distribution):
+    def model_fn():
+      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
+      cell_fw = rnn_cell_impl.LSTMCell(300)
+      cell_bw = rnn_cell_impl.LSTMCell(300)
+      (outputs, _) = rnn.bidirectional_dynamic_rnn(
+          cell_fw,
+          cell_bw,
+          inputs,
+          dtype=dtypes.float32)
+      return outputs
+
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      # Two variables are created by the RNN layer.
+      self.assertEqual(2, len(result))
+      for v in result:
+        self.assertIsInstance(v, values.DistributedValues)
+        _, v1 = distribution.unwrap(v)
+        self.assertStartsWith(v1._op.name, "replica_1/")
+
+  def testReplicaLocalVariableUpdate(self, distribution):
+    def model_fn():
+      v_sum = variable_scope.variable(
+          1.0,
+          synchronization=variable_scope.VariableSynchronization.ON_READ,
+          aggregation=variable_scope.VariableAggregation.SUM)
+      self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
+      return v_sum
+
+    def update(var, value):
+      return var.assign(value)
+
+    with distribution.scope():
+      ret_v_sum = distribution.extended.call_for_each_replica(model_fn)
+
+      # Initialize variables.
+      self.evaluate(variables.global_variables_initializer())
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values before running the update ops.
+      self.assertEqual(1.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(2.0, self.evaluate(ret_v_sum))
+
+      # Apply updates.
+      update_ops = distribution.extended.update(
+          ret_v_sum, update, args=(5.0,), group=False)
+      self.evaluate(update_ops)
+      # Assert that the aggregated value of the replica local vars is the sum
+      # of the individual values after running the update ops.
+      self.assertEqual(5.0, self.evaluate(ret_v_sum.get(
+          distribution.extended.worker_devices[0]).read_value()))
+      self.assertEqual(10.0, self.evaluate(ret_v_sum))
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph"]))
+class MirroredStrategyNameScopeTest(test.TestCase):
   # NOTE(priyag): Names and name scopes are ignored in eager, hence we are not
   # testing this in eager mode.
 
-  def testNameScope(self):
+  def testNameScope(self, distribution):
     def model_fn():
       with ops.name_scope("foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(1.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
-        result = dist.call_for_each_replica(model_fn)
-        self.assertEquals(2, len(result))
+        result = distribution.extended.call_for_each_replica(model_fn)
+        self.assertEqual(2, len(result))
         for v, name in zip(result, ["a", "b"]):
           self.assertIsInstance(v, values.DistributedValues)
-          v0, v1 = dist.unwrap(v)
-          self.assertEquals("main/foo/" + name + ":0", v0.name)
-          self.assertEquals("main/replica_1/foo/" + name + ":0", v1.name)
+          v0, v1 = distribution.unwrap(v)
+          self.assertEqual("main/foo/" + name + ":0", v0.name)
+          self.assertEqual("main/replica_1/foo/" + name + ":0", v1.name)
 
-  def testWithDefaultName(self):
+  def testWithDefaultName(self, distribution):
     def model_fn():
       with ops.name_scope(None, "foo"):
         a = constant_op.constant(1.0, name="a")
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
+        ds_context.get_replica_context().merge_call(lambda _: _)
         b = constant_op.constant(2.0, name="b")
       return a, b
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_replica(model_fn)
-      self.assertEquals(2, len(result))
+    with context.graph_mode(), distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertEqual(2, len(result))
       for v, name in zip(result, ["a", "b"]):
         self.assertIsInstance(v, values.DistributedValues)
-        v0, v1 = dist.unwrap(v)
-        self.assertEquals("foo/" + name + ":0", v0.name)
-        self.assertEquals("replica_1/foo/" + name + ":0", v1.name)
+        v0, v1 = distribution.unwrap(v)
+        self.assertEqual("foo/" + name + ":0", v0.name)
+        self.assertEqual("replica_1/foo/" + name + ":0", v1.name)
 
   # variable_scope.variable() respects name scopes when creating
   # variables. On the other hand variable_scope.get_variable() ignores name
   # scopes when creating variables. We test both methods of creating variables
   # to make sure that we have the same variable names in both cases.
-  def testNameScopeWithVariable(self):
+  def testNameScopeWithVariable(self, distribution):
     def in_cross_replica(_):
       c = variable_scope.variable(1.0, name="c")
       return c
@@ -751,32 +724,28 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.variable(1.0, name="b")
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_replica_context().merge_call(
-            in_cross_replica)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.variable(1.0, name="a")
-        result = dist.call_for_each_replica(model_fn)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("main/a:0", a0.name)
-      self.assertEquals("main/a/replica_1:0", a1.name)
-      self.assertEquals("main/b:0", b0.name)
-      self.assertEquals("main/b/replica_1:0", b1.name)
-      self.assertEquals("main/foo/c:0", c0.name)
-      self.assertEquals("main/foo/c/replica_1:0", c1.name)
-
-  def testNameScopeWithGetVariable(self):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("main/a:0", a0.name)
+      self.assertEqual("main/a/replica_1:0", a1.name)
+      self.assertEqual("main/b:0", b0.name)
+      self.assertEqual("main/b/replica_1:0", b1.name)
+      self.assertEqual("main/foo/c:0", c0.name)
+      self.assertEqual("main/foo/c/replica_1:0", c1.name)
+
+  def testNameScopeWithGetVariable(self, distribution):
     def in_cross_replica(_):
       c = variable_scope.get_variable("c", [1])
       return c
@@ -784,118 +753,80 @@ class MirroredStrategyVariableCreationTest(test.TestCase):
     def model_fn():
       b = variable_scope.get_variable("b", [1])
       with ops.name_scope("foo"):
-        c = distribution_strategy_context.get_replica_context().merge_call(
-            in_cross_replica)
+        c = ds_context.get_replica_context().merge_call(in_cross_replica)
       return b, c
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
+    with context.graph_mode(), distribution.scope():
       with ops.name_scope("main"):
         a = variable_scope.get_variable("a", [1])
-        result = dist.call_for_each_replica(model_fn)
+        result = distribution.extended.call_for_each_replica(model_fn)
       result_b = result[0]
       result_c = result[1]
       self.assertIsInstance(result_b, values.DistributedValues)
       self.assertIsInstance(result_c, values.DistributedValues)
-      a0, a1 = dist.unwrap(a)
-      b0, b1 = dist.unwrap(result_b)
-      c0, c1 = dist.unwrap(result_c)
-      self.assertEquals("a:0", a0.name)
-      self.assertEquals("a/replica_1:0", a1.name)
-      self.assertEquals("b:0", b0.name)
-      self.assertEquals("b/replica_1:0", b1.name)
-      self.assertEquals("c:0", c0.name)
-      self.assertEquals("c/replica_1:0", c1.name)
-
-  def testDynamicRnnVariables(self):
+      a0, a1 = distribution.unwrap(a)
+      b0, b1 = distribution.unwrap(result_b)
+      c0, c1 = distribution.unwrap(result_c)
+      self.assertEqual("a:0", a0.name)
+      self.assertEqual("a/replica_1:0", a1.name)
+      self.assertEqual("b:0", b0.name)
+      self.assertEqual("b/replica_1:0", b1.name)
+      self.assertEqual("c:0", c0.name)
+      self.assertEqual("c/replica_1:0", c1.name)
+
+
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.MirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2),
+            combinations.NamedDistribution(
+                "CoreMirrored3Devices",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    ["/device:GPU:0", "/device:GPU:1", "/device:CPU:0"]),
+                required_gpus=2)
+        ],
+        mode=["graph", "eager"]))
+class MirroredThreeDeviceDistributionTest(
+    strategy_test_lib.DistributionTestBase,
+    parameterized.TestCase):
+
+  def testThreeDevices(self, distribution):
     def model_fn():
-      inputs = constant_op.constant(2 * [2 * [[0.0, 1.0, 2.0, 3.0, 4.0]]])
-      cell_fw = rnn_cell_impl.LSTMCell(300)
-      cell_bw = rnn_cell_impl.LSTMCell(300)
-      (outputs, _) = rnn.bidirectional_dynamic_rnn(
-          cell_fw,
-          cell_bw,
-          inputs,
-          dtype=dtypes.float32)
-      return outputs
-
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with context.graph_mode(), dist.scope():
-      result = dist.call_for_each_replica(model_fn)
-      # Two variables are created by the RNN layer.
-      self.assertEquals(2, len(result))
-      for v in result:
-        self.assertIsInstance(v, values.DistributedValues)
-        _, v1 = dist.unwrap(v)
-        self.assertStartsWith(v1.name, "replica_1/")
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testReplicaLocalVariableUpdate(self):
-    with context.graph_mode():
-
-      def model_fn():
-        v_sum = variable_scope.variable(
-            1.0,
-            synchronization=variable_scope.VariableSynchronization.ON_READ,
-            aggregation=variable_scope.VariableAggregation.SUM)
-        self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
-        return v_sum
-
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:GPU:1"])
-
-      def update(var, value):
-        return var.assign(value)
-
-      with dist.scope():
-        ret_v_sum = dist.call_for_each_replica(model_fn)
-        update_ops = dist.update(ret_v_sum, update, 5.0, grouped=False)
-
-        # Initialize variables.
-        self.evaluate(variables.global_variables_initializer())
-        # Assert that the aggregated value of the replica local vars is the sum
-        # of the individual values before running the update ops.
-        self.assertEquals(1.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(2.0, self.evaluate(ret_v_sum))
+      v = variable_scope.variable(1.0, name="foo")
+      ds_context.get_replica_context().merge_call(lambda _: _)
+      return v
 
-        # Apply updates.
-        self.evaluate(update_ops)
-        # Assert that the aggregated value of the replica local vars is the sum
-        # of the individual values after running the update ops.
-        self.assertEquals(5.0, self.evaluate(
-            ret_v_sum.get(dist._devices[0]).read_value()))
-        self.assertEquals(10.0, self.evaluate(ret_v_sum))
+    with distribution.scope():
+      result = distribution.extended.call_for_each_replica(model_fn)
+      self.assertIsInstance(result, values.MirroredVariable)
+      self.assertEqual("foo:0", result.name)
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredVariableUpdateTest(test.TestCase):
   # The following tests check assign, assign_add and assign_sub on Mirrored
   # variables in replica and cross replica context.
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
-
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Enough GPUs not available for this test in eager mode.")
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithoutAggregationType(self):
+  def testAssignMirroredVarReplicaContextWithoutAggregationType(self,
+                                                                distribution):
     # Test that we always have an aggregation type set on the mirrored variable
     # if we assign to it in replica mode.
-    self._skip_eager_if_gpus_less_than(1)
     def var_fn():
       v = variable_scope.variable(1.0, name="foo")
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -905,23 +836,19 @@ class MirroredVariableUpdateTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError, "You must specify an aggregation method to update a "
                       "MirroredVariable in Replica Context."):
-        self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithSum(self):
+  def testAssignMirroredVarReplicaContextWithSum(self, distribution):
     # Test that we don't reduce a non-per-replica value with the "sum"
     # aggregation type.
-    self._skip_eager_if_gpus_less_than(1)
     def var_fn():
       v = variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.SUM)
       return v
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
 
@@ -930,219 +857,184 @@ class MirroredVariableUpdateTest(test.TestCase):
 
       with self.assertRaisesRegexp(
           ValueError, "A non-DistributedValues value 5.0 cannot be reduced "
-          "with the given aggregation VariableAggregation.SUM."):
-        self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
+          "with the given reduce op ReduceOp.SUM."):
+        self.evaluate(distribution.unwrap(
+            distribution.extended.call_for_each_replica(model_fn)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign(6.0))
-      self.assertEquals(6.0, mirrored_var_result)
+      self.assertEqual(6.0, mirrored_var_result)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(0.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(0.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(1.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       # read_value == True
       mirrored_var_result = self.evaluate(
           mirrored_var.assign_add(6.0, read_value=True))
-      self.assertEquals(7.0, mirrored_var_result)
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(7.0, mirrored_var_result)
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(7.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
       # read_value == False
       self.evaluate(mirrored_var.assign_add(2.0, read_value=False))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
-      self.assertEquals(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(9.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_add(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(1.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(1.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignAddMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignAddMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           1.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(1.0, self.evaluate(mirrored_var))
+      self.assertEqual(1.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_add(5.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(6.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(6.0, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarCrossDeviceContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarCrossDeviceContext(self, distribution):
     def var_fn():
       return variable_scope.variable(5.0, name="foo")
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
       mirrored_var_result = self.evaluate(mirrored_var.assign_sub(2.0))
-      self.assertEquals(3.0, mirrored_var_result)
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
-      self.assertEquals(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
+      self.assertEqual(3.0, mirrored_var_result)
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:GPU:0")))
+      self.assertEqual(3.0, self.evaluate(mirrored_var.get("/device:CPU:0")))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarReplicaContext(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContext(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         value = math_ops.cast(
-            distribution_strategy_context.get_replica_context().replica_id,
+            ds_context.get_replica_context().replica_id_in_sync_group,
             mirrored_var.dtype)
         return mirrored_var.assign_sub(value)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(4.5, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.5, self.evaluate(mirrored_var))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignSubMirroredVarReplicaContextWithSingleValue(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignSubMirroredVarReplicaContextWithSingleValue(self, distribution):
     def var_fn():
       return variable_scope.variable(
           5.0, name="foo", aggregation=variable_scope.VariableAggregation.MEAN)
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      mirrored_var = dist.call_for_each_replica(var_fn)
+    with distribution.scope():
+      mirrored_var = distribution.extended.call_for_each_replica(var_fn)
       self.assertIsInstance(mirrored_var, values.MirroredVariable)
       self.evaluate(variables.global_variables_initializer())
-      self.assertEquals(5.0, self.evaluate(mirrored_var))
+      self.assertEqual(5.0, self.evaluate(mirrored_var))
 
       def model_fn():
         return mirrored_var.assign_sub(1.0)
 
-      self.evaluate(dist.unwrap(dist.call_for_each_replica(model_fn)))
-      self.assertEquals(4.0, self.evaluate(mirrored_var))
+      self.evaluate(distribution.unwrap(
+          distribution.extended.call_for_each_replica(model_fn)))
+      self.assertEqual(4.0, self.evaluate(mirrored_var))
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
 
-  def testAssignMirroredVarInitializer(self):
+  def testAssignMirroredVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1150,17 +1042,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         v = variable_scope.variable(1.0, name="foo")
         return v
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        mirrored_var = dist.call_for_each_replica(var_fn)
+      with distribution.scope():
+        mirrored_var = distribution.extended.call_for_each_replica(var_fn)
         self.assertIsInstance(mirrored_var, values.MirroredVariable)
         self.assertFalse(self.evaluate(mirrored_var.is_initialized()))
         self.evaluate(mirrored_var.initializer)
         self.assertTrue(self.evaluate(mirrored_var.is_initialized()))
 
-  def testAssignReplicaLocalVarInitializer(self):
+  def testAssignReplicaLocalVarInitializer(self, distribution):
     # This test is not eager compatible since in eager variables are initialized
     # upon construction instead of once the initialization op is run.
     with context.graph_mode():
@@ -1172,11 +1061,9 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         self.assertTrue(isinstance(v_sum, values.ReplicaLocalVariable))
         return v_sum
 
-      dist = mirrored_strategy.MirroredStrategy(
-          ["/device:GPU:0", "/device:CPU:0"])
-
-      with dist.scope():
-        replica_local_var = dist.call_for_each_replica(model_fn)
+      with distribution.scope():
+        replica_local_var = distribution.extended.call_for_each_replica(
+            model_fn)
         self.assertTrue(isinstance(replica_local_var,
                                    values.ReplicaLocalVariable))
         self.assertFalse(self.evaluate(replica_local_var.is_initialized()))
@@ -1184,17 +1071,14 @@ class MirroredAndReplicaLocalVariableInitializerTest(test.TestCase):
         self.assertTrue(self.evaluate(replica_local_var.is_initialized()))
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class ReplicaLocalVariableAssignTest(test.TestCase):
-  config = config_pb2.ConfigProto()
-  config.allow_soft_placement = True
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignReplicaLocalVarSumAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarSumAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1202,18 +1086,16 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.SUM)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      replica_local_var = dist.call_for_each_replica(model_fn)
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
       self.assertTrue(isinstance(replica_local_var,
                                  values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the SUM of each of
       # values on each of the replicas.
-      self.assertEqual(2.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(2.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
       # Assigning 6.0 in cross replica context will assign a value of
       # 6.0/num_replicas to each replica.
       tlv_ops = replica_local_var.assign(6.0)
@@ -1221,11 +1103,10 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
       # On reading the replica local var we should get the assigned value back.
       # The value on all the replicas are added before being returned by
       # `read_var`.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testAssignReplicaLocalVarMeanAggregation(self):
-    self._skip_eager_if_gpus_less_than(1)
+  def testAssignReplicaLocalVarMeanAggregation(self, distribution):
     def model_fn():
       v_sum = variable_scope.variable(
           1.0,
@@ -1233,23 +1114,22 @@ class ReplicaLocalVariableAssignTest(test.TestCase):
           aggregation=variable_scope.VariableAggregation.MEAN)
       return v_sum
 
-    dist = mirrored_strategy.MirroredStrategy(
-        ["/device:GPU:0", "/device:CPU:0"])
-
-    with dist.scope():
-      replica_local_var = dist.call_for_each_replica(model_fn)
+    with distribution.scope():
+      replica_local_var = distribution.extended.call_for_each_replica(model_fn)
       self.assertTrue(isinstance(replica_local_var,
                                  values.ReplicaLocalVariable))
       self.evaluate(variables.global_variables_initializer())
       # Each replica has a value of 1.0 assigned to it in replica context.
       # When we read the value using `read_var` we should see the MEAN of values
       # on all replicas which is the value assigned in replica context.
-      self.assertEqual(1.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(1.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
       tlv_ops = replica_local_var.assign(6.0)
       self.evaluate(tlv_ops)
       # On reading the replica local var we should get the MEAN of all values
       # which is equal to the value assigned.
-      self.assertEqual(6.0, self.evaluate(dist.read_var(replica_local_var)))
+      self.assertEqual(6.0, self.evaluate(
+          distribution.read_var(replica_local_var)))
 
 
 class MockModel(object):
@@ -1283,24 +1163,25 @@ class MiniModel(keras_training.Model):
     return self.fc(inputs)
 
 
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
 class MirroredStrategyDefunTest(test.TestCase):
 
-  def _skip_eager_if_gpus_less_than(self, num_gpus):
-    if context.num_gpus() < num_gpus and context.executing_eagerly():
-      self.skipTest("Not enough GPUs available for this test in eager mode.")
-
-  def _call_and_check(self, model_fn, inputs, expected_result, defuns,
-                      two_variables=False):
+  def _call_and_check(self, distribution, model_fn, inputs, expected_result,
+                      defuns, two_variables=False):
     cpu_dev = device_util.canonicalize("CPU:0")
     gpu_dev = device_util.canonicalize("GPU:0")
     devices = [cpu_dev, gpu_dev]
-    dist = mirrored_strategy.MirroredStrategy(devices)
 
-    with dist.scope():
+    with distribution.scope():
       mock_model = MockModel(two_variables)
       self.evaluate(variables.global_variables_initializer())
 
-      result = dist.call_for_each_replica(model_fn, args=[mock_model] + inputs)
+      result = distribution.extended.call_for_each_replica(
+          model_fn, args=[mock_model] + inputs)
       for device in devices:
         device_result = values.select_device(device, result)
         device_expected_result = values.select_device(device, expected_result)
@@ -1312,17 +1193,15 @@ class MirroredStrategyDefunTest(test.TestCase):
         # call_for_each has one trace per device. To check that the expected set
         # of variables was accessed on each trace, we first retrieve each
         # device-specific graph function.
-        per_replica_graph_functions = dist.call_for_each_replica(
-            defun.get_concrete_function, args=[mock_model] + inputs)
+        per_replica_graph_functions = (
+            distribution.extended.call_for_each_replica(
+                defun.get_concrete_function, args=[mock_model] + inputs))
         for device in devices:
           graph_function = per_replica_graph_functions.get(device=device)
           self.assertEqual(set(mock_model.variables),
                            set(graph_function.graph.variables))
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testVariableInDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1330,12 +1209,9 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return times_two(mock_model)
 
-    self._call_and_check(model_fn, [], 2.5, [times_two])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testVariableInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 2.5, [times_two])
 
+  def testVariableInNestedDefun(self, distribution):
     @function.defun
     def times_two(mock_model):
       return mock_model()
@@ -1347,12 +1223,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return two_x_plus_one(mock_model)
 
-    self._call_and_check(model_fn, [], 3.5, [times_two, two_x_plus_one])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testTwoVariablesInNestedDefun(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 3.5,
+                         [times_two, two_x_plus_one])
 
+  def testTwoVariablesInNestedDefun(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1364,12 +1238,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     def model_fn(mock_model):
       return fn2(mock_model)
 
-    self._call_and_check(model_fn, [], 5.5, [fn1, fn2], two_variables=True)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGradientTapeOverNestedDefuns(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, model_fn, [], 5.5, [fn1, fn2],
+                         two_variables=True)
 
+  def testGradientTapeOverNestedDefuns(self, distribution):
     @function.defun
     def fn1(mock_model):
       return mock_model()
@@ -1385,13 +1257,10 @@ class MirroredStrategyDefunTest(test.TestCase):
                              [v.get() for v in mock_model.variables])
       return grads
 
-    self._call_and_check(model_fn, [], [2.0, 1.0], [fn1, fn2],
+    self._call_and_check(distribution, model_fn, [], [2.0, 1.0], [fn1, fn2],
                          two_variables=True)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testPassPerReplica(self):
-    self._skip_eager_if_gpus_less_than(1)
-
+  def testPassPerReplica(self, distribution):
     @function.defun
     def fn1(mock_model, factor):
       return mock_model(factor)
@@ -1399,18 +1268,10 @@ class MirroredStrategyDefunTest(test.TestCase):
     factors = values.PerReplica({"CPU:0": 5.0, "GPU:0": 3.0})
     expected_result = values.PerReplica({"CPU:0": 5.0 * 1.25,
                                          "GPU:0": 3.0 * 1.25})
-    self._call_and_check(fn1, [factors], expected_result, [fn1])
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testTrain(self):
-    self._skip_eager_if_gpus_less_than(1)
+    self._call_and_check(distribution, fn1, [factors], expected_result, [fn1])
 
-    cpu_dev = device_util.canonicalize("CPU:0")
-    gpu_dev = device_util.canonicalize("GPU:0")
-    devices = [cpu_dev, gpu_dev]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-
-    with dist.scope():
+  def testTrain(self, distribution):
+    with distribution.scope():
       mock_model = MiniModel()
       mock_model.call = function.defun(mock_model.call)
 
@@ -1420,10 +1281,11 @@ class MirroredStrategyDefunTest(test.TestCase):
 
       gradients_fn = backprop.implicit_grad(loss_fn)
       gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
-      grads_and_vars = dist.call_for_each_replica(gradients_fn, args=(None,))
+      grads_and_vars = distribution.extended.call_for_each_replica(
+          gradients_fn, args=(None,))
 
       optimizer = gradient_descent.GradientDescentOptimizer(0.25)
-      update_ops = optimizer._distributed_apply(dist, grads_and_vars)  # pylint: disable=protected-access
+      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access
 
       if not context.executing_eagerly():
         self.evaluate(variables.global_variables_initializer())
@@ -1435,30 +1297,82 @@ class MirroredStrategyDefunTest(test.TestCase):
       self.assertAllEqual([0.5], updated_var_values[1])
 
 
+@combinations.generate(
+    combinations.combine(
+        distribution=[
+            combinations.NamedDistribution(
+                "Mirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.MirroredStrategy(num_gpus_per_worker=
+                                                           context.num_gpus()),
+                required_gpus=1),
+            combinations.NamedDistribution(
+                "CoreMirrored",
+                # pylint: disable=g-long-lambda
+                lambda: mirrored_strategy.CoreMirroredStrategy(
+                    mirrored_strategy.all_local_devices()),
+                required_gpus=1)
+        ],
+        mode=["graph"]))
 class MultiWorkerMirroredStrategyTest(
     multi_worker_test_base.MultiWorkerTestBase,
     strategy_test_lib.DistributionTestBase):
 
-  def _get_distribution_strategy(self):
+  def _configure_distribution_strategy(self, distribution):
     cluster_spec = server_lib.ClusterSpec({
         "worker": ["/job:worker/task:0", "/job:worker/task:1"]
     })
-    strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-    strategy.configure(cluster_spec=cluster_spec)
-    return strategy
-
-  def test_num_replicas_in_sync(self):
-    if not GPU_TEST:
-      self.skipTest("Not GPU test")
+    distribution.configure(cluster_spec=cluster_spec)
 
-    strategy = self._get_distribution_strategy()
+  def test_num_replicas_in_sync(self, distribution):
+    self._configure_distribution_strategy(distribution)
     # We calculate the total number of gpus across the workers(2) specified in
     # the cluster spec.
-    self.assertEqual(context.num_gpus() * 2, strategy.num_replicas_in_sync)
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy(),
-                                   learning_rate=0.05)
+    self.assertEqual(context.num_gpus() * 2, distribution.num_replicas_in_sync)
+
+  def testMinimizeLossGraph(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    self._test_minimize_loss_graph(distribution, learning_rate=0.05)
+
+  def testDeviceScope(self, distribution):
+    """Test the device scope of multi-worker MirroredStrategy."""
+    self._configure_distribution_strategy(distribution)
+    with distribution.scope():
+      a = constant_op.constant(1.)
+      with ops.device("/cpu:0"):
+        b = constant_op.constant(1.)
+      self.assertEqual(a.device, "/job:worker/task:0")
+      self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
+
+  def testMakeInputFnIterator(self, distribution):
+    self._configure_distribution_strategy(distribution)
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    num_gpus = context.num_gpus()
+    num_workers = 2
+
+    expected_values = [[i+j for j in range(num_gpus)] * num_workers
+                       for i in range(0, 100, num_gpus)]
+
+    with context.graph_mode(), self.cached_session() as sess:
+      # `expected_input_pipeline_id` is None because the input_fn will be called
+      # multiple times, each with a different input_pipeline_id.
+      input_fn = self._input_fn_to_test_input_context(
+          dataset_fn,
+          expected_num_replicas_in_sync=num_workers*num_gpus,
+          expected_num_input_pipelines=num_workers,
+          expected_input_pipeline_id=None)
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      self._test_input_fn_iterator(
+          iterator, distribution.extended.worker_devices, expected_values, sess)
+
+  def testUpdateConfigProto(self, distribution):
+    distribution.configure(cluster_spec={"worker": ["fake1", "fake2"]})
+
+    config_proto = config_pb2.ConfigProto()
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify isolate_session_state
+    self.assertTrue(new_config.isolate_session_state)
 
 
 class MultiWorkerMirroredStrategyTestWithChief(
@@ -1478,6 +1392,19 @@ class MultiWorkerMirroredStrategyTestWithChief(
     strategy.configure(cluster_spec=self._cluster_spec)
     self._test_minimize_loss_graph(strategy, learning_rate=0.05)
 
+  def testMinimizeLossGraphCoreMirroredStrategy(self):
+    strategy = mirrored_strategy.CoreMirroredStrategy(
+        mirrored_strategy.all_local_devices())
+    strategy.configure(cluster_spec=self._cluster_spec)
+    self._test_minimize_loss_graph(strategy, learning_rate=0.05)
+
+
+def _replica_id():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if not isinstance(replica_id, ops.Tensor):
+    replica_id = constant_op.constant(replica_id)
+  return replica_id
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py b/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
deleted file mode 100644
index b5d393fd0dc8d3524bf356b7e60480d6056fd550..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/distribute/python/mirrored_strategy_test.py
+++ /dev/null
@@ -1,110 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for class MirroredStrategy."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import strategy_test_lib
-from tensorflow.python.eager import context
-from tensorflow.python.eager import test
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
-
-
-class MirroredOneCPUDistributionTest(strategy_test_lib.DistributionTestBase):
-
-  def _get_distribution_strategy(self):
-    return mirrored_strategy.MirroredStrategy(["/device:CPU:0"])
-
-  def testMinimizeLossEager(self):
-    self._test_minimize_loss_eager(self._get_distribution_strategy())
-
-  def testMinimizeLossGraph(self):
-    self._test_minimize_loss_graph(self._get_distribution_strategy())
-
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
-  def testReplicaId(self):
-    self._test_replica_id(self._get_distribution_strategy())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testCallAndMergeExceptions(self):
-    self._test_call_and_merge_exceptions(self._get_distribution_strategy())
-
-
-class VariableCreatorStackTest(test.TestCase):
-
-  def testCreatorStacksAreThreadLocal(self):
-    devices = ["/device:CPU:0", "/device:GPU:0"]
-    dist = mirrored_strategy.MirroredStrategy(devices)
-
-    def model_fn(device_id):
-      assert isinstance(device_id, int)
-
-      def thread_creator_fn(next_creator, *args, **kwargs):
-        return next_creator(*args, **kwargs) + ":thread_" + str(device_id)
-
-      with variable_scope.variable_creator_scope(thread_creator_fn):
-        # Create a variable in this scope.
-        v = variable_scope.variable(1.0)
-
-        # This will pause the current thread, and execute the other thread.
-        distribution_strategy_context.get_replica_context().merge_call(
-            lambda _: _)
-      return v
-
-    def main_thread_creator(next_creator, *args, **kwargs):
-      # We are not using the underlying next_creator for test purposes.
-      del next_creator, args, kwargs
-      return "main_thread"
-
-    with context.graph_mode(), \
-        dist.scope(), \
-        variable_scope.variable_creator_scope(main_thread_creator):
-      result = dist.call_for_each_replica(
-          model_fn, args=(dist.worker_device_index,))
-      result = dist.unwrap(result)
-      expected = ["main_thread:thread_0", "main_thread:thread_1"]
-      self.assertEquals(expected, result)
-
-
-class MultiWorkerMirroredStrategyTest(test.TestCase):
-
-  def testDeviceScope(self):
-    """Test the device scope of multi-worker MirroredStrategy."""
-    with context.graph_mode():
-      strategy = mirrored_strategy.MirroredStrategy(num_gpus=context.num_gpus())
-      strategy.configure(
-          cluster_spec={"worker": ["/job:worker/task:0", "/job:worker/task:1"]})
-      with strategy.scope():
-        a = constant_op.constant(1.)
-        with ops.device("/cpu:0"):
-          b = constant_op.constant(1.)
-        self.assertEqual(a.device, "/job:worker/task:0")
-        self.assertEqual(b.device, "/job:worker/task:0/device:CPU:0")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/distribute/python/moving_averages_test.py b/tensorflow/contrib/distribute/python/moving_averages_test.py
index 7ecc852d20508cc7063f3598c9fef03d6ce536a5..8f13e9153ea7a951dd722c4549882c97e79b57fe 100644
--- a/tensorflow/contrib/distribute/python/moving_averages_test.py
+++ b/tensorflow/contrib/distribute/python/moving_averages_test.py
@@ -32,7 +32,8 @@ from tensorflow.python.training import moving_averages
 all_combinations = combinations.combine(
     distribution=[combinations.default_strategy,
                   combinations.one_device_strategy,
-                  combinations.mirrored_strategy_with_gpu_and_cpu],
+                  combinations.mirrored_strategy_with_gpu_and_cpu,
+                  combinations.core_mirrored_strategy_with_gpu_and_cpu],
     mode=["graph"])
 
 
@@ -138,6 +139,27 @@ class AssignMovingAveragesTest(test.TestCase, parameterized.TestCase):
            (2.0 * 0.25 + 0.0) / (1.0 * 0.25 + 1.0)],
           var.eval())
 
+  @combinations.generate(all_combinations)
+  def testAssignVariable(self, distribution):
+
+    def replica_fn():
+      var = variables.Variable([10.0, 11.0])
+      # Here we expect to check the case when input value are variable.
+      val = variables.Variable([1., 2.])
+      decay = 0.25
+      assign = moving_averages.assign_moving_average(
+          var, val, decay, zero_debias=False)
+      return var, assign
+
+    with distribution.scope(), self.cached_session() as sess:
+      var, assign = distribution.call_for_each_replica(replica_fn)
+      variables.global_variables_initializer().run()
+      self.assertAllClose([10.0, 11.0], var.eval())
+      sess.run(distribution.unwrap(assign))
+      self.assertAllClose(
+          [10 * 0.25 + 1. * (1 - 0.25), 11 * 0.25 + 2. * (1 - 0.25)],
+          var.eval())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/multi_worker_test_base.py b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
index 8eec3dc0f6ec0676353c7434d203e017b9aab80d..147c9b83f866fd364ea23cf7988692a7b5f61b9c 100644
--- a/tensorflow/contrib/distribute/python/multi_worker_test_base.py
+++ b/tensorflow/contrib/distribute/python/multi_worker_test_base.py
@@ -18,8 +18,11 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import contextlib
 import copy
+import json
+import os
 import threading
 import numpy as np
 
@@ -271,7 +274,6 @@ class MultiWorkerTestBase(test.TestCase):
 
     return config
 
-
   def _run_client(self, client_fn, task_type, task_id, num_gpus, *args,
                   **kwargs):
     result = client_fn(task_type, task_id, num_gpus, *args, **kwargs)
@@ -303,3 +305,101 @@ class MultiWorkerTestBase(test.TestCase):
     for t in threads:
       t.join()
     self.assertEqual(self._result, len(threads))
+
+
+class MockOsEnv(collections.Mapping):
+  """A class that allows per-thread TF_CONFIG."""
+
+  def __init__(self, *args):
+    self._dict = dict()
+    self._thread_local = threading.local()
+    super(MockOsEnv, self).__init__(*args)
+
+  def get(self, key, default=None):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.get(self._thread_local.dict, key, default)
+    else:
+      return dict.get(self._dict, key, default)
+
+  def __getitem__(self, key):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__getitem__(self._thread_local.dict, key)
+    else:
+      return dict.__getitem__(self._dict, key)
+
+  def __setitem__(self, key, val):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    if key == 'TF_CONFIG':
+      return dict.__setitem__(self._thread_local.dict, key, val)
+    else:
+      return dict.__setitem__(self._dict, key, val)
+
+  def __iter__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    for x in self._thread_local.dict.items():
+      yield x
+    for x in self._dict.items():
+      yield x
+
+  def __len__(self):
+    if not hasattr(self._thread_local, 'dict'):
+      self._thread_local.dict = dict()
+    return self._thread_local.dict.__len__() + self._dict.__len__()
+
+
+class IndependentWorkerTestBase(test.TestCase):
+  """Testing infra for independent workers."""
+
+  def setUp(self):
+    self._mock_os_env = MockOsEnv()
+    self._mock_context = test.mock.patch.object(os, 'environ',
+                                                self._mock_os_env)
+    super(IndependentWorkerTestBase, self).setUp()
+    self._mock_context.__enter__()
+
+  def tearDown(self):
+    self._mock_context.__exit__(None, None, None)
+    super(IndependentWorkerTestBase, self).tearDown()
+
+  def _task_thread(self, task_fn, tf_config, *args, **kwargs):
+    os.environ['TF_CONFIG'] = json.dumps(tf_config)
+    task_fn(*args, **kwargs)
+
+  def _run_task_in_thread(self, task_fn, cluster_spec, task_type, task_id,
+                          *args, **kwargs):
+    if task_type:
+      tf_config = {
+          'cluster': cluster_spec,
+          'task': {
+              'type': task_type,
+              'index': task_id
+          }
+      }
+    else:
+      tf_config = {
+          'cluster': cluster_spec,
+      }
+    t = threading.Thread(
+        target=self._task_thread,
+        args=(task_fn, tf_config) + args,
+        kwargs=kwargs)
+    t.start()
+    return t
+
+  def run_multiple_tasks_in_threads(self, task_fn, cluster_spec, *args,
+                                    **kwargs):
+    # The task_fn should create std_server by itself.
+    threads = {}
+    for task_type in cluster_spec.keys():
+      threads[task_type] = []
+      for task_id in range(len(cluster_spec[task_type])):
+        t = self._run_task_in_thread(task_fn, cluster_spec, task_type, task_id,
+                                     *args, **kwargs)
+        threads[task_type].append(t)
+    return threads
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy.py b/tensorflow/contrib/distribute/python/one_device_strategy.py
index 1b4251b761b2b95b4e41fbd8c8d5e31e5e1b2d25..fdbfba4e04358451a46b23ef250dc7c534c855a0 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy.py
@@ -20,14 +20,14 @@ from __future__ import print_function
 
 import six
 
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import values
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
@@ -41,7 +41,14 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
   # implementations?
 
   def __init__(self, device):
-    super(OneDeviceStrategy, self).__init__()
+    super(OneDeviceStrategy, self).__init__(OneDeviceExtended(self, device))
+
+
+class OneDeviceExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of OneDeviceStrategy."""
+
+  def __init__(self, container_strategy, device):
+    super(OneDeviceExtended, self).__init__(container_strategy)
     self._device = device
     self._default_device = device
 
@@ -53,24 +60,40 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     if isinstance(colocate_with, six.string_types):
       with ops.device(colocate_with):
         return next_creator(*args, **kwargs)
-    if (isinstance(colocate_with, list) and len(colocate_with) == 1 and
+    if (isinstance(colocate_with, (list, tuple)) and len(colocate_with) == 1 and
         isinstance(colocate_with[0], six.string_types)):
       with ops.device(colocate_with[0]):
         return next_creator(*args, **kwargs)
     with ops.colocate_with(colocate_with):
       return next_creator(*args, **kwargs)
 
-  def distribute_dataset(self, dataset_fn):
+  def _make_dataset_iterator(self, dataset):
+    """Make iterator from dataset without splitting the batch."""
+    worker = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(worker, [self._device])]
+    return values.DatasetIterator(dataset, worker_device_pairs)
+
+  def _distribute_dataset(self, dataset_fn):
     return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), [self._device])
 
-  def _broadcast(self, tensor, destinations):
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    worker = device_util.canonicalize("/device:CPU:0")
+    worker_device_pairs = [(worker, [self._device])]
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs,
+        [distribute_lib.InputContext()])
+
+  def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
 
   # TODO(priyag): Deal with OutOfRange errors  once b/111349762 is fixed.
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values=None):
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
     if initial_loop_values is None:
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
@@ -82,7 +105,7 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
       fn_inputs = iterator.get_next()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       with ops.control_dependencies([fn_result]):
         return [i + 1] + flat_last_step_outputs
@@ -116,39 +139,24 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     return ctx
 
   def _call_for_each_replica(self, fn, args, kwargs):
-    with ops.device(self._device), _OneDeviceReplicaContext(self):
+    strategy = self._container_strategy()
+    with ops.device(self._device), _OneDeviceReplicaContext(strategy):
       return fn(*args, **kwargs)
 
-  def map(self, map_over, fn, *args, **kwargs):
-    with ops.device(self._device):
-      return values.MapOutput([fn(m, *args, **kwargs) for m in map_over])
-
-  def _reduce(self, aggregation, value, destinations):
-    del destinations
-    if not isinstance(value, values.MapOutput):
-      return value
-    l = value.get()
-    assert l
-    with ops.device(self._device):
-      if aggregation == vs.VariableAggregation.SUM:
-        return math_ops.add_n(l)
-      elif aggregation == vs.VariableAggregation.MEAN:
-        return math_ops.add_n(l) / len(l)
-      else:
-        assert False
+  def _reduce_to(self, reduce_op, value, destinations):
+    del reduce_op, destinations
+    return value
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     # The implementations of _update() and _update_non_slot() are identical
     # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     del colocate_with
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._device), distribute_lib.UpdateContext(self._device):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -158,33 +166,43 @@ class OneDeviceStrategy(distribute_lib.DistributionStrategy):
     return array_ops.identity(replica_local_var)
 
   def _unwrap(self, value):
-    return [value]
+    return (value,)
 
   def value_container(self, value):
     return value
 
   @property
-  def num_replicas(self):
-    return 1
-
-  @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return 1
 
   @property
   def worker_devices(self):
-    return [self._device]
+    return (self._device,)
 
   @property
   def parameter_devices(self):
-    return [self._device]
+    return (self._device,)
 
   def non_slot_devices(self, var_list):
     del var_list
-    return [self._device]
+    return (self._device,)
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
 
-  def _worker_device_index(self):
-    return 0
+  @property
+  def should_save_summary(self):
+    return True
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
 
 
 class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
@@ -192,12 +210,10 @@ class _OneDeviceReplicaContext(distribute_lib.ReplicaContext):
 
   def __init__(self, distribution_strategy):
     distribute_lib.ReplicaContext.__init__(
-        self, distribution_strategy, replica_id=0)
-
-  @property
-  def device(self):
-    raise RuntimeError("Use .devices instead")
+        self,
+        distribution_strategy,
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
 
   @property
   def devices(self):
-    return [self._distribution_strategy.worker_devices[0]]
+    return self._distribution_strategy.extended.worker_devices
diff --git a/tensorflow/contrib/distribute/python/one_device_strategy_test.py b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
index 3fb92273924a665bf2a1ee5fc94b75273b8c5f78..d46cd6f529e363f76bfa2b22339add63530cfde8 100644
--- a/tensorflow/contrib/distribute/python/one_device_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/one_device_strategy_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.contrib.distribute.python import one_device_strategy
 from tensorflow.contrib.distribute.python import strategy_test_lib
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 
@@ -35,12 +36,6 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
   def testMinimizeLossGraph(self):
     self._test_minimize_loss_graph(self._get_distribution_strategy())
 
-  def testMapReduce(self):
-    self._test_map_reduce(self._get_distribution_strategy())
-
-  def testDeviceIndex(self):
-    self._test_device_index(self._get_distribution_strategy())
-
   def testReplicaId(self):
     self._test_replica_id(self._get_distribution_strategy())
 
@@ -48,6 +43,20 @@ class OneDeviceStrategyTest(strategy_test_lib.DistributionTestBase):
   def testCallAndMergeExceptions(self):
     self._test_call_and_merge_exceptions(self._get_distribution_strategy())
 
+  @test_util.run_in_graph_and_eager_modes
+  def testMakeInputFnIterator(self):
+    d = one_device_strategy.OneDeviceStrategy("/device:CPU:0")
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+    expected_values = [[i] for i in range(10)]
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=1,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)
+    iterator = d.make_input_fn_iterator(input_fn)
+    self._test_input_fn_iterator(
+        iterator, d.extended.worker_devices, expected_values)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy.py b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
index 790b37f86010eba6bdc87e6424e55a97629c5d1a..2c7766f95fbcb7b68a53ad0052f21485c763a1db 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy.py
@@ -18,10 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
+import copy
+
 from tensorflow.contrib.distribute.python import mirrored_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.framework import device as tf_device
 from tensorflow.python.framework import ops
@@ -30,8 +34,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import device_setter
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 _LOCAL_CPU = "/device:CPU:0"
@@ -94,13 +96,21 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       ValueError: if `cluster_spec` is given but `task_type` or `task_id` is
         not.
     """
-    super(ParameterServerStrategy, self).__init__()
+    super(ParameterServerStrategy, self).__init__(
+        ParameterServerExtended(self, num_gpus_per_worker))
+
+
+class ParameterServerExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of ParameterServerStrategy."""
+
+  def __init__(self, container_strategy, num_gpus_per_worker):
+    super(ParameterServerExtended, self).__init__(container_strategy)
     self._num_gpus_per_worker = num_gpus_per_worker
     self._initialize_local(num_gpus_per_worker)
 
     # We typically don't need to do all-reduce in this strategy.
-    self._cross_tower_ops = (
-        cross_tower_ops_lib.ReductionToOneDeviceCrossDeviceOps(
+    self._cross_device_ops = (
+        cross_device_ops_lib.ReductionToOneDeviceCrossDeviceOps(
             reduce_to_device=_LOCAL_CPU))
 
   def _initialize_multi_worker(self, num_gpus_per_worker, cluster_spec,
@@ -135,14 +145,14 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = [
+      self._compute_devices = tuple(
           "%s/device:GPU:%d" % (self._worker_device, i)
           for i in range(num_gpus_per_worker)
-      ]
+      )
     else:
-      self._compute_devices = [self._worker_device]
+      self._compute_devices = (self._worker_device,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
@@ -166,8 +176,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # The `_parameter_devices` is needed for the `parameter_devices` property
     # and is a list of all variable devices. Here parameter devices are all
     # tasks of the "ps" job.
-    self._parameter_devices = map("/job:ps/task:{}".format,
-                                  range(num_ps_replicas))
+    self._parameter_devices = tuple(map("/job:ps/task:{}".format,
+                                        range(num_ps_replicas)))
 
     # Add a default device so that ops without specified devices will not end up
     # on other workers.
@@ -189,28 +199,29 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
   def _initialize_local(self, num_gpus_per_worker):
     """Initialize internal devices for local training."""
+    self._worker_device = device_util.canonicalize("/device:CPU:0")
     # Define compute devices which is a list of device strings and one for each
     # replica. When there are GPUs, replicate operations on these GPUs.
     # Otherwise, place operations on CPU.
     if num_gpus_per_worker > 0:
-      self._compute_devices = list(
+      self._compute_devices = tuple(
           map("/device:GPU:{}".format, range(num_gpus_per_worker)))
     else:
-      self._compute_devices = [_LOCAL_CPU]
+      self._compute_devices = (_LOCAL_CPU,)
 
-    self._compute_devices = list(
+    self._compute_devices = tuple(
         map(device_util.resolve, self._compute_devices))
     self._canonical_compute_device_set = set(self._compute_devices)
 
     # If there is only one GPU, put everything on that GPU. Otherwise, place
     # variables on CPU.
     if num_gpus_per_worker == 1:
-      assert len(list(self._compute_devices)) == 1
+      assert len(self._compute_devices) == 1
       self._variable_device = _LOCAL_GPU_0
-      self._parameter_devices = [_LOCAL_GPU_0]
+      self._parameter_devices = (_LOCAL_GPU_0,)
     else:
       self._variable_device = _LOCAL_CPU
-      self._parameter_devices = [_LOCAL_CPU]
+      self._parameter_devices = (_LOCAL_CPU,)
 
     self._is_chief = True
     self._cluster_spec = None
@@ -221,15 +232,48 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
         "ParameterServerStrategy with compute_devices = %r, "
         "variable_device = %r", self._compute_devices, self._variable_device)
 
-  def distribute_dataset(self, dataset_fn):
+  def _distribute_dataset(self, dataset_fn):
     """Distributes the dataset to each local GPU."""
     return values.PerReplicaDataset(
         self._call_dataset_fn(dataset_fn), self._compute_devices, True)
 
-  def _broadcast(self, tensor, destinations):
-    if not cross_tower_ops_lib.check_destinations(destinations):
+  def _make_dataset_iterator(self, dataset):
+    worker_device_pairs = [(self._worker_device, self._compute_devices)]
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    """Distributes the dataset to each local GPU."""
+    if self._cluster_spec:
+      input_pipeline_id = multi_worker_util.id_in_cluster(
+          self._cluster_spec, self._task_type, self._task_id)
+      num_input_pipelines = multi_worker_util.worker_count(
+          self._cluster_spec, self._task_type)
+    else:
+      input_pipeline_id = 0
+      num_input_pipelines = 1
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=num_input_pipelines,
+        input_pipeline_id=input_pipeline_id,
+        num_replicas_in_sync=self._num_replicas_in_sync)
+    worker_device_pairs = [(self._worker_device, self._compute_devices)]
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, [input_context])
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    if not cross_device_ops_lib.check_destinations(destinations):
       destinations = self._compute_devices
-    return self._cross_tower_ops.broadcast(tensor, destinations)
+    return self._cross_device_ops.broadcast(tensor, destinations)
 
   def _allow_variable_partition(self):
     return not context.executing_eagerly()
@@ -237,7 +281,7 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   # TODO(yuefengz): not all ops in device_setter.STANDARD_PS_OPS will go through
   # this creator, such as "MutableHashTable".
   def _create_variable(self, next_creator, *args, **kwargs):
-    if self.num_replicas_in_sync > 1:
+    if self._num_replicas_in_sync > 1:
       aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
       if aggregation not in (
           vs.VariableAggregation.NONE,
@@ -293,39 +337,35 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
   def _call_for_each_replica(self, fn, args, kwargs):
     # pylint: disable=protected-access
-    return mirrored_strategy._call_for_each_replica(self, fn, args, kwargs)
+    return mirrored_strategy._call_for_each_replica(
+        self._container_strategy(), fn, args, kwargs)
 
   def _verify_destinations_not_different_worker(self, destinations):
     if not self._cluster_spec:
       return
     if destinations is None:
       return
-    for d in cross_tower_ops_lib.get_devices_from(destinations):
+    for d in cross_device_ops_lib.get_devices_from(destinations):
       d_spec = tf_device.DeviceSpec.from_string(d)
       if d_spec.job == self._task_type and d_spec.task != self._task_id:
         raise ValueError(
             "Cannot reduce to another worker: %r, current worker is %r" %
             (d, self._worker_device))
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     self._verify_destinations_not_different_worker(destinations)
     if not isinstance(value, values.DistributedValues):
       # pylint: disable=protected-access
       return mirrored_strategy._reduce_non_distributed_value(
-          self, aggregation, value, destinations)
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return self.broadcast(value.get(self._compute_devices[0]), destinations)
-    return self._cross_tower_ops.reduce(
-        aggregation, value, destinations=destinations)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return [self.broadcast(v.get(self._compute_devices[0]), d)
-              for v, d in value_destination_pairs]
+          self, reduce_op, value, destinations)
+    return self._cross_device_ops.reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
     for _, destinations in value_destination_pairs:
       self._verify_destinations_not_different_worker(destinations)
-    return self._cross_tower_ops.batch_reduce(aggregation,
-                                              value_destination_pairs)
+    return self._cross_device_ops.batch_reduce(reduce_op,
+                                               value_destination_pairs)
 
   def _select_single_value(self, structured):
     """Select any single values in `structured`."""
@@ -349,30 +389,26 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
 
     return nest.map_structure(_select_fn, structured)
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     if isinstance(var, values.AggregatingVariable):
       var = var.get()
     if not isinstance(var, resource_variable_ops.ResourceVariable):
       raise ValueError(
           "You can not update `var` %r. It must be a Variable." % var)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.colocate_with(var), distribute_lib.UpdateContext(var.device):
       result = fn(var, *self._select_single_value(args),
                   **self._select_single_value(kwargs))
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
 
   # TODO(yuefengz): does it need to call _select_single_value?
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     with ops.device(
         colocate_with.device), distribute_lib.UpdateContext(colocate_with):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -381,9 +417,9 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
       if set(val.devices) == self._canonical_compute_device_set:
-        return [val.get(device=d) for d in self._compute_devices]
-      return [val.get(device=d) for d in sorted(val.devices)]
-    return [val]
+        return tuple(val.get(device=d) for d in self._compute_devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
 
   def value_container(self, val):
     if (hasattr(val, "_aggregating_container") and
@@ -398,11 +434,11 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
     # variables.
     return array_ops.identity(var)
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     """Configures the strategy class.
 
     The strategy object will be re-initialized if `cluster_spec` is given but
@@ -433,48 +469,50 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
       self._initialize_multi_worker(self._num_gpus_per_worker,
                                     self._cluster_spec, task_type, task_id)
 
-    if not session_config or not self._cluster_spec:
-      return
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
 
-    session_config.isolate_session_state = False
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    if not self._cluster_spec:
+      updated_config.isolate_session_state = True
+      return updated_config
+
+    updated_config.isolate_session_state = False
 
-    assert self._cluster_spec
     assert self._task_type
     assert self._task_id is not None
 
     # The device filters prevent communication between workers.
     if self._task_type not in ["chief", "worker"]:
-      return
-    del session_config.device_filters[:]
-    session_config.device_filters.extend(
+      return updated_config
+    del updated_config.device_filters[:]
+    updated_config.device_filters.extend(
         ["/job:%s/task:%d" % (self._task_type, self._task_id), "/job:ps"])
+    return updated_config
 
   @property
-  def num_replicas(self):
-    return len(self._compute_devices)
-
-  @property
-  def num_replicas_in_sync(self):
+  def _num_replicas_in_sync(self):
     return len(self._compute_devices)
 
   @property
   def worker_devices(self):
-    # Make a copy to prevent users from accidentally mutating our copy.
-    return list(self._compute_devices)
+    return self._compute_devices
 
   @property
   def parameter_devices(self):
-    return list(self._parameter_devices)
+    return self._parameter_devices
 
   def non_slot_devices(self, var_list):
     return min(var_list, key=lambda x: x.name)
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
+    # TODO(yuefengz): Should this return False in the local case?
     return True
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return self._is_chief
 
   @property
@@ -484,3 +522,8 @@ class ParameterServerStrategy(distribute_lib.DistributionStrategy):
   @property
   def should_save_summary(self):
     return self._is_chief
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return False
diff --git a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
index 81a23c89030221a8a15bdedc796c50d9c518138c..83d7473666a65e438a1c0119d2a12bf54e53c8fc 100644
--- a/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
+++ b/tensorflow/contrib/distribute/python/parameter_server_strategy_test.py
@@ -25,14 +25,21 @@ from absl.testing import parameterized
 from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
 from tensorflow.contrib.distribute.python import parameter_server_strategy
-from tensorflow.contrib.distribute.python import values
+from tensorflow.contrib.distribute.python import strategy_test_lib
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
 from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import run_config
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -41,8 +48,6 @@ from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import training_util
 
 CHIEF = run_config.TaskType.CHIEF
@@ -50,6 +55,13 @@ WORKER = run_config.TaskType.WORKER
 PS = run_config.TaskType.PS
 
 
+def _get_replica_id_integer():
+  replica_id = ds_context.get_replica_context().replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id
+
+
 class ParameterServerStrategyTestBase(
     multi_worker_test_base.MultiWorkerTestBase):
 
@@ -94,9 +106,8 @@ class ParameterServerStrategyTestBase(
         if num_gpus == 0:
           last_part_device = 'device:CPU:0'
         else:
-          last_part_device = (
-              'device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          last_part_device = ('device:GPU:%d' % replica_id)
 
         a = constant_op.constant(1.0)
         b = constant_op.constant(2.0)
@@ -261,18 +272,16 @@ class ParameterServerStrategyTestBase(
         if 'CPU' in compute_device:
           replica_compute_device = '/device:CPU:0'
         else:
-          replica_compute_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          replica_compute_device = ('/device:GPU:%d' % replica_id)
         replica_compute_device = device_util.canonicalize(
             replica_compute_device)
 
         if 'CPU' in variable_device:
           replica_variable_device = '/device:CPU:0'
         else:
-          replica_variable_device = (
-              '/device:GPU:%d' %
-              distribution_strategy_context.get_replica_context().replica_id)
+          replica_id = _get_replica_id_integer()
+          replica_variable_device = ('/device:GPU:%d' % replica_id)
         replica_variable_device = device_util.canonicalize(
             replica_variable_device)
 
@@ -354,9 +363,9 @@ class ParameterServerStrategyTestBase(
   def _test_simple_increment(self, task_type, task_id, num_gpus):
     d, master_target, sess_config = self._get_test_objects(
         task_type, task_id, num_gpus)
-    if hasattr(d, '_cluster_spec') and d._cluster_spec:
-      num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-      if 'chief' in d._cluster_spec.as_dict():
+    if d.extended._cluster_spec:
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if 'chief' in d.extended._cluster_spec.as_dict():
         num_workers += 1
     else:
       num_workers = 1
@@ -389,7 +398,7 @@ class ParameterServerStrategyTestBase(
       x, y, z, train_op = d.call_for_each_replica(model_fn)
       train_op = d.group(train_op)
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
       if task_id == 0:
@@ -426,9 +435,9 @@ class ParameterServerStrategyTestBase(
         task_type, task_id, num_gpus)
     if task_type:
       # Multi-worker
-      assert hasattr(d, '_cluster_spec') and d._cluster_spec
-      num_workers = len(d._cluster_spec.as_dict().get(WORKER))
-      if CHIEF in d._cluster_spec.as_dict():
+      assert hasattr(d.extended, '_cluster_spec') and d.extended._cluster_spec
+      num_workers = len(d.extended._cluster_spec.as_dict().get(WORKER))
+      if CHIEF in d.extended._cluster_spec.as_dict():
         num_workers += 1
     else:
       # local
@@ -472,8 +481,8 @@ class ParameterServerStrategyTestBase(
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
             # TODO(yuefengz): support non-Mirrored variable as destinations.
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(
                 d.update(v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -481,11 +490,12 @@ class ParameterServerStrategyTestBase(
 
       before_out, after_out = step()
 
-      if context.num_gpus() < d._num_gpus_per_worker:
+      if context.num_gpus() < d.extended._num_gpus_per_worker:
         return True
 
       if (not task_type or
-          multi_worker_util.is_chief(d._cluster_spec, task_type, task_id)):
+          multi_worker_util.is_chief(
+              d.extended._cluster_spec, task_type, task_id)):
         variables.global_variables_initializer().run()
 
       # Workers waiting for chief worker's initializing variables.
@@ -508,8 +518,40 @@ class ParameterServerStrategyTestBase(
       self.assertLess(error_after, error_before)
       return error_after < error_before
 
+  def _test_input_fn_iterator(self, task_type, task_id, num_gpus, input_fn,
+                              expected_values):
+    distribution, master_target, config = self._get_test_objects(
+        task_type, task_id, num_gpus)
+    devices = distribution.extended.worker_devices
+
+    with ops.Graph().as_default(), \
+         self.cached_session(config=config,
+                             target=master_target) as sess:
+      iterator = distribution.make_input_fn_iterator(input_fn)
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
+      with self.assertRaises(errors.OutOfRangeError):
+        next_element = iterator.get_next()
+        sess.run([values.select_device(d, next_element) for d in devices])
+
+      # After re-initializing the iterator, should be able to iterate again.
+      sess.run(iterator.initialize())
+
+      for expected_value in expected_values:
+        next_element = iterator.get_next()
+        computed_value = sess.run(
+            [values.select_device(d, next_element) for d in devices])
+        self.assertEqual(expected_value, computed_value)
+
 
 class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
+                                  strategy_test_lib.DistributionTestBase,
                                   parameterized.TestCase):
 
   @classmethod
@@ -574,6 +616,73 @@ class ParameterServerStrategyTest(ParameterServerStrategyTestBase,
   def testMinimizeLossGraphLocal(self, num_gpus):
     self._test_minimize_loss_graph(None, None, num_gpus)
 
+  # TODO(priyag): Refactor this and other multi worker tests.
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorDistributed(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=3,
+        expected_input_pipeline_id=1)  # because task_id = 1
+    self._test_input_fn_iterator('worker', 1, num_gpus,
+                                 input_fn, expected_values)
+
+  @combinations.generate(
+      combinations.combine(mode=['graph'], num_gpus=[1, 2], required_gpus=1))
+  def testMakeInputFnIteratorLocal(self, num_gpus):
+    if context.num_gpus() < num_gpus:
+      self.skipTest('Not enough GPUs')
+    dataset_fn = lambda: dataset_ops.Dataset.range(100)
+    expected_values = [[i+j for j in range(num_gpus)]
+                       for i in range(0, 100, num_gpus)]
+
+    input_fn = self._input_fn_to_test_input_context(
+        dataset_fn,
+        expected_num_replicas_in_sync=num_gpus,
+        expected_num_input_pipelines=1,
+        expected_input_pipeline_id=0)  # only one worker and pipeline for local.
+    self._test_input_fn_iterator(None, None, num_gpus,
+                                 input_fn, expected_values)
+
+  def testGlobalStepUpdate(self):
+    strategy = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=context.num_gpus())
+    self._test_global_step_update(strategy)
+
+  def testUpdateConfigProtoMultiWorker(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+    distribution.configure(
+        cluster_spec=self._cluster_spec, task_type='worker', task_id=1)
+
+    config_proto = config_pb2.ConfigProto(device_filters=['to_be_overridden'])
+
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify device filters.
+    self.assertEqual(['/job:worker/task:1', '/job:ps'],
+                     new_config.device_filters)
+
+    # Verify isolate_session_state
+    self.assertFalse(new_config.isolate_session_state)
+
+  def testUpdateConfigProtoLocal(self):
+    distribution = parameter_server_strategy.ParameterServerStrategy(
+        num_gpus_per_worker=2)
+
+    config_proto = config_pb2.ConfigProto()
+    new_config = distribution.update_config_proto(config_proto)
+
+    # Verify isolate_session_state
+    self.assertTrue(new_config.isolate_session_state)
+
 
 class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
                                            parameterized.TestCase):
@@ -616,9 +725,9 @@ class ParameterServerStrategyWithChiefTest(ParameterServerStrategyTestBase,
           v = variable_scope.get_variable('v', initializer=10.0)
           _ = v * v
         v, = tape.watched_variables()
-        w = distribution.value_container(v)
+        w = distribution.extended.value_container(v)
         self.assertIs(values.AggregatingVariable, type(w))
-      distribution.call_for_each_replica(f)
+      distribution.extended.call_for_each_replica(f)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/contrib/distribute/python/step_fn.py b/tensorflow/contrib/distribute/python/step_fn.py
index 3dc815f0371002bd3a8657f18ccc09a27bb14961..c928b6d9f1f21508edd753f94c38ab2723cc0a9f 100644
--- a/tensorflow/contrib/distribute/python/step_fn.py
+++ b/tensorflow/contrib/distribute/python/step_fn.py
@@ -94,7 +94,7 @@ class StandardSingleLossStep(StandardInputStep):
 
   def __call__(self):
     with self._distribution.scope():
-      def step_fn(ctx, *inputs):
+      def step_fn(ctx, inputs):
         """Function to run one iteration with one input."""
         gradients_fn = backprop.implicit_grad(self._loss_fn)
         gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
diff --git a/tensorflow/contrib/distribute/python/strategy_test_lib.py b/tensorflow/contrib/distribute/python/strategy_test_lib.py
index 98cdb17b8ca2624ed8bbc55fc8a7fb7e76aa507e..d441b5af5f6aa41efde2c75d09d9589516c54992 100644
--- a/tensorflow/contrib/distribute/python/strategy_test_lib.py
+++ b/tensorflow/contrib/distribute/python/strategy_test_lib.py
@@ -19,16 +19,21 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.distribute import distribution_strategy_context as ds_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.layers import core
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer
 
 
@@ -45,8 +50,7 @@ def _raise_exception_fn(_=None):
 # Must be the argument to a distribution.call_for_each_replica() call, calls a
 # get_replica_context().merge_call() that raises an exception.
 def _merge_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _raise_exception_fn)
+  ds_context.get_replica_context().merge_call(_raise_exception_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
@@ -59,8 +63,7 @@ def _call_raises_fn(dist):
 # calls a get_replica_context().merge_call() that calls a
 # call_for_each_replica() that raises an exception.
 def _merge_call_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _call_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_raises_fn)
 
 
 # Must be the argument to a get_replica_context().merge_call() call, calls
@@ -74,8 +77,7 @@ def _call_merge_raises_fn(dist):
 # get_replica_context().merge_call() that calls a call_for_each_replica() that
 # calls a get_replica_context().merge_call() that raises an exception.
 def _merge_call_merge_raises_fn():
-  distribution_strategy_context.get_replica_context().merge_call(
-      _call_merge_raises_fn)
+  ds_context.get_replica_context().merge_call(_call_merge_raises_fn)
 
 
 class DistributionTestBase(test.TestCase):
@@ -114,8 +116,8 @@ class DistributionTestBase(test.TestCase):
           before_list.append(fetched)
           # control_dependencies irrelevant but harmless in eager execution
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -169,8 +171,8 @@ class DistributionTestBase(test.TestCase):
           fetched = d.read_var(v)
           before_list.append(fetched)
           with ops.control_dependencies([fetched]):
-            g = d.reduce(
-                variable_scope.VariableAggregation.SUM, g, destinations=v)
+            g = d.extended.reduce_to(
+                reduce_util.ReduceOp.SUM, g, destinations=v)
             with ops.control_dependencies(d.update(
                 v, update, g, grouped=False)):
               after_list.append(d.read_var(v))
@@ -189,40 +191,20 @@ class DistributionTestBase(test.TestCase):
       # Error should go down
       self.assertLess(error_after, error_before)
 
-  def _test_map_reduce(self, d, in_graph=None):
-    with d.scope():
-      map_in = [constant_op.constant(i) for i in range(10)]
-      map_out = d.map(map_in, lambda x, y: x * y, 2)
-      observed = d.reduce(variable_scope.VariableAggregation.SUM, map_out,
-                          "/device:CPU:0")
-      expected = 90  # 2 * (0 + 1 + ... + 9)
-      self.assertEqual(expected, observed.numpy())
-
-  def _test_device_index(self, d):
-    with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
-
-      def mark_devices_fn(device_id):
-        self.assertLess(device_id, len(d.worker_devices))
-        self.assertFalse(expected_devices[device_id])
-        expected_devices[device_id] = True
-
-      d.call_for_each_replica(mark_devices_fn, args=(d.worker_device_index,))
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
-
   def _test_replica_id(self, d):
     with d.scope():
-      expected_devices = [False] * len(d.worker_devices)
+      expected_devices = [False] * len(d.extended.worker_devices)
 
       def mark_devices_fn():
-        replica_id = (
-            distribution_strategy_context.get_replica_context().replica_id)
-        self.assertLess(replica_id, len(d.worker_devices))
+        replica_id = self.evaluate(
+            ds_context.get_replica_context().replica_id_in_sync_group)
+        self.assertLess(replica_id, len(d.extended.worker_devices))
         self.assertFalse(expected_devices[replica_id])
         expected_devices[replica_id] = True
 
       d.call_for_each_replica(mark_devices_fn)
-      self.assertAllEqual(expected_devices, [True] * len(d.worker_devices))
+      self.assertAllEqual(expected_devices,
+                          [True] * len(d.extended.worker_devices))
 
   def _test_call_and_merge_exceptions(self, dist):
     with dist.scope():
@@ -234,3 +216,78 @@ class DistributionTestBase(test.TestCase):
         dist.call_for_each_replica(_merge_call_raises_fn)
       with self.assertRaises(_TestException):
         dist.call_for_each_replica(_merge_call_merge_raises_fn)
+
+  def _input_fn_to_test_input_context(self,
+                                      dataset_fn,
+                                      expected_num_replicas_in_sync,
+                                      expected_num_input_pipelines,
+                                      expected_input_pipeline_id):
+    # Use a list of one element as counter so that it can be captured by the
+    # `_input_fn`. This counter is incremented by 1 each time an input_fn is
+    # called. We use this counter to check whether the `input_pipeline_id`
+    # matches the counter in the in-graph replication.
+    worker_id_counter = [0]
+
+    def _input_fn(input_context):
+      """Input fn for testing."""
+      self.assertIsNotNone(input_context)
+      self.assertEqual(expected_num_replicas_in_sync,
+                       input_context.num_replicas_in_sync)
+      self.assertEqual(expected_num_input_pipelines,
+                       input_context.num_input_pipelines)
+      if expected_input_pipeline_id is not None:
+        self.assertEqual(expected_input_pipeline_id,
+                         input_context.input_pipeline_id)
+      else:
+        self.assertEqual(worker_id_counter[0], input_context.input_pipeline_id)
+        worker_id_counter[0] += 1
+
+      return dataset_fn()
+
+    return _input_fn
+
+  def _test_input_fn_iterator(self, iterator, devices, expected_values,
+                              sess=None):
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(iterator.initialize())
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertEqual(expected_value, computed_value)
+
+  def _test_global_step_update(self, strategy):
+    with strategy.scope():
+      global_step = variable_scope.get_variable(
+          "global_step",
+          shape=[],
+          dtype=dtypes.int64,
+          initializer=init_ops.zeros_initializer(),
+          trainable=False,
+          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self.evaluate(variables.global_variables_initializer())
+
+      def model_fn():
+        train_op = global_step.assign_add(1)
+        value = global_step.read_value()
+        return train_op, value
+
+      train_ops, value = strategy.call_for_each_replica(model_fn)
+      self.evaluate(strategy.group(train_ops))
+      global_step_tensors = strategy.unwrap(value)
+      global_step_values = self.evaluate(global_step_tensors)
+      self.assertEqual((1,) * len(global_step_tensors), global_step_values)
diff --git a/tensorflow/contrib/distribute/python/tpu_strategy.py b/tensorflow/contrib/distribute/python/tpu_strategy.py
index f5b4531ba8c483e69f2a2b5539b27205efb9fc21..b6f5b492017fc7dfd329e69ad9ca418ae682bc4b 100644
--- a/tensorflow/contrib/distribute/python/tpu_strategy.py
+++ b/tensorflow/contrib/distribute/python/tpu_strategy.py
@@ -21,31 +21,34 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import copy
 import functools
 
-from tensorflow.contrib.distribute.python import cross_tower_ops as cross_tower_ops_lib
-from tensorflow.contrib.distribute.python import values
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_system_metadata as tpu_system_metadata_lib
 from tensorflow.contrib.tpu.python.tpu import training_loop
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 
 
-_TPU_INITIALIZE_SYSTEM_COLLECTION = "TPU_STRATEGY_INITIALIZE"
-
-
 def get_tpu_system_metadata(tpu_cluster_resolver):
   """Retrieves TPU system metadata given a TPUClusterResolver."""
   master = tpu_cluster_resolver.master()
@@ -130,8 +133,24 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       num_cores: Number of cores to use on the TPU. If None specified, then
           auto-detect the cores and topology of the TPU system.
     """
-    super(TPUStrategy, self).__init__()
+    super(TPUStrategy, self).__init__(TPUExtended(
+        self, tpu_cluster_resolver, steps_per_run, num_cores))
+
+  @property
+  def steps_per_run(self):
+    """DEPRECATED: use .extended.steps_per_run instead."""
+    return self._extended.steps_per_run
+
+
+class TPUExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of TPUStrategy."""
 
+  # Track what TPU devices have been initialized.
+  _initialized_devices = []
+
+  def __init__(self, container_strategy, tpu_cluster_resolver, steps_per_run,
+               num_cores=None):
+    super(TPUExtended, self).__init__(container_strategy)
     self._tpu_cluster_resolver = tpu_cluster_resolver
     self._tpu_metadata = get_tpu_system_metadata(self._tpu_cluster_resolver)
     # TODO(sourabhbajaj): Change this from num_cores to metadata_override
@@ -143,16 +162,41 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
                   if "device:TPU:" in d.name}
     self._device_index = values.PerReplica(device_map)
     self._host_device = self.get_host_cpu_device(0)
-    self._tpu_devices = sorted(device_map.keys())
+    self._tpu_devices = tuple(sorted(device_map.keys()))
     # Only create variables for the number of replicas we're running.
-    self._tpu_devices = self._tpu_devices[:self.num_replicas]
+    self._tpu_devices = self._tpu_devices[:self._num_replicas_in_sync]
 
     # TODO(sourabhbajaj): Remove this once performance of running one step
     # at a time is comparable to multiple steps.
     self.steps_per_run = steps_per_run
-
     self._require_static_shapes = True
 
+    # Initialize the TPU devices.
+    self._initialize_tpu()
+
+  def _initialize_tpu(self):
+    """Initialize the TPU devices in a separate session and graph.
+
+    We keep track of all the TPU devices that we're initialized as we should
+    only be running TPU initialize once for the entire process.
+    """
+    master = self._tpu_cluster_resolver.master()
+    # Verify TPU has not already been initialized in this process.
+    if master in TPUExtended._initialized_devices:
+      logging.info("TPU master %s has already been initialized." % master)
+      return
+
+    logging.info("Initializing the TPU system.")
+    session_config = config_pb2.ConfigProto(allow_soft_placement=True)
+    self._configure(session_config)
+    with ops.Graph().as_default():
+      with session_lib.Session(config=session_config, target=master) as sess:
+        sess.run([tpu.initialize_system()])
+    logging.info("Finized initializing TPU system.")
+
+    # Update Strategy state to make sure we can track device initialization.
+    TPUExtended._initialized_devices.append(master)
+
   def _get_enqueue_op_per_host(self, host_id, multi_worker_iterator,
                                input_shapes, iterations):
     """Create an enqueue op for a single host identified using host_id.
@@ -214,7 +258,17 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
 
     return enqueue_op_per_host
 
-  def distribute_dataset(self, dataset_fn):
+  def _make_dataset_iterator(self, dataset):
+    """Make iterators for each of the TPU hosts."""
+
+    worker_devices = [
+        (self.get_host(hid), [self.get_host_cpu_device(hid)])
+        for hid in range(self.num_hosts)
+    ]
+    return values.DatasetIterator(dataset, worker_devices,
+                                  self._num_replicas_in_sync)
+
+  def _distribute_dataset(self, dataset_fn):
     worker_devices = [
         (self.get_host(hid), [self.get_host_cpu_device(hid)])
         for hid in range(self.num_hosts)
@@ -225,12 +279,11 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
   # TODO(sourabhbajaj): Remove the initial_loop_values parameter when we have
   # a mechanism to infer the outputs of `fn`. Pending b/110550782.
-  def _run_steps_on_dataset(self, fn, multi_worker_iterator, iterations,
-                            initial_loop_values=None):
-
+  def _experimental_run_steps_on_iterator(
+      self, fn, multi_worker_iterator, iterations, initial_loop_values=None):
     output_shapes = multi_worker_iterator.output_shapes
     shapes = nest.flatten(output_shapes)
-    if any([not s.is_fully_defined() for s in shapes]):
+    if any(not s.is_fully_defined() for s in shapes):
       raise ValueError(
           "TPU currently requires fully defined shapes. Either use "
           "set_shape() on the input tensors or use "
@@ -251,13 +304,13 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       initial_loop_values = {}
     initial_loop_values = nest.flatten(initial_loop_values)
     ctx = values.MultiStepContext()
-    def run_fn(*args, **kwargs):
+
+    def run_fn():
       """Single step on the TPU device."""
-      del args, kwargs
       fn_inputs = dequeue_fn()
       if not isinstance(fn_inputs, tuple):
         fn_inputs = (fn_inputs,)
-      fn_result = fn(ctx, *fn_inputs)
+      fn_result = fn(ctx, fn_inputs)
       flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
       if flat_last_step_outputs:
         with ops.control_dependencies([fn_result]):
@@ -265,11 +318,6 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
       else:
         return fn_result
 
-    # TODO(sourabhbajaj): The input to while loop should be based on the output
-    # type of the step_fn
-    def iterate_on_tpu():
-      return training_loop.repeat(iterations, run_fn, initial_loop_values)
-
     # We capture the control_flow_context at this point, before we run `fn`
     # inside a while_loop and TPU replicate context. This is useful in cases
     # where we might need to exit these contexts and get back to the outer
@@ -279,38 +327,70 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     self._outer_control_flow_context = (
         ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
 
-    replicate_inputs = [[]] * self.num_replicas
-    replicate_outputs = tpu.replicate(iterate_on_tpu, replicate_inputs)
+    def rewrite_fn(*args):
+      """The rewritten step fn running on TPU."""
+      del args
+      replicate_inputs = [[]] * self._num_replicas_in_sync
+      replicate_outputs = tpu.replicate(run_fn, replicate_inputs)
+
+      # If run_fn has tensor outputs, tpu.replicate returns a list of list. We
+      # will flatten it in this case. If run_fn has no tensor outputs,
+      # tpu.replicate returns a list of no_ops, we will keep the output as it
+      # is.
+      if isinstance(replicate_outputs[0], list):
+        replicate_outputs = nest.flatten(replicate_outputs)
+
+      return replicate_outputs
+
+    # TODO(sourabhbajaj): The input to while loop should be based on the output
+    # type of the step_fn
+    assert isinstance(initial_loop_values, list)
+    initial_loop_values = initial_loop_values * self._num_replicas_in_sync
+
+    # Put the while loop op on host 0.
+    with ops.device(self.get_host_cpu_device(0)):
+      replicate_outputs = training_loop.repeat(iterations, rewrite_fn,
+                                               initial_loop_values)
+
     del self._outer_control_flow_context
     ctx.run_op = control_flow_ops.group(replicate_outputs, enqueue_ops)
 
-    # Filter out any ops from the outputs, typically this would be the case
-    # when there were no tensor outputs.
-    last_step_tensor_outputs = [x for x in replicate_outputs
-                                if not isinstance(x, ops.Operation)]
-
-    # Outputs are currently of the structure (grouped by device)
-    # [[output0_device0, output1_device0, output2_device0],
-    #  [output0_device1, output1_device1, output2_device1]]
-    # Convert this to the following structure instead: (grouped by output)
-    # [[output0_device0, output0_device1],
-    #  [output1_device0, output1_device1],
-    #  [output2_device0, output2_device1]]
-    last_step_tensor_outputs = [list(x) for x in zip(*last_step_tensor_outputs)]
+    if isinstance(replicate_outputs, list):
+      # Filter out any ops from the outputs, typically this would be the case
+      # when there were no tensor outputs.
+      last_step_tensor_outputs = [
+          x for x in replicate_outputs if not isinstance(x, ops.Operation)
+      ]
+
+      # Outputs are currently of the structure (flattened)
+      # [output0_device0, output1_device0, output2_device0,
+      #  output0_device1, output1_device1, output2_device1,
+      #  ...]
+      # Convert this to the following structure instead: (grouped by output)
+      # [[output0_device0, output0_device1],
+      #  [output1_device0, output1_device1],
+      #  [output2_device0, output2_device1]]
+      output_num = len(last_step_tensor_outputs) // self._num_replicas_in_sync
+      last_step_tensor_outputs = [
+          last_step_tensor_outputs[i::output_num] for i in range(output_num)
+      ]
+    else:
+      # no tensors returned.
+      last_step_tensor_outputs = []
 
     # Convert replicate_outputs to the original dict structure of
     # last_step_outputs.
     last_step_tensor_outputs_dict = nest.pack_sequence_as(
         ctx.last_step_outputs, last_step_tensor_outputs)
 
-    for (name, aggregation) in ctx._last_step_outputs_aggregations.items():  # pylint: disable=protected-access
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
       output = last_step_tensor_outputs_dict[name]
-      # For outputs that have already been aggregated, take the first value
+      # For outputs that have already been reduced, take the first value
       # from the list as each value should be the same. Else return the full
       # list of values.
-      # TODO(josh11b): If aggregation is NONE, we should return a PerReplica
+      # TODO(josh11b): If reduce_op is NONE, we should return a PerReplica
       # value.
-      if aggregation is not variables_lib.VariableAggregation.NONE:
+      if reduce_op is not None:
         # TODO(priyag): Should this return the element or a list with 1 element
         last_step_tensor_outputs_dict[name] = output[0]
     ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
@@ -320,33 +400,25 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def _call_for_each_replica(self, fn, args, kwargs):
     # TODO(jhseu): Consider making it so call_for_each_replica implies that
     # we're in a tpu.rewrite(), and update TPUMirroredVariable accordingly.
-    with _TPUReplicaContext(self):
+    with _TPUReplicaContext(self._container_strategy()):
       return fn(*args, **kwargs)
 
-  def initialize(self):
+  def _initialize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      # TODO(jhseu): We need this hack because DistributionStrategies must be
-      # pickleable for copy.deepcopy(). Remove when initialize_system goes away.
-      graph = ops.get_default_graph()
-      tpu_init = graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
-      if tpu_init:
-        return tpu_init
-      graph.add_to_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION,
-                              tpu.initialize_system())
-      return graph.get_collection(_TPU_INITIALIZE_SYSTEM_COLLECTION)
-
-  def finalize(self):
+      return []
+
+  def _finalize(self):
     if context.executing_eagerly():
       # TODO(priyag): Add appopriate call here when eager is supported for TPUs.
       raise NotImplementedError("Eager mode not supported in TPUStrategy.")
     else:
-      return [tpu.shutdown_system()]
+      return []
 
   def _get_devices_from(self, colocate_with=None):
-     # TODO(jhseu): Change this when we support model parallelism.
+    # TODO(jhseu): Change this when we support model parallelism.
     return self._tpu_devices
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -383,12 +455,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     return _create_tpu_mirrored_variable(devices, _real_mirrored_creator, *args,
                                          **kwargs)
 
-  def _reduce(self, aggregation, value, destinations):
+  def _reduce_to(self, reduce_op, value, destinations):
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if aggregation == vs.VariableAggregation.MEAN:
+      if reduce_op == reduce_util.ReduceOp.MEAN:
         # TODO(jhseu):  Revisit once we support model-parallelism.
-        value *= (1. / self.num_replicas)
-      elif aggregation != vs.VariableAggregation.SUM:
+        value *= (1. / self._num_replicas_in_sync)
+      elif reduce_op != reduce_util.ReduceOp.SUM:
         raise NotImplementedError(
             "Currently only support sum & mean in TPUStrategy.")
       return tpu_ops.cross_replica_sum(value)
@@ -396,27 +468,22 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     # Validate that the destination is same as the host device
     # Note we don't do this when in replicate context as the reduction is
     # performed on the TPU device itself.
-    devices = cross_tower_ops_lib.get_devices_from(destinations)
+    devices = cross_device_ops_lib.get_devices_from(destinations)
     if len(devices) == 1:
       assert device_util.canonicalize(devices[0]) == device_util.canonicalize(
           self._host_device)
     else:
       raise ValueError("Multiple devices are not supported for TPUStrategy")
 
-    if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
-      return value[0]
     output = math_ops.add_n(value)
-    if aggregation == vs.VariableAggregation.MEAN:
+    if reduce_op == reduce_util.ReduceOp.MEAN:
       return output * (1. / len(value))
     return output
 
-  def _update(self, var, options, fn, *args, **kwargs):
+  def _update(self, var, fn, args, kwargs, group):
     assert isinstance(var, values.TPUMirroredVariable)
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-
     if values._enclosing_tpu_context() is not None:  # pylint: disable=protected-access
-      if should_group:
+      if group:
         return fn(var, *args, **kwargs)
       else:
         return [fn(var, *args, **kwargs)]
@@ -431,9 +498,7 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
         updates[d] = fn(v,
                         *values.select_device_mirrored(d, args),
                         **values.select_device_mirrored(d, kwargs))
-    return values.update_regroup(self, updates, should_group)
-
-  # TODO(josh11b): Need to implement _update_non_slot()!
+    return values.update_regroup(self, updates, group)
 
   def read_var(self, var):
     assert isinstance(var, values.TPUMirroredVariable)
@@ -442,25 +507,21 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def _unwrap(self, val):
     if isinstance(val, values.DistributedValues):
       # Return in a deterministic order.
-      return [val.get(device=d) for d in sorted(val.devices)]
+      return tuple(val.get(device=d) for d in sorted(val.devices))
     elif isinstance(val, list):
       # TODO(josh11b): We need to remove this case; per device values should
       # be represented using a PerReplica wrapper instead of a list with
       # one entry per device.
-      return val
-    return [val]
+      return tuple(val)
+    return (val,)
 
   def value_container(self, value):
     return value
 
-  def _broadcast(self, tensor, destinations):
+  def _broadcast_to(self, tensor, destinations):
     del destinations
     return tensor
 
-  @property
-  def num_replicas(self):
-    return self._num_cores_override or self._tpu_metadata.num_cores
-
   @property
   def num_hosts(self):
     return self._tpu_metadata.num_hosts
@@ -470,15 +531,15 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
     return self._tpu_metadata.num_of_cores_per_host
 
   @property
-  def num_replicas_in_sync(self):
-    return self.num_replicas
+  def _num_replicas_in_sync(self):
+    return self._num_cores_override or self._tpu_metadata.num_cores
 
   @property
-  def between_graph(self):
+  def experimental_between_graph(self):
     return False
 
   @property
-  def should_init(self):
+  def experimental_should_init(self):
     return True
 
   @property
@@ -500,14 +561,12 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def non_slot_devices(self, var_list):
     return self._host_device
 
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
     del colocate_with
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
     with ops.device(self._host_device), distribute_lib.UpdateContext(
         self._host_device):
       result = fn(*args, **kwargs)
-      if should_group:
+      if group:
         return result
       else:
         return nest.map_structure(self._unwrap, result)
@@ -521,17 +580,27 @@ class TPUStrategy(distribute_lib.DistributionStrategy):
   def get_host_cpu_device(self, host_id):
     return self.get_host(host_id) + "/device:CPU:0"
 
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
     del cluster_spec, task_type, task_id
     if session_config:
-      session_config.isolate_session_state = True
-      cluster_spec = self._tpu_cluster_resolver.cluster_spec()
-      if cluster_spec:
-        session_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    updated_config.isolate_session_state = True
+    cluster_spec = self._tpu_cluster_resolver.cluster_spec()
+    if cluster_spec:
+      updated_config.cluster_def.CopyFrom(cluster_spec.as_cluster_def())
+    return updated_config
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
 
 
 class _TPUReplicaContext(distribute_lib.ReplicaContext):
@@ -540,13 +609,14 @@ class _TPUReplicaContext(distribute_lib.ReplicaContext):
   # TODO(sourabhbajaj): Call for each tower should be updating this.
   def __init__(self, distribution_strategy):
     distribute_lib.ReplicaContext.__init__(
-        self, distribution_strategy, replica_id=0)
-
-  @property
-  def device(self):
-    raise RuntimeError("Use .devices instead")
+        self,
+        distribution_strategy,
+        # TODO(b/118385803): properly initialize replica_id, instead of always 0
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32))
 
   @property
   def devices(self):
     distribute_lib.require_replica_context(self)
-    return [self._distribution_strategy.worker_devices[self._replica_id]]
+    ds = self._distribution_strategy
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return (ds.extended.worker_devices[replica_id],)
diff --git a/tensorflow/contrib/distribute/python/values_test.py b/tensorflow/contrib/distribute/python/values_test.py
index 268393ee801b5f25bb5a7f061960b817c2d2ce5e..538b859f3d1ece55b460f6dbf8f01540a6013381 100644
--- a/tensorflow/contrib/distribute/python/values_test.py
+++ b/tensorflow/contrib/distribute/python/values_test.py
@@ -19,12 +19,15 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from absl.testing import parameterized
 
-from tensorflow.contrib.distribute.python import mirrored_strategy
+from tensorflow.contrib.distribute.python import combinations
 from tensorflow.contrib.distribute.python import multi_worker_test_base
-from tensorflow.contrib.distribute.python import values
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import values
 from tensorflow.python.eager import context
 from tensorflow.python.eager import test
 from tensorflow.python.estimator import model_fn as model_fn_lib
@@ -34,10 +37,10 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.util import nest
 
@@ -324,20 +327,20 @@ class RegroupAndSelectDeviceTest(test.TestCase):
 
       self.assertTrue(
           isinstance(merged_estimator_spec, model_fn_lib.EstimatorSpec))
-      self.assertEquals(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
+      self.assertEqual(model_fn_lib.ModeKeys.TRAIN, merged_estimator_spec.mode)
       for device_id in range(3):
         d = _device_str(device_id)
-        self.assertEquals(created_estimator_specs[device_id].loss,
-                          merged_estimator_spec.loss.get(d))
-        self.assertEquals(created_estimator_specs[device_id].train_op,
-                          merged_estimator_spec.train_op.get(d))
+        self.assertEqual(created_estimator_specs[device_id].loss,
+                         merged_estimator_spec.loss.get(d))
+        self.assertEqual(created_estimator_specs[device_id].train_op,
+                         merged_estimator_spec.train_op.get(d))
         # Scaffold is populated by `EstimatorSpec.__new__`.
-        self.assertEquals(created_estimator_specs[device_id].scaffold,
-                          merged_estimator_spec.scaffold.get(d))
+        self.assertEqual(created_estimator_specs[device_id].scaffold,
+                         merged_estimator_spec.scaffold.get(d))
         # Also test that we can undo the merge using select_device()
-        self.assertEquals(created_estimator_specs[device_id],
-                          values.select_device(_device_str(device_id),
-                                               merged_estimator_spec))
+        self.assertEqual(created_estimator_specs[device_id],
+                         values.select_device(_device_str(device_id),
+                                              merged_estimator_spec))
 
 
 class PerReplicaDatasetTest(test.TestCase):
@@ -568,7 +571,184 @@ class MultiWorkerDatasetTest(multi_worker_test_base.MultiWorkerTestBase):
         multi_worker_iterator.get_next()
 
 
-class MirroredVariableTest(test.TestCase):
+class InputIteratorTestBase(test.TestCase):
+
+  def _test_iterator(self, input_type, dataset_fn, worker_device_pairs,
+                     expected_values, sess=None, split_batch_by=None):
+    devices = nest.flatten([ds for _, ds in worker_device_pairs])
+
+    if input_type == "input_fn":
+      input_contexts = [
+          distribute_lib.InputContext() for _ in worker_device_pairs]
+      input_fn = lambda _: dataset_fn()
+      iterator = values.InputFunctionIterator(input_fn, worker_device_pairs,
+                                              input_contexts)
+    else:
+      iterator = values.DatasetIterator(dataset_fn(), worker_device_pairs,
+                                        split_batch_by)
+
+    evaluate = lambda x: sess.run(x) if sess else self.evaluate(x)
+
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+    with self.assertRaises(errors.OutOfRangeError):
+      next_element = iterator.get_next()
+      evaluate([values.select_device(d, next_element) for d in devices])
+
+    # After re-initializing the iterator, should be able to iterate again.
+    evaluate(control_flow_ops.group(iterator.initialize()))
+
+    for expected_value in expected_values:
+      next_element = iterator.get_next()
+      computed_value = evaluate(
+          [values.select_device(d, next_element) for d in devices])
+      self.assertAllEqual(expected_value, computed_value)
+
+
+class InputIteratorSingleWorkerTest(InputIteratorTestBase,
+                                    parameterized.TestCase):
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDeviceCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i] for i in range(10)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesOneGPUOneCPU(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(10)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTupleDataset(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    def dataset_fn():
+      dataset1 = dataset_ops.Dataset.range(10)
+      dataset2 = dataset_ops.Dataset.range(10).map(lambda x: x**2)
+      return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+    expected_values = [[(i, i**2), (i+1, (i+1)**2)] for i in range(0, 10, 2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testUnevenDatasetBatches(self, input_type):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    dataset_fn = lambda: dataset_ops.Dataset.range(11)
+
+    expected_values = [[i, i+1] for i in range(0, 10, 2)]
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph", "eager"],
+      input_type=["dataset"],
+      split_batch_by=[None, 2],
+      required_gpus=1))
+  def testBatchSplitting(self, input_type, split_batch_by):
+    worker_device_pairs = [("", ["/device:GPU:0", "/device:CPU:0"])]
+    batch_size = 10
+    dataset_fn = lambda: dataset_ops.Dataset.range(100).batch(batch_size)
+
+    updated_batch_size = (
+        batch_size // split_batch_by if split_batch_by else batch_size)
+    expected_values = [[range(i, i+updated_batch_size),
+                        range(i+updated_batch_size, i+2*updated_batch_size)]
+                       for i in range(0, 100, updated_batch_size*2)]
+
+    self._test_iterator(input_type, dataset_fn, worker_device_pairs,
+                        expected_values, sess=None,
+                        split_batch_by=split_batch_by)
+
+
+class InputIteratorMultiWorkerTest(
+    multi_worker_test_base.MultiWorkerTestBase, InputIteratorTestBase,
+    parameterized.TestCase):
+
+  def _cpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0",
+         ["/job:worker/replica:0/task:0/device:CPU:0"]),
+        ("/job:worker/replica:0/task:1",
+         ["/job:worker/replica:0/task:1/device:CPU:0"])]
+
+  def _cpu_and_one_gpu_devices(self):
+    return [
+        ("/job:worker/replica:0/task:0", [
+            "/job:worker/replica:0/task:0/device:GPU:0",
+            "/job:worker/replica:0/task:0/device:CPU:0"
+        ]),
+        ("/job:worker/replica:0/task:1", [
+            "/job:worker/replica:0/task:1/device:GPU:0",
+            "/job:worker/replica:0/task:1/device:CPU:0"
+        ])
+    ]
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testOneDevicePerWorker(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 0], [1, 1], [2, 2], [3, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"],
+      required_gpus=1))
+  def testTwoDevicesPerWorker(self, input_type):
+    worker_devices = self._cpu_and_one_gpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      dataset_fn = lambda: dataset_ops.Dataset.range(4)
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          [[0, 1, 0, 1], [2, 3, 2, 3]], sess)
+
+  @combinations.generate(combinations.combine(
+      mode=["graph"],
+      input_type=["input_fn", "dataset"]))
+  def testTupleDataset(self, input_type):
+    worker_devices = self._cpu_devices()
+    with context.graph_mode(), self.cached_session() as sess:
+      def dataset_fn():
+        dataset1 = dataset_ops.Dataset.range(4)
+        dataset2 = dataset_ops.Dataset.range(4).map(lambda x: x**2)
+        return dataset_ops.Dataset.zip((dataset1, dataset2))
+
+      expected_values = [[(i, i**2), (i, i**2)] for i in range(0, 4)]
+      self._test_iterator(input_type, dataset_fn, worker_devices,
+                          expected_values, sess)
+
+
+class MirroredVariableTest(test.TestCase, parameterized.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -580,9 +760,9 @@ class MirroredVariableTest(test.TestCase):
 
     v, _, mirrored = _make_mirrored()
 
-    self.assertEquals(v[0].name, mirrored.name)
-    self.assertEquals(v[0].dtype, mirrored.dtype)
-    self.assertEquals(v[0].shape, mirrored.shape)
+    self.assertEqual(v[0].name, mirrored.name)
+    self.assertEqual(v[0].dtype, mirrored.dtype)
+    self.assertEqual(v[0].shape, mirrored.shape)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
@@ -592,9 +772,9 @@ class MirroredVariableTest(test.TestCase):
     mirrored = values.MirroredVariable(index, v,
                                        variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, mirrored.name)
-    self.assertEquals(v.dtype, mirrored.dtype)
-    self.assertEquals(v.shape, mirrored.shape)
+    self.assertEqual(v.name, mirrored.name)
+    self.assertEqual(v.dtype, mirrored.dtype)
+    self.assertEqual(v.shape, mirrored.shape)
 
   def _assign_mirrored(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -714,14 +894,13 @@ class MirroredVariableTest(test.TestCase):
     save_path = self._save_normal()
     self._restore_mirrored(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testFetchAMirroredVariable(self):
-    if context.num_gpus() < 1 or context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test or it's eager mode.")
-
-    with self.session(
-        graph=ops.Graph()) as sess, mirrored_strategy.MirroredStrategy(
-            ["/device:GPU:0"]).scope():
+  @combinations.generate(combinations.combine(
+      distribution=[
+          combinations.mirrored_strategy_with_one_gpu,
+          combinations.core_mirrored_strategy_with_one_gpu],
+      mode=["graph"]))
+  def testFetchAMirroredVariable(self, distribution):
+    with self.session(graph=ops.Graph()) as sess, distribution.scope():
       with ops.device("/device:GPU:0"):
         v = variable_scope.get_variable(
             name="v", initializer=1., use_resource=True)
@@ -747,7 +926,7 @@ def _make_replica_local(method):
   return v, replica_local
 
 
-class ReplicaLocalVariableTest(test.TestCase):
+class ReplicaLocalVariablePropertiesTest(test.TestCase):
 
   config = config_pb2.ConfigProto()
   config.allow_soft_placement = True
@@ -756,15 +935,14 @@ class ReplicaLocalVariableTest(test.TestCase):
   def testProperties(self):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
-
     v, replica_local = _make_replica_local(
         variable_scope.VariableAggregation.SUM)
 
-    self.assertEquals(v[0].name, replica_local.name)
-    self.assertEquals(v[0].dtype, replica_local.dtype)
-    self.assertEquals(v[0].shape, replica_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.SUM,
-                      replica_local.aggregation)
+    self.assertEqual(v[0].name, replica_local.name)
+    self.assertEqual(v[0].dtype, replica_local.dtype)
+    self.assertEqual(v[0].shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.SUM,
+                     replica_local.aggregation)
 
   @test_util.run_in_graph_and_eager_modes(config=config)
   def testVariableOnAnotherDevice(self):
@@ -774,11 +952,32 @@ class ReplicaLocalVariableTest(test.TestCase):
     replica_local = values.ReplicaLocalVariable(
         index, v, variable_scope.VariableAggregation.MEAN)
 
-    self.assertEquals(v.name, replica_local.name)
-    self.assertEquals(v.dtype, replica_local.dtype)
-    self.assertEquals(v.shape, replica_local.shape)
-    self.assertEquals(variable_scope.VariableAggregation.MEAN,
-                      replica_local.aggregation)
+    self.assertEqual(v.name, replica_local.name)
+    self.assertEqual(v.dtype, replica_local.dtype)
+    self.assertEqual(v.shape, replica_local.shape)
+    self.assertEqual(variable_scope.VariableAggregation.MEAN,
+                     replica_local.aggregation)
+
+  def testTensorConversion(self):
+    with context.graph_mode():
+      _, replica_local = _make_replica_local(
+          variable_scope.VariableAggregation.SUM)
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
+      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
+      # Resources variable are converted to tensors as well when as_ref is True.
+      self.assertIsInstance(converted, ops.Tensor)
+      self.assertEqual(converted.dtype, replica_local.dtype)
+
+
+@combinations.generate(combinations.combine(
+    distribution=[
+        combinations.mirrored_strategy_with_gpu_and_cpu,
+        combinations.core_mirrored_strategy_with_gpu_and_cpu],
+    mode=["graph", "eager"]))
+class ReplicaLocalVariableTest(test.TestCase, parameterized.TestCase):
 
   def _assign_replica_local(self, devices, v, new):
     for d, var, n in zip(devices, v, new):
@@ -795,22 +994,15 @@ class ReplicaLocalVariableTest(test.TestCase):
     save_path, _ = self._save_return_saver(sess, var)
     return save_path
 
-  def _dist_scope(self):
-    return mirrored_strategy.MirroredStrategy(_devices).scope()
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreReplicaLocalSumOneGraph(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    with self.cached_session(config=self.config) as sess:
+  def testSaveAndRestoreReplicaLocalSumOneGraph(self, distribution):
+    with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.SUM)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 7.
         save_path, saver = self._save_return_saver(sess, replica_local)
 
@@ -822,19 +1014,18 @@ class ReplicaLocalVariableTest(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveAndRestoreReplicaLocalMeanOneGraph(self):
+  def testSaveAndRestoreReplicaLocalMeanOneGraph(self, distribution):
     if context.num_gpus() < 1 and context.executing_eagerly():
       self.skipTest("A GPU is not available for this test in eager mode.")
 
-    with self.cached_session(config=self.config) as sess:
+    with self.cached_session() as sess:
       v, replica_local = _make_replica_local(
           variable_scope.VariableAggregation.MEAN)
 
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5.
         save_path, saver = self._save_return_saver(sess, replica_local)
 
@@ -845,7 +1036,7 @@ class ReplicaLocalVariableTest(test.TestCase):
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _save_replica_local_mean(self):
+  def _save_replica_local_mean(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -854,7 +1045,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [3., 4.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of (v[0] + v[1])/2, 3.5
         save_path = self._save(sess, replica_local)
 
@@ -862,7 +1053,7 @@ class ReplicaLocalVariableTest(test.TestCase):
         self._assign_replica_local(_devices, v, [5., 6.])
     return save_path
 
-  def _save_replica_local_sum(self):
+  def _save_replica_local_sum(self, distribution):
     """Save variables with mirroring, returns save_path."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local("sum")
@@ -870,7 +1061,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [1.5, 2.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Saves the current value of v[0] + v[1], 3.5
         save_path = self._save(sess, replica_local)
 
@@ -908,7 +1099,7 @@ class ReplicaLocalVariableTest(test.TestCase):
       saver.restore(sess, save_path)
       self.assertEqual(3.5, self.evaluate(var))
 
-  def _restore_replica_local_mean(self, save_path):
+  def _restore_replica_local_mean(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -917,13 +1108,13 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
         saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([3.5, 3.5], self.evaluate([v[0], v[1]]))
 
-  def _restore_replica_local_sum(self, save_path):
+  def _restore_replica_local_sum(self, save_path, distribution):
     """Restore to variables with mirroring in a fresh graph."""
     with self.session(graph=ops.Graph()) as sess:
       v, replica_local = _make_replica_local(
@@ -932,72 +1123,35 @@ class ReplicaLocalVariableTest(test.TestCase):
       # Overwrite the initial values.
       self._assign_replica_local(_devices, v, [7., 8.])
 
-      with self._dist_scope():
+      with distribution.scope():
         # Restores the saved value of 3.5 to both variables.
         saver = saver_lib.Saver(var_list=[replica_local])
         saver.restore(sess, save_path)
         self.assertEqual([1.75, 1.75], self.evaluate([v[0], v[1]]))
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalRestoreReplicaLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+  def testSaveReplicaLocalRestoreReplicaLocalMean(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
+    self._restore_replica_local_mean(save_path, distribution)
 
-    save_path = self._save_replica_local_mean()
-    self._restore_replica_local_mean(save_path)
+  def testSaveReplicaLocalRestoreReplicaLocalSum(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
+    self._restore_replica_local_sum(save_path, distribution)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalRestoreReplicaLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_sum()
-    self._restore_replica_local_sum(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalMeanRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_mean()
+  def testSaveReplicaLocalMeanRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_mean(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveReplicaLocalSumRestoreNormal(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
-    save_path = self._save_replica_local_sum()
+  def testSaveReplicaLocalSumRestoreNormal(self, distribution):
+    save_path = self._save_replica_local_sum(distribution)
     self._restore_normal(save_path)
 
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreReplicaLocalMean(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
-
+  def testSaveNormalRestoreReplicaLocalMean(self, distribution):
     save_path = self._save_normal()
-    self._restore_replica_local_mean(save_path)
-
-  @test_util.run_in_graph_and_eager_modes(config=config)
-  def testSaveNormalRestoreReplicaLocalSum(self):
-    if context.num_gpus() < 1 and context.executing_eagerly():
-      self.skipTest("A GPU is not available for this test in eager mode.")
+    self._restore_replica_local_mean(save_path, distribution)
 
+  def testSaveNormalRestoreReplicaLocalSum(self, distribution):
     save_path = self._save_normal()
-    self._restore_replica_local_sum(save_path)
-
-  def testTensorConversion(self):
-    with context.graph_mode():
-      _, replica_local = _make_replica_local(
-          variable_scope.VariableAggregation.SUM)
-      converted = ops.internal_convert_to_tensor(replica_local, as_ref=False)
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, replica_local.dtype)
-
-      converted = ops.internal_convert_to_tensor(replica_local, as_ref=True)
-      # Resources variable are converted to tensors as well when as_ref is True.
-      self.assertIsInstance(converted, ops.Tensor)
-      self.assertEqual(converted.dtype, replica_local.dtype)
+    self._restore_replica_local_sum(save_path, distribution)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/distribute/python/warm_starting_util_test.py b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
index 5d57d144c1c16a08280970ecd89eb54f7cf1ffd4..b0bcf9b17456c938204a4892451928daf90b6743 100644
--- a/tensorflow/contrib/distribute/python/warm_starting_util_test.py
+++ b/tensorflow/contrib/distribute/python/warm_starting_util_test.py
@@ -44,7 +44,9 @@ class WarmStartingUtilWithDistributionStrategyTest(
       distribution=[combinations.default_strategy,
                     combinations.one_device_strategy,
                     combinations.mirrored_strategy_with_gpu_and_cpu,
-                    combinations.mirrored_strategy_with_two_gpus],
+                    combinations.mirrored_strategy_with_two_gpus,
+                    combinations.core_mirrored_strategy_with_gpu_and_cpu,
+                    combinations.core_mirrored_strategy_with_two_gpus],
       save_with_distribution=[True, False],
       restore_with_distribution=[True, False],
       mode=["graph"]))
diff --git a/tensorflow/contrib/distributions/BUILD b/tensorflow/contrib/distributions/BUILD
index 60f6b90edcb71f04bca29b90744db201e83cd545..3079175015a9aee1625404902070df8f13b2089c 100644
--- a/tensorflow/contrib/distributions/BUILD
+++ b/tensorflow/contrib/distributions/BUILD
@@ -72,7 +72,6 @@ py_library(
         "//tensorflow/python:nn",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
-        "//tensorflow/python:spectral_ops",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
@@ -80,6 +79,7 @@ py_library(
         "//tensorflow/python:variables",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
index 29eeaf43c5185ce5519d4a1211f66e99ce61c6ab..ab3c07172a68255f4e387e071ac2f8341e93b90c 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/normal_conjugate_posteriors_test.py
@@ -82,7 +82,7 @@ class NormalTest(test.TestCase):
       x = constant_op.constant(
           [[-2.5, 2.5, 4.0, 0.0, -1.0, 2.0], [2.5, -2.5, -4.0, 0.0, 1.0, -2.0]],
           dtype=dtypes.float32)
-      s = math_ops.reduce_sum(x, reduction_indices=[1])
+      s = math_ops.reduce_sum(x, axis=[1])
       x = array_ops.transpose(x)  # Reshape to shape (6, 2)
       n = constant_op.constant([6] * 2)
       prior = distributions.Normal(loc=mu0, scale=sigma0)
diff --git a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
index a60056c444a3fe7262939c5b3c73673f9a7c1469..cdee30bbc42e661952a9c757d7a30ebcd393f794 100644
--- a/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
+++ b/tensorflow/contrib/distributions/python/kernel_tests/wishart_test.py
@@ -147,14 +147,13 @@ class WishartCholeskyTest(test.TestCase):
       x = chol_w.sample(10000, seed=42)
       self.assertAllEqual((10000, 3, 3), x.get_shape())
 
-      moment1_estimate = math_ops.reduce_mean(x, reduction_indices=[0]).eval()
+      moment1_estimate = math_ops.reduce_mean(x, axis=[0]).eval()
       self.assertAllClose(chol_w.mean().eval(), moment1_estimate, rtol=0.05)
 
       # The Variance estimate uses the squares rather than outer-products
       # because Wishart.Variance is the diagonal of the Wishart covariance
       # matrix.
-      variance_estimate = (math_ops.reduce_mean(
-          math_ops.square(x), reduction_indices=[0]) -
+      variance_estimate = (math_ops.reduce_mean(math_ops.square(x), axis=[0]) -
                            math_ops.square(moment1_estimate)).eval()
       self.assertAllClose(
           chol_w.variance().eval(), variance_estimate, rtol=0.05)
diff --git a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
index 15c241d5d7a29d0e317cb6e5f46a40516e8a834f..74765f19e584c5de07c6aee4a36ec4e85438f862 100644
--- a/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
+++ b/tensorflow/contrib/distributions/python/ops/bijectors/softmax_centered.py
@@ -168,7 +168,7 @@ class SoftmaxCentered(bijector.Bijector):
     #   log_normalization = 1 + reduce_sum(exp(logits))
     #   -log_normalization + reduce_sum(logits - log_normalization)
     log_normalization = nn_ops.softplus(
-        math_ops.reduce_logsumexp(x, axis=-1, keep_dims=True))
+        math_ops.reduce_logsumexp(x, axis=-1, keepdims=True))
     return array_ops.squeeze(
         (-log_normalization + math_ops.reduce_sum(
             x - log_normalization, axis=-1, keepdims=True)), axis=-1)
diff --git a/tensorflow/contrib/distributions/python/ops/sample_stats.py b/tensorflow/contrib/distributions/python/ops/sample_stats.py
index aa680a92be64cf0f099acd335369f2a1610c5953..978e627d6638ddeea9df288d389354f0ac53d115 100644
--- a/tensorflow/contrib/distributions/python/ops/sample_stats.py
+++ b/tensorflow/contrib/distributions/python/ops/sample_stats.py
@@ -29,8 +29,8 @@ from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops.distributions import util
+from tensorflow.python.ops.signal import fft_ops
 
 __all__ = [
     "auto_correlation",
@@ -157,11 +157,11 @@ def auto_correlation(
                                        dtype.real_dtype.as_numpy_dtype(0.))
 
     # Autocorrelation is IFFT of power-spectral density (up to some scaling).
-    fft_x_rotated_pad = spectral_ops.fft(x_rotated_pad)
+    fft_x_rotated_pad = fft_ops.fft(x_rotated_pad)
     spectral_density = fft_x_rotated_pad * math_ops.conj(fft_x_rotated_pad)
     # shifted_product is R[m] from above detailed explanation.
     # It is the inner product sum_n X[n] * Conj(X[n - m]).
-    shifted_product = spectral_ops.ifft(spectral_density)
+    shifted_product = fft_ops.ifft(spectral_density)
 
     # Cast back to real-valued if x was real to begin with.
     shifted_product = math_ops.cast(shifted_product, dtype)
diff --git a/tensorflow/contrib/eager/python/datasets.py b/tensorflow/contrib/eager/python/datasets.py
index 3aed121233be1268531495a2fa83fd323412e1fd..34614b86a75b93ab93cf844c645c211b1329c6d5 100644
--- a/tensorflow/contrib/eager/python/datasets.py
+++ b/tensorflow/contrib/eager/python/datasets.py
@@ -52,12 +52,6 @@ class Iterator(iterator_ops.EagerIterator):
       TypeError: If `dataset` is an unsupported type.
       RuntimeError: When invoked without eager execution enabled.
     """
-    if isinstance(dataset, prefetching_ops._PrefetchToDeviceDataset):  # pylint: disable=protected-access
-      raise TypeError(
-          "`tf.data.experimental.prefetch_to_device()` is not compatible with "
-          "`tf.contrib.eager.Iterator`. Use `for ... in dataset:` to iterate "
-          "over the dataset instead.")
-
     if not context.context().device_spec.device_type:
       is_remote_device = False
     else:
diff --git a/tensorflow/contrib/eager/python/datasets_test.py b/tensorflow/contrib/eager/python/datasets_test.py
index 6a508fc6ba98740c4d441a064dc8a3e2b321f585..257d02057ae0d280074559aa9e97725bf5cc3fd0 100644
--- a/tensorflow/contrib/eager/python/datasets_test.py
+++ b/tensorflow/contrib/eager/python/datasets_test.py
@@ -26,7 +26,6 @@ import numpy as np
 from tensorflow.contrib import lookup
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.python.data import Dataset
-from tensorflow.python.data.experimental.ops import prefetching_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.eager import test
@@ -208,18 +207,6 @@ class IteratorTest(test.TestCase):
         y = math_ops.add(x, x)
     self.assertAllEqual([0., 2.], y.numpy())
 
-  def testTensorsExplicitPrefetchToDevice(self):
-    ds = Dataset.from_tensor_slices([0., 1.])
-    ds = ds.apply(prefetching_ops.prefetch_to_device(test.gpu_device_name()))
-
-    with self.assertRaisesRegexp(TypeError, 'prefetch_to_device'):
-      datasets.Iterator(ds)
-
-    for i, x in enumerate(ds):
-      with ops.device(test.gpu_device_name()):
-        x = math_ops.add(x, x)
-        self.assertEqual(float(i) + float(i), x.numpy())
-
   def testOverrideThreadPool(self):
 
     def get_thread_id(_):
diff --git a/tensorflow/contrib/eager/python/evaluator.py b/tensorflow/contrib/eager/python/evaluator.py
index 7949a3f6da293abdd85512209242bae76ab4d816..51443d24829bdc31a41813e0ff50ad7102422112 100644
--- a/tensorflow/contrib/eager/python/evaluator.py
+++ b/tensorflow/contrib/eager/python/evaluator.py
@@ -22,6 +22,7 @@ import six
 
 from tensorflow.contrib.eager.python import datasets
 from tensorflow.contrib.eager.python import metrics
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import errors_impl
@@ -164,8 +165,8 @@ class Evaluator(object):
         self.__call__(example, *args, **kwargs)
       return self.all_metric_results(summary_logdir)
     # Graph construction
-    call_op = self.__call__(dataset.make_one_shot_iterator().get_next(), *args,
-                            **kwargs)
+    call_op = self.__call__(
+        dataset_ops.make_one_shot_iterator(dataset).get_next(), *args, **kwargs)
     init_op = self.init_variables()
     results_op = self.all_metric_results(summary_logdir)
     return (init_op, call_op, results_op)
diff --git a/tensorflow/contrib/eager/python/examples/densenet/BUILD b/tensorflow/contrib/eager/python/examples/densenet/BUILD
index 2dc196f550a10367066730f6f042c4ed69533ec3..e2154fcc5fcf774dcd52285d9442dfd5073a4992 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/BUILD
+++ b/tensorflow/contrib/eager/python/examples/densenet/BUILD
@@ -3,6 +3,7 @@ licenses(["notice"])  # Apache 2.0
 package(default_visibility = ["//tensorflow:internal"])
 
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+load("//tensorflow:tensorflow.bzl", "py_binary")
 
 py_binary(
     name = "densenet",
diff --git a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
index 4b3cb624bc947a1d1956eff6accb6d4da3bf3b87..24f6b007b526b29157011f3b1e9abdbd50bacc8e 100644
--- a/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/densenet/densenet_graph_test.py
@@ -119,7 +119,8 @@ class DensenetBenchmark(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        (images, labels) = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = densenet.DenseNet(self.depth, self.growth_rate, self.num_blocks,
                                   self.output_classes,
diff --git a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
index 12b39b0cde49d4c017acfa74572c725036c54eff..e73841fbf724e05eaa3be90cc8650f795d3e1ccf 100644
--- a/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/gan/mnist_graph_test.py
@@ -42,7 +42,8 @@ class MnistGraphGanBenchmark(tf.test.Benchmark):
     # Generate some random data.
     images_data = np.random.randn(batch_size, 784).astype(np.float32)
     dataset = tf.data.Dataset.from_tensors(images_data)
-    images = dataset.repeat().make_one_shot_iterator().get_next()
+    images = tf.compat.v1.data.make_one_shot_iterator(
+        dataset.repeat()).get_next()
 
     # Create the models and optimizers
     generator = mnist.Generator(data_format())
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
index ca27a85a229d41a85fa26ecdc982da478fe9e202..1a08cc0fd06516be4af5c2b0b46a3ffcf9101e95 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/cvae.ipynb
@@ -470,7 +470,7 @@
         "\n",
         "  if epoch % 1 == 0:\n",
         "    loss = tfe.metrics.Mean()\n",
-        "    for test_x in test_dataset.make_one_shot_iterator():\n",
+        "    for test_x in test_dataset:\n",
         "      loss(compute_loss(model, test_x))\n",
         "    elbo = -loss.result()\n",
         "    display.clear_output(wait=False)\n",
diff --git a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
index 3acecd283cda83992bab0c37cf0b8037ed2cf27a..12c5eff2b4aa901bdab52bf545e95b1e4dce7468 100644
--- a/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb
@@ -1,1184 +1,1174 @@
 {
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "K2s1A9eLRPEj"
+   },
+   "source": [
+    "##### Copyright 2018 The TensorFlow Authors.\n",
+    "\n",
+    "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Cffg2i257iMS"
+   },
+   "source": [
+    "# Image Captioning with Attention\n",
+    "\n",
+    "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
+    "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
+    "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
+    "</td><td>\n",
+    "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "QASbY_HGo4Lq"
+   },
+   "source": [
+    "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
+    "\n",
+    "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
+    "\n",
+    "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
+    "\n",
+    "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
+    "\n",
+    "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
+    "\n",
+    "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
+    "\n",
+    "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
+    "\n",
+    "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
+    "\n",
+    "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
+    "\n",
+    "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
     "colab": {
-      "name": "image_captioning_with_attention.ipynb",
-      "version": "0.3.2",
-      "views": {},
-      "default_view": {},
-      "provenance": [
-        {
-          "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
-          "timestamp": 1530222436922
-        }
-      ],
-      "private_outputs": true,
-      "collapsed_sections": [],
-      "toc_visible": true
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    "kernelspec": {
-      "display_name": "Python 3",
-      "language": "python",
-      "name": "python3"
-    },
-    "accelerator": "GPU"
+    "colab_type": "code",
+    "id": "U8l4RJ0XRPEm"
+   },
+   "outputs": [],
+   "source": [
+    "# Import TensorFlow and enable eager execution\n",
+    "# This code requires TensorFlow version >=1.9\n",
+    "import tensorflow as tf\n",
+    "tf.enable_eager_execution()\n",
+    "\n",
+    "# We'll generate plots of attention in order to see which parts of an image\n",
+    "# our model focuses on during captioning\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "# Scikit-learn includes many helpful utilities\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.utils import shuffle\n",
+    "\n",
+    "import re\n",
+    "import numpy as np\n",
+    "import os\n",
+    "import time\n",
+    "import json\n",
+    "from glob import glob\n",
+    "from PIL import Image\n",
+    "import pickle"
+   ]
   },
-  "cells": [
-    {
-      "metadata": {
-        "id": "K2s1A9eLRPEj",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "##### Copyright 2018 The TensorFlow Authors.\n",
-        "\n",
-        "Licensed under the Apache License, Version 2.0 (the \"License\").\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Cffg2i257iMS",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Image Captioning with Attention\n",
-        "\n",
-        "<table class=\"tfo-notebook-buttons\" align=\"left\"><td>\n",
-        "<a target=\"_blank\"  href=\"https://colab.research.google.com/github/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\">\n",
-        "    <img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>  \n",
-        "</td><td>\n",
-        "<a target=\"_blank\"  href=\"https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/eager/python/examples/generative_examples/image_captioning_with_attention.ipynb\"><img width=32px src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a></td></table>"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "QASbY_HGo4Lq",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "Image captioning is the task of generating a caption for an image. Given an image like this:\n",
-        "\n",
-        "![Man Surfing](https://tensorflow.org/images/surf.jpg) \n",
-        "\n",
-        "[Image Source](https://commons.wikimedia.org/wiki/Surfing#/media/File:Surfing_in_Hawaii.jpg), License: Public Domain\n",
-        "\n",
-        "Our goal is to generate a caption, such as \"a surfer riding on a wave\". Here, we'll use an attention-based model. This enables us to see which parts of the image the model focuses on as it generates a caption.\n",
-        "\n",
-        "![Prediction](https://tensorflow.org/images/imcap_prediction.png)\n",
-        "\n",
-        "This model architecture below is similar to [Show, Attend and Tell: Neural Image Caption Generation with Visual Attention](https://arxiv.org/abs/1502.03044). \n",
-        "\n",
-        "The code uses [tf.keras](https://www.tensorflow.org/programmers_guide/keras) and [eager execution](https://www.tensorflow.org/programmers_guide/eager), which you can learn more about in the linked guides.\n",
-        "\n",
-        "This notebook is an end-to-end example. If you run it, it will download the  [MS-COCO](http://cocodataset.org/#home) dataset, preprocess and cache a subset of the images using Inception V3, train an encoder-decoder model, and use it to generate captions on new images.\n",
-        "\n",
-        "The code requires TensorFlow version >=1.9. If you're running this in [Colab]()\n",
-        "\n",
-        "In this example, we're training on a relatively small amount of data as an example. On a single P100 GPU, this example will take about ~2 hours to train. We train on the first 30,000 captions (corresponding to about ~20,000 images depending on shuffling, as there are multiple captions per image in the dataset)\n"
-      ]
-    },
-    {
-      "metadata": {
-        "id": "U8l4RJ0XRPEm",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Import TensorFlow and enable eager execution\n",
-        "# This code requires TensorFlow version >=1.9\n",
-        "import tensorflow as tf\n",
-        "tf.enable_eager_execution()\n",
-        "\n",
-        "# We'll generate plots of attention in order to see which parts of an image\n",
-        "# our model focuses on during captioning\n",
-        "import matplotlib.pyplot as plt\n",
-        "\n",
-        "# Scikit-learn includes many helpful utilities\n",
-        "from sklearn.model_selection import train_test_split\n",
-        "from sklearn.utils import shuffle\n",
-        "\n",
-        "import re\n",
-        "import numpy as np\n",
-        "import os\n",
-        "import time\n",
-        "import json\n",
-        "from glob import glob\n",
-        "from PIL import Image\n",
-        "import pickle"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "b6qbGw8MRPE5",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Download and prepare the MS-COCO dataset\n",
-        "\n",
-        "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
-        "\n",
-        "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "krQuPYTtRPE7",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
-        "                                          cache_subdir=os.path.abspath('.'),\n",
-        "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
-        "                                          extract = True)\n",
-        "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
-        "\n",
-        "name_of_zip = 'train2014.zip'\n",
-        "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
-        "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
-        "                                      cache_subdir=os.path.abspath('.'),\n",
-        "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
-        "                                      extract = True)\n",
-        "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
-        "else:\n",
-        "  PATH = os.path.abspath('.')+'/train2014/'"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "aANEzb5WwSzg",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Optionally, limit the size of the training set for faster training\n",
-        "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "4G3b8x8_RPFD",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# read the json file\n",
-        "with open(annotation_file, 'r') as f:\n",
-        "    annotations = json.load(f)\n",
-        "\n",
-        "# storing the captions and the image name in vectors\n",
-        "all_captions = []\n",
-        "all_img_name_vector = []\n",
-        "\n",
-        "for annot in annotations['annotations']:\n",
-        "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
-        "    image_id = annot['image_id']\n",
-        "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
-        "    \n",
-        "    all_img_name_vector.append(full_coco_image_path)\n",
-        "    all_captions.append(caption)\n",
-        "\n",
-        "# shuffling the captions and image_names together\n",
-        "# setting a random state\n",
-        "train_captions, img_name_vector = shuffle(all_captions,\n",
-        "                                          all_img_name_vector,\n",
-        "                                          random_state=1)\n",
-        "\n",
-        "# selecting the first 30000 captions from the shuffled set\n",
-        "num_examples = 30000\n",
-        "train_captions = train_captions[:num_examples]\n",
-        "img_name_vector = img_name_vector[:num_examples]"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "mPBMgK34RPFL",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(train_captions), len(all_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "8cSW4u-ORPFQ",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess the images using InceptionV3\n",
-        "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
-        "\n",
-        "First, we will need to convert the images into the format inceptionV3 expects by:\n",
-        "* Resizing the image to (299, 299)\n",
-        "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "zXR0217aRPFR",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def load_image(image_path):\n",
-        "    img = tf.read_file(image_path)\n",
-        "    img = tf.image.decode_jpeg(img, channels=3)\n",
-        "    img = tf.image.resize_images(img, (299, 299))\n",
-        "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
-        "    return img, image_path"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "MDvIu4sXRPFV",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
-        "\n",
-        "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
-        "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
-        "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
-        "* We avoid doing this during training so it does not become a bottleneck. \n",
-        "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "RD3vW4SsRPFW",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
-        "                                                weights='imagenet')\n",
-        "new_input = image_model.input\n",
-        "hidden_layer = image_model.layers[-1].output\n",
-        "\n",
-        "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "rERqlR3WRPGO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caching the features extracted from InceptionV3\n",
-        "\n",
-        "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
-        "\n",
-        "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
-        "\n",
-        "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
-        "\n",
-        "```for img, path in image_dataset:``` \n",
-        "\n",
-        "to:\n",
-        "\n",
-        "```for img, path in tqdm(image_dataset):```."
-      ]
-    },
-    {
-      "metadata": {
-        "id": "Dx_fvbVgRPGQ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# getting the unique images\n",
-        "encode_train = sorted(set(img_name_vector))\n",
-        "\n",
-        "# feel free to change the batch_size according to your system configuration\n",
-        "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
-        "                                encode_train).map(load_image).batch(16)\n",
-        "\n",
-        "for img, path in image_dataset:\n",
-        "  batch_features = image_features_extract_model(img)\n",
-        "  batch_features = tf.reshape(batch_features, \n",
-        "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
-        "\n",
-        "  for bf, p in zip(batch_features, path):\n",
-        "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
-        "    np.save(path_of_feature, bf.numpy())"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "nyqH3zFwRPFi",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Preprocess and tokenize the captions\n",
-        "\n",
-        "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
-        "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
-        "* Finally, we create a word --> index mapping and vice-versa.\n",
-        "* We will then pad all sequences to the be same length as the longest one. "
-      ]
-    },
-    {
-      "metadata": {
-        "id": "HZfK8RhQRPFj",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# This will find the maximum length of any caption in our dataset\n",
-        "def calc_max_length(tensor):\n",
-        "    return max(len(t) for t in tensor)"
-      ],
-      "execution_count": 0,
-      "outputs": []
-    },
-    {
-      "metadata": {
-        "id": "oJGE34aiRPFo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# The steps above is a general process of dealing with text processing\n",
-        "\n",
-        "# choosing the top 5000 words from the vocabulary\n",
-        "top_k = 5000\n",
-        "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
-        "                                                  oov_token=\"<unk>\", \n",
-        "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
-        "tokenizer.fit_on_texts(train_captions)\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "b6qbGw8MRPE5"
+   },
+   "source": [
+    "## Download and prepare the MS-COCO dataset\n",
+    "\n",
+    "We will use the [MS-COCO dataset](http://cocodataset.org/#home) to train our model. This dataset contains >82,000 images, each of which has been annotated with at least 5 different captions. The code below will download and extract the dataset automatically.  \n",
+    "\n",
+    "**Caution: large download ahead**. We'll use the training set, it's a 13GB file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "8Q44tNQVRPFt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "tokenizer.word_index = {key:value for key, value in tokenizer.word_index.items() if value <= top_k}\n",
-        "# putting <unk> token in the word2idx dictionary\n",
-        "tokenizer.word_index[tokenizer.oov_token] = top_k + 1\n",
-        "tokenizer.word_index['<pad>'] = 0"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "krQuPYTtRPE7"
+   },
+   "outputs": [],
+   "source": [
+    "annotation_zip = tf.keras.utils.get_file('captions.zip', \n",
+    "                                          cache_subdir=os.path.abspath('.'),\n",
+    "                                          origin = 'http://images.cocodataset.org/annotations/annotations_trainval2014.zip',\n",
+    "                                          extract = True)\n",
+    "annotation_file = os.path.dirname(annotation_zip)+'/annotations/captions_train2014.json'\n",
+    "\n",
+    "name_of_zip = 'train2014.zip'\n",
+    "if not os.path.exists(os.path.abspath('.') + '/' + name_of_zip):\n",
+    "  image_zip = tf.keras.utils.get_file(name_of_zip, \n",
+    "                                      cache_subdir=os.path.abspath('.'),\n",
+    "                                      origin = 'http://images.cocodataset.org/zips/train2014.zip',\n",
+    "                                      extract = True)\n",
+    "  PATH = os.path.dirname(image_zip)+'/train2014/'\n",
+    "else:\n",
+    "  PATH = os.path.abspath('.')+'/train2014/'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "aANEzb5WwSzg"
+   },
+   "source": [
+    "## Optionally, limit the size of the training set for faster training\n",
+    "For this example, we'll select a subset of 30,000 captions and use these and the corresponding images to train our model. As always, captioning quality will improve if you choose to use more data."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "0fpJb5ojRPFv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating the tokenized vectors\n",
-        "train_seqs = tokenizer.texts_to_sequences(train_captions)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "4G3b8x8_RPFD"
+   },
+   "outputs": [],
+   "source": [
+    "# read the json file\n",
+    "with open(annotation_file, 'r') as f:\n",
+    "    annotations = json.load(f)\n",
+    "\n",
+    "# storing the captions and the image name in vectors\n",
+    "all_captions = []\n",
+    "all_img_name_vector = []\n",
+    "\n",
+    "for annot in annotations['annotations']:\n",
+    "    caption = '<start> ' + annot['caption'] + ' <end>'\n",
+    "    image_id = annot['image_id']\n",
+    "    full_coco_image_path = PATH + 'COCO_train2014_' + '%012d.jpg' % (image_id)\n",
+    "    \n",
+    "    all_img_name_vector.append(full_coco_image_path)\n",
+    "    all_captions.append(caption)\n",
+    "\n",
+    "# shuffling the captions and image_names together\n",
+    "# setting a random state\n",
+    "train_captions, img_name_vector = shuffle(all_captions,\n",
+    "                                          all_img_name_vector,\n",
+    "                                          random_state=1)\n",
+    "\n",
+    "# selecting the first 30000 captions from the shuffled set\n",
+    "num_examples = 30000\n",
+    "train_captions = train_captions[:num_examples]\n",
+    "img_name_vector = img_name_vector[:num_examples]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "olQArbgbRPF1",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# creating a reverse mapping (index -> word)\n",
-        "index_word = {value:key for key, value in tokenizer.word_index.items()}"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "mPBMgK34RPFL"
+   },
+   "outputs": [],
+   "source": [
+    "len(train_captions), len(all_captions)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "8cSW4u-ORPFQ"
+   },
+   "source": [
+    "## Preprocess the images using InceptionV3\n",
+    "Next, we will use InceptionV3 (pretrained on Imagenet) to classify each image. We will extract features from the last convolutional layer. \n",
+    "\n",
+    "First, we will need to convert the images into the format inceptionV3 expects by:\n",
+    "* Resizing the image to (299, 299)\n",
+    "* Using the [preprocess_input](https://www.tensorflow.org/api_docs/python/tf/keras/applications/inception_v3/preprocess_input) method to place the pixels in the range of -1 to 1 (to match the format of the images used to train InceptionV3)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AidglIZVRPF4",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# padding each vector to the max_length of the captions\n",
-        "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
-        "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "zXR0217aRPFR"
+   },
+   "outputs": [],
+   "source": [
+    "def load_image(image_path):\n",
+    "    img = tf.read_file(image_path)\n",
+    "    img = tf.image.decode_jpeg(img, channels=3)\n",
+    "    img = tf.image.resize_images(img, (299, 299))\n",
+    "    img = tf.keras.applications.inception_v3.preprocess_input(img)\n",
+    "    return img, image_path"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "MDvIu4sXRPFV"
+   },
+   "source": [
+    "## Initialize InceptionV3 and load the pretrained Imagenet weights\n",
+    "\n",
+    "To do so, we'll create a tf.keras model where the output layer is the last convolutional layer in the InceptionV3 architecture. \n",
+    "* Each image is forwarded through the network and the vector that we get at the end is stored in a dictionary (image_name --> feature_vector). \n",
+    "* We use the last convolutional layer because we are using attention in this example. The shape of the output of this layer is ```8x8x2048```. \n",
+    "* We avoid doing this during training so it does not become a bottleneck. \n",
+    "* After all the images are passed through the network, we pickle the dictionary and save it to disk."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "gL0wkttkRPGA",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# calculating the max_length \n",
-        "# used to store the attention weights\n",
-        "max_length = calc_max_length(train_seqs)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RD3vW4SsRPFW"
+   },
+   "outputs": [],
+   "source": [
+    "image_model = tf.keras.applications.InceptionV3(include_top=False, \n",
+    "                                                weights='imagenet')\n",
+    "new_input = image_model.input\n",
+    "hidden_layer = image_model.layers[-1].output\n",
+    "\n",
+    "image_features_extract_model = tf.keras.Model(new_input, hidden_layer)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "rERqlR3WRPGO"
+   },
+   "source": [
+    "## Caching the features extracted from InceptionV3\n",
+    "\n",
+    "We will pre-process each image with InceptionV3 and cache the output to disk. Caching the output in RAM would be faster but memory intensive, requiring 8 \\* 8 \\* 2048 floats per image. At the time of writing, this would exceed the memory limitations of Colab (although these may change, an instance appears to have about 12GB of memory currently). \n",
+    "\n",
+    "Performance could be improved with a more sophisticated caching strategy (e.g., by sharding the images to reduce random access disk I/O) at the cost of more code.\n",
+    "\n",
+    "This will take about 10 minutes to run in Colab with a GPU. If you'd like to see a progress bar, you could: install [tqdm](https://github.com/tqdm/tqdm) (```!pip install tqdm```), then change this line: \n",
+    "\n",
+    "```for img, path in image_dataset:``` \n",
+    "\n",
+    "to:\n",
+    "\n",
+    "```for img, path in tqdm(image_dataset):```."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "M3CD75nDpvTI",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Split the data into training and testing"
-      ]
+    "colab_type": "code",
+    "id": "Dx_fvbVgRPGQ"
+   },
+   "outputs": [],
+   "source": [
+    "# getting the unique images\n",
+    "encode_train = sorted(set(img_name_vector))\n",
+    "\n",
+    "# feel free to change the batch_size according to your system configuration\n",
+    "image_dataset = tf.data.Dataset.from_tensor_slices(\n",
+    "                                encode_train).map(load_image).batch(16)\n",
+    "\n",
+    "for img, path in image_dataset:\n",
+    "  batch_features = image_features_extract_model(img)\n",
+    "  batch_features = tf.reshape(batch_features, \n",
+    "                              (batch_features.shape[0], -1, batch_features.shape[3]))\n",
+    "\n",
+    "  for bf, p in zip(batch_features, path):\n",
+    "    path_of_feature = p.numpy().decode(\"utf-8\")\n",
+    "    np.save(path_of_feature, bf.numpy())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nyqH3zFwRPFi"
+   },
+   "source": [
+    "## Preprocess and tokenize the captions\n",
+    "\n",
+    "* First, we'll tokenize the captions (e.g., by splitting on spaces). This will give us a  vocabulary of all the unique words in the data (e.g., \"surfing\", \"football\", etc).\n",
+    "* Next, we'll limit the vocabulary size to the top 5,000 words to save memory. We'll replace all other words with the token \"UNK\" (for unknown).\n",
+    "* Finally, we create a word --> index mapping and vice-versa.\n",
+    "* We will then pad all sequences to the be same length as the longest one. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "iS7DDMszRPGF",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# Create training and validation sets using 80-20 split\n",
-        "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
-        "                                                                    cap_vector, \n",
-        "                                                                    test_size=0.2, \n",
-        "                                                                    random_state=0)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "HZfK8RhQRPFj"
+   },
+   "outputs": [],
+   "source": [
+    "# This will find the maximum length of any caption in our dataset\n",
+    "def calc_max_length(tensor):\n",
+    "    return max(len(t) for t in tensor)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "XmViPkRFRPGH",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "oJGE34aiRPFo"
+   },
+   "outputs": [],
+   "source": [
+    "# The steps above is a general process of dealing with text processing\n",
+    "\n",
+    "# choosing the top 5000 words from the vocabulary\n",
+    "top_k = 5000\n",
+    "tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=top_k, \n",
+    "                                                  oov_token=\"<unk>\", \n",
+    "                                                  filters='!\"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')\n",
+    "tokenizer.fit_on_texts(train_captions)\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "uEWM9xrYcg45",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
-        "\n"
-      ]
+    "colab_type": "code",
+    "id": "8Q44tNQVRPFt"
+   },
+   "outputs": [],
+   "source": [
+    "tokenizer.word_index['<pad>'] = 0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Q3TnZ1ToRPGV",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# feel free to change these parameters according to your system's configuration\n",
-        "\n",
-        "BATCH_SIZE = 64\n",
-        "BUFFER_SIZE = 1000\n",
-        "embedding_dim = 256\n",
-        "units = 512\n",
-        "vocab_size = len(tokenizer.word_index)\n",
-        "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
-        "# these two variables represent that\n",
-        "features_shape = 2048\n",
-        "attention_features_shape = 64"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "0fpJb5ojRPFv"
+   },
+   "outputs": [],
+   "source": [
+    "# creating the tokenized vectors\n",
+    "train_seqs = tokenizer.texts_to_sequences(train_captions)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "SmZS2N0bXG3T",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# loading the numpy files \n",
-        "def map_func(img_name, cap):\n",
-        "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
-        "    return img_tensor, cap"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AidglIZVRPF4"
+   },
+   "outputs": [],
+   "source": [
+    "# padding each vector to the max_length of the captions\n",
+    "# if the max_length parameter is not provided, pad_sequences calculates that automatically\n",
+    "cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "FDF_Nm3tRPGZ",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
-        "\n",
-        "# using map to load the numpy files in parallel\n",
-        "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
-        "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
-        "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
-        "\n",
-        "# shuffling and batching\n",
-        "dataset = dataset.shuffle(BUFFER_SIZE)\n",
-        "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
-        "dataset = dataset.batch(BATCH_SIZE)\n",
-        "dataset = dataset.prefetch(1)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "gL0wkttkRPGA"
+   },
+   "outputs": [],
+   "source": [
+    "# calculating the max_length \n",
+    "# used to store the attention weights\n",
+    "max_length = calc_max_length(train_seqs)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "M3CD75nDpvTI"
+   },
+   "source": [
+    "## Split the data into training and testing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "nrvoDphgRPGd",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Model\n",
-        "\n",
-        "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
-        "\n",
-        "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
-        "\n",
-        "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
-        "* We squash that to a shape of (64, 2048).\n",
-        "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
-        "* The RNN(here GRU) attends over the image to predict the next word."
-      ]
+    "colab_type": "code",
+    "id": "iS7DDMszRPGF"
+   },
+   "outputs": [],
+   "source": [
+    "# Create training and validation sets using 80-20 split\n",
+    "img_name_train, img_name_val, cap_train, cap_val = train_test_split(img_name_vector, \n",
+    "                                                                    cap_vector, \n",
+    "                                                                    test_size=0.2, \n",
+    "                                                                    random_state=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AAppCGLKRPGd",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def gru(units):\n",
-        "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
-        "  # significant speedup).\n",
-        "  if tf.test.is_gpu_available():\n",
-        "    return tf.keras.layers.CuDNNGRU(units, \n",
-        "                                    return_sequences=True, \n",
-        "                                    return_state=True, \n",
-        "                                    recurrent_initializer='glorot_uniform')\n",
-        "  else:\n",
-        "    return tf.keras.layers.GRU(units, \n",
-        "                               return_sequences=True, \n",
-        "                               return_state=True, \n",
-        "                               recurrent_activation='sigmoid', \n",
-        "                               recurrent_initializer='glorot_uniform')"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "XmViPkRFRPGH"
+   },
+   "outputs": [],
+   "source": [
+    "len(img_name_train), len(cap_train), len(img_name_val), len(cap_val)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "uEWM9xrYcg45"
+   },
+   "source": [
+    "## Our images and captions are ready! Next, let's create a tf.data dataset to use for training our model.\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "ja2LFTMSdeV3",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class BahdanauAttention(tf.keras.Model):\n",
-        "  def __init__(self, units):\n",
-        "    super(BahdanauAttention, self).__init__()\n",
-        "    self.W1 = tf.keras.layers.Dense(units)\n",
-        "    self.W2 = tf.keras.layers.Dense(units)\n",
-        "    self.V = tf.keras.layers.Dense(1)\n",
-        "  \n",
-        "  def call(self, features, hidden):\n",
-        "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
-        "    \n",
-        "    # hidden shape == (batch_size, hidden_size)\n",
-        "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
-        "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
-        "    \n",
-        "    # score shape == (batch_size, 64, hidden_size)\n",
-        "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
-        "    \n",
-        "    # attention_weights shape == (batch_size, 64, 1)\n",
-        "    # we get 1 at the last axis because we are applying score to self.V\n",
-        "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
-        "    \n",
-        "    # context_vector shape after sum == (batch_size, hidden_size)\n",
-        "    context_vector = attention_weights * features\n",
-        "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
-        "    \n",
-        "    return context_vector, attention_weights"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Q3TnZ1ToRPGV"
+   },
+   "outputs": [],
+   "source": [
+    "# feel free to change these parameters according to your system's configuration\n",
+    "\n",
+    "BATCH_SIZE = 64\n",
+    "BUFFER_SIZE = 1000\n",
+    "embedding_dim = 256\n",
+    "units = 512\n",
+    "vocab_size = len(tokenizer.word_index)\n",
+    "# shape of the vector extracted from InceptionV3 is (64, 2048)\n",
+    "# these two variables represent that\n",
+    "features_shape = 2048\n",
+    "attention_features_shape = 64"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "AZ7R1RxHRPGf",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class CNN_Encoder(tf.keras.Model):\n",
-        "    # Since we have already extracted the features and dumped it using pickle\n",
-        "    # This encoder passes those features through a Fully connected layer\n",
-        "    def __init__(self, embedding_dim):\n",
-        "        super(CNN_Encoder, self).__init__()\n",
-        "        # shape after fc == (batch_size, 64, embedding_dim)\n",
-        "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
-        "        \n",
-        "    def call(self, x):\n",
-        "        x = self.fc(x)\n",
-        "        x = tf.nn.relu(x)\n",
-        "        return x"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "SmZS2N0bXG3T"
+   },
+   "outputs": [],
+   "source": [
+    "# loading the numpy files \n",
+    "def map_func(img_name, cap):\n",
+    "    img_tensor = np.load(img_name.decode('utf-8')+'.npy')\n",
+    "    return img_tensor, cap"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "V9UbGQmERPGi",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "class RNN_Decoder(tf.keras.Model):\n",
-        "  def __init__(self, embedding_dim, units, vocab_size):\n",
-        "    super(RNN_Decoder, self).__init__()\n",
-        "    self.units = units\n",
-        "\n",
-        "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
-        "    self.gru = gru(self.units)\n",
-        "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
-        "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
-        "    \n",
-        "    self.attention = BahdanauAttention(self.units)\n",
-        "        \n",
-        "  def call(self, x, features, hidden):\n",
-        "    # defining attention as a separate model\n",
-        "    context_vector, attention_weights = self.attention(features, hidden)\n",
-        "    \n",
-        "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
-        "    x = self.embedding(x)\n",
-        "    \n",
-        "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
-        "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
-        "    \n",
-        "    # passing the concatenated vector to the GRU\n",
-        "    output, state = self.gru(x)\n",
-        "    \n",
-        "    # shape == (batch_size, max_length, hidden_size)\n",
-        "    x = self.fc1(output)\n",
-        "    \n",
-        "    # x shape == (batch_size * max_length, hidden_size)\n",
-        "    x = tf.reshape(x, (-1, x.shape[2]))\n",
-        "    \n",
-        "    # output shape == (batch_size * max_length, vocab)\n",
-        "    x = self.fc2(x)\n",
-        "\n",
-        "    return x, state, attention_weights\n",
-        "\n",
-        "  def reset_state(self, batch_size):\n",
-        "    return tf.zeros((batch_size, self.units))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "FDF_Nm3tRPGZ"
+   },
+   "outputs": [],
+   "source": [
+    "dataset = tf.data.Dataset.from_tensor_slices((img_name_train, cap_train))\n",
+    "\n",
+    "# using map to load the numpy files in parallel\n",
+    "# NOTE: Be sure to set num_parallel_calls to the number of CPU cores you have\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/py_func\n",
+    "dataset = dataset.map(lambda item1, item2: tf.py_func(\n",
+    "          map_func, [item1, item2], [tf.float32, tf.int32]), num_parallel_calls=8)\n",
+    "\n",
+    "# shuffling and batching\n",
+    "dataset = dataset.shuffle(BUFFER_SIZE)\n",
+    "# https://www.tensorflow.org/api_docs/python/tf/contrib/data/batch_and_drop_remainder\n",
+    "dataset = dataset.batch(BATCH_SIZE)\n",
+    "dataset = dataset.prefetch(1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "nrvoDphgRPGd"
+   },
+   "source": [
+    "## Model\n",
+    "\n",
+    "Fun fact, the decoder below is identical to the one in the example for [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb).\n",
+    "\n",
+    "The model architecture is inspired by the [Show, Attend and Tell](https://arxiv.org/pdf/1502.03044.pdf) paper.\n",
+    "\n",
+    "* In this example, we extract the features from the lower convolutional layer of InceptionV3 giving us a vector of shape (8, 8, 2048). \n",
+    "* We squash that to a shape of (64, 2048).\n",
+    "* This vector is then passed through the CNN Encoder(which consists of a single Fully connected layer).\n",
+    "* The RNN(here GRU) attends over the image to predict the next word."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Qs_Sr03wRPGk",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "encoder = CNN_Encoder(embedding_dim)\n",
-        "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "AAppCGLKRPGd"
+   },
+   "outputs": [],
+   "source": [
+    "def gru(units):\n",
+    "  # If you have a GPU, we recommend using the CuDNNGRU layer (it provides a \n",
+    "  # significant speedup).\n",
+    "  if tf.test.is_gpu_available():\n",
+    "    return tf.keras.layers.CuDNNGRU(units, \n",
+    "                                    return_sequences=True, \n",
+    "                                    return_state=True, \n",
+    "                                    recurrent_initializer='glorot_uniform')\n",
+    "  else:\n",
+    "    return tf.keras.layers.GRU(units, \n",
+    "                               return_sequences=True, \n",
+    "                               return_state=True, \n",
+    "                               recurrent_activation='sigmoid', \n",
+    "                               recurrent_initializer='glorot_uniform')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "-bYN7xA0RPGl",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "optimizer = tf.train.AdamOptimizer()\n",
-        "\n",
-        "# We are masking the loss calculated for padding\n",
-        "def loss_function(real, pred):\n",
-        "    mask = 1 - np.equal(real, 0)\n",
-        "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
-        "    return tf.reduce_mean(loss_)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "ja2LFTMSdeV3"
+   },
+   "outputs": [],
+   "source": [
+    "class BahdanauAttention(tf.keras.Model):\n",
+    "  def __init__(self, units):\n",
+    "    super(BahdanauAttention, self).__init__()\n",
+    "    self.W1 = tf.keras.layers.Dense(units)\n",
+    "    self.W2 = tf.keras.layers.Dense(units)\n",
+    "    self.V = tf.keras.layers.Dense(1)\n",
+    "  \n",
+    "  def call(self, features, hidden):\n",
+    "    # features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)\n",
+    "    \n",
+    "    # hidden shape == (batch_size, hidden_size)\n",
+    "    # hidden_with_time_axis shape == (batch_size, 1, hidden_size)\n",
+    "    hidden_with_time_axis = tf.expand_dims(hidden, 1)\n",
+    "    \n",
+    "    # score shape == (batch_size, 64, hidden_size)\n",
+    "    score = tf.nn.tanh(self.W1(features) + self.W2(hidden_with_time_axis))\n",
+    "    \n",
+    "    # attention_weights shape == (batch_size, 64, 1)\n",
+    "    # we get 1 at the last axis because we are applying score to self.V\n",
+    "    attention_weights = tf.nn.softmax(self.V(score), axis=1)\n",
+    "    \n",
+    "    # context_vector shape after sum == (batch_size, hidden_size)\n",
+    "    context_vector = attention_weights * features\n",
+    "    context_vector = tf.reduce_sum(context_vector, axis=1)\n",
+    "    \n",
+    "    return context_vector, attention_weights"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "PHod7t72RPGn",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Training\n",
-        "\n",
-        "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
-        "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
-        "* The decoder returns the predictions and the decoder hidden state.\n",
-        "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
-        "* Use teacher forcing to decide the next input to the decoder.\n",
-        "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
-        "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
-      ]
+    "colab_type": "code",
+    "id": "AZ7R1RxHRPGf"
+   },
+   "outputs": [],
+   "source": [
+    "class CNN_Encoder(tf.keras.Model):\n",
+    "    # Since we have already extracted the features and dumped it using pickle\n",
+    "    # This encoder passes those features through a Fully connected layer\n",
+    "    def __init__(self, embedding_dim):\n",
+    "        super(CNN_Encoder, self).__init__()\n",
+    "        # shape after fc == (batch_size, 64, embedding_dim)\n",
+    "        self.fc = tf.keras.layers.Dense(embedding_dim)\n",
+    "        \n",
+    "    def call(self, x):\n",
+    "        x = self.fc(x)\n",
+    "        x = tf.nn.relu(x)\n",
+    "        return x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Vt4WZ5mhJE-E",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# adding this in a separate cell because if you run the training cell \n",
-        "# many times, the loss_plot array will be reset\n",
-        "loss_plot = []"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "V9UbGQmERPGi"
+   },
+   "outputs": [],
+   "source": [
+    "class RNN_Decoder(tf.keras.Model):\n",
+    "  def __init__(self, embedding_dim, units, vocab_size):\n",
+    "    super(RNN_Decoder, self).__init__()\n",
+    "    self.units = units\n",
+    "\n",
+    "    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)\n",
+    "    self.gru = gru(self.units)\n",
+    "    self.fc1 = tf.keras.layers.Dense(self.units)\n",
+    "    self.fc2 = tf.keras.layers.Dense(vocab_size)\n",
+    "    \n",
+    "    self.attention = BahdanauAttention(self.units)\n",
+    "        \n",
+    "  def call(self, x, features, hidden):\n",
+    "    # defining attention as a separate model\n",
+    "    context_vector, attention_weights = self.attention(features, hidden)\n",
+    "    \n",
+    "    # x shape after passing through embedding == (batch_size, 1, embedding_dim)\n",
+    "    x = self.embedding(x)\n",
+    "    \n",
+    "    # x shape after concatenation == (batch_size, 1, embedding_dim + hidden_size)\n",
+    "    x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)\n",
+    "    \n",
+    "    # passing the concatenated vector to the GRU\n",
+    "    output, state = self.gru(x)\n",
+    "    \n",
+    "    # shape == (batch_size, max_length, hidden_size)\n",
+    "    x = self.fc1(output)\n",
+    "    \n",
+    "    # x shape == (batch_size * max_length, hidden_size)\n",
+    "    x = tf.reshape(x, (-1, x.shape[2]))\n",
+    "    \n",
+    "    # output shape == (batch_size * max_length, vocab)\n",
+    "    x = self.fc2(x)\n",
+    "\n",
+    "    return x, state, attention_weights\n",
+    "\n",
+    "  def reset_state(self, batch_size):\n",
+    "    return tf.zeros((batch_size, self.units))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "UlA4VIQpRPGo",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "EPOCHS = 20\n",
-        "\n",
-        "for epoch in range(EPOCHS):\n",
-        "    start = time.time()\n",
-        "    total_loss = 0\n",
-        "    \n",
-        "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
-        "        loss = 0\n",
-        "        \n",
-        "        # initializing the hidden state for each batch\n",
-        "        # because the captions are not related from image to image\n",
-        "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
-        "\n",
-        "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
-        "        \n",
-        "        with tf.GradientTape() as tape:\n",
-        "            features = encoder(img_tensor)\n",
-        "            \n",
-        "            for i in range(1, target.shape[1]):\n",
-        "                # passing the features through the decoder\n",
-        "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "                loss += loss_function(target[:, i], predictions)\n",
-        "                \n",
-        "                # using teacher forcing\n",
-        "                dec_input = tf.expand_dims(target[:, i], 1)\n",
-        "        \n",
-        "        total_loss += (loss / int(target.shape[1]))\n",
-        "        \n",
-        "        variables = encoder.variables + decoder.variables\n",
-        "        \n",
-        "        gradients = tape.gradient(loss, variables) \n",
-        "        \n",
-        "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
-        "        \n",
-        "        if batch % 100 == 0:\n",
-        "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
-        "                                                          batch, \n",
-        "                                                          loss.numpy() / int(target.shape[1])))\n",
-        "    # storing the epoch end loss value to plot later\n",
-        "    loss_plot.append(total_loss / len(cap_vector))\n",
-        "    \n",
-        "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
-        "                                         total_loss/len(cap_vector)))\n",
-        "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "Qs_Sr03wRPGk"
+   },
+   "outputs": [],
+   "source": [
+    "encoder = CNN_Encoder(embedding_dim)\n",
+    "decoder = RNN_Decoder(embedding_dim, units, vocab_size)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "1Wm83G-ZBPcC",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "plt.plot(loss_plot)\n",
-        "plt.xlabel('Epochs')\n",
-        "plt.ylabel('Loss')\n",
-        "plt.title('Loss Plot')\n",
-        "plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "-bYN7xA0RPGl"
+   },
+   "outputs": [],
+   "source": [
+    "optimizer = tf.train.AdamOptimizer()\n",
+    "\n",
+    "# We are masking the loss calculated for padding\n",
+    "def loss_function(real, pred):\n",
+    "    mask = 1 - np.equal(real, 0)\n",
+    "    loss_ = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=real, logits=pred) * mask\n",
+    "    return tf.reduce_mean(loss_)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "PHod7t72RPGn"
+   },
+   "source": [
+    "## Training\n",
+    "\n",
+    "* We extract the features stored in the respective `.npy` files and then pass those features through the encoder.\n",
+    "* The encoder output, hidden state(initialized to 0) and the decoder input (which is the start token) is passed to the decoder.\n",
+    "* The decoder returns the predictions and the decoder hidden state.\n",
+    "* The decoder hidden state is then passed back into the model and the predictions are used to calculate the loss.\n",
+    "* Use teacher forcing to decide the next input to the decoder.\n",
+    "* Teacher forcing is the technique where the target word is passed as the next input to the decoder.\n",
+    "* The final step is to calculate the gradients and apply it to the optimizer and backpropagate.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "xGvOcLQKghXN",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Caption!\n",
-        "\n",
-        "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
-        "* Stop predicting when the model predicts the end token.\n",
-        "* And store the attention weights for every time step."
-      ]
+    "colab_type": "code",
+    "id": "Vt4WZ5mhJE-E"
+   },
+   "outputs": [],
+   "source": [
+    "# adding this in a separate cell because if you run the training cell \n",
+    "# many times, the loss_plot array will be reset\n",
+    "loss_plot = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "RCWpDtyNRPGs",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def evaluate(image):\n",
-        "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
-        "\n",
-        "    hidden = decoder.reset_state(batch_size=1)\n",
-        "\n",
-        "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
-        "    img_tensor_val = image_features_extract_model(temp_input)\n",
-        "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
-        "\n",
-        "    features = encoder(img_tensor_val)\n",
-        "\n",
-        "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
-        "    result = []\n",
-        "\n",
-        "    for i in range(max_length):\n",
-        "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
-        "\n",
-        "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
-        "\n",
-        "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
-        "        result.append(index_word[predicted_id])\n",
-        "\n",
-        "        if index_word[predicted_id] == '<end>':\n",
-        "            return result, attention_plot\n",
-        "\n",
-        "        dec_input = tf.expand_dims([predicted_id], 0)\n",
-        "\n",
-        "    attention_plot = attention_plot[:len(result), :]\n",
-        "    return result, attention_plot"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "UlA4VIQpRPGo"
+   },
+   "outputs": [],
+   "source": [
+    "EPOCHS = 20\n",
+    "\n",
+    "for epoch in range(EPOCHS):\n",
+    "    start = time.time()\n",
+    "    total_loss = 0\n",
+    "    \n",
+    "    for (batch, (img_tensor, target)) in enumerate(dataset):\n",
+    "        loss = 0\n",
+    "        \n",
+    "        # initializing the hidden state for each batch\n",
+    "        # because the captions are not related from image to image\n",
+    "        hidden = decoder.reset_state(batch_size=target.shape[0])\n",
+    "\n",
+    "        dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * BATCH_SIZE, 1)\n",
+    "        \n",
+    "        with tf.GradientTape() as tape:\n",
+    "            features = encoder(img_tensor)\n",
+    "            \n",
+    "            for i in range(1, target.shape[1]):\n",
+    "                # passing the features through the decoder\n",
+    "                predictions, hidden, _ = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "                loss += loss_function(target[:, i], predictions)\n",
+    "                \n",
+    "                # using teacher forcing\n",
+    "                dec_input = tf.expand_dims(target[:, i], 1)\n",
+    "        \n",
+    "        total_loss += (loss / int(target.shape[1]))\n",
+    "        \n",
+    "        variables = encoder.variables + decoder.variables\n",
+    "        \n",
+    "        gradients = tape.gradient(loss, variables) \n",
+    "        \n",
+    "        optimizer.apply_gradients(zip(gradients, variables), tf.train.get_or_create_global_step())\n",
+    "        \n",
+    "        if batch % 100 == 0:\n",
+    "            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, \n",
+    "                                                          batch, \n",
+    "                                                          loss.numpy() / int(target.shape[1])))\n",
+    "    # storing the epoch end loss value to plot later\n",
+    "    loss_plot.append(total_loss / len(cap_vector))\n",
+    "    \n",
+    "    print ('Epoch {} Loss {:.6f}'.format(epoch + 1, \n",
+    "                                         total_loss/len(cap_vector)))\n",
+    "    print ('Time taken for 1 epoch {} sec\\n'.format(time.time() - start))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "fD_y7PD6RPGt",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "def plot_attention(image, result, attention_plot):\n",
-        "    temp_image = np.array(Image.open(image))\n",
-        "\n",
-        "    fig = plt.figure(figsize=(10, 10))\n",
-        "    \n",
-        "    len_result = len(result)\n",
-        "    for l in range(len_result):\n",
-        "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
-        "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
-        "        ax.set_title(result[l])\n",
-        "        img = ax.imshow(temp_image)\n",
-        "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
-        "\n",
-        "    plt.tight_layout()\n",
-        "    plt.show()"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "1Wm83G-ZBPcC"
+   },
+   "outputs": [],
+   "source": [
+    "plt.plot(loss_plot)\n",
+    "plt.xlabel('Epochs')\n",
+    "plt.ylabel('Loss')\n",
+    "plt.title('Loss Plot')\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "xGvOcLQKghXN"
+   },
+   "source": [
+    "## Caption!\n",
+    "\n",
+    "* The evaluate function is similar to the training loop, except we don't use teacher forcing here. The input to the decoder at each time step is its previous predictions along with the hidden state and the encoder output.\n",
+    "* Stop predicting when the model predicts the end token.\n",
+    "* And store the attention weights for every time step."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "io7ws3ReRPGv",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "# captions on the validation set\n",
-        "rid = np.random.randint(0, len(img_name_val))\n",
-        "image = img_name_val[rid]\n",
-        "real_caption = ' '.join([index_word[i] for i in cap_val[rid] if i not in [0]])\n",
-        "result, attention_plot = evaluate(image)\n",
-        "\n",
-        "print ('Real Caption:', real_caption)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(img_name_val[rid])"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "RCWpDtyNRPGs"
+   },
+   "outputs": [],
+   "source": [
+    "def evaluate(image):\n",
+    "    attention_plot = np.zeros((max_length, attention_features_shape))\n",
+    "\n",
+    "    hidden = decoder.reset_state(batch_size=1)\n",
+    "\n",
+    "    temp_input = tf.expand_dims(load_image(image)[0], 0)\n",
+    "    img_tensor_val = image_features_extract_model(temp_input)\n",
+    "    img_tensor_val = tf.reshape(img_tensor_val, (img_tensor_val.shape[0], -1, img_tensor_val.shape[3]))\n",
+    "\n",
+    "    features = encoder(img_tensor_val)\n",
+    "\n",
+    "    dec_input = tf.expand_dims([tokenizer.word_index['<start>']], 0)\n",
+    "    result = []\n",
+    "\n",
+    "    for i in range(max_length):\n",
+    "        predictions, hidden, attention_weights = decoder(dec_input, features, hidden)\n",
+    "\n",
+    "        attention_plot[i] = tf.reshape(attention_weights, (-1, )).numpy()\n",
+    "\n",
+    "        predicted_id = tf.argmax(predictions[0]).numpy()\n",
+    "        result.append(tokenizer.index_word[predicted_id])\n",
+    "\n",
+    "        if tokenizer.index_word[predicted_id] == '<end>':\n",
+    "            return result, attention_plot\n",
+    "\n",
+    "        dec_input = tf.expand_dims([predicted_id], 0)\n",
+    "\n",
+    "    attention_plot = attention_plot[:len(result), :]\n",
+    "    return result, attention_plot"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "Rprk3HEvZuxb",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "## Try it on your own images\n",
-        "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
-      ]
+    "colab_type": "code",
+    "id": "fD_y7PD6RPGt"
+   },
+   "outputs": [],
+   "source": [
+    "def plot_attention(image, result, attention_plot):\n",
+    "    temp_image = np.array(Image.open(image))\n",
+    "\n",
+    "    fig = plt.figure(figsize=(10, 10))\n",
+    "    \n",
+    "    len_result = len(result)\n",
+    "    for l in range(len_result):\n",
+    "        temp_att = np.resize(attention_plot[l], (8, 8))\n",
+    "        ax = fig.add_subplot(len_result//2, len_result//2, l+1)\n",
+    "        ax.set_title(result[l])\n",
+    "        img = ax.imshow(temp_image)\n",
+    "        ax.imshow(temp_att, cmap='gray', alpha=0.6, extent=img.get_extent())\n",
+    "\n",
+    "    plt.tight_layout()\n",
+    "    plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
-    {
-      "metadata": {
-        "id": "9Psd1quzaAWg",
-        "colab_type": "code",
-        "colab": {
-          "autoexec": {
-            "startup": false,
-            "wait_interval": 0
-          }
-        }
-      },
-      "cell_type": "code",
-      "source": [
-        "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
-        "image_extension = image_url[-4:]\n",
-        "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
-        "                                     origin=image_url)\n",
-        "\n",
-        "result, attention_plot = evaluate(image_path)\n",
-        "print ('Prediction Caption:', ' '.join(result))\n",
-        "plot_attention(image_path, result, attention_plot)\n",
-        "# opening the image\n",
-        "Image.open(image_path)"
-      ],
-      "execution_count": 0,
-      "outputs": []
+    "colab_type": "code",
+    "id": "io7ws3ReRPGv"
+   },
+   "outputs": [],
+   "source": [
+    "# captions on the validation set\n",
+    "rid = np.random.randint(0, len(img_name_val))\n",
+    "image = img_name_val[rid]\n",
+    "real_caption = ' '.join([tokenizer.index_word[i] for i in cap_val[rid] if i not in [0]])\n",
+    "result, attention_plot = evaluate(image)\n",
+    "\n",
+    "print ('Real Caption:', real_caption)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(img_name_val[rid])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "Rprk3HEvZuxb"
+   },
+   "source": [
+    "## Try it on your own images\n",
+    "For fun, below we've provided a method you can use to caption your own images with the model we've just trained. Keep in mind, it was trained on a relatively small amount of data, and your images may be different from the training data (so be prepared for weird results!)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "autoexec": {
+      "startup": false,
+      "wait_interval": 0
+     }
     },
+    "colab_type": "code",
+    "id": "9Psd1quzaAWg"
+   },
+   "outputs": [],
+   "source": [
+    "image_url = 'https://tensorflow.org/images/surf.jpg'\n",
+    "image_extension = image_url[-4:]\n",
+    "image_path = tf.keras.utils.get_file('image'+image_extension, \n",
+    "                                     origin=image_url)\n",
+    "\n",
+    "result, attention_plot = evaluate(image_path)\n",
+    "print ('Prediction Caption:', ' '.join(result))\n",
+    "plot_attention(image_path, result, attention_plot)\n",
+    "# opening the image\n",
+    "Image.open(image_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "colab_type": "text",
+    "id": "VJZXyJco6uLO"
+   },
+   "source": [
+    "# Next steps\n",
+    "\n",
+    "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
+   ]
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "collapsed_sections": [],
+   "default_view": {},
+   "name": "image_captioning_with_attention.ipynb",
+   "private_outputs": true,
+   "provenance": [
     {
-      "metadata": {
-        "id": "VJZXyJco6uLO",
-        "colab_type": "text"
-      },
-      "cell_type": "markdown",
-      "source": [
-        "# Next steps\n",
-        "\n",
-        "Congrats! You've just trained an image captioning model with attention. Next, we recommend taking a look at this example [Neural Machine Translation with Attention]( https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb). It uses a similar architecture to translate between Spanish and English sentences. You can also experiment with training the code in this notebook on a different dataset."
-      ]
+     "file_id": "1HI8OK2sMjcx9CTWVn0122QAHOuXaOaMg",
+     "timestamp": 1530222436922
     }
-  ]
+   ],
+   "toc_visible": true,
+   "version": "0.3.2",
+   "views": {}
+  },
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
 }
diff --git a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
index 557ad42752144243ae3da61b955b31398cba846e..d412b25b368260b81256fd58034330b884261b2b 100644
--- a/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/linear_regression/linear_regression_graph_test.py
@@ -36,7 +36,7 @@ class GraphLinearRegressionBenchmark(tf.test.Benchmark):
         noise_level=0.01,
         batch_size=batch_size,
         num_batches=num_batches)
-    iterator = dataset.make_initializable_iterator()
+    iterator = tf.compat.v1.data.make_initializable_iterator(dataset)
     x, y = iterator.get_next()
 
     model = linear_regression.LinearModel()
diff --git a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
index 480777d948769b56ac1cc3be2052fe48459e98d6..66d52a74943d0d81fde05ce51b019558b327978d 100644
--- a/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
+++ b/tensorflow/contrib/eager/python/examples/nmt_with_attention/nmt_with_attention.ipynb
@@ -768,7 +768,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'hace mucho frio aqui.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -781,7 +781,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'esta es mi vida.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -794,7 +794,7 @@
       },
       "outputs": [],
       "source": [
-        "translate('¿todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'todavia estan en casa?', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
@@ -808,7 +808,7 @@
       "outputs": [],
       "source": [
         "# wrong translation\n",
-        "translate('trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
+        "translate(u'trata de averiguarlo.', encoder, decoder, inp_lang, targ_lang, max_length_inp, max_length_targ)"
       ]
     },
     {
diff --git a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
index f3bb978875e226f58d6a00e09154191673a97415..fb7975d8fe867711cff31d627788a2d62a520aa9 100644
--- a/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/resnet50/resnet50_graph_test.py
@@ -142,7 +142,8 @@ class ResNet50Benchmarks(tf.test.Benchmark):
       with tf.Graph().as_default():
         np_images, np_labels = random_batch(batch_size)
         dataset = tf.data.Dataset.from_tensors((np_images, np_labels)).repeat()
-        (images, labels) = dataset.make_one_shot_iterator().get_next()
+        images, labels = tf.compat.v1.data.make_one_shot_iterator(
+            dataset).get_next()
 
         model = resnet50.ResNet50(data_format())
         logits = model(images, training=True)
diff --git a/tensorflow/contrib/eager/python/examples/revnet/main.py b/tensorflow/contrib/eager/python/examples/revnet/main.py
index b702e91f92220c2a9003a1b82411131332012a9e..9585f3565f83af724b6336e466d3671443ba2361 100644
--- a/tensorflow/contrib/eager/python/examples/revnet/main.py
+++ b/tensorflow/contrib/eager/python/examples/revnet/main.py
@@ -72,14 +72,11 @@ def main(_):
     train_one_iter(model, x, y, optimizer, global_step=global_step)
 
     if global_step.numpy() % config.log_every == 0:
-      it_test = ds_test.make_one_shot_iterator()
-      acc_test, loss_test = evaluate(model, it_test)
+      acc_test, loss_test = evaluate(model, ds_test)
 
       if FLAGS.validate:
-        it_train = ds_train_one_shot.make_one_shot_iterator()
-        it_validation = ds_validation.make_one_shot_iterator()
-        acc_train, loss_train = evaluate(model, it_train)
-        acc_validation, loss_validation = evaluate(model, it_validation)
+        acc_train, loss_train = evaluate(model, ds_train_one_shot)
+        acc_validation, loss_validation = evaluate(model, ds_validation)
         print("Iter {}, "
               "training set accuracy {:.4f}, loss {:.4f}; "
               "validation set accuracy {:.4f}, loss {:.4f}; "
@@ -218,11 +215,11 @@ def train_one_iter(model, inputs, labels, optimizer, global_step=None):
   return logits, loss
 
 
-def evaluate(model, iterator):
+def evaluate(model, dataset):
   """Compute accuracy with the given dataset iterator."""
   mean_loss = tfe.metrics.Mean()
   accuracy = tfe.metrics.Accuracy()
-  for x, y in iterator:
+  for x, y in dataset:
     logits, _ = model(x, training=False)
     loss = model.compute_loss(logits=logits, labels=y)
     accuracy(
diff --git a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
index 63b5c4c54d13e9c2448ec1f572ca1389f2443bef..770484abed96e540cf75cc5368a1410c31a8d2d0 100644
--- a/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
+++ b/tensorflow/contrib/eager/python/examples/rnn_ptb/rnn_ptb_graph_test.py
@@ -82,7 +82,7 @@ class PTBBenchmark(tf.test.Benchmark):
         tf.ones(
             [PTBBenchmark.SEQ_LEN, PTBBenchmark.BATCH_SIZE],
             dtype=tf.int64)).repeat(num_iters + num_warmup)
-    inputs = dataset.make_one_shot_iterator().get_next()
+    inputs = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       outputs = model(inputs, training=True)
@@ -124,7 +124,8 @@ class PTBBenchmark(tf.test.Benchmark):
             dtype=tf.int64)).repeat(num_iters + num_warmup)
     # inputs and labels have the same shape
     dataset = tf.data.Dataset.zip((dataset, dataset))
-    (inputs, labels) = dataset.make_one_shot_iterator().get_next()
+    (inputs, labels) = tf.compat.v1.data.make_one_shot_iterator(
+        dataset).get_next()
 
     with tf.device(tf.test.gpu_device_name()):
       optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
diff --git a/tensorflow/contrib/eager/python/metrics_impl.py b/tensorflow/contrib/eager/python/metrics_impl.py
index c88c0f52eead58c7562cda1a49d164c1d857822d..566246de4957c1dc5919c10e22146706f9e50be8 100644
--- a/tensorflow/contrib/eager/python/metrics_impl.py
+++ b/tensorflow/contrib/eager/python/metrics_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import smart_cond
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
@@ -354,9 +355,10 @@ class Mean(Metric):
     def write_summary_f():
       summary_ops.scalar(name=self.name, tensor=t)
       return t
-    control_flow_ops.cond(write_summary,
+    smart_cond.smart_cond(write_summary,
                           write_summary_f,
-                          lambda: t)
+                          lambda: t,
+                          name="")
     return t
 
 
diff --git a/tensorflow/contrib/eager/python/metrics_test.py b/tensorflow/contrib/eager/python/metrics_test.py
index 9d2d172752c7f3f3ee6eaa11ab8952313a4a3543..39e5957f5d1760613f2c33607c0bdb163040efb4 100644
--- a/tensorflow/contrib/eager/python/metrics_test.py
+++ b/tensorflow/contrib/eager/python/metrics_test.py
@@ -49,18 +49,6 @@ class MetricsTest(test.TestCase):
     self.assertEqual(dtypes.float64, m.dtype)
     self.assertEqual(dtypes.float64, m.result().dtype)
 
-  def testSummaryArg(self):
-    m = metrics.Mean()
-    m([1, 10, 100])
-    m(1000)
-    m([10000.0, 100000.0])
-    self.assertEqual(111111.0/6, m.result(write_summary=True).numpy())
-    self.assertEqual(111111.0/6, m.result(write_summary=False).numpy())
-    with self.assertRaises(ValueError):
-      m.result(write_summary=5)
-    with self.assertRaises(ValueError):
-      m.result(write_summary=[True])
-
   def testVariableCollections(self):
     with context.graph_mode(), ops.Graph().as_default():
       m = metrics.Mean()
diff --git a/tensorflow/contrib/eager/python/network.py b/tensorflow/contrib/eager/python/network.py
index f801d9a47b2f831a48d9b6335c69612c1356d800..5cc0c4f23d9d641ff1452c7cc9c1fcde612a33a2 100644
--- a/tensorflow/contrib/eager/python/network.py
+++ b/tensorflow/contrib/eager/python/network.py
@@ -24,7 +24,7 @@ import weakref
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.layers import base
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
@@ -220,7 +220,7 @@ class Network(base.Layer):
         avoid_names = parent_network._owned_layers
         name_uid_map = parent_network._sub_layer_name_uids
       else:
-        name_uid_map = keras_base_layer.get_default_graph_uid_map()
+        name_uid_map = base_layer_utils.get_default_graph_uid_map()
         # Figure out which names we have to avoid based on which variable scope
         # we're nested in.
         strip_name = self._default_parent_variable_scope.name
diff --git a/tensorflow/contrib/eager/python/saver.py b/tensorflow/contrib/eager/python/saver.py
index f9c716360c5755ee1902b576545d776725f9966f..1d0d6c6c14ce4a8e454206e0be9fea4724f09192 100644
--- a/tensorflow/contrib/eager/python/saver.py
+++ b/tensorflow/contrib/eager/python/saver.py
@@ -115,6 +115,11 @@ def restore_variables_on_create(save_path, map_func=None):
 
 class Saver(object):
   """A tf.train.Saver adapter for use when eager execution is enabled.
+
+  `Saver`'s name-based checkpointing strategy is fragile. Please switch to
+  `tf.train.Checkpoint` or `tf.keras.Model.save_weights`, which perform a more
+  robust object-based saving. These APIs will load checkpoints written by
+  `Saver`.
   """
 
   def __init__(self, var_list):
diff --git a/tensorflow/contrib/eager/python/tfe_test.py b/tensorflow/contrib/eager/python/tfe_test.py
index 4454abfb9667f824b9de0100bb81bae24ad5f7a6..8c35dddb5a515aa09cc70c173a9f0605e8567e82 100644
--- a/tensorflow/contrib/eager/python/tfe_test.py
+++ b/tensorflow/contrib/eager/python/tfe_test.py
@@ -87,8 +87,8 @@ class TFETest(test_util.TensorFlowTestCase):
       x += 1.
     # Without a device context, heuristics are used to place ops.
     # In this case, ops.reduce_mean runs on the GPU.
-    reduction_indices = range(x.shape.ndims)
-    m = math_ops.reduce_mean(x, reduction_indices)
+    axis = range(x.shape.ndims)
+    m = math_ops.reduce_mean(x, axis)
     # m is on GPU, bring it back to CPU and compare.
     self.assertEqual(3.5, m.cpu().numpy())
 
diff --git a/tensorflow/contrib/estimator/BUILD b/tensorflow/contrib/estimator/BUILD
index 37f253d9c115ca4a6d3c30aca33ca1be12b4a927..a888379f13e79d1c246d4cd6d19a225c065692a2 100644
--- a/tensorflow/contrib/estimator/BUILD
+++ b/tensorflow/contrib/estimator/BUILD
@@ -16,7 +16,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":boosted_trees",
-        ":dnn",
         ":dnn_with_layer_annotations",
         ":early_stopping",
         ":expect_tensorflow_estimator_installed",
@@ -25,7 +24,6 @@ py_library(
         ":extenders",
         ":head",
         ":hooks",
-        ":linear",
         ":logit_fns",
         ":multi_head",
         ":replicate_model_fn",
@@ -47,18 +45,6 @@ py_library(
     ],
 )
 
-py_library(
-    name = "dnn",
-    srcs = ["python/estimator/dnn.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":expect_tensorflow_estimator_installed",
-        "//tensorflow:tensorflow_py_no_contrib",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:dnn",
-    ],
-)
-
 py_library(
     name = "dnn_with_layer_annotations",
     srcs = ["python/estimator/dnn_with_layer_annotations.py"],
@@ -144,17 +130,6 @@ py_library(
     ],
 )
 
-py_library(
-    name = "linear",
-    srcs = ["python/estimator/linear.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":expect_tensorflow_estimator_installed",
-        "//tensorflow/python/estimator",
-        "//tensorflow/python/estimator:linear",
-    ],
-)
-
 py_library(
     name = "logit_fns",
     srcs = [
diff --git a/tensorflow/contrib/estimator/__init__.py b/tensorflow/contrib/estimator/__init__.py
index 80d59627620b86b5ebc20e1631ca368a0f2f6fdf..7d61247e7ef26d3777843cd3be20684583e9058c 100644
--- a/tensorflow/contrib/estimator/__init__.py
+++ b/tensorflow/contrib/estimator/__init__.py
@@ -58,8 +58,6 @@ _allowed_symbols = [
     'multi_label_head',
     'poisson_regression_head',
     'regression_head',
-    'DNNEstimator',
-    'LinearEstimator',
     'boosted_trees_classifier_train_in_memory',
     'boosted_trees_regressor_train_in_memory',
     'call_logit_fn',
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans.py b/tensorflow/contrib/factorization/python/ops/kmeans.py
index f384d761a8430074f022c973d7ec3d46cd90f70b..3eb396a29ccdc0478384f9fa122465731740a30d 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans.py
@@ -26,7 +26,7 @@ from tensorflow.contrib.factorization.python.ops import clustering_ops
 from tensorflow.python.estimator import estimator
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export_output
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
diff --git a/tensorflow/contrib/factorization/python/ops/kmeans_test.py b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
index 1ab5418fe4659cb0068ee8c3ca1442f6f723ee76..2f7cd131d3ed20df307ed231cce2ecb50ecfbceb 100644
--- a/tensorflow/contrib/factorization/python/ops/kmeans_test.py
+++ b/tensorflow/contrib/factorization/python/ops/kmeans_test.py
@@ -27,7 +27,7 @@ from sklearn.cluster import KMeans as SklearnKMeans
 # pylint: disable=g-import-not-at-top
 from tensorflow.contrib.factorization.python.ops import kmeans as kmeans_lib
 from tensorflow.python.estimator import run_config
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/feature_column/BUILD b/tensorflow/contrib/feature_column/BUILD
index bbe335be3e1384e8a86872165a4e37230e28b6c9..1cd83bdb5de7c2f6dc91c980750b49aca1a7790b 100644
--- a/tensorflow/contrib/feature_column/BUILD
+++ b/tensorflow/contrib/feature_column/BUILD
@@ -14,6 +14,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":sequence_feature_column",
+        ":sequence_feature_column_v2",
         "//tensorflow/python:util",
     ],
 )
@@ -32,7 +33,7 @@ py_library(
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
     ],
 )
 
@@ -51,7 +52,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -69,7 +70,7 @@ py_test(
         "//tensorflow/python:parsing_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:util",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//tensorflow/python/keras:layers",
     ],
 )
@@ -89,7 +90,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_v2",
+        "//tensorflow/python/feature_column:feature_column_py",
     ],
 )
 
@@ -110,7 +111,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:training",
         "//tensorflow/python/feature_column",
-        "//tensorflow/python/feature_column:feature_column_v2",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
index dd6da35ed009c07ad3819e7860a283c7837c1f83..9b3a5c58aaa9498257fc971ac60b97f31d5185d8 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column.py
@@ -222,10 +222,8 @@ def sequence_categorical_column_with_identity(
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_identity(
-          key=key,
-          num_buckets=num_buckets,
-          default_value=default_value))
+      fc._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
 
 
 def sequence_categorical_column_with_hash_bucket(
@@ -265,10 +263,8 @@ def sequence_categorical_column_with_hash_bucket(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_hash_bucket(
-          key=key,
-          hash_bucket_size=hash_bucket_size,
-          dtype=dtype))
+      fc._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
 
 
 def sequence_categorical_column_with_vocabulary_file(
@@ -324,7 +320,7 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -384,7 +380,7 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: if `dtype` is not integer or string.
   """
   return fc._SequenceCategoricalColumn(
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
index d8ca363627eace15e039679545366648df174c33..bcc25b8de895a769f9e11b207c2092e23d029b1f 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_integration_test.py
@@ -53,19 +53,20 @@ class SequenceFeatureColumnIntegrationTest(test.TestCase):
     return example
 
   def _build_feature_columns(self):
-    col = fc.categorical_column_with_identity(
-        'int_ctx', num_buckets=100)
+    col = fc._categorical_column_with_identity('int_ctx', num_buckets=100)
     ctx_cols = [
-        fc.embedding_column(col, dimension=10),
-        fc.numeric_column('float_ctx')]
+        fc._embedding_column(col, dimension=10),
+        fc._numeric_column('float_ctx')
+    ]
 
     identity_col = sfc.sequence_categorical_column_with_identity(
         'int_list', num_buckets=10)
     bucket_col = sfc.sequence_categorical_column_with_hash_bucket(
         'bytes_list', hash_bucket_size=100)
     seq_cols = [
-        fc.embedding_column(identity_col, dimension=10),
-        fc.embedding_column(bucket_col, dimension=20)]
+        fc._embedding_column(identity_col, dimension=10),
+        fc._embedding_column(bucket_col, dimension=20)
+    ]
 
     return ctx_cols, seq_cols
 
@@ -148,8 +149,8 @@ class SequenceExampleParsingTest(test.TestCase):
     """
     example = _make_sequence_example()
     columns = [
-        fc.categorical_column_with_identity('int_ctx', num_buckets=100),
-        fc.numeric_column('float_ctx'),
+        fc._categorical_column_with_identity('int_ctx', num_buckets=100),
+        fc._numeric_column('float_ctx'),
         col_fn(col_name, col_arg)
     ]
     context, seq_features = parsing_ops.parse_single_sequence_example(
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
index 2163af0b43864c96483df529f07881f2f985a80e..d5f74028298ee7015f5b2e3aaee7d9330c1acac1 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc
 from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc_lib
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -109,13 +110,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=embedding_dimension_a,
+    embedding_column_a = fc._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc.embedding_column(
-        categorical_column_b, dimension=embedding_dimension_b,
+    embedding_column_b = fc._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -148,10 +151,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -206,7 +208,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
     # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension,
         initializer=_get_initializer(embedding_dimension, embedding_values))
@@ -244,11 +246,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
@@ -315,10 +317,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc.indicator_column(categorical_column_b)
+    indicator_column_b = fc._indicator_column(categorical_column_b)
     input_layer, sequence_length = sfc.sequence_input_layer(
         features={
             'aaa': sparse_input_a,
@@ -342,9 +344,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -530,7 +532,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     input_layer, _ = sfc.sequence_input_layer(
         features={'aaa': sparse_input}, feature_columns=[indicator_column])
@@ -616,8 +618,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc.embedding_column(
-        categorical_column_a, dimension=2)
+    embedding_column_a = fc._embedding_column(categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -639,7 +640,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc.indicator_column(categorical_column_a)
+    indicator_column_a = fc._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -918,8 +919,9 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
@@ -956,8 +958,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -984,8 +985,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
@@ -1055,7 +1055,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -1101,7 +1101,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     expected_sequence_length_b = [2, 1]
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1152,7 +1152,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
 
-    shared_embedding_columns = fc.shared_embedding_columns(
+    shared_embedding_columns = fc_lib.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1218,7 +1218,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -1250,7 +1250,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -1277,7 +1277,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc.indicator_column(categorical_column)
+    indicator_column = fc._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
index 67ffb939663358b5e356b3b626978db959c1bac9..0d34ad161855476b6a4cd9a258521dbe122b4140 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2.py
@@ -26,7 +26,7 @@ import collections
 
 
 from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -226,10 +226,8 @@ def sequence_categorical_column_with_identity(
     ValueError: if `default_value` is not in range `[0, num_buckets)`.
   """
   return fc_old._SequenceCategoricalColumn(
-      fc_old.categorical_column_with_identity(
-          key=key,
-          num_buckets=num_buckets,
-          default_value=default_value))
+      fc_old._categorical_column_with_identity(
+          key=key, num_buckets=num_buckets, default_value=default_value))
 
 
 def sequence_categorical_column_with_hash_bucket(
@@ -269,10 +267,8 @@ def sequence_categorical_column_with_hash_bucket(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc_old._SequenceCategoricalColumn(
-      fc_old.categorical_column_with_hash_bucket(
-          key=key,
-          hash_bucket_size=hash_bucket_size,
-          dtype=dtype))
+      fc_old._categorical_column_with_hash_bucket(
+          key=key, hash_bucket_size=hash_bucket_size, dtype=dtype))
 
 
 def sequence_categorical_column_with_vocabulary_file(
@@ -328,7 +324,7 @@ def sequence_categorical_column_with_vocabulary_file(
     ValueError: `dtype` is neither string nor integer.
   """
   return fc_old._SequenceCategoricalColumn(
-      fc_old.categorical_column_with_vocabulary_file(
+      fc_old._categorical_column_with_vocabulary_file(
           key=key,
           vocabulary_file=vocabulary_file,
           vocabulary_size=vocabulary_size,
@@ -388,7 +384,7 @@ def sequence_categorical_column_with_vocabulary_list(
     ValueError: if `dtype` is not integer or string.
   """
   return fc_old._SequenceCategoricalColumn(
-      fc_old.categorical_column_with_vocabulary_list(
+      fc_old._categorical_column_with_vocabulary_list(
           key=key,
           vocabulary_list=vocabulary_list,
           dtype=dtype,
@@ -441,7 +437,7 @@ def sequence_numeric_column(
     ValueError: if any dimension in shape is not a positive integer.
     ValueError: if `dtype` is not convertible to `tf.float32`.
   """
-  shape = fc._check_shape(shape=shape, key=key)
+  shape = fc_old._check_shape(shape=shape, key=key)
   if not (dtype.is_integer or dtype.is_floating):
     raise ValueError('dtype must be convertible to float. '
                      'dtype: {}, key: {}'.format(dtype, key))
diff --git a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
index 5ecd85807c55e592f2216dbe2ff76f56e5c2650d..ca4398a142065de0be7bee57cd7e54670bbae12e 100644
--- a/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
+++ b/tensorflow/contrib/feature_column/python/feature_column/sequence_feature_column_v2_test.py
@@ -25,7 +25,7 @@ import numpy as np
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column as sfc_old
 from tensorflow.contrib.feature_column.python.feature_column import sequence_feature_column_v2 as sfc
 from tensorflow.python.feature_column import feature_column as fc_old
-from tensorflow.python.feature_column import feature_column_v2 as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.feature_column.feature_column import _LazyBuilder
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -111,13 +111,15 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old.embedding_column(
-        categorical_column_a, dimension=embedding_dimension_a,
+    embedding_column_a = fc_old._embedding_column(
+        categorical_column_a,
+        dimension=embedding_dimension_a,
         initializer=_get_initializer(embedding_dimension_a, embedding_values_a))
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_b = fc_old.embedding_column(
-        categorical_column_b, dimension=embedding_dimension_b,
+    embedding_column_b = fc_old._embedding_column(
+        categorical_column_b,
+        dimension=embedding_dimension_b,
         initializer=_get_initializer(embedding_dimension_b, embedding_values_b))
 
     input_layer, sequence_length = sfc.sequence_input_layer(
@@ -150,9 +152,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old.embedding_column(
+    embedding_column_a = fc_old._embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
@@ -208,7 +210,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
     # Test that columns are reordered alphabetically.
-    shared_embedding_columns = fc_old.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension,
         initializer=_get_initializer(embedding_dimension, embedding_values))
@@ -246,11 +248,11 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc_old._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc_old.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     with self.assertRaisesRegexp(
@@ -317,10 +319,10 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size_a)
-    indicator_column_a = fc_old.indicator_column(categorical_column_a)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size_b)
-    indicator_column_b = fc_old.indicator_column(categorical_column_b)
+    indicator_column_b = fc_old._indicator_column(categorical_column_b)
     input_layer, sequence_length = sfc.sequence_input_layer(
         features={
             'aaa': sparse_input_a,
@@ -344,9 +346,9 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
         values=(2, 0, 1),
         dense_shape=(2, 2))
 
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old.indicator_column(categorical_column_a)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -532,7 +534,7 @@ class SequenceInputLayerTest(test.TestCase, parameterized.TestCase):
     sparse_input = sparse_tensor.SparseTensorValue(**sparse_input_args)
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    indicator_column = fc_old.indicator_column(categorical_column)
+    indicator_column = fc_old._indicator_column(categorical_column)
 
     input_layer, _ = sfc.sequence_input_layer(
         features={'aaa': sparse_input}, feature_columns=[indicator_column])
@@ -618,7 +620,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column_a = fc_old.embedding_column(
+    embedding_column_a = fc_old._embedding_column(
         categorical_column_a, dimension=2)
 
     with self.assertRaisesRegexp(
@@ -641,7 +643,7 @@ class InputLayerTest(test.TestCase):
 
     categorical_column_a = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column_a = fc_old.indicator_column(categorical_column_a)
+    indicator_column_a = fc_old._indicator_column(categorical_column_a)
 
     with self.assertRaisesRegexp(
         ValueError,
@@ -920,8 +922,9 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc_old._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     embedding_lookup, _ = embedding_column._get_sequence_dense_tensor(
@@ -958,8 +961,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -986,8 +988,7 @@ class SequenceEmbeddingColumnTest(
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc_old.embedding_column(
-        categorical_column, dimension=2)
+    embedding_column = fc_old._embedding_column(categorical_column, dimension=2)
 
     _, sequence_length = embedding_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': sparse_input}))
@@ -1057,7 +1058,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
         key='aaa', num_buckets=vocabulary_size)
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc_old.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -1103,7 +1104,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     expected_sequence_length_b = [2, 1]
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    shared_embedding_columns = fc_old.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1154,7 +1155,7 @@ class SequenceSharedEmbeddingColumnTest(test.TestCase):
     categorical_column_b = sfc.sequence_categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
 
-    shared_embedding_columns = fc_old.shared_embedding_columns(
+    shared_embedding_columns = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
 
     sequence_length_a = shared_embedding_columns[0]._get_sequence_dense_tensor(
@@ -1220,7 +1221,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old.indicator_column(categorical_column)
+    indicator_column = fc_old._indicator_column(categorical_column)
 
     indicator_tensor, _ = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
@@ -1252,7 +1253,7 @@ class SequenceIndicatorColumnTest(test.TestCase, parameterized.TestCase):
 
     categorical_column = sfc.sequence_categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    indicator_column = fc_old.indicator_column(categorical_column)
+    indicator_column = fc_old._indicator_column(categorical_column)
 
     _, sequence_length = indicator_column._get_sequence_dense_tensor(
         _LazyBuilder({'aaa': inputs}))
diff --git a/tensorflow/contrib/framework/BUILD b/tensorflow/contrib/framework/BUILD
index cd747df4d69d2c264f5a64b491da9570b1423770..dad50a3a73085526f65bd87c3d8549ceb75b3af4 100644
--- a/tensorflow/contrib/framework/BUILD
+++ b/tensorflow/contrib/framework/BUILD
@@ -47,6 +47,11 @@ tf_custom_op_py_library(
         ":variable_ops_op_lib",
     ],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     deps = [
         ":gen_variable_ops",
         "//tensorflow/contrib/util:util_py",
@@ -66,6 +71,7 @@ tf_custom_op_py_library(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:script_ops",
         "//tensorflow/python:smart_cond",
+        "//tensorflow/python:sort_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:state_ops_gen",
@@ -311,17 +317,3 @@ py_test(
         "//third_party/py/numpy",
     ],
 )
-
-py_test(
-    name = "sort_ops_test",
-    size = "medium",
-    srcs = ["python/ops/sort_ops_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":framework_py",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:random_ops",
-        "//third_party/py/numpy",
-    ],
-)
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops.py b/tensorflow/contrib/framework/python/ops/sort_ops.py
index 1921a77c1e96ee3531d1ed0f98e41c27c9d427ac..42184a4e55e292f7921702e3f8909ae54f717702 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops.py
+++ b/tensorflow/contrib/framework/python/ops/sort_ops.py
@@ -22,173 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
+from tensorflow.python.ops import sort_ops
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import ops as framework_ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import nn_ops
-
-
-def sort(values, axis=-1, direction='ASCENDING', name=None):
-  """Sorts a tensor.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    name: Optional name for the operation.
-
-  Returns:
-    A `Tensor` with the same dtype and shape as `values`, with the elements
-        sorted along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  with framework_ops.name_scope(name, 'sort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=False)
-
-
-def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
-  """Returns the indices of a tensor that give its sorted order along an axis.
-
-  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
-  `tf.sort(values)`. For higher dimensions, the output has the same shape as
-  `values`, but along the given axis, values represent the index of the sorted
-  element in that slice of the tensor at the given position.
-
-  Args:
-    values: 1-D or higher numeric `Tensor`.
-    axis: The axis along which to sort. The default is -1, which sorts the last
-        axis.
-    direction: The direction in which to sort the values (`'ASCENDING'` or
-        `'DESCENDING'`).
-    stable: If True, equal elements in the original tensor will not be
-        re-ordered in the returned order. Unstable sort is not yet implemented,
-        but will eventually be the default for performance reasons. If you
-        require a stable order, pass `stable=True` for forwards compatibility.
-    name: Optional name for the operation.
-
-  Returns:
-    An int32 `Tensor` with the same shape as `values`. The indices that would
-        sort each slice of the given `values` along the given `axis`.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  del stable  # Unused.
-  with framework_ops.name_scope(name, 'argsort'):
-    return _sort_or_argsort(values, axis, direction, return_argsort=True)
-
-
-def _sort_or_argsort(values, axis, direction, return_argsort):
-  """Internal sort/argsort implementation.
-
-  Args:
-    values: The input values.
-    axis: The axis along which to sort.
-    direction: 'ASCENDING' or 'DESCENDING'.
-    return_argsort: Whether to return the argsort result.
-
-  Returns:
-    Either the sorted values, or the indices of the sorted values in the
-        original tensor. See the `sort` and `argsort` docstrings.
-
-  Raises:
-    ValueError: If axis is not a constant scalar, or the direction is invalid.
-  """
-  if direction not in _SORT_IMPL:
-    raise ValueError('%s should be one of %s' %
-                     (direction, ', '.join(sorted(_SORT_IMPL.keys()))))
-  # Axis must be an integer, not a Tensor.
-  axis = framework_ops.convert_to_tensor(axis, name='axis')
-  axis_static = tensor_util.constant_value(axis)
-  if axis.shape.ndims != 0 or axis_static is None:
-    raise ValueError('axis must be a constant scalar')
-  axis_static = int(axis_static)  # Avoids NumPy casting error
-
-  values = framework_ops.convert_to_tensor(values, name='values')
-
-  return _SORT_IMPL[direction](values, axis_static, return_argsort)
-
-
-def _descending_sort(values, axis, return_argsort=False):
-  """Sorts values in reverse using `top_k`.
-
-  Args:
-    values: Tensor of numeric values.
-    axis: Index of the axis which values should be sorted along.
-    return_argsort: If False, return the sorted values. If True, return the
-        indices that would sort the values.
-
-  Returns:
-    The sorted values.
-  """
-  k = array_ops.shape(values)[axis]
-  rank = array_ops.rank(values)
-  static_rank = values.shape.ndims
-  # Fast path: sorting the last axis.
-  if axis == -1 or axis + 1 == values.get_shape().ndims:
-    top_k_input = values
-    transposition = None
-  else:
-    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
-    if axis < 0:
-      # Calculate the actual axis index if counting from the end. Use the static
-      # rank if available, or else make the axis back into a tensor.
-      axis += static_rank or rank
-    if static_rank is not None:
-      # Prefer to calculate the transposition array in NumPy and make it a
-      # constant.
-      transposition = constant_op.constant(
-          np.r_[
-              # Axes up to axis are unchanged.
-              np.arange(axis),
-              # Swap axis and rank - 1.
-              [static_rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              np.arange(axis + 1, static_rank - 1),
-              # Swap axis and rank - 1.
-              [axis]],
-          name='transposition')
-    else:
-      # Generate the transposition array from the tensors.
-      transposition = array_ops.concat(
-          [
-              # Axes up to axis are unchanged.
-              math_ops.range(axis),
-              # Swap axis and rank - 1.
-              [rank - 1],
-              # Axes in [axis + 1, rank - 1) are unchanged.
-              math_ops.range(axis + 1, rank - 1),
-              # Swap axis and rank - 1.
-              [axis]
-          ],
-          axis=0)
-    top_k_input = array_ops.transpose(values, transposition)
-
-  values, indices = nn_ops.top_k(top_k_input, k)
-  return_value = indices if return_argsort else values
-  if transposition is not None:
-    # transposition contains a single cycle of length 2 (swapping 2 elements),
-    # so it is an involution (it is its own inverse).
-    return_value = array_ops.transpose(return_value, transposition)
-  return return_value
-
-
-def _ascending_sort(values, axis, return_argsort=False):
-  # Negate the values to get the ascending order from descending sort.
-  values_or_indices = _descending_sort(-values, axis, return_argsort)
-  # If not argsort, negate the values again.
-  return values_or_indices if return_argsort else -values_or_indices
-
-
-_SORT_IMPL = {
-    'ASCENDING': _ascending_sort,
-    'DESCENDING': _descending_sort,
-}
+sort = sort_ops.sort
+argsort = sort_ops.argsort
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
index 219cc199d79eca8c263859ae46bbb1ce0b4442b3..3593b501bb738b8f58dce4e40cffbdf410f136b3 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_impl.py
@@ -113,7 +113,8 @@ class GANEstimator(estimator.Estimator):
                add_summaries=None,
                use_loss_summaries=True,
                config=None,
-               warm_start_from=None):
+               warm_start_from=None,
+               is_chief=True):
     """Initializes a GANEstimator instance.
 
     Args:
@@ -154,6 +155,8 @@ class GANEstimator(estimator.Estimator):
       config: `RunConfig` object to configure the runtime settings.
       warm_start_from: A filepath to a checkpoint or saved model, or a
         WarmStartSettings object to configure initialization.
+      is_chief: Whether or not this Estimator is running on a chief or worker.
+        Needs to be set appropriately if using SyncReplicasOptimizers.
 
     Raises:
       ValueError: If loss functions aren't callable.
@@ -187,7 +190,7 @@ class GANEstimator(estimator.Estimator):
       return _get_estimator_spec(
           mode, gan_model, generator_loss_fn, discriminator_loss_fn,
           get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
-          get_hooks_fn, use_loss_summaries)
+          get_hooks_fn, use_loss_summaries, is_chief)
 
     super(GANEstimator, self).__init__(
         model_fn=_model_fn, model_dir=model_dir, config=config,
@@ -215,7 +218,7 @@ def _get_gan_model(
 def _get_estimator_spec(
     mode, gan_model, generator_loss_fn, discriminator_loss_fn,
     get_eval_metric_ops_fn, generator_optimizer, discriminator_optimizer,
-    get_hooks_fn=None, use_loss_summaries=True):
+    get_hooks_fn=None, use_loss_summaries=True, is_chief=True):
   """Get the EstimatorSpec for the current mode."""
   if mode == model_fn_lib.ModeKeys.PREDICT:
     estimator_spec = model_fn_lib.EstimatorSpec(
@@ -236,7 +239,7 @@ def _get_estimator_spec(
               else discriminator_optimizer)
       get_hooks_fn = get_hooks_fn or tfgan_train.get_sequential_train_hooks()
       estimator_spec = _get_train_estimator_spec(
-          gan_model, gan_loss, gopt, dopt, get_hooks_fn)
+          gan_model, gan_loss, gopt, dopt, get_hooks_fn, is_chief=is_chief)
 
   return estimator_spec
 
@@ -321,11 +324,11 @@ def _get_eval_estimator_spec(gan_model, gan_loss, get_eval_metric_ops_fn=None,
 
 def _get_train_estimator_spec(
     gan_model, gan_loss, generator_optimizer, discriminator_optimizer,
-    get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops):
+    get_hooks_fn, train_op_fn=tfgan_train.gan_train_ops, is_chief=True):
   """Return an EstimatorSpec for the train case."""
   scalar_loss = gan_loss.generator_loss + gan_loss.discriminator_loss
   train_ops = train_op_fn(gan_model, gan_loss, generator_optimizer,
-                          discriminator_optimizer)
+                          discriminator_optimizer, is_chief=is_chief)
   training_hooks = get_hooks_fn(train_ops)
   return model_fn_lib.EstimatorSpec(
       loss=scalar_loss,
diff --git a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
index 3d6bdab0ad7b4778edf0776f2d1b6a6f105cf2c7..bc9021050bc010ce75c3091fef868549686c0e90 100644
--- a/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
+++ b/tensorflow/contrib/gan/python/estimator/python/gan_estimator_test.py
@@ -48,6 +48,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import input as input_lib
 from tensorflow.python.training import learning_rate_decay
+from tensorflow.python.training import sync_replicas_optimizer
 from tensorflow.python.training import training
 from tensorflow.python.training import training_util
 
@@ -82,7 +83,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase):
 
     self.assertEqual(generator_inputs, gan_model.generator_inputs)
     self.assertIsNotNone(gan_model.generated_data)
-    self.assertEqual(2, len(gan_model.generator_variables))  # 1 FC layer
+    self.assertLen(gan_model.generator_variables, 2)  # 1 FC layer
     self.assertIsNotNone(gan_model.generator_fn)
     if mode == model_fn_lib.ModeKeys.PREDICT:
       self.assertIsNone(gan_model.real_data)
@@ -95,7 +96,7 @@ class GetGANModelTest(test.TestCase, parameterized.TestCase):
       self.assertIsNotNone(gan_model.real_data)
       self.assertIsNotNone(gan_model.discriminator_real_outputs)
       self.assertIsNotNone(gan_model.discriminator_gen_outputs)
-      self.assertEqual(2, len(gan_model.discriminator_variables))  # 1 FC layer
+      self.assertLen(gan_model.discriminator_variables, 2)  # 1 FC layer
       self.assertIsNotNone(gan_model.discriminator_scope)
       self.assertIsNotNone(gan_model.discriminator_fn)
 
@@ -121,6 +122,7 @@ def get_dummy_gan_model():
 
 
 def dummy_loss_fn(gan_model, add_summaries=True):
+  del add_summaries
   return math_ops.reduce_sum(gan_model.discriminator_real_outputs -
                              gan_model.discriminator_gen_outputs)
 
@@ -168,6 +170,35 @@ class GetEstimatorSpecTest(test.TestCase, parameterized.TestCase):
       self.assertShapeEqual(np.array(0), spec.loss)  # must be a scalar
       self.assertIsNotNone(spec.eval_metric_ops)
 
+  def test_get_sync_estimator_spec(self):
+    """Make sure spec is loaded with sync hooks for sync opts."""
+
+    def get_sync_optimizer():
+      return sync_replicas_optimizer.SyncReplicasOptimizer(
+          training.GradientDescentOptimizer(learning_rate=1.0),
+          replicas_to_aggregate=1)
+
+    with ops.Graph().as_default():
+      self._gan_model = get_dummy_gan_model()
+      g_opt = get_sync_optimizer()
+      d_opt = get_sync_optimizer()
+
+      spec = estimator._get_estimator_spec(
+          model_fn_lib.ModeKeys.TRAIN,
+          self._gan_model,
+          generator_loss_fn=dummy_loss_fn,
+          discriminator_loss_fn=dummy_loss_fn,
+          get_eval_metric_ops_fn=get_metrics,
+          generator_optimizer=g_opt,
+          discriminator_optimizer=d_opt)
+
+      self.assertLen(spec.training_hooks, 4)
+      sync_opts = [
+          hook._sync_optimizer for hook in spec.training_hooks if
+          isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+      self.assertLen(sync_opts, 2)
+      self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
 
 # TODO(joelshor): Add pandas test.
 class GANEstimatorIntegrationTest(test.TestCase):
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
index e2594faf85bcf91cbe09f266e4d4211d20bdee17..364fa4eb461c62784803f0c309e3b7c5855df199 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_impl.py
@@ -64,6 +64,9 @@ def condition_tensor(tensor, conditioning):
   """
   tensor.shape[1:].assert_is_fully_defined()
   num_features = tensor.shape[1:].num_elements()
+  if conditioning.shape.ndims < 2:
+    raise ValueError('conditioning must be at least 2D, but saw shape: %s'
+                     % conditioning.shape)
 
   mapped_conditioning = layers.linear(
       layers.flatten(conditioning), num_features)
diff --git a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
index 0aad769793761be69ee9d1e3416e44c7b3d8cea0..f5c7d53cf2c9aa08ba0074950983ef3ecd90168b 100644
--- a/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
+++ b/tensorflow/contrib/gan/python/features/python/conditioning_utils_test.py
@@ -45,7 +45,7 @@ class ConditioningUtilsTest(test.TestCase):
           array_ops.placeholder(dtypes.float32, (5, None)),
           array_ops.placeholder(dtypes.float32, (5, 1)))
 
-    with self.assertRaisesRegexp(ValueError, 'expected min_ndim=2'):
+    with self.assertRaisesRegexp(ValueError, 'at least 2D'):
       conditioning_utils.condition_tensor(
           array_ops.placeholder(dtypes.float32, (5, 2)),
           array_ops.placeholder(dtypes.float32, (5)))
diff --git a/tensorflow/contrib/gan/python/losses/python/losses_impl.py b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
index df0342c80c587cd0dfbf5f1455e05c31745995f5..a0a86c6337eefa756a209635faa70db686a36247 100644
--- a/tensorflow/contrib/gan/python/losses/python/losses_impl.py
+++ b/tensorflow/contrib/gan/python/losses/python/losses_impl.py
@@ -36,7 +36,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 
 from tensorflow.contrib.framework.python.ops import variables as contrib_variables_lib
 from tensorflow.python.framework import ops
@@ -47,7 +46,6 @@ from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.distributions import distribution as ds
 from tensorflow.python.ops.losses import losses
 from tensorflow.python.ops.losses import util
 from tensorflow.python.summary import summary
@@ -740,11 +738,16 @@ def least_squares_discriminator_loss(
 def _validate_distributions(distributions):
   if not isinstance(distributions, (list, tuple)):
     raise ValueError('`distributions` must be a list or tuple. Instead, '
-                     'found %s.', type(distributions))
+                     'found %s.' % type(distributions))
   for x in distributions:
-    if not isinstance(x, ds.Distribution):
+    # We used to check with `isinstance(x, tf.distributions.Distribution)`.
+    # However, distributions have migrated to `tfp.distributions.Distribution`,
+    # which is a new code repo, so we can't check this way anymore until
+    # TF-GAN is migrated to a new repo as well.
+    # This new check is not sufficient, but is a useful heuristic for now.
+    if not callable(getattr(x, 'log_prob', None)):
       raise ValueError('`distributions` must be a list of `Distributions`. '
-                       'Instead, found %s.', type(x))
+                       'Instead, found %s.' % type(x))
 
 
 def _validate_information_penalty_inputs(
@@ -817,7 +820,7 @@ def _numerically_stable_global_norm(tensor_list):
   Returns:
     A scalar tensor with the global norm.
   """
-  if np.all([x is None for x in tensor_list]):
+  if all(x is None for x in tensor_list):
     return 0.0
 
   list_max = math_ops.reduce_max([math_ops.reduce_max(math_ops.abs(x)) for x in
diff --git a/tensorflow/contrib/gan/python/namedtuples.py b/tensorflow/contrib/gan/python/namedtuples.py
index b9ac1bf15138c7e7d15ab3ebdac605d84921b6e5..969b68449d9c82f9f9144a8657cd8932b38fd0f7 100644
--- a/tensorflow/contrib/gan/python/namedtuples.py
+++ b/tensorflow/contrib/gan/python/namedtuples.py
@@ -213,7 +213,8 @@ class GANTrainOps(
     collections.namedtuple('GANTrainOps', (
         'generator_train_op',
         'discriminator_train_op',
-        'global_step_inc_op'
+        'global_step_inc_op',
+        'train_hooks'
     ))):
   """GANTrainOps contains the training ops.
 
@@ -221,8 +222,17 @@ class GANTrainOps(
     generator_train_op: Op that performs a generator update step.
     discriminator_train_op: Op that performs a discriminator update step.
     global_step_inc_op: Op that increments the shared global step.
+    train_hooks: a list or tuple containing hooks related to training that need
+      to be populated when training ops are instantiated. Used primarily for
+      sync hooks.
   """
 
+  def __new__(cls, generator_train_op, discriminator_train_op,
+              global_step_inc_op, train_hooks=()):
+    return super(GANTrainOps, cls).__new__(cls, generator_train_op,
+                                           discriminator_train_op,
+                                           global_step_inc_op, train_hooks)
+
 
 class GANTrainSteps(
     collections.namedtuple('GANTrainSteps', (
diff --git a/tensorflow/contrib/gan/python/train.py b/tensorflow/contrib/gan/python/train.py
index 7ee39f304ab213a8fa4e7a6f03cda88037bff9a1..4c7bee41b33ce1fee46d374ca5fd1c0b603762f9 100644
--- a/tensorflow/contrib/gan/python/train.py
+++ b/tensorflow/contrib/gan/python/train.py
@@ -114,7 +114,7 @@ def gan_model(
     discriminator_gen_outputs = discriminator_fn(generated_data,
                                                  generator_inputs)
   with variable_scope.variable_scope(dis_scope, reuse=True):
-    real_data = ops.convert_to_tensor(real_data)
+    real_data = _convert_tensor_or_l_or_d(real_data)
     discriminator_real_outputs = discriminator_fn(real_data, generator_inputs)
 
   if check_shapes:
@@ -924,6 +924,7 @@ def gan_train_ops(
     generator_optimizer,
     discriminator_optimizer,
     check_for_unused_update_ops=True,
+    is_chief=True,
     # Optional args to pass directly to the `create_train_op`.
     **kwargs):
   """Returns GAN train ops.
@@ -939,6 +940,8 @@ def gan_train_ops(
     discriminator_optimizer: The optimizer for the discriminator updates.
     check_for_unused_update_ops: If `True`, throws an exception if there are
       update ops outside of the generator or discriminator scopes.
+    is_chief: Specifies whether or not the training is being run by the primary
+      replica during replica training.
     **kwargs: Keyword args to pass directly to
       `training.create_train_op` for both the generator and
       discriminator train op.
@@ -980,6 +983,9 @@ def gan_train_ops(
       kwargs, model.generator_scope.name, model.discriminator_scope.name,
       check_for_unused_update_ops)
 
+  # Get the sync hooks if these are needed.
+  sync_hooks = []
+
   generator_global_step = None
   if isinstance(generator_optimizer,
                 sync_replicas_optimizer.SyncReplicasOptimizer):
@@ -995,6 +1001,7 @@ def gan_train_ops(
         trainable=False,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     gen_update_ops += [generator_global_step.assign(global_step)]
+    sync_hooks.append(generator_optimizer.make_session_run_hook(is_chief))
   with ops.name_scope('generator_train'):
     gen_train_op = training.create_train_op(
         total_loss=loss.generator_loss,
@@ -1016,6 +1023,7 @@ def gan_train_ops(
         trainable=False,
         collections=[ops.GraphKeys.GLOBAL_VARIABLES])
     dis_update_ops += [discriminator_global_step.assign(global_step)]
+    sync_hooks.append(discriminator_optimizer.make_session_run_hook(is_chief))
   with ops.name_scope('discriminator_train'):
     disc_train_op = training.create_train_op(
         total_loss=loss.discriminator_loss,
@@ -1025,7 +1033,8 @@ def gan_train_ops(
         update_ops=dis_update_ops,
         **kwargs)
 
-  return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc)
+  return namedtuples.GANTrainOps(gen_train_op, disc_train_op, global_step_inc,
+                                 sync_hooks)
 
 
 # TODO(joelshor): Implement a dynamic GAN train loop, as in `Real-Time Adaptive
@@ -1066,13 +1075,24 @@ def get_sequential_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
                                      train_steps.generator_train_steps)
     discriminator_hook = RunTrainOpsHook(train_ops.discriminator_train_op,
                                          train_steps.discriminator_train_steps)
-    return [generator_hook, discriminator_hook]
+    return [generator_hook, discriminator_hook] + list(train_ops.train_hooks)
 
   return get_hooks
 
 
+def _num_joint_steps(train_steps):
+  g_steps = train_steps.generator_train_steps
+  d_steps = train_steps.discriminator_train_steps
+  # Get the number of each type of step that should be run.
+  num_d_and_g_steps = min(g_steps, d_steps)
+  num_g_steps = g_steps - num_d_and_g_steps
+  num_d_steps = d_steps - num_d_and_g_steps
+
+  return num_d_and_g_steps, num_g_steps, num_d_steps
+
+
 def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
-  """Returns a hooks function for sequential GAN training.
+  """Returns a hooks function for joint GAN training.
 
   When using these train hooks, IT IS RECOMMENDED TO USE `use_locking=True` ON
   ALL OPTIMIZERS TO AVOID RACE CONDITIONS.
@@ -1105,12 +1125,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
   Returns:
     A function that takes a GANTrainOps tuple and returns a list of hooks.
   """
-  g_steps = train_steps.generator_train_steps
-  d_steps = train_steps.discriminator_train_steps
-  # Get the number of each type of step that should be run.
-  num_d_and_g_steps = min(g_steps, d_steps)
-  num_g_steps = g_steps - num_d_and_g_steps
-  num_d_steps = d_steps - num_d_and_g_steps
+  num_d_and_g_steps, num_g_steps, num_d_steps = _num_joint_steps(train_steps)
 
   def get_hooks(train_ops):
     g_op = train_ops.generator_train_op
@@ -1120,7 +1135,7 @@ def get_joint_train_hooks(train_steps=namedtuples.GANTrainSteps(1, 1)):
     g_hook = RunTrainOpsHook(g_op, num_g_steps)
     d_hook = RunTrainOpsHook(d_op, num_d_steps)
 
-    return [joint_hook, g_hook, d_hook]
+    return [joint_hook, g_hook, d_hook] + list(train_ops.train_hooks)
 
   return get_hooks
 
diff --git a/tensorflow/contrib/gan/python/train_test.py b/tensorflow/contrib/gan/python/train_test.py
index 64d670619905a427a84bee4b661228abca591fae..841f25cd7f1852767776eed2dcbf2522d8b0743b 100644
--- a/tensorflow/contrib/gan/python/train_test.py
+++ b/tensorflow/contrib/gan/python/train_test.py
@@ -519,7 +519,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
     """Test output type."""
     loss = train.gan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.GANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('cyclegan', create_cyclegan_model),
@@ -528,7 +528,7 @@ class GANLossTest(test.TestCase, parameterized.TestCase):
   def test_cyclegan_output_type(self, get_gan_model_fn):
     loss = train.cyclegan_loss(get_gan_model_fn(), add_summaries=True)
     self.assertIsInstance(loss, namedtuples.CycleGANLoss)
-    self.assertGreater(len(ops.get_collection(ops.GraphKeys.SUMMARIES)), 0)
+    self.assertNotEmpty(ops.get_collection(ops.GraphKeys.SUMMARIES))
 
   @parameterized.named_parameters(
       ('gan', create_gan_model, False),
@@ -759,7 +759,7 @@ class TensorPoolAdjusteModelTest(test.TestCase):
           # For [pool_size, ?), the pool is full, tensor2 must be equal to some
           # historical values of tensor1 (which is previously stored in the
           # pool).
-          self.assertTrue(any([(v == t2).all() for v in history_values]))
+          self.assertTrue(any((v == t2).all() for v in history_values))
 
   def _make_new_model_and_check(self, model, pool_size):
     pool_fn = lambda x: random_tensor_pool.tensor_pool(x, pool_size=pool_size)
@@ -836,6 +836,9 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
 
     self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
 
+    # Make sure there are no training hooks populated accidentally.
+    self.assertEmpty(train_ops.train_hooks)
+
   # TODO(joelshor): Add a test to check that custom update op is run.
   @parameterized.named_parameters(
       ('gan', create_gan_model, False),
@@ -923,8 +926,15 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
         model, loss, generator_optimizer=g_opt, discriminator_optimizer=d_opt)
     self.assertIsInstance(train_ops, namedtuples.GANTrainOps)
     # No new trainable variables should have been added.
-    self.assertEqual(num_trainable_vars,
-                     len(variables_lib.get_trainable_variables()))
+    self.assertLen(variables_lib.get_trainable_variables(), num_trainable_vars)
+
+    # Sync hooks should be populated in the GANTrainOps.
+    self.assertLen(train_ops.train_hooks, 2)
+    for hook in train_ops.train_hooks:
+      self.assertIsInstance(
+          hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)
+    sync_opts = [hook._sync_optimizer for hook in train_ops.train_hooks]
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
 
     g_sync_init_op = g_opt.get_init_tokens_op(num_tokens=1)
     d_sync_init_op = d_opt.get_init_tokens_op(num_tokens=1)
@@ -959,6 +969,32 @@ class GANTrainOpsTest(test.TestCase, parameterized.TestCase):
       coord.request_stop()
       coord.join(g_threads + d_threads)
 
+  @parameterized.named_parameters(
+      ('is_chief', True),
+      ('is_not_chief', False),
+  )
+  def test_is_chief_in_train_hooks(self, is_chief):
+    """Make sure is_chief is propagated correctly to sync hooks."""
+    model = create_gan_model()
+    loss = train.gan_loss(model)
+    g_opt = get_sync_optimizer()
+    d_opt = get_sync_optimizer()
+    train_ops = train.gan_train_ops(
+        model,
+        loss,
+        g_opt,
+        d_opt,
+        is_chief=is_chief,
+        summarize_gradients=True,
+        colocate_gradients_with_ops=True)
+
+    self.assertLen(train_ops.train_hooks, 2)
+    for hook in train_ops.train_hooks:
+      self.assertIsInstance(
+          hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)
+    is_chief_list = [hook._is_chief for hook in train_ops.train_hooks]
+    self.assertListEqual(is_chief_list, [is_chief, is_chief])
+
 
 class GANTrainTest(test.TestCase, parameterized.TestCase):
   """Tests for `gan_train`."""
@@ -1036,6 +1072,44 @@ class GANTrainTest(test.TestCase, parameterized.TestCase):
     self.assertTrue(np.isscalar(final_loss))
     self.assertEqual(17.0, final_loss)
 
+  @parameterized.named_parameters(
+      ('gan', create_gan_model),
+      ('callable_gan', create_callable_gan_model),
+      ('infogan', create_infogan_model),
+      ('callable_infogan', create_callable_infogan_model),
+      ('acgan', create_acgan_model),
+      ('callable_acgan', create_callable_acgan_model),
+  )
+  def test_train_hooks_exist_in_get_hooks_fn(self, create_gan_model_fn):
+    model = create_gan_model_fn()
+    loss = train.gan_loss(model)
+
+    g_opt = get_sync_optimizer()
+    d_opt = get_sync_optimizer()
+    train_ops = train.gan_train_ops(
+        model,
+        loss,
+        g_opt,
+        d_opt,
+        summarize_gradients=True,
+        colocate_gradients_with_ops=True)
+
+    sequential_train_hooks = train.get_sequential_train_hooks()(train_ops)
+    self.assertLen(sequential_train_hooks, 4)
+    sync_opts = [
+        hook._sync_optimizer for hook in sequential_train_hooks if
+        isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+    self.assertLen(sync_opts, 2)
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
+    joint_train_hooks = train.get_joint_train_hooks()(train_ops)
+    self.assertLen(joint_train_hooks, 5)
+    sync_opts = [
+        hook._sync_optimizer for hook in joint_train_hooks if
+        isinstance(hook, sync_replicas_optimizer._SyncReplicasOptimizerHook)]
+    self.assertLen(sync_opts, 2)
+    self.assertSetEqual(frozenset(sync_opts), frozenset((g_opt, d_opt)))
+
 
 class PatchGANTest(test.TestCase, parameterized.TestCase):
   """Tests that functions work on PatchGAN style output."""
diff --git a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
index 94f522c04e5a09ed2d9355fa675125c340407923..fbccbead03fc0d641db40ede661bf3677d44c45d 100644
--- a/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
+++ b/tensorflow/contrib/gdr/gdr_rendezvous_mgr.cc
@@ -170,6 +170,14 @@ class GdrRemoteRendezvous : public BaseRemoteRendezvous {
     // Record "call" in active_ so that it can be aborted cleanly.
     RegisterCall(call);
 
+    // RendezvousMgr already aborted, shouldn't send RPC call any more
+    if (!call->status().ok()) {
+      done(call->status(), Args(), Args(), Tensor(), false);
+      session()->worker_cache->ReleaseWorker(src_worker, rwi);
+      delete call;
+      return;
+    }
+
     // Start "call".
     Ref();
     call->Start([this, call, src_worker, rwi, done]() {
diff --git a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
index f7f1189bb93c611719186a697c40f371644f63a2..bc941ae9f23eaa5c46fcca95b9aba0ac0d87960a 100644
--- a/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
+++ b/tensorflow/contrib/hadoop/python/kernel_tests/hadoop_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.contrib.hadoop.python.ops import hadoop_dataset_ops
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -47,7 +48,7 @@ class SequenceFileDatasetTest(test.TestCase):
 
     dataset = hadoop_dataset_ops.SequenceFileDataset(filenames).repeat(
         num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
index bf398b838dfaaff6fdaf33a6cd7086ef13e43a3e..5c5599858ee6879a5703d65658bf4bbd881c7e72 100644
--- a/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
+++ b/tensorflow/contrib/hadoop/python/ops/hadoop_dataset_ops.py
@@ -20,10 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.hadoop.python.ops import gen_dataset_ops
 from tensorflow.contrib.hadoop.python.ops import hadoop_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class SequenceFileDataset(dataset_ops.DatasetSource):
@@ -40,15 +39,12 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.contrib.hadoop.SequenceFileDataset("/foo/bar.seq")
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the (key, value) pairs inside a hadoop sequence file.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for key, value in dataset:
+      print(key, value)
     ```
 
     Args:
@@ -60,16 +56,10 @@ class SequenceFileDataset(dataset_ops.DatasetSource):
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.sequence_file_dataset(
-        self._filenames, nest.flatten(self.output_types))
-
-  @property
-  def output_classes(self):
-    return ops.Tensor, ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
+        self._filenames, self._element_structure._flat_types)  # pylint: disable=protected-access
 
   @property
-  def output_types(self):
-    return dtypes.string, dtypes.string
+  def _element_structure(self):
+    return structure.NestedStructure(
+        (structure.TensorStructure(dtypes.string, []),
+         structure.TensorStructure(dtypes.string, [])))
diff --git a/tensorflow/contrib/ignite/README.md b/tensorflow/contrib/ignite/README.md
index c7db0b77e25668fb8a42d204776044420f403e44..5a8c650fb927be0c835aaceffc516c048195c7bf 100644
--- a/tensorflow/contrib/ignite/README.md
+++ b/tensorflow/contrib/ignite/README.md
@@ -54,14 +54,12 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="SQL_PUBLIC_KITTEN_CACHE")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   for _ in range(3):
->>>     print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 {'key': 1, 'val': {'NAME': b'WARM KITTY'}}
 {'key': 2, 'val': {'NAME': b'SOFT KITTY'}}
@@ -74,23 +72,22 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>> tf.enable_eager_execution()
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES")
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset.take(1):
+>>>   print(element)
 
 {
-    'key': 'kitten.png', 
+    'key': 'kitten.png',
     'val': {
         'metadata': {
             'file_name': b'kitten.png',
             'label': b'little ball of fur',
-            width: 800, 
+            width: 800,
             height: 600
-        }, 
+        },
         'pixels': [0, 0, 0, 0, ..., 0]
     }
 }
@@ -100,13 +97,11 @@ jdbc:ignite:thin://localhost/> INSERT INTO KITTEN_CACHE VALUES (3, 'LITTLE BALL
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset(cache_name="IMAGES").map(lambda obj: obj['val']['pixels'])
->>> iterator = dataset.make_one_shot_iterator()
->>> next_obj = iterator.get_next()
 >>>
->>> with tf.Session() as sess:
->>>   print(sess.run(next_obj))
+>>> for element in dataset:
+>>>   print(element)
 
 [0, 0, 0, 0, ..., 0]
 ```
@@ -126,18 +121,18 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 ```python
 >>> import tensorflow as tf
 >>> from tensorflow.contrib.ignite import IgniteDataset
->>> 
+>>>
 >>> dataset = IgniteDataset("IMAGES")
 >>>
 >>> # Compute gradients locally on every worker node.
->>> gradients = []    
+>>> gradients = []
 >>> for i in range(5):
 >>>     with tf.device("/job:WORKER/task:%d" % i):
->>>         device_iterator = dataset.make_one_shot_iterator()
+>>>         device_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
 >>>         device_next_obj = device_iterator.get_next()
 >>>         gradient = compute_gradient(device_next_obj)
->>>         gradients.append(gradient)        
->>>        
+>>>         gradients.append(gradient)
+>>>
 >>> # Aggregate them on master node.
 >>> result_gradient = tf.reduce_sum(gradients)
 >>>
@@ -145,7 +140,7 @@ Ignite Dataset allows using these two aspects of distributed neural network trai
 >>>     print(sess.run(result_gradient))
 ```
 
-High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well. 
+High-level TensorFlow API for [distributed training](https://www.tensorflow.org/api_docs/python/tf/contrib/distribute/DistributionStrategy) is supported as well.
 
 ### Distributed File System
 
diff --git a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
index 936b29a4f50794380d48efed99e267c6b4c44dc6..e4762c91b193f9c5e32fa2642e702e61e8e5e57f 100644
--- a/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
+++ b/tensorflow/contrib/ignite/python/ops/ignite_dataset_ops.py
@@ -27,6 +27,7 @@ import six
 from tensorflow.contrib.ignite.python.ops import gen_dataset_ops
 from tensorflow.contrib.ignite.python.ops import ignite_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -34,10 +35,7 @@ from tensorflow.python.framework import tensor_shape
 
 @six.add_metaclass(abc.ABCMeta)
 class Readable(object):
-  """Readable abstract class that exposes methods to do reading-related
-
-     operations.
-  """
+  """Abstract class that exposes methods to do reading-related operations."""
 
   @abc.abstractmethod
   def __init__(self):
@@ -227,10 +225,7 @@ types = {
 
 
 class TypeTreeNode(object):
-  """TypeTreeNode class exposes methods to format object tree structure
-
-     data.
-  """
+  """TypeTreeNode class exposes methods to format object tree structure data."""
 
   def __init__(self, name, type_id, fields=None, permutation=None):
     """Constructs a new instance of TypeTreeNode.
@@ -692,14 +687,14 @@ class IgniteClient(TcpClient):
 
 
 class IgniteDataset(dataset_ops.DatasetSource):
-  """Apache Ignite is a memory-centric distributed database, caching, and
-
-     processing platform for transactional, analytical, and streaming workloads,
-     delivering in-memory speeds at petabyte scale. This contrib package
-     contains an integration between Apache Ignite and TensorFlow. The
-     integration is based on tf.data from TensorFlow side and Binary Client
-     Protocol from Apache Ignite side. It allows to use Apache Ignite as a
-     datasource for neural network training, inference and all other
+  """Apache Ignite is a memory-centric distributed database.
+
+     It acts as a caching and processing platform for transactional, analytical,
+     and streaming workloads, delivering in-memory speeds at petabyte scale.
+     This contrib package contains an integration between Apache Ignite and
+     TensorFlow. The integration is based on tf.data from TensorFlow side and
+     Binary Client Protocol from Apache Ignite side. It allows to use Apache
+     Ignite as a datasource for neural network training, inference and all other
      computations supported by TensorFlow. Ignite Dataset is based on Apache
      Ignite Binary Client Protocol.
   """
@@ -756,6 +751,9 @@ class IgniteDataset(dataset_ops.DatasetSource):
         self.cache_type.to_permutation(),
         dtype=dtypes.int32,
         name="permutation")
+    self._structure = structure.convert_legacy_structure(
+        self.cache_type.to_output_types(), self.cache_type.to_output_shapes(),
+        self.cache_type.to_output_classes())
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.ignite_dataset(self.cache_name, self.host, self.port,
@@ -763,13 +761,5 @@ class IgniteDataset(dataset_ops.DatasetSource):
                                           self.schema, self.permutation)
 
   @property
-  def output_classes(self):
-    return self.cache_type.to_output_classes()
-
-  @property
-  def output_shapes(self):
-    return self.cache_type.to_output_shapes()
-
-  @property
-  def output_types(self):
-    return self.cache_type.to_output_types()
+  def _element_structure(self):
+    return self._structure
diff --git a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
index ef29b5f14a4b2fea2400ec4d56a7ad2cf44cf2cb..ff5d4c458c859fd8e5e3ae65ee41a454d55d6538 100644
--- a/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
+++ b/tensorflow/contrib/ignite/python/tests/ignite_dataset_test.py
@@ -21,6 +21,7 @@ import os
 
 from tensorflow.contrib.ignite import IgniteDataset
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
@@ -65,7 +66,7 @@ class IgniteDatasetTest(test.TestCase):
     self.assertEqual(dtypes.string, dataset.output_types["val"]["NAME"])
     self.assertEqual(dtypes.int64, dataset.output_types["val"]["VAL"])
 
-    it = dataset.make_one_shot_iterator()
+    it = dataset_ops.make_one_shot_iterator(dataset)
     ne = it.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
index 478b716d88321101c971789f36c0ff8ecd3f418e..108da04494685f06f9afc26a26a5dadcdd99b0ff 100644
--- a/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
+++ b/tensorflow/contrib/image/kernels/adjust_hsv_in_yiq_op.cc
@@ -115,7 +115,7 @@ class AdjustHsvInYiqOp<CPUDevice> : public AdjustHsvInYiqOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, &tranformation_matrix](
+          [&input_data, &output_data, &tranformation_matrix](
               int64 start_channel, int64 end_channel) {
             // Applying projection matrix to input RGB vectors.
             const float* p = input_data.data() + start_channel * kChannelSize;
diff --git a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
index 24b790977dfdb675ff7bf0a119a08e243a30d3aa..ae9c7a611945e1445c933d74b9944054b3f0e0a4 100644
--- a/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/dense_image_warp_test.py
@@ -24,7 +24,7 @@ from tensorflow.contrib.image.python.ops import dense_image_warp
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
@@ -259,7 +259,7 @@ class DenseImageWarpTest(test_util.TensorFlowTestCase):
 
     shape = [1, 2, 1, 1]
     msg = 'Should have raised an exception for invalid image size'
-    with self.assertRaises(ValueError, msg=msg):
+    with self.assertRaises(errors.InvalidArgumentError, msg=msg):
       self.check_interpolation_correctness(shape, 'float32', 'float32')
 
 
diff --git a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
index 4997c31a7fc7f4243d03b22fc9c01fb13a2a25a4..ba5cdfebf92c07e496ed588848d5859ff6a5bff2 100644
--- a/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
+++ b/tensorflow/contrib/image/python/kernel_tests/image_ops_test.py
@@ -281,6 +281,13 @@ class ImageOpsTest(test_util.TensorFlowTestCase):
             value.eval(),
             np.array([[4, 4], [4, 4]]).astype(dtype.as_numpy_dtype()))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_transform_eager(self):
+    image = constant_op.constant([[1., 2.], [3., 4.]])
+    value = image_ops.transform(image, [1] * 8)
+    with self.test_session(use_gpu=True):
+      self.assertAllEqual(self.evaluate(value), np.array([[4, 4], [4, 4]]))
+
 
 class BipartiteMatchTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/contrib/image/python/ops/dense_image_warp.py b/tensorflow/contrib/image/python/ops/dense_image_warp.py
index 9c7ada7afb7cb620c2e06887795053778f133287..f7ced440720209cb05dfcd79395c51517f9de0d5 100644
--- a/tensorflow/contrib/image/python/ops/dense_image_warp.py
+++ b/tensorflow/contrib/image/python/ops/dense_image_warp.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -60,28 +61,38 @@ def _interpolate_bilinear(grid,
       msg = 'Grid must be 4 dimensional. Received size: '
       raise ValueError(msg + str(grid.get_shape()))
 
-    batch_size, height, width, channels = shape
+    batch_size, height, width, channels = (array_ops.shape(grid)[0],
+                                           array_ops.shape(grid)[1],
+                                           array_ops.shape(grid)[2],
+                                           array_ops.shape(grid)[3])
+
+    shape = [batch_size, height, width, channels]
     query_type = query_points.dtype
     grid_type = grid.dtype
 
-    if (query_points.shape.rank != 3 or
-        query_points.shape.dims[2].value != 2):
-      msg = ('Query points must be 3 dimensional and size 2 in dim 2. Received '
-             'size: ')
-      raise ValueError(msg + str(query_points.get_shape()))
-
-    _, num_queries, _ = query_points.get_shape().as_list()
-
-    if height < 2 or width < 2:
-      msg = 'Grid must be at least batch_size x 2 x 2 in size. Received size: '
-      raise ValueError(msg + str(grid.get_shape()))
-
-    alphas = []
-    floors = []
-    ceils = []
-
-    index_order = [0, 1] if indexing == 'ij' else [1, 0]
-    unstacked_query_points = array_ops.unstack(query_points, axis=2)
+    with ops.control_dependencies([
+        check_ops.assert_equal(
+            len(query_points.get_shape()),
+            3,
+            message='Query points must be 3 dimensional.'),
+        check_ops.assert_equal(
+            array_ops.shape(query_points)[2],
+            2,
+            message='Query points must be size 2 in dim 2.')
+    ]):
+      num_queries = array_ops.shape(query_points)[1]
+
+    with ops.control_dependencies([
+        check_ops.assert_greater_equal(
+            height, 2, message='Grid height must be at least 2.'),
+        check_ops.assert_greater_equal(
+            width, 2, message='Grid width must be at least 2.')
+    ]):
+      alphas = []
+      floors = []
+      ceils = []
+      index_order = [0, 1] if indexing == 'ij' else [1, 0]
+      unstacked_query_points = array_ops.unstack(query_points, axis=2)
 
     for dim in index_order:
       with ops.name_scope('dim-' + str(dim)):
@@ -112,16 +123,18 @@ def _interpolate_bilinear(grid,
         alpha = array_ops.expand_dims(alpha, 2)
         alphas.append(alpha)
 
-    if batch_size * height * width > np.iinfo(np.int32).max / 8:
-      error_msg = """The image size or batch size is sufficiently large
-                     that the linearized addresses used by array_ops.gather
-                     may exceed the int32 limit."""
-      raise ValueError(error_msg)
-
-    flattened_grid = array_ops.reshape(grid,
-                                       [batch_size * height * width, channels])
-    batch_offsets = array_ops.reshape(
-        math_ops.range(batch_size) * height * width, [batch_size, 1])
+    with ops.control_dependencies([
+        check_ops.assert_less_equal(
+            math_ops.cast(batch_size * height * width, dtype=dtypes.float32),
+            np.iinfo(np.int32).max / 8,
+            message="""The image size or batch size is sufficiently large
+                       that the linearized addresses used by array_ops.gather
+                       may exceed the int32 limit.""")
+    ]):
+      flattened_grid = array_ops.reshape(
+          grid, [batch_size * height * width, channels])
+      batch_offsets = array_ops.reshape(
+          math_ops.range(batch_size) * height * width, [batch_size, 1])
 
     # This wraps array_ops.gather. We reshape the image data such that the
     # batch, y, and x coordinates are pulled into the first dimension.
@@ -182,7 +195,11 @@ def dense_image_warp(image, flow, name='dense_image_warp'):
                 of dimensions.
   """
   with ops.name_scope(name):
-    batch_size, height, width, channels = image.get_shape().as_list()
+    batch_size, height, width, channels = (array_ops.shape(image)[0],
+                                           array_ops.shape(image)[1],
+                                           array_ops.shape(image)[2],
+                                           array_ops.shape(image)[3])
+
     # The flow is defined on the image grid. Turn the flow into a list of query
     # points in the grid space.
     grid_x, grid_y = array_ops.meshgrid(
diff --git a/tensorflow/contrib/image/python/ops/image_ops.py b/tensorflow/contrib/image/python/ops/image_ops.py
index d4fb99a017faebe30384d739f22f4ff5fa986bc4..b25a6f7b5742917a032946fe03a0dab20e7dc1ad 100644
--- a/tensorflow/contrib/image/python/ops/image_ops.py
+++ b/tensorflow/contrib/image/python/ops/image_ops.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.contrib.image.ops import gen_image_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import common_shapes
@@ -271,8 +272,11 @@ def transform(images,
       raise TypeError("Images should have rank between 2 and 4.")
 
     if output_shape is None:
-      output_shape = tensor_util.constant_value(
-          array_ops.shape(images)[1:3]) or array_ops.shape(images)[1:3]
+      output_shape = array_ops.shape(images)[1:3]
+      if not context.executing_eagerly():
+        output_shape_value = tensor_util.constant_value(output_shape)
+        if output_shape_value is not None:
+          output_shape = output_shape_value
 
     output_shape = ops.convert_to_tensor(
         output_shape, dtypes.int32, name="output_shape")
diff --git a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
index 7129f09e8b42e48a9c768fd4a66cde3d4da9d31d..2b86331099ccae03664462987ee0c141d766c10f 100644
--- a/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
+++ b/tensorflow/contrib/kafka/python/ops/kafka_dataset_ops.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.kafka.python.ops import gen_dataset_ops
 from tensorflow.contrib.kafka.python.ops import kafka_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class KafkaDataset(dataset_ops.DatasetSource):
@@ -63,13 +63,5 @@ class KafkaDataset(dataset_ops.DatasetSource):
                                          self._group, self._eof, self._timeout)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/keras/api/keras/layers/__init__.py b/tensorflow/contrib/keras/api/keras/layers/__init__.py
index 3327a9f9a613bfb56e6a25af0fe1c0ca18609035..9e19884df852c0fd259a55aef56c62b4189cd1da 100644
--- a/tensorflow/contrib/keras/api/keras/layers/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/layers/__init__.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 # Generic layers.
 # pylint: disable=g-bad-import-order
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
diff --git a/tensorflow/contrib/keras/api/keras/utils/__init__.py b/tensorflow/contrib/keras/api/keras/utils/__init__.py
index 47cd01b924fb43e8a83836c58f8ced61e9e88268..3b9fa1b230b837a350d521c4165053c187786201 100644
--- a/tensorflow/contrib/keras/api/keras/utils/__init__.py
+++ b/tensorflow/contrib/keras/api/keras/utils/__init__.py
@@ -30,6 +30,7 @@ from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
 from tensorflow.python.keras.utils.vis_utils import plot_model
diff --git a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
index de7530231db4ea4f50996a67eb8c0d6936db9dd3..1626e55b9b3bc82bd96703bfab765ac6ad81f462 100644
--- a/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
+++ b/tensorflow/contrib/kernel_methods/python/kernel_estimators.py
@@ -90,7 +90,7 @@ def _update_features_and_columns(features, feature_columns,
     mapped_column_name = column_name + "_MAPPED"
     # Construct new feature columns based on provided kernel_mappers.
     column_kernel_mappers = kernel_mappers_dict[feature_column]
-    new_dim = sum([mapper.output_dim for mapper in column_kernel_mappers])
+    new_dim = sum(mapper.output_dim for mapper in column_kernel_mappers)
     mapped_columns.add(
         layers.feature_column.real_valued_column(mapped_column_name, new_dim))
 
diff --git a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
index 75806dbbeb1819bb0a6965bbc384e02df9895210..20395395281768ac429984a1e3552cfd187527a2 100644
--- a/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
+++ b/tensorflow/contrib/kinesis/python/ops/kinesis_dataset_ops.py
@@ -20,9 +20,9 @@ from __future__ import print_function
 from tensorflow.contrib.kinesis.python.ops import gen_dataset_ops
 from tensorflow.contrib.kinesis.python.ops import kinesis_op_loader  # pylint: disable=unused-import
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 
 
 class KinesisDataset(dataset_ops.DatasetSource):
@@ -34,15 +34,12 @@ class KinesisDataset(dataset_ops.DatasetSource):
 
   For example, we can construct and use the KinesisDataset as follows:
   ```python
+  tf.enable_eager_execution()
+
   dataset = tf.contrib.kinesis.KinesisDataset(
       "kinesis_stream_name", read_indefinitely=False)
-  next = dataset.make_one_shot_iterator().get_next()
-  with tf.Session() as sess:
-    while True:
-      try:
-        print(sess.run(nxt))
-      except tf.errors.OutOfRangeError:
-        break
+  for element in dataset:
+    print(element)
   ```
 
   Since Kinesis is a data streaming service, data may not be available
@@ -84,13 +81,5 @@ class KinesisDataset(dataset_ops.DatasetSource):
         self._stream, self._shard, self._read_indefinitely, self._interval)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/contrib/layers/BUILD b/tensorflow/contrib/layers/BUILD
index e6596bfdfb9b153e5946ab7f8933c160cf2f2326..9ca6f8df5dbe3c236c4cd85095176ce69ad9deaa 100644
--- a/tensorflow/contrib/layers/BUILD
+++ b/tensorflow/contrib/layers/BUILD
@@ -78,6 +78,11 @@ tf_custom_op_py_library(
         ":sparse_feature_cross_op_op_lib",
     ],
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     deps = [
         ":sparse_feature_cross_op",
         "//tensorflow/contrib/framework:framework_py",
@@ -253,7 +258,7 @@ py_test(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
@@ -277,7 +282,7 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
index 124515e5a6474f2cc1038830346e27277c6ceea7..295c721fceda6aaaf8672525ceed560308db6af7 100644
--- a/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/embedding_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 import itertools
 import math
+import sys
 
 import numpy as np
 
@@ -36,6 +37,7 @@ from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -48,11 +50,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= vocab_size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32)
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[vocab_size, embed_dim],
-        slicing=[num_shards, 1],
-        initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32))
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
+        initializer=initializer))
     for w in embedding_weights:
       w.initializer.run()
     embedding_weights = [w.eval() for w in embedding_weights]
@@ -256,6 +260,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                         embedding_weights, sparse_ids, sparse_weights)
 
 
+# pylint: disable=invalid-name
+def local_variable_scope():
+  """Create a variable scope named like the caller function."""
+  return variable_scope.variable_scope(sys._getframe(1).f_code.co_name)
+# pylint: enable=invalid-name
+
+
 class ScatteredEmbeddingLookupTest(test.TestCase):
 
   def setUp(self):
@@ -266,17 +277,18 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[size],
-        slicing=[num_shards],
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
         initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0, dtype=dtypes.float32))
+            mean=0.0, stddev=1.0, dtype=dtypes.float32)))
     for w in embedding_weights:
       w.initializer.run()
     return embedding_weights
 
   def test_scattered_embedding_consistency(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant(["foo", "foo"])
 
@@ -288,7 +300,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1])
 
   def test_scattered_embedding_multiple_partition(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights(num_shards=7)
       values = constant_op.constant([4, 4, 5])
 
@@ -304,7 +316,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
       self.assertGreater(embedding_diff, 0)
 
   def test_scattered_embedding_coverage(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       size = 8
       embedding_weights = self._random_weights(size=size, num_shards=3)
       values = constant_op.constant(["foo"])
@@ -316,7 +328,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
       self.assertEqual(len(np.unique(embedding_lookup_result[0])), size)
 
   def test_scattered_embedding_multi_dimension(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant([["foo", "bar", "bar"],
                                      ["bar", "bar", "foo"]])
@@ -329,7 +341,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1][2])
 
   def test_scattered_embedding_lookup_sparse(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_tensor = sparse_tensor_lib.SparseTensor(
           values=["foo", "bar", "foo", "bar"],
@@ -358,7 +370,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     embeds = np.random.randn(n_embed, d_embed)
     idx = np.random.randint(0, n_embed, idx_shape)
 
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedded_np = embeds[idx]
       embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval()
 
@@ -370,7 +382,7 @@ class ScatteredEmbeddingLookupTest(test.TestCase):
     idx = np.random.randint(0, 5, 10)
     idx2d = np.random.randint(0, 5, (10, 2))
 
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedded_np = embeds[idx]
       embedded_np2d = embeds[idx2d]
       embedded_tf = embedding_ops.embedding_lookup_unique(embeds, idx).eval()
@@ -398,17 +410,18 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    embedding_weights = list(variable_scope.get_variable(
+        "embedding_weights",
         shape=[size],
-        slicing=[num_shards],
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
         initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0, dtype=dtypes.float32))
+            mean=0.0, stddev=1.0, dtype=dtypes.float32)))
     for w in embedding_weights:
       w.initializer.run()
     return embedding_weights
 
   def test_hashed_embedding_consistency(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant(["foo", "foo"])
       # The first three sampled_candidates are equal, so the first three
@@ -429,7 +442,7 @@ class SampledScatteredEmbeddingLookupTest(test.TestCase):
                           embedding_lookup_result[1][3])
 
   def test_hashed_embedding_multi_dimension(self):
-    with self.cached_session():
+    with self.cached_session(), local_variable_scope():
       embedding_weights = self._random_weights()
       values = constant_op.constant([["foo", "bar", "bar"],
                                      ["bar", "bar", "foo"]])
@@ -691,7 +704,6 @@ class EmbeddingLookupSparseWithDistributedAggregationTest(test.TestCase):
       index += num_val
     return grouped_vals
 
-  @test_util.enable_c_shapes
   def testEmbeddingLookupSparse(self):
     vocab_size = 13
     batch_size = 10
diff --git a/tensorflow/contrib/layers/python/layers/encoders.py b/tensorflow/contrib/layers/python/layers/encoders.py
index f42112206d0db9d2e42bd4cff19f6a6533951d46..3671633c8d795034b13cb55fd6db87c453e9fa12 100644
--- a/tensorflow/contrib/layers/python/layers/encoders.py
+++ b/tensorflow/contrib/layers/python/layers/encoders.py
@@ -84,8 +84,7 @@ def bow_encoder(ids,
       if isinstance(ids, sparse_tensor.SparseTensor):
         raise TypeError('ids are expected to be dense Tensor, got: %s', ids)
       return math_ops.reduce_mean(
-          embedding_ops.embedding_lookup(embeddings, ids),
-          reduction_indices=1)
+          embedding_ops.embedding_lookup(embeddings, ids), axis=1)
 
 
 def embed_sequence(ids,
diff --git a/tensorflow/contrib/layers/python/layers/feature_column.py b/tensorflow/contrib/layers/python/layers/feature_column.py
index 222404b19db2b93b695ee6d2cb16584e17033700..00d819ed0e9fe3a5644105a571beda100204631e 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column.py
@@ -1015,8 +1015,7 @@ class _OneHotColumn(
         dense_id_tensor, depth=self.length, on_value=1.0, off_value=0.0)
 
     # Reduce to get a multi-hot per example.
-    return math_ops.reduce_sum(
-        one_hot_id_tensor, reduction_indices=[output_rank - 1])
+    return math_ops.reduce_sum(one_hot_id_tensor, axis=[output_rank - 1])
 
   @property
   def _variable_shape(self):
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
index 6fb4b9ff3534cab34c84de5d13fea7aff756556d..7e6eafaa0d6f60cfc28a4c422abac0b6d5a991fb 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_ops_test.py
@@ -27,7 +27,7 @@ from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import feature_column_ops
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/layers/python/layers/feature_column_test.py b/tensorflow/contrib/layers/python/layers/feature_column_test.py
index d90d6ecf7f671a40a7ff2b066b6782c7421f9887..cab8da808b6413518ff4864cb0b03a42809260f1 100644
--- a/tensorflow/contrib/layers/python/layers/feature_column_test.py
+++ b/tensorflow/contrib/layers/python/layers/feature_column_test.py
@@ -27,7 +27,7 @@ import numpy as np
 
 from tensorflow.contrib.layers.python.layers import feature_column as fc
 from tensorflow.contrib.layers.python.layers import feature_column_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
diff --git a/tensorflow/contrib/layers/python/layers/layers.py b/tensorflow/contrib/layers/python/layers/layers.py
index ac9561c7693fc4ad994a00889aa3f15b4b5a5ee4..403b522ce45ac6ad98a321378626b87aaa7738aa 100644
--- a/tensorflow/contrib/layers/python/layers/layers.py
+++ b/tensorflow/contrib/layers/python/layers/layers.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import convolutional as convolutional_layers
 from tensorflow.python.layers import core as core_layers
@@ -1958,7 +1959,7 @@ class GDN(base.Layer):
     self._reparam_offset = reparam_offset
     self.data_format = data_format
     self._channel_axis()  # trigger ValueError early
-    self.input_spec = base.InputSpec(min_ndim=3, max_ndim=5)
+    self.input_spec = input_spec.InputSpec(min_ndim=3, max_ndim=5)
 
   def _channel_axis(self):
     try:
@@ -2015,7 +2016,7 @@ class GDN(base.Layer):
       raise ValueError('The channel dimension of the inputs to `GDN` '
                        'must be defined.')
     self._input_rank = input_shape.ndims
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=input_shape.ndims, axes={
             channel_axis: num_channels
         })
diff --git a/tensorflow/contrib/layers/python/layers/layers_test.py b/tensorflow/contrib/layers/python/layers/layers_test.py
index 8ead6336a08db4dd52edf0d3372db5a50f860e2b..d791418c9d0f887058ceb535092fa8122da1aa75 100644
--- a/tensorflow/contrib/layers/python/layers/layers_test.py
+++ b/tensorflow/contrib/layers/python/layers/layers_test.py
@@ -1459,13 +1459,6 @@ class DropoutTest(test.TestCase):
 
 class FlattenTest(test.TestCase):
 
-  def testInvalidRank(self):
-    with ops.Graph().as_default() as g, self.session(g):
-      inputs = array_ops.placeholder(dtype=dtypes.float32)
-      inputs.set_shape(tensor_shape.TensorShape((5,)))
-      with self.assertRaisesRegexp(ValueError, 'incompatible with the layer'):
-        _layers.flatten(inputs)
-
   def testUnknownLastDim(self):
     with ops.Graph().as_default() as g, self.session(g):
       inputs = array_ops.placeholder(dtype=dtypes.float32)
@@ -1502,6 +1495,12 @@ class FlattenTest(test.TestCase):
                        images.get_shape().num_elements())
       self.assertEqual(output.get_shape()[0], images.get_shape()[0])
 
+  def testFlatten0D(self):
+    with self.cached_session():
+      scalars = random_ops.random_uniform((5,), seed=1, name='scalars')
+      output = _layers.flatten(scalars)
+      self.assertEqual(output.shape, (5, 1))
+
   def testFlattenBatchSize(self):
     height, width = 3, 3
     with self.cached_session() as sess:
@@ -3811,7 +3810,7 @@ class UnitNormTests(test.TestCase):
       image = random_ops.random_uniform((height, width, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), axis=dim))
 
       shape = [height, width, 3]
       del shape[dim]
@@ -3847,7 +3846,7 @@ class UnitNormTests(test.TestCase):
       image = array_ops.placeholder(dtypes.float32, (None, None, 3))
       output = _layers.unit_norm(image, dim=dim, epsilon=1e-6)
       norms = math_ops.sqrt(
-          math_ops.reduce_sum(math_ops.square(output), reduction_indices=dim))
+          math_ops.reduce_sum(math_ops.square(output), axis=dim))
 
       with self.cached_session():
         actual = norms.eval({image: placeholder_value})
diff --git a/tensorflow/contrib/layers/python/layers/regularizers_test.py b/tensorflow/contrib/layers/python/layers/regularizers_test.py
index 51faba30c74d64c54d3d2b11d2a11195cca6b759..5cb00b76847430be8ade9f4e4fc8f7372035485a 100644
--- a/tensorflow/contrib/layers/python/layers/regularizers_test.py
+++ b/tensorflow/contrib/layers/python/layers/regularizers_test.py
@@ -141,7 +141,7 @@ class RegularizerTest(test.TestCase):
     dummy_regularizer = lambda x: math_ops.reduce_sum(2 * x)
     array_weights_list = [[1.5], [2, 3, 4.2], [10, 42, 666.6]]
     tensor_weights_list = [constant_op.constant(x) for x in array_weights_list]
-    expected = sum([2 * x for l in array_weights_list for x in l])
+    expected = sum(2 * x for l in array_weights_list for x in l)
     with self.cached_session():
       result = regularizers.apply_regularization(dummy_regularizer,
                                                  tensor_weights_list)
diff --git a/tensorflow/contrib/learn/BUILD b/tensorflow/contrib/learn/BUILD
index 61185f65a9bd294003515456f891de0a68661a82..14065fcee51c014a1af227504eaaca1fa39941e1 100644
--- a/tensorflow/contrib/learn/BUILD
+++ b/tensorflow/contrib/learn/BUILD
@@ -24,6 +24,11 @@ py_library(
         exclude = ["python/learn/**/*_test.py"],
     ),
     srcs_version = "PY2AND3",
+    visibility = [
+        "//learning/brain:__subpackages__",
+        "//tensorflow:__subpackages__",
+        "//video/youtube/personalization:__subpackages__",
+    ],
     # This library should not depend on sklearn, even though some of the code
     # refers to it. (The code handles the presence of sklearn conditionally.)
     deps = [
@@ -269,6 +274,7 @@ py_test(
     name = "estimator_test",
     size = "medium",
     srcs = ["python/learn/estimators/estimator_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = [
         "manual",
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn.py b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
index eabebb7e881558471c343c0573cc9a8f4a425312..10fbd60ba2df4c3f84169bf04f249d67dc14573f 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn.py
@@ -28,7 +28,6 @@ import six
 from tensorflow.contrib import layers
 from tensorflow.contrib.framework import deprecated
 from tensorflow.contrib.framework import deprecated_arg_values
-from tensorflow.python.training import training_util
 from tensorflow.contrib.layers.python.layers import feature_column
 from tensorflow.contrib.layers.python.layers import optimizers
 from tensorflow.contrib.learn.python.learn import metric_spec
@@ -38,11 +37,12 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.ops import nn
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.summary import summary
+from tensorflow.python.training import training_util
 
 # The default learning rate of 0.05 is a historical artifact of the initial
 # implementation, but seems a reasonable choice.
@@ -150,10 +150,10 @@ def _dnn_model_fn(features, labels, mode, params, config=None):
         "input_from_feature_columns",
         values=tuple(six.itervalues(features)),
         partitioner=input_layer_partitioner) as input_layer_scope:
-      if all([
+      if all(
           isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
           for fc in feature_columns
-      ]):
+      ):
         net = layers.input_from_feature_columns(
             columns_to_tensors=features,
             feature_columns=feature_columns,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
index 3d85533d92d17095bae9a69f229171e1bf61ba10..2ade6b7b6ce2678ec8df7c98ffaa5636ae9d4b1d 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import model_fn
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import nn
@@ -236,10 +236,10 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
           "input_from_feature_columns",
           values=tuple(six.itervalues(features)),
           partitioner=input_layer_partitioner) as dnn_input_scope:
-        if all([
+        if all(
             isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
             for fc in dnn_feature_columns
-        ]):
+        ):
           net = layers.input_from_feature_columns(
               columns_to_tensors=features,
               feature_columns=dnn_feature_columns,
@@ -292,8 +292,8 @@ def _dnn_linear_combined_model_fn(features, labels, mode, params, config=None):
         linear_parent_scope,
         values=tuple(six.itervalues(features)),
         partitioner=linear_partitioner) as scope:
-      if all([isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
-              for fc in linear_feature_columns]):
+      if all(isinstance(fc, feature_column_lib._FeatureColumn)  # pylint: disable=protected-access
+             for fc in linear_feature_columns):
         if joint_linear_weights:
           linear_logits, _, _ = layers.joint_weighted_sum_from_feature_columns(
               columns_to_tensors=features,
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
index 4e65c180d8bee9ab8fe9b1fbf32edc229c31af09..d46a873bfaa297e7f6242aa56e9d0bf0eb551867 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_linear_combined_test.py
@@ -36,7 +36,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
index 2bd57597c2e9444b51b1dacfbe4180b443c95a3d..ee25cebd484f1e831fe8b6d3aa7290da7558adee 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dnn_test.py
@@ -38,7 +38,7 @@ from tensorflow.contrib.learn.python.learn.estimators import run_config
 from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
diff --git a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
index 1d8a59281a4934ad063362cba064e6cb3abff5a2..28c4964527bb034c8c6b1642366c6c82c1a72201 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/dynamic_rnn_estimator_test.py
@@ -668,7 +668,7 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
         sequences = centers + noise
 
         inputs = array_ops.expand_dims(sequences, 2)
-        labels = math_ops.reduce_mean(sequences, reduction_indices=[1])
+        labels = math_ops.reduce_mean(sequences, axis=[1])
         return {'inputs': inputs}, labels
 
       return input_fn
@@ -722,8 +722,8 @@ class DynamicRNNEstimatorLearningTest(test.TestCase):
         inputs = array_ops.expand_dims(math_ops.to_float(random_sequence), 2)
         labels = math_ops.to_int32(
             array_ops.squeeze(
-                math_ops.reduce_sum(
-                    inputs, reduction_indices=[1]) > (sequence_length / 2.0)))
+                math_ops.reduce_sum(inputs, axis=[1]) > (
+                    sequence_length / 2.0)))
         return {'inputs': inputs}, labels
 
       return input_fn
diff --git a/tensorflow/contrib/learn/python/learn/estimators/estimator.py b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
index 8bc869db895b753be805219892342b5e6ea3799b..9132b2209bce8005b323d058d6d176784a84b2d1 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/estimator.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/estimator.py
@@ -1066,11 +1066,11 @@ class BaseEstimator(sklearn.BaseEstimator, evaluable.Evaluable,
       chief_hooks = []
       if (self._config.save_checkpoints_secs or
           self._config.save_checkpoints_steps):
-        saver_hook_exists = any([
+        saver_hook_exists = any(
             isinstance(h, basic_session_run_hooks.CheckpointSaverHook)
             for h in (all_hooks + model_fn_ops.training_hooks + chief_hooks +
                       model_fn_ops.training_chief_hooks)
-        ])
+        )
         if not saver_hook_exists:
           chief_hooks = [
               basic_session_run_hooks.CheckpointSaverHook(
@@ -1493,7 +1493,7 @@ class Estimator(BaseEstimator):
 # pylint: disable=protected-access
 class SKCompat(sklearn.BaseEstimator):
   """Scikit learn wrapper for TensorFlow Learn Estimator.
-  
+
   THIS CLASS IS DEPRECATED. See
   [contrib/learn/README.md](https://www.tensorflow.org/code/tensorflow/contrib/learn/README.md)
   for general migration instructions.
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear.py b/tensorflow/contrib/learn/python/learn/estimators/linear.py
index e100bc7a1e7be4896e9ab1c965775b5185b38897..9ee8d8004bf26224dd96a98bad109720c44d04f7 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import head as head_lib
 from tensorflow.contrib.learn.python.learn.estimators import prediction_key
 from tensorflow.contrib.learn.python.learn.utils import export
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
@@ -155,8 +155,8 @@ def _linear_model_fn(features, labels, mode, params, config=None):
       parent_scope,
       values=tuple(six.itervalues(features)),
       partitioner=partitioner) as scope:
-    if all([isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
-            for fc in feature_columns]):
+    if all(isinstance(fc, feature_column._FeatureColumn)  # pylint: disable=protected-access
+           for fc in feature_columns):
       if joint_weights:
         layer_fn = layers.joint_weighted_sum_from_feature_columns
       else:
diff --git a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
index 597ca4e86dbf66c86182f14a2a364b662d52fb0a..dfc76bfde6c0109f98093232b6f223d6938007f9 100644
--- a/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
+++ b/tensorflow/contrib/learn/python/learn/estimators/linear_test.py
@@ -37,7 +37,7 @@ from tensorflow.contrib.learn.python.learn.estimators import test_data
 from tensorflow.contrib.learn.python.learn.metric_spec import MetricSpec
 from tensorflow.contrib.linear_optimizer.python import sdca_optimizer as sdca_optimizer_lib
 from tensorflow.contrib.metrics.python.ops import metric_ops
-from tensorflow.python.feature_column import feature_column as fc_core
+from tensorflow.python.feature_column import feature_column_lib as fc_core
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
@@ -1745,7 +1745,7 @@ class LinearRegressorTest(test.TestCase):
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
       }, constant_op.constant(
-          [[1 if i % 4 is 0 else 0] for i in range(num_examples)])
+          [[1 if i % 4 == 0 else 0] for i in range(num_examples)])
 
     place_holder = feature_column_lib.real_valued_column('place_holder')
     sdca_optimizer = sdca_optimizer_lib.SDCAOptimizer(
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
index 29552d24f1eaa0d85a99c8b09f69d007e7e4fe9f..59a67636ae275c5ca1df21685770baa7a960d667 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/numpy_io.py
@@ -27,7 +27,7 @@ from tensorflow.python.estimator.inputs.numpy_io import numpy_input_fn as core_n
 from tensorflow.python.util.deprecation import deprecated
 
 
-@deprecated(None, 'Use tf.estimator.inputs.numpy_input_fn.')
+@deprecated(None, 'Use tf.compat.v1.estimator.inputs.numpy_input_fn.')
 def numpy_input_fn(x,
                    y=None,
                    batch_size=128,
diff --git a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
index b4ef055f5ae484ec704ad42efcf2c00c4a7a4f56..e9df7258a358d9543f2bb386518d900bd6ddef74 100644
--- a/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
+++ b/tensorflow/contrib/learn/python/learn/learn_io/pandas_io.py
@@ -53,7 +53,7 @@ PANDAS_DTYPES = {
 }
 
 
-@deprecated(None, 'Please use tf.estimator.inputs.pandas_input_fn')
+@deprecated(None, 'Please use tf.compat.v1.estimator.inputs.pandas_input_fn')
 def pandas_input_fn(x,
                     y=None,
                     batch_size=128,
diff --git a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
index 8466dc36d13e223aed4f1dfe8e39a6f91c99fa55..d49834dc860a8b4341ddd3720fde52281f7474f7 100644
--- a/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/kernel_tests/sdca_ops_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for SdcaModel."""
+"""Tests for SdcaModel (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
index f3f1dcd98db5ae24af154d1f0851a0688d2bc611..c056a12fa5307a7e9ac4cf30e1386ddfd5cd7d75 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sdca_ops.py
@@ -12,7 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Proximal stochastic dual coordinate ascent optimizer for linear models."""
+# pylint: disable=line-too-long
+"""Proximal stochastic dual coordinate ascent optimizer for linear models (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
+# pylint: enable=line-too-long
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -40,6 +47,7 @@ from tensorflow.python.ops import variables as var_ops
 from tensorflow.python.ops.nn import log_poisson_loss
 from tensorflow.python.ops.nn import sigmoid_cross_entropy_with_logits
 from tensorflow.python.summary import summary
+from tensorflow.python.util import deprecation
 
 __all__ = ['SdcaModel']
 
@@ -48,7 +56,7 @@ __all__ = ['SdcaModel']
 class SdcaModel(object):
   """Stochastic dual coordinate ascent solver for linear models.
 
-    Loss functions supported:
+  Loss functions supported:
 
      * Binary logistic loss
      * Squared loss
@@ -109,6 +117,10 @@ class SdcaModel(object):
     ```
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, examples, variables, options):
     """Create a new sdca optimizer."""
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
index a001555e8f257c88a52fdb40d4181f5cd9c92e84..a28394964a12013c43d85701b5a0ab5c559afd62 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sharded mutable dense hash table."""
+"""Sharded mutable dense hash table (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -28,6 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 
 
 # TODO(rohanj): This should subclass Checkpointable and implement
@@ -45,6 +51,10 @@ class ShardedMutableDenseHashTable(object):
 
   # TODO(andreasst): consider moving this to lookup module
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self,
                key_dtype,
                value_dtype,
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
index 2b56d0fa3a8b8564b7c73a62bd99cc900d6f5c54..2d1457f9e4cc576da696be191e718814dd9ff4e5 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sharded_mutable_dense_hashtable_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sharded_mutable_dense_hashtable.py."""
+"""Tests for sharded_mutable_dense_hashtable.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
index 003795233ff2b28e33fc10388ef25efb63c43bb0..64730f8eed1ff9bfcd4a980dceb28abb98e39f73 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Sparse feature column."""
+"""Sparse feature column (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,6 +26,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework.ops import internal_convert_to_tensor
 from tensorflow.python.framework.ops import name_scope
+from tensorflow.python.util import deprecation
 
 
 class SparseFeatureColumn(object):
@@ -68,6 +74,10 @@ class SparseFeatureColumn(object):
   @@feature_values
   """
 
+  @deprecation.deprecated(
+      None, 'This class is deprecated. To UPDATE or USE linear optimizers, '
+      'please check its latest version in core: '
+      'tensorflow_estimator/python/estimator/canned/linear_optimizer/.')
   def __init__(self, example_indices, feature_indices, feature_values):
     """Creates a `SparseFeatureColumn` representation.
 
diff --git a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
index 51c4f68543da2f563481cc2d35b556796616cf9d..0ae780e1a100c7dadde7196803f2ae0d4bcb2334 100644
--- a/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/ops/sparse_feature_column_test.py
@@ -12,7 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for sparse_feature_column.py."""
+"""Tests for sparse_feature_column.py (deprecated).
+
+This module and all its submodules are deprecated. To UPDATE or USE linear
+optimizers, please check its latest version in core:
+tensorflow_estimator/python/estimator/canned/linear_optimizer/.
+"""
 
 from __future__ import absolute_import
 from __future__ import division
diff --git a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
index 647667188238dc18b137eaad98356a79b3a549b4..7a5354222f103aa0f45adc513079e420bbbfd30c 100644
--- a/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
+++ b/tensorflow/contrib/linear_optimizer/python/sdca_estimator_test.py
@@ -524,7 +524,7 @@ class SDCALinearRegressorTest(test.TestCase):
           # LinearClassifier requires at least one column.
           'place_holder':
               constant_op.constant([[0.0]] * num_examples),
-      }, constant_op.constant([[1 if i % 4 is 0 else 0]
+      }, constant_op.constant([[1 if i % 4 == 0 else 0]
                                for i in range(num_examples)])
 
     with self._single_threaded_test_session():
diff --git a/tensorflow/contrib/lookup/lookup_ops_test.py b/tensorflow/contrib/lookup/lookup_ops_test.py
index 5e99ef460518fa761b12533e5dc07dc252f1d582..9b2c2dd87cc8a92fbb6b45504939be3788b60839 100644
--- a/tensorflow/contrib/lookup/lookup_ops_test.py
+++ b/tensorflow/contrib/lookup/lookup_ops_test.py
@@ -25,6 +25,7 @@ import six
 from tensorflow.contrib import lookup
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import counter
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -2737,7 +2738,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_scalar_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable(1.0)
     insert = table.insert(c, value)
     size = table.size()
@@ -2758,7 +2759,7 @@ class MutableHashTableBenchmark(test.Benchmark):
 
   def benchmark_many_repeated_batch_32_insert_scalar(self):
     table = self._create_table()
-    c = counter.Counter().make_one_shot_iterator().get_next()
+    c = dataset_ops.make_one_shot_iterator(counter.Counter()).get_next()
     value = variables.Variable([1.0] * 32)
     insert = table.insert(32 * c + list(range(32)), value)
     size = table.size()
diff --git a/tensorflow/contrib/losses/python/losses/loss_ops.py b/tensorflow/contrib/losses/python/losses/loss_ops.py
index 619294b51822bd9983eda777acae5cf0d253926d..709a042bbcefb89125f7e4cd14a0d7ecd2b53281 100644
--- a/tensorflow/contrib/losses/python/losses/loss_ops.py
+++ b/tensorflow/contrib/losses/python/losses/loss_ops.py
@@ -22,7 +22,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.contrib.framework.python.ops import add_arg_scope
-from tensorflow.python.compat import compat
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -60,41 +59,12 @@ def _scale_losses(losses, weights):
   """
   # First, compute the sum of the losses over all elements:
   start_index = max(0, weights.get_shape().ndims)
-  reduction_indices = list(range(start_index, losses.get_shape().ndims))
-  reduced_losses = math_ops.reduce_sum(
-      losses, reduction_indices=reduction_indices)
+  axis = list(range(start_index, losses.get_shape().ndims))
+  reduced_losses = math_ops.reduce_sum(losses, axis=axis)
   reduced_losses = math_ops.multiply(reduced_losses, weights)
   return math_ops.reduce_sum(reduced_losses)
 
 
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator, name=name)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
-
-
 def _safe_mean(losses, num_present):
   """Computes a safe mean of the losses.
 
@@ -107,7 +77,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present, name="value")
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 @deprecated("2016-12-30", "Use tf.losses.compute_weighted_loss instead.")
@@ -187,10 +157,9 @@ def _num_present(losses, weights, per_batch=False):
 
   # First, count the number of nonzero weights:
   if weights.get_shape().ndims >= 1:
-    reduction_indices = list(range(1, weights.get_shape().ndims))
+    axis = list(range(1, weights.get_shape().ndims))
     num_nonzero_per_batch = math_ops.reduce_sum(
-        math_ops.to_float(math_ops.not_equal(weights, 0)),
-        reduction_indices=reduction_indices)
+        math_ops.to_float(math_ops.not_equal(weights, 0)), axis=axis)
 
   # Next, determine the number of elements that weights would broadcast to:
   broadcast_dims = array_ops.slice(
@@ -606,20 +575,20 @@ def mean_pairwise_squared_error(predictions,
     if weights.get_shape().ndims is None:
       raise ValueError("weights.get_shape().ndims cannot be None")
 
-    reduction_indices = list(range(1, diffs.get_shape().ndims))
+    axis = list(range(1, diffs.get_shape().ndims))
 
     sum_squares_diff_per_batch = math_ops.reduce_sum(
-        math_ops.square(diffs), reduction_indices=reduction_indices)
+        math_ops.square(diffs), axis=axis)
     num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-    term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
-                            num_present_per_batch,
-                            name="value")
+    term1 = 2.0 * math_ops.div_no_nan(
+        sum_squares_diff_per_batch, num_present_per_batch, name="value")
 
-    sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices)
-    term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
-                            math_ops.square(num_present_per_batch),
-                            name="value")
+    sum_diff = math_ops.reduce_sum(diffs, axis=axis)
+    term2 = 2.0 * math_ops.div_no_nan(
+        math_ops.square(sum_diff),
+        math_ops.square(num_present_per_batch),
+        name="value")
 
     loss = _scale_losses(term1 - term2, weights)
 
@@ -674,7 +643,7 @@ def cosine_distance(predictions,
 
     radial_diffs = math_ops.multiply(predictions, labels)
     losses = 1 - math_ops.reduce_sum(
-        radial_diffs, reduction_indices=[
+        radial_diffs, axis=[
             axis,
         ])
     return compute_weighted_loss(losses, weights, scope=scope)
diff --git a/tensorflow/contrib/makefile/download_dependencies.sh b/tensorflow/contrib/makefile/download_dependencies.sh
index 0a07588f07f0bb89dbf5dc5909f511f74470fb41..2a5232b476712a96f84be0f4725beb78bc138297 100755
--- a/tensorflow/contrib/makefile/download_dependencies.sh
+++ b/tensorflow/contrib/makefile/download_dependencies.sh
@@ -30,11 +30,13 @@ EIGEN_URL="$(grep -o 'http.*bitbucket.org/eigen/eigen/get/.*tar\.gz' "${BZL_FILE
 GEMMLOWP_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/gemmlowp/.*zip' "${BZL_FILE_PATH}" | head -n1)"
 GOOGLETEST_URL="https://github.com/google/googletest/archive/release-1.8.0.tar.gz"
 NSYNC_URL="$(grep -o 'https://mirror.bazel.build/github.com/google/nsync/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
-# Note: The Protobuf source in `tensorflow/workspace.bzl` in TensorFlow
-# 1.10 branch does not work. `make distclean` fails and blocks the build
-# process. For now we're hardcoding to the version which is used by
-# TensorFlow 1.9.
-PROTOBUF_URL="https://mirror.bazel.build/github.com/google/protobuf/archive/396336eb961b75f03b25824fe86cf6490fb75e3a.tar.gz"
+
+# Note: The protobuf repo needs to be cloned due to its submodules.
+# These variables contain the GitHub repo and the sha, from `tensorflow/workspace.bzl`,
+# from which to clone it from and checkout to.
+readonly PROTOBUF_REPO="https://github.com/protocolbuffers/protobuf.git"
+readonly PROTOBUF_TAG="$(grep -o 'https://github.com/protocolbuffers/protobuf/archive/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1 | awk '{print substr($0, index($0, "archive") + 8, index($0, "tar") - index($0, "archive") - 9) }')"
+
 # TODO (yongtang): Replace the following with 'https://mirror.bazel.build/github.com/google/re2/.*tar\.gz' once
 # the archive has been propagated in mirror.bazel.build.
 RE2_URL="$(grep -o 'https://github.com/google/re2/.*tar\.gz' "${BZL_FILE_PATH}" | head -n1)"
@@ -91,11 +93,34 @@ download_and_extract() {
   find "${dir}" -type f -name '*BUILD' -delete
 }
 
+function clone_repository() {
+  local repo_url="${1}"
+  local destination_directory="${2}"
+  local commit_sha="${3}"
+
+  if [[ -d "${destination_directory}" ]]; then
+    rm -rf "${destination_directory}"
+  fi
+
+  git clone "${repo_url}" "${destination_directory}"
+
+  pushd "$(pwd)" 1>/dev/null
+
+  cd "${destination_directory}"
+
+  if [[ -n "${commit_sha}" ]]; then
+    git checkout "${PROTOBUF_TAG}"
+  fi
+
+  git submodule update --init
+
+  popd 1>/dev/null
+}
+
 download_and_extract "${EIGEN_URL}" "${DOWNLOADS_DIR}/eigen"
 download_and_extract "${GEMMLOWP_URL}" "${DOWNLOADS_DIR}/gemmlowp"
 download_and_extract "${GOOGLETEST_URL}" "${DOWNLOADS_DIR}/googletest"
 download_and_extract "${NSYNC_URL}" "${DOWNLOADS_DIR}/nsync"
-download_and_extract "${PROTOBUF_URL}" "${DOWNLOADS_DIR}/protobuf"
 download_and_extract "${RE2_URL}" "${DOWNLOADS_DIR}/re2"
 download_and_extract "${FFT2D_URL}" "${DOWNLOADS_DIR}/fft2d"
 download_and_extract "${DOUBLE_CONVERSION_URL}" "${DOWNLOADS_DIR}/double_conversion"
@@ -106,6 +131,8 @@ download_and_extract "${CUB_URL}" "${DOWNLOADS_DIR}/cub/external/cub_archive"
 download_and_extract "${FARMHASH_URL}" "${DOWNLOADS_DIR}/farmhash"
 download_and_extract "${FLATBUFFERS_URL}" "${DOWNLOADS_DIR}/flatbuffers"
 
+clone_repository "${PROTOBUF_REPO}" "${DOWNLOADS_DIR}/protobuf" "${PROTOBUF_TAG}"
+
 replace_by_sed 's#static uint32x4_t p4ui_CONJ_XOR = vld1q_u32( conj_XOR_DATA );#static uint32x4_t p4ui_CONJ_XOR; // = vld1q_u32( conj_XOR_DATA ); - Removed by script#' \
   "${DOWNLOADS_DIR}/eigen/Eigen/src/Core/arch/NEON/Complex.h"
 replace_by_sed 's#static uint32x2_t p2ui_CONJ_XOR = vld1_u32( conj_XOR_DATA );#static uint32x2_t p2ui_CONJ_XOR;// = vld1_u32( conj_XOR_DATA ); - Removed by scripts#' \
diff --git a/tensorflow/contrib/makefile/tf_op_files.txt b/tensorflow/contrib/makefile/tf_op_files.txt
index e779eff68901af7042deb5c09b78a230e0d06d02..655c7eefcb978d40c8bc16a23685e03ed71bfb63 100644
--- a/tensorflow/contrib/makefile/tf_op_files.txt
+++ b/tensorflow/contrib/makefile/tf_op_files.txt
@@ -157,6 +157,7 @@ tensorflow/core/kernels/mirror_pad_op_cpu_impl_2.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_3.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_4.cc
 tensorflow/core/kernels/mirror_pad_op_cpu_impl_5.cc
+tensorflow/core/kernels/multinomial_op.cc
 tensorflow/core/kernels/no_op.cc
 tensorflow/core/kernels/non_max_suppression_op.cc
 tensorflow/core/kernels/one_hot_op.cc
@@ -252,6 +253,7 @@ tensorflow/core/kernels/split_op.cc
 tensorflow/core/kernels/split_v_op.cc
 tensorflow/core/kernels/stack.cc
 tensorflow/core/kernels/stack_ops.cc
+tensorflow/core/kernels/stateless_random_ops.cc
 tensorflow/core/kernels/strided_slice_op.cc
 tensorflow/core/kernels/strided_slice_op_inst_0.cc
 tensorflow/core/kernels/strided_slice_op_inst_1.cc
diff --git a/tensorflow/contrib/metrics/python/metrics/classification.py b/tensorflow/contrib/metrics/python/metrics/classification.py
index ac1236086503a7c6e541bdf098efcb92f84e577f..9aabc4bec3053871e3ff6cd3a88fd76d293f48cc 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import metrics_impl
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.training import distribution_strategy_context
 
 # TODO(nsilberman): move into metrics/python/ops/
 
@@ -175,7 +175,7 @@ def f1_score(labels, predictions, weights=None, num_thresholds=200,
       return best_f1
 
     best_f1 = distribution_strategy_context.get_replica_context().merge_call(
-        f1_across_replicas, values)
+        f1_across_replicas, args=(values,))
 
     update_op = compute_best_f1_score(tp=update_ops['tp'], fp=update_ops['fp'],
                                       fn=update_ops['fn'], name='update')
diff --git a/tensorflow/contrib/metrics/python/metrics/classification_test.py b/tensorflow/contrib/metrics/python/metrics/classification_test.py
index d6a670f97b32a29129cb9ea0cd71c5a2b7597a47..e789d2cb9dfbac7b1e145be48b3f707af3fd4e18 100644
--- a/tensorflow/contrib/metrics/python/metrics/classification_test.py
+++ b/tensorflow/contrib/metrics/python/metrics/classification_test.py
@@ -291,12 +291,11 @@ class F1ScoreTest(test.TestCase):
 
     labels = labels.astype(np.float32)
     predictions = predictions.astype(np.float32)
-    tf_predictions, tf_labels = (dataset_ops.Dataset
-                                 .from_tensor_slices((predictions, labels))
-                                 .repeat()
-                                 .batch(batch_size)
-                                 .make_one_shot_iterator()
-                                 .get_next())
+    tf_predictions, tf_labels = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset
+        .from_tensor_slices((predictions, labels))
+        .repeat()
+        .batch(batch_size)).get_next()
     f1, f1_op = classification.f1_score(tf_labels, tf_predictions,
                                         num_thresholds=3)
 
diff --git a/tensorflow/contrib/metrics/python/ops/metric_ops.py b/tensorflow/contrib/metrics/python/ops/metric_ops.py
index d6932f6e4b603b1a76250ab622f5fe8eaea81bc9..7b432f8bd20989c6d95310bcaca88d44ce3e0d1f 100644
--- a/tensorflow/contrib/metrics/python/ops/metric_ops.py
+++ b/tensorflow/contrib/metrics/python/ops/metric_ops.py
@@ -24,7 +24,6 @@ from __future__ import print_function
 
 import collections as collections_lib
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -46,32 +45,6 @@ from tensorflow.python.util.deprecation import deprecated
 _EPSILON = 1e-7
 
 
-def _safe_div(numerator, denominator):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator))
-
-
 @deprecated(None, 'Please switch to tf.metrics.true_positives. Note that the '
             'order of the labels and predictions arguments has been switched.')
 def streaming_true_positives(predictions,
@@ -3247,24 +3220,20 @@ def streaming_covariance(predictions,
 
     # We update the means by Delta=Error*BatchCount/(BatchCount+PrevCount)
     # batch_mean_prediction is E[x_B] in the update equation
-    batch_mean_prediction = _safe_div(
-        math_ops.reduce_sum(weighted_predictions),
-        batch_count)
-    delta_mean_prediction = _safe_div(
-        (batch_mean_prediction - mean_prediction) * batch_count,
-        update_count)
+    batch_mean_prediction = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_predictions), batch_count)
+    delta_mean_prediction = math_ops.div_no_nan(
+        (batch_mean_prediction - mean_prediction) * batch_count, update_count)
     update_mean_prediction = state_ops.assign_add(mean_prediction,
                                                   delta_mean_prediction)
     # prev_mean_prediction is E[x_A] in the update equation
     prev_mean_prediction = update_mean_prediction - delta_mean_prediction
 
     # batch_mean_label is E[y_B] in the update equation
-    batch_mean_label = _safe_div(
-        math_ops.reduce_sum(weighted_labels),
-        batch_count)
-    delta_mean_label = _safe_div(
-        (batch_mean_label - mean_label) * batch_count,
-        update_count)
+    batch_mean_label = math_ops.div_no_nan(
+        math_ops.reduce_sum(weighted_labels), batch_count)
+    delta_mean_label = math_ops.div_no_nan(
+        (batch_mean_label - mean_label) * batch_count, update_count)
     update_mean_label = state_ops.assign_add(mean_label, delta_mean_label)
     # prev_mean_label is E[y_A] in the update equation
     prev_mean_label = update_mean_label - delta_mean_label
@@ -3447,7 +3416,7 @@ def streaming_mean_cosine_distance(predictions,
   predictions.get_shape().assert_is_compatible_with(labels.get_shape())
   radial_diffs = math_ops.multiply(predictions, labels)
   radial_diffs = math_ops.reduce_sum(
-      radial_diffs, reduction_indices=[
+      radial_diffs, axis=[
           dim,
       ], keepdims=True)
   mean_distance, update_op = streaming_mean(radial_diffs, weights, None, None,
@@ -3926,9 +3895,8 @@ def cohen_kappa(labels,
       po_sum = math_ops.reduce_sum(po)
       total = math_ops.reduce_sum(pe_row)
       pe_sum = math_ops.reduce_sum(
-          _safe_div(
-              math_ops.to_double(pe_row * pe_col),
-              math_ops.to_double(total)))
+          math_ops.div_no_nan(
+              math_ops.to_double(pe_row * pe_col), math_ops.to_double(total)))
       po_sum, pe_sum, total = (math_ops.to_double(po_sum),
                                math_ops.to_double(pe_sum),
                                math_ops.to_double(total))
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
index 1b0383d24c0c472b4875d15c3650e37dfd2439e1..c922d0cd11fda3c51a51ceccf69798df7ce75f26 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_manager_test.py
@@ -29,7 +29,7 @@ from tensorflow.python.platform import test
 
 def _GetExampleIter(inputs):
   dataset = dataset_ops.Dataset.from_tensor_slices(inputs)
-  return dataset.make_one_shot_iterator()
+  return dataset_ops.make_one_shot_iterator(dataset)
 
 
 class FixedLossScaleManagerTest(test.TestCase):
diff --git a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
index 9009df0eefec13146090ba5fc2096e71ba6eb89d..33f9a43e803ea845a25bba284e41e5a0e6228dad 100644
--- a/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
+++ b/tensorflow/contrib/mixed_precision/python/loss_scale_optimizer_test.py
@@ -132,7 +132,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     opt = gd.GradientDescentOptimizer(lr)
@@ -182,7 +182,7 @@ class LossScaleOptimizerTest(test.TestCase):
 
     x = variable_scope.get_variable("x", initializer=1., dtype=dtypes.float32)
     dataset = dataset_ops.Dataset.from_tensor_slices([np.nan, np.inf, 0.1])
-    itr = dataset.make_one_shot_iterator()
+    itr = dataset_ops.make_one_shot_iterator(dataset)
 
     lr = 1
     init_loss_scale = 8
diff --git a/tensorflow/contrib/model_pruning/python/layers/core_layers.py b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
index f0ce6fe03966c2de2dfd8ebcca07bf46afcf4fce..1fa5c8cb485704a5fccc486e823bbc4050bf505a 100644
--- a/tensorflow/contrib/model_pruning/python/layers/core_layers.py
+++ b/tensorflow/contrib/model_pruning/python/layers/core_layers.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base
 from tensorflow.python.layers import utils
 from tensorflow.python.ops import array_ops
@@ -119,7 +120,7 @@ class _MaskedConv(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(ndim=self.rank + 2)
+    self.input_spec = input_spec.InputSpec(ndim=self.rank + 2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -171,7 +172,7 @@ class _MaskedConv(base.Layer):
           dtype=self.dtype)
     else:
       self.bias = None
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         ndim=self.rank + 2, axes={channel_axis: input_dim})
     self.built = True
 
@@ -393,14 +394,14 @@ class MaskedFullyConnected(base.Layer):
     self.bias_initializer = bias_initializer
     self.kernel_regularizer = kernel_regularizer
     self.bias_regularizer = bias_regularizer
-    self.input_spec = base.InputSpec(min_ndim=2)
+    self.input_spec = input_spec.InputSpec(min_ndim=2)
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if tensor_shape.dimension_value(input_shape[-1]) is None:
       raise ValueError('The last dimension of the inputs to `Dense` '
                        'should be defined. Found `None`.')
-    self.input_spec = base.InputSpec(
+    self.input_spec = input_spec.InputSpec(
         min_ndim=2, axes={-1: tensor_shape.dimension_value(input_shape[-1])})
 
     self.kernel = self.add_variable(
diff --git a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
index 6c203e5519e6a66d20e2509eca3c74eb66bf32c7..fa1a7aaff0aa59a6a64b1f0bf836a273926d785d 100644
--- a/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/elastic_average_optimizer.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
 from tensorflow.python.training import session_run_hook
+from tensorflow.python.training.saving import saveable_object_util
 
 LOCAL_VARIABLE_NAME = 'local_center_variable'
 GLOBAL_VARIABLE_NAME = 'global_center_variable'
@@ -424,7 +425,7 @@ class ElasticAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.trainable_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     swapped_var_list = {}
     for key, var in var_list.items():
@@ -464,4 +465,4 @@ class _ElasticAverageOptimizerHook(session_run_hook.SessionRunHook):
 
   def after_create_session(self, session, coord):
     """Run initialization ops"""
-    session.run(self._variable_init_op)
\ No newline at end of file
+    session.run(self._variable_init_op)
diff --git a/tensorflow/contrib/opt/python/training/lars_optimizer.py b/tensorflow/contrib/opt/python/training/lars_optimizer.py
index a8dafd9a4cb9c669400f74b545b3c165bd49b2a2..bc18177b6d0b1d3f4fc58236bbc3d445fb73d80d 100644
--- a/tensorflow/contrib/opt/python/training/lars_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/lars_optimizer.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -162,3 +163,14 @@ class LARSOptimizer(optimizer.Optimizer):
         math_ops.cast(self._momentum_tensor, grad.dtype),
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
+
+  def _prepare(self):
+    learning_rate = self._learning_rate
+    if callable(learning_rate):
+      learning_rate = learning_rate()
+    self._learning_rate_tensor = ops.convert_to_tensor(
+        learning_rate, name="learning_rate")
+    momentum = self._momentum
+    if callable(momentum):
+      momentum = momentum()
+    self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum")
diff --git a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
index b7fd2d2fb9db3eed15eb1cc2934199939790b1c0..bf3e5c51f78cc3ca3c7c77009c9cf428c4988953 100644
--- a/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/moving_average_optimizer.py
@@ -26,6 +26,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.training import moving_averages
 from tensorflow.python.training import optimizer
 from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 
 
 class MovingAverageOptimizer(optimizer.Optimizer):
@@ -165,7 +166,7 @@ class MovingAverageOptimizer(optimizer.Optimizer):
     if var_list is None:
       var_list = variables.global_variables()
     if not isinstance(var_list, dict):
-      var_list = saver.BaseSaverBuilder.OpListToDict(var_list)
+      var_list = saveable_object_util.op_list_to_dict(var_list)
 
     v_name_to_tensor = {}
     for k, tensor_or_list in six.iteritems(var_list):
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer.py b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
index 155ff5b3f4f29d4d9c81bb265d19d1b8cce4fef2..960826407b66b4efa3c2693efb6d2e17c4b47b33 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -83,14 +84,14 @@ class NadamOptimizer(adam.AdamOptimizer):
     with ops.control_dependencies([m_t]):
       m_t = scatter_add(m, indices, m_scaled_g_values)
       # m_bar = (1 - beta1) * g_t + beta1 * m_t
-      m_bar = m_scaled_g_values + beta1_t * m_t
+      m_bar = m_scaled_g_values + beta1_t * array_ops.gather(m_t, indices)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, "v")
     v_scaled_g_values = (grad * grad) * (1 - beta2_t)
     v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
     with ops.control_dependencies([v_t]):
       v_t = scatter_add(v, indices, v_scaled_g_values)
-    v_sqrt = math_ops.sqrt(v_t)
-    var_update = state_ops.assign_sub(
-        var, lr * m_bar / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = scatter_add(var, indices, -lr * m_bar / (v_sqrt + epsilon_t))
     return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
index 85e05ce71cec6ef897cadb7d123e630febb3c064..a4372f64874e7591dbceac901fad6c941209bef9 100644
--- a/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
+++ b/tensorflow/contrib/opt/python/training/nadam_optimizer_test.py
@@ -52,14 +52,19 @@ def nadam_update_numpy(param,
 class NadamOptimizerTest(test.TestCase):
 
   def doTestSparse(self, use_resource=False):
+    # need to use a larger value of epsilon here so that
+    # np.sqrt(v_t) + epsilon doesn't get rounded to 0 when
+    # the dtype is half and np.sqrt(v_t) = 0, as is the case
+    # when the gradient is 0
+    sparse_epsilon = 1e-7
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
         # Initialize variables for numpy implementation.
         m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
-        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
-        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
-        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
-        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
 
         if use_resource:
           var0 = resource_variable_ops.ResourceVariable(var0_np)
@@ -67,21 +72,21 @@ class NadamOptimizerTest(test.TestCase):
         else:
           var0 = variables.Variable(var0_np)
           var1 = variables.Variable(var1_np)
-        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
         grads0 = ops.IndexedSlices(
-            constant_op.constant(grads0_np),
-            constant_op.constant(grads0_np_indices), constant_op.constant([2]))
-        grads1_np_indices = np.array([0, 1], dtype=np.int32)
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
         grads1 = ops.IndexedSlices(
-            constant_op.constant(grads1_np),
-            constant_op.constant(grads1_np_indices), constant_op.constant([2]))
-        opt = nadam_optimizer.NadamOptimizer()
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam_optimizer.NadamOptimizer(epsilon=sparse_epsilon)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
@@ -91,8 +96,10 @@ class NadamOptimizerTest(test.TestCase):
           self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
           update.run()
 
-          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
-          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0,
+                                               epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1,
+                                               epsilon=sparse_epsilon)
 
           # Validate updated params
           self.assertAllCloseAccordingToType(var0_np, var0.eval())
diff --git a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
index 200b0d200826a6212a236680327f4daf7d07831f..8b8065c678e11e8fc237e71cf1d392ced5c22ada 100644
--- a/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
+++ b/tensorflow/contrib/opt/python/training/weight_decay_optimizers.py
@@ -59,6 +59,23 @@ class DecoupledWeightDecayExtension(object):
   Note that this extension decays weights BEFORE applying the update based
   on the gradient, i.e. this extension only has the desired behaviour for
   optimizers which do not depend on the value of'var' in the update step!
+  
+  Note: when applying a decay to the learning rate, be sure to manually apply
+  the decay to the `weight_decay` as well. For example:
+
+  ```python
+    schedule = tf.train.piecewise_constant(tf.train.get_global_step(), 
+                                           [10000, 15000], [1e-0, 1e-1, 1e-2])
+    lr = 1e-1 * schedule()
+    wd = lambda: 1e-4 * schedule()
+
+    # ...
+
+    optimizer = tf.contrib.opt.MomentumWOptimizer(learning_rate=lr,
+                                                  weight_decay=wd,
+                                                  momentum=0.9,
+                                                  use_nesterov=True)
+  ```
   """
 
   def __init__(self, weight_decay, **kwargs):
diff --git a/tensorflow/contrib/optimizer_v2/BUILD b/tensorflow/contrib/optimizer_v2/BUILD
index 3ba3ee29ec79687df522eb330665a2ce80061682..6e401406308604970677003aeea0f15c64cc74b6 100644
--- a/tensorflow/contrib/optimizer_v2/BUILD
+++ b/tensorflow/contrib/optimizer_v2/BUILD
@@ -48,7 +48,6 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:control_flow_ops",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:framework",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
@@ -56,6 +55,8 @@ py_library(
         "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:distribute_lib",
+        "//tensorflow/python/distribute:reduce_util",
     ],
 )
 
diff --git a/tensorflow/contrib/optimizer_v2/optimizer_v2.py b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
index 467dd86d8fd247a42be2dc47d5bf9872e14da89e..7fb23abc38d9dc101204ed83808aebe5a8ef1e78 100644
--- a/tensorflow/contrib/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/contrib/optimizer_v2/optimizer_v2.py
@@ -24,6 +24,9 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -34,8 +37,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context as distribute_ctx
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -446,7 +447,7 @@ class _OptimizerV2State(object):
     if v is None:
       if colocate_with is None:
         colocate_with = self._non_slot_devices
-      with self._distribution.colocate_vars_with(colocate_with):
+      with self._distribution.extended.colocate_vars_with(colocate_with):
         # TODO(josh11b): Use get_variable() except for the legacy Adam use case.
         v = variable_scope.variable(initial_value, name=name, trainable=False)
       self._non_slot_dict[name] = v
@@ -657,7 +658,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
                var_list=None,
                gate_gradients=GATE_OP,
                aggregation_method=None,
-               colocate_gradients_with_ops=False,
                name=None,
                grad_loss=None,
                stop_gradients=None,
@@ -680,8 +680,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
         `GATE_NONE`, `GATE_OP`, or  `GATE_GRAPH`.
       aggregation_method: Specifies the method used to combine gradient terms.
         Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
       name: Optional name for the returned operation.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
@@ -704,8 +702,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Minimization (and gradient computation) is done with respect to the
     elements of `var_list` if not None, else with respect to any trainable
     variables created during the execution of the `loss` function.
-    `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
-    `grad_loss` are ignored when eager execution is enabled.
+    `gate_gradients`, `aggregation_method`, and `grad_loss` are ignored when
+    eager execution is enabled.
     @end_compatibility
     """
     grads_and_vars = self.compute_gradients(
@@ -713,7 +711,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
         var_list=var_list,
         gate_gradients=gate_gradients,
         aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
         grad_loss=grad_loss,
         stop_gradients=stop_gradients,
         scale_loss_by_num_replicas=scale_loss_by_num_replicas)
@@ -733,7 +730,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
                         var_list=None,
                         gate_gradients=GATE_OP,
                         aggregation_method=None,
-                        colocate_gradients_with_ops=False,
                         grad_loss=None,
                         stop_gradients=None,
                         scale_loss_by_num_replicas=None):
@@ -756,8 +752,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
         `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
       aggregation_method: Specifies the method used to combine gradient terms.
         Valid values are defined in the class `AggregationMethod`.
-      colocate_gradients_with_ops: If True, try colocating gradients with the
-        corresponding op.
       grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.
       stop_gradients: Optional. A Tensor or list of tensors not to differentiate
         through.
@@ -776,8 +770,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
         not callable.
 
     @compatibility(eager)
-    When eager execution is enabled, `gate_gradients`, `aggregation_method`,
-    and `colocate_gradients_with_ops` are ignored.
+    When eager execution is enabled, `gate_gradients`, and `aggregation_method`
+    are ignored.
     @end_compatibility
     """
     # TODO(josh11b): Test that we handle weight decay in a reasonable way.
@@ -832,7 +826,6 @@ class OptimizerV2(optimizer_v1.Optimizer):
         grad_ys=grad_loss,
         gate_gradients=(gate_gradients == optimizer_v1.Optimizer.GATE_OP),
         aggregation_method=aggregation_method,
-        colocate_gradients_with_ops=colocate_gradients_with_ops,
         stop_gradients=stop_gradients)
     if gate_gradients == optimizer_v1.Optimizer.GATE_GRAPH:
       grads = control_flow_ops.tuple(grads)
@@ -848,8 +841,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """Scale loss for the number of replicas."""
     if scale_loss_by_num_replicas is None:
       scale_loss_by_num_replicas = (
-          distribute_lib.get_loss_reduction() == variable_scope
-          .VariableAggregation.MEAN)
+          distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN)
     if scale_loss_by_num_replicas:
       num_replicas = \
         distribute_ctx.get_distribution_strategy().num_replicas_in_sync
@@ -892,7 +884,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
       raise ValueError("No gradients provided for any variable: %s." %
                        ([str(v) for _, v in grads_and_vars],))
     return distribute_ctx.get_replica_context().merge_call(
-        self._distributed_apply, filtered, global_step=global_step, name=name)
+        self._distributed_apply, args=(filtered,),
+        kwargs={"global_step": global_step, "name": name})
 
   def _get_or_create_state(self, var_list=None):
     """Either looks up or creates `_OptimizerV2State`.
@@ -927,8 +920,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   def _distributed_apply(self, distribution, grads_and_vars, global_step, name):
     """`apply_gradients` for use with a `DistributionStrategy`."""
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
 
@@ -944,7 +937,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     with ops.name_scope(name, self._name) as name:
       per_graph_state = self._get_or_create_state(var_list=unwrapped_var_list)
       # Include the current value of any dynamic hyper parameters in `state`.
-      non_slot_devices = distribution.non_slot_devices(var_list)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
       state = per_graph_state._copy_with_dynamic_hyper(  # pylint: disable=protected-access
           self._hyper, distribution, non_slot_devices)
 
@@ -989,7 +982,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
       # Use the processors to update the variables.
       update_ops = []
       for grad, var in grads_and_vars:
-        update_ops.extend(distribution.update(var, update, grad, grouped=False))
+        update_ops.extend(distribution.extended.update(
+            var, update, args=(grad,), group=False))
 
       # Give the child class a chance to do something after applying
       # gradients
@@ -1001,12 +995,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
       update_ops = control_flow_ops.group(update_ops)
       with ops.control_dependencies([update_ops]):
-        finish_updates = distribution.update_non_slot(
-            non_slot_devices, finish, grouped=False)
-      # We said grouped=False, which means finish_updates is always a list.
-      # It will be [None] when finish() returns None.
-      if finish_updates == [None]:
-        finish_updates = [update_ops]
+        finish_updates = distribution.extended.update_non_slot(
+            non_slot_devices, finish, group=False)
+      # We said group=False, which means finish_updates is always a tuple.
+      # It will be (None,) when finish() returns None.
+      if finish_updates == (None,):
+        finish_updates = (update_ops,)
 
       # Update `global_step` (if any).
       if global_step is None:
@@ -1017,8 +1011,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
           def update_global_step(global_step, name):
             return global_step.assign_add(1, read_value=False, name=name)
 
-          apply_updates = distribution.update(global_step, update_global_step,
-                                              name)
+          apply_updates = distribution.extended.update(
+              global_step, update_global_step, args=(name,))
 
       # Add the training op to the TRAIN_OP graph collection in graph mode.
       if not eager_execution:
diff --git a/tensorflow/contrib/predictor/BUILD b/tensorflow/contrib/predictor/BUILD
index d50b52b8ff1ce8188ab52c6968d716378efd9daa..53a3bc63e1d770b451846c45370fdee9ffa72d70 100644
--- a/tensorflow/contrib/predictor/BUILD
+++ b/tensorflow/contrib/predictor/BUILD
@@ -42,6 +42,7 @@ py_library(
     name = "saved_model_predictor",
     srcs = ["saved_model_predictor.py"],
     srcs_version = "PY2AND3",
+    visibility = ["//learning/brain/contrib/learn/tpu:__subpackages__"],
     deps = [
         ":base_predictor",
         "//tensorflow/contrib/saved_model:saved_model_py",
diff --git a/tensorflow/contrib/quantize/README.md b/tensorflow/contrib/quantize/README.md
index a1f2b5902663e96bca8e13998869f4a0e9ae584b..9085d9fa719520ac84ef6f8e07d7fa335bef5605 100644
--- a/tensorflow/contrib/quantize/README.md
+++ b/tensorflow/contrib/quantize/README.md
@@ -28,7 +28,7 @@ Since it's difficult to add these fake quantization operations to all the
 required locations in the model, there's a function available that rewrites the
 training graph. To create a fake quantized training graph:
 
-```
+```python
 # Build forward pass of model.
 loss = tf.losses.get_total_loss()
 
@@ -51,7 +51,7 @@ The rewritten *eval graph* is non-trivially different from the *training graph*
 since the quantization ops affect the batch normalization step. Because of this,
 we've added a separate rewrite for the *eval graph*:
 
-```
+```python
 # Build eval model
 logits = tf.nn.softmax_cross_entropy_with_logits_v2(...)
 
diff --git a/tensorflow/contrib/quantize/python/quant_ops.py b/tensorflow/contrib/quantize/python/quant_ops.py
index 6f659347fba019288361dd0420f2ade6dc1bebaf..8619708cdaecd78bcc7de0e8e0cbf2baa11bf6a2 100644
--- a/tensorflow/contrib/quantize/python/quant_ops.py
+++ b/tensorflow/contrib/quantize/python/quant_ops.py
@@ -138,7 +138,7 @@ def LastValueQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_min = math_ops.reduce_min(
-            inputs, reduction_indices=reduce_dims, name='BatchMin')
+            inputs, axis=reduce_dims, name='BatchMin')
       else:
         batch_min = inputs
     else:
@@ -147,7 +147,7 @@ def LastValueQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_max = math_ops.reduce_max(
-            inputs, reduction_indices=reduce_dims, name='BatchMax')
+            inputs, axis=reduce_dims, name='BatchMax')
       else:
         batch_max = inputs
     else:
@@ -263,7 +263,7 @@ def MovingAvgQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_min = math_ops.reduce_min(
-            inputs, reduction_indices=reduce_dims, name='BatchMin')
+            inputs, axis=reduce_dims, name='BatchMin')
       else:
         batch_min = inputs
     else:
@@ -272,7 +272,7 @@ def MovingAvgQuantize(inputs,
     if per_channel:
       if input_dim >= 2:
         batch_max = math_ops.reduce_max(
-            inputs, reduction_indices=reduce_dims, name='BatchMax')
+            inputs, axis=reduce_dims, name='BatchMax')
       else:
         batch_max = inputs
     else:
diff --git a/tensorflow/contrib/quantize/python/quantize.py b/tensorflow/contrib/quantize/python/quantize.py
index 338923f75125ed3d1a2b1046a65d563bc8f7d3e3..21d1b1213090273b5abd8e012f8711db98c94347 100644
--- a/tensorflow/contrib/quantize/python/quantize.py
+++ b/tensorflow/contrib/quantize/python/quantize.py
@@ -160,7 +160,7 @@ def Quantize(graph,
       # shouldn't quantize it, since the activation will be Fused into the
       # Add at inference time.
       consumers = input_to_ops_map.ConsumerOperations(layer_match.bypass_op)
-      if any([consumer.type in _ACTIVATION_TYPES for consumer in consumers]):
+      if any(consumer.type in _ACTIVATION_TYPES for consumer in consumers):
         logging.info('Skipping %s, because its followed by an activation.',
                      layer_match.bypass_op.name)
       else:
@@ -195,7 +195,7 @@ def Quantize(graph,
       # Add at inference time.
       consumers = input_to_ops_map.ConsumerOperations(
           layer_match.post_activation_bypass_op)
-      if any([consumer.type in _RELU_TYPES for consumer in consumers]):
+      if any(consumer.type in _RELU_TYPES for consumer in consumers):
         logging.info('Skipping %s, because its followed by an activation.',
                      layer_match.post_activation_bypass_op.name)
       else:
diff --git a/tensorflow/contrib/rate/BUILD b/tensorflow/contrib/rate/BUILD
index c461a7145e27c4238161cec989448be807acd543..76db9aecf615d0a94f65cd7ea799db245828db1c 100644
--- a/tensorflow/contrib/rate/BUILD
+++ b/tensorflow/contrib/rate/BUILD
@@ -34,6 +34,11 @@ py_test(
     name = "rate_test",
     size = "small",
     srcs = ["rate_test.py"],
+    tags = [
+        "manual",  # TODO(b/120555555)
+        "no_oss",  # TODO(b/120555555)
+        "notap",  # TODO(b/120555555)
+    ],
     deps = [
         ":rate",
         "//tensorflow/python:array_ops",
diff --git a/tensorflow/contrib/resampler/BUILD b/tensorflow/contrib/resampler/BUILD
index 38fcca03116721f3dabfa6d1e7122c369b6b405d..bbf109967595a73a0fc4bacaf34859b30c2376fc 100644
--- a/tensorflow/contrib/resampler/BUILD
+++ b/tensorflow/contrib/resampler/BUILD
@@ -13,6 +13,7 @@ load(
 )
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "tf_custom_op_py_library")
+load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_test")
 
 tf_custom_op_py_library(
     name = "resampler_py",
@@ -50,10 +51,14 @@ tf_kernel_library(
     prefix = "resampler_ops",
     deps = [
         ":resampler_ops_op_lib",
-        "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-    ],
+    ] + select({
+        "//tensorflow:with_xla_support": [
+            "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
+        ],
+        "//conditions:default": [],
+    }),
     alwayslink = 1,
 )
 
@@ -94,3 +99,26 @@ cuda_py_test(
         "//tensorflow/python:array_ops",
     ],
 )
+
+tf_xla_py_test(
+    name = "resampler_ops_xla_test",
+    size = "small",
+    srcs = ["xla/resampler_ops_xla_test.py"],
+    disabled_backends = [
+        # TODO(b/74459949) Support BatchDot in CPU backend.
+        "cpu",
+        "cpu_ondemand",
+    ],
+    # TODO(b/112295522): the OSS build will not likely work in the short to medium term, currently it is blocked by the fact that bazel does not allow py_library to depend on cc_library: https://github.com/bazelbuild/bazel/issues/701 which may not be resolvable.
+    tags = ["no_oss"],
+    deps = [
+        "//tensorflow/compiler/tests:xla_test",
+        "//tensorflow/compiler/tf2xla/kernels:resampler_ops",
+        "//tensorflow/contrib/resampler:resampler_ops",
+        "//tensorflow/contrib/resampler:resampler_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
+    ],
+)
diff --git a/tensorflow/compiler/tests/resampler_ops_test.py b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
similarity index 76%
rename from tensorflow/compiler/tests/resampler_ops_test.py
rename to tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
index f87ac3360c905d7956ab3716c47d42765949774d..d8ca0eab276b39f025d018edebb78eed7a8433bb 100644
--- a/tensorflow/compiler/tests/resampler_ops_test.py
+++ b/tensorflow/contrib/resampler/xla/resampler_ops_xla_test.py
@@ -63,8 +63,8 @@ class ResamplerOpsTest(xla_test.XLATestCase):
   def testSimple(self):
     for dtype in self.float_types:
       input_shape = [1, 2, 2, 1]
-      input_rgb_data = [0, 5, 13, 54]
-      input_np = np.array(input_rgb_data, dtype=dtype).reshape(input_shape)
+      input_data = [0, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
 
       warp_shape = [1, 2]
       warp_data = [0.7, 0.6]
@@ -151,6 +151,55 @@ class ResamplerOpsTest(xla_test.XLATestCase):
                                             expected_grad_data,
                                             expected_grad_warp)
 
+  def testOutOfBoundWarps(self):
+    # (x, y) are both less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, -1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is less than 0.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-1, 0.1, 0.7, 0.6]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [27.62]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # Both of (x, y) are greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [-0.1, 0.1, 1.2, 2.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
+    # One of (x, y) is greater than image size.
+    for dtype in self.float_types:
+      input_shape = [1, 2, 2, 1]
+      input_data = [10, 5, 13, 54]
+      input_np = np.array(input_data, dtype=dtype).reshape(input_shape)
+
+      warp_shape = [1, 2, 2]
+      warp_data = [0.1, -0.1, 1.2, 0.1]
+      warp_np = np.array(warp_data, dtype=dtype).reshape(warp_shape)
+      expected = [[[0.0], [0.0]]]
+      self._assertForwardOpMatchesExpected(input_np, warp_np, expected)
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
index 245fa68eaef43ca8bc18c6087460d946228b0c85..7d57b0413a3bb51c35e670ce3fdb2cc818f44a58 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_cell_test.py
@@ -906,7 +906,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoOutput(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     res = self._testDropoutWrapper(
         input_keep_prob=keep_all,
         output_keep_prob=keep_none,
@@ -922,7 +922,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoStateExceptLSTMCellMemory(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     # Even though we dropout state, by default DropoutWrapper never
     # drops out the memory ("c") term of an LSTMStateTuple.
     res = self._testDropoutWrapper(
@@ -943,7 +943,7 @@ class DropoutWrapperTest(test.TestCase):
 
   def testDropoutWrapperKeepNoInput(self):
     keep_all = variable_scope.get_variable("all", initializer=1.0)
-    keep_none = variable_scope.get_variable("none", initializer=1e-10)
+    keep_none = variable_scope.get_variable("none", initializer=1e-6)
     true_full_output = np.array(
         [[[0.751109, 0.751109, 0.751109]], [[0.895509, 0.895509, 0.895509]]],
         dtype=np.float32)
diff --git a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
index 5cba54dd3df5bbb33380505bd5a073f069a3a590..ef372b947cedf71e9d44423f10cc43375b467cd9 100644
--- a/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
+++ b/tensorflow/contrib/rnn/python/kernel_tests/core_rnn_test.py
@@ -227,7 +227,7 @@ class RNNTest(test.TestCase):
   def testDropout(self):
     cell = Plus1RNNCell()
     full_dropout_cell = rnn_cell.DropoutWrapper(
-        cell, input_keep_prob=1e-12, seed=0)
+        cell, input_keep_prob=1e-6, seed=0)
     (name, dep), = full_dropout_cell._checkpoint_dependencies
     self.assertIs(dep, cell)
     self.assertEqual("cell", name)
diff --git a/tensorflow/contrib/rnn/python/ops/gru_ops.py b/tensorflow/contrib/rnn/python/ops/gru_ops.py
index b30ca7882fce1747cb1dcb27f97f5b012ff9da02..251a933eaec826b08266123245d9aef8573d3e06 100644
--- a/tensorflow/contrib/rnn/python/ops/gru_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/gru_ops.py
@@ -21,7 +21,7 @@ from tensorflow.contrib.rnn.ops import gen_gru_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -165,7 +165,7 @@ class GRUBlockCell(LayerRNNCell):
       num_units = cell_size
     self._cell_size = num_units
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
diff --git a/tensorflow/contrib/rnn/python/ops/lstm_ops.py b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
index 4db431f85a467389717e98d87875afce5e08b974..b043026bc556a8879b15b432829baf8136250c0e 100644
--- a/tensorflow/contrib/rnn/python/ops/lstm_ops.py
+++ b/tensorflow/contrib/rnn/python/ops/lstm_ops.py
@@ -25,6 +25,7 @@ from tensorflow.contrib.rnn.ops import gen_lstm_ops
 from tensorflow.contrib.util import loader
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -385,7 +386,7 @@ class LSTMBlockCell(LayerRNNCell):
         "scope": "lstm_cell"
     }
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -628,7 +629,7 @@ class LSTMBlockFusedCell(LSTMBlockWrapper):
     self._use_peephole = use_peephole
 
     # Inputs must be 3-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=3)
+    self.input_spec = input_spec.InputSpec(ndim=3)
 
   @property
   def num_units(self):
diff --git a/tensorflow/contrib/rnn/python/ops/rnn_cell.py b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
index e159dc95796e8f02287a4b6db4d25023348fe8da..8a1c09f171e6108174671e3122d5ff4c0b236003 100644
--- a/tensorflow/contrib/rnn/python/ops/rnn_cell.py
+++ b/tensorflow/contrib/rnn/python/ops/rnn_cell.py
@@ -30,7 +30,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
-from tensorflow.python.layers import base as base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gen_array_ops
@@ -2752,7 +2752,7 @@ class SRUCell(rnn_cell_impl.LayerRNNCell):
     self._activation = activation or math_ops.tanh
 
     # Restrict inputs to be 2-dimensional matrices
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
   @property
   def state_size(self):
@@ -3089,7 +3089,7 @@ class IndRNNCell(rnn_cell_impl.LayerRNNCell):
     super(IndRNNCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3183,7 +3183,7 @@ class IndyGRUCell(rnn_cell_impl.LayerRNNCell):
     super(IndyGRUCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._activation = activation or math_ops.tanh
@@ -3323,7 +3323,7 @@ class IndyLSTMCell(rnn_cell_impl.LayerRNNCell):
     super(IndyLSTMCell, self).__init__(_reuse=reuse, name=name, dtype=dtype)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -3444,7 +3444,7 @@ class MinimalRNNCell(rnn_cell_impl.LayerRNNCell):
     super(MinimalRNNCell, self).__init__(name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self.units = units
     self.activation = activations.get(activation)
@@ -3558,7 +3558,7 @@ class CFNCell(rnn_cell_impl.LayerRNNCell):
     super(CFNCell, self).__init__(name=name, dtype=dtype, **kwargs)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self.units = units
     self.activation = activations.get(activation)
diff --git a/tensorflow/contrib/saved_model/BUILD b/tensorflow/contrib/saved_model/BUILD
index f0947fe423f7e6bf84dae468bc36ca11147ac0bb..269443b2c6508bb618d30f64487b1a6a84e8646f 100644
--- a/tensorflow/contrib/saved_model/BUILD
+++ b/tensorflow/contrib/saved_model/BUILD
@@ -102,7 +102,10 @@ py_test(
     size = "medium",
     srcs = ["python/saved_model/keras_saved_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
+    tags = [
+        "no_oss",  # TODO(b/119349471): Re-enable
+        "no_windows",
+    ],
     deps = [
         ":keras_saved_model",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
index 27b5b6d22e0fc1156d6f7a1c852f4c5a6e06da02..ffba514bb96f5ce8d963cb0a0482738eafe88355 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model.py
@@ -25,7 +25,6 @@ from tensorflow.python.client import session
 from tensorflow.python.estimator import keras as estimator_keras_util
 from tensorflow.python.estimator import model_fn as model_fn_lib
 from tensorflow.python.estimator.export import export as export_helpers
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import models as models_lib
@@ -126,7 +125,7 @@ def save_keras_model(
   export_dir = export_helpers.get_timestamped_export_dir(saved_model_path)
   temp_export_dir = export_helpers.get_temp_export_dir(export_dir)
 
-  builder = saved_model_builder.SavedModelBuilder(temp_export_dir)
+  builder = saved_model_builder._SavedModelBuilder(temp_export_dir)
 
   # Manually save variables to export them in an object-based checkpoint. This
   # skips the `builder.add_meta_graph_and_variables()` step, which saves a
@@ -228,9 +227,10 @@ def _export_mode(
       g.add_to_collection(ops.GraphKeys.GLOBAL_STEP, clone.optimizer.iterations)
 
     # Extract update and train ops from train/test/predict functions.
+    train_op = None
     if mode == model_fn_lib.ModeKeys.TRAIN:
       clone._make_train_function()
-      builder._add_train_op(clone.train_function.updates_op)
+      train_op = clone.train_function.updates_op
     elif mode == model_fn_lib.ModeKeys.EVAL:
       clone._make_test_function()
     else:
@@ -265,7 +265,8 @@ def _export_mode(
         model_fn_lib.EXPORT_TAG_MAP[mode],
         signature_def_map=_create_signature_def_map(clone, mode),
         saver=saver_lib.Saver(clone_var_list),
-        main_op=variables.local_variables_initializer())
+        init_op=variables.local_variables_initializer(),
+        train_op=train_op)
     return None
 
 
@@ -307,31 +308,11 @@ def _create_signature_def_map(model, mode):
       serving_only=(mode == model_fn_lib.ModeKeys.PREDICT))
 
 
-def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):
+def _assert_same_non_optimizer_objects(model, model_graph, clone, clone_graph):  # pylint: disable=unused-argument
   """Assert model and clone contain the same checkpointable objects."""
 
-  def get_non_optimizer_objects(m, g):
-    """Gather set of model and optimizer checkpointable objects."""
-    # Set default graph because optimizer.variables() returns optimizer
-    # variables defined in the default graph.
-    with g.as_default():
-      all_objects = set(checkpointable_utils.list_objects(m))
-      optimizer_and_variables = set()
-      for obj in all_objects:
-        if isinstance(obj, optimizers.TFOptimizer):
-          optimizer_and_variables.update(checkpointable_utils.list_objects(obj))
-          optimizer_and_variables.update(set(obj.optimizer.variables()))
-      return all_objects - optimizer_and_variables
-
-  model_objects = get_non_optimizer_objects(model, model_graph)
-  clone_objects = get_non_optimizer_objects(clone, clone_graph)
-
-  if len(model_objects) != len(clone_objects):
-    raise errors.InternalError(
-        None, None,
-        'Model and clone must use the same variables.'
-        '\n\tModel variables: %s\n\t Clone variables: %s'
-        % (model_objects, clone_objects))
+  # TODO(fchollet, kathywu): make sure this works in eager mode.
+  return True
 
 
 def load_keras_model(saved_model_path):
diff --git a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
index a65b2ce466111c33d0092b7018537573708de2d0..93d73e1b484ed810fb347b13e95022dfca3584c2 100644
--- a/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
+++ b/tensorflow/contrib/saved_model/python/saved_model/keras_saved_model_test.py
@@ -29,14 +29,12 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
 from tensorflow.python.estimator import model_fn as model_fn_lib
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
-from tensorflow.python.saved_model import constants
 from tensorflow.python.saved_model import loader_impl
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.training import training as training_module
@@ -255,7 +253,7 @@ def load_model(sess, path, mode):
   outputs = {
       k: sess.graph.get_tensor_by_name(v.name)
       for k, v in meta_graph_def.signature_def[sig_def_key].outputs.items()}
-  return inputs, outputs
+  return inputs, outputs, meta_graph_def
 
 
 @test_util.run_all_in_graph_and_eager_modes
@@ -332,8 +330,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
     # Load predict graph, and test predictions
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs = load_model(sess, output_path,
-                                   model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
 
       predictions = sess.run(outputs[output_name],
                              {inputs[input_name]: input_arr})
@@ -342,19 +340,21 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
     if optimizer:
       # Load eval graph, and test predictions, loss and metric values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs = load_model(sess, output_path,
-                                     model_fn_lib.ModeKeys.EVAL)
+        inputs, outputs, _ = load_model(sess, output_path,
+                                        model_fn_lib.ModeKeys.EVAL)
 
         # First obtain the loss and predictions, and run the metric update op by
         # feeding in the inputs and targets.
         loss, predictions, _ = sess.run(
             (outputs['loss'], outputs['predictions/' + output_name],
-             outputs['metrics/mae/update_op']),
-            {inputs[input_name]: input_arr, inputs[target_name]: target_arr})
+             outputs['metrics/mean_absolute_error/update_op']), {
+                 inputs[input_name]: input_arr,
+                 inputs[target_name]: target_arr
+             })
 
         # The metric value should be run after the update op, to ensure that it
         # reflects the correct value.
-        metric_value = sess.run(outputs['metrics/mae/value'])
+        metric_value = sess.run(outputs['metrics/mean_absolute_error/value'])
 
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
@@ -364,17 +364,17 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
 
       # Load train graph, and check for the train op, and prediction values
       with session.Session(graph=ops.Graph()) as sess:
-        inputs, outputs = load_model(sess, output_path,
-                                     model_fn_lib.ModeKeys.TRAIN)
+        inputs, outputs, meta_graph_def = load_model(
+            sess, output_path, model_fn_lib.ModeKeys.TRAIN)
         self.assertEqual(int(train_before_export),
                          sess.run(training_module.get_global_step()))
         self.assertIn('loss', outputs)
-        self.assertIn('metrics/mae/update_op', outputs)
-        self.assertIn('metrics/mae/value', outputs)
+        self.assertIn('metrics/mean_absolute_error/update_op', outputs)
+        self.assertIn('metrics/mean_absolute_error/value', outputs)
         self.assertIn('predictions/' + output_name, outputs)
 
         # Train for a step
-        train_op = ops.get_collection(constants.TRAIN_OP_KEY)
+        train_op = loader_impl.get_train_op(meta_graph_def)
         train_outputs, _ = sess.run(
             [outputs, train_op], {inputs[input_name]: input_arr,
                                   inputs[target_name]: target_arr})
@@ -401,8 +401,8 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       output_path = keras_saved_model.save_keras_model(
           model, saved_model_path, custom_objects={'relu6': relu6})
     with session.Session(graph=ops.Graph()) as sess:
-      inputs, outputs = load_model(sess, output_path,
-                                   model_fn_lib.ModeKeys.PREDICT)
+      inputs, outputs, _ = load_model(sess, output_path,
+                                      model_fn_lib.ModeKeys.PREDICT)
       input_name = model.input_names[0]
       output_name = model.output_names[0]
       predictions = sess.run(
@@ -463,11 +463,6 @@ class TestModelSavedModelExport(test.TestCase, parameterized.TestCase):
       clone.compile(loss='mse', optimizer=keras.optimizers.RMSprop(lr=0.0001))
       clone.train_on_batch(input_arr, target_arr)
 
-    with self.assertRaisesRegexp(
-        errors.InternalError, 'Model and clone must use the same variables.'):
-      keras_saved_model._assert_same_non_optimizer_objects(
-          model, model_graph, clone, clone_graph)
-
   def testSaveSeqModelWithoutInputShapesRaisesError(self):
     """A Sequential model that hasn't been built should raise an error."""
     model = sequential_model_without_input_shape(True)
diff --git a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
index 8668c67cf95aba6cbd466142bed37c79e34d9e04..922f21b98b35dfff19c8c605a25e89c5d2da8d98 100644
--- a/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
+++ b/tensorflow/contrib/seq2seq/python/kernel_tests/attention_wrapper_test.py
@@ -154,8 +154,8 @@ class AttentionWrapperTest(test.TestCase):
 
     if attention_layer_sizes is not None:
       # Compute sum of attention_layer_sizes. Use encoder_output_depth if None.
-      attention_depth = sum([attention_layer_size or encoder_output_depth
-                             for attention_layer_size in attention_layer_sizes])
+      attention_depth = sum(attention_layer_size or encoder_output_depth
+                            for attention_layer_size in attention_layer_sizes)
     elif attention_layers is not None:
       # Compute sum of attention_layers output depth.
       attention_depth = sum(
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
index 8fcd7aeef6a6964902666a4f3c17e05b0c7b52ee..f31bdbd399c9de4f2f5d557b75b1ece6d64a765e 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/lanczos_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import lanczos
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -80,7 +81,8 @@ if __name__ == "__main__":
     for shape in [[4, 4], [7, 4], [5, 8]]:
       for orthogonalize in True, False:
         for steps in range(1, min(shape) + 1):
-          for use_static_shape in True, False:
+          # TF2 does not support placeholders so we skip it
+          for use_static_shape in set([True, tf2.enabled()]):
             arg_string = "%s_%s_%s_%s_staticshape_%s" % (
                 dtype.__name__, "_".join(map(str, shape)), orthogonalize, steps,
                 use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
index 2a9100903aae5689919a6b25fcb18ff192f250b3..841a41a2339824ab8ca15f4bdd74be697cd6fe9f 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/least_squares_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import least_squares
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -76,7 +77,8 @@ def _get_least_squares_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for shape in [[4, 4], [8, 5], [3, 7]]:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__,
                                                "_".join(map(str, shape)),
                                                use_static_shape)
diff --git a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
index a0e6eb87bc06fb1303a7eb86fa6760458f20a9b9..10807f7a80617e56abeb6d13ce419a49a2269aac 100644
--- a/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
+++ b/tensorflow/contrib/solvers/python/kernel_tests/linear_equations_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.contrib.solvers.python.ops import linear_equations
 from tensorflow.contrib.solvers.python.ops import util
 from tensorflow.python.framework import constant_op
@@ -113,7 +114,8 @@ def _get_linear_equations_tests(dtype_, use_static_shape_, shape_):
 if __name__ == "__main__":
   for dtype in np.float32, np.float64:
     for size in 1, 4, 10:
-      for use_static_shape in True, False:
+      # TF2 does not support placeholders under eager so we skip it
+      for use_static_shape in set([True, tf2.enabled()]):
         shape = [size, size]
         arg_string = "%s_%s_staticshape_%s" % (dtype.__name__, size,
                                                use_static_shape)
diff --git a/tensorflow/contrib/summary/summary_ops_test.py b/tensorflow/contrib/summary/summary_ops_test.py
index 4d1807130c57039976dfa57c27bb0d4807e75212..10e4556dacbc17ec02c2bd698389b04d517d7076 100644
--- a/tensorflow/contrib/summary/summary_ops_test.py
+++ b/tensorflow/contrib/summary/summary_ops_test.py
@@ -152,6 +152,27 @@ class EagerFileTest(test_util.TensorFlowTestCase):
       self.assertEqual(len(events), 2)
       self.assertEqual(events[1].summary.value[0].tag, 'scalar')
 
+  def testRecordEveryNGlobalSteps(self):
+    step = training_util.get_or_create_global_step()
+    logdir = tempfile.mkdtemp()
+
+    def run_step():
+      summary_ops.scalar('scalar', i, step=step)
+      step.assign_add(1)
+
+    with summary_ops.create_file_writer(
+        logdir).as_default(), summary_ops.record_summaries_every_n_global_steps(
+            2, step):
+      for i in range(10):
+        run_step()
+      # And another 10 steps as a graph function.
+      run_step_fn = function.defun(run_step)
+      for i in range(10):
+        run_step_fn()
+
+    events = summary_test_util.events_from_logdir(logdir)
+    self.assertEqual(len(events), 11)
+
   def testMaxQueue(self):
     logs = tempfile.mkdtemp()
     with summary_ops.create_file_writer(
@@ -279,12 +300,9 @@ class EagerDbTest(summary_test_util.SummaryDbTest):
 
   def testDbURIOpen(self):
     tmpdb_path = os.path.join(self.get_temp_dir(), 'tmpDbURITest.sqlite')
-    tmpdb_uri = six.moves.urllib_parse.urljoin("file:", tmpdb_path)
-    tmpdb_writer = summary_ops.create_db_writer(
-        tmpdb_uri,
-        "experimentA",
-        "run1",
-        "user1")
+    tmpdb_uri = six.moves.urllib_parse.urljoin('file:', tmpdb_path)
+    tmpdb_writer = summary_ops.create_db_writer(tmpdb_uri, 'experimentA',
+                                                'run1', 'user1')
     with summary_ops.always_record_summaries():
       with tmpdb_writer.as_default():
         summary_ops.scalar('t1', 2.0)
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
index 3f24f58f03aac2ba6d368d7eccf8731f611a81b4..22b6f09d0cd88068f7bedabe7687920420a3028f 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer.cc
@@ -73,7 +73,16 @@ class SummaryFileWriter : public SummaryWriterInterface {
     e->set_step(global_step);
     e->set_wall_time(GetWallTime());
     Summary::Value* v = e->mutable_summary()->add_value();
-    t.AsProtoTensorContent(v->mutable_tensor());
+
+    if (t.dtype() == DT_STRING) {
+      // Treat DT_STRING specially, so that tensor_util.MakeNdarray in Python
+      // can convert the TensorProto to string-type numpy array. MakeNdarray
+      // does not work with strings encoded by AsProtoTensorContent() in
+      // tensor_content.
+      t.AsProtoField(v->mutable_tensor());
+    } else {
+      t.AsProtoTensorContent(v->mutable_tensor());
+    }
     v->set_tag(tag);
     if (!serialized_metadata.empty()) {
       v->mutable_metadata()->ParseFromString(serialized_metadata);
diff --git a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
index cd3f712256f2293ed725745f8cbe48109856ef86..ffbfb9533e887e54b0f5bdfde11dadce21073a94 100644
--- a/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
+++ b/tensorflow/contrib/tensorboard/db/summary_file_writer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/contrib/tensorboard/db/summary_file_writer.h"
 
 #include "tensorflow/core/framework/summary.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/io/path.h"
@@ -104,6 +105,23 @@ TEST_F(SummaryFileWriterTest, WriteTensor) {
                                   CHECK_EQ(e.summary().value_size(), 1);
                                   EXPECT_EQ(e.summary().value(0).tag(), "name");
                                 }));
+  TF_CHECK_OK(SummaryTestHelper(
+      "string_tensor_test",
+      [](SummaryWriterInterface* writer) {
+        Tensor hello(DT_STRING, TensorShape({}));
+        hello.scalar<string>()() = "hello";
+        TF_RETURN_IF_ERROR(writer->WriteTensor(
+            2, hello, "name", SummaryMetadata().SerializeAsString()));
+        TF_RETURN_IF_ERROR(writer->Flush());
+        return Status::OK();
+      },
+      [](const Event& e) {
+        EXPECT_EQ(e.step(), 2);
+        CHECK_EQ(e.summary().value_size(), 1);
+        EXPECT_EQ(e.summary().value(0).tag(), "name");
+        EXPECT_EQ(e.summary().value(0).tensor().dtype(), DT_STRING);
+        EXPECT_EQ(e.summary().value(0).tensor().string_val()[0], "hello");
+      }));
 }
 
 TEST_F(SummaryFileWriterTest, WriteScalar) {
diff --git a/tensorflow/contrib/tensorrt/BUILD b/tensorflow/contrib/tensorrt/BUILD
index 20bcd2447e6fd7eaf11e3e5cf383f6abf168c787..784acce444a8d0c066f1b7ae6c1b5d7d65405549 100644
--- a/tensorflow/contrib/tensorrt/BUILD
+++ b/tensorflow/contrib/tensorrt/BUILD
@@ -29,6 +29,10 @@ load(
     "if_tensorrt",
 )
 
+exports_files(glob([
+    "test/testdata/*",
+]))
+
 tf_cuda_cc_test(
     name = "tensorrt_test_cc",
     size = "small",
@@ -491,6 +495,7 @@ cuda_py_tests(
         "test/memory_alignment_test.py",
         "test/multi_connection_neighbor_engine_test.py",
         "test/neighboring_engine_test.py",
+        "test/quantization_test.py",
         "test/rank_two_test.py",
         "test/reshape_transpose_test.py",
         "test/vgg_block_nchw_test.py",
@@ -527,6 +532,30 @@ cuda_py_tests(
     ],
 )
 
+cuda_py_test(
+    name = "quantization_mnist_test",
+    srcs = ["test/quantization_mnist_test.py"],
+    additional_deps = [
+        ":tf_trt_integration_test_base",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python/keras:keras",
+        "//tensorflow/python/estimator:estimator",
+    ],
+    data = [
+        "test/testdata/checkpoint",
+        "test/testdata/model.ckpt-46900.data-00000-of-00001",
+        "test/testdata/model.ckpt-46900.index",
+    ],
+    tags = [
+        "no_cuda_on_cpu_tap",
+        "no_pip",
+        "no_tap",  # It is not able to download the mnist data.
+        "no_windows",
+        "nomac",
+    ],
+)
+
 cc_library(
     name = "utils",
     srcs = ["convert/utils.cc"],
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.cc b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
index 26d54eb156ccc8593d82609195caabb5bb929262..ae211a93c3279ff1d6de2f9c9a4b849fc8cd578d 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.cc
@@ -82,60 +82,78 @@ std::vector<int> GetLoadedTensorRTVersion() {
 }
 
 TrtCandidateSelector::TrtCandidateSelector(
-    const grappler::GraphProperties& graph_properties)
-    : graph_properties_(graph_properties) {}
+    const grappler::GraphProperties& graph_properties, int precision_mode)
+    : graph_properties_(graph_properties), precision_mode_(precision_mode) {}
 
 Status TrtCandidateSelector::IsTensorRTCandidate(const tensorflow::Node* node) {
   // TODO(laigd): move this set to TrtNodeValidator where it should belong.
   // LINT.IfChange
   static const std::set<string> candidate_ops = {
-    "Identity",
-    "Snapshot",
-    "Const",
-    "Conv2D",
-    "MaxPool",
-    "BiasAdd",
-    "Relu",
-    "Add",
-    "Mul",
-    "Sub",
-    "Rsqrt",
-    "Pad",
-    "Mean",
-    "AvgPool",
-    "ConcatV2",
-    "DepthwiseConv2dNative",
-    "FusedBatchNorm",
-    "FusedBatchNormV2",
-    "Div",
-    "RealDiv",
-    "Rsqrt",
-    "Reciprocal",
-    "Exp",
-    "Log",
-    "Sqrt",
-    "Abs",
-    "Neg",
-    "Transpose",
-    "Reshape",
-    "MatMul",
-    "BatchMatMul",
-    "Softmax",
-    "Minimum",
-    "Maximum",
-    "TopKV2",
-    "Sum",
-    "Prod",
-    "Max",
-    "Min",
+      "Identity",
+      "Snapshot",
+      "Const",
+      "Conv2D",
+      "MaxPool",
+      "BiasAdd",
+      "Relu",
+      "Sigmoid",
+      "Tanh",
+      "Add",
+      "Mul",
+      "Sub",
+      "Rsqrt",
+      "Pad",
+      "Mean",
+      "AvgPool",
+      "ConcatV2",
+      "DepthwiseConv2dNative",
+      "FusedBatchNorm",
+      "FusedBatchNormV2",
+      "Div",
+      "RealDiv",
+      "Rsqrt",
+      "Reciprocal",
+      "Exp",
+      "Log",
+      "Sqrt",
+      "Abs",
+      "Neg",
+      "Transpose",
+      "Reshape",
+      "MatMul",
+      "BatchMatMul",
+      "Softmax",
+      "Minimum",
+      "Maximum",
+      "TopKV2",
+      "Sum",
+      "Prod",
+      "Max",
+      "Min",
+      "Relu6",
+      "Square",
+      "ExpandDims",
+      "Squeeze",
   };
-  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
-  const bool is_supported_op_type =
+  bool is_supported_op_type =
       (candidate_ops.count(node->type_string()) ||
        PluginFactoryTensorRT::GetInstance()->IsPlugin(node->type_string()));
+  static const std::set<string> quantize_ops = {
+      "QuantizeAndDequantizeV2",
+      "QuantizeAndDequantizeV3",
+      "FakeQuantWithMinMaxVars",
+      "FakeQuantWithMinMaxArgs",
+  };
+  // In INT8 mode, we will always apply the quantization ranges provided by
+  // these ops to the relevant tensors. This happens regardless of the value of
+  // use_calibration.
+  if (precision_mode_ == INT8MODE && quantize_ops.count(node->type_string())) {
+    is_supported_op_type = true;
+  }
+  // LINT.ThenChange(//tensorflow/contrib/tensorrt/convert/convert_nodes.cc)
   if (!is_supported_op_type) {
     return errors::Unimplemented("Op type ", node->type_string(),
-                                 " is not supported.");
+                                 " is not supported");
   }
 
   std::vector<const Edge*> input_edges;
@@ -170,7 +188,7 @@ tensorflow::Status BuildNodeMap(
 tensorflow::Status ConvertCalibGraphToInferGraph(
     const tensorflow::GraphDef& graph_def, tensorflow::GraphDef* infer_graph,
     bool is_dyn_op) {
-  VLOG(0) << "Starting Calib Conversion";
+  LOG(INFO) << "Starting Calib Conversion";
   infer_graph->CopyFrom(graph_def);
   auto trt_rm = TRTResourceManager::instance();
   auto calib_rm = trt_rm->getManager("TRTCalibration");
@@ -220,18 +238,19 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     const std::vector<string>& output_names, size_t max_batch_size,
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode, int minimum_segment_size, bool is_dyn_op,
-    int max_cached_engines, std::vector<int> cached_engine_batches) {
+    int max_cached_engines, std::vector<int> cached_engine_batches,
+    bool use_calibration) {
   // Create GrapplerItem.
   tensorflow::grappler::GrapplerItem item;
   item.fetch = output_names;
   item.graph = graph_def;
 
-  // TODO(aaroey): we should have used single machine cluster like the
-  // following, but the problem is then wrap_conversion will depend on
-  // direct_session and cause double linking problems. To fix this we need to
-  // fix or get rid of the swig dependency. Here we use VirtualCluster
-  // as a work around, and we need to create a session to initialize the
-  // underlying device before calling this method.
+// TODO(aaroey): we should have used single machine cluster like the
+// following, but the problem is then wrap_conversion will depend on
+// direct_session and cause double linking problems. To fix this we need to
+// fix or get rid of the swig dependency. Here we use VirtualCluster
+// as a work around, and we need to create a session to initialize the
+// underlying device before calling this method.
 #if 0
   // Create single machine cluster. Note that this will create a session and
   // initialize the gpu devices.
@@ -264,7 +283,9 @@ tensorflow::Status ConvertGraphDefToTensorRT(
 #endif
 
   // Create RewriterConfig.
-  tensorflow::RewriterConfig rw_cfg;
+  tensorflow::ConfigProto config_proto;
+  auto& rw_cfg =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   // TODO(aaroey): use only const folding and layout for the time being since
   // new optimizers break the graph for trt.
   rw_cfg.add_optimizers("constfold");
@@ -285,9 +306,10 @@ tensorflow::Status ConvertGraphDefToTensorRT(
       list->add_i(batch);
     }
   }
+  parameters["use_calibration"].set_b(use_calibration);
 
   // Run optimizer.
-  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, rw_cfg);
+  tensorflow::grappler::MetaOptimizer meta_opt(nullptr, config_proto);
   TF_RETURN_IF_ERROR(meta_opt.Optimize(cluster.get(), item, new_graph_def));
 
   if (VLOG_IS_ON(5)) {
@@ -433,7 +455,8 @@ tensorflow::Status GetEngineInfo(
                  << "but this shouldn't have happened";
     info->device = *segment_devices.begin();
   } else {
-    LOG(ERROR) << "Can't find a device placement for the op!";
+    VLOG(1) << "No device is assigned to the segment. "
+            << "A device will be assigned during graph execution (inference).";
   }
   return Status::OK();
 }
@@ -564,27 +587,38 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
       }
     }
   }
+  // We don't support segments with no inputs. Fall back to native TF here to
+  // avoid crash later. Constant folding should've folded the ops that make up
+  // these segments.
+  if (inputs.empty()) {
+    return tensorflow::errors::Internal(
+        "Segment has no inputs (possible "
+        "constfold failure)");
+  }
+
+  const bool calibrate_int8 =
+      (info.precision_mode == INT8MODE && info.use_calibration);
+  // Build the engine and get its serialized representation.
   string segment_string;
-  if (info.engine_type == EngineInfo::EngineType::TRTStatic ||
-      info.precision_mode == INT8MODE) {
+  if (info.engine_type == EngineInfo::EngineType::TRTStatic || calibrate_int8) {
     // Create static engine for fp32/fp16 mode, and test validity of the engine
-    // for int8 mode. We don't want engine to fail at the calibration time.
-    // So we are constructing a FP32 engine here to check its validity, and if
-    // it is a valid engine then we put the serialized graphdef to the op.
-    // Otherwise we skip node creation for this engine.
+    // for int8 calibration mode. We don't want engine to fail at the
+    // calibration time. So we are constructing a FP32 engine here to check its
+    // validity, and if it is a valid engine then we put the serialized graphdef
+    // to the op. Otherwise we skip node creation for this engine.
     Logger trt_logger;
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     // TODO(sami): What happens if 1st dim is not batch?
     TF_RETURN_IF_ERROR(ConvertGraphDefToEngine(
-        info.segment_graph_def,
-        info.precision_mode == INT8MODE ? FP32MODE : info.precision_mode,
+        info.segment_graph_def, calibrate_int8 ? FP32MODE : info.precision_mode,
         max_batch_size, info.max_workspace_size_bytes, input_shapes,
         &trt_logger, alloc, /*calibrator=*/nullptr, &engine,
+        info.use_calibration,
         /*convert_successfully=*/nullptr));
     TrtUniquePtrType<nvinfer1::IHostMemory> engine_data(engine->serialize());
     segment_string =
         string((const char*)engine_data->data(), engine_data->size());
-    if (info.precision_mode == INT8MODE) {
+    if (calibrate_int8) {
       // See above comment about why not putting this inside the 'else' branch.
       segment_string = info.segment_graph_def.SerializeAsString();
     }
@@ -596,7 +630,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
   // conversion.
   string prec_string;
   TF_RETURN_IF_ERROR(GetPrecisionModeName(info.precision_mode, &prec_string));
-  if (info.precision_mode == INT8MODE &&
+  if (info.precision_mode == INT8MODE && calibrate_int8 &&
       !TRTResourceManager::instance()->getManager("TRTCalibration")) {
     LOG(ERROR) << "Failed to construct calibration storage";
   }
@@ -632,6 +666,7 @@ tensorflow::Status CreateTRTNode(const std::vector<EngineInfo>& infos, int pos,
           .Attr("cached_engine_batches", {max_batch_size})
           .Attr("workspace_size_bytes", info.max_workspace_size_bytes)
           .Attr("precision_mode", prec_string)
+          .Attr("use_calibration", info.use_calibration)
           .Attr("OutT", out_types)
           .Finalize(&trt_node);
   if (!status.ok()) {
@@ -864,7 +899,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
   }
   segment_options.minimum_segment_size = params.minimum_segment_size;
   tensorflow::tensorrt::segment::SegmentNodesVector initial_segments;
-  TrtCandidateSelector candidate_selector(*params.graph_properties);
+  TrtCandidateSelector candidate_selector(*params.graph_properties,
+                                          params.precision_mode);
   TF_RETURN_IF_ERROR(tensorrt::segment::SegmentGraph(
       &graph,
       std::bind(&TrtCandidateSelector::IsTensorRTCandidate, &candidate_selector,
@@ -873,10 +909,8 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       // need to check the input edges.
       [](const Edge* edge) { return true; }, OutputEdgeValidator(),
       segment_options, &initial_segments));
-  if (initial_segments.size() > 1) {
-    VLOG(0) << "MULTIPLE tensorrt candidate conversion: "
+  LOG(INFO) << "Number of TensorRT candidate segments: "
             << initial_segments.size();
-  }
 
   // Get the EngineInfo for each segment.
   std::unordered_map<string, tensorflow::Node*> node_map;
@@ -902,13 +936,17 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       continue;
     }
     curr_engine.precision_mode = params.precision_mode;
-    curr_engine.engine_type =
-        (params.is_dyn_op || params.precision_mode == INT8MODE
-             ? EngineInfo::EngineType::TRTDynamic
-             : EngineInfo::EngineType::TRTStatic);
+    if (params.use_calibration && params.precision_mode != INT8MODE) {
+      return errors::InvalidArgument(
+          "Calibration with FP32 or FP16 is not supported.");
+    }
+    curr_engine.engine_type = ((params.is_dyn_op || params.use_calibration)
+                                   ? EngineInfo::EngineType::TRTDynamic
+                                   : EngineInfo::EngineType::TRTStatic);
+    curr_engine.use_calibration = params.use_calibration;
     curr_engine.cached_engine_batches = params.cached_engine_batches;
     curr_engine.maximum_cached_engines = params.max_cached_engines;
-    StrAppend(&curr_engine.engine_name, "my_trt_op_", t);
+    StrAppend(&curr_engine.engine_name, "TRTEngineOp_", t);
     status = RegisterSegmentFunctionToFunctionLibrary(
         &graph, curr_engine.segment_graph_def, curr_engine.engine_name);
     if (!status.ok()) {
@@ -969,16 +1007,9 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
                                 &graph, alloc.get(), &engine_nodes);
     // If status is ok, we successfully added the node to the graph and can
     // remove segment ops. Otherwise graph is not modified.
-    string msg = StrCat("Engine ", engine.engine_name, " creation for segment ",
-                        i, ", composed of ",
+    string msg = StrCat("TensorRT node ", engine.engine_name,
+                        " added for segment ", i, " consisting of ",
                         converted_segments.at(i).first.size(), " nodes");
-    if (VLOG_IS_ON(1)) {
-      StrAppend(&msg, " (");
-      for (const string& node_name : converted_segments.at(i).first) {
-        StrAppend(&msg, node_name, ", ");
-      }
-      StrAppend(&msg, ")");
-    }
     if (status.ok()) {
       LOG(INFO) << msg << " succeeded.";
       for (auto node_name : converted_segments.at(i).first) {
@@ -986,7 +1017,14 @@ tensorflow::Status ConvertAfterShapes(ConversionParams& params) {
       }
     } else {
       // Graph is not modified.
-      LOG(WARNING) << msg << " failed: " << status << ". Skipping...";
+      LOG(WARNING) << msg << " failed: " << status << ". Fallback to TF...";
+    }
+    if (VLOG_IS_ON(1)) {
+      msg = "Segment consists of nodes: ";
+      for (const string& node_name : converted_segments.at(i).first) {
+        StrAppend(&msg, node_name, ", ");
+      }
+      VLOG(1) << msg;
     }
   }
   cudaSetDevice(old_cuda_device);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph.h b/tensorflow/contrib/tensorrt/convert/convert_graph.h
index 1c9d82105a7b380cafbb27c340a4cc9d1580ee2c..1f39f56f6392ba33af3d74fec12c326ed4451cb6 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph.h
@@ -35,7 +35,8 @@ namespace convert {
 // supported by TRT.
 class TrtCandidateSelector {
  public:
-  TrtCandidateSelector(const grappler::GraphProperties& graph_properties);
+  TrtCandidateSelector(const grappler::GraphProperties& graph_properties,
+                       int precision_mode);
 
   // Returns OK iff 'node' is a TF-TRT conversion candidate, which will be added
   // to TRT subgraph and later converted into TRT engine.
@@ -49,6 +50,9 @@ class TrtCandidateSelector {
   // GraphProperties of the graph whose nodes are to be validated by
   // IsTensorRTCandidate().
   const grappler::GraphProperties& graph_properties_;
+
+  // Quantization ops are only converted when using quantized precisions.
+  const int precision_mode_;
 };
 
 struct ConversionParams {
@@ -63,6 +67,7 @@ struct ConversionParams {
         cluster(nullptr),
         is_dyn_op(false),
         fixed_input_size(true),
+        use_calibration(true),
         max_cached_engines(1) {}
   const tensorflow::GraphDef* input_graph_def;
   const std::vector<string>* output_names;
@@ -76,6 +81,7 @@ struct ConversionParams {
   bool is_dyn_op;  //  Whether to create engine on conversion or execution time
   bool fixed_input_size;   // Assume non-batch ranks of input tensors are fixed
   int max_cached_engines;  // maximum number of cached engines
+  bool use_calibration;
   std::vector<int> cached_engine_batches;  // list of cached engines
 };
 
@@ -95,7 +101,7 @@ tensorflow::Status ConvertGraphDefToTensorRT(
     size_t max_workspace_size_bytes, tensorflow::GraphDef* new_graph_def,
     int precision_mode = 1, int minimum_segment_size = 3,
     bool is_dyn_op = false, int max_cached_engines = 1,
-    std::vector<int> cached_engine_batches = {});
+    std::vector<int> cached_engine_batches = {}, bool use_calibration = true);
 
 // Method to call from optimization pass
 tensorflow::Status ConvertAfterShapes(ConversionParams& params);
diff --git a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
index f10729987fdb787c6a745fdac28fe7d7d60d08fa..2d2bfeb192c1893824c7b30bfad593c62c203392 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_graph_test.cc
@@ -85,27 +85,42 @@ TEST(TrtCandidateSelector, Basics) {
       ops::MatMul(s.WithOpName("matmul_with_incompatible_input"),
                   incompatible_feed, const_2);
 
+  // Quantize ops.
+  auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
+  auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("quantize"), feed,
+                                               quantize_attrs);
+
+  // Get GrapplerItem and GraphProperties.
   grappler::GrapplerItem item;
   TF_EXPECT_OK(s.ToGraphDef(&item.graph));
   Tensor feed_tensor(DT_FLOAT, input_shape);
   item.feed.push_back(std::make_pair("feed", feed_tensor));
-
   grappler::GraphProperties graph_properties(item);
   TF_EXPECT_OK(graph_properties.InferStatically(true));
 
-  TrtCandidateSelector selector(graph_properties);
-  TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
-  ExpectStatus(
-      selector.IsTensorRTCandidate(incompatible_matmul.operation.node()),
-      error::INVALID_ARGUMENT,
-      "transpose_a is not supported for TensorRT FullyConnected "
-      "(op: MatMul), at: incompatible_matmul");
-  ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
-               error::UNIMPLEMENTED, "Op type Sin is not supported");
-  ExpectStatus(selector.IsTensorRTCandidate(
-                   matmul_with_incompatible_input.operation.node()),
-               error::INTERNAL,
-               "Failed to convert input with index 0 to a TRT_TensorOrWeights");
+  for (const int precision_mode : {FP32MODE, INT8MODE}) {
+    TrtCandidateSelector selector(graph_properties, precision_mode);
+    TF_EXPECT_OK(selector.IsTensorRTCandidate(matmul.operation.node()));
+    ExpectStatus(
+        selector.IsTensorRTCandidate(incompatible_matmul.operation.node()),
+        error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected "
+        "(op: MatMul), at: incompatible_matmul");
+    ExpectStatus(selector.IsTensorRTCandidate(unsupported_op.operation.node()),
+                 error::UNIMPLEMENTED, "Op type Sin is not supported");
+    ExpectStatus(
+        selector.IsTensorRTCandidate(
+            matmul_with_incompatible_input.operation.node()),
+        error::INTERNAL,
+        "Failed to convert input with index 0 to a TRT_TensorOrWeights");
+    if (precision_mode == INT8MODE) {
+      TF_EXPECT_OK(selector.IsTensorRTCandidate(quantize.operation.node()));
+    } else {
+      ExpectStatus(selector.IsTensorRTCandidate(quantize.operation.node()),
+                   error::UNIMPLEMENTED,
+                   "Op type FakeQuantWithMinMaxArgs is not supported");
+    }
+  }
 }
 
 class FakeCluster : public grappler::Cluster {
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
index e2988f5f2a8f6164cbe193573b267e6ffeef3284..777a80bbc4da7a260cf85d0a7bc5ec16f4cd3cab 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.cc
@@ -54,10 +54,10 @@ limitations under the License.
 // would work!
 #define TFTRT_CHECK_EQ_TYPE(val1, val2) CHECK_EQ((int)val1, (int)val2)
 
-#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                               \
-  do {                                                                   \
-    return tensorflow::errors::Internal(                                 \
-        "TFTRT::", __FUNCTION__, "failed to add TRT layer, at: ", node); \
+#define TFTRT_INTERNAL_ERROR_AT_NODE(node)                                \
+  do {                                                                    \
+    return tensorflow::errors::Internal(                                  \
+        "TFTRT::", __FUNCTION__, " failed to add TRT layer, at: ", node); \
   } while (0)
 
 #define TFTRT_RETURN_ERROR_IF_FALSE(status, node) \
@@ -120,6 +120,15 @@ inline nvinfer1::Dims TensorShapeToTrtDims(const TensorShapeType& shape,
   return trt_dims;
 }
 
+Status TensorShapeArrayToTrtDims(const std::vector<int>& shape,
+                                 nvinfer1::Dims* out,
+                                 bool ignore_first_dim = false) {
+  PartialTensorShape tensor_shape;
+  TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(shape, &tensor_shape));
+  *out = TensorShapeToTrtDims(tensor_shape, ignore_first_dim);
+  return tensorflow::Status::OK();
+}
+
 void GetOutputProperties(const grappler::GraphProperties& graph_properties,
                          const Node* node, const int out_port,
                          PartialTensorShape* shape,
@@ -130,7 +139,7 @@ void GetOutputProperties(const grappler::GraphProperties& graph_properties,
     *dtype = out_shape.dtype();
     *shape = out_shape.shape();
   } else {
-    VLOG(0) << "Unknown output shape" << node->name();
+    LOG(INFO) << "Unknown output shape" << node->name();
     *dtype = node->output_type(out_port);
   }
 }
@@ -181,16 +190,55 @@ Status ValidateTensorProperties(const string& producer_node_type,
     if (shape.dim_size(d) < 0) {
       return errors::InvalidArgument(
           "Input tensor with shape ", shape.DebugString(),
-          " has an unknown non-batch dimemension at dim ", d);
+          " has an unknown non-batch dimension at dim ", d);
     }
   }
   return Status::OK();
 }
 
+string DebugString(const nvinfer1::DimensionType type) {
+  switch (type) {
+    case nvinfer1::DimensionType::kSPATIAL:
+      return "kSPATIAL";
+    case nvinfer1::DimensionType::kCHANNEL:
+      return "kCHANNEL";
+    case nvinfer1::DimensionType::kINDEX:
+      return "kINDEX";
+    case nvinfer1::DimensionType::kSEQUENCE:
+      return "kSEQUENCE";
+    default:
+      return StrCat(static_cast<int>(type), "=unknown");
+  }
+}
+
+string DebugString(const nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return "kFLOAT";
+    case nvinfer1::DataType::kHALF:
+      return "kHALF";
+    case nvinfer1::DataType::kINT8:
+      return "kINT8";
+    case nvinfer1::DataType::kINT32:
+      return "kINT32";
+    default:
+      return "Invalid TRT data type";
+  }
+}
+
 string DebugString(const nvinfer1::Dims& dims) {
   string out = StrCat("nvinfer1::Dims(nbDims=", dims.nbDims, ", d=");
   for (int i = 0; i < dims.nbDims; ++i) {
-    StrAppend(&out, dims.d[i], ",");
+    StrAppend(&out, dims.d[i], "[", DebugString(dims.type[i]), "],");
+  }
+  StrAppend(&out, ")");
+  return out;
+}
+
+string DebugString(const nvinfer1::Permutation& permutation, int len) {
+  string out = "nvinfer1::Permutation(";
+  for (int i = 0; i < len; ++i) {
+    StrAppend(&out, permutation.order[i], ",");
   }
   StrAppend(&out, ")");
   return out;
@@ -198,16 +246,15 @@ string DebugString(const nvinfer1::Dims& dims) {
 
 string DebugString(const nvinfer1::ITensor& tensor) {
   return StrCat("nvinfer1::ITensor(@", reinterpret_cast<uintptr_t>(&tensor),
-                ", shape=", DebugString(tensor.getDimensions()), ")");
+                ", name=", tensor.getName(),
+                ", dtype=", DebugString(tensor.getType()),
+                ", dims=", DebugString(tensor.getDimensions()), ")");
 }
 
-// Return whether or not the broadcast is feasible;
-bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
-                               const bool operand_l_is_tensor,
-                               const nvinfer1::Dims& operand_r,
-                               const bool operand_r_is_tensor,
-                               nvinfer1::Dims* operand_l_new_shape,
-                               nvinfer1::Dims* operand_r_new_shape) {
+Status Converter::GetTrtBroadcastShape(
+    const TRT_TensorOrWeights& operand_l, const TRT_TensorOrWeights& operand_r,
+    nvinfer1::Dims* operand_l_new_dims,
+    nvinfer1::Dims* operand_r_new_dims) const {
   // ***************************************************************************
   // TensorRT Elementwise op supports broadcast but requires both tensor to be
   // of Identical rank
@@ -232,52 +279,59 @@ bool TensorRTGetBroadcastShape(const nvinfer1::Dims& operand_l,
   // -> T: 1 1 1 -1 3 5 1
   // -> W: 1 1 1  1 3 5 1
   // ***************************************************************************
-  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
-  const size_t element_size = sizeof(operand_l.d[0]);
-
-  // fill in dimensions
-  int l_s[max_nb_dims];
-  std::fill(l_s, l_s + max_nb_dims, 1);
-  int l_d = operand_l_is_tensor ? operand_l.nbDims + 1 : operand_l.nbDims;
-  int r_s[max_nb_dims];
-  std::fill(r_s, r_s + max_nb_dims, 1);
-  int r_d = operand_r_is_tensor ? operand_r.nbDims + 1 : operand_r.nbDims;
-
-  int max_d = std::max(l_d, r_d);
-  std::memcpy(l_s + max_d - operand_l.nbDims, operand_l.d,
-              operand_l.nbDims * element_size);
-  std::memcpy(r_s + max_d - operand_r.nbDims, operand_r.d,
-              operand_r.nbDims * element_size);
-
-  // set -1 for batch dimension, since batch size is not supposed to be
-  // broadcasted
-  if (operand_l_is_tensor) {
-    if (max_d != l_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    l_s[0] = -1;
-  }
-  if (operand_r_is_tensor) {
-    if (max_d != r_d) {  // if broadcast beyond batch dimension, fail
-      return false;
-    }
-    r_s[0] = -1;
+  if (!operand_l.is_tensor() && !operand_r.is_tensor()) {
+    return errors::InvalidArgument(
+        "Broadcasting requires at least one of the operands be tensors");
   }
 
-  // compare broadcast feasibility
-  for (int i = max_d - 1; i >= 0; i--) {
-    if ((l_s[i] != r_s[i]) && (l_s[i] != 1) && (r_s[i] != 1)) {
-      return false;
+  const int max_nb_dims = nvinfer1::Dims::MAX_DIMS + 1;
+  auto compute_output_dims =
+      [max_nb_dims](const TRT_TensorOrWeights& input, int broadcast_num_dims,
+                    int* output_dims_array, nvinfer1::Dims* output_dims) {
+        const nvinfer1::Dims input_dims = input.GetTrtDims();
+        std::fill(output_dims_array, output_dims_array + max_nb_dims, 1);
+        std::copy(input_dims.d, input_dims.d + input_dims.nbDims,
+                  output_dims_array + broadcast_num_dims - input_dims.nbDims);
+        if (input.is_tensor()) {
+          const int true_input_dims = input_dims.nbDims + 1;
+          if (true_input_dims < broadcast_num_dims) {
+            return errors::InvalidArgument(
+                "Broadcasting beyond batch dimension is not supported ",
+                "(tensor #dims ", true_input_dims, " vs broadcast #dims ",
+                broadcast_num_dims, ")");
+          }
+          // Set the batch dimension to -1, since batch size is not supposed to
+          // be broadcasted.
+          output_dims_array[0] = -1;
+        }
+        // Copy to output dimensions (stripping the batch dimension).
+        output_dims->nbDims = broadcast_num_dims - 1;
+        std::copy(output_dims_array + 1, output_dims_array + broadcast_num_dims,
+                  output_dims->d);
+        return Status::OK();
+      };
+
+  // Compute the output dimensions.
+  const int broadcast_num_dims =
+      std::max(operand_l.GetTrtDims().nbDims + (operand_l.is_tensor() ? 1 : 0),
+               operand_r.GetTrtDims().nbDims + (operand_r.is_tensor() ? 1 : 0));
+  int output_l[max_nb_dims], output_r[max_nb_dims];
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_l, broadcast_num_dims,
+                                         output_l, operand_l_new_dims));
+  TF_RETURN_IF_ERROR(compute_output_dims(operand_r, broadcast_num_dims,
+                                         output_r, operand_r_new_dims));
+
+  // Compare broadcast feasibility
+  for (int i = 0; i < broadcast_num_dims; ++i) {
+    if ((output_l[i] != output_r[i]) && (output_l[i] != 1) &&
+        (output_r[i] != 1)) {
+      return errors::InvalidArgument(
+          "Infeasible broadcast scheme (", "batch_dim: ", output_l[0], ", ",
+          DebugString(*operand_l_new_dims), " vs ", "batch_dim: ", output_r[0],
+          ", ", DebugString(*operand_r_new_dims), ")");
     }
   }
-
-  // output new TensorRT Dimension (stripping the batch dimension)
-  operand_l_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_l_new_shape->d, l_s + 1, (max_d - 1) * element_size);
-  operand_r_new_shape->nbDims = max_d - 1;
-  std::memcpy(operand_r_new_shape->d, r_s + 1, (max_d - 1) * element_size);
-
-  return true;
+  return Status::OK();
 }
 
 inline bool DimsEqual(const nvinfer1::Dims& dim_l,
@@ -381,7 +435,7 @@ size_t TRT_ShapedWeights::size_bytes() const {
 
 string TRT_ShapedWeights::DebugString() const {
   return StrCat("TRT_ShapedWeights(shape=", convert::DebugString(shape_),
-                ", type=", type_,
+                ", type=", DataTypeString(type_),
                 ", values=", reinterpret_cast<uintptr_t>(GetValues()), ")");
 }
 
@@ -425,7 +479,9 @@ class TRT_TensorOrWeights::SimpleITensor : public nvinfer1::ITensor {
   void setLocation(nvinfer1::TensorLocation location) override {}
 
 #if NV_TENSORRT_MAJOR >= 5
-  bool setDynamicRange(float min, float max) override {}
+  bool setDynamicRange(float min, float max) override { return true; }
+
+  float getDynamicRange() const override { return 0; }
 #endif
 
  private:
@@ -489,8 +545,7 @@ nvinfer1::Dims TRT_TensorOrWeights::GetTrtDims() const {
 string TRT_TensorOrWeights::DebugString() const {
   string output = "TRT_TensorOrWeights(type=";
   if (is_tensor()) {
-    StrAppend(&output, "tensor @", reinterpret_cast<uintptr_t>(tensor()),
-              ", shape=", convert::DebugString(tensor()->getDimensions()),
+    StrAppend(&output, "tensor=", convert::DebugString(*tensor()),
               ", batch_size=", batch_size_);
   } else {
     StrAppend(&output, "weights=", weights_.DebugString());
@@ -753,8 +808,9 @@ Status TrtNodeValidator::ValidateNode(
     Status status = ConvertToTensorOrWeights(
         *pair.first, pair.second, graph_properties, &tensor_or_weights);
     if (!status.ok()) {
-      return errors::Internal("Failed to convert input with index ", i,
-                              " to a TRT_TensorOrWeights");
+      return errors::Internal(
+          "Failed to convert input with index ", i,
+          " to a TRT_TensorOrWeights: ", status.error_message());
     }
     inputs.push_back(tensor_or_weights);
   }
@@ -786,8 +842,11 @@ Status TrtNodeValidator::ConvertConstToWeights(
   return status;
 }
 
-Converter::Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16)
-    : trt_network_(trt_network), is_fp16_(is_fp16) {
+Converter::Converter(nvinfer1::INetworkDefinition* trt_network,
+                     int precision_mode, bool use_calibration)
+    : trt_network_(trt_network),
+      precision_mode_(precision_mode),
+      use_calibration_(use_calibration) {
   this->RegisterOpConverters();
 }
 
@@ -812,13 +871,18 @@ Status Converter::ConvertNode(const NodeDef& node_def) {
     TRT_TensorOrWeights& output = outputs[i];
     string output_name = node_def.name();
     if (i != 0) output_name = StrCat(output_name, ":", i);
-    // We need to check the name before setting it. For Identity op where the
-    // output is the input, if its input is one of the engine input, setting
-    // the name here will overwrite engine input bindings which will cause
-    // runtime error.
+    // We need to check the name before setting it. If the input is one of the
+    // engine input, setting the name here will overwrite engine input
+    // bindings which will cause runtime error.
     if (output.is_tensor()) {
       const char* tensor_name = output.tensor()->getName();
-      if (tensor_name == nullptr || std::strlen(tensor_name) == 0) {
+      if (!tensorflow::str_util::StartsWith(tensor_name, kInputPHName)) {
+        // TRT initializes tensor names as "(Unnamed ITensor* N)". We rename
+        // them to match their corresponding TensorFlow name.
+        // Note: ITensors that we create internally within TF-TRT which are
+        // not inputs or outputs of a node will not be renamed. This is a
+        // potential cause of confusion if an error message or warning
+        // mentions the unnamed tensor.
         output.tensor()->setName(output_name.c_str());
       }
     }
@@ -930,11 +994,14 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
 
   nvinfer1::IShuffleLayer* layer = this->network()->addShuffle(*input_tensor);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Transpose");
+  MarkQuantizationRangesAsInferrable(input_tensor, layer->getOutput(0));
 
   nvinfer1::Permutation permutation;
   for (int32_t i = 0; i < dims.nbDims; ++i) {
     permutation.order[i] = order_with_batch_dim[i + 1] - 1;
   }
+  VLOG(1) << "TransposeTensor permutation: "
+          << DebugString(permutation, dims.nbDims);
   layer->setFirstTranspose(permutation);
 
   nvinfer1::Dims reshape_dims;
@@ -950,6 +1017,38 @@ Status Converter::TransposeTensor(nvinfer1::ITensor* input_tensor,
   return tensorflow::Status::OK();
 }
 
+Status Converter::GetWeightRange(const TRT_ShapedWeights& weights,
+                                 float* out_min, float* out_max) const {
+  switch (weights.type_) {
+    case DataType::DT_FLOAT: {
+      auto inp = static_cast<float const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = *result.first;
+      *out_max = *result.second;
+      break;
+    }
+    case DataType::DT_HALF: {
+      auto inp = static_cast<Eigen::half const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = Eigen::half_impl::half_to_float(*result.first);
+      *out_max = Eigen::half_impl::half_to_float(*result.second);
+      break;
+    }
+    case DataType::DT_INT32: {
+      auto inp = static_cast<int const*>(weights.GetValues());
+      auto result = std::minmax_element(inp, inp + weights.count());
+      *out_min = static_cast<float>(*result.first);
+      *out_max = static_cast<float>(*result.second);
+      break;
+    }
+    default:
+      return errors::Unimplemented(
+          "Data type not supported for GetWeightRange: ",
+          DataTypeString(weights.type_));
+  }
+  return Status::OK();
+}
+
 Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
                                         const nvinfer1::Dims& dims,
                                         const nvinfer1::ITensor** tensor) {
@@ -964,8 +1063,9 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
   }
   if (can_check_shapes &&
       TrtDimsNumElements(input.GetTrtDims()) != TrtDimsNumElements(dims)) {
-    return tensorflow::errors::InvalidArgument(
-        "Reshape shapes are not compatible.");
+    return errors::InvalidArgument("Reshape shapes are not compatible (",
+                                   DebugString(input.GetTrtDims()), " vs ",
+                                   DebugString(dims), ")");
   }
 
   if (input.is_tensor()) {
@@ -976,6 +1076,8 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
           *const_cast<nvinfer1::ITensor*>(input.tensor()));
       TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
       layer->setReshapeDimensions(dims);
+      MarkQuantizationRangesAsInferrable(
+          const_cast<nvinfer1::ITensor*>(input.tensor()), layer->getOutput(0));
       *tensor = layer->getOutput(0);
     }
   } else {
@@ -983,10 +1085,123 @@ Status Converter::PrepareTensorForShape(const TRT_TensorOrWeights& input,
         this->network()->addConstant(dims, input.weights().GetTrtWeights());
     TFTRT_RETURN_ERROR_IF_NULLPTR(layer, "TF-TRT Internal Reshape");
     *tensor = layer->getOutput(0);
+    if (precision_mode() == INT8MODE && !use_calibration()) {
+      // If we are in int8 mode and not calibrating, we need to explicitly set a
+      // quantization range for the output tensor of the IConstantLayer. Here we
+      // set the range to [min(weights), max(weights)].
+      float min_range = 0.0f;
+      float max_range = 0.0f;
+      TF_RETURN_IF_ERROR(
+          GetWeightRange(input.weights(), &min_range, &max_range));
+      // Avoid setting range to 0 because TRT will throw an error. If the
+      // weights are zero then the range doesn't matter: using 127.0f should
+      // ensure the quantized weight will be exactly zero.
+      if (min_range == 0.0f && max_range == 0.0f) {
+        min_range = -127.0f;
+        max_range = 127.0f;
+      }
+      ProvideQuantizationRange(const_cast<nvinfer1::ITensor*>(*tensor),
+                               min_range, max_range);
+    }
   }
   return tensorflow::Status::OK();
 }
 
+void Converter::MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
+                                                   nvinfer1::ITensor* output) {
+  quantization_infer_.push_back({input, output});
+  quantization_infer_.push_back({output, input});
+}
+
+void Converter::ProvideQuantizationRange(nvinfer1::ITensor* tensor,
+                                         float min_range, float max_range) {
+  float symmetric_range = std::max(std::abs(min_range), std::abs(max_range));
+  quantization_ranges_[tensor] = symmetric_range;
+}
+
+void Converter::MaybeApplyQuantizationRanges() {
+  if (precision_mode() != INT8MODE) return;
+
+  // Infer ranges across marked ops.
+  PropagateQuantizationRanges();
+  // Apply ranges.
+#if NV_TENSORRT_MAJOR >= 5
+  for (auto pair : quantization_ranges_) {
+    nvinfer1::ITensor* tensor = pair.first;
+    const float range = pair.second;
+    VLOG(1) << "Setting range for: " << tensor->getName() << ": " << range;
+    // TODO(laigd): if 'tensor' already has a range set which doesn't match
+    // 'range', it should report error.
+    tensor->setDynamicRange(-range, range);
+  }
+#endif
+
+  // Warn user about tensors that are missing ranges. If TRT fuses some layers
+  // then these tensors may not actually be required, which is why this is
+  // just a warning. If we are still missing ranges even after fusion,
+  // Builder::buildCudaEngine() will return nullptr and we will catch the
+  // error at that point.
+  if (!use_calibration()) {
+    // Get all tensors from network
+    std::set<nvinfer1::ITensor*> all_tensors;
+    for (int i = 0; i < this->network()->getNbLayers(); i++) {
+      nvinfer1::ILayer* layer = this->network()->getLayer(i);
+      for (int j = 0; j < layer->getNbInputs(); j++) {
+        all_tensors.insert(layer->getInput(j));
+      }
+      for (int j = 0; j < layer->getNbOutputs(); j++) {
+        all_tensors.insert(layer->getOutput(j));
+      }
+    }
+    // Find tensors with no ranges
+    for (auto tensor : all_tensors) {
+      if (!quantization_ranges_.count(tensor)) {
+        // Note: there may be some warnings for "(Unnamed ITensor* N)". These
+        // are tensors which are created internally by TF-TRT. The ranges for
+        // these unnamed ITensors are always inferred from user provided ranges,
+        // thus there will also be a warning for the range(s) the user missed.
+        LOG(WARNING) << "Quantization range was not found for "
+                     << tensor->getName() << ". "
+                     << "This is okay if TensorRT does not need the range "
+                     << "(e.g. due to node fusion).";
+      }
+    }
+  }
+}
+
+void Converter::PropagateQuantizationRanges() {
+  // Propagate ranges across edges in quantization_infer_ until no new
+  // information is added.
+  // Note: this function modifies quantization_infer_, it might be better to
+  // modify a copy instead if we for some reason need quantization_infer_
+  // later.
+  bool information_added = true;
+  while (information_added) {
+    information_added = false;
+    for (auto it = quantization_infer_.begin();
+         it != quantization_infer_.end();) {
+      auto input_tensor_range = quantization_ranges_.find(it->first);
+      auto output_tensor_range = quantization_ranges_.find(it->second);
+      if (input_tensor_range != quantization_ranges_.end() &&
+          output_tensor_range == quantization_ranges_.end()) {
+        // Input has range but output doesn't: copy range
+        // TODO(laigd): consider reporting error if it a different range is
+        // already set.
+        quantization_ranges_[it->second] = input_tensor_range->second;
+        information_added = true;
+        VLOG(1) << "Copy quantization range: " << it->first->getName() << " -> "
+                << it->second->getName();
+      }
+      // We can remove edges when the output range is known
+      if (quantization_ranges_.find(it->second) != quantization_ranges_.end()) {
+        it = quantization_infer_.erase(it);
+      } else {
+        ++it;
+      }
+    }
+  }
+}
+
 Status Converter::GetInputs(const tensorflow::NodeDef& node_def,
                             std::vector<TRT_TensorOrWeights>* inputs) const {
   for (auto const& input_name : node_def.input()) {
@@ -1043,12 +1258,11 @@ TRT_ShapedWeights ConvertFP32ToFP16(TrtWeightStore* store,
 }
 
 // ****************************************************************************
-// Constant folding functions
-// TODO(jie): once optimizer kicks in, we should have done constant folding
-// there.
+// Constant folding functions for weights.
+// TODO(laigd): we should probably use eigen directly.
 // *****************************************************************************
 struct LambdaFactory {
-  enum class OP_CATEGORY : int { RSQRT = 0, NEG, ADD, MUL, SUB, RECIP };
+  enum class OP_CATEGORY : int { RSQRT = 0, NEG, RECIP };
   OP_CATEGORY op;
 
   template <typename T>
@@ -1063,84 +1277,10 @@ struct LambdaFactory {
       case OP_CATEGORY::RECIP:
         return [](T t) -> T { return 1.0 / t; };
       default:
-        VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+        LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
         return nullptr;
     }
   }
-
-  template <typename T>
-  std::function<T(T, T)> binary() {
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [](T l, T r) -> T { return l + r; };
-      case OP_CATEGORY::SUB:
-        return [](T l, T r) -> T { return l - r; };
-      case OP_CATEGORY::MUL:
-        return [](T l, T r) -> T { return l * r; };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [](T l, T r) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_r(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l + val;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l - val;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return l * val;
-        };
-      default:
-        LOG(WARNING) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
-
-  template <typename T>
-  std::function<T(T)> broadcast_l(T val) {
-    VLOG(2) << "LAMBDA VAL : " << val;
-    switch (op) {
-      case OP_CATEGORY::ADD:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val + l;
-        };
-      case OP_CATEGORY::SUB:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val - l;
-        };
-      case OP_CATEGORY::MUL:
-        return [val](T l) -> T {
-          VLOG(2) << "LAMBDA VAL : " << val;
-          return val * l;
-        };
-      default:
-        LOG(ERROR) << "Not supported op for binary: " << static_cast<int>(op);
-    }
-    return [val](T l) -> T {
-      LOG(FATAL) << "Unsupported op type ";
-      return l;
-    };
-  }
 };
 
 template <>
@@ -1148,15 +1288,18 @@ std::function<Eigen::half(Eigen::half)> LambdaFactory::unary<Eigen::half>() {
   switch (op) {
     case OP_CATEGORY::RSQRT: {
       VLOG(2) << "RSQRT GETS DONE";
-      return [](Eigen::half t) -> Eigen::half {
+      return [](Eigen::half t) {
         return Eigen::half(1.0 / sqrt(static_cast<float>(t)));
       };
     }
     case OP_CATEGORY::NEG:
-      return [](Eigen::half t) -> Eigen::half { return -t; };
-    // TODO(aaroey): can we support RECIP?
+      return [](Eigen::half t) { return -t; };
+    case OP_CATEGORY::RECIP:
+      return [](Eigen::half t) {
+        return Eigen::half(1.0 / static_cast<float>(t));
+      };
     default:
-      VLOG(2) << "Not supported op for unary: " << static_cast<int>(op);
+      LOG(ERROR) << "Not supported op for unary: " << static_cast<int>(op);
       return nullptr;
   }
 }
@@ -1188,50 +1331,48 @@ tensorflow::Status UnaryCompute(const TRT_ShapedWeights& iweights,
   return tensorflow::Status::OK();
 }
 
+// If swapped_inputs is false, 'tensor' is the left operand and 'weights' is the
+// right operand. If swapped_inputs is true, those two are swapped.
+//
 // TODO(jie): broadcast is needed yet not implemented.
-// Only implemented channel wise for the time being
-tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
-                                        const nvinfer1::ITensor* tensor,
-                                        TRT_ShapedWeights weights,
-                                        bool swapped_inputs) {
+// Only implemented channel wise for the time being.
+Status BinaryTensorOpWeight(OpConverterParams* params,
+                            const nvinfer1::ITensor* tensor,
+                            TRT_ShapedWeights weights, bool swapped_inputs) {
+  static const std::unordered_set<string> supported_ops = {"Sub", "Add", "Mul",
+                                                           "Div", "RealDiv"};
   const auto& node_def = params->node_def;
-  // tensor is the left operand while weights is the right operand;
-  // when swapped_inputs set to true, those two are swapped.
-  // TODO(aaroey): use a set.
-  if (node_def.op() != "Sub" && node_def.op() != "Add" &&
-      node_def.op() != "Mul" && node_def.op() != "Div" &&
-      node_def.op() != "RealDiv") {
-    return tensorflow::errors::Unimplemented(
-        "op not supported: " + node_def.op() + ", at: " + node_def.name());
+  if (!supported_ops.count(node_def.op())) {
+    return errors::Unimplemented(node_def.op(), " is not supported, at ",
+                                 node_def.name());
   }
 
-  // Check type consistency
-  nvinfer1::DataType ttype;
-  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &ttype));
+  // Check type consistency.
+  nvinfer1::DataType trt_dtype;
+  TF_RETURN_IF_ERROR(ConvertDType(weights.type_, &trt_dtype));
 
-  // Check scale mode
+  // Check scale mode.
   auto dims_w = weights.shape_;
-  auto dims_t = tensor->getDimensions();
+  const auto dims_t = tensor->getDimensions();
 
   // TODO(jie): addScale checks for input tensor dimension
   if (dims_t.nbDims != 3) {
-    return tensorflow::errors::InvalidArgument(
-        "addScale requires tensor with rank 3, " + node_def.name());
+    return errors::InvalidArgument("addScale requires tensor with rank 3, at ",
+                                   node_def.name());
   }
 
-  // default to element-wise
+  // Default to element-wise
   auto scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
 
   // TODO(jie): maybe use a permutation instead to support more cases;
-  bool permutation_flag = false;
+  bool need_to_permute = false;
 
   if (weights.count() == 1) {
-    VLOG(2) << "UNIFORM";
     scale_mode = nvinfer1::ScaleMode::kUNIFORM;
   } else {
-    // no broadcasting on Batch dimension;
-    VLOG(2) << "WEIGHTS DIM: " << dims_w.nbDims
-            << " tensor DIM: " << dims_t.nbDims;
+    VLOG(2) << "weights dims: " << DebugString(dims_w)
+            << "; tensor dims: " << DebugString(dims_t);
+    // Make sure no broadcasting on batch dimension.
     if (dims_w.nbDims == dims_t.nbDims + 1) {
       if (dims_w.d[0] == 1) {
         for (int i = 1; i < dims_w.nbDims; i++) {
@@ -1239,72 +1380,70 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
         }
         dims_w.nbDims--;
       } else {
-        return tensorflow::errors::InvalidArgument(
-            "Binary op cannot operate on batch, " + node_def.name());
+        return errors::InvalidArgument("Binary op cannot operate on batch, at ",
+                                       node_def.name());
       }
     }
 
     if (dims_w.nbDims == dims_t.nbDims && dims_w.d[0] == dims_t.d[0]) {
       scale_mode = nvinfer1::ScaleMode::kELEMENTWISE;
-      // default is element;
+      // Default is element-wise
       for (int i = 1; i < dims_w.nbDims; i++) {
         if (dims_w.d[i] != dims_t.d[i]) {
-          // if dimension does not match, switch back to channel;
-          VLOG(2) << "channel";
+          // If dimension does not match, switch back to per-channel
           scale_mode = nvinfer1::ScaleMode::kCHANNEL;
           break;
         }
       }
-      // if channel as candidate, validate it
+      // If the mode is per-channel, since channel dimension is assumed to be
+      // the third to last dimension, we need to make sure all other dimensions
+      // have size 1.
       if (scale_mode == nvinfer1::ScaleMode::kCHANNEL) {
         for (int i = 1; i < dims_w.nbDims; i++) {
           if (dims_w.d[i] != 1)
-            return tensorflow::errors::InvalidArgument(
-                "Weight shape not compatible at, " + node_def.name());
+            return errors::InvalidArgument(
+                "Weight dims not compatible for channel-wise broadcast at ",
+                node_def.name());
         }
-      } else {
-        VLOG(2) << "elementwise";
       }
     } else if (dims_w.nbDims == 1 &&
                dims_w.d[0] == dims_t.d[dims_t.nbDims - 1]) {
-      // channel wise and broadcast required;
-      permutation_flag = true;
+      // Channel wise and broadcast required. We compare the last dimension of
+      // the tensor shape because of tensorflow default broadcasting rules.
+      need_to_permute = true;
       scale_mode = nvinfer1::ScaleMode::kCHANNEL;
     } else {
-      return tensorflow::errors::InvalidArgument(
-          "Weight shape not compatible at, " + node_def.name());
+      return errors::InvalidArgument("Weight dims not compatible at ",
+                                     node_def.name());
     }
   }
+  // TODO(laigd): we should add validation_only support in TransposeTensor() and
+  // PrepareTensorForShape().
+  if (params->validation_only) return Status::OK();
 
-  // transpose last dimension
+  // Transpose last dimension.
   std::vector<int> permutation(dims_t.nbDims + 1);
-  if (permutation_flag) {
-    if (scale_mode == nvinfer1::ScaleMode::kCHANNEL && dims_t.nbDims > 1) {
-      // we swap the last dimension into channel for trt.
-      // because of tensorflow default broadcasting rules.
-      for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
-        permutation[i] = i;
-      }
-      permutation[1] = dims_t.nbDims;
-      permutation[dims_t.nbDims] = 1;
-      TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
-          const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
-    } else {
-      return tensorflow::errors::InvalidArgument(
-          "Transpose cannot be applied, " + node_def.name());
-    }
+  if (need_to_permute) {
+    // We swap the last dimension into channel for trt, because of tensorflow
+    // default broadcasting rules.
+    for (int i = 0; i < static_cast<int>(permutation.size()); i++) {
+      permutation[i] = i;
+    }
+    permutation[1] = dims_t.nbDims;
+    permutation[dims_t.nbDims] = 1;
+    TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
+        const_cast<nvinfer1::ITensor*>(tensor), permutation, &tensor));
   }
 
-  if (params->converter->is_fp16()) {
+  if (params->converter->precision_mode() == FP16MODE) {
     weights = ConvertFP32ToFP16(params->weight_store, weights);
   }
 
-  // prepare weights
+  // Prepare weights
   TRT_ShapedWeights shift_weights(weights.type_);
   TRT_ShapedWeights scale_weights(weights.type_);
   TRT_ShapedWeights power_weights(weights.type_);
 
-  // Maybe I should do a switch
   if (node_def.op() == "Sub") {
     if (swapped_inputs) {
       shift_weights = weights;
@@ -1312,6 +1451,10 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
           *const_cast<nvinfer1::ITensor*>(tensor),
           nvinfer1::UnaryOperation::kNEG);
       TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+      // Since quantization ranges are symmetric, the same range as the input
+      // will work for the negation of the input.
+      params->converter->MarkQuantizationRangesAsInferrable(
+          const_cast<nvinfer1::ITensor*>(tensor), layer->getOutput(0));
       tensor = layer->getOutput(0);
     } else {
       TRT_ShapedWeights neg_weights =
@@ -1323,6 +1466,25 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
     }
   } else if (node_def.op() == "Div" || node_def.op() == "RealDiv") {
     if (swapped_inputs) {
+      // We need to infer the quantization range for this intermediate tensor.
+      //
+      //   x -> [Recip] -> 1/x -> [Scale] -> s/x
+      //                    ^
+      //            need range for this
+      //
+      // We have the quantization scales for x and s/x - can we divide the scale
+      // for s/x by s? Only if it is a scalar.
+      //
+      // Because of this issue, fall back to BinaryTensorOpTensor if we are
+      // doing INT8 with no calibration. There is most likely no performance
+      // penalty by falling back here.
+      if (params->converter->precision_mode() == INT8MODE &&
+          !params->converter->use_calibration()) {
+        return errors::Unimplemented(
+            "Intermediate quantization range cannot be determined without"
+            " calibration. Falling back to BinaryTensorOpTensor for ",
+            node_def.op(), ", at ", node_def.name());
+      }
       scale_weights = weights;
       nvinfer1::IUnaryLayer* layer = params->converter->network()->addUnary(
           *const_cast<nvinfer1::ITensor*>(tensor),
@@ -1342,8 +1504,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
   } else if (node_def.op() == "Add") {
     shift_weights = weights;
   } else {
-    return tensorflow::errors::Unimplemented("Binary op not supported: " +
-                                             node_def.op());
+    // This should not happen.
+    return errors::Unimplemented("Binary op not supported at ", node_def.op());
   }
 
   nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
@@ -1353,8 +1515,8 @@ tensorflow::Status BinaryTensorOpWeight(OpConverterParams* params,
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   const nvinfer1::ITensor* output_tensor = layer->getOutput(0);
-  // transpose back dimension
-  if (permutation_flag) {
+  // Transpose back dimension
+  if (need_to_permute) {
     TF_RETURN_IF_ERROR(params->converter->TransposeTensor(
         const_cast<nvinfer1::ITensor*>(output_tensor), permutation,
         &output_tensor));
@@ -1398,7 +1560,7 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
     return tensorflow::errors::Internal(
         "Conv2D expects kernel of dimension 4, at: " + node_def.name());
   }
-  if (params->converter->is_fp16()) {
+  if (params->converter->precision_mode() == FP16MODE) {
     weights_rsck =
         ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
   }
@@ -1445,6 +1607,8 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params, int group) {
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
     VLOG(2) << "TENSOR after: " << DebugString(tensor->getDimensions());
@@ -1486,9 +1650,9 @@ tensorflow::Status ConvertConv2DHelper(OpConverterParams* params,
                                            params->node_def.name());
 }
 
-tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params,
-                                        const TRT_TensorOrWeights& operand_l,
-                                        const TRT_TensorOrWeights& operand_r) {
+Status BinaryTensorOpTensor(OpConverterParams* params,
+                            const TRT_TensorOrWeights& operand_l,
+                            const TRT_TensorOrWeights& operand_r) {
   const auto& node_def = params->node_def;
   static const std::unordered_map<string, nvinfer1::ElementWiseOperation> ops{
       {"Add", nvinfer1::ElementWiseOperation::kSUM},
@@ -1499,50 +1663,52 @@ tensorflow::Status BinaryTensorOpTensor(OpConverterParams* params,
       {"Minimum", nvinfer1::ElementWiseOperation::kMIN},
       {"Maximum", nvinfer1::ElementWiseOperation::kMAX},
   };
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end()) {
+    return errors::Unimplemented("Binary op ", node_def.op(),
+                                 " not supported at: ", node_def.name());
+  }
 
-  const nvinfer1::ITensor* tensor_l;
-  const nvinfer1::ITensor* tensor_r;
-
-  nvinfer1::Dims dim_l;
-  nvinfer1::Dims dim_r;
-
-  if (!TensorRTGetBroadcastShape(operand_l.GetTrtDims(), operand_l.is_tensor(),
-                                 operand_r.GetTrtDims(), operand_r.is_tensor(),
-                                 &dim_l, &dim_r)) {
-    return tensorflow::errors::InvalidArgument(
-        "Binary op broadcast scheme not supported by TensorRT op: " +
-        node_def.op() + ", at: " + node_def.name());
+  nvinfer1::Dims broadcasted_dims_l, broadcasted_dims_r;
+  Status status = params->converter->GetTrtBroadcastShape(
+      operand_l, operand_r, &broadcasted_dims_l, &broadcasted_dims_r);
+  if (!status.ok()) {
+    return errors::InvalidArgument(
+        "Unsupported binary op broadcast scheme for op ", node_def.name(), ": ",
+        status.error_message());
   }
+  if (params->validation_only) return Status::OK();
 
-  TF_RETURN_IF_ERROR(
-      params->converter->PrepareTensorForShape(operand_l, dim_l, &tensor_l));
-  TF_RETURN_IF_ERROR(
-      params->converter->PrepareTensorForShape(operand_r, dim_r, &tensor_r));
+  const nvinfer1::ITensor* tensor_l = nullptr;
+  const nvinfer1::ITensor* tensor_r = nullptr;
+  status = params->converter->PrepareTensorForShape(
+      operand_l, broadcasted_dims_l, &tensor_l);
+  if (status.ok()) {
+    status = params->converter->PrepareTensorForShape(
+        operand_r, broadcasted_dims_r, &tensor_r);
+  }
+  if (!status.ok()) {
+    return errors::Internal("Failed to convert binary op ", node_def.name(),
+                            ": ", status.error_message());
+  }
 
-  // get trt type & shape
+  // Check type consistency.
   TFAttrs attrs(node_def);
-  // maybe this part has to be moved into the block of rsqrt later
   nvinfer1::DataType dtype = attrs.get<nvinfer1::DataType>("T");
+  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype)
+      << DebugString(tensor_l->getType()) << " vs " << DebugString(dtype);
+  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype)
+      << DebugString(tensor_r->getType()) << " vs " << DebugString(dtype);
 
-  // check type consistency
-  TFTRT_CHECK_EQ_TYPE(tensor_l->getType(), dtype);
-  TFTRT_CHECK_EQ_TYPE(tensor_r->getType(), dtype);
-  auto op_pair = ops.find(node_def.op());
-  if (op_pair == ops.end()) {
-    return tensorflow::errors::Unimplemented(
-        "binary op: ", node_def.op(), " not supported at: ", node_def.name());
-  }
-
+  // Add ElementWise layer.
   nvinfer1::IElementWiseLayer* layer =
       params->converter->network()->addElementWise(
-          // TODO(aaroey): will tensor_l/tensor_r get modified?
           *const_cast<nvinfer1::ITensor*>(tensor_l),
           *const_cast<nvinfer1::ITensor*>(tensor_r), op_pair->second);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
-
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // pass the output
+  // Pass the output
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
@@ -1723,6 +1889,133 @@ tensorflow::Status ConvertReshape(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertExpandDims(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2) {
+    return tensorflow::errors::InvalidArgument(
+        "Two inputs expected for ExpandDims, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "ExpandDims expects tensor for input, at ", node_def.name());
+  }
+  if (!inputs.at(1).is_weights()) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims expects weights for axis, at ", node_def.name());
+  }
+  // Get input shape as vector.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Get axis to expand on.
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (weights.count() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "ExpandDims axis must be a scalar, at ", node_def.name());
+  }
+  const int* weights_ptr =
+      static_cast<int*>(const_cast<void*>(weights.GetValues()));
+  int axis = weights_ptr[0];
+  // Make sure axis is valid.
+  if ((axis < (-input_rank - 1)) || (axis > input_rank)) {
+    return tensorflow::errors::InvalidArgument(
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at ",
+        node_def.name());
+  }
+  // Convert negative axis to corresponding positive axis.
+  if (axis < 0) axis += input_rank + 1;
+  if (axis == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Modifying batch dimension is not supported for ExpandDims, at ",
+        node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // ExpandDims: Insert new dim of size 1.
+  input_dims.insert(input_dims.begin() + axis, 1);
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
+tensorflow::Status ConvertSqueeze(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "One input expected for Squeeze, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze expects tensor for input, at ", node_def.name());
+  }
+  // Get input shape.
+  TRT_TensorOrWeights input_tensor = inputs.at(0);
+  const nvinfer1::Dims dims = input_tensor.GetTrtDims();
+  std::vector<int> input_dims(dims.d, dims.d + dims.nbDims);
+  // Add batch dim back.
+  input_dims.insert(input_dims.begin(), -1);
+  const int input_rank = input_dims.size();
+  // Mark axes to remove by setting them to 0.
+  TFAttrs attrs(node_def);
+  auto squeeze_dims = attrs.get<std::vector<int>>("squeeze_dims");
+  if (squeeze_dims.size() == 0) {
+    return tensorflow::errors::Unimplemented(
+        "Squeeze is only implemented for explicit dims, at ", node_def.name());
+  }
+  for (int axis : squeeze_dims) {
+    // Make sure axis is valid.
+    if ((axis < -input_rank) || (axis >= input_rank)) {
+      return tensorflow::errors::InvalidArgument(
+          "Axis for Squeeze is invalid, must be in the range "
+          "[-rank(input), rank(input)), at ",
+          node_def.name());
+    }
+    // Convert negative axis to corresponding positive axis.
+    if (axis < 0) axis += input_rank;
+    // Don't squeeze batch dim.
+    if (axis == 0) {
+      return tensorflow::errors::Unimplemented(
+          "Cannot squeeze batch dimension, at ", node_def.name());
+    }
+    // Make sure target dimension is size 1.
+    if (input_dims[axis] != 1) {
+      return tensorflow::errors::InvalidArgument(
+          "Cannot squeeze a dimension which isn't size 1, at ",
+          node_def.name());
+    }
+    // Mark dim for removal by setting to 0.
+    input_dims[axis] = 0;
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Remove all dims which are equal to 0.
+  input_dims.erase(std::remove(input_dims.begin(), input_dims.end(), 0),
+                   input_dims.end());
+  // Reshape tensor.
+  nvinfer1::Dims new_dims;
+  TF_RETURN_IF_ERROR(TensorShapeArrayToTrtDims(input_dims, &new_dims,
+                                               /*ignore_first_dim=*/true));
+  const nvinfer1::ITensor* output_tensor = nullptr;
+  TF_RETURN_IF_ERROR(params->converter->PrepareTensorForShape(
+      input_tensor, new_dims, &output_tensor));
+  params->outputs->push_back(
+      TRT_TensorOrWeights(const_cast<nvinfer1::ITensor*>(output_tensor)));
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertConv2D(OpConverterParams* params) {
   return ConvertConv2DHelper(params, ConvolutionType::DEFAULT);
 }
@@ -1789,6 +2082,8 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
         nvinfer1::DimsHW(padding[0].first, padding[1].first),
         nvinfer1::DimsHW(padding[0].second, padding[1].second));
     TFTRT_RETURN_ERROR_IF_NULLPTR(pad_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        const_cast<nvinfer1::ITensor*>(tensor), pad_layer->getOutput(0));
     padding = {{0, 0}, {0, 0}};
     tensor = pad_layer->getOutput(0);
   }
@@ -1796,6 +2091,11 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
   nvinfer1::IPoolingLayer* layer = params->converter->network()->addPooling(
       *const_cast<nvinfer1::ITensor*>(tensor), type, ksize);
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  // TODO(tmorris): Average pooling may not be entirely safe to infer
+  // quantization range through (at least forwards - backwards should be fine).
+  // Max pooling is okay.
+  params->converter->MarkQuantizationRangesAsInferrable(
+      const_cast<nvinfer1::ITensor*>(tensor), layer->getOutput(0));
 
   layer->setStride(stride);
   layer->setPadding({padding[0].first, padding[1].first});
@@ -1813,110 +2113,290 @@ tensorflow::Status ConvertPool(OpConverterParams* params) {
 }
 
 tensorflow::Status ConvertActivation(OpConverterParams* params) {
-  const nvinfer1::ITensor* tensor = params->inputs.at(0).tensor();
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        node_def.op(), " expects one input, at ", node_def.name());
+  }
+  if (!inputs.at(0).is_tensor()) {
+    return tensorflow::errors::Unimplemented(
+        node_def.op(), " is only implemented for tensors, at ",
+        node_def.name());
+  }
+  static const std::unordered_map<string, nvinfer1::ActivationType> ops{
+      {"Relu", nvinfer1::ActivationType::kRELU},
+      {"Sigmoid", nvinfer1::ActivationType::kSIGMOID},
+      {"Tanh", nvinfer1::ActivationType::kTANH},
+  };
+  auto op_pair = ops.find(node_def.op());
+  if (op_pair == ops.end()) {
+    return tensorflow::errors::Unimplemented(
+        "Activation op: ", node_def.op(),
+        " not supported at: ", node_def.name());
+  }
+  if (params->validation_only) return tensorflow::Status::OK();
+
+  // Start conversion.
+  const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
   nvinfer1::IActivationLayer* layer =
       params->converter->network()->addActivation(
-          *const_cast<nvinfer1::ITensor*>(tensor),
-          nvinfer1::ActivationType::kRELU);
-  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, params->node_def.name());
+          *const_cast<nvinfer1::ITensor*>(tensor), op_pair->second);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  // Set quantization range for output of Sigmoid, Tanh.
+  if (node_def.op() == "Sigmoid") {
+    params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
+  } else if (node_def.op() == "Tanh") {
+    params->converter->ProvideQuantizationRange(output_tensor, -1.0f, 1.0f);
+  }
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertScale(OpConverterParams* params) {
+Status ConvertQuantize(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
-  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
-      !inputs.at(1).is_weights()) {
+  if ((inputs.size() == 0) ||
+      (node_def.op() == "FakeQuantWithMinMaxArgs" && inputs.size() != 1) ||
+      (node_def.op() == "FakeQuantWithMinMaxVars" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV2" && inputs.size() != 3) ||
+      (node_def.op() == "QuantizeAndDequantizeV3" && inputs.size() != 4)) {
+    return errors::InvalidArgument("Invalid number of inputs for ",
+                                   node_def.op(), ", at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    // TensorRT will automatically quantize weights, so we will ignore ranges
+    // for weights.
+    params->outputs->push_back(inputs.at(0));
+    return Status::OK();
+  }
+  float min_range = 0.0f;
+  float max_range = 0.0f;
+  if (node_def.op() == "FakeQuantWithMinMaxArgs") {
+    // Get ranges via node attributes.
+    TFAttrs attrs(node_def);
+    if (attrs.count("min") == 0 || attrs.count("max") == 0) {
+      return errors::InvalidArgument("Min or max attribute not found for ",
+                                     node_def.op(), " at ", node_def.name());
+    }
+    min_range = attrs.get<float>("min");
+    max_range = attrs.get<float>("max");
+  } else if (node_def.op() == "FakeQuantWithMinMaxVars" ||
+             node_def.op() == "QuantizeAndDequantizeV2" ||
+             node_def.op() == "QuantizeAndDequantizeV3") {
+    // Get ranges via inputs.
+    if (!inputs.at(1).is_weights() || !inputs.at(2).is_weights()) {
+      return errors::InvalidArgument("Min and max inputs for ", node_def.op(),
+                                     " must be weights not tensors, at ",
+                                     node_def.name());
+    }
+    auto get_weights_value = [&inputs](int index) {
+      auto raw_weights = static_cast<float*>(
+          const_cast<void*>(inputs.at(index).weights().GetValues()));
+      return raw_weights[0];
+    };
+    min_range = get_weights_value(1);
+    max_range = get_weights_value(2);
+  } else {
+    return errors::InvalidArgument("Unknown quantization op ", node_def.op(),
+                                   ", at ", node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Store ranges for tensor
+  params->converter->ProvideQuantizationRange(
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor()), min_range,
+      max_range);
+  // Sometimes, TRT may not quantize a tensor, either because it chooses to
+  // execute a higher precision kernel or because of op fusion. In these cases,
+  // accuracy will suffer if the model was trained to expect quantization at
+  // that tensor. We should consider adding a clip(tensor, min_range, max_range)
+  // operation here to ensure that any arbitrarily placed quantize node will
+  // execute as expected. However, this will negatively affect performance. If
+  // users train their models in a way which models inference as close as
+  // possible (i.e. not quantizing in place where fusion will occur), then there
+  // is no problem with the current implementation.
+  params->outputs->push_back(inputs.at(0));
+  return Status::OK();
+}
+
+// TODO(pdavoodi): we should update relu6 implementation once TensorRT supports
+// Relu6 natively.
+tensorflow::Status ConvertRelu6(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument(
+        "Invalid number of inputs for Relu6, at ", node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
     return tensorflow::errors::Unimplemented(
-        "ConvertScale only supports tensor<op>weight: ", node_def.name());
+        "Relu6 is only implemented for tensors, not weights, at ",
+        node_def.name());
   }
+  if (params->validation_only) return Status::OK();
+  // ***************************************************************************
+  // TensorRT does not implement Relu6 natively. This function converts Relu6 op
+  // to available TensorRT ops: Relu6(x) = min(Relu(x), 6)
+  // ***************************************************************************
 
+  // Input Tensor
   const nvinfer1::ITensor* tensor = inputs.at(0).tensor();
-  TRT_ShapedWeights weights = inputs.at(1).weights();
-  if (params->converter->is_fp16()) {
-    weights = ConvertFP32ToFP16(params->weight_store, inputs.at(1).weights());
-  }
 
-  TRT_ShapedWeights empty_weights(weights.type_);
-  TFAttrs attrs(node_def);
+  // Relu operation i.e. Relu(x) = max(0, x)
+  nvinfer1::IActivationLayer* relu_layer =
+      params->converter->network()->addActivation(
+          *const_cast<nvinfer1::ITensor*>(tensor),
+          nvinfer1::ActivationType::kRELU);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(relu_layer, node_def.name());
+
+  // Large range of relu is problematic during quantization in INT8 precision
+  // mode. Setting dynamic range of relu = [0.f, 6.0f] helps with quantization.
+  // TRT only uses dynamic ranges in INT8 precision mode,
+  // and this does not affect the FP32 path.
+  params->converter->ProvideQuantizationRange(relu_layer->getOutput(0), 0.0f,
+                                              6.0f);
+
+  // Create a constant layer to store the floating point weight i.e. 6.0f This
+  // tensor will be broadcasted uniformly during elementwise `min` operation.
+  // The constant has to have the same rank as the input in order for TRT to
+  // broadcast
+  nvinfer1::Dims dims;
+  dims.nbDims = relu_layer->getOutput(0)->getDimensions().nbDims;
+  for (int i = 0; i < dims.nbDims; i++) {
+    dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
+      tensorflow::DataType::DT_FLOAT, dims);
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = 6.0f;
+  nvinfer1::IConstantLayer* const6_layer =
+      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const6_layer, node_def.name());
+  params->converter->ProvideQuantizationRange(const6_layer->getOutput(0), 0.0f,
+                                              6.0f);
+
+  // ElementWise Min Operation
+  // Min op is a nop for INT8 execution path, as the input tensor
+  // to this layer will only have values in range [0.f, 6.0f].
+  const nvinfer1::ITensor* tensor_l = relu_layer->getOutput(0);
+  const nvinfer1::ITensor* tensor_r = const6_layer->getOutput(0);
+  nvinfer1::IElementWiseLayer* relu6_layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor_l),
+          *const_cast<nvinfer1::ITensor*>(tensor_r),
+          nvinfer1::ElementWiseOperation::kMIN);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(relu6_layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = relu6_layer->getOutput(0);
+  params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 6.0f);
 
-  const auto data_format = attrs.get<string>("data_format");
-  int channel_index;
-  const auto dims = tensor->getDimensions();
-  if (data_format == "NHWC") {
-    //  1). NHWC is really N+C
-    channel_index = dims.nbDims - 1;  // batch dimension is implicit here!
-  } else {
-    //  2). NCHW is really N+CHW
-    channel_index = 0;  // batch dimension is implicit here!
-  }
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return Status::OK();
+}
 
-  nvinfer1::Permutation permutation;
-  for (int32_t i = 0; i < dims.nbDims; ++i) {
-    permutation.order[i] = i;
+tensorflow::Status ConvertBiasAdd(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 2 || !inputs.at(0).is_tensor() ||
+      !inputs.at(1).is_weights()) {
+    return errors::InvalidArgument("Input expects tensor and weights, at ",
+                                   node_def.name());
   }
+  if (params->validation_only) return Status::OK();
 
-  if (channel_index >= 0) {
+  nvinfer1::ITensor* tensor =
+      const_cast<nvinfer1::ITensor*>(inputs.at(0).tensor());
+  const nvinfer1::Dims original_dims = tensor->getDimensions();
+  TFAttrs attrs(node_def);
+  const string data_format = attrs.get<string>("data_format");
+  const int channel_index =
+      (data_format == "NHWC" ? original_dims.nbDims - 1 : 0);
+
+  nvinfer1::Permutation permutation;
+  if (channel_index != 0) {
+    // Permute the dimensions so that the channel dimension is the first
+    // dimension.
+    for (int i = 0; i < original_dims.nbDims; ++i) {
+      permutation.order[i] = i;
+    }
     permutation.order[0] = channel_index;
     permutation.order[channel_index] = 0;
-  } else {
-    return tensorflow::errors::Unimplemented(
-        "TFTRT::BiasAdd cannot apply on batch dimension, at ", node_def.name());
+    VLOG(1) << "ConvertBiasAdd permutation: "
+            << DebugString(permutation, original_dims.nbDims);
   }
 
   // TensorRT addScale requires input to be of rank 3, we need to apply
-  // transpose as well as reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
+  // transpose as well as reshape.
+  // TODO(laigd): this doesn't match what the TRT doc says, fix the doc?
+  if (channel_index != 0 || original_dims.nbDims != 3) {
     nvinfer1::IShuffleLayer* shuffle_layer =
-        params->converter->network()->addShuffle(
-            *const_cast<nvinfer1::ITensor*>(tensor));
+        params->converter->network()->addShuffle(*tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
+    params->converter->MarkQuantizationRangesAsInferrable(
+        tensor, shuffle_layer->getOutput(0));
+
+    // NOTE(laigd): for some reason we need to apply the reshape
+    // unconditionally. The default shape has nbDims==-1 and it seems the
+    // behavior is undefined in some cases.
     nvinfer1::Dims reshape_dims;
     reshape_dims.nbDims = 3;
-    reshape_dims.d[0] = 0;                          // 0 copy from the input
-    reshape_dims.d[1] = dims.nbDims >= 2 ? 0 : 1;   // 0 copy from the input
-    reshape_dims.d[2] = dims.nbDims >= 3 ? -1 : 1;  // -1 infer from the rest
+    // 0 means copying from input; -1 means inferring from the rest.
+    reshape_dims.d[0] = 0;
+    reshape_dims.d[1] = original_dims.nbDims >= 2 ? 0 : 1;
+    reshape_dims.d[2] = original_dims.nbDims >= 3 ? -1 : 1;
+    shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
-      // maybe we do not need this check. concerned about TRT optimization
       shuffle_layer->setFirstTranspose(permutation);
     }
-    shuffle_layer->setReshapeDimensions(reshape_dims);
     tensor = shuffle_layer->getOutput(0);
   }
 
+  TRT_ShapedWeights weights = inputs.at(1).weights();
+  if (params->converter->precision_mode() == FP16MODE) {
+    weights = ConvertFP32ToFP16(params->weight_store, weights);
+  }
   nvinfer1::ScaleMode mode = nvinfer1::ScaleMode::kCHANNEL;
   if (weights.shape_.d[0] == 1) {
     mode = nvinfer1::ScaleMode::kUNIFORM;
   }
 
+  TRT_ShapedWeights empty_weights(weights.type_);
   nvinfer1::IScaleLayer* layer = params->converter->network()->addScale(
-      *const_cast<nvinfer1::ITensor*>(tensor), mode, weights.GetTrtWeights(),
-      empty_weights.GetTrtWeights(), empty_weights.GetTrtWeights());
+      *tensor, mode, weights.GetTrtWeights(), empty_weights.GetTrtWeights(),
+      empty_weights.GetTrtWeights());
   TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
 
-  // restore transpose & reshape
-  if (channel_index != 0 || dims.nbDims != 3) {
+  // Restore transpose & reshape.
+  if (channel_index != 0 || original_dims.nbDims != 3) {
     nvinfer1::IShuffleLayer* shuffle_layer =
-        params->converter->network()->addShuffle(
-            *const_cast<nvinfer1::ITensor*>(output_tensor));
+        params->converter->network()->addShuffle(*output_tensor);
     TFTRT_RETURN_ERROR_IF_NULLPTR(shuffle_layer, node_def.name());
-    nvinfer1::Dims reshape_dims = dims;
-    int tmp = reshape_dims.d[channel_index];
-    reshape_dims.d[channel_index] = reshape_dims.d[0];
-    reshape_dims.d[0] = tmp;
+    // NOTE: for same reason as mentioned above we need to apply the reshape
+    // unconditionally.
+    nvinfer1::Dims reshape_dims = original_dims;
+    if (channel_index != 0) {
+      // NOTE: according to NVIDIA dimension types are deprecated, so we don't
+      // need to copy them back.
+      reshape_dims.d[channel_index] = original_dims.d[0];
+      reshape_dims.d[0] = original_dims.d[channel_index];
+    }
     shuffle_layer->setReshapeDimensions(reshape_dims);
+
     if (channel_index != 0) {
       shuffle_layer->setSecondTranspose(permutation);
     }
+    params->converter->MarkQuantizationRangesAsInferrable(
+        output_tensor, shuffle_layer->getOutput(0));
     output_tensor = shuffle_layer->getOutput(0);
   }
 
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
-  return tensorflow::Status::OK();
+  return Status::OK();
 }
 
 Status GetTensorDimsWithProtoShape(const Tensor& tensor,
@@ -2070,32 +2550,41 @@ tensorflow::Status ConvertConst(OpConverterParams* params) {
 }
 
 tensorflow::Status ConvertIdentity(OpConverterParams* params) {
+  // TODO(tmorris): TRT's Identity layer does not get optimized away as of TRT
+  // 5.0, however once we know that it does it would be nice to use that
+  // instead.
   params->outputs->push_back(params->inputs.at(0));
   return tensorflow::Status::OK();
 }
 
-tensorflow::Status ConvertBinary(OpConverterParams* params) {
+Status ConvertBinary(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
   if (inputs.size() != 2) {
-    return tensorflow::errors::FailedPrecondition(
-        "Binary ops require two tensor input, at ", node_def.name());
+    return errors::InvalidArgument("Binary ops require two inputs, at ",
+                                   node_def.name());
   }
 
   // Constant folding should have been done by TensorFlow
-
   if (inputs.at(0).is_weights() && inputs.at(1).is_weights()) {
-    return tensorflow::errors::Unimplemented(
+    return errors::Unimplemented(
         "Constant folding is falled back to TensorFlow, binary op received "
         "both input as constant at: ",
         node_def.name());
   }
 
-  // Try to convert into Scale layer first (for better performance)
+  // TODO(tmorris): TRT plans to deprecate IScaleLayer and will replace it with
+  // IElementwiseLayer. At that point, we can remove BinaryTensorOpWeight. For
+  // now, the performance will be slightly better with IScaleLayer because it
+  // can be fused in more situations. However, most of the benefits of
+  // IScaleLayer are when the layer performs both a shift and a scale, which we
+  // don't do except for convolutions.
+  //
+  // Try to convert into Scale layer first (for better performance).
   // Since scale layer supports restricted broadcast policy and op types, we
   // allow failure and try to handle it through Elementwise op
-  // (BinaryTensorOpTensor)
-  Status status = tensorflow::Status::OK();
+  // (BinaryTensorOpTensor).
+  Status status = Status::OK();
   if (inputs.at(0).is_tensor() && inputs.at(1).is_weights()) {
     status = BinaryTensorOpWeight(params, inputs.at(0).tensor(),
                                   inputs.at(1).weights(), false);
@@ -2103,7 +2592,10 @@ tensorflow::Status ConvertBinary(OpConverterParams* params) {
     status = BinaryTensorOpWeight(params, inputs.at(1).tensor(),
                                   inputs.at(0).weights(), true);
   }
+  // If both input are tensors, or one of them is weights but the conversion
+  // above failed, try the conversion using BinaryTensorOpTensor.
   if ((inputs.at(0).is_tensor() && inputs.at(1).is_tensor()) || !status.ok()) {
+    if (!status.ok()) VLOG(1) << status;
     status = BinaryTensorOpTensor(params, inputs.at(0), inputs.at(1));
   }
   return status;
@@ -2133,6 +2625,20 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) {
 
   nvinfer1::IUnaryLayer* layer;
   if (node_def.op() == "Rsqrt") {
+    // We will need a quantization range for intermediate tensor if not using
+    // calibration.
+    //
+    //   x -> [Sqrt] -> sqrt(x) -> [Recip] -> 1/sqrt(x)
+    //                     ^
+    //               need range here
+    if (params->converter->precision_mode() == INT8MODE &&
+        !params->converter->use_calibration()) {
+      return errors::Unimplemented(
+          "Intermediate quantization range cannot be determined without"
+          " calibration for Rsqrt, consider replacing with "
+          "Sqrt -> FakeQuant -> Reciprocal ops, at ",
+          node_def.name());
+    }
     layer = params->converter->network()->addUnary(
         *const_cast<nvinfer1::ITensor*>(tensor),
         nvinfer1::UnaryOperation::kSQRT);
@@ -2156,6 +2662,48 @@ tensorflow::Status ConvertUnary(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertSquare(OpConverterParams* params) {
+  const auto& inputs = params->inputs;
+  const auto& node_def = params->node_def;
+  if (inputs.size() != 1) {
+    return tensorflow::errors::InvalidArgument("Square expects one input, at ",
+                                               node_def.name());
+  }
+  if (inputs.at(0).is_weights()) {
+    return tensorflow::errors::Unimplemented(
+        "Square is only implemented for tensors, at ", node_def.name());
+  }
+  if (params->validation_only) return Status::OK();
+
+  // Constant 2 with same rank as input
+  nvinfer1::Dims dims = inputs.at(0).GetTrtDims();
+  for (int i = 0; i < dims.nbDims; i++) {
+    dims.d[i] = 1;
+  }
+  TRT_ShapedWeights weights = params->weight_store->GetTempWeights(
+      tensorflow::DataType::DT_FLOAT, dims);
+  auto weights_ptr =
+      static_cast<float*>(const_cast<void*>(weights.GetValues()));
+  weights_ptr[0] = 2.f;
+  nvinfer1::IConstantLayer* const2_layer =
+      params->converter->network()->addConstant(dims, weights.GetTrtWeights());
+  TFTRT_RETURN_ERROR_IF_NULLPTR(const2_layer, node_def.name());
+
+  // ElementWise Pow Operation
+  const nvinfer1::ITensor* tensor_l = inputs.at(0).tensor();
+  const nvinfer1::ITensor* tensor_r = const2_layer->getOutput(0);
+  nvinfer1::IElementWiseLayer* layer =
+      params->converter->network()->addElementWise(
+          *const_cast<nvinfer1::ITensor*>(tensor_l),
+          *const_cast<nvinfer1::ITensor*>(tensor_r),
+          nvinfer1::ElementWiseOperation::kPOW);
+  TFTRT_RETURN_ERROR_IF_NULLPTR(layer, node_def.name());
+  nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+
+  params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertReduce(OpConverterParams* params) {
   const auto& inputs = params->inputs;
   const auto& node_def = params->node_def;
@@ -2692,6 +3240,8 @@ tensorflow::Status ConvertSoftmax(OpConverterParams* params) {
   layer->setAxes(1 << (nbDims - 1));
 
   nvinfer1::ITensor* output_tensor = layer->getOutput(0);
+  // Quantization range for SoftMax is always (0, 1)
+  params->converter->ProvideQuantizationRange(output_tensor, 0.0f, 1.0f);
   params->outputs->push_back(TRT_TensorOrWeights(output_tensor));
   return tensorflow::Status::OK();
 }
@@ -2732,40 +3282,54 @@ tensorflow::Status ConvertTopK(OpConverterParams* params) {
   return tensorflow::Status::OK();
 }
 
-void TrtNodeValidator::RegisterOpValidators() {
+static void RegisterValidatableOpConverters(
+    std::unordered_map<string, OpConverter>* registration) {
   // TODO(laigd): support all op types.
-  op_validators_["Const"] = ConvertConst;
-  op_validators_["Transpose"] = ConvertTranspose;
-  op_validators_["Reshape"] = ConvertReshape;
-  op_validators_["MatMul"] = ConvertMatMul;
+  (*registration)["BiasAdd"] = ConvertBiasAdd;
+  (*registration)["Const"] = ConvertConst;
+  (*registration)["Transpose"] = ConvertTranspose;
+  (*registration)["Reshape"] = ConvertReshape;
+  (*registration)["MatMul"] = ConvertMatMul;
+  (*registration)["Relu6"] = ConvertRelu6;
+  (*registration)["Square"] = ConvertSquare;
+  (*registration)["ExpandDims"] = ConvertExpandDims;
+  (*registration)["Squeeze"] = ConvertSqueeze;
+
+  for (auto quantization_op_type :
+       {"QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3",
+        "FakeQuantWithMinMaxVars", "FakeQuantWithMinMaxArgs"}) {
+    (*registration)[quantization_op_type] = ConvertQuantize;
+  }
+  for (auto binary_op_type :
+       {"Add", "Mul", "Sub", "Div", "RealDiv", "Maximum", "Minimum"}) {
+    (*registration)[binary_op_type] = ConvertBinary;
+  }
+  for (auto activation_op_type : {"Relu", "Sigmoid", "Tanh"}) {
+    (*registration)[activation_op_type] = ConvertActivation;
+  }
+}
+
+void TrtNodeValidator::RegisterOpValidators() {
+  RegisterValidatableOpConverters(&op_validators_);
 }
 
 void Converter::RegisterOpConverters() {
-  // vgg_16 slim implementation
+  RegisterValidatableOpConverters(&op_registry_);
+
   op_registry_["Conv2D"] = ConvertConv2D;
   op_registry_["DepthwiseConv2dNative"] = ConvertConv2DDepthwise;
-  op_registry_["Relu"] = ConvertActivation;
   op_registry_["MaxPool"] = ConvertPool;
   op_registry_["AvgPool"] = ConvertPool;
-  op_registry_["BiasAdd"] = ConvertScale;
-  op_registry_["Const"] = ConvertConst;
   // TODO(ben,jie): this is a temp hack.
   op_registry_["Identity"] = ConvertIdentity;  // Identity should be removed
   op_registry_["Snapshot"] = ConvertIdentity;  // Snapshot should be removed
 
-  // resnet_50_v1 slim implementation
-  op_registry_["Add"] = ConvertBinary;
-  op_registry_["Mul"] = ConvertBinary;
-  op_registry_["Sub"] = ConvertBinary;
   op_registry_["Pad"] = ConvertPad;
 
   op_registry_["ConcatV2"] = ConvertConcat;
   op_registry_["FusedBatchNorm"] = ConvertFusedBatchNorm;
   op_registry_["FusedBatchNormV2"] = ConvertFusedBatchNorm;
 
-  op_registry_["Div"] = ConvertBinary;
-  op_registry_["RealDiv"] = ConvertBinary;
-
   op_registry_["Rsqrt"] = ConvertUnary;
   op_registry_["Reciprocal"] = ConvertUnary;
   op_registry_["Exp"] = ConvertUnary;
@@ -2774,18 +3338,12 @@ void Converter::RegisterOpConverters() {
   op_registry_["Abs"] = ConvertUnary;
   op_registry_["Neg"] = ConvertUnary;
 
-  op_registry_["Transpose"] = ConvertTranspose;
-  op_registry_["Reshape"] = ConvertReshape;
-
   op_registry_["Sum"] = ConvertReduce;
   op_registry_["Prod"] = ConvertReduce;
   op_registry_["Max"] = ConvertReduce;
   op_registry_["Min"] = ConvertReduce;
   op_registry_["Mean"] = ConvertReduce;
-  op_registry_["Maximum"] = ConvertBinary;
-  op_registry_["Minimum"] = ConvertBinary;
   op_registry_["Softmax"] = ConvertSoftmax;
-  op_registry_["MatMul"] = ConvertMatMul;
   op_registry_["BatchMatMul"] = ConvertBatchMatMul;
   op_registry_["TopKV2"] = ConvertTopK;
 
@@ -2798,7 +3356,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully) {
   engine->reset();
   if (convert_successfully) *convert_successfully = false;
@@ -2813,7 +3371,11 @@ tensorflow::Status ConvertGraphDefToEngine(
     builder->setHalf2Mode(true);
   } else if (precision_mode == INT8MODE) {
     builder->setInt8Mode(true);
-    builder->setInt8Calibrator(calibrator);
+    if (use_calibration) {
+      builder->setInt8Calibrator(calibrator);
+    } else {
+      builder->setInt8Calibrator(nullptr);
+    }
   }
 
   // Create the network.
@@ -2826,7 +3388,7 @@ tensorflow::Status ConvertGraphDefToEngine(
 
   // Build the network
   VLOG(1) << "Starting engine conversion ";
-  Converter converter(trt_network.get(), precision_mode == FP16MODE);
+  Converter converter(trt_network.get(), precision_mode, use_calibration);
   std::vector<std::pair<string, string>> output_tensors;
   // Graph nodes are already topologically sorted during construction
   for (const auto& node_def : gdef.node()) {
@@ -2882,6 +3444,9 @@ tensorflow::Status ConvertGraphDefToEngine(
   TF_RETURN_IF_ERROR(converter.RenameAndMarkOutputTensors(output_tensors));
   if (convert_successfully) *convert_successfully = true;
 
+  // Apply user provided quantization ranges to tensors
+  converter.MaybeApplyQuantizationRanges();
+
   // Build the engine.
   VLOG(1) << "Starting engine creation";
   engine->reset(builder->buildCudaEngine(*converter.network()));
@@ -3026,7 +3591,8 @@ tensorflow::Status ConvertSegmentToGraphDef(
     }
   }
   *common_scope = local_scope;
-  VLOG(0) << "Segment @scope '" << local_scope << "', converted to graph";
+  VLOG(1) << "Converted TensorRT candidate segment @scope '" << local_scope
+          << "' to a GraphDef";
   return tensorflow::Status::OK();
 }
 
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes.h b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
index 5cc28b33e7f2c56d2f281d24e8390d253a8228f5..54e19b73957bccdae2b23bd3556de9ad00b864e5 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes.h
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes.h
@@ -92,7 +92,8 @@ struct EngineInfo {
   EngineInfo()
       : engine_type(EngineType::TRTStatic),
         max_workspace_size_bytes(0),
-        precision_mode(FP32MODE) {}
+        precision_mode(FP32MODE),
+        use_calibration(true) {}
 
   string engine_name;
   string device;
@@ -109,6 +110,7 @@ struct EngineInfo {
   int maximum_cached_engines;
   std::vector<int> cached_engine_batches;
   int precision_mode;
+  bool use_calibration;
 };
 
 // Constructs a graphdef from the segment in the given graph. Adds placeholder
@@ -145,7 +147,7 @@ tensorflow::Status ConvertGraphDefToEngine(
     const std::vector<tensorflow::PartialTensorShape>& input_shapes,
     Logger* logger, nvinfer1::IGpuAllocator* allocator,
     TRTInt8Calibrator* calibrator,
-    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine,
+    TrtUniquePtrType<nvinfer1::ICudaEngine>* engine, bool use_calibration,
     bool* convert_successfully);
 
 // Helper class for the segmenter to determine whether an output edge from the
@@ -392,7 +394,8 @@ class TrtNodeValidator {
 // Class to convert TF nodes to TRT network.
 class Converter {
  public:
-  Converter(nvinfer1::INetworkDefinition* trt_network, bool is_fp16);
+  Converter(nvinfer1::INetworkDefinition* trt_network, int precision_mode,
+            bool use_calibration);
 
   //////////////////////////////////////////////////////////////////////////////
   // Methods used by the TRT engine builder to build a TRT network from a TF
@@ -422,8 +425,27 @@ class Converter {
   // to add TRT layers.
   nvinfer1::INetworkDefinition* network() { return trt_network_; }
 
-  // Is the converter operating in fp16 mode?
-  bool is_fp16() const { return is_fp16_; }
+  // What precision are we targeting?
+  int precision_mode() const { return precision_mode_; }
+
+  // Calibration will be or was previously performed on this network?
+  bool use_calibration() const { return use_calibration_; }
+
+  // This should be called on the inputs and outputs of any layer we create
+  // where we know that the quantization range does not change during that
+  // operation. (e.g. Reshape, Transpose, Identity, MaxPool).
+  void MarkQuantizationRangesAsInferrable(nvinfer1::ITensor* input,
+                                          nvinfer1::ITensor* output);
+
+  // This function should be called when we know the quantization range of a
+  // tensor, either from a quantize/dequantize node or when the output is a
+  // fixed range (e.g. SoftMax, Relu6, Sigmoid).
+  void ProvideQuantizationRange(nvinfer1::ITensor* tensor, float min_range,
+                                float max_range);
+
+  // Should be called when full TRT network has been constructed and before
+  // building the engine.
+  void MaybeApplyQuantizationRanges();
 
   // Below are helper methods for op converters to add different layers to the
   // TRT network.
@@ -440,6 +462,13 @@ class Converter {
                                const nvinfer1::Dims& dims,
                                const nvinfer1::ITensor** tensor);
 
+  // Return OK if the broadcast scheme is supported and compute the shapes after
+  // broadcasting.
+  Status GetTrtBroadcastShape(const TRT_TensorOrWeights& operand_l,
+                              const TRT_TensorOrWeights& operand_r,
+                              nvinfer1::Dims* operand_l_new_dims,
+                              nvinfer1::Dims* operand_r_new_dims) const;
+
  private:
   // Verify the provided batch_size is consistent with batch_size_ and update it
   // if necessary.
@@ -457,6 +486,12 @@ class Converter {
 
   void RegisterOpConverters();
 
+  void PropagateQuantizationRanges();
+
+  // Gets the min and max value in a TRT_ShapedWeights
+  Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
+                        float* out_max) const;
+
   // Registered op converters by op type.
   std::unordered_map<string, OpConverter> op_registry_;
 
@@ -472,7 +507,25 @@ class Converter {
   // Store the weights added during construction of trt_network_.
   TrtWeightStore weight_store_;
 
-  const bool is_fp16_;
+  // During conversion, this table is populated with quantization ranges per
+  // tensor. MaybeApplyQuantizationRanges() will use this table to set the TRT
+  // quantization ranges. Since TRT only supports symmetric ranges, we will
+  // store the range as a single float = max(abs(min_range), abs(max_range)).
+  // Range refers to the floating point values, e.g. min_range = 0.0f, max_range
+  // = 6.0f for Relu6.
+  std::unordered_map<nvinfer1::ITensor*, float> quantization_ranges_;
+
+  // Edges where quantization ranges can be inferred (copied) across ops - from
+  // first tensor to second tensor. PropagateQuantizationRanges() will propagate
+  // known ranges from quantization_ranges_ across these edges, adding the new
+  // ranges to quantization_ranges_ so that they can be applied in
+  // MaybeApplyQuantizationRanges().
+  std::vector<std::pair<nvinfer1::ITensor*, nvinfer1::ITensor*>>
+      quantization_infer_;
+
+  const int precision_mode_;
+
+  const bool use_calibration_;
 
   // Batch size of inputs to trt_network_ added by AddInputTensor(). During
   // network construction it will update this, use it to verify the batch
diff --git a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
index c3a39395f3a99f3e471e09688a11cc0ebba61ff4..c37a43dd5def9daf3c5d70720c6db2aab20db077 100644
--- a/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
+++ b/tensorflow/contrib/tensorrt/convert/convert_nodes_test.cc
@@ -35,7 +35,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/protobuf/config.pb.h"  // NOLINT
+#include "tensorflow/core/public/session.h"
 
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
@@ -47,7 +50,9 @@ namespace tensorflow {
 namespace tensorrt {
 namespace convert {
 
+using ::tensorflow::strings::StrCat;
 using ::testing::ElementsAre;
+using ::testing::ElementsAreArray;
 
 // TODO(laigd): put this into some test utils file.
 void ExpectStatus(Status status, error::Code code = error::OK,
@@ -69,6 +74,32 @@ nvinfer1::Dims GetTestDims(const std::vector<int>& d) {
   return dims;
 }
 
+nvinfer1::DataType TfDataTypeToTrt(DataType tf_dtype) {
+  switch (tf_dtype) {
+    case DT_FLOAT:
+      return nvinfer1::DataType::kFLOAT;
+    case DT_HALF:
+      return nvinfer1::DataType::kHALF;
+    case DT_INT32:
+      return nvinfer1::DataType::kINT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << DataTypeString(tf_dtype);
+  }
+}
+
+DataType TrtDataTypeToTf(nvinfer1::DataType trt_dtype) {
+  switch (trt_dtype) {
+    case nvinfer1::DataType::kFLOAT:
+      return DT_FLOAT;
+    case nvinfer1::DataType::kHALF:
+      return DT_HALF;
+    case nvinfer1::DataType::kINT32:
+      return DT_INT32;
+    default:
+      QCHECK(false) << "Unexpected data type " << static_cast<int>(trt_dtype);
+  }
+}
+
 NodeDef MakeNodeDef(const string& name, const string& op,
                     const std::vector<string>& inputs) {
   NodeDef node_def;
@@ -111,6 +142,35 @@ bool TrtDimsEqualsArray(const std::vector<int>& lhs,
   return TrtDimsEquals(GetTestDims(lhs), rhs);
 }
 
+// TODO(laigd): define a parameterized matcher that can compare against the
+// vector.
+void ExpectTrtDimsEqualsArray(const std::vector<int>& lhs,
+                              const nvinfer1::Dims& rhs) {
+  EXPECT_TRUE(TrtDimsEqualsArray(lhs, rhs))
+      << "expected: " << DebugString(GetTestDims(lhs)) << "\n"
+      << "  actual: " << DebugString(rhs);
+}
+
+template <typename T>
+void ExpectArrayNear(const std::vector<T>& lhs, const std::vector<T>& rhs) {
+  ASSERT_EQ(lhs.size(), rhs.size());
+  for (int i = 0; i < lhs.size(); i++) {
+    EXPECT_FLOAT_EQ(lhs[i], rhs[i]);
+  }
+}
+
+// Eigen::half cannot implicitly convert to float which is required for
+// EXPECT_FLOAT_EQ.
+template <>
+void ExpectArrayNear(const std::vector<Eigen::half>& lhs,
+                     const std::vector<Eigen::half>& rhs) {
+  ASSERT_EQ(lhs.size(), rhs.size());
+  for (int i = 0; i < lhs.size(); i++) {
+    EXPECT_FLOAT_EQ(Eigen::half_impl::half_to_float(lhs[i]),
+                    Eigen::half_impl::half_to_float(rhs[i]));
+  }
+}
+
 bool TrtShapedWeightsEquals(const TRT_ShapedWeights& lhs,
                             const TRT_ShapedWeights& rhs) {
   return TrtDimsEquals(lhs.shape_, rhs.shape_) && lhs.type_ == rhs.type_ &&
@@ -121,8 +181,7 @@ template <typename T>
 void ValidateWeights(const TRT_ShapedWeights& weights,
                      const std::vector<int>& expected_dims,
                      const std::vector<T>& expected_value) {
-  EXPECT_TRUE(TrtDimsEqualsArray(expected_dims, weights.shape_))
-      << weights.DebugString();
+  ExpectTrtDimsEqualsArray(expected_dims, weights.shape_);
   ASSERT_EQ(expected_value.size(), weights.count()) << weights.DebugString();
   const T* actual_values = static_cast<const T*>(weights.GetValues());
   for (int i = 0; i < expected_value.size(); ++i) {
@@ -133,11 +192,12 @@ void ValidateWeights(const TRT_ShapedWeights& weights,
 // Fake ITensor implementation for testing purposes.
 class FakeITensor : public nvinfer1::ITensor {
  public:
-  FakeITensor() {}
+  FakeITensor() : dynamic_range_(0.0f) {}
 
-  FakeITensor(const nvinfer1::Dims& dims) : dims_(dims) {}
+  FakeITensor(const nvinfer1::Dims& dims) : dims_(dims), dynamic_range_(0.0f) {}
 
-  FakeITensor(const std::vector<int>& dims) : dims_(GetTestDims(dims)) {}
+  FakeITensor(const std::vector<int>& dims)
+      : dims_(GetTestDims(dims)), dynamic_range_(0.0f) {}
 
   void setName(const char* name) override { name_ = name; }
 
@@ -166,7 +226,12 @@ class FakeITensor : public nvinfer1::ITensor {
   }
 
 #if NV_TENSORRT_MAJOR >= 5
-  bool setDynamicRange(float min, float max) override {}
+  bool setDynamicRange(float min, float max) override {
+    dynamic_range_ = std::max(std::abs(min), std::abs(max));
+    return true;
+  }
+
+  float getDynamicRange() const override { return dynamic_range_; }
 #endif
 
  private:
@@ -174,6 +239,7 @@ class FakeITensor : public nvinfer1::ITensor {
   nvinfer1::Dims dims_;
   nvinfer1::DataType type_;
   nvinfer1::TensorLocation location_;
+  float dynamic_range_;
 };
 
 TEST(TRT_ShapedWeights_Test, Basic) {
@@ -265,9 +331,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
           EXPECT_EQ(1, ptr->batch_size());
         }
         EXPECT_EQ(&itensor, ptr->tensor());
-        EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims()))
-            << "- expected: " << DebugString(dims)
-            << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+        ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
       }
     }
   }
@@ -286,9 +350,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
       EXPECT_EQ(false, ptr->is_weights());
       EXPECT_EQ(1, ptr->batch_size());
       EXPECT_NE(nullptr, ptr->tensor());
-      EXPECT_TRUE(TrtDimsEqualsArray({1}, ptr->GetTrtDims()))
-          << "- expected: " << DebugString(dims)
-          << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+      ExpectTrtDimsEqualsArray({1}, ptr->GetTrtDims());
     }
   }
   // Test constructor with TRT_ShapedWeights argument.
@@ -305,9 +367,7 @@ TEST(TRT_TensorOrWeights_Test, Basic) {
 
       nvinfer1::Dims dims;
       dims.nbDims = 0;
-      EXPECT_TRUE(TrtDimsEqualsArray({}, ptr->GetTrtDims()))
-          << "- expected: " << DebugString(dims)
-          << "\n        vs\n-   actual: " << DebugString(ptr->GetTrtDims());
+      ExpectTrtDimsEqualsArray({}, ptr->GetTrtDims());
     }
   }
 }
@@ -341,34 +401,50 @@ TEST_F(ValidatorTest, ConvertToTensorOrWeights) {
                                           graph_properties, &output));
     ValidateWeights<float>(output.weights(), {2}, {1.0, 2.0});
   }
-  // Convert non-Const. We test the case where the non-batch dimemsion is
-  // unknown as well, to make sure the validator allows that.
-  for (const int32 non_batch_dim : {-1, 2}) {
-    const int32 batch_size = 12;
 
+  // Helper method to run ConvertToTensorOrWeights() with predefined parameters.
+  auto convert_to_tensor_or_weights = [this](const std::vector<int64>& dims,
+                                             TRT_TensorOrWeights* output) {
     Scope s = Scope::NewRootScope();
-    ops::Placeholder::Attrs attrs;
-    TF_EXPECT_OK(TensorShapeUtils::MakeShape(
-        std::vector<int32>{batch_size, non_batch_dim}, &attrs.shape_));
+    const auto attrs = ops::Placeholder::Shape(PartialTensorShape{dims});
     auto feed = ops::Placeholder(s.WithOpName("feed"), DT_FLOAT, attrs);
     auto add = ops::Add(s.WithOpName("add"), feed, feed);
 
     grappler::GrapplerItem item;
     TF_EXPECT_OK(s.ToGraphDef(&item.graph));
-
     grappler::GraphProperties graph_properties(item);
     TF_EXPECT_OK(graph_properties.InferStatically(true));
-
-    auto& node_def = add.operation.node()->def();
+    const NodeDef& node_def = add.operation.node()->def();
+    return this->ConvertToTensorOrWeights(node_def, /*output_port=*/0,
+                                          graph_properties, output);
+  };
+  // Convert non-Const with #dims > nvinfer1::Dims::MAX_DIMS+1.
+  {
     TRT_TensorOrWeights output;
-    ExpectStatus(ConvertToTensorOrWeights(node_def, /*output_port=*/0,
-                                          graph_properties, &output));
+    ExpectStatus(
+        convert_to_tensor_or_weights(
+            std::vector<int64>(nvinfer1::Dims::MAX_DIMS + 2, 1), &output),
+        error::OUT_OF_RANGE, "Input tensor rank is greater than 9");
+  }
+  // Convert non-Const with #dims < 2.
+  {
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({1}, &output), error::INVALID_ARGUMENT,
+        "Input tensor with rank<2 is not supported since the first dimension "
+        "is treated as batch dimension by TRT");
+  }
+  // Convert non-Const. We test the case where the non-batch dimemsion is
+  // unknown as well, to make sure the validator allows that.
+  for (const int32 non_batch_dim : {-1, 2}) {
+    const int32 batch_size = 12;
+    TRT_TensorOrWeights output;
+    ExpectStatus(
+        convert_to_tensor_or_weights({batch_size, non_batch_dim}, &output));
     EXPECT_EQ(true, output.is_tensor());
     EXPECT_EQ(batch_size, output.batch_size());
     EXPECT_NE(nullptr, output.tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims()))
-        << "- expected: {" << non_batch_dim << "} \n        vs\n"
-        << "-   actual: " << DebugString(output.GetTrtDims());
+    ExpectTrtDimsEqualsArray({non_batch_dim}, output.GetTrtDims());
   }
 }
 
@@ -405,7 +481,9 @@ class ConverterTest : public ::testing::Test {
   ConverterTest() {
     builder_.reset(nvinfer1::createInferBuilder(logger_));
     network_.reset(builder_->createNetwork());
-    converter_.reset(new Converter(network_.get(), /*fp16=*/false));
+    converter_.reset(new Converter(network_.get(),
+                                   /*precision_mode=*/FP32MODE,
+                                   /*use_calibration=*/false));
     weight_store_ = &converter_->weight_store_;
   }
 
@@ -432,8 +510,21 @@ class ConverterTest : public ::testing::Test {
     return converter_->GetInputs(node_def, inputs);
   }
 
+  Status GetWeightRange(const TRT_ShapedWeights& weights, float* out_min,
+                        float* out_max) const {
+    return converter_->GetWeightRange(weights, out_min, out_max);
+  }
+
+  void PropagateQuantizationRanges() {
+    converter_->PropagateQuantizationRanges();
+  }
+
   int batch_size() const { return converter_->batch_size_; }
 
+  std::unordered_map<nvinfer1::ITensor*, float>& quantization_ranges() {
+    return converter_->quantization_ranges_;
+  }
+
  private:
   Logger logger_;
   // These members are ordered in a way such that the destruction order is:
@@ -504,9 +595,9 @@ TEST_F(ConverterTest, AddAndGetInputs) {
   EXPECT_EQ(nvinfer1::DataType::kFLOAT, inputs[0].tensor()->getType());
   EXPECT_EQ(nvinfer1::DataType::kINT32, inputs[2].tensor()->getType());
   EXPECT_EQ(nvinfer1::DataType::kHALF, inputs[3].tensor()->getType());
-  EXPECT_TRUE(TrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions()));
-  EXPECT_TRUE(TrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions()));
-  EXPECT_TRUE(TrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions()));
+  ExpectTrtDimsEqualsArray({1}, inputs[0].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({2, 3}, inputs[2].tensor()->getDimensions());
+  ExpectTrtDimsEqualsArray({5, 3}, inputs[3].tensor()->getDimensions());
 }
 
 TEST_F(ConverterTest, RenameAndMarkOutputTensors) {
@@ -552,7 +643,7 @@ TEST_F(ConverterTest, RenameAndMarkOutputTensors) {
       {{"my_op", "my_output"}, {"my_op:1", "my_output_1"}}));
   EXPECT_EQ(2, output_tensors.size());
   for (auto output_tensor : output_tensors) {
-    EXPECT_TRUE(TrtDimsEqualsArray({2, 1}, output_tensor->getDimensions()));
+    ExpectTrtDimsEqualsArray({2, 1}, output_tensor->getDimensions());
   }
   EXPECT_EQ("my_output", string(output_tensors[0]->getName()));
   EXPECT_EQ("my_output_1", string(output_tensors[1]->getName()));
@@ -577,8 +668,7 @@ TEST_F(ConverterTest, TransposeTensor) {
   // OK.
   TF_EXPECT_OK(
       converter_->TransposeTensor(input_tensor, {0, 3, 1, 2}, &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({5, 2, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
@@ -590,7 +680,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
   // Shape size doesn't match.
   ExpectStatus(converter_->PrepareTensorForShape(tw, GetTestDims({2, 3, 6}),
                                                  &output_tensor),
-               error::INVALID_ARGUMENT, "Reshape shapes are not compatible.");
+               error::INVALID_ARGUMENT, "Reshape shapes are not compatible");
 
   // TODO(aaroey): we should check the case where uninferred dimensions are not
   // an exact divisor of input dim ensions, e.g. for dims {-1, 7}.
@@ -598,14 +688,12 @@ TEST_F(ConverterTest, PrepareTensorForShape_Tensor) {
   // Infer shape, ok.
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({-1, 2}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({15, 2}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({15, 2}, output_tensor->getDimensions());
 
   // Regular shape.
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
@@ -615,8 +703,7 @@ TEST_F(ConverterTest, PrepareTensorForShape_Weights) {
   const nvinfer1::ITensor* output_tensor = nullptr;
   TF_EXPECT_OK(converter_->PrepareTensorForShape(tw, GetTestDims({10, 3}),
                                                  &output_tensor));
-  EXPECT_TRUE(TrtDimsEqualsArray({10, 3}, output_tensor->getDimensions()))
-      << DebugString(*output_tensor);
+  ExpectTrtDimsEqualsArray({10, 3}, output_tensor->getDimensions());
 }
 
 TEST_F(ConverterTest, MaybeUpdateBatchSize) {
@@ -656,6 +743,178 @@ TEST_F(ConverterTest, AddAndGetTensorOrWeights) {
                "tensor/weights my_tensor already exist");
 }
 
+template <typename T>
+void TestGetWeightRange(ConverterTest* test, TrtWeightStore* weight_store) {
+  TRT_ShapedWeights weights =
+      weight_store->GetTempWeights(DataTypeToEnum<T>::v(), GetTestDims({2, 3}));
+  const std::vector<T> values = {T(3), T(1), T(2), T(6), T(5), T(4)};
+  memcpy(const_cast<void*>(weights.GetValues()), values.data(),
+         weights.size_bytes());
+
+  float out_min = 0.0f;
+  float out_max = 0.0f;
+  TF_EXPECT_OK(test->GetWeightRange(weights, &out_min, &out_max));
+  EXPECT_EQ(1.0f, out_min);
+  EXPECT_EQ(6.0f, out_max);
+}
+
+TEST_F(ConverterTest, GetWeightRange) {
+  TestGetWeightRange<float>(this, weight_store_);
+  TestGetWeightRange<Eigen::half>(this, weight_store_);
+  TestGetWeightRange<int32>(this, weight_store_);
+}
+
+TEST_F(ConverterTest, ProvideQuantizationRange) {
+  FakeITensor fake_tensor;
+  // Assymetric range
+  converter_->ProvideQuantizationRange(&fake_tensor, 0.0f, 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, 1.0f, 6.0f);
+  EXPECT_EQ(6.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, -8.0f, 6.0f);
+  EXPECT_EQ(8.0f, quantization_ranges()[&fake_tensor]);
+  converter_->ProvideQuantizationRange(&fake_tensor, -8.123f, -6.123f);
+  EXPECT_EQ(8.123f, quantization_ranges()[&fake_tensor]);
+  // Symmetric range
+  converter_->ProvideQuantizationRange(&fake_tensor, -6.123f, 6.123f);
+  EXPECT_EQ(6.123f, quantization_ranges()[&fake_tensor]);
+}
+
+TEST_F(ConverterTest, MaybeApplyQuantizationRanges) {
+  // input -> infer1 -> infer2 -> infer3
+  FakeITensor input, infer_1, infer_2, infer_3;
+  FakeITensor not_infer;
+  Converter int8_converter(/*trt_network=*/nullptr, INT8MODE,
+                           /*use_calibration=*/true);
+  int8_converter.ProvideQuantizationRange(&input, -5.0f, 5.0f);
+  int8_converter.ProvideQuantizationRange(&not_infer, -100.0f, 100.0f);
+  int8_converter.MarkQuantizationRangesAsInferrable(&input, &infer_1);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_1, &infer_2);
+  int8_converter.MarkQuantizationRangesAsInferrable(&infer_2, &infer_3);
+
+  // Input range should be inferred along the chain and applied to tensors.
+  int8_converter.MaybeApplyQuantizationRanges();
+#if NV_TENSORRT_MAJOR >= 5
+  EXPECT_EQ(input.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_1.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_2.getDynamicRange(), 5.0f);
+  EXPECT_EQ(infer_3.getDynamicRange(), 5.0f);
+  EXPECT_EQ(not_infer.getDynamicRange(), 100.0f);
+#endif
+}
+
+TEST_F(ConverterTest, PropagateQuantizationRanges) {
+  // infer0 <-> infer1 <-> infer2 <-> infer3
+  //              |
+  //            infer4 <-> infer5
+  FakeITensor infer[6];
+  FakeITensor not_infer;
+  converter_->ProvideQuantizationRange(&infer[4], -5.0f, 5.0f);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[0], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[1], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[3], &infer[2]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[1]);
+  converter_->MarkQuantizationRangesAsInferrable(&infer[4], &infer[5]);
+
+  // Input range should be inferred along the chain.
+  PropagateQuantizationRanges();
+  auto ranges = quantization_ranges();
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(5.0f, ranges[&infer[i]]);
+  }
+  EXPECT_EQ(ranges.count(&not_infer), 0);
+}
+
+TEST_F(ConverterTest, GetTrtBroadcastShape) {
+  const bool kIsTensor = true;
+  const bool kIsNotTensor = false;
+  auto symmetric_test = [this](const std::vector<int>& operand_1_shape,
+                               const std::vector<int>& operand_2_shape,
+                               const bool operand_1_is_tensor,
+                               const bool operand_2_is_tensor,
+                               const std::vector<int>& expected_operand_1_shape,
+                               const std::vector<int>& expected_operand_2_shape,
+                               error::Code expected_code = error::OK,
+                               const char* expected_error_msg_substr = nullptr,
+                               const int operand_1_batch_size = -1,
+                               const int operand_2_batch_size = -1) {
+    auto create_tensor_or_weights = [](const std::vector<int>& shape,
+                                       bool is_tensor, int batch_size = -1) {
+      if (is_tensor) {
+        return TRT_TensorOrWeights{nvinfer1::DataType::kFLOAT,
+                                   GetTestDims(shape), batch_size};
+      }
+      TRT_ShapedWeights weights;
+      weights.shape_ = GetTestDims(shape);
+      return TRT_TensorOrWeights(weights);
+    };
+
+    nvinfer1::Dims operand_1_new_dims, operand_2_new_dims;
+    TRT_TensorOrWeights operand_1 = create_tensor_or_weights(
+        operand_1_shape, operand_1_is_tensor, operand_1_batch_size);
+    TRT_TensorOrWeights operand_2 = create_tensor_or_weights(
+        operand_2_shape, operand_2_is_tensor, operand_2_batch_size);
+
+    // operand_1 broadcast operand_2
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_1, operand_2, &operand_1_new_dims, &operand_2_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+    // operand_2 broadcast operand_1
+    ExpectStatus(
+        this->converter_->GetTrtBroadcastShape(
+            operand_2, operand_1, &operand_2_new_dims, &operand_1_new_dims),
+        expected_code, expected_error_msg_substr);
+    if (expected_code == error::OK) {
+      ExpectTrtDimsEqualsArray(expected_operand_1_shape, operand_1_new_dims);
+      ExpectTrtDimsEqualsArray(expected_operand_2_shape, operand_2_new_dims);
+    }
+  };
+
+  // Both inputs are weights.
+  symmetric_test(
+      {1}, {1}, kIsNotTensor, kIsNotTensor, {}, {}, error::INVALID_ARGUMENT,
+      "Broadcasting requires at least one of the operands be tensors");
+
+  // One tensor and one weights.
+  symmetric_test({1, 1, 1}, {2}, kIsTensor, kIsNotTensor, {1, 1, 1}, {1, 1, 2});
+  symmetric_test({1, 1, 2}, {2}, kIsTensor, kIsNotTensor, {1, 1, 2}, {1, 1, 2});
+  symmetric_test({1, 3, 2}, {1}, kIsTensor, kIsNotTensor, {1, 3, 2}, {1, 1, 1});
+  symmetric_test({1, 1, 1}, {2, 3}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {1, 2, 3});
+  symmetric_test({1, 1, 1}, {2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 1, 1}, {1, 2, 3, 4}, kIsTensor, kIsNotTensor, {1, 1, 1},
+                 {2, 3, 4});
+  symmetric_test({1, 3, 4}, {1, 2, 1, 4}, kIsTensor, kIsNotTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme");
+  symmetric_test({1, 1, 1}, {2, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT, "Infeasible broadcast scheme",
+                 /*operand_1_batch_size=*/2);
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1, 1}, kIsTensor, kIsNotTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+
+  // Both inputs are tensors.
+  symmetric_test({1, 1, 1}, {1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 3 vs broadcast #dims 4)");
+  symmetric_test({1, 3, 4}, {2, 1, 4}, kIsTensor, kIsTensor, {1, 3, 4},
+                 {2, 1, 4});
+  symmetric_test({1, 1, 1}, {1, 1, 1, 1}, kIsTensor, kIsTensor, {}, {},
+                 error::INVALID_ARGUMENT,
+                 "Broadcasting beyond batch dimension is not supported "
+                 "(tensor #dims 4 vs broadcast #dims 5)");
+}
+
 // Class to test various op converters, using both a TrtNodeValidator and
 // Converter.
 class OpConverterTest : public ::testing::Test {
@@ -684,15 +943,21 @@ class OpConverterTest : public ::testing::Test {
 
     // Reset the validator and converter.
     validator_.reset(new TrtNodeValidator);
-    converter_.reset(new Converter(network_.get(), /*fp16=*/false));
+    converter_.reset(new Converter(network_.get(),
+                                   /*precision_mode=*/FP32MODE,
+                                   /*use_calibration=*/false));
 
     // Reset other related artifacts.
     scope_ = Scope::NewRootScope();
     validator_inputs_.clear();
   }
 
-  void BuildAndRun(const char* input_name, const std::vector<float>& input_data,
-                   const char* output_name, std::vector<float>* output_data) {
+  // TODO(laigd): test fp16 and int8 support.
+  template <typename T>
+  void BuildAndRun(
+      const std::vector<std::pair<const char*, const std::vector<T>>>&
+          input_data,
+      const char* output_name, std::vector<T>* output_data) {
     // Mark the output tensor as TRT engine output.
     TF_EXPECT_OK(converter_->RenameAndMarkOutputTensors(
         {{string(output_name), string(output_name)}}));
@@ -703,25 +968,33 @@ class OpConverterTest : public ::testing::Test {
     CHECK_NOTNULL(engine_.get());
 
     // Execute the TRT engine.
-    const int input_size = input_data.size() * sizeof(float);
-    const int output_size = output_data->size() * sizeof(float);
-    const int input_index = engine_->getBindingIndex(input_name);
-    const int output_index = engine_->getBindingIndex(output_name);
+    ASSERT_LE(input_data.size() + 1, 3);
+    void* buffers[3];
+    for (const auto name_and_data : input_data) {
+      const int input_size = name_and_data.second.size() * sizeof(T);
+      const int input_index = engine_->getBindingIndex(name_and_data.first);
+      ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
+      ASSERT_EQ(
+          0, cudaMemcpyAsync(buffers[input_index], name_and_data.second.data(),
+                             input_size, cudaMemcpyHostToDevice, stream_));
+    }
 
-    ASSERT_EQ(engine_->getNbBindings(), 2);
-    void* buffers[2];
-    ASSERT_EQ(0, cudaMalloc(&buffers[input_index], input_size));
+    const int output_size = output_data->size() * sizeof(T);
+    const int output_index = engine_->getBindingIndex(output_name);
     ASSERT_EQ(0, cudaMalloc(&buffers[output_index], output_size));
-    ASSERT_EQ(0, cudaMemcpyAsync(buffers[input_index], input_data.data(),
-                                 input_size, cudaMemcpyHostToDevice, stream_));
+
+    ASSERT_EQ(engine_->getNbBindings(), input_data.size() + 1);
+
     TrtUniquePtrType<nvinfer1::IExecutionContext> execution_context(
         engine_->createExecutionContext());
     execution_context->enqueue(/*batchSize=*/1, buffers, stream_, nullptr);
     ASSERT_EQ(0, cudaMemcpyAsync(output_data->data(), buffers[output_index],
                                  output_size, cudaMemcpyDeviceToHost, stream_));
     cudaStreamSynchronize(stream_);
-    ASSERT_EQ(0, cudaFree(buffers[input_index]));
-    ASSERT_EQ(0, cudaFree(buffers[output_index]));
+
+    for (int i = 0; i < input_data.size() + 1; ++i) {
+      ASSERT_EQ(0, cudaFree(buffers[i]));
+    }
   }
 
   bool HasStaticShape(const nvinfer1::Dims& dims) const {
@@ -736,18 +1009,7 @@ class OpConverterTest : public ::testing::Test {
   void AddTestTensor(
       const char* name, const std::vector<int32>& dims, int batch_size = 1,
       nvinfer1::DataType trt_dtype = nvinfer1::DataType::kFLOAT) {
-    DataType tf_dtype = DT_FLOAT;
-    switch (trt_dtype) {
-      case nvinfer1::DataType::kFLOAT:
-        tf_dtype = DT_FLOAT;
-        break;
-      case nvinfer1::DataType::kINT32:
-        tf_dtype = DT_INT32;
-        break;
-      default:
-        ASSERT_TRUE(false) << "Unexpected data type "
-                           << static_cast<int>(trt_dtype);
-    }
+    DataType tf_dtype = TrtDataTypeToTf(trt_dtype);
     ops::Placeholder::Attrs attrs;
     TF_EXPECT_OK(TensorShapeUtils::MakeShape(dims, &attrs.shape_));
     attrs.shape_.InsertDim(0, batch_size);
@@ -826,6 +1088,11 @@ class OpConverterTest : public ::testing::Test {
     }
   }
 
+  // Expose quantization_ranges_ for tests
+  std::unordered_map<nvinfer1::ITensor*, float>& quantization_ranges() {
+    return converter_->quantization_ranges_;
+  }
+
   std::unique_ptr<Converter> converter_;
   std::unique_ptr<TrtNodeValidator> validator_;
 
@@ -835,6 +1102,11 @@ class OpConverterTest : public ::testing::Test {
   TrtUniquePtrType<nvinfer1::INetworkDefinition> network_;
   TrtUniquePtrType<nvinfer1::ICudaEngine> engine_;
   cudaStream_t stream_;
+  // Used to create placeholders with shape and data type information. The
+  // created placeholders will be used as inputs to the node to be verified,
+  // thus we need the shape and data type information to get a non-empty
+  // GraphProperties.
+  // TODO(laigd): consider use this Scope to create the NodeDef to verify.
   Scope scope_;
   std::unordered_map<string, NodeDef> validator_inputs_;
 };
@@ -958,15 +1230,15 @@ TEST_F(OpConverterTest, ConvertTranspose) {
     Reset();
     AddTestTensor("input", {1, 2, 3});
     AddTestWeights<int32>("weights", {4}, {0, 3, 1, 2});
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_transpose", &output));
     EXPECT_TRUE(output.is_tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions()))
-        << output.DebugString();
+    ExpectTrtDimsEqualsArray({3, 1, 2}, output.tensor()->getDimensions());
 
     std::vector<float> output_data(6);
-    BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_transpose", &output_data);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_transpose",
+                       &output_data);
     EXPECT_THAT(output_data, ElementsAre(1, 4, 2, 5, 3, 6));
   }
 }
@@ -1048,15 +1320,15 @@ TEST_F(OpConverterTest, ConvertReshape) {
     Reset();
     AddTestTensor("input", ok_params[i].tensor_dims, ok_params[i].batch_size);
     AddTestWeights<int32>("weights", {4}, ok_params[i].shape);
-    RunConversion(node_def);
+    RunValidationAndConversion(node_def);
     TRT_TensorOrWeights output;
     TF_EXPECT_OK(GetTensorOrWeights("my_reshape", &output));
     EXPECT_TRUE(output.is_tensor());
-    EXPECT_TRUE(TrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions()))
-        << output.DebugString();
+    ExpectTrtDimsEqualsArray({1, 3, 2}, output.tensor()->getDimensions());
 
     std::vector<float> output_data(6);
-    BuildAndRun("input", {1, 2, 3, 4, 5, 6}, "my_reshape", &output_data);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_reshape",
+                       &output_data);
     EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
   }
 }
@@ -1070,15 +1342,14 @@ TEST_F(OpConverterTest, ConvertMatMul) {
         "Input expects tensor and weights, at my_matmul");
   }
 
-  // Get the NodeDef for Reshape.
+  // Get the NodeDef for MatMul.
   auto get_matmul_nodedef = [](DataType dtype, bool transpose_a,
                                bool transpose_b) -> NodeDef {
     Scope s = Scope::NewRootScope();
     auto input = ops::Placeholder(s.WithOpName("input"), dtype);
     auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
-    ops::MatMul::Attrs matmul_attrs;
-    matmul_attrs.transpose_a_ = transpose_a;
-    matmul_attrs.transpose_b_ = transpose_b;
+    const auto matmul_attrs =
+        ops::MatMul::TransposeA(transpose_a).TransposeB(transpose_b);
     auto matmul =
         ops::MatMul(s.WithOpName("my_matmul"), input, weights, matmul_attrs);
     return matmul.operation.node()->def();
@@ -1094,45 +1365,990 @@ TEST_F(OpConverterTest, ConvertMatMul) {
         node_def, error::UNIMPLEMENTED,
         "Data type is not supported, for node my_matmul got int32");
   }
-  {
-    // transpose_a is set.
-    for (bool transpose_b : {false, true}) {
-      Reset();
-      NodeDef node_def =
-          get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b);
-      AddTestTensor("input", {2}, /*batch_size=*/1);
-      AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
-      RunValidationAndConversion(
-          node_def, error::INVALID_ARGUMENT,
-          "transpose_a is not supported for TensorRT FullyConnected");
+  // transpose_a is set.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/true, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "transpose_a is not supported for TensorRT FullyConnected");
+  }
+  // OK.
+  for (bool transpose_b : {false, true}) {
+    Reset();
+    NodeDef node_def =
+        get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b);
+    AddTestTensor("input", {2}, /*batch_size=*/1);
+    AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2}, output.tensor()->getDimensions());
+
+    std::vector<float> output_data(2);
+    BuildAndRun<float>({{"input", {0, 1}}}, "my_matmul", &output_data);
+    if (transpose_b) {
+      EXPECT_THAT(output_data, ElementsAre(1, 3));
+    } else {
+      EXPECT_THAT(output_data, ElementsAre(2, 3));
     }
   }
-  {
-    // OK.
-    for (bool transpose_b : {false, true}) {
-      Reset();
-      NodeDef node_def =
-          get_matmul_nodedef(DT_FLOAT, /*transpose_a=*/false, transpose_b);
-      AddTestTensor("input", {2}, /*batch_size=*/1);
-      AddTestWeights<float>("weights", {2, 2}, {0, 1, 2, 3});
-      RunConversion(node_def);
+}
+
+template <DataType dtype>
+void TestConvertBiasAdd(OpConverterTest* test) {
+  // Get the NodeDef for BiasAdd.
+  auto get_biasadd_nodedef = [](const string& data_format) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+    auto weights = ops::Placeholder(s.WithOpName("weights"), dtype);
+    const auto biasadd_attrs = ops::BiasAdd::DataFormat(data_format);
+    auto biasadd =
+        ops::BiasAdd(s.WithOpName("my_biasadd"), input, weights, biasadd_attrs);
+    return biasadd.operation.node()->def();
+  };
+
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (const string& data_format : {"NHWC", "NCHW"}) {
+    for (const int trt_input_rank : {1, 2, 3, 4}) {
+      test->Reset();
+      NodeDef node_def = get_biasadd_nodedef(data_format);
+
+      // Add input, dims_array will be like {2, 1, ..., 1, 3}
+      std::vector<int32> dims_array(trt_input_rank, 1);
+      if (trt_input_rank == 1) {
+        dims_array[0] = (data_format == "NHWC" ? 3 : 2);
+      } else {
+        dims_array[0] = 2;
+        dims_array[trt_input_rank - 1] = 3;
+      }
+      test->AddTestTensor("input", dims_array, /*batch_size=*/1,
+                          TfDataTypeToTrt(dtype));
+
+      // Add bias weights.
+      const int channel_size = (data_format == "NHWC" ? 3 : 2);
+      std::vector<CType> bias(channel_size);
+      for (int i = 0; i < channel_size; ++i) {
+        bias[i] = CType(i + 1);  // bias will be {1, 2, 3, ...}
+      }
+      test->AddTestWeights<CType>("weights", {channel_size}, bias);
+
+      // Run the conversion.
+      test->RunValidationAndConversion(node_def);
       TRT_TensorOrWeights output;
-      TF_EXPECT_OK(GetTensorOrWeights("my_matmul", &output));
+      TF_EXPECT_OK(test->GetTensorOrWeights("my_biasadd", &output));
       EXPECT_TRUE(output.is_tensor());
-      EXPECT_TRUE(TrtDimsEqualsArray({2}, output.tensor()->getDimensions()))
-          << output.DebugString();
-
-      std::vector<float> output_data(2);
-      BuildAndRun("input", {0, 1}, "my_matmul", &output_data);
-      if (transpose_b) {
-        EXPECT_THAT(output_data, ElementsAre(1, 3));
+      ExpectTrtDimsEqualsArray(dims_array, output.tensor()->getDimensions());
+
+      // Build and run the engine.
+      const int num_input = TrtDimsNumElements(GetTestDims(dims_array));
+      ASSERT_EQ(trt_input_rank > 1 ? 6 : (data_format == "NHWC" ? 3 : 2),
+                num_input);
+      std::vector<CType> output_data(num_input);
+      test->BuildAndRun<CType>(
+          {{"input", std::vector<CType>(num_input, CType(0))}}, "my_biasadd",
+          &output_data);
+      if (trt_input_rank == 1) {
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2)));
+        }
       } else {
-        EXPECT_THAT(output_data, ElementsAre(2, 3));
+        if (data_format == "NHWC") {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(2), CType(3),
+                                               CType(1), CType(2), CType(3)));
+        } else {
+          EXPECT_THAT(output_data, ElementsAre(CType(1), CType(1), CType(1),
+                                               CType(2), CType(2), CType(2)));
+        }
       }
     }
   }
 }
 
+TEST_F(OpConverterTest, ConvertBiasAdd) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_biasadd", "BiasAdd", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Input expects tensor and weights, at my_biasadd");
+  }
+
+  // OK. Note that kINT32 is not supported by IScaleLayer, so we don't test
+  // DT_INT32 type here.
+  TestConvertBiasAdd<DT_FLOAT>(this);
+  TestConvertBiasAdd<DT_HALF>(this);
+}
+
+template <typename OpType>
+NodeDef GetBinaryOpNodeDef(const string& input_name_l,
+                           const string& input_name_r, DataType dtype) {
+  Scope s = Scope::NewRootScope();
+  auto input_l = ops::Placeholder(s.WithOpName(input_name_l), dtype);
+  auto input_r = ops::Placeholder(s.WithOpName(input_name_r), dtype);
+  auto op = OpType(s.WithOpName("my_binary"), input_l, input_r);
+  return op.operation.node()->def();
+}
+
+void CheckAddedLayers(OpConverterTest* test, bool expect_scale_layer) {
+  bool element_wise_layer_found = false;
+  bool scale_layer_found = false;
+  for (int i = 0; i < test->converter_->network()->getNbLayers(); i++) {
+    nvinfer1::ILayer* layer = test->converter_->network()->getLayer(i);
+    if (dynamic_cast<nvinfer1::IScaleLayer*>(layer)) {
+      scale_layer_found = true;
+    } else if (dynamic_cast<nvinfer1::IElementWiseLayer*>(layer)) {
+      element_wise_layer_found = true;
+    }
+  }
+  EXPECT_EQ(expect_scale_layer, scale_layer_found);
+  EXPECT_NE(expect_scale_layer, element_wise_layer_found);
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpWeightNoBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  for (auto swap_inputs : {false, true}) {
+    test->Reset();
+    NodeDef node_def;
+    if (swap_inputs) {
+      node_def = GetBinaryOpNodeDef<OpType>("weights", "input", dtype);
+    } else {
+      node_def = GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+    }
+
+    const std::vector<CType> operand1{CType(3), CType(7.5)};
+    const std::vector<CType> operand2{CType(2), CType(3)};
+
+    // It requires the dims to be at least of rank 3 to apply an IScaleLayer.
+    test->AddTestTensor("input", /*dims=*/{1, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", /*dims=*/{1, 1, 2},
+                                /*values=*/swap_inputs ? operand1 : operand2);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(2);
+    test->BuildAndRun<CType>(
+        {{"input",
+          /*input_data=*/swap_inputs ? operand2 : operand1}},
+        "my_binary", &output_data);
+    if (node_def.op() == "Add") {
+      EXPECT_THAT(output_data, ElementsAre(CType(5), CType(10.5)));
+    } else if (node_def.op() == "Sub") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1), CType(4.5)));
+    } else if (node_def.op() == "Mul") {
+      EXPECT_THAT(output_data, ElementsAre(CType(6), CType(22.5)));
+    } else if (node_def.op() == "Div") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else if (node_def.op() == "RealDiv") {
+      EXPECT_THAT(output_data, ElementsAre(CType(1.5), CType(2.5)));
+    } else {
+      ASSERT_TRUE(false);
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithChannelWiseBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10), CType(20)};
+  // There are two types of valid dim pairs which requires channel-wise
+  // broadcasting:
+  // - input dims (X Y Z) vs weights dims (X 1 1)
+  // - input dims (X Y Z) vs weights dims (Z)
+  // Here X=Z=2 and Y=1.
+  for (auto weights_dims : std::vector<std::vector<int>>{{2, 1, 1}, {2}}) {
+    test->Reset();
+    test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                        TfDataTypeToTrt(dtype));
+    test->AddTestWeights<CType>("weights", weights_dims, weights);
+    test->RunValidationAndConversion(node_def);
+
+    // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+    CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+    // Check the dims of the output ITensor.
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+    std::vector<CType> output_data(4);
+    test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+    if (weights_dims.size() == 1) {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(22), CType(13), CType(24)));
+    } else {
+      EXPECT_THAT(output_data,
+                  ElementsAre(CType(11), CType(12), CType(23), CType(24)));
+    }
+  }
+}
+
+template <DataType dtype>
+void TestBinaryTensorOpWeightWithUniformlyBroadcast(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<ops::Add>("input", "weights", dtype);
+  const std::vector<CType> input{CType(1), CType(2), CType(3), CType(4)};
+  const std::vector<CType> weights{CType(10)};
+  test->Reset();
+  test->AddTestTensor("input", /*dims=*/{2, 1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>("weights", {1, 1, 1, 1}, weights);
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpWeight, not BinaryTensorOpTensor.
+  CheckAddedLayers(test, /*expect_scale_layer=*/true);
+
+  // Check the dims of the output ITensor.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 1, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  test->BuildAndRun<CType>({{"input", input}}, "my_binary", &output_data);
+  EXPECT_THAT(output_data,
+              ElementsAre(CType(11), CType(12), CType(13), CType(14)));
+}
+
+template <typename OpType>
+void TestBinaryTensorOpWeightFallback(OpConverterTest* test,
+                                      const std::vector<int32>& input_dims,
+                                      const std::vector<int>& weights_dims,
+                                      error::Code code = error::OK,
+                                      const char* error_msg_substr = nullptr,
+                                      const int input_batch_size = 1) {
+  const DataType dtype = DT_FLOAT;
+  typedef typename EnumToDataType<dtype>::Type CType;
+  const size_t num_inputs = TrtDimsNumElements(GetTestDims(input_dims));
+  const size_t num_weights = TrtDimsNumElements(GetTestDims(weights_dims));
+
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input", "weights", dtype);
+  test->AddTestTensor("input", /*dims=*/input_dims, input_batch_size,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestWeights<CType>(
+      "weights", /*dims=*/weights_dims,
+      /*values=*/std::vector<CType>(num_weights, CType(1)));
+  test->RunValidationAndConversion(node_def, code, error_msg_substr);
+  if (code != error::OK) return;
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+
+  // Check the dims of the output ITensor.
+  std::vector<int> expected_output_dims = input_dims;
+  for (int i = expected_output_dims.size() - 1, j = weights_dims.size() - 1;
+       i >= 0 && j >= 0; --i, --j) {
+    if (expected_output_dims[i] == 1) {
+      expected_output_dims[i] = weights_dims[j];
+    }
+  }
+  ExpectTrtDimsEqualsArray(expected_output_dims,
+                           output.tensor()->getDimensions());
+
+  // Check the result of running the engine.
+  const int expected_num_outputs =
+      TrtDimsNumElements(GetTestDims(expected_output_dims));
+  std::vector<CType> output_data(expected_num_outputs);
+  test->BuildAndRun<CType>(
+      {{"input",
+        /*input_data=*/std::vector<CType>(num_inputs, CType(2))}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(3))));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data, ElementsAreArray(std::vector<CType>(
+                                 expected_num_outputs, CType(1))));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+template <typename OpType, DataType dtype>
+void TestBinaryTensorOpTensor(OpConverterTest* test) {
+  typedef typename EnumToDataType<dtype>::Type CType;
+  test->Reset();
+  const NodeDef node_def =
+      GetBinaryOpNodeDef<OpType>("input1", "input2", dtype);
+  test->AddTestTensor("input1", /*dims=*/{1, 2}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->AddTestTensor("input2", /*dims=*/{2, 1}, /*batch_size=*/1,
+                      TfDataTypeToTrt(dtype));
+  test->RunValidationAndConversion(node_def);
+
+  // Make sure it does use BinaryTensorOpTensor, not BinaryTensorOpWeight.
+  CheckAddedLayers(test, /*expect_scale_layer=*/false);
+
+  // Check output dims.
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_binary", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({2, 2}, output.tensor()->getDimensions());
+
+  std::vector<CType> output_data(4);
+  // After broadcasting first input becomes {3, 6, 3, 6} and second input
+  // becomes {2, 3, 2, 3}.
+  test->BuildAndRun<CType>(
+      {{"input1", {CType(3), CType(6)}}, {"input2", {CType(2), CType(3)}}},
+      "my_binary", &output_data);
+  if (node_def.op() == "Add") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(5), CType(8), CType(6), CType(9)));
+  } else if (node_def.op() == "Sub") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1), CType(4), CType(0), CType(3)));
+  } else if (node_def.op() == "Mul") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(6), CType(12), CType(9), CType(18)));
+  } else if (node_def.op() == "Div") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "RealDiv") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(1.5), CType(3), CType(1), CType(2)));
+  } else if (node_def.op() == "Minimum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(2), CType(2), CType(3), CType(3)));
+  } else if (node_def.op() == "Maximum") {
+    EXPECT_THAT(output_data,
+                ElementsAre(CType(3), CType(6), CType(3), CType(6)));
+  } else {
+    ASSERT_TRUE(false);
+  }
+}
+
+TEST_F(OpConverterTest, ConvertBinary) {
+  // Input size doesn't match, should fail.
+  for (size_t num_inputs = 0; num_inputs < 2; ++num_inputs) {
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {num_inputs, "input"});
+    AddTestTensor("input", {1}, /*batch_size=*/1, nvinfer1::DataType::kFLOAT);
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Binary ops require two inputs, at my_add");
+  }
+  {
+    // Both inputs are weights.
+    Reset();
+    NodeDef node_def = MakeNodeDef("my_add", "Add", {"weights1", "weights2"});
+    AddTestWeights<float>("weights1", {1}, {1});
+    AddTestWeights<float>("weights2", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Constant folding is falled back to TensorFlow, binary op received "
+        "both input as constant at: my_add");
+  }
+
+  // Test BinaryTensorOpWeight() without broadcasting.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_FLOAT>(this);
+#if 0
+  // TODO(b/119560144): it doesn't support FP16 constants and the following test
+  // will fail.
+  TestBinaryTensorOpWeightNoBroadcast<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpWeightNoBroadcast<ops::RealDiv, DT_HALF>(this);
+#endif
+
+  // Test BinaryTensorOpWeight() with channel-wise broadcasting.
+  TestBinaryTensorOpWeightWithChannelWiseBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() with uniformly broadcasting.
+  TestBinaryTensorOpWeightWithUniformlyBroadcast<DT_FLOAT>(this);
+
+  // Test BinaryTensorOpWeight() falling back to BinaryTensorOpTensor().
+  // Unsupported op.
+  TestBinaryTensorOpWeightFallback<ops::Minimum>(this, {1, 1, 1}, {1});
+  // Rank of input tensor dimension <3.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1}, {1});
+  // Broadcast on batch dimension, should fail.
+  TestBinaryTensorOpWeightFallback<ops::Add>(
+      this, {1, 1, 1}, {2, 1, 1, 1}, error::INVALID_ARGUMENT,
+      "Unsupported binary op broadcast scheme for op my_binary",
+      /*input_batch_size=*/2);
+  // Incompatible dims with per-channel mode.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 1, 1}, {1, 2, 1});
+  // Incompatible dims.
+  TestBinaryTensorOpWeightFallback<ops::Add>(this, {1, 2, 1}, {2});
+
+  // Test BinaryTensorOpTensor() with broadcasting.
+  TestBinaryTensorOpTensor<ops::Add, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_FLOAT>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_FLOAT>(this);
+
+  TestBinaryTensorOpTensor<ops::Add, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Sub, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Mul, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Div, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::RealDiv, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Minimum, DT_HALF>(this);
+  TestBinaryTensorOpTensor<ops::Maximum, DT_HALF>(this);
+}
+
+TEST_F(OpConverterTest, ConvertQuantize) {
+  for (const string& op :
+       {"FakeQuantWithMinMaxArgs", "FakeQuantWithMinMaxVars",
+        "QuantizeAndDequantizeV2", "QuantizeAndDequantizeV3"}) {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_quantize", op, {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        StrCat("Invalid number of inputs for ", op, ", at my_quantize")
+            .c_str());
+  }
+  {
+    // FakeQuantWithMinMaxArgs attributes are empty, should fail.
+    NodeDef node_def =
+        MakeNodeDef("my_quantize", "FakeQuantWithMinMaxArgs", {"input"});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min or max attribute not found for FakeQuantWithMinMaxArgs "
+        "at my_quantize");
+  }
+  {
+    // FakeQuantWithMinMaxArgs ranges set via attributes, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto quantize_attrs = ops::FakeQuantWithMinMaxArgs::Min(-6.0f).Max(6.0f);
+    auto quantize = ops::FakeQuantWithMinMaxArgs(s.WithOpName("my_quantize"),
+                                                 input, quantize_attrs);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // FakeQuantWithMinMaxVars ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::FakeQuantWithMinMaxVars(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // QuantizeAndDequantizeV2 ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::QuantizeAndDequantizeV2(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+  {
+    // QuantizeAndDequantizeV2 Range inputs are tensors, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto quantize = ops::QuantizeAndDequantizeV2(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights_min", {1});
+    AddTestTensor("weights_max", {1});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Min and max inputs for QuantizeAndDequantizeV2 must be weights not "
+        "tensors, at my_quantize");
+  }
+  {
+    // QuantizeAndDequantizeV3 ranges set via inputs, ok.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto weights_min = ops::Placeholder(s.WithOpName("weights_min"), DT_FLOAT);
+    auto weights_max = ops::Placeholder(s.WithOpName("weights_max"), DT_FLOAT);
+    auto num_bits = ops::Placeholder(s.WithOpName("num_bits"), DT_INT32);
+    auto quantize = ops::QuantizeAndDequantizeV3(
+        s.WithOpName("my_quantize"), input, weights_min, weights_max, num_bits);
+    const NodeDef& node_def = quantize.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<float>("weights_min", {1}, {-6.0f});
+    AddTestWeights<float>("weights_max", {1}, {6.0f});
+    AddTestWeights<int>("num_bits", {1}, {8});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_quantize", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(1, ranges.count(output.tensor()));
+    EXPECT_EQ(6.0f, ranges[output.tensor()]);
+  }
+}
+
+TEST_F(OpConverterTest, ConvertRelu6) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_relu6", "Relu6", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Invalid number of inputs for Relu6, at my_relu6");
+  }
+
+  // Get the NodeDef for Relu6.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto relu6 = ops::Relu6(s.WithOpName("my_relu6"), input);
+  const NodeDef node_def = relu6.operation.node()->def();
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<float>("input", {1}, {1.0f});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Relu6 is only implemented for tensors, not weights, at my_relu6");
+  }
+  {
+    // Clip tensor values and set quantization ranges, ok.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_relu6", &output));
+    EXPECT_TRUE(output.is_tensor());
+    auto ranges = quantization_ranges();
+    EXPECT_EQ(ranges[output.tensor()], 6.0f);
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {-100, -1, 0, 3, 5, 9}}}, "my_relu6",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(0, 0, 0, 3, 5, 6));
+  }
+}
+
+template <DataType dtype>
+void TestConvertSquare(OpConverterTest* test) {
+  test->Reset();
+  typedef typename EnumToDataType<dtype>::Type CType;
+
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), dtype);
+  auto square = ops::Square(s.WithOpName("my_square"), input);
+  NodeDef node_def = square.operation.node()->def();
+
+  test->AddTestTensor("input", {1, 20});
+  test->RunValidationAndConversion(node_def);
+  TRT_TensorOrWeights output;
+  TF_EXPECT_OK(test->GetTensorOrWeights("my_square", &output));
+  EXPECT_TRUE(output.is_tensor());
+  ExpectTrtDimsEqualsArray({1, 20}, output.tensor()->getDimensions());
+
+  const int num_inputs = 20;
+  std::vector<CType> input_data(num_inputs);
+  std::vector<CType> expected_output_data(num_inputs);
+  for (int i = 0; i < 20; i++) {
+    const CType value = CType(i - 9);
+    input_data[i] = value;
+    expected_output_data[i] = value * value;
+  }
+  std::vector<CType> output_data(num_inputs);
+  test->BuildAndRun<CType>({{"input", input_data}}, "my_square", &output_data);
+  ExpectArrayNear(expected_output_data, output_data);
+}
+
+TEST_F(OpConverterTest, ConvertSquare) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_square", "Square", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Square expects one input, at my_square");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto square = ops::Square(s.WithOpName("my_square"), input);
+    NodeDef node_def = square.operation.node()->def();
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, -5, 6});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Square is only implemented for tensors, at my_square");
+  }
+
+  // OK. Note that kINT32 is not supported by IElementWiseLayer, so we don't
+  // test DT_INT32 type here.
+  TestConvertSquare<DT_FLOAT>(this);
+  // TODO(tmorris): Looks like there may be a bug with this layer for FP16
+  // inputs. Disabling for now.
+  // TestConvertSquare<DT_HALF>(this);
+}
+
+TEST_F(OpConverterTest, ConvertActivation) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_act", "Relu", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "Relu expects one input, at my_act");
+  }
+  {
+    // Input is weights, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto relu = ops::Relu(s.WithOpName("my_act"), input);
+    const NodeDef& node_def = relu.operation.node()->def();
+    AddTestWeights<int32>("input", {1, 2, 3}, {-3, -2, -1, 0, 1, 2});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Relu is only implemented for tensors, at my_act");
+  }
+
+  // Get nodedef for activation layer.
+  auto get_act_nodedef = [](string op_name) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    if (op_name == "Relu") {
+      auto act = ops::Relu(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Sigmoid") {
+      auto act = ops::Sigmoid(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    } else if (op_name == "Tanh") {
+      auto act = ops::Tanh(s.WithOpName("my_act"), input);
+      return act.operation.node()->def();
+    }
+    EXPECT_TRUE(false);
+    return NodeDef();
+  };
+  // Get expected output for activation layer.
+  auto get_act_output = [](string op_name, float input) -> float {
+    if (op_name == "Relu") {
+      return (input > 0.0f) ? input : 0.0f;
+    } else if (op_name == "Sigmoid") {
+      return 1.0f / (1.0f + std::exp(-input));
+    } else if (op_name == "Tanh") {
+      return std::tanh(input);
+    }
+    EXPECT_TRUE(false);
+    return 0;
+  };
+
+  // Ok.
+  for (string op_name : {"Relu", "Sigmoid", "Tanh"}) {
+    Reset();
+    NodeDef node_def = get_act_nodedef(op_name);
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_act", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray({1, 2, 3}, output.tensor()->getDimensions());
+
+    const std::vector<float> input_data = {-100, -2, -1, 0, 1, 100};
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", input_data}}, "my_act", &output_data);
+    for (int i = 0; i < input_data.size(); i++) {
+      const float expected_output = get_act_output(op_name, input_data[i]);
+      EXPECT_FLOAT_EQ(output_data[i], expected_output);
+    }
+  }
+}
+
+TEST_F(OpConverterTest, ConvertExpandDims) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_expanddims", "ExpandDims", {});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Two inputs expected for ExpandDims, at my_expanddims");
+  }
+
+  // Get the NodeDef for ExpandDims.
+  Scope s = Scope::NewRootScope();
+  auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+  auto weights = ops::Placeholder(s.WithOpName("weights"), DT_INT32);
+  auto expanddims =
+      ops::ExpandDims(s.WithOpName("my_expanddims"), input, weights);
+  const NodeDef& node_def = expanddims.operation.node()->def();
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    AddTestWeights<int32>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    AddTestWeights<int32>("weights", {1}, {1});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "ExpandDims expects tensor for input, at my_expanddims");
+  }
+  {
+    // Axis is a tensor, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestTensor("weights", {3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "ExpandDims expects weights for axis, at my_expanddims");
+  }
+  {
+    // Add dim at batch dimension, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    AddTestWeights<int32>("weights", {1}, {0});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Add dim at batch dimension via negative axis, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-5});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Modifying batch dimension is not supported for ExpandDims, at "
+        "my_expanddims");
+  }
+  {
+    // Axis > rank(input), should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {5});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+  {
+    // Axis < -rank(input)-1, should fail.
+    Reset();
+    AddTestTensor("input", {1, 2, 3});
+    // Input is rank 4 (batch dim included)
+    AddTestWeights<int32>("weights", {1}, {-6});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for ExpandDims is invalid, must be in the range "
+        "[-rank(input) - 1, rank(input)], at my_expanddims");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, int axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    int axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kExpandDimsOKCases = 8;
+  TestParams ok_params[kExpandDimsOKCases] = {
+      TestParams{{2, 3}, 1, {1, 2, 3}}, TestParams{{2, 3}, -3, {1, 2, 3}},
+      TestParams{{2, 3}, 3, {2, 3, 1}}, TestParams{{2, 3}, -1, {2, 3, 1}},
+      TestParams{{2, 3}, 2, {2, 1, 3}}, TestParams{{2, 3}, -2, {2, 1, 3}},
+      TestParams{{6}, 1, {1, 6}},       TestParams{{6}, -1, {6, 1}},
+  };
+  for (int i = 0; i < kExpandDimsOKCases; ++i) {
+    Reset();
+    AddTestTensor("input", ok_params[i].input_dims);
+    AddTestWeights<int32>("weights", {1}, {ok_params[i].axis});
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_expanddims", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_expanddims",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
+TEST_F(OpConverterTest, ConvertSqueeze) {
+  {
+    // Input list is empty, should fail.
+    NodeDef node_def = MakeNodeDef("my_squeeze", "Squeeze", {});
+    RunValidationAndConversion(node_def, error::INVALID_ARGUMENT,
+                               "One input expected for Squeeze, at my_squeeze");
+  }
+  {
+    // No attrs, should fail.
+    Reset();
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    auto squeeze = ops::Squeeze(s.WithOpName("my_squeeze"), input);
+    const NodeDef& node_def = squeeze.operation.node()->def();
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze is only implemented for explicit dims, at my_squeeze");
+  }
+
+  // Get the NodeDef for Squeeze.
+  auto get_squeeze_nodedef = [](std::vector<int> axis) -> NodeDef {
+    Scope s = Scope::NewRootScope();
+    auto input = ops::Placeholder(s.WithOpName("input"), DT_FLOAT);
+    ops::Squeeze::Attrs squeeze_attrs;
+    squeeze_attrs.axis_ = gtl::ArraySlice<int>(axis);
+    auto squeeze =
+        ops::Squeeze(s.WithOpName("my_squeeze"), input, squeeze_attrs);
+    return squeeze.operation.node()->def();
+  };
+
+  {
+    // Input is weights, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestWeights<float>("input", {1, 2, 3}, {1, 2, 3, 4, 5, 6});
+    RunValidationAndConversion(
+        node_def, error::UNIMPLEMENTED,
+        "Squeeze expects tensor for input, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({0});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze batch dim via negative axis, should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(node_def, error::UNIMPLEMENTED,
+                               "Cannot squeeze batch dimension, at my_squeeze");
+  }
+  {
+    // Squeeze >= rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({4});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+  {
+    // Squeeze < -rank(input), should fail.
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef({-5});
+    AddTestTensor("input", {1, 2, 3});
+    RunValidationAndConversion(
+        node_def, error::INVALID_ARGUMENT,
+        "Axis for Squeeze is invalid, must be in the range "
+        "[-rank(input), rank(input)), at my_squeeze");
+  }
+
+  struct TestParams {
+    TestParams(const std::vector<int>& input_dims, const std::vector<int>& axis,
+               const std::vector<int>& expected_output_dims)
+        : input_dims(input_dims),
+          axis(axis),
+          expected_output_dims(expected_output_dims) {}
+    std::vector<int> input_dims;
+    std::vector<int> axis;
+    std::vector<int> expected_output_dims;
+  };
+
+  // Ok.
+  const int kSqueezeOKCases = 10;
+  TestParams ok_params[kSqueezeOKCases] = {
+      TestParams{{1, 2, 3}, {1}, {2, 3}},
+      TestParams{{1, 2, 3}, {-3}, {2, 3}},
+      TestParams{{2, 3, 1}, {3}, {2, 3}},
+      TestParams{{2, 3, 1}, {-1}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, 3, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {3, 1, 5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {-1, -3, -5}, {2, 3}},
+      TestParams{{1, 2, 1, 3, 1}, {1, -3, 5}, {2, 3}},
+      TestParams{{1, 6}, {1}, {6}},
+      TestParams{{6, 1}, {2}, {6}},
+  };
+  for (int i = 0; i < kSqueezeOKCases; ++i) {
+    Reset();
+    NodeDef node_def = get_squeeze_nodedef(ok_params[i].axis);
+    AddTestTensor("input", ok_params[i].input_dims);
+    RunValidationAndConversion(node_def);
+    TRT_TensorOrWeights output;
+    TF_EXPECT_OK(GetTensorOrWeights("my_squeeze", &output));
+    EXPECT_TRUE(output.is_tensor());
+    ExpectTrtDimsEqualsArray(ok_params[i].expected_output_dims,
+                             output.tensor()->getDimensions());
+
+    std::vector<float> output_data(6);
+    BuildAndRun<float>({{"input", {1, 2, 3, 4, 5, 6}}}, "my_squeeze",
+                       &output_data);
+    EXPECT_THAT(output_data, ElementsAre(1, 2, 3, 4, 5, 6));
+  }
+}
+
 }  // namespace convert
 }  // namespace tensorrt
 }  // namespace tensorflow
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
index b30d94b02824516906ea8880ac6de0bbee9e166c..c1688d4db88a270dcd202989f89a677ed10576d9 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.cc
@@ -67,6 +67,9 @@ tensorflow::Status TRTOptimizationPass::Init(
     TF_RETURN_IF_ERROR(GetPrecisionMode(
         Uppercase(params.at("precision_mode").s()), &precision_mode_));
   }
+  if (params.count("use_calibration")) {
+    use_calibration_ = params.at("use_calibration").b();
+  }
   return tensorflow::Status::OK();
 }
 
@@ -187,8 +190,8 @@ tensorflow::Status TRTOptimizationPass::Optimize(
     *optimized_graph = item.graph;
     return tensorflow::Status::OK();
   }
-  if (VLOG_IS_ON(2)) {
-    VLOG(2) << CurrentStackTrace();
+  if (VLOG_IS_ON(3)) {
+    LOG(INFO) << CurrentStackTrace();
     PrintDebugInfo(cluster, item);
   }
   int max_dim = -1;
@@ -222,6 +225,12 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   TF_RETURN_IF_ERROR(static_graph_properties.InferStatically(true));
   tensorflow::tensorrt::convert::ConversionParams cp;
 
+  if (use_calibration_ && precision_mode_ != INT8MODE) {
+    LOG(ERROR) << "Calibration with FP32 or FP16 is not implemented. "
+               << "Falling back to use_calibration = False.";
+    use_calibration_ = false;
+  }
+
   std::vector<string> nodes_to_preserve;
   for (const auto& n : item.NodesToPreserve()) {
     auto tokens = str_util::Split(n, ":");
@@ -250,6 +259,7 @@ tensorflow::Status TRTOptimizationPass::Optimize(
   cp.is_dyn_op = is_dynamic_op_;
   cp.cached_engine_batches = batches_;
   cp.max_cached_engines = max_cached_batches_;
+  cp.use_calibration = use_calibration_;
   auto status = tensorflow::tensorrt::convert::ConvertAfterShapes(cp);
   VLOG(1) << "Returning from " << name_;
   return status;
diff --git a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
index 71b51d13681cb3f75dad034f3fb0f73dea2bacc1..3e8dc0978e43e2e9ba07aaa09f74acfe8e59b9a7 100644
--- a/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
+++ b/tensorflow/contrib/tensorrt/convert/trt_optimization_pass.h
@@ -38,7 +38,8 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
         maximum_batch_size_(-1),
         is_dynamic_op_(false),
         max_cached_batches_(1),
-        max_workspace_size_bytes_(256LL << 20) {
+        max_workspace_size_bytes_(256LL << 20),
+        use_calibration_(true) {
     VLOG(1) << "Constructing " << name_;
   }
 
@@ -67,6 +68,7 @@ class TRTOptimizationPass : public tensorflow::grappler::CustomGraphOptimizer {
   std::vector<int> batches_;
   int max_cached_batches_;
   int64_t max_workspace_size_bytes_;
+  bool use_calibration_;
 };
 
 }  // namespace convert
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
index 019446813a56de6316a04c1738ae13d03e8f4713..bad568644bb1f8d01d4cb0a7c853ec47d6f19e45 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.cc
@@ -124,8 +124,10 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
   OP_REQUIRES_OK(context,
                  context->GetAttr("segment_funcdef_name", &funcdef_name_));
   OP_REQUIRES_OK(context, GetPrecisionMode(precision_string, &precision_mode_));
-  calibration_mode_ =
-      (precision_mode_ == INT8MODE && calibration_data.size() == 0);
+  OP_REQUIRES_OK(context,
+                 context->GetAttr("use_calibration", &use_calibration_));
+  calibration_mode_ = (use_calibration_ && precision_mode_ == INT8MODE &&
+                       calibration_data.size() == 0);
   if (calibration_data.size()) {
     calibrator_.reset(new TRTInt8Calibrator(calibration_data));
     calibration_data.resize(0);
@@ -149,9 +151,6 @@ TRTEngineOp::TRTEngineOp(OpKernelConstruction* context)
 
 void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
                                        AsyncHelper* helper) {
-  if (!calibration_mode_) {
-    VLOG(1) << "Executing native engine";
-  }
   std::vector<Tensor> inputs;
   std::vector<Tensor>* outputs = new std::vector<Tensor>();
   if (native_func_ == tensorflow::kInvalidHandle) {
@@ -172,7 +171,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
     inputs.push_back(ctx->input(i));
   }
   helper->Ref();  // Increment count for calculating native graph
-  VLOG(1) << "Executing native segment " << name();
+  VLOG(1) << "Executing native segment: " << name();
   lib->Run(opts, native_func_, inputs, outputs,
            [this, ctx, outputs, helper](const tensorflow::Status& s) {
              tensorflow::core::ScopedUnref sc(helper);
@@ -192,6 +191,7 @@ void TRTEngineOp::ExecuteNativeSegment(OpKernelContext* ctx,
 
 void TRTEngineOp::ExecuteCalibration(OpKernelContext* ctx,
                                      AsyncHelper* helper) {
+  VLOG(1) << "Executing TRT calibration: " << name();
   helper->Ref();
   tensorflow::core::ScopedUnref sc(helper);
   // TODO(aaroey): remove the ResourceMgr singleton.
@@ -303,12 +303,13 @@ bool TRTEngineOp::ExecuteTrtEngine(
     OpKernelContext* ctx, const int num_batch,
     nvinfer1::ICudaEngine* trt_engine_ptr,
     nvinfer1::IExecutionContext* trt_execution_context_ptr) {
+  VLOG(1) << "Executing TRT engine: " << name();
   const bool kRetry = true;
   const int num_binding = ctx->num_inputs() + ctx->num_outputs();
   std::vector<void*> buffers(num_binding);
   for (int i = 0; i < ctx->num_inputs(); i++) {
     const string input_name = StrCat(kInputPHName, i);
-    const size_t binding_index =
+    const int binding_index =
         trt_engine_ptr->getBindingIndex(input_name.c_str());
     if (binding_index == -1) {
       LOG(ERROR) << "Input node not found, at " << input_name;
@@ -345,7 +346,7 @@ bool TRTEngineOp::ExecuteTrtEngine(
   for (int i = 0; i < ctx->num_outputs(); i++) {
     // Create an output tensor
     const string output_name = StrCat(kOutputPHName, i);
-    const size_t binding_index =
+    const int binding_index =
         trt_engine_ptr->getBindingIndex(output_name.c_str());
     Tensor* output_tensor = nullptr;
 
@@ -491,13 +492,14 @@ TRTEngineOp::EngineCtxPair& TRTEngineOp::GetEngine(int batch_size,
     }
     TrtUniquePtrType<nvinfer1::ICudaEngine> engine;
     bool convert_successfully = false;
-    VLOG(0) << name() << " Constructing a new engine with batch size "
-            << batch_size;
+    LOG(INFO) << "Building a new TensorRT engine for " << name()
+              << " with batch size " << batch_size;
     // Up to this point, calibrator_ can never be empty, since otherwise it
     // means calibration_mode_ is true and this path won't get executed.
     auto status = convert::ConvertGraphDefToEngine(
         segment_graph_, precision_mode_, batch_size, workspace_size_, shapes,
-        &logger, allocator, calibrator_.get(), &engine, &convert_successfully);
+        &logger, allocator, calibrator_.get(), &engine, use_calibration_,
+        &convert_successfully);
     if (!status.ok()) {
       if (convert_successfully) {
         // This means it fail to build the engine even when the network is built
@@ -567,8 +569,8 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
   const int64 workspace_size_bytes = workspace_size_;
   cres->thr_.reset(new std::thread([cres, label, segment_graph, shapes,
                                     platform_gpu_id, workspace_size_bytes]() {
-    VLOG(0) << "Starting calibration thread on device " << platform_gpu_id
-            << ", Calibration Resource @ " << cres;
+    LOG(INFO) << "Starting calibration thread on device " << platform_gpu_id
+              << ", Calibration Resource @ " << cres;
     auto err = cudaSetDevice(platform_gpu_id);
     if (err != cudaSuccess) {
       // TODO(aaroey): should return error here.
@@ -586,6 +588,7 @@ tensorflow::Status TRTEngineOp::AllocateCalibrationResources(
         *segment_graph, INT8MODE, cres->calibrator_->getBatchSize(),
         workspace_size_bytes, shapes, &cres->logger_, cres->allocator_.get(),
         cres->calibrator_.get(), &cres->engine_,
+        /*use_calibration=*/true,
         /*convert_successfully=*/nullptr);
     if (!s.ok()) {
       LOG(ERROR) << "Calibration failed: " << s;
diff --git a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
index 8fe06758914261035c90a6fda3f114a63a8ac93a..b545f497f32d5a1a6960b748467ca189b7debf6c 100644
--- a/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
+++ b/tensorflow/contrib/tensorrt/kernels/trt_engine_op.h
@@ -130,6 +130,10 @@ class TRTEngineOp : public AsyncOpKernel {
 
   // The finalized calibrator for inference.
   std::unique_ptr<TRTInt8Calibrator> calibrator_;
+
+  // If true, create calibration graph for INT8 mode. Otherwise, we are using
+  // user-provided quantization ranges.
+  bool use_calibration_;
 };
 
 }  // namespace tensorrt
diff --git a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
index e0c7b6272379a20e3dacb6cd7c3b39de735d844d..92405906eb76b043bc08b68e25e16ab40197dddf 100644
--- a/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
+++ b/tensorflow/contrib/tensorrt/ops/trt_engine_op.cc
@@ -16,6 +16,7 @@ limitations under the License.
 #if GOOGLE_CUDA
 #if GOOGLE_TENSORRT
 
+#include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/shape_inference.h"
@@ -39,18 +40,19 @@ REGISTER_OP("TRTEngineOp")
     .Attr("cached_engine_batches: list(int) = []")
     .Attr("max_cached_engines_count: int = 1")
     .Attr("workspace_size_bytes: int")
-    .Attr("precision_mode: {'FP32', 'FP16', 'INT8', 'INT8CALIB'}")
+    .Attr("precision_mode: {'FP32', 'FP16', 'INT8'}")
     .Attr("calibration_data: string = ''")
+    .Attr("use_calibration: bool = true")
     .Input("in_tensor: InT")
-    .Output("out_tensor: OutT");
-// TODO(jie): TF requires concrete output shape for concrete input shapes.
-// This is tricky for batch dimension, since we cannot ensure which input
-// would carry the correct batch dimension (for the current stage of the
-// implementation, we do require all input tensor to carry the same batch
-// size, but this could change in the future). Hence we disable shape
-// inference function as a workaround.
-// .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
-
+    .Output("out_tensor: OutT")
+    // TODO(jie): TF requires concrete output shape for concrete input shapes.
+    // This is tricky for batch dimension, since we cannot ensure which input
+    // would carry the correct batch dimension (for the current stage of the
+    // implementation, we do require all input tensor to carry the same batch
+    // size, but this could change in the future). Hence we disable shape
+    // inference function as a workaround.
+    // .SetShapeFn(shape_inference::TRTEngineOpShapeInference);
+    .SetShapeFn(shape_inference::UnknownShape);
 }  // namespace tensorflow
 
 #endif  // GOOGLE_TENSORRT
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert.py b/tensorflow/contrib/tensorrt/python/trt_convert.py
index bb81fbf93f37b97d01bb1e10fefb8c7da64b329f..203b2697babe32b45523109708cbf062dceee33b 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert.py
@@ -63,19 +63,20 @@ class TrtPrecisionMode(object):
     return [TrtPrecisionMode.FP32, TrtPrecisionMode.FP16, TrtPrecisionMode.INT8]
 
 
-def tensorrt_rewriter_config(rewriter_config=None,
-                             max_batch_size=1,
-                             max_workspace_size_bytes=2 << 20,
-                             precision_mode=TrtPrecisionMode.FP32,
-                             minimum_segment_size=3,
-                             is_dynamic_op=False,
-                             maximum_cached_engines=1,
-                             cached_engine_batch_sizes=None):
+def get_tensorrt_rewriter_config(rewriter_config=None,
+                                 max_batch_size=1,
+                                 max_workspace_size_bytes=2 << 20,
+                                 precision_mode=TrtPrecisionMode.FP32,
+                                 minimum_segment_size=3,
+                                 is_dynamic_op=False,
+                                 maximum_cached_engines=1,
+                                 cached_engine_batch_sizes=None,
+                                 use_calibration=True):
   """Returns a RewriterConfig proto for TRT transformation.
 
   Args:
-    rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to.
-      If None, it will create one with default settings.
+    rewriter_config: a template RewriterConfig proto used to create a
+      TRT-enabled RewriterConfig. If None, it will use a default one.
     max_batch_size: max size for the input batch
     max_workspace_size_bytes: the maximum GPU temporary memory which the TRT
       engine can use at execution time. This corresponds to the 'workspaceSize'
@@ -95,6 +96,15 @@ def tensorrt_rewriter_config(rewriter_config=None,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
 
   Returns:
     A RewriterConfig proto which sets a TensorRTOptimizer to run Grappler.
@@ -107,13 +117,16 @@ def tensorrt_rewriter_config(rewriter_config=None,
       rewriter_config, rewriter_config_pb2.RewriterConfig):
     raise TypeError("rewriter_config should be a RewriterConfig proto.")
 
+  rewriter_config_with_trt = rewriter_config_pb2.RewriterConfig()
   if rewriter_config is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
     # Layout optimizer may add Const nodes followed by Reshape nodes, thus we
     # need to run constant folding again.
-    rewriter_config.optimizers.extend(["constfold", "layout", "constfold"])
-    rewriter_config.meta_optimizer_iterations = (
+    rewriter_config_with_trt.optimizers.extend(
+        ["constfold", "layout", "constfold"])
+    rewriter_config_with_trt.meta_optimizer_iterations = (
         rewriter_config_pb2.RewriterConfig.ONE)
+  else:
+    rewriter_config_with_trt.CopyFrom(rewriter_config)
 
   if precision_mode.upper() not in TrtPrecisionMode.supported_precision_modes():
     raise ValueError(("precision mode '{}' is not supported."
@@ -121,7 +134,7 @@ def tensorrt_rewriter_config(rewriter_config=None,
                           precision_mode,
                           TrtPrecisionMode.supported_precision_modes))
 
-  optimizer = rewriter_config.custom_optimizers.add()
+  optimizer = rewriter_config_with_trt.custom_optimizers.add()
   optimizer.name = "TensorRTOptimizer"
   optimizer.parameter_map["minimum_segment_size"].i = minimum_segment_size
   optimizer.parameter_map["max_batch_size"].i = max_batch_size
@@ -138,7 +151,8 @@ def tensorrt_rewriter_config(rewriter_config=None,
                        "maximum_cached_engines items.")
     optimizer.parameter_map["cached_engine_batches"].list.i.extend(
         cached_engine_batch_sizes)
-  return rewriter_config
+  optimizer.parameter_map["use_calibration"].b = use_calibration
+  return rewriter_config_with_trt
 
 
 def create_inference_graph(input_graph_def,
@@ -150,7 +164,7 @@ def create_inference_graph(input_graph_def,
                            is_dynamic_op=False,
                            maximum_cached_engines=1,
                            cached_engine_batch_sizes=None,
-                           rewriter_config=None,
+                           use_calibration=True,
                            input_saved_model_dir=None,
                            input_saved_model_tags=None,
                            output_saved_model_dir=None,
@@ -182,8 +196,15 @@ def create_inference_graph(input_graph_def,
       use this list to determine the batch sizes of the cached engines, instead
       of making the decision on the fly. This is useful when we know the most
       common batch size(s) the application is going to generate.
-    rewriter_config: a RewriterConfig proto to append the TensorRTOptimizer to.
-      If None, it will create one with default settings.
+    use_calibration: this argument is ignored if precision_mode is not INT8. If
+      set to True, a calibration graph will be created to calibrate the missing
+      ranges. The calibration graph must be converted to an inference graph
+      using calib_graph_to_infer_graph() after running calibration. if set to
+      False, quantization nodes will be expected for every tensor in the graph
+      (exlcuding those which will be fused). If a range is missing, an error
+      will occur. Please note that accuracy may be negatively affected if there
+      is a mismatch between which tensors TRT quantizes and which tensors were
+      trained with fake quantization.
     input_saved_model_dir: the directory to load the SavedModel which contains
       the input graph to transforms. Used only when input_graph_def is None.
     input_saved_model_tags: list of tags to load the SavedModel.
@@ -191,8 +212,9 @@ def create_inference_graph(input_graph_def,
       returned GraphDef and save it to the specified directory. This option only
       works when the input graph is loaded from a SavedModel, i.e. when
       input_saved_model_dir is specified and input_graph_def is None.
-    session_config: the ConfigProto used to create a Session. If not specified,
-      a default ConfigProto will be used.
+    session_config: the ConfigProto used to create a Session. It's also used as
+      a template to create a TRT-enabled ConfigProto for conversion. If not
+      specified, a default ConfigProto will be used.
 
   Returns:
     A GraphDef transformed from input_graph_def (or the SavedModel graph def
@@ -322,21 +344,30 @@ def create_inference_graph(input_graph_def,
       grappler_meta_graph_def.collection_def["train_op"].CopyFrom(
           output_collection)
 
-  # Create RewriterConfig.
-  rewriter_config = tensorrt_rewriter_config(
+  # Create TRT-enabled ConfigProto.
+  session_config_with_trt = config_pb2.ConfigProto()
+  session_config_with_trt.CopyFrom(session_config)
+  rewriter_config = None
+  if (session_config_with_trt.HasField("graph_options") and
+      session_config_with_trt.graph_options.HasField("rewrite_options")):
+    rewriter_config = session_config_with_trt.graph_options.rewrite_options
+  rewriter_config_with_trt = get_tensorrt_rewriter_config(
       rewriter_config, max_batch_size, max_workspace_size_bytes, precision_mode,
       minimum_segment_size, is_dynamic_op, maximum_cached_engines,
-      cached_engine_batch_sizes)
+      cached_engine_batch_sizes, use_calibration)
+  session_config_with_trt.graph_options.rewrite_options.CopyFrom(
+      rewriter_config_with_trt)
 
   # Run Grappler.
   transformed_graph_def = tf_optimizer.OptimizeGraph(
-      rewriter_config, grappler_meta_graph_def, graph_id=b"tf_graph")
+      session_config_with_trt, grappler_meta_graph_def, graph_id=b"tf_graph")
 
   # Optionally write the transformed graphdef as SavedModel.
   if output_saved_model_dir is not None:
     saved_model_builder = builder.SavedModelBuilder(output_saved_model_dir)
     with ops.Graph().as_default():
       importer.import_graph_def(transformed_graph_def, name="")
+      # We don't use TRT here.
       with session.Session(config=session_config) as sess:
         saved_model_builder.add_meta_graph_and_variables(
             sess,
diff --git a/tensorflow/contrib/tensorrt/python/trt_convert_test.py b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
index 9f2eeac990dcacb547d336b68bc042016c3e6171..a7b2d2ea50543ba85c5a13dd6ca320e794ca47f1 100644
--- a/tensorflow/contrib/tensorrt/python/trt_convert_test.py
+++ b/tensorflow/contrib/tensorrt/python/trt_convert_test.py
@@ -47,9 +47,9 @@ from tensorflow.python.tools import saved_model_utils
 class TrtConvertTest(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration python API."""
 
-  def testTensorrtRewriterConfig(self):
-    """Test case for trt_convert.tensorrt_rewriter_config()."""
-    rewriter_cfg = trt_convert.tensorrt_rewriter_config(
+  def testGetTensorrtRewriterConfig(self):
+    """Test case for trt_convert.get_tensorrt_rewriter_config()."""
+    rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
         rewriter_config=None,
         max_batch_size=128,
         max_workspace_size_bytes=1234,
@@ -162,7 +162,7 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
       node_name_to_op = {node.name: node.op for node in graph_def.node}
       self.assertEqual({
           "input": "Placeholder",
-          "my_trt_op_0": "TRTEngineOp",
+          "TRTEngineOp_0": "TRTEngineOp",
           "output": "Identity"
       }, node_name_to_op)
 
@@ -188,11 +188,12 @@ class TrtConvertTest(test_util.TensorFlowTestCase):
     self.assertAllEqual([[[4.0]]] * batch_size, result)
     execute_engine_test_value = ("done" if expect_engine_is_run else "")
     execute_native_segment_test_value = ("" if expect_engine_is_run else "done")
-    self.assertEqual(execute_engine_test_value,
-                     trt_convert.get_test_value("my_trt_op_0:ExecuteTrtEngine"))
+    self.assertEqual(
+        execute_engine_test_value,
+        trt_convert.get_test_value("TRTEngineOp_0:ExecuteTrtEngine"))
     self.assertEqual(
         execute_native_segment_test_value,
-        trt_convert.get_test_value("my_trt_op_0:ExecuteNativeSegment"))
+        trt_convert.get_test_value("TRTEngineOp_0:ExecuteNativeSegment"))
 
   def testCreateInferenceGraph_MinimumSegmentSize(self):
     if not trt_convert.is_tensorrt_enabled():
diff --git a/tensorflow/contrib/tensorrt/resources/trt_resources.h b/tensorflow/contrib/tensorrt/resources/trt_resources.h
index 840da6e78d88392b3c1ef5c9f6e31a2f355d09f1..aac9e5c7bd725fc10bcaa04536ebc7be071b4d4c 100644
--- a/tensorflow/contrib/tensorrt/resources/trt_resources.h
+++ b/tensorflow/contrib/tensorrt/resources/trt_resources.h
@@ -39,7 +39,8 @@ namespace tensorrt {
 class TRTCalibrationResource : public tensorflow::ResourceBase {
  public:
   ~TRTCalibrationResource() {
-    VLOG(0) << "Destroying Calibration Resource " << std::endl << DebugString();
+    LOG(INFO) << "Destroying Calibration Resource " << std::endl
+              << DebugString();
     builder_.reset();
     engine_.reset();
     // We need to manually destroy the builder and engine before the allocator
diff --git a/tensorflow/contrib/tensorrt/segment/segment.cc b/tensorflow/contrib/tensorrt/segment/segment.cc
index 4f64b7a9522a177624baeb425ed643c5bff7e65f..6abc5226ccf96e472df77269bee6186726e5768d 100644
--- a/tensorflow/contrib/tensorrt/segment/segment.cc
+++ b/tensorflow/contrib/tensorrt/segment/segment.cc
@@ -33,6 +33,7 @@ namespace tensorflow {
 namespace tensorrt {
 namespace segment {
 using ::tensorflow::strings::StrAppend;
+using ::tensorflow::strings::StrCat;
 
 // A simple graph representation to mirror tensorflow::Graph. This structure
 // helps saving memory since segmenter modifies the graph in place, preventing
@@ -406,22 +407,42 @@ tensorflow::Status SegmentGraph(
   // Use a union-find to collect the nodes that belong to the same
   // segment. A node value of nullptr indicates that the node is not a candidate
   // for TRT.
+  std::unordered_set<string> unsupported_ops;
+  int num_unsupported_ops = 0;
   std::vector<UnionFind<SimpleNode*>> node_segments;
   for (int i = 0; i < graph->num_node_ids(); ++i) {
     SimpleNode* node = graph->FindNodeId(i);
     if (options.exclude_node_list.count(node->name()) != 0) {
-      VLOG(1) << "Not a TF-TRT candidate: " << node->name()
-              << " (excluded by segmenter option).";
+      VLOG(1) << "Not a TF-TRT candidate, "
+              << "(Op type: " << node->tf_node()->type_string() << "), "
+              << "(Op name: " << node->name() << "), "
+              << "(Reason: excluded by segmenter option)";
+      unsupported_ops.emplace(node->tf_node()->type_string());
+      num_unsupported_ops++;
       node = nullptr;
     } else {
       const Status status = candidate_fn(node->tf_node());
       if (!status.ok()) {
-        VLOG(1) << "Not a TF-TRT candidate: " << node->name() << ": " << status;
+        VLOG(1) << "Not a TF-TRT candidate, "
+                << "(Op type: " << node->tf_node()->type_string() << "), "
+                << "(Op name: " << node->name() << "), "
+                << "(Reason: " << status << ")";
+        unsupported_ops.emplace(node->tf_node()->type_string());
+        num_unsupported_ops++;
         node = nullptr;
       }
     }
     node_segments.emplace_back(node);
   }
+  string msg = StrCat(
+      "There are ", num_unsupported_ops, " ops of ", unsupported_ops.size(),
+      " different types in the graph that", " are not converted to TensorRT: ");
+  for (const auto& elem : unsupported_ops) {
+    StrAppend(&msg, elem, ", ");
+  }
+  LOG(INFO) << msg << "(For more information see "
+            << "https://docs.nvidia.com/deeplearning"
+            << "/dgx/integrate-tf-trt/index.html#support-ops).";
 
   // The segmentation algorithm below visits nodes in reverse topological order
   // and attempts to merge nodes along output edges. That means that subgraphs
diff --git a/tensorflow/contrib/tensorrt/test/base_test.py b/tensorflow/contrib/tensorrt/test/base_test.py
index 18096e0ff1ec6b9872346d8a84ac93c542cfb643..ff317e43e1e6ff1c0b869ae8dc6d1fda8f0ce126 100644
--- a/tensorflow/contrib/tensorrt/test/base_test.py
+++ b/tensorflow/contrib/tensorrt/test/base_test.py
@@ -56,8 +56,9 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
             strides=[1, 2, 2, 1],
             padding="SAME",
             name="conv")
-        bias = constant_op.constant(
-            [4., 1.5, 2., 3., 5., 7.], name="bias", dtype=dtype)
+        bias = constant_op.constant([4., 1.5, 2., 3., 5., 7.],
+                                    name="bias",
+                                    dtype=dtype)
         added = nn.bias_add(conv, bias, name="bias_add")
         relu = nn.relu(added, "relu")
         identity = array_ops.identity(relu, "identity")
@@ -73,11 +74,12 @@ class SimpleSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["weights", "conv", "bias", "bias_add",
-    #   "relu", "identity", "max_pool"]
-    return ["my_trt_op_0"]
+    return {
+        "TRTEngineOp_0": [
+            "weights", "conv", "bias", "bias_add", "relu", "identity",
+            "max_pool"
+        ]
+    }
 
 
 class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -92,7 +94,7 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
     g = ops.Graph()
     with g.as_default():
       inp = array_ops.placeholder(
-          dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+          dtype=dtype, shape=input_dims, name=input_name)
       with g.device("/GPU:0"):
         conv_filter = constant_op.constant(
             [[[[1., 0.5, 4., 6., 0.5, 1.], [1., 0.5, 1., 1., 0.5, 1.]]]],
@@ -105,10 +107,10 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
             padding="SAME",
             name="conv")
         c1 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c1")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c1")
         p = math_ops.mul(conv, c1, name="mul")
         c2 = constant_op.constant(
-            np.random.randn(input_dims[0], 12, 12, 6), dtype=dtype, name="c2")
+            np.random.randn(12, 12, 6), dtype=dtype, name="c2")
         q = math_ops.div(conv, c2, name="div")
 
         edge = self.trt_incompatible_op(q, name="incompatible")
@@ -129,22 +131,21 @@ class SimpleMultiEnginesTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    # TODO(aaroey): LayoutOptimizer adds additional nodes to the graph which
-    # breaks the connection check, fix it.
-    # - my_trt_op_0 should have ["mul", "sub", "div1", "mul1", "add1",
-    #   "add", "sub1"];
-    # - my_trt_op_1 should have ["weights","conv", "div"]
-    return ["my_trt_op_0", "my_trt_op_1"]
+    return {
+        "TRTEngineOp_0": [
+            "add", "add1", "c1", "div1", "mul", "mul1", "sub", "sub1"
+        ],
+        "TRTEngineOp_1": ["c2", "conv", "div", "weights"]
+    }
 
-  def ShouldRunTest(self, run_params):
-    # TODO(aaroey): LayoutOptimizer adds Transpose(Const, Const) to the graph
-    # which breaks the conversion. We should fix it as:
-    # - Detect the invalid NodeDef earlier before adding them to segment
-    # - Let it able to change the RewriterConfig when calling
-    #   create_inference_graph().
-    # It will be good to add debugging feature for Grappler to print the graph
-    # after running each optimizer.
-    return False
+  def GetConversionParams(self, run_params):
+    """Return a ConversionParams for test."""
+    return super(
+        SimpleMultiEnginesTest, self
+    ).GetConversionParams(run_params)._replace(
+        # Disable layout optimizer, since it'll add Transpose(Const, Const) to
+        # the graph and breaks the conversion check.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
 
 class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
@@ -153,7 +154,7 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Setup method."""
     super(PartiallyConvertedTestA, self).setUp()
     # Let it fail to build the second engine.
-    trt_convert.add_test_value("my_trt_op_1:CreateTRTNode", "fail")
+    trt_convert.add_test_value("TRTEngineOp_1:CreateTRTNode", "fail")
 
   def GetParams(self):
     """Create a graph containing two segment."""
@@ -190,14 +191,16 @@ class PartiallyConvertedTestA(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     return {
         # Only the first engine is built.
-        "my_trt_op_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
+        "TRTEngineOp_0": ["c0", "c1", "add0", "add1", "mul0", "mul1"]
     }
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
     # Disable the test in fp16 mode since multiple matmul and add ops together
     # can cause overflow.
-    return run_params.precision_mode != "FP16"
+    return ((run_params.precision_mode != "FP16") and
+            not (trt_test.IsQuantizationMode(run_params.precision_mode) and
+                 not run_params.use_calibration))
 
 
 class PartiallyConvertedTestB(PartiallyConvertedTestA):
@@ -207,13 +210,13 @@ class PartiallyConvertedTestB(PartiallyConvertedTestA):
     super(PartiallyConvertedTestB, self).setUp()
     # Let it fail to build the first engine.
     trt_convert.clear_test_values("")
-    trt_convert.add_test_value("my_trt_op_0:CreateTRTNode", "fail")
+    trt_convert.add_test_value("TRTEngineOp_0:CreateTRTNode", "fail")
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
         # Only the second engine is built.
-        "my_trt_op_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
+        "TRTEngineOp_1": ["c2", "c3", "add2", "add3", "mul2", "mul3"]
     }
 
 
@@ -257,8 +260,8 @@ class ConstInputTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["add", "add1", "mul"],
-        "my_trt_op_1": ["add2", "add3", "mul1"]
+        "TRTEngineOp_0": ["add", "add1", "mul"],
+        "TRTEngineOp_1": ["add2", "add3", "mul1"]
     }
 
 
@@ -289,7 +292,7 @@ class ConstDataInputSingleEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return {"my_trt_op_0": ["c", "add", "add1", "mul"]}
+    return {"TRTEngineOp_0": ["c", "add", "add1", "mul"]}
 
 
 class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
@@ -324,12 +327,12 @@ class ConstDataInputMultipleEnginesTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["add2", "add3", "mul1"],
+        "TRTEngineOp_0": ["add2", "add3", "mul1"],
         # Why segment ["add", "add1", "mul"] was assigned segment id 1
         # instead of 0: the parent node of this segment is actually const
         # node 'c', but it's removed later since it's const output of the
         # segment which is not allowed.
-        "my_trt_op_1": ["add", "add1", "mul"]
+        "TRTEngineOp_1": ["add", "add1", "mul"]
     }
 
 
@@ -373,8 +376,8 @@ class ControlDependencyTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["c1", "add", "add1", "mul"],
-        "my_trt_op_1": ["c2", "add2", "add3", "mul1"]
+        "TRTEngineOp_0": ["c1", "add", "add1", "mul"],
+        "TRTEngineOp_1": ["c2", "add2", "add3", "mul1"]
     }
 
 
diff --git a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
index 4b8880817876143dc753cfacdb79d4ad50347fe0..f42308ecb7c8f8a107e78008abd3f470ddc85975 100644
--- a/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/batch_matmul_test.py
@@ -79,12 +79,12 @@ class BatchMatMulTest(trt_test.TfTrtIntegrationTestBase):
     """Return the expected engines to build."""
     if (run_params.dynamic_engine and
         not trt_test.IsQuantizationMode(run_params.precision_mode)):
-      return ["my_trt_op_0", "my_trt_op_1"]
-    return ["my_trt_op_1"]
+      return ["TRTEngineOp_0", "TRTEngineOp_1"]
+    return ["TRTEngineOp_1"]
 
   def ExpectedEnginesToRun(self, run_params):
     """Return the expected engines to run."""
-    return ["my_trt_op_1"]
+    return ["TRTEngineOp_1"]
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
diff --git a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
index 7545bb9df20f295a8fdbc82b573cdb3407f8c5e4..053b38ff1c0578c58f39dd6dc0630d1401a105af 100644
--- a/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
+++ b/tensorflow/contrib/tensorrt/test/biasadd_matmul_test.py
@@ -41,6 +41,7 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
     input_name = "input"
     input_matrix_rows = 4
     input_matrix_columns = 144
+    # Note that tf.nn.bias_add supports up to 5 dimensions.
     input_dims = [input_matrix_rows, input_matrix_columns]
     output_name = "output"
     g = ops.Graph()
@@ -74,18 +75,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
       x5 = nn.bias_add(x5, b)
       x5 = gen_array_ops.reshape(x5, [4, -1])
 
-      x6 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = self._ConstOp((12,))
+      x6 = gen_array_ops.reshape(x, [4, 24, 6])
+      b = self._ConstOp((6,))
       x6 = nn.bias_add(x6, b, data_format="NHWC")
       x6 = gen_array_ops.reshape(x6, [4, -1])
 
-      x7 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = self._ConstOp((4,))
+      x7 = gen_array_ops.reshape(x, [4, 12, 4, 3])
+      b = self._ConstOp((3,))
       x7 = nn.bias_add(x7, b, data_format="NHWC")
       x7 = gen_array_ops.reshape(x7, [4, -1])
 
-      x8 = gen_array_ops.reshape(x, [4, 12, 3, 2, 2])
-      b = self._ConstOp((2,))
+      x8 = gen_array_ops.reshape(x, [4, 4, 3, 2, 6])
+      b = self._ConstOp((6,))
       x8 = nn.bias_add(x8, b, data_format="NHWC")
       x8 = gen_array_ops.reshape(x8, [4, -1])
 
@@ -94,13 +95,13 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
       x9 = nn.bias_add(x9, b, data_format="NCHW")
       x9 = gen_array_ops.reshape(x9, [4, -1])
 
-      x10 = gen_array_ops.reshape(x, [4, 12, 3, 4])
-      b = self._ConstOp((12,))
+      x10 = gen_array_ops.reshape(x, [4, 3, 4, 12])
+      b = self._ConstOp((3,))
       x10 = nn.bias_add(x10, b, data_format="NCHW")
       x10 = gen_array_ops.reshape(x10, [4, -1])
 
-      x11 = gen_array_ops.reshape(x, [4, 12, 12])
-      b = self._ConstOp((12,))
+      x11 = gen_array_ops.reshape(x, [4, 6, 24])
+      b = self._ConstOp((6,))
       x11 = nn.bias_add(x11, b, data_format="NCHW")
       x11 = gen_array_ops.reshape(x11, [4, -1])
 
@@ -116,13 +117,18 @@ class BiasaddMatMulTest(trt_test.TfTrtIntegrationTestBase):
 
   def GetConversionParams(self, run_params):
     """Return a ConversionParams for test."""
-    return super(BiasaddMatMulTest,
-                 self).GetConversionParams(run_params)._replace(
-                     max_batch_size=4, maximum_cached_engines=1)
+    conversion_params = super(BiasaddMatMulTest,
+                              self).GetConversionParams(run_params)
+    return conversion_params._replace(
+        max_batch_size=4,
+        maximum_cached_engines=1,
+        # Disable layout optimizer, since it will convert BiasAdd with NHWC
+        # format to NCHW format under four dimentional input.
+        rewriter_config=trt_test.OptimizerDisabledRewriterConfig())
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
diff --git a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
index b53cb3c091ea477ef0974d9d14d82c587a431152..169835956c046dd675e967daa05fd81405662e38 100644
--- a/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/binary_tensor_weight_broadcast_test.py
@@ -26,7 +26,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -56,10 +55,10 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
       ]:
         a = self._ConstOp(weights_shape)
         f = x + a
-        x = math_ops.sigmoid(f)
+        x = self.trt_incompatible_op(f)
         a = self._ConstOp(weights_shape)
         f = a + x
-        x = math_ops.sigmoid(f)
+        x = self.trt_incompatible_op(f)
       gen_array_ops.reshape(x, [5, -1], name=output_name)
     return trt_test.TfTrtIntegrationTestParams(
         gdef=g.as_graph_def(),
@@ -70,7 +69,7 @@ class BinaryTensorWeightBroadcastTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_%d" % i for i in range(16)]
+    return ["TRTEngineOp_%d" % i for i in range(16)]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/concatenation_test.py b/tensorflow/contrib/tensorrt/test/concatenation_test.py
index 465cb022964df046bf03a481bb1c6b65750aa883..c3576f81d97afe7e0e42cd10413971911e97774c 100644
--- a/tensorflow/contrib/tensorrt/test/concatenation_test.py
+++ b/tensorflow/contrib/tensorrt/test/concatenation_test.py
@@ -79,7 +79,7 @@ class ConcatenationTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
index e32f0478661caaab5386339c819b524656baf066..c1c883312d867b60b88ac14318041f9750ca41e6 100644
--- a/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
+++ b/tensorflow/contrib/tensorrt/test/const_broadcast_test.py
@@ -64,7 +64,7 @@ class ConstBroadcastTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ['my_trt_op_0']
+    return ['TRTEngineOp_0']
 
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
diff --git a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
index bc7c90081ff38a832b523948db10c02de7acefc2..104bac43a0b1166dcddee9920991582f33e93316 100644
--- a/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
+++ b/tensorflow/contrib/tensorrt/test/memory_alignment_test.py
@@ -68,7 +68,7 @@ class MemoryAlignmentTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
   def ExpectedAbsoluteTolerance(self, run_params):
     """The absolute tolerance to compare floating point results."""
diff --git a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
index 11be4feaf7bf8ce6c8bd16f1546dc17450c342f1..293f93d8a78bc8ab06002d6fc01cb8d6a0738698 100644
--- a/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/multi_connection_neighbor_engine_test.py
@@ -25,8 +25,6 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_math_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
 from tensorflow.python.platform import test
 
@@ -60,14 +58,14 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
       b = constant_op.constant(
           np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
       q = conv - b
-      edge = math_ops.sigmoid(q)
+      edge = self.trt_incompatible_op(q)
 
       b = constant_op.constant(
           np.random.normal(5.0, 1.0, [1, 4, 1, 1]), name="bias", dtype=dtype)
       d = b + conv
-      edge3 = math_ops.sigmoid(d)
+      edge3 = self.trt_incompatible_op(d)
 
-      edge1 = gen_math_ops.tan(conv)
+      edge1 = self.trt_incompatible_op(conv)
       t = t - edge1
       q = q + edge
       t = t + q
@@ -83,7 +81,7 @@ class MultiConnectionNeighborEngineTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0", "my_trt_op_1"]
+    return ["TRTEngineOp_0", "TRTEngineOp_1"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
index eddeafa38bc71743ac6c9d8e5e8db76f28ca7bf4..3e1e4b088ba200db2184dd64092cbc642a17cb3a 100644
--- a/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
+++ b/tensorflow/contrib/tensorrt/test/neighboring_engine_test.py
@@ -66,8 +66,8 @@ class NeighboringEngineTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["bias", "mul", "sub"],
-        "my_trt_op_1": ["weights", "conv"]
+        "TRTEngineOp_0": ["bias", "mul", "sub"],
+        "TRTEngineOp_1": ["weights", "conv"]
     }
 
 
diff --git a/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31cbef89e23949ba5ceaab34e0f683fd906bf0ce
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/quantization_mnist_test.py
@@ -0,0 +1,290 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Script to test TF-TRT INT8 conversion without calibration on Mnist model."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+# pylint: disable=unused-import
+from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
+# pylint: enable=unused-import
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python import data
+from tensorflow.python import keras
+from tensorflow.python.estimator.estimator import Estimator
+from tensorflow.python.estimator.model_fn import EstimatorSpec
+from tensorflow.python.estimator.model_fn import ModeKeys
+from tensorflow.python.estimator.run_config import RunConfig
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import graph_util
+from tensorflow.python.framework import importer
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.datasets import mnist
+from tensorflow.python.layers import layers
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import metrics
+from tensorflow.python.ops import nn
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.summary import summary
+from tensorflow.python.training import saver
+from tensorflow.python.training.adam import AdamOptimizer
+from tensorflow.python.training.checkpoint_management import latest_checkpoint
+from tensorflow.python.training.training_util import get_global_step
+
+INPUT_NODE_NAME = 'input'
+OUTPUT_NODE_NAME = 'output'
+
+
+class QuantizationAwareTrainingMNISTTest(test_util.TensorFlowTestCase):
+
+  def _BuildGraph(self, x):
+
+    def _Quantize(x, r):
+      x = gen_array_ops.quantize_and_dequantize_v2(x, -r, r)
+      return x
+
+    def _DenseLayer(x, num_inputs, num_outputs, quantization_range, name):
+      """Dense layer with quantized outputs.
+
+      Args:
+        x: input to the dense layer
+        num_inputs: number of input columns of x
+        num_outputs: number of output columns
+        quantization_range: the min/max range for quantization
+        name: name of the variable scope
+
+      Returns:
+        The output of the layer.
+      """
+      with variable_scope.variable_scope(name):
+        kernel = variable_scope.get_variable(
+            'kernel',
+            shape=[num_inputs, num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.glorot_uniform())
+        bias = variable_scope.get_variable(
+            'bias',
+            shape=[num_outputs],
+            dtype=dtypes.float32,
+            initializer=keras.initializers.zeros())
+        x = math_ops.matmul(x, kernel)
+        x = _Quantize(x, quantization_range)
+        x = nn.bias_add(x, bias)
+        x = _Quantize(x, quantization_range)
+      return x
+
+    x = _Quantize(x, 1)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=32, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Conv + Bias + Relu6
+    x = layers.conv2d(x, filters=64, kernel_size=3, use_bias=True)
+    x = nn.relu6(x)
+    # Reduce
+    x = math_ops.reduce_mean(x, [1, 2])
+    x = _Quantize(x, 6)
+    # FC1
+    x = _DenseLayer(x, 64, 512, 6, name='dense')
+    x = nn.relu6(x)
+    # FC2
+    x = _DenseLayer(x, 512, 10, 25, name='dense_1')
+    x = array_ops.identity(x, name=OUTPUT_NODE_NAME)
+    return x
+
+  def _GetGraphDef(self, use_trt, max_batch_size, model_dir):
+    """Get the frozen mnist GraphDef.
+
+    Args:
+      use_trt: whether use TF-TRT to convert the graph.
+      max_batch_size: the max batch size to apply during TF-TRT conversion.
+      model_dir: the model directory to load the checkpoints.
+
+    Returns:
+      The frozen mnist GraphDef.
+    """
+    graph = ops.Graph()
+    with self.session(graph=graph) as sess:
+      with graph.device('/GPU:0'):
+        x = array_ops.placeholder(
+            shape=(None, 28, 28, 1), dtype=dtypes.float32, name=INPUT_NODE_NAME)
+        self._BuildGraph(x)
+      # Load weights
+      mnist_saver = saver.Saver()
+      checkpoint_file = latest_checkpoint(model_dir)
+      mnist_saver.restore(sess, checkpoint_file)
+      # Freeze
+      graph_def = graph_util.convert_variables_to_constants(
+          sess, sess.graph_def, output_node_names=[OUTPUT_NODE_NAME])
+    # Convert with TF-TRT
+    if use_trt:
+      logging.info('Number of nodes before TF-TRT conversion: %d',
+                   len(graph_def.node))
+      graph_def = trt_convert.create_inference_graph(
+          graph_def,
+          outputs=[OUTPUT_NODE_NAME],
+          max_batch_size=max_batch_size,
+          precision_mode='INT8',
+          max_workspace_size_bytes=4096 << 19,
+          minimum_segment_size=2,
+          use_calibration=False,
+      )
+      logging.info('Number of nodes after TF-TRT conversion: %d',
+                   len(graph_def.node))
+      num_engines = len(
+          [1 for n in graph_def.node if str(n.op) == 'TRTEngineOp'])
+      self.assertEqual(1, num_engines)
+    return graph_def
+
+  def _Run(self, is_training, use_trt, batch_size, num_epochs, model_dir):
+    """Train or evaluate the model.
+
+    Args:
+      is_training: whether to train or evaluate the model. In training mode,
+        quantization will be simulated where the quantize_and_dequantize_v2 are
+        placed.
+      use_trt: if true, use TRT INT8 mode for evaluation, which will perform
+        real quantization. Otherwise use native TensorFlow which will perform
+        simulated quantization. Ignored if is_training is True.
+      batch_size: batch size.
+      num_epochs: how many epochs to train. Ignored if is_training is False.
+      model_dir: where to save or load checkpoint.
+
+    Returns:
+      The Estimator evaluation result.
+    """
+    # Get dataset
+    train_data, test_data = mnist.load_data()
+
+    def _PreprocessFn(x, y):
+      x = math_ops.cast(x, dtypes.float32)
+      x = array_ops.expand_dims(x, axis=2)
+      x = 2.0 * (x / 255.0) - 1.0
+      y = math_ops.cast(y, dtypes.int32)
+      return x, y
+
+    def _EvalInputFn():
+      mnist_x, mnist_y = test_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=1)
+      iterator = data.make_one_shot_iterator(dataset)
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _TrainInputFn():
+      mnist_x, mnist_y = train_data
+      dataset = data.Dataset.from_tensor_slices((mnist_x, mnist_y))
+      dataset = dataset.shuffle(2 * len(mnist_x))
+      dataset = dataset.apply(
+          data.experimental.map_and_batch(
+              map_func=_PreprocessFn,
+              batch_size=batch_size,
+              num_parallel_calls=8))
+      dataset = dataset.repeat(count=num_epochs)
+      iterator = data.make_one_shot_iterator(dataset)
+      features, labels = iterator.get_next()
+      return features, labels
+
+    def _ModelFn(features, labels, mode):
+      if is_training:
+        logits_out = self._BuildGraph(features)
+      else:
+        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
+        logits_out = importer.import_graph_def(
+            graph_def,
+            input_map={INPUT_NODE_NAME: features},
+            return_elements=[OUTPUT_NODE_NAME + ':0'],
+            name='')[0]
+
+      loss = losses.sparse_softmax_cross_entropy(
+          labels=labels, logits=logits_out)
+      summary.scalar('loss', loss)
+
+      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
+      accuracy = metrics.accuracy(
+          labels=labels, predictions=classes_out, name='acc_op')
+      summary.scalar('accuracy', accuracy[1])
+
+      if mode == ModeKeys.EVAL:
+        return EstimatorSpec(
+            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
+      elif mode == ModeKeys.TRAIN:
+        optimizer = AdamOptimizer(learning_rate=1e-2)
+        train_op = optimizer.minimize(loss, global_step=get_global_step())
+        return EstimatorSpec(mode, loss=loss, train_op=train_op)
+
+    config_proto = config_pb2.ConfigProto()
+    config_proto.gpu_options.allow_growth = True
+    estimator = Estimator(
+        model_fn=_ModelFn,
+        model_dir=model_dir if is_training else None,
+        config=RunConfig(session_config=config_proto))
+
+    if is_training:
+      estimator.train(_TrainInputFn)
+    results = estimator.evaluate(_EvalInputFn)
+    logging.info('accuracy: %s', str(results['accuracy']))
+    return results
+
+  # To generate the checkpoint, set a different model_dir and call self._Run()
+  # by setting is_training=True and num_epochs=1000, e.g.:
+  # model_dir = '/tmp/quantization_mnist'
+  # self._Run(
+  #     is_training=True,
+  #     use_trt=False,
+  #     batch_size=128,
+  #     num_epochs=100,
+  #     model_dir=model_dir)
+  def testEval(self):
+    if not trt_convert.is_tensorrt_enabled():
+      return
+    model_dir = test.test_src_dir_path('contrib/tensorrt/test/testdata')
+
+    accuracy_tf_native = self._Run(
+        is_training=False,
+        use_trt=False,
+        batch_size=128,
+        num_epochs=None,
+        model_dir=model_dir)['accuracy']
+    logging.info('accuracy_tf_native: %f', accuracy_tf_native)
+    self.assertAllClose(accuracy_tf_native, 0.9662)
+
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return
+
+    accuracy_tf_trt = self._Run(
+        is_training=False,
+        use_trt=True,
+        batch_size=128,
+        num_epochs=None,
+        model_dir=model_dir)['accuracy']
+    logging.info('accuracy_tf_trt: %f', accuracy_tf_trt)
+    self.assertAllClose(accuracy_tf_trt, 0.9677)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/quantization_test.py b/tensorflow/contrib/tensorrt/test/quantization_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e425a3674635650d7292ab072178e98932e6b824
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/quantization_test.py
@@ -0,0 +1,144 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Model script to test TF-TensorRT integration."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.contrib.tensorrt.python import trt_convert
+from tensorflow.contrib.tensorrt.test import tf_trt_integration_test_base as trt_test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def _GetParams(add_quantization_nodes, dtype=dtypes.float32):
+  input_name = "input"
+  input_dims = [8, 8]
+  output_name = "output"
+
+  def _Quantize(x, r):
+    if add_quantization_nodes:
+      x = gen_array_ops.fake_quant_with_min_max_vars(x, -r, r)
+    return x
+
+  g = ops.Graph()
+  with g.as_default():
+    x = array_ops.placeholder(
+        dtype=dtype, shape=[None] + input_dims[1:], name=input_name)
+    x = _Quantize(x, 10.0)
+    x = x + 5
+    x = _Quantize(x, 15.0)
+    x = x - 5
+    x = _Quantize(x, 10.0)
+    x = x * 0.1
+    x = _Quantize(x, 1.0)
+    w = constant_op.constant(np.ones((8, 1)), dtype=dtypes.float32)
+    x = math_ops.matmul(x, w)
+    x = _Quantize(x, 10.0)
+    x = array_ops.identity(x, name=output_name)
+
+  return trt_test.TfTrtIntegrationTestParams(
+      gdef=g.as_graph_def(),
+      input_names=[input_name],
+      input_dims=[input_dims],
+      output_names=[output_name],
+      expected_output_dims=[(8, 1)])
+
+
+class QuantizationMissingAllRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=False)
+
+  def ShouldRunTest(self, run_params):
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Only test static engine mode, with or without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.use_optimizer and not run_params.dynamic_engine)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    if run_params.use_calibration:
+      # In static engine mode with calibration, it should build a calibration
+      # engine.
+      return ["TRTEngineOp_0"]
+    # In static engine mode without calibration, the engine building will fail
+    # since no quantization ranges are set, which results in no TRT nodes.
+    return []
+
+
+class QuantizationWithRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=True)
+
+  def ShouldRunTest(self, run_params):
+    if trt_convert.get_linked_tensorrt_version()[0] < 5:
+      return False
+    # Test static/dynamic engine with/without calibration.
+    return (trt_test.IsQuantizationMode(run_params.precision_mode) and
+            not run_params.use_optimizer)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    return ["TRTEngineOp_0"]
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+class NonQuantizedPrecisionsWithRangesTest(trt_test.TfTrtIntegrationTestBase):
+
+  def GetParams(self):
+    """Create a graph containing single segment with no quantization ranges."""
+    return _GetParams(add_quantization_nodes=True)
+
+  def ShouldRunTest(self, run_params):
+    # Only test FP32/FP16 mode.
+    return not trt_test.IsQuantizationMode(run_params.precision_mode)
+
+  def ExpectedEnginesToBuild(self, run_params):
+    """Return the expected engines to build."""
+    # The fake quant ops are not supported in FP32/FP16 mode, and will split the
+    # graph into three TRT segments.
+    return ["TRTEngineOp_0", "TRTEngineOp_1", "TRTEngineOp_2", "TRTEngineOp_3"]
+
+  def ExpectedAbsoluteTolerance(self, run_params):
+    """The absolute tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+  def ExpectedRelativeTolerance(self, run_params):
+    """The relative tolerance to compare floating point results."""
+    return 1.e-05 if run_params.precision_mode == "FP32" else 1.e-01
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/contrib/tensorrt/test/rank_two_test.py b/tensorflow/contrib/tensorrt/test/rank_two_test.py
index 74a4a059257ffde4c86df1f18b3ce35c3790ec7a..563232fc12675d9e1b32b7ab461591af57beadb9 100644
--- a/tensorflow/contrib/tensorrt/test/rank_two_test.py
+++ b/tensorflow/contrib/tensorrt/test/rank_two_test.py
@@ -51,8 +51,10 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
         c = constant_op.constant(3.0, name="c%d_3" % i)
         q = math_ops.add(q, c, name="add%d_3" % i)
         if i == 0:
+          axis = constant_op.constant(-1, dtype=dtypes.int32, name="axis")
           for j in range(2):
-            q = array_ops.expand_dims(q, -1, name="expand%d_%d" % (i, j))
+            q = array_ops.expand_dims(q, axis, name="expand%d_%d" % (i, j))
+          q = self.trt_incompatible_op(q)
         q = gen_math_ops.reciprocal(q, name="reciprocal%d" % i)
         outputs.append(q)
       # Combine both paths
@@ -68,11 +70,11 @@ class RankTwoTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": [
+        "TRTEngineOp_0": [
             "add0_1", "add0_2", "add0_3", "c0_1", "c0_2", "c0_3", "abs0_1",
-            "abs0_2"
+            "abs0_2", "expand0_0", "expand0_1", "axis"
         ],
-        "my_trt_op_1": [
+        "TRTEngineOp_1": [
             "add", "add1_1", "add1_2", "add1_3", "c1_1", "c1_2", "c1_3",
             "abs1_1", "abs1_2", "reciprocal0", "reciprocal1"
         ],
diff --git a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
index bbc724ab18e18be3e831732071a31f0a541a4059..207944468ab0b038abfe01f0096d7dc220d064ed 100644
--- a/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
+++ b/tensorflow/contrib/tensorrt/test/reshape_transpose_test.py
@@ -79,8 +79,8 @@ class ReshapeTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": ["reshape-%d" % i for i in range(7)] +
-                       ["reshape-%d/shape" % i for i in range(7)]
+        "TRTEngineOp_0": ["reshape-%d" % i for i in range(7)] +
+                         ["reshape-%d/shape" % i for i in range(7)]
     }
 
   def ShouldRunTest(self, run_params):
@@ -117,7 +117,7 @@ class TransposeTest(trt_test.TfTrtIntegrationTestBase):
         # Note: by default Grappler will run the TRT optimizer twice. At the
         # first time it will group the two transpose ops below to same segment
         # then fail the conversion due to the expected batch dimension problem.
-        # At the second time, since the input of bridge op is my_trt_op_0, it
+        # At the second time, since the input of bridge op is TRTEngineOp_0, it
         # will fail to do shape inference which then cause conversion to fail.
         # TODO(laigd): support shape inference, make TRT optimizer run only
         # once, and fix this.
@@ -136,7 +136,7 @@ class TransposeTest(trt_test.TfTrtIntegrationTestBase):
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
     return {
-        "my_trt_op_0": [
+        "TRTEngineOp_0": [
             "transpose-1", "transpose-1/perm", "transposeback",
             "transposeback/perm"
         ]
diff --git a/tensorflow/contrib/tensorrt/test/testdata/checkpoint b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
new file mode 100644
index 0000000000000000000000000000000000000000..a603e1aec91adab04fd9801ba05a2ee9adfbb6e8
--- /dev/null
+++ b/tensorflow/contrib/tensorrt/test/testdata/checkpoint
@@ -0,0 +1,3 @@
+model_checkpoint_path: "model.ckpt-46900"
+all_model_checkpoint_paths: "model.ckpt-0"
+all_model_checkpoint_paths: "model.ckpt-46900"
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001
new file mode 100644
index 0000000000000000000000000000000000000000..88a998f184b275121e1e76eb51d2310da149f10a
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.data-00000-of-00001 differ
diff --git a/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index
new file mode 100644
index 0000000000000000000000000000000000000000..537976571337508ab1798d33646c51d62a146ecc
Binary files /dev/null and b/tensorflow/contrib/tensorrt/test/testdata/model.ckpt-46900.index differ
diff --git a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
index a725d0651c92fe18bcfd284cffd40cdfec2e6c69..495a9391a1e818a6078988161c9bf72f6143737f 100644
--- a/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
+++ b/tensorflow/contrib/tensorrt/test/tf_trt_integration_test_base.py
@@ -30,6 +30,7 @@ from tensorflow.contrib.tensorrt.python import trt_convert
 from tensorflow.contrib.tensorrt.python.ops import trt_engine_op
 # pylint: enable=unused-import
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import graph_io
 from tensorflow.python.framework import importer
@@ -42,14 +43,15 @@ TfTrtIntegrationTestParams = namedtuple("TfTrtIntegrationTestParams", [
     "gdef", "input_names", "input_dims", "output_names", "expected_output_dims"
 ])
 
-RunParams = namedtuple(
-    "RunParams",
-    ["use_optimizer", "precision_mode", "dynamic_engine", "test_name"])
+RunParams = namedtuple("RunParams", [
+    "use_optimizer", "precision_mode", "dynamic_engine", "test_name",
+    "use_calibration"
+])
 
 ConversionParams = namedtuple("ConversionParams", [
     "max_batch_size", "max_workspace_size_bytes", "precision_mode",
     "minimum_segment_size", "is_dynamic_op", "maximum_cached_engines",
-    "cached_engine_batch_sizes", "rewriter_config"
+    "cached_engine_batch_sizes", "rewriter_config", "use_calibration"
 ])
 
 PRECISION_MODES = ["FP32", "FP16", "INT8"]
@@ -65,6 +67,34 @@ class GraphState(object):
   INFERENCE = 2
 
 
+def OptimizerDisabledRewriterConfig():
+  """Returns a RewriterConfig with all default Grappler optimizers disabled."""
+  rewriter_config = rewriter_config_pb2.RewriterConfig()
+
+  # Turn off all default Grappler optimizers.
+  off = rewriter_config_pb2.RewriterConfig.OFF
+  rewriter_config.layout_optimizer = off
+  rewriter_config.constant_folding = off
+  rewriter_config.shape_optimization = off
+  rewriter_config.remapping = off
+  rewriter_config.arithmetic_optimization = off
+  rewriter_config.dependency_optimization = off
+  rewriter_config.loop_optimization = off
+  rewriter_config.function_optimization = off
+  rewriter_config.debug_stripper = off
+  rewriter_config.disable_model_pruning = True
+  rewriter_config.scoped_allocator_optimization = off
+  rewriter_config.memory_optimization = (
+      rewriter_config_pb2.RewriterConfig.NO_MEM_OPT)
+  rewriter_config.pin_to_host_optimization = off
+  rewriter_config.auto_parallel.enable = False
+
+  # Run only once for each enabled optimizer.
+  rewriter_config.meta_optimizer_iterations = (
+      rewriter_config_pb2.RewriterConfig.ONE)
+  return rewriter_config
+
+
 class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   """Class to test Tensorflow-TensorRT integration."""
 
@@ -139,11 +169,15 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_op=run_params.dynamic_engine,
         maximum_cached_engines=1,
         cached_engine_batch_sizes=None,
-        rewriter_config=None)
+        rewriter_config=None,
+        use_calibration=run_params.use_calibration)
 
   def ShouldRunTest(self, run_params):
     """Whether to run the test."""
-    return True
+    # This setting combination requires quantization nodes to be present in
+    # order to build the engine.
+    return not (IsQuantizationMode(run_params.precision_mode) and
+                not run_params.use_calibration)
 
   def VerifyRunForEngine(self, engine_name, graph_state, expect_run=True):
     """Verify the state of a particular engine after sess.run()."""
@@ -194,34 +228,35 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
   def _PrepareRun(self, graph_state):
     """Set up necessary testing environment before calling sess.run()."""
     # Clear test values added by TRTEngineOp.
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteTrtEngine")
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteCalibration")
-    trt_convert.clear_test_values("my_trt_op_.*:ExecuteNativeSegment")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteTrtEngine")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteCalibration")
+    trt_convert.clear_test_values("TRTEngineOp_.*:ExecuteNativeSegment")
+
+  def _GetGPUOptions(self):
+    gpu_options = config_pb2.GPUOptions()
+    gpu_options.allow_growth = True
+    return gpu_options
 
   def _GetConfigProto(self, run_params, graph_state):
     """Get config proto based on specific settings."""
     if graph_state != GraphState.ORIGINAL and run_params.use_optimizer:
       conversion_params = self.GetConversionParams(run_params)
-      rewriter_cfg = trt_convert.tensorrt_rewriter_config(
+      rewriter_cfg = trt_convert.get_tensorrt_rewriter_config(
           conversion_params.rewriter_config, conversion_params.max_batch_size,
           conversion_params.max_workspace_size_bytes,
           conversion_params.precision_mode,
           conversion_params.minimum_segment_size,
           conversion_params.is_dynamic_op,
           conversion_params.maximum_cached_engines,
-          conversion_params.cached_engine_batch_sizes)
+          conversion_params.cached_engine_batch_sizes,
+          conversion_params.use_calibration)
 
       graph_options = config_pb2.GraphOptions(rewrite_options=rewriter_cfg)
     else:
       graph_options = config_pb2.GraphOptions()
 
-    gpu_options = config_pb2.GPUOptions()
-    gpu_options.allow_growth = True
-    if trt_convert.get_linked_tensorrt_version()[0] == 3:
-      gpu_options.per_process_gpu_memory_fraction = 0.50
-
     config = config_pb2.ConfigProto(
-        gpu_options=gpu_options, graph_options=graph_options)
+        gpu_options=self._GetGPUOptions(), graph_options=graph_options)
     return config
 
   def _ExpectTestValue(self, engine_name, method, expected_value):
@@ -291,6 +326,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
     params = self._GetParamsCached()
     conversion_params = self.GetConversionParams(run_params)
     logging.info(conversion_params)
+
+    config_for_trt = config_pb2.ConfigProto(gpu_options=self._GetGPUOptions())
+    if conversion_params.rewriter_config is not None:
+      config_for_trt.graph_options.rewrite_options.CopyFrom(
+          conversion_params.rewriter_config)
     return trt_convert.create_inference_graph(
         input_graph_def=gdef,
         outputs=params.input_names + params.output_names,
@@ -301,7 +341,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_op=conversion_params.is_dynamic_op,
         maximum_cached_engines=conversion_params.maximum_cached_engines,
         cached_engine_batch_sizes=conversion_params.cached_engine_batch_sizes,
-        rewriter_config=conversion_params.rewriter_config)
+        use_calibration=conversion_params.use_calibration,
+        session_config=config_for_trt)
 
   def _WriteGraph(self, run_params, gdef, graph_state):
     if graph_state == GraphState.ORIGINAL:
@@ -400,10 +441,12 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
         is_dynamic_engine = not node.attr["static_engine"].b
         self.assertEqual(run_params.dynamic_engine, is_dynamic_engine,
                          node.name)
+        self.assertEqual(node.attr["use_calibration"].b,
+                         run_params.use_calibration, node.name)
 
         has_calibration_data = len(node.attr["calibration_data"].s)
         if (IsQuantizationMode(run_params.precision_mode) and
-            graph_state == GraphState.INFERENCE):
+            run_params.use_calibration and graph_state == GraphState.INFERENCE):
           self.assertTrue(has_calibration_data, node.name)
         else:
           self.assertFalse(has_calibration_data, node.name)
@@ -438,6 +481,11 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
       # types.
       scale = 10.0 if np.issubdtype(dtype, np.integer) else 1.0
       dims = params.input_dims[i]
+      # TODO(laigd): add debug options. E.g. we can set the input data to be
+      # continuous natural numbers:
+      # seq = np.arange(np.prod(dims))
+      # seq.resize(dims)
+      # input_data.append(scale * seq.astype(dtype))
       input_data.append((scale * np.random.random_sample(dims)).astype(dtype))
     self._VerifyGraphDef(run_params, input_gdef, GraphState.ORIGINAL)
 
@@ -449,7 +497,8 @@ class TfTrtIntegrationTestBase(test_util.TensorFlowTestCase):
                                 config_no_trt, GraphState.ORIGINAL)
 
     # Run calibration if necessary.
-    if IsQuantizationMode(run_params.precision_mode):
+    if (IsQuantizationMode(run_params.precision_mode) and
+        run_params.use_calibration):
 
       calib_config = self._GetConfigProto(run_params, GraphState.CALIBRATE)
       logging.info("Running calibration graph, config:\n%s", str(calib_config))
@@ -519,27 +568,38 @@ def _AddTests(test_class):
 
   use_optimizer_options = [False, True]
   dynamic_engine_options = [False, True]
-  for (use_optimizer, precision_mode, dynamic_engine) in itertools.product(
-      use_optimizer_options, PRECISION_MODES, dynamic_engine_options):
+  use_calibration_options = [False, True]
+  opts = itertools.product(use_optimizer_options, PRECISION_MODES,
+                           dynamic_engine_options, use_calibration_options)
+  for (use_optimizer, precision_mode, dynamic_engine, use_calibration) in opts:
     if IsQuantizationMode(precision_mode):
       if use_optimizer:
         # TODO(aaroey): if use_optimizer is True we need to get the inference
         # graphdef using custom python wrapper class, which is not currently
         # supported yet.
         continue
-      if not dynamic_engine:
+      if use_calibration and not dynamic_engine:
+        # Static engine with use_calibration=False will be static, so we want to
+        # test that. If use_calibration=True, only dynamic op is supported.
         # TODO(aaroey): construction of static calibration engine is not
         # supported yet.
         continue
+    else:
+      if use_calibration:
+        # Don't calibrate in FP32 or FP16 mode
+        continue
 
     conversion = "OptimizerConversion" if use_optimizer else "ToolConversion"
-    engine_type = ("DynamicEngine" if dynamic_engine else "StaticEngine")
-    test_name = "%s_%s_%s" % (conversion, precision_mode, engine_type)
+    engine_type = "DynamicEngine" if dynamic_engine else "StaticEngine"
+    calibration_type = "UseCalibration" if use_calibration else "NoCalibration"
+    test_name = "%s_%s_%s_%s" % (conversion, engine_type, precision_mode,
+                                 calibration_type)
     run_params = RunParams(
         use_optimizer=use_optimizer,
         precision_mode=precision_mode,
         dynamic_engine=dynamic_engine,
-        test_name=test_name)
+        test_name=test_name,
+        use_calibration=use_calibration)
     setattr(test_class, "testTfTrt_" + test_name, _GetTest(run_params))
 
 
diff --git a/tensorflow/contrib/tensorrt/test/unary_test.py b/tensorflow/contrib/tensorrt/test/unary_test.py
index 8736bfb6449b3c25a411ec081ad58b1f8be84617..b6e5e32db1236684a06c2d44298b9a3d39667152 100644
--- a/tensorflow/contrib/tensorrt/test/unary_test.py
+++ b/tensorflow/contrib/tensorrt/test/unary_test.py
@@ -106,10 +106,7 @@ class UnaryTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return [
-        "my_trt_op_0", "my_trt_op_1", "my_trt_op_2", "my_trt_op_3",
-        "my_trt_op_4"
-    ]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
index b0271a04b364864b841c2ec9fe53aac74611b2c3..b29626d2c28b4def716aef9e2703b669b5e46374 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_nchw_test.py
@@ -76,7 +76,7 @@ class VGGBlockNCHWTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tensorrt/test/vgg_block_test.py b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
index d7c165784bfe14bb5faffd266770328237a3eb80..9b0b189626050f678c71e9abbf7eb5296440d879 100644
--- a/tensorflow/contrib/tensorrt/test/vgg_block_test.py
+++ b/tensorflow/contrib/tensorrt/test/vgg_block_test.py
@@ -67,7 +67,7 @@ class VGGBlockTest(trt_test.TfTrtIntegrationTestBase):
 
   def ExpectedEnginesToBuild(self, run_params):
     """Return the expected engines to build."""
-    return ["my_trt_op_0"]
+    return ["TRTEngineOp_0"]
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/contrib/tfprof/README.md b/tensorflow/contrib/tfprof/README.md
index b29d1acacf17b57549558be45c853566817c1729..f40e76f554e8815aac96344d8cb0b911bafdd712 100644
--- a/tensorflow/contrib/tfprof/README.md
+++ b/tensorflow/contrib/tfprof/README.md
@@ -1,7 +1,5 @@
 # tfprof: TensorFlow Profiler and Beyond
 
-<h1>Please use `tf.profiler.xxx` instead of `tf.contrib.tfprof.xxx`</h1>
-
 <h1>Full Document in
 <a href="https://github.com/tensorflow/tensorflow/blob/master/tensorflow/core/profiler/README.md">tensorflow/core/profiler/README.md</a><h1>
 
diff --git a/tensorflow/contrib/timeseries/python/timeseries/BUILD b/tensorflow/contrib/timeseries/python/timeseries/BUILD
index c230919168b937b26c68e141e15f0762ad70f3e6..4b90b596b28efec83aa349782c4874d79b6817c7 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/BUILD
@@ -104,8 +104,10 @@ py_test(
     srcs = [
         "estimators_test.py",
     ],
+    shard_count = 3,
     srcs_version = "PY2AND3",
     tags = [
+        "no_mac",
         "no_pip_gpu",  # b/63391119
         "nomsan",  # Takes too long to run.
         "notsan",  # b/67865658
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators.py b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
index af68aa03cf6583dc474eda6cda2e648fa1c3d08d..146ed9f27134e3e2a6c74627b6b78e53d65155f0 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators.py
@@ -32,7 +32,7 @@ from tensorflow.contrib.timeseries.python.timeseries.state_space_models.filterin
 from tensorflow.python.estimator import estimator_lib
 from tensorflow.python.estimator.canned import optimizers
 from tensorflow.python.estimator.export import export_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
diff --git a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
index ffd838be40ed6267109fe36d95a681496fb2f964..7d780559f976516823611f3fe0ded056e4be088c 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/estimators_test.py
@@ -30,7 +30,7 @@ from tensorflow.contrib.timeseries.python.timeseries import saved_model_utils
 
 from tensorflow.python.client import session
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
diff --git a/tensorflow/contrib/timeseries/python/timeseries/head_test.py b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
index 90c7d8ac1a9c69216ece74af458cd750667f51ee..8f692d94da45bfaed6c72cf75d525346865aea34 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/head_test.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/head_test.py
@@ -38,7 +38,7 @@ from tensorflow.core.example import example_pb2
 
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.estimator import estimator_lib
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
diff --git a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
index 43c5267e632e464d43ffcbcf6c551ff83d3c5767..aab330643862c1ccf073d2a0e34e1c475b1ec15f 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/math_utils.py
@@ -802,7 +802,7 @@ class InputStatisticsFromMiniBatch(object):
             array_ops.shape(times)[1] - 1, self._dtype))
     # Co-locate updates with their variables to minimize race conditions when
     # updating statistics.
-    with ops.colocate_with(auxiliary_variables.max_time_seen):
+    with ops.device(auxiliary_variables.max_time_seen.device):
       # There is a race condition if this value is being updated from multiple
       # workers. However, it should eventually reach the correct value if the
       # last chunk is presented enough times.
@@ -810,16 +810,16 @@ class InputStatisticsFromMiniBatch(object):
           auxiliary_variables.max_time_seen,
           gen_math_ops.maximum(auxiliary_variables.max_time_seen,
                                math_ops.reduce_max(times)))
-    with ops.colocate_with(auxiliary_variables.chunk_count):
+    with ops.device(auxiliary_variables.chunk_count.device):
       chunk_count_assign = state_ops.assign_add(auxiliary_variables.chunk_count,
                                                 array_ops.shape(
                                                     times,
                                                     out_type=dtypes.int64)[0])
-    with ops.colocate_with(auxiliary_variables.inter_observation_duration_sum):
+    with ops.device(auxiliary_variables.inter_observation_duration_sum.device):
       inter_observation_duration_assign = state_ops.assign_add(
           auxiliary_variables.inter_observation_duration_sum,
           math_ops.reduce_sum(batch_inter_observation_duration))
-    with ops.colocate_with(auxiliary_variables.example_count):
+    with ops.device(auxiliary_variables.example_count.device):
       example_count_assign = state_ops.assign_add(
           auxiliary_variables.example_count,
           array_ops.size(times, out_type=dtypes.int64))
@@ -829,11 +829,11 @@ class InputStatisticsFromMiniBatch(object):
     # the series are then members of fewer chunks. For series which are much
     # longer than the chunk size (the usual/expected case), this effect becomes
     # irrelevant.
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum):
+    with ops.device(auxiliary_variables.overall_feature_sum.device):
       overall_feature_sum_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum,
           math_ops.reduce_sum(values, axis=[0, 1]))
-    with ops.colocate_with(auxiliary_variables.overall_feature_sum_of_squares):
+    with ops.device(auxiliary_variables.overall_feature_sum_of_squares.device):
       overall_feature_sum_of_squares_assign = state_ops.assign_add(
           auxiliary_variables.overall_feature_sum_of_squares,
           math_ops.reduce_sum(values**2, axis=[0, 1]))
@@ -869,7 +869,7 @@ class InputStatisticsFromMiniBatch(object):
             state_ops.assign(statistics.series_start_moments.mean, mean),
             state_ops.assign(statistics.series_start_moments.variance,
                              variance))
-      with ops.colocate_with(statistics.start_time):
+      with ops.device(statistics.start_time.device):
         series_start_update = control_flow_ops.cond(
             # Update moments whenever we even match the lowest time seen so far,
             # to ensure that series start statistics are eventually updated to
diff --git a/tensorflow/contrib/timeseries/python/timeseries/model.py b/tensorflow/contrib/timeseries/python/timeseries/model.py
index edd97b2a4c131dbce0a5111dbac7d40eddea2bae..a8cd4287e0003de300b7114cf3f88d21d3239e6e 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/model.py
+++ b/tensorflow/contrib/timeseries/python/timeseries/model.py
@@ -27,7 +27,7 @@ from tensorflow.contrib.timeseries.python.timeseries import math_utils
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import PredictionFeatures
 from tensorflow.contrib.timeseries.python.timeseries.feature_keys import TrainEvalFeatures
 
-from tensorflow.python.feature_column import feature_column
+from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
diff --git a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
index 3c07a74ed8af9e3ab70408f9b43cb62b6bd4c7f2..125750e7639ad40c481472a93353e6fb7055be96 100644
--- a/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
+++ b/tensorflow/contrib/timeseries/python/timeseries/state_space_models/BUILD
@@ -40,7 +40,10 @@ py_test(
     timeout = "long",  # Moderate but for asan
     srcs = ["state_space_model_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],  # TODO: needs investigation on Windows
+    tags = [
+        "no_mac",
+        "no_windows",  # TODO: needs investigation on Windows
+    ],
     deps = [
         ":state_space_model",
         "//tensorflow/contrib/layers:layers_py",
diff --git a/tensorflow/contrib/tpu/BUILD b/tensorflow/contrib/tpu/BUILD
index a0a9cb3f31a945a00eb3f6a5fd1402aab9a2df5f..4bf3a0463d9046eea2f60e9154fca1357e728215 100644
--- a/tensorflow/contrib/tpu/BUILD
+++ b/tensorflow/contrib/tpu/BUILD
@@ -14,6 +14,7 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 package(
     default_visibility = [
         "//cloud/vmm/testing/tests/tpu:__subpackages__",
+        "//knowledge/cerebra/sense/im2query:__subpackages__",
         "//learning/brain:__subpackages__",
         "//learning/deepmind:__subpackages__",
         "//medical/pathology:__subpackages__",
@@ -78,6 +79,7 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:summary",
         "//tensorflow/python:summary_ops_v2",
@@ -215,7 +217,7 @@ py_library(
     ],
     deps = [
         ":tpu_lib",
-        "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/distribute",
         "//tensorflow/contrib/framework:framework_py",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
@@ -263,7 +265,7 @@ py_library(
         ":tpu_py",
         "//tensorflow/compiler/xla/experimental/xla_sharding",
         "//tensorflow/compiler/xla/python_api:xla_shape",
-        "//tensorflow/contrib/cluster_resolver:tpu_cluster_resolver_py",
+        "//tensorflow/contrib/cluster_resolver:cluster_resolver_py",
         "//tensorflow/contrib/compiler:xla",
         "//tensorflow/contrib/tpu/proto:compilation_result_proto_py",
         "//tensorflow/contrib/tpu/proto:optimization_parameters_proto_py",
diff --git a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
index b4b06a40a2c8aaa97ff82baf93c8f2d55a587e37..ef35e84ba5205fb76e5afe77e670d87197ca8405 100644
--- a/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
+++ b/tensorflow/contrib/tpu/profiler/dump_tpu_profile.cc
@@ -98,7 +98,7 @@ Status DumpOpProfileToLogDirectory(StringPiece run_dir,
   if (!status.ok()) {
     return errors::Internal(
         "Failed to convert op profile to json. Skipping... ",
-        string(status.error_message()));
+        string(status.message()));
   }
   TF_RETURN_IF_ERROR(WriteStringToFile(Env::Default(), path, json));
   if (os) {
diff --git a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
index 63641e00c5dbf4b4e635ecfea8bef98c7d0b7075..a081c4354a779d37140338793e66844c3fcf7a12 100644
--- a/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
+++ b/tensorflow/contrib/tpu/profiler/pip_package/cloud_tpu_profiler/main.py
@@ -90,12 +90,12 @@ def main(unused_argv=None):
   tf_version = tf.__version__
   print('TensorFlow version %s detected' % tf_version)
 
-  if FLAGS.service_addr is None and FLAGS.tpu is None:
+  if not FLAGS.service_addr and not FLAGS.tpu:
     sys.exit('You must specify either --service_addr or --tpu.')
 
   tpu_cluster_resolver = None
-  if FLAGS.service_addr is not None:
-    if FLAGS.tpu is not None:
+  if FLAGS.service_addr:
+    if FLAGS.tpu:
       tf.logging.warn('Both --service_addr and --tpu are set. Ignoring '
                       '--tpu and using --service_addr.')
     service_addr = FLAGS.service_addr
diff --git a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
index 1cf7f9fcf67ec98feb02dd4298a36153e689f2e5..1b09ce173a64ba3f93ec019c8fd65dc4710f0fcf 100644
--- a/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
+++ b/tensorflow/contrib/tpu/python/tpu/async_checkpoint.py
@@ -80,6 +80,8 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
     self._summary_writer = None
     self._global_step_tensor = None
 
+    self._last_checkpoint_step = None
+
   def _set_steps_per_run(self, steps_per_run):
     self._steps_per_run = steps_per_run
 
@@ -137,8 +139,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
 
     last_step = session.run(self._global_step_tensor)
 
-    # Save the last checkpoint synchronously if needed.
-    if last_step != self._timer.last_triggered_step():
+    if self._last_checkpoint_step != last_step:
       self._save(session, last_step, asynchronous=False)
 
     for l in self._listeners:
@@ -174,6 +175,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
       logging.info("Checkpoint finished for %d into %s.", step, self._save_path)
 
     if not asynchronous:
+      self._last_checkpoint_step = step
       _save_fn()
       return
 
@@ -183,6 +185,7 @@ class AsyncCheckpointSaverHook(basic_session_run_hooks.CheckpointSaverHook):
         logging.info("Saver thread still in progress, skipping checkpoint.")
         return
 
+    self._last_checkpoint_step = step
     self._save_thread = threading.Thread(target=_save_fn)
     self._save_thread.start()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets.py b/tensorflow/contrib/tpu/python/tpu/datasets.py
index c694e9c1bca10d9930492c29dd1c3cbc7f7f5d04..8d6245390fc3fa005c92d01bc9b64ddb47583582 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets.py
@@ -133,7 +133,7 @@ def StreamingFilesDataset(files,
   with ops.device('/job:%s' % file_reader_job):
     if isinstance(files, str):
       source_dataset = dataset_ops.Dataset.list_files(files)
-    elif isinstance(files, dataset_ops.Dataset):
+    elif isinstance(files, dataset_ops.DatasetV2):
       source_dataset = files
     else:
       raise ValueError('files was not a string or a dataset: %s' % files)
@@ -156,7 +156,7 @@ def StreamingFilesDataset(files,
 
     source_dataset = source_dataset.prefetch(1)
 
-    source_iterator = source_dataset.make_one_shot_iterator()
+    source_iterator = dataset_ops.make_one_shot_iterator(source_dataset)
     source_handle = source_iterator.string_handle()
 
   @function.Defun(dtypes.string)
diff --git a/tensorflow/contrib/tpu/python/tpu/datasets_test.py b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
index b58d05eac56f3586e183333f7c1a3867ee57456c..52d87b800401c3e584da9843916cfc7a767c082a 100644
--- a/tensorflow/contrib/tpu/python/tpu/datasets_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/datasets_test.py
@@ -70,7 +70,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'text_line.*.txt'), filetype='text')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -94,7 +94,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         os.path.join(self.get_temp_dir(), 'tf_record*'), filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -121,7 +121,7 @@ class DatasetsTest(test.TestCase):
 
     dataset = datasets.StreamingFilesDataset(filenames, filetype='tfrecord')
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -154,7 +154,7 @@ class DatasetsTest(test.TestCase):
         os.path.join(self.get_temp_dir(), 'fixed_length*'),
         filetype=FixedLengthFile)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
@@ -177,7 +177,7 @@ class DatasetsTest(test.TestCase):
     dataset = datasets.StreamingFilesDataset(
         dataset_ops.Dataset.range(10), filetype=gen_dataset)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     self._sess.run(iterator.initializer)
     get_next = iterator.get_next()
 
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_support.py b/tensorflow/contrib/tpu/python/tpu/keras_support.py
index 08f58a5f5b89f92502893e222cbca3bd07b2432b..4ce194590342555a7c4e9e119bf51e516a37a715 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_support.py
@@ -81,6 +81,7 @@ from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import models
 from tensorflow.python.keras import optimizers as keras_optimizers
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import training_arrays
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.layers import embeddings
@@ -132,7 +133,7 @@ def _tpu_session_context():
 An error occurred connecting or initializing your TPU.
 
 The session has been reset. re-run keras_to_tpu_model to create a new session.
-""" + e)
+""" + str(e))
 
 
 def setup_tpu_session(cluster_resolver):
@@ -438,7 +439,7 @@ class TPURewriteContext(object):
 
     self._default_placeholder = array_ops.placeholder
     self._default_name_scope = ops.name_scope
-    self._default_make_variable = base_layer.make_variable
+    self._default_make_variable = base_layer_utils.make_variable
     self._default_random_normal = random_ops.random_normal
     self._default_qr = gen_linalg_ops.qr
 
@@ -486,14 +487,14 @@ class TPURewriteContext(object):
     gen_linalg_ops.qr = qr
 
     ops.name_scope = _name_scope
-    base_layer.make_variable = variable_scope.get_variable
+    base_layer_utils.make_variable = variable_scope.get_variable
     logging.info('Overriding default placeholder.')
     return
 
   def __exit__(self, exc_type, exc_val, exc_tb):
     array_ops.placeholder = self._default_placeholder
     ops.name_scope = self._default_name_scope
-    base_layer.make_variable = self._default_make_variable
+    base_layer_utils.make_variable = self._default_make_variable
     random_ops.random_normal = self._default_random_normal
     gen_linalg_ops.qr = self._default_qr
 
@@ -728,7 +729,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
     dummy_x_shape[0] *= tpu_assignment.num_towers
     dummy_y_shape = dataset.output_shapes[1].as_list()
     dummy_y_shape[0] *= tpu_assignment.num_towers
-    self._iterator = dataset.make_initializable_iterator()
+    self._iterator = dataset_ops.make_initializable_iterator(dataset)
     K.get_session().run(self._iterator.initializer)
 
     self._get_next_ops = []
@@ -769,7 +770,7 @@ class TPUDatasetInfeedManager(TPUInfeedManager):
 
   def _verify_dataset_shape(self, dataset):
     """Verifies a dataset is of an appropriate shape for TPUs."""
-    if not isinstance(dataset, dataset_ops.Dataset):
+    if not isinstance(dataset, dataset_ops.DatasetV2):
       raise ValueError('The function passed as the `x` parameter did not '
                        'return a `tf.data.Dataset`.')
     if not isinstance(dataset.output_classes, tuple):
@@ -1012,9 +1013,10 @@ class TPUFunction(object):
                   optimizer=_replicated_optimizer(self._cloned_optimizer),
                   loss=self.model.loss,
                   loss_weights=self.model.loss_weights,
-                  metrics=metrics_module.clone_metrics(self.model.metrics),
+                  metrics=metrics_module.clone_metrics(
+                      self.model._compile_metrics),
                   weighted_metrics=metrics_module.clone_metrics(
-                      self.model.weighted_metrics),
+                      self.model._compile_weighted_metrics),
                   target_tensors=tpu_targets,
               )
 
@@ -1184,12 +1186,9 @@ class TPUFunction(object):
       # pipelined loop.
       return None, None
 
-    if not isinstance(K.learning_phase(), int):
+    if isinstance(inputs[-1], int):
       # Remove the learning_phase flag at the end. We currently hard code the
       # learning_phase in TPUFunction.
-      assert isinstance(inputs[-1], int), (
-          'Expect the final element be learning_phase flag. Got {}'.format(
-              inputs[-1]))
       inputs = inputs[:-1]
 
     if (self.execution_mode == model_fn_lib.ModeKeys.TRAIN or
@@ -1379,6 +1378,7 @@ class KerasTPUModel(models.Model):
     self.train_function = None
     self._fit_function = None
     self._eval_function = None
+    self._stateful_metric_functions = []
 
     cluster_resolver = strategy._tpu_cluster_resolver
     self._tpu_name_or_address = cluster_resolver.get_master()
@@ -1393,10 +1393,10 @@ class KerasTPUModel(models.Model):
       self.compile(
           self._cpu_model.optimizer,
           self._cpu_model.loss,
-          self._cpu_model.metrics,
+          self._cpu_model._compile_metrics,
           self._cpu_model.loss_weights,
           self._cpu_model.sample_weight_mode,
-          self._cpu_model.weighted_metrics,
+          self._cpu_model._compile_weighted_metrics,
           self._cpu_model.target_tensors,
       )
 
@@ -1466,7 +1466,7 @@ class KerasTPUModel(models.Model):
       assert not self._numpy_to_infeed_manager_list  # Ensure empty.
 
       infeed_managers = []  # Managers to clean up at the end of the fit call.
-      if isinstance(x, dataset_ops.Dataset):
+      if isinstance(x, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1492,7 +1492,7 @@ class KerasTPUModel(models.Model):
           y = infeed_manager.dummy_y
           infeed_managers.append((x, infeed_manager))
 
-      if isinstance(validation_data, dataset_ops.Dataset):
+      if isinstance(validation_data, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1551,7 +1551,7 @@ class KerasTPUModel(models.Model):
     with _tpu_session_context():
       # Managers to clean up at the end of the evaluate call.
       infeed_managers = []
-      if isinstance(x, dataset_ops.Dataset):
+      if isinstance(x, dataset_ops.DatasetV2):
         # TODO(b/111413240): Support taking a tf.data.Dataset directly.
         raise ValueError(
             'Taking a Dataset directly is not yet supported. Please '
@@ -1676,14 +1676,10 @@ class KerasTPUModel(models.Model):
         callbacks,
         self,
         do_validation=do_validation,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
         batch_size=batch_size,
         epochs=epochs,
         steps_per_epoch=steps_per_epoch,
         samples=num_training_samples,
-        validation_steps=validation_steps,
         verbose=verbose,
         count_mode=count_mode)
 
@@ -1700,7 +1696,7 @@ class KerasTPUModel(models.Model):
     callbacks.on_train_begin()
     for epoch in range(initial_epoch, epochs):
       # Reset stateful metrics
-      for m in self.stateful_metric_functions:
+      for m in self.metrics:
         m.reset_states()
       # Update callbacks
       callbacks.on_epoch_begin(epoch)
@@ -1923,7 +1919,7 @@ class KerasTPUModel(models.Model):
     if validation_data:
       if (isinstance(validation_data, iterator_ops.Iterator) or
           isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.Dataset)):
+          isinstance(validation_data, dataset_ops.DatasetV2)):
         raise ValueError('KerasTPUModel cannot handle a Dataset or Iterator '
                          'for validation_data. Please instead pass a function '
                          'that returns a `tf.data.Dataset`.')
@@ -1998,14 +1994,14 @@ class KerasTPUModel(models.Model):
     self._optimizer = optimizer
 
   @property
-  def stateful_metric_functions(self):
+  def metrics(self):
     if self._tpu_model:
-      return self._tpu_model.stateful_metric_functions
+      return self._tpu_model.metrics
     return self._stateful_metric_functions
 
-  @stateful_metric_functions.setter
-  def stateful_metric_functions(self, stateful_metric_functions):
-    self._stateful_metric_functions = stateful_metric_functions
+  @metrics.setter
+  def metrics(self, metrics):
+    self._stateful_metric_functions = metrics
 
   def _make_train_function(self):
     if not self.train_function:
@@ -2230,10 +2226,10 @@ def tpu_model(model, strategy=None):
     cpu_model.compile(
         _clone_optimizer(model.optimizer, optimizer_config),
         model.loss,
-        metrics_module.clone_metrics(model.metrics),
+        metrics_module.clone_metrics(model._compile_metrics),
         model.loss_weights,
         model.sample_weight_mode,
-        metrics_module.clone_metrics(model.weighted_metrics),
+        metrics_module.clone_metrics(model._compile_weighted_metrics),
     )
 
   if model_weights:
diff --git a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
index 28d3a938510a450ccba0d921663d848e2adec72f..8b0b240dc7302c203a22349d583323327fc4480b 100644
--- a/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
+++ b/tensorflow/contrib/tpu/python/tpu/keras_tpu_variables.py
@@ -217,6 +217,10 @@ class ReplicatedVariable(object):
   def get(self):
     return self._primary_var
 
+  @property
+  def _in_graph_mode(self):
+    return self._primary_var._in_graph_mode   # pylint: disable=protected-access
+
   def _should_act_as_resource_variable(self):
     """Pass resource_variable_ops.is_resource_variable check."""
     pass
diff --git a/tensorflow/contrib/tpu/python/tpu/session_support.py b/tensorflow/contrib/tpu/python/tpu/session_support.py
index a95275487899c4770ef99b620a7671eec2bb81eb..3e463823c820a3ef8628324f77e1a9caf8d385d5 100644
--- a/tensorflow/contrib/tpu/python/tpu/session_support.py
+++ b/tensorflow/contrib/tpu/python/tpu/session_support.py
@@ -43,12 +43,19 @@ class CoordinatorShutdownException(Exception):
   pass
 
 
+def _clone_session(session, graph=None):
+  return session_lib.Session(
+      target=session.sess_str,
+      config=session._config,  # pylint: disable=protected-access
+      graph=graph if graph else session.graph)
+
+
 def _make_heartbeat_op(session, device, request_ph):
   """Return a heartbeat op or None if heartbeats are not supported by device."""
   try:
     # Test if we can connect in a isolated graph + session
     with ops.Graph().as_default():
-      with session_lib.Session(target=session.sess_str) as temp_session:
+      with _clone_session(session) as temp_session:
         with ops.device(device):
           heartbeat_op = tpu_ops.worker_heartbeat('')
           options = config_pb2.RunOptions(timeout_in_ms=5000)
@@ -220,6 +227,7 @@ class WatchdogManager(threading.Thread):
     self.ping_interval = ping_interval
     self.shutdown_timeout = shutdown_timeout
     self.daemon = True
+    self._config = session._config  # pylint: disable=protected-access
     self._target = session.sess_str
     self._running = False
     self._devices = devices
@@ -234,6 +242,7 @@ class WatchdogManager(threading.Thread):
     self._session = session_lib.Session(
         target=self._target,
         graph=self._graph,
+        config=self._config,
     )
 
     if self._devices is None:
@@ -334,8 +343,7 @@ class GracefulShutdownHook(session_run_hook.SessionRunHook):
 
     with self._graph.as_default():
       logging.info('Installing graceful shutdown hook.')
-      self._session = session_lib.Session(
-          target=training_session.sess_str, graph=self._graph)
+      self._session = _clone_session(training_session, self._graph)
       self._workers = WorkerHeartbeatManager.from_devices(
           self._session, all_worker_devices(self._session))
       self._heartbeat_supported = self._workers.num_workers() > 0
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu.py b/tensorflow/contrib/tpu/python/tpu/tpu.py
index e3e791faacb9b3c1fedbd83d3740e35351e38abb..def57da20d6018dcf27ccb7a9d04592f38ce2f7c 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu.py
@@ -1001,8 +1001,8 @@ def rewrite(computation,
       `rewrite` is a list of tensors corresponding to the tensors from the
       output of `computation`.
 
-      All `Operation`s returned from `computation` will be executed when
-      evaluating any of the returned output tensors.
+      All `Operation`s constructed during `computation` will be executed when
+      evaluating any of the returned output tensors, not just the ones returned.
     inputs: A list of input tensors or `None` (equivalent to an empty list).
     infeed_queue: If not `None`, the `InfeedQueue` from which to append a tuple
       of arguments as inputs to `computation`.
@@ -1111,7 +1111,7 @@ def validate_inference_rewrite_for_variables(graph):
   Raises:
     RuntimeError: if validation failed.
   """
-  if not any([x.type == "GuaranteeConst" for x in graph.get_operations()]):
+  if not any(x.type == "GuaranteeConst" for x in graph.get_operations()):
     raise RuntimeError(
         "No GuaranteeConst ops found in the graph after running "
         "tpu.rewrite_for_inference(...). Please check that you are using "
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_context.py b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
index da6bdf67d686fba09d66386de982b57aa28d4dd4..672462447944b777375331d49727c4d5366cf295 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_context.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_context.py
@@ -41,7 +41,7 @@ _NUM_CORES_TO_COMPUTATION_SHAPE = {
 
 
 class TPUContext(object):
-  """The context of current input_fn invocation."""
+  """A context that holds the current configuration of the TPU computation."""
 
   def __init__(self,
                internal_ctx,
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
index 3fe896426a7ae5b4b15b0520522002e6fb0dc1b0..ccba8a46c7cad0337119672e02314684f4451479 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_embedding.py
@@ -1069,17 +1069,14 @@ def _create_partitioned_variables(name,
                      'As TPU embedding is not optimized for small tables, '
                      'please consider other ways for this embedding lookup.')
 
-  slicing = [num_hosts, 1]
-
-  # TODO(shizhiw): deprecated, use tf.get_variable()?
-  return partitioned_variables.create_partitioned_variables(
-      name=name,
-      slicing=slicing,
+  return list(variable_scope.get_variable(
+      name,
       shape=(vocabulary_size, embedding_dimension),
+      partitioner=partitioned_variables.fixed_size_partitioner(num_hosts),
       dtype=dtypes.float32,
       initializer=initializer,
       collections=collections,
-      trainable=False)
+      trainable=False))
 
 
 @ops.RegisterGradient('TPUEmbeddingActivations')
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
index 7cb8c4aa7f14636a9597ec45974ec013ef367414..96b9556e137effcaaa5916b9723142f737a6dc33 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator.py
@@ -45,6 +45,7 @@ from tensorflow.contrib.training.python.training import hparam
 from tensorflow.core.framework import variable_pb2
 from tensorflow.core.framework.summary_pb2 import Summary
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session as tf_session
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest as data_nest
 from tensorflow.python.estimator import estimator as estimator_lib
@@ -298,9 +299,9 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
       host_calls['host_call'] = host_call
     _OutfeedHostCall.validate(host_calls)
 
-    training_hooks = list(training_hooks or [])
-    evaluation_hooks = list(evaluation_hooks or [])
-    prediction_hooks = list(prediction_hooks or [])
+    training_hooks = tuple(training_hooks or [])
+    evaluation_hooks = tuple(evaluation_hooks or [])
+    prediction_hooks = tuple(prediction_hooks or [])
 
     for hook in training_hooks + evaluation_hooks + prediction_hooks:
       if not isinstance(hook, session_run_hook.SessionRunHook):
@@ -335,7 +336,7 @@ class TPUEstimatorSpec(model_fn_lib._TPUEstimatorSpec):  # pylint: disable=prote
     hooks = None
     if self.host_call is not None:
       hooks = [_OutfeedHostCallHook(host_call_ret['host_call'])]
-    hooks = list(hooks or [])
+    hooks = tuple(hooks or [])
     scaffold = self.scaffold_fn() if self.scaffold_fn else None
     return model_fn_lib.EstimatorSpec(
         mode=self.mode,
@@ -412,12 +413,15 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
                enqueue_ops,
                dequeue_ops,
                run_infeed_loop_on_coordinator=True,
-               rendezvous=None):
+               rendezvous=None,
+               master=None,
+               session_config=None):
     self._master_job = ctx.master_job
     self._enqueue_ops = enqueue_ops
     self._dequeue_ops = dequeue_ops
     self._rendezvous = rendezvous
-
+    self._master = master
+    self._session_config = session_config
     self._run_infeed_loop_on_coordinator = run_infeed_loop_on_coordinator
     self._initial_infeed_sleep_secs = (
         ctx.config.tpu_config.initial_infeed_sleep_secs)
@@ -429,11 +433,10 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
   def begin(self):
     logging.info('TPU job name %s', self._master_job)
     self._iterations_per_loop_var = _create_or_get_iterations_per_loop()
+    self._init_ops = []
     if self._should_initialize_tpu:
-      self._init_ops = [tpu.initialize_system(job=self._master_job)]
       self._finalize_ops = [tpu.shutdown_system(job=self._master_job)]
     else:
-      self._init_ops = []
       self._finalize_ops = []
 
     summary_writer_init_ops = contrib_summary.summary_writer_initializer_op()
@@ -475,11 +478,17 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
     return _OpQueueContext(name=name, target=target, args=args)
 
   def after_create_session(self, session, coord):
-    logging.info('Init TPU system')
-    start = time.time()
+    if self._should_initialize_tpu:
+      logging.info('Init TPU system')
+      start = time.time()
+      with ops.Graph().as_default():
+        with tf_session.Session(
+            self._master, config=self._session_config) as sess:
+          sess.run(tpu.initialize_system(job=self._master_job))
+      logging.info('Initialized TPU in %d seconds', time.time() - start)
+
     session.run(self._init_ops,
                 options=config_pb2.RunOptions(timeout_in_ms=5 * 60 * 1000))
-    logging.info('Initialized TPU in %d seconds', time.time() - start)
 
     self._infeed_controller = self._create_infeed_controller(
         name='InfeedController', target=self._run_infeed, args=(session,))
@@ -521,13 +530,16 @@ class TPUInfeedOutfeedSessionHook(session_run_hook.SessionRunHook):
 
 class TPUInfeedOutfeedSessionHookForPrediction(TPUInfeedOutfeedSessionHook):
 
-  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None):
+  def __init__(self, ctx, enqueue_ops, dequeue_ops, rendezvous=None,
+               master=None, session_config=None):
     super(TPUInfeedOutfeedSessionHookForPrediction, self).__init__(
         ctx,
         enqueue_ops,
         dequeue_ops,
         run_infeed_loop_on_coordinator=False,
-        rendezvous=rendezvous)
+        rendezvous=rendezvous,
+        master=master,
+        session_config=session_config)
 
   def _create_infeed_controller(self, name, target, args):
     return _OpSignalOnceQueueContext(name=name, target=target, args=args)
@@ -2169,7 +2181,6 @@ class TPUEstimator(estimator_lib.Estimator):
                                builder,
                                input_receiver_fn_map,
                                checkpoint_path,
-                               strip_default_attrs,
                                save_variables=True,
                                mode=model_fn_lib.ModeKeys.PREDICT,
                                export_tags=None,
@@ -2184,7 +2195,6 @@ class TPUEstimator(estimator_lib.Estimator):
         builder,
         input_receiver_fn_map,
         checkpoint_path,
-        strip_default_attrs,
         save_variables,
         mode=mode,
         export_tags=export_tags,
@@ -2201,7 +2211,6 @@ class TPUEstimator(estimator_lib.Estimator):
           builder,
           input_receiver_fn_map,
           checkpoint_path,
-          strip_default_attrs,
           save_variables=False,
           mode=mode,
           export_tags=export_tags,
@@ -2225,7 +2234,7 @@ class TPUEstimator(estimator_lib.Estimator):
     def computation():
       """Compute tpu tensors used in export_outputs.
 
-      Passed to rewrite_for_inference so that model_fn will be called under
+      Passed to rewrite so that model_fn will be called under
       the rewriting contexts. Only tpu tensors are returned, but export_outputs
       and scaffold are captured.
 
@@ -2234,7 +2243,7 @@ class TPUEstimator(estimator_lib.Estimator):
          outside_compilation.
       """
       # We should only call model fn once and it should be inside `computation`
-      # so that building the graph will happen under `rewrite_for_inference`.
+      # so that building the graph will happen under `rewrite`.
       mode = model_fn_lib.ModeKeys.PREDICT
       estimator_spec = self._call_model_fn(features, labels, mode, config)
 
@@ -2251,7 +2260,7 @@ class TPUEstimator(estimator_lib.Estimator):
       capture.capture((estimator_spec, tensors_dict, tensors))
       return tpu_tensors
 
-    tpu_tensors_on_cpu = tpu.rewrite_for_inference(computation)
+    tpu_tensors_on_cpu = tpu.rewrite(computation)
     estimator_spec, tensors_dict, tensors = capture.get()
 
     # Reconstruct `tensors`, but with `tpu_tensors` replaced with
@@ -2564,6 +2573,8 @@ class TPUEstimator(estimator_lib.Estimator):
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
                   rendezvous=self._rendezvous[mode],
+                  master=self._config.master,
+                  session_config=self._session_config,
               ),
               InstallSignalHandlerHook()
           ])
@@ -2666,8 +2677,10 @@ class TPUEstimator(estimator_lib.Estimator):
                   eval_update_ops + host_ops,
                   run_infeed_loop_on_coordinator=(
                       run_infeed_loop_on_coordinator),
-                  rendezvous=self._rendezvous[mode]),
-          ] + input_hooks
+                  rendezvous=self._rendezvous[mode],
+                  master=self._config.evaluation_master,
+                  session_config=self._session_config,
+              )] + input_hooks
 
           if eval_hooks:
             hooks.extend(eval_hooks)
@@ -2738,7 +2751,9 @@ class TPUEstimator(estimator_lib.Estimator):
         hooks = [
             _StoppingPredictHook(scalar_stopping_signal),
             TPUInfeedOutfeedSessionHookForPrediction(
-                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode]),
+                ctx, enqueue_ops, host_ops, rendezvous=self._rendezvous[mode],
+                master=self._config.master,
+                session_config=self._session_config),
         ] + input_hooks
 
         if prediction_hooks:
@@ -2783,7 +2798,7 @@ def _export_output_to_tensors(export_output):
   elif isinstance(export_output, export_output_lib.RegressionOutput):
     return [export_output.value]
   elif isinstance(export_output, export_output_lib.PredictOutput):
-    return export_output.outputs.values()
+    return list(export_output.outputs.values())
   else:
     raise ValueError(
         '`export_output` must be have type `ClassificationOutput`, '
@@ -3059,7 +3074,7 @@ class _Inputs(object):
   @staticmethod
   def from_input_fn(return_values):
     """Returns an `_Inputs` instance according to `input_fn` return value."""
-    if isinstance(return_values, dataset_ops.Dataset):
+    if isinstance(return_values, dataset_ops.DatasetV2):
       dataset = return_values
       return _Inputs(dataset=dataset)
 
@@ -3084,7 +3099,7 @@ class _Inputs(object):
 
     The initializer must be run before calling `features_and_labels`.
     """
-    self._iterator = self._dataset.make_initializable_iterator()
+    self._iterator = dataset_ops.make_initializable_iterator(self._dataset)
     return self._iterator.initializer
 
   def features_and_labels(self):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
index 3786e52b949dfac8c1587d1ea3041b625f00183f..e3ea983abfd24d03c964fbc647b56262e15e0a96 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_estimator_signals_test.py
@@ -21,8 +21,8 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.contrib.tpu.python.tpu import tpu_estimator
-from tensorflow.python import data as dataset_lib
 from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import test
@@ -34,10 +34,10 @@ def make_input_fn(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: {'a': fa, 'b': fb})
     dataset = dataset.batch(batch_size)
     return dataset
@@ -50,10 +50,10 @@ def make_input_fn_with_labels(num_samples):
 
   def input_fn(params):
     batch_size = params['batch_size']
-    da1 = dataset_lib.Dataset.from_tensor_slices(a)
-    da2 = dataset_lib.Dataset.from_tensor_slices(b)
+    da1 = dataset_ops.Dataset.from_tensor_slices(a)
+    da2 = dataset_ops.Dataset.from_tensor_slices(b)
 
-    dataset = dataset_lib.Dataset.zip((da1, da2))
+    dataset = dataset_ops.Dataset.zip((da1, da2))
     dataset = dataset.map(lambda fa, fb: ({'a': fa}, fb))
     dataset = dataset.batch(batch_size)
     return dataset
@@ -71,7 +71,7 @@ class TPUEstimatorStoppingSignalsTest(test.TestCase):
 
     with ops.Graph().as_default():
       dataset = input_fn(params)
-      features = dataset.make_one_shot_iterator().get_next()
+      features = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       # With tf.data.Dataset.batch, the batch is None, i.e., dynamic shape.
       self.assertIsNone(features['a'].shape.as_list()[0])
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
index e75a09492ec12b95bad32b221a8e78a1b79f3a6b..d5957b7e8ec40b40c7af8822378cee6134ef0d0f 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_feed.py
@@ -26,7 +26,6 @@ import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.compiler.xla.experimental.xla_sharding import xla_sharding
-from tensorflow.compiler.xla.python_api import xla_shape
 from tensorflow.contrib.tpu.python.ops import tpu_ops
 from tensorflow.contrib.tpu.python.tpu import tpu
 from tensorflow.contrib.tpu.python.tpu import tpu_sharding
@@ -92,8 +91,7 @@ class InfeedQueue(object):
       else:
         raise ValueError(
             "number of tuple elements cannot be inferred from InfeedQueue "
-            "constructor"
-        )
+            "constructor")
     if number_of_tuple_elements <= 0:
       raise ValueError("number_of_tuple_elements %d must be > 0" %
                        number_of_tuple_elements)
@@ -293,9 +291,8 @@ class InfeedQueue(object):
         self.number_of_tuple_elements
     """
     if len(input_tensors) != self.number_of_tuple_elements:
-      raise ValueError(
-          "input_tensors is %s, but should be a list of %d Tensors", (
-              str(input_tensors), self.number_of_tuple_elements))
+      raise ValueError("input_tensors is %s, but should be a list of %d Tensors"
+                       % (str(input_tensors), self.number_of_tuple_elements))
     self.set_tuple_shapes([t.shape for t in input_tensors])
     self.set_tuple_types([t.dtype for t in input_tensors])
 
@@ -451,8 +448,8 @@ class InfeedQueue(object):
       for i in xrange(1, self.number_of_tuple_elements):
         if devices[0] != devices[i]:
           raise ValueError(
-              "input devices for shard %d are %s, but should all be the same",
-              index, str(devices))
+              "input devices for shard %d are %s, but should all be the same" %
+              (index, str(devices)))
       with ops.colocate_with(inputs[0]):
         return tpu_ops.infeed_enqueue_tuple(
             inputs=inputs,
@@ -792,18 +789,14 @@ class _PartitionedInfeedQueue(InfeedQueue):
 
     Args:
       tensor: Input tensor for partitioning.
-      dims: A list of integer describes how to partition the input tensor.
+      dims: 1-D np.array of the list of integer describes how to partition the
+        input tensor.
 
     Raises:
       ValueError: If the tensor can't be partitioned by dims or the
         num_cores_per_replica doesn't match the number of
         partitions(dims.prod()).
     """
-    if dims is None:
-      return
-
-    dims = np.array(dims)
-
     if (dims < 1).any():
       raise ValueError("All input partition dims must be >= 1.")
 
@@ -823,11 +816,6 @@ class _PartitionedInfeedQueue(InfeedQueue):
           "partition dims = {}).".format(tensor.shape.as_list(), dims))
 
     tensor.shape.assert_is_fully_defined()
-    if (np.array(tensor.shape.as_list()) % dims != 0).any():
-      raise ValueError(
-          "All input partition dims must divide exactly into the `Tensor` "
-          "shape (tensor shape = {}, input partition dims = {}).".format(
-              tensor.shape.as_list(), dims))
 
   def _partition_or_replicate_on_host(self, tensor, dims):
     """Partitions or replicates the input tensor.
@@ -840,16 +828,39 @@ class _PartitionedInfeedQueue(InfeedQueue):
     Returns:
       An iterator of `Tensor`s or a list of partioned tensors.
     """
-    self._check_input_partition_dims(tensor, dims)
     if dims is None:
       return itertools.repeat(tensor)
-    else:
-      output = [tensor]
-      for axis, dim in enumerate(dims):
-        if dim > 1:
-          output = [array_ops.split(x, dim, axis=axis) for x in output]
-          output = nest.flatten(output)
-      return output
+    dims = np.array(dims)
+    self._check_input_partition_dims(tensor, dims)
+    output = [tensor]
+    shape_list = np.array(tensor.shape.as_list())
+    quotients, remainders = np.divmod(shape_list, dims)
+    for axis, (quotient, remainder, dim, original_size) in enumerate(
+        zip(quotients, remainders, dims, shape_list)):
+      if dim <= 1:
+        continue
+      if remainder > 0:
+        # For each dimension, when it cannot be evenly partitioned, XLA assumes
+        # tensors are partitioned in a greedy manner by using
+        # ceil_ratio(size/dim) first. E.g. 2D tensor with shape (5, 14) and dims
+        # are (2, 4). Since 5 % 2 = 1 and 14 % 4 = 2, [5, 14] =>
+        # [[(3, 4), (3, 4), (2, 4), (2, 2)],
+        # [(2, 4), (2, 4), (2, 4), (2, 2)]]
+        ceil_ratio = quotient + 1
+        num_full_slots, left_over = np.divmod(original_size, ceil_ratio)
+        num_or_size_splits = [ceil_ratio] * num_full_slots + [left_over]
+        if len(num_or_size_splits) < dim:
+          num_or_size_splits += [0] * (dim - len(num_or_size_splits))
+        new_output = []
+        for x in output:
+          new_output.append(
+              array_ops.split(
+                  x, num_or_size_splits=num_or_size_splits, axis=axis))
+        output = new_output
+      else:
+        output = [array_ops.split(x, dim, axis=axis) for x in output]
+      output = nest.flatten(output)
+    return output
 
   def _tag_sharding_attribute_for_dequeued_tensor(self, tensor, dims):
     """Tags appropriate XLA sharding attribute to the dequeued tensor.
@@ -866,13 +877,9 @@ class _PartitionedInfeedQueue(InfeedQueue):
     elif np.prod(dims) == 1:
       return xla_sharding.assign_device(tensor, 0)
     else:
-      tile_shape = np.array(tensor.shape.as_list()) // dims
       tile_assignment = np.arange(np.prod(dims)).reshape(dims)
       return xla_sharding.tile(
           tensor=tensor,
-          tile_shape=xla_shape.CreateShapeFromDtypeAndTuple(
-              dtype=np.dtype(tensor.dtype.as_numpy_dtype),
-              shape_tuple=tile_shape),
           tile_assignment=tile_assignment)
 
   def _tag_sharding_attribute_for_dequeued_tensors(self, dequeues, dims):
diff --git a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
index ec682e5829c4df536a043334b74200f0b6259df3..d66ecfcf4a56b8da1c2d2f518bebe4baa76b315e 100644
--- a/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
+++ b/tensorflow/contrib/tpu/python/tpu/tpu_system_metadata.py
@@ -52,6 +52,7 @@ def _query_tpu_system_metadata(master_address, cluster_def=None,
   devices = []
   device_dict = collections.defaultdict(list)
 
+  # TODO(b/120564445): Replace with standard library for retries.
   retry_count = 1
   while True:
     logging.info('Querying Tensorflow master (%s) for TPU system metadata.',
diff --git a/tensorflow/contrib/tpu/python/tpu/training_loop.py b/tensorflow/contrib/tpu/python/tpu/training_loop.py
index b6c350ecd7588221b0e7bc979ed1be3b911c8cfd..0187b4bec6ecc55943bf48b9268a74e18ea5b488 100644
--- a/tensorflow/contrib/tpu/python/tpu/training_loop.py
+++ b/tensorflow/contrib/tpu/python/tpu/training_loop.py
@@ -166,8 +166,8 @@ def while_loop(condition, body, inputs=None, infeed_queue=None, name=None):
   # control dependencies from any side-effecting operations.
   if input_arity == 0:
     inputs = [array_ops.constant(0)]
-  return control_flow_ops.while_loop(condition_wrapper, body_wrapper, inputs,
-                                     name="")
+  return control_flow_ops.while_loop(
+      condition_wrapper, body_wrapper, inputs, name="", parallel_iterations=1)
 
 
 def repeat(n, body, inputs=None, infeed_queue=None, name=None):
diff --git a/tensorflow/contrib/tpu/tpu_estimator.md b/tensorflow/contrib/tpu/tpu_estimator.md
index b6514e19dc92fe4c7cdcdb6582a7c0ad5ad573d5..552febd80bd35b37a95cdaaf8d5923278311ac8e 100644
--- a/tensorflow/contrib/tpu/tpu_estimator.md
+++ b/tensorflow/contrib/tpu/tpu_estimator.md
@@ -89,12 +89,9 @@ handle training:
 
         dataset = tf.data.TFRecordDataset(
             filename, buffer_size=FLAGS.dataset_reader_buffer_size)
-        dataset = dataset.map(parser).cache().repeat().batch(batch_size)
-        images, labels = dataset.make_one_shot_iterator().get_next()
-        # set_shape to give inputs statically known shapes.
-        images.set_shape([batch_size, 28 * 28])
-        labels.set_shape([batch_size])
-        return images, labels
+        dataset = dataset.map(parser).cache().repeat().batch(
+            batch_size, drop_remainder=True)
+        return dataset
       return input_fn
 
 
diff --git a/tensorflow/contrib/training/BUILD b/tensorflow/contrib/training/BUILD
index 00295f57f60858db5234ce28cc643ea9eee44daa..f6427ae05a20f253edf030eff0f860361616042b 100644
--- a/tensorflow/contrib/training/BUILD
+++ b/tensorflow/contrib/training/BUILD
@@ -26,7 +26,6 @@ py_library(
         "python/training/resample.py",
         "python/training/sampling_ops.py",
         "python/training/sequence_queueing_state_saver.py",
-        "python/training/tensor_queue_dataset.py",
         "python/training/training.py",
         "python/training/tuner.py",
     ],
@@ -287,28 +286,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "tensor_queue_dataset_test",
-    size = "large",
-    srcs = ["python/training/tensor_queue_dataset_test.py"],
-    srcs_version = "PY2AND3",
-    tags = ["notsan"],
-    deps = [
-        ":training_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:random_seed",
-        "//tensorflow/python:training",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data",
-        "//tensorflow/python/data/experimental/kernel_tests/serialization:dataset_serialization_test_base",
-        "//third_party/py/numpy",
-    ],
-)
-
 tf_proto_library(
     name = "protos_all",
     srcs = glob(["**/*.proto"]),
diff --git a/tensorflow/contrib/training/__init__.py b/tensorflow/contrib/training/__init__.py
index 3547e71184ec2b99163ea4247c01d24487811b47..87ce57ef060a0eb9383248255713421c14988416 100644
--- a/tensorflow/contrib/training/__init__.py
+++ b/tensorflow/contrib/training/__init__.py
@@ -59,8 +59,6 @@ from tensorflow.contrib.training.python.training.hparam import *
 from tensorflow.contrib.training.python.training.resample import *
 from tensorflow.contrib.training.python.training.sampling_ops import *
 from tensorflow.contrib.training.python.training.sequence_queueing_state_saver import *
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import enqueue_in_queue_dataset
-from tensorflow.contrib.training.python.training.tensor_queue_dataset import prepend_from_queue_and_padded_batch_dataset
 from tensorflow.contrib.training.python.training.training import add_gradients_summaries
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms
 from tensorflow.contrib.training.python.training.training import clip_gradient_norms_fn
@@ -79,7 +77,6 @@ _allowed_symbols = [
     'FeedingQueueRunner', 'get_or_create_eval_step', 'StopAfterNEvalsHook',
     'SummaryAtEndHook', 'wait_for_new_checkpoint', 'add_gradients_summaries',
     'clip_gradient_norms', 'clip_gradient_norms_fn', 'create_train_op',
-    'multiply_gradients', 'enqueue_in_queue_dataset',
-    'prepend_from_queue_and_padded_batch_dataset', 'train']
+    'multiply_gradients', 'train']
 
 remove_undocumented(__name__, _allowed_symbols)
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
deleted file mode 100644
index 8896a95327a4cb609a9a78412afa68b316a3131e..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Python wrappers for Datasets and Iterators."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import convert
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.util import nest as tf_nest
-
-
-class _PrependFromQueueAndPaddedBatchDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that prepends a queue to another `Dataset`.
-
-  A vector of handles to the queue is returned as the first component of
-  the associated iterator.  This vector can be passed to
-  `enqueue_in_queue_dataset` to add new elements to the queue.
-  """
-
-  def __init__(self, input_dataset, batch_size, padded_shapes, padding_values):
-    """Initialize `PrependFromQueueAndPaddedBatchDataset`."""
-    super(_PrependFromQueueAndPaddedBatchDataset, self).__init__(input_dataset)
-    if sparse.any_sparse(input_dataset.output_classes):
-      raise TypeError(
-          "Batching of padded sparse tensors is not currently supported")
-    self._input_dataset = input_dataset
-    self._batch_size = ops.convert_to_tensor(
-        batch_size, dtype=dtypes.int64, name="batch_size")
-    if padded_shapes is None:
-      self._padded_shapes = nest.map_structure(
-          convert.partial_shape_to_tensor, input_dataset.output_shapes)
-    else:
-      self._padded_shapes = nest.map_structure_up_to(
-          input_dataset.output_shapes, convert.partial_shape_to_tensor,
-          padded_shapes)
-    # pylint: disable=protected-access
-    padding_values = (
-        padding_values if padding_values is not None else
-        dataset_ops._default_padding(input_dataset))
-    self._padding_values = nest.map_structure_up_to(
-        input_dataset.output_shapes, dataset_ops._padding_value_to_tensor,
-        padding_values, input_dataset.output_types)
-    # pylint: enable=protected-access
-
-  def _as_variant_tensor(self):
-    # pylint: disable=protected-access
-    return gen_dataset_ops.prepend_from_queue_and_padded_batch_dataset(
-        self._input_dataset._as_variant_tensor(),
-        batch_size=self._batch_size,
-        padded_shapes=[
-            ops.convert_to_tensor(s, dtype=dtypes.int64)
-            for s in nest.flatten(self._padded_shapes)
-        ],
-        padding_values=nest.flatten(self._padding_values),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-    # pylint: enable=protected-access
-
-  @property
-  def output_classes(self):
-    return (ops.Tensor, self._input_dataset.output_classes)
-
-  def _as_batch_shape(self, shape_like):
-    return tensor_shape.vector(None).concatenate(
-        tensor_util.constant_value_as_shape(shape_like))
-
-  @property
-  def output_shapes(self):
-    # First output is a variant representing the Queue
-    return (tensor_shape.vector(None),
-            nest.map_structure(self._as_batch_shape, self._padded_shapes))
-
-  @property
-  def output_types(self):
-    # First output is a variant representing the Queue
-    return (dtypes.variant, self._input_dataset.output_types)
-
-
-def prepend_from_queue_and_padded_batch_dataset(batch_size,
-                                                padding_values=None,
-                                                padded_shapes=None):
-  """A transformation that prepends a queue to a `Dataset` and batches results.
-
-  A vector of handles to the queue is returned as the first component of the
-  associated iterator.  This vector can be passed to `enqueue_in_queue_dataset`
-  to add new elements to the queue.
-
-  Below is an example of how this dataset might be used to split incoming
-  variable-length sequences into "head" and "rest" parts, where "rest" parts
-  are re-enqueued back into the dataset.  A more realistic example would
-  perform some calculation on the "head" and modify some components of "rest"
-  with the result (before re-enqueueing).
-
-  ```python
-  dataset = tf.data.Dataset.from_tensor_slices([2*x for x in range(10)])
-  # Make a dataset of variable-length vectors and their lengths.
-  dataset = dataset.map(lambda count: (count, tf.ones((count,))))
-  # Emit a queue we can prepend to, and counts/values as padded batch.
-  dataset = dataset.apply(
-      tf.contrib.training.prepend_from_queue_and_padded_batch_dataset(
-        batch_size=10))
-  dataset = dataset.prefetch(1)
-
-  iterator = dataset.make_one_shot_iterator()
-  queue, (count, padded_value) = iterator.get_next()
-
-  # Split the padded_value into two pieces: head and rest
-  rest_indices = tf.squeeze(tf.where(count > 3), axis=1)
-  bound = tf.minimum(3, tf.reduce_max(count))
-  value_head = padded_value[:, :bound]
-  count_rest = tf.gather(count - 3, rest_indices)
-  value_rest = tf.gather(padded_value[:, bound:], rest_indices)
-  queue_rest = tf.gather(queue, rest_indices)
-  enqueue_rest_op = tf.contrib.training.enqueue_in_queue_dataset(
-    queue_rest, (count_rest, value_rest))
-  with tf.control_dependencies([enqueue_rest_op]):
-    calculation = fn(value_head)
-
-  while True:  # Will raise OutOfRange when finished with all pieces.
-    session.run(calculation)
-  ```
-
-  Args:
-    batch_size: `int64` scalar tensor.  The batch size to use when performing
-      padded batching.
-    padding_values: (optional) Nested tuple of scalar tensors.  If provided,
-      the structure and dtypes of padding_values should match that of
-      incoming dataset's `output_types`.
-    padded_shapes: (optional) Nested tuple of `int64` vector tensors.
-      If provided, the structure must match that of the incoming dataset's
-      `output_types`.  If not provided, the incoming dataset's `output_shapes`
-      is used.  Any unknown (`None` or `-1`) dimensions in the shapes are
-      treated as being unique per-batch: for each batch time, an unknown
-      dimension is replaced with the maximum given value of this dimension
-      across all tensors for the given component in the batch.
-
-  Returns:
-    A `Dataset` transformation function, which can be passed to
-    `tf.data.Dataset.apply`.
-  """
-
-  def _apply_fn(dataset):
-    return _PrependFromQueueAndPaddedBatchDataset(
-        dataset,
-        batch_size=batch_size,
-        padding_values=padding_values,
-        padded_shapes=padded_shapes)
-
-  return _apply_fn
-
-
-def enqueue_in_queue_dataset(queue, components):
-  """Enqueue components into queue from `PrependFromQueueAndPaddedBatchDataset`.
-
-  The components' dtypes and shapes must be compatible with the `output_shapes`
-  attribute of the `dataset` created by
-  `prepend_from_queue_and_padded_batch_dataset`.  This operation supports both
-  non-batched and batched modes.
-
-  For more details, see the example in the docstring for
-  `prepend_from_queue_and_padded_batch_dataset`.
-
-  Args:
-    queue: `variant` scalar or vector tensor.
-      The tensor emitted by the first component of the iterator associated with
-      `prepend_from_queue_and_padded_batch_dataset`.  If this is a scalar,
-      then the `components` input tensors should not have a prepended batch
-      dimension.
-    components: Nested tuple of tensors, each with a leading batch dimension
-      if `queue` is a vector.  The structure, dtypes, and shapes
-      (excluding batch dimension) must match the nested tuples
-      `dataset.output_types[1]` and `dataset.output_shapes[1]` (the non-queue
-      output types and shapes) of the `dataset` emitted by
-      the original `prepend_from_queue_and_padded_batch_dataset` call.
-
-  Returns:
-    An `Operation` that enqueues `components` into the dataset(s) associated
-    with entries of `queue`.
-  """
-  return gen_dataset_ops.enqueue_in_queue_dataset(
-      queue=queue, components=tf_nest.flatten(components))
diff --git a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py b/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
deleted file mode 100644
index c1657fec7bbe4a3227c3ea273b72176ac4066c50..0000000000000000000000000000000000000000
--- a/tensorflow/contrib/training/python/training/tensor_queue_dataset_test.py
+++ /dev/null
@@ -1,355 +0,0 @@
-# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for TensorQueueDataset."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.contrib.training.python.training import tensor_queue_dataset as tqd
-from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-
-
-class PrependFromQueueAndPaddedBatchDatasetTest(test.TestCase):
-
-  def testNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    self.assertEqual((dtypes.variant, dtypes.int32), dataset.output_types)
-    self.assertAllEqual(([None],) * 2,
-                        [x.as_list() for x in dataset.output_shapes])
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertEqual([0], self.evaluate(value))
-    self.assertEqual([1], self.evaluate(value))
-    self.assertEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([0, 1], self.evaluate(value))
-    self.assertAllEqual([2], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingNoEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=2, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    self.assertAllEqual([[0, 0, 0], [1, 0, 0]], self.evaluate(value))
-    self.assertAllEqual([[2, 0, 0]], self.evaluate(value))
-    with self.assertRaisesOpError("End of sequence"):
-      self.evaluate(value)
-
-  def testBatchedWithBiggerPaddingOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0], [1], [2]])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[3]))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0, 0]], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[1, 0, 0]], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[-1, 0, 0]], value_2)
-      value_3 = sess.run(value)
-      self.assertAllEqual([[1, 0, 0]], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([[2, 0, 0]], value_4)
-      value_5 = sess.run(value)
-      self.assertAllEqual([[-2, 0, 0]], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    with self.cached_session() as sess:
-      self.assertEqual([0], sess.run(value))
-      value_1, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([-1], value_2)
-      value_3 = sess.run(value)
-      self.assertEqual([1], value_3)
-      value_4, _ = sess.run([value, enqueue_negative])
-      self.assertEqual([2], value_4)
-      value_5 = sess.run(value)
-      self.assertEqual([-2], value_5)
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testBatchedOneEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=2))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_negative = tqd.enqueue_in_queue_dataset(queue_handle, -value)
-    enqueue_zeroth = tqd.enqueue_in_queue_dataset([queue_handle[0]],
-                                                  array_ops.expand_dims(
-                                                      value[0], axis=0))
-    with self.cached_session() as sess:
-      value_0, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 1], value_0)
-      value_1, _ = sess.run([value, enqueue_zeroth])
-      self.assertAllEqual([0, -1], value_1)
-      value_2, _ = sess.run([value, enqueue_negative])
-      self.assertAllEqual([0, 2], value_2)
-      self.assertAllEqual([0, -2], sess.run(value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testManyEnqueue(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue_many_more = [
-        tqd.enqueue_in_queue_dataset(queue_handle, value + 100 + i)
-        for i in range(1000)
-    ]
-    with self.cached_session() as sess:
-      value_0, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual([0], value_0)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i] for i in range(1000)], sorted(rest))
-      # Going back to the original input.
-      value_1, _ = sess.run((value, enqueue_many_more))
-      self.assertEqual(1, value_1)
-      rest = []
-      for _ in range(1000):
-        rest.append(sess.run(value))
-      self.assertEquals([[100 + i + 1] for i in range(1000)], sorted(rest))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(value)
-
-  def testEnqueueWithPrefetch(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    # Prefetching will request additional values before they are
-    # available to the queue.
-    dataset = dataset.prefetch(buffer_size=3)
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-    enqueue = tqd.enqueue_in_queue_dataset(queue_handle, value + 1)
-    with self.cached_session() as sess:
-      i = 0
-      while i < 4:
-        received, _ = sess.run((value, enqueue))
-        if received.size > 0:
-          self.assertAllEqual([i], received)
-          i += 1
-      received_last = False
-      while True:
-        try:
-          received = sess.run(value)
-          if received.size > 0:
-            self.assertAllEqual([4], received)
-            received_last = True
-        except errors.OutOfRangeError:
-          break
-      self.assertTrue(received_last)
-
-  def testDatasetWithPaddedShapeSmallerThanInputFails(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([[0, 0, 0]]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=1, padded_shapes=[2]))
-    iterator = dataset.make_one_shot_iterator()
-    _, value = iterator.get_next()
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          r"Incompatible input shapes at component 0 between "
-          r"input dataset this dataset: \[3\] vs. \[2\]"):
-        sess.run(value)
-
-  def testEnqueueWithIncompatibleInputsFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0]).repeat(None)
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    iterator = dataset.make_one_shot_iterator()
-    queue_handle, value = iterator.get_next()
-
-    enqueue_bad_structure = tqd.enqueue_in_queue_dataset(
-        queue_handle, (value, value))
-    enqueue_bad_dtype = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [1.0],
-                                                         dtype=np.float32))
-    enqueue_bad_shape_no_batch_dim = tqd.enqueue_in_queue_dataset(
-        queue_handle, ([1],))
-    enqueue_bad_shape = tqd.enqueue_in_queue_dataset(queue_handle,
-                                                     np.array(
-                                                         [[1]], dtype=np.int32))
-
-    with self.cached_session() as sess:
-      with self.assertRaisesOpError(
-          "mismatched number of tensors.  Queue expects 1 tensors but "
-          "tried to insert 2"):
-        sess.run(enqueue_bad_structure)
-      with self.assertRaisesOpError(r"Expected component 0 to have batched "
-                                    r"shape \[1,...\], but saw shape: \[\]"):
-        sess.run(enqueue_bad_shape_no_batch_dim)
-      with self.assertRaisesOpError(
-          r"mismatched shapes at component 0.  Attempted to insert tensor "
-          r"with shape \[1\] but queue expected shape: \[\]"):
-        sess.run(enqueue_bad_shape)
-      with self.assertRaisesOpError(
-          r"mismatched dtypes at component 0.  Attempted to insert tensor "
-          r"of type float but queue expected type: int32"):
-        sess.run(enqueue_bad_dtype)
-
-  def testEnqueueWithPaddedBatchFailsWithInformativeError(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 1, 2])
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=1))
-    with self.assertRaisesRegexp(
-        TypeError, r"Unable to create padding for field of type 'variant'"):
-      dataset.padded_batch(batch_size=10, padded_shapes=[1])
-
-  def testOneEnqueueWithPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=3))
-
-    iterator = dataset.make_one_shot_iterator()
-    queue, (count, padded_value) = iterator.get_next()
-
-    # Split the padded_value into two pieces: head and rest
-    rest_indices = array_ops.squeeze(array_ops.where(count > 2), axis=1)
-    bound = math_ops.minimum(2, math_ops.reduce_max(count))
-    value_head = padded_value[:, :bound]
-    count_rest = array_ops.gather(count - 2, rest_indices)
-    value_rest = array_ops.gather(padded_value, rest_indices)[:, bound:]
-    queue_rest = array_ops.gather(queue, rest_indices)
-    enqueue_rest_op = tqd.enqueue_in_queue_dataset(queue_rest,
-                                                   (count_rest, value_rest))
-    with ops.control_dependencies([enqueue_rest_op]):
-      calc = array_ops.identity(value_head)
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0, 0], [2, 2], [4, 4]], sess.run(calc))
-      self.assertAllEqual([[4, 4], [6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      self.assertAllEqual([[6, 6]], sess.run(calc))
-      # Get some final batches due to prefetching.
-      for _ in range(3):
-        try:
-          self.assertAllEqual(
-              np.empty(shape=(0, 0), dtype=np.int32), sess.run(calc))
-        except errors.OutOfRangeError as e:
-          self.assertTrue(str(e).startswith("End of sequence"))
-
-  def testNonstandardPadding(self):
-    dataset = dataset_ops.Dataset.from_tensor_slices([0, 2, 4, 6])
-    # Make a dataset of variable-length vectors and their lengths.
-    dataset = dataset.map(
-        lambda c: (c, c * array_ops.ones((c,), dtype=c.dtype)))
-    # Emit a queue we can prepend to, and counts/values as padded
-    # batch.
-    dataset = dataset.apply(
-        tqd.prepend_from_queue_and_padded_batch_dataset(
-            batch_size=3, padding_values=(
-                0,
-                -1,
-            )))
-
-    iterator = dataset.make_one_shot_iterator()
-    _, (unused_count, padded_value) = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([[-1, -1, -1, -1], [2, 2, -1, -1], [4, 4, 4, 4]],
-                          sess.run(padded_value))
-      self.assertAllEqual([[6] * 6], sess.run(padded_value))
-      with self.assertRaisesOpError("End of sequence"):
-        sess.run(padded_value)
-
-
-# TODO(ebrevdo): Figure out how to use run_core_tests to test state
-# saving of an iterator that's had some tensors enqueued into its queue.
-class PrependFromQueueAndPaddedBatchDatasetSerializationTest(
-    dataset_serialization_test_base.DatasetSerializationTestBase):
-
-  def testPrependFromQueueAndPaddedBatch(self):
-
-    def build_dataset(seq_lens):
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          lambda x: array_ops.fill([x], x)).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(batch_size=4))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-  def testPrependFromQueueAndPaddedBatchNonDefaultPadding(self):
-
-    def build_dataset(seq_lens):
-
-      def fill_tuple(x):
-        filled = array_ops.fill([x], x)
-        return (filled, string_ops.as_string(filled))
-
-      padded_shape = [-1]
-      return dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
-          fill_tuple).apply(
-              tqd.prepend_from_queue_and_padded_batch_dataset(
-                  batch_size=4,
-                  padded_shapes=(padded_shape, padded_shape),
-                  padding_values=(-1, "<end>")))
-
-    seq_lens1 = np.random.randint(1, 20, size=(32,)).astype(np.int32)
-    seq_lens2 = np.random.randint(21, 40, size=(32,)).astype(np.int32)
-    self.run_core_tests(lambda: build_dataset(seq_lens1),
-                        lambda: build_dataset(seq_lens2), 8)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/contrib/verbs/rdma.cc b/tensorflow/contrib/verbs/rdma.cc
index f7c979e86320d59ad033e2b8d7fcdff89ce0d133..9db80f6b5736d849d88e1e41ea467a5ff11844f5 100644
--- a/tensorflow/contrib/verbs/rdma.cc
+++ b/tensorflow/contrib/verbs/rdma.cc
@@ -30,7 +30,6 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rendezvous_mgr_interface.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/session_mgr.h"
-#include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/core/status.h"
@@ -1028,7 +1027,10 @@ Status RdmaTensorResponse::PrepareRecvTensor(
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        channel_->adapter_->worker_env_->session_mgr->LegacySession()
+            ->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index a701b38d4b3e736a72f20084dbaa6489f1232fb0..66714235b535c14a8f13c40bb2a4df8d7494dc05 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -95,7 +95,8 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cc_tests_gpu")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_version_info_genrule")
-load("//tensorflow:tensorflow.bzl", "if_not_tx2_llvm_or_windows_cuda")
+load("//tensorflow:tensorflow.bzl", "if_nccl")
+load("//tensorflow:tensorflow.bzl", "tensorflow_opensource_extra_deps")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_only_cc_test")
 
 # For platform specific build config
@@ -112,6 +113,7 @@ load(
     "tf_additional_device_tracer_test_flags",
     "tf_additional_gdr_lib_defines",
     "tf_additional_human_readable_json_deps",
+    "tf_additional_logger_deps",
     "tf_additional_lib_defines",
     "tf_additional_lib_deps",
     "tf_additional_lib_hdrs",
@@ -300,6 +302,7 @@ filegroup(
         "platform/env_time.h",
         "platform/logging.h",
         "platform/macros.h",
+        "platform/platform_strings.h",
         "platform/types.h",
     ],
     visibility = ["//visibility:private"],
@@ -442,6 +445,18 @@ cc_library(
     ] + tf_additional_human_readable_json_deps(),
 )
 
+cc_library(
+    name = "logger",
+    srcs = tf_platform_srcs(["logger.cc"]),
+    hdrs = ["platform/logger.h"] + tf_platform_hdrs(["logger.h"]),
+    copts = tf_copts(),
+    visibility = ["//visibility:public"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+    ] + tf_additional_logger_deps(),
+)
+
 filegroup(
     name = "platform_env_hdrs",
     srcs = [
@@ -477,7 +492,10 @@ cc_library(
         ":platform_env_internal_hdrs",
     ],
     copts = tf_copts(),
-    visibility = ["//tensorflow/core:__subpackages__"],
+    visibility = [
+        "//tensorflow/c:__subpackages__",
+        "//tensorflow/core:__subpackages__",
+    ],
     deps = [
         ":error_codes_proto_cc",
         ":lib",
@@ -519,6 +537,19 @@ cc_library(
     ],
 )
 
+cc_library(
+    name = "platform_strings",
+    srcs = tf_platform_srcs([
+        "platform/platform_strings.cc",
+        "platform/platform_strings_computed.h",
+    ]),
+    hdrs = [
+        "platform/platform_strings.h",
+    ],
+    visibility = ["//tensorflow/core:__subpackages__"],
+    deps = [":lib"],
+)
+
 filegroup(
     name = "platform_other_hdrs",
     srcs = [
@@ -841,6 +872,7 @@ tf_cuda_library(
         "framework/dataset_stateful_op_whitelist.h",
         "framework/device_base.h",
         "framework/function.h",
+        "framework/function_handle_cache.h",
         "framework/graph_def_util.h",
         "framework/graph_to_functiondef.h",
         "framework/kernel_def_builder.h",
@@ -884,6 +916,7 @@ tf_cuda_library(
         "util/bcast.h",
         "util/cuda_kernel_helper.h",
         "util/device_name_utils.h",
+        "util/dump_graph.h",
         "util/events_writer.h",
         "util/example_proto_fast_parsing.h",
         "util/example_proto_helper.h",
@@ -901,6 +934,7 @@ tf_cuda_library(
         "util/stream_executor_util.h",
         "util/strided_slice_op.h",
         "util/tensor_format.h",
+        "util/tensor_ops_util.h",
         "util/tensor_slice_reader.h",
         "util/tensor_slice_reader_cache.h",
         "util/tensor_slice_writer.h",
@@ -1038,6 +1072,7 @@ tf_gen_op_libs(
         "batch_ops",
         "bitwise_ops",
         "boosted_trees_ops",
+        "tensor_forest_ops",
         "candidate_sampling_ops",
         "checkpoint_ops",
         "collective_ops",
@@ -1085,7 +1120,11 @@ tf_gen_op_libs(
     op_lib_names = [
         "string_ops",
     ],
-    deps = ["@com_google_absl//absl/strings"],
+    deps = [
+        ":lib_internal",
+        ":lib_proto_parsing",
+        "@com_google_absl//absl/strings",
+    ],
 )
 
 tf_gen_op_libs(
@@ -1187,6 +1226,7 @@ cc_library(
         ":batch_ops_op_lib",
         ":bitwise_ops_op_lib",
         ":boosted_trees_ops_op_lib",
+        ":tensor_forest_ops_op_lib",
         ":candidate_sampling_ops_op_lib",
         ":checkpoint_ops_op_lib",
         ":collective_ops_op_lib",
@@ -1340,6 +1380,7 @@ cc_library(
         "//tensorflow/core/kernels:batch_kernels",
         "//tensorflow/core/kernels:bincount_op",
         "//tensorflow/core/kernels:boosted_trees_ops",
+        "//tensorflow/core/kernels:tensor_forest_ops",
         "//tensorflow/core/kernels:candidate_sampler_ops",
         "//tensorflow/core/kernels:checkpoint_ops",
         "//tensorflow/core/kernels:collective_ops",
@@ -1386,9 +1427,7 @@ cc_library(
         "//tensorflow/core/kernels:summary_kernels",
         "//tensorflow/core/kernels:training_ops",
         "//tensorflow/core/kernels:word2vec_kernels",
-    ] + tf_additional_cloud_kernel_deps() + if_not_tx2_llvm_or_windows_cuda([
-        "//tensorflow/core/kernels:nccl_kernels",
-    ]) + if_not_windows([
+    ] + tf_additional_cloud_kernel_deps() + if_not_windows([
         "//tensorflow/core/kernels:fact_op",
         "//tensorflow/core/kernels:array_not_windows",
         "//tensorflow/core/kernels:math_not_windows",
@@ -1413,6 +1452,8 @@ cc_library(
     ]) + if_cuda([
         "//tensorflow/core/grappler/optimizers:gpu_swapping_kernels",
         "//tensorflow/core/grappler/optimizers:gpu_swapping_ops",
+    ]) + if_nccl([
+        "//tensorflow/core/kernels:nccl_kernels",
     ]),
 )
 
@@ -1437,7 +1478,7 @@ tf_cuda_library(
         ":gpu_runtime",
         ":lib",
         ":ops",
-    ],
+    ] + tensorflow_opensource_extra_deps(),
 )
 
 cc_library(
@@ -1577,6 +1618,8 @@ filegroup(
             "util/stats_calculator.*",
             "util/reporter.*",
             "platform/**/cuda_libdevice_path.*",
+            "platform/**/logger.cc",
+            "platform/**/logger.h",
             "platform/default/test_benchmark.*",
             "platform/cuda.h",
             "platform/google/**/*",
@@ -1639,6 +1682,9 @@ filegroup(
 # operators, use :android_tensorflow_lib if you want full operator
 # support.
 #
+# If you just need TensorFlow types, e.g. Tensors, use
+# :android_tensorflow_lib_lite_no_runtime.
+#
 # Compiles to a trivial library on non-Android to prevent irrelevant
 # build errors. If not building this as part of an android_binary,
 # a command such as the following must be used:
@@ -1649,7 +1695,33 @@ filegroup(
 cc_library(
     name = "android_tensorflow_lib_lite",
     srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ],
+    linkopts = ["-lz"],
+    tags = [
+        "manual",
+        "notap",
+    ],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":mobile_additional_lib_deps",
+        ":protos_all_cc_impl",
+        ":stats_calculator_portable",
+        "//third_party/eigen3",
+        "@double_conversion//:double-conversion",
+        "@nsync//:nsync_cpp",
+        "@protobuf_archive//:protobuf",
+    ],
+    alwayslink = 1,
+)
+
+cc_library(
+    name = "android_tensorflow_lib_lite_nortti",
+    srcs = if_android(["//tensorflow/core:android_srcs"]),
+    copts = tf_copts(android_optimization_level_override = None) + [
+        "-DSUPPORT_SELECTIVE_REGISTRATION",
+    ] + tf_opts_nortti_if_android(),
     linkopts = ["-lz"],
     tags = [
         "manual",
@@ -1671,8 +1743,8 @@ cc_library(
 cc_library(
     name = "mobile_additional_lib_deps",
     deps = tf_additional_lib_deps() + [
+        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
-        "@com_google_absl//absl/time",
     ],
 )
 
@@ -1761,50 +1833,21 @@ cc_library(
 # Does not contain operators. In contrast to android_tensorflow_lib_lite,
 # this links in framework support for all types, relying on selective
 # registration of ops to prune code size.
-cc_library(
+#
+# TODO(gonnet): Move all users of these aliases to the corresponding
+#     :android_tensorflow_lib_lite* targets and remove.
+alias(
     name = "android_tensorflow_lib_selective_registration",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
+    actual = ":android_tensorflow_lib_lite",
     visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
 )
 
 # Android library for use with the SELECTIVE_REGISTRATION feature with
 # no proto_rtti.
-cc_library(
+alias(
     name = "android_tensorflow_lib_selective_registration_nortti",
-    srcs = if_android(["//tensorflow/core:android_srcs"]),
-    copts = tf_copts(android_optimization_level_override = None) + tf_opts_nortti_if_android() + [
-        "-DSUPPORT_SELECTIVE_REGISTRATION",
-    ],
-    linkopts = if_android(["-lz"]),
-    tags = [
-        "manual",
-        "notap",
-    ],
+    actual = ":android_tensorflow_lib_lite_nortti",
     visibility = ["//visibility:public"],
-    deps = [
-        ":protos_all_cc_impl",
-        "//third_party/eigen3",
-        "@double_conversion//:double-conversion",
-        "@nsync//:nsync_cpp",
-        "@protobuf_archive//:protobuf",
-    ],
-    alwayslink = 1,
 )
 
 filegroup(
@@ -2045,9 +2088,7 @@ tf_proto_library_cc(
     srcs = ["protobuf/master.proto"],
     cc_api_version = 2,
     protodeps = tf_additional_all_protos(),
-    visibility = [
-        "//tensorflow:internal",
-    ],
+    visibility = ["//tensorflow:internal"],
 )
 
 tf_proto_library_cc(
@@ -2187,6 +2228,7 @@ cc_library(
             "platform/**/env_time.cc",
             "platform/**/cuda_libdevice_path.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logger.cc",
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
@@ -2199,6 +2241,7 @@ cc_library(
             "platform/**/stream_executor.h",
             "platform/**/env_time.cc",
             "platform/**/device_tracer.cc",
+            "platform/**/logger.cc",
             "platform/**/logging.cc",
             "platform/**/human_readable_json.cc",
             "platform/abi.cc",
@@ -2641,6 +2684,8 @@ tf_cuda_library(
         ":stats_calculator_portable",
         ":version_lib",
         "@com_google_absl//absl/base",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
         "//tensorflow/core/platform/default/build_config:platformlib",
         "//tensorflow/core/kernels:bounds_check",
         "//third_party/eigen3",
@@ -2943,6 +2988,7 @@ tf_cuda_library(
         ":lib_internal",
         ":proto_text",
         ":protos_all_cc",
+        "@com_google_absl//absl/memory",
         "//third_party/eigen3",
         "//tensorflow/core/grappler:grappler_item",
     ] + mkl_deps(),
@@ -3008,7 +3054,6 @@ tf_cuda_library(
     hdrs = ["common_runtime/metrics.h"],
     deps = [
         ":lib",
-        "@com_google_absl//absl/time",
     ],
 )
 
@@ -3033,7 +3078,6 @@ tf_cuda_library(
         ":protos_all_cc",
         "//tensorflow/core/debug:debug_graph_utils",
         "//tensorflow/core/kernels:function_ops",
-        "@com_google_absl//absl/time",
     ],
     alwayslink = 1,
 )
@@ -3393,6 +3437,7 @@ tf_cc_tests(
         "platform/profile_utils/cpu_utils_test.cc",
         "platform/stacktrace_handler_test.cc",
         "platform/subprocess_test.cc",
+        "platform/vmodule_benchmark_test.cc",
     ],
     deps = [
         ":lib",
@@ -3406,6 +3451,20 @@ tf_cc_tests(
     ],
 )
 
+tf_cc_test(
+    name = "vmodule_test",
+    srcs = ["platform/vmodule_test.cc"],
+    tags = ["optonly"],
+    deps = [
+        ":lib",
+        ":lib_internal",
+        ":lib_test_internal",
+        ":protos_all_cc",
+        ":test",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_cc_test(
     name = "lib_random_random_distributions_test",
     srcs = ["lib/random/random_distributions_test.cc"],
@@ -3421,6 +3480,16 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "platform_strings_test",
+    size = "small",
+    srcs = ["platform/platform_strings_test.cc"],
+    deps = [
+        ":lib",
+        ":platform_strings",
+    ],
+)
+
 tf_cc_test(
     name = "platform_env_test",
     size = "small",
@@ -3668,6 +3737,7 @@ tf_cc_tests(
         "util/bcast_test.cc",
         "util/command_line_flags_test.cc",
         "util/device_name_utils_test.cc",
+        "util/dump_graph_test.cc",
         "util/equal_graph_def_test.cc",
         "util/events_writer_test.cc",
         "util/example_proto_fast_parsing_test.cc",
@@ -3798,6 +3868,7 @@ tf_cc_tests_gpu(
         ":test",
         ":test_main",
         ":testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -3826,6 +3897,7 @@ tf_cc_tests_gpu(
         ":test",
         ":test_main",
         ":testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -4099,6 +4171,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:identity_op",
         "//tensorflow/core/kernels:immutable_constant_op",
         "//tensorflow/core/kernels:matmul_op",
+        "//tensorflow/core/kernels:topk_op",
         "//third_party/eigen3",
     ],
 )
@@ -4392,6 +4465,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:random_ops",
         "//tensorflow/core/kernels:shape_ops",
         "//third_party/eigen3",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -4871,6 +4945,7 @@ transitive_hdrs(
         "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:platform_strings",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:stream_executor",
     ],
diff --git a/tensorflow/core/api_def/api_test.cc b/tensorflow/core/api_def/api_test.cc
index 6f9885691595368ab50cfe660b1b5c75673063cf..7405e2ace72d1c08cf87cc0040e617379e18149b 100644
--- a/tensorflow/core/api_def/api_test.cc
+++ b/tensorflow/core/api_def/api_test.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/types.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 namespace tensorflow {
 namespace {
@@ -182,11 +181,14 @@ void TestDeprecationVersionSetCorrectly(
   for (const auto& name_and_api_def : api_defs_map) {
     const auto& name = name_and_api_def.first;
     const auto& api_def = name_and_api_def.second;
-    ASSERT_TRUE(api_def.deprecation_version() == 0 ||
-                api_def.deprecation_message().empty())
-        << "ApiDef that includes deprecation_version > 0 must also specify "
-        << "a deprecation_message. Op " << name
-        << " has deprecation_version > 0 but deprecation_message is not set.";
+    if (api_def.deprecation_version() != 0) {
+      ASSERT_TRUE(api_def.deprecation_version() > 0)
+          << "Found ApiDef with negative deprecation_version";
+      ASSERT_FALSE(api_def.deprecation_message().empty())
+          << "ApiDef that includes deprecation_version > 0 must also specify "
+          << "a deprecation_message. Op " << name
+          << " has deprecation_version > 0 but deprecation_message is not set.";
+    }
   }
 }
 }  // namespace
diff --git a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
index 639d962874d083472e6df13550e107026fd2d0a1..32def912f83e420eab58a3071f573ae81139a298 100644
--- a/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_BatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "BatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
index 6889b8ea148b57da847964c062bd52b1027b8d22..9f7088b90077544ca11fff08dae526140ca1aa6e 100644
--- a/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_CacheDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "CacheDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filename"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
index 67281f9547ac6bb9df5b19e9f31da891454993bd..7997d8daaf91e47044f0729fb8a3c80d69d13acc 100644
--- a/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ConcatenateDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ConcatenateDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that concatenates `input_dataset` with `another_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
index 2b9dffd883250fd5631444252e7b236116e2e822..27d7d6b98684e10853f2f73373a756f0006daa0e 100644
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_DatasetToSingleElement.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "DatasetToSingleElement"
+  visibility: HIDDEN
   in_arg {
     name: "dataset"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 9722f5ede30cb0b893171bfc36a0eb8c1ab3c7e2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
similarity index 56%
rename from tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
index 73df11b2f75f82fad174fb7e77eccbef35c2c7d1..dc296162ae83117d349147c2655756c59384c051 100644
--- a/tensorflow/core/api_def/base_api/api_def_BytesProducedStatsDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalBytesProducedStatsDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "BytesProducedStatsDataset"
+  graph_op_name: "ExperimentalBytesProducedStatsDataset"
+  visibility: HIDDEN
   summary: "Records the bytes size of each element of `input_dataset` in a StatsAggregator."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ac014bcc5e6ae48cdecd6acefca267da3f2fe4f1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetCardinality.pbtxt
@@ -0,0 +1,21 @@
+op {
+  graph_op_name: "ExperimentalDatasetCardinality"
+  visibility: HIDDEN
+  in_arg {
+    name: "input_dataset"
+    description: <<END
+A variant tensor representing the dataset to return cardinality for.
+END
+  }
+  out_arg {
+    name: "cardinality"
+    description: <<END
+The cardinality of `input_dataset`. Named constants are used to represent
+infinite and unknown cardinality.
+END
+  }
+  summary: "Returns the cardinality of `input_dataset`."
+  description: <<END
+Returns the cardinality of `input_dataset`.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
similarity index 91%
rename from tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
index e1b8a9abdd2bec0fda690f96d266569b2fb2fcab..085d20d7bf1882accfa3380465568774d1459afb 100644
--- a/tensorflow/core/api_def/base_api/api_def_DatasetToTFRecord.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDatasetToTFRecord.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "DatasetToTFRecord"
+  graph_op_name: "ExperimentalDatasetToTFRecord"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
similarity index 89%
rename from tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
index e275cfdd3de5de36979967b1d85d1ae9cd0582a8..8ebd6d88a8b9ff9e0a855215a0167f043d083bad 100644
--- a/tensorflow/core/api_def/base_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalDenseToSparseBatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "DenseToSparseBatchDataset"
+  graph_op_name: "ExperimentalDenseToSparseBatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "input_dataset"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
deleted file mode 100644
index 66511eff60b900ab061c96d310ead3dfb7b3eba4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResource.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResource"
-  in_arg {
-    name: "string_arg"
-    description: <<END
-String argument to the function call.
-END
-  }
-  in_arg {
-    name: "target_device"
-    description: <<END
-Target device to execute the function on.
-END
-  }
-  out_arg {
-    name: "resource"
-    description: <<END
-Handle to the resource created.
-END
-  }
-  attr {
-    name: "shared_name"
-    description: <<END
-If non-empty, this resource will be shared under the given name across
-multiple sessions.
-END
-  }
-  attr {
-    name: "container"
-    description: <<END
-If non-empty, this resource is placed in the given container.
-Otherwise, a default container is used.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-Function to be executed.
-END
-  }
-  attr {
-    name: "buffer_size"
-    description: <<END
-Size of the buffer.
-END
-  }
-  attr {
-    name: "output_types"
-    description: <<END
-The type list for the return values.
-END
-  }
-  summary: <<END
-Creates a resource that fills up a buffer by making function calls.
-END
-  visibility: HIDDEN
-}
-
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
deleted file mode 100644
index bf4b66b22bfe23312ddfcb86ef0084d1d2fa71ea..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceGetNext.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResourceGetNext"
-  in_arg {
-    name: "function_buffer_resource"
-    description: <<END
-The FunctionBufferingResource handle.
-END
-  }
-  out_arg {
-    name: "output"
-    description: <<END
-A list of return values.
-END
-  }
-  attr {
-    name: "output_types"
-    description: <<END
-The type list for the return values.
-END
-  }
-  summary: <<END
-Gets the next element from a FunctionBufferingResource.
-END
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
deleted file mode 100644
index 729718ddb3d4480f10f395f34e76d47a8b0f8b28..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_ExperimentalFunctionBufferingResourceReset.pbtxt
+++ /dev/null
@@ -1,13 +0,0 @@
-op {
-  graph_op_name: "ExperimentalFunctionBufferingResourceReset"
-  in_arg {
-    name: "function_buffer_resource"
-    description: <<END
-The FunctionBufferingResource handle.
-END
-  }
-  summary: <<END
-Resets the FunctionBufferingResource.
-END
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
similarity index 97%
rename from tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
index 067ad4018b09d4909325dbc152e30a0afcf29235..dd132802fac8cbbd06872cd50415d3a5d29abc38 100644
--- a/tensorflow/core/api_def/base_api/api_def_GroupByReducerDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByReducerDataset.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "GroupByReducerDataset"
+  graph_op_name: "ExperimentalGroupByReducerDataset"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
similarity index 82%
rename from tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
index ea6bcd469577d02e39afbeb2ba0c8b467e312ba9..6e4c12ed815d8119999852056a473b76e2d4ab90 100644
--- a/tensorflow/core/api_def/base_api/api_def_GroupByWindowDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalGroupByWindowDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "GroupByWindowDataset"
+  graph_op_name: "ExperimentalGroupByWindowDataset"
+  visibility: HIDDEN
   attr {
     name: "key_func"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
similarity index 58%
rename from tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
index 78d946b0b47044855ff145e9492fdb3721ff0044..e7351b9d70a75285351534d474209339b6bcbce4 100644
--- a/tensorflow/core/api_def/base_api/api_def_LatencyStatsDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalLatencyStatsDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "LatencyStatsDataset"
+  graph_op_name: "ExperimentalLatencyStatsDataset"
+  visibility: HIDDEN
   summary: "Records the latency of producing `input_dataset` elements in a StatsAggregator."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
similarity index 96%
rename from tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
index 81ef92cae0c95c765a82c993f58f261509c47d71..bc4270670c5369d6d7440b50dae98f367453b3d9 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDatasetV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapAndBatchDataset.pbtxt
@@ -1,5 +1,5 @@
 op {
-  graph_op_name: "MapAndBatchDatasetV2"
+  graph_op_name: "ExperimentalMapAndBatchDataset"
   visibility: HIDDEN
   in_arg {
     name: "input_dataset"
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e9619edcac1cce1bf8ab73ab271b647f902539bb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMapDataset.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalMapDataset"
+  summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..993a79814907a0d11c639ce60a785f740ec665c7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMatchingFilesDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalMatchingFilesDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a18aa378ffa1e6f8a1d857760b30d81f9afa15b1
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalMaxIntraOpParallelismDataset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalMaxIntraOpParallelismDataset"
+  in_arg {
+    name: "max_intra_op_parallelism"
+    description: <<END
+Identifies the maximum intra-op parallelism to use.
+END
+  }
+  summary: <<END
+Creates a dataset that overrides the maximum intra-op parallelism.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
similarity index 90%
rename from tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
index d6889b54a032bb20896dc7b03af5621f45d365d9..dd70e3328493825b268fc1a2f6e1c85207a426bf 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParallelInterleaveDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParallelInterleaveDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "ParallelInterleaveDataset"
+  graph_op_name: "ExperimentalParallelInterleaveDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
similarity index 96%
rename from tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
index 3de2f18fc28b57171b478f43c64a88d72069a89f..2de13c5ceef4eced73f6e0984e70921926ece7f2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParseExampleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalParseExampleDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "ParseExampleDataset"
+  graph_op_name: "ExperimentalParseExampleDataset"
+  visibility: HIDDEN
   in_arg {
     name: "dense_defaults"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..eaa49b7fa5e9f98f02586d9922b00f0bda3af908
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalPrivateThreadPoolDataset.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "ExperimentalPrivateThreadPoolDataset"
+  in_arg {
+    name: "num_threads"
+    description: <<END
+Identifies the number of threads to use for the private threadpool.
+END
+  }
+  summary: <<END
+Creates a dataset that uses a custom thread pool to compute `input_dataset`.
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
similarity index 86%
rename from tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
index 0466b40f85eb118c94404e2f0d7670392bc7afdf..f5d7bc4adb79ac63aaf41f03063b26257ebee429 100644
--- a/tensorflow/core/api_def/base_api/api_def_RandomDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalRandomDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "RandomDataset"
+  graph_op_name: "ExperimentalRandomDataset"
+  visibility: HIDDEN
   in_arg {
     name: "seed"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
similarity index 61%
rename from tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
index e83d4a9e967f959b19adc5fad38a7141f8936cc4..4742cf4d57ff471178f0d59d9fd8a99a1e6f2166 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScanDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalScanDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "ScanDataset"
+  graph_op_name: "ExperimentalScanDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset successively reduces `f` over the elements of `input_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6e6b2f81b333899e3cdc2723edb537507f541a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSetStatsAggregatorDataset.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ExperimentalSetStatsAggregatorDataset"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
similarity index 88%
rename from tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
index ddde3ee5b4ef1d82cc244563d4835e319a9dc50a..dc62750b66a996d1429fcd8477bcd57b7b488dda 100644
--- a/tensorflow/core/api_def/base_api/api_def_SlideDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSlidingWindowDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "SlideDataset"
+  graph_op_name: "ExperimentalSlidingWindowDataset"
+  visibility: HIDDEN
   in_arg {
     name: "window_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
similarity index 87%
rename from tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
index 7570d5da5662b8eab90e7dd00f8cb225a963d373..35cddbd061917e397aa7b10e7fee43033adfc2e2 100644
--- a/tensorflow/core/api_def/base_api/api_def_SqlDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalSqlDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
-  graph_op_name: "SqlDataset"
+  graph_op_name: "ExperimentalSqlDataset"
+  visibility: HIDDEN
   in_arg {
     name: "driver_name"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8a770d462d54230340ac278f755b997d7c9144a5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorHandle.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "ExperimentalStatsAggregatorHandle"
+  visibility: HIDDEN
+  summary: "Creates a statistics manager resource."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
similarity index 56%
rename from tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
index bcaf9fea1af5123848b2d6267b3ef0f7279a7230..ffe010368918a2134fa70d3bc6d6fb30a7dbc2c5 100644
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorSummary.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalStatsAggregatorSummary.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "StatsAggregatorSummary"
+  graph_op_name: "ExperimentalStatsAggregatorSummary"
+  visibility: HIDDEN
   summary: "Produces a summary of any statistics recorded by the given statistics manager."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
similarity index 57%
rename from tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
rename to tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
index 324fadac0af5088e86e61beaaa27f2111cfd4b82..c89e1fd0bdd6ef594797233170b41cb86521c84f 100644
--- a/tensorflow/core/api_def/base_api/api_def_UnbatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ExperimentalUnbatchDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
-  graph_op_name: "UnbatchDataset"
+  graph_op_name: "ExperimentalUnbatchDataset"
+  visibility: HIDDEN
   summary: "A dataset that splits the elements of its input into multiple elements."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
index 4e48d6c169b6641ece5f11d5add478ce25611ee8..0ba2327371a4ba0f5f553815fc9e8c991f62b424 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
index 555f8e60673d71e43dbb5d4dc17ae345606a2089..c7b780a56f04298bc7906955cb17bc335ec4e8d5 100644
--- a/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
index fd60c0f3785a22f456c63285bf59381e6a2a5d66..776529bc593b10915c6be8c4a3bdac6e6b131c32 100644
--- a/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FilterDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FilterDataset"
+  visibility: HIDDEN
   in_arg {
     name: "other_arguments"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
index 651b84d0d660a0bfc0ef45dd841dfc51ee1e3340..3b142432582146fcc0534d36d1aa063b71f11338 100644
--- a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FixedLengthRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
index ad82eddb587b40e8ab61dd55aa3dc277aefd03d5..def9f85e02d9d34412ed42d7774d77e8b6a328e0 100644
--- a/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FixedLengthRecordDatasetV2.pbtxt
@@ -1,3 +1,4 @@
 op {
   graph_op_name: "FixedLengthRecordDatasetV2"
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
index 1936119c50f5323e69465a79cda784afc68c3aca..1e20e853254ccb5086b3b52f473a4a823fefefe8 100644
--- a/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_FlatMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "FlatMapDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
index 4f1cf3e6867a06df1f39774bc389fbe35a994ab4..06e9a6463e76dbf43caae878b62afcba55e6995d 100644
--- a/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_GeneratorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "GeneratorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that invokes a function to generate elements."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
index b793c99cf74408305b48dbbf1c9df7b03d09b2f3..c17a84000560e9e14e10326e42e84dd49d924bf2 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most
+A complex tensor of the same shape as `input`. The inner-most
   dimension of `input` is replaced with its inverse 1D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
index 7f38f14308de70fb0ebc229064d010762055c458..7458d233ec8bd385e7976095d0cf89dfa0b36ace 100644
--- a/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_IFFT2D.pbtxt
@@ -3,13 +3,13 @@ op {
   in_arg {
     name: "input"
     description: <<END
-A complex64 tensor.
+A complex tensor.
 END
   }
   out_arg {
     name: "output"
     description: <<END
-A complex64 tensor of the same shape as `input`. The inner-most 2
+A complex tensor of the same shape as `input`. The inner-most 2
   dimensions of `input` are replaced with their inverse 2D Fourier transform.
 
 @compatibility(numpy)
diff --git a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
index bec2828e2462227b962bc045d796484a10365452..597edf5fb2b2d1c1f9d5a97992ec074385407f47 100644
--- a/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_InterleaveDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "InterleaveDataset"
+  visibility: HIDDEN
   attr {
     name: "f"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..35dbee8364ec596ee18cf8892361ee3112a7764a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_Lu.pbtxt
@@ -0,0 +1,51 @@
+op {
+  graph_op_name: "Lu"
+  in_arg {
+    name: "input"
+    description: <<END
+A tensor of shape `[..., M, M]` whose inner-most 2 dimensions form matrices of
+size `[M, M]`.
+END
+  }
+  out_arg {
+    name: "lu"
+    description: <<END
+A tensor of shape `[..., M, M]` whose strictly lower triangular part denotes the
+lower triangular factor `L` with unit diagonal, and whose upper triangular part
+denotes the upper triangular factor `U`.
+END
+  }
+  out_arg {
+    name: "p"
+    description: <<END
+Permutation of the rows encoded as a list of indices in `0..M-1`. Shape is
+`[..., M]`.
+@compatibility(scipy)
+Similar to `scipy.linalg.lu`, except the triangular factors `L` and `U` are
+packed into a single tensor, the permutation is applied to `input` instead of
+the right hand side and the permutation `P` is returned as a list of indices
+instead of a permutation matrix.
+@end_compatibility
+END
+  }
+  summary: "Computes the LU decomposition of one or more square matrices."
+  description: <<END
+The input is a tensor of shape `[..., M, M]` whose inner-most 2 dimensions
+form square matrices.
+
+The input has to be invertible.
+
+The output consists of two tensors LU and P containing the LU decomposition
+of all input submatrices `[..., :, :]`. LU encodes the lower triangular and
+upper triangular factors.
+
+For each input submatrix of shape `[M, M]`, L is a lower triangular matrix of
+shape `[M, M]` with unit diagonal whose entries correspond to the strictly lower
+triangular part of LU. U is a upper triangular matrix of shape `[M, M]` whose
+entries correspond to the upper triangular part, including the diagonal, of LU.
+
+P represents a permutation matrix encoded as a list of indices each between `0`
+and `M-1`, inclusive. If P_mat denotes the permutation matrix corresponding to
+P, then the L, U and P satisfies P_mat * input = L * U.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index e230c51edfe9355b556812b0946b3a4879f160bc..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-  in_arg {
-    name: "input_dataset"
-    description: <<END
-A variant tensor representing the input dataset.
-END
-  }
-  in_arg {
-    name: "other_arguments"
-    description: <<END
-A list of tensors, typically values that were captured when building a closure
-for `f`.
-END
-  }
-  in_arg {
-    name: "batch_size"
-    description: <<END
-A scalar representing the number of elements to accumulate in a
-batch. It determines the number of concurrent invocations of `f` that process
-elements from `input_dataset` in parallel.
-END
-  }
-  in_arg {
-    name: "num_parallel_batches"
-    description: <<END
-A scalar representing the number of batches to create in parallel. Processing
-multiple batches in parallel benefits workloads prone to stragglers.
-END
-  }
-  in_arg {
-    name: "drop_remainder"
-    description: <<END
-A scalar representing whether the last batch should be dropped in case its size
-is smaller than desired.
-END
-  }
-  attr {
-    name: "f"
-    description: <<END
-A function to apply to the outputs of `input_dataset`.
-END
-  }
-  summary: "Creates a dataset that fuses mapping with batching."
-  description: <<END
-Creates a dataset that applies `f` to the outputs of `input_dataset` and then
-batches `batch_size` of them.
-
-Unlike a "MapDataset", which applies `f` sequentially, this dataset invokes up
-to `batch_size * num_parallel_batches` copies of `f` in parallel.
-END
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
index 76d63ec2478e07d5af09754dc63994841119fa56..4f235f49461465931c6b863b2007c512511c873c 100644
--- a/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_MapDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "MapDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that applies `f` to the outputs of `input_dataset`."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_MatchingFilesDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_MatchingFilesDataset.pbtxt
deleted file mode 100644
index ab2a33108d1117c5eab535c4ef9f7276d6e27a42..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_MatchingFilesDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MatchingFilesDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
index d243dfe8b67bc14e9c5e22d5e68e3faf5d4684a8..53f4d94ecc8810a38aaafac29438d8186636684a 100644
--- a/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PaddedBatchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PaddedBatchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "batch_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
index 313494dd738b02d09807ec78fc8e0802e719e116..5343605edd5859d2cafa656f3821a318e24d0b09 100644
--- a/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ParallelMapDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ParallelMapDataset"
+  visibility: HIDDEN
   in_arg {
     name: "num_parallel_calls"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
index e158eedc6f0ef11de3c8979d65dd69d8bece1eb4..a71336a285542bc4bdf095fb2ac477ea975725c0 100644
--- a/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_PrefetchDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "PrefetchDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index d4549340fac6d59cc994050e65f5a0016f2d52ab..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
index c43142599bbe097d411e2bd89fe2045e79c924bb..dff7c8754f90026c69f22a3a1eea097b946a8c1f 100644
--- a/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_QuantizeAndDequantizeV2.pbtxt
@@ -39,6 +39,19 @@ END
     name: "range_given"
     description: <<END
 Whether the range is given or should be determined from the `input` tensor.
+END
+  }
+  attr {
+    name: "round_mode"
+    description: <<END
+The 'round_mode' attribute controls which rounding tie-breaking algorithm is
+used when rounding float values to their quantized equivalents. The following
+rounding modes are currently supported:
+
+*   HALF_TO_EVEN: this is the default round_mode.
+*   HALF_UP: round towards positive. In this mode 7.5 rounds up to 8 and -7.5
+    rounds up to -7.
+
 END
   }
   summary: "Quantizes then dequantizes a tensor."
@@ -93,7 +106,7 @@ following to each value in the 'input' tensor.
 
 output = round(clamp(value, input_min, input_max) * scale_factor) / scale_factor.
 
-The above round function uses half to even rounding.
+The above round function rounds the value based on the given round_mode.
 
 END
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
index 240c987ddab4cd6ba04655891a258801716dc619..9c40332ea28421e0b6a8ab771f6d19fdaa75a63a 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedGather.pbtxt
@@ -11,8 +11,8 @@ END
   in_arg {
     name: "params_dense_values"
     description: <<END
-The `inner_values` for the `params` RaggedTensor. There was a terminology change
-at the python level from dense_values to inner_values, so dense_values is the
+The `flat_values` for the `params` RaggedTensor. There was a terminology change
+at the python level from dense_values to flat_values, so dense_values is the
 deprecated name.
 END
   }
@@ -32,7 +32,7 @@ END
   }
   out_arg {
     name: "output_dense_values"
-    description: "The `inner_values` for the returned RaggedTensor."
+    description: "The `flat_values` for the returned RaggedTensor."
   }
   attr {
     name: "PARAMS_RAGGED_RANK"
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
index 927e839b72ab0c09318bf58734effe5aab2d7f5a..4a9b2af804483df8eafd3306fc4f68cb9de55f2b 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedRange.pbtxt
@@ -19,7 +19,7 @@ op {
   }
   out_arg{
     name: "rt_dense_values"
-    description: "The `inner_values` for the returned `RaggedTensor`."
+    description: "The `flat_values` for the returned `RaggedTensor`."
   }
   summary: <<END
 Returns a `RaggedTensor` containing the specified sequences of numbers.
diff --git a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
index 8c73ea644c8072a2a3d11f6489976ca34e02b55d..958c71185e4b9f2f876ca66f9cfaeabcbe2050cc 100644
--- a/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RaggedTensorToSparse.pbtxt
@@ -7,7 +7,7 @@ op {
   }
   in_arg {
     name: "rt_dense_values"
-    description: "The `inner_values` for the `RaggedTensor`."
+    description: "The `flat_values` for the `RaggedTensor`."
   }
   out_arg {
     name: "sparse_indices"
diff --git a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
index a9e14b8a052e416dd78f1abdc25c9b024a778107..4ac5050040c22ff6ffc5d0bb7c69453cd9e12f5c 100644
--- a/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RangeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RangeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "start"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
index fc6169cd32f1671000a9cb96209059d062c00db8..b2fcab15384d0cc7354699d15a25bdf8879fbac6 100644
--- a/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_RepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "RepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..8ee16ef1baa86f31dfa78bb75aeea81e4b983972
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,85 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "m"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "v"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "vhat"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "beta1_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2_power"
+    description: <<END
+Must be a scalar.
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta1"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "beta2"
+    description: <<END
+Momentum factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "epsilon"
+    description: <<END
+Ridge term. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var, m, and v tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  summary: "Update \'*var\' according to the Adam algorithm."
+  description: <<END
+$$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+$$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+$$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+$$vhat_t := max{vhat_{t-1}, v_t}$$
+$$variable := variable - lr_t * m_t / (\sqrt{vhat_t} + \epsilon)$$
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..830391a32baa48a358c5cd12d73bfc26b852fe6d
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,56 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Scaling factor. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update \'*var\' according to the momentum scheme. Set use_nesterov = True if you"
+  description: <<END
+want to use Nesterov momentum.
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b10b1bc2a9bb7a28f9f96fdb0328ab23952f7e56
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,64 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  in_arg {
+    name: "var"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "accum"
+    description: <<END
+Should be from a Variable().
+END
+  }
+  in_arg {
+    name: "lr"
+    description: <<END
+Learning rate. Must be a scalar.
+END
+  }
+  in_arg {
+    name: "grad"
+    description: <<END
+The gradient.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+A vector of indices into the first dimension of var and accum.
+END
+  }
+  in_arg {
+    name: "momentum"
+    description: <<END
+Momentum. Must be a scalar.
+END
+  }
+  attr {
+    name: "use_locking"
+    description: <<END
+If `True`, updating of the var and accum tensors will be protected
+by a lock; otherwise the behavior is undefined, but may exhibit less
+contention.
+END
+  }
+  attr {
+    name: "use_nesterov"
+    description: <<END
+If `True`, the tensor passed to compute grad will be
+var + momentum * accum, so in the end, the var you get is actually
+var + momentum * accum.
+END
+  }
+  summary: "Update relevant entries in \'*var\' and \'*accum\' according to the momentum scheme."
+  description: <<END
+Set use_nesterov = True if you want to use Nesterov momentum.
+
+That is for rows we have grad for, we update var and accum as follows:
+
+accum = accum * momentum - lr * grad
+var += accum
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
index 0b5917d428c5a2d8438294760020fa61efbe2b7a..41955cfbfa44a97659df26cfc6abb3a7a8c72582 100644
--- a/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ScatterNd.pbtxt
@@ -32,6 +32,10 @@ slices within a tensor (initially zero for numeric, empty for string) of
 the given `shape` according to indices.  This operator is the inverse of the
 `tf.gather_nd` operator which extracts values or slices from a given tensor.
 
+This operation is similar to tensor_scatter_add, except that the tensor is
+zero-initialized. Calling `tf.scatter_nd(indices, values, shape)` is identical
+to `tensor_scatter_add(tf.zeros(shape, values.dtype), indices, values)`
+
 If `indices` contains duplicates, then their updates are accumulated (summed).
 
 **WARNING**: The order in which updates are applied is nondeterministic, so the
diff --git a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 77123e143b200fc079879bc0e891a771a7cb67e7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,3 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
index fb425b24a4134366df1129df63dc0361537dd746..9ea1cc8babe8832d0553b942901c1c391f1b2709 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleAndRepeatDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleAndRepeatDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
index ea5c52c0ee3826076b855ca243f03cb940b8e0b2..c7f4836a3ad32011f4903973f9400362c795c841 100644
--- a/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ShuffleDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "ShuffleDataset"
+  visibility: HIDDEN
   in_arg {
     name: "buffer_size"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
index 44e5bac79b8cdfb703d8679b66d79ab9e9e7509a..f830049d053b50257d343306c9726adcf10aabd7 100644
--- a/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SkipDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "SkipDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
index ffb805834908103865e5fcb8d98fb080d60a44ab..4203eca73a5f954a3f407f2a5ad9b1193b044ec5 100644
--- a/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_SparseTensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "SparseTensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that splits a SparseTensor into elements row-wise."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index 9b30d64afe18a71fbbe73b397979796b8b844faa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/base_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  summary: "Creates a statistics manager resource."
-}
diff --git a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
index 80f64cebb1bef262146afdadd5c37b0a30277db0..30e425794b358f9a99efae1c116d7b35753f6bff 100644
--- a/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TFRecordDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TFRecordDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
index 8808dc6b1f0d0ae3a0e83f376eab245beaad2de1..eadcb6cd051bc306ba98d8a4318135e1fd7ccfb2 100644
--- a/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TakeDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TakeDataset"
+  visibility: HIDDEN
   in_arg {
     name: "count"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
index 050e174aacb12b415357437e7f989b09faf40621..c086d7420c27055d374b1924148c868cc9d6dfcc 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits `components` as a tuple of tensors once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..fe2ccd9da62db86c2204cad8be7ed0d7588eb47a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestCreateTreeVariable.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestCreateTreeVariable"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be created.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialized proto string of the boosted_trees.Tree.
+END
+  }
+  summary: "Creates a tree resource and returns a handle to it."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..43dbcb7b42d3bc72077292a765fe71d6393286ae
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeDeserialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeDeserialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be restored.
+END
+  }
+  in_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the boosted_trees.Tree proto.
+END
+  }
+  summary: "Deserializes a proto into the tree handle"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9c7a67888e21cbc025750bce66a8b85da5f2519
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeIsInitializedOp.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeIsInitializedOp"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree.
+END
+  }
+  out_arg {
+    name: "is_initialized"
+    description: <<END
+Whether the tree is initialized.
+END
+  }
+  summary: "Checks whether a tree has been initialized."
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8d92702748299dbf38b187f412ad72920374dfb
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreePredict.pbtxt
@@ -0,0 +1,29 @@
+op {
+  graph_op_name: "TensorForestTreePredict"
+  visibility: HIDDEN
+  attr {
+    name: "logits_dimension"
+    description: <<END
+Scalar, dimension of the logits.
+END
+  }
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  in_arg {
+    name: "dense_features"
+    description: <<END
+Rank 2 dense features tensor.
+END
+  }
+  out_arg {
+    name: "logits"
+    description: <<END
+The logits predictions from the tree for each instance in the batch.
+END
+  }
+  summary: "Output the logits for the given input data"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bbf5c51d647ca76e6af49af66c4e732a70d76472
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeResourceHandleOp.pbtxt
@@ -0,0 +1,5 @@
+op {
+  graph_op_name: "TensorForestTreeResourceHandleOp"
+  visibility: HIDDEN
+  summary: "Creates a handle to a TensorForestTreeResource"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aac2afa0f85958012abb336d0c853cc2ad6d2c90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSerialize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSerialize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource to be serialized.
+END
+  }
+  out_arg {
+    name: "tree_config"
+    description: <<END
+Serialied proto string of the tree resource.
+END
+  }
+  summary: "Serializes the tree handle to a proto"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6b85b0ed6cf59bf69d9e48583ad39666aa21d6c5
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorForestTreeSize.pbtxt
@@ -0,0 +1,17 @@
+op {
+  graph_op_name: "TensorForestTreeSize"
+  visibility: HIDDEN
+  in_arg {
+    name: "tree_handle"
+    description: <<END
+Handle to the tree resource.
+END
+  }
+  out_arg {
+    name: "tree_size"
+    description: <<END
+The size of the tree.
+END
+  }
+  summary: "Get the number of nodes in a tree"
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..909c09aa12bd715d4ec6b6d19a9cd6b4b72f804a
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,12 @@
+op {
+  graph_op_name: "TensorListConcat"
+  summary: "Concats all tensors in the list along the 0th dimension."
+  description: <<END
+Requires that all tensors have the same shape except the first dimension.
+
+input_handle: The input list.
+tensor: The concated result.
+lengths: Output tensor containing sizes of the 0th dimension of tensors in the list, used for computing the gradient.
+
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..24156cb8c47fab5af34bff3be3975b7a7959e542
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,13 @@
+op {
+  graph_op_name: "TensorListSplit"
+  summary: "Splits a tensor into a list."
+  description: <<END
+list[i] corresponds to lengths[i] tensors from the input tensor.
+The tensor must have rank at least 1 and contain exactly sum(lengths) elements.
+
+tensor: The input tensor.
+element_shape: A shape compatible with that of elements in the tensor.
+lengths: Vector of sizes of the 0th dimension of tensors in the list.
+output_handle: The list.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1634e51c3cb6f009a2578b145f968af815da988f
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterAdd.pbtxt
@@ -0,0 +1,94 @@
+op {
+  graph_op_name: "TensorScatterAdd"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor copied from tensor and updates added according to the indices.
+END
+  }
+  summary: "Adds sparse `updates` to an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by adding sparse `updates` to the passed
+in `tensor`.
+This operation is very similar to `tf.scatter_nd_add`, except that the updates
+are added onto an existing tensor (as opposed to a variable). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of tensor_scatter_add is to add individual elements to a
+tensor by index. For example, say we want to add 4 elements in a rank-1
+tensor with 8 elements.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_add(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, 12, 1, 11, 10, 1, 1, 13]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_add(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8], [9, 9, 9, 9]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..851628bce155874b164336f56a7e4c6f3a424d90
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterSub.pbtxt
@@ -0,0 +1,94 @@
+op {
+  graph_op_name: "TensorScatterSub"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor copied from tensor and updates subtracted according to the indices.
+END
+  }
+  summary: "Subtracts sparse `updates` from an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by subtracting sparse `updates` from the
+passed in `tensor`.
+This operation is very similar to `tf.scatter_nd_sub`, except that the updates
+are subtracted from an existing tensor (as opposed to a variable). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of tensor_scatter_sub is to subtract individual elements
+from a tensor by index. For example, say we want to insert 4 scattered elements
+in a rank-1 tensor with 8 elements.
+
+In Python, this scatter subtract operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_sub(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, -10, 1, -9, -8, 1, 1, -11]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter add operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_sub(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[-4, -4, -4, -4], [-5, -5, -5, -5], [-6, -6, -6, -6], [-7, -7, -7, -7]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a6ed1e1ce49891e98ec42c8f28f27c27d3669e7
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_TensorScatterUpdate.pbtxt
@@ -0,0 +1,106 @@
+op {
+  graph_op_name: "TensorScatterUpdate"
+  in_arg {
+    name: "tensor"
+    description: <<END
+Tensor to copy/update.
+END
+  }
+  in_arg {
+    name: "indices"
+    description: <<END
+Index tensor.
+END
+  }
+  in_arg {
+    name: "updates"
+    description: <<END
+Updates to scatter into output.
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+A new tensor with the given shape and updates applied according
+to the indices.
+END
+  }
+  summary: "Scatter `updates` into an existing tensor according to `indices`."
+  description: <<END
+This operation creates a new tensor by applying sparse `updates` to the passed
+in `tensor`.
+This operation is very similar to `tf.scatter_nd`, except that the updates are
+scattered onto an existing tensor (as opposed to a zero-tensor). If the memory
+for the existing tensor cannot be re-used, a copy is made and updated.
+
+If `indices` contains duplicates, then their updates are accumulated (summed).
+
+**WARNING**: The order in which updates are applied is nondeterministic, so the
+output will be nondeterministic if `indices` contains duplicates -- because
+of some numerical approximation issues, numbers summed in different order
+may yield different results.
+
+`indices` is an integer tensor containing indices into a new tensor of shape
+`shape`.  The last dimension of `indices` can be at most the rank of `shape`:
+
+    indices.shape[-1] <= shape.rank
+
+The last dimension of `indices` corresponds to indices into elements
+(if `indices.shape[-1] = shape.rank`) or slices
+(if `indices.shape[-1] < shape.rank`) along dimension `indices.shape[-1]` of
+`shape`.  `updates` is a tensor with shape
+
+    indices.shape[:-1] + shape[indices.shape[-1]:]
+
+The simplest form of scatter is to insert individual elements in a tensor by
+index. For example, say we want to insert 4 scattered elements in a rank-1
+tensor with 8 elements.
+
+<div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+<img style="width:100%" src="https://www.tensorflow.org/images/ScatterNd1.png" alt>
+</div>
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[4], [3], [1], [7]])
+    updates = tf.constant([9, 10, 11, 12])
+    tensor = tf.ones([8], dtype=tf.int32)
+    updated = tf.tensor_scatter_update(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [1, 11, 1, 10, 9, 1, 1, 12]
+
+We can also, insert entire slices of a higher rank tensor all at once. For
+example, if we wanted to insert two slices in the first dimension of a
+rank-3 tensor with two matrices of new values.
+
+In Python, this scatter operation would look like this:
+
+```python
+    indices = tf.constant([[0], [2]])
+    updates = tf.constant([[[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]],
+                           [[5, 5, 5, 5], [6, 6, 6, 6],
+                            [7, 7, 7, 7], [8, 8, 8, 8]]])
+    tensor = tf.ones([4, 4, 4])
+    updated = tf.tensor_scatter_update(tensor, indices, updates)
+    with tf.Session() as sess:
+      print(sess.run(scatter))
+```
+
+The resulting tensor would look like this:
+
+    [[[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]],
+     [[5, 5, 5, 5], [6, 6, 6, 6], [7, 7, 7, 7], [8, 8, 8, 8]],
+     [[1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1], [1, 1, 1, 1]]]
+
+Note that on CPU, if an out of bound index is found, an error is returned.
+On GPU, if an out of bound index is found, the index is ignored.
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
index a26a98fd7f3a6564309efd28dff8c2bc93d7a67f..30cb803b26bf836a7b02cc3fb6875175046eab94 100644
--- a/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TensorSliceDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "TensorSliceDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that emits each dim-0 slice of `components` once."
 }
diff --git a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
index 6b630509964ed56ecaf401b10a46c5e53cd46528..31ef3e3335e2812156fc3d1af2c5c1724fa52310 100644
--- a/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_TextLineDataset.pbtxt
@@ -1,5 +1,6 @@
 op {
   graph_op_name: "TextLineDataset"
+  visibility: HIDDEN
   in_arg {
     name: "filenames"
     description: <<END
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..15fc8747af14b4ee139fd5a6781ff6126ab95a64
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeDecodeWithOffsets.pbtxt
@@ -0,0 +1,87 @@
+op {
+  graph_op_name: "UnicodeDecodeWithOffsets"
+  in_arg {
+    name: "input"
+    description: <<END
+The text to be decoded. Can have any shape. Note that the output is flattened
+to a vector of char values.
+END
+  }
+  out_arg {
+    name: "row_splits"
+    description: <<END
+A 1D int32 tensor containing the row splits.
+END
+  }
+  out_arg {
+    name: "char_values"
+    description: <<END
+A 1D int32 Tensor containing the decoded codepoints.
+END
+  }
+  out_arg {
+    name: "char_to_byte_starts"
+    description: <<END
+A 1D int32 Tensor containing the byte index in the input string where each
+character in `char_values` starts.
+END
+  }
+  attr {
+    name: "input_encoding"
+    description: <<END
+Text encoding of the input strings. This is any of the encodings supported
+by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD or U+65533.)
+END
+  }
+  attr {
+    name: "replace_control_characters"
+    description: <<END
+Whether to replace the C0 control characters (00-1F) with the
+`replacement_char`. Default is false.
+END
+  }
+  summary: <<END
+Decodes each string in `input` into a sequence of Unicode code points.
+END
+  description: <<END
+The character codepoints for all strings are returned using a single vector
+`char_values`, with strings expanded to characters in row-major order.
+Similarly, the character start byte offsets are returned using a single vector
+`char_to_byte_starts`, with strings expanded in row-major order.
+
+The `row_splits` tensor indicates where the codepoints and start offsets for
+each input string begin and end within the `char_values` and
+`char_to_byte_starts` tensors.  In particular, the values for the `i`th
+string (in row-major order) are stored in the slice
+`[row_splits[i]:row_splits[i+1]]`. Thus:
+
+* `char_values[row_splits[i]+j]` is the Unicode codepoint for the `j`th
+  character in the `i`th string (in row-major order).
+* `char_to_bytes_starts[row_splits[i]+j]` is the start byte offset for the `j`th
+  character in the `i`th string (in row-major order).
+* `row_splits[i+1] - row_splits[i]` is the number of characters in the `i`th
+  string (in row-major order).
+END
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..26f786586073f10d5ab93a3edaa928e868735878
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnicodeEncode.pbtxt
@@ -0,0 +1,73 @@
+op {
+  graph_op_name: "UnicodeEncode"
+  visibility: HIDDEN
+  endpoint {
+    name: "UnicodeEncode"
+  }
+  in_arg {
+    name: "input_values"
+    description: <<END
+A 1D tensor containing the unicode codepoints that should be encoded.
+END
+  }
+  in_arg {
+    name: "input_splits"
+    description: <<END
+A 1D tensor specifying how the unicode codepoints should be split into strings.
+In particular, `output[i]` is constructed by encoding the codepoints in the
+slice `input_values[input_splits[i]:input_splits[i+1]]`.
+END
+  }
+  attr {
+    name: "output_encoding"
+    description: <<END
+Unicode encoding of the output strings. Valid encodings are: `"UTF-8",
+"UTF-16-BE", and "UTF-32-BE"`.
+END
+  }
+  attr {
+    name: "errors"
+    description: <<END
+Error handling policy when there is invalid formatting found in the input.
+The value of 'strict' will cause the operation to produce a InvalidArgument
+error on any invalid input formatting. A value of 'replace' (the default) will
+cause the operation to replace any invalid formatting in the input with the
+`replacement_char` codepoint. A value of 'ignore' will cause the operation to
+skip any invalid formatting in the input and produce no corresponding output
+character.
+END
+  }
+  attr {
+    name: "replacement_char"
+    description: <<END
+The replacement character codepoint to be used in place of any invalid
+formatting in the input when `errors='replace'`. Any valid unicode codepoint may
+be used. The default value is the default unicode replacement character is
+0xFFFD (U+65533).
+END
+  }
+  out_arg {
+    name: "output"
+    description: <<END
+The 1-D Tensor of strings encoded from the provided unicode codepoints.
+END
+  }
+  summary: "Encode a tensor of ints into unicode strings."
+  description: <<END
+Returns a vector of strings, where `output[i]` is constructed by encoding the
+Unicode codepoints in `input_values[input_splits[i]:input_splits[i+1]]`
+using `output_encoding`.
+
+---
+
+Example:
+
+```
+input_values = [72, 101, 108, 108, 111, 87, 111, 114, 108, 100]
+input_splits = [0, 5, 10]
+output_encoding = 'UTF-8'
+
+output = ['Hello', 'World']
+```
+END
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7b3f88a1dac378b5fd8a3347df90b987d21644a3
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_UnwrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "UnwrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40f5c7a0d212fb74e67ea6dde58bca191a153231
--- /dev/null
+++ b/tensorflow/core/api_def/base_api/api_def_WrapDatasetVariant.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "WrapDatasetVariant"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
index 7495693ccc50fede4a359d13aa710a1fd2fd9402..3c819963590f8f4ca05fd137ee70183c7d688aa2 100644
--- a/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
+++ b/tensorflow/core/api_def/base_api/api_def_ZipDataset.pbtxt
@@ -1,4 +1,5 @@
 op {
   graph_op_name: "ZipDataset"
+  visibility: HIDDEN
   summary: "Creates a dataset that zips together `input_datasets`."
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
deleted file mode 100644
index 4289c1daf96583943b8dfad84aeca3351657bee4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt b/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
index 801dfbc28545da16e573556d65f580007e58e176..94ffc7c068edd961ced8879fde3482076376010f 100644
--- a/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BatchToSpaceND.pbtxt
@@ -1,7 +1,9 @@
 op {
   graph_op_name: "BatchToSpaceND"
+  deprecation_message: "use batch_to_space"
   endpoint {
     name: "batch_to_space_nd"
+    deprecation_version: 2
   }
   endpoint {
     name: "manip.batch_to_space_nd"
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
index 7965af4916e7b8f590bd22452459410075c37cf8..fdbe5282bc136fa7cb59e9e638e6f1952b3ed5ce 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI0e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI0e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i0e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
index dffd296f6d8288356add56f8fbff01bfc4c9213a..3f08cd766d8cb0698c62fbb488ce71ea8018d9e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_BesselI1e.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "BesselI1e"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.bessel_i1e"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
deleted file mode 100644
index fcf541f9036baaef1590f06da0d7471b0558b4c7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_BytesProducedStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "BytesProducedStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
deleted file mode 100644
index 2bbb4ff9e3b08d0dd11c7444e5d00feb514e81c0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_CacheDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "CacheDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
index 33110d8c9ec3ff6e8aa2ba094011b6a5b1339058..cf7a56ec782360076a18aa9ab7959e0de4a20987 100644
--- a/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CheckNumerics.pbtxt
@@ -1,5 +1,7 @@
 op {
   graph_op_name: "CheckNumerics"
+  deprecation_version: 2
+  deprecation_message: "Use debugging.assert_all_finite instead"
   endpoint {
     name: "debugging.check_numerics"
   }
diff --git a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
deleted file mode 100644
index c005a4da0f866c1d1106effabbaa22f1abecf422..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ConcatenateDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ConcatenateDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
index 2ae75d6da222d84245bb2a912942522eb52047bc..1f4bc6d22e3e9aa6e5923bd4fccf6caec322921d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2D"
-  endpoint {
-    name: "nn.conv2d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
index 6f21d8c8802f9a18c9357dbe68d3c65407bff923..1a9d96f3ab184d22ee999f727cb0f8f33e86841d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropFilter.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropFilter"
-  endpoint {
-    name: "nn.conv2d_backprop_filter"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
index ea976799cbc73bc9164a15e781a051f03e14275b..1505a307658786b2c9d68263d7b50e87348d5027 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv2DBackpropInput.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv2DBackpropInput"
-  endpoint {
-    name: "nn.conv2d_backprop_input"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
index ba8d178263c94574c0aaac8f1f24fb1424a50275..cb463dd0d8d725ca4851d93e37d1f6b63e4117c8 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3D.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "Conv3D"
-  endpoint {
-    name: "nn.conv3d"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
index 1da8ee3a25f36a0b44f6458a351854190fe7830f..590b37c95fb2a43e49d5c5ae4dcfe8cc499a4c6d 100644
--- a/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Conv3DBackpropFilterV2.pbtxt
@@ -1,6 +1,10 @@
 op {
   graph_op_name: "Conv3DBackpropFilterV2"
+  endpoint {
+    name: "nn.conv3d_backprop_filter"
+  }
   endpoint {
     name: "nn.conv3d_backprop_filter_v2"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
index ce65f8172ddfea2ae08750cf37bba8e3e012f5f5..2559a6c80b812475ef5b6ca5d0a0cc35bffc4d4b 100644
--- a/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_CropAndResize.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "CropAndResize"
-  endpoint {
-    name: "image.crop_and_resize"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt b/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
deleted file mode 100644
index e3d34cc15be752b466aa03f6805cd687698f74fa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DatasetToSingleElement.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DatasetToSingleElement"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
index fbe9c882538776abb35b7c654ede0fffbfaa078c..2c3857cc539df8cfc9085d0a44628ebbb6a36e34 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeAndCropJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeAndCropJpeg"
-  endpoint {
-    name: "image.decode_and_crop_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
index 573d83f3739a86d00550c519cb19aef452813927..ffe19ca8dc3a91857b6c5473209670c3b0f1240a 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeBmp.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeBmp"
-  endpoint {
-    name: "image.decode_bmp"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
index eed64df79cf7837c1cc0580dd2cb0f06acf289cc..ff68b997e14c043b1d1af8b22ba99607106bb302 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeGif.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeGif"
-  endpoint {
-    name: "image.decode_gif"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
index 994bc4e1f4fd1707579ac2bda4fae5ed327430ab..97d262abe578df1ca357b6288d415ed180df3392 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodeJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodeJpeg"
-  endpoint {
-    name: "image.decode_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
index 309eec5ac368297563af7e6e752921fd270186ef..3b9290a2c5b8ee1e10de6dad1eeafbbe450d99d7 100644
--- a/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DecodePng.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "DecodePng"
-  endpoint {
-    name: "image.decode_png"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
deleted file mode 100644
index 0a8e068afb744ce8b472111d19cf743d39ac44ef..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_DenseToSparseBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "DenseToSparseBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
index 1bb17e548d1cd0ca77d6415b7fa165b1a6b7cae3..e26d029212e3bc421987f6d203b2e6ce5a95c7ac 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNative.pbtxt
@@ -1,6 +1,8 @@
 op {
   graph_op_name: "DepthwiseConv2dNative"
+  deprecation_message: "Use nn.depthwise_conv2d instead"
   endpoint {
     name: "nn.depthwise_conv2d_native"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
index 6f9df4b1a11459c252f2961fb1caacaad64021ae..01c4a50ca6fa31f65feb9d5a65fbf105525772e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropFilter.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropFilter"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_filter"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_filter"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
index 0bd72539e932f597e86f63ef52519652f0e8efd7..f32aa8a69f24db4abc3f8e1aef514ee84d73c23f 100644
--- a/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_DepthwiseConv2dNativeBackpropInput.pbtxt
@@ -2,5 +2,10 @@ op {
   graph_op_name: "DepthwiseConv2dNativeBackpropInput"
   endpoint {
     name: "nn.depthwise_conv2d_native_backprop_input"
+    deprecated: true
+    deprecation_version: 2
+  }
+  endpoint {
+    name: "nn.depthwise_conv2d_backprop_input"
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
index 6d73ecf1bb06895017b2d2ac2a16c702681eb217..1bd83d906152d2e5792fecd5e80e339e0c67e7a5 100644
--- a/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Dilation2D.pbtxt
@@ -2,5 +2,6 @@ op {
   graph_op_name: "Dilation2D"
   endpoint {
     name: "nn.dilation2d"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
index 5c31e9d0f32e6e13ba7d87d8a234e238c048a8b9..054ffb997b3def412f50b12216794d53d3add41c 100644
--- a/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_EncodeJpeg.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "EncodeJpeg"
-  endpoint {
-    name: "image.encode_jpeg"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
deleted file mode 100644
index 051cf14c0ec2b32779be8b9c297b93abd1bc1318..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_EnqueueInQueueDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "EnqueueInQueueDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
index 391167254edb69725c778e6319bf8a9f6038589f..21ae77e9ed71cae895b5e3f62adb2607704b5858 100644
--- a/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Erf.pbtxt
@@ -1,4 +1,10 @@
 op {
   graph_op_name: "Erf"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.erf"
+  }
+  endpoint {
+    name: "erf"
+    deprecation_version: 2
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
index 0bd8b1c11aa15b49f45960abfa43ca1c7e947c49..17921dea4d5e19ef960100a72709a2311da66f3d 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractImagePatches.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "ExtractImagePatches"
-  endpoint {
-    name: "image.extract_image_patches"
-  }
-  endpoint {
-    name: "extract_image_patches"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
index 6849a6d3fa5f37b0d4f92829c8b07754b922a319..a57955c8a74af58cafee4719a86d649efbcb504b 100644
--- a/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ExtractJpegShape.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ExtractJpegShape"
-  endpoint {
-    name: "image.extract_jpeg_shape"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
deleted file mode 100644
index 6f91b842181c769d0a2f921f1d7566c4d8522541..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FilterDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FilterDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
deleted file mode 100644
index d0703471d38c94a8c37da6f0a65ebd165c23a820..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FixedLengthRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt b/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
deleted file mode 100644
index def9f85e02d9d34412ed42d7774d77e8b6a328e0..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FixedLengthRecordDatasetV2.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FixedLengthRecordDatasetV2"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
deleted file mode 100644
index 9de61ac263cd82a0893aa2e27b9d7532490ca441..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_FlatMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "FlatMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
index 26598ab1fb918e251d4c4da7b14810ebf4c44779..efd42b888d21fad6c369ae63182ed8846bf9f0b1 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "FloorDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "floor_div"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
index ef562e93a0dee0a3f24716719cb24232302626dc..e5db6d49b29e46c9f19c43767c16a5e5296304e8 100644
--- a/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FloorMod.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "FloorMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "floormod"
+  }
+  endpoint {
+    name: "mod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
index 16ed9b56f2b662b6cca44f5c955e579c2f9d7971..cbe87777a7fec7557b5153df8cd7689f22aa961e 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalAvgPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalAvgPool"
-  endpoint {
-    name: "nn.fractional_avg_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
index 695559520805abd02e0575f7f85937d00f0dc5fd..02470b43454cdcb44ee624ecab4486fa36caa7da 100644
--- a/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_FractionalMaxPool.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "FractionalMaxPool"
-  endpoint {
-    name: "nn.fractional_max_pool"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
deleted file mode 100644
index 9dcfa0f7d210012aa5c2d43349239a953ea3739e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GeneratorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GeneratorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
deleted file mode 100644
index 8d40208e613e6b7ee1522c2990afea1345cc5de1..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_GroupByWindowDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "GroupByWindowDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
deleted file mode 100644
index ef1b06b19cc6a0c62f6e9f451aceed8aeabed553..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_InterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "InterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
index 91160bd8bfa7760c4529c028df178755d35c49db..ccd736a483ef3e927e270a33639f6f38856312b8 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsFinite.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsFinite"
+  endpoint {
+    name: "math.is_finite"
+  }
   endpoint {
     name: "debugging.is_finite"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_finite"
diff --git a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
index 7f029ee8cf0c7cd85a2bf75f9302469dd8174deb..3cbfb7317c1383db74317080d1dfe93628aab3b4 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsInf.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsInf"
+  endpoint {
+    name: "math.is_inf"
+  }
   endpoint {
     name: "debugging.is_inf"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_inf"
diff --git a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
index f2b8862c28d4968289f5d0c6a2a85d9cf632487d..b01536664e5111217c7d1e5fb415c8e791cbaa34 100644
--- a/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_IsNan.pbtxt
@@ -1,7 +1,11 @@
 op {
   graph_op_name: "IsNan"
+  endpoint {
+    name: "math.is_nan"
+  }
   endpoint {
     name: "debugging.is_nan"
+    deprecation_version: 2
   }
   endpoint {
     name: "is_nan"
diff --git a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
deleted file mode 100644
index 94bf6106ad8459767d31a345a17483b255dfc02b..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_LatencyStatsDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "LatencyStatsDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
index b1de2cb207d221593b41d82d43b759e49d411710..3835661be57ce0ca829db231fcd1d5b0bec8215a 100644
--- a/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_LinSpace.pbtxt
@@ -2,6 +2,7 @@ op {
   graph_op_name: "LinSpace"
   endpoint {
     name: "lin_space"
+    deprecation_version: 2
   }
   endpoint {
     name: "linspace"
diff --git a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
index ac4a4454c7479114de48cd60c8968b24e9680cd2..b6d2da6d32a270f5e99bf551fc786e7a98b54cf7 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log.pbtxt
@@ -5,5 +5,6 @@ op {
   }
   endpoint {
     name: "log"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
index 5a2d77a4176bb066424aa763345725f93f48cd46..e3da451de3fc7ceb544779a0a4445620f0d0af1f 100644
--- a/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Log1p.pbtxt
@@ -5,5 +5,6 @@ op {
   }
   endpoint {
     name: "log1p"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c0b6b53da50e474c3bfe2065a607a19baf06bc80
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_Lu.pbtxt
@@ -0,0 +1,6 @@
+op {
+  graph_op_name: "Lu"
+  endpoint {
+    name: "linalg.lu"
+  }
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
deleted file mode 100644
index cffd2910fb404bc7f75e55e42b9ebba1635db134..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapAndBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapAndBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
deleted file mode 100644
index 0b1d2f2c730ff8b8b928fcd97c4fe3bdc704e470..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_MapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "MapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
index 7d8abca5f1ad76df62e78f9d7228b586dce31bf6..13a1a0b5df4d73884d267777ccf5ad6a44fcdbd4 100644
--- a/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_MaxPoolWithArgmax.pbtxt
@@ -2,5 +2,6 @@ op {
   graph_op_name: "MaxPoolWithArgmax"
   endpoint {
     name: "nn.max_pool_with_argmax"
+    deprecation_version: 2
   }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
index 0e2bb9b950d933f2e73272b403fba2c29110b3cb..ac166561ee9b1ab5fcee6fad776971172b0ee5ba 100644
--- a/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Neg.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Neg"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.negative"
+  }
+  endpoint {
+    name: "negative"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
deleted file mode 100644
index c6223b3132ed0d6878995d3c5e657275fac0cc4f..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
deleted file mode 100644
index 93cd5719feb613cd3de2e422e23cc3d690bdef08..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelInterleaveDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelInterleaveDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
deleted file mode 100644
index 09d200dd24c828af85d1505bb17086dbfa688ee8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParallelMapDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParallelMapDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
deleted file mode 100644
index 45826b6fdcc582ac7fd84d45b079b7f4994bc370..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ParseExampleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ParseExampleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
deleted file mode 100644
index ec4e214eb5e082c8f732cbef9db69524c48d80a4..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrefetchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrefetchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
deleted file mode 100644
index 228c4047d2e0b7ddfec1d8cd4fad478aa6c4c1a7..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_PrependFromQueueAndPaddedBatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "PrependFromQueueAndPaddedBatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
index dfa793a16e18ab30891bcb9a997d7bed02410e54..6aceba3b1188919d4b0318f560ed32921e823343 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedAvgPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedAvgPool"
   endpoint {
     name: "nn.quantized_avg_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
index 2409d12abeff922cca92f9ae609764a27f651356..4b5a04f45ef014ad328fea26e613f227d1821e71 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedConv2D.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedConv2D"
   endpoint {
     name: "nn.quantized_conv2d"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
index 3a58590f5773a3d886ace95108ee63a659362de2..cd1c7fdbf22ec746a080566b20daa7b100e5cb65 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedMaxPool.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedMaxPool"
   endpoint {
     name: "nn.quantized_max_pool"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
index 926ec98eeb468e7fa4846ae013a112cc865bb82c..d83d71c65cabf7a00d65c9dc87c6465f7c1ae9f5 100644
--- a/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_QuantizedReluX.pbtxt
@@ -2,5 +2,7 @@ op {
   graph_op_name: "QuantizedReluX"
   endpoint {
     name: "nn.quantized_relu_x"
+    deprecation_version: 2
   }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
deleted file mode 100644
index a5f6f8c6f1db344c480e2bd452362d977dc15000..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RandomDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RandomDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
deleted file mode 100644
index 4cd8296b2233ac58c12e6573d2194f7d976d9137..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RangeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RangeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
index bd87eef8240532c158b7604d8c5576e6d0b8b24b..f9e01eb56744cefddb41bad1a54d539ab3e0c548 100644
--- a/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_RealDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "RealDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "realdiv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
deleted file mode 100644
index be301da8386af0fbd98c9b02d2cfc0fe79178990..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_RepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "RepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
index 2f1b4aee00d90221d659daa34a7eb3462f42fa0c..e1a1f883d8ba6850f429ca5ebc8ab89789a2df90 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeArea.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeArea"
-  endpoint {
-    name: "image.resize_area"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
index 3ec8e0ad6359307eab1b166801474817d8c5282b..e0bec8c116db961f873e1aa961d32d9422311696 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBicubic.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBicubic"
-  endpoint {
-    name: "image.resize_bicubic"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
index eb3b8d6f458fff6163932457ef6c73a8fbbd721e..6121c1128c9060914723beb9d056d51a212b54bc 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeBilinear.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeBilinear"
-  endpoint {
-    name: "image.resize_bilinear"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
index 25c5d5701feefd6f8270236f29e1c187fa3cf06a..0e86e4ce3ea33515947eae08705d5ea6c6860faa 100644
--- a/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_ResizeNearestNeighbor.pbtxt
@@ -1,6 +1,4 @@
 op {
   graph_op_name: "ResizeNearestNeighbor"
-  endpoint {
-    name: "image.resize_nearest_neighbor"
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1eef1b69b979bfeaaaaec81f47a6e62c8ecd8284
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyAdamWithAmsgrad.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyAdamWithAmsgrad"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1c39242b3101449ed08c7b132502f7a9eea1228e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..180793521352a3d9ba3b75b709c3f9d2d37c8f93
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_ResourceSparseApplyKerasMomentum.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "ResourceSparseApplyKerasMomentum"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
deleted file mode 100644
index e71b655c22fbcbf1524433fc65a392e4d80c5c43..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ScanDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ScanDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
deleted file mode 100644
index 3a8c1036ca34233b245a92110dc6e81ac348942d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SetStatsAggregatorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SetStatsAggregatorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
deleted file mode 100644
index 7b0d2994f0711f440fb6623aa2322c86bd3859f8..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleAndRepeatDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleAndRepeatDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
deleted file mode 100644
index 8f0be9197adeb23b2d5047c5d69916df0e2c1eda..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ShuffleDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ShuffleDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
index c2ee91dd12ed16ba27a9c4ae45b48194bc5a8b03..fb427cdb191d4976cf50d214e7f58695e7c41490 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sign.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sign"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sign"
+  }
+  endpoint {
+    name: "sign"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
deleted file mode 100644
index 96a551c5b6669a8d019e3c705507aba768ab9d21..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SkipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SkipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
deleted file mode 100644
index 867116c5da718f66205132d70a93c39464096df6..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SlideDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SlideDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
deleted file mode 100644
index 19c0c7f199dfd24d24a56c3766733f9e55957c12..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SparseTensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SparseTensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
deleted file mode 100644
index 2ab4c3e441dd51f50a2796ef9d6fa0d21b727ffa..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_SqlDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "SqlDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
index 59e2dfe8366813242337c9490d74ca317e525636..16a4d9a7bcc0058aa0baf46ed0b932d4c26a23e2 100644
--- a/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Sqrt.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Sqrt"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.sqrt"
+  }
+  endpoint {
+    name: "sqrt"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
index 7b39ae25fa062b4271dcc2aee6523847c97b1e4d..0bd2f1bf41b80b1a21d50a9b9f437da33e36584c 100644
--- a/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Square.pbtxt
@@ -1,4 +1,9 @@
 op {
   graph_op_name: "Square"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.square"
+  }
+  endpoint {
+    name: "square"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
deleted file mode 100644
index f7bed36602f40602313157c20677acbbf592d7be..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorHandle.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorHandle"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt b/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
deleted file mode 100644
index 8b1bab2440f1934f1fd0194b76b7907fb0fb142d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_StatsAggregatorSummary.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "StatsAggregatorSummary"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
index cf0b8831ef14bff33a0bfb3e7e63d9c01a7a54b1..dc4493c841062adfbdfccdc501ecdfd6228e7aae 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToHashBucket.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "StringToHashBucket"
-  endpoint {
-    name: "strings.to_hash_bucket"
-  }
-  endpoint {
-    name: "string_to_hash_bucket"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt b/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
index 155dd2675037b0a16f5878c1a47b389921772e75..9c89d02fb762c50eb2379d35b3d238797caa41ef 100644
--- a/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_StringToNumber.pbtxt
@@ -1,10 +1,4 @@
 op {
   graph_op_name: "StringToNumber"
-  endpoint {
-    name: "strings.to_number"
-  }
-  endpoint {
-    name: "string_to_number"
-    deprecation_version: 2
-  }
+  visibility: HIDDEN
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
deleted file mode 100644
index 3c270ada3c219b03715e0cd651a4b56fe5ebc227..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TFRecordDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TFRecordDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
deleted file mode 100644
index 711b335dc1926d32071637b3c986727c339736a3..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TakeDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TakeDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
index c946e0a794a77fe6f40613824e6d614e9667ccf9..80d11d27853d89b17fc86fca4fc9219452cd1aca 100644
--- a/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_Tanh.pbtxt
@@ -1,4 +1,12 @@
 op {
   graph_op_name: "Tanh"
-  visibility: HIDDEN
+  endpoint {
+    name: "math.tanh"
+  }
+  endpoint {
+    name: "nn.tanh"
+  }
+  endpoint {
+    name: "tanh"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
deleted file mode 100644
index 5bc3920c56360f2348805db1db79ab2b630f379d..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..c7b6fd106ce304f1e75913614c54f12a3efe5e38
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListConcat.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListConcat"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
similarity index 100%
rename from tensorflow/core/api_def/python_api/api_defTensorListPushBackBatch.pbtxt
rename to tensorflow/core/api_def/python_api/api_def_TensorListPushBackBatch.pbtxt
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..091297db07174a3925ed2a09b879d013580b606e
--- /dev/null
+++ b/tensorflow/core/api_def/python_api/api_def_TensorListSplit.pbtxt
@@ -0,0 +1,4 @@
+op {
+  graph_op_name: "TensorListSplit"
+  visibility: HIDDEN
+}
diff --git a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
deleted file mode 100644
index 89ad016483fa392a302915d588d32201237c717a..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TensorSliceDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TensorSliceDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
deleted file mode 100644
index 08d785191b6a4bddce2ac43fd4c0188b4d74548e..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_TextLineDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "TextLineDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
index 2a547f771cfb3d4f3d9496ea24196e1a8a1f1879..8e46c5e663a3fca40a6c2e4890a6ab9388645ad9 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateDiv.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateDiv"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatediv"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
index 0731e8810e25cad2cca02522aba55d032b1765b2..97fb816a7ad395a4ad67d0296d87cf6264c76ac2 100644
--- a/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
+++ b/tensorflow/core/api_def/python_api/api_def_TruncateMod.pbtxt
@@ -1,4 +1,6 @@
 op {
   graph_op_name: "TruncateMod"
-  visibility: HIDDEN
+  endpoint {
+    name: "truncatemod"
+  }
 }
diff --git a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
deleted file mode 100644
index 1e5415749f0d3abad8f6f5c632a0bc59b11e8de2..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_UnbatchDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "UnbatchDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt b/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
deleted file mode 100644
index dd1459521ff70fc4b3adce7fbb1251b45106b439..0000000000000000000000000000000000000000
--- a/tensorflow/core/api_def/python_api/api_def_ZipDataset.pbtxt
+++ /dev/null
@@ -1,4 +0,0 @@
-op {
-  graph_op_name: "ZipDataset"
-  visibility: HIDDEN
-}
diff --git a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
index 822d0065b6713dbc6692ed11b7a938a784b0d597..c4bc1a684cb3ffaa30cdaece041fc51c266a3782 100644
--- a/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
+++ b/tensorflow/core/common_runtime/accumulate_n_optimizer.cc
@@ -74,8 +74,7 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
 
   Status rewriteNode(Node* n, Graph* g) {
     AttrSlice n_attrs = n->attrs();
-    auto base_make_node = [n, g, &n_attrs](const string& op,
-                                           const string& name) {
+    auto base_make_node = [n, &n_attrs](const string& op, const string& name) {
       NodeBuilder node_builder(name, op);
 
       // The pieces of AccumulateNV2 should all be on the same node.
@@ -86,7 +85,7 @@ class AccumulateNV2RemovePass : public GraphOptimizationPass {
       }
       return node_builder;
     };
-    auto make_node = [n, g, &n_attrs, &base_make_node](string op) {
+    auto make_node = [n, g, &base_make_node](string op) {
       return base_make_node(
           op, g->NewName(strings::StrCat(n->name(), "/Internal")));
     };
diff --git a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
index 91994c57311f95a669949a38c161f7d3acf5f54d..f3d86aa633938042b862613162d1c2a94b0fe35a 100644
--- a/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
+++ b/tensorflow/core/common_runtime/collective_executor_mgr_test.cc
@@ -38,8 +38,9 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
     auto* device_count = options.config.mutable_device_count();
     string task_name = "/job:localhost/replica:0/task:0";
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     std::unique_ptr<DeviceResolverInterface> drl(
         new DeviceResolverLocal(device_mgr_.get()));
     std::unique_ptr<ParamResolverInterface> prl(
@@ -50,7 +51,6 @@ class CollectiveExecutorMgrTest : public ::testing::Test {
   }
 
   std::unique_ptr<CollectiveExecutorMgr> cme_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
 };
 
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local.cc b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
index 3a03b6724c15c38164905c1be65edb1668332bd4..a8e3f4c881afc9c37ce4b5196c32ec591be5506d 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local.cc
@@ -511,7 +511,7 @@ void CollectiveParamResolverLocal::FindInstanceRec(
         if (irec->is_init) {
           exit_outside_locks = true;
         } else {
-          irec->init_waiters.push_back([this, gr, cp, done](InstanceRec* irec) {
+          irec->init_waiters.push_back([this, done](InstanceRec* irec) {
             CallbackWithStatus(done, irec);
           });
           return;
@@ -696,7 +696,7 @@ void CollectiveParamResolverLocal::CompleteInstanceSource(InstanceRec* ir,
         if (ir->source_rank >= 0) {
           ir->status = errors::Internal("Instance ", cp->instance.instance_key,
                                         " already has source ", ir->source_rank,
-                                        ", recevied second claim from ",
+                                        ", received second claim from ",
                                         cp->default_rank);
         } else {
           ir->source_rank = cp->default_rank;
diff --git a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
index 9a501b329818938f8fde828d73daecb8a0a46b5e..94d889c40dff89204ccfc43478f8732815a4ead4 100644
--- a/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_param_resolver_local_test.cc
@@ -37,8 +37,9 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     string task_name = "/job:localhost/replica:0/task:0";
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
     prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
                                                 task_name));
@@ -73,7 +74,6 @@ class CollectiveParamResolverLocalTest : public ::testing::Test {
     }
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
   std::unique_ptr<CollectiveParamResolverLocal> prl_;
diff --git a/tensorflow/core/common_runtime/collective_rma_local.cc b/tensorflow/core/common_runtime/collective_rma_local.cc
index 288ae9d794a2547d7837e1311e71c4681236704a..d99565b49abde95ca2fa28293771970b19620dd5 100644
--- a/tensorflow/core/common_runtime/collective_rma_local.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local.cc
@@ -38,7 +38,7 @@ void CollectiveRemoteAccessLocal::RecvFromPeer(
     return;
   }
   buf_rendezvous_.ConsumeBuf(
-      key, [this, to_tensor, to_device_ctx, to_device, to_alloc_attr,
+      key, [to_tensor, to_device_ctx, to_device, to_alloc_attr,
             dev_to_dev_stream_index,
             done](const Status& s, BufRendezvous::Hook* hook) {
         if (!s.ok()) {
diff --git a/tensorflow/core/common_runtime/collective_rma_local_test.cc b/tensorflow/core/common_runtime/collective_rma_local_test.cc
index a931fe64bd13c57e2b9d55c5c1bf46862b3cb524..4263f3a4add524bf59e7c08cfb5d927ac9e23e06 100644
--- a/tensorflow/core/common_runtime/collective_rma_local_test.cc
+++ b/tensorflow/core/common_runtime/collective_rma_local_test.cc
@@ -42,8 +42,9 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, kTaskName, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
     prl_.reset(new CollectiveParamResolverLocal(device_mgr_.get(), drl_.get(),
                                                 kTaskName));
@@ -51,7 +52,6 @@ class CollectiveRemoteAccessLocalTest : public ::testing::Test {
                                                kStepId));
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
   std::unique_ptr<CollectiveParamResolverLocal> prl_;
diff --git a/tensorflow/core/common_runtime/constant_folding.cc b/tensorflow/core/common_runtime/constant_folding.cc
index 6d5c7f951e36d9d1fa728a5ad5463dad59a550ac..5c226ec56e13fbb398d852ff6287910d2347785e 100644
--- a/tensorflow/core/common_runtime/constant_folding.cc
+++ b/tensorflow/core/common_runtime/constant_folding.cc
@@ -471,10 +471,10 @@ bool ReplaceTensorWithConstant(
   // Be conservative when replacing a tensor with a constant, when not
   // running on CPU.
   // 1) Do not replace another constant.
-  // 2) If the destination tensor is not an int32 tensor, and has HOST_MEMORY
-  // constraint, do not replace it.
-  // 3) If the destination tensor is an int32 tensor, and has DEVICE_MEMORY
-  // constraint, do not replace it.
+  // 2) If the destination tensor or any other tensor from the same node is not
+  // an int32 tensor, and has HOST_MEMORY constraint, do not replace it.
+  // 3) If the destination tensor or any other tensor from the same node is an
+  // int32 tensor, and has DEVICE_MEMORY constraint, do not replace it.
   // 4) If the size of the constant in bytes is too large (>
   // max_constant_in_bytes), do not replace it. This prevents the size of the
   // Graph from growing too large.
@@ -490,16 +490,20 @@ bool ReplaceTensorWithConstant(
                                ? DeviceType{partition_device->device_type()}
                                : DEVICE_CPU;
   if (partition_device && device_type != DEVICE_CPU) {
-    MemoryType memory_type;
-    if (!MemoryTypeForOutput(device_type, graph, tensor.first, tensor.second,
-                             &memory_type)
+    MemoryTypeVector input_mvec;
+    MemoryTypeVector output_mvec;
+    if (!MemoryTypesForNode(graph->op_registry(), device_type,
+                            tensor.first->def(), &input_mvec, &output_mvec)
              .ok()) {
       return false;
     }
-    bool is_int32 = tensor.first->output_type(tensor.second) == DT_INT32;
-    if ((memory_type == HOST_MEMORY && !is_int32) ||
-        (memory_type == DEVICE_MEMORY && is_int32)) {
-      return false;
+    for (int i = 0; i < output_mvec.size(); i++) {
+      MemoryType memory_type = output_mvec[i];
+      bool is_int32 = tensor.first->output_type(i) == DT_INT32;
+      if ((memory_type == HOST_MEMORY && !is_int32) ||
+          (memory_type == DEVICE_MEMORY && is_int32)) {
+        return false;
+      }
     }
   }
   if (constant.TotalBytes() > max_constant_size_in_bytes) {
diff --git a/tensorflow/core/common_runtime/constant_folding_test.cc b/tensorflow/core/common_runtime/constant_folding_test.cc
index 98aefcde27fc9589b09cfb8af6a1e8734e13af24..1d4586f3da84f0beabe440dca51105826feb197c 100644
--- a/tensorflow/core/common_runtime/constant_folding_test.cc
+++ b/tensorflow/core/common_runtime/constant_folding_test.cc
@@ -18,13 +18,16 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/cc/ops/nn_ops.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
 
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/sendrecv_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/framework/device_attributes.pb.h"
 #include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def_util.h"
 #include "tensorflow/core/framework/tensor.h"
@@ -90,6 +93,24 @@ class ConstantFoldingTest : public ::testing::Test {
   }
 };
 
+class FakeDevice : public Device {
+ private:
+  explicit FakeDevice(const DeviceAttributes& device_attributes)
+      : Device(nullptr, device_attributes) {}
+
+ public:
+  Status Sync() override { return errors::Unimplemented("FakeDevice::Sync()"); }
+
+  Allocator* GetAllocator(AllocatorAttributes attr) override { return nullptr; }
+
+  static std::unique_ptr<Device> Make(const string& name, const string& type) {
+    DeviceAttributes device_attributes;
+    device_attributes.set_name(name);
+    device_attributes.set_device_type(DeviceType(type).type());
+    return std::unique_ptr<Device>(new FakeDevice(device_attributes));
+  }
+};
+
 TEST_F(ConstantFoldingTest, Basic) {
   Scope s = Scope::NewRootScope();
   BuildSimpleGraph(&s);
@@ -610,6 +631,31 @@ TEST_F(ConstantFoldingTest, ConstShapeKnown) {
   }
 }
 
+TEST_F(ConstantFoldingTest, NoReplacePartialOutput) {
+  Graph g(OpRegistry::Global());
+  {
+    Scope s = Scope::NewRootScope().ExitOnError().WithAssignedDevice("/gpu:0");
+
+    auto c0 = ops::Const<float>(s.WithOpName("c0"), {5.0, 2.0, 8.0, 1.0}, {4});
+    auto k = ops::Const<int>(s.WithOpName("k"), 3);
+    auto topK =
+        ops::TopK(s.WithOpName("topK"), c0, k, ops::TopK::Sorted(false));
+    auto send_values = ops::_Send(s.WithOpName("send_values"), topK.values,
+                                  "send_values", "sender", 0, "receiver");
+    auto send_indices = ops::_Send(s.WithOpName("send_indices"), topK.indices,
+                                   "send_indices", "sender", 0, "receiver");
+    TF_ASSERT_OK(s.ToGraph(&g));
+  }
+  bool was_mutated;
+  TF_EXPECT_OK(ConstantFold(
+      ConstantFoldingOptions{}, nullptr, Env::Default(),
+      FakeDevice::Make("/job:tpu_worker/replica:0/task:0/device:GPU:0",
+                       DEVICE_GPU)
+          .get(),
+      &g, &was_mutated));
+  EXPECT_FALSE(was_mutated);
+}
+
 namespace {
 
 const char kTestMemRegionName[] = "test://test";
diff --git a/tensorflow/core/common_runtime/device.cc b/tensorflow/core/common_runtime/device.cc
index 8fc64fff69a6252ed9860f8dcb75814cfd0785ff..9925814a48acf19162a39f07666a909db56e39e4 100644
--- a/tensorflow/core/common_runtime/device.cc
+++ b/tensorflow/core/common_runtime/device.cc
@@ -36,6 +36,8 @@ Device::~Device() {
   }
 }
 
+void Device::Sync(const DoneCallback& done) { done(Sync()); }
+
 // static
 DeviceAttributes Device::BuildDeviceAttributes(
     const string& name, DeviceType device, Bytes memory_limit,
diff --git a/tensorflow/core/common_runtime/device.h b/tensorflow/core/common_runtime/device.h
index 2ef1547cd9a56de0750eac1583568a06720acb99..8dfbb21eda641ff9f70c58f1f4bf150ba4cceef3 100644
--- a/tensorflow/core/common_runtime/device.h
+++ b/tensorflow/core/common_runtime/device.h
@@ -55,6 +55,9 @@ class DeviceMgr;
 
 class Device : public DeviceBase {
  public:
+  // Callback type that takes a Status and returns void.
+  typedef std::function<void(const Status&)> DoneCallback;
+
   Device(Env* env, const DeviceAttributes& device_attributes);
   ~Device() override;
 
@@ -112,6 +115,13 @@ class Device : public DeviceBase {
   // at completion.
   virtual Status Sync() = 0;
 
+  // Calls the given callback when all operations queued on the device at the
+  // time of the call have completed. The callback is passed any error pending
+  // on the device at completion.
+  // TODO(b/112409994): Consolidate these two APIs, removing the synchronous
+  // version.
+  virtual void Sync(const DoneCallback& done);
+
   // Override this to return true for devices that require a Sync() call before
   // session completion.
   virtual bool RequiresSyncOnCompletion() const { return false; }
diff --git a/tensorflow/core/common_runtime/device_factory.cc b/tensorflow/core/common_runtime/device_factory.cc
index b43c718817558f0e44eff5f5e5d5ec3a81d25ddd..0fad13fe1e747e219c40c5262877dce6a7544b8a 100644
--- a/tensorflow/core/common_runtime/device_factory.cc
+++ b/tensorflow/core/common_runtime/device_factory.cc
@@ -20,6 +20,7 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/platform/logging.h"
@@ -89,9 +90,9 @@ DeviceFactory* DeviceFactory::GetFactory(const string& device_type) {
   return it->second.factory.get();
 }
 
-Status DeviceFactory::AddDevices(const SessionOptions& options,
-                                 const string& name_prefix,
-                                 std::vector<Device*>* devices) {
+Status DeviceFactory::AddDevices(
+    const SessionOptions& options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   // CPU first. A CPU device is required.
   auto cpu_factory = GetFactory("CPU");
   if (!cpu_factory) {
@@ -116,19 +117,24 @@ Status DeviceFactory::AddDevices(const SessionOptions& options,
   return Status::OK();
 }
 
-Device* DeviceFactory::NewDevice(const string& type,
-                                 const SessionOptions& options,
-                                 const string& name_prefix) {
+std::unique_ptr<Device> DeviceFactory::NewDevice(const string& type,
+                                                 const SessionOptions& options,
+                                                 const string& name_prefix) {
   auto device_factory = GetFactory(type);
   if (!device_factory) {
     return nullptr;
   }
   SessionOptions opt = options;
   (*opt.config.mutable_device_count())[type] = 1;
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(device_factory->CreateDevices(opt, name_prefix, &devices));
-  CHECK_EQ(devices.size(), size_t{1});
-  return devices[0];
+  int expected_num_devices = 1;
+  auto iter = options.config.device_count().find(type);
+  if (iter != options.config.device_count().end()) {
+    expected_num_devices = iter->second;
+  }
+  DCHECK_EQ(devices.size(), static_cast<size_t>(expected_num_devices));
+  return std::move(devices[0]);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/device_factory.h b/tensorflow/core/common_runtime/device_factory.h
index db50226fe895963778eafe8a49289889eae16b1f..b3cd7adca9c638d43400cfa04ec63db1437ed62c 100644
--- a/tensorflow/core/common_runtime/device_factory.h
+++ b/tensorflow/core/common_runtime/device_factory.h
@@ -40,18 +40,19 @@ class DeviceFactory {
   // CPU devices are added first.
   static Status AddDevices(const SessionOptions& options,
                            const string& name_prefix,
-                           std::vector<Device*>* devices);
+                           std::vector<std::unique_ptr<Device>>* devices);
 
   // Helper for tests.  Create a single device of type "type".  The
   // returned device is always numbered zero, so if creating multiple
   // devices of the same type, supply distinct name_prefix arguments.
-  static Device* NewDevice(const string& type, const SessionOptions& options,
-                           const string& name_prefix);
+  static std::unique_ptr<Device> NewDevice(const string& type,
+                                           const SessionOptions& options,
+                                           const string& name_prefix);
 
   // Most clients should call AddDevices() instead.
-  virtual Status CreateDevices(const SessionOptions& options,
-                               const string& name_prefix,
-                               std::vector<Device*>* devices) = 0;
+  virtual Status CreateDevices(
+      const SessionOptions& options, const string& name_prefix,
+      std::vector<std::unique_ptr<Device>>* devices) = 0;
 
   // Return the device priority number for a "device_type" string.
   //
diff --git a/tensorflow/core/common_runtime/device_mgr.cc b/tensorflow/core/common_runtime/device_mgr.cc
index 470abc1431292820dec747110a60c08246470c3c..1f7d7c4699872e55a73ebab919936435684405fe 100644
--- a/tensorflow/core/common_runtime/device_mgr.cc
+++ b/tensorflow/core/common_runtime/device_mgr.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device_mgr.h"
 
+#include <memory>
 #include <vector>
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/framework/device_attributes.pb.h"
@@ -24,32 +25,32 @@ limitations under the License.
 
 namespace tensorflow {
 
-DeviceMgr::DeviceMgr(const std::vector<Device*>& devices)
-    : name_backing_store_(128) {
-  for (Device* d : devices) {
+DeviceMgr::DeviceMgr(std::vector<std::unique_ptr<Device>> devices)
+    : devices_(std::move(devices)), name_backing_store_(128) {
+  for (auto& d : devices_) {
     CHECK(d->device_mgr_ == nullptr);
     d->device_mgr_ = this;
 
-    devices_.push_back(d);
-
     // Register under the (1) full name and (2) canonical name.
     for (const string& name :
          DeviceNameUtils::GetNamesForDeviceMappings(d->parsed_name())) {
-      device_map_[CopyToBackingStore(name)] = d;
+      device_map_[CopyToBackingStore(name)] = d.get();
     }
     // Register under the (3) local name and (4) legacy local name.
     for (const string& name :
          DeviceNameUtils::GetLocalNamesForDeviceMappings(d->parsed_name())) {
-      device_map_[CopyToBackingStore(name)] = d;
+      device_map_[CopyToBackingStore(name)] = d.get();
     }
     device_type_counts_[d->device_type()]++;
   }
 }
 
-DeviceMgr::~DeviceMgr() {
-  // TODO(b/37437134): Remove destructor after converting to std::unique_ptr.
-  for (Device* p : devices_) delete p;
-}
+DeviceMgr::DeviceMgr(std::unique_ptr<Device> device)
+    : DeviceMgr([&device] {
+        std::vector<std::unique_ptr<Device>> vector;
+        vector.push_back(std::move(device));
+        return vector;
+      }()) {}
 
 StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
   size_t n = s.size();
@@ -61,18 +62,22 @@ StringPiece DeviceMgr::CopyToBackingStore(StringPiece s) {
 void DeviceMgr::ListDeviceAttributes(
     std::vector<DeviceAttributes>* devices) const {
   devices->reserve(devices_.size());
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     devices->emplace_back(dev->attributes());
   }
 }
 
 std::vector<Device*> DeviceMgr::ListDevices() const {
-  return std::vector<Device*>(devices_.begin(), devices_.end());
+  std::vector<Device*> devices(devices_.size());
+  for (size_t i = 0; i < devices_.size(); ++i) {
+    devices[i] = devices_[i].get();
+  }
+  return devices;
 }
 
 string DeviceMgr::DebugString() const {
   string out;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     strings::StrAppend(&out, dev->name(), "\n");
   }
   return out;
@@ -80,7 +85,7 @@ string DeviceMgr::DebugString() const {
 
 string DeviceMgr::DeviceMappingString() const {
   string out;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     if (!dev->attributes().physical_device_desc().empty()) {
       strings::StrAppend(&out, dev->name(), " -> ",
                          dev->attributes().physical_device_desc(), "\n");
@@ -107,7 +112,7 @@ Status DeviceMgr::LookupDevice(StringPiece name, Device** device) const {
 
 void DeviceMgr::ClearContainers(gtl::ArraySlice<string> containers) const {
   Status s;
-  for (Device* dev : devices_) {
+  for (const auto& dev : devices_) {
     if (containers.empty()) {
       s.Update(dev->resource_manager()->Cleanup(
           dev->resource_manager()->default_container()));
diff --git a/tensorflow/core/common_runtime/device_mgr.h b/tensorflow/core/common_runtime/device_mgr.h
index c1ff10d9b59cbba59bb89c7585a3b1c27111aaf6..bf8694655ae06fab590e4111488e3212e3e87ef7 100644
--- a/tensorflow/core/common_runtime/device_mgr.h
+++ b/tensorflow/core/common_runtime/device_mgr.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_DEVICE_MGR_H_
 
+#include <memory>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -34,15 +35,17 @@ class DeviceAttributes;
 
 class DeviceMgr {
  public:
-  // Takes ownership of each device in 'devices'.
+  // Constructs a DeviceMgr from a list of devices.
   // TODO(zhifengc): Other initialization information.
-  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
-  explicit DeviceMgr(const std::vector<Device*>& devices);
-  ~DeviceMgr();
+  explicit DeviceMgr(std::vector<std::unique_ptr<Device>> devices);
+
+  // Constructs a DeviceMgr managing a single device.
+  explicit DeviceMgr(std::unique_ptr<Device> device);
 
   // Returns attributes of all devices.
   void ListDeviceAttributes(std::vector<DeviceAttributes>* devices) const;
 
+  // Returns raw pointers to the underlying devices.
   std::vector<Device*> ListDevices() const;
 
   // Returns a string listing all devices.
@@ -62,9 +65,7 @@ class DeviceMgr {
   int NumDeviceType(const string& type) const;
 
  private:
-  // TODO(b/37437134): Use std::unique_ptr's to track ownership.
-  typedef gtl::InlinedVector<Device*, 8> DeviceVec;
-  DeviceVec devices_;
+  const std::vector<std::unique_ptr<Device>> devices_;
 
   StringPiece CopyToBackingStore(StringPiece s);
 
diff --git a/tensorflow/core/common_runtime/device_resolver_local_test.cc b/tensorflow/core/common_runtime/device_resolver_local_test.cc
index f5a6471ff731578d377ccfc9ad146847ae3f221c..54f1119e139886096cb7c2007e584003992d86c2 100644
--- a/tensorflow/core/common_runtime/device_resolver_local_test.cc
+++ b/tensorflow/core/common_runtime/device_resolver_local_test.cc
@@ -36,12 +36,12 @@ class DeviceResolverLocalTest : public ::testing::Test {
     string task_name = "/job:localhost/replica:0/task:0";
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     drl_.reset(new DeviceResolverLocal(device_mgr_.get()));
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<DeviceResolverLocal> drl_;
 };
diff --git a/tensorflow/core/common_runtime/device_set_test.cc b/tensorflow/core/common_runtime/device_set_test.cc
index fd9c4222a7afd4914415c9c62e1ced118ea75d1f..6a8c3d14e543a74354bae77518e9f88502813463 100644
--- a/tensorflow/core/common_runtime/device_set_test.cc
+++ b/tensorflow/core/common_runtime/device_set_test.cc
@@ -57,7 +57,7 @@ class DeviceSetTest : public ::testing::Test {
 class DummyFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     return Status::OK();
   }
 };
diff --git a/tensorflow/core/common_runtime/direct_session.cc b/tensorflow/core/common_runtime/direct_session.cc
index 3a23f084a5c80554a187d2a91f2ebb247cefae70..0434ca47b68f28ff65cb3d5e165bc5545ebe96f0 100644
--- a/tensorflow/core/common_runtime/direct_session.cc
+++ b/tensorflow/core/common_runtime/direct_session.cc
@@ -19,7 +19,6 @@ limitations under the License.
 #include <string>
 #include <vector>
 
-#include "absl/time/clock.h"
 #include "tensorflow/core/common_runtime/collective_executor_mgr.h"
 #include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
@@ -157,12 +156,12 @@ class DirectSessionFactory : public SessionFactory {
     if (options.config.graph_options().build_cost_model() > 0) {
       EnableCPUAllocatorFullStats(true);
     }
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
         options, "/job:localhost/replica:0/task:0", &devices));
 
     DirectSession* session =
-        new DirectSession(options, new DeviceMgr(devices), this);
+        new DirectSession(options, new DeviceMgr(std::move(devices)), this);
     {
       mutex_lock l(sessions_lock_);
       sessions_.push_back(session);
@@ -255,11 +254,19 @@ static RunHandlerPool* GetOrCreateRunHandlerPool(
   return pool;
 }
 
-bool DirectSession::ShouldUseRunHandlerPool() const {
-  if (options_.config.session_inter_op_thread_pool_size() > 0 ||
-      options_.config.use_per_session_threads()) {
+bool DirectSession::ShouldUseRunHandlerPool(
+    const RunOptions& run_options) const {
+  if (options_.config.use_per_session_threads()) return false;
+  if (options_.config.session_inter_op_thread_pool_size() > 0 &&
+      run_options.inter_op_thread_pool() > 0)
     return false;
-  }
+  // Only use RunHandlerPool when:
+  // a. Single global thread pool is used for inter-op parallelism.
+  // b. When multiple inter_op_thread_pool(s) are created, use it only while
+  // running sessions on the default inter_op_thread_pool=0. Typically,
+  // servo-team uses inter_op_thread_pool > 0 for model loading.
+  // TODO(crk): Revisit whether we'd want to create one (static) RunHandlerPool
+  // per entry in session_inter_op_thread_pool() in the future.
   return true;
 }
 
@@ -456,7 +463,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
                                   CallFrameInterface* call_frame,
                                   ExecutorsAndKeys* executors_and_keys,
                                   RunMetadata* run_metadata) {
-  const absl::Time start_time = absl::Now();
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   string session_id_meta = strings::StrCat("SessionRun #id=", step_id, "#");
   tracing::ScopedActivity activity(session_id_meta);
 
@@ -606,9 +613,8 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
   }
 
   std::unique_ptr<RunHandler> handler;
-  if (ShouldUseRunHandlerPool() &&
+  if (ShouldUseRunHandlerPool(run_options) &&
       run_options.experimental().use_run_handler_pool()) {
-    // Non-null only when a global inter-op pool is used.
     VLOG(1) << "Using RunHandler to scheduler inter-op closures.";
     handler = GetOrCreateRunHandlerPool(options_)->Get();
   }
@@ -712,7 +718,7 @@ Status DirectSession::RunInternal(int64 step_id, const RunOptions& run_options,
       exec_and_lib.graph->ToGraphDef(partition_graph_def);
     }
   }
-  UpdateGraphExecutionTime(absl::Now() - start_time);
+  UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
 
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/direct_session.h b/tensorflow/core/common_runtime/direct_session.h
index 3a168bbe3fcb08167465ab75a155e2d2b4038046..6754e9cfb71700090049107cf4dd122175527ffe 100644
--- a/tensorflow/core/common_runtime/direct_session.h
+++ b/tensorflow/core/common_runtime/direct_session.h
@@ -247,8 +247,10 @@ class DirectSession : public Session {
                                    ExecutorsAndKeys* executors_and_keys,
                                    RunMetadata* run_metadata);
 
-  // Returns whether inter-op execution uses a global pool.
-  bool ShouldUseRunHandlerPool() const;
+  // Returns whether inter-op execution uses a global pool or the input
+  // `run_options` requests being run on inter_op_thread_pool = 0 in case
+  // multiple pools are configured.
+  bool ShouldUseRunHandlerPool(const RunOptions& run_options) const;
 
   ::tensorflow::Status ExtendLocked(const GraphDef& graph)
       EXCLUSIVE_LOCKS_REQUIRED(graph_state_lock_);
diff --git a/tensorflow/core/common_runtime/eager/BUILD b/tensorflow/core/common_runtime/eager/BUILD
index a7b618c18be5a7f99f29fdce7273aaabe1390ad3..86890ba07d8b9a4320c47ffde1b3b8d78d15ac5a 100644
--- a/tensorflow/core/common_runtime/eager/BUILD
+++ b/tensorflow/core/common_runtime/eager/BUILD
@@ -181,6 +181,7 @@ tf_cc_test(
         "//tensorflow/core:lib",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.cc b/tensorflow/core/common_runtime/eager/attr_builder.cc
index 201f06242f80485163031851c7cd4472de38e873..a750f8cbba4de4abd33d6ec395b6b0a5fb76cc67 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder.cc
@@ -39,6 +39,18 @@ std::unordered_map<string, const AttrTypeMap*>* OpNameToAttrTypeMap() {
 
 const uint32 kIsList = 1U << 31;
 
+AttrTypeMap* DefaultFunctionAttrTypeMap() {
+  AttrTypeMap* map = new AttrTypeMap();
+  (*map)["executor_type"] = TF_ATTR_STRING;
+  (*map)["config"] = TF_ATTR_STRING;
+  return map;
+}
+
+const AttrTypeMap* GetDefaultFunctionAttrTypeMap() {
+  static const AttrTypeMap* map = DefaultFunctionAttrTypeMap();
+  return map;
+}
+
 }  // namespace
 
 Status OpDefForOp(const char* op_name, const OpDef** op_def) {
@@ -50,13 +62,27 @@ Status OpDefForOp(const char* op_name, const OpDef** op_def) {
   return s;
 }
 
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function) {
   mutex_lock l(g_op_name_to_attr_type_map_lock);
+  *is_function = false;
   *out = gtl::FindPtrOrNull(*OpNameToAttrTypeMap(), op_name);
   if (*out != nullptr) return Status::OK();
   const OpDef* op_def = nullptr;
   Status s = OpDefForOp(op_name, &op_def);
-  if (!s.ok()) return s;
+  if (errors::IsNotFound(s)) {
+    // If we did not find the op def, we assume `op_name` is a function.
+    // If it is actually a misspelled op, user will get another error when
+    // trying to run it.
+    // TODO(iga): If we ever have a use case for different attribute specs
+    // in different functions, we will need to look at the OpDef in the
+    // function def to retrieve their types.
+    *out = GetDefaultFunctionAttrTypeMap();
+    *is_function = true;
+    return Status::OK();
+  } else if (!s.ok()) {
+    return s;
+  }
   std::unique_ptr<AttrTypeMap> m(new AttrTypeMap);
   // TODO(agarwal): Avoid having to create this "registry" at runtime,
   // perhaps can be done at op registration time?
@@ -98,7 +124,7 @@ Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out) {
 #define DEFINE_SET_ATTR(value_type, value_field)                             \
   template <>                                                                \
   AttrBuilder& AttrBuilder::Set(StringPiece attr_name, value_type&& value) { \
-    value_field.push_back(std::make_pair(attr_name, value));                 \
+    value_field.push_back(std::make_pair(string(attr_name), value));         \
     return *this;                                                            \
   }
 
diff --git a/tensorflow/core/common_runtime/eager/attr_builder.h b/tensorflow/core/common_runtime/eager/attr_builder.h
index af5b7d80c324d986102ec66b750644e203c92d83..5e0172dfd328dbd4f16abdce879be1d1338e692c 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder.h
+++ b/tensorflow/core/common_runtime/eager/attr_builder.h
@@ -43,7 +43,11 @@ typedef std::unordered_map<string, uint32> AttrTypeMap;
 Status OpDefForOp(const char* op_name, const OpDef** op_def);
 
 // Returns the AttrTypeMap for the TensorFlow operation named op_name.
-Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out);
+// If op_name is not registered in global op registry, AttrTypeMapForOp assumes
+// the op to be a function and returns the default attributes for a function.
+// `is_function` is set to true in this case.
+Status AttrTypeMapForOp(const char* op_name, const AttrTypeMap** out,
+                        bool* is_function);
 
 // Looks for 'attr_name' in 'm' and sets 'out' and 'is_list'.
 Status AttrTypeByName(const AttrTypeMap& m, const string& attr_name,
@@ -95,7 +99,7 @@ class AttrBuilder {
   template <class T>
   AttrBuilder& Set(StringPiece attr_name, T&& value) {
     MayBeInitializeNodeDef();
-    SetInAttrValueMap(node_def_->mutable_attr(), attr_name, value);
+    SetInAttrValueMap(node_def_->mutable_attr(), string(attr_name), value);
     return *this;
   }
 
@@ -106,7 +110,7 @@ class AttrBuilder {
 
  private:
   template <class T>
-  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<StringPiece, T>, 2>;
+  using AttrVec = tensorflow::gtl::InlinedVector<std::pair<string, T>, 2>;
 
   void MayBeInitializeNodeDef();
   // Fill `m` with the attr-value pairs set via AttrBuilder::Set() so far, as
@@ -118,7 +122,7 @@ class AttrBuilder {
   void FillAttrValueMap(AttrValueMap* m, bool include_those_in_node_def) const;
 
   template <class T>
-  void SetInAttrValueMap(AttrValueMap* m, StringPiece attr_name,
+  void SetInAttrValueMap(AttrValueMap* m, const string& attr_name,
                          T&& value) const {
     DCHECK(!node_def_finalized_)
         << "Calling SetInAttrValueMap after BuildNodeDef.";
@@ -127,12 +131,12 @@ class AttrBuilder {
     AttrValue attr_value;
     if (found == nullptr) {
       SetAttrValue(value, &attr_value);
-      m->insert(AttrValueMap::value_type(string(attr_name), attr_value));
+      m->insert(AttrValueMap::value_type(attr_name, attr_value));
     } else {
       // TODO(ashankar): Do what is done in
       // NodeDefBuilder::CheckInconsistency(attr_name, *found, attr_value);
       SetAttrValue(std::forward<T>(value), &attr_value);
-      (*m)[string(attr_name)] = attr_value;
+      (*m)[attr_name] = attr_value;
     }
   }
 
diff --git a/tensorflow/core/common_runtime/eager/attr_builder_test.cc b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
index 79b094f2e008786661b0236bc7bcdb3f37a23946..220cc6f5ce0bff32cfdc8d4e837c6900c773728e 100644
--- a/tensorflow/core/common_runtime/eager/attr_builder_test.cc
+++ b/tensorflow/core/common_runtime/eager/attr_builder_test.cc
@@ -35,9 +35,18 @@ namespace {
 
 TEST(AttrTypeMap, Lookup) {
   const AttrTypeMap* m = nullptr;
-  Status s = AttrTypeMapForOp("ThisOpCannotPossiblyExist", &m);
-  EXPECT_FALSE(s.ok());
-  s = AttrTypeMapForOp("MatMul", &m);
+  // Unknown ops are assumed to be functions.
+  // Their maps are filled with default attributes.
+  bool is_function = false;
+  Status s = AttrTypeMapForOp("SomeFunctionName", &m, &is_function);
+  EXPECT_TRUE(s.ok());
+  EXPECT_TRUE(is_function);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("executor_type")->second);
+  EXPECT_EQ(TF_ATTR_STRING, m->find("config")->second);
+
+  is_function = true;
+  s = AttrTypeMapForOp("MatMul", &m, &is_function);
+  EXPECT_FALSE(is_function);
   ASSERT_TRUE(s.ok()) << s;
 
   TF_AttrType t;
@@ -50,7 +59,7 @@ TEST(AttrTypeMap, Lookup) {
   EXPECT_EQ(TF_ATTR_BOOL, t);
   EXPECT_EQ(is_list, 0);
 
-  s = AttrTypeMapForOp("Squeeze", &m);
+  s = AttrTypeMapForOp("Squeeze", &m, &is_function);
   ASSERT_TRUE(s.ok()) << s;
   s = AttrTypeByName(*m, "squeeze_dims", &t, &is_list);
   ASSERT_TRUE(s.ok()) << s;
diff --git a/tensorflow/core/common_runtime/eager/context.cc b/tensorflow/core/common_runtime/eager/context.cc
index f23cefb33d755dc4dfba9b67d1e8963f8198bd21..1727c045604bd19e038857fa34780f34cbb05d44 100644
--- a/tensorflow/core/common_runtime/eager/context.cc
+++ b/tensorflow/core/common_runtime/eager/context.cc
@@ -15,6 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/eager/context.h"
 
+#include "tensorflow/core/common_runtime/collective_executor_mgr.h"
+#include "tensorflow/core/common_runtime/collective_param_resolver_local.h"
+#include "tensorflow/core/common_runtime/device_resolver_local.h"
 #include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/process_util.h"
 #include "tensorflow/core/framework/resource_mgr.h"
@@ -32,18 +35,6 @@ bool ReadBoolFromEnvVar(StringPiece env_var_name, bool default_val) {
   return default_val;
 }
 
-std::unique_ptr<thread::ThreadPool> EagerThreadPool(
-    const SessionOptions& opts) {
-  SessionOptions opts_copy(opts);
-  if (opts_copy.config.inter_op_parallelism_threads() == 0) {
-    // Eager defaults to a single thread when no threads are specified.
-    opts_copy.config.set_inter_op_parallelism_threads(1);
-  }
-
-  return std::unique_ptr<thread::ThreadPool>(
-      NewThreadPoolFromSessionOptions(opts_copy));
-}
-
 }  // namespace
 
 EagerContext::EagerContext(const SessionOptions& opts,
@@ -61,7 +52,7 @@ EagerContext::EagerContext(const SessionOptions& opts,
     : policy_(default_policy),
       devices_(device_mgr->ListDevices()),
       rendezvous_(rendezvous),
-      thread_pool_(EagerThreadPool(opts)),
+      thread_pool_(NewThreadPoolFromSessionOptions(opts)),
       pflr_(new ProcessFunctionLibraryRuntime(
           device_mgr, opts.env, TF_GRAPH_DEF_VERSION, &func_lib_def_, {},
           thread_pool_.get())),
@@ -83,6 +74,13 @@ EagerContext::EagerContext(const SessionOptions& opts,
   runner_ = [this](std::function<void()> closure) {
     this->thread_pool_->Schedule(std::move(closure));
   };
+
+  std::unique_ptr<DeviceResolverInterface> drl(
+      new DeviceResolverLocal(local_device_mgr()));
+  std::unique_ptr<ParamResolverInterface> cprl(new CollectiveParamResolverLocal(
+      local_device_mgr(), drl.get(), "/job:localhost/replica:0/task:0"));
+  collective_executor_mgr_.reset(new CollectiveExecutorMgr(
+      opts.config, local_device_mgr(), std::move(drl), std::move(cprl)));
 }
 
 void EagerContext::InitDeviceMapAndAsync() {
diff --git a/tensorflow/core/common_runtime/eager/context.h b/tensorflow/core/common_runtime/eager/context.h
index 4de807bde31ec14b1571948a91fe2f930d50f427..cdef94789337550fdaa760638f098ba47af5dfdb 100644
--- a/tensorflow/core/common_runtime/eager/context.h
+++ b/tensorflow/core/common_runtime/eager/context.h
@@ -33,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/server_lib.h"
 #endif
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/log_memory.h"
 #include "tensorflow/core/framework/rendezvous.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -147,6 +148,11 @@ class EagerContext {
   bool LogMemory() { return log_memory_; }
 
   Rendezvous* GetRendezvous() { return rendezvous_; }
+  std::unique_ptr<CollectiveExecutor::Handle> GetCollectiveExecutorHandle() {
+    return std::unique_ptr<CollectiveExecutor::Handle>(
+        new CollectiveExecutor::Handle(
+            collective_executor_mgr_->FindOrCreate(0), true /*inherit_ref*/));
+  }
 
   const tensorflow::DeviceMgr* local_device_mgr() const {
     return (local_device_manager_ != nullptr) ? local_device_manager_.get()
@@ -206,6 +212,8 @@ class EagerContext {
   bool UseSendTensorRPC() { return use_send_tensor_rpc_; }
   bool PinSmallOpsToCPU() { return pin_small_ops_to_cpu_; }
 
+  tensorflow::Env* TFEnv() const { return env_; }
+
  private:
   void InitDeviceMapAndAsync();
   Status MaybeRegisterFunctionRemotely(const FunctionDef& fdef);
@@ -271,6 +279,8 @@ class EagerContext {
 
   Env* const env_;
 
+  std::unique_ptr<CollectiveExecutorMgrInterface> collective_executor_mgr_;
+
 #ifndef __ANDROID__
   void CloseRemoteContexts();
 
diff --git a/tensorflow/core/common_runtime/eager/eager_operation.h b/tensorflow/core/common_runtime/eager/eager_operation.h
index fcf62c7715320466a49c707e31cf7a5045f16b8e..935ca7f9aa766a69582b4c94fec6c508e3f5a369 100644
--- a/tensorflow/core/common_runtime/eager/eager_operation.h
+++ b/tensorflow/core/common_runtime/eager/eager_operation.h
@@ -22,11 +22,14 @@ limitations under the License.
 namespace tensorflow {
 class EagerOperation {
  public:
-  // t is NULL iff the EagerOperation corresponds to a TensorFlow function
-  // instead of a primitive operation.
   EagerOperation(tensorflow::EagerContext* ctx, const char* op,
-                 const tensorflow::AttrTypeMap* t)
-      : ctx_(ctx), name_(op), attrs_(op), attr_types_(t), device_(nullptr) {}
+                 bool is_function, const tensorflow::AttrTypeMap* t)
+      : ctx_(ctx),
+        name_(op),
+        attrs_(op),
+        attr_types_(t),
+        device_(nullptr),
+        is_function_(is_function) {}
 
   ~EagerOperation() {
     for (tensorflow::TensorHandle* h : inputs_) {
@@ -34,7 +37,7 @@ class EagerOperation {
     }
   }
 
-  bool is_function() const { return attr_types_ == nullptr; }
+  bool is_function() const { return is_function_; }
 
   tensorflow::EagerContext* EagerContext() { return ctx_; }
 
@@ -68,6 +71,7 @@ class EagerOperation {
   tensorflow::gtl::InlinedVector<tensorflow::TensorHandle*, 4> inputs_;
   tensorflow::Device* device_;
   bool use_xla_ = false;
+  const bool is_function_;
 };
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/eager/execute.cc b/tensorflow/core/common_runtime/eager/execute.cc
index 3cdda3ed75322b549a6bb475bc9ce305ff3ee5a2..783baa96c92f224e45404e5f6586011599f02292 100644
--- a/tensorflow/core/common_runtime/eager/execute.cc
+++ b/tensorflow/core/common_runtime/eager/execute.cc
@@ -24,6 +24,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/eager/execute_node.h"
 #include "tensorflow/core/common_runtime/eager/kernel_and_device.h"
 #include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/lib/core/errors.h"
 #ifndef __ANDROID__
 #include "tensorflow/core/distributed_runtime/eager/eager_client.h"
 #include "tensorflow/core/distributed_runtime/eager/remote_execute_node.h"
@@ -192,7 +193,7 @@ Status ValidateInputTypeAndPlacement(EagerContext* ctx, Device* op_device,
 }
 
 Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
-  DeviceTypeVector final_devices;
+  PrioritizedDeviceTypeVector final_devices;
   TF_RETURN_IF_ERROR(SupportedDeviceTypesForNode(
       ctx->prioritized_device_type_list(), ndef, &final_devices));
   if (final_devices.empty()) {
@@ -202,7 +203,7 @@ Status SelectDevice(const NodeDef& ndef, EagerContext* ctx, Device** device) {
                             " :\n", KernelsRegisteredForOp(ndef.op()));
   }
   for (Device* d : *ctx->devices()) {
-    if (d->device_type() == final_devices[0].type_string()) {
+    if (d->device_type() == final_devices[0].first.type_string()) {
       *device = d;
       return Status::OK();
     }
@@ -262,7 +263,8 @@ Status EagerLocalExecute(EagerOperation* op,
     // Note that it is not ideal, but currently ok, to set this
     // attribute after computing the kernel cache key above.
     if (op->is_function() && device != nullptr &&
-        device->device_type() == "TPU") {
+        (device->device_type() == "TPU" || device->device_type() == "XLA_GPU" ||
+         device->device_type() == "XLA_CPU")) {
       op->MutableAttrs()->Set(kXlaCompileAttr, true);
     }
 
@@ -283,7 +285,8 @@ Status EagerLocalExecute(EagerOperation* op,
           "Unable to find a FunctionLibraryRuntime corresponding to device ",
           device->name());
     }
-    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory());
+    kernel = new KernelAndDevice(ctx->GetRendezvous(), ctx->LogMemory(),
+                                 ctx->GetCollectiveExecutorHandle());
     status = KernelAndDevice::Init(ndef, flr, ctx->runner(), kernel);
     if (!status.ok()) {
       delete kernel;
@@ -827,8 +830,11 @@ Status ExecuteSend(EagerContext* ctx, tensorflow::Device* device,
                    TensorHandle* h, StringPiece wire_id,
                    const string& recv_device) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Send", &types));
-  tensorflow::EagerOperation op(ctx, "_Send", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Send", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Send", /*is_function=*/false, types);
 
   op.AddInput(h);
 
@@ -855,8 +861,11 @@ Status ExecuteRecv(EagerContext* ctx, tensorflow::Device* device,
                    const string& send_device, int64 send_device_incarnation,
                    TensorHandle** result) {
   const tensorflow::AttrTypeMap* types;
-  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp("_Recv", &types));
-  tensorflow::EagerOperation op(ctx, "_Recv", types);
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(
+      tensorflow::AttrTypeMapForOp("_Recv", &types, &is_function));
+  DCHECK(!is_function);
+  tensorflow::EagerOperation op(ctx, "_Recv", /*is_function=*/false, types);
 
   op.SetDevice(device);
 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.cc b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
index 192d22dfd5a105a31ab19a33c29ddc83ecd04142..317e9a16074b37ef6ecaf1d7f8c1a2daa412f75e 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.cc
@@ -84,6 +84,15 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
                              tensorflow::HOST_MEMORY);
   }
 
+  gtl::InlinedVector<DeviceContext*, 4> input_device_contexts;
+  for (int i = 0; i < inputs->size(); i++) {
+    DeviceContext* device_context = nullptr;
+    if (device_->tensorflow_gpu_device_info() != nullptr) {
+      device_context = device_->tensorflow_gpu_device_info()->default_context;
+    }
+    input_device_contexts.push_back(device_context);
+  }
+
   OpKernelContext::Params params;
   params.device = device_;
   params.frame_iter = FrameAndIter(0, 0);
@@ -110,6 +119,9 @@ Status KernelAndDevice::Run(ScopedStepContainer* step_container,
   }
 
   params.step_container = step_container;
+  params.collective_executor =
+      collective_executor_ ? collective_executor_->get() : nullptr;
+  params.input_device_contexts = &input_device_contexts;
 
   OpKernelContext context(&params);
 
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device.h b/tensorflow/core/common_runtime/eager/kernel_and_device.h
index 52dac94ccca0cc987751400778c3c1c6e95272d6..ee430b7fc70e1f4e5256e9dd28f4240ce57de86a 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device.h
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device.h
@@ -23,6 +23,7 @@ limitations under the License.
 
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/collective.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/types.h"
@@ -55,10 +56,16 @@ class KernelAndDevice {
                      KernelAndDevice* out);
 
   KernelAndDevice(tensorflow::Rendezvous* rendez, bool log_memory)
+      : KernelAndDevice(rendez, log_memory, nullptr) {}
+
+  KernelAndDevice(
+      tensorflow::Rendezvous* rendez, bool log_memory,
+      std::unique_ptr<CollectiveExecutor::Handle> collective_executor)
       : device_(nullptr),
         flr_(nullptr),
         rendez_(rendez),
-        log_memory_(log_memory) {}
+        log_memory_(log_memory),
+        collective_executor_(std::move(collective_executor)) {}
 
   // TODO(ashankar): Handle list-valued inputs.
   Status Run(std::vector<Tensor>* inputs, std::vector<Tensor>* outputs,
@@ -92,6 +99,7 @@ class KernelAndDevice {
   std::function<void(std::function<void()>)>* runner_;
   std::function<void(std::function<void()>)> default_runner_;
   const bool log_memory_;
+  const std::unique_ptr<CollectiveExecutor::Handle> collective_executor_;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
index 948bdbcaf536081ba78128f687a949bed8e02b6d..3ffed3ce321e79d021c302acf444f93cc9ccce53 100644
--- a/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
+++ b/tensorflow/core/common_runtime/eager/kernel_and_device_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <memory>
 #include <vector>
 
+#include "absl/memory/memory.h"
 #include "tensorflow/cc/client/client_session.h"
 #include "tensorflow/cc/framework/ops.h"
 #include "tensorflow/cc/framework/scope.h"
@@ -37,12 +38,13 @@ namespace {
 class TestEnv {
  public:
   TestEnv() : flib_def_(OpRegistry::Global(), {}) {
-    Device* device =
-        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0");
-    device_mgr_.reset(new DeviceMgr({device}));
-    flib_runtime_ = NewFunctionLibraryRuntime(device_mgr_.get(), Env::Default(),
-                                              device, TF_GRAPH_DEF_VERSION,
-                                              &flib_def_, nullptr, {}, nullptr);
+    std::vector<std::unique_ptr<Device>> devices;
+    devices.push_back(
+        DeviceFactory::NewDevice("CPU", {}, "/job:a/replica:0/task:0"));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
+    flib_runtime_ = NewFunctionLibraryRuntime(
+        device_mgr_.get(), Env::Default(), device_mgr_->ListDevices()[0],
+        TF_GRAPH_DEF_VERSION, &flib_def_, nullptr, {}, nullptr);
   }
 
   FunctionLibraryRuntime* function_library_runtime() const {
diff --git a/tensorflow/core/common_runtime/eager/tensor_handle.cc b/tensorflow/core/common_runtime/eager/tensor_handle.cc
index d8d6b7a63b6f7189d4db66846a2f48982a20e610..0acd1609361453a0901e346f3b9d76e6e3a7b872 100644
--- a/tensorflow/core/common_runtime/eager/tensor_handle.cc
+++ b/tensorflow/core/common_runtime/eager/tensor_handle.cc
@@ -184,10 +184,7 @@ Status TensorHandle::CopyToDevice(EagerContext* ctx, tensorflow::Device* dstd,
   bool is_same_device = (srcd == dstd) || (srcd->name() == dstd->name());
   const bool dst_cpu = dstd->tensorflow_gpu_device_info() == nullptr;
   const bool src_cpu = srcd->tensorflow_gpu_device_info() == nullptr;
-  // both_on_cpu can be true and yet is_same_device is false, if one of src/dst
-  // has device type XLA_CPU, and the other CPU.
-  const bool both_on_cpu = src_cpu && dst_cpu;
-  if (is_same_device || both_on_cpu) {
+  if (is_same_device) {
     *output = new tensorflow::TensorHandle(*src, dstd, dstd, ctx);
     return tensorflow::Status::OK();
   }
diff --git a/tensorflow/core/common_runtime/executor.cc b/tensorflow/core/common_runtime/executor.cc
index 1e68954827f3c5fad781fa8bb3ca821abae53ee4..6b3284b84a0d2741f315c3f91db35eebc68f9e98 100644
--- a/tensorflow/core/common_runtime/executor.cc
+++ b/tensorflow/core/common_runtime/executor.cc
@@ -1239,7 +1239,6 @@ class ExecutorState {
   // Step-local container.
   ScopedStepContainer* step_container_;
   StepStatsCollectorInterface* const stats_collector_;
-  const tracing::TraceCollector* const trace_collector_;
   const tracing::EventCollector* const event_collector_;
   Context context_;
 
@@ -1366,7 +1365,6 @@ ExecutorState::ExecutorState(const Executor::Args& args, ExecutorImpl* impl)
       tensor_store_(args.tensor_store),
       step_container_(args.step_container),
       stats_collector_(args.stats_collector),
-      trace_collector_(tracing::GetTraceCollector()),
       event_collector_(
           tracing::GetEventCollector(tracing::EventCategory::kCompute)),
       context_(ContextKind::kThread),
@@ -1565,7 +1563,6 @@ struct ExecutorState::AsyncState {
 // Returns true if `item` might be traced by the given trace and event
 // collectors. Returns false only if `item` definitely will not be traced.
 bool MightTrace(const NodeItem& item,
-                const tracing::TraceCollector* trace_collector,
                 const tracing::EventCollector* event_collector,
                 bool using_annotations) {
   // Tracing will only be enabled if either `event_collector` is non null,
@@ -1578,6 +1575,7 @@ bool MightTrace(const NodeItem& item,
   if (event_collector != nullptr) {
     return true;
   }
+  auto* trace_collector = tracing::GetTraceCollector();
   if (trace_collector) {
     if (using_annotations) {
       return trace_collector->IsEnabledForAnnotations();
@@ -1713,7 +1711,7 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         auto done = [this, state]() {
           Device* device = impl_->params_.device;
           NodeExecStatsInterface* stats = state->stats;  // Shorthand
-          Entry* first_input = state->first_input;     // Shorthand
+          Entry* first_input = state->first_input;       // Shorthand
 
           nodestats::SetOpEnd(stats);
           EntryVector outputs;
@@ -1762,9 +1760,8 @@ void ExecutorState::Process(TaggedNode tagged_node, int64 scheduled_nsec) {
         OpKernelContext ctx(&params, item.num_outputs);
         nodestats::SetOpStart(stats);
 
-        if (TF_PREDICT_FALSE(MightTrace(item, trace_collector_,
-                                        event_collector_,
-                                        trace_using_annotations_))) {
+        if (TF_PREDICT_FALSE(
+                MightTrace(item, event_collector_, trace_using_annotations_))) {
           const string& op_name = op_kernel->name();
           tracing::ScopedRegion region(tracing::EventCategory::kCompute,
                                        op_name);
@@ -2046,6 +2043,24 @@ Status ExecutorState::ProcessOutputs(const NodeItem& item, OpKernelContext* ctx,
 void ExecutorState::PropagateOutputs(const TaggedNode& tagged_node,
                                      const NodeItem* item, EntryVector* outputs,
                                      TaggedNodeSeq* ready) {
+  auto activity_handle =
+      [&]() -> std::unique_ptr<tracing::TraceCollector::Handle> {
+    auto* trace_collector = tracing::GetTraceCollector();
+    if (TF_PREDICT_FALSE(trace_collector != nullptr &&
+                         trace_collector->IsEnabledForActivities(
+                             false /* is_expensive */))) {
+      const string& op_name = item->kernel->name();
+      // Intentionally using ExecutorPropagateOutputs as the first key so that
+      // users are aware that it's not the op invocation.
+      return trace_collector->CreateActivityHandle(
+          "ExecutorPropagateOutputs",
+          strings::StrCat(op_name, "#id=", step_id_, "#"),
+          false /* is_expensive */);
+    } else {
+      return nullptr;
+    }
+  }();
+
   const Node* node = tagged_node.node;
   FrameState* input_frame = tagged_node.input_frame;
   const int64 input_iter = tagged_node.input_iter;
@@ -2377,18 +2392,23 @@ void ExecutorState::Finish() {
   auto done_cb = std::move(done_cb_);
   auto runner = std::move(runner_);
   mu_.unlock();
+  CHECK(done_cb != nullptr);
   Device* device = impl_->params_.device;
+
   if ((sync_on_finish_ && status.ok()) || device->RequiresSyncOnCompletion()) {
     // Block until the device has finished all queued operations. For
     // devices like GPUs that continue to execute Ops after their Compute
     // methods have completed, this ensures that control is not returned to
     // the user until the step (and its side-effects) has actually completed.
-    status.Update(device->Sync());
+    device->Sync([=](Status new_status) mutable {
+      status.Update(new_status);
+      delete this;
+      runner([=]() { done_cb(status); });
+    });
+  } else {
+    delete this;
+    runner([=]() { done_cb(status); });
   }
-
-  delete this;
-  CHECK(done_cb != nullptr);
-  runner([=]() { done_cb(status); });
 }
 
 void ExecutorState::FindOrCreateChildFrame(FrameState* frame, int64 iter,
diff --git a/tensorflow/core/common_runtime/executor_test.cc b/tensorflow/core/common_runtime/executor_test.cc
index 7697103faf9bfa7a3fdbdbc0c3286d07d257d817..c311b2533eaa0bf08494a71b51922b1b886ac549 100644
--- a/tensorflow/core/common_runtime/executor_test.cc
+++ b/tensorflow/core/common_runtime/executor_test.cc
@@ -53,17 +53,17 @@ class ExecutorTest : public ::testing::Test {
     // when the test completes.
     CHECK(rendez_->Unref());
     delete exec_;
-    delete device_;
   }
 
   // Resets executor_ with a new executor based on a graph 'gdef'.
   void Create(std::unique_ptr<const Graph> graph) {
     const int version = graph->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
@@ -83,7 +83,7 @@ class ExecutorTest : public ::testing::Test {
   }
 
   thread::ThreadPool* thread_pool_ = nullptr;
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_;
   Executor* exec_ = nullptr;
   StepStatsCollector step_stats_collector_;
   StepStats step_stats_;
diff --git a/tensorflow/core/common_runtime/function.cc b/tensorflow/core/common_runtime/function.cc
index 6775695fa2d9410de53635a130d245f269eb04b6..7eb622dc117f40a68079e6cea1a829227acfed7a 100644
--- a/tensorflow/core/common_runtime/function.cc
+++ b/tensorflow/core/common_runtime/function.cc
@@ -46,7 +46,11 @@ namespace tensorflow {
 
 // A few string constant used throughout this module.
 static constexpr const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static constexpr const char* const kDeviceArgOp =
+    FunctionLibraryDefinition::kDeviceArgOp;
 static constexpr const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
+static constexpr const char* const kDeviceRetOp =
+    FunctionLibraryDefinition::kDeviceRetOp;
 static constexpr const char* const kGradientOp =
     FunctionLibraryDefinition::kGradientOp;
 static constexpr const char* const kNodeLabel = "Func";
@@ -1633,9 +1637,9 @@ FunctionBody::FunctionBody(const FunctionDef& f, DataTypeSlice arg_t,
   this->ret_nodes.resize(ret_types.size());
   for (Node* n : this->graph->op_nodes()) {
     gtl::InlinedVector<Node*, 4>* node_vec;
-    if (n->type_string() == kRetOp) {
+    if (n->type_string() == kRetOp || n->type_string() == kDeviceRetOp) {
       node_vec = &this->ret_nodes;
-    } else if (n->type_string() == kArgOp) {
+    } else if (n->type_string() == kArgOp || n->type_string() == kDeviceArgOp) {
       node_vec = &this->arg_nodes;
     } else {
       continue;
diff --git a/tensorflow/core/common_runtime/function_test.cc b/tensorflow/core/common_runtime/function_test.cc
index 13c189fb87732cd72e3319f4cff0b0a26d12a93d..cab95cb596858f99285c3cfc5673f87b70368a32 100644
--- a/tensorflow/core/common_runtime/function_test.cc
+++ b/tensorflow/core/common_runtime/function_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <atomic>
 #include <utility>
 
+#include "absl/memory/memory.h"
 #include "absl/strings/numbers.h"
 #include "absl/strings/str_split.h"
 #include "tensorflow/cc/ops/array_ops_internal.h"
@@ -147,14 +148,15 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 3});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    device_mgr_.reset(new DeviceMgr(devices_));
+    device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
     pflr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, default_thread_pool, nullptr /* cluster_flr */));
@@ -358,7 +360,6 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   FunctionLibraryRuntime* flr0_;
   FunctionLibraryRuntime* flr1_;
   FunctionLibraryRuntime* flr2_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
@@ -1432,9 +1433,7 @@ TEST_F(FunctionLibraryRuntimeTest, Gradient_AddSum) {
 
     GraphDef actual;
     g->ToGraphDef(&actual);
-    // The optimizer is non-deterministic, so we only check that the number of
-    // nodes is not greater than expected.
-    EXPECT_LE(actual.node_size(), expected.node_size());
+    TF_EXPECT_GRAPH_EQ(expected, actual);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/function_threadpool_test.cc b/tensorflow/core/common_runtime/function_threadpool_test.cc
index 655a68cfc936c739fd9d90d0e39b46afb2bb1f45..1b803736fb881c8f133198ab39e5801a357c5659 100644
--- a/tensorflow/core/common_runtime/function_threadpool_test.cc
+++ b/tensorflow/core/common_runtime/function_threadpool_test.cc
@@ -54,21 +54,19 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 3});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(
-        options, "/job:localhost/replica:0/task:0", &devices_));
+        options, "/job:localhost/replica:0/task:0", &devices));
 
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
     OptimizerOptions opts;
-    device_mgr_.reset(new DeviceMgr(devices_));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     pflr_.reset(new ProcessFunctionLibraryRuntime(
         device_mgr_.get(), Env::Default(), TF_GRAPH_DEF_VERSION, lib_def_.get(),
         opts, default_thread_pool, nullptr /* cluster_flr */));
     flr0_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:0");
-    flr1_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:1");
-    flr2_ = pflr_->GetFLR("/job:localhost/replica:0/task:0/cpu:2");
-    fdef_lib_ = lib_def_->ToProto();
   }
 
   Status Run(FunctionLibraryRuntime* flr, FunctionLibraryRuntime::Handle handle,
@@ -192,13 +190,9 @@ class FunctionLibraryRuntimeTest : public ::testing::Test {
   }
 
   FunctionLibraryRuntime* flr0_;
-  FunctionLibraryRuntime* flr1_;
-  FunctionLibraryRuntime* flr2_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionDefLibrary fdef_lib_;
 };
 
 TEST_F(FunctionLibraryRuntimeTest, DefaultThreadpool) {
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.cc b/tensorflow/core/common_runtime/gpu/gpu_device.cc
index 81fea311e13c766b1fdb79d5fdc63e21940dd2bd..5152d97fdefed688ba05043072ff6df635471ed9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.cc
@@ -907,9 +907,9 @@ Allocator* BaseGPUDevice::GetScopedAllocator(AllocatorAttributes attr,
 const int BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength = 1000;
 const int BaseGPUDeviceFactory::InterconnectMap::kStreamExecutorStrength = 1;
 
-Status BaseGPUDeviceFactory::CreateDevices(const SessionOptions& options,
-                                           const string& name_prefix,
-                                           std::vector<Device*>* devices) {
+Status BaseGPUDeviceFactory::CreateDevices(
+    const SessionOptions& options, const string& name_prefix,
+    std::vector<std::unique_ptr<Device>>* devices) {
   TF_RETURN_IF_ERROR(ValidateGPUMachineManager());
   se::Platform* gpu_manager = GPUMachineManager();
   if (gpu_manager == nullptr) {
@@ -1073,12 +1073,10 @@ static string GetShortDeviceDescription(PlatformGpuId platform_gpu_id,
   // LINT.ThenChange(//tensorflow/python/platform/test.py)
 }
 
-Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
-                                             const string& name_prefix,
-                                             TfGpuId tf_gpu_id,
-                                             int64 memory_limit,
-                                             const DeviceLocality& dev_locality,
-                                             std::vector<Device*>* devices) {
+Status BaseGPUDeviceFactory::CreateGPUDevice(
+    const SessionOptions& options, const string& name_prefix, TfGpuId tf_gpu_id,
+    int64 memory_limit, const DeviceLocality& dev_locality,
+    std::vector<std::unique_ptr<Device>>* devices) {
   CHECK_GE(tf_gpu_id.value(), 0);
   const string device_name =
       strings::StrCat(name_prefix, "/device:GPU:", tf_gpu_id.value());
@@ -1108,7 +1106,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
   // different (which should be an error).
   //
   // TODO(laigd): report error if memory_limit doesn't match stats.bytes_limit.
-  BaseGPUDevice* gpu_device = CreateGPUDevice(
+  std::unique_ptr<BaseGPUDevice> gpu_device = CreateGPUDevice(
       options, device_name, static_cast<Bytes>(stats.bytes_limit), dev_locality,
       tf_gpu_id, GetShortDeviceDescription(platform_gpu_id, desc),
       gpu_allocator, ProcessState::singleton()->GetCPUAllocator(numa_node));
@@ -1116,7 +1114,7 @@ Status BaseGPUDeviceFactory::CreateGPUDevice(const SessionOptions& options,
             << (stats.bytes_limit >> 20) << " MB memory) -> physical GPU ("
             << GetShortDeviceDescription(platform_gpu_id, desc) << ")";
   TF_RETURN_IF_ERROR(gpu_device->Init(options));
-  devices->push_back(gpu_device);
+  devices->push_back(std::move(gpu_device));
 
   return Status::OK();
 }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device.h b/tensorflow/core/common_runtime/gpu/gpu_device.h
index 674e8384d5eaac9946a3882ca7c1c7655229b8c4..d002d02c51d073ef3019fa1659d555b5d092d883 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_device.h
@@ -166,7 +166,7 @@ class BaseGPUDevice : public LocalDevice {
 class BaseGPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override;
+                       std::vector<std::unique_ptr<Device>>* devices) override;
 
   struct InterconnectMap {
     // Name of interconnect technology, if known.
@@ -207,15 +207,13 @@ class BaseGPUDeviceFactory : public DeviceFactory {
   Status CreateGPUDevice(const SessionOptions& options,
                          const string& name_prefix, TfGpuId tf_gpu_id,
                          int64 memory_limit, const DeviceLocality& dev_locality,
-                         std::vector<Device*>* devices);
-
-  virtual BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
-                                         const string& name, Bytes memory_limit,
-                                         const DeviceLocality& dev_locality,
-                                         TfGpuId tf_gpu_id,
-                                         const string& physical_device_desc,
-                                         Allocator* gpu_allocator,
-                                         Allocator* cpu_allocator) = 0;
+                         std::vector<std::unique_ptr<Device>>* devices);
+
+  virtual std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
+      const SessionOptions& options, const string& name, Bytes memory_limit,
+      const DeviceLocality& dev_locality, TfGpuId tf_gpu_id,
+      const string& physical_device_desc, Allocator* gpu_allocator,
+      Allocator* cpu_allocator) = 0;
 
   // Returns into 'ids' the list of valid platform GPU ids, in the order that
   // they should map to TF GPU ids "/device:GPU:0", "/device:GPU:1", etc,
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
index e1aaf95df6de07c8d12f2c443f0b6bfd6a99a968..8dc719732927880e6ebb628962160c4a90b1f25c 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_factory.cc
@@ -59,15 +59,14 @@ class GPUDevice : public BaseGPUDevice {
 
 class GPUDeviceFactory : public BaseGPUDeviceFactory {
  private:
-  BaseGPUDevice* CreateGPUDevice(const SessionOptions& options,
-                                 const string& name, Bytes memory_limit,
-                                 const DeviceLocality& locality,
-                                 TfGpuId tf_gpu_id,
-                                 const string& physical_device_desc,
-                                 Allocator* gpu_allocator,
-                                 Allocator* cpu_allocator) override {
-    return new GPUDevice(options, name, memory_limit, locality, tf_gpu_id,
-                         physical_device_desc, gpu_allocator, cpu_allocator);
+  std::unique_ptr<BaseGPUDevice> CreateGPUDevice(
+      const SessionOptions& options, const string& name, Bytes memory_limit,
+      const DeviceLocality& locality, TfGpuId tf_gpu_id,
+      const string& physical_device_desc, Allocator* gpu_allocator,
+      Allocator* cpu_allocator) override {
+    return absl::make_unique<GPUDevice>(options, name, memory_limit, locality,
+                                        tf_gpu_id, physical_device_desc,
+                                        gpu_allocator, cpu_allocator);
   }
 };
 
@@ -108,7 +107,7 @@ class GPUCompatibleCPUDevice : public ThreadPoolDevice {
 class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -116,7 +115,7 @@ class GPUCompatibleCPUDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
-      devices->push_back(new GPUCompatibleCPUDevice(
+      devices->push_back(absl::make_unique<GPUCompatibleCPUDevice>(
           options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
     }
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
index 75be6d60b86af101fb9de7497490e72c523d632b..58656ec7576ef92122f2855acf2b544a30d00573 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_on_non_gpu_machine_test.cc
@@ -33,7 +33,7 @@ namespace {
 
 TEST(GPUDeviceOnNonGPUMachineTest, CreateGPUDevicesOnNonGPUMachine) {
   SessionOptions opts;
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, "/job:localhost/replica:0/task:0", &devices));
   EXPECT_TRUE(devices.empty());
diff --git a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
index 36294094e9ad88fb45832c0295d07c9c1dbf5c6b..ae623b2adbe152de6cbad248db234ac5469f83e1 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_device_test.cc
@@ -88,7 +88,7 @@ class GPUDeviceTest : public ::testing::Test {
 
 TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,abc");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -97,7 +97,7 @@ TEST_F(GPUDeviceTest, FailedToParseVisibleDeviceList) {
 
 TEST_F(GPUDeviceTest, InvalidGpuId) {
   SessionOptions opts = MakeSessionOptions("100");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -107,7 +107,7 @@ TEST_F(GPUDeviceTest, InvalidGpuId) {
 
 TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
   SessionOptions opts = MakeSessionOptions("0,0");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -117,7 +117,7 @@ TEST_F(GPUDeviceTest, DuplicateEntryInVisibleDeviceList) {
 
 TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithMemoryFractionSettings) {
   SessionOptions opts = MakeSessionOptions("0", 0.1, 1, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -129,7 +129,7 @@ TEST_F(GPUDeviceTest, GpuDeviceCountTooSmall) {
   // device_count is 0, but with one entry in visible_device_list and one
   // (empty) VirtualDevices messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 0, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
@@ -141,7 +141,7 @@ TEST_F(GPUDeviceTest, NotEnoughGpuInVisibleDeviceList) {
   // Single entry in visible_device_list with two (empty) VirtualDevices
   // messages.
   SessionOptions opts = MakeSessionOptions("0", 0, 8, {{}, {}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::UNKNOWN);
@@ -155,7 +155,7 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
   // Three entries in visible_device_list with two (empty) VirtualDevices
   // messages.
   SessionOptions opts = MakeSessionOptions("0,1", 0, 8, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INVALID_ARGUMENT);
@@ -169,39 +169,36 @@ TEST_F(GPUDeviceTest, VirtualDeviceConfigConflictsWithVisibleDeviceList) {
 TEST_F(GPUDeviceTest, EmptyVirtualDeviceConfig) {
   // It'll create single virtual device when the virtual device config is empty.
   SessionOptions opts = MakeSessionOptions("0");
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithNoMemoryLimit) {
   // It'll create single virtual device for the gpu in question when
   // memory_limit_mb is unset.
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_GE(devices[0]->attributes().memory_limit(), 0);
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, SingleVirtualDeviceWithMemoryLimit) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(1, devices.size());
   EXPECT_EQ(123 << 20, devices[0]->attributes().memory_limit());
-  gtl::STLDeleteElements(&devices);
 }
 
 TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
   SessionOptions opts = MakeSessionOptions("0", 0, 1, {{123, 456}});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_CHECK_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   EXPECT_EQ(2, devices.size());
@@ -219,7 +216,6 @@ TEST_F(GPUDeviceTest, MultipleVirtualDevices) {
             devices[1]->attributes().locality().links().link(0).type());
   EXPECT_EQ(BaseGPUDeviceFactory::InterconnectMap::kSameDeviceStrength,
             devices[1]->attributes().locality().links().link(0).strength());
-  gtl::STLDeleteElements(&devices);
 }
 
 // Enabling unified memory on pre-Pascal GPUs results in an initialization
@@ -236,7 +232,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryUnavailableOnPrePascalGpus) {
   opts.config.mutable_gpu_options()
       ->mutable_experimental()
       ->set_use_unified_memory(true);
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices);
   EXPECT_EQ(status.code(), error::INTERNAL);
@@ -259,7 +255,7 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
   }
 
   SessionOptions opts = MakeSessionOptions("0", kGpuMemoryFraction);
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_ASSERT_OK(DeviceFactory::GetFactory("GPU")->CreateDevices(
       opts, kDeviceNamePrefix, &devices));
   ASSERT_EQ(1, devices.size());
@@ -278,8 +274,6 @@ TEST_F(GPUDeviceTest, UnifiedMemoryAllocation) {
                                      (memory_limit >> 20) << 20);
   EXPECT_NE(ptr, nullptr);
   allocator->DeallocateRaw(ptr);
-
-  gtl::STLDeleteElements(&devices);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
index d2adf699f524ef6771da6b0a41e7fc552d2bbdfa..fe3214755715a896b472835652be68c5ef65a6e9 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_event_mgr_test.cc
@@ -78,7 +78,8 @@ static std::atomic_int_fast64_t live_tensor_bytes(0);
 // A TensorBuffer that counts live memory usage for testing
 class TestTensorBuffer : public TensorBuffer {
  public:
-  explicit TestTensorBuffer(size_t bytes) : bytes_(bytes) {
+  explicit TestTensorBuffer(size_t bytes)
+      : TensorBuffer(nullptr), bytes_(bytes) {
     live_tensor_bytes += bytes_;
   }
   ~TestTensorBuffer() override { live_tensor_bytes -= bytes_; }
@@ -86,7 +87,6 @@ class TestTensorBuffer : public TensorBuffer {
   size_t size() const override { return bytes_; }
 
   // Not used in this test
-  void* data() const override { return nullptr; }
   TensorBuffer* root_buffer() override { return nullptr; }
   void FillAllocationDescription(AllocationDescription* arg) const override {}
 
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
index 3e95374fda89cd14660fa6974789c17be522bb03..8167cfb9d7dc6cd91a17323b3083d1823cbaa5e0 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.cc
@@ -55,35 +55,25 @@ bool useCudaMemoryGuardAllocator() {
 
 }  // namespace
 
-GPUProcessState* GPUProcessState::instance_ = nullptr;
-
-/*static*/ GPUProcessState* GPUProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new GPUProcessState;
-  }
-  CHECK(instance_->process_state_);
-
-  return instance_;
+/*static*/ GPUProcessState* GPUProcessState::singleton(GPUProcessState* ps) {
+  static GPUProcessState* instance = ps ? ps : new GPUProcessState;
+  DCHECK((!ps) || (ps == instance))
+      << "Multiple calls to GPUProcessState with non-null ps";
+  return instance;
 }
 
 GPUProcessState::GPUProcessState() : gpu_device_enabled_(false) {
-  CHECK(instance_ == nullptr);
-  instance_ = this;
   process_state_ = ProcessState::singleton();
 }
 
-// Normally the GPUProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-GPUProcessState::~GPUProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-}
-
 int GPUProcessState::BusIdForGPU(TfGpuId tf_gpu_id) {
   // Return the NUMA node associated with the GPU's StreamExecutor.
   se::StreamExecutor* se =
       GpuIdUtil::ExecutorForTfGpuId(tf_gpu_id).ValueOrDie();
-  return se->GetDeviceDescription().numa_node();
+  int numa_node = se->GetDeviceDescription().numa_node();
+  // bus_id must be non-negative.  If the numa_node is not known,
+  // use 0.
+  return numa_node >= 0 ? numa_node : 0;
 }
 
 Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
@@ -110,6 +100,7 @@ Allocator* GPUProcessState::GetGPUAllocator(const GPUOptions& options,
     PlatformGpuId platform_gpu_id;
     TF_CHECK_OK(GpuIdManager::TfToPlatformGpuId(tf_gpu_id, &platform_gpu_id));
     int bus_id = BusIdForGPU(tf_gpu_id);
+    DCHECK_GE(bus_id, 0);
     while (bus_id >= gpu_visitors_.size()) {
       gpu_visitors_.push_back({});
     }
@@ -166,7 +157,9 @@ Allocator* GPUProcessState::GetCUDAHostAllocator(int numa_node) {
       !process_state_->ProcessState::FLAGS_brain_mem_reg_cuda_dma) {
     return process_state_->GetCPUAllocator(numa_node);
   }
-  CHECK_GE(numa_node, 0);
+  if (numa_node == port::kNUMANoAffinity) {
+    numa_node = 0;
+  }
   {
     // Here we optimize the most common use case where cuda_host_allocators_
     // and cuda_al_ have already been populated and since we're only reading
@@ -260,6 +253,7 @@ void GPUProcessState::AddGPUAllocVisitor(int bus_id,
   CHECK(gpu_allocators_.empty())  // Crash OK
       << "AddGPUAllocVisitor must be called before "
          "first call to GetGPUAllocator.";
+  DCHECK_GE(bus_id, 0);
   while (bus_id >= static_cast<int64>(gpu_visitors_.size())) {
     gpu_visitors_.push_back(std::vector<SubAllocator::Visitor>());
   }
diff --git a/tensorflow/core/common_runtime/gpu/gpu_process_state.h b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
index 43e9a316604006bb20f5ff171730f4b2ddc7e3d6..df51c10c8065fa94d736c8f4dfa76faebdc8bc62 100644
--- a/tensorflow/core/common_runtime/gpu/gpu_process_state.h
+++ b/tensorflow/core/common_runtime/gpu/gpu_process_state.h
@@ -37,7 +37,19 @@ class PoolAllocator;
 // Singleton that manages per-process state when GPUs are present.
 class GPUProcessState {
  public:
-  static GPUProcessState* singleton();
+  // If ps == nullptr, returns pointer to the single instance of this class to
+  // be used within this process.
+  //
+  // If ps != nullptrs, accepts a value to be returned by all subsequent calls.
+  // A non-null ps may ONLY be provided during program static storage
+  // initialization.  Must not be called more than once with a non-null ps.
+  //
+  // If a derived class of GPUProcessState is ever used in a process, it must
+  // always be used in place of this class.  In order to ensure that existing
+  // calls to GPUProcessState::singleton() all resolve to the derived instance
+  // instead, this function must be called once during startup, supplying the
+  // derived instance value, prior to any accessor call to this function.
+  static GPUProcessState* singleton(GPUProcessState* ps = nullptr);
 
   // Query whether any GPU device has been created so far.
   // Disable thread safety analysis since a race is benign here.
@@ -97,7 +109,11 @@ class GPUProcessState {
   virtual int BusIdForGPU(TfGpuId tf_gpu_id);
 
  protected:
+  // GPUProcessState is a singleton that should not normally be deleted except
+  // at process shutdown.
   GPUProcessState();
+  virtual ~GPUProcessState() {}
+  friend class GPUDeviceTest;
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
@@ -127,10 +143,6 @@ class GPUProcessState {
       GUARDED_BY(mu_);
   std::vector<std::vector<SubAllocator::Visitor>> cuda_host_free_visitors_
       GUARDED_BY(mu_);
-
-  virtual ~GPUProcessState();
-
-  friend class GPUDeviceTest;
 };
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/graph_execution_state.cc b/tensorflow/core/common_runtime/graph_execution_state.cc
index ab619ef619acab090b5a9d3597e874c23f3b7830..880806f120d010a812bbced62409a1ff5ed8e9d7 100644
--- a/tensorflow/core/common_runtime/graph_execution_state.cc
+++ b/tensorflow/core/common_runtime/graph_execution_state.cc
@@ -25,9 +25,11 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/common_runtime/placer.h"
+#include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/graph.pb_text.h"
 #include "tensorflow/core/framework/graph_def_util.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/algorithm.h"
 #include "tensorflow/core/graph/graph.h"
@@ -37,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/graph/validate.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/gtl/flatset.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
@@ -393,6 +396,42 @@ Status ValidateFeedAndFetchDevices(
   }
   return Status::OK();
 }
+
+Status GetFeedShapeAndTypeFromAttribute(const NodeDef& node,
+                                        PartialTensorShape* shape,
+                                        DataType* type) {
+  static const gtl::FlatSet<string>* const kHasExplicitShapeAttribute =
+      CHECK_NOTNULL((new gtl::FlatSet<string>{
+          "Placeholder", "PlaceholderV2", "PlaceholderWithDefault",
+          "ParallelConcat", "ImmutableConst", "_ParallelConcatStart",
+          "InfeedDequeue", "OutfeedDequeue", "CollectiveBcastSend",
+          "CollectiveBcastRecv", "AccumulateNV2", "VariableV2", "Variable",
+          "TemporaryVariable", "NcclBroadcast", "_ScopedAllocator",
+          "_ScopedAllocatorConcat"}));
+
+  // All the node types handled here have their output datatype set in
+  // either attribute 'dtype' or 'T'.
+  if (!GetNodeAttr(node, "dtype", type).ok() &&
+      !GetNodeAttr(node, "T", type).ok()) {
+    return errors::InvalidArgument(
+        "Could not determine output type for feed node: ", node.name(),
+        " of type ", node.op());
+  }
+
+  // First handle the case of feeding a const node.
+  if (node.op() == "Const" && HasNodeAttr(node, "value")) {
+    *shape =
+        PartialTensorShape(node.attr().at("value").tensor().tensor_shape());
+  } else if (kHasExplicitShapeAttribute->find(node.op()) !=
+             kHasExplicitShapeAttribute->end()) {
+    TF_RETURN_IF_ERROR(GetNodeAttr(node, "shape", shape));
+  } else {
+    return errors::InvalidArgument("Could not determine shape for feed node: ",
+                                   node.name(), " of type ", node.op());
+  }
+  return Status::OK();
+}
+
 }  // namespace
 
 Status GraphExecutionState::PruneGraph(
@@ -552,16 +591,17 @@ Status GraphExecutionState::OptimizeGraph(
     return errors::InvalidArgument("Can't optimize a pruned graph");
   }
 
-  const RewriterConfig& rewrite_options =
-      session_options_->config.graph_options().rewrite_options();
-
-  if (grappler::MetaOptimizerEnabled(rewrite_options)) {
-    // Adding this functionality in steps. The first step is to make sure
-    // we don't break dependencies. The second step will be to turn the
-    // functionality on by default.
+  if (grappler::MetaOptimizerEnabled(session_options_->config)) {
     grappler::GrapplerItem item;
     item.id = "tf_graph";
     graph_->ToGraphDef(&item.graph);
+
+    // It's ok to skip invalid device annotations in Grappler.
+    Status inferred_devices = item.InferDevicesFromGraph();
+    if (!inferred_devices.ok()) {
+      VLOG(3) << inferred_devices.error_message();
+    }
+
     // TODO(b/114748242): Add a unit test to test this bug fix.
     if (flib_def_) {
       *item.graph.mutable_library() = flib_def_->ToProto();
@@ -602,26 +642,30 @@ Status GraphExecutionState::OptimizeGraph(
         if (feeds.find(node.name()) == feeds.end()) {
           continue;
         }
-        if (node.attr().count("dtype") == 0 ||
-            node.attr().count("shape") == 0) {
-          return errors::InvalidArgument("Missing node shape or type");
-        }
-        TensorShapeProto shape_proto(node.attr().at("shape").shape());
-        // If the shape of the placeholder value is only partially known,
-        // we're free to use any dimension we want to feed the placeholder. We
-        // choose 1 to minimize the memory impact. Note that this only matters
-        // if an optimizer choose to run the graph to build its cost model,
-        // which doesn't happen (yet)
-        if (shape_proto.unknown_rank()) {
-          shape_proto.set_unknown_rank(false);
-        }
-        for (auto& dim : *shape_proto.mutable_dim()) {
-          if (dim.size() < 0) {
-            dim.set_size(1);
+        // Get the type and shape of the feed node.
+        PartialTensorShape partial_shape;
+        DataType type;
+        TF_RETURN_IF_ERROR(
+            GetFeedShapeAndTypeFromAttribute(node, &partial_shape, &type));
+        // If the shape of the placeholder is only partially known, we are free
+        // to set unknown dimensions of its shape to any value we desire. We
+        // choose 0 to minimize the memory impact. Note that this only matters
+        // if an optimizer chooses to run the graph.
+        TensorShape shape;
+        if (partial_shape.unknown_rank()) {
+          shape = TensorShape({0});
+        } else {
+          for (int i = 0; i < partial_shape.dims(); ++i) {
+            if (partial_shape.dim_size(i) < 0) {
+              partial_shape.set_dim(i, 0);
+            }
+          }
+          if (!partial_shape.AsTensorShape(&shape)) {
+            return errors::InvalidArgument(
+                "Could not derive shape for feed node: ", node.DebugString());
           }
         }
-        TensorShape shape(shape_proto);
-        DataType type = node.attr().at("dtype").type();
+
         Tensor fake_input(type, shape);
         item.feed.emplace_back(node.name(), fake_input);
       }
@@ -638,7 +682,7 @@ Status GraphExecutionState::OptimizeGraph(
     grappler::VirtualCluster cluster(device_set_);
     GraphDef new_graph;
     TF_RETURN_IF_ERROR(grappler::RunMetaOptimizer(
-        item, rewrite_options, cpu_device, &cluster, &new_graph));
+        item, session_options_->config, cpu_device, &cluster, &new_graph));
 
     // Merge optimized graph function library with an original library.
     // Optimized graph might have new functions specialized for it's
diff --git a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
index 2144eea84f0a86194e7dbb0fb0ae65f5f1a618ec..f0656ff53332d7dd4f21d9d874846c16fb669681 100644
--- a/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
+++ b/tensorflow/core/common_runtime/hierarchical_tree_broadcaster_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/hierarchical_tree_broadcaster.h"
 
 #include <algorithm>
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -217,7 +218,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
             << " num_devices_per_worker=" << num_devices_per_worker;
     int total_num_devices = num_workers * num_devices_per_worker;
     device_type_ = device_type;
-    std::vector<Device*> local_devices;
+    std::vector<std::unique_ptr<Device>> local_devices;
     SessionOptions sess_opts;
     sess_opts.env = Env::Default();
     Bytes mem_limit(4 << 20);
@@ -227,7 +228,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
         if (device_type == DEVICE_CPU) {
           string dev_name = strings::StrCat("/job:worker/replica:0/task:", wi,
                                             "/device:CPU:", di);
-          local_devices.push_back(new ThreadPoolDevice(
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
               sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
         } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
           int dev_idx = (wi * num_devices_per_worker) + di;
@@ -235,7 +236,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
             LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
                          "than one ring node.";
           } else {
-            local_devices.push_back(gpu_devices_[dev_idx]);
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
           }
         } else {
           LOG(FATAL) << "Unsupported device_type " << device_type;
@@ -243,7 +244,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
       }
     }
     if (!dev_mgr_ || device_type == DEVICE_CPU) {
-      dev_mgr_.reset(new DeviceMgr(local_devices));
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
     }
     if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
     dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
@@ -714,7 +715,7 @@ class HierarchicalTreeBroadcasterTest : public ::testing::Test {
   std::unique_ptr<DeviceResolverLocal> dev_resolver_;
   std::vector<DeviceInstance*> instances_;
   CollectiveParams col_params_;
-  std::vector<tensorflow::Device*> gpu_devices_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
   std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
   std::unique_ptr<string> gpu_ring_order_;
   mutex mu_;
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
index 1f585a8c24801e9139cab5cc650fce19dd97e05e..bdd6c0e87d4443873fa43789afad993399b23fd5 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.cc
@@ -75,12 +75,12 @@ Benchmark::Benchmark(const string& device, Graph* g,
   const int graph_def_version = g->versions().producer();
 
   LocalExecutorParams params;
-  params.device = device_;
+  params.device = device_.get();
   params.function_library = nullptr;
   params.create_kernel = [this, graph_def_version](const NodeDef& ndef,
                                                    OpKernel** kernel) {
-    return CreateNonCachedKernel(device_, nullptr, ndef, graph_def_version,
-                                 kernel);
+    return CreateNonCachedKernel(device_.get(), nullptr, ndef,
+                                 graph_def_version, kernel);
   };
   params.delete_kernel = [](OpKernel* kernel) {
     DeleteNonCachedKernel(kernel);
@@ -107,7 +107,7 @@ Benchmark::~Benchmark() {
     // run kernel destructors that may attempt to access state borrowed from
     // `device_`, such as the resource manager.
     exec_.reset();
-    delete device_;
+    device_.reset();
     delete pool_;
   }
 }
diff --git a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
index 555b43f655b49c76a0a01dd35d099248b4681300..b1557c50b0371d627e93c358073c3c17b681c80b 100644
--- a/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
+++ b/tensorflow/core/common_runtime/kernel_benchmark_testlib.h
@@ -55,7 +55,7 @@ class Benchmark {
 
  private:
   thread::ThreadPool* pool_ = nullptr;
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_ = nullptr;
   Rendezvous* rendez_ = nullptr;
   std::unique_ptr<Executor> exec_;
 
diff --git a/tensorflow/core/common_runtime/local_device.cc b/tensorflow/core/common_runtime/local_device.cc
index 873182371e097cf0929cd6886b3ec70dfb9b3ab2..f1fcca194e9ef56bf7b96e6c73717db7620b9812 100644
--- a/tensorflow/core/common_runtime/local_device.cc
+++ b/tensorflow/core/common_runtime/local_device.cc
@@ -18,11 +18,13 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/common_runtime/eigen_thread_pool.h"
+#include "tensorflow/core/common_runtime/process_state.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/byte_order.h"
 #include "tensorflow/core/platform/cpu_feature_guard.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session_options.h"
 
@@ -30,23 +32,52 @@ namespace tensorflow {
 
 /* static */
 bool LocalDevice::use_global_threadpool_ = true;
+mutex LocalDevice::global_tp_mu_;
+gtl::InlinedVector<LocalDevice::EigenThreadPoolInfo*, 4>
+    LocalDevice::global_tp_info_;
 
 struct LocalDevice::EigenThreadPoolInfo {
-  explicit EigenThreadPoolInfo(const SessionOptions& options) {
+  // Wrapper so we can provide the CPUAllocator to Eigen for use
+  // when ops need extra tmp memory.
+  class EigenAllocator : public Eigen::Allocator {
+   public:
+    explicit EigenAllocator(tensorflow::Allocator* a) : allocator_(a) {}
+    void* allocate(size_t num_bytes) const override {
+      return allocator_->AllocateRaw(64, num_bytes);
+    }
+    void deallocate(void* buffer) const override {
+      allocator_->DeallocateRaw(buffer);
+    }
+    tensorflow::Allocator* allocator_;
+  };
+
+  explicit EigenThreadPoolInfo(const SessionOptions& options, int numa_node,
+                               Allocator* allocator) {
     int32 intra_op_parallelism_threads =
         options.config.intra_op_parallelism_threads();
     if (intra_op_parallelism_threads == 0) {
       intra_op_parallelism_threads = port::NumSchedulableCPUs();
+      if (numa_node != port::kNUMANoAffinity) {
+        // Assume that CPUs are equally distributed over available NUMA nodes.
+        // This may not be true, but there isn't currently a better way of
+        // determining the number of CPUs specific to the requested node.
+        intra_op_parallelism_threads /= port::NUMANumNodes();
+      }
     }
-    VLOG(1) << "Local device intra op parallelism threads: "
-            << intra_op_parallelism_threads;
+    ThreadOptions thread_opts;
+    thread_opts.numa_node = numa_node;
     eigen_worker_threads_.num_threads = intra_op_parallelism_threads;
     eigen_worker_threads_.workers = new thread::ThreadPool(
-        options.env, "Eigen", intra_op_parallelism_threads);
+        options.env, thread_opts, strings::StrCat("numa_", numa_node, "_Eigen"),
+        intra_op_parallelism_threads);
     eigen_threadpool_wrapper_.reset(
         new EigenThreadPoolWrapper(eigen_worker_threads_.workers));
+    if (allocator) {
+      eigen_allocator_.reset(new EigenAllocator(allocator));
+    }
     eigen_device_.reset(new Eigen::ThreadPoolDevice(
-        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads));
+        eigen_threadpool_wrapper_.get(), eigen_worker_threads_.num_threads,
+        eigen_allocator_.get()));
   }
 
   ~EigenThreadPoolInfo() {
@@ -58,6 +89,7 @@ struct LocalDevice::EigenThreadPoolInfo {
   DeviceBase::CpuWorkerThreads eigen_worker_threads_;
   std::unique_ptr<Eigen::ThreadPoolInterface> eigen_threadpool_wrapper_;
   std::unique_ptr<Eigen::ThreadPoolDevice> eigen_device_;
+  std::unique_ptr<EigenAllocator> eigen_allocator_;
 };
 
 LocalDevice::LocalDevice(const SessionOptions& options,
@@ -68,15 +100,34 @@ LocalDevice::LocalDevice(const SessionOptions& options,
   port::InfoAboutUnusedCPUFeatures();
   LocalDevice::EigenThreadPoolInfo* tp_info;
   if (use_global_threadpool_) {
-    // All ThreadPoolDevices in the process will use this single fixed
-    // sized threadpool for numerical computations.
-    static LocalDevice::EigenThreadPoolInfo* global_tp_info =
-        new LocalDevice::EigenThreadPoolInfo(options);
-    tp_info = global_tp_info;
+    mutex_lock l(global_tp_mu_);
+    if (options.config.experimental().use_numa_affinity()) {
+      int numa_node = attributes.locality().numa_node();
+      int num_numa_nodes = port::NUMANumNodes();
+      DCHECK_LT(numa_node, num_numa_nodes);
+      Allocator* numa_allocator =
+          ProcessState::singleton()->GetCPUAllocator(numa_node);
+      while (numa_node >= global_tp_info_.size()) {
+        global_tp_info_.push_back(nullptr);
+      }
+      if (!global_tp_info_[numa_node]) {
+        global_tp_info_[numa_node] = new LocalDevice::EigenThreadPoolInfo(
+            options, numa_node, numa_allocator);
+      }
+      tp_info = global_tp_info_[numa_node];
+    } else {
+      if (global_tp_info_.empty()) {
+        global_tp_info_.push_back(new LocalDevice::EigenThreadPoolInfo(
+            options, port::kNUMANoAffinity, nullptr));
+      }
+      tp_info = global_tp_info_[0];
+    }
   } else {
     // Each LocalDevice owns a separate ThreadPoolDevice for numerical
     // computations.
-    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(options));
+    // TODO(tucker): NUMA for these too?
+    owned_tp_info_.reset(new LocalDevice::EigenThreadPoolInfo(
+        options, port::kNUMANoAffinity, nullptr));
     tp_info = owned_tp_info_.get();
   }
   set_tensorflow_cpu_worker_threads(&tp_info->eigen_worker_threads_);
diff --git a/tensorflow/core/common_runtime/local_device.h b/tensorflow/core/common_runtime/local_device.h
index 226f121bf32e0259d13dca633627174d5cdab917..f305c212c5a331be7992188d2b2e4c323ab6d403 100644
--- a/tensorflow/core/common_runtime/local_device.h
+++ b/tensorflow/core/common_runtime/local_device.h
@@ -47,6 +47,13 @@ class LocalDevice : public Device {
   struct EigenThreadPoolInfo;
   std::unique_ptr<EigenThreadPoolInfo> owned_tp_info_;
 
+  // All ThreadPoolDevices in the process associated with the same
+  // NUMA node will share a single fixed sized threadpool for numerical
+  // computations.
+  static mutex global_tp_mu_;
+  static gtl::InlinedVector<EigenThreadPoolInfo*, 4> global_tp_info_
+      GUARDED_BY(global_tp_mu_);
+
   friend class test::Benchmark;
 
   TF_DISALLOW_COPY_AND_ASSIGN(LocalDevice);
diff --git a/tensorflow/core/common_runtime/lower_if_op.cc b/tensorflow/core/common_runtime/lower_if_op.cc
index 44a2478e3f9809e174ec3ef49c193b14daae9a62..9738006f5ca9eb821439a9ad507aec3db434946c 100644
--- a/tensorflow/core/common_runtime/lower_if_op.cc
+++ b/tensorflow/core/common_runtime/lower_if_op.cc
@@ -76,7 +76,7 @@ class CondBuilder {
   // The identity node with the same outputs as the original If op.
   Node* lowered_if_output_;
   // The predicate of the conditional.
-  Node* pred_;
+  OutputTensor pred_;
   // Node corresponding to pivot_f branch of predicate switch which is
   // the pivot node that dominates all nodes in the false/else branch.
   Node* pivot_f_;
@@ -102,7 +102,7 @@ CondBuilder::CondBuilder(Node* if_op, const string& then_fn_name,
       name_(if_op->name()),
       then_call_builder_(NewName("then"), then_fn_name, graph->op_registry()),
       else_call_builder_(NewName("else"), else_fn_name, graph->op_registry()) {
-  TF_CHECK_OK(if_op_->input_node(0, &pred_));
+  TF_CHECK_OK(if_op_->input_tensor(0, &pred_));
   then_call_builder_.Device(if_op_->requested_device());
   else_call_builder_.Device(if_op_->requested_device());
 }
@@ -113,8 +113,8 @@ Status CondBuilder::CreatePivotNodes() {
   Node* switch_pred;
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName("switch_pred"), "Switch", graph_->op_registry())
-          .Input(NodeOut(pred_, 0))
-          .Input(NodeOut(pred_, 0))
+          .Input(NodeOut(pred_))
+          .Input(NodeOut(pred_))
           .Device(if_op_->requested_device())
           .Finalize(graph_, &switch_pred));
   control_predecessor_ = switch_pred;
@@ -140,7 +140,7 @@ Status CondBuilder::AddInput(Node* src, int src_output) {
   TF_RETURN_IF_ERROR(
       NodeBuilder(NewName(src->name()), "Switch", graph_->op_registry())
           .Input(src, src_output)
-          .Input(pred_, 0)
+          .Input(pred_)
           .Device(if_op_->requested_device())
           .Finalize(graph_, &input));
   then_call_builder_.Input(input, kThenBranch);
diff --git a/tensorflow/core/common_runtime/metrics.cc b/tensorflow/core/common_runtime/metrics.cc
index 2736739b1a5679f6b9b02ef1f7bc48f0e4b09dc4..f4c94ed7ec0cb1c5e8b341b75f1d075d30d6125a 100644
--- a/tensorflow/core/common_runtime/metrics.cc
+++ b/tensorflow/core/common_runtime/metrics.cc
@@ -14,7 +14,6 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/metrics.h"
-
 #include "tensorflow/core/lib/monitoring/counter.h"
 
 namespace tensorflow {
@@ -24,18 +23,17 @@ namespace {
 auto* graph_runs = monitoring::Counter<0>::New(
     "/tensorflow/core/graph_runs",
     "The number of graph executions used to collect "
-    "/tensorflow/core/graph_run_time_msecs");
+    "/tensorflow/core/graph_run_time_usecs");
 
-auto* graph_run_time_msecs = monitoring::Counter<0>::New(
-    "/tensorflow/core/graph_run_time_msecs",
-    "The total time spent on executing graphs in milliseconds.");
+auto* graph_run_time_usecs = monitoring::Counter<0>::New(
+    "/tensorflow/core/graph_run_time_usecs",
+    "The total time spent on executing graphs in microseconds.");
 }  // namespace
 
-void UpdateGraphExecutionTime(const absl::Duration running_time) {
-  if (running_time > absl::ZeroDuration()) {
+void UpdateGraphExecTime(const uint64 running_time_usecs) {
+  if (running_time_usecs > 0) {
     graph_runs->GetCell()->IncrementBy(1);
-    graph_run_time_msecs->GetCell()->IncrementBy(running_time /
-                                                 absl::Milliseconds(1));
+    graph_run_time_usecs->GetCell()->IncrementBy(running_time_usecs);
   }
 }
 
diff --git a/tensorflow/core/common_runtime/metrics.h b/tensorflow/core/common_runtime/metrics.h
index 5dd4caf5b739d2b559e1a839dde5e7b24dea38d2..d3430c9f030998f118c1626e6bbed93dd316a525 100644
--- a/tensorflow/core/common_runtime/metrics.h
+++ b/tensorflow/core/common_runtime/metrics.h
@@ -16,11 +16,11 @@ limitations under the License.
 #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
 #define TENSORFLOW_CORE_COMMON_RUNTIME_METRICS_H_
 
-#include "absl/time/time.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
-void UpdateGraphExecutionTime(const absl::Duration time);
+void UpdateGraphExecTime(const uint64 running_time_usecs);
 
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/common_runtime/optimization_registry.cc b/tensorflow/core/common_runtime/optimization_registry.cc
index 6ac047295dce8f78016d8ce65ddebeb20c372531..9be540b0192416b6dfa636b054bd174bb8376eec 100644
--- a/tensorflow/core/common_runtime/optimization_registry.cc
+++ b/tensorflow/core/common_runtime/optimization_registry.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/optimization_registry.h"
+#include "tensorflow/core/util/dump_graph.h"
 
 namespace tensorflow {
 
@@ -39,6 +40,19 @@ Status OptimizationPassRegistry::RunGrouping(
         VLOG(1) << "Running optimization pass: " << pass->name();
         Status s = pass->Run(options);
         if (!s.ok()) return s;
+        if (VLOG_IS_ON(1)) {
+          DumpGraphToFile(
+              strings::StrCat("after_phase_", phase.first, "_", pass->name()),
+              **options.graph);
+          if (options.partition_graphs) {
+            for (auto& part : *options.partition_graphs) {
+              DumpGraphToFile(
+                  strings::StrCat("after_phase_", phase.first, "_",
+                                  pass->name(), "_partition_", part.first),
+                  *part.second);
+            }
+          }
+        }
       }
     }
   }
diff --git a/tensorflow/core/common_runtime/placer.cc b/tensorflow/core/common_runtime/placer.cc
index f8d933b45e02c6eed0a8abcdd326518e1206cd1f..515c1971d9d5cb179b7b9764ff3462579e742dfc 100644
--- a/tensorflow/core/common_runtime/placer.cc
+++ b/tensorflow/core/common_runtime/placer.cc
@@ -47,42 +47,51 @@ const StringPiece kColocationGroupPrefixStringPiece(kColocationGroupPrefix);
 // returned list is sorted by preferred type (higher numeric type is preferred).
 std::vector<Device*> FilterSupportedDevices(
     const std::vector<Device*>& devices,
-    const DeviceTypeVector& supported_device_types,
+    const PrioritizedDeviceTypeVector& supported_device_types,
     const Device* default_device) {
   Device* filtered_default_device = nullptr;
-  std::vector<Device*> filtered_devices;
-  for (const DeviceType& d : supported_device_types) {
+  std::vector<std::pair<Device*, int32>> prioritized_filtered_devices;
+  for (const auto& supported_device_type : supported_device_types) {
     for (Device* device : devices) {
-      if (DeviceType(device->attributes().device_type()) == d) {
+      if (DeviceType(device->attributes().device_type()) ==
+          supported_device_type.first) {
         if (device == default_device) {
           filtered_default_device = device;
         } else {
-          filtered_devices.emplace_back(device);
+          prioritized_filtered_devices.emplace_back(
+              device, supported_device_type.second);
         }
       }
     }
   }
 
-  auto device_sort = [](const Device* a, const Device* b) {
-    auto a_priority = DeviceSet::DeviceTypeOrder(DeviceType(a->device_type()));
-    auto b_priority = DeviceSet::DeviceTypeOrder(DeviceType(b->device_type()));
+  auto device_sort = [](const std::pair<Device*, int32>& a,
+                        const std::pair<Device*, int32>& b) {
+    if (a.second != b.second) {
+      return a.second > b.second;
+    }
+
+    auto a_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(a.first->device_type()));
+    auto b_priority =
+        DeviceSet::DeviceTypeOrder(DeviceType(b.first->device_type()));
     // First sort by prioritized device type (higher is preferred) and
     // then by device name (lexicographically).
     if (a_priority != b_priority) {
       return a_priority > b_priority;
     }
-    return StringPiece(a->name()) < StringPiece(b->name());
+    return StringPiece(a.first->name()) < StringPiece(b.first->name());
   };
-  std::vector<Device*>::iterator sort_start;
+  std::sort(prioritized_filtered_devices.begin(),
+            prioritized_filtered_devices.end(), device_sort);
+
+  std::vector<Device*> filtered_devices;
   if (filtered_default_device != nullptr) {
-    // Put the default device first outside of the normal ordering.
     filtered_devices.emplace_back(filtered_default_device);
-    std::iter_swap(filtered_devices.begin(), std::prev(filtered_devices.end()));
-    sort_start = std::next(filtered_devices.begin());
-  } else {
-    sort_start = filtered_devices.begin();
   }
-  std::sort(sort_start, filtered_devices.end(), device_sort);
+  for (const auto& prioritized_filtered_device : prioritized_filtered_devices) {
+    filtered_devices.push_back(prioritized_filtered_device.first);
+  }
   return filtered_devices;
 }
 
@@ -472,7 +481,7 @@ class ColocationGraph {
     // The intersection of all device types supported by this node,
     // and those of all of its children, in priority order
     // of the preferred device.
-    DeviceTypeVector supported_device_types;
+    PrioritizedDeviceTypeVector supported_device_types;
 
     // The merged form of the device requested for this node, with
     // those of all of its children.
@@ -511,8 +520,8 @@ class ColocationGraph {
       const string& op_type = node->type_string();
       string devices_registered;
       for (const auto& device_type : members_[id].supported_device_types) {
-        strings::StrAppend(&devices_registered, DeviceTypeString(device_type),
-                           " ");
+        strings::StrAppend(&devices_registered,
+                           DeviceTypeString(device_type.first), " ");
       }
 
       type_to_devices[op_type] = std::move(devices_registered);
@@ -565,8 +574,9 @@ class ColocationGraph {
                                 "' does not match any device");
       }
 
-      for (const DeviceType& d : member->supported_device_types) {
-        if (DeviceType(assigned_device->attributes().device_type()) == d) {
+      for (const auto& d : member->supported_device_types) {
+        if (DeviceType(assigned_device->attributes().device_type()) ==
+            d.first) {
           return Status::OK();
         }
       }
@@ -623,24 +633,102 @@ class ColocationGraph {
     return Status::OK();
   }
 
+  static bool HasPriorities(const PrioritizedDeviceTypeVector& device_types) {
+    for (const auto& prioritized_device_type : device_types) {
+      if (prioritized_device_type.second != 0) return true;
+    }
+    return false;
+  }
+
+  static bool ArePrioritiesSame(const PrioritizedDeviceTypeVector& a_types,
+                                const PrioritizedDeviceTypeVector& b_types) {
+    if (a_types.size() != b_types.size()) {
+      return false;
+    }
+    for (int i = 0; i < a_types.size(); ++i) {
+      if (a_types[i].first != b_types[i].first) {
+        return false;
+      }
+    }
+    return true;
+  }
+
   // Updates target to contain the intersection of the device types in
   // "target" and "other".
-  static void MergeSupportedDevices(DeviceTypeVector* target,
-                                    const DeviceTypeVector& other) {
-    DeviceTypeVector temp = *target;
+  static void MergeSupportedDevices(PrioritizedDeviceTypeVector* target,
+                                    const PrioritizedDeviceTypeVector& other) {
+    PrioritizedDeviceTypeVector temp = *target;
     target->clear();
 
-    // Iterate in priority order.
-    for (const DeviceType& device_type : temp) {
+    // Generate intersection with priorities.
+    PrioritizedDeviceTypeVector target_intersection;
+    PrioritizedDeviceTypeVector other_intersection;
+    for (const auto& prioritized_device_type : temp) {
       bool found = false;
-      for (const DeviceType& other_device_type : other) {
-        if (device_type == other_device_type) {
+      for (const auto& other_prioritized_device_type : other) {
+        if (prioritized_device_type.first ==
+            other_prioritized_device_type.first) {
           found = true;
+          other_intersection.push_back(other_prioritized_device_type);
           break;
         }
       }
       if (found) {
-        target->push_back(device_type);
+        target_intersection.push_back(prioritized_device_type);
+      }
+    }
+
+    // Sort the devices by priority order.
+    auto device_sort = [](const std::pair<DeviceType, int32>& a,
+                          const std::pair<DeviceType, int32>& b) {
+      // First look at set priorities.
+      if (a.second != b.second) {
+        return a.second > b.second;
+      }
+      // Then fallback to default priorities.
+      auto a_priority = DeviceSet::DeviceTypeOrder(a.first);
+      auto b_priority = DeviceSet::DeviceTypeOrder(b.first);
+      if (a_priority != b_priority) {
+        return a_priority > b_priority;
+      }
+      // Finally just look at the Device type strings.
+      return a.first.type_string() < b.first.type_string();
+    };
+
+    std::sort(target_intersection.begin(), target_intersection.end(),
+              device_sort);
+    std::sort(other_intersection.begin(), other_intersection.end(),
+              device_sort);
+
+    bool is_target_prioritized = HasPriorities(target_intersection);
+    bool is_other_prioritized = HasPriorities(other_intersection);
+    // If neither are prioritized then we just return the original i.e. target
+    // prioritization.
+    if (!is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    // If only one is prioritized, then we respect priorities of that in the
+    // intersection.
+    if (is_target_prioritized && !is_other_prioritized) {
+      *target = target_intersection;
+    }
+    if (!is_target_prioritized && is_other_prioritized) {
+      *target = other_intersection;
+    }
+    // If both have priorities and agree then we go with that. If the
+    // prioritization order is different, then we just fallback to the default
+    // i.e. what the DeviceTypeOrder suggests. In that case, we also set the
+    // merged priorities to 0, so that downstream merges work correctly as well.
+    if (is_target_prioritized && is_other_prioritized) {
+      bool priorities_agree =
+          ArePrioritiesSame(target_intersection, other_intersection);
+      if (priorities_agree) {
+        *target = target_intersection;
+      } else {
+        for (const auto& prioritized_device : target_intersection) {
+          target->push_back(std::make_pair(prioritized_device.first, 0));
+        }
+        std::sort(target->begin(), target->end(), device_sort);
       }
     }
   }
@@ -914,7 +1002,7 @@ Status Placer::Run() {
     int assigned_device = -1;
 
     // Heuristic A application.
-    if (IsGeneratorNode(node)) {
+    if (IsGeneratorNode(node) && !node->out_edges().empty()) {
       const Node* output = (*node->out_edges().begin())->dst();
       int output_device_name = output->assigned_device_name_index();
 
diff --git a/tensorflow/core/common_runtime/placer_test.cc b/tensorflow/core/common_runtime/placer_test.cc
index 69f1611c1dd6102ed5fcc6019dad55304e338674..04e77e55f62e1bd9345c8e9113407bbf0a375774 100644
--- a/tensorflow/core/common_runtime/placer_test.cc
+++ b/tensorflow/core/common_runtime/placer_test.cc
@@ -92,7 +92,7 @@ class FakeDevice : public Device {
 class DummyFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
+                       std::vector<std::unique_ptr<Device>>* devices) override {
     return Status::OK();
   }
 };
@@ -164,6 +164,13 @@ REGISTER_KERNEL_BUILDER(Name("TestDeviceEnforce").Device("FakeGPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeCPU"), DummyOp);
 REGISTER_KERNEL_BUILDER(Name("Shape").Device("FakeGPU"), DummyOp);
 
+// Op that has kernels with device priorities specified.
+REGISTER_OP("TestDatasetOp").Input("a: float").Output("b: float");
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("TestDatasetOp").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
 ////////////////////////////////////////////////////////////////////////////////
 //
 // A PlacerTest method has three phases:
@@ -285,6 +292,251 @@ TEST_F(PlacerTest, TestNoConstraints) {
   EXPECT_DEVICE_TYPE(g, "n2", "FakeGPU");
 }
 
+// Test that a graph with no constraints but using kernels that have a specified
+// device priority will successfully assign nodes to the device with higher
+// priority
+TEST_F(PlacerTest, TestNoConstraintsWithPrioritizedKernels) {
+  Graph g(OpRegistry::Global());
+  {  // Scope for temporary variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestInput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 1),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+TEST_F(PlacerTest, TestGPUInputIntoPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeCPU");
+}
+
+// Tests that a GPU kernel colocated with prioritized kernel respects it.
+TEST_F(PlacerTest, TestGPUInputColocatedWithPrioritizedKernel) {
+  Graph g(OpRegistry::Global());
+  {
+    // Scope for temp variables used to construct g.
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* input = ops::SourceOp("TestGPUOutput", b.opts().WithName("in"));
+    // We colocate n1 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n1").WithAttr("_class", {"loc:@in"}));
+    // We don't colocate n2 with in.
+    ops::UnaryOp("TestDatasetOp", ops::NodeOut(input, 0),
+                 b.opts().WithName("n2"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "in", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n1", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "n2", "FakeCPU");
+}
+
+REGISTER_OP("CreateDatasetCPU").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetCPU").Device("FakeCPU"), DummyOp);
+
+REGISTER_OP("CreateDatasetSP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetRP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("CreateDatasetNP").Output("o: resource");
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("CreateDatasetNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorNP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeCPU"), DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorNP").Device("FakeGPU"), DummyOp);
+
+REGISTER_OP("IteratorSP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeCPU").Priority(2),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorSP").Device("FakeGPU").Priority(1),
+                        DummyOp);
+
+REGISTER_OP("IteratorRP").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeCPU").Priority(1),
+                        DummyOp);
+REGISTER_KERNEL_BUILDER(Name("IteratorRP").Device("FakeGPU").Priority(2),
+                        DummyOp);
+
+REGISTER_OP("IteratorGPU").Input("i: resource").Output("o: float");
+REGISTER_KERNEL_BUILDER(Name("IteratorGPU").Device("FakeGPU"), DummyOp);
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestDSWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorNP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and the other
+// has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test reference edges with one node having kernels with regular priority and
+// the other has no preference. We should respect priority here.
+TEST_F(PlacerTest, TestITWithGPUPriority) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetNP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on GPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestITGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorGPU", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test reference edges with one node having prioritized kernels and other node
+// can only be placed on CPU. We should respect the constraint then.
+TEST_F(PlacerTest, TestSimpleIteratorOnlyGPU) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetCPU", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing priorities.
+TEST_F(PlacerTest, TestAgreeingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeCPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeCPU");
+}
+
+// Test constraints with agreeing regular priorities.
+TEST_F(PlacerTest, TestAgreeingRegularPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPriorities) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetSP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorRP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
+// Test constraints with different priorities. In this case, we should bail
+// and just revert to default.
+TEST_F(PlacerTest, TestConflictingPrioritiesReversed) {
+  Graph g(OpRegistry::Global());
+  {
+    GraphDefBuilder b(GraphDefBuilder::kFailImmediately);
+    Node* ds = ops::SourceOp("CreateDatasetRP", b.opts().WithName("ds"));
+    ops::UnaryOp("IteratorSP", ops::NodeOut(ds, 0), b.opts().WithName("it"));
+    TF_EXPECT_OK(BuildGraph(b, &g));
+  }
+  TF_EXPECT_OK(Place(&g));
+  EXPECT_DEVICE_TYPE(g, "ds", "FakeGPU");
+  EXPECT_DEVICE_TYPE(g, "it", "FakeGPU");
+}
+
 // Test that a graph with device type and reference constraints on
 // some of the ops will successfully assign nodes to the constrained
 // device, and colocate nodes with reference connections.
diff --git a/tensorflow/core/common_runtime/pool_allocator.cc b/tensorflow/core/common_runtime/pool_allocator.cc
index 66dc8f332217c30a3b3a1745a7c90a1880e3e068..6b40fcc4c70f50ba5bc643855a8035d73b92bfb0 100644
--- a/tensorflow/core/common_runtime/pool_allocator.cc
+++ b/tensorflow/core/common_runtime/pool_allocator.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -258,7 +259,12 @@ void PoolAllocator::EvictOne() {
 void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
   void* ptr = nullptr;
   if (num_bytes > 0) {
-    ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    if (numa_node_ == port::kNUMANoAffinity) {
+      ptr = port::AlignedMalloc(num_bytes, static_cast<int>(alignment));
+    } else {
+      ptr =
+          port::NUMAMalloc(numa_node_, num_bytes, static_cast<int>(alignment));
+    }
     VisitAlloc(ptr, numa_node_, num_bytes);
   }
   return ptr;
@@ -267,7 +273,11 @@ void* BasicCPUAllocator::Alloc(size_t alignment, size_t num_bytes) {
 void BasicCPUAllocator::Free(void* ptr, size_t num_bytes) {
   if (num_bytes > 0) {
     VisitFree(ptr, numa_node_, num_bytes);
-    port::AlignedFree(ptr);
+    if (numa_node_ == port::kNUMANoAffinity) {
+      port::AlignedFree(ptr);
+    } else {
+      port::NUMAFree(ptr, num_bytes);
+    }
   }
 }
 }  // namespace tensorflow
diff --git a/tensorflow/core/common_runtime/pool_allocator.h b/tensorflow/core/common_runtime/pool_allocator.h
index 5b4623ba10fe684f6399a244e30ecafd55003c95..8be9c7b678e2bbe7659c9e22e31cb595ce704307 100644
--- a/tensorflow/core/common_runtime/pool_allocator.h
+++ b/tensorflow/core/common_runtime/pool_allocator.h
@@ -150,7 +150,6 @@ class Pow2Rounder : public RoundUpInterface {
 
 class BasicCPUAllocator : public SubAllocator {
  public:
-  // Argument numa_node is currently ignored.
   BasicCPUAllocator(int numa_node, const std::vector<Visitor>& alloc_visitors,
                     const std::vector<Visitor>& free_visitors)
       : SubAllocator(alloc_visitors, free_visitors), numa_node_(numa_node) {}
diff --git a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
index cce230801183591371268bf2827d153c9c19b840..21cb62118aebafa8a03903296b65f0617510f080 100644
--- a/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
+++ b/tensorflow/core/common_runtime/process_function_library_runtime_test.cc
@@ -62,9 +62,12 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     SessionOptions options;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", 2});
+    std::vector<std::unique_ptr<Device>> devices;
     TF_CHECK_OK(DeviceFactory::AddDevices(options, "/job:a/replica:0/task:0",
-                                          &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+                                          &devices));
+    device0_ = devices[0].get();
+    device1_ = devices[1].get();
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     FunctionDefLibrary proto;
     for (const auto& fdef : flib) *(proto.add_function()) = fdef;
     lib_def_.reset(new FunctionLibraryDefinition(OpRegistry::Global(), proto));
@@ -138,8 +141,9 @@ class ProcessFunctionLibraryRuntimeTest : public ::testing::Test {
     return Status::OK();
   }
 
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
+  Device* device0_ = nullptr;  // Not owned. (Owned by device_mgr_.)
+  Device* device1_ = nullptr;  // Not owned. (Owned by device_mgr_.)
   std::unique_ptr<FunctionLibraryDefinition> lib_def_;
   std::unique_ptr<TestClusterFLR> cluster_flr_;
   std::unique_ptr<ProcessFunctionLibraryRuntime> proc_flr_;
@@ -165,16 +169,16 @@ TEST_F(ProcessFunctionLibraryRuntimeTest, Basic) {
   FunctionLibraryRuntime* flr =
       proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/device:CPU:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/device:CPU:0");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[0]);
+  EXPECT_EQ(flr->device(), device0_);
   flr = proc_flr_->GetFLR("/job:a/replica:0/task:0/cpu:1");
   EXPECT_NE(flr, nullptr);
-  EXPECT_EQ(flr->device(), devices_[1]);
+  EXPECT_EQ(flr->device(), device1_);
   flr = proc_flr_->GetFLR("abc");
   EXPECT_EQ(flr, nullptr);
   rendezvous_->Unref();
diff --git a/tensorflow/core/common_runtime/process_state.cc b/tensorflow/core/common_runtime/process_state.cc
index bcaa37fc8a156a63fcc76f9b8bb39ac8fd75f15a..3d8ac9b1344d8f2ca210451194adf4607dd52b7d 100644
--- a/tensorflow/core/common_runtime/process_state.cc
+++ b/tensorflow/core/common_runtime/process_state.cc
@@ -32,28 +32,12 @@ limitations under the License.
 
 namespace tensorflow {
 
-ProcessState* ProcessState::instance_ = nullptr;
-
 /*static*/ ProcessState* ProcessState::singleton() {
-  if (instance_ == nullptr) {
-    instance_ = new ProcessState;
-  }
-
-  return instance_;
+  static ProcessState* instance = new ProcessState;
+  return instance;
 }
 
 ProcessState::ProcessState() : numa_enabled_(false) {
-  CHECK(instance_ == nullptr);
-}
-
-// Normally the ProcessState singleton is never explicitly deleted.
-// This function is defined for debugging problems with the allocators.
-ProcessState::~ProcessState() {
-  CHECK_EQ(this, instance_);
-  instance_ = nullptr;
-  for (Allocator* a : cpu_allocators_) {
-    delete a;
-  }
 }
 
 string ProcessState::MemDesc::DebugString() {
@@ -72,8 +56,7 @@ ProcessState::MemDesc ProcessState::PtrType(const void* ptr) {
 }
 
 Allocator* ProcessState::GetCPUAllocator(int numa_node) {
-  CHECK_GE(numa_node, 0);
-  if (!numa_enabled_) numa_node = 0;
+  if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;
   mutex_lock lock(mu_);
   while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
     // If visitors have been defined we need an Allocator built from
@@ -90,8 +73,9 @@ Allocator* ProcessState::GetCPUAllocator(int numa_node) {
     Allocator* allocator = nullptr;
     SubAllocator* sub_allocator =
         (alloc_visitors_defined || use_bfc_allocator)
-            ? new BasicCPUAllocator(numa_enabled_ ? numa_node : -1,
-                                    cpu_alloc_visitors_, cpu_free_visitors_)
+            ? new BasicCPUAllocator(
+                  numa_enabled_ ? numa_node : port::kNUMANoAffinity,
+                  cpu_alloc_visitors_, cpu_free_visitors_)
             : nullptr;
     if (use_bfc_allocator) {
       // TODO(reedwm): evaluate whether 64GB by default is the best choice.
diff --git a/tensorflow/core/common_runtime/process_state.h b/tensorflow/core/common_runtime/process_state.h
index cac312d8496d3d4e454291405bcd16c432af8852..6849d305b3c5577485e83ed7d2e9521dce20a452 100644
--- a/tensorflow/core/common_runtime/process_state.h
+++ b/tensorflow/core/common_runtime/process_state.h
@@ -63,7 +63,7 @@ class ProcessState {
   MemDesc PtrType(const void* ptr);
 
   // Returns the one CPUAllocator used for the given numa_node.
-  // TEMPORARY: ignores numa_node.
+  // Treats numa_node == kNUMANoAffinity as numa_node == 0.
   Allocator* GetCPUAllocator(int numa_node);
 
   // Registers alloc visitor for the CPU allocator(s).
@@ -87,19 +87,19 @@ class ProcessState {
 
   // Helper method for unit tests to reset the ProcessState singleton by
   // cleaning up everything. Never use in production.
-  virtual void TestOnlyReset();
+  void TestOnlyReset();
 
   static ProcessState* instance_;
   bool numa_enabled_;
 
   mutex mu_;
 
+  // Indexed by numa_node.  If we want numa-specific allocators AND a
+  // non-specific allocator, maybe should index by numa_node+1.
   std::vector<Allocator*> cpu_allocators_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_alloc_visitors_ GUARDED_BY(mu_);
   std::vector<SubAllocator::Visitor> cpu_free_visitors_ GUARDED_BY(mu_);
 
-  virtual ~ProcessState();
-
   // Optional RecordingAllocators that wrap the corresponding
   // Allocators for runtime attribute use analysis.
   MDMap mem_desc_map_;
diff --git a/tensorflow/core/common_runtime/renamed_device.cc b/tensorflow/core/common_runtime/renamed_device.cc
index 56766a8df4526cb2d6fb20c5dcd461a65d2a994b..45541c35fe9b7bd7886b0c0928a77e2359a9aaa3 100644
--- a/tensorflow/core/common_runtime/renamed_device.cc
+++ b/tensorflow/core/common_runtime/renamed_device.cc
@@ -14,15 +14,14 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/common_runtime/renamed_device.h"
+#include "absl/memory/memory.h"
 
 namespace tensorflow {
 
-// TODO(saeta): Convert to returning a std::unique_ptr?
 /* static */
-Device* RenamedDevice::NewRenamedDevice(const string& new_base,
-                                        Device* underlying,
-                                        bool owns_underlying,
-                                        bool isolate_session_state) {
+std::unique_ptr<Device> RenamedDevice::NewRenamedDevice(
+    const string& new_base, Device* underlying, bool owns_underlying,
+    bool isolate_session_state) {
   DeviceNameUtils::ParsedName parsed_name;
   CHECK(DeviceNameUtils::ParseFullName(new_base, &parsed_name));
   DeviceNameUtils::ParsedName underlying_parsed_name =
@@ -36,8 +35,9 @@ Device* RenamedDevice::NewRenamedDevice(const string& new_base,
                                           parsed_name.id);
   DeviceAttributes attributes(underlying->attributes());
   attributes.set_name(name);
-  return new RenamedDevice(underlying, attributes, owns_underlying,
-                           isolate_session_state);
+  // Call absl::WrapUnique to access private constructor.
+  return absl::WrapUnique(new RenamedDevice(
+      underlying, attributes, owns_underlying, isolate_session_state));
 }
 
 RenamedDevice::RenamedDevice(Device* underlying,
diff --git a/tensorflow/core/common_runtime/renamed_device.h b/tensorflow/core/common_runtime/renamed_device.h
index c00789a55631aad1dbd79ee3cbe588b0436a853f..6d24f496ffb3c78c4f7e38564bba11ebabfbc39e 100644
--- a/tensorflow/core/common_runtime/renamed_device.h
+++ b/tensorflow/core/common_runtime/renamed_device.h
@@ -28,9 +28,10 @@ namespace tensorflow {
 // session.
 class RenamedDevice : public Device {
  public:
-  static Device* NewRenamedDevice(const string& new_base, Device* underlying,
-                                  bool owns_underlying,
-                                  bool isolate_session_state);
+  static std::unique_ptr<Device> NewRenamedDevice(const string& new_base,
+                                                  Device* underlying,
+                                                  bool owns_underlying,
+                                                  bool isolate_session_state);
 
   ~RenamedDevice() override;
 
diff --git a/tensorflow/core/common_runtime/ring_reducer.cc b/tensorflow/core/common_runtime/ring_reducer.cc
index b1fe928ba7d4d2184b5d28344fa7dea0cb3c160b..092f15e49e330de21452e0f7b4d8cc51607a44ed 100644
--- a/tensorflow/core/common_runtime/ring_reducer.cc
+++ b/tensorflow/core/common_runtime/ring_reducer.cc
@@ -290,7 +290,7 @@ void RingReducer::Run(StatusCallback done) {
         col_ctx_->device, col_ctx_->op_ctx->input_alloc_attr(0),
         col_ctx_->op_ctx->output_alloc_attr(0), col_ctx_->input,
         col_ctx_->output, 0 /*dev_to_dev_stream_index*/,
-        [this, &note, &status](const Status& s) {
+        [&note, &status](const Status& s) {
           status.Update(s);
           note.Notify();
         });
diff --git a/tensorflow/core/common_runtime/ring_reducer_test.cc b/tensorflow/core/common_runtime/ring_reducer_test.cc
index a271bf7b747abba199df633bebd6563d027ffb2f..7feb29a6dbbb17d73967344ad07db9d234411840 100644
--- a/tensorflow/core/common_runtime/ring_reducer_test.cc
+++ b/tensorflow/core/common_runtime/ring_reducer_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/ring_reducer.h"
 
 #include <algorithm>
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/base_collective_executor.h"
 #include "tensorflow/core/common_runtime/collective_rma_local.h"
 #include "tensorflow/core/common_runtime/device.h"
@@ -157,7 +158,7 @@ class RingReducerTest : public ::testing::Test {
     InitGPUDevices();
 #endif
     device_type_ = device_type;
-    std::vector<Device*> local_devices;
+    std::vector<std::unique_ptr<Device>> local_devices;
     SessionOptions sess_opts;
     sess_opts.env = Env::Default();
     Bytes mem_limit(4 << 20);
@@ -167,7 +168,7 @@ class RingReducerTest : public ::testing::Test {
         if (device_type == DEVICE_CPU) {
           string dev_name =
               strings::StrCat("/job:worker/replica:0/task:", wi, "/cpu:", di);
-          local_devices.push_back(new ThreadPoolDevice(
+          local_devices.push_back(absl::make_unique<ThreadPoolDevice>(
               sess_opts, dev_name, mem_limit, dev_locality, cpu_allocator()));
         } else if (device_type == DEVICE_GPU && !gpu_devices_.empty()) {
           int dev_idx = (wi * num_devices) + di;
@@ -175,7 +176,7 @@ class RingReducerTest : public ::testing::Test {
             LOG(INFO) << "dev_mgr has access to limited GPUs, reusing for more "
                          "than one ring node.";
           } else {
-            local_devices.push_back(gpu_devices_[dev_idx]);
+            local_devices.push_back(std::move(gpu_devices_[dev_idx]));
           }
         } else {
           LOG(FATAL) << "Unsupported device_type " << device_type;
@@ -185,7 +186,7 @@ class RingReducerTest : public ::testing::Test {
     if (!dev_mgr_ || device_type == DEVICE_CPU) {
       LOG(ERROR) << "resetting dev_mgr for " << local_devices.size()
                  << " devices: ";
-      dev_mgr_.reset(new DeviceMgr(local_devices));
+      dev_mgr_.reset(new DeviceMgr(std::move(local_devices)));
     }
     if (!gpu_ring_order_) gpu_ring_order_.reset(new string());
     dev_resolver_.reset(new DeviceResolverLocal(dev_mgr_.get()));
@@ -544,7 +545,7 @@ class RingReducerTest : public ::testing::Test {
   std::unique_ptr<DeviceResolverLocal> dev_resolver_;
   std::vector<DeviceInstance*> instances_;
   CollectiveParams col_params_;
-  std::vector<tensorflow::Device*> gpu_devices_;
+  std::vector<std::unique_ptr<tensorflow::Device>> gpu_devices_;
   std::unique_ptr<tensorflow::DeviceMgr> dev_mgr_;
   std::unique_ptr<string> gpu_ring_order_;
   mutex mu_;
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 6404d8bc6a209997afbe33c547679ebb2cb5cbf5..ca7ca5443c954a6cdcb5d25324ea84163bb4291e 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -93,7 +93,7 @@ Status ThreadPoolDevice::MakeTensorFromProto(
     Tensor* tensor) {
   if (tensor_proto.dtype() > 0 && tensor_proto.dtype() <= DataType_MAX) {
     Tensor parsed(tensor_proto.dtype());
-    if (parsed.FromProto(cpu_allocator(), tensor_proto)) {
+    if (parsed.FromProto(allocator_, tensor_proto)) {
       *tensor = std::move(parsed);
       return Status::OK();
     }
diff --git a/tensorflow/core/common_runtime/threadpool_device_factory.cc b/tensorflow/core/common_runtime/threadpool_device_factory.cc
index 6a900c02c00e976fdef2e4b5f6673f27affb3069..f9cbb817499ef5c35a91e5c7f2e51f9bd5267180 100644
--- a/tensorflow/core/common_runtime/threadpool_device_factory.cc
+++ b/tensorflow/core/common_runtime/threadpool_device_factory.cc
@@ -13,12 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-// Register a factory that provides CPU devices.
-#include "tensorflow/core/common_runtime/threadpool_device.h"
-
 #include <vector>
+
+// Register a factory that provides CPU devices.
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
+#include "tensorflow/core/common_runtime/process_state.h"
+#include "tensorflow/core/common_runtime/threadpool_device.h"
 #include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -27,9 +30,8 @@ namespace tensorflow {
 class ThreadPoolDeviceFactory : public DeviceFactory {
  public:
   Status CreateDevices(const SessionOptions& options, const string& name_prefix,
-                       std::vector<Device*>* devices) override {
-    // TODO(zhifengc/tucker): Figure out the number of available CPUs
-    // and/or NUMA configuration.
+                       std::vector<std::unique_ptr<Device>>* devices) override {
+    int num_numa_nodes = port::NUMANumNodes();
     int n = 1;
     auto iter = options.config.device_count().find("CPU");
     if (iter != options.config.device_count().end()) {
@@ -37,8 +39,26 @@ class ThreadPoolDeviceFactory : public DeviceFactory {
     }
     for (int i = 0; i < n; i++) {
       string name = strings::StrCat(name_prefix, "/device:CPU:", i);
-      devices->push_back(new ThreadPoolDevice(
-          options, name, Bytes(256 << 20), DeviceLocality(), cpu_allocator()));
+      std::unique_ptr<ThreadPoolDevice> tpd;
+      if (options.config.experimental().use_numa_affinity()) {
+        int numa_node = i % num_numa_nodes;
+        if (numa_node != i) {
+          LOG(INFO) << "Only " << num_numa_nodes
+                    << " NUMA nodes visible in system, "
+                    << " assigning device " << name << " to NUMA node "
+                    << numa_node;
+        }
+        DeviceLocality dev_locality;
+        dev_locality.set_numa_node(numa_node);
+        tpd = absl::make_unique<ThreadPoolDevice>(
+            options, name, Bytes(256 << 20), dev_locality,
+            ProcessState::singleton()->GetCPUAllocator(numa_node));
+      } else {
+        tpd = absl::make_unique<ThreadPoolDevice>(
+            options, name, Bytes(256 << 20), DeviceLocality(),
+            ProcessState::singleton()->GetCPUAllocator(port::kNUMANoAffinity));
+      }
+      devices->push_back(std::move(tpd));
     }
 
     return Status::OK();
diff --git a/tensorflow/core/distributed_runtime/BUILD b/tensorflow/core/distributed_runtime/BUILD
index 9f091224348fc5c74e0b7ce8d0526fb760c65e72..e388d3e6f0f5636c044c36ee03c826f1872cac9f 100644
--- a/tensorflow/core/distributed_runtime/BUILD
+++ b/tensorflow/core/distributed_runtime/BUILD
@@ -429,7 +429,6 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:worker_proto_cc",
         "//tensorflow/core/debug",
-        "@com_google_absl//absl/time",
     ],
 )
 
@@ -469,6 +468,17 @@ cc_library(
     ],
 )
 
+tf_cc_test(
+    name = "server_lib_test",
+    srcs = ["server_lib_test.cc"],
+    deps = [
+        ":server_lib",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+    ],
+)
+
 cc_library(
     name = "rpc_collective_executor_mgr",
     srcs = ["rpc_collective_executor_mgr.cc"],
@@ -615,6 +625,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
index 4eed856759ae7ea2a982e1604ecbc0237e304731..40b18d321a1cb3fafeaa4b864e737f6d86695842 100644
--- a/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_param_resolver_distributed_test.cc
@@ -29,7 +29,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static Device* NewDevice(const string& type, const string& name) {
+static std::unique_ptr<Device> NewDevice(const string& type,
+                                         const string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -40,7 +41,7 @@ static Device* NewDevice(const string& type, const string& name) {
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(3);  // a non-default value
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 class FakeWorker : public TestWorkerInterface {
@@ -156,16 +157,16 @@ class DeviceResDistTest : public ::testing::Test {
 
   void DefineWorker(const ConfigProto& config, const string& worker_name,
                     const string& device_type, int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i)));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto* d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     DeviceResolverDistributed* dev_res =
diff --git a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
index 33e1c8f2c33ff88146cec6e5992c65e67f03417a..26f722a6bd4104b2dc264c2946bc5b5656b0fb32 100644
--- a/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/collective_rma_distributed_test.cc
@@ -41,7 +41,8 @@ limitations under the License.
 namespace tensorflow {
 namespace {
 
-static Device* NewDevice(const string& type, const string& name) {
+static std::unique_ptr<Device> NewDevice(const string& type,
+                                         const string& name) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -52,7 +53,7 @@ static Device* NewDevice(const string& type, const string& name) {
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(3);  // a non-default value
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 static int64 kStepId = 123;
@@ -211,16 +212,16 @@ class CollRMADistTest : public ::testing::Test {
 
   void DefineWorker(const ConfigProto& config, const string& worker_name,
                     const string& device_type, int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i)));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     DeviceResolverDistributed* dev_res =
diff --git a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
index ae44b98bd52d6dcc32919ca9d850fcf13aac89db..842a2b3b058b8c55bec0c07816c1305ed9a2f305 100644
--- a/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
+++ b/tensorflow/core/distributed_runtime/device_resolver_distributed_test.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/device_resolver_distributed.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
 #include "tensorflow/core/distributed_runtime/test_utils.h"
 #include "tensorflow/core/lib/core/notification.h"
@@ -41,8 +42,8 @@ class TestableDeviceResolverDistributed : public DeviceResolverDistributed {
 
 // Create a fake 'Device' whose only interesting attribute is a non-default
 // DeviceLocality.
-static Device* NewDevice(const string& type, const string& name,
-                         int numa_node) {
+static std::unique_ptr<Device> NewDevice(const string& type, const string& name,
+                                         int numa_node) {
   class FakeDevice : public Device {
    public:
     explicit FakeDevice(const DeviceAttributes& attr) : Device(nullptr, attr) {}
@@ -53,7 +54,7 @@ static Device* NewDevice(const string& type, const string& name,
   attr.set_name(name);
   attr.set_device_type(type);
   attr.mutable_locality()->set_numa_node(numa_node);
-  return new FakeDevice(attr);
+  return absl::make_unique<FakeDevice>(attr);
 }
 
 // Create a fake WorkerInterface that responds to requests without RPCs,
@@ -151,19 +152,19 @@ class DeviceResDistTest : public ::testing::Test {
 
   void DefineWorker(const string& worker_name, const string& device_type,
                     int num_devices) {
-    std::vector<Device*> devices;
+    std::vector<std::unique_ptr<Device>> devices;
     for (int i = 0; i < num_devices; ++i) {
       devices.push_back(NewDevice(
           device_type,
           strings::StrCat(worker_name, "/device:", device_type, ":", i), i));
     }
-    DeviceMgr* dev_mgr = new DeviceMgr(devices);
+    DeviceMgr* dev_mgr = new DeviceMgr(std::move(devices));
     TestableDeviceResolverDistributed* dev_res =
         new TestableDeviceResolverDistributed(dev_mgr, &wc_, worker_name);
     resolvers_[worker_name] = dev_res;
     device_mgrs_.push_back(dev_mgr);
     std::vector<string>* dv = &dev_by_task_[worker_name];
-    for (auto d : devices) {
+    for (auto* d : dev_mgr->ListDevices()) {
       dv->push_back(d->name());
     }
     FakeWorker* fw = new FakeWorker(worker_name, dev_mgr, dev_res);
diff --git a/tensorflow/core/distributed_runtime/eager/BUILD b/tensorflow/core/distributed_runtime/eager/BUILD
index 055e5dfcedaea0bb2209132f2ffd60cd5a4dbae0..55b2657e74ef5c2be8c1b0f11d4a00186e063e31 100644
--- a/tensorflow/core/distributed_runtime/eager/BUILD
+++ b/tensorflow/core/distributed_runtime/eager/BUILD
@@ -69,6 +69,7 @@ cc_library(
         "//tensorflow/core/distributed_runtime:worker_env",
         "//tensorflow/core/distributed_runtime/eager:remote_tensor_handle",
         "//tensorflow/core/distributed_runtime/rpc:rpc_rendezvous_mgr",
+        "@com_google_absl//absl/memory",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
index b8af63724aa1dbe1a20dbc18bd6115c9aab78a0c..13c959d8506868a3d9d8dbba59a7d092e6d4fd94 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/eager/eager_service_impl.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/c/c_api_internal.h"
 #include "tensorflow/c/tf_status_helper.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
@@ -36,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/host_info.h"
 
 namespace tensorflow {
 namespace eager {
@@ -86,7 +88,7 @@ Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
     return tensorflow::errors::Internal(
         "invalid eager env_ or env_->rendezvous_mgr.");
   }
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
       // TODO(nareshmodi): Correctly set the SessionOptions.
@@ -96,12 +98,12 @@ Status EagerServiceImpl::CreateContext(const CreateContextRequest* request,
                       request->server_def().task_index()),
       &devices));
   response->mutable_device_attributes()->Reserve(devices.size());
-  for (auto& d : devices) {
+  for (const auto& d : devices) {
     *response->add_device_attributes() = d->attributes();
   }
 
-  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr =
+      absl::make_unique<DeviceMgr>(std::move(devices));
 
   auto* r = env_->rendezvous_mgr->Find(request->rendezvous_id());
   auto session_name = strings::StrCat("eager_", request->rendezvous_id());
@@ -152,20 +154,19 @@ Status EagerServiceImpl::ExecuteOp(const Operation& operation,
   std::unique_ptr<tensorflow::EagerOperation> op;
   const char* name = operation.name().c_str();  // Shorthand
   const tensorflow::AttrTypeMap* types;
-  auto status = tensorflow::AttrTypeMapForOp(name, &types);
-  if (status.ok()) {
-    op.reset(
-        new tensorflow::EagerOperation(server_context->Context(), name, types));
-  } else if (errors::IsNotFound(status)) {
-    if (server_context->Context()->FindFunctionByName(name)) {
-      op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
-                                              nullptr));
-    } else {
-      return status;
-    }
-  } else {
-    return status;
+  bool is_function = false;
+  TF_RETURN_IF_ERROR(tensorflow::AttrTypeMapForOp(name, &types, &is_function));
+  if (is_function && !server_context->Context()->FindFunctionByName(name)) {
+    return errors::NotFound(
+        "'", name,
+        "' is neither a type of a primitive operation nor a name "
+        "of a function registered in binary running on ",
+        port::Hostname(),
+        ". Make sure the operation or function is "
+        "registered in the binary running in this process.");
   }
+  op.reset(new tensorflow::EagerOperation(server_context->Context(), name,
+                                          is_function, types));
 
   TF_RETURN_IF_ERROR(op->SetDevice(operation.device().c_str()));
 
diff --git a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
index 5ba522c2a2e9ce650b7823bbb2d4959531874d98..7a1463e8f047040b28dbb951e6db2b7af75294f2 100644
--- a/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
+++ b/tensorflow/core/distributed_runtime/eager/eager_service_impl_test.cc
@@ -68,12 +68,9 @@ class EagerServiceImplTest : public ::testing::Test {
     worker_env_.rendezvous_mgr = &rendezvous_mgr_;
     worker_env_.session_mgr = session_mgr_.get();
 
-    Device* device = DeviceFactory::NewDevice(
-        "CPU", {}, "/job:localhost/replica:0/task:0/device:CPU:0");
-
-    worker_env_.local_devices = {device};
-
-    device_mgr_.reset(new DeviceMgr(worker_env_.local_devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(DeviceFactory::NewDevice(
+        "CPU", {}, "/job:localhost/replica:0/task:0/device:CPU:0"));
+    worker_env_.local_devices = device_mgr_->ListDevices();
     worker_env_.device_mgr = device_mgr_.get();
   }
 
diff --git a/tensorflow/core/distributed_runtime/graph_mgr.cc b/tensorflow/core/distributed_runtime/graph_mgr.cc
index ee39062e34544b67b57770fcf0c90a756be7bd9b..ee5823e314f777f758a6c0d8ef7129c4bbd2916c 100644
--- a/tensorflow/core/distributed_runtime/graph_mgr.cc
+++ b/tensorflow/core/distributed_runtime/graph_mgr.cc
@@ -15,9 +15,9 @@ limitations under the License.
 
 #include "tensorflow/core/distributed_runtime/graph_mgr.h"
 
+#include <chrono>  // NOLINT(build/c++11)
 #include <vector>
 
-#include "absl/time/clock.h"
 #include "tensorflow/core/common_runtime/build_graph_options.h"
 #include "tensorflow/core/common_runtime/constant_folding.h"
 #include "tensorflow/core/common_runtime/debugger_state_interface.h"
@@ -388,7 +388,7 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
                             MutableRunGraphResponseWrapper* response,
                             CancellationManager* cancellation_manager,
                             const NamedTensors& in, StatusCallback done) {
-  const absl::Time start_time = absl::Now();
+  const uint64 start_time_usecs = Env::Default()->NowMicros();
   // Lookup an item. Holds one ref while executing.
   Item* item = nullptr;
   {
@@ -449,9 +449,9 @@ void GraphMgr::ExecuteAsync(const string& handle, const int64 step_id,
   StartParallelExecutors(
       handle, step_id, item, rendezvous, ce_handle, collector, cost_graph,
       cancellation_manager,
-      [item, rendezvous, ce_handle, done, start_time](const Status& s) {
+      [item, rendezvous, ce_handle, done, start_time_usecs](const Status& s) {
         done(s);
-        UpdateGraphExecutionTime(absl::Now() - start_time);
+        UpdateGraphExecTime(Env::Default()->NowMicros() - start_time_usecs);
         rendezvous->Unref();
         item->Unref();
         delete ce_handle;
diff --git a/tensorflow/core/distributed_runtime/rpc/BUILD b/tensorflow/core/distributed_runtime/rpc/BUILD
index d122016d3ee9ba8d152b430f0f9a62bb95e417d0..273709a01fd799f7f4aa8afc80d3bdfc48d36322 100644
--- a/tensorflow/core/distributed_runtime/rpc/BUILD
+++ b/tensorflow/core/distributed_runtime/rpc/BUILD
@@ -105,6 +105,7 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
     ],
 )
 
diff --git a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
index 181422118cd9f01658c1601a1779355f127c6fac..3626a48171e0b628b2630c35a17826b8713dc9d1 100644
--- a/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
+++ b/tensorflow/core/distributed_runtime/rpc/eager/grpc_eager_client.cc
@@ -40,7 +40,7 @@ class GrpcEagerClient : public EagerClient {
       override {                                                          \
     new RPCState<protobuf::Message>(                                      \
         &stub_, cq_, "/tensorflow.eager.EagerService/" #method, *request, \
-        response, std::move(done), nullptr);                              \
+        response, std::move(done), nullptr, nullptr);                     \
   }
 
   CLIENT_METHOD(CreateContext);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
index 456c30ecf499016493e220ebdd2008ae48ce52df..781b7d65cdd184363d7c7650305bd62f3129c271 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.cc
@@ -53,30 +53,58 @@ Status ValidateHostPortPair(const string& host_port) {
   }
   return Status::OK();
 }
-}  // namespace
 
-Status NewHostPortGrpcChannel(const string& target,
-                              SharedGrpcChannelPtr* channel_pointer) {
-  // Minimally ensure that the target is valid
-  TF_RETURN_IF_ERROR(ValidateHostPortPair(target));
+}  // namespace
 
+::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options) {
   // TODO(mrry): Implement secure channels.
   ::grpc::ChannelArguments args;
   args.SetInt(GRPC_ARG_MAX_MESSAGE_LENGTH, std::numeric_limits<int32>::max());
   // NOTE(mrry): Some versions of gRPC use a 20-second minimum backoff
   // on connection failure, which makes our tests time out.
   args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 1000);
+  if (rpc_options != nullptr) {
+    if (rpc_options->compression_algorithm() == "deflate") {
+      args.SetCompressionAlgorithm(GRPC_COMPRESS_DEFLATE);
+      args.SetInt(GRPC_COMPRESSION_CHANNEL_DEFAULT_LEVEL,
+                  rpc_options->compression_level());
+      VLOG(5) << "Setting GRPC compression : algo='"
+              << rpc_options->compression_algorithm()
+              << "' level=" << rpc_options->compression_level();
+    } else if (rpc_options->compression_algorithm() == "gzip") {
+      args.SetCompressionAlgorithm(GRPC_COMPRESS_GZIP);
+      args.SetInt(GRPC_COMPRESSION_CHANNEL_DEFAULT_LEVEL,
+                  rpc_options->compression_level());
+      VLOG(5) << "Setting GRPC compression : algo='"
+              << rpc_options->compression_algorithm()
+              << "' level=" << rpc_options->compression_level();
+    } else if (!rpc_options->compression_algorithm().empty()) {
+      LOG(ERROR) << "Invalid compression algorithm: "
+                 << rpc_options->compression_algorithm();
+    }
+  }
+  return args;
+}
+
+Status NewHostPortGrpcChannel(const string& target,
+                              const RPCOptions* rpc_options,
+                              SharedGrpcChannelPtr* channel_pointer) {
+  // Minimally ensure that the target is valid
+  TF_RETURN_IF_ERROR(ValidateHostPortPair(target));
+
+  ::grpc::ChannelArguments args = GetChannelArguments(rpc_options);
   *channel_pointer = ::grpc::CreateCustomChannel(
       "dns:///" + target, ::grpc::InsecureChannelCredentials(), args);
   return Status::OK();
 }
 
 ChannelCreationFunction ConvertToChannelCreationFunction(
-    const std::function<Status(string, SharedGrpcChannelPtr*)>&
-        new_channel_func_ptr) {
+    const std::function<Status(string, const RPCOptions*,
+                               SharedGrpcChannelPtr*)>& new_channel_func_ptr) {
   return [new_channel_func_ptr](const string& target) -> SharedGrpcChannelPtr {
     SharedGrpcChannelPtr channel_ptr;
-    if (new_channel_func_ptr(target, &channel_ptr).ok()) {
+    if (new_channel_func_ptr(target, /*rpc_options=*/nullptr, &channel_ptr)
+            .ok()) {
       return channel_ptr;
     } else {
       return nullptr;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
index 6fa99d7b148c010dede55a8cdcbdfca081c5e96a..57d16218e8f6a64c5030e075ebc770fc5566a106 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "grpcpp/grpcpp.h"
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 
 namespace tensorflow {
 
@@ -86,11 +87,14 @@ GrpcChannelCache* NewGrpcChannelCache(const GrpcChannelSpec& channel_spec,
 
 // Below here are internal-only functions.
 
+::grpc::ChannelArguments GetChannelArguments(const RPCOptions* rpc_options);
+
 ChannelCreationFunction ConvertToChannelCreationFunction(
-    const std::function<Status(string, SharedGrpcChannelPtr*)>&
-        new_channel_func_ptr);
+    const std::function<Status(string, const RPCOptions*,
+                               SharedGrpcChannelPtr*)>& new_channel_func_ptr);
 
 Status NewHostPortGrpcChannel(const string& target,
+                              const RPCOptions* rpc_options,
                               SharedGrpcChannelPtr* channel_pointer);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
index a814ef85e2091ef46c466a012ac7c093981a1165..a6fae2286f5957f7aa0b45479ad262647e81ce74 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_channel_test.cc
@@ -184,18 +184,39 @@ TEST(GrpcChannelTest, SparseHostPorts) {
 TEST(GrpcChannelTest, NewHostPortGrpcChannelValidation) {
   SharedGrpcChannelPtr mock_ptr;
 
-  EXPECT_TRUE(NewHostPortGrpcChannel("127.0.0.1:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("example.com:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("fqdn.example.com.:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("[2002:a9c:258e::]:2222", &mock_ptr).ok());
-  EXPECT_TRUE(NewHostPortGrpcChannel("[::]:2222", &mock_ptr).ok());
-
-  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:2222", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("127.0.0.1:2222/", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]/:2222", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]:2222/", &mock_ptr).ok());
-  EXPECT_FALSE(NewHostPortGrpcChannel("[::]:", &mock_ptr).ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("127.0.0.1:2222", /*rpc_options=*/nullptr,
+                                     &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("example.com:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("fqdn.example.com.:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(NewHostPortGrpcChannel("[2002:a9c:258e::]:2222",
+                                     /*rpc_options=*/nullptr, &mock_ptr)
+                  .ok());
+  EXPECT_TRUE(
+      NewHostPortGrpcChannel("[::]:2222", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+
+  EXPECT_FALSE(NewHostPortGrpcChannel("example.com/abc:2222",
+                                      /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel("127.0.0.1:2222/",
+                                      /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(NewHostPortGrpcChannel(
+                   "example.com/abc:", /*rpc_options=*/nullptr, &mock_ptr)
+                   .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]/:2222", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]:2222/", /*rpc_options=*/nullptr, &mock_ptr)
+          .ok());
+  EXPECT_FALSE(
+      NewHostPortGrpcChannel("[::]:", /*rpc_options=*/nullptr, &mock_ptr).ok());
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
index 885c5e87c17410eefcadc880545a811bd7956fbe..2daefcb399c79324f80278340967b679be5c6574 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.cc
@@ -30,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/worker_interface.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/tracing.h"
@@ -42,10 +43,12 @@ class GrpcRemoteWorker : public WorkerInterface {
  public:
   explicit GrpcRemoteWorker(SharedGrpcChannelPtr channel,
                             ::grpc::CompletionQueue* completion_queue,
+                            thread::ThreadPool* callback_threadpool,
                             WorkerCacheLogger* logger)
       : channel_(std::move(channel)),
         stub_(channel_),
         cq_(completion_queue),
+        callback_threadpool_(callback_threadpool),
         getstatus_(Method(GrpcWorkerMethod::kGetStatus)),
         createworkersession_(Method(GrpcWorkerMethod::kCreateWorkerSession)),
         deleteworkersession_(Method(GrpcWorkerMethod::kDeleteWorkerSession)),
@@ -258,13 +261,15 @@ class GrpcRemoteWorker : public WorkerInterface {
                     protobuf::Message* response, const ::grpc::string& method,
                     StatusCallback done, CallOptions* call_opts = nullptr) {
     new RPCState<protobuf::Message>(&stub_, cq_, method, *request, response,
-                                    std::move(done), call_opts);
+                                    std::move(done), call_opts,
+                                    callback_threadpool_);
   }
   void IssueRequest(const protobuf::Message* request, TensorResponse* response,
                     const ::grpc::string& method, StatusCallback done,
                     CallOptions* call_opts = nullptr) {
     new RPCState<TensorResponse>(&stub_, cq_, method, *request, response,
-                                 std::move(done), call_opts);
+                                 std::move(done), call_opts,
+                                 callback_threadpool_);
   }
 
   // Helper function for initializing the RpcMethod objects below.
@@ -273,6 +278,7 @@ class GrpcRemoteWorker : public WorkerInterface {
   SharedGrpcChannelPtr channel_;
   ::grpc::GenericStub stub_;
   ::grpc::CompletionQueue* cq_;
+  thread::ThreadPool* callback_threadpool_;
 
   const ::grpc::string getstatus_;
   const ::grpc::string createworkersession_;
@@ -298,8 +304,10 @@ class GrpcRemoteWorker : public WorkerInterface {
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger) {
-  return new GrpcRemoteWorker(std::move(channel), completion_queue, logger);
+  return new GrpcRemoteWorker(std::move(channel), completion_queue,
+                              callback_threadpool, logger);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
index b85c1dc5b4e592e621ee96853dd724440ad9b4bd..d1f0e94ba52d81451a1085804cf01375f4d2fb57 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_remote_worker.h
@@ -19,18 +19,19 @@ limitations under the License.
 #include <memory>
 
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 
 namespace grpc {
 class CompletionQueue;
 }
 
 namespace tensorflow {
-
 class WorkerCacheLogger;
 class WorkerInterface;
 
 WorkerInterface* NewGrpcRemoteWorker(SharedGrpcChannelPtr channel,
                                      ::grpc::CompletionQueue* completion_queue,
+                                     thread::ThreadPool* callback_threadpool,
                                      WorkerCacheLogger* logger);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
index cde6b785dc6e351ba0d51bef9b23d6bd05742320..4f5975bbc11a6217355c1fcf368996a0fca45969 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_rpc_factory.cc
@@ -206,11 +206,11 @@ void GrpcRPCFactory::StartCall(const Tensor& address_t, const Tensor& method_t,
 
   int index = call->index();
   // This object will delete itself when done.
-  new RPCState<string>(get_stub(index), &completion_queue_,
-                       *get_method_ptr(index), call->request(),
-                       call->response(),
-                       /*done=*/[call](const Status& s) { call->Done(s); },
-                       call->call_opts(), fail_fast_, timeout_in_ms_);
+  new RPCState<string>(
+      get_stub(index), &completion_queue_, *get_method_ptr(index),
+      call->request(), call->response(),
+      /*done=*/[call](const Status& s) { call->Done(s); }, call->call_opts(),
+      nullptr /*threadpool*/, fail_fast_, timeout_in_ms_);
 }
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
index ae722fdfe9559f1be6727f2e08c4d0aa5728a654..cbd5cd927e7d73fd0ed28a910c89eef1f73b0d91 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_server_lib.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include <cstring>
 #include <limits>
 #include <memory>
+#include <vector>
 
 #include "grpc/support/alloc.h"
 #include "grpcpp/grpcpp.h"
@@ -156,10 +157,12 @@ Status GrpcServer::Init(
   string name_prefix =
       strings::StrCat("/job:", server_def_.job_name(), "/replica:0",
                       "/task:", server_def_.task_index());
-  TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(sess_opts, name_prefix,
-                                               &master_env_.local_devices));
-  worker_env_.local_devices = master_env_.local_devices;
-  worker_env_.device_mgr = new DeviceMgr(worker_env_.local_devices);
+  std::vector<std::unique_ptr<Device>> devices;
+  TF_RETURN_IF_ERROR(
+      DeviceFactory::AddDevices(sess_opts, name_prefix, &devices));
+  worker_env_.device_mgr = new DeviceMgr(std::move(devices));
+  master_env_.local_devices = worker_env_.device_mgr->ListDevices();
+  worker_env_.local_devices = worker_env_.device_mgr->ListDevices();
   worker_env_.rendezvous_mgr = rendezvous_mgr_func == nullptr
                                    ? new RpcRendezvousMgr(&worker_env_)
                                    : rendezvous_mgr_func(&worker_env_);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
index fdce1b10e0a8ade6f96b280e3c6dc33ec69d504b..32063fecbbef4347bcdbfbdfda32f008015b5975 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.cc
@@ -52,8 +52,9 @@ Status GrpcSession::Create(const SessionOptions& options,
   }
   if (!master) {
     SharedGrpcChannelPtr master_channel;
-    TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
-        options.target.substr(kSchemePrefixLength), &master_channel));
+    TF_RETURN_IF_ERROR(
+        NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength),
+                               &options.config.rpc_options(), &master_channel));
     master.reset(NewGrpcMaster(master_channel));
   }
   session->SetRemoteMaster(std::move(master));
@@ -91,6 +92,12 @@ void ReEncodeConsts(GraphDef* gdef) {
 }
 }  // namespace
 
+void GrpcSession::SetHandleAndGraphVersion(string handle, int64 graph_version) {
+  mutex_lock l(mu_);
+  handle_ = std::move(handle);
+  current_graph_version_ = graph_version;
+}
+
 Status GrpcSession::Handle(string* out_handle) {
   mutex_lock l(mu_);
   if (handle_.empty()) {
@@ -116,9 +123,7 @@ Status GrpcSession::CreateImpl(CallOptions* call_options,
   CreateSessionResponse resp;
   Status s = master_->CreateSession(call_options, &req, &resp);
   if (s.ok()) {
-    mutex_lock l(mu_);
-    swap(handle_, *(resp.mutable_session_handle()));
-    current_graph_version_ = resp.graph_version();
+    SetHandleAndGraphVersion(resp.session_handle(), resp.graph_version());
   }
   return s;
 }
@@ -384,8 +389,9 @@ void GrpcSession::SetRemoteMaster(std::unique_ptr<MasterInterface> master) {
 Status GrpcSession::Reset(const SessionOptions& options,
                           const std::vector<string>& containers) {
   SharedGrpcChannelPtr master_channel;
-  TF_RETURN_IF_ERROR(NewHostPortGrpcChannel(
-      options.target.substr(kSchemePrefixLength), &master_channel));
+  TF_RETURN_IF_ERROR(
+      NewHostPortGrpcChannel(options.target.substr(kSchemePrefixLength),
+                             /*rpc_options=*/nullptr, &master_channel));
   auto master = NewGrpcMaster(master_channel);
   ResetRequest req;
   for (const auto& c : containers) req.add_container(c);
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session.h b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
index 63795117f9763434f5ff331d3d2d3bdb99413e81..a3ed3ec73669a0844c27af90e974131574174e88 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session.h
@@ -106,9 +106,12 @@ class GrpcSession : public Session {
  protected:
   // Takes ownership of `*master`.
   void SetRemoteMaster(std::unique_ptr<MasterInterface> master);
+  // Allows subclasses to customize Session creation.
+  void SetHandleAndGraphVersion(string handle, int64 graph_version)
+      LOCKS_EXCLUDED(mu_);
 
  private:
-  SessionOptions options_;
+  const SessionOptions options_;
   std::unique_ptr<MasterInterface> master_;
   mutex mu_;
 
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
index fc601991a24d5718d58bc70da370b952622fd5c8..ad0f8e5e2fcec011812b69082bc1747bd51fd7d3 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_session_test.cc
@@ -1066,4 +1066,31 @@ TEST(SessionTest, RunTimeoutWithRunOptions) {
               error::INTERNAL == status.code());
 }
 
+TEST(SessionTest, TestCompression) {
+  std::unique_ptr<test::TestCluster> cluster;
+  TF_CHECK_OK(test::TestCluster::MakeTestCluster(Devices(1, 0), 1, &cluster));
+  SessionOptions options = Options(cluster->targets()[0], 100);
+  RPCOptions* rpc_options = options.config.mutable_rpc_options();
+  rpc_options->set_compression_algorithm("deflate");
+  rpc_options->set_compression_level(GRPC_COMPRESS_LEVEL_HIGH);
+
+  std::unique_ptr<Session> session(NewRemote(options));
+
+  static const float kTestValue = 409.1934f;
+  Graph graph(OpRegistry::Global());
+  Tensor tensor(DT_FLOAT, TensorShape({1, 1}));
+  tensor.flat<float>()(0) = kTestValue;
+  Node* b = test::graph::Constant(&graph, tensor);
+  GraphDef gdef;
+  graph.ToGraphDef(&gdef);
+  RunOptions run_options;
+  TF_CHECK_OK(session->Create(run_options, gdef));
+
+  std::vector<std::pair<string, Tensor>> inputs;
+  std::vector<Tensor> outputs;
+  TF_CHECK_OK(session->Run(inputs, {b->name()}, {}, &outputs));
+  ASSERT_EQ(1, outputs.size());
+  IsSingleFloatValue(outputs[0], kTestValue);
+}
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_state.h b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
index 61c5bc285f2f2e38a39737408a446a84b8442690..b67f3c4563107882a556e83c07ee20ca69b3f3b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_state.h
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_state.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/core/distributed_runtime/rpc/grpc_client_cq_tag.h"
 #include "tensorflow/core/distributed_runtime/rpc/grpc_util.h"
 #include "tensorflow/core/distributed_runtime/tensor_coding.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/platform/notification.h"
 
 namespace tensorflow {
@@ -36,16 +37,18 @@ class RPCState : public GrpcClientCQTag {
   // Default behavior is to set fail_fast = False and handle timeouts manually.
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const protobuf::Message& request,
-           Response* response, StatusCallback done, CallOptions* call_opts)
+           Response* response, StatusCallback done, CallOptions* call_opts,
+           thread::ThreadPool* threadpool)
       : RPCState(stub, cq, method, request, response, std::move(done),
-                 call_opts, /*fail_fast=*/false, /*timeout_in_ms=*/0) {}
+                 call_opts, threadpool, /*fail_fast=*/false,
+                 /*timeout_in_ms=*/0) {}
 
   template <typename Request>
   RPCState(::grpc::GenericStub* stub, ::grpc::CompletionQueue* cq,
            const ::grpc::string& method, const Request& request,
            Response* response, StatusCallback done, CallOptions* call_opts,
-           bool fail_fast, int64 timeout_in_ms)
-      : call_opts_(call_opts), done_(std::move(done)) {
+           thread::ThreadPool* threadpool, bool fail_fast, int64 timeout_in_ms)
+      : call_opts_(call_opts), threadpool_(threadpool), done_(std::move(done)) {
     context_.set_fail_fast(fail_fast);
     if (timeout_in_ms > 0) {
       context_.set_deadline(gpr_time_from_millis(timeout_in_ms, GPR_TIMESPAN));
@@ -77,11 +80,27 @@ class RPCState : public GrpcClientCQTag {
       // to Finish for client-side unary calls, ok should never be false
       s.Update(errors::Internal("unexpected ok value at rpc completion"));
     }
-    if (s.ok() && !GrpcMaybeParseProto(&response_buf_, response_)) {
-      s.Update(errors::Internal("could not parse rpc response"));
-    }
-    if (!s.ok()) {
+
+    if (s.ok()) {
+      if (threadpool_) {
+        // Run parse and callback in another thread, returning this
+        // one to service more RPCs.
+        threadpool_->Schedule([this]() { ParseAndCallDone(); });
+      } else {
+        ParseAndCallDone();
+        return;
+      }
+    } else {
       VLOG(2) << "Call returned with non-ok status: " << s;
+      done_(s);
+      delete this;
+    }
+  }
+
+  void ParseAndCallDone() {
+    Status s;
+    if (!GrpcMaybeParseProto(&response_buf_, response_)) {
+      s.Update(errors::Internal("could not parse rpc response"));
     }
     done_(s);
     delete this;
@@ -90,6 +109,7 @@ class RPCState : public GrpcClientCQTag {
  private:
   CallOptions* call_opts_;
   ::grpc::ClientContext context_;
+  thread::ThreadPool* threadpool_;
   std::unique_ptr<::grpc::GenericClientAsyncResponseReader> call_;
   Response* response_;
   ::grpc::ByteBuffer request_buf_;
diff --git a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
index e1541db69bfc2471ff1241a0154f442c1fd5511c..60d5881d4ca75a7ea201d592d8668bce7438592e 100644
--- a/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
+++ b/tensorflow/core/distributed_runtime/rpc/grpc_worker_cache.cc
@@ -43,7 +43,17 @@ class GrpcWorkerCache : public WorkerCachePartial {
         local_worker_(local_worker),
         channel_cache_(channel_cache),
         threads_(kGrpcWorkerCacheThreadCount),
-        next_round_robin_assignment_(0) {}
+        next_round_robin_assignment_(0) {
+    // NOTE: We don't yet have any reason to assign NUMA affinity to this
+    // ThreadPool.  If there's only a single NIC it shouldn't make any
+    // difference since presumably it is handling memory from all nodes.
+    ThreadOptions options;
+    options.numa_node = port::kNUMANoAffinity;
+    const int kNumCallbackThreads = 10;
+    callback_threadpool_.reset(new thread::ThreadPool(
+        Env::Default(), options, "grpc_wcache_callback", kNumCallbackThreads,
+        false /*low_latency_hint*/, nullptr /*allocator*/));
+  }
 
   // Explicit destructor to control destruction order.
   ~GrpcWorkerCache() override {
@@ -67,7 +77,7 @@ class GrpcWorkerCache : public WorkerCachePartial {
       if (!channel) return nullptr;
       return NewGrpcRemoteWorker(
           channel, threads_[AssignWorkerToThread(target)].completion_queue(),
-          &logger_);
+          callback_threadpool_.get(), &logger_);
     }
   }
 
@@ -138,6 +148,8 @@ class GrpcWorkerCache : public WorkerCachePartial {
   WorkerCacheLogger logger_;
   std::vector<GrpcWorkerCacheThread> threads_;
 
+  std::unique_ptr<thread::ThreadPool> callback_threadpool_;
+
   mutex assignment_mu_;
   std::unordered_map<std::string, size_t> target_assignments_
       GUARDED_BY(assignment_mu_);
diff --git a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
index b8cb5385038ed2c01d15cb5a571cd2d5ec6505c8..9fb920404f987d6b5b324cce4155da40c7e753b4 100644
--- a/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
+++ b/tensorflow/core/distributed_runtime/rpc/rpc_rendezvous_mgr.cc
@@ -244,6 +244,15 @@ void RpcRemoteRendezvous::RecvFromRemoteAsync(
   // Record "call" in active_ so that it can be aborted cleanly.
   RegisterCall(call);
 
+  // RendezvousMgr already aborted, shouldn't send RPC call any more
+  if (!call->status().ok()) {
+    call->done()(call->status(), Args(), Args(), Tensor(), false);
+    session()->worker_cache->ReleaseWorker(call->src_worker_, call->wi_);
+    call->wi_ = nullptr;
+    get_call_freelist()->Release(call, session()->worker_cache.get());
+    return;
+  }
+
   // Start "call".
   Ref();
   call->Start([this, call]() {
diff --git a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
index 0323300fdde0734d3da216ed69958556b27a49b5..1c87fe9d928f65008d0a87af58873bffb5f9aa18 100644
--- a/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/rpc_collective_executor_mgr_test.cc
@@ -42,8 +42,9 @@ class RpcCollectiveExecutorMgrTest : public ::testing::Test {
     WorkerCacheInterface* worker_cache = nullptr;
     auto* device_count = options.config.mutable_device_count();
     device_count->insert({"CPU", NUM_DEVS});
-    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices_));
-    device_mgr_.reset(new DeviceMgr(devices_));
+    std::vector<std::unique_ptr<Device>> devices;
+    TF_CHECK_OK(DeviceFactory::AddDevices(options, task_name, &devices));
+    device_mgr_.reset(new DeviceMgr(std::move(devices)));
     std::unique_ptr<DeviceResolverDistributed> dr(new DeviceResolverDistributed(
         device_mgr_.get(), worker_cache, task_name));
     std::unique_ptr<CollectiveParamResolverDistributed> cpr(
@@ -57,7 +58,6 @@ class RpcCollectiveExecutorMgrTest : public ::testing::Test {
   }
 
   std::unique_ptr<RpcCollectiveExecutorMgr> cme_;
-  std::vector<Device*> devices_;
   std::unique_ptr<DeviceMgr> device_mgr_;
 };
 
diff --git a/tensorflow/core/distributed_runtime/server_lib.cc b/tensorflow/core/distributed_runtime/server_lib.cc
index 7d308bb723a71e23482b6f52fa6d8fa53f89dda8..fe9369e884b8e24b31622b82487712ae6f96a6dd 100644
--- a/tensorflow/core/distributed_runtime/server_lib.cc
+++ b/tensorflow/core/distributed_runtime/server_lib.cc
@@ -49,16 +49,22 @@ void ServerFactory::Register(const string& server_type,
 Status ServerFactory::GetFactory(const ServerDef& server_def,
                                  ServerFactory** out_factory) {
   mutex_lock l(*get_server_factory_lock());
-  // TODO(mrry): Improve the error reporting here.
   for (const auto& server_factory : *server_factories()) {
     if (server_factory.second->AcceptsOptions(server_def)) {
       *out_factory = server_factory.second;
       return Status::OK();
     }
   }
+
+  std::vector<string> server_names;
+  for (const auto& server_factory : *server_factories()) {
+    server_names.push_back(server_factory.first);
+  }
+
   return errors::NotFound(
       "No server factory registered for the given ServerDef: ",
-      server_def.DebugString());
+      server_def.DebugString(), "\nThe available server factories are: [ ",
+      str_util::Join(server_names, ", "), " ]");
 }
 
 // Creates a server based on the given `server_def`, and stores it in
diff --git a/tensorflow/core/distributed_runtime/server_lib_test.cc b/tensorflow/core/distributed_runtime/server_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..460372523c98c4e5a1e83be7a025e5911e9b4a8c
--- /dev/null
+++ b/tensorflow/core/distributed_runtime/server_lib_test.cc
@@ -0,0 +1,56 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/distributed_runtime/server_lib.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+class TestServerFactory : public ServerFactory {
+ public:
+  bool AcceptsOptions(const ServerDef& server_def) override {
+    return server_def.protocol() == "test_protocol";
+  }
+
+  Status NewServer(const ServerDef& server_def,
+                   std::unique_ptr<ServerInterface>* out_server) override {
+    return Status::OK();
+  }
+};
+
+TEST(ServerLibTest, NewServerFactoryAccepts) {
+  ServerFactory::Register("TEST_SERVER", new TestServerFactory());
+  ServerDef server_def;
+  server_def.set_protocol("test_protocol");
+  std::unique_ptr<ServerInterface> server;
+  TF_EXPECT_OK(NewServer(server_def, &server));
+}
+
+TEST(ServerLibTest, NewServerNoFactoriesAccept) {
+  ServerDef server_def;
+  server_def.set_protocol("fake_protocol");
+  std::unique_ptr<ServerInterface> server;
+  Status s = NewServer(server_def, &server);
+  ASSERT_NE(s, Status::OK());
+  EXPECT_TRUE(str_util::StrContains(
+      s.error_message(),
+      "No server factory registered for the given ServerDef"));
+  EXPECT_TRUE(str_util::StrContains(s.error_message(),
+                                    "The available server factories are: ["));
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/distributed_runtime/session_mgr.cc b/tensorflow/core/distributed_runtime/session_mgr.cc
index 95b31c6991f6344c1b15b1fd28225aef37359818..29fe767e42a8dcec873f2a03dfe3247841da38c1 100644
--- a/tensorflow/core/distributed_runtime/session_mgr.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr.cc
@@ -78,13 +78,13 @@ Status SessionMgr::CreateSession(const string& session,
 
   if (isolate_session_state) {
     // Create a private copy of the DeviceMgr for the WorkerSession.
-    std::vector<Device*> renamed_devices;
+    std::vector<std::unique_ptr<Device>> renamed_devices;
     for (Device* d : worker_env_->local_devices) {
       renamed_devices.push_back(RenamedDevice::NewRenamedDevice(
           worker_name, d, false, isolate_session_state));
     }
 
-    auto device_mgr = MakeUnique<DeviceMgr>(renamed_devices);
+    auto device_mgr = MakeUnique<DeviceMgr>(std::move(renamed_devices));
     auto graph_mgr = MakeUnique<GraphMgr>(worker_env_, device_mgr.get());
     worker_session.reset(
         new WorkerSession(session, worker_name,
@@ -122,7 +122,9 @@ Status SessionMgr::WorkerSessionForSessionLocked(
     auto it = sessions_.find(session_handle);
     if (it == sessions_.end()) {
       return errors::Aborted("Session handle is not found: ", session_handle,
-                             ". Possibly this worker just restarted.");
+                             ". Possibly this worker (\"",
+                             legacy_session_->worker_name,
+                             "\") just restarted.");
     } else {
       *out_session = it->second;
     }
diff --git a/tensorflow/core/distributed_runtime/session_mgr_test.cc b/tensorflow/core/distributed_runtime/session_mgr_test.cc
index 99192119a63e2553bc107eff3f79a436c455b9e3..1ab0d20f0b53798ea63e69d25f41c47bcaef17d4 100644
--- a/tensorflow/core/distributed_runtime/session_mgr_test.cc
+++ b/tensorflow/core/distributed_runtime/session_mgr_test.cc
@@ -46,11 +46,9 @@ class SessionMgrTest : public ::testing::Test {
   SessionMgrTest()
       : mgr_(&env_, "/job:mnist/replica:0/task:0",
              std::unique_ptr<WorkerCacheInterface>(), factory_) {
-    Device* device =
-        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0")
-            .release();
-    env_.local_devices = {device};
-    device_mgr_.reset(new DeviceMgr(env_.local_devices));
+    device_mgr_ = absl::make_unique<DeviceMgr>(
+        FakeDevice::MakeCPU("/job:mnist/replica:0/task:0/device:fakecpu:0"));
+    env_.local_devices = device_mgr_->ListDevices();
     env_.device_mgr = device_mgr_.get();
   }
 
diff --git a/tensorflow/core/distributed_runtime/worker.cc b/tensorflow/core/distributed_runtime/worker.cc
index 079c09859f46bbb4c3b0e8e671274ee7d80387b4..f42143e5824827e35a97ac25cb80b0e2c82e716e 100644
--- a/tensorflow/core/distributed_runtime/worker.cc
+++ b/tensorflow/core/distributed_runtime/worker.cc
@@ -438,7 +438,9 @@ Status Worker::PrepareRecvTensor(const Rendezvous::ParsedKey& parsed,
     return errors::Aborted(
         "RecvTensor expects a different device incarnation: ",
         parsed.src_incarnation, " vs. ", (*src_dev)->attributes().incarnation(),
-        ". Your worker job was probably restarted. Check your "
+        ". Your worker job (\"",
+        env_->session_mgr->LegacySession()->worker_name,
+        "\") was probably restarted. Check your "
         "worker job for the reason why it was restarted.");
   }
 
diff --git a/tensorflow/core/framework/allocator.cc b/tensorflow/core/framework/allocator.cc
index 84cee5569c4ac2c0083e4d4970b48460d9bd95ca..89c49a2ad050bfe067e9557aabd2916fba812fb0 100644
--- a/tensorflow/core/framework/allocator.cc
+++ b/tensorflow/core/framework/allocator.cc
@@ -96,9 +96,11 @@ static int64_t TotalAllocationWarningBytes() {
 void EnableCPUAllocatorStats(bool enable) {
   cpu_allocator_collect_stats = enable;
 }
+bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
 void EnableCPUAllocatorFullStats(bool enable) {
   cpu_allocator_collect_full_stats = enable;
 }
+bool CPUAllocatorFullStatsEnabled() { return cpu_allocator_collect_full_stats; }
 
 namespace {
 // A default Allocator for CPU devices.  ProcessState::GetCPUAllocator() will
diff --git a/tensorflow/core/framework/allocator.h b/tensorflow/core/framework/allocator.h
index 8c23604625ba77a4ca4fa42f96059735ed525f5d..531ea73e89277c83cfede50fce0de08b65c5e5a5 100644
--- a/tensorflow/core/framework/allocator.h
+++ b/tensorflow/core/framework/allocator.h
@@ -383,10 +383,12 @@ Allocator* cpu_allocator();
 // If 'enable' is true, the default CPU allocator implementation will collect
 // AllocatorStats. By default, it's disabled.
 void EnableCPUAllocatorStats(bool enable);
+bool CPUAllocatorStatsEnabled();
 
 // If 'enable' is true, the default CPU allocator implementation will collect
 // full statistics. By default, it's disabled.
 void EnableCPUAllocatorFullStats(bool enable);
+bool CPUAllocatorFullStatsEnabled();
 
 // An object that does the underlying suballoc/free of memory for a higher-level
 // allocator.  The expectation is that the higher-level allocator is doing some
diff --git a/tensorflow/core/framework/common_shape_fns.cc b/tensorflow/core/framework/common_shape_fns.cc
index 7f35390f90c4ffb22e5e8247096812896371b3ad..bf2d902af41c690be25a170da6fc22a4902e2d50 100644
--- a/tensorflow/core/framework/common_shape_fns.cc
+++ b/tensorflow/core/framework/common_shape_fns.cc
@@ -1549,6 +1549,51 @@ Status ExplicitShapes(InferenceContext* c) {
   return Status::OK();
 }
 
+Status SparseReduceShapeFn(InferenceContext* c) {
+  // Input 0: input_indices
+  // Input 1: input_values
+  // Input 2: input_shape
+  // Input 3: reduction_axes
+  // Attr: keep_dims
+  bool keep_dims = false;
+  TF_RETURN_IF_ERROR(c->GetAttr("keep_dims", &keep_dims));
+
+  const Tensor* shape_tensor = c->input_tensor(2);
+  const Tensor* axes_tensor = c->input_tensor(3);
+  if (shape_tensor != nullptr && axes_tensor != nullptr) {
+    auto shape_vec = shape_tensor->flat<int64>();
+    auto axes_vec = axes_tensor->flat<int32>();
+
+    int64 ndims = shape_vec.size();
+    std::unordered_set<int64> axes;
+    for (int i = 0; i < axes_vec.size(); i++) {
+      axes.insert((axes_vec(i) + ndims) % ndims);
+    }
+
+    std::vector<DimensionHandle> dims;
+    if (keep_dims) {
+      dims.reserve(ndims);
+      for (int d = 0; d < ndims; ++d) {
+        if (axes.find(d) == axes.end()) {
+          dims.push_back(c->MakeDim(shape_vec(d)));
+        } else {
+          dims.push_back(c->MakeDim(1));
+        }
+      }
+    } else {
+      for (int d = 0; d < ndims; ++d) {
+        if (axes.find(d) == axes.end()) {
+          dims.push_back(c->MakeDim(shape_vec(d)));
+        }
+      }
+    }
+
+    c->set_output(0, c->MakeShape(dims));
+    return Status::OK();
+  }
+  return UnknownShape(c);
+}
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/common_shape_fns.h b/tensorflow/core/framework/common_shape_fns.h
index 3a496e06aeb5c28d4e7c8ea306151fee16d3eba4..362899b947b1fd479d227ac5421a5f458405f3c6 100644
--- a/tensorflow/core/framework/common_shape_fns.h
+++ b/tensorflow/core/framework/common_shape_fns.h
@@ -310,6 +310,9 @@ Status ExplicitShape(InferenceContext* c);
 // Shape function for multiple-output ops with an explicit "shapes" attribute.
 Status ExplicitShapes(InferenceContext* c);
 
+// Shape function for SparseReduceMax and SparseReduceSum.
+Status SparseReduceShapeFn(InferenceContext* c);
+
 }  // namespace shape_inference
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/dataset.cc b/tensorflow/core/framework/dataset.cc
index fc6b5dde0cbe0b6ef1ae3c65171c16001f383b64..6e214332710c9f2e854db99ec588424c8df81145 100644
--- a/tensorflow/core/framework/dataset.cc
+++ b/tensorflow/core/framework/dataset.cc
@@ -13,10 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
+#include <unordered_map>
 
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/variant_encode_decode.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
 #include "tensorflow/core/graph/graph_def_builder.h"
 #include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/platform/mutex.h"
 
 namespace tensorflow {
 namespace data {
@@ -71,6 +76,113 @@ class DatasetVariantWrapper {
   DatasetBase* const dataset_;  // Owns one reference.
 };
 
+const char kWrappedDatasetVariantTypeName[] =
+    "tensorflow::data::WrappedDatasetVariant";
+
+class WrappedDatasetVariantWrapper {
+ public:
+  WrappedDatasetVariantWrapper() {}
+
+  explicit WrappedDatasetVariantWrapper(const Tensor& ds_tensor)
+      : ds_tensor_(ds_tensor) {}
+
+  Tensor get() const { return ds_tensor_; }
+
+  string TypeName() const { return "tensorflow::WrappedDatasetVariantWrapper"; }
+
+  string DebugString() const {
+    return "tensorflow::WrappedDatasetVariantWrapper::DebugString";
+  }
+
+  void Encode(VariantTensorData* data) const {
+    *(data->add_tensors()) = ds_tensor_;
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    ds_tensor_ = data.tensors(0);
+    return true;
+  }
+
+ private:
+  Tensor ds_tensor_;
+};
+
+class WrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit WrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    DatasetBase* unused;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(tensor, &unused));
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
+    output->scalar<Variant>()() = WrappedDatasetVariantWrapper(tensor);
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant").Device(DEVICE_CPU),
+                        WrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("WrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        WrapDatasetVariantOp);
+
+class UnwrapDatasetVariantOp : public OpKernel {
+ public:
+  explicit UnwrapDatasetVariantOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor& tensor = ctx->input(0);
+    OP_REQUIRES(ctx,
+                tensor.dtype() == DT_VARIANT &&
+                    TensorShapeUtils::IsScalar(tensor.shape()),
+                errors::InvalidArgument(
+                    "Dataset tensor must be a scalar of dtype DT_VARIANT."));
+    Variant variant = tensor.scalar<Variant>()();
+    const WrappedDatasetVariantWrapper* wrapper =
+        variant.get<WrappedDatasetVariantWrapper>();
+    OP_REQUIRES(ctx, wrapper != nullptr,
+                errors::InvalidArgument(
+                    "Tensor must be a WrappedDataset variant object."));
+    Tensor ds_tensor = wrapper->get();
+    OP_REQUIRES_OK(ctx, ctx->set_output("output_handle", ds_tensor));
+  }
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant").Device(DEVICE_CPU),
+                        UnwrapDatasetVariantOp);
+REGISTER_KERNEL_BUILDER(Name("UnwrapDatasetVariant")
+                            .HostMemory("input_handle")
+                            .HostMemory("output_handle")
+                            .Device(DEVICE_GPU),
+                        UnwrapDatasetVariantOp);
+
+static Status WrappedDatasetVariantDeviceCopy(
+    const WrappedDatasetVariantWrapper& from, WrappedDatasetVariantWrapper* to,
+    const UnaryVariantOpRegistry::AsyncTensorDeviceCopyFn& copy) {
+  *to = WrappedDatasetVariantWrapper(from);
+  return Status::OK();
+}
+
+#define REGISTER_OPTIONAL_COPY(DIRECTION)               \
+  INTERNAL_REGISTER_UNARY_VARIANT_DEVICE_COPY_FUNCTION( \
+      WrappedDatasetVariantWrapper, DIRECTION,          \
+      WrappedDatasetVariantDeviceCopy)
+
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::HOST_TO_DEVICE);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_HOST);
+REGISTER_OPTIONAL_COPY(VariantDeviceCopyDirection::DEVICE_TO_DEVICE);
+
+REGISTER_UNARY_VARIANT_DECODE_FUNCTION(WrappedDatasetVariantWrapper,
+                                       kWrappedDatasetVariantTypeName);
+
 }  // namespace
 
 Status GraphDefBuilderWrapper::AddDataset(
@@ -203,6 +315,20 @@ bool GraphDefBuilderWrapper::HasAttr(const string& name,
   return HasAttr(op_def, attr_name);
 }
 
+int64 GetAllocatedBytes(const std::vector<Tensor>& element) {
+  int64 allocated_bytes = 0;
+  DatasetBase* dataset;
+  for (auto& tensor : element) {
+    if (tensor.dtype() == DT_VARIANT &&
+        GetDatasetFromVariantTensor(tensor, &dataset).ok()) {
+      allocated_bytes += dataset->AllocatedBytes();
+    } else {
+      allocated_bytes += tensor.AllocatedBytes();
+    }
+  }
+  return allocated_bytes;
+}
+
 Status GetDatasetFromVariantTensor(const Tensor& tensor,
                                    DatasetBase** out_dataset) {
   if (!(tensor.dtype() == DT_VARIANT &&
diff --git a/tensorflow/core/framework/dataset.h b/tensorflow/core/framework/dataset.h
index 5960c105c84c2e1af3bcfb80bfbdb1f5976f5a07..7d3776a6ec92b5ab6befbab3162c3d4937c4fe70 100644
--- a/tensorflow/core/framework/dataset.h
+++ b/tensorflow/core/framework/dataset.h
@@ -17,6 +17,7 @@ limitations under the License.
 
 #include <deque>
 #include <memory>
+#include <unordered_map>
 
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/attr_value_util.h"
@@ -30,8 +31,10 @@ limitations under the License.
 #include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/framework/variant_encode_decode.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/tracing.h"
 
 // Polymorphic datasets should support all primitive TensorFlow
@@ -50,6 +53,9 @@ namespace data {
 // A constant that can be used to enable auto-tuning.
 constexpr int kAutoTune = -1;
 
+constexpr int kInfiniteCardinality = -1;
+constexpr int kUnknownCardinality = -2;
+
 class DatasetBase;
 class SerializationContext;
 
@@ -160,7 +166,7 @@ class GraphDefBuilderWrapper {
                     const std::vector<std::pair<StringPiece, AttrValue>>& attrs,
                     Node** output) {
     std::vector<std::pair<size_t, Node*>> enumerated_inputs(inputs.size());
-    for (int i = 0; i < inputs.size(); i++) {
+    for (size_t i = 0; i < inputs.size(); i++) {
       enumerated_inputs[i] = std::make_pair(i, inputs[i]);
     }
     return AddDataset(dataset, enumerated_inputs, {}, attrs, output);
@@ -257,6 +263,7 @@ class GraphDefBuilderWrapper {
 };
 
 class StatsAggregator;
+class FunctionHandleCache;
 
 // A cut-down version of `OpKernelContext` for running computations in
 // iterators. Note that we cannot simply use `OpKernelContext` here because we
@@ -277,6 +284,8 @@ class IteratorContext {
           env(ctx->env()),
           function_library(ctx->function_library()),
           lib(ctx->lib()),
+          function_handle_cache(ctx->function_handle_cache()),
+          resource_mgr(ctx->resource_mgr()),
           model(ctx->model()),
           runner(*(ctx->runner())),
           runner_threadpool_size(ctx->runner_threadpool_size()),
@@ -285,15 +294,20 @@ class IteratorContext {
     explicit Params(OpKernelContext* ctx)
         : env(ctx->env()),
           lib(ctx->function_library()),
-          runner(*(ctx->runner())),
-          runner_threadpool_size(
-              ctx->device()->tensorflow_cpu_worker_threads()->num_threads) {
+          runner(*(ctx->runner())) {
       // NOTE: need reinterpret_cast because function.h forward-declares Device.
       DeviceBase* device =
           reinterpret_cast<DeviceBase*>(ctx->function_library()->device());
       allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
+      thread::ThreadPool* thread_pool =
+          ctx->device()->tensorflow_device_thread_pool();
+      if (thread_pool) {
+        runner_threadpool_size = thread_pool->NumThreads();
+      } else {
+        runner_threadpool_size = port::NumSchedulableCPUs();
+      }
     }
 
     // The Allocator to be used to allocate the output of an iterator.
@@ -308,6 +322,13 @@ class IteratorContext {
     // The FunctionLibraryRuntime object to be used to make function calls.
     FunctionLibraryRuntime* lib = nullptr;
 
+    // A FunctionHandleCache that owns all the function handles. Not owned.
+    FunctionHandleCache* function_handle_cache = nullptr;
+
+    // A resource manager for storing dataset-related state, e.g. random
+    // seeds or cached tensors. Not owned.
+    ResourceMgr* resource_mgr = nullptr;
+
     // If non-null, identifies the object used for performance modeling.
     std::shared_ptr<model::Model> model = nullptr;
 
@@ -343,6 +364,12 @@ class IteratorContext {
 
   FunctionLibraryRuntime* lib() { return params_.lib; }
 
+  FunctionHandleCache* function_handle_cache() {
+    return params_.function_handle_cache;
+  }
+
+  ResourceMgr* resource_mgr() { return params_.resource_mgr; }
+
   const std::shared_ptr<model::Model>& model() { return params_.model; }
 
   std::function<void(std::function<void()>)>* runner() {
@@ -514,6 +541,25 @@ class DatasetContext {
   Params params_;
 };
 
+// Returns the number of bytes allocated for the given tensor.
+int64 GetAllocatedBytes(const std::vector<Tensor>& element);
+
+// Validates and extracts a `DatasetBase` object from `tensor`.
+//
+// `tensor` must have been written by a call to SetVariantTensorToDataset().
+//
+// The retrieved pointer is a borrowed reference to the dataset, which is owned
+// by the tensor. The consumer must either acquire its own reference to the
+// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
+// destroyed or mutated while the retrieved pointer is in use.
+Status GetDatasetFromVariantTensor(const Tensor& tensor,
+                                   DatasetBase** out_dataset);
+
+// Stores a `DatasetBase` object in `tensor`.
+//
+// The ownership of `dataset` is transferred to `tensor`.
+Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
+
 // Represents a (potentially infinite) range of outputs, where each
 // output is a tuple of tensors.
 class DatasetBase : public core::RefCounted {
@@ -567,6 +613,12 @@ class DatasetBase : public core::RefCounted {
   // in the outputs of this dataset.
   virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
 
+  // Returns the number of bytes allocated for tensors of this dataset.
+  virtual int64 AllocatedBytes() const { return 0; }
+
+  // Returns the cardinality of this dataset.
+  virtual int64 Cardinality() const { return kUnknownCardinality; }
+
   // A human-readable debug string for this dataset.
   virtual string DebugString() const = 0;
 
@@ -584,7 +636,6 @@ class DatasetBase : public core::RefCounted {
                            const DatasetBase* dataset, Node** output);
   };
 
-  // TODO(jsimsa): Consolidate overloading into a single method.
   virtual Status AsGraphDefInternal(SerializationContext* ctx,
                                     DatasetGraphDefBuilder* b,
                                     Node** node) const = 0;
@@ -672,6 +723,24 @@ class DatasetBaseIterator : public IteratorBase {
     return model::MakeUnknownNode(std::move(args));
   }
 
+  // When performance modeling is enabled, this method records the fact that
+  // this iterator has dequeued a element from an internal buffer.
+  void RecordBufferDequeue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (node_) {
+      node_->add_buffered_bytes(-GetAllocatedBytes(element));
+    }
+  }
+
+  // When performance modeling is enabled, this method records the fact that
+  // this iterator has enqueued a element in an internal buffer.
+  void RecordBufferEnqueue(IteratorContext* ctx,
+                           const std::vector<Tensor>& element) {
+    if (node_) {
+      node_->add_buffered_bytes(GetAllocatedBytes(element));
+    }
+  }
+
   // When performance modeling is enabled, this method records the fact that
   // this iterator has produced an element.
   void RecordElement(IteratorContext* ctx) {
@@ -804,22 +873,6 @@ class BinaryDatasetOpKernel : public DatasetOpKernel {
                            DatasetBase** output) = 0;
 };
 
-// Validates and extracts a `DatasetBase` object from `tensor`.
-//
-// `tensor` must have been written by a call to SetVariantTensorToDataset().
-//
-// The retrieved pointer is a borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetDatasetFromVariantTensor(const Tensor& tensor,
-                                   DatasetBase** out_dataset);
-
-// Stores a `DatasetBase` object in `tensor`.
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreDatasetInVariantTensor(DatasetBase* dataset, Tensor* tensor);
-
 // A simple background worker that executes closures asynchronously and without
 // blocking.
 //
diff --git a/tensorflow/core/framework/device_base.cc b/tensorflow/core/framework/device_base.cc
index 9108c32942ad65616b246227f2ad84a56ea9eb93..78ace480c4bad66b06f27ca90a1bc5c482c3f00c 100644
--- a/tensorflow/core/framework/device_base.cc
+++ b/tensorflow/core/framework/device_base.cc
@@ -34,14 +34,14 @@ const string& DeviceBase::name() const {
 }
 
 void DeviceBase::set_eigen_cpu_device(Eigen::ThreadPoolDevice* d) {
-  // Eigen::ThreadPoolDevice is a very cheap struct (one pointer and
+  // Eigen::ThreadPoolDevice is a very cheap struct (two pointers and
   // an int).  Therefore, we can afford a pre-allocated array of
   // Eigen::ThreadPoolDevice.  Here, we ensure that
   // Eigen::ThreadPoolDevices in eigen_cpu_devices_ has increasingly
   // larger numThreads.
   for (int i = 1; i <= d->numThreads(); ++i) {
-    eigen_cpu_devices_.push_back(
-        new Eigen::ThreadPoolDevice(d->getPool(), i /* numThreads() */));
+    eigen_cpu_devices_.push_back(new Eigen::ThreadPoolDevice(
+        d->getPool(), i /* numThreads() */, d->allocator()));
   }
 }
 
diff --git a/tensorflow/core/framework/function.cc b/tensorflow/core/framework/function.cc
index abd0930ca9e8d0d881498093f98762e8ab1d3e5c..b69a40f3128905960cc054ddea7cc20b5d4583a3 100644
--- a/tensorflow/core/framework/function.cc
+++ b/tensorflow/core/framework/function.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/function.pb_text.h"
 #include "tensorflow/core/framework/graph.pb.h"
@@ -31,6 +33,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
 #include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/util/device_name_utils.h"
 #include "tensorflow/core/util/equal_graph_def.h"
 
 namespace tensorflow {
@@ -149,8 +152,8 @@ class FunctionInstantiationHelper {
   }
 
   // Builds index for nodes that can be used as node's input arguments.
-  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def,
-                            AttrSlice attr_values) {
+  Status BuildInputArgIndex(const OpDef::ArgDef& arg_def, AttrSlice attr_values,
+                            bool ints_on_device) {
     bool is_type_list;
     DataTypeVector dtypes;
     TF_RETURN_IF_ERROR(
@@ -169,7 +172,11 @@ class FunctionInstantiationHelper {
         strings::StrAppend(&name, "_", i);
       }
       NodeDef* gnode = AddNode(name);
-      gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      if (ints_on_device && dtypes[i] == DataType::DT_INT32) {
+        gnode->set_op(FunctionLibraryDefinition::kDeviceArgOp);
+      } else {
+        gnode->set_op(FunctionLibraryDefinition::kArgOp);
+      }
       AddAttr("T", dtypes[i], gnode);
       AddAttr("index", arg_index, gnode);
       result_.arg_types.push_back(dtypes[i]);
@@ -502,6 +509,16 @@ string Print(const NodeDef& n) {
       entries.push_back(strings::StrCat(a.first, "=", Print(a.second)));
     }
     std::sort(entries.begin(), entries.end());
+    // Add a short device string at the end of all attributes.
+    if (!n.device().empty()) {
+      DeviceNameUtils::ParsedName parsed;
+      if (DeviceNameUtils::ParseFullName(n.device(), &parsed)) {
+        entries.push_back(
+            strings::StrCat("device=", parsed.type, ":", parsed.id));
+      } else {
+        entries.push_back("device=<FAILED_TO_PARSE>");
+      }
+    }
     strings::StrAppend(&out, "[", str_util::Join(entries, ", "), "]");
   }
   strings::StrAppend(&out, "(");
@@ -564,9 +581,11 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::vector<const NodeDef*> ret;
   std::vector<const NodeDef*> body;
   for (const NodeDef* n : nodes) {
-    if (n->op() == FunctionLibraryDefinition::kArgOp) {
+    if (n->op() == FunctionLibraryDefinition::kArgOp ||
+        n->op() == FunctionLibraryDefinition::kDeviceArgOp) {
       arg.push_back(n);
-    } else if (n->op() == FunctionLibraryDefinition::kRetOp) {
+    } else if (n->op() == FunctionLibraryDefinition::kRetOp ||
+               n->op() == FunctionLibraryDefinition::kDeviceRetOp) {
       ret.push_back(n);
     } else {
       body.push_back(n);
@@ -583,26 +602,50 @@ string Print(gtl::ArraySlice<const NodeDef*> nodes) {
   std::sort(ret.begin(), ret.end(), comp);
   string out;
   strings::StrAppend(&out, "\n(");
-  auto get_type = [](const NodeDef& n) {
+  auto get_type_and_device = [](const NodeDef& n) {
     DataType dt;
     if (!GetNodeAttr(n, "T", &dt).ok()) {
       dt = DT_INVALID;
     }
+    if (!n.device().empty()) {
+      DeviceNameUtils::ParsedName parsed;
+      if (DeviceNameUtils::ParseFullName(n.device(), &parsed)) {
+        return strings::StrCat(DataTypeString(dt), "@", parsed.type, ":",
+                               parsed.id);
+      } else {
+        return strings::StrCat(DataTypeString(dt), "@",
+                               "<FAILED_TO_PARSE_DEVICE>");
+      }
+    }
     return DataTypeString(dt);
   };
   for (size_t i = 0; i < arg.size(); ++i) {
     const NodeDef* n = arg[i];
     if (i > 0) strings::StrAppend(&out, ", ");
     CHECK_GE(n->attr_size(), 2);
-    strings::StrAppend(&out, n->name(), ":", get_type(*n));
+    strings::StrAppend(&out, n->name(), ":", get_type_and_device(*n));
   }
   strings::StrAppend(&out, ") -> (");
   for (size_t i = 0; i < ret.size(); ++i) {
     const NodeDef* n = ret[i];
     if (i > 0) strings::StrAppend(&out, ", ");
     CHECK_LE(2, n->attr_size());
-    CHECK_EQ(1, n->input_size());
-    strings::StrAppend(&out, n->input(0), ":", get_type(*n));
+
+    // The _RetVal op should have a unique non-control input. We assert that
+    // here and add it to the output.
+    bool found_non_control_input = false;
+    for (const string& input : n->input()) {
+      if (!input.empty() && input[0] != '^') {
+        DCHECK_EQ(found_non_control_input, false)
+            << "RetVal node has more than one non-control input: "
+            << absl::StrJoin(n->input(), ", ");
+        strings::StrAppend(&out, n->input(0), ":", get_type_and_device(*n));
+        found_non_control_input = true;
+      }
+    }
+    DCHECK_EQ(found_non_control_input, true)
+        << "RetVal did not have any non-control inputs: "
+        << absl::StrJoin(n->input(), ", ");
   }
   strings::StrAppend(&out, ") {\n");
   for (size_t i = 0; i < body.size(); ++i) {
@@ -638,10 +681,13 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
   const OpDef& sig = fdef.signature();
   TF_RETURN_IF_ERROR(ValidateSignatureWithAttrs(sig, attr_values));
 
+  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
+                        fdef.attr().at("experimental_ints_on_device").b();
+
   FunctionInstantiationHelper helper(get_function, result);
   Status s;
   for (const OpDef::ArgDef& arg_def : sig.input_arg()) {
-    s = helper.BuildInputArgIndex(arg_def, attr_values);
+    s = helper.BuildInputArgIndex(arg_def, attr_values, ints_on_device);
     if (!s.ok()) {
       errors::AppendToMessage(&s, "In ", Print(arg_def));
       return s;
@@ -693,9 +739,6 @@ Status InstantiateFunction(const FunctionDef& fdef, AttrSlice attr_values,
     }
   }
 
-  bool ints_on_device = fdef.attr().count("experimental_ints_on_device") != 0 &&
-                        fdef.attr().at("experimental_ints_on_device").b();
-
   // Emits nodes for the function's return values.
   int ret_index = 0;
   for (const OpDef::ArgDef& ret_def : sig.output_arg()) {
@@ -1234,6 +1277,16 @@ const FunctionDef* FunctionLibraryDefinition::GetAttrImpl(
   }
 }
 
+std::vector<string> FunctionLibraryDefinition::ListFunctionNames() const {
+  std::vector<string> function_names;
+  tf_shared_lock l(mu_);
+  function_names.reserve(function_defs_.size());
+  for (const auto& it : function_defs_) {
+    function_names.emplace_back(it.first);
+  }
+  return function_names;
+}
+
 FunctionDefLibrary FunctionLibraryDefinition::ToProto() const {
   FunctionDefLibrary lib;
   tf_shared_lock l(mu_);
@@ -1273,6 +1326,138 @@ GET_ATTR(string)
 GET_ATTR(bool)
 #undef GET_ATTR
 
+namespace {
+
+constexpr char kExperimentalApiImplements[] = "experimental_api_implements";
+
+absl::flat_hash_set<string> ReachableFunctions(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  // Functions that are reachable from the graph.
+  absl::flat_hash_set<string> reachable_funcs;
+
+  // For any functions, if it has attribute "experimental_api_implements" =
+  // "some_interface" and it is reachable, then it means any other
+  // function with same attribute name and value could also be potentially
+  // reachable, eg via experimental_implementation_selector swapping the
+  // nodedef.
+  absl::flat_hash_set<string> reachable_api_interface;
+
+  // Functions might be reachable from the nested function calls, so we keep a
+  // queue of functions that we have to check.
+  gtl::InlinedVector<const FunctionDef*, 4> func_queue;
+
+  // Add reachable and not already processed functions to the functions queue.
+  const auto add_to_func_queue = [&](const string& func_name) {
+    const FunctionDef* func = flib.Find(func_name);
+    if (func && reachable_funcs.find(func_name) == reachable_funcs.end()) {
+      func_queue.push_back(func);
+    }
+  };
+
+  // Add all the functions that are reachable from the given node to the queue.
+  const auto process_node = [&](const NodeDef& node) {
+    // Node itself can be a call to the function.
+    add_to_func_queue(node.op());
+
+    // Or node can have an attribute referencing a function.
+    for (const auto& attr : node.attr()) {
+      const auto& attr_value = attr.second;
+
+      // 1. AttrValue.func
+      if (attr_value.has_func()) {
+        add_to_func_queue(attr_value.func().name());
+      }
+
+      // 2. AttrValue.ListValue.func
+      if (attr_value.has_list()) {
+        for (const auto& func : attr_value.list().func()) {
+          add_to_func_queue(func.name());
+        }
+      }
+    }
+  };
+
+  // Add all functions that are directly called from the optimized graph.
+  std::for_each(nodes.begin(), nodes.end(), process_node);
+
+  // Process all reachable functions.
+  while (!func_queue.empty()) {
+    const FunctionDef* func = func_queue.back();
+    func_queue.pop_back();
+
+    const string& func_name = func->signature().name();
+    reachable_funcs.insert(func_name);
+
+    const auto attr_it = func->attr().find(kExperimentalApiImplements);
+    if (attr_it != func->attr().end()) {
+      reachable_api_interface.insert(attr_it->second.s());
+    }
+
+    // Find all the functions called from the function body.
+    const auto& func_body = func->node_def();
+    std::for_each(func_body.begin(), func_body.end(), process_node);
+
+    // Check if the function has a registered gradient.
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
+  }
+
+  for (const auto& func_name : flib.ListFunctionNames()) {
+    const auto& func_def = flib.Find(func_name);
+    const auto attr_it = func_def->attr().find(kExperimentalApiImplements);
+    if (attr_it != func_def->attr().end()) {
+      if (reachable_api_interface.contains(attr_it->second.s())) {
+        reachable_funcs.insert(func_name);
+      }
+    }
+  }
+
+  return reachable_funcs;
+}
+
+FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
+    const FunctionLibraryDefinition& flib,
+    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
+  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, nodes);
+
+  FunctionLibraryDefinition reachable_flib(flib.default_registry(),
+                                           FunctionDefLibrary());
+
+  for (const string& func_name : reachable_funcs) {
+    const FunctionDef* func = flib.Find(func_name);
+    DCHECK_NE(func, nullptr);
+    // That should never fail, because we copy functions from valid flib and use
+    // the same default registry.
+    const Status added = reachable_flib.AddFunctionDef(*func);
+    DCHECK(added.ok());
+
+    const string grad_func_name = flib.FindGradient(func_name);
+    if (!grad_func_name.empty()) {
+      GradientDef grad;
+      grad.set_function_name(func_name);
+      grad.set_gradient_func(grad_func_name);
+      // It can only fail if function already has a gradient function.
+      const Status added_grad = reachable_flib.AddGradientDef(grad);
+      DCHECK(added_grad.ok());
+    }
+  }
+
+  return reachable_flib;
+}
+
+}  // namespace
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const GraphDef& graph) const {
+  return ReachableFunctionLibraryDefinition(*this, graph.node());
+}
+
+FunctionLibraryDefinition FunctionLibraryDefinition::ReachableDefinitions(
+    const FunctionDef& func) const {
+  return ReachableFunctionLibraryDefinition(*this, func.node_def());
+}
+
 void FunctionDefHelper::AttrValueWrapper::InitFromString(StringPiece val) {
   if (val.size() >= 2 && val[0] == '$') {
     proto.set_placeholder(val.data() + 1, val.size() - 1);
diff --git a/tensorflow/core/framework/function.h b/tensorflow/core/framework/function.h
index 40ace6ef81589885b2eb025edd337220a8cce545..9cf4b0f4cdf1d4c3604eebcf33bb51274578d73c 100644
--- a/tensorflow/core/framework/function.h
+++ b/tensorflow/core/framework/function.h
@@ -379,6 +379,7 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
   // Ops created for function arguments bear the name given by `kArgOp`; those
   // created for return values bear the name given by `kRetOp`.
   static constexpr const char* const kArgOp = "_Arg";
+  static constexpr const char* const kDeviceArgOp = "_DeviceArg";
   static constexpr const char* const kRetOp = "_Retval";
   static constexpr const char* const kDeviceRetOp = "_DeviceRetval";
 
@@ -406,10 +407,18 @@ class FunctionLibraryDefinition : public OpRegistryInterface {
     return function_defs_.size();
   }
 
+  // Returns all the function names in the FunctionLibraryDefinition.
+  std::vector<string> ListFunctionNames() const LOCKS_EXCLUDED(mu_);
+
   const OpRegistryInterface* default_registry() const {
     return default_registry_;
   }
 
+  // Returns a copy of `*this` with only the subset of functions that are
+  // reachable from the nodes of `graph` or `func`.
+  FunctionLibraryDefinition ReachableDefinitions(const GraphDef& graph) const;
+  FunctionLibraryDefinition ReachableDefinitions(const FunctionDef& func) const;
+
  private:
   // Shape inference for functions is handled separately by ShapeRefiner.
 
diff --git a/tensorflow/core/framework/function_handle_cache.cc b/tensorflow/core/framework/function_handle_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2b93b6b2f8702ccfbc191072054278f7f732dc5a
--- /dev/null
+++ b/tensorflow/core/framework/function_handle_cache.cc
@@ -0,0 +1,66 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/function_handle_cache.h"
+
+#include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/lib/strings/stringprintf.h"
+
+namespace tensorflow {
+namespace data {
+
+FunctionHandleCache::FunctionHandleCache(FunctionLibraryRuntime* lib)
+    : lib_(lib), state_handle_(strings::Printf("%lld", random::New64())) {}
+
+FunctionHandleCache::~FunctionHandleCache() {
+  Status s = Clear();
+  if (!s.ok()) {
+    LOG(ERROR) << "Failed to clear function handle cache: " << s.ToString();
+  }
+}
+
+Status FunctionHandleCache::Instantiate(
+    const string& function_name, AttrSlice attrs,
+    FunctionLibraryRuntime::InstantiateOptions options,
+    FunctionLibraryRuntime::Handle* handle) {
+  string key = Canonicalize(function_name, attrs, options);
+  FunctionLibraryRuntime::Handle h;
+  {
+    tf_shared_lock l(mu_);
+    h = gtl::FindWithDefault(handles_, key, kInvalidHandle);
+  }
+  if (h == kInvalidHandle) {
+    options.state_handle = state_handle_;
+    TF_RETURN_IF_ERROR(
+        lib_->Instantiate(function_name, attrs, options, handle));
+    mutex_lock l(mu_);
+    handles_[key] = *handle;
+  } else {
+    *handle = h;
+  }
+  return Status::OK();
+}
+
+Status FunctionHandleCache::Clear() {
+  mutex_lock l(mu_);
+  for (auto entry : handles_) {
+    TF_RETURN_IF_ERROR(lib_->ReleaseHandle(entry.second));
+  }
+  handles_.clear();
+  return Status::OK();
+}
+
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/framework/function_handle_cache.h b/tensorflow/core/framework/function_handle_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..2800a598e09dc305dc65abd0283545f5493b150a
--- /dev/null
+++ b/tensorflow/core/framework/function_handle_cache.h
@@ -0,0 +1,53 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
+#define TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
+
+#include <string>
+
+#include "tensorflow/core/framework/function.h"
+
+namespace tensorflow {
+namespace data {
+
+class FunctionHandleCache {
+ public:
+  explicit FunctionHandleCache(FunctionLibraryRuntime* lib);
+
+  ~FunctionHandleCache();
+
+  // Looks up the function to be instantiated in the cache first. If present,
+  // returns handle from there. Otherwise, instantiates a new function
+  // and stores handle in the cache.
+  Status Instantiate(const string& function_name, AttrSlice attrs,
+                     FunctionLibraryRuntime::InstantiateOptions options,
+                     FunctionLibraryRuntime::Handle* handle);
+
+  // Releases all the handles in the cache, clearing out the state for all
+  // functions involved.
+  Status Clear();
+
+ private:
+  mutex mu_;
+  FunctionLibraryRuntime* lib_ = nullptr;  // not owned
+  const string state_handle_;
+  std::unordered_map<string, FunctionLibraryRuntime::Handle> handles_
+      GUARDED_BY(mu_);
+};
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_FRAMEWORK_FUNCTION_HANDLE_CACHE_H_
diff --git a/tensorflow/core/framework/function_test.cc b/tensorflow/core/framework/function_test.cc
index 10392a9f32850a32c1a4a8ca273693987f102244..75d45fa2c84ebc340dfb79b76f7b406d7a099c1f 100644
--- a/tensorflow/core/framework/function_test.cc
+++ b/tensorflow/core/framework/function_test.cc
@@ -1213,6 +1213,17 @@ TEST(FunctionLibraryDefinitionTest, ToProto) {
   EXPECT_EQ(f3->DebugString(), f4->DebugString());
 }
 
+TEST(FunctionLibraryDefinitionTest, FunctionNames) {
+  FunctionDefLibrary proto;
+  *proto.add_function() = test::function::XTimesTwo();
+  *proto.add_function() = test::function::WXPlusB();
+  const FunctionLibraryDefinition lib_def(OpRegistry::Global(), proto);
+
+  const std::vector<string> function_names = lib_def.ListFunctionNames();
+  const std::vector<string> expected = {"XTimesTwo", "WXPlusB"};
+  EXPECT_EQ(function_names, expected);
+}
+
 TEST(FunctionLibraryDefinitionTest, GetAttr_FuncNoAttr) {
   FunctionDefLibrary proto;
   *proto.add_function() = test::function::XTimesTwo();
@@ -1293,6 +1304,79 @@ TEST(FunctionLibraryDefinitionTest, GetAttr_Gradient) {
   EXPECT_EQ(annotation, false);  // WXPlusB has no custom gradient.
 }
 
+TEST(FunctionLibraryDefinitionTest, ReachableDefinitions) {
+  using ::tensorflow::test::function::GDef;
+  using ::tensorflow::test::function::NDef;
+  using FDH = ::tensorflow::FunctionDefHelper;
+
+  const auto make_simple_fdef = [](const string& name,
+                                   const string& interface_name) {
+    auto func_def = FDH::Create(
+        name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+        {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+        /* Mapping between function returns and function node outputs. */
+        {{"z", "output:z:0"}});
+
+    if (!interface_name.empty()) {
+      auto* attr = func_def.mutable_attr();
+      (*attr)["experimental_api_implements"].set_s(interface_name);
+    }
+    return func_def;
+  };
+
+  FunctionDef func_1 = make_simple_fdef("Func1", "");
+  FunctionDef func_2 = make_simple_fdef("Func2", "");
+  FunctionDef func_3 = make_simple_fdef("Func3", "");
+  FunctionDef func_4 = make_simple_fdef("Func4", "api_1");
+  FunctionDef func_5 = make_simple_fdef("Func5", "api_1");
+  FunctionDef func_6 = make_simple_fdef("Func6", "api_2");
+
+  FunctionDef func_2_grad = make_simple_fdef("Func2_grad", "");
+
+  constexpr char kDevice[] = "/device:CPU:0";
+
+  GraphDef graph = GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+          NDef("x", "Func1", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+          NDef("y", "PartitionedCall", {"a", "b"},
+               {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+                {"Tout", DataTypeSlice{DT_FLOAT}},
+                {"f", FDH::FunctionRef("Func2", {{"T", DT_FLOAT}})}},
+               kDevice),
+          NDef("z", "Func4", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
+      },
+      // FunctionLib
+      {func_1, func_2, func_3, func_2_grad, func_4, func_5, func_6});
+
+  // Register custom function gradient after the graph was constructed.
+  GradientDef* func3_grad_def = graph.mutable_library()->add_gradient();
+  func3_grad_def->set_function_name("Func2");
+  func3_grad_def->set_gradient_func("Func2_grad");
+
+  FunctionLibraryDefinition flib(OpRegistry::Global(), graph.library());
+
+  // - 'Func1' is called directly from the graph.
+  // - 'Func2' is called indirectly via a PartitionedCall attribute, and it also
+  //   has a custom gradient ('Func2_grad') that must remain in the library.
+  // - 'Func3' is unreachable and has to be removed from the library
+  // - 'Func4' is called directly from the graph
+  // - 'Func5' is not called directly, but it implements same interface as Func4
+  //   which is directly called.
+  // - 'Func6' is not called directly, and the interface it implements has not
+  //   not been called by another nodes in the graph.
+  FunctionLibraryDefinition reachable_flib = flib.ReachableDefinitions(graph);
+  EXPECT_EQ(reachable_flib.num_functions(), 5);
+  EXPECT_TRUE(reachable_flib.Contains("Func1"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2"));
+  EXPECT_TRUE(reachable_flib.Contains("Func2_grad"));
+  EXPECT_FALSE(reachable_flib.Contains("Func3"));
+  EXPECT_TRUE(reachable_flib.Contains("Func4"));
+  EXPECT_TRUE(reachable_flib.Contains("Func5"));
+  EXPECT_FALSE(reachable_flib.Contains("Func6"));
+}
+
 // TODO(skyewm): this could be more thorough
 TEST(FunctionDefsEqualTest, TestFunctionDefsEqual) {
   // Equal functions
diff --git a/tensorflow/core/framework/kernel_def.proto b/tensorflow/core/framework/kernel_def.proto
index e16c2ae73bd5fb559daa0f1b8ec141479ce3d67a..358621dc0f5cc19d4687d75e97a76b9fafe3325f 100644
--- a/tensorflow/core/framework/kernel_def.proto
+++ b/tensorflow/core/framework/kernel_def.proto
@@ -33,6 +33,11 @@ message KernelDef {
   // won't be used unless the user specifies a "_kernel" attr with
   // value matching this.
   string label = 5;
+
+  // Prioritization of kernel amongst different devices. By default we assume
+  // priority is 0. The higher the priority the better. By default (i.e. if
+  // this is not set), we prefer GPU kernels over CPU.
+  int32 priority = 6;
 }
 
 // A collection of KernelDefs
diff --git a/tensorflow/core/framework/kernel_def_builder.cc b/tensorflow/core/framework/kernel_def_builder.cc
index eb86f18ff06c38860e0c24e60b42326317ddecfb..fcacc3bebbab66449f81e5fa4f3aba2565f3f18e 100644
--- a/tensorflow/core/framework/kernel_def_builder.cc
+++ b/tensorflow/core/framework/kernel_def_builder.cc
@@ -66,6 +66,11 @@ KernelDefBuilder& KernelDefBuilder::Label(const char* label) {
   return *this;
 }
 
+KernelDefBuilder& KernelDefBuilder::Priority(int32 priority) {
+  kernel_def_->set_priority(priority);
+  return *this;
+}
+
 const KernelDef* KernelDefBuilder::Build() {
   KernelDef* r = kernel_def_;
   kernel_def_ = nullptr;
diff --git a/tensorflow/core/framework/kernel_def_builder.h b/tensorflow/core/framework/kernel_def_builder.h
index 32dd21f94e0edf8b48cd2f710d1cd99038cba122..d74453cf60678d0f07e53190adba4903c120c69a 100644
--- a/tensorflow/core/framework/kernel_def_builder.h
+++ b/tensorflow/core/framework/kernel_def_builder.h
@@ -64,6 +64,9 @@ class KernelDefBuilder {
   // "_kernel" attr.  May only be specified once.  Returns *this.
   KernelDefBuilder& Label(const char* label);
 
+  // Specify a priority number for this kernel.
+  KernelDefBuilder& Priority(int32 priority);
+
   // Returns a pointer to a KernelDef with fields set based on the
   // above calls to this instance.
   // Caller takes ownership of the result.
diff --git a/tensorflow/core/framework/model.h b/tensorflow/core/framework/model.h
index 24aa5630cc38550789d6184500cff6b0394ecbee..10059bbfd5a89a3b24ce3daf981408564a5351b2 100644
--- a/tensorflow/core/framework/model.h
+++ b/tensorflow/core/framework/model.h
@@ -112,6 +112,12 @@ class Node {
   explicit Node(Args args)
       : id_(args.id), name_(args.name), output_(args.output.get()) {}
 
+  // Increments the bytes buffered by the given delta.
+  void add_buffered_bytes(int64 delta) LOCKS_EXCLUDED(mu_) {
+    mutex_lock l(mu_);
+    buffered_bytes_ += delta;
+  }
+
   // Adds an input.
   void add_input(std::shared_ptr<Node> node) LOCKS_EXCLUDED(mu_) {
     mutex_lock l(mu_);
@@ -124,18 +130,24 @@ class Node {
     processing_time_ += delta;
   }
 
+  // Returns the number of bytes stored in this node's buffer.
+  int64 buffered_bytes() const LOCKS_EXCLUDED(mu_) {
+    tf_shared_lock l(mu_);
+    return buffered_bytes_;
+  }
+
   // Returns the unique node ID.
   int64 id() const LOCKS_EXCLUDED(mu_) { return id_; }
 
-  // Returns the node name.
-  const string& name() const { return name_; }
-
   // Returns the node inputs.
   std::list<std::shared_ptr<Node>> inputs() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     return inputs_;
   }
 
+  // Returns the node name.
+  const string& name() const { return name_; }
+
   // Returns the number of elements produced by the node.
   int64 num_elements() const LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
@@ -185,7 +197,8 @@ class Node {
 
   // Collects tunable parameters in the subtree rooted in this node.
   void CollectTunableParameters(
-      std::vector<std::shared_ptr<Parameter>>* parameters) LOCKS_EXCLUDED(mu_) {
+      std::vector<std::shared_ptr<Parameter>>* parameters) const
+      LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     for (auto& pair : parameters_) {
       if (pair.second->state->tunable) {
@@ -219,6 +232,7 @@ class Node {
       LOCKS_EXCLUDED(mu_) {
     tf_shared_lock l(mu_);
     std::shared_ptr<Node> result = Clone(output);
+    result->buffered_bytes_ = buffered_bytes_;
     result->processing_time_ = processing_time_;
     result->num_elements_ = num_elements_;
     result->parameters_ = parameters_;
@@ -274,6 +288,7 @@ class Node {
   mutable mutex mu_;
   const int64 id_;
   const string name_;
+  int64 buffered_bytes_ GUARDED_BY(mu_) = 0;
   int64 processing_time_ GUARDED_BY(mu_) = 0;
   int64 num_elements_ GUARDED_BY(mu_) = 0;
   std::map<std::thread::id, int64> work_start_ GUARDED_BY(mu_);
diff --git a/tensorflow/core/framework/model_test.cc b/tensorflow/core/framework/model_test.cc
index 53e35f25b28cb3770b52e8f7de54eb0ff4e65d83..90bd570f90cdab2182f3d46e009b2cd972667ef9 100644
--- a/tensorflow/core/framework/model_test.cc
+++ b/tensorflow/core/framework/model_test.cc
@@ -330,6 +330,62 @@ TEST(UnknownTest, Model) {
   EXPECT_EQ(100, unknown->OutputTime(&input_times));
 }
 
+class TestNode : public model::Node {
+ public:
+  using model::Node::Node;
+
+  virtual ~TestNode() {}
+
+ protected:
+  std::shared_ptr<Node> Clone(std::shared_ptr<Node> output) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return nullptr;
+  }
+
+  int64 OutputTimeLocked(std::vector<int64>* input_times) const override
+      SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+
+  int64 ProcessingTimeLocked() const override SHARED_LOCKS_REQUIRED(mu_) {
+    return 0;
+  }
+};
+
+TEST(SetterGetterTest, Node) {
+  std::shared_ptr<TestNode> node =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestNode", nullptr});
+  EXPECT_EQ(-1, node->id());
+  EXPECT_EQ("TestNode", node->name());
+  EXPECT_EQ(nullptr, node->output());
+
+  EXPECT_EQ(0, node->buffered_bytes());
+  node->add_buffered_bytes(42);
+  EXPECT_EQ(42, node->buffered_bytes());
+
+  EXPECT_EQ(0, node->processing_time());
+  node->record_start(1);
+  EXPECT_EQ(0, node->processing_time());
+  node->record_stop(41);
+  EXPECT_EQ(40, node->processing_time());
+  node->add_processing_time(2);
+  EXPECT_EQ(42, node->processing_time());
+
+  std::shared_ptr<TestNode> input =
+      std::make_shared<TestNode>(model::Node::Args{-1, "TestInput", node});
+  EXPECT_EQ(node.get(), input->output());
+  EXPECT_EQ(0, node->inputs().size());
+  node->add_input(input);
+  EXPECT_EQ(1, node->inputs().size());
+  EXPECT_EQ(input, node->inputs().front());
+  node->remove_input(input);
+  EXPECT_EQ(0, node->inputs().size());
+
+  EXPECT_EQ(0, node->num_elements());
+  node->record_element();
+  EXPECT_EQ(1, node->num_elements());
+}
+
 }  // namespace
 }  // namespace model
 }  // namespace data
diff --git a/tensorflow/core/framework/node_def_util.cc b/tensorflow/core/framework/node_def_util.cc
index 578ec7f2e4dbebc8f0f5b3d80b551346523f8d10..95a787b2df02d48f316653ee5059b4f7e80f73e1 100644
--- a/tensorflow/core/framework/node_def_util.cc
+++ b/tensorflow/core/framework/node_def_util.cc
@@ -102,6 +102,10 @@ string SummarizeNodeDef(const NodeDef& node_def) {
   return ret;
 }
 
+string SummarizeAttrs(const NodeDef& node_def) {
+  return SummarizeAttrsHelper(node_def, node_def.device());
+}
+
 string FormatNodeForError(const Node& node) {
   return FormatNodeDefForError(node.def());
 }
diff --git a/tensorflow/core/framework/node_def_util.h b/tensorflow/core/framework/node_def_util.h
index 0ff67554eb3d2b4713c6c329dec2dc814ce28395..f682bb15355550622e8bbe384df790f1022bd630 100644
--- a/tensorflow/core/framework/node_def_util.h
+++ b/tensorflow/core/framework/node_def_util.h
@@ -48,6 +48,7 @@ extern const char* const kColocationGroupPrefix;
 // than a text-format proto.
 string SummarizeNode(const Node& node);
 string SummarizeNodeDef(const NodeDef& node_def);
+string SummarizeAttrs(const NodeDef& node_def);
 
 // Produces a formatted string pattern from the node which can uniquely identify
 // this node upstream to produce an informative error message. The pattern
diff --git a/tensorflow/core/framework/op_kernel.cc b/tensorflow/core/framework/op_kernel.cc
index fe71196979afd8241736779b031c0d110d530ed3..e3cb4a40ec5503307813d292f4f538fb8577a25b 100644
--- a/tensorflow/core/framework/op_kernel.cc
+++ b/tensorflow/core/framework/op_kernel.cc
@@ -39,9 +39,11 @@ limitations under the License.
 #include "tensorflow/core/lib/io/path.h"
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/cpu_info.h"
 #include "tensorflow/core/platform/env.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/platform_strings.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/ptr_util.h"
 
@@ -255,6 +257,9 @@ Status OpKernelConstruction::allocate_persistent(
 
 // OpKernelContext -----------------------------------------------------------
 
+const int OpKernelContext::Params::kNeverForward;
+const int OpKernelContext::Params::kNoReservation;
+
 OpKernelContext::OpKernelContext(Params* params)
     : OpKernelContext(
           params, static_cast<int>(params->op_kernel->output_types().size())) {}
@@ -942,6 +947,44 @@ static const char kKernelLibPattern[] = "libtfkernel*.dylib";
 static const char kKernelLibPattern[] = "libtfkernel*.so";
 #endif
 
+#define FEATURE(x) \
+  { x, #x }
+
+// Returns Status::OK if the dynamic library at the given path is safe to
+// load with some level of confidence.
+static Status IsProbablySafeToLoad(const string& path) {
+  // A map of platform string to required CPU feature.
+  using port::CPUFeature;
+  static const auto* feature_map =
+      new std::map<string, std::pair<CPUFeature, string>>{
+          {"__AVX512VL__=1", FEATURE(CPUFeature::AVX512VL)},
+      };
+
+  std::vector<std::string> platform_strings;
+  int result = GetPlatformStrings(path, &platform_strings);
+  if (result) {
+    return Status(error::Code::UNKNOWN, strerror(result));
+  }
+  if (platform_strings.empty()) {
+    return Status(error::Code::FAILED_PRECONDITION,
+                  "Didn't find any platform strings");
+  }
+  std::vector<std::string> missing_features;
+  for (const auto& platform_string : platform_strings) {
+    const auto& entry = feature_map->find(platform_string);
+    if (entry != feature_map->end() &&
+        !port::TestCPUFeature(entry->second.first)) {
+      missing_features.emplace_back(entry->second.second);
+    }
+  }
+  if (!missing_features.empty()) {
+    string errmsg = "Missing CPU features: ";
+    errmsg.append(str_util::Join(missing_features, ", "));
+    return Status(errors::Code::FAILED_PRECONDITION, errmsg);
+  }
+  return Status::OK();
+}
+
 void LoadDynamicKernelsInternal() {
   Env* env = Env::Default();
   string bazel_kernel_dir = io::JoinPath(env->GetRunfilesDir(),
@@ -952,12 +995,18 @@ void LoadDynamicKernelsInternal() {
   Status s_kernel_dir = env->GetChildren(bazel_kernel_dir, &files);
   if (s_kernel_dir.ok()) {
     string dll_spec = io::JoinPath(bazel_kernel_dir, kKernelLibPattern);
-    for (const auto&  file : files) {
-      string fullpath =  io::JoinPath(bazel_kernel_dir, file);
+    for (const auto& file : files) {
+      string fullpath = io::JoinPath(bazel_kernel_dir, file);
       if (env->MatchPath(fullpath, dll_spec)) {
-        // TODO(gunan): Store the handles to the opened files.
-        void* unused_filehandle;
-        TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
+        Status s = IsProbablySafeToLoad(fullpath);
+        if (s.ok()) {
+          // TODO(gunan): Store the handles to the opened files.
+          void* unused_filehandle;
+          TF_CHECK_OK(env->LoadLibrary(fullpath.c_str(), &unused_filehandle));
+        } else {
+          LOG(WARNING) << "Not loading plugin library " << fullpath << ": "
+                       << s.error_message();
+        }
       }
     }
   }
@@ -1078,7 +1127,8 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
         FormatNodeDefForError(node_def));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
@@ -1091,7 +1141,7 @@ Status FindKernelDef(const DeviceType& device_type, const NodeDef& node_def,
 
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types) {
+    PrioritizedDeviceTypeVector* prioritized_device_types) {
   // TODO(zhifengc): Changes the callers (SimplePlacer and
   // DynamicPlacer) to consider the possibility that 'def' is call to
   // a user-defined function and only calls this
@@ -1104,12 +1154,21 @@ Status SupportedDeviceTypesForNode(
       bool was_attr_mismatch;
       TF_RETURN_IF_ERROR(
           FindKernelRegistration(device_type, def, &reg, &was_attr_mismatch));
-      if (reg != nullptr) device_types->push_back(device_type);
+      if (reg != nullptr) {
+        int32 priority = reg->def.priority();
+        prioritized_device_types->emplace_back(device_type, priority);
+      }
     }
+    std::sort(prioritized_device_types->begin(),
+              prioritized_device_types->end(),
+              [](const std::pair<DeviceType, int32>& a,
+                 const std::pair<DeviceType, int32>& b) {
+                return a.second > b.second;
+              });
   } else {
     // Assumes that all device types support this node.
     for (const DeviceType& device_type : prioritized_types) {
-      device_types->push_back(device_type);
+      prioritized_device_types->push_back(std::make_pair(device_type, 0));
     }
   }
   return Status::OK();
@@ -1204,7 +1263,8 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
                               FormatNodeDefForError(node_def)));
     if (was_attr_mismatch) {
       errors::AppendToMessage(
-          &s, " (OpKernel was found, but attributes didn't match)");
+          &s, " (OpKernel was found, but attributes didn't match) ",
+          "Requested Attributes: ", SummarizeAttrs(node_def));
     }
     errors::AppendToMessage(
         &s, ".  Registered:", KernelsRegisteredForOp(node_def.op()));
diff --git a/tensorflow/core/framework/op_kernel.h b/tensorflow/core/framework/op_kernel.h
index 37a437136bd79d63e1f6853f3098fd7240dfecb8..19a0c5e5be2e8cbb16d55db21d4d425d9add2974 100644
--- a/tensorflow/core/framework/op_kernel.h
+++ b/tensorflow/core/framework/op_kernel.h
@@ -1237,7 +1237,7 @@ Status CreateOpKernel(DeviceType device_type, DeviceBase* device,
 //           * def has all attrs specified (e.g. using AddDefaultsToNodeDef()).
 Status SupportedDeviceTypesForNode(
     const std::vector<DeviceType>& prioritized_types, const NodeDef& def,
-    DeviceTypeVector* device_types);
+    PrioritizedDeviceTypeVector* device_types);
 
 // Returns a message with a description of the kernels registered for op
 // `op_name`.
@@ -1527,6 +1527,7 @@ T* OpKernelContext::op_device_context() {
 
 template <typename T>
 T* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   static_assert(std::is_base_of<DeviceContext, T>::value,
@@ -1535,6 +1536,7 @@ T* OpKernelContext::input_device_context(int index) {
 }
 
 inline DeviceContext* OpKernelContext::input_device_context(int index) {
+  DCHECK_NE(params_->input_device_contexts, nullptr);
   DCHECK_GE(index, 0);
   DCHECK_LT(index, params_->input_device_contexts->size());
   return (*params_->input_device_contexts)[index];
diff --git a/tensorflow/core/framework/op_kernel_test.cc b/tensorflow/core/framework/op_kernel_test.cc
index 83dda6579b784be538f45d9c95be57d412f49668..d8001cd07103f01c57480b62f3d40ff40514af88 100644
--- a/tensorflow/core/framework/op_kernel_test.cc
+++ b/tensorflow/core/framework/op_kernel_test.cc
@@ -102,6 +102,27 @@ REGISTER_OP("Test4").Input("i: float").Output("o: float");
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_CPU), DummyKernel);
 REGISTER_KERNEL_BUILDER(Name("Test4").Device(DEVICE_GPU), DummyKernel);
 
+// Kernels with different priorities.
+REGISTER_OP("Test5").Input("a: T").Input("b: T").Attr("T: type");
+
+class TestOp5Cpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Cpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_CPU).Priority(2),
+                        TestOp5Cpu);
+
+class TestOp5Gpu : public tensorflow::OpKernel {
+ public:
+  explicit TestOp5Gpu(OpKernelConstruction* context) : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {}
+};
+
+REGISTER_KERNEL_BUILDER(Name("Test5").Device(DEVICE_GPU).Priority(1),
+                        TestOp5Gpu);
+
 static std::vector<DeviceType> DeviceTypes() {
   return {DeviceType(DEVICE_GPU), DeviceType(DEVICE_CPU)};
 }
@@ -185,10 +206,10 @@ TEST_F(OpKernelTest, SuccessBothCpuAndGpu) {
 
 TEST_F(OpKernelTest, CpuTypeRegistered) {
   NodeDef ndef = CreateNodeDef("Test1", {DT_FLOAT, DT_INT32});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
   EXPECT_EQ(1, devs.size());
-  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+  EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
 }
 
 TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
@@ -196,24 +217,24 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
     // Try a node def of an op that is registered for a specific type
     // only on CPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_INT8, DT_INT8});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
   }
   {
     // Try a node def of an op that is registered for a specific type
     // only on GPU.
     NodeDef ndef = CreateNodeDef("Test3", {DT_FLOAT, DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(1, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
   }
   {
     // Try a node def of an op that is only registered for other types.
     NodeDef ndef = CreateNodeDef("Test3", {DT_STRING, DT_STRING});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(0, devs.size());
   }
@@ -221,11 +242,23 @@ TEST_F(OpKernelTest, CpuAndGpuTypeRegistered) {
   {
     // Try a node def of an op that is registered for both.
     NodeDef ndef = CreateNodeDef("Test4", {DT_FLOAT});
-    DeviceTypeVector devs;
+    PrioritizedDeviceTypeVector devs;
+    TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
+    EXPECT_EQ(2, devs.size());
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0].first);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1].first);
+  }
+
+  {
+    // Try a node def of an op where kernels have priorities.
+    NodeDef ndef = CreateNodeDef("Test5", {DT_STRING, DT_STRING});
+    PrioritizedDeviceTypeVector devs;
     TF_ASSERT_OK(SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs));
     EXPECT_EQ(2, devs.size());
-    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[0]);
-    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[1]);
+    EXPECT_EQ(DeviceType(DEVICE_CPU), devs[0].first);
+    EXPECT_EQ(2, devs[0].second);
+    EXPECT_EQ(DeviceType(DEVICE_GPU), devs[1].first);
+    EXPECT_EQ(1, devs[1].second);
   }
 }
 
@@ -412,11 +445,11 @@ class OpKernelBuilderTest : public ::testing::Test {
     }
 
     // Test SupportedDeviceTypesForNode()
-    DeviceTypeVector devices;
+    PrioritizedDeviceTypeVector devices;
     TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
     bool found = false;
-    for (const DeviceType& dt : devices) {
-      if (dt == device_type) {
+    for (const auto& dt : devices) {
+      if (dt.first == device_type) {
         found = true;
       }
     }
@@ -445,11 +478,11 @@ class OpKernelBuilderTest : public ::testing::Test {
       EXPECT_EQ(code, status.code());
 
       // Test SupportedDeviceTypesForNode().
-      DeviceTypeVector devices;
+      PrioritizedDeviceTypeVector devices;
       if (errors::IsNotFound(status)) {
         TF_EXPECT_OK(SupportedDeviceTypesForNode(DeviceTypes(), def, &devices));
-        for (const DeviceType& dt : devices) {
-          EXPECT_NE(dt, device_type);
+        for (const auto& dt : devices) {
+          EXPECT_NE(dt.first, device_type);
         }
       } else {
         Status status2 =
@@ -562,7 +595,7 @@ REGISTER_KERNEL_BUILDER(Name("DuplicateKernel").Device(DEVICE_CPU),
 
 TEST_F(OpKernelBuilderTest, DuplicateKernel) {
   const NodeDef ndef = CreateNodeDef("DuplicateKernel", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -582,7 +615,7 @@ REGISTER_KERNEL_BUILDER(
 TEST_F(OpKernelBuilderTest, DuplicateKernelForT) {
   const NodeDef ndef =
       CreateNodeDef("DuplicateKernelForT", {"T|type|DT_FLOAT"});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(str_util::StrContains(
@@ -603,7 +636,7 @@ REGISTER_KERNEL_BUILDER(Name("BadConstraint")
 
 TEST_F(OpKernelBuilderTest, BadConstraint) {
   const NodeDef ndef = CreateNodeDef("BadConstraint", {});
-  DeviceTypeVector devs;
+  PrioritizedDeviceTypeVector devs;
   Status status = SupportedDeviceTypesForNode(DeviceTypes(), ndef, &devs);
   ASSERT_FALSE(status.ok());
   EXPECT_TRUE(
diff --git a/tensorflow/core/framework/resource_mgr.cc b/tensorflow/core/framework/resource_mgr.cc
index 508a8d3149b9f614afc900b528ae5777d0d2f5fc..9f3204ab96050a1cc06ab3052741f0044369b83e 100644
--- a/tensorflow/core/framework/resource_mgr.cc
+++ b/tensorflow/core/framework/resource_mgr.cc
@@ -204,12 +204,19 @@ Status ResourceMgr::Delete(const ResourceHandle& handle) {
 }
 
 Status ResourceMgr::Cleanup(const string& container) {
+  {
+    tf_shared_lock l(mu_);
+    if (!gtl::FindOrNull(containers_, container)) {
+      // Nothing to cleanup.
+      return Status::OK();
+    }
+  }
   Container* b = nullptr;
   {
     mutex_lock l(mu_);
     auto iter = containers_.find(container);
     if (iter == containers_.end()) {
-      // Nothing to cleanup, it's OK.
+      // Nothing to cleanup, it's OK (concurrent cleanup).
       return Status::OK();
     }
     b = iter->second;
diff --git a/tensorflow/core/framework/tensor.cc b/tensorflow/core/framework/tensor.cc
index c7ddc6c21eda7af94379b07ab3dff8a25021665e..7e841489eb35d4ec3d18fe255472107ef9d60efe 100644
--- a/tensorflow/core/framework/tensor.cc
+++ b/tensorflow/core/framework/tensor.cc
@@ -68,7 +68,8 @@ namespace {
 // An un-templated base class for Buffer.
 class BufferBase : public TensorBuffer {
  public:
-  explicit BufferBase(Allocator* alloc) : alloc_(alloc) {}
+  explicit BufferBase(Allocator* alloc, void* data_ptr)
+      : TensorBuffer(data_ptr), alloc_(alloc) {}
 
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -106,7 +107,6 @@ class Buffer : public BufferBase {
   Buffer(Allocator* a, int64 n);
   Buffer(Allocator* a, int64 n, const AllocationAttributes& allocation_attr);
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
 
  private:
@@ -442,20 +442,20 @@ struct ProtoHelper<Eigen::half> {
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n)
-    : BufferBase(a), data_(a->Allocate<T>(n)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::Buffer(Allocator* a, int64 n,
                   const AllocationAttributes& allocation_attr)
-    : BufferBase(a), data_(a->Allocate<T>(n, allocation_attr)), elem_(n) {}
+    : BufferBase(a, a->Allocate<T>(n, allocation_attr)), elem_(n) {}
 
 template <typename T>
 Buffer<T>::~Buffer() {
-  if (data_) {
+  if (data()) {
     if (LogMemory::IsEnabled()) {
       RecordDeallocation();
     }
-    alloc_->Deallocate<T>(data_, elem_);
+    alloc_->Deallocate<T>(static_cast<T*>(data()), elem_);
   }
 }
 
@@ -764,7 +764,9 @@ class SubBuffer : public TensorBuffer {
  public:
   // This buffer is an alias to buf[delta, delta + n).
   SubBuffer(TensorBuffer* buf, int64 delta, int64 n)
-      : root_(buf->root_buffer()), data_(buf->base<T>() + delta), elem_(n) {
+      : TensorBuffer(buf->base<T>() + delta),
+        root_(buf->root_buffer()),
+        elem_(n) {
     // Sanity check. The caller should ensure the sub buffer is valid.
     CHECK_LE(root_->base<T>(), this->base<T>());
     T* root_limit = root_->base<T>() + root_->size() / sizeof(T);
@@ -775,7 +777,6 @@ class SubBuffer : public TensorBuffer {
     root_->Ref();
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return sizeof(T) * elem_; }
   TensorBuffer* root_buffer() override { return root_; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
diff --git a/tensorflow/core/framework/tensor.h b/tensorflow/core/framework/tensor.h
index 0d58ab3875a938c4b7cbd73f24985f45211e0e25..6e03cf9f6f47c89289ffaec507f56d8c734e52a9 100644
--- a/tensorflow/core/framework/tensor.h
+++ b/tensorflow/core/framework/tensor.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_FRAMEWORK_TENSOR_H_
 
 #include <cstdint>
+#include <type_traits>
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/tensor_shape.h"
@@ -29,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/lib/gtl/inlined_vector.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mem.h"
 #include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
@@ -118,7 +120,7 @@ class Tensor {
 
   class HostScalarTensorBufferBase;
   template <typename T>
-  class HostScalarTensorBuffer;
+  struct ValueAndTensorBuffer;
 
   // Creates a tensor with the given scalar `value` in CPU memory.
   template <typename T>
@@ -634,10 +636,15 @@ class Tensor {
 // Interface to access the raw ref-counted data buffer.
 class TensorBuffer : public core::RefCounted {
  public:
+  explicit TensorBuffer(void* data_ptr) : data_(data_ptr) {}
   ~TensorBuffer() override {}
 
   // data() points to a memory region of size() bytes.
-  virtual void* data() const = 0;
+  //
+  // NOTE(mrry): The `data()` method is not virtual for performance reasons.
+  // It can be called multiple times when the contents of a `Tensor` are
+  // accessed, and so making it non-virtual allows the body to be inlined.
+  void* data() const { return data_; }
   virtual size_t size() const = 0;
 
   // If this TensorBuffer is sub-buffer of another TensorBuffer,
@@ -655,6 +662,9 @@ class TensorBuffer : public core::RefCounted {
 
   // Whether this TensorBuffer owns the underlying memory.
   virtual bool OwnsMemory() const { return true; }
+
+ private:
+  void* const data_;
 };
 
 template <typename T>
@@ -872,41 +882,75 @@ inline Tensor::Tensor(Tensor&& other)
 
 class Tensor::HostScalarTensorBufferBase : public TensorBuffer {
  public:
+  using TensorBuffer::TensorBuffer;
   void FillAllocationDescription(AllocationDescription* proto) const final;
 };
 
-// `Tensor::HostScalarTensorBuffer<T>` is a specialized `TensorBuffer`
-// implementation for storing a single scalar value.
-//
-// TODO(mrry): Evaluate other compilers or approaches to aligning the value
-// so that it can be used directly as a tensor value. For example, in a C++17
-// future, we could use `alignas(EIGEN_MAX_ALIGN_BYTES)` to store the value
-// inline in this object to save an allocation. However, this is not currently
-// widely supported in our compilers.
+// A packed representation for a single scalar value of type `T`, and a
+// `TensorBuffer` implementation that describes (and manages the lifetime of)
+// that value.
 template <typename T>
-class Tensor::HostScalarTensorBuffer : public HostScalarTensorBufferBase {
- public:
-  HostScalarTensorBuffer(T&& value)
-      : data_(reinterpret_cast<T*>(cpu_allocator()->AllocateRaw(
-            EIGEN_MAX_ALIGN_BYTES, sizeof(value)))) {
-    if (is_simple_type<T>::value) {
-      *data_ = value;
-    } else {
-      new (data_) T(std::move(value));
+struct Tensor::ValueAndTensorBuffer {
+  class HostScalarTensorBuffer : public Tensor::HostScalarTensorBufferBase {
+   public:
+    HostScalarTensorBuffer(void* data) : HostScalarTensorBufferBase(data) {}
+    size_t size() const final { return sizeof(T); }
+    TensorBuffer* root_buffer() final { return this; }
+
+    // Override `operator delete` so that calling `delete this` in
+    // `core::Refcounted::Unref()` for an object of this type will free
+    // the enclosing `ValueAndTensorBuffer` for the tensor buffer.
+    //
+    // NOTE(mrry): The definition of this method must be outside the class
+    // definition in order to satisfy some compilers.
+    static void operator delete(void* ptr);
+
+    static void operator delete(void*, void*) {
+      // Some compilers require an overridden class-specific deallocation
+      // function, which will be called if placement `new` throws an
+      // exception.
     }
-  }
-  ~HostScalarTensorBuffer() { cpu_allocator()->Deallocate(data_, 1); }
-  void* data() const final { return const_cast<T*>(data_); }
-  size_t size() const final { return sizeof(*data_); }
-  TensorBuffer* root_buffer() final { return this; }
 
- private:
-  T* const data_;
+   private:
+    ~HostScalarTensorBuffer() override { static_cast<T*>(data())->~T(); }
+  };
+
+  T value;
+  HostScalarTensorBuffer tensor_buffer;
 };
 
+/* static */
+template <typename T>
+void Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer::operator delete(
+    void* ptr) {
+  // Use a dummy object to compute to offset of
+  // `ValueAndTensorBuffer::tensor_buffer`, because `offsetof()` is not
+  // necessarily defined on this non-POD type (until C++17).
+  //
+  // NOTE(mrry): Using `sizeof(Tensor::ValueAndTensorBuffer<T>)` here requires
+  // us to define this method outside the class definition, so that it is not
+  // considered an incomplete type.
+  typename std::aligned_storage<sizeof(Tensor::ValueAndTensorBuffer<T>),
+                                alignof(Tensor::ValueAndTensorBuffer<T>)>::type
+      dummy_storage_;
+  Tensor::ValueAndTensorBuffer<T>* dummy_object =
+      reinterpret_cast<Tensor::ValueAndTensorBuffer<T>*>(&dummy_storage_);
+  intptr_t offset = reinterpret_cast<intptr_t>(&dummy_object->tensor_buffer) -
+                    reinterpret_cast<intptr_t>(dummy_object);
+
+  port::AlignedFree(static_cast<char*>(ptr) - offset);
+}
+
 template <typename T>
-Tensor::Tensor(T value, host_scalar_tag tag)
-    : buf_(new HostScalarTensorBuffer<T>(std::move(value))) {
+Tensor::Tensor(T value, host_scalar_tag tag) {
+  auto* value_and_buf = static_cast<Tensor::ValueAndTensorBuffer<T>*>(
+      port::AlignedMalloc(sizeof(typename Tensor::ValueAndTensorBuffer<T>),
+                          EIGEN_MAX_ALIGN_BYTES));
+  new (&value_and_buf->value) T(std::move(value));
+  new (&value_and_buf->tensor_buffer)
+      typename Tensor::ValueAndTensorBuffer<T>::HostScalarTensorBuffer(
+          value_and_buf);
+  buf_ = &value_and_buf->tensor_buffer;
   set_dtype(DataTypeToEnum<T>::value);
 }
 
diff --git a/tensorflow/core/framework/tensor_test.cc b/tensorflow/core/framework/tensor_test.cc
index 925ebc49454d4e448c86a640ac0312dcb0159fb9..713f91fe04c6fe498209d88193f6fbb1729ec57c 100644
--- a/tensorflow/core/framework/tensor_test.cc
+++ b/tensorflow/core/framework/tensor_test.cc
@@ -854,15 +854,18 @@ TEST(Tensor_HostScalar, Basics) {
     EXPECT_FLOAT_EQ(42.0f, Tt());
   }
   {
-    Tensor t("foo");
+    // NOTE(mrry): Use long enough strings so that the contents are dynamically
+    // allocated, and the absence of a call to the string destructor would
+    // cause a memory leak.
+    Tensor t("fooooooooooooooooooooooooooooooooooooo");
     EXPECT_EQ(DT_STRING, t.dtype());
     EXPECT_EQ(1, t.NumElements());
     auto Tt = t.scalar<string>();
     EXPECT_EQ(1, Tt.size());
     EXPECT_EQ(0, Tt.rank());
-    EXPECT_EQ("foo", Tt());
-    Tt() = "bar";
-    EXPECT_EQ("bar", Tt());
+    EXPECT_EQ("fooooooooooooooooooooooooooooooooooooo", Tt());
+    Tt() = "baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar";
+    EXPECT_EQ("baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaar", Tt());
   }
 }
 
@@ -1488,5 +1491,26 @@ void BM_CreateAndMoveCtrWithBuf(int iters) {
 }
 BENCHMARK(BM_CreateAndMoveCtrWithBuf);
 
+// Benchmark creating and destroy a host-scalar tensor, using the allocator
+// interface.
+void BM_CreateAndDestroyHostScalarNonOptimized(int iters) {
+  TensorShape shape({});
+  Allocator* allocator = cpu_allocator();
+  while (--iters) {
+    Tensor a(allocator, DT_FLOAT, shape);
+    a.scalar<float>()() = 37.0;
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarNonOptimized);
+
+// Benchmark creating and destroy a host-scalar tensor, using the specialized
+// constructor.
+void BM_CreateAndDestroyHostScalarOptimized(int iters) {
+  while (--iters) {
+    Tensor a(37.0);
+  }
+}
+BENCHMARK(BM_CreateAndDestroyHostScalarOptimized);
+
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/framework/types.h b/tensorflow/core/framework/types.h
index a05dea19ec425de6d15df6bcb08ae62d4ab2017b..c0df19334210bb0830371d3d5c2fc4edd0d297bc 100644
--- a/tensorflow/core/framework/types.h
+++ b/tensorflow/core/framework/types.h
@@ -104,6 +104,8 @@ typedef gtl::InlinedVector<DataType, 4> DataTypeVector;
 typedef gtl::ArraySlice<DataType> DataTypeSlice;
 
 typedef gtl::InlinedVector<DeviceType, 4> DeviceTypeVector;
+typedef gtl::InlinedVector<std::pair<DeviceType, int32>, 4>
+    PrioritizedDeviceTypeVector;
 
 // Convert the enums to strings for errors:
 string DataTypeString(DataType dtype);
diff --git a/tensorflow/core/graph/edgeset.cc b/tensorflow/core/graph/edgeset.cc
index 02315a3e27b9e855610a2f01b7cf8d39995561f0..2e0c67146169d4b0fe3bbb548c70451b2b1907b9 100644
--- a/tensorflow/core/graph/edgeset.cc
+++ b/tensorflow/core/graph/edgeset.cc
@@ -37,7 +37,7 @@ std::pair<EdgeSet::const_iterator, bool> EdgeSet::insert(value_type value) {
       }
     }
     // array is full. convert to set.
-    s = new gtl::FlatSet<const Edge*>;
+    s = new std::set<const Edge*>;
     for (int i = 0; i < kInline; i++) {
       s->insert(static_cast<const Edge*>(ptrs_[i]));
     }
diff --git a/tensorflow/core/graph/edgeset.h b/tensorflow/core/graph/edgeset.h
index 2776c8491c2b3fbccfb4c2a2f28aa7d677072e4a..0a1ee5a666cbd0d1978c075f75ab688223355f78 100644
--- a/tensorflow/core/graph/edgeset.h
+++ b/tensorflow/core/graph/edgeset.h
@@ -17,18 +17,17 @@ limitations under the License.
 #define TENSORFLOW_GRAPH_EDGESET_H_
 
 #include <stddef.h>
-
-#include "tensorflow/core/lib/gtl/flatset.h"
-#include "tensorflow/core/platform/logging.h"
+#include <set>
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
+
+#include "tensorflow/core/platform/logging.h"
 namespace tensorflow {
 
 class Edge;
 
 // An unordered set of edges.  Uses very little memory for small sets.
-// Unlike gtl::FlatSet, EdgeSet does NOT allow mutations during
-// iteration.
+// Unlike std::set, EdgeSet does NOT allow mutations during iteration.
 class EdgeSet {
  public:
   EdgeSet();
@@ -55,15 +54,12 @@ class EdgeSet {
  private:
   // Up to kInline elements are stored directly in ptrs_ (nullptr means none).
   // If ptrs_[0] == this then ptrs_[1] points to a set<const Edge*>.
-  // kInline must be >= 2, and is chosen such that ptrs_ fills a 64 byte
-  // cacheline.
-  static constexpr int kInline = 64 / sizeof(const void*);
+  static const int kInline = 4;  // Must be >= 2.
   const void* ptrs_[kInline];
 
-  gtl::FlatSet<const Edge*>* get_set() const {
+  std::set<const Edge*>* get_set() const {
     if (ptrs_[0] == this) {
-      return static_cast<gtl::FlatSet<const Edge*>*>(
-          const_cast<void*>(ptrs_[1]));
+      return static_cast<std::set<const Edge*>*>(const_cast<void*>(ptrs_[1]));
     } else {
       return nullptr;
     }
@@ -103,7 +99,7 @@ class EdgeSet::const_iterator {
   friend class EdgeSet;
 
   void const* const* array_iter_ = nullptr;
-  typename gtl::FlatSet<const Edge*>::const_iterator tree_iter_;
+  typename std::set<const Edge*>::const_iterator tree_iter_;
 
 #ifdef NDEBUG
   inline void Init(const EdgeSet* e) {}
diff --git a/tensorflow/core/graph/graph.cc b/tensorflow/core/graph/graph.cc
index b9fceb6a31b31237145f78582d459095a6560b19..550e3ef915290c499c904c14e2ca8c5fa7e4a981 100644
--- a/tensorflow/core/graph/graph.cc
+++ b/tensorflow/core/graph/graph.cc
@@ -285,6 +285,14 @@ Status Node::input_node(int idx, const Node** const_n) const {
   return Status::OK();
 }
 
+Status Node::input_tensor(int idx, OutputTensor* t) const {
+  const Edge* e;
+  TF_RETURN_IF_ERROR(input_edge(idx, &e));
+  DCHECK(e != nullptr);
+  *t = OutputTensor(e->src(), e->src_output());
+  return Status::OK();
+}
+
 // InputTensor
 
 bool InputTensor::operator==(const InputTensor& other) const {
@@ -540,6 +548,22 @@ Status Graph::UpdateEdge(Node* new_src, int new_src_index, Node* dst,
   return Status::OK();
 }
 
+Status Graph::AddWhileInputHack(Node* new_src, int new_src_index, Node* dst) {
+  if (dst->type_string() != "While") {
+    return errors::Internal(
+        "dst argument to AddWhileEdgeHack should be a While op, got: ",
+        dst->DebugString());
+  }
+  TF_RETURN_IF_ERROR(IsValidOutputTensor(new_src, new_src_index));
+  int dst_index = dst->in_edges().size();
+  TF_RETURN_IF_ERROR(IsValidInputTensor(dst, dst_index));
+  AddEdge(new_src, new_src_index, dst, dst_index);
+  dst->MaybeCopyOnWrite();
+  dst->props_->node_def.add_input(
+      strings::StrCat(new_src->name(), ":", new_src_index));
+  return Status::OK();
+}
+
 Status Graph::AddFunctionLibrary(const FunctionDefLibrary& fdef_lib) {
   // Need a new-enough consumer to support the functions we add to the graph.
   if (fdef_lib.function_size() > 0 && versions_->min_consumer() < 12) {
diff --git a/tensorflow/core/graph/graph.h b/tensorflow/core/graph/graph.h
index 6a224ca4a2317fecc47ec6fabb90da670a3fe19e..667eaba24c3341cbafc68c92ac5e9fa23dbe669d 100644
--- a/tensorflow/core/graph/graph.h
+++ b/tensorflow/core/graph/graph.h
@@ -59,12 +59,13 @@ class EdgeSetTest;
 class Graph;
 class GraphDef;
 class Node;
+struct OutputTensor;
 class VersionDef;
 class WhileContext;
 
 class NeighborIter;    // Declared below
 class NodeIter;        // Declared below
-class NodeProperties;  // Defined in .cc
+struct NodeProperties;  // Defined in .cc
 
 class Node {
  public:
@@ -189,6 +190,10 @@ class Node {
   Status input_node(int idx, const Node** n) const;
   Status input_node(int idx, Node** n) const;
 
+  // Returns into '*t' the idx-th input tensor of this node, represented as the
+  // output tensor of input_node(idx).
+  Status input_tensor(int idx, OutputTensor* t) const;
+
   WhileContext* while_ctx() const { return while_ctx_; }
   void set_while_ctx(WhileContext* while_ctx) {
     DCHECK(IsExit());
@@ -287,10 +292,10 @@ class Node {
 
 // Represents an input of a node, i.e., the `index`-th input to `node`.
 struct InputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  InputTensor(const Node* n, int i) : node(n), index(i) {}
+  InputTensor(Node* n, int i) : node(n), index(i) {}
   InputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this InputTensor is identical to 'other'. Nodes are
@@ -308,10 +313,10 @@ struct InputTensor {
 // that a single `OutputTensor` can correspond to multiple `Edge`s if the output
 // is consumed by multiple destination nodes.
 struct OutputTensor {
-  const Node* node;
+  Node* node;
   int index;
 
-  OutputTensor(const Node* n, int i) : node(n), index(i) {}
+  OutputTensor(Node* n, int i) : node(n), index(i) {}
   OutputTensor() : node(nullptr), index(0) {}
 
   // Returns true if this OutputTensor is identical to 'other'. Nodes are
@@ -488,11 +493,17 @@ class Graph {
   // the corresponding NodeDef to reflect the change.
   // REQUIRES: The control edge must exist.
   void RemoveControlEdge(const Edge* e);
+
   // Updates the input to a node.  The existing edge to `dst` is removed and an
   // edge from `new_src` to `dst` is created. The NodeDef associated with `dst`
   // is also updated.
   Status UpdateEdge(Node* new_src, int new_src_index, Node* dst, int dst_index);
 
+  // Like AddEdge but updates dst's NodeDef. Used to add an input edge to a
+  // "While" op during gradient construction, see AddInputWhileHack in
+  // python_api.h for more details.
+  Status AddWhileInputHack(Node* new_src, int new_src_index, Node* dst);
+
   // Adds the function and gradient definitions in `fdef_lib` to this graph's op
   // registry. Ignores duplicate functions, and returns a bad status if an
   // imported function differs from an existing function or op with the same
diff --git a/tensorflow/core/graph/graph_partition.cc b/tensorflow/core/graph/graph_partition.cc
index 1dbcebab598c7230008ab61e1094229bde76b757..9c640c42a5891b632e18517c848cc9a0c76a0f45 100644
--- a/tensorflow/core/graph/graph_partition.cc
+++ b/tensorflow/core/graph/graph_partition.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include <utility>
 #include <vector>
 
+#include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/memory_types.h"
 #include "tensorflow/core/framework/node_def_builder.h"
 #include "tensorflow/core/framework/tensor.pb.h"
@@ -1186,7 +1187,8 @@ Status Partition(const PartitionOptions& opts, Graph* g,
   for (auto& it : *partitions) {
     GraphDef* gdef = &it.second;
     *gdef->mutable_versions() = g->versions();
-    *gdef->mutable_library() = flib_def->ToProto();
+    // Prune unreachable functions from `flib_def` before adding them to `gdef`.
+    *gdef->mutable_library() = flib_def->ReachableDefinitions(*gdef).ToProto();
 
     // Traverse the graph to fill every send/recv op's incarnation
     // information.
diff --git a/tensorflow/core/graph/graph_partition_test.cc b/tensorflow/core/graph/graph_partition_test.cc
index f44ed47a6e94acdce66c36902cbcf2fdfb041447..29d8034d2a14b6fa2c49b5fa65cb409209b29944 100644
--- a/tensorflow/core/graph/graph_partition_test.cc
+++ b/tensorflow/core/graph/graph_partition_test.cc
@@ -470,13 +470,19 @@ TEST_F(GraphPartitionTest, Functions) {
   ConstructOp(in_.WithOpName("A2"), "XTimesTwo", {a1});
   ConstructOp(in_.WithOpName("B2"), "XTimesFour", {b1});
 
+  // The `Partition()` helper function uses the first letter of the op name ('A'
+  // or 'B') to choose a device for each node.
   Partition(ToGraphDef(), &partitions_);
   EXPECT_EQ(2, partitions_.size());
 
-  // Test that partition graphs inherit function library from original graph
+  // Test that partition graphs inherit function library from original graph.
   string a = "/job:a/replica:0/task:0/cpu:0";
   string b = "/job:a/replica:0/task:0/cpu:1";
-  ExpectFunctions(partitions_[a].library(), {"XTimesTwo", "XTimesFour"});
+
+  // Node "A2" is placed in part `a`, and uses only "XTimesTwo".
+  ExpectFunctions(partitions_[a].library(), {"XTimesTwo"});
+  // Node "B2" is placed in part `b`, and uses both "XTimesFour" directly,
+  // and "XTimesTwo" in the body of "XTimesFour".
   ExpectFunctions(partitions_[b].library(), {"XTimesTwo", "XTimesFour"});
 }
 
diff --git a/tensorflow/core/graph/graph_test.cc b/tensorflow/core/graph/graph_test.cc
index e7762fd4147dfb880a76ba04b0a93cf1bf34922c..333c32567fc9b922951b558c86f29087da770894 100644
--- a/tensorflow/core/graph/graph_test.cc
+++ b/tensorflow/core/graph/graph_test.cc
@@ -799,44 +799,5 @@ BENCHMARK(BM_GraphCreation)->ArgPair(1 << 9, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 12, 16);
 BENCHMARK(BM_GraphCreation)->ArgPair(1 << 15, 16);
 
-static void BM_ToGraphDef(int iters, int num_nodes, int num_edges_per_node) {
-  testing::StopTiming();
-  const GraphDef graph_def = CreateGraphDef(num_nodes, num_edges_per_node);
-  const auto registry = OpRegistry::Global();
-  GraphConstructorOptions opts;
-  // Warmup step.
-  Graph graph(registry);
-  TF_CHECK_OK(ConvertGraphDefToGraph(opts, graph_def, &graph));
-  int64 sum = 0;
-  testing::StartTiming();
-  for (int i = 0; i < iters; ++i) {
-    GraphDef graph_def;
-    graph.ToGraphDef(&graph_def);
-    sum += graph_def.node_size();
-  }
-  VLOG(1) << sum;
-  testing::StopTiming();
-}
-BENCHMARK(BM_ToGraphDef)->ArgPair(10, 2);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 2);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 2);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 2);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 2);
-BENCHMARK(BM_ToGraphDef)->ArgPair(10, 4);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 4);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 4);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 4);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 4);
-BENCHMARK(BM_ToGraphDef)->ArgPair(10, 8);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 8);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 8);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 8);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 8);
-BENCHMARK(BM_ToGraphDef)->ArgPair(10, 16);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 6, 16);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 9, 16);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 12, 16);
-BENCHMARK(BM_ToGraphDef)->ArgPair(1 << 15, 16);
-
 }  // namespace
 }  // namespace tensorflow
diff --git a/tensorflow/core/graph/mkl_layout_pass.cc b/tensorflow/core/graph/mkl_layout_pass.cc
index 69735aac028a979d4deb2561fd0389cceb4a11de..52b46600943b31f4d0205d0eb120cc282c78240f 100644
--- a/tensorflow/core/graph/mkl_layout_pass.cc
+++ b/tensorflow/core/graph/mkl_layout_pass.cc
@@ -22,9 +22,12 @@ limitations under the License.
 #include <memory>
 #include <queue>
 #include <set>
+#include <stack>
+#include <tuple>
 #include <unordered_set>
 #include <utility>
 #include <vector>
+
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/optimization_registry.h"
 #include "tensorflow/core/framework/node_def_util.h"
@@ -271,6 +274,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.mkl_conv2d_with_bias = "_MklConv2DWithBias";
     csinfo_.mkl_conv2d_grad_filter_with_bias =
         "_MklConv2DBackpropFilterWithBias";
+    csinfo_.mkl_pad_with_conv2d = "_MklPadWithConv2D";
+    csinfo_.pad = "Pad";
+    csinfo_.pad_with_conv2d = "__MklDummyPadWithConv2D";
 // Temporarily don't convert quantized operators into MKL versions for now.
 // TODO(Intel-tf) Once all the relevant PRs have been merged then remove
 // the ifdef.
@@ -310,6 +316,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     csinfo_.slice = "Slice";
     csinfo_.softmax = "Softmax";
     csinfo_.split = "Split";
+    csinfo_.transpose = "Transpose";
     // Element-wise ops. Ensure you also add any new ops to IsOpElementWise
     // in the MklUtil.h (IsMklElementWiseOp method) to ensure that the
     // MklInputConversion op is added before it.
@@ -398,6 +405,8 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
                       CopyAttrsDataType, AlwaysRewrite});
     rinfo_.push_back({csinfo_.mul, mkl_op_registry::GetMklOpName(csinfo_.mul),
                       CopyAttrsDataType, AlwaysRewrite});
+    rinfo_.push_back({csinfo_.pad_with_conv2d, csinfo_.mkl_pad_with_conv2d,
+                      CopyAttrsPadWithConv2D, AlwaysRewrite});
 #ifdef INTEL_MKL_QUANTIZED
     rinfo_.push_back({csinfo_.quantized_avg_pool,
                       mkl_op_registry::GetMklOpName(csinfo_.quantized_avg_pool),
@@ -508,6 +517,44 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     minfo_.push_back({csinfo_.conv2d_grad_filter, csinfo_.bias_add_grad,
                       csinfo_.conv2d_grad_filter_with_bias,
                       GetConv2DBackpropFilterOrBiasAddGrad});
+    minfo_.push_back(
+        {csinfo_.pad, csinfo_.conv2d, csinfo_.pad_with_conv2d, GetPadOrConv2D});
+    // Merge Pad and Conv2d, only if the pad op is "Pad"
+    // Doesn't merge if pad op is "PadV2" or "MirrorPad"
+
+    // The fusion patterns in "finfo_" that show up first will get applied
+    // first, for example, graph "A->B->C-D" and finfo_ is {A->B->C to ABC,
+    // A->B->C->D to ABCD}, since the first gets applied first, the final
+    // graph will be ABC->D.
+
+    //
+    // Add rules to fuse sequences such as "Transpose (NCHW -> NHWC) + Conv2D
+    // (NHWC) + Transpose (NHWC->
+    // NCHW)" into "Conv2D (NCHW)". Such patterns occur frequently in Keras.
+    // Note: we use the term "merge" to combine (exactly) 2 nodes into one,
+    // while "fusion" is for 3+ nodes situation.
+    //
+
+    // Transpose + Conv2d + Transpose:
+    std::vector<int> transpose_to_nhwc = {NCHW::dim::N, NCHW::dim::H,
+                                          NCHW::dim::W, NCHW::dim::C};
+    std::vector<int> transpose_to_nchw = {NHWC::dim::N, NHWC::dim::C,
+                                          NHWC::dim::H, NHWC::dim::W};
+    auto CheckForTransposeToNHWC =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nhwc);
+    auto CheckForConv2dOp =
+        std::bind(CheckForMklOp, std::placeholders::_1, csinfo_.conv2d);
+    auto CheckForTransposeToNCHW =
+        std::bind(CheckForTranspose, std::placeholders::_1, transpose_to_nchw);
+    auto FuseConv2D =
+        std::bind(FuseTransposeMklOpTranspose, std::placeholders::_1,
+                  std::placeholders::_2, std::placeholders::_3, "NCHW");
+    finfo_.push_back(
+        {"transpose-elimination for Conv2D",
+         {CheckForTransposeToNHWC, CheckForConv2dOp, CheckForTransposeToNCHW},
+         // CheckForMklOp
+         FuseConv2D,
+         CopyAttrsConv});
   }
 
   // Standard interface to run pass
@@ -530,7 +577,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string name;      // Original name of op of the node in the graph
     string new_name;  // New name of the op of the node in the graph
     // A function handler to copy attributes from an old node to a new node.
-    std::function<void(const Node*, NodeBuilder*)> copy_attrs;
+    std::function<void(const Node*, NodeBuilder*, bool)> copy_attrs;
     // A rule under which to rewrite this node
     std::function<bool(const Node*)> rewrite_rule;
   } RewriteInfo;
@@ -560,6 +607,41 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     std::function<Node*(const Node*)> get_node_to_be_merged;
   } MergeInfo;
 
+  // Structure to specify information used in node fusion of 3+ operators
+  typedef struct {
+    std::string pattern_name;  // Name to describe this pattern, such as
+                               // "Transpose_Mklop_Transpose".
+    std::vector<std::function<bool(const Node*)> >
+        node_checkers;  // Extra restriction checker for these ops
+    std::function<Status(
+        std::unique_ptr<Graph>*, std::vector<Node*>&,
+        std::function<void(const Node*, NodeBuilder* nb, bool)>)>
+        fuse_func;
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs;
+  } FusionInfo;
+
+  //
+  // Dimension indices for 2D tensor.
+  //
+  struct NCHW {
+    enum dim { N = 0, C = 1, H = 2, W = 3 };
+  };
+
+  struct NHWC {
+    enum dim { N = 0, H = 1, W = 2, C = 3 };
+  };
+
+  //
+  // dimension indices for 3D tensor.
+  //
+  struct NCDHW {
+    enum dim { N = 0, C = 1, D = 2, H = 3, W = 4 };
+  };
+
+  struct NDHWC {
+    enum dim { N = 0, D = 1, H = 2, W = 3, C = 4 };
+  };
+
   /// Structure to store all constant strings
   /// NOTE: names are alphabetically sorted.
   typedef struct {
@@ -597,7 +679,10 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string mkl_conv2d_grad_filter;
     string mkl_conv2d_grad_filter_with_bias;
     string mkl_conv2d_with_bias;
+    string mkl_pad_with_conv2d;
     string mul;
+    string pad;
+    string pad_with_conv2d;
     string quantized_avg_pool;
     string quantized_conv2d;
     string quantized_conv2d_with_requantize;
@@ -619,6 +704,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     string requantize;
     string tanh;
     string tanh_grad;
+    string transpose;
     string reshape;
     string slice;
     string softmax;
@@ -637,6 +723,9 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   /// Maintain info about nodes to be merged
   std::vector<MergeInfo> minfo_;
 
+  /// Maintain info about nodes to be fused
+  std::vector<FusionInfo> finfo_;
+
   /// Maintain structure of constant strings
   static ConstStringsInfo csinfo_;
 
@@ -721,6 +810,7 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
 
   // Helper function to merge different nodes
   Status MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g, Node* m, Node* n);
+  Status MergePadWithConv2D(std::unique_ptr<Graph>* g, Node* m, Node* n);
   Status MergeConv2DBackpropFilterWithBiasAddGrad(std::unique_ptr<Graph>* g,
                                                   Node* m, Node* n);
 
@@ -758,6 +848,54 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Find Pad or Conv2D node that can be merged with input node 'm'.
+  // If input 'm' is Pad, then check if there exists Conv2D node that can be
+  // merged with 'm'. If input 'm' is Conv2D, then check if there exists Pad
+  // node that can be merged with 'm'.
+  static Node* GetPadOrConv2D(const Node* m) {
+    DCHECK(m);
+    Node* n = nullptr;
+
+    const Node* conv_node;
+    if (m->type_string() == csinfo_.pad) {
+      // If m is Pad, then Conv2D is the output of Pad.
+      for (const Edge* e : m->out_edges()) {
+        if (!e->IsControlEdge() && e->dst()->type_string() == csinfo_.conv2d) {
+          n = e->dst();
+          conv_node = n;
+          break;
+        }
+      }
+    } else {
+      DCHECK_EQ(m->type_string(), csinfo_.conv2d);
+      // If m is conv2D, Go over all input edges
+      // and search for Pad  Node.
+      for (const Edge* e : m->in_edges()) {
+        if (!e->IsControlEdge() && e->src()->type_string() == csinfo_.pad) {
+          n = e->src();
+          conv_node = m;
+          break;
+        }
+      }
+    }
+    // Check if only VALID type of padding is used
+    // or not.
+    if (n != nullptr) {
+      string padding;
+      TF_CHECK_OK(GetNodeAttr(conv_node->def(), "padding", &padding));
+      if (padding != "VALID")
+        // Then do not merge.
+        // Only VALID type of padding in conv op can be
+        // merged with Pad op.
+        n = nullptr;
+    } else {
+      VLOG(1) << "MklLayoutRewritePass: Could not find matching "
+              << "Pad and Conv2D node for merging. Input node: "
+              << m->DebugString();
+    }
+
+    return n;
+  }
   // Find Conv2DBackpropFilter or BiasAddGrad node that can be merged with input
   // node 'm'. If input 'm' is Conv2DBackpropFilter, then check if there exists
   // BiasAddGrad node that can be merged with 'm'. If input 'm' is BiasAddGrad,
@@ -815,6 +953,119 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
     return n;
   }
 
+  // Return a node that can be fused with input node 'n'
+  //
+  // @return tuple. If we can find such nodes, the first
+  // element of the tuple is a true. Otherwise, it's false.
+  std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+  CheckForNodeFusion(Node* n) const;
+
+  // Fuse nodes in the vector "nodes"
+  Status FuseNode(std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+                  const MklLayoutRewritePass::FusionInfo fi);
+
+  // Fuse tranpose(to "NHWC") + mklop("NHWC") + transpose(to "NCHW") into
+  // mklop("NCHW").
+  // Here "mklop" can be any MKL-DNN supported op, such as Conv2D.
+  static Status FuseTransposeMklOpTranspose(
+      std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+      std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+      string data_format);
+
+  static bool CheckForTranspose(const Node* node, std::vector<int> perm) {
+    // Check if node's type is "Transpose"
+    if (node->type_string() != "Transpose") return false;
+
+    // If "Transpose" has multiple output data edges, also don't fuse it.
+    if (node->num_outputs() > 1 || node->out_edges().size() > 1) return false;
+
+    // Check if has out control edge. If true, this is a training graph.
+    // Currently we focus on inference and do no fusion in training.
+    // Note: this constraint will eventually be removed, if we enabled this
+    // fusion for training
+    // in the future.
+    for (const Edge* e : node->out_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // If "Transpose" has input control edges, don't fuse on it.
+    for (const Edge* e : node->in_edges()) {
+      if (e->IsControlEdge()) {
+        return false;
+      }
+    }
+
+    // We compared the tensor containing the permutation order ("perm_node")
+    // with our desired order ("perm"). If they're exactly match, this check
+    // succeed and returns true.
+    for (const Edge* e : node->in_edges()) {
+      if (!e->IsControlEdge()) {
+        const Node* perm_node = e->src();
+
+        const int kPermTensorIndex = 1;
+        if (perm_node->type_string() == "Const" &&
+            e->dst_input() == kPermTensorIndex) {
+          // we find the "perm" node, now try to retrieve its value.
+          const TensorProto* proto = nullptr;
+          DCHECK(GetNodeAttr(perm_node->def(), "value", &proto).ok());
+
+          DataType type;
+          GetNodeAttr(perm_node->def(), "dtype", &type);
+
+          // Here we directly access to the "tensor_content", rather than
+          // "int_val". This is because we find "int_val" is
+          // not set properly under some circumstances.
+          if (type == DT_INT32) {
+            const int type_size = 4;
+            const int* tensor_content =
+                reinterpret_cast<const int*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<int> perm_value(tensor_content,
+                                        tensor_content + tensor_content_size);
+
+            return perm_value == perm;
+          } else if (type == DT_INT64) {
+            const int type_size = 8;
+            const long* tensor_content =
+                reinterpret_cast<const long*>(proto->tensor_content().c_str());
+            const int tensor_content_size =
+                proto->tensor_content().size() / type_size;
+
+            std::vector<long> perm_value(tensor_content,
+                                         tensor_content + tensor_content_size);
+            std::vector<long> long_perm(perm.cbegin(), perm.cend());
+
+            return perm_value == long_perm;
+          }
+          return false;
+        }
+      }
+    }
+    return false;
+  }
+
+  static bool CheckForMklOp(const Node* node, string name = "") {
+    if (node == nullptr) return false;
+
+    if (!name.empty() && node->type_string() != name) {
+      return false;
+    }
+
+    // if mklop has multiple outputs, don't fuse it.
+    if (node->num_outputs() > 1) return false;
+
+    if (node->out_edges().size() > 1) return false;
+
+    DataType T;
+    TF_CHECK_OK(GetNodeAttr(node->def(), "T", &T));
+    return mkl_op_registry::IsMklOp(
+        mkl_op_registry::GetMklOpName(node->type_string()), T);
+  }
+
   // Check if the node 'n' has any applicable rewrite rule
   // We check for 2 scenarios for rewrite.
   //
@@ -1070,22 +1321,43 @@ class MklLayoutRewritePass : public GraphOptimizationPass {
   // We need operator-specific function to copy attributes because the framework
   // does not provide any generic function for it.
   // NOTE: names are alphabetically sorted.
-  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb);
-  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb);
+  static void CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsBiasAddGrad(const Node* orig_node, NodeBuilder* nb,
+                                   bool change_format = false);
+  static void CopyAttrsConcat(const Node* orig_node, NodeBuilder* nb,
+                              bool change_format = false);
+  static void CopyAttrsConcatV2(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                            bool change_format = false);
+  static void CopyAttrsDataType(const Node* orig_node, NodeBuilder* nb,
+                                bool change_format = false);
+  static void CopyAttrsFusedBatchNorm(const Node* orig_node, NodeBuilder* nb,
+                                      bool change_format = false);
+  static void CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                           bool change_format = false);
+  static void CopyAttrsPadWithConv2D(const Node* orig_node, NodeBuilder* nb,
+                                     bool change_format = false);
+  static void CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                        const Node* orig_node2, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsPooling(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsQuantizedPooling(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format = false);
+  static void CopyAttrsQuantizedConv2D(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsQuantizedConcat(const Node* orig_node, NodeBuilder* nb,
+                                       bool change_format = false);
+  static void CopyAttrsReshape(const Node* orig_node, NodeBuilder* nb,
+                               bool change_format = false);
+  static void CopyAttrsRequantize(const Node* orig_node, NodeBuilder* nb,
+                                  bool change_format = false);
+  static void CopyAttrsSlice(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
+  static void CopyAttrsSplit(const Node* orig_node, NodeBuilder* nb,
+                             bool change_format = false);
 
   // Generate a graph node in graph 'g' representing a dummy Mkl tensor node,
   // using node for original node 'orig_node' and return it in '*out'.
@@ -1285,6 +1557,8 @@ int MklLayoutRewritePass::SetUpContiguousInputs(
     // 2nd input (slot 1) of _MklConv2D and _MklConv2DWithBias.
     for (const Edge* e : filter_node->out_edges()) {
       if ((e->dst()->type_string() == csinfo_.mkl_conv2d ||
+           // add check for mkl_pad_with_conv2d
+           e->dst()->type_string() == csinfo_.mkl_pad_with_conv2d ||
            e->dst()->type_string() == csinfo_.mkl_conv2d_with_bias) &&
           e->dst_input() == kConv2DFilterInputSlotIdx
           /* filter is 2nd input of Conv2D and _MklConv2D. */) {
@@ -1586,20 +1860,81 @@ void MklLayoutRewritePass::AddWorkSpaceEdgeIfNeeded(
 // Op-specific functions to copy attributes from old node to new node
 //////////////////////////////////////////////////////////////////////////
 
-void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
-                                         NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
   DataType T;
   string data_format;
   string padding;
   std::vector<int32> strides;
   std::vector<int32> dilations;
 
+  // Get all attributes from old node.
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("padding", padding);
+
+  if (!change_format) {
+    nb->Attr("strides", strides);
+    nb->Attr("dilations", dilations);
+
+    TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+    nb->Attr("data_format", data_format);
+  } else {
+    std::vector<int32> new_strides;
+    std::vector<int32> new_dilations;
+    if (strides.size() == 5) {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NDHWC" to "NCDHW".
+      new_strides = {strides[NDHWC::dim::N], strides[NDHWC::dim::C],
+                     strides[NDHWC::dim::D], strides[NDHWC::dim::H],
+                     strides[NDHWC::dim::W]};
+
+      new_dilations = {dilations[NDHWC::dim::N], dilations[NDHWC::dim::C],
+                       dilations[NDHWC::dim::D], dilations[NDHWC::dim::H],
+                       dilations[NDHWC::dim::W]};
+    } else {
+      // "strides" and "dilations" also need to be changed according to
+      // "data_format",
+      // in this case, is "NHWC" to "NCHW".
+
+      new_strides = {strides[NHWC::dim::N], strides[NHWC::dim::C],
+                     strides[NHWC::dim::H], strides[NHWC::dim::W]};
+
+      new_dilations = {dilations[NHWC::dim::N], dilations[NHWC::dim::C],
+                       dilations[NHWC::dim::H], dilations[NHWC::dim::W]};
+    }
+    nb->Attr("strides", new_strides);
+    nb->Attr("dilations", new_dilations);
+  }
+}
+
+// Used in rinfo when replacing __MklDummyPadWithConv2D by _MklPadWithConv2D
+void MklLayoutRewritePass::CopyAttrsPadWithConv2D(const Node* orig_node,
+                                                  NodeBuilder* nb,
+                                                  bool change_format) {
+  DataType Tpaddings;
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
+
   // Get all attributes from old node.
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "T", &T));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "strides", &strides));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "dilations", &dilations));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "padding", &padding));
   TF_CHECK_OK(GetNodeAttr(orig_node->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  TF_CHECK_OK(GetNodeAttr(orig_node->def(), "Tpaddings", &Tpaddings));
 
   // Add attributes to new node.
   nb->Attr("T", T);
@@ -1607,10 +1942,46 @@ void MklLayoutRewritePass::CopyAttrsConv(const Node* orig_node,
   nb->Attr("dilations", dilations);
   nb->Attr("padding", padding);
   nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
 }
 
-void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
-                                         NodeBuilder* nb) {
+// Used with MergePadWithConv2D
+void MklLayoutRewritePass::CopyAttrsFromPadAndConv2D(const Node* orig_node1,
+                                                     const Node* orig_node2,
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
+  DataType Tpaddings;
+  DataType T;
+  string data_format;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  bool use_cudnn_on_gpu;
+
+  // Get all attributes from old node 1.
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "T", &T));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "dilations", &dilations));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(orig_node1->def(), "data_format", &data_format));
+  TF_CHECK_OK(
+      GetNodeAttr(orig_node1->def(), "use_cudnn_on_gpu", &use_cudnn_on_gpu));
+  // Get all attributes from old node 2.
+  TF_CHECK_OK(GetNodeAttr(orig_node2->def(), "Tpaddings", &Tpaddings));
+
+  // Add attributes to new node.
+  nb->Attr("T", T);
+  nb->Attr("strides", strides);
+  nb->Attr("dilations", dilations);
+  nb->Attr("padding", padding);
+  nb->Attr("data_format", data_format);
+  nb->Attr("use_cudnn_on_gpu", use_cudnn_on_gpu);
+  nb->Attr("Tpaddings", Tpaddings);
+}
+
+void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node, NodeBuilder* nb,
+                                         bool change_format) {
   DataType T;
   int N;
 
@@ -1624,7 +1995,8 @@ void MklLayoutRewritePass::CopyAttrsAddN(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
-                                                NodeBuilder* nb) {
+                                                NodeBuilder* nb,
+                                                bool change_format) {
   DataType T;
   string data_format;
   std::vector<int32> strides;
@@ -1640,8 +2012,8 @@ void MklLayoutRewritePass::CopyAttrsBiasAddGrad(const Node* orig_node,
   nb->Attr("data_format", data_format);
 }
 
-void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
-                                        NodeBuilder* nb) {
+void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node, NodeBuilder* nb,
+                                        bool change_format) {
   DataType T;
   int depth_radius;
   float bias;
@@ -1664,7 +2036,8 @@ void MklLayoutRewritePass::CopyAttrsLRN(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   string data_format;
   string padding;
@@ -1686,7 +2059,8 @@ void MklLayoutRewritePass::CopyAttrsPooling(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
   DataType T;
 
   // Get all attributes from old node.
@@ -1697,7 +2071,8 @@ void MklLayoutRewritePass::CopyAttrsDataType(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
-                                                     NodeBuilder* nb) {
+                                                     NodeBuilder* nb,
+                                                     bool change_format) {
   DataType T;
   string data_format;
   string padding;
@@ -1717,7 +2092,8 @@ void MklLayoutRewritePass::CopyAttrsQuantizedPooling(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
-                                                    NodeBuilder* nb) {
+                                                    NodeBuilder* nb,
+                                                    bool change_format) {
   DataType Tinput, Tfilter, out_type;
   string padding;
   string data_format("NHWC");
@@ -1747,7 +2123,8 @@ void MklLayoutRewritePass::CopyAttrsQuantizedConv2D(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node,
-                                               NodeBuilder* nb) {
+                                               NodeBuilder* nb,
+                                               bool change_format) {
   DataType Tinput, out_type;
 
   // Get all attributes from old node.
@@ -1760,7 +2137,8 @@ void MklLayoutRewritePass::CopyAttrsRequantize(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
-                                            NodeBuilder* nb) {
+                                            NodeBuilder* nb,
+                                            bool change_format) {
   DataType T;
   DataType Tshape;
 
@@ -1773,7 +2151,7 @@ void MklLayoutRewritePass::CopyAttrsReshape(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   DataType Index;
 
@@ -1786,7 +2164,7 @@ void MklLayoutRewritePass::CopyAttrsSlice(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
-                                          NodeBuilder* nb) {
+                                          NodeBuilder* nb, bool change_format) {
   DataType T;
   string data_format;
   int num_split;
@@ -1803,7 +2181,8 @@ void MklLayoutRewritePass::CopyAttrsSplit(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
-                                           NodeBuilder* nb) {
+                                           NodeBuilder* nb,
+                                           bool change_format) {
   DataType T;
   int N;
 
@@ -1817,7 +2196,8 @@ void MklLayoutRewritePass::CopyAttrsConcat(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
-                                             NodeBuilder* nb) {
+                                             NodeBuilder* nb,
+                                             bool change_format) {
   DataType T;
   int N;
   DataType tidx;
@@ -1834,7 +2214,8 @@ void MklLayoutRewritePass::CopyAttrsConcatV2(const Node* orig_node,
 }
 
 void MklLayoutRewritePass::CopyAttrsFusedBatchNorm(const Node* orig_node,
-                                                   NodeBuilder* nb) {
+                                                   NodeBuilder* nb,
+                                                   bool change_format) {
   DataType T;
   float epsilon;
   string data_format;
@@ -2050,6 +2431,165 @@ Status MklLayoutRewritePass::MergeConv2DWithBiasAdd(std::unique_ptr<Graph>* g,
   return Status::OK();
 }
 
+Status MklLayoutRewritePass::MergePadWithConv2D(std::unique_ptr<Graph>* g,
+                                                Node* m, Node* n) {
+  DCHECK(((m->type_string() == csinfo_.pad &&
+           n->type_string() == csinfo_.conv2d)) ||
+         ((n->type_string() == csinfo_.pad &&
+           m->type_string() == csinfo_.conv2d)));
+
+  // Conv2D is successor node, and Pad predecessor node.
+  Node* pred = m->type_string() == csinfo_.pad ? m : n;
+  Node* succ = m->type_string() == csinfo_.pad ? n : m;
+
+  // 1. Get all attributes from input nodes.
+  DataType T_pred, T_succ;
+  string padding;
+  std::vector<int32> strides;
+  std::vector<int32> dilations;
+  string data_format_pred, data_format_succ;
+  bool use_cudnn_on_gnu;
+  TF_CHECK_OK(GetNodeAttr(pred->def(), "T", &T_pred));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "T", &T_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "padding", &padding));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "strides", &strides));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "dilations", &dilations));
+  // Data format for pad is not available and not necessary, thus
+  // dont need to match data format for Pad
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "data_format", &data_format_succ));
+  TF_CHECK_OK(GetNodeAttr(succ->def(), "use_cudnn_on_gpu", &use_cudnn_on_gnu));
+  // Check if the data types and devices of both succ and pred are the same.
+  // Assert is not used,  because it can be too strict.
+  // Don't need to check for data formats because it is not available in Pad.
+  if (T_pred != T_succ ||
+      pred->assigned_device_name() != succ->assigned_device_name() ||
+      pred->def().device() != succ->def().device()) {
+    return Status(error::Code::INVALID_ARGUMENT,
+                  "T attribute or devices of Conv2D and "
+                  "Pad do not match. Will skip node merge optimization");
+  }
+
+  const int succ_num = succ->num_inputs();
+  gtl::InlinedVector<Node*, 4> succ_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> succ_in(succ_num);
+  FillInputs(succ, &succ_control_edges, &succ_in);
+
+  const int pred_num = pred->num_inputs();
+  gtl::InlinedVector<Node*, 4> pred_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> pred_in(pred_num);
+  FillInputs(pred, &pred_control_edges, &pred_in);
+
+  // We need to ensure that Pad only feeds to Conv2D (some other operator is
+  // not expecting output of Pad). If this is not the case, then we cannot
+  // merge Conv2D with Pad.
+  const int kFirstOutputSlot = 0;
+  for (const Edge* e : pred->out_edges()) {
+    if (e->src_output() == kFirstOutputSlot && e->dst() != succ) {
+      return Status(error::Code::INVALID_ARGUMENT,
+                    "Pad does not feed to Conv2D, or "
+                    "it feeds Conv2D but has multiple outputs. "
+                    "Will skip node merge optimization");
+    }
+  }
+
+  // 2. Get inputs from both the nodes.
+
+  // Pad must have 2 data inputs: "input" and paddings.
+  int PadDataInputEdges = 0;
+  for (const Edge* e : pred->in_edges()) {
+    if (!e->IsControlEdge()) {
+      PadDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(PadDataInputEdges, 2);
+
+  // Conv2D must have 2 data inputs: pad output and Filter
+  int ConvDataInputEdges = 0;
+  for (const Edge* e : succ->in_edges()) {
+    if (!e->IsControlEdge()) {
+      ConvDataInputEdges++;
+    }
+  }
+  DCHECK_EQ(ConvDataInputEdges, 2);
+
+  // We will use the node name of Conv2D as the name of new node
+  // Build new node. We use same name as original node, but change the op
+  // name.
+  NodeBuilder nb(succ->name(), csinfo_.pad_with_conv2d);
+  nb.Input(pred_in[0].first, pred_in[0].second);  // In1 (input data)  of Pad
+  // pred_in[1] will be 2nd Tensorflow tensor for Conv2D.
+  nb.Input(succ_in[1].first, succ_in[1].second);  // In2 (filter) of conv2d
+  // In1 of Conv2D is same as output of Pad.
+  // Thus, only need to add In2 of Conv2D
+  nb.Input(pred_in[1].first, pred_in[1].second);  // In2 (paddings) of Pad
+
+  // Copy attributes from Pad and conv2D to PadWithConv2D.
+  CopyAttrsFromPadAndConv2D(const_cast<const Node*>(succ),
+                            const_cast<const Node*>(pred), &nb);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(succ->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Incoming data edges from 'pred' node and 'succ' node to new 'new_node'
+  // node are already copied in BuildNode.
+  // We handle control edges now.
+  for (const Edge* e : pred->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+  for (const Edge* e : succ->in_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(e->src(), new_node, false);
+    }
+  }
+
+  // Incoming edges are fixed, we will fix the outgoing edges now.
+  // First, we will fix outgoing control edges from 'pred' node.
+  for (const Edge* e : pred->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Don't allow duplicate edge
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    }
+  }
+
+  // Second, we will fix outgoing control and data edges from 'succ' node.
+  for (const Edge* e : succ->out_edges()) {
+    if (e->IsControlEdge()) {
+      // Allow duplicate while adding control edge as it would fail (return
+      // NULL) if we try to add duplicate edge.
+      (*g)->AddControlEdge(new_node, e->dst(), false);
+    } else {
+      // Conv2D has only 1 output (at slot 0) and merged node also has only 1
+      // output (at slot 0).
+      const int kPadWithConv2DOutputSlot = 0;
+      (*g)->AddEdge(new_node, kPadWithConv2DOutputSlot, e->dst(),
+                    e->dst_input());
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  // It's ok to use pred or succ as we have enforced a check that
+  // both have same device assigned.
+  new_node->set_assigned_device_name(pred->assigned_device_name());
+
+  VLOG(1) << "MklLayoutRewritePass: Merged old node:" << pred->DebugString()
+          << ", and node: " << succ->DebugString()
+          << ", into node:" << new_node->DebugString();
+
+  (*g)->RemoveNode(succ);
+  (*g)->RemoveNode(pred);
+
+  return Status::OK();
+}
+
 Status MklLayoutRewritePass::MergeConv2DBackpropFilterWithBiasAddGrad(
     std::unique_ptr<Graph>* g, Node* m, Node* n) {
   CHECK_EQ(((m->type_string() == csinfo_.bias_add_grad &&
@@ -2183,6 +2723,12 @@ Status MklLayoutRewritePass::MergeNode(std::unique_ptr<Graph>* g, Node* m,
         m->type_string() == csinfo_.conv2d))) {
     return this->MergeConv2DWithBiasAdd(g, m, n);
   }
+  if (((m->type_string() == csinfo_.pad &&
+        n->type_string() == csinfo_.conv2d)) ||
+      ((n->type_string() == csinfo_.pad &&
+        m->type_string() == csinfo_.conv2d))) {
+    return this->MergePadWithConv2D(g, m, n);
+  }
 
   if (((m->type_string() == csinfo_.bias_add_grad &&
         n->type_string() == csinfo_.conv2d_grad_filter)) ||
@@ -2231,7 +2777,8 @@ Status MklLayoutRewritePass::RewriteNode(std::unique_ptr<Graph>* g,
     return s;
   }
 
-  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb);
+  const bool kPartialCopyAttrs = false;
+  ri->copy_attrs(const_cast<const Node*>(orig_node), &nb, kPartialCopyAttrs);
 
   // Set the Mkl layer label for this op.
   if (DataTypeIsQuantized(orig_node->input_type(0)) ||
@@ -2328,10 +2875,11 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
     return nullptr;
   }
 
-  // We make an exception for __MklDummyConv2DWithBias and
-  // __MklConv2DBackpropFilterWithBias since their names do not match Mkl node
-  // names.
+  // We make an exception for __MklDummyConv2DWithBias,
+  // __MklConv2DBackpropFilterWithBias, and __MklDummyPadWithConv2D since their
+  // names do not match Mkl node names.
   if (n->type_string() != csinfo_.conv2d_with_bias &&
+      n->type_string() != csinfo_.pad_with_conv2d &&
       n->type_string() != csinfo_.conv2d_grad_filter_with_bias &&
       !mkl_op_registry::IsMklOp(mkl_op_registry::GetMklOpName(n->type_string()),
                                 T)) {
@@ -2391,6 +2939,143 @@ MklLayoutRewritePass::CheckForNodeRewrite(const Node* n) const {
   return nullptr;
 }
 
+//////////////////////////////////////////////////////////////////////////
+//           Helper functions for node fusion
+//////////////////////////////////////////////////////////////////////////
+Status MklLayoutRewritePass::FuseTransposeMklOpTranspose(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    std::function<void(const Node*, NodeBuilder* nb, bool)> copy_attrs,
+    string data_format) {
+  Node* transpose_to_nhwc = nodes[0];
+  Node* mklop = nodes[1];
+  Node* transpose_to_nchw = nodes[2];
+
+  const int transpose_nhwc_num_inputs = transpose_to_nhwc->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nhwc_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nhwc_in(
+      transpose_nhwc_num_inputs);
+  FillInputs(transpose_to_nhwc, &transpose_nhwc_control_edges,
+             &transpose_nhwc_in);
+
+  const int mklop_num_inputs = mklop->num_inputs();
+  gtl::InlinedVector<Node*, 4> mklop_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> mklop_in(mklop_num_inputs);
+  FillInputs(mklop, &mklop_control_edges, &mklop_in);
+
+  const int transpose_nchw_num_inputs = transpose_to_nchw->num_inputs();
+  gtl::InlinedVector<Node*, 4> transpose_nchw_control_edges;
+  gtl::InlinedVector<std::pair<Node*, int>, 4> transpose_nchw_in(
+      transpose_nchw_num_inputs);
+  FillInputs(transpose_to_nchw, &transpose_nchw_control_edges,
+             &transpose_nchw_in);
+
+  // We use same name as original node, but change the op
+  // type.
+  NodeBuilder nb(mklop->name(), mklop->type_string());
+
+  // Storing the output slots of the input nodes.
+  for (int i = 0; i < mklop_num_inputs; i++) {
+    if (mklop_in[i].first == transpose_to_nhwc) {
+      // Fill "x":
+      nb.Input(transpose_nhwc_in[0].first, transpose_nhwc_in[0].second);
+    } else {
+      // Fill inputs other than "x":
+      nb.Input(mklop_in[i].first, mklop_in[i].second);
+    }
+  }
+
+  copy_attrs(const_cast<const Node*>(mklop), &nb, true);
+  nb.Attr("data_format", data_format);
+
+  // Copy the device assigned to old node to new node.
+  nb.Device(mklop->def().device());
+
+  // Create node.
+  Node* new_node;
+  TF_CHECK_OK(nb.Finalize(&**g, &new_node));
+  DCHECK(new_node);
+
+  // Fill outputs.
+  for (const Edge* e : transpose_to_nchw->out_edges()) {
+    if (!e->IsControlEdge()) {
+      const int kTransposeWithMklOpOutputSlot = 0;
+      DCHECK((*g)->AddEdge(new_node, kTransposeWithMklOpOutputSlot, e->dst(),
+                           e->dst_input()));
+    }
+  }
+
+  // Copy device assigned to old node to new node.
+  new_node->set_assigned_device_name(mklop->assigned_device_name());
+
+  // Copy requested_device and assigned_device_name_index
+  new_node->set_requested_device(mklop->requested_device());
+  new_node->set_assigned_device_name_index(mklop->assigned_device_name_index());
+
+  (*g)->RemoveNode(transpose_to_nhwc);
+  (*g)->RemoveNode(mklop);
+  (*g)->RemoveNode(transpose_to_nchw);
+
+  return Status::OK();
+}
+
+Status MklLayoutRewritePass::FuseNode(
+    std::unique_ptr<Graph>* g, std::vector<Node*>& nodes,
+    const MklLayoutRewritePass::FusionInfo fi) {
+  return fi.fuse_func(g, nodes, fi.copy_attrs);
+}
+
+std::tuple<bool, std::vector<Node*>, const MklLayoutRewritePass::FusionInfo>
+MklLayoutRewritePass::CheckForNodeFusion(Node* a) const {
+  // Stores matched nodes, in the same order as node_checkers.
+  std::vector<Node*> nodes;
+
+  for (auto fi = finfo_.begin(); fi != finfo_.end(); ++fi) {
+    //
+    // Make sure node "a" and its succeding nodes (b, c ...), match the pattern
+    // defined in fusion info (ops[0], ops[1], ...),
+    // a.k.a. "a->b->c" matches "op1->op2->op3"
+    //
+
+    // Stores the first unvisted outgoing edge of each matched node in "nodes".
+    std::stack<EdgeSet::const_iterator> current_neighbor_stack;
+    nodes.clear();
+
+    auto node_checker = fi->node_checkers.begin();
+    if (a != nullptr && (*node_checker)(a)) {
+      nodes.push_back(a);
+      current_neighbor_stack.push(a->out_edges().begin());
+      ++node_checker;
+    }
+
+    while (!nodes.empty()) {
+      auto& current_neighbor_iter = current_neighbor_stack.top();
+
+      if (current_neighbor_iter != nodes.back()->out_edges().end()) {
+        // Found an unvisited edge. Goes through the edge to get the neighbor.
+        Node* neighbor_node = (*current_neighbor_iter)->dst();
+        ++current_neighbor_stack.top();  // Retrieves the next unvisited edge.
+
+        if ((*node_checker)(neighbor_node)) {
+          // Found a match. Stores the node and moves to the next checker.
+          nodes.push_back(neighbor_node);
+          current_neighbor_stack.push(neighbor_node->out_edges().begin());
+          if (++node_checker == fi->node_checkers.end()) {
+            return make_tuple(true, nodes, *fi);
+          }
+        }
+      } else {
+        // Removes the current node since none of its neighbor leads to a
+        // further match.
+        nodes.pop_back();
+        current_neighbor_stack.pop();
+        --node_checker;
+      }
+    }
+  }
+
+  return make_tuple(false, std::vector<Node*>(), FusionInfo());
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 //              Post-rewrite Mkl metadata fixup pass
 ///////////////////////////////////////////////////////////////////////////////
@@ -2516,6 +3201,30 @@ bool MklLayoutRewritePass::RunPass(std::unique_ptr<Graph>* g) {
 
   DumpGraph("After running MklLayoutRewritePass(NodeMerge)", &**g);
 
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+  order.clear();
+  GetReversePostOrder(**g, &order);  // This will give us topological sort.
+  for (Node* n : order) {
+    // If node is not an op or it cannot run on CPU device, then skip.
+    if (!n->IsOp() || !CanOpRunOnCPUDevice(n)) {
+      continue;
+    }
+
+    auto check_result = CheckForNodeFusion(n);
+    bool found_pattern = std::get<0>(check_result);
+    std::vector<Node*> nodes = std::get<1>(check_result);
+    const FusionInfo fi = std::get<2>(check_result);
+
+    // if "found_pattern" is true, we can do the fusion.
+    if (found_pattern) {
+      if (FuseNode(g, nodes, fi) == Status::OK()) {
+        result = true;
+      }
+    }
+  }
+  DumpGraph("After running MklLayoutRewritePass(NodeFusion)", &**g);
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
   order.clear();
   GetReversePostOrder(**g, &order);  // This will give us topological sort.
   for (Node* n : order) {
diff --git a/tensorflow/core/graph/mkl_layout_pass_test.cc b/tensorflow/core/graph/mkl_layout_pass_test.cc
index 7e2d1f78785c59b2fe58d32c3f750923234419d2..04c4b85d64d63f275a08abb86d7bf3393398dc67 100644
--- a/tensorflow/core/graph/mkl_layout_pass_test.cc
+++ b/tensorflow/core/graph/mkl_layout_pass_test.cc
@@ -65,6 +65,13 @@ static void InitGraph(const string& s, Graph* graph,
 class MklLayoutPassTest : public ::testing::Test {
  public:
   MklLayoutPassTest() : graph_(OpRegistry::Global()) {}
+  // Ashraf added
+  Node* FindNode(const string& name) {
+    for (Node* node : graph_.nodes()) {
+      if (node->name() == name) return node;
+    }
+    LOG(FATAL) << name;
+  }
 
   void InitGraph(const string& s, const string& device = kCPUDevice) {
     ::tensorflow::InitGraph(s, &graph_, device);
@@ -131,6 +138,8 @@ REGISTER_OP("_MklInput2")
     .Output("o: uint8")
     .Output("o1: uint8")
     .SetIsStateful();
+REGISTER_OP("Output2").Input("i: float").Input("i1: float").SetIsStateful();
+REGISTER_OP("Output").Input("i: float").SetIsStateful();
 
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to node merge optiimization
@@ -455,6 +464,559 @@ TEST_F(MklLayoutPassTest, NodeMerge_Conv2DWithBias_ConvBpropInput_FilterFwd) {
             "E:3->G:4;F->G;F:control->DMT/_3:control;G->Z;X->Y:1;X->Z:1");
 }
 
+// Test set 3: Pad + Conv2D fusion
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if input control edges do not duplicate after merge.
+// If both the merging ops have input control edge from a common op
+// then, the merged op will have only one control edge from that
+// common op.
+// padding is VALID type
+// A = input(image), A1 = input, B = input(paddings),
+// C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// A1:control->C:control
+// A1:control->E:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// A1:control->E:control (only one control edge)
+TEST_F(MklLayoutPassTest, Input_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(a1, c);
+  const Edge* edge_1 = graph_.AddControlEdge(a1, e);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A1:control->E:control;A:control->DMT/_0:control;A:control->DMT/"
+      "_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Test if output control edges does not duplicate after merge.
+// If both the merging ops have output control edge to a common op,
+// then after merge, the merged op will have only one control edge
+// to that commom op.
+// padding is VALID type
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// C:control->A1:control
+// E:control->A1:control
+// After layout pass:
+// _MklPadWithConv2D(A, D, B, DMT/_0, DMT/_1, DMT/_2)
+// E:control->A1:control (only one control edge)
+TEST_F(MklLayoutPassTest, Output_ControlEdge_PadWithConv2D_Positive) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A1' op: 'Input'}"
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  Node* a1 = FindNode("A1");
+  Node* c = FindNode("C");
+  Node* e = FindNode("E");
+  const Edge* edge = graph_.AddControlEdge(c, a1);
+  const Edge* edge_1 = graph_.AddControlEdge(e, a1);
+  ASSERT_NE(edge, nullptr);
+  ASSERT_NE(edge_1, nullptr);
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);A1(Input);B(Int32Input);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;"
+      "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+      "A:control->DMT/_2:control;B->E:2;D->E:1;DMT/_0->E:3;DMT/_1->E:4;"
+      "DMT/_2->E:5;E->Z;E:control->A1:control;Y->Z:1");
+}
+// Pad + Conv2D fusion with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// A = input(image), B = input(paddings), C= Pad
+// E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,A); Z=Zeta(E,Y)
+// After layout pass
+// _MklPadWithConv2D(A, A, B, DMT/_0, DMT/_1, DMT/_2)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_Input) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);DMT/_0(Const);DMT/_1(Const);"
+            "DMT/_2(Const);E(_MklPadWithConv2D);Y(Input);Z(Zeta)|A->E;A->E:1;"
+            "A:control->DMT/_0:control;A:control->DMT/_1:control;"
+            "A:control->DMT/_2:control;B->E:2;DMT/_0->E:3;DMT/_1->E:4;"
+            "DMT/_2->E:5;E->Z;Y->Z:1");
+}
+// Pad + Conv2D with padding is VALID,
+// Input node pointing to both Pad and Conv2D
+// Output of both Pad and Conv2D feeds one node (Z as Output2)
+// A = input(as image), B = input(as paddings), C= Pad
+// E = Conv2D, Z = Output2
+// C=Pad(A,B); E=Conv2D(C,A); Z=Output(C,E)
+// After layout pass - No merging, since Pad and Conv2D both
+// feed to the same node (Z)
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Common_InOutput) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'VALID' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'A'] }"
+      "node { name: 'Z' op: 'Output2'"
+      " input: ['C', 'E']}");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "A(Input);B(Int32Input);C(Pad);DMT/_0(Const);DMT/_1(Const);"
+            "E(_MklConv2D);Z(Output2)|A->C;A->E:1;B->C:1;C->E;C->Z;"
+            "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+            "DMT/_0->E:2;DMT/_1->E:3;E->Z:1");
+}
+// Pad + Conv2D; padding is SAME
+// A = input(image), B = input(paddings), C= Pad = input of conv2D,
+// D=input(filter), E = Conv2D, Z = Zeta
+// C=Pad(A,B); E=Conv2D(C,D); Z=Zeta(E,Y)
+// After layout pass - No merging
+TEST_F(MklLayoutPassTest, NodeMerge_PadWithConv2D_Negative) {
+  DCHECK_EQ(kTensorOrdering, MklTfTensorOrdering::TENSORS_CONTIGUOUS);
+  InitGraph(
+      "node { name: 'A' op: 'Input'}"
+      "node { name: 'B' op: 'Int32Input'}"
+      "node { name: 'C' op: 'Pad'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'Tpaddings'        value { type: DT_INT32 } }"
+      " input: ['A', 'B']}"
+      "node { name: 'D' op: 'Input'}"
+      "node { name: 'E' op: 'Conv2D'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " attr { key: 'data_format'      value { s: 'NHWC' } }"
+      " attr { key: 'use_cudnn_on_gpu' value { b: false } }"
+      " attr { key: 'strides'          value { list: {i: 1, i:1, i:1, i:1} } }"
+      " attr { key: 'padding'          value { s: 'SAME' } }"
+      " attr { key: 'dilations'        value { list: {i: 1, i:1, i:1, i:1} } }"
+      " input: ['C', 'D'] }"
+      "node { name: 'Y' op: 'Input'}"
+      "node { name: 'Z' op: 'Zeta'"
+      " attr {key: 'T'                 value { type: DT_FLOAT } }"
+      " input: ['E', 'Y']}");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "A(Input);B(Int32Input);C(Pad);D(Input);DMT/_0(Const);DMT/_1(Const);"
+      "E(_MklConv2D);Y(Input);Z(Zeta)|A->C;B->C:1;C->E;"
+      "C:control->DMT/_0:control;C:control->DMT/_1:control;"
+      "D->E:1;DMT/_0->E:2;DMT/_1->E:3;E->Z;Y->Z:1");
+}
+#ifdef ENABLE_TRANSPOSE_OPTIMIZATION
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Positive) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\003\\000\\000\\000\\001\\000\\000\\000\\002\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(DoMklLayoutOptimizationPass(),
+            "Const0(Const);Const1(Const);"
+            "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);Input0(Input);"
+            "Input1(Input);Relu(_MklRelu)|Conv2D->Relu;Conv2D:2->Relu:1;DMT/"
+            "_0->Conv2D:2;DMT/_1->Conv2D:3;Input0->Conv2D;"
+            "Input0:control->DMT/_0:control;Input0:control->DMT/"
+            "_1:control;Input1->Conv2D:1");
+}
+
+TEST_F(MklLayoutPassTest, NodeMerge_TransposeConv2DTranspose_Negative) {
+  InitGraph(
+      "node { name: 'Input0' op: 'Input'}"
+      "node { name: 'Input1' op: 'Input'}"
+      "node { name: 'Const0' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node { name: 'Const1' op: 'Const'"
+      "  attr {"
+      "   key: 'dtype'"
+      "   value {"
+      "     type: DT_INT32"
+      "   }"
+      "  }"
+      " attr {"
+      "   key: 'value'"
+      "   value {"
+      "     tensor {"
+      "       dtype: DT_INT32"
+      "       tensor_shape {"
+      "         dim {"
+      "           size: 4"
+      "         }"
+      "       }"
+      "       tensor_content: "
+      "'\\000\\000\\000\\000\\002\\000\\000\\000\\003\\000\\000\\000\\001\\000"
+      "\\000\\000'"
+      "     }"
+      "   }"
+      " }"
+      "}"
+      "node {              \
+      name: 'Transpose0' \
+      op: 'Transpose'    \
+      input: 'Input0'    \
+      input: 'Const0'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node {                 \
+      name: 'Conv2D'        \
+      op: 'Conv2D'          \
+      input: 'Transpose0'   \
+      input: 'Input1'       \
+      attr {                \
+        key: 'T'            \
+        value {             \
+          type: DT_FLOAT    \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'data_format'  \
+        value {             \
+          s: 'NHWC'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'dilations'    \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'padding'      \
+        value {             \
+          s: 'SAME'         \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'strides'      \
+        value {             \
+          list {            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+            i: 1            \
+          }                 \
+        }                   \
+      }                     \
+      attr {                \
+        key: 'use_cudnn_on_gpu' \
+        value {                 \
+          b: true               \
+        }                       \
+      }                         \
+    }"
+      "node {              \
+      name: 'Transpose1' \
+      op: 'Transpose'    \
+      input: 'Conv2D'    \
+      input: 'Const1'    \
+      attr {             \
+        key: 'T'         \
+        value {          \
+          type: DT_FLOAT \
+        }                \
+      }                  \
+      attr {             \
+        key: 'Tperm'     \
+        value {          \
+          type: DT_INT32 \
+        }                \
+      }                  \
+    }"
+      "node { name: 'Relu' op: 'Relu'"
+      " attr { key: 'T'                value { type: DT_FLOAT } }"
+      " input: ['Transpose1'] }");
+  EXPECT_EQ(
+      DoMklLayoutOptimizationPass(),
+      "Const0(Const);Const1(Const);"
+      "Conv2D(_MklConv2D);DMT/_0(Const);DMT/_1(Const);DMT/_2(Const);"
+      "Input0(Input);Input1(Input);Relu(_MklRelu);"
+      "Transpose0(Transpose);Transpose1(Transpose)|Const0->Transpose0:1;Const1-"
+      ">Transpose1:1;"
+      "Conv2D->Transpose1;DMT/_0->Conv2D:2;DMT/_1->Conv2D:3;DMT/"
+      "_2->Relu:1;Input0->Transpose0;"
+      "Input1->Conv2D:1;Transpose0->Conv2D;Transpose0:control->DMT/_0:control;"
+      "Transpose0:control->DMT/"
+      "_1:control;Transpose1->Relu;Transpose1:control->DMT/_2:control");
+}
+#endif  // ENABLE_TRANSPOSE_OPTIMIZATION
+
 /////////////////////////////////////////////////////////////////////
 //  Unit tests related to rewriting node to Mkl node
 /////////////////////////////////////////////////////////////////////
diff --git a/tensorflow/core/graph/node_builder.cc b/tensorflow/core/graph/node_builder.cc
index adaee479359c22f08bbec5af0245719fa161912e..a91e6dd05738ae8242c812970e8bbc4a10c7675a 100644
--- a/tensorflow/core/graph/node_builder.cc
+++ b/tensorflow/core/graph/node_builder.cc
@@ -29,6 +29,8 @@ NodeBuilder::NodeOut::NodeOut(Node* n, int32 i)  // NOLINT(runtime/explicit)
       index(i),
       dt(SafeGetOutput(node, i, &error)) {}
 
+NodeBuilder::NodeOut::NodeOut(OutputTensor t) : NodeOut(t.node, t.index) {}
+
 NodeBuilder::NodeOut::NodeOut(StringPiece n, int32 i, DataType t)
     : node(nullptr), error(false), name(n), index(i), dt(t) {}
 
diff --git a/tensorflow/core/graph/node_builder.h b/tensorflow/core/graph/node_builder.h
index 31fb5909393058585a6fa994b144dae9218c3bba..b1dc2ae92f14ba4519d98a4c556c1d06e14b6b5d 100644
--- a/tensorflow/core/graph/node_builder.h
+++ b/tensorflow/core/graph/node_builder.h
@@ -50,6 +50,7 @@ class NodeBuilder {
   struct NodeOut {
     // For referencing an existing Node.
     NodeOut(Node* n, int32 i = 0);
+    NodeOut(OutputTensor t);
 
     // For referencing Nodes not in the graph being built. It is
     // useful when preparing a graph for ExtendSession or creating a
diff --git a/tensorflow/core/graph/optimizer_cse_test.cc b/tensorflow/core/graph/optimizer_cse_test.cc
index 642298fa95d9bef9da37f50fca9a5831c1702f23..c1f93ce05ae99fef05d6a16815c3886643d17e26 100644
--- a/tensorflow/core/graph/optimizer_cse_test.cc
+++ b/tensorflow/core/graph/optimizer_cse_test.cc
@@ -337,13 +337,9 @@ TEST_F(OptimizerCSETest, Constant_Dedup) {
   EXPECT_EQ(OriginalGraph(),
             "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const);"
             "n/_4(Const);n/_5(Const);n/_6(Const);n/_7(Const)|");
-  std::vector<string> nodes = str_util::Split(DoCSE(), ";|");
-  std::set<string> node_set(nodes.begin(), nodes.end());
-  // Expect exactly one of each type of node to be retained after CSE.
-  EXPECT_EQ(node_set.count("n/_0(Const)") + node_set.count("n/_7(Const)"), 1);
-  EXPECT_EQ(node_set.count("n/_1(Const)") + node_set.count("n/_6(Const)"), 1);
-  EXPECT_EQ(node_set.count("n/_2(Const)") + node_set.count("n/_5(Const)"), 1);
-  EXPECT_EQ(node_set.count("n/_3(Const)") + node_set.count("n/_4(Const)"), 1);
+  // In theory, there are 2^4 possible correct output of CSE.  In this
+  // test, it happens to eliminate the last 4 nodes.
+  EXPECT_EQ(DoCSE(), "n/_0(Const);n/_1(Const);n/_2(Const);n/_3(Const)|");
 }
 
 static void BM_CSE(int iters, int op_nodes) {
diff --git a/tensorflow/core/graph/testlib.cc b/tensorflow/core/graph/testlib.cc
index 0a38aa1c9192a6f2628c1ca916bd75a8cb51d2e8..0e74a30c7a92ebd46a933f1056ccb093fa095128 100644
--- a/tensorflow/core/graph/testlib.cc
+++ b/tensorflow/core/graph/testlib.cc
@@ -123,6 +123,17 @@ Node* Assign(Graph* g, Node* var, Node* val) {
   return ret;
 }
 
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive, bool reverse) {
+  Node* ret;
+  TF_CHECK_OK(NodeBuilder(g->NewName("n"), "Cumsum")
+                  .Input(data)
+                  .Input(axes)
+                  .Attr("exclusive", exclusive)
+                  .Attr("reverse", reverse)
+                  .Finalize(g, &ret));
+  return ret;
+}
+
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
              bool keep_dims) {
   Node* ret;
diff --git a/tensorflow/core/graph/testlib.h b/tensorflow/core/graph/testlib.h
index b00196f58735f938f562b5cabcd2985274b34f56..0c7233161f4128c1da0d8761b0b49fc2f4cf2524 100644
--- a/tensorflow/core/graph/testlib.h
+++ b/tensorflow/core/graph/testlib.h
@@ -68,6 +68,10 @@ Node* Recv(Graph* g, const string& tensor, const string& type,
            const string& sender, const uint64 sender_incarnation,
            const string& receiver);
 
+// Adds a cumsum "node" in "g" doing cumsum(data, axes).
+Node* Cumsum(Graph* g, Node* data, Node* axes, bool exclusive = false,
+             bool reverse = false);
+
 // Adds a reduction "node" in "g" doing sum(data, axes).  "reduce" is
 // a reduction, e.g., Sum, Max, Min, Mean, etc.
 Node* Reduce(Graph* g, const string& reduce, Node* data, Node* axes,
diff --git a/tensorflow/core/grappler/BUILD b/tensorflow/core/grappler/BUILD
index 7b03ec38bf5bb13f4fc20cccef241839eaacc426..f353d789d47030afda5d9680cca8094d48b827f1 100644
--- a/tensorflow/core/grappler/BUILD
+++ b/tensorflow/core/grappler/BUILD
@@ -41,6 +41,7 @@ tf_cc_test(
         "//tensorflow/core:all_kernels",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensor_testutil",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
     ],
@@ -106,6 +107,8 @@ cc_library(
         ":utils",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/strings",
     ],
 )
 
@@ -140,6 +143,7 @@ tf_cc_test(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
         "//tensorflow/core/grappler/inputs:trivial_test_graph_input_yielder",
     ],
 )
diff --git a/tensorflow/core/grappler/costs/graph_properties.cc b/tensorflow/core/grappler/costs/graph_properties.cc
index 270b75269c794249975d0316c628e40f0ec95a5b..1df26d94d1fe1ed35765291da6c7d2eae513e713 100644
--- a/tensorflow/core/grappler/costs/graph_properties.cc
+++ b/tensorflow/core/grappler/costs/graph_properties.cc
@@ -679,6 +679,11 @@ class SymbolicShapeRefiner {
               "' was not previously added to SymbolicShapeRefiner.");
         }
 
+        if (src_output >= c->inference_context->num_outputs())
+          return errors::OutOfRange("src_output = ", src_output,
+                                    ", but num_outputs is only ",
+                                    c->inference_context->num_outputs());
+
         // Propagate input node's NodeContext info to the current node's
         // NodeContext:
         // output_tensor_protos to input_tensor_protos and input_tensors, and
diff --git a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
index 998bd59dce37e320b847852fe0c5529c5bccebc4..c9ce63a8ef2aa301f690cec16fcd03fb83309c7c 100644
--- a/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
+++ b/tensorflow/core/grappler/costs/op_level_cost_estimator_test.cc
@@ -832,7 +832,7 @@ TEST_F(OpLevelCostEstimatorTest, GetTensorShapeProtoFromTensorProto) {
   EXPECT_FALSE(
       GetTensorShapeProtoFromTensorProto(tensor_proto, &tensor_shape_proto));
 
-  // Check GetTensorShapeProtoFromTensorProto() resturns correct values.
+  // Check GetTensorShapeProtoFromTensorProto() returns correct values.
   {
     std::vector<int64> shape_expected = {10, 20, 30, 40};
     GetTensorProto(DT_INT32, {4}, shape_expected,
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.cc b/tensorflow/core/grappler/costs/virtual_scheduler.cc
index ba50e5553852d2b722b1f0fffe0507c2a77d9d9b..ae5200b359232153f96c9ffa21a505d2a056d55d 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.cc
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.cc
@@ -469,8 +469,8 @@ Status VirtualScheduler::Init() {
         } else {
           // Different device, no cached copy; transfer input_node to the
           // curr_node's device.
-          auto send_and_recv =
-              CreateSendRecv(input_node, curr_node, input_node_name);
+          auto send_and_recv = CreateSendRecv(input_node, curr_node, input_node,
+                                              input_node_name);
           // Note that CreateSendRecv() already connected input/output between
           // _Send and _Recv ops.
           const auto* send = send_and_recv.first;
@@ -608,7 +608,8 @@ string VirtualScheduler::ChannelDeviceName(const NodeDef* from,
 }
 
 std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
-    const NodeDef* from, const NodeDef* to, const string& input_name) {
+    const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+    const string& input_name) {
   CHECK(!initialized_) << "CreateSendRecv is called after Init().";
 
   // Connect "from" node to "to" node with _Send and _Recv such that
@@ -641,6 +642,12 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   send_attr[kAttrInputSrc].set_s(input_name);
   send_attr[kAttrSrcDevice].set_s(DeviceName(from));
   send_attr[kAttrDstDevice].set_s(DeviceName(to));
+  // GraphDef generated by AutoGrappler has tensor_name field when removing
+  // _Send/_Recv nodes.
+  if (input_node->attr().count(kAttrTensorName)) {
+    send_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // _Recv op.
   auto* recv = new NodeDef();
@@ -650,6 +657,10 @@ std::pair<const NodeDef*, const NodeDef*> VirtualScheduler::CreateSendRecv(
   recv->set_device(DeviceName(to));
   auto& recv_attr = *(recv->mutable_attr());
   recv_attr[kAttrInputSrc].set_s(input_name);
+  if (input_node->attr().count(kAttrTensorName)) {
+    recv_attr[kAttrTensorName].set_s(
+        input_node->attr().at(kAttrTensorName).s());
+  }
 
   // NodeState for _Send op.
   auto& send_node_state = GetNodeStateOrCreateIt(send);
@@ -1022,7 +1033,8 @@ Costs VirtualScheduler::Summary() const {
       bool is_cost_accurate;
       std::tie(cost, is_cost_accurate) = op_costs_.at(item.first);
       VLOG(2) << "Node: " << item.first << ", Count: " << item.second
-              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost;
+              << ", Individual Cost: " << (is_cost_accurate ? "" : "~") << cost
+              << " us";
     }
   }
 
diff --git a/tensorflow/core/grappler/costs/virtual_scheduler.h b/tensorflow/core/grappler/costs/virtual_scheduler.h
index 89dff9686d3983330e0261c1074b8b791a98b459..6a835f32d16d0850c06891f656b2bec910e26b78 100644
--- a/tensorflow/core/grappler/costs/virtual_scheduler.h
+++ b/tensorflow/core/grappler/costs/virtual_scheduler.h
@@ -308,15 +308,17 @@ class VirtualScheduler {
  private:
   // Constants.
   const string kAttrInputSrc = "input_source_";
-  const string kAttrSrcDevice = "src_device_";
-  const string kAttrDstDevice = "dst_device_";
+  const string kAttrSrcDevice = "send_device";
+  const string kAttrDstDevice = "recv_device";
+  const string kAttrTensorName = "tensor_name";
   const string kChannelDevice = "Channel";
 
   // Methods called from Init(). Fails if initialize_ is set.
   void MaybeUpdateInputOutput(const NodeDef* node);
   NodeState& GetNodeStateOrCreateIt(const NodeDef* node);
   std::pair<const NodeDef*, const NodeDef*> CreateSendRecv(
-      const NodeDef* from, const NodeDef* to, const string& input_name);
+      const NodeDef* from, const NodeDef* to, const NodeDef* input_node,
+      const string& input_name);
   string DeviceName(const NodeDef* node) const;
   string SanitizedDeviceName(const NodeDef* node) const;
   string ChannelDeviceName(const NodeDef* from, const NodeDef* to) const;
diff --git a/tensorflow/core/grappler/graph_analyzer/sig_node.h b/tensorflow/core/grappler/graph_analyzer/sig_node.h
index 45c0ed31626ec99d1c443313f9b4d6ef9a6fa43a..66d290d88e4a4b4ef8fef7574444d47f57cc237a 100644
--- a/tensorflow/core/grappler/graph_analyzer/sig_node.h
+++ b/tensorflow/core/grappler/graph_analyzer/sig_node.h
@@ -178,7 +178,7 @@ class SigNode {
   // computed.
   size_t GetTopoHash(int distance) const;
 
-  // The the hash value for the highest computed distance. It must be previously
+  // The hash value for the highest computed distance. It must be previously
   // computed.
   size_t GetHighTopoHash() const {
     CHECK(!topo_hash_.empty());
diff --git a/tensorflow/core/grappler/graph_view.cc b/tensorflow/core/grappler/graph_view.cc
index 35675fb1a26d156364b67aefd00a51bc5197e37d..ba9d2eb32181940bc430771db281c6cea8cb48c4 100644
--- a/tensorflow/core/grappler/graph_view.cc
+++ b/tensorflow/core/grappler/graph_view.cc
@@ -70,6 +70,12 @@ bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
   return fanout.size() <= 1;
 }
 
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port) {
+  const auto output = GraphView::OutputPort(node, port);
+  const auto fanout = graph_view.GetFanout(output);
+  return !fanout.empty();
+}
+
 bool NoControlFanin(const GraphView& graph_view, const NodeDef* node) {
   const auto control_port = GraphView::InputPort(node, -1);
   return graph_view.GetFanin(control_port).empty();
diff --git a/tensorflow/core/grappler/graph_view.h b/tensorflow/core/grappler/graph_view.h
index 89cec2eb2ec43b2307244b92665673b17f9cf6f1..0a47b2256583f35e6ef413b50fdc8eea2bdc978d 100644
--- a/tensorflow/core/grappler/graph_view.h
+++ b/tensorflow/core/grappler/graph_view.h
@@ -342,10 +342,13 @@ class GraphView
   }
 };
 
-// Returns true if node has one (or zero) fanout nodes at given port.
+// Returns true if node has one (or zero) fanout nodes at given output port.
 bool HasSingleFanoutNode(const GraphView& graph_view, const NodeDef* node,
                          int port = 0);
 
+// Returns true if node has at least one fanout node at given output port.
+bool HasFanouts(const GraphView& graph_view, const NodeDef* node, int port = 0);
+
 bool NoControlFanin(const GraphView& graph_view, const NodeDef* node);
 bool NoControlFanout(const GraphView& graph_view, const NodeDef* node);
 bool NoControlFaninOrFanout(const GraphView& graph_view, const NodeDef* node);
diff --git a/tensorflow/core/grappler/grappler_item.cc b/tensorflow/core/grappler/grappler_item.cc
index 2c490f3966cb45f61a22ba0a858a928f46e9db1b..74bde67f198f8c6d31273861cf9b35537909447c 100644
--- a/tensorflow/core/grappler/grappler_item.cc
+++ b/tensorflow/core/grappler/grappler_item.cc
@@ -19,27 +19,33 @@ limitations under the License.
 #include <unordered_set>
 #include <vector>
 
+#include "absl/container/flat_hash_set.h"
+#include "absl/strings/str_join.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/util/device_name_utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
-GrapplerItem::GrapplerItem(const GrapplerItem& other, GraphDef* graph_def) {
-  id = other.id;
-  feed = other.feed;
-  fetch = other.fetch;
-  init_ops = other.init_ops;
-  keep_ops = other.keep_ops;
-  expected_init_time = other.expected_init_time;
-  save_op = other.save_op;
-  restore_op = other.restore_op;
-  save_restore_loc_tensor = other.save_restore_loc_tensor;
-  queue_runners = other.queue_runners;
-  allowed_optimizations = other.allowed_optimizations;
-  graph.Swap(graph_def);
+GrapplerItem GrapplerItem::WithGraph(GraphDef&& graph_def) const {
+  GrapplerItem item;
+  item.id = id;
+  item.feed = feed;
+  item.fetch = fetch;
+  item.init_ops = init_ops;
+  item.keep_ops = keep_ops;
+  item.expected_init_time = expected_init_time;
+  item.save_op = save_op;
+  item.restore_op = restore_op;
+  item.save_restore_loc_tensor = save_restore_loc_tensor;
+  item.queue_runners = queue_runners;
+  item.devices_ = devices_;
+  item.allowed_optimizations_ = allowed_optimizations_;
+  item.graph.Swap(&graph_def);
+  return item;
 }
 
 std::vector<const NodeDef*> GrapplerItem::MainOpsFanin() const {
@@ -111,6 +117,64 @@ std::unordered_set<string> GrapplerItem::NodesToPreserve() const {
   return result;
 }
 
+const std::unordered_set<string>& GrapplerItem::devices() const {
+  return devices_;
+}
+
+Status GrapplerItem::AddDevice(const string& device) {
+  DeviceNameUtils::ParsedName name;
+
+  if (!DeviceNameUtils::ParseFullName(device, &name)) {
+    return errors::InvalidArgument("Invalid device name: device=", device);
+
+  } else if (!name.has_job || !name.has_replica || !name.has_task ||
+             !name.has_type || !name.has_id) {
+    return errors::InvalidArgument("Not a fully defined device name: device=",
+                                   device);
+  }
+
+  devices_.insert(DeviceNameUtils::ParsedNameToString(name));
+  return Status::OK();
+}
+
+Status GrapplerItem::AddDevices(const GrapplerItem& other) {
+  std::vector<absl::string_view> invalid_devices;
+  for (const string& device : other.devices()) {
+    Status added = AddDevice(device);
+    if (!added.ok()) invalid_devices.emplace_back(device);
+  }
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+Status GrapplerItem::InferDevicesFromGraph() {
+  absl::flat_hash_set<absl::string_view> invalid_devices;
+  for (const NodeDef& node : graph.node()) {
+    Status added = AddDevice(node.device());
+    if (!added.ok()) invalid_devices.insert(node.device());
+  }
+  VLOG(2) << "Inferred device set: [" << absl::StrJoin(devices_, ", ") << "]";
+  return invalid_devices.empty()
+             ? Status::OK()
+             : errors::InvalidArgument("Skipped invalid devices: [",
+                                       absl::StrJoin(invalid_devices, ", "),
+                                       "]");
+}
+
+void GrapplerItem::ClearDevices() { devices_.clear(); }
+
+const GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations()
+    const {
+  return allowed_optimizations_;
+}
+
+GrapplerItem::AllowedOptimizations& GrapplerItem::allowed_optimizations() {
+  return allowed_optimizations_;
+}
+
 std::vector<const NodeDef*> ComputeTransitiveFanin(
     const GraphDef& graph, const std::vector<string>& terminal_nodes) {
   bool ill_formed = false;
diff --git a/tensorflow/core/grappler/grappler_item.h b/tensorflow/core/grappler/grappler_item.h
index a0748abfe691334c6dc838c05e0d3f1cee2e2ecb..9051542988c4261aacb5fc25c8e6e2f1d35adfa0 100644
--- a/tensorflow/core/grappler/grappler_item.h
+++ b/tensorflow/core/grappler/grappler_item.h
@@ -35,12 +35,15 @@ namespace grappler {
 // nodes, and potentially a set of nodes to feed.
 struct GrapplerItem {
   GrapplerItem() = default;
-  GrapplerItem(const GrapplerItem& other, GraphDef&& graph_def)
-      : GrapplerItem(other, &graph_def) {}
-  // Swaps *graph_def with an empty GraphDef.
-  GrapplerItem(const GrapplerItem& other, GraphDef* graph_def);
+  GrapplerItem(const GrapplerItem& other) = default;
+  GrapplerItem(GrapplerItem&& other) = default;
+  GrapplerItem& operator=(const GrapplerItem& other) = default;
+  GrapplerItem& operator=(GrapplerItem&& other) = default;
   virtual ~GrapplerItem() = default;
 
+  // Create a copy of this GrapplerItem with graph swapped with the argument.
+  GrapplerItem WithGraph(GraphDef&& graph) const;
+
   string id;  // A unique id for this item
 
   // Inputs
@@ -83,9 +86,42 @@ struct GrapplerItem {
     // Is it allowed to add nodes to the graph that do not have registered
     // gradient function.
     bool non_differentiable_rewrites = true;
+    // By default we are not allowed to inline ops with side effects into the
+    // main graph, because we can't guarantee that after pruning these ops will
+    // be executed. However if we are optimizing a function library (see
+    // meta_optimizer.cc) and a graph was instantiated by a function definition,
+    // we can do that, because functions guarantee that all side effects will be
+    // executed (see function_optimizer.cc for details).
+    bool inline_ops_with_side_effects = false;
   };
 
-  AllowedOptimizations allowed_optimizations;
+  const std::unordered_set<string>& devices() const;
+  // Adds a device to a set of available devices, only if it's a valid fully
+  // defined device name. Returns `Status::OK()` if successfully added a device,
+  // and an error otherwise.
+  Status AddDevice(const string& device);
+  // Adds all valid devices from the other Grappler item to the device set.
+  Status AddDevices(const GrapplerItem& other);
+  // Adds all valid devices from the nodes of the graph to the device set.
+  // Returns `Status::OK()` if all device annotations found in a graph are valid
+  // fully defined device names, and an error otherwise.
+  Status InferDevicesFromGraph();
+  // Clears a set of available devices.
+  void ClearDevices();
+
+  const AllowedOptimizations& allowed_optimizations() const;
+  AllowedOptimizations& allowed_optimizations();
+
+ private:
+  // TODO(ezhulenev) Make GrapplerItem a class and hide all public data members.
+  // TODO(ezhulenev): Migrate all unordered collections to absl.
+
+  // A set of fully defined device names that can be used to place the nodes of
+  // the `graph`.
+  // Example of a fully defined name: "/job:work/replica:1/task:1/device:CPU:0"
+  std::unordered_set<string> devices_;
+
+  AllowedOptimizations allowed_optimizations_;
 };
 
 // Return the transitive fanin of a set of terminal nodes.
diff --git a/tensorflow/core/grappler/grappler_item_builder.cc b/tensorflow/core/grappler/grappler_item_builder.cc
index cf99f4908bf1bae65d597ae268c18a6f32265f6c..9224ee7849211f849c3655d6faea18dcc32b8e17 100644
--- a/tensorflow/core/grappler/grappler_item_builder.cc
+++ b/tensorflow/core/grappler/grappler_item_builder.cc
@@ -102,10 +102,11 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
   }
 
   // Instantiate all variables for function library runtime creation.
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   TF_RETURN_IF_ERROR(DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
-  std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(devices));
+  Device* cpu_device = devices[0].get();
+  std::unique_ptr<DeviceMgr> dvc_mgr(new DeviceMgr(std::move(devices)));
   FunctionLibraryDefinition function_library(OpRegistry::Global(),
                                              graph_def.library());
   Env* env = Env::Default();
@@ -124,7 +125,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
       new ProcessFunctionLibraryRuntime(dvc_mgr.get(), env,
                                         graph_def.versions().producer(),
                                         &function_library, *optimizer_opts));
-  FunctionLibraryRuntime* flr = pflr->GetFLR(devices[0]->name());
+  FunctionLibraryRuntime* flr = pflr->GetFLR(cpu_device->name());
 
   // Create the GraphOptimizer to optimize the graph def.
   GraphConstructorOptions graph_ctor_opts;
@@ -137,7 +138,7 @@ Status OptimizeGraph(const GraphDef& graph_def_arg, GraphDef* output_graph_def,
 
   // Optimize the graph.
   ::tensorflow::GraphOptimizer optimizer(*optimizer_opts);
-  optimizer.Optimize(flr, env, devices[0], &graphptr, /*shape_map=*/nullptr);
+  optimizer.Optimize(flr, env, cpu_device, &graphptr, /*shape_map=*/nullptr);
   graphptr->ToGraphDef(output_graph_def);
 
   // The default values of attributes might have been stripped by the optimizer.
@@ -519,7 +520,7 @@ std::unique_ptr<GrapplerItem> GrapplerItemFromMetaGraphDef(
         }
         if (!iter->second.has_tensor() ||
             iter->second.tensor().string_val_size() != 1) {
-          LOG(INFO) << "Unexected AttrValue proto: "
+          LOG(INFO) << "Unexpected AttrValue proto: "
                     << iter->second.DebugString();
           return nullptr;
         }
diff --git a/tensorflow/core/grappler/grappler_item_test.cc b/tensorflow/core/grappler/grappler_item_test.cc
index 72a9f481cab6cc5dfdc5994459e149739e427ce6..a8fbe356829409ac3b472267cd22d4b5b54cd1f5 100644
--- a/tensorflow/core/grappler/grappler_item_test.cc
+++ b/tensorflow/core/grappler/grappler_item_test.cc
@@ -14,7 +14,9 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/framework/function_testlib.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/inputs/trivial_test_graph_input_yielder.h"
 #include "tensorflow/core/platform/test.h"
 
@@ -44,6 +46,32 @@ TEST_F(GrapplerItemTest, Basic) {
   EXPECT_EQ(main_ops, graph_nodes);
 }
 
+TEST_F(GrapplerItemTest, InferDevices) {
+  using test::function::NDef;
+
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+  const string cpu2 = "/device:CPU:2";
+
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {
+          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+          NDef("c", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu2),
+      },
+      {} /* Empty function library */);
+
+  ASSERT_FALSE(item.InferDevicesFromGraph().ok());
+
+  EXPECT_EQ(item.devices().size(), 2);
+  EXPECT_NE(item.devices().find(cpu0), item.devices().end());
+  EXPECT_NE(item.devices().find(cpu1), item.devices().end());
+
+  item.ClearDevices();
+  EXPECT_EQ(item.devices().size(), 0);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/op_types.cc b/tensorflow/core/grappler/op_types.cc
index ebc4e9c46627fb92740c669f80d9edd9e60f8840..38fc1fff329eda5b80bb771442f2c543bd27e85d 100644
--- a/tensorflow/core/grappler/op_types.cc
+++ b/tensorflow/core/grappler/op_types.cc
@@ -253,6 +253,10 @@ bool IsIgammac(const NodeDef& node) { return node.op() == "Igammac"; }
 
 bool IsImag(const NodeDef& node) { return node.op() == "Imag"; }
 
+bool IsImmutableConst(const NodeDef& node) {
+  return node.op() == "ImmutableConst";
+}
+
 bool IsInvGrad(const NodeDef& node) { return node.op() == "InvGrad"; }
 
 bool IsLess(const NodeDef& node) { return node.op() == "Less"; }
@@ -547,14 +551,15 @@ bool MaybeHasRefInput(const NodeDef& node) {
   return false;
 }
 
-bool IsFreeOfSideEffect(const NodeDef& node) {
+bool IsFreeOfSideEffect(const NodeDef& node,
+                        const OpRegistryInterface* op_registry) {
   // Placeholders must be preserved to keep the graph feedable.
   if (IsPlaceholder(node)) {
     return false;
   }
   const OpDef* op_def = nullptr;
   const string& op_name = node.op();
-  Status status = OpRegistry::Global()->LookUpOpDef(op_name, &op_def);
+  Status status = op_registry->LookUpOpDef(op_name, &op_def);
   if (!status.ok()) {
     return false;
   }
@@ -571,9 +576,17 @@ bool IsFreeOfSideEffect(const NodeDef& node) {
   if (node.op().find("Queue") != string::npos) {
     return false;
   }
+  // Sending a tensor via a network is a side effect.
+  if (IsSend(node)) {
+    return false;
+  }
   return !ModifiesInputsInPlace(node);
 }
 
+bool IsFreeOfSideEffect(const NodeDef& node) {
+  return IsFreeOfSideEffect(node, OpRegistry::Global());
+}
+
 bool ModifiesInputsInPlace(const NodeDef& node) {
   // Some nodes do in-place updates on regular tensor inputs.
   string op_name = node.op();
diff --git a/tensorflow/core/grappler/op_types.h b/tensorflow/core/grappler/op_types.h
index 067d4e774f46df9d927c1b2208aa0aa1ed9194eb..67897e8512d7dc6e4774c066297674629dd4f714 100644
--- a/tensorflow/core/grappler/op_types.h
+++ b/tensorflow/core/grappler/op_types.h
@@ -17,6 +17,7 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_OP_TYPES_H_
 
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -77,6 +78,7 @@ bool IsIdentityNSingleInput(const NodeDef& node);
 bool IsIgamma(const NodeDef& node);
 bool IsIgammac(const NodeDef& node);
 bool IsImag(const NodeDef& node);
+bool IsImmutableConst(const NodeDef& node);
 bool IsInvGrad(const NodeDef& node);
 bool IsLess(const NodeDef& node);
 bool IsLessEqual(const NodeDef& node);
@@ -179,7 +181,9 @@ bool IsCommutative(const NodeDef& node);
 // value.
 bool IsPersistent(const NodeDef& node);
 
-bool IsFreeOfSideEffect(const NodeDef& node);
+bool IsFreeOfSideEffect(const NodeDef& node,
+                        const OpRegistryInterface* op_registry);
+bool IsFreeOfSideEffect(const NodeDef& node);  // use OpRegistry::Global()
 
 // Returns true if the takes a tensor reference as input, or if looking up its
 // OpDef failed.
diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD
index 3a5b1334d3f42e9db3ddf67c501ac64524464e06..79578cb3ce0733bcfce1a382414c20881879e3e3 100644
--- a/tensorflow/core/grappler/optimizers/BUILD
+++ b/tensorflow/core/grappler/optimizers/BUILD
@@ -141,8 +141,8 @@ cc_library(
     deps = [
         ":graph_optimizer",
         "//tensorflow/core:core_cpu_base",
+        "//tensorflow/core:core_cpu_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
@@ -150,6 +150,8 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/utils:functions",
+        "@com_google_absl//absl/algorithm:container",
+        "@com_google_absl//absl/memory",
         "@com_google_absl//absl/strings",
     ],
 )
@@ -178,22 +180,6 @@ tf_cuda_cc_test(
     ],
 )
 
-cc_library(
-    name = "graph_rewriter",
-    srcs = ["graph_rewriter.cc"],
-    hdrs = [
-        "graph_rewriter.h",
-    ],
-    visibility = ["//visibility:public"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:protos_all_cc",
-        "//tensorflow/core/grappler:grappler_item",
-        "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
-    ],
-)
-
 cc_library(
     name = "graph_optimizer",
     hdrs = [
@@ -352,10 +338,10 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        ":graph_rewriter",
         "//tensorflow/core:framework",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
+        "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -399,7 +385,7 @@ cc_library(
     srcs = [
         "gpu_swapping_ops.cc",
     ],
-    visibility = ["//tensorflow:__subpackages__"],
+    visibility = ["//visibility:public"],
     deps = [
         "//tensorflow/core:core_cpu_base",
         "//tensorflow/core:framework",
@@ -419,7 +405,6 @@ cc_library(
     visibility = ["//visibility:public"],
     deps = [
         ":graph_optimizer",
-        ":graph_rewriter",
         ":static_schedule",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -476,7 +461,6 @@ cc_library(
         "//tensorflow/core/grappler:devices",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/clusters:cluster",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/costs:virtual_placer",
@@ -630,7 +614,6 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:mutable_graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
         "@com_google_absl//absl/container:flat_hash_set",
@@ -708,6 +691,8 @@ cc_library(
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "//tensorflow/core/grappler/utils:topological_sort",
+        "@com_google_absl//absl/container:flat_hash_set",
     ],
 )
 
@@ -780,7 +765,6 @@ cc_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
         "//tensorflow/core/grappler/utils:frame",
     ],
@@ -847,6 +831,7 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -873,11 +858,10 @@ cc_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
         "//tensorflow/core/grappler/costs:graph_properties",
+        "@com_google_absl//absl/strings",
     ],
 )
 
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
index cf294cd20bb3dcbc0714bcdab5dda8804a2d31d2..d35c00f29ecb1c1acedb41c29f08d20decf6476e 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer.cc
@@ -2309,7 +2309,9 @@ class SimplifyAggregation : public ArithmeticOptimizerStage {
   ~SimplifyAggregation() override = default;
 
   bool IsSupported(const NodeDef* node) const override {
-    return IsAggregate(*node) && NumNonControlInputs(*node) > 0;
+    return IsAggregate(*node) && NumNonControlInputs(*node) > 0 &&
+           GetDataTypeFromAttr(*node, "T") !=
+               DT_VARIANT;  // TODO(b/119787146): Enable for variants.
   }
 
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
@@ -2405,11 +2407,10 @@ class ConvertPowStage : public ArithmeticOptimizerStage {
   Status TrySimplify(NodeDef* node, string* simplified_node_name) override {
     const auto& pow_props =
         ctx().graph_properties->GetInputProperties(node->name())[1];
-    for (int i = 0; i < pow_props.shape().dim_size(); ++i) {
-      if (pow_props.shape().dim(i).size() < 0) {
-        // skip if p is is not fully defined.
-        return Status::OK();
-      }
+    PartialTensorShape shape(pow_props.shape());
+    if (!shape.IsFullyDefined()) {
+      // skip if p is not fully defined.
+      return Status::OK();
     }
     if (TensorShape::IsValid(pow_props.shape()) && pow_props.has_value()) {
       Tensor pow(pow_props.dtype(), pow_props.shape());
@@ -2457,11 +2458,10 @@ class ConvertPowStage : public ArithmeticOptimizerStage {
         AddToOptimizationQueue(y);
       } else if (curr == complex128(0, 0) &&
                  ShapesSymbolicallyEqual(value_props.shape(), output_shape)) {
-        for (int i = 0; i < value_props.shape().dim_size(); ++i) {
-          if (value_props.shape().dim(i).size() < 0) {
-            // skip if b is is not fully defined.
-            return Status::OK();
-          }
+        PartialTensorShape shape(value_props.shape());
+        if (!shape.IsFullyDefined()) {
+          // skip if b is not fully defined.
+          return Status::OK();
         }
         if (TensorShape::IsValid(value_props.shape()) &&
             value_props.has_value()) {
@@ -3561,8 +3561,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
   // Set up helper data structures.
   nodes_to_preserve_ = item.NodesToPreserve();
   fetch_nodes_known_ = !item.fetch.empty();
-  *optimized_graph = item.graph;
-  GrapplerItem optimized_item(item, optimized_graph);
+  GrapplerItem optimized_item(item);
   optimized_graph_ = &optimized_item.graph;
   node_map_.reset(new NodeMap(optimized_graph_));
 
@@ -3572,7 +3571,7 @@ Status ArithmeticOptimizer::Optimize(Cluster* /*cluster*/,
 
   // Disable restricted graph rewrites.
   options_.unary_ops_composition &=
-      item.allowed_optimizations.non_differentiable_rewrites;
+      item.allowed_optimizations().non_differentiable_rewrites;
 
   if (options_.dedup_computations) {
     DedupComputations();
diff --git a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
index b6286c425e51b60c5d93e8e6da6ec01034633fb3..35d22898f6c15afd63df8b6136fad1f346172cd5 100644
--- a/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/arithmetic_optimizer_test.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/arithmetic_optimizer.h"
+#include "tensorflow/cc/ops/math_ops.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
@@ -3793,5 +3794,31 @@ TEST_F(ArithmeticOptimizerTest, RemoveStackStridedSliceSameAxis) {
                                  tensors[fCSlice2ToOut]);
 }
 
+TEST_F(ArithmeticOptimizerTest, SimplifyAggregationBFloat16) {
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+  Output x = ops::Const(s.WithOpName("x"), {1.0f, 2.0f}, {1, 2});
+  Output cast = ops::Cast(s.WithOpName("cast"), x, DT_BFLOAT16);
+  Output add = ops::AddN(s.WithOpName("add"), {cast, cast});
+  Output id = ops::Identity(s.WithOpName("id"), add);
+
+  GrapplerItem item;
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+  item.fetch = {"id"};
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
+  EXPECT_EQ(1, tensors_expected.size());
+
+  GraphDef output;
+  ArithmeticOptimizer optimizer;
+  EnableOnlySimplifyAggregation(&optimizer);
+  OptimizeAndPrune(&optimizer, &item, &output);
+
+  // Extra node created for multiplier.
+  EXPECT_EQ(5, output.node_size());
+
+  auto tensors = EvaluateNodes(output, item.fetch);
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorEqual<bfloat16>(tensors_expected[0], tensors[0]);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/constant_folding_test.cc b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
index f6fdb32e989219039a44e636a20573cb707dd1ba..192f48272f9ed08b2b6424f3c8e33d1afafdb56d 100644
--- a/tensorflow/core/grappler/optimizers/constant_folding_test.cc
+++ b/tensorflow/core/grappler/optimizers/constant_folding_test.cc
@@ -14,10 +14,12 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
+#include "tensorflow/cc/ops/array_ops.h"
 #include "tensorflow/cc/ops/array_ops_internal.h"
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/grappler_test.h"
@@ -72,9 +74,9 @@ class ConstantFoldingTest : public GrapplerTest {
       GrapplerItem item;
       TF_CHECK_OK(s.ToGraphDef(&item.graph));
       item.fetch = {"mul1", "mul2", "add1", "add2"};
-      ConstantFolding optimizer(nullptr /* cpu_device */);
+      ConstantFolding optimizer(/*cpu_device=*/nullptr);
       GraphDef output;
-      Status status = optimizer.Optimize(nullptr, item, &output);
+      Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
       TF_EXPECT_OK(status);
 
       EXPECT_EQ(7, output.node_size());
@@ -132,9 +134,9 @@ TEST_F(ConstantFoldingTest, SimpleFolding) {
   item.fetch.push_back("d");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -178,9 +180,9 @@ TEST_F(ConstantFoldingTest, AddTree) {
   item.fetch = {"add_parent", "mul_parent", "addmul_parent"};
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // We expect the following rewrite(s) to occur:
@@ -276,13 +278,11 @@ TEST_F(ConstantFoldingTest, ConvPushDownTest) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding fold(nullptr);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = fold.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
-  std::cout << output.DebugString() << std::endl;
-
   EXPECT_EQ(5, output.node_size());
   int found = 0;
   for (const auto& node : output.node()) {
@@ -366,9 +366,9 @@ TEST_F(ConstantFoldingTest, NeutralElement) {
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     item.fetch = {"stack", "matmul3", "matmul4"};
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     const string suffix =
@@ -521,9 +521,9 @@ TEST_F(ConstantFoldingTest, StrengthReduce_Reciprocal) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
   item.fetch = {"div_f", "div_i", "realdiv"};
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(8, output.node_size());
@@ -611,9 +611,9 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_UnknownOutputShape) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(15, output.node_size());
@@ -683,9 +683,9 @@ TEST_F(ConstantFoldingTest, NeutralElement_PartialShape_KnownOutputShape) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(10, output.node_size());
@@ -741,9 +741,9 @@ TEST_F(ConstantFoldingTest, CreateConstNodes) {
 
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(24, output.node_size());
@@ -790,9 +790,9 @@ TEST_F(ConstantFoldingTest, FoldingNodeWithTwoOutputs) {
   item.fetch.push_back("f");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(2, output.node_size());
@@ -831,9 +831,9 @@ TEST_F(ConstantFoldingTest, ControlDependencies) {
   item.fetch.push_back("e");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "e"};
@@ -874,9 +874,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "c",
@@ -932,9 +932,9 @@ TEST_F(ConstantFoldingTest, ControlDependenciesDeduplicate) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   auto tensors_expected = EvaluateNodes(item.graph, item.fetch);
   EXPECT_EQ(1, tensors_expected.size());
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> expected_nodes = {"dflt", "p1", "p2", "i2"};
@@ -1009,9 +1009,9 @@ TEST_F(ConstantFoldingTest, VariableNumberOfOutputs) {
   }
 
   item.fetch = outputs;
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int constant_folded = 0;
@@ -1047,9 +1047,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterialization) {
   item.fetch.push_back("p2");
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1097,9 +1097,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1163,9 +1163,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   int found = 0;
   for (const auto& node : output.node()) {
@@ -1235,9 +1235,9 @@ TEST_F(ConstantFoldingTest, ShapeMaterializationShapeN_MultipleOutputs) {
   item.fetch.push_back("ia");
   item.fetch.push_back("ib");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -1307,9 +1307,9 @@ TEST_F(ConstantFoldingTest, SwitchNodesEmptyFetch) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
@@ -1409,9 +1409,9 @@ TEST_F(ConstantFoldingTest, SwitchNodes) {
 
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   std::set<string> present_nodes = {"v_in",     "v_ctrl",
                                     "switch",   "i",
@@ -1505,9 +1505,9 @@ TEST_F(ConstantFoldingTest, MergeNodes) {
   item.fetch = {"out1", "idx1", "out2", "idx2", "out3", "idx3", "out4", "idx4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(19, output.node_size());
@@ -1590,9 +1590,9 @@ TEST_F(ConstantFoldingTest, SplitRemoval) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1636,9 +1636,9 @@ TEST_F(ConstantFoldingTest, SplitVRemoval) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1686,9 +1686,9 @@ TEST_F(ConstantFoldingTest, TransposeOnSize1DimsRemoval) {
   item.fetch = {"out1"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1723,9 +1723,9 @@ TEST_F(ConstantFoldingTest, RandomShuffleOnScalarRemoval) {
   item.fetch = {"out1", "out2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1769,9 +1769,9 @@ TEST_F(ConstantFoldingTest, ReverseOnSize1DimsRemoval) {
   item.fetch = {"out1"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -1805,9 +1805,9 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1852,9 +1852,9 @@ TEST_F(ConstantFoldingTest, SliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1901,9 +1901,9 @@ TEST_F(ConstantFoldingTest, StridedSliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -1959,9 +1959,9 @@ TEST_F(ConstantFoldingTest, StridedSliceWithSameDimensionRemoval) {
     item.fetch = {"out"};
     TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef got;
-    Status status = optimizer.Optimize(nullptr, item, &got);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
     TF_EXPECT_OK(status);
 
     GraphDef want;
@@ -2012,9 +2012,9 @@ TEST_F(ConstantFoldingTest, TileWithMultipliesBeingOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2045,9 +2045,9 @@ TEST_F(ConstantFoldingTest, MergeConcat) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2075,9 +2075,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_SameInput) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2106,9 +2106,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_ConcatWithConst) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2137,9 +2137,9 @@ TEST_F(ConstantFoldingTest, MergeConcat_AxisMismatch) {
   item.fetch = {"c2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2175,9 +2175,9 @@ TEST_F(ConstantFoldingTest, PaddingWithZeroSize) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2221,9 +2221,9 @@ TEST_F(ConstantFoldingTest, SqueezeWithAllDimesionsGreaterThanOne) {
   item.fetch = {"out"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef got;
-  Status status = optimizer.Optimize(nullptr, item, &got);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &got);
   TF_EXPECT_OK(status);
 
   GraphDef want;
@@ -2269,9 +2269,9 @@ TEST_F(ConstantFoldingTest, NoOpReduction) {
   item.fetch = {"s", "p2"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2338,9 +2338,9 @@ TEST_F(ConstantFoldingTest, SingleElementEmptyAxisReduction) {
   item.fetch = {"mean_1", "mean_2", "mean_3", "mean_4", "mean_5"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Ensure Mean node is optimized to Reshape.
@@ -2433,9 +2433,9 @@ TEST_F(ConstantFoldingTest, NoOpReshape) {
   item.fetch = {"s1", "s2", "s3", "s4"};
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2495,9 +2495,9 @@ TEST_F(ConstantFoldingTest, Packing) {
   GrapplerItem item;
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   const std::vector<string> fetch_nodes = {"i1", "i2"};
@@ -2538,9 +2538,9 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
   GrapplerItem item;
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> fetch_nodes = {"o1", "o2", "p1", "p2"};
@@ -2552,7 +2552,7 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs) {
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   int found = 0;
@@ -2619,14 +2619,14 @@ TEST_F(ConstantFoldingTest, MaterializeBroadcastGradientArgs_InfiniteLoop) {
   auto tensors_expected = EvaluateNodes(item.graph, fetch_nodes, {{"a", a_t}});
   EXPECT_EQ(fetch_nodes.size(), tensors_expected.size());
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Run a second time to make sure the optimization is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(11, output.node_size());
@@ -2711,14 +2711,14 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices) {
     // Use aggressive mode to force the shape inference to propagate placeholder
     // shapes.
     ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
-                              nullptr /* cpu_device */);
+                              /*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     // Run a second time to make sure the optimization is idempotent.
     item.graph.Swap(&output);
-    status = optimizer.Optimize(nullptr, item, &output);
+    status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     int found = 0;
@@ -2767,9 +2767,9 @@ TEST_F(ConstantFoldingTest, MaterializeReductionIndices_NotFullReduction) {
     // Use aggressive mode to force the shape inference to propagate placeholder
     // shapes.
     ConstantFolding optimizer(RewriterConfig::AGGRESSIVE,
-                              nullptr /* cpu_device */);
+                              /*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     CompareGraphs(item.graph, output);
@@ -2788,9 +2788,9 @@ TEST_F(ConstantFoldingTest, LargeConstant) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch.push_back("out");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   // Make sure the diag node hasn't been folded, since it would use too much
@@ -2833,9 +2833,9 @@ TEST_F(ConstantFoldingTest, SwitchIdenticalInputs) {
   item.fetch.push_back("id_true");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(6, output.node_size());
@@ -2925,9 +2925,9 @@ TEST_F(ConstantFoldingTest, PartialFolding_AssociativeAndCommutative) {
     TF_CHECK_OK(s.ToGraphDef(&item.graph));
     item.fetch = {"stack"};
 
-    ConstantFolding optimizer(nullptr /* cpu_device */);
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
     GraphDef output;
-    Status status = optimizer.Optimize(nullptr, item, &output);
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
     TF_EXPECT_OK(status);
 
     EXPECT_EQ(16, output.node_size());
@@ -3017,13 +3017,13 @@ TEST_F(ConstantFoldingTest, PartialFolding_Concat) {
 
   auto tensors_expected = EvaluateNodes(item.graph, {"concat0"});
   EXPECT_EQ(1, tensors_expected.size());
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(21, output.node_size());
@@ -3090,9 +3090,9 @@ TEST_F(ConstantFoldingTest, PartialFolding_IdentityN) {
   item.fetch.push_back("add0");
   item.fetch.push_back("add1");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(8, output.node_size());
   for (const auto& node : output.node()) {
@@ -3152,9 +3152,9 @@ TEST_F(ConstantFoldingTest, TrivialPack) {
   TF_CHECK_OK(scope.ToGraphDef(&item.graph));
   item.fetch = {"stack", "stack_no_axis"};
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(7, output.node_size());
   int found = 0;
@@ -3234,13 +3234,13 @@ TEST_F(ConstantFoldingTest, Enter) {
   item.fetch.push_back("id3");
   item.fetch.push_back("id4");
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(9, output.node_size());
@@ -3289,13 +3289,13 @@ TEST_F(ConstantFoldingTest, TensorArraySize) {
   auto tensors_expected =
       EvaluateNodes(item.graph, {"dynamic_sz", "static_sz"});
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
   // Run the optimizer twice to make sure the rewrite is idempotent.
   item.graph.Swap(&output);
-  status = optimizer.Optimize(nullptr, item, &output);
+  status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(8, output.node_size());
@@ -3327,9 +3327,9 @@ TEST_F(ConstantFoldingTest, FoldingPreservesDenormalFlushing) {
   item.fetch.push_back("c");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   EXPECT_EQ(1, output.node_size());
@@ -3363,9 +3363,9 @@ TEST_F(ConstantFoldingTest, EvaluatingLargeConstantNoFoldingMergingLoop) {
   item.fetch.push_back("result");
   TF_CHECK_OK(s.ToGraphDef(&item.graph));
 
-  ConstantFolding optimizer(nullptr /* cpu_device */);
+  ConstantFolding optimizer(/*cpu_device=*/nullptr);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
+  Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
   TF_EXPECT_OK(status);
 
   std::vector<string> fetch = {"result"};
@@ -3376,6 +3376,96 @@ TEST_F(ConstantFoldingTest, EvaluatingLargeConstantNoFoldingMergingLoop) {
   EXPECT_EQ(tensors_expected[0].shape(), tensors[0].shape());
 }
 
+class ConstantFoldingCastConstTest : public GrapplerTest {
+ protected:
+  void ConstantFoldingCastConst(bool fetch_const, bool fetch_cast,
+                                bool fetch_const_child, bool fetch_cast_child) {
+    if (!fetch_const && !fetch_cast && !fetch_const_child &&
+        !fetch_cast_child) {
+      return;
+    }
+
+    tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+    CreateCastConstGraph(s);
+    GrapplerItem item;
+    int expected_output_size = SetFetch(&item, fetch_const, fetch_cast,
+                                        fetch_const_child, fetch_cast_child);
+    TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+    GraphDef output = ConstantFoldingOptimize(item);
+    EXPECT_EQ(expected_output_size, output.node_size());
+
+    EvaluateAndCompareUnoptimized(item.graph, output, item.fetch);
+  }
+
+ private:
+  void CreateCastConstGraph(const tensorflow::Scope& s) {
+    Output const1 = ops::Const(s.WithOpName("const1"), 2, {5, 5});
+    Output cast = ops::Cast(s.WithOpName("cast"), const1, DT_FLOAT);
+    Output const1_child = ops::Identity(s.WithOpName("const1_child"), const1);
+    Output cast_child = ops::Identity(s.WithOpName("cast_child"), cast);
+  }
+
+  int SetFetch(GrapplerItem* item, bool fetch_const, bool fetch_cast,
+               bool fetch_const_child, bool fetch_cast_child) {
+    int expected_output_size = 0;
+    if (fetch_const) {
+      item->fetch.push_back("const1");
+      expected_output_size++;
+    }
+    if (fetch_cast) {
+      item->fetch.push_back("cast");
+      expected_output_size++;
+    }
+    if (fetch_const_child) {
+      item->fetch.push_back("const1_child");
+      expected_output_size++;
+    }
+    if (fetch_cast_child) {
+      item->fetch.push_back("cast_child");
+      expected_output_size++;
+    }
+    return expected_output_size;
+  }
+
+  GraphDef ConstantFoldingOptimize(const GrapplerItem& item) {
+    ConstantFolding optimizer(/*cpu_device=*/nullptr);
+    GraphDef output;
+    Status status = optimizer.Optimize(/*cluster=*/nullptr, item, &output);
+    TF_EXPECT_OK(status);
+    return output;
+  }
+
+  void EvaluateAndCompareUnoptimized(const GraphDef& unoptimized_graph,
+                                     const GraphDef& optimized_graph,
+                                     const std::vector<string>& fetch_nodes) {
+    auto tensors_expected = EvaluateNodes(unoptimized_graph, fetch_nodes);
+    auto tensors = EvaluateNodes(optimized_graph, fetch_nodes);
+    ASSERT_EQ(fetch_nodes.size(), tensors_expected.size());
+    ASSERT_EQ(fetch_nodes.size(), tensors.size());
+    for (int i = 0; i < fetch_nodes.size(); i++) {
+      if (fetch_nodes[i] == "const1" || fetch_nodes[i] == "const1_child") {
+        test::ExpectTensorEqual<int>(tensors_expected[i], tensors[i]);
+      } else {
+        test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
+      }
+    }
+  }
+};
+
+TEST_F(ConstantFoldingCastConstTest, CastConstFolding) {
+  for (bool fetch_const : {false, true}) {
+    for (bool fetch_cast : {false, true}) {
+      for (bool fetch_const_child : {false, true}) {
+        for (bool fetch_cast_child : {false, true}) {
+          ConstantFoldingCastConst(fetch_const, fetch_cast, fetch_const_child,
+                                   fetch_cast_child);
+        }
+      }
+    }
+  }
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/data/BUILD b/tensorflow/core/grappler/optimizers/data/BUILD
index 89e95067b83d70204de29d785666cf4e46fc939c..7593023ff4d649c623db9be98ac52ef6b799219f 100644
--- a/tensorflow/core/grappler/optimizers/data/BUILD
+++ b/tensorflow/core/grappler/optimizers/data/BUILD
@@ -628,6 +628,7 @@ tf_cc_test(
         "//tensorflow/core/kernels:cwise_op",
         "//tensorflow/core/kernels:logging_ops",
         "//tensorflow/core/kernels:math",
+        "//tensorflow/core/kernels:nn",
         "//tensorflow/core/kernels:parsing",
         "//tensorflow/tools/graph_transforms:transform_utils",
     ] + tf_protos_all(),
diff --git a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
index affaeafb0fba27f6754e447de588e65b768d5d41..9d8b388a3a8bca1fb560e5acc94d50f3d82ed30d 100644
--- a/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
+++ b/tensorflow/core/grappler/optimizers/data/graph_test_utils.cc
@@ -41,7 +41,7 @@ NodeDef MakeMapAndBatchNode(StringPiece name, StringPiece input_node_name,
                             StringPiece drop_remainder_node_name,
                             StringPiece function_name) {
   return test::function::NDef(
-      name, "MapAndBatchDatasetV2",
+      name, "ExperimentalMapAndBatchDataset",
       {string(input_node_name), "", string(batch_size_node_name),
        string(num_parallel_calls_node_name), string(drop_remainder_node_name)},
       {{"f", FunctionDefHelper::FunctionRef(string(function_name))},
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
index 5af9fbadf76bfde5b031df0978ff9447ea3afb57..60755256d83d74287748125e18ccd8a63a1b4759 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform.cc
@@ -67,7 +67,7 @@ NodeDef MakeStatelessMap(const NodeDef& map_node, const NodeDef& zip_node,
 NodeDef MakeRandomDataset(const NodeDef& random_uniform_node,
                           MutableGraphView* graph) {
   NodeDef random_dataset;
-  random_dataset.set_op("RandomDataset");
+  random_dataset.set_op("ExperimentalRandomDataset");
   graph_utils::SetUniqueGraphNodeName("RandomDataset", graph->graph(),
                                       &random_dataset);
 
diff --git a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
index 455459e3f67d9cb51bf24af24e2c73f30447b24f..b6a29a442ea3a3e62eeec8d1f571fef5225c3c80 100644
--- a/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/hoist_random_uniform_test.cc
@@ -55,7 +55,7 @@ TEST(HoistRandomUniform, SimpleHoisting) {
   const int zip_dataset_id =
       graph_utils::FindGraphNodeWithOp("ZipDataset", output);
   const int random_dataset_id =
-      graph_utils::FindGraphNodeWithOp("RandomDataset", output);
+      graph_utils::FindGraphNodeWithOp("ExperimentalRandomDataset", output);
   const int batch_random_id =
       graph_utils::FindGraphNodeWithOp("BatchDatasetV2", output);
   ASSERT_NE(random_dataset_id, -1);
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
index 16b2efb3ed3c25c4fa5b8b42205037c212140289..52b4b785a3d09ca7f3bec3373d9dd1c8de444a87 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges.cc
@@ -31,7 +31,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kInsertOpName[] = "LatencyStatsDataset";
+constexpr char kInsertOpName[] = "ExperimentalLatencyStatsDataset";
 
 NodeDef MakeLatencyNode(const NodeDef& node, MutableGraphView* graph) {
   NodeDef new_node;
diff --git a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
index 6789cf5bd669cfa61e161397f792700098923e75..d428d04a66659cd3b961428e3762ea3ab81ad69e 100644
--- a/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/latency_all_edges_test.cc
@@ -57,9 +57,10 @@ TEST(LatencyAllEdgesTest, AddLatenciesAfterTensorMapPrefetch) {
   GraphDef output;
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("LatencyStatsDataset", output));
-  std::vector<int> latency_node_indices =
-      graph_utils::FindAllGraphNodesWithOp("LatencyStatsDataset", output);
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalLatencyStatsDataset",
+                                              output));
+  std::vector<int> latency_node_indices = graph_utils::FindAllGraphNodesWithOp(
+      "ExperimentalLatencyStatsDataset", output);
   EXPECT_EQ(latency_node_indices.size(), 3);
   std::vector<NodeDef> dataset_nodes = {std::move(from_tensor_node),
                                         std::move(map_node),
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
index e5de981822376d2e4d1d78ac628f527d242f133a..72c27a1d4afb8f3766a1f7c56ade37b1e161a039 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware.cc
@@ -44,7 +44,7 @@ Status MakeNumaAware::Optimize(Cluster* cluster, const GrapplerItem& item,
   std::set<string> nodes_to_delete;
 
   for (const NodeDef& node : item.graph.node()) {
-    if (node.op() != "MapAndBatchDatasetV2") continue;
+    if (node.op() != "ExperimentalMapAndBatchDataset") continue;
 
     auto* numa_node = graph.AddNode(MakeNumaAwareNode(node, &graph));
     graph.UpdateFanouts(node.name(), numa_node->name());
diff --git a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
index 5d52bd6208f7f21ff44cfe4fef042146a97c5fb9..4b83fb6ef19f8ee241dd4f7b635c9672ef01bcc0 100644
--- a/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/make_numa_aware_test.cc
@@ -57,7 +57,8 @@ TEST(MakeNumaAwareTest, ReplaceSimple) {
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
       "ExperimentalNumaMapAndBatchDataset", output));
 }
@@ -91,7 +92,8 @@ TEST(MapAndBatchNumaAawareReplacementTest, ReplaceWithExtraChild) {
   TF_ASSERT_OK(optimizer.Optimize(nullptr, item, &output));
 
   EXPECT_FALSE(graph_utils::ContainsGraphNodeWithName("map_and_batch", output));
-  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_FALSE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                               output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp(
       "ExperimentalNumaMapAndBatchDataset", output));
   EXPECT_TRUE(graph_utils::ContainsNodeWithOp("CacheDataset", output));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
index 800050b840326d826328763a52c5447c8df70a99..84c4d82f6a38dd81e88374c6ce6a7a6082451a38 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion.cc
@@ -30,7 +30,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-constexpr char kFusedOpName[] = "MapAndBatchDatasetV2";
+constexpr char kFusedOpName[] = "ExperimentalMapAndBatchDataset";
 
 NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
                             MutableGraphView* graph) {
@@ -77,15 +77,22 @@ NodeDef MakeMapAndBatchNode(const NodeDef& map_node, const NodeDef& batch_node,
     new_node.add_input(tmp->name());
   }
 
-  // Set `f` and `Targuments` attributes.
+  // Required attributes.
   for (auto key : {"f", "Targuments"}) {
     graph_utils::CopyAttribute(key, map_node, &new_node);
   }
-
-  // Set `output_types` and `output_shapes` attributes.
   for (auto key : {"output_shapes", "output_types"}) {
     graph_utils::CopyAttribute(key, batch_node, &new_node);
   }
+
+  // Optional attributes.
+  // TODO(jsimsa): Support `use_inter_op_parallelism` and `sloppy`.
+  for (auto key : {"preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
+      graph_utils::CopyAttribute(key, map_node, &new_node);
+    }
+  }
+
   return new_node;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
index eed558de7eb42c5b7879e93bdc43fc8184b599b4..ef4e64826f030ae404a0a523ad5f09bbf7e325a4 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_batch_fusion_test.cc
@@ -84,9 +84,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -169,9 +170,10 @@ TEST(MapAndBatchFusionTest, FuseMapAndBatchV2NodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
@@ -252,9 +254,10 @@ TEST(MapAndBatchFusionTest, FuseParallelMapAndBatchNodesIntoOne) {
       graph_utils::ContainsGraphNodeWithName(map_node->name(), output));
   EXPECT_FALSE(
       graph_utils::ContainsGraphNodeWithName(batch_node->name(), output));
-  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("MapAndBatchDatasetV2", output));
-  NodeDef map_and_batch_node = output.node(
-      graph_utils::FindGraphNodeWithOp("MapAndBatchDatasetV2", output));
+  EXPECT_TRUE(graph_utils::ContainsNodeWithOp("ExperimentalMapAndBatchDataset",
+                                              output));
+  NodeDef map_and_batch_node = output.node(graph_utils::FindGraphNodeWithOp(
+      "ExperimentalMapAndBatchDataset", output));
   EXPECT_EQ(map_and_batch_node.input_size(), 5);
   EXPECT_EQ(map_and_batch_node.input(0), map_node->input(0));
   EXPECT_EQ(map_and_batch_node.input(1), map_node->input(1));
diff --git a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
index 2b0a347ce625140be16d258964af06ef418e9f58..233d7968c8965a5ec2389aa297da72a9708b9257 100644
--- a/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_and_filter_fusion.cc
@@ -55,8 +55,9 @@ NodeDef MakeFusedNode(const NodeDef& map_node,
   }
 
   // Optional attrs.
-  for (auto key : {"use_inter_op_parallelism", "sloppy"}) {
-    if (const auto* attr = gtl::FindOrNull(map_node.attr(), key)) {
+  for (auto key :
+       {"use_inter_op_parallelism", "sloppy", "preserve_cardinality"}) {
+    if (gtl::FindOrNull(map_node.attr(), key)) {
       graph_utils::CopyAttribute(key, map_node, &fused_node);
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/data/map_fusion.cc b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
index 6ca0da27551bc78a9167d308eb229c662821c582..6b8015f96a29ac2fa2de3871a678a1b82efb12ff 100644
--- a/tensorflow/core/grappler/optimizers/data/map_fusion.cc
+++ b/tensorflow/core/grappler/optimizers/data/map_fusion.cc
@@ -62,9 +62,16 @@ NodeDef MakeFusedNode(const NodeDef& parent_map_node, const NodeDef& map_node,
       gtl::FindOrNull(map_node.attr(), "use_inter_op_parallelism");
   // Some graphs cannot execute with use_inter_op_parallelism=False, so we need
   // to set it to true if one of the ops have it set to true.
-  if (value_or_false(first_parallelism) || value_or_false(second_parallelism)) {
-    (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(true);
-  }
+  (*fused_node.mutable_attr())["use_inter_op_parallelism"].set_b(
+      value_or_false(first_parallelism) || value_or_false(second_parallelism));
+
+  const auto* first_cardinality =
+      gtl::FindOrNull(parent_map_node.attr(), "preserve_cardinality");
+  const auto* second_cardinality =
+      gtl::FindOrNull(map_node.attr(), "preserve_cardinality");
+  (*fused_node.mutable_attr())["preserve_cardinality"].set_b(
+      value_or_false(first_cardinality) && value_or_false(second_cardinality));
+
   return fused_node;
 }
 
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
index 2c36c9b7b314669402108c5f5a864eb731002fcf..75ad8bffefd8aa00bb1ba88c10ed9b1170a0d25f 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector.cc
@@ -17,6 +17,8 @@ limitations under the License.
 
 #include <string>
 
+#include "absl/strings/numbers.h"
+#include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/grappler_item.h"
@@ -32,6 +34,73 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
+Status UpdateNodeDef(NodeDef* node_def, const string& funcName,
+                     const FunctionApiInfo& apiInfo) {
+  VLOG(3) << "Node def before swap is: " << node_def->DebugString();
+  auto tin = node_def->mutable_attr()->find("Tin");
+  tin->second.mutable_list()->clear_type();
+  for (const auto& tin_dtype : apiInfo.input_arg_dtypes()) {
+    tin->second.mutable_list()->add_type(tin_dtype);
+  }
+
+  auto tout = node_def->mutable_attr()->find("Tout");
+  tout->second.mutable_list()->clear_type();
+  for (const auto& tout_dtype : apiInfo.output_arg_dtypes()) {
+    tout->second.mutable_list()->add_type(tout_dtype);
+  }
+
+  if (apiInfo.function_type() == FunctionApiInfo::BACKWARD) {
+    // Update the inputs since for backward function, it might have different
+    // number of inputs due the different number output from forward function.
+    // The output of forward function are composed by two parts:
+    //   1. Real output tensors from defun.
+    //   2. Internal states that will be used for gradient calculation.
+    // Part 1 will be static, and part 2 could be different based on the
+    // different implementation.
+
+    const int prev_input_size = node_def->input_size();
+    const int diff = prev_input_size - apiInfo.input_arg_dtypes().size();
+    if (diff >= 0) {
+      for (int i = 0; i < diff; ++i) node_def->mutable_input()->RemoveLast();
+    } else {
+      // Adding new inputs for internal states, the name of the internal states
+      // should be in format "{forward_node_name}:{index}", where the newly
+      // added index should start from last index of the state.
+      // Eg:
+      // {
+      //   input: "gradients/unified_lstm/strided_slice_1_grad/StridedSliceGrad"
+      //   input: "gradients/zeros_like_1"
+      //   input: "gradients/zeros_like_2"
+      //   input: "unified_lstm/StatefulPartitionedCall:3"
+      //   input: "unified_lstm/StatefulPartitionedCall:4"
+      //   # New input should be "unified_lstm/StatefulPartitionedCall:5"
+      // }
+      const string last_input = node_def->input(prev_input_size - 1);
+      const std::vector<string> name_index = ::absl::StrSplit(last_input, ':');
+      if (name_index.size() != 2) {
+        return errors::InvalidArgument(
+            "Invalid format of input node name: ", last_input,
+            " Expected: {forward_node_name}:{index}");
+      }
+      const absl::string_view node_name = name_index[0];
+      int last_index;
+      if (!::absl::SimpleAtoi(name_index[1], &last_index)) {
+        return errors::InvalidArgument(
+            "The index of input node is expected to be number, got: ",
+            name_index[1]);
+      }
+      for (int i = 1; i <= -diff; ++i)
+        node_def->add_input(strings::StrCat(node_name, ":", i + last_index));
+    }
+  }
+
+  node_def->mutable_attr()->find("f")->second.mutable_func()->set_name(
+      funcName);
+
+  VLOG(3) << "Node def after swap is: " << node_def->DebugString();
+  return Status::OK();
+}
+
 Status ExperimentalImplementationSelector::LoadFunctions(
     const GraphDef& graph) {
   lib_info_.reset(new FunctionLibraryApiInfo);
@@ -43,8 +112,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
     NodeDef* node_def) const {
   // There are two ways of calling functions:
   //  1. By specifying an op name as a function name, or
-  //  2. Via the @defun functional interface, where the real function name
-  //     appear as the attribute with type func.
+  //  2. Via the @defun functional interface, where the real function call
+  //     happens with partitionedcall op, and the function name appear as the
+  //     attribute with name "f" and type func. In this use case, there are more
+  //     attributes need to be taken care, like Tin and Tout which take care of
+  //     the DTYPE of input/output.
   std::vector<string> function_attribute_names;
   for (const auto& attr : node_def->attr()) {
     if (attr.second.has_func() &&
@@ -70,22 +142,29 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
   for (const auto& attr_name : function_attribute_names) {
     string function_name = node_def->attr().at(attr_name).func().name();
-    string best_function_name;
-    lib_info_->GetBestImplementation(function_name, parsed_name.type,
-                                     &best_function_name);
-    if (function_name != best_function_name) {
-      node_def->mutable_attr()
-          ->find(attr_name)
-          ->second.mutable_func()
-          ->set_name(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        function_name, &equiv_func_names));
+    for (const auto& func_name : equiv_func_names) {
+      const auto& func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        VLOG(2) << "Swapping: " << function_name << " TO: " << func_name;
+        TF_RETURN_IF_ERROR(UpdateNodeDef(node_def, func_name, *func_api_info));
+        break;
+      }
     }
   }
+
   if (lib_info_->GetApiInfo(node_def->op()) != nullptr) {
-    string best_function_name;
-    lib_info_->GetBestImplementation(node_def->op(), parsed_name.type,
-                                     &best_function_name);
-    if (node_def->op() != best_function_name) {
-      node_def->set_op(best_function_name);
+    std::vector<string> equiv_func_names;
+    TF_RETURN_IF_ERROR(lib_info_->GetEquivalentImplementations(
+        node_def->op(), &equiv_func_names));
+    for (const string& func_name : equiv_func_names) {
+      const auto func_api_info = lib_info_->GetApiInfo(func_name);
+      if (func_api_info->preferred_device() == parsed_name.type) {
+        node_def->set_op(func_name);
+        break;
+      }
     }
   }
   return Status::OK();
@@ -93,6 +172,11 @@ Status ExperimentalImplementationSelector::MaybeOptimizeFunctionCall(
 
 Status ExperimentalImplementationSelector::SelectImplementation(
     GraphDef* graph) const {
+  if (!graph->has_library()) {
+    VLOG(2) << "Skipping graph since it does not have function def";
+    return Status::OK();
+  }
+
   for (int k = 0; k < graph->node_size(); ++k)
     TF_RETURN_IF_ERROR(MaybeOptimizeFunctionCall(graph->mutable_node(k)));
 
diff --git a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
index 3f1ebefac68a1e9b86acea0ddb9dd1c6a638ac6e..e330835e9bc4fea33928e376a3fd98ebe34a74ee 100644
--- a/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
+++ b/tensorflow/core/grappler/optimizers/experimental_implementation_selector_test.cc
@@ -127,12 +127,107 @@ TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationEval) {
                                  test::AsScalar<float>(4.0f));
 
   TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
                                  test::AsScalar<float>(2.0f));
 }
 
+TEST_F(ExperimentalImplementationSelectorTest, SwapImplementationWithGradient) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+  // boost_1 returns the doubled input and a const as the internal state, the
+  // state will be feed to gradient function to mimic the behavior of backward
+  // function of defun that use internal states as extra inputs.
+  FunctionDef boost_1 = FDH::Create(
+      "Boost1", {"x:float"}, {"z:float", "s:float"}, {},
+      {{{"boost"}, "Add", {"x", "x"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s", "one:output:0"}});
+  auto* boost_1_attr = boost_1.mutable_attr();
+  (*boost_1_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_attr)["backward_function_name"].set_s("BoostCpuGradient");
+
+  FunctionDef boost_1_gradient = FDH::Create(
+      "Boost1Gradient", {"x:float", "s:float"}, {"dx:float"}, {},
+      {FDH::Const("two", 2.0f),
+       {{"grad"}, "Mul", {"x", "two:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_1_grad_attr = boost_1_gradient.mutable_attr();
+  (*boost_1_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_1_grad_attr)["experimental_api_preferred_device"].set_s("CPU");
+  (*boost_1_grad_attr)["forward_function_name"].set_s("BoostCpu");
+
+  // boost_2 return the input * 4, and with two extra internal states.
+  FunctionDef boost_2_func = FDH::Create(
+      "Boost2", {"x:float"}, {"z:float", "s1:float", "s2:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"boost"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}},
+       FDH::Const("one", 1.0f),
+       FDH::Const("two", 2.0f)},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "boost:z:0"}, {"s1", "one:output:0"}, {"s2", "two:output:0"}});
+  auto* boost_2_attr = boost_2_func.mutable_attr();
+  (*boost_2_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_attr)["backward_function_name"].set_s("BoostGpuGradient");
+
+  FunctionDef boost_2_gradient = FDH::Create(
+      "Boost2Gradient", {"x:float", "s1:float", "s2:float"}, {"dx:float"}, {},
+      {FDH::Const("four", 4.0f),
+       {{"grad"}, "Mul", {"x", "four:output:0"}, {{"T", DT_FLOAT}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"dx", "grad:z:0"}});
+  auto* boost_2_grad_attr = boost_2_gradient.mutable_attr();
+  (*boost_2_grad_attr)["experimental_api_implements"].set_s("random_boost");
+  (*boost_2_grad_attr)["experimental_api_preferred_device"].set_s("GPU");
+  (*boost_2_grad_attr)["forward_function_name"].set_s("BoostGpu");
+
+  // Define the forward function with f = boost2 function but with CPU device.
+  // Expect the grappler plugin to swap f and attributes to use the boost1.
+  const auto forward =
+      NDef("lstm/StatefulPartitionedCall", "StatefulPartitionedCall", {"input"},
+           {{"Tin", DataTypeSlice{DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2")}},
+           CpuDevice);
+  const auto backward =
+      NDef("gradient/lstm/StatefulPartitionedCall", "StatefulPartitionedCall",
+           {"input", "lstm/StatefulPartitionedCall:1",
+            "lstm/StatefulPartitionedCall:2"},
+           {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_FLOAT}},
+            {"Tout", DataTypeSlice{DT_FLOAT}},
+            {"f", FDH::FunctionRef("Boost2Gradient")}},
+           CpuDevice);
+
+  ExperimentalImplementationSelector optimizer;
+  GraphDef output;
+  GrapplerItem item;
+  item.graph = test::function::GDef(
+      {NDef("input", "Placeholder", {}, {{"dtype", DT_FLOAT}}, CpuDevice),
+       forward, backward,
+       NDef("output", "Identity", {"lstm/StatefulPartitionedCall:0"},
+            {{"T", DT_FLOAT}}, CpuDevice)},
+      // FunctionLib
+      {boost_1, boost_1_gradient, boost_2_func, boost_2_gradient});
+
+  const Tensor input = test::AsScalar<float>(1.0f);
+  item.fetch = {"output"};
+  item.feed.emplace_back("input", input);
+
+  const auto four_times_boosted_tensor = EvaluateFetchNodes(item);
+  test::ExpectTensorEqual<float>(four_times_boosted_tensor[0],
+                                 test::AsScalar<float>(4.0f));
+
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
+  const auto twice_boosted_tensor = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(twice_boosted_tensor[0],
+                                 test::AsScalar<float>(2.0f));
+}
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.cc b/tensorflow/core/grappler/optimizers/function_api_info.cc
index 798e0f6fd55930f437d7a95d1886eb14e07946b5..497ad6032ea80b22e5b5e2b23b2860b7c99fc57b 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info.cc
@@ -27,6 +27,7 @@ FunctionApiInfo::FunctionApiInfo() {}
 FunctionApiInfo::~FunctionApiInfo() {}
 
 Status FunctionApiInfo::Init(const FunctionDef& function_def) {
+  function_type_ = FunctionApiInfo::FunctionType::INFERENCE;
   for (const auto& attr : function_def.attr()) {
     if (attr.first == "experimental_api_preferred_device") {
       preferred_device_ = attr.second.s();
@@ -34,7 +35,25 @@ Status FunctionApiInfo::Init(const FunctionDef& function_def) {
     if (attr.first == "experimental_api_implements") {
       interface_name_ = attr.second.s();
     }
+    if (attr.first == "forward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::BACKWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+    if (attr.first == "backward_function_name") {
+      function_type_ = FunctionApiInfo::FunctionType::FORWARD;
+      pairing_function_name_ = attr.second.s();
+    }
+  }
+
+  input_arg_dtypes_.reserve(function_def.signature().input_arg_size());
+  for (const auto& input_arg : function_def.signature().input_arg()) {
+    input_arg_dtypes_.emplace_back(input_arg.type());
   }
+  output_arg_dtypes_.reserve(function_def.signature().output_arg_size());
+  for (const auto& output_arg : function_def.signature().output_arg()) {
+    output_arg_dtypes_.emplace_back(output_arg.type());
+  }
+
   if (interface_name_.empty() && !preferred_device_.empty()) {
     return errors::InvalidArgument(
         "Function '", function_def.signature().name(),
@@ -51,53 +70,94 @@ const string& FunctionApiInfo::interface_name() const {
   return interface_name_;
 }
 
+const FunctionApiInfo::FunctionType FunctionApiInfo::function_type() const {
+  return function_type_;
+}
+
+const string& FunctionApiInfo::pairing_function_name() const {
+  return pairing_function_name_;
+}
+
+const DataTypeVector& FunctionApiInfo::input_arg_dtypes() const {
+  return input_arg_dtypes_;
+}
+
+const DataTypeVector& FunctionApiInfo::output_arg_dtypes() const {
+  return output_arg_dtypes_;
+}
+
 FunctionLibraryApiInfo::FunctionLibraryApiInfo() {}
 FunctionLibraryApiInfo::~FunctionLibraryApiInfo() {}
 
 namespace {
-bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2) {
-  if (f1.ret().size() != f2.ret().size()) return false;
+bool IsSameArgDef(const OpDef::ArgDef& arg1, const OpDef::ArgDef& arg2) {
+  if (arg1.type() != arg2.type()) return false;
+  if (arg1.type_attr() != arg2.type_attr()) return false;
+  if (arg1.number_attr() != arg2.number_attr()) return false;
+  if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
+  if (arg1.is_ref() != arg2.is_ref()) return false;
+  return true;
+}
+
+bool IsSameSignature(const FunctionDef& f1, const FunctionDef& f2,
+                     const bool check_inputs, const bool check_outputs) {
   const auto& sig1 = f1.signature();
   const auto& sig2 = f2.signature();
   // Functions have positional semantics, so we don't check for names.
-  if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
-  for (int k = 0; k < sig1.input_arg_size(); ++k) {
-    const OpDef::ArgDef& arg1 = sig1.input_arg(k);
-    const OpDef::ArgDef& arg2 = sig2.input_arg(k);
-    if (arg1.type() != arg2.type()) return false;
-    if (arg1.type_attr() != arg2.type_attr()) return false;
-    if (arg1.number_attr() != arg2.number_attr()) return false;
-    if (arg1.type_list_attr() != arg2.type_list_attr()) return false;
-    if (arg1.is_ref() != arg2.is_ref()) return false;
+  if (check_inputs) {
+    if (sig1.input_arg_size() != sig2.input_arg_size()) return false;
+    for (int k = 0; k < sig1.input_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.input_arg(k), sig2.input_arg(k))) return false;
+    }
+  }
+  if (check_outputs) {
+    if (f1.ret().size() != f2.ret().size()) return false;
+    if (sig1.output_arg_size() != sig2.output_arg_size()) return false;
+    for (int k = 0; k < sig1.output_arg_size(); ++k) {
+      if (!IsSameArgDef(sig1.output_arg(k), sig2.output_arg(k))) return false;
+    }
   }
   return true;
 }
 
 Status ValidateSignature(const string& interface_name,
-                         const std::vector<const FunctionDef*>& equiv_funcs) {
+                         const std::vector<const FunctionDef*>& equiv_funcs,
+                         const FunctionApiInfo::FunctionType function_type) {
   if (equiv_funcs.size() < 2) return Status::OK();
   for (size_t k = 1; k < equiv_funcs.size(); ++k) {
-    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k]))
+    const bool check_input =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::FORWARD);
+    const bool check_output =
+        (function_type == FunctionApiInfo::FunctionType::INFERENCE ||
+         function_type == FunctionApiInfo::FunctionType::BACKWARD);
+    if (!IsSameSignature(*equiv_funcs[0], *equiv_funcs[k], check_input,
+                         check_output)) {
       return errors::InvalidArgument(
           "Functions '", equiv_funcs[0]->signature().name(), "' and '",
           equiv_funcs[k]->signature().name(), "' both implement '",
           interface_name, "' but their signatures do not match.");
+    }
   }
   return Status::OK();
 }
 
 Status ValidateSignatures(
     const std::unordered_map<string, std::vector<const FunctionDef*>>&
-        intf_to_func) {
+        intf_to_func,
+    const FunctionApiInfo::FunctionType function_type) {
   for (const auto& item : intf_to_func)
-    TF_RETURN_IF_ERROR(ValidateSignature(item.first, item.second));
+    TF_RETURN_IF_ERROR(
+        ValidateSignature(item.first, item.second, function_type));
   return Status::OK();
 }
 }  // namespace
 
 Status FunctionLibraryApiInfo::Init(
     const FunctionDefLibrary& function_library) {
-  std::unordered_map<string, std::vector<const FunctionDef*>> intf_to_func;
+  std::unordered_map<string, std::vector<const FunctionDef*>> infer_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> fwd_funcs;
+  std::unordered_map<string, std::vector<const FunctionDef*>> bwd_funcs;
   for (const auto& function : function_library.function()) {
     std::unique_ptr<FunctionApiInfo> func_info(new FunctionApiInfo);
     TF_RETURN_IF_ERROR(func_info->Init(function));
@@ -106,54 +166,64 @@ Status FunctionLibraryApiInfo::Init(
 
     const string& function_name = function.signature().name();
     const string& interface_name = func_info->interface_name();
-    func_to_intf_[function_name] = interface_name;
-    intf_to_funcs_[interface_name].emplace_back(function_name);
-    intf_to_func[interface_name].emplace_back(&function);
+    VLOG(3) << "Got " << func_info->function_type()
+            << " function: " << function_name
+            << " with interface: " << interface_name;
+    switch (func_info->function_type()) {
+      case FunctionApiInfo::FunctionType::INFERENCE:
+        intf_to_inference_funcs_[interface_name].emplace_back(function_name);
+        infer_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::FORWARD:
+        intf_to_forward_funcs_[interface_name].emplace_back(function_name);
+        fwd_funcs[interface_name].emplace_back(&function);
+        break;
+      case FunctionApiInfo::FunctionType::BACKWARD:
+        intf_to_backward_funcs_[interface_name].emplace_back(function_name);
+        bwd_funcs[interface_name].emplace_back(&function);
+        break;
+      default:
+        return errors::InvalidArgument("Unrecognized function type: ",
+                                       func_info->function_type());
+    }
     func_info_[function_name] = std::move(func_info);
   }
-  TF_RETURN_IF_ERROR(ValidateSignatures(intf_to_func));
+  TF_RETURN_IF_ERROR(ValidateSignatures(
+      infer_funcs, FunctionApiInfo::FunctionType::INFERENCE));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(fwd_funcs, FunctionApiInfo::FunctionType::FORWARD));
+  TF_RETURN_IF_ERROR(
+      ValidateSignatures(bwd_funcs, FunctionApiInfo::FunctionType::BACKWARD));
   return Status::OK();
 }
 
-void FunctionLibraryApiInfo::GetEquivalentImplementations(
-    const string& function_name, std::vector<string>* other_names) const {
-  const auto intf_it = func_to_intf_.find(function_name);
-  // The function does not implement any interface.
-  if (intf_it == func_to_intf_.end()) return;
-  CHECK(!intf_it->second.empty()) << "Function " << function_name
-                                  << "should at least implement 1 interface.";
-  const auto it = intf_to_funcs_.find(intf_it->second);
-  CHECK(it != intf_to_funcs_.end())
-      << "Function " << function_name << " maps to " << intf_it->second
-      << " but no reverse mapping was found";
-  CHECK_GE(it->second.size(), 1) << "Class " << it->first << " is empty";
-  other_names->reserve(it->second.size() - 1);
-  for (const auto& other_name : it->second) {
-    if (other_name == function_name) continue;
-    other_names->emplace_back(other_name);
+Status FunctionLibraryApiInfo::GetEquivalentImplementations(
+    const string& function_name, std::vector<string>* other_functions) const {
+  const auto func_it = func_info_.find(function_name);
+  if (func_it == func_info_.end()) return Status::OK();
+  const FunctionApiInfo* func_info = func_it->second.get();
+
+  absl::flat_hash_map<string, std::vector<string>>::const_iterator it;
+  switch (func_info->function_type()) {
+    case FunctionApiInfo::FunctionType::INFERENCE:
+      it = intf_to_inference_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::FORWARD:
+      it = intf_to_forward_funcs_.find(func_info->interface_name());
+      break;
+    case FunctionApiInfo::FunctionType::BACKWARD:
+      it = intf_to_backward_funcs_.find(func_info->interface_name());
+      break;
+    default:
+      return errors::InvalidArgument("Unrecognized function type: ",
+                                     func_info->function_type());
   }
-}
 
-void FunctionLibraryApiInfo::GetBestImplementation(
-    const string& function_name, const string& device,
-    string* best_func_name) const {
-  CHECK(best_func_name != nullptr);
-  const auto func_it = func_to_intf_.find(function_name);
-  if (func_it == func_to_intf_.end()) return;
-
-  const auto it = intf_to_funcs_.find(func_it->second);
-  // No function found for the given interface.
-  if (it == intf_to_funcs_.end()) return;
   for (const auto& func_name : it->second) {
-    const auto func_api_info = func_info_.find(func_name)->second.get();
-    if (func_api_info->preferred_device() == device) {
-      best_func_name->assign(func_name);
-      return;
-    }
+    if (func_name == function_name) continue;
+    other_functions->emplace_back(func_name);
   }
-  // Didn't find a function with the match device name, choose the first one
-  // among all the available functions.
-  best_func_name->assign(it->second.front());
+  return Status::OK();
 }
 
 const FunctionApiInfo* FunctionLibraryApiInfo::GetApiInfo(
diff --git a/tensorflow/core/grappler/optimizers/function_api_info.h b/tensorflow/core/grappler/optimizers/function_api_info.h
index 412687c58c15460a05b2e697afb1f84454462da8..9a5f548951f0931e98fbe4074f7bbd9aacab0c6e 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info.h
+++ b/tensorflow/core/grappler/optimizers/function_api_info.h
@@ -20,7 +20,10 @@ limitations under the License.
 #include <unordered_map>
 #include <vector>
 
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/function.pb.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
@@ -30,14 +33,32 @@ class FunctionApiInfo {
   FunctionApiInfo();
   virtual ~FunctionApiInfo();
 
+  enum FunctionType {
+    INFERENCE,  // Default type.
+    FORWARD,
+    BACKWARD,
+  };
+
   Status Init(const FunctionDef& function_def);
 
   const string& interface_name() const;
   const string& preferred_device() const;
+  const FunctionType function_type() const;
+  const string& pairing_function_name() const;
+  const DataTypeVector& input_arg_dtypes() const;
+  const DataTypeVector& output_arg_dtypes() const;
 
  private:
   string interface_name_;
   string preferred_device_;
+  FunctionType function_type_;
+  // The pairing function is used to pair between forward and backward function,
+  // which will be useful during function swapping. Inference function won't
+  // have pairing function.
+  string pairing_function_name_;
+  // The following two attributes are useful for forward and backward functions.
+  DataTypeVector input_arg_dtypes_;
+  DataTypeVector output_arg_dtypes_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionApiInfo);
 };
@@ -55,21 +76,22 @@ class FunctionLibraryApiInfo {
   // Populate the internal field for the functions within the function_library.
   Status Init(const FunctionDefLibrary& function_library);
 
-  void GetEquivalentImplementations(const string& function_name,
-                                    std::vector<string>* other_names) const;
-
-  void GetBestImplementation(const string& function_name, const string& device,
-                             string* best_func_name) const;
+  Status GetEquivalentImplementations(
+      const string& function_name, std::vector<string>* other_functions) const;
 
   const FunctionApiInfo* GetApiInfo(const string& function_name) const;
 
  private:
   // Map between function name to function details.
   std::unordered_map<string, std::unique_ptr<FunctionApiInfo>> func_info_;
-  // Map between function name to interface name.
-  std::unordered_map<string, string> func_to_intf_;
+
   // Map between interface name to function names.
-  std::unordered_map<string, std::vector<string>> intf_to_funcs_;
+  // Forward/backward function pair usually have different signatures between
+  // each other since forward function could produce extra internal state as
+  // output, and backward will take those extra state as inputs.
+  absl::flat_hash_map<string, std::vector<string>> intf_to_inference_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_forward_funcs_;
+  absl::flat_hash_map<string, std::vector<string>> intf_to_backward_funcs_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionLibraryApiInfo);
 };
diff --git a/tensorflow/core/grappler/optimizers/function_api_info_test.cc b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
index 582890d3e3bb807552039de4a3ff5e8c6e393ca5..b683d26b32f04759b658e9e0704f1b6b661fe178 100644
--- a/tensorflow/core/grappler/optimizers/function_api_info_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_api_info_test.cc
@@ -36,28 +36,35 @@ void SetArg(const string& name, const string& type_name,
 
 typedef std::pair<string, string> ArgSpec;  // name, type.
 
-void SetArgs(const std::vector<ArgSpec>& args_spec, OpDef* sig) {
-  for (const auto& arg_spec : args_spec)
+void SetArgs(const std::vector<ArgSpec>& input_args_spec,
+             const std::vector<ArgSpec>& output_args_spec, OpDef* sig) {
+  for (const auto& arg_spec : input_args_spec)
     SetArg(arg_spec.first, arg_spec.second, sig->add_input_arg());
-  SetArg("output", "float32", sig->add_output_arg());
+  for (const auto& arg_spec : output_args_spec)
+    SetArg(arg_spec.first, arg_spec.second, sig->add_output_arg());
 }
 
 void PopulateFunction(const string& name, const string& api_interface_name,
                       const string& preferred_device,
                       const std::vector<ArgSpec>& input_args,
+                      const std::vector<ArgSpec>& output_args,
+                      const string& forward_function_name,
+                      const string& backward_function_name,
                       FunctionDef* func_def) {
   OpDef* sig = func_def->mutable_signature();
   sig->set_name(name);
 
-  SetArgs(input_args, sig);
-
-  if (!api_interface_name.empty() || !preferred_device.empty()) {
-    auto* func_attr = func_def->mutable_attr();
-    if (!api_interface_name.empty())
-      (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
-    if (!preferred_device.empty())
-      (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
-  }
+  SetArgs(input_args, output_args, sig);
+
+  auto* func_attr = func_def->mutable_attr();
+  if (!api_interface_name.empty())
+    (*func_attr)["experimental_api_implements"].set_s(api_interface_name);
+  if (!preferred_device.empty())
+    (*func_attr)["experimental_api_preferred_device"].set_s(preferred_device);
+  if (!forward_function_name.empty())
+    (*func_attr)["forward_function_name"].set_s(forward_function_name);
+  if (!backward_function_name.empty())
+    (*func_attr)["backward_function_name"].set_s(backward_function_name);
 }
 
 void PopulateSampleLibrary(const bool mismatch_args,
@@ -65,39 +72,50 @@ void PopulateSampleLibrary(const bool mismatch_args,
   const std::vector<ArgSpec> func_args{{"in1", "float32"}, {"in2", "int32"}};
   const std::vector<ArgSpec> func_wrong_args{{"in1", "int32"},
                                              {"in2", "int32"}};
-  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args,
-                   func_lib->add_function());
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", func_args, output_args, "",
+                   "", func_lib->add_function());
   PopulateFunction("DoStuffGpu", "DoStuff", "GPU",
-                   mismatch_args ? func_wrong_args : func_args,
+                   mismatch_args ? func_wrong_args : func_args, output_args, "",
+                   "", func_lib->add_function());
+  PopulateFunction("DoThings", "DoThings", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("DoThings", "DoThings", "", func_args,
+  PopulateFunction("OneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
-  PopulateFunction("OneOff", "", "", func_args, func_lib->add_function());
-  PopulateFunction("AnotherOneOff", "", "", func_args,
+  PopulateFunction("AnotherOneOff", "", "", func_args, output_args, "", "",
                    func_lib->add_function());
 }
 
+void PopulateComplexLibrary(FunctionDefLibrary* func_lib) {
+  const std::vector<ArgSpec> input_args{{"in1", "float32"}, {"in2", "int32"}};
+  const std::vector<ArgSpec> output_args{{"out", "float32"}};
+  const std::vector<ArgSpec> output_with_state{
+      {"out", "float32"}, {"state1", "int32"}, {"state2", "int32"}};
+
+  PopulateFunction("DoStuffCpu", "DoStuff", "CPU", input_args, output_args, "",
+                   "DoStuffCpu_gradient", func_lib->add_function());
+  PopulateFunction("DoStuffCpu_gradient", "DoStuff", "CPU", output_args,
+                   input_args, "DoStuffCpu", "", func_lib->add_function());
+  PopulateFunction("DoStuffGpu", "DoStuff", "GPU", input_args,
+                   output_with_state, "", "DoStuffGpu_gradient",
+                   func_lib->add_function());
+  PopulateFunction("DoStuffGpu_gradient", "DoStuff", "GPU", output_with_state,
+                   input_args, "DoStuffGpu", "", func_lib->add_function());
+}
+
 bool CheckEquivImpl(const FunctionLibraryApiInfo& lib_api_info,
                     const string& func_name,
                     const std::vector<string>& expected_other) {
   std::vector<string> other_impl;
-  lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  Status status =
+      lib_api_info.GetEquivalentImplementations(func_name, &other_impl);
+  EXPECT_EQ(status, Status::OK());
   const std::unordered_set<string> actual(other_impl.begin(), other_impl.end());
   const std::unordered_set<string> expected(expected_other.begin(),
                                             expected_other.end());
   return actual == expected;
 }
 
-bool CheckGetBestImpl(const FunctionLibraryApiInfo& lib_api_info,
-                      const string& function_name, const string& device,
-                      const string& expected_function_name) {
-  string best_function_name;
-  lib_api_info.GetBestImplementation(function_name, device,
-                                     &best_function_name);
-
-  return best_function_name == expected_function_name;
-}
-
 string GetInterfaceName(const FunctionLibraryApiInfo& lib_api_info,
                         const string& func_name) {
   auto* info = lib_api_info.GetApiInfo(func_name);
@@ -117,34 +135,46 @@ TEST(FunctionApiInfoTest, ParseTags) {
   PopulateSampleLibrary(/* mismatch_args */ false, &func_lib);
   FunctionLibraryApiInfo lib_api_info;
   TF_ASSERT_OK(lib_api_info.Init(func_lib));
+
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
+  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "OneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "AnotherOneOff", {}));
   EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoThings", {}));
+}
+
+TEST(FunctionApiInfoTest, ComplexFunctionLib) {
+  FunctionDefLibrary func_lib;
+  PopulateComplexLibrary(&func_lib);
+  FunctionLibraryApiInfo lib_api_info;
+  TF_ASSERT_OK(lib_api_info.Init(func_lib));
 
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("DoThings", GetInterfaceName(lib_api_info, "DoThings"));
+  EXPECT_EQ("DoStuff", GetInterfaceName(lib_api_info, "DoStuffGpu_gradient"));
 
   EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu"));
+  EXPECT_EQ("CPU", GetPreferredDevice(lib_api_info, "DoStuffCpu_gradient"));
   EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu"));
-  EXPECT_EQ("", GetPreferredDevice(lib_api_info, "DoThings"));
+  EXPECT_EQ("GPU", GetPreferredDevice(lib_api_info, "DoStuffGpu_gradient"));
 
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffCpu", "GPU", "DoStuffGpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "CPU", "DoStuffCpu"));
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "GPU", "DoStuffGpu"));
-
-  EXPECT_TRUE(CheckGetBestImpl(lib_api_info, "DoThings", "GPU", "DoThings"));
-  // TPU impl is not available, choose the first one available which is the CPU.
-  EXPECT_TRUE(
-      CheckGetBestImpl(lib_api_info, "DoStuffGpu", "TPU", "DoStuffCpu"));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu", {"DoStuffGpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu", {"DoStuffCpu"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffCpu_gradient",
+                             {"DoStuffGpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "DoStuffGpu_gradient",
+                             {"DoStuffCpu_gradient"}));
+  EXPECT_TRUE(CheckEquivImpl(lib_api_info, "Undefined", {}));
 }
 
 TEST(FunctionApiInfoTest, MismatchedArguments) {
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer.cc b/tensorflow/core/grappler/optimizers/function_optimizer.cc
index f99826ddcad1fefc37fb5a78ee0dd63e14b17ce8..8beebb90496005dea556ec90de24072a6e6fd9b6 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer.cc
@@ -16,11 +16,17 @@ limitations under the License.
 #include "tensorflow/core/grappler/optimizers/function_optimizer.h"
 
 #include <unordered_map>
+#include <vector>
 
+#include "absl/algorithm/container.h"
+#include "absl/memory/memory.h"
 #include "absl/strings/str_replace.h"
 #include "absl/strings/substitute.h"
+#include "tensorflow/core/common_runtime/device.h"
 #include "tensorflow/core/common_runtime/device_mgr.h"
+#include "tensorflow/core/common_runtime/device_set.h"
 #include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/common_runtime/placer.h"
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/attr_value_util.h"
 #include "tensorflow/core/framework/function.h"
@@ -31,6 +37,7 @@ limitations under the License.
 #include "tensorflow/core/framework/op_def.pb.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/graph/graph_constructor.h"
+#include "tensorflow/core/graph/tensor_id.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
@@ -107,6 +114,44 @@ AttrSlice FunctionInstantiationAttributes(const FunctionDef& func,
   }
 }
 
+// This is a fake device that should not be used for any op kernel execution,
+// the only purpose of this device is to be passed as a part of DeviceSet to the
+// Placer.
+class FakeDevice : public Device {
+ public:
+  FakeDevice(Env* env, const string& device) : Device(env, attr(device)) {}
+  explicit FakeDevice(const string& device) : FakeDevice(nullptr, device) {}
+  Status Sync() override { return Status::OK(); }
+
+ private:
+  static DeviceAttributes attr(const string& device) {
+    DeviceNameUtils::ParsedName parsed_name;
+    bool parsed = DeviceNameUtils::ParseFullName(device, &parsed_name);
+    DCHECK(parsed) << "Failed to parse full device name: " << device;
+
+    DeviceAttributes attr;
+    attr.set_name(device);
+    attr.set_device_type(parsed_name.type);
+    return attr;
+  }
+};
+
+// -------------------------------------------------------------------------- //
+// Function specialization.
+//
+// FunctionDef is somewhat similar to function template in C++, given all the
+// type parameters (and attribute values) it generates a statically defined
+// graph from the type parametrized "graph template" (function body).
+//
+// Function specialization instantiates a parametrized FunctionDef into a
+// statically defined graph, and then converts it back to the fully defined
+// FunctionDef (it doesn't have any unknown type parameters or attribute
+// values, known as placeholders).
+//
+// Given the fully specified graph we can apply all the Grappler optimizers to
+// it (see details in MetaOptimizer). Also we can push known constant inputs
+// into the function body, and remove unused outputs/inputs.
+
 // Specialized function instantiation type parameters, body parameters, and
 // const inputs.
 struct FunctionSpecializationSignature {
@@ -206,25 +251,27 @@ struct FunctionSpecialization {
   std::vector<std::pair<int, int>> output_mapping;
 };
 
-class FakeCPUDevice : public Device {
- public:
-  FakeCPUDevice(Env* env, const DeviceAttributes& attr) : Device(env, attr) {}
-  Status Sync() override { return Status::OK(); }
-};
-
 class FunctionOptimizerContext {
  public:
   explicit FunctionOptimizerContext(RewriterConfig::Toggle opt_level,
                                     const GrapplerItem& item)
       : grappler_item_id_(item.id),
         graph_version_(item.graph.versions().producer()),
+        opt_level_(opt_level),
+        allowed_optimizations_(item.allowed_optimizations()),
         function_library_(OpRegistry::Global(), item.graph.library()),
+        available_device_names_(item.devices().begin(), item.devices().end()),
         graph_view_(&item.graph) {
     InitializeTrulyConstNodes(item);
-    InitializeInlinedFunctions(opt_level, item);
     InitializeFetchNodes(item);
   }
 
+  const RewriterConfig::Toggle opt_level() const { return opt_level_; }
+
+  const GrapplerItem::AllowedOptimizations& allowed_optimizations() const {
+    return allowed_optimizations_;
+  }
+
   const FunctionLibraryDefinition& function_library() const {
     return function_library_;
   }
@@ -238,9 +285,13 @@ class FunctionOptimizerContext {
     return flr_;
   }
 
-  const gtl::FlatMap<string, std::vector<std::pair<int, int>>>&
-  output_mappings() const {
-    return output_mappings_;
+  const gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>&
+  tensor_mapping() const {
+    return tensor_mapping_;
+  }
+
+  const gtl::FlatMap<string, std::vector<string>>& control_overrides() const {
+    return control_overrides_;
   }
 
   const GraphView& graph_view() const { return graph_view_; }
@@ -249,12 +300,20 @@ class FunctionOptimizerContext {
 
   const gtl::FlatSet<string>& fetch_tensors() const { return fetch_tensors_; }
 
-  bool IsFetchNode(const string& node_name) const {
-    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
+  const DeviceSet* devices() const {
+    // Create fake devices lazily only if we need a DeviceSet.
+    if (available_devices_.empty() && !available_device_names_.empty()) {
+      for (const string& name : available_device_names_) {
+        auto device = absl::make_unique<FakeDevice>(name);
+        available_device_set_.AddDevice(device.get());
+        available_devices_.push_back(std::move(device));
+      }
+    }
+    return &available_device_set_;
   }
 
-  bool IsInlinedFunction(const string& name) const {
-    return inlined_functions_.count(name) > 0;
+  bool IsFetchNode(const string& node_name) const {
+    return fetch_nodes_.find(node_name) != fetch_nodes_.end();
   }
 
   bool IsTrulyConst(const string& name) const {
@@ -265,11 +324,6 @@ class FunctionOptimizerContext {
     return gtl::FindWithDefault(truly_const_nodes_, name, nullptr);
   }
 
-  // Find inlining candidate by name. Return nullptr if not found.
-  const FunctionDef* FindInlinedFunction(const string& name) const {
-    return gtl::FindWithDefault(inlined_functions_, name, nullptr);
-  }
-
   const FunctionSpecialization* FindFunctionSpecialization(
       const FunctionSpecializationSignature& sig) const {
     return gtl::FindOrNull(specialized_functions_, sig);
@@ -280,20 +334,33 @@ class FunctionOptimizerContext {
     specialized_functions_.emplace(sig, specialized_func);
   }
 
-  void AddOutputMapping(const string& func_node,
-                        const FunctionSpecialization& specialized_func) {
-    output_mappings_.emplace(func_node, specialized_func.output_mapping);
+  void AddTensorMapping(const SafeTensorId& from, const SafeTensorId& to) {
+    auto inserted = tensor_mapping_.insert({from, to});
+    DCHECK(inserted.second)
+        << "Failed to insert duplicated tensor mapping: "
+        << "from=" << from.ToString() << " to=" << to.ToString();
   }
 
-  // Return true if we had any specialized function that changed it's output
-  // mapping, and it's required to update output consumers to new ports ids.
-  bool RequiresOutputMapping() const {
-    for (const auto& m1 : output_mappings_) {
-      for (const std::pair<int, int>& m2 : m1.second) {
-        if (m2.first != m2.second) return true;
+  void AddTensorMapping(const string& func_node,
+                        const FunctionSpecialization& specialized_func) {
+    for (const auto& pair : specialized_func.output_mapping) {
+      int from_idx = pair.first;
+      int to_idx = pair.second;
+      if (from_idx != to_idx) {
+        SafeTensorId from_tensor(func_node, from_idx);
+        SafeTensorId to_tensor(func_node, to_idx);
+        auto inserted = tensor_mapping_.insert({from_tensor, to_tensor});
+        DCHECK(inserted.second);
       }
     }
-    return false;
+  }
+
+  void AddControlOverrides(const NodeDef& func_node,
+                           const std::vector<string>& control_overrides) {
+    control_overrides_[func_node.name()].reserve(control_overrides.size());
+    for (const string& control_override : control_overrides) {
+      control_overrides_[func_node.name()].push_back(control_override);
+    }
   }
 
  private:
@@ -310,26 +377,6 @@ class FunctionOptimizerContext {
     }
   }
 
-  void InitializeInlinedFunctions(RewriterConfig::Toggle opt_level,
-                                  const GrapplerItem& item) {
-    bool aggressive = opt_level == RewriterConfig::AGGRESSIVE;
-
-    for (const FunctionDef& func : item.graph.library().function()) {
-      // Can't create IdentityN nodes with no input or output: skip these
-      // functions for now.
-      if (func.signature().input_arg_size() == 0 ||
-          func.signature().output_arg_size() == 0) {
-        continue;
-      }
-      bool marked_noinline = MarkedNoInline(func);
-      bool marked_specialized = MarkedSpecialized(func);
-
-      if (!marked_specialized && (!marked_noinline || aggressive)) {
-        inlined_functions_[func.signature().name()] = &func;
-      }
-    }
-  }
-
   void InitializeFetchNodes(const GrapplerItem& item) {
     for (const string& fetch : item.fetch) {
       fetch_tensors_.insert(fetch);
@@ -340,22 +387,22 @@ class FunctionOptimizerContext {
   void InitializeFunctionLibraryRuntime() {
     if (!flr_) {
       Env* env = Env::Default();
-      DeviceAttributes attr;
-      attr.set_name("/device:CPU:0");
-      attr.set_device_type("CPU");
-      Device* device = new FakeCPUDevice(env, attr);
-      device_mgr_.reset(new DeviceMgr({device}));
+      std::vector<std::unique_ptr<Device>> devices;
+      devices.push_back(absl::make_unique<FakeDevice>(env, "/device:CPU:0"));
+      device_mgr_ = absl::make_unique<DeviceMgr>(std::move(devices));
       OptimizerOptions optimizer_opts;
       optimizer_opts.set_do_function_inlining(true);
       process_flr_.reset(new ProcessFunctionLibraryRuntime(
           device_mgr_.get(), env, graph_version_, &function_library_,
           optimizer_opts));
-      flr_ = process_flr_->GetFLR(device->name());
+      flr_ = process_flr_->GetFLR(device_mgr_->ListDevices()[0]->name());
     }
   }
 
   const string grappler_item_id_;
   const int graph_version_;
+  const RewriterConfig::Toggle opt_level_;
+  const GrapplerItem::AllowedOptimizations allowed_optimizations_;
   FunctionLibraryDefinition function_library_;
 
   // These fields initialized lazily only if needed.
@@ -363,8 +410,16 @@ class FunctionOptimizerContext {
   std::unique_ptr<ProcessFunctionLibraryRuntime> process_flr_;
   FunctionLibraryRuntime* flr_ = nullptr;
 
-  // Functions that can be inlined into optimized graph.
-  std::unordered_map<string, const FunctionDef*> inlined_functions_;
+  // Fully defined names of the devices available to the GrapplerItem.
+  const gtl::FlatSet<string> available_device_names_;
+
+  // List of available `FakedDevices` (lazily initialized, see devices()).
+  mutable std::vector<std::unique_ptr<Device>> available_devices_;
+
+  // DeviceSet of fake devices (`FakeDevice`) constructed from
+  // available_devices_ (lazily initialized).
+  mutable DeviceSet available_device_set_;
+
   // Nodes that are Const and not in feed.
   std::unordered_map<string, const NodeDef*> truly_const_nodes_;
   // Specialized functions.
@@ -377,9 +432,23 @@ class FunctionOptimizerContext {
   gtl::FlatSet<string> fetch_tensors_;  // format: node_name:port
   gtl::FlatSet<string> fetch_nodes_;    // format: node_name
 
-  // Output mappings that have to be applied to the graph after all functions
-  // are specialized (node name -> output mappings).
-  gtl::FlatMap<string, std::vector<std::pair<int, int>>> output_mappings_;
+  // After function inlining and specialization, the optimized graph might be in
+  // invalid state, nodes can read from non-existing function call nodes that
+  // were inlined, or they can read from output index that is no longer valid
+  // after unused outputs pruning.
+  //
+  // Tensor mapping that has to be applied to the graph after all functions
+  // optimizations (invalidated tensor id -> optimized graph tensor id).
+  gtl::FlatMap<SafeTensorId, SafeTensorId, SafeTensorId::Hasher>
+      tensor_mapping_;
+
+  // When we inline a function into the optimized graph, we no longer have the
+  // function call node to anchor control dependencies. Instead we must expand
+  // each function call control output edge into multiple control dependencies
+  // to all side-effectful ops inside the function body.
+  //
+  // Invalidated function call node name -> Inlined side-effectful nodes
+  gtl::FlatMap<string, std::vector<string>> control_overrides_;
 
   // Use graph view to find active outputs of the function caller nodes.
   GraphView graph_view_;
@@ -387,6 +456,22 @@ class FunctionOptimizerContext {
   TF_DISALLOW_COPY_AND_ASSIGN(FunctionOptimizerContext);
 };
 
+// Returns a pointer to the called function definition iff the given node is
+// indeed a function call. Otherwise returns nullptr.
+const FunctionDef* FindFunctionCall(const FunctionOptimizerContext& ctx,
+                                    const NodeDef& node) {
+  // Check if a node does indirect function call via PartitionedCallOp.
+  if (IsPartitionedCall(node) || IsStatefulPartitionedCall(node)) {
+    const AttrValue* func_attr = AttrSlice(node).Find("f");
+    return (func_attr != nullptr && func_attr->has_func())
+               ? ctx.function_library().Find(func_attr->func().name())
+               : nullptr;
+  }
+
+  // Check if the function op itself is a function name.
+  return ctx.function_library().Find(node.op());
+}
+
 gtl::FlatSet<int> GetActiveOutputs(const NodeDef& node,
                                    const FunctionOptimizerContext& ctx,
                                    int size_hint = 0) {
@@ -414,7 +499,7 @@ bool HasTrulyConstInputs(const NodeDef& node,
   const auto is_truly_const = [&ctx](const string& input) {
     return ctx.IsTrulyConst(NodeName(input));
   };
-  return std::any_of(node.input().begin(), node.input().end(), is_truly_const);
+  return absl::c_any_of(node.input(), is_truly_const);
 }
 
 bool HasUnusedOutputs(const NodeDef& func_node, const FunctionDef& func,
@@ -605,6 +690,9 @@ Status UpdateSpecializedFunctionNode(
   // 2. Remove inputs corresponding to the pushed down consts.
   RemovePushedDownConstInputs(specialization, specialized_func_node);
 
+  // NOTE: PartitionedCallOp has `Tin` and `Tout` attributes for input/output
+  // types, that must be in sync with updated function signature.
+
   // 3. Update input types for the indirect function calls.
   if (is_indirect_call) {
     RemovePushedDownConstInputTypes(specialization, func_node,
@@ -693,7 +781,7 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
     TF_RETURN_IF_ERROR(UpdateSpecializedFunctionNode(
         func, func_node, *already_specialized, specialized_func_node));
 
-    ctx->AddOutputMapping(specialized_func_node->name(), *already_specialized);
+    ctx->AddTensorMapping(specialized_func_node->name(), *already_specialized);
 
     return Status::OK();
   }
@@ -755,7 +843,98 @@ Status SpecializeFunction(const NodeDef& func_node, const FunctionDef& func,
       func, func_node, func_specialization, specialized_func_node));
 
   ctx->AddSpecializedFunction(signature, func_specialization);
-  ctx->AddOutputMapping(specialized_func_node->name(), func_specialization);
+  ctx->AddTensorMapping(specialized_func_node->name(), func_specialization);
+
+  return Status::OK();
+}
+
+// -------------------------------------------------------------------------- //
+// Inline direct functions calls.
+//
+// When we inline direct function calls, we instantiate the function body from
+// its FunctionDef and caller node attributes, and embed the instantiated graph
+// into the "main graph". When we do that, we must preserve the function call
+// semantics:
+//
+// 1) All input nodes must be executed before any of function body nodes will
+//    start executing.
+// 2) All function body nodes must be executed before any of the nodes, reading
+//    outputs of the function will start executing.
+// 3) All nodes with side effects inside a function must be executed, this is
+//    different from the nodes with side effects in the main graph, that can be
+//    pruned if they are not in transitive dependency set of any of the fetch
+//    nodes.
+// 4) All nodes of the function body must be execute on the device specified by
+//    the function caller node.
+//
+// To guarantee that function call semantics are preserved after inlining, we
+// insert an IdentityN node before the inlined function body, and hook all
+// inputs into that, and we insert another IdentityN node to hook all function
+// outputs to it.
+
+// Returns `Status::OK()` iff `node` is a direct function call of `func`, and we
+// know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableDirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                     const FunctionDef& func,
+                                     const NodeDef& func_node) {
+  // Indirect function calls (PartitionedCallOp) have automatic control
+  // dependencies and inlined separately from direct function calls.
+  if (!IsDirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
+
+  // For direct function  calls we insert IdentityN nodes before/after inlined
+  // function body to preserve function call semantics (all inputs evaluated
+  // before function evaluation starts, and all function body nodes finished
+  // before output consumed by other nodes).
+  if (func.signature().input_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty inputs: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // TODO(ezhulenev): Relax constraint on output args?
+  if (func.signature().output_arg_size() == 0) {
+    return errors::FailedPrecondition(
+        "Can't inline direct function call with empty outputs: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function must execute all the nodes in a function body that might have side
+  // effects. After inlining these nodes into the main graph, we can no longer
+  // guarantee that. For now we disable inlining functions with side effects.
+  //
+  // Attaching control dependency to the output IdentityN node is not safe,
+  // because it might be split or pruned in a later optimization pass.
+  //
+  // Indirect function calls (via PartitionedCallOp) have automatic dependency
+  // tracking, and allow us to safely inline functions with side effects.
+  bool has_side_effects =
+      absl::c_any_of(func.node_def(), [&ctx](const NodeDef& node) {
+        return !IsFreeOfSideEffect(node, &ctx.function_library());
+      });
+  if (has_side_effects) {
+    return errors::FailedPrecondition(
+        "Can't inline function with side-effects in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We ignore `_noinline` marker in aggressive mode.
+  bool aggressive = ctx.opt_level() == RewriterConfig::AGGRESSIVE;
+  if (MarkedNoInline(func) && !aggressive) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
 
   return Status::OK();
 }
@@ -802,16 +981,13 @@ NodeDef InlinedFunctionOutputsNode(const NodeDef& func_node,
   return outputs;
 }
 
-Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
-                      const FunctionOptimizerContext& ctx,
-                      const int graph_def_version, GraphDef* optimized_graph) {
-  VLOG(2) << "Inline function instantiation: " << SummarizeNodeDef(func_node);
-
-  // Specialized function call kernels might have behavior that is not
-  // representable in a graph (e.g. runtime ops device placing).
-  if (!IsDirectFunctionCall(func, func_node)) {
-    return errors::InvalidArgument("Can't inline indirect function call");
-  }
+Status InlineDirectFunctionCall(const NodeDef& func_node,
+                                const FunctionDef& func,
+                                const int graph_def_version,
+                                const FunctionOptimizerContext& ctx,
+                                GraphDef* optimized_graph) {
+  VLOG(2) << "Inline direct function call: " << SummarizeNodeDef(func_node);
+  TF_RETURN_IF_ERROR(IsInlinableDirectFunctionCall(ctx, func, func_node));
 
   const AttrSlice func_instantiation_attr =
       FunctionInstantiationAttributes(func, func_node);
@@ -874,21 +1050,35 @@ Status InlineFunction(const NodeDef& func_node, const FunctionDef& func,
     // Make sure the node is placed.
     func_body_node.set_device(func_node.device());
 
-    // Check if a body node is itself a function.
-    const FunctionDef* func_body_node_func =
-        ctx.FindInlinedFunction(func_body_node.op());
-    if (func_body_node_func != nullptr) {
-      // Recursively inline function calls.
-      TF_RETURN_IF_ERROR(InlineFunction(func_body_node, *func_body_node_func,
-                                        ctx, graph_def_version,
-                                        optimized_graph));
-    } else {
+    // Move the function body node to the optimized graph.
+    const auto move_node_to_optimized_graph = [&]() {
       // Annotate the node with the function attributes.
       for (const auto& attr : func.attr()) {
         func_body_node.mutable_attr()->insert(attr);
       }
       // Move the node to the main graph.
       optimized_graph->add_node()->Swap(&func_body_node);
+    };
+
+    // Check if a body node is itself a function call and can be inlined.
+    const FunctionDef* func_body_node_func =
+        FindFunctionCall(ctx, func_body_node);
+
+    if (func_body_node_func != nullptr) {
+      Status inlinable = IsInlinableDirectFunctionCall(
+          ctx, *func_body_node_func, func_body_node);
+      if (inlinable.ok()) {
+        TF_RETURN_IF_ERROR(
+            InlineDirectFunctionCall(func_body_node, *func_body_node_func,
+                                     graph_def_version, ctx, optimized_graph));
+      } else {
+        VLOG(2) << "Can't inline nested direct function call: "
+                << inlinable.error_message();
+        move_node_to_optimized_graph();
+      }
+
+    } else {
+      move_node_to_optimized_graph();
     }
   }
 
@@ -995,9 +1185,336 @@ Status InlineSymbolicGradient(const NodeDef& node,
   return Status::OK();
 }
 
+// -------------------------------------------------------------------------- //
+// Inline indirect functions calls (aka PartitionedCallOp).
+//
+// When we inline indirect function calls, we instantiate the function body from
+// its FunctionDef and caller node attributes, and embed the instantiated graph
+// into the "main graph".
+//
+// In contrast to direct function calls, `PartitionedCallOp` has automatic
+// dependency tracking via input/output control edges, and we relax some of the
+// constraints that we have for direct function call inlining.
+//
+// "When a `PartitionedCallOp` function has a resource (DT_RESOURCE data type)
+// input argument it "captures" the mutable resource.  This is implemented by
+// automatically adding a incoming control edge from the previous side-effectful
+// op touching that resource, and an outgoing control edge to the next
+// side-effectful op using the same resource. This serializes the mutations of
+// the resource to make graph execution deterministic.
+//
+// Function call inlining must preserve side effect visibility:
+//
+// 1) All side effects to the captured resources, that happened before function
+//    call must be visible to the function body nodes using that resources.
+// 2) All side effects to the captured resources, that happened inside function
+//    body, must be visible to every op/function using that resource after the
+//    function call completed.
+
+// To guarantee that these properties are preserved after inlining we do:
+//
+// 1) Forward all input control dependencies from the function call node to the
+//    inlined function inputs (Identity nodes).
+// 2) Each side-effectful op inside function body adds itself as a control
+//    dependency to all the nodes in output control set of function call node.
+//
+// We do not add any other control dependencies to/from function body nodes,
+// because they are pure functions of input tensors, and can be freely
+// reordered.
+
+// Returns `Status::OK()` iff `node` is an indirect function call of `func`, and
+// we know how to inline it into the main graph, otherwise returns and error
+// indicating why the function call is not inlinable.
+Status IsInlinableIndirectFunctionCall(const FunctionOptimizerContext& ctx,
+                                       const FunctionDef& func,
+                                       const NodeDef& func_node) {
+  // We inline direct function calls above, using different rules.
+  if (!IsIndirectFunctionCall(func, func_node)) {
+    return errors::InvalidArgument("Unsupported function call type: ",
+                                   SummarizeNodeDef(func_node));
+  }
+
+  if (MarkedNoInline(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function marked with '_noinline': ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // Function specialization and inlining must be mutually exclusive.
+  if (MarkedSpecialized(func)) {
+    return errors::FailedPrecondition(
+        "Can't inline function created in Grappler function specialization: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions that are in a fetch set, because it would
+  // invalidate fetch tensors (function call node fully inlined and doesn't
+  // exist in the optimized graph).
+  if (ctx.IsFetchNode(func_node.name())) {
+    return errors::FailedPrecondition(
+        "Can't inline function in a Grappler item fetch set: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  // We can't inline functions with `Switch` nodes in the function body, because
+  // they might have dead tensors as a function output argument (we need all
+  // intermediate tensors to compute the function gradient). `PartitionedCallOp`
+  // invokes functions with `allow_dead_tensors = true` to reset dead flag,
+  // and return default initialized tensors instead of a dead tensors.
+  // TODO(ezhulenev): Do the liveness analysis and add
+  // `IdentitytWithResurrection` nodes after all potentially dead output
+  // tensors?
+  if (absl::c_any_of(func.node_def(), IsSwitch)) {
+    return errors::FailedPrecondition(
+        "Can't inline function with `Switch` nodes in the function body: ",
+        SummarizeNodeDef(func_node));
+  }
+
+  return Status::OK();
+}
+
+Status InlineIndirectFunctionCall(const NodeDef& func_node,
+                                  const FunctionDef& func,
+                                  const int graph_def_version,
+                                  FunctionOptimizerContext* ctx,
+                                  GraphDef* optimized_graph) {
+  VLOG(2) << "Inline indirect function call: " << SummarizeNodeDef(func_node);
+  TF_RETURN_IF_ERROR(IsInlinableIndirectFunctionCall(*ctx, func, func_node));
+
+  const AttrSlice func_instantiation_attr =
+      FunctionInstantiationAttributes(func, func_node);
+
+  GrapplerFunctionItem item;
+  Status item_status = MakeGrapplerFunctionItem(func, func_instantiation_attr,
+                                                ctx->function_library(),
+                                                graph_def_version, &item);
+
+  if (!item_status.ok()) {
+    return errors::InvalidArgument("Failed to inline function ", func_node.op(),
+                                   " instantiated by ", func_node.name(),
+                                   ". Error: ", item_status.error_message());
+  }
+
+  GraphView::InputPort control_input_port =
+      ctx->graph_view().GetInputPort(func_node.name(), Graph::kControlSlot);
+  GraphView::OutputPort control_output_port =
+      ctx->graph_view().GetOutputPort(func_node.name(), Graph::kControlSlot);
+
+  // Nodes that have side effects to the captured resources.
+  std::vector<string> happens_before;
+  absl::c_transform(
+      ctx->graph_view().GetFanin(control_input_port),
+      std::back_inserter(happens_before),
+      [](const GraphView::OutputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens before set (size = " << happens_before.size()
+          << "): " << absl::StrJoin(happens_before, ", ");
+
+  // Nodes that must observe side effects to the captured resources.
+  std::vector<string> happens_after;
+  absl::c_transform(
+      ctx->graph_view().GetFanout(control_output_port),
+      std::back_inserter(happens_after),
+      [](const GraphView::InputPort port) { return port.node->name(); });
+
+  VLOG(3) << "Happens after set (size = " << happens_after.size()
+          << "): " << absl::StrJoin(happens_after, ", ");
+
+  // Regular (positional) inputs to the function call.
+  std::vector<SafeTensorId> inputs;
+  for (const string& input : func_node.input()) {
+    SafeTensorId tensor_id = ParseTensorName(input);
+    if (tensor_id.index() == Graph::kControlSlot) break;
+    inputs.push_back(tensor_id);
+  }
+
+  // If we have a node inside the function body without inputs (e.g. Const), we
+  // must attach a control dependency to it, to make sure that if a function
+  // call happens inside a loop, the node will be evaluated in correct frame.
+  //
+  // If the function call node has no inputs and no control dependencies, it
+  // means that it can't be a function call inside a loop, and we can safely
+  // insert that node without inputs into the main graph.
+  //
+  // TODO(ezhulenev): Use FrameMap (see grappler/utils/frame.h) to find out if
+  // the function is called inside a loop.
+  std::vector<string> empty_inputs_hook;
+  if (!item.inputs().empty()) {
+    const InputArgExpansion& arg0 = item.inputs()[0];
+    DCHECK(!arg0.placeholders.empty());
+    empty_inputs_hook.push_back(AsControlDependency(AddPrefixToNodeName(
+        arg0.placeholders[0], /*prefix=*/func_node.name())));
+  } else if (!happens_before.empty()) {
+    empty_inputs_hook.push_back(AsControlDependency(happens_before[0]));
+  }
+
+  // Mapping from input placeholder name to function input position.
+  int idx = 0;
+  absl::flat_hash_map<absl::string_view, int> input_placeholders_idx;
+  for (const InputArgExpansion& input_arg : item.inputs()) {
+    for (const string& placeholder : input_arg.placeholders) {
+      input_placeholders_idx[placeholder] = idx++;
+    }
+  }
+
+  const string prefix = strings::StrCat(func_node.name(), "/");
+
+  // ------------------------------------------------------------------------ //
+  // First we need to assign device placements to all function body nodes.
+
+  GraphDef placed_graph_def;
+
+  const DeviceSet* devices = ctx->devices();
+
+  if (devices->devices().empty()) {
+    // If there are no devices available for placer, we just put all nodes to
+    // the same device as a function caller node. This can happen if Grappler is
+    // running "offline", without active runtime session, for example as a part
+    // of a batch job for graph analysis/optimization.
+    VLOG(3) << "Assign function call node device to all function body nodes. "
+            << "Device: " << func_node.device();
+    placed_graph_def = item.mutable_function_body();
+    for (NodeDef& node : *placed_graph_def.mutable_node()) {
+      node.set_device(func_node.device());
+    }
+  } else {
+    // If we are running in an active runtime session, Grappler will get the
+    // graph after initial placing is done, and we should have devices for the
+    // placer.
+    VLOG(3) << "Run placer for instantiated function body. Devices: ["
+            << absl::StrJoin(
+                   devices->devices(), ", ",
+                   [](string* out, const Device* d) { out->append(d->name()); })
+            << "]";
+
+    // Construct a Graph object from the instantiated function body.
+    GraphConstructorOptions opts;
+    Graph graph(ctx->function_library());
+    TF_RETURN_IF_ERROR(
+        ConvertGraphDefToGraph(opts, item.function_body(), &graph));
+
+    // Use function caller node device as a default for placer.
+    const Device* default_device =
+        devices->FindDeviceByName(func_node.device());
+
+    Placer placer(&graph, devices, nullptr, /* No session options */
+                  default_device);
+    TF_RETURN_IF_ERROR(placer.Run());
+
+    // Convert Graph back to the GraphDef.
+    graph.ToGraphDef(&placed_graph_def);
+  }
+
+  // ------------------------------------------------------------------------ //
+  // After all nodes placed we need to prepare them for inlining into the
+  // optimized graph: turn placeholders into identities, update nodes
+  // connectivity, etc...
+
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (item.IsInputPlaceholder(func_body_node.name())) {
+      // Turn input placeholders into identity node.
+      DCHECK_EQ(0, func_body_node.input_size());
+      func_body_node.set_op("Identity");
+      (*func_body_node.mutable_attr())["T"] = func_body_node.attr().at("dtype");
+      func_body_node.mutable_attr()->erase("dtype");
+      func_body_node.mutable_attr()->erase("shape");
+      int input_idx = input_placeholders_idx[func_body_node.name()];
+      func_body_node.add_input(strings::StrCat(inputs[input_idx].ToString()));
+
+      // All side effects must happen before inputs can start executing.
+      for (const string& hb_node : happens_before) {
+        func_body_node.add_input(AsControlDependency(hb_node));
+      }
+
+    } else {
+      // Update inputs of the regular function body nodes.
+      for (string& input : *func_body_node.mutable_input()) {
+        input = AddPrefixToNodeName(input, /*prefix=*/func_node.name());
+      }
+      if (func_body_node.input_size() == 0 && !empty_inputs_hook.empty()) {
+        *func_body_node.add_input() = empty_inputs_hook[0];
+      }
+    }
+
+    // Add the function node name as a prefix 1) to node name to avoid
+    // collisions; 2) to frame name to avoid multiple LoopCond nodes in one
+    // frame after inlining.
+    TF_RETURN_IF_ERROR(
+        AddPrefixAndSuffixToNode(prefix, /*suffix=*/"", &func_body_node));
+
+    // After inlining into the optimized graph, NodeDef must have all attributes
+    // defined, which is not required for a node in a FunctionDef.
+    const OpDef* op_def;
+    TF_RETURN_IF_ERROR(
+        ctx->function_library().LookUpOpDef(func_body_node.op(), &op_def));
+    AddDefaultsToNodeDef(*op_def, &func_body_node);
+  }
+
+  // Construct a graph view for the preprocessed function body graph.
+  GraphView placed_graph_view(&placed_graph_def);
+
+  // Keep track of side-effectful ops inside function body. Each outgoing
+  // control edge from the function call node, must be replaced with control
+  // edges from inlined side-effectful ops.
+  std::vector<string> side_effectful_nodes;
+
+  // We have to make sure that all side-effectful nodes inside a function body
+  // will be executed after function inlining.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    if (!IsFreeOfSideEffect(func_body_node, &ctx->function_library())) {
+      int num_fanouts = placed_graph_view.NumFanouts(
+          func_body_node, /*include_controlling_nodes=*/true);
+
+      // If the node doesn't have any outgoing edges and we do not have any
+      // nodes in the `happens_after` set, we can't inline a function and
+      // guarantee that side-effects will be executed. The only exception if we
+      // do function library optimization, and the GrapplerItem was constructed
+      // for the function body, because functions have strict semantics.
+
+      if (num_fanouts == 0 && happens_after.empty() &&
+          !ctx->allowed_optimizations().inline_ops_with_side_effects) {
+        return errors::Internal(
+            "Can't inline a function with a side-effectful op with empty "
+            "fanouts and empty output control edge set. Function body node: ",
+            SummarizeNodeDef(func_body_node));
+      }
+
+      side_effectful_nodes.push_back(func_body_node.name());
+    }
+  }
+
+  // Move all the nodes to the optimized graph after successful preprocessing.
+  for (NodeDef& func_body_node : *placed_graph_def.mutable_node()) {
+    optimized_graph->add_node()->Swap(&func_body_node);
+  }
+
+  // TODO(ezhulenev): Inline nested indirect function calls.
+
+  // Indirect function call is fully inlined into the optimized graph, and we do
+  // not copy the original function call node, so we have to setup tensor
+  // mapping from old output tensors, to the outputs of inlined nodes.
+  int output_idx = 0;
+  for (const OutputArgExpansion& output : item.outputs()) {
+    for (const string& output_tensor : output.output_tensors) {
+      const SafeTensorId from_tensor(func_node.name(), output_idx++);
+      const SafeTensorId to_tensor = ParseTensorName(
+          AddPrefixToNodeName(output_tensor, /*prefix=*/func_node.name()));
+      ctx->AddTensorMapping(from_tensor, to_tensor);
+    }
+  }
+
+  // After inlining we'll have to forward all control dependencies from function
+  // call node to all side-effectful ops inside function body.
+  ctx->AddControlOverrides(func_node, side_effectful_nodes);
+
+  VLOG(3) << "Successfully inlined indirect function call: "
+          << SummarizeNodeDef(func_node);
+  return Status::OK();
+}
+
 }  // namespace
 
-Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
+Status FunctionOptimizer::Optimize(Cluster*, const GrapplerItem& item,
                                    GraphDef* optimized_graph) {
   // Nothing to do here.
   if (item.graph.library().function_size() == 0) {
@@ -1012,8 +1529,6 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   bool specialize_func = options_.enable_function_specialization;
 
   for (const NodeDef& node : item.graph.node()) {
-    const string op_name = node.op();
-
     // Each node optimization can modify optimized graph only by adding new
     // nodes, we can check node size to make sure that graph was not modified.
     const int num_nodes_before = optimized_graph->node_size();
@@ -1042,11 +1557,13 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     // 1. Inline symbolic gradients into the optimized graph.                 //
     // ---------------------------------------------------------------------- //
 
-    if (op_name == "SymbolicGradient" && inline_gradients) {
-      // Inline symbolic gradients only if the corresponding function is inlined
+    if (IsSymbolicGradient(node) && inline_gradients) {
+      // Inline symbolic gradients only if the corresponding function is not
+      // marked as `_noinline`.
       const auto* f_attr = gtl::FindOrNull(node.attr(), "f");
-      string f_name = f_attr != nullptr ? f_attr->func().name() : "";
-      if (ctx.IsInlinedFunction(f_name)) {
+      const string f_name = f_attr != nullptr ? f_attr->func().name() : "";
+      const FunctionDef* func = ctx.function_library().Find(f_name);
+      if (func && !MarkedNoInline(*func)) {
         TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
             InlineSymbolicGradient(node, &ctx, optimized_graph));
         continue;
@@ -1054,28 +1571,52 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
     }
 
     // ---------------------------------------------------------------------- //
-    // 2. Inline or specialize direct function calls.                         //
+    // 2. Inline or specialize function calls.                                //
     // ---------------------------------------------------------------------- //
 
-    const FunctionDef* func = ctx.function_library().Find(op_name);
+    // Find if a node is a function call (direct or indirect).
+    const FunctionDef* func = FindFunctionCall(ctx, node);
+
     if (func != nullptr) {
-      // 2a. Inline it if it's allowed to do so.
-      if (inline_func && ctx.IsInlinedFunction(op_name)) {
-        // Inline function body into the optimized graph}
-        TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(
-            InlineFunction(node, *func, ctx, item.graph.versions().producer(),
-                           optimized_graph));
-        continue;
+      const string& func_name = func->signature().name();
+      const int graph_def_version = item.graph.versions().producer();
+
+      const bool is_direct_func = IsDirectFunctionCall(*func, node);
+      const bool is_indirect_func = IsIndirectFunctionCall(*func, node);
+
+      // 2a. Inline direct function call if it's inlinable.
+      if (inline_func && is_direct_func) {
+        Status inlinable = IsInlinableDirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineDirectFunctionCall(
+              node, *func, graph_def_version, ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
       }
 
-      // Do not specialize if function has custom gradient.
-      const string grad_func = ctx.function_library().FindGradient(op_name);
+      // 2b. Inline indirect function call if it's inlinable.
+      if (inline_func && is_indirect_func) {
+        Status inlinable = IsInlinableIndirectFunctionCall(ctx, *func, node);
+        if (inlinable.ok()) {
+          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(InlineIndirectFunctionCall(
+              node, *func, graph_def_version, &ctx, optimized_graph));
+          continue;
+        } else {
+          VLOG(2) << inlinable.error_message();
+        }
+      }
 
-      // 2b. Specialize it to it's instantiation context if can't be inlined,
+      // 2c. Specialize it to its instantiation context if can't be inlined,
       // and it has something worth specializing.
       bool specialization_worthy = IsParametrized(*func) ||
                                    HasTrulyConstInputs(node, ctx) ||
                                    HasUnusedOutputs(node, *func, ctx);
+
+      // Do not specialize if function has custom gradient.
+      const string grad_func = ctx.function_library().FindGradient(func_name);
+
       if (specialize_func && grad_func.empty() && specialization_worthy) {
         // TODO(ezhulenev): Specialize function call if input has a known shape.
         // Specialize function body for its instantiation attributes and inputs.
@@ -1086,41 +1627,6 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       }
     }
 
-    // ---------------------------------------------------------------------- //
-    // 3. Specialize indirect function calls through the PartitionedCallOp.   //
-    // ---------------------------------------------------------------------- //
-
-    bool is_partitioned_call =
-        IsPartitionedCall(node) || IsStatefulPartitionedCall(node);
-
-    // We can only specialize PartitionedCall ops. Inlining is not supported.
-    if (is_partitioned_call && specialize_func) {
-      const AttrValue* func_attr = AttrSlice(node).Find("f");
-      string indirect_func_name =
-          (func_attr != nullptr && func_attr->has_func())
-              ? func_attr->func().name()
-              : "";
-      const FunctionDef* indirect_func =
-          ctx.function_library().Find(indirect_func_name);
-
-      if (indirect_func != nullptr) {
-        // Do not specialize if function has custom gradient.
-        const string grad_func =
-            ctx.function_library().FindGradient(indirect_func_name);
-
-        // Specialize it to it's instantiation context.
-        bool specialization_worthy =
-            IsParametrized(*indirect_func) || HasTrulyConstInputs(node, ctx) ||
-            HasUnusedOutputs(node, *indirect_func, ctx);
-        if (grad_func.empty() && specialization_worthy) {
-          TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED(SpecializeFunction(
-              node, *indirect_func, item.graph.versions().producer(), &ctx,
-              optimized_graph));
-          continue;
-        }
-      }
-    }
-
     // ---------------------------------------------------------------------- //
     // If we reached this point, node was not handled by any of the stages
     // (inline, specialize), simply add a copy to the graph.
@@ -1129,29 +1635,73 @@ Status FunctionOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
 #undef TF_SKIP_ERROR_IF_GRAPH_UNMODIFIED
   }
 
-  // Function specialization might change the number of function outputs, so we
-  // have to process the final optimized graph and update all the node mapping.
-  if (ctx.RequiresOutputMapping()) {
-    MutableGraphView optimized_graph_view(optimized_graph);
-    for (const auto& output_mapping : ctx.output_mappings()) {
-      const auto& node_name = output_mapping.first;
-      const auto& mappings = output_mapping.second;
+  // After function specialization and inlining graph might be in invalid
+  // state, and some nodes can read tensors that do not exists anymore in the
+  // optimized graph: function call node was fully inlined into the graph, or
+  // output index was invalidated by the output pruning.
 
-      for (const std::pair<int, int>& mapping : mappings) {
-        int from = mapping.first;
-        int to = mapping.second;
+  if (!ctx.tensor_mapping().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      for (int idx = 0; idx < node.input_size(); ++idx) {
+        TensorId input_tensor = ParseTensorName(node.input(idx));
+        if (input_tensor.index() == Graph::kControlSlot) break;
 
-        // Get the output port corresponding to the old output position.
-        MutableGraphView::OutputPort from_port =
-            optimized_graph_view.GetOutputPort(node_name, from);
+        auto mapping = ctx.tensor_mapping().find(input_tensor);
+        if (mapping != ctx.tensor_mapping().end()) {
+          node.set_input(idx, mapping->second.ToString());
+        }
+      }
+    }
+  }
 
-        // Update all input ports that read from old output port.
-        for (MutableGraphView::InputPort to_port :
-             optimized_graph_view.GetFanout(from_port)) {
-          *to_port.node->mutable_input(to_port.port_id) =
-              strings::StrCat(node_name, ":", to);
+  // Function inlining instantiates function body directly into the optimized
+  // graph, and we might end up with control dependencies to the nodes that no
+  // longer exist in a graph. We need to apply control overrides to all
+  // invalidated nodes, and rewire control dependencies to the inlined
+  // side-effectful function body nodes.
+
+  // TODO(ezhulenev): With nested function call inlining, single pass over
+  // `control_overrides` might not bring the graph into a valid state,
+  // continue until it converges and all invalidated control dependencies
+  // removed.
+
+  if (!ctx.control_overrides().empty()) {
+    for (NodeDef& node : *optimized_graph->mutable_node()) {
+      // Keep track of new control inputs to the node.
+      gtl::FlatSet<string> add_ctrl_inputs;
+
+      // Remove all invalidated control inputs.
+      for (int idx = 0; idx < node.input_size(); /* see below */) {
+        // TODO(ezhulenev): Use non-allocating TensorId after migrating
+        // `control_overrides()` to absl::flat_hash_set.
+        SafeTensorId input_tensor = ParseTensorName(node.input(idx));
+
+        auto overrides = ctx.control_overrides().find(input_tensor.node());
+        if (overrides != ctx.control_overrides().end()) {
+          // If this happens it's a bug in the function inlining.
+          if (input_tensor.index() != Graph::kControlSlot) {
+            return errors::Internal(
+                "Illegal input edge from inlined function call node");
+          }
+          // Remove control dependency to the inlined function call node.
+          node.mutable_input()->SwapElements(idx, node.input_size() - 1);
+          node.mutable_input()->RemoveLast();
+
+          // Keep track of all overrides.
+          for (const string& override : overrides->second) {
+            add_ctrl_inputs.insert(AsControlDependency(override));
+          }
+        } else {
+          // Go to the next input only if the current one was not invalidated,
+          // otherwise we need to check the swapped input as well.
+          ++idx;
         }
       }
+
+      // Add overrides to the node inputs.
+      for (const string& ctrl_input : add_ctrl_inputs) {
+        node.add_input(ctrl_input);
+      }
     }
   }
 
diff --git a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
index 9bb51c2641937fab68600fecddc5cbc439d4f4b2..c971eec3f4dae5cc3457ad802700ee4f3086eb90 100644
--- a/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/function_optimizer_test.cc
@@ -108,7 +108,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SimpleFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -184,7 +184,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_SkipErrorsIfGraphNotModified) {
   item.fetch = {"z1"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -284,7 +284,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FixedTypeFunction) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -368,7 +368,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithOutputMapping) {
   item.fetch = {"z"};
   item.feed.emplace_back("x", pi);
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -418,7 +418,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithInputForwarding) {
   item.feed.emplace_back("x4", test::AsScalar<float>(-1.0f));
   item.feed.emplace_back("x3", test::AsScalar<int>(1234));
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
   test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
@@ -549,7 +549,7 @@ TEST_F(FunctionOptimizerTest, InlineFunction_FunctionWithNestedFunctionCall) {
   item.feed.emplace_back("a", test::AsScalar<float>(2.0f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -699,6 +699,331 @@ TEST_F(FunctionOptimizerTest, InlineSymbolicGradient_NoInlineFunc) {
   CompareGraphs(item.graph, output);
 }
 
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionSimpleFunction) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func} /* Function library */);
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function must be inlined and all nodes placed on a valid device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  Tensor pi = test::AsScalar<float>(3.14f);
+  item.feed.emplace_back("a", pi);
+  item.feed.emplace_back("b", pi);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors_expected = EvaluateFetchNodes(item);
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors_expected.size(), 1);
+  ASSERT_EQ(tensors.size(), tensors_expected.size());
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithControlDependencies) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // Compute `x*y` and add `1.0` to the variable.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T", "v: resource"}, {"z:T"}, {"T: {float, double}"},
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"add"},
+        "AssignAddVariableOp",
+        {"v", "one:output:0"},
+        {{"dtype", DT_FLOAT}}},
+       {{"mul"}, "Mul", {"x", "y", "^add"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   v = VarHandleOp(init = a)
+  //   f1 = MyMul(a, b, v)
+  //   f2 = MyMul(f1, f1, v)
+  //   return [f2, v]
+  GrapplerItem item;
+  item.fetch = {"out_1", "out_2"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b", "v", "^init_v"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "v", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT, DT_RESOURCE}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of multiplication and a current value of the variable.
+       NDef("out_1", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1", "^f2"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Initialize variable with one of the placeholders.
+       NDef("v", "VarHandleOp", {}, {{"dtype", DT_FLOAT}, {"shape", scalar}}),
+       NDef("init_v", "AssignVariableOp", {"v", "a"}, {{"dtype", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0", "^init_v"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/v", "Identity", {"v:0", "^init_v"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f1/one", "Const", {"^f1/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f1/add", "AssignAddVariableOp", {"f1/v", "f1/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call.
+       NDef("f2/x", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0", "^f1/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+       NDef("f2/v", "Identity", {"v:0", "^f1/add"}, {{"T", DT_RESOURCE}},
+            kDevice),
+       NDef("f2/one", "Const", {"^f2/x"},
+            {{"dtype", DT_FLOAT}, {"value", kOne}}, kDevice),
+       NDef("f2/add", "AssignAddVariableOp", {"f2/v", "f2/one"},
+            {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y", "^f2/add"}, {{"T", DT_FLOAT}},
+            kDevice),
+
+       // Return values read directly from inlined nodes.
+       NDef("out_1", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("out_2", "ReadVariableOp", {"v", "^f1/add", "^f2/add"},
+            {{"dtype", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 2);
+  EXPECT_EQ(tensors_expected[0].flat<float>()(0), 4.0);  // mul
+  EXPECT_EQ(tensors_expected[1].flat<float>()(0), 3.0);  // read variable
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  ASSERT_EQ(tensors.size(), 2);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+  test::ExpectTensorEqual<float>(tensors_expected[1], tensors[1]);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithDevicePlacement) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+  // Add device placement spec to the function body node.
+  (*mul_func.mutable_node_def())[0].set_device("/device:CPU:1");
+
+  // We need fully defined device names to run the placer for inlined function.
+  const string cpu0 = "/job:work/replica:1/task:1/device:CPU:0";
+  const string cpu1 = "/job:work/replica:1/task:1/device:CPU:1";
+
+  // Build a graph to compute c = MyMul(a, b)
+  GrapplerItem item;
+  item.fetch = {"d"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+       NDef("c", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            cpu0),
+       NDef("d", "Identity", {"c"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+  ASSERT_TRUE(item.InferDevicesFromGraph().ok());
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu0),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, cpu1),
+
+       // Function must be inlined and `mul` node placed on a requested device.
+       NDef("c/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, cpu1),
+       NDef("c/mul", "Mul", {"c/x", "c/y"}, {{"T", DT_FLOAT}}, cpu1),
+
+       NDef("d", "Identity", {"c/mul:0"}, {{"T", DT_FLOAT}}, cpu0)},
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+}
+
+TEST_F(FunctionOptimizerTest, InlineIndirectFunctionWithoutSideEffects) {
+  using test::function::NDef;
+  using FDH = FunctionDefHelper;
+
+  FunctionOptimizer optimizer(RewriterConfig::AGGRESSIVE);
+
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  const Tensor kTwo = test::AsScalar<float>(2.0);
+  const TensorShape scalar = TensorShape({});
+
+  // MyMul doesn't have any side-effectful nodes in the function body, but the
+  // optimized graph has a control dependency edge `f1->f2`.
+  FunctionDef mul_func = FunctionDefHelper::Create(
+      "MyMul", {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
+      {{{"mul"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
+      /* Mapping between function returns and function node outputs. */
+      {{"z", "mul:z:0"}});
+
+  // Build a graph to compute:
+  //   a = Placeholder
+  //   b = Placeholder
+  //   f1 = MyMul(a, b)
+  //   f2 = MyMul(a, b, ^f1)  <-- control dependency on inlined function!
+  //   return f2
+  GrapplerItem item;
+  item.fetch = {"out"};
+  item.graph = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Call function first time.
+       NDef("f1", "PartitionedCall", {"a", "b"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Call function second time.
+       NDef("f2", "PartitionedCall", {"f1", "f1", "^f1"},
+            {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
+             {"Tout", DataTypeSlice{DT_FLOAT}},
+             {"f", FDH::FunctionRef("MyMul", {{"T", DT_FLOAT}})}},
+            kDevice),
+
+       // Return result of f2.
+       NDef("out", "Identity", {"f2"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  GraphDef optimized_graph;
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &optimized_graph));
+
+  GraphDef expected = test::function::GDef(
+      {NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+       NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
+
+       // Function body of a first function call inlined into the graph.
+       NDef("f1/x", "Identity", {"a:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/y", "Identity", {"b:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f1/mul", "Mul", {"f1/x", "f1/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Function body of a second function call also inlined into the graph,
+       // and input nodes read directly from the inlined nodes of the first
+       // function call, and control dependency edge removed.
+       NDef("f2/x", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/y", "Identity", {"f1/mul:0"}, {{"T", DT_FLOAT}}, kDevice),
+       NDef("f2/mul", "Mul", {"f2/x", "f2/y"}, {{"T", DT_FLOAT}}, kDevice),
+
+       // Return directly from inlined node of f2.
+       NDef("out", "Identity", {"f2/mul:0"}, {{"T", DT_FLOAT}}, kDevice)},
+
+      // Function library.
+      {mul_func});
+
+  CompareGraphs(expected, optimized_graph);
+
+  item.feed.emplace_back("a", kOne);
+  item.feed.emplace_back("b", kTwo);
+
+  auto tensors_expected = EvaluateFetchNodes(item);
+  ASSERT_EQ(tensors_expected.size(), 1);
+
+  GrapplerItem optimized = item.WithGraph(std::move(optimized_graph));
+  auto tensors = EvaluateFetchNodes(optimized);
+  test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
+}
+
 TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   using test::function::NDef;
 
@@ -742,7 +1067,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionXTimesTwo) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -806,7 +1131,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionXTimesTwo) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -869,7 +1194,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionPushDownConstInput) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -949,7 +1274,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionPushDownConstInput) {
   item.feed.emplace_back("x", pi);
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -1065,7 +1390,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunction_OncePerUniqueContext) {
   item.feed = {{"xf", pi}, {"yf", pi}, {"xi", four}, {"yi", four}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -1174,7 +1499,7 @@ TEST_F(FunctionOptimizerTest, SpecializeFunctionForUsedOutputTensors) {
   item.feed = {{"xf", pi}, {"yf", pi}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   ASSERT_EQ(tensors_expected.size(), tensors.size());
@@ -1335,7 +1660,7 @@ TEST_F(FunctionOptimizerTest, SpecializeIndirectFunctionForUsedOutputTensors) {
   item.feed = {{"xf", pi}, {"yf", pi}};
 
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   ASSERT_EQ(tensors_expected.size(), tensors.size());
diff --git a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
index f31a30ec0edf5022004e9489994dc6875f60bfd0..99fcb31523800c76b8c413da92576fc16092f588 100644
--- a/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
+++ b/tensorflow/core/grappler/optimizers/graph_optimizer_stage.h
@@ -239,7 +239,8 @@ class GraphOptimizerStagePipeline {
         // case of any error it must leave optimized graph unmodified.
         if (!stage_status.ok()) {
           LOG(WARNING) << "Failed to run optimizer " << stage->optimizer_name()
-                       << ", stage " << stage->stage_name()
+                       << ", stage " << stage->stage_name() << " node "
+                       << node->name()
                        << ". Error: " << stage_status.error_message();
         }
         if (break_predicate_(*result)) return true;
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.cc b/tensorflow/core/grappler/optimizers/graph_rewriter.cc
deleted file mode 100644
index b45ceb12a7972d8e0fb15c0562d0e4ceeeeeef1c..0000000000000000000000000000000000000000
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.cc
+++ /dev/null
@@ -1,214 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
-#include <unordered_map>
-#include <unordered_set>
-#include "tensorflow/core/framework/function.pb.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_util.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_def.pb.h"
-#include "tensorflow/core/grappler/grappler_item.h"
-#include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/utils.h"
-
-namespace tensorflow {
-namespace grappler {
-
-GraphRewriter::GraphRewriter(const GrapplerItem& item) {
-  OpRegistryInterface* op_registry = OpRegistry::Global();
-  for (auto& node : item.graph.node()) {
-    NodeInfo* info = new NodeInfo();
-    info->def = &node;
-
-    const OpRegistrationData* op_reg_data = nullptr;
-    Status s = op_registry->LookUp(node.op(), &op_reg_data);
-    // TODO(bsteiner): make this not a best-effort lookup and evaluation?
-    if (s.ok()) {
-      DataTypeVector inputs;
-      s = InOutTypesForNode(node, op_reg_data->op_def, &inputs, &info->outputs);
-      if (!s.ok()) {
-        info->outputs.clear();
-      }
-    }
-
-    nodes_[node.name()].reset(info);
-  }
-
-  std::unordered_set<string> function_names;
-  for (const auto& function : item.graph.library().function()) {
-    function_names.insert(function.signature().name());
-  }
-
-  for (auto& node : item.graph.node()) {
-    RecordConnectivity(node, function_names);
-  }
-}
-
-void GraphRewriter::ForwardInputs(
-    const NodeDef& original_node,
-    const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    NodeDef* new_node) {
-  ForwardInputsInternal(original_node, nodes_to_delete, false, new_node);
-  if (!new_node->name().empty()) {
-    optimized_nodes_[new_node->name()] = new_node;
-  }
-  // Reorder inputs such that control inputs come after regular inputs.
-  int pos = 0;
-  for (int i = 0; i < new_node->input_size(); ++i) {
-    if (!IsControlInput(new_node->input(i))) {
-      new_node->mutable_input()->SwapElements(pos, i);
-      ++pos;
-    }
-  }
-  DedupControlInputs(new_node);
-}
-
-bool GraphRewriter::DrivesControlDependency(const NodeDef& node) const {
-  return control_dependency_drivers_.find(&node) !=
-         control_dependency_drivers_.end();
-}
-
-bool GraphRewriter::FeedsMerge(const NodeDef& node) const {
-  return merge_feeders_.find(&node) != merge_feeders_.end();
-}
-
-bool GraphRewriter::IsDrivenByControlDependency(const NodeDef& node) const {
-  for (const auto& input : node.input()) {
-    CHECK(!input.empty());
-    if (input[0] == '^') {
-      return true;
-    }
-  }
-  return false;
-}
-
-bool GraphRewriter::IsConnectedToFunction(const NodeDef& node) const {
-  return function_neighbors_.find(&node) != function_neighbors_.end();
-}
-
-bool GraphRewriter::IsDrivenByAnotherDevice(const NodeDef& node) const {
-  return cross_device_receivers_.find(&node) != cross_device_receivers_.end();
-}
-
-bool GraphRewriter::ReceivesRefValue(const NodeDef& node) const {
-  return ref_receivers_.find(&node) != ref_receivers_.end();
-}
-
-bool GraphRewriter::IsDrivenBySwitch(const NodeDef& node) const {
-  return switch_receivers_.find(&node) != switch_receivers_.end();
-}
-
-bool GraphRewriter::RemovalIncreasesEdgeCount(const NodeDef& node) const {
-  const int in_degree = node.input_size();
-  auto itr = nodes_.find(node.name());
-  if (itr == nodes_.end()) {
-    return true;
-  }
-  const int out_degree = itr->second->out_degree;
-  return in_degree * out_degree > in_degree + out_degree;
-}
-
-void GraphRewriter::RecordConnectivity(
-    const NodeDef& node, const std::unordered_set<string>& function_names) {
-  const bool is_function =
-      function_names.find(node.op()) != function_names.end();
-
-  bool ref_receiver = false;
-  bool switch_receiver = false;
-  for (const auto& input : node.input()) {
-    int position = 0;
-    string input_node_name = ParseNodeName(input, &position);
-    auto itr = nodes_.find(input_node_name);
-    if (itr == nodes_.end()) {
-      continue;
-    }
-
-    NodeInfo* fanin_info = itr->second.get();
-    const NodeDef* fanin = fanin_info->def;
-    if (IsMerge(node)) {
-      merge_feeders_.insert(fanin);
-    }
-    // Update out_degree of fanin.
-    ++fanin_info->out_degree;
-    if (position < 0) {
-      // This is a control edge
-      control_dependency_drivers_.insert(fanin);
-    } else {
-      // This is a regular edge
-      if (function_names.find(fanin->op()) != function_names.end()) {
-        function_neighbors_.insert(&node);
-      }
-      if (is_function) {
-        function_neighbors_.insert(fanin);
-      }
-      if (IsSwitch(*fanin)) {
-        switch_receiver = true;
-      }
-      if (position < fanin_info->outputs.size() &&
-          IsRefType(fanin_info->outputs[position])) {
-        ref_receiver = true;
-      }
-    }
-    if (fanin->device() != node.device()) {
-      cross_device_receivers_.insert(&node);
-    }
-  }
-
-  if (ref_receiver) {
-    ref_receivers_.insert(&node);
-  }
-  if (switch_receiver) {
-    switch_receivers_.insert(&node);
-  }
-}
-
-void GraphRewriter::ForwardInputsInternal(
-    const NodeDef& node,
-    const std::unordered_set<const NodeDef*>& nodes_to_delete,
-    bool add_as_control, NodeDef* new_node) {
-  // To speed things up, use the optimized version of the node if
-  // available.
-  auto itr = optimized_nodes_.find(node.name());
-  if (itr != optimized_nodes_.end()) {
-    for (const string& input : itr->second->input()) {
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-    }
-    return;
-  }
-  for (const auto& input : node.input()) {
-    const string input_node_name = NodeName(input);
-    auto itr = nodes_.find(input_node_name);
-    if (itr == nodes_.end()) {
-      // Invalid input, preserve it as is.
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-      continue;
-    }
-    const NodeDef* input_node = itr->second->def;
-    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
-      ForwardInputsInternal(*input_node, nodes_to_delete,
-                            add_as_control || IsControlInput(input), new_node);
-    } else {
-      *new_node->add_input() =
-          add_as_control ? AsControlDependency(NodeName(input)) : input;
-    }
-  }
-}
-
-}  // end namespace grappler
-}  // end namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/graph_rewriter.h b/tensorflow/core/grappler/optimizers/graph_rewriter.h
deleted file mode 100644
index 4a5a150dc9234ffdeed9b991c828e8ec30befde8..0000000000000000000000000000000000000000
--- a/tensorflow/core/grappler/optimizers/graph_rewriter.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
-#define TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
-
-#include <unordered_map>
-#include <unordered_set>
-#include "tensorflow/core/grappler/grappler_item.h"
-
-namespace tensorflow {
-namespace grappler {
-
-// Tools and utilities to simplify common graph rewrites.
-class GraphRewriter {
- public:
-  GraphRewriter(const GrapplerItem& item);
-
-  // Forward the inputs of original_node as needed to skip over the nodes that
-  // are to be deleted. In other words, if I is an input of 'original_node', and
-  // I doesn't belong to one of the nodes in 'nodes_to_delete', I will be an
-  // input to 'new_node'. On the other hand, if I belong to a node that will be
-  // deleted, I will be replaced with the inputs J of the deleted node (unless J
-  // belong to nodes that will be deleted, in which case we'll look for
-  // preserved inputs further down the graph).
-  void ForwardInputs(const NodeDef& original_node,
-                     const std::unordered_set<const NodeDef*>& nodes_to_delete,
-                     NodeDef* new_node);
-
-  // Returns true if at least one of the edges in the direct fanout of 'node' is
-  // a control dependency edge.
-  bool DrivesControlDependency(const NodeDef& node) const;
-
-  // Returns true if at least one of the incident edges is a control dependency
-  // edge.
-  bool IsDrivenByControlDependency(const NodeDef& node) const;
-
-  // Returns true if at least one of the nodes in the direct fanin or the direct
-  // fanout (excluding control dependencies) of 'node' is a function.
-  bool IsConnectedToFunction(const NodeDef& node) const;
-
-  // Returns true if the node is driven by at least one node placed on another
-  // device.
-  bool IsDrivenByAnotherDevice(const NodeDef& node) const;
-
-  // Returns true if the node has input from a stateful op.
-  bool ReceivesRefValue(const NodeDef& node) const;
-
-  // Returns true if the node is driven by a Switch node.
-  bool IsDrivenBySwitch(const NodeDef& node) const;
-
-  // Returns true if the node feeds a Merge node.
-  bool FeedsMerge(const NodeDef& node) const;
-
-  // Returns true if removal of this degree would increase edge count, i.e. if
-  // in-degree * out-degree > in-degree + out-degree or if the condition could
-  // not be verified.
-  bool RemovalIncreasesEdgeCount(const NodeDef& node) const;
-
- private:
-  void RecordConnectivity(const NodeDef& node,
-                          const std::unordered_set<string>& function_names);
-  void ForwardInputsInternal(
-      const NodeDef& original_node,
-      const std::unordered_set<const NodeDef*>& nodes_to_delete,
-      bool add_as_control, NodeDef* new_node);
-
-  struct NodeInfo {
-    int out_degree = 0;
-    const NodeDef* def;
-
-    // These are filled in when the NodeInfo is built, but not that they
-    // may be empty - if the op could not be loaded from the registry.
-    DataTypeVector outputs;
-  };
-
-  std::unordered_map<string, std::unique_ptr<NodeInfo>> nodes_;
-  std::unordered_map<string, const NodeDef*> optimized_nodes_;
-  std::unordered_set<const NodeDef*> control_dependency_drivers_;
-  std::unordered_set<const NodeDef*> function_neighbors_;
-  std::unordered_set<const NodeDef*> cross_device_receivers_;
-  std::unordered_set<const NodeDef*> ref_receivers_;
-  std::unordered_set<const NodeDef*> switch_receivers_;
-  std::unordered_set<const NodeDef*> merge_feeders_;
-};
-
-}  // end namespace grappler
-}  // end namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_GRAPPLER_OPTIMIZERS_GRAPH_REWRITER_H_
diff --git a/tensorflow/core/grappler/optimizers/layout_optimizer.cc b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
index 7dc62e24df52c042dad40572d480fa17863666a7..8f25a1c8c1c48281fb44c01a142348863836d5aa 100644
--- a/tensorflow/core/grappler/optimizers/layout_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/layout_optimizer.cc
@@ -29,7 +29,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/layout_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -119,6 +118,8 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "Exit",
                                           "Exp",
                                           "Expm1",
+                                          "FakeQuantWithMinMaxVars",
+                                          "FakeQuantWithMinMaxArgs",
                                           "Fill",
                                           "Floor",
                                           "FloorDiv",
@@ -161,6 +162,8 @@ std::set<string> GetOpsFormatAgnostic() {
                                           "PreventGradient",
                                           "Prod",
                                           "Polygamma",
+                                          "QuantizeAndDequantizeV2",
+                                          "QuantizeAndDequantizeV3",
                                           "Pow",
                                           "Real",
                                           "RealDiv",
@@ -1965,9 +1968,9 @@ class DataLayoutOptimizer : GraphProcessor {
   // Expand all nodes which is in NHWC, but supports NCHW or is layout agnostic.
   Status Expand() {
     int node_size_original = graph_->node_size();
-    std::unordered_map<const NodeDef*, std::vector<int>> frames;
-    int num_frames;
-    TF_RETURN_IF_ERROR(IdentifyFrames(*graph_, &frames, &num_frames));
+
+    FrameView frame_view;
+    TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph_));
 
     // This is the first pass where we expand the nodes which support NCHW.
     std::set<string> ops_format_supported = GetOpsFormatSupported();
@@ -1979,7 +1982,7 @@ class DataLayoutOptimizer : GraphProcessor {
       if (ops_format_supported.find(graph_->node(i).op()) !=
           ops_format_supported.end()) {
         auto node = graph_->mutable_node(i);
-        bool is_in_frame = !frames[node].empty();
+        bool is_in_frame = frame_view.IsInFrame(*node);
         OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                 virtual_placer_, nodes_to_preserve_,
                                 is_in_frame);
@@ -2029,7 +2032,7 @@ class DataLayoutOptimizer : GraphProcessor {
         if (ops_format_agnostic.find(graph_->node(i).op()) !=
             ops_format_agnostic.end()) {
           auto node = graph_->mutable_node(i);
-          bool is_in_frame = !frames[node].empty();
+          bool is_in_frame = frame_view.IsInFrame(*node);
           OptimizeContext opt_cxt(graph_, node, node_map_, graph_properties_,
                                   virtual_placer_, nodes_to_preserve_,
                                   is_in_frame);
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.cc b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
index 775fb9a95f2a7107d013bfafa3779ef465138b20..36064738408c744db53cb9e95645d6a2968b1746 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.cc
@@ -35,7 +35,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/optimizers/evaluation_utils.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/stringpiece.h"
@@ -380,14 +379,14 @@ Status LoopInvariantNodeMotionOptimizer::FindInvariantNodes(
 
 Status LoopInvariantNodeMotionOptimizer::Optimize() {
   node_map_.reset(new NodeMap(optimized_graph_));
-  FrameMap frame_map;
-  int num_frames;
-  TF_RETURN_IF_ERROR(IdentifyFramesWithNodeMap(*optimized_graph_, *node_map_,
-                                               &frame_map, &num_frames));
+  FrameView frame_view;
+  // TODO(ezhulenev): Use GraphView when migrated from NodeMap.
+  TF_RETURN_IF_ERROR(frame_view.InferFromGraph(*optimized_graph_));
+
   std::deque<int> worklist;
-  for (auto iter = frame_map.begin(); iter != frame_map.end(); ++iter) {
-    auto* node = iter->first;
-    auto& frame_ids = iter->second;
+  for (const NodeDef& node : optimized_graph_->node()) {
+    const std::vector<int>& frame_ids = frame_view.Frames(node);
+
     if (frame_ids.size() >= 3) {
       for (unsigned int i = 1; i < frame_ids.size() - 1; ++i) {
         frame_parent_[frame_ids[i]] = frame_ids[i - 1];
@@ -400,18 +399,18 @@ Status LoopInvariantNodeMotionOptimizer::Optimize() {
     }
     if (!frame_ids.empty()) {
       frame_children_.insert(std::make_pair(frame_ids.back(), empty_set_));
-      if (node->op() == "LoopCond") {
+      if (node.op() == "LoopCond") {
         if (loop_cond_.count(frame_ids.back())) {
           return errors::InvalidArgument(
               "Loop ", frame_ids.back(),
-              " has more than one LoopCond node: ", node->name(), " and ",
+              " has more than one LoopCond node: ", node.name(), " and ",
               loop_cond_[frame_ids.back()]->name());
         }
-        loop_cond_[frame_ids.back()] = node;
+        loop_cond_[frame_ids.back()] = &node;
       }
-      if (IsEnter(*node) && node->attr().at("is_constant").b()) {
+      if (IsEnter(node) && node.attr().at("is_constant").b()) {
         invariant_enters_[frame_ids.back()].push_back(
-            const_cast<NodeDef*>(node));
+            const_cast<NodeDef*>(&node));
       }
     }
   }
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer.h b/tensorflow/core/grappler/optimizers/loop_optimizer.h
index 7c04f55381edca8f6a6679edb73479414f4c6f0b..d467237a9a704a81a0ecc1da71531868c7f3a49b 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <unordered_set>
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/frame.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
diff --git a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
index 7332839128eef23bdf77bcdfc6b22a19413c3dfa..587767c23c370ca1f747fc5b4e2bfa4cba3ae10d 100644
--- a/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/loop_optimizer_test.cc
@@ -101,27 +101,30 @@ TEST_F(LoopOptimizerTest, Basic) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd")).back(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, Const) {
@@ -149,26 +152,29 @@ TEST_F(LoopOptimizerTest, Const) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const")).back(), 0);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const")).size(), 0);
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, ControlOutput) {
@@ -197,24 +203,27 @@ TEST_F(LoopOptimizerTest, ControlOutput) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop1) {
@@ -258,31 +267,34 @@ TEST_F(LoopOptimizerTest, NestedLoop1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).back(), 0);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).back(), 0);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd")).size(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd")).size(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoop2) {
@@ -326,27 +338,30 @@ TEST_F(LoopOptimizerTest, NestedLoop2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("VariantAdd2")).back(), 1);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("VariantAdd2")).back(), 1);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst1) {
@@ -390,28 +405,31 @@ TEST_F(LoopOptimizerTest, NestedLoopConst1) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
+
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 0);
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 1);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 0);
+  }
 }
 
 TEST_F(LoopOptimizerTest, NestedLoopConst2) {
@@ -455,26 +473,29 @@ TEST_F(LoopOptimizerTest, NestedLoopConst2) {
   LoopOptimizer optimizer;
   EnableOnlyLoopInvariantNodeMotion(&optimizer);
   GraphDef output;
-  Status status = optimizer.Optimize(nullptr, item, &output);
-  TF_EXPECT_OK(status);
+  TF_EXPECT_OK(optimizer.Optimize(nullptr, item, &output));
+
+  {  // Original graph.
+    GraphView view(&graph);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).back(), 1);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 2);
+    EXPECT_EQ(frames.Frames(*view.GetNode("Const2")).back(), 1);
+  }
 
-  std::unique_ptr<NodeMap> node_map;
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-
-  node_map.reset(new NodeMap(&graph));
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).back(), 1);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).back(), 1);
-
-  node_map.reset(new NodeMap(&output));
-  EXPECT_TRUE(IdentifyFrames(output, &frames, &num_frames).ok());
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.at(node_map->GetNode("InvariantAdd2")).size(), 0);
-  EXPECT_EQ(frames.at(node_map->GetNode("Const2")).size(), 0);
+  {  // Optimized graph.
+    GraphView view(&output);
+    FrameView frames;
+    TF_EXPECT_OK(frames.InferFromGraphView(view));
+
+    EXPECT_EQ(frames.num_frames(), 2);
+    ASSERT_EQ(frames.Frames(*view.GetNode("InvariantAdd2")).size(), 0);
+    ASSERT_EQ(frames.Frames(*view.GetNode("Const2")).size(), 0);
+  }
 }
 
 void VerifyGraphsEqual(const GraphDef& original_graph,
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer.cc b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
index e0a913565fc4b98a055dcea7efe2bba09dc0171d..227c2bb8b0f3d3e6809f65f3b3716270b0c2c6e5 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/optimizers/static_schedule.h"
 #include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/grappler/utils/topological_sort.h"
@@ -1307,13 +1306,12 @@ Status RelaxAllocatorConstraints(GraphDef* optimized_graph) {
 
 Status MemoryOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
                                  GraphDef* optimized_graph) {
-  *optimized_graph = item.graph;
+  GrapplerItem optimized_item(item);
 
   RecomputationRewritingPass(optimization_level_,
-                             recomputation_targets_name_scope_, optimized_graph,
-                             item);
+                             recomputation_targets_name_scope_,
+                             &optimized_item.graph, item);
 
-  GrapplerItem optimized_item(item, optimized_graph);
   std::unordered_set<string> skip_list;
   // Bound the number of rewrite passes to avoid long processing times on graphs
   // that simply won't fit in memory.
diff --git a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
index 75285b07bb9305fb78e37ef4918b3daf997015f6..356b23dec0de7d8648fd92b977413720654f2451 100644
--- a/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/memory_optimizer_test.cc
@@ -279,7 +279,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   EXPECT_EQ("^swap_out_e_0", new_c.input(1));
 
   // Run the optimizer a second time to ensure it's idempotent.
-  GrapplerItem item_copy(item, std::move(output));
+  GrapplerItem item_copy = item.WithGraph(std::move(output));
   status = optimizer.Optimize(cluster.get(), item_copy, &output);
   TF_EXPECT_OK(status);
 
@@ -287,7 +287,7 @@ TEST_F(MemoryOptimizerTest, SimpleSwapping) {
   item.fetch = {"e"};
   item.init_ops = {init.name()};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -337,7 +337,7 @@ TEST_F(MemoryOptimizerTest, SwappingHeuristics) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < item.fetch.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
@@ -386,7 +386,7 @@ TEST_F(MemoryOptimizerTest, UnswappableInputs) {
 
 #if GOOGLE_CUDA
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -474,7 +474,7 @@ TEST_F(RelaxAllocatorConstraintsTest, SameDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 }
@@ -505,7 +505,7 @@ TEST_F(RelaxAllocatorConstraintsTest, DifferentDevice) {
   item.fetch = {"exp"};
   item.init_ops = {"variable"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
 #endif
@@ -598,7 +598,7 @@ TEST_F(RelaxAllocatorConstraintsTest, AssignNodeInFanout) {
   item.fetch = {"assign0", "assign1"};
   item.init_ops = {"exp1", "variable1"};
   auto tensors_expected = EvaluateFetchNodes(item);
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
   for (int i = 0; i < tensors_expected.size(); ++i) {
     test::ExpectTensorEqual<float>(tensors_expected[i], tensors[i]);
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.cc b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
index 82c88bb06aeca73cd7f6523af4cdf3d0c6142e86..572cc41d765f5b0e285bbff3ff600c15fbed1431 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.cc
@@ -39,6 +39,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/gtl/map_util.h"
+#include "tensorflow/core/util/dump_graph.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
@@ -127,8 +128,10 @@ std::unique_ptr<GraphOptimizer> MetaOptimizer::MakeNewOptimizer(
 
 #undef MK_OPT
 
-MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg)
-    : cpu_device_(cpu_device), cfg_(cfg) {
+MetaOptimizer::MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg)
+    : cpu_device_(cpu_device),
+      config_proto_(cfg),
+      cfg_(*config_proto_.mutable_graph_options()->mutable_rewrite_options()) {
   DCHECK(cpu_device_ == nullptr ||
          cpu_device_->attributes().device_type() == "CPU");
 }
@@ -437,7 +440,7 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
           item.graph)
           .ToProto();
 
-  GrapplerItem trimmed_item(item, std::move(trimmed_graph));
+  GrapplerItem trimmed_item = item.WithGraph(std::move(trimmed_graph));
 
   VLOG(1) << absl::Substitute(
       "Deleted $0 unreachable functions from the graph (library size = $1)",
@@ -460,6 +463,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   // optimize TPU functions with Grappler, this check preserves that.
   if (IsTPUGraphDef(*optimized_graph)) {
     VLOG(2) << "Skipping optimizing funcs for TPU graphs";
+    if (VLOG_IS_ON(1)) {
+      DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+    }
     return Status::OK();
   }
 
@@ -518,9 +524,19 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
       // can't perform non-differentiable rewrites.
       if (differentiable_functions.find(func_name) !=
           differentiable_functions.end()) {
-        func_item.allowed_optimizations.non_differentiable_rewrites = false;
+        func_item.allowed_optimizations().non_differentiable_rewrites = false;
+      }
+
+      // Function item is allowed to use all devices from the main graph.
+      Status added_devices = func_item.AddDevices(item);
+      if (!added_devices.ok()) {
+        VLOG(3) << added_devices.error_message();
       }
 
+      // We can safely inline nested function calls with side-effectful ops into
+      // the function body (see function_optimizer.cc for details).
+      func_item.allowed_optimizations().inline_ops_with_side_effects = true;
+
       // Optimize function body graph.
       GraphDef optimized_func_graph;
       TF_RETURN_IF_ERROR(
@@ -553,6 +569,9 @@ Status MetaOptimizer::Optimize(Cluster* cluster, const GrapplerItem& item,
   VLOG(1) << "Optimized " << optimized_funcs.size()
           << " functions: " << str_util::Join(optimized_funcs, ", ");
 
+  if (VLOG_IS_ON(1)) {
+    DumpGraphDefToFile("after_MetaOptimizer", *optimized_graph);
+  }
   return Status::OK();
 }
 
@@ -570,32 +589,35 @@ void MetaOptimizer::Feedback(Cluster* cluster, const GrapplerItem& item,
   // Nothing to do for MetaOptimizer.
 }
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg) {
-  if (cfg.disable_meta_optimizer()) {
+bool MetaOptimizerEnabled(const ConfigProto& cfg) {
+  const auto& rewrite_cfg = cfg.graph_options().rewrite_options();
+  if (rewrite_cfg.disable_meta_optimizer()) {
     return false;
   }
-  return !cfg.disable_model_pruning() ||
-         cfg.layout_optimizer() != RewriterConfig::OFF ||
-         cfg.function_optimization() != RewriterConfig::OFF ||
-         cfg.constant_folding() != RewriterConfig::OFF ||
-         cfg.shape_optimization() != RewriterConfig::OFF ||
-         cfg.remapping() != RewriterConfig::OFF ||
-         cfg.arithmetic_optimization() != RewriterConfig::OFF ||
-         cfg.loop_optimization() != RewriterConfig::OFF ||
-         cfg.dependency_optimization() != RewriterConfig::OFF ||
-         cfg.auto_parallel().enable() ||
-         cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
-         cfg.debug_stripper() == RewriterConfig::ON ||
-         cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
-         cfg.pin_to_host_optimization() == RewriterConfig::ON ||
-         !cfg.optimizers().empty() || !cfg.custom_optimizers().empty();
+  return !rewrite_cfg.disable_model_pruning() ||
+         rewrite_cfg.layout_optimizer() != RewriterConfig::OFF ||
+         rewrite_cfg.function_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.constant_folding() != RewriterConfig::OFF ||
+         rewrite_cfg.shape_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.remapping() != RewriterConfig::OFF ||
+         rewrite_cfg.arithmetic_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.loop_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.dependency_optimization() != RewriterConfig::OFF ||
+         rewrite_cfg.auto_parallel().enable() ||
+         rewrite_cfg.memory_optimization() != RewriterConfig::NO_MEM_OPT ||
+         rewrite_cfg.debug_stripper() == RewriterConfig::ON ||
+         rewrite_cfg.scoped_allocator_optimization() == RewriterConfig::ON ||
+         rewrite_cfg.pin_to_host_optimization() == RewriterConfig::ON ||
+         !rewrite_cfg.optimizers().empty() ||
+         !rewrite_cfg.custom_optimizers().empty();
 }
 
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph) {
   MetaOptimizer optimizer(cpu_device, cfg);
-  optimizer.set_deadline_usec(DeadlineMicroSeconds(cfg));
+  optimizer.set_deadline_usec(
+      DeadlineMicroSeconds(cfg.graph_options().rewrite_options()));
   Status status = optimizer.Optimize(cluster, item, optimized_graph);
   if (!status.ok()) {
     *optimized_graph = item.graph;
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer.h b/tensorflow/core/grappler/optimizers/meta_optimizer.h
index e599a9201bc2755d1424a7495a0b86667ed0d828..a06da4394e4b8a4d8e75855a0a432114f7d7fcb3 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer.h
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer.h
@@ -20,6 +20,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/graph_optimizer.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 
 namespace tensorflow {
@@ -28,7 +29,7 @@ namespace grappler {
 // Run the other grappler optimizers based on the specified rewriter config.
 class MetaOptimizer : public GraphOptimizer {
  public:
-  MetaOptimizer(DeviceBase* cpu_device, const RewriterConfig& cfg);
+  MetaOptimizer(DeviceBase* cpu_device, const ConfigProto& cfg);
   ~MetaOptimizer() override = default;
 
   string name() const override { return "meta_optimizer"; };
@@ -65,7 +66,8 @@ class MetaOptimizer : public GraphOptimizer {
                        GraphDef* optimized_graph);
 
   DeviceBase* const cpu_device_;  // may be NULL
-  RewriterConfig cfg_;
+  ConfigProto config_proto_;
+  RewriterConfig& cfg_;
 
   struct OptimizerResult {
     string optimizer_name;
@@ -85,7 +87,7 @@ class MetaOptimizer : public GraphOptimizer {
   std::vector<GraphOptimizationResult> optimization_results_;
 };
 
-bool MetaOptimizerEnabled(const RewriterConfig& cfg);
+bool MetaOptimizerEnabled(const ConfigProto& cfg);
 
 // Run the meta optimizer.
 //
@@ -93,7 +95,7 @@ bool MetaOptimizerEnabled(const RewriterConfig& cfg);
 // during constant folding; if NULL, a new device is created for doing constant
 // folding. For performance, it is recommended to pass in an existing cpu_device
 // when possible.
-Status RunMetaOptimizer(const GrapplerItem& item, const RewriterConfig& cfg,
+Status RunMetaOptimizer(const GrapplerItem& item, const ConfigProto& cfg,
                         DeviceBase* cpu_device, Cluster* cluster,
                         GraphDef* optimized_graph);
 
diff --git a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
index 6105bf27bab8020facf259fc426edd44a6547ff2..12db5d6ca9b001fa04e42e6d228fe6289d87726e 100644
--- a/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
+++ b/tensorflow/core/grappler/optimizers/meta_optimizer_test.cc
@@ -108,7 +108,7 @@ class GrapplerItemPropertiesAccumulator : public CustomGraphOptimizer {
                   GraphDef* optimized_graph) override {
     *optimized_graph = item.graph;
     if (allowed_optimizations_) {
-      allowed_optimizations_->insert({item.id, item.allowed_optimizations});
+      allowed_optimizations_->insert({item.id, item.allowed_optimizations()});
     }
     return Status::OK();
   }
@@ -134,11 +134,13 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizer) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -151,13 +153,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerWithParams) {
   CHECK(fake_input.NextItem(&item));
 
   TestOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizerWithParams");
   auto* custom_config = rewriter_config.add_custom_optimizers();
   custom_config->set_name("TestOptimizerWithParams");
   (*custom_config->mutable_parameter_map())["foo"] = AttrValue();
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -171,13 +175,15 @@ TEST_F(MetaOptimizerTest, RunsCustomOptimizerAndCustomGraphOptimizer) {
 
   TestOptimizer::SetOptimized(false);
   TestGraphOptimizer::SetOptimized(false);
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("TestOptimizer");
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -190,11 +196,13 @@ TEST_F(MetaOptimizerTest, RunOptimizersTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -205,13 +213,15 @@ TEST_F(MetaOptimizerTest, RunToggleOptimizersAndCustomGraphOptimizerTwice) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
   auto customGraphOptimizer = rewriter_config.add_custom_optimizers();
   customGraphOptimizer->set_name("TestGraphOptimizer");
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
   GraphDef output;
   const Status status = optimizer.Optimize(nullptr, item, &output);
   TF_EXPECT_OK(status);
@@ -222,13 +232,16 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   using test::function::NDef;
 
   // Enable ony function optimization.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_function_optimization(RewriterConfig::ON);
   rewriter_config.add_optimizers("function");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define function library:
   //
@@ -383,7 +396,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibrary) {
   item.feed.emplace_back("b", test::AsScalar<int>(4));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -394,14 +407,17 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   using test::function::NDef;
 
   // Enable function optimization and pruning.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.set_function_optimization(RewriterConfig::ON);
   rewriter_config.add_optimizers("function");
   rewriter_config.add_optimizers("pruning");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // MyFunc defines two Mul nodes inside function body and two corresponding
   // function outputs.
@@ -486,7 +502,7 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryPruneFunctionBody) {
   item.feed.emplace_back("b", test::AsScalar<float>(3.123f));
   auto tensors_expected = EvaluateFetchNodes(item);
 
-  GrapplerItem optimized(item, std::move(output));
+  GrapplerItem optimized = item.WithGraph(std::move(output));
   auto tensors = EvaluateFetchNodes(optimized);
 
   test::ExpectTensorEqual<float>(tensors_expected[0], tensors[0]);
@@ -505,12 +521,15 @@ TEST_F(MetaOptimizerTest, OptimizeFunctionLibraryWithRestrictions) {
       &allowed_optimizations);
 
   // Just record properties of optimized Grappler items.
-  RewriterConfig rewriter_config;
+  ConfigProto config_proto;
+  auto& rewriter_config =
+      *config_proto.mutable_graph_options()->mutable_rewrite_options();
+
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::TWO);
   rewriter_config.add_optimizers("GrapplerItemPropertiesAccumulator");
   rewriter_config.set_min_graph_nodes(-1);
 
-  MetaOptimizer optimizer(nullptr, rewriter_config);
+  MetaOptimizer optimizer(nullptr, config_proto);
 
   // Define simple function library with two identical mul functions.
   FunctionDef mul_func_1 = FunctionDefHelper::Create(
@@ -605,7 +624,9 @@ TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("SleepingOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
   rewriter_config.set_meta_optimizer_timeout_ms(1500);
@@ -613,7 +634,7 @@ TEST_F(MetaOptimizerTest, OptimizerTimesOut) {
 
   GraphDef output;
   const Status status =
-      RunMetaOptimizer(item, rewriter_config, nullptr, nullptr, &output);
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
   EXPECT_EQ(status.error_message(), "meta_optimizer exceeded deadline.");
   // Make sure the graph was reverted to the original regardless of when the
   // optimizer timed out.
@@ -625,14 +646,16 @@ TEST_F(MetaOptimizerTest, OptimizerDoesNotTimeOut) {
   GrapplerItem item;
   CHECK(fake_input.NextItem(&item));
 
-  RewriterConfig rewriter_config;
+  ConfigProto config;
+  RewriterConfig& rewriter_config =
+      *config.mutable_graph_options()->mutable_rewrite_options();
   rewriter_config.add_optimizers("SleepingOptimizer");
   rewriter_config.set_min_graph_nodes(-1);
   rewriter_config.set_meta_optimizer_timeout_ms(1500);
   rewriter_config.set_meta_optimizer_iterations(RewriterConfig::ONE);
   GraphDef output;
   const Status status =
-      RunMetaOptimizer(item, rewriter_config, nullptr, nullptr, &output);
+      RunMetaOptimizer(item, config, nullptr, nullptr, &output);
   TF_EXPECT_OK(status);
   EXPECT_EQ(item.graph.node_size() + 1, output.node_size());
 }
diff --git a/tensorflow/core/grappler/optimizers/model_pruner.cc b/tensorflow/core/grappler/optimizers/model_pruner.cc
index 1be87a9d0d516a49e6b50e2dada3a2cdeea71ef6..c548c570e07499ae326ca57ec83ea1b5738fdaf6 100644
--- a/tensorflow/core/grappler/optimizers/model_pruner.cc
+++ b/tensorflow/core/grappler/optimizers/model_pruner.cc
@@ -23,30 +23,164 @@ limitations under the License.
 #include "tensorflow/core/framework/function.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
 #include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/types.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/grappler/mutable_graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/optimizers/graph_rewriter.h"
 #include "tensorflow/core/grappler/utils.h"
 
 namespace tensorflow {
 namespace grappler {
 
-bool IsTrivialOp(const NodeDef& node, const GraphRewriter& rewriter) {
+bool IsTrivialIdentity(const NodeDef& node,
+                       const MutableGraphView& graph_view) {
+  for (const auto input :
+       graph_view.GetFanins(node, /*include_controlling_nodes=*/true)) {
+    if (input.port_id == Graph::kControlSlot) {
+      // Node is driven by control dependency.
+      return false;
+    } else if (IsSwitch(*input.node)) {  // Node is driven by switch.
+      return false;
+    }
+  }
+  for (const auto output :
+       graph_view.GetFanouts(node, /*include_controlled_nodes=*/true)) {
+    if (output.port_id == Graph::kControlSlot) {
+      // Node drives control dependency.
+      return false;
+    } else if (IsMerge(*output.node)) {  // Node feeds merge.
+      return false;
+    }
+  }
+  return true;
+}
+
+bool IsTrivialOp(const NodeDef& node, const MutableGraphView& graph_view) {
   // Remove the stop gradient nodes since they serve no purpose once the graph
   // is built. Also remove Identity ops.
   if (IsStopGradient(node)) {
     return true;
   }
   if (IsIdentity(node) || IsIdentityNSingleInput(node)) {
-    return !(rewriter.FeedsMerge(node) || rewriter.IsDrivenBySwitch(node) ||
-             rewriter.IsDrivenByControlDependency(node) ||
-             rewriter.DrivesControlDependency(node));
+    return IsTrivialIdentity(node, graph_view);
   }
 
   return IsAddN(node) && NumNonControlInputs(node) <= 1;
 }
 
+bool RemovalIncreasesEdgeCount(const NodeDef& node,
+                               const MutableGraphView& graph_view) {
+  int in_degree =
+      graph_view.NumFanins(node, /*include_controlling_nodes=*/true);
+  int out_degree =
+      graph_view.NumFanouts(node, /*include_controlling_nodes=*/true);
+  return in_degree * out_degree > in_degree + out_degree;
+}
+
+bool IsOutputPortRefValue(const NodeDef& node, int port_id,
+                          const OpRegistryInterface& op_registry) {
+  const OpRegistrationData* op_reg_data = nullptr;
+  Status s = op_registry.LookUp(node.op(), &op_reg_data);
+  if (s.ok()) {
+    DataType output_type;
+    s = OutputTypeForNode(node, op_reg_data->op_def, port_id, &output_type);
+    if (s.ok() && IsRefType(output_type)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool CanRemoveNode(const NodeDef& node, const MutableGraphView& graph_view,
+                   const absl::flat_hash_set<string>& function_names,
+                   const OpRegistryInterface& op_registry) {
+  if (RemovalIncreasesEdgeCount(node, graph_view)) {
+    return false;
+  }
+  for (const auto input :
+       graph_view.GetFanins(node, /*include_controlling_nodes=*/true)) {
+    if (node.device() != input.node->device()) {
+      // Node is driven by a different device.
+      return false;
+    } else if (input.port_id == Graph::kControlSlot) {
+      // Node is driven by control dependency.
+      continue;
+    } else if (function_names.find(input.node->op()) != function_names.end()) {
+      // Node input is a function call.
+      return false;
+    } else if (IsOutputPortRefValue(*input.node, input.port_id, op_registry)) {
+      return false;
+    }
+  }
+  for (const auto output :
+       graph_view.GetFanouts(node, /*include_controlled_nodes=*/false)) {
+    if (function_names.find(output.node->op()) != function_names.end()) {
+      // Node output is a function call.
+      return false;
+    }
+  }
+  return true;
+}
+
+void ForwardInputsInternal(
+    const NodeDef& node,
+    const absl::flat_hash_set<const NodeDef*>& nodes_to_delete,
+    bool add_as_control, NodeDef* new_node,
+    const absl::flat_hash_map<string, const NodeDef*>& optimized_nodes,
+    const MutableGraphView& graph_view) {
+  // To speed things up, use the optimized version of the node if
+  // available.
+  auto itr = optimized_nodes.find(node.name());
+  if (itr != optimized_nodes.end()) {
+    for (const string& input : itr->second->input()) {
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
+    }
+    return;
+  }
+  for (const auto& input : node.input()) {
+    const NodeDef* input_node = graph_view.GetNode(NodeName(input));
+    if (input_node == nullptr) {
+      // Invalid input, preserve it as is.
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
+      continue;
+    }
+    if (nodes_to_delete.find(input_node) != nodes_to_delete.end()) {
+      ForwardInputsInternal(*input_node, nodes_to_delete,
+                            add_as_control || IsControlInput(input), new_node,
+                            optimized_nodes, graph_view);
+    } else {
+      *new_node->add_input() =
+          add_as_control ? AsControlDependency(NodeName(input)) : input;
+    }
+  }
+}
+
+void ForwardInputs(const NodeDef& original_node,
+                   const absl::flat_hash_set<const NodeDef*>& nodes_to_delete,
+                   NodeDef* new_node,
+                   absl::flat_hash_map<string, const NodeDef*>* optimized_nodes,
+                   const MutableGraphView& graph_view) {
+  // Forwards inputs of nodes to be deleted to their respective outputs.
+  ForwardInputsInternal(original_node, nodes_to_delete,
+                        /*add_as_control=*/false, new_node, *optimized_nodes,
+                        graph_view);
+  if (!new_node->name().empty()) {
+    (*optimized_nodes)[new_node->name()] = new_node;
+  }
+  // Reorder inputs such that control inputs come after regular inputs.
+  int pos = 0;
+  for (int i = 0; i < new_node->input_size(); ++i) {
+    if (!IsControlInput(new_node->input(i))) {
+      new_node->mutable_input()->SwapElements(pos, i);
+      ++pos;
+    }
+  }
+  DedupControlInputs(new_node);
+}
+
 absl::flat_hash_map<string, absl::flat_hash_set<int>> IdentityNTerminalPorts(
     const NodeMap& node_map, const std::vector<string>& terminal_nodes,
     int graph_size) {
@@ -313,12 +447,17 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     runnable_item = item;
   }
 
-  GraphRewriter rewriter(runnable_item);
+  MutableGraphView graph_view(&runnable_item.graph);
+  absl::flat_hash_set<string> function_names;
+  for (const auto& function : item.graph.library().function()) {
+    function_names.insert(function.signature().name());
+  }
+  OpRegistryInterface* op_registry = OpRegistry::Global();
 
   // Check if we can further prune the graph, by removing the trivial ops.
-  std::unordered_set<const NodeDef*> nodes_to_delete;
+  absl::flat_hash_set<const NodeDef*> nodes_to_delete;
   for (auto& node : runnable_item.graph.node()) {
-    if (!IsTrivialOp(node, rewriter)) {
+    if (!IsTrivialOp(node, graph_view)) {
       continue;
     }
 
@@ -341,10 +480,7 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
     //   converting references to non-references. It is important to preserve
     //   these non-references since the partitioner will avoid sending
     //   non-references across partitions more than once.
-    if (!rewriter.RemovalIncreasesEdgeCount(node) &&
-        !rewriter.IsConnectedToFunction(node) &&
-        !rewriter.IsDrivenByAnotherDevice(node) &&
-        !rewriter.ReceivesRefValue(node)) {
+    if (CanRemoveNode(node, graph_view, function_names, *op_registry)) {
       nodes_to_delete.insert(&node);
     }
   }
@@ -360,13 +496,15 @@ Status ModelPruner::Optimize(Cluster* cluster, const GrapplerItem& item,
 
   const bool fetches_are_known = !item.fetch.empty();
   pruned_graph->mutable_node()->Reserve(runnable_item.graph.node_size());
+  absl::flat_hash_map<string, const NodeDef*> optimized_nodes;
   for (auto& node : runnable_item.graph.node()) {
     if (!fetches_are_known ||
         nodes_to_delete.find(&node) == nodes_to_delete.end()) {
       NodeDef* new_node = pruned_graph->add_node();
       *new_node = node;
       new_node->clear_input();
-      rewriter.ForwardInputs(node, nodes_to_delete, new_node);
+      ForwardInputs(node, nodes_to_delete, new_node, &optimized_nodes,
+                    graph_view);
     }
   }
   VLOG(1) << "Pruned " << nodes_to_delete.size()
diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc
index 32f603a949c9bf2092c669d3ec38a3c3dc4e32a8..3fb3f2b0ec75d1a628445a2f5e4d58e7a498c893 100644
--- a/tensorflow/core/grappler/optimizers/remapper.cc
+++ b/tensorflow/core/grappler/optimizers/remapper.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/optimizers/remapper.h"
 
+#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/versions.pb.h"
 #include "tensorflow/core/grappler/costs/graph_properties.h"
 #include "tensorflow/core/grappler/graph_view.h"
@@ -22,6 +23,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/op_types.h"
 #include "tensorflow/core/grappler/optimizers/constant_folding.h"
 #include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/utils/topological_sort.h"
 #include "tensorflow/core/platform/logging.h"
 
 namespace tensorflow {
@@ -29,22 +31,71 @@ namespace grappler {
 
 namespace {
 
+constexpr char kFusedConv2D[] = "_FusedConv2D";
+
+constexpr char kDataFormat[] = "data_format";
+constexpr char kIsTraining[] = "is_training";
+
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+  return false;
+#endif
+  return true;
+}
+
+struct RemapperContext {
+  explicit RemapperContext(const GrapplerItem& item)
+      : nodes_to_preserve(item.NodesToPreserve()),
+        graph_view(&item.graph),
+        graph_properties(item),
+        inferred_graph_properties(false) {}
+
+  std::unordered_set<string> nodes_to_preserve;
+  GraphView graph_view;
+  GraphProperties graph_properties;
+  bool inferred_graph_properties;
+};
+
 // FusedBatchNorm that can be replaced with a cheaper set of primitives.
 struct FusedBatchNorm {
-  const NodeDef* fused_batch_norm;
+  const NodeDef* fused_batch_norm = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd.
 struct Conv2DWithBiasAdd {
-  const NodeDef* conv2d;
-  const NodeDef* bias_add;
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
 };
 
 // Conv2D node followed by a BiasAdd and Relu.
 struct Conv2DWithBiasAddAndRelu {
-  const NodeDef* conv2d;
-  const NodeDef* bias_add;
-  const NodeDef* relu;
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* bias_add = nullptr;
+  const NodeDef* relu = nullptr;
+};
+
+// Conv2D node followed by a Squeeze and BiasAdd.
+struct Conv2DWithSqueezeAndBiasAdd {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* squeeze = nullptr;
+  const NodeDef* bias_add = nullptr;
+};
+
+// Conv2D node followed by a FusedBatchNorm.
+struct Conv2DWithBatchNorm {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  float epsilon = 0.0;
+};
+
+// Conv2D node followed by a FusedBatchNorm and Relu.
+struct Conv2DWithBatchNormAndRelu {
+  const NodeDef* conv2d = nullptr;
+  const NodeDef* fused_batch_norm = nullptr;
+  const NodeDef* relu = nullptr;
+  float epsilon = 0.0;
 };
 
 bool IsFloatOrDoubleDataType(const NodeDef* node,
@@ -54,7 +105,7 @@ bool IsFloatOrDoubleDataType(const NodeDef* node,
 }
 
 bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
-                      const string& type_attr) {
+                      const string& type_attr = "T") {
   DataType lhs_attr = GetDataTypeFromAttr(*lhs, type_attr);
   DataType rhs_attr = GetDataTypeFromAttr(*rhs, type_attr);
 
@@ -62,25 +113,38 @@ bool HaveSameDataType(const NodeDef* lhs, const NodeDef* rhs,
          lhs_attr == rhs_attr;
 }
 
-bool FindConv2DWithBias(const GraphView& graph_view, const NodeDef* node,
+bool HasDataType(const NodeDef* node, const DataType& expected,
+                 const string& type_attr = "T") {
+  DataType dtype = GetDataTypeFromAttr(*node, type_attr);
+  return dtype == expected;
+}
+
+bool IsInPreserveSet(const RemapperContext& ctx, const NodeDef* node) {
+  return ctx.nodes_to_preserve.count(node->name()) > 0;
+}
+
+bool FindConv2DWithBias(const RemapperContext& ctx, const NodeDef* node,
                         Conv2DWithBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
   // Root of the pattern must be a BiasAdd.
   if (!node) return false;
   if (!IsBiasAdd(*node)) return false;
   if (!NodeIsOnCpu(node)) return false;
   if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(graph_view, node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
 
   // Input to the BiasAdd must be a Conv2D in NHWC format.
   const auto input_port = GraphView::InputPort(node, 0);
-  const auto conv2d = graph_view.GetRegularFanin(input_port);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
   if (!conv2d.node) return false;
   if (!IsConv2D(*conv2d.node)) return false;
-  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
   if (!NodeIsOnCpu(conv2d.node)) return false;
-  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
-  if (!NoControlFaninOrFanout(graph_view, conv2d.node)) return false;
-  if (!HasSingleFanoutNode(graph_view, conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
 
   // We successfully found a Conv2D+BiasAdd pattern.
   matched->conv2d = conv2d.node;
@@ -89,23 +153,26 @@ bool FindConv2DWithBias(const GraphView& graph_view, const NodeDef* node,
   return true;
 }
 
-bool FindConv2DWithBiasAndRelu(const GraphView& graph_view, const NodeDef* node,
+bool FindConv2DWithBiasAndRelu(const RemapperContext& ctx, const NodeDef* node,
                                Conv2DWithBiasAddAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
   // Root of the pattern must be a Relu.
   if (!node) return false;
   if (!IsRelu(*node)) return false;
   if (!NodeIsOnCpu(node)) return false;
   if (!IsFloatOrDoubleDataType(node)) return false;
-  if (!NoControlFaninOrFanout(graph_view, node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
 
   // And input to Relu must match Conv2DWithBiasAdd pattern.
   const auto input_port = GraphView::InputPort(node, 0);
-  const auto bias_add = graph_view.GetRegularFanin(input_port);
+  const auto bias_add = ctx.graph_view.GetRegularFanin(input_port);
 
   Conv2DWithBiasAdd base;
-  if (!FindConv2DWithBias(graph_view, bias_add.node, &base)) return false;
-  if (!HasSingleFanoutNode(graph_view, base.bias_add)) return false;
-  if (!HaveSameDataType(node, base.bias_add, "T")) return false;
+  if (!FindConv2DWithBias(ctx, bias_add.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.bias_add)) return false;
+  if (!HaveSameDataType(node, base.bias_add)) return false;
+  if (IsInPreserveSet(ctx, base.bias_add)) return false;
 
   // We successfully found a Conv2D+BiasAdd+Relu pattern.
   matched->conv2d = base.conv2d;
@@ -115,27 +182,151 @@ bool FindConv2DWithBiasAndRelu(const GraphView& graph_view, const NodeDef* node,
   return true;
 }
 
+bool FindConv2DWithSqueezeAndBias(const RemapperContext& ctx,
+                                  const NodeDef* node,
+                                  Conv2DWithSqueezeAndBiasAdd* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a BiasAdd.
+  if (node == nullptr) return false;
+  if (node->op() != "BiasAdd") return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // Input to the BiasAdd must be a Squeeze.
+  const auto bias_input_port = GraphView::InputPort(node, 0);
+  const auto squeeze = ctx.graph_view.GetRegularFanin(bias_input_port);
+  if (squeeze.node == nullptr) return false;
+  if (squeeze.node->op() != "Squeeze") return false;
+  if (!NodeIsOnCpu(squeeze.node)) return false;
+  if (!HaveSameDataType(node, squeeze.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, squeeze.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, squeeze.node)) return false;
+  if (IsInPreserveSet(ctx, squeeze.node)) return false;
+
+  // Squeeze must not squeeze output channel dimension.
+  std::vector<int32> dims;
+  if (!GetNodeAttr(*squeeze.node, "squeeze_dims", &dims).ok()) return false;
+  for (auto dim : dims) {
+    if (dim == 3) return false;
+  }
+
+  // Input to the Squeeze must be a Conv2D in NHWC format.
+  const auto squeeze_input_port = GraphView::InputPort(squeeze.node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(squeeze_input_port);
+  if (conv2d.node == nullptr) return false;
+  if (conv2d.node->op() != "Conv2D") return false;
+  if (conv2d.node->attr().at("data_format").s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node, "T")) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+Squeeze+BiasAdd pattern.
+  matched->conv2d = conv2d.node;
+  matched->squeeze = squeeze.node;
+  matched->bias_add = node;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                             Conv2DWithBatchNorm* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a FusedBatchNorm or a FusedBatchNormV2.
+  if (node == nullptr) return false;
+  if (!IsFusedBatchNorm(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!HasDataType(node, DT_FLOAT)) return false;
+
+  // V2 has a separate data type for the scale/offset/mean/variance inputs.
+  if (node->op() == "FusedBatchNormV2" && !HasDataType(node, DT_FLOAT, "U"))
+    return false;
+
+  // Check that batch normalization is in inference mode.
+  const auto& attr = node->attr();
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
+
+  // Check that only 0th output is consumed by other nodes.
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+  if (HasFanouts(ctx.graph_view, node, 1)) return false;  // batch_mean
+  if (HasFanouts(ctx.graph_view, node, 2)) return false;  // batch_variance
+  if (HasFanouts(ctx.graph_view, node, 3)) return false;  // reserve_space_1
+  if (HasFanouts(ctx.graph_view, node, 4)) return false;  // reserve_space_2
+
+  // Input to the FusedBatchNorm must be a Conv2D in NHWC format.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto conv2d = ctx.graph_view.GetRegularFanin(input_port);
+  if (conv2d.node == nullptr) return false;
+  if (!IsConv2D(*conv2d.node)) return false;
+  if (conv2d.node->attr().at(kDataFormat).s() != "NHWC") return false;
+  if (!NodeIsOnCpu(conv2d.node)) return false;
+  if (!HaveSameDataType(node, conv2d.node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, conv2d.node)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, conv2d.node)) return false;
+  if (IsInPreserveSet(ctx, conv2d.node)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm pattern.
+  matched->conv2d = conv2d.node;
+  matched->fused_batch_norm = node;
+  if (!GetNodeAttr(*node, "epsilon", &matched->epsilon).ok()) return false;
+
+  return true;
+}
+
+bool FindConv2DWithBatchNormAndRelu(const RemapperContext& ctx,
+                                    const NodeDef* node,
+                                    Conv2DWithBatchNormAndRelu* matched) {
+  if (!EigenSupportsContractionOutputKernel()) return false;
+
+  // Root of the pattern must be a Relu.
+  if (node == nullptr) return false;
+  if (!IsRelu(*node)) return false;
+  if (!NodeIsOnCpu(node)) return false;
+  if (!IsFloatOrDoubleDataType(node)) return false;
+  if (!NoControlFaninOrFanout(ctx.graph_view, node)) return false;
+
+  // And input to Relu must match Conv2DWithBatchNorm pattern.
+  const auto input_port = GraphView::InputPort(node, 0);
+  const auto batch_norm = ctx.graph_view.GetRegularFanin(input_port);
+
+  Conv2DWithBatchNorm base;
+  if (!FindConv2DWithBatchNorm(ctx, batch_norm.node, &base)) return false;
+  if (!HasSingleFanoutNode(ctx.graph_view, base.fused_batch_norm)) return false;
+  if (!HaveSameDataType(node, base.fused_batch_norm)) return false;
+  if (IsInPreserveSet(ctx, base.fused_batch_norm)) return false;
+
+  // We successfully found a Conv2D+FusedBatchNorm+Relu pattern.
+  matched->conv2d = base.conv2d;
+  matched->fused_batch_norm = base.fused_batch_norm;
+  matched->relu = node;
+  matched->epsilon = base.epsilon;
+
+  return true;
+}
+
 // Check that given node meets some basic FusedBatchNorm optimization
 // preconditions. We use this check to lazily infer graph properties which is
 // rather expensive.
-bool IsFusedBatchNormCandidate(const GraphView& graph_view,
-                               const NodeDef& node) {
+bool IsFusedBatchNormCandidate(const NodeDef& node) {
   if (!IsFusedBatchNorm(node)) return false;
   if (GetDataTypeFromAttr(node, "T") != DT_FLOAT) return false;
 
   // Check that the node is in inference mode.
   const auto& attr = node.attr();
-  if (attr.count("is_training") > 0 && attr.at("is_training").b()) return false;
+  if (attr.count(kIsTraining) > 0 && attr.at(kIsTraining).b()) return false;
 
   return true;
 }
 
-bool FindFusedBatchNorm(const GraphView& graph_view,
-                        const GraphProperties& graph_properties,
-                        const NodeDef* node, FusedBatchNorm* matched) {
-  if (!IsFusedBatchNormCandidate(graph_view, *node)) return false;
+bool FindFusedBatchNorm(const RemapperContext& ctx, const NodeDef* node,
+                        FusedBatchNorm* matched) {
+  if (!IsFusedBatchNormCandidate(*node)) return false;
 
-  const auto& props = graph_properties.GetInputProperties(node->name());
+  const auto& props = ctx.graph_properties.GetInputProperties(node->name());
 
   // a. Scaling factor can be const folded:
   //      scaling_factor = (variance + epsilon).rsqrt() * scale
@@ -155,7 +346,7 @@ bool FindFusedBatchNorm(const GraphView& graph_view,
   if (!can_remap) return false;
 
   // The optimized version only generates the first output.
-  for (GraphView::Edge edge : graph_view.GetFanoutEdges(*node, false)) {
+  for (GraphView::Edge edge : ctx.graph_view.GetFanoutEdges(*node, false)) {
     if (edge.src.port_id != 0) return false;
   }
 
@@ -164,11 +355,9 @@ bool FindFusedBatchNorm(const GraphView& graph_view,
   return true;
 }
 
-#undef REMAPPER_REQUIRES
-
 void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
                           const std::vector<string>& fused_ops = {},
-                          int num_args = 1) {
+                          int num_args = 1, float epsilon = 0.0) {
   auto* attr = fused_conv2d->mutable_attr();
   auto src_attr = conv2d->attr();
 
@@ -184,39 +373,133 @@ void CopyConv2DAttributes(const NodeDef* conv2d, NodeDef* fused_conv2d,
   }
 
   SetAttrValue(num_args, &(*attr)["num_args"]);
+  // Required only for FusedBatchNorm.
+  SetAttrValue(epsilon, &(*attr)["epsilon"]);
 }
 
-void AddFusedConv2DNode(const Conv2DWithBiasAdd& matched,
-                        GraphDef* optimized_graph) {
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
   VLOG(2) << "Fuse Conv2D with BiasAdd: bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.bias_add->name());
-  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_op(kFusedConv2D);
   fused_conv2d->set_device(matched.bias_add->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
   CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
 }
 
-void AddFusedConv2DNode(const Conv2DWithBiasAddAndRelu& matched,
-                        GraphDef* optimized_graph) {
+void AddFusedConv2DNode(
+    const Conv2DWithBiasAddAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
   VLOG(2) << "Fuse Conv2D with BiasAdd and Relu: relu=" << matched.relu->name()
           << " bias_add=" << matched.bias_add->name()
           << " conv2d=" << matched.conv2d->name();
 
   NodeDef* fused_conv2d = optimized_graph->add_node();
   fused_conv2d->set_name(matched.relu->name());
-  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_op(kFusedConv2D);
   fused_conv2d->set_device(matched.relu->device());
   fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
   fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
   fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
 
   CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd", "Relu"});
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithSqueezeAndBiasAdd& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with Squeeze and BiasAdd: "
+          << " bias_add=" << matched.bias_add->name()
+          << " squeeze=" << matched.squeeze->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  // Replace Conv2D node with a fused Conv2D. Matched pattern guarantees that it
+  // has single consumer (only the squeeze node).
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.conv2d->name());
+  fused_conv2d->set_op("_FusedConv2D");
+  fused_conv2d->set_device(matched.conv2d->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));    // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));    // 1: filter
+  fused_conv2d->add_input(matched.bias_add->input(1));  // 2: bias
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"BiasAdd"});
+
+  // Replace BiasAdd node with a Squeeze.
+  NodeDef* remapped_squeeze = optimized_graph->add_node();
+  *remapped_squeeze = *matched.squeeze;
+  remapped_squeeze->set_name(matched.bias_add->name());
+  remapped_squeeze->set_input(0, fused_conv2d->name());
+
+  invalidated_nodes->insert(matched.squeeze);
+  invalidated_nodes->insert(matched.bias_add);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNorm& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm: batch_norm="
+          << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.fused_batch_norm->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
+}
+
+void AddFusedConv2DNode(
+    const Conv2DWithBatchNormAndRelu& matched, GraphDef* optimized_graph,
+    absl::flat_hash_set<const NodeDef*>* invalidated_nodes) {
+  VLOG(2) << "Fuse Conv2D with BatchNorm and Relu: relu="
+          << matched.relu->name()
+          << " batch_norm=" << matched.fused_batch_norm->name()
+          << " conv2d=" << matched.conv2d->name();
+
+  NodeDef* fused_conv2d = optimized_graph->add_node();
+  fused_conv2d->set_name(matched.relu->name());
+  fused_conv2d->set_op(kFusedConv2D);
+  fused_conv2d->set_device(matched.fused_batch_norm->device());
+  fused_conv2d->add_input(matched.conv2d->input(0));            // 0: input
+  fused_conv2d->add_input(matched.conv2d->input(1));            // 1: filter
+  fused_conv2d->add_input(matched.fused_batch_norm->input(1));  // 2: scale
+  fused_conv2d->add_input(matched.fused_batch_norm->input(2));  // 3: offset
+  fused_conv2d->add_input(matched.fused_batch_norm->input(3));  // 4: mean
+  fused_conv2d->add_input(matched.fused_batch_norm->input(4));  // 5: variance
+
+  CopyConv2DAttributes(matched.conv2d, fused_conv2d, {"FusedBatchNorm", "Relu"},
+                       /*num_args*/ 4, /*epsilon*/ matched.epsilon);
+
+  invalidated_nodes->insert(matched.relu);
+  invalidated_nodes->insert(matched.fused_batch_norm);
+  invalidated_nodes->insert(matched.conv2d);
 }
 
 void AddBatchNormNodes(const FusedBatchNorm& matched,
@@ -231,7 +514,7 @@ void AddBatchNormNodes(const FusedBatchNorm& matched,
   string mean = fused_node.input(3);
   string variance = fused_node.input(4);
 
-  if (fused_node.attr().at("data_format").s() == "NCHW") {
+  if (fused_node.attr().at(kDataFormat).s() == "NCHW") {
     // Need to reshape the last 4 inputs
     NodeDef* new_shape = optimized_graph->add_node();
     new_shape->set_name(AddPrefixToNodeName("NCHWShape", fused_node.name()));
@@ -365,38 +648,80 @@ void AddBatchNormNodes(const FusedBatchNorm& matched,
 
 Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
                           GraphDef* optimized_graph) {
-  GraphProperties properties(item);
-  bool inferred_properties = false;
-  GraphView graph(const_cast<GraphDef*>(&item.graph));
-
   // Supported graph patterns.
-  FusedBatchNorm fused_batch_norm{};
-  Conv2DWithBiasAdd conv2d_with_bias{};
-  Conv2DWithBiasAddAndRelu conv2d_with_bias_and_relu{};
-
-  optimized_graph->mutable_node()->Reserve(item.graph.node_size());
-  for (const NodeDef& node : item.graph.node()) {
-    // Remap Conv2D+BiasAdd into the _FusedConv2DWithBias.
-    if (FindConv2DWithBias(graph, &node, &conv2d_with_bias)) {
-      AddFusedConv2DNode(conv2d_with_bias, optimized_graph);
+  // clang-format off
+  FusedBatchNorm              fused_batch_norm;
+  Conv2DWithBiasAdd           conv2d_with_bias;
+  Conv2DWithBiasAddAndRelu    conv2d_with_bias_and_relu;
+  Conv2DWithBatchNorm         conv2d_with_batch_norm;
+  Conv2DWithBatchNormAndRelu  conv2d_with_batch_norm_and_relu;
+  Conv2DWithSqueezeAndBiasAdd conv2d_with_squeeze_and_bias;
+  // clang-format on
+
+  // Processing graph in reverse-topological sorted order allows to remap
+  // longer chains of dependent ops in one pass.
+  GraphDef topo_sorted_graph = item.graph;
+  TF_RETURN_IF_ERROR(TopologicalSort(&topo_sorted_graph));
+  std::reverse(topo_sorted_graph.mutable_node()->begin(),
+               topo_sorted_graph.mutable_node()->end());
+
+  GrapplerItem topo_sorted_item = item.WithGraph(std::move(topo_sorted_graph));
+  RemapperContext ctx(topo_sorted_item);
+
+  // Skip nodes that were invalidated by a remapper, e.g. do not process BiasAdd
+  // and Relu nodes that were fused into a Conv2D node.
+  absl::flat_hash_set<const NodeDef*> invalidated_nodes;
+
+  optimized_graph->mutable_node()->Reserve(topo_sorted_item.graph.node_size());
+  for (const NodeDef& node : topo_sorted_item.graph.node()) {
+    // Check if node was invalidated by one of the previous remaps.
+    if (invalidated_nodes.count(&node) > 0) continue;
+
+    // Remap Conv2D+BiasAdd into the _FusedConv2D.
+    if (FindConv2DWithBias(ctx, &node, &conv2d_with_bias)) {
+      AddFusedConv2DNode(conv2d_with_bias, optimized_graph, &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+BiasAdd+Relu into the _FusedConv2D.
+    if (FindConv2DWithBiasAndRelu(ctx, &node, &conv2d_with_bias_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+Squeeze+BiasAdd into the _FusedConv2D+Squeeze.
+    if (FindConv2DWithSqueezeAndBias(ctx, &node,
+                                     &conv2d_with_squeeze_and_bias)) {
+      AddFusedConv2DNode(conv2d_with_squeeze_and_bias, optimized_graph,
+                         &invalidated_nodes);
+      continue;
+    }
+
+    // Remap Conv2D+FusedBatchNorm into the _FusedConv2D;
+    if (FindConv2DWithBatchNorm(ctx, &node, &conv2d_with_batch_norm)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm, optimized_graph,
+                         &invalidated_nodes);
       continue;
     }
 
-    // Remap Conv2D+BiasAdd+Relu into the _FusedConv2DWithBias(Relu).
-    if (FindConv2DWithBiasAndRelu(graph, &node, &conv2d_with_bias_and_relu)) {
-      AddFusedConv2DNode(conv2d_with_bias_and_relu, optimized_graph);
+    // Remap Conv2D+FusedBatchNorm+Relu into the _FusedConv2D;
+    if (FindConv2DWithBatchNormAndRelu(ctx, &node,
+                                       &conv2d_with_batch_norm_and_relu)) {
+      AddFusedConv2DNode(conv2d_with_batch_norm_and_relu, optimized_graph,
+                         &invalidated_nodes);
       continue;
     }
 
     // Infer properties lazily in case they are not needed.
-    if (!inferred_properties && IsFusedBatchNormCandidate(graph, node)) {
-      TF_RETURN_IF_ERROR(properties.InferStatically(false));
-      inferred_properties = true;
+    if (!ctx.inferred_graph_properties && IsFusedBatchNormCandidate(node)) {
+      TF_RETURN_IF_ERROR(ctx.graph_properties.InferStatically(false));
+      ctx.inferred_graph_properties = true;
     }
 
     // During inference, most of the inputs to FusedBatchNorm are constant, and
     // we can therefore replace the op with a much cheaper set of primitives.
-    if (FindFusedBatchNorm(graph, properties, &node, &fused_batch_norm)) {
+    if (FindFusedBatchNorm(ctx, &node, &fused_batch_norm)) {
       AddBatchNormNodes(fused_batch_norm, optimized_graph);
       continue;
     }
@@ -405,8 +730,8 @@ Status Remapper::Optimize(Cluster* /*cluster*/, const GrapplerItem& item,
     *optimized_graph->add_node() = node;
   }
 
-  *optimized_graph->mutable_library() = item.graph.library();
-  *optimized_graph->mutable_versions() = item.graph.versions();
+  *optimized_graph->mutable_library() = topo_sorted_item.graph.library();
+  *optimized_graph->mutable_versions() = topo_sorted_item.graph.versions();
 
   return Status::OK();
 }
diff --git a/tensorflow/core/grappler/optimizers/remapper_test.cc b/tensorflow/core/grappler/optimizers/remapper_test.cc
index 249ca706730b0c73f8f324b7a3a63ef866a866d5..ffc242decc70e8947547fbe9ca25909625381887 100644
--- a/tensorflow/core/grappler/optimizers/remapper_test.cc
+++ b/tensorflow/core/grappler/optimizers/remapper_test.cc
@@ -24,7 +24,17 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-class RemapperTest : public GrapplerTest {};
+class RemapperTest : public GrapplerTest {
+ protected:
+  // TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+  // contractions with non-default contraction output kernels.
+  bool EigenSupportsContractionOutputKernel() {
+#if defined(EIGEN_USE_LIBXSMM)
+    return false;
+#endif
+    return true;
+  }
+};
 
 TEST_F(RemapperTest, FusedBatchNorm) {
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -92,6 +102,8 @@ TEST_F(RemapperTest, FusedBatchNormNCHW) {
 }
 
 TEST_F(RemapperTest, FuseConv2DWithBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
   using ::tensorflow::ops::Placeholder;
 
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -153,6 +165,8 @@ TEST_F(RemapperTest, FuseConv2DWithBias) {
 }
 
 TEST_F(RemapperTest, FuseConv2DWithBiasAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
   using ::tensorflow::ops::Placeholder;
 
   tensorflow::Scope s = tensorflow::Scope::NewRootScope();
@@ -215,5 +229,233 @@ TEST_F(RemapperTest, FuseConv2DWithBiasAndRelu) {
   test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
 }
 
+TEST_F(RemapperTest, FuseConv2DWithBatchNorm) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), batch_norm.y);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "batch_norm") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(1, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithBatchNormAndRelu) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 32, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto scale_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto scale = Placeholder(s.WithOpName("scale"), DT_FLOAT, scale_shape);
+  auto offset = Placeholder(s.WithOpName("offset"), DT_FLOAT, scale_shape);
+  auto mean = Placeholder(s.WithOpName("mean"), DT_FLOAT, scale_shape);
+  auto variance = Placeholder(s.WithOpName("variance"), DT_FLOAT, scale_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+  ops::FusedBatchNorm::Attrs attrs;
+  attrs = attrs.IsTraining(false);
+  auto batch_norm = ops::FusedBatchNorm(s.WithOpName("batch_norm"), conv, scale,
+                                        offset, mean, variance, attrs);
+  auto relu = ops::Relu(s.WithOpName("relu"), batch_norm.y);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), relu);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 32, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto scale_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto offset_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto mean_t = GenerateRandomTensor<DT_FLOAT>({128});
+  auto variance_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t},
+               {"scale", scale_t}, {"offset", offset_t},
+               {"mean", mean_t},   {"variance", variance_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "relu") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(4, node.attr().at("num_args").i());
+      EXPECT_EQ("scale", node.input(2));
+      EXPECT_EQ("offset", node.input(3));
+      EXPECT_EQ("mean", node.input(4));
+      EXPECT_EQ("variance", node.input(5));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      EXPECT_EQ(2, fused_ops.size());
+      EXPECT_EQ("FusedBatchNorm", fused_ops[0]);
+      EXPECT_EQ("Relu", fused_ops[1]);
+      found++;
+    }
+  }
+  EXPECT_EQ(1, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
+TEST_F(RemapperTest, FuseConv2DWithSqueezeAndBias) {
+  if (!EigenSupportsContractionOutputKernel()) return;
+
+  using ops::Placeholder;
+
+  tensorflow::Scope s = tensorflow::Scope::NewRootScope();
+
+  auto input_shape = ops::Placeholder::Shape({8, 32, 1, 3});
+  auto filter_shape = ops::Placeholder::Shape({1, 1, 3, 128});
+  auto bias_shape = ops::Placeholder::Shape({128});
+
+  auto input = Placeholder(s.WithOpName("input"), DT_FLOAT, input_shape);
+  auto filter = Placeholder(s.WithOpName("filter"), DT_FLOAT, filter_shape);
+  auto bias = Placeholder(s.WithOpName("bias"), DT_FLOAT, bias_shape);
+
+  std::vector<int> strides = {1, 1, 1, 1};
+  auto conv = ops::Conv2D(s.WithOpName("conv"), input, filter, strides, "SAME");
+
+  ops::Squeeze::Attrs attrs;
+  attrs = attrs.Axis({2});
+  auto squeeze = ops::Squeeze(s.WithOpName("squeeze"), conv, attrs);
+
+  auto bias_add = ops::BiasAdd(s.WithOpName("bias_add"), squeeze, bias);
+  auto fetch = ops::Identity(s.WithOpName("fetch"), bias_add);
+
+  auto input_t = GenerateRandomTensor<DT_FLOAT>({8, 32, 1, 3});
+  auto filter_t = GenerateRandomTensor<DT_FLOAT>({1, 1, 3, 128});
+  auto bias_t = GenerateRandomTensor<DT_FLOAT>({128});
+
+  GrapplerItem item;
+  item.fetch = {"fetch"};
+  item.feed = {{"input", input_t}, {"filter", filter_t}, {"bias", bias_t}};
+  TF_CHECK_OK(s.ToGraphDef(&item.graph));
+
+  // Place all nodes on CPU.
+  for (int i = 0; i < item.graph.node_size(); ++i) {
+    item.graph.mutable_node(i)->set_device("/device:CPU:0");
+  }
+
+  Remapper optimizer(RewriterConfig::ON);
+  GraphDef output;
+  TF_CHECK_OK(optimizer.Optimize(nullptr, item, &output));
+
+  int found = 0;
+  for (const NodeDef& node : output.node()) {
+    if (node.name() == "conv") {
+      EXPECT_EQ("_FusedConv2D", node.op());
+      EXPECT_EQ("input", node.input(0));
+      EXPECT_EQ("filter", node.input(1));
+
+      EXPECT_EQ(1, node.attr().at("num_args").i());
+      EXPECT_EQ("bias", node.input(2));
+
+      const auto fused_ops = node.attr().at("fused_ops").list().s();
+      ASSERT_EQ(1, fused_ops.size());
+      EXPECT_EQ("BiasAdd", fused_ops[0]);
+      found++;
+    } else if (node.name() == "bias_add") {
+      EXPECT_EQ("Squeeze", node.op());
+      EXPECT_EQ("conv", node.input(0));
+      found++;
+    }
+  }
+  EXPECT_EQ(2, found);
+
+  auto tensors_expected = EvaluateNodes(item.graph, item.fetch, item.feed);
+  auto tensors = EvaluateNodes(output, item.fetch, item.feed);
+  EXPECT_EQ(1, tensors_expected.size());
+  EXPECT_EQ(1, tensors.size());
+  test::ExpectTensorNear<float>(tensors_expected[0], tensors[0], 1e-6);
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
index 0d4aaf646218f1a784878bd099e68f166dd0340b..e537b3df07deea17b1a53d1abf18be7bad3a6d23 100644
--- a/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
+++ b/tensorflow/core/grappler/optimizers/scoped_allocator_optimizer.cc
@@ -790,20 +790,17 @@ Tree* ComputeScopeTree(const string& op_name,
   return root;
 }
 
-void PartitionByLoopStructure(const FrameMap& frame_map,
+void PartitionByLoopStructure(const FrameView& frame_view,
                               std::vector<NodeDef*> nodes,
                               std::vector<std::vector<NodeDef*>>* loop_groups) {
   // It is assumed that two nodes with identical loop containment have
-  // identical integer vectors.  Represent those by 64 bit hashes.
+  // identical integer vectors. Represent those by 64 bit hashes.
   std::unordered_map<uint64, std::vector<NodeDef*>> loop_sets;
   for (NodeDef* nd : nodes) {
     uint64 hash = 0;
-    const auto& it = frame_map.find(nd);
-    if (it != frame_map.end()) {
-      const std::vector<int>& loop_ids = it->second;
-      for (int id : loop_ids) {
-        hash = Hash64Combine(hash, static_cast<uint64>(id));
-      }
+    const std::vector<int>& loop_ids = frame_view.Frames(*nd);
+    for (int id : loop_ids) {
+      hash = Hash64Combine(hash, static_cast<uint64>(id));
     }
     loop_sets[hash].push_back(nd);
   }
@@ -821,10 +818,11 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
   GraphOpOccurrences occ;
   FindOpOccurrences(graph, op_name_set_, &occ);
   if (!occ.empty()) {
-    FrameMap frame_map;
-    int num_frames;
-    LOG_WARNING_AND_RETURN_IF_ERROR(
-        IdentifyFramesWithNodeMap(*graph, *node_map_, &frame_map, &num_frames));
+    FrameView frame_view;
+    // TODO(ezhulenev): Pass a GraphView when this optimizer will be migrated
+    // from NodeMap.
+    LOG_WARNING_AND_RETURN_IF_ERROR(frame_view.InferFromGraph(*graph));
+
     for (auto& dt : occ) {
       VLOG(2) << "Processing device " << dt.first;
       const DevOpOccurrences& dev_occ = dt.second;
@@ -841,26 +839,26 @@ Status ScopedAllocatorOptimizer::ProcessGraphDef(
         // Nodes with a common depth and root path are now grouped
         // in the same Tree struct.  Split those groups into subgroups that
         // share identical loop nesting.
-        status = ApplyToAll(
-            root.get(), [this, rewriter, graph, &frame_map, &op_name](Tree* t) {
-              VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
-                      << t->depth_ << " of size " << t->nodes_.size();
-              if (t->nodes_.size() > 1) {
-                std::vector<std::vector<NodeDef*>> loop_groups;
-                PartitionByLoopStructure(frame_map, t->nodes_, &loop_groups);
-                for (auto& lg : loop_groups) {
-                  if (lg.size() > 1) {
-                    bool applied = false;
-                    Status s = OrderNodeSet(&lg);
-                    TF_RETURN_IF_ERROR(s);
-                    VLOG(1) << "Applying Rewriter for " << op_name;
-                    s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
-                    LOG_WARNING_AND_RETURN_IF_ERROR(s);
-                  }
-                }
+        status = ApplyToAll(root.get(), [this, rewriter, graph, &frame_view,
+                                         &op_name](Tree* t) {
+          VLOG(2) << "applied to tree node " << t->edge_ << " at depth "
+                  << t->depth_ << " of size " << t->nodes_.size();
+          if (t->nodes_.size() > 1) {
+            std::vector<std::vector<NodeDef*>> loop_groups;
+            PartitionByLoopStructure(frame_view, t->nodes_, &loop_groups);
+            for (auto& lg : loop_groups) {
+              if (lg.size() > 1) {
+                bool applied = false;
+                Status s = OrderNodeSet(&lg);
+                TF_RETURN_IF_ERROR(s);
+                VLOG(1) << "Applying Rewriter for " << op_name;
+                s = rewriter->Rewrite(this, graph, op_name, lg, &applied);
+                LOG_WARNING_AND_RETURN_IF_ERROR(s);
               }
-              return Status::OK();
-            });
+            }
+          }
+          return Status::OK();
+        });
         if (!status.ok()) {
           break;
         }
diff --git a/tensorflow/core/grappler/utils.cc b/tensorflow/core/grappler/utils.cc
index 9336c4df8b05408d9f8ce622bf488a7b3d07bc3e..29775442629dd5a56776f2d0005f9ba50c2da84b 100644
--- a/tensorflow/core/grappler/utils.cc
+++ b/tensorflow/core/grappler/utils.cc
@@ -40,8 +40,8 @@ namespace {
 template <typename T>
 bool SafeSetScalarTensorValue(double value, Tensor* tensor) {
   using RealType = typename Eigen::NumTraits<T>::Real;
-  if (value > static_cast<double>(std::numeric_limits<RealType>::max()) ||
-      value < static_cast<double>(std::numeric_limits<RealType>::min())) {
+  if (value > static_cast<double>(Eigen::NumTraits<RealType>::highest()) ||
+      value < static_cast<double>(Eigen::NumTraits<RealType>::lowest())) {
     return false;
   }
   tensor->flat<T>()(0) = static_cast<T>(value);
diff --git a/tensorflow/core/grappler/utils/BUILD b/tensorflow/core/grappler/utils/BUILD
index dbe425b75fd1bb3632690d2ba16073e9ba9340a3..c0f19d3828ac1581a937531318ff62875fbf3bc7 100644
--- a/tensorflow/core/grappler/utils/BUILD
+++ b/tensorflow/core/grappler/utils/BUILD
@@ -74,8 +74,9 @@ cc_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core/grappler:graph_view",
         "//tensorflow/core/grappler:op_types",
-        "//tensorflow/core/grappler:utils",
+        "@com_google_absl//absl/container:flat_hash_map",
     ],
 )
 
@@ -172,7 +173,6 @@ cc_library(
         "//tensorflow/core/grappler:grappler_item",
         "//tensorflow/core/grappler:op_types",
         "//tensorflow/core/grappler:utils",
-        "@com_google_absl//absl/container:flat_hash_set",
         "@com_google_absl//absl/strings",
     ],
 )
diff --git a/tensorflow/core/grappler/utils/frame.cc b/tensorflow/core/grappler/utils/frame.cc
index df5f4ff7cf38dbc7ab3038346cd4ea65031c8227..2484b35de06c74659c583c7d34d4881729e00f21 100644
--- a/tensorflow/core/grappler/utils/frame.cc
+++ b/tensorflow/core/grappler/utils/frame.cc
@@ -15,77 +15,128 @@ limitations under the License.
 
 #include "tensorflow/core/grappler/utils/frame.h"
 #include <deque>
-#include <stack>
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/op_types.h"
-#include "tensorflow/core/grappler/utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 
 namespace tensorflow {
 namespace grappler {
 
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames) {
-  NodeMap node_map(const_cast<GraphDef*>(&graph));
-  return IdentifyFramesWithNodeMap(graph, node_map, frame_map, num_frames);
-}
+namespace {}  // namespace
+
+Status FrameView::InferFromGraphView(const GraphView& graph_view) {
+  if (is_inferred_) {
+    return errors::Internal("FrameView was already inferred from the graph");
+  }
+  is_inferred_ = true;
+
+  std::deque<const NodeDef*> ready_nodes;
 
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames) {
-  std::deque<std::pair<const NodeDef*, std::vector<int>>> ready_nodes;
-  for (const NodeDef& node : graph.node()) {
+  // All nodes without inputs are automatically added to the ready queue.
+  for (const NodeDef& node : graph_view.graph()->node()) {
     if (node.input_size() == 0) {
-      std::vector<int> empty;
-      ready_nodes.emplace_back(&node, empty);
-      (*frame_map)[&node] = empty;
+      ready_nodes.push_back(&node);
+      node_to_frames_[&node] = node_has_no_frames_;
     }
   }
-  std::map<string, int> name_to_id;
+
+  // We assign unique int id to each frame, and use this map to track what
+  // frames we've already seen in the graph.
+  absl::flat_hash_map<string, int> frame_name_to_id;
+
   while (!ready_nodes.empty()) {
-    auto ready_node = ready_nodes.front();
-    for (const auto& fanout : node_map.GetOutputs(ready_node.first->name())) {
-      if (frame_map->count(fanout) < 1) {
-        std::vector<int> frame_ids = ready_node.second;
-        if (IsExit(*ready_node.first)) {
+    const NodeDef* ready_node = ready_nodes.front();
+
+    absl::flat_hash_set<GraphView::InputPort> fanouts =
+        graph_view.GetFanouts(*ready_node, /*include_controlled_nodes=*/true);
+
+    for (const GraphView::InputPort& fanout : fanouts) {
+      if (node_to_frames_.count(fanout.node) < 1) {
+        // If we have never seen this node before, we add all frames from the
+        // incoming node (and pop/push frames if coming from Exit/Enter nodes).
+        std::vector<int> frame_ids = node_to_frames_[ready_node];
+
+        if (IsExit(*ready_node)) {
           frame_ids.pop_back();
         }
-        if (IsEnter(*fanout)) {
-          CHECK(fanout->attr().count("frame_name"))
-              << "Missing frame name for the Enter node " << fanout->name();
-          string name = fanout->attr().at("frame_name").s();
-          int id;
-          if (name_to_id.count(name)) {
-            id = name_to_id[name];
+
+        if (IsEnter(*fanout.node)) {
+          const AttrValue* frame_name_attr =
+              AttrSlice(*fanout.node).Find("frame_name");
+
+          if (!frame_name_attr) {
+            return errors::InvalidArgument(
+                "Missing frame name for the Enter node: ",
+                SummarizeNodeDef(*fanout.node));
+          }
+
+          absl::string_view frame_name = frame_name_attr->s();
+          int frame_id;
+
+          if (frame_name_to_id.count(frame_name)) {
+            frame_id = frame_name_to_id[frame_name];
           } else {
-            id = name_to_id.size();
-            name_to_id[name] = id;
+            frame_id = static_cast<int>(frame_name_to_id.size());
+            frame_name_to_id[frame_name] = frame_id;
           }
-          frame_ids.push_back(id);
+
+          frame_ids.push_back(frame_id);
         }
-        ready_nodes.emplace_back(fanout, frame_ids);
-        (*frame_map)[fanout] = frame_ids;
+
+        ready_nodes.push_back(fanout.node);
+        node_to_frames_[fanout.node] = std::move(frame_ids);
+
       } else {
-        auto frame_ids_fanout = (*frame_map)[fanout];
-        auto frame_ids_node = ready_node.second;
-        if (IsEnter(*fanout)) {
+        // If we've already seen this node before, we need to make sure that
+        // graph is correct and same nodes doesn't have incoming edges with
+        // conflicting frames (all inputs must be produces in the same frame).
+
+        std::vector<int> frame_ids_fanout = node_to_frames_[fanout.node];
+        std::vector<int> frame_ids_node = node_to_frames_[ready_node];
+
+        if (IsEnter(*fanout.node)) {
           frame_ids_fanout.pop_back();
         }
-        if (IsExit(*ready_node.first)) {
+        if (IsExit(*ready_node)) {
           frame_ids_node.pop_back();
         }
+
         if (frame_ids_node != frame_ids_fanout) {
           return errors::InvalidArgument(
-              "Invalid graph: Frame ids for node ", ready_node.first->name(),
-              " does not match frame ids for it's fanout.");
+              "Invalid graph: Frame ids for node ", ready_node->name(),
+              " does not match frame ids for it's fanout ",
+              fanout.node->name());
         }
       }
     }
+
     ready_nodes.pop_front();
   }
-  *num_frames = name_to_id.size();
+
+  num_frames_ = static_cast<int>(frame_name_to_id.size());
   return Status::OK();
 }
 
+Status FrameView::InferFromGraph(const GraphDef& graph) {
+  return InferFromGraphView(GraphView(&graph));
+}
+
+const std::vector<int>& FrameView::Frames(const NodeDef& node) const {
+  DCHECK(is_inferred_) << "FrameView is not initialized";
+  auto frames = node_to_frames_.find(&node);
+  if (frames == node_to_frames_.end()) {
+    LOG(WARNING) << "Node doesn't belong to the graph used for initialization";
+    return node_has_no_frames_;
+  } else {
+    return frames->second;
+  }
+}
+
+bool FrameView::IsInFrame(const NodeDef& node) const {
+  return !Frames(node).empty();
+}
+
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame.h b/tensorflow/core/grappler/utils/frame.h
index 95b72748f4e1f13f1c61d64c4a457287e9d7d46b..04c6588275098a0a3f7110be7af4e2e9207b0ac2 100644
--- a/tensorflow/core/grappler/utils/frame.h
+++ b/tensorflow/core/grappler/utils/frame.h
@@ -17,25 +17,52 @@ limitations under the License.
 #define TENSORFLOW_CORE_GRAPPLER_UTILS_FRAME_H_
 
 #include <unordered_map>
+#include "absl/container/flat_hash_map.h"
 #include "tensorflow/core/framework/graph.pb.h"
-#include "tensorflow/core/grappler/utils.h"
+#include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tensorflow {
 namespace grappler {
 
-using FrameMap = std::unordered_map<const NodeDef*, std::vector<int>>;
+// FrameView is a helper class that allows to find in what execution frames (if
+// any) the given node can be running in. It's constructed from an immutable
+// GraphView, and any modification of the underlying graph might invalidate it.
+//
+// All execution frames assigned an unique integer id, but they do not have any
+// meaning whatsoever, it's just a sequence number.
+//
+// See the paper "Dynamic Control Flow in Large-Scale Machine Learning" for
+// detailed explanation of execution frames (https://arxiv.org/abs/1805.01772).
+class FrameView {
+ public:
+  FrameView() : is_inferred_(false), num_frames_(0) {}
 
-// Returns the number of frames present in the graph, and populates
-// the 'frames' argument with the collection of frames (denoted by their
-// frame ids) in the outermost-to-innermost order. Frame ids are arbitrary.
-Status IdentifyFrames(const GraphDef& graph, FrameMap* frame_map,
-                      int* num_frames);
+  // Infers nodes execution frames from the GraphView. Returns an error if
+  // called multiple times.
+  Status InferFromGraphView(const GraphView& graph_view);
+  // Infers nodes execution by constructing temporary GraphView and passing it
+  // to InferFromGraphView.
+  Status InferFromGraph(const GraphDef& graph);
 
-// As above, but use an existing NodeMap for graph instead of building it
-// from scratch.
-Status IdentifyFramesWithNodeMap(const GraphDef& graph, const NodeMap& node_map,
-                                 FrameMap* frame_map, int* num_frames);
+  // Returns all frames of the given node (denoted by their frame ids) in
+  // outermost-to-innermost order.
+  const std::vector<int>& Frames(const NodeDef& node) const;
+
+  // Returns true iff the node is at least in one execution frame.
+  bool IsInFrame(const NodeDef& node) const;
+
+  int num_frames() const { return num_frames_; }
+  bool is_inferred() const { return is_inferred_; }
+
+ private:
+  bool is_inferred_;  // true if it was inferred from the graph
+  int num_frames_;    // number of frames present in a graph
+  absl::flat_hash_map<const NodeDef*, std::vector<int>> node_to_frames_;
+
+  // We return a reference to this vector if node has no frames.
+  const std::vector<int> node_has_no_frames_;
+};
 
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/frame_test.cc b/tensorflow/core/grappler/utils/frame_test.cc
index df76083fc3a0334172ac93998e0b549a2c723431..cc82e0ed3a39dd117e2197fa9a47fe2f3372051d 100644
--- a/tensorflow/core/grappler/utils/frame_test.cc
+++ b/tensorflow/core/grappler/utils/frame_test.cc
@@ -23,7 +23,7 @@ namespace tensorflow {
 namespace grappler {
 namespace {
 
-class IdentifyFramesTest : public ::testing::Test {
+class FrameViewTest : public ::testing::Test {
  protected:
   static NodeDef CreateNode(const string& name,
                             const std::vector<string>& inputs) {
@@ -53,19 +53,17 @@ class IdentifyFramesTest : public ::testing::Test {
   }
 };
 
-TEST_F(IdentifyFramesTest, NestedLoop) {
+TEST_F(FrameViewTest, NestedLoop) {
   GraphDef graph;
   // Create a two-level nested loop
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context1", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("3", "Merge", {"2", "14"});
   *graph.add_node() = CreateNode("4", {"3"});
   *graph.add_node() = CreateNode("5", "Switch", {"4"});
   *graph.add_node() = CreateNode("6", {"5"});
-  *graph.add_node() =
-      CreateNode("7", "Enter", "map/while/while_context2", {"6"});
+  *graph.add_node() = CreateNode("7", "Enter", "while/context2", {"6"});
   *graph.add_node() = CreateNode("8", {"7"});
   *graph.add_node() = CreateNode("9", "Merge", {"8", "12"});
   *graph.add_node() = CreateNode("10", {"9"});
@@ -77,118 +75,106 @@ TEST_F(IdentifyFramesTest, NestedLoop) {
   *graph.add_node() = CreateNode("16", "Exit", {"15"});
   *graph.add_node() = CreateNode("17", {"16"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}},      {"1", {0}},     {"2", {0}},     {"3", {0}},
       {"4", {0}},     {"5", {0}},     {"6", {0}},     {"7", {0, 1}},
       {"8", {0, 1}},  {"9", {0, 1}},  {"10", {0, 1}}, {"11", {0, 1}},
       {"12", {0, 1}}, {"13", {0, 1}}, {"14", {0}},    {"15", {0}},
       {"16", {0}},    {"17", {}}};
-  EXPECT_EQ(num_frames, 2);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 2);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleInputsToEnter) {
+TEST_F(FrameViewTest, MultipleInputsToEnter) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
   *graph.add_node() = CreateNode("1", {});
-  *graph.add_node() =
-      CreateNode("2", "Enter", "map/while/while_context", {"0", "1"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context", {"0", "1"});
   *graph.add_node() = CreateNode("3", "Exit", {"2"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {}}, {"2", {0}}, {"3", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, ExitOutput) {
+TEST_F(FrameViewTest, ExitOutput) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  *graph.add_node() =
-      CreateNode("1", "Enter", "map/while/while_context", {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", "Exit", {"1"});
   *graph.add_node() = CreateNode("3", {});
   *graph.add_node() = CreateNode("4", {"2", "3"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {}}, {"4", {}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
-TEST_F(IdentifyFramesTest, MultipleEnterNodes) {
+TEST_F(FrameViewTest, MultipleEnterNodes) {
   GraphDef graph;
   *graph.add_node() = CreateNode("0", {});
-  string frame = "map/while/while_context";
-  *graph.add_node() = CreateNode("1", "Enter", frame, {"0"});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context", {"0"});
   *graph.add_node() = CreateNode("2", {"1"});
   *graph.add_node() = CreateNode("5", {});
-  *graph.add_node() = CreateNode("4", "Enter", frame, {"5"});
+  *graph.add_node() = CreateNode("4", "Enter", "while/context", {"5"});
   *graph.add_node() = CreateNode("3", {"4", "2"});
   *graph.add_node() = CreateNode("6", "Merge", {"3", "8"});
   *graph.add_node() = CreateNode("7", "Switch", {"6"});
   *graph.add_node() = CreateNode("8", "NextIteration", {"7"});
   *graph.add_node() = CreateNode("9", "Exit", {"7"});
 
-  std::unordered_map<const NodeDef*, std::vector<int>> frames;
-  int num_frames;
-  EXPECT_TRUE(IdentifyFrames(graph, &frames, &num_frames).ok());
+  FrameView frame_view;
+  ASSERT_TRUE(frame_view.InferFromGraph(graph).ok());
+
   std::unordered_map<string, std::vector<int>> expected = {
       {"0", {}}, {"1", {0}}, {"2", {0}}, {"3", {0}}, {"4", {0}},
       {"5", {}}, {"6", {0}}, {"7", {0}}, {"8", {0}}, {"9", {0}}};
-  EXPECT_EQ(num_frames, 1);
-  EXPECT_EQ(frames.size(), expected.size());
-  std::cout << "Number of frame: " << num_frames << std::endl;
-  for (const auto& node : frames) {
-    std::cout << node.first->name() << ": ";
-    EXPECT_EQ(node.second.size(), expected[node.first->name()].size());
-    for (int i = 0; i < node.second.size(); i++) {
-      EXPECT_EQ(expected[node.first->name()][i], node.second[i]);
-      std::cout << node.second[i] << " ";
-    }
-    std::cout << std::endl;
+
+  EXPECT_EQ(frame_view.num_frames(), 1);
+  for (const NodeDef& node : graph.node()) {
+    std::vector<int> expected_frames = expected[node.name()];
+    std::vector<int> node_frames = frame_view.Frames(node);
+    EXPECT_EQ(expected_frames, node_frames);
   }
 }
 
+TEST_F(FrameViewTest, ConflictingFrames) {
+  GraphDef graph;
+  *graph.add_node() = CreateNode("0", {});
+  *graph.add_node() = CreateNode("1", "Enter", "while/context1", {"0"});
+  *graph.add_node() = CreateNode("2", "Enter", "while/context2", {"1"});
+  *graph.add_node() = CreateNode("3", {"1", "2"});
+
+  FrameView frame_view;
+  ASSERT_FALSE(frame_view.InferFromGraph(graph).ok());
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/grappler/utils/functions.cc b/tensorflow/core/grappler/utils/functions.cc
index c806f3874ddbfa7493914e69c08dbacb8c5db763..57863a71f35f176e3935e2121f5650a58c72d642 100644
--- a/tensorflow/core/grappler/utils/functions.cc
+++ b/tensorflow/core/grappler/utils/functions.cc
@@ -74,120 +74,16 @@ Status ResolveFunctionBodyNodeAttrPlaceholders(
   return Status::OK();
 }
 
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
-  // Functions that are reachable from the graph.
-  absl::flat_hash_set<string> reachable_funcs;
-
-  // Functions might be reachable from the nested function calls, so we keep a
-  // queue of functions that we have to check.
-  gtl::InlinedVector<const FunctionDef*, 4> func_queue;
-
-  // Add reachable and not already processed functions to the functions queue.
-  const auto add_to_func_queue = [&](const string& func_name) {
-    const FunctionDef* func = flib.Find(func_name);
-    if (func && reachable_funcs.find(func_name) == reachable_funcs.end()) {
-      func_queue.push_back(func);
-    }
-  };
-
-  // Add all the functions that are reachable from the given node to the queue.
-  const auto process_node = [&](const NodeDef& node) {
-    // Node itself can be a call to the function.
-    add_to_func_queue(node.op());
-
-    // Or node can have an attribute referencing a function.
-    for (const auto& attr : node.attr()) {
-      const auto& attr_value = attr.second;
-
-      // 1. AttrValue.func
-      if (attr_value.has_func()) {
-        add_to_func_queue(attr_value.func().name());
-      }
-
-      // 2. AttrValue.ListValue.func
-      if (attr_value.has_list()) {
-        for (const auto& func : attr_value.list().func()) {
-          add_to_func_queue(func.name());
-        }
-      }
-    }
-  };
-
-  // Add all functions that are directly called from the optimized graph.
-  std::for_each(nodes.begin(), nodes.end(), process_node);
-
-  // Process all reachable functions.
-  while (!func_queue.empty()) {
-    const FunctionDef* func = func_queue.back();
-    func_queue.pop_back();
-
-    const string& func_name = func->signature().name();
-    reachable_funcs.insert(func_name);
-
-    // Find all the functions called from the function body.
-    const auto& func_body = func->node_def();
-    std::for_each(func_body.begin(), func_body.end(), process_node);
-
-    // Check if the function has a registered gradient.
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) add_to_func_queue(grad_func_name);
-  }
-
-  return reachable_funcs;
-}
-
-FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
-    const FunctionLibraryDefinition& flib,
-    const protobuf::RepeatedPtrField<NodeDef>& nodes) {
-  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, nodes);
-
-  FunctionLibraryDefinition reachable_flib(flib.default_registry(),
-                                           FunctionDefLibrary());
-
-  for (const string& func_name : reachable_funcs) {
-    const FunctionDef* func = flib.Find(func_name);
-    DCHECK_NE(func, nullptr);
-    // That should never fail, because we copy functions from valid flib and use
-    // the same default registry.
-    const Status added = reachable_flib.AddFunctionDef(*func);
-    DCHECK(added.ok());
-
-    const string grad_func_name = flib.FindGradient(func_name);
-    if (!grad_func_name.empty()) {
-      GradientDef grad;
-      grad.set_function_name(func_name);
-      grad.set_gradient_func(grad_func_name);
-      // It can only fail if function already has a gradient function.
-      const Status added_grad = reachable_flib.AddGradientDef(grad);
-      DCHECK(added_grad.ok());
-    }
-  }
-
-  return reachable_flib;
-}
-
 }  // namespace
 
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return ReachableFunctions(flib, graph.node());
-}
-
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return ReachableFunctions(flib, func.node_def());
-}
-
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
     const FunctionLibraryDefinition& flib, const GraphDef& graph) {
-  return ReachableFunctionLibraryDefinition(flib, graph.node());
+  return flib.ReachableDefinitions(graph);
 }
 
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
     const FunctionLibraryDefinition& flib, const FunctionDef& func) {
-  return ReachableFunctionLibraryDefinition(flib, func.node_def());
+  return flib.ReachableDefinitions(func);
 }
 
 void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
@@ -198,7 +94,7 @@ void GrapplerFunctionConnectivity::RegisterInputArgExpansion(
   for (int i = 0; i < placeholders.size(); ++i) {
     const string& placeholder = input_arg_expansion.placeholders[i];
     input_arg_placeholders_.insert(
-        {placeholder, InputArgPlaceholder{input_name, /*position=*/i}});
+        {placeholder, InputArgPlaceholder{input_name, /*input_position=*/i}});
   }
   input_arg_expansions_.insert(
       {std::move(input_name), std::move(input_arg_expansion)});
@@ -352,8 +248,8 @@ Status GrapplerFunctionConnectivity::AsFunctionDefInput(
     const InputArgPlaceholder* placeholder =
         FindOrNull(input_arg_placeholders_, node_name);
     if (placeholder != nullptr) {
-      *func_def_input =
-          strings::StrCat(placeholder->input_name, ":", placeholder->position);
+      *func_def_input = strings::StrCat(placeholder->input_name, ":",
+                                        placeholder->input_position);
       return Status::OK();
     }
   }
@@ -451,12 +347,6 @@ GrapplerFunctionItem::GrapplerFunctionItem(
       fetch.push_back(output_tensor);
     }
   }
-  // Stateful and Send (it's not stateful) nodes must be preserved in the graph.
-  for (const NodeDef& node : graph.node()) {
-    if (IsSend(node)) {
-      keep_ops.push_back(node.name());
-    }
-  }
 }
 
 const string& GrapplerFunctionItem::description() const { return description_; }
@@ -688,8 +578,8 @@ Status MakeGrapplerFunctionItem(const FunctionDef& func,
     TF_RETURN_IF_ERROR(RegisterFunctionBodyOutputs(*registration, func_def_node,
                                                    &connectivity));
 
-    // Stateful and Send nodes must be preserved in a function body
-    if (registration->op_def.is_stateful() || IsSend(func_def_node)) {
+    // Ops with side effects must be preserved in a function body.
+    if (!IsFreeOfSideEffect(func_def_node)) {
       keep_nodes.push_back(func_def_node.name());
     }
   }
diff --git a/tensorflow/core/grappler/utils/functions.h b/tensorflow/core/grappler/utils/functions.h
index 72b3c0f31aaa919d57567c158e90648513379fb5..038cf5f527e0f32cc10e123bb0cab357e5902463 100644
--- a/tensorflow/core/grappler/utils/functions.h
+++ b/tensorflow/core/grappler/utils/functions.h
@@ -19,7 +19,6 @@ limitations under the License.
 #include <memory>
 #include <string>
 #include <unordered_map>
-#include "absl/container/flat_hash_set.h"
 #include "tensorflow/core/framework/attr_value.pb.h"
 #include "tensorflow/core/framework/function.h"
 #include "tensorflow/core/framework/function.pb.h"
@@ -31,13 +30,6 @@ limitations under the License.
 namespace tensorflow {
 namespace grappler {
 
-// Returns a set of functions from the function library, that are reachable from
-// the nodes of the graph.
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const GraphDef& graph);
-absl::flat_hash_set<string> ReachableFunctions(
-    const FunctionLibraryDefinition& flib, const FunctionDef& func);
-
 // Returns a copy of FunctionLibraryDefinition with subset of functions that are
 // reachable from the nodes of the graph.
 FunctionLibraryDefinition ReachableFunctionLibraryDefinition(
@@ -119,8 +111,10 @@ class GrapplerFunctionConnectivity {
   std::unordered_map<string, tensorflow::NameRangeMap> function_body_outputs_;
 
   struct InputArgPlaceholder {
-    string input_name;
-    int position;
+    string input_name;   // Name of the function input argument.
+    int input_position;  // Index of a tensor in the function input argument
+                         // expansion, it can be greater than `0` if input
+                         // argument is a list of tensors (aka list(type)).
   };
 
   // Mapping from input arg placeholder to the function input tensor.
@@ -150,12 +144,6 @@ class GrapplerFunctionItemInstantiation {
 class GrapplerFunctionItem : public GrapplerItem {
  public:
   GrapplerFunctionItem() = default;
-  GrapplerFunctionItem(string func_name, string description,
-                       AttrSlice func_attr,
-                       std::vector<InputArgExpansion> input_arg_expansions,
-                       std::vector<OutputArgExpansion> output_arg_expansions,
-                       std::vector<string> keep_nodes, int graph_def_version,
-                       bool is_stateful, GraphDef&& function_body);
 
   const string& description() const;
 
@@ -178,12 +166,22 @@ class GrapplerFunctionItem : public GrapplerItem {
   GrapplerFunctionItem& SwapFunctionBody(GraphDef&& other);
 
  private:
+  friend Status MakeGrapplerFunctionItem(const FunctionDef&, const AttrSlice&,
+                                         const FunctionLibraryDefinition&, int,
+                                         GrapplerFunctionItem*);
   friend Status ReplaceInputWithConst(const NodeDef&, int,
                                       GrapplerFunctionItem*);
   friend Status RemoveUnusedOutputs(
       const gtl::FlatSet<int>& active_outputs, GrapplerFunctionItem* item,
       std::vector<std::pair<int, int>>* output_mapping);
 
+  GrapplerFunctionItem(string func_name, string description,
+                       AttrSlice func_attr,
+                       std::vector<InputArgExpansion> input_arg_expansions,
+                       std::vector<OutputArgExpansion> output_arg_expansions,
+                       std::vector<string> keep_nodes, int graph_def_version,
+                       bool is_stateful, GraphDef&& function_body);
+
   string description_;
   AttrSlice func_attr_;  // Attributes specific to function definition that
                          // produced this item (FuncDef.attr field).
diff --git a/tensorflow/core/grappler/utils/functions_test.cc b/tensorflow/core/grappler/utils/functions_test.cc
index 16834acecf08d3e066cc52b52ac86bf543c499d4..8639dec05a1eb8aa7afcadc20ee9f8949bfeae14 100644
--- a/tensorflow/core/grappler/utils/functions_test.cc
+++ b/tensorflow/core/grappler/utils/functions_test.cc
@@ -32,65 +32,6 @@ constexpr char kDevice[] = "/device:CPU:0";
 
 class FunctionsTest : public ::testing::Test {};
 
-TEST_F(FunctionsTest, ReachableFunctions) {
-  using ::tensorflow::test::function::GDef;
-  using ::tensorflow::test::function::NDef;
-  using FDH = ::tensorflow::FunctionDefHelper;
-
-  const auto make_simple_fdef = [](const string &name) {
-    return FDH::Create(
-        name, {"x:T", "y:T"}, {"z:T"}, {"T: {float, double}"},
-        {{{"output"}, "Mul", {"x", "y"}, {{"T", "$T"}}}},
-        /* Mapping between function returns and function node outputs. */
-        {{"z", "output:z:0"}});
-  };
-
-  FunctionDef func_1 = make_simple_fdef("Func1");
-  FunctionDef func_2 = make_simple_fdef("Func2");
-  FunctionDef func_3 = make_simple_fdef("Func3");
-
-  FunctionDef func_2_grad = make_simple_fdef("Func2_grad");
-
-  GraphDef graph = GDef(
-      {
-          NDef("a", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-          NDef("b", "Placeholder", {}, {{"dtype", DT_FLOAT}}, kDevice),
-          NDef("x", "Func1", {"a", "b"}, {{"T", DT_FLOAT}}, kDevice),
-          NDef("y", "PartitionedCall", {"a", "b"},
-               {{"Tin", DataTypeSlice{DT_FLOAT, DT_FLOAT}},
-                {"Tout", DataTypeSlice{DT_FLOAT}},
-                {"f", FDH::FunctionRef("Func2", {{"T", DT_FLOAT}})}},
-               kDevice),
-      },
-      // FunctionLib
-      {func_1, func_2, func_3, func_2_grad});
-
-  // Register custom function gradient after the graph was constructed.
-  GradientDef *func3_grad_def = graph.mutable_library()->add_gradient();
-  func3_grad_def->set_function_name("Func2");
-  func3_grad_def->set_gradient_func("Func2_grad");
-
-  FunctionLibraryDefinition flib(OpRegistry::Global(), graph.library());
-
-  // - 'Func1' called directly from the graph
-  // - 'Func2' called indirectly via PartitionedCall attribute, and it also
-  //   has a custom gradient ('Func2_grad') that must remain in the library
-  // - 'Func3' in unreachable and has to be removed from the library
-
-  absl::flat_hash_set<string> reachable_funcs = ReachableFunctions(flib, graph);
-  ASSERT_EQ(reachable_funcs.size(), 3);
-  EXPECT_NE(reachable_funcs.find("Func1"), reachable_funcs.end());
-  EXPECT_NE(reachable_funcs.find("Func2"), reachable_funcs.end());
-  EXPECT_NE(reachable_funcs.find("Func2_grad"), reachable_funcs.end());
-
-  FunctionLibraryDefinition reachable_flib =
-      ReachableFunctionLibraryDefinition(flib, graph);
-  ASSERT_EQ(reachable_flib.num_functions(), 3);
-  EXPECT_TRUE(reachable_flib.Contains("Func1"));
-  EXPECT_TRUE(reachable_flib.Contains("Func2"));
-  EXPECT_TRUE(reachable_flib.Contains("Func2_grad"));
-}
-
 TEST_F(FunctionsTest, IsParametrized) {
   // Function is defined for multiple input types.
   FunctionDef parametrized_func = FunctionDefHelper::Create(
@@ -635,6 +576,33 @@ TEST_F(FunctionsTest, FromFunctionDefWithoutInput) {
   EXPECT_EQ("two", cast.input(0));
 }
 
+TEST_F(FunctionsTest, FromFunctionDefWithSideEffectfulOps) {
+  const Tensor kOne = test::AsScalar<float>(1.0);
+  FunctionDef func = FunctionDefHelper::Define(
+      /* Name */ "SideEffects",
+      /* Args */ {"x: Ref(float)"},
+      /* Return values */ {},
+      /* Attr def */ {},
+      /* Nodes */
+      {{{"one"}, "Const", {}, {{"value", kOne}, {"dtype", DT_FLOAT}}},
+       {{"update"}, "AssignAdd", {"x", "one"}, {{"T", DT_FLOAT}}}});
+
+  protobuf::Map<string, AttrValue> func_instantiation_attr;
+  FunctionLibraryDefinition flib(OpRegistry::Global(), FunctionDefLibrary());
+
+  GrapplerFunctionItem item;
+  TF_EXPECT_OK(MakeGrapplerFunctionItem(func,
+                                        AttrSlice(&func_instantiation_attr),
+                                        flib, TF_GRAPH_DEF_VERSION, &item));
+
+  EXPECT_EQ("SideEffects", item.id);
+  EXPECT_EQ(3, item.function_body().node_size());
+  EXPECT_EQ(1, item.input_size());
+  EXPECT_EQ(0, item.output_size());
+  ASSERT_EQ(1, item.keep_ops.size());
+  EXPECT_EQ("update", item.keep_ops[0]);
+}
+
 TEST_F(FunctionsTest, MakeFunctionDef) {
   const Tensor kTwo = test::AsScalar<int64>(2);
   FunctionDef func = FunctionDefHelper::Define(
diff --git a/tensorflow/core/grappler/utils/grappler_test.cc b/tensorflow/core/grappler/utils/grappler_test.cc
index 6266733f3e6588af9e06a5a279ecabf5adbd009a..576494cad55e22ba8457f30d0ea79b53f6f5de78 100644
--- a/tensorflow/core/grappler/utils/grappler_test.cc
+++ b/tensorflow/core/grappler/utils/grappler_test.cc
@@ -114,9 +114,13 @@ void GrapplerTest::CompareGraphs(GraphDef want, GraphDef got) const {
   for (int i = 0; i < want.node_size(); ++i) {
     EXPECT_EQ(want.node(i).op(), got.node(i).op());
     EXPECT_EQ(want.node(i).name(), got.node(i).name());
+    EXPECT_EQ(want.node(i).device(), got.node(i).device());
+
     ASSERT_EQ(want.node(i).input_size(), got.node(i).input_size());
     for (int j = 0; j < want.node(i).input_size(); ++j) {
-      EXPECT_TRUE(IsSameInput(want.node(i).input(j), got.node(i).input(j)));
+      const TensorId want_tensor = ParseTensorName(want.node(i).input(j));
+      const TensorId got_tensor = ParseTensorName(got.node(i).input(j));
+      EXPECT_EQ(want_tensor.ToString(), got_tensor.ToString());
     }
   }
 }
diff --git a/tensorflow/core/grappler/utils_test.cc b/tensorflow/core/grappler/utils_test.cc
index 8cbff1c397114c51d5dfe1f32464e0559845cb17..e993391b51bfe882a1e662f220ace0542db4ffba 100644
--- a/tensorflow/core/grappler/utils_test.cc
+++ b/tensorflow/core/grappler/utils_test.cc
@@ -16,10 +16,13 @@ limitations under the License.
 #include "tensorflow/core/grappler/utils.h"
 
 #include <unistd.h>
+#include <limits>
 #include <memory>
 #include "tensorflow/cc/ops/standard_ops.h"
 #include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
 #include "tensorflow/core/grappler/grappler_item.h"
+#include "tensorflow/core/lib/bfloat16/bfloat16.h"
 #include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/core/lib/core/status_test_util.h"
 #include "tensorflow/core/lib/core/threadpool.h"
@@ -441,6 +444,26 @@ BM_ParseNodeNameAsStringPiece("foo:123", foo123);
 BM_ParseNodeNameAsStringPiece("foo/bar/baz:123", foo_bar_baz_123);
 BM_ParseNodeNameAsStringPiece("^foo/bar/baz:123", foo_bar_baz_123_ctrl);
 
+TEST_F(UtilsTest, SetTensorValueBFloat16) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), 2, &t));
+  test::ExpectTensorEqual<bfloat16>(Tensor(bfloat16(2)), t);
+}
+
+TEST_F(UtilsTest, SetTensorValueBFloat16IntMax) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), std::numeric_limits<int>::max(), &t));
+  test::ExpectTensorEqual<bfloat16>(
+      Tensor(bfloat16(std::numeric_limits<int>::max())), t);
+}
+
+TEST_F(UtilsTest, SetTensorValueBFloat16IntMin) {
+  Tensor t(DT_BFLOAT16, TensorShape({}));
+  TF_ASSERT_OK(SetTensorValue(t.dtype(), std::numeric_limits<int>::min(), &t));
+  test::ExpectTensorEqual<bfloat16>(
+      Tensor(bfloat16(std::numeric_limits<int>::min())), t);
+}
+
 }  // namespace
 }  // namespace grappler
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD
index 18d70422a0bdaf5c045e353117a4ef6982bdce06..0e5d8d765a6bfde3a0e187c0b386174d3b20a098 100644
--- a/tensorflow/core/kernels/BUILD
+++ b/tensorflow/core/kernels/BUILD
@@ -29,26 +29,26 @@ package_group(
 
 load(
     "//tensorflow:tensorflow.bzl",
+    "cc_header_only_library",
     "if_android",
+    "if_not_windows",
+    "tf_cc_binary",
     "tf_cc_test",
     "tf_cc_test_mkl",
     "tf_cc_tests",
-    "tf_cc_binary",
     "tf_copts",
     "tf_cuda_library",
-    "tf_opts_nortti_if_android",
     "tf_kernel_library",
     "tf_mkl_kernel_library",
-    "cc_header_only_library",
-    "if_not_windows",
+    "tf_opts_nortti_if_android",
 )
 load("@local_config_sycl//sycl:build_defs.bzl", "if_sycl")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_tests")
 load(
     "//tensorflow/core:platform/default/build_config.bzl",
-    "tf_proto_library",
     "tf_kernel_tests_linkstatic",
+    "tf_proto_library",
 )
 load(
     "//tensorflow/core:platform/default/build_config_root.bzl",
@@ -195,15 +195,35 @@ cc_library(
     deps = ["//third_party/eigen3"],
 )
 
-cc_library(
+tf_kernel_library(
     name = "conv_2d",
     hdrs = ["conv_2d.h"],
+    gpu_srcs = [
+        "conv_2d.h",
+        "conv_2d_gpu.h",
+        "conv_2d_gpu_double.cu.cc",
+        "conv_2d_gpu_float.cu.cc",
+        "conv_2d_gpu_half.cu.cc",
+        "conv_2d_gpu_int.cu.cc",
+        "conv_2d_gpu_uint16.cu.cc",
+        "conv_2d_gpu_uint32.cu.cc",
+        "conv_2d_gpu_uint64.cu.cc",
+        "conv_2d_gpu_uint8.cu.cc",
+    ],
     deps = [
         ":eigen_helpers",
+        ":fill_functor",
         ":gpu_util_hdrs",
+        ":image_resizer_state",
+        ":ops_util",
+        "//tensorflow/core:core_cpu",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:nn_ops_op_lib",
         "//third_party/eigen3",
     ],
+    alwayslink = 1,
 )
 
 cc_library(
@@ -583,7 +603,7 @@ cc_library(
     deps = [
         "//third_party/eigen3",
     ] + select({
-        ":mkldnn_contraction_kernel": ["//third_party/intel_mkl_dnn:mkldnn_single_threaded"],
+        ":mkldnn_contraction_kernel": ["@mkl_dnn//:mkldnn_single_threaded"],
         "//conditions:default": [],
     }),
 )
@@ -1673,14 +1693,14 @@ tf_kernel_library(
     ],
     visibility = [":friends"],
     deps = [
-        ":conv_ops",
+        ":conv_2d",
         ":ops_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:protos_all_cc",
         "//third_party/eigen3",
     ],
-    alwayslink = 0,
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -2735,6 +2755,7 @@ cc_library(
         ":cholesky_grad",
         ":cholesky_op",
         ":determinant_op",
+        ":lu_op",
         ":matrix_exponential_op",
         ":matrix_inverse_op",
         ":matrix_logarithm_op",
@@ -2880,6 +2901,19 @@ tf_kernel_library(
     deps = LINALG_DEPS,
 )
 
+tf_kernel_library(
+    name = "lu_op",
+    prefix = "lu_op",
+    deps = if_cuda([
+        ":cuda_solvers",
+        ":transpose_functor",
+    ]) + [
+        "//third_party/eigen3",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+    ],
+)
+
 cc_library(
     name = "linalg_ops_common",
     srcs = ["linalg_ops_common.cc"],
@@ -3185,7 +3219,7 @@ tf_kernel_library(
 tf_kernel_library(
     name = "scan_ops",
     prefix = "scan_ops",
-    deps = MATH_DEPS,
+    deps = MATH_DEPS + if_cuda(["@cub_archive//:cub"]),
 )
 
 tf_kernel_library(
@@ -3363,6 +3397,29 @@ tf_cuda_cc_test(
     ],
 )
 
+tf_cuda_cc_test(
+    name = "scan_ops_test",
+    size = "small",
+    srcs = ["scan_ops_test.cc"],
+    linkopts = select({
+        "//tensorflow:darwin": ["-headerpad_max_install_names"],
+        "//conditions:default": [],
+    }),
+    deps = [
+        ":host_constant_op",
+        ":ops_testutil",
+        ":ops_util",
+        ":scan_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cuda_cc_test(
     name = "reduction_ops_test",
     size = "small",
@@ -4546,6 +4603,7 @@ tf_kernel_library(
         ":dense_update_functor",
         ":training_op_helpers",
         ":variable_ops",
+        ":inplace_ops",
     ],
 )
 
@@ -4783,11 +4841,14 @@ tf_kernel_library(
     name = "unicode_ops",
     prefix = "unicode_ops",
     deps = [
+        ":bounds_check",
         ":string_util",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:string_ops_op_lib",
+        "//third_party/eigen3",
+        "//third_party/icu/data:conversion_data",
         "@icu//:common",
     ],
 )
@@ -5360,6 +5421,7 @@ filegroup(
         "mfcc_mel_filterbank.h",
         "mirror_pad_op.h",
         "mirror_pad_op_cpu_impl.h",
+        "multinomial_op.h",
         "pad_op.h",
         "pooling_ops_3d.h",
         "random_op.h",
@@ -5378,6 +5440,7 @@ filegroup(
         "spacetobatch_functor.h",
         "spacetodepth_op.h",
         "spectrogram.h",
+        "stateless_random_ops.h",
         "string_util.h",
         "tensor_array.h",
         "tile_functor.h",
@@ -5517,6 +5580,7 @@ filegroup(
         "mirror_pad_op_cpu_impl_3.cc",
         "mirror_pad_op_cpu_impl_4.cc",
         "mirror_pad_op_cpu_impl_5.cc",
+        "multinomial_op.cc",
         "pad_op.cc",
         "padding_fifo_queue.cc",
         "padding_fifo_queue_op.cc",
@@ -5557,6 +5621,7 @@ filegroup(
         "stack.cc",
         "stack.h",
         "stack_ops.cc",
+        "stateless_random_ops.cc",
         "string_join_op.cc",
         "string_util.cc",
         "summary_op.cc",
@@ -5683,8 +5748,8 @@ filegroup(
             "batch_kernels.*",
             "regex_full_match_op.cc",
             "regex_replace_op.cc",
-            "unicode_script_op.cc",
             "unicode_ops.cc",
+            "unicode_script_op.cc",
             # Ops that are inherently incompatible with Android (e.g. tied to x86 platform).
             "mkl_*",
             "xsmm_*",
@@ -6229,6 +6294,28 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_quantized_pooling_ops_test",
+    size = "small",
+    srcs = ["mkl_quantized_pooling_ops_test.cc"],
+    deps = [
+        ":mkl_input_conversion_op",
+        ":mkl_pooling_ops",
+        ":ops_testutil",
+        ":ops_util",
+        ":quantization_utils",
+        ":quantized_ops",
+        "//tensorflow/core:array_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:math_ops_op_lib",
+        "//tensorflow/core:nn_ops_op_lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_cc_test(
     name = "quantized_reshape_op_test",
     size = "small",
@@ -6668,6 +6755,31 @@ tf_mkl_kernel_library(
     deps = NN_DEPS + mkl_deps() + [":cwise_op"],
 )
 
+tf_cc_test_mkl(
+    name = "mkl_fused_ops_test",
+    size = "small",
+    srcs = ["mkl_fused_ops_test.cc"],
+    linkstatic = 1,
+    deps = [
+        ":conv_ops",
+        ":image",
+        ":mkl_conv_op",
+        ":mkl_tfconv_op",
+        ":ops_testutil",
+        ":ops_util",
+        "//tensorflow/cc:cc_ops",
+        "//tensorflow/core:core_cpu",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:framework_internal",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+        "//tensorflow/core:tensorflow",
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ],
+)
+
 tf_mkl_kernel_library(
     name = "mkl_transpose_op",
     srcs = [
@@ -6695,6 +6807,13 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "tensor_forest_ops",
+    deps = [
+        "//tensorflow/core/kernels/tensor_forest:tensor_forest_ops",
+    ],
+)
+
 tf_kernel_library(
     name = "dataset_ops",
     deps = [
diff --git a/tensorflow/core/kernels/adjust_contrast_op.cc b/tensorflow/core/kernels/adjust_contrast_op.cc
index 72155fd037378fc3d93c02e9b893a6671e9659a6..47e10f56dfa682d97b04b78cd0e5f9a536081025 100644
--- a/tensorflow/core/kernels/adjust_contrast_op.cc
+++ b/tensorflow/core/kernels/adjust_contrast_op.cc
@@ -320,13 +320,14 @@ class AdjustContrastOpv2<CPUDevice> : public AdjustContrastOpV2Base {
     int64 batch = outputs.dimension(0);
     int64 image_size = outputs.dimension(1);
     int64 channels = outputs.dimension(2);
-    // Similar to the reduction case, a straighforward implementation of this
+    // Similar to the reduction case, a straightforward implementation of this
     // does not utilize vectorization well because of the small channel size.
     // This algorithm repeatedly increases the area to be copied, and leads to
     // much better vectorinizations in the copy.
     for (int64 i = 0; i < batch; i++) {
       // Copy over the inputs into outputs in this batch. Effectively:
-      // outputs(i, :, k) = inputs(i, k). An example of how this algorith works:
+      // outputs(i, :, k) = inputs(i, k). An example of how this algorithm
+      // works:
       //
       //    x = float[1, 3], y = float[2048, 3]
       //    round 0
diff --git a/tensorflow/core/kernels/adjust_hue_op.cc b/tensorflow/core/kernels/adjust_hue_op.cc
index 6079aa749d52c5a3483ac21cd44feef5a3978fb3..52dec94305d3c8558013861a44524609ad6eed7a 100644
--- a/tensorflow/core/kernels/adjust_hue_op.cc
+++ b/tensorflow/core/kernels/adjust_hue_op.cc
@@ -216,8 +216,8 @@ class AdjustHueOp<CPUDevice> : public AdjustHueOpBase {
         *context->device()->tensorflow_cpu_worker_threads();
     Shard(worker_threads.num_threads, worker_threads.workers, channel_count,
           kCostPerChannel,
-          [channel_count, &input_data, &output_data, delta_h](
-              int64 start_channel, int64 end_channel) {
+          [&input_data, &output_data, delta_h](int64 start_channel,
+                                               int64 end_channel) {
             const float* p = input_data.data() + start_channel * kChannelSize;
             float* q = output_data.data() + start_channel * kChannelSize;
             for (int i = start_channel; i < end_channel; i++) {
diff --git a/tensorflow/core/kernels/barrier_ops.cc b/tensorflow/core/kernels/barrier_ops.cc
index 944564dfba62f257ae45b3c5c25d0de64fa0b773..aa9123582210bdf31993e9d8c58ba90cc02acc5e 100644
--- a/tensorflow/core/kernels/barrier_ops.cc
+++ b/tensorflow/core/kernels/barrier_ops.cc
@@ -180,7 +180,7 @@ class Barrier : public ResourceBase {
         // SQSS is closed, nothing is left in the incomplete set,
         // the queue is not already marked as closed, and (most
         // importantly), the queue has entries in it.
-        [this, ctx, callback, component_index]() {
+        [this, ctx, callback]() {
           if (!ctx->status().ok()) {
             callback();
             return;
diff --git a/tensorflow/core/kernels/boosted_trees/BUILD b/tensorflow/core/kernels/boosted_trees/BUILD
index 4e8bfa02fc3a21329e6495fc4ebccf365d3a02a8..8f2c2dbe8a778353dff5e0b8823ac99de68282df 100644
--- a/tensorflow/core/kernels/boosted_trees/BUILD
+++ b/tensorflow/core/kernels/boosted_trees/BUILD
@@ -2,7 +2,10 @@
 #   OpKernels for boosted trees ops.
 
 package(
-    default_visibility = ["//tensorflow:internal"],
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+        "//tensorflow:internal",
+    ],
 )
 
 licenses(["notice"])  # Apache 2.0
diff --git a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
index 1ab72af05914bc15148fc4caff7a07493c1ff1e5..4e9bab3e21f9f240d32e78a1a489033a693caa73 100644
--- a/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
+++ b/tensorflow/core/kernels/boosted_trees/boosted_trees.proto
@@ -12,6 +12,7 @@ message Node {
     Leaf leaf = 1;
     BucketizedSplit bucketized_split = 2;
     CategoricalSplit categorical_split = 3;
+    DenseSplit dense_split = 4;
   }
   NodeMetadata metadata = 777;
 }
@@ -70,6 +71,19 @@ message CategoricalSplit {
   int32 right_id = 4;
 }
 
+// TODO(nponomareva): move out of boosted_trees and rename to trees.proto
+message DenseSplit {
+  // Float feature column and split threshold describing
+  // the rule feature <= threshold.
+  int32 feature_id = 1;
+  float threshold = 2;
+
+  // Node children indexing into a contiguous
+  // vector of nodes starting from the root.
+  int32 left_id = 3;
+  int32 right_id = 4;
+}
+
 // Tree describes a list of connected nodes.
 // Node 0 must be the root and can carry any payload including a leaf
 // in the case of representing the bias.
diff --git a/tensorflow/core/kernels/control_flow_ops.cc b/tensorflow/core/kernels/control_flow_ops.cc
index 382c9d5e503519d99ce20e1fbf025d2bf946e821..36def4a53065e2c6ac68a8b67818096012104753 100644
--- a/tensorflow/core/kernels/control_flow_ops.cc
+++ b/tensorflow/core/kernels/control_flow_ops.cc
@@ -71,11 +71,14 @@ TF_CALL_ALL_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_ALL_TYPES(REGISTER_CPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_CPU_REF_SWITCH);
+REGISTER_CPU_SWITCH(uint64);
 
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_SWITCH);
 TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_GPU_REF_SWITCH);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_SWITCH);
+REGISTER_GPU_SWITCH(uint64);
+TF_CALL_variant(REGISTER_GPU_SWITCH);
 
 #undef REGISTER_CPU_SWITCH
 #undef REGISTER_CPU_REF_SWITCH
@@ -263,6 +266,7 @@ TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_KERNEL);
 TF_CALL_QUANTIZED_TYPES(REGISTER_GPU_REF_KERNEL);
 REGISTER_GPU_KERNEL(bool);
 REGISTER_GPU_REF_KERNEL(bool);
+REGISTER_GPU_KERNEL(uint64);
 
 #undef REGISTER_GPU_KERNEL
 #undef REGISTER_GPU_REF_KERNEL
@@ -597,6 +601,13 @@ LoopCondOp::LoopCondOp(OpKernelConstruction* context) : OpKernel(context) {}
 LoopCondOp::~LoopCondOp() = default;
 
 void LoopCondOp::Compute(OpKernelContext* context) {
+  CancellationManager* cm = context->cancellation_manager();
+  if (cm != nullptr) {
+    bool already_cancelled = cm->IsCancelled();
+    OP_REQUIRES(context, !already_cancelled,
+                errors::Cancelled("Loop execution was cancelled."));
+  }
+
   context->set_output(0, context->input(0));
 }
 
diff --git a/tensorflow/core/kernels/conv_2d.h b/tensorflow/core/kernels/conv_2d.h
index a6964b1aacb445ffb3938817b241d2455a4c2fa3..1bac2a18c30c841b7431e6a12063eba508e54d86 100644
--- a/tensorflow/core/kernels/conv_2d.h
+++ b/tensorflow/core/kernels/conv_2d.h
@@ -162,7 +162,7 @@ struct TransformFilter {
     merged_dims[1] = in.dimension(NDIMS - 2);  // input filters
     merged_dims[2] = in.dimension(NDIMS - 1);  // output filters
 
-    CHECK(dst_filter_format == FORMAT_OIHW)
+    DCHECK(dst_filter_format == FORMAT_OIHW)
         << "Unsupported destination filter format: "
         << ToString(dst_filter_format);
     // Source filter format is FORMAT_HWIO and spatial dimensions HW are merged
diff --git a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc b/tensorflow/core/kernels/conv_2d_gpu.h
similarity index 91%
rename from tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
rename to tensorflow/core/kernels/conv_2d_gpu.h
index 46167db3a2b44da40a2dc60e90d6b0cd900503ec..8d117574284065ff8fcf62d913257b0ccdd497e5 100644
--- a/tensorflow/core/kernels/conv_ops_gpu_3.cu.cc
+++ b/tensorflow/core/kernels/conv_2d_gpu.h
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#ifndef TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
+#define TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
+
 #if GOOGLE_CUDA
 
 #define EIGEN_USE_GPU
@@ -34,7 +37,7 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 
 namespace functor {
-namespace {
+
 template <typename T, bool conjugate>
 struct maybe_conj {
   __device__ static __inline__ T run(T x) {
@@ -75,8 +78,6 @@ struct maybe_conj<double2, conjugate> {
   }
 };
 
-}  // namespace
-
 // TODO(mjanusz): Move this to a shared util file.
 // A simple array that contains data that can be passed between CPU and GPU.
 template <typename T, int IndexCount, T DefaultValue>
@@ -999,77 +1000,8 @@ struct NCHWToNHWC<GPUDevice, T, NDIMS> {
 };
 
 }  // namespace functor
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4, int>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4, int>;
-
-template struct functor::ShuffleAndReverse<GPUDevice, float, 4,
-                                           Eigen::DenseIndex>;
-template struct functor::ShuffleAndReverse<GPUDevice, Eigen::half, 4,
-                                           Eigen::DenseIndex>;
-
-template struct functor::TransformDepth<GPUDevice, float, int>;
-template struct functor::TransformDepth<GPUDevice, Eigen::half, int>;
-
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension1And2InTensor3<GPUDevice, Eigen::half>;
-
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint8>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint16>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint32>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, uint64>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float4>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, float2,
-                                                     /*conjugate=*/true>;
-template struct functor::SwapDimension0And2InTensor3<GPUDevice, double2,
-                                                     /*conjugate=*/true>;
-
-// For 2d ops.
-template struct functor::TransformFilter<GPUDevice, double, int, 4>;
-template struct functor::TransformFilter<GPUDevice, float, int, 4>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 4>;
-
-template struct functor::ReverseTransformFilter<GPUDevice, double, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, float, 4>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 4>;
-
-template struct functor::NHWCToNCHW<GPUDevice, double, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, float, 4>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 4>;
-
-template struct functor::NCHWToNHWC<GPUDevice, double, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, float, 4>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 4>;
-
-template struct functor::PadInput<GPUDevice, int, int, 4>;
-template struct functor::PadInput<GPUDevice, double, int, 4>;
-template struct functor::PadInput<GPUDevice, float, int, 4>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 4>;
-
-// For 3d ops.
-template struct functor::TransformFilter<GPUDevice, float, int, 5>;
-template struct functor::TransformFilter<GPUDevice, Eigen::half, int, 5>;
-
-template struct functor::ReverseTransformFilter<GPUDevice, float, 5>;
-template struct functor::ReverseTransformFilter<GPUDevice, Eigen::half, 5>;
-
-template struct functor::NHWCToNCHW<GPUDevice, float, 5>;
-template struct functor::NHWCToNCHW<GPUDevice, Eigen::half, 5>;
-
-template struct functor::NCHWToNHWC<GPUDevice, float, 5>;
-template struct functor::NCHWToNHWC<GPUDevice, Eigen::half, 5>;
-
-template struct functor::PadInput<GPUDevice, float, int, 5>;
-template struct functor::PadInput<GPUDevice, Eigen::half, int, 5>;
-
 }  // namespace tensorflow
 
 #endif  // GOOGLE_CUDA
+
+#endif  // TENSORFLOW_CORE_KERNELS_CONV_2D_GPU_H_
diff --git a/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..353d6d113023a3b970a39bbc097e32d5154de9a7
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_double.cu.cc
@@ -0,0 +1,50 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
+
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, double2,
+                                            /*conjugate=*/true>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, double, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 4>;
+template struct PadInput<Eigen::GpuDevice, double, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, double, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, double, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, double, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, double, 5>;
+template struct PadInput<Eigen::GpuDevice, double, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21030dd12b3912b8d251c0bf386e7059348bb312
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_float.cu.cc
@@ -0,0 +1,63 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4, int>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, float, 4,
+                                  Eigen::DenseIndex>;
+
+template struct TransformDepth<Eigen::GpuDevice, float, int>;
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float4>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, float2,
+                                            /*conjugate=*/true>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, float, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 4>;
+template struct PadInput<Eigen::GpuDevice, float, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, float, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, float, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, float, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, float, 5>;
+template struct PadInput<Eigen::GpuDevice, float, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..948308651fbe8a8ed0c88ba33416361d468abc97
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_half.cu.cc
@@ -0,0 +1,57 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4, int>;
+template struct ShuffleAndReverse<Eigen::GpuDevice, Eigen::half, 4,
+                                  Eigen::DenseIndex>;
+
+template struct TransformDepth<Eigen::GpuDevice, Eigen::half, int>;
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, Eigen::half>;
+
+// For 2d ops.
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 4>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 4>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 4>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 4>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 4>;
+
+// For 3d ops.
+template struct TransformFilter<Eigen::GpuDevice, Eigen::half, int, 5>;
+template struct ReverseTransformFilter<Eigen::GpuDevice, Eigen::half, 5>;
+template struct NHWCToNCHW<Eigen::GpuDevice, Eigen::half, 5>;
+template struct NCHWToNHWC<Eigen::GpuDevice, Eigen::half, 5>;
+template struct PadInput<Eigen::GpuDevice, Eigen::half, int, 5>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..901ce3e55d4f42cf399331cbf471835bef5d7097
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_int.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+// For 2d ops.
+template struct PadInput<Eigen::GpuDevice, int, int, 4>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e47532a9832f7c77cc5a6d714e870b3c872aa0d5
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint16.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint16>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint16>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..56cd5dd218ccc20bc434025bfabbe87c67e0e090
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint32.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint32>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint32>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..045a664e9653e37ab27ba3a05836ed303b2cf0ea
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint64.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint64>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint64>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc b/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..215417860afc2b01108fad085133144542791a52
--- /dev/null
+++ b/tensorflow/core/kernels/conv_2d_gpu_uint8.cu.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <array>
+#include <limits>
+#include <utility>
+
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_2d_gpu.h"
+
+namespace tensorflow {
+
+namespace functor {
+
+template struct SwapDimension1And2InTensor3<Eigen::GpuDevice, uint8>;
+template struct SwapDimension0And2InTensor3<Eigen::GpuDevice, uint8>;
+
+}  // namespace functor
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_grad_ops_3d.cc b/tensorflow/core/kernels/conv_grad_ops_3d.cc
index f62c60d255d47d50a7712f08113914dcef2c2c9b..e4c49efea0bd87fdbaa3fbdad3d5612d6b4f8a82 100644
--- a/tensorflow/core/kernels/conv_grad_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_grad_ops_3d.cc
@@ -1074,6 +1074,7 @@ namespace functor {
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 }  // namespace functor
 
@@ -1863,6 +1864,7 @@ class Conv3DBackpropFilterOp<GPUDevice, T> : public OpKernel {
                           Conv3DBackpropFilterOp<GPUDevice, T>);
 TF_CALL_half(REGISTER_GPU_KERNEL);
 TF_CALL_float(REGISTER_GPU_KERNEL);
+TF_CALL_double(REGISTER_GPU_KERNEL);
 #undef REGISTER_GPU_KERNEL
 
 #endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/conv_ops_3d.cc b/tensorflow/core/kernels/conv_ops_3d.cc
index 83df4dce38e09b09956104c411d3e36f6cfb7657..f20ac93b5a01cf2dbd1c53ce55c832727f49979f 100644
--- a/tensorflow/core/kernels/conv_ops_3d.cc
+++ b/tensorflow/core/kernels/conv_ops_3d.cc
@@ -533,10 +533,19 @@ namespace functor {
       const GPUDevice& d, typename TTypes<T, 5, int>::ConstTensor in, \
       const std::array<int, 3>& padding_left,                         \
       const std::array<int, 3>& padding_right,                        \
-      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);
+      typename TTypes<T, 5, int>::Tensor out, TensorFormat format);   \
+  template <>                                                         \
+  void NHWCToNCHW<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);                             \
+  template <>                                                         \
+  void NCHWToNHWC<GPUDevice, T, 5>::operator()(                       \
+      const GPUDevice& d, typename TTypes<T, 5>::ConstTensor in,      \
+      typename TTypes<T, 5>::Tensor out);
 
 DECLARE_GPU_SPEC(Eigen::half);
 DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
 #undef DECLARE_GPU_SPEC
 
 }  // namespace functor
@@ -548,6 +557,9 @@ REGISTER_KERNEL_BUILDER(
 REGISTER_KERNEL_BUILDER(
     Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<float>("T"),
     Conv3DOp<GPUDevice, float>);
+REGISTER_KERNEL_BUILDER(
+    Name("Conv3D").Device(DEVICE_GPU).TypeConstraint<double>("T"),
+    Conv3DOp<GPUDevice, double>);
 #endif  // GOOGLE_CUDA
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_fused.cc b/tensorflow/core/kernels/conv_ops_fused.cc
index c75bb679322c244e14556ebb61b37a2000e9a9cb..798a7325cd25494d8b12447c86f4883ca038c8ca 100644
--- a/tensorflow/core/kernels/conv_ops_fused.cc
+++ b/tensorflow/core/kernels/conv_ops_fused.cc
@@ -14,897 +14,30 @@ limitations under the License.
 ==============================================================================*/
 
 // Implements convolution operations with other kernels baked into the
-// processing, to optimize latency and memory usage.
+// processing, to optimize latency and memory usage:
+//  - Conv2D + BiasAdd + <Activation>
+//  - Conv2D + FusedBatchNorm + <Activation>
+//
+// Activation: Relu, Relu6, Elu, etc...
+//
+// Kernels for convolutions fused with image transformations (resize and mirror
+// padding) defined in `conv_ops_fused_image_transform.cc`.
 
 #define EIGEN_USE_THREADS
 
-#include <string.h>
-#include <map>
+#include <string>
 #include <vector>
-#include "tensorflow/core/framework/common_shape_fns.h"
-#include "tensorflow/core/framework/numeric_op.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
-#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/framework/tensor_slice.h"
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/conv_2d.h"
 #include "tensorflow/core/kernels/conv_ops.h"
-#include "tensorflow/core/kernels/gemm_functors.h"
-#include "tensorflow/core/kernels/image_resizer_state.h"
 #include "tensorflow/core/kernels/ops_util.h"
-#include "tensorflow/core/lib/core/threadpool.h"
-#include "tensorflow/core/util/mirror_pad_mode.h"
-#include "tensorflow/core/util/padding.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 namespace tensorflow {
-
-namespace {
-
-// We don't want to allocate a buffer to hold all the patches if the size is
-// going to be extremely large, so break it into chunks if it's bigger than
-// a limit. Each chunk will be processed serially, so we can refill the
-// buffer for the next chunk and reuse it, keeping maximum memory size down.
-// In this case, we've picked 16 megabytes as a reasonable limit for Android and
-// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-const size_t kMaxChunkSize = (1 * 1024 * 1024);
-#else
-const size_t kMaxChunkSize = (16 * 1024 * 1024);
-#endif
-const size_t kResizeCacheSize = (8 * 1024 * 1024);
-
-// Lookup method used when resizing.
-enum SamplingMode {
-  BILINEAR = 0,
-  NEAREST = 1,
-};
-
-// Simple utility function used by FusedConv to multithread basic workloads. To
-// use it, pass begin and end values for the full workload and a std::function
-// that receives a subset of that through the begin and end values for each
-// worker's task. The division of the full workload into worker tasks is handled
-// by the multithreading logic. Here's an example of how to use it:
-// std::vector<float> my_vector(100);
-// ...
-// FusedConvParallelFor(context, 0, 100,
-//   [&my_vector](int64 task_begin, int64 task_end) {
-//     for (int64 current = task_begin; current != task_end; ++current) {
-//       my_vector[current] *= 10.0f;
-//     }
-// });
-void FusedConvParallelFor(
-    OpKernelContext* context, int64 begin, int64 end,
-    const std::function<void(int64, int64)>& task_function) {
-// On iOS, the thread management imposes a very big performance penalty, so
-// just call the function directly with no multithreading.
-#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
-  task_function(begin, end);
-#else
-  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
-  thread::ThreadPool* thread_pool = worker_threads.workers;
-  const int64 total_elements = end - begin;
-  // This is a bit of an arbitrary number, but was found to work well for
-  // typical models we've been profiling on various devices.
-  const int64 element_cost = 10000000;
-  thread_pool->ParallelFor(
-      total_elements, element_cost,
-      [begin, task_function](int64 begin_offset, int64 end_offset) {
-        const int64 task_begin = begin + begin_offset;
-        const int64 task_end = begin + end_offset;
-        task_function(task_begin, task_end);
-      });
-#endif
-}
-
-// Holds the state needed for the resizing subtasks.
-template <class T1>
-struct ResizeTaskParameters {
-  ResizeTaskParameters() : st(false) {}
-
-  int cache_height;
-  T1* resize_cache;
-  int cache_line_width;
-  int input_width;
-  int input_depth;
-  int top_padding;
-  int pad_offset;
-  int64 resized_height;
-  ImageResizerState st;
-  const T1* input_batch_start;
-  int64 cache_start_x;
-  int64 cache_end_x;
-  int left_padding;
-  int64 resized_width;
-  int64 padded_width;
-  int64 padded_height;
-};
-
-template <class T1>
-struct PerCacheLineParameters {
-  PerCacheLineParameters() {}
-  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
-      : cache_line_start(other.cache_line_start),
-        input_top_row_start(other.input_top_row_start),
-        input_bottom_row_start(other.input_bottom_row_start),
-        y_lerp(other.y_lerp) {}
-
-  T1* cache_line_start;
-  const T1* input_top_row_start;
-  const T1* input_bottom_row_start;
-  T1 y_lerp;
-};
-
-// Helper class to simplify bilinear filtering
-template <class T1>
-struct SampleRect {
-  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
-                                 const T1* in_bottom_left,
-                                 const T1* in_bottom_right)
-      : top_left(in_top_left),
-        top_right(in_top_right),
-        bottom_left(in_bottom_left),
-        bottom_right(in_bottom_right) {}
-
-  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
-                                        T1 y_lerp) const {
-    const T1 top =
-        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
-    const T1 bottom = bottom_left[channel] +
-                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
-    return top + (bottom - top) * y_lerp;
-  }
-
-  const T1* top_left;
-  const T1* top_right;
-  const T1* bottom_left;
-  const T1* bottom_right;
-};
-
-// Calculates parameters which remain constant through a resize cache row.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
-    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
-    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
-    int64 resized_height, const ImageResizerState& st,
-    const T1* input_batch_start) {
-  PerCacheLineParameters<T1> result;
-  // The cache is organized so that the real y values of the resized image map
-  // onto the actual cache values through a modulo scheme. This means that as we
-  // progress downwards through the image, we keep reusing a small cache and so
-  // keep memory usage down.
-  int64 cache_index_y;
-  if (cache_y < 0) {
-    cache_index_y = cache_height + (cache_y % cache_height);
-  } else {
-    cache_index_y = cache_y % cache_height;
-  }
-  result.cache_line_start =
-      resize_cache + (cache_index_y * cache_line_width * input_depth);
-  // This part is implementing the mirror padding that happens before resizing.
-  float in_y = (cache_y - top_padding);
-  if (in_y < 0) {
-    in_y = -(in_y + 1.0f - pad_offset);
-  } else if (in_y >= resized_height) {
-    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
-  }
-  // Here's where do do the actual resize.
-  in_y *= st.height_scale;
-  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
-  const int64 bottom_y_index =
-      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
-  // Lerp is used for bilinear filtering when that's needed.
-  result.y_lerp = static_cast<T1>(in_y - top_y_index);
-  // Which rows of the original input image to pull the values from.
-  result.input_top_row_start =
-      input_batch_start + (top_y_index * input_width * input_depth);
-  result.input_bottom_row_start =
-      input_batch_start + (bottom_y_index * input_width * input_depth);
-  return result;
-}
-
-template <class T1>
-struct PerCachePixelParameters {
-  PerCachePixelParameters() {}
-  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
-      : cache_line_pixel(other.cache_line_pixel),
-        left_x_index(other.left_x_index),
-        right_x_index(other.right_x_index),
-        x_lerp(other.x_lerp) {}
-
-  T1* cache_line_pixel;
-  int64 left_x_index;
-  int64 right_x_index;
-  T1 x_lerp;
-};
-
-// Pulls out common parameters used for every resized pixel.
-template <class T1>
-EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
-CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
-                                 T1* cache_line_start, int64 input_depth,
-                                 int64 left_padding, int64 pad_offset,
-                                 int64 resized_width,
-                                 const ImageResizerState& st) {
-  PerCachePixelParameters<T1> result;
-  // Figure out where we're going to store the results of our transform.
-  const int cache_index_x = cache_x - cache_start_x;
-  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
-  // Implement mirror padding by flipping in_x if it's off the edge.
-  float in_x = (cache_x - left_padding);
-  if (in_x < 0) {
-    in_x = -(in_x + 1.0f - pad_offset);
-  } else if (in_x >= resized_width) {
-    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
-  }
-  // Resize the x parameters.
-  in_x *= st.width_scale;
-  // Get the x coordinates for the left and right pixels to pull from.
-  result.left_x_index = static_cast<int64>(std::floor(in_x));
-  result.right_x_index =
-      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
-  // This x_lerp is used to blend pixels in bilinear filtering.
-  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
-  return result;
-}
-
-// Combines bilinear resizing and mirror padding into the im2col transformation
-// stage of convolution.
-template <class T1, class T2, class T3, class TGemmFunctor,
-          SamplingMode SampleMode>
-class FusedResizeAndPadConvFunctor {
- public:
-  void operator()(OpKernelContext* context, const Tensor& input,
-                  int input_batches, int resized_height, int resized_width,
-                  int padded_height, int padded_width, int input_depth,
-                  const T2* filter_data, int filter_height, int filter_width,
-                  int filter_count, int stride_rows, int stride_cols,
-                  Padding padding, T3* output_data, int output_height,
-                  int output_width, const ImageResizerState& st,
-                  int top_padding, int bottom_padding, int left_padding,
-                  int right_padding, int pad_offset) {
-    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
-        (input_depth <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
-                   << input_batches << ", " << padded_height << ", "
-                   << padded_width << ", " << input_depth;
-      return;
-    }
-    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
-                   << filter_width << ", " << filter_height << ", "
-                   << filter_count;
-      return;
-    }
-    if ((output_width <= 0) || (output_height <= 0)) {
-      LOG(WARNING) << "Conv2D was called with bad output width or height: "
-                   << output_width << ", " << output_height;
-      return;
-    }
-    OP_REQUIRES(
-        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
-        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
-
-    // These calculations define how the patches will be positioned within the
-    // input image. The actual definitions are quite complex, and rely on the
-    // previously-calculated output size.
-    int filter_left_offset;
-    int filter_top_offset;
-    if (padding == VALID) {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
-          2;
-      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
-                           padded_height + 1) /
-                          2;
-    } else {
-      filter_left_offset =
-          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
-      filter_top_offset =
-          ((output_height - 1) * stride_rows + filter_height - padded_height) /
-          2;
-    }
-
-    ResizeTaskParameters<T1> task_params;
-    task_params.input_depth = input_depth;
-    task_params.top_padding = top_padding;
-    task_params.pad_offset = pad_offset;
-    task_params.resized_height = resized_height;
-    task_params.st = st;
-    task_params.left_padding = left_padding;
-    task_params.resized_width = resized_width;
-    task_params.padded_width = padded_width;
-    task_params.padded_height = padded_height;
-
-    // The im2col buffer has # of patches rows, and # of filters cols.
-    // It's laid out like this, in row major order in memory:
-    //        < filter value count >
-    //   ^   +---------------------+
-    // patch |                     |
-    // count |                     |
-    //   v   +---------------------+
-    // Each patch row contains a filter_width x filter_height patch of the
-    // input, with the depth channel as the most contiguous in memory, followed
-    // by the width, then the height. This is the standard memory order in the
-    // image world if it helps to visualize it.
-    const int filter_value_count = filter_width * filter_height * input_depth;
-
-    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
-                errors::InvalidArgument("Im2Col patch too large for buffer"));
-    const size_t patches_per_chunk =
-        kMaxChunkSize / (filter_value_count * sizeof(T1));
-    // Because memory allocation is very expensive on mobile platforms, try to
-    // allocate a persistent buffer that will be kept around between calls. We
-    // use TensorFlow's resource management to ensure that the memory will be
-    // released when the session is over.
-    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
-    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
-        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
-          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
-          return Status::OK();
-        };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "im2col_buffer",
-                                &im2col_buffer_resource, creator));
-
-    // Create a resize cache memory buffer that will hold the rows of
-    // transformed and mirror padded input pixels, ready to be copied
-    // into filter patches by im2col.
-    // It's laid out like this, in row major order in memory:
-    //         < cache line width >
-    //   ^    +--------------------+
-    // cache  |                    |
-    // height |                    |
-    //   v    +--------------------+
-    // Each cache row contains a cache_line_width number of resized pixels,
-    // each with input_depth channels. The cache height is typically less than
-    // the full height the resized image would be, so it's filled up
-    // incrementally as we progress downwards through the input creating im2col
-    // patches.
-    task_params.cache_start_x = -filter_left_offset;
-    task_params.cache_end_x =
-        (((output_width - 1) * stride_cols) - filter_left_offset) +
-        filter_width;
-    task_params.cache_line_width =
-        task_params.cache_end_x - task_params.cache_start_x;
-    task_params.cache_height =
-        kResizeCacheSize / (task_params.cache_line_width * input_depth);
-    const int needed_resize_cache_count =
-        filter_height * task_params.cache_line_width * input_depth;
-    OP_REQUIRES(context,
-                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
-                errors::InvalidArgument("Input too large for resize cache"));
-    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
-    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
-        resize_creator =
-            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
-              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
-              return Status::OK();
-            };
-    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
-                                "Conv2d", "resize_cache",
-                                &resize_cache_resource, resize_creator));
-
-    // This means that multiple ops can't be run simultaneously on different
-    // threads, because we have a single shared resource. The platforms this is
-    // aimed at have intra-op parallelism as their focus though, so it shouldn't
-    // be an issue.
-    mutex_lock lock_buffer(im2col_buffer_resource->mu);
-    core::ScopedUnref unref_buffer(im2col_buffer_resource);
-    T1* im2col_buffer = im2col_buffer_resource->data;
-
-    // This buffer is used as a fairly heavy-weight cache for the resized and
-    // mirrored inputs to the im2col operation. The problem is that we want to
-    // keep the memory usage down by not rendering the fully resized and padded
-    // input tensor to the convolution into an entire buffer. The first approach
-    // to avoid this was to fold the bilinear filtering and padding spatial
-    // transformations into the im2col lookup itself. This successfully reduced
-    // memory usage, but because im2col can access an individual pixel for many
-    // different patches, the extra overhead of doing the same bilinear lookups
-    // repeatedly became too expensive.
-    // The resize cache is designed to avoid this problem by keeping a
-    // horizontal slice of the resized and padded input to the im2col
-    // precalculated, so that repeated accesses to the same pixel from different
-    // filter patches can just be copied from this cache. It's organized as a
-    // horizontal slice stretching across the whole virtual image, and as high
-    // as the filter window, so that as the patch processing moves across all
-    // the pixels are present, and before a new row of patches is started any
-    // previously calculated rows that are needed are maintained, with new rows
-    // calculated as required.
-    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
-    core::ScopedUnref unref_resized_cache(resize_cache_resource);
-    task_params.resize_cache = resize_cache_resource->data;
-
-    const T1* input_data = input.flat<T1>().data();
-    const int64 input_height = input.shape().dim_sizes()[1];
-    task_params.input_width = input.shape().dim_sizes()[2];
-
-    int end_cached_lines = std::numeric_limits<int>::min();
-
-    for (int batch = 0; batch < input_batches; ++batch) {
-      task_params.input_batch_start =
-          input_data +
-          (batch * input_height * task_params.input_width * input_depth);
-      const int in_y_end =
-          ((output_height * stride_rows) - filter_top_offset) + filter_height;
-      for (int out_y = 0; out_y < output_height; ++out_y) {
-        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
-        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
-        const int cache_end_y = std::min(
-            in_y_end, std::max((in_y_origin + task_params.cache_height),
-                               end_cached_lines));
-        if (end_cached_lines < (in_y_origin + filter_height)) {
-          // This call breaks up the work required for calculating the mirror
-          // padding and resizing across multiple threads.
-          FusedConvParallelFor(
-              context, cache_start_y, cache_end_y,
-              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
-                // This is a long and confusing function, but it's been laid out
-                // this way to help with performance on some intensive models.
-                // What it's doing is populating a cache of the original input
-                // image, after it's been bilinear resized and had its edges
-                // mirrored. This allows the following im2col code to access the
-                // transformed pixels from this cache, without having to
-                // repeatedly apply the expensive bilinear calculations as the
-                // same pixels are accessed by different patches.
-                // This is most effective when the stride is small and the
-                // filter size is large, since that's when pixels are reused
-                // most frequently as patches overlap.
-                for (int cache_y = task_cache_start_y;
-                     cache_y < task_cache_end_y; ++cache_y) {
-                  // We organize the cache as a series of rows, each containing
-                  // all the transformed pixels for a given line in the image.
-                  // This cache is big enough to hold at least a filter's height
-                  // worth of rows, but typically more, limited by the size of
-                  // the cache buffer.
-                  // We don't allocate an entire image's worth of rows though,
-                  // because we're trying to keep memory usage down, so as we
-                  // progress downwards through the im2col we periodically
-                  // refresh the cache so that the next lines that are needed
-                  // for that operation are always present.
-                  // Work out the parameters that remain constant across the
-                  // row we're calculating.
-                  PerCacheLineParameters<T1> line_params(
-                      CalculatePerCacheLineParameters<T1>(
-                          task_params.cache_height, cache_y,
-                          task_params.resize_cache,
-                          task_params.cache_line_width, task_params.input_width,
-                          task_params.input_depth, task_params.top_padding,
-                          task_params.pad_offset, task_params.resized_height,
-                          task_params.st, task_params.input_batch_start));
-                  // Iterate through the resize cache row we're filling in.
-                  for (int cache_x = task_params.cache_start_x;
-                       cache_x < task_params.cache_end_x; ++cache_x) {
-                    // Figure out what we need for the cache pixel we're
-                    // populating.
-                    PerCachePixelParameters<T1> pixel_params(
-                        CalculatePerCachePixelParameters<T1>(
-                            cache_x, task_params.cache_start_x,
-                            line_params.cache_line_start,
-                            task_params.input_depth, task_params.left_padding,
-                            task_params.pad_offset, task_params.resized_width,
-                            task_params.st));
-                    // If the access is off the left, right, top, or bottom of
-                    // the resized image, the conv padding means we should set
-                    // it to zero.
-                    if ((cache_x < 0) ||
-                        (cache_x >= task_params.padded_width) ||
-                        (cache_y < 0) ||
-                        (cache_y >= task_params.padded_height)) {
-                      std::fill_n(pixel_params.cache_line_pixel,
-                                  task_params.input_depth, T1(0));
-                    } else {
-                      // There are two different sampling strategies for
-                      // resizing. When using nearest, we can just do a
-                      // straight copy of the pixel closest to our sample point,
-                      // but bilinear requires a more complex calculation.
-                      if (SampleMode == NEAREST) {
-                        const T1* input_top_left_pixel =
-                            line_params.input_top_row_start +
-                            (pixel_params.left_x_index *
-                             task_params.input_depth);
-
-                        std::copy_n(input_top_left_pixel,
-                                    task_params.input_depth,
-                                    pixel_params.cache_line_pixel);
-                      } else {
-                        const SampleRect<T1> rect(
-                            line_params.input_top_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_top_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.left_x_index *
-                                 task_params.input_depth),
-                            line_params.input_bottom_row_start +
-                                (pixel_params.right_x_index *
-                                 task_params.input_depth));
-                        for (int in_channel = 0;
-                             in_channel < task_params.input_depth;
-                             ++in_channel) {
-                          pixel_params.cache_line_pixel[in_channel] =
-                              rect.BilinearSample(in_channel,
-                                                  pixel_params.x_lerp,
-                                                  line_params.y_lerp);
-                        }
-                      }
-                    }
-                  }
-                }
-              });
-          end_cached_lines = cache_end_y;
-        }
-        for (int out_x = 0; out_x < output_width; ++out_x) {
-          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
-          const int patch_index = (batch * output_width * output_height) +
-                                  (out_y * output_width) + out_x;
-          const int patch_index_within_chunk = patch_index % patches_per_chunk;
-          T1* im2col_patch_start =
-              im2col_buffer + (patch_index_within_chunk * filter_value_count);
-          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
-            T1* im2col_row_start =
-                im2col_patch_start +
-                (filter_y * filter_width * task_params.input_depth);
-            const int conv_in_y = in_y_origin + filter_y;
-            int cache_index_y;
-            if (conv_in_y < 0) {
-              cache_index_y = task_params.cache_height +
-                              (conv_in_y % task_params.cache_height);
-            } else {
-              cache_index_y = conv_in_y % task_params.cache_height;
-            }
-            T1* cache_line_start =
-                task_params.resize_cache +
-                (cache_index_y * task_params.cache_line_width *
-                 task_params.input_depth);
-            T1* cache_filter_row_start =
-                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
-                                    task_params.input_depth);
-            std::copy_n(cache_filter_row_start,
-                        (filter_width * task_params.input_depth),
-                        im2col_row_start);
-          }
-          const bool is_last_in_chunk =
-              (patch_index_within_chunk == (patches_per_chunk - 1));
-          const bool is_last_overall =
-              ((batch == (input_batches - 1)) &&
-               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
-          if (is_last_in_chunk || is_last_overall) {
-            // Now we've assembled a set of image patches into a matrix, apply
-            // a GEMM matrix multiply of the patches as rows, times the filter
-            // weights in columns, to get partial results in the output
-            // matrix.
-            const int how_many_patches = patch_index_within_chunk + 1;
-            const int m = how_many_patches;
-            const int n = filter_count;
-            const int k = filter_value_count;
-            const int lda = filter_value_count;
-            const int ldb = filter_count;
-            const int ldc = filter_count;
-            const size_t start_patch_index =
-                patch_index - (how_many_patches - 1);
-            T3* chunk_output_data =
-                output_data + (start_patch_index * filter_count);
-            TGemmFunctor gemm_functor;
-            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
-                         chunk_output_data, ldc);
-          }
-        }
-      }
-    }
-  }
-};
-
-}  // namespace
-
-// Implements a version of convolution with bilinear resizing and mirror padding
-// included.
-template <class T, class TConvFunctor, bool DoResize>
-class FusedResizeConv2DUsingGemmOp : public OpKernel {
- public:
-  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
-      : OpKernel(context) {
-    if (DoResize) {
-      OP_REQUIRES_OK(context,
-                     context->GetAttr("resize_align_corners", &align_corners_));
-    }
-    MirrorPadMode mode;
-    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
-
-    switch (mode) {
-      case MirrorPadMode::SYMMETRIC: {
-        offset_ = 0;
-        break;
-      }
-      case MirrorPadMode::REFLECT: {
-        offset_ = 1;
-        break;
-      }
-      default:
-        OP_REQUIRES(context, false,
-                    errors::InvalidArgument(
-                        "mode must be either REFLECT or SYMMETRIC."));
-    }
-    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
-    OP_REQUIRES(context, strides_.size() == 4,
-                errors::InvalidArgument("Sliding window strides field must "
-                                        "specify 4 dimensions"));
-    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
-    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
-    OP_REQUIRES(
-        context, stride_n == 1 && stride_c == 1,
-        errors::InvalidArgument("Current implementation does not yet support "
-                                "strides in the batch and depth dimensions."));
-    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    // Input tensor is of the following dimensions:
-    // [ batch, in_rows, in_cols, in_depth ]
-    const Tensor& input = context->input(0);
-    OP_REQUIRES(context, (input.shape().num_elements() > 0),
-                errors::InvalidArgument("Input tensor can't be empty"));
-
-    ImageResizerState st(false);
-    if (DoResize) {
-      st = ImageResizerState(align_corners_);
-      st.ValidateAndCalculateOutputSize(context, input);
-      if (!context->status().ok()) return;
-    } else {
-      // Set up the resize parameters to do no scaling at all.
-      st.batch_size = input.dim_size(0);
-      st.out_height = input.dim_size(1);
-      st.out_width = input.dim_size(2);
-      st.in_height = input.dim_size(1);
-      st.in_width = input.dim_size(2);
-      st.channels = input.dim_size(3);
-      st.height_scale = 1.0f;
-      st.width_scale = 1.0f;
-    }
-    TensorShape resized_shape(
-        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
-    int paddings_index;
-    int filter_index;
-    if (DoResize) {
-      paddings_index = 2;
-      filter_index = 3;
-    } else {
-      paddings_index = 1;
-      filter_index = 2;
-    }
-    const Tensor& paddings = context->input(paddings_index);
-
-    const int dims = resized_shape.dims();
-    OP_REQUIRES(
-        context,
-        TensorShapeUtils::IsMatrix(paddings.shape()) &&
-            paddings.dim_size(1) == 2,
-        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
-                                paddings.shape().DebugString()));
-    const int fixed_dims =
-        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
-            ? 1
-            : dims;
-    OP_REQUIRES(
-        context, fixed_dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            fixed_dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-    OP_REQUIRES(
-        context, dims == paddings.dim_size(0),
-        errors::InvalidArgument(
-            "The first dimension of paddings must be the rank of inputs: ",
-            dims, " ", paddings.shape().DebugString(), " ",
-            resized_shape.DebugString()));
-
-    OP_REQUIRES(
-        context, dims == 4,
-        errors::InvalidArgument(
-            "Fused mirror padding only supports four-dimensional inputs, but ",
-            dims, " requested"));
-
-    // Compute the shape of the output tensor, and allocate it.
-    TensorShape padded_shape;
-    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
-    for (int d = 0; d < dims; ++d) {
-      const int32 before =
-          paddings_matrix(d, 0);  // Pad before existing elements.
-      const int32 after =
-          paddings_matrix(d, 1);  // Pad after existing elements.
-      OP_REQUIRES(context, before >= 0 && after >= 0,
-                  errors::InvalidArgument(
-                      "paddings must be non-negative: ", before, " ", after));
-      if (offset_ == 0) {  // SYMMETRIC mode.
-        OP_REQUIRES(
-            context,
-            before <= resized_shape.dim_size(d) &&
-                after <= resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be no greater "
-                                    "than the dimension size: ",
-                                    before, ", ", after, " greater than ",
-                                    resized_shape.dim_size(d)));
-      } else if (offset_ == 1) {  // REFLECT mode.
-        OP_REQUIRES(
-            context,
-            before < resized_shape.dim_size(d) &&
-                after < resized_shape.dim_size(d),
-            errors::InvalidArgument("paddings must be less than"
-                                    " the dimension size: ",
-                                    before, ", ", after, " not less than ",
-                                    resized_shape.dim_size(d)));
-      }
-      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
-    }
-
-    OP_REQUIRES(
-        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not batches: ",
-            paddings.DebugString()));
-    OP_REQUIRES(
-        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
-        errors::InvalidArgument(
-            "Fused mirror padding only support spatial padding, not channels: ",
-            paddings.DebugString()));
-    const int32 top_padding = paddings_matrix(1, 0);
-    const int32 bottom_padding = paddings_matrix(1, 1);
-    const int32 left_padding = paddings_matrix(2, 0);
-    const int32 right_padding = paddings_matrix(2, 1);
-
-    // Input filter is of the following dimensions:
-    // [ filter_rows, filter_cols, in_depth, out_depth]
-    const Tensor& filter = context->input(filter_index);
-
-    // For 2D convolution, there should be 4 dimensions.
-    OP_REQUIRES(context, padded_shape.dims() == 4,
-                errors::InvalidArgument("input must be 4-dimensional",
-                                        padded_shape.DebugString()));
-    OP_REQUIRES(context, filter.dims() == 4,
-                errors::InvalidArgument("filter must be 4-dimensional: ",
-                                        filter.shape().DebugString()));
-
-    // We only check the first three dims, since the depth is accessed as an
-    // int64 below.
-    for (int i = 0; i < 3; i++) {
-      OP_REQUIRES(
-          context,
-          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
-          errors::InvalidArgument("filter too large"));
-    }
-
-    // The last dimension for input is in_depth. It must be the same as the
-    // filter's in_depth.
-    const int64 in_depth = padded_shape.dim_size(3);
-    OP_REQUIRES(context, in_depth == filter.dim_size(2),
-                errors::InvalidArgument(
-                    "input and filter must have the same depth: ", in_depth,
-                    " vs ", filter.dim_size(2)));
-
-    // The last dimension for filter is out_depth.
-    const int out_depth = static_cast<int>(filter.dim_size(3));
-
-    // The second dimension for input is rows/height.
-    // The first dimension for filter is rows/height.
-    const int64 padded_rows_raw = padded_shape.dim_size(1);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input rows too large"));
-    const int padded_rows = static_cast<int>(padded_rows_raw);
-    const int filter_rows = static_cast<int>(filter.dim_size(0));
-    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
-
-    // The third dimension for input is columns/width.
-    // The second dimension for filter is columns/width.
-    const int64 padded_cols_raw = padded_shape.dim_size(2);
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("Input cols too large"));
-    const int padded_cols = static_cast<int>(padded_cols_raw);
-    const int filter_cols = static_cast<int>(filter.dim_size(1));
-    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
-
-    // The first dimension for input is batch.
-    const int64 batch_raw = padded_shape.dim_size(0);
-    OP_REQUIRES(context,
-                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
-                errors::InvalidArgument("batch is too large"));
-    const int batch = static_cast<int>(batch_raw);
-
-    // For now we take the stride from the second and third dimensions only (we
-    // do not support striding on the batch or depth dimension).
-    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
-    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
-
-    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
-                                         padding_, &out_rows, &pad_rows));
-    OP_REQUIRES_OK(context,
-                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
-                                         padding_, &out_cols, &pad_cols));
-    TensorShape out_shape =
-        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
-    OP_REQUIRES(context, (out_shape.num_elements() > 0),
-                errors::InvalidArgument("Output tensor can't be empty"));
-
-    // Output tensor is of the following dimensions:
-    // [ in_batch, out_rows, out_cols, out_depth ]
-    Tensor* output = nullptr;
-    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
-
-    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
-            << ", padded_cols = " << padded_cols
-            << ", resized_cols = " << resized_cols
-            << ", filter_cols = " << filter_cols
-            << ", padded_rows = " << padded_rows
-            << ", resized_rows = " << resized_rows
-            << ", filter_rows = " << filter_rows
-            << ", stride_rows = " << stride_rows
-            << ", stride_cols = " << stride_cols
-            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
-
-    // If there is nothing to compute, return.
-    if (out_shape.num_elements() == 0) {
-      return;
-    }
-    TConvFunctor conv_functor;
-    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
-                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
-                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
-                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
-                 bottom_padding, left_padding, right_padding, offset_);
-  }
-
- private:
-  std::vector<int32> strides_;
-  Padding padding_;
-  bool align_corners_;
-  int offset_;
-
-  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
-};
-
-#define REGISTER_FUSED(T)                                                 \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedResizeAndPadConv2D")                                     \
-          .Device(DEVICE_CPU)                                             \
-          .TypeConstraint<T>("T"),                                        \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       BILINEAR>,                         \
-          true>);
-
-TF_CALL_half(REGISTER_FUSED);
-TF_CALL_float(REGISTER_FUSED);
-TF_CALL_double(REGISTER_FUSED);
-
-#define REGISTER_PAD_ONLY_FUSED(T)                                        \
-  REGISTER_KERNEL_BUILDER(                                                \
-      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
-      FusedResizeConv2DUsingGemmOp<                                       \
-          T,                                                              \
-          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
-                                       NEAREST>,                          \
-          false>);
-
-TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
-TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
-
-// Support for fusing computationally cheap, but memory bandwidth expensive
-// computations into the output of convolution to reduce the overall latency.
-//
-// Example: Fuse Conv2D+BiasAdd+Relu.
-
 namespace {
 
 typedef Eigen::ThreadPoolDevice CPUDevice;
@@ -929,9 +62,45 @@ template <typename Scalar, typename Index>
 using ContractionOutputMapper =
     Eigen::internal::blas_data_mapper<Scalar, Index, Eigen::ColMajor>;
 
-// Output kernel that fused BiasAdd operation into the output of tensor
-// contraction.
-template <typename T>
+// Returns input expression without any transformations.
+struct Identity {
+  template <typename XprType>
+  static auto apply(XprType expr) -> XprType {
+    return expr;
+  };
+};
+
+// Applies `Relu` to the passed input expression.
+struct Relu {
+  template <typename XprType>
+  static auto apply(XprType expr)
+      -> decltype(expr.cwiseMax(std::declval<typename XprType::Scalar>())) {
+    return expr.cwiseMax(static_cast<typename XprType::Scalar>(0));
+  };
+};
+
+// TensorContraction swaps lhs with rhs, and changes layout from RowMajor
+// (default in Tensorflow) to ColMajor (preferred in Eigen), and computes matmul
+// using these tensors.
+//
+// TensorContraction output matrix (before reshape) has a ColMajor layout, and
+// has dimensions:
+//  - rows: output_channels
+//  - cols: all other dimensions
+//
+// First element in every column is:
+//   [batch ??, height ??, width ??, out_channel = i]
+//
+// We do not know what are the values of the 'batch', 'height', and 'width' here
+// (if we know original dimensions, they can be computed from 'j').
+//
+// Each column of an output block is a continuous slice along the output channel
+// dimension, so we can use it to efficiently compute any transformation that
+// depends only on a channel value (e.g. add channel bias).
+
+// Output kernel that fuses BiasAdd operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
 struct BiasAddOutputKernel {
   explicit BiasAddOutputKernel(const T* bias_data) : bias_data(bias_data) {}
 
@@ -943,13 +112,13 @@ struct BiasAddOutputKernel {
     DCHECK(params.swapped_arguments);
 
     const T* bias_base = bias_data + i;
+    typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
 
-    // TODO(ezhulenev): Use Eigen::Array with strides after upgrading Eigen.
     for (int col = 0; col < num_cols; ++col) {
       T* output_base = &output_mapper(0, col);
       typename OutputTypes<T>::Tensor output(output_base, num_rows);
-      typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
-      output = output + bias;
+      const auto expr = output + bias;
+      output = Transform::template apply<decltype(expr)>(expr);
     }
   }
 
@@ -957,12 +126,16 @@ struct BiasAddOutputKernel {
   const T* bias_data;
 };
 
-// Output kernel that fused BiasAdd and Relu operations into the output of
-// tensor contraction.
-template <typename T>
-struct BiasAddWithReluOutputKernel {
-  explicit BiasAddWithReluOutputKernel(const T* bias_data)
-      : bias_data(bias_data) {}
+// Output kernel that fuses FusedBatchNorm operation into the output of tensor
+// contraction + any other transformation defined by Transform.
+template <typename T, typename Transform = Identity>
+struct FusedBatchNormOutputKernel {
+  FusedBatchNormOutputKernel(T epsilon, const T* scaling_factor_data,
+                             const T* offset_data, const T* estimated_mean_data)
+      : epsilon(epsilon),
+        scaling_factor_data(scaling_factor_data),
+        offset_data(offset_data),
+        estimated_mean_data(estimated_mean_data) {}
 
   template <typename Index, typename Scalar>
   EIGEN_ALWAYS_INLINE void operator()(
@@ -971,19 +144,31 @@ struct BiasAddWithReluOutputKernel {
       Index num_rows, Index num_cols) const {
     DCHECK(params.swapped_arguments);
 
-    const T* bias_base = bias_data + i;
+    const T* scaling_factor_base = scaling_factor_data + i;
+    const T* offset_base = offset_data + i;
+    const T* mean_base = estimated_mean_data + i;
+
+    typename OutputTypes<T>::ConstTensor scaling_factor(scaling_factor_base,
+                                                        num_rows);
+    typename OutputTypes<T>::ConstTensor offset(offset_base, num_rows);
+    typename OutputTypes<T>::ConstTensor mean(mean_base, num_rows);
 
-    // TODO(ezhulenev): Use Eigen::Array with strides after upgrading Eigen.
     for (int col = 0; col < num_cols; ++col) {
       T* output_base = &output_mapper(0, col);
       typename OutputTypes<T>::Tensor output(output_base, num_rows);
-      typename OutputTypes<T>::ConstTensor bias(bias_base, num_rows);
-      output = (output + bias).cwiseMax(static_cast<T>(0));
+
+      auto scaled = (output - mean) * scaling_factor;
+      auto shifted = scaled + offset;
+
+      output = Transform::template apply<decltype(shifted)>(shifted);
     }
   }
 
  private:
-  const T* bias_data;
+  T epsilon;
+  const T* scaling_factor_data;
+  const T* offset_data;
+  const T* estimated_mean_data;
 };
 
 // Type aliases for the output kernels, purely for the sake of better launch
@@ -991,21 +176,33 @@ struct BiasAddWithReluOutputKernel {
 template <typename T>
 using WithBiasAdd = BiasAddOutputKernel<T>;
 template <typename T>
-using WithBiasAddAndRelu = BiasAddWithReluOutputKernel<T>;
+using WithBiasAddAndRelu = BiasAddOutputKernel<T, Relu>;
+template <typename T>
+using WithFusedBatchNorm = FusedBatchNormOutputKernel<T>;
+template <typename T>
+using WithFusedBatchNormAndRelu = FusedBatchNormOutputKernel<T, Relu>;
 
 // Dispatch 2D convolution to the appropriate primitive operation:
 //   (1) MatMul for the case of 1x1 convolution.
 //   (2) MatMul for the case when filter size equals to the input size.
 //   (3) General spatial 2D convolution for all other cases.
-template <typename T, typename OutputKernel>
-struct LaunchConv2DWithOutputKernel {
-  void operator()(OpKernelContext* ctx, const Tensor& input,
-                  const Tensor& filter, int row_stride, int col_stride,
-                  int row_dilation, int col_dilation, const Padding& padding,
-                  const OutputKernel& output_kernel, Tensor* output,
-                  TensorFormat data_format) {
-    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 && row_stride == 1 &&
-        col_stride == 1) {
+template <typename T>
+class LaunchConv2DWithOutputKernel {
+ public:
+  LaunchConv2DWithOutputKernel(int row_stride, int col_stride,      //
+                               int row_dilation, int col_dilation,  //
+                               Padding padding)
+      : row_stride_(row_stride),
+        col_stride_(col_stride),
+        row_dilation_(row_dilation),
+        col_dilation_(col_dilation),
+        padding_(padding) {}
+
+  template <typename OutputKernel>
+  void operator()(const OutputKernel& output_kernel, OpKernelContext* ctx,
+                  const Tensor& input, const Tensor& filter, Tensor* output) {
+    if (filter.dim_size(0) == 1 && filter.dim_size(1) == 1 &&
+        row_stride_ == 1 && col_stride_ == 1) {
       int conv_width = 1;  // Width for the convolution step.
       for (int i = 0; i < 3; ++i) {
         conv_width *= output->dim_size(i);
@@ -1021,8 +218,8 @@ struct LaunchConv2DWithOutputKernel {
           dim_pair, output_kernel);
 
     } else if (filter.dim_size(0) == input.dim_size(1) &&
-               filter.dim_size(1) == input.dim_size(2) && row_dilation == 1 &&
-               col_dilation == 1 && padding == VALID) {
+               filter.dim_size(1) == input.dim_size(2) && row_dilation_ == 1 &&
+               col_dilation_ == 1 && padding_ == VALID) {
       // If the input data and filter have the same height/width,
       // reduce the 2D convolution to matrix multiplication.
       const auto k =  // Length of reduction dimension.
@@ -1040,11 +237,18 @@ struct LaunchConv2DWithOutputKernel {
     } else {
       functor::SpatialConvolution<CPUDevice, T, OutputKernel>()(
           ctx->eigen_device<CPUDevice>(), output->tensor<T, 4>(),
-          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride, col_stride,
-          row_dilation, col_dilation, BrainPadding2EigenPadding(padding),
+          input.tensor<T, 4>(), filter.tensor<T, 4>(), row_stride_, col_stride_,
+          row_dilation_, col_dilation_, BrainPadding2EigenPadding(padding_),
           output_kernel);
     }
   }
+
+ private:
+  int row_stride_;
+  int col_stride_;
+  int row_dilation_;
+  int col_dilation_;
+  const Padding padding_;
 };
 
 }  // namespace
@@ -1065,27 +269,43 @@ class FusedConv2DOp : public OpKernel {
                 errors::InvalidArgument(
                     "Fused Conv2D must have at least one fused op."));
 
-    // Right now we always expect to have just one extra argument that is an
-    // input to the BiasAdd. In future we might fuse other types of computations
-    // taking additional arguments.
-
     int num_args;
     OP_REQUIRES_OK(context, context->GetAttr("num_args", &num_args));
-    OP_REQUIRES(context, num_args == 1,
-                errors::InvalidArgument(
-                    "Fused Conv2D must have one extra argument with a bias."));
 
     // TODO(ezhulenev): Add support for fusion element-wise op chains defined
     // at runtime, e.g. Relu+Sqrt+Tanh+etc...
 
+    // Match combination of fused ops to one of the supported fusions.
     if (FusedOpsMatches(fused_ops, {"BiasAdd"})) {
       fused_computation_ = FusedComputationType::kBiasAdd;
     } else if (FusedOpsMatches(fused_ops, {"BiasAdd", "Relu"})) {
       fused_computation_ = FusedComputationType::kBiasAddWithRelu;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNorm;
+    } else if (FusedOpsMatches(fused_ops, {"FusedBatchNorm", "Relu"})) {
+      fused_computation_ = FusedComputationType::kFusedBatchNormWithRelu;
     } else {
       OP_REQUIRES(context, false,
-                  errors::Unimplemented("Fusion is not implemented: ",
-                                        str_util::Join(fused_ops, ",")));
+                  errors::Unimplemented("Fusion is not implemented: [",
+                                        str_util::Join(fused_ops, ","), "]"));
+    }
+
+    // Depending on a picked fusion type validate fusion-specific arguments.
+
+    if (fused_computation_ == FusedComputationType::kBiasAdd ||
+        fused_computation_ == FusedComputationType::kBiasAddWithRelu) {
+      OP_REQUIRES(context, num_args == 1,
+                  errors::InvalidArgument(
+                      "Fused Conv2D must have one extra argument: bias."));
+    }
+
+    if (fused_computation_ == FusedComputationType::kFusedBatchNorm ||
+        fused_computation_ == FusedComputationType::kFusedBatchNormWithRelu) {
+      OP_REQUIRES(
+          context, num_args == 4,
+          errors::InvalidArgument("Fused FusedBatchNorm must have four extra "
+                                  "arguments: scale, offset, mean, variance."));
+      OP_REQUIRES_OK(context, context->GetAttr("epsilon", &epsilon_));
     }
   }
 
@@ -1098,10 +318,6 @@ class FusedConv2DOp : public OpKernel {
     // [ filter_rows, filter_cols, in_depth, out_depth]
     const Tensor& filter = context->input(1);
 
-    // Bias of the following dimensions:
-    // [ output_depth ]
-    const Tensor& bias = context->input(2);
-
     Conv2DDimensions dimensions;
     OP_REQUIRES_OK(context,
                    ComputeConv2DDimension(params_, input, filter, &dimensions));
@@ -1139,23 +355,47 @@ class FusedConv2DOp : public OpKernel {
                 errors::Unimplemented("Fused conv implementation does not "
                                       "support grouped convolutions for now."));
 
-    auto bias_data = reinterpret_cast<const T*>(bias.tensor_data().data());
+    BiasAddArgs bias_add;
+    FusedBatchNormArgs fused_batch_norm;
 
-#define LAUNCH_CONV2D(KERNEL)                                                 \
-  LaunchConv2DWithOutputKernel<T, KERNEL>()(                                  \
-      context, input, filter, dimensions.stride_rows, dimensions.stride_cols, \
-      dimensions.dilation_rows, dimensions.dilation_cols, params_.padding,    \
-      KERNEL(bias_data), output, params_.data_format);                        \
-  break
+    LaunchConv2DWithOutputKernel<T> conv2d(
+        dimensions.stride_rows, dimensions.stride_cols,
+        dimensions.dilation_rows, dimensions.dilation_cols, params_.padding);
 
     switch (fused_computation_) {
       case FusedComputationType::kBiasAdd:
-        LAUNCH_CONV2D(WithBiasAdd<T>);
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAdd<T>(bias_add.bias_add_data), context, input, filter,
+               output);
+        break;
+
       case FusedComputationType::kBiasAddWithRelu:
-        LAUNCH_CONV2D(WithBiasAddAndRelu<T>);
+        OP_REQUIRES_OK(context, InitBiasAddArgs(context, &bias_add));
+        conv2d(WithBiasAddAndRelu<T>(bias_add.bias_add_data), context, input,
+               filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNorm:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNorm<T>(epsilon_,
+                                     fused_batch_norm.scaling_factor.data(),
+                                     fused_batch_norm.offset_data,
+                                     fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
+
+      case FusedComputationType::kFusedBatchNormWithRelu:
+        OP_REQUIRES_OK(context,
+                       InitFusedBatchNormArgs(context, &fused_batch_norm));
+        conv2d(WithFusedBatchNormAndRelu<T>(
+                   epsilon_, fused_batch_norm.scaling_factor.data(),
+                   fused_batch_norm.offset_data,
+                   fused_batch_norm.estimated_mean_data),
+               context, input, filter, output);
+        break;
     }
   }
-#undef LAUNCH_CONV2D
 
  private:
   bool FusedOpsMatches(const std::vector<string>& fused_ops,
@@ -1163,13 +403,92 @@ class FusedConv2DOp : public OpKernel {
     return fused_ops == expected;
   }
 
+  struct BiasAddArgs {
+    const T* bias_add_data = nullptr;
+  };
+
+  struct FusedBatchNormArgs {
+    const T* scale_data = nullptr;
+    const T* offset_data = nullptr;
+    const T* estimated_mean_data = nullptr;
+    const T* estimated_variance_data = nullptr;
+
+    // Precomputed expression:
+    //   scaling_factor = (estimated_variance + epsilon).rsqrt() * scale
+    Eigen::Tensor<T, 1, Eigen::RowMajor> scaling_factor;
+  };
+
+#define TF_REQUIRES(EXP, STATUS) \
+  if (!TF_PREDICT_TRUE(EXP)) return (STATUS)
+
+  void InitDataPtr(const Tensor& tensor, const T** ptr) const {
+    *ptr = reinterpret_cast<const T*>(tensor.tensor_data().data());
+  }
+
+  Status InitBiasAddArgs(OpKernelContext* context, BiasAddArgs* args) const {
+    // Bias of the following dimensions: [ output_depth ]
+    const Tensor& bias = context->input(2);
+
+    TF_REQUIRES(bias.dims() == 1,
+                errors::InvalidArgument("bias must be 1-dimensional",
+                                        bias.shape().DebugString()));
+
+    InitDataPtr(bias, &args->bias_add_data);
+
+    return Status::OK();
+  }
+
+  Status InitFusedBatchNormArgs(OpKernelContext* context,
+                                FusedBatchNormArgs* args) const {
+    const Tensor& scale = context->input(2);
+    const Tensor& offset = context->input(3);
+    const Tensor& estimated_mean = context->input(4);
+    const Tensor& estimated_variance = context->input(5);
+
+    TF_REQUIRES(scale.dims() == 1,
+                errors::InvalidArgument("scale must be 1-dimensional",
+                                        scale.shape().DebugString()));
+    TF_REQUIRES(offset.dims() == 1,
+                errors::InvalidArgument("offset must be 1-dimensional",
+                                        offset.shape().DebugString()));
+    TF_REQUIRES(estimated_mean.dims() == 1,
+                errors::InvalidArgument("estimated_mean must be 1-dimensional",
+                                        estimated_mean.shape().DebugString()));
+    TF_REQUIRES(
+        estimated_variance.dims() == 1,
+        errors::InvalidArgument("estimated_variance must be 1-dimensional",
+                                estimated_variance.shape().DebugString()));
+
+    InitDataPtr(scale, &args->scale_data);
+    InitDataPtr(offset, &args->offset_data);
+    InitDataPtr(estimated_mean, &args->estimated_mean_data);
+    InitDataPtr(estimated_variance, &args->estimated_variance_data);
+
+    // Precompute scaling factor once for all output blocks (kernels).
+    args->scaling_factor =
+        (estimated_variance.flat<T>() + static_cast<T>(epsilon_)).rsqrt() *
+        scale.flat<T>();
+
+    return Status::OK();
+  }
+
+#undef TF_REQUIRES
+
   // Element-wise ops applied to the result of Conv2D.
   // TODO(ezhulenev): Add support for runtime-defined op chains.
-  enum class FusedComputationType { kBiasAdd, kBiasAddWithRelu };
+  enum class FusedComputationType {
+    kBiasAdd,
+    kBiasAddWithRelu,
+    kFusedBatchNorm,
+    kFusedBatchNormWithRelu
+  };
 
   Conv2DParameters params_;
   FusedComputationType fused_computation_;
 
+  // FusedBatchNorm attributes.
+  float epsilon_;
+
   TF_DISALLOW_COPY_AND_ASSIGN(FusedConv2DOp);
 };
 
@@ -1180,7 +499,9 @@ class FusedConv2DOp : public OpKernel {
 
 // If we're using the alternative GEMM-based implementation of Conv2D for the
 // CPU implementation, don't register this EigenTensor-based version.
-#if !defined(USE_GEMM_FOR_CONV)
+// TODO(b/119765980): Upgrade upstream Eigen to set `m_can_use_xsmm=false` for
+// contractions with non-default contraction output kernels.
+#if !defined(USE_GEMM_FOR_CONV) && !defined(EIGEN_USE_LIBXSMM)
 TF_CALL_float(REGISTER_FUSED_CONV2D);
 TF_CALL_double(REGISTER_FUSED_CONV2D);
 #endif  // !USE_GEMM_FOR_CONV
diff --git a/tensorflow/core/kernels/conv_ops_fused_image_transform.cc b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7be1de29c951dca16085e35587d02eeeec01354f
--- /dev/null
+++ b/tensorflow/core/kernels/conv_ops_fused_image_transform.cc
@@ -0,0 +1,902 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Implements convolution operations with image transformations (resize and
+// mirror padding) baked into the processing, to optimize latency and memory
+// usage.
+
+#define EIGEN_USE_THREADS
+
+#include <string>
+#include <vector>
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/numeric_op.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_slice.h"
+#include "tensorflow/core/kernels/bounds_check.h"
+#include "tensorflow/core/kernels/conv_2d.h"
+#include "tensorflow/core/kernels/conv_ops.h"
+#include "tensorflow/core/kernels/gemm_functors.h"
+#include "tensorflow/core/kernels/image_resizer_state.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/mirror_pad_mode.h"
+#include "tensorflow/core/util/padding.h"
+#include "tensorflow/core/util/tensor_format.h"
+
+namespace tensorflow {
+namespace {
+
+// We don't want to allocate a buffer to hold all the patches if the size is
+// going to be extremely large, so break it into chunks if it's bigger than
+// a limit. Each chunk will be processed serially, so we can refill the
+// buffer for the next chunk and reuse it, keeping maximum memory size down.
+// In this case, we've picked 16 megabytes as a reasonable limit for Android and
+// other platforms using Eigen, and 1MB for iOS devices, from experimentation.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+const size_t kMaxChunkSize = (1 * 1024 * 1024);
+#else
+const size_t kMaxChunkSize = (16 * 1024 * 1024);
+#endif
+const size_t kResizeCacheSize = (8 * 1024 * 1024);
+
+// Lookup method used when resizing.
+enum SamplingMode {
+  BILINEAR = 0,
+  NEAREST = 1,
+};
+
+// Simple utility function used by FusedConv to multithread basic workloads. To
+// use it, pass begin and end values for the full workload and a std::function
+// that receives a subset of that through the begin and end values for each
+// worker's task. The division of the full workload into worker tasks is handled
+// by the multithreading logic. Here's an example of how to use it:
+// std::vector<float> my_vector(100);
+// ...
+// FusedConvParallelFor(context, 0, 100,
+//   [&my_vector](int64 task_begin, int64 task_end) {
+//     for (int64 current = task_begin; current != task_end; ++current) {
+//       my_vector[current] *= 10.0f;
+//     }
+// });
+void FusedConvParallelFor(
+    OpKernelContext* context, int64 begin, int64 end,
+    const std::function<void(int64, int64)>& task_function) {
+// On iOS, the thread management imposes a very big performance penalty, so
+// just call the function directly with no multithreading.
+#if defined(__APPLE__) && defined(IS_MOBILE_PLATFORM)
+  task_function(begin, end);
+#else
+  auto& worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+  thread::ThreadPool* thread_pool = worker_threads.workers;
+  const int64 total_elements = end - begin;
+  // This is a bit of an arbitrary number, but was found to work well for
+  // typical models we've been profiling on various devices.
+  const int64 element_cost = 10000000;
+  thread_pool->ParallelFor(
+      total_elements, element_cost,
+      [begin, task_function](int64 begin_offset, int64 end_offset) {
+        const int64 task_begin = begin + begin_offset;
+        const int64 task_end = begin + end_offset;
+        task_function(task_begin, task_end);
+      });
+#endif
+}
+
+// Holds the state needed for the resizing subtasks.
+template <class T1>
+struct ResizeTaskParameters {
+  ResizeTaskParameters() : st(false) {}
+
+  int cache_height;
+  T1* resize_cache;
+  int cache_line_width;
+  int input_width;
+  int input_depth;
+  int top_padding;
+  int pad_offset;
+  int64 resized_height;
+  ImageResizerState st;
+  const T1* input_batch_start;
+  int64 cache_start_x;
+  int64 cache_end_x;
+  int left_padding;
+  int64 resized_width;
+  int64 padded_width;
+  int64 padded_height;
+};
+
+template <class T1>
+struct PerCacheLineParameters {
+  PerCacheLineParameters() {}
+  PerCacheLineParameters(const PerCacheLineParameters<T1>& other)
+      : cache_line_start(other.cache_line_start),
+        input_top_row_start(other.input_top_row_start),
+        input_bottom_row_start(other.input_bottom_row_start),
+        y_lerp(other.y_lerp) {}
+
+  T1* cache_line_start;
+  const T1* input_top_row_start;
+  const T1* input_bottom_row_start;
+  T1 y_lerp;
+};
+
+// Helper class to simplify bilinear filtering
+template <class T1>
+struct SampleRect {
+  EIGEN_ALWAYS_INLINE SampleRect(const T1* in_top_left, const T1* in_top_right,
+                                 const T1* in_bottom_left,
+                                 const T1* in_bottom_right)
+      : top_left(in_top_left),
+        top_right(in_top_right),
+        bottom_left(in_bottom_left),
+        bottom_right(in_bottom_right) {}
+
+  EIGEN_ALWAYS_INLINE T1 BilinearSample(int channel, T1 x_lerp,
+                                        T1 y_lerp) const {
+    const T1 top =
+        top_left[channel] + (top_right[channel] - top_left[channel]) * x_lerp;
+    const T1 bottom = bottom_left[channel] +
+                      (bottom_right[channel] - bottom_left[channel]) * x_lerp;
+    return top + (bottom - top) * y_lerp;
+  }
+
+  const T1* top_left;
+  const T1* top_right;
+  const T1* bottom_left;
+  const T1* bottom_right;
+};
+
+// Calculates parameters which remain constant through a resize cache row.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCacheLineParameters<T1> CalculatePerCacheLineParameters(
+    int64 cache_height, int64 cache_y, T1* resize_cache, int64 cache_line_width,
+    int64 input_width, int64 input_depth, int64 top_padding, int64 pad_offset,
+    int64 resized_height, const ImageResizerState& st,
+    const T1* input_batch_start) {
+  PerCacheLineParameters<T1> result;
+  // The cache is organized so that the real y values of the resized image map
+  // onto the actual cache values through a modulo scheme. This means that as we
+  // progress downwards through the image, we keep reusing a small cache and so
+  // keep memory usage down.
+  int64 cache_index_y;
+  if (cache_y < 0) {
+    cache_index_y = cache_height + (cache_y % cache_height);
+  } else {
+    cache_index_y = cache_y % cache_height;
+  }
+  result.cache_line_start =
+      resize_cache + (cache_index_y * cache_line_width * input_depth);
+  // This part is implementing the mirror padding that happens before resizing.
+  float in_y = (cache_y - top_padding);
+  if (in_y < 0) {
+    in_y = -(in_y + 1.0f - pad_offset);
+  } else if (in_y >= resized_height) {
+    in_y = (resized_height * 2.0f) - (in_y + 1.0f + pad_offset);
+  }
+  // Here's where do do the actual resize.
+  in_y *= st.height_scale;
+  const int64 top_y_index = static_cast<int64>(std::floor(in_y));
+  const int64 bottom_y_index =
+      std::min(static_cast<int64>(std::ceil(in_y)), (st.in_height - 1));
+  // Lerp is used for bilinear filtering when that's needed.
+  result.y_lerp = static_cast<T1>(in_y - top_y_index);
+  // Which rows of the original input image to pull the values from.
+  result.input_top_row_start =
+      input_batch_start + (top_y_index * input_width * input_depth);
+  result.input_bottom_row_start =
+      input_batch_start + (bottom_y_index * input_width * input_depth);
+  return result;
+}
+
+template <class T1>
+struct PerCachePixelParameters {
+  PerCachePixelParameters() {}
+  PerCachePixelParameters(const PerCachePixelParameters<T1>& other)
+      : cache_line_pixel(other.cache_line_pixel),
+        left_x_index(other.left_x_index),
+        right_x_index(other.right_x_index),
+        x_lerp(other.x_lerp) {}
+
+  T1* cache_line_pixel;
+  int64 left_x_index;
+  int64 right_x_index;
+  T1 x_lerp;
+};
+
+// Pulls out common parameters used for every resized pixel.
+template <class T1>
+EIGEN_ALWAYS_INLINE PerCachePixelParameters<T1>
+CalculatePerCachePixelParameters(int64 cache_x, int64 cache_start_x,
+                                 T1* cache_line_start, int64 input_depth,
+                                 int64 left_padding, int64 pad_offset,
+                                 int64 resized_width,
+                                 const ImageResizerState& st) {
+  PerCachePixelParameters<T1> result;
+  // Figure out where we're going to store the results of our transform.
+  const int cache_index_x = cache_x - cache_start_x;
+  result.cache_line_pixel = cache_line_start + (cache_index_x * input_depth);
+  // Implement mirror padding by flipping in_x if it's off the edge.
+  float in_x = (cache_x - left_padding);
+  if (in_x < 0) {
+    in_x = -(in_x + 1.0f - pad_offset);
+  } else if (in_x >= resized_width) {
+    in_x = (resized_width * 2.0f) - (in_x + 1.0f + pad_offset);
+  }
+  // Resize the x parameters.
+  in_x *= st.width_scale;
+  // Get the x coordinates for the left and right pixels to pull from.
+  result.left_x_index = static_cast<int64>(std::floor(in_x));
+  result.right_x_index =
+      std::min(static_cast<int64>(std::ceil(in_x)), (st.in_width - 1));
+  // This x_lerp is used to blend pixels in bilinear filtering.
+  result.x_lerp = static_cast<T1>(in_x - result.left_x_index);
+  return result;
+}
+
+// Combines bilinear resizing and mirror padding into the im2col transformation
+// stage of convolution.
+template <class T1, class T2, class T3, class TGemmFunctor,
+          SamplingMode SampleMode>
+class FusedResizeAndPadConvFunctor {
+ public:
+  void operator()(OpKernelContext* context, const Tensor& input,
+                  int input_batches, int resized_height, int resized_width,
+                  int padded_height, int padded_width, int input_depth,
+                  const T2* filter_data, int filter_height, int filter_width,
+                  int filter_count, int stride_rows, int stride_cols,
+                  Padding padding, T3* output_data, int output_height,
+                  int output_width, const ImageResizerState& st,
+                  int top_padding, int bottom_padding, int left_padding,
+                  int right_padding, int pad_offset) {
+    if ((input_batches <= 0) || (padded_width <= 0) || (padded_height <= 0) ||
+        (input_depth <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad input dimensions: "
+                   << input_batches << ", " << padded_height << ", "
+                   << padded_width << ", " << input_depth;
+      return;
+    }
+    if ((filter_width <= 0) || (filter_height <= 0) || (filter_count <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad filter dimensions: "
+                   << filter_width << ", " << filter_height << ", "
+                   << filter_count;
+      return;
+    }
+    if ((output_width <= 0) || (output_height <= 0)) {
+      LOG(WARNING) << "Conv2D was called with bad output width or height: "
+                   << output_width << ", " << output_height;
+      return;
+    }
+    OP_REQUIRES(
+        context, ((SampleMode == NEAREST) || (SampleMode == BILINEAR)),
+        errors::InvalidArgument("Bad sample mode passed in", SampleMode));
+
+    // These calculations define how the patches will be positioned within the
+    // input image. The actual definitions are quite complex, and rely on the
+    // previously-calculated output size.
+    int filter_left_offset;
+    int filter_top_offset;
+    if (padding == VALID) {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width + 1) /
+          2;
+      filter_top_offset = ((output_height - 1) * stride_rows + filter_height -
+                           padded_height + 1) /
+                          2;
+    } else {
+      filter_left_offset =
+          ((output_width - 1) * stride_cols + filter_width - padded_width) / 2;
+      filter_top_offset =
+          ((output_height - 1) * stride_rows + filter_height - padded_height) /
+          2;
+    }
+
+    ResizeTaskParameters<T1> task_params;
+    task_params.input_depth = input_depth;
+    task_params.top_padding = top_padding;
+    task_params.pad_offset = pad_offset;
+    task_params.resized_height = resized_height;
+    task_params.st = st;
+    task_params.left_padding = left_padding;
+    task_params.resized_width = resized_width;
+    task_params.padded_width = padded_width;
+    task_params.padded_height = padded_height;
+
+    // The im2col buffer has # of patches rows, and # of filters cols.
+    // It's laid out like this, in row major order in memory:
+    //        < filter value count >
+    //   ^   +---------------------+
+    // patch |                     |
+    // count |                     |
+    //   v   +---------------------+
+    // Each patch row contains a filter_width x filter_height patch of the
+    // input, with the depth channel as the most contiguous in memory, followed
+    // by the width, then the height. This is the standard memory order in the
+    // image world if it helps to visualize it.
+    const int filter_value_count = filter_width * filter_height * input_depth;
+
+    OP_REQUIRES(context, (filter_value_count * sizeof(T1)) <= kMaxChunkSize,
+                errors::InvalidArgument("Im2Col patch too large for buffer"));
+    const size_t patches_per_chunk =
+        kMaxChunkSize / (filter_value_count * sizeof(T1));
+    // Because memory allocation is very expensive on mobile platforms, try to
+    // allocate a persistent buffer that will be kept around between calls. We
+    // use TensorFlow's resource management to ensure that the memory will be
+    // released when the session is over.
+    Im2ColBufferResource<T1, kMaxChunkSize>* im2col_buffer_resource;
+    std::function<Status(Im2ColBufferResource<T1, kMaxChunkSize>**)> creator =
+        [](Im2ColBufferResource<T1, kMaxChunkSize>** resource) {
+          *resource = new Im2ColBufferResource<T1, kMaxChunkSize>();
+          return Status::OK();
+        };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "im2col_buffer",
+                                &im2col_buffer_resource, creator));
+
+    // Create a resize cache memory buffer that will hold the rows of
+    // transformed and mirror padded input pixels, ready to be copied
+    // into filter patches by im2col.
+    // It's laid out like this, in row major order in memory:
+    //         < cache line width >
+    //   ^    +--------------------+
+    // cache  |                    |
+    // height |                    |
+    //   v    +--------------------+
+    // Each cache row contains a cache_line_width number of resized pixels,
+    // each with input_depth channels. The cache height is typically less than
+    // the full height the resized image would be, so it's filled up
+    // incrementally as we progress downwards through the input creating im2col
+    // patches.
+    task_params.cache_start_x = -filter_left_offset;
+    task_params.cache_end_x =
+        (((output_width - 1) * stride_cols) - filter_left_offset) +
+        filter_width;
+    task_params.cache_line_width =
+        task_params.cache_end_x - task_params.cache_start_x;
+    task_params.cache_height =
+        kResizeCacheSize / (task_params.cache_line_width * input_depth);
+    const int needed_resize_cache_count =
+        filter_height * task_params.cache_line_width * input_depth;
+    OP_REQUIRES(context,
+                (needed_resize_cache_count * sizeof(T1)) <= kResizeCacheSize,
+                errors::InvalidArgument("Input too large for resize cache"));
+    Im2ColBufferResource<T1, kResizeCacheSize>* resize_cache_resource;
+    std::function<Status(Im2ColBufferResource<T1, kResizeCacheSize>**)>
+        resize_creator =
+            [](Im2ColBufferResource<T1, kResizeCacheSize>** resource) {
+              *resource = new Im2ColBufferResource<T1, kResizeCacheSize>();
+              return Status::OK();
+            };
+    OP_REQUIRES_OK(context, context->resource_manager()->LookupOrCreate(
+                                "Conv2d", "resize_cache",
+                                &resize_cache_resource, resize_creator));
+
+    // This means that multiple ops can't be run simultaneously on different
+    // threads, because we have a single shared resource. The platforms this is
+    // aimed at have intra-op parallelism as their focus though, so it shouldn't
+    // be an issue.
+    mutex_lock lock_buffer(im2col_buffer_resource->mu);
+    core::ScopedUnref unref_buffer(im2col_buffer_resource);
+    T1* im2col_buffer = im2col_buffer_resource->data;
+
+    // This buffer is used as a fairly heavy-weight cache for the resized and
+    // mirrored inputs to the im2col operation. The problem is that we want to
+    // keep the memory usage down by not rendering the fully resized and padded
+    // input tensor to the convolution into an entire buffer. The first approach
+    // to avoid this was to fold the bilinear filtering and padding spatial
+    // transformations into the im2col lookup itself. This successfully reduced
+    // memory usage, but because im2col can access an individual pixel for many
+    // different patches, the extra overhead of doing the same bilinear lookups
+    // repeatedly became too expensive.
+    // The resize cache is designed to avoid this problem by keeping a
+    // horizontal slice of the resized and padded input to the im2col
+    // precalculated, so that repeated accesses to the same pixel from different
+    // filter patches can just be copied from this cache. It's organized as a
+    // horizontal slice stretching across the whole virtual image, and as high
+    // as the filter window, so that as the patch processing moves across all
+    // the pixels are present, and before a new row of patches is started any
+    // previously calculated rows that are needed are maintained, with new rows
+    // calculated as required.
+    mutex_lock resize_lock_buffer(resize_cache_resource->mu);
+    core::ScopedUnref unref_resized_cache(resize_cache_resource);
+    task_params.resize_cache = resize_cache_resource->data;
+
+    const T1* input_data = input.flat<T1>().data();
+    const int64 input_height = input.shape().dim_sizes()[1];
+    task_params.input_width = input.shape().dim_sizes()[2];
+
+    int end_cached_lines = std::numeric_limits<int>::min();
+
+    for (int batch = 0; batch < input_batches; ++batch) {
+      task_params.input_batch_start =
+          input_data +
+          (batch * input_height * task_params.input_width * input_depth);
+      const int in_y_end =
+          ((output_height * stride_rows) - filter_top_offset) + filter_height;
+      for (int out_y = 0; out_y < output_height; ++out_y) {
+        const int in_y_origin = (out_y * stride_rows) - filter_top_offset;
+        const int cache_start_y = std::max(in_y_origin, end_cached_lines);
+        const int cache_end_y = std::min(
+            in_y_end, std::max((in_y_origin + task_params.cache_height),
+                               end_cached_lines));
+        if (end_cached_lines < (in_y_origin + filter_height)) {
+          // This call breaks up the work required for calculating the mirror
+          // padding and resizing across multiple threads.
+          FusedConvParallelFor(
+              context, cache_start_y, cache_end_y,
+              [task_params](int64 task_cache_start_y, int64 task_cache_end_y) {
+                // This is a long and confusing function, but it's been laid out
+                // this way to help with performance on some intensive models.
+                // What it's doing is populating a cache of the original input
+                // image, after it's been bilinear resized and had its edges
+                // mirrored. This allows the following im2col code to access the
+                // transformed pixels from this cache, without having to
+                // repeatedly apply the expensive bilinear calculations as the
+                // same pixels are accessed by different patches.
+                // This is most effective when the stride is small and the
+                // filter size is large, since that's when pixels are reused
+                // most frequently as patches overlap.
+                for (int cache_y = task_cache_start_y;
+                     cache_y < task_cache_end_y; ++cache_y) {
+                  // We organize the cache as a series of rows, each containing
+                  // all the transformed pixels for a given line in the image.
+                  // This cache is big enough to hold at least a filter's height
+                  // worth of rows, but typically more, limited by the size of
+                  // the cache buffer.
+                  // We don't allocate an entire image's worth of rows though,
+                  // because we're trying to keep memory usage down, so as we
+                  // progress downwards through the im2col we periodically
+                  // refresh the cache so that the next lines that are needed
+                  // for that operation are always present.
+                  // Work out the parameters that remain constant across the
+                  // row we're calculating.
+                  PerCacheLineParameters<T1> line_params(
+                      CalculatePerCacheLineParameters<T1>(
+                          task_params.cache_height, cache_y,
+                          task_params.resize_cache,
+                          task_params.cache_line_width, task_params.input_width,
+                          task_params.input_depth, task_params.top_padding,
+                          task_params.pad_offset, task_params.resized_height,
+                          task_params.st, task_params.input_batch_start));
+                  // Iterate through the resize cache row we're filling in.
+                  for (int cache_x = task_params.cache_start_x;
+                       cache_x < task_params.cache_end_x; ++cache_x) {
+                    // Figure out what we need for the cache pixel we're
+                    // populating.
+                    PerCachePixelParameters<T1> pixel_params(
+                        CalculatePerCachePixelParameters<T1>(
+                            cache_x, task_params.cache_start_x,
+                            line_params.cache_line_start,
+                            task_params.input_depth, task_params.left_padding,
+                            task_params.pad_offset, task_params.resized_width,
+                            task_params.st));
+                    // If the access is off the left, right, top, or bottom of
+                    // the resized image, the conv padding means we should set
+                    // it to zero.
+                    if ((cache_x < 0) ||
+                        (cache_x >= task_params.padded_width) ||
+                        (cache_y < 0) ||
+                        (cache_y >= task_params.padded_height)) {
+                      std::fill_n(pixel_params.cache_line_pixel,
+                                  task_params.input_depth, T1(0));
+                    } else {
+                      // There are two different sampling strategies for
+                      // resizing. When using nearest, we can just do a
+                      // straight copy of the pixel closest to our sample point,
+                      // but bilinear requires a more complex calculation.
+                      if (SampleMode == NEAREST) {
+                        const T1* input_top_left_pixel =
+                            line_params.input_top_row_start +
+                            (pixel_params.left_x_index *
+                             task_params.input_depth);
+
+                        std::copy_n(input_top_left_pixel,
+                                    task_params.input_depth,
+                                    pixel_params.cache_line_pixel);
+                      } else {
+                        const SampleRect<T1> rect(
+                            line_params.input_top_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_top_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.left_x_index *
+                                 task_params.input_depth),
+                            line_params.input_bottom_row_start +
+                                (pixel_params.right_x_index *
+                                 task_params.input_depth));
+                        for (int in_channel = 0;
+                             in_channel < task_params.input_depth;
+                             ++in_channel) {
+                          pixel_params.cache_line_pixel[in_channel] =
+                              rect.BilinearSample(in_channel,
+                                                  pixel_params.x_lerp,
+                                                  line_params.y_lerp);
+                        }
+                      }
+                    }
+                  }
+                }
+              });
+          end_cached_lines = cache_end_y;
+        }
+        for (int out_x = 0; out_x < output_width; ++out_x) {
+          const int in_x_origin = (out_x * stride_cols) - filter_left_offset;
+          const int patch_index = (batch * output_width * output_height) +
+                                  (out_y * output_width) + out_x;
+          const int patch_index_within_chunk = patch_index % patches_per_chunk;
+          T1* im2col_patch_start =
+              im2col_buffer + (patch_index_within_chunk * filter_value_count);
+          for (int filter_y = 0; filter_y < filter_height; ++filter_y) {
+            T1* im2col_row_start =
+                im2col_patch_start +
+                (filter_y * filter_width * task_params.input_depth);
+            const int conv_in_y = in_y_origin + filter_y;
+            int cache_index_y;
+            if (conv_in_y < 0) {
+              cache_index_y = task_params.cache_height +
+                              (conv_in_y % task_params.cache_height);
+            } else {
+              cache_index_y = conv_in_y % task_params.cache_height;
+            }
+            T1* cache_line_start =
+                task_params.resize_cache +
+                (cache_index_y * task_params.cache_line_width *
+                 task_params.input_depth);
+            T1* cache_filter_row_start =
+                cache_line_start + ((in_x_origin - task_params.cache_start_x) *
+                                    task_params.input_depth);
+            std::copy_n(cache_filter_row_start,
+                        (filter_width * task_params.input_depth),
+                        im2col_row_start);
+          }
+          const bool is_last_in_chunk =
+              (patch_index_within_chunk == (patches_per_chunk - 1));
+          const bool is_last_overall =
+              ((batch == (input_batches - 1)) &&
+               (out_y == (output_height - 1)) && (out_x == (output_width - 1)));
+          if (is_last_in_chunk || is_last_overall) {
+            // Now we've assembled a set of image patches into a matrix, apply
+            // a GEMM matrix multiply of the patches as rows, times the filter
+            // weights in columns, to get partial results in the output
+            // matrix.
+            const int how_many_patches = patch_index_within_chunk + 1;
+            const int m = how_many_patches;
+            const int n = filter_count;
+            const int k = filter_value_count;
+            const int lda = filter_value_count;
+            const int ldb = filter_count;
+            const int ldc = filter_count;
+            const size_t start_patch_index =
+                patch_index - (how_many_patches - 1);
+            T3* chunk_output_data =
+                output_data + (start_patch_index * filter_count);
+            TGemmFunctor gemm_functor;
+            gemm_functor(context, m, n, k, im2col_buffer, lda, filter_data, ldb,
+                         chunk_output_data, ldc);
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace
+
+// Implements a version of convolution with bilinear resizing and mirror padding
+// included.
+template <class T, class TConvFunctor, bool DoResize>
+class FusedResizeConv2DUsingGemmOp : public OpKernel {
+ public:
+  explicit FusedResizeConv2DUsingGemmOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    if (DoResize) {
+      OP_REQUIRES_OK(context,
+                     context->GetAttr("resize_align_corners", &align_corners_));
+    }
+    MirrorPadMode mode;
+    OP_REQUIRES_OK(context, context->GetAttr("mode", &mode));
+
+    switch (mode) {
+      case MirrorPadMode::SYMMETRIC: {
+        offset_ = 0;
+        break;
+      }
+      case MirrorPadMode::REFLECT: {
+        offset_ = 1;
+        break;
+      }
+      default:
+        OP_REQUIRES(context, false,
+                    errors::InvalidArgument(
+                        "mode must be either REFLECT or SYMMETRIC."));
+    }
+    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
+    OP_REQUIRES(context, strides_.size() == 4,
+                errors::InvalidArgument("Sliding window strides field must "
+                                        "specify 4 dimensions"));
+    const int64 stride_n = GetTensorDim(strides_, FORMAT_NHWC, 'N');
+    const int64 stride_c = GetTensorDim(strides_, FORMAT_NHWC, 'C');
+    OP_REQUIRES(
+        context, stride_n == 1 && stride_c == 1,
+        errors::InvalidArgument("Current implementation does not yet support "
+                                "strides in the batch and depth dimensions."));
+    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    // Input tensor is of the following dimensions:
+    // [ batch, in_rows, in_cols, in_depth ]
+    const Tensor& input = context->input(0);
+    OP_REQUIRES(context, (input.shape().num_elements() > 0),
+                errors::InvalidArgument("Input tensor can't be empty"));
+
+    ImageResizerState st(false);
+    if (DoResize) {
+      st = ImageResizerState(align_corners_);
+      st.ValidateAndCalculateOutputSize(context, input);
+      if (!context->status().ok()) return;
+    } else {
+      // Set up the resize parameters to do no scaling at all.
+      st.batch_size = input.dim_size(0);
+      st.out_height = input.dim_size(1);
+      st.out_width = input.dim_size(2);
+      st.in_height = input.dim_size(1);
+      st.in_width = input.dim_size(2);
+      st.channels = input.dim_size(3);
+      st.height_scale = 1.0f;
+      st.width_scale = 1.0f;
+    }
+    TensorShape resized_shape(
+        {input.dim_size(0), st.out_height, st.out_width, input.dim_size(3)});
+    int paddings_index;
+    int filter_index;
+    if (DoResize) {
+      paddings_index = 2;
+      filter_index = 3;
+    } else {
+      paddings_index = 1;
+      filter_index = 2;
+    }
+    const Tensor& paddings = context->input(paddings_index);
+
+    const int dims = resized_shape.dims();
+    OP_REQUIRES(
+        context,
+        TensorShapeUtils::IsMatrix(paddings.shape()) &&
+            paddings.dim_size(1) == 2,
+        errors::InvalidArgument("paddings must be a matrix with 2 columns: ",
+                                paddings.shape().DebugString()));
+    const int fixed_dims =
+        (allow_legacy_scalars() && dims == 0 && paddings.dim_size(0) == 1)
+            ? 1
+            : dims;
+    OP_REQUIRES(
+        context, fixed_dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            fixed_dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+    OP_REQUIRES(
+        context, dims == paddings.dim_size(0),
+        errors::InvalidArgument(
+            "The first dimension of paddings must be the rank of inputs: ",
+            dims, " ", paddings.shape().DebugString(), " ",
+            resized_shape.DebugString()));
+
+    OP_REQUIRES(
+        context, dims == 4,
+        errors::InvalidArgument(
+            "Fused mirror padding only supports four-dimensional inputs, but ",
+            dims, " requested"));
+
+    // Compute the shape of the output tensor, and allocate it.
+    TensorShape padded_shape;
+    TTypes<int32>::ConstMatrix paddings_matrix = paddings.matrix<int32>();
+    for (int d = 0; d < dims; ++d) {
+      const int32 before =
+          paddings_matrix(d, 0);  // Pad before existing elements.
+      const int32 after =
+          paddings_matrix(d, 1);  // Pad after existing elements.
+      OP_REQUIRES(context, before >= 0 && after >= 0,
+                  errors::InvalidArgument(
+                      "paddings must be non-negative: ", before, " ", after));
+      if (offset_ == 0) {  // SYMMETRIC mode.
+        OP_REQUIRES(
+            context,
+            before <= resized_shape.dim_size(d) &&
+                after <= resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be no greater "
+                                    "than the dimension size: ",
+                                    before, ", ", after, " greater than ",
+                                    resized_shape.dim_size(d)));
+      } else if (offset_ == 1) {  // REFLECT mode.
+        OP_REQUIRES(
+            context,
+            before < resized_shape.dim_size(d) &&
+                after < resized_shape.dim_size(d),
+            errors::InvalidArgument("paddings must be less than"
+                                    " the dimension size: ",
+                                    before, ", ", after, " not less than ",
+                                    resized_shape.dim_size(d)));
+      }
+      padded_shape.AddDim(before + resized_shape.dim_size(d) + after);
+    }
+
+    OP_REQUIRES(
+        context, ((paddings_matrix(0, 0) == 0) && (paddings_matrix(0, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not batches: ",
+            paddings.DebugString()));
+    OP_REQUIRES(
+        context, ((paddings_matrix(3, 0) == 0) && (paddings_matrix(3, 1) == 0)),
+        errors::InvalidArgument(
+            "Fused mirror padding only support spatial padding, not channels: ",
+            paddings.DebugString()));
+    const int32 top_padding = paddings_matrix(1, 0);
+    const int32 bottom_padding = paddings_matrix(1, 1);
+    const int32 left_padding = paddings_matrix(2, 0);
+    const int32 right_padding = paddings_matrix(2, 1);
+
+    // Input filter is of the following dimensions:
+    // [ filter_rows, filter_cols, in_depth, out_depth]
+    const Tensor& filter = context->input(filter_index);
+
+    // For 2D convolution, there should be 4 dimensions.
+    OP_REQUIRES(context, padded_shape.dims() == 4,
+                errors::InvalidArgument("input must be 4-dimensional",
+                                        padded_shape.DebugString()));
+    OP_REQUIRES(context, filter.dims() == 4,
+                errors::InvalidArgument("filter must be 4-dimensional: ",
+                                        filter.shape().DebugString()));
+
+    // We only check the first three dims, since the depth is accessed as an
+    // int64 below.
+    for (int i = 0; i < 3; i++) {
+      OP_REQUIRES(
+          context,
+          FastBoundsCheck(filter.dim_size(i), std::numeric_limits<int>::max()),
+          errors::InvalidArgument("filter too large"));
+    }
+
+    // The last dimension for input is in_depth. It must be the same as the
+    // filter's in_depth.
+    const int64 in_depth = padded_shape.dim_size(3);
+    OP_REQUIRES(context, in_depth == filter.dim_size(2),
+                errors::InvalidArgument(
+                    "input and filter must have the same depth: ", in_depth,
+                    " vs ", filter.dim_size(2)));
+
+    // The last dimension for filter is out_depth.
+    const int out_depth = static_cast<int>(filter.dim_size(3));
+
+    // The second dimension for input is rows/height.
+    // The first dimension for filter is rows/height.
+    const int64 padded_rows_raw = padded_shape.dim_size(1);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_rows_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input rows too large"));
+    const int padded_rows = static_cast<int>(padded_rows_raw);
+    const int filter_rows = static_cast<int>(filter.dim_size(0));
+    const int resized_rows = static_cast<int>(resized_shape.dim_size(1));
+
+    // The third dimension for input is columns/width.
+    // The second dimension for filter is columns/width.
+    const int64 padded_cols_raw = padded_shape.dim_size(2);
+    OP_REQUIRES(
+        context,
+        FastBoundsCheck(padded_cols_raw, std::numeric_limits<int>::max()),
+        errors::InvalidArgument("Input cols too large"));
+    const int padded_cols = static_cast<int>(padded_cols_raw);
+    const int filter_cols = static_cast<int>(filter.dim_size(1));
+    const int resized_cols = static_cast<int>(resized_shape.dim_size(2));
+
+    // The first dimension for input is batch.
+    const int64 batch_raw = padded_shape.dim_size(0);
+    OP_REQUIRES(context,
+                FastBoundsCheck(batch_raw, std::numeric_limits<int>::max()),
+                errors::InvalidArgument("batch is too large"));
+    const int batch = static_cast<int>(batch_raw);
+
+    // For now we take the stride from the second and third dimensions only (we
+    // do not support striding on the batch or depth dimension).
+    const int stride_rows = GetTensorDim(strides_, FORMAT_NHWC, 'H');
+    const int stride_cols = GetTensorDim(strides_, FORMAT_NHWC, 'W');
+
+    int64 out_rows = 0, out_cols = 0, pad_rows = 0, pad_cols = 0;
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_rows, filter_rows, stride_rows,
+                                         padding_, &out_rows, &pad_rows));
+    OP_REQUIRES_OK(context,
+                   GetWindowedOutputSize(padded_cols, filter_cols, stride_cols,
+                                         padding_, &out_cols, &pad_cols));
+    TensorShape out_shape =
+        ShapeFromFormat(FORMAT_NHWC, batch, out_rows, out_cols, out_depth);
+    OP_REQUIRES(context, (out_shape.num_elements() > 0),
+                errors::InvalidArgument("Output tensor can't be empty"));
+
+    // Output tensor is of the following dimensions:
+    // [ in_batch, out_rows, out_cols, out_depth ]
+    Tensor* output = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));
+
+    VLOG(2) << "FusedConv2D: " << name() << ", in_depth = " << in_depth
+            << ", padded_cols = " << padded_cols
+            << ", resized_cols = " << resized_cols
+            << ", filter_cols = " << filter_cols
+            << ", padded_rows = " << padded_rows
+            << ", resized_rows = " << resized_rows
+            << ", filter_rows = " << filter_rows
+            << ", stride_rows = " << stride_rows
+            << ", stride_cols = " << stride_cols
+            << ", out_depth = " << out_depth << ", DoResize=" << DoResize;
+
+    // If there is nothing to compute, return.
+    if (out_shape.num_elements() == 0) {
+      return;
+    }
+    TConvFunctor conv_functor;
+    conv_functor(context, input, batch, resized_rows, resized_cols, padded_rows,
+                 padded_cols, in_depth, filter.flat<T>().data(), filter_rows,
+                 filter_cols, out_depth, stride_rows, stride_cols, padding_,
+                 output->flat<T>().data(), out_rows, out_cols, st, top_padding,
+                 bottom_padding, left_padding, right_padding, offset_);
+  }
+
+ private:
+  std::vector<int32> strides_;
+  Padding padding_;
+  bool align_corners_;
+  int offset_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(FusedResizeConv2DUsingGemmOp);
+};
+
+#define REGISTER_FUSED(T)                                                 \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedResizeAndPadConv2D")                                     \
+          .Device(DEVICE_CPU)                                             \
+          .TypeConstraint<T>("T"),                                        \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       BILINEAR>,                         \
+          true>);
+
+TF_CALL_half(REGISTER_FUSED);
+TF_CALL_float(REGISTER_FUSED);
+TF_CALL_double(REGISTER_FUSED);
+
+#define REGISTER_PAD_ONLY_FUSED(T)                                        \
+  REGISTER_KERNEL_BUILDER(                                                \
+      Name("FusedPadConv2D").Device(DEVICE_CPU).TypeConstraint<T>("T"),   \
+      FusedResizeConv2DUsingGemmOp<                                       \
+          T,                                                              \
+          FusedResizeAndPadConvFunctor<T, T, T, FastGemmFunctor<T, T, T>, \
+                                       NEAREST>,                          \
+          false>);
+
+TF_CALL_half(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_float(REGISTER_PAD_ONLY_FUSED);
+TF_CALL_double(REGISTER_PAD_ONLY_FUSED);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/conv_ops_test.cc b/tensorflow/core/kernels/conv_ops_test.cc
index 6421cad367e4971c61d9e3e467595c59271ecd9c..bf98acdecfd1a3b8a946648c105f0d313f2296ab 100644
--- a/tensorflow/core/kernels/conv_ops_test.cc
+++ b/tensorflow/core/kernels/conv_ops_test.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
 #include "tensorflow/cc/ops/const_op.h"
 #include "tensorflow/cc/ops/image_ops.h"
 #include "tensorflow/cc/ops/nn_ops.h"
@@ -27,6 +30,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/platform/test.h"
 #include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/public/session.h"
 
 namespace tensorflow {
@@ -522,6 +526,7 @@ TEST_F(ConvOpTest, HandwrittenConv) { HandwrittenConv(); }
 
 TEST_F(ConvOpTest, AnisotropicStride) { AnisotropicStrides(); }
 
+template <typename T>
 class FusedConv2DOpTest : public OpsTestBase {
  protected:
   static constexpr int kDepth = 3;
@@ -529,10 +534,15 @@ class FusedConv2DOpTest : public OpsTestBase {
   static constexpr int kImageHeight = 32;
   static constexpr int kImageBatchCount = 8;
 
-  using GraphRunner =
+  using BiasAddGraphRunner =
       std::function<void(const Tensor& input_data, const Tensor& filter_data,
                          const Tensor& bias_data, Tensor* out)>;
 
+  using BatchNormGraphRunner = std::function<void(
+      const Tensor& input_data, const Tensor& filter_data,
+      const Tensor& scale_data, const Tensor& offset_data,
+      const Tensor& mean_data, const Tensor& variance_data, Tensor* out)>;
+
   // Runs a Tensorflow graph defined by the root scope, and fetches the result
   // of 'fetch' node into the output Tensor.
   void RunAndFetch(const tensorflow::Scope& root, const string& fetch,
@@ -540,8 +550,22 @@ class FusedConv2DOpTest : public OpsTestBase {
     tensorflow::GraphDef graph;
     TF_ASSERT_OK(root.ToGraphDef(&graph));
 
+    // `FusedConv2D` is available only on CPU, and in this test we don't want to
+    // compare GPU vs CPU numbers, so place all nodes on CPU.
+    for (NodeDef& mutable_node : *graph.mutable_node()) {
+      mutable_node.set_device("/device:CPU:0");
+    }
+
+    // Disable Grappler constant folding for the test graphs.
+    tensorflow::SessionOptions session_options;
+    tensorflow::RewriterConfig* cfg =
+        session_options.config.mutable_graph_options()
+            ->mutable_rewrite_options();
+    cfg->set_constant_folding(tensorflow::RewriterConfig::OFF);
+
     std::unique_ptr<tensorflow::Session> session(
-        tensorflow::NewSession(tensorflow::SessionOptions()));
+        tensorflow::NewSession(session_options));
+
     TF_ASSERT_OK(session->Create(graph));
 
     std::vector<Tensor> unfused_tensors;
@@ -550,8 +574,9 @@ class FusedConv2DOpTest : public OpsTestBase {
     *output = unfused_tensors[0];
   }
 
-  void RunConv2DOp(const Tensor& input_data, const Tensor& filter_data,
-                   const Tensor& bias_data, Tensor* output, int stride = 1) {
+  void RunConv2DWithBias(const Tensor& input_data, const Tensor& filter_data,
+                         const Tensor& bias_data, Tensor* output,
+                         int stride = 1) {
     auto root = tensorflow::Scope::NewRootScope();
 
     auto conv = ops::Conv2D(
@@ -567,9 +592,10 @@ class FusedConv2DOpTest : public OpsTestBase {
     RunAndFetch(root, "with_bias", output);
   }
 
-  void RunConv2DWithReluOp(const Tensor& input_data, const Tensor& filter_data,
-                           const Tensor& bias_data, Tensor* output,
-                           int stride = 1) {
+  void RunConv2DWithBiasAndRelu(const Tensor& input_data,
+                                const Tensor& filter_data,
+                                const Tensor& bias_data, Tensor* output,
+                                int stride = 1) {
     auto root = tensorflow::Scope::NewRootScope();
 
     auto conv = ops::Conv2D(
@@ -587,18 +613,79 @@ class FusedConv2DOpTest : public OpsTestBase {
     RunAndFetch(root, "with_relu", output);
   }
 
-  template <typename T>
+  void RunConv2DWithBatchNorm(const Tensor& input_data,
+                              const Tensor& filter_data,
+                              const Tensor& scale_data,
+                              const Tensor& offset_data,
+                              const Tensor& mean_data,
+                              const Tensor& variance_data, Tensor* output,
+                              int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    RunAndFetch(root, "with_fused_batch_norm", output);
+  }
+
+  void RunConv2DWithBatchNormAndRelu(const Tensor& input_data,
+                                     const Tensor& filter_data,
+                                     const Tensor& scale_data,
+                                     const Tensor& offset_data,
+                                     const Tensor& mean_data,
+                                     const Tensor& variance_data,
+                                     Tensor* output, int stride = 1) {
+    auto root = tensorflow::Scope::NewRootScope();
+
+    auto conv = ops::Conv2D(
+        root.WithOpName("conv"),
+        ops::Const(root.WithOpName("input"), Input::Initializer(input_data)),
+        ops::Const(root.WithOpName("filter"), Input::Initializer(filter_data)),
+        {1, stride, stride, 1}, "SAME");
+
+    ops::FusedBatchNorm::Attrs attr;
+    attr = attr.IsTraining(false);
+
+    auto with_fused_batch_norm = ops::FusedBatchNorm(
+        root.WithOpName("with_fused_batch_norm"), conv,
+        ops::Const(root.WithOpName("scale"), Input::Initializer(scale_data)),
+        ops::Const(root.WithOpName("offset"), Input::Initializer(offset_data)),
+        ops::Const(root.WithOpName("mean"), Input::Initializer(mean_data)),
+        ops::Const(root.WithOpName("var"), Input::Initializer(variance_data)),
+        attr);
+
+    auto with_relu =
+        ops::Relu(root.WithOpName("with_relu"), with_fused_batch_norm.y);
+
+    RunAndFetch(root, "with_relu", output);
+  }
+
   void RunFusedConv2DOp(const Tensor& image, const Tensor& filter,
-                        const Tensor& bias,
+                        const std::vector<Tensor>& args,
                         const std::vector<string>& fused_ops, Tensor* output,
                         int stride = 1) {
     DataType dtype = DataTypeToEnum<T>::v();
+    int num_args = static_cast<int>(args.size());
 
     TF_EXPECT_OK(NodeDefBuilder("fused_conv_op", "_FusedConv2D")
                      .Input(FakeInput(dtype))
                      .Input(FakeInput(dtype))
-                     .Attr("num_args", 1)
-                     .Input(FakeInput(dtype))
+                     .Attr("num_args", num_args)
+                     .Input(FakeInput(num_args, dtype))
                      .Attr("T", dtype)
                      .Attr("strides", {1, stride, stride, 1})
                      .Attr("padding", "SAME")
@@ -609,27 +696,32 @@ class FusedConv2DOpTest : public OpsTestBase {
 
     AddInputFromArray<T>(image.shape(), image.flat<T>());
     AddInputFromArray<T>(filter.shape(), filter.flat<T>());
-    AddInputFromArray<T>(bias.shape(), bias.flat<T>());
+    for (const Tensor& arg : args)
+      AddInputFromArray<T>(arg.shape(), arg.flat<T>());
     TF_ASSERT_OK(RunOpKernel());
 
     *output = *GetOutput(0);
   }
 
-  template <typename T>
-  void VerifyTensorsNear(int depth, int image_width, int image_height,
-                         int image_batch_count, int filter_size,
-                         int filter_count, const GraphRunner& run_default,
-                         const GraphRunner& run_fused) {
+  void VerifyBiasAddTensorsNear(int depth, int image_width, int image_height,
+                                int image_batch_count, int filter_size,
+                                int filter_count,
+                                const BiasAddGraphRunner& run_default,
+                                const BiasAddGraphRunner& run_fused) {
     DataType dtype = DataTypeToEnum<T>::v();
+
     Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
     image.flat<T>() = image.flat<T>().setRandom();
 
+    // Add some negative values to filter to properly test Relu.
     Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
     filter.flat<T>() = filter.flat<T>().setRandom();
+    filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
 
     const int bias_size = filter_count;
     Tensor bias(dtype, {bias_size});
     bias.flat<T>() = bias.flat<T>().setRandom();
+    bias.flat<T>() += bias.flat<T>().constant(static_cast<T>(0.5f));
 
     Tensor conv_2d;
     Tensor fused_conv_2d;
@@ -640,114 +732,291 @@ class FusedConv2DOpTest : public OpsTestBase {
     ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
     ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
 
-    test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-5);
+    // NOTE(ezhulenev): When filter size is equal to the input image size, we
+    // effectevily do element-wise product and full sum reduction, and these
+    // operations intoroduce higher than "normal" numerical errors.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d);
+    }
+  }
+
+  void VerifyFusedBatchNormTensorsNear(int depth, int image_width,
+                                       int image_height, int image_batch_count,
+                                       int filter_size, int filter_count,
+                                       const BatchNormGraphRunner& run_default,
+                                       const BatchNormGraphRunner& run_fused) {
+    DataType dtype = DataTypeToEnum<T>::v();
+
+    Tensor image(dtype, {image_batch_count, image_height, image_width, depth});
+    image.flat<T>() = image.flat<T>().setRandom();
+
+    // Add some negative values to filter to properly test Relu.
+    Tensor filter(dtype, {filter_size, filter_size, depth, filter_count});
+    filter.flat<T>() = filter.flat<T>().setRandom();
+    filter.flat<T>() -= filter.flat<T>().constant(static_cast<T>(0.5f));
+
+    const int scale_size = filter_count;
+
+    Tensor scale(dtype, {scale_size});
+    scale.flat<T>() = scale.flat<T>().setRandom();
+
+    Tensor offset(dtype, {scale_size});
+    offset.flat<T>() = offset.flat<T>().setRandom();
+
+    Tensor mean(dtype, {scale_size});
+    mean.flat<T>() = mean.flat<T>().setRandom();
+
+    Tensor variance(dtype, {scale_size});
+    variance.flat<T>() = variance.flat<T>().setRandom();
+    variance.flat<T>() += variance.flat<T>().constant(static_cast<T>(0.5f));
+
+    Tensor conv_2d;
+    Tensor fused_conv_2d;
+
+    run_default(image, filter, scale, offset, mean, variance, &conv_2d);
+    run_fused(image, filter, scale, offset, mean, variance, &fused_conv_2d);
+
+    ASSERT_EQ(conv_2d.dtype(), fused_conv_2d.dtype());
+    ASSERT_EQ(conv_2d.shape(), fused_conv_2d.shape());
+
+    // NOTE(ezhulenev): When filter size is equal to the input image size, we
+    // effectevily do element-wise product and full sum reduction, and these
+    // operations intoroduce higher than "normal" numerical errors.
+    if (image_width == filter_size && image_height == filter_size) {
+      test::ExpectTensorNear<T>(conv_2d, fused_conv_2d, 1e-3);
+    } else {
+      test::ExpectClose(conv_2d, fused_conv_2d);
+    }
   }
 
   // Verifies that computing Conv2D+BiasAdd in a graph is identical to
   // FusedConv2D.
-  template <typename T>
-  void VerifyConv2DWithBias(int depth, int image_width, int image_height,
-                            int image_batch_count, int filter_size,
-                            int filter_count) {
-    const GraphRunner run_default =
+  void VerifyConv2DWithBias(int filter_size, int filter_count,
+                            int depth = kDepth, int image_width = kImageWidth,
+                            int image_height = kImageHeight,
+                            int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
         [this](const Tensor& input_data, const Tensor& filter_data,
                const Tensor& bias_data, Tensor* out) {
-          RunConv2DOp(input_data, filter_data, bias_data, out);
+          RunConv2DWithBias(input_data, filter_data, bias_data, out);
         };
 
-    const GraphRunner run_fused = [this](const Tensor& input_data,
-                                         const Tensor& filter_data,
-                                         const Tensor& bias_data, Tensor* out) {
-      RunFusedConv2DOp<T>(input_data, filter_data, bias_data, {"BiasAdd"}, out);
+    const BiasAddGraphRunner run_fused = [this](const Tensor& input_data,
+                                                const Tensor& filter_data,
+                                                const Tensor& bias_data,
+                                                Tensor* out) {
+      RunFusedConv2DOp(input_data, filter_data, {bias_data}, {"BiasAdd"}, out);
     };
 
-    VerifyTensorsNear<T>(depth, image_width, image_height, image_batch_count,
-                         filter_size, filter_count, run_default, run_fused);
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
   }
 
   // Verifies that computing Conv2D+BiasAdd+Relu in a graph is identical to
   // FusedConv2D.
-  template <typename T>
-  void VerifyConv2DWithBiasAndRelu(int depth, int image_width, int image_height,
-                                   int image_batch_count, int filter_size,
-                                   int filter_count) {
-    const GraphRunner run_default =
+  void VerifyConv2DWithBiasAndRelu(int filter_size, int filter_count,
+                                   int depth = kDepth,
+                                   int image_width = kImageWidth,
+                                   int image_height = kImageHeight,
+                                   int image_batch_count = kImageBatchCount) {
+    const BiasAddGraphRunner run_default =
         [this](const Tensor& input_data, const Tensor& filter_data,
                const Tensor& bias_data, Tensor* out) {
-          RunConv2DWithReluOp(input_data, filter_data, bias_data, out);
+          RunConv2DWithBiasAndRelu(input_data, filter_data, bias_data, out);
         };
 
-    const GraphRunner run_fused = [this](const Tensor& input_data,
-                                         const Tensor& filter_data,
-                                         const Tensor& bias_data, Tensor* out) {
-      RunFusedConv2DOp<T>(input_data, filter_data, bias_data,
-                          {"BiasAdd", "Relu"}, out);
-    };
+    const BiasAddGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& bias_data, Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data, {bias_data},
+                           {"BiasAdd", "Relu"}, out);
+        };
 
-    VerifyTensorsNear<T>(depth, image_width, image_height, image_batch_count,
-                         filter_size, filter_count, run_default, run_fused);
+    VerifyBiasAddTensorsNear(depth, image_width, image_height,
+                             image_batch_count, filter_size, filter_count,
+                             run_default, run_fused);
   }
-};
 
-#define FUSED_CONV2D_TESTS(dtype, name)                                       \
-  TEST_F(FusedConv2DOpTest, Conv2DWithBiasAddOneByOneConvolution##name) {     \
-    const int filter_size = 1;                                                \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBias<dtype>(kDepth, kImageWidth, kImageHeight,            \
-                                kImageBatchCount, filter_size, filter_count); \
-  }                                                                           \
-                                                                              \
-  TEST_F(FusedConv2DOpTest, Conv2DWithBiasAddImageSizeConvolution##name) {    \
-    const int filter_size = 32;                                               \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBias<dtype>(kDepth, kImageWidth, kImageHeight,            \
-                                kImageBatchCount, filter_size, filter_count); \
-  }                                                                           \
-                                                                              \
-  TEST_F(FusedConv2DOpTest, Conv2DWithBiasAddSpatialConvolution##name) {      \
-    const int filter_size = 3;                                                \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBias<dtype>(kDepth, kImageWidth, kImageHeight,            \
-                                kImageBatchCount, filter_size, filter_count); \
-  }                                                                           \
-                                                                              \
-  TEST_F(FusedConv2DOpTest,                                                   \
-         Conv2DWithBiasAddAndReluOneByOneConvolution##name) {                 \
-    const int filter_size = 1;                                                \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBiasAndRelu<dtype>(kDepth, kImageWidth, kImageHeight,     \
-                                       kImageBatchCount, filter_size,         \
-                                       filter_count);                         \
-  }                                                                           \
-                                                                              \
-  TEST_F(FusedConv2DOpTest,                                                   \
-         Conv2DWithBiasAddAndReluImageSizeConvolution##name) {                \
-    const int filter_size = 32;                                               \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBiasAndRelu<dtype>(kDepth, kImageWidth, kImageHeight,     \
-                                       kImageBatchCount, filter_size,         \
-                                       filter_count);                         \
-  }                                                                           \
-                                                                              \
-  TEST_F(FusedConv2DOpTest,                                                   \
-         Conv2DWithBiasAddAndReluSpatialConvolution##name) {                  \
-    const int filter_size = 3;                                                \
-    const int filter_count = 12;                                              \
-                                                                              \
-    VerifyConv2DWithBiasAndRelu<dtype>(kDepth, kImageWidth, kImageHeight,     \
-                                       kImageBatchCount, filter_size,         \
-                                       filter_count);                         \
+  // Verifies that computing Conv2D+FusedBatchNorm in a graph is identical to
+  // FusedConv2D.
+  void VerifyConv2DWithBatchNorm(int filter_size, int filter_count,
+                                 int depth = kDepth,
+                                 int image_width = kImageWidth,
+                                 int image_height = kImageHeight,
+                                 int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNorm(input_data, filter_data, scale_data,
+                                 offset_data, mean_data, variance_data, out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
+  }
+
+  // Verifies that computing Conv2D+FusedBatchNorm+Relu in a graph is identical
+  // to FusedConv2D.
+  void VerifyConv2DWithBatchNormAndRelu(
+      int filter_size, int filter_count, int depth = kDepth,
+      int image_width = kImageWidth, int image_height = kImageHeight,
+      int image_batch_count = kImageBatchCount) {
+    const BatchNormGraphRunner run_default =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunConv2DWithBatchNormAndRelu(input_data, filter_data, scale_data,
+                                        offset_data, mean_data, variance_data,
+                                        out);
+        };
+
+    const BatchNormGraphRunner run_fused =
+        [this](const Tensor& input_data, const Tensor& filter_data,
+               const Tensor& scale_data, const Tensor& offset_data,
+               const Tensor& mean_data, const Tensor& variance_data,
+               Tensor* out) {
+          RunFusedConv2DOp(input_data, filter_data,
+                           {scale_data, offset_data, mean_data, variance_data},
+                           {"FusedBatchNorm", "Relu"}, out);
+        };
+
+    VerifyFusedBatchNormTensorsNear(depth, image_width, image_height,
+                                    image_batch_count, filter_size,
+                                    filter_count, run_default, run_fused);
   }
+};
+
+// Conv2D with BatchNorm can be tested only with `T=float`, because default
+// `FusedBatchNorm` kernel supports only floats for scale, mean and variance.
 
-FUSED_CONV2D_TESTS(float, F);
-FUSED_CONV2D_TESTS(double, D);
+template <typename T>
+class FusedConv2DWithBiasOpTest : public FusedConv2DOpTest<T> {};
+template <typename T>
+class FusedConv2DWithBatchNormOpTest : public FusedConv2DOpTest<T> {};
 
-#undef FUSED_CONV2D_TESTS
+TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest);
+TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest);
+
+// -------------------------------------------------------------------------- //
+// Conv2D + BiasAdd + {Relu}                                                  //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBias(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBiasOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBiasAndRelu(filter_size, filter_count);
+}
+
+// -------------------------------------------------------------------------- //
+// Conv2D + FusedBatchNorm + {Relu}                                           //
+// -------------------------------------------------------------------------- //
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolution) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolution) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolution) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNorm(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, OneByOneConvolutionAndRelu) {
+  const int filter_size = 1;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, ImageSizeConvolutionAndRelu) {
+  const int filter_size = TestFixture::kImageWidth;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+TYPED_TEST_P(FusedConv2DWithBatchNormOpTest, SpatialConvolutionAndRelu) {
+  const int filter_size = 3;
+  const int filter_count = 12;
+  this->VerifyConv2DWithBatchNormAndRelu(filter_size, filter_count);
+}
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBiasOpTest,    //
+                           OneByOneConvolution,          //
+                           ImageSizeConvolution,         //
+                           SpatialConvolution,           //
+                           OneByOneConvolutionAndRelu,   //
+                           ImageSizeConvolutionAndRelu,  //
+                           SpatialConvolutionAndRelu);
+
+REGISTER_TYPED_TEST_CASE_P(FusedConv2DWithBatchNormOpTest,  //
+                           OneByOneConvolution,             //
+                           ImageSizeConvolution,            //
+                           SpatialConvolution,              //
+                           OneByOneConvolutionAndRelu,      //
+                           ImageSizeConvolutionAndRelu,     //
+                           SpatialConvolutionAndRelu);
+
+using FusedBiasAddDataTypes = ::testing::Types<float, double>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBiasOpTest,
+                              FusedBiasAddDataTypes);
+
+using FusedBatchNormDataTypes = ::testing::Types<float>;
+INSTANTIATE_TYPED_TEST_CASE_P(Test, FusedConv2DWithBatchNormOpTest,
+                              FusedBatchNormDataTypes);
 
 ////////////////////////////////////////////////////////////////////////////////
 // Performance benchmarks for the FusedConv2DWithBiasOp.                      //
@@ -771,6 +1040,19 @@ struct Conv2DWithBiasAndReluGraph {
   Node* relu;
 };
 
+struct Conv2DWithBatchNormGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+};
+
+struct Conv2DWithBatchNormAndReluGraph {
+  Graph* graph;
+  Node* conv2d;
+  Node* batch_norm;
+  Node* relu;
+};
+
 static Tensor MakeRandomTensor(const TensorShape& shape) {
   Tensor tensor(DT_FLOAT, TensorShape(shape));
   tensor.flat<float>() = tensor.flat<float>().setRandom();
@@ -800,7 +1082,7 @@ static Conv2DGraph Conv2D(int batch, int height, int width, int in_depth,
   return {graph, conv2d};
 }
 
-// Creates a Tensorflow graph with a Conv2D node followed by Relu.
+// Creates a Tensorflow graph with a Conv2D node followed by BiasAdd.
 static Conv2DWithBiasGraph Conv2DWithBias(int batch, int height, int width,
                                           int in_depth, int filter_w,
                                           int filter_h, int out_depth) {
@@ -846,11 +1128,68 @@ static Conv2DWithBiasAndReluGraph Conv2DWithBiasAndRelu(int batch, int height,
   return {graph, conv2d, bias, relu};
 }
 
-// Creates a tensorflow graph with a single FusedConv2D node and fuses into it
-// additional computations (e.g. BiasAdd or Relu).
-static Graph* FusedConv2D(int batch, int height, int width, int in_depth,
-                          int filter_w, int filter_h, int out_depth,
-                          const std::vector<string>& fused_ops = {}) {
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm.
+static Conv2DWithBatchNormGraph Conv2DWithBatchNorm(int batch, int height,
+                                                    int width, int in_depth,
+                                                    int filter_w, int filter_h,
+                                                    int out_depth) {
+  Conv2DGraph conv_graph =
+      Conv2D(batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  Node* out;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("batch_norm"), "FusedBatchNorm")
+                  .Input(conv2d)
+                  .Input(scale)
+                  .Input(offset)
+                  .Input(mean)
+                  .Input(variance)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("is_training", false)
+                  .Finalize(graph, &out));
+
+  return {graph, conv2d, out};
+}
+
+// Creates a Tensorflow graph with a Conv2D node followed by FusedBatchNorm and
+// Relu.
+static Conv2DWithBatchNormAndReluGraph Conv2DWithBatchNormAndRelu(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth) {
+  Conv2DWithBatchNormGraph conv_graph = Conv2DWithBatchNorm(
+      batch, height, width, in_depth, filter_w, filter_h, out_depth);
+
+  Graph* graph = conv_graph.graph;
+  Node* conv2d = conv_graph.conv2d;
+  Node* batch_norm = conv_graph.batch_norm;
+
+  Node* relu;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("relu"), "Relu")
+                  .Input(batch_norm)
+                  .Attr("T", DT_FLOAT)
+                  .Finalize(graph, &relu));
+
+  return {graph, conv2d, batch_norm, relu};
+}
+
+// Creates a tensorflow graph with a single FusedConv2D (with BiasAdd) node and
+// fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBias(int batch, int height, int width,
+                                  int in_depth, int filter_w, int filter_h,
+                                  int out_depth,
+                                  const std::vector<string>& fused_ops = {}) {
   Graph* graph = new Graph(OpRegistry::Global());
 
   Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
@@ -878,6 +1217,53 @@ static Graph* FusedConv2D(int batch, int height, int width, int in_depth,
   return graph;
 }
 
+// Creates a tensorflow graph with a single FusedConv2D (with FusedBatchNorm)
+// node and fuses into it additional computations (e.g. Relu).
+static Graph* FusedConv2DWithBatchNorm(
+    int batch, int height, int width, int in_depth, int filter_w, int filter_h,
+    int out_depth, const std::vector<string>& fused_ops = {}) {
+  Graph* graph = new Graph(OpRegistry::Global());
+
+  Tensor images_t = MakeRandomTensor({batch, height, width, in_depth});
+  Tensor filter_t = MakeRandomTensor({filter_w, filter_h, in_depth, out_depth});
+  Tensor scale_t = MakeRandomTensor({out_depth});
+  Tensor offset_t = MakeRandomTensor({out_depth});
+  Tensor mean_t = MakeRandomTensor({out_depth});
+  Tensor variance_t = MakeRandomTensor({out_depth});
+
+  Node* images = test::graph::Constant(graph, images_t, "images");
+  Node* filter = test::graph::Constant(graph, filter_t, "filter");
+  Node* scale = test::graph::Constant(graph, scale_t, "scale");
+  Node* offset = test::graph::Constant(graph, offset_t, "offset");
+  Node* mean = test::graph::Constant(graph, mean_t, "mean");
+  Node* variance = test::graph::Constant(graph, variance_t, "variance");
+
+  std::vector<NodeBuilder::NodeOut> args = {scale, offset, mean, variance};
+
+  Node* conv;
+  TF_CHECK_OK(NodeBuilder(graph->NewName("conv"), "_FusedConv2D")
+                  .Input(images)
+                  .Input(filter)
+                  .Attr("num_args", 4)
+                  .Input(args)
+                  .Attr("T", DT_FLOAT)
+                  .Attr("strides", {1, 1, 1, 1})
+                  .Attr("padding", "SAME")
+                  .Attr("fused_ops", fused_ops)
+                  .Finalize(graph, &conv));
+
+  return graph;
+}
+
+// Macro arguments names: --------------------------------------------------- //
+//    N: batch size
+//    H: height
+//    W: width
+//    C: channels
+//   FC: filter count
+//   FH: filter height
+//   FW: filter width
+
 #define BM_SETUP(N, H, W, C, type, LABEL, NAME)                               \
   testing::ItemsProcessed(static_cast<int64>(iters) * (N) * (H) * (W) * (C)); \
   testing::SetLabel(LABEL);
@@ -911,26 +1297,73 @@ static Graph* FusedConv2D(int batch, int height, int width, int in_depth,
   }                                                                       \
   BENCHMARK(BM_NAME(BM_Conv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
 
-#define BM_FusedConv2D(N, H, W, C, FW, FH, FC, type, LABEL)                  \
-  static void BM_NAME(BM_FusedConv2D, type, N, H, W, C, FW, FH,              \
+#define BM_FusedConv2DWithBias(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type,                                                    \
+                    FusedConv2DWithBias(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBias, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBiasAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type, FusedConv2DWithBias(N, H, W, C, FW, FH, FC,         \
+                                               {"BiasAdd", "Relu"}))           \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_FusedConv2DWithBiasAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)           \
+  static void BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH,       \
+                      FC)(int iters) {                                        \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
+    test::Benchmark(#type, Conv2DWithBatchNorm(N, H, W, C, FW, FH, FC).graph) \
+        .Run(iters);                                                          \
+  }                                                                           \
+  BENCHMARK(BM_NAME(BM_Conv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
+
+#define BM_Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, \
+                      FC)(int iters) {                                         \
+    BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                 \
+    test::Benchmark(#type,                                                     \
+                    Conv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC).graph)  \
+        .Run(iters);                                                           \
+  }                                                                            \
+  BENCHMARK(                                                                   \
+      BM_NAME(BM_Conv2DWithBatchNormAndRelu, type, N, H, W, C, FW, FH, FC));
+
+#define BM_FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC, type, LABEL)     \
+  static void BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, \
                       FC)(int iters) {                                       \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                               \
-    test::Benchmark(#type, FusedConv2D(N, H, W, C, FW, FH, FC, {"BiasAdd"})) \
+    test::Benchmark(#type, FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,  \
+                                                    {"FusedBatchNorm"}))     \
         .Run(iters);                                                         \
   }                                                                          \
-  BENCHMARK(BM_NAME(BM_FusedConv2D, type, N, H, W, C, FW, FH, FC));
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNorm, type, N, H, W, C, FW, FH, FC));
 
-#define BM_FusedConv2DAndRelu(N, H, W, C, FW, FH, FC, type, LABEL)            \
-  static void BM_NAME(BM_FusedConv2DAndRelu, type, N, H, W, C, FW, FH,        \
-                      FC)(int iters) {                                        \
+#define BM_FusedConv2DWithBatchNormAndRelu(N, H, W, C, FW, FH, FC, type,      \
+                                           LABEL)                             \
+  static void BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C,   \
+                      FW, FH, FC)(int iters) {                                \
     BM_SETUP(N, H, W, C, type, LABEL, Conv2D);                                \
     test::Benchmark(#type,                                                    \
-                    FusedConv2D(N, H, W, C, FW, FH, FC, {"BiasAdd", "Relu"})) \
+                    FusedConv2DWithBatchNorm(N, H, W, C, FW, FH, FC,          \
+                                             {"FusedBatchNorm", "Relu"}))     \
         .Run(iters);                                                          \
   }                                                                           \
-  BENCHMARK(BM_NAME(BM_FusedConv2DAndRelu, type, N, H, W, C, FW, FH, FC));
+  BENCHMARK(BM_NAME(BM_FusedConv2DWithBatchNormAndRelu, type, N, H, W, C, FW, \
+                    FH, FC));
 
+// -------------------------------------------------------------------------- //
 // Pixel CNN convolutions.
+// -------------------------------------------------------------------------- //
 
 // 1x1 Convolution: MatMulFunctor
 
@@ -938,6 +1371,8 @@ BM_Conv2D(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
 BM_Conv2D(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
 BM_Conv2D(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
 
+// 1) BiasAdd {+ Relu}
+
 BM_Conv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
 BM_Conv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
 BM_Conv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
@@ -946,20 +1381,44 @@ BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
 BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
 BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
 
-BM_FusedConv2D(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
-BM_FusedConv2D(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
-BM_FusedConv2D(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+BM_FusedConv2DWithBias(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
 
-BM_FusedConv2DAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
-BM_FusedConv2DAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 16");
-BM_FusedConv2DAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 32");
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 1, 1, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 1, 1, 1024, cpu,
+                                   "1x1 /b 32");
 
+// -------------------------------------------------------------------------- //
 // 3x3 Convolution: SpatialConvolution
+// -------------------------------------------------------------------------- //
 
 BM_Conv2D(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
 BM_Conv2D(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
 BM_Conv2D(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
 
+// 1) BiasAdd {+ Relu}
+
 BM_Conv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
 BM_Conv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
 BM_Conv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
@@ -968,12 +1427,32 @@ BM_Conv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
 BM_Conv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
 BM_Conv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
 
-BM_FusedConv2D(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
-BM_FusedConv2D(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
-BM_FusedConv2D(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+BM_FusedConv2DWithBias(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBias(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBias(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBiasAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBiasAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_FusedConv2DWithBiasAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+// 2) FusedBatchNorm {+ Relu}
+
+BM_Conv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_Conv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_Conv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
+
+BM_Conv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_Conv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
+BM_Conv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+
+BM_FusedConv2DWithBatchNorm(8, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 8");
+BM_FusedConv2DWithBatchNorm(16, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 16");
+BM_FusedConv2DWithBatchNorm(32, 32, 32, 128, 3, 3, 1024, cpu, "1x1 /b 32");
 
-BM_FusedConv2DAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
-BM_FusedConv2DAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 16");
-BM_FusedConv2DAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 32");
+BM_FusedConv2DWithBatchNormAndRelu(8, 32, 32, 128, 3, 3, 1024, cpu, "3x3 /b 8");
+BM_FusedConv2DWithBatchNormAndRelu(16, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 16");
+BM_FusedConv2DWithBatchNormAndRelu(32, 32, 32, 128, 3, 3, 1024, cpu,
+                                   "3x3 /b 32");
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/cwise_op_squared_difference.cc b/tensorflow/core/kernels/cwise_op_squared_difference.cc
index 78fefc69c776e2f7b7c44c941e0a1afefdbaf143..d0ff271df6ad0475b970b7303292c8f7ea14396e 100644
--- a/tensorflow/core/kernels/cwise_op_squared_difference.cc
+++ b/tensorflow/core/kernels/cwise_op_squared_difference.cc
@@ -16,8 +16,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops_common.h"
 
 namespace tensorflow {
-REGISTER5(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
-          float, Eigen::half, double, int32, int64);
+REGISTER7(BinaryOp, CPU, "SquaredDifference", functor::squared_difference,
+          float, Eigen::half, double, int32, int64, complex64, complex128);
 #if GOOGLE_CUDA
 REGISTER4(BinaryOp, GPU, "SquaredDifference", functor::squared_difference,
           float, Eigen::half, double, int64);
diff --git a/tensorflow/core/kernels/cwise_ops.h b/tensorflow/core/kernels/cwise_ops.h
index 3f7aa0dc39919e223e206b6a2328d379a0f828a5..abfb4a039cf85a14d8cfcd5acf96d35175cf8c95 100644
--- a/tensorflow/core/kernels/cwise_ops.h
+++ b/tensorflow/core/kernels/cwise_ops.h
@@ -296,27 +296,32 @@ struct less_equal : std::binary_function<T, T, bool> {
   }
 };
 
-// Functor that enables composition of multiple Eigen functors.
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct scalar_compose_op {
+// Functor that enables squared difference functor.
+template <typename Scalar>
+struct scalar_squared_difference_op {
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
   operator()(const Scalar& a, const Scalar& b) const {
-    return UnaryFunctor()(BinaryFunctor()(a, b));
+    const Scalar v = scalar_difference_op<Scalar>()(a, b);
+    return scalar_product_op<Scalar>()(v, scalar_conjugate_op<Scalar>()(v));
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet
   packetOp(const Packet& a, const Packet& b) const {
-    return UnaryFunctor().packetOp(BinaryFunctor().packetOp(a, b));
+    const Packet v = scalar_difference_op<Scalar>().packetOp(a, b);
+    return scalar_product_op<Scalar>().packetOp(
+        v, scalar_conjugate_op<Scalar>().packetOp(v));
   }
 };
 
-template <typename Scalar, typename UnaryFunctor, typename BinaryFunctor>
-struct functor_traits<scalar_compose_op<Scalar, UnaryFunctor, BinaryFunctor>> {
+template <typename Scalar>
+struct functor_traits<scalar_squared_difference_op<Scalar>> {
   enum {
-    Cost = functor_traits<UnaryFunctor>::Cost +
-           functor_traits<BinaryFunctor>::Cost,
-    PacketAccess = functor_traits<UnaryFunctor>::PacketAccess &&
-                   functor_traits<BinaryFunctor>::PacketAccess
+    Cost = functor_traits<scalar_difference_op<Scalar>>::Cost +
+           functor_traits<scalar_conjugate_op<Scalar>>::Cost +
+           functor_traits<scalar_product_op<Scalar>>::Cost,
+    PacketAccess = functor_traits<scalar_difference_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_conjugate_op<Scalar>>::PacketAccess &&
+                   functor_traits<scalar_product_op<Scalar>>::PacketAccess
   };
 };
 
@@ -449,6 +454,27 @@ struct functor_traits<scalar_round_op_google<Scalar>> {
   enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
 };
 
+template <typename Scalar>
+struct scalar_round_up_op {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar
+  operator()(const Scalar& x) const {
+    EIGEN_STATIC_ASSERT((!NumTraits<Scalar>::IsComplex),
+                        NUMERIC_TYPE_MUST_BE_REAL)
+
+    Scalar round_val = Eigen::numext::floor(x);
+    const Scalar fraction = x - round_val;
+    if (fraction >= Scalar(.5)) {
+      round_val += Scalar(1.0);
+    }
+    return round_val;
+  }
+};
+
+template <typename Scalar>
+struct functor_traits<scalar_round_up_op<Scalar>> {
+  enum { Cost = 4 * NumTraits<Scalar>::AddCost, PacketAccess = false };
+};
+
 #undef ENABLE_FLOAT_EQUALITY_WARNING
 #undef DISABLE_FLOAT_EQUALITY_WARNING
 
@@ -754,7 +780,7 @@ struct rint : base<T, scalar_rint_op<T>> {};
 // pow(x, y) = x ^ y
 // maximum(x, y) = x > y ? x : y
 // minimum(x, y) = x < y ? x : y
-// squared_difference(x, y) = (x - y) * (x - y)
+// squared_difference(x, y) = conj(x - y) * (x - y)
 
 template <typename T>
 struct add : base<T, Eigen::internal::scalar_sum_op<T>> {
@@ -864,9 +890,7 @@ struct atan2 : base<T, scalar_atan2_op<T>> {};
 
 template <typename T>
 struct squared_difference
-    : base<T, Eigen::internal::scalar_compose_op<
-                  T, Eigen::internal::scalar_square_op<T>,
-                  Eigen::internal::scalar_difference_op<T>>> {};
+    : base<T, Eigen::internal::scalar_squared_difference_op<T>> {};
 
 template <typename T>
 struct xdivy : base<T, Eigen::internal::xdivy_op<T>> {};
diff --git a/tensorflow/core/kernels/data/BUILD b/tensorflow/core/kernels/data/BUILD
index b7ccf5f70ec28f475b02e652987c3578048e9976..e2ab77632da4830f63d63c95c6ace5465fb46b9e 100644
--- a/tensorflow/core/kernels/data/BUILD
+++ b/tensorflow/core/kernels/data/BUILD
@@ -13,16 +13,6 @@ load(
     "tf_cc_test",
 )
 
-tf_kernel_library(
-    name = "stats_aggregator_ops",
-    srcs = ["stats_aggregator_ops.cc"],
-    deps = [
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
 # TODO(mrry): Remove this empty forwarding library.
 cc_library(
     name = "dataset",
@@ -139,17 +129,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "slide_dataset_op",
-    srcs = ["slide_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "padded_batch_dataset_op",
     srcs = ["padded_batch_dataset_op.cc"],
@@ -161,44 +140,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "dense_to_sparse_batch_dataset_op",
-    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_reducer_dataset_op",
-    srcs = ["group_by_reducer_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "group_by_window_dataset_op",
-    srcs = ["group_by_window_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":window_dataset",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "filter_dataset_op",
     srcs = ["filter_dataset_op.cc"],
@@ -238,21 +179,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "map_and_batch_dataset_op",
-    srcs = ["map_and_batch_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        ":dataset_utils",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:inplace_ops",
-    ],
-)
-
 cc_library(
     name = "parallel_map_iterator",
     srcs = ["parallel_map_iterator.cc"],
@@ -267,16 +193,6 @@ cc_library(
     ],
 )
 
-tf_kernel_library(
-    name = "parse_example_dataset_op",
-    srcs = ["parse_example_dataset_op.cc"],
-    deps = [
-        ":parallel_map_iterator",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:framework",
-    ],
-)
-
 tf_kernel_library(
     name = "parallel_map_dataset_op",
     srcs = ["parallel_map_dataset_op.cc"],
@@ -307,19 +223,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "scan_dataset_op",
-    srcs = ["scan_dataset_op.cc"],
-    deps = [
-        ":captured_function",
-        "//tensorflow/core:core_cpu_internal",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "flat_map_dataset_op",
     srcs = ["flat_map_dataset_op.cc"],
@@ -359,7 +262,6 @@ tf_kernel_library(
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
     ],
 )
 
@@ -430,39 +332,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "stats_dataset_ops",
-    srcs = ["stats_dataset_ops.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core:protos_all_cc",
-    ],
-)
-
-tf_kernel_library(
-    name = "stats_aggregator_dataset_op",
-    srcs = ["stats_aggregator_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:core_cpu_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
-tf_kernel_library(
-    name = "random_dataset_op",
-    srcs = ["random_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "range_dataset_op",
     srcs = ["range_dataset_op.cc"],
@@ -506,17 +375,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "tensor_queue_dataset_op",
-    srcs = ["tensor_queue_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "tensor_slice_dataset_op",
     srcs = ["tensor_slice_dataset_op.cc"],
@@ -527,17 +385,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "unbatch_dataset_op",
-    srcs = ["unbatch_dataset_op.cc"],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "zip_dataset_op",
     srcs = ["zip_dataset_op.cc"],
@@ -571,20 +418,6 @@ tf_kernel_library(
     ],
 )
 
-tf_kernel_library(
-    name = "sql_dataset_ops",
-    srcs = [
-        "sql_dataset_ops.cc",
-    ],
-    deps = [
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels/data/sql",
-    ],
-)
-
 tf_kernel_library(
     name = "iterator_ops",
     srcs = ["iterator_ops.cc"],
@@ -600,6 +433,7 @@ tf_kernel_library(
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core:session_options",
         "//tensorflow/core/kernels:ops_util",
+        "@com_google_absl//absl/memory",
     ],
 )
 
@@ -620,6 +454,10 @@ tf_kernel_library(
     name = "optional_ops",
     srcs = ["optional_ops.cc"],
     hdrs = ["optional_ops.h"],
+    gpu_srcs = [
+        "optional_ops.cu.cc",
+        "optional_ops.h",
+    ],
     deps = [
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
@@ -627,6 +465,7 @@ tf_kernel_library(
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
+        "//third_party/eigen3",
     ],
 )
 
@@ -649,7 +488,6 @@ tf_kernel_library(
         "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
         "//tensorflow/core:protos_all_cc",
         "//tensorflow/core/grappler:graph_view",
@@ -658,22 +496,11 @@ tf_kernel_library(
         "//tensorflow/core/grappler/clusters:virtual_cluster",
         "//tensorflow/core/grappler/optimizers:meta_optimizer",
         "//tensorflow/core/grappler/optimizers/data",
+        "//tensorflow/core/grappler/optimizers/data:function_utils",
         "//tensorflow/core/grappler/optimizers/data:graph_utils",
     ],
 )
 
-tf_kernel_library(
-    name = "matching_files_dataset_op",
-    srcs = ["matching_files_dataset_op.cc"],
-    deps = [
-        ":dataset",
-        "//tensorflow/core:dataset_ops_op_lib",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-    ],
-)
-
 tf_kernel_library(
     name = "model_dataset_op",
     srcs = ["model_dataset_op.cc"],
@@ -706,19 +533,14 @@ tf_kernel_library(
         ":cache_dataset_ops",
         ":concatenate_dataset_op",
         ":dataset_ops",
-        ":dense_to_sparse_batch_dataset_op",
         ":filter_by_component_dataset_op",
         ":filter_dataset_op",
         ":flat_map_dataset_op",
         ":generator_dataset_op",
-        ":group_by_reducer_dataset_op",
-        ":group_by_window_dataset_op",
         ":interleave_dataset_op",
         ":iterator_ops",
-        ":map_and_batch_dataset_op",
         ":map_dataset_op",
         ":map_defun_op",
-        ":matching_files_dataset_op",
         ":model_dataset_op",
         ":multi_device_iterator_ops",
         ":optimize_dataset_op",
@@ -726,45 +548,22 @@ tf_kernel_library(
         ":padded_batch_dataset_op",
         ":parallel_interleave_dataset_op",
         ":parallel_map_dataset_op",
-        ":parse_example_dataset_op",
         ":prefetch_dataset_op",
-        ":random_dataset_op",
         ":range_dataset_op",
         ":reader_dataset_ops",
         ":repeat_dataset_op",
-        ":scan_dataset_op",
         ":shuffle_dataset_op",
         ":skip_dataset_op",
-        ":slide_dataset_op",
         ":sparse_tensor_slice_dataset_op",
-        ":sql_dataset_ops",
-        ":stats_aggregator_dataset_op",
-        ":stats_aggregator_ops",
-        ":stats_dataset_ops",
         ":take_dataset_op",
         ":tensor_dataset_op",
-        ":tensor_queue_dataset_op",
         ":tensor_slice_dataset_op",
-        ":unbatch_dataset_op",
         ":window_dataset_op",
-        ":writer_ops",
         ":zip_dataset_op",
         "//tensorflow/core/kernels/data/experimental:dataset_kernels",
     ],
 )
 
-tf_kernel_library(
-    name = "writer_ops",
-    srcs = ["writer_ops.cc"],
-    deps = [
-        ":dataset_utils",
-        "//tensorflow/core:framework",
-        "//tensorflow/core:lib",
-        "//tensorflow/core:lib_internal",
-        "//tensorflow/core/kernels:ops_util",
-    ],
-)
-
 tf_kernel_library(
     name = "map_defun_op",
     srcs = ["map_defun_op.cc"],
diff --git a/tensorflow/core/kernels/data/batch_dataset_op.cc b/tensorflow/core/kernels/data/batch_dataset_op.cc
index 41b04346ebdd20dedd00f0a9575e349dc6403e03..1f8d2bdbae897e471113375150935b69e47f6d84 100644
--- a/tensorflow/core/kernels/data/batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/batch_dataset_op.cc
@@ -95,6 +95,15 @@ class BatchDatasetOp : public UnaryDatasetOpKernel {
       return strings::StrCat("BatchDatasetOp(", batch_size_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/cache_dataset_ops.cc b/tensorflow/core/kernels/data/cache_dataset_ops.cc
index ce6fd09aee53a4bb94fde1cfd332e34f4d608b17..f00b38e732a7835896a275d14507e75eade05fa1 100644
--- a/tensorflow/core/kernels/data/cache_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/cache_dataset_ops.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
@@ -84,6 +85,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::FileDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -562,9 +565,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
   class MemoryDataset : public DatasetBase {
    public:
     explicit MemoryDataset(OpKernelContext* ctx, const DatasetBase* input)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          cache_(new MemoryCache()) {
+        : DatasetBase(DatasetContext(ctx)), input_(input) {
       input->Ref();
     }
 
@@ -572,8 +573,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new MemoryIterator(
-          {this, strings::StrCat(prefix, "::MemoryCache")}, cache_));
+      return std::unique_ptr<IteratorBase>(
+          new MemoryIterator({this, strings::StrCat(prefix, "::MemoryCache")}));
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -588,6 +589,8 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       return "CacheDatasetOp::MemoryDataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -607,10 +610,12 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
     // The expected use is that a single `MemoryWriterIterator` populates the
     // cache with dataset elements. Once all elements are cached, the cache can
     // be used by one or more `MemoryReaderIterator`s.
-    class MemoryCache {
+    class MemoryCache : public ResourceBase {
      public:
       MemoryCache() = default;
 
+      string DebugString() override { return "CacheDataset::MemoryCache"; }
+
       // Marks the cache as completed.
       void Complete() {
         mutex_lock l(mu_);
@@ -677,15 +682,25 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
 
     class MemoryIterator : public DatasetIterator<MemoryDataset> {
      public:
-      explicit MemoryIterator(const Params& params,
-                              const std::shared_ptr<MemoryCache>& cache)
-          : DatasetIterator<MemoryDataset>(params), cache_(cache) {
-        mode_ = cache->MaybeClaim() ? Mode::write : Mode::read;
-        InitializeIterator();
-      }
+      explicit MemoryIterator(const Params& params)
+          : DatasetIterator<MemoryDataset>(params) {}
+
+      ~MemoryIterator() override { cache_->Unref(); }
 
       Status Initialize(IteratorContext* ctx) override {
         mutex_lock l(mu_);
+        // Use the resource manager in the iterator context to get / create
+        // a cache.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        const string name =
+            strings::StrCat(prefix(), "::", dataset()->name(), "::MemoryCache");
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<MemoryCache>(
+            "tf_data", name, &cache_, [](MemoryCache** cache) {
+              *cache = new MemoryCache();
+              return Status::OK();
+            }));
+        mode_ = cache_->MaybeClaim() ? Mode::write : Mode::read;
+        InitializeIterator();
         if (mode_ == Mode::read && !cache_->IsCompleted()) {
           return errors::Internal(
               "Cache should only be read after it has been completed.");
@@ -784,8 +799,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
      private:
       class MemoryWriterIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryWriterIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryWriterIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache) {
           CHECK(cache_);
         }
@@ -818,6 +832,7 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
             cache_->Complete();
             return Status::OK();
           }
+          RecordBufferEnqueue(ctx, *out_tensors);
           cache_->emplace_back(*out_tensors);
           return Status::OK();
         }
@@ -843,17 +858,46 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
        private:
         mutex mu_;
         std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-        std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
       };  // MemoryWriterIterator
 
       class MemoryReaderIterator : public DatasetIterator<MemoryDataset> {
        public:
-        explicit MemoryReaderIterator(const Params& params,
-                                      const std::shared_ptr<MemoryCache>& cache)
+        explicit MemoryReaderIterator(const Params& params, MemoryCache* cache)
             : DatasetIterator<MemoryDataset>(params), cache_(cache), index_(0) {
           CHECK(cache);
         }
 
+        Status Initialize(IteratorContext* ctx) override {
+          // The memory allocated for the cache is owned by the parent
+          // dataset but performance modeling uses the iterator abstraction and
+          // thus we record the memory allocated for the cache here. The caveat
+          // is that this is incorrect if there are concurrent instances of this
+          // iterator.
+          tf_shared_lock l(mu_);
+          for (size_t i = 0; i < cache_->size(); ++i) {
+            RecordBufferEnqueue(ctx, cache_->at(i));
+          }
+          return Status::OK();
+        }
+
+        Status GetNextInternal(IteratorContext* ctx,
+                               std::vector<Tensor>* out_tensors,
+                               bool* end_of_sequence) override {
+          mutex_lock l(mu_);
+          if (index_ < cache_->size()) {
+            const std::vector<Tensor>& cache_tensors = cache_->at(index_);
+            out_tensors->insert(out_tensors->begin(), cache_tensors.begin(),
+                                cache_tensors.end());
+            index_++;
+            *end_of_sequence = false;
+            return Status::OK();
+          } else {
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+        }
+
        protected:
         std::shared_ptr<model::Node> CreateNode(
             IteratorContext* ctx, model::Node::Args args) const override {
@@ -878,26 +922,9 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        Status GetNextInternal(IteratorContext* ctx,
-                               std::vector<Tensor>* out_tensors,
-                               bool* end_of_sequence) override {
-          mutex_lock l(mu_);
-          if (index_ < cache_->size()) {
-            const std::vector<Tensor>& cache_tensors = cache_->at(index_);
-            out_tensors->insert(out_tensors->begin(), cache_tensors.begin(),
-                                cache_tensors.end());
-            index_++;
-            *end_of_sequence = false;
-            return Status::OK();
-          } else {
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-        }
-
        private:
         mutex mu_;
-        const std::shared_ptr<MemoryCache> cache_;
+        MemoryCache* const cache_ GUARDED_BY(mu_);  // not owned.
         size_t index_ GUARDED_BY(mu_);
       };  // MemoryReaderIterator
 
@@ -914,14 +941,13 @@ class CacheDatasetOp : public UnaryDatasetOpKernel {
       }
 
       mutex mu_;
-      std::shared_ptr<MemoryCache> cache_;
+      MemoryCache* cache_ GUARDED_BY(mu_);  // not owned.
       enum Mode { read, write };
       Mode mode_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> iterator_ GUARDED_BY(mu_);
     };  // MemoryIterator
 
     const DatasetBase* const input_;
-    const std::shared_ptr<MemoryCache> cache_;
   };  // MemoryDataset
 };    // CacheDatasetOp
 
diff --git a/tensorflow/core/kernels/data/captured_function.cc b/tensorflow/core/kernels/data/captured_function.cc
index 64834e507f2d5bfb224693d8419b0c1070aace8a..973b6b06048fb715d9fd32791223cda21751b1c8 100644
--- a/tensorflow/core/kernels/data/captured_function.cc
+++ b/tensorflow/core/kernels/data/captured_function.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/function.h"
 #include "tensorflow/core/common_runtime/step_stats_collector.h"
 #include "tensorflow/core/framework/cancellation.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/stats_aggregator.h"
 #include "tensorflow/core/lib/gtl/optional.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -118,10 +119,34 @@ Status CapturedFunction::Create(
   return Status::OK();
 }
 
-CapturedFunction::~CapturedFunction() {
-  if (lib_ != nullptr && f_handle_ != kInvalidHandle) {
-    lib_->ReleaseHandle(f_handle_).IgnoreError();
+Status CapturedFunction::Instantiate(
+    IteratorContext* ctx, std::unique_ptr<InstantiatedCapturedFunction>*
+                              instantiated_captured_function) {
+  // The context's runtime will be used for all subsequent calls.
+  FunctionLibraryRuntime* lib = ctx->lib();
+  FunctionLibraryRuntime::InstantiateOptions inst_opts;
+  inst_opts.overlay_lib = ctx->function_library().get();
+  inst_opts.create_kernels_eagerly = true;
+  if (!use_inter_op_parallelism_) {
+    inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR";
   }
+
+  FunctionLibraryRuntime::Handle f_handle;
+  TF_RETURN_IF_ERROR(ctx->function_handle_cache()->Instantiate(
+      func_.name(), AttrSlice(&func_.attr()), inst_opts, &f_handle));
+  const FunctionBody* fbody = lib->GetFunctionBody(f_handle);
+  if (fbody == nullptr) {
+    return errors::Internal("Failed to instantiate function body.");
+  }
+
+  DataTypeVector ret_types;
+  for (const auto& ret_type : fbody->ret_types) {
+    ret_types.push_back(ret_type);
+  }
+
+  instantiated_captured_function->reset(new InstantiatedCapturedFunction(
+      lib, f_handle, std::move(ret_types), *ctx->runner(), this));
+  return Status::OK();
 }
 
 namespace {
@@ -244,35 +269,35 @@ class BorrowedArgsCallFrame : public CallFrameBase {
 
 }  // namespace
 
-Status CapturedFunction::GetHandle(IteratorContext* ctx,
-                                   FunctionLibraryRuntime::Handle* out_handle) {
-  tf_shared_lock l(mu_);
-  if (lib_ == nullptr) {
-    return errors::Internal("Captured function \"", func_.name(),
-                            "\" was called before it was instantiated.");
-  }
-  if (ctx->lib() != lib_) {
-    return errors::Internal("Captured function \"", func_.name(),
-                            "\" was called with a different "
-                            "FunctionLibraryRuntime*, which is not permitted.");
-  }
-  *out_handle = f_handle_;
-  return Status::OK();
-}
-
-Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
-                             std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
-
+InstantiatedCapturedFunction::InstantiatedCapturedFunction(
+    FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
+    DataTypeVector ret_types, std::function<void(std::function<void()>)> runner,
+    CapturedFunction* captured_func)
+    : lib_(lib),
+      f_handle_(f_handle),
+      ret_types_(std::move(ret_types)),
+      captured_runner_(std::move(runner)),
+      captured_func_(captured_func) {}
+
+// NOTE: We don't release f_handle_ here and instead delegate the function
+// handle releasing to the FunctionHandleCache. This is because in some cases
+// (RepeatDatasetOp in particular), we want to keep the function state (e.g.
+// random number generator) even after the Iterator is reset after going through
+// one epoch.
+InstantiatedCapturedFunction::~InstantiatedCapturedFunction() {}
+
+Status InstantiatedCapturedFunction::Run(IteratorContext* ctx,
+                                         std::vector<Tensor>&& args,
+                                         std::vector<Tensor>* rets) const {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
-    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -284,10 +309,11 @@ Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  OwnedArgsCallFrame frame(std::move(args), &captured_inputs_, ret_types_);
+  OwnedArgsCallFrame frame(std::move(args), &captured_func_->captured_inputs(),
+                           ret_types_);
   Notification n;
   Status s;
-  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -296,20 +322,18 @@ Status CapturedFunction::Run(IteratorContext* ctx, std::vector<Tensor>&& args,
   return frame.ConsumeRetvals(rets);
 }
 
-Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
-                                             const std::vector<Tensor>& args,
-                                             std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime::Handle handle;
-  TF_RETURN_IF_ERROR(GetHandle(ctx, &handle));
-
+Status InstantiatedCapturedFunction::RunWithBorrowedArgs(
+    IteratorContext* ctx, const std::vector<Tensor>& args,
+    std::vector<Tensor>* rets) const {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [ctx](const string& name) {
-    ctx->lib()->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -321,11 +345,12 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(),
+                              ret_types_);
   Notification n;
   Status s;
 
-  ctx->lib()->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -334,65 +359,17 @@ Status CapturedFunction::RunWithBorrowedArgs(IteratorContext* ctx,
   return frame.ConsumeRetvals(rets);
 }
 
-Status CapturedFunction::Instantiate(IteratorContext* ctx) {
-  mutex_lock l(mu_);
-  if (lib_ == nullptr) {
-    // The context's runtime will be used for all subsequent calls.
-    lib_ = ctx->lib();
-    DCHECK(f_handle_ == kInvalidHandle);
-    FunctionLibraryRuntime::InstantiateOptions inst_opts;
-    inst_opts.overlay_lib = ctx->function_library().get();
-    inst_opts.state_handle = std::to_string(random::New64());
-    inst_opts.create_kernels_eagerly = true;
-    if (!use_inter_op_parallelism_) {
-      inst_opts.executor_type = "SINGLE_THREADED_EXECUTOR";
-    }
-    Status s = (lib_->Instantiate(func_.name(), AttrSlice(&func_.attr()),
-                                  inst_opts, &f_handle_));
-    TF_RETURN_IF_ERROR(s);
-    const FunctionBody* fbody = lib_->GetFunctionBody(f_handle_);
-    if (fbody == nullptr) {
-      return errors::Internal("Failed to instantiate function body.");
-    }
-    ret_types_ = fbody->ret_types;
-  } else {
-    if (ctx->lib() != lib_) {
-      return errors::Internal(
-          "Captured function was called with a different "
-          "FunctionLibraryRuntime*, which is not permitted.");
-    }
-  }
-  if (captured_runner_ == nullptr) {
-    captured_runner_ = *ctx->runner();
-  }
-  return Status::OK();
-}
-
-Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
-                                         std::vector<Tensor>* rets) {
-  FunctionLibraryRuntime* lib;
-  FunctionLibraryRuntime::Handle handle;
-  std::function<void(std::function<void()>)>* runner;
-  {
-    tf_shared_lock l(mu_);
-    if (lib_ == nullptr) {
-      return errors::FailedPrecondition(
-          "`CapturedFunction::Instantiate()` must be called before a call to "
-          "`CapturedFunction::RunInstantiated()`.");
-    }
-    lib = lib_;
-    handle = f_handle_;
-    runner = &captured_runner_;
-  }
-
+Status InstantiatedCapturedFunction::RunInstantiated(
+    const std::vector<Tensor>& args, std::vector<Tensor>* rets) {
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ScopedStepContainer step_container(f_opts.step_id, [lib](const string& name) {
-    lib->device()->resource_manager()->Cleanup(name).IgnoreError();
-  });
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ScopedStepContainer step_container(
+      f_opts.step_id, [this](const string& name) {
+        lib_->device()->resource_manager()->Cleanup(name).IgnoreError();
+      });
   f_opts.step_container = &step_container;
-  f_opts.runner = runner;
-  if (lib->device()->device_type() != DEVICE_CPU) {
+  f_opts.runner = &captured_runner_;
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -404,11 +381,12 @@ Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
   CancellationManager c_mgr;
   f_opts.cancellation_manager = &c_mgr;
 
-  BorrowedArgsCallFrame frame(args, &captured_inputs_, ret_types_);
+  BorrowedArgsCallFrame frame(args, &captured_func_->captured_inputs(),
+                              ret_types_);
   Notification n;
   Status s;
 
-  lib->Run(f_opts, handle, &frame, [&n, &s](Status func_status) {
+  lib_->Run(f_opts, f_handle_, &frame, [&n, &s](Status func_status) {
     s.Update(func_status);
     n.Notify();
   });
@@ -417,33 +395,25 @@ Status CapturedFunction::RunInstantiated(const std::vector<Tensor>& args,
   return frame.ConsumeRetvals(rets);
 }
 
-void CapturedFunction::RunAsync(IteratorContext* ctx,
-                                std::vector<Tensor>&& args,
-                                std::vector<Tensor>* rets,
-                                FunctionLibraryRuntime::DoneCallback done,
-                                const string& prefix) {
+void InstantiatedCapturedFunction::RunAsync(
+    IteratorContext* ctx, std::vector<Tensor>&& args, std::vector<Tensor>* rets,
+    FunctionLibraryRuntime::DoneCallback done, const string& prefix) const {
   // NOTE(mrry): This method does not transfer ownership of `ctx`, and it may
   // be deleted before `done` is called. Take care not to capture `ctx` in any
   // code that may execute asynchronously in this function.
-  FunctionLibraryRuntime::Handle handle;
-  Status s = GetHandle(ctx, &handle);
-  if (!s.ok()) {
-    done(s);
-    return;
-  }
-  OwnedArgsCallFrame* frame =
-      new OwnedArgsCallFrame(std::move(args), &captured_inputs_, ret_types_);
+  OwnedArgsCallFrame* frame = new OwnedArgsCallFrame(
+      std::move(args), &captured_func_->captured_inputs(), ret_types_);
 
   FunctionLibraryRuntime::Options f_opts;
-  f_opts.step_id = CapturedFunction::generate_step_id();
-  ResourceMgr* resource_mgr = ctx->lib()->device()->resource_manager();
+  f_opts.step_id = InstantiatedCapturedFunction::generate_step_id();
+  ResourceMgr* resource_mgr = lib_->device()->resource_manager();
   ScopedStepContainer* step_container = new ScopedStepContainer(
       f_opts.step_id, [resource_mgr](const string& name) {
         resource_mgr->Cleanup(name).IgnoreError();
       });
   f_opts.step_container = step_container;
   f_opts.runner = ctx->runner();
-  if (ctx->lib()->device()->device_type() != DEVICE_CPU) {
+  if (lib_->device()->device_type() != DEVICE_CPU) {
     f_opts.create_rendezvous = true;
   }
   // TODO(mrry): Add cancellation manager support to IteratorContext
@@ -480,7 +450,7 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
           stats_aggregator->AddToHistogram(
               strings::StrCat(
                   str_util::Split(prefix, "::", str_util::SkipEmpty()).back(),
-                  "::", func_.name(), "::execution_time"),
+                  "::", captured_func_->func().name(), "::execution_time"),
               {static_cast<float>(stats_collector->processing_time())});
         }
         if (model) {
@@ -495,15 +465,13 @@ void CapturedFunction::RunAsync(IteratorContext* ctx,
       std::move(done), ctx->model(), ctx->stats_aggregator(), prefix,
       std::move(stats_collector), std::placeholders::_1);
 
-  ctx->lib()->Run(f_opts, handle, frame, std::move(callback));
+  lib_->Run(f_opts, f_handle_, frame, std::move(callback));
 }
 
 CapturedFunction::CapturedFunction(const NameAttrList& func,
                                    std::vector<Tensor> captured_inputs,
                                    bool use_inter_op_parallelism)
     : func_(func),
-      lib_(nullptr),
-      f_handle_(kInvalidHandle),
       captured_inputs_(std::move(captured_inputs)),
       use_inter_op_parallelism_(use_inter_op_parallelism) {}
 
diff --git a/tensorflow/core/kernels/data/captured_function.h b/tensorflow/core/kernels/data/captured_function.h
index c6a5fe9e1ec755646e6c250ef121afac83d48ee2..cffaf405ecbad4302be4e1b6022fda6db3dad359 100644
--- a/tensorflow/core/kernels/data/captured_function.h
+++ b/tensorflow/core/kernels/data/captured_function.h
@@ -34,59 +34,41 @@ class ResourceMgr;
 
 namespace data {
 
-// A `CapturedFunction` encapsulates a TensorFlow function and all of
-// the runtime support required to execute it.
+class CapturedFunction;
+
+// An InstantiatedCapturedFunction encapsulates all the runtime support needed
+// to execute a tensorflow function.
 //
-// The `Dataset`-related classes use `CapturedFunction` to execute
-// TensorFlow functions outside a the normal `OpKernel::Compute()`
-// context.
-class CapturedFunction {
+// While CapturedFunction (below) encapsulates the more permanent attributes
+// of the function i.e. name, captured arguments etc.,
+// InstantiatedCapturedFunction encapsulates the more runtime aspects i.e.
+// FunctionLibraryRuntime, function handle etc.
+//
+// The `Iterator-`related classes use `InstantiatedCapturedFunction` to execute
+// functions outside a the normal `OpKernel::Compute()` context.
+class InstantiatedCapturedFunction {
  public:
-  // Creates a new instance using a list of named attributes, fetching captured
-  // inputs from a context argument.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  // Creates a new instance using a list of named attributes, fetching captured
-  // inputs from a context argument.
-  //
-  // If `use_inter_op_parallelism` is false, the runtime may use an executor
-  // that is optimized for small functions.
-  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
-                       const string& argument, bool use_inter_op_parallelism,
-                       std::unique_ptr<CapturedFunction>* out_function);
-
-  ~CapturedFunction();
+  ~InstantiatedCapturedFunction();
 
-  // Runs the "Captured function" using the given FLR and caches the lib and
-  // handle generated during instantiation. If Run is called with a different
-  // lib afterwards, generates an error. This method takes ownership of the
-  // tensors in `args`, in order to be able to deallocate them as early as
+  // Runs the "Instantiated Captured function". This method takes ownership of
+  // the tensors in `args`, in order to be able to deallocate them as early as
   // possible. Use `RunWithBorrowedArgs()` if the caller needs to retain
   // ownership of the `args`.
   Status Run(IteratorContext* ctx, std::vector<Tensor>&& args,
-             std::vector<Tensor>* rets);
+             std::vector<Tensor>* rets) const;
 
   // Synchronously runs the captured function on the given `args`, and stores
   // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
   // possible.
   Status RunWithBorrowedArgs(IteratorContext* ctx,
                              const std::vector<Tensor>& args,
-                             std::vector<Tensor>* rets);
-
-  // Explicitly instantiate this function for use in the given
-  // context. This method, and the context-less overload
-  // `RunInstantiated()` below can be useful for calling a captured
-  // function in cases where an `IteratorContext*` is not available
-  // (such as a destructor).
-  Status Instantiate(IteratorContext* ctx);
+                             std::vector<Tensor>* rets) const;
 
   // Synchronously runs the captured function on the given `args`, and stores
   // the results in `*rets`. Prefer to use `Run()` or `RunAsync()` when
-  // possible.
-  //
-  // REQUIRES: `this->Instantiate()` must have been called before this method.
+  // possible. This can be useful for calling a captured
+  // function in cases where an `IteratorContext*` is not available
+  // (such as a destructor).
   Status RunInstantiated(const std::vector<Tensor>& args,
                          std::vector<Tensor>* rets);
 
@@ -97,16 +79,9 @@ class CapturedFunction {
   void RunAsync(IteratorContext* ctx, std::vector<Tensor>&& args,
                 std::vector<Tensor>* rets,
                 FunctionLibraryRuntime::DoneCallback done,
-                const string& prefix);
-
-  // Returns the named list of function arguments.
-  const NameAttrList& func() { return func_; }
+                const string& prefix) const;
 
-  // Returns that additional captured inputs that will be passed to the function
-  // when `Run*()` is called.
-  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
-
-  // Returns a step ID for use when running a `CapturedFunction`.
+  // Returns a step ID for use when running an `InstantiatedCapturedFunction`.
   static int64 generate_step_id() {
     // Choose a step ID that is guaranteed not to clash with any
     // Session-generated step ID. DirectSession only generates
@@ -116,26 +91,66 @@ class CapturedFunction {
     return -std::abs(static_cast<int64>(random::New64()));
   }
 
+ private:
+  InstantiatedCapturedFunction(
+      FunctionLibraryRuntime* lib, FunctionLibraryRuntime::Handle f_handle,
+      DataTypeVector ret_types,
+      std::function<void(std::function<void()>)> runner,
+      CapturedFunction* captured_func);
+
+  friend class CapturedFunction;
+
+  FunctionLibraryRuntime* const lib_;
+  const FunctionLibraryRuntime::Handle f_handle_;
+  const DataTypeVector ret_types_;
+  std::function<void(std::function<void()>)> captured_runner_;
+  CapturedFunction* const captured_func_;
+
+  TF_DISALLOW_COPY_AND_ASSIGN(InstantiatedCapturedFunction);
+};
+
+// A `CapturedFunction` encapsulates a TensorFlow function, plus any "captured"
+// arguments that it closed over in the user program.
+class CapturedFunction {
+ public:
+  // Creates a new instance using a list of named attributes, fetching captured
+  // inputs from a context argument.
+  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
+                       const string& argument,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Creates a new instance using a list of named attributes, fetching captured
+  // inputs from a context argument.
+  //
+  // If `use_inter_op_parallelism` is false, the runtime may use an executor
+  // that is optimized for small functions.
+  static Status Create(const NameAttrList& func, OpKernelContext* ctx,
+                       const string& argument, bool use_inter_op_parallelism,
+                       std::unique_ptr<CapturedFunction>* out_function);
+
+  // Instantiates this function for use in the given context, providing an
+  // InstantiatedCapturedFunction that can be used to execute functions.
+  Status Instantiate(IteratorContext* ctx,
+                     std::unique_ptr<InstantiatedCapturedFunction>*
+                         instantiated_captured_function);
+
+  // Returns the named list of function arguments.
+  const NameAttrList& func() { return func_; }
+
+  // Returns that additional captured inputs that will be passed to the function
+  const std::vector<Tensor>& captured_inputs() { return captured_inputs_; }
+
  private:
   CapturedFunction(const NameAttrList& func,
                    std::vector<Tensor> captured_inputs,
                    bool use_inter_op_parallelism);
 
-  Status GetHandle(IteratorContext* ctx,
-                   FunctionLibraryRuntime::Handle* out_handle);
-
-  mutex mu_;
   const NameAttrList func_;
-  FunctionLibraryRuntime* lib_ GUARDED_BY(mu_);
-  FunctionLibraryRuntime::Handle f_handle_ GUARDED_BY(mu_);
   const std::vector<Tensor> captured_inputs_;
-  DataTypeSlice ret_types_;
-  std::function<void(std::function<void()>)> captured_runner_ = nullptr;
   const bool use_inter_op_parallelism_;
 
   TF_DISALLOW_COPY_AND_ASSIGN(CapturedFunction);
 };
-
 }  // namespace data
 
 // TODO(b/114112161): Remove these aliases when all users have moved over to the
diff --git a/tensorflow/core/kernels/data/concatenate_dataset_op.cc b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
index d5a0abc64b4e8769aff260d2580bd44e7af7e9ac..066b2c9aef4faaf23981b207e46c301e99360119 100644
--- a/tensorflow/core/kernels/data/concatenate_dataset_op.cc
+++ b/tensorflow/core/kernels/data/concatenate_dataset_op.cc
@@ -79,6 +79,18 @@ class ConcatenateDatasetOp : public BinaryDatasetOpKernel {
       return "ConcatenateDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override {
+      int64 n1 = input_->Cardinality();
+      int64 n2 = to_concatenate_->Cardinality();
+      if (n1 == kInfiniteCardinality || n2 == kInfiniteCardinality) {
+        return kInfiniteCardinality;
+      }
+      if (n1 == kUnknownCardinality || n2 == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      return n1 + n2;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/dataset_ops.cc b/tensorflow/core/kernels/data/dataset_ops.cc
index 36e9714736a5725f69e33a49bd5d1389994213ac..0abfdbb56b577764bbd48dbe0903148b2cf691d6 100644
--- a/tensorflow/core/kernels/data/dataset_ops.cc
+++ b/tensorflow/core/kernels/data/dataset_ops.cc
@@ -46,8 +46,25 @@ class DatasetToGraphOp : public OpKernel {
   }
 };
 
+class DatasetCardinalityOp : public OpKernel {
+ public:
+  explicit DatasetCardinalityOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+
+  void Compute(OpKernelContext* ctx) override {
+    DatasetBase* dataset;
+    OP_REQUIRES_OK(ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset));
+    Tensor* result;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &result));
+    result->scalar<int64>()() = dataset->Cardinality();
+  }
+};
+
 REGISTER_KERNEL_BUILDER(Name("DatasetToGraph").Device(DEVICE_CPU),
                         DatasetToGraphOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetCardinality").Device(DEVICE_CPU),
+    DatasetCardinalityOp);
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/dataset_utils.cc b/tensorflow/core/kernels/data/dataset_utils.cc
index 1be38c4cb26782a4bd721e8ced2dbe4a6ae39638..4d92d314d3d207d12310bb744b5601ad922bc570 100644
--- a/tensorflow/core/kernels/data/dataset_utils.cc
+++ b/tensorflow/core/kernels/data/dataset_utils.cc
@@ -81,12 +81,12 @@ std::vector<bool> ComputeMoveVector(const std::vector<int>& indices) {
 
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
-    int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
-    std::unique_ptr<IteratorBase>* out_iterator) {
+    int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func,
+    StringPiece prefix, std::unique_ptr<IteratorBase>* out_iterator) {
   std::vector<Tensor> return_values;
 
-  TF_RETURN_IF_ERROR(
-      captured_func->RunWithBorrowedArgs(ctx, input_element, &return_values));
+  TF_RETURN_IF_ERROR(inst_captured_func.RunWithBorrowedArgs(ctx, input_element,
+                                                            &return_values));
 
   if (!(return_values.size() == 1 && return_values[0].dtype() == DT_VARIANT &&
         TensorShapeUtils::IsScalar(return_values[0].shape()))) {
diff --git a/tensorflow/core/kernels/data/dataset_utils.h b/tensorflow/core/kernels/data/dataset_utils.h
index 0b2816119dd154dcbf95262982696aea470a21a3..23a3d93ed160c95099a5c8ddb237b4c055a1845c 100644
--- a/tensorflow/core/kernels/data/dataset_utils.h
+++ b/tensorflow/core/kernels/data/dataset_utils.h
@@ -44,8 +44,8 @@ std::vector<bool> ComputeMoveVector(const std::vector<int>& indices);
 
 Status MakeIteratorFromInputElement(
     IteratorContext* ctx, const std::vector<Tensor>& input_element,
-    int64 thread_index, CapturedFunction* captured_func, StringPiece prefix,
-    std::unique_ptr<IteratorBase>* out_iterator);
+    int64 thread_index, const InstantiatedCapturedFunction& inst_captured_func,
+    StringPiece prefix, std::unique_ptr<IteratorBase>* out_iterator);
 
 // Returns Status::OK() if `expected` and `received` types match,
 // errors::InvalidArgument otherwise.
diff --git a/tensorflow/core/kernels/data/experimental/BUILD b/tensorflow/core/kernels/data/experimental/BUILD
index 1a18864ecf5619d6bb6c86bd9452202ee1db490f..7433303f77671cbf67a6365fb1d552edc7b471e0 100644
--- a/tensorflow/core/kernels/data/experimental/BUILD
+++ b/tensorflow/core/kernels/data/experimental/BUILD
@@ -11,35 +11,31 @@ load(
     "tf_kernel_library",
 )
 
-cc_library(
-    name = "indexed_dataset_headers",
-    hdrs = ["indexed_dataset.h"],
+tf_kernel_library(
+    name = "assert_next_dataset_op",
+    srcs = ["assert_next_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
 
 tf_kernel_library(
-    name = "indexed_dataset",
-    srcs = [
-        "identity_indexed_dataset.cc",
-        "indexed_dataset.cc",
-    ],
+    name = "csv_dataset_op",
+    srcs = ["csv_dataset_op.cc"],
     deps = [
-        ":indexed_dataset_headers",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
 tf_kernel_library(
-    name = "prefetching_kernels",
-    srcs = ["prefetching_kernels.cc"],
+    name = "dense_to_sparse_batch_dataset_op",
+    srcs = ["dense_to_sparse_batch_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
@@ -59,13 +55,29 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "csv_dataset_op",
-    srcs = ["csv_dataset_op.cc"],
+    name = "group_by_reducer_dataset_op",
+    srcs = ["group_by_reducer_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "group_by_window_dataset_op",
+    srcs = ["group_by_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:window_dataset",
     ],
 )
 
@@ -79,6 +91,18 @@ tf_kernel_library(
     ],
 )
 
+tf_kernel_library(
+    name = "indexed_dataset_op",
+    srcs = ["indexed_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/data:dataset_utils",
+        "//third_party/eigen3",
+    ],
+)
+
 tf_kernel_library(
     name = "lmdb_dataset_op",
     srcs = ["lmdb_dataset_op.cc"],
@@ -92,12 +116,38 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "threadpool_dataset_op",
-    srcs = ["threadpool_dataset_op.cc"],
+    name = "map_and_batch_dataset_op",
+    srcs = ["map_and_batch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:inplace_ops",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "matching_files_dataset_op",
+    srcs = ["matching_files_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:dataset",
+    ],
+)
+
+tf_kernel_library(
+    name = "non_serializable_dataset_op",
+    srcs = ["non_serializable_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
         "//third_party/eigen3",
     ],
 )
@@ -118,23 +168,72 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "unique_dataset_op",
-    srcs = ["unique_dataset_op.cc"],
+    name = "parallel_interleave_dataset_op",
+    srcs = ["parallel_interleave_dataset_op.cc"],
     deps = [
-        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
         "//tensorflow/core:framework",
         "//tensorflow/core:lib",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+        "//tensorflow/core/kernels/data:dataset_utils",
     ],
 )
 
 tf_kernel_library(
-    name = "assert_next_dataset_op",
-    srcs = ["assert_next_dataset_op.cc"],
+    name = "parse_example_dataset_op",
+    srcs = ["parse_example_dataset_op.cc"],
     deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:framework",
+        "//tensorflow/core/kernels/data:parallel_map_iterator",
+    ],
+)
+
+tf_kernel_library(
+    name = "prefetching_kernels",
+    srcs = ["prefetching_kernels.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
-        "//third_party/eigen3",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "random_dataset_op",
+    srcs = ["random_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "scan_dataset_op",
+    srcs = ["scan_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_internal",
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data:captured_function",
+    ],
+)
+
+tf_kernel_library(
+    name = "set_stats_aggregator_dataset_op",
+    srcs = ["set_stats_aggregator_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:core_cpu_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib_internal",
     ],
 )
 
@@ -148,11 +247,93 @@ tf_kernel_library(
 )
 
 tf_kernel_library(
-    name = "non_serializable_dataset_op",
-    srcs = ["non_serializable_dataset_op.cc"],
+    name = "sliding_window_dataset_op",
+    srcs = ["sliding_window_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "sql_dataset_op",
+    srcs = [
+        "sql_dataset_op.cc",
+    ],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels/data/experimental/sql",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_aggregator_ops",
+    srcs = ["stats_aggregator_ops.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "stats_dataset_ops",
+    srcs = ["stats_dataset_ops.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core:protos_all_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "to_tf_record_op",
+    srcs = ["to_tf_record_op.cc"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+        "//tensorflow/core/kernels:ops_util",
+        "//tensorflow/core/kernels/data:dataset_utils",
+    ],
+)
+
+tf_kernel_library(
+    name = "threadpool_dataset_op",
+    srcs = ["threadpool_dataset_op.cc"],
     deps = [
         "//tensorflow/core:experimental_dataset_ops_op_lib",
         "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//third_party/eigen3",
+    ],
+)
+
+tf_kernel_library(
+    name = "unbatch_dataset_op",
+    srcs = ["unbatch_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:lib_internal",
+    ],
+)
+
+tf_kernel_library(
+    name = "unique_dataset_op",
+    srcs = ["unique_dataset_op.cc"],
+    deps = [
+        "//tensorflow/core:experimental_dataset_ops_op_lib",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
         "//third_party/eigen3",
     ],
 )
@@ -162,15 +343,31 @@ tf_kernel_library(
     deps = [
         ":assert_next_dataset_op",
         ":csv_dataset_op",
+        ":dense_to_sparse_batch_dataset_op",
         ":directed_interleave_dataset_op",
+        ":group_by_reducer_dataset_op",
+        ":group_by_window_dataset_op",
         ":ignore_errors_dataset_op",
-        ":indexed_dataset",
+        ":indexed_dataset_op",
         ":lmdb_dataset_op",
+        ":map_and_batch_dataset_op",
+        ":matching_files_dataset_op",
         ":non_serializable_dataset_op",
         ":numa_map_and_batch_dataset_op",
+        ":parallel_interleave_dataset_op",
+        ":parse_example_dataset_op",
         ":prefetching_kernels",
+        ":random_dataset_op",
+        ":scan_dataset_op",
+        ":set_stats_aggregator_dataset_op",
         ":sleep_dataset_op",
+        ":sliding_window_dataset_op",
+        ":sql_dataset_op",
+        ":stats_aggregator_ops",
+        ":stats_dataset_ops",
         ":threadpool_dataset_op",
+        ":to_tf_record_op",
+        ":unbatch_dataset_op",
         ":unique_dataset_op",
     ],
 )
diff --git a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
index 3b5ee9b783c7c6b123ae220221f82a10c59dbd4c..3e87f484b940b336ed68099df7427250a4304207 100644
--- a/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/assert_next_dataset_op.cc
@@ -76,6 +76,8 @@ class AssertNextDatasetOp : public UnaryDatasetOpKernel {
       return "AssertNextDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
index d684d23b24212e629c8082091b1b310e82eedb70..97e64dd7444e93660afa6defa31314c909a31c7b 100644
--- a/tensorflow/core/kernels/data/dense_to_sparse_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/dense_to_sparse_batch_dataset_op.cc
@@ -114,6 +114,14 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ + (n % batch_size_ == 0 ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -304,8 +312,9 @@ class DenseToSparseBatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("DenseToSparseBatchDataset").Device(DEVICE_CPU),
-                        DenseToSparseBatchDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDenseToSparseBatchDataset").Device(DEVICE_CPU),
+    DenseToSparseBatchDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
similarity index 93%
rename from tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
index dc1925a21fe0394df23424093c337170d3069c2d..1c298cfdd6a3a39aabd81cb5226e03b1c3e3de63 100644
--- a/tensorflow/core/kernels/data/group_by_reducer_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_reducer_dataset_op.cc
@@ -191,11 +191,14 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_finalize_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(
+            ctx, &instantiated_key_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Instantiate(
+            ctx, &instantiated_init_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(
+            ctx, &instantiated_reduce_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_finalize_func_->Instantiate(
+            ctx, &instantiated_finalize_func_));
         return Status::OK();
       }
 
@@ -213,9 +216,8 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
           if (!end_of_input_) {
             // Run the key function on the input element.
             std::vector<Tensor> key_func_output;
-            TF_RETURN_IF_ERROR(
-                dataset()->captured_key_func_->RunWithBorrowedArgs(
-                    ctx, next_input_element, &key_func_output));
+            TF_RETURN_IF_ERROR(instantiated_key_func_->RunWithBorrowedArgs(
+                ctx, next_input_element, &key_func_output));
 
             if (key_func_output.size() != 1 ||
                 key_func_output[0].dtype() != DT_INT64 ||
@@ -229,7 +231,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
             if (states_.find(key) == states_.end()) {
               // Run the init function to create the initial state.
               std::vector<Tensor> init_func_output;
-              TF_RETURN_IF_ERROR(dataset()->captured_init_func_->Run(
+              TF_RETURN_IF_ERROR(instantiated_init_func_->Run(
                   ctx, std::move(key_func_output), &init_func_output));
               states_[key] = init_func_output;
             }
@@ -243,7 +245,7 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
                       std::back_inserter(args));
 
             std::vector<Tensor> reduce_func_output;
-            TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
+            TF_RETURN_IF_ERROR(instantiated_reduce_func_->Run(
                 ctx, std::move(args), &reduce_func_output));
             states_[key] = reduce_func_output;
           } else {
@@ -259,9 +261,8 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
           *end_of_sequence = true;
           return Status::OK();
         }
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_finalize_func_->RunWithBorrowedArgs(
-                ctx, states_[keys_[keys_index_++]], out_tensors));
+        TF_RETURN_IF_ERROR(instantiated_finalize_func_->RunWithBorrowedArgs(
+            ctx, states_[keys_[keys_index_++]], out_tensors));
         *end_of_sequence = false;
         return Status::OK();
       }
@@ -384,6 +385,10 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
       std::map<int64, std::vector<Tensor>> states_ GUARDED_BY(mu_);
       std::vector<int64> keys_ GUARDED_BY(mu_);
       int64 keys_index_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_key_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_init_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_reduce_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_finalize_func_;
     };
 
     const NameAttrList& key_func() const { return captured_key_func_->func(); }
@@ -433,8 +438,9 @@ class GroupByReducerDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList finalize_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByReducerDataset").Device(DEVICE_CPU),
-                        GroupByReducerDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByReducerDataset").Device(DEVICE_CPU),
+    GroupByReducerDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
similarity index 95%
rename from tensorflow/core/kernels/data/group_by_window_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
index 64db5df31ebdf241dadb1ee89d9d51dabf79efd2..98603d5a732c8143db61535e6704d6a7b214413c 100644
--- a/tensorflow/core/kernels/data/group_by_window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/group_by_window_dataset_op.cc
@@ -175,10 +175,12 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(ctx));
-        TF_RETURN_IF_ERROR(
-            dataset()->captured_window_size_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_key_func_->Instantiate(
+            ctx, &instantiated_key_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Instantiate(
+            ctx, &instantiated_reduce_func_));
+        TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Instantiate(
+            ctx, &instantiated_window_size_func_));
         return Status::OK();
       }
 
@@ -215,9 +217,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
               // Run the key function on the input element to identify its
               // group.
               std::vector<Tensor> key_func_output;
-              TF_RETURN_IF_ERROR(
-                  dataset()->captured_key_func_->RunWithBorrowedArgs(
-                      ctx, next_input_element, &key_func_output));
+              TF_RETURN_IF_ERROR(instantiated_key_func_->RunWithBorrowedArgs(
+                  ctx, next_input_element, &key_func_output));
 
               if (key_func_output.size() != 1 ||
                   key_func_output[0].dtype() != DT_INT64 ||
@@ -232,7 +233,7 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
                 // Run the window size function on the key to identify its
                 // window size.
                 std::vector<Tensor> window_size_func_output;
-                TF_RETURN_IF_ERROR(dataset()->captured_window_size_func_->Run(
+                TF_RETURN_IF_ERROR(instantiated_window_size_func_->Run(
                     ctx, std::move(key_func_output), &window_size_func_output));
 
                 if (window_size_func_output.size() != 1 ||
@@ -452,8 +453,8 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
         std::vector<Tensor> args(
             {std::move(key_arg), std::move(group_dataset_arg)});
         std::vector<Tensor> return_values;
-        TF_RETURN_IF_ERROR(dataset()->captured_reduce_func_->Run(
-            ctx, std::move(args), &return_values));
+        TF_RETURN_IF_ERROR(instantiated_reduce_func_->Run(ctx, std::move(args),
+                                                          &return_values));
 
         if (!(return_values.size() == 1 &&
               return_values[0].dtype() == DT_VARIANT &&
@@ -482,6 +483,10 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
       std::map<int64, std::vector<std::vector<Tensor>>> groups_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_group_iterator_ GUARDED_BY(mu_);
       std::map<int64, int64> window_sizes_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_key_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_reduce_func_;
+      std::unique_ptr<InstantiatedCapturedFunction>
+          instantiated_window_size_func_;
     };
 
     Status OtherArgumentsNodeAndType(
@@ -518,8 +523,9 @@ class GroupByWindowDatasetOp : public UnaryDatasetOpKernel {
   NameAttrList window_size_func_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("GroupByWindowDataset").Device(DEVICE_CPU),
-                        GroupByWindowDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalGroupByWindowDataset").Device(DEVICE_CPU),
+    GroupByWindowDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
deleted file mode 100644
index d10a3dea110c9cd29919b89e0814bc815ac13500..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/identity_indexed_dataset.cc
+++ /dev/null
@@ -1,163 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
-#include "tensorflow/core/lib/core/errors.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
- public:
-  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
-
-  void MakeIndexedDataset(OpKernelContext* ctx,
-                          IndexedDataset** output) override {
-    uint64 size = -1;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
-    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
-    *output = new Dataset(ctx, size);
-  }
-
-  class Dataset : public IndexedDataset {
-   public:
-    Dataset(OpKernelContext* ctx, uint64 size)
-        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
-
-    Status MaterializeDataset(
-        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
-      materialized->reset(new Materialized(this));
-      return Status::OK();
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
-      return *dtypes;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      static std::vector<PartialTensorShape>* shapes =
-          new std::vector<PartialTensorShape>({{}});
-      return *shapes;
-    }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
-    }
-
-    string DebugString() const override {
-      return "IdentityIndexedDataset::Dataset";
-    }
-
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** node) const override {
-      return errors::Unimplemented(
-          "identity_indexed_dataset.AsGraphDefInternal");
-    }
-
-   private:
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params) {}
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        if (cur_ < dataset()->size_) {
-          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
-                                    TensorShape({}));
-          out_tensors->back().scalar<uint64>()() = cur_++;
-          *end_of_sequence = false;
-          return Status::OK();
-        }
-        *end_of_sequence = true;
-        return Status::OK();
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeKnownRatioNode(std::move(args),
-                                         /*ratio=*/1);
-      }
-
-     private:
-      mutex mu_;
-      uint64 cur_ GUARDED_BY(mu_);
-    };
-
-    class Materialized : public MaterializedIndexedDataset {
-     public:
-      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
-        dataset->Ref();
-      }
-
-      ~Materialized() override {
-        // TODO(saeta): Pull this into MaterializedIndexedDataset
-        dataset_->Unref();
-      }
-
-      const DataTypeVector& output_dtypes() const override {
-        return dataset_->output_dtypes();
-      }
-
-      const std::vector<PartialTensorShape>& output_shapes() const override {
-        return dataset_->output_shapes();
-      }
-
-      Status Get(IteratorContext&& ctx, uint64 index,
-                 std::vector<Tensor>* out_tensors) const override {
-        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
-                  << ")";
-        if (index >= dataset_->size_) {
-          // Note: use InvalidArgument instead of OutOfRange error because many
-          // things consider OutOfRange to be a "clean termination" error.
-          return errors::InvalidArgument(
-              "Index ", index,
-              " is out of range for this dataset. (Size is: ", dataset_->size_,
-              ".)");
-        }
-        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
-                                  TensorShape({}));
-        out_tensors->back().scalar<uint64>()() = index;
-        return Status::OK();
-      }
-
-      Status Size(uint64* size) const override {
-        *size = dataset_->size_;
-        return Status::OK();
-      }
-
-     private:
-      const Dataset* const dataset_;  // Not owned.
-    };
-
-    const uint64 size_;
-    std::shared_ptr<Materialized> materialized_;
-  };
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
-    IdentityIndexedDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
index 57cb44335b17f368d41178ed7cef0b3daefef3c3..d445d9c8094eec5c9a2bff9c45e2dc28e264d096 100644
--- a/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/ignore_errors_dataset_op.cc
@@ -60,6 +60,8 @@ class IgnoreErrorsDatasetOp : public UnaryDatasetOpKernel {
       return "IgnoreErrorsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.h b/tensorflow/core/kernels/data/experimental/indexed_dataset.h
deleted file mode 100644
index 27a8360cbcffc55c2f4f8ce437e5080e070845df..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.h
+++ /dev/null
@@ -1,119 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/op_kernel.h"
-
-namespace tensorflow {
-namespace data {
-
-// TODO(saeta): Urgh, this is ugly.
-class MaterializedIndexedDataset {
- public:
-  virtual ~MaterializedIndexedDataset() = default;
-
-  // Retrieve the element at a given index. The output tensors are stored in
-  // out_tensors.
-  //
-  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
-  // returned.
-  //
-  // Get is thread-safe.
-  virtual Status Get(IteratorContext&& ctx, uint64 index,
-                     std::vector<Tensor>* out_tensors) const = 0;
-
-  // Size determines the number of elements in this IndexedDataset.
-  //
-  // Size is thread-safe.
-  virtual Status Size(uint64* size) const = 0;
-
-  // Returns a vector of DataType values, representing the respective
-  // element types of each tuple component in the outputs of this dataset.
-  virtual const DataTypeVector& output_dtypes() const = 0;
-
-  // Returns a vector of tensor shapes, representing the respective
-  // (and possibly partially defined) shapes of each tuple component
-  // in the outputs of this dataset.
-  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
-};
-
-// IndexedDataset represents a dataset that supports random access in addition
-// to iterator-based sequential access.
-//
-// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
-// significant (backwards incompatible) changes!
-class IndexedDataset : public DatasetBase {
- public:
-  IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
-
-  // Materialize (if necessary) the dataset, and return a pointer.
-  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
-  virtual Status MaterializeDataset(
-      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
-};
-
-// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
-// rest of the TensorFlow runtime.
-//
-// Most IndexedDataset's will be private members of classes inheriting from this
-// class.
-class IndexedDatasetOpKernel : public OpKernel {
- public:
-  IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) final;
-
- protected:
-  // Subclasses should implement this method. It will be called during Compute
-  // execution.
-  virtual void MakeIndexedDataset(OpKernelContext* ctx,
-                                  IndexedDataset** output) = 0;
-
-  template <typename T>
-  Status ParseScalarArgument(OpKernelContext* ctx,
-                             const StringPiece& argument_name, T* output) {
-    const Tensor* argument_t;
-    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
-    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
-      return errors::InvalidArgument(argument_name, " must be a scalar");
-    }
-    *output = argument_t->scalar<T>()();
-    return Status::OK();
-  }
-};
-
-// Validates and extracts an `IndexedDataset` object from `tensor`.
-//
-// `tensor` must have been written by a call to
-// `StoreIndexedDatasetInVariantTensor`
-//
-// The retrieved pointer isa  borrowed reference to the dataset, which is owned
-// by the tensor. The consumer must either acquire its own reference to the
-// dataset by calling `(*out_dataset)->Ref()`, or ensure that `tensor` is not
-// destroyed or mutated while the retrieved pointer is in use.
-Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
-                                          IndexedDataset** out_dataset);
-
-// Stores an `IndexedDataset` object in `tensor.`
-//
-// The ownership of `dataset` is transferred to `tensor`.
-Status StoreIndexedDatasetInVariantTensor(IndexedDataset* dataset,
-                                          Tensor* tensor);
-
-}  // namespace data
-}  // namespace tensorflow
-
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_INDEXED_DATASET_H_
diff --git a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
similarity index 62%
rename from tensorflow/core/kernels/data/experimental/indexed_dataset.cc
rename to tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
index 75ea462f4020bbf02ab05597a23869f90a90cc30..a07eaebdf9d645fba51945d7bd3e79b72b5e5dc2 100644
--- a/tensorflow/core/kernels/data/experimental/indexed_dataset.cc
+++ b/tensorflow/core/kernels/data/experimental/indexed_dataset_op.cc
@@ -12,10 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/experimental/indexed_dataset.h"
 
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 
@@ -23,42 +25,79 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-Status VerifyTypesMatch(const DataTypeVector& expected,
-                        const DataTypeVector& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " types but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (expected[i] != received[i]) {
-      return errors::InvalidArgument("Data type mismatch at component ", i,
-                                     ": expected ", DataTypeString(expected[i]),
-                                     " but got ", DataTypeString(received[i]),
-                                     ".");
-    }
-  }
-  return Status::OK();
-}
+// TODO(saeta): Urgh, this is ugly.
+class MaterializedIndexedDataset {
+ public:
+  virtual ~MaterializedIndexedDataset() = default;
+
+  // Retrieve the element at a given index. The output tensors are stored in
+  // out_tensors.
+  //
+  // If `index` is greater than `Size()`, tensorflow::errors::OutOfRangeError is
+  // returned.
+  //
+  // Get is thread-safe.
+  virtual Status Get(IteratorContext&& ctx, uint64 index,
+                     std::vector<Tensor>* out_tensors) const = 0;
+
+  // Size determines the number of elements in this IndexedDataset.
+  //
+  // Size is thread-safe.
+  virtual Status Size(uint64* size) const = 0;
+
+  // Returns a vector of DataType values, representing the respective
+  // element types of each tuple component in the outputs of this dataset.
+  virtual const DataTypeVector& output_dtypes() const = 0;
+
+  // Returns a vector of tensor shapes, representing the respective
+  // (and possibly partially defined) shapes of each tuple component
+  // in the outputs of this dataset.
+  virtual const std::vector<PartialTensorShape>& output_shapes() const = 0;
+};
 
-Status VerifyShapesCompatible(const std::vector<PartialTensorShape>& expected,
-                              const std::vector<PartialTensorShape>& received) {
-  if (expected.size() != received.size()) {
-    return errors::InvalidArgument(
-        "Number of components does not match: expected ", expected.size(),
-        " shapes but got ", received.size(), ".");
-  }
-  for (size_t i = 0; i < expected.size(); ++i) {
-    if (!expected[i].IsCompatibleWith(received[i])) {
-      return errors::InvalidArgument("Incompatible shapes at component ", i,
-                                     ": expected ", expected[i].DebugString(),
-                                     " but got ", received[i].DebugString(),
-                                     ".");
+// IndexedDataset represents a dataset that supports random access in addition
+// to iterator-based sequential access.
+//
+// Note: IndexedDatasets are HIGHLY experimental at this time. Expect
+// significant (backwards incompatible) changes!
+class IndexedDataset : public DatasetBase {
+ public:
+  explicit IndexedDataset(DatasetContext&& ctx) : DatasetBase(std::move(ctx)) {}
+
+  // Materialize (if necessary) the dataset, and return a pointer.
+  // TODO(saeta): Add in `IteratorContext* ctx` when materializing.
+  virtual Status MaterializeDataset(
+      std::shared_ptr<MaterializedIndexedDataset>* materialized) = 0;
+};
+
+// IndexedDatasetOpKernel abstracts away interfacing IndexedDatasets with the
+// rest of the TensorFlow runtime.
+//
+// Most IndexedDataset's will be private members of classes inheriting from this
+// class.
+class IndexedDatasetOpKernel : public OpKernel {
+ public:
+  explicit IndexedDatasetOpKernel(OpKernelConstruction* ctx) : OpKernel(ctx) {}
+  void Compute(OpKernelContext* ctx) final;
+
+ protected:
+  // Subclasses should implement this method. It will be called during Compute
+  // execution.
+  virtual void MakeIndexedDataset(OpKernelContext* ctx,
+                                  IndexedDataset** output) = 0;
+
+  template <typename T>
+  Status ParseScalarArgument(OpKernelContext* ctx,
+                             const StringPiece argument_name, T* output) {
+    const Tensor* argument_t;
+    TF_RETURN_IF_ERROR(ctx->input(argument_name, &argument_t));
+    if (!TensorShapeUtils::IsScalar(argument_t->shape())) {
+      return errors::InvalidArgument(argument_name, " must be a scalar");
     }
+    *output = argument_t->scalar<T>()();
+    return Status::OK();
   }
-
-  return Status::OK();
-}
+};
 
 class MaterializedDatasetResource : public ResourceBase {
  public:
@@ -164,8 +203,6 @@ class IndexedDatasetVariantWrapper {
   IndexedDataset* const dataset_;  // Owns one reference.
 };
 
-}  // namespace
-
 Status GetIndexedDatasetFromVariantTensor(const Tensor& tensor,
                                           IndexedDataset** out_dataset) {
   if (!(tensor.dtype() == DT_VARIANT ||
@@ -211,8 +248,6 @@ void IndexedDatasetOpKernel::Compute(OpKernelContext* ctx) {
   }
 }
 
-namespace {
-
 class MaterializedHandleOp : public OpKernel {
  public:
   explicit MaterializedHandleOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
@@ -370,6 +405,144 @@ REGISTER_KERNEL_BUILDER(
     Name("ExperimentalIndexedDatasetGet").Device(DEVICE_CPU),
     IndexedDatasetGet);
 
+class IdentityIndexedDatasetOp : public IndexedDatasetOpKernel {
+ public:
+  using IndexedDatasetOpKernel::IndexedDatasetOpKernel;
+
+  void MakeIndexedDataset(OpKernelContext* ctx,
+                          IndexedDataset** output) override {
+    uint64 size = -1;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument<uint64>(ctx, "size", &size));
+    OP_REQUIRES(ctx, size > 0, errors::InvalidArgument("`size` must be > 0"));
+    *output = new Dataset(ctx, size);
+  }
+
+  class Dataset : public IndexedDataset {
+   public:
+    Dataset(OpKernelContext* ctx, uint64 size)
+        : IndexedDataset(DatasetContext(ctx)), size_(size) {}
+
+    Status MaterializeDataset(
+        std::shared_ptr<MaterializedIndexedDataset>* materialized) override {
+      materialized->reset(new Materialized(this));
+      return Status::OK();
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      static DataTypeVector* dtypes = new DataTypeVector({DT_UINT64});
+      return *dtypes;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      static std::vector<PartialTensorShape>* shapes =
+          new std::vector<PartialTensorShape>({{}});
+      return *shapes;
+    }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::IdentityIndexedDataset")}));
+    }
+
+    string DebugString() const override {
+      return "IdentityIndexedDataset::Dataset";
+    }
+
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** node) const override {
+      return errors::Unimplemented(
+          "identity_indexed_dataset.AsGraphDefInternal");
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        if (cur_ < dataset()->size_) {
+          out_tensors->emplace_back(ctx->allocator({}), DT_UINT64,
+                                    TensorShape({}));
+          out_tensors->back().scalar<uint64>()() = cur_++;
+          *end_of_sequence = false;
+          return Status::OK();
+        }
+        *end_of_sequence = true;
+        return Status::OK();
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      mutex mu_;
+      uint64 cur_ GUARDED_BY(mu_);
+    };
+
+    class Materialized : public MaterializedIndexedDataset {
+     public:
+      explicit Materialized(Dataset* dataset) : dataset_(dataset) {
+        dataset->Ref();
+      }
+
+      ~Materialized() override {
+        // TODO(saeta): Pull this into MaterializedIndexedDataset
+        dataset_->Unref();
+      }
+
+      const DataTypeVector& output_dtypes() const override {
+        return dataset_->output_dtypes();
+      }
+
+      const std::vector<PartialTensorShape>& output_shapes() const override {
+        return dataset_->output_shapes();
+      }
+
+      Status Get(IteratorContext&& ctx, uint64 index,
+                 std::vector<Tensor>* out_tensors) const override {
+        LOG(INFO) << "Materialized(" << dataset_->size_ << ")::Get(" << index
+                  << ")";
+        if (index >= dataset_->size_) {
+          // Note: use InvalidArgument instead of OutOfRange error because many
+          // things consider OutOfRange to be a "clean termination" error.
+          return errors::InvalidArgument(
+              "Index ", index,
+              " is out of range for this dataset. (Size is: ", dataset_->size_,
+              ".)");
+        }
+        out_tensors->emplace_back(ctx.allocator({}), DT_UINT64,
+                                  TensorShape({}));
+        out_tensors->back().scalar<uint64>()() = index;
+        return Status::OK();
+      }
+
+      Status Size(uint64* size) const override {
+        *size = dataset_->size_;
+        return Status::OK();
+      }
+
+     private:
+      const Dataset* const dataset_;  // Not owned.
+    };
+
+    const uint64 size_;
+    std::shared_ptr<Materialized> materialized_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalIdentityIndexedDataset").Device(DEVICE_CPU),
+    IdentityIndexedDatasetOp);
+
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
similarity index 85%
rename from tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
index 72a401e99b818d0357a3e52b153fb6c2d867197a..d86c3a1a63dff8c9b0c4c1ea9bfbced6e3ddbf7e 100644
--- a/tensorflow/core/kernels/data/map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/map_and_batch_dataset_op.cc
@@ -38,20 +38,25 @@ namespace tensorflow {
 namespace data {
 namespace {
 
+// Maximum number of batch results to buffer.
+const int64 kMaxBatchResults = 16;
+
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
  public:
   using MapAndBatchIteratorFunction =
-      std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
+      std::function<void(IteratorContext*, InstantiatedCapturedFunction*,
+                         const string&, std::vector<Tensor>,
                          std::shared_ptr<std::vector<Tensor>>, StatusCallback)>;
 
   explicit MapAndBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx),
-        op_version_(ctx->def().op() == "MapAndBatchDataset" ? 1 : 2) {
+      : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -64,29 +69,11 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         errors::InvalidArgument("batch_size must be greater than zero."));
 
     int64 num_parallel_calls;
-    switch (op_version_) {
-      case 1:
-        int64 num_parallel_batches;
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_batches",
-                                                &num_parallel_batches));
-        num_parallel_calls = num_parallel_batches * batch_size;
-        OP_REQUIRES(ctx, num_parallel_batches > 0,
-                    errors::InvalidArgument(
-                        "num_parallel_batches must be greater than zero."));
-        break;
-      case 2:
-        OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
-                                                &num_parallel_calls));
-        OP_REQUIRES(ctx,
-                    num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
-                    errors::InvalidArgument(
-                        "num_parallel_calls must be greater than zero."));
-        break;
-      default:
-        OP_REQUIRES(ctx, false,
-                    errors::Unimplemented("Unsupported operation version %d.",
-                                          op_version_));
-    }
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "num_parallel_calls",
+                                            &num_parallel_calls));
+    OP_REQUIRES(ctx, num_parallel_calls > 0 || num_parallel_calls == kAutoTune,
+                errors::InvalidArgument(
+                    "num_parallel_calls must be greater than zero."));
 
     bool drop_remainder;
     OP_REQUIRES_OK(ctx,
@@ -102,19 +89,20 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     MapAndBatchIteratorFunction map_func;
     CapturedFunction* raw_captured_func = captured_func.get();
     if (indices.empty()) {
-      map_func = [raw_captured_func](
-                     IteratorContext* ctx, const string& prefix,
-                     std::vector<Tensor> args,
-                     std::shared_ptr<std::vector<Tensor>> out_tensors,
-                     StatusCallback done) {
-        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors.get(),
-                                    std::move(done), prefix);
+      map_func = [](IteratorContext* ctx,
+                    InstantiatedCapturedFunction* instantiated_captured_func,
+                    const string& prefix, std::vector<Tensor> args,
+                    std::shared_ptr<std::vector<Tensor>> out_tensors,
+                    StatusCallback done) {
+        instantiated_captured_func->RunAsync(
+            ctx, std::move(args), out_tensors.get(), std::move(done), prefix);
       };
     } else {
       std::vector<bool> can_move = ComputeMoveVector(indices);
       map_func = [raw_captured_func, indices, can_move](
-                     IteratorContext* ctx, const string& prefix,
-                     std::vector<Tensor> args,
+                     IteratorContext* ctx,
+                     InstantiatedCapturedFunction* instantiated_captured_func,
+                     const string& prefix, std::vector<Tensor> args,
                      std::shared_ptr<std::vector<Tensor>> out_tensors,
                      StatusCallback done) {
         const std::vector<Tensor>& captured_inputs =
@@ -141,7 +129,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     *output = new Dataset(ctx, input, func_, batch_size, num_parallel_calls,
                           drop_remainder, output_types_, output_shapes_,
                           std::move(captured_func), &ctx->eigen_cpu_device(),
-                          std::move(map_func));
+                          std::move(map_func), preserve_cardinality_);
   }
 
  private:
@@ -154,7 +142,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             const std::vector<PartialTensorShape>& output_shapes,
             std::unique_ptr<CapturedFunction> captured_func,
             const Eigen::ThreadPoolDevice* device,
-            MapAndBatchIteratorFunction map_func)
+            MapAndBatchIteratorFunction map_func, bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -165,7 +153,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           output_shapes_(output_shapes),
           captured_func_(std::move(captured_func)),
           device_(device),
-          map_func_(std::move(map_func)) {
+          map_func_(std::move(map_func)),
+          preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
 
@@ -190,6 +179,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       return "MapAndBatchDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -219,6 +217,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(func_, &f);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
 
       TF_RETURN_IF_ERROR(b->AddDataset(
           this,
@@ -228,7 +228,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
            std::make_pair(4, drop_remainder_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},      // Tensor list inputs.
           {std::make_pair("f", f),
-           std::make_pair("Targuments", other_arguments_types_attr)},  // Attrs
+           std::make_pair("Targuments", other_arguments_types_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -243,7 +245,11 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
             cond_var_(std::make_shared<condition_variable>()),
             num_parallel_calls_(std::make_shared<model::SharedState>(
                 params.dataset->num_parallel_calls_, mu_, cond_var_)),
-            map_func_(std::move(map_func)) {
+            map_func_(std::move(map_func)),
+            max_batch_results_(std::min(kMaxBatchResults,
+                                        (params.dataset->num_parallel_calls_ +
+                                         params.dataset->batch_size_ - 1) /
+                                            params.dataset->batch_size_)) {
         std::vector<string> components =
             str_util::Split(params.prefix, "::", str_util::SkipEmpty());
         prefix_end_ = components.back();
@@ -268,7 +274,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -280,9 +287,11 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           EnsureRunnerThreadStarted(ctx);
           while (batch_results_.empty() ||
                  batch_results_.front()->num_calls > 0) {
+            ++waiting_;
             RecordStop(ctx);
             cond_var_->wait(l);
             RecordStart(ctx);
+            --waiting_;
           }
           std::swap(result, batch_results_.front());
           batch_results_.pop_front();
@@ -306,7 +315,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         while (num_calls_ > 0) {
           cond_var_->wait(l);
         }
-        CHECK_EQ(num_calls_, 0);
+        DCHECK_EQ(num_calls_, 0);
         TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("call_counter"), call_counter_));
@@ -411,6 +420,15 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         std::shared_ptr<std::vector<Tensor>> return_values =
             std::make_shared<std::vector<Tensor>>();
         auto done = [this, ctx, result, return_values, offset](Status status) {
+          if (dataset()->preserve_cardinality_ &&
+              errors::IsOutOfRange(status)) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            status = errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                status.error_message());
+          }
           result->UpdateStatus(status, offset);
           if (status.ok()) {
             Status allocate_status =
@@ -437,8 +455,8 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                 // TODO(mrry): Add a version of DoParallelConcat that allows us
                 // to move `tensor` where possible, to speed up string tensor
                 // batching.
-                Status copy_status = ::tensorflow::functor::DoParallelConcat(
-                    *dataset()->device_, tensor, offset, batch);
+                Status copy_status =
+                    batch_util::CopyElementToSlice(tensor, batch, offset);
                 if (!copy_status.ok()) {
                   result->UpdateStatus(copy_status, offset);
                   break;
@@ -455,8 +473,9 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
 
         // Apply the map function on `input_element`, storing the result in
         // `return_values`, and invoking `done` when finished.
-        map_func_(ctx.get(), prefix(), std::move(input_element),
-                  std::move(return_values), std::move(done));
+        map_func_(ctx.get(), instantiated_captured_func_.get(), prefix(),
+                  std::move(input_element), std::move(return_values),
+                  std::move(done));
       }
 
       Status CopyPartialBatch(Tensor* output, const Tensor& value,
@@ -522,11 +541,14 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
                            bool* end_of_sequence) {
         mutex_lock l(result->mu);
         if (result->num_elements == 0) {
-          *end_of_sequence = true;
-          return Status::OK();
+          if (result->status.ok() || errors::IsOutOfRange(result->status)) {
+            *end_of_sequence = true;
+            return Status::OK();
+          } else {
+            *end_of_sequence = false;
+            return result->status;
+          }
         }
-        // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-        // should terminate the iteration early.
         if (!result->status.ok() && !errors::IsOutOfRange(result->status)) {
           // Deallocate tensors allocated for the output.
           result->output.clear();
@@ -556,7 +578,7 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         } else {
           *out_tensors = std::move(result->output);
         }
-        *end_of_sequence = result->num_elements == 0;
+        *end_of_sequence = false;
         return Status::OK();
       }
 
@@ -566,21 +588,30 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         RecordStart(ctx.get());
         auto stop_cleanup =
             gtl::MakeCleanup([this, &ctx]() { RecordStop(ctx.get()); });
-        new_calls.reserve(num_parallel_calls_->value);
+        {
+          tf_shared_lock l(*mu_);  // mu_ == num_parallel_calls_->mu
+          new_calls.reserve(num_parallel_calls_->value);
+        }
         auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
           int64 num_parallel_calls = num_parallel_calls_->value;
-          int64 max_batch_results =
-              (num_parallel_calls + dataset()->batch_size_ - 1) /
-              dataset()->batch_size_;
           return num_calls_ >= num_parallel_calls ||
-                 (batch_results_.size() > max_batch_results ||
-                  (batch_results_.size() == max_batch_results &&
+                 (batch_results_.size() > max_batch_results_ ||
+                  (batch_results_.size() == max_batch_results_ &&
                    call_counter_ % dataset()->batch_size_ == 0));
         };
         while (true) {
           {
             mutex_lock l(*mu_);
             while (!cancelled_ && busy()) {
+              if (waiting_ > 0 && num_calls_ < num_parallel_calls_->value &&
+                  max_batch_results_ < kMaxBatchResults) {
+                // If there is a caller waiting for a batch and the number of
+                // outstanding calls is not maxed out, it means we are out of
+                // `batch_results_` slots. Instead of waiting for a slot to open
+                // up, we create a new one to utilize CPU efficiently.
+                max_batch_results_++;
+                continue;
+              }
               RecordStop(ctx.get());
               cond_var_->wait(l);
               RecordStart(ctx.get());
@@ -758,9 +789,16 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
       // Buffer for storing the (intermediate) batch results.
       std::deque<std::shared_ptr<BatchResult>> batch_results_ GUARDED_BY(*mu_);
+      // Background thread used for coordinating input processing.
       std::unique_ptr<Thread> runner_thread_ GUARDED_BY(*mu_);
+      // Determines whether the transformation has been cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
+      // Identifies the number of callers currently waiting for a batch result.
+      int64 waiting_ GUARDED_BY(*mu_) = 0;
+      // Identifies the maximum number of batch results to store.
+      int64 max_batch_results_ GUARDED_BY(*mu_);
       string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
@@ -773,19 +811,18 @@ class MapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     const std::unique_ptr<CapturedFunction> captured_func_;
     const Eigen::ThreadPoolDevice* device_;  // not owned
     const MapAndBatchIteratorFunction map_func_;
+    const bool preserve_cardinality_;
   };
 
-  const int op_version_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDataset").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
-
-REGISTER_KERNEL_BUILDER(Name("MapAndBatchDatasetV2").Device(DEVICE_CPU),
-                        MapAndBatchDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMapAndBatchDataset").Device(DEVICE_CPU),
+    MapAndBatchDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/matching_files_dataset_op.cc b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
similarity index 99%
rename from tensorflow/core/kernels/data/matching_files_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
index d36b9e7e786774179e0fb236739908cb67fb5c2f..aa27a13416d093dd19475b97b51ac28489d4d177 100644
--- a/tensorflow/core/kernels/data/matching_files_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/matching_files_dataset_op.cc
@@ -366,8 +366,9 @@ class MatchingFilesDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("MatchingFilesDataset").Device(DEVICE_CPU),
-                        MatchingFilesDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMatchingFilesDataset").Device(DEVICE_CPU),
+    MatchingFilesDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
index 953e086de3786bcb101da9b8a15d5a19c0f8cc57..61811ea14eddc9f40987e12ce6343268da24a503 100644
--- a/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/non_serializable_dataset_op.cc
@@ -75,6 +75,8 @@ class NonSerializableDatasetOp : public UnaryDatasetOpKernel {
       return errors::Unimplemented(DebugString(), "::AsGraphDefInternal");
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    private:
     class Iterator : public DatasetIterator<Dataset> {
      public:
diff --git a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
index 068f854023064a90720ca0e51a94fb994be2386c..46233942f066de8fe799a958f164f8afa30e49ef 100644
--- a/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/numa_map_and_batch_dataset_op.cc
@@ -59,6 +59,9 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    // TODO(saeta): Implement support for preserve_cardinality logic.
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -133,6 +136,17 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       return "NumaMapAndBatchDatasetOp::Dataset";
     }
 
+    // TODO(b/120482302): Note that this is inaccurate until
+    // NumaMapAndBatchMapDataset modified to preserve cardinality.
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -206,7 +220,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        TF_RETURN_IF_ERROR(dataset()->captured_func_->Instantiate(ctx));
+        TF_RETURN_IF_ERROR(dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_));
         return Status::OK();
       }
 
@@ -1052,8 +1067,8 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
           {
             tracing::ScopedActivity trace(
                 "NumaMapAndBatch::Iterator::Worker::FunctionExecution");
-            s = dataset()->captured_func_->Run(ctx.get(), std::move(input),
-                                               &return_values);
+            s = instantiated_captured_func_->Run(ctx.get(), std::move(input),
+                                                 &return_values);
           }
           WORKER_VLOG(4) << "ran function for index: " << index
                          << ", sequence_number: " << sequence_number;
@@ -1099,6 +1114,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
       const std::shared_ptr<condition_variable> autotune_cond_var_;
       // The maximum number of parallel calls (can be auto-tuned).
       const std::shared_ptr<model::SharedState> num_parallel_calls_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
 
       // Caches the last-seen value of num_parallel_calls_->value to
       // short-circuit starting workers.
@@ -1127,6 +1143,7 @@ class NumaMapAndBatchDatasetOp : public UnaryDatasetOpKernel {
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0230f90aba1c849483da5f8d7297c44c8a1174de
--- /dev/null
+++ b/tensorflow/core/kernels/data/experimental/parallel_interleave_dataset_op.cc
@@ -0,0 +1,1085 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <atomic>
+#include <deque>
+#include <utility>
+
+#include "tensorflow/core/common_runtime/function.h"
+#include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/stats_aggregator.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/kernels/data/captured_function.h"
+#include "tensorflow/core/kernels/data/dataset_utils.h"
+#include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/lib/gtl/cleanup.h"
+#include "tensorflow/core/lib/random/random.h"
+#include "tensorflow/core/util/ptr_util.h"
+
+namespace tensorflow {
+namespace data {
+namespace {
+
+// See documentation in ../../ops/dataset_ops.cc for a high-level
+// description of the following op.
+
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+  }
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 cycle_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
+    OP_REQUIRES(ctx, cycle_length > 0,
+                errors::InvalidArgument("`cycle_length` must be > 0"));
+
+    int64 block_length = 0;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument(ctx, "block_length", &block_length));
+    OP_REQUIRES(ctx, block_length > 0,
+                errors::InvalidArgument("`block_length` must be > 0"));
+
+    bool sloppy = false;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
+
+    int64 buffer_output_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
+                                            &buffer_output_elements));
+    OP_REQUIRES(
+        ctx, buffer_output_elements > 0,
+        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
+
+    int64 prefetch_input_elements = 0;
+    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
+                                            &prefetch_input_elements));
+    OP_REQUIRES(
+        ctx, prefetch_input_elements >= 0,
+        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
+
+    std::unique_ptr<CapturedFunction> captured_func;
+    OP_REQUIRES_OK(
+        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
+                                      &captured_func));
+
+    *output =
+        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
+                    cycle_length, block_length, sloppy, buffer_output_elements,
+                    prefetch_input_elements, output_types_, output_shapes_);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            const NameAttrList& func,
+            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
+            int64 block_length, bool sloppy, int64 buffer_output_elements,
+            int64 prefetch_input_elements, const DataTypeVector& output_types,
+            const std::vector<PartialTensorShape>& output_shapes)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          interleave_func_(func),
+          captured_func_(std::move(captured_func)),
+          cycle_length_(cycle_length),
+          block_length_(block_length),
+          sloppy_(sloppy),
+          buffer_output_elements_(buffer_output_elements),
+          prefetch_input_elements_(prefetch_input_elements),
+          output_types_(output_types),
+          output_shapes_(output_shapes) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return output_types_;
+    }
+
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return output_shapes_;
+    }
+
+    string DebugString() const override {
+      return "ParallelInterleaveDatasetOp::Dataset";
+    }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
+      Node* input_node;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
+      Node* cycle_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
+      Node* block_length_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
+      Node* sloppy_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
+      Node* buffer_output_elements_node;
+      TF_RETURN_IF_ERROR(
+          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
+      Node* prefetch_input_elements_node;
+      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
+                                      &prefetch_input_elements_node));
+      DataTypeVector other_arguments_types;
+      other_arguments_types.reserve(captured_func_->captured_inputs().size());
+      std::vector<Node*> other_arguments;
+      other_arguments.reserve(captured_func_->captured_inputs().size());
+      for (const Tensor& t : captured_func_->captured_inputs()) {
+        Node* node;
+        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
+        other_arguments.emplace_back(node);
+        other_arguments_types.emplace_back(t.dtype());
+      }
+      AttrValue f;
+      b->BuildAttrValue(interleave_func_, &f);
+      AttrValue other_arguments_types_attr;
+      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this,
+          {{0, input_node},
+           {2, cycle_length_node},
+           {3, block_length_node},
+           {4, sloppy_node},
+           {5, buffer_output_elements_node},
+           {6, prefetch_input_elements_node}},
+          {{1, other_arguments}},
+          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
+      return Status::OK();
+    }
+
+   private:
+    int64 num_threads() const {
+      return cycle_length_ + prefetch_input_elements_;
+    }
+
+    // Parallel interleave's implementation is designed around a few principles:
+    //  1. Thread creation is relatively expensive. (Not reusing
+    //     threads causes a number of indirect costs such as poorer tcmalloc
+    //     performance due to thread-local caches, etc.) We allocate a fixed
+    //     number of threads at the start and never change. This is why we've
+    //     fused functionality that is theoretically orthogonal (i.e.
+    //     .prefetch()) into the implementation.
+    //  2. Drop-in replacement for standard interleave. The goal will be to
+    //     auto-opt people into an optimized implementation without any work
+    //     on the customer's part. We thus go through great pains to maintain
+    //     identical iteration orders, full determinism (disabled only via a
+    //     flag, etc.)
+    //  3. Performance across a variety of environments and I/O envelopes.
+    //
+    // The actual implementation centers around a collection of worker threads
+    // and their corresponding worker state (tracked in the `workers_` vector).
+    // Worker threads repeatedly receive a vector of Tensors that are used as
+    // input to the flat-map function (`captured_func_`). The output of this
+    // function must be a dataset. The worker thread then repeatedly calls
+    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
+    // that a caller will block waiting for an element to be produced.
+    //
+    // Pointers to these worker states are kept in 2 disjoint data structures:
+    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
+    //     in `workers_` that we are interleaving. Worker threads backing these
+    //     WorkerStates should be regularly producing values.
+    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
+    //     `workers_` that we will move to `interleave_indices_` when an
+    //     iterator in `interleave_indices_` is exhausted.
+    //
+    // The client calls `GetNext[Internal]()` to retrieve an output element. The
+    // internal implementation updates the state of `interleave_indices_` and
+    // `staging_indices_` as output iterators (run by the worker threads) are
+    // exhausted.
+    //
+    // `input_impl_` is the input iterator that generates arguments for the
+    // flat-map function (`captured_func_`). It is set to an iterator at
+    // Iterator construction, and is fixed until we consume all input elements.
+    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
+    // memory.
+    //
+    // A few invariants are maintained:
+    //  1. No element in interleave_indices_ should be a -1 unless
+    //     `staging_indices_` is empty and `input_impl_` is empty.
+    //  2. Every `worker_` element is pointed to by at most one element of the
+    //     union of `interleave_indices_` and `staging_indices_`.
+    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
+    //     an element in `interleave_indices_` or `staging_indices_`.
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params),
+            workers_(dataset()->num_threads()),
+            worker_thread_states_(dataset()->num_threads()) {}
+
+      ~Iterator() override {
+        mutex_lock l(mu_);
+        cancelled_ = true;
+        // Notify all workers in case they are blocked.
+        for (auto& worker : workers_) {
+          worker.cond_var.notify_all();
+        }
+      }
+
+      Status Initialize(IteratorContext* ctx) override {
+        TF_RETURN_IF_ERROR(
+            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      // It is implemented so that it matches the deterministic interleave
+      // unless getting the next element would block and we are allowed to be
+      // sloppy.
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        mutex_lock l(mu_);
+        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
+        while (!cancelled_) {
+          // Wait for an item to become available, blocking if necessary. If we
+          // are allowed to be sloppy, we can skip over input datasets that do
+          // not have an item readily available.
+          bool can_produce_elements = false;
+          bool must_wait_for_input = true;
+          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
+            int64 index = (next_index_ + i) % interleave_indices_.size();
+            int64 current_worker_index = interleave_indices_[index];
+            if (current_worker_index < 0) {
+              continue;  // Empty interleave elements.
+            }
+            WorkerState* current_worker = &workers_[current_worker_index];
+            can_produce_elements |= current_worker->MayHaveElements();
+            if (!current_worker->outputs.empty()) {
+              // We have an element!
+              next_index_ = index;
+              const bool element_acquired_sloppily =
+                  dataset()->sloppy_ && i > 1;
+              if (!element_acquired_sloppily) {
+                // If the element was acquired in the regular (non-sloppy)
+                // order, then advance the current block and cycle pointers to
+                // the next element in the regular order.
+                block_count_++;
+                if (block_count_ == dataset()->block_length_) {
+                  next_index_ = (index + 1) % interleave_indices_.size();
+                  block_count_ = 0;
+                }
+              } else {
+                block_count_ = 0;
+              }
+              *end_of_sequence = false;
+              Status s = current_worker->outputs.front().status;
+              current_worker->outputs.front().output.swap(*out_tensors);
+              current_worker->outputs.pop_front();
+              current_worker->cond_var.notify_one();
+              return s;
+            } else if (current_worker->is_producing && !dataset()->sloppy_) {
+              // current_worker.outputs.empty(), and we must wait for this
+              // iterator.
+              if (next_index_ != index) {
+                // We have advanced to a new iterator; reset block counts.
+                next_index_ = index;
+                block_count_ = 0;
+              }
+              break;
+            } else if (!current_worker->is_producing) {
+              // This iterator has reached end of input.
+              interleave_indices_[index] = -1;
+              if (input_impl_) {
+                // Start prefetching a new iterator.
+                std::vector<Tensor> args;
+                bool end_of_input = false;
+                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+                if (end_of_input) {
+                  input_impl_.reset();
+                } else {
+                  current_worker->SetInputs(s, std::move(args));
+                  staging_indices_.emplace_back(current_worker_index);
+                }
+              }
+
+              if (!staging_indices_.empty()) {
+                // Move a worker from `staging_indices_` to
+                // `interleave_indices_`.
+                interleave_indices_[index] = staging_indices_.front();
+                staging_indices_.pop_front();
+
+                next_index_ = (index + 1) % interleave_indices_.size();
+                block_count_ = 0;
+                // Restart the inner [for] loop
+                can_produce_elements = true;
+                must_wait_for_input = false;
+                break;
+              }
+            }
+          }
+
+          if (!can_produce_elements && !input_impl_) {
+            // No potential for future values.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
+
+          if (must_wait_for_input) {
+            // Wait for elements to become available.
+            RecordStop(ctx);
+            if (dataset()->sloppy_) {
+              sloppy_cond_var_.wait(l);
+            } else {
+              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
+            }
+            RecordStart(ctx);
+          }
+        }
+        return errors::Cancelled(
+            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeAsyncInterleaveManyNode(std::move(args),
+                                                  /*parameters=*/{});
+      }
+
+      Status SaveInternal(IteratorStateWriter* writer) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (input_impl_) {
+          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
+        } else {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("input_exhausted"), ""));
+        }
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("next_index"), next_index_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("block_count"), block_count_));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name("workers_size"), workers_.size()));
+        for (int i = 0; i < workers_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
+        }
+        for (int i = 0; i < worker_thread_states_.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
+                                               interleave_indices_.size()));
+        for (int i = 0; i < interleave_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("interleave_indices_", i)),
+              interleave_indices_[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
+                                               staging_indices_.size()));
+        for (int i = 0; i < staging_indices_.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat("staging_indices_", i)),
+              staging_indices_[i]));
+        }
+        if (!worker_threads_.empty()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name("worker_threads_running"), ""));
+        }
+        return Status::OK();
+      }
+
+      Status RestoreInternal(IteratorContext* ctx,
+                             IteratorStateReader* reader) override {
+        // The order of locking is important here to avoid deadlock.
+        mutex_lock l(mu_);
+        mutex_lock ckpt_l(ckpt_mu_);
+        if (!reader->Contains(full_name("input_exhausted"))) {
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
+        } else {
+          input_impl_.reset();
+        }
+        int64 temp;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
+        next_index_ = size_t(temp);
+        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
+        block_count_ = size_t(temp);
+
+        // Restore WorkerStates.
+        TF_RETURN_IF_ERROR(
+            reader->ReadScalar(full_name("workers_size"), &temp));
+        if (temp != dataset()->num_threads()) {
+          return errors::Internal("Expected ", dataset()->num_threads(),
+                                  " worker states but found ", temp, ".");
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
+        }
+        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
+        }
+
+        // Restore `interleave_indices_`.
+        std::set<int64> all_indices;
+        {
+          int64 interleave_size;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
+                                                &interleave_size));
+          interleave_indices_.reserve(interleave_size);
+          for (int64 i = 0; i < interleave_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("interleave_indices_", i)), &temp));
+            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            interleave_indices_.emplace_back(temp);
+          }
+        }
+
+        // Restore `staging_indices_`.
+        {
+          int64 staging_size;
+          TF_RETURN_IF_ERROR(
+              reader->ReadScalar(full_name("staging_size"), &staging_size));
+          for (int i = 0; i < staging_size; ++i) {
+            int64 temp;
+            TF_RETURN_IF_ERROR(reader->ReadScalar(
+                full_name(strings::StrCat("staging_indices_", i)), &temp));
+            if (all_indices.find(temp) != all_indices.end()) {
+              return errors::Internal(
+                  "Duplicate entry for ", temp,
+                  " found when reading interleave and staging indices.");
+            }
+            if (temp >= 0) {
+              all_indices.insert(temp);
+            }
+            staging_indices_.emplace_back(temp);
+          }
+        }
+
+        // Start Worker threads.
+        if (reader->Contains(full_name("worker_threads_running"))) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+          }
+        }
+        return Status::OK();
+      }
+
+     private:
+      // OutputElem contains the information from a call to GetNext by an output
+      // iterator.
+      struct OutputElem {
+        // The output iterator sets `status` if getting the output element
+        // fails.
+        Status status;
+        // The buffered data element.
+        std::vector<Tensor> output;
+
+        explicit OutputElem(const Status& s) : status(s) {}
+      };
+
+      // Worker threads operate on their relevant WorkerState structs.
+      //
+      // WorkerState's fields are all protected by mu_;
+      struct WorkerState {
+        // The arguments to be used to construct an output iterator.
+        std::vector<Tensor> input;
+        // The buffered output elements.
+        std::deque<OutputElem> outputs;
+        // Set to true iff the worker thread expects to append more elements to
+        // outputs. is_producing can be false despite !outputs.empty().
+        // Concretely, all output elements will have been consumed only when:
+        // is_producing == false && outputs.empty();
+        bool is_producing = false;
+        // Condition variable used to coordinate between threads. The worker
+        // thread waits on this condition variable when it is either (1) waiting
+        // for the main thread to add arguments to `input`, or (2) waiting for
+        // the main thread to consume an element of `outputs`. The main thread
+        // waits on cond_var if it is waiting for the worker thread to produce
+        // an element into `outputs` (this implies sloppy_==false).
+        condition_variable cond_var;
+
+        inline bool MayHaveElements() const {
+          return is_producing || !outputs.empty();
+        }
+
+        // Sets inputs for a worker thread and notifies it to start processing.
+        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
+          if (s.ok()) {
+            DCHECK(!MayHaveElements())
+                << "Tried to start inputs, despite already producing!";
+            input = std::move(input_arguments);
+            is_producing = true;
+            cond_var.notify_one();
+          } else {
+            outputs.emplace_back(s);
+          }
+        }
+      };
+
+      // The internal state of a worker thread that is not already captured
+      // in its `WorkerState`.
+      //
+      // This is needed only for checkpointing purposes. We keep this
+      // separate from `WorkerState` and guard its fields using a separate
+      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
+      struct WorkerThreadState {
+        // The output element that has been produced from the input iterator
+        // and is waiting to be added to `WorkerState.outputs`.
+        OutputElem output_elem;
+
+        // Whether the input iterator returned an `end_of_sequence`.
+        bool end_of_sequence = false;
+
+        // Status returned from `MakeIteratorFromInputElement`.
+        Status iterator_creation_status;
+
+        // The arguments to be used to construct `iterator`.
+        std::vector<Tensor> input;
+
+        std::unique_ptr<IteratorBase> iterator;
+
+        WorkerThreadState() : output_elem(Status::OK()) {}
+      };
+
+      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        if (worker_threads_.empty()) {
+          worker_threads_.reserve(dataset()->num_threads());
+          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
+            std::vector<Tensor> args;
+            bool end_of_input = false;
+            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
+            if (end_of_input) {
+              input_impl_.reset();
+              return Status::OK();
+            }
+            workers_[i].SetInputs(s, std::move(args));
+            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
+            worker_threads_.emplace_back(ctx->env()->StartThread(
+                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
+                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
+            if (i < dataset()->cycle_length_) {
+              interleave_indices_.push_back(i);
+            } else {
+              staging_indices_.push_back(i);
+            }
+          }
+          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
+          DCHECK(staging_indices_.size() ==
+                 dataset()->prefetch_input_elements_);
+        }
+        return Status::OK();
+      }
+
+      // Produces elements into the worker's output buffers.
+      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
+                        const int64 thread_index) {
+        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
+        //
+        // 1. Any local state that may need to be checkpointed should be kept
+        //    in `worker_thread_states_[thread_index]`.
+        // 2. `WorkerThreadState` should contain state that is needed only for
+        //    checkpointing, i.e., if we were to remove checkpointing support,
+        //    we could keep that state as local variables in this thread.
+        // 3. This thread should only read/write state at `thread_index`
+        //    and should not access other thread states.
+        // 4. When restoring from checkpoint, threads are started only after
+        //    the restore is complete.
+        // 5. Once restored from a checkpoint, the local state is edited only
+        //    by this thread. 3 & 4 allow making assumptions like temporarily
+        //    caching local state in this thread and using it outside a lock
+        //    e.g. `make_new_iterator`.
+        // 6. `ckpt_mu_` should be wisely used to create *consistent*
+        //    checkpoint markers.
+
+        // std::function arguments are copy-constructable, so we pass raw
+        // pointers, and then immediately wrap them to ensure correct ownership.
+        RecordStart(ctx.get());
+        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
+          mutex_lock l(mu_);
+          workers_[thread_index].cond_var.notify_all();
+          RecordStop(ctx.get());
+        });
+        bool make_new_iterator;
+        {
+          tf_shared_lock l(ckpt_mu_);
+          // Decide whether a new iterator should be built.
+          // 1. If there is an existing iterator, we use it.
+          // 2. If there was an error in iterator creation that could not be
+          //    notified to the client we attempt to send that to the client
+          //    first.
+          make_new_iterator =
+              worker_thread_states_[thread_index].iterator == nullptr &&
+              worker_thread_states_[thread_index].iterator_creation_status.ok();
+        }
+        // Even though `make_new_iterator` has cached values from
+        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
+        // it is safe to *read* `make_new_iterator`outside of a lock without
+        // worrying about concurrent changes to values in
+        // `worker_thread_states_[thread_index]`. See comment at the start of
+        // this function for details.
+        while (true) {
+          // Whether creation of the iterator succeeded.
+          Status iterator_creation_status;
+          // 1. Build a new iterator or use the existing one.
+          if (make_new_iterator) {
+            // 1a. Get new input tensors or use the exiting ones.
+            bool read_new_input;
+            {
+              tf_shared_lock l(ckpt_mu_);
+              // worker_thread_states_[thread_index].input will be non-empty
+              // if checkpointing happened at CHECKPOINT_MARKER_A.
+              read_new_input =
+                  worker_thread_states_[thread_index].input.empty();
+            }
+
+            if (read_new_input) {
+              mutex_lock l(mu_);
+              while (!cancelled_ && !workers_[thread_index].is_producing) {
+                RecordStop(ctx.get());
+                workers_[thread_index].cond_var.wait(l);
+                RecordStart(ctx.get());
+              }
+              if (cancelled_) return;
+              // Copy the input tensors so that we do not need to block on `mu_`
+              // when building the iterator.
+              // We keep a copy of the input tensors in
+              // `WorkerThreadState.input` till the iterator is in use. This is
+              // used in `RestoreInternal` to re-build the iterator.
+              // TODO(b/78046638): Explore ways to avoid tracking the input
+              // tensors.
+              tf_shared_lock ckpt_l(ckpt_mu_);
+              worker_thread_states_[thread_index].input.swap(
+                  workers_[thread_index].input);
+              // CHECKPOINT_MARKER_A
+              // We have the input tensors but have not built the iterator yet.
+            }
+
+            // 1b. Run the user defined function to produce a new iterator.
+            {
+              tf_shared_lock l(ckpt_mu_);
+              worker_thread_states_[thread_index].iterator_creation_status =
+                  MakeIteratorFromInputElement(
+                      ctx.get(), worker_thread_states_[thread_index].input,
+                      thread_index, *instantiated_captured_func_, prefix(),
+                      &worker_thread_states_[thread_index].iterator);
+              iterator_creation_status =
+                  worker_thread_states_[thread_index].iterator_creation_status;
+              if (!iterator_creation_status.ok()) {
+                worker_thread_states_[thread_index].input.clear();
+              }
+              // CHECKPOINT_MARKER_B
+              // Either an iterator has been successfully built and placed in
+              // `worker_thread_states_[thread_index].iterator` or it failed and
+              // a non-OK status has been put in
+              // `worker_thread_states_[thread_index].iterator_creation_status`.
+            }
+          } else {
+            tf_shared_lock l(ckpt_mu_);
+            iterator_creation_status =
+                worker_thread_states_[thread_index].iterator_creation_status;
+            // Mark that we have used up the restored iterator.
+            make_new_iterator = true;
+          }
+          // 2. Start producing elements or send error state to client if
+          //    iterator creation failed.
+          if (!iterator_creation_status.ok()) {
+            mutex_lock l(mu_);
+            // Wait for space in the prefetch queue.
+            while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                      dataset()->buffer_output_elements_) {
+              RecordStop(ctx.get());
+              workers_[thread_index].cond_var.wait(l);
+              RecordStart(ctx.get());
+            }
+            if (cancelled_) return;
+            tf_shared_lock ckpt_l(ckpt_mu_);
+            workers_[thread_index].outputs.emplace_back(
+                iterator_creation_status);
+            workers_[thread_index].is_producing = false;
+            worker_thread_states_[thread_index].iterator_creation_status =
+                Status::OK();
+            // CHECKPOINT_MARKER_C
+            // Non-OK iterator creation status has been notified to the
+            // client.
+            workers_[thread_index].cond_var.notify_one();
+          } else {
+            bool end_of_sequence = false;
+            while (!end_of_sequence) {
+              // 3.a Produce an element!
+              {
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                if (worker_thread_states_[thread_index]
+                        .output_elem.status.ok() &&
+                    worker_thread_states_[thread_index]
+                        .output_elem.output.empty() &&
+                    !worker_thread_states_[thread_index].end_of_sequence) {
+                  worker_thread_states_[thread_index].output_elem.status =
+                      worker_thread_states_[thread_index].iterator->GetNext(
+                          ctx.get(),
+                          &worker_thread_states_[thread_index]
+                               .output_elem.output,
+                          &worker_thread_states_[thread_index].end_of_sequence);
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                } else {
+                  end_of_sequence =
+                      worker_thread_states_[thread_index].end_of_sequence;
+                }
+                // CHECKPOINT_MARKER_D
+                // An element has been read or an error or end_of_sequence has
+                // been received from the input iterator and is waiting to be
+                // sent to client.
+              }
+
+              // 3.b Make it available to the client.
+              {
+                mutex_lock l(mu_);
+
+                // Wait for space in the prefetch queue.
+                while (!cancelled_ && workers_[thread_index].outputs.size() ==
+                                          dataset()->buffer_output_elements_) {
+                  RecordStop(ctx.get());
+                  workers_[thread_index].cond_var.wait(l);
+                  RecordStart(ctx.get());
+                }
+                if (cancelled_) return;
+
+                tf_shared_lock ckpt_l(ckpt_mu_);
+                workers_[thread_index].is_producing = !end_of_sequence;
+
+                // Output the element.
+
+                // Move the temporary state in WorkerThreadState to WorkerState
+                // and mark it as used.
+                if (end_of_sequence) {
+                  worker_thread_states_[thread_index].iterator.reset();
+                  worker_thread_states_[thread_index].input.clear();
+                  worker_thread_states_[thread_index].end_of_sequence = false;
+                } else {
+                  workers_[thread_index].outputs.emplace_back(
+                      worker_thread_states_[thread_index].output_elem.status);
+                  workers_[thread_index].outputs.back().output.swap(
+                      worker_thread_states_[thread_index].output_elem.output);
+                }
+                worker_thread_states_[thread_index].output_elem.status =
+                    Status::OK();
+                if (dataset()->sloppy_) {
+                  sloppy_cond_var_.notify_one();
+                } else {
+                  workers_[thread_index].cond_var.notify_one();
+                }
+                // CHECKPOINT_MARKER_E
+                // Output element or iterator status has been sent to the
+                // client.
+              }
+            }
+          }
+        }
+      }
+
+      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_", index);
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            workers_[index].input.size()));
+        for (int i = 0; i < workers_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              workers_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_outputs_size")),
+            workers_[index].outputs.size()));
+        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
+          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+              writer, workers_[index].outputs[i],
+              full_name(strings::StrCat(prefix, "_outputs_", i))));
+        }
+        if (workers_[index].is_producing) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_is_producing")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
+                                   IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        workers_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          workers_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &workers_[index].input.back()));
+        }
+        int64 outputs_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
+            &outputs_size));
+        for (int i = 0; i < outputs_size; ++i) {
+          workers_[index].outputs.emplace_back(Status::OK());
+          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+              reader, &workers_[index].outputs.back(),
+              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
+        }
+        if (reader->Contains(
+                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
+          workers_[index].is_producing = true;
+        } else {
+          workers_[index].is_producing = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
+                                          int index)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string prefix = strings::StrCat("worker_thread_", index);
+        if (worker_thread_states_[index].iterator != nullptr) {
+          TF_RETURN_IF_ERROR(
+              SaveInput(writer, worker_thread_states_[index].iterator));
+        } else {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
+        }
+        TF_RETURN_IF_ERROR(writer->WriteScalar(
+            full_name(strings::StrCat(prefix, "_input_size")),
+            worker_thread_states_[index].input.size()));
+        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              full_name(strings::StrCat(prefix, "_input_", i)),
+              worker_thread_states_[index].input[i]));
+        }
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_iterator_creation_status"),
+            worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
+            writer, worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(prefix, "_output"))));
+        if (worker_thread_states_[index].end_of_sequence) {
+          TF_RETURN_IF_ERROR(writer->WriteScalar(
+              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
+        }
+        return Status::OK();
+      }
+
+      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
+                                         IteratorContext* ctx)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        string worker_prefix = strings::StrCat("worker_thread_", index);
+        // Restore inputs.
+        int64 input_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(worker_prefix, "_input_size")),
+            &input_size));
+        worker_thread_states_[index].input.reserve(input_size);
+        for (int i = 0; i < input_size; ++i) {
+          worker_thread_states_[index].input.emplace_back();
+          TF_RETURN_IF_ERROR(reader->ReadTensor(
+              full_name(strings::StrCat(worker_prefix, "_input_", i)),
+              &worker_thread_states_[index].input.back()));
+        }
+        // Restore iterator.
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
+          worker_thread_states_[index].iterator.reset();
+        } else {
+          std::unique_ptr<IteratorBase> iterator;
+          Status s = MakeIteratorFromInputElement(
+              ctx, worker_thread_states_[index].input, index,
+              *instantiated_captured_func_, prefix(), &iterator);
+          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
+          worker_thread_states_[index].iterator.swap(iterator);
+        }
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
+            &worker_thread_states_[index].iterator_creation_status));
+        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
+            reader, &worker_thread_states_[index].output_elem,
+            full_name(strings::StrCat(worker_prefix, "_output"))));
+        if (reader->Contains(full_name(
+                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
+          worker_thread_states_[index].end_of_sequence = true;
+        } else {
+          worker_thread_states_[index].end_of_sequence = false;
+        }
+        return Status::OK();
+      }
+
+      Status WriteOutputElemLocked(IteratorStateWriter* writer,
+                                   const OutputElem& output_elem,
+                                   const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(WriteStatusLocked(
+            writer, strings::StrCat(prefix, "_status"), output_elem.status));
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
+                                output_elem.output.size()));
+        for (int i = 0; i < output_elem.output.size(); ++i) {
+          TF_RETURN_IF_ERROR(writer->WriteTensor(
+              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
+        }
+        return Status::OK();
+      }
+
+      Status ReadOutputElemLocked(IteratorStateReader* reader,
+                                  OutputElem* output_elem, const string& prefix)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(ReadStatusLocked(
+            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
+        int64 output_size;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            strings::StrCat(prefix, "_output_size"), &output_size));
+        output_elem->output.reserve(output_size);
+        for (int i = 0; i < output_size; ++i) {
+          output_elem->output.emplace_back();
+          TF_RETURN_IF_ERROR(
+              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
+                                 &output_elem->output.back()));
+        }
+        return Status::OK();
+      }
+
+      Status WriteStatusLocked(IteratorStateWriter* writer,
+                               const string& prefix, const Status& status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        TF_RETURN_IF_ERROR(
+            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
+                                static_cast<int64>(status.code())));
+        if (!status.ok()) {
+          TF_RETURN_IF_ERROR(
+              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
+                                  status.error_message()));
+        }
+        return Status::OK();
+      }
+
+      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
+                              Status* status)
+          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
+        int64 code_int;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name(strings::StrCat(prefix, "_code")), &code_int));
+        error::Code code = static_cast<error::Code>(code_int);
+
+        if (code != error::Code::OK) {
+          string error_message;
+          TF_RETURN_IF_ERROR(reader->ReadScalar(
+              full_name(strings::StrCat(prefix, "_msg")), &error_message));
+          *status = Status(code, error_message);
+        } else {
+          *status = Status::OK();
+        }
+        return Status::OK();
+      }
+
+      // Mutex & condition variable to guard mutable iterator internals and
+      // coordinate among worker threads and client thread[s].
+      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
+      // The main thread waits on this condition variable if running in sloppy
+      // mode and no values are available.
+      condition_variable sloppy_cond_var_;
+      // Mutex used to wait for a consistent state while checkpointing.
+      // Only Save and Restore require an exclusive lock on this mutex. In
+      // other scenarios we just acquire a shared lock so the pipeline's
+      // performance should not be affected in the absence of checkpointing.
+      // A thread must not wait on any condition variable while holding
+      // `ckpt_mu_` in either shared or exclusive modes.
+      mutex ckpt_mu_;
+
+      // The iterator producing elements which are converted to datasets by
+      // the dataset()->captured_func_ then interleaved together.
+      // input_impl_ is reset when we have exhausted its input.
+      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
+
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+
+      // The WorkerState structs the worker threads operate on.
+      // workers_ elements are in at most one of interleave_ and staging_.
+      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
+
+      // Stores the temporary state of WorkerThreads which is not stored in
+      // WorkerState. This is used for checkpointing purposes only.
+      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
+
+      // Indices in `workers_` of iterators to interleave.
+      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
+      // Indices in `workers_` of prefetched iterators.
+      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
+
+      // The index into output_elements_ for next element to produce.
+      size_t next_index_ GUARDED_BY(mu_) = 0;
+      // The number of items produced so far within the block
+      size_t block_count_ GUARDED_BY(mu_) = 0;
+      // Flag to instruct the worker threads to exit.
+      bool cancelled_ GUARDED_BY(mu_) = false;
+      // The worker threads. This must be last to ensure the
+      // threads have exited before any other members are deallocated.
+      // TODO(b/65178177): Avoid allocating additional threads.
+      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
+    };
+
+    const DatasetBase* const input_;
+    const NameAttrList interleave_func_;
+    const std::unique_ptr<CapturedFunction> captured_func_;
+    const int64 cycle_length_;
+    const int64 block_length_;
+    const bool sloppy_;
+    const int64 buffer_output_elements_;
+    const int64 prefetch_input_elements_;
+    const DataTypeVector output_types_;
+    const std::vector<PartialTensorShape> output_shapes_;
+  };
+
+  DataTypeVector output_types_;
+  std::vector<PartialTensorShape> output_shapes_;
+  NameAttrList interleave_func_;
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParallelInterleaveDataset").Device(DEVICE_CPU),
+    ParallelInterleaveDatasetOp);
+
+}  // namespace
+}  // namespace data
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/parse_example_dataset_op.cc b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
similarity index 85%
rename from tensorflow/core/kernels/data/parse_example_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
index 608b39d5f50e11211572d849a97b0b6c91ff4590..ea99a8b32c5a945f30945369ef2ed4f4b6725887 100644
--- a/tensorflow/core/kernels/data/parse_example_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/parse_example_dataset_op.cc
@@ -183,99 +183,12 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto map_fn = [this](IteratorContext* ctx, const string& prefix,
-                           std::vector<Tensor> input_element,
-                           std::vector<Tensor>* result, StatusCallback done) {
-        (*ctx->runner())([this, ctx, input_element, result, done]() {
-          thread::ThreadPool* device_threadpool =
-              ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers;
-          std::vector<string> slice_vec;
-          for (const Tensor& t : input_element) {
-            auto serialized_t = t.flat<string>();
-            gtl::ArraySlice<string> slice(serialized_t.data(),
-                                          serialized_t.size());
-            for (auto it = slice.begin(); it != slice.end(); it++)
-              slice_vec.push_back(*it);
-          }
-          example::FastParseExampleConfig config = config_;
-          // local copy of config_ for modification.
-          auto stats_aggregator = ctx->stats_aggregator();
-          if (stats_aggregator) {
-            config.collect_feature_stats = true;
-          }
-          example::Result example_result;
-          Status s = FastParseExample(config, slice_vec, {}, device_threadpool,
-                                      &example_result);
-          if (s.ok()) {
-            (*result).resize(key_to_output_index_.size());
-            for (int d = 0; d < dense_keys_.size(); ++d) {
-              int output_index = key_to_output_index_.at(dense_keys_[d]);
-              CHECK(example_result.dense_values[d].dtype() ==
-                    output_dtypes()[output_index])
-                  << "Got wrong type for FastParseExample return value " << d
-                  << " (expected "
-                  << DataTypeString(output_dtypes()[output_index]) << ", got "
-                  << DataTypeString(example_result.dense_values[d].dtype())
-                  << ").";
-              CHECK(output_shapes()[output_index].IsCompatibleWith(
-                  example_result.dense_values[d].shape()))
-                  << "Got wrong shape for FastParseExample return value " << d
-                  << " (expected "
-                  << output_shapes()[output_index].DebugString() << ", got "
-                  << example_result.dense_values[d].shape().DebugString()
-                  << ").";
-              (*result)[output_index] = example_result.dense_values[d];
-            }
-            for (int d = 0; d < sparse_keys_.size(); ++d) {
-              int output_index = key_to_output_index_.at(sparse_keys_[d]);
-              (*result)[output_index] =
-                  Tensor(ctx->allocator({}), DT_VARIANT, {3});
-              Tensor& serialized_sparse = (*result)[output_index];
-              auto serialized_sparse_t = serialized_sparse.vec<Variant>();
-              serialized_sparse_t(0) = example_result.sparse_indices[d];
-              serialized_sparse_t(1) = example_result.sparse_values[d];
-              serialized_sparse_t(2) = example_result.sparse_shapes[d];
-              CHECK(serialized_sparse.dtype() == output_dtypes()[output_index])
-                  << "Got wrong type for FastParseExample return value " << d
-                  << " (expected "
-                  << DataTypeString(output_dtypes()[output_index]) << ", got "
-                  << DataTypeString(serialized_sparse.dtype()) << ").";
-              CHECK(output_shapes()[output_index].IsCompatibleWith(
-                  serialized_sparse.shape()))
-                  << "Got wrong shape for FastParseExample return value " << d
-                  << " (expected "
-                  << output_shapes()[output_index].DebugString() << ", got "
-                  << serialized_sparse.shape().DebugString() << ").";
-            }
-            // TODO(b/111553342): User provided tags instead of fixed tag.
-            if (stats_aggregator) {
-              stats_aggregator->IncrementCounter(
-                  "examples_count", "trainer",
-                  example_result.feature_stats.size());
-              for (example::PerExampleFeatureStats feature_stats :
-                   example_result.feature_stats) {
-                stats_aggregator->AddToHistogram(
-                    "features",
-                    {static_cast<double>(feature_stats.features_count)});
-                stats_aggregator->IncrementCounter(
-                    "features_count", "trainer", feature_stats.features_count);
-                stats_aggregator->IncrementCounter(
-                    "feature_values_count", "trainer",
-                    feature_stats.feature_values_count);
-                stats_aggregator->AddToHistogram(
-                    "feature-values",
-                    {static_cast<double>(feature_stats.feature_values_count)});
-              }
-            }
-          }
-          done(s);
-        });
-      };
-
+      std::unique_ptr<ParallelMapFunctor> parse_example_functor(
+          new ParseExampleFunctor(this));
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParseExample")}, input_,
-          /*init_func=*/nullptr, std::move(map_fn), num_parallel_calls_,
-          sloppy_);
+          std::move(parse_example_functor), num_parallel_calls_, sloppy_,
+          /*preserve_cardinality=*/true);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -290,6 +203,8 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
       return "ParseExampleDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -341,6 +256,111 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
     }
 
    private:
+    class ParseExampleFunctor : public ParallelMapFunctor {
+     public:
+      explicit ParseExampleFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input, std::vector<Tensor>* output,
+                   StatusCallback callback) override {
+        (*ctx->runner())([this, ctx, input, output, callback]() {
+          thread::ThreadPool* device_threadpool =
+              ctx->lib()->device()->tensorflow_cpu_worker_threads()->workers;
+          std::vector<string> slice_vec;
+          for (const Tensor& t : input) {
+            auto serialized_t = t.flat<string>();
+            gtl::ArraySlice<string> slice(serialized_t.data(),
+                                          serialized_t.size());
+            for (auto it = slice.begin(); it != slice.end(); it++)
+              slice_vec.push_back(*it);
+          }
+          example::FastParseExampleConfig config = dataset_->config_;
+          // local copy of config_ for modification.
+          auto stats_aggregator = ctx->stats_aggregator();
+          if (stats_aggregator) {
+            config.collect_feature_stats = true;
+          }
+          example::Result example_result;
+          Status s = FastParseExample(config, slice_vec, {}, device_threadpool,
+                                      &example_result);
+          if (s.ok()) {
+            (*output).resize(dataset_->key_to_output_index_.size());
+            for (int d = 0; d < dataset_->dense_keys_.size(); ++d) {
+              int output_index =
+                  dataset_->key_to_output_index_.at(dataset_->dense_keys_[d]);
+              DCHECK(example_result.dense_values[d].dtype() ==
+                     dataset_->output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(dataset_->output_dtypes()[output_index])
+                  << ", got "
+                  << DataTypeString(example_result.dense_values[d].dtype())
+                  << ").";
+              DCHECK(dataset_->output_shapes()[output_index].IsCompatibleWith(
+                  example_result.dense_values[d].shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << dataset_->output_shapes()[output_index].DebugString()
+                  << ", got "
+                  << example_result.dense_values[d].shape().DebugString()
+                  << ").";
+              (*output)[output_index] = example_result.dense_values[d];
+            }
+            for (int d = 0; d < dataset_->sparse_keys_.size(); ++d) {
+              int output_index =
+                  dataset_->key_to_output_index_.at(dataset_->sparse_keys_[d]);
+              (*output)[output_index] =
+                  Tensor(ctx->allocator({}), DT_VARIANT, {3});
+              Tensor& serialized_sparse = (*output)[output_index];
+              auto serialized_sparse_t = serialized_sparse.vec<Variant>();
+              serialized_sparse_t(0) = example_result.sparse_indices[d];
+              serialized_sparse_t(1) = example_result.sparse_values[d];
+              serialized_sparse_t(2) = example_result.sparse_shapes[d];
+              DCHECK(serialized_sparse.dtype() ==
+                     dataset_->output_dtypes()[output_index])
+                  << "Got wrong type for FastParseExample return value " << d
+                  << " (expected "
+                  << DataTypeString(dataset_->output_dtypes()[output_index])
+                  << ", got " << DataTypeString(serialized_sparse.dtype())
+                  << ").";
+              DCHECK(dataset_->output_shapes()[output_index].IsCompatibleWith(
+                  serialized_sparse.shape()))
+                  << "Got wrong shape for FastParseExample return value " << d
+                  << " (expected "
+                  << dataset_->output_shapes()[output_index].DebugString()
+                  << ", got " << serialized_sparse.shape().DebugString()
+                  << ").";
+            }
+            // TODO(b/111553342): User provided tags instead of fixed tag.
+            if (stats_aggregator) {
+              stats_aggregator->IncrementCounter(
+                  "examples_count", "trainer",
+                  example_result.feature_stats.size());
+              for (example::PerExampleFeatureStats feature_stats :
+                   example_result.feature_stats) {
+                stats_aggregator->AddToHistogram(
+                    "features",
+                    {static_cast<double>(feature_stats.features_count)});
+                stats_aggregator->IncrementCounter(
+                    "features_count", "trainer", feature_stats.features_count);
+                stats_aggregator->IncrementCounter(
+                    "feature_values_count", "trainer",
+                    feature_stats.feature_values_count);
+                stats_aggregator->AddToHistogram(
+                    "feature-values",
+                    {static_cast<double>(feature_stats.feature_values_count)});
+              }
+            }
+          }
+          callback(s);
+        });
+      }
+
+     private:
+      const Dataset* dataset_;
+    };
+
     const DatasetBase* const input_;
     const std::vector<Tensor> dense_defaults_;
     const std::vector<string> sparse_keys_;
@@ -369,8 +389,9 @@ class ParseExampleDatasetOp : public UnaryDatasetOpKernel {
   std::vector<std::size_t> elements_per_stride_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ParseExampleDataset").Device(DEVICE_CPU),
-                        ParseExampleDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalParseExampleDataset").Device(DEVICE_CPU),
+    ParseExampleDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
index 2c6179d9f5938d5ef413a83f2aad43fb96f67f47..af024520982106aead1b4bf3d09886bcc42d73d1 100644
--- a/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
+++ b/tensorflow/core/kernels/data/experimental/prefetching_kernels.cc
@@ -27,434 +27,6 @@ namespace tensorflow {
 namespace data {
 namespace {
 
-struct BufferElement {
-  // The producer sets `status` if getting the input element fails.
-  Status status;
-  // The buffered data element.
-  std::vector<Tensor> value;
-};
-
-using FunctionBufferCallback = std::function<void(const BufferElement&)>;
-
-class FunctionBufferingResource : public ResourceBase {
- public:
-  FunctionBufferingResource(FunctionLibraryRuntime* lib,
-                            std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                            const NameAttrList& func, int64 buffer_size,
-                            const string& source_device,
-                            const string& target_device,
-                            const std::vector<Tensor>& func_args,
-                            const DataTypeVector& output_types)
-      : lib_(lib),
-        pflr_(std::move(pflr)),
-        func_(func),
-        buffer_size_(buffer_size),
-        source_device_(source_device),
-        target_device_(target_device),
-        func_args_(func_args),
-        output_types_(output_types),
-        handle_(kInvalidHandle),
-        is_buffering_(false),
-        end_of_sequence_(false),
-        cancelled_(false) {}
-
-  ~FunctionBufferingResource() override {
-    Cancel();
-  }
-
-  string DebugString() override {
-    return strings::StrCat("FunctionBufferingResource. Size: ", buffer_size_,
-                           "; target_device: ", target_device_);
-  }
-
-  // Instantiates the function the first time it's called. After that it caches
-  // the handle.
-  Status Instantiate() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    // Re-use existing handle if it's been set, effectively caching it.
-    if (handle_ != kInvalidHandle) {
-      return Status::OK();
-    }
-    AttrValueMap attr_values = func_.attr();
-    FunctionLibraryRuntime::InstantiateOptions opts;
-    opts.target = target_device_;
-    return lib_->Instantiate(func_.name(), AttrSlice(&attr_values), opts,
-                             &handle_);
-  }
-
-  // Returns true if we've got to the end of the sequence and exhausted the
-  // buffer.
-  bool Finished() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    return end_of_sequence_ && buffer_.empty();
-  }
-
-  // Cancels any buffering / prefetching going on.
-  void Cancel() LOCKS_EXCLUDED(mu_) {
-    mutex_lock l(mu_);
-    cancelled_ = true;
-    while (is_buffering_) {
-      cond_var_.wait(l);
-    }
-  }
-
-  // Cancels all pending operations and then clears out the state.
-  void Reset() LOCKS_EXCLUDED(mu_) {
-    Cancel();
-    mutex_lock l(mu_);
-    buffer_.clear();
-    requests_.clear();
-    is_buffering_ = false;
-    end_of_sequence_ = false;
-    cancelled_ = false;
-  }
-
-  // If the buffer has anything, runs `callback` on the first element in the
-  // buffer, else schedules the `callback` to be called. Requires `args` and
-  // `lib` in case more function calls need to be scheduled.
-  void MaybeGet(FunctionBufferCallback callback) LOCKS_EXCLUDED(mu_) {
-    bool start_buffering = false;
-    bool produced_output = false;
-    BufferElement buffer_element;
-    {
-      mutex_lock l(mu_);
-      if (!is_buffering_ && !end_of_sequence_) {
-        start_buffering = true;
-      }
-      if (!buffer_.empty()) {
-        produced_output = true;
-        std::swap(buffer_element, buffer_.front());
-        buffer_.pop_front();
-      } else {
-        produced_output = false;
-        requests_.push_back(std::move(callback));
-      }
-    }
-    if (produced_output) {
-      callback(buffer_element);
-    }
-    if (start_buffering) {
-      FillBuffer();
-    }
-  }
-
- private:
-  void FillBuffer() LOCKS_EXCLUDED(mu_) {
-    FunctionLibraryRuntime::Handle handle;
-    std::vector<FunctionBufferCallback> cancellation_callbacks;
-    std::vector<BufferElement> cancellation_buffer_elements;
-    bool cancelled = false;
-    {
-      mutex_lock l(mu_);
-      handle = handle_;
-      if (cancelled_) {
-        cancelled = true;
-        // Run through and fulfill all pending requests, if possible.
-        while (!requests_.empty()) {
-          if (!buffer_.empty()) {
-            cancellation_buffer_elements.push_back(std::move(buffer_.front()));
-            buffer_.pop_front();
-            cancellation_callbacks.push_back(std::move(requests_.front()));
-            requests_.pop_front();
-          } else {
-            LOG(ERROR) << "Buffer ran out of elements and we couldn't satisfy: "
-                       << requests_.size() << " requests";
-            break;
-          }
-        }
-        is_buffering_ = false;
-      } else {
-        is_buffering_ = true;
-      }
-    }
-    if (cancelled) {
-      for (int i = 0; i < cancellation_callbacks.size(); ++i) {
-        cancellation_callbacks[i](cancellation_buffer_elements[i]);
-      }
-      cond_var_.notify_all();
-      return;
-    }
-    FunctionLibraryRuntime::Options opts;
-    // Copied from CapturedFunction::generate_step_id();
-    opts.step_id = -std::abs(static_cast<int64>(random::New64()));
-    opts.source_device = source_device_;
-    AllocatorAttributes arg_alloc_attr;
-    arg_alloc_attr.set_on_host(true);
-    opts.args_alloc_attrs.push_back(arg_alloc_attr);
-    for (const auto& dtype : output_types_) {
-      AllocatorAttributes ret_alloc_attrs;
-      if (DataTypeAlwaysOnHost(dtype)) {
-        ret_alloc_attrs.set_on_host(true);
-      }
-      opts.rets_alloc_attrs.push_back(ret_alloc_attrs);
-    }
-    if (opts.source_device != target_device_) {
-      opts.remote_execution = true;
-    }
-    opts.create_rendezvous = true;
-    auto* rets = new std::vector<Tensor>;
-    lib_->Run(opts, handle, func_args_, rets,
-              [this, rets](const Status& status) {
-                FunctionBufferCallback callback = nullptr;
-                BufferElement buffer_front;
-                bool restart_buffering = false;
-                {
-                  mutex_lock l(mu_);
-                  BufferElement buffer_element;
-                  buffer_element.status = status;
-                  if (status.ok()) {
-                    buffer_element.value.swap(*rets);
-                  } else {
-                    end_of_sequence_ = true;
-                    is_buffering_ = false;
-                  }
-                  buffer_.push_back(std::move(buffer_element));
-                  if (!requests_.empty()) {
-                    buffer_front = std::move(buffer_.front());
-                    buffer_.pop_front();
-                    callback = std::move(requests_.front());
-                    requests_.pop_front();
-                  }
-                  if (buffer_.size() < buffer_size_ && !end_of_sequence_) {
-                    restart_buffering = true;
-                  } else {
-                    // When the buffer is full, we don't want to call
-                    // FillBuffer() unless we're in cancellation phase in which
-                    // case FillBuffer() will do the final cleanup post
-                    // cancellation.
-                    if (cancelled_) {
-                      restart_buffering = true;
-                    }
-                    is_buffering_ = false;
-                  }
-                }
-                if (callback != nullptr) {
-                  callback(buffer_front);
-                }
-                if (restart_buffering) {
-                  FillBuffer();
-                }
-              });
-  }
-
-  mutex mu_;
-  FunctionLibraryRuntime* lib_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  NameAttrList func_;
-  const int64 buffer_size_;
-  const string source_device_;
-  const string target_device_;
-  const std::vector<Tensor> func_args_;
-  const DataTypeVector output_types_;
-  FunctionLibraryRuntime::Handle handle_ GUARDED_BY(mu_);
-  std::deque<BufferElement> buffer_ GUARDED_BY(mu_);
-  std::deque<FunctionBufferCallback> requests_ GUARDED_BY(mu_);
-  bool is_buffering_ GUARDED_BY(mu_);
-  bool end_of_sequence_ GUARDED_BY(mu_);
-  bool cancelled_ GUARDED_BY(mu_);
-  condition_variable cond_var_;
-};
-
-class FunctionBufferResourceHandleOp : public OpKernel {
- public:
-  explicit FunctionBufferResourceHandleOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx), flib_def_(nullptr) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("buffer_size", &buffer_size_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("container", &container_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("shared_name", &name_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-  }
-
-  ~FunctionBufferResourceHandleOp() override {
-    if (cinfo_.resource_is_private_to_kernel()) {
-      if (!cinfo_.resource_manager()
-               ->Delete<FunctionBufferingResource>(cinfo_.container(),
-                                                   cinfo_.name())
-               .ok()) {
-        // Do nothing; the resource can have been deleted by session resets.
-      }
-    }
-  }
-
-  void Compute(OpKernelContext* ctx) override {
-    const Tensor* string_arg;
-    OP_REQUIRES_OK(ctx, ctx->input("string_arg", &string_arg));
-    std::vector<Tensor> func_args;
-    func_args.push_back(*string_arg);
-
-    const string& source_device = ctx->device()->name();
-
-    // Obtain and canonicalize target_device.
-    const Tensor* target_arg;
-    OP_REQUIRES_OK(ctx, ctx->input("target_device", &target_arg));
-    string target_device;
-    OP_REQUIRES_OK(ctx, DeviceNameUtils::CanonicalizeDeviceName(
-                            target_arg->scalar<string>()(), source_device,
-                            &target_device));
-
-    FunctionLibraryRuntime* lib = ctx->function_library();
-    OP_REQUIRES(ctx, lib != nullptr,
-                errors::Internal("No function library is provided."));
-
-    mutex_lock l(mu_);
-    if (!initialized_) {
-      OP_REQUIRES_OK(ctx, cinfo_.Init(ctx->resource_manager(), def()));
-      FunctionLibraryRuntime* clone_lib;
-      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr;
-      OP_REQUIRES_OK(ctx, lib->Clone(&flib_def_, &pflr, &clone_lib));
-      // Create the resource.
-      FunctionBufferingResource* buffer;
-      OP_REQUIRES_OK(
-          ctx,
-          ctx->resource_manager()->LookupOrCreate<FunctionBufferingResource>(
-              cinfo_.container(), cinfo_.name(), &buffer,
-              [clone_lib, &pflr, &source_device, &target_device, func_args,
-               this](FunctionBufferingResource** ptr) {
-                *ptr = new FunctionBufferingResource(
-                    clone_lib, std::move(pflr), func_, buffer_size_,
-                    source_device, target_device, func_args, output_types_);
-                return Status::OK();
-              }));
-      core::ScopedUnref s(buffer);
-      OP_REQUIRES_OK(ctx, buffer->Instantiate());
-      initialized_ = true;
-    }
-
-    OP_REQUIRES_OK(ctx, MakeResourceHandleToOutput(
-                            ctx, 0, cinfo_.container(), cinfo_.name(),
-                            MakeTypeIndex<FunctionBufferingResource>()));
-  }
-
- private:
-  mutex mu_;
-  ContainerInfo cinfo_ GUARDED_BY(mu_);
-  bool initialized_ GUARDED_BY(mu_) = false;
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  NameAttrList func_;
-  int64 buffer_size_;
-  string container_;
-  string name_;
-  DataTypeVector output_types_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResource")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("resource")
-                            .HostMemory("string_arg")
-                            .HostMemory("target_device"),
-                        FunctionBufferResourceHandleOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Prefetches and fills up a buffer by calling a function that provides the
-// elements to buffer.
-class FunctionBufferingResourceGetNextOp : public AsyncOpKernel {
- public:
-  explicit FunctionBufferingResourceGetNextOp(OpKernelConstruction* ctx)
-      : AsyncOpKernel(ctx) {}
-
-  ~FunctionBufferingResourceGetNextOp() override {}
-
-  void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override {
-    ResourceHandle handle;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, HandleFromInput(ctx, "function_buffer_resource", &handle), done);
-    FunctionBufferingResource* buffer = nullptr;
-    OP_REQUIRES_OK_ASYNC(
-        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer),
-        done);
-
-    if (buffer->Finished()) {
-      buffer->Unref();
-      ctx->SetStatus(errors::OutOfRange("end_of_sequence"));
-      done();
-      return;
-    }
-
-    FunctionBufferCallback callback =
-        [ctx, buffer, done](const BufferElement& buffer_element) {
-          Status s = buffer_element.status;
-          if (!s.ok()) {
-            ctx->SetStatus(s);
-            buffer->Unref();
-            done();
-            return;
-          }
-          for (size_t i = 0; i < buffer_element.value.size(); ++i) {
-            ctx->set_output(i, buffer_element.value[i]);
-          }
-          buffer->Unref();
-          done();
-        };
-    buffer->MaybeGet(std::move(callback));
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceGetNext")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceGetNextOp);
-#endif  // TENSORFLOW_USE_SYCL
-
-// Resets the FunctionBufferingResource, cancelling all pending requests and
-// clearing out the buffer.
-class FunctionBufferingResourceResetOp : public OpKernel {
- public:
-  explicit FunctionBufferingResourceResetOp(OpKernelConstruction* ctx)
-      : OpKernel(ctx) {}
-
-  ~FunctionBufferingResourceResetOp() override {}
-
-  void Compute(OpKernelContext* ctx) override {
-    ResourceHandle handle;
-    OP_REQUIRES_OK(ctx,
-                   HandleFromInput(ctx, "function_buffer_resource", &handle));
-    FunctionBufferingResource* buffer = nullptr;
-    OP_REQUIRES_OK(
-        ctx, LookupResource<FunctionBufferingResource>(ctx, handle, &buffer));
-    core::ScopedUnref s(buffer);
-
-    buffer->Reset();
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_CPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_GPU)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-#if TENSORFLOW_USE_SYCL
-REGISTER_KERNEL_BUILDER(Name("ExperimentalFunctionBufferingResourceReset")
-                            .Device(DEVICE_SYCL)
-                            .HostMemory("function_buffer_resource"),
-                        FunctionBufferingResourceResetOp);
-#endif  // TENSORFLOW_USE_SYCL
-
 class IteratorGetDeviceOp : public OpKernel {
  public:
   using OpKernel::OpKernel;
diff --git a/tensorflow/core/kernels/data/random_dataset_op.cc b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/random_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/random_dataset_op.cc
index 816405fea90ef5de5fb5da9db03818b8775b0d3e..6d85cd5c450640a0042add2ead26836433166ade 100644
--- a/tensorflow/core/kernels/data/random_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/random_dataset_op.cc
@@ -76,6 +76,8 @@ class RandomDatasetOp : public DatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override { return kInfiniteCardinality; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -152,7 +154,7 @@ class RandomDatasetOp : public DatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("RandomDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalRandomDataset").Device(DEVICE_CPU),
                         RandomDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/scan_dataset_op.cc b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
similarity index 85%
rename from tensorflow/core/kernels/data/scan_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
index d9182d15bed272b3e45b0ad309abeca55ba1a6a2..0d9a629a27f907fca2214a574db1ea0074a9ed2e 100644
--- a/tensorflow/core/kernels/data/scan_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/scan_dataset_op.cc
@@ -37,6 +37,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("Tstate", &state_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -53,7 +55,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     *output = new Dataset(ctx, input, func_, std::move(initial_state),
                           std::move(captured_func), state_types_, output_types_,
-                          output_shapes_);
+                          output_shapes_, preserve_cardinality_);
   }
 
  private:
@@ -64,7 +66,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& state_types,
             const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
+            const std::vector<PartialTensorShape>& output_shapes,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -72,7 +75,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
           captured_func_(std::move(captured_func)),
           state_types_(state_types),
           output_types_(output_types),
-          output_shapes_(output_shapes) {
+          output_shapes_(output_shapes),
+          preserve_cardinality_(preserve_cardinality) {
       input_->Ref();
     }
 
@@ -93,6 +97,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ScanDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -123,12 +129,15 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(state_types_, &state_types);
       AttrValue other_arguments_types_attr;
       b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
       TF_RETURN_IF_ERROR(
           b->AddDataset(this, {{0, input_node}},
                         {{1, initial_state_nodes}, {2, other_arguments}},
                         {{"f", f},
                          {"Tstate", state_types},
-                         {"Targuments", other_arguments_types_attr}},
+                         {"Targuments", other_arguments_types_attr},
+                         {"preserve_cardinality", preserve_cardinality_attr}},
                         output));
       return Status::OK();
     }
@@ -143,7 +152,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -168,8 +178,8 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
         state_and_output.reserve(dataset()->state_types_.size() +
                                  output_dtypes().size());
 
-        Status s = dataset()->captured_func_->Run(ctx, std::move(args),
-                                                  &state_and_output);
+        Status s = instantiated_captured_func_->Run(ctx, std::move(args),
+                                                    &state_and_output);
         if (s.ok()) {
           state_.clear();
           size_t i = 0;
@@ -202,10 +212,19 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
             out_tensors->push_back(std::move(state_and_output[i]));
           }
         } else if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         }
         return s;
       }
@@ -252,6 +271,7 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
       mutex mu_;
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::vector<Tensor> state_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
@@ -261,15 +281,18 @@ class ScanDatasetOp : public UnaryDatasetOpKernel {
     const DataTypeVector state_types_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
+    const bool preserve_cardinality_;
   };
 
   DataTypeVector state_types_;
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
+  bool preserve_cardinality_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("ScanDataset").Device(DEVICE_CPU), ScanDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalScanDataset").Device(DEVICE_CPU),
+                        ScanDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
similarity index 97%
rename from tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
index a21b3fc16b7a93978bd2e03081aec9e7aa5e5ba4..fe128005faca9bd986e7c85600f7f871ebb97a25 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/set_stats_aggregator_dataset_op.cc
@@ -129,6 +129,8 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
       return "SetStatsAggregatorDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -203,8 +205,9 @@ class SetStatsAggregatorDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SetStatsAggregatorDataset").Device(DEVICE_CPU),
-                        SetStatsAggregatorDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSetStatsAggregatorDataset").Device(DEVICE_CPU),
+    SetStatsAggregatorDatasetOp);
 }  // namespace
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
index c7bf89cbdeb4f81da4346ceef68b46598b032d0c..d2fb8ac4f33b1e844bb39cc70a47ccb15424ace7 100644
--- a/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sleep_dataset_op.cc
@@ -68,6 +68,8 @@ class SleepDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "SleepDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/slide_dataset_op.cc b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
similarity index 95%
rename from tensorflow/core/kernels/data/slide_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
index e67c5272b6fa3eee4bd852da45dd5081e7ce12e4..1ce4fbd3136d7fbd245fbb920ff658c4eae794c6 100644
--- a/tensorflow/core/kernels/data/slide_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/sliding_window_dataset_op.cc
@@ -29,9 +29,9 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class SlideDatasetOp : public UnaryDatasetOpKernel {
+class SlidingWindowDatasetOp : public UnaryDatasetOpKernel {
  public:
-  explicit SlideDatasetOp(OpKernelConstruction* ctx)
+  explicit SlidingWindowDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {}
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -99,10 +99,18 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
     }
 
     string DebugString() const override {
-      return strings::StrCat("SlideDatasetOp(", window_size_, ", ",
+      return strings::StrCat("SlidingWindowDatasetOp(", window_size_, ", ",
                              window_shift_, ", ", window_stride_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -295,8 +303,9 @@ class SlideDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("SlideDataset").Device(DEVICE_CPU),
-                        SlideDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalSlidingWindowDataset").Device(DEVICE_CPU),
+    SlidingWindowDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/sql/BUILD b/tensorflow/core/kernels/data/experimental/sql/BUILD
similarity index 100%
rename from tensorflow/core/kernels/data/sql/BUILD
rename to tensorflow/core/kernels/data/experimental/sql/BUILD
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.cc b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
similarity index 88%
rename from tensorflow/core/kernels/data/sql/driver_manager.cc
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
index 783d1e6cb28fdd3f2e42caecc300ba9bd8b22c04..58174f69a44a5e28dd2d4fd018ee45688d407054 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
 namespace tensorflow {
 namespace data {
diff --git a/tensorflow/core/kernels/data/sql/driver_manager.h b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
similarity index 81%
rename from tensorflow/core/kernels/data/sql/driver_manager.h
rename to tensorflow/core/kernels/data/experimental/sql/driver_manager.h
index c5428f396b03f03390f53b6a2e50fca3821dac0c..6afadf91a478e5da470897c3aa2977462337b5e5 100644
--- a/tensorflow/core/kernels/data/sql/driver_manager.h
+++ b/tensorflow/core/kernels/data/experimental/sql/driver_manager.h
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 
 namespace tensorflow {
 namespace data {
@@ -38,4 +38,4 @@ class DriverManager {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_DRIVER_MANAGER_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_DRIVER_MANAGER_H_
diff --git a/tensorflow/core/kernels/data/sql/query_connection.h b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
similarity index 92%
rename from tensorflow/core/kernels/data/sql/query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/query_connection.h
index 2fd229a9bfd4dd4f6e49eaa2452dbd9140050523..10c66436792a9794112a38a4a590e2e9fc3c05c5 100644
--- a/tensorflow/core/kernels/data/sql/query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/query_connection.h
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
 
 #include "tensorflow/core/framework/tensor.h"
 
@@ -67,4 +67,4 @@ class QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
similarity index 97%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
index 1d374898dc321b28b091b90c232afefb0457697b..cadceee8f516c08a45b63702aa321944e8f0a21e 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.cc
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#include "tensorflow/core/kernels/data/sql/sqlite_query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h"
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -106,7 +106,7 @@ void SqliteQueryConnection::FillTensorWithResultSetEntry(
       break;
     // Error preemptively thrown by SqlDatasetOp::MakeDataset in this case.
     default:
-      LOG(FATAL)
+      LOG(ERROR)
           << "Use of unsupported TensorFlow data type by 'SqlQueryConnection': "
           << DataTypeString(data_type) << ".";
   }
diff --git a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
similarity index 84%
rename from tensorflow/core/kernels/data/sql/sqlite_query_connection.h
rename to tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
index 175492c49dba512f602c7153f1ab66ba6427aa3d..61df29065e15281067ec0fbcb499d382b0ba73f8 100644
--- a/tensorflow/core/kernels/data/sql/sqlite_query_connection.h
+++ b/tensorflow/core/kernels/data/experimental/sql/sqlite_query_connection.h
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
-#ifndef TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
-#define TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#ifndef TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
+#define TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
 
 #include <memory>
 
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 #include "tensorflow/core/lib/db/sqlite.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -53,4 +53,4 @@ class SqliteQueryConnection : public QueryConnection {
 }  // namespace data
 }  // namespace tensorflow
 
-#endif  // TENSORFLOW_CORE_KERNELS_DATA_SQL_SQLITE_QUERY_CONNECTION_H_
+#endif  // TENSORFLOW_CORE_KERNELS_DATA_EXPERIMENTAL_SQL_SQLITE_QUERY_CONNECTION_H_
diff --git a/tensorflow/core/kernels/data/sql_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
similarity index 96%
rename from tensorflow/core/kernels/data/sql_dataset_ops.cc
rename to tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
index f01ecf84afab05fcbf87f70668489d2358e66817..c16d8ed02ccdfb01a41ff9206a003f4a8c04a667 100644
--- a/tensorflow/core/kernels/data/sql_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/sql_dataset_op.cc
@@ -17,8 +17,8 @@ limitations under the License.
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/kernels/data/sql/driver_manager.h"
-#include "tensorflow/core/kernels/data/sql/query_connection.h"
+#include "tensorflow/core/kernels/data/experimental/sql/driver_manager.h"
+#include "tensorflow/core/kernels/data/experimental/sql/query_connection.h"
 #include "tensorflow/core/lib/io/inputbuffer.h"
 #include "tensorflow/core/lib/io/record_reader.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
@@ -214,7 +214,8 @@ class SqlDatasetOp : public DatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("SqlDataset").Device(DEVICE_CPU), SqlDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalSqlDataset").Device(DEVICE_CPU),
+                        SqlDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_aggregator_ops.cc b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/data/stats_aggregator_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
index 2d5146761631f8ed28ebcafac9fd670da9e3b47d..894465e1814cf93b02ecbbb053494d4c032fe243 100644
--- a/tensorflow/core/kernels/data/stats_aggregator_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_aggregator_ops.cc
@@ -141,10 +141,12 @@ class StatsAggregatorSummaryOp : public OpKernel {
   }
 };
 
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorHandle").Device(DEVICE_CPU),
-                        StatsAggregatorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("StatsAggregatorSummary").Device(DEVICE_CPU),
-                        StatsAggregatorSummaryOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorHandle").Device(DEVICE_CPU),
+    StatsAggregatorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalStatsAggregatorSummary").Device(DEVICE_CPU),
+    StatsAggregatorSummaryOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/stats_dataset_ops.cc b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
similarity index 95%
rename from tensorflow/core/kernels/data/stats_dataset_ops.cc
rename to tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
index da0039773cee1bcdf313c03b0b01198b03c71cc5..1961f25df846e8773bf6b0266d089c9d3bac355b 100644
--- a/tensorflow/core/kernels/data/stats_dataset_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/stats_dataset_ops.cc
@@ -78,6 +78,8 @@ class LatencyStatsDatasetOp : public UnaryDatasetOpKernel {
       return "LatencyStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -186,6 +188,8 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
       return "BytesProducedStatsDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -255,10 +259,12 @@ class BytesProducedStatsDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("LatencyStatsDataset").Device(DEVICE_CPU),
-                        LatencyStatsDatasetOp);
-REGISTER_KERNEL_BUILDER(Name("BytesProducedStatsDataset").Device(DEVICE_CPU),
-                        BytesProducedStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalLatencyStatsDataset").Device(DEVICE_CPU),
+    LatencyStatsDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalBytesProducedStatsDataset").Device(DEVICE_CPU),
+    BytesProducedStatsDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
index ab21dfc6bc5ddcc758585950b361d5dd5f762222..8ae45ed5c9d9fe199ef392a1430f359172ec5c73 100644
--- a/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/threadpool_dataset_op.cc
@@ -13,10 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <memory>
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/lib/core/threadpool.h"
+#include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/work_sharder.h"
 
 namespace tensorflow {
@@ -167,6 +169,8 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
       return "ThreadPoolDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -187,20 +191,137 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
           : DatasetIterator<Dataset>(params) {}
 
       Status Initialize(IteratorContext* ctx) override {
-        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+        return dataset()->input_->MakeIterator(
+            IteratorContext(CreateParams(ctx)), prefix(), &input_impl_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
                              std::vector<Tensor>* out_tensors,
                              bool* end_of_sequence) override {
+        return input_impl_->GetNext(IteratorContext(CreateParams(ctx)),
+                                    out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      IteratorContext::Params CreateParams(IteratorContext* ctx) {
         ThreadPoolResource* pool = dataset()->threadpool_;
         IteratorContext::Params params(ctx);
         params.runner = [pool](std::function<void()> c) {
           pool->Schedule(std::move(c));
         };
         params.runner_threadpool_size = pool->NumThreads();
-        IteratorContext iter_ctx(params);
-        return input_impl_->GetNext(&iter_ctx, out_tensors, end_of_sequence);
+        return params;
+      }
+
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const Tensor resource_handle_;
+    ThreadPoolResource* const threadpool_;
+  };
+};
+
+class MaxIntraOpParallelismDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit MaxIntraOpParallelismDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 max_intra_op_parallelism;
+    OP_REQUIRES_OK(ctx,
+                   ParseScalarArgument<int64>(ctx, "max_intra_op_parallelism",
+                                              &max_intra_op_parallelism));
+    OP_REQUIRES(
+        ctx, max_intra_op_parallelism >= 0,
+        errors::InvalidArgument("`max_intra_op_parallelism` must be >= 0"));
+    *output = new Dataset(ctx, input, max_intra_op_parallelism);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input,
+            int64 max_intra_op_parallelism)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          max_intra_op_parallelism_(max_intra_op_parallelism) {
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::MaxIntraOpParallelism")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "MaxIntraOpParallelismDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* max_intra_op_parallelism_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(max_intra_op_parallelism_,
+                                      &max_intra_op_parallelism_node));
+      TF_RETURN_IF_ERROR(b->AddDataset(
+          this, {input_graph_node, max_intra_op_parallelism_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        IteratorContext::Params params(ctx);
+        auto max_parallelism = dataset()->max_intra_op_parallelism_;
+        params.runner = std::bind(
+            [max_parallelism](
+                const std::function<void(std::function<void()>)>& runner,
+                std::function<void()> fn) {
+              std::function<void()> scoped_fn = std::bind(
+                  [max_parallelism](const std::function<void()>& fn) {
+                    ScopedPerThreadMaxParallelism scope(max_parallelism);
+                    fn();
+                  },
+                  std::move(fn));
+              (runner)(std::move(scoped_fn));
+            },
+            std::move(*ctx->runner()), std::placeholders::_1);
+        return input_impl_->GetNext(IteratorContext{std::move(params)},
+                                    out_tensors, end_of_sequence);
       }
 
      protected:
@@ -215,11 +336,118 @@ class ThreadPoolDatasetOp : public UnaryDatasetOpKernel {
     };
 
     const DatasetBase* const input_;
-    const Tensor resource_handle_;
-    ThreadPoolResource* const threadpool_;
+    const int64 max_intra_op_parallelism_;
   };
 };
 
+class PrivateThreadPoolDatasetOp : public UnaryDatasetOpKernel {
+ public:
+  explicit PrivateThreadPoolDatasetOp(OpKernelConstruction* ctx)
+      : UnaryDatasetOpKernel(ctx) {}
+
+  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
+                   DatasetBase** output) override {
+    int64 num_threads;
+    OP_REQUIRES_OK(
+        ctx, ParseScalarArgument<int64>(ctx, "num_threads", &num_threads));
+    OP_REQUIRES(ctx, num_threads >= 1,
+                errors::InvalidArgument("`num_threads` must be >= 1"));
+    *output = new Dataset(ctx, input, num_threads);
+  }
+
+ private:
+  class Dataset : public DatasetBase {
+   public:
+    Dataset(OpKernelContext* ctx, const DatasetBase* input, int num_threads)
+        : DatasetBase(DatasetContext(ctx)),
+          input_(input),
+          num_threads_(num_threads) {
+      thread_pool_ = MakeUnique<thread::ThreadPool>(
+          ctx->env(), ThreadOptions{}, "data_private_threadpool", num_threads,
+          /*low_latency_hint=*/false);
+      input_->Ref();
+    }
+
+    ~Dataset() override { input_->Unref(); }
+
+    std::unique_ptr<IteratorBase> MakeIteratorInternal(
+        const string& prefix) const override {
+      return std::unique_ptr<IteratorBase>(
+          new Iterator({this, strings::StrCat(prefix, "::PrivateThreadPool")}));
+    }
+
+    const DataTypeVector& output_dtypes() const override {
+      return input_->output_dtypes();
+    }
+    const std::vector<PartialTensorShape>& output_shapes() const override {
+      return input_->output_shapes();
+    }
+
+    string DebugString() const override {
+      return "PrivateThreadPoolDatasetOp::Dataset";
+    }
+
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
+   protected:
+    Status AsGraphDefInternal(SerializationContext* ctx,
+                              DatasetGraphDefBuilder* b,
+                              Node** output) const override {
+      Node* input_graph_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
+      Node* num_threads_node = nullptr;
+      TF_RETURN_IF_ERROR(b->AddScalar(num_threads_, &num_threads_node));
+      TF_RETURN_IF_ERROR(
+          b->AddDataset(this, {input_graph_node, num_threads_node}, output));
+      return Status::OK();
+    }
+
+   private:
+    class Iterator : public DatasetIterator<Dataset> {
+     public:
+      explicit Iterator(const Params& params)
+          : DatasetIterator<Dataset>(params) {}
+
+      Status Initialize(IteratorContext* ctx) override {
+        return dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_);
+      }
+
+      Status GetNextInternal(IteratorContext* ctx,
+                             std::vector<Tensor>* out_tensors,
+                             bool* end_of_sequence) override {
+        thread::ThreadPool* pool = dataset()->thread_pool_.get();
+        IteratorContext::Params params(ctx);
+        params.runner = [pool](std::function<void()> c) {
+          pool->Schedule(std::move(c));
+        };
+        params.runner_threadpool_size = dataset()->num_threads_;
+        return input_impl_->GetNext(IteratorContext{std::move(params)},
+                                    out_tensors, end_of_sequence);
+      }
+
+     protected:
+      std::shared_ptr<model::Node> CreateNode(
+          IteratorContext* ctx, model::Node::Args args) const override {
+        return model::MakeKnownRatioNode(std::move(args),
+                                         /*ratio=*/1);
+      }
+
+     private:
+      std::unique_ptr<IteratorBase> input_impl_;
+    };
+
+    const DatasetBase* const input_;
+    const int64 num_threads_;
+    std::unique_ptr<thread::ThreadPool> thread_pool_;
+  };
+};
+
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalMaxIntraOpParallelismDataset").Device(DEVICE_CPU),
+    MaxIntraOpParallelismDatasetOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalPrivateThreadPoolDataset").Device(DEVICE_CPU),
+    PrivateThreadPoolDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("ExperimentalThreadPoolHandle").Device(DEVICE_CPU),
                         ThreadPoolHandleOp);
 REGISTER_KERNEL_BUILDER(
diff --git a/tensorflow/core/kernels/data/writer_ops.cc b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
similarity index 84%
rename from tensorflow/core/kernels/data/writer_ops.cc
rename to tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
index 66e759a135591c2ac355c0e79af72df834a10933..7728baf1507c6cec2b44f41561f2ab3d04a80cc8 100644
--- a/tensorflow/core/kernels/data/writer_ops.cc
+++ b/tensorflow/core/kernels/data/experimental/to_tf_record_op.cc
@@ -14,6 +14,7 @@ limitations under the License.
 ==============================================================================*/
 
 #include "tensorflow/core/framework/dataset.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
 #include "tensorflow/core/kernels/ops_util.h"
@@ -67,20 +68,24 @@ class ToTFRecordOp : public AsyncOpKernel {
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(IteratorContext(ctx), "ToTFRecordOpIterator",
-                                &iterator),
+          dataset->MakeIterator(&iter_ctx, "ToTFRecordOpIterator", &iterator),
           done);
 
       std::vector<Tensor> components;
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence;
       do {
-        OP_REQUIRES_OK_ASYNC(ctx,
-                             iterator->GetNext(IteratorContext(ctx),
-                                               &components, &end_of_sequence),
-                             done);
+        OP_REQUIRES_OK_ASYNC(
+            ctx, iterator->GetNext(&iter_ctx, &components, &end_of_sequence),
+            done);
 
         if (!end_of_sequence) {
           OP_REQUIRES_OK_ASYNC(
@@ -96,8 +101,8 @@ class ToTFRecordOp : public AsyncOpKernel {
   BackgroundWorker background_worker_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("DatasetToTFRecord").Device(DEVICE_CPU),
-                        ToTFRecordOp);
+REGISTER_KERNEL_BUILDER(
+    Name("ExperimentalDatasetToTFRecord").Device(DEVICE_CPU), ToTFRecordOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/unbatch_dataset_op.cc b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
similarity index 98%
rename from tensorflow/core/kernels/data/unbatch_dataset_op.cc
rename to tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
index b32ab8ba4faa7b762c950f7fa444456ecd0c76d1..2626ec3ed7250b725650a76b8674e0a76ebc638f 100644
--- a/tensorflow/core/kernels/data/unbatch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/experimental/unbatch_dataset_op.cc
@@ -54,6 +54,8 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
       }
     }
 
+    ~Dataset() override { input_->Unref(); }
+
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
       return std::unique_ptr<IteratorBase>(
@@ -219,7 +221,7 @@ class UnbatchDatasetOp : public UnaryDatasetOpKernel {
   };
 };
 
-REGISTER_KERNEL_BUILDER(Name("UnbatchDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("ExperimentalUnbatchDataset").Device(DEVICE_CPU),
                         UnbatchDatasetOp);
 
 }  // namespace
diff --git a/tensorflow/core/kernels/data/filter_dataset_op.cc b/tensorflow/core/kernels/data/filter_dataset_op.cc
index 40cbb124252aa4bd345a5a815609bb748d325f86..b8b657d3433422731d10a00ae6498c2f802669dd 100644
--- a/tensorflow/core/kernels/data/filter_dataset_op.cc
+++ b/tensorflow/core/kernels/data/filter_dataset_op.cc
@@ -34,7 +34,8 @@ namespace {
 class FilterDatasetOp : public UnaryDatasetOpKernel {
  public:
   using FilterIteratorPredicate =
-      std::function<Status(IteratorContext*, std::vector<Tensor>, bool*)>;
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>, bool*)>;
 
   explicit FilterDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
@@ -55,13 +56,12 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
 
     FilterIteratorPredicate filter_pred;
     if (indices.empty()) {
-      CapturedFunction* raw_captured_func = captured_func.get();
-      filter_pred = [raw_captured_func](IteratorContext* ctx,
-                                        const std::vector<Tensor>& args,
-                                        bool* out_matched) {
+      filter_pred = [](IteratorContext* ctx,
+                       InstantiatedCapturedFunction* inst_captured_func,
+                       const std::vector<Tensor>& args, bool* out_matched) {
         std::vector<Tensor> result;
         TF_RETURN_IF_ERROR(
-            raw_captured_func->RunWithBorrowedArgs(ctx, args, &result));
+            inst_captured_func->RunWithBorrowedArgs(ctx, args, &result));
 
         if (result.size() != 1 || result[0].dtype() != DT_BOOL ||
             result[0].NumElements() != 1) {
@@ -73,6 +73,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       };
     } else {
       filter_pred = [indices](IteratorContext* ctx,
+                              InstantiatedCapturedFunction* inst_captured_func,
                               const std::vector<Tensor>& args,
                               bool* out_matched) {
         const Tensor& predicate = args[indices[0]];
@@ -169,7 +170,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -197,7 +199,8 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
             return Status::OK();
           }
 
-          TF_RETURN_IF_ERROR(filter_pred_(ctx, *out_tensors, &matched));
+          TF_RETURN_IF_ERROR(filter_pred_(
+              ctx, instantiated_captured_func_.get(), *out_tensors, &matched));
           if (!matched) {
             // Clear the output tensor list since it didn't match.
             out_tensors->clear();
@@ -274,6 +277,7 @@ class FilterDatasetOp : public UnaryDatasetOpKernel {
       int64 dropped_elements_ GUARDED_BY(mu_);
       const FilterIteratorPredicate filter_pred_;
       string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/flat_map_dataset_op.cc b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
index 9b42981ed75aff0ac49f813343a23e6f22c101bd..3846334622bf48ecb5e62464f22c2fa3e7c4adc4 100644
--- a/tensorflow/core/kernels/data/flat_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/flat_map_dataset_op.cc
@@ -122,7 +122,8 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -243,8 +244,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
           EXCLUSIVE_LOCKS_REQUIRED(mu_) {
         return MakeIteratorFromInputElement(
             ctx, captured_func_inputs_, element_index_++,
-            dataset()->captured_func_.get(), prefix(),
-            &current_element_iterator_);
+            *instantiated_captured_func_, prefix(), &current_element_iterator_);
       }
 
       mutex mu_;
@@ -252,6 +252,7 @@ class FlatMapDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> current_element_iterator_ GUARDED_BY(mu_);
       std::vector<Tensor> captured_func_inputs_ GUARDED_BY(mu_);
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/generator_dataset_op.cc b/tensorflow/core/kernels/data/generator_dataset_op.cc
index ed18d6ed9d8d8f3d0b6c2d1ab4bc58e7964edf9e..48697ec6c8f05c438badedbc3234dbb1110c7088 100644
--- a/tensorflow/core/kernels/data/generator_dataset_op.cc
+++ b/tensorflow/core/kernels/data/generator_dataset_op.cc
@@ -73,7 +73,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
     ~Iterator() override {
       if (!finalized_) {
         std::vector<Tensor> ignored;
-        Status s = dataset()->finalize_func_->RunInstantiated(state_, &ignored);
+        Status s =
+            instantiated_finalize_func_->RunInstantiated(state_, &ignored);
         if (!s.ok()) {
           LOG(WARNING)
               << "Error occurred when finalizing GeneratorDataset iterator: "
@@ -83,9 +84,12 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
     }
 
     Status Initialize(IteratorContext* ctx) override {
-      TF_RETURN_IF_ERROR(dataset()->init_func_->Instantiate(ctx));
-      TF_RETURN_IF_ERROR(dataset()->next_func_->Instantiate(ctx));
-      TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(ctx));
+      TF_RETURN_IF_ERROR(
+          dataset()->init_func_->Instantiate(ctx, &instantiated_init_func_));
+      TF_RETURN_IF_ERROR(
+          dataset()->next_func_->Instantiate(ctx, &instantiated_next_func_));
+      TF_RETURN_IF_ERROR(dataset()->finalize_func_->Instantiate(
+          ctx, &instantiated_finalize_func_));
       return Status::OK();
     }
 
@@ -96,7 +100,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
 
       if (!initialized_) {
         TF_RETURN_IF_ERROR(
-            dataset()->init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
+            instantiated_init_func_->RunWithBorrowedArgs(ctx, {}, &state_));
         initialized_ = true;
       }
 
@@ -105,8 +109,8 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         return Status::OK();
       }
 
-      Status s =
-          dataset()->next_func_->RunWithBorrowedArgs(ctx, state_, out_tensors);
+      Status s = instantiated_next_func_->RunWithBorrowedArgs(ctx, state_,
+                                                              out_tensors);
       if (s.ok()) {
         *end_of_sequence = false;
       } else if (errors::IsOutOfRange(s)) {
@@ -119,7 +123,7 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
         // finalize function.
         std::vector<Tensor> ignored;
         TF_RETURN_IF_ERROR(
-            dataset()->finalize_func_->RunInstantiated(state_, &ignored));
+            instantiated_finalize_func_->RunInstantiated(state_, &ignored));
         finalized_ = true;
       }
       return s;
@@ -136,6 +140,9 @@ class GeneratorDatasetOp::Dataset : public DatasetBase {
     bool initialized_ GUARDED_BY(mu_) = false;
     bool finalized_ GUARDED_BY(mu_) = false;
     std::vector<Tensor> state_ GUARDED_BY(mu_);
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_init_func_;
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_next_func_;
+    std::unique_ptr<InstantiatedCapturedFunction> instantiated_finalize_func_;
   };
 
   const std::unique_ptr<CapturedFunction> init_func_;
@@ -175,11 +182,13 @@ void GeneratorDatasetOp::MakeDataset(OpKernelContext* ctx,
 }
 
 namespace {
-REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("GeneratorDataset").Device(DEVICE_CPU).Priority(2),
+                        GeneratorDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("GeneratorDataset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("handle")
+                            .Priority(1),
                         GeneratorDatasetOp);
-REGISTER_KERNEL_BUILDER(
-    Name("GeneratorDataset").Device(DEVICE_GPU).HostMemory("handle"),
-    GeneratorDatasetOp);
 }  // namespace
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/interleave_dataset_op.cc b/tensorflow/core/kernels/data/interleave_dataset_op.cc
index 9574e400a2db6c8c87305cd361135e3e240a44b5..54e3645612cd3905f1338fe59ab8caf0ca8941eb 100644
--- a/tensorflow/core/kernels/data/interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/interleave_dataset_op.cc
@@ -149,7 +149,8 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       void AdvanceToNextInCycle() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
@@ -195,7 +196,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
             if (!end_of_input_) {
               TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
                   ctx, args_list_[cycle_index_], cycle_index_,
-                  dataset()->captured_func_.get(), prefix(),
+                  *instantiated_captured_func_, prefix(),
                   &current_elements_[cycle_index_]));
               ++num_open_;
             }
@@ -286,7 +287,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
                   &args_list_[idx][i]));
             }
             TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
+                ctx, args_list_[idx], idx, *instantiated_captured_func_,
                 prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
                 RestoreInput(ctx, reader, current_elements_[idx]));
@@ -306,6 +307,7 @@ class InterleaveDatasetOp : public UnaryDatasetOpKernel {
       int64 block_index_ GUARDED_BY(mu_) = 0;
       bool end_of_input_ GUARDED_BY(mu_) = false;
       size_t num_open_ GUARDED_BY(mu_) = 0;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
diff --git a/tensorflow/core/kernels/data/iterator_ops.cc b/tensorflow/core/kernels/data/iterator_ops.cc
index 445718ba1e532a384dcce6cd9edd1272532e239e..d5b4bfa5c5e23cc6948f680ba7f49c23447464a5 100644
--- a/tensorflow/core/kernels/data/iterator_ops.cc
+++ b/tensorflow/core/kernels/data/iterator_ops.cc
@@ -14,9 +14,12 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/core/kernels/data/iterator_ops.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/graph_runner.h"
 #include "tensorflow/core/common_runtime/renamed_device.h"
 #include "tensorflow/core/common_runtime/threadpool_device.h"
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/iterator.pb.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
@@ -33,6 +36,7 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/strcat.h"
 #include "tensorflow/core/lib/strings/stringprintf.h"
 #include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/public/session_options.h"
 
 namespace tensorflow {
@@ -56,22 +60,26 @@ class IteratorResource : public ResourceBase {
                    std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
                    FunctionLibraryRuntime* lib)
       : device_mgr_(std::move(device_mgr)),
-        flib_def_(std::move(flib_def)),
-        pflr_(std::move(pflr)),
-        lib_(lib),
-        iterator_(nullptr),
+        iterator_state_(
+            new State(std::move(flib_def), std::move(pflr), lib, nullptr)),
         output_dtypes_(output_dtypes),
         output_shapes_(output_shapes) {}
 
   Status GetNext(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
                  bool* end_of_sequence) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
-    if (captured_iterator) {
-      CHECK_NOTNULL(lib_);
+    std::shared_ptr<State> captured_state;
+    {
+      tf_shared_lock l(mu_);
+      captured_state = iterator_state_;
+    }
+    if (captured_state->iterator) {
       IteratorContext::Params params(ctx);
-      params.lib = lib_;
-      return captured_iterator->GetNext(IteratorContext(std::move(params)),
-                                        out_tensors, end_of_sequence);
+      params.lib = captured_state->lib;
+      params.function_handle_cache =
+          captured_state->function_handle_cache.get();
+      params.resource_mgr = &captured_state->resource_mgr;
+      return captured_state->iterator->GetNext(
+          IteratorContext(std::move(params)), out_tensors, end_of_sequence);
     } else {
       return errors::FailedPrecondition(
           "GetNext() failed because the iterator has not been initialized. "
@@ -86,9 +94,13 @@ class IteratorResource : public ResourceBase {
   }
 
   Status Save(SerializationContext* ctx, IteratorStateWriter* writer) {
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
-    if (captured_iterator) {
-      return captured_iterator->Save(ctx, writer);
+    std::shared_ptr<State> captured_state;
+    {
+      tf_shared_lock l(mu_);
+      captured_state = iterator_state_;
+    }
+    if (captured_state) {
+      return captured_state->iterator->Save(ctx, writer);
     } else {
       return errors::FailedPrecondition(
           "Save() failed because the iterator has not been initialized. "
@@ -120,63 +132,101 @@ class IteratorResource : public ResourceBase {
     // because some of the OpKernels in the graph might call functions that are
     // only defined in the loaded GraphDef.
     FunctionLibraryRuntime* lib;
-    std::unique_ptr<DeviceMgr> device_mgr(nullptr);
     std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
     TF_RETURN_IF_ERROR(ctx->function_library()->Clone(&flib_def, &pflr, &lib));
     TF_RETURN_IF_ERROR(flib_def->AddLibrary(graph_def.library()));
+    std::unique_ptr<State> new_state(new State(
+        std::move(flib_def), std::move(pflr), lib, nullptr /* iterator */));
 
     TF_RETURN_IF_ERROR(
-        graph_runner.Run(&graph, lib, {}, {output_node}, &outputs));
+        graph_runner.Run(&graph, new_state->lib, {}, {output_node}, &outputs));
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(outputs[0], &dataset));
 
-    std::unique_ptr<IteratorBase> iterator;
     IteratorContext::Params params(ctx);
-    params.lib = lib;
+    params.lib = new_state->lib;
+    params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
     TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
-                                             "Iterator", &iterator));
-    TF_RETURN_IF_ERROR(set_iterator(std::move(iterator)));
-    std::shared_ptr<IteratorBase> captured_iterator(iterator_);
+                                             "Iterator", &new_state->iterator));
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, new_state->iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(VerifyShapesCompatible(
+        output_shapes_, new_state->iterator->output_shapes()));
 
-    if (captured_iterator) {
+    {
       IteratorContext::Params params(ctx);
-      params.lib = lib;
-      DeviceBase* device = lib->device();
+      params.lib = new_state->lib;
+      params.function_handle_cache = new_state->function_handle_cache.get();
+      params.resource_mgr = &new_state->resource_mgr;
+      DeviceBase* device = new_state->lib->device();
       params.allocator_getter = [device](AllocatorAttributes attrs) {
         return device->GetAllocator(attrs);
       };
       IteratorContext iter_ctx(std::move(params));
-      TF_RETURN_IF_ERROR(captured_iterator->Restore(&iter_ctx, reader));
-      mutex_lock l(mu_);
-      device_mgr_ = std::move(device_mgr);
-      lib_def_ = std::move(flib_def);
-      pflr_ = std::move(pflr);
-      lib_ = lib;
-      return Status::OK();
-    } else {
-      return errors::FailedPrecondition(
-          "Failed to restore iterator. Make sure the checkpoint ",
-          "is not corrupt. If the checkpoint does not contain the GraphDef, ",
-          "you will need to initialize your iterator before restoring.");
+      TF_RETURN_IF_ERROR(new_state->iterator->Restore(&iter_ctx, reader));
     }
+
+    mutex_lock l(mu_);
+    iterator_state_ = std::move(new_state);
+    return Status::OK();
   }
 
-  std::shared_ptr<const FunctionLibraryDefinition> function_library() {
-    tf_shared_lock l(mu_);
-    return lib_def_;
+  Status AddLibrary(const FunctionLibraryDefinition& flib_def) {
+    mutex_lock l(mu_);
+    return iterator_state_->flib_def->AddLibrary(flib_def);
   }
 
-  FunctionLibraryRuntime* function_library_runtime() { return lib_; }
+  Status SetIteratorFromDataset(OpKernelContext* ctx, DatasetBase* dataset) {
+    std::shared_ptr<State> new_state;
+    {
+      tf_shared_lock l(mu_);
+      new_state.reset(new State(iterator_state_->flib_def,
+                                iterator_state_->pflr, iterator_state_->lib,
+                                nullptr /* function_handle_cache */,
+                                nullptr /* iterator */));
+    }
 
-  // Transfers ownership of iterator to this. This method is thread-safe.
-  Status set_iterator(std::unique_ptr<IteratorBase> iterator) {
-    if (iterator) {
-      TF_RETURN_IF_ERROR(
-          VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    // Ensure that the iterator has access to all functions in the current
+    // subgraph, because some functions may have been defined after the resource
+    // was initially created.
+    Status s = new_state->flib_def->AddLibrary(
+        *ctx->function_library()->GetFunctionLibraryDefinition());
+
+    if (!s.ok()) {
+      // Adding functions to `flib_def_` may fail, if there are clashes between
+      // the function names in (e.g.) a restored graph and the currently
+      // executing graph. In that case, we create a new function runtime for
+      // this iterator, based on the current `OpKernelContext`, which will have
+      // the functions we need.
+      FunctionLibraryRuntime* lib;
+      std::unique_ptr<FunctionLibraryDefinition> flib_def(nullptr);
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
       TF_RETURN_IF_ERROR(
-          VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+          ctx->function_library()->Clone(&flib_def, &pflr, &lib));
+      new_state->flib_def = std::move(flib_def);
+      new_state->pflr = std::move(pflr);
+      new_state->lib = lib;
     }
-    iterator_.reset(iterator.release());
+
+    new_state->function_handle_cache.reset(
+        new FunctionHandleCache(new_state->lib));
+    // Create new iterator.
+    std::unique_ptr<IteratorBase> iterator;
+    IteratorContext::Params params(ctx);
+    params.lib = new_state->lib;
+    params.function_handle_cache = new_state->function_handle_cache.get();
+    params.resource_mgr = &new_state->resource_mgr;
+    TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
+                                             "Iterator", &iterator));
+    TF_RETURN_IF_ERROR(
+        VerifyTypesMatch(output_dtypes_, iterator->output_dtypes()));
+    TF_RETURN_IF_ERROR(
+        VerifyShapesCompatible(output_shapes_, iterator->output_shapes()));
+    std::swap(new_state->iterator, iterator);
+
+    mutex_lock l(mu_);
+    std::swap(iterator_state_, new_state);
     return Status::OK();
   }
 
@@ -189,16 +239,38 @@ class IteratorResource : public ResourceBase {
   }
 
  private:
-  // The following (device_mgr_, flib_def_, pflr_) are only used when the
-  // IteratorResource is shared between sessions and in that case we create
-  // a new FLR. Otherwise these are set to null.
-  std::unique_ptr<DeviceMgr> device_mgr_;
-  std::unique_ptr<FunctionLibraryDefinition> flib_def_;
-  std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
-  FunctionLibraryRuntime* lib_ = nullptr;  // not owned.
-  std::shared_ptr<IteratorBase> iterator_;
+  struct State {
+    State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
+          std::shared_ptr<ProcessFunctionLibraryRuntime> pflr,
+          FunctionLibraryRuntime* lib, std::unique_ptr<IteratorBase> iterator)
+        : flib_def(flib_def),
+          pflr(pflr),
+          lib(lib),
+          function_handle_cache(absl::make_unique<FunctionHandleCache>(lib)),
+          iterator(std::move(iterator)) {}
+
+    State(std::shared_ptr<FunctionLibraryDefinition> flib_def,
+          std::shared_ptr<ProcessFunctionLibraryRuntime> pflr,
+          FunctionLibraryRuntime* lib,
+          std::unique_ptr<FunctionHandleCache> function_handle_cache,
+          std::unique_ptr<IteratorBase> iterator)
+        : flib_def(flib_def),
+          pflr(pflr),
+          lib(lib),
+          function_handle_cache(std::move(function_handle_cache)),
+          iterator(std::move(iterator)) {}
+
+    std::shared_ptr<FunctionLibraryDefinition> flib_def;
+    std::shared_ptr<ProcessFunctionLibraryRuntime> pflr;
+    FunctionLibraryRuntime* lib = nullptr;  // not owned.
+    std::unique_ptr<FunctionHandleCache> function_handle_cache;
+    ResourceMgr resource_mgr;
+    std::unique_ptr<IteratorBase> iterator;
+  };
+
   mutex mu_;
-  std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
+  const std::unique_ptr<DeviceMgr> device_mgr_ GUARDED_BY(mu_);
+  std::shared_ptr<State> iterator_state_ GUARDED_BY(mu_);
   const DataTypeVector output_dtypes_;
   const std::vector<PartialTensorShape> output_shapes_;
 };
@@ -508,10 +580,9 @@ FunctionLibraryRuntime* IteratorHandleOp::CreatePrivateFLR(
   // in its resource manager. The existing device will outlive the
   // IteratorResource, because we are storing the IteratorResource
   // in that device's resource manager.
-  Device* wrapped_device = RenamedDevice::NewRenamedDevice(
+  *device_mgr = absl::make_unique<DeviceMgr>(RenamedDevice::NewRenamedDevice(
       ctx->device()->name(), down_cast<Device*>(ctx->device()),
-      false /* owns_underlying */, false /* isolate_session_state */);
-  device_mgr->reset(new DeviceMgr({wrapped_device}));
+      false /* owns_underlying */, false /* isolate_session_state */));
   flib_def->reset(new FunctionLibraryDefinition(
       *ctx->function_library()->GetFunctionLibraryDefinition()));
   pflr->reset(new ProcessFunctionLibraryRuntime(
@@ -584,13 +655,7 @@ void MakeIteratorOp::Compute(OpKernelContext* ctx) {
   OP_REQUIRES_OK(
       ctx, LookupResource(ctx, HandleFromInput(ctx, 1), &iterator_resource));
   core::ScopedUnref unref(iterator_resource);
-
-  std::unique_ptr<IteratorBase> iterator;
-  IteratorContext::Params params(ctx);
-  params.lib = iterator_resource->function_library_runtime();
-  OP_REQUIRES_OK(ctx, dataset->MakeIterator(IteratorContext(std::move(params)),
-                                            "Iterator", &iterator));
-  OP_REQUIRES_OK(ctx, iterator_resource->set_iterator(std::move(iterator)));
+  OP_REQUIRES_OK(ctx, iterator_resource->SetIteratorFromDataset(ctx, dataset));
 }
 
 namespace {
@@ -610,10 +675,15 @@ class ToSingleElementOp : public AsyncOpKernel {
       OP_REQUIRES_OK_ASYNC(
           ctx, GetDatasetFromVariantTensor(ctx->input(0), &dataset), done);
       std::unique_ptr<IteratorBase> iterator;
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+
       OP_REQUIRES_OK_ASYNC(
           ctx,
-          dataset->MakeIterator(IteratorContext(ctx), "SingleElementIterator",
-                                &iterator),
+          dataset->MakeIterator(&iter_ctx, "SingleElementIterator", &iterator),
           done);
 
       // NOTE(jsimsa): We must destroy the iterator before calling `done()`, to
@@ -627,8 +697,8 @@ class ToSingleElementOp : public AsyncOpKernel {
       components.reserve(dataset->output_dtypes().size());
       bool end_of_sequence = false;
 
-      Status s = raw_iterator->GetNext(IteratorContext(ctx), &components,
-                                       &end_of_sequence);
+      Status s =
+          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
       if (!s.ok()) {
         ctx->SetStatus(s);
         return;
@@ -643,8 +713,8 @@ class ToSingleElementOp : public AsyncOpKernel {
       }
 
       components.clear();
-      Status s2 = raw_iterator->GetNext(IteratorContext(ctx), &components,
-                                        &end_of_sequence);
+      Status s2 =
+          raw_iterator->GetNext(&iter_ctx, &components, &end_of_sequence);
       if (!s2.ok()) {
         ctx->SetStatus(s2);
         return;
@@ -693,8 +763,16 @@ class ReduceDatasetOp : public AsyncOpKernel {
                                    use_inter_op_parallelism_, &captured_func),
           done);
 
-      IteratorContext iter_ctx(ctx);
-      OP_REQUIRES_OK_ASYNC(ctx, captured_func->Instantiate(&iter_ctx), done);
+      IteratorContext::Params params(ctx);
+      std::unique_ptr<FunctionHandleCache> function_handle_cache(
+          new FunctionHandleCache(params.lib));
+      params.function_handle_cache = function_handle_cache.get();
+      IteratorContext iter_ctx(std::move(params));
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func;
+      OP_REQUIRES_OK_ASYNC(
+          ctx,
+          captured_func->Instantiate(&iter_ctx, &instantiated_captured_func),
+          done);
 
       std::unique_ptr<IteratorBase> iterator;
       OP_REQUIRES_OK_ASYNC(
@@ -728,8 +806,8 @@ class ReduceDatasetOp : public AsyncOpKernel {
                   std::back_inserter(args));
 
         std::vector<Tensor> reduce_func_output;
-        status =
-            captured_func->Run(&iter_ctx, std::move(args), &reduce_func_output);
+        status = instantiated_captured_func->Run(&iter_ctx, std::move(args),
+                                                 &reduce_func_output);
         if (!status.ok()) {
           break;
         }
@@ -916,13 +994,7 @@ class OneShotIteratorOp : public AsyncOpKernel {
     // factory function.
     DatasetBase* dataset;
     TF_RETURN_IF_ERROR(GetDatasetFromVariantTensor(return_values[0], &dataset));
-    std::unique_ptr<IteratorBase> iter;
-    IteratorContext::Params params(ctx);
-    params.lib = lib;
-    TF_RETURN_IF_ERROR(dataset->MakeIterator(IteratorContext(std::move(params)),
-                                             "Iterator", &iter));
-    TF_RETURN_IF_ERROR((*iterator)->set_iterator(std::move(iter)));
-
+    TF_RETURN_IF_ERROR((*iterator)->SetIteratorFromDataset(ctx, dataset));
     (*iterator)->Ref();
     return Status::OK();
   }
@@ -976,10 +1048,8 @@ void IteratorGetNextOp::ComputeAsync(OpKernelContext* ctx, DoneCallback done) {
         std::vector<Tensor> components;
         bool end_of_sequence = false;
 
-        IteratorContext::Params params(ctx);
-        params.function_library = iterator->function_library();
-        Status s = iterator->GetNext(IteratorContext(std::move(params)),
-                                     &components, &end_of_sequence);
+        Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                     &end_of_sequence);
         // NOTE(mrry): We must unref the iterator before calling `done()`, to
         // avoid destruction races.
         iterator->Unref();
@@ -1005,10 +1075,9 @@ void IteratorGetNextSyncOp::Compute(OpKernelContext* ctx) {
   core::ScopedUnref unref_iterator(iterator);
   std::vector<Tensor> components;
   bool end_of_sequence = false;
-  IteratorContext::Params params(ctx);
-  params.function_library = iterator->function_library();
-  OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(std::move(params)),
-                                        &components, &end_of_sequence));
+
+  OP_REQUIRES_OK(ctx, iterator->GetNext(IteratorContext(ctx), &components,
+                                        &end_of_sequence));
   OP_REQUIRES(ctx, !end_of_sequence, errors::OutOfRange("End of sequence"));
 
   for (int i = 0; i < components.size(); ++i) {
@@ -1041,10 +1110,8 @@ class IteratorGetNextAsOptionalOp : public AsyncOpKernel {
           std::vector<Tensor> components;
           bool end_of_sequence = false;
 
-          IteratorContext::Params params(ctx);
-          params.function_library = iterator->function_library();
-          Status s = iterator->GetNext(IteratorContext(std::move(params)),
-                                       &components, &end_of_sequence);
+          Status s = iterator->GetNext(IteratorContext(ctx), &components,
+                                       &end_of_sequence);
           // NOTE(mrry): We must unref the iterator before calling `done()`, to
           // avoid destruction races.
           iterator->Unref();
@@ -1211,50 +1278,60 @@ class DeserializeIteratorOp : public OpKernel {
 
 
 REGISTER_KERNEL_BUILDER(Name("Iterator").Device(DEVICE_CPU), IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_CPU).Priority(2),
                         IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorV2").Device(DEVICE_GPU).Priority(1),
                         IteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("MakeIterator").Device(DEVICE_CPU).Priority(2),
                         MakeIteratorOp);
 REGISTER_KERNEL_BUILDER(
-    Name("MakeIterator").Device(DEVICE_GPU).HostMemory("dataset"),
+    Name("MakeIterator").Device(DEVICE_GPU).Priority(1).HostMemory("dataset"),
     MakeIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE_CPU),
-                        AnonymousIteratorHandleOp);
-REGISTER_KERNEL_BUILDER(Name("AnonymousIterator").Device(DEVICE_GPU),
-                        AnonymousIteratorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("AnonymousIterator").Device(DEVICE_CPU).Priority(2),
+    AnonymousIteratorHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("AnonymousIterator").Device(DEVICE_GPU).Priority(1),
+    AnonymousIteratorHandleOp);
 REGISTER_KERNEL_BUILDER(Name("DatasetToSingleElement").Device(DEVICE_CPU),
                         ToSingleElementOp);
 REGISTER_KERNEL_BUILDER(Name("ReduceDataset").Device(DEVICE_CPU),
                         ReduceDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("OneShotIterator").Device(DEVICE_CPU),
                         OneShotIteratorOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_CPU).Priority(2),
                         IteratorGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("IteratorGetNext").Device(DEVICE_GPU).Priority(1),
                         IteratorGetNextOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_CPU),
-                        IteratorGetNextSyncOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextSync").Device(DEVICE_GPU),
-                        IteratorGetNextSyncOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE_CPU),
-                        IteratorGetNextAsOptionalOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorGetNextAsOptional").Device(DEVICE_GPU),
-                        IteratorGetNextAsOptionalOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle").Device(DEVICE_CPU),
-                        IteratorToStringHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextSync").Device(DEVICE_CPU).Priority(2),
+    IteratorGetNextSyncOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextSync").Device(DEVICE_GPU).Priority(1),
+    IteratorGetNextSyncOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextAsOptional").Device(DEVICE_CPU).Priority(2),
+    IteratorGetNextAsOptionalOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorGetNextAsOptional").Device(DEVICE_GPU).Priority(1),
+    IteratorGetNextAsOptionalOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorToStringHandle").Device(DEVICE_CPU).Priority(2),
+    IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorToStringHandle")
                             .Device(DEVICE_GPU)
-                            .HostMemory("string_handle"),
+                            .HostMemory("string_handle")
+                            .Priority(1),
                         IteratorToStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandle").Device(DEVICE_CPU),
                         IteratorFromStringHandleOp);
-REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2").Device(DEVICE_CPU),
-                        IteratorFromStringHandleOp);
+REGISTER_KERNEL_BUILDER(
+    Name("IteratorFromStringHandleV2").Device(DEVICE_CPU).Priority(2),
+    IteratorFromStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("IteratorFromStringHandleV2")
                             .Device(DEVICE_GPU)
-                            .HostMemory("string_handle"),
+                            .HostMemory("string_handle")
+                            .Priority(1),
                         IteratorFromStringHandleOp);
 REGISTER_KERNEL_BUILDER(Name("SerializeIterator").Device(DEVICE_CPU),
                         SerializeIteratorOp);
diff --git a/tensorflow/core/kernels/data/map_dataset_op.cc b/tensorflow/core/kernels/data/map_dataset_op.cc
index d64114e70e531c527e07ec1f38e3771ee171e8cd..fc6e93a81cb47372fa023a2f793d35008ab830c8 100644
--- a/tensorflow/core/kernels/data/map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/map_dataset_op.cc
@@ -30,8 +30,9 @@ namespace {
 
 class MapDatasetOp : public UnaryDatasetOpKernel {
  public:
-  using MapIteratorFunction = std::function<Status(
-      IteratorContext*, std::vector<Tensor>, std::vector<Tensor>*)>;
+  using MapIteratorFunction =
+      std::function<Status(IteratorContext*, InstantiatedCapturedFunction*,
+                           std::vector<Tensor>, std::vector<Tensor>*)>;
 
   explicit MapDatasetOp(OpKernelConstruction* ctx) : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
@@ -39,6 +40,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
   void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
@@ -54,15 +57,18 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
     MapIteratorFunction map_func;
     CapturedFunction* raw_captured_func = captured_func.get();
     if (indices.empty()) {
-      map_func = [raw_captured_func](IteratorContext* ctx,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors) {
-        return raw_captured_func->Run(ctx, std::move(args), out_tensors);
+      map_func = [](IteratorContext* ctx,
+                    InstantiatedCapturedFunction* inst_captured_func,
+                    std::vector<Tensor> args,
+                    std::vector<Tensor>* out_tensors) {
+        return inst_captured_func->Run(ctx, std::move(args), out_tensors);
       };
     } else {
       std::vector<bool> can_move = ComputeMoveVector(indices);
       map_func = [raw_captured_func, indices, can_move](
-                     IteratorContext* ctx, std::vector<Tensor> args,
+                     IteratorContext* ctx,
+                     InstantiatedCapturedFunction* inst_captured_func,
+                     std::vector<Tensor> args,
                      std::vector<Tensor>* out_tensors) {
         const std::vector<Tensor>& captured_inputs =
             raw_captured_func->captured_inputs();
@@ -82,9 +88,10 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       };
     }
 
-    *output = new Dataset(ctx, input, func_, std::move(captured_func),
-                          output_types_, output_shapes_,
-                          use_inter_op_parallelism_, std::move(map_func));
+    *output =
+        new Dataset(ctx, input, func_, std::move(captured_func), output_types_,
+                    output_shapes_, use_inter_op_parallelism_,
+                    std::move(map_func), preserve_cardinality_);
   }
 
  private:
@@ -95,11 +102,13 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
             std::unique_ptr<CapturedFunction> captured_func,
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes,
-            bool use_inter_op_parallelism, MapIteratorFunction map_func)
+            bool use_inter_op_parallelism, MapIteratorFunction map_func,
+            bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
           use_inter_op_parallelism_(use_inter_op_parallelism),
+          preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
           output_types_(output_types),
           output_shapes_(output_shapes),
@@ -124,6 +133,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "MapDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -157,13 +168,19 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       b->BuildAttrValue(use_inter_op_parallelism_,
                         &use_inter_op_parallelism_attr);
 
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this, {std::make_pair(0, input_graph_node)},  // Single tensor inputs.
           {std::make_pair(1, other_arguments)},         // Tensor list inputs.
           {std::make_pair("f", f_attr),
            std::make_pair("Targuments", other_arguments_types_attr),
            std::make_pair("use_inter_op_parallelism",
-                          use_inter_op_parallelism_attr)},  // Attrs
+                          use_inter_op_parallelism_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
@@ -177,7 +194,8 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -194,12 +212,22 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
           return Status::OK();
         }
 
-        Status s = map_func_(ctx, args, out_tensors);
+        Status s = map_func_(ctx, instantiated_captured_func_.get(), args,
+                             out_tensors);
         if (errors::IsOutOfRange(s)) {
-          // `f` may deliberately raise `errors::OutOfRange` to indicate
-          // that we should terminate the iteration early.
-          *end_of_sequence = true;
-          return Status::OK();
+          if (dataset()->preserve_cardinality_) {
+            // To guarantee that the transformation preserves the cardinality of
+            // the dataset, we convert `OutOfRange` to `InvalidArgument` as the
+            // former may be interpreted by a caller as the end of sequence.
+            return errors::InvalidArgument(
+                "Function invocation produced OutOfRangeError: ",
+                s.error_message());
+          } else {
+            // `f` may deliberately raise `errors::OutOfRange` to indicate
+            // that we should terminate the iteration early.
+            *end_of_sequence = true;
+            return Status::OK();
+          }
         } else {
           return s;
         }
@@ -226,11 +254,13 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
      private:
       std::unique_ptr<IteratorBase> input_impl_;
       const MapIteratorFunction map_func_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
     const NameAttrList func_;
     const bool use_inter_op_parallelism_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
     const DataTypeVector output_types_;
     const std::vector<PartialTensorShape> output_shapes_;
@@ -241,9 +271,15 @@ class MapDatasetOp : public UnaryDatasetOpKernel {
   std::vector<PartialTensorShape> output_shapes_;
   NameAttrList func_;
   bool use_inter_op_parallelism_;
+  bool preserve_cardinality_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("MapDataset").Device(DEVICE_CPU), MapDatasetOp);
+REGISTER_KERNEL_BUILDER(Name("ExperimentalMapDataset")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("input_dataset")
+                            .HostMemory("handle"),
+                        MapDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/model_dataset_op.cc b/tensorflow/core/kernels/data/model_dataset_op.cc
index dcd23095968493a9051fe918f6c79c527dad638e..069d61d80d4f00eecdd77356626d7278c0842445 100644
--- a/tensorflow/core/kernels/data/model_dataset_op.cc
+++ b/tensorflow/core/kernels/data/model_dataset_op.cc
@@ -60,6 +60,8 @@ class ModelDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "ModelDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
index 5268007e3d95286eaf3bdf19456c6b007e90f329..ba2125a66eb98985ebd0ae8f55bfc239997ad6df 100644
--- a/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
+++ b/tensorflow/core/kernels/data/multi_device_iterator_ops.cc
@@ -17,6 +17,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/resource_op_kernel.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
@@ -40,18 +41,21 @@ using MultiDeviceIteratorCallback =
 
 class MultiDeviceIterator : public ResourceBase {
  public:
-  MultiDeviceIterator(const DataTypeVector& output_types,
-                      const std::vector<PartialTensorShape>& output_shapes,
-                      const std::vector<string>& devices,
-                      std::unique_ptr<FunctionLibraryDefinition> flib_def,
-                      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
-                      FunctionLibraryRuntime* lib)
+  MultiDeviceIterator(
+      const DataTypeVector& output_types,
+      const std::vector<PartialTensorShape>& output_shapes,
+      const std::vector<string>& devices,
+      std::unique_ptr<FunctionLibraryDefinition> flib_def,
+      std::unique_ptr<ProcessFunctionLibraryRuntime> pflr,
+      FunctionLibraryRuntime* lib,
+      std::unique_ptr<FunctionHandleCache> function_handle_cache)
       : output_types_(output_types),
         output_shapes_(output_shapes),
         devices_(devices),
         flib_def_(std::move(flib_def)),
         pflr_(std::move(pflr)),
-        lib_(lib) {
+        lib_(lib),
+        function_handle_cache_(std::move(function_handle_cache)) {
     DCHECK(lib_ != nullptr);
   }
 
@@ -93,6 +97,8 @@ class MultiDeviceIterator : public ResourceBase {
     } else {
       IteratorContext::Params params(ctx);
       params.lib = lib_;
+      params.function_handle_cache = function_handle_cache_.get();
+      params.resource_mgr = &resource_mgr_;
       IteratorContext iter_ctx(std::move(params));
       tf_shared_lock l(mu_);
       multi_device_buffer_->GetNextFromShard(
@@ -116,6 +122,12 @@ class MultiDeviceIterator : public ResourceBase {
     return lib_;
   }
 
+  FunctionHandleCache* function_handle_cache() {
+    return function_handle_cache_.get();
+  }
+
+  ResourceMgr* resource_mgr() { return &resource_mgr_; }
+
  private:
   // A private class that uses a background thread to keep a per device buffer
   // full.
@@ -340,6 +352,8 @@ class MultiDeviceIterator : public ResourceBase {
   const std::unique_ptr<FunctionLibraryDefinition> flib_def_;
   const std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_;
   FunctionLibraryRuntime* const lib_ = nullptr;  // not owned.
+  const std::unique_ptr<FunctionHandleCache> function_handle_cache_;
+  ResourceMgr resource_mgr_;
   std::shared_ptr<const FunctionLibraryDefinition> lib_def_ GUARDED_BY(mu_);
 
   int64 incarnation_id_ GUARDED_BY(mu_) = 0;
@@ -383,21 +397,24 @@ class MultiDeviceIteratorHandleOp : public OpKernel {
         std::unique_ptr<ProcessFunctionLibraryRuntime> pflr(nullptr);
         OP_REQUIRES_OK(context, context->function_library()->Clone(
                                     &flib_def, &pflr, &lib));
+        std::unique_ptr<FunctionHandleCache> function_handle_cache(
+            new FunctionHandleCache(lib));
         ResourceMgr* mgr = context->resource_manager();
         OP_REQUIRES_OK(context, cinfo_.Init(mgr, def()));
 
         MultiDeviceIterator* resource;
-        OP_REQUIRES_OK(
-            context,
-            mgr->LookupOrCreate<MultiDeviceIterator>(
-                cinfo_.container(), cinfo_.name(), &resource,
-                [this, lib, &flib_def, &pflr](MultiDeviceIterator** ret)
-                    EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-                      *ret = new MultiDeviceIterator(
-                          output_types_, output_shapes_, devices_,
-                          std::move(flib_def), std::move(pflr), lib);
-                      return Status::OK();
-                    }));
+        OP_REQUIRES_OK(context,
+                       mgr->LookupOrCreate<MultiDeviceIterator>(
+                           cinfo_.container(), cinfo_.name(), &resource,
+                           [this, lib, &flib_def, &pflr,
+                            &function_handle_cache](MultiDeviceIterator** ret)
+                               EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+                                 *ret = new MultiDeviceIterator(
+                                     output_types_, output_shapes_, devices_,
+                                     std::move(flib_def), std::move(pflr), lib,
+                                     std::move(function_handle_cache));
+                                 return Status::OK();
+                               }));
 
         Status s = VerifyResource(resource);
         if (TF_PREDICT_FALSE(!s.ok())) {
@@ -463,6 +480,8 @@ class MultiDeviceIteratorInitOp : public OpKernel {
     std::unique_ptr<IteratorBase> iterator;
     IteratorContext::Params params(ctx);
     params.lib = resource->lib();
+    params.function_handle_cache = resource->function_handle_cache();
+    params.resource_mgr = resource->resource_mgr();
     IteratorContext iter_ctx(std::move(params));
     OP_REQUIRES_OK(
         ctx, dataset->MakeIterator(std::move(iter_ctx), "Iterator", &iterator));
diff --git a/tensorflow/core/kernels/data/optimize_dataset_op.cc b/tensorflow/core/kernels/data/optimize_dataset_op.cc
index 213ee7c601810447727b9a816785ccbc111c8c83..9c50d8050a82397f1578ab3f577ef5ad77f81767 100644
--- a/tensorflow/core/kernels/data/optimize_dataset_op.cc
+++ b/tensorflow/core/kernels/data/optimize_dataset_op.cc
@@ -19,6 +19,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/process_function_library_runtime.h"
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/device_base.h"
+#include "tensorflow/core/framework/function_handle_cache.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/graph/graph_constructor.h"
@@ -27,8 +28,10 @@ limitations under the License.
 #include "tensorflow/core/grappler/graph_view.h"
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/grappler_item_builder.h"
+#include "tensorflow/core/grappler/optimizers/data/function_utils.h"
 #include "tensorflow/core/grappler/optimizers/data/graph_utils.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+#include "tensorflow/core/lib/core/refcount.h"
 #include "tensorflow/core/lib/random/random.h"
 #include "tensorflow/core/protobuf/meta_graph.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
@@ -56,8 +59,13 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         ctx, ParseVectorArgument<string>(ctx, "optimizations", &optimizations));
     Dataset* dataset =
         new Dataset(ctx, input, optimizations, output_types_, output_shapes_);
-    OP_REQUIRES_OK(ctx, dataset->Optimize(ctx));
-    *output = dataset;
+    Status s = dataset->Optimize(ctx);
+    if (s.ok()) {
+      *output = dataset;
+    } else {
+      dataset->Unref();
+      OP_REQUIRES_OK(ctx, s);
+    }
   }
 
  private:
@@ -68,6 +76,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
             const DataTypeVector& output_types,
             const std::vector<PartialTensorShape>& output_shapes)
         : DatasetBase(DatasetContext(ctx)),
+          optimized_input_(nullptr),
           input_(input),
           optimizations_(optimizations),
           output_types_(output_types),
@@ -77,7 +86,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     ~Dataset() override {
       input_->Unref();
-      optimized_input_->Unref();
+      if (optimized_input_) {
+        optimized_input_->Unref();
+      }
     }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
@@ -115,6 +126,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       TF_RETURN_IF_ERROR(
           ctx->function_library()->Clone(&flib_def_, &pflr_, &lib_));
 
+      // Create a FunctionHandleCache.
+      function_handle_cache_.reset(new FunctionHandleCache(lib_));
+
       // Some functions may have been modified without having their names
       // changed (for example, nested dataset graphs from FlatMap or
       // Interleave). To avoid name conflicts, we remove these functions from
@@ -148,6 +162,8 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "OptimizeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -167,6 +183,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       Status Initialize(IteratorContext* ctx) override {
         IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
+        params.function_handle_cache = dataset()->function_handle_cache_.get();
         return dataset()->optimized_input_->MakeIterator(
             IteratorContext(std::move(params)), prefix(), &input_impl_);
       }
@@ -176,6 +193,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
                              bool* end_of_sequence) override {
         IteratorContext::Params params(ctx);
         params.lib = dataset()->lib_;
+        params.function_handle_cache = dataset()->function_handle_cache_.get();
         return input_impl_->GetNext(IteratorContext(std::move(params)),
                                     out_tensors, end_of_sequence);
       }
@@ -202,6 +220,39 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       std::unique_ptr<IteratorBase> input_impl_;
     };
 
+    void AddFakeSinks(FunctionDef* function_def) {
+      int counter = 0;
+      for (const auto& output : function_def->signature().output_arg()) {
+        NodeDef* node = function_def->add_node_def();
+        tensorflow::grappler::function_utils::SetUniqueFunctionNodeName(
+            strings::StrCat("FakeSink", counter++), function_def, node);
+        node->set_op("Identity");
+        node->add_input(function_def->ret().at(output.name()));
+        (*node->mutable_attr())["T"].set_type(output.type());
+
+        (*function_def->mutable_ret())[output.name()] =
+            strings::StrCat(node->name(), ":output:0");
+      }
+    }
+
+    void RemoveFakeSinks(FunctionDef* function_def) {
+      // Map from identity node names to their input tensor strings
+      std::map<string, string> identity_map;
+      for (const auto& node : function_def->node_def()) {
+        if (node.op() == "Identity" && node.input_size() == 1) {
+          identity_map[node.name()] = node.input(0);
+        }
+      }
+      for (const auto& output_arg : function_def->signature().output_arg()) {
+        const string& tensor = function_def->ret().at(output_arg.name());
+        const string& output_node = tensor.substr(0, tensor.find(':'));
+        if (identity_map.find(output_node) != identity_map.end()) {
+          (*function_def->mutable_ret())[output_arg.name()] =
+              identity_map.at(output_node);
+        }
+      }
+    }
+
     Status ApplyOptimizations(OpKernelContext* ctx, GraphDef* graph_def,
                               string* output_node) {
       // Add an identity node as the fetch node, otherwise we might get
@@ -215,6 +266,15 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       (*node->mutable_attr())["T"].set_type(DT_VARIANT);
       *output_node = node->name();
 
+      // Add fake sink node to graph and functions to allow rewriting the actual
+      // sink nodes.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        AddFakeSinks(&function_def);
+      }
+
       // Create metagraph.
       MetaGraphDef meta_graph_def;
       (*meta_graph_def.mutable_graph_def()) = *graph_def;
@@ -226,7 +286,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
       (*meta_graph_def.mutable_collection_def())["train_op"] = collection_def;
 
       // Create Grappler item.
-      tensorflow::RewriterConfig rewriter_config;
+      tensorflow::ConfigProto config;
+      RewriterConfig& rewriter_config =
+          *config.mutable_graph_options()->mutable_rewrite_options();
       for (const string& optimization : optimizations_) {
         rewriter_config.add_optimizers(optimization);
       }
@@ -243,8 +305,9 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         // removing unused graph nodes)
         // TODO(b/118175421): This should be part of the tf.data optimization
         // pass manager.
-        for (const auto& optimizer : {"pruning", "function", "constfold",
-                                      "shape", "arithmetic", "dependency"}) {
+        // TODO(b/120437209): Apply `constfold` optimization when it is fixed.
+        for (const auto& optimizer :
+             {"pruning", "function", "shape", "arithmetic", "dependency"}) {
           rewriter_config.add_optimizers(optimizer);
         }
       }
@@ -264,7 +327,15 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
         }
       }
       TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-          *grappler_item, rewriter_config, ctx->device(), &cluster, graph_def));
+          *grappler_item, config, ctx->device(), &cluster, graph_def));
+
+      // Remove fake sinks after optimizations are done.
+      // TODO(b/118820916): When MetaOptimizer adds provisions for function
+      // retvals to be optimizable, we will no longer need this.
+      for (auto& function_def :
+           *graph_def->mutable_library()->mutable_function()) {
+        RemoveFakeSinks(&function_def);
+      }
 
       return Status::OK();
     }
@@ -273,6 +344,7 @@ class OptimizeDatasetOp : public UnaryDatasetOpKernel {
     FunctionLibraryRuntime* lib_ = nullptr;
     std::unique_ptr<ProcessFunctionLibraryRuntime> pflr_ = nullptr;
     std::unique_ptr<FunctionLibraryDefinition> flib_def_ = nullptr;
+    std::unique_ptr<FunctionHandleCache> function_handle_cache_ = nullptr;
     const DatasetBase* input_;
     const std::vector<string> optimizations_;
     const DataTypeVector output_types_;
diff --git a/tensorflow/core/kernels/data/optional_ops.cc b/tensorflow/core/kernels/data/optional_ops.cc
index 2ab5c83082b3f97f354a4f39db02c1012557b6a4..d8a7f21c5f99c6d99e506847e00cabc6bd49168f 100644
--- a/tensorflow/core/kernels/data/optional_ops.cc
+++ b/tensorflow/core/kernels/data/optional_ops.cc
@@ -22,75 +22,6 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 namespace {
-const char kOptionalVariantTypeName[] = "tensorflow::data::Optional";
-
-// An `OptionalVariant` can represent either an "actual value" (a tuple of
-// tensors) or "none", and may be stored in a DT_VARIANT tensor.
-class OptionalVariant {
- public:
-  // Create an `OptionalVariant` with no actual value.
-  OptionalVariant() : values_(nullptr) {}
-
-  // Create an `OptionalVariant` with the actual value given by the tuple of
-  // tensors in `values`.
-  explicit OptionalVariant(std::vector<Tensor> values)
-      : values_(new std::vector<Tensor>(std::move(values))) {}
-
-  OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
-
-  // Returns true if `this` represents an actual value.
-  bool has_value() const { return values_ != nullptr; }
-
-  // REQUIRES: `this->has_value()` must be true.
-  const std::vector<Tensor>& get_values() const {
-    CHECK(values_) << "Tried to get values from an empty OptionalVariant";
-    return *values_;
-  }
-
-  // Implementations of the necessary methods for using `OptionalVariant`
-  // objects in DT_VARIANT tensors.
-  string TypeName() const { return kOptionalVariantTypeName; }
-  void Encode(VariantTensorData* data) const {
-    data->set_metadata(values_ != nullptr);
-    if (values_ != nullptr) {
-      for (const auto& t : *values_) {
-        *(data->add_tensors()) = t;
-      }
-    }
-  }
-
-  bool Decode(const VariantTensorData& data) {
-    if (data.type_name() != TypeName()) {
-      return false;
-    }
-    bool has_value = false;
-    if (!data.get_metadata(&has_value)) {
-      return false;
-    }
-    if (has_value) {
-      values_.reset(new std::vector<Tensor>(data.tensors()));
-    } else {
-      values_.reset();
-    }
-    return true;
-  }
-
-  string DebugString() const {
-    if (values_) {
-      return strings::StrCat("OptionalVariant<", "values: (",
-                             str_util::Join(*values_, ", ",
-                                            [](string* s, const Tensor& elem) {
-                                              *s = elem.DebugString();
-                                            }),
-                             ")>");
-    } else {
-      return strings::StrCat("OptionalVariant<None>");
-    }
-  }
-
- private:
-  std::shared_ptr<const std::vector<Tensor>> values_;
-};
 
 class OptionalNoneOp : public OpKernel {
  public:
@@ -143,6 +74,12 @@ class OptionalGetValueOp : public OpKernel {
   explicit OptionalGetValueOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
+    OP_REQUIRES(
+        ctx, output_shapes_.size() == output_types_.size(),
+        errors::InvalidArgument(
+            "output_types and output_shapes must be same length, got:\n",
+            "output_types: ", output_types_.size(), "\n",
+            "output_shapes: ", output_shapes_.size()));
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -162,6 +99,10 @@ class OptionalGetValueOp : public OpKernel {
         ctx, optional->has_value(),
         errors::InvalidArgument("The given optional does not have a value."));
     const auto& components = optional->get_values();
+    OP_REQUIRES(ctx, components.size() == output_types_.size(),
+                errors::InvalidArgument(
+                    "The given optional has ", components.size(),
+                    " components, expected ", output_types_.size()));
     for (int i = 0; i < components.size(); ++i) {
       OP_REQUIRES(
           ctx, components[i].dtype() == output_types_[i],
@@ -186,23 +127,27 @@ class OptionalGetValueOp : public OpKernel {
   std::vector<PartialTensorShape> output_shapes_;
 };
 
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_CPU).Priority(2),
                         OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalNone").Device(DEVICE_GPU).Priority(1),
                         OptionalNoneOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE_CPU),
-                        OptionalFromValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalFromValue").Device(DEVICE_GPU),
-                        OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_CPU).Priority(2),
+    OptionalFromValueOp);
+REGISTER_KERNEL_BUILDER(
+    Name("OptionalFromValue").Device(DEVICE_GPU).Priority(1),
+    OptionalFromValueOp);
 
-REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue").Device(DEVICE_CPU).Priority(2),
                         OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(
-    Name("OptionalHasValue").Device(DEVICE_GPU).HostMemory("has_value"),
-    OptionalHasValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalHasValue")
+                            .Device(DEVICE_GPU)
+                            .HostMemory("has_value")
+                            .Priority(1),
+                        OptionalHasValueOp);
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_CPU).Priority(2),
                         OptionalGetValueOp);
-REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU),
+REGISTER_KERNEL_BUILDER(Name("OptionalGetValue").Device(DEVICE_GPU).Priority(1),
                         OptionalGetValueOp);
 
 static Status OptionalDeviceCopy(
@@ -213,15 +158,7 @@ static Status OptionalDeviceCopy(
     std::vector<Tensor> to_values;
     to_values.reserve(from_values.size());
     for (const Tensor& t : from_values) {
-      if (t.dtype() == DT_VARIANT) {
-        // TODO(b/116349787): Implement support for nested variants.
-        return errors::Unimplemented(
-            "Support for copying nested variants to device has not yet been "
-            "implemented.");
-      }
-    }
-    for (const Tensor& t : from_values) {
-      if (DMAHelper::CanUseDMA(&t)) {
+      if (DMAHelper::CanUseDMA(&t) || t.dtype() == DT_VARIANT) {
         Tensor tmp(t.dtype());
         TF_RETURN_IF_ERROR(copy(t, &tmp));
         to_values.push_back(std::move(tmp));
@@ -272,5 +209,20 @@ Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index) {
   return Status::OK();
 }
 
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_CPU, OptionalVariant,
+                                         OptionalZerosLike<CPUDevice>);
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_CPU,
+                                          OptionalVariant,
+                                          OptionalBinaryAdd<CPUDevice>);
+
+Status OptionalShape(const OptionalVariant& x, TensorShape* s) {
+  *s = TensorShape({});
+  return Status::OK();
+}
+
+REGISTER_UNARY_VARIANT_SHAPE_FUNCTION(OptionalVariant, OptionalShape);
+
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/optional_ops.cu.cc b/tensorflow/core/kernels/data/optional_ops.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eb4a95a6f2245665ff70922df733909e9ba996df
--- /dev/null
+++ b/tensorflow/core/kernels/data/optional_ops.cu.cc
@@ -0,0 +1,37 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#define EIGEN_USE_THREADS
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include "tensorflow/core/kernels/data/optional_ops.h"
+
+#include "tensorflow/core/framework/variant_op_registry.h"
+
+namespace tensorflow {
+namespace data {
+
+REGISTER_UNARY_VARIANT_UNARY_OP_FUNCTION(ZEROS_LIKE_VARIANT_UNARY_OP,
+                                         DEVICE_GPU, OptionalVariant,
+                                         OptionalZerosLike<GPUDevice>);
+
+REGISTER_UNARY_VARIANT_BINARY_OP_FUNCTION(ADD_VARIANT_BINARY_OP, DEVICE_GPU,
+                                          OptionalVariant,
+                                          OptionalBinaryAdd<GPUDevice>);
+
+}  // namespace data
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/data/optional_ops.h b/tensorflow/core/kernels/data/optional_ops.h
index 2cbf2933f50a11b01ca19739e6f3318b4816e800..ef14e843115da0c37d79c6be13b8064c78c072d5 100644
--- a/tensorflow/core/kernels/data/optional_ops.h
+++ b/tensorflow/core/kernels/data/optional_ops.h
@@ -19,10 +19,13 @@ limitations under the License.
 
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/framework/variant_tensor_data.h"
+#include "tensorflow/core/util/tensor_ops_util.h"
 
 namespace tensorflow {
 namespace data {
 
+const char kOptionalVariantTypeName[] = "tensorflow::data::Optional";
+
 // Stores a DT_VARIANT value representing an Optional with the given value
 // in the `output_index`^th output of the given kernel execution context.
 Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
@@ -32,6 +35,122 @@ Status WriteOptionalWithValueToOutput(OpKernelContext* ctx, int output_index,
 // in the `output_index`^th output of the given kernel execution context.
 Status WriteOptionalNoneToOutput(OpKernelContext* ctx, int output_index);
 
+// An `OptionalVariant` can represent either an "actual value" (a tuple of
+// tensors) or "none", and may be stored in a DT_VARIANT tensor.
+class OptionalVariant {
+ public:
+  // Create an `OptionalVariant` with no actual value.
+  OptionalVariant() : values_(nullptr) {}
+
+  // Create an `OptionalVariant` with the actual value given by the tuple of
+  // tensors in `values`.
+  explicit OptionalVariant(std::vector<Tensor> values)
+      : values_(new std::vector<Tensor>(std::move(values))) {}
+
+  OptionalVariant(const OptionalVariant& other) : values_(other.values_) {}
+
+  // Returns true if `this` represents an actual value.
+  bool has_value() const { return values_ != nullptr; }
+
+  // REQUIRES: `this->has_value()` must be true.
+  const std::vector<Tensor>& get_values() const {
+    DCHECK(values_) << "Tried to get values from an empty OptionalVariant";
+    return *values_;
+  }
+
+  // Implementations of the necessary methods for using `OptionalVariant`
+  // objects in DT_VARIANT tensors.
+  string TypeName() const { return kOptionalVariantTypeName; }
+  void Encode(VariantTensorData* data) const {
+    data->set_metadata(values_ != nullptr);
+    if (values_ != nullptr) {
+      for (const auto& t : *values_) {
+        *(data->add_tensors()) = t;
+      }
+    }
+  }
+
+  bool Decode(const VariantTensorData& data) {
+    if (data.type_name() != TypeName()) {
+      return false;
+    }
+    bool has_value = false;
+    if (!data.get_metadata(&has_value)) {
+      return false;
+    }
+    if (has_value) {
+      values_.reset(new std::vector<Tensor>(data.tensors()));
+    } else {
+      values_.reset();
+    }
+    return true;
+  }
+
+  string DebugString() const {
+    if (values_) {
+      return strings::StrCat("OptionalVariant<", "values: (",
+                             str_util::Join(*values_, ", ",
+                                            [](string* s, const Tensor& elem) {
+                                              *s = elem.DebugString();
+                                            }),
+                             ")>");
+    } else {
+      return strings::StrCat("OptionalVariant<None>");
+    }
+  }
+
+ private:
+  std::shared_ptr<const std::vector<Tensor>> values_;
+};
+
+template <typename Device>
+Status OptionalZerosLike(OpKernelContext* ctx, const OptionalVariant& x,
+                         OptionalVariant* y) {
+  if (!x.has_value()) {
+    *y = x;
+    return Status::OK();
+  }
+  std::vector<Tensor> zero_tensors;
+  for (const Tensor& tensor : x.get_values()) {
+    Tensor zero_t;
+    TF_RETURN_IF_ERROR(ZerosLikeTensor<Device>(ctx, tensor, &zero_t));
+    zero_tensors.push_back(std::move(zero_t));
+  }
+  *y = OptionalVariant(zero_tensors);
+  return Status::OK();
+}
+
+template <typename Device>
+Status OptionalBinaryAdd(OpKernelContext* ctx, const OptionalVariant& a,
+                         const OptionalVariant& b, OptionalVariant* out) {
+  // TODO(skyewm): should adding a value to a non-value be a no-op instead?
+  if (a.has_value() != b.has_value()) {
+    return errors::InvalidArgument(
+        "Cannot add optionals because one has a value and the other doesn't.");
+  }
+  if (!a.has_value()) {
+    *out = a;
+    return Status::OK();
+  }
+  if (a.get_values().size() != b.get_values().size()) {
+    return errors::InvalidArgument(
+        "Cannot add optionals because they have different numbers of "
+        "components (",
+        a.get_values().size(), " vs. ", b.get_values().size(), ").");
+  }
+  std::vector<Tensor> out_tensors;
+  for (int i = 0; i < a.get_values().size(); ++i) {
+    const Tensor& a_tensor = a.get_values()[i];
+    const Tensor& b_tensor = b.get_values()[i];
+    Tensor out_tensor;
+    TF_RETURN_IF_ERROR(
+        BinaryAddTensors<Device>(ctx, a_tensor, b_tensor, &out_tensor));
+    out_tensors.push_back(std::move(out_tensor));
+  }
+  *out = OptionalVariant(out_tensors);
+  return Status::OK();
+}
+
 }  // namespace data
 }  // namespace tensorflow
 
diff --git a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
index 594a9ce7ec2d3ac634de4d643f19b6ec6c53ddc8..0fff4c53706269538f770889744e21fffcae3601 100644
--- a/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/padded_batch_dataset_op.cc
@@ -152,6 +152,15 @@ class PaddedBatchDatasetOp : public UnaryDatasetOpKernel {
                              ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / batch_size_ +
+             (n % batch_size_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
index 985e197a9934ece2fcc8abdc98f445b5293f7a98..2f6d91e863401ca4cc56187a9423ae406b5f651a 100644
--- a/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_interleave_dataset_op.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/kernels/data/captured_function.h"
 #include "tensorflow/core/kernels/data/dataset_utils.h"
-#include "tensorflow/core/lib/core/error_codes.pb.h"
 #include "tensorflow/core/lib/core/threadpool.h"
 #include "tensorflow/core/lib/gtl/cleanup.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -36,1047 +35,6 @@ namespace {
 // See documentation in ../../ops/dataset_ops.cc for a high-level
 // description of the following op.
 
-class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_shapes", &output_shapes_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 cycle_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "cycle_length", &cycle_length));
-    OP_REQUIRES(ctx, cycle_length > 0,
-                errors::InvalidArgument("`cycle_length` must be > 0"));
-
-    int64 block_length = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument(ctx, "block_length", &block_length));
-    OP_REQUIRES(ctx, block_length > 0,
-                errors::InvalidArgument("`block_length` must be > 0"));
-
-    bool sloppy = false;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "sloppy", &sloppy));
-
-    int64 buffer_output_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "buffer_output_elements",
-                                            &buffer_output_elements));
-    OP_REQUIRES(
-        ctx, buffer_output_elements > 0,
-        errors::InvalidArgument("`buffer_output_elements` must be > 0"));
-
-    int64 prefetch_input_elements = 0;
-    OP_REQUIRES_OK(ctx, ParseScalarArgument(ctx, "prefetch_input_elements",
-                                            &prefetch_input_elements));
-    OP_REQUIRES(
-        ctx, prefetch_input_elements >= 0,
-        errors::InvalidArgument("`prefetch_input_elements` must be >= 0"));
-
-    std::unique_ptr<CapturedFunction> captured_func;
-    OP_REQUIRES_OK(
-        ctx, CapturedFunction::Create(interleave_func_, ctx, "other_arguments",
-                                      &captured_func));
-
-    *output =
-        new Dataset(ctx, input, interleave_func_, std::move(captured_func),
-                    cycle_length, block_length, sloppy, buffer_output_elements,
-                    prefetch_input_elements, output_types_, output_shapes_);
-  }
-
- private:
-  class Dataset : public DatasetBase {
-   public:
-    Dataset(OpKernelContext* ctx, const DatasetBase* input,
-            const NameAttrList& func,
-            std::unique_ptr<CapturedFunction> captured_func, int64 cycle_length,
-            int64 block_length, bool sloppy, int64 buffer_output_elements,
-            int64 prefetch_input_elements, const DataTypeVector& output_types,
-            const std::vector<PartialTensorShape>& output_shapes)
-        : DatasetBase(DatasetContext(ctx)),
-          input_(input),
-          interleave_func_(func),
-          captured_func_(std::move(captured_func)),
-          cycle_length_(cycle_length),
-          block_length_(block_length),
-          sloppy_(sloppy),
-          buffer_output_elements_(buffer_output_elements),
-          prefetch_input_elements_(prefetch_input_elements),
-          output_types_(output_types),
-          output_shapes_(output_shapes) {
-      input_->Ref();
-    }
-
-    ~Dataset() override { input_->Unref(); }
-
-    std::unique_ptr<IteratorBase> MakeIteratorInternal(
-        const string& prefix) const override {
-      return std::unique_ptr<IteratorBase>(new Iterator(
-          {this, strings::StrCat(prefix, "::ParallelInterleave")}));
-    }
-
-    const DataTypeVector& output_dtypes() const override {
-      return output_types_;
-    }
-
-    const std::vector<PartialTensorShape>& output_shapes() const override {
-      return output_shapes_;
-    }
-
-    string DebugString() const override {
-      return "ParallelInterleaveDatasetOp::Dataset";
-    }
-
-   protected:
-    Status AsGraphDefInternal(SerializationContext* ctx,
-                              DatasetGraphDefBuilder* b,
-                              Node** output) const override {
-      TF_RETURN_IF_ERROR(b->AddFunction(ctx, interleave_func_.name()));
-      Node* input_node;
-      TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_node));
-      Node* cycle_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(cycle_length_, &cycle_length_node));
-      Node* block_length_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(block_length_, &block_length_node));
-      Node* sloppy_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(sloppy_, &sloppy_node));
-      Node* buffer_output_elements_node;
-      TF_RETURN_IF_ERROR(
-          b->AddScalar(buffer_output_elements_, &buffer_output_elements_node));
-      Node* prefetch_input_elements_node;
-      TF_RETURN_IF_ERROR(b->AddScalar(prefetch_input_elements_,
-                                      &prefetch_input_elements_node));
-      DataTypeVector other_arguments_types;
-      other_arguments_types.reserve(captured_func_->captured_inputs().size());
-      std::vector<Node*> other_arguments;
-      other_arguments.reserve(captured_func_->captured_inputs().size());
-      for (const Tensor& t : captured_func_->captured_inputs()) {
-        Node* node;
-        TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-        other_arguments.emplace_back(node);
-        other_arguments_types.emplace_back(t.dtype());
-      }
-      AttrValue f;
-      b->BuildAttrValue(interleave_func_, &f);
-      AttrValue other_arguments_types_attr;
-      b->BuildAttrValue(other_arguments_types, &other_arguments_types_attr);
-
-      TF_RETURN_IF_ERROR(b->AddDataset(
-          this,
-          {{0, input_node},
-           {2, cycle_length_node},
-           {3, block_length_node},
-           {4, sloppy_node},
-           {5, buffer_output_elements_node},
-           {6, prefetch_input_elements_node}},
-          {{1, other_arguments}},
-          {{"f", f}, {"Targuments", other_arguments_types_attr}}, output));
-      return Status::OK();
-    }
-
-   private:
-    int64 num_threads() const {
-      return cycle_length_ + prefetch_input_elements_;
-    }
-
-    // Parallel interleave's implementation is designed around a few principles:
-    //  1. Thread creation is relatively expensive. (Not reusing
-    //     threads causes a number of indirect costs such as poorer tcmalloc
-    //     performance due to thread-local caches, etc.) We allocate a fixed
-    //     number of threads at the start and never change. This is why we've
-    //     fused functionality that is theoretically orthogonal (i.e.
-    //     .prefetch()) into the implementation.
-    //  2. Drop-in replacement for standard interleave. The goal will be to
-    //     auto-opt people into an optimized implementation without any work
-    //     on the customer's part. We thus go through great pains to maintain
-    //     identical iteration orders, full determinism (disabled only via a
-    //     flag, etc.)
-    //  3. Performance across a variety of environments and I/O envelopes.
-    //
-    // The actual implementation centers around a collection of worker threads
-    // and their corresponding worker state (tracked in the `workers_` vector).
-    // Worker threads repeatedly receive a vector of Tensors that are used as
-    // input to the flat-map function (`captured_func_`). The output of this
-    // function must be a dataset. The worker thread then repeatedly calls
-    // `GetNext()`, maintaining a buffer of elements to minimize the likelihood
-    // that a caller will block waiting for an element to be produced.
-    //
-    // Pointers to these worker states are kept in 2 disjoint data structures:
-    //  1. `interleave_indices_` is a vector containing indices of WorkerStates
-    //     in `workers_` that we are interleaving. Worker threads backing these
-    //     WorkerStates should be regularly producing values.
-    //  2. `staging_indices_` is a deque containing indices of WorkerStates in
-    //     `workers_` that we will move to `interleave_indices_` when an
-    //     iterator in `interleave_indices_` is exhausted.
-    //
-    // The client calls `GetNext[Internal]()` to retrieve an output element. The
-    // internal implementation updates the state of `interleave_indices_` and
-    // `staging_indices_` as output iterators (run by the worker threads) are
-    // exhausted.
-    //
-    // `input_impl_` is the input iterator that generates arguments for the
-    // flat-map function (`captured_func_`). It is set to an iterator at
-    // Iterator construction, and is fixed until we consume all input elements.
-    // Once it is exhausted, we reset the unique_ptr to eagerly deallocate
-    // memory.
-    //
-    // A few invariants are maintained:
-    //  1. No element in interleave_indices_ should be a -1 unless
-    //     `staging_indices_` is empty and `input_impl_` is empty.
-    //  2. Every `worker_` element is pointed to by at most one element of the
-    //     union of `interleave_indices_` and `staging_indices_`.
-    //  3. Unless `input_impl_` is empty, every `worker_` must be pointed to by
-    //     an element in `interleave_indices_` or `staging_indices_`.
-    class Iterator : public DatasetIterator<Dataset> {
-     public:
-      explicit Iterator(const Params& params)
-          : DatasetIterator<Dataset>(params),
-            workers_(dataset()->num_threads()),
-            worker_thread_states_(dataset()->num_threads()) {}
-
-      ~Iterator() override {
-        mutex_lock l(mu_);
-        cancelled_ = true;
-        // Notify all workers in case they are blocked.
-        for (auto& worker : workers_) {
-          worker.cond_var.notify_all();
-        }
-      }
-
-      Status Initialize(IteratorContext* ctx) override {
-        TF_RETURN_IF_ERROR(
-            dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
-      }
-
-      // It is implemented so that it matches the deterministic interleave
-      // unless getting the next element would block and we are allowed to be
-      // sloppy.
-      Status GetNextInternal(IteratorContext* ctx,
-                             std::vector<Tensor>* out_tensors,
-                             bool* end_of_sequence) override {
-        mutex_lock l(mu_);
-        TF_RETURN_IF_ERROR(EnsureWorkerThreadsStarted(ctx));
-        while (!cancelled_) {
-          // Wait for an item to become available, blocking if necessary. If we
-          // are allowed to be sloppy, we can skip over input datasets that do
-          // not have an item readily available.
-          bool can_produce_elements = false;
-          bool must_wait_for_input = true;
-          for (int64 i = 0; i < interleave_indices_.size(); ++i) {
-            int64 index = (next_index_ + i) % interleave_indices_.size();
-            int64 current_worker_index = interleave_indices_[index];
-            if (current_worker_index < 0) {
-              continue;  // Empty interleave elements.
-            }
-            WorkerState* current_worker = &workers_[current_worker_index];
-            can_produce_elements |= current_worker->MayHaveElements();
-            if (!current_worker->outputs.empty()) {
-              // We have an element!
-              next_index_ = index;
-              const bool element_acquired_sloppily =
-                  dataset()->sloppy_ && i > 1;
-              if (!element_acquired_sloppily) {
-                // If the element was acquired in the regular (non-sloppy)
-                // order, then advance the current block and cycle pointers to
-                // the next element in the regular order.
-                block_count_++;
-                if (block_count_ == dataset()->block_length_) {
-                  next_index_ = (index + 1) % interleave_indices_.size();
-                  block_count_ = 0;
-                }
-              } else {
-                block_count_ = 0;
-              }
-              *end_of_sequence = false;
-              Status s = current_worker->outputs.front().status;
-              current_worker->outputs.front().output.swap(*out_tensors);
-              current_worker->outputs.pop_front();
-              current_worker->cond_var.notify_one();
-              return s;
-            } else if (current_worker->is_producing && !dataset()->sloppy_) {
-              // current_worker.outputs.empty(), and we must wait for this
-              // iterator.
-              if (next_index_ != index) {
-                // We have advanced to a new iterator; reset block counts.
-                next_index_ = index;
-                block_count_ = 0;
-              }
-              break;
-            } else if (!current_worker->is_producing) {
-              // This iterator has reached end of input.
-              interleave_indices_[index] = -1;
-              if (input_impl_) {
-                // Start prefetching a new iterator.
-                std::vector<Tensor> args;
-                bool end_of_input = false;
-                Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-                if (end_of_input) {
-                  input_impl_.reset();
-                } else {
-                  current_worker->SetInputs(s, std::move(args));
-                  staging_indices_.emplace_back(current_worker_index);
-                }
-              }
-
-              if (!staging_indices_.empty()) {
-                // Move a worker from `staging_indices_` to
-                // `interleave_indices_`.
-                interleave_indices_[index] = staging_indices_.front();
-                staging_indices_.pop_front();
-
-                next_index_ = (index + 1) % interleave_indices_.size();
-                block_count_ = 0;
-                // Restart the inner [for] loop
-                can_produce_elements = true;
-                must_wait_for_input = false;
-                break;
-              }
-            }
-          }
-
-          if (!can_produce_elements && !input_impl_) {
-            // No potential for future values.
-            *end_of_sequence = true;
-            return Status::OK();
-          }
-
-          if (must_wait_for_input) {
-            // Wait for elements to become available.
-            RecordStop(ctx);
-            if (dataset()->sloppy_) {
-              sloppy_cond_var_.wait(l);
-            } else {
-              workers_[interleave_indices_[next_index_]].cond_var.wait(l);
-            }
-            RecordStart(ctx);
-          }
-        }
-        return errors::Cancelled(
-            "ParallelInterleaveDatasetOp::Dataset::Iterator::GetNext");
-      }
-
-     protected:
-      std::shared_ptr<model::Node> CreateNode(
-          IteratorContext* ctx, model::Node::Args args) const override {
-        return model::MakeAsyncInterleaveManyNode(std::move(args),
-                                                  /*parameters=*/{});
-      }
-
-      Status SaveInternal(IteratorStateWriter* writer) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("next_index"), next_index_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("block_count"), block_count_));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name("workers_size"), workers_.size()));
-        for (int i = 0; i < workers_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerStateLocked(writer, i));
-        }
-        for (int i = 0; i < worker_thread_states_.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteWorkerThreadStateLocked(writer, i));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("interleave_size"),
-                                               interleave_indices_.size()));
-        for (int i = 0; i < interleave_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("interleave_indices_", i)),
-              interleave_indices_[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(full_name("staging_size"),
-                                               staging_indices_.size()));
-        for (int i = 0; i < staging_indices_.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat("staging_indices_", i)),
-              staging_indices_[i]));
-        }
-        if (!worker_threads_.empty()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name("worker_threads_running"), ""));
-        }
-        return Status::OK();
-      }
-
-      Status RestoreInternal(IteratorContext* ctx,
-                             IteratorStateReader* reader) override {
-        // The order of locking is important here to avoid deadlock.
-        mutex_lock l(mu_);
-        mutex_lock ckpt_l(ckpt_mu_);
-        if (!reader->Contains(full_name("input_exhausted"))) {
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, input_impl_));
-        } else {
-          input_impl_.reset();
-        }
-        int64 temp;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("next_index"), &temp));
-        next_index_ = size_t(temp);
-        TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("block_count"), &temp));
-        block_count_ = size_t(temp);
-
-        // Restore WorkerStates.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("workers_size"), &temp));
-        if (temp != dataset()->num_threads()) {
-          return errors::Internal("Expected ", dataset()->num_threads(),
-                                  " worker states but found ", temp, ".");
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerStateLocked(reader, i, ctx));
-        }
-        for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-          TF_RETURN_IF_ERROR(ReadWorkerThreadStateLocked(reader, i, ctx));
-        }
-
-        // Restore `interleave_indices_`.
-        std::set<int64> all_indices;
-        {
-          int64 interleave_size;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(full_name("interleave_size"),
-                                                &interleave_size));
-          interleave_indices_.reserve(interleave_size);
-          for (int64 i = 0; i < interleave_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("interleave_indices_", i)), &temp));
-            if (temp >= 0 && all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            interleave_indices_.emplace_back(temp);
-          }
-        }
-
-        // Restore `staging_indices_`.
-        {
-          int64 staging_size;
-          TF_RETURN_IF_ERROR(
-              reader->ReadScalar(full_name("staging_size"), &staging_size));
-          for (int i = 0; i < staging_size; ++i) {
-            int64 temp;
-            TF_RETURN_IF_ERROR(reader->ReadScalar(
-                full_name(strings::StrCat("staging_indices_", i)), &temp));
-            if (all_indices.find(temp) != all_indices.end()) {
-              return errors::Internal(
-                  "Duplicate entry for ", temp,
-                  " found when reading interleave and staging indices.");
-            }
-            if (temp >= 0) {
-              all_indices.insert(temp);
-            }
-            staging_indices_.emplace_back(temp);
-          }
-        }
-
-        // Start Worker threads.
-        if (reader->Contains(full_name("worker_threads_running"))) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (size_t i = 0; i < dataset()->num_threads(); ++i) {
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-          }
-        }
-        return Status::OK();
-      }
-
-     private:
-      // OutputElem contains the information from a call to GetNext by an output
-      // iterator.
-      struct OutputElem {
-        // The output iterator sets `status` if getting the output element
-        // fails.
-        Status status;
-        // The buffered data element.
-        std::vector<Tensor> output;
-
-        explicit OutputElem(const Status& s) : status(s) {}
-      };
-
-      // Worker threads operate on their relevant WorkerState structs.
-      //
-      // WorkerState's fields are all protected by mu_;
-      struct WorkerState {
-        // The arguments to be used to construct an output iterator.
-        std::vector<Tensor> input;
-        // The buffered output elements.
-        std::deque<OutputElem> outputs;
-        // Set to true iff the worker thread expects to append more elements to
-        // outputs. is_producing can be false despite !outputs.empty().
-        // Concretely, all output elements will have been consumed only when:
-        // is_producing == false && outputs.empty();
-        bool is_producing = false;
-        // Condition variable used to coordinate between threads. The worker
-        // thread waits on this condition variable when it is either (1) waiting
-        // for the main thread to add arguments to `input`, or (2) waiting for
-        // the main thread to consume an element of `outputs`. The main thread
-        // waits on cond_var if it is waiting for the worker thread to produce
-        // an element into `outputs` (this implies sloppy_==false).
-        condition_variable cond_var;
-
-        inline bool MayHaveElements() const {
-          return is_producing || !outputs.empty();
-        }
-
-        // Sets inputs for a worker thread and notifies it to start processing.
-        void SetInputs(const Status& s, std::vector<Tensor> input_arguments) {
-          if (s.ok()) {
-            DCHECK(!MayHaveElements())
-                << "Tried to start inputs, despite already producing!";
-            input = std::move(input_arguments);
-            is_producing = true;
-            cond_var.notify_one();
-          } else {
-            outputs.emplace_back(s);
-          }
-        }
-      };
-
-      // The internal state of a worker thread that is not already captured
-      // in its `WorkerState`.
-      //
-      // This is needed only for checkpointing purposes. We keep this
-      // separate from `WorkerState` and guard its fields using a separate
-      // lock `ckpt_mu_` so as to not affect the performance of main pipeline.
-      struct WorkerThreadState {
-        // The output element that has been produced from the input iterator
-        // and is waiting to be added to `WorkerState.outputs`.
-        OutputElem output_elem;
-
-        // Whether the input iterator returned an `end_of_sequence`.
-        bool end_of_sequence = false;
-
-        // Status returned from `MakeIteratorFromInputElement`.
-        Status iterator_creation_status;
-
-        // The arguments to be used to construct `iterator`.
-        std::vector<Tensor> input;
-
-        std::unique_ptr<IteratorBase> iterator;
-
-        WorkerThreadState() : output_elem(Status::OK()) {}
-      };
-
-      Status EnsureWorkerThreadsStarted(IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        if (worker_threads_.empty()) {
-          worker_threads_.reserve(dataset()->num_threads());
-          for (int64 i = 0; i < dataset()->num_threads(); ++i) {
-            std::vector<Tensor> args;
-            bool end_of_input = false;
-            Status s = input_impl_->GetNext(ctx, &args, &end_of_input);
-            if (end_of_input) {
-              input_impl_.reset();
-              return Status::OK();
-            }
-            workers_[i].SetInputs(s, std::move(args));
-            std::shared_ptr<IteratorContext> new_ctx(new IteratorContext(*ctx));
-            worker_threads_.emplace_back(ctx->env()->StartThread(
-                {}, strings::StrCat("tf_data_parallel_interleave_worker_", i),
-                [this, new_ctx, i]() { WorkerThread(new_ctx, i); }));
-            if (i < dataset()->cycle_length_) {
-              interleave_indices_.push_back(i);
-            } else {
-              staging_indices_.push_back(i);
-            }
-          }
-          DCHECK(interleave_indices_.size() == dataset()->cycle_length_);
-          DCHECK(staging_indices_.size() ==
-                 dataset()->prefetch_input_elements_);
-        }
-        return Status::OK();
-      }
-
-      // Produces elements into the worker's output buffers.
-      void WorkerThread(const std::shared_ptr<IteratorContext>& ctx,
-                        const int64 thread_index) {
-        // Notes on checkpointing thread local state, i.e., `WorkerThreadState`:
-        //
-        // 1. Any local state that may need to be checkpointed should be kept
-        //    in `worker_thread_states_[thread_index]`.
-        // 2. `WorkerThreadState` should contain state that is needed only for
-        //    checkpointing, i.e., if we were to remove checkpointing support,
-        //    we could keep that state as local variables in this thread.
-        // 3. This thread should only read/write state at `thread_index`
-        //    and should not access other thread states.
-        // 4. When restoring from checkpoint, threads are started only after
-        //    the restore is complete.
-        // 5. Once restored from a checkpoint, the local state is edited only
-        //    by this thread. 3 & 4 allow making assumptions like temporarily
-        //    caching local state in this thread and using it outside a lock
-        //    e.g. `make_new_iterator`.
-        // 6. `ckpt_mu_` should be wisely used to create *consistent*
-        //    checkpoint markers.
-
-        // std::function arguments are copy-constructable, so we pass raw
-        // pointers, and then immediately wrap them to ensure correct ownership.
-        RecordStart(ctx.get());
-        auto cleanup = gtl::MakeCleanup([this, thread_index, ctx] {
-          mutex_lock l(mu_);
-          workers_[thread_index].cond_var.notify_all();
-          RecordStop(ctx.get());
-        });
-        bool make_new_iterator;
-        {
-          tf_shared_lock l(ckpt_mu_);
-          // Decide whether a new iterator should be built.
-          // 1. If there is an existing iterator, we use it.
-          // 2. If there was an error in iterator creation that could not be
-          //    notified to the client we attempt to send that to the client
-          //    first.
-          make_new_iterator =
-              worker_thread_states_[thread_index].iterator == nullptr &&
-              worker_thread_states_[thread_index].iterator_creation_status.ok();
-        }
-        // Even though `make_new_iterator` has cached values from
-        // `worker_thread_states_[thread_index]` which is guarded by ckpt_mu_,
-        // it is safe to *read* `make_new_iterator`outside of a lock without
-        // worrying about concurrent changes to values in
-        // `worker_thread_states_[thread_index]`. See comment at the start of
-        // this function for details.
-        while (true) {
-          // Whether creation of the iterator succeeded.
-          Status iterator_creation_status;
-          // 1. Build a new iterator or use the existing one.
-          if (make_new_iterator) {
-            // 1a. Get new input tensors or use the exiting ones.
-            bool read_new_input;
-            {
-              tf_shared_lock l(ckpt_mu_);
-              // worker_thread_states_[thread_index].input will be non-empty
-              // if checkpointing happened at CHECKPOINT_MARKER_A.
-              read_new_input =
-                  worker_thread_states_[thread_index].input.empty();
-            }
-
-            if (read_new_input) {
-              mutex_lock l(mu_);
-              while (!cancelled_ && !workers_[thread_index].is_producing) {
-                RecordStop(ctx.get());
-                workers_[thread_index].cond_var.wait(l);
-                RecordStart(ctx.get());
-              }
-              if (cancelled_) return;
-              // Copy the input tensors so that we do not need to block on `mu_`
-              // when building the iterator.
-              // We keep a copy of the input tensors in
-              // `WorkerThreadState.input` till the iterator is in use. This is
-              // used in `RestoreInternal` to re-build the iterator.
-              // TODO(b/78046638): Explore ways to avoid tracking the input
-              // tensors.
-              tf_shared_lock ckpt_l(ckpt_mu_);
-              worker_thread_states_[thread_index].input.swap(
-                  workers_[thread_index].input);
-              // CHECKPOINT_MARKER_A
-              // We have the input tensors but have not built the iterator yet.
-            }
-
-            // 1b. Run the user defined function to produce a new iterator.
-            {
-              tf_shared_lock l(ckpt_mu_);
-              worker_thread_states_[thread_index].iterator_creation_status =
-                  MakeIteratorFromInputElement(
-                      ctx.get(), worker_thread_states_[thread_index].input,
-                      thread_index, dataset()->captured_func_.get(), prefix(),
-                      &worker_thread_states_[thread_index].iterator);
-              iterator_creation_status =
-                  worker_thread_states_[thread_index].iterator_creation_status;
-              if (!iterator_creation_status.ok()) {
-                worker_thread_states_[thread_index].input.clear();
-              }
-              // CHECKPOINT_MARKER_B
-              // Either an iterator has been successfully built and placed in
-              // `worker_thread_states_[thread_index].iterator` or it failed and
-              // a non-OK status has been put in
-              // `worker_thread_states_[thread_index].iterator_creation_status`.
-            }
-          } else {
-            tf_shared_lock l(ckpt_mu_);
-            iterator_creation_status =
-                worker_thread_states_[thread_index].iterator_creation_status;
-            // Mark that we have used up the restored iterator.
-            make_new_iterator = true;
-          }
-          // 2. Start producing elements or send error state to client if
-          //    iterator creation failed.
-          if (!iterator_creation_status.ok()) {
-            mutex_lock l(mu_);
-            // Wait for space in the prefetch queue.
-            while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                      dataset()->buffer_output_elements_) {
-              RecordStop(ctx.get());
-              workers_[thread_index].cond_var.wait(l);
-              RecordStart(ctx.get());
-            }
-            if (cancelled_) return;
-            tf_shared_lock ckpt_l(ckpt_mu_);
-            workers_[thread_index].outputs.emplace_back(
-                iterator_creation_status);
-            workers_[thread_index].is_producing = false;
-            worker_thread_states_[thread_index].iterator_creation_status =
-                Status::OK();
-            // CHECKPOINT_MARKER_C
-            // Non-OK iterator creation status has been notified to the
-            // client.
-            workers_[thread_index].cond_var.notify_one();
-          } else {
-            bool end_of_sequence = false;
-            while (!end_of_sequence) {
-              // 3.a Produce an element!
-              {
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                if (worker_thread_states_[thread_index]
-                        .output_elem.status.ok() &&
-                    worker_thread_states_[thread_index]
-                        .output_elem.output.empty() &&
-                    !worker_thread_states_[thread_index].end_of_sequence) {
-                  worker_thread_states_[thread_index].output_elem.status =
-                      worker_thread_states_[thread_index].iterator->GetNext(
-                          ctx.get(),
-                          &worker_thread_states_[thread_index]
-                               .output_elem.output,
-                          &worker_thread_states_[thread_index].end_of_sequence);
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                } else {
-                  end_of_sequence =
-                      worker_thread_states_[thread_index].end_of_sequence;
-                }
-                // CHECKPOINT_MARKER_D
-                // An element has been read or an error or end_of_sequence has
-                // been received from the input iterator and is waiting to be
-                // sent to client.
-              }
-
-              // 3.b Make it available to the client.
-              {
-                mutex_lock l(mu_);
-
-                // Wait for space in the prefetch queue.
-                while (!cancelled_ && workers_[thread_index].outputs.size() ==
-                                          dataset()->buffer_output_elements_) {
-                  RecordStop(ctx.get());
-                  workers_[thread_index].cond_var.wait(l);
-                  RecordStart(ctx.get());
-                }
-                if (cancelled_) return;
-
-                tf_shared_lock ckpt_l(ckpt_mu_);
-                workers_[thread_index].is_producing = !end_of_sequence;
-
-                // Output the element.
-
-                // Move the temporary state in WorkerThreadState to WorkerState
-                // and mark it as used.
-                if (end_of_sequence) {
-                  worker_thread_states_[thread_index].iterator.reset();
-                  worker_thread_states_[thread_index].input.clear();
-                  worker_thread_states_[thread_index].end_of_sequence = false;
-                } else {
-                  workers_[thread_index].outputs.emplace_back(
-                      worker_thread_states_[thread_index].output_elem.status);
-                  workers_[thread_index].outputs.back().output.swap(
-                      worker_thread_states_[thread_index].output_elem.output);
-                }
-                worker_thread_states_[thread_index].output_elem.status =
-                    Status::OK();
-                if (dataset()->sloppy_) {
-                  sloppy_cond_var_.notify_one();
-                } else {
-                  workers_[thread_index].cond_var.notify_one();
-                }
-                // CHECKPOINT_MARKER_E
-                // Output element or iterator status has been sent to the
-                // client.
-              }
-            }
-          }
-        }
-      }
-
-      Status WriteWorkerStateLocked(IteratorStateWriter* writer, int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_", index);
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            workers_[index].input.size()));
-        for (int i = 0; i < workers_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              workers_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_outputs_size")),
-            workers_[index].outputs.size()));
-        for (int i = 0; i < workers_[index].outputs.size(); ++i) {
-          TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-              writer, workers_[index].outputs[i],
-              full_name(strings::StrCat(prefix, "_outputs_", i))));
-        }
-        if (workers_[index].is_producing) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_is_producing")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerStateLocked(IteratorStateReader* reader, int index,
-                                   IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        workers_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          workers_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &workers_[index].input.back()));
-        }
-        int64 outputs_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_outputs_size")),
-            &outputs_size));
-        for (int i = 0; i < outputs_size; ++i) {
-          workers_[index].outputs.emplace_back(Status::OK());
-          TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-              reader, &workers_[index].outputs.back(),
-              full_name(strings::StrCat(worker_prefix, "_outputs_", i))));
-        }
-        if (reader->Contains(
-                full_name(strings::StrCat(worker_prefix, "_is_producing")))) {
-          workers_[index].is_producing = true;
-        } else {
-          workers_[index].is_producing = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteWorkerThreadStateLocked(IteratorStateWriter* writer,
-                                          int index)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string prefix = strings::StrCat("worker_thread_", index);
-        if (worker_thread_states_[index].iterator != nullptr) {
-          TF_RETURN_IF_ERROR(
-              SaveInput(writer, worker_thread_states_[index].iterator));
-        } else {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_iterator_exhausted")), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(
-            full_name(strings::StrCat(prefix, "_input_size")),
-            worker_thread_states_[index].input.size()));
-        for (int i = 0; i < worker_thread_states_[index].input.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              full_name(strings::StrCat(prefix, "_input_", i)),
-              worker_thread_states_[index].input[i]));
-        }
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_iterator_creation_status"),
-            worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(WriteOutputElemLocked(
-            writer, worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(prefix, "_output"))));
-        if (worker_thread_states_[index].end_of_sequence) {
-          TF_RETURN_IF_ERROR(writer->WriteScalar(
-              full_name(strings::StrCat(prefix, "_end_of_sequence")), ""));
-        }
-        return Status::OK();
-      }
-
-      Status ReadWorkerThreadStateLocked(IteratorStateReader* reader, int index,
-                                         IteratorContext* ctx)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        string worker_prefix = strings::StrCat("worker_thread_", index);
-        // Restore inputs.
-        int64 input_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(worker_prefix, "_input_size")),
-            &input_size));
-        worker_thread_states_[index].input.reserve(input_size);
-        for (int i = 0; i < input_size; ++i) {
-          worker_thread_states_[index].input.emplace_back();
-          TF_RETURN_IF_ERROR(reader->ReadTensor(
-              full_name(strings::StrCat(worker_prefix, "_input_", i)),
-              &worker_thread_states_[index].input.back()));
-        }
-        // Restore iterator.
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_iterator_exhausted")))) {
-          worker_thread_states_[index].iterator.reset();
-        } else {
-          std::unique_ptr<IteratorBase> iterator;
-          Status s = MakeIteratorFromInputElement(
-              ctx, worker_thread_states_[index].input, index,
-              dataset()->captured_func_.get(), prefix(), &iterator);
-          TF_RETURN_IF_ERROR(RestoreInput(ctx, reader, iterator));
-          worker_thread_states_[index].iterator.swap(iterator);
-        }
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(worker_prefix, "_iterator_creation_status"),
-            &worker_thread_states_[index].iterator_creation_status));
-        TF_RETURN_IF_ERROR(ReadOutputElemLocked(
-            reader, &worker_thread_states_[index].output_elem,
-            full_name(strings::StrCat(worker_prefix, "_output"))));
-        if (reader->Contains(full_name(
-                strings::StrCat(worker_prefix, "_end_of_sequence")))) {
-          worker_thread_states_[index].end_of_sequence = true;
-        } else {
-          worker_thread_states_[index].end_of_sequence = false;
-        }
-        return Status::OK();
-      }
-
-      Status WriteOutputElemLocked(IteratorStateWriter* writer,
-                                   const OutputElem& output_elem,
-                                   const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(WriteStatusLocked(
-            writer, strings::StrCat(prefix, "_status"), output_elem.status));
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(strings::StrCat(prefix, "_output_size"),
-                                output_elem.output.size()));
-        for (int i = 0; i < output_elem.output.size(); ++i) {
-          TF_RETURN_IF_ERROR(writer->WriteTensor(
-              strings::StrCat(prefix, "_output_", i), output_elem.output[i]));
-        }
-        return Status::OK();
-      }
-
-      Status ReadOutputElemLocked(IteratorStateReader* reader,
-                                  OutputElem* output_elem, const string& prefix)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(ReadStatusLocked(
-            reader, strings::StrCat(prefix, "_status"), &output_elem->status));
-        int64 output_size;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            strings::StrCat(prefix, "_output_size"), &output_size));
-        output_elem->output.reserve(output_size);
-        for (int i = 0; i < output_size; ++i) {
-          output_elem->output.emplace_back();
-          TF_RETURN_IF_ERROR(
-              reader->ReadTensor(strings::StrCat(prefix, "_output_", i),
-                                 &output_elem->output.back()));
-        }
-        return Status::OK();
-      }
-
-      Status WriteStatusLocked(IteratorStateWriter* writer,
-                               const string& prefix, const Status& status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        TF_RETURN_IF_ERROR(
-            writer->WriteScalar(full_name(strings::StrCat(prefix, "_code")),
-                                static_cast<int64>(status.code())));
-        if (!status.ok()) {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(full_name(strings::StrCat(prefix, "_msg")),
-                                  status.error_message()));
-        }
-        return Status::OK();
-      }
-
-      Status ReadStatusLocked(IteratorStateReader* reader, const string& prefix,
-                              Status* status)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_, ckpt_mu_) {
-        int64 code_int;
-        TF_RETURN_IF_ERROR(reader->ReadScalar(
-            full_name(strings::StrCat(prefix, "_code")), &code_int));
-        error::Code code = static_cast<error::Code>(code_int);
-
-        if (code != error::Code::OK) {
-          string error_message;
-          TF_RETURN_IF_ERROR(reader->ReadScalar(
-              full_name(strings::StrCat(prefix, "_msg")), &error_message));
-          *status = Status(code, error_message);
-        } else {
-          *status = Status::OK();
-        }
-        return Status::OK();
-      }
-
-      // Mutex & condition variable to guard mutable iterator internals and
-      // coordinate among worker threads and client thread[s].
-      mutex mu_ ACQUIRED_BEFORE(ckpt_mu_);
-      // The main thread waits on this condition variable if running in sloppy
-      // mode and no values are available.
-      condition_variable sloppy_cond_var_;
-      // Mutex used to wait for a consistent state while checkpointing.
-      // Only Save and Restore require an exclusive lock on this mutex. In
-      // other scenarios we just acquire a shared lock so the pipeline's
-      // performance should not be affected in the absence of checkpointing.
-      // A thread must not wait on any condition variable while holding
-      // `ckpt_mu_` in either shared or exclusive modes.
-      mutex ckpt_mu_;
-
-      // The iterator producing elements which are converted to datasets by
-      // the dataset()->captured_func_ then interleaved together.
-      // input_impl_ is reset when we have exhausted its input.
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-
-      // The WorkerState structs the worker threads operate on.
-      // workers_ elements are in at most one of interleave_ and staging_.
-      std::vector<WorkerState> workers_ GUARDED_BY(mu_);
-
-      // Stores the temporary state of WorkerThreads which is not stored in
-      // WorkerState. This is used for checkpointing purposes only.
-      std::vector<WorkerThreadState> worker_thread_states_ GUARDED_BY(ckpt_mu_);
-
-      // Indices in `workers_` of iterators to interleave.
-      std::vector<int64> interleave_indices_ GUARDED_BY(mu_);
-      // Indices in `workers_` of prefetched iterators.
-      std::deque<int64> staging_indices_ GUARDED_BY(mu_);
-
-      // The index into output_elements_ for next element to produce.
-      size_t next_index_ GUARDED_BY(mu_) = 0;
-      // The number of items produced so far within the block
-      size_t block_count_ GUARDED_BY(mu_) = 0;
-      // Flag to instruct the worker threads to exit.
-      bool cancelled_ GUARDED_BY(mu_) = false;
-      // The worker threads. This must be last to ensure the
-      // threads have exited before any other members are deallocated.
-      // TODO(b/65178177): Avoid allocating additional threads.
-      std::vector<std::unique_ptr<Thread>> worker_threads_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* const input_;
-    const NameAttrList interleave_func_;
-    const std::unique_ptr<CapturedFunction> captured_func_;
-    const int64 cycle_length_;
-    const int64 block_length_;
-    const bool sloppy_;
-    const int64 buffer_output_elements_;
-    const int64 prefetch_input_elements_;
-    const DataTypeVector output_types_;
-    const std::vector<PartialTensorShape> output_shapes_;
-  };
-
-  DataTypeVector output_types_;
-  std::vector<PartialTensorShape> output_shapes_;
-  NameAttrList interleave_func_;
-};
-
-REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetOp);
-
 // The motivation for creating an alternative implementation of parallel
 // interleave is to decouple the degree of parallelism from the cycle length.
 // This makes it possible to change the degree of parallelism (e.g. through
@@ -1091,9 +49,9 @@ REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDataset").Device(DEVICE_CPU),
 // The above design choices were made with automated optimizations in mind,
 // isolating the degree of parallelism as the single tunable knob of this
 // implementation.
-class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
+class ParallelInterleaveDatasetOp : public UnaryDatasetOpKernel {
  public:
-  explicit ParallelInterleaveDatasetV2Op(OpKernelConstruction* ctx)
+  explicit ParallelInterleaveDatasetOp(OpKernelConstruction* ctx)
       : UnaryDatasetOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &interleave_func_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_types", &output_types_));
@@ -1241,7 +199,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
             element_in_use_(params.dataset->cycle_length_, false),
             thread_pool_(new thread::ThreadPool(
                 Env::Default(), ThreadOptions(),
-                "tf_data_parallel_interleave_worker_pool",
+                "data_parallel_interleave_worker_pool",
                 dataset()->cycle_length_ /* num_threads */,
                 false /* low_latency_hint */)) {
         std::vector<string> components =
@@ -1268,7 +226,8 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
         }
         TF_RETURN_IF_ERROR(
             dataset()->input_->MakeIterator(ctx, prefix(), &input_impl_));
-        return dataset()->captured_func_->Instantiate(ctx);
+        return dataset()->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
       }
 
       Status GetNextInternal(IteratorContext* ctx,
@@ -1297,6 +256,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 
         if (result->status.ok()) {
           *out_tensors = std::move(result->return_values);
+          RecordBufferDequeue(ctx, *out_tensors);
         }
         *end_of_sequence = false;
         return result->status;
@@ -1435,6 +395,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
           if (end_of_input) {
             result->skip = true;
           }
+          RecordBufferEnqueue(ctx.get(), result->return_values);
           {
             mutex_lock l(*mu_);
             result->notification.Notify();
@@ -1511,7 +472,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
               if (!end_of_input_) {
                 Status status = MakeIteratorFromInputElement(
                     ctx.get(), args_list_[cycle_index_], cycle_index_,
-                    dataset()->captured_func_.get(), prefix(),
+                    *instantiated_captured_func_, prefix(),
                     &current_elements_[cycle_index_]);
                 if (!status.ok()) {
                   invocation_results_.emplace_back(new InvocationResult());
@@ -1658,7 +619,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
                   &args_list_[idx][i]));
             }
             TF_RETURN_IF_ERROR(MakeIteratorFromInputElement(
-                ctx, args_list_[idx], idx, dataset()->captured_func_.get(),
+                ctx, args_list_[idx], idx, *instantiated_captured_func_.get(),
                 prefix(), &current_elements_[idx]));
             TF_RETURN_IF_ERROR(
                 RestoreInput(ctx, reader, current_elements_[idx]));
@@ -1722,6 +683,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
       // Identifies whether background activity should be cancelled.
       bool cancelled_ GUARDED_BY(*mu_) = false;
       string prefix_end_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
     };
 
     const DatasetBase* const input_;
@@ -1742,7 +704,7 @@ class ParallelInterleaveDatasetV2Op : public UnaryDatasetOpKernel {
 };
 
 REGISTER_KERNEL_BUILDER(Name("ParallelInterleaveDatasetV2").Device(DEVICE_CPU),
-                        ParallelInterleaveDatasetV2Op);
+                        ParallelInterleaveDatasetOp);
 
 }  // namespace
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
index 6e4005ff6d34c9e21d4fb1a18fde2c72dd12cc7f..5ac81c187c4f3338785d49b47c232be1f8d1e185 100644
--- a/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
+++ b/tensorflow/core/kernels/data/parallel_map_dataset_op.cc
@@ -41,6 +41,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("use_inter_op_parallelism",
                                      &use_inter_op_parallelism_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("sloppy", &sloppy_));
+    OP_REQUIRES_OK(
+        ctx, ctx->GetAttr("preserve_cardinality", &preserve_cardinality_));
   }
 
  protected:
@@ -61,52 +63,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     std::vector<int> indices;
     OP_REQUIRES_OK(ctx, ComputeShortCircuitIndices(ctx, func_, &indices));
 
-    ParallelMapIteratorFunction map_func;
-    CapturedFunction* raw_captured_func = captured_func.get();
-    if (indices.empty()) {
-      map_func = [raw_captured_func](IteratorContext* ctx, const string& prefix,
-                                     std::vector<Tensor> args,
-                                     std::vector<Tensor>* out_tensors,
-                                     StatusCallback done) {
-        raw_captured_func->RunAsync(ctx, std::move(args), out_tensors,
-                                    std::move(done), prefix);
-      };
-      if (!use_inter_op_parallelism_) {
-        map_func = [map_func](IteratorContext* ctx, const string& prefix,
-                              std::vector<Tensor> args,
-                              std::vector<Tensor>* out_tensors,
-                              StatusCallback done) {
-          (*ctx->runner())(std::bind(map_func, ctx, prefix, std::move(args),
-                                     out_tensors, std::move(done)));
-        };
-      }
-    } else {
-      std::vector<bool> can_move = ComputeMoveVector(indices);
-      map_func = [raw_captured_func, indices, can_move](
-                     IteratorContext* ctx, const string& prefix,
-                     std::vector<Tensor> args, std::vector<Tensor>* out_tensors,
-                     StatusCallback done) {
-        const std::vector<Tensor>& captured_inputs =
-            raw_captured_func->captured_inputs();
-        size_t num_args = args.size();
-        for (size_t i = 0; i < indices.size(); ++i) {
-          if (indices[i] < num_args) {
-            if (can_move[i]) {
-              out_tensors->push_back(std::move(args[indices[i]]));
-            } else {
-              out_tensors->push_back(args[indices[i]]);
-            }
-          } else {
-            out_tensors->push_back(captured_inputs[indices[i] - num_args]);
-          }
-        }
-        done(Status::OK());
-      };
-    }
-
-    *output = new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
-                          output_shapes_, use_inter_op_parallelism_, sloppy_,
-                          std::move(captured_func), std::move(map_func));
+    *output =
+        new Dataset(ctx, input, func_, num_parallel_calls, output_types_,
+                    output_shapes_, use_inter_op_parallelism_, sloppy_,
+                    std::move(captured_func), indices, preserve_cardinality_);
   }
 
  private:
@@ -118,7 +78,7 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
             const std::vector<PartialTensorShape>& output_shapes,
             bool use_inter_op_parallelism, bool sloppy,
             std::unique_ptr<CapturedFunction> captured_func,
-            ParallelMapIteratorFunction map_func)
+            const std::vector<int> indices, bool preserve_cardinality)
         : DatasetBase(DatasetContext(ctx)),
           input_(input),
           func_(func),
@@ -127,8 +87,11 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
           output_shapes_(output_shapes),
           use_inter_op_parallelism_(use_inter_op_parallelism),
           sloppy_(sloppy),
+          preserve_cardinality_(preserve_cardinality),
           captured_func_(std::move(captured_func)),
-          map_func_(std::move(map_func)) {
+          indices_(indices),
+          can_move_(indices.empty() ? std::vector<bool>()
+                                    : ComputeMoveVector(indices)) {
       input_->Ref();
     }
 
@@ -136,13 +99,16 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      auto init_func = [this](IteratorContext* ctx) {
-        return captured_func_->Instantiate(ctx);
-      };
-
+      std::unique_ptr<ParallelMapFunctor> parallel_map_functor(nullptr);
+      if (indices_.empty()) {
+        parallel_map_functor.reset(new ParallelMapDatasetFunctor(this));
+      } else {
+        parallel_map_functor.reset(new ShortCircuitFunctor(this));
+      }
       return NewParallelMapIterator(
           {this, strings::StrCat(prefix, "::ParallelMap")}, input_,
-          std::move(init_func), map_func_, num_parallel_calls_, sloppy_);
+          std::move(parallel_map_functor), num_parallel_calls_, sloppy_,
+          preserve_cardinality_);
     }
 
     const DataTypeVector& output_dtypes() const override {
@@ -157,6 +123,8 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       return "ParallelMapDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -200,6 +168,10 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
       AttrValue sloppy_attr;
       b->BuildAttrValue(sloppy_, &sloppy_attr);
 
+      // Attr: preserve_cardinality
+      AttrValue preserve_cardinality_attr;
+      b->BuildAttrValue(preserve_cardinality_, &preserve_cardinality_attr);
+
       TF_RETURN_IF_ERROR(b->AddDataset(
           this,
           {std::make_pair(0, input_graph_node),
@@ -209,12 +181,79 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
            std::make_pair("Targuments", other_arguments_types_attr),
            std::make_pair("use_inter_op_parallelism",
                           use_inter_op_parallelism_attr),
-           std::make_pair("sloppy", sloppy_attr)},  // Attrs
+           std::make_pair("sloppy", sloppy_attr),
+           std::make_pair("preserve_cardinality",
+                          preserve_cardinality_attr)},  // Attrs
           output));
       return Status::OK();
     }
 
    private:
+    class ShortCircuitFunctor : public ParallelMapFunctor {
+     public:
+      explicit ShortCircuitFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input_element,
+                   std::vector<Tensor>* result, StatusCallback done) override {
+        const std::vector<Tensor>& captured_inputs =
+            dataset_->captured_func_->captured_inputs();
+        size_t num_args = input_element.size();
+        for (size_t i = 0; i < dataset_->indices_.size(); ++i) {
+          if (dataset_->indices_[i] < num_args) {
+            if (dataset_->can_move_[i]) {
+              result->push_back(
+                  std::move(input_element[dataset_->indices_[i]]));
+            } else {
+              result->push_back(input_element[dataset_->indices_[i]]);
+            }
+          } else {
+            result->push_back(
+                captured_inputs[dataset_->indices_[i] - num_args]);
+          }
+        }
+        done(Status::OK());
+      }
+
+      const Dataset* const dataset_;
+    };
+
+    class ParallelMapDatasetFunctor : public ParallelMapFunctor {
+     public:
+      explicit ParallelMapDatasetFunctor(const Dataset* dataset)
+          : dataset_(dataset) {}
+
+      Status InitFunc(IteratorContext* ctx) override {
+        return dataset_->captured_func_->Instantiate(
+            ctx, &instantiated_captured_func_);
+      }
+
+      void MapFunc(IteratorContext* ctx, const string& prefix,
+                   std::vector<Tensor> input_element,
+                   std::vector<Tensor>* result, StatusCallback done) override {
+        auto map_func = [this](IteratorContext* ctx, const string& prefix,
+                               std::vector<Tensor> input_element,
+                               std::vector<Tensor>* result,
+                               StatusCallback done) {
+          instantiated_captured_func_->RunAsync(
+              ctx, std::move(input_element), result, std::move(done), prefix);
+        };
+        if (!dataset_->use_inter_op_parallelism_) {
+          (*ctx->runner())(std::bind(map_func, ctx, prefix,
+                                     std::move(input_element), result,
+                                     std::move(done)));
+        } else {
+          map_func(ctx, prefix, std::move(input_element), result,
+                   std::move(done));
+        }
+      }
+
+     private:
+      const Dataset* const dataset_;
+      std::unique_ptr<InstantiatedCapturedFunction> instantiated_captured_func_;
+    };
+
     const DatasetBase* const input_;
     const NameAttrList func_;
     const int32 num_parallel_calls_;
@@ -222,14 +261,17 @@ class ParallelMapDatasetOp : public UnaryDatasetOpKernel {
     const std::vector<PartialTensorShape> output_shapes_;
     const bool use_inter_op_parallelism_;
     const bool sloppy_;
+    const bool preserve_cardinality_;
     const std::unique_ptr<CapturedFunction> captured_func_;
-    const ParallelMapIteratorFunction map_func_;
+    const std::vector<int> indices_;
+    const std::vector<bool> can_move_;
   };
 
   DataTypeVector output_types_;
   std::vector<PartialTensorShape> output_shapes_;
   bool use_inter_op_parallelism_;
   bool sloppy_;
+  bool preserve_cardinality_;
   NameAttrList func_;
 };
 
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.cc b/tensorflow/core/kernels/data/parallel_map_iterator.cc
index ec1c92384304d06332ba82f4315bd7286bcf99da..b97f69250056fbf80c1cf866192a320861b70770 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.cc
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.cc
@@ -32,22 +32,34 @@ namespace {
 
 class ParallelMapIterator : public DatasetBaseIterator {
  public:
-  ParallelMapIterator(const typename DatasetBaseIterator::BaseParams& params,
-                      const DatasetBase* input_dataset,
-                      std::function<Status(IteratorContext*)> init_func,
-                      ParallelMapIteratorFunction map_func,
-                      int32 num_parallel_calls, bool sloppy)
-      : DatasetBaseIterator(params),
+  struct Params {
+    Params(std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+           int32 num_parallel_calls, bool sloppy, bool preserve_cardinality)
+        : parallel_map_functor(std::move(parallel_map_functor)),
+          num_parallel_calls(num_parallel_calls),
+          sloppy(sloppy),
+          preserve_cardinality(preserve_cardinality) {}
+
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor;
+    int32 num_parallel_calls;
+    bool sloppy;
+    bool preserve_cardinality;
+  };
+
+  ParallelMapIterator(
+      const typename DatasetBaseIterator::BaseParams& base_params,
+      const DatasetBase* input_dataset, Params params)
+      : DatasetBaseIterator(base_params),
         input_dataset_(input_dataset),
-        init_func_(std::move(init_func)),
-        map_func_(std::move(map_func)),
+        parallel_map_functor_(std::move(params.parallel_map_functor)),
         mu_(std::make_shared<mutex>()),
         cond_var_(std::make_shared<condition_variable>()),
         num_parallel_calls_(std::make_shared<model::SharedState>(
-            num_parallel_calls, mu_, cond_var_)),
-        sloppy_(sloppy) {
+            params.num_parallel_calls, mu_, cond_var_)),
+        sloppy_(params.sloppy),
+        preserve_cardinality_(params.preserve_cardinality) {
     std::vector<string> components =
-        str_util::Split(params.prefix, "::", str_util::SkipEmpty());
+        str_util::Split(base_params.prefix, "::", str_util::SkipEmpty());
     prefix_end_ = components.back();
   }
 
@@ -70,10 +82,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
     }
     TF_RETURN_IF_ERROR(
         input_dataset_->MakeIterator(ctx, prefix(), &input_impl_));
-    if (init_func_) {
-      TF_RETURN_IF_ERROR(init_func_(ctx));
-    }
-    return Status::OK();
+    return parallel_map_functor_->InitFunc(ctx);
   }
 
   Status GetNextInternal(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
@@ -91,7 +100,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
     RecordStop(ctx);
     result->notification.WaitForNotification();
     RecordStart(ctx);
-    return ProcessResult(result, out_tensors, end_of_sequence);
+    return ProcessResult(ctx, result, out_tensors, end_of_sequence);
   }
 
  protected:
@@ -202,6 +211,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
           strings::StrCat(prefix_end_, "::active_parallel_calls"),
           static_cast<float>(num_calls_));
     }
+    RecordBufferEnqueue(ctx.get(), result->return_values);
     result->notification.Notify();
     cond_var_->notify_all();
   }
@@ -225,23 +235,35 @@ class ParallelMapIterator : public DatasetBaseIterator {
 
     // Apply the map function on `input_element`, storing the result in
     // `result->return_values`, and invoking `done` when finished.
-    map_func_(ctx.get(), prefix(), std::move(input_element),
-              &result->return_values, std::move(done));
+    parallel_map_functor_->MapFunc(ctx.get(), prefix(),
+                                   std::move(input_element),
+                                   &result->return_values, std::move(done));
   }
 
-  Status ProcessResult(const std::shared_ptr<InvocationResult>& result,
+  Status ProcessResult(IteratorContext* ctx,
+                       const std::shared_ptr<InvocationResult>& result,
                        std::vector<Tensor>* out_tensors, bool* end_of_sequence)
       LOCKS_EXCLUDED(*mu_) {
     if (!result->end_of_input && result->status.ok()) {
       *out_tensors = std::move(result->return_values);
+      RecordBufferDequeue(ctx, *out_tensors);
       *end_of_sequence = false;
       return Status::OK();
     }
     if (errors::IsOutOfRange(result->status)) {
-      // `f` may deliberately raise `errors::OutOfRange` to indicate that we
-      // should terminate the iteration early.
-      *end_of_sequence = true;
-      return Status::OK();
+      if (preserve_cardinality_) {
+        // To guarantee that the transformation preserves the cardinality of the
+        // dataset, we convert `OutOfRange` to `InvalidArgument` as the former
+        // may be interpreted by a caller as the end of sequence.
+        return errors::InvalidArgument(
+            "Function invocation produced OutOfRangeError: ",
+            result->status.error_message());
+      } else {
+        // `f` may deliberately raise `errors::OutOfRange` to indicate
+        // that we should terminate the iteration early.
+        *end_of_sequence = true;
+        return Status::OK();
+      }
     }
     *end_of_sequence = result->end_of_input;
     return result->status;
@@ -252,7 +274,10 @@ class ParallelMapIterator : public DatasetBaseIterator {
     RecordStart(ctx.get());
     auto cleanup = gtl::MakeCleanup([this, ctx] { RecordStop(ctx.get()); });
     std::vector<std::shared_ptr<InvocationResult>> new_calls;
-    new_calls.reserve(num_parallel_calls_->value);
+    {
+      tf_shared_lock l(*mu_);  // mu_ == num_parallel_calls_->mu
+      new_calls.reserve(num_parallel_calls_->value);
+    }
     auto busy = [this]() EXCLUSIVE_LOCKS_REQUIRED(*mu_) -> bool {
       int64 num_parallel_calls = num_parallel_calls_->value;
       return num_calls_ >= num_parallel_calls ||
@@ -357,8 +382,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   }
 
   const DatasetBase* const input_dataset_;  // Not owned.
-  const std::function<Status(IteratorContext*)> init_func_;
-  const ParallelMapIteratorFunction map_func_;
+  std::unique_ptr<ParallelMapFunctor> parallel_map_functor_;
   // Used for coordination between the main thread and the runner thread.
   const std::shared_ptr<mutex> mu_;
   // Used for coordination between the main thread and the runner thread. In
@@ -371,6 +395,7 @@ class ParallelMapIterator : public DatasetBaseIterator {
   const std::shared_ptr<model::SharedState> num_parallel_calls_;
   // Determines whether outputs can be produced in non-deterministic order.
   const bool sloppy_;
+  const bool preserve_cardinality_;
   // Counts the number of outstanding calls.
   int64 num_calls_ GUARDED_BY(*mu_) = 0;
   std::unique_ptr<IteratorBase> input_impl_;
@@ -387,12 +412,13 @@ class ParallelMapIterator : public DatasetBaseIterator {
 std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
-    std::function<Status(IteratorContext*)> init_func,
-    ParallelMapIteratorFunction map_func, int32 num_parallel_calls,
-    bool sloppy) {
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality) {
   return MakeUnique<ParallelMapIterator>(
-      params, input_dataset, std::move(init_func), std::move(map_func),
-      num_parallel_calls, sloppy);
+      params, input_dataset,
+      ParallelMapIterator::Params{std::move(parallel_map_functor),
+                                  num_parallel_calls, sloppy,
+                                  preserve_cardinality});
 }
 
 }  // namespace data
diff --git a/tensorflow/core/kernels/data/parallel_map_iterator.h b/tensorflow/core/kernels/data/parallel_map_iterator.h
index d715b9a4975443ff7972f13200a4738dbdbe4ae1..de30446f2631c7e40e090a03517dcc53fdd873b9 100644
--- a/tensorflow/core/kernels/data/parallel_map_iterator.h
+++ b/tensorflow/core/kernels/data/parallel_map_iterator.h
@@ -22,28 +22,33 @@ limitations under the License.
 namespace tensorflow {
 namespace data {
 
-// A function that transforms elements of one dataset into another
-// asynchronously. The arguments are:
-// 1. An `IteratorContext*` for the context in which the function should
-// execute.
-// 2. A `std::vector<Tensor>` containing the input element.
-// 3. A `std::vector<Tensor>*` to which the function will write the result.
-// 4. A `StatusCallback` that should be invoked when the function is complete.
-using ParallelMapIteratorFunction =
-    std::function<void(IteratorContext*, const string&, std::vector<Tensor>,
-                       std::vector<Tensor>*, StatusCallback)>;
-
-// Returns a new iterator that applies `map_func` to the elements of
-// `input_dataset` using the given degree of parallelism. `init_func` (if
-// specified) will be executed when the iterator is initialized (see
-// `IteratorBase::Initialize()`) and enables the user to specify error checking
-// logic that can fail early.
+class ParallelMapFunctor {
+ public:
+  virtual ~ParallelMapFunctor() {}
+
+  // A function that runs when the Iterator is initialized. It enables the user
+  // to specify error checking logic that can fail early.
+  virtual Status InitFunc(IteratorContext* ctx) { return Status::OK(); }
+
+  // A function that transforms elements of one dataset into another
+  // asynchronously. The arguments are:
+  // 1. An `IteratorContext*` for the context in which the function should
+  // execute.
+  // 2. A `std::vector<Tensor>` containing the input element.
+  // 3. A `std::vector<Tensor>*` to which the function will write the result.
+  // 4. A `StatusCallback` that should be invoked when the function is complete.
+  virtual void MapFunc(IteratorContext* ctx, const string& prefix,
+                       std::vector<Tensor> input, std::vector<Tensor>* output,
+                       StatusCallback callback) = 0;
+};
+
+// Returns a new iterator that uses `parallel_map_functor` to apply `MapFunc`
+// to the elements of `input_dataset` using the given degree of parallelism.
 std::unique_ptr<IteratorBase> NewParallelMapIterator(
     const DatasetBaseIterator::BaseParams& params,
     const DatasetBase* input_dataset,
-    std::function<Status(IteratorContext*)> init_func,
-    ParallelMapIteratorFunction map_func, int32 num_parallel_calls,
-    bool sloppy);
+    std::unique_ptr<ParallelMapFunctor> parallel_map_functor,
+    int32 num_parallel_calls, bool sloppy, bool preserve_cardinality);
 
 }  // namespace data
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/prefetch_dataset_op.cc b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
index 960373b74f3c657c4851602e37437ec163c6fd49..08d6de4bf9a654d433e3cb6dddd6ab0cc1435136 100644
--- a/tensorflow/core/kernels/data/prefetch_dataset_op.cc
+++ b/tensorflow/core/kernels/data/prefetch_dataset_op.cc
@@ -56,6 +56,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
 
   string DebugString() const override { return "PrefetchDatasetOp::Dataset"; }
 
+  int64 Cardinality() const override { return input_->Cardinality(); }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
@@ -123,7 +125,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         }
 
         if (!buffer_.empty()) {
-          return Consume(out_tensors, end_of_sequence, ctx);
+          return Consume(ctx, out_tensors, end_of_sequence);
         }
 
         if (prefetch_thread_finished_) {
@@ -226,8 +228,8 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       std::vector<Tensor> value;
     };
 
-    Status Consume(std::vector<Tensor>* out_tensors, bool* end_of_sequence,
-                   IteratorContext* ctx) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+    Status Consume(IteratorContext* ctx, std::vector<Tensor>* out_tensors,
+                   bool* end_of_sequence) EXCLUSIVE_LOCKS_REQUIRED(mu_) {
       const auto& stats_aggregator = ctx->stats_aggregator();
       if (stats_aggregator) {
         stats_aggregator->AddToHistogram(
@@ -246,6 +248,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
       Status s = buffer_.front().status;
       if (s.ok()) {
         *out_tensors = std::move(buffer_.front().value);
+        RecordBufferDequeue(ctx, *out_tensors);
       }
       auto_tuner_.RecordConsumption(buffer_.size());
       buffer_.pop_front();
@@ -316,6 +319,7 @@ class PrefetchDatasetOp::Dataset : public DatasetBase {
         // 3. Signal that the element has been produced.
         {
           mutex_lock l(mu_);
+          RecordBufferEnqueue(ctx.get(), buffer_element.value);
           buffer_.push_back(std::move(buffer_element));
           cond_var_.notify_all();
         }
@@ -391,13 +395,14 @@ void PrefetchDatasetOp::MakeDataset(OpKernelContext* ctx, DatasetBase* input,
 }
 
 namespace {
-REGISTER_KERNEL_BUILDER(Name("PrefetchDataset").Device(DEVICE_CPU),
+REGISTER_KERNEL_BUILDER(Name("PrefetchDataset").Device(DEVICE_CPU).Priority(2),
                         PrefetchDatasetOp);
 REGISTER_KERNEL_BUILDER(Name("PrefetchDataset")
                             .Device(DEVICE_GPU)
                             .HostMemory("buffer_size")
                             .HostMemory("input_dataset")
-                            .HostMemory("handle"),
+                            .HostMemory("handle")
+                            .Priority(1),
                         PrefetchDatasetOp);
 }  // namespace
 
diff --git a/tensorflow/core/kernels/data/range_dataset_op.cc b/tensorflow/core/kernels/data/range_dataset_op.cc
index 207e957e3747e4a03a7f91cc5502f92fb6953e1b..580702f741814b6bd86cab2d537b3ad49b4f6177 100644
--- a/tensorflow/core/kernels/data/range_dataset_op.cc
+++ b/tensorflow/core/kernels/data/range_dataset_op.cc
@@ -73,6 +73,14 @@ class RangeDatasetOp : public DatasetOpKernel {
                              step_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      if (step_ > 0) {
+        return std::max(0LL, (stop_ - start_ - 1) / step_ + 1);
+      } else {
+        return std::max(0LL, (start_ - stop_ - 1) / -step_ + 1);
+      }
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/repeat_dataset_op.cc b/tensorflow/core/kernels/data/repeat_dataset_op.cc
index cee14df69d07a1e477b4f10a569a7ec268cfe2ad..8100f2695b6ee529da252b7b012a7c87ebb0a670 100644
--- a/tensorflow/core/kernels/data/repeat_dataset_op.cc
+++ b/tensorflow/core/kernels/data/repeat_dataset_op.cc
@@ -71,6 +71,23 @@ class RepeatDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "RepeatDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (count_ < 0) {
+        if (n == 0) {
+          return 0;
+        }
+        return kInfiniteCardinality;
+      }
+      if (count_ == 0) {
+        return 0;
+      }
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return count_ * n;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/shuffle_dataset_op.cc b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
index ad6960685e4284f129d96baa4eaffa7df99f3946..7134793e26da82e39f53ac21030a9e56e16e26ab 100644
--- a/tensorflow/core/kernels/data/shuffle_dataset_op.cc
+++ b/tensorflow/core/kernels/data/shuffle_dataset_op.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include "tensorflow/core/framework/dataset.h"
 #include "tensorflow/core/framework/partial_tensor_shape.h"
+#include "tensorflow/core/framework/resource_mgr.h"
 #include "tensorflow/core/framework/tensor.h"
 #include "tensorflow/core/lib/random/philox_random.h"
 #include "tensorflow/core/lib/random/random.h"
@@ -61,6 +62,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       return input_->output_shapes();
     }
 
+    int64 Cardinality() const override { return input_->Cardinality(); }
+
    protected:
     template <class T>
     class Iterator : public DatasetIterator<T> {
@@ -68,9 +71,9 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
       explicit Iterator(const typename DatasetIterator<T>::Params& params,
                         int64 seed, int64 seed2)
           : DatasetIterator<T>(params),
-            input_impl_(nullptr),
             seed_(seed),
             seed2_(seed2),
+            input_impl_(nullptr),
             epoch_(0),
             num_elements_(0),
             parent_generator_(seed, seed2),
@@ -124,6 +127,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
                 ctx, this->prefix(), &input_impl_));
           }
           if (!end_of_input_sequence) {
+            this->RecordBufferEnqueue(ctx, input_element);
             buffer_[slices_.back()->end % this->dataset()->buffer_size_] =
                 std::move(input_element);
             num_elements_++;
@@ -151,6 +155,7 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
           int64 index =
               (slices_.front()->start + offset) % this->dataset()->buffer_size_;
           *out_tensors = std::move(buffer_[index]);
+          this->RecordBufferDequeue(ctx, *out_tensors);
           std::swap(
               buffer_[index],
               buffer_[slices_.front()->start % this->dataset()->buffer_size_]);
@@ -170,6 +175,14 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
                                          /*ratio=*/1);
       }
 
+      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
+        // Reset the generators based on the current iterator seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
       Status SaveInternal(IteratorStateWriter* writer) override {
         mutex_lock l(mu_);
         // Save state needed to restore the random number generators.
@@ -277,6 +290,10 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return Status::OK();
       }
 
+      mutex mu_;
+      int64 seed_ GUARDED_BY(mu_);
+      int64 seed2_ GUARDED_BY(mu_);
+
      private:
       // Used to represent slices of `buffer_` that belong to different epochs.
       // The invariant maintained by the implementation is: `start` <= `end`.
@@ -297,19 +314,8 @@ class ShuffleDatasetOpBase : public UnaryDatasetOpKernel {
         return out;
       }
 
-      void ResetRngs() EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // Reset the generators based on the current iterator seeds.
-        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
-            &parent_generator_);
-        generator_.Skip(num_random_samples_);
-      }
-
-      mutex mu_;
       std::unique_ptr<std::vector<Tensor>[]> buffer_ GUARDED_BY(mu_);
       std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      int64 seed_ GUARDED_BY(mu_);
-      int64 seed2_ GUARDED_BY(mu_);
       int64 epoch_ GUARDED_BY(mu_);
       int64 num_elements_ GUARDED_BY(mu_);
       std::deque<std::unique_ptr<Slice>> slices_ GUARDED_BY(mu_);
@@ -366,7 +372,7 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
   }
 
  private:
-  // A dataset that uses a pseduorandom sequence of seeds for the iterators
+  // A dataset that uses a pseudorandom sequence of seeds for the iterators
   // created from it. Used when `reshuffle_each_iteration` is true.
   class ReshufflingDataset : public ShuffleDatasetBase {
    public:
@@ -374,37 +380,114 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
                        int64 buffer_size, int64 seed, int64 seed2, int64 count)
         : ShuffleDatasetBase(ctx, input, buffer_size, count),
           seed_(seed),
-          seed2_(seed2),
-          parent_generator_(seed, seed2),
-          generator_(&parent_generator_) {}
+          seed2_(seed2) {}
 
     string DebugString() const override {
-      mutex_lock l(mu_);
       return strings::StrCat("ShuffleDatasetOp(", buffer_size_, ", ", seed_,
                              ", ", seed2_, ")::ReshufflingDataset");
     }
 
     std::unique_ptr<IteratorBase> MakeIteratorInternal(
         const string& prefix) const override {
-      int64 iterator_seed;
-      int64 iterator_seed2;
-      {
-        mutex_lock l(mu_);
-        iterator_seed = Random();
-        iterator_seed2 = Random();
-      }
-      return std::unique_ptr<IteratorBase>(
-          new Iterator({this, strings::StrCat(prefix, "::Shuffle")},
-                       iterator_seed, iterator_seed2));
+      return std::unique_ptr<IteratorBase>(new Iterator(
+          {this, strings::StrCat(prefix, "::Shuffle")}, seed_, seed2_));
     }
 
    protected:
+    class RandomSeedGenerator : public ResourceBase {
+     public:
+      RandomSeedGenerator(int64 seed, int64 seed2)
+          : seed_(seed),
+            seed2_(seed2),
+            parent_generator_(seed, seed2),
+            generator_(&parent_generator_) {}
+
+      string DebugString() override {
+        return "ReshufflingDataset::RandomSeedGenerator";
+      }
+
+      void GenerateRandomSeeds(int64* seed1, int64* seed2) {
+        mutex_lock l(mu_);
+        num_random_samples_++;
+        *seed1 = generator_();
+        num_random_samples_++;
+        *seed2 = generator_();
+      }
+
+      int64 num_random_samples() {
+        tf_shared_lock l(mu_);
+        return num_random_samples_;
+      }
+
+      void set_num_random_samples(int64 num_random_samples) {
+        mutex_lock l(mu_);
+        num_random_samples_ = num_random_samples;
+      }
+
+      void Reset() {
+        mutex_lock l(mu_);
+        // Reset the generators based on the current seeds.
+        parent_generator_ = random::PhiloxRandom(seed_, seed2_);
+        generator_ = random::SingleSampleAdapter<random::PhiloxRandom>(
+            &parent_generator_);
+        generator_.Skip(num_random_samples_);
+      }
+
+     private:
+      const int64 seed_;
+      const int64 seed2_;
+      mutex mu_;
+      random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
+      random::SingleSampleAdapter<random::PhiloxRandom> generator_
+          GUARDED_BY(mu_);
+      int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    };
+
     class Iterator : public ShuffleDatasetBase::Iterator<ReshufflingDataset> {
      public:
       explicit Iterator(const Params& params, int64 seed, int64 seed2)
           : ShuffleDatasetBase::Iterator<ReshufflingDataset>(params, seed,
                                                              seed2) {}
 
+      ~Iterator() override { seed_generator_->Unref(); }
+
+      Status Initialize(IteratorContext* ctx) override {
+        // Firstly, lookup or create a seed generator from the IteratorResource
+        // resource_mgr.
+        ResourceMgr* mgr = ctx->resource_mgr();
+        RandomSeedGenerator* seed_generator;
+        const string name = strings::StrCat(prefix(), "::", dataset()->name(),
+                                            "::RandomSeedGenerator");
+
+        int64 dataset_seed, dataset_seed2;
+        {
+          tf_shared_lock l(mu_);
+          // Ideally we'd like to hold this lock in the LookupOrCreate method,
+          // but that trips up our Deadlock detection code.
+          dataset_seed = seed_;
+          dataset_seed2 = seed2_;
+        }
+        TF_RETURN_IF_ERROR(mgr->LookupOrCreate<RandomSeedGenerator>(
+            "tf_data", name, &seed_generator,
+            [dataset_seed,
+             dataset_seed2](RandomSeedGenerator** seed_generator) {
+              // On the first iterator creation, use the original seeds from the
+              // dataset to seed a `RandomSeedGenerator` that will provide seeds
+              // for subsequent repetitions of the same dataset.
+              *seed_generator =
+                  new RandomSeedGenerator(dataset_seed, dataset_seed2);
+              return Status::OK();
+            }));
+        // Now use the seed generator to update the base class Iterator seeds
+        // and random number generator with generated seeds for the current
+        // repetition.
+        mutex_lock l(mu_);
+        seed_generator->GenerateRandomSeeds(&seed_, &seed2_);
+        ResetRngs();
+        seed_generator_ = seed_generator;
+        return Status::OK();
+      }
+
      protected:
       std::shared_ptr<model::Node> CreateNode(
           IteratorContext* ctx, model::Node::Args args) const override {
@@ -413,12 +496,10 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
       }
 
       Status SaveInternal(IteratorStateWriter* writer) override {
-        mutex_lock l(dataset()->mu_);
-
         // Save RNG state of Dataset.
         TF_RETURN_IF_ERROR(
             writer->WriteScalar(full_name("ds_num_random_samples"),
-                                dataset()->num_random_samples_));
+                                seed_generator_->num_random_samples()));
 
         // Save the Iterator.
         return ShuffleDatasetBase::Iterator<ReshufflingDataset>::SaveInternal(
@@ -427,24 +508,25 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
 
       Status RestoreInternal(IteratorContext* ctx,
                              IteratorStateReader* reader) override {
-        mutex_lock l(dataset()->mu_);
-
         // Restore RNG state of Dataset.
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(full_name("ds_num_random_samples"),
-                               &dataset()->num_random_samples_));
-        dataset()->ResetRngs();
+        int64 num_random_samples;
+        TF_RETURN_IF_ERROR(reader->ReadScalar(
+            full_name("ds_num_random_samples"), &num_random_samples));
+        seed_generator_->set_num_random_samples(num_random_samples);
+        seed_generator_->Reset();
 
         // Restore the Iterator.
         return ShuffleDatasetBase::Iterator<
             ReshufflingDataset>::RestoreInternal(ctx, reader);
       }
+
+     private:
+      RandomSeedGenerator* seed_generator_;
     };
 
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
                               Node** output) const override {
-      mutex_lock l(mu_);
       Node* input_graph_node = nullptr;
       TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph_node));
       Node* buffer_size = nullptr;
@@ -465,28 +547,8 @@ class ShuffleDatasetOp : public ShuffleDatasetOpBase {
     }
 
    private:
-    random::SingleSampleAdapter<random::PhiloxRandom>::ResultType Random() const
-        EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      num_random_samples_++;
-      auto out = generator_();
-      return out;
-    }
-
-    void ResetRngs() const EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-      // Reset the generators based on the current seeds.
-      parent_generator_ = random::PhiloxRandom(seed_, seed2_);
-      generator_ =
-          random::SingleSampleAdapter<random::PhiloxRandom>(&parent_generator_);
-      generator_.Skip(num_random_samples_);
-    }
-
-    mutable int64 seed_ GUARDED_BY(mu_);
-    mutable int64 seed2_ GUARDED_BY(mu_);
-    mutable mutex mu_;
-    mutable random::PhiloxRandom parent_generator_ GUARDED_BY(mu_);
-    mutable random::SingleSampleAdapter<random::PhiloxRandom> generator_
-        GUARDED_BY(mu_);
-    mutable int64 num_random_samples_ GUARDED_BY(mu_) = 0;
+    const int64 seed_;
+    const int64 seed2_;
   };
 
   // A dataset that uses the same fixed seed for all iterators created from it.
diff --git a/tensorflow/core/kernels/data/single_threaded_executor.cc b/tensorflow/core/kernels/data/single_threaded_executor.cc
index 5b084a16f0be01ef276253a703edfdc17bdede01..89e3881037666299f093ed7423b62c9741ca5dd9 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor.cc
@@ -65,21 +65,28 @@ class SingleThreadedExecutorImpl : public Executor {
         if (IsRefType(dt)) {
           return errors::Unimplemented(
               "Single-threaded executor does not support reference-typed "
-              "edges.");
+              "edges.  But saw type ",
+              DataTypeString(dt), " in outputs of node ", n->name());
         }
       }
 
       if (n->IsControlFlow()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support control flow.");
+            "Single-threaded executor does not support control flow.  But saw "
+            "control flow node ",
+            n->name());
       }
       if (n->IsSend() || n->IsHostSend() || n->IsRecv() || n->IsHostRecv()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support partitioned graphs.");
+            "Single-threaded executor does not support partitioned graphs.  "
+            "But saw send/recv node ",
+            n->name());
       }
       if (n->IsCollective()) {
         return errors::Unimplemented(
-            "Single-threaded executor does not support collective ops.");
+            "Single-threaded executor does not support collective ops.  But "
+            "saw collective node ",
+            n->name());
       }
 
       KernelState& kernel_state = kernels_[i];
diff --git a/tensorflow/core/kernels/data/single_threaded_executor_test.cc b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
index 6244e287bb0db911c04da51c3e1fbdc9ae049e38..7bb51fb8b53d59789f2d1efad04f4ffdf39587e4 100644
--- a/tensorflow/core/kernels/data/single_threaded_executor_test.cc
+++ b/tensorflow/core/kernels/data/single_threaded_executor_test.cc
@@ -51,17 +51,17 @@ class ExecutorTest : public ::testing::Test {
     // when the test completes.
     CHECK(rendez_->Unref());
     delete exec_;
-    delete device_;
   }
 
   // Resets executor_ with a new executor based on a graph 'gdef'.
   void Create(std::unique_ptr<const Graph> graph) {
     const int version = graph->versions().producer();
     LocalExecutorParams params;
-    params.device = device_;
+    params.device = device_.get();
     params.create_kernel = [this, version](const NodeDef& ndef,
                                            OpKernel** kernel) {
-      return CreateNonCachedKernel(device_, nullptr, ndef, version, kernel);
+      return CreateNonCachedKernel(device_.get(), nullptr, ndef, version,
+                                   kernel);
     };
     params.delete_kernel = [](OpKernel* kernel) {
       DeleteNonCachedKernel(kernel);
@@ -86,7 +86,7 @@ class ExecutorTest : public ::testing::Test {
     return exec_->Run(args);
   }
 
-  Device* device_ = nullptr;
+  std::unique_ptr<Device> device_;
   Executor* exec_ = nullptr;
   Executor::Args::Runner runner_;
   Rendezvous* rendez_ = nullptr;
diff --git a/tensorflow/core/kernels/data/skip_dataset_op.cc b/tensorflow/core/kernels/data/skip_dataset_op.cc
index 8379383662a760ba54b0b2542371890221e1a8c6..e321066a715d180f0791c9afdfa947560a0fd9ce 100644
--- a/tensorflow/core/kernels/data/skip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/skip_dataset_op.cc
@@ -67,6 +67,14 @@ class SkipDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "SkipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return std::max(0LL, n - count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
index a002c605357381071884489df8079da3ddbfaa28..be105f8170b8fff79c0c60a76a699a6ee6ba13f9 100644
--- a/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/sparse_tensor_slice_dataset_op.cc
@@ -54,6 +54,8 @@ class Dataset : public DatasetBase {
     return "SparseTensorSliceDatasetOp::Dataset";
   }
 
+  int64 Cardinality() const override { return sparse_tensor_.shape()[0]; }
+
  protected:
   Status AsGraphDefInternal(SerializationContext* ctx,
                             DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/take_dataset_op.cc b/tensorflow/core/kernels/data/take_dataset_op.cc
index 57c9b0d57f68129e5b00462be79bb5864b7853a7..0a3d5869534ddad9f7ed295171d8deefc2154107 100644
--- a/tensorflow/core/kernels/data/take_dataset_op.cc
+++ b/tensorflow/core/kernels/data/take_dataset_op.cc
@@ -68,6 +68,17 @@ class TakeDatasetOp : public UnaryDatasetOpKernel {
 
     string DebugString() const override { return "TakeDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kUnknownCardinality) {
+        return kUnknownCardinality;
+      }
+      if (n == kInfiniteCardinality) {
+        return count_;
+      }
+      return std::min(n, count_);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_dataset_op.cc b/tensorflow/core/kernels/data/tensor_dataset_op.cc
index c7d374f489740a62b837690a4a80278212e98cce..98c23f23b202dee580fb89f5473f69c61d57c640 100644
--- a/tensorflow/core/kernels/data/tensor_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_dataset_op.cc
@@ -61,6 +61,8 @@ class TensorDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "TensorDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override { return 1LL; }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc b/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
deleted file mode 100644
index 7fd1c4c9e0488ac47de7e8b2a618eb941f70f507..0000000000000000000000000000000000000000
--- a/tensorflow/core/kernels/data/tensor_queue_dataset_op.cc
+++ /dev/null
@@ -1,657 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#include <deque>
-
-#include "tensorflow/core/framework/dataset.h"
-#include "tensorflow/core/framework/partial_tensor_shape.h"
-#include "tensorflow/core/framework/resource_mgr.h"
-#include "tensorflow/core/framework/tensor.h"
-#include "tensorflow/core/framework/variant.h"
-#include "tensorflow/core/framework/variant_encode_decode.h"
-#include "tensorflow/core/util/batch_util.h"
-
-namespace tensorflow {
-namespace data {
-namespace {
-
-bool IsGreaterEqualToOrCompatibleWith(const PartialTensorShape& a,
-                                      const PartialTensorShape& b) {
-  // Returns true if dims[a] >= dims[b], or are compatible.
-  if (a.unknown_rank()) return true;
-  if (a.dims() != b.dims()) return false;
-  for (int d = 0; d < a.dims(); ++d) {
-    if (a.dim_size(d) == -1 || b.dim_size(d) == -1) continue;
-    if (a.dim_size(d) < b.dim_size(d)) return false;
-  }
-  return true;
-}
-
-DataTypeVector PrependQueueType(const DataTypeVector& dtypes) {
-  DataTypeVector out;
-  out.reserve(dtypes.size() + 1);
-  out.push_back(DT_VARIANT);  // The queue component.
-  for (const DataType& d : dtypes) out.push_back(d);
-  return out;
-}
-
-std::vector<PartialTensorShape> PrependQueueShapeWithBatch(
-    const std::vector<PartialTensorShape>& shapes) {
-  std::vector<PartialTensorShape> out;
-  out.reserve(shapes.size() + 1);
-  out.emplace_back(PartialTensorShape({-1}));  // The queue component.
-  for (PartialTensorShape s : shapes) {
-    s.InsertDim(0, -1);  // Unknown batch size.
-    out.push_back(std::move(s));
-  }
-  return out;
-}
-
-class EnqueueInQueueDatasetOp;
-
-class PrependFromQueueAndPaddedBatchDataset : public DatasetBase {
- public:
-  PrependFromQueueAndPaddedBatchDataset(
-      OpKernelContext* ctx, const int64 batch_size, const DatasetBase* input,
-      const DataTypeVector& dtypes,
-      const std::vector<PartialTensorShape>& shapes,
-      std::vector<Tensor> padding_values)
-      : DatasetBase(DatasetContext(ctx)),
-        batch_size_(batch_size),
-        input_(input),
-        dtypes_(dtypes),
-        shapes_(shapes),
-        padding_values_(std::move(padding_values)),
-        dtypes_with_queue_(PrependQueueType(dtypes)),
-        batched_shapes_with_queue_(PrependQueueShapeWithBatch(shapes)) {
-    input_->Ref();
-  }
-
-  ~PrependFromQueueAndPaddedBatchDataset() override { input_->Unref(); }
-
-  std::unique_ptr<IteratorBase> MakeIteratorInternal(
-      const string& prefix) const override {
-    return std::unique_ptr<IteratorBase>(new Iterator(
-        {this, strings::StrCat(prefix, "::PrependFromQueueAndPaddedBatch")}));
-  }
-
-  const DataTypeVector& output_dtypes() const override {
-    return dtypes_with_queue_;
-  }
-  const std::vector<PartialTensorShape>& output_shapes() const override {
-    return batched_shapes_with_queue_;
-  }
-
-  string DebugString() const override {
-    return "PrependFromQueueAndPaddedBatchDatasetOp::Dataset";
-  }
-
- protected:
-  Status AsGraphDefInternal(SerializationContext* ctx,
-                            DatasetGraphDefBuilder* b,
-                            Node** output) const override {
-    Node* input_graph = nullptr;
-    TF_RETURN_IF_ERROR(b->AddInputDataset(ctx, input_, &input_graph));
-    Node* batch_size = nullptr;
-    TF_RETURN_IF_ERROR(b->AddScalar(batch_size_, &batch_size));
-
-    std::vector<Node*> padded_shapes;
-    padded_shapes.reserve(shapes_.size());
-    for (int i = 0; i < shapes_.size(); i++) {
-      Node* node;
-      Tensor t(DT_INT64, TensorShape({shapes_[i].dims()}));
-      for (int j = 0; j < shapes_[i].dims(); j++) {
-        t.vec<int64>()(j) = shapes_[i].dim_size(j);
-      }
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padded_shapes.emplace_back(node);
-    }
-
-    std::vector<Node*> padding_values;
-    padding_values.reserve(padding_values_.size());
-    for (const Tensor& t : padding_values_) {
-      Node* node;
-      TF_RETURN_IF_ERROR(b->AddTensor(t, &node));
-      padding_values.emplace_back(node);
-    }
-
-    AttrValue output_types;
-    b->BuildAttrValue(dtypes_, &output_types);
-
-    AttrValue output_shapes;
-    b->BuildAttrValue(batched_shapes_with_queue_, &output_shapes);
-
-    AttrValue N;
-    b->BuildAttrValue<int64>(shapes_.size(), &N);
-
-    TF_RETURN_IF_ERROR(b->AddDataset(this, {{0, input_graph}, {1, batch_size}},
-                                     {{2, padded_shapes}, {3, padding_values}},
-                                     {{"Toutput_types", output_types},
-                                      {"output_shapes", output_shapes},
-                                      {"N", N}},
-                                     output));
-
-    return Status::OK();
-  }
-
- private:
-  friend class EnqueueInQueueDatasetOp;
-
-  class Iterator
-      : public DatasetIterator<PrependFromQueueAndPaddedBatchDataset> {
-   public:
-    explicit Iterator(const Params& params)
-        : DatasetIterator<PrependFromQueueAndPaddedBatchDataset>(params) {}
-
-    ~Iterator() override { queue_->Unref(); }
-
-    Status Initialize(IteratorContext* ctx) override {
-      std::unique_ptr<IteratorBase> iterator;
-      TF_RETURN_IF_ERROR(
-          dataset()->input_->MakeIterator(ctx, prefix(), &iterator));
-      queue_ = new TensorQueue(std::move(iterator), dataset()->dtypes_,
-                               dataset()->shapes_);
-      return Status::OK();
-    }
-
-    Status GetNextInternal(IteratorContext* ctx,
-                           std::vector<Tensor>* out_tensors,
-                           bool* end_of_sequence) override {
-      std::vector<std::vector<Tensor>> batch;
-      TF_RETURN_IF_ERROR(queue_->GetNext(ctx, dataset()->batch_size_, &batch,
-                                         end_of_sequence));
-      const auto& dtypes = dataset()->dtypes_;
-      const auto& shapes = dataset()->shapes_;
-      const auto& input_shapes = dataset()->input_->output_shapes();
-      const auto& padding_values = dataset()->padding_values_;
-      const int64 batch_size = batch.size();
-      out_tensors->reserve(dtypes.size());
-
-      std::vector<TensorShape> max_shapes;  // Of non-queue components.
-      for (int i = 0; i < dtypes.size(); ++i) {
-        const PartialTensorShape& shape = shapes[i];
-        TensorShape out_shape({batch_size});
-        for (int r = 0; r < shape.dims(); ++r) {
-          if (shape.dim_size(r) >= 0) {
-            // padded_shape[r] is known.
-            out_shape.AddDim(shape.dim_size(r));
-          } else {
-            // padded_shape[r] is unknown, find the maximum across
-            // the batch.
-            int64 dim = 0;
-            for (int b = 0; b < batch.size(); ++b) {
-              dim = std::max(dim, batch[b][i].dim_size(r));
-            }
-            out_shape.AddDim(dim);
-          }
-        }
-        max_shapes.push_back(std::move(out_shape));
-      }
-
-      out_tensors->emplace_back(ctx->allocator({}), DT_VARIANT,
-                                TensorShape({batch_size}));
-      if (!batch.empty()) {
-        auto queues = out_tensors->back().flat<Variant>();
-        Variant& queue_inserter = queues(0);
-        queue_inserter = TensorQueueInserter();
-        queue_inserter.get<TensorQueueInserter>()->set_queue(queue_);
-        for (int b = 1; b < batch.size(); ++b) {
-          // Copy the TensorQueueInserter.  Each copy increments the
-          // Ref on the queue_.
-          queues(b) = queues(0);
-        }
-      }
-
-      for (int i = 0; i < max_shapes.size(); ++i) {
-        out_tensors->emplace_back(ctx->allocator({}), dtypes[i], max_shapes[i]);
-        Tensor& component = out_tensors->back();
-        // Try hard to take the fast path.
-        if (shapes[i].IsFullyDefined() &&
-            shapes[i].IsIdenticalTo(input_shapes[i])) {
-          // Take the fast path if we know all the shapes statically.
-          for (int64 b = 0; b < batch.size(); ++b) {
-            TF_RETURN_IF_ERROR(
-                batch_util::CopyElementToSlice(batch[b][i], &component, b));
-          }
-        } else {
-          TF_RETURN_IF_ERROR(
-              batch_util::SetElementZero(&component, padding_values[i]));
-          for (int64 b = 0; b < batch.size(); ++b) {
-            if (batch[b][i].shape() == max_shapes[i]) {
-              TF_RETURN_IF_ERROR(
-                  batch_util::CopyElementToSlice(batch[b][i], &component, b));
-            } else {
-              TF_RETURN_IF_ERROR(batch_util::CopyElementToLargerSlice(
-                  batch[b][i], &component, b));
-            }
-          }
-        }
-      }
-
-      // end_of_sequence was set before we populated out_tensors, so
-      // it's ok to return now.
-      return Status::OK();
-    }
-
-   protected:
-    // Work around bug in MSVC that disallows access to protected
-    // members of Iterator from within TensorQueue.
-    class TensorQueue;
-    friend class TensorQueue;
-
-    class TensorQueue : public core::RefCounted {
-     public:
-      TensorQueue(std::unique_ptr<IteratorBase> input_impl,
-                  const DataTypeVector& dtypes,
-                  const std::vector<PartialTensorShape>& shapes)
-          : dtypes_(dtypes),
-            shapes_(shapes),
-            input_impl_(std::move(input_impl)) {}
-
-      void MaybeWaitForNotificationLocked(mutex_lock* lock)
-          EXCLUSIVE_LOCKS_REQUIRED(mu_) {
-        // This essentially just releases the lock and immediately relocks.
-        cv_.wait_for(*lock, std::chrono::milliseconds(0));
-      }
-
-      void NotifyLocked() EXCLUSIVE_LOCKS_REQUIRED(mu_) { cv_.notify_all(); }
-
-      Status GetNext(IteratorContext* ctx, const int64 batch_size,
-                     std::vector<std::vector<Tensor>>* batch,
-                     bool* end_of_sequence) {
-        mutex_lock lock(mu_);
-
-        *end_of_sequence = false;
-
-        for (int64 b = 0; b < batch_size;) {
-          if (!entries_.empty()) {
-            batch->push_back(std::move(entries_.front()));
-            entries_.pop_front();
-            ++b;
-            continue;
-          } else {
-            if (input_impl_) {
-              // There's still input coming in.
-              std::vector<Tensor> tensors;
-              bool input_end;
-              TF_RETURN_IF_ERROR(
-                  input_impl_->GetNext(ctx, &tensors, &input_end));
-              if (!input_end) {
-                batch->push_back(std::move(tensors));
-                ++b;
-                continue;
-              } else {
-                input_impl_.reset();
-              }
-            }
-            if (!input_impl_) {
-              // There's no more input coming in.
-              if (RefCountIsOne()) {
-                // No TensorQueueInserters in the wild.
-                if (batch->empty()) {
-                  *end_of_sequence = true;
-                }
-                break;
-              } else {
-                MaybeWaitForNotificationLocked(&lock);
-                // If there's data available, try to add entries again.
-                // Otherwise return a smaller batch and hope the next
-                // iterator request has a non-empty or unused queue_.
-                if (entries_.empty()) {
-                  break;
-                }
-              }
-            }
-          }
-        }  // for (int64 b = ... batch_size)
-        return Status::OK();
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) {
-        if (tensors.size() != dtypes_.size()) {
-          return errors::InvalidArgument(
-              "TensorQueue::Insert: mismatched number of tensors.  Queue "
-              "expects ",
-              dtypes_.size(), " tensors but tried to insert ", tensors.size());
-        }
-        for (int i = 0; i < tensors.size(); ++i) {
-          if (tensors[i].dtype() != dtypes_[i]) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched dtypes at component ", i,
-                ".  Attempted "
-                "to insert tensor of type ",
-                DataTypeString(tensors[i].dtype()),
-                " but queue expected type: ", DataTypeString(dtypes_[i]));
-          }
-          if (!shapes_[i].IsCompatibleWith(tensors[i].shape())) {
-            return errors::InvalidArgument(
-                "TensorQueue::Insert: mismatched shapes at component ", i,
-                ".  Attempted "
-                "to insert tensor with shape ",
-                tensors[i].shape().DebugString(),
-                " but queue expected shape: ", shapes_[i].DebugString());
-          }
-        }
-        mutex_lock lock(mu_);
-        entries_.push_back(tensors);
-        NotifyLocked();
-        return Status::OK();
-      }
-
-      Status Save(Iterator* iter, IteratorStateWriter* writer) {
-        mutex_lock lock(mu_);
-        if (input_impl_) {
-          TF_RETURN_IF_ERROR(iter->SaveInput(writer, input_impl_));
-        } else {
-          TF_RETURN_IF_ERROR(
-              writer->WriteScalar(iter->full_name("input_exhausted"), ""));
-        }
-        TF_RETURN_IF_ERROR(writer->WriteScalar(iter->full_name("entries_size"),
-                                               entries_.size()));
-        for (int64 b = 0; b < entries_.size(); ++b) {
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            TF_RETURN_IF_ERROR(
-                writer->WriteTensor(strings::StrCat(iter->full_name("entries"),
-                                                    "[", b, "][", i, "]"),
-                                    entries_[b][i]));
-          }
-        }
-        return Status::OK();
-      }
-
-      Status Restore(Iterator* iter, IteratorContext* ctx,
-                     IteratorStateReader* reader) {
-        mutex_lock l(mu_);
-        if (reader->Contains(iter->full_name("input_exhausted"))) {
-          input_impl_.reset();
-        } else {
-          TF_RETURN_IF_ERROR(iter->dataset_input()->MakeIterator(
-              ctx, iter->prefix(), &input_impl_));
-          TF_RETURN_IF_ERROR(iter->RestoreInput(ctx, reader, input_impl_));
-        }
-        entries_.clear();
-        int64 entries_size = -1;
-        TF_RETURN_IF_ERROR(
-            reader->ReadScalar(iter->full_name("entries_size"), &entries_size));
-        if (entries_size < 0) {
-          return errors::DataLoss(
-              "Expected entries_size key '", iter->full_name("entries_size"),
-              "' to have nonnegative value, but saw: ", entries_size);
-        }
-        for (int64 b = 0; b < entries_size; ++b) {
-          std::vector<Tensor> entry;
-          for (int i = 0; i < dtypes_.size(); ++i) {
-            Tensor value;
-            TF_RETURN_IF_ERROR(
-                reader->ReadTensor(strings::StrCat(iter->full_name("entries"),
-                                                   "[", b, "][", i, "]"),
-                                   &value));
-            entry.push_back(std::move(value));
-          }
-          entries_.push_back(std::move(entry));
-        }
-        return Status::OK();
-      }
-
-      mutex* mu() { return &mu_; }
-
-     private:
-      DataTypeVector dtypes_;
-      std::vector<PartialTensorShape> shapes_;
-
-      mutex mu_;
-      std::unique_ptr<IteratorBase> input_impl_ GUARDED_BY(mu_);
-      std::deque<std::vector<Tensor>> entries_ GUARDED_BY(mu_);
-      condition_variable cv_ GUARDED_BY(mu_);
-    };
-
-    const DatasetBase* dataset_input() const { return dataset()->input_; }
-
-    std::shared_ptr<model::Node> CreateNode(
-        IteratorContext* ctx, model::Node::Args args) const override {
-      return model::MakeKnownRatioNode(std::move(args), dataset()->batch_size_);
-    }
-
-    Status SaveInternal(IteratorStateWriter* writer) override {
-      return queue_->Save(this, writer);
-    }
-
-    Status RestoreInternal(IteratorContext* ctx,
-                           IteratorStateReader* reader) override {
-      return queue_->Restore(this, ctx, reader);
-    }
-
-   public:
-    class TensorQueueInserter {
-     public:
-      TensorQueueInserter() : queue_(nullptr) {}
-
-      void set_queue(TensorQueue* queue) {
-        queue_ = queue;
-        queue_->Ref();
-      }
-
-      TensorQueueInserter(const TensorQueueInserter& rhs) {
-        queue_ = rhs.queue_;
-        queue_->Ref();
-      }
-
-      TensorQueueInserter(TensorQueueInserter&& rhs) {
-        queue_ = rhs.queue_;
-        rhs.queue_ = nullptr;
-      }
-
-      TensorQueueInserter& operator=(const TensorQueueInserter& rhs) = delete;
-
-      string TypeName() const { return "tensorflow::TensorQueueInserter"; }
-      string DebugString() const { return TypeName(); }
-
-      void Encode(VariantTensorData*) const {}
-      bool Decode(const VariantTensorData&) { return false; }
-
-      ~TensorQueueInserter() {
-        if (queue_) {
-          mutex_lock lock(*queue_->mu());
-          queue_->Unref();
-          queue_->NotifyLocked();
-          queue_ = nullptr;
-        }
-      }
-
-      Status Insert(const std::vector<Tensor>& tensors) const {
-        CHECK(queue_);
-        return queue_->Insert(tensors);
-      }
-
-     private:
-      mutable TensorQueue* queue_;
-    };
-
-   private:
-    TensorQueue* queue_;
-  };
-
- private:
-  const int64 batch_size_;
-  const DatasetBase* input_;
-  const DataTypeVector dtypes_;
-  const std::vector<PartialTensorShape> shapes_;
-  const std::vector<Tensor> padding_values_;
-  const DataTypeVector dtypes_with_queue_;
-  const std::vector<PartialTensorShape> batched_shapes_with_queue_;
-};
-
-class PrependFromQueueAndPaddedBatchDatasetOp : public UnaryDatasetOpKernel {
- public:
-  explicit PrependFromQueueAndPaddedBatchDatasetOp(OpKernelConstruction* ctx)
-      : UnaryDatasetOpKernel(ctx) {
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("Toutput_types", &output_types_));
-  }
-
-  void MakeDataset(OpKernelContext* ctx, DatasetBase* input,
-                   DatasetBase** output) override {
-    int64 batch_size = 0;
-    OP_REQUIRES_OK(ctx,
-                   ParseScalarArgument<int64>(ctx, "batch_size", &batch_size));
-    OP_REQUIRES(
-        ctx, batch_size > 0,
-        errors::InvalidArgument("Batch size must be greater than zero."));
-
-    OpInputList padded_shape_tensors;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padded_shapes", &padded_shape_tensors));
-    std::vector<PartialTensorShape> padded_shapes;
-    padded_shapes.reserve(padded_shape_tensors.size());
-    OP_REQUIRES(ctx,
-                padded_shape_tensors.size() == input->output_shapes().size(),
-                errors::InvalidArgument("Number of padded shapes (",
-                                        padded_shape_tensors.size(),
-                                        ") must match the number of components "
-                                        "in the input dataset's elements (",
-                                        input->output_shapes().size(), ")"));
-    for (const Tensor& padded_shape_t : padded_shape_tensors) {
-      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(padded_shape_t.shape()),
-                  errors::InvalidArgument("All padded shapes must be vectors"));
-      PartialTensorShape padded_shape;
-      OP_REQUIRES_OK(ctx, PartialTensorShape::MakePartialShape(
-                              padded_shape_t.vec<int64>().data(),
-                              padded_shape_t.NumElements(), &padded_shape));
-      padded_shapes.push_back(std::move(padded_shape));
-    }
-
-    OP_REQUIRES(
-        ctx, input->output_dtypes() == output_types_,
-        errors::InvalidArgument("Input dataset and this dataset "
-                                "have different output_types: ",
-                                DataTypeVectorString(input->output_dtypes()),
-                                " and ", DataTypeVectorString(output_types_)));
-
-    for (int i = 0; i < input->output_shapes().size(); ++i) {
-      // Exclude the queue from the tensor_shapes calculation.
-      const PartialTensorShape& tensor_shape = padded_shapes[i];
-      OP_REQUIRES(
-          ctx,
-          IsGreaterEqualToOrCompatibleWith(tensor_shape,
-                                           input->output_shapes()[i]),
-          errors::InvalidArgument("Incompatible input shapes at component ", i,
-                                  " between input dataset this dataset: ",
-                                  input->output_shapes()[i].DebugString(),
-                                  " vs. ", tensor_shape.DebugString()));
-    }
-
-    OpInputList padding_values_list;
-    OP_REQUIRES_OK(ctx,
-                   ctx->input_list("padding_values", &padding_values_list));
-    std::vector<Tensor> padding_values;
-    OP_REQUIRES(ctx,
-                padding_values_list.size() == input->output_shapes().size(),
-                errors::InvalidArgument(
-                    "Number of padding values (", padding_values_list.size(),
-                    ") must match the number of components in the input "
-                    "dataset's elements (",
-                    input->output_shapes().size(), ")"));
-    for (int i = 0; i < padding_values_list.size(); ++i) {
-      const Tensor& padding_value_t = padding_values_list[i];
-      OP_REQUIRES(
-          ctx, TensorShapeUtils::IsScalar(padding_value_t.shape()),
-          errors::InvalidArgument(
-              "All padding values must be scalars; but at component ", i,
-              " saw shape: ", padding_value_t.shape().DebugString()));
-      OP_REQUIRES(ctx, padding_value_t.dtype() == input->output_dtypes()[i],
-                  errors::InvalidArgument(
-                      "Mismatched type between padding value ", i,
-                      " and input dataset's component ", i, ": ",
-                      DataTypeString(padding_value_t.dtype()), " vs. ",
-                      DataTypeString(input->output_dtypes()[i])));
-      padding_values.push_back(padding_value_t);
-    }
-
-    *output = new PrependFromQueueAndPaddedBatchDataset(
-        ctx, batch_size, input, output_types_, padded_shapes,
-        std::move(padding_values));
-  }
-
- private:
-  DataTypeVector output_types_;
-};
-
-REGISTER_KERNEL_BUILDER(
-    Name("PrependFromQueueAndPaddedBatchDataset").Device(DEVICE_CPU),
-    PrependFromQueueAndPaddedBatchDatasetOp);
-
-class EnqueueInQueueDatasetOp : public OpKernel {
- public:
-  explicit EnqueueInQueueDatasetOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}
-  void Compute(OpKernelContext* ctx) override {
-    using TensorQueueInserter =
-        PrependFromQueueAndPaddedBatchDataset::Iterator::TensorQueueInserter;
-
-    // TODO(ebrevdo): accept list of sequence lengths to do proper
-    // sub-slicing of tensors for placement into the queue?
-    const Tensor& tensor_queue_t = ctx->input(0);
-    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_queue_t.shape()),
-                errors::InvalidArgument("queue must be a vector, saw shape: ",
-                                        tensor_queue_t.shape().DebugString()));
-    std::vector<const TensorQueueInserter*> inserters;
-    const int64 batch_size = tensor_queue_t.NumElements();
-    inserters.reserve(batch_size);
-    const Variant* variants = tensor_queue_t.flat<Variant>().data();
-    for (int i = 0; i < batch_size; ++i) {
-      const auto* inserter = variants[i].get<TensorQueueInserter>();
-      OP_REQUIRES(ctx, inserter != nullptr,
-                  errors::InvalidArgument(
-                      "Could not access TensorQueueInserter from queue[", i,
-                      "].  Received variant: ", variants[i].DebugString()));
-      inserters.push_back(inserter);
-    }
-
-    OpInputList components;
-    OP_REQUIRES_OK(ctx, ctx->input_list("components", &components));
-    for (int i = 0; i < components.size(); ++i) {
-      OP_REQUIRES(
-          ctx,
-          components[i].dims() > 0 && components[i].dim_size(0) == batch_size,
-          errors::InvalidArgument(
-              "Expected component ", i, " to have batched shape [", batch_size,
-              ",...], but saw shape: ", components[i].shape().DebugString()));
-    }
-    std::vector<TensorShape> element_shapes;
-    for (int i = 0; i < components.size(); ++i) {
-      TensorShape element_shape = components[i].shape();
-      element_shape.RemoveDim(0);
-      element_shapes.push_back(std::move(element_shape));
-    }
-    for (int64 b = 0; b < batch_size; ++b) {
-      std::vector<Tensor> tensors;
-      tensors.reserve(components.size());
-      for (int i = 0; i < components.size(); ++i) {
-        Tensor t(components[i].dtype(), element_shapes[i]);
-        OP_REQUIRES_OK(ctx,
-                       batch_util::CopySliceToElement(components[i], &t, b));
-        tensors.push_back(std::move(t));
-      }
-      // TODO(ebrevdo): Acquire the lock once for all inserters with
-      // the same underlying queue?  Add InsertLocked?
-      OP_REQUIRES_OK(ctx, inserters[b]->Insert(tensors));
-    }
-  }
-};
-
-REGISTER_KERNEL_BUILDER(Name("EnqueueInQueueDataset").Device(DEVICE_CPU),
-                        EnqueueInQueueDatasetOp);
-
-}  // namespace
-}  // namespace data
-}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
index 6291bfc110bafe028114b8f9ed010fdd2f97f1cd..4ba2bde718a6351ff13bc17cf14ae5c60332c6ca 100644
--- a/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
+++ b/tensorflow/core/kernels/data/tensor_slice_dataset_op.cc
@@ -84,6 +84,8 @@ class TensorSliceDatasetOp : public DatasetOpKernel {
       return "TensorSliceDatasetOp::Dataset";
     }
 
+    int64 Cardinality() const override { return tensors_[0].dim_size(0); }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data/window_dataset.cc b/tensorflow/core/kernels/data/window_dataset.cc
index 2ad4711aabe40bc6af771396c40006670eaf6b9b..c295631550aa008ccbf1abee0a91b27d64a6ba35 100644
--- a/tensorflow/core/kernels/data/window_dataset.cc
+++ b/tensorflow/core/kernels/data/window_dataset.cc
@@ -41,6 +41,16 @@ class WindowDataset : public DatasetBase {
     return output_shapes_;
   }
 
+  int64 AllocatedBytes() const override {
+    int64 allocated_bytes = 0;
+    for (auto& element : elements_) {
+      allocated_bytes += GetAllocatedBytes(element);
+    }
+    return allocated_bytes;
+  }
+
+  int64 Cardinality() const override { return elements_.size(); }
+
   string DebugString() const override { return "WindowDataset"; }
 
  protected:
diff --git a/tensorflow/core/kernels/data/window_dataset_op.cc b/tensorflow/core/kernels/data/window_dataset_op.cc
index 2c68e1ee05b542663e85839444560bdd8085393a..ae13ae5da8d4c093bdb4d6e168584bda234e4502 100644
--- a/tensorflow/core/kernels/data/window_dataset_op.cc
+++ b/tensorflow/core/kernels/data/window_dataset_op.cc
@@ -98,6 +98,15 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                              window_stride_, drop_remainder_, ")::Dataset");
     }
 
+    int64 Cardinality() const override {
+      int64 n = input_->Cardinality();
+      if (n == kInfiniteCardinality || n == kUnknownCardinality) {
+        return n;
+      }
+      return n / window_shift_ +
+             (n % window_shift_ == 0 || drop_remainder_ ? 0 : 1);
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
@@ -155,6 +164,7 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
               Status status =
                   input_impl_->GetNext(ctx, &element, end_of_sequence);
               if (!*end_of_sequence) {
+                RecordBufferEnqueue(ctx, element);
                 buffer_.emplace_back(std::move(element), status);
               } else {
                 input_impl_.reset();
@@ -192,8 +202,14 @@ class WindowDatasetOp : public UnaryDatasetOpKernel {
                 input_impl_.reset();
               }
             }
+            for (size_t i = 0; i < buffer_.size(); ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.clear();
           } else {
+            for (size_t i = 0; i < window_shift; ++i) {
+              RecordBufferDequeue(ctx, buffer_.at(i).result);
+            }
             buffer_.erase(buffer_.begin(), buffer_.begin() + window_shift);
           }
         }
diff --git a/tensorflow/core/kernels/data/zip_dataset_op.cc b/tensorflow/core/kernels/data/zip_dataset_op.cc
index 6e94d77867168df3aeaeae19b310ef93b0f654f5..1760e63a9e1c6b6262c19baa8354052d7d73fd3c 100644
--- a/tensorflow/core/kernels/data/zip_dataset_op.cc
+++ b/tensorflow/core/kernels/data/zip_dataset_op.cc
@@ -76,6 +76,21 @@ class ZipDatasetOp : public DatasetOpKernel {
 
     string DebugString() const override { return "ZipDatasetOp::Dataset"; }
 
+    int64 Cardinality() const override {
+      int64 result = kInfiniteCardinality;
+      for (const auto& input : inputs_) {
+        int64 n = input->Cardinality();
+        if (n == kUnknownCardinality) {
+          return kUnknownCardinality;
+        }
+        if (n != kInfiniteCardinality &&
+            (result == kInfiniteCardinality || n < result)) {
+          result = n;
+        }
+      }
+      return result;
+    }
+
    protected:
     Status AsGraphDefInternal(SerializationContext* ctx,
                               DatasetGraphDefBuilder* b,
diff --git a/tensorflow/core/kernels/data_format_ops.cc b/tensorflow/core/kernels/data_format_ops.cc
index 23319e6d0c56788e875eba0720006e2843d78a9d..27020cdabdb867e149bc65743fc60673492436f2 100644
--- a/tensorflow/core/kernels/data_format_ops.cc
+++ b/tensorflow/core/kernels/data_format_ops.cc
@@ -156,6 +156,16 @@ TF_CALL_int32(REGISTER_KERNEL);
 TF_CALL_int64(REGISTER_KERNEL);
 #undef REGISTER_KERNEL
 
+#define REGISTER_KERNEL(T)                             \
+  REGISTER_KERNEL_BUILDER(Name("DataFormatVecPermute") \
+                              .Device(DEVICE_CPU)      \
+                              .Label("host")           \
+                              .TypeConstraint<T>("T"), \
+                          DataFormatVecPermuteOp<CPUDevice, T>);
+TF_CALL_int32(REGISTER_KERNEL);
+TF_CALL_int64(REGISTER_KERNEL);
+#undef REGISTER_KERNEL
+
 #if GOOGLE_CUDA
 // Forward declarations of the functor specializations for GPU.
 namespace functor {
diff --git a/tensorflow/core/kernels/deep_conv2d.cc b/tensorflow/core/kernels/deep_conv2d.cc
index f9c8f16cb9a4c10d58d10bbc442b8bbbdae71939..750c0318a4df483e1980869a3af8de8aa1efea41 100644
--- a/tensorflow/core/kernels/deep_conv2d.cc
+++ b/tensorflow/core/kernels/deep_conv2d.cc
@@ -434,10 +434,9 @@ struct TransformFilters {
         tile_spatial_size, base_filter_spatial_size, transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &base_filter_rows, &base_filter_cols,
-                  &num_filters_transform, &in_depth, &out_depth,
-                  &filter_shards_row, &filter_shards_col, &tile_spatial_size,
-                  &filter_in, &transform_matrix,
-                  &filter_out](int64 start, int64 limit) {
+                  &num_filters_transform, &in_depth, &filter_shards_row,
+                  &filter_shards_col, &tile_spatial_size, &filter_in,
+                  &transform_matrix, &filter_out](int64 start, int64 limit) {
       // Allocate buffer for pre-processed filter:
       //   [base_filter_rows, base_filter_cols, num_filters_transform, in_depth]
       //
@@ -533,9 +532,9 @@ struct PackFilters {
     const int64 out_depth = args.out_depth;
     const int64 num_filters = filter_shards_row * filter_shards_col * out_depth;
 
-    auto shard = [&ctx, &packed_filters, &filter_transform_data,
-                  &tile_spatial_size, &in_depth, &out_depth, &filter_shards_row,
-                  &filter_shards_col, &num_filters](int64 start, int64 limit) {
+    auto shard = [&ctx, &packed_filters, &filter_transform_data, &in_depth,
+                  &out_depth, &filter_shards_row, &filter_shards_col,
+                  &num_filters](int64 start, int64 limit) {
       const int64 filter_coord_stride = num_filters * in_depth;
       for (int64 i = start; i < limit; ++i) {
         // Allocate filter buffer [out_depth, shard_rows, shard_cols, in_depth].
@@ -788,7 +787,7 @@ struct TransformOutputTile {
             const int64 shard_base = sr * filter_shards_col + sc;
             const int64 out_buf_base = tile_base + out_depth_base + shard_base;
 
-            // Calcuate output indices and outputs to drop (if needed).
+            // Calculate output indices and outputs to drop (if needed).
             const int64 out_r_start =
                 in_r + args.pad_rows - sr * tile_stride_rows;
             // NOTE: The index 't' for 'num_tiles is used in index calculation
@@ -1004,9 +1003,9 @@ struct DeepConv2D<CPUDevice, T> {
         out_tile_spatial_size, tile_spatial_size, output_transform_matrix);
 
     auto shard = [&ctx, &args, &transform, &packed_filters, &in_depth,
-                  out_depth, tile_rows, tile_cols, out_tile_rows, out_tile_cols,
-                  filter_shards_row, filter_shards_col, tile_spatial_size,
-                  &input, &tile_transform_matrix, &output_transform_matrix,
+                  out_depth, out_tile_rows, out_tile_cols, filter_shards_row,
+                  filter_shards_col, tile_spatial_size, &input,
+                  &tile_transform_matrix, &output_transform_matrix,
                   &output](int64 batch_start, int64 batch_limit) {
       const int64 row_tiles =
           (args.out_rows + out_tile_rows - 1) / out_tile_rows +
diff --git a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
index 1398c87662575ff5d1752b4db03087bd7dabcb83..e811968d277ba3594341a59e8d6262cac637e602 100644
--- a/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/depthwise_conv_op_gpu.cu.cc
@@ -175,7 +175,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNHWCSmall(
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.x depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -459,7 +459,7 @@ __global__ __launch_bounds__(1024, 2) void DepthwiseConv2dGPUKernelNCHWSmall(
   assert(CanLaunchDepthwiseConv2dGPUSmall(args));
   // Holds block plus halo and filter data for blockDim.z depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -1176,7 +1176,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNHWCSmall(
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.z));
   // Holds block plus halo and filter data for blockDim.x depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
@@ -1448,7 +1448,7 @@ __launch_bounds__(1024, 2) void DepthwiseConv2dBackpropFilterGPUKernelNCHWSmall(
   assert(CanLaunchDepthwiseConv2dBackpropFilterGPUSmall(args, blockDim.x));
   // Holds block plus halo and filter data for blockDim.z depths.
   extern __shared__ __align__(8) unsigned char shared_memory[];
-  static_assert(sizeof(S) <= 8, "Insufficient alignement detected");
+  static_assert(sizeof(S) <= 8, "Insufficient alignment detected");
   S* const shared_data = reinterpret_cast<S*>(shared_memory);
 
   const int num_batches = args.batch;
diff --git a/tensorflow/core/kernels/dynamic_partition_op.cc b/tensorflow/core/kernels/dynamic_partition_op.cc
index 3c988db5e618b976b5b2d45a9bfc386485249826..572d04ae2c464d493508d494ba325a33eb92d4c1 100644
--- a/tensorflow/core/kernels/dynamic_partition_op.cc
+++ b/tensorflow/core/kernels/dynamic_partition_op.cc
@@ -142,7 +142,7 @@ class DynamicPartitionOp : public DynamicPartitionOp_Shared {
         OP_REQUIRES(
             c, FastBoundsCheck(p, num_partitions_),
             errors::InvalidArgument("indices[", i,
-                                    "] has been asynchronously overwitten and "
+                                    "] has been asynchronously overwritten and "
                                     "is no longer in range!"));
         auto oi = output_index[p];
         OP_REQUIRES(c, FastBoundsCheck(oi, out_flat[p].dimension(0)),
diff --git a/tensorflow/core/kernels/eigen_contraction_kernel.h b/tensorflow/core/kernels/eigen_contraction_kernel.h
index 92d29e39958e3cd30ee80776f2abb5c67f1a07e2..66e93a83af2e5a7aa40818067638bfdde8dd42c9 100644
--- a/tensorflow/core/kernels/eigen_contraction_kernel.h
+++ b/tensorflow/core/kernels/eigen_contraction_kernel.h
@@ -33,7 +33,7 @@ limitations under the License.
 //   #endif
 
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
-#include "third_party/intel_mkl_dnn/include/mkldnn.h"
+#include "mkldnn.h"
 
 namespace Eigen {
 namespace internal {
@@ -126,6 +126,11 @@ struct mkldnn_gemm_kernel</*Scalar*/ float, IndexType, OutputMapper,
                                       &alpha, blockA, &ldA, blockB, &ldB, &beta,
                                       const_cast<float*>(output.data()), &ldC);
     eigen_assert(st == 0);
+
+    // eigen_assert is a no-op in optimized mode so we add these to avoid
+    // compiler's unused-variable errors.
+    EIGEN_UNUSED_VARIABLE(max_index);
+    EIGEN_UNUSED_VARIABLE(st);
   }
 };
 
@@ -143,8 +148,8 @@ class TensorContractionBlocking<float, float, float, StorageIndex,
   // Multiply default choice of block size along M and N dimensions.
   // TODO(ezhulenev): Explore if this can work in general (kScaleM=2.0 worked
   // well in some of models).
-  static const float kScaleM = 1.5;
-  static const float kScaleN = 1.0;
+  static constexpr float kScaleM = 1.5;
+  static constexpr float kScaleN = 1.0;
 
   // Mkldnn Avx/Avx2/Avx512 unroll factors are: 8/16/48.
   static const StorageIndex kUnrollM = 48;
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions.h b/tensorflow/core/kernels/eigen_spatial_convolutions.h
index 1f211b19b4ad982d2ab2a6520bc0e9277e99055a..25c735d080e1cef54b7c8cd87d25eb31612192b3 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions.h
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions.h
@@ -56,6 +56,7 @@ namespace internal {
 //
 // TODO(ezhulenev): Consolidate this part of the code with the image patch
 // extraction code since they are both very similar.
+
 template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
           typename Device, typename Scalar_, typename Index,
           typename nocontract_t, typename contract_t, int Side, int packet_size,
@@ -70,6 +71,7 @@ class TensorContractionInputMapper<
     inner_dim_reordered, Alignment> {
  public:
   typedef Scalar_ Scalar;
+
   typedef TensorContractionInputMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -79,6 +81,7 @@ class TensorContractionInputMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       Self;
+
   typedef TensorContractionSubMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -88,6 +91,7 @@ class TensorContractionInputMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       SubMapper;
+
   typedef SubMapper VectorMapper;
   typedef SubMapper LinearMapper;
   typedef typename packet_traits<Scalar>::type Packet;
@@ -533,6 +537,7 @@ class TensorContractionSubMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       ParentMapper;
+
   typedef TensorContractionSubMapper<
       Scalar, Index, Side,
       TensorEvaluator<
@@ -542,21 +547,22 @@ class TensorContractionSubMapper<
       nocontract_t, contract_t, packet_size, inner_dim_contiguous,
       inner_dim_reordered, Alignment>
       Self;
+
   typedef Self LinearMapper;
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
       const ParentMapper& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper),
-        m_depth_offset(vert_offset),
-        m_col_offset(horiz_offset) {
+      : m_depth_offset(vert_offset),
+        m_col_offset(horiz_offset),
+        m_base_mapper(base_mapper) {
     m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
                                      m_otherIndex);
   }
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorContractionSubMapper(
       const Self& base_mapper, Index vert_offset, Index horiz_offset)
-      : m_base_mapper(base_mapper.m_base_mapper),
-        m_depth_offset(vert_offset + base_mapper.m_depth_offset),
-        m_col_offset(horiz_offset + base_mapper.m_col_offset) {
+      : m_depth_offset(vert_offset + base_mapper.m_depth_offset),
+        m_col_offset(horiz_offset + base_mapper.m_col_offset),
+        m_base_mapper(base_mapper.m_base_mapper) {
     m_base_mapper.computeBaseIndices(m_col_offset, m_rowIndex, m_colIndex,
                                      m_otherIndex);
   }
@@ -578,7 +584,6 @@ class TensorContractionSubMapper<
     return m_base_mapper.template loadPacket<Alignment>(i + m_depth_offset,
                                                         j + m_col_offset);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Scalar
   loadCoeffStandard(Index i) const {
     return m_base_mapper.loadCoeffStandard(i + m_depth_offset, m_rowIndex,
@@ -611,18 +616,29 @@ class TensorContractionSubMapper<
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxCol(const Index peeled_k) const {
     const Index max_col =
-        fastPatchColStride().divide(m_depth_offset + peeled_k);
+        (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1)) /
+        fastPatchColStride();
     return std::min<Index>(1 + max_col, patchCols());
   }
 
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxRow(const Index peeled_k,
                                    const Index col) const {
-    const Index max_row = fastPatchRowStride().divide(
-        m_depth_offset + peeled_k - col * patchColStride());
+    const Index max_row = (m_depth_offset + (peeled_k == 0 ? 0 : peeled_k - 1) -
+                           col * patchColStride()) /
+                          fastPatchRowStride();
     return std::min<Index>(1 + max_row, patchRows());
   }
 
+  EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE Index maxDepth(const Index peeled_k, const Index col,
+                                     Index row) const {
+    const Index max_depth = m_depth_offset + peeled_k -  //
+                            col * patchColStride() -     //
+                            row * patchRowStride();
+    return std::min<Index>(max_depth, patchDepth());
+  }
+
   // MaxDepth uses only the remaining number of elements in the peeled_k.
   EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE Index maxDepth(const Index num_elements,
@@ -692,6 +708,12 @@ class TensorContractionSubMapper<
     return r < 0 || r >= m_base_mapper.m_inputRows;
   }
   EIGEN_DEVICE_FUNC
+  EIGEN_ALWAYS_INLINE bool padAnyRow(const Index first_row,
+                                     const Index last_row) const {
+    return m_rowIndex + first_row < 0 ||
+           m_rowIndex + last_row >= m_base_mapper.m_inputRows;
+  }
+  EIGEN_DEVICE_FUNC
   EIGEN_ALWAYS_INLINE bool padCol(const Index col) const {
     const Index c = m_colIndex + col;
     return c < 0 || c >= m_base_mapper.m_inputCols;
@@ -738,9 +760,6 @@ class TensorContractionSubMapper<
   }
 
  private:
-  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
-                                     // performs better in benchmarks.
-
   Index m_depth_offset;  // First row in the input matrix
   Index m_col_offset;    // First col in the input matrix
 
@@ -750,6 +769,9 @@ class TensorContractionSubMapper<
   Index m_rowIndex;
   Index m_colIndex;
   Index m_otherIndex;
+
+  const ParentMapper m_base_mapper;  // Keeping a copy instead of a reference
+                                     // performs better in benchmarks.
 };
 
 // Arrange a block of the right input matrix (in our case it's always a "virtual
@@ -1319,23 +1341,19 @@ struct mkldnn_gemm_pack<
   typedef typename packet_traits<Scalar>::type Packet;
 
   EIGEN_DONT_INLINE
-  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
+  void operator()(Scalar* block, const DataMapper rhs, StorageIndex rows,
                   StorageIndex cols) {
     const bool standard_patches = !rhs.nonStandardPatches();
 
     if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
-      if (rhs.rowStride() == 1) {
-        packStandardPatches<true, /*squeeze*/ true>(block, rhs, rows, cols);
-      } else {
-        packStandardPatches<true, /*squeeze*/ false>(block, rhs, rows, cols);
-      }
+      // Single packet always belong to single patch (row, col).
+      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ true>(
+          block, rhs, rows, cols);
 
     } else if (standard_patches) {
-      if (rhs.rowStride() == 1) {
-        packStandardPatches<false, /*squeeze*/ true>(block, rhs, rows, cols);
-      } else {
-        packStandardPatches<false, /*squeeze*/ false>(block, rhs, rows, cols);
-      }
+      // Single packet can span across multiple patch rows or columns.
+      packStandardPatches</*patch_depth_is_multiple_of_packet_size*/ false>(
+          block, rhs, rows, cols);
 
     } else {
       // With non-standard patches we don't do any vectorized loads.
@@ -1357,72 +1375,64 @@ struct mkldnn_gemm_pack<
   // - patch_depth_is_multiple_of_packet_size=true: We are guaranteed to have
   //   depth dimension size to be a multiple of packet size, so we can skip all
   //   non vectorized loads and checks.
-  //
-  // - squeeze_reads=true: If stride along the `row` dimension is `1`, we can
-  //   squeeze reads along the `row` and `depth` dimensions, because they are
-  //   guaranteed to be contiguous in memory (two innermost dimensions).
-  //
-  template <bool patch_depth_is_multiple_of_packet_size, bool squeeze_reads>
+  template <bool patch_depth_is_multiple_of_packet_size>
   EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* block,
-                                               const DataMapper& rhs,
+                                               const DataMapper rhs,
                                                StorageIndex rows,
                                                StorageIndex cols) {
     eigen_assert(!rhs.nonStandardPatches());
 
     // Give vectorized_rows the name used in all other gemm_pack_rhs above.
-    const Index peeled_k = (rows / packet_size) * packet_size;
+    const StorageIndex peeled_k = (rows / packet_size) * packet_size;
 
-    const Index start_col = rhs.colOffset();
-    const Index max_col = rhs.maxCol(peeled_k);
+    const StorageIndex start_col = rhs.colOffset();
+    const StorageIndex max_col = rhs.maxCol(peeled_k);
 
     for (StorageIndex col = 0; col < cols; ++col) {
       SubMapper lm = rhs.getLinearMapper(0, col);
 
-      Index k = 0;
+      StorageIndex k = 0;
       for (Index c = start_col; c < max_col; ++c) {
         eigen_assert(k <= peeled_k);
 
-        const Index start_row = (c == start_col) ? rhs.rowOffset() : 0;
-        const Index max_row = rhs.maxRow(peeled_k, c);
+        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
+        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
         const bool pad_col = lm.padCol(c);
 
         // We can squeeze reads for all rows in [start_row, max_row) range.
-        if (squeeze_reads && !pad_col && !lm.padRow(start_row) &&
-            !lm.padRow(max_row - 1)) {
-          const Index start_depth = (c == start_col) ? rhs.depthOffset() : 0;
-
-          // Upper bound on the number of elements in the depth dimension that
-          // we can squeeze read.
-          const Index squeeze_length =
-              (max_row - start_row) * rhs.patchDepth() - start_depth;
+        if (!pad_col && !lm.padAnyRow(start_row, max_row - 1)) {
+          const StorageIndex start_depth =
+              (c == start_col) ? rhs.depthOffset() : 0;
 
-          // Do not overshoot beyond the block size.
-          const Index max_depth =
-              start_depth + std::min<Index>(peeled_k - k, squeeze_length);
+          const StorageIndex max_depth =
+              std::min<StorageIndex>(start_depth + (peeled_k - k),
+                                     (max_row - start_row) * rhs.patchDepth());
 
-          const Index base_idx = lm.baseIndex(start_row, c);
+          const StorageIndex base_idx = lm.baseIndex(start_row, c);
 
-          if (patch_depth_is_multiple_of_packet_size)
+          if (patch_depth_is_multiple_of_packet_size) {
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
             eigen_assert((max_depth - start_depth) % packet_size == 0);
+            StorageIndex d = start_depth;
 
-          // If patch depth is a multiple of packet size, it's guaranteed that
-          // we can process all values in depth dimension with packets.
-          const Index max_vectorized_depth =
-              patch_depth_is_multiple_of_packet_size ? max_depth
-                                                     : max_depth - packet_size;
-
-          Index d = start_depth;
+            for (; d < max_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
+              block += packet_size;
+              k += packet_size;
+            }
 
-          // 1. Process depth dimension with vectorized instructions.
-          for (; d < max_vectorized_depth; d += packet_size) {
-            eigen_assert(k < peeled_k);
-            internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
-            block += packet_size;
-            k += packet_size;
-          }
+          } else {
+            StorageIndex d = start_depth;
+            const StorageIndex vectorized_depth = max_depth - packet_size;
 
-          // 2. Finish with coefficients.
-          if (!patch_depth_is_multiple_of_packet_size) {
+            for (; d <= vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
+              block += packet_size;
+              k += packet_size;
+            }
             for (; d < max_depth; d++) {
               eigen_assert(k < peeled_k);
               *block = rhs.coeffNoPadding(d, base_idx);
@@ -1437,39 +1447,43 @@ struct mkldnn_gemm_pack<
 
         // If we are not allowed to squeeze reads along the `row` and `depth`
         // dimensions, we must process rows one by one.
-        for (Index r = start_row; r < max_row; ++r) {
+        for (StorageIndex r = start_row; r < max_row; ++r) {
           eigen_assert(k <= peeled_k);
 
-          const Index start_depth =
+          const StorageIndex start_depth =
               ((c == start_col) && (r == start_row)) ? rhs.depthOffset() : 0;
-          const Index max_depth = rhs.maxDepth(peeled_k - k, start_depth);
+          const StorageIndex max_depth =
+              rhs.maxDepth(peeled_k - k, start_depth);
 
           const bool pad = pad_col || lm.padRow(r);
-          const Index base_idx = lm.baseIndex(r, c);
+          const StorageIndex base_idx = lm.baseIndex(r, c);
 
-          if (patch_depth_is_multiple_of_packet_size)
+          if (patch_depth_is_multiple_of_packet_size) {
+            // If patch depth is a multiple of packet size, it's guaranteed that
+            // we can process all values in depth dimension with packets.
             eigen_assert((max_depth - start_depth) % packet_size == 0);
+            StorageIndex d = start_depth;
 
-          // If patch depth is a multiple of packet size, it's guaranteed that
-          // we can process all values in depth dimension with packets.
-          const Index max_vectorized_depth =
-              patch_depth_is_multiple_of_packet_size ? max_depth
-                                                     : max_depth - packet_size;
-
-          Index d = start_depth;
-
-          // 1. Process depth dimension with vectorized instructions.
-          for (; d < max_vectorized_depth; d += packet_size) {
-            eigen_assert(k < peeled_k);
-            const Packet p = pad ? pset1<Packet>(Scalar(0))
-                                 : rhs.packetNoPadding(d, base_idx);
-            internal::pstoreu(block, p);
-            block += packet_size;
-            k += packet_size;
-          }
+            for (; d < max_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet p = pad ? pset1<Packet>(Scalar(0))
+                                   : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, p);
+              block += packet_size;
+              k += packet_size;
+            }
 
-          // 2. Finish with coefficients.
-          if (!patch_depth_is_multiple_of_packet_size) {
+          } else {
+            const StorageIndex max_vectorized_depth = max_depth - packet_size;
+            StorageIndex d = start_depth;
+            for (; d < max_vectorized_depth; d += packet_size) {
+              eigen_assert(k < peeled_k);
+              const Packet p = pad ? pset1<Packet>(Scalar(0))
+                                   : rhs.packetNoPadding(d, base_idx);
+              internal::pstoreu(block, p);
+              block += packet_size;
+              k += packet_size;
+            }
             for (; d < max_depth; d++) {
               eigen_assert(k < peeled_k);
               *block = pad ? Scalar(0) : rhs.coeffNoPadding(d, base_idx);
diff --git a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
index 8219fc9025b49ad0de23edbcbcb5324bbf88b22b..22f71d62602cc984c0337f728298f7483c35bed9 100644
--- a/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
+++ b/tensorflow/core/kernels/eigen_spatial_convolutions_test.cc
@@ -1380,7 +1380,12 @@ static void PackRhsHelper(int iters,
                           /* Filter (kernel) dimensions: */
                           int filter_count, int filter_cols, int filter_rows,
                           /* Input strides: */
-                          int col_strides, int row_strides) {
+                          int col_strides, int row_strides,
+                          /* Block dimensions: */
+                          Index block_rows, Index block_cols) {
+  // Set random seed for benchmark repeatability.
+  srand(12345);
+
   tensorflow::testing::UseRealTime();
   tensorflow::testing::StopTiming();
 
@@ -1508,10 +1513,6 @@ static void PackRhsHelper(int iters,
 
   PackRhsImpl pack_rhs;
 
-  // This is the typical size of the rhs block used in Tensor contractions.
-  const Index default_depth = 320;  // must be multiple of 8
-  const Index default_cols = 280;
-
   const Index packed_total_size = input_dims.TotalSize();
 
   tensorflow::testing::StartTiming();
@@ -1520,11 +1521,14 @@ static void PackRhsHelper(int iters,
         num_inputs == 1 ? 1 : internal::random<int>(0, num_inputs - 1);
 
     // Depth offset must be a multiple of 8 (float packet size with AVX2).
-    Index depth_offset = (internal::random<Index>(0, patch_size - 10) / 8) * 8;
+    Index depth_offset =
+        (patch_size > block_rows)
+            ? (internal::random<Index>(0, patch_size - 10) / 8) * 8
+            : 0;
     Index col_offset = internal::random<Index>(0, num_patches - 10);
 
-    Index depth = std::min(default_depth, patch_size - depth_offset);
-    Index cols = std::min(default_cols, num_patches - col_offset);
+    Index depth = std::min(block_rows, patch_size - depth_offset);
+    Index cols = std::min(block_cols, num_patches - col_offset);
 
     // Write packed data to random memory location to emulate cold caches.
     Index packed_size = depth * cols;
@@ -1538,20 +1542,37 @@ static void PackRhsHelper(int iters,
   tensorflow::testing::StopTiming();
 
   std::ostringstream stringStream;
-  stringStream << "patch: depth=" << patch_depth << " rows=" << patch_rows
-               << " cols=" << patch_cols << " num_patches=" << num_patches
+  stringStream << "patch: " << patch_rows << "x" << patch_cols << " D"
+               << patch_depth << "; num_patches=" << num_patches
                << " patch_size=" << patch_size << " num_inputs=" << num_inputs;
   tensorflow::testing::SetLabel(stringStream.str());
 }
 
-#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW) \
-  BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW##_s##SH##x##SW
-
-#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW)                          \
-  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW)(int iters) { \
-    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW);                   \
-  }                                                                         \
-  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW))
+// -------------------------------------------------------------------------- //
+// Macro argumentnames:
+//    N: batch size
+//    H: height
+//    W: width
+//    C: input channels
+//   FC: filter channles
+//   FH: filter height
+//   SH: stride in height dimensions
+//   SW: stride in width dimensions
+//   BR: block rows
+//   BC: block cols
+
+#define BM_CONCAT(a, b) a##b
+
+#define BM_NAME(prefix, N, H, W, C, FC, FH, FW, SH, SW, BR, BC)           \
+  BM_CONCAT(BM_##prefix##_##N##_##H##x##W##_IC##C##_FC##FC##_##FH##x##FW, \
+            _s##SH##x##SW##_B##BR##x##BC)
+
+#define BM_PackRhs(N, H, W, C, FC, FH, FW, SH, SW, BR, BC)         \
+  static void BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, \
+                      BC)(int iters) {                             \
+    PackRhsHelper(iters, N, H, W, C, FC, FH, FW, SH, SW, BR, BC);  \
+  }                                                                \
+  BENCHMARK(BM_NAME(PackRhs, N, H, W, C, FC, FH, FW, SH, SW, BR, BC))
 
 // Number of input channel (input depth) it equal to the number of patch
 // channels (patch depth).
@@ -1563,14 +1584,16 @@ BM_PackRhs(/*batch*/ 32,        //
            /*channels*/ 32,     //
            /*num_filters*/ 64,  //
            /*filter*/ 5, 5,     //
-           /*stride*/ 1, 1);
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
 
 BM_PackRhs(/*batch*/ 32,        //
            /*image*/ 64, 64,    //
            /*channels*/ 32,     //
            /*num_filters*/ 64,  //
            /*filter*/ 5, 5,     //
-           /*stride*/ 2, 2);
+           /*stride*/ 2, 2,     //
+           /*block*/ 256, 56);
 
 // Slow path: input channel dimension is not the multiple of the packet size.
 BM_PackRhs(/*batch*/ 32,        //
@@ -1578,12 +1601,48 @@ BM_PackRhs(/*batch*/ 32,        //
            /*channels*/ 30,     //
            /*num_filters*/ 64,  //
            /*filter*/ 5, 5,     //
-           /*stride*/ 1, 1);
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
 
 BM_PackRhs(/*batch*/ 32,        //
            /*image*/ 64, 64,    //
            /*channels*/ 30,     //
            /*num_filters*/ 64,  //
            /*filter*/ 5, 5,     //
-           /*stride*/ 2, 2);
+           /*stride*/ 2, 2,     //
+           /*block*/ 256, 56);
+
+// Slow path with input channel dimension smaller than the packet size.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 256, 256,  //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 8, 8,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 256, 56);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 256, 256,  //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 8, 8,     //
+           /*stride*/ 2, 4,     //
+           /*block*/ 256, 56);
+
+// Short and wide block with small input channel dimension.
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 3, 3,     //
+           /*stride*/ 1, 1,     //
+           /*block*/ 36, 432);
+
+BM_PackRhs(/*batch*/ 32,        //
+           /*image*/ 64, 64,    //
+           /*channels*/ 4,      //
+           /*num_filters*/ 16,  //
+           /*filter*/ 3, 3,     //
+           /*stride*/ 2, 2,     //
+           /*block*/ 36, 432);
 }  // namespace Eigen
diff --git a/tensorflow/core/kernels/fractional_avg_pool_op.cc b/tensorflow/core/kernels/fractional_avg_pool_op.cc
index 135d0023458b1ef393ab0bc296dc07310347e7ff..61234479eac086c545c1457a743fb0da9db0c8d6 100644
--- a/tensorflow/core/kernels/fractional_avg_pool_op.cc
+++ b/tensorflow/core/kernels/fractional_avg_pool_op.cc
@@ -223,7 +223,7 @@ class FractionalAvgPoolGradOp : public OpKernel {
     // Once we figure out the original contributors, we just need to evenly
     // divide the value of this element among these contributors.
     //
-    // Internally, we divide the out_backprop tensor and store it in a temparary
+    // Internally, we divide the out_backprop tensor and store it in a temporary
     // tensor of double type. And cast it to the corresponding type.
     typedef Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>>
         ConstEigenMatrixMap;
diff --git a/tensorflow/core/kernels/function_ops.cc b/tensorflow/core/kernels/function_ops.cc
index d5c09712b251346830528181dd2433b2e5935548..90f94ee4a06519eca064abf9b1e0d60f1f181188 100644
--- a/tensorflow/core/kernels/function_ops.cc
+++ b/tensorflow/core/kernels/function_ops.cc
@@ -69,6 +69,7 @@ void RetvalOp::Compute(OpKernelContext* ctx) {
 }
 
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kArgOp).Device(DEVICE_CPU), ArgOp);
+REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceArgOp).Device(DEVICE_CPU), ArgOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kRetOp).Device(DEVICE_CPU), RetvalOp);
 REGISTER_SYSTEM_KERNEL_BUILDER(Name(kDeviceRetOp).Device(DEVICE_CPU), RetvalOp);
 
@@ -105,6 +106,8 @@ TF_CALL_bool(REGISTER) REGISTER_KERNEL_BUILDER(Name(kArgOp)
                                                    .HostMemory("output")
                                                    .TypeConstraint<int32>("T"),
                                                ArgOp);
+REGISTER_KERNEL_BUILDER(
+    Name(kDeviceArgOp).Device(DEVICE_GPU).TypeConstraint<int32>("T"), ArgOp);
 #undef REGISTER
 
 REGISTER_KERNEL_BUILDER(Name(kArgOp)
@@ -119,6 +122,9 @@ REGISTER_KERNEL_BUILDER(Name(kArgOp)
                             .TypeConstraint<string>("T"),
                         ArgOp);
 
+REGISTER_KERNEL_BUILDER(
+    Name(kArgOp).Device(DEVICE_GPU).TypeConstraint<Variant>("T"), ArgOp);
+
 #define REGISTER(type)     \
   REGISTER_KERNEL_BUILDER( \
       Name(kRetOp).Device(DEVICE_GPU).TypeConstraint<type>("T"), RetvalOp);
diff --git a/tensorflow/core/kernels/function_ops.h b/tensorflow/core/kernels/function_ops.h
index 0f51eca16380acf98664a6ca255c64691ee57691..9ddd49560392dd4c313877f819c13d2a6b0079ed 100644
--- a/tensorflow/core/kernels/function_ops.h
+++ b/tensorflow/core/kernels/function_ops.h
@@ -22,6 +22,7 @@ limitations under the License.
 namespace tensorflow {
 
 static const char* const kArgOp = FunctionLibraryDefinition::kArgOp;
+static const char* const kDeviceArgOp = FunctionLibraryDefinition::kDeviceArgOp;
 static const char* const kRetOp = FunctionLibraryDefinition::kRetOp;
 static const char* const kDeviceRetOp = FunctionLibraryDefinition::kDeviceRetOp;
 
diff --git a/tensorflow/core/kernels/functional_ops.cc b/tensorflow/core/kernels/functional_ops.cc
index 1529d2e3368266174d3098bad5f4b35bb83b502e..5ecb203cbc7296d75f6a0a68a2189d7bf018c7fe 100644
--- a/tensorflow/core/kernels/functional_ops.cc
+++ b/tensorflow/core/kernels/functional_ops.cc
@@ -526,21 +526,40 @@ REGISTER_KERNEL_BUILDER(Name("For")
                             .HostMemory("delta"),
                         ForOp);
 
+// FakeParamOp allocates a tensor with a shape conforming to the expected
+// output. This is necessary if the value will be stored in a while_loop's
+// TensorList. The output is otherwise not expected to be consumed by anything
+// else.
 class FakeParamOp : public OpKernel {
  public:
   explicit FakeParamOp(OpKernelConstruction* context) : OpKernel(context) {
-    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
+    DataType dtype;
+    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype));
+
+    // Set shape to the specified shape, setting unknown dimensions to empty.
+    // If the specified shape is unknown, leave as an empty shape.
+    TensorShape shape;
+    PartialTensorShape partial_shape;
+    OP_REQUIRES_OK(context, context->GetAttr("shape", &partial_shape));
+    if (!partial_shape.unknown_rank()) {
+      for (int64 d : partial_shape.dim_sizes()) {
+        shape.AddDim(d == -1 ? 0 : d);
+      }
+    }
+
+    // Create a persistent tensor that we can repeatedly return to save memory.
+    // TODO(b/119612758): add optimization to prevent sending this across
+    // devices on each Compute() call.
+    OP_REQUIRES_OK(context, context->allocate_persistent(
+                                dtype, shape, &value_handle_, nullptr));
   }
 
   void Compute(OpKernelContext* context) override {
-    // We must produce something (only Switch and Recvs are allowed to output
-    // dead tensors). This output is not expected to be consumed by anything.
-    Tensor output_tensor(dtype_, TensorShape({}));
-    context->set_output(0, output_tensor);
+    context->set_output(0, *value_handle_.AccessTensor(context));
   }
 
  private:
-  DataType dtype_;
+  PersistentTensor value_handle_;
 };
 
 REGISTER_KERNEL_BUILDER(Name("FakeParam").Device(DEVICE_CPU), FakeParamOp);
diff --git a/tensorflow/core/kernels/fused_batch_norm_op.cc b/tensorflow/core/kernels/fused_batch_norm_op.cc
index d89f1592bd72d0f349b6f8a7eca64fc4d046050a..dbd3bb05dbf1a310ea9c5a5b1003474e33825133 100644
--- a/tensorflow/core/kernels/fused_batch_norm_op.cc
+++ b/tensorflow/core/kernels/fused_batch_norm_op.cc
@@ -248,7 +248,7 @@ struct FusedBatchNorm<GPUDevice, T, U> {
                   Tensor* saved_inv_var, TensorFormat tensor_format,
                   bool is_training) {
     auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream avalible"));
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available"));
 
     const int64 batch_size = GetTensorDim(x, tensor_format, 'N');
     const int64 channels = GetTensorDim(x, tensor_format, 'C');
@@ -389,7 +389,7 @@ struct FusedBatchNormGrad<GPUDevice, T, U> {
                   Tensor* scale_backprop, Tensor* offset_backprop,
                   TensorFormat tensor_format) {
     auto* stream = context->op_device_context()->stream();
-    OP_REQUIRES(context, stream, errors::Internal("No GPU stream avalible"));
+    OP_REQUIRES(context, stream, errors::Internal("No GPU stream available"));
 
     const int64 batch_size = GetTensorDim(x, tensor_format, 'N');
     const int64 channels = GetTensorDim(x, tensor_format, 'C');
diff --git a/tensorflow/core/kernels/fuzzing/BUILD b/tensorflow/core/kernels/fuzzing/BUILD
index 6f3a49805ce769645ccc113a59360beab27e8403..2d8b734535c964bf4162838baa8ad65af4790423 100644
--- a/tensorflow/core/kernels/fuzzing/BUILD
+++ b/tensorflow/core/kernels/fuzzing/BUILD
@@ -18,15 +18,23 @@ cc_library(
 )
 
 load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_ops_fuzz_target_lib")
+load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_oss_fuzz_corpus")
+load("//tensorflow/core/kernels/fuzzing:tf_ops_fuzz_target_lib.bzl", "tf_oss_fuzz_dict")
 
 tf_ops_fuzz_target_lib("identity")
 
 tf_ops_fuzz_target_lib("string_to_number")
 
+tf_oss_fuzz_corpus("string_to_number")
+
 tf_ops_fuzz_target_lib("string_split")
 
+tf_oss_fuzz_corpus("string_split")
+
 tf_ops_fuzz_target_lib("string_split_v2")
 
+tf_oss_fuzz_corpus("string_split_v2")
+
 tf_ops_fuzz_target_lib("encode_base64")
 
 tf_ops_fuzz_target_lib("decode_base64")
@@ -35,10 +43,20 @@ tf_ops_fuzz_target_lib("encode_jpeg")
 
 tf_ops_fuzz_target_lib("decode_bmp")
 
+tf_oss_fuzz_corpus("decode_bmp")
+
 tf_ops_fuzz_target_lib("decode_png")
 
+tf_oss_fuzz_corpus("decode_png")
+
+tf_oss_fuzz_dict("decode_png")
+
 tf_ops_fuzz_target_lib("decode_wav")
 
+tf_oss_fuzz_corpus("decode_wav")
+
+tf_oss_fuzz_dict("decode_wav")
+
 tf_ops_fuzz_target_lib("example_proto_fast_parsing")
 
 tf_ops_fuzz_target_lib("parse_tensor_op")
@@ -46,3 +64,7 @@ tf_ops_fuzz_target_lib("parse_tensor_op")
 tf_ops_fuzz_target_lib("decode_compressed")
 
 tf_ops_fuzz_target_lib("decode_json_example")
+
+tf_oss_fuzz_corpus("decode_json_example")
+
+tf_oss_fuzz_dict("decode_json_example")
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187
new file mode 100644
index 0000000000000000000000000000000000000000..7a1b8966c21c74f1e0a3f3af4240551bc10ef36c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/012e3ad384a4a1165f8498b5c94ba0d32a73e187 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b
new file mode 100644
index 0000000000000000000000000000000000000000..24f658497f182188a6cbf431e1ff810ca268e709
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/055d77f7810048caa28323f6eb552a53d156040b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d
new file mode 100644
index 0000000000000000000000000000000000000000..a2d8f84cab77d805ae2582fb551fe2f074b2cfbb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/131e251bfb82c681cb075d32b99f18fceaca115d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526
new file mode 100644
index 0000000000000000000000000000000000000000..6206dab82b1e4a3d96ee4ae276006183349aa8e6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/1399ab0bd9f2c91d270cb43251bdc5729bef3526 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de
new file mode 100644
index 0000000000000000000000000000000000000000..bcc7f481ae90d0075912bf80469a18a0fd27682e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/16a6ce88f66d2e9686c8354cad8ba915cf0c11de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad
new file mode 100644
index 0000000000000000000000000000000000000000..92bddb6dca98bcc26ab9257bf568a462b5db8a36
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/185097ed0588195164619ea930ddd8274a5f32ad differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c
new file mode 100644
index 0000000000000000000000000000000000000000..082b1e5752a95bed0941c60f68e9efb83dcea73e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/27711a87e06a50c81571c27c3aa403a6ad5dc55c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2
new file mode 100644
index 0000000000000000000000000000000000000000..af1091428d59645a8158218a953fbd60e0c463d8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/298c3787ad1722b22569cbc405c464d2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922
new file mode 100644
index 0000000000000000000000000000000000000000..fd711cb0e51bb10d84902b03d3a27c086ec8dfbc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/2b95ba6d8141ce0d29ff279770903922 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb
new file mode 100644
index 0000000000000000000000000000000000000000..6748826bd88a8ff6616536e0badb9b5d5184d9cb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/321fb3d758b86e37fc340ae2b09b8ed9fa73a4cb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542
new file mode 100644
index 0000000000000000000000000000000000000000..9cf1b9d3af82244e2f1f5f4e9d696fa006a9416f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/331a98b4e4c87840efea69223766ebd0e1736542 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883
new file mode 100644
index 0000000000000000000000000000000000000000..932e78b3547e36ee7892f97135b7ac16d03bdff8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/352d73f841223ecb630b5836585d2ba7b0f9d883 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc
new file mode 100644
index 0000000000000000000000000000000000000000..89a090d74ee4af9f5352f3e954e41db64727bfcc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3a84f409d4c117edfdebc508cd23e8fc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1
new file mode 100644
index 0000000000000000000000000000000000000000..286949bc56a4fd135b6d645c2f6f6543fb54cefe
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/3ef5cc982c0b45f69a26fd0f7d376415fdebabd1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906
new file mode 100644
index 0000000000000000000000000000000000000000..8d5c7d136e51b0b71a574de271c1d076ad118ec7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/401c7de8e122018a0e17f57c93db7ee49ab0e906 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415
new file mode 100644
index 0000000000000000000000000000000000000000..f77ffec08653f23af6ec36d2bc3851d37a15a7ef
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/52fee71bb8c9c79068e1fe580677ad739a2d0415 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86
new file mode 100644
index 0000000000000000000000000000000000000000..f9af0697d53b40b1617eff3dba43955053de8bff
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57b11507813d5727b7789354d888eda83d5f3d86 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d
new file mode 100644
index 0000000000000000000000000000000000000000..109ab7948ff737251655b35d801b7a147107da71
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/57dff0fa53ee0ef24a43cca6ab0523bfdc1f720d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2
new file mode 100644
index 0000000000000000000000000000000000000000..bf9772902653f1b4bf258c4268937e18f9abbcf8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5c42d3df0dc400a7a4175b8d4eec6cc8ee2437b2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43
new file mode 100644
index 0000000000000000000000000000000000000000..cf7a78e9488a8fd55813862e78a360064dff19b7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5cca20637ae75fddad9370ee930837baef8aeb43 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09
new file mode 100644
index 0000000000000000000000000000000000000000..e5621aa3d1ba240abb65a521507762d520d1d4bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5d34bc9cef0c844b9c5ebe948145c4ca11b5ca09 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac
new file mode 100644
index 0000000000000000000000000000000000000000..eea39d6b2f856ab8f1debf18d4d99543076a1b9a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e162fe883bd12fb1c4131d4e0c979a12bd15eac differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092
new file mode 100644
index 0000000000000000000000000000000000000000..fabcbdbe3d4b0e2b8f1de3bf0c7a388369d11547
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/5e83f8faab9c1a51a33d5e29edbb9dcec23c6092 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef
new file mode 100644
index 0000000000000000000000000000000000000000..8dfc17e8e05f948210d4ed34113b3517445c418e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/61b29dc2fcef7b6fbe3e0cc88769a7ef differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f
new file mode 100644
index 0000000000000000000000000000000000000000..141e331ad0148af9b17e2824d8b64a442ac18b38
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6361eca190157ece389665ee523ccc3aefcd957f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f
new file mode 100644
index 0000000000000000000000000000000000000000..567c645c00ee0e47ca2a840c7a115f68ae889e3d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/65150515ab3b11d657519b22bb887d74e94b2d7f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394
new file mode 100644
index 0000000000000000000000000000000000000000..e1cdb4e5bf9d55e87b4495ab8db931f269e22d61
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/656f38ef6dcd58c6a909d61db11f777def69c394 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7
new file mode 100644
index 0000000000000000000000000000000000000000..73e53b460ec81a8accdf2b6601dfcd8679465b6a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/66e0d2cafd592bf9d61ad900fade8ee530d5f3d7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c
new file mode 100644
index 0000000000000000000000000000000000000000..f29b9b217184c4cc8496f1f711caec0c7c632ca4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/6b5b42cb105a2c4c5fd6034e9885cbe457f1b50c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f
new file mode 100644
index 0000000000000000000000000000000000000000..3b0c338ce203a0fd72936ae40120f140aae712b4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/722ed0197cb92ecbf9745edb38275e7a9aaf322f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1
new file mode 100644
index 0000000000000000000000000000000000000000..61dd2583cd6e9c7679c2c30ff6d35df09264fd91
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/77bdd2efdf328366cbbf3c5688768dc0a88d02b1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166
new file mode 100644
index 0000000000000000000000000000000000000000..907ec3b5a3ba35bf86b8988ad6d25592e9015c17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7841bfa002c05c61d5a5d9241f214cc17a336166 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48
new file mode 100644
index 0000000000000000000000000000000000000000..7e3b1990ad7c09a19fcef334aee9545a31d05380
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7899e22fc83f6be28e9130c4a1c91a48 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119
new file mode 100644
index 0000000000000000000000000000000000000000..0329a2826a89961de414d0e3c51a7c81ea02d8ef
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/7dddccaebd16ae0c26daeffc42df50f529891119 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7
new file mode 100644
index 0000000000000000000000000000000000000000..7e9ef4b3dd47fbbdf0d4087a4d4fdf5323c3bf23
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8157442eee4bbfdd9716e264b11085d61a9955b7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0
new file mode 100644
index 0000000000000000000000000000000000000000..6390e6b2b30b9e4512b191b72f0bed8d4005c0cd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/81ff28ed63d5435ddc4c8771dd5d40aa658cbbe0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0
new file mode 100644
index 0000000000000000000000000000000000000000..0084212a656bd4f97a2ba22f4d7ced7f8746946f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/820c8c0d33c18f6c4d9edd314e91289186931ad0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981
new file mode 100644
index 0000000000000000000000000000000000000000..a36c88daf011cf7809049396476863fd66d5dc2d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/849e9d7cee1c52105242327086997296e452b981 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2
new file mode 100644
index 0000000000000000000000000000000000000000..b5d34609b88399ffe7198acfb06dac91fd50d4cd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/84ddb92c63e0fad7018f6069daf8779ce11501e2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70
new file mode 100644
index 0000000000000000000000000000000000000000..a9ef2b5a50b36814ecaada8813a5f4b056f17ce8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/86bc3d5dbb9313137502080e58551edd2e649c70 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14
new file mode 100644
index 0000000000000000000000000000000000000000..83de83f4eb59e2cfea819e1fdee97b3f6aec525c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/87d94d88fe29d277c76e1a52042b02c092d5ae14 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16
new file mode 100644
index 0000000000000000000000000000000000000000..fa47e75a6323a989daa16e2a482a44f9ab2f2705
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/8c4646f3357945c4e19a59ff79fffe3c874dbf16 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080
new file mode 100644
index 0000000000000000000000000000000000000000..e739e858b860e774e3cdf9142de13206f508d2ea
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/90632bc6dee4eb836f3d7db1d16446a9c8510080 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244
new file mode 100644
index 0000000000000000000000000000000000000000..c989a76df155ff09d69d2a765d6e097928b8344f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/94d06016aa949e8e7203217e4cc6625ded7f4244 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a
new file mode 100644
index 0000000000000000000000000000000000000000..6ff64a7d2bab262cee2e8b6bebf65af9aafda630
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9875819b9e5783e7489c29a81cc9d4279209956a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2
new file mode 100644
index 0000000000000000000000000000000000000000..2d1d8576a29b050160e63a142b48927db9540b8d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9c1cc734114b29aac6c51782d5c17e9dbe1faca2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62
new file mode 100644
index 0000000000000000000000000000000000000000..a5865c60a614829812b37c8439f6bfc00f7198c0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9d2961871eeb201ef8a6f5503d8a8b62 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d
new file mode 100644
index 0000000000000000000000000000000000000000..8f712aea9ff2d6e930f0269002f9e9d1a0caa016
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/9f39e11cdd88344a4894b678e5a04a810880064d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7
new file mode 100644
index 0000000000000000000000000000000000000000..b7ad89078b5fc1b96e811aa117cc11c7f4f0467b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a350588a6dabe4376a066aed44ef8786d8e752e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19
new file mode 100644
index 0000000000000000000000000000000000000000..173c941952bf798f796a9ef4c751d1e6e6e3a09c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a6101a79919d444e1fc50aefab5837c39e3f4a19 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5
new file mode 100644
index 0000000000000000000000000000000000000000..644560ced96b56c960032d7df08b3730053bc3ea
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/a9c8793f8fb063bec839ee1280406fe5396545e5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e
new file mode 100644
index 0000000000000000000000000000000000000000..f1826b06e88c9ca21599abaee7481ae2f07ee840
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ad4e9d2234e8599bdf12607c6b8cab4edae82c4e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd
new file mode 100644
index 0000000000000000000000000000000000000000..2da6be376f2af7b9cefc00d5b0d096c7e68fea69
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b90b6830917919e94186d312f06481bd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b
new file mode 100644
index 0000000000000000000000000000000000000000..b84b57bb53177764b84b10a75624dfe949ddbe81
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/b98fd4cb1d7031240414301c19b03097c0035c6b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb
new file mode 100644
index 0000000000000000000000000000000000000000..4ee9cbdfc3defea2195af8ccd749fc0a326a7d00
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ba976fcdb4daf092ef17ce43bf2b78d9d8bc2aeb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d
new file mode 100644
index 0000000000000000000000000000000000000000..af1091428d59645a8158218a953fbd60e0c463d8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/bc112b571eafee0f5a031f3c9cce6244216d128d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d
new file mode 100644
index 0000000000000000000000000000000000000000..996e8c826cbd548e4567bfc600496ae31084c288
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c42b981c28a1715c375050f6fcf53f1d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9
new file mode 100644
index 0000000000000000000000000000000000000000..4863878ca02ca42e550f521cb1ba5b8bd7046ecc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c6049874b33eadb016fccf0c5fa66e556ae069b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3
new file mode 100644
index 0000000000000000000000000000000000000000..30aacc2f98820e139b1495c9c1527f9119dec97b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8697bf2369f6ab85f501376c4d93bb8a56974a3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6
new file mode 100644
index 0000000000000000000000000000000000000000..b831633f02b50870db441110db29e03331decdaf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/c8daf283e0aef2fd7b630c0430e05dc28f24ecf6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19
new file mode 100644
index 0000000000000000000000000000000000000000..ff492d29d76ce94f0fb8db9a0b6c481c407b205b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/cacff56e1af4b8fde912822da06b10fb8c545a19 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104
new file mode 100644
index 0000000000000000000000000000000000000000..ea776fb0a94b79362822f76bfa1f0d9364a09c4a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ce4dcc22b1d595c49a25121c0b580104 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a
new file mode 100644
index 0000000000000000000000000000000000000000..b9dc5d0f4ee4edb8d45ae3e5ed182f57e5d4e5db
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d0cd71dbf039fd64cf42eff30da92a71a919226a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9
new file mode 100644
index 0000000000000000000000000000000000000000..b17294ec90a31bc77f439f7baae0d897bf860ec4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d5ce626ac3264bed6af5580e341a89406857cbb9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21
new file mode 100644
index 0000000000000000000000000000000000000000..9cc65607fecd34abe27d5e0956e9fa462f0c48f1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d77ada02e9bc8c24b2711eca6a8f52ae356bfc21 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484
new file mode 100644
index 0000000000000000000000000000000000000000..531427a99c0f1bbef21f35fa90027a7d662b39ec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/d7eb9c5a0f9803df4c00390793b8ab57bd7c9484 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33
new file mode 100644
index 0000000000000000000000000000000000000000..5a5c1c30eb0d70bea1562137328ac2d74871e43d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dc1efccdeec17e151a1ec8228c09ab61c3040b33 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f
new file mode 100644
index 0000000000000000000000000000000000000000..44d2ebfb3d0f37c6355fe4e44c27f7457ebee63c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/dcea22c66c60088165a2f1772036473f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e
new file mode 100644
index 0000000000000000000000000000000000000000..f9a8f33443d84d5494b23e5b3317015f10b0496a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/de539ae7442fa05dafcfe1a021f0186ef74a2b0e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215
new file mode 100644
index 0000000000000000000000000000000000000000..71bf61cebe4e69dc71714916d804d979824a7d0b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2306b1d6b88d0ccc4e2c3a9edb07462a5a32215 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd
new file mode 100644
index 0000000000000000000000000000000000000000..1bad15905ffd4776122e34df11873b390abee72a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e2778da0240fdd15ef5844905d81c4e05f34a8bd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794
new file mode 100644
index 0000000000000000000000000000000000000000..f9d9de9c9c21f18a6eb88afb89f4450d4fd6771f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/e6642e9266875f9d908942e534bf898103a2c794 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120
new file mode 100644
index 0000000000000000000000000000000000000000..782a0925210623c0d6e33ba7c885435bb19654d3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ec6cdb929c08d8daf2bd7fc185fbf4d787b45120 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9
new file mode 100644
index 0000000000000000000000000000000000000000..efd9312d94dde30707099125b0280be21f6b2101
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ed8636357f79439b6a03eb14469b686cc401a1c9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d
new file mode 100644
index 0000000000000000000000000000000000000000..03e09e28193a513347433ee41f005f0926329d20
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ee313e9acecb5c688ce8c9bb10e70e136fbb9c6d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c
new file mode 100644
index 0000000000000000000000000000000000000000..f8688710452dafe40fb247674471ea9f642b3778
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ef689af320e7d9e22231109faae2e8149cb86e1c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4
new file mode 100644
index 0000000000000000000000000000000000000000..20efec0d1e19ea1f2b317f82c08f0ec5d41a66ca
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/fda6b9a9f6ffdf4765c00465619c7ceb3f7db2e4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54 b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54
new file mode 100644
index 0000000000000000000000000000000000000000..e24c09dacce1b52dc3737b1d921920846b627b2f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_bmp/ffe829bb0adac20d9c0756f68a22d1255e4fdb54 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
new file mode 100644
index 0000000000000000000000000000000000000000..06fd8044808ff9cae8663cec970645bd22bf8ab8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/013a29ea098a178f8a36741c9fd91144
@@ -0,0 +1,48 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie: {
+        bytes_list: {
+          value: "VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",
+          value: "RmlnaHQgQ2x1Yg=="
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion: {
+        bytes_list: {
+          value: "SW5jZXB0aW9u"
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
new file mode 100644
index 0000000000000000000000000000000000000000..4ae686974e2be25e49e3a25064dcfdfb91a41b5b
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/0875575fb76d630ccb19c5da8aab66b2
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:[29.0,2,3,4]}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,2,3,4,5]}}},feature:{purchase_price:{float_list:{value:[9.99,8.88,7.77,6.66,5.55],value:[4.44,3.33,2.22,1.11],value:[1.11,2.22,3.33],value:[4.44,5.55],value:0}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
new file mode 100644
index 0000000000000000000000000000000000000000..150f8710f7dc094ad1189f1d3c659910d2e1b3e2
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/7e7f58fc443a11a0a2c5d9b643b7e99b
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[9.0,9.7]}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
new file mode 100644
index 0000000000000000000000000000000000000000..fcfdfedd1b090871954e1d9b99d90480f6082dae
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/849a23936269a261c0370b5e9abe2416
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:[[[[[[9.0,9.7]]]]]],value:[[[9.0,-9.2]]]}}},feature:{suggestion_purchased:{float_list:{value:[1.0,[2,3,[4,5,6,[7,8,9,0]]]]}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
new file mode 100644
index 0000000000000000000000000000000000000000..7c9981d482fcf5a2a138cc2583ea0dca9589e756
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/85282c1696d98b9843ce3e8bd1cd899f
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
new file mode 100644
index 0000000000000000000000000000000000000000..a1315bb8f9363858c6d79066cac3e93dc40f1602
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/90388b9c8093d8adedad0644b618da87
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [[[[[[9.0,9.7]]]]]],
+          value: [[[9.0, -9.2]]]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, [2, 3, [4, 5, 6, [7, 8, 9, 0]]]]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
new file mode 100644
index 0000000000000000000000000000000000000000..d4f9494bbd3f945ed6926f8669c9fab62ae3ede6
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/9fa2f86ea6d3ade36e961247c3026f8d
@@ -0,0 +1,33 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: 9.0,
+          value: 9.7
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
new file mode 100644
index 0000000000000000000000000000000000000000..e8ba267eb27b84fb427f33dea60623b8dace79cf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/c4f18ca60a84e9869a28faf6f65dc758
@@ -0,0 +1,32 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: 29.0
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: 1.0
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: 9.99
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
new file mode 100644
index 0000000000000000000000000000000000000000..3428a1e0fcd730a5e0bce03f0dfd1d5fec90ea74
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/d456ee029700adef5d28438593010223
@@ -0,0 +1 @@
+{features:{feature:{age:{float_list:{value:29.0}}},feature:{movie:{bytes_list:{value:"VGhlIFNoYXdzaGFuayBSZWRlbXB0aW9u",value:"RmlnaHQgQ2x1Yg=="}}},feature:{movie_ratings:{float_list:{value:9.0,value:9.7}}},feature:{suggestion:{bytes_list:{value:"SW5jZXB0aW9u"}}},feature:{suggestion_purchased:{float_list:{value:1.0}}},feature:{purchase_price:{float_list:{value:9.99}}}}}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2 b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
new file mode 100644
index 0000000000000000000000000000000000000000..ef0923c4500ecc3c6e8f01a87d1109066a752f48
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/decode_json_example/e9f0ff6ee8d691ae69d2ecb4710030a2
@@ -0,0 +1,36 @@
+{
+  features: {
+    feature: {
+      age: {
+        float_list: {
+          value: [29.0, 2, 3, 4]
+        }
+      }
+    },
+    feature: {
+      movie_ratings: {
+        float_list: {
+          value: [9.0,9.7]
+        }
+      }
+    },
+    feature: {
+      suggestion_purchased: {
+        float_list: {
+          value: [1.0, 2, 3, 4, 5]
+        }
+      }
+    },
+    feature: {
+      purchase_price: {
+        float_list: {
+          value: [9.99, 8.88, 7.77, 6.66, 5.55],
+          value: [4.44, 3.33, 2.22, 1.11],
+          value: [1.11, 2.22, 3.33],
+          value: [4.44, 5.55],
+          value: 0
+        }
+      }
+    }
+  }
+}
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3
new file mode 100644
index 0000000000000000000000000000000000000000..9dbc560e1e4b50f98060fdad36ae0f65f0c0c92b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/010dc3d4b05288fcc40de2721052b3dc699f1cb3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9
new file mode 100644
index 0000000000000000000000000000000000000000..fab6d15ebe37176ddc7f3868f576b15d27518d00
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0555cd5e9d99629819cc985285f80da0f00be1e9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f
new file mode 100644
index 0000000000000000000000000000000000000000..7918406ac4bc04196bf07a3e8804b0dbc946eaf3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0a0352aa168803ff65455792d9f6ee555c3e7c3f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381
new file mode 100644
index 0000000000000000000000000000000000000000..c294b3180f7d1c315466efacee0167a1685ba68f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/0ed54162df93ef8d00f993ce6b59ba422903d381 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f
new file mode 100644
index 0000000000000000000000000000000000000000..0eb3eff90d7fa0c03fc827b65da2e21050f27ebf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1547b448171c700613c3946d730de496c9b9863f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba b/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba
new file mode 100644
index 0000000000000000000000000000000000000000..deb7b0a784aa876e880755bca4c2f96c1167fa27
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/17859046cbe4ac598a645173d679ce2a52c6afba differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2
new file mode 100644
index 0000000000000000000000000000000000000000..2b9721d742ad143de0881894c52407af1865b79b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1df76c07817fbc3653a26f34d97658e9973627c2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443
new file mode 100644
index 0000000000000000000000000000000000000000..e0c330f7f4ee396723fedbb530197cd890045730
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/1f0717f8856d7782e3ab7992d3a72d783a018443 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a
new file mode 100644
index 0000000000000000000000000000000000000000..41fc2fe9516ecaf369c9093c5d815a709963c2b1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/23b911e4ce936def88bc9a46b8b433c0e83fba2a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151
new file mode 100644
index 0000000000000000000000000000000000000000..8b5755c4bc513368a2605e05c8b3117feadab086
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/25592201c3edff0578dbdac6b0e4f2be109ce151 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73
new file mode 100644
index 0000000000000000000000000000000000000000..c4d2d8d7f1adc091d4340f3cb45a304df4617a17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/266fd8495e0b8eb64387c1a62264185e061fee73 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e b/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e
new file mode 100644
index 0000000000000000000000000000000000000000..1608e5b08373fdbfb62f83c5a708e76e3dcd3db0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/27f178cf415b4ff8671131ddf1d042dafac2fb3e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f
new file mode 100644
index 0000000000000000000000000000000000000000..12e4140981d115312f32b710bb2882d5b1ede161
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2a0bdc4d9cc5ea5bb21dd256d6ac96075376a94f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938
new file mode 100644
index 0000000000000000000000000000000000000000..ecf597f7365bf06492af4d53bca4166f5ecf744a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e5d25add6adc68e0457b358c7a34abf3d41c938 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544
new file mode 100644
index 0000000000000000000000000000000000000000..e5a18917e3744daf0f8d4eb5cbbb702daff21188
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e6c5b6a766dd5e9bd41eacfd0a36572bd2f7544 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176
new file mode 100644
index 0000000000000000000000000000000000000000..50be7f686b791d7aa08652159592311c728ec79f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2e9c935cf82f6ca640e9a9abc3c30a578ad46176 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826
new file mode 100644
index 0000000000000000000000000000000000000000..00eba4c39a92d546e7b4f6f77b18e9d3ddaec399
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/2fcf1ed4477f7eaee028f5b3f9edeb5f1a737826 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b
new file mode 100644
index 0000000000000000000000000000000000000000..af3afc499d820cdd966ec9a67d3ec6fb39b3e240
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3480713774f590908ca5dba16d121cdfb8fba62b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5
new file mode 100644
index 0000000000000000000000000000000000000000..02c187a49225947f8f20ee87b970783f93eaed76
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/39289afcec60d98802b333e0fbb1da4d7aed4ce5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9
new file mode 100644
index 0000000000000000000000000000000000000000..1cf24048f8ba10e68082ea717656c2c889ebf7f5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3adc488e21d4aca7bed9422f0241a42d0f93e7d9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173
new file mode 100644
index 0000000000000000000000000000000000000000..7f9c0c93ec9bc8f135e7f78010b275079e052b48
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3cbf274da522483dc991fad9df43a22ac4fb3173 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432
new file mode 100644
index 0000000000000000000000000000000000000000..f48cb4cd19a65487d555b2fdce297d638736edb7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3d840cdff7f5ad16fe8bcb985ed4946c03459432 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea
new file mode 100644
index 0000000000000000000000000000000000000000..df07889441d48fd938b74aac94e01b39dfa8f63e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3f1e6753c1fca958e859189857449746592158ea differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b
new file mode 100644
index 0000000000000000000000000000000000000000..5f9cec9ab5cb27973b8d8e71b47cf61601c0ee3d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/3fa4075993cb0f9bfa8eea785174a2038a69aa1b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a
new file mode 100644
index 0000000000000000000000000000000000000000..385b8b0c35936d3b9c99facefccb33835569470b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4023a373e977be58413e55350380310c5dd1fd6a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a
new file mode 100644
index 0000000000000000000000000000000000000000..22f1649adc45eae591f46c4b1a1e0db6bf0cc82e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/40caba69dce1cfc48e0e43184d2bfbc6daa4399a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b
new file mode 100644
index 0000000000000000000000000000000000000000..16c0c33b93dadec4fd326f2eff73a81d6c82f508
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41841e9561d8135945c1c1e55ab9e9a1e933653b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf
new file mode 100644
index 0000000000000000000000000000000000000000..6e44f2adc7fd1343044613a087c1c3efdaef5081
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/41d40f2d66fa43e34537385594ee9911e65deadf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312
new file mode 100644
index 0000000000000000000000000000000000000000..131004b8943e997ec7ba3212f3ba0d555c4ba4d2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/421bd39810b50309a71adb2dadc3b19f01a52312 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf b/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf
new file mode 100644
index 0000000000000000000000000000000000000000..1f2f90b3bc41f8daa93d2df651bf55b2a3ddc78b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/446c305b2c0665736f94fb2b62dbdef445eff0cf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a
new file mode 100644
index 0000000000000000000000000000000000000000..c671f7e4c0d7c8816a5fa4915b1fe0cbe9be98e1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/449cee952bb645f6f4241a6665d3c6028c073c7a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb
new file mode 100644
index 0000000000000000000000000000000000000000..c6f2f7052d2c169c84429800d57e79d8876e2a03
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/45520b07609978c5aa3516d803527438b93fbadb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e
new file mode 100644
index 0000000000000000000000000000000000000000..605ad2d2014ba7bcc2a0b94a9a0b9bacf055d7c5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/4da74a34bcede234b0415f77fbd87d70bf9a777e differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b
new file mode 100644
index 0000000000000000000000000000000000000000..bcacbe623f8ca1ac88de8d3dfdce160de04f2f29
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/51db5d31d2c5300d34831d9f23bcdd0aff9a998b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc
new file mode 100644
index 0000000000000000000000000000000000000000..2619e1d87638b8f37900a750dba325d075739ad1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5cde2a9167798cb77f10abbfb2640a5c357f99fc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249
new file mode 100644
index 0000000000000000000000000000000000000000..cb55f03ee184e66592d578050dc44a7bf70fe4ac
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/5e352fc10ac476cfbe1d755f092e069820223249 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52
new file mode 100644
index 0000000000000000000000000000000000000000..be952039a4ec42a60e188115aa9c27888c692b06
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/63661677dd1306cec4b5a565190e65adf2446e52 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9
new file mode 100644
index 0000000000000000000000000000000000000000..776adbe8d4bfd9e9b2532525a8f3187d90453d93
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/65887ed3db382aab1d9485c500f4401318d303b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7
new file mode 100644
index 0000000000000000000000000000000000000000..5bee1d494a574bf8a675933d3045150b32395478
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/67b5181f8f0644597e9bde539e8f083b5cacd0e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466
new file mode 100644
index 0000000000000000000000000000000000000000..ea3e0d2bd444e8fd484bd57facdf53f318762b86
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/74c9dcf7afee2a6cb1ab3a2c0de744d1b03c1466 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce b/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce
new file mode 100644
index 0000000000000000000000000000000000000000..521deb8d44ea8afc4d5ef285d41f1a42909e8ad5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/792181ca19e6ded261434e588bb7fc2a4816d4ce differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741
new file mode 100644
index 0000000000000000000000000000000000000000..885332337762ff92dd6a1ad00fe0a1694995a5c0
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/79f0e2a475487f8fa69e68c1cc947c5851bda741 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db
new file mode 100644
index 0000000000000000000000000000000000000000..cc011aedc9b5234cb26140a73dc93a85c0e1c6c3
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7e5fcdfeb557ce379ed96925c68505eaac0112db differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838
new file mode 100644
index 0000000000000000000000000000000000000000..7e3b6f564f0dc029247355c9a920245354f7cd1f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/7eec7530acf34b3a96fa9189783453999f7b6838 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1
new file mode 100644
index 0000000000000000000000000000000000000000..4828092a8aa538f619fe0238f01041ec24187f0c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80114bf9781bffc9db411413d83541d8deaaf7c1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3
new file mode 100644
index 0000000000000000000000000000000000000000..2ed0139989f994cef05524f606058404ebc7614a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/80425fb92bb86627e854892f23823fa804e5fdc3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945
new file mode 100644
index 0000000000000000000000000000000000000000..28925e3c80c2210e90461b82d950f63427bd5439
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/821cdd6eeb919a8dd7f35289abbd583828dd4945 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde b/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde
new file mode 100644
index 0000000000000000000000000000000000000000..9a5487fbfbe79979ddb06faf9d1b29a445db75eb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/83e1a31785285338b0ddb3334b0ed098e63dedde differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb
new file mode 100644
index 0000000000000000000000000000000000000000..c6c8b7c717310177d1034ec4e1643afbdc6994bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8a4c8100dedd0fb5f2a8b468c678f7ad8269deeb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4
new file mode 100644
index 0000000000000000000000000000000000000000..ce211f7cfd7e6c6fc33c30d8fdcaf63e401a9365
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/8ae8268c24dc866c1edb3826b93a1c75dbf74ff4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288
new file mode 100644
index 0000000000000000000000000000000000000000..3f12cb5f6595439978754328620f1f8f0fb43291
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/90f72038cc627f34f074ea72eadbba87a5e3e288 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3
new file mode 100644
index 0000000000000000000000000000000000000000..18ff654a110c137350e93dec975a12e17cc0761c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/92b67faee4a49df2cdbed785e27b4a1cddcfffa3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b
new file mode 100644
index 0000000000000000000000000000000000000000..eec341bf2bd623f50bbb5e393f4aaa245bf15fec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9463810467aacdc9923b2b20a2236116b760d75b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6
new file mode 100644
index 0000000000000000000000000000000000000000..776f17c6b218bd55751e740b35b3ad6f4097a288
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/94d7c96aea32ad41ce643d35b951a6d8990b81d6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5
new file mode 100644
index 0000000000000000000000000000000000000000..d7296ca03c4fa909de8fb18d8dda7f9cc6bafce7
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/98cc7e9fe87df914d89a0aef008930f27b3c26f5 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814
new file mode 100644
index 0000000000000000000000000000000000000000..82559facc80344c43d682b2ee9de939b517264bf
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/99172dfdb4f59aaced29c7681ac6e6ce8356e814 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160
new file mode 100644
index 0000000000000000000000000000000000000000..9e2eff2c4ec0c6ac3ba6e5558cd0a881b4dbb042
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9ae3b647d895af97fe872c0b1442df7b5b767160 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3
new file mode 100644
index 0000000000000000000000000000000000000000..6d17e06d470d501c0984f339d75de0f3ba59b4fd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/9d2b1d2121b0508a4fa8d1508adb9d05633fdac3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6
new file mode 100644
index 0000000000000000000000000000000000000000..ce8245f2da2025b6c37911a22179c45d06b90f5e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a335af37917ccf0c8b11bb884a3a74f3f1d2a7c6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709
new file mode 100644
index 0000000000000000000000000000000000000000..a980c777a8531fd113f9f54dee0fa00946f7a420
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a738609112d3a6772c50a71e2c3504ebc515b709 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d
new file mode 100644
index 0000000000000000000000000000000000000000..31a0fe82b99e564781cf22c7de167c25fc4d5e37
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/a8cecab5d917da5a4729632a7a18c564d7e1607d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621
new file mode 100644
index 0000000000000000000000000000000000000000..776f0b88dcca473db2eb6cb7ccfab99183260ce8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ade919ab2b4a458e806575c941dfe50ae3fd3621 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36
new file mode 100644
index 0000000000000000000000000000000000000000..ba6aa256542e6dc9a6d802784ab451ed6b93b141
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1251621a5eb5e7fda9cac9baead1c993a285c36 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb
new file mode 100644
index 0000000000000000000000000000000000000000..c4ec4ad4b9c4a38459c974e8718532bc8ff4d345
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b1516b78c3dfe77eeb554985fd7344c0478fbbcb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158
new file mode 100644
index 0000000000000000000000000000000000000000..5413efd933685ec8073cdccd087a5009439ffe17
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b41241740f5f8ad2c1d408f7bb6a313bd863c158 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc
new file mode 100644
index 0000000000000000000000000000000000000000..9cd72246b91b803783c6224511d825edc14c299f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/b799c8596523a7ebeb8e11ada08818c10f7eabfc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443
new file mode 100644
index 0000000000000000000000000000000000000000..66aac674cd26032159ef1a195cee7e70cd8511ec
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ba48d0521a111222dc95a3a997c7c92dea5f4443 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7
new file mode 100644
index 0000000000000000000000000000000000000000..eff793b204a409edde4d0b601cb5e43e3fe1d088
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c01457c6889fb1b597d308363a36412c0b7f90e7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7
new file mode 100644
index 0000000000000000000000000000000000000000..ba604969f6e0275a1f9db37b2755e9d693d60f5c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c82ebc0d6688d104af04fd20d6d3da591dc391f7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c
new file mode 100644
index 0000000000000000000000000000000000000000..c23fb3da9ce10cfab5eed3382e34f887bb117942
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/c9a03eb758dd84e954e3d70916e2311e8fd21f3c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7
new file mode 100644
index 0000000000000000000000000000000000000000..42f1f9a29827e46a86ee16dd90a8779e7bc927cb
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/cf892756b33578a54ab20044514e573328d2f1d7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9
new file mode 100644
index 0000000000000000000000000000000000000000..6b1183f4ffaab6f8b429eeb7ce3640bd3df887d6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d3bc3f158a63f1d50b474addd3f7b3d17f23e8e9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37
new file mode 100644
index 0000000000000000000000000000000000000000..2d8fd3f3f0d34782904ba50077e62b6c82ef4f13
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/d4906950aa9d60ad09dc0f5413c3d88080c3bc37 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65
new file mode 100644
index 0000000000000000000000000000000000000000..dc37f788a1acf12c1e252c880f26eb0a4f53809e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/da31578a8068bad65e1c7a3d06e8f543a2a0bc65 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b b/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b
new file mode 100644
index 0000000000000000000000000000000000000000..82c5120f464521aa502f234fbccf4ea3e5763b7b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/dd4a9b5d0740679c249fc884efc499433b29436b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb b/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb
new file mode 100644
index 0000000000000000000000000000000000000000..6daa5452a159945afb1d5e072f3f5daa3a1cc2d6
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/deea4ecc6f0b2a6d89fd25ff76762299f21602fb differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6
new file mode 100644
index 0000000000000000000000000000000000000000..306bbf464bf804c169e405550ecc2c68d33c7020
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e1040c7ffcb39915e0f539018c81f9798924cba6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392
new file mode 100644
index 0000000000000000000000000000000000000000..36487c0002294149201115e745543cc2363515d4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/e381dc85682cc33ad99f622b89d145b47f7d6392 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a
new file mode 100644
index 0000000000000000000000000000000000000000..ab99a8374aa80d829891b5670078d95b776a6957
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ea24498fc7a144fccc6f1665ebf7020df803dd1a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6
new file mode 100644
index 0000000000000000000000000000000000000000..63ff2676ae32de1b7293ec4a8c34b550b2c22ad5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/eaa5d677e797c07bac98c3c7051abad91852e7c6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26
new file mode 100644
index 0000000000000000000000000000000000000000..d2a4b9aafd7b745a50401a3ac94660ccc787f99a
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ed7871269315725535d8bffec7836c45a3fc5c26 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570
new file mode 100644
index 0000000000000000000000000000000000000000..c3b2bd442c1f47cc24db060eb85a842c26f35a30
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ee8460f4077064c5a2137075b48eba7d3db5c570 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6
new file mode 100644
index 0000000000000000000000000000000000000000..2422f7cb3fefac0242fc41a5b1a5aec6c0ffc42e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ef09f26e0ee61329f84a9f589629a865ae9ee0a6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a
new file mode 100644
index 0000000000000000000000000000000000000000..2ec0b7ae29cea7c8aa406470f5ad812102ba207b
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f477da4d7d8ff2066041e1dd5ee4e833b7111a1a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae
new file mode 100644
index 0000000000000000000000000000000000000000..5431f584cd14d3c2bb0f1cbea21b1ea8a8c505c5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/f8a379b2498a4eb452a85791a49adf065dab59ae differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046
new file mode 100644
index 0000000000000000000000000000000000000000..cf043445f4b62caa450c89b2c5ee44f84d837242
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/fe67bccb06f2174523943cc684518fcf1f7f8046 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628 b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628
new file mode 100644
index 0000000000000000000000000000000000000000..b1f8d120a3955c09b390dbaa8cab59ecf063b506
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_png/ff1e67d17c1c27ef0d97900d0ea276b563a64628 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a
new file mode 100644
index 0000000000000000000000000000000000000000..2a65e0e2559eea919bdbb661370c679fdbdd7e26
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/02cc44cdfec1d9d0d0c66c5a5f40d3d20e4c4c3a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186
new file mode 100644
index 0000000000000000000000000000000000000000..329af8a3b9d8928bc197d1361236a82fda888026
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/087e1d7fae1c1ddcbaa3b5f822a171ad15498186 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf
new file mode 100644
index 0000000000000000000000000000000000000000..8cda165c8c791847ddaffdecd62f503c42a2895f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/0f61c33027394a0f14d29dcd22f405cad943b7cf differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1
new file mode 100644
index 0000000000000000000000000000000000000000..4afe44dd91c3038dfa26d06730b2213c1d98dc50
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/10cdebea1659c21a0248f88654ae41f62786abf1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57
new file mode 100644
index 0000000000000000000000000000000000000000..9b2d29856d1483ced1b8ea3cf55f53182947451f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/126e68def9fd973a100e0f66cadf09448a716b57 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3
new file mode 100644
index 0000000000000000000000000000000000000000..b4370a16d58c0b2d52af8fc09ac8606fd7e88d7f
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1275d41ebf8788ce3a949352e4bc654b04012da3 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584
new file mode 100644
index 0000000000000000000000000000000000000000..d28721708d44c0b6227a14286b19f6cdf3be7010
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1a7f1c407fb3864ddb559f88f373a21d1be51584 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d
new file mode 100644
index 0000000000000000000000000000000000000000..611b38b71d541c68be9e6397dc4366c75a951532
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/1c3e1c91f187f6bcea86f172ff5bbbd955a9654d differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4
new file mode 100644
index 0000000000000000000000000000000000000000..fcf8360b277d0f051e111ce38d6fd4e33a2c17e1
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/300fe1e0a47543037cbf0243b6756c9aa48799c4 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad
new file mode 100644
index 0000000000000000000000000000000000000000..868e2672727ac1b7d505faaa22d79edee3252cf5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/31ec5b0134bedcfe283f4978e6e65b7d35d5d4ad differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1
new file mode 100644
index 0000000000000000000000000000000000000000..898584d96f6bfcd76ebed9dcde3b7df88442af78
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/4e7cbb27667bcfca92838aa8020749990013a9b1 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9
new file mode 100644
index 0000000000000000000000000000000000000000..a4994c208300b6cba20708367337667fbbd8dabc
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/585e469231d202812bfba8285fb30c8e31c857b9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de
new file mode 100644
index 0000000000000000000000000000000000000000..eb38d110153b75463f4af9c56bf11d5c6d425685
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/58eab6bc2386e2ef43fe4f55cb6ad3611399d5de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328
new file mode 100644
index 0000000000000000000000000000000000000000..6c534ab19cf330994c377b4e158ed98492880589
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/63448c6a9feb8c72b3e82af4d735ec2e62ddd328 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b
new file mode 100644
index 0000000000000000000000000000000000000000..e054ad5f14723fa1bd5829725e38de4a681cb3e8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/6874d5b1c7a64b596c61f24877d422e89bebe58b differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f
new file mode 100644
index 0000000000000000000000000000000000000000..3be6a61cbab76ee7142caae58d99695ac573a46d
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7501f79cb067da108020579ed654349c7933d22f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd
new file mode 100644
index 0000000000000000000000000000000000000000..a0d8a6ec48c983fc3870d173c5b4c73eb474eecd
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/782051f8120182b860c7fe1b265179cfa2fe03fd differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6
new file mode 100644
index 0000000000000000000000000000000000000000..8a9216e10b777da5e8888e5549ab39346de8acf2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/793feab2deb35e284a975f6527d76a8be5540fe6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329
new file mode 100644
index 0000000000000000000000000000000000000000..aaa91f2f45bd5730550211f4a080e6119d15b48e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/7f41ec3a9805c6b8f3656c4f9f6d0ff7dbf8a329 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065
new file mode 100644
index 0000000000000000000000000000000000000000..46316baf29ff126da46159bf6c16fb11eb9cb23e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8210dc595a2652f2f812093b01e239e7918ea065 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7
new file mode 100644
index 0000000000000000000000000000000000000000..54a777d22c159918c5e85078560fe363c2a78b10
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/8dffe4c5c26d891b578fd2ea4b9adfc0c96ad5f7 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de
new file mode 100644
index 0000000000000000000000000000000000000000..826747d852b00397d859fcd65e7404d42cd329f5
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/91d787a9298ddc015efa783a92c4bdba8af0d7de differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc
new file mode 100644
index 0000000000000000000000000000000000000000..77b8f518b4c46b2cef89bd7a12f23ea49c8505ab
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/92c065286f956f086e977556358f6b54b12bcacc differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6
new file mode 100644
index 0000000000000000000000000000000000000000..45d6b6fa606fe4a25d00476bd4ff785bb80fbba4
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a35c9bb71792b60a13dea23a41b41847ad4b93d6 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904
new file mode 100644
index 0000000000000000000000000000000000000000..14954c595882e6d35c5f77f36dfcb9ad315c606c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/a6ea960c7b4d42772888280277b26e645ceee904 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2
new file mode 100644
index 0000000000000000000000000000000000000000..c0cc5c469f9ab41222d76349f960610e01d4cda8
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/aa526aa853333f0bb11804b5243df411452cecd2 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9
new file mode 100644
index 0000000000000000000000000000000000000000..bb026584ca28d5de4e7438c9d21e68726cbf521c
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/ca533cd26c7ca6bf69e62351b265ded496fdf1d9 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86 b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86
new file mode 100644
index 0000000000000000000000000000000000000000..0900eb1352b10576bebda11b0e2b943d77877d57
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f38c61da15f2cb7a39ff02e69f0b00e99f37ec86 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c
new file mode 100644
index 0000000000000000000000000000000000000000..d74f0a3326d2a644f869ba969426268dc559df25
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/f88f1012473e6cfcc9b39b2552f682b2f73eff8c differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f
new file mode 100644
index 0000000000000000000000000000000000000000..2fd41a34fe5b503756e93b68aaa4f90301a78950
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fa79819c5de04bc06c69bec3fa7f2e982826ea2f differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a
new file mode 100644
index 0000000000000000000000000000000000000000..35a99bc97d93c9beeea6a917453685fabf853ab2
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/decode_wav/fce08de222896ac3a20657a3b4f42d5b6c54a96a differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424 b/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424
new file mode 100644
index 0000000000000000000000000000000000000000..eb84b9e610c1e988273a020f8a42c16a0c484951
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/4c01a1504da9de2216894743ecc44424
@@ -0,0 +1 @@
+./,abcd.efgh/abcd,efgh.abcd/efgh,abcd.efgh/a
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440 b/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440
new file mode 100644
index 0000000000000000000000000000000000000000..4cd522da7bf4b638331fbd5ef1514a6f81baaf5f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/5bf16424630b5afbcffe711fb9834440
@@ -0,0 +1 @@
+.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736 b/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736
new file mode 100644
index 0000000000000000000000000000000000000000..03cfb6256f33c8605c5122d7459ea6955d63ee68
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/a7185605aef0a8fd682fcb4656e4a736
@@ -0,0 +1 @@
+./, abcde.fghab/cdefg,habcd efgha.bcdef/ghabc
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0 b/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0
new file mode 100644
index 0000000000000000000000000000000000000000..304b0d66fe08fd1a29827488727702dd9b9bce3e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/string_split/d5606def44fdbb9385dd764612069db0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146 b/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146
new file mode 100644
index 0000000000000000000000000000000000000000..a8740444aa40ccf3249589ccce97568b15a822ae
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/dbac766f3160de65894bf5153f478146
@@ -0,0 +1 @@
+./, ?abcdef.ghabcd/efghab,cdefgh abcdef?ghabcd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24 b/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24
new file mode 100644
index 0000000000000000000000000000000000000000..47d551466a4e76bb71bc7496cf1a8a30aa087809
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split/e85ff62f6d457666f54a37a19a115a24
@@ -0,0 +1 @@
+./abc.def/gha.bcd/efg.hab/cde.fgh/abc.def/g
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec
new file mode 100644
index 0000000000000000000000000000000000000000..f1410e184b23aacb36f8c741d3ada783b28aa75e
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/00fd47bf73afcb72e7ed51bffd5f5fec
@@ -0,0 +1 @@
+./abc./de./fg./ha./bc./de./fg./ha./bc./de./
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29
new file mode 100644
index 0000000000000000000000000000000000000000..e118d2d351b58910d496319e53fbfa78bf3d3ee4
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/14908973e6720513a5f37676cb9fcc29
@@ -0,0 +1 @@
+./, abcde./, fg./, ha./, bc./, de./, fg./, ha
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d
new file mode 100644
index 0000000000000000000000000000000000000000..9a6c80919746dbb49b844bb1175de31c3026f9f5
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/2779ba7c4d23eee9f79efa3660084c5d
@@ -0,0 +1 @@
+./,abcd./,ef./,gh./,ab./,cd./,ef./,gh./,ab./
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440
new file mode 100644
index 0000000000000000000000000000000000000000..4cd522da7bf4b638331fbd5ef1514a6f81baaf5f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/5bf16424630b5afbcffe711fb9834440
@@ -0,0 +1 @@
+.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd.ef.gh.ab.cd
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1
new file mode 100644
index 0000000000000000000000000000000000000000..5301a91d8e4e48818f103eeee2cee58c2c2e0808
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/89734a96b93275e495a9498b806fafe1
@@ -0,0 +1 @@
+./, ?abcdef./, ?gh./, ?ab./, ?cd./, ?ef./, ?gh
\ No newline at end of file
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0 b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0
new file mode 100644
index 0000000000000000000000000000000000000000..304b0d66fe08fd1a29827488727702dd9b9bce3e
Binary files /dev/null and b/tensorflow/core/kernels/fuzzing/corpus/string_split_v2/d5606def44fdbb9385dd764612069db0 differ
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82
new file mode 100644
index 0000000000000000000000000000000000000000..3de80927d5770479cb068d9cd5af9d8a8470dbdf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/2db83ea58639b6d7d585fa12e3947a82
@@ -0,0 +1 @@
+6.023e+23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94
new file mode 100644
index 0000000000000000000000000000000000000000..d531129b2833e4ec7542a939268ae4cb0eeadeba
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/36b4a931886b941dc41180050d12ca94
@@ -0,0 +1 @@
+6.023e-23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4
new file mode 100644
index 0000000000000000000000000000000000000000..d81cc0710eb6cf9efd5b920a8453e1e07157b6cd
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/50a2fabfdd276f573ff97ace8b11c5f4
@@ -0,0 +1 @@
+42
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2
new file mode 100644
index 0000000000000000000000000000000000000000..72f88139d0f639d92fe7869ac222267652d570d8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/62edb2a1eee34b001652cd86584becf2
@@ -0,0 +1 @@
+0xabcdef
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc
new file mode 100644
index 0000000000000000000000000000000000000000..c1113b83e8f16ef607af4427b77d90ed0bfec0b8
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/90013d1ec28c46a5c00574e60c70b6fc
@@ -0,0 +1 @@
+3.14159265359
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4
new file mode 100644
index 0000000000000000000000000000000000000000..320aa3f00ee0abc68be7a71e18775e07a009f73f
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/94f3e3cee6957ce5815326d6788c85f4
@@ -0,0 +1 @@
+0.69314718056
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8
new file mode 100644
index 0000000000000000000000000000000000000000..51b7b732f69d9bc2c3328a649aba0e7523f4dc92
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/96f547bc04bb913da0bc08915238ebd8
@@ -0,0 +1 @@
+6.023e23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed
new file mode 100644
index 0000000000000000000000000000000000000000..9a0be0764b639269e5bc669b91c00e0a14a9dc46
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/d3a903d18fc11e1f35c572ad4da690ed
@@ -0,0 +1 @@
+1.61803
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3
new file mode 100644
index 0000000000000000000000000000000000000000..6a0e60d48b173a5049079e37e284315a97918b76
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/e3b629c92af44260c189deb32d6f06f3
@@ -0,0 +1 @@
+-42
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d
new file mode 100644
index 0000000000000000000000000000000000000000..ea9cd255bc796c310f3cca79272597bc75c39cb5
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/f03eecf3bcfe4967a1888156a3115c8d
@@ -0,0 +1 @@
+6.023E+23
diff --git a/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16 b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16
new file mode 100644
index 0000000000000000000000000000000000000000..00f1e2ed8ff84bc765220f9e7a939fb101cead10
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/corpus/string_to_number/fa54ca9186f77122ae2a82684a062e16
@@ -0,0 +1 @@
+2.71828182846
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
new file mode 100644
index 0000000000000000000000000000000000000000..5fe4ca23d1f9403b6ac7fc3084c9165b55391caf
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_json_example.dict
@@ -0,0 +1,6 @@
+"features"
+"feature"
+"bytes_list"
+"float_list"
+"int64_list"
+"value"
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict
new file mode 100644
index 0000000000000000000000000000000000000000..d795ae7f71ff5b0c54b96dd967b2a692753523ac
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_png.dict
@@ -0,0 +1,50 @@
+header_87a="87a"
+header_89a="89a"
+header_gif="GIF"
+header_jfif="JFIF\x00"
+header_jfxx="JFXX\x00"
+header_png="\x89PNG\x0d\x0a\x1a\x0a"
+marker_2c=","
+marker_3b=";"
+section_2101="!\x01\x12"
+section_21f9="!\xf9\x04"
+section_21fe="!\xfe"
+section_21ff="!\xff\x11"
+section_IDAT="IDAT"
+section_IEND="IEND"
+section_IHDR="IHDR"
+section_PLTE="PLTE"
+section_bKGD="bKGD"
+section_cHRM="cHRM"
+section_fRAc="fRAc"
+section_ffc0="\xff\xc0"
+section_ffc2="\xff\xc2"
+section_ffc4="\xff\xc4"
+section_ffd0="\xff\xd0"
+section_ffd8="\xff\xd8"
+section_ffd9="\xff\xd9"
+section_ffda="\xff\xda"
+section_ffdb="\xff\xdb"
+section_ffdd="\xff\xdd"
+section_ffe0="\xff\xe0"
+section_ffe1="\xff\xe1"
+section_fffe="\xff\xfe"
+section_gAMA="gAMA"
+section_gIFg="gIFg"
+section_gIFt="gIFt"
+section_gIFx="gIFx"
+section_hIST="hIST"
+section_iCCP="iCCP"
+section_iTXt="iTXt"
+section_oFFs="oFFs"
+section_pCAL="pCAL"
+section_pHYs="pHYs"
+section_sBIT="sBIT"
+section_sCAL="sCAL"
+section_sPLT="sPLT"
+section_sRGB="sRGB"
+section_sTER="sTER"
+section_tEXt="tEXt"
+section_tIME="tIME"
+section_tRNS="tRNS"
+section_zTXt="zTXt"
diff --git a/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict b/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict
new file mode 100644
index 0000000000000000000000000000000000000000..eab65386ce33e0d9ffcf2ef213cdcaea2f5aa7ef
--- /dev/null
+++ b/tensorflow/core/kernels/fuzzing/dictionaries/decode_wav.dict
@@ -0,0 +1,4 @@
+header_RIFF="RIFF"
+header_WAVE="WAVE"
+section_fmt="fmt "
+section_data="data"
diff --git a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
index a8f07f4bad3a7e7ccff4ebefd4c56c695d0b2573..b8d779fb1384b22b88a79e115fe413464fe6a7e3 100644
--- a/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/encode_base64_fuzz.cc
@@ -19,7 +19,7 @@ limitations under the License.
 namespace tensorflow {
 namespace fuzzing {
 
-class FuzzEncodeBase64 : public FuzzSession {
+class FuzzEncodeBase64 : public FuzzStringInputOp {
   SINGLE_INPUT_OP_BUILDER(DT_STRING, EncodeBase64);
 };
 
diff --git a/tensorflow/core/kernels/fuzzing/fuzz_session.h b/tensorflow/core/kernels/fuzzing/fuzz_session.h
index 9777be1ae8a2428857d38264db226f99a2b34894..57d562ddf43142e47e5d52e4c0dfbbcbbb4bdfe0 100644
--- a/tensorflow/core/kernels/fuzzing/fuzz_session.h
+++ b/tensorflow/core/kernels/fuzzing/fuzz_session.h
@@ -72,11 +72,11 @@ class FuzzSession {
   // By convention, the graph should have inputs named "input1", ...
   // "inputN", and one output node, named "output".
   // Users of FuzzSession should override this method to create their graph.
-  virtual void BuildGraph(const Scope& scope) {}
+  virtual void BuildGraph(const Scope& scope) = 0;
 
   // Implements the logic that converts an opaque byte buffer
   // from the fuzzer to Tensor inputs to the graph.  Users must override.
-  virtual void FuzzImpl(const uint8_t* data, size_t size) {}
+  virtual void FuzzImpl(const uint8_t* data, size_t size) = 0;
 
   // Initializes the FuzzSession.  Not safe for multithreading.
   // Separate init function because the call to virtual BuildGraphDef
diff --git a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
index 87a548a999ce8f66b42314f76ae0fe2fd6253567..2564f8ed0303d1c80bad32181507eb678b18345b 100644
--- a/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
+++ b/tensorflow/core/kernels/fuzzing/string_split_fuzz.cc
@@ -37,8 +37,7 @@ class FuzzStringSplit : public FuzzSession {
       // The spec for split is that the delimeter should be 0 or 1 characters.
       // Naturally, fuzz it with something larger.  (This omits the possibility
       // of handing it a > int32_max size string, which should be tested for in
-      // an
-      // explicit test).
+      // an explicit test).
       size_t delim_len = static_cast<size_t>(data[0]);
       if (delim_len > size) {
         delim_len = size - 1;
diff --git a/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
index f752b59568a74f56c9b581651e54d1cab2af227f..e9322133590487356cf49700e5396e692cda3f04 100644
--- a/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
+++ b/tensorflow/core/kernels/fuzzing/tf_ops_fuzz_target_lib.bzl
@@ -1,13 +1,25 @@
 """Fuzzing template for TensorFlow ops."""
 
 def tf_ops_fuzz_target_lib(name):
-  native.cc_library(
-      name = name + "_fuzz_lib",
-      srcs = [name + "_fuzz.cc"],
-      deps = [
-          "//tensorflow/core/kernels/fuzzing:fuzz_session",
-          "//tensorflow/cc:cc_ops",
-      ],
-      tags = ["no_windows"],
-      alwayslink = 1,
-  )
+    native.cc_library(
+        name = name + "_fuzz_lib",
+        srcs = [name + "_fuzz.cc"],
+        deps = [
+            "//tensorflow/core/kernels/fuzzing:fuzz_session",
+            "//tensorflow/cc:cc_ops",
+        ],
+        tags = ["no_windows"],
+        alwayslink = 1,
+    )
+
+def tf_oss_fuzz_corpus(name):
+    native.filegroup(
+        name = name + "_corpus",
+        srcs = native.glob(["corpus/" + name + "/*"]),
+    )
+
+def tf_oss_fuzz_dict(name):
+    native.filegroup(
+        name = name + "_dict",
+        srcs = native.glob(["dictionaries/" + name + ".dict"]),
+    )
diff --git a/tensorflow/core/kernels/list_kernels.cc b/tensorflow/core/kernels/list_kernels.cc
index 5f244b1b10f65c60becc1ce3c0e87836a48e3ae3..42fad1d4b053f84a7f5eaae4382f0a090ba628da 100644
--- a/tensorflow/core/kernels/list_kernels.cc
+++ b/tensorflow/core/kernels/list_kernels.cc
@@ -483,9 +483,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListGetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListGetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListGetItem);
+#define REGISTER_TENSOR_LIST_GET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListGetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListGetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_GET_ITEM_GPU);
+REGISTER_TENSOR_LIST_GET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_GET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -537,9 +547,19 @@ REGISTER_KERNEL_BUILDER(Name("TensorListSetItem").Device(DEVICE_CPU),
 
 #if GOOGLE_CUDA
 
-REGISTER_KERNEL_BUILDER(
-    Name("TensorListSetItem").Device(DEVICE_GPU).HostMemory("index"),
-    TensorListSetItem);
+#define REGISTER_TENSOR_LIST_SET_ITEM_GPU(T)                      \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSetItem")               \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("index"),               \
+                          TensorListSetItem);
+
+TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_complex128(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+TF_CALL_int64(REGISTER_TENSOR_LIST_SET_ITEM_GPU);
+REGISTER_TENSOR_LIST_SET_ITEM_GPU(bfloat16)
+#undef REGISTER_TENSOR_LIST_SET_ITEM_GPU
 
 #endif  // GOOGLE_CUDA
 
@@ -660,7 +680,11 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListGather")                \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListGather<CPUDevice, T>)
+                          TensorListGather<CPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListConcat<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_STACK_CPU);
 REGISTER_TENSOR_LIST_STACK_CPU(quint8);
@@ -680,7 +704,11 @@ REGISTER_TENSOR_LIST_STACK_CPU(bfloat16);
   REGISTER_KERNEL_BUILDER(Name("TensorListScatter")               \
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_CPU),                \
-                          TensorListScatter<CPUDevice, T>)
+                          TensorListScatter<CPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_CPU),                \
+                          TensorListSplit<CPUDevice, T>)
 
 TF_CALL_POD_STRING_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_CPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_CPU(quint8);
diff --git a/tensorflow/core/kernels/list_kernels.cu.cc b/tensorflow/core/kernels/list_kernels.cu.cc
index a00bf700ca21ea2a69fdcc84815ca473375b333c..23f552642cac273cf53b25a6d43e1e6ca23ea0cc 100644
--- a/tensorflow/core/kernels/list_kernels.cu.cc
+++ b/tensorflow/core/kernels/list_kernels.cu.cc
@@ -45,7 +45,12 @@ typedef Eigen::GpuDevice GPUDevice;
                               .TypeConstraint<T>("element_dtype") \
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("indices"),             \
-                          TensorListGather<GPUDevice, T>)
+                          TensorListGather<GPUDevice, T>)         \
+  REGISTER_KERNEL_BUILDER(Name("TensorListConcat")                \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("lengths"),             \
+                          TensorListConcat<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_STACK_GPU);
 REGISTER_TENSOR_LIST_STACK_GPU(bfloat16);
@@ -82,7 +87,13 @@ REGISTER_TENSOR_LIST_PUSH_BACK_BATCH_GPU(bool);
                               .Device(DEVICE_GPU)                 \
                               .HostMemory("element_shape")        \
                               .HostMemory("indices"),             \
-                          TensorListScatter<GPUDevice, T>)
+                          TensorListScatter<GPUDevice, T>)        \
+  REGISTER_KERNEL_BUILDER(Name("TensorListSplit")                 \
+                              .TypeConstraint<T>("element_dtype") \
+                              .Device(DEVICE_GPU)                 \
+                              .HostMemory("element_shape")        \
+                              .HostMemory("lengths"),             \
+                          TensorListSplit<GPUDevice, T>)
 
 TF_CALL_GPU_NUMBER_TYPES(REGISTER_TENSOR_LIST_FROM_TENSOR_GPU);
 REGISTER_TENSOR_LIST_FROM_TENSOR_GPU(bfloat16);
diff --git a/tensorflow/core/kernels/list_kernels.h b/tensorflow/core/kernels/list_kernels.h
index c2591f53141622bc90aa6b77f7f08bc86627bbe7..686679474c40dc922683786cdfe65ffb3fbc03e2 100644
--- a/tensorflow/core/kernels/list_kernels.h
+++ b/tensorflow/core/kernels/list_kernels.h
@@ -30,6 +30,8 @@ limitations under the License.
 #include "tensorflow/core/kernels/concat_lib.h"
 #include "tensorflow/core/lib/core/coding.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/gtl/array_slice.h"
+#include "tensorflow/core/util/tensor_ops_util.h"
 #include "tensorflow/core/util/util.h"
 
 namespace tensorflow {
@@ -76,26 +78,30 @@ class TensorListStack : public OpKernel {
   ~TensorListStack() {}
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
-    OP_REQUIRES(c, !l->tensors.empty() || l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to stack elements of a empty ",
-                                        "list with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    OP_REQUIRES(
+        c,
+        !tensor_list->tensors.empty() ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to stack elements of a empty ",
+                                "list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
     if (num_elements_ != -1) {
-      OP_REQUIRES(c, l->tensors.size() == num_elements_,
-                  errors::InvalidArgument("Operation expected a list with ",
-                                          num_elements_,
-                                          " elements but got a list with ",
-                                          l->tensors.size(), " elements."));
+      OP_REQUIRES(c, tensor_list->tensors.size() == num_elements_,
+                  errors::InvalidArgument(
+                      "Operation expected a list with ", num_elements_,
+                      " elements but got a list with ",
+                      tensor_list->tensors.size(), " elements."));
     }
     // Compute the shape of the output tensor.
     // If `element_shape` is fully-defined it gets used. It is assumed that all
@@ -104,11 +110,11 @@ class TensorListStack : public OpKernel {
     // tensor is used and it is checked that all other tensors have the same
     // shape.
     TensorShape resulting_shape;
-    if (!l->element_shape.AsTensorShape(&resulting_shape)) {
-      const Tensor& t = l->tensors[0];
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
+      const Tensor& t = tensor_list->tensors[0];
       resulting_shape = t.shape();
-      for (int i = 1; i < l->tensors.size(); ++i) {
-        const Tensor& t = l->tensors[i];
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& t = tensor_list->tensors[i];
         OP_REQUIRES(c, t.shape() == resulting_shape,
                     errors::InvalidArgument(
                         "Tried to stack tensors with unequal shapes: ",
@@ -116,7 +122,7 @@ class TensorListStack : public OpKernel {
                         t.shape().DebugString()));
       }
     }
-    resulting_shape.InsertDim(0, l->tensors.size());
+    resulting_shape.InsertDim(0, tensor_list->tensors.size());
     Tensor* output;
     OP_REQUIRES_OK(c, c->allocate_output(0, resulting_shape, &output));
     if (output->NumElements() == 0) {
@@ -124,8 +130,8 @@ class TensorListStack : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
-    for (const auto& t : l->tensors) {
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& t : tensor_list->tensors) {
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -145,6 +151,200 @@ class TensorListStack : public OpKernel {
   DataType element_dtype_;
 };
 
+template <typename Device, typename T>
+class TensorListConcat : public OpKernel {
+ public:
+  using ConstMatrixVector =
+      std::vector<std::unique_ptr<typename TTypes<T, 2>::ConstMatrix>>;
+  explicit TensorListConcat(OpKernelConstruction* c) : OpKernel(c) {
+    OP_REQUIRES_OK(c, c->GetAttr("element_dtype", &element_dtype_));
+  }
+
+  ~TensorListConcat() {}
+
+  void Compute(OpKernelContext* c) override {
+    // Check that the input Variant tensor is indeed a TensorList and has the
+    // correct element type.
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
+                errors::InvalidArgument(
+                    "Input handle is not a list. Saw: '",
+                    c->input(0).scalar<Variant>()().DebugString(), "'"));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
+    // If the TensorList is empty, its element_shape must be fully defined
+    // except for the first dimension.
+    PartialTensorShape shape_except_first_dim;
+    if (!tensor_list->element_shape.unknown_rank()) {
+      OP_REQUIRES(c, tensor_list->element_shape.dims() >= 1,
+                  errors::InvalidArgument(
+                      "Concat requires elements to be at least vectors, ",
+                      "found scalars instead."));
+      shape_except_first_dim = PartialTensorShape(
+          gtl::ArraySlice<int64>(tensor_list->element_shape.dim_sizes())
+              .subspan(1));
+    }
+    OP_REQUIRES(c,
+                !tensor_list->tensors.empty() ||
+                    shape_except_first_dim.IsFullyDefined(),
+                errors::InvalidArgument(
+                    "All except the first dimension must be fully defined ",
+                    "when concating an empty tensor list. element_shape: ",
+                    tensor_list->element_shape.DebugString()));
+    // 1. Compute the shape of the output tensor.
+    // If `shape_except_first_dim` is fully-defined we just prepend the leading
+    // dim to it. Otherwise we use the shape of the first element tensor and
+    // check to make sure shapes of all tensors are compatible.
+    TensorShape output_shape;
+    if (!shape_except_first_dim.AsTensorShape(&output_shape)) {
+      const Tensor& element_tensor = tensor_list->tensors[0];
+      OP_REQUIRES(
+          c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+          errors::InvalidArgument("Concat saw a scalar shape at index ", 0,
+                                  " but requires at least vectors."));
+      output_shape =
+          TensorShape(gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                          .subspan(1));
+      for (int i = 1; i < tensor_list->tensors.size(); ++i) {
+        const Tensor& element_tensor = tensor_list->tensors[i];
+        OP_REQUIRES(
+            c, TensorShapeUtils::IsVectorOrHigher(element_tensor.shape()),
+            errors::InvalidArgument("Concat saw a scalar shape at index ", i,
+                                    " but requires at least vectors."));
+        TensorShape actual_shape(
+            gtl::ArraySlice<int64>(element_tensor.shape().dim_sizes())
+                .subspan(1));
+        OP_REQUIRES(c, actual_shape.dim_sizes() == output_shape.dim_sizes(),
+                    errors::InvalidArgument(
+                        "Tried to concat tensors with unequal shapes: ",
+                        output_shape.DebugString(), " vs ",
+                        actual_shape.DebugString()));
+      }
+    }
+    // 2. Build the lengths_tensor and leading dim of the output tensor by
+    // iterating over all element tensors.
+    Tensor* lengths_tensor = nullptr;
+    OP_REQUIRES_OK(
+        c,
+        c->allocate_output(
+            1, TensorShape({static_cast<int64>(tensor_list->tensors.size())}),
+            &lengths_tensor));
+    auto lengths_tensor_vec = lengths_tensor->vec<int64>();
+    int64 leading_dim = 0;
+    for (size_t i = 0; i < tensor_list->tensors.size(); i++) {
+      int64 dim = tensor_list->tensors[i].shape().dim_size(0);
+      leading_dim += dim;
+      lengths_tensor_vec(i) = dim;
+    }
+    output_shape.InsertDim(0, leading_dim);
+    Tensor* output;
+    // 3. Allocate the output tensor and fill it up with the concated element
+    // tensors.
+    OP_REQUIRES_OK(c, c->allocate_output(0, output_shape, &output));
+    if (output->NumElements() == 0) {
+      return;
+    }
+
+    ConstMatrixVector inputs_flat;
+    inputs_flat.reserve(tensor_list->tensors.size());
+    for (const auto& element_tensor : tensor_list->tensors) {
+      inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
+          element_tensor.shaped<T, 2>({1, element_tensor.NumElements()})));
+    }
+    auto output_flat = output->shaped<T, 2>({1, output->NumElements()});
+
+#if GOOGLE_CUDA
+    if (std::is_same<Device, Eigen::GpuDevice>::value) {
+      ConcatGPU<T>(c, inputs_flat, output, &output_flat);
+      return;
+    }
+#endif  // GOOGLE_CUDA
+    ConcatCPU<T>(c->device(), inputs_flat, &output_flat);
+  }
+
+ private:
+  DataType element_dtype_;
+};
+
+template <typename Device, typename T>
+class TensorListSplit : public OpKernel {
+ public:
+  TensorListSplit(OpKernelConstruction* c) : OpKernel(c) {}
+
+  void Compute(OpKernelContext* c) override {
+    Tensor* output_tensor;
+    AllocatorAttributes attr;
+    attr.set_on_host(true);
+    OP_REQUIRES_OK(c, c->allocate_output(0, {}, &output_tensor, attr));
+    PartialTensorShape element_shape;
+    OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(1), &element_shape));
+    OP_REQUIRES(c, element_shape.unknown_rank() || element_shape.dims() >= 1,
+                errors::InvalidArgument(
+                    "TensorListSplit requires element_shape to be at least of ",
+                    "rank 1, but saw: ", element_shape.DebugString()));
+    TensorList output_list;
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
+                errors::InvalidArgument(
+                    "Tensor must be at least a vector, but saw shape: ",
+                    input_tensor.shape().DebugString()));
+    TensorShape tensor_shape_without_first_dim(input_tensor.shape());
+    tensor_shape_without_first_dim.RemoveDim(0);
+    PartialTensorShape element_shape_without_first_dim;
+    if (!element_shape.unknown_rank()) {
+      element_shape_without_first_dim =
+          PartialTensorShape(element_shape.dim_sizes());
+      element_shape_without_first_dim.RemoveDim(0);
+    }
+    OP_REQUIRES(c,
+                element_shape_without_first_dim.IsCompatibleWith(
+                    tensor_shape_without_first_dim),
+                errors::InvalidArgument(
+                    "tensor shape ", input_tensor.shape().DebugString(),
+                    " is not compatible with element_shape ",
+                    element_shape.DebugString()));
+    output_list.element_shape = element_shape;
+    const Tensor& lengths = c->input(2);
+    OP_REQUIRES(c, TensorShapeUtils::IsVector(lengths.shape()),
+                errors::InvalidArgument(
+                    "Expected lengths to be a vector, received shape: ",
+                    lengths.shape().DebugString()));
+    output_list.tensors.reserve(lengths.shape().dim_size(0));
+    int64 start = 0;
+    int64 end = 0;
+    for (int i = 0; i < lengths.shape().dim_size(0); ++i) {
+      int64 length = lengths.vec<int64>()(i);
+      OP_REQUIRES(
+          c, length >= 0,
+          errors::InvalidArgument("Invalid value in lengths: ", length));
+      end = start + length;
+      OP_REQUIRES(c, end <= input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument("Attempting to slice [", start, ", ",
+                                          end, "] from tensor with length ",
+                                          input_tensor.shape().dim_size(0)));
+      Tensor tmp = input_tensor.Slice(start, end);
+      start = end;
+      // TODO(apassos) maybe not always align; but weird compiler bugs seem to
+      // prevent this.
+      Tensor aligned;
+      OP_REQUIRES_OK(c, c->allocate_temp(tmp.dtype(), tmp.shape(), &aligned));
+      aligned.flat<T>().device(c->eigen_device<Device>()) =
+          tmp.unaligned_flat<T>();
+      output_list.tensors.emplace_back(aligned);
+    }
+    OP_REQUIRES(c, end == input_tensor.shape().dim_size(0),
+                errors::InvalidArgument(
+                    "Unused values in tensor. Length of tensor: ",
+                    input_tensor.shape().dim_size(0), " Values used: ", end));
+    output_tensor->scalar<Variant>()() = std::move(output_list);
+  }
+};
+
 template <typename Device, typename T>
 class TensorListGather : public OpKernel {
  public:
@@ -155,22 +355,25 @@ class TensorListGather : public OpKernel {
   }
 
   void Compute(OpKernelContext* c) override {
-    const TensorList* l = c->input(0).scalar<Variant>()().get<TensorList>();
-    OP_REQUIRES(c, l != nullptr,
+    const TensorList* tensor_list =
+        c->input(0).scalar<Variant>()().get<TensorList>();
+    OP_REQUIRES(c, tensor_list != nullptr,
                 errors::InvalidArgument(
                     "Input handle is not a list. Saw: '",
                     c->input(0).scalar<Variant>()().DebugString(), "'"));
-    OP_REQUIRES(c, element_dtype_ == l->element_dtype,
-                errors::InvalidArgument("Invalid data types; op elements ",
-                                        DataTypeString(element_dtype_),
-                                        " but list elements ",
-                                        DataTypeString(l->element_dtype)));
+    OP_REQUIRES(
+        c, element_dtype_ == tensor_list->element_dtype,
+        errors::InvalidArgument(
+            "Invalid data types; op elements ", DataTypeString(element_dtype_),
+            " but list elements ", DataTypeString(tensor_list->element_dtype)));
     Tensor indices = c->input(1);
-    OP_REQUIRES(c,
-                indices.NumElements() > 0 || l->element_shape.IsFullyDefined(),
-                errors::InvalidArgument("Tried to gather 0-elements from "
-                                        "a list with non-fully-defined shape: ",
-                                        l->element_shape.DebugString()));
+    OP_REQUIRES(
+        c,
+        indices.NumElements() > 0 ||
+            tensor_list->element_shape.IsFullyDefined(),
+        errors::InvalidArgument("Tried to gather 0-elements from "
+                                "a list with non-fully-defined shape: ",
+                                tensor_list->element_shape.DebugString()));
     // Compute the shape of the output tensor.
     // If `element_shape` is fully-defined it gets used. It is assumed that all
     // requested tensors have the same shape.
@@ -178,17 +381,17 @@ class TensorListGather : public OpKernel {
     // tensor is used and it is checked that all other tensors have the same
     // shape.
     TensorShape resulting_shape;
-    if (!l->element_shape.AsTensorShape(&resulting_shape)) {
+    if (!tensor_list->element_shape.AsTensorShape(&resulting_shape)) {
       const int i = indices.flat<int32>()(0);
       OP_REQUIRES(
-          c, i < l->tensors.size(),
+          c, i < tensor_list->tensors.size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  l->tensors.size(), " elements."));
-      const Tensor& t = l->tensors[i];
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
       resulting_shape = t.shape();
       for (int index = 1; index < indices.NumElements(); ++index) {
         const int i = indices.flat<int32>()(index);
-        const Tensor& t = l->tensors[i];
+        const Tensor& t = tensor_list->tensors[i];
         OP_REQUIRES(c, t.shape() == resulting_shape,
                     errors::InvalidArgument(
                         "Tried to gather elements with unequal shapes: ",
@@ -204,14 +407,14 @@ class TensorListGather : public OpKernel {
     }
 
     ConstMatrixVector inputs_flat;
-    inputs_flat.reserve(l->tensors.size());
+    inputs_flat.reserve(tensor_list->tensors.size());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
       OP_REQUIRES(
-          c, i < l->tensors.size(),
+          c, i < tensor_list->tensors.size(),
           errors::InvalidArgument("Index ", i, " out o range; list only has ",
-                                  l->tensors.size(), " elements."));
-      const Tensor& t = l->tensors[i];
+                                  tensor_list->tensors.size(), " elements."));
+      const Tensor& t = tensor_list->tensors[i];
       inputs_flat.emplace_back(new typename TTypes<T, 2>::ConstMatrix(
           t.shaped<T, 2>({1, t.NumElements()})));
     }
@@ -289,13 +492,13 @@ class TensorListScatter : public OpKernel {
     PartialTensorShape element_shape;
     OP_REQUIRES_OK(c, TensorShapeFromTensor(c->input(2), &element_shape));
     TensorList output_list;
-    const Tensor& t = c->input(0);
-    output_list.element_dtype = t.dtype();
-    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(t.shape()),
+    const Tensor& input_tensor = c->input(0);
+    output_list.element_dtype = input_tensor.dtype();
+    OP_REQUIRES(c, TensorShapeUtils::IsVectorOrHigher(input_tensor.shape()),
                 errors::InvalidArgument(
                     "Tensor must be at least a vector, but saw shape: ",
-                    t.shape().DebugString()));
-    TensorShape output_shape(t.shape());
+                    input_tensor.shape().DebugString()));
+    TensorShape output_shape(input_tensor.shape());
     output_shape.RemoveDim(0);
     OP_REQUIRES(c, element_shape.IsCompatibleWith(output_shape),
                 errors::InvalidArgument(
@@ -305,11 +508,11 @@ class TensorListScatter : public OpKernel {
     output_list.tensors.reserve(indices.NumElements());
     for (int index = 0; index < indices.NumElements(); ++index) {
       const int i = indices.flat<int32>()(index);
-      OP_REQUIRES(c, i < t.shape().dim_size(0),
-                  errors::InvalidArgument("Trying to scatter index ", i,
-                                          " from tensor with ",
-                                          t.shape().dim_size(0), " rows."));
-      Tensor tmp = t.Slice(i, i + 1);
+      OP_REQUIRES(c, i < input_tensor.shape().dim_size(0),
+                  errors::InvalidArgument(
+                      "Trying to scatter index ", i, " from tensor with ",
+                      input_tensor.shape().dim_size(0), " rows."));
+      Tensor tmp = input_tensor.Slice(i, i + 1);
       TensorShape tmp_shape = tmp.shape();
       tmp_shape.RemoveDim(0);
       OP_REQUIRES(c, tmp.CopyFrom(tmp, tmp_shape),
@@ -357,40 +560,10 @@ Status TensorListBinaryAdd(OpKernelContext* c, const TensorList& a,
   for (int i = 0; i < a.tensors.size(); ++i) {
     const Tensor& a_tensor = a.tensors[i];
     const Tensor& b_tensor = b.tensors[i];
-    if (a_tensor.dtype() == DT_INVALID) {
-      out->tensors.push_back(b_tensor);
-      continue;
-    }
-    if (b_tensor.dtype() == DT_INVALID) {
-      out->tensors.push_back(a_tensor);
-      continue;
-    }
-    if (a_tensor.shape() != b_tensor.shape()) {
-      // TODO(apassos) support broadcasting additions here?
-      return errors::InvalidArgument(
-          "Trying to add two tensors with incompatible element shapes. "
-          "One is ",
-          a_tensor.shape().DebugString(), " and the other is ",
-          b_tensor.shape().DebugString(), " in position ", i);
-    }
     Tensor out_tensor;
     TF_RETURN_IF_ERROR(
-        c->allocate_temp(a_tensor.dtype(), a_tensor.shape(), &out_tensor));
+        BinaryAddTensors<Device>(c, a_tensor, b_tensor, &out_tensor));
     out->tensors.push_back(out_tensor);
-    switch (out_tensor.dtype()) {
-#define DTYPE_CASE(dtype)                                        \
-  case DataTypeToEnum<dtype>::value:                             \
-    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
-        a_tensor.flat<dtype>() + b_tensor.flat<dtype>();         \
-    break;
-
-      TF_CALL_NUMBER_TYPES(DTYPE_CASE)
-
-#undef DTYPE_CASE
-      default:
-        return errors::InvalidArgument("Trying to add unsupported dtype ",
-                                       out_tensor.dtype());
-    }
   }
   return Status::OK();
 }
@@ -403,46 +576,7 @@ Status TensorListZerosLike(OpKernelContext* c, const TensorList& x,
   y->tensors.reserve(x.tensors.size());
   for (const Tensor& t : x.tensors) {
     Tensor out_tensor;
-    AllocatorAttributes attr;
-    if (t.dtype() == DT_VARIANT) {
-      attr.set_on_host(true);
-    }
-    TF_RETURN_IF_ERROR(
-        c->allocate_temp(t.dtype(), t.shape(), &out_tensor, attr));
-    switch (out_tensor.dtype()) {
-#define DTYPE_CASE(dtype)                                        \
-  case DataTypeToEnum<dtype>::value:                             \
-    out_tensor.flat<dtype>().device(c->eigen_device<Device>()) = \
-        out_tensor.flat<dtype>().constant(dtype(0));             \
-    break;
-
-      TF_CALL_POD_TYPES(DTYPE_CASE)
-
-#undef DTYPE_CASE
-
-      case DT_INVALID: {
-        // Uninitialized tensor in the TensorList.
-        out_tensor = Tensor(DT_INVALID);
-        break;
-      }
-      case DataTypeToEnum<Variant>::value: {
-        const TensorList* inner_x = t.scalar<Variant>()().get<TensorList>();
-        if (inner_x == nullptr) {
-          return errors::InvalidArgument("Input handle is not a list. Saw: '",
-                                         t.scalar<Variant>()().DebugString(),
-                                         "'");
-        }
-        TensorList inner_y;
-        TF_RETURN_IF_ERROR(TensorListZerosLike<Device>(c, *inner_x, &inner_y));
-        out_tensor.scalar<Variant>()() = std::move(inner_y);
-        break;
-      }
-
-      default:
-        return errors::InvalidArgument(
-            "Trying to compute zeros_like for unsupported dtype ",
-            DataTypeString(out_tensor.dtype()));
-    }
+    TF_RETURN_IF_ERROR(ZerosLikeTensor<Device>(c, t, &out_tensor));
     y->tensors.emplace_back(out_tensor);
   }
   return Status::OK();
diff --git a/tensorflow/core/kernels/lu_op.cc b/tensorflow/core/kernels/lu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f9591d1bdf2fddea7b9d6265d4a8dd6c3f5f5df6
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op.cc
@@ -0,0 +1,193 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "third_party/eigen3/Eigen/Core"
+#include "third_party/eigen3/Eigen/LU"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/lib/math/math_util.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+
+template <typename Scalar, typename Tidx>
+class LuOp : public OpKernel {
+ public:
+  explicit LuOp(OpKernelConstruction* context) : OpKernel(context) {}
+
+ protected:
+  using TensorShapes = gtl::InlinedVector<TensorShape, 4>;
+  using TensorOutputs = gtl::InlinedVector<Tensor*, 4>;
+
+  using Matrix =
+      Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using ConstMatrixMap = Eigen::Map<const Matrix>;
+  using MatrixMap = Eigen::Map<Matrix>;
+
+  using RealScalar = typename Eigen::NumTraits<Scalar>::Real;
+
+  using Indices =
+      Eigen::Matrix<Tidx, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  using IndicesMap = Eigen::Map<Indices>;
+  using ConstIndicesMap = Eigen::Map<const Indices>;
+
+ public:
+  // Returns the cost per matrix operation. This is used to determine the
+  // number of threads to use for parallelizing factorization in batch mode.
+  // Cost per unit is assumed to be roughly 1ns, based on comments
+  // in core/util/work_sharder.cc.
+  // LU decomposition for a square matrix takes roughly (2/3) * (num_rows)^3.
+  // TODO(anudhyan): Refine this estimate after taking constant factors into
+  // account.
+  int64 GetCostPerUnit(const TensorShape& input_matrix_shape) const {
+    double num_rows = static_cast<double>(input_matrix_shape.dim_size(0));
+    double cost = (2 / 3.0) * MathUtil::IPow(num_rows, 3);
+    return cost >= static_cast<double>(kint64max) ? kint64max
+                                                  : static_cast<int64>(cost);
+  }
+
+  void Compute(OpKernelContext* context) override {
+    OP_REQUIRES(context, context->num_inputs() == 1,
+                errors::InvalidArgument("Expecting exactly one input, got ",
+                                        context->num_inputs()));
+
+    const Tensor& input = context->input(0);
+    int input_rank = input.dims();
+    OP_REQUIRES(context, input_rank >= 2,
+                errors::InvalidArgument(
+                    "Input tensor must have rank >= 2, got ", input_rank));
+
+    // If the tensor rank is greater than 2, we consider the inner-most
+    // dimensions as matrices, and loop over all the other outer ("batch")
+    // dimensions to compute the results.
+    TensorShape input_matrix_shape;
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    input_matrix_shape.AppendShape({num_rows, num_cols});
+    OP_REQUIRES(context, TensorShapeUtils::IsSquareMatrix(input_matrix_shape),
+                errors::InvalidArgument("Input matrix must be square."));
+
+    // packed_triangular_factors is a matrix with the same shape as the input;
+    // permutation is a vector.
+    TensorShape permutation_shape = batch_shape;
+    permutation_shape.AddDim(num_rows);
+
+    TensorShapes output_matrix_shapes({input.shape(), permutation_shape});
+
+    TensorOutputs outputs;
+    Tensor* output_packed_triangular_factors = nullptr;
+    OP_REQUIRES_OK(
+        context, context->forward_input_or_allocate_output(
+                     {0}, 0, input.shape(), &output_packed_triangular_factors));
+    outputs.emplace_back(output_packed_triangular_factors);
+
+    Tensor* output_permutation = nullptr;
+    OP_REQUIRES_OK(context, context->allocate_output(1, permutation_shape,
+                                                     &output_permutation));
+    outputs.emplace_back(output_permutation);
+
+    if (num_rows == 0) {
+      return;
+    }
+
+    // Process the individual matrix problems in parallel using a threadpool.
+    auto shard = [this, &input, &num_rows, &num_cols, &outputs,
+                  &output_matrix_shapes, context](int64 begin, int64 end) {
+      for (int64 i = begin; i < end; ++i) {
+        ComputeTensorSlice(context, i, input, num_rows, num_cols, outputs,
+                           output_matrix_shapes);
+      }
+    };
+    auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
+    Shard(worker_threads.num_threads, worker_threads.workers,
+          batch_shape.num_elements(), GetCostPerUnit(input_matrix_shape),
+          shard);
+  }
+
+  void ComputeTensorSlice(OpKernelContext* context, int64 matrix_index,
+                          const Tensor& input, int64 num_rows, int64 num_cols,
+                          const TensorOutputs& outputs,
+                          const TensorShapes& output_matrix_shapes) {
+    // TODO(kalakris): Handle alignment if possible. Eigen::Map is
+    // unaligned by default.
+    ConstMatrixMap input_matrix(
+        input.flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_cols);
+
+    // packed_triangular_factors has shape [num_rows, num_cols]
+    MatrixMap packed_triangular_factors(
+        outputs[0]->flat<Scalar>().data() + matrix_index * num_rows * num_cols,
+        num_rows, num_rows);
+
+    // permutation has shape [num_rows, 1]
+    IndicesMap permutation_indices(
+        outputs[1]->flat<Tidx>().data() + matrix_index * num_rows, num_rows, 1);
+
+    Eigen::PartialPivLU<Eigen::Matrix<Scalar, Eigen::Dynamic, Eigen::Dynamic>>
+        lu_decomposition(input_matrix);
+
+    // Output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    packed_triangular_factors = lu_decomposition.matrixLU();
+    // Output the permutation matrix used for pivoting.
+    Eigen::PermutationMatrix<-1, -1, Tidx> permutation =
+        lu_decomposition.permutationP().transpose();
+    permutation_indices = permutation.indices();
+
+    // PartialPivLU cannot give strong guarantees on invertibility,
+    // but we can at least guard against exact zero pivots. This can occur as
+    // a result of basic user mistakes such providing integer valued
+    // matrices that are exactly singular, or due to underflow if this
+    // code is run with denormals being flushed to zero.
+    const RealScalar min_abs_pivot =
+        packed_triangular_factors.diagonal().cwiseAbs().minCoeff();
+    OP_REQUIRES(context, min_abs_pivot > RealScalar(0),
+                errors::InvalidArgument("Input is not invertible."));
+  }
+};
+
+#define REGISTER_LU(type, idx_type)                                         \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_CPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOp<type, idx_type>);
+
+REGISTER_LU(float, int32);
+REGISTER_LU(double, int32);
+REGISTER_LU(complex64, int32);
+REGISTER_LU(complex128, int32);
+
+REGISTER_LU(float, int64);
+REGISTER_LU(double, int64);
+REGISTER_LU(complex64, int64);
+REGISTER_LU(complex128, int64);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/lu_op_gpu.cu.cc b/tensorflow/core/kernels/lu_op_gpu.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f83744b50de5ca7fd247b17e3fcac52889f5f288
--- /dev/null
+++ b/tensorflow/core/kernels/lu_op_gpu.cu.cc
@@ -0,0 +1,275 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#if GOOGLE_CUDA
+#define EIGEN_USE_GPU
+
+#include <algorithm>
+#include <vector>
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/cuda_solvers.h"
+#include "tensorflow/core/kernels/transpose_functor.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/util/cuda_kernel_helper.h"
+
+namespace tensorflow {
+
+typedef Eigen::GpuDevice GPUDevice;
+
+namespace {
+template <typename Scalar>
+__device__ void ComputePermutationFromTranspositions(
+    int64 num_rows, const int* pivots, Scalar* permutation_indices) {
+  // Fill in the output array with the identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    permutation_indices[i] = Scalar(i);
+  }
+
+  // Compute the permutation from a sequence of transpositions encoded
+  // in the pivot array by applying the transpositions in order on the
+  // identity permutation.
+  for (int i = 0; i < num_rows; ++i) {
+    // Note: Internally, the cuBlas code uses Fortran convention (1-based)
+    // indexing so ith row was swapped with (pivots[i]-1)'th row in 0-based
+    // indexing.
+    Scalar t = permutation_indices[i];
+    permutation_indices[i] = permutation_indices[pivots[i] - 1];
+    permutation_indices[pivots[i] - 1] = t;
+  }
+}
+}  // namespace
+
+// Kernel to compute the inverse of a permutation from a sequence of
+// transpositions.
+template <typename Scalar>
+__global__ void ComputePermutationFromTranspositionsKernel(
+    CudaLaunchConfig config, const int64 num_rows, const int* all_pivots,
+    Scalar* all_permutation_indices) {
+  // We only parallelize over batches here. Performance is not critical,
+  // since this cheap O(num_rows) kernel always follows an O(num_rows^3)
+  // LU factorization.
+  CUDA_1D_KERNEL_LOOP(index, config.virtual_thread_count) {
+    ComputePermutationFromTranspositions(
+        num_rows, all_pivots + index * num_rows,
+        all_permutation_indices + index * num_rows);
+  }
+}
+
+template <class Scalar, class Tidx>
+class LuOpGpu : public AsyncOpKernel {
+ public:
+  explicit LuOpGpu(OpKernelConstruction* context) : AsyncOpKernel(context) {}
+
+  void ComputeAsync(OpKernelContext* context, DoneCallback done) final {
+    const Tensor& input = context->input(0);
+
+    // Analyze shape and validate inputs.
+    const int input_rank = input.dims();
+
+    OP_REQUIRES_ASYNC(
+        context, input_rank >= 2,
+        errors::InvalidArgument("Input must have rank >= 2, got ", input_rank),
+        done);
+
+    const int64 num_rows = input.dim_size(input_rank - 2);
+    const int64 num_cols = input.dim_size(input_rank - 1);
+
+    OP_REQUIRES_ASYNC(
+        context, num_rows == num_cols,
+        errors::InvalidArgument("Input matrices must be squares, got", num_rows,
+                                " != ", num_cols),
+        done);
+
+    TensorShape batch_shape;
+    for (int dim = 0; dim < input_rank - 2; ++dim) {
+      batch_shape.AddDim(input.dim_size(dim));
+    }
+    TensorShape permutation_indices_shape = batch_shape;
+    permutation_indices_shape.AddDim(num_rows);
+
+    const GPUDevice& device = context->eigen_device<GPUDevice>();
+    auto solver = absl::make_unique<CudaSolver>(context);
+
+    // We output the packed triangular factors in a dense form.
+    // The lower triangular factor L corresponds to the strictly lower
+    // triangular part of packed_triangular_factors with an implicit unit
+    // diagonal. The upper triangular factor U is the upper triangular part of
+    // packed_triangular_factors. The triangular factors satisfy the equation
+    //     P * input_matrix = L * U
+    // where P is the permutation matrix corresponding to the indices in
+    // permutation_indices.
+    //
+    // Reuse the input buffer or make a copy for the factorization step,
+    // depending on whether this ops owns it exclusively.
+    Tensor* packed_triangular_factors;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->forward_input_or_allocate_output(
+                             {0}, 0, input.shape(), &packed_triangular_factors),
+                         done);
+    if (!packed_triangular_factors->SharesBufferWith(input)) {
+      device.memcpy(packed_triangular_factors->flat<Scalar>().data(),
+                    input.flat<Scalar>().data(),
+                    input.NumElements() * sizeof(Scalar));
+    }
+
+    // Allocate output permutation.
+    Tensor* permutation_indices = nullptr;
+    OP_REQUIRES_OK_ASYNC(context,
+                         context->allocate_output(1, permutation_indices_shape,
+                                                  &permutation_indices),
+                         done);
+
+    if (input.NumElements() == 0) {
+      done();
+      return;
+    }
+
+    // Allocate a temporary Tensor to store the transposed packed triangular
+    // factors.
+    Tensor packed_triangular_factors_transpose;
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        context->allocate_temp(DataTypeToEnum<Scalar>::value, input.shape(),
+                               &packed_triangular_factors_transpose),
+        done);
+    auto packed_triangular_factors_transpose_reshaped =
+        packed_triangular_factors_transpose
+            .template flat_inner_dims<Scalar, 3>();
+    const int64 batch_size =
+        packed_triangular_factors_transpose_reshaped.dimension(0);
+
+    // Allocate pivots on the device.
+    Tensor pivots;
+    OP_REQUIRES_OK_ASYNC(context,
+                         solver->allocate_scoped_tensor(
+                             DataTypeToEnum<int32>::value,
+                             TensorShape{batch_size, num_rows}, &pivots),
+                         done);
+    auto pivots_mat = pivots.template matrix<int32>();
+
+    // Transpose the input. This is necessary because cuBLAS assumes
+    // column-major storage while TensorFlow uses row-major.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, *packed_triangular_factors,
+                          &packed_triangular_factors_transpose),
+        done);
+
+    std::vector<DeviceLapackInfo> dev_info;
+    if (num_rows == num_cols && num_rows / batch_size <= 128) {
+      // For small matrices or large batch sizes, we use the batched
+      // interface from cuBlas.
+      auto packed_triangular_factors_ptrs = solver->GetScratchSpace<uint8>(
+          sizeof(Scalar*) * batch_size, "packed_triangular_factors_ptrs",
+          /* on_host */ true);
+      const Scalar** packed_triangular_factors_ptrs_base =
+          reinterpret_cast<const Scalar**>(
+              packed_triangular_factors_ptrs.mutable_data());
+      for (int batch = 0; batch < batch_size; ++batch) {
+        packed_triangular_factors_ptrs_base[batch] =
+            &packed_triangular_factors_transpose_reshaped(batch, 0, 0);
+      }
+      dev_info.push_back(
+          solver->GetDeviceLapackInfo(batch_size, "getrfBatched"));
+      OP_REQUIRES_OK_ASYNC(
+          context,
+          solver->GetrfBatched(num_rows, packed_triangular_factors_ptrs_base,
+                               num_rows, pivots_mat.data(), &dev_info.back(),
+                               batch_size),
+          done);
+    } else {
+      // For small batch sizes we use the non-batched interface from cuSolver,
+      // which is much faster for large matrices.
+      dev_info.push_back(solver->GetDeviceLapackInfo(batch_size, "getrf"));
+      for (int batch = 0; batch < batch_size; ++batch) {
+        OP_REQUIRES_OK_ASYNC(
+            context,
+            solver->Getrf(
+                num_rows, num_cols,
+                &packed_triangular_factors_transpose_reshaped(batch, 0, 0),
+                num_rows, &pivots_mat(batch, 0), &dev_info.back()(batch)),
+            done);
+      }
+    }
+
+    // Transpose the result since we had transposed the input.
+    OP_REQUIRES_OK_ASYNC(
+        context,
+        DoMatrixTranspose(device, packed_triangular_factors_transpose,
+                          packed_triangular_factors),
+        done);
+
+    // Pivots encode the permutation of the rows as a sequences of row swaps.
+    // For each index i, row i is swapped with row pivots[i].
+    int* pivots_ptr = pivots.flat<int>().data();
+    Tidx* permutation_indices_ptr =
+        permutation_indices->template flat<Tidx>().data();
+    CudaLaunchConfig cfgPivots = GetCudaLaunchConfig(batch_size, device);
+    ComputePermutationFromTranspositionsKernel<<<cfgPivots.block_count,
+                                                 cfgPivots.thread_per_block, 0,
+                                                 device.stream()>>>(
+        cfgPivots, num_rows, pivots_ptr, permutation_indices_ptr);
+
+    // Callback for checking info after kernels finish. Also capture the
+    // temporary Tensors/ScratchSpace so they don't get deallocated before the
+    // kernels run.
+    // TODO(rmlarsen): Use move capture once C++14 becomes available.
+    auto info_checker = [context, done, dev_info](
+                            const Status& status,
+                            const std::vector<HostLapackInfo>& host_infos) {
+      if (!status.ok() && errors::IsInvalidArgument(status) &&
+          !host_infos.empty()) {
+        for (int i = 0; i < host_infos[0].size(); ++i) {
+          // Match the CPU error message for singular matrices. Otherwise
+          // just print the original error message from the status below.
+          OP_REQUIRES_ASYNC(context, host_infos[0].data()[i] <= 0,
+                            errors::InvalidArgument("Input is not invertible."),
+                            done);
+        }
+      }
+      OP_REQUIRES_OK_ASYNC(context, status, done);
+      done();
+    };
+
+    CudaSolver::CheckLapackInfoAndDeleteSolverAsync(std::move(solver), dev_info,
+                                                    std::move(info_checker));
+  }
+};
+
+#define REGISTER_LU_GPU(type, idx_type)                                     \
+  REGISTER_KERNEL_BUILDER(Name("Lu")                                        \
+                              .Device(DEVICE_GPU)                           \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<idx_type>("output_idx_type"), \
+                          LuOpGpu<type, idx_type>);
+
+REGISTER_LU_GPU(float, int32);
+REGISTER_LU_GPU(double, int32);
+REGISTER_LU_GPU(complex64, int32);
+REGISTER_LU_GPU(complex128, int32);
+
+REGISTER_LU_GPU(float, int64);
+REGISTER_LU_GPU(double, int64);
+REGISTER_LU_GPU(complex64, int64);
+REGISTER_LU_GPU(complex128, int64);
+}  // namespace tensorflow
+
+#endif  // GOOGLE_CUDA
diff --git a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
index 0c7a236b2ff0f0b5c6287d1dffb1e8ef9bac7cc0..56d0340547a891fe4929bd6a36a72c5e03d1d1e0 100644
--- a/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/maxpooling_op_gpu.cu.cc
@@ -384,6 +384,7 @@ bool MaxPoolForwardNoMask_NCHW_VECT_C::operator()(
     int32* top_data, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
+  if (output_size == 0) return true;
   MaxPoolForwardNoMaskKernel_NCHW_VECT_C<<<
       (output_size + kThreadsPerBlock - 1) / kThreadsPerBlock, kThreadsPerBlock,
       0, d.stream()>>>(output_size, bottom_data, height, width, channels,
@@ -402,6 +403,7 @@ bool MaxPoolForwardWithOptionalArgmax<T>::operator()(
     int64* mask, const Eigen::GpuDevice& d, bool propagate_nans) {
   const int kThreadsPerBlock = 1024;
   const int output_size = batch * channels * pooled_height * pooled_width;
+  if (output_size == 0) return true;
   if (propagate_nans) {
     MaxPoolForwardNHWC<true>
         <<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
@@ -430,6 +432,7 @@ bool MaxPoolBackwardNoMask<T>::operator()(
   const int kThreadsPerBlock = 1024;
 
   const int bottom_size = batch * channels * height * width;
+  if (bottom_size == 0) return true;
   SetZero<<<(bottom_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(bottom_size, bottom_diff);
 
@@ -449,6 +452,7 @@ bool MaxPoolBackwardWithArgmax<T>::operator()(
     const int64* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d) {
   const int kThreadsPerBlock = 1024;
+  if (input_size == 0) return true;
   SetZero<<<(input_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
             kThreadsPerBlock, 0, d.stream()>>>(input_size, bottom_diff);
   MaxPoolBackward<<<(output_size + kThreadsPerBlock - 1) / kThreadsPerBlock,
@@ -466,6 +470,7 @@ bool MaxPoolGradBackwardNoMask<T>::operator()(
     const int pad_l, const T* top_diff, T* bottom_diff,
     const Eigen::GpuDevice& d) {
   const int num_kernels = batch * channels * pooled_height * pooled_width;
+  if (num_kernels == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(num_kernels, d);
 
   if (data_format == FORMAT_NHWC) {
@@ -489,6 +494,7 @@ bool MaxPoolGradBackwardWithArgmax<T>::operator()(
     const int output_size, const int input_size, const T* top_diff,
     const int64* mask, const int top_offset, const int bottom_offset,
     T* bottom_diff, const Eigen::GpuDevice& d) {
+  if (input_size == 0) return true;
   CudaLaunchConfig config = GetCudaLaunchConfig(output_size, d);
   MaxPoolGradBackward<<<config.block_count, config.thread_per_block, 0,
                         d.stream()>>>(output_size, top_diff, mask, top_offset,
diff --git a/tensorflow/core/kernels/mkl_avgpooling_op.cc b/tensorflow/core/kernels/mkl_avgpooling_op.cc
index 2409f7e9dc298a2f51145d211e984784429f7c8f..28825e1a9c6711d4daf74036896b9fea324163ea 100644
--- a/tensorflow/core/kernels/mkl_avgpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_avgpooling_op.cc
@@ -364,15 +364,15 @@ class MklAvgPoolingGradOp : public OpKernel {
                                     "1-dimensional and 4 elements"));
 
         // For avgpooling, out_backprop should have 4 dimensions.
-        OP_REQUIRES(context, out_backprop.dims() == 4,
-                    errors::InvalidArgument("out_backprop must be "
-                                            "4-dimensional"));
+        OP_REQUIRES(
+            context, out_backprop.dims() == 4,
+            errors::InvalidArgument("out_backprop must be 4-dimensional"));
       } else {
         // Input in MKL format.
         // For avgpooling, out_backprop should have 4 dimensions.
-        OP_REQUIRES(context, out_backprop_shape.GetDimension() == 4,
-                    errors::InvalidArgument("out_backprop must be "
-                                            "4-dimensional"));
+        OP_REQUIRES(
+            context, out_backprop_shape.GetDimension() == 4,
+            errors::InvalidArgument("out_backprop must be 4-dimensional"));
       }
 
       // TODO(inteltf): Get outbackprop layout.
@@ -484,9 +484,9 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
           dnn_shape_input.IsMklTensor()
               ? dnn_shape_input.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::desc input_md = dnn_shape_input.IsMklTensor()
                                   ? dnn_shape_input.GetMklLayout()
                                   : memory::desc(src_dims, MklDnnType<T>(),
@@ -494,9 +494,17 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
 
       // Get an average pooling primitive from the op pool
       MklPoolingFwdPrimitive<T>* pooling_fwd = nullptr;
+      prop_kind pooling_prop_kind;
+      bool int8_forward_inference =
+          std::is_same<T, qint8>::value || std::is_same<T, quint8>::value;
+      if (int8_forward_inference)
+        pooling_prop_kind = prop_kind::forward_inference;
+      else
+        pooling_prop_kind = prop_kind::forward_training;
       MklPoolingParams fwdParams(src_dims, output_dims_mkl_order, filter_dims,
                                  strides, padding_left, padding_right,
-                                 algorithm::pooling_avg_exclude_padding);
+                                 algorithm::pooling_avg_exclude_padding,
+                                 pooling_prop_kind);
       pooling_fwd = MklPoolingFwdPrimitiveFactory<T>::Get(fwdParams);
 
       // allocate output tensor
@@ -523,6 +531,26 @@ class MklAvgPoolingOp : public MklPoolingForwardOpBase<T> {
 
       // execute pooling
       pooling_fwd->Execute(src_data, dst_data);
+
+      // Pass min, max from input to output
+      if (int8_forward_inference) {
+        const Tensor& min_input_t = MklGetInput(context, 1);
+        const Tensor& max_input_t = MklGetInput(context, 2);
+        const float min_input = min_input_t.flat<float>()(0);
+        const float max_input = max_input_t.flat<float>()(0);
+
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        output_min->flat<float>()(0) = min_input;
+        output_max->flat<float>()(0) = max_input;
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -576,24 +604,26 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
           orig_input_mkl_shape.IsMklTensor()
               ? orig_input_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape,
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
+                                                       this->data_format_tf_);
 
       memory::dims diff_dst_dims =
           grad_mkl_shape.IsMklTensor()
               ? grad_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
-      MklPoolingParams bwdParams(orig_input_dims_mkl_order,
-                                 output_dims_mkl_order, filter_dims, strides,
-                                 padding_left, padding_right,
-                                 algorithm::pooling_avg_exclude_padding);
+      // Pass prop_kind::forward_training to create a forward primitive
+      // that is used in the backward pass
+      MklPoolingParams bwdParams(
+          orig_input_dims_mkl_order, output_dims_mkl_order, filter_dims,
+          strides, padding_left, padding_right,
+          algorithm::pooling_avg_exclude_padding, prop_kind::forward_training);
       MklPoolingBwdPrimitive<T>* pooling_bwd =
           MklPoolingBwdPrimitiveFactory<T>::Get(bwdParams);
 
@@ -660,13 +690,13 @@ class MklAvgPoolingGradOp : public MklPoolingBackwardOpBase<T> {
 
     if (!input_gradient_mkl_shape.IsMklTensor()) {
       // For avgpooling, input_gradient_diff_dst should have 4 dimensions.
-      OP_REQUIRES(context, input_gradient_tensor.dims() == 4,
-                  errors::InvalidArgument("Gradient shape must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_tensor.dims() == 4,
+          errors::InvalidArgument("Gradient shape must be 4-dimensional"));
     } else {
-      OP_REQUIRES(context, input_gradient_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Gradient shape must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, input_gradient_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Gradient shape must be 4-dimensional"));
     }
   }
 };  // MklAvgPoolingGradOp
@@ -691,6 +721,18 @@ REGISTER_KERNEL_BUILDER(Name("_MklAvgPool")
                             .Label(mkl_op_registry::kMklOpLabel),
                         MklAvgPoolingOp<CPUDevice, float>);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedAvgPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklAvgPoolingOp<CPUDevice, quint8>);
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedAvgPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklAvgPoolingOp<CPUDevice, qint8>);
+
 REGISTER_KERNEL_BUILDER(Name("_MklAvgPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_conv_ops.cc b/tensorflow/core/kernels/mkl_conv_ops.cc
index 14d134e2d0c07328a07e472fa963bafadf4a72cb..6e4fbf55c5f78158ffa811f4823d0086fb382d88 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.cc
+++ b/tensorflow/core/kernels/mkl_conv_ops.cc
@@ -850,7 +850,8 @@ REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")
 
 // Base class for convolution forward operations
 template <typename Device, typename Tinput, typename Tfilter, typename Tbias,
-          typename Toutput, typename Ttemp_output, bool biasEnabled>
+          typename Toutput, typename Ttemp_output, typename Tpadding,
+          bool biasEnabled, bool padEnabled>
 class MklConvOp : public OpKernel {
  public:
   ~MklConvOp() {}
@@ -928,6 +929,11 @@ class MklConvOp : public OpKernel {
           dilations, strides;
       memory::dims dst_dims_tf_order, dst_dims_mkl_order;
 
+      // If pad with conv2d fusion is enabled
+      if (padEnabled) {
+        PadWithConvFusion(context, padding_left, padding_right);
+      }
+
       // Get shapes of input tensors in MKL-DNN order
       MklDnnConvUtil conv_utl(context, strides_, padding_, data_format_,
                               dilations_);
@@ -936,7 +942,7 @@ class MklConvOp : public OpKernel {
       conv_utl.GetConvFwdSizesInMklOrder(
           src_tf_shape, filter_tf_shape, &src_dims, &filter_dims, &strides,
           &dilations, &dst_dims_tf_order, &dst_dims_mkl_order, &padding_left,
-          &padding_right);
+          &padding_right, padEnabled);
       if (!context->status().ok()) return;
 
       // Check for corner case - if there is nothing to compute, return.
@@ -967,7 +973,12 @@ class MklConvOp : public OpKernel {
       }
 
       bool isConv2D = (strides_.size() == 4);
-
+      // TODO(Intel-tf) Add check to make sure padEnabled is true only for 2D
+      if (!isConv2D) {
+        OP_REQUIRES(
+            context, !padEnabled,
+            errors::InvalidArgument("Pad+Conv fusion only works for 2D"));
+      }
       // Create memory for user data.
       // Describe how the inputs and outputs of Convolution look like. Also
       // specify buffers containing actual input and output data.
@@ -1067,8 +1078,14 @@ class MklConvOp : public OpKernel {
       Tfilter* filter_data = nullptr;
       if (filter_md.data.format != conv_fwd->GetFilterMemoryFormat()) {
         filter.SetUsrMem(filter_md, &filter_tensor);
-        filter.CheckReorderToOpMem(conv_fwd_pd.get()->weights_primitive_desc(),
-                                   filter.GetTensorBuffer(filter_out_tensor));
+        if (filter_out_tensor == nullptr) {
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc());
+        } else {
+          filter.CheckReorderToOpMem(
+              conv_fwd_pd.get()->weights_primitive_desc(),
+              filter.GetTensorBuffer(filter_out_tensor));
+        }
         filter_data =
             static_cast<Tfilter*>(filter.GetOpMem().get_data_handle());
       } else {
@@ -1098,6 +1115,44 @@ class MklConvOp : public OpKernel {
     }
   }
 
+  void PadWithConvFusion(OpKernelContext* context, memory::dims& padding_left,
+                         memory::dims& padding_right) {
+    const Tensor& paddings_tf = MklGetInput(context, 2);
+    OP_REQUIRES(context, paddings_tf.dims() == 2,
+                errors::InvalidArgument("paddings must be 2-dimensional: ",
+                                        paddings_tf.shape().DebugString()));
+    Tpadding* paddings = nullptr;
+    // To get individual pad, need to flatten the tensor
+    paddings = static_cast<Tpadding*>(
+        const_cast<Tpadding*>(paddings_tf.flat<Tpadding>().data()));
+    // For NHWC format:
+    // paddings[0], paddings[1], paddings[6], paddings[7] should be zero
+    // if the paddings_tf is [ [0, 0] [1,2] [3,4] [0,0] ]
+    // paddings = {0, 0, 1, 2, 3, 4, 0, 0} ; flat method is row major
+    // then, values are: top = 1, bottom =2, left=3, right=4
+    // For NCHW format:
+    // paddings[0], paddings[1], paddings[2], paddings[3] should be zero
+    // similar explanation as NHWC format will apply.
+    int64 pad_top, pad_left;
+    int64 pad_bottom, pad_right;
+    string data_format = ToString(data_format_);
+    if (data_format == "NHWC") {
+      pad_top = paddings[2];
+      pad_bottom = paddings[3];
+      pad_left = paddings[4];
+      pad_right = paddings[5];
+    } else if (data_format == "NCHW") {
+      pad_top = paddings[4];
+      pad_bottom = paddings[5];
+      pad_left = paddings[6];
+      pad_right = paddings[7];
+    }
+    // Create padding arrays for MKL DNN convolutions.
+    // MKL-DNN uses asymetric padding.
+    padding_left = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+    padding_right = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+  }
+
  protected:
   virtual void ExtendConvFwdParams(OpKernelContext* context,
                                    MklConvFwdParams& params) {
@@ -1160,6 +1215,7 @@ class MklConvOp : public OpKernel {
   Padding padding_;
   TensorFormat data_format_;
   const int kInputIndex_Src = 0, kInputIndex_Filter = 1, kInputIndex_Bias = 2;
+  const int kInputIndex_Pad = 2;
   const int kOutputIndex_Dst = 0, kOutputIndex_Filter = 1;
   const int kDilationH = 0, kDilationW = 1;
 
@@ -1232,7 +1288,7 @@ template <typename Device, typename Tbias, typename Toutput,
           typename Ttemp_output, bool biasEnabled>
 class MklQuantizedConv2DOp
     : public MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                       biasEnabled> {
+                       int32, biasEnabled, false> {
  public:
   virtual ~MklQuantizedConv2DOp() {
     if (this->input_bias_ != nullptr) {
@@ -1247,13 +1303,13 @@ class MklQuantizedConv2DOp
   }
 
   explicit MklQuantizedConv2DOp(OpKernelConstruction* context)
-      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-                  biasEnabled>(context) {}
+      : MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+                  biasEnabled, false>(context) {}
 
   void Compute(OpKernelContext* context) override {
     // Compute int32 output tensor
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::Compute(context);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::Compute(context);
 
     // Compute additional outputs: min/max scalars.
     int bias_index_offset;
@@ -1299,8 +1355,8 @@ class MklQuantizedConv2DOp
  protected:
   void ExtendConvFwdParams(OpKernelContext* context,
                            MklConvFwdParams& params) override {
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::ExtendConvFwdParams(context, params);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::ExtendConvFwdParams(context, params);
 
     // When the output type is quint8, the output data id requantized
     // into quint8. A post_op "output_scale" is added to do the conversion.
@@ -1468,7 +1524,7 @@ class MklQuantizedConv2DSumReluOp
             {"sum", {scale_summand / scale_output}});
       else
         params.post_op_params.push_back(
-            {"sum", {2.0 * scale_summand / scale_output}});
+            {"sum", {2.0f * scale_summand / scale_output}});
     } else {
       params.post_op_params.push_back({"sum", {1.0}});
     }
@@ -1511,11 +1567,11 @@ class MklQuantizedConv2DSumReluOp
       }
     }
     // TODO(mdfaijul): Add cleaner code for non-mkl tensor
-    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output,
-              biasEnabled>::AllocateOutputTensor(context, conv_prim_desc,
-                                                 output_dims_mkl_order,
-                                                 output_tf_format,
-                                                 output_tensor);
+    MklConvOp<Device, quint8, qint8, Tbias, Toutput, Ttemp_output, int32,
+              biasEnabled, false>::AllocateOutputTensor(context, conv_prim_desc,
+                                                        output_dims_mkl_order,
+                                                        output_tf_format,
+                                                        output_tensor);
     const Tensor& summand = MklGetInput(context, summand_idx);
     if (summand.dtype() != DT_FLOAT)
       TF_CHECK_OK(Status(error::Code::FAILED_PRECONDITION,
@@ -1784,34 +1840,55 @@ REGISTER_KERNEL_BUILDER(
 #endif  // INTEL_MKL_ML
 
 // Register 2D operations
-#define REGISTER_MKL_CPU_2D(T)                                         \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("_MklConv2D")                                               \
-          .Device(DEVICE_CPU)                                          \
-          .TypeConstraint<float>("T")                                  \
-          .Label(mkl_op_registry::kMklOpLabel),                        \
-      MklConvOp<CPUDevice, float, float, float, float, float, false>); \
-  REGISTER_KERNEL_BUILDER(                                             \
-      Name("_MklConv2DWithBias")                                       \
-          .Device(DEVICE_CPU)                                          \
-          .TypeConstraint<float>("T")                                  \
-          .Label(mkl_op_registry::kMklOpLabel),                        \
-      MklConvOp<CPUDevice, float, float, float, float, float, true>);  \
-  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")             \
-                              .Device(DEVICE_CPU)                      \
-                              .TypeConstraint<T>("T")                  \
-                              .Label(mkl_op_registry::kMklOpLabel),    \
+#define REGISTER_MKL_CPU_2D(T)                                             \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2D")                               \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, false>);          \
+  REGISTER_KERNEL_BUILDER(Name("_MklConv2DWithBias")                       \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, true, false>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyConv2DWithBias")                 \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklDummyOp<CPUDevice, T>);                       \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int32, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("_MklPadWithConv2D")                        \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int64>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
+                          MklConvOp<CPUDevice, float, float, float, float, \
+                                    float, int64, false, true>);           \
+  REGISTER_KERNEL_BUILDER(Name("__MklDummyPadWithConv2D")                  \
+                              .Device(DEVICE_CPU)                          \
+                              .TypeConstraint<T>("T")                      \
+                              .TypeConstraint<int32>("Tpaddings")          \
+                              .Label(mkl_op_registry::kMklOpLabel),        \
                           MklDummyOp<CPUDevice, T>);
 
 TF_CALL_float(REGISTER_MKL_CPU_2D);
 
 // Register 3D operations
-#define REGISTER_MKL_CPU_3D(T)                                      \
-  REGISTER_KERNEL_BUILDER(Name("_MklConv3D")                        \
-                              .Device(DEVICE_CPU)                   \
-                              .TypeConstraint<T>("T")               \
-                              .Label(mkl_op_registry::kMklOpLabel), \
-                          MklConvOp<CPUDevice, T, T, T, T, T, false>);
+#define REGISTER_MKL_CPU_3D(T)                  \
+  REGISTER_KERNEL_BUILDER(                      \
+      Name("_MklConv3D")                        \
+          .Device(DEVICE_CPU)                   \
+          .TypeConstraint<T>("T")               \
+          .Label(mkl_op_registry::kMklOpLabel), \
+      MklConvOp<CPUDevice, T, T, T, T, T, int32, false, false>);
 TF_CALL_float(REGISTER_MKL_CPU_3D);
 
 }  // namespace tensorflow
diff --git a/tensorflow/core/kernels/mkl_conv_ops.h b/tensorflow/core/kernels/mkl_conv_ops.h
index e6989d884d68f59e4bfe9d102dcdfcaa0946c2ed..e61c20dea9f8c3f8749c302f88a46233dab270b7 100644
--- a/tensorflow/core/kernels/mkl_conv_ops.h
+++ b/tensorflow/core/kernels/mkl_conv_ops.h
@@ -17,8 +17,8 @@ limitations under the License.
 #define TENSORFLOW_CORE_KERNELS_MKL_CONV_OPS_H_
 
 #include <limits>
-#include <vector>
 #include <memory>
+#include <vector>
 
 #include "mkldnn.hpp"
 #include "tensorflow/core/framework/numeric_op.h"
@@ -85,7 +85,7 @@ class MklDnnConvUtil {
   }
 
   // Calculate Convolution dilations
-  virtual inline void GetDilationsInMklOrder(memory::dims *dilations) {
+  virtual inline void GetDilationsInMklOrder(memory::dims* dilations) {
     // For now we take the dilation from the second and third dimensions only
     // (we do not support dilation on the batch or depth dimension).
     CHECK_NOTNULL(dilations);
@@ -288,7 +288,7 @@ class MklDnnConvUtil {
       const TensorShape& input_shape, const TensorShape& filter_shape,
       const memory::dims& strides, const memory::dims& dilations,
       memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
-      memory::dims* pad_l, memory::dims* pad_r) {
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(output_dims_tf_order);
     CHECK_NOTNULL(output_dims_mkl_order);
     CHECK_NOTNULL(pad_l);
@@ -373,6 +373,36 @@ class MklDnnConvUtil {
                                    padding_, &out_cols, &pad_left, &pad_right));
     }
 
+    if (isConv2D) {
+      // Conv + pad fusion is enabled only for 2D
+      // If padEnabled, i.e., pad and conv op are fused, then
+      // all pads are already passed from pad op through
+      // *pad_l and *pad_r
+      if (padEnabled) {
+        pad_top = static_cast<int64>((*pad_l)[0]);
+        pad_left = static_cast<int64>((*pad_l)[1]);
+        pad_bottom = static_cast<int64>((*pad_r)[0]);
+        pad_right = static_cast<int64>((*pad_r)[1]);
+        // update the out_rows and out_cols based on all
+        // sides of the pads coming from pad op.
+        out_rows = out_rows + (pad_top + pad_bottom) / stride_rows;
+        out_cols = out_cols + (pad_left + pad_right) / stride_cols;
+      }
+      // Handle padding. MKL-DNN uses asymetric padding.
+      // But, if padEnabled, i.e., pad and conv op are fused,
+      // then, *pad_l and *pad_r are already set from pad op.
+      // In that case they need not set here.
+      else {
+        *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
+        *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
+      }
+    } else {
+      // Set padding for Conv3D here
+      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
+                static_cast<int>(pad_left)};
+      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
+                static_cast<int>(pad_right)};
+    }
     // Tensorflow output is in data_format order.
     //     Conv2D: NHWC or NCHW
     //     Conv3D: NDHWC or NCDHW
@@ -393,9 +423,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims::Dim_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims::Dim_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_top), static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_bottom), static_cast<int>(pad_right)};
     } else {
       std::vector<int> mkldnn_sizes(5, -1);
       mkldnn_sizes[MklDnnDims3D::Dim3d_N] = out_batch;
@@ -404,11 +431,6 @@ class MklDnnConvUtil {
       mkldnn_sizes[MklDnnDims3D::Dim3d_H] = static_cast<int>(out_rows);
       mkldnn_sizes[MklDnnDims3D::Dim3d_W] = static_cast<int>(out_cols);
       *output_dims_mkl_order = mkldnn_sizes;
-
-      *pad_l = {static_cast<int>(pad_D1), static_cast<int>(pad_top),
-                static_cast<int>(pad_left)};
-      *pad_r = {static_cast<int>(pad_D2), static_cast<int>(pad_bottom),
-                static_cast<int>(pad_right)};
     }
   }
 
@@ -441,8 +463,8 @@ class MklDnnConvUtil {
                                           input_tf_shape.DebugString()));
     }
 
-    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape,
-                                  strides, dilations, output_dims_tf_order,
+    GetOutputAndPadSizeInMklOrder(input_tf_shape, filter_tf_shape, strides,
+                                  dilations, output_dims_tf_order,
                                   output_dims_mkl_order, pad_l, pad_r);
   }
 
@@ -457,10 +479,9 @@ class MklDnnConvUtil {
   inline void GetConvFwdSizesInMklOrder(
       const TensorShape& input_shape, const TensorShape& filter_shape,
       memory::dims* input_dims, memory::dims* filter_dims,
-      memory::dims* strides, memory::dims *dilations,
-      memory::dims* output_dims_tf_order,
-      memory::dims* output_dims_mkl_order, memory::dims* pad_l,
-      memory::dims* pad_r) {
+      memory::dims* strides, memory::dims* dilations,
+      memory::dims* output_dims_tf_order, memory::dims* output_dims_mkl_order,
+      memory::dims* pad_l, memory::dims* pad_r, bool padEnabled = false) {
     CHECK_NOTNULL(input_dims);
     CHECK_NOTNULL(filter_dims);
     CHECK_NOTNULL(strides);
@@ -476,10 +497,9 @@ class MklDnnConvUtil {
     if (!context_->status().ok()) return;
     GetStridesInMklOrder(strides);
     GetDilationsInMklOrder(dilations);
-    GetOutputAndPadSizeInMklOrder(input_shape, filter_shape,
-                                  *strides, *dilations,
-                                  output_dims_tf_order, output_dims_mkl_order,
-                                  pad_l, pad_r);
+    GetOutputAndPadSizeInMklOrder(
+        input_shape, filter_shape, *strides, *dilations, output_dims_tf_order,
+        output_dims_mkl_order, pad_l, pad_r, padEnabled);
     if (!context_->status().ok()) return;
   }
 };
@@ -536,7 +556,6 @@ class MklConvBackpropCommonOp : public OpKernel {
   TensorFormat data_format_;  // NCHW or NHWC
 };
 
-
 /////////////////////////////////////////////////////////////////////
 ///  Dummy Mkl op that is just used for operators that are intermediate
 ///  output of node fusion in the graph
diff --git a/tensorflow/core/kernels/mkl_fused_ops_test.cc b/tensorflow/core/kernels/mkl_fused_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..991fb080934883e05e38e91207a111256b885b82
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_fused_ops_test.cc
@@ -0,0 +1,162 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#include "tensorflow/cc/ops/const_op.h"
+#include "tensorflow/cc/ops/image_ops.h"
+#include "tensorflow/cc/ops/nn_ops.h"
+#include "tensorflow/cc/ops/standard_ops.h"
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/conv_ops_gpu.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+#include "tensorflow/core/platform/types.h"
+#include "tensorflow/core/public/session.h"
+
+namespace tensorflow {
+
+// Helper class for converting MKL tesnors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertAndCompare(DataType dtype, const Tensor& first,
+                         const Tensor& second, const Tensor& expected) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // Mkl second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    const Tensor& output = *GetOutput(0);
+    test::ExpectTensorNear<T>(expected, output, 1e-5);
+  }
+  void TestBody(){};
+};
+
+// Testing fusion of pad and convolution
+
+class FusedPadConvOpTest : public OpsTestBase {
+ public:
+  template <typename T>
+  void Run(DataType dtype, Tensor& image, Tensor& filter, Tensor& padding,
+           Tensor& expected, const string data_format) {
+    const int stride = 1;
+
+    // Create a fused pad+conv2d node
+    TF_EXPECT_OK(NodeDefBuilder("fused_pad_conv_op", "_MklPadWithConv2D")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(dtype))     // Filter
+                     .Input(FakeInput(DT_INT32))  // Padding
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                     .Attr("padding", "VALID")
+                     .Attr("data_format", data_format)
+                     .Attr("T", dtype)
+                     .Attr("strides", {1, stride, stride, 1})
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+
+    // Setting up inputs and execute
+    AddInputFromArray<T>(image.shape(), image.flat<T>());
+    AddInputFromArray<T>(filter.shape(), filter.flat<T>());
+    AddInputFromArray<int32>(padding.shape(), padding.flat<int32>());
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+    TF_ASSERT_OK(RunOpKernel());
+
+    // Compare output to expected results
+    const Tensor& first = *GetOutput(0);
+    const Tensor& second = *GetOutput(2);
+    ConvMklToTF conv_comp;
+    conv_comp.ConvertAndCompare<T>(dtype, first, second, expected);
+  }
+};
+
+TEST_F(FusedPadConvOpTest, PaddingConvTest) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, image_height, image_width, depth});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 3, 4, 1, 2, 0, 0});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 8, 5, 1}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NHWC");
+}
+
+TEST_F(FusedPadConvOpTest, PaddingConvTestNchw) {
+  const int depth = 1;
+  const int image_width = 4;
+  const int image_height = 3;
+  const int image_batch_count = 1;
+  Tensor image(DT_FLOAT, {image_batch_count, depth, image_height, image_width});
+  test::FillValues<float>(&image, {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+  const int filter_size = 3;
+  const int filter_count = 1;
+  Tensor filter(DT_FLOAT, {filter_size, filter_size, depth, filter_count});
+  test::FillValues<float>(&filter, {1, 4, 7, 2, 5, 8, 3, 6, 9});
+
+  const int padding_height = 4;
+  const int padding_width = 2;
+  Tensor padding(DT_INT32, {padding_height, padding_width});
+  test::FillValues<int32>(&padding, {0, 0, 0, 0, 3, 4, 1, 2});
+
+  Tensor expected(DT_FLOAT, TensorShape({1, 1, 8, 5}));
+  test::FillValues<float>(
+      &expected,
+      {0,  0,   0,   0,   0,   24, 42,  60,  33,  12,  105, 150, 183, 95,
+       32, 235, 312, 357, 178, 56, 187, 234, 261, 121, 32,  106, 126, 138,
+       59, 12,  0,   0,   0,   0,  0,   0,   0,   0,   0,   0});
+
+  Run<float>(DT_FLOAT, image, filter, padding, expected, "NCHW");
+}
+}  // namespace tensorflow
+#endif  // INTEL_MKL
diff --git a/tensorflow/core/kernels/mkl_lrn_op.cc b/tensorflow/core/kernels/mkl_lrn_op.cc
index 22ff4cd80fe6d4d0b8a85c88dd65a58b7288a351..4d46abb0a4dd232ef13c8b6b0547b0779af1f98f 100644
--- a/tensorflow/core/kernels/mkl_lrn_op.cc
+++ b/tensorflow/core/kernels/mkl_lrn_op.cc
@@ -22,6 +22,7 @@ limitations under the License.
 
 #define EIGEN_USE_THREADS
 #include <vector>
+#include "mkldnn.hpp"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "tensorflow/core/framework/op_kernel.h"
 #include "tensorflow/core/framework/register_types.h"
@@ -29,25 +30,18 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/ops_util.h"
 #include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/tensor_format.h"
 
 #if !defined(IS_MOBILE_PLATFORM)
 #include "tensorflow/core/util/work_sharder.h"
 #endif
 
-#ifndef INTEL_MKL_ML_ONLY
-#include "mkldnn.hpp"
 using mkldnn::lrn_across_channels;
 using mkldnn::lrn_backward;
 using mkldnn::lrn_forward;
 using mkldnn::prop_kind;
 using mkldnn::stream;
-#else
-#include "mkl_dnn.h"
-#include "mkl_dnn_types.h"
-#endif
-
-#include "tensorflow/core/util/mkl_util.h"
 
 namespace tensorflow {
 
@@ -69,672 +63,6 @@ void GetBandMatrix(int depth, int depth_radius,
 
 }  // namespace
 
-#ifdef INTEL_MKL_ML_ONLY
-
-template <typename T>
-class MklLRNOp : public OpKernel {
- public:
-  ~MklLRNOp() {}
-
-  explicit MklLRNOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<size_t>(depth_radius64);
-
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNOpContext mkl_context;
-
-    const Tensor& input = MklGetInput(context, 0);
-    GetMklShape(context, 0, &mkl_context.input_shape);
-    bool input_in_mkl_format = mkl_context.input_shape.IsMklTensor();
-
-    // Sanity checks
-    mkl_context.in_dims = input_in_mkl_format
-                              ? mkl_context.input_shape.GetDimension()
-                              : input.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input must be 4-dimensional"));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(input.NumElements(), std::numeric_limits<int>::max()),
-        errors::InvalidArgument("argument to LRN too large"));
-
-    if (!input_in_mkl_format) {
-      mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                    beta_, input);
-      return;
-    }
-
-    if (input_in_mkl_format) {
-      // MKL supports normalization over channel dimension only
-      if (mkl_context.input_shape.tf_dim_idx(mkl_context.in_dims - 1) ==
-          MklDims::C) {
-        mkl_context.lt_input =
-            static_cast<dnnLayout_t>(mkl_context.input_shape.GetCurLayout());
-        workspace_enabled_ = true;
-      } else {
-        Tensor converted_tensor =
-            ConvertMklToTF<T>(context, input, mkl_context.input_shape);
-        mkl_context.MklDefaultToEigen(context, depth_radius_, bias_, alpha_,
-                                      beta_, converted_tensor);
-        return;
-      }
-    }
-
-    int kernel_size = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateForward_F32(
-                 &mkl_context.lrn_fwd, NULL, mkl_context.lt_input, kernel_size,
-                 static_cast<float>(alpha_ * kernel_size), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape
-    Tensor* output = nullptr;
-    Tensor* workspace = nullptr;
-
-    // Convert Inputs if needed
-    Tensor mkl_tmp_input_buf_tensor;
-    mkl_context.MklPrepareLRNInputs(context, &mkl_tmp_input_buf_tensor);
-
-    // Allocate Layer Outputs
-    mkl_context.MklAllocateOutputs(context, &output, &workspace,
-                                   workspace_enabled_);
-
-    Tensor mkl_tmp_workspace_buf_tensor;
-    mkl_context.MklPrepareLRNOutputs(context, output, workspace,
-                                     &mkl_tmp_workspace_buf_tensor,
-                                     workspace_enabled_);
-
-    // Execute LRN.
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_fwd, mkl_context.lrn_res),
-             E_SUCCESS);
-
-    // Release MKL resources.
-    mkl_context.MklCleanup();
-  }
-
- private:
-  typedef struct {
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape input_shape;
-    dnnPrimitive_t lrn_fwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    dnnLayout_t lt_internal_workspace = nullptr;
-    dnnLayout_t lt_internal_output = nullptr;
-    void* lrn_res[dnnResourceNumber];
-
-    // Convert Inputs if needed
-    void MklPrepareLRNInputs(OpKernelContext* context,
-                             Tensor* mkl_tmp_input_buf_tensor) {
-      const Tensor& input = MklGetInput(context, 0);
-      void* mkl_buf_input =
-          const_cast<void*>(static_cast<const void*>(input.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_fwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      void* mkl_buf_convert_input = nullptr;
-      bool mkl_convert_input = false;
-      mkl_convert_input = !dnnLayoutCompare_F32(lt_internal_input, lt_input);
-
-      if (mkl_convert_input) {
-        CHECK_EQ(dnnConversionCreate_F32(&convert_input, lt_input,
-                                         lt_internal_input),
-                 E_SUCCESS);
-        AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_internal_input,
-                       &mkl_buf_convert_input);
-        CHECK_EQ(dnnConversionExecute_F32(convert_input, mkl_buf_input,
-                                          mkl_buf_convert_input),
-                 E_SUCCESS);
-        dnnDelete_F32(convert_input);
-      }
-
-      lrn_res[dnnResourceSrc] =
-          (mkl_convert_input) ? mkl_buf_convert_input : mkl_buf_input;
-    }
-
-    // Allocate Layer Outputs
-    void MklAllocateOutputs(OpKernelContext* context, Tensor** output,
-                            Tensor** workspace, bool workspace_enabled_) {
-      TensorShape mkl_output_tf_shape; /* First tensor */
-      MklShape mkl_output_mkl_shape;   /* Second tensor */
-
-      mkl_output_mkl_shape.SetMklTensor(true);
-      mkl_output_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceDst);
-      mkl_output_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                       input_shape.GetStrides());
-      mkl_output_mkl_shape.SetTfDimOrder(in_dims,
-                                         input_shape.GetTfToMklDimMap());
-      mkl_output_tf_shape.AddDim(
-          dnnLayoutGetMemorySize_F32(
-              static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-          sizeof(T));
-      AllocateOutputSetMklShape(context, 0, output,
-                                mkl_output_tf_shape /* First tensor */,
-                                mkl_output_mkl_shape /* Second Tensor */);
-
-      if (workspace_enabled_) {
-        TensorShape mkl_workspace_tf_shape; /* First tensor */
-        MklShape mkl_workspace_mkl_shape;   /* Second tensor */
-        mkl_workspace_mkl_shape.SetMklTensor(false);
-        mkl_workspace_mkl_shape.SetMklLayout(lrn_fwd, dnnResourceWorkspace);
-        // Assumes workspace has same TF layout and TF dim order as input
-        mkl_workspace_mkl_shape.SetTfLayout(in_dims, input_shape.GetSizes(),
-                                            input_shape.GetStrides());
-        mkl_workspace_mkl_shape.SetTfDimOrder(in_dims,
-                                              input_shape.GetTfToMklDimMap());
-        mkl_workspace_tf_shape.AddDim(
-            dnnLayoutGetMemorySize_F32(static_cast<dnnLayout_t>(
-                mkl_workspace_mkl_shape.GetMklLayout())) /
-            sizeof(T));
-        AllocateOutputSetMklShape(context, 1, workspace,
-                                  mkl_workspace_tf_shape /* First tensor */,
-                                  mkl_workspace_mkl_shape /* Second Tensor */);
-      }
-    }
-
-    void MklPrepareLRNOutputs(OpKernelContext* context, Tensor* output,
-                              Tensor* workspace,
-                              Tensor* mkl_tmp_workspace_buf_tensor,
-                              bool workspace_enabled_) {
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_workspace, lrn_fwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_output, lrn_fwd,
-                                                dnnResourceDst),
-               E_SUCCESS);
-
-      void* mkl_buf_output =
-          const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-      lrn_res[dnnResourceDst] = mkl_buf_output;
-
-      void* mkl_buf_workspace = nullptr;
-      if (workspace_enabled_) {
-        mkl_buf_workspace = const_cast<void*>(
-            static_cast<const void*>(workspace->flat<T>().data()));
-      } else {
-        AllocTmpBuffer(context, mkl_tmp_workspace_buf_tensor,
-                       lt_internal_workspace, &mkl_buf_workspace);
-      }
-      lrn_res[dnnResourceWorkspace] = mkl_buf_workspace;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(inteltf) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context, int depth_radius_,
-                           float bias_, float alpha_, float beta_,
-                           const Tensor& input) {
-      const int batch = static_cast<int>(input.dim_size(0));
-      const int rows = static_cast<int>(input.dim_size(1));
-      const int cols = static_cast<int>(input.dim_size(2));
-      const int depth = static_cast<int>(input.dim_size(3));
-      const int nodes = cols * rows;
-
-      auto in_shaped = input.shaped<T, 2>({nodes * batch, depth});
-      // Multiplying the input with the band matrix has the effect of reducing
-      // the
-      // correct patch along the depth.
-      Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
-      GetBandMatrix<T>(depth, depth_radius_, &multiplier);
-
-      Tensor *output, *workspace;
-      MklShape mkl_output_mkl_shape, mkl_workspace_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, input.shape(),
-                                mkl_output_mkl_shape);
-
-      mkl_workspace_mkl_shape.SetMklTensor(false);
-      mkl_workspace_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 1, &workspace, input.shape(),
-                                mkl_workspace_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
-      auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
-      if (beta_ == T(1)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.inverse();
-      } else if (beta_ == T(0.5)) {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * tmp.rsqrt();
-      } else {
-        out_shaped.device(context->eigen_cpu_device()) =
-            in_shaped * (tmp.log() * -beta_).exp();
-      }
-    }
-
-    // Release MKL resources.
-    void MklCleanup() {
-      dnnDelete_F32(lrn_fwd);
-      dnnLayoutDelete_F32(lt_internal_input);
-      dnnLayoutDelete_F32(lt_internal_workspace);
-      dnnLayoutDelete_F32(lt_internal_output);
-    }
-  } MklLRNOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-template <typename T>
-class MklLRNGradOp : public OpKernel {
- public:
-  explicit MklLRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
-    int64 depth_radius64;
-    OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
-    OP_REQUIRES(
-        context,
-        FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
-        errors::InvalidArgument("depth_radius = ", depth_radius64,
-                                " larger than int max"));
-    depth_radius_ = static_cast<int>(depth_radius64);
-    OP_REQUIRES_OK(context, context->GetAttr("bias", &bias_));
-    OP_REQUIRES_OK(context, context->GetAttr("alpha", &alpha_));
-    OP_REQUIRES_OK(context, context->GetAttr("beta", &beta_));
-    workspace_enabled_ = false;
-    OP_REQUIRES_OK(context,
-                   context->GetAttr("workspace_enabled", &workspace_enabled_));
-  }
-
-  void Compute(OpKernelContext* context) override {
-    MklLRNGradOpContext mkl_context;
-    mkl_context.depth_radius_ = depth_radius_;
-    mkl_context.bias_ = bias_;
-    mkl_context.alpha_ = alpha_;
-    mkl_context.beta_ = beta_;
-
-    const Tensor& in_grads = MklGetInput(context, 0);
-    const Tensor& in_image = MklGetInput(context, 1);
-    const Tensor& out_image = MklGetInput(context, 2);
-
-    GetMklShape(context, 0, &mkl_context.ingrad_shape);
-    GetMklShape(context, 1, &mkl_context.inimage_shape);
-    GetMklShape(context, 2, &mkl_context.outimage_shape);
-
-    bool ingrad_in_mkl_format = mkl_context.ingrad_shape.IsMklTensor();
-    bool inimage_in_mkl_format = mkl_context.inimage_shape.IsMklTensor();
-    bool outimage_in_mkl_format = mkl_context.outimage_shape.IsMklTensor();
-
-    mkl_context.in_dims = inimage_in_mkl_format
-                              ? mkl_context.inimage_shape.GetDimension()
-                              : in_image.dims();
-    OP_REQUIRES(context, mkl_context.in_dims == 4,
-                errors::InvalidArgument("input images must be 4-dimensional"));
-
-    if (!workspace_enabled_) {
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    if (ingrad_in_mkl_format || inimage_in_mkl_format) {
-      const MklShape* tmp_mkl_shape = (ingrad_in_mkl_format)
-                                          ? &mkl_context.ingrad_shape
-                                          : &mkl_context.inimage_shape;
-      if (tmp_mkl_shape->tf_dim_idx(mkl_context.in_dims - 1) != MklDims::C) {
-        // Fallback to eigen
-        mkl_context.MklDefaultToEigen(context);
-        return;
-      } else {  // MKL supports normalization over channel dimension only
-        for (int i = 0; i < mkl_context.in_dims; i++) {
-          mkl_context.in_sizes[i] = mkl_context.out_sizes[i] =
-              tmp_mkl_shape->GetSizes()[i];
-          mkl_context.in_strides[i] = mkl_context.out_strides[i] =
-              tmp_mkl_shape->GetStrides()[i];
-        }
-      }
-    } else {
-      // Fallback to eigen
-      mkl_context.MklDefaultToEigen(context);
-      return;
-    }
-
-    // Dimensions check for sanity purpose
-    if (ingrad_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.ingrad_shape.GetDimension() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, in_grads.dims() == 4,
-          errors::InvalidArgument("input gradient must be 4-dimensional"));
-    }
-
-    if (outimage_in_mkl_format) {
-      OP_REQUIRES(
-          context, mkl_context.outimage_shape.GetDimension() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    } else {
-      OP_REQUIRES(
-          context, out_image.dims() == 4,
-          errors::InvalidArgument("Output image must be 4-dimensional"));
-    }
-
-    // Prepare mkl input layout
-    mkl_context.MklPrepareLRNInputsLayouts(context);
-    int ksize = 2 * depth_radius_ + 1;
-
-    CHECK_EQ(dnnLRNCreateBackward_F32(
-                 &mkl_context.lrn_bwd, NULL, mkl_context.lt_input,
-                 mkl_context.lt_output, ksize,
-                 static_cast<float>(alpha_ * ksize), beta_, bias_),
-             E_SUCCESS);
-
-    // Allocate output tensor and shape.
-    TensorShape mkl_output_tf_shape; /* First tensor */
-    MklShape mkl_output_mkl_shape;   /* Second tensor */
-    mkl_output_mkl_shape.SetMklTensor(true);
-    CHECK_NE(mkl_context.lrn_bwd, nullptr);
-    mkl_output_mkl_shape.SetMklLayout(mkl_context.lrn_bwd, dnnResourceDiffSrc);
-    mkl_output_mkl_shape.SetTfLayout(mkl_context.in_dims, mkl_context.out_sizes,
-                                     mkl_context.out_strides);
-    if (ingrad_in_mkl_format) {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.ingrad_shape.GetTfToMklDimMap());
-    } else {
-      mkl_output_mkl_shape.SetTfDimOrder(
-          mkl_context.in_dims, mkl_context.inimage_shape.GetTfToMklDimMap());
-    }
-    mkl_output_tf_shape.AddDim(
-        dnnLayoutGetMemorySize_F32(
-            static_cast<dnnLayout_t>(mkl_output_mkl_shape.GetMklLayout())) /
-        sizeof(T));
-    Tensor* output = nullptr;
-    AllocateOutputSetMklShape(context, 0, &output, mkl_output_tf_shape,
-                              mkl_output_mkl_shape);
-
-    // Get pointers to output data.
-    void* user_output =
-        const_cast<void*>(static_cast<const void*>(output->flat<T>().data()));
-
-    Tensor mkl_tmp_input_buf_tensor, mkl_tmp_image_buf_tensor,
-        mkl_tmp_outimage_buf_tensor;
-    // Convert Inputs if needed
-    mkl_context.MklPrepareLRNGradInput(context, &mkl_tmp_input_buf_tensor,
-                                       &mkl_tmp_image_buf_tensor,
-                                       &mkl_tmp_outimage_buf_tensor);
-
-    // We do not do any conversion for output. But we simply emit it
-    // in MKL format.
-    mkl_context.res_lrn_bwd[dnnResourceDiffSrc] = user_output;
-    // Execute LRN backward using dnnExecute
-    CHECK_EQ(dnnExecute_F32(mkl_context.lrn_bwd, mkl_context.res_lrn_bwd),
-             E_SUCCESS);
-    // Release MKL resources.
-    mkl_context.Mklcleanup();
-  }
-
- private:
-  typedef struct {
-    int depth_radius_;
-    float bias_;
-    float alpha_;
-    float beta_;
-    size_t in_dims;
-    size_t in_sizes[4];
-    size_t in_strides[4];
-    size_t out_sizes[4];
-    size_t out_strides[4];
-    MklShape ingrad_shape, inimage_shape, outimage_shape;
-    dnnPrimitive_t lrn_bwd = nullptr;
-    dnnPrimitive_t convert_input = nullptr;
-    dnnLayout_t lt_input = nullptr;
-    dnnLayout_t lt_output = nullptr;
-    dnnLayout_t lt_bdw_input = nullptr;
-    dnnLayout_t lt_workspace = nullptr;
-    dnnLayout_t lt_internal_input = nullptr;
-    void* res_lrn_bwd[dnnResourceNumber];
-
-    // prepare mkl input
-    void MklPrepareLRNInputsLayouts(OpKernelContext* context) {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutCreate_F32(&lt_input, in_dims, in_sizes, in_strides),
-                 E_SUCCESS);
-      } else {
-        lt_input = static_cast<dnnLayout_t>(ingrad_shape.GetCurLayout());
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(
-            dnnLayoutCreate_F32(&lt_output, in_dims, out_sizes, out_strides),
-            E_SUCCESS);
-      } else {
-        lt_output = static_cast<dnnLayout_t>(inimage_shape.GetCurLayout());
-      }
-    }
-
-    // convert input if needed
-    void MklPrepareLRNGradInput(OpKernelContext* context,
-                                Tensor* mkl_tmp_input_buf_tensor,
-                                Tensor* mkl_tmp_image_buf_tensor,
-                                Tensor* mkl_tmp_outimage_buf_tensor) {
-      const Tensor& in_grads = MklGetInput(context, 0);
-      const Tensor& in_image = MklGetInput(context, 1);
-      const Tensor& workspace = MklGetInput(
-          context,
-          3); /*Worskpsace is enabled, get the buffer to the workspace */
-
-      void* user_input = const_cast<void*>(
-          static_cast<const void*>(in_grads.flat<T>().data()));
-      void* user_fwd_input = const_cast<void*>(
-          static_cast<const void*>(in_image.flat<T>().data()));
-      void* workspace_buffer = const_cast<void*>(
-          static_cast<const void*>(workspace.flat<T>().data()));
-
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_workspace, lrn_bwd,
-                                                dnnResourceWorkspace),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_bdw_input, lrn_bwd,
-                                                dnnResourceDiffDst),
-               E_SUCCESS);
-      CHECK_EQ(dnnLayoutCreateFromPrimitive_F32(&lt_internal_input, lrn_bwd,
-                                                dnnResourceSrc),
-               E_SUCCESS);
-
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      if (ingrad_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          ingrad_shape.GetConvertedFlatData(lt_bdw_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(lt_bdw_input, lt_input)) {
-          CHECK_EQ(
-              dnnConversionCreate_F32(&convert_input, lt_input, lt_bdw_input),
-              E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_input_buf_tensor, lt_bdw_input,
-                         &res_lrn_bwd[dnnResourceDiffDst]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_input,
-                                            res_lrn_bwd[dnnResourceDiffDst]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceDiffDst] = user_input;
-        }
-      }
-
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (inimage_in_mkl_format) {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          ingrad_shape.GetConvertedFlatData(lt_internal_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      } else {
-        if (!dnnLayoutCompare_F32(
-                lt_internal_input,
-                static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()))) {
-          CHECK_EQ(dnnConversionCreate_F32(
-                       &convert_input,
-                       static_cast<dnnLayout_t>(inimage_shape.GetCurLayout()),
-                       lt_internal_input),
-                   E_SUCCESS);
-
-          AllocTmpBuffer(context, mkl_tmp_image_buf_tensor, lt_internal_input,
-                         &res_lrn_bwd[dnnResourceSrc]);
-          CHECK_EQ(dnnConversionExecute_F32(convert_input, user_fwd_input,
-                                            res_lrn_bwd[dnnResourceSrc]),
-                   E_SUCCESS);
-          dnnDelete_F32(convert_input);
-        } else {
-          res_lrn_bwd[dnnResourceSrc] = user_fwd_input;
-        }
-      }
-
-      res_lrn_bwd[dnnResourceWorkspace] = workspace_buffer;
-    }
-
-    // Fallback implementation - Taken from lrn_op.cc
-    // TODO(intelft) Check if we can use EigenLRNOp directly instead of making a
-    // copy.
-    void MklDefaultToEigen(OpKernelContext* context) {
-      Tensor in_grads;
-      Tensor in_image;
-      Tensor out_image;
-
-      GetMklShape(context, 0, &ingrad_shape);
-      GetMklShape(context, 1, &inimage_shape);
-      GetMklShape(context, 2, &outimage_shape);
-
-      if (ingrad_shape.IsMklTensor()) {
-        in_grads =
-            ConvertMklToTF<T>(context, MklGetInput(context, 0), ingrad_shape);
-      } else {
-        in_grads = MklGetInput(context, 0);
-      }
-
-      if (inimage_shape.IsMklTensor()) {
-        in_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 1), inimage_shape);
-      } else {
-        in_image = MklGetInput(context, 1);
-      }
-
-      if (outimage_shape.IsMklTensor()) {
-        out_image =
-            ConvertMklToTF<T>(context, MklGetInput(context, 2), outimage_shape);
-      } else {
-        out_image = MklGetInput(context, 2);
-      }
-
-      const int64 batch = static_cast<int64>(in_grads.dim_size(0));
-      const int64 rows = static_cast<int64>(in_grads.dim_size(1));
-      const int64 cols = static_cast<int64>(in_grads.dim_size(2));
-      const int64 depth = static_cast<int64>(in_grads.dim_size(3));
-      const auto nodes = cols * rows;
-
-      auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
-
-      auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
-      auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
-
-      Tensor* output;
-      MklShape mkl_output_mkl_shape;
-      mkl_output_mkl_shape.SetMklTensor(false);
-      mkl_output_mkl_shape.SetDimensions(4);
-      AllocateOutputSetMklShape(context, 0, &output, in_grads.shape(),
-                                mkl_output_mkl_shape);
-
-      auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
-      out_shaped.setZero();
-      auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
-                    depth](int64 begin, int64 end) {
-        for (int64 i = begin; i < end; ++i) {
-          for (int64 j = 0; j < depth; ++j) {
-            int64 depth_begin = std::max<int64>(0, j - depth_radius_);
-            int64 depth_end = std::min<int64>(depth, j + depth_radius_ + 1);
-
-            T norm(0);
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              norm += in_shaped(i, k) * in_shaped(i, k);
-            }
-            norm = alpha_ * norm + bias_;
-            DCHECK_GT(norm, T(1e-6));
-            for (int64 k = depth_begin; k < depth_end; ++k) {
-              T dyi = T(-2) * alpha_ * beta_ * in_shaped(i, k) *
-                      activations(i, j) / norm;
-              if (k == j) {
-                dyi += Eigen::numext::pow(norm, -beta_);
-              }
-              dyi *= grads_shaped(i, j);
-              const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) +=
-                  dyi;
-            }
-          }
-        }
-      };
-      auto worker_threads =
-          *(context->device()->tensorflow_cpu_worker_threads());
-      Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
-            depth * depth, shard);
-    }
-
-    // release mkl resources
-    void Mklcleanup() {
-      bool ingrad_in_mkl_format = ingrad_shape.IsMklTensor();
-      bool inimage_in_mkl_format = inimage_shape.IsMklTensor();
-      if (!ingrad_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_input), E_SUCCESS);
-      }
-
-      if (!inimage_in_mkl_format) {
-        CHECK_EQ(dnnLayoutDelete_F32(lt_output), E_SUCCESS);
-      }
-      dnnDelete_F32(lrn_bwd);
-      dnnLayoutDelete_F32(lt_bdw_input);
-      dnnLayoutDelete_F32(lt_workspace);
-    }
-  } MklLRNGradOpContext;
-
-  typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
-  bool workspace_enabled_;
-  int depth_radius_;
-  float bias_;
-  float alpha_;
-  float beta_;
-};
-
-#else
-
 template <typename T>
 class MklLRNOp : public OpKernel {
  public:
@@ -847,7 +175,6 @@ class MklLRNOp : public OpKernel {
                             MklDnnData<T>* src_dnn_data,
                             MklDnnData<T>* dst_dnn_data,
                             MklDnnData<uint8>* wksp_dnn_data = nullptr) {
-
     // Check for input reorder
     src_dnn_data->CheckReorderToOpMem(lrn_fwd_desc.src_primitive_desc());
 
@@ -1160,7 +487,6 @@ class MklLRNGradOp : public OpKernel {
       MklDnnData<T>* output_diff_src,
       const memory::primitive_desc& target_diff_dst_pd,
       const MklDnnData<uint8>* workspace_dnn_data = nullptr) {
-
     // Check for input reordering on the diff dst input
     input_gradient_diff_dst->CheckReorderToOpMem(
         lrn_bkwd_desc.diff_dst_primitive_desc());
@@ -1345,8 +671,6 @@ class MklLRNGradOp : public OpKernel {
   float beta_;
 };
 
-#endif  // INTEL_MKL_ML_ONLY
-
 #define REGISTER_MKL_LRN_CPU(T)                                     \
   REGISTER_KERNEL_BUILDER(Name("_MklLRN")                           \
                               .Device(DEVICE_CPU)                   \
diff --git a/tensorflow/core/kernels/mkl_maxpooling_op.cc b/tensorflow/core/kernels/mkl_maxpooling_op.cc
index 256d48f4d5d56995fbca31c18cf29c902831679b..cb494f6c3ec75d36bad42669fd0addcfa31b8bf7 100644
--- a/tensorflow/core/kernels/mkl_maxpooling_op.cc
+++ b/tensorflow/core/kernels/mkl_maxpooling_op.cc
@@ -520,7 +520,6 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
 
       MklDnnData<T> dnn_data_input(&cpu_engine);
       MklDnnData<T> dnn_data_output(&cpu_engine);
-      MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
 
       // initialize variables for the pooling op
       MklPoolParameters pool_params;
@@ -550,13 +549,13 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
           dnn_shape_input.IsMklTensor()
               ? dnn_shape_input.GetMklLayout()
               : is_pool2d ? memory::desc(
-                               TFShapeToMklDnnDimsInNCHW(input_tensor_shape,
-                                                         this->data_format_tf_),
-                               MklDnnType<T>(), this->data_format_mkldnn_)
-                         : memory::desc(
-                               TFShapeToMklDnnDimsInNCDHW(
-                                   input_tensor_shape, this->data_format_tf_),
-                               MklDnnType<T>(), this->data_format_mkldnn_);
+                                TFShapeToMklDnnDimsInNCHW(
+                                    input_tensor_shape, this->data_format_tf_),
+                                MklDnnType<T>(), this->data_format_mkldnn_)
+                          : memory::desc(
+                                TFShapeToMklDnnDimsInNCDHW(
+                                    input_tensor_shape, this->data_format_tf_),
+                                MklDnnType<T>(), this->data_format_mkldnn_);
 
       // Get src/filter/stride/padding information
       memory::dims src_dims =
@@ -564,17 +563,24 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
               ? dnn_shape_input.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(input_tensor.shape(),
                                                       this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
-                                                      this->data_format_tf_);
+                          : TFShapeToMklDnnDimsInNCDHW(input_tensor.shape(),
+                                                       this->data_format_tf_);
       memory::dims filter_dims, strides, padding_left, padding_right;
       this->PoolParamsToDims(&pool_params, &filter_dims, &strides,
                              &padding_left, &padding_right, is_pool2d);
 
       // Get a pooling op from the cached pool
       MklPoolingFwdPrimitive<T>* pooling_fwd = nullptr;
+      prop_kind pooling_prop_kind;
+      bool int8_forward_inference =
+          std::is_same<T, qint8>::value || std::is_same<T, quint8>::value;
+      if (int8_forward_inference)
+        pooling_prop_kind = prop_kind::forward_inference;
+      else
+        pooling_prop_kind = prop_kind::forward_training;
       MklPoolingParams fwdParams(src_dims, output_dims_mkl_order, filter_dims,
                                  strides, padding_left, padding_right,
-                                 algorithm::pooling_max);
+                                 algorithm::pooling_max, pooling_prop_kind);
       pooling_fwd = MklPoolingFwdPrimitiveFactory<T>::Get(fwdParams);
 
       // allocate output tensor
@@ -586,10 +592,6 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
                                 pooling_fwd->GetDstMemoryFormat(),
                                 output_tensor);
 
-      AllocateWorkspaceTensor(context, *(pooling_fwd->GetPoolingFwdPd()),
-                              &dnn_data_wksp);
-      OP_REQUIRES_OK(context, context->status());
-
       // check wehther we need to reorder src
       const T* src_data = input_tensor.flat<T>().data();
       if (input_md.data.format != pooling_fwd->GetSrcMemoryFormat()) {
@@ -603,10 +605,39 @@ class MklMaxPoolingOp : public MklPoolingForwardOpBase<T> {
       }
 
       T* dst_data = output_tensor->flat<T>().data();
-      void* ws_data = dnn_data_wksp.GetOpMem().get_data_handle();
 
-      // execute pooling op
-      pooling_fwd->Execute(src_data, dst_data, ws_data);
+      if (int8_forward_inference) {
+        // Execute pooling op
+        pooling_fwd->Execute(src_data, dst_data);
+
+        // pass min, max from input to output
+        const Tensor& min_input_t = MklGetInput(context, 1);
+        const Tensor& max_input_t = MklGetInput(context, 2);
+        const float min_input = min_input_t.flat<float>()(0);
+        const float max_input = max_input_t.flat<float>()(0);
+
+        Tensor* output_min = nullptr;
+        Tensor* output_max = nullptr;
+        MklDnnShape output_min_mkl_shape, output_max_mkl_shape;
+        output_min_mkl_shape.SetMklTensor(false);
+        output_max_mkl_shape.SetMklTensor(false);
+        AllocateOutputSetMklShape(context, 1, &output_min, {},
+                                  output_min_mkl_shape);
+        AllocateOutputSetMklShape(context, 2, &output_max, {},
+                                  output_max_mkl_shape);
+        output_min->flat<float>()(0) = min_input;
+        output_max->flat<float>()(0) = max_input;
+      } else {
+        MklDnnData<uint8> dnn_data_wksp(&cpu_engine);
+        AllocateWorkspaceTensor(context, *(pooling_fwd->GetPoolingFwdPd()),
+                                &dnn_data_wksp);
+        OP_REQUIRES_OK(context, context->status());
+        T* ws_data =
+            static_cast<T*>(dnn_data_wksp.GetOpMem().get_data_handle());
+
+        // execute pooling op
+        pooling_fwd->Execute(src_data, dst_data, ws_data);
+      }
     } catch (mkldnn::error& e) {
       string error_msg = "Status: " + std::to_string(e.status) +
                          ", message: " + string(e.message) + ", in file " +
@@ -684,24 +715,25 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
           orig_input_mkl_shape.IsMklTensor()
               ? orig_input_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(orig_input_shape,
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(orig_input_shape,
+                                                       this->data_format_tf_);
 
       memory::dims diff_dst_dims =
           grad_mkl_shape.IsMklTensor()
               ? grad_mkl_shape.GetSizesAsMklDnnDims()
               : is_pool2d ? TFShapeToMklDnnDimsInNCHW(grad_tensor.shape(),
-                                                     this->data_format_tf_)
-                         : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
-                                                      this->data_format_tf_);
+                                                      this->data_format_tf_)
+                          : TFShapeToMklDnnDimsInNCDHW(grad_tensor.shape(),
+                                                       this->data_format_tf_);
 
       memory::dims output_dims_mkl_order;
       this->GetOutputDims(pool_params, &output_dims_mkl_order);
 
       MklPoolingParams bwdParams(
           orig_input_dims_mkl_order, output_dims_mkl_order, filter_dims,
-          strides, padding_left, padding_right, algorithm::pooling_max);
+          strides, padding_left, padding_right, algorithm::pooling_max,
+          prop_kind::forward_training);
       MklPoolingBwdPrimitive<T>* pooling_bwd =
           MklPoolingBwdPrimitiveFactory<T>::Get(bwdParams);
 
@@ -788,39 +820,38 @@ class MklMaxPoolingGradOp : public MklPoolingBackwardOpBase<T> {
                          const MklDnnShape& workspace_mkl_shape) {
     if (!orig_input_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, orig_input_tensor.dims() == 4,
-                  errors::InvalidArgument("Original input shape must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument(
+                      "Original input shape must be 4-dimensional"));
     } else {
       OP_REQUIRES(context, orig_input_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Original input shape must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument(
+                      "Original input shape must be 4-dimensional"));
     }
     if (!orig_output_mkl_shape.IsMklTensor()) {
-      OP_REQUIRES(context, orig_output_tensor.dims() == 4,
-                  errors::InvalidArgument("Original output must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_tensor.dims() == 4,
+          errors::InvalidArgument("Original output must be 4-dimensional"));
     } else {
-      OP_REQUIRES(context, orig_output_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Original output must be "
-                                          "4-dimensional"));
+      OP_REQUIRES(
+          context, orig_output_mkl_shape.GetDimension() == 4,
+          errors::InvalidArgument("Original output must be 4-dimensional"));
     }
     if (!grad_mkl_shape.IsMklTensor()) {
       OP_REQUIRES(context, grad_tensor.dims() == 4,
                   errors::InvalidArgument("Gradient must be 4-dimensional"));
     } else {
       OP_REQUIRES(context, grad_mkl_shape.GetDimension() == 4,
-                  errors::InvalidArgument("Gradient must be "
-                                          "4-dimensional"));
+                  errors::InvalidArgument("Gradient must be 4-dimensional"));
     }
     if (this->workspace_enabled_) {
       // The workspace should not be an MKL tensor
       OP_REQUIRES(context, workspace_mkl_shape.IsMklTensor() == false,
-                  errors::InvalidArgument("Workspace tensor should not"
-                                          " be an MKL Tensor."));
+                  errors::InvalidArgument(
+                      "Workspace tensor should not be an MKL Tensor."));
       // It should only have one dimension
-      OP_REQUIRES(context, workspace_tensor.dims() == 1,
-                  errors::InvalidArgument("Workspace tensor must be "
-                                          "1-dimensional"));
+      OP_REQUIRES(
+          context, workspace_tensor.dims() == 1,
+          errors::InvalidArgument("Workspace tensor must be 1-dimensional"));
     } else {
       OP_REQUIRES(
           context, this->workspace_enabled_,
@@ -852,6 +883,18 @@ REGISTER_KERNEL_BUILDER(Name("_MklMaxPool")
                             .Label(mkl_op_registry::kMklOpLabel),
                         MklMaxPoolingOp<CPUDevice, float>);
 
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedMaxPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<quint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklMaxPoolingOp<CPUDevice, quint8>);
+
+REGISTER_KERNEL_BUILDER(Name("_MklQuantizedMaxPool")
+                            .Device(DEVICE_CPU)
+                            .TypeConstraint<qint8>("T")
+                            .Label(mkl_op_registry::kMklQuantizedOpLabel),
+                        MklMaxPoolingOp<CPUDevice, qint8>);
+
 REGISTER_KERNEL_BUILDER(Name("_MklMaxPoolGrad")
                             .Device(DEVICE_CPU)
                             .TypeConstraint<float>("T")
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.cc b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
index 5398e6113f53ecc0516dd87d0148eae63b1aae10..dc84d3941e78a2232041b2dbcf83bf3545982dee 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.cc
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.cc
@@ -41,28 +41,33 @@ void MklPoolingFwdPrimitive<T>::Setup(const MklPoolingParams& fwdParams) {
       << "Pooling algorithm kind is not supported";
 
   context_.alg_kind = fwdParams.alg_kind;
+  context_.prop_kind = fwdParams.prop_kind;
+
   // create memory desc
   // FIXME: Pooling doesn't expose to get the src_primitive_desc,
   //        so src format is currently hard-coded.
   //        A utility function is used to do this,
   //        which may be broken with future CPU architectures
   bool is_2d = (fwdParams.src_dims.size() == 4);
-  context_.src_md.reset(
-      new memory::desc({fwdParams.src_dims}, MklDnnType<T>(),
-                       get_desired_format(fwdParams.src_dims[1], is_2d)));
+  if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value)
+    context_.src_fmt = is_2d ? memory::format::nhwc : memory::format::ndhwc;
+  else
+    context_.src_fmt = get_desired_format(fwdParams.src_dims[1], is_2d);
+
+  context_.src_md.reset(new memory::desc({fwdParams.src_dims}, MklDnnType<T>(),
+                                         context_.src_fmt));
   context_.dst_md.reset(new memory::desc({fwdParams.dst_dims}, MklDnnType<T>(),
                                          memory::format::any));
 
   // create a pooling descriptor
   context_.fwd_desc.reset(new pooling_forward::desc(
-      prop_kind::forward_training, fwdParams.alg_kind, *context_.src_md,
+      fwdParams.prop_kind, fwdParams.alg_kind, *context_.src_md,
       *context_.dst_md, fwdParams.strides, fwdParams.filter_dims,
       fwdParams.padding_left, fwdParams.padding_right, padding_kind::zero));
   context_.fwd_pd.reset(
       new pooling_forward::primitive_desc(*context_.fwd_desc, cpu_engine_));
 
   // store expected primitive format
-  context_.src_fmt = get_desired_format(fwdParams.src_dims[1], is_2d);
   context_.dst_fmt = static_cast<mkldnn::memory::format>(
       context_.fwd_pd.get()->dst_primitive_desc().desc().data.format);
 
@@ -74,7 +79,8 @@ void MklPoolingFwdPrimitive<T>::Setup(const MklPoolingParams& fwdParams) {
       new memory(context_.fwd_pd.get()->dst_primitive_desc(), DummyData));
 
   // for max pooling, need to return workspace(ws) for backward computing
-  if (fwdParams.alg_kind == pooling_max) {
+  if (fwdParams.alg_kind == pooling_max &&
+      fwdParams.prop_kind == prop_kind::forward_training) {
     auto ws_pd = context_.fwd_pd.get()->workspace_primitive_desc().desc().data;
     // store workspace's dims and format to create workspace tensor
     context_.ws_fmt = static_cast<mkldnn::memory::format>(ws_pd.format);
@@ -101,7 +107,9 @@ void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
   context_.src_mem->set_data_handle(
       static_cast<void*>(const_cast<T*>(src_data)));
   context_.dst_mem->set_data_handle(static_cast<void*>(dst_data));
-  if (context_.alg_kind == pooling_max) {  // max pooling must have ws
+  if (context_.alg_kind == pooling_max &&
+      context_.prop_kind ==
+          prop_kind::forward_training) {  // max pooling must have ws
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(ws_data);
   }
@@ -110,13 +118,17 @@ void MklPoolingFwdPrimitive<T>::Execute(const T* src_data, T* dst_data,
   // set back data handle
   context_.src_mem->set_data_handle(DummyData);
   context_.dst_mem->set_data_handle(DummyData);
-  if (context_.alg_kind == pooling_max) {  // max pooling must have ws
+  if (context_.alg_kind == pooling_max &&
+      context_.prop_kind ==
+          prop_kind::forward_training) {  // max pooling must have ws
     DCHECK(ws_data != nullptr);
     context_.ws_mem->set_data_handle(DummyData);
   }
 }
 
 template class MklPoolingFwdPrimitive<float>;
+template class MklPoolingFwdPrimitive<quint8>;
+template class MklPoolingFwdPrimitive<qint8>;
 
 template <typename T>
 void MklPoolingBwdPrimitive<T>::Setup(const MklPoolingParams& bwdParams) {
@@ -143,7 +155,7 @@ void MklPoolingBwdPrimitive<T>::Setup(const MklPoolingParams& bwdParams) {
   // create a forward primitive,
   // which will be used as a hint for creating backward primitive
   context_.fwd_desc.reset(new pooling_forward::desc(
-      prop_kind::forward_training, bwdParams.alg_kind, *context_.diff_src_md,
+      bwdParams.prop_kind, bwdParams.alg_kind, *context_.diff_src_md,
       *context_.diff_dst_md, bwdParams.strides, bwdParams.filter_dims,
       bwdParams.padding_left, bwdParams.padding_right, padding_kind::zero));
   context_.fwd_pd.reset(
diff --git a/tensorflow/core/kernels/mkl_pooling_ops_common.h b/tensorflow/core/kernels/mkl_pooling_ops_common.h
index 49f799d7ba2d28bf90bbb4ebd5ada33f0e5d620e..6e42b70d14919f7a15ace0dd9035b4fd57a82a76 100644
--- a/tensorflow/core/kernels/mkl_pooling_ops_common.h
+++ b/tensorflow/core/kernels/mkl_pooling_ops_common.h
@@ -18,8 +18,8 @@ limitations under the License.
 
 #ifdef INTEL_MKL
 #include <memory>
-#include <vector>
 #include <string>
+#include <vector>
 #include "tensorflow/core/util/mkl_util.h"
 #include "tensorflow/core/util/padding.h"
 
@@ -50,18 +50,20 @@ struct MklPoolingParams {
   memory::dims padding_left;
   memory::dims padding_right;
   mkldnn::algorithm alg_kind;
+  mkldnn::prop_kind prop_kind;
 
   MklPoolingParams(memory::dims src_dims, memory::dims dst_dims,
                    memory::dims filter_dims, memory::dims strides,
                    memory::dims padding_left, memory::dims padding_right,
-                   mkldnn::algorithm alg_kind)
+                   mkldnn::algorithm alg_kind, mkldnn::prop_kind prop_kind)
       : src_dims(src_dims),
         dst_dims(dst_dims),
         filter_dims(filter_dims),
         strides(strides),
         padding_left(padding_left),
         padding_right(padding_right),
-        alg_kind(alg_kind) {}
+        alg_kind(alg_kind),
+        prop_kind(prop_kind) {}
 };
 
 template <typename T>
@@ -97,6 +99,9 @@ class MklPoolingFwdPrimitive : public MklPrimitive {
     // algorithm
     mkldnn::algorithm alg_kind;
 
+    // Kind of propagation, forward or backward
+    mkldnn::prop_kind prop_kind;
+
     // expected memory format
     memory::format src_fmt;
     memory::format dst_fmt;
@@ -187,6 +192,7 @@ class MklPoolingFwdPrimitiveFactory : public MklPrimitiveFactory<T> {
     key_creator.AddAsKey(fwdParams.padding_left);
     key_creator.AddAsKey(fwdParams.padding_right);
     key_creator.AddAsKey<int>(static_cast<int>(fwdParams.alg_kind));
+    key_creator.AddAsKey<int>(static_cast<int>(fwdParams.prop_kind));
     return key_creator.GetKey();
   }
 
@@ -443,7 +449,12 @@ class MklPoolingOpBase : public OpKernel {
   explicit MklPoolingOpBase(OpKernelConstruction* context)
       : OpKernel(context), workspace_enabled_(false) {
     string data_format;
-    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    if (std::is_same<T, qint8>::value || std::is_same<T, quint8>::value) {
+      // current quantized convolution doesn't have data_format attribute.
+      data_format = "NHWC";
+    } else {
+      OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
+    }
     OP_REQUIRES(context, FormatFromString(data_format, &this->data_format_tf_),
                 errors::InvalidArgument("Invalid data format"));
     OP_REQUIRES_OK(context, context->GetAttr("ksize", &this->ksize_));
@@ -461,7 +472,7 @@ class MklPoolingOpBase : public OpKernel {
     bool is_pool2d = (this->ksize_.size() == 4);
     this->data_format_mkldnn_ =
         is_pool2d ? TFDataFormatToMklDnnDataFormat(this->data_format_tf_)
-                 : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_);
+                  : TFDataFormatToMklDnn3DDataFormat(this->data_format_tf_);
 
     // We may not get this attribute for this node if it does not go through
     // graph rewrite pass. So we do not check for error while retrieving this
@@ -655,10 +666,11 @@ class MklPoolingForwardOpBase : public MklPoolingOpBase<T> {
       OP_REQUIRES(context, input_tensor.dims() == 4 || input_tensor.dims() == 5,
                   errors::InvalidArgument("Input must be 4 or 5-dimensional"));
     } else {
-      OP_REQUIRES(context, input_mkl_shape.GetDimension() == 4 ||
-                               input_mkl_shape.GetDimension() == 5,
-                  errors::InvalidArgument("Input shape must be "
-                                          "4 or 5-dimensional"));
+      OP_REQUIRES(
+          context,
+          input_mkl_shape.GetDimension() == 4 ||
+              input_mkl_shape.GetDimension() == 5,
+          errors::InvalidArgument("Input shape must be 4 or 5-dimensional"));
     }
   }
   // .Input("value: T")
diff --git a/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc b/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7c1e32d6e35326273cfdd070ca8197e30b8ea7f9
--- /dev/null
+++ b/tensorflow/core/kernels/mkl_quantized_pooling_ops_test.cc
@@ -0,0 +1,201 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifdef INTEL_MKL
+#define EIGEN_USE_THREADS
+
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/framework/fake_input.h"
+#include "tensorflow/core/framework/node_def_builder.h"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_testutil.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/types.pb.h"
+#include "tensorflow/core/kernels/ops_testutil.h"
+#include "tensorflow/core/kernels/ops_util.h"
+#include "tensorflow/core/kernels/quantization_utils.h"
+#include "tensorflow/core/lib/core/status_test_util.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+
+// Helper class for converting MKL tensors to TF tensors and comparing to
+// expected values
+
+static const uint8 dummy_tensor[] = {0, 0, 0, 0, 0, 0, 0, 0};
+static const TensorShape dummy_shape({8});
+
+class ConvMklToTF : public OpsTestBase {
+ public:
+  template <typename T>
+  void ConvertMKL2TF(DataType dtype, const Tensor& first, const Tensor& second,
+                     Tensor& output) {
+    // Create an MKL to TF conversion node and execute it
+    TF_EXPECT_OK(NodeDefBuilder("mkl_to_tf_op", "_MklToTf")
+                     .Input(FakeInput(dtype))     // Input
+                     .Input(FakeInput(DT_UINT8))  // Mkl second tensor
+                     .Attr("T", dtype)
+                     .Attr("_kernel", "MklOp")
+                     .Finalize(node_def()));
+    TF_EXPECT_OK(InitOp());
+    AddInputFromArray<T>(first.shape(), first.flat<T>());
+    AddInputFromArray<uint8>(second.shape(), second.flat<uint8>());
+    TF_ASSERT_OK(RunOpKernel());
+
+    output = *GetOutput(0);
+  }
+  void TestBody(){};
+};
+
+class QuantizedPoolingTest : public OpsTestBase {};
+
+TEST_F(QuantizedPoolingTest, SmallAveragePooling) {
+  const int ksize = 2;
+  const int stride = 2;
+  TF_ASSERT_OK(NodeDefBuilder("quantized_avg_pool_op", "_MklQuantizedAvgPool")
+                   .Input(FakeInput(DT_QUINT8))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("ksize", {1, ksize, ksize, 1})
+                   .Attr("strides", {1, stride, stride, 1})
+                   .Attr("padding", "SAME")
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const float input_min = 0.0f;
+  const float input_max = 255.0f;
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+  Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+  test::FillValues<float>(
+      &input_float,
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  Tensor input_quantized =
+      FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+
+  const int expected_width = input_width / stride;
+  const int expected_height = input_height / stride;
+
+  // The input pools we are averaging. (NHWC input, quantized.)
+  //    0th channel       1st channel
+  //    1  3 |  5  7      2  4 |  6  8
+  //    9 11 | 13 15     10 12 | 14 16
+  //   -------------     -------------
+  //   17 19 | 21 23     18 20 | 22 24
+  //   25 27 | 29 31     26 28 | 30 32
+  Tensor expected_float(DT_FLOAT,
+                        {1, expected_height, expected_width, input_channels});
+  test::FillValues<float>(&expected_float, {6, 7, 10, 11, 22, 23, 26, 27});
+
+  AddInputFromArray<quint8>(input_quantized.shape(),
+                            input_quantized.flat<quint8>());
+  AddInputFromArray<float>(TensorShape({1}), {input_min});
+  AddInputFromArray<float>(TensorShape({1}), {input_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& mkl_shape_tensor = *GetOutput(3);
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMKL2TF<quint8>(DT_QUINT8, output, mkl_shape_tensor,
+                                  output_quantized);
+
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+
+TEST_F(QuantizedPoolingTest, SmallMaxPooling) {
+  const int ksize = 2;
+  const int stride = 2;
+  TF_ASSERT_OK(NodeDefBuilder("quantized_max_pool_op", "_MklQuantizedMaxPool")
+                   .Input(FakeInput(DT_QUINT8))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_FLOAT))
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Input(FakeInput(DT_UINT8))  // MKl second tensor
+                   .Attr("T", DataTypeToEnum<quint8>::v())
+                   .Attr("ksize", {1, ksize, ksize, 1})
+                   .Attr("strides", {1, stride, stride, 1})
+                   .Attr("padding", "SAME")
+                   .Attr("_kernel", "QuantizedMklOp")
+                   .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  const float input_min = 0.0f;
+  const float input_max = 255.0f;
+  const int input_height = 4;
+  const int input_width = 4;
+  const int input_channels = 2;
+  Tensor input_float(DT_FLOAT, {1, input_height, input_width, input_channels});
+  test::FillValues<float>(
+      &input_float,
+      {1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16,
+       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32});
+  Tensor input_quantized =
+      FloatTensorToQuantized<quint8>(input_float, input_min, input_max);
+  const int expected_width = input_width / stride;
+  const int expected_height = input_height / stride;
+
+  // The max is computed from these input pools. (NHWC input, quantized.)
+  //    0th channel       1st channel
+  //    1  3 |  5  7      2  4 |  6  8
+  //    9 11 | 13 15     10 12 | 14 16
+  //   -------------     -------------
+  //   17 19 | 21 23     18 20 | 22 24
+  //   25 27 | 29 31     26 28 | 30 32
+
+  Tensor expected_float(DT_FLOAT,
+                        {1, expected_height, expected_width, input_channels});
+  test::FillValues<float>(&expected_float, {11, 12, 15, 16, 27, 28, 31, 32});
+  AddInputFromArray<quint8>(input_quantized.shape(),
+                            input_quantized.flat<quint8>());
+  AddInputFromArray<float>(TensorShape({1}), {input_min});
+  AddInputFromArray<float>(TensorShape({1}), {input_max});
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+  AddInputFromArray<uint8>(dummy_shape, dummy_tensor);
+
+  TF_ASSERT_OK(RunOpKernel());
+
+  const Tensor& output = *GetOutput(0);
+  const Tensor& mkl_shape_tensor = *GetOutput(3);
+  ConvMklToTF conv_comp;
+  Tensor output_quantized;
+  conv_comp.ConvertMKL2TF<quint8>(DT_QUINT8, output, mkl_shape_tensor,
+                                  output_quantized);
+
+  const float output_min = GetOutput(1)->flat<float>()(0);
+  const float output_max = GetOutput(2)->flat<float>()(0);
+  Tensor output_float =
+      QuantizedTensorToFloat<quint8>(output_quantized, output_min, output_max);
+
+  test::ExpectTensorNear<float>(expected_float, output_float, 0.2);
+}
+}  // namespace tensorflow
+#endif
diff --git a/tensorflow/core/kernels/mkl_softmax_op.cc b/tensorflow/core/kernels/mkl_softmax_op.cc
index cfab529662fd334b29181c3c895556eaa7867854..094129ae3efe87e070f8a27c8584f67c927bbec3 100644
--- a/tensorflow/core/kernels/mkl_softmax_op.cc
+++ b/tensorflow/core/kernels/mkl_softmax_op.cc
@@ -56,7 +56,7 @@ class MklSoftmaxOp : public OpKernel {
       MklDnnShape src_mkl_shape;
       GetMklShape(context, src_idx, &src_mkl_shape);
 
-      // src_dims is the dimenstion of src_tensor
+      // src_dims is the dimension of src_tensor
       // dim of the dst will also be same as src_dims
       auto src_tf_shape = src_mkl_shape.IsMklTensor()
                               ? src_mkl_shape.GetTfShape()
@@ -64,12 +64,12 @@ class MklSoftmaxOp : public OpKernel {
       auto src_dims = TFShapeToMklDnnDims(src_tf_shape);
       auto output_dims = src_dims;
       memory::format layout_type;
-      // In MKL, data format passed to mkl softmax op depends on dimension of the input tensor.
-      // Here "x" data format in MKL is used for 1 dim tensor, "nc" for 2 dim tensor, 
-      // "tnc" for 3 dim tensor, "nchw" for 4 dim tensor, and "ncdhw" for 5 dim tensor.
-      // Each of the simbols has the following meaning:
-      // n = batch, c = channels, t = sequence lenght, h = height,
-      // w = width, d = depth 
+      // In MKL, data format passed to mkl softmax op depends on dimension of
+      // the input tensor. Here "x" data format in MKL is used for 1 dim tensor,
+      // "nc" for 2 dim tensor, "tnc" for 3 dim tensor, "nchw" for 4 dim tensor,
+      // and "ncdhw" for 5 dim tensor. Each of the simbols has the following
+      // meaning: n = batch, c = channels, t = sequence length, h = height, w =
+      // width, d = depth
       switch (input_dims) {
         case 1:
           layout_type = memory::format::x;
diff --git a/tensorflow/core/kernels/partitioned_function_ops.cc b/tensorflow/core/kernels/partitioned_function_ops.cc
index 71e506e5e6fd66cb8166f8f223e86cf0882fb1c4..ba51db219ec5528d1dd98f744e70c5cd2cf6c6f8 100644
--- a/tensorflow/core/kernels/partitioned_function_ops.cc
+++ b/tensorflow/core/kernels/partitioned_function_ops.cc
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/grappler/grappler_item.h"
 #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
 #include "tensorflow/core/grappler/utils/functions.h"
+#include "tensorflow/core/protobuf/config.pb.h"
 #include "tensorflow/core/protobuf/rewriter_config.pb.h"
 #include "tensorflow/core/util/ptr_util.h"
 #include "tensorflow/core/util/reffed_status_callback.h"
@@ -50,12 +51,29 @@ class PartitionedCallOp : public AsyncOpKernel {
  public:
   explicit PartitionedCallOp(OpKernelConstruction* ctx) : AsyncOpKernel(ctx) {
     OP_REQUIRES_OK(ctx, ctx->GetAttr("f", &func_));
-    string rewriter_config_serialized;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &rewriter_config_serialized));
+    string deprecated_config_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config", &deprecated_config_serialized));
+    string config_proto_serialized;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("config_proto", &config_proto_serialized));
     OP_REQUIRES(
-        ctx, rewriter_config_.ParseFromString(rewriter_config_serialized),
-        errors::InvalidArgument("Unable to parse rewriter_config string as "
-                                "tensorflow::RewriterConfig proto."));
+        ctx,
+        deprecated_config_serialized.empty() || config_proto_serialized.empty(),
+        errors::InvalidArgument("Provided both 'config' and 'config_proto' but "
+                                "only one should be provided.  Note the "
+                                "'config' option is deprecated."));
+    if (!deprecated_config_serialized.empty()) {
+      OP_REQUIRES(ctx,
+                  config_proto_.mutable_graph_options()
+                      ->mutable_rewrite_options()
+                      ->ParseFromString(deprecated_config_serialized),
+                  errors::InvalidArgument("Unable to parse config string as "
+                                          "tensorflow::RewriteOptions proto."));
+    } else {
+      OP_REQUIRES(
+          ctx, config_proto_.ParseFromString(config_proto_serialized),
+          errors::InvalidArgument("Unable to parse config_proto string as "
+                                  "tensorflow::ConfigProto proto."));
+    }
     OP_REQUIRES_OK(ctx, ctx->GetAttr("executor_type", &executor_type_));
   }
 
@@ -166,12 +184,6 @@ class PartitionedCallOp : public AsyncOpKernel {
             OptimizationPassRegistry::Global()->RunGrouping(
                 OptimizationPassRegistry::POST_PLACEMENT, optimization_options),
             done);
-        OP_REQUIRES_OK_ASYNC(
-            ctx,
-            OptimizationPassRegistry::Global()->RunGrouping(
-                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
-                optimization_options),
-            done);
 
         Device* cpu_device;
         OP_REQUIRES_OK_ASYNC(
@@ -184,6 +196,13 @@ class PartitionedCallOp : public AsyncOpKernel {
                                            device_set, cpu_device, &graph),
                              done);
 
+        OP_REQUIRES_OK_ASYNC(
+            ctx,
+            OptimizationPassRegistry::Global()->RunGrouping(
+                OptimizationPassRegistry::POST_REWRITE_FOR_EXEC,
+                optimization_options),
+            done);
+
         std::unordered_map<string, std::unique_ptr<Graph>> subgraphs;
         OP_REQUIRES_OK_ASYNC(
             ctx, PartitionHelper(device_set, std::move(graph), &subgraphs),
@@ -435,7 +454,7 @@ class PartitionedCallOp : public AsyncOpKernel {
         },
         rendez, std::move(done), std::placeholders::_1);
     auto* refcounted_done = new ReffedStatusCallback(std::move(callback));
-    for (int i = 1; i < handles->size(); ++i) {
+    for (int i = 0; i < handles->size(); ++i) {
       refcounted_done->Ref();
     }
 
@@ -489,6 +508,7 @@ class PartitionedCallOp : public AsyncOpKernel {
             });
       }
     }
+    refcounted_done->Unref();
   }
 
   string UniquifyFunctionName(const FunctionLibraryDefinition* function_library,
@@ -506,12 +526,18 @@ class PartitionedCallOp : public AsyncOpKernel {
                        FunctionLibraryDefinition* flib,
                        const DeviceSet& device_set, Device* cpu_device,
                        std::unique_ptr<Graph>* graph) {
-    if (!tensorflow::grappler::MetaOptimizerEnabled(rewriter_config_)) {
+    if (!tensorflow::grappler::MetaOptimizerEnabled(config_proto_)) {
       return Status::OK();
     }
 
     tensorflow::grappler::GrapplerItem item;
 
+    // Add all available devices so that inlined function can be placed.
+    for (const Device* d : device_set.devices()) {
+      Status added_device = item.AddDevice(d->name());
+      if (!added_device.ok()) VLOG(3) << added_device.error_message();
+    }
+
     // Add fetches so that the graph can be pruned.
     for (Node* node : ret_nodes) {
       item.fetch.push_back(node->name());
@@ -530,7 +556,7 @@ class PartitionedCallOp : public AsyncOpKernel {
     // TODO(nareshmodi): Consider adding and using the more generic GraphOptions
     // proto (which also contain the OptimizerOptions).
     TF_RETURN_IF_ERROR(tensorflow::grappler::RunMetaOptimizer(
-        item, rewriter_config_, cpu_device, &cluster, &out_graph));
+        item, config_proto_, cpu_device, &cluster, &out_graph));
 
     std::unique_ptr<Graph> optimized_graph(new Graph(OpRegistry::Global()));
     TF_RETURN_IF_ERROR(ConvertGraphDefToGraph(
@@ -562,7 +588,7 @@ class PartitionedCallOp : public AsyncOpKernel {
   }
 
   NameAttrList func_;
-  RewriterConfig rewriter_config_;
+  ConfigProto config_proto_;
   string executor_type_;
   // Contains maps from device names to handles of function partitions, keyed by
   // FunctionLibraryRuntime pointers. (Because this kernel may be instantiated
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.cc b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
index dadc15b69ee67b51be1647a1e8a6794e684bcff2..f13341e0afe2a605122c77f0d0833d8119ac28d9 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.cc
@@ -49,6 +49,21 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
                 errors::InvalidArgument("num_bits is out of range: ", num_bits_,
                                         " with signed_input_ ", signed_input_));
     OP_REQUIRES_OK(ctx, ctx->GetAttr("range_given", &range_given_));
+
+    string round_mode_string;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("round_mode", &round_mode_string));
+    OP_REQUIRES(
+        ctx,
+        (round_mode_string == "HALF_UP" || round_mode_string == "HALF_TO_EVEN"),
+        errors::InvalidArgument("Round mode string must be "
+                                "'HALF_UP' or "
+                                "'HALF_TO_EVEN', is '" +
+                                round_mode_string + "'"));
+    if (round_mode_string == "HALF_UP") {
+      round_mode_ = ROUND_HALF_UP;
+    } else if (round_mode_string == "HALF_TO_EVEN") {
+      round_mode_ = ROUND_HALF_TO_EVEN;
+    }
   }
 
   void Compute(OpKernelContext* ctx) override {
@@ -76,13 +91,15 @@ class QuantizeAndDequantizeV2Op : public OpKernel {
 
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_,
-      range_given_, &input_min_tensor, &input_max_tensor, output->flat<T>());
+      range_given_, &input_min_tensor, &input_max_tensor, round_mode_,
+      output->flat<T>());
   }
 
  private:
   bool signed_input_;
   int num_bits_;
   bool range_given_;
+  QuantizerRoundMode round_mode_;
 };
 
 // Simulate quantization precision loss in a float tensor by:
@@ -135,7 +152,8 @@ class QuantizeAndDequantizeV3Op : public OpKernel {
 
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
     f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_, num_bits_val,
-      range_given_, &input_min_tensor, &input_max_tensor, output->flat<T>());
+      range_given_, &input_min_tensor, &input_max_tensor, ROUND_HALF_TO_EVEN,
+      output->flat<T>());
   }
 
  private:
@@ -180,7 +198,7 @@ class QuantizeAndDequantizeOp : public OpKernel {
     functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> functor;
     functor(ctx->eigen_device<Device>(), input.flat<T>(), signed_input_,
             num_bits_, range_given_, &input_min_tensor, &input_max_tensor,
-            output->flat<T>());
+            ROUND_HALF_TO_EVEN, output->flat<T>());
   }
 
  private:
@@ -198,10 +216,11 @@ struct QuantizeAndDequantizeOneScaleFunctor<CPUDevice, T> {
   void operator()(const CPUDevice& d, typename TTypes<T>::ConstVec input,
                   const bool signed_input, const int num_bits,
                   const bool range_given, Tensor* input_min_tensor,
-                  Tensor* input_max_tensor, typename TTypes<T>::Vec out) {
+                  Tensor* input_max_tensor, QuantizerRoundMode round_mode,
+                  typename TTypes<T>::Vec out) {
     QuantizeAndDequantizeOneScaleImpl<CPUDevice, T>::Compute(
         d, input, signed_input, num_bits, range_given, input_min_tensor,
-        input_max_tensor, out);
+        input_max_tensor, round_mode, out);
   }
 };
 }  // namespace functor
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op.h b/tensorflow/core/kernels/quantize_and_dequantize_op.h
index 6b0c5e5a466baf60a771d7aa7754975a0c121138..a495e8b71fec285f8649979553bcae7d400cd3d3 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op.h
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op.h
@@ -22,6 +22,20 @@ limitations under the License.
 #include "tensorflow/core/kernels/cwise_ops.h"
 
 namespace tensorflow {
+
+enum QuantizerRoundMode {
+  // Round half up: if the fraction of y is exactly 0.5, then
+  // round(y) = y + 0.5
+  // E.g., -5.5 gets rounded to -5, -5.4 goes to -5,
+  // 5.4 goes to 5, and 5.5 goes to 6.
+  ROUND_HALF_UP,
+  // Round half to even: if the fraction of y is exactly 0.5, then round(y) is
+  // the nearest even integer to y.
+  // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5 becomes
+  // -24, and -24.5 gets rounded to 24.
+  ROUND_HALF_TO_EVEN,
+};
+
 namespace functor {
 
 // TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.
@@ -31,15 +45,69 @@ struct QuantizeAndDequantizeOneScaleFunctor {
   void operator()(const Device& d, typename TTypes<T>::ConstVec input,
                   bool signed_input, int num_bits, bool range_given,
                   Tensor* input_min_tensor, Tensor* input_max_tensor,
-                  typename TTypes<T>::Vec out);
+                  QuantizerRoundMode round_mode, typename TTypes<T>::Vec out);
 };
 
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T, typename Func>
+void ClampScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input,
+                        T min_range, T max_range, T scale, T inverse_scale,
+                        Func round_func, typename TTypes<T>::Vec out) {
+  out.device(d) = (input.cwiseMin(max_range).cwiseMax(min_range) * scale)
+                      .unaryExpr(round_func) *
+                  inverse_scale;
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T>
+void ClampScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input,
+                        T min_range, T max_range, T scale, T inverse_scale,
+                        QuantizerRoundMode round_mode,
+                        typename TTypes<T>::Vec out) {
+  switch (round_mode) {
+    case ROUND_HALF_TO_EVEN:
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         Eigen::internal::scalar_round_op_google<T>(), out);
+      break;
+    case ROUND_HALF_UP:
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         Eigen::internal::scalar_round_up_op<T>(), out);
+      break;
+  }
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T, typename Func>
+void ScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input, T scale,
+                   T inverse_scale, Func round_func,
+                   typename TTypes<T>::Vec out) {
+  out.device(d) = (input * scale).unaryExpr(round_func) * inverse_scale;
+}
+
+// The implementation below runs on both CPU and GPU.
+template <typename Device, typename T>
+void ScaleAndRound(const Device& d, typename TTypes<T>::ConstVec input, T scale,
+                   T inverse_scale, QuantizerRoundMode round_mode,
+                   typename TTypes<T>::Vec out) {
+  switch (round_mode) {
+    case ROUND_HALF_TO_EVEN:
+      ScaleAndRound(d, input, scale, inverse_scale,
+                    Eigen::internal::scalar_round_op_google<T>(), out);
+      break;
+    case ROUND_HALF_UP:
+      ScaleAndRound(d, input, scale, inverse_scale,
+                    Eigen::internal::scalar_round_up_op<T>(), out);
+      break;
+  }
+}
+
 // The implementation below runs on both CPU and GPU.
 template <typename Device, typename T>
 struct QuantizeAndDequantizeOneScaleImpl {
   static void Compute(const Device& d, typename TTypes<T>::ConstVec input,
                       bool signed_input, int num_bits, bool range_given,
                       Tensor* input_min_tensor, Tensor* input_max_tensor,
+                      QuantizerRoundMode round_mode,
                       typename TTypes<T>::Vec out) {
     T min_range;
     T max_range;
@@ -89,15 +157,10 @@ struct QuantizeAndDequantizeOneScaleImpl {
       // The semantics of the op does not guarantee to clamp to the specified
       // min_range and max_range - because we may have changed either min_range
       // or max_range.
-      out.device(d) =
-          (input.cwiseMin(max_range).cwiseMax(min_range) * scale)
-              .unaryExpr(Eigen::internal::scalar_round_op_google<T>()) *
-          inverse_scale;
+      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
+                         round_mode, out);
     } else {
-      out.device(d) =
-          (input * scale)
-              .unaryExpr(Eigen::internal::scalar_round_op_google<T>()) *
-          inverse_scale;
+      ScaleAndRound(d, input, scale, inverse_scale, round_mode, out);
     }
   }
 };
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
index 61c79cf6959ce5a20ce1a4ddd1bd4fae103a15d6..5745e418f3614b7ff3a786c4e6ac4c0f40308a25 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_gpu.cu.cc
@@ -32,10 +32,10 @@ struct QuantizeAndDequantizeOneScaleFunctor<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::ConstVec input,
                   bool signed_input, int num_bits, bool range_given,
                   Tensor* input_min_tensor, Tensor* input_max_tensor,
-                  typename TTypes<T>::Vec out) {
+                  QuantizerRoundMode round_mode, typename TTypes<T>::Vec out) {
     QuantizeAndDequantizeOneScaleImpl<GPUDevice, T>::Compute(
         d, input, signed_input, num_bits, range_given, input_min_tensor,
-        input_max_tensor, out);
+        input_max_tensor, round_mode, out);
   }
 };
 }  // end namespace functor
diff --git a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
index cddabf8a99aca4a17de78c0ed8e7888e6959be6e..b9e015c96b5cd1edc2c349f1a38fdd074124230e 100644
--- a/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
+++ b/tensorflow/core/kernels/quantize_and_dequantize_op_test.cc
@@ -101,17 +101,51 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8) {
           .Attr("range_given", false)
           .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
-  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({7}),
+                           {-1, -0.5, 0, 0.3, 0.8, 0.555, 0.50390625});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
-  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71}.
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71, 64}.
   // Scale is: 1/127
-  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128}
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128, 0.5}
   TF_ASSERT_OK(RunOpKernel());
-  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
-  test::FillValues<float>(&expected,
-                          {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128});
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({7}));
+  test::FillValues<float>(
+      &expected, {-1, -0.5, 0, 38.0 / 128, 102.0 / 128, 71.0 / 128, 0.5});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
+// Convert a 1D tensor with signed 8 bits and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int8_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 8)
+          .Attr("range_given", false)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({7}),
+                           {-1, -0.5, 0, 0.3, 0.8, 0.555, 0.50390625});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+
+  // With int8, the tensor is quantized to {-128, -64, 0, 38, 102, 71, 65}.
+  // Scale is: 1/127
+  // Then it is dequantized to {-1, -0.5, 0, 38.0/128, 102.0/128, 71.0/128,
+  // 65.0 /128}
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({7}));
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 38.0 / 128, 102.0 / 128,
+                                      71.0 / 128, 65.0 / 128});
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 
   // Ensure that the inputs haven't been changed.
@@ -162,7 +196,7 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
           .Attr("range_given", false)
           .Finalize(node_def()));
   TF_ASSERT_OK(InitOp());
-  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3125, 0.8, 0.555});
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
   AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
 
@@ -178,6 +212,35 @@ TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4) {
   EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
 }
 
+// Convert a 1D tensor with signed 4 bits and round_mode hafl_up.
+TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 4)
+          .Attr("range_given", false)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({6}), {-1, -0.5, 0, 0.3125, 0.8, 0.555});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Max
+
+  // With int4, the tensor is quantized to {-8, -4, 0, 3, 6, 4}.
+  // Scale is: 1/8
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({6}));
+  test::FillValues<float>(&expected, {-1, -0.5, 0, 0.375, 0.75, 0.5});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+
+  // Ensure that the inputs haven't been changed.
+  EXPECT_EQ(inputs_[1]->scalar<float>()(), 0.0);
+  EXPECT_EQ(inputs_[2]->scalar<float>()(), 0.0);
+}
+
 // Convert a 1D tensor with signed 4 bits.
 TEST_F(QuantizeAndDequantizeTest, Convert_1D_tensor_with_int4_V3) {
   TF_ASSERT_OK(
@@ -237,6 +300,38 @@ TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 2D tensor with signed 8 bits, given range and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest,
+       Convert_2D_tensor_with_int8_range_given_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", true)
+          .Attr("num_bits", 8)
+          .Attr("range_given", true)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  // Note that the last two values are saturated.
+  AddInputFromArray<float>(TensorShape({2, 4}),
+                           {-0.8, -0.5, 0, 0.3, 0.8, 0.555, -2, 33});
+  AddInputFromArray<float>(TensorShape({}), {-1.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});   // Max
+
+  // Note that the range is given as [-1, 1].
+  // With int8, the tensor is quantized to {-102, -63, 0, 38, 102, 70, -128,
+  // 127}.
+  // Scale is: 1/127
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 4}));
+  test::FillValues<float>(
+      &expected, {-102.0 / 127, -63.0 / 127, 0, 38.0 / 127, 102.0 / 127,
+                  70.0 / 127, -128.0 / 127, 1});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a 2D tensor with signed 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_2D_tensor_with_int8_range_given_V3) {
   TF_ASSERT_OK(
@@ -293,6 +388,33 @@ TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given) {
   test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
 }
 
+// Convert a 4D tensor with unsigned 8 bits, given range and round_mode half_up.
+TEST_F(QuantizeAndDequantizeTest,
+       Convert_4D_tensor_with_uint8_range_given_round_half_up) {
+  TF_ASSERT_OK(
+      NodeDefBuilder("quantize_and_dequantize_op", "QuantizeAndDequantizeV2")
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Input(FakeInput(DT_FLOAT))
+          .Attr("signed_input", false)
+          .Attr("num_bits", 8)
+          .Attr("range_given", true)
+          .Attr("round_mode", "HALF_UP")
+          .Finalize(node_def()));
+  TF_ASSERT_OK(InitOp());
+  AddInputFromArray<float>(TensorShape({2, 2, 1, 1}), {-0.5, 0, 0.3, 0.8});
+  AddInputFromArray<float>(TensorShape({}), {0.0});  // Min
+  AddInputFromArray<float>(TensorShape({}), {1.0});  // Max
+
+  // Note that the range is given as [0, 1].
+  // With int8, the tensor is quantized to {0, 0, 77, 204}
+  // Scale is: 1/255
+  TF_ASSERT_OK(RunOpKernel());
+  Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 2, 1, 1}));
+  test::FillValues<float>(&expected, {0, 0, 77.0 / 255, 204.0 / 255});
+  test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5);
+}
+
 // Convert a 4D tensor with unsigned 8 bits with given range.
 TEST_F(QuantizeAndDequantizeTest, Convert_4D_tensor_with_uint8_range_given_V3) {
   TF_ASSERT_OK(
diff --git a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
index e6133415d0f5c143acad25ee6e681820e956cca8..6fc489459231695a685346e3f728dd0a1e2202f2 100644
--- a/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
+++ b/tensorflow/core/kernels/quantized_resize_bilinear_op_test.cc
@@ -273,7 +273,7 @@ void TestResizeBilinearOneDim() {
         << expected_val << ", " << resized_image_val;
   }
 
-  // Value testing with reference implemenatation
+  // Value testing with reference implementation
   CheckTensorValue<qint32>(image_quantized_tensor.flat<qint32>().data(),
                            outputs.at(0).flat<qint32>().data(),
                            /*batch_size=*/1,
diff --git a/tensorflow/core/kernels/ragged_gather_op.cc b/tensorflow/core/kernels/ragged_gather_op.cc
index b2a342f63783a72369e63d77c2ba9fde407a3511..903a97a9601a9e8613c3189ef61ed9965c82d3d5 100644
--- a/tensorflow/core/kernels/ragged_gather_op.cc
+++ b/tensorflow/core/kernels/ragged_gather_op.cc
@@ -236,8 +236,10 @@ class RaggedGatherOpBase : public OpKernel {
     values_shape.set_dim(0, num_values);
     TF_RETURN_IF_ERROR(
         context->allocate_output(values_index, values_shape, &values_out));
-    int64 value_size = params_dense_values_in.NumElements() /
-                       params_dense_values_in.dim_size(0);
+    const int64 num_elements = params_dense_values_in.NumElements();
+    const int64 value_size =
+        num_elements == 0 ? 0
+                          : (num_elements / params_dense_values_in.dim_size(0));
     CallWriteValueSlices(params_dense_values_in, value_slices, value_size,
                          values_out);
     return ::tensorflow::Status::OK();
diff --git a/tensorflow/core/kernels/scan_ops_gpu.cu.cc b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
index ed6c6affce54a7e847ede07b329d31411b713bec..ed66c02dc584541ce4d5eb644630b678c1b05916 100644
--- a/tensorflow/core/kernels/scan_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/scan_ops_gpu.cu.cc
@@ -1,4 +1,4 @@
-/* Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -17,8 +17,20 @@ limitations under the License.
 
 #define EIGEN_USE_GPU
 
+#if CUDA_VERSION >= 9000
+#define CUB_USE_COOPERATIVE_GROUPS
+#endif  // CUDA_VERSION >= 9000
+
+#include "third_party/cub/block/block_load.cuh"
+#include "third_party/cub/block/block_scan.cuh"
+#include "third_party/cub/block/block_store.cuh"
+#include "third_party/cub/iterator/counting_input_iterator.cuh"
+#include "third_party/cub/iterator/transform_input_iterator.cuh"
+#include "cuda/include/cuComplex.h"
 #include "tensorflow/core/framework/numeric_types.h"
 #include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/util/permutation_input_iterator.h"
+#include "tensorflow/core/util/permutation_output_iterator.h"
 
 #include "tensorflow/core/kernels/scan_ops.h"
 
@@ -27,6 +39,258 @@ namespace tensorflow {
 typedef Eigen::GpuDevice GPUDevice;
 typedef Eigen::Index Index;
 
+namespace functor {
+
+// Map a contiguous range to the actual memory locations depending on which
+// axis the scan is taking place over and whether or not reversed.
+struct MapIndexToLocation {
+  __host__ __device__ MapIndexToLocation(int dimx, int dimy, int dimz,
+                                         bool reverse = false)
+      : dimx_(dimx), dimy_(dimy), dimz_(dimz), reverse_(reverse) {}
+
+  __host__ __device__ int operator()(int id) const {
+    if (dimx_ == 1) {
+      int row = id % dimy_;
+      int col = id / dimy_;
+
+      if (reverse_) return (dimy_ - row - 1) * dimz_ + col;
+
+      return row * dimz_ + col;
+    } else if (dimz_ == 1) {
+      if (reverse_) {
+        int row = id / dimy_;
+        int col = id % dimy_;
+        return row * dimy_ + (dimy_ - col - 1);
+      }
+      return id;
+    } else {
+      int col = id % dimy_;
+      int tmp = id / dimy_;
+
+      int row1 = id / (dimy_ * dimz_);
+      int col1 = tmp % dimz_;
+
+      if (reverse_)
+        return row1 * dimy_ * dimz_ + (dimy_ - col - 1) * dimz_ + col1;
+
+      return row1 * dimy_ * dimz_ + col * dimz_ + col1;
+    }
+  }
+
+  int dimx_;
+  int dimy_;
+  int dimz_;
+  bool reverse_;
+};
+
+template <typename T, typename Op>
+struct BlockPrefixCallbackOp {
+  // Running prefix
+  T running_total_;
+  Op op_;
+
+  __device__ BlockPrefixCallbackOp(T running_total, Op op)
+      : running_total_(running_total), op_(op) {}
+
+  // Callback operator to be entered by the first warp of threads in the block.
+  // tid 0 is responsible for returning a value for seeding the block-wide scan.
+  __device__ T operator()(T block_aggregate) {
+    T old_prefix = running_total_;
+    running_total_ = op_(old_prefix, block_aggregate);
+    return old_prefix;
+  }
+};
+
+template <typename T>
+struct Sum {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a + b;
+  }
+};
+
+template <typename T>
+struct Prod {
+  __host__ __device__ T operator()(const T& a, const T& b) const {
+    return a * b;
+  }
+};
+
+template <typename T, typename Op>
+struct IsSum {
+  constexpr static bool value =
+      (std::is_same<Op, Sum<T>>::value ||
+       std::is_same<Op, Eigen::internal::SumReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IsProd {
+  constexpr static bool value =
+      (std::is_same<Op, Prod<T>>::value ||
+       std::is_same<Op, Eigen::internal::ProdReducer<T>>::value);
+};
+
+template <typename T, typename Op>
+struct IdentityValue {
+  static_assert(IsSum<T, Op>::value || IsProd<T, Op>::value,
+                "IdentityValue not yet defined for this type.");
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsSum<U, OpCopy>::value, U>::type t = U(0)) {
+    return t;
+  }
+
+  template <typename U = T, typename OpCopy = Op>
+  __host__ __device__ U operator()(
+      typename std::enable_if<IsProd<U, OpCopy>::value, U>::type t = U(1)) {
+    return t;
+  }
+};
+
+// Each block is mapped to one sequence.  A contiguous range is mapped to the
+// appropriate locations in memory by the permutation iterators.  This is
+// ideal for 1-D and row based scans.  Column scans would be better if they
+// did a block load and then locally transposed.  CUB's device wide scan is not
+// used in the large 1D case, even though it would be more efficient, because
+// it is not deterministic.
+template <typename T, typename Op, int BlockDim = 128, int ItemsPerThread = 4>
+__global__ void scan_kernel(const T* in, T* out, int dimx, int dimy, int dimz,
+                            bool exclusive, bool reverse, Op op) {
+  typedef cub::BlockLoad<T, BlockDim, ItemsPerThread, cub::BLOCK_LOAD_TRANSPOSE>
+      BlockLoad;
+  typedef cub::BlockStore<T, BlockDim, ItemsPerThread,
+                          cub::BLOCK_STORE_TRANSPOSE>
+      BlockStore;
+  typedef cub::BlockScan<T, BlockDim> BlockScan;
+
+  // Allocate aliased shared memory for BlockLoad, BlockStore, and BlockScan
+  __shared__ union {
+    typename BlockLoad::TempStorage load;
+    typename BlockScan::TempStorage scan;
+    typename BlockStore::TempStorage store;
+  } temp_storage;
+
+  int problem_length = dimy;
+
+  // Initialize running total
+  BlockPrefixCallbackOp<T, Op> prefix_op(IdentityValue<T, Op>()(), op);
+
+  MapIndexToLocation map_op(dimx, dimy, dimz, reverse);
+  int block_start = problem_length * blockIdx.x;
+  // Have the block iterate over segments of items
+  for (int block_offset = block_start;
+       block_offset < block_start + problem_length;
+       block_offset += BlockDim * ItemsPerThread) {
+    int valid_items = min(BlockDim * ItemsPerThread,
+                          problem_length - (block_offset % problem_length));
+
+    // first construct a counting iterator that has the desired start point
+    typedef cub::TransformInputIterator<int, MapIndexToLocation,
+                                        cub::CountingInputIterator<int>>
+        MapIterType;
+
+    cub::CountingInputIterator<int> counting_iter(block_offset);
+
+    // Next map the iterator to the actual locations in memory
+    MapIterType map_iter(counting_iter, map_op);
+
+    PermutationInputIterator<T, const T*, MapIterType> permutein_iter(in,
+                                                                      map_iter);
+    PermutationOutputIterator<T, T*, MapIterType> permuteout_iter(out,
+                                                                  map_iter);
+
+    // Load a segment of consecutive items that are blocked across threads
+    T thread_data[ItemsPerThread];
+    BlockLoad(temp_storage.load).Load(permutein_iter, thread_data, valid_items);
+    __syncthreads();
+
+    // Collectively compute the block-wide scan
+    if (exclusive) {
+      BlockScan(temp_storage.scan)
+          .ExclusiveScan(thread_data, thread_data, op, prefix_op);
+    } else {
+      BlockScan(temp_storage.scan)
+          .InclusiveScan(thread_data, thread_data, op, prefix_op);
+    }
+    __syncthreads();
+
+    // Store scanned items to output segment
+    BlockStore(temp_storage.store)
+        .Store(permuteout_iter, thread_data, valid_items);
+    __syncthreads();
+  }
+}
+
+template <typename T, typename Op>
+void LaunchScan(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                typename TTypes<T, 3>::Tensor out, Op op, const bool reverse,
+                const bool exclusive) {
+  const int items_per_thread = 4;
+
+  int dimx = in.dimension(0);
+  int dimy = in.dimension(1);
+  int dimz = in.dimension(2);
+  int num_blocks = dimx * dimz;
+
+  int ideal_block_size = dimy / items_per_thread;
+
+  // There seems to be a bug when the type is not float and block_size 1024.
+  // Launch on the smallest power of 2 block size that we can.
+  if (ideal_block_size >= 1024 && std::is_same<T, float>::value) {
+    const int block_size = 1024;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 512) {
+    const int block_size = 512;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 256) {
+    const int block_size = 256;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 128) {
+    const int block_size = 128;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else if (ideal_block_size >= 64) {
+    const int block_size = 64;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  } else {
+    const int block_size = 32;
+    scan_kernel<T, Op, block_size, items_per_thread>
+        <<<num_blocks, block_size, 0, d.stream()>>>(
+            in.data(), out.data(), dimx, dimy, dimz, exclusive, reverse, op);
+  }
+}
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::SumReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::SumReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Sum<T>>(d, in, out, Sum<T>(), reverse, exclusive);
+  }
+};
+
+template <typename T>
+struct Scan<GPUDevice, Eigen::internal::ProdReducer<T>, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T, 3>::ConstTensor in,
+                  typename TTypes<T, 3>::Tensor out,
+                  const Eigen::internal::ProdReducer<T>& reducer,
+                  const bool reverse, const bool exclusive) {
+    LaunchScan<T, Prod<T>>(d, in, out, Prod<T>(), reverse, exclusive);
+  }
+};
+
+}  // namespace functor
+
 #define DEFINE(REDUCER, T) template struct functor::Scan<GPUDevice, REDUCER, T>;
 
 #define DEFINE_FOR_ALL_REDUCERS(T)           \
diff --git a/tensorflow/core/kernels/scan_ops_test.cc b/tensorflow/core/kernels/scan_ops_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..588b606a99b73588112aec1ca66cabf8d82dc38e
--- /dev/null
+++ b/tensorflow/core/kernels/scan_ops_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/platform/test.h"
+#include "tensorflow/core/platform/test_benchmark.h"
+
+namespace tensorflow {
+
+template <typename T>
+static Graph* LargeOneDCumsum(int num_x, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DataTypeToEnum<T>::value, TensorShape({num_x}));
+  data.flat<T>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ColCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 0;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* RowCumsum(int num_x, int num_y, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({num_x, num_y}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+static Graph* ThreeDYCumsum(int num_y, int num_z, bool reverse = false) {
+  auto* g = new Graph(OpRegistry::Global());
+  Tensor data(DT_FLOAT, TensorShape({32, num_y, num_z}));
+  data.flat<float>().setRandom();
+  Tensor axes(DT_INT32, TensorShape({}));
+  axes.flat<int32>()(0) = 1;
+  test::graph::Cumsum(g, test::graph::Constant(g, data),
+                      test::graph::Constant(g, axes));
+  return g;
+}
+
+template <typename T>
+static void LargeOneDimensional(int iters, const string& device, int num_x,
+                                bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * sizeof(T));
+  test::Benchmark(device, LargeOneDCumsum<T>(num_x, reverse)).Run(iters);
+}
+
+static void DoRowCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, RowCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void DoColCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ColCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void Do3DYCumsum(int iters, const string& device, int num_x, int num_y,
+                        bool reverse = false) {
+  testing::ItemsProcessed(static_cast<int64>(iters) * num_x * num_y);
+  testing::BytesProcessed(static_cast<int64>(iters) * num_x * num_y *
+                          sizeof(float));
+  test::Benchmark(device, ThreeDYCumsum(num_x, num_y, reverse)).Run(iters);
+}
+
+static void BM_OneDCumsumGPU(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPU)->Range(1, 1 << 21);
+
+static void BM_OneDCumsumGPUHalf(int iters, int num_x) {
+  LargeOneDimensional<Eigen::half>(iters, "gpu", num_x);
+}
+BENCHMARK(BM_OneDCumsumGPUHalf)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU)->RangePair(64, 4096, 64, 4096);
+
+static void BM_OneDCumsumGPU_reverse(int iters, int num_x) {
+  LargeOneDimensional<float>(iters, "gpu", num_x, true);
+}
+BENCHMARK(BM_OneDCumsumGPU_reverse)->Range(1, 1 << 21);
+
+static void BM_Sum2DRowCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoRowCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DRowCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum2DColumnCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  DoColCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum2DColumnCumsumGPU_reverse)->RangePair(1, 8192, 1, 8192);
+
+static void BM_Sum3DYCumsumGPU_reverse(int iters, int num_x, int num_y) {
+  Do3DYCumsum(iters, "gpu", num_x, num_y, true);
+}
+BENCHMARK(BM_Sum3DYCumsumGPU_reverse)->RangePair(32, 2048, 32, 2048);
+
+}  // end namespace tensorflow
diff --git a/tensorflow/core/kernels/scatter_nd_op.cc b/tensorflow/core/kernels/scatter_nd_op.cc
index fd54c6d6d7528c6e7858b492ccbe414d569e4fe0..63bb793fdcb7eb20daeee1708cb4ba78274cb9f7 100644
--- a/tensorflow/core/kernels/scatter_nd_op.cc
+++ b/tensorflow/core/kernels/scatter_nd_op.cc
@@ -29,6 +29,7 @@ limitations under the License.
 #include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/dense_update_functor.h"
 #include "tensorflow/core/kernels/fill_functor.h"
+#include "tensorflow/core/kernels/inplace_ops_functor.h"
 #include "tensorflow/core/kernels/training_op_helpers.h"
 #include "tensorflow/core/kernels/variable_ops.h"
 #include "tensorflow/core/lib/strings/str_util.h"
@@ -121,6 +122,90 @@ class ScatterNdOp : public OpKernel {
   }
 };
 
+template <typename Device, typename T, typename Index,
+          scatter_nd_op::UpdateOp op>
+class TensorScatterOp : public OpKernel {
+ public:
+  explicit TensorScatterOp(OpKernelConstruction* c) : OpKernel(c) {
+    const DataType dt = DataTypeToEnum<T>::v();
+    const DataType index_t = DataTypeToEnum<Index>::v();
+    OP_REQUIRES_OK(c, c->MatchSignature({dt, index_t, dt}, {dt}));
+  }
+
+  void Compute(OpKernelContext* c) override {
+    const Tensor& input = c->input(0);
+    const Tensor& indices = c->input(1);
+    const Tensor& updates = c->input(2);
+
+    OP_REQUIRES(c, indices.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Indices shape must have rank at least one. Found:",
+                    indices.shape().DebugString()));
+    OP_REQUIRES(c, updates.shape().dims() >= 1,
+                errors::InvalidArgument(
+                    "Updates shape must have rank at least one. Found:",
+                    updates.shape().DebugString()));
+
+    TensorShape shape = input.shape();
+
+    OP_REQUIRES(
+        c,
+        (shape.num_elements() > 0 || (indices.shape().num_elements() == 0 &&
+                                      updates.shape().num_elements() == 0)),
+        errors::InvalidArgument(
+            "Indices and updates specified for empty output shape"));
+
+    const int64 outer_dims = indices.shape().dims() - 1;
+
+    for (int i = 0; i < outer_dims; ++i) {
+      OP_REQUIRES(c, indices.shape().dim_size(i) == updates.shape().dim_size(i),
+                  errors::InvalidArgument(
+                      "Outer dimensions of indices and update must match. "
+                      "Indices shape: ",
+                      indices.shape().DebugString(),
+                      ", updates shape:", updates.shape().DebugString()));
+    }
+
+    const int64 ix = indices.shape().dim_size(outer_dims);
+    OP_REQUIRES(
+        c, updates.shape().dims() - outer_dims == shape.dims() - ix,
+        errors::InvalidArgument("Inner dimensions of output shape must match "
+                                "inner dimensions of updates shape. Output: ",
+                                shape.DebugString(),
+                                " updates: ", updates.shape().DebugString()));
+    for (int i = 0; i + outer_dims < updates.shape().dims(); ++i) {
+      OP_REQUIRES(
+          c, updates.shape().dim_size(i + outer_dims) == shape.dim_size(ix + i),
+          errors::InvalidArgument(
+              "The inner ", shape.dims() - ix,
+              " dimensions of output.shape=", shape.DebugString(),
+              " must match the inner ", updates.shape().dims() - outer_dims,
+              " dimensions of updates.shape=", updates.shape().DebugString()));
+    }
+
+    std::unique_ptr<Tensor> forwarded_input = c->forward_input(
+        2, 0, input.dtype(), shape, DEVICE_MEMORY, AllocatorAttributes());
+
+    if (forwarded_input == nullptr) {
+      // We were not able to forward the input, so we deep copy the tensor and
+      // set the output.
+      Tensor* out;
+      OP_REQUIRES_OK(c, c->allocate_output(0, input.shape(), &out));
+
+      OP_REQUIRES_OK(c, tensorflow::functor::DoCopy(c->eigen_device<Device>(),
+                                                    input, out));
+      OP_REQUIRES_OK(c,
+                     functor::DoScatterNd<Device, T, Index, op>(
+                         c, indices, updates, shape, out, false /*allocate*/));
+    } else {
+      // Output forwarded, so simply perform the scatter.
+      OP_REQUIRES_OK(c, functor::DoScatterNd<Device, T, Index, op>(
+                            c, indices, updates, shape, forwarded_input.get(),
+                            false /*allocate*/));
+    }
+  }
+};
+
 template <typename Device, typename T, typename Index,
           scatter_nd_op::UpdateOp op>
 class ScatterNdUpdateOp : public OpKernel {
@@ -282,6 +367,56 @@ TF_CALL_bool(REGISTER_SCATTER_ND_ADD_SUB_CPU);
 TF_CALL_bool(REGISTER_SCATTER_ND_UPDATE_CPU);
 TF_CALL_bool(REGISTER_SCATTER_ND_CPU);
 
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, index_type, \
+                                                          dev)              \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterUpdate")                       \
+                              .Device(DEVICE_##dev)                         \
+                              .TypeConstraint<type>("T")                    \
+                              .TypeConstraint<index_type>("Tindices"),      \
+                          TensorScatterOp<dev##Device, type, index_type,    \
+                                          scatter_nd_op::UpdateOp::ASSIGN>)
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, index_type, dev) \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterAdd")                            \
+                              .Device(DEVICE_##dev)                           \
+                              .TypeConstraint<type>("T")                      \
+                              .TypeConstraint<index_type>("Tindices"),        \
+                          TensorScatterOp<dev##Device, type, index_type,      \
+                                          scatter_nd_op::UpdateOp::ADD>)
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, index_type, dev) \
+  REGISTER_KERNEL_BUILDER(Name("TensorScatterSub")                            \
+                              .Device(DEVICE_##dev)                           \
+                              .TypeConstraint<type>("T")                      \
+                              .TypeConstraint<index_type>("Tindices"),        \
+                          TensorScatterOp<dev##Device, type, index_type,      \
+                                          scatter_nd_op::UpdateOp::SUB>)
+
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_CPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int32, CPU); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int64, CPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_CPU(type)   \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU(type); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_CPU(type);    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_CPU(type);
+
+// Register TensorScatterUpdate/Add/Sub for all number types.
+TF_CALL_NUMBER_TYPES(REGISTER_SCATTER_ND_TENSOR_CPU);
+// Register only TensorScatterUpdate for string/bool types as well.
+TF_CALL_string(REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU);
+TF_CALL_bool(REGISTER_SCATTER_ND_TENSOR_UPDATE_CPU);
+
+#undef REGISTER_SCATTER_ND_TENSOR_CPU
+
 // Registers GPU kernels.
 #if GOOGLE_CUDA
 
@@ -319,6 +454,25 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_UPDATE_SYCL
 #endif  // TENSORFLOW_USE_SYCL
 
+#define REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_ADD_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_SUB_GPU(type)                    \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int32, GPU); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE(type, int64, GPU);
+
+#define REGISTER_SCATTER_ND_TENSOR_GPU(type)   \
+  REGISTER_SCATTER_ND_TENSOR_ADD_GPU(type);    \
+  REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU(type); \
+  REGISTER_SCATTER_ND_TENSOR_SUB_GPU(type);
+
+TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_TENSOR_GPU);
+
 #undef REGISTER_SCATTER_ND_ADD
 #undef REGISTER_SCATTER_ND_ADD_SUB
 #undef REGISTER_SCATTER_ND_ADD_SUB_CPU
@@ -328,6 +482,16 @@ TF_CALL_GPU_NUMBER_TYPES_NO_HALF(REGISTER_SCATTER_ND_UPDATE_SYCL);
 #undef REGISTER_SCATTER_ND_UPDATE_GPU
 #undef REGISTER_SCATTER_ND_KERNEL
 #undef REGISTER_SCATTER_ND_KERNEL_INDEX
+#undef REGISTER_SCATTER_ND_TENSOR_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_CPU
+#undef REGISTER_SCATTER_ND_TENSOR_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_UPDATE_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_ADD_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_SUB_TYPE_INDEX_TYPE
+#undef REGISTER_SCATTER_ND_TENSOR_UPDATE_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_ADD_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_SUB_GPU
+#undef REGISTER_SCATTER_ND_TENSOR_GPU
 
 #endif  // GOOGLE_CUDA
 
diff --git a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
index ac48202ada2204ea36478257630f20f7892be50b..a4e89f439ed9f5711253924ad120f7a6751e1728 100644
--- a/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
+++ b/tensorflow/core/kernels/sparse_dense_binary_op_shared.cc
@@ -88,12 +88,12 @@ class SparseDenseBinaryOpShared : public OpKernel {
     const auto rhs_dims = BCast::FromShape(dense_t->shape());
     BCast b(lhs_dims, rhs_dims, false);  // false for keeping the same num dims.
 
-    // True iff (size(lhs) > size(rhs)), or (sizes equal, lhs cwise rhs).
+    // True iff (size(lhs) >= size(rhs)) and all dims in lhs is greater or equal
+    // to dims in rhs (from right to left).
     auto VecGreaterEq = [](ArraySlice<int64> lhs, ArraySlice<int64> rhs) {
-      if (lhs.size() > rhs.size()) return true;
       if (lhs.size() < rhs.size()) return false;
-      for (size_t i = 0; i < lhs.size(); ++i) {
-        if (lhs[i] < rhs[i]) return false;
+      for (size_t i = 0; i < rhs.size(); ++i) {
+        if (lhs[lhs.size() - 1 - i] < rhs[rhs.size() - 1 - i]) return false;
       }
       return true;
     };
diff --git a/tensorflow/core/kernels/stage_op.cc b/tensorflow/core/kernels/stage_op.cc
index 73a02a34cf231799e6a813f042757d70b4e9414a..c91bdc43cf4636481f141df70f30b1f2d74dc1a2 100644
--- a/tensorflow/core/kernels/stage_op.cc
+++ b/tensorflow/core/kernels/stage_op.cc
@@ -151,7 +151,7 @@ class Buffer : public ResourceBase {
   }
 
   // Are there a limit number of elements or a memory limit
-  // configued on this buffer?
+  // configured on this buffer?
   bool IsBounded() const { return capacity_ > 0 || memory_limit_ > 0; }
 
   bool IsCapacityFull() const { return buf_.size() >= capacity_; }
diff --git a/tensorflow/core/kernels/tensor_array_ops.cc b/tensorflow/core/kernels/tensor_array_ops.cc
index a97a71b344d64be09daf919c387d55a5c06db5aa..aa85f546a81d0e6b8cf41fc23532fd4a11fe42ec 100644
--- a/tensorflow/core/kernels/tensor_array_ops.cc
+++ b/tensorflow/core/kernels/tensor_array_ops.cc
@@ -352,9 +352,9 @@ class TensorArrayGradOp : public TensorArrayCreationOp {
     }
 
     const auto key = strings::StrCat(output_handle(0), output_handle(1));
-    auto creator = [this, key, tensor_array, array_size, marked_size,
-                    element_shape, shape_to_prepend, tensor_array_output_handle,
-                    output_handle](TensorArray** ret) -> Status {
+    auto creator = [key, tensor_array, array_size, marked_size, element_shape,
+                    shape_to_prepend,
+                    tensor_array_output_handle](TensorArray** ret) -> Status {
       *ret = new TensorArray(
           key, tensor_array->ElemType(), *tensor_array_output_handle,
           array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
diff --git a/tensorflow/core/kernels/tensor_forest/BUILD b/tensorflow/core/kernels/tensor_forest/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..df035506f7698d1d213efad6088e9bfb53d97282
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/BUILD
@@ -0,0 +1,53 @@
+# Description:
+#   OpKernels for tensor forest ops.
+
+package(
+    default_visibility = ["//tensorflow:internal"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+load("//tensorflow:tensorflow.bzl", "tf_kernel_library")
+
+cc_library(
+    name = "resources",
+    srcs = ["resources.cc"],
+    hdrs = ["resources.h"],
+    deps = [
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "resource_ops",
+    srcs = ["resource_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "prediction_ops",
+    srcs = ["prediction_ops.cc"],
+    deps = [
+        ":resources",
+        "//tensorflow/core:framework",
+        "//tensorflow/core:lib",
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_cc",
+    ],
+)
+
+tf_kernel_library(
+    name = "tensor_forest_ops",
+    deps = [
+        ":prediction_ops",
+        ":resource_ops",
+    ],
+)
diff --git a/tensorflow/core/kernels/tensor_forest/prediction_ops.cc b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e75421fb95791c9dc8aa3b3baf13cffed50d3da
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/prediction_ops.cc
@@ -0,0 +1,93 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/platform/thread_annotations.h"
+#include "tensorflow/core/util/work_sharder.h"
+
+namespace tensorflow {
+
+class TensorForestTreePredictOp : public OpKernel {
+ public:
+  explicit TensorForestTreePredictOp(OpKernelConstruction* context)
+      : OpKernel(context) {
+    OP_REQUIRES_OK(context,
+                   context->GetAttr("logits_dimension", &logits_dimension_));
+  }
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* dense_features_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->input("dense_features", &dense_features_t));
+
+    auto dense_features = dense_features_t->matrix<float>();
+    const int32 batch_size = dense_features_t->dim_size(0);
+
+    Tensor* output_predictions = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, {batch_size, logits_dimension_},
+                                            &output_predictions));
+    auto out = output_predictions->matrix<float>();
+
+    if (decision_tree_resource->get_size() <= 0) {
+      out.setZero();
+      return;
+    }
+    auto* worker_threads = context->device()->tensorflow_cpu_worker_threads();
+    const int32 num_threads = worker_threads->num_threads;
+
+    // TODO(yupbank): This was from contrib version.
+    //  This cost would probably depend on the depth of the tree we have.
+    //  We will need to run it on a number of trees of diff depth
+    //  and see the num of cpu cycles
+    const int64 cost_per_traverse = 500;
+    auto traverse = [this, &out, &dense_features, decision_tree_resource,
+                     batch_size](int64 start, int64 end) {
+      DCHECK_LE(start, end) << "Start exceeding End";
+      DCHECK_LE(end, batch_size) << "End exceeding batch size";
+      for (int example_id = start; example_id < end; ++example_id) {
+        const int32 leaf_id =
+            decision_tree_resource->TraverseTree(example_id, &dense_features);
+        set_output_value(example_id, leaf_id, decision_tree_resource, &out);
+      }
+    };
+    Shard(num_threads, worker_threads->workers, batch_size, cost_per_traverse,
+          traverse);
+  };
+
+  void set_output_value(const int32 example_id, const int32 leaf_id,
+                        const TensorForestTreeResource* decision_tree_resource,
+                        TTypes<float>::Matrix* out) const {
+    for (int j = 0; j < logits_dimension_; ++j) {
+      const float logit = decision_tree_resource->get_prediction(leaf_id, j);
+      (*out)(example_id, j) = logit;
+    }
+  }
+
+ private:
+  int32 logits_dimension_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreePredict").Device(DEVICE_CPU),
+                        TensorForestTreePredictOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resource_ops.cc b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0474d56098f50412345fe017c8bdfb09e908be0b
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resource_ops.cc
@@ -0,0 +1,136 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+
+namespace tensorflow {
+
+class TensorForestCreateTreeVariableOp : public OpKernel {
+ public:
+  explicit TensorForestCreateTreeVariableOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    auto* const result = new TensorForestTreeResource();
+
+    if (!result->InitFromSerialized(tree_config_t->scalar<string>()())) {
+      result->Unref();
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+
+    // Only create one, if one does not exist already. Report status for all
+    // other exceptions.
+    auto status = CreateResource(context, HandleFromInput(context, 0), result);
+    if (!status.ok() && status.code() != tensorflow::error::ALREADY_EXISTS) {
+      OP_REQUIRES(context, false, status);
+    }
+  }
+};
+
+// Op for serializing a model.
+class TensorForestTreeSerializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSerializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_config_t = nullptr;
+    OP_REQUIRES_OK(
+        context, context->allocate_output(0, TensorShape(), &output_config_t));
+    output_config_t->scalar<string>()() =
+        decision_tree_resource->decision_tree().SerializeAsString();
+  }
+};
+
+// Op for deserializing a tree variable from a checkpoint.
+class TensorForestTreeDeserializeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeDeserializeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+
+    const Tensor* tree_config_t;
+    OP_REQUIRES_OK(context, context->input("tree_config", &tree_config_t));
+
+    // Deallocate all the previous objects on the resource.
+    decision_tree_resource->Reset();
+
+    if (!decision_tree_resource->InitFromSerialized(
+            tree_config_t->scalar<string>()())) {
+      OP_REQUIRES(context, false,
+                  errors::InvalidArgument("Unable to parse tree config."));
+    }
+  }
+};
+
+// Op for getting tree size.
+class TensorForestTreeSizeOp : public OpKernel {
+ public:
+  explicit TensorForestTreeSizeOp(OpKernelConstruction* context)
+      : OpKernel(context) {}
+
+  void Compute(OpKernelContext* context) override {
+    TensorForestTreeResource* decision_tree_resource;
+    OP_REQUIRES_OK(context, LookupResource(context, HandleFromInput(context, 0),
+                                           &decision_tree_resource));
+    mutex_lock l(*decision_tree_resource->get_mutex());
+    core::ScopedUnref unref_me(decision_tree_resource);
+    Tensor* output_t = nullptr;
+    OP_REQUIRES_OK(context,
+                   context->allocate_output(0, TensorShape(), &output_t));
+    output_t->scalar<int32>()() = decision_tree_resource->get_size();
+  }
+};
+
+REGISTER_RESOURCE_HANDLE_KERNEL(TensorForestTreeResource);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestTreeIsInitializedOp").Device(DEVICE_CPU),
+    IsResourceInitialized<TensorForestTreeResource>);
+
+REGISTER_KERNEL_BUILDER(
+    Name("TensorForestCreateTreeVariable").Device(DEVICE_CPU),
+    TensorForestCreateTreeVariableOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSerialize").Device(DEVICE_CPU),
+                        TensorForestTreeSerializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeDeserialize").Device(DEVICE_CPU),
+                        TensorForestTreeDeserializeOp);
+
+REGISTER_KERNEL_BUILDER(Name("TensorForestTreeSize").Device(DEVICE_CPU),
+                        TensorForestTreeSizeOp);
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.cc b/tensorflow/core/kernels/tensor_forest/resources.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bcd1a1e904171c6c97a6c1cb5ce0809e393be015
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.cc
@@ -0,0 +1,71 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/kernels/tensor_forest/resources.h"
+#include "tensorflow/core/kernels/boosted_trees/boosted_trees.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+const boosted_trees::Tree& TensorForestTreeResource::decision_tree() const {
+  return *decision_tree_;
+}
+
+const int32 TensorForestTreeResource::get_size() const {
+  return decision_tree_->nodes_size();
+}
+
+TensorForestTreeResource::TensorForestTreeResource()
+    : decision_tree_(
+          protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_)) {}
+
+const float TensorForestTreeResource::get_prediction(
+    const int32 id, const int32 dimension_id) const {
+  return decision_tree_->nodes(id).leaf().vector().value(dimension_id);
+}
+
+const int32 TensorForestTreeResource::TraverseTree(
+    const int32 example_id,
+    const TTypes<float>::ConstMatrix* dense_data) const {
+  using boosted_trees::Node;
+  using boosted_trees::Tree;
+  int32 current_id = 0;
+  while (true) {
+    const Node& current = decision_tree_->nodes(current_id);
+    if (current.has_leaf()) {
+      return current_id;
+    }
+    DCHECK_EQ(current.node_case(), Node::kDenseSplit);
+    const auto& split = current.dense_split();
+
+    if ((*dense_data)(example_id, split.feature_id()) <= split.threshold()) {
+      current_id = split.left_id();
+    } else {
+      current_id = split.right_id();
+    }
+  }
+}
+
+bool TensorForestTreeResource::InitFromSerialized(const string& serialized) {
+  return ParseProtoUnlimited(decision_tree_, serialized);
+}
+
+void TensorForestTreeResource::Reset() {
+  arena_.Reset();
+  DCHECK_EQ(0, arena_.SpaceAllocated());
+  decision_tree_ = protobuf::Arena::CreateMessage<boosted_trees::Tree>(&arena_);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/kernels/tensor_forest/resources.h b/tensorflow/core/kernels/tensor_forest/resources.h
new file mode 100644
index 0000000000000000000000000000000000000000..da258e5017ca8cc9b996d83bcd767e89d61322d7
--- /dev/null
+++ b/tensorflow/core/kernels/tensor_forest/resources.h
@@ -0,0 +1,65 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+#define TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
+
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Forward declaration for proto class Tree.
+namespace boosted_trees {
+class Tree;
+}  // namespace boosted_trees
+
+// Keep a tree ensemble in memory for efficient evaluation and mutation.
+class TensorForestTreeResource : public ResourceBase {
+ public:
+  TensorForestTreeResource();
+
+  string DebugString() override {
+    return strings::StrCat("TensorForestTree[size=", get_size(), "]");
+  }
+
+  mutex* get_mutex() { return &mu_; }
+
+  bool InitFromSerialized(const string& serialized);
+
+  // Resets the resource and frees the proto.
+  // Caller needs to hold the mutex lock while calling this.
+  void Reset();
+
+  const int32 get_size() const;
+
+  const boosted_trees::Tree& decision_tree() const;
+
+  const float get_prediction(const int32 id, const int32 dimension_id) const;
+
+  const int32 TraverseTree(const int32 example_id,
+                           const TTypes<float>::ConstMatrix* dense_data) const;
+
+ protected:
+  mutex mu_;
+  protobuf::Arena arena_;
+  boosted_trees::Tree* decision_tree_;
+};
+
+}  // namespace tensorflow
+#endif  // TENSORFLOW_CORE_KERNELS_TENSOR_FOREST_RESOURCES_H_
diff --git a/tensorflow/core/kernels/training_ops.cc b/tensorflow/core/kernels/training_ops.cc
index acf162deec9bdb05183103ce6b47f364106a2036..6504ad1b09c089cafec8c2b0ce0f2971aa506b52 100644
--- a/tensorflow/core/kernels/training_ops.cc
+++ b/tensorflow/core/kernels/training_ops.cc
@@ -283,6 +283,22 @@ struct ApplyMomentum<CPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    accum.device(d) = accum * momentum() - grad * lr();
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum() - grad * lr());
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdamNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -331,6 +347,28 @@ struct ApplyAdamSYCL {
 template <typename T>
 struct ApplyAdam<CPUDevice, T> : ApplyAdamNonCuda<CPUDevice, T> {};
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<CPUDevice, T> {
+  void operator()(const CPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    const T alpha = lr() * Eigen::numext::sqrt(T(1) - beta2_power()) /
+                    (T(1) - beta1_power());
+
+    m.device(d) += (grad - m) * (T(1) - beta1());
+    v.device(d) += (grad.square() - v) * (T(1) - beta2());
+    vhat.device(d) = vhat.cwiseMax(v);
+    var.device(d) -= (m * alpha) / (vhat.sqrt() + epsilon());
+  }
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMaxNonCuda {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -2525,6 +2563,217 @@ TF_CALL_double(REGISTER_CPU_KERNELS);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit ApplyKerasMomentumOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar: ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Tensor& momentum = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyKerasMomentum<Device, T>()(
+        device, var.flat<T>(), accum.flat<T>(), lr.scalar<T>(), grad.flat<T>(),
+        momentum.scalar<T>(), use_nesterov_);
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(D, T)                               \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyKerasMomentum") \
+                              .Device(DEVICE_##D)            \
+                              .HostMemory("var")             \
+                              .HostMemory("accum")           \
+                              .TypeConstraint<T>("T"),       \
+                          ApplyKerasMomentumOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                               \
+  template <>                                                             \
+  void ApplyKerasMomentum<GPUDevice, T>::operator()(                      \
+      const GPUDevice& d, typename TTypes<T>::Flat var,                   \
+      typename TTypes<T>::Flat accum, typename TTypes<T>::ConstScalar lr, \
+      typename TTypes<T>::ConstFlat grad,                                 \
+      typename TTypes<T>::ConstScalar momentum, bool use_nesterov);       \
+  extern template struct ApplyKerasMomentum<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
+// Note, this op works on cpu only.
+template <typename T, typename Tindex>
+class SparseApplyKerasMomentumOp : public OpKernel {
+ public:
+  explicit SparseApplyKerasMomentumOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_nesterov", &use_nesterov_));
+  }
+
+  void Compute(OpKernelContext* ctx) override NO_THREAD_SAFETY_ANALYSIS {
+    auto locks =
+        MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_, {0, 1});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 0, use_exclusive_lock_, true, &var));
+    Tensor accum;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<CPUDevice, T>(
+                            ctx, 1, use_exclusive_lock_, true, &accum));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, accum.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(accum.shape()),
+        errors::InvalidArgument("var and accum do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                accum.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVectorOrHigher(var.shape()),
+                errors::InvalidArgument("var must be at least 1 dimensional"));
+
+    const Tensor& lr = ctx->input(2);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    const Tensor& grad = ctx->input(3);
+    const Tensor& indices = ctx->input(4);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(indices.shape()),
+                errors::InvalidArgument("indices must be one-dimensional"));
+
+    for (int d = 1; d < var.dims(); d++) {
+      OP_REQUIRES(ctx, var.dim_size(d) == grad.dim_size(d),
+                  errors::InvalidArgument(strings::StrCat(
+                      "var and grad must match in dimension ", d)));
+    }
+    const Tindex N = indices.dim_size(0);
+    OP_REQUIRES(
+        ctx, grad.dim_size(0) == N,
+        errors::InvalidArgument(
+            "grad must be the same size as indices in the first dimension."));
+
+    const Tensor& momentum = ctx->input(5);
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(momentum.shape()),
+                errors::InvalidArgument("momentum is not a scalar: ",
+                                        momentum.shape().DebugString()));
+
+    if (N > 0) {
+      const Tindex first_dim_size = var.dim_size(0);
+      auto indices_vec = indices.vec<Tindex>();
+      auto var_flat = var.flat_outer_dims<T>();
+      auto accum_flat = accum.flat_outer_dims<T>();
+      auto grad_flat = grad.flat_outer_dims<T>();
+      T lr_scalar = lr.scalar<T>()();
+      T momentum_scalar = momentum.scalar<T>()();
+
+      for (Tindex i = 0; i < N; i++) {
+        const Tindex index = internal::SubtleMustCopy(indices_vec(i));
+        OP_REQUIRES(ctx, FastBoundsCheck(index, first_dim_size),
+                    errors::InvalidArgument(
+                        strings::StrCat("Index ", index, " at offset ", i,
+                                        " in indices is out of range")));
+        auto a = accum_flat.template chip<0>(index);
+        auto g = grad_flat.template chip<0>(i);
+        auto v = var_flat.template chip<0>(index);
+        a = a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        if (use_nesterov_) {
+          v += a * a.constant(momentum_scalar) - g * g.constant(lr_scalar);
+        } else {
+          v += a;
+        }
+      }
+    }
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+  bool use_nesterov_;
+};
+
+#define REGISTER_KERNELS(T, Tindices)                                \
+  REGISTER_KERNEL_BUILDER(Name("ResourceSparseApplyKerasMomentum")   \
+                              .Device(DEVICE_CPU)                    \
+                              .TypeConstraint<T>("T")                \
+                              .TypeConstraint<Tindices>("Tindices"), \
+                          SparseApplyKerasMomentumOp<T, Tindices>);
+#define REGISTER_CPU_KERNELS(T) \
+  REGISTER_KERNELS(T, int32);   \
+  REGISTER_KERNELS(T, int64);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdamOp : public OpKernel {
  public:
@@ -2786,6 +3035,147 @@ REGISTER_KERNELS(GPU, double);
 #undef REGISTER_CPU_KERNELS
 #undef REGISTER_KERNELS
 
+template <typename Device, typename T>
+class ApplyAdamWithAmsgradOp : public OpKernel {
+ public:
+  explicit ApplyAdamWithAmsgradOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("use_locking", &use_exclusive_lock_));
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    auto locks = MaybeLockVariableInputMutexesInOrder(ctx, use_exclusive_lock_,
+                                                      {0, 1, 2});
+
+    Tensor var;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 0, use_exclusive_lock_, false, &var));
+    Tensor m;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 1, use_exclusive_lock_, false, &m));
+    Tensor v;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 2, use_exclusive_lock_, false, &v));
+    Tensor vhat;
+    OP_REQUIRES_OK(ctx, GetInputTensorFromVariable<Device, T>(
+                            ctx, 3, use_exclusive_lock_, false, &vhat));
+    OP_REQUIRES(
+        ctx, var.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(0)));
+    OP_REQUIRES(
+        ctx, m.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(1)));
+    OP_REQUIRES(
+        ctx, v.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+    OP_REQUIRES(
+        ctx, vhat.IsInitialized(),
+        errors::FailedPrecondition(
+            "Attempting to use uninitialized variables: ", requested_input(2)));
+
+    const Tensor& beta1_power = ctx->input(4);
+    const Tensor& beta2_power = ctx->input(5);
+    const Tensor& lr = ctx->input(6);
+    const Tensor& beta1 = ctx->input(7);
+    const Tensor& beta2 = ctx->input(8);
+    const Tensor& epsilon = ctx->input(9);
+
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1_power.shape()),
+                errors::InvalidArgument("beta1_power is not a scalar: ",
+                                        beta1_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2_power.shape()),
+                errors::InvalidArgument("beta2_power is not a scalar: ",
+                                        beta2_power.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(lr.shape()),
+                errors::InvalidArgument("lr is not a scalar : ",
+                                        lr.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta1.shape()),
+                errors::InvalidArgument("beta1 is not a scalar: ",
+                                        beta1.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(beta2.shape()),
+                errors::InvalidArgument("beta2 is not a scalar: ",
+                                        beta2.shape().DebugString()));
+    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(epsilon.shape()),
+                errors::InvalidArgument("epsilon is not a scalar: ",
+                                        epsilon.shape().DebugString()));
+
+    const Tensor& grad = ctx->input(10);
+    OP_REQUIRES(ctx, var.shape().IsSameSize(m.shape()),
+                errors::InvalidArgument("var and m do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        m.shape().DebugString()));
+    OP_REQUIRES(ctx, var.shape().IsSameSize(v.shape()),
+                errors::InvalidArgument("var and v do not have the same shape",
+                                        var.shape().DebugString(), " ",
+                                        v.shape().DebugString()));
+    OP_REQUIRES(
+        ctx, var.shape().IsSameSize(grad.shape()),
+        errors::InvalidArgument("var and grad do not have the same shape",
+                                var.shape().DebugString(), " ",
+                                grad.shape().DebugString()));
+
+    const Device& device = ctx->template eigen_device<Device>();
+    functor::ApplyAdamWithAmsgrad<Device, T>()(
+        device, var.flat<T>(), m.flat<T>(), v.flat<T>(), vhat.flat<T>(),
+        beta1_power.scalar<T>(), beta2_power.scalar<T>(), lr.scalar<T>(),
+        beta1.scalar<T>(), beta2.scalar<T>(), epsilon.scalar<T>(),
+        grad.flat<T>());
+
+    MaybeForwardRefInputToRefOutput(ctx, 0, 0);
+  }
+
+ private:
+  bool use_exclusive_lock_;
+};
+
+#define REGISTER_KERNELS(D, T)                                 \
+  REGISTER_KERNEL_BUILDER(Name("ResourceApplyAdamWithAmsgrad") \
+                              .HostMemory("var")               \
+                              .HostMemory("m")                 \
+                              .HostMemory("v")                 \
+                              .HostMemory("vhat")              \
+                              .Device(DEVICE_##D)              \
+                              .TypeConstraint<T>("T"),         \
+                          ApplyAdamWithAmsgradOp<D##Device, T>);
+#define REGISTER_CPU_KERNELS(T) REGISTER_KERNELS(CPU, T);
+
+TF_CALL_half(REGISTER_CPU_KERNELS);
+TF_CALL_bfloat16(REGISTER_CPU_KERNELS);
+TF_CALL_float(REGISTER_CPU_KERNELS);
+TF_CALL_double(REGISTER_CPU_KERNELS);
+
+#if GOOGLE_CUDA
+// Forward declarations of the functor specializations for GPU.
+namespace functor {
+#define DECLARE_GPU_SPEC(T)                                   \
+  template <>                                                 \
+  void ApplyAdamWithAmsgrad<GPUDevice, T>::operator()(        \
+      const GPUDevice& d, typename TTypes<T>::Flat var,       \
+      typename TTypes<T>::Flat m, typename TTypes<T>::Flat v, \
+      typename TTypes<T>::Flat vhat,                          \
+      typename TTypes<T>::ConstScalar beta1_power,            \
+      typename TTypes<T>::ConstScalar beta2_power,            \
+      typename TTypes<T>::ConstScalar lr,                     \
+      typename TTypes<T>::ConstScalar beta1,                  \
+      typename TTypes<T>::ConstScalar beta2,                  \
+      typename TTypes<T>::ConstScalar epsilon,                \
+      typename TTypes<T>::ConstFlat grad);                    \
+  extern template struct ApplyAdamWithAmsgrad<GPUDevice, T>;
+DECLARE_GPU_SPEC(Eigen::half);
+DECLARE_GPU_SPEC(float);
+DECLARE_GPU_SPEC(double);
+#undef DECLARE_GPU_SPEC
+}  // namespace functor
+
+REGISTER_KERNELS(GPU, Eigen::half);
+REGISTER_KERNELS(GPU, float);
+REGISTER_KERNELS(GPU, double);
+#endif
+#undef REGISTER_CPU_KERNELS
+#undef REGISTER_KERNELS
+
 template <typename Device, typename T>
 class ApplyAdaMaxOp : public OpKernel {
  public:
diff --git a/tensorflow/core/kernels/training_ops.h b/tensorflow/core/kernels/training_ops.h
index e10a4cb125410dee383932f134e0339ba1c19b93..054f07350e60cd8a0c3713efc31d5a606fa6d2bc 100644
--- a/tensorflow/core/kernels/training_ops.h
+++ b/tensorflow/core/kernels/training_ops.h
@@ -126,6 +126,15 @@ struct ApplyMomentum {
                   typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyKerasMomentum {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov);
+};
+
 template <typename Device, typename T>
 struct ApplyAdam {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
@@ -139,6 +148,20 @@ struct ApplyAdam {
                   typename TTypes<T>::ConstFlat grad, bool use_nesterov);
 };
 
+template <typename Device, typename T>
+struct ApplyAdamWithAmsgrad {
+  void operator()(const Device& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad);
+};
+
 template <typename Device, typename T>
 struct ApplyAdaMax {
   void operator()(const Device& d, typename TTypes<T>::Flat var,
diff --git a/tensorflow/core/kernels/training_ops_gpu.cu.cc b/tensorflow/core/kernels/training_ops_gpu.cu.cc
index 4bd32592db16b70b2731a6cf775dbf774263d283..f45b9ffca7c9970ca2aee1416d2c5bf4d90f413a 100644
--- a/tensorflow/core/kernels/training_ops_gpu.cu.cc
+++ b/tensorflow/core/kernels/training_ops_gpu.cu.cc
@@ -101,6 +101,27 @@ struct ApplyMomentum<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyKerasMomentum<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat accum,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstFlat grad,
+                  typename TTypes<T>::ConstScalar momentum, bool use_nesterov) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    accum.device(d) = (accum * momentum.reshape(single).broadcast(bcast) -
+                       grad * lr.reshape(single).broadcast(bcast));
+    if (use_nesterov) {
+      var.device(d) += (accum * momentum.reshape(single).broadcast(bcast) -
+                        grad * lr.reshape(single).broadcast(bcast));
+    } else {
+      var.device(d) += accum;
+    }
+  }
+};
+
 template <typename T>
 struct ApplyAdam<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -144,6 +165,39 @@ struct ApplyAdam<GPUDevice, T> {
   }
 };
 
+template <typename T>
+struct ApplyAdamWithAmsgrad<GPUDevice, T> {
+  void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
+                  typename TTypes<T>::Flat m, typename TTypes<T>::Flat v,
+                  typename TTypes<T>::Flat vhat,
+                  typename TTypes<T>::ConstScalar beta1_power,
+                  typename TTypes<T>::ConstScalar beta2_power,
+                  typename TTypes<T>::ConstScalar lr,
+                  typename TTypes<T>::ConstScalar beta1,
+                  typename TTypes<T>::ConstScalar beta2,
+                  typename TTypes<T>::ConstScalar epsilon,
+                  typename TTypes<T>::ConstFlat grad) {
+    Eigen::array<typename TTypes<T>::Tensor::Index, 1> bcast;
+    bcast[0] = grad.dimension(0);
+    Eigen::Sizes<1> single;
+    const auto one = static_cast<T>(1.0);
+    m.device(d) =
+        m + (beta1.constant(one) - beta1).reshape(single).broadcast(bcast) *
+                (grad - m);
+    v.device(d) =
+        v + (beta2.constant(one) - beta2).reshape(single).broadcast(bcast) *
+                (grad.square() - v);
+    vhat.device(d) = vhat.cwiseMax(v);
+
+    var.device(d) -= (lr * (beta2_power.constant(one) - beta2_power).sqrt() /
+                      (beta1_power.constant(one) - beta1_power))
+                         .reshape(single)
+                         .broadcast(bcast) *
+                     m /
+                     (epsilon.reshape(single).broadcast(bcast) + vhat.sqrt());
+  }
+};
+
 template <typename T>
 struct ApplyAdaMax<GPUDevice, T> {
   void operator()(const GPUDevice& d, typename TTypes<T>::Flat var,
@@ -302,10 +356,18 @@ template struct functor::ApplyMomentum<GPUDevice, Eigen::half>;
 template struct functor::ApplyMomentum<GPUDevice, float>;
 template struct functor::ApplyMomentum<GPUDevice, double>;
 
+template struct functor::ApplyKerasMomentum<GPUDevice, Eigen::half>;
+template struct functor::ApplyKerasMomentum<GPUDevice, float>;
+template struct functor::ApplyKerasMomentum<GPUDevice, double>;
+
 template struct functor::ApplyAdam<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdam<GPUDevice, float>;
 template struct functor::ApplyAdam<GPUDevice, double>;
 
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, Eigen::half>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, float>;
+template struct functor::ApplyAdamWithAmsgrad<GPUDevice, double>;
+
 template struct functor::ApplyAdaMax<GPUDevice, Eigen::half>;
 template struct functor::ApplyAdaMax<GPUDevice, float>;
 template struct functor::ApplyAdaMax<GPUDevice, double>;
diff --git a/tensorflow/core/kernels/training_ops_test.cc b/tensorflow/core/kernels/training_ops_test.cc
index 2dcc4a500e6c64753c6fde4f88582f914a50089e..1ec57b45221906bebe7366af45375cc93b08d3df 100644
--- a/tensorflow/core/kernels/training_ops_test.cc
+++ b/tensorflow/core/kernels/training_ops_test.cc
@@ -151,6 +151,40 @@ static void BM_Momentum(int iters, int params) {
 }
 BENCHMARK(BM_Momentum)->Arg(128 << 10)->Arg(256 << 10);
 
+static void KerasMomentum(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto accum = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, accum, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto accum = Var(g, n);
+    auto lr = Scalar(g, 0.01);
+    auto grad = Random(g, n);
+    auto mom = Scalar(g, 0.01);
+    test::graph::Multi(g, "ApplyKerasMomentum", {var, accum, lr, grad, mom});
+    *train_g = g;
+  }
+}
+
+static void BM_KerasMomentum(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  KerasMomentum(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_KerasMomentum)->Arg(128 << 10)->Arg(256 << 10);
+
 static void Adam(int32 n, Graph** init_g, Graph** train_g) {
   TensorShape shape({n});
   {
@@ -194,6 +228,50 @@ static void BM_Adam(int iters, int params) {
 }
 BENCHMARK(BM_Adam)->Arg(128 << 10)->Arg(256 << 10);
 
+static void AdamWithAmsgrad(int32 n, Graph** init_g, Graph** train_g) {
+  TensorShape shape({n});
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto v = Var(g, n);
+    auto zero = Zeros(g, n);
+    test::graph::Assign(g, var, zero);
+    test::graph::Assign(g, m, zero);
+    test::graph::Assign(g, v, zero);
+    *init_g = g;
+  }
+  {
+    Graph* g = new Graph(OpRegistry::Global());
+    auto var = Var(g, n);
+    auto m = Var(g, n);
+    auto v = Var(g, n);
+    auto vhat = Var(g, n);
+    auto beta1_power = Scalar(g, 0.9);
+    auto beta2_power = Scalar(g, 0.99);
+    auto lr = Scalar(g, 0.01);
+    auto beta1 = Scalar(g, 0.9);
+    auto beta2 = Scalar(g, 0.99);
+    auto epsilon = Scalar(g, 1e-8);
+    auto grad = Random(g, n);
+    test::graph::Multi(g, "ApplyAdamWithAmsgrad",
+                       {var, m, v, vhat, beta1_power, beta2_power, lr, beta1,
+                        beta2, epsilon, grad});
+    *train_g = g;
+  }
+}
+
+static void BM_AdamWithAmsgrad(int iters, int params) {
+  const int64 tot = static_cast<int64>(iters) * params;
+  testing::ItemsProcessed(tot);
+  testing::BytesProcessed(tot * sizeof(float));
+  Graph* init;
+  Graph* train;
+  AdamWithAmsgrad(params, &init, &train);
+  test::Benchmark("cpu", train, GetOptions(), init).Run(iters);
+}
+BENCHMARK(BM_AdamWithAmsgrad)->Arg(128 << 10)->Arg(256 << 10);
+
 static void RMSProp(int32 n, Graph** init_g, Graph** train_g) {
   TensorShape shape({n});
   {
diff --git a/tensorflow/core/kernels/unicode_ops.cc b/tensorflow/core/kernels/unicode_ops.cc
index dd4415711b1b36ca570a9af72a5829ae030a5d6a..3ee0edb35a72d2e3de747fad32bb69bb2872ac80 100644
--- a/tensorflow/core/kernels/unicode_ops.cc
+++ b/tensorflow/core/kernels/unicode_ops.cc
@@ -13,9 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <stdint.h>
+#include <cstddef>
+#include <functional>
 #include <memory>
 #include <string>
+#include <vector>
 
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "unicode/appendable.h"  // TF:icu
+#include "unicode/schriter.h"  // TF:icu
+#include "unicode/uchar.h"  // TF:icu
 #include "unicode/ucnv.h"  // TF:icu
 #include "unicode/ucnv_err.h"  // TF:icu
 #include "unicode/umachine.h"  // TF:icu
@@ -23,15 +31,57 @@ limitations under the License.
 #include "unicode/unistr.h"  // TF:icu
 #include "unicode/uset.h"  // TF:icu
 #include "unicode/utypes.h"  // TF:icu
+#include "tensorflow/core/framework/kernel_def_builder.h"
+#include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
 #include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/tensor_shape.h"
+#include "tensorflow/core/framework/tensor_types.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/kernels/bounds_check.h"
 #include "tensorflow/core/kernels/string_util.h"
 #include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/core/stringpiece.h"
+#include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/util/bcast.h"
 #include "tensorflow/core/util/ptr_util.h"
 
 namespace tensorflow {
+namespace {
+
+void Encode(const UnicodeEncoding encoding, const icu::UnicodeString& in,
+            string* out) {
+  if (encoding == UnicodeEncoding::UTF8) {
+    out->clear();
+    in.toUTF8String(*out);
+  } else if (encoding == UnicodeEncoding::UTF16BE) {
+    // TODO(gbillock): consider using the
+    // extract(char *dest, int32_t destCapacity, UConverter *cnv)
+    // for UTF16/32
+    out->clear();  // subtle: must come before reserve()
+    out->reserve(2 * in.length() + 1);
+    const char16_t* buf = in.getBuffer();
+    for (int i = 0; i < in.length(); ++i) {
+      // Emit big-endian encoding for UTF-16 always.
+      out->push_back((buf[i] & 0xFF00) >> 8);
+      out->push_back(buf[i] & 0x00FF);
+    }
+  } else if (encoding == UnicodeEncoding::UTF32BE) {
+    out->clear();  // subtle: must come before reserve()
+    out->reserve(4 * in.countChar32() + 1);
+    icu::StringCharacterIterator it(in);
+    UChar32 ch;
+    while (it.hasNext()) {
+      ch = it.next32PostInc();
+      out->push_back((ch & 0xFF000000) >> 24);
+      out->push_back((ch & 0x00FF0000) >> 16);
+      out->push_back((ch & 0x0000FF00) >> 8);
+      out->push_back((ch & 0x000000FF));
+    }
+  }
+}
 
 // This error callback is only useful for finding illegal encoding errors when
 // we want to be strict -- otherwise illegal encodings are replaced on read
@@ -146,40 +196,66 @@ class WrappedConverter {
   string name_;
 };
 
+struct ErrorOptions {
+  UChar32 subst = 0xFFFD;
+  bool elide_replacement = false;
+  bool replace_control_chars = false;
+  bool error_on_malformatting = false;
+};
+
+Status GetErrorOptions(OpKernelConstruction* ctx, ErrorOptions* out) {
+  *out = ErrorOptions();
+
+  string error_policy;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("errors", &error_policy));
+
+  if (error_policy == "replace") {
+    out->elide_replacement = false;
+  } else if (error_policy == "ignore") {
+    out->elide_replacement = true;
+  } else if (error_policy == "strict") {
+    out->error_on_malformatting = true;
+  } else {
+    return errors::InvalidArgument(
+        "errors policy must be one of 'strict', 'replace', or 'ignore'");
+  }
+
+  int32 replacement_char;
+  TF_RETURN_IF_ERROR(ctx->GetAttr("replacement_char", &replacement_char));
+
+  if (replacement_char >= UCHAR_MIN_VALUE &&
+      replacement_char <= UCHAR_MAX_VALUE) {
+    out->subst = replacement_char;
+  } else {
+    return errors::InvalidArgument(
+        "replacement_char out of unicode codepoint range");
+  }
+
+  if (ctx->HasAttr("replace_control_characters")) {
+    TF_RETURN_IF_ERROR(ctx->GetAttr("replace_control_characters",
+                                    &(out->replace_control_chars)));
+  }
+
+  return Status::OK();
+}
+
+inline bool ShouldHandleFormatError(const ErrorOptions& error_options,
+                                    UChar32 ch, bool format_error) {
+  return ((error_options.replace_control_chars && ch <= 0x1F) || format_error);
+}
+
+}  // namespace
+
 class UnicodeTranscodeOp : public OpKernel {
  public:
   explicit UnicodeTranscodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
-    string error_policy;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("errors", &error_policy));
-    if (error_policy == "replace") {
-      elide_replacement_ = false;
-    } else if (error_policy == "ignore") {
-      elide_replacement_ = true;
-    } else if (error_policy == "strict") {
-      error_on_malformatting_ = true;
-    } else {
-      ctx->CtxFailure(errors::InvalidArgument(
-          "errors policy must be one of 'strict', 'replace', or 'ignore'"));
-    }
-
-    int32 replacement_char;
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("replacement_char", &replacement_char));
-    if (replacement_char >= UCHAR_MIN_VALUE &&
-        replacement_char <= UCHAR_MAX_VALUE) {
-      subst_ = replacement_char;
-    } else {
-      ctx->CtxFailure(errors::InvalidArgument(
-          "replacement_char out of unicode codepoint range"));
-    }
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
 
     string output_encoding;
     OP_REQUIRES_OK(ctx, ctx->GetAttr("output_encoding", &output_encoding));
     OP_REQUIRES_OK(ctx,
                    ParseUnicodeEncoding(output_encoding, &output_encoding_));
 
-    OP_REQUIRES_OK(ctx, ctx->GetAttr("replace_control_characters",
-                                     &replace_control_chars_));
-
     OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
     // Make a temporary UConverter to ensure it will create without error
     // at execution time (and to warm any data caches the converter needs).
@@ -228,7 +304,7 @@ class UnicodeTranscodeOp : public OpKernel {
       Transcode(&(output_flat(i)), input_encoder->converter_,
                 &found_any_format_error);
     }
-    if (error_on_malformatting_ && found_any_format_error) {
+    if (error_options_.error_on_malformatting && found_any_format_error) {
       ctx->CtxFailure(
           errors::InvalidArgument("Invalid formatting on input string"));
     }
@@ -240,12 +316,12 @@ class UnicodeTranscodeOp : public OpKernel {
   // out-of-range inputs.
   void TranslateCodepoints(icu::UnicodeString* s, bool* found_any_format_error,
                            UChar32 ch, int src_bytes, bool format_error) {
-    if ((replace_control_chars_ && ch <= 0x1F) || format_error) {
+    if (ShouldHandleFormatError(error_options_, ch, format_error)) {
       *found_any_format_error = true;
-      if (elide_replacement_) {
+      if (error_options_.elide_replacement) {
         return;
       } else {
-        ch = subst_;
+        ch = error_options_.subst;
       }
     }
     s->append(ch);
@@ -263,45 +339,202 @@ class UnicodeTranscodeOp : public OpKernel {
                   found_any_format_error, std::placeholders::_1,
                   std::placeholders::_2, std::placeholders::_3));
 
-    if (output_encoding_ == UnicodeEncoding::UTF8) {
-      s->clear();
-      source.toUTF8String(*s);
-    } else if (output_encoding_ == UnicodeEncoding::UTF16BE) {
-      // TODO(gbillock): consider using the
-      // extract(char *dest, int32_t destCapacity, UConverter *cnv)
-      // for UTF16/32
-      s->clear();  // subtle: must come before reserve()
-      s->reserve(2 * source.length() + 1);
-      const char16_t* buf = source.getBuffer();
-      for (int i = 0; i < source.length(); ++i) {
-        // Emit big-endian encoding for UTF-16 always.
-        s->push_back((buf[i] & 0xFF00) >> 8);
-        s->push_back(buf[i] & 0x00FF);
-      }
-    } else if (output_encoding_ == UnicodeEncoding::UTF32BE) {
-      s->clear();  // subtle: must come before reserve()
-      s->reserve(4 * source.countChar32() + 1);
-      for (int i = 0; i < source.countChar32(); ++i) {
-        // Emit big-endian encoding for UTF-32 always.
-        UChar32 ch = source.char32At(i);
-        s->push_back((ch & 0xFF000000) >> 24);
-        s->push_back((ch & 0x00FF0000) >> 16);
-        s->push_back((ch & 0x0000FF00) >> 8);
-        s->push_back((ch & 0x000000FF));
-      }
-    }
+    Encode(output_encoding_, source, s);
   }
 
-  UChar32 subst_ = 0xFFFD;
-  bool elide_replacement_ = false;
-  bool replace_control_chars_ = false;
-  bool error_on_malformatting_ = false;
-
   string input_encoding_;
+  ErrorOptions error_options_;
   UnicodeEncoding output_encoding_ = UnicodeEncoding::UTF8;
 };
 
 REGISTER_KERNEL_BUILDER(Name("UnicodeTranscode").Device(DEVICE_CPU),
                         UnicodeTranscodeOp);
 
+class UnicodeDecodeWithOffsetsOp : public OpKernel {
+ public:
+  explicit UnicodeDecodeWithOffsetsOp(OpKernelConstruction* ctx)
+      : OpKernel(ctx) {
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("input_encoding", &input_encoding_));
+    // Make a temporary UConverter to ensure it will create without error
+    // at execution time (and to warm any data caches the converter needs).
+    // This instance is not used.
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+  }
+
+  void Decode(OpKernelContext* ctx, std::vector<UChar32>* char_values,
+              std::vector<int64>* offset_values, int* string_length,
+              int64* next_row_split, UChar32 char_value, int char_length,
+              bool found_any_format_error) {
+    if (error_options_.error_on_malformatting && found_any_format_error) {
+      ctx->CtxFailure(
+          errors::InvalidArgument("Invalid formatting on input string"));
+    }
+    UChar32 decoded_value = char_value;
+    if (ShouldHandleFormatError(error_options_, char_value,
+                                found_any_format_error)) {
+      if (error_options_.elide_replacement) {
+        return;
+      } else {
+        decoded_value = error_options_.subst;
+      }
+    }
+
+    // Emit the char value.
+    char_values->push_back(decoded_value);
+
+    // Emit the byte offset
+    offset_values->push_back(*string_length);
+    *string_length += char_length;
+    *next_row_split += 1;
+  }
+
+  void Compute(OpKernelContext* ctx) override {
+    const Tensor* input_tensor;
+    OP_REQUIRES_OK(ctx, ctx->input("input", &input_tensor));
+
+    // Go through all the strings in `input`.
+    const auto& input_vec = input_tensor->flat<string>();
+
+    std::unique_ptr<WrappedConverter> input_encoder =
+        absl::make_unique<WrappedConverter>();
+    input_encoder->init(input_encoding_);
+    OP_REQUIRES(ctx, input_encoder->converter_,
+                errors::InvalidArgument(
+                    "Could not create converter for input encoding: " +
+                    input_encoding_));
+
+    std::vector<UChar32> char_values;
+    std::vector<int64> offset_values;
+
+    Tensor* output_row_splits;
+    OP_REQUIRES_OK(ctx, ctx->allocate_output("row_splits",
+                                             {input_tensor->NumElements() + 1},
+                                             &output_row_splits));
+    auto out_row_splits = output_row_splits->vec<int64>();
+
+    int row_split_index = 0;
+    int64 next_row_split = 0;
+    for (int i = 0; i < input_vec.size(); ++i) {
+      const string& input = input_vec(i);
+      // Convert input strings into unicode values. Output to a list of
+      // char_values, record row splits and char_to_byte_starts, which are all
+      // the fields needed to construct a RaggedTensor.
+      out_row_splits(row_split_index) = next_row_split;
+      row_split_index++;
+      int string_length = 0;
+      IterateUnicodeString(
+          input, input_encoder->converter_,
+          std::bind(&UnicodeDecodeWithOffsetsOp::Decode, this, ctx,
+                    &char_values, &offset_values, &string_length,
+                    &next_row_split, std::placeholders::_1,
+                    std::placeholders::_2, std::placeholders::_3));
+    }
+    out_row_splits(row_split_index) = next_row_split;
+
+    DCHECK(offset_values.size() == char_values.size());
+    Tensor* output_char_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_values",
+                                  {static_cast<int64>(char_values.size())},
+                                  &output_char_values));
+    Tensor* output_offset_values;
+    OP_REQUIRES_OK(
+        ctx, ctx->allocate_output("char_to_byte_starts",
+                                  {static_cast<int64>(offset_values.size())},
+                                  &output_offset_values));
+    auto out_char_values = output_char_values->vec<int32>();
+    auto out_offset_values = output_offset_values->vec<int64>();
+
+    // Load output tensors from intermediate value arrays.
+    for (int i = 0; i < char_values.size(); ++i) {
+      out_char_values(i) = static_cast<int32>(char_values[i]);
+      out_offset_values(i) = offset_values[i];
+    }
+  }
+
+ private:
+  string input_encoding_;
+  ErrorOptions error_options_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeDecodeWithOffsets").Device(DEVICE_CPU),
+                        UnicodeDecodeWithOffsetsOp);
+
+class UnicodeEncodeOp : public OpKernel {
+ public:
+  explicit UnicodeEncodeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
+    string encoding_tmp;
+    OP_REQUIRES_OK(ctx, ctx->GetAttr("output_encoding", &encoding_tmp));
+    OP_REQUIRES_OK(ctx, ParseUnicodeEncoding(encoding_tmp, &encoding_));
+    OP_REQUIRES_OK(ctx, GetErrorOptions(ctx, &error_options_));
+  }
+
+  /**
+   * Encodes Unicode codepoints into the desired string representation.
+   *
+   * We lose a dimension while encoding, since a series of integer codepoints is
+   * encoded into a single string.
+   *
+   * This accepts two input tensors: a rank 1 tensor of code point values and
+   * a single rank 1 tensor of splits which determine where each string begins
+   * and ends from the provided code points.
+   */
+  void Compute(OpKernelContext* context) override {
+    // Get inputs
+    const Tensor& input_tensor = context->input(0);
+    const auto input_tensor_flat = input_tensor.flat<int32>();
+    const Tensor& input_splits = context->input(1);
+    const auto input_splits_flat = input_splits.flat<int64>();
+
+    // Since we limit to a 2-D input (flat_values of rank 1 and a single splits
+    // tensor), our output dimension will be 1 with it's size equal to the
+    // number of splits (outer dimension or ragged tensor).
+    TensorShape output_shape({input_splits.dim_size(0) - 1});
+    Tensor* output_tensor;
+    OP_REQUIRES_OK(context, context->allocate_output("output", output_shape,
+                                                     &output_tensor));
+    auto output_tensor_flat = output_tensor->flat<string>();
+
+    // Use a single index over the flattened input values tensor.
+    int idx = 0;
+    // Loop through our split dimension to create a new string at each split.
+    for (int i = 1; i < input_splits_flat.size(); ++i) {
+      icu::UnicodeString unicode_string;
+      icu::UnicodeStringAppendable appendable_unicode_string(unicode_string);
+      for (; idx < input_splits_flat(i); ++idx) {
+        int32 code_point = input_tensor_flat(idx);
+        // Check for invalid code point
+        if (code_point > UCHAR_MAX_VALUE || code_point < UCHAR_MIN_VALUE) {
+          if (error_options_.error_on_malformatting) {
+            context->CtxFailure(errors::InvalidArgument(
+                "Code point value out of valid Unicode range."));
+            return;
+          } else if (!error_options_.elide_replacement) {
+            code_point = error_options_.subst;
+          }
+        }
+        appendable_unicode_string.appendCodePoint(code_point);
+      }
+      // Encode our string and save in the output.
+      string result;
+      Encode(encoding_, unicode_string, &result);
+      output_tensor_flat(i - 1) = result;
+    }
+  }
+
+ private:
+  UnicodeEncoding encoding_;
+  ErrorOptions error_options_;
+};
+
+REGISTER_KERNEL_BUILDER(Name("UnicodeEncode").Device(DEVICE_CPU),
+                        UnicodeEncodeOp);
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/lib/core/threadpool.cc b/tensorflow/core/lib/core/threadpool.cc
index 42689a6c3b37a5fd32ddb8595a91953ddef6e188..e929ff45a1fb8656d5762a8793cb17175f04c1f9 100644
--- a/tensorflow/core/lib/core/threadpool.cc
+++ b/tensorflow/core/lib/core/threadpool.cc
@@ -22,6 +22,7 @@ limitations under the License.
 #include "tensorflow/core/platform/denormal.h"
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/setround.h"
 #include "tensorflow/core/platform/tracing.h"
 #include "tensorflow/core/platform/types.h"
@@ -54,6 +55,9 @@ struct EigenEnvironment {
       port::ScopedFlushDenormal flush;
       // Set the processor rounding mode to ROUND TO NEAREST.
       port::ScopedSetRound round(FE_TONEAREST);
+      if (thread_options_.numa_node != port::kNUMANoAffinity) {
+        port::NUMASetThreadNodeAffinity(thread_options_.numa_node);
+      }
       f();
     });
   }
@@ -83,35 +87,38 @@ struct EigenEnvironment {
 
 struct ThreadPool::Impl : Eigen::ThreadPoolTempl<EigenEnvironment> {
   Impl(Env* env, const ThreadOptions& thread_options, const string& name,
-       int num_threads, bool low_latency_hint)
+       int num_threads, bool low_latency_hint, Eigen::Allocator* allocator)
       : Eigen::ThreadPoolTempl<EigenEnvironment>(
             num_threads, low_latency_hint,
-            EigenEnvironment(env, thread_options, name)) {}
+            EigenEnvironment(env, thread_options, name)),
+        allocator_(allocator) {}
 
   void ParallelFor(int64 total, int64 cost_per_unit,
                    std::function<void(int64, int64)> fn) {
     CHECK_GE(total, 0);
     CHECK_EQ(total, (int64)(Eigen::Index)total);
-    Eigen::ThreadPoolDevice device(this, this->NumThreads());
+    Eigen::ThreadPoolDevice device(this, this->NumThreads(), allocator_);
     device.parallelFor(
         total, Eigen::TensorOpCost(0, 0, cost_per_unit),
         [&fn](Eigen::Index first, Eigen::Index last) { fn(first, last); });
   }
+
+  Eigen::Allocator* allocator_;
 };
 
 ThreadPool::ThreadPool(Env* env, const string& name, int num_threads)
-    : ThreadPool(env, ThreadOptions(), name, num_threads, true) {}
+    : ThreadPool(env, ThreadOptions(), name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads)
-    : ThreadPool(env, thread_options, name, num_threads, true) {}
+    : ThreadPool(env, thread_options, name, num_threads, true, nullptr) {}
 
 ThreadPool::ThreadPool(Env* env, const ThreadOptions& thread_options,
                        const string& name, int num_threads,
-                       bool low_latency_hint) {
+                       bool low_latency_hint, Eigen::Allocator* allocator) {
   CHECK_GE(num_threads, 1);
   impl_.reset(new ThreadPool::Impl(env, thread_options, "tf_" + name,
-                                   num_threads, low_latency_hint));
+                                   num_threads, low_latency_hint, allocator));
 }
 
 ThreadPool::~ThreadPool() {}
diff --git a/tensorflow/core/lib/core/threadpool.h b/tensorflow/core/lib/core/threadpool.h
index 3da7dcb63285092d11f69801766994fe40483592..90c9f294472f1475c99494bc276ce475d5cded81 100644
--- a/tensorflow/core/lib/core/threadpool.h
+++ b/tensorflow/core/lib/core/threadpool.h
@@ -22,6 +22,9 @@ limitations under the License.
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/types.h"
 
+namespace Eigen {
+class Allocator;
+}  // namespace Eigen
 namespace tensorflow {
 namespace thread {
 
@@ -37,7 +40,8 @@ class ThreadPool {
   //
   // REQUIRES: num_threads > 0
   ThreadPool(Env* env, const ThreadOptions& thread_options, const string& name,
-             int num_threads, bool low_latency_hint);
+             int num_threads, bool low_latency_hint,
+             Eigen::Allocator* allocator = nullptr);
 
   // Constructs a pool for low-latency ops that contains "num_threads" threads
   // with specified "name". env->StartThread() is used to create individual
diff --git a/tensorflow/core/lib/png/png_io.cc b/tensorflow/core/lib/png/png_io.cc
index bc52180265c70ce2953e6818c1ca414f86feee6f..e8dbcb97b94475f91345676bade0a9d220560741 100644
--- a/tensorflow/core/lib/png/png_io.cc
+++ b/tensorflow/core/lib/png/png_io.cc
@@ -92,7 +92,11 @@ void StringReader(png_structp png_ptr, png_bytep data, png_size_t length) {
   DecodeContext* const ctx =
       absl::bit_cast<DecodeContext*>(png_get_io_ptr(png_ptr));
   if (static_cast<png_size_t>(ctx->data_left) < length) {
-    memset(data, 0, length);
+    // Don't zero out the data buffer as it has been lazily allocated (copy on
+    // write) and zeroing it out here can produce an OOM. Since the buffer is
+    // only used for reading data from the image, this doesn't result in any
+    // data leak, so it is safe to just leave the buffer be as it is and just
+    // exit with error.
     png_error(png_ptr, "More bytes requested to read than available");
   } else {
     memcpy(data, ctx->data, length);
diff --git a/tensorflow/core/nccl/BUILD b/tensorflow/core/nccl/BUILD
index 50d9a2e8daa8ae8abf0c61fb1a74dd8ad72d949f..4be33b2a0cf10a2525f9a93b5d4942b381d92629 100644
--- a/tensorflow/core/nccl/BUILD
+++ b/tensorflow/core/nccl/BUILD
@@ -11,6 +11,10 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "tf_cuda_cc_test")
 load("//tensorflow:tensorflow.bzl", "tf_copts")
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda")
+load(
+    "//tensorflow/core:platform/default/build_config_root.bzl",
+    "tf_cuda_tests_tags",
+)
 
 cc_library(
     name = "nccl_lib",
@@ -34,27 +38,17 @@ cc_library(
 tf_cuda_cc_test(
     name = "nccl_manager_test",
     size = "medium",
-    srcs = if_cuda(
-        [
-            "nccl_manager_test.cc",
-        ],
-        [],
-    ),
-    # Disabled on jenkins until errors finding nvmlShutdown are found.
-    tags = [
-        "manual",
-        "multi_gpu",
-        "no_oss",
-        "noguitar",
-        "notap",
+    srcs = ["nccl_manager_test.cc"],
+    tags = tf_cuda_tests_tags() + [
+        "no_cuda_on_cpu_tap",  # TODO(b/120284216): re-enable multi_gpu
     ],
-    deps =
-        if_cuda([
-            ":nccl_lib",
-            "@local_config_nccl//:nccl",
-            "//tensorflow/core:cuda",
-            "//tensorflow/core:test",
-            "//tensorflow/core:test_main",
-            "//tensorflow/core:testlib",
-        ]),
+    deps = [
+        "//tensorflow/core:test",
+        "//tensorflow/core:test_main",
+        "//tensorflow/core:testlib",
+    ] + if_cuda([
+        ":nccl_lib",
+        "@local_config_nccl//:nccl",
+        "//tensorflow/core:cuda",
+    ]),
 )
diff --git a/tensorflow/core/nccl/nccl_manager.cc b/tensorflow/core/nccl/nccl_manager.cc
index f8e8c752227a414f6fbe2739314a2efd6d9e0063..df49bf1b976726b3c1cbc3917c881dbc380f2f9a 100644
--- a/tensorflow/core/nccl/nccl_manager.cc
+++ b/tensorflow/core/nccl/nccl_manager.cc
@@ -24,6 +24,22 @@ limitations under the License.
 
 namespace tensorflow {
 
+#define NCCL_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    ncclResult_t nccl_status = (__VA_ARGS__);                   \
+    if (nccl_status != ncclSuccess) {                           \
+      return errors::Internal(ncclGetErrorString(nccl_status)); \
+    }                                                           \
+  } while (0)
+
+#define CUDA_RETURN_IF_ERROR(...)                               \
+  do {                                                          \
+    cudaError_t cuda_status = (__VA_ARGS__);                    \
+    if (cuda_status != cudaSuccess) {                           \
+      return errors::Internal(cudaGetErrorString(cuda_status)); \
+    }                                                           \
+  } while (0)
+
 using se::cuda::ScopedActivateExecutorContext;
 
 // Contains data for a single stream used for nccl communication; this includes
@@ -177,8 +193,8 @@ NcclManager* NcclManager::instance() {
   return instance;
 }
 
-NcclManager::Communicator* NcclManager::GetCommunicator(
-    NcclManager::Collective* collective) {
+Status NcclManager::GetCommunicator(NcclManager::Collective* collective,
+                                    NcclManager::Communicator** communicator) {
   // Sort by executor to make ordering of executors deterministic.
   std::sort(collective->participants.begin(), collective->participants.end(),
             [](const std::unique_ptr<Participant>& a,
@@ -217,7 +233,10 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
           break;
         }
       }
-      if (i == num_devices) return comm.get();
+      if (i == num_devices) {
+        *communicator = comm.get();
+        return Status::OK();
+      }
     }
   }
 
@@ -264,37 +283,36 @@ NcclManager::Communicator* NcclManager::GetCommunicator(
   // NCCL2 prevents InitAll for more communicators than devices (but doesn't
   // check that device ids are unique). Work around it by initializing each
   // rank individually.
-  cudaGetDeviceCount(&device_count);
+  CUDA_RETURN_IF_ERROR(cudaGetDeviceCount(&device_count));
 #endif
   std::vector<ncclComm_t> nccl_comms(num_devices);
   if (num_devices <= device_count) {
-    auto result =
-        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data());
-    CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+    NCCL_RETURN_IF_ERROR(
+        ncclCommInitAll(nccl_comms.data(), num_devices, devices.data()));
   } else {
     int savedDevice = 0;
-    CHECK_EQ(cudaGetDevice(&savedDevice), cudaSuccess);
+    CUDA_RETURN_IF_ERROR(cudaGetDevice(&savedDevice));
     ncclUniqueId commId;
-    ncclGetUniqueId(&commId);
+    NCCL_RETURN_IF_ERROR(ncclGetUniqueId(&commId));
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupStart(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupStart());
 #endif
     for (int rank = 0; rank < num_devices; ++rank) {
-      cudaSetDevice(devices[rank]);
-      auto result =
-          ncclCommInitRank(nccl_comms.data() + rank, num_devices, commId, rank);
-      CHECK_EQ(result, ncclSuccess) << ncclGetErrorString(result);
+      CUDA_RETURN_IF_ERROR(cudaSetDevice(devices[rank]));
+      NCCL_RETURN_IF_ERROR(ncclCommInitRank(nccl_comms.data() + rank,
+                                            num_devices, commId, rank));
     }
 #if NCCL_MAJOR >= 2
-    CHECK_EQ(ncclGroupEnd(), ncclSuccess);
+    NCCL_RETURN_IF_ERROR(ncclGroupEnd());
 #endif
-    cudaSetDevice(savedDevice);
+    CUDA_RETURN_IF_ERROR(cudaSetDevice(savedDevice));
   }
   for (int rank = 0; rank < num_devices; ++rank) {
     members[rank].nccl_comm = nccl_comms[rank];
   }
   communicators_.emplace_back(new Communicator(std::move(members)));
-  return communicators_.back().get();
+  *communicator = communicators_.back().get();
+  return Status::OK();
 }
 
 void NcclManager::AddToAllReduce(int num_devices, const string& key,
@@ -400,10 +418,18 @@ void NcclManager::AddParticipant(int num_devices, const string& key,
 void NcclManager::RunCollective(const string& key, Collective* collective) {
   static mutex collective_mu(LINKER_INITIALIZED);
 
-  auto* communicator = GetCommunicator(collective);
-  collective->communicator = communicator;
-  const int size = communicator->num_devices;
+  Communicator* communicator = nullptr;
+  const int size = static_cast<int>(collective->participants.size());
+  Status s = GetCommunicator(collective, &communicator);
+  if (!s.ok()) {
+    for (int i = 0; i < size; ++i) {
+      collective->participants[i]->done_callback(s);
+    }
+    delete collective;
+    return;
+  }
 
+  collective->communicator = communicator;
   for (int rank = 0; rank < size; ++rank) {
     Participant* p = collective->participants[rank].get();
     NcclStream* nccl_stream = communicator->members[rank].nccl_stream;
diff --git a/tensorflow/core/nccl/nccl_manager.h b/tensorflow/core/nccl/nccl_manager.h
index 76b49101d47559d47783d91aaec56fa604fc26b9..5da4fe5554d134f79c279542666c841a4e205485 100644
--- a/tensorflow/core/nccl/nccl_manager.h
+++ b/tensorflow/core/nccl/nccl_manager.h
@@ -103,7 +103,13 @@ class NcclManager {
   struct NcclStream;
   struct Participant;
 
-  Communicator* GetCommunicator(Collective* collective);
+  // Gets the `Communicator` object that will be used to enqueue NCCL kernels
+  // for `collective`, and returns it via `communicator`.
+  //
+  // This may involve creating CUDA streams and NCCL initialization.  If a NCCL
+  // or CUDA error occurs in the process, this returns an INTERNAL error with
+  // the corresponding NCCL/CUDA error string.
+  Status GetCommunicator(Collective* collective, Communicator** communicator);
 
   void AddParticipant(int num_devices, const string& key,
                       std::unique_ptr<Participant> participant,
diff --git a/tensorflow/core/nccl/nccl_manager_test.cc b/tensorflow/core/nccl/nccl_manager_test.cc
index dbc07865f0b7a96b941d21131b689a7be32c445e..f9ed4d0b9a26c390bc5974f206faea16c8b5b974 100644
--- a/tensorflow/core/nccl/nccl_manager_test.cc
+++ b/tensorflow/core/nccl/nccl_manager_test.cc
@@ -28,8 +28,8 @@ limitations under the License.
 
 namespace tensorflow {
 
-static std::vector<BaseGPUDevice*> GetGPUDevices() {
-  std::vector<Device*> devices;
+static std::vector<std::unique_ptr<BaseGPUDevice>> GetGPUDevices() {
+  std::vector<std::unique_ptr<Device>> devices;
   SessionOptions session_options;
   session_options.config.mutable_gpu_options()
       ->set_per_process_gpu_memory_fraction(0.1);
@@ -37,12 +37,12 @@ static std::vector<BaseGPUDevice*> GetGPUDevices() {
   Status s = DeviceFactory::GetFactory(DEVICE_GPU)
                  ->AddDevices(session_options, "", &devices);
   TF_CHECK_OK(s);
-  std::vector<BaseGPUDevice*> gpus;
-  for (Device* d : devices) {
-    if (d->device_type() == "GPU") {
-      gpus.push_back(static_cast<BaseGPUDevice*>(d));
-    } else {
-      delete d;
+  std::vector<std::unique_ptr<BaseGPUDevice>> gpus;
+  for (std::unique_ptr<Device>& device : devices) {
+    if (device->device_type() == "GPU") {
+      // If `device_type()` is GPU, this `Device` is guaranteed to be a
+      // `BaseGPUDevice`, which is a subclass of `Device`.
+      gpus.emplace_back(static_cast<BaseGPUDevice*>(device.release()));
     }
   }
   return gpus;
@@ -64,16 +64,15 @@ class NcclManagerTest : public ::testing::Test {
   };
 
   static void SetUpTestCase() {
-    setenv("NCCL_DEBUG", "INFO", 1 /* replace */);
-    devices_ = new std::vector<BaseGPUDevice*>(GetGPUDevices());
-    CHECK(!devices_->empty());
+    setenv("NCCL_DEBUG", "WARN", 1 /* replace */);
+    setenv("NCCL_LAUNCH_MODE", "PARALLEL", 1 /* replace */);
+    devices_ = new std::vector<std::unique_ptr<BaseGPUDevice>>(GetGPUDevices());
     LOG(ERROR) << "Running test with " << devices_->size() << " gpus";
   }
 
-  static void TearDownTestCase() {
-    for (auto device : *devices_) delete device;
-    delete devices_;
-  }
+  static int32 NumGPUs() { return static_cast<int32>(devices_->size()); }
+
+  static void TearDownTestCase() { delete devices_; }
 
   TestCase* MakeTestCase(int num_ranks, ncclRedOp_t reduction_op,
                          TensorShape shape, float value_offset) {
@@ -153,7 +152,7 @@ class NcclManagerTest : public ::testing::Test {
       stream->ThenMemcpy(out_cpu.flat<Scalar>().data(), out_gpu_mem,
                          out_cpu.TotalBytes());
       SE_ASSERT_OK(stream->BlockHostUntilDone());
-      test::ExpectTensorNear<Scalar>(test_case->expected, out_cpu, 0.01);
+      test::ExpectClose(test_case->expected, out_cpu);
     }
   }
 
@@ -166,7 +165,7 @@ class NcclManagerTest : public ::testing::Test {
   }
 
   static BaseGPUDevice* GetDevice(size_t rank) {
-    return devices_->at(rank % devices_->size());
+    return devices_->at(rank % devices_->size()).get();
   }
 
  private:
@@ -181,13 +180,14 @@ class NcclManagerTest : public ::testing::Test {
   }
 
  private:
-  static std::vector<BaseGPUDevice*>* devices_;
+  static std::vector<std::unique_ptr<BaseGPUDevice>>* devices_;
   static const DataType data_type_;
   static const Scalar max_;
 };
 
 template <typename Scalar>
-std::vector<BaseGPUDevice*>* NcclManagerTest<Scalar>::devices_ = nullptr;
+std::vector<std::unique_ptr<BaseGPUDevice>>* NcclManagerTest<Scalar>::devices_ =
+    nullptr;
 template <typename Scalar>
 const DataType NcclManagerTest<Scalar>::data_type_ =
     DataTypeToEnum<Scalar>::value;
@@ -195,13 +195,13 @@ template <typename Scalar>
 const Scalar NcclManagerTest<Scalar>::max_ =
     Eigen::NumTraits<Scalar>::highest();
 
-// Instantiate tests for float and half.
-using TypeList = ::testing::Types<float, Eigen::half>;
+// Instantiate tests for float and double.
+using TypeList = ::testing::Types<float, double>;
 TYPED_TEST_CASE(NcclManagerTest, TypeList);
 
 // Test basic sum reduction.
 TYPED_TEST(NcclManagerTest, BasicSumReduction) {
-  const int num_ranks = 3;
+  const int num_ranks = 4;
 
   for (int op = 0; op < 4; ++op) {
     ncclRedOp_t reduction_op = static_cast<ncclRedOp_t>(op);
@@ -209,6 +209,7 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
         this->MakeTestCase(num_ranks, reduction_op, TensorShape({2, 3}), 0.0f));
     for (int rank = 0; rank < num_ranks; ++rank) {
       auto* device = this->GetDevice(rank);
+      VLOG(2) << "rank " << rank << " device " << device->name();
       auto* event_mgr = device->tensorflow_gpu_device_info()->event_mgr;
       auto* stream = device->tensorflow_gpu_device_info()->stream;
       NcclManager::instance()->AddToAllReduce(
@@ -225,15 +226,13 @@ TYPED_TEST(NcclManagerTest, BasicSumReduction) {
 // Same as the Basic test, but with multiple threads launching parts of many
 // reductions.
 //
-// Testing the multi-rank execution is currently reduced as it can hang when run
-// with num_ranks > devices->size(), for some GPUs (e.g. K20m).
-// To test the higher settings, increase num_ranks,
-// num_collectives_per_iteration and time_limit_micros.
+// To run test longer, increase num_ranks, num_collectives_per_iteration and
+// time_limit_micros.
 TYPED_TEST(NcclManagerTest, MultipleCallers) {
-  const int num_ranks = 1;                      // 2;
-  const int num_collectives_per_iteration = 1;  // 1000;
-  const int num_threads = 3;
-  const int time_limit_micros = 1;  // 60 * 30 * 1000 * 1000;
+  const int num_ranks = 4;
+  const int num_collectives_per_iteration = 10;  // 1000;
+  const int num_threads = num_ranks * 2;
+  const int time_limit_micros = 100;  // 60 * 30 * 1000 * 1000;
 
   int64 start = Env::Default()->NowMicros();
   srand(Env::Default()->NowMicros());
diff --git a/tensorflow/core/ops/array_ops.cc b/tensorflow/core/ops/array_ops.cc
index f55562ec99d91ef17c4a74d4ecaa7467e6a12e1f..281e2996ed7c2b07881d5ab564fc31463f8f8607 100644
--- a/tensorflow/core/ops/array_ops.cc
+++ b/tensorflow/core/ops/array_ops.cc
@@ -2743,6 +2743,9 @@ REGISTER_OP("QuantizeAndDequantizeV2")
     .Attr("range_given: bool = false")
     .Output("output: T")
     .Attr("T: {bfloat16, half, float, double}")
+    .Attr(
+        "round_mode: {'HALF_TO_EVEN', 'HALF_UP'} = "
+        "'HALF_TO_EVEN'")
     .SetShapeFn([](InferenceContext* c) {
       ShapeHandle unused;
       TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
@@ -2878,14 +2881,9 @@ REGISTER_OP("QuantizedInstanceNorm")
 
 namespace {
 
-Status ScatterNdShape(InferenceContext* c) {
-  ShapeHandle indices_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &indices_shape));
-  ShapeHandle updates_shape;
-  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &updates_shape));
-  ShapeHandle output_shape;
-  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &output_shape));
-
+Status ScatterNdShapeHelper(InferenceContext* c, ShapeHandle indices_shape,
+                            ShapeHandle updates_shape,
+                            ShapeHandle output_shape) {
   if (c->Value(c->NumElements(output_shape)) == 0 &&
       (c->Value(c->NumElements(indices_shape)) > 0 ||
        c->Value(c->NumElements(updates_shape)) > 0)) {
@@ -2940,6 +2938,26 @@ Status ScatterNdShape(InferenceContext* c) {
   return Status::OK();
 }
 
+Status ScatterNdShape(InferenceContext* c) {
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &indices_shape));
+  ShapeHandle updates_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &updates_shape));
+  ShapeHandle output_shape;
+  TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &output_shape));
+  return ScatterNdShapeHelper(c, indices_shape, updates_shape, output_shape);
+}
+
+Status ScatterNdTensorShape(InferenceContext* c) {
+  ShapeHandle output_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 1, &output_shape));
+  ShapeHandle indices_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(1), 1, &indices_shape));
+  ShapeHandle updates_shape;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(2), 1, &updates_shape));
+  return ScatterNdShapeHelper(c, indices_shape, updates_shape, output_shape);
+}
+
 }  // namespace
 
 REGISTER_OP("UpperBound")
@@ -2979,6 +2997,33 @@ REGISTER_OP("ScatterNd")
     .Attr("Tindices: {int32, int64}")
     .SetShapeFn(ScatterNdShape);
 
+REGISTER_OP("TensorScatterUpdate")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
+REGISTER_OP("TensorScatterAdd")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
+REGISTER_OP("TensorScatterSub")
+    .Input("tensor: T")
+    .Input("indices: Tindices")
+    .Input("updates: T")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("Tindices: {int32, int64}")
+    .SetShapeFn(ScatterNdTensorShape);
+
 REGISTER_OP("ScatterNdNonAliasingAdd")
     .Input("input: T")
     .Input("indices: Tindices")
diff --git a/tensorflow/core/ops/compat/ops_history.v1.pbtxt b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
index bfcc92dcb0f12b93b5fdf264f185e071cbd9acd9..1492741e8b3ef4aac19effb9656cf07ecffe7ff3 100644
--- a/tensorflow/core/ops/compat/ops_history.v1.pbtxt
+++ b/tensorflow/core/ops/compat/ops_history.v1.pbtxt
@@ -12076,33 +12076,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -17327,22 +17300,41 @@ op {
   }
 }
 op {
-  name: "DatasetToTFRecord"
+  name: "DebugGradientIdentity"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input"
+    type_attr: "T"
   }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
+  output_arg {
+    name: "output"
+    type_attr: "T"
   }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugGradientRefIdentity"
   input_arg {
-    name: "compression_type"
-    type: DT_STRING
+    name: "input"
+    type_attr: "T"
+    is_ref: true
   }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+    is_ref: true
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  allows_uninitialized_input: true
 }
 op {
-  name: "DebugGradientIdentity"
+  name: "DebugIdentity"
   input_arg {
     name: "input"
     type_attr: "T"
@@ -17355,24 +17347,59 @@ op {
     name: "T"
     type: "type"
   }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
-  name: "DebugGradientRefIdentity"
+  name: "DebugIdentity"
   input_arg {
     name: "input"
     type_attr: "T"
-    is_ref: true
   }
   output_arg {
     name: "output"
     type_attr: "T"
-    is_ref: true
   }
   attr {
     name: "T"
     type: "type"
   }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   allows_uninitialized_input: true
 }
 op {
@@ -17384,234 +17411,165 @@ op {
   output_arg {
     name: "output"
     type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugIdentity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugIdentity"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNanCount"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNanCount"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNanCount"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-  }
-  attr {
-    name: "device_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "tensor_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "debug_urls"
-    type: "list(string)"
-    default_value {
-      list {
-      }
-    }
-  }
-  attr {
-    name: "gated_grpc"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  allows_uninitialized_input: true
-}
-op {
-  name: "DebugNumericSummary"
-  input_arg {
-    name: "input"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type: DT_DOUBLE
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNanCount"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_INT64
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "device_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "tensor_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "debug_urls"
+    type: "list(string)"
+    default_value {
+      list {
+      }
+    }
+  }
+  attr {
+    name: "gated_grpc"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  allows_uninitialized_input: true
+}
+op {
+  name: "DebugNumericSummary"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type: DT_DOUBLE
   }
   attr {
     name: "T"
@@ -18487,69 +18445,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -21153,24 +21048,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -21626,6 +21503,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -21691,6 +21595,95 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
@@ -21726,50 +21719,129 @@ op {
   }
 }
 op {
-  name: "ExperimentalFunctionBufferingResource"
+  name: "ExperimentalGroupByReducerDataset"
   input_arg {
-    name: "string_arg"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "target_device"
-    type: DT_STRING
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
   }
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "key_func"
+    type: "func"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "init_func"
+    type: "func"
   }
   attr {
-    name: "f"
+    name: "reduce_func"
     type: "func"
   }
   attr {
-    name: "buffer_size"
-    type: "int"
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceGetNext"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
@@ -21777,15 +21849,75 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
   is_stateful: true
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceReset"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "ExperimentalIdentityIndexedDataset"
@@ -21898,6 +22030,241 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalMaterializedIndexDatasetHandle"
   output_arg {
@@ -21926,6 +22293,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalNonSerializableDataset"
   input_arg {
@@ -21997,6 +22391,454 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalNumaMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalPrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalSleepDataset"
   input_arg {
@@ -22024,6 +22866,107 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -22089,6 +23032,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -26129,56 +27095,21 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_UINT16
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-}
-op {
-  name: "Greater"
-  input_arg {
-    name: "x"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "y"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "z"
-    type: DT_BOOL
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "GreaterEqual"
+  name: "Greater"
   input_arg {
     name: "x"
     type_attr: "T"
@@ -26199,12 +27130,15 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
-        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -26237,8 +27171,6 @@ op {
         type: DT_INT8
         type: DT_UINT16
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -26273,7 +27205,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -26300,218 +27231,52 @@ op {
         type: DT_FLOAT
         type: DT_DOUBLE
         type: DT_INT32
+        type: DT_INT64
         type: DT_UINT8
         type: DT_INT16
         type: DT_INT8
-        type: DT_INT64
-        type: DT_BFLOAT16
         type: DT_UINT16
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
 }
 op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
+  name: "GreaterEqual"
   input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
+    name: "x"
+    type_attr: "T"
   }
   input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
+    name: "y"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "z"
+    type: DT_BOOL
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_INT64
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
   }
 }
 op {
@@ -29151,33 +29916,6 @@ op {
     }
   }
 }
-op {
-  name: "LatencyStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "LeakyRelu"
   input_arg {
@@ -30623,6 +31361,46 @@ op {
     }
   }
 }
+op {
+  name: "Lu"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "lu"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
+  }
+  attr {
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "MakeIterator"
   input_arg {
@@ -30636,55 +31414,45 @@ op {
   is_stateful: true
 }
 op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
+  name: "MapClear"
   attr {
-    name: "f"
-    type: "func"
+    name: "capacity"
+    type: "int"
+    default_value {
+      i: 0
+    }
+    has_minimum: true
   }
   attr {
-    name: "Targuments"
-    type: "list(type)"
+    name: "memory_limit"
+    type: "int"
+    default_value {
+      i: 0
+    }
     has_minimum: true
   }
   attr {
-    name: "output_types"
+    name: "dtypes"
     type: "list(type)"
-    has_minimum: true
-    minimum: 1
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
   }
+  is_stateful: true
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -30693,18 +31461,6 @@ op {
     name: "other_arguments"
     type_list_attr: "Targuments"
   }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
   output_arg {
     name: "handle"
     type: DT_VARIANT
@@ -30730,43 +31486,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-}
-op {
-  name: "MapClear"
-  attr {
-    name: "capacity"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "memory_limit"
-    type: "int"
-    default_value {
-      i: 0
-    }
-    has_minimum: true
-  }
-  attr {
-    name: "dtypes"
-    type: "list(type)"
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
   is_stateful: true
 }
 op {
@@ -30804,7 +31523,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "MapDataset"
@@ -30841,6 +31559,13 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "MapDataset"
@@ -30884,6 +31609,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "MapDefun"
@@ -31447,18 +32179,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "MatchingFilesDataset"
-  input_arg {
-    name: "patterns"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
 op {
   name: "MatrixBandPart"
   input_arg {
@@ -39012,7 +39732,7 @@ op {
   }
 }
 op {
-  name: "ParallelInterleaveDataset"
+  name: "ParallelInterleaveDatasetV2"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -39030,15 +39750,7 @@ op {
     type: DT_INT64
   }
   input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
+    name: "num_parallel_calls"
     type: DT_INT64
   }
   output_arg {
@@ -39114,9 +39826,16 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
-  name: "ParallelInterleaveDatasetV2"
+  name: "ParallelMapDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
@@ -39126,16 +39845,49 @@ op {
     type_list_attr: "Targuments"
   }
   input_arg {
-    name: "cycle_length"
-    type: DT_INT64
+    name: "num_parallel_calls"
+    type: DT_INT32
   }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ParallelMapDataset"
   input_arg {
-    name: "block_length"
-    type: DT_INT64
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
   }
   input_arg {
     name: "num_parallel_calls"
-    type: DT_INT64
+    type: DT_INT32
   }
   output_arg {
     name: "handle"
@@ -39162,13 +39914,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
 }
 op {
   name: "ParallelMapDataset"
@@ -39209,7 +39954,13 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -39250,6 +40001,20 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParallelMapDataset"
@@ -39297,55 +40062,15 @@ op {
       b: true
     }
   }
-}
-op {
-  name: "ParallelMapDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT32
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "use_inter_op_parallelism"
+    name: "sloppy"
     type: "bool"
     default_value {
-      b: true
+      b: false
     }
   }
   attr {
-    name: "sloppy"
+    name: "preserve_cardinality"
     type: "bool"
     default_value {
       b: false
@@ -39561,153 +40286,6 @@ op {
     has_minimum: true
   }
 }
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
 op {
   name: "ParseSequenceExample"
   input_arg {
@@ -40276,6 +40854,52 @@ op {
     }
   }
 }
+op {
+  name: "PartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+}
 op {
   name: "Placeholder"
   output_arg {
@@ -40562,60 +41186,24 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
-}
-op {
-  name: "PrefetchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "buffer_size"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
+  is_stateful: true
 }
 op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
+  name: "PrefetchDataset"
   input_arg {
     name: "input_dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
+    name: "buffer_size"
     type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
   }
   output_arg {
     name: "handle"
     type: DT_VARIANT
   }
   attr {
-    name: "Toutput_types"
+    name: "output_types"
     type: "list(type)"
     has_minimum: true
     minimum: 1
@@ -40626,12 +41214,6 @@ op {
     has_minimum: true
     minimum: 1
   }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
 }
 op {
   name: "PreventGradient"
@@ -41643,6 +42225,71 @@ op {
     }
   }
 }
+op {
+  name: "QuantizeAndDequantizeV2"
+  input_arg {
+    name: "input"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_min"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "input_max"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "signed_input"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "num_bits"
+    type: "int"
+    default_value {
+      i: 8
+    }
+  }
+  attr {
+    name: "range_given"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_BFLOAT16
+        type: DT_HALF
+        type: DT_FLOAT
+        type: DT_DOUBLE
+      }
+    }
+  }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
+}
 op {
   name: "QuantizeAndDequantizeV3"
   input_arg {
@@ -44844,34 +45491,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -49256,6 +49875,86 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyAdamWithAmsgrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "m"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "v"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "beta1_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2_power"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "beta2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyAddSign"
   input_arg {
@@ -49904,21 +50603,89 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyFtrl"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "linear"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "lr_power"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -49932,7 +50699,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrl"
+  name: "ResourceApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -49961,6 +50728,10 @@ op {
     name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "l2_shrinkage"
+    type_attr: "T"
+  }
   input_arg {
     name: "lr_power"
     type_attr: "T"
@@ -49972,21 +50743,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -50056,6 +50824,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -50127,6 +50897,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -50184,21 +50955,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -50212,41 +50983,17 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyFtrlV2"
+  name: "ResourceApplyGradientDescent"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
+    name: "alpha"
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
+    name: "delta"
     type_attr: "T"
   }
   attr {
@@ -50256,21 +51003,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -50316,6 +51060,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -50363,6 +51109,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -50396,21 +51143,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -50424,17 +51171,25 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyGradientDescent"
+  name: "ResourceApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "alpha"
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "delta"
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -50469,6 +51224,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -52524,17 +53286,173 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -52561,7 +53479,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -52570,20 +53488,71 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -52615,7 +53584,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -52639,7 +53607,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdadelta"
+  name: "ResourceSparseApplyAdagrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -52648,20 +53616,74 @@ op {
     name: "accum"
     type: DT_RESOURCE
   }
-  input_arg {
-    name: "accum_update"
-    type: DT_RESOURCE
-  }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT64
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_UINT16
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+        type: DT_BFLOAT16
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceSparseApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
@@ -52745,18 +53767,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -52777,21 +53802,28 @@ op {
       b: false
     }
   }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -52801,6 +53833,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -52820,8 +53868,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -52845,18 +53891,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -52866,6 +53912,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -52887,7 +53949,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -52911,18 +53972,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -52932,6 +53993,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -52939,21 +54016,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -52977,18 +54054,18 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagrad"
+  name: "ResourceSparseApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
+    name: "gradient_squared_accumulator"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "grad"
@@ -52998,6 +54075,22 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l1"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "l2"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -53040,107 +54133,41 @@ op {
       b: false
     }
   }
-  attr {
-    name: "update_slots"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "mom"
+    type: DT_RESOURCE
   }
   input_arg {
     name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "rho"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyAdagradDA"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
+    name: "epsilon"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
@@ -53150,22 +54177,6 @@ op {
     name: "indices"
     type_attr: "Tindices"
   }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "global_step"
-    type: DT_INT64
-  }
   attr {
     name: "T"
     type: "type"
@@ -53185,8 +54196,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -53210,42 +54219,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -53268,7 +54281,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53292,42 +54304,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyAdagradDA"
+  name: "ResourceSparseApplyCenteredRMSProp"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
+    name: "mg"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_squared_accumulator"
+    name: "ms"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "grad"
+    name: "mom"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "rho"
+    type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "momentum"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "indices"
+    type_attr: "Tindices"
   }
   attr {
     name: "T"
@@ -53336,21 +54352,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -53422,18 +54438,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -53457,46 +54476,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53517,8 +54532,6 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -53542,46 +54555,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53604,7 +54613,6 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -53628,46 +54636,42 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyCenteredRMSProp"
+  name: "ResourceSparseApplyFtrl"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mg"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "ms"
+    name: "accum"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "mom"
+    name: "linear"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
-    type_attr: "T"
+    name: "indices"
+    type_attr: "Tindices"
   }
   input_arg {
-    name: "momentum"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
   input_arg {
-    name: "indices"
-    type_attr: "Tindices"
+    name: "lr_power"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -53676,21 +54680,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -53758,96 +54762,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_HALF
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
+        type: DT_BFLOAT16
         type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
         type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
@@ -53874,7 +54800,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrl"
+  name: "ResourceSparseApplyFtrlV2"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -53908,85 +54834,7 @@ op {
     type_attr: "T"
   }
   input_arg {
-    name: "lr_power"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT64
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_UINT16
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_COMPLEX128
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-        type: DT_BFLOAT16
-      }
-    }
-  }
-  attr {
-    name: "Tindices"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_INT32
-        type: DT_INT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceSparseApplyFtrl"
-  input_arg {
-    name: "var"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "accum"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "linear"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "indices"
-    type_attr: "Tindices"
-  }
-  input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
+    name: "l2_shrinkage"
     type_attr: "T"
   }
   input_arg {
@@ -54000,21 +54848,18 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
+        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
+        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_INT64
+        type: DT_COMPLEX128
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
         type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
       }
     }
   }
@@ -54098,6 +54943,8 @@ op {
         type: DT_QUINT8
         type: DT_QINT32
         type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
       }
     }
   }
@@ -54183,6 +55030,7 @@ op {
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
+        type: DT_BFLOAT16
       }
     }
   }
@@ -54254,21 +55102,21 @@ op {
       list {
         type: DT_FLOAT
         type: DT_DOUBLE
-        type: DT_INT64
         type: DT_INT32
         type: DT_UINT8
-        type: DT_UINT16
         type: DT_INT16
         type: DT_INT8
         type: DT_COMPLEX64
-        type: DT_COMPLEX128
+        type: DT_INT64
         type: DT_QINT8
         type: DT_QUINT8
         type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
         type: DT_HALF
         type: DT_UINT32
         type: DT_UINT64
-        type: DT_BFLOAT16
       }
     }
   }
@@ -54292,7 +55140,7 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceSparseApplyFtrlV2"
+  name: "ResourceSparseApplyKerasMomentum"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -54302,8 +55150,8 @@ op {
     type: DT_RESOURCE
   }
   input_arg {
-    name: "linear"
-    type: DT_RESOURCE
+    name: "lr"
+    type_attr: "T"
   }
   input_arg {
     name: "grad"
@@ -54314,23 +55162,7 @@ op {
     type_attr: "Tindices"
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l1"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "l2_shrinkage"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "lr_power"
+    name: "momentum"
     type_attr: "T"
   }
   attr {
@@ -54375,6 +55207,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
@@ -57111,52 +57950,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -60889,42 +61682,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -61771,41 +62528,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -70964,38 +71686,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -71827,6 +72517,53 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "StatefulPartitionedCall"
+  input_arg {
+    name: "args"
+    type_list_attr: "Tin"
+  }
+  output_arg {
+    name: "output"
+    type_list_attr: "Tout"
+  }
+  attr {
+    name: "Tin"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tout"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "config"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "executor_type"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "StatelessIf"
   input_arg {
@@ -72509,40 +73246,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -75378,6 +76081,127 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -75638,6 +76462,39 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListStack"
   input_arg {
@@ -75660,6 +76517,105 @@ op {
     }
   }
 }
+op {
+  name: "TensorScatterAdd"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterSub"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterUpdate"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorSliceDataset"
   input_arg {
@@ -76810,29 +77766,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -76874,6 +77807,104 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeEncode"
+  input_arg {
+    name: "input_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "ignore"
+        s: "replace"
+        s: "strict"
+      }
+    }
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
@@ -78031,6 +79062,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -78464,6 +79506,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/dataset_ops.cc b/tensorflow/core/ops/dataset_ops.cc
index 8402f250f9fe77319e74887c1957d71773b36b87..1c117166de029d40b84bbd2335b9315cdc53bcba 100644
--- a/tensorflow/core/ops/dataset_ops.cc
+++ b/tensorflow/core/ops/dataset_ops.cc
@@ -83,13 +83,6 @@ REGISTER_OP("GeneratorDataset")
                       // stateful to inhibit constant folding.
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("UnbatchDataset")
-    .Input("input_dataset: variant")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ZipDataset")
     .Input("input_datasets: N * variant")
     .Output("handle: variant")
@@ -142,57 +135,6 @@ REGISTER_OP("SkipDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("BytesProducedStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("LatencyStatsDataset")
-    .Input("input_dataset: variant")
-    .Input("tag: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle tag_shape;
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("ParseExampleDataset")
-    .Input("input_dataset: variant")
-    .Input("num_parallel_calls: int64")
-    .Input("dense_defaults: Tdense")
-    .Output("handle: variant")
-    .Attr("sparse_keys: list(string) >= 0")
-    .Attr("dense_keys: list(string) >= 0")
-    .Attr("sparse_types: list({float,int64,string}) >= 0")
-    .Attr("Tdense: list({float,int64,string}) >= 0")
-    .Attr("dense_shapes: list(shape) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
-                                              // sorted by key (dense_keys and
-                                              // sparse_keys combined) here.
-    .Attr("sloppy: bool = false")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("SetStatsAggregatorDataset")
-    .Input("input_dataset: variant")
-    .Input("stats_aggregator: resource")
-    .Input("tag: string")
-    .Input("counter_prefix: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("MapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -202,6 +144,7 @@ REGISTER_OP("MapDataset")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ParallelMapDataset")
@@ -215,60 +158,9 @@ REGISTER_OP("ParallelMapDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .Attr("use_inter_op_parallelism: bool = true")
     .Attr("sloppy: bool = false")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("MapAndBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_batches: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_batches, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("MapAndBatchDatasetV2")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("batch_size: int64")
-    .Input("num_parallel_calls: int64")
-    .Input("drop_remainder: bool")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      // Use index from the end to retrieve the Input shapes,
-      // so that to avoid guessing the length of "other_arguments".
-      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
-      shape_inference::ShapeHandle unused;
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
-      TF_RETURN_IF_ERROR(
-          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
-
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("PrefetchDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -282,18 +174,6 @@ REGISTER_OP("PrefetchDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("ScanDataset")
-    .Input("input_dataset: variant")
-    .Input("initial_state: Tstate")
-    .Input("other_arguments: Targuments")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Tstate: list(type) >= 1")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("FlatMapDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -316,21 +196,6 @@ REGISTER_OP("InterleaveDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ParallelInterleaveDataset")
-    .Input("input_dataset: variant")
-    .Input("other_arguments: Targuments")
-    .Input("cycle_length: int64")
-    .Input("block_length: int64")
-    .Input("sloppy: bool")
-    .Input("buffer_output_elements: int64")
-    .Input("prefetch_input_elements: int64")
-    .Output("handle: variant")
-    .Attr("f: func")
-    .Attr("Targuments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("ParallelInterleaveDatasetV2")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -345,43 +210,6 @@ REGISTER_OP("ParallelInterleaveDatasetV2")
     .Attr("sloppy: bool = false")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("GroupByReducerDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("init_func_other_arguments: Tinit_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("init_func: func")
-    .Attr("reduce_func: func")
-    .Attr("finalize_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Tinit_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("GroupByWindowDataset")
-    .Input("input_dataset: variant")
-    .Input("key_func_other_arguments: Tkey_func_other_arguments")
-    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
-    .Input(
-        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
-    .Output("handle: variant")
-    .Attr("key_func: func")
-    .Attr("reduce_func: func")
-    .Attr("window_size_func: func")
-    .Attr("Tkey_func_other_arguments: list(type) >= 0")
-    .Attr("Treduce_func_other_arguments: list(type) >= 0")
-    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
-
 REGISTER_OP("FilterDataset")
     .Input("input_dataset: variant")
     .Input("other_arguments: Targuments")
@@ -447,23 +275,6 @@ REGISTER_OP("BatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("SlideDataset")
-    .Input("input_dataset: variant")
-    .Input("window_size: int64")
-    .Input("window_shift: int64")
-    .Input("window_stride: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // window_size, window_shift, and window_stride should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 // TODO(mrry): Validate that `padded_shapes` are all vectors, the lengths of
 // `output_types` and `output_shapes` are `N` the `output_shapes` are (as far as
 // possible to tell statically) compatible with `padded_shapes`, and that
@@ -504,22 +315,6 @@ REGISTER_OP("PaddedBatchDatasetV2")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("DenseToSparseBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("row_shape: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      // row_shape should be a 1-D vector.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("RangeDataset")
     .Input("start: int64")
     .Input("stop: int64")
@@ -538,22 +333,6 @@ REGISTER_OP("RangeDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("RandomDataset")
-    .Input("seed: int64")
-    .Input("seed2: int64")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // buffer_size, seed, and seed2 should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("ShuffleDataset")
     .Input("input_dataset: variant")
     .Input("buffer_size: int64")
@@ -622,36 +401,6 @@ REGISTER_OP("TextLineDataset")
       return shape_inference::ScalarShape(c);
     });
 
-REGISTER_OP("MatchingFilesDataset")
-    .Input("patterns: string")
-    .Output("handle: variant")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // `patterns` must be a scalar or a vector.
-      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("SqlDataset")
-    .Input("driver_name: string")
-    .Input("data_source_name: string")
-    .Input("query: string")
-    .Output("handle: variant")
-    .Attr("output_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
-                      // stateful to inhibit constant folding.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // driver_name, data_source_name, and query should be scalars.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
 REGISTER_OP("FixedLengthRecordDataset")
     .Input("filenames: string")
     .Input("header_bytes: int64")
@@ -838,53 +587,6 @@ REGISTER_OP("DeserializeIterator")
     .Input("serialized: variant")
     .SetShapeFn(shape_inference::NoOutputs);
 
-REGISTER_OP("StatsAggregatorHandle")
-    .Output("handle: resource")
-    .SetShapeFn(shape_inference::ScalarShape)
-    .Attr("container: string = ''")
-    .Attr("shared_name: string = ''");
-
-REGISTER_OP("StatsAggregatorSummary")
-    .Input("iterator: resource")
-    .Output("summary: string")
-    .SetShapeFn(shape_inference::ScalarShape);
-
-REGISTER_OP("PrependFromQueueAndPaddedBatchDataset")
-    .Input("input_dataset: variant")
-    .Input("batch_size: int64")
-    .Input("padded_shapes: N * int64")
-    .Input("padding_values: Toutput_types")
-    .Output("handle: variant")
-    .Attr("Toutput_types: list(type) >= 1")
-    .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    // TODO(ebrevdo): Validate that `padded_shapes` are all vectors, the lengths
-    // of `Toutput_types` and `output_shapes` are `N`, that the
-    // length of `output_types` is `N`, the `output_shapes` are
-    // (as far as possible to tell statically) compatible with `padded_shapes`,
-    // and that `padding_values` are all scalars.
-    .SetShapeFn([](shape_inference::InferenceContext* c) {
-      shape_inference::ShapeHandle unused;
-      // batch_size should be a scalar.
-      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
-      return shape_inference::ScalarShape(c);
-    });
-
-REGISTER_OP("EnqueueInQueueDataset")
-    .Input("queue: variant")
-    .Input("components: Tcomponents")
-    .Attr("Tcomponents: list(type) >= 1")
-    .SetIsStateful()  // To avoid CSE on multiple calls to Enqueue.
-    // TODO(ebrevdo): SetShapeFn to test input dtypes and shapes by
-    // reading from queue handle (is that even possible?).
-    .SetShapeFn(shape_inference::NoOutputs);
-
-REGISTER_OP("DatasetToTFRecord")
-    .Input("input_dataset: variant")
-    .Input("filename: string")
-    .Input("compression_type: string")
-    .SetShapeFn(shape_inference::NoOutputs);
-
 REGISTER_OP("DatasetToGraph")
     .Input("input_dataset: variant")
     .Output("graph: string")
@@ -985,6 +687,16 @@ REGISTER_OP("MapDefun")
       return Status::OK();
     });
 
+REGISTER_OP("WrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("UnwrapDatasetVariant")
+    .Input("input_handle: variant")
+    .Output("output_handle: variant")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("MultiDeviceIterator")
     .Output("handle: resource")
     .Attr("devices: list(string) >= 1")
diff --git a/tensorflow/core/ops/experimental_dataset_ops.cc b/tensorflow/core/ops/experimental_dataset_ops.cc
index 088d1865ddf07425801d1bb8f7da6ac9dce45bba..f904e2536dfe67facc25335dc3f86b3d45fd116f 100644
--- a/tensorflow/core/ops/experimental_dataset_ops.cc
+++ b/tensorflow/core/ops/experimental_dataset_ops.cc
@@ -17,14 +17,17 @@ limitations under the License.
 
 namespace tensorflow {
 
-REGISTER_OP("ExperimentalDirectedInterleaveDataset")
-    .Input("selector_input_dataset: variant")
-    .Input("data_input_datasets: N * variant")
+REGISTER_OP("ExperimentalBytesProducedStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
     .Output("handle: variant")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
-    .Attr("N: int >= 1")
-    .SetShapeFn(shape_inference::ScalarShape);
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
 
 REGISTER_OP("ExperimentalCSVDataset")
     .Input("filenames: string")
@@ -68,6 +71,79 @@ REGISTER_OP("ExperimentalCSVDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalDatasetCardinality")
+    .Input("input_dataset: variant")
+    .Output("cardinality: int64")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalDatasetToTFRecord")
+    .Input("input_dataset: variant")
+    .Input("filename: string")
+    .Input("compression_type: string")
+    .SetShapeFn(shape_inference::NoOutputs);
+
+REGISTER_OP("ExperimentalDenseToSparseBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("batch_size: int64")
+    .Input("row_shape: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // batch_size should be a scalar.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      // row_shape should be a 1-D vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalDirectedInterleaveDataset")
+    .Input("selector_input_dataset: variant")
+    .Input("data_input_datasets: N * variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("N: int >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByReducerDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("init_func_other_arguments: Tinit_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input("finalize_func_other_arguments: Tfinalize_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("init_func: func")
+    .Attr("reduce_func: func")
+    .Attr("finalize_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Tinit_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Tfinalize_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalGroupByWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("key_func_other_arguments: Tkey_func_other_arguments")
+    .Input("reduce_func_other_arguments: Treduce_func_other_arguments")
+    .Input(
+        "window_size_func_other_arguments: Twindow_size_func_other_arguments")
+    .Output("handle: variant")
+    .Attr("key_func: func")
+    .Attr("reduce_func: func")
+    .Attr("window_size_func: func")
+    .Attr("Tkey_func_other_arguments: list(type) >= 0")
+    .Attr("Treduce_func_other_arguments: list(type) >= 0")
+    .Attr("Twindow_size_func_other_arguments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -75,6 +151,69 @@ REGISTER_OP("ExperimentalIgnoreErrorsDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalLatencyStatsDataset")
+    .Input("input_dataset: variant")
+    .Input("tag: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle tag_shape;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &tag_shape));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalMapAndBatchDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("batch_size: int64")
+    .Input("num_parallel_calls: int64")
+    .Input("drop_remainder: bool")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      // Use index from the end to retrieve the Input shapes,
+      // so that to avoid guessing the length of "other_arguments".
+      // batch_size, num_parallel_calls, and drop_remainder are 0-D scalars.
+      shape_inference::ShapeHandle unused;
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 3), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 2), 0, &unused));
+      TF_RETURN_IF_ERROR(
+          c->WithRank(c->input(c->num_inputs() - 1), 0, &unused));
+
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalMapDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("use_inter_op_parallelism: bool = true")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalMatchingFilesDataset")
+    .Input("patterns: string")
+    .Output("handle: variant")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // `patterns` must be a scalar or a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(0), 1, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
 REGISTER_OP("ExperimentalNonSerializableDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -82,6 +221,77 @@ REGISTER_OP("ExperimentalNonSerializableDataset")
     .Attr("output_shapes: list(shape) >= 1")
     .SetShapeFn(shape_inference::ScalarShape);
 
+REGISTER_OP("ExperimentalParallelInterleaveDataset")
+    .Input("input_dataset: variant")
+    .Input("other_arguments: Targuments")
+    .Input("cycle_length: int64")
+    .Input("block_length: int64")
+    .Input("sloppy: bool")
+    .Input("buffer_output_elements: int64")
+    .Input("prefetch_input_elements: int64")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalParseExampleDataset")
+    .Input("input_dataset: variant")
+    .Input("num_parallel_calls: int64")
+    .Input("dense_defaults: Tdense")
+    .Output("handle: variant")
+    .Attr("sparse_keys: list(string) >= 0")
+    .Attr("dense_keys: list(string) >= 0")
+    .Attr("sparse_types: list({float,int64,string}) >= 0")
+    .Attr("Tdense: list({float,int64,string}) >= 0")
+    .Attr("dense_shapes: list(shape) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")  // Output components will be
+                                              // sorted by key (dense_keys and
+                                              // sparse_keys combined) here.
+    .Attr("sloppy: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalRandomDataset")
+    .Input("seed: int64")
+    .Input("seed2: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // buffer_size, seed, and seed2 should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalScanDataset")
+    .Input("input_dataset: variant")
+    .Input("initial_state: Tstate")
+    .Input("other_arguments: Targuments")
+    .Output("handle: variant")
+    .Attr("f: func")
+    .Attr("Tstate: list(type) >= 1")
+    .Attr("Targuments: list(type) >= 0")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalSetStatsAggregatorDataset")
+    .Input("input_dataset: variant")
+    .Input("stats_aggregator: resource")
+    .Input("tag: string")
+    .Input("counter_prefix: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalSleepDataset")
     .Input("input_dataset: variant")
     .Input("sleep_microseconds: int64")
@@ -96,6 +306,59 @@ REGISTER_OP("ExperimentalSleepDataset")
       return shape_inference::ScalarShape(c);
     });
 
+REGISTER_OP("ExperimentalSlidingWindowDataset")
+    .Input("input_dataset: variant")
+    .Input("window_size: int64")
+    .Input("window_shift: int64")
+    .Input("window_stride: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // window_size, window_shift, and window_stride should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(3), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalSqlDataset")
+    .Input("driver_name: string")
+    .Input("data_source_name: string")
+    .Input("query: string")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetIsStateful()  // TODO(b/65524810): Source dataset ops must be marked
+                      // stateful to inhibit constant folding.
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused;
+      // driver_name, data_source_name, and query should be scalars.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 0, &unused));
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 0, &unused));
+      return shape_inference::ScalarShape(c);
+    });
+
+REGISTER_OP("ExperimentalStatsAggregatorHandle")
+    .Output("handle: resource")
+    .SetShapeFn(shape_inference::ScalarShape)
+    .Attr("container: string = ''")
+    .Attr("shared_name: string = ''");
+
+REGISTER_OP("ExperimentalStatsAggregatorSummary")
+    .Input("iterator: resource")
+    .Output("summary: string")
+    .SetShapeFn(shape_inference::ScalarShape);
+
+REGISTER_OP("ExperimentalUnbatchDataset")
+    .Input("input_dataset: variant")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
+
 REGISTER_OP("ExperimentalUniqueDataset")
     .Input("input_dataset: variant")
     .Output("handle: variant")
@@ -108,26 +371,21 @@ REGISTER_OP("ExperimentalIteratorGetDevice")
     .Output("device: string")
     .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ExperimentalFunctionBufferingResource")
-    .Input("string_arg: string")
-    .Input("target_device: string")
-    .Output("resource: resource")
-    .Attr("shared_name: string")
-    .Attr("container: string")
-    .Attr("f: func")
-    .Attr("buffer_size: int")
-    .Attr("output_types: list(type)")
-    .SetShapeFn(shape_inference::UnknownShape);
-
-REGISTER_OP("ExperimentalFunctionBufferingResourceGetNext")
-    .Input("function_buffer_resource: resource")
-    .Attr("output_types: list(type)")
-    .Output("output: output_types")
-    .SetShapeFn(shape_inference::UnknownShape);
+REGISTER_OP("ExperimentalMaxIntraOpParallelismDataset")
+    .Input("input_dataset: variant")
+    .Input("max_intra_op_parallelism: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
-REGISTER_OP("ExperimentalFunctionBufferingResourceReset")
-    .Input("function_buffer_resource: resource")
-    .SetShapeFn(shape_inference::UnknownShape);
+REGISTER_OP("ExperimentalPrivateThreadPoolDataset")
+    .Input("input_dataset: variant")
+    .Input("num_threads: int64")
+    .Output("handle: variant")
+    .Attr("output_types: list(type) >= 1")
+    .Attr("output_shapes: list(shape) >= 1")
+    .SetShapeFn(shape_inference::ScalarShape);
 
 REGISTER_OP("ExperimentalThreadPoolDataset")
     .Input("input_dataset: variant")
@@ -170,6 +428,7 @@ REGISTER_OP("ExperimentalNumaMapAndBatchDataset")
     .Attr("Targuments: list(type) >= 0")
     .Attr("output_types: list(type) >= 1")
     .Attr("output_shapes: list(shape) >= 1")
+    .Attr("preserve_cardinality: bool = false")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       // Use index from the end to retrieve the Input shapes,
       // so that to avoid guessing the length of "other_arguments".
diff --git a/tensorflow/core/ops/function_ops.cc b/tensorflow/core/ops/function_ops.cc
index 6edd86b3ad0eae3b0eaa360e5fef9983d6cd3dc4..8e86dd9f780c8eac3dd813c996288a9707247bc4 100644
--- a/tensorflow/core/ops/function_ops.cc
+++ b/tensorflow/core/ops/function_ops.cc
@@ -35,6 +35,22 @@ output: The argument.
 index: This argument is the index-th argument of the function.
 )doc");
 
+REGISTER_SYSTEM_OP("_DeviceArg")
+    .Output("output: T")
+    .Attr("T: type")
+    .Attr("index: int >= 0")
+    .SetIsStateful()
+    .SetShapeFn([](shape_inference::InferenceContext* context) {
+      context->set_output(0, context->UnknownShape());
+      return Status::OK();
+    })
+    .Doc(R"doc(
+A graph node which represents an argument to a function.
+
+output: The argument.
+index: This argument is the index-th argument of the function.
+)doc");
+
 REGISTER_SYSTEM_OP("_Retval")
     .Input("input: T")
     .Attr("T: type")
diff --git a/tensorflow/core/ops/functional_ops.cc b/tensorflow/core/ops/functional_ops.cc
index ee14a851eb9f62f978817236aef98fd7e3a3df0c..5e0bdd888cea1c508a38afe2f40c7c9f17d28269 100644
--- a/tensorflow/core/ops/functional_ops.cc
+++ b/tensorflow/core/ops/functional_ops.cc
@@ -226,6 +226,7 @@ REGISTER_OP("PartitionedCall")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
     .Attr("config: string = ''")
+    .Attr("config_proto: string = ''")
     .Attr("executor_type: string = ''")
     .SetShapeFn(shape_inference::UnknownShape);
 
@@ -235,7 +236,8 @@ REGISTER_OP("StatefulPartitionedCall")
     .Attr("Tin: list(type) >= 0")
     .Attr("Tout: list(type) >= 0")
     .Attr("f: func")
-    .Attr("config: string = ''")
+    .Attr("config: string = ''")  // Deprecated in favor of config_proto
+    .Attr("config_proto: string = ''")
     .Attr("executor_type: string = ''")
     .SetIsStateful()
     .SetShapeFn(shape_inference::UnknownShape);
diff --git a/tensorflow/core/ops/linalg_ops.cc b/tensorflow/core/ops/linalg_ops.cc
index 525b19e51e013278211e8961a17588514796c4d8..952ee4bee2e5a49edeea168f4184767dbebc2527 100644
--- a/tensorflow/core/ops/linalg_ops.cc
+++ b/tensorflow/core/ops/linalg_ops.cc
@@ -109,6 +109,30 @@ Status SelfAdjointEigV2ShapeFn(InferenceContext* c) {
   return Status::OK();
 }
 
+// Input is [...,N,N].
+// First and second outputs are:
+//   [...,N,N]; [...,N].
+Status LuShapeFn(InferenceContext* c) {
+  ShapeHandle input;
+  TF_RETURN_IF_ERROR(c->WithRankAtLeast(c->input(0), 2, &input));
+
+  DimensionHandle n;
+  TF_RETURN_IF_ERROR(c->Merge(c->Dim(input, -2), c->Dim(input, -1), &n));
+
+  ShapeHandle batch_shape;
+  TF_RETURN_IF_ERROR(c->Subshape(input, 0, -2, &batch_shape));
+
+  ShapeHandle lu_shape;
+  ShapeHandle p_shape;
+
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Matrix(n, n), &lu_shape));
+  TF_RETURN_IF_ERROR(c->Concatenate(batch_shape, c->Vector(n), &p_shape));
+
+  c->set_output(0, lu_shape);
+  c->set_output(1, p_shape);
+  return Status::OK();
+}
+
 // Input is [...,M,N].
 // First and second outputs are:
 //   [...,M,M]; [...,M,N], if full_matrices is true,
@@ -289,6 +313,14 @@ REGISTER_OP("SelfAdjointEigV2")
     .Attr("T: {double, float, complex64, complex128}")
     .SetShapeFn(SelfAdjointEigV2ShapeFn);
 
+REGISTER_OP("Lu")
+    .Input("input: T")
+    .Output("lu: T")
+    .Output("p: output_idx_type")
+    .Attr("T: {double, float, complex64, complex128}")
+    .Attr("output_idx_type: {int32, int64} = DT_INT32")
+    .SetShapeFn(LuShapeFn);
+
 REGISTER_OP("MatrixSolve")
     .Input("matrix: T")
     .Input("rhs: T")
diff --git a/tensorflow/core/ops/linalg_ops_test.cc b/tensorflow/core/ops/linalg_ops_test.cc
index f4be820defa3d4b4e2a45ba2038d9250570f59a5..bfacee14efa41408865fecb103bc63b5f6de73ff 100644
--- a/tensorflow/core/ops/linalg_ops_test.cc
+++ b/tensorflow/core/ops/linalg_ops_test.cc
@@ -274,4 +274,23 @@ TEST(LinalgOpsTest, Svd_ShapeFn) {
   INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
 }
 
+TEST(LinalgOpsTest, Lu_ShapeFn) {
+  ShapeInferenceTestOp op("Lu");
+  INFER_OK(op, "?", "?;?");
+  INFER_ERROR("Shape must be at least rank 2 but is rank 1", op, "[1]");
+  INFER_ERROR("Dimensions must be equal, but are 1 and 2", op, "[1,?,3,4,1,2]");
+
+  INFER_OK(op, "[?,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[1,?]", "[d0_0,d0_0];[d0_0]");
+  INFER_OK(op, "[?,1]", "[d0_1,d0_1];[d0_1]");
+
+  // Repeat previous block of tests with input rank > 2.
+  INFER_OK(op, "[1,?,3,4,?,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,1,?]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_4,d0_4];[d0_0,d0_1,d0_2,d0_3,d0_4]");
+  INFER_OK(op, "[1,?,3,4,?,1]",
+           "[d0_0,d0_1,d0_2,d0_3,d0_5,d0_5];[d0_0,d0_1,d0_2,d0_3,d0_5]");
+}
+
 }  // end namespace tensorflow
diff --git a/tensorflow/core/ops/list_ops.cc b/tensorflow/core/ops/list_ops.cc
index 88d6d14c306f5f6e3bd2317692524d6bdce62621..01ebcd15439d670274d7e2a784ce78c5c1ee44ef 100644
--- a/tensorflow/core/ops/list_ops.cc
+++ b/tensorflow/core/ops/list_ops.cc
@@ -28,13 +28,14 @@ REGISTER_OP("EmptyTensorList")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -45,9 +46,9 @@ REGISTER_OP("TensorListPushBack")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -57,18 +58,21 @@ REGISTER_OP("TensorListPushBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -89,9 +93,9 @@ REGISTER_OP("TensorListPushBackBatch")
 
       c->set_output(0, input_handles);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
 
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
@@ -101,18 +105,21 @@ REGISTER_OP("TensorListPushBackBatch")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to push to list with wrong element dtype. List has type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -127,9 +134,9 @@ REGISTER_OP("TensorListPopBack")
     .Output("tensor: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -138,19 +145,21 @@ REGISTER_OP("TensorListPopBack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype),
-              " but trying to push element with type ", DataTypeString(t));
+              " but trying to push element with type ",
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
+        TF_RETURN_IF_ERROR(
+            c->Merge(tensor_shape, list_shape_type.shape, &ignored));
         c->set_output_handle_shapes_and_types(0, *handle_data);
-        s = list_shape_type.shape;
+        tensor_shape = list_shape_type.shape;
       }
-      c->set_output(1, s);
+      c->set_output(1, tensor_shape);
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -161,9 +170,9 @@ REGISTER_OP("TensorListStack")
     .Attr("element_dtype: type")
     .Attr("num_elements: int = -1")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->UnknownShape();
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
       auto* handle_data = c->input_handle_shapes_and_types(0);
       if (handle_data != nullptr && handle_data->size() != 1) {
         return errors::InvalidArgument(
@@ -172,16 +181,17 @@ REGISTER_OP("TensorListStack")
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument(
               "Trying to read from list with wrong element dtype. List has "
               "type ",
               DataTypeString(list_shape_type.dtype), " but expectec type ",
-              DataTypeString(t));
+              DataTypeString(element_dtype));
         }
         shape_inference::ShapeHandle ignored;
-        TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &ignored));
-        s = list_shape_type.shape;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
       }
       int expected_num_elements = -1;
       TF_RETURN_IF_ERROR(c->GetAttr("num_elements", &expected_num_elements));
@@ -192,11 +202,88 @@ REGISTER_OP("TensorListStack")
         num_elements = c->MakeShape({expected_num_elements});
       }
       shape_inference::ShapeHandle result;
-      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, s, &result));
+      TF_RETURN_IF_ERROR(c->Concatenate(num_elements, element_shape, &result));
       c->set_output(0, result);
       return Status::OK();
     });
 
+REGISTER_OP("TensorListConcat")
+    .Input("input_handle: variant")
+    .Output("tensor: element_dtype")
+    .Output("lengths: int64")
+    .Attr("element_dtype: type")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape = c->UnknownShape();
+      auto* handle_data = c->input_handle_shapes_and_types(0);
+      if (handle_data != nullptr && handle_data->size() != 1) {
+        return errors::InvalidArgument(
+            "Trying to read from list with wrong variant data.");
+      }
+      if (handle_data != nullptr) {
+        const shape_inference::ShapeAndType& list_shape_type =
+            (*handle_data)[0];
+        if (list_shape_type.dtype != element_dtype) {
+          return errors::InvalidArgument(
+              "Trying to read from list with wrong element dtype. List has "
+              "type ",
+              DataTypeString(list_shape_type.dtype), " but expected type ",
+              DataTypeString(element_dtype));
+        }
+        shape_inference::ShapeHandle ignored;
+        TF_RETURN_IF_ERROR(
+            c->Merge(element_shape, list_shape_type.shape, &ignored));
+        element_shape = list_shape_type.shape;
+      }
+      if (c->RankKnown(element_shape)) {
+        shape_inference::ShapeHandle result;
+        TF_RETURN_IF_ERROR(c->Subshape(element_shape, 1, &result));
+        TF_RETURN_IF_ERROR(
+            c->Concatenate(c->MakeShape({c->UnknownDim()}), result, &result));
+        c->set_output(0, result);
+      } else {
+        c->set_output(0, c->UnknownShape());
+      }
+      c->set_output(1, c->MakeShape({c->UnknownDim()}));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorListSplit")
+    .Input("tensor: element_dtype")
+    .Input("element_shape: shape_type")
+    .Input("lengths: int64")
+    .Output("output_handle: variant")
+    .Attr("element_dtype: type")
+    .Attr("shape_type: {int32, int64}")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      c->set_output(0, c->Scalar());
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle ignored;
+      // Check that tensor is at least a vector.
+      TF_RETURN_IF_ERROR(c->WithRankAtLeast(tensor_shape, 1, &ignored));
+      // Check that lengths is a vector.
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(2), 1, &ignored));
+      shape_inference::ShapeHandle element_shape_from_tensor_shape;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &element_shape_from_tensor_shape));
+      TF_RETURN_IF_ERROR(c->Concatenate(c->MakeShape({c->UnknownDim()}),
+                                        element_shape_from_tensor_shape,
+                                        &element_shape_from_tensor_shape));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          1, &element_shape));
+      TF_RETURN_IF_ERROR(c->Merge(element_shape_from_tensor_shape,
+                                  element_shape,
+                                  &element_shape_from_tensor_shape));
+      c->set_output_handle_shapes_and_types(
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
+      return Status::OK();
+    });
+
 REGISTER_OP("TensorListFromTensor")
     .Input("tensor: element_dtype")
     .Input("element_shape: shape_type")
@@ -205,17 +292,20 @@ REGISTER_OP("TensorListFromTensor")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s = c->input(0);
-      shape_inference::ShapeHandle o;
-      TF_RETURN_IF_ERROR(c->Subshape(s, 1, &o));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle tensor_shape = c->input(0);
+      shape_inference::ShapeHandle tensor_shape_except_first_dim;
+      TF_RETURN_IF_ERROR(
+          c->Subshape(tensor_shape, 1, &tensor_shape_except_first_dim));
       shape_inference::ShapeHandle element_shape;
       TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
           1, &element_shape));
-      TF_RETURN_IF_ERROR(c->Merge(o, element_shape, &o));
+      TF_RETURN_IF_ERROR(c->Merge(tensor_shape_except_first_dim, element_shape,
+                                  &tensor_shape_except_first_dim));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{element_shape, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -241,13 +331,14 @@ REGISTER_OP("TensorListReserve")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
       c->set_output(0, c->Scalar());
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(0, &s));
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          0, &element_shape));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       c->set_output_handle_shapes_and_types(
-          0, std::vector<shape_inference::ShapeAndType>{{s, t}});
+          0, std::vector<shape_inference::ShapeAndType>{
+                 {element_shape, element_dtype}});
       return Status::OK();
     });
 
@@ -257,17 +348,17 @@ REGISTER_OP("TensorListGetItem")
     .Output("item: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -283,17 +374,19 @@ REGISTER_OP("TensorListSetItem")
     .Output("output_handle: variant")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       c->set_output(0, c->Scalar());
       if (handle_data == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       const shape_inference::ShapeAndType& list_shape_type = (*handle_data)[0];
-      shape_inference::ShapeHandle s = c->input(2);
-      TF_RETURN_IF_ERROR(c->Merge(s, list_shape_type.shape, &s));
+      shape_inference::ShapeHandle item_shape = c->input(2);
+      TF_RETURN_IF_ERROR(
+          c->Merge(item_shape, list_shape_type.shape, &item_shape));
       c->set_output_handle_shapes_and_types(0, *handle_data);
       return Status::OK();
     });
@@ -304,17 +397,17 @@ REGISTER_OP("TensorListGather")
     .Output("values: element_dtype")
     .Attr("element_dtype: type")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
       auto* handle_data = c->input_handle_shapes_and_types(0);
       shape_inference::ShapeHandle element_shape = c->UnknownShape();
       if (handle_data != nullptr) {
         const shape_inference::ShapeAndType& list_shape_type =
             (*handle_data)[0];
         element_shape = list_shape_type.shape;
-        if (list_shape_type.dtype != t) {
+        if (list_shape_type.dtype != element_dtype) {
           return errors::InvalidArgument("Expected list with element dtype ",
-                                         DataTypeString(t),
+                                         DataTypeString(element_dtype),
                                          " but got list with element dtype ",
                                          DataTypeString(list_shape_type.dtype));
         }
@@ -333,12 +426,13 @@ REGISTER_OP("TensorListScatter")
     .Attr("element_dtype: type")
     .Attr("shape_type: {int32, int64}")
     .SetShapeFn([](shape_inference::InferenceContext* c) {
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
-      shape_inference::ShapeHandle s;
-      TF_RETURN_IF_ERROR(
-          c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(2, &s));
-      c->set_output_handle_shapes_and_types(0, {{s, t}});
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
+      shape_inference::ShapeHandle element_shape;
+      TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensorTreatScalarAsUnknownShape(
+          2, &element_shape));
+      c->set_output_handle_shapes_and_types(0,
+                                            {{element_shape, element_dtype}});
       c->set_output(0, c->Scalar());
       return Status::OK();
     });
@@ -354,28 +448,29 @@ REGISTER_OP("TensorListConcatLists")
       TF_RETURN_IF_ERROR(c->Merge(input_a, input_b, &input_a));
       c->set_output(0, input_a);
 
-      DataType t;
-      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &t));
+      DataType element_dtype;
+      TF_RETURN_IF_ERROR(c->GetAttr("element_dtype", &element_dtype));
 
       auto* handle_data_a = c->input_handle_shapes_and_types(0);
       auto* handle_data_b = c->input_handle_shapes_and_types(1);
       if (handle_data_a == nullptr && handle_data_b == nullptr) {
-        c->set_output_handle_shapes_and_types(0, {{c->UnknownShape(), t}});
+        c->set_output_handle_shapes_and_types(
+            0, {{c->UnknownShape(), element_dtype}});
         return Status::OK();
       }
       shape_inference::ShapeAndType list_shape_type_a =
           (handle_data_a) ? handle_data_a->at(0) : handle_data_b->at(0);
       const shape_inference::ShapeAndType& list_shape_type_b =
           (handle_data_b) ? handle_data_b->at(0) : handle_data_a->at(0);
-      if (list_shape_type_a.dtype != t) {
+      if (list_shape_type_a.dtype != element_dtype) {
         return errors::InvalidArgument("input_a.type != element_dtype: ",
                                        DataTypeString(list_shape_type_a.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
-      if (list_shape_type_b.dtype != t) {
+      if (list_shape_type_b.dtype != element_dtype) {
         return errors::InvalidArgument("input_b.type != element_dtype: ",
                                        DataTypeString(list_shape_type_b.dtype),
-                                       " vs. ", DataTypeString(t));
+                                       " vs. ", DataTypeString(element_dtype));
       }
       TF_RETURN_IF_ERROR(c->Merge(list_shape_type_a.shape,
                                   list_shape_type_b.shape,
diff --git a/tensorflow/core/ops/nn_ops.cc b/tensorflow/core/ops/nn_ops.cc
index 4dfd95b01916c015b35cd32bdb4c16a3ade90ffe..bc59abc54cc1b87af3c06ce5cfda6fe5dca86e36 100644
--- a/tensorflow/core/ops/nn_ops.cc
+++ b/tensorflow/core/ops/nn_ops.cc
@@ -327,6 +327,9 @@ REGISTER_OP("_FusedConv2D")
     .Attr(GetConvnetDataFormatAttrString())
     .Attr("dilations: list(int) = [1, 1, 1, 1]")
     .Attr("fused_ops: list(string) = []")
+    // Attributes for the FusedBatchNorm ------------------------------------ //
+    .Attr("epsilon: float = 0.0001")
+    // ---------------------------------------------------------------------- //
     .SetShapeFn(shape_inference::Conv2DShape)
     .Doc(R"doc(
 *NOTE*: Do not invoke this operator directly in Python. Grappler is
@@ -1206,9 +1209,9 @@ Status TopKShapeFn(InferenceContext* c) {
   DimensionHandle last_dim = c->Dim(input, -1);
   if (c->ValueKnown(last_dim) && c->ValueKnown(k_dim) &&
       c->Value(last_dim) < c->Value(k_dim)) {
-    return errors::InvalidArgument("input must have last dimension >= k = ",
-                                   c->Value(k_dim), " but is ",
-                                   c->Value(last_dim));
+    return errors::InvalidArgument(
+        "input must have last dimension >= k = ", c->Value(k_dim), " but is ",
+        c->Value(last_dim));
   }
 
   // Replace last_dim with k_dim.
@@ -1262,9 +1265,9 @@ REGISTER_OP("NthElement")
       DimensionHandle last_dim = c->Dim(input, -1);
       if (c->ValueKnown(last_dim) && c->ValueKnown(n_dim) &&
           c->Value(last_dim) <= c->Value(n_dim)) {
-        return errors::InvalidArgument("Input must have last dimension > n = ",
-                                       c->Value(n_dim), " but is ",
-                                       c->Value(last_dim));
+        return errors::InvalidArgument(
+            "Input must have last dimension > n = ", c->Value(n_dim),
+            " but is ", c->Value(last_dim));
       }
 
       // Reduce last_dim for output tensor
@@ -1606,6 +1609,55 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
+REGISTER_OP("__MklDummyPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Output("output: T")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+Dummy node that enables fusing Pad and Conv2D operator for MKL. This node
+does not perform anything. It is just created as an intermediate output of
+merging Pad and Conv2D.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
+REGISTER_OP("_MklPadWithConv2D")
+    .Input("input: T")
+    .Input("filter: T")
+    .Input("paddings: Tpaddings")
+    .Input("mkl_input: uint8")
+    .Input("mkl_filter: uint8")
+    .Input("mkl_paddings: uint8")
+    .Output("output: T")
+    .Output("filter_output: T")
+    .Output("mkl_output: uint8")
+    .Output("mkl_filter_output: uint8")
+    .Attr("T: {half, float, double}")
+    .Attr("strides: list(int)")
+    .Attr("use_cudnn_on_gpu: bool = true")
+    .Attr(GetPaddingAttrString())
+    .Attr(GetConvnetDataFormatAttrString())
+    .Attr("dilations: list(int) = [1, 1, 1, 1]")
+    .Attr("Tpaddings: {int32, int64} = DT_INT32")
+    .SetShapeFn(shape_inference::Conv2DShape)
+    .Doc(R"doc(
+MKL version of Pad and Conv2D operator. Uses MKL DNN APIs to perform
+Pad and 2D convolution to the output of convolution.
+
+NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
+expected to invoke these operators.
+)doc");
+
 REGISTER_OP("_MklConv2DBackpropFilter")
     .Input("input: T")
     .Input("filter_sizes: int32")
@@ -2107,7 +2159,6 @@ NOTE Do not invoke this operator directly in Python. Graph rewrite pass is
 expected to invoke these operators.
 )doc");
 
-
 REGISTER_OP("_MklAvgPool3DGrad")
     .Input("orig_input_shape: int32")
     .Input("grad: T")
@@ -2190,11 +2241,7 @@ REGISTER_OP("_MklLRN")
     .Input("input: T")
     .Input("mkl_input: uint8")
     .Output("output: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Output("workspace: T")
-#else
     .Output("workspace: uint8")
-#endif
     .Output("mkl_output: uint8")
     .Output("mkl_workspace: uint8")
     .Attr("depth_radius: int = 5")
@@ -2218,11 +2265,7 @@ REGISTER_OP("_MklLRNGrad")
     .Input("input_grads: T")
     .Input("input_image: T")
     .Input("output_image: T")
-#ifdef INTEL_MKL_ML_ONLY
-    .Input("workspace: T")
-#else
     .Input("workspace: uint8")
-#endif
     .Input("mkl_input_grads: uint8")
     .Input("mkl_input_image: uint8")
     .Input("mkl_output_image: uint8")
diff --git a/tensorflow/core/ops/ops.pbtxt b/tensorflow/core/ops/ops.pbtxt
index 430212ee1d1fd54c8436a0bc5e1a3079fee01eea..89bdcc571efee6c0d193341936758670c1218aab 100644
--- a/tensorflow/core/ops/ops.pbtxt
+++ b/tensorflow/core/ops/ops.pbtxt
@@ -4944,33 +4944,6 @@ op {
     type: "list(float)"
   }
 }
-op {
-  name: "BytesProducedStatsDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "CTCBeamSearchDecoder"
   input_arg {
@@ -7915,21 +7888,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "DatasetToTFRecord"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "filename"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "compression_type"
-    type: DT_STRING
-  }
-}
 op {
   name: "DebugGradientIdentity"
   input_arg {
@@ -8599,37 +8557,6 @@ op {
     }
   }
 }
-op {
-  name: "DenseToSparseBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "row_shape"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "DenseToSparseSetOperation"
   input_arg {
@@ -9834,24 +9761,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "EnqueueInQueueDataset"
-  input_arg {
-    name: "queue"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "components"
-    type_list_attr: "Tcomponents"
-  }
-  attr {
-    name: "Tcomponents"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "EnsureShape"
   input_arg {
@@ -10089,6 +9998,33 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalBytesProducedStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalCSVDataset"
   input_arg {
@@ -10154,6 +10090,63 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalDatasetCardinality"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "cardinality"
+    type: DT_INT64
+  }
+}
+op {
+  name: "ExperimentalDatasetToTFRecord"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "filename"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "compression_type"
+    type: DT_STRING
+  }
+}
+op {
+  name: "ExperimentalDenseToSparseBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "row_shape"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalDirectedInterleaveDataset"
   input_arg {
@@ -10189,50 +10182,129 @@ op {
   }
 }
 op {
-  name: "ExperimentalFunctionBufferingResource"
+  name: "ExperimentalGroupByReducerDataset"
   input_arg {
-    name: "string_arg"
-    type: DT_STRING
+    name: "input_dataset"
+    type: DT_VARIANT
   }
   input_arg {
-    name: "target_device"
-    type: DT_STRING
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "init_func_other_arguments"
+    type_list_attr: "Tinit_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "finalize_func_other_arguments"
+    type_list_attr: "Tfinalize_func_other_arguments"
   }
   output_arg {
-    name: "resource"
-    type: DT_RESOURCE
+    name: "handle"
+    type: DT_VARIANT
   }
   attr {
-    name: "shared_name"
-    type: "string"
+    name: "key_func"
+    type: "func"
   }
   attr {
-    name: "container"
-    type: "string"
+    name: "init_func"
+    type: "func"
   }
   attr {
-    name: "f"
+    name: "reduce_func"
     type: "func"
   }
   attr {
-    name: "buffer_size"
-    type: "int"
+    name: "finalize_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tinit_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Tfinalize_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
     type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
   is_stateful: true
 }
 op {
-  name: "ExperimentalFunctionBufferingResourceGetNext"
+  name: "ExperimentalGroupByWindowDataset"
   input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "key_func_other_arguments"
+    type_list_attr: "Tkey_func_other_arguments"
+  }
+  input_arg {
+    name: "reduce_func_other_arguments"
+    type_list_attr: "Treduce_func_other_arguments"
+  }
+  input_arg {
+    name: "window_size_func_other_arguments"
+    type_list_attr: "Twindow_size_func_other_arguments"
   }
   output_arg {
-    name: "output"
-    type_list_attr: "output_types"
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "key_func"
+    type: "func"
+  }
+  attr {
+    name: "reduce_func"
+    type: "func"
+  }
+  attr {
+    name: "window_size_func"
+    type: "func"
+  }
+  attr {
+    name: "Tkey_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Treduce_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "Twindow_size_func_other_arguments"
+    type: "list(type)"
+    has_minimum: true
   }
   attr {
     name: "output_types"
@@ -10240,15 +10312,12 @@ op {
     has_minimum: true
     minimum: 1
   }
-  is_stateful: true
-}
-op {
-  name: "ExperimentalFunctionBufferingResourceReset"
-  input_arg {
-    name: "function_buffer_resource"
-    type: DT_RESOURCE
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
   }
-  is_stateful: true
 }
 op {
   name: "ExperimentalIdentityIndexedDataset"
@@ -10361,6 +10430,150 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalLatencyStatsDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalMapAndBatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "batch_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "drop_remainder"
+    type: DT_BOOL
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMapDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "use_inter_op_parallelism"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalMatchingFilesDataset"
+  input_arg {
+    name: "patterns"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalMaterializedIndexDatasetHandle"
   output_arg {
@@ -10389,6 +10602,33 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalMaxIntraOpParallelismDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "max_intra_op_parallelism"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalNonSerializableDataset"
   input_arg {
@@ -10459,6 +10699,290 @@ op {
     has_minimum: true
     minimum: 1
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalParallelInterleaveDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  input_arg {
+    name: "cycle_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "block_length"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "sloppy"
+    type: DT_BOOL
+  }
+  input_arg {
+    name: "buffer_output_elements"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "prefetch_input_elements"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalParseExampleDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_parallel_calls"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "dense_defaults"
+    type_list_attr: "Tdense"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "sparse_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "dense_keys"
+    type: "list(string)"
+    has_minimum: true
+  }
+  attr {
+    name: "sparse_types"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "Tdense"
+    type: "list(type)"
+    has_minimum: true
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_INT64
+        type: DT_STRING
+      }
+    }
+  }
+  attr {
+    name: "dense_shapes"
+    type: "list(shape)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "sloppy"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalPrivateThreadPoolDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "num_threads"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalRandomDataset"
+  input_arg {
+    name: "seed"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "seed2"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalScanDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "initial_state"
+    type_list_attr: "Tstate"
+  }
+  input_arg {
+    name: "other_arguments"
+    type_list_attr: "Targuments"
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "f"
+    type: "func"
+  }
+  attr {
+    name: "Tstate"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "Targuments"
+    type: "list(type)"
+    has_minimum: true
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "ExperimentalSetStatsAggregatorDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "stats_aggregator"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tag"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "counter_prefix"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
 }
 op {
   name: "ExperimentalSleepDataset"
@@ -10487,6 +11011,107 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "ExperimentalSlidingWindowDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  input_arg {
+    name: "window_size"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_shift"
+    type: DT_INT64
+  }
+  input_arg {
+    name: "window_stride"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
+op {
+  name: "ExperimentalSqlDataset"
+  input_arg {
+    name: "driver_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "data_source_name"
+    type: DT_STRING
+  }
+  input_arg {
+    name: "query"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorHandle"
+  output_arg {
+    name: "handle"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ExperimentalStatsAggregatorSummary"
+  input_arg {
+    name: "iterator"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "summary"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
 op {
   name: "ExperimentalThreadPoolDataset"
   input_arg {
@@ -10552,6 +11177,29 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ExperimentalUnbatchDataset"
+  input_arg {
+    name: "input_dataset"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "output_types"
+    type: "list(type)"
+    has_minimum: true
+    minimum: 1
+  }
+  attr {
+    name: "output_shapes"
+    type: "list(shape)"
+    has_minimum: true
+    minimum: 1
+  }
+}
 op {
   name: "ExperimentalUniqueDataset"
   input_arg {
@@ -12744,144 +13392,6 @@ op {
     }
   }
 }
-op {
-  name: "GroupByReducerDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "init_func_other_arguments"
-    type_list_attr: "Tinit_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "finalize_func_other_arguments"
-    type_list_attr: "Tfinalize_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "init_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "finalize_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tinit_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Tfinalize_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
-op {
-  name: "GroupByWindowDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "key_func_other_arguments"
-    type_list_attr: "Tkey_func_other_arguments"
-  }
-  input_arg {
-    name: "reduce_func_other_arguments"
-    type_list_attr: "Treduce_func_other_arguments"
-  }
-  input_arg {
-    name: "window_size_func_other_arguments"
-    type_list_attr: "Twindow_size_func_other_arguments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "key_func"
-    type: "func"
-  }
-  attr {
-    name: "reduce_func"
-    type: "func"
-  }
-  attr {
-    name: "window_size_func"
-    type: "func"
-  }
-  attr {
-    name: "Tkey_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Treduce_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "Twindow_size_func_other_arguments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "GuaranteeConst"
   input_arg {
@@ -14362,91 +14872,64 @@ op {
   }
 }
 op {
-  name: "LRNGrad"
-  input_arg {
-    name: "input_grads"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "input_image"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "output_image"
-    type_attr: "T"
-  }
-  output_arg {
-    name: "output"
-    type_attr: "T"
-  }
-  attr {
-    name: "depth_radius"
-    type: "int"
-    default_value {
-      i: 5
-    }
-  }
-  attr {
-    name: "bias"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "alpha"
-    type: "float"
-    default_value {
-      f: 1
-    }
-  }
-  attr {
-    name: "beta"
-    type: "float"
-    default_value {
-      f: 0.5
-    }
-  }
-  attr {
-    name: "T"
-    type: "type"
-    default_value {
-      type: DT_FLOAT
-    }
-    allowed_values {
-      list {
-        type: DT_HALF
-        type: DT_BFLOAT16
-        type: DT_FLOAT
-      }
-    }
-  }
-}
-op {
-  name: "LatencyStatsDataset"
+  name: "LRNGrad"
   input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
+    name: "input_grads"
+    type_attr: "T"
   }
   input_arg {
-    name: "tag"
-    type: DT_STRING
+    name: "input_image"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "output_image"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
+    name: "output"
+    type_attr: "T"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "depth_radius"
+    type: "int"
+    default_value {
+      i: 5
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "bias"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "alpha"
+    type: "float"
+    default_value {
+      f: 1
+    }
+  }
+  attr {
+    name: "beta"
+    type: "float"
+    default_value {
+      f: 0.5
+    }
+  }
+  attr {
+    name: "T"
+    type: "type"
+    default_value {
+      type: DT_FLOAT
+    }
+    allowed_values {
+      list {
+        type: DT_HALF
+        type: DT_BFLOAT16
+        type: DT_FLOAT
+      }
+    }
   }
 }
 op {
@@ -15308,112 +15791,56 @@ op {
   }
 }
 op {
-  name: "MakeIterator"
+  name: "Lu"
   input_arg {
-    name: "dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  is_stateful: true
-}
-op {
-  name: "MapAndBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_batches"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
+    name: "input"
+    type_attr: "T"
   }
   output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
+    name: "lu"
+    type_attr: "T"
   }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
+  output_arg {
+    name: "p"
+    type_attr: "output_idx_type"
   }
   attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_DOUBLE
+        type: DT_FLOAT
+        type: DT_COMPLEX64
+        type: DT_COMPLEX128
+      }
+    }
   }
   attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "output_idx_type"
+    type: "type"
+    default_value {
+      type: DT_INT32
+    }
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
   }
 }
 op {
-  name: "MapAndBatchDatasetV2"
+  name: "MakeIterator"
   input_arg {
-    name: "input_dataset"
+    name: "dataset"
     type: DT_VARIANT
   }
   input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "drop_remainder"
-    type: DT_BOOL
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
+    name: "iterator"
+    type: DT_RESOURCE
   }
+  is_stateful: true
 }
 op {
   name: "MapClear"
@@ -15495,6 +15922,13 @@ op {
       b: true
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "MapDefun"
@@ -15894,18 +16328,6 @@ op {
     type: DT_STRING
   }
 }
-op {
-  name: "MatchingFilesDataset"
-  input_arg {
-    name: "patterns"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  is_stateful: true
-}
 op {
   name: "MatrixBandPart"
   input_arg {
@@ -19449,62 +19871,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "ParallelInterleaveDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  input_arg {
-    name: "cycle_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "block_length"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "sloppy"
-    type: DT_BOOL
-  }
-  input_arg {
-    name: "buffer_output_elements"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "prefetch_input_elements"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ParallelInterleaveDatasetV2"
   input_arg {
@@ -19613,6 +19979,13 @@ op {
       b: false
     }
   }
+  attr {
+    name: "preserve_cardinality"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
 }
 op {
   name: "ParameterizedTruncatedNormal"
@@ -19760,83 +20133,6 @@ op {
     has_minimum: true
   }
 }
-op {
-  name: "ParseExampleDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "num_parallel_calls"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "dense_defaults"
-    type_list_attr: "Tdense"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "sparse_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "dense_keys"
-    type: "list(string)"
-    has_minimum: true
-  }
-  attr {
-    name: "sparse_types"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "Tdense"
-    type: "list(type)"
-    has_minimum: true
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_INT64
-        type: DT_STRING
-      }
-    }
-  }
-  attr {
-    name: "dense_shapes"
-    type: "list(shape)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "sloppy"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-}
 op {
   name: "ParseSequenceExample"
   input_arg {
@@ -20340,6 +20636,13 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   attr {
     name: "executor_type"
     type: "string"
@@ -20516,48 +20819,6 @@ op {
     minimum: 1
   }
 }
-op {
-  name: "PrependFromQueueAndPaddedBatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "batch_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "padded_shapes"
-    type: DT_INT64
-    number_attr: "N"
-  }
-  input_arg {
-    name: "padding_values"
-    type_list_attr: "Toutput_types"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "Toutput_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "N"
-    type: "int"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "PreventGradient"
   input_arg {
@@ -20986,6 +21247,19 @@ op {
       }
     }
   }
+  attr {
+    name: "round_mode"
+    type: "string"
+    default_value {
+      s: "HALF_TO_EVEN"
+    }
+    allowed_values {
+      list {
+        s: "HALF_TO_EVEN"
+        s: "HALF_UP"
+      }
+    }
+  }
 }
 op {
   name: "QuantizeAndDequantizeV3"
@@ -22816,34 +23090,6 @@ op {
   }
   is_stateful: true
 }
-op {
-  name: "RandomDataset"
-  input_arg {
-    name: "seed"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "seed2"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "RandomGamma"
   input_arg {
@@ -24822,35 +25068,162 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdadelta"
+  name: "ResourceApplyAdadelta"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum_update"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "rho"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "epsilon"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagrad"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "update_slots"
+    type: "bool"
+    default_value {
+      b: true
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "ResourceApplyAdagradDA"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "gradient_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum_update"
+    name: "gradient_squared_accumulator"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
+    name: "grad"
     type_attr: "T"
   }
   input_arg {
-    name: "rho"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "epsilon"
+    name: "l1"
     type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "l2"
     type_attr: "T"
   }
+  input_arg {
+    name: "global_step"
+    type: DT_INT64
+  }
   attr {
     name: "T"
     type: "type"
@@ -24886,97 +25259,46 @@ op {
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdagrad"
+  name: "ResourceApplyAdam"
   input_arg {
     name: "var"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "accum"
+    name: "m"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "lr"
-    type_attr: "T"
-  }
-  input_arg {
-    name: "grad"
-    type_attr: "T"
-  }
-  attr {
-    name: "T"
-    type: "type"
-    allowed_values {
-      list {
-        type: DT_FLOAT
-        type: DT_DOUBLE
-        type: DT_INT32
-        type: DT_UINT8
-        type: DT_INT16
-        type: DT_INT8
-        type: DT_COMPLEX64
-        type: DT_INT64
-        type: DT_QINT8
-        type: DT_QUINT8
-        type: DT_QINT32
-        type: DT_BFLOAT16
-        type: DT_UINT16
-        type: DT_COMPLEX128
-        type: DT_HALF
-        type: DT_UINT32
-        type: DT_UINT64
-      }
-    }
-  }
-  attr {
-    name: "use_locking"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
-  attr {
-    name: "update_slots"
-    type: "bool"
-    default_value {
-      b: true
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "ResourceApplyAdagradDA"
-  input_arg {
-    name: "var"
+    name: "v"
     type: DT_RESOURCE
   }
   input_arg {
-    name: "gradient_accumulator"
-    type: DT_RESOURCE
+    name: "beta1_power"
+    type_attr: "T"
   }
   input_arg {
-    name: "gradient_squared_accumulator"
-    type: DT_RESOURCE
+    name: "beta2_power"
+    type_attr: "T"
   }
   input_arg {
-    name: "grad"
+    name: "lr"
     type_attr: "T"
   }
   input_arg {
-    name: "lr"
+    name: "beta1"
     type_attr: "T"
   }
   input_arg {
-    name: "l1"
+    name: "beta2"
     type_attr: "T"
   }
   input_arg {
-    name: "l2"
+    name: "epsilon"
     type_attr: "T"
   }
   input_arg {
-    name: "global_step"
-    type: DT_INT64
+    name: "grad"
+    type_attr: "T"
   }
   attr {
     name: "T"
@@ -25010,10 +25332,17 @@ op {
       b: false
     }
   }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
   is_stateful: true
 }
 op {
-  name: "ResourceApplyAdam"
+  name: "ResourceApplyAdamWithAmsgrad"
   input_arg {
     name: "var"
     type: DT_RESOURCE
@@ -25026,6 +25355,10 @@ op {
     name: "v"
     type: DT_RESOURCE
   }
+  input_arg {
+    name: "vhat"
+    type: DT_RESOURCE
+  }
   input_arg {
     name: "beta1_power"
     type_attr: "T"
@@ -25086,13 +25419,6 @@ op {
       b: false
     }
   }
-  attr {
-    name: "use_nesterov"
-    type: "bool"
-    default_value {
-      b: false
-    }
-  }
   is_stateful: true
 }
 op {
@@ -25419,6 +25745,69 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceApplyMomentum"
   input_arg {
@@ -26690,6 +27079,83 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "ResourceSparseApplyKerasMomentum"
+  input_arg {
+    name: "var"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "accum"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "lr"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "grad"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "momentum"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_FLOAT
+        type: DT_DOUBLE
+        type: DT_INT32
+        type: DT_UINT8
+        type: DT_INT16
+        type: DT_INT8
+        type: DT_COMPLEX64
+        type: DT_INT64
+        type: DT_QINT8
+        type: DT_QUINT8
+        type: DT_QINT32
+        type: DT_BFLOAT16
+        type: DT_UINT16
+        type: DT_COMPLEX128
+        type: DT_HALF
+        type: DT_UINT32
+        type: DT_UINT64
+      }
+    }
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+  attr {
+    name: "use_locking"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  attr {
+    name: "use_nesterov"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+  is_stateful: true
+}
 op {
   name: "ResourceSparseApplyMomentum"
   input_arg {
@@ -27789,52 +28255,6 @@ op {
     }
   }
 }
-op {
-  name: "ScanDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "initial_state"
-    type_list_attr: "Tstate"
-  }
-  input_arg {
-    name: "other_arguments"
-    type_list_attr: "Targuments"
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "f"
-    type: "func"
-  }
-  attr {
-    name: "Tstate"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "Targuments"
-    type: "list(type)"
-    has_minimum: true
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "ScatterAdd"
   input_arg {
@@ -29272,42 +29692,6 @@ op {
     }
   }
 }
-op {
-  name: "SetStatsAggregatorDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "stats_aggregator"
-    type: DT_RESOURCE
-  }
-  input_arg {
-    name: "tag"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "counter_prefix"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Shape"
   input_arg {
@@ -29771,41 +30155,6 @@ op {
     }
   }
 }
-op {
-  name: "SlideDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  input_arg {
-    name: "window_size"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_shift"
-    type: DT_INT64
-  }
-  input_arg {
-    name: "window_stride"
-    type: DT_INT64
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "Snapshot"
   input_arg {
@@ -32996,38 +33345,6 @@ op {
     }
   }
 }
-op {
-  name: "SqlDataset"
-  input_arg {
-    name: "driver_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "data_source_name"
-    type: DT_STRING
-  }
-  input_arg {
-    name: "query"
-    type: DT_STRING
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-  is_stateful: true
-}
 op {
   name: "Sqrt"
   input_arg {
@@ -33513,6 +33830,13 @@ op {
       s: ""
     }
   }
+  attr {
+    name: "config_proto"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
   attr {
     name: "executor_type"
     type: "string"
@@ -33913,40 +34237,6 @@ op {
     }
   }
 }
-op {
-  name: "StatsAggregatorHandle"
-  output_arg {
-    name: "handle"
-    type: DT_RESOURCE
-  }
-  attr {
-    name: "container"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  attr {
-    name: "shared_name"
-    type: "string"
-    default_value {
-      s: ""
-    }
-  }
-  is_stateful: true
-}
-op {
-  name: "StatsAggregatorSummary"
-  input_arg {
-    name: "iterator"
-    type: DT_RESOURCE
-  }
-  output_arg {
-    name: "summary"
-    type: DT_STRING
-  }
-  is_stateful: true
-}
 op {
   name: "StopGradient"
   input_arg {
@@ -35923,6 +36213,127 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "TensorForestCreateTreeVariable"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeDeserialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeIsInitializedOp"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "is_initialized"
+    type: DT_BOOL
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreePredict"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  input_arg {
+    name: "dense_features"
+    type: DT_FLOAT
+  }
+  output_arg {
+    name: "logits"
+    type: DT_FLOAT
+  }
+  attr {
+    name: "logits_dimension"
+    type: "int"
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeResourceHandleOp"
+  output_arg {
+    name: "resource"
+    type: DT_RESOURCE
+  }
+  attr {
+    name: "container"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  attr {
+    name: "shared_name"
+    type: "string"
+    default_value {
+      s: ""
+    }
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSerialize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_config"
+    type: DT_STRING
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorForestTreeSize"
+  input_arg {
+    name: "tree_handle"
+    type: DT_RESOURCE
+  }
+  output_arg {
+    name: "tree_size"
+    type: DT_INT32
+  }
+  is_stateful: true
+}
+op {
+  name: "TensorListConcat"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  output_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+}
 op {
   name: "TensorListConcatLists"
   input_arg {
@@ -36183,6 +36594,39 @@ op {
     type: "type"
   }
 }
+op {
+  name: "TensorListSplit"
+  input_arg {
+    name: "tensor"
+    type_attr: "element_dtype"
+  }
+  input_arg {
+    name: "element_shape"
+    type_attr: "shape_type"
+  }
+  input_arg {
+    name: "lengths"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+  attr {
+    name: "element_dtype"
+    type: "type"
+  }
+  attr {
+    name: "shape_type"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorListStack"
   input_arg {
@@ -36205,6 +36649,105 @@ op {
     }
   }
 }
+op {
+  name: "TensorScatterAdd"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterSub"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
+op {
+  name: "TensorScatterUpdate"
+  input_arg {
+    name: "tensor"
+    type_attr: "T"
+  }
+  input_arg {
+    name: "indices"
+    type_attr: "Tindices"
+  }
+  input_arg {
+    name: "updates"
+    type_attr: "T"
+  }
+  output_arg {
+    name: "output"
+    type_attr: "T"
+  }
+  attr {
+    name: "T"
+    type: "type"
+  }
+  attr {
+    name: "Tindices"
+    type: "type"
+    allowed_values {
+      list {
+        type: DT_INT32
+        type: DT_INT64
+      }
+    }
+  }
+}
 op {
   name: "TensorSliceDataset"
   input_arg {
@@ -36822,29 +37365,6 @@ op {
     type: "type"
   }
 }
-op {
-  name: "UnbatchDataset"
-  input_arg {
-    name: "input_dataset"
-    type: DT_VARIANT
-  }
-  output_arg {
-    name: "handle"
-    type: DT_VARIANT
-  }
-  attr {
-    name: "output_types"
-    type: "list(type)"
-    has_minimum: true
-    minimum: 1
-  }
-  attr {
-    name: "output_shapes"
-    type: "list(shape)"
-    has_minimum: true
-    minimum: 1
-  }
-}
 op {
   name: "UnbatchGrad"
   input_arg {
@@ -36886,6 +37406,104 @@ op {
     type: "type"
   }
 }
+op {
+  name: "UnicodeDecodeWithOffsets"
+  input_arg {
+    name: "input"
+    type: DT_STRING
+  }
+  output_arg {
+    name: "row_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "char_values"
+    type: DT_INT32
+  }
+  output_arg {
+    name: "char_to_byte_starts"
+    type: DT_INT64
+  }
+  attr {
+    name: "input_encoding"
+    type: "string"
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "strict"
+        s: "replace"
+        s: "ignore"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+  attr {
+    name: "replace_control_characters"
+    type: "bool"
+    default_value {
+      b: false
+    }
+  }
+}
+op {
+  name: "UnicodeEncode"
+  input_arg {
+    name: "input_values"
+    type: DT_INT32
+  }
+  input_arg {
+    name: "input_splits"
+    type: DT_INT64
+  }
+  output_arg {
+    name: "output"
+    type: DT_STRING
+  }
+  attr {
+    name: "errors"
+    type: "string"
+    default_value {
+      s: "replace"
+    }
+    allowed_values {
+      list {
+        s: "ignore"
+        s: "replace"
+        s: "strict"
+      }
+    }
+  }
+  attr {
+    name: "output_encoding"
+    type: "string"
+    allowed_values {
+      list {
+        s: "UTF-8"
+        s: "UTF-16-BE"
+        s: "UTF-32-BE"
+      }
+    }
+  }
+  attr {
+    name: "replacement_char"
+    type: "int"
+    default_value {
+      i: 65533
+    }
+  }
+}
 op {
   name: "UnicodeScript"
   input_arg {
@@ -37535,6 +38153,17 @@ op {
   }
   is_stateful: true
 }
+op {
+  name: "UnwrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "UpperBound"
   input_arg {
@@ -37853,6 +38482,17 @@ op {
     minimum: 1
   }
 }
+op {
+  name: "WrapDatasetVariant"
+  input_arg {
+    name: "input_handle"
+    type: DT_VARIANT
+  }
+  output_arg {
+    name: "output_handle"
+    type: DT_VARIANT
+  }
+}
 op {
   name: "WriteAudioSummary"
   input_arg {
diff --git a/tensorflow/core/ops/sparse_ops.cc b/tensorflow/core/ops/sparse_ops.cc
index bc0cb2095dabf366e85106770c56a2f169f040c8..de08a1078458c236520924f52450fa8b4dc6f18a 100644
--- a/tensorflow/core/ops/sparse_ops.cc
+++ b/tensorflow/core/ops/sparse_ops.cc
@@ -401,7 +401,7 @@ REGISTER_OP("SparseReduceMax")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: realnumbertype")
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn(shape_inference::SparseReduceShapeFn);
 
 REGISTER_OP("SparseReduceMaxSparse")
     .Input("input_indices: int64")
@@ -423,7 +423,7 @@ REGISTER_OP("SparseReduceSum")
     .Attr("keep_dims: bool = False")
     .Output("output: T")
     .Attr("T: numbertype")
-    .SetShapeFn(shape_inference::UnknownShape);
+    .SetShapeFn(shape_inference::SparseReduceShapeFn);
 
 REGISTER_OP("SparseReduceSumSparse")
     .Input("input_indices: int64")
diff --git a/tensorflow/core/ops/sparse_ops_test.cc b/tensorflow/core/ops/sparse_ops_test.cc
index 6a9b5ce4d31fcd03a69a53893689d67ba5b2b9e7..00283c59932c579046a166e90531c1b8a740f4ab 100644
--- a/tensorflow/core/ops/sparse_ops_test.cc
+++ b/tensorflow/core/ops/sparse_ops_test.cc
@@ -133,6 +133,13 @@ TEST(SparseOpsTest, SparseToDense_ShapeFn) {
 
 TEST(SparseOpsTest, SparseReduceSum_ShapeFn) {
   ShapeInferenceTestOp op("SparseReduceSum");
+  TF_ASSERT_OK(NodeDefBuilder("test", "SparseReduceSum")
+                   .Input({"input_indices", 0, DT_INT64})
+                   .Input({"input_values", 1, DT_INT64})
+                   .Input({"input_shape", 2, DT_INT64})
+                   .Input({"reduction_axes", 3, DT_INT32})
+                   .Attr("keep_dims", false)
+                   .Finalize(&op.node_def));
 
   // Shape fn always yields unknown.
   INFER_OK(op, "?;?;?;?", "?");
diff --git a/tensorflow/core/ops/string_ops.cc b/tensorflow/core/ops/string_ops.cc
index 352253135c406285459527fa1af45237cd9f4207..8ea74f1d43e5baa3f14398e6ea17c19466ea2973 100644
--- a/tensorflow/core/ops/string_ops.cc
+++ b/tensorflow/core/ops/string_ops.cc
@@ -13,13 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
+#include <string>
+#include <vector>
+
 #include "absl/strings/str_split.h"
 #include "tensorflow/core/framework/common_shape_fns.h"
 #include "tensorflow/core/framework/op.h"
 #include "tensorflow/core/framework/shape_inference.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/core/lib/strings/strcat.h"
+#include "tensorflow/core/platform/types.h"
 
 namespace tensorflow {
 
+namespace shape_inference {
+class InferenceContext;
+}  // namespace shape_inference
+
 using shape_inference::DimensionHandle;
 using shape_inference::InferenceContext;
 using shape_inference::ShapeHandle;
@@ -250,6 +261,31 @@ REGISTER_OP("UnicodeScript")
     .Output("output: int32")
     .SetShapeFn(shape_inference::UnchangedShape);
 
+REGISTER_OP("UnicodeEncode")
+    .Input("input_values: int32")
+    .Input("input_splits: int64")
+    .Attr("errors: {'ignore', 'replace', 'strict'} = 'replace'")
+    .Attr("output_encoding: {'UTF-8', 'UTF-16-BE', 'UTF-32-BE'}")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Output("output: string")
+    .SetShapeFn([](InferenceContext* c) {
+      // Check rank of inner values
+      ShapeHandle input_inner_values_shape = c->input(0);
+      ShapeHandle unused;
+      TF_RETURN_IF_ERROR(c->WithRank(input_inner_values_shape, 1, &unused));
+
+      // Check rank of input_splits
+      ShapeHandle splits_shape = c->input(1);
+      TF_RETURN_IF_ERROR(c->WithRank(splits_shape, 1, &unused));
+
+      // Output shape is a 1-D tensor with size equal to number of splits.
+      std::vector<DimensionHandle> dims(1);
+      TF_RETURN_IF_ERROR(c->Subtract(c->Dim(splits_shape, 0), 1, &dims[0]));
+      c->set_output(0, c->MakeShape(dims));
+
+      return Status::OK();
+    });
+
 REGISTER_OP("UnicodeTranscode")
     .Input("input: string")
     .Output("output: string")
@@ -259,4 +295,28 @@ REGISTER_OP("UnicodeTranscode")
     .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
     .Attr("replace_control_characters: bool = false")
     .SetShapeFn(shape_inference::UnchangedShape);
+
+REGISTER_OP("UnicodeDecodeWithOffsets")
+    .Input("input: string")
+    .Output("row_splits: int64")
+    .Output("char_values: int32")
+    .Output("char_to_byte_starts: int64")
+    .Attr("input_encoding: string")
+    .Attr("errors: {'strict', 'replace', 'ignore'} = 'replace'")
+    .Attr("replacement_char: int = 65533")  // 0xFFFD unicode replacement char
+    .Attr("replace_control_characters: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      // row_splits.shape == [input.size() + 1]
+      DimensionHandle num_row_splits;
+      DimensionHandle input_size = c->NumElements(c->input(0));
+      TF_RETURN_IF_ERROR(c->Add(input_size, 1, &num_row_splits));
+      c->set_output(0, c->Vector(num_row_splits));
+
+      // char_values.shape == offset_values.shape == [num_chars]
+      DimensionHandle num_chars = c->UnknownDim();
+      c->set_output(1, c->Vector(num_chars));
+      c->set_output(2, c->Vector(num_chars));
+      return Status::OK();
+    });
+
 }  // namespace tensorflow
diff --git a/tensorflow/core/ops/tensor_forest_ops.cc b/tensorflow/core/ops/tensor_forest_ops.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b4b6ba318e9d981af2797a54eca7f9caf049f6b0
--- /dev/null
+++ b/tensorflow/core/ops/tensor_forest_ops.cc
@@ -0,0 +1,79 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <vector>
+
+#include "tensorflow/core/framework/common_shape_fns.h"
+#include "tensorflow/core/framework/op.h"
+#include "tensorflow/core/framework/resource_mgr.h"
+#include "tensorflow/core/framework/shape_inference.h"
+
+namespace tensorflow {
+
+REGISTER_RESOURCE_HANDLE_OP(TensorForestTreeResource);
+
+REGISTER_OP("TensorForestTreeIsInitializedOp")
+    .Input("tree_handle: resource")
+    .Output("is_initialized: bool")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      c->set_output(0, c->Scalar());
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestCreateTreeVariable")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::NoOutputs);
+
+REGISTER_OP("TensorForestTreeSerialize")
+    .Input("tree_handle: resource")
+    .Output("tree_config: string")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreeDeserialize")
+    .Input("tree_handle: resource")
+    .Input("tree_config: string")
+    .SetShapeFn([](shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle unused_input;
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 0, &unused_input));
+      return Status::OK();
+    });
+
+REGISTER_OP("TensorForestTreeSize")
+    .Input("tree_handle: resource")
+    .Output("tree_size: int32")
+    .SetShapeFn(tensorflow::shape_inference::ScalarShape);
+
+REGISTER_OP("TensorForestTreePredict")
+    .Attr("logits_dimension: int")
+    .Input("tree_handle: resource")
+    .Input("dense_features: float")
+    .Output("logits: float")
+    .SetShapeFn([](tensorflow::shape_inference::InferenceContext* c) {
+      shape_inference::ShapeHandle shape_handle;
+      shape_inference::DimensionHandle batch_size = c->UnknownDim();
+
+      TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 2, &shape_handle));
+
+      batch_size = c->Dim(shape_handle, 0);
+
+      int logits_dimension;
+      TF_RETURN_IF_ERROR(c->GetAttr("logits_dimension", &logits_dimension));
+      c->set_output(0, c->Matrix(batch_size, logits_dimension));
+      return Status::OK();
+    });
+}  // namespace tensorflow
diff --git a/tensorflow/core/ops/training_ops.cc b/tensorflow/core/ops/training_ops.cc
index 94ff092a85d512e602da5e97fc3007d4c68c5937..995ed42d53dd286e5068f0067b35849c4e36e64b 100644
--- a/tensorflow/core/ops/training_ops.cc
+++ b/tensorflow/core/ops/training_ops.cc
@@ -685,6 +685,34 @@ REGISTER_OP("ResourceSparseApplyMomentum")
       return ApplyMomentumShapeFn(c, true /* sparse */);
     });
 
+REGISTER_OP("ResourceApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, false /* sparse */);
+    });
+
+REGISTER_OP("ResourceSparseApplyKerasMomentum")
+    .Input("var: resource")
+    .Input("accum: resource")
+    .Input("lr: T")
+    .Input("grad: T")
+    .Input("indices: Tindices")
+    .Input("momentum: T")
+    .Attr("T: numbertype")
+    .Attr("Tindices: {int32, int64}")
+    .Attr("use_locking: bool = false")
+    .Attr("use_nesterov: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyMomentumShapeFn(c, true /* sparse */);
+    });
+
 static Status ApplyAdamShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
@@ -741,6 +769,44 @@ REGISTER_OP("ResourceApplyAdam")
       return ApplyAdamShapeFn(c, false /* sparse */);
     });
 
+static Status ApplyAdamWithAmsgradShapeFn(InferenceContext* c, bool sparse) {
+  ShapeHandle unused;
+  ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 1), &s));  // m
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 2), &s));  // v
+  TF_RETURN_IF_ERROR(c->Merge(s, ShapeOrHandleShape(c, 3), &s));  // vhat
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(4), 0, &unused));       // beta1_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(5), 0, &unused));       // beta2_power
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(6), 0, &unused));       // lr
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(7), 0, &unused));       // beta1
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(8), 0, &unused));       // beta2
+  TF_RETURN_IF_ERROR(c->WithRank(c->input(9), 0, &unused));       // epsilon
+  TF_RETURN_IF_ERROR(
+      HandleGradAndIndicesInputs(c, sparse, 10 /* grad_idx */, &s));
+  if (c->num_outputs() > 0) {
+    c->set_output(0, s);
+  }
+  return Status::OK();
+}
+
+REGISTER_OP("ResourceApplyAdamWithAmsgrad")
+    .Input("var: resource")
+    .Input("m: resource")
+    .Input("v: resource")
+    .Input("vhat: resource")
+    .Input("beta1_power: T")
+    .Input("beta2_power: T")
+    .Input("lr: T")
+    .Input("beta1: T")
+    .Input("beta2: T")
+    .Input("epsilon: T")
+    .Input("grad: T")
+    .Attr("T: numbertype")
+    .Attr("use_locking: bool = false")
+    .SetShapeFn([](InferenceContext* c) {
+      return ApplyAdamWithAmsgradShapeFn(c, false /* sparse */);
+    });
+
 static Status ApplyAdaMaxShapeFn(InferenceContext* c, bool sparse) {
   ShapeHandle unused;
   ShapeHandle s = ShapeOrHandleShape(c, 0);                       // var
diff --git a/tensorflow/core/platform/cpu_feature_guard.cc b/tensorflow/core/platform/cpu_feature_guard.cc
index 9d00aa7b7feadec3a2a2861247c5d536a4237c3e..2efe0c0876e871f6752bb3e7724de4c505102130 100644
--- a/tensorflow/core/platform/cpu_feature_guard.cc
+++ b/tensorflow/core/platform/cpu_feature_guard.cc
@@ -41,7 +41,7 @@ void CheckFeatureOrDie(CPUFeature feature, const string& feature_name) {
   }
 }
 
-// Check if CPU feature is inclued in the TensorFlow binary.
+// Check if CPU feature is included in the TensorFlow binary.
 void CheckIfFeatureUnused(CPUFeature feature, const string& feature_name,
                           string& missing_instructions) {
   if (TestCPUFeature(feature)) {
diff --git a/tensorflow/core/platform/default/build_config.bzl b/tensorflow/core/platform/default/build_config.bzl
index 3a4415f229b5f625576cf85bd1852894300e109a..04287151301dd0c6eb25ec7bc8b12a207f44ab90 100644
--- a/tensorflow/core/platform/default/build_config.bzl
+++ b/tensorflow/core/platform/default/build_config.bzl
@@ -543,6 +543,9 @@ def tf_additional_proto_srcs():
 def tf_additional_human_readable_json_deps():
     return []
 
+def tf_additional_logger_deps():
+    return []
+
 def tf_additional_all_protos():
     return ["//tensorflow/core:protos_all"]
 
diff --git a/tensorflow/core/platform/default/device_tracer.cc b/tensorflow/core/platform/default/device_tracer.cc
index cf8b477b83d3460ac20d5fb7cfbe4c6cb0ad9d7a..8351362e05699c591b5563f2270928f4408077e8 100644
--- a/tensorflow/core/platform/default/device_tracer.cc
+++ b/tensorflow/core/platform/default/device_tracer.cc
@@ -297,19 +297,16 @@ CUPTIManager *GetCUPTIManager() {
 // for the duration of the CUPTI API callback.
 TF_STATIC_THREAD_LOCAL_POD(const char *, tls_current_annotation);
 
-class DeviceTracerImpl : public DeviceTracer,
-                         public CUPTIClient,
-                         public tracing::TraceCollector {
+class TraceCollectorImpl : public tracing::TraceCollector {
  public:
-  DeviceTracerImpl(CUPTIManager *cupti_manager);
-  ~DeviceTracerImpl() override;
+  TraceCollectorImpl() { tracing::SetTraceCollector(this); }
 
-  // DeviceTracer interface:
-  Status Start() override;
-  Status Stop() override;
-  Status Collect(StepStatsCollector *collector) override;
+  ~TraceCollectorImpl() override {
+    DCHECK(!active_trace_session_)
+        << "Unexpected active trace session detected. ";
+  }
 
-  // tracing::TraceCollector interface:
+  // Note the method can be called after a call to Stop().
   virtual std::unique_ptr<Handle> CreateAnnotationHandle(
       StringPiece name_part1, StringPiece name_part2) const {
     struct Impl : public tracing::TraceCollector::Handle {
@@ -332,8 +329,7 @@ class DeviceTracerImpl : public DeviceTracer,
   }
 
   bool IsEnabledForAnnotations() const override {
-    // We are always enabled for 'Annotations'.
-    return true;
+    return active_trace_session_.load(std::memory_order_relaxed);
   }
 
   bool IsEnabledForActivities(bool is_expensive) const override {
@@ -341,6 +337,36 @@ class DeviceTracerImpl : public DeviceTracer,
     return false;
   }
 
+  void Start() {
+    DCHECK(!active_trace_session_)
+        << "Unexpected active trace session detected. ";
+    active_trace_session_ = true;
+  }
+
+  void Stop() {
+    DCHECK(active_trace_session_) << "No active trace session detected. ";
+    active_trace_session_ = false;
+  }
+
+ private:
+  std::atomic<bool> active_trace_session_;
+};
+
+TraceCollectorImpl *GlobalDefaultTraceCollector() {
+  static auto *instance = new TraceCollectorImpl();
+  return instance;
+}
+
+class DeviceTracerImpl : public DeviceTracer, public CUPTIClient {
+ public:
+  DeviceTracerImpl(CUPTIManager *cupti_manager);
+  ~DeviceTracerImpl() override;
+
+  // DeviceTracer interface:
+  Status Start() override;
+  Status Stop() override;
+  Status Collect(StepStatsCollector *collector) override;
+
  protected:
   // This callback is used exclusively by CUPTIManager.
   friend class CUPTIManager;
@@ -430,7 +456,7 @@ Status DeviceTracerImpl::Start() {
   }
 
   // Register as a TraceEngine to receive ScopedAnnotations.
-  tracing::SetTraceCollector(this);
+  GlobalDefaultTraceCollector()->Start();
 
   // Intercept launch and memcpy calls to capture the Op name annotation.
   // TODO(pbar) Add callbacks for memcpy variants.
@@ -478,7 +504,8 @@ Status DeviceTracerImpl::Stop() {
     return Status::OK();
   }
   CUPTI_CALL(Unsubscribe(subscriber_));
-  tracing::SetTraceCollector(nullptr);
+  GlobalDefaultTraceCollector()->Stop();
+
   TF_RETURN_IF_ERROR(cupti_manager_->DisableTrace());
   end_walltime_us_ = NowInUsec();
   CUPTI_CALL(GetTimestamp(&end_timestamp_));
diff --git a/tensorflow/core/platform/default/human_readable_json.cc b/tensorflow/core/platform/default/human_readable_json.cc
index 9f97c8272c10c9036901ac0405c27806d59fdab0..bf9c7b76206b79ad43969a1e3e2de6e6cbdacc46 100644
--- a/tensorflow/core/platform/default/human_readable_json.cc
+++ b/tensorflow/core/platform/default/human_readable_json.cc
@@ -20,7 +20,7 @@ limitations under the License.
 
 namespace tensorflow {
 
-Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
+Status ProtoToHumanReadableJson(const protobuf::Message& proto,
                                 string* result) {
 #ifdef TENSORFLOW_LITE_PROTOS
   *result = "[human readable output not available on Android]";
@@ -28,7 +28,7 @@ Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
 #else
   result->clear();
 
-  auto status = google::protobuf::util::MessageToJsonString(proto, result);
+  auto status = protobuf::util::MessageToJsonString(proto, result);
   if (!status.ok()) {
     // Convert error_msg google::protobuf::StringPiece to
     // tensorflow::StringPiece.
@@ -41,8 +41,7 @@ Status ProtoToHumanReadableJson(const ::google::protobuf::Message& proto,
 #endif
 }
 
-Status HumanReadableJsonToProto(const string& str,
-                                ::google::protobuf::Message* proto) {
+Status HumanReadableJsonToProto(const string& str, protobuf::Message* proto) {
 #ifdef TENSORFLOW_LITE_PROTOS
   return errors::Internal("Cannot parse JSON protos on Android");
 #else
diff --git a/tensorflow/core/platform/default/logger.cc b/tensorflow/core/platform/default/logger.cc
new file mode 100644
index 0000000000000000000000000000000000000000..54b1a1a67ca7da65aa6897e6461ebe9b54fb4767
--- /dev/null
+++ b/tensorflow/core/platform/default/logger.cc
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/logger.h"
+
+#include "tensorflow/core/platform/logging.h"
+
+namespace tensorflow {
+
+Logger* Logger::Singleton() {
+  class DefaultLogger : public Logger {
+   private:
+    void DoLogProto(google::protobuf::Any* proto) override {
+      VLOG(2) << proto->ShortDebugString();
+    }
+    void DoFlush() override {}
+  };
+  static Logger* instance = new DefaultLogger();
+  return instance;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/default/logging.cc b/tensorflow/core/platform/default/logging.cc
index 34db4901067989f0ae17729b9cf7677446281d2f..26bd8542fd70f7a2565192e3626d3ac84f0edf5f 100644
--- a/tensorflow/core/platform/default/logging.cc
+++ b/tensorflow/core/platform/default/logging.cc
@@ -21,18 +21,18 @@ limitations under the License.
 #include <android/log.h>
 #include <iostream>
 #include <sstream>
-#include <cstring>
 #endif
 
 #include <stdlib.h>
+#include <string.h>
 #include <time.h>
 
+#include <string>
+#include <unordered_map>
+
 namespace tensorflow {
 namespace internal {
 
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
 #if defined(PLATFORM_POSIX_ANDROID)
 void LogMessage::GenerateLogMessage() {
   int android_log_level;
@@ -94,55 +94,156 @@ void LogMessage::GenerateLogMessage() {
 
 namespace {
 
+int ParseInteger(const char* str, size_t size) {
+  // Ideally we would use env_var / safe_strto64, but it is
+  // hard to use here without pulling in a lot of dependencies,
+  // so we use std:istringstream instead
+  string integer_str(str, size);
+  std::istringstream ss(integer_str);
+  int level = 0;
+  ss >> level;
+  return level;
+}
+
 // Parse log level (int64) from environment variable (char*)
 int64 LogLevelStrToInt(const char* tf_env_var_val) {
   if (tf_env_var_val == nullptr) {
     return 0;
   }
+  return ParseInteger(tf_env_var_val, strlen(tf_env_var_val));
+}
 
-  // Ideally we would use env_var / safe_strto64, but it is
-  // hard to use here without pulling in a lot of dependencies,
-  // so we use std:istringstream instead
-  string min_log_level(tf_env_var_val);
-  std::istringstream ss(min_log_level);
-  int64 level;
-  if (!(ss >> level)) {
-    // Invalid vlog level setting, set level to default (0)
-    level = 0;
+// Using StringPiece breaks Windows build.
+struct StringData {
+  struct Hasher {
+    size_t operator()(const StringData& sdata) const {
+      // For dependency reasons, we cannot use hash.h here. Use DBJHash instead.
+      size_t hash = 5381;
+      const char* data = sdata.data;
+      for (const char* top = data + sdata.size; data < top; ++data) {
+        hash = ((hash << 5) + hash) + (*data);
+      }
+      return hash;
+    }
+  };
+
+  StringData() = default;
+  StringData(const char* data, size_t size) : data(data), size(size) {}
+
+  bool operator==(const StringData& rhs) const {
+    return size == rhs.size && memcmp(data, rhs.data, size) == 0;
   }
 
-  return level;
+  const char* data = nullptr;
+  size_t size = 0;
+};
+
+using VmoduleMap = std::unordered_map<StringData, int, StringData::Hasher>;
+
+// Returns a mapping from module name to VLOG level, derived from the
+// TF_CPP_VMOUDLE environment variable; ownership is transferred to the caller.
+VmoduleMap* VmodulesMapFromEnv() {
+  // The value of the env var is supposed to be of the form:
+  //    "foo=1,bar=2,baz=3"
+  const char* env = getenv("TF_CPP_VMODULE");
+  if (env == nullptr) {
+    // If there is no TF_CPP_VMODULE configuration (most common case), return
+    // nullptr so that the ShouldVlogModule() API can fast bail out of it.
+    return nullptr;
+  }
+  // The memory returned by getenv() can be invalidated by following getenv() or
+  // setenv() calls. And since we keep references to it in the VmoduleMap in
+  // form of StringData objects, make a copy of it.
+  const char* env_data = strdup(env);
+  VmoduleMap* result = new VmoduleMap();
+  while (true) {
+    const char* eq = strchr(env_data, '=');
+    if (eq == nullptr) {
+      break;
+    }
+    const char* after_eq = eq + 1;
+
+    // Comma either points at the next comma delimiter, or at a null terminator.
+    // We check that the integer we parse ends at this delimiter.
+    const char* comma = strchr(after_eq, ',');
+    const char* new_env_data;
+    if (comma == nullptr) {
+      comma = strchr(after_eq, '\0');
+      new_env_data = comma;
+    } else {
+      new_env_data = comma + 1;
+    }
+    (*result)[StringData(env_data, eq - env_data)] =
+        ParseInteger(after_eq, comma - after_eq);
+    env_data = new_env_data;
+  }
+  return result;
 }
 
 }  // namespace
 
 int64 MinLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since LOG uses >=
+  // (see ~LogMessage in this file) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is the maximum severity.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return tensorflow::NUM_SEVERITIES;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_LOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
 int64 MinVLogLevelFromEnv() {
+  // We don't want to print logs during fuzzing as that would slow fuzzing down
+  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
+  // return a value so that nothing is actually printed. Since VLOG uses <=
+  // (see VLOG_IS_ON in logging.h) to see if log messages need to be printed,
+  // the value we're interested on to disable printing is 0.
+  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
+#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
+  return 0;
+#else
   const char* tf_env_var_val = getenv("TF_CPP_MIN_VLOG_LEVEL");
   return LogLevelStrToInt(tf_env_var_val);
+#endif
 }
 
+LogMessage::LogMessage(const char* fname, int line, int severity)
+    : fname_(fname), line_(line), severity_(severity) {}
+
 LogMessage::~LogMessage() {
   // Read the min log level once during the first call to logging.
   static int64 min_log_level = MinLogLevelFromEnv();
-  if (TF_PREDICT_TRUE(severity_ >= min_log_level)) GenerateLogMessage();
+  if (severity_ >= min_log_level) {
+    GenerateLogMessage();
+  }
 }
 
 int64 LogMessage::MinVLogLevel() {
-  // We don't want to print logs during fuzzing as that would slow fuzzing down
-  // by almost 2x. So, if we are in fuzzing mode (not just running a test), we
-  // return maximum value so that nothing is actually printed
-  // See also http://llvm.org/docs/LibFuzzer.html#fuzzer-friendly-build-mode
-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
-  return tensorflow::NUM_SEVERITIES;
-#else
   static int64 min_vlog_level = MinVLogLevelFromEnv();
   return min_vlog_level;
-#endif
+}
+
+bool LogMessage::VmoduleActivated(const char* fname, int level) {
+  if (level <= MinVLogLevel()) {
+    return true;
+  }
+  static VmoduleMap* vmodules = VmodulesMapFromEnv();
+  if (TF_PREDICT_TRUE(vmodules == nullptr)) {
+    return false;
+  }
+  const char* last_slash = strrchr(fname, '/');
+  const char* module_start = last_slash == nullptr ? fname : last_slash + 1;
+  const char* dot_after = strchr(module_start, '.');
+  const char* module_limit =
+      dot_after == nullptr ? strchr(fname, '\0') : dot_after;
+  StringData module(module_start, module_limit - module_start);
+  auto it = vmodules->find(module);
+  return it != vmodules->end() && it->second >= level;
 }
 
 LogMessageFatal::LogMessageFatal(const char* file, int line)
diff --git a/tensorflow/core/platform/default/logging.h b/tensorflow/core/platform/default/logging.h
index 08a692fff75c79a5602d252908284925325deb76..bb8735ed32505294eff75620006694a4eda80bcc 100644
--- a/tensorflow/core/platform/default/logging.h
+++ b/tensorflow/core/platform/default/logging.h
@@ -46,6 +46,17 @@ class LogMessage : public std::basic_ostringstream<char> {
   // but VLOG(3) will not. Defaults to 0.
   static int64 MinVLogLevel();
 
+  // Returns whether VLOG level lvl is activated for the file fname.
+  //
+  // E.g. if the environment variable TF_CPP_VMODULE contains foo=3 and fname is
+  // foo.cc and lvl is <= 3, this will return true. It will also return true if
+  // the level is lower or equal to TF_CPP_MIN_VLOG_LEVEL (default zero).
+  //
+  // It is expected that the result of this query will be cached in the VLOG-ing
+  // call site to avoid repeated lookups. This routine performs a hash-map
+  // access against the VLOG-ing specification provided by the env var.
+  static bool VmoduleActivated(const char* fname, int level);
+
  protected:
   void GenerateLogMessage();
 
@@ -55,6 +66,13 @@ class LogMessage : public std::basic_ostringstream<char> {
   int severity_;
 };
 
+// Uses the lower operator & precedence to voidify a LogMessage reference, so
+// that the ternary VLOG() implementation is balanced, type wise.
+struct Voidifier {
+  template <typename T>
+  void operator&(const T&)const {}
+};
+
 // LogMessageFatal ensures the process will exit in failure after
 // logging this message.
 class LogMessageFatal : public LogMessage {
@@ -77,18 +95,30 @@ class LogMessageFatal : public LogMessage {
 #define LOG(severity) _TF_LOG_##severity
 
 #ifdef IS_MOBILE_PLATFORM
+
 // Turn VLOG off when under mobile devices for considerations of binary size.
 #define VLOG_IS_ON(lvl) ((lvl) <= 0)
+
 #else
-// Otherwise, Set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level
-// of VLOG
-#define VLOG_IS_ON(lvl) \
-  ((lvl) <= ::tensorflow::internal::LogMessage::MinVLogLevel())
+
+// Otherwise, set TF_CPP_MIN_VLOG_LEVEL environment to update minimum log level
+// of VLOG, or TF_CPP_VMODULE to set the minimum log level for individual
+// translation units.
+#define VLOG_IS_ON(lvl)                                                     \
+  (([](int level, const char* fname) {                                      \
+    static const bool vmodule_activated =                                   \
+        ::tensorflow::internal::LogMessage::VmoduleActivated(fname, level); \
+    return vmodule_activated;                                               \
+  })(lvl, __FILE__))
+
 #endif
 
-#define VLOG(lvl)                        \
-  if (TF_PREDICT_FALSE(VLOG_IS_ON(lvl))) \
-  ::tensorflow::internal::LogMessage(__FILE__, __LINE__, tensorflow::INFO)
+#define VLOG(level)                                              \
+  TF_PREDICT_TRUE(!VLOG_IS_ON(level))                            \
+  ? (void)0                                                      \
+  : ::tensorflow::internal::Voidifier() &                        \
+          ::tensorflow::internal::LogMessage(__FILE__, __LINE__, \
+                                             tensorflow::INFO)
 
 // CHECK dies with a fatal error if condition is not true.  It is *not*
 // controlled by NDEBUG, so the check will be executed regardless of
diff --git a/tensorflow/core/platform/env.h b/tensorflow/core/platform/env.h
index 5732271f150a64e22f7eea2eea243e3c6c75631f..1b5382841574e6b8843079ae9cb359c5c9b475d0 100644
--- a/tensorflow/core/platform/env.h
+++ b/tensorflow/core/platform/env.h
@@ -28,6 +28,7 @@ limitations under the License.
 #include "tensorflow/core/platform/file_system.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/numa.h"
 #include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -166,11 +167,24 @@ class Env {
   Status DeleteFile(const string& fname);
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
-  /// Typical return codes
+  ///
+  /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
   ///  * PERMISSION_DENIED - dirname or some descendant is not writable
@@ -395,6 +409,7 @@ struct ThreadOptions {
   size_t stack_size = 0;  // 0: use system default value
   /// Guard area size to use near thread stacks to use (in bytes)
   size_t guard_size = 0;  // 0: use system default value
+  int numa_node = port::kNUMANoAffinity;
 };
 
 /// A utility routine: copy contents of `src` in file system `src_fs`
diff --git a/tensorflow/core/platform/file_system.h b/tensorflow/core/platform/file_system.h
index 156af6cdeaa015429d60e4599f59c5a4b806f5e6..c84a93b1bf59be7cb19352825cc4bb82b48e2246 100644
--- a/tensorflow/core/platform/file_system.h
+++ b/tensorflow/core/platform/file_system.h
@@ -167,10 +167,23 @@ class FileSystem {
   virtual Status DeleteDir(const string& dirname) = 0;
 
   /// \brief Deletes the specified directory and all subdirectories and files
-  /// underneath it. undeleted_files and undeleted_dirs stores the number of
-  /// files and directories that weren't deleted (unspecified if the return
-  /// status is not OK).
+  /// underneath it. This is accomplished by traversing the directory tree
+  /// rooted at dirname and deleting entries as they are encountered.
+  ///
+  /// If dirname itself is not readable or does not exist, *undeleted_dir_count
+  /// is set to 1, *undeleted_file_count is set to 0 and an appropriate status
+  /// (e.g. NOT_FOUND) is returned.
+  ///
+  /// If dirname and all its descendants were successfully deleted, TF_OK is
+  /// returned and both error counters are set to zero.
+  ///
+  /// Otherwise, while traversing the tree, undeleted_file_count and
+  /// undeleted_dir_count are updated if an entry of the corresponding type
+  /// could not be deleted. The returned error status represents the reason that
+  /// any one of these entries could not be deleted.
+  ///
   /// REQUIRES: undeleted_files, undeleted_dirs to be not null.
+  ///
   /// Typical return codes:
   ///  * OK - dirname exists and we were able to delete everything underneath.
   ///  * NOT_FOUND - dirname doesn't exist
diff --git a/tensorflow/core/platform/logger.h b/tensorflow/core/platform/logger.h
new file mode 100644
index 0000000000000000000000000000000000000000..5d304bea63a7c78e4a90d78ea2be4ce01caa802d
--- /dev/null
+++ b/tensorflow/core/platform/logger.h
@@ -0,0 +1,51 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_LOGGER_H_
+#define TENSORFLOW_CORE_PLATFORM_LOGGER_H_
+
+#include "google/protobuf/any.pb.h"
+#include "tensorflow/core/platform/protobuf.h"
+
+namespace tensorflow {
+
+// Abstract logging interface. Contrary to logging.h, this class describes an
+// interface, not a concrete logging mechanism. This is useful when we want to
+// log anything to a non-local place, e.g. a database.
+class Logger {
+ public:
+  static Logger* Singleton();
+
+  virtual ~Logger() = default;
+
+  // Logs a typed proto.
+  template <typename ProtoType>
+  void LogProto(const ProtoType& proto) {
+    google::protobuf::Any any;
+    any.PackFrom(proto);
+    DoLogProto(&any);
+  }
+
+  // Flushes any pending log. Blocks until everything is flushed.
+  void Flush() { DoFlush(); }
+
+ private:
+  virtual void DoLogProto(google::protobuf::Any* proto) = 0;
+  virtual void DoFlush() = 0;
+};
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_LOGGER_H_
diff --git a/tensorflow/core/platform/numa_test.cc b/tensorflow/core/platform/numa_test.cc
index 8b39ecd59cb1d95b30f33475981ca0a5fce117af..91789efd1eee2f30b0277562e5a2b1f0d14aae26 100644
--- a/tensorflow/core/platform/numa_test.cc
+++ b/tensorflow/core/platform/numa_test.cc
@@ -44,7 +44,7 @@ TEST(Numa, Malloc) {
 
 TEST(Numa, SetNodeAffinity) {
   // NOTE(tucker): This test is not reliable when executed under tap because
-  // the virtual machine may not have access to all of the availble NUMA
+  // the virtual machine may not have access to all of the available NUMA
   // nodes.  Not sure what to do about that.
   EXPECT_EQ(-1, port::NUMAGetThreadNodeAffinity());
   if (port::NUMAEnabled()) {
diff --git a/tensorflow/core/platform/platform_strings.cc b/tensorflow/core/platform/platform_strings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1852633d595e0b65415284a3233ba11385a3c44
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.cc
@@ -0,0 +1,64 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/platform/platform_strings.h"
+
+#include <cstdio>
+#include <cstring>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found) {
+  int result;
+  FILE* ifp = fopen(path.c_str(), "rb");
+  if (ifp != nullptr) {
+    static const char prefix[] = TF_PLAT_STR_MAGIC_PREFIX_;
+    int first_char = prefix[1];
+    int last_char = -1;
+    int c;
+    while ((c = getc(ifp)) != EOF) {
+      if (c == first_char && last_char == 0) {
+        int i = 2;
+        while (prefix[i] != 0 && (c = getc(ifp)) == prefix[i]) {
+          i++;
+        }
+        if (prefix[i] == 0) {
+          std::string str;
+          while ((c = getc(ifp)) != EOF && c != 0) {
+            str.push_back(c);
+          }
+          if (!str.empty()) {
+            found->push_back(str);
+          }
+        }
+      }
+      last_char = c;
+    }
+
+    result = (ferror(ifp) == 0) ? 0 : errno;
+    fclose(ifp);
+  } else {
+    result = errno;
+  }
+  return result;
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/platform/platform_strings.h b/tensorflow/core/platform/platform_strings.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b1dbd130e0df0e991ac3e2dcce2840e66b1f9b9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings.h
@@ -0,0 +1,364 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
+
+// This header defines the macro TF_PLATFORM_STRINGS() which should be used
+// once in each dynamically loadable TensorFlow module.  It embeds static
+// strings into the compilation unit that allow TensorFlow to determine what
+// compilation options were in effect when the compilation unit was built.  All
+// compilation units within the same dynamically loadable library should be
+// built with the same options (or at least, the strings should be embedded in
+// the compilation unit built with the most restrictive options).
+
+// The platform strings embedded into a binary may be retrieved with the
+// GetPlatformStrings function.
+
+// Rationale:
+// We wish to load only those libraries that this CPU can execute.  For
+// example, we should not load a library compiled with avx256 instructions on a
+// CPU that cannot execute them.
+//
+// One might think that one could dlopen() the library, and call a routine that
+// would return which cpu type it was compiled for.  Alas, this does not work,
+// because at dlopen() time, a library containing C++ will execute constructors
+// of class variables with static storage class.  Even code that looks
+// innocuous may use optional platform-specific instructions.  For example,
+// the fastest way to zero a region of memory might use optional instructions.
+//
+// One might think one could run a tool such as "objdump" to read flags from
+// the libraries' headers, or perhaps disassemble each library to look for
+// particular instructions.  Unfortunately, the desired flags are not present
+// in the headers, and disassembly can be prohibitively slow ("objdump -d" is
+// very slow, for example).  Moreover, a tool to examine the library may not
+// be present on the system unless the user has installed special packages (for
+// example, on Windows).
+//
+// Instead, we adopt a crude but straightforward solution:  We require
+// developers to use the macro TF_PLATFORM_STRINGS() in their library, to
+// embed the compilation options as constant strings.  The compiler's
+// predefined macros pick which strings are included.  We then search for the
+// strings in the files, and then dlopen() only those libraries that have or
+// lack strings as needed.
+//
+// We adopt the approach of placing in the binary a fairly raw copy of the
+// predefined macros, rather than trying to interpret them in complex ways at
+// compile time.  This allows the loading binary to alter its interpretation of
+// the strings without library developers having to recompile.
+
+#include <stdio.h>
+
+#include <string>
+#include <vector>
+
+// Aside from the header guard, the internal macros defined here have the form:
+//   TF_PLAT_STR_*
+
+// If a macro is removed from the list of tested macros, the major version in
+// the following version number should be incremented, and the minor version
+// set to zero.  Otherwise, if a macro is added to the list of tested macros,
+// the minor number should be incremented.
+#define TF_PLAT_STR_VERSION_ "1.0"
+
+// Prefix of each option string indicator in the binary.
+// After the prefix, such strings have the form:
+//    [A-Za-z_0-9]=<value>
+// followed by a terminating nul.  To simplify searching, this prefix is all
+// ASCII, starts with a nul, and contains no character twice.
+#define TF_PLAT_STR_MAGIC_PREFIX_ "\0S\\s\":^p*L}"
+
+// A helper macro for TF_PLAT_STR_AS_STR_().
+#define TF_PLAT_STR_STR_1_(x) #x
+
+// Yield a constant string corresponding to x, after macro expansion.
+#define TF_PLAT_STR_AS_STR_(x) TF_PLAT_STR_STR_1_(x)
+
+// An empty definition to make lists more uniform.
+#define TF_PLAT_STR_TERMINATOR_
+
+// TF_PLAT_STR_(x) introduces a constant string indicating whether a
+// particular compilation option has been turned on.
+//
+// In gcc and clang, we might imagine using something like
+// #define TF_PLAT_STR_(x) \
+//     (sizeof (#x) != sizeof (TF_PLAT_STR_AS_STR_ (x))? \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_ (x) : \
+//      TF_PLAT_STR_MAGIC_PREFIX_ #x "=0"),
+// but some compilers (notably MSVC) place both "foo" and "bar" in the binary
+// when presented with
+//    (true?  "foo" : "bar")
+// so we must use #if to select the strings we need, which is rather verbose.
+#define TF_PLAT_STR_(x) TF_PLAT_STR_MAGIC_PREFIX_ #x "=" TF_PLAT_STR_AS_STR_(x)
+
+// Include the #if machinery that sets the macros used below.
+// platform_strings_computed.h can be generated by filtering this header file
+// through:
+// awk '
+// header == "" { print; }
+// /\*\// && header == "" {
+//     print "// Generated from platform_strings.h.";
+//     print "";
+//     print "#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+//     print "";
+//     header = 1;
+// }
+// /^#define TF_PLAT_STR_LIST_[a-zA-Z0-9_]*\(\) *\\$/ { active = 1; }
+// /TF_PLAT_STR_TERMINATOR_/ { active = 0; }
+// /^ *TF_PLAT_STR_[A-Za-z0-9_]* *\\$/ && active {
+//     x = $0;
+//     sub(/^ *TF_PLAT_STR_/, "", x);
+//     sub(/ *\\$/, "", x);
+//     printf ("#if defined(%s)\n", x);
+//     printf ("#define TF_PLAT_STR_%s TF_PLAT_STR_(%s)\n", x, x);
+//     printf ("#else\n");
+//     printf ("#define TF_PLAT_STR_%s\n", x);
+//     printf ("#endif\n");
+// }
+// END {
+//     print "";
+//     print "#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_";
+// }'
+#include "tensorflow/core/platform/platform_strings_computed.h"
+
+// clang-format butchers the following lines.
+// clang-format off
+
+// x86_64 and x86_32 optional features.
+#define TF_PLAT_STR_LIST___x86_64__()                                      \
+        TF_PLAT_STR__M_IX86_FP                                             \
+        TF_PLAT_STR__NO_PREFETCHW                                          \
+        TF_PLAT_STR___3dNOW_A__                                            \
+        TF_PLAT_STR___3dNOW__                                              \
+        TF_PLAT_STR___ABM__                                                \
+        TF_PLAT_STR___ADX__                                                \
+        TF_PLAT_STR___AES__                                                \
+        TF_PLAT_STR___AVX2__                                               \
+        TF_PLAT_STR___AVX512BW__                                           \
+        TF_PLAT_STR___AVX512CD__                                           \
+        TF_PLAT_STR___AVX512DQ__                                           \
+        TF_PLAT_STR___AVX512ER__                                           \
+        TF_PLAT_STR___AVX512F__                                            \
+        TF_PLAT_STR___AVX512IFMA__                                         \
+        TF_PLAT_STR___AVX512PF__                                           \
+        TF_PLAT_STR___AVX512VBMI__                                         \
+        TF_PLAT_STR___AVX512VL__                                           \
+        TF_PLAT_STR___AVX__                                                \
+        TF_PLAT_STR___BMI2__                                               \
+        TF_PLAT_STR___BMI__                                                \
+        TF_PLAT_STR___CLFLUSHOPT__                                         \
+        TF_PLAT_STR___CLZERO__                                             \
+        TF_PLAT_STR___F16C__                                               \
+        TF_PLAT_STR___FMA4__                                               \
+        TF_PLAT_STR___FMA__                                                \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___FSGSBASE__                                           \
+        TF_PLAT_STR___FXSR__                                               \
+        TF_PLAT_STR___LWP__                                                \
+        TF_PLAT_STR___LZCNT__                                              \
+        TF_PLAT_STR___MMX__                                                \
+        TF_PLAT_STR___MWAITX__                                             \
+        TF_PLAT_STR___PCLMUL__                                             \
+        TF_PLAT_STR___PKU__                                                \
+        TF_PLAT_STR___POPCNT__                                             \
+        TF_PLAT_STR___PRFCHW__                                             \
+        TF_PLAT_STR___RDRND__                                              \
+        TF_PLAT_STR___RDSEED__                                             \
+        TF_PLAT_STR___RTM__                                                \
+        TF_PLAT_STR___SHA__                                                \
+        TF_PLAT_STR___SSE2_MATH__                                          \
+        TF_PLAT_STR___SSE2__                                               \
+        TF_PLAT_STR___SSE_MATH__                                           \
+        TF_PLAT_STR___SSE__                                                \
+        TF_PLAT_STR___SSE3__                                               \
+        TF_PLAT_STR___SSE4A__                                              \
+        TF_PLAT_STR___SSE4_1__                                             \
+        TF_PLAT_STR___SSE4_2__                                             \
+        TF_PLAT_STR___SSSE3__                                              \
+        TF_PLAT_STR___TBM__                                                \
+        TF_PLAT_STR___XOP__                                                \
+        TF_PLAT_STR___XSAVEC__                                             \
+        TF_PLAT_STR___XSAVEOPT__                                           \
+        TF_PLAT_STR___XSAVES__                                             \
+        TF_PLAT_STR___XSAVE__                                              \
+        TF_PLAT_STR_TERMINATOR_
+
+// PowerPC (64- and 32-bit) optional features.
+#define TF_PLAT_STR_LIST___powerpc64__()                                   \
+        TF_PLAT_STR__SOFT_DOUBLE                                           \
+        TF_PLAT_STR__SOFT_FLOAT                                            \
+        TF_PLAT_STR___ALTIVEC__                                            \
+        TF_PLAT_STR___APPLE_ALTIVEC__                                      \
+        TF_PLAT_STR___CRYPTO__                                             \
+        TF_PLAT_STR___FLOAT128_HARDWARE__                                  \
+        TF_PLAT_STR___FLOAT128_TYPE__                                      \
+        TF_PLAT_STR___FP_FAST_FMA                                          \
+        TF_PLAT_STR___FP_FAST_FMAF                                         \
+        TF_PLAT_STR___HTM__                                                \
+        TF_PLAT_STR___NO_FPRS__                                            \
+        TF_PLAT_STR___NO_LWSYNC__                                          \
+        TF_PLAT_STR___POWER8_VECTOR__                                      \
+        TF_PLAT_STR___POWER9_VECTOR__                                      \
+        TF_PLAT_STR___PPC405__                                             \
+        TF_PLAT_STR___QUAD_MEMORY_ATOMIC__                                 \
+        TF_PLAT_STR___RECIPF__                                             \
+        TF_PLAT_STR___RECIP_PRECISION__                                    \
+        TF_PLAT_STR___RECIP__                                              \
+        TF_PLAT_STR___RSQRTEF__                                            \
+        TF_PLAT_STR___RSQRTE__                                             \
+        TF_PLAT_STR___TM_FENCE__                                           \
+        TF_PLAT_STR___UPPER_REGS_DF__                                      \
+        TF_PLAT_STR___UPPER_REGS_SF__                                      \
+        TF_PLAT_STR___VEC__                                                \
+        TF_PLAT_STR___VSX__                                                \
+        TF_PLAT_STR_TERMINATOR_
+
+// aarch64 and 32-bit arm optional features
+#define TF_PLAT_STR_LIST___aarch64__()                                     \
+        TF_PLAT_STR___ARM_ARCH                                             \
+        TF_PLAT_STR___ARM_FEATURE_CLZ                                      \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRC32                                    \
+        TF_PLAT_STR___ARM_FEATURE_CRYPTO                                   \
+        TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING                        \
+        TF_PLAT_STR___ARM_FEATURE_DSP                                      \
+        TF_PLAT_STR___ARM_FEATURE_FMA                                      \
+        TF_PLAT_STR___ARM_FEATURE_IDIV                                     \
+        TF_PLAT_STR___ARM_FEATURE_LDREX                                    \
+        TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN                           \
+        TF_PLAT_STR___ARM_FEATURE_QBIT                                     \
+        TF_PLAT_STR___ARM_FEATURE_QRDMX                                    \
+        TF_PLAT_STR___ARM_FEATURE_SAT                                      \
+        TF_PLAT_STR___ARM_FEATURE_SIMD32                                   \
+        TF_PLAT_STR___ARM_FEATURE_UNALIGNED                                \
+        TF_PLAT_STR___ARM_FP                                               \
+        TF_PLAT_STR___ARM_NEON_FP                                          \
+        TF_PLAT_STR___ARM_NEON__                                           \
+        TF_PLAT_STR___ARM_WMMX                                             \
+        TF_PLAT_STR___IWMMXT2__                                            \
+        TF_PLAT_STR___IWMMXT__                                             \
+        TF_PLAT_STR___VFP_FP__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+// Generic features, including indication of architecture and OS.
+// The _M_* macros are defined by Visual Studio.
+// It doesn't define __LITTLE_ENDIAN__ or __BYTE_ORDER__;
+// Windows is assumed to be little endian.
+#define TF_PLAT_STR_LIST___generic__()                                     \
+        TF_PLAT_STR_TARGET_IPHONE_SIMULATOR                                \
+        TF_PLAT_STR_TARGET_OS_IOS                                          \
+        TF_PLAT_STR_TARGET_OS_IPHONE                                       \
+        TF_PLAT_STR__MSC_VER                                               \
+        TF_PLAT_STR__M_ARM                                                 \
+        TF_PLAT_STR__M_ARM64                                               \
+        TF_PLAT_STR__M_ARM_ARMV7VE                                         \
+        TF_PLAT_STR__M_ARM_FP                                              \
+        TF_PLAT_STR__M_IX86                                                \
+        TF_PLAT_STR__M_X64                                                 \
+        TF_PLAT_STR__WIN32                                                 \
+        TF_PLAT_STR__WIN64                                                 \
+        TF_PLAT_STR___ANDROID__                                            \
+        TF_PLAT_STR___APPLE__                                              \
+        TF_PLAT_STR___BYTE_ORDER__                                         \
+        TF_PLAT_STR___CYGWIN__                                             \
+        TF_PLAT_STR___FreeBSD__                                            \
+        TF_PLAT_STR___LITTLE_ENDIAN__                                      \
+        TF_PLAT_STR___NetBSD__                                             \
+        TF_PLAT_STR___OpenBSD__                                            \
+        TF_PLAT_STR_____MSYS__                                             \
+        TF_PLAT_STR___aarch64__                                            \
+        TF_PLAT_STR___alpha__                                              \
+        TF_PLAT_STR___arm__                                                \
+        TF_PLAT_STR___i386__                                               \
+        TF_PLAT_STR___i686__                                               \
+        TF_PLAT_STR___ia64__                                               \
+        TF_PLAT_STR___linux__                                              \
+        TF_PLAT_STR___mips32__                                             \
+        TF_PLAT_STR___mips64__                                             \
+        TF_PLAT_STR___powerpc64__                                          \
+        TF_PLAT_STR___powerpc__                                            \
+        TF_PLAT_STR___riscv___                                             \
+        TF_PLAT_STR___s390x__                                              \
+        TF_PLAT_STR___sparc64__                                            \
+        TF_PLAT_STR___sparc__                                              \
+        TF_PLAT_STR___x86_64__                                             \
+        TF_PLAT_STR_TERMINATOR_
+
+#if !defined(__x86_64__) && !defined(_M_X64) && \
+    !defined(__i386__) && !defined(_M_IX86)
+#undef TF_PLAT_STR_LIST___x86_64__
+#define TF_PLAT_STR_LIST___x86_64__()
+#endif
+#if !defined(__powerpc64__) && !defined(__powerpc__)
+#undef TF_PLAT_STR_LIST___powerpc64__
+#define TF_PLAT_STR_LIST___powerpc64__()
+#endif
+#if !defined(__aarch64__) && !defined(_M_ARM64) && \
+    !defined(__arm__) && !defined(_M_ARM)
+#undef TF_PLAT_STR_LIST___aarch64__
+#define TF_PLAT_STR_LIST___aarch64__()
+#endif
+
+// Macro to be used in each dynamically loadable library.
+//
+// The BSS global variable tf_cpu_option_global and the class
+// instance tf_cpu_option_avoid_omit_class are needed to prevent
+// compilers/linkers such as clang from omitting the static variable
+// tf_cpu_option[], which would otherwise appear to be unused.  We cannot make
+// tf_cpu_option[] global, because we then might get multiply-defined symbols
+// if TF_PLAT_STR() is used twice in the same library.
+// (tf_cpu_option_global doesn't see such errors because it is
+// defined in BSS, so multiple definitions are combined by the linker.)  gcc's
+// __attribute__((used)) is insufficient because it seems to be ignored by
+// linkers.
+#define TF_PLATFORM_STRINGS()                                                  \
+    static const char tf_cpu_option[] =                                        \
+        TF_PLAT_STR_MAGIC_PREFIX_ "TF_PLAT_STR_VERSION=" TF_PLAT_STR_VERSION_  \
+        TF_PLAT_STR_LIST___x86_64__()                                          \
+        TF_PLAT_STR_LIST___powerpc64__()                                       \
+        TF_PLAT_STR_LIST___aarch64__()                                         \
+        TF_PLAT_STR_LIST___generic__()                                         \
+    ;                                                                          \
+    const char *tf_cpu_option_global;                                          \
+    namespace {                                                                \
+    class TFCPUOptionHelper {                                                  \
+     public:                                                                   \
+      TFCPUOptionHelper() {                                                    \
+        /* Compilers/linkers remove unused variables aggressively.  The */     \
+        /* following gyrations subvert most such optimizations. */             \
+        tf_cpu_option_global = tf_cpu_option;                                  \
+        /* Nothing is printed because the string starts with a nul. */         \
+        printf("%s", tf_cpu_option);                                           \
+      }                                                                        \
+    } tf_cpu_option_avoid_omit_class;                                          \
+    }  /* anonymous namespace */
+// clang-format on
+
+namespace tensorflow {
+
+class Status;
+
+// Retrieves the platform strings from the file at the given path and appends
+// them to the given vector. If the returned int is non-zero, an error occurred
+// reading the file and vector may or may not be modified. The returned error
+// code is suitable for use with strerror().
+int GetPlatformStrings(const std::string& path,
+                       std::vector<std::string>* found);
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_H_
diff --git a/tensorflow/core/platform/platform_strings_computed.h b/tensorflow/core/platform/platform_strings_computed.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a17f3bfc3a866ee1fd4945e9ade5a3e379eefa3
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_computed.h
@@ -0,0 +1,735 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+// Generated from platform_strings.h.
+
+#ifndef TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+#define TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
+
+#if defined(_M_IX86_FP)
+#define TF_PLAT_STR__M_IX86_FP TF_PLAT_STR_(_M_IX86_FP)
+#else
+#define TF_PLAT_STR__M_IX86_FP
+#endif
+#if defined(_NO_PREFETCHW)
+#define TF_PLAT_STR__NO_PREFETCHW TF_PLAT_STR_(_NO_PREFETCHW)
+#else
+#define TF_PLAT_STR__NO_PREFETCHW
+#endif
+#if defined(__3dNOW_A__)
+#define TF_PLAT_STR___3dNOW_A__ TF_PLAT_STR_(__3dNOW_A__)
+#else
+#define TF_PLAT_STR___3dNOW_A__
+#endif
+#if defined(__3dNOW__)
+#define TF_PLAT_STR___3dNOW__ TF_PLAT_STR_(__3dNOW__)
+#else
+#define TF_PLAT_STR___3dNOW__
+#endif
+#if defined(__ABM__)
+#define TF_PLAT_STR___ABM__ TF_PLAT_STR_(__ABM__)
+#else
+#define TF_PLAT_STR___ABM__
+#endif
+#if defined(__ADX__)
+#define TF_PLAT_STR___ADX__ TF_PLAT_STR_(__ADX__)
+#else
+#define TF_PLAT_STR___ADX__
+#endif
+#if defined(__AES__)
+#define TF_PLAT_STR___AES__ TF_PLAT_STR_(__AES__)
+#else
+#define TF_PLAT_STR___AES__
+#endif
+#if defined(__AVX2__)
+#define TF_PLAT_STR___AVX2__ TF_PLAT_STR_(__AVX2__)
+#else
+#define TF_PLAT_STR___AVX2__
+#endif
+#if defined(__AVX512BW__)
+#define TF_PLAT_STR___AVX512BW__ TF_PLAT_STR_(__AVX512BW__)
+#else
+#define TF_PLAT_STR___AVX512BW__
+#endif
+#if defined(__AVX512CD__)
+#define TF_PLAT_STR___AVX512CD__ TF_PLAT_STR_(__AVX512CD__)
+#else
+#define TF_PLAT_STR___AVX512CD__
+#endif
+#if defined(__AVX512DQ__)
+#define TF_PLAT_STR___AVX512DQ__ TF_PLAT_STR_(__AVX512DQ__)
+#else
+#define TF_PLAT_STR___AVX512DQ__
+#endif
+#if defined(__AVX512ER__)
+#define TF_PLAT_STR___AVX512ER__ TF_PLAT_STR_(__AVX512ER__)
+#else
+#define TF_PLAT_STR___AVX512ER__
+#endif
+#if defined(__AVX512F__)
+#define TF_PLAT_STR___AVX512F__ TF_PLAT_STR_(__AVX512F__)
+#else
+#define TF_PLAT_STR___AVX512F__
+#endif
+#if defined(__AVX512IFMA__)
+#define TF_PLAT_STR___AVX512IFMA__ TF_PLAT_STR_(__AVX512IFMA__)
+#else
+#define TF_PLAT_STR___AVX512IFMA__
+#endif
+#if defined(__AVX512PF__)
+#define TF_PLAT_STR___AVX512PF__ TF_PLAT_STR_(__AVX512PF__)
+#else
+#define TF_PLAT_STR___AVX512PF__
+#endif
+#if defined(__AVX512VBMI__)
+#define TF_PLAT_STR___AVX512VBMI__ TF_PLAT_STR_(__AVX512VBMI__)
+#else
+#define TF_PLAT_STR___AVX512VBMI__
+#endif
+#if defined(__AVX512VL__)
+#define TF_PLAT_STR___AVX512VL__ TF_PLAT_STR_(__AVX512VL__)
+#else
+#define TF_PLAT_STR___AVX512VL__
+#endif
+#if defined(__AVX__)
+#define TF_PLAT_STR___AVX__ TF_PLAT_STR_(__AVX__)
+#else
+#define TF_PLAT_STR___AVX__
+#endif
+#if defined(__BMI2__)
+#define TF_PLAT_STR___BMI2__ TF_PLAT_STR_(__BMI2__)
+#else
+#define TF_PLAT_STR___BMI2__
+#endif
+#if defined(__BMI__)
+#define TF_PLAT_STR___BMI__ TF_PLAT_STR_(__BMI__)
+#else
+#define TF_PLAT_STR___BMI__
+#endif
+#if defined(__CLFLUSHOPT__)
+#define TF_PLAT_STR___CLFLUSHOPT__ TF_PLAT_STR_(__CLFLUSHOPT__)
+#else
+#define TF_PLAT_STR___CLFLUSHOPT__
+#endif
+#if defined(__CLZERO__)
+#define TF_PLAT_STR___CLZERO__ TF_PLAT_STR_(__CLZERO__)
+#else
+#define TF_PLAT_STR___CLZERO__
+#endif
+#if defined(__F16C__)
+#define TF_PLAT_STR___F16C__ TF_PLAT_STR_(__F16C__)
+#else
+#define TF_PLAT_STR___F16C__
+#endif
+#if defined(__FMA4__)
+#define TF_PLAT_STR___FMA4__ TF_PLAT_STR_(__FMA4__)
+#else
+#define TF_PLAT_STR___FMA4__
+#endif
+#if defined(__FMA__)
+#define TF_PLAT_STR___FMA__ TF_PLAT_STR_(__FMA__)
+#else
+#define TF_PLAT_STR___FMA__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__FSGSBASE__)
+#define TF_PLAT_STR___FSGSBASE__ TF_PLAT_STR_(__FSGSBASE__)
+#else
+#define TF_PLAT_STR___FSGSBASE__
+#endif
+#if defined(__FXSR__)
+#define TF_PLAT_STR___FXSR__ TF_PLAT_STR_(__FXSR__)
+#else
+#define TF_PLAT_STR___FXSR__
+#endif
+#if defined(__LWP__)
+#define TF_PLAT_STR___LWP__ TF_PLAT_STR_(__LWP__)
+#else
+#define TF_PLAT_STR___LWP__
+#endif
+#if defined(__LZCNT__)
+#define TF_PLAT_STR___LZCNT__ TF_PLAT_STR_(__LZCNT__)
+#else
+#define TF_PLAT_STR___LZCNT__
+#endif
+#if defined(__MMX__)
+#define TF_PLAT_STR___MMX__ TF_PLAT_STR_(__MMX__)
+#else
+#define TF_PLAT_STR___MMX__
+#endif
+#if defined(__MWAITX__)
+#define TF_PLAT_STR___MWAITX__ TF_PLAT_STR_(__MWAITX__)
+#else
+#define TF_PLAT_STR___MWAITX__
+#endif
+#if defined(__PCLMUL__)
+#define TF_PLAT_STR___PCLMUL__ TF_PLAT_STR_(__PCLMUL__)
+#else
+#define TF_PLAT_STR___PCLMUL__
+#endif
+#if defined(__PKU__)
+#define TF_PLAT_STR___PKU__ TF_PLAT_STR_(__PKU__)
+#else
+#define TF_PLAT_STR___PKU__
+#endif
+#if defined(__POPCNT__)
+#define TF_PLAT_STR___POPCNT__ TF_PLAT_STR_(__POPCNT__)
+#else
+#define TF_PLAT_STR___POPCNT__
+#endif
+#if defined(__PRFCHW__)
+#define TF_PLAT_STR___PRFCHW__ TF_PLAT_STR_(__PRFCHW__)
+#else
+#define TF_PLAT_STR___PRFCHW__
+#endif
+#if defined(__RDRND__)
+#define TF_PLAT_STR___RDRND__ TF_PLAT_STR_(__RDRND__)
+#else
+#define TF_PLAT_STR___RDRND__
+#endif
+#if defined(__RDSEED__)
+#define TF_PLAT_STR___RDSEED__ TF_PLAT_STR_(__RDSEED__)
+#else
+#define TF_PLAT_STR___RDSEED__
+#endif
+#if defined(__RTM__)
+#define TF_PLAT_STR___RTM__ TF_PLAT_STR_(__RTM__)
+#else
+#define TF_PLAT_STR___RTM__
+#endif
+#if defined(__SHA__)
+#define TF_PLAT_STR___SHA__ TF_PLAT_STR_(__SHA__)
+#else
+#define TF_PLAT_STR___SHA__
+#endif
+#if defined(__SSE2_MATH__)
+#define TF_PLAT_STR___SSE2_MATH__ TF_PLAT_STR_(__SSE2_MATH__)
+#else
+#define TF_PLAT_STR___SSE2_MATH__
+#endif
+#if defined(__SSE2__)
+#define TF_PLAT_STR___SSE2__ TF_PLAT_STR_(__SSE2__)
+#else
+#define TF_PLAT_STR___SSE2__
+#endif
+#if defined(__SSE_MATH__)
+#define TF_PLAT_STR___SSE_MATH__ TF_PLAT_STR_(__SSE_MATH__)
+#else
+#define TF_PLAT_STR___SSE_MATH__
+#endif
+#if defined(__SSE__)
+#define TF_PLAT_STR___SSE__ TF_PLAT_STR_(__SSE__)
+#else
+#define TF_PLAT_STR___SSE__
+#endif
+#if defined(__SSE3__)
+#define TF_PLAT_STR___SSE3__ TF_PLAT_STR_(__SSE3__)
+#else
+#define TF_PLAT_STR___SSE3__
+#endif
+#if defined(__SSE4A__)
+#define TF_PLAT_STR___SSE4A__ TF_PLAT_STR_(__SSE4A__)
+#else
+#define TF_PLAT_STR___SSE4A__
+#endif
+#if defined(__SSE4_1__)
+#define TF_PLAT_STR___SSE4_1__ TF_PLAT_STR_(__SSE4_1__)
+#else
+#define TF_PLAT_STR___SSE4_1__
+#endif
+#if defined(__SSE4_2__)
+#define TF_PLAT_STR___SSE4_2__ TF_PLAT_STR_(__SSE4_2__)
+#else
+#define TF_PLAT_STR___SSE4_2__
+#endif
+#if defined(__SSSE3__)
+#define TF_PLAT_STR___SSSE3__ TF_PLAT_STR_(__SSSE3__)
+#else
+#define TF_PLAT_STR___SSSE3__
+#endif
+#if defined(__TBM__)
+#define TF_PLAT_STR___TBM__ TF_PLAT_STR_(__TBM__)
+#else
+#define TF_PLAT_STR___TBM__
+#endif
+#if defined(__XOP__)
+#define TF_PLAT_STR___XOP__ TF_PLAT_STR_(__XOP__)
+#else
+#define TF_PLAT_STR___XOP__
+#endif
+#if defined(__XSAVEC__)
+#define TF_PLAT_STR___XSAVEC__ TF_PLAT_STR_(__XSAVEC__)
+#else
+#define TF_PLAT_STR___XSAVEC__
+#endif
+#if defined(__XSAVEOPT__)
+#define TF_PLAT_STR___XSAVEOPT__ TF_PLAT_STR_(__XSAVEOPT__)
+#else
+#define TF_PLAT_STR___XSAVEOPT__
+#endif
+#if defined(__XSAVES__)
+#define TF_PLAT_STR___XSAVES__ TF_PLAT_STR_(__XSAVES__)
+#else
+#define TF_PLAT_STR___XSAVES__
+#endif
+#if defined(__XSAVE__)
+#define TF_PLAT_STR___XSAVE__ TF_PLAT_STR_(__XSAVE__)
+#else
+#define TF_PLAT_STR___XSAVE__
+#endif
+#if defined(_SOFT_DOUBLE)
+#define TF_PLAT_STR__SOFT_DOUBLE TF_PLAT_STR_(_SOFT_DOUBLE)
+#else
+#define TF_PLAT_STR__SOFT_DOUBLE
+#endif
+#if defined(_SOFT_FLOAT)
+#define TF_PLAT_STR__SOFT_FLOAT TF_PLAT_STR_(_SOFT_FLOAT)
+#else
+#define TF_PLAT_STR__SOFT_FLOAT
+#endif
+#if defined(__ALTIVEC__)
+#define TF_PLAT_STR___ALTIVEC__ TF_PLAT_STR_(__ALTIVEC__)
+#else
+#define TF_PLAT_STR___ALTIVEC__
+#endif
+#if defined(__APPLE_ALTIVEC__)
+#define TF_PLAT_STR___APPLE_ALTIVEC__ TF_PLAT_STR_(__APPLE_ALTIVEC__)
+#else
+#define TF_PLAT_STR___APPLE_ALTIVEC__
+#endif
+#if defined(__CRYPTO__)
+#define TF_PLAT_STR___CRYPTO__ TF_PLAT_STR_(__CRYPTO__)
+#else
+#define TF_PLAT_STR___CRYPTO__
+#endif
+#if defined(__FLOAT128_HARDWARE__)
+#define TF_PLAT_STR___FLOAT128_HARDWARE__ TF_PLAT_STR_(__FLOAT128_HARDWARE__)
+#else
+#define TF_PLAT_STR___FLOAT128_HARDWARE__
+#endif
+#if defined(__FLOAT128_TYPE__)
+#define TF_PLAT_STR___FLOAT128_TYPE__ TF_PLAT_STR_(__FLOAT128_TYPE__)
+#else
+#define TF_PLAT_STR___FLOAT128_TYPE__
+#endif
+#if defined(__FP_FAST_FMA)
+#define TF_PLAT_STR___FP_FAST_FMA TF_PLAT_STR_(__FP_FAST_FMA)
+#else
+#define TF_PLAT_STR___FP_FAST_FMA
+#endif
+#if defined(__FP_FAST_FMAF)
+#define TF_PLAT_STR___FP_FAST_FMAF TF_PLAT_STR_(__FP_FAST_FMAF)
+#else
+#define TF_PLAT_STR___FP_FAST_FMAF
+#endif
+#if defined(__HTM__)
+#define TF_PLAT_STR___HTM__ TF_PLAT_STR_(__HTM__)
+#else
+#define TF_PLAT_STR___HTM__
+#endif
+#if defined(__NO_FPRS__)
+#define TF_PLAT_STR___NO_FPRS__ TF_PLAT_STR_(__NO_FPRS__)
+#else
+#define TF_PLAT_STR___NO_FPRS__
+#endif
+#if defined(__NO_LWSYNC__)
+#define TF_PLAT_STR___NO_LWSYNC__ TF_PLAT_STR_(__NO_LWSYNC__)
+#else
+#define TF_PLAT_STR___NO_LWSYNC__
+#endif
+#if defined(__POWER8_VECTOR__)
+#define TF_PLAT_STR___POWER8_VECTOR__ TF_PLAT_STR_(__POWER8_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER8_VECTOR__
+#endif
+#if defined(__POWER9_VECTOR__)
+#define TF_PLAT_STR___POWER9_VECTOR__ TF_PLAT_STR_(__POWER9_VECTOR__)
+#else
+#define TF_PLAT_STR___POWER9_VECTOR__
+#endif
+#if defined(__PPC405__)
+#define TF_PLAT_STR___PPC405__ TF_PLAT_STR_(__PPC405__)
+#else
+#define TF_PLAT_STR___PPC405__
+#endif
+#if defined(__QUAD_MEMORY_ATOMIC__)
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__ TF_PLAT_STR_(__QUAD_MEMORY_ATOMIC__)
+#else
+#define TF_PLAT_STR___QUAD_MEMORY_ATOMIC__
+#endif
+#if defined(__RECIPF__)
+#define TF_PLAT_STR___RECIPF__ TF_PLAT_STR_(__RECIPF__)
+#else
+#define TF_PLAT_STR___RECIPF__
+#endif
+#if defined(__RECIP_PRECISION__)
+#define TF_PLAT_STR___RECIP_PRECISION__ TF_PLAT_STR_(__RECIP_PRECISION__)
+#else
+#define TF_PLAT_STR___RECIP_PRECISION__
+#endif
+#if defined(__RECIP__)
+#define TF_PLAT_STR___RECIP__ TF_PLAT_STR_(__RECIP__)
+#else
+#define TF_PLAT_STR___RECIP__
+#endif
+#if defined(__RSQRTEF__)
+#define TF_PLAT_STR___RSQRTEF__ TF_PLAT_STR_(__RSQRTEF__)
+#else
+#define TF_PLAT_STR___RSQRTEF__
+#endif
+#if defined(__RSQRTE__)
+#define TF_PLAT_STR___RSQRTE__ TF_PLAT_STR_(__RSQRTE__)
+#else
+#define TF_PLAT_STR___RSQRTE__
+#endif
+#if defined(__TM_FENCE__)
+#define TF_PLAT_STR___TM_FENCE__ TF_PLAT_STR_(__TM_FENCE__)
+#else
+#define TF_PLAT_STR___TM_FENCE__
+#endif
+#if defined(__UPPER_REGS_DF__)
+#define TF_PLAT_STR___UPPER_REGS_DF__ TF_PLAT_STR_(__UPPER_REGS_DF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_DF__
+#endif
+#if defined(__UPPER_REGS_SF__)
+#define TF_PLAT_STR___UPPER_REGS_SF__ TF_PLAT_STR_(__UPPER_REGS_SF__)
+#else
+#define TF_PLAT_STR___UPPER_REGS_SF__
+#endif
+#if defined(__VEC__)
+#define TF_PLAT_STR___VEC__ TF_PLAT_STR_(__VEC__)
+#else
+#define TF_PLAT_STR___VEC__
+#endif
+#if defined(__VSX__)
+#define TF_PLAT_STR___VSX__ TF_PLAT_STR_(__VSX__)
+#else
+#define TF_PLAT_STR___VSX__
+#endif
+#if defined(__ARM_ARCH)
+#define TF_PLAT_STR___ARM_ARCH TF_PLAT_STR_(__ARM_ARCH)
+#else
+#define TF_PLAT_STR___ARM_ARCH
+#endif
+#if defined(__ARM_FEATURE_CLZ)
+#define TF_PLAT_STR___ARM_FEATURE_CLZ TF_PLAT_STR_(__ARM_FEATURE_CLZ)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CLZ
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRC32)
+#define TF_PLAT_STR___ARM_FEATURE_CRC32 TF_PLAT_STR_(__ARM_FEATURE_CRC32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRC32
+#endif
+#if defined(__ARM_FEATURE_CRYPTO)
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO TF_PLAT_STR_(__ARM_FEATURE_CRYPTO)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_CRYPTO
+#endif
+#if defined(__ARM_FEATURE_DIRECTED_ROUNDING)
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING \
+  TF_PLAT_STR_(__ARM_FEATURE_DIRECTED_ROUNDING)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DIRECTED_ROUNDING
+#endif
+#if defined(__ARM_FEATURE_DSP)
+#define TF_PLAT_STR___ARM_FEATURE_DSP TF_PLAT_STR_(__ARM_FEATURE_DSP)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_DSP
+#endif
+#if defined(__ARM_FEATURE_FMA)
+#define TF_PLAT_STR___ARM_FEATURE_FMA TF_PLAT_STR_(__ARM_FEATURE_FMA)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_FMA
+#endif
+#if defined(__ARM_FEATURE_IDIV)
+#define TF_PLAT_STR___ARM_FEATURE_IDIV TF_PLAT_STR_(__ARM_FEATURE_IDIV)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_IDIV
+#endif
+#if defined(__ARM_FEATURE_LDREX)
+#define TF_PLAT_STR___ARM_FEATURE_LDREX TF_PLAT_STR_(__ARM_FEATURE_LDREX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_LDREX
+#endif
+#if defined(__ARM_FEATURE_NUMERIC_MAXMIN)
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN \
+  TF_PLAT_STR_(__ARM_FEATURE_NUMERIC_MAXMIN)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_NUMERIC_MAXMIN
+#endif
+#if defined(__ARM_FEATURE_QBIT)
+#define TF_PLAT_STR___ARM_FEATURE_QBIT TF_PLAT_STR_(__ARM_FEATURE_QBIT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QBIT
+#endif
+#if defined(__ARM_FEATURE_QRDMX)
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX TF_PLAT_STR_(__ARM_FEATURE_QRDMX)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_QRDMX
+#endif
+#if defined(__ARM_FEATURE_SAT)
+#define TF_PLAT_STR___ARM_FEATURE_SAT TF_PLAT_STR_(__ARM_FEATURE_SAT)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SAT
+#endif
+#if defined(__ARM_FEATURE_SIMD32)
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32 TF_PLAT_STR_(__ARM_FEATURE_SIMD32)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_SIMD32
+#endif
+#if defined(__ARM_FEATURE_UNALIGNED)
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED \
+  TF_PLAT_STR_(__ARM_FEATURE_UNALIGNED)
+#else
+#define TF_PLAT_STR___ARM_FEATURE_UNALIGNED
+#endif
+#if defined(__ARM_FP)
+#define TF_PLAT_STR___ARM_FP TF_PLAT_STR_(__ARM_FP)
+#else
+#define TF_PLAT_STR___ARM_FP
+#endif
+#if defined(__ARM_NEON_FP)
+#define TF_PLAT_STR___ARM_NEON_FP TF_PLAT_STR_(__ARM_NEON_FP)
+#else
+#define TF_PLAT_STR___ARM_NEON_FP
+#endif
+#if defined(__ARM_NEON__)
+#define TF_PLAT_STR___ARM_NEON__ TF_PLAT_STR_(__ARM_NEON__)
+#else
+#define TF_PLAT_STR___ARM_NEON__
+#endif
+#if defined(__ARM_WMMX)
+#define TF_PLAT_STR___ARM_WMMX TF_PLAT_STR_(__ARM_WMMX)
+#else
+#define TF_PLAT_STR___ARM_WMMX
+#endif
+#if defined(__IWMMXT2__)
+#define TF_PLAT_STR___IWMMXT2__ TF_PLAT_STR_(__IWMMXT2__)
+#else
+#define TF_PLAT_STR___IWMMXT2__
+#endif
+#if defined(__IWMMXT__)
+#define TF_PLAT_STR___IWMMXT__ TF_PLAT_STR_(__IWMMXT__)
+#else
+#define TF_PLAT_STR___IWMMXT__
+#endif
+#if defined(__VFP_FP__)
+#define TF_PLAT_STR___VFP_FP__ TF_PLAT_STR_(__VFP_FP__)
+#else
+#define TF_PLAT_STR___VFP_FP__
+#endif
+#if defined(TARGET_IPHONE_SIMULATOR)
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR \
+  TF_PLAT_STR_(TARGET_IPHONE_SIMULATOR)
+#else
+#define TF_PLAT_STR_TARGET_IPHONE_SIMULATOR
+#endif
+#if defined(TARGET_OS_IOS)
+#define TF_PLAT_STR_TARGET_OS_IOS TF_PLAT_STR_(TARGET_OS_IOS)
+#else
+#define TF_PLAT_STR_TARGET_OS_IOS
+#endif
+#if defined(TARGET_OS_IPHONE)
+#define TF_PLAT_STR_TARGET_OS_IPHONE TF_PLAT_STR_(TARGET_OS_IPHONE)
+#else
+#define TF_PLAT_STR_TARGET_OS_IPHONE
+#endif
+#if defined(_MSC_VER)
+#define TF_PLAT_STR__MSC_VER TF_PLAT_STR_(_MSC_VER)
+#else
+#define TF_PLAT_STR__MSC_VER
+#endif
+#if defined(_M_ARM)
+#define TF_PLAT_STR__M_ARM TF_PLAT_STR_(_M_ARM)
+#else
+#define TF_PLAT_STR__M_ARM
+#endif
+#if defined(_M_ARM64)
+#define TF_PLAT_STR__M_ARM64 TF_PLAT_STR_(_M_ARM64)
+#else
+#define TF_PLAT_STR__M_ARM64
+#endif
+#if defined(_M_ARM_ARMV7VE)
+#define TF_PLAT_STR__M_ARM_ARMV7VE TF_PLAT_STR_(_M_ARM_ARMV7VE)
+#else
+#define TF_PLAT_STR__M_ARM_ARMV7VE
+#endif
+#if defined(_M_ARM_FP)
+#define TF_PLAT_STR__M_ARM_FP TF_PLAT_STR_(_M_ARM_FP)
+#else
+#define TF_PLAT_STR__M_ARM_FP
+#endif
+#if defined(_M_IX86)
+#define TF_PLAT_STR__M_IX86 TF_PLAT_STR_(_M_IX86)
+#else
+#define TF_PLAT_STR__M_IX86
+#endif
+#if defined(_M_X64)
+#define TF_PLAT_STR__M_X64 TF_PLAT_STR_(_M_X64)
+#else
+#define TF_PLAT_STR__M_X64
+#endif
+#if defined(_WIN32)
+#define TF_PLAT_STR__WIN32 TF_PLAT_STR_(_WIN32)
+#else
+#define TF_PLAT_STR__WIN32
+#endif
+#if defined(_WIN64)
+#define TF_PLAT_STR__WIN64 TF_PLAT_STR_(_WIN64)
+#else
+#define TF_PLAT_STR__WIN64
+#endif
+#if defined(__ANDROID__)
+#define TF_PLAT_STR___ANDROID__ TF_PLAT_STR_(__ANDROID__)
+#else
+#define TF_PLAT_STR___ANDROID__
+#endif
+#if defined(__APPLE__)
+#define TF_PLAT_STR___APPLE__ TF_PLAT_STR_(__APPLE__)
+#else
+#define TF_PLAT_STR___APPLE__
+#endif
+#if defined(__BYTE_ORDER__)
+#define TF_PLAT_STR___BYTE_ORDER__ TF_PLAT_STR_(__BYTE_ORDER__)
+#else
+#define TF_PLAT_STR___BYTE_ORDER__
+#endif
+#if defined(__CYGWIN__)
+#define TF_PLAT_STR___CYGWIN__ TF_PLAT_STR_(__CYGWIN__)
+#else
+#define TF_PLAT_STR___CYGWIN__
+#endif
+#if defined(__FreeBSD__)
+#define TF_PLAT_STR___FreeBSD__ TF_PLAT_STR_(__FreeBSD__)
+#else
+#define TF_PLAT_STR___FreeBSD__
+#endif
+#if defined(__LITTLE_ENDIAN__)
+#define TF_PLAT_STR___LITTLE_ENDIAN__ TF_PLAT_STR_(__LITTLE_ENDIAN__)
+#else
+#define TF_PLAT_STR___LITTLE_ENDIAN__
+#endif
+#if defined(__NetBSD__)
+#define TF_PLAT_STR___NetBSD__ TF_PLAT_STR_(__NetBSD__)
+#else
+#define TF_PLAT_STR___NetBSD__
+#endif
+#if defined(__OpenBSD__)
+#define TF_PLAT_STR___OpenBSD__ TF_PLAT_STR_(__OpenBSD__)
+#else
+#define TF_PLAT_STR___OpenBSD__
+#endif
+#if defined(____MSYS__)
+#define TF_PLAT_STR_____MSYS__ TF_PLAT_STR_(____MSYS__)
+#else
+#define TF_PLAT_STR_____MSYS__
+#endif
+#if defined(__aarch64__)
+#define TF_PLAT_STR___aarch64__ TF_PLAT_STR_(__aarch64__)
+#else
+#define TF_PLAT_STR___aarch64__
+#endif
+#if defined(__alpha__)
+#define TF_PLAT_STR___alpha__ TF_PLAT_STR_(__alpha__)
+#else
+#define TF_PLAT_STR___alpha__
+#endif
+#if defined(__arm__)
+#define TF_PLAT_STR___arm__ TF_PLAT_STR_(__arm__)
+#else
+#define TF_PLAT_STR___arm__
+#endif
+#if defined(__i386__)
+#define TF_PLAT_STR___i386__ TF_PLAT_STR_(__i386__)
+#else
+#define TF_PLAT_STR___i386__
+#endif
+#if defined(__i686__)
+#define TF_PLAT_STR___i686__ TF_PLAT_STR_(__i686__)
+#else
+#define TF_PLAT_STR___i686__
+#endif
+#if defined(__ia64__)
+#define TF_PLAT_STR___ia64__ TF_PLAT_STR_(__ia64__)
+#else
+#define TF_PLAT_STR___ia64__
+#endif
+#if defined(__linux__)
+#define TF_PLAT_STR___linux__ TF_PLAT_STR_(__linux__)
+#else
+#define TF_PLAT_STR___linux__
+#endif
+#if defined(__mips32__)
+#define TF_PLAT_STR___mips32__ TF_PLAT_STR_(__mips32__)
+#else
+#define TF_PLAT_STR___mips32__
+#endif
+#if defined(__mips64__)
+#define TF_PLAT_STR___mips64__ TF_PLAT_STR_(__mips64__)
+#else
+#define TF_PLAT_STR___mips64__
+#endif
+#if defined(__powerpc64__)
+#define TF_PLAT_STR___powerpc64__ TF_PLAT_STR_(__powerpc64__)
+#else
+#define TF_PLAT_STR___powerpc64__
+#endif
+#if defined(__powerpc__)
+#define TF_PLAT_STR___powerpc__ TF_PLAT_STR_(__powerpc__)
+#else
+#define TF_PLAT_STR___powerpc__
+#endif
+#if defined(__riscv___)
+#define TF_PLAT_STR___riscv___ TF_PLAT_STR_(__riscv___)
+#else
+#define TF_PLAT_STR___riscv___
+#endif
+#if defined(__s390x__)
+#define TF_PLAT_STR___s390x__ TF_PLAT_STR_(__s390x__)
+#else
+#define TF_PLAT_STR___s390x__
+#endif
+#if defined(__sparc64__)
+#define TF_PLAT_STR___sparc64__ TF_PLAT_STR_(__sparc64__)
+#else
+#define TF_PLAT_STR___sparc64__
+#endif
+#if defined(__sparc__)
+#define TF_PLAT_STR___sparc__ TF_PLAT_STR_(__sparc__)
+#else
+#define TF_PLAT_STR___sparc__
+#endif
+#if defined(__x86_64__)
+#define TF_PLAT_STR___x86_64__ TF_PLAT_STR_(__x86_64__)
+#else
+#define TF_PLAT_STR___x86_64__
+#endif
+
+#endif  // TENSORFLOW_CORE_PLATFORM_PLATFORM_STRINGS_COMPUTED_H_
diff --git a/tensorflow/core/platform/platform_strings_test.cc b/tensorflow/core/platform/platform_strings_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5251f10d4124650dd7b2d260b1665b988bb663c9
--- /dev/null
+++ b/tensorflow/core/platform/platform_strings_test.cc
@@ -0,0 +1,146 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Test for the platform_strings.h header file.
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <string>
+#include <vector>
+
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/str_util.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/init_main.h"
+#include "tensorflow/core/platform/logging.h"
+#include "tensorflow/core/platform/platform_strings.h"
+
+// Embed the platform strings in this binary.
+TF_PLATFORM_STRINGS()
+
+// A vector of strings.
+typedef std::vector<std::string> string_vec;
+
+// Append to *found the strings within the named file with the platform_strings
+// magic prefix, and return true; or return false on error.
+
+// Print the platform strings embedded in the binary file_name and return 0,
+// on on error return 2.
+static int PrintStrings(const std::string file_name) {
+  int rc = 0;
+  string_vec str;
+  if (!tensorflow::GetPlatformStrings(file_name, &str)) {
+    for (int i = 0; i != str.size(); i++) {
+      printf("%s\n", str[i].c_str());
+    }
+  } else {
+    perror(file_name.c_str());
+    rc = 2;
+  }
+  return rc;
+}
+
+// Return whether str[] conatins a string with prefix "macro_name="; if so,
+// set *pvalue to the suffix.
+static bool GetValue(const string_vec &str, const std::string &macro_name,
+                     std::string *pvalue) {
+  std::string nam_eq = macro_name + "=";
+  int i = 0;
+  while (i != str.size() && !tensorflow::str_util::StartsWith(str[i], nam_eq)) {
+    i++;
+  }
+  bool found = (i != str.size());
+  if (found) {
+    *pvalue = str[i].substr(nam_eq.size());
+  }
+  return found;
+}
+
+// If macro_name[] is not equal to value[], check that str[] contains the
+// string "macro_name=value".  Otherwise, check that str[] does not contain any
+// string starting with macro_name=".
+static void CheckStr(const string_vec &str, const std::string &macro_name,
+                     const std::string &value) {
+  std::string value_from_str;
+  if (GetValue(str, macro_name, &value_from_str)) {
+    if (value != value_from_str) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value
+                 << "  value_from_str=" << value_from_str;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, value_from_str) << " " << macro_name << ": bad value";
+  } else {
+    // If the string is not found, we expect value to be macro_name.
+    if (value != macro_name) {
+      // Output everything found, to aid debugging.
+      LOG(ERROR) << "===== value=" << value << "  macro_name=" << macro_name;
+      for (int i = 0; i != str.size(); i++) {
+        LOG(ERROR) << "% " << str[i];
+      }
+      LOG(ERROR) << "=====";
+    }
+    CHECK_EQ(value, macro_name) << " " << macro_name << ": not found in binary";
+  }
+}
+
+// Helper for AS_STR(), below, to perform macro expansion.
+#define AS_STR_1_(x) #x
+
+// Yield x after macro expansion as a nul-terminated constant string.
+#define AS_STR(x) AS_STR_1_(x)
+
+// Run the test, and return 0 on success, 2 otherwise.
+static int RunTest(const std::string &binary_name) {
+  int rc = 0;
+  string_vec str;
+
+  if (!tensorflow::GetPlatformStrings(binary_name, &str)) {
+    CheckStr(str, "__linux__", AS_STR(__linux__));
+    CheckStr(str, "_WIN32", AS_STR(_WIN32));
+    CheckStr(str, "__APPLE__", AS_STR(__APPLE__));
+    CheckStr(str, "__x86_64__", AS_STR(__x86_64__));
+    CheckStr(str, "__aarch64__", AS_STR(__aarch64__));
+    CheckStr(str, "__powerpc64__", AS_STR(__powerpc64__));
+    CheckStr(str, "TF_PLAT_STR_VERSION", TF_PLAT_STR_VERSION_);
+  } else {
+    perror(binary_name.c_str());
+    rc = 2;
+  }
+
+  return rc;
+}
+
+int main(int argc, char *argv[]) {
+  tensorflow::Env *env = tensorflow::Env::Default();
+  static const char usage[] = "usage: platform_strings_test [file...]";
+  int rc = 0;
+  tensorflow::port::InitMain(usage, &argc, &argv);
+  if (argc == 1) {
+    printf("rc=%d\n", PrintStrings(env->GetExecutablePath()));
+    rc = RunTest(env->GetExecutablePath());
+  } else {
+    for (int argn = 1; argn != argc; argn++) {
+      rc |= PrintStrings(argv[argn]);
+    }
+  }
+  return rc;
+}
diff --git a/tensorflow/core/platform/posix/posix_file_system.cc b/tensorflow/core/platform/posix/posix_file_system.cc
index c7afab9583cee1612a8c12b6f9fff7b89af1d86a..fc48cab56460d85d9997f57cb761481c77413d00 100644
--- a/tensorflow/core/platform/posix/posix_file_system.cc
+++ b/tensorflow/core/platform/posix/posix_file_system.cc
@@ -240,11 +240,14 @@ Status PosixFileSystem::DeleteFile(const string& fname) {
 }
 
 Status PosixFileSystem::CreateDir(const string& name) {
-  Status result;
-  if (mkdir(TranslateName(name).c_str(), 0755) != 0) {
-    result = IOError(name, errno);
+  string translated = TranslateName(name);
+  if (translated.empty()) {
+    return errors::AlreadyExists(name);
   }
-  return result;
+  if (mkdir(translated.c_str(), 0755) != 0) {
+    return IOError(name, errno);
+  }
+  return Status::OK();
 }
 
 Status PosixFileSystem::DeleteDir(const string& name) {
diff --git a/tensorflow/core/platform/regexp.h b/tensorflow/core/platform/regexp.h
index a4eedf30454567074191b36e0b87bf53987ffc42..ca9ca1e2442d272fc12e29dce81b6c633de97b35 100644
--- a/tensorflow/core/platform/regexp.h
+++ b/tensorflow/core/platform/regexp.h
@@ -16,6 +16,7 @@ limitations under the License.
 #ifndef TENSORFLOW_PLATFORM_REGEXP_H_
 #define TENSORFLOW_PLATFORM_REGEXP_H_
 
+#include "absl/strings/string_view.h"
 #include "tensorflow/core/platform/platform.h"
 #include "tensorflow/core/platform/types.h"
 
@@ -23,7 +24,7 @@ limitations under the License.
     defined(GOOGLE_RE2)
 #include "tensorflow/core/platform/google/build_config/re2.h"
 namespace tensorflow {
-typedef ::StringPiece RegexpStringPiece;
+typedef absl::string_view RegexpStringPiece;
 }  // namespace tensorflow
 
 #else
diff --git a/tensorflow/core/platform/windows/windows_file_system.cc b/tensorflow/core/platform/windows/windows_file_system.cc
index 6cf79634d7a4f1591185c594ce66fb67ddb85309..993b9906b1c072cb48c816855fb2fc1498ae3f40 100644
--- a/tensorflow/core/platform/windows/windows_file_system.cc
+++ b/tensorflow/core/platform/windows/windows_file_system.cc
@@ -439,6 +439,9 @@ Status WindowsFileSystem::DeleteFile(const string& fname) {
 Status WindowsFileSystem::CreateDir(const string& name) {
   Status result;
   std::wstring ws_name = Utf8ToWideChar(name);
+  if (ws_name.empty()) {
+    return errors::AlreadyExists(name);
+  }
   if (_wmkdir(ws_name.c_str()) != 0) {
     result = IOError("Failed to create a directory: " + name, errno);
   }
diff --git a/tensorflow/core/profiler/internal/tfprof_code.cc b/tensorflow/core/profiler/internal/tfprof_code.cc
index 744e1e95deb458e4399cceba4c91a12eed30be7c..0c26855a43ec40992687cc9c3dd0a0d93e8594df 100644
--- a/tensorflow/core/profiler/internal/tfprof_code.cc
+++ b/tensorflow/core/profiler/internal/tfprof_code.cc
@@ -183,7 +183,7 @@ class Samples {
   // This method adds the statistics of graph nodes created by the python
   // call.
   void Add(const CodeNode* node, const std::vector<uint64>& location_ids) {
-    // displayed leaf might not be true leaf. Retrive the true leaves for
+    // displayed leaf might not be true leaf. Retrieve the true leaves for
     // stats.
     std::vector<const CodeNode*> all_leaf = FetchAllLeaf(node);
     CHECK(!all_leaf.empty()) << node->name();
diff --git a/tensorflow/core/profiler/internal/tfprof_node.cc b/tensorflow/core/profiler/internal/tfprof_node.cc
index 86cb20de7bbb4f36bfaa431bc2b81a00dace84df..8796234be0cced4c977a0529aefa10cd16961c1b 100644
--- a/tensorflow/core/profiler/internal/tfprof_node.cc
+++ b/tensorflow/core/profiler/internal/tfprof_node.cc
@@ -151,7 +151,7 @@ void ExecStep::AddMemoryStats(const string& dev,
   }
 
   // TODO(xpan): Make this more accurate:
-  // High level: Memory tracking is suspicous and requires large scale
+  // High level: Memory tracking is suspicious and requires large scale
   // clean up.
   // Investigte the memory usage difference between CPU/GPU with OpViewTest.
   //
diff --git a/tensorflow/core/protobuf/config.proto b/tensorflow/core/protobuf/config.proto
index 4689af06afedb2339449e2f01e6d325ea26cd4c9..b3dc5dccc02737202f9f5ced78471f332efd2eba 100644
--- a/tensorflow/core/protobuf/config.proto
+++ b/tensorflow/core/protobuf/config.proto
@@ -291,6 +291,13 @@ message RPCOptions {
   // transport for client-master communication that avoids the RPC
   // stack. This option is primarily for used testing the RPC stack.
   bool use_rpc_for_inprocess_master = 1;
+
+  // The compression algorithm to be used. One of "deflate", "gzip".
+  string compression_algorithm = 2;
+
+  // If compression_algorithm is set, the compression level to be used.
+  // From 0 (no compression), up to 3.
+  int32 compression_level = 3;
 };
 
 // Session configuration parameters.
@@ -413,6 +420,11 @@ message ConfigProto {
     // Any positive value sets the max chunk size.  0 defaults to 4096.
     // Any negative value indicates no max, i.e. one chunk only.
     int32 recv_buf_max_chunk = 4;
+
+    // If true, and supported by the platform, the runtime will attempt to
+    // use NUMA affinity where applicable.  One consequence will be the
+    // existence of as many CPU devices as there are available NUMA nodes.
+    bool use_numa_affinity = 5;
   };
 
   Experimental experimental = 16;
diff --git a/tensorflow/core/protobuf/master.proto b/tensorflow/core/protobuf/master.proto
index 03022875e64ace5d680e969138727efb1f522097..c104463c51c7e7be02430c7750ebacee60ed50e4 100644
--- a/tensorflow/core/protobuf/master.proto
+++ b/tensorflow/core/protobuf/master.proto
@@ -224,7 +224,7 @@ message CloseSessionResponse {
 message ResetRequest {
   // A list of container names, which may be empty.
   //
-  // If 'container' is not empty, releases resoures in the given
+  // If 'container' is not empty, releases resources in the given
   // containers in all devices.
   //
   // If 'container' is empty, releases resources in the default
diff --git a/tensorflow/core/protobuf/rewriter_config.proto b/tensorflow/core/protobuf/rewriter_config.proto
index d68f2735365b0aa330344238193ab4a22a0a3911..515d673828e3792ac6f4268fd55b58e43aab509b 100644
--- a/tensorflow/core/protobuf/rewriter_config.proto
+++ b/tensorflow/core/protobuf/rewriter_config.proto
@@ -38,7 +38,7 @@ message RewriterConfig {
   }
 
   // Enum controlling the number of times to run optimizers. The default is to
-  // run them once.
+  // run them twice.
   enum NumIterationsType {
     DEFAULT_NUM_ITERS = 0;
     ONE = 1;
diff --git a/tensorflow/core/util/dump_graph.cc b/tensorflow/core/util/dump_graph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..523d37ecc244b3634545ea82385b377c871569c8
--- /dev/null
+++ b/tensorflow/core/util/dump_graph.cc
@@ -0,0 +1,131 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
+// debugging.
+
+#include "tensorflow/core/util/dump_graph.h"
+
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/mutex.h"
+
+namespace tensorflow {
+
+namespace {
+
+struct NameCounts {
+  mutex counts_mutex;
+  std::unordered_map<string, int> counts;
+};
+
+string MakeUniqueFilename(string name) {
+  static NameCounts& instance = *new NameCounts;
+
+  // Remove illegal characters from `name`.
+  for (int i = 0; i < name.size(); ++i) {
+    char ch = name[i];
+    if (ch == '/' || ch == '[' || ch == ']' || ch == '*' || ch == '?') {
+      name[i] = '_';
+    }
+  }
+
+  int count;
+  {
+    mutex_lock lock(instance.counts_mutex);
+    count = instance.counts[name]++;
+  }
+
+  string filename = name;
+  if (count > 0) {
+    absl::StrAppend(&filename, "_", count);
+  }
+  absl::StrAppend(&filename, ".pbtxt");
+  return filename;
+}
+
+#if defined(TENSORFLOW_LITE_PROTOS)
+Status WriteToFile(const string& filepath,
+                   const ::tensorflow::protobuf::MessageLite& proto) {
+  string s;
+  if (!SerializeToStringDeterministic(proto, &s)) {
+    return errors::Internal("Failed to serialize proto to string.");
+  }
+  return WriteStringToFile(Env::Default(), filepath, s);
+}
+#else
+Status WriteToFile(const string& filepath,
+                   const ::tensorflow::protobuf::Message& proto) {
+  return WriteTextProto(Env::Default(), filepath, proto);
+}
+#endif
+
+template <class T>
+string WriteTextProtoToUniqueFile(Env* env, const string& name,
+                                  const char* proto_type, T& proto,
+                                  const string& dirname) {
+  const char* dir = nullptr;
+  if (!dirname.empty()) {
+    dir = dirname.c_str();
+  } else {
+    dir = getenv("TF_DUMP_GRAPH_PREFIX");
+  }
+  if (!dir) {
+    return "(TF_DUMP_GRAPH_PREFIX not specified)";
+  }
+  Status status = env->RecursivelyCreateDir(dir);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to create " << dir << " for dumping " << proto_type
+                 << ": " << status;
+    return "(unavailable)";
+  }
+  string filepath = absl::StrCat(dir, "/", MakeUniqueFilename(name));
+  status = WriteToFile(filepath, proto);
+  if (!status.ok()) {
+    LOG(WARNING) << "Failed to dump " << proto_type << " to file: " << filepath
+                 << " : " << status;
+    return "(unavailable)";
+  }
+  LOG(INFO) << "Dumped " << proto_type << " to " << filepath;
+  return filepath;
+}
+
+}  // anonymous namespace
+
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "GraphDef", graph_def,
+                                    dirname);
+}
+
+string DumpGraphToFile(const string& name, Graph const& graph,
+                       const FunctionLibraryDefinition* flib_def,
+                       const string& dirname) {
+  GraphDef graph_def;
+  graph.ToGraphDef(&graph_def);
+  if (flib_def) {
+    *graph_def.mutable_library() = flib_def->ToProto();
+  }
+  return DumpGraphDefToFile(name, graph_def, dirname);
+}
+
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname) {
+  return WriteTextProtoToUniqueFile(Env::Default(), name, "FunctionDef", fdef,
+                                    dirname);
+}
+
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/dump_graph.h b/tensorflow/core/util/dump_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dc807a2b342edaea57ad8558495462a6af0109
--- /dev/null
+++ b/tensorflow/core/util/dump_graph.h
@@ -0,0 +1,52 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+// Helper functions for dumping Graphs, GraphDefs, and FunctionDefs to files for
+// debugging.
+
+#ifndef TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
+#define TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
+
+#include "tensorflow/core/framework/function.h"
+#include "tensorflow/core/framework/graph.pb.h"
+#include "tensorflow/core/graph/graph.h"
+
+namespace tensorflow {
+
+// Dumps 'graph_def' to a file, as a GraphDef text proto. Returns the file name
+// chosen.
+//
+// Automatically picks a file name. Prefixes 'name' with the value of the
+// TF_DUMP_GRAPH_PREFIX environment variable if 'dirname' is empty, and suffixes
+// 'name' with ".pbtxt" to form a name. If a graph has already been dumped by
+// this process with the same name, suffixes with "_n.pbtxt", where 'n' is a
+// sequence number.
+string DumpGraphDefToFile(const string& name, GraphDef const& graph_def,
+                          const string& dirname = "");
+
+// Similar to DumpGraphDefToFile, but builds the GraphDef to dump from a 'graph'
+// and an optional function library 'flib_def'. Returns the file name chosen.
+string DumpGraphToFile(const string& name, Graph const& graph,
+                       const FunctionLibraryDefinition* flib_def = nullptr,
+                       const string& dirname = "");
+
+// Similar to DumpGraphDefToFile, but dumps a function as a FunctionDef text
+// proto. Returns the file name chosen.
+string DumpFunctionDefToFile(const string& name, FunctionDef const& fdef,
+                             const string& dirname = "");
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_DUMP_GRAPH_H_
diff --git a/tensorflow/core/util/dump_graph_test.cc b/tensorflow/core/util/dump_graph_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d01c1c5a0290197d8b52899ab703c1f183c0545b
--- /dev/null
+++ b/tensorflow/core/util/dump_graph_test.cc
@@ -0,0 +1,62 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/core/util/dump_graph.h"
+#include "tensorflow/core/graph/graph.h"
+#include "tensorflow/core/graph/node_builder.h"
+#include "tensorflow/core/lib/io/path.h"
+#include "tensorflow/core/lib/strings/proto_serialization.h"
+#include "tensorflow/core/platform/env.h"
+#include "tensorflow/core/platform/test.h"
+
+namespace tensorflow {
+namespace {
+
+TEST(DumpGraph, DumpGraphToFileSuccess) {
+  Graph graph(OpRegistry::Global());
+  Node* node;
+  TF_CHECK_OK(NodeBuilder("A", "NoOp").Finalize(&graph, &node));
+
+  setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1);
+  string ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "graph.pbtxt"));
+  ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "graph_1.pbtxt"));
+
+  GraphDef gdef;
+  TF_CHECK_OK(ReadTextProto(
+      Env::Default(), io::JoinPath(testing::TmpDir(), "graph.pbtxt"), &gdef));
+  string read, written;
+  gdef.AppendToString(&read);
+  graph.ToGraphDefDebug().AppendToString(&written);
+  EXPECT_EQ(read, written);
+}
+
+TEST(DumpGraph, DumpGraphToFileNoEnvPrefix) {
+  Graph graph(OpRegistry::Global());
+  unsetenv("TF_DUMP_GRAPH_PREFIX");
+  string ret = DumpGraphToFile("graph", graph);
+  EXPECT_EQ(ret, "(TF_DUMP_GRAPH_PREFIX not specified)");
+}
+
+TEST(DumpGraph, DumpFunctionDefToFileSuccess) {
+  FunctionDef fdef;
+  setenv("TF_DUMP_GRAPH_PREFIX", testing::TmpDir().c_str(), 1);
+  string ret = DumpFunctionDefToFile("function", fdef);
+  EXPECT_EQ(ret, io::JoinPath(testing::TmpDir(), "function.pbtxt"));
+}
+
+}  // namespace
+}  // namespace tensorflow
diff --git a/tensorflow/core/util/mkl_util.h b/tensorflow/core/util/mkl_util.h
index b7a6e0b690282b95f08183b5b7a11abd0a5972b6..928807458aca3c79d52e14509eb4238e134b5cdf 100644
--- a/tensorflow/core/util/mkl_util.h
+++ b/tensorflow/core/util/mkl_util.h
@@ -1644,6 +1644,9 @@ class MklDnnData {
         cpu_engine_(e) {}
 
   ~MklDnnData() {
+    if (allocated_buffer_ != nullptr) {
+      cpu_allocator()->DeallocateRaw(allocated_buffer_);
+    }
     cpu_engine_ = nullptr;  // We don't own this.
     delete (user_memory_);
     delete (reorder_memory_);
diff --git a/tensorflow/core/util/permutation_input_iterator.h b/tensorflow/core/util/permutation_input_iterator.h
index f6375b25157644cda97aa195958b60ac27b8a4d6..649318ebf3b4542a244f98342702cef087d28fce 100644
--- a/tensorflow/core/util/permutation_input_iterator.h
+++ b/tensorflow/core/util/permutation_input_iterator.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License.
 ==============================================================================*/
 
-#ifndef TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
-#define TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
 
 #include <iostream>
 #include <iterator>
@@ -131,4 +131,4 @@ class PermutationInputIterator {
 
 }  // end namespace tensorflow
 
-#endif  // TENSORFLOW_UTIL_PERMUTATION_INPUT_ITERATOR_H_
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_INPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/permutation_output_iterator.h b/tensorflow/core/util/permutation_output_iterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..638c0f4545853b28dd5822817c1ec8759bb3a80b
--- /dev/null
+++ b/tensorflow/core/util/permutation_output_iterator.h
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+#define TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
+
+#include <iostream>
+#include <iterator>
+
+namespace tensorflow {
+
+template <typename ValueType, typename OutputIteratorT, typename IndexIteratorT,
+          typename OffsetT = ptrdiff_t>
+class PermutationOutputIterator {
+ public:
+  // Required iterator traits
+  typedef PermutationOutputIterator self_type;  ///< My own type
+  typedef OffsetT difference_type;  ///< Type to express the result of
+                                    ///< subtracting one iterator from another
+  typedef ValueType
+      value_type;  ///< The type of the element the iterator can point to
+  typedef ValueType* pointer;    ///< The type of a pointer to an element the
+                                 ///< iterator can point to
+  typedef ValueType& reference;  ///< The type of a reference to an element the
+                                 ///< iterator can point to
+
+  typedef std::random_access_iterator_tag
+      iterator_category;  ///< The iterator category
+
+ private:
+  OutputIteratorT output_itr;
+  IndexIteratorT index_itr;
+
+ public:
+  /// Constructor
+  __host__ __device__ __forceinline__ PermutationOutputIterator(
+      OutputIteratorT output_itr,  ///< Input iterator to wrap
+      IndexIteratorT index_itr)    ///< Conversion functor to wrap
+      : output_itr(output_itr), index_itr(index_itr) {}
+
+  /// Postfix increment
+  __host__ __device__ __forceinline__ self_type operator++(int) {
+    self_type retval = *this;
+    index_itr++;
+    return retval;
+  }
+
+  /// Prefix increment
+  __host__ __device__ __forceinline__ self_type operator++() {
+    index_itr++;
+    return *this;
+  }
+
+  /// Indirection
+  __host__ __device__ __forceinline__ reference operator*() const {
+    return output_itr[*index_itr];
+  }
+
+  /// Addition
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator+(Distance n) const {
+    self_type retval(output_itr, index_itr + n);
+    return retval;
+  }
+
+  /// Addition assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator+=(Distance n) {
+    index_itr += n;
+    return *this;
+  }
+
+  /// Subtraction
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type operator-(Distance n) const {
+    self_type retval(output_itr, index_itr - n);
+    return retval;
+  }
+
+  /// Subtraction assignment
+  template <typename Distance>
+  __host__ __device__ __forceinline__ self_type& operator-=(Distance n) {
+    index_itr -= n;
+    return *this;
+  }
+
+  /// Distance
+  __host__ __device__ __forceinline__ difference_type
+  operator-(self_type other) const {
+    return index_itr - other.index_itr;
+  }
+
+  /// Array subscript
+  template <typename Distance>
+  __host__ __device__ __forceinline__ reference operator[](Distance n) const {
+    return output_itr[index_itr[n]];
+  }
+
+  /// Equal to
+  __host__ __device__ __forceinline__ bool operator==(const self_type& rhs) {
+    return (index_itr == rhs.index_itr && output_itr == rhs.output_itr);
+  }
+
+  /// Not equal to
+  __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs) {
+    return !(*this == rhs);
+  }
+
+  /// ostream operator
+  friend std::ostream& operator<<(std::ostream& os, const self_type& itr) {
+    return os;
+  }
+};
+
+}  // end namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_PERMUTATION_OUTPUT_ITERATOR_H_
diff --git a/tensorflow/core/util/proto/decode.h b/tensorflow/core/util/proto/decode.h
index cbcb203ee76471674429f133d54d4d0875dd9d5d..8dde14dffcdc5ffe4d64360f3af40521efe29bf8 100644
--- a/tensorflow/core/util/proto/decode.h
+++ b/tensorflow/core/util/proto/decode.h
@@ -318,7 +318,7 @@ inline int ReadPackedPrimitives(const void* bufp, const size_t len,
   return count;
 }
 
-// Reads a primitive value field from a serialized proto.
+// Reads a value of a primitive type field from a serialized proto.
 // The value is parsed from the serialized format, then static_cast
 // to the desired type for TensorFlow and stored.
 template <class ValueType, class TensorType,
diff --git a/tensorflow/core/util/sparse/sparse_tensor.h b/tensorflow/core/util/sparse/sparse_tensor.h
index b9ca8ab395bb85048e9dfca1db48303ce92e8316..89c163aa5133fafc23b01c7153ac40d32efcaaf6 100644
--- a/tensorflow/core/util/sparse/sparse_tensor.h
+++ b/tensorflow/core/util/sparse/sparse_tensor.h
@@ -238,15 +238,6 @@ class SparseTensor {
   static Status Split(const SparseTensor& tensor, const int split_dim,
                       const int num_split, std::vector<SparseTensor>* result);
 
-  template <typename T>
-  ABSL_DEPRECATED(
-      "Use the form of Split() that takes an output pointer and returns a "
-      "status instead.")
-  static std::vector<SparseTensor> Split(const SparseTensor& tensor,
-                                         const int split_dim,
-                                         const int num_split,
-                                         Status* status = nullptr);
-
   // Slice() will slice the input SparseTensor into a SparseTensor based on
   // specified start and size. Both start and size are 1-D array with each
   // element of the array representing one dimension. The start is the start
@@ -578,10 +569,9 @@ SparseTensor SparseTensor::Concat(
 }
 
 template <typename T>
-std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
-                                              const int split_dim,
-                                              const int num_split,
-                                              Status* status /* = nullptr */) {
+Status SparseTensor::Split(const SparseTensor& input_tensor,
+                           const int split_dim, const int num_split,
+                           std::vector<SparseTensor>* result) {
   std::vector<Tensor> output_indices;
   std::vector<Tensor> output_values;
   std::vector<TensorShape> output_shapes;
@@ -601,17 +591,15 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
   const int split_dim_size = input_tensor.shape()[split_dim];
   const int split_size = split_dim_size / num_split;
 
-  if (!(num_split > 0 && num_split <= split_dim_size) && status != nullptr) {
-    *status = Status(error::INVALID_ARGUMENT,
-                     strings::StrCat("num_split must be in the interval (0, ",
-                                     split_dim_size, "]"));
-    return {};
+  if (!(num_split > 0 && num_split <= split_dim_size)) {
+    return Status(error::INVALID_ARGUMENT,
+                  strings::StrCat("num_split must be in the interval (0, ",
+                                  split_dim_size, "]"));
   }
   if (!(split_dim >= 0 && split_dim < num_dim)) {
-    *status = Status(
+    return Status(
         error::INVALID_ARGUMENT,
         strings::StrCat("num_dim must be in the interval [0, ", num_dim, ")"));
-    return {};
   }
 
   const int residual = split_dim_size % num_split;
@@ -649,28 +637,18 @@ std::vector<SparseTensor> SparseTensor::Split(const SparseTensor& input_tensor,
     }
   }
 
-  std::vector<SparseTensor> output_tensors;
-  output_tensors.reserve(num_split);
+  result->clear();
+  result->reserve(num_split);
   for (int i = 0; i < num_split; ++i) {
     SparseTensor tensor;
     Status create_status =
         Create(output_indices[i], output_values[i], output_shapes[i], &tensor);
-    if (!create_status.ok() && status != nullptr) {
-      *status = create_status;
-      return {};
+    if (!create_status.ok()) {
+      return create_status;
     }
-    output_tensors.push_back(std::move(tensor));
+    result->push_back(std::move(tensor));
   }
-  return output_tensors;
-}
-
-template <typename T>
-Status SparseTensor::Split(const SparseTensor& input_tensor,
-                           const int split_dim, const int num_split,
-                           std::vector<SparseTensor>* result) {
-  Status status;
-  *result = Split<T>(input_tensor, split_dim, num_split, &status);
-  return status;
+  return Status::OK();
 }
 
 template <typename T>
diff --git a/tensorflow/core/util/stats_calculator.cc b/tensorflow/core/util/stats_calculator.cc
index eb077546501327c62aff5c9d68eb5d0ba1c9aa1c..bce650f2456029b578356e572393c0ec08df2441 100644
--- a/tensorflow/core/util/stats_calculator.cc
+++ b/tensorflow/core/util/stats_calculator.cc
@@ -53,7 +53,7 @@ std::string StatsCalculator::HeaderString(const std::string& title) const {
          << " ==============================" << std::endl;
 
   InitField(stream, 24) << "[node type]";
-  InitField(stream, 9) << "[start]";
+  InitField(stream, 17) << "[start]";
   InitField(stream, 9) << "[first]";
   InitField(stream, 9) << "[avg ms]";
   InitField(stream, 8) << "[%]";
@@ -77,7 +77,7 @@ std::string StatsCalculator::ColumnString(const Detail& detail,
 
   std::stringstream stream;
   InitField(stream, 24) << detail.type;
-  InitField(stream, 9) << start_ms;
+  InitField(stream, 17) << start_ms;
   InitField(stream, 9) << first_time_ms;
   InitField(stream, 9) << avg_time_ms;
   InitField(stream, 7) << percentage << "%";
diff --git a/tensorflow/core/util/strided_slice_op.cc b/tensorflow/core/util/strided_slice_op.cc
index ad8a44a518489b3b60738df9902d395666afc96b..55688e580848e42bdd453a270a530a5423fb3aec 100644
--- a/tensorflow/core/util/strided_slice_op.cc
+++ b/tensorflow/core/util/strided_slice_op.cc
@@ -83,10 +83,17 @@ static Status TF_MUST_USE_RESULT BuildDenseSpec(
   {
     int full_index = 0;
 
-    const auto& strides_flat = sparse.strides_tensor.flat<T>();
+    const T* const strides_flat = sparse.strides_tensor.vec<T>().data();
     dense->begin_valid = sparse.begin_tensor != nullptr;
     dense->end_valid = sparse.end_tensor != nullptr;
 
+    const T* const begin_flat = sparse.begin_tensor != nullptr
+                                    ? sparse.begin_tensor->vec<T>().data()
+                                    : nullptr;
+    const T* const end_flat = sparse.end_tensor != nullptr
+                                  ? sparse.end_tensor->vec<T>().data()
+                                  : nullptr;
+
     for (int i = 0; i < sparse.dims; i++) {
       if ((1 << i) & sparse.ellipsis_mask) {
         // Expand the ellipsis into the appropriate indices
@@ -112,16 +119,14 @@ static Status TF_MUST_USE_RESULT BuildDenseSpec(
         }
 
         // Gather slicing spec into appropriate index
-        if (sparse.begin_tensor != nullptr) {
-          const auto& begin_flat = sparse.begin_tensor->flat<T>();
-          dense->begin[full_index] = internal::SubtleMustCopy<T>(begin_flat(i));
+        if (begin_flat != nullptr) {
+          dense->begin[full_index] = internal::SubtleMustCopy<T>(begin_flat[i]);
         }
-        if (sparse.end_tensor != nullptr) {
-          const auto& end_flat = sparse.end_tensor->flat<T>();
-          dense->end[full_index] = internal::SubtleMustCopy<T>(end_flat(i));
+        if (end_flat != nullptr) {
+          dense->end[full_index] = internal::SubtleMustCopy<T>(end_flat[i]);
         }
         dense->strides[full_index] =
-            internal::SubtleMustCopy<T>(strides_flat(i));
+            internal::SubtleMustCopy<T>(strides_flat[i]);
         if (sparse.begin_mask & (1 << i)) {
           dense->begin_mask |= (1 << full_index);
         }
diff --git a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
index 2dcb57a1f9bd22eeee746debb32e08551ef2d6ec..3709ee5ae30f9a01652c98b5188ca3229109c1a1 100644
--- a/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
+++ b/tensorflow/core/util/tensor_bundle/tensor_bundle.cc
@@ -785,7 +785,7 @@ Status BundleReader::GetBundleEntryProto(StringPiece key,
   TF_RETURN_IF_ERROR(
       ParseEntryProto(iter_->key(), iter_->value(), &entry_copy));
   if (!TensorShape::IsValid(entry_copy.shape())) {
-    return errors::DataLoss("Invaid tensor shape: ", key, " ",
+    return errors::DataLoss("Invalid tensor shape: ", key, " ",
                             ProtoShortDebugString(entry_copy.shape()));
   }
 
@@ -895,7 +895,7 @@ Status BundleReader::ReadCurrent(Tensor* val) {
   BundleEntryProto entry;
   TF_RETURN_IF_ERROR(ParseEntryProto(iter_->key(), iter_->value(), &entry));
   if (!TensorShape::IsValid(entry.shape())) {
-    return errors::DataLoss("Invaid tensor shape: ", iter_->key(), " ",
+    return errors::DataLoss("Invalid tensor shape: ", iter_->key(), " ",
                             ProtoShortDebugString(entry.shape()));
   }
 
diff --git a/tensorflow/core/util/tensor_format.h b/tensorflow/core/util/tensor_format.h
index b0c349dd907b71f1a33854930802e1692b3cfb69..a296fb447e252e62809aeb17d9d00cf35ad15fc9 100644
--- a/tensorflow/core/util/tensor_format.h
+++ b/tensorflow/core/util/tensor_format.h
@@ -498,7 +498,8 @@ inline TensorShape ShapeFromFormat(TensorFormat format, int64 N,
   dim_sizes[GetTensorBatchDimIndex(dims, format)] = N;
   for (int dim = 0; static_cast<size_t>(dim) < spatial.size(); dim++) {
     auto dim_size = spatial[dim];
-    if (format == FORMAT_NHWC_VECT_W && dim == spatial.size() - 1) {
+    if (format == FORMAT_NHWC_VECT_W &&
+        static_cast<size_t>(dim) == spatial.size() - 1) {
       CHECK_EQ(0, dim_size % 4)
           << "FORMAT_NHWC_VECT_W requires W to be a multiple of 4, but W="
           << dim_size;
diff --git a/tensorflow/core/util/tensor_ops_util.h b/tensorflow/core/util/tensor_ops_util.h
new file mode 100644
index 0000000000000000000000000000000000000000..615f088a9b9e4dfce918473cb5e0ef8c9e551230
--- /dev/null
+++ b/tensorflow/core/util/tensor_ops_util.h
@@ -0,0 +1,128 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
+#define TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
+
+#define EIGEN_USE_THREADS
+
+#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
+#include "tensorflow/core/framework/op_kernel.h"
+#include "tensorflow/core/framework/register_types.h"
+#include "tensorflow/core/framework/tensor.h"
+#include "tensorflow/core/framework/types.h"
+#include "tensorflow/core/framework/variant_op_registry.h"
+#include "tensorflow/core/lib/core/status.h"
+
+namespace tensorflow {
+
+typedef Eigen::ThreadPoolDevice CPUDevice;
+typedef Eigen::GpuDevice GPUDevice;
+
+template <typename Device>
+Status ZerosLikeTensor(OpKernelContext* ctx, const Tensor& x, Tensor* out) {
+  AllocatorAttributes attr;
+  if (x.dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(x.dtype(), x.shape(), out, attr));
+
+  switch (out->dtype()) {
+#define DTYPE_CASE(dtype)                                       \
+  case DataTypeToEnum<dtype>::value:                            \
+    /* TODO(skyewm): use SetZeroFunctor like in ZerosLikeOp? */ \
+    out->flat<dtype>().device(ctx->eigen_device<Device>()) =    \
+        out->flat<dtype>().constant(dtype(0));                  \
+    break;
+
+    TF_CALL_POD_TYPES(DTYPE_CASE)
+#undef DTYPE_CASE
+
+    case DT_INVALID: {
+      *out = Tensor(DT_INVALID);
+      break;
+    }
+    case DataTypeToEnum<Variant>::value: {
+      Variant* out_variant = out->scalar<Variant>().data();
+      TF_RETURN_IF_ERROR(
+          UnaryOpVariant<Device>(ctx, ZEROS_LIKE_VARIANT_UNARY_OP,
+                                 x.scalar<Variant>()(), out_variant));
+      break;
+    }
+    default:
+      return errors::InvalidArgument(
+          "Trying to compute zeros_like for unsupported dtype ",
+          DataTypeString(out->dtype()));
+  }
+  return Status::OK();
+}
+
+template <typename Device>
+Status BinaryAddTensors(OpKernelContext* ctx, const Tensor& a, const Tensor& b,
+                        Tensor* out) {
+  if (a.dtype() == DT_INVALID) {
+    *out = b;
+    return Status::OK();
+  }
+  if (b.dtype() == DT_INVALID) {
+    *out = a;
+    return Status::OK();
+  }
+  if (a.dtype() != b.dtype()) {
+    return errors::InvalidArgument(
+        "Trying to add two tensors with incompatible element types. ",
+        "One is ", DataTypeString(a.dtype()), " and the other is ",
+        DataTypeString(b.dtype()));
+  }
+  if (a.shape() != b.shape()) {
+    // TODO(apassos) support broadcasting additions here?
+    return errors::InvalidArgument(
+        "Trying to add two tensors with incompatible element shapes. ",
+        "One is ", a.shape().DebugString(), " and the other is ",
+        b.shape().DebugString());
+  }
+
+  AllocatorAttributes attr;
+  if (a.dtype() == DT_VARIANT) {
+    attr.set_on_host(true);
+  }
+  TF_RETURN_IF_ERROR(ctx->allocate_temp(a.dtype(), a.shape(), out, attr));
+
+  switch (out->dtype()) {
+#define DTYPE_CASE(dtype)                                    \
+  case DataTypeToEnum<dtype>::value:                         \
+    out->flat<dtype>().device(ctx->eigen_device<Device>()) = \
+        a.flat<dtype>() + b.flat<dtype>();                   \
+    break;
+
+    TF_CALL_NUMBER_TYPES(DTYPE_CASE)
+#undef DTYPE_CASE
+
+    case DataTypeToEnum<Variant>::value: {
+      Variant* out_variant = out->scalar<Variant>().data();
+      TF_RETURN_IF_ERROR(BinaryOpVariants<Device>(
+          ctx, ADD_VARIANT_BINARY_OP, a.scalar<Variant>()(),
+          b.scalar<Variant>()(), out_variant));
+      break;
+    }
+    default:
+      return errors::InvalidArgument("Trying to add unsupported dtype ",
+                                     out->dtype());
+  }
+  return Status::OK();
+}
+
+}  // namespace tensorflow
+
+#endif  // TENSORFLOW_CORE_UTIL_TENSOR_OPS_UTIL_H_
diff --git a/tensorflow/examples/adding_an_op/fact_test.py b/tensorflow/examples/adding_an_op/fact_test.py
index 11163e7ba5c6421554afa0486f4c102d0743e5e2..46beaebe0cc01d2fea29defa6a58573a45ec091b 100644
--- a/tensorflow/examples/adding_an_op/fact_test.py
+++ b/tensorflow/examples/adding_an_op/fact_test.py
@@ -19,10 +19,12 @@ from __future__ import division
 from __future__ import print_function
 
 import tensorflow as tf
+from tensorflow.python.framework import test_util
 
 
 class FactTest(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       print(tf.user_ops.my_fact().eval())
diff --git a/tensorflow/examples/adding_an_op/zero_out_1_test.py b/tensorflow/examples/adding_an_op/zero_out_1_test.py
index 342d3a020cc325de4991b1f620f4cd2110ed0906..459ac2dc279ef6adfd7bbc1773fa3745c15ca35d 100644
--- a/tensorflow/examples/adding_an_op/zero_out_1_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_1_test.py
@@ -23,10 +23,12 @@ import os.path
 
 import tensorflow as tf
 from tensorflow.examples.adding_an_op import zero_out_op_1
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut1Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_1.zero_out([5, 4, 3, 2, 1])
diff --git a/tensorflow/examples/adding_an_op/zero_out_2_test.py b/tensorflow/examples/adding_an_op/zero_out_2_test.py
index 45045978176a65fb7aaacd4c8d6f1b209f6e82ac..650fd9546b5501f603306d935ade8d08b86b133a 100644
--- a/tensorflow/examples/adding_an_op/zero_out_2_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_2_test.py
@@ -24,20 +24,24 @@ import tensorflow as tf
 
 from tensorflow.examples.adding_an_op import zero_out_grad_2  # pylint: disable=unused-import
 from tensorflow.examples.adding_an_op import zero_out_op_2
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut2Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_2.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def test_2d(self):
     with self.cached_session():
       result = zero_out_op_2.zero_out([[6, 5, 4], [3, 2, 1]])
       self.assertAllEqual(result.eval(), [[6, 0, 0], [0, 0, 0]])
 
+  @test_util.run_deprecated_v1
   def test_grad(self):
     with self.cached_session():
       shape = (5,)
@@ -46,6 +50,7 @@ class ZeroOut2Test(tf.test.TestCase):
       err = tf.test.compute_gradient_error(x, shape, y, shape)
       self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def test_grad_2d(self):
     with self.cached_session():
       shape = (2, 3)
diff --git a/tensorflow/examples/adding_an_op/zero_out_3_test.py b/tensorflow/examples/adding_an_op/zero_out_3_test.py
index 15d62495aaee769f8aad79b844e3bb9b0a1e0df2..8cbe2b6793a436be7f3b954e64dd85d8ae5c891f 100644
--- a/tensorflow/examples/adding_an_op/zero_out_3_test.py
+++ b/tensorflow/examples/adding_an_op/zero_out_3_test.py
@@ -21,31 +21,36 @@ from __future__ import print_function
 
 import tensorflow as tf
 from tensorflow.examples.adding_an_op import zero_out_op_3
+from tensorflow.python.framework import test_util
 
 
 class ZeroOut3Test(tf.test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1])
       self.assertAllEqual(result.eval(), [5, 0, 0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def testAttr(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=3)
       self.assertAllEqual(result.eval(), [0, 0, 0, 2, 0])
 
+  @test_util.run_deprecated_v1
   def testNegative(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=-1)
       with self.assertRaisesOpError("Need preserve_index >= 0, got -1"):
-        result.eval()
+        self.evaluate(result)
 
+  @test_util.run_deprecated_v1
   def testLarge(self):
     with self.cached_session():
       result = zero_out_op_3.zero_out([5, 4, 3, 2, 1], preserve_index=17)
       with self.assertRaisesOpError("preserve_index out of range"):
-        result.eval()
+        self.evaluate(result)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/examples/autograph/integration_tests/BUILD b/tensorflow/examples/autograph/integration_tests/BUILD
index d20c17b63b923458952dbfdb1e07e808cf6a36ff..2a4a0f75e7a120d554c882025ad2a0e280913a6d 100644
--- a/tensorflow/examples/autograph/integration_tests/BUILD
+++ b/tensorflow/examples/autograph/integration_tests/BUILD
@@ -22,7 +22,6 @@ py_test(
         "keras_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
@@ -34,7 +33,6 @@ py_test(
         "list_literals_test.py",
     ],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         "//tensorflow:tensorflow_py",
     ],
diff --git a/tensorflow/examples/autograph/integration_tests/keras_test.py b/tensorflow/examples/autograph/integration_tests/keras_test.py
index dca7c07b470498394593756a93a69af48c4ece43..3fe33df920d008845bfd1002075fd6b5dc25b31f 100644
--- a/tensorflow/examples/autograph/integration_tests/keras_test.py
+++ b/tensorflow/examples/autograph/integration_tests/keras_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.python import autograph
+from tensorflow.python.framework import test_util
 
 
 class MinimalKeras(tf.keras.Model):
@@ -84,6 +85,7 @@ class KerasTest(tf.test.TestCase):
     model = ModelWithStaticConditional(True)
     self.assertEqual(model.call(), 25)
 
+  @test_util.run_deprecated_v1
   def test_recursive_true(self):
     with self.assertRaisesRegexp(NotImplementedError,
                                  'Object conversion is not yet supported.'):
@@ -93,10 +95,10 @@ class KerasTest(tf.test.TestCase):
         init = tf.global_variables_initializer()
 
         with tf.Session() as sess:
-          sess.run(init)
+          self.evaluate(init)
           sample_input = tf.random_uniform((1, 10, 10, 1))
           output = model(sample_input)  # pylint: disable=not-callable
-          self.assertEqual(sess.run(output).shape, (1, 3))
+          self.assertEqual(self.evaluate(output).shape, (1, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/autograph/integration_tests/list_literals_test.py b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
index 917f5ff9d849d131d18e7e6c748d9c679b1b119e..e85d4abcfc9adfbb4bc6390589b846f7e59f3739 100644
--- a/tensorflow/examples/autograph/integration_tests/list_literals_test.py
+++ b/tensorflow/examples/autograph/integration_tests/list_literals_test.py
@@ -34,7 +34,7 @@ class ListLiteralsTest(tf.test.TestCase):
     result = converted()
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(result), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(result), [1, 2, 3])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/get_started/regression/custom_regression.py b/tensorflow/examples/get_started/regression/custom_regression.py
index 2e34362c5ced96ac6aec5a9258519bb49ef9157d..7b7cbb78666f0de5e77858b79eda721adc493ecb 100644
--- a/tensorflow/examples/get_started/regression/custom_regression.py
+++ b/tensorflow/examples/get_started/regression/custom_regression.py
@@ -100,12 +100,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/dnn_regression.py b/tensorflow/examples/get_started/regression/dnn_regression.py
index 951c93b52e73a8e7f4497e9c4b0e91038de85620..94669a5082b26cac79e2879da43cc8aa6e5e83d0 100644
--- a/tensorflow/examples/get_started/regression/dnn_regression.py
+++ b/tensorflow/examples/get_started/regression/dnn_regression.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The first way assigns a unique weight to each category. To do this you must
   # specify the category's vocabulary (values outside this specification will
diff --git a/tensorflow/examples/get_started/regression/linear_regression_categorical.py b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
index e2ad415fbcb161a599cff7d123597e5156d11770..5312272a9592973e757e6cdd5a2305c0c04372a9 100644
--- a/tensorflow/examples/get_started/regression/linear_regression_categorical.py
+++ b/tensorflow/examples/get_started/regression/linear_regression_categorical.py
@@ -45,12 +45,11 @@ def main(argv):
         # that the examples are well mixed.
         train.shuffle(1000).batch(128)
         # Repeat forever
-        .repeat().make_one_shot_iterator().get_next())
+        .repeat())
 
   # Build the validation input_fn.
   def input_test():
-    return (test.shuffle(1000).batch(128)
-            .make_one_shot_iterator().get_next())
+    return test.shuffle(1000).batch(128)
 
   # The following code demonstrates two of the ways that `feature_columns` can
   # be used to build a model with categorical inputs.
diff --git a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
index 740224744860fdd76bea9c4531242a4976b20784..5c52a2c8461660e19ef6e98c01a6a58a3f3c0920 100644
--- a/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
+++ b/tensorflow/examples/how_tos/reading_data/fully_connected_reader.py
@@ -126,7 +126,7 @@ def inputs(train, batch_size, num_epochs):
     dataset = dataset.repeat(num_epochs)
     dataset = dataset.batch(batch_size)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)
   return iterator.get_next()
 
 
diff --git a/tensorflow/examples/learn/iris_custom_decay_dnn.py b/tensorflow/examples/learn/iris_custom_decay_dnn.py
index 4a219694d10ef075e0e0403cdd7ed100c39ddadd..73bf20fada488a818471f47c6f0b5f0d6073ce25 100644
--- a/tensorflow/examples/learn/iris_custom_decay_dnn.py
+++ b/tensorflow/examples/learn/iris_custom_decay_dnn.py
@@ -76,12 +76,12 @@ def main(unused_argv):
   classifier = tf.estimator.Estimator(model_fn=my_model)
 
   # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
   classifier.train(input_fn=train_input_fn, steps=1000)
 
   # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+  test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
diff --git a/tensorflow/examples/learn/iris_custom_model.py b/tensorflow/examples/learn/iris_custom_model.py
index c6bdb86ba52b9715b977909d9b7d0fbc59161a53..bf34d72ba07860569183f3eec49fa29f1d577cbf 100644
--- a/tensorflow/examples/learn/iris_custom_model.py
+++ b/tensorflow/examples/learn/iris_custom_model.py
@@ -73,12 +73,12 @@ def main(unused_argv):
   classifier = tf.estimator.Estimator(model_fn=my_model)
 
   # Train.
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_train}, y=y_train, num_epochs=None, shuffle=True)
   classifier.train(input_fn=train_input_fn, steps=1000)
 
   # Predict.
-  test_input_fn = tf.estimator.inputs.numpy_input_fn(
+  test_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={X_FEATURE: x_test}, y=y_test, num_epochs=1, shuffle=False)
   predictions = classifier.predict(input_fn=test_input_fn)
   y_predicted = np.array(list(p['class'] for p in predictions))
diff --git a/tensorflow/examples/speech_commands/freeze_test.py b/tensorflow/examples/speech_commands/freeze_test.py
index 0c7ca9bc011886f4b8155b7f1d876ce183221ad4..9ed9050035baee7081ff7413c1c2fc41b86c607d 100644
--- a/tensorflow/examples/speech_commands/freeze_test.py
+++ b/tensorflow/examples/speech_commands/freeze_test.py
@@ -19,11 +19,13 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.examples.speech_commands import freeze
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FreezeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCreateInferenceGraphWithMfcc(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
@@ -43,6 +45,7 @@ class FreezeTest(test.TestCase):
       ops = [node.op for node in sess.graph_def.node]
       self.assertEqual(1, ops.count('Mfcc'))
 
+  @test_util.run_deprecated_v1
   def testCreateInferenceGraphWithoutMfcc(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
@@ -62,6 +65,7 @@ class FreezeTest(test.TestCase):
       ops = [node.op for node in sess.graph_def.node]
       self.assertEqual(0, ops.count('Mfcc'))
 
+  @test_util.run_deprecated_v1
   def testFeatureBinCount(self):
     with self.cached_session() as sess:
       freeze.create_inference_graph(
diff --git a/tensorflow/examples/speech_commands/input_data_test.py b/tensorflow/examples/speech_commands/input_data_test.py
index b766ba6de0de93fa160b3464e5a860b5f665a76d..9269bb6c0bc780e06ee0c42617478e3a1486100e 100644
--- a/tensorflow/examples/speech_commands/input_data_test.py
+++ b/tensorflow/examples/speech_commands/input_data_test.py
@@ -26,6 +26,7 @@ import tensorflow as tf
 from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
 from tensorflow.examples.speech_commands import input_data
 from tensorflow.examples.speech_commands import models
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -35,7 +36,7 @@ class InputDataTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([32000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
@@ -96,6 +97,7 @@ class InputDataTest(test.TestCase):
         input_data.which_set("foo_nohash_0.wav", 10, 10),
         input_data.which_set("foo_nohash_1.wav", 10, 10))
 
+  @test_util.run_deprecated_v1
   def testPrepareDataIndex(self):
     tmp_dir = self.get_temp_dir()
     self._saveWavFolders(tmp_dir, ["a", "b", "c"], 100)
@@ -125,6 +127,7 @@ class InputDataTest(test.TestCase):
                                     10, self._model_settings(), tmp_dir)
     self.assertTrue("Expected to find" in str(e.exception))
 
+  @test_util.run_deprecated_v1
   def testPrepareBackgroundData(self):
     tmp_dir = self.get_temp_dir()
     background_dir = os.path.join(tmp_dir, "_background_noise_")
@@ -156,6 +159,7 @@ class InputDataTest(test.TestCase):
     self.assertIsNotNone(loaded_data)
     self.assertEqual(16000, len(loaded_data))
 
+  @test_util.run_deprecated_v1
   def testPrepareProcessingGraph(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
@@ -186,15 +190,19 @@ class InputDataTest(test.TestCase):
     self.assertIsNotNone(audio_processor.background_volume_placeholder_)
     self.assertIsNotNone(audio_processor.output_)
 
+  @test_util.run_deprecated_v1
   def testGetDataAverage(self):
     self._runGetDataTest("average", 10)
 
+  @test_util.run_deprecated_v1
   def testGetDataAverageLongWindow(self):
     self._runGetDataTest("average", 30)
 
+  @test_util.run_deprecated_v1
   def testGetDataMfcc(self):
     self._runGetDataTest("mfcc", 30)
 
+  @test_util.run_deprecated_v1
   def testGetUnprocessedData(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
@@ -216,6 +224,7 @@ class InputDataTest(test.TestCase):
     self.assertEqual(10, len(result_data))
     self.assertEqual(10, len(result_labels))
 
+  @test_util.run_deprecated_v1
   def testGetFeaturesForWav(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
diff --git a/tensorflow/examples/speech_commands/label_wav_test.py b/tensorflow/examples/speech_commands/label_wav_test.py
index f0af2a4798785d53fe937fde45dbc9c9d67acfbc..77a88f98e165758994ddbbd21acab8823dcf5686 100644
--- a/tensorflow/examples/speech_commands/label_wav_test.py
+++ b/tensorflow/examples/speech_commands/label_wav_test.py
@@ -33,7 +33,7 @@ class LabelWavTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([1000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
diff --git a/tensorflow/examples/speech_commands/models_test.py b/tensorflow/examples/speech_commands/models_test.py
index 04478c09626f565e7d439afb45999f587da050ab..cb9304eab8df47800145e14e7e28c739af44292b 100644
--- a/tensorflow/examples/speech_commands/models_test.py
+++ b/tensorflow/examples/speech_commands/models_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.examples.speech_commands import models
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -47,6 +48,7 @@ class ModelsTest(test.TestCase):
             feature_bin_count=40,
             preprocess="mfcc"))
 
+  @test_util.run_deprecated_v1
   def testCreateModelConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -58,6 +60,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
       self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelConvInference(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -67,6 +70,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(logits)
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelLowLatencyConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -78,6 +82,7 @@ class ModelsTest(test.TestCase):
       self.assertIsNotNone(sess.graph.get_tensor_by_name(logits.name))
       self.assertIsNotNone(sess.graph.get_tensor_by_name(dropout_prob.name))
 
+  @test_util.run_deprecated_v1
   def testCreateModelFullyConnectedTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
@@ -98,6 +103,7 @@ class ModelsTest(test.TestCase):
                             "bad_architecture", True)
       self.assertTrue("not recognized" in str(e.exception))
 
+  @test_util.run_deprecated_v1
   def testCreateModelTinyConvTraining(self):
     model_settings = self._modelSettings()
     with self.cached_session() as sess:
diff --git a/tensorflow/examples/speech_commands/wav_to_features_test.py b/tensorflow/examples/speech_commands/wav_to_features_test.py
index 87f298769390a7e9b3d3e8bada70770ba7452172..6234490b26760c99e3184cfc9a51b56169ec63bb 100644
--- a/tensorflow/examples/speech_commands/wav_to_features_test.py
+++ b/tensorflow/examples/speech_commands/wav_to_features_test.py
@@ -24,6 +24,7 @@ import tensorflow as tf
 
 from tensorflow.contrib.framework.python.ops import audio_ops as contrib_audio
 from tensorflow.examples.speech_commands import wav_to_features
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -33,7 +34,7 @@ class WavToFeaturesTest(test.TestCase):
     with self.cached_session() as sess:
       sample_data = tf.zeros([32000, 2])
       wav_encoder = contrib_audio.encode_wav(sample_data, 16000)
-      wav_data = sess.run(wav_encoder)
+      wav_data = self.evaluate(wav_encoder)
     return wav_data
 
   def _saveTestWavFile(self, filename, wav_data):
@@ -49,6 +50,7 @@ class WavToFeaturesTest(test.TestCase):
         file_path = os.path.join(dir_name, "some_audio_%d.wav" % i)
         self._saveTestWavFile(file_path, wav_data)
 
+  @test_util.run_deprecated_v1
   def testWavToFeatures(self):
     tmp_dir = self.get_temp_dir()
     wav_dir = os.path.join(tmp_dir, "wavs")
diff --git a/tensorflow/examples/tutorials/layers/cnn_mnist.py b/tensorflow/examples/tutorials/layers/cnn_mnist.py
index 1e8d7d05e1c6af08d788857e74c04134333d019c..670e929236f26363fb1682e8e9576543cc27fb38 100644
--- a/tensorflow/examples/tutorials/layers/cnn_mnist.py
+++ b/tensorflow/examples/tutorials/layers/cnn_mnist.py
@@ -134,7 +134,7 @@ def main(unused_argv):
       tensors=tensors_to_log, every_n_iter=50)
 
   # Train the model
-  train_input_fn = tf.estimator.inputs.numpy_input_fn(
+  train_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
       x={"x": train_data},
       y=train_labels,
       batch_size=100,
@@ -146,11 +146,8 @@ def main(unused_argv):
       hooks=[logging_hook])
 
   # Evaluate the model and print results
-  eval_input_fn = tf.estimator.inputs.numpy_input_fn(
-      x={"x": eval_data},
-      y=eval_labels,
-      num_epochs=1,
-      shuffle=False)
+  eval_input_fn = tf.compat.v1.estimator.inputs.numpy_input_fn(
+      x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
   eval_results = mnist_classifier.evaluate(input_fn=eval_input_fn)
   print(eval_results)
 
diff --git a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
index 7967e22d6a0319a530cb2f00e54872f022ac0095..1854e84d490d6c2ff462ee3bc3cc57b48c4d9328 100644
--- a/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
+++ b/tensorflow/examples/tutorials/mnist/mnist_with_summaries.py
@@ -183,7 +183,8 @@ def main(_):
   if tf.gfile.Exists(FLAGS.log_dir):
     tf.gfile.DeleteRecursively(FLAGS.log_dir)
   tf.gfile.MakeDirs(FLAGS.log_dir)
-  train()
+  with tf.Graph().as_default():
+    train()
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
index 4a429837b7b997f0f6571060280a9a15543b9f54..464484dab830e73fbc11cc9a2bfd9310bac88653 100644
--- a/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
+++ b/tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/platform/logging.h"
 #include "tensorflow/core/platform/types.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 
 using tensorflow::DT_FLOAT;
 using tensorflow::DT_UINT8;
diff --git a/tensorflow/go/README.md b/tensorflow/go/README.md
index 3989f9b25a4f5f47dd235ba55da9a20dae5e7ff4..31bba1ffbfae1d6ae2ae2b106b262486ff3b56a7 100644
--- a/tensorflow/go/README.md
+++ b/tensorflow/go/README.md
@@ -23,8 +23,8 @@ from source.
 
 -   [bazel](https://www.bazel.build/versions/master/docs/install.html)
 -   Environment to build TensorFlow from source code
-    ([Linux of macOS](https://www.tensorflow.org/install/source)).
-    If you don't need GPU support, then try the following:
+    ([Linux or macOS](https://www.tensorflow.org/install/source)). If you don't
+    need GPU support, then try the following:
 
     ```sh
     sudo apt-get install python swig python-numpy # Linux
diff --git a/tensorflow/go/graph.go b/tensorflow/go/graph.go
index 67e42aa961b75e3639a9605af7d3a0e93a25617d..6ff41ca916930f52f660a08e0089dc9f7f1a8e24 100644
--- a/tensorflow/go/graph.go
+++ b/tensorflow/go/graph.go
@@ -112,9 +112,17 @@ func (g *Graph) ImportWithOptions(def []byte, options GraphImportOptions) error
 	C.TF_ImportGraphDefOptionsSetPrefix(opts, cprefix)
 
 	if len(options.Device) != 0 {
-		cdev := C.CString(options.Device)
-		defer C.free(unsafe.Pointer(cdev))
-		C.TF_ImportGraphDefOptionsSetDefaultDevice(opts, cdev)
+		// TODO(ashankar): Remove this error and uncomment below
+		// when a release of the C library which includes
+		// https://github.com/tensorflow/tensorflow/commit/e0af5ac53e5a8ad9b07cdd5738c0a8e12f938c4e
+		// has been made.
+		// See https://github.com/tensorflow/tensorflow/issues/23257
+		return fmt.Errorf("GraphImportOptions.Device is only supported with the TensorFlow C library versions after 1.12 (or built from master). See https://github.com/tensorflow/tensorflow/issues/23257")
+		/*
+			cdev := C.CString(options.Device)
+			defer C.free(unsafe.Pointer(cdev))
+			C.TF_ImportGraphDefOptionsSetDefaultDevice(opts, cdev)
+		*/
 	}
 
 	buf := C.TF_NewBuffer()
@@ -174,6 +182,68 @@ func (g *Graph) Operations() []Operation {
 	return ops
 }
 
+// AddGradients adds operations to compute the partial derivatives of the sum of tensors in y
+// with respect to tensors in x, i.e., d(y[0] + y[1] + ...) / d x[0], d(y[0] + y[1] + ... ) / d x[1] etc.
+//
+// prefix, if non-empty, is the name prefix used for all operations added to the graph to compute
+// these gradients.
+func (g *Graph) AddGradients(prefix string, y []Output, x []Output, dx []Output) ([]Output, error) {
+	var (
+		cprefix *C.char
+
+		cy  = make([]C.TF_Output, len(y))
+		cx  = make([]C.TF_Output, len(x))
+		cdx = make([]C.TF_Output, len(dx))
+		cdy = make([]C.TF_Output, len(x))
+
+		pcy  *C.TF_Output
+		pcx  *C.TF_Output
+		pcdx *C.TF_Output
+		pcdy *C.TF_Output
+
+		status = newStatus()
+	)
+
+	if len(y) > 0 {
+		pcy = &cy[0]
+		for i, o := range y {
+			cy[i] = o.c()
+		}
+	}
+	if len(x) > 0 {
+		pcx = &cx[0]
+		for i, o := range x {
+			cx[i] = o.c()
+		}
+		pcdy = &cdy[0]
+	}
+	if len(dx) > 0 {
+		pcdx = &cdx[0]
+		for i, o := range dx {
+			cdx[i] = o.c()
+		}
+	}
+
+	// If prefix is "", the C.TF_AddGradientsWithPrefix need cprefix to be nil but not ""
+	if len(prefix) != 0 {
+		cprefix = C.CString(prefix)
+		defer C.free(unsafe.Pointer(cprefix))
+	}
+
+	C.TF_AddGradientsWithPrefix(g.c, cprefix, pcy, C.int(len(y)), pcx, C.int(len(x)), pcdx, status.c, pcdy)
+
+	if err := status.Err(); err != nil {
+		return nil, err
+	}
+	dy := make([]Output, len(x))
+	for i, co := range cdy {
+		op := &Operation{co.oper, g}
+		dy[i] = Output{op, int(co.index)}
+	}
+
+	return dy, nil
+}
+
 // OpSpec is the specification of an Operation to be added to a Graph
 // (using Graph.AddOperation).
 type OpSpec struct {
diff --git a/tensorflow/go/graph_test.go b/tensorflow/go/graph_test.go
index b8d65c54f697153ad236f5e27d9f27d048c3a22e..067c7db5c3cd9c880e6f257b199c0742178d29fd 100644
--- a/tensorflow/go/graph_test.go
+++ b/tensorflow/go/graph_test.go
@@ -19,6 +19,7 @@ package tensorflow
 import (
 	"bytes"
 	"fmt"
+	"strings"
 	"testing"
 )
 
@@ -80,3 +81,260 @@ func TestGraphWriteToAndImport(t *testing.T) {
 		t.Error(err)
 	}
 }
+
+func TestGraphAddGradients(t *testing.T) {
+	g := NewGraph()
+	x1, err := Placeholder(g, "x1", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	x2, err := Placeholder(g, "x2", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x1},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y1 := op1.Output(0)
+	op2, err := g.AddOperation(OpSpec{
+		Type:  "AddN",
+		Input: []Input{OutputList([]Output{y0, x2})},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y2 := op2.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y1}, []Output{x1}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), Float)
+	}
+
+	grads1, err := g.AddGradients("", []Output{y2}, []Output{x1, x2}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 2 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), Float)
+	}
+	if grads1[1].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[1].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c1, _ := NewTensor(float32(3.0))
+	c2, _ := NewTensor(float32(2.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x1: c1, x2: c2},
+		[]Output{grads0[0], grads1[0], grads1[1]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(outputs) != 3 {
+		t.Fatal(len(outputs))
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+	if outputs[1].Value().(float32) != 6.0 {
+		t.Fatalf("Got %v, wanted float 6.0", outputs[1].Value())
+	}
+	if outputs[2].Value().(float32) != 1.0 {
+		t.Fatalf("Got %v, wanted float 1.0", outputs[2].Value())
+	}
+}
+
+func TestGraphAddGradientsSums(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	y1 := op1.Output(0)
+
+	grad, err := g.AddGradients("", []Output{y0, y1}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grad) != 1 {
+		t.Fatal(len(grad))
+	}
+	if grad[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grad[0].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x: c},
+		[]Output{grad[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 114.0 {
+		t.Fatalf("Got %v, wanted float 114.0", outputs[0].Value())
+	}
+}
+
+func TestGraphAddGradientsWithInitialValues(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+	op1, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y1",
+		Input: []Input{y0},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y1 := op1.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y1}, []Output{y0}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), Float)
+	}
+
+	grads1, err := g.AddGradients("", []Output{y0}, []Output{x}, []Output{grads0[0]})
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 1 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), Float)
+	}
+
+	sess, err := NewSession(g, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[Output]*Tensor{x: c},
+		[]Output{grads1[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+}
+
+func TestGraphValidateGradientsNames(t *testing.T) {
+	g := NewGraph()
+	x, err := Placeholder(g, "x", Float)
+	if err != nil {
+		t.Fatal(err)
+	}
+	op0, err := g.AddOperation(OpSpec{
+		Type:  "Square",
+		Name:  "y0",
+		Input: []Input{x},
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	y0 := op0.Output(0)
+
+	grads0, err := g.AddGradients("", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads0[0].Op.Name(), "gradients/") {
+		t.Fatalf("Got name %v, wanted started with gradients/", grads0[0].Op.Name())
+	}
+
+	grads1, err := g.AddGradients("", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads1[0].Op.Name(), "gradients_1/") {
+		t.Fatalf("Got name %v, wanted started with gradients_1/", grads1[0].Op.Name())
+	}
+
+	grads2, err := g.AddGradients("more_gradients", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads2[0].Op.Name(), "more_gradients/") {
+		t.Fatalf("Got name %v, wanted started with more_gradients/", grads2[0].Op.Name())
+	}
+
+	grads3, err := g.AddGradients("even_more_gradients", []Output{y0}, []Output{x}, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads3[0].Op.Name(), "even_more_gradients/") {
+		t.Fatalf("Got name %v, wanted started with even_more_gradients/", grads3[0].Op.Name())
+	}
+
+	_, err = g.AddGradients("even_more_gradients", []Output{y0}, []Output{x}, nil)
+	if err == nil {
+		t.Error("AddGradients should have failed if gradients name is already existing")
+	}
+}
diff --git a/tensorflow/go/op/gradients.go b/tensorflow/go/op/gradients.go
new file mode 100644
index 0000000000000000000000000000000000000000..c5956789f426f4cabad11c54d6c30ca2c1fa39d7
--- /dev/null
+++ b/tensorflow/go/op/gradients.go
@@ -0,0 +1,49 @@
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package op
+
+import (
+	"fmt"
+
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
+)
+
+// Gradients adds gradients computation ops to the graph according to scope.
+//
+// Arguments:
+//  y: output of the function to derive
+//  x: inputs of the function for which partial derivatives are computed
+//  dx: if not null, the partial derivatives of some loss function L w.r.t. y
+//
+//  return the partial derivatives
+func Gradients(scope *Scope, y []tf.Output, x []tf.Output, dx ...tf.Output) (output []tf.Output) {
+	if len(scope.controlDependencies) > 0 {
+		scope.UpdateErr("Gradients", fmt.Errorf("Gradients does not currently support control dependencies (via Scope.WithControlDependencies)."))
+		return
+	}
+	if scope.device != "" {
+		scope.UpdateErr("Gradients", fmt.Errorf("Gradients does not currently support device annotations (via Scope.WithDevice)."))
+		return
+	}
+
+	var err error
+	if output, err = scope.graph.AddGradients(scope.opName("Gradients"), y, x, dx); err != nil {
+		scope.UpdateErr("Gradients", err)
+		return
+	}
+	return output
+}
diff --git a/tensorflow/go/op/gradients_test.go b/tensorflow/go/op/gradients_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..3d1d57b77eac44b5048d0b41bf63d271005c52ee
--- /dev/null
+++ b/tensorflow/go/op/gradients_test.go
@@ -0,0 +1,246 @@
+/*
+Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package op
+
+import (
+	"strings"
+	"testing"
+
+	tf "github.com/tensorflow/tensorflow/tensorflow/go"
+)
+
+func TestAddGradients(t *testing.T) {
+	var (
+		s  = NewScope()
+		x1 = Placeholder(s.SubScope("x1"), tf.Float)
+		x2 = Placeholder(s.SubScope("x2"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x1)
+		y1 = Square(s.SubScope("y1"), y0)
+		y2 = AddN(s.SubScope("y2"), []tf.Output{y0, x2})
+	)
+
+	grads0 := Gradients(s, []tf.Output{y1}, []tf.Output{x1})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), tf.Float)
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y2}, []tf.Output{x1, x2})
+	if err := sub.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 2 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), tf.Float)
+	}
+	if grads1[1].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[1].DataType(), tf.Float)
+	}
+
+	graph, err := sub.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c1, _ := tf.NewTensor(float32(3.0))
+	c2, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x1: c1, x2: c2},
+		[]tf.Output{grads0[0], grads1[0], grads1[1]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if len(outputs) != 3 {
+		t.Fatal(len(outputs))
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+	if outputs[1].Value().(float32) != 6.0 {
+		t.Fatalf("Got %v, wanted float 6.0", outputs[1].Value())
+	}
+	if outputs[2].Value().(float32) != 1.0 {
+		t.Fatalf("Got %v, wanted float 1.0", outputs[2].Value())
+	}
+}
+
+func TestAddGradientsSums(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+		y1 = Square(s.SubScope("y1"), y0)
+	)
+
+	grad := Gradients(s, []tf.Output{y0, y1}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grad) != 1 {
+		t.Fatal(len(grad))
+	}
+	if grad[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grad[0].DataType(), tf.Float)
+	}
+
+	graph, err := s.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x: c},
+		[]tf.Output{grad[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 114.0 {
+		t.Fatalf("Got %v, wanted float 114.0", outputs[0].Value())
+	}
+}
+
+func TestAddGradientsWithInitialValues(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x1"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+		y1 = Square(s.SubScope("y1"), y0)
+	)
+
+	grads0 := Gradients(s, []tf.Output{y1}, []tf.Output{y0})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads0) != 1 {
+		t.Fatal(len(grads0))
+	}
+	if grads0[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads0[0].DataType(), tf.Float)
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y0}, []tf.Output{x}, grads0[0])
+	if err := sub.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if len(grads1) != 1 {
+		t.Fatal(len(grads1))
+	}
+	if grads1[0].DataType() != tf.Float {
+		t.Fatalf("Got DataType %v, wanted %v", grads1[0].DataType(), tf.Float)
+	}
+
+	graph, err := sub.Finalize()
+	if err != nil {
+		t.Fatal(err)
+	}
+	sess, err := tf.NewSession(graph, nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	c, _ := tf.NewTensor(float32(3.0))
+	outputs, err := sess.Run(
+		map[tf.Output]*tf.Tensor{x: c},
+		[]tf.Output{grads1[0]},
+		nil)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if outputs[0].Value().(float32) != 108.0 {
+		t.Fatalf("Got %v, wanted float 108.0", outputs[0].Value())
+	}
+}
+
+func TestValidateGradientsNames(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+	)
+
+	grads0 := Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads0[0].Op.Name(), "Gradients/") {
+		t.Fatalf("Got name %v, wanted started with Gradients/", grads0[0].Op.Name())
+	}
+
+	sub := s.SubScope("sub")
+	grads1 := Gradients(sub, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err != nil {
+		t.Fatal(err)
+	}
+	if !strings.HasPrefix(grads1[0].Op.Name(), "sub/Gradients/") {
+		t.Fatalf("Got name %v, wanted started with sub/Gradients/", grads1[0].Op.Name())
+	}
+
+	Gradients(sub, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed if executed more than once for scope of the same namespace")
+	}
+}
+
+func TestAddGradientsWithControlDependencies(t *testing.T) {
+	var (
+		s        = NewScope()
+		zero     = Const(s.SubScope("zero"), int32(0))
+		x        = Placeholder(s.SubScope("x"), tf.Float)
+		y0       = Square(s.SubScope("y0"), x)
+		variable = VarHandleOp(s, tf.Int32, tf.ScalarShape())
+		init     = AssignVariableOp(s, variable, zero)
+		readDeps = []*tf.Operation{init}
+	)
+	s = s.WithControlDependencies(readDeps...)
+	Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed when control dependencies are set")
+	}
+}
+
+func TestAddGradientsWithDevice(t *testing.T) {
+	var (
+		s  = NewScope()
+		x  = Placeholder(s.SubScope("x"), tf.Float)
+		y0 = Square(s.SubScope("y0"), x)
+	)
+	s = s.WithDevice("/device:GPU:0")
+	Gradients(s, []tf.Output{y0}, []tf.Output{x})
+	if err := s.Err(); err == nil {
+		t.Error("Gradients should have failed when device is set")
+	}
+}
diff --git a/tensorflow/go/op/wrappers.go b/tensorflow/go/op/wrappers.go
index baf43f84f8ee0fb09b28e91bb46a6706cdea7a93..6e49fbb9eae047b4b45758165ad47a5c1923aaf6 100644
--- a/tensorflow/go/op/wrappers.go
+++ b/tensorflow/go/op/wrappers.go
@@ -463,6 +463,14 @@ func QuantizeAndDequantizeV2RangeGiven(value bool) QuantizeAndDequantizeV2Attr {
 	}
 }
 
+// QuantizeAndDequantizeV2RoundMode sets the optional round_mode attribute to value.
+// If not specified, defaults to "HALF_TO_EVEN"
+func QuantizeAndDequantizeV2RoundMode(value string) QuantizeAndDequantizeV2Attr {
+	return func(m optionalAttr) {
+		m["round_mode"] = value
+	}
+}
+
 // Quantizes then dequantizes a tensor.
 //
 // This op simulates the precision loss from the quantized forward pass by:
@@ -3487,30 +3495,6 @@ func BoostedTreesQuantileStreamResourceFlush(scope *Scope, quantile_stream_resou
 	return scope.AddOperation(opspec)
 }
 
-// Add the quantile summaries to each quantile stream resource.
-//
-// An op that adds a list of quantile summaries to a quantile stream resource. Each
-// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
-// for a single feature.
-//
-// Arguments:
-//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
-//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
-//
-// Returns the created operation.
-func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
-		Input: []tf.Input{
-			quantile_stream_resource_handle, tf.OutputList(summaries),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Makes the summary of quantiles for the batch.
 //
 // An op that takes a list of tensors and outputs the quantile summaries for each tensor.
@@ -5547,6 +5531,452 @@ func OrderedMapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.
 	return values
 }
 
+// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
+type MapIncompleteSizeAttr func(optionalAttr)
+
+// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapIncompleteSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of incomplete elements in the underlying container.
+func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapIncompleteSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapSizeAttr is an optional argument to MapSize.
+type MapSizeAttr func(optionalAttr)
+
+// MapSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeCapacity(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapSizeMemoryLimit(value int64) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapSizeContainer(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapSizeSharedName(value string) MapSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MapUnstageAttr is an optional argument to MapUnstage.
+type MapUnstageAttr func(optionalAttr)
+
+// MapUnstageCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageCapacity(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageContainer(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageSharedName(value string) MapUnstageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns the values associated with the key
+//
+// from the underlying container.   If the underlying container
+// does not contain this key, the op will block until it does.
+func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstage",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstage", err)
+		return
+	}
+	return values
+}
+
+// MapPeekAttr is an optional argument to MapPeek.
+type MapPeekAttr func(optionalAttr)
+
+// MapPeekCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekCapacity(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapPeekMemoryLimit(value int64) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapPeekContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapPeekContainer(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapPeekSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapPeekSharedName(value string) MapPeekAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op peeks at the values at the specified key.  If the
+//
+// underlying container does not contain this key
+// this op will block until it does.
+func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapPeek",
+		Input: []tf.Input{
+			key, indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapPeek", err)
+		return
+	}
+	return values
+}
+
+// MapStageAttr is an optional argument to MapStage.
+type MapStageAttr func(optionalAttr)
+
+// MapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageCapacity(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapStageMemoryLimit(value int64) MapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func MapStageContainer(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func MapStageSharedName(value string) MapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a hashtable.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StageClearAttr is an optional argument to StageClear.
+type StageClearAttr func(optionalAttr)
+
+// StageClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearCapacity(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageClearMemoryLimit(value int64) StageClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageClearContainer(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageClearSharedName(value string) StageClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// StageSizeAttr is an optional argument to StageSize.
+type StageSizeAttr func(optionalAttr)
+
+// StageSizeCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeCapacity(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func StageSizeMemoryLimit(value int64) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// StageSizeContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func StageSizeContainer(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// StageSizeSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func StageSizeSharedName(value string) StageSizeAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op returns the number of elements in the underlying container.
+func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StageSize",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Compute the regularized incomplete beta integral \\(I_x(a, b)\\).
 //
 // The regularized incomplete beta integral is defined as:
@@ -5612,34 +6042,6 @@ func Atan2(scope *Scope, y tf.Output, x tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset that passes a sliding window over `input_dataset`.
-//
-// Arguments:
-//
-//	window_size: A scalar representing the number of elements in the
-// sliding window.
-//	window_shift: A scalar representing the steps moving the sliding window
-// forward in one iteration. It must be positive.
-//	window_stride: A scalar representing the stride of the input elements of the sliding window.
-// It must be positive.
-//
-//
-func SlideDataset(scope *Scope, input_dataset tf.Output, window_size tf.Output, window_shift tf.Output, window_stride tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SlideDataset",
-		Input: []tf.Input{
-			input_dataset, window_size, window_shift, window_stride,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // EditDistanceAttr is an optional argument to EditDistance.
 type EditDistanceAttr func(optionalAttr)
 
@@ -5811,37 +6213,6 @@ func DepthwiseConv2dNativeBackpropInput(scope *Scope, input_sizes tf.Output, fil
 	return op.Output(0)
 }
 
-// ApproximateEqualAttr is an optional argument to ApproximateEqual.
-type ApproximateEqualAttr func(optionalAttr)
-
-// ApproximateEqualTolerance sets the optional tolerance attribute to value.
-// If not specified, defaults to 1e-05
-func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
-	return func(m optionalAttr) {
-		m["tolerance"] = value
-	}
-}
-
-// Returns the truth value of abs(x-y) < tolerance element-wise.
-func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ApproximateEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns x / y element-wise.
 //
 // *NOTE*: `Div` supports broadcasting. More about broadcasting
@@ -6279,6 +6650,71 @@ func Asin(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// SparseToDenseAttr is an optional argument to SparseToDense.
+type SparseToDenseAttr func(optionalAttr)
+
+// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
+//
+// value: If true, indices are checked to make sure they are sorted in
+// lexicographic order and that there are no repeats.
+// If not specified, defaults to true
+func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
+	return func(m optionalAttr) {
+		m["validate_indices"] = value
+	}
+}
+
+// Converts a sparse representation into a dense tensor.
+//
+// Builds an array `dense` with shape `output_shape` such that
+//
+// ```
+// # If sparse_indices is scalar
+// dense[i] = (i == sparse_indices ? sparse_values : default_value)
+//
+// # If sparse_indices is a vector, then for each i
+// dense[sparse_indices[i]] = sparse_values[i]
+//
+// # If sparse_indices is an n by d matrix, then for each i in [0, n)
+// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
+// ```
+//
+// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
+// scalar, all sparse indices are set to this single value.
+//
+// Indices should be sorted in lexicographic order, and indices must not
+// contain any repeats. If `validate_indices` is true, these properties
+// are checked during execution.
+//
+// Arguments:
+//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
+// index where `sparse_values[i]` will be placed.
+//	output_shape: 1-D.  Shape of the dense output tensor.
+//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
+// or a scalar value to be used for all sparse indices.
+//	default_value: Scalar value to set for indices not specified in
+// `sparse_indices`.
+//
+// Returns Dense output tensor of shape `output_shape`.
+func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseToDense",
+		Input: []tf.Input{
+			sparse_indices, output_shape, sparse_values, default_value,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Computes the sum along sparse segments of a tensor.
 //
 // Like `SparseSegmentSum`, but allows missing ids in `segment_ids`. If an id is
@@ -6370,21 +6806,6 @@ func Sin(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// Computes the complementary error function of `x` element-wise.
-func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Erfc",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes Psi, the derivative of Lgamma (the log of the absolute value of
 //
 // `Gamma(x)`), element-wise.
@@ -6829,61 +7250,6 @@ func Reciprocal(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
-// ParseExampleDatasetAttr is an optional argument to ParseExampleDataset.
-type ParseExampleDatasetAttr func(optionalAttr)
-
-// ParseExampleDatasetSloppy sets the optional sloppy attribute to value.
-// If not specified, defaults to false
-func ParseExampleDatasetSloppy(value bool) ParseExampleDatasetAttr {
-	return func(m optionalAttr) {
-		m["sloppy"] = value
-	}
-}
-
-// Transforms `input_dataset` containing `Example` protos as vectors of DT_STRING into a dataset of `Tensor` or `SparseTensor` objects representing the parsed features.
-//
-// Arguments:
-//
-//
-//	dense_defaults: A dict mapping string keys to `Tensor`s.
-// The keys of the dict must match the dense_keys of the feature.
-//	sparse_keys: A list of string keys in the examples features.
-// The results for these keys will be returned as `SparseTensor` objects.
-//	dense_keys: A list of Ndense string Tensors (scalars).
-// The keys expected in the Examples features associated with dense values.
-//	sparse_types: A list of `DTypes` of the same length as `sparse_keys`.
-// Only `tf.float32` (`FloatList`), `tf.int64` (`Int64List`),
-// and `tf.string` (`BytesList`) are supported.
-//	dense_shapes: List of tuples with the same length as `dense_keys`.
-// The shape of the data for each dense feature referenced by `dense_keys`.
-// Required for any input tensors identified by `dense_keys`.  Must be
-// either fully defined, or may contain an unknown first dimension.
-// An unknown first dimension means the feature is treated as having
-// a variable number of blocks, and the output shape along this dimension
-// is considered unknown at graph build time.  Padding is applied for
-// minibatch elements smaller than the maximum number of blocks for the
-// given feature along this dimension.
-//	output_types: The type list for the return values.
-//	output_shapes: The list of shapes being produced.
-func ParseExampleDataset(scope *Scope, input_dataset tf.Output, num_parallel_calls tf.Output, dense_defaults []tf.Output, sparse_keys []string, dense_keys []string, sparse_types []tf.DataType, dense_shapes []tf.Shape, output_types []tf.DataType, output_shapes []tf.Shape, optional ...ParseExampleDatasetAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"sparse_keys": sparse_keys, "dense_keys": dense_keys, "sparse_types": sparse_types, "dense_shapes": dense_shapes, "output_types": output_types, "output_shapes": output_shapes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ParseExampleDataset",
-		Input: []tf.Input{
-			input_dataset, num_parallel_calls, tf.OutputList(dense_defaults),
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns a batched matrix tensor with new batched diagonal values.
 //
 // Given `input` and `diagonal`, this operation returns a tensor with the
@@ -7646,44 +8012,6 @@ func DecodeBmp(scope *Scope, contents tf.Output, optional ...DecodeBmpAttr) (ima
 	return op.Output(0)
 }
 
-// Computes softsign gradients for a softsign operation.
-//
-// Arguments:
-//	gradients: The backpropagated gradients to the corresponding softsign operation.
-//	features: The features passed as input to the corresponding softsign operation.
-//
-// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
-func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SoftsignGrad",
-		Input: []tf.Input{
-			gradients, features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Provides the time since epoch in seconds.
-//
-// Returns the timestamp as a `float64` for seconds since the Unix epoch.
-//
-// Note: the timestamp is computed when the op is executed, not when it is added
-// to the graph.
-func Timestamp(scope *Scope) (ts tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Timestamp",
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // BatchMatMulAttr is an optional argument to BatchMatMul.
 type BatchMatMulAttr func(optionalAttr)
 
@@ -7956,28 +8284,6 @@ func LeakyRelu(scope *Scope, features tf.Output, optional ...LeakyReluAttr) (act
 	return op.Output(0)
 }
 
-// Writes the given dataset to the given file using the TFRecord format.
-//
-// Arguments:
-//	input_dataset: A variant tensor representing the dataset to write.
-//	filename: A scalar string tensor representing the filename to use.
-//	compression_type: A scalar string tensor containing either (i) the empty string (no
-// compression), (ii) "ZLIB", or (iii) "GZIP".
-//
-// Returns the created operation.
-func DatasetToTFRecord(scope *Scope, input_dataset tf.Output, filename tf.Output, compression_type tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DatasetToTFRecord",
-		Input: []tf.Input{
-			input_dataset, filename, compression_type,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Computes rectified linear 6: `min(max(features, 0), 6)`.
 func Relu6(scope *Scope, features tf.Output) (activations tf.Output) {
 	if scope.Err() != nil {
@@ -9441,6 +9747,178 @@ func BesselI1e(scope *Scope, x tf.Output) (y tf.Output) {
 	return op.Output(0)
 }
 
+// MapClearAttr is an optional argument to MapClear.
+type MapClearAttr func(optionalAttr)
+
+// MapClearCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearCapacity(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapClearMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapClearMemoryLimit(value int64) MapClearAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapClearContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapClearContainer(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapClearSharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapClearSharedName(value string) MapClearAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes all elements in the underlying container.
+//
+// Returns the created operation.
+func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapClear",
+
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// DecodeCSVAttr is an optional argument to DecodeCSV.
+type DecodeCSVAttr func(optionalAttr)
+
+// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
+//
+// value: char delimiter to separate fields in a record.
+// If not specified, defaults to ","
+func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["field_delim"] = value
+	}
+}
+
+// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+//
+// value: If false, treats double quotation marks as regular
+// characters inside of the string fields (ignoring RFC 4180, Section 2,
+// Bullet 5).
+// If not specified, defaults to true
+func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["use_quote_delim"] = value
+	}
+}
+
+// DecodeCSVNaValue sets the optional na_value attribute to value.
+//
+// value: Additional string to recognize as NA/NaN.
+// If not specified, defaults to ""
+func DecodeCSVNaValue(value string) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["na_value"] = value
+	}
+}
+
+// DecodeCSVSelectCols sets the optional select_cols attribute to value.
+// If not specified, defaults to <>
+func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
+	return func(m optionalAttr) {
+		m["select_cols"] = value
+	}
+}
+
+// Convert CSV records to tensors. Each column maps to one tensor.
+//
+// RFC 4180 format is expected for the CSV records.
+// (https://tools.ietf.org/html/rfc4180)
+// Note that we allow leading and trailing spaces with int or float field.
+//
+// Arguments:
+//	records: Each string is a record/row in the csv and all records should have
+// the same format.
+//	record_defaults: One tensor per column of the input record, with either a
+// scalar default value for that column or an empty vector if the column is
+// required.
+//
+// Returns Each tensor will have the same shape as records.
+func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeCSV",
+		Input: []tf.Input{
+			records, tf.OutputList(record_defaults),
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
+		scope.UpdateErr("DecodeCSV", err)
+		return
+	}
+	return output
+}
+
+// Convert JSON-encoded Example records to binary protocol buffer strings.
+//
+// This op translates a tensor containing Example records, encoded using
+// the [standard JSON
+// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
+// into a tensor containing the same records encoded as binary protocol
+// buffers. The resulting tensor can then be fed to any of the other
+// Example-parsing ops.
+//
+// Arguments:
+//	json_examples: Each string is a JSON object serialized according to the JSON
+// mapping of the Example proto.
+//
+// Returns Each string is a binary Example protocol buffer corresponding
+// to the respective element of `json_examples`.
+func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeJSONExample",
+		Input: []tf.Input{
+			json_examples,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Transforms a Tensor into a serialized TensorProto proto.
 //
 // Arguments:
@@ -10499,6 +10977,31 @@ func ParseExample(scope *Scope, serialized tf.Output, names tf.Output, sparse_ke
 	return sparse_indices, sparse_values, sparse_shapes, dense_values
 }
 
+// Compute the pairwise cross product.
+//
+// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
+// or any shape where the innermost dimension is 3. In the latter case, each pair
+// of corresponding 3-element vectors is cross-multiplied independently.
+//
+// Arguments:
+//	a: A tensor containing 3-element vectors.
+//	b: Another tensor, of same type and shape as `a`.
+//
+// Returns Pairwise cross product of the vectors in `a` and `b`.
+func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Cross",
+		Input: []tf.Input{
+			a, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CudnnRNNAttr is an optional argument to CudnnRNN.
 type CudnnRNNAttr func(optionalAttr)
 
@@ -11046,24 +11549,6 @@ func RandomShuffle(scope *Scope, value tf.Output, optional ...RandomShuffleAttr)
 	return op.Output(0)
 }
 
-// Returns the truth value of (x > y) element-wise.
-//
-// *NOTE*: `Greater` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Greater",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // ResourceSparseApplyRMSPropAttr is an optional argument to ResourceSparseApplyRMSProp.
 type ResourceSparseApplyRMSPropAttr func(optionalAttr)
 
@@ -11417,66 +11902,6 @@ func RandomStandardNormal(scope *Scope, shape tf.Output, dtype tf.DataType, opti
 	return op.Output(0)
 }
 
-// RandomUniformIntAttr is an optional argument to RandomUniformInt.
-type RandomUniformIntAttr func(optionalAttr)
-
-// RandomUniformIntSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random integers from a uniform distribution.
-//
-// The generated values are uniform integers in the range `[minval, maxval)`.
-// The lower bound `minval` is included in the range, while the upper bound
-// `maxval` is excluded.
-//
-// The random integers are slightly biased unless `maxval - minval` is an exact
-// power of two.  The bias is small for values of `maxval - minval` significantly
-// smaller than the range of the output (either `2^32` or `2^64`).
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	minval: 0-D.  Inclusive lower bound on the generated integers.
-//	maxval: 0-D.  Exclusive upper bound on the generated integers.
-//
-// Returns A tensor of the specified shape filled with uniform random integers.
-func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomUniformInt",
-		Input: []tf.Input{
-			shape, minval, maxval,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // FusedResizeAndPadConv2DAttr is an optional argument to FusedResizeAndPadConv2D.
 type FusedResizeAndPadConv2DAttr func(optionalAttr)
 
@@ -11827,6 +12252,134 @@ func ResourceScatterUpdate(scope *Scope, resource tf.Output, indices tf.Output,
 	return scope.AddOperation(opspec)
 }
 
+// MaxPoolAttr is an optional argument to MaxPool.
+type MaxPoolAttr func(optionalAttr)
+
+// MaxPoolDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolDataFormat(value string) MaxPoolAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Performs max pooling on the input.
+//
+// Arguments:
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
+//	padding: The type of padding algorithm to use.
+//
+// Returns The max pooled output tensor.
+func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool",
+		Input: []tf.Input{
+			input,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Multiplies sparse updates into the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] *= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] *= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions multiply.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterMul",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Subtracts sparse updates from the variable referenced by `resource`.
+//
+// This operation computes
+//
+//     # Scalar indices
+//     ref[indices, ...] -= updates[...]
+//
+//     # Vector indices (for each i)
+//     ref[indices[i], ...] -= updates[i, ...]
+//
+//     # High rank indices (for each i, ..., j)
+//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
+//
+// Duplicate entries are handled correctly: if multiple `indices` reference
+// the same location, their contributions add.
+//
+// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+//
+// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
+// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
+// </div>
+//
+// Arguments:
+//	resource: Should be from a `Variable` node.
+//	indices: A tensor of indices into the first dimension of `ref`.
+//	updates: A tensor of updated values to add to `ref`.
+//
+// Returns the created operation.
+func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceScatterSub",
+		Input: []tf.Input{
+			resource, indices, updates,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // ResourceSparseApplyFtrlV2Attr is an optional argument to ResourceSparseApplyFtrlV2.
 type ResourceSparseApplyFtrlV2Attr func(optionalAttr)
 
@@ -12431,231 +12984,84 @@ func Bucketize(scope *Scope, input tf.Output, boundaries []float32) (output tf.O
 	return op.Output(0)
 }
 
-// EncodeJpegAttr is an optional argument to EncodeJpeg.
-type EncodeJpegAttr func(optionalAttr)
-
-// EncodeJpegFormat sets the optional format attribute to value.
-//
-// value: Per pixel image format.
-// If not specified, defaults to ""
-func EncodeJpegFormat(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["format"] = value
-	}
-}
-
-// EncodeJpegQuality sets the optional quality attribute to value.
-//
-// value: Quality of the compression from 0 to 100 (higher is better and slower).
-// If not specified, defaults to 95
-func EncodeJpegQuality(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["quality"] = value
-	}
-}
-
-// EncodeJpegProgressive sets the optional progressive attribute to value.
-//
-// value: If True, create a JPEG that loads progressively (coarse to fine).
-// If not specified, defaults to false
-func EncodeJpegProgressive(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["progressive"] = value
-	}
-}
-
-// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
-//
-// value: If True, spend CPU/RAM to reduce size with no quality change.
-// If not specified, defaults to false
-func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["optimize_size"] = value
-	}
-}
-
-// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
-//
-// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
-// If not specified, defaults to true
-func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["chroma_downsampling"] = value
-	}
-}
-
-// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
-//
-// value: Unit used to specify `x_density` and `y_density`:
-// pixels per inch (`'in'`) or centimeter (`'cm'`).
-// If not specified, defaults to "in"
-func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["density_unit"] = value
-	}
-}
-
-// EncodeJpegXDensity sets the optional x_density attribute to value.
-//
-// value: Horizontal pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegXDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["x_density"] = value
-	}
-}
-
-// EncodeJpegYDensity sets the optional y_density attribute to value.
-//
-// value: Vertical pixels per density unit.
-// If not specified, defaults to 300
-func EncodeJpegYDensity(value int64) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["y_density"] = value
-	}
-}
-
-// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
-//
-// value: If not empty, embed this XMP metadata in the image header.
-// If not specified, defaults to ""
-func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
-	return func(m optionalAttr) {
-		m["xmp_metadata"] = value
-	}
-}
-
-// JPEG-encode an image.
-//
-// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
-//
-// The attr `format` can be used to override the color format of the encoded
-// output.  Values can be:
-//
-// *   `''`: Use a default format based on the number of channels in the image.
-// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
-//     of `image` must be 1.
-// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
-//     of `image` must be 3.
+// Encode audio data using the WAV file format.
 //
-// If `format` is not specified or is the empty string, a default format is picked
-// in function of the number of channels in `image`:
+// This operation will generate a string suitable to be saved out to create a .wav
+// audio file. It will be encoded in the 16-bit PCM format. It takes in float
+// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
+// that range.
 //
-// *   1: Output a grayscale image.
-// *   3: Output an RGB image.
+// `audio` is a 2-D float Tensor of shape `[length, channels]`.
+// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
 //
 // Arguments:
-//	image: 3-D with shape `[height, width, channels]`.
+//	audio: 2-D with shape `[length, channels]`.
+//	sample_rate: Scalar containing the sample frequency.
 //
-// Returns 0-D. JPEG-encoded image.
-func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+// Returns 0-D. WAV-encoded file contents.
+func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "EncodeJpeg",
+		Type: "EncodeWav",
 		Input: []tf.Input{
-			image,
+			audio, sample_rate,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// MultinomialAttr is an optional argument to Multinomial.
-type MultinomialAttr func(optionalAttr)
-
-// MultinomialSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 is set to be non-zero, the internal random number
-// generator is seeded by the given seed.  Otherwise, a random seed is used.
-// If not specified, defaults to 0
-func MultinomialSeed(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// MultinomialSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func MultinomialSeed2(value int64) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// MultinomialOutputDtype sets the optional output_dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
-	return func(m optionalAttr) {
-		m["output_dtype"] = value
-	}
-}
-
-// Draws samples from a multinomial distribution.
-//
-// Arguments:
-//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
-// represents the unnormalized log probabilities for all classes.
-//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
-//
-// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
-// contains the drawn class labels with range `[0, num_classes)`.
-func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+// Computes atan of x element-wise.
+func Atan(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "Multinomial",
+		Type: "Atan",
 		Input: []tf.Input{
-			logits, num_samples,
+			x,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
-type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
+type ResourceApplyAdaMaxAttr func(optionalAttr)
 
-// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If `True`, updating of the var, m, and v tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
 // If not specified, defaults to false
-func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+// Update '*var' according to the AdaMax algorithm.
+//
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// v_t <- max(beta2 * v_{t-1}, abs(g))
+// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
+//	m: Should be from a Variable().
+//	v: Should be from a Variable().
+//	beta1_power: Must be a scalar.
+//	lr: Scaling factor. Must be a scalar.
+//	beta1: Momentum factor. Must be a scalar.
+//	beta2: Momentum factor. Must be a scalar.
+//	epsilon: Ridge term. Must be a scalar.
 //	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
 //
 // Returns the created operation.
-func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12664,52 +13070,39 @@ func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumul
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyAdagradDA",
+		Type: "ResourceApplyAdaMax",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
-type ResourceSparseApplyFtrlAttr func(optionalAttr)
+// AssertAttr is an optional argument to Assert.
+type AssertAttr func(optionalAttr)
 
-// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+// AssertSummarize sets the optional summarize attribute to value.
 //
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+// value: Print this many entries of each tensor.
+// If not specified, defaults to 3
+func AssertSummarize(value int64) AssertAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["summarize"] = value
 	}
 }
 
-// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+// Asserts that the given condition is true.
 //
-// That is for rows we have grad for, we update var, accum and linear as follows:
-// accum_new = accum + grad * grad
-// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
+// If `condition` evaluates to false, print the list of tensors in `data`.
+// `summarize` determines how many entries of the tensors to print.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	lr_power: Scaling factor. Must be a scalar.
+//	condition: The condition to evaluate.
+//	data: The tensors to print out when condition is false.
 //
 // Returns the created operation.
-func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -12718,397 +13111,9 @@ func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, line
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyFtrl",
+		Type: "Assert",
 		Input: []tf.Input{
-			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Returns which elements of x are Inf.
-//
-// @compatibility(numpy)
-// Equivalent to np.isinf
-// @end_compatibility
-func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IsInf",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// TruncatedNormalAttr is an optional argument to TruncatedNormal.
-type TruncatedNormalAttr func(optionalAttr)
-
-// TruncatedNormalSeed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// Outputs random values from a truncated normal distribution.
-//
-// The generated values follow a normal distribution with mean 0 and standard
-// deviation 1, except that values whose magnitude is more than 2 standard
-// deviations from the mean are dropped and re-picked.
-//
-// Arguments:
-//	shape: The shape of the output tensor.
-//	dtype: The type of the output.
-//
-// Returns A tensor of the specified shape filled with random truncated normal
-// values.
-func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtype": dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "TruncatedNormal",
-		Input: []tf.Input{
-			shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SkipgramAttr is an optional argument to Skipgram.
-type SkipgramAttr func(optionalAttr)
-
-// SkipgramWindowSize sets the optional window_size attribute to value.
-//
-// value: The number of words to predict to the left and right of the target.
-// If not specified, defaults to 5
-func SkipgramWindowSize(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["window_size"] = value
-	}
-}
-
-// SkipgramMinCount sets the optional min_count attribute to value.
-//
-// value: The minimum number of word occurrences for it to be included in the
-// vocabulary.
-// If not specified, defaults to 5
-func SkipgramMinCount(value int64) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["min_count"] = value
-	}
-}
-
-// SkipgramSubsample sets the optional subsample attribute to value.
-//
-// value: Threshold for word occurrence. Words that appear with higher
-// frequency will be randomly down-sampled. Set to 0 to disable.
-// If not specified, defaults to 0.001
-func SkipgramSubsample(value float32) SkipgramAttr {
-	return func(m optionalAttr) {
-		m["subsample"] = value
-	}
-}
-
-// Parses a text file and creates a batch of examples.
-//
-// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
-//
-// Arguments:
-//	filename: The corpus's text file name.
-//	batch_size: The size of produced batch.
-//
-// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
-func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Skipgram",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
-}
-
-// StringToNumberAttr is an optional argument to StringToNumber.
-type StringToNumberAttr func(optionalAttr)
-
-// StringToNumberOutType sets the optional out_type attribute to value.
-//
-// value: The numeric type to interpret each string in `string_tensor` as.
-// If not specified, defaults to DT_FLOAT
-func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Converts each string in the input Tensor to the specified numeric type.
-//
-// (Note that int32 overflow results in an error while float overflow
-// results in a rounded value.)
-//
-// Returns A Tensor of the same shape as the input `string_tensor`.
-func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StringToNumber",
-		Input: []tf.Input{
-			string_tensor,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
-type ResourceApplyFtrlV2Attr func(optionalAttr)
-
-// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and accum tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the Ftrl-proximal scheme.
-//
-// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
-// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
-// linear += grad_with_shrinkage +
-//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
-// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
-// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
-// accum = accum_new
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	linear: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
-//	l1: L1 regulariation. Must be a scalar.
-//	l2: L2 shrinkage regulariation. Must be a scalar.
-//
-//	lr_power: Scaling factor. Must be a scalar.
-//
-// Returns the created operation.
-func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyFtrlV2",
-		Input: []tf.Input{
-			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
-//
-// This Op does not require `a_indices` be sorted in standard lexicographic order.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
-//	b: `ndims`-D Tensor.  With shape `a_shape`.
-func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Encode audio data using the WAV file format.
-//
-// This operation will generate a string suitable to be saved out to create a .wav
-// audio file. It will be encoded in the 16-bit PCM format. It takes in float
-// values in the range -1.0f to 1.0f, and any outside that value will be clamped to
-// that range.
-//
-// `audio` is a 2-D float Tensor of shape `[length, channels]`.
-// `sample_rate` is a scalar Tensor holding the rate to use (e.g. 44100).
-//
-// Arguments:
-//	audio: 2-D with shape `[length, channels]`.
-//	sample_rate: Scalar containing the sample frequency.
-//
-// Returns 0-D. WAV-encoded file contents.
-func EncodeWav(scope *Scope, audio tf.Output, sample_rate tf.Output) (contents tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "EncodeWav",
-		Input: []tf.Input{
-			audio, sample_rate,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Computes atan of x element-wise.
-func Atan(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Atan",
-		Input: []tf.Input{
-			x,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAdaMaxAttr is an optional argument to ResourceApplyAdaMax.
-type ResourceApplyAdaMaxAttr func(optionalAttr)
-
-// ResourceApplyAdaMaxUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, m, and v tensors will be protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAdaMaxUseLocking(value bool) ResourceApplyAdaMaxAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AdaMax algorithm.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// v_t <- max(beta2 * v_{t-1}, abs(g))
-// variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	v: Should be from a Variable().
-//	beta1_power: Must be a scalar.
-//	lr: Scaling factor. Must be a scalar.
-//	beta1: Momentum factor. Must be a scalar.
-//	beta2: Momentum factor. Must be a scalar.
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAdaMax(scope *Scope, var_ tf.Output, m tf.Output, v tf.Output, beta1_power tf.Output, lr tf.Output, beta1 tf.Output, beta2 tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyAdaMaxAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdaMax",
-		Input: []tf.Input{
-			var_, m, v, beta1_power, lr, beta1, beta2, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// AssertAttr is an optional argument to Assert.
-type AssertAttr func(optionalAttr)
-
-// AssertSummarize sets the optional summarize attribute to value.
-//
-// value: Print this many entries of each tensor.
-// If not specified, defaults to 3
-func AssertSummarize(value int64) AssertAttr {
-	return func(m optionalAttr) {
-		m["summarize"] = value
-	}
-}
-
-// Asserts that the given condition is true.
-//
-// If `condition` evaluates to false, print the list of tensors in `data`.
-// `summarize` determines how many entries of the tensors to print.
-//
-// Arguments:
-//	condition: The condition to evaluate.
-//	data: The tensors to print out when condition is false.
-//
-// Returns the created operation.
-func Assert(scope *Scope, condition tf.Output, data []tf.Output, optional ...AssertAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Assert",
-		Input: []tf.Input{
-			condition, tf.OutputList(data),
+			condition, tf.OutputList(data),
 		},
 		Attrs: attrs,
 	}
@@ -13530,6 +13535,39 @@ func StatelessRandomNormal(scope *Scope, shape tf.Output, seed tf.Output, option
 	return op.Output(0)
 }
 
+// Computes the complementary error function of `x` element-wise.
+func Erfc(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Erfc",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the number of tensors in the input tensor list.
+//
+// input_handle: the input list
+// length: the number of tensors in the list
+func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TensorListLength",
+		Input: []tf.Input{
+			input_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Determine the script codes of a given tensor of Unicode integer code points.
 //
 // This operation converts Unicode code points to script codes corresponding to
@@ -13747,96 +13785,170 @@ func ResourceApplyMomentum(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.
 	return scope.AddOperation(opspec)
 }
 
-// Exits the current frame to its parent frame.
-//
-// Exit makes its input `data` available to the parent frame.
-//
-// Arguments:
-//	data: The tensor to be made available to the parent frame.
+// SubstrAttr is an optional argument to Substr.
+type SubstrAttr func(optionalAttr)
+
+// SubstrUnit sets the optional unit attribute to value.
 //
-// Returns The same tensor as `data`.
-func Exit(scope *Scope, data tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Exit",
-		Input: []tf.Input{
-			data,
-		},
+// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
+// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
+// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
+// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
+// UTF-8.
+// If not specified, defaults to "BYTE"
+func SubstrUnit(value string) SubstrAttr {
+	return func(m optionalAttr) {
+		m["unit"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// Produce a string tensor that encodes the state of a Reader.
+// Return substrings from `Tensor` of strings.
 //
-// Not all Readers support being serialized, so this can produce an
-// Unimplemented error.
+// For each string in the input `Tensor`, creates a substring starting at index
+// `pos` with a total length of `len`.
 //
-// Arguments:
-//	reader_handle: Handle to a Reader.
-func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderSerializeStateV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MaxPoolAttr is an optional argument to MaxPool.
-type MaxPoolAttr func(optionalAttr)
-
-// MaxPoolDataFormat sets the optional data_format attribute to value.
+// If `len` defines a substring that would extend beyond the length of the input
+// string, then as many characters as possible are used.
 //
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolDataFormat(value string) MaxPoolAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// A negative `pos` indicates distance within the string backwards from the end.
 //
-// Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
+// If `pos` specifies an index which is out of range for any of the input strings,
+// then an `InvalidArgumentError` is thrown.
 //
-// Returns The max pooled output tensor.
-func MaxPool(scope *Scope, input tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPoolAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MaxPool",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Concatenates quantized tensors along one dimension.
+// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
+// Op creation.
+//
+// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
+// broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+//
+// ---
+//
+// Examples
+//
+// Using scalar `pos` and `len`:
+//
+// ```python
+// input = [b'Hello', b'World']
+// position = 1
+// length = 3
+//
+// output = [b'ell', b'orl']
+// ```
+//
+// Using `pos` and `len` with same shape as `input`:
+//
+// ```python
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen']]
+// position = [[1, 2, 3],
+//             [1, 2, 3],
+//             [1, 2, 3]]
+// length =   [[2, 3, 4],
+//             [4, 3, 2],
+//             [5, 5, 5]]
+//
+// output = [[b'en', b'eve', b'lve'],
+//           [b'hirt', b'urt', b'te'],
+//           [b'ixtee', b'vente', b'hteen']]
+// ```
+//
+// Broadcasting `pos` and `len` onto `input`:
+//
+// ```
+// input = [[b'ten', b'eleven', b'twelve'],
+//          [b'thirteen', b'fourteen', b'fifteen'],
+//          [b'sixteen', b'seventeen', b'eighteen'],
+//          [b'nineteen', b'twenty', b'twentyone']]
+// position = [1, 2, 3]
+// length =   [1, 2, 3]
+//
+// output = [[b'e', b'ev', b'lve'],
+//           [b'h', b'ur', b'tee'],
+//           [b'i', b've', b'hte'],
+//           [b'i', b'en', b'nty']]
+// ```
+//
+// Broadcasting `input` onto `pos` and `len`:
+//
+// ```
+// input = b'thirteen'
+// position = [1, 5, 7]
+// length =   [3, 2, 1]
+//
+// output = [b'hir', b'ee', b'n']
+// ```
+//
+// Arguments:
+//	input: Tensor of strings
+//	pos: Scalar defining the position of first character in each substring
+//	len: Scalar defining the number of characters to include in each substring
+//
+// Returns Tensor of substrings
+func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Substr",
+		Input: []tf.Input{
+			input, pos, len,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Exits the current frame to its parent frame.
+//
+// Exit makes its input `data` available to the parent frame.
+//
+// Arguments:
+//	data: The tensor to be made available to the parent frame.
+//
+// Returns The same tensor as `data`.
+func Exit(scope *Scope, data tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Exit",
+		Input: []tf.Input{
+			data,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Produce a string tensor that encodes the state of a Reader.
+//
+// Not all Readers support being serialized, so this can produce an
+// Unimplemented error.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+func ReaderSerializeStateV2(scope *Scope, reader_handle tf.Output) (state tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderSerializeStateV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Concatenates quantized tensors along one dimension.
 //
 // Arguments:
 //	concat_dim: 0-D.  The dimension along which to concatenate.  Must be in the
@@ -13973,66 +14085,6 @@ func QuantizedReshape(scope *Scope, tensor tf.Output, shape tf.Output, input_min
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Returns the truth value of (x != y) element-wise.
-//
-// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NotEqual",
-		Input: []tf.Input{
-			x, y,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Inverse 3D real-valued fast Fourier transform.
-//
-// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
-// signal over the inner-most 3 dimensions of `input`.
-//
-// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
-// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
-// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
-// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
-// to compute `input` is odd, it should be provided since it cannot be inferred
-// properly.
-//
-// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
-// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A complex64 tensor.
-//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
-//
-// Returns A float32 tensor of the same rank as `input`. The inner-most 3
-//   dimensions of `input` are replaced with the `fft_length` samples of their
-//   inverse 3D real Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.irfftn with 3 dimensions.
-// @end_compatibility
-func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "IRFFT3D",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // StringSplitAttr is an optional argument to StringSplit.
 type StringSplitAttr func(optionalAttr)
 
@@ -14187,284 +14239,6 @@ func Conj(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
-type AvgPoolGradAttr func(optionalAttr)
-
-// AvgPoolGradDataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Computes gradients of the average pooling function.
-//
-// Arguments:
-//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
-//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
-// the output of `avg_pool`.
-//	ksize: The size of the sliding window for each dimension of the input.
-//	strides: The stride of the sliding window for each dimension of the input.
-//	padding: The type of padding algorithm to use.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
-func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "AvgPoolGrad",
-		Input: []tf.Input{
-			orig_input_shape, grad,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Greedily selects a subset of bounding boxes in descending order of score,
-//
-// pruning away boxes that have high overlaps
-// with previously selected boxes.  Bounding boxes with score less than
-// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
-// which allows for defining a custom overlap criterium (eg. intersection over union,
-// intersection over area, etc.).
-//
-// The output of this operation is a set of integers indexing into the input
-// collection of bounding boxes representing the selected boxes.  The bounding
-// box coordinates corresponding to the selected indices can then be obtained
-// using the `tf.gather operation`.  For example:
-//
-//   selected_indices = tf.image.non_max_suppression_with_overlaps(
-//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
-//   selected_boxes = tf.gather(boxes, selected_indices)
-//
-// Arguments:
-//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
-// the n-by-n box overlap values.
-//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
-// score corresponding to each box (each row of boxes).
-//	max_output_size: A scalar integer tensor representing the maximum number of
-// boxes to be selected by non max suppression.
-//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
-// boxes overlap too.
-//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
-// boxes based on score.
-//
-// Returns A 1-D integer tensor of shape `[M]` representing the selected
-// indices from the boxes tensor, where `M <= max_output_size`.
-func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "NonMaxSuppressionWithOverlaps",
-		Input: []tf.Input{
-			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// StageClearAttr is an optional argument to StageClear.
-type StageClearAttr func(optionalAttr)
-
-// StageClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearCapacity(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func StageClearMemoryLimit(value int64) StageClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageClearContainer(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StageClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageClearSharedName(value string) StageClearAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes all elements in the underlying container.
-//
-// Returns the created operation.
-func StageClear(scope *Scope, dtypes []tf.DataType, optional ...StageClearAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StageClear",
-
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
-type QuantizedRelu6Attr func(optionalAttr)
-
-// QuantizedRelu6OutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu6",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
-type FixedLengthRecordReaderV2Attr func(optionalAttr)
-
-// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
-//
-// value: Number of bytes in the header, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["header_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
-//
-// value: Number of bytes in the footer, defaults to 0.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["footer_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
-//
-// value: Number of bytes to hop before each read. Default of 0 means using
-// record_bytes.
-// If not specified, defaults to 0
-func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["hop_bytes"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this reader is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this reader is named in the given bucket
-// with this shared_name. Otherwise, the node name is used instead.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
-//
-// value: The type of encoding for the file. Currently ZLIB and GZIP
-// are supported. Defaults to none.
-// If not specified, defaults to ""
-func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
-	return func(m optionalAttr) {
-		m["encoding"] = value
-	}
-}
-
-// A Reader that outputs fixed-length records from a file.
-//
-// Arguments:
-//	record_bytes: Number of bytes in the record.
-//
-// Returns The handle to reference the Reader.
-func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"record_bytes": record_bytes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "FixedLengthRecordReaderV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes numerical negative value element-wise.
 //
 // I.e., \\(y = -x\\).
@@ -14946,112 +14720,43 @@ func ResourceApplyAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.O
 	return scope.AddOperation(opspec)
 }
 
-// Return the shape of s0 op s1 with broadcast.
+// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
 //
-// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
-// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
-func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+// Arguments:
+//	tree_ensemble_handle: Handle to the tree ensemble.
+//
+// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
+// layer.
+func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "BroadcastArgs",
+		Type: "BoostedTreesGetEnsembleStates",
 		Input: []tf.Input{
-			s0, s1,
+			tree_ensemble_handle,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
 }
 
-// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
-type DataFormatDimMapAttr func(optionalAttr)
+// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
+type ResourceApplyPowerSignAttr func(optionalAttr)
 
-// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
 //
-// value: source data format.
-// If not specified, defaults to "NHWC"
-func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
 	return func(m optionalAttr) {
-		m["src_format"] = value
+		m["use_locking"] = value
 	}
 }
 
-// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
-//
-// value: destination data format.
-// If not specified, defaults to "NCHW"
-func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
-	return func(m optionalAttr) {
-		m["dst_format"] = value
-	}
-}
-
-// Returns the dimension index in the destination data format given the one in
-//
-// the source data format.
-//
-// Arguments:
-//	x: A Tensor with each element as a dimension index in source data format.
-// Must be in the range [-4, 4).
-//
-// Returns A Tensor with each element as a dimension index in destination data format.
-func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DataFormatDimMap",
-		Input: []tf.Input{
-			x,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Retrieves the tree ensemble resource stamp token, number of trees and growing statistics.
-//
-// Arguments:
-//	tree_ensemble_handle: Handle to the tree ensemble.
-//
-// Returns Stamp token of the tree ensemble resource.The number of trees in the tree ensemble resource.The number of trees that were finished successfully.The number of layers we attempted to build (but not necessarily succeeded).Rank size 2 tensor that contains start and end ids of the nodes in the latest
-// layer.
-func BoostedTreesGetEnsembleStates(scope *Scope, tree_ensemble_handle tf.Output) (stamp_token tf.Output, num_trees tf.Output, num_finalized_trees tf.Output, num_attempted_layers tf.Output, last_layer_nodes_range tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "BoostedTreesGetEnsembleStates",
-		Input: []tf.Input{
-			tree_ensemble_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4)
-}
-
-// ResourceApplyPowerSignAttr is an optional argument to ResourceApplyPowerSign.
-type ResourceApplyPowerSignAttr func(optionalAttr)
-
-// ResourceApplyPowerSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyPowerSignUseLocking(value bool) ResourceApplyPowerSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
+// Update '*var' according to the AddSign update.
 //
 // m_t <- beta1 * m_{t-1} + (1 - beta1) * g
 // update <- exp(logbase * sign_decay * sign(g) * sign(m_t)) * g
@@ -15230,355 +14935,6 @@ func SdcaFprint(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// The gradient operator for the SparseAdd op.
-//
-// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
-// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
-// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
-// values of A and B.
-//
-// Arguments:
-//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
-// the non-empty values of the sum.
-//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
-//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
-//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
-// `[nnz(sum), ndims]`.
-//
-// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
-// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
-// non-empty values of B.
-func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAddGrad",
-		Input: []tf.Input{
-			backprop_val_grad, a_indices, b_indices, sum_indices,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
-type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
-
-// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, mg, ms, and mom tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the centered RMSProp algorithm.
-//
-// The centered RMSProp algorithm uses an estimate of the centered second moment
-// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
-// uses the (uncentered) second moment. This often helps with training, but is
-// slightly more expensive in terms of computation and memory.
-//
-// Note that in dense implementation of this algorithm, mg, ms, and mom will
-// update even if the grad is zero, but in this sparse implementation, mg, ms,
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// mean_grad = decay * mean_grad + (1-decay) * gradient
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	mg: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var, ms and mom.
-//
-// Returns the created operation.
-func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyCenteredRMSProp",
-		Input: []tf.Input{
-			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Creates a dataset that batches `batch_size` elements from `input_dataset`.
-//
-// Arguments:
-//
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//
-//
-func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
-type RandomPoissonV2Attr func(optionalAttr)
-
-// RandomPoissonV2Seed sets the optional seed attribute to value.
-//
-// value: If either `seed` or `seed2` are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
-
-// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
-//
-// value: A second seed to avoid seed collision.
-// If not specified, defaults to 0
-func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["seed2"] = value
-	}
-}
-
-// RandomPoissonV2Dtype sets the optional dtype attribute to value.
-// If not specified, defaults to DT_INT64
-func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
-	return func(m optionalAttr) {
-		m["dtype"] = value
-	}
-}
-
-// Outputs random values from the Poisson distribution(s) described by rate.
-//
-// This op uses two algorithms, depending on rate. If rate >= 10, then
-// the algorithm by Hormann is used to acquire samples via
-// transformation-rejection.
-// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
-//
-// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
-// random variables.
-// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
-// Programming, Volume 2. Addison Wesley
-//
-// Arguments:
-//	shape: 1-D integer tensor. Shape of independent samples to draw from each
-// distribution described by the shape parameters given in rate.
-//	rate: A tensor in which each scalar is a "rate" parameter describing the
-// associated poisson distribution.
-//
-// Returns A tensor with shape `shape + shape(rate)`. Each slice
-// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
-// `rate[i0, i1, ...iN]`.
-func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "RandomPoissonV2",
-		Input: []tf.Input{
-			shape, rate,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
-type DecodeAndCropJpegAttr func(optionalAttr)
-
-// DecodeAndCropJpegChannels sets the optional channels attribute to value.
-//
-// value: Number of color channels for the decoded image.
-// If not specified, defaults to 0
-func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["channels"] = value
-	}
-}
-
-// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
-//
-// value: Downscaling ratio.
-// If not specified, defaults to 1
-func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["ratio"] = value
-	}
-}
-
-// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
-//
-// value: If true use a slower but nicer upscaling of the
-// chroma planes (yuv420/422 only).
-// If not specified, defaults to true
-func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["fancy_upscaling"] = value
-	}
-}
-
-// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
-//
-// value: If true try to recover an image from truncated input.
-// If not specified, defaults to false
-func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["try_recover_truncated"] = value
-	}
-}
-
-// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
-//
-// value: The minimum required fraction of lines before a truncated
-// input is accepted.
-// If not specified, defaults to 1
-func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["acceptable_fraction"] = value
-	}
-}
-
-// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
-//
-// value: string specifying a hint about the algorithm used for
-// decompression.  Defaults to "" which maps to a system-specific
-// default.  Currently valid values are ["INTEGER_FAST",
-// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
-// jpeg library changes to a version that does not have that specific
-// option.)
-// If not specified, defaults to ""
-func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
-	return func(m optionalAttr) {
-		m["dct_method"] = value
-	}
-}
-
-// Decode and Crop a JPEG-encoded image to a uint8 tensor.
-//
-// The attr `channels` indicates the desired number of color channels for the
-// decoded image.
-//
-// Accepted values are:
-//
-// *   0: Use the number of channels in the JPEG-encoded image.
-// *   1: output a grayscale image.
-// *   3: output an RGB image.
-//
-// If needed, the JPEG-encoded image is transformed to match the requested number
-// of color channels.
-//
-// The attr `ratio` allows downscaling the image by an integer factor during
-// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
-// downscaling the image later.
-//
-//
-// It is equivalent to a combination of decode and crop, but much faster by only
-// decoding partial jpeg image.
-//
-// Arguments:
-//	contents: 0-D.  The JPEG-encoded image.
-//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
-//
-// Returns 3-D with shape `[height, width, channels]`..
-func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeAndCropJpeg",
-		Input: []tf.Input{
-			contents, crop_window,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Adds two `SparseTensor` objects to produce another `SparseTensor`.
-//
-// The input `SparseTensor` objects' indices are assumed ordered in standard
-// lexicographic order.  If this is not the case, before this step run
-// `SparseReorder` to restore index ordering.
-//
-// By default, if two values sum to zero at some index, the output `SparseTensor`
-// would still include that particular location in its index, storing a zero in the
-// corresponding value slot.  To override this, callers can specify `thresh`,
-// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
-// corresponding value and index would then not be included.  In particular,
-// `thresh == 0` (default) means everything is kept and actual thresholding happens
-// only for a positive value.
-//
-// In the following shapes, `nnz` is the count after taking `thresh` into account.
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
-//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
-//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
-//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
-//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
-// pair takes space.
-func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseAdd",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
 // LRNAttr is an optional argument to LRN.
 type LRNAttr func(optionalAttr)
 
@@ -16248,6 +15604,531 @@ func VarIsInitializedOp(scope *Scope, resource tf.Output) (is_initialized tf.Out
 	return op.Output(0)
 }
 
+// ResourceSparseApplyFtrlAttr is an optional argument to ResourceSparseApplyFtrl.
+type ResourceSparseApplyFtrlAttr func(optionalAttr)
+
+// ResourceSparseApplyFtrlUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyFtrlUseLocking(value bool) ResourceSparseApplyFtrlAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update relevant entries in '*var' according to the Ftrl-proximal scheme.
+//
+// That is for rows we have grad for, we update var, accum and linear as follows:
+// accum_new = accum + grad * grad
+// linear += grad + (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyFtrl(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, lr_power tf.Output, optional ...ResourceSparseApplyFtrlAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyFtrl",
+		Input: []tf.Input{
+			var_, accum, linear, grad, indices, lr, l1, l2, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Returns which elements of x are Inf.
+//
+// @compatibility(numpy)
+// Equivalent to np.isinf
+// @end_compatibility
+func IsInf(scope *Scope, x tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "IsInf",
+		Input: []tf.Input{
+			x,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// TruncatedNormalAttr is an optional argument to TruncatedNormal.
+type TruncatedNormalAttr func(optionalAttr)
+
+// TruncatedNormalSeed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func TruncatedNormalSeed(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// TruncatedNormalSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func TruncatedNormalSeed2(value int64) TruncatedNormalAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Outputs random values from a truncated normal distribution.
+//
+// The generated values follow a normal distribution with mean 0 and standard
+// deviation 1, except that values whose magnitude is more than 2 standard
+// deviations from the mean are dropped and re-picked.
+//
+// Arguments:
+//	shape: The shape of the output tensor.
+//	dtype: The type of the output.
+//
+// Returns A tensor of the specified shape filled with random truncated normal
+// values.
+func TruncatedNormal(scope *Scope, shape tf.Output, dtype tf.DataType, optional ...TruncatedNormalAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "TruncatedNormal",
+		Input: []tf.Input{
+			shape,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SkipgramAttr is an optional argument to Skipgram.
+type SkipgramAttr func(optionalAttr)
+
+// SkipgramWindowSize sets the optional window_size attribute to value.
+//
+// value: The number of words to predict to the left and right of the target.
+// If not specified, defaults to 5
+func SkipgramWindowSize(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["window_size"] = value
+	}
+}
+
+// SkipgramMinCount sets the optional min_count attribute to value.
+//
+// value: The minimum number of word occurrences for it to be included in the
+// vocabulary.
+// If not specified, defaults to 5
+func SkipgramMinCount(value int64) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["min_count"] = value
+	}
+}
+
+// SkipgramSubsample sets the optional subsample attribute to value.
+//
+// value: Threshold for word occurrence. Words that appear with higher
+// frequency will be randomly down-sampled. Set to 0 to disable.
+// If not specified, defaults to 0.001
+func SkipgramSubsample(value float32) SkipgramAttr {
+	return func(m optionalAttr) {
+		m["subsample"] = value
+	}
+}
+
+// Parses a text file and creates a batch of examples.
+//
+// DEPRECATED at GraphDef version 19: Moving word2vec into tensorflow_models/tutorials and deprecating its ops here as a result
+//
+// Arguments:
+//	filename: The corpus's text file name.
+//	batch_size: The size of produced batch.
+//
+// Returns A vector of words in the corpus.Frequencies of words. Sorted in the non-ascending order.Number of words per epoch in the data file.The current epoch number.The total number of words processed so far.A vector of word ids.A vector of word ids.
+func Skipgram(scope *Scope, filename string, batch_size int64, optional ...SkipgramAttr) (vocab_word tf.Output, vocab_freq tf.Output, words_per_epoch tf.Output, current_epoch tf.Output, total_words_processed tf.Output, examples tf.Output, labels tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"filename": filename, "batch_size": batch_size}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Skipgram",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2), op.Output(3), op.Output(4), op.Output(5), op.Output(6)
+}
+
+// StringToNumberAttr is an optional argument to StringToNumber.
+type StringToNumberAttr func(optionalAttr)
+
+// StringToNumberOutType sets the optional out_type attribute to value.
+//
+// value: The numeric type to interpret each string in `string_tensor` as.
+// If not specified, defaults to DT_FLOAT
+func StringToNumberOutType(value tf.DataType) StringToNumberAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Converts each string in the input Tensor to the specified numeric type.
+//
+// (Note that int32 overflow results in an error while float overflow
+// results in a rounded value.)
+//
+// Returns A Tensor of the same shape as the input `string_tensor`.
+func StringToNumber(scope *Scope, string_tensor tf.Output, optional ...StringToNumberAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StringToNumber",
+		Input: []tf.Input{
+			string_tensor,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyFtrlV2Attr is an optional argument to ResourceApplyFtrlV2.
+type ResourceApplyFtrlV2Attr func(optionalAttr)
+
+// ResourceApplyFtrlV2UseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var and accum tensors will be protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyFtrlV2UseLocking(value bool) ResourceApplyFtrlV2Attr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the Ftrl-proximal scheme.
+//
+// grad_with_shrinkage = grad + 2 * l2_shrinkage * var
+// accum_new = accum + grad_with_shrinkage * grad_with_shrinkage
+// linear += grad_with_shrinkage +
+//     (accum_new^(-lr_power) - accum^(-lr_power)) / lr * var
+// quadratic = 1.0 / (accum_new^(lr_power) * lr) + 2 * l2
+// var = (sign(linear) * l1 - linear) / quadratic if |linear| > l1 else 0.0
+// accum = accum_new
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	linear: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regulariation. Must be a scalar.
+//	l2: L2 shrinkage regulariation. Must be a scalar.
+//
+//	lr_power: Scaling factor. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceApplyFtrlV2(scope *Scope, var_ tf.Output, accum tf.Output, linear tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, l2_shrinkage tf.Output, lr_power tf.Output, optional ...ResourceApplyFtrlV2Attr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyFtrlV2",
+		Input: []tf.Input{
+			var_, accum, linear, grad, lr, l1, l2, l2_shrinkage, lr_power,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// EncodeJpegAttr is an optional argument to EncodeJpeg.
+type EncodeJpegAttr func(optionalAttr)
+
+// EncodeJpegFormat sets the optional format attribute to value.
+//
+// value: Per pixel image format.
+// If not specified, defaults to ""
+func EncodeJpegFormat(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["format"] = value
+	}
+}
+
+// EncodeJpegQuality sets the optional quality attribute to value.
+//
+// value: Quality of the compression from 0 to 100 (higher is better and slower).
+// If not specified, defaults to 95
+func EncodeJpegQuality(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["quality"] = value
+	}
+}
+
+// EncodeJpegProgressive sets the optional progressive attribute to value.
+//
+// value: If True, create a JPEG that loads progressively (coarse to fine).
+// If not specified, defaults to false
+func EncodeJpegProgressive(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["progressive"] = value
+	}
+}
+
+// EncodeJpegOptimizeSize sets the optional optimize_size attribute to value.
+//
+// value: If True, spend CPU/RAM to reduce size with no quality change.
+// If not specified, defaults to false
+func EncodeJpegOptimizeSize(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["optimize_size"] = value
+	}
+}
+
+// EncodeJpegChromaDownsampling sets the optional chroma_downsampling attribute to value.
+//
+// value: See http://en.wikipedia.org/wiki/Chroma_subsampling.
+// If not specified, defaults to true
+func EncodeJpegChromaDownsampling(value bool) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["chroma_downsampling"] = value
+	}
+}
+
+// EncodeJpegDensityUnit sets the optional density_unit attribute to value.
+//
+// value: Unit used to specify `x_density` and `y_density`:
+// pixels per inch (`'in'`) or centimeter (`'cm'`).
+// If not specified, defaults to "in"
+func EncodeJpegDensityUnit(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["density_unit"] = value
+	}
+}
+
+// EncodeJpegXDensity sets the optional x_density attribute to value.
+//
+// value: Horizontal pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegXDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["x_density"] = value
+	}
+}
+
+// EncodeJpegYDensity sets the optional y_density attribute to value.
+//
+// value: Vertical pixels per density unit.
+// If not specified, defaults to 300
+func EncodeJpegYDensity(value int64) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["y_density"] = value
+	}
+}
+
+// EncodeJpegXmpMetadata sets the optional xmp_metadata attribute to value.
+//
+// value: If not empty, embed this XMP metadata in the image header.
+// If not specified, defaults to ""
+func EncodeJpegXmpMetadata(value string) EncodeJpegAttr {
+	return func(m optionalAttr) {
+		m["xmp_metadata"] = value
+	}
+}
+
+// JPEG-encode an image.
+//
+// `image` is a 3-D uint8 Tensor of shape `[height, width, channels]`.
+//
+// The attr `format` can be used to override the color format of the encoded
+// output.  Values can be:
+//
+// *   `''`: Use a default format based on the number of channels in the image.
+// *   `grayscale`: Output a grayscale JPEG image.  The `channels` dimension
+//     of `image` must be 1.
+// *   `rgb`: Output an RGB JPEG image. The `channels` dimension
+//     of `image` must be 3.
+//
+// If `format` is not specified or is the empty string, a default format is picked
+// in function of the number of channels in `image`:
+//
+// *   1: Output a grayscale image.
+// *   3: Output an RGB image.
+//
+// Arguments:
+//	image: 3-D with shape `[height, width, channels]`.
+//
+// Returns 0-D. JPEG-encoded image.
+func EncodeJpeg(scope *Scope, image tf.Output, optional ...EncodeJpegAttr) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "EncodeJpeg",
+		Input: []tf.Input{
+			image,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// MultinomialAttr is an optional argument to Multinomial.
+type MultinomialAttr func(optionalAttr)
+
+// MultinomialSeed sets the optional seed attribute to value.
+//
+// value: If either seed or seed2 is set to be non-zero, the internal random number
+// generator is seeded by the given seed.  Otherwise, a random seed is used.
+// If not specified, defaults to 0
+func MultinomialSeed(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// MultinomialSeed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func MultinomialSeed2(value int64) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// MultinomialOutputDtype sets the optional output_dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func MultinomialOutputDtype(value tf.DataType) MultinomialAttr {
+	return func(m optionalAttr) {
+		m["output_dtype"] = value
+	}
+}
+
+// Draws samples from a multinomial distribution.
+//
+// Arguments:
+//	logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice `[i, :]`
+// represents the unnormalized log probabilities for all classes.
+//	num_samples: 0-D.  Number of independent samples to draw for each row slice.
+//
+// Returns 2-D Tensor with shape `[batch_size, num_samples]`.  Each slice `[i, :]`
+// contains the drawn class labels with range `[0, num_classes)`.
+func Multinomial(scope *Scope, logits tf.Output, num_samples tf.Output, optional ...MultinomialAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Multinomial",
+		Input: []tf.Input{
+			logits, num_samples,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyAdagradDAAttr is an optional argument to ResourceSparseApplyAdagradDA.
+type ResourceSparseApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceSparseApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyAdagradDAUseLocking(value bool) ResourceSparseApplyAdagradDAAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update entries in '*var' and '*accum' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
+//
+// Returns the created operation.
+func ResourceSparseApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, indices tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceSparseApplyAdagradDAAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, indices, lr, l1, l2, global_step,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
 // Converts each string in the input Tensor to its hash mod by a number of buckets.
 //
 // The hash function is deterministic on the content of the string within the
@@ -16467,158 +16348,584 @@ func RestoreV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and
 		scope.UpdateErr("RestoreV2", err)
 		return
 	}
-	return tensors
+	return tensors
+}
+
+// Receives a tensor value broadcast from another device.
+func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	opspec := tf.OpSpec{
+		Type: "CollectiveBcastRecv",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Decode web-safe base64-encoded strings.
+//
+// Input may or may not have padding at the end. See EncodeBase64 for padding.
+// Web-safe means that input must use - and _ instead of + and /.
+//
+// Arguments:
+//	input: Base64 strings to decode.
+//
+// Returns Decoded strings.
+func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeBase64",
+		Input: []tf.Input{
+			input,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Store the input tensor in the state of the current session.
+//
+// Arguments:
+//	value: The tensor to be stored.
+//
+// Returns The handle for the tensor stored in the session state, represented
+// as a string.
+func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "GetSessionHandle",
+		Input: []tf.Input{
+			value,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
+type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+
+// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+//
+// That is for rows we have grad for, we update var and accum as follows:
+// accum += grad * grad
+// prox_v = var
+// prox_v -= lr * grad * (1 / sqrt(accum))
+// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	accum: Should be from a Variable().
+//	lr: Learning rate. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
+//
+// Returns the created operation.
+func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceSparseApplyProximalAdagrad",
+		Input: []tf.Input{
+			var_, accum, lr, l1, l2, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
+type MaxPool3DGradAttr func(optionalAttr)
+
+// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+//
+// value: The data format of the input and output data. With the
+// default format "NDHWC", the data is stored in the order of:
+//     [batch, in_depth, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCDHW", the data storage order is:
+//     [batch, in_channels, in_depth, in_height, in_width].
+// If not specified, defaults to "NDHWC"
+func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+	return func(m optionalAttr) {
+		m["data_format"] = value
+	}
+}
+
+// Computes gradients of max pooling function.
+//
+// Arguments:
+//	orig_input: The original input tensor.
+//	orig_output: The original output tensor.
+//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
+//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
+// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
+//	strides: 1-D tensor of length 5. The stride of the sliding window for each
+// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	padding: The type of padding algorithm to use.
+func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MaxPool3DGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Returns the name of the device on which `resource` has been placed.
+func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ExperimentalIteratorGetDevice",
+		Input: []tf.Input{
+			resource,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseReduceSumAttr is an optional argument to SparseReduceSum.
+type SparseReduceSumAttr func(optionalAttr)
+
+// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+//
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+	return func(m optionalAttr) {
+		m["keep_dims"] = value
+	}
+}
+
+// Computes the sum of elements across dimensions of a SparseTensor.
+//
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+// instead of a sparse one.
+//
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
+//
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
+//
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//
+// Returns `R-K`-D.  The reduced Tensor.
+func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReduceSum",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape, reduction_axes,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
+type SparseTensorDenseMatMulAttr func(optionalAttr)
+
+// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
+//
+// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
+// is transpose(conj(A)).  Otherwise it's transpose(A).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_a"] = value
+	}
+}
+
+// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
+//
+// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
+// is transpose(conj(B)).  Otherwise it's transpose(B).
+// If not specified, defaults to false
+func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
+	return func(m optionalAttr) {
+		m["adjoint_b"] = value
+	}
+}
+
+// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
+//
+// No validity checking is performed on the indices of A.  However, the following
+// input format is recommended for optimal behavior:
+//
+// if adjoint_a == false:
+//   A should be sorted in lexicographically increasing order.  Use SparseReorder
+//   if you're not sure.
+// if adjoint_a == true:
+//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
+//   order instead of "row major" order).
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
+//	b: 2-D.  A dense Matrix.
+func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseMatMul",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
+type ResourceApplyRMSPropAttr func(optionalAttr)
+
+// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
+//
+// value: If `True`, updating of the var, ms, and mom tensors is protected
+// by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the RMSProp algorithm.
+//
+// Note that in dense implementation of this algorithm, ms and mom will
+// update even if the grad is zero, but in this sparse implementation, ms
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//
+// Returns the created operation.
+func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ResourceApplyRMSProp",
+		Input: []tf.Input{
+			var_, ms, mom, lr, rho, momentum, epsilon, grad,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// SerializeManySparseAttr is an optional argument to SerializeManySparse.
+type SerializeManySparseAttr func(optionalAttr)
+
+// SerializeManySparseOutType sets the optional out_type attribute to value.
+//
+// value: The `dtype` to use for serialization; the supported types are `string`
+// (default) and `variant`.
+// If not specified, defaults to DT_STRING
+func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
 }
 
-// Receives a tensor value broadcast from another device.
-func CollectiveBcastRecv(scope *Scope, T tf.DataType, group_size int64, group_key int64, instance_key int64, shape tf.Shape) (data tf.Output) {
+// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+//
+// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+// is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The serialized
+// `SparseTensor` objects going into each row of `serialized_sparse` will have
+// rank `R-1`.
+//
+// The minibatch size `N` is extracted from `sparse_shape[0]`.
+//
+// Arguments:
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"T": T, "group_size": group_size, "group_key": group_key, "instance_key": instance_key, "shape": shape}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "CollectiveBcastRecv",
-
+		Type: "SerializeManySparse",
+		Input: []tf.Input{
+			sparse_indices, sparse_values, sparse_shape,
+		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Decode web-safe base64-encoded strings.
-//
-// Input may or may not have padding at the end. See EncodeBase64 for padding.
-// Web-safe means that input must use - and _ instead of + and /.
-//
-// Arguments:
-//	input: Base64 strings to decode.
-//
-// Returns Decoded strings.
-func DecodeBase64(scope *Scope, input tf.Output) (output tf.Output) {
+// Computes inverse hyperbolic cosine of x element-wise.
+func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeBase64",
+		Type: "Acosh",
 		Input: []tf.Input{
-			input,
+			x,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Store the input tensor in the state of the current session.
-//
-// Arguments:
-//	value: The tensor to be stored.
+// TensorArrayV2Attr is an optional argument to TensorArrayV2.
+type TensorArrayV2Attr func(optionalAttr)
+
+// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
+// If not specified, defaults to <unknown_rank:true >
+func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["element_shape"] = value
+	}
+}
+
+// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// If not specified, defaults to false
+func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["dynamic_size"] = value
+	}
+}
+
+// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
+// If not specified, defaults to true
+func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["clear_after_read"] = value
+	}
+}
+
+// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
+// If not specified, defaults to ""
+func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+	return func(m optionalAttr) {
+		m["tensor_array_name"] = value
+	}
+}
+
+// Deprecated. Use TensorArrayV3
 //
-// Returns The handle for the tensor stored in the session state, represented
-// as a string.
-func GetSessionHandle(scope *Scope, value tf.Output) (handle tf.Output) {
+// DEPRECATED at GraphDef version 26: Use TensorArrayV3
+func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{"dtype": dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "GetSessionHandle",
+		Type: "TensorArrayV2",
 		Input: []tf.Input{
-			value,
+			size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// ResourceSparseApplyProximalAdagradAttr is an optional argument to ResourceSparseApplyProximalAdagrad.
-type ResourceSparseApplyProximalAdagradAttr func(optionalAttr)
+// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
+type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
 
-// ResourceSparseApplyProximalAdagradUseLocking sets the optional use_locking attribute to value.
+// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalAdagradUseLocking(value bool) ResourceSparseApplyProximalAdagradAttr {
+// value: If either seed or seed2 are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
 	return func(m optionalAttr) {
-		m["use_locking"] = value
+		m["seed"] = value
 	}
 }
 
-// Sparse update entries in '*var' and '*accum' according to FOBOS algorithm.
+// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
 //
-// That is for rows we have grad for, we update var and accum as follows:
-// accum += grad * grad
-// prox_v = var
-// prox_v -= lr * grad * (1 / sqrt(accum))
-// var = sign(prox_v)/(1+lr*l2) * max{|prox_v|-lr*l1,0}
+// value: An second seed to avoid seed collision.
+// If not specified, defaults to 0
+func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// Generates labels for candidate sampling with a learned unigram distribution.
+//
+// See explanations of candidate sampling and the data formats at
+// go/candidate-sampling.
+//
+// For each batch, this op picks a single set of sampled candidate labels.
+//
+// The advantages of sampling candidates per-batch are simplicity and the
+// possibility of efficient dense matrix multiplication. The disadvantage is that
+// the sampled candidates must be chosen independently of the context and of the
+// true labels.
 //
 // Arguments:
-//	var_: Should be from a Variable().
-//	accum: Should be from a Variable().
-//	lr: Learning rate. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
+//	true_classes: A batch_size * num_true matrix, in which each row contains the
+// IDs of the num_true target_classes in the corresponding original label.
+//	num_true: Number of true labels per context.
+//	num_sampled: Number of candidates to randomly sample.
+//	unique: If unique is true, we sample with rejection, so that all sampled
+// candidates in a batch are unique. This requires some approximation to
+// estimate the post-rejection sampling probabilities.
+//	range_max: The sampler will sample integers from the interval [0, range_max).
 //
-// Returns the created operation.
-func ResourceSparseApplyProximalAdagrad(scope *Scope, var_ tf.Output, accum tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalAdagradAttr) (o *tf.Operation) {
+// Returns A vector of length num_sampled, in which each element is
+// the ID of a sampled candidate.A batch_size * num_true matrix, representing
+// the number of times each candidate is expected to occur in a batch
+// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
+// candidate representing the number of times the candidate is expected
+// to occur in a batch of sampled candidates.  If unique=true, then this is a
+// probability.
+func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalAdagrad",
+		Type: "ThreadUnsafeUnigramCandidateSampler",
 		Input: []tf.Input{
-			var_, accum, lr, l1, l2, grad, indices,
+			true_classes,
 		},
 		Attrs: attrs,
 	}
-	return scope.AddOperation(opspec)
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPool3DGradAttr is an optional argument to MaxPool3DGrad.
-type MaxPool3DGradAttr func(optionalAttr)
+// MaxPoolV2Attr is an optional argument to MaxPoolV2.
+type MaxPoolV2Attr func(optionalAttr)
 
-// MaxPool3DGradDataFormat sets the optional data_format attribute to value.
+// MaxPoolV2DataFormat sets the optional data_format attribute to value.
 //
-// value: The data format of the input and output data. With the
-// default format "NDHWC", the data is stored in the order of:
-//     [batch, in_depth, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCDHW", the data storage order is:
-//     [batch, in_channels, in_depth, in_height, in_width].
-// If not specified, defaults to "NDHWC"
-func MaxPool3DGradDataFormat(value string) MaxPool3DGradAttr {
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
 	return func(m optionalAttr) {
 		m["data_format"] = value
 	}
 }
 
-// Computes gradients of max pooling function.
+// Performs max pooling on the input.
 //
 // Arguments:
-//	orig_input: The original input tensor.
-//	orig_output: The original output tensor.
-//	grad: Output backprop of shape `[batch, depth, rows, cols, channels]`.
-//	ksize: 1-D tensor of length 5. The size of the window for each dimension of
-// the input tensor. Must have `ksize[0] = ksize[4] = 1`.
-//	strides: 1-D tensor of length 5. The stride of the sliding window for each
-// dimension of `input`. Must have `strides[0] = strides[4] = 1`.
+//	input: 4-D input to pool over.
+//	ksize: The size of the window for each dimension of the input tensor.
+//	strides: The stride of the sliding window for each dimension of the
+// input tensor.
 //	padding: The type of padding algorithm to use.
-func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...MaxPool3DGradAttr) (output tf.Output) {
+//
+// Returns The max pooled output tensor.
+func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	attrs := map[string]interface{}{"padding": padding}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MaxPool3DGrad",
+		Type: "MaxPoolV2",
 		Input: []tf.Input{
-			orig_input, orig_output, grad,
+			input, ksize, strides,
 		},
 		Attrs: attrs,
 	}
@@ -16626,58 +16933,65 @@ func MaxPool3DGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, gr
 	return op.Output(0)
 }
 
-// Returns the name of the device on which `resource` has been placed.
-func ExperimentalIteratorGetDevice(scope *Scope, resource tf.Output) (device tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ExperimentalIteratorGetDevice",
-		Input: []tf.Input{
-			resource,
-		},
+// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
+type AddManySparseToTensorsMapAttr func(optionalAttr)
+
+// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
+//
+// value: The container name for the `SparseTensorsMap` created by this op.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// SparseReduceSumAttr is an optional argument to SparseReduceSum.
-type SparseReduceSumAttr func(optionalAttr)
-
-// SparseReduceSumKeepDims sets the optional keep_dims attribute to value.
+// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
 //
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceSumKeepDims(value bool) SparseReduceSumAttr {
+// value: The shared name for the `SparseTensorsMap` created by this op.
+// If blank, the new Operation's unique name is used.
+// If not specified, defaults to ""
+func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
 	return func(m optionalAttr) {
-		m["keep_dims"] = value
+		m["shared_name"] = value
 	}
 }
 
-// Computes the sum of elements across dimensions of a SparseTensor.
+// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
-// instead of a sparse one.
+// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
+// `sparse_values`, and `sparse_shape`, where
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
+// having a first `sparse_indices` column taking values between `[0, N)`, where
+// the minibatch size `N == sparse_shape[0]`.
+//
+// The input `SparseTensor` must have rank `R` greater than 1, and the first
+// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
+// must be sorted in increasing order of this first dimension.  The stored
+// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
+// will have rank `R-1`.
+//
+// The `SparseTensor` values can then be read out as part of a minibatch by passing
+// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
+// the correct `SparseTensorsMap` is accessed, ensure that the same
+// `container` and `shared_name` are passed to that Op.  If no `shared_name`
+// is provided here, instead use the *name* of the Operation created by calling
+// `AddManySparseToTensorsMap` as the `shared_name` passed to
+// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
+// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
+//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
+//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
+// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns `R-K`-D.  The reduced Tensor.
-func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceSumAttr) (output tf.Output) {
+// Returns 1-D.  The handles of the `SparseTensor` now stored in the
+// `SparseTensorsMap`.  Shape: `[N]`.
+func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -16686,9 +17000,9 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceSum",
+		Type: "AddManySparseToTensorsMap",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			sparse_indices, sparse_values, sparse_shape,
 		},
 		Attrs: attrs,
 	}
@@ -16696,6 +17010,46 @@ func SparseReduceSum(scope *Scope, input_indices tf.Output, input_values tf.Outp
 	return op.Output(0)
 }
 
+// Concatenates tensors along one dimension.
+//
+// Arguments:
+//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
+// and their sizes must match in all dimensions except `concat_dim`.
+//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
+// range [-rank(values), rank(values)).
+//
+// Returns A `Tensor` with the concatenation of values stacked along the
+// `concat_dim` dimension.  This tensor's shape matches that of `values` except
+// in `concat_dim` where it has the sum of the sizes.
+func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ConcatV2",
+		Input: []tf.Input{
+			tf.OutputList(values), axis,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Reads and outputs the entire contents of the input filename.
+func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReadFile",
+		Input: []tf.Input{
+			filename,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns immutable tensor from memory region.
 //
 // The current implementation memmaps the tensor from a file.
@@ -16783,6 +17137,44 @@ func EmptyTensorList(scope *Scope, element_shape tf.Output, max_num_elements tf.
 	return op.Output(0)
 }
 
+// Computes softsign gradients for a softsign operation.
+//
+// Arguments:
+//	gradients: The backpropagated gradients to the corresponding softsign operation.
+//	features: The features passed as input to the corresponding softsign operation.
+//
+// Returns The gradients: `gradients / (1 + abs(features)) ** 2`.
+func SoftsignGrad(scope *Scope, gradients tf.Output, features tf.Output) (backprops tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SoftsignGrad",
+		Input: []tf.Input{
+			gradients, features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Provides the time since epoch in seconds.
+//
+// Returns the timestamp as a `float64` for seconds since the Unix epoch.
+//
+// Note: the timestamp is computed when the op is executed, not when it is added
+// to the graph.
+func Timestamp(scope *Scope) (ts tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Timestamp",
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // VariableShapeAttr is an optional argument to VariableShape.
 type VariableShapeAttr func(optionalAttr)
 
@@ -16823,80 +17215,96 @@ func VariableShape(scope *Scope, input tf.Output, optional ...VariableShapeAttr)
 	return op.Output(0)
 }
 
-// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
-type SparseToSparseSetOperationAttr func(optionalAttr)
+// AvgPoolGradAttr is an optional argument to AvgPoolGrad.
+type AvgPoolGradAttr func(optionalAttr)
 
-// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
-// If not specified, defaults to true
-func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
+// AvgPoolGradDataFormat sets the optional data_format attribute to value.
+//
+// value: Specify the data format of the input and output data. With the
+// default format "NHWC", the data is stored in the order of:
+//     [batch, in_height, in_width, in_channels].
+// Alternatively, the format could be "NCHW", the data storage order of:
+//     [batch, in_channels, in_height, in_width].
+// If not specified, defaults to "NHWC"
+func AvgPoolGradDataFormat(value string) AvgPoolGradAttr {
 	return func(m optionalAttr) {
-		m["validate_indices"] = value
+		m["data_format"] = value
 	}
 }
 
-// Applies set operation along last dimension of 2 `SparseTensor` inputs.
-//
-// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
+// Computes gradients of the average pooling function.
 //
-// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
-// order and range of `set1` and `set2` indices.
+// Arguments:
+//	orig_input_shape: 1-D.  Shape of the original input to `avg_pool`.
+//	grad: 4-D with shape `[batch, height, width, channels]`.  Gradients w.r.t.
+// the output of `avg_pool`.
+//	ksize: The size of the sliding window for each dimension of the input.
+//	strides: The stride of the sliding window for each dimension of the input.
+//	padding: The type of padding algorithm to use.
 //
-// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
-// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// Returns 4-D.  Gradients w.r.t. the input of `avg_pool`.
+func AvgPoolGrad(scope *Scope, orig_input_shape tf.Output, grad tf.Output, ksize []int64, strides []int64, padding string, optional ...AvgPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"ksize": ksize, "strides": strides, "padding": padding}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "AvgPoolGrad",
+		Input: []tf.Input{
+			orig_input_shape, grad,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Greedily selects a subset of bounding boxes in descending order of score,
 //
-// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
-// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
-// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
-// ignored.
+// pruning away boxes that have high overlaps
+// with previously selected boxes.  Bounding boxes with score less than
+// `score_threshold` are removed. N-by-n overlap values are supplied as square matrix,
+// which allows for defining a custom overlap criterium (eg. intersection over union,
+// intersection over area, etc.).
 //
-// If `validate_indices` is `True`, this op validates the order and range of `set1`
-// and `set2` indices.
+// The output of this operation is a set of integers indexing into the input
+// collection of bounding boxes representing the selected boxes.  The bounding
+// box coordinates corresponding to the selected indices can then be obtained
+// using the `tf.gather operation`.  For example:
 //
-// Output `result` is a `SparseTensor` represented by `result_indices`,
-// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
-// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
-// dimension contains the result of `set_operation` applied to the corresponding
-// `[0...n-1]` dimension of `set`.
+//   selected_indices = tf.image.non_max_suppression_with_overlaps(
+//       overlaps, scores, max_output_size, overlap_threshold, score_threshold)
+//   selected_boxes = tf.gather(boxes, selected_indices)
 //
 // Arguments:
-//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
-// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
-// order.
-//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
-// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
-// max set size across `0...n-1` dimensions.
-//
+//	overlaps: A 2-D float tensor of shape `[num_boxes, num_boxes]` representing
+// the n-by-n box overlap values.
+//	scores: A 1-D float tensor of shape `[num_boxes]` representing a single
+// score corresponding to each box (each row of boxes).
+//	max_output_size: A scalar integer tensor representing the maximum number of
+// boxes to be selected by non max suppression.
+//	overlap_threshold: A 0-D float tensor representing the threshold for deciding whether
+// boxes overlap too.
+//	score_threshold: A 0-D float tensor representing the threshold for deciding when to remove
+// boxes based on score.
 //
-// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
-// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
-// is the max result set size across all `0...n-1` dimensions.
-func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
+// Returns A 1-D integer tensor of shape `[M]` representing the selected
+// indices from the boxes tensor, where `M <= max_output_size`.
+func NonMaxSuppressionWithOverlaps(scope *Scope, overlaps tf.Output, scores tf.Output, max_output_size tf.Output, overlap_threshold tf.Output, score_threshold tf.Output) (selected_indices tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"set_operation": set_operation}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseToSparseSetOperation",
+		Type: "NonMaxSuppressionWithOverlaps",
 		Input: []tf.Input{
-			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
+			overlaps, scores, max_output_size, overlap_threshold, score_threshold,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
 // Computes softmax cross entropy cost and gradients to backpropagate.
@@ -17290,47 +17698,6 @@ func Conv3DBackpropInput(scope *Scope, input tf.Output, filter tf.Output, out_ba
 	return op.Output(0)
 }
 
-// Subtracts sparse updates from the variable referenced by `resource`.
-//
-// This operation computes
-//
-//     # Scalar indices
-//     ref[indices, ...] -= updates[...]
-//
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] -= updates[i, ...]
-//
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] -= updates[i, ..., j, ...]
-//
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions add.
-//
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
-//
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
-//
-// Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
-//
-// Returns the created operation.
-func ResourceScatterSub(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceScatterSub",
-		Input: []tf.Input{
-			resource, indices, updates,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
 // DepthwiseConv2dNativeBackpropFilterAttr is an optional argument to DepthwiseConv2dNativeBackpropFilter.
 type DepthwiseConv2dNativeBackpropFilterAttr func(optionalAttr)
 
@@ -17911,6 +18278,38 @@ func TopK(scope *Scope, input tf.Output, k int64, optional ...TopKAttr) (values
 	return op.Output(0), op.Output(1)
 }
 
+// The gradient operator for the SparseAdd op.
+//
+// The SparseAdd op calculates A + B, where A, B, and the sum are all represented
+// as `SparseTensor` objects.  This op takes in the upstream gradient w.r.t.
+// non-empty values of the sum, and outputs the gradients w.r.t. the non-empty
+// values of A and B.
+//
+// Arguments:
+//	backprop_val_grad: 1-D with shape `[nnz(sum)]`.  The gradient with respect to
+// the non-empty values of the sum.
+//	a_indices: 2-D.  The `indices` of the `SparseTensor` A, size `[nnz(A), ndims]`.
+//	b_indices: 2-D.  The `indices` of the `SparseTensor` B, size `[nnz(B), ndims]`.
+//	sum_indices: 2-D.  The `indices` of the sum `SparseTensor`, size
+// `[nnz(sum), ndims]`.
+//
+// Returns 1-D with shape `[nnz(A)]`. The gradient with respect to the
+// non-empty values of A.1-D with shape `[nnz(B)]`. The gradient with respect to the
+// non-empty values of B.
+func SparseAddGrad(scope *Scope, backprop_val_grad tf.Output, a_indices tf.Output, b_indices tf.Output, sum_indices tf.Output) (a_val_grad tf.Output, b_val_grad tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAddGrad",
+		Input: []tf.Input{
+			backprop_val_grad, a_indices, b_indices, sum_indices,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
 // Returns a list of tensors with the same shapes and contents as the input
 //
 // tensors.
@@ -18017,113 +18416,445 @@ func ResourceApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms
 	return scope.AddOperation(opspec)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
+// ResourceSparseApplyCenteredRMSPropAttr is an optional argument to ResourceSparseApplyCenteredRMSProp.
+type ResourceSparseApplyCenteredRMSPropAttr func(optionalAttr)
+
+// ResourceSparseApplyCenteredRMSPropUseLocking sets the optional use_locking attribute to value.
 //
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+// value: If `True`, updating of the var, mg, ms, and mom tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceSparseApplyCenteredRMSPropUseLocking(value bool) ResourceSparseApplyCenteredRMSPropAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the centered RMSProp algorithm.
+//
+// The centered RMSProp algorithm uses an estimate of the centered second moment
+// (i.e., the variance) for normalization, as opposed to regular RMSProp, which
+// uses the (uncentered) second moment. This often helps with training, but is
+// slightly more expensive in terms of computation and memory.
+//
+// Note that in dense implementation of this algorithm, mg, ms, and mom will
+// update even if the grad is zero, but in this sparse implementation, mg, ms,
+// and mom will not update in iterations during which the grad is zero.
+//
+// mean_square = decay * mean_square + (1-decay) * gradient ** 2
+// mean_grad = decay * mean_grad + (1-decay) * gradient
+// Delta = learning_rate * gradient / sqrt(mean_square + epsilon - mean_grad ** 2)
+//
+// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
+// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
+// var <- var - mom
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	mg: Should be from a Variable().
+//	ms: Should be from a Variable().
+//	mom: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	rho: Decay rate. Must be a scalar.
+//
+//	epsilon: Ridge term. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var, ms and mom.
+//
+// Returns the created operation.
+func ResourceSparseApplyCenteredRMSProp(scope *Scope, var_ tf.Output, mg tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyCenteredRMSPropAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ReciprocalGrad",
+		Type: "ResourceSparseApplyCenteredRMSProp",
 		Input: []tf.Input{
-			y, dy,
+			var_, mg, ms, mom, lr, rho, momentum, epsilon, grad, indices,
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// Creates a dataset that batches `batch_size` elements from `input_dataset`.
+//
+// Arguments:
+//
+//	batch_size: A scalar representing the number of elements to accumulate in a
+// batch.
+//
+//
+func BatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
+	opspec := tf.OpSpec{
+		Type: "BatchDataset",
+		Input: []tf.Input{
+			input_dataset, batch_size,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
+// RandomPoissonV2Attr is an optional argument to RandomPoissonV2.
+type RandomPoissonV2Attr func(optionalAttr)
+
+// RandomPoissonV2Seed sets the optional seed attribute to value.
+//
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed"] = value
+	}
+}
+
+// RandomPoissonV2Seed2 sets the optional seed2 attribute to value.
+//
+// value: A second seed to avoid seed collision.
+// If not specified, defaults to 0
+func RandomPoissonV2Seed2(value int64) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["seed2"] = value
+	}
+}
+
+// RandomPoissonV2Dtype sets the optional dtype attribute to value.
+// If not specified, defaults to DT_INT64
+func RandomPoissonV2Dtype(value tf.DataType) RandomPoissonV2Attr {
+	return func(m optionalAttr) {
+		m["dtype"] = value
+	}
+}
+
+// Outputs random values from the Poisson distribution(s) described by rate.
+//
+// This op uses two algorithms, depending on rate. If rate >= 10, then
+// the algorithm by Hormann is used to acquire samples via
+// transformation-rejection.
+// See http://www.sciencedirect.com/science/article/pii/0167668793909974.
+//
+// Otherwise, Knuth's algorithm is used to acquire samples via multiplying uniform
+// random variables.
+// See Donald E. Knuth (1969). Seminumerical Algorithms. The Art of Computer
+// Programming, Volume 2. Addison Wesley
+//
+// Arguments:
+//	shape: 1-D integer tensor. Shape of independent samples to draw from each
+// distribution described by the shape parameters given in rate.
+//	rate: A tensor in which each scalar is a "rate" parameter describing the
+// associated poisson distribution.
+//
+// Returns A tensor with shape `shape + shape(rate)`. Each slice
+// `[:, ..., :, i0, i1, ...iN]` contains the samples drawn for
+// `rate[i0, i1, ...iN]`.
+func RandomPoissonV2(scope *Scope, shape tf.Output, rate tf.Output, optional ...RandomPoissonV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "RandomPoissonV2",
+		Input: []tf.Input{
+			shape, rate,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DecodeAndCropJpegAttr is an optional argument to DecodeAndCropJpeg.
+type DecodeAndCropJpegAttr func(optionalAttr)
+
+// DecodeAndCropJpegChannels sets the optional channels attribute to value.
+//
+// value: Number of color channels for the decoded image.
+// If not specified, defaults to 0
+func DecodeAndCropJpegChannels(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["channels"] = value
+	}
+}
+
+// DecodeAndCropJpegRatio sets the optional ratio attribute to value.
+//
+// value: Downscaling ratio.
+// If not specified, defaults to 1
+func DecodeAndCropJpegRatio(value int64) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["ratio"] = value
+	}
+}
+
+// DecodeAndCropJpegFancyUpscaling sets the optional fancy_upscaling attribute to value.
+//
+// value: If true use a slower but nicer upscaling of the
+// chroma planes (yuv420/422 only).
+// If not specified, defaults to true
+func DecodeAndCropJpegFancyUpscaling(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["fancy_upscaling"] = value
+	}
+}
+
+// DecodeAndCropJpegTryRecoverTruncated sets the optional try_recover_truncated attribute to value.
+//
+// value: If true try to recover an image from truncated input.
+// If not specified, defaults to false
+func DecodeAndCropJpegTryRecoverTruncated(value bool) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["try_recover_truncated"] = value
+	}
+}
+
+// DecodeAndCropJpegAcceptableFraction sets the optional acceptable_fraction attribute to value.
+//
+// value: The minimum required fraction of lines before a truncated
+// input is accepted.
+// If not specified, defaults to 1
+func DecodeAndCropJpegAcceptableFraction(value float32) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["acceptable_fraction"] = value
+	}
+}
+
+// DecodeAndCropJpegDctMethod sets the optional dct_method attribute to value.
+//
+// value: string specifying a hint about the algorithm used for
+// decompression.  Defaults to "" which maps to a system-specific
+// default.  Currently valid values are ["INTEGER_FAST",
+// "INTEGER_ACCURATE"].  The hint may be ignored (e.g., the internal
+// jpeg library changes to a version that does not have that specific
+// option.)
+// If not specified, defaults to ""
+func DecodeAndCropJpegDctMethod(value string) DecodeAndCropJpegAttr {
+	return func(m optionalAttr) {
+		m["dct_method"] = value
+	}
+}
+
+// Decode and Crop a JPEG-encoded image to a uint8 tensor.
+//
+// The attr `channels` indicates the desired number of color channels for the
+// decoded image.
+//
+// Accepted values are:
+//
+// *   0: Use the number of channels in the JPEG-encoded image.
+// *   1: output a grayscale image.
+// *   3: output an RGB image.
+//
+// If needed, the JPEG-encoded image is transformed to match the requested number
+// of color channels.
+//
+// The attr `ratio` allows downscaling the image by an integer factor during
+// decoding.  Allowed values are: 1, 2, 4, and 8.  This is much faster than
+// downscaling the image later.
+//
+//
+// It is equivalent to a combination of decode and crop, but much faster by only
+// decoding partial jpeg image.
+//
+// Arguments:
+//	contents: 0-D.  The JPEG-encoded image.
+//	crop_window: 1-D.  The crop window: [crop_y, crop_x, crop_height, crop_width].
+//
+// Returns 3-D with shape `[height, width, channels]`..
+func DecodeAndCropJpeg(scope *Scope, contents tf.Output, crop_window tf.Output, optional ...DecodeAndCropJpegAttr) (image tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DecodeAndCropJpeg",
+		Input: []tf.Input{
+			contents, crop_window,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Adds two `SparseTensor` objects to produce another `SparseTensor`.
+//
+// The input `SparseTensor` objects' indices are assumed ordered in standard
+// lexicographic order.  If this is not the case, before this step run
+// `SparseReorder` to restore index ordering.
+//
+// By default, if two values sum to zero at some index, the output `SparseTensor`
+// would still include that particular location in its index, storing a zero in the
+// corresponding value slot.  To override this, callers can specify `thresh`,
+// indicating that if the sum has a magnitude strictly smaller than `thresh`, its
+// corresponding value and index would then not be included.  In particular,
+// `thresh == 0` (default) means everything is kept and actual thresholding happens
+// only for a positive value.
+//
+// In the following shapes, `nnz` is the count after taking `thresh` into account.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the first `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	a_values: 1-D.  The `values` of the first `SparseTensor`, size `[nnz]` Vector.
+//	a_shape: 1-D.  The `shape` of the first `SparseTensor`, size `[ndims]` Vector.
+//	b_indices: 2-D.  The `indices` of the second `SparseTensor`, size `[nnz, ndims]` Matrix.
+//	b_values: 1-D.  The `values` of the second `SparseTensor`, size `[nnz]` Vector.
+//	b_shape: 1-D.  The `shape` of the second `SparseTensor`, size `[ndims]` Vector.
+//	thresh: 0-D.  The magnitude threshold that determines if an output value/index
+// pair takes space.
+func SparseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b_indices tf.Output, b_values tf.Output, b_shape tf.Output, thresh tf.Output) (sum_indices tf.Output, sum_values tf.Output, sum_shape tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b_indices, b_values, b_shape, thresh,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1), op.Output(2)
+}
+
+// QuantizedRelu6Attr is an optional argument to QuantizedRelu6.
+type QuantizedRelu6Attr func(optionalAttr)
+
+// QuantizedRelu6OutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedRelu6OutType(value tf.DataType) QuantizedRelu6Attr {
+	return func(m optionalAttr) {
+		m["out_type"] = value
+	}
+}
+
+// Computes Quantized Rectified Linear 6: `min(max(features, 0), 6)`
+//
+// Arguments:
 //
-// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu6(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedRelu6Attr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "Minimum",
+		Type: "QuantizedRelu6",
 		Input: []tf.Input{
-			x, y,
+			features, min_features, max_features,
 		},
+		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MfccAttr is an optional argument to Mfcc.
-type MfccAttr func(optionalAttr)
+// FixedLengthRecordReaderV2Attr is an optional argument to FixedLengthRecordReaderV2.
+type FixedLengthRecordReaderV2Attr func(optionalAttr)
 
-// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+// FixedLengthRecordReaderV2HeaderBytes sets the optional header_bytes attribute to value.
 //
-// value: The highest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 4000
-func MfccUpperFrequencyLimit(value float32) MfccAttr {
+// value: Number of bytes in the header, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HeaderBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["upper_frequency_limit"] = value
+		m["header_bytes"] = value
 	}
 }
 
-// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
+// FixedLengthRecordReaderV2FooterBytes sets the optional footer_bytes attribute to value.
 //
-// value: The lowest frequency to use when calculating the
-// ceptstrum.
-// If not specified, defaults to 20
-func MfccLowerFrequencyLimit(value float32) MfccAttr {
+// value: Number of bytes in the footer, defaults to 0.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2FooterBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["lower_frequency_limit"] = value
+		m["footer_bytes"] = value
 	}
 }
 
-// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
+// FixedLengthRecordReaderV2HopBytes sets the optional hop_bytes attribute to value.
 //
-// value: Resolution of the Mel bank used internally.
-// If not specified, defaults to 40
-func MfccFilterbankChannelCount(value int64) MfccAttr {
+// value: Number of bytes to hop before each read. Default of 0 means using
+// record_bytes.
+// If not specified, defaults to 0
+func FixedLengthRecordReaderV2HopBytes(value int64) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["filterbank_channel_count"] = value
+		m["hop_bytes"] = value
 	}
 }
 
-// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
+// FixedLengthRecordReaderV2Container sets the optional container attribute to value.
 //
-// value: How many output channels to produce per time slice.
-// If not specified, defaults to 13
-func MfccDctCoefficientCount(value int64) MfccAttr {
+// value: If non-empty, this reader is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Container(value string) FixedLengthRecordReaderV2Attr {
 	return func(m optionalAttr) {
-		m["dct_coefficient_count"] = value
+		m["container"] = value
 	}
 }
 
-// Transforms a spectrogram into a form that's useful for speech recognition.
+// FixedLengthRecordReaderV2SharedName sets the optional shared_name attribute to value.
 //
-// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
-// been effective as an input feature for machine learning. They are created by
-// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
-// higher frequencies that are less significant to the human ear. They have a long
-// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-// is a good resource to learn more.
+// value: If non-empty, this reader is named in the given bucket
+// with this shared_name. Otherwise, the node name is used instead.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2SharedName(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// FixedLengthRecordReaderV2Encoding sets the optional encoding attribute to value.
+//
+// value: The type of encoding for the file. Currently ZLIB and GZIP
+// are supported. Defaults to none.
+// If not specified, defaults to ""
+func FixedLengthRecordReaderV2Encoding(value string) FixedLengthRecordReaderV2Attr {
+	return func(m optionalAttr) {
+		m["encoding"] = value
+	}
+}
+
+// A Reader that outputs fixed-length records from a file.
 //
 // Arguments:
-//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
-// set to true.
-//	sample_rate: How many samples per second the source audio used.
-func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
+//	record_bytes: Number of bytes in the record.
+//
+// Returns The handle to reference the Reader.
+func FixedLengthRecordReaderV2(scope *Scope, record_bytes int64, optional ...FixedLengthRecordReaderV2Attr) (reader_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"record_bytes": record_bytes}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Mfcc",
-		Input: []tf.Input{
-			spectrogram, sample_rate,
-		},
+		Type: "FixedLengthRecordReaderV2",
+
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
@@ -18240,23 +18971,6 @@ func Qr(scope *Scope, input tf.Output, optional ...QrAttr) (q tf.Output, r tf.Ou
 	return op.Output(0), op.Output(1)
 }
 
-// Records the bytes size of each element of `input_dataset` in a StatsAggregator.
-func BytesProducedStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "BytesProducedStatsDataset",
-		Input: []tf.Input{
-			input_dataset, tag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Check if the input matches the regex pattern.
 //
 // The input is a string tensor of any shape. The pattern is the
@@ -18275,401 +18989,11 @@ func StaticRegexFullMatch(scope *Scope, input tf.Output, pattern string) (output
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"pattern": pattern}
-	opspec := tf.OpSpec{
-		Type: "StaticRegexFullMatch",
-		Input: []tf.Input{
-			input,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
-type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
-
-// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
-//
-// value: If True, the subtraction will be protected by a lock;
-// otherwise the behavior is undefined, but may exhibit less contention.
-// If not specified, defaults to false
-func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
-//
-// That is for rows we have grad for, we update var as follows:
-// prox_v = var - alpha * grad
-// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	alpha: Scaling factor. Must be a scalar.
-//	l1: L1 regularization. Must be a scalar.
-//	l2: L2 regularization. Must be a scalar.
-//	grad: The gradient.
-//	indices: A vector of indices into the first dimension of var and accum.
-//
-// Returns the created operation.
-func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceSparseApplyProximalGradientDescent",
-		Input: []tf.Input{
-			var_, alpha, l1, l2, grad, indices,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Real-valued fast Fourier transform.
-//
-// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
-// over the inner-most dimension of `input`.
-//
-// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
-// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
-// followed by the `fft_length / 2` positive-frequency terms.
-//
-// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
-// corresponding dimension of `input`, the dimension is cropped. If it is larger,
-// the dimension is padded with zeros.
-//
-// Arguments:
-//	input: A float32 tensor.
-//	fft_length: An int32 tensor of shape [1]. The FFT length.
-//
-// Returns A complex64 tensor of the same rank as `input`. The inner-most
-//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
-//   frequency components of its 1D Fourier transform.
-//
-// @compatibility(numpy)
-// Equivalent to np.fft.rfft
-// @end_compatibility
-func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "RFFT",
-		Input: []tf.Input{
-			input, fft_length,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// QuantizedReluAttr is an optional argument to QuantizedRelu.
-type QuantizedReluAttr func(optionalAttr)
-
-// QuantizedReluOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_QUINT8
-func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Computes Quantized Rectified Linear: `max(features, 0)`
-//
-// Arguments:
-//
-//	min_features: The float value that the lowest quantized value represents.
-//	max_features: The float value that the highest quantized value represents.
-//
-// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
-func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "QuantizedRelu",
-		Input: []tf.Input{
-			features, min_features, max_features,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
-}
-
-// Reshapes a SparseTensor to represent values in a new dense shape.
-//
-// This operation has the same semantics as reshape on the represented dense
-// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
-//
-// If one component of `new_shape` is the special value -1, the size of that
-// dimension is computed so that the total dense size remains constant.  At
-// most one component of `new_shape` can be -1.  The number of dense elements
-// implied by `new_shape` must be the same as the number of dense elements
-// originally implied by `input_shape`.
-//
-// Reshaping does not affect the order of values in the SparseTensor.
-//
-// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
-// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
-// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
-// `output_shape` has length `R_out`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
-// SparseTensor.
-//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
-//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
-//
-// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
-// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
-// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
-// filled in.
-func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReshape",
-		Input: []tf.Input{
-			input_indices, input_shape, new_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Deprecated. Use TensorArraySplitV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
-func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorArraySplitV2",
-		Input: []tf.Input{
-			handle, value, lengths, flow_in,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Reorders a SparseTensor into the canonical, row-major ordering.
-//
-// Note that by convention, all sparse ops preserve the canonical ordering along
-// increasing dimension number. The only time ordering can be violated is during
-// manual manipulation of the indices and values vectors to add entries.
-//
-// Reordering does not affect the shape of the SparseTensor.
-//
-// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
-// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
-//
-// Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//
-// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
-// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
-func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseReorder",
-		Input: []tf.Input{
-			input_indices, input_values, input_shape,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1)
-}
-
-// Computes rectified linear: `max(features, 0)`.
-func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Relu",
-		Input: []tf.Input{
-			features,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
-type ResourceApplyAddSignAttr func(optionalAttr)
-
-// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var and m tensors is
-// protected by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the AddSign update.
-//
-// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
-// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
-// variable <- variable - lr_t * update
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	m: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	alpha: Must be a scalar.
-//	sign_decay: Must be a scalar.
-//	beta: Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyAddSign",
-		Input: []tf.Input{
-			var_, m, lr, alpha, sign_decay, beta, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// UpperBoundAttr is an optional argument to UpperBound.
-type UpperBoundAttr func(optionalAttr)
-
-// UpperBoundOutType sets the optional out_type attribute to value.
-// If not specified, defaults to DT_INT32
-func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
-	return func(m optionalAttr) {
-		m["out_type"] = value
-	}
-}
-
-// Applies upper_bound(sorted_search_values, values) along each row.
-//
-// Each set of rows with the same index in (sorted_inputs, values) is treated
-// independently.  The resulting row is the equivalent of calling
-// `np.searchsorted(sorted_inputs, values, side='right')`.
-//
-// The result is not a global index to the entire
-// `Tensor`, but rather just the index in the last dimension.
-//
-// A 2-D example:
-//   sorted_sequence = [[0, 3, 9, 9, 10],
-//                      [1, 2, 3, 4, 5]]
-//   values = [[2, 4, 9],
-//             [0, 2, 6]]
-//
-//   result = UpperBound(sorted_sequence, values)
-//
-//   result == [[1, 2, 4],
-//              [0, 2, 5]]
-//
-// Arguments:
-//	sorted_inputs: 2-D Tensor where each row is ordered.
-//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
-// the values that will be searched for in `sorted_search_values`.
-//
-// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
-// into the last dimension where values can be inserted without changing the
-// ordered property.
-func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "UpperBound",
-		Input: []tf.Input{
-			sorted_inputs, values,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
-type FractionalMaxPoolGradAttr func(optionalAttr)
-
-// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
-//
-// value: When set to True, it means when pooling, the values at the boundary
-// of adjacent pooling cells are used by both cells. For example:
-//
-// `index  0  1  2  3  4`
-//
-// `value  20 5  16 3  7`
-//
-// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
-// The result would be [20, 16] for fractional max pooling.
-// If not specified, defaults to false
-func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
-	return func(m optionalAttr) {
-		m["overlapping"] = value
-	}
-}
-
-// Computes gradient of the FractionalMaxPool function.
-//
-// Arguments:
-//	orig_input: Original input for `fractional_max_pool`
-//	orig_output: Original output for `fractional_max_pool`
-//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
-// w.r.t. the output of `fractional_max_pool`.
-//	row_pooling_sequence: row pooling sequence, form pooling region with
-// col_pooling_sequence.
-//	col_pooling_sequence: column pooling sequence, form pooling region with
-// row_pooling sequence.
-//
-// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
-func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"pattern": pattern}
 	opspec := tf.OpSpec{
-		Type: "FractionalMaxPoolGrad",
+		Type: "StaticRegexFullMatch",
 		Input: []tf.Input{
-			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+			input,
 		},
 		Attrs: attrs,
 	}
@@ -18677,34 +19001,36 @@ func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Ou
 	return op.Output(0)
 }
 
-// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
-type ResourceApplyAdagradDAAttr func(optionalAttr)
+// ResourceSparseApplyProximalGradientDescentAttr is an optional argument to ResourceSparseApplyProximalGradientDescent.
+type ResourceSparseApplyProximalGradientDescentAttr func(optionalAttr)
 
-// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+// ResourceSparseApplyProximalGradientDescentUseLocking sets the optional use_locking attribute to value.
 //
-// value: If True, updating of the var and accum tensors will be protected by
-// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// value: If True, the subtraction will be protected by a lock;
+// otherwise the behavior is undefined, but may exhibit less contention.
 // If not specified, defaults to false
-func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
+func ResourceSparseApplyProximalGradientDescentUseLocking(value bool) ResourceSparseApplyProximalGradientDescentAttr {
 	return func(m optionalAttr) {
 		m["use_locking"] = value
 	}
 }
 
-// Update '*var' according to the proximal adagrad scheme.
+// Sparse update '*var' as FOBOS algorithm with fixed learning rate.
+//
+// That is for rows we have grad for, we update var as follows:
+// prox_v = var - alpha * grad
+// var = sign(prox_v)/(1+alpha*l2) * max{|prox_v|-alpha*l1,0}
 //
 // Arguments:
 //	var_: Should be from a Variable().
-//	gradient_accumulator: Should be from a Variable().
-//	gradient_squared_accumulator: Should be from a Variable().
-//	grad: The gradient.
-//	lr: Scaling factor. Must be a scalar.
+//	alpha: Scaling factor. Must be a scalar.
 //	l1: L1 regularization. Must be a scalar.
 //	l2: L2 regularization. Must be a scalar.
-//	global_step: Training step number. Must be a scalar.
+//	grad: The gradient.
+//	indices: A vector of indices into the first dimension of var and accum.
 //
 // Returns the created operation.
-func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
+func ResourceSparseApplyProximalGradientDescent(scope *Scope, var_ tf.Output, alpha tf.Output, l1 tf.Output, l2 tf.Output, grad tf.Output, indices tf.Output, optional ...ResourceSparseApplyProximalGradientDescentAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18713,151 +19039,96 @@ func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator t
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ResourceApplyAdagradDA",
+		Type: "ResourceSparseApplyProximalGradientDescent",
 		Input: []tf.Input{
-			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+			var_, alpha, l1, l2, grad, indices,
 		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
-type SparseReduceMaxSparseAttr func(optionalAttr)
-
-// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
-//
-// value: If true, retain reduced dimensions with length 1.
-// If not specified, defaults to false
-func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
-	return func(m optionalAttr) {
-		m["keep_dims"] = value
-	}
-}
-
-// Computes the max of elements across dimensions of a SparseTensor.
+// Real-valued fast Fourier transform.
 //
-// This Op takes a SparseTensor and is the sparse counterpart to
-// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
-// SparseTensor.
+// Computes the 1-dimensional discrete Fourier transform of a real-valued signal
+// over the inner-most dimension of `input`.
 //
-// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
-// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
-// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
-// with length 1.
+// Since the DFT of a real signal is Hermitian-symmetric, `RFFT` only returns the
+// `fft_length / 2 + 1` unique components of the FFT: the zero-frequency term,
+// followed by the `fft_length / 2` positive-frequency terms.
 //
-// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
-// with a single element is returned.  Additionally, the axes can be negative,
-// which are interpreted according to the indexing rules in Python.
+// Along the axis `RFFT` is computed on, if `fft_length` is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
 //
 // Arguments:
-//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
-// SparseTensor, possibly not in canonical ordering.
-//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
-//	input_shape: 1-D.  Shape of the input SparseTensor.
-//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
-func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
+//	input: A float32 tensor.
+//	fft_length: An int32 tensor of shape [1]. The FFT length.
+//
+// Returns A complex64 tensor of the same rank as `input`. The inner-most
+//   dimension of `input` is replaced with the `fft_length / 2 + 1` unique
+//   frequency components of its 1D Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.fft.rfft
+// @end_compatibility
+func RFFT(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "SparseReduceMaxSparse",
+		Type: "RFFT",
 		Input: []tf.Input{
-			input_indices, input_values, input_shape, reduction_axes,
+			input, fft_length,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0), op.Output(1), op.Output(2)
+	return op.Output(0)
 }
 
-// Creates a dataset that emits the outputs of `input_dataset` `count` times.
-//
-// Arguments:
+// Adds a value to the current value of a variable.
 //
-//	count: A scalar representing the number of times that `input_dataset` should
-// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+// Any ReadVariableOp with a control dependency on this op is guaranteed to
+// see the incremented value or a subsequent newer one.
 //
+// Arguments:
+//	resource: handle to the resource in which to store the variable.
+//	value: the value by which the variable will be incremented.
 //
-func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Returns the created operation.
+func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "RepeatDataset",
+		Type: "AssignAddVariableOp",
 		Input: []tf.Input{
-			input_dataset, count,
+			resource, value,
 		},
-		Attrs: attrs,
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return scope.AddOperation(opspec)
 }
 
-// AddManySparseToTensorsMapAttr is an optional argument to AddManySparseToTensorsMap.
-type AddManySparseToTensorsMapAttr func(optionalAttr)
-
-// AddManySparseToTensorsMapContainer sets the optional container attribute to value.
-//
-// value: The container name for the `SparseTensorsMap` created by this op.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapContainer(value string) AddManySparseToTensorsMapAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
+// QuantizedReluAttr is an optional argument to QuantizedRelu.
+type QuantizedReluAttr func(optionalAttr)
 
-// AddManySparseToTensorsMapSharedName sets the optional shared_name attribute to value.
-//
-// value: The shared name for the `SparseTensorsMap` created by this op.
-// If blank, the new Operation's unique name is used.
-// If not specified, defaults to ""
-func AddManySparseToTensorsMapSharedName(value string) AddManySparseToTensorsMapAttr {
+// QuantizedReluOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_QUINT8
+func QuantizedReluOutType(value tf.DataType) QuantizedReluAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["out_type"] = value
 	}
 }
 
-// Add an `N`-minibatch `SparseTensor` to a `SparseTensorsMap`, return `N` handles.
-//
-// A `SparseTensor` of rank `R` is represented by three tensors: `sparse_indices`,
-// `sparse_values`, and `sparse_shape`, where
-//
-// ```sparse_indices.shape[1] == sparse_shape.shape[0] == R```
-//
-// An `N`-minibatch of `SparseTensor` objects is represented as a `SparseTensor`
-// having a first `sparse_indices` column taking values between `[0, N)`, where
-// the minibatch size `N == sparse_shape[0]`.
-//
-// The input `SparseTensor` must have rank `R` greater than 1, and the first
-// dimension is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The stored
-// `SparseTensor` objects pointed to by each row of the output `sparse_handles`
-// will have rank `R-1`.
-//
-// The `SparseTensor` values can then be read out as part of a minibatch by passing
-// the given keys as vector elements to `TakeManySparseFromTensorsMap`.  To ensure
-// the correct `SparseTensorsMap` is accessed, ensure that the same
-// `container` and `shared_name` are passed to that Op.  If no `shared_name`
-// is provided here, instead use the *name* of the Operation created by calling
-// `AddManySparseToTensorsMap` as the `shared_name` passed to
-// `TakeManySparseFromTensorsMap`.  Ensure the Operations are colocated.
+// Computes Quantized Rectified Linear: `max(features, 0)`
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-// `sparse_indices[:, 0]` must be ordered values in `[0, N)`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-// The minibatch size `N == sparse_shape[0]`.
 //
-// Returns 1-D.  The handles of the `SparseTensor` now stored in the
-// `SparseTensorsMap`.  Shape: `[N]`.
-func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...AddManySparseToTensorsMapAttr) (sparse_handles tf.Output) {
+//	min_features: The float value that the lowest quantized value represents.
+//	max_features: The float value that the highest quantized value represents.
+//
+// Returns Has the same output shape as "features".The float value that the lowest quantized value represents.The float value that the highest quantized value represents.
+func QuantizedRelu(scope *Scope, features tf.Output, min_features tf.Output, max_features tf.Output, optional ...QuantizedReluAttr) (activations tf.Output, min_activations tf.Output, max_activations tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -18866,93 +19137,168 @@ func AddManySparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_va
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "AddManySparseToTensorsMap",
+		Type: "QuantizedRelu",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			features, min_features, max_features,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Concatenates tensors along one dimension.
+// Reshapes a SparseTensor to represent values in a new dense shape.
+//
+// This operation has the same semantics as reshape on the represented dense
+// tensor.  The `input_indices` are recomputed based on the requested `new_shape`.
+//
+// If one component of `new_shape` is the special value -1, the size of that
+// dimension is computed so that the total dense size remains constant.  At
+// most one component of `new_shape` can be -1.  The number of dense elements
+// implied by `new_shape` must be the same as the number of dense elements
+// originally implied by `input_shape`.
+//
+// Reshaping does not affect the order of values in the SparseTensor.
+//
+// If the input tensor has rank `R_in` and `N` non-empty values, and `new_shape`
+// has length `R_out`, then `input_indices` has shape `[N, R_in]`,
+// `input_shape` has length `R_in`, `output_indices` has shape `[N, R_out]`, and
+// `output_shape` has length `R_out`.
 //
 // Arguments:
-//	values: List of `N` Tensors to concatenate. Their ranks and types must match,
-// and their sizes must match in all dimensions except `concat_dim`.
-//	axis: 0-D.  The dimension along which to concatenate.  Must be in the
-// range [-rank(values), rank(values)).
+//	input_indices: 2-D.  `N x R_in` matrix with the indices of non-empty values in a
+// SparseTensor.
+//	input_shape: 1-D.  `R_in` vector with the input SparseTensor's dense shape.
+//	new_shape: 1-D.  `R_out` vector with the requested new dense shape.
 //
-// Returns A `Tensor` with the concatenation of values stacked along the
-// `concat_dim` dimension.  This tensor's shape matches that of `values` except
-// in `concat_dim` where it has the sum of the sizes.
-func ConcatV2(scope *Scope, values []tf.Output, axis tf.Output) (output tf.Output) {
+// Returns 2-D.  `N x R_out` matrix with the updated indices of non-empty
+// values in the output SparseTensor.1-D.  `R_out` vector with the full dense shape of the output
+// SparseTensor.  This is the same as `new_shape` but with any -1 dimensions
+// filled in.
+func SparseReshape(scope *Scope, input_indices tf.Output, input_shape tf.Output, new_shape tf.Output) (output_indices tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ConcatV2",
+		Type: "SparseReshape",
 		Input: []tf.Input{
-			tf.OutputList(values), axis,
+			input_indices, input_shape, new_shape,
 		},
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1)
 }
 
-// Reads and outputs the entire contents of the input filename.
-func ReadFile(scope *Scope, filename tf.Output) (contents tf.Output) {
+// Deprecated. Use TensorArraySplitV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArraySplitV3
+func TensorArraySplitV2(scope *Scope, handle tf.Output, value tf.Output, lengths tf.Output, flow_in tf.Output) (flow_out tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ReadFile",
+		Type: "TensorArraySplitV2",
 		Input: []tf.Input{
-			filename,
+			handle, value, lengths, flow_in,
 		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Multiplies sparse updates into the variable referenced by `resource`.
+// Reorders a SparseTensor into the canonical, row-major ordering.
 //
-// This operation computes
+// Note that by convention, all sparse ops preserve the canonical ordering along
+// increasing dimension number. The only time ordering can be violated is during
+// manual manipulation of the indices and values vectors to add entries.
 //
-//     # Scalar indices
-//     ref[indices, ...] *= updates[...]
+// Reordering does not affect the shape of the SparseTensor.
 //
-//     # Vector indices (for each i)
-//     ref[indices[i], ...] *= updates[i, ...]
+// If the tensor has rank `R` and `N` non-empty values, `input_indices` has
+// shape `[N, R]`, input_values has length `N`, and input_shape has length `R`.
 //
-//     # High rank indices (for each i, ..., j)
-//     ref[indices[i, ..., j], ...] *= updates[i, ..., j, ...]
+// Arguments:
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
 //
-// Duplicate entries are handled correctly: if multiple `indices` reference
-// the same location, their contributions multiply.
+// Returns 2-D.  `N x R` matrix with the same indices as input_indices, but
+// in canonical row-major ordering.1-D.  `N` non-empty values corresponding to `output_indices`.
+func SparseReorder(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output) (output_indices tf.Output, output_values tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseReorder",
+		Input: []tf.Input{
+			input_indices, input_values, input_shape,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0), op.Output(1)
+}
+
+// Computes rectified linear: `max(features, 0)`.
+func Relu(scope *Scope, features tf.Output) (activations tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Relu",
+		Input: []tf.Input{
+			features,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// ResourceApplyAddSignAttr is an optional argument to ResourceApplyAddSign.
+type ResourceApplyAddSignAttr func(optionalAttr)
+
+// ResourceApplyAddSignUseLocking sets the optional use_locking attribute to value.
 //
-// Requires `updates.shape = indices.shape + ref.shape[1:]` or `updates.shape = []`.
+// value: If `True`, updating of the var and m tensors is
+// protected by a lock; otherwise the behavior is undefined, but may exhibit less
+// contention.
+// If not specified, defaults to false
+func ResourceApplyAddSignUseLocking(value bool) ResourceApplyAddSignAttr {
+	return func(m optionalAttr) {
+		m["use_locking"] = value
+	}
+}
+
+// Update '*var' according to the AddSign update.
 //
-// <div style="width:70%; margin:auto; margin-bottom:10px; margin-top:20px;">
-// <img style="width:100%" src='https://www.tensorflow.org/images/ScatterAdd.png' alt>
-// </div>
+// m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+// update <- (alpha + sign_decay * sign(g) *sign(m)) * g
+// variable <- variable - lr_t * update
 //
 // Arguments:
-//	resource: Should be from a `Variable` node.
-//	indices: A tensor of indices into the first dimension of `ref`.
-//	updates: A tensor of updated values to add to `ref`.
+//	var_: Should be from a Variable().
+//	m: Should be from a Variable().
+//	lr: Scaling factor. Must be a scalar.
+//	alpha: Must be a scalar.
+//	sign_decay: Must be a scalar.
+//	beta: Must be a scalar.
+//	grad: The gradient.
 //
 // Returns the created operation.
-func ResourceScatterMul(scope *Scope, resource tf.Output, indices tf.Output, updates tf.Output) (o *tf.Operation) {
+func ResourceApplyAddSign(scope *Scope, var_ tf.Output, m tf.Output, lr tf.Output, alpha tf.Output, sign_decay tf.Output, beta tf.Output, grad tf.Output, optional ...ResourceApplyAddSignAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
 	opspec := tf.OpSpec{
-		Type: "ResourceScatterMul",
+		Type: "ResourceApplyAddSign",
 		Input: []tf.Input{
-			resource, indices, updates,
+			var_, m, lr, alpha, sign_decay, beta, grad,
 		},
+		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
@@ -19042,118 +19388,198 @@ func FractionalAvgPoolGrad(scope *Scope, orig_input_tensor_shape tf.Output, out_
 	return op.Output(0)
 }
 
-// SerializeManySparseAttr is an optional argument to SerializeManySparse.
-type SerializeManySparseAttr func(optionalAttr)
-
-// SerializeManySparseOutType sets the optional out_type attribute to value.
+// Adds up a `SparseTensor` and a dense `Tensor`, producing a dense `Tensor`.
 //
-// value: The `dtype` to use for serialization; the supported types are `string`
-// (default) and `variant`.
-// If not specified, defaults to DT_STRING
-func SerializeManySparseOutType(value tf.DataType) SerializeManySparseAttr {
+// This Op does not require `a_indices` be sorted in standard lexicographic order.
+//
+// Arguments:
+//	a_indices: 2-D.  The `indices` of the `SparseTensor`, with shape `[nnz, ndims]`.
+//	a_values: 1-D.  The `values` of the `SparseTensor`, with shape `[nnz]`.
+//	a_shape: 1-D.  The `shape` of the `SparseTensor`, with shape `[ndims]`.
+//	b: `ndims`-D Tensor.  With shape `a_shape`.
+func SparseTensorDenseAdd(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SparseTensorDenseAdd",
+		Input: []tf.Input{
+			a_indices, a_values, a_shape, b,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// SparseToSparseSetOperationAttr is an optional argument to SparseToSparseSetOperation.
+type SparseToSparseSetOperationAttr func(optionalAttr)
+
+// SparseToSparseSetOperationValidateIndices sets the optional validate_indices attribute to value.
+// If not specified, defaults to true
+func SparseToSparseSetOperationValidateIndices(value bool) SparseToSparseSetOperationAttr {
 	return func(m optionalAttr) {
-		m["out_type"] = value
+		m["validate_indices"] = value
 	}
 }
 
-// Serialize an `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor` object.
+// Applies set operation along last dimension of 2 `SparseTensor` inputs.
 //
-// The `SparseTensor` must have rank `R` greater than 1, and the first dimension
-// is treated as the minibatch dimension.  Elements of the `SparseTensor`
-// must be sorted in increasing order of this first dimension.  The serialized
-// `SparseTensor` objects going into each row of `serialized_sparse` will have
-// rank `R-1`.
+// See SetOperationOp::SetOperationFromContext for values of `set_operation`.
 //
-// The minibatch size `N` is extracted from `sparse_shape[0]`.
+// If `validate_indices` is `True`, `SparseToSparseSetOperation` validates the
+// order and range of `set1` and `set2` indices.
+//
+// Input `set1` is a `SparseTensor` represented by `set1_indices`, `set1_values`,
+// and `set1_shape`. For `set1` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set2`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// Input `set2` is a `SparseTensor` represented by `set2_indices`, `set2_values`,
+// and `set2_shape`. For `set2` ranked `n`, 1st `n-1` dimensions must be the same
+// as `set1`. Dimension `n` contains values in a set, duplicates are allowed but
+// ignored.
+//
+// If `validate_indices` is `True`, this op validates the order and range of `set1`
+// and `set2` indices.
+//
+// Output `result` is a `SparseTensor` represented by `result_indices`,
+// `result_values`, and `result_shape`. For `set1` and `set2` ranked `n`, this
+// has rank `n` and the same 1st `n-1` dimensions as `set1` and `set2`. The `nth`
+// dimension contains the result of `set_operation` applied to the corresponding
+// `[0...n-1]` dimension of `set`.
 //
 // Arguments:
-//	sparse_indices: 2-D.  The `indices` of the minibatch `SparseTensor`.
-//	sparse_values: 1-D.  The `values` of the minibatch `SparseTensor`.
-//	sparse_shape: 1-D.  The `shape` of the minibatch `SparseTensor`.
-func SerializeManySparse(scope *Scope, sparse_indices tf.Output, sparse_values tf.Output, sparse_shape tf.Output, optional ...SerializeManySparseAttr) (serialized_sparse tf.Output) {
+//	set1_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set1_shape: 1D `Tensor`, shape of a `SparseTensor`. `set1_shape[0...n-1]` must
+// be the same as `set2_shape[0...n-1]`, `set1_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//	set2_indices: 2D `Tensor`, indices of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_values: 1D `Tensor`, values of a `SparseTensor`. Must be in row-major
+// order.
+//	set2_shape: 1D `Tensor`, shape of a `SparseTensor`. `set2_shape[0...n-1]` must
+// be the same as `set1_shape[0...n-1]`, `set2_shape[n]` is the
+// max set size across `0...n-1` dimensions.
+//
+//
+// Returns 2D indices of a `SparseTensor`.1D values of a `SparseTensor`.1D `Tensor` shape of a `SparseTensor`. `result_shape[0...n-1]` is
+// the same as the 1st `n-1` dimensions of `set1` and `set2`, `result_shape[n]`
+// is the max result set size across all `0...n-1` dimensions.
+func SparseToSparseSetOperation(scope *Scope, set1_indices tf.Output, set1_values tf.Output, set1_shape tf.Output, set2_indices tf.Output, set2_values tf.Output, set2_shape tf.Output, set_operation string, optional ...SparseToSparseSetOperationAttr) (result_indices tf.Output, result_values tf.Output, result_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{}
+	attrs := map[string]interface{}{"set_operation": set_operation}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "SerializeManySparse",
+		Type: "SparseToSparseSetOperation",
 		Input: []tf.Input{
-			sparse_indices, sparse_values, sparse_shape,
+			set1_indices, set1_values, set1_shape, set2_indices, set2_values, set2_shape,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	return op.Output(0)
+	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// Computes inverse hyperbolic cosine of x element-wise.
-func Acosh(scope *Scope, x tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Acosh",
-		Input: []tf.Input{
-			x,
-		},
+// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
+type MutableDenseHashTableV2Attr func(optionalAttr)
+
+// MutableDenseHashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
 	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
 }
 
-// TensorArrayV2Attr is an optional argument to TensorArrayV2.
-type TensorArrayV2Attr func(optionalAttr)
-
-// TensorArrayV2ElementShape sets the optional element_shape attribute to value.
-// If not specified, defaults to <unknown_rank:true >
-func TensorArrayV2ElementShape(value tf.Shape) TensorArrayV2Attr {
+// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["element_shape"] = value
+		m["shared_name"] = value
 	}
 }
 
-// TensorArrayV2DynamicSize sets the optional dynamic_size attribute to value.
+// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
 // If not specified, defaults to false
-func TensorArrayV2DynamicSize(value bool) TensorArrayV2Attr {
+func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["dynamic_size"] = value
+		m["use_node_name_sharing"] = value
 	}
 }
 
-// TensorArrayV2ClearAfterRead sets the optional clear_after_read attribute to value.
-// If not specified, defaults to true
-func TensorArrayV2ClearAfterRead(value bool) TensorArrayV2Attr {
+// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+//
+// value: The shape of each value.
+// If not specified, defaults to <>
+func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["clear_after_read"] = value
+		m["value_shape"] = value
 	}
 }
 
-// TensorArrayV2TensorArrayName sets the optional tensor_array_name attribute to value.
-// If not specified, defaults to ""
-func TensorArrayV2TensorArrayName(value string) TensorArrayV2Attr {
+// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+//
+// value: The initial number of hash table buckets. Must be a power
+// to 2.
+// If not specified, defaults to 131072
+func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+	return func(m optionalAttr) {
+		m["initial_num_buckets"] = value
+	}
+}
+
+// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+//
+// value: The maximum ratio between number of entries and number of
+// buckets before growing the table. Must be between 0 and 1.
+// If not specified, defaults to 0.8
+func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
 	return func(m optionalAttr) {
-		m["tensor_array_name"] = value
+		m["max_load_factor"] = value
 	}
 }
 
-// Deprecated. Use TensorArrayV3
+// Creates an empty hash table that uses tensors as the backing store.
 //
-// DEPRECATED at GraphDef version 26: Use TensorArrayV3
-func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...TensorArrayV2Attr) (handle tf.Output) {
+// It uses "open addressing" with quadratic reprobing to resolve
+// collisions.
+//
+// This op creates a mutable hash table, specifying the type of its keys and
+// values. Each value must be a scalar. Data can be inserted into the table using
+// the insert operations. It does not support the initialization operation.
+//
+// Arguments:
+//	empty_key: The key used to represent empty key buckets internally. Must not
+// be used in insert or lookup operations.
+//
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtype": dtype}
+	attrs := map[string]interface{}{"value_dtype": value_dtype}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "TensorArrayV2",
+		Type: "MutableDenseHashTableV2",
 		Input: []tf.Input{
-			size,
+			empty_key, deleted_key,
 		},
 		Attrs: attrs,
 	}
@@ -19161,64 +19587,46 @@ func TensorArrayV2(scope *Scope, size tf.Output, dtype tf.DataType, optional ...
 	return op.Output(0)
 }
 
-// DecodeCSVAttr is an optional argument to DecodeCSV.
-type DecodeCSVAttr func(optionalAttr)
+// UpperBoundAttr is an optional argument to UpperBound.
+type UpperBoundAttr func(optionalAttr)
 
-// DecodeCSVFieldDelim sets the optional field_delim attribute to value.
-//
-// value: char delimiter to separate fields in a record.
-// If not specified, defaults to ","
-func DecodeCSVFieldDelim(value string) DecodeCSVAttr {
+// UpperBoundOutType sets the optional out_type attribute to value.
+// If not specified, defaults to DT_INT32
+func UpperBoundOutType(value tf.DataType) UpperBoundAttr {
 	return func(m optionalAttr) {
-		m["field_delim"] = value
+		m["out_type"] = value
 	}
 }
 
-// DecodeCSVUseQuoteDelim sets the optional use_quote_delim attribute to value.
+// Applies upper_bound(sorted_search_values, values) along each row.
 //
-// value: If false, treats double quotation marks as regular
-// characters inside of the string fields (ignoring RFC 4180, Section 2,
-// Bullet 5).
-// If not specified, defaults to true
-func DecodeCSVUseQuoteDelim(value bool) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["use_quote_delim"] = value
-	}
-}
-
-// DecodeCSVNaValue sets the optional na_value attribute to value.
+// Each set of rows with the same index in (sorted_inputs, values) is treated
+// independently.  The resulting row is the equivalent of calling
+// `np.searchsorted(sorted_inputs, values, side='right')`.
 //
-// value: Additional string to recognize as NA/NaN.
-// If not specified, defaults to ""
-func DecodeCSVNaValue(value string) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["na_value"] = value
-	}
-}
-
-// DecodeCSVSelectCols sets the optional select_cols attribute to value.
-// If not specified, defaults to <>
-func DecodeCSVSelectCols(value []int64) DecodeCSVAttr {
-	return func(m optionalAttr) {
-		m["select_cols"] = value
-	}
-}
-
-// Convert CSV records to tensors. Each column maps to one tensor.
+// The result is not a global index to the entire
+// `Tensor`, but rather just the index in the last dimension.
 //
-// RFC 4180 format is expected for the CSV records.
-// (https://tools.ietf.org/html/rfc4180)
-// Note that we allow leading and trailing spaces with int or float field.
+// A 2-D example:
+//   sorted_sequence = [[0, 3, 9, 9, 10],
+//                      [1, 2, 3, 4, 5]]
+//   values = [[2, 4, 9],
+//             [0, 2, 6]]
+//
+//   result = UpperBound(sorted_sequence, values)
+//
+//   result == [[1, 2, 4],
+//              [0, 2, 5]]
 //
 // Arguments:
-//	records: Each string is a record/row in the csv and all records should have
-// the same format.
-//	record_defaults: One tensor per column of the input record, with either a
-// scalar default value for that column or an empty vector if the column is
-// required.
+//	sorted_inputs: 2-D Tensor where each row is ordered.
+//	values: 2-D Tensor with the same numbers of rows as `sorted_search_values`. Contains
+// the values that will be searched for in `sorted_search_values`.
 //
-// Returns Each tensor will have the same shape as records.
-func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, optional ...DecodeCSVAttr) (output []tf.Output) {
+// Returns A `Tensor` with the same shape as `values`.  It contains the last scalar index
+// into the last dimension where values can be inserted without changing the
+// ordered property.
+func UpperBound(scope *Scope, sorted_inputs tf.Output, values tf.Output, optional ...UpperBoundAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -19227,149 +19635,160 @@ func DecodeCSV(scope *Scope, records tf.Output, record_defaults []tf.Output, opt
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "DecodeCSV",
+		Type: "UpperBound",
 		Input: []tf.Input{
-			records, tf.OutputList(record_defaults),
+			sorted_inputs, values,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("DecodeCSV", err)
-		return
-	}
-	return output
+	return op.Output(0)
 }
 
-// MapClearAttr is an optional argument to MapClear.
-type MapClearAttr func(optionalAttr)
+// FractionalMaxPoolGradAttr is an optional argument to FractionalMaxPoolGrad.
+type FractionalMaxPoolGradAttr func(optionalAttr)
 
-// MapClearCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// FractionalMaxPoolGradOverlapping sets the optional overlapping attribute to value.
 //
-// REQUIRES: value >= 0
-func MapClearCapacity(value int64) MapClearAttr {
+// value: When set to True, it means when pooling, the values at the boundary
+// of adjacent pooling cells are used by both cells. For example:
+//
+// `index  0  1  2  3  4`
+//
+// `value  20 5  16 3  7`
+//
+// If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used twice.
+// The result would be [20, 16] for fractional max pooling.
+// If not specified, defaults to false
+func FractionalMaxPoolGradOverlapping(value bool) FractionalMaxPoolGradAttr {
 	return func(m optionalAttr) {
-		m["capacity"] = value
+		m["overlapping"] = value
 	}
 }
 
-// MapClearMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes gradient of the FractionalMaxPool function.
 //
-// REQUIRES: value >= 0
-func MapClearMemoryLimit(value int64) MapClearAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
+// Arguments:
+//	orig_input: Original input for `fractional_max_pool`
+//	orig_output: Original output for `fractional_max_pool`
+//	out_backprop: 4-D with shape `[batch, height, width, channels]`.  Gradients
+// w.r.t. the output of `fractional_max_pool`.
+//	row_pooling_sequence: row pooling sequence, form pooling region with
+// col_pooling_sequence.
+//	col_pooling_sequence: column pooling sequence, form pooling region with
+// row_pooling sequence.
+//
+// Returns 4-D.  Gradients w.r.t. the input of `fractional_max_pool`.
+func FractionalMaxPoolGrad(scope *Scope, orig_input tf.Output, orig_output tf.Output, out_backprop tf.Output, row_pooling_sequence tf.Output, col_pooling_sequence tf.Output, optional ...FractionalMaxPoolGradAttr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// MapClearContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapClearContainer(value string) MapClearAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "FractionalMaxPoolGrad",
+		Input: []tf.Input{
+			orig_input, orig_output, out_backprop, row_pooling_sequence, col_pooling_sequence,
+		},
+		Attrs: attrs,
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MapClearSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapClearSharedName(value string) MapClearAttr {
+// ResourceApplyAdagradDAAttr is an optional argument to ResourceApplyAdagradDA.
+type ResourceApplyAdagradDAAttr func(optionalAttr)
+
+// ResourceApplyAdagradDAUseLocking sets the optional use_locking attribute to value.
+//
+// value: If True, updating of the var and accum tensors will be protected by
+// a lock; otherwise the behavior is undefined, but may exhibit less contention.
+// If not specified, defaults to false
+func ResourceApplyAdagradDAUseLocking(value bool) ResourceApplyAdagradDAAttr {
 	return func(m optionalAttr) {
-		m["shared_name"] = value
+		m["use_locking"] = value
 	}
 }
 
-// Op removes all elements in the underlying container.
+// Update '*var' according to the proximal adagrad scheme.
+//
+// Arguments:
+//	var_: Should be from a Variable().
+//	gradient_accumulator: Should be from a Variable().
+//	gradient_squared_accumulator: Should be from a Variable().
+//	grad: The gradient.
+//	lr: Scaling factor. Must be a scalar.
+//	l1: L1 regularization. Must be a scalar.
+//	l2: L2 regularization. Must be a scalar.
+//	global_step: Training step number. Must be a scalar.
 //
 // Returns the created operation.
-func MapClear(scope *Scope, dtypes []tf.DataType, optional ...MapClearAttr) (o *tf.Operation) {
+func ResourceApplyAdagradDA(scope *Scope, var_ tf.Output, gradient_accumulator tf.Output, gradient_squared_accumulator tf.Output, grad tf.Output, lr tf.Output, l1 tf.Output, l2 tf.Output, global_step tf.Output, optional ...ResourceApplyAdagradDAAttr) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MapClear",
-
+		Type: "ResourceApplyAdagradDA",
+		Input: []tf.Input{
+			var_, gradient_accumulator, gradient_squared_accumulator, grad, lr, l1, l2, global_step,
+		},
 		Attrs: attrs,
 	}
 	return scope.AddOperation(opspec)
 }
 
-// ThreadUnsafeUnigramCandidateSamplerAttr is an optional argument to ThreadUnsafeUnigramCandidateSampler.
-type ThreadUnsafeUnigramCandidateSamplerAttr func(optionalAttr)
-
-// ThreadUnsafeUnigramCandidateSamplerSeed sets the optional seed attribute to value.
-//
-// value: If either seed or seed2 are set to be non-zero, the random number
-// generator is seeded by the given seed.  Otherwise, it is seeded by a
-// random seed.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
-	return func(m optionalAttr) {
-		m["seed"] = value
-	}
-}
+// SparseReduceMaxSparseAttr is an optional argument to SparseReduceMaxSparse.
+type SparseReduceMaxSparseAttr func(optionalAttr)
 
-// ThreadUnsafeUnigramCandidateSamplerSeed2 sets the optional seed2 attribute to value.
+// SparseReduceMaxSparseKeepDims sets the optional keep_dims attribute to value.
 //
-// value: An second seed to avoid seed collision.
-// If not specified, defaults to 0
-func ThreadUnsafeUnigramCandidateSamplerSeed2(value int64) ThreadUnsafeUnigramCandidateSamplerAttr {
+// value: If true, retain reduced dimensions with length 1.
+// If not specified, defaults to false
+func SparseReduceMaxSparseKeepDims(value bool) SparseReduceMaxSparseAttr {
 	return func(m optionalAttr) {
-		m["seed2"] = value
+		m["keep_dims"] = value
 	}
 }
 
-// Generates labels for candidate sampling with a learned unigram distribution.
+// Computes the max of elements across dimensions of a SparseTensor.
 //
-// See explanations of candidate sampling and the data formats at
-// go/candidate-sampling.
+// This Op takes a SparseTensor and is the sparse counterpart to
+// `tf.reduce_max()`.  In contrast to SparseReduceMax, this Op returns a
+// SparseTensor.
 //
-// For each batch, this op picks a single set of sampled candidate labels.
+// Reduces `sp_input` along the dimensions given in `reduction_axes`.  Unless
+// `keep_dims` is true, the rank of the tensor is reduced by 1 for each entry in
+// `reduction_axes`. If `keep_dims` is true, the reduced dimensions are retained
+// with length 1.
 //
-// The advantages of sampling candidates per-batch are simplicity and the
-// possibility of efficient dense matrix multiplication. The disadvantage is that
-// the sampled candidates must be chosen independently of the context and of the
-// true labels.
+// If `reduction_axes` has no entries, all dimensions are reduced, and a tensor
+// with a single element is returned.  Additionally, the axes can be negative,
+// which are interpreted according to the indexing rules in Python.
 //
 // Arguments:
-//	true_classes: A batch_size * num_true matrix, in which each row contains the
-// IDs of the num_true target_classes in the corresponding original label.
-//	num_true: Number of true labels per context.
-//	num_sampled: Number of candidates to randomly sample.
-//	unique: If unique is true, we sample with rejection, so that all sampled
-// candidates in a batch are unique. This requires some approximation to
-// estimate the post-rejection sampling probabilities.
-//	range_max: The sampler will sample integers from the interval [0, range_max).
-//
-// Returns A vector of length num_sampled, in which each element is
-// the ID of a sampled candidate.A batch_size * num_true matrix, representing
-// the number of times each candidate is expected to occur in a batch
-// of sampled candidates. If unique=true, then this is a probability.A vector of length num_sampled, for each sampled
-// candidate representing the number of times the candidate is expected
-// to occur in a batch of sampled candidates.  If unique=true, then this is a
-// probability.
-func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, num_true int64, num_sampled int64, unique bool, range_max int64, optional ...ThreadUnsafeUnigramCandidateSamplerAttr) (sampled_candidates tf.Output, true_expected_count tf.Output, sampled_expected_count tf.Output) {
+//	input_indices: 2-D.  `N x R` matrix with the indices of non-empty values in a
+// SparseTensor, possibly not in canonical ordering.
+//	input_values: 1-D.  `N` non-empty values corresponding to `input_indices`.
+//	input_shape: 1-D.  Shape of the input SparseTensor.
+//	reduction_axes: 1-D.  Length-`K` vector containing the reduction axes.
+func SparseReduceMaxSparse(scope *Scope, input_indices tf.Output, input_values tf.Output, input_shape tf.Output, reduction_axes tf.Output, optional ...SparseReduceMaxSparseAttr) (output_indices tf.Output, output_values tf.Output, output_shape tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"num_true": num_true, "num_sampled": num_sampled, "unique": unique, "range_max": range_max}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "ThreadUnsafeUnigramCandidateSampler",
+		Type: "SparseReduceMaxSparse",
 		Input: []tf.Input{
-			true_classes,
+			input_indices, input_values, input_shape, reduction_axes,
 		},
 		Attrs: attrs,
 	}
@@ -19377,45 +19796,23 @@ func ThreadUnsafeUnigramCandidateSampler(scope *Scope, true_classes tf.Output, n
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
-// MaxPoolV2Attr is an optional argument to MaxPoolV2.
-type MaxPoolV2Attr func(optionalAttr)
-
-// MaxPoolV2DataFormat sets the optional data_format attribute to value.
-//
-// value: Specify the data format of the input and output data. With the
-// default format "NHWC", the data is stored in the order of:
-//     [batch, in_height, in_width, in_channels].
-// Alternatively, the format could be "NCHW", the data storage order of:
-//     [batch, in_channels, in_height, in_width].
-// If not specified, defaults to "NHWC"
-func MaxPoolV2DataFormat(value string) MaxPoolV2Attr {
-	return func(m optionalAttr) {
-		m["data_format"] = value
-	}
-}
-
-// Performs max pooling on the input.
+// Creates a dataset that emits the outputs of `input_dataset` `count` times.
 //
 // Arguments:
-//	input: 4-D input to pool over.
-//	ksize: The size of the window for each dimension of the input tensor.
-//	strides: The stride of the sliding window for each dimension of the
-// input tensor.
-//	padding: The type of padding algorithm to use.
 //
-// Returns The max pooled output tensor.
-func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output, padding string, optional ...MaxPoolV2Attr) (output tf.Output) {
+//	count: A scalar representing the number of times that `input_dataset` should
+// be repeated. A value of `-1` indicates that it should be repeated infinitely.
+//
+//
+func RepeatDataset(scope *Scope, input_dataset tf.Output, count tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"padding": padding}
-	for _, a := range optional {
-		a(attrs)
-	}
+	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "MaxPoolV2",
+		Type: "RepeatDataset",
 		Input: []tf.Input{
-			input, ksize, strides,
+			input_dataset, count,
 		},
 		Attrs: attrs,
 	}
@@ -19423,99 +19820,112 @@ func MaxPoolV2(scope *Scope, input tf.Output, ksize tf.Output, strides tf.Output
 	return op.Output(0)
 }
 
-// MutableDenseHashTableV2Attr is an optional argument to MutableDenseHashTableV2.
-type MutableDenseHashTableV2Attr func(optionalAttr)
-
-// MutableDenseHashTableV2Container sets the optional container attribute to value.
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2Container(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func ReciprocalGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReciprocalGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2SharedName sets the optional shared_name attribute to value.
+// Returns the min of x and y (i.e. x < y ? x : y) element-wise.
 //
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func MutableDenseHashTableV2SharedName(value string) MutableDenseHashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+// *NOTE*: `Minimum` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Minimum(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Minimum",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// MutableDenseHashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-// If not specified, defaults to false
-func MutableDenseHashTableV2UseNodeNameSharing(value bool) MutableDenseHashTableV2Attr {
+// MfccAttr is an optional argument to Mfcc.
+type MfccAttr func(optionalAttr)
+
+// MfccUpperFrequencyLimit sets the optional upper_frequency_limit attribute to value.
+//
+// value: The highest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 4000
+func MfccUpperFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
+		m["upper_frequency_limit"] = value
 	}
 }
 
-// MutableDenseHashTableV2ValueShape sets the optional value_shape attribute to value.
+// MfccLowerFrequencyLimit sets the optional lower_frequency_limit attribute to value.
 //
-// value: The shape of each value.
-// If not specified, defaults to <>
-func MutableDenseHashTableV2ValueShape(value tf.Shape) MutableDenseHashTableV2Attr {
+// value: The lowest frequency to use when calculating the
+// ceptstrum.
+// If not specified, defaults to 20
+func MfccLowerFrequencyLimit(value float32) MfccAttr {
 	return func(m optionalAttr) {
-		m["value_shape"] = value
+		m["lower_frequency_limit"] = value
 	}
 }
 
-// MutableDenseHashTableV2InitialNumBuckets sets the optional initial_num_buckets attribute to value.
+// MfccFilterbankChannelCount sets the optional filterbank_channel_count attribute to value.
 //
-// value: The initial number of hash table buckets. Must be a power
-// to 2.
-// If not specified, defaults to 131072
-func MutableDenseHashTableV2InitialNumBuckets(value int64) MutableDenseHashTableV2Attr {
+// value: Resolution of the Mel bank used internally.
+// If not specified, defaults to 40
+func MfccFilterbankChannelCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["initial_num_buckets"] = value
+		m["filterbank_channel_count"] = value
 	}
 }
 
-// MutableDenseHashTableV2MaxLoadFactor sets the optional max_load_factor attribute to value.
+// MfccDctCoefficientCount sets the optional dct_coefficient_count attribute to value.
 //
-// value: The maximum ratio between number of entries and number of
-// buckets before growing the table. Must be between 0 and 1.
-// If not specified, defaults to 0.8
-func MutableDenseHashTableV2MaxLoadFactor(value float32) MutableDenseHashTableV2Attr {
+// value: How many output channels to produce per time slice.
+// If not specified, defaults to 13
+func MfccDctCoefficientCount(value int64) MfccAttr {
 	return func(m optionalAttr) {
-		m["max_load_factor"] = value
+		m["dct_coefficient_count"] = value
 	}
 }
 
-// Creates an empty hash table that uses tensors as the backing store.
-//
-// It uses "open addressing" with quadratic reprobing to resolve
-// collisions.
+// Transforms a spectrogram into a form that's useful for speech recognition.
 //
-// This op creates a mutable hash table, specifying the type of its keys and
-// values. Each value must be a scalar. Data can be inserted into the table using
-// the insert operations. It does not support the initialization operation.
+// Mel Frequency Cepstral Coefficients are a way of representing audio data that's
+// been effective as an input feature for machine learning. They are created by
+// taking the spectrum of a spectrogram (a 'cepstrum'), and discarding some of the
+// higher frequencies that are less significant to the human ear. They have a long
+// history in the speech recognition world, and https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+// is a good resource to learn more.
 //
 // Arguments:
-//	empty_key: The key used to represent empty key buckets internally. Must not
-// be used in insert or lookup operations.
-//
-//	value_dtype: Type of the table values.
-//
-// Returns Handle to a table.
-func MutableDenseHashTableV2(scope *Scope, empty_key tf.Output, deleted_key tf.Output, value_dtype tf.DataType, optional ...MutableDenseHashTableV2Attr) (table_handle tf.Output) {
+//	spectrogram: Typically produced by the Spectrogram op, with magnitude_squared
+// set to true.
+//	sample_rate: How many samples per second the source audio used.
+func Mfcc(scope *Scope, spectrogram tf.Output, sample_rate tf.Output, optional ...MfccAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"value_dtype": value_dtype}
+	attrs := map[string]interface{}{}
 	for _, a := range optional {
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "MutableDenseHashTableV2",
+		Type: "Mfcc",
 		Input: []tf.Input{
-			empty_key, deleted_key,
+			spectrogram, sample_rate,
 		},
 		Attrs: attrs,
 	}
@@ -19627,63 +20037,6 @@ func IFFT2D(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// ResourceApplyRMSPropAttr is an optional argument to ResourceApplyRMSProp.
-type ResourceApplyRMSPropAttr func(optionalAttr)
-
-// ResourceApplyRMSPropUseLocking sets the optional use_locking attribute to value.
-//
-// value: If `True`, updating of the var, ms, and mom tensors is protected
-// by a lock; otherwise the behavior is undefined, but may exhibit less
-// contention.
-// If not specified, defaults to false
-func ResourceApplyRMSPropUseLocking(value bool) ResourceApplyRMSPropAttr {
-	return func(m optionalAttr) {
-		m["use_locking"] = value
-	}
-}
-
-// Update '*var' according to the RMSProp algorithm.
-//
-// Note that in dense implementation of this algorithm, ms and mom will
-// update even if the grad is zero, but in this sparse implementation, ms
-// and mom will not update in iterations during which the grad is zero.
-//
-// mean_square = decay * mean_square + (1-decay) * gradient ** 2
-// Delta = learning_rate * gradient / sqrt(mean_square + epsilon)
-//
-// ms <- rho * ms_{t-1} + (1-rho) * grad * grad
-// mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon)
-// var <- var - mom
-//
-// Arguments:
-//	var_: Should be from a Variable().
-//	ms: Should be from a Variable().
-//	mom: Should be from a Variable().
-//	lr: Scaling factor. Must be a scalar.
-//	rho: Decay rate. Must be a scalar.
-//
-//	epsilon: Ridge term. Must be a scalar.
-//	grad: The gradient.
-//
-// Returns the created operation.
-func ResourceApplyRMSProp(scope *Scope, var_ tf.Output, ms tf.Output, mom tf.Output, lr tf.Output, rho tf.Output, momentum tf.Output, epsilon tf.Output, grad tf.Output, optional ...ResourceApplyRMSPropAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "ResourceApplyRMSProp",
-		Input: []tf.Input{
-			var_, ms, mom, lr, rho, momentum, epsilon, grad,
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
 // Returns element-wise remainder of division. This emulates C semantics in that
 //
 // the result here is consistent with a truncating divide. E.g. `truncate(x / y) *
@@ -19862,58 +20215,61 @@ func DecodeJpeg(scope *Scope, contents tf.Output, optional ...DecodeJpegAttr) (i
 	return op.Output(0)
 }
 
-// StageSizeAttr is an optional argument to StageSize.
-type StageSizeAttr func(optionalAttr)
-
-// StageSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
+// Inverse 3D real-valued fast Fourier transform.
 //
-// REQUIRES: value >= 0
-func StageSizeCapacity(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// StageSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
+// Computes the inverse 3-dimensional discrete Fourier transform of a real-valued
+// signal over the inner-most 3 dimensions of `input`.
 //
-// REQUIRES: value >= 0
-func StageSizeMemoryLimit(value int64) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// StageSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StageSizeContainer(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
+// The inner-most 3 dimensions of `input` are assumed to be the result of `RFFT3D`:
+// The inner-most dimension contains the `fft_length / 2 + 1` unique components of
+// the DFT of a real-valued signal. If `fft_length` is not provided, it is computed
+// from the size of the inner-most 3 dimensions of `input`. If the FFT length used
+// to compute `input` is odd, it should be provided since it cannot be inferred
+// properly.
+//
+// Along each axis `IRFFT3D` is computed on, if `fft_length` (or
+// `fft_length / 2 + 1` for the inner-most dimension) is smaller than the
+// corresponding dimension of `input`, the dimension is cropped. If it is larger,
+// the dimension is padded with zeros.
+//
+// Arguments:
+//	input: A complex64 tensor.
+//	fft_length: An int32 tensor of shape [3]. The FFT length for each dimension.
+//
+// Returns A float32 tensor of the same rank as `input`. The inner-most 3
+//   dimensions of `input` are replaced with the `fft_length` samples of their
+//   inverse 3D real Fourier transform.
+//
+// @compatibility(numpy)
+// Equivalent to np.irfftn with 3 dimensions.
+// @end_compatibility
+func IRFFT3D(scope *Scope, input tf.Output, fft_length tf.Output) (output tf.Output) {
+	if scope.Err() != nil {
+		return
 	}
-}
-
-// StageSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StageSizeSharedName(value string) StageSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
+	opspec := tf.OpSpec{
+		Type: "IRFFT3D",
+		Input: []tf.Input{
+			input, fft_length,
+		},
 	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
 }
 
-// Op returns the number of elements in the underlying container.
-func StageSize(scope *Scope, dtypes []tf.DataType, optional ...StageSizeAttr) (size tf.Output) {
+// Returns the truth value of (x != y) element-wise.
+//
+// *NOTE*: `NotEqual` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func NotEqual(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
 	opspec := tf.OpSpec{
-		Type: "StageSize",
-
-		Attrs: attrs,
+		Type: "NotEqual",
+		Input: []tf.Input{
+			x, y,
+		},
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
@@ -20760,67 +21116,6 @@ func AddSparseToTensorsMap(scope *Scope, sparse_indices tf.Output, sparse_values
 	return op.Output(0)
 }
 
-// SparseTensorDenseMatMulAttr is an optional argument to SparseTensorDenseMatMul.
-type SparseTensorDenseMatMulAttr func(optionalAttr)
-
-// SparseTensorDenseMatMulAdjointA sets the optional adjoint_a attribute to value.
-//
-// value: Use the adjoint of A in the matrix multiply.  If A is complex, this
-// is transpose(conj(A)).  Otherwise it's transpose(A).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointA(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_a"] = value
-	}
-}
-
-// SparseTensorDenseMatMulAdjointB sets the optional adjoint_b attribute to value.
-//
-// value: Use the adjoint of B in the matrix multiply.  If B is complex, this
-// is transpose(conj(B)).  Otherwise it's transpose(B).
-// If not specified, defaults to false
-func SparseTensorDenseMatMulAdjointB(value bool) SparseTensorDenseMatMulAttr {
-	return func(m optionalAttr) {
-		m["adjoint_b"] = value
-	}
-}
-
-// Multiply SparseTensor (of rank 2) "A" by dense matrix "B".
-//
-// No validity checking is performed on the indices of A.  However, the following
-// input format is recommended for optimal behavior:
-//
-// if adjoint_a == false:
-//   A should be sorted in lexicographically increasing order.  Use SparseReorder
-//   if you're not sure.
-// if adjoint_a == true:
-//   A should be sorted in order of increasing dimension 1 (i.e., "column major"
-//   order instead of "row major" order).
-//
-// Arguments:
-//	a_indices: 2-D.  The `indices` of the `SparseTensor`, size `[nnz, 2]` Matrix.
-//	a_values: 1-D.  The `values` of the `SparseTensor`, size `[nnz]` Vector.
-//	a_shape: 1-D.  The `shape` of the `SparseTensor`, size `[2]` Vector.
-//	b: 2-D.  A dense Matrix.
-func SparseTensorDenseMatMul(scope *Scope, a_indices tf.Output, a_values tf.Output, a_shape tf.Output, b tf.Output, optional ...SparseTensorDenseMatMulAttr) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseTensorDenseMatMul",
-		Input: []tf.Input{
-			a_indices, a_values, a_shape, b,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Deserialize and concatenate `SparseTensors` from a serialized minibatch.
 //
 // The input `serialized_sparse` must be a string matrix of shape `[N x 3]` where
@@ -20903,151 +21198,6 @@ func BitwiseAnd(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
 	return op.Output(0)
 }
 
-// SubstrAttr is an optional argument to Substr.
-type SubstrAttr func(optionalAttr)
-
-// SubstrUnit sets the optional unit attribute to value.
-//
-// value: The unit that is used to create the substring.  One of: `"BYTE"` (for
-// defining position and length by bytes) or `"UTF8_CHAR"` (for the UTF-8
-// encoded Unicode code points).  The default is `"BYTE"`. Results are undefined if
-// `unit=UTF8_CHAR` and the `input` strings do not contain structurally valid
-// UTF-8.
-// If not specified, defaults to "BYTE"
-func SubstrUnit(value string) SubstrAttr {
-	return func(m optionalAttr) {
-		m["unit"] = value
-	}
-}
-
-// Return substrings from `Tensor` of strings.
-//
-// For each string in the input `Tensor`, creates a substring starting at index
-// `pos` with a total length of `len`.
-//
-// If `len` defines a substring that would extend beyond the length of the input
-// string, then as many characters as possible are used.
-//
-// A negative `pos` indicates distance within the string backwards from the end.
-//
-// If `pos` specifies an index which is out of range for any of the input strings,
-// then an `InvalidArgumentError` is thrown.
-//
-// `pos` and `len` must have the same shape, otherwise a `ValueError` is thrown on
-// Op creation.
-//
-// *NOTE*: `Substr` supports broadcasting up to two dimensions. More about
-// broadcasting
-// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
-//
-// ---
-//
-// Examples
-//
-// Using scalar `pos` and `len`:
-//
-// ```python
-// input = [b'Hello', b'World']
-// position = 1
-// length = 3
-//
-// output = [b'ell', b'orl']
-// ```
-//
-// Using `pos` and `len` with same shape as `input`:
-//
-// ```python
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen']]
-// position = [[1, 2, 3],
-//             [1, 2, 3],
-//             [1, 2, 3]]
-// length =   [[2, 3, 4],
-//             [4, 3, 2],
-//             [5, 5, 5]]
-//
-// output = [[b'en', b'eve', b'lve'],
-//           [b'hirt', b'urt', b'te'],
-//           [b'ixtee', b'vente', b'hteen']]
-// ```
-//
-// Broadcasting `pos` and `len` onto `input`:
-//
-// ```
-// input = [[b'ten', b'eleven', b'twelve'],
-//          [b'thirteen', b'fourteen', b'fifteen'],
-//          [b'sixteen', b'seventeen', b'eighteen'],
-//          [b'nineteen', b'twenty', b'twentyone']]
-// position = [1, 2, 3]
-// length =   [1, 2, 3]
-//
-// output = [[b'e', b'ev', b'lve'],
-//           [b'h', b'ur', b'tee'],
-//           [b'i', b've', b'hte'],
-//           [b'i', b'en', b'nty']]
-// ```
-//
-// Broadcasting `input` onto `pos` and `len`:
-//
-// ```
-// input = b'thirteen'
-// position = [1, 5, 7]
-// length =   [3, 2, 1]
-//
-// output = [b'hir', b'ee', b'n']
-// ```
-//
-// Arguments:
-//	input: Tensor of strings
-//	pos: Scalar defining the position of first character in each substring
-//	len: Scalar defining the number of characters to include in each substring
-//
-// Returns Tensor of substrings
-func Substr(scope *Scope, input tf.Output, pos tf.Output, len tf.Output, optional ...SubstrAttr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "Substr",
-		Input: []tf.Input{
-			input, pos, len,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Creates a Dataset that returns pseudorandom numbers.
-//
-// Arguments:
-//	seed: A scalar seed for the random number generator. If either seed or
-// seed2 is set to be non-zero, the random number generator is seeded
-// by the given seed.  Otherwise, a random seed is used.
-//	seed2: A second scalar seed to avoid seed collision.
-//
-//
-func RandomDataset(scope *Scope, seed tf.Output, seed2 tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "RandomDataset",
-		Input: []tf.Input{
-			seed, seed2,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Inverse real-valued fast Fourier transform.
 //
 // Computes the inverse 1-dimensional discrete Fourier transform of a real-valued
@@ -21469,6 +21619,142 @@ func SparseSparseMinimum(scope *Scope, a_indices tf.Output, a_values tf.Output,
 	return op.Output(0), op.Output(1)
 }
 
+// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
+type MapUnstageNoKeyAttr func(optionalAttr)
+
+// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// MapUnstageNoKeyContainer sets the optional container attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
+// If not specified, defaults to ""
+func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Op removes and returns a random (key, value)
+//
+// from the underlying container.   If the underlying container
+// does not contain elements, the op will block until it does.
+func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "MapUnstageNoKey",
+		Input: []tf.Input{
+			indices,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	if scope.Err() != nil {
+		return
+	}
+	var idx int
+	var err error
+	key = op.Output(idx)
+	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
+		scope.UpdateErr("MapUnstageNoKey", err)
+		return
+	}
+	return key, values
+}
+
+// HashTableV2Attr is an optional argument to HashTableV2.
+type HashTableV2Attr func(optionalAttr)
+
+// HashTableV2Container sets the optional container attribute to value.
+//
+// value: If non-empty, this table is placed in the given container.
+// Otherwise, a default container is used.
+// If not specified, defaults to ""
+func HashTableV2Container(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// HashTableV2SharedName sets the optional shared_name attribute to value.
+//
+// value: If non-empty, this table is shared under the given name across
+// multiple sessions.
+// If not specified, defaults to ""
+func HashTableV2SharedName(value string) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
+//
+// value: If true and shared_name is empty, the table is shared
+// using the node name.
+// If not specified, defaults to false
+func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
+	return func(m optionalAttr) {
+		m["use_node_name_sharing"] = value
+	}
+}
+
+// Creates a non-initialized hash table.
+//
+// This op creates a hash table, specifying the type of its keys and values.
+// Before using the table you will have to initialize it.  After initialization the
+// table will be immutable.
+//
+// Arguments:
+//	key_dtype: Type of the table keys.
+//	value_dtype: Type of the table values.
+//
+// Returns Handle to a table.
+func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "HashTableV2",
+
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // TakeManySparseFromTensorsMapAttr is an optional argument to TakeManySparseFromTensorsMap.
 type TakeManySparseFromTensorsMapAttr func(optionalAttr)
 
@@ -22230,6 +22516,34 @@ func MatrixTriangularSolve(scope *Scope, matrix tf.Output, rhs tf.Output, option
 	return op.Output(0)
 }
 
+// Saves tensors in V2 checkpoint format.
+//
+// By default, saves the named tensors in full.  If the caller wishes to save
+// specific slices of full tensors, "shape_and_slices" should be non-empty strings
+// and correspondingly well-formed.
+//
+// Arguments:
+//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
+// write the tensors.
+//	tensor_names: shape {N}. The names of the tensors to be saved.
+//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
+// Empty strings indicate that they are non-partitioned tensors.
+//	tensors: `N` tensors to save.
+//
+// Returns the created operation.
+func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "SaveV2",
+		Input: []tf.Input{
+			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // UnicodeTranscodeAttr is an optional argument to UnicodeTranscode.
 type UnicodeTranscodeAttr func(optionalAttr)
 
@@ -22640,6 +22954,37 @@ func QuantizeDownAndShrinkRange(scope *Scope, input tf.Output, input_min tf.Outp
 	return op.Output(0), op.Output(1), op.Output(2)
 }
 
+// ApproximateEqualAttr is an optional argument to ApproximateEqual.
+type ApproximateEqualAttr func(optionalAttr)
+
+// ApproximateEqualTolerance sets the optional tolerance attribute to value.
+// If not specified, defaults to 1e-05
+func ApproximateEqualTolerance(value float32) ApproximateEqualAttr {
+	return func(m optionalAttr) {
+		m["tolerance"] = value
+	}
+}
+
+// Returns the truth value of abs(x-y) < tolerance element-wise.
+func ApproximateEqual(scope *Scope, x tf.Output, y tf.Output, optional ...ApproximateEqualAttr) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "ApproximateEqual",
+		Input: []tf.Input{
+			x, y,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Returns the truth value of x OR y element-wise.
 //
 // *NOTE*: `LogicalOr` supports broadcasting. More about broadcasting
@@ -25011,6 +25356,75 @@ func Cumsum(scope *Scope, x tf.Output, axis tf.Output, optional ...CumsumAttr) (
 	return op.Output(0)
 }
 
+// Return the shape of s0 op s1 with broadcast.
+//
+// Given `s0` and `s1`, tensors that represent shapes, compute `r0`, the
+// broadcasted shape. `s0`, `s1` and `r0` are all integer vectors.
+func BroadcastArgs(scope *Scope, s0 tf.Output, s1 tf.Output) (r0 tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "BroadcastArgs",
+		Input: []tf.Input{
+			s0, s1,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// DataFormatDimMapAttr is an optional argument to DataFormatDimMap.
+type DataFormatDimMapAttr func(optionalAttr)
+
+// DataFormatDimMapSrcFormat sets the optional src_format attribute to value.
+//
+// value: source data format.
+// If not specified, defaults to "NHWC"
+func DataFormatDimMapSrcFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["src_format"] = value
+	}
+}
+
+// DataFormatDimMapDstFormat sets the optional dst_format attribute to value.
+//
+// value: destination data format.
+// If not specified, defaults to "NCHW"
+func DataFormatDimMapDstFormat(value string) DataFormatDimMapAttr {
+	return func(m optionalAttr) {
+		m["dst_format"] = value
+	}
+}
+
+// Returns the dimension index in the destination data format given the one in
+//
+// the source data format.
+//
+// Arguments:
+//	x: A Tensor with each element as a dimension index in source data format.
+// Must be in the range [-4, 4).
+//
+// Returns A Tensor with each element as a dimension index in destination data format.
+func DataFormatDimMap(scope *Scope, x tf.Output, optional ...DataFormatDimMapAttr) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "DataFormatDimMap",
+		Input: []tf.Input{
+			x,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // CumprodAttr is an optional argument to Cumprod.
 type CumprodAttr func(optionalAttr)
 
@@ -25404,77 +25818,6 @@ func Roll(scope *Scope, input tf.Output, shift tf.Output, axis tf.Output) (outpu
 	return op.Output(0)
 }
 
-// MapPeekAttr is an optional argument to MapPeek.
-type MapPeekAttr func(optionalAttr)
-
-// MapPeekCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekCapacity(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapPeekMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapPeekMemoryLimit(value int64) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapPeekContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapPeekContainer(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapPeekSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapPeekSharedName(value string) MapPeekAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op peeks at the values at the specified key.  If the
-//
-// underlying container does not contain this key
-// this op will block until it does.
-func MapPeek(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapPeekAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapPeek",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapPeek", err)
-		return
-	}
-	return values
-}
-
 // Looks up keys in a table, outputs the corresponding values.
 //
 // The tensor `keys` must of the same type as the keys of the table.
@@ -25685,142 +26028,6 @@ func LookupTableImportV2(scope *Scope, table_handle tf.Output, keys tf.Output, v
 	return scope.AddOperation(opspec)
 }
 
-// MapUnstageNoKeyAttr is an optional argument to MapUnstageNoKey.
-type MapUnstageNoKeyAttr func(optionalAttr)
-
-// MapUnstageNoKeyCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyCapacity(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageNoKeyMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageNoKeyMemoryLimit(value int64) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageNoKeyContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeyContainer(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageNoKeySharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageNoKeySharedName(value string) MapUnstageNoKeyAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns a random (key, value)
-//
-// from the underlying container.   If the underlying container
-// does not contain elements, the op will block until it does.
-func MapUnstageNoKey(scope *Scope, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageNoKeyAttr) (key tf.Output, values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstageNoKey",
-		Input: []tf.Input{
-			indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	key = op.Output(idx)
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstageNoKey", err)
-		return
-	}
-	return key, values
-}
-
-// HashTableV2Attr is an optional argument to HashTableV2.
-type HashTableV2Attr func(optionalAttr)
-
-// HashTableV2Container sets the optional container attribute to value.
-//
-// value: If non-empty, this table is placed in the given container.
-// Otherwise, a default container is used.
-// If not specified, defaults to ""
-func HashTableV2Container(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// HashTableV2SharedName sets the optional shared_name attribute to value.
-//
-// value: If non-empty, this table is shared under the given name across
-// multiple sessions.
-// If not specified, defaults to ""
-func HashTableV2SharedName(value string) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// HashTableV2UseNodeNameSharing sets the optional use_node_name_sharing attribute to value.
-//
-// value: If true and shared_name is empty, the table is shared
-// using the node name.
-// If not specified, defaults to false
-func HashTableV2UseNodeNameSharing(value bool) HashTableV2Attr {
-	return func(m optionalAttr) {
-		m["use_node_name_sharing"] = value
-	}
-}
-
-// Creates a non-initialized hash table.
-//
-// This op creates a hash table, specifying the type of its keys and values.
-// Before using the table you will have to initialize it.  After initialization the
-// table will be immutable.
-//
-// Arguments:
-//	key_dtype: Type of the table keys.
-//	value_dtype: Type of the table values.
-//
-// Returns Handle to a table.
-func HashTableV2(scope *Scope, key_dtype tf.DataType, value_dtype tf.DataType, optional ...HashTableV2Attr) (table_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"key_dtype": key_dtype, "value_dtype": value_dtype}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "HashTableV2",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // MultiDeviceIteratorFromStringHandleAttr is an optional argument to MultiDeviceIteratorFromStringHandle.
 type MultiDeviceIteratorFromStringHandleAttr func(optionalAttr)
 
@@ -26322,24 +26529,6 @@ func TensorArrayReadV3(scope *Scope, handle tf.Output, index tf.Output, flow_in
 	return op.Output(0)
 }
 
-// Computes the gradient for the tanh of `x` wrt its input.
-//
-// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
-// is the corresponding input gradient.
-func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TanhGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Reduces sparse updates into the variable referenced by `resource` using the `max` operation.
 //
 // This operation computes
@@ -26381,6 +26570,24 @@ func ResourceScatterMax(scope *Scope, resource tf.Output, indices tf.Output, upd
 	return scope.AddOperation(opspec)
 }
 
+// Computes the gradient for the tanh of `x` wrt its input.
+//
+// Specifically, `grad = dy * (1 - y*y)`, where `y = tanh(x)`, and `dy`
+// is the corresponding input gradient.
+func TanhGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "TanhGrad",
+		Input: []tf.Input{
+			y, dy,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Outputs a `Summary` protocol buffer with scalar values.
 //
 // The input `tags` and `values` must have the same shape.  The generated summary
@@ -26669,24 +26876,6 @@ func MergeSummary(scope *Scope, inputs []tf.Output) (summary tf.Output) {
 	return op.Output(0)
 }
 
-// Returns the number of tensors in the input tensor list.
-//
-// input_handle: the input list
-// length: the number of tensors in the list
-func TensorListLength(scope *Scope, input_handle tf.Output) (length tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "TensorListLength",
-		Input: []tf.Input{
-			input_handle,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // The shape of the elements of the given list, as a tensor.
 //
 //   input_handle: the list
@@ -27361,35 +27550,6 @@ func MatrixSquareRoot(scope *Scope, input tf.Output) (output tf.Output) {
 	return op.Output(0)
 }
 
-// Convert JSON-encoded Example records to binary protocol buffer strings.
-//
-// This op translates a tensor containing Example records, encoded using
-// the [standard JSON
-// mapping](https://developers.google.com/protocol-buffers/docs/proto3#json),
-// into a tensor containing the same records encoded as binary protocol
-// buffers. The resulting tensor can then be fed to any of the other
-// Example-parsing ops.
-//
-// Arguments:
-//	json_examples: Each string is a JSON object serialized according to the JSON
-// mapping of the Example proto.
-//
-// Returns Each string is a binary Example protocol buffer corresponding
-// to the respective element of `json_examples`.
-func DecodeJSONExample(scope *Scope, json_examples tf.Output) (binary_examples tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "DecodeJSONExample",
-		Input: []tf.Input{
-			json_examples,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // SvdAttr is an optional argument to Svd.
 type SvdAttr func(optionalAttr)
 
@@ -28426,6 +28586,49 @@ func ReaderReadUpToV2(scope *Scope, reader_handle tf.Output, queue_handle tf.Out
 	return op.Output(0), op.Output(1)
 }
 
+//     Adds v into specified rows of x.
+//
+//     Computes y = x; y[i, :] += v; return y.
+//
+// Arguments:
+//	x: A `Tensor` of type T.
+//	i: A vector. Indices into the left-most dimension of `x`.
+//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
+//
+// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
+func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "InplaceAdd",
+		Input: []tf.Input{
+			x, i, v,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// Restore a Reader to its initial clean state.
+//
+// Arguments:
+//	reader_handle: Handle to a Reader.
+//
+// Returns the created operation.
+func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "ReaderResetV2",
+		Input: []tf.Input{
+			reader_handle,
+		},
+	}
+	return scope.AddOperation(opspec)
+}
+
 // BatchAttr is an optional argument to Batch.
 type BatchAttr func(optionalAttr)
 
@@ -29711,71 +29914,6 @@ func CropAndResizeGradBoxes(scope *Scope, grads tf.Output, image tf.Output, boxe
 	return op.Output(0)
 }
 
-// Saves tensors in V2 checkpoint format.
-//
-// By default, saves the named tensors in full.  If the caller wishes to save
-// specific slices of full tensors, "shape_and_slices" should be non-empty strings
-// and correspondingly well-formed.
-//
-// Arguments:
-//	prefix: Must have a single element. The prefix of the V2 checkpoint to which we
-// write the tensors.
-//	tensor_names: shape {N}. The names of the tensors to be saved.
-//	shape_and_slices: shape {N}.  The slice specs of the tensors to be saved.
-// Empty strings indicate that they are non-partitioned tensors.
-//	tensors: `N` tensors to save.
-//
-// Returns the created operation.
-func SaveV2(scope *Scope, prefix tf.Output, tensor_names tf.Output, shape_and_slices tf.Output, tensors []tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "SaveV2",
-		Input: []tf.Input{
-			prefix, tensor_names, shape_and_slices, tf.OutputList(tensors),
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// StatsAggregatorHandleAttr is an optional argument to StatsAggregatorHandle.
-type StatsAggregatorHandleAttr func(optionalAttr)
-
-// StatsAggregatorHandleContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleContainer(value string) StatsAggregatorHandleAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// StatsAggregatorHandleSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func StatsAggregatorHandleSharedName(value string) StatsAggregatorHandleAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Creates a statistics manager resource.
-func StatsAggregatorHandle(scope *Scope, optional ...StatsAggregatorHandleAttr) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StatsAggregatorHandle",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Greedily selects a subset of bounding boxes in descending order of score,
 //
 // pruning away boxes that have high intersection-over-union (IOU) overlap
@@ -29955,116 +30093,64 @@ func FakeParam(scope *Scope, dtype tf.DataType, shape tf.Shape) (output tf.Outpu
 	return op.Output(0)
 }
 
-// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
-//
-// Arguments:
-//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
-// `N` data inputs should produce the next output element.
-//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
-// the values of `selector_input_dataset`.
-//
+// Computes the gradient for the inverse of `x` wrt its input.
 //
-func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
+// is the corresponding input gradient.
+func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalDirectedInterleaveDataset",
+		Type: "InvGrad",
 		Input: []tf.Input{
-			selector_input_dataset, tf.OutputList(data_input_datasets),
+			y, dy,
 		},
-		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Gets the next element from a FunctionBufferingResource.
-//
-// Arguments:
-//	function_buffer_resource: The FunctionBufferingResource handle.
-//	output_types: The type list for the return values.
+// List of the given size with empty elements.
 //
-// Returns A list of return values.
-func ExperimentalFunctionBufferingResourceGetNext(scope *Scope, function_buffer_resource tf.Output, output_types []tf.DataType) (output []tf.Output) {
+// element_shape: the shape of the future elements of the list
+// num_elements: the number of elements to reserve
+// handle: the output list
+// element_dtype: the desired type of elements in the list.
+func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
-	attrs := map[string]interface{}{"output_types": output_types}
+	attrs := map[string]interface{}{"element_dtype": element_dtype}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalFunctionBufferingResourceGetNext",
+		Type: "TensorListReserve",
 		Input: []tf.Input{
-			function_buffer_resource,
+			element_shape, num_elements,
 		},
 		Attrs: attrs,
 	}
 	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if output, idx, err = makeOutputList(op, idx, "output"); err != nil {
-		scope.UpdateErr("ExperimentalFunctionBufferingResourceGetNext", err)
-		return
-	}
-	return output
-}
-
-//     Adds v into specified rows of x.
-//
-//     Computes y = x; y[i, :] += v; return y.
-//
-// Arguments:
-//	x: A `Tensor` of type T.
-//	i: A vector. Indices into the left-most dimension of `x`.
-//	v: A `Tensor` of type T. Same dimension sizes as x except the first dimension, which must be the same as i's size.
-//
-// Returns A `Tensor` of type T. An alias of `x`. The content of `y` is undefined if there are duplicates in `i`.
-func InplaceAdd(scope *Scope, x tf.Output, i tf.Output, v tf.Output) (y tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InplaceAdd",
-		Input: []tf.Input{
-			x, i, v,
-		},
-	}
-	op := scope.AddOperation(opspec)
 	return op.Output(0)
 }
 
-// Restore a Reader to its initial clean state.
+// A substitute for `InterleaveDataset` on a fixed list of `N` datasets.
 //
 // Arguments:
-//	reader_handle: Handle to a Reader.
+//	selector_input_dataset: A dataset of scalar `DT_INT64` elements that determines which of the
+// `N` data inputs should produce the next output element.
+//	data_input_datasets: `N` datasets with the same type that will be interleaved according to
+// the values of `selector_input_dataset`.
 //
-// Returns the created operation.
-func ReaderResetV2(scope *Scope, reader_handle tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "ReaderResetV2",
-		Input: []tf.Input{
-			reader_handle,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// A dataset that splits the elements of its input into multiple elements.
-func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
+//
+func ExperimentalDirectedInterleaveDataset(scope *Scope, selector_input_dataset tf.Output, data_input_datasets []tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
 	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
 	opspec := tf.OpSpec{
-		Type: "UnbatchDataset",
+		Type: "ExperimentalDirectedInterleaveDataset",
 		Input: []tf.Input{
-			input_dataset,
+			selector_input_dataset, tf.OutputList(data_input_datasets),
 		},
 		Attrs: attrs,
 	}
@@ -30072,184 +30158,48 @@ func UnbatchDataset(scope *Scope, input_dataset tf.Output, output_types []tf.Dat
 	return op.Output(0)
 }
 
-// OrderedMapStageAttr is an optional argument to OrderedMapStage.
-type OrderedMapStageAttr func(optionalAttr)
+// RandomUniformIntAttr is an optional argument to RandomUniformInt.
+type RandomUniformIntAttr func(optionalAttr)
 
-// OrderedMapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
+// RandomUniformIntSeed sets the optional seed attribute to value.
 //
-// REQUIRES: value >= 0
-func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// value: If either `seed` or `seed2` are set to be non-zero, the random number
+// generator is seeded by the given seed.  Otherwise, it is seeded by a
+// random seed.
 // If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// OrderedMapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func OrderedMapStageContainer(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// OrderedMapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a ordered
-//
-// associative container.   Elements are ordered by key.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "OrderedMapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// RpcAttr is an optional argument to Rpc.
-type RpcAttr func(optionalAttr)
-
-// RpcProtocol sets the optional protocol attribute to value.
-//
-// value: RPC protocol to use.  Empty string means use the default protocol.
-// Options include 'grpc'.
-// If not specified, defaults to ""
-func RpcProtocol(value string) RpcAttr {
-	return func(m optionalAttr) {
-		m["protocol"] = value
-	}
-}
-
-// RpcFailFast sets the optional fail_fast attribute to value.
-//
-// value: `boolean`. If `true` (default), then failures to connect
-// (i.e., the server does not immediately respond) cause an RPC failure.
-// If not specified, defaults to true
-func RpcFailFast(value bool) RpcAttr {
+func RandomUniformIntSeed(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["fail_fast"] = value
+		m["seed"] = value
 	}
 }
 
-// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+// RandomUniformIntSeed2 sets the optional seed2 attribute to value.
 //
-// value: `int`. If `0` (default), then the kernel will run the RPC
-// request and only time out if the RPC deadline passes or the session times out.
-// If this value is greater than `0`, then the op will raise an exception if
-// the RPC takes longer than `timeout_in_ms`.
+// value: A second seed to avoid seed collision.
 // If not specified, defaults to 0
-func RpcTimeoutInMs(value int64) RpcAttr {
+func RandomUniformIntSeed2(value int64) RandomUniformIntAttr {
 	return func(m optionalAttr) {
-		m["timeout_in_ms"] = value
+		m["seed2"] = value
 	}
 }
 
-// Perform batches of RPC requests.
-//
-// This op asynchronously performs either a single RPC request, or a batch
-// of requests.  RPC requests are defined by three main parameters:
-//
-//   - `address` (the host+port or BNS address of the request)
-//   - `method` (the RPC method name for the request)
-//   - `request` (the serialized proto string, or vector of strings,
-//      of the RPC request argument).
-//
-// For example, if you have an RPC service running on port localhost:2345,
-// and its interface is configured with the following proto declaration:
-//
-// ```
-// service MyService {
-//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
-//   }
-// };
-// ```
-//
-// then call this op with arguments:
-//
-// ```
-// address = "localhost:2345"
-// method = "MyService/MyMethod"
-// ```
-//
-// The `request` tensor is a string tensor representing serialized `MyRequestProto`
-// strings; and the output string tensor `response` will have the same shape
-// and contain (upon successful completion) corresponding serialized
-// `MyResponseProto` strings.
-//
-// For example, to send a single, empty, `MyRequestProto`, call
-// this op with `request = ""`.  To send 5 **parallel** empty requests,
-// call this op with `request = ["", "", "", "", ""]`.
-//
-// More generally, one can create a batch of `MyRequestProto` serialized protos
-// from regular batched tensors using the `encode_proto` op, and convert
-// the response `MyResponseProto` serialized protos to batched tensors
-// using the `decode_proto` op.
-//
-// **NOTE** Working with serialized proto strings is faster than instantiating
-// actual proto objects in memory, so no performance degradation is expected
-// compared to writing custom kernels for this workflow.
+// Outputs random integers from a uniform distribution.
 //
-// If the connection fails or the remote worker returns an error
-// status, the op reraises this exception locally.
+// The generated values are uniform integers in the range `[minval, maxval)`.
+// The lower bound `minval` is included in the range, while the upper bound
+// `maxval` is excluded.
 //
-// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+// The random integers are slightly biased unless `maxval - minval` is an exact
+// power of two.  The bias is small for values of `maxval - minval` significantly
+// smaller than the range of the output (either `2^32` or `2^64`).
 //
 // Arguments:
-//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `method` and `request`.
-//	method: `0-D` or `1-D`.  The method address on the RPC server.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `request`.
-//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
-// If this tensor has more than 1 element, then multiple parallel rpc requests
-// are sent.  This argument broadcasts with `address` and `method`.
+//	shape: The shape of the output tensor.
+//	minval: 0-D.  Inclusive lower bound on the generated integers.
+//	maxval: 0-D.  Exclusive upper bound on the generated integers.
 //
-// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
-func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+// Returns A tensor of the specified shape filled with uniform random integers.
+func RandomUniformInt(scope *Scope, shape tf.Output, minval tf.Output, maxval tf.Output, optional ...RandomUniformIntAttr) (output tf.Output) {
 	if scope.Err() != nil {
 		return
 	}
@@ -30258,9 +30208,9 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 		a(attrs)
 	}
 	opspec := tf.OpSpec{
-		Type: "Rpc",
+		Type: "RandomUniformInt",
 		Input: []tf.Input{
-			address, method, request,
+			shape, minval, maxval,
 		},
 		Attrs: attrs,
 	}
@@ -30268,59 +30218,25 @@ func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, o
 	return op.Output(0)
 }
 
-// StackPushV2Attr is an optional argument to StackPushV2.
-type StackPushV2Attr func(optionalAttr)
-
-// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
-//
-// value: Swap `elem` to CPU. Default to false.
-// If not specified, defaults to false
-func StackPushV2SwapMemory(value bool) StackPushV2Attr {
-	return func(m optionalAttr) {
-		m["swap_memory"] = value
-	}
-}
-
-// Push an element onto the stack.
-//
-// Arguments:
-//	handle: The handle to a stack.
-//	elem: The tensor to be pushed onto the stack.
+// Add the quantile summaries to each quantile stream resource.
 //
-// Returns The same tensor as the input 'elem'.
-func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "StackPushV2",
-		Input: []tf.Input{
-			handle, elem,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Resets the FunctionBufferingResource.
+// An op that adds a list of quantile summaries to a quantile stream resource. Each
+// summary Tensor is rank 2, containing summaries (value, weight, min_rank, max_rank)
+// for a single feature.
 //
 // Arguments:
-//	function_buffer_resource: The FunctionBufferingResource handle.
+//	quantile_stream_resource_handle: resource handle referring to a QuantileStreamResource.
+//	summaries: string; List of Rank 2 Tensor each containing the summaries for a single feature.
 //
 // Returns the created operation.
-func ExperimentalFunctionBufferingResourceReset(scope *Scope, function_buffer_resource tf.Output) (o *tf.Operation) {
+func BoostedTreesQuantileStreamResourceAddSummaries(scope *Scope, quantile_stream_resource_handle tf.Output, summaries []tf.Output) (o *tf.Operation) {
 	if scope.Err() != nil {
 		return
 	}
 	opspec := tf.OpSpec{
-		Type: "ExperimentalFunctionBufferingResourceReset",
+		Type: "BoostedTreesQuantileStreamResourceAddSummaries",
 		Input: []tf.Input{
-			function_buffer_resource,
+			quantile_stream_resource_handle, tf.OutputList(summaries),
 		},
 	}
 	return scope.AddOperation(opspec)
@@ -30615,168 +30531,6 @@ func ConcatenateDataset(scope *Scope, input_dataset tf.Output, another_dataset t
 	return op.Output(0)
 }
 
-// Adds a value to the current value of a variable.
-//
-// Any ReadVariableOp with a control dependency on this op is guaranteed to
-// see the incremented value or a subsequent newer one.
-//
-// Arguments:
-//	resource: handle to the resource in which to store the variable.
-//	value: the value by which the variable will be incremented.
-//
-// Returns the created operation.
-func AssignAddVariableOp(scope *Scope, resource tf.Output, value tf.Output) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "AssignAddVariableOp",
-		Input: []tf.Input{
-			resource, value,
-		},
-	}
-	return scope.AddOperation(opspec)
-}
-
-// Records the latency of producing `input_dataset` elements in a StatsAggregator.
-func LatencyStatsDataset(scope *Scope, input_dataset tf.Output, tag tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "LatencyStatsDataset",
-		Input: []tf.Input{
-			input_dataset, tag,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// MapSizeAttr is an optional argument to MapSize.
-type MapSizeAttr func(optionalAttr)
-
-// MapSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeCapacity(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapSizeMemoryLimit(value int64) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapSizeContainer(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapSizeSharedName(value string) MapSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of elements in the underlying container.
-func MapSize(scope *Scope, dtypes []tf.DataType, optional ...MapSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// SparseToDenseAttr is an optional argument to SparseToDense.
-type SparseToDenseAttr func(optionalAttr)
-
-// SparseToDenseValidateIndices sets the optional validate_indices attribute to value.
-//
-// value: If true, indices are checked to make sure they are sorted in
-// lexicographic order and that there are no repeats.
-// If not specified, defaults to true
-func SparseToDenseValidateIndices(value bool) SparseToDenseAttr {
-	return func(m optionalAttr) {
-		m["validate_indices"] = value
-	}
-}
-
-// Converts a sparse representation into a dense tensor.
-//
-// Builds an array `dense` with shape `output_shape` such that
-//
-// ```
-// # If sparse_indices is scalar
-// dense[i] = (i == sparse_indices ? sparse_values : default_value)
-//
-// # If sparse_indices is a vector, then for each i
-// dense[sparse_indices[i]] = sparse_values[i]
-//
-// # If sparse_indices is an n by d matrix, then for each i in [0, n)
-// dense[sparse_indices[i][0], ..., sparse_indices[i][d-1]] = sparse_values[i]
-// ```
-//
-// All other values in `dense` are set to `default_value`.  If `sparse_values` is a
-// scalar, all sparse indices are set to this single value.
-//
-// Indices should be sorted in lexicographic order, and indices must not
-// contain any repeats. If `validate_indices` is true, these properties
-// are checked during execution.
-//
-// Arguments:
-//	sparse_indices: 0-D, 1-D, or 2-D.  `sparse_indices[i]` contains the complete
-// index where `sparse_values[i]` will be placed.
-//	output_shape: 1-D.  Shape of the dense output tensor.
-//	sparse_values: 1-D.  Values corresponding to each row of `sparse_indices`,
-// or a scalar value to be used for all sparse indices.
-//	default_value: Scalar value to set for indices not specified in
-// `sparse_indices`.
-//
-// Returns Dense output tensor of shape `output_shape`.
-func SparseToDense(scope *Scope, sparse_indices tf.Output, output_shape tf.Output, sparse_values tf.Output, default_value tf.Output, optional ...SparseToDenseAttr) (dense tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "SparseToDense",
-		Input: []tf.Input{
-			sparse_indices, output_shape, sparse_values, default_value,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Computes the grayscale dilation of 4-D `input` and 3-D `filter` tensors.
 //
 // The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
@@ -30910,52 +30664,6 @@ func PaddedBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Out
 	return op.Output(0)
 }
 
-// Creates a dataset that batches input elements into a SparseTensor.
-//
-// Arguments:
-//	input_dataset: A handle to an input dataset. Must have a single component.
-//	batch_size: A scalar representing the number of elements to accumulate in a
-// batch.
-//	row_shape: A vector representing the dense shape of each row in the produced
-// SparseTensor. The shape may be partially specified, using `-1` to indicate
-// that a particular dimension should use the maximum size of all batch elements.
-//
-//
-func DenseToSparseBatchDataset(scope *Scope, input_dataset tf.Output, batch_size tf.Output, row_shape tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "DenseToSparseBatchDataset",
-		Input: []tf.Input{
-			input_dataset, batch_size, row_shape,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Deprecated. Use TensorArrayGradV3
-//
-// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
-func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"source": source}
-	opspec := tf.OpSpec{
-		Type: "TensorArrayGradV2",
-		Input: []tf.Input{
-			handle, flow_in,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Creates a dataset that shuffles and repeats elements from `input_dataset`
 //
 // pseudorandomly.
@@ -31561,46 +31269,6 @@ func FIFOQueueV2(scope *Scope, component_types []tf.DataType, optional ...FIFOQu
 	return op.Output(0)
 }
 
-// Produces a summary of any statistics recorded by the given statistics manager.
-func StatsAggregatorSummary(scope *Scope, iterator tf.Output) (summary tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "StatsAggregatorSummary",
-		Input: []tf.Input{
-			iterator,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// Compute the pairwise cross product.
-//
-// `a` and `b` must be the same shape; they can either be simple 3-element vectors,
-// or any shape where the innermost dimension is 3. In the latter case, each pair
-// of corresponding 3-element vectors is cross-multiplied independently.
-//
-// Arguments:
-//	a: A tensor containing 3-element vectors.
-//	b: Another tensor, of same type and shape as `a`.
-//
-// Returns Pairwise cross product of the vectors in `a` and `b`.
-func Cross(scope *Scope, a tf.Output, b tf.Output) (product tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "Cross",
-		Input: []tf.Input{
-			a, b,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Constructs an Optional variant from a tuple of tensors.
 func OptionalFromValue(scope *Scope, components []tf.Output) (optional tf.Output) {
 	if scope.Err() != nil {
@@ -31769,30 +31437,6 @@ func OptionalHasValue(scope *Scope, optional tf.Output) (has_value tf.Output) {
 	return op.Output(0)
 }
 
-// Creates a dataset that executes a SQL query and emits rows of the result set.
-//
-// Arguments:
-//	driver_name: The database type. Currently, the only supported type is 'sqlite'.
-//	data_source_name: A connection string to connect to the database.
-//	query: A SQL query to execute.
-//
-//
-func SqlDataset(scope *Scope, driver_name tf.Output, data_source_name tf.Output, query tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"output_types": output_types, "output_shapes": output_shapes}
-	opspec := tf.OpSpec{
-		Type: "SqlDataset",
-		Input: []tf.Input{
-			driver_name, data_source_name, query,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // Returns the value stored in an Optional variant or raises an error if none exists.
 func OptionalGetValue(scope *Scope, optional tf.Output, output_types []tf.DataType, output_shapes []tf.Shape) (components []tf.Output) {
 	if scope.Err() != nil {
@@ -31888,6 +31532,24 @@ func ModelDataset(scope *Scope, input_dataset tf.Output, output_types []tf.DataT
 	return op.Output(0)
 }
 
+// Returns the truth value of (x > y) element-wise.
+//
+// *NOTE*: `Greater` supports broadcasting. More about broadcasting
+// [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+func Greater(scope *Scope, x tf.Output, y tf.Output) (z tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	opspec := tf.OpSpec{
+		Type: "Greater",
+		Input: []tf.Input{
+			x, y,
+		},
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // Performs a padding as a preprocess during a convolution.
 //
 // Similar to FusedResizeAndPadConv2d, this op allows for an optimized
@@ -32208,46 +31870,6 @@ func ParallelDynamicStitch(scope *Scope, indices []tf.Output, data []tf.Output)
 	return op.Output(0)
 }
 
-// Computes the gradient for the inverse of `x` wrt its input.
-//
-// Specifically, `grad = -dy * y*y`, where `y = 1/x`, and `dy`
-// is the corresponding input gradient.
-func InvGrad(scope *Scope, y tf.Output, dy tf.Output) (z tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	opspec := tf.OpSpec{
-		Type: "InvGrad",
-		Input: []tf.Input{
-			y, dy,
-		},
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
-// List of the given size with empty elements.
-//
-// element_shape: the shape of the future elements of the list
-// num_elements: the number of elements to reserve
-// handle: the output list
-// element_dtype: the desired type of elements in the list.
-func TensorListReserve(scope *Scope, element_shape tf.Output, num_elements tf.Output, element_dtype tf.DataType) (handle tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"element_dtype": element_dtype}
-	opspec := tf.OpSpec{
-		Type: "TensorListReserve",
-		Input: []tf.Input{
-			element_shape, num_elements,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
-
 // PriorityQueueV2Attr is an optional argument to PriorityQueueV2.
 type PriorityQueueV2Attr func(optionalAttr)
 
@@ -32786,6 +32408,241 @@ func StackV2(scope *Scope, max_size tf.Output, elem_type tf.DataType, optional .
 	return op.Output(0)
 }
 
+// OrderedMapStageAttr is an optional argument to OrderedMapStage.
+type OrderedMapStageAttr func(optionalAttr)
+
+// OrderedMapStageCapacity sets the optional capacity attribute to value.
+//
+// value: Maximum number of elements in the Staging Area. If > 0, inserts
+// on the container will block when the capacity is reached.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageCapacity(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["capacity"] = value
+	}
+}
+
+// OrderedMapStageMemoryLimit sets the optional memory_limit attribute to value.
+// If not specified, defaults to 0
+//
+// REQUIRES: value >= 0
+func OrderedMapStageMemoryLimit(value int64) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["memory_limit"] = value
+	}
+}
+
+// OrderedMapStageContainer sets the optional container attribute to value.
+//
+// value: If non-empty, this queue is placed in the given container. Otherwise,
+// a default container is used.
+// If not specified, defaults to ""
+func OrderedMapStageContainer(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["container"] = value
+	}
+}
+
+// OrderedMapStageSharedName sets the optional shared_name attribute to value.
+//
+// value: It is necessary to match this name to the matching Unstage Op.
+// If not specified, defaults to ""
+func OrderedMapStageSharedName(value string) OrderedMapStageAttr {
+	return func(m optionalAttr) {
+		m["shared_name"] = value
+	}
+}
+
+// Stage (key, values) in the underlying container which behaves like a ordered
+//
+// associative container.   Elements are ordered by key.
+//
+// Arguments:
+//	key: int64
+//
+//	values: a list of tensors
+// dtypes A list of data types that inserted values should adhere to.
+//
+//
+// Returns the created operation.
+func OrderedMapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...OrderedMapStageAttr) (o *tf.Operation) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"dtypes": dtypes}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "OrderedMapStage",
+		Input: []tf.Input{
+			key, indices, tf.OutputList(values),
+		},
+		Attrs: attrs,
+	}
+	return scope.AddOperation(opspec)
+}
+
+// RpcAttr is an optional argument to Rpc.
+type RpcAttr func(optionalAttr)
+
+// RpcProtocol sets the optional protocol attribute to value.
+//
+// value: RPC protocol to use.  Empty string means use the default protocol.
+// Options include 'grpc'.
+// If not specified, defaults to ""
+func RpcProtocol(value string) RpcAttr {
+	return func(m optionalAttr) {
+		m["protocol"] = value
+	}
+}
+
+// RpcFailFast sets the optional fail_fast attribute to value.
+//
+// value: `boolean`. If `true` (default), then failures to connect
+// (i.e., the server does not immediately respond) cause an RPC failure.
+// If not specified, defaults to true
+func RpcFailFast(value bool) RpcAttr {
+	return func(m optionalAttr) {
+		m["fail_fast"] = value
+	}
+}
+
+// RpcTimeoutInMs sets the optional timeout_in_ms attribute to value.
+//
+// value: `int`. If `0` (default), then the kernel will run the RPC
+// request and only time out if the RPC deadline passes or the session times out.
+// If this value is greater than `0`, then the op will raise an exception if
+// the RPC takes longer than `timeout_in_ms`.
+// If not specified, defaults to 0
+func RpcTimeoutInMs(value int64) RpcAttr {
+	return func(m optionalAttr) {
+		m["timeout_in_ms"] = value
+	}
+}
+
+// Perform batches of RPC requests.
+//
+// This op asynchronously performs either a single RPC request, or a batch
+// of requests.  RPC requests are defined by three main parameters:
+//
+//   - `address` (the host+port or BNS address of the request)
+//   - `method` (the RPC method name for the request)
+//   - `request` (the serialized proto string, or vector of strings,
+//      of the RPC request argument).
+//
+// For example, if you have an RPC service running on port localhost:2345,
+// and its interface is configured with the following proto declaration:
+//
+// ```
+// service MyService {
+//   rpc MyMethod(MyRequestProto) returns (MyResponseProto) {
+//   }
+// };
+// ```
+//
+// then call this op with arguments:
+//
+// ```
+// address = "localhost:2345"
+// method = "MyService/MyMethod"
+// ```
+//
+// The `request` tensor is a string tensor representing serialized `MyRequestProto`
+// strings; and the output string tensor `response` will have the same shape
+// and contain (upon successful completion) corresponding serialized
+// `MyResponseProto` strings.
+//
+// For example, to send a single, empty, `MyRequestProto`, call
+// this op with `request = ""`.  To send 5 **parallel** empty requests,
+// call this op with `request = ["", "", "", "", ""]`.
+//
+// More generally, one can create a batch of `MyRequestProto` serialized protos
+// from regular batched tensors using the `encode_proto` op, and convert
+// the response `MyResponseProto` serialized protos to batched tensors
+// using the `decode_proto` op.
+//
+// **NOTE** Working with serialized proto strings is faster than instantiating
+// actual proto objects in memory, so no performance degradation is expected
+// compared to writing custom kernels for this workflow.
+//
+// If the connection fails or the remote worker returns an error
+// status, the op reraises this exception locally.
+//
+// See the `TryRpc` op if you prefer to handle RPC failures manually in the graph.
+//
+// Arguments:
+//	address: `0-D` or `1-D`.  The address (i.e. host_name:port) of the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `method` and `request`.
+//	method: `0-D` or `1-D`.  The method address on the RPC server.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `request`.
+//	request: `0-D` or `1-D`.  Serialized proto strings: the rpc request argument.
+// If this tensor has more than 1 element, then multiple parallel rpc requests
+// are sent.  This argument broadcasts with `address` and `method`.
+//
+// Returns Same shape as `request`. Serialized proto strings: the rpc responses.
+func Rpc(scope *Scope, address tf.Output, method tf.Output, request tf.Output, optional ...RpcAttr) (response tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "Rpc",
+		Input: []tf.Input{
+			address, method, request,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
+// StackPushV2Attr is an optional argument to StackPushV2.
+type StackPushV2Attr func(optionalAttr)
+
+// StackPushV2SwapMemory sets the optional swap_memory attribute to value.
+//
+// value: Swap `elem` to CPU. Default to false.
+// If not specified, defaults to false
+func StackPushV2SwapMemory(value bool) StackPushV2Attr {
+	return func(m optionalAttr) {
+		m["swap_memory"] = value
+	}
+}
+
+// Push an element onto the stack.
+//
+// Arguments:
+//	handle: The handle to a stack.
+//	elem: The tensor to be pushed onto the stack.
+//
+// Returns The same tensor as the input 'elem'.
+func StackPushV2(scope *Scope, handle tf.Output, elem tf.Output, optional ...StackPushV2Attr) (output tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{}
+	for _, a := range optional {
+		a(attrs)
+	}
+	opspec := tf.OpSpec{
+		Type: "StackPushV2",
+		Input: []tf.Input{
+			handle, elem,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // FusedBatchNormGradV2Attr is an optional argument to FusedBatchNormGradV2.
 type FusedBatchNormGradV2Attr func(optionalAttr)
 
@@ -33169,6 +33026,25 @@ func TensorArraySizeV3(scope *Scope, handle tf.Output, flow_in tf.Output) (size
 	return op.Output(0)
 }
 
+// Deprecated. Use TensorArrayGradV3
+//
+// DEPRECATED at GraphDef version 26: Use TensorArrayGradV3
+func TensorArrayGradV2(scope *Scope, handle tf.Output, flow_in tf.Output, source string) (grad_handle tf.Output) {
+	if scope.Err() != nil {
+		return
+	}
+	attrs := map[string]interface{}{"source": source}
+	opspec := tf.OpSpec{
+		Type: "TensorArrayGradV2",
+		Input: []tf.Input{
+			handle, flow_in,
+		},
+		Attrs: attrs,
+	}
+	op := scope.AddOperation(opspec)
+	return op.Output(0)
+}
+
 // SparseReduceMaxAttr is an optional argument to SparseReduceMax.
 type SparseReduceMaxAttr func(optionalAttr)
 
@@ -33794,206 +33670,3 @@ func StagePeek(scope *Scope, index tf.Output, dtypes []tf.DataType, optional ...
 	}
 	return values
 }
-
-// MapStageAttr is an optional argument to MapStage.
-type MapStageAttr func(optionalAttr)
-
-// MapStageCapacity sets the optional capacity attribute to value.
-//
-// value: Maximum number of elements in the Staging Area. If > 0, inserts
-// on the container will block when the capacity is reached.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapStageCapacity(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapStageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapStageMemoryLimit(value int64) MapStageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapStageContainer sets the optional container attribute to value.
-//
-// value: If non-empty, this queue is placed in the given container. Otherwise,
-// a default container is used.
-// If not specified, defaults to ""
-func MapStageContainer(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapStageSharedName sets the optional shared_name attribute to value.
-//
-// value: It is necessary to match this name to the matching Unstage Op.
-// If not specified, defaults to ""
-func MapStageSharedName(value string) MapStageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Stage (key, values) in the underlying container which behaves like a hashtable.
-//
-// Arguments:
-//	key: int64
-//
-//	values: a list of tensors
-// dtypes A list of data types that inserted values should adhere to.
-//
-//
-// Returns the created operation.
-func MapStage(scope *Scope, key tf.Output, indices tf.Output, values []tf.Output, dtypes []tf.DataType, optional ...MapStageAttr) (o *tf.Operation) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapStage",
-		Input: []tf.Input{
-			key, indices, tf.OutputList(values),
-		},
-		Attrs: attrs,
-	}
-	return scope.AddOperation(opspec)
-}
-
-// MapUnstageAttr is an optional argument to MapUnstage.
-type MapUnstageAttr func(optionalAttr)
-
-// MapUnstageCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageCapacity(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapUnstageMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapUnstageMemoryLimit(value int64) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapUnstageContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapUnstageContainer(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapUnstageSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapUnstageSharedName(value string) MapUnstageAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op removes and returns the values associated with the key
-//
-// from the underlying container.   If the underlying container
-// does not contain this key, the op will block until it does.
-func MapUnstage(scope *Scope, key tf.Output, indices tf.Output, dtypes []tf.DataType, optional ...MapUnstageAttr) (values []tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapUnstage",
-		Input: []tf.Input{
-			key, indices,
-		},
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	if scope.Err() != nil {
-		return
-	}
-	var idx int
-	var err error
-	if values, idx, err = makeOutputList(op, idx, "values"); err != nil {
-		scope.UpdateErr("MapUnstage", err)
-		return
-	}
-	return values
-}
-
-// MapIncompleteSizeAttr is an optional argument to MapIncompleteSize.
-type MapIncompleteSizeAttr func(optionalAttr)
-
-// MapIncompleteSizeCapacity sets the optional capacity attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeCapacity(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["capacity"] = value
-	}
-}
-
-// MapIncompleteSizeMemoryLimit sets the optional memory_limit attribute to value.
-// If not specified, defaults to 0
-//
-// REQUIRES: value >= 0
-func MapIncompleteSizeMemoryLimit(value int64) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["memory_limit"] = value
-	}
-}
-
-// MapIncompleteSizeContainer sets the optional container attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeContainer(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["container"] = value
-	}
-}
-
-// MapIncompleteSizeSharedName sets the optional shared_name attribute to value.
-// If not specified, defaults to ""
-func MapIncompleteSizeSharedName(value string) MapIncompleteSizeAttr {
-	return func(m optionalAttr) {
-		m["shared_name"] = value
-	}
-}
-
-// Op returns the number of incomplete elements in the underlying container.
-func MapIncompleteSize(scope *Scope, dtypes []tf.DataType, optional ...MapIncompleteSizeAttr) (size tf.Output) {
-	if scope.Err() != nil {
-		return
-	}
-	attrs := map[string]interface{}{"dtypes": dtypes}
-	for _, a := range optional {
-		a(attrs)
-	}
-	opspec := tf.OpSpec{
-		Type: "MapIncompleteSize",
-
-		Attrs: attrs,
-	}
-	op := scope.AddOperation(opspec)
-	return op.Output(0)
-}
diff --git a/tensorflow/java/README.md b/tensorflow/java/README.md
index 2fa81ed88f6187c5306584b705522f9fcf3aeac1..951e8bdd0dd8aae46a361a8ffcff276579433641 100644
--- a/tensorflow/java/README.md
+++ b/tensorflow/java/README.md
@@ -1,12 +1,13 @@
 # TensorFlow for Java
 
 > *WARNING*: The TensorFlow Java API is not currently covered by the TensorFlow
-> [API stability guarantees](https://www.tensorflow.org/guide/version_semantics).
+> [API stability guarantees](https://www.tensorflow.org/guide/version_compat).
 >
 > For using TensorFlow on Android refer instead to
 > [contrib/android](https://www.tensorflow.org/code/tensorflow/contrib/android),
 > [makefile](https://www.tensorflow.org/code/tensorflow/contrib/makefile#android)
-> and/or the [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
+> and/or the
+> [Android demo](https://www.tensorflow.org/code/tensorflow/examples/android).
 
 ## Quickstart
 
diff --git a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
index 1bd00a763ddff2f067183f57cfa80fdcbed84fd2..3229cce2776dd305a67d5936c37db5b1d9626402 100644
--- a/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
+++ b/tensorflow/java/src/test/java/org/tensorflow/TensorTest.java
@@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
+import java.nio.Buffer;
 import java.nio.ByteBuffer;
 import java.nio.ByteOrder;
 import java.nio.DoubleBuffer;
@@ -100,7 +101,7 @@ public class TensorTest {
                     : ByteOrder.LITTLE_ENDIAN)
             .asDoubleBuffer()
             .put(doubles);
-    buf.flip();
+    flipBuffer(buf);
     try (Tensor<Double> t = Tensor.create(new long[] {doubles.length}, buf)) {
       double[] actual = new double[doubles.length];
       assertArrayEquals(doubles, t.copyTo(actual), EPSILON);
@@ -179,30 +180,30 @@ public class TensorTest {
       {
         ByteBuffer bbuf = ByteBuffer.allocate(1024).order(ByteOrder.nativeOrder());
 
-        bbuf.clear(); // FLOAT
+        clearBuffer(bbuf); // FLOAT
         tfloats.writeTo(bbuf);
         assertEquals(tfloats.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(floats[0], bbuf.asFloatBuffer().get(0), EPSILON);
-        bbuf.clear(); // DOUBLE
+        clearBuffer(bbuf); // DOUBLE
         tdoubles.writeTo(bbuf);
         assertEquals(tdoubles.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(doubles[0], bbuf.asDoubleBuffer().get(0), EPSILON);
-        bbuf.clear(); // INT32
+        clearBuffer(bbuf); // INT32
         tints.writeTo(bbuf);
         assertEquals(tints.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(ints[0], bbuf.asIntBuffer().get(0));
-        bbuf.clear(); // INT64
+        clearBuffer(bbuf); // INT64
         tlongs.writeTo(bbuf);
         assertEquals(tlongs.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(longs[0], bbuf.asLongBuffer().get(0));
-        bbuf.clear(); // BOOL
+        clearBuffer(bbuf); // BOOL
         tbools.writeTo(bbuf);
         assertEquals(tbools.numBytes(), bbuf.position());
-        bbuf.flip();
+        flipBuffer(bbuf);
         assertEquals(bools[0], bbuf.get(0) != 0);
       }
 
@@ -254,7 +255,7 @@ public class TensorTest {
                         : ByteOrder.LITTLE_ENDIAN)
                 .asDoubleBuffer();
         tdoubles.writeTo(foreignBuf);
-        foreignBuf.flip();
+        flipBuffer(foreignBuf);
         double[] actual = new double[foreignBuf.remaining()];
         foreignBuf.get(actual);
         assertArrayEquals(doubles, actual, EPSILON);
@@ -547,4 +548,25 @@ public class TensorTest {
       // expected.
     }
   }
+
+  // Workaround for cross compiliation
+  // (e.g., javac -source 1.9 -target 1.8).
+  //
+  // In Java 8 and prior, subclasses of java.nio.Buffer (e.g., java.nio.DoubleBuffer) inherited the
+  // "flip()" and "clear()" methods from java.nio.Buffer resulting in the signature:
+  //   Buffer flip();
+  // In Java 9 these subclasses had their own methods like:
+  //   DoubleBuffer flip();
+  // As a result, compiling for 1.9 source for a target of JDK 1.8 would result in errors at runtime
+  // like:
+  //
+  // java.lang.NoSuchMethodError: java.nio.DoubleBuffer.flip()Ljava/nio/DoubleBuffer
+  private static void flipBuffer(Buffer buf) {
+    buf.flip();
+  }
+
+  // See comment for flipBuffer()
+  private static void clearBuffer(Buffer buf) {
+    buf.clear();
+  }
 }
diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD
index be84fc5db1fa4240de53beef0f14f679508298a5..8fca01624cfa2c21cd428e63ed1eadf7b853f107 100644
--- a/tensorflow/lite/BUILD
+++ b/tensorflow/lite/BUILD
@@ -1,11 +1,12 @@
-package(default_visibility = [
-    "//visibility:public",
-])
+package(
+    default_visibility = ["//visibility:public"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
 load("//tensorflow:tensorflow.bzl", "tf_cc_test")
-load("//tensorflow/lite:build_def.bzl", "tflite_copts", "gen_selected_ops")
+load("//tensorflow/lite:build_def.bzl", "tflite_copts")
+load("//tensorflow:tensorflow.bzl", "if_not_windows")
 
 exports_files(glob([
     "testdata/*.bin",
@@ -35,15 +36,22 @@ config_setting(
     visibility = ["//visibility:public"],
 )
 
+TFLITE_DEFAULT_COPTS = if_not_windows([
+    "-Wall",
+    "-Wno-comment",
+])
+
 cc_library(
     name = "schema_fbs_version",
     hdrs = ["version.h"],
+    copts = TFLITE_DEFAULT_COPTS,
 )
 
 cc_library(
     name = "arena_planner",
     srcs = ["arena_planner.cc"],
     hdrs = ["arena_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = [
         ":graph_info",
         ":memory_planner",
@@ -57,12 +65,10 @@ cc_test(
     size = "small",
     srcs = ["arena_planner_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
         ":arena_planner",
-        "//tensorflow/core:framework",
         "//tensorflow/core:lib",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
@@ -74,18 +80,21 @@ cc_test(
 cc_library(
     name = "context",
     hdrs = ["context.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 cc_library(
     name = "graph_info",
     hdrs = ["graph_info.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 cc_library(
     name = "memory_planner",
     hdrs = ["memory_planner.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
@@ -93,6 +102,7 @@ cc_library(
     name = "simple_memory_arena",
     srcs = ["simple_memory_arena.cc"],
     hdrs = ["simple_memory_arena.h"],
+    copts = TFLITE_DEFAULT_COPTS,
     deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
@@ -109,9 +119,9 @@ cc_library(
     hdrs = [
         "builtin_op_data.h",
         "builtin_ops.h",
-        "context.h",
         "context_util.h",
     ],
+    deps = ["//tensorflow/lite/c:c_api_internal"],
 )
 
 exports_files(["builtin_ops.h"])
@@ -121,9 +131,7 @@ cc_library(
     hdrs = [
         "string.h",
     ],
-    deps = [
-        "//tensorflow/core:lib_platform",
-    ],
+    copts = TFLITE_DEFAULT_COPTS,
 )
 
 # TODO(ahentz): investigate dependency on gemm_support requiring usage of tf_copts.
@@ -131,6 +139,7 @@ cc_library(
     name = "framework",
     srcs = [
         "allocation.cc",
+        "core/subgraph.cc",
         "graph_info.cc",
         "interpreter.cc",
         "model.cc",
@@ -155,6 +164,7 @@ cc_library(
         "allocation.h",
         "context.h",
         "context_util.h",
+        "core/subgraph.h",
         "error_reporter.h",
         "graph_info.h",
         "interpreter.h",
@@ -165,7 +175,7 @@ cc_library(
         "optional_debug_tools.h",
         "stderr_reporter.h",
     ],
-    copts = tflite_copts(),
+    copts = tflite_copts() + TFLITE_DEFAULT_COPTS,
     linkopts = [
     ] + select({
         "//tensorflow:android": [
@@ -183,7 +193,7 @@ cc_library(
         ":string",
         ":util",
         "//tensorflow/lite/c:c_api_internal",
-        "//tensorflow/lite/core/api",
+        "//tensorflow/lite/core/api:api",
         "//tensorflow/lite/kernels:eigen_support",
         "//tensorflow/lite/kernels:gemm_support",
         "//tensorflow/lite/nnapi:nnapi_lib",
@@ -201,10 +211,10 @@ cc_library(
     name = "string_util",
     srcs = ["string_util.cc"],
     hdrs = ["string_util.h"],
-    copts = tflite_copts(),
+    copts = TFLITE_DEFAULT_COPTS,
     deps = [
-        ":framework",
         ":string",
+        "//tensorflow/lite/c:c_api_internal",
     ],
 )
 
@@ -215,6 +225,7 @@ cc_test(
     deps = [
         ":framework",
         ":string_util",
+        "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -244,10 +255,8 @@ cc_test(
     name = "graph_info_test",
     size = "small",
     srcs = ["graph_info_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":framework",
-        ":string_util",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
@@ -296,7 +305,12 @@ tf_cc_test(
     data = [
         "testdata/multi_add_flex.bin",
     ],
-    tags = ["no_windows"],  # TODO(b/116667551): No weak symbols with MSVC.
+    tags = [
+        "no_gpu",  # GPU + flex is not officially supported.
+        "no_windows",  # TODO(b/116667551): No weak symbols with MSVC.
+        "tflite_not_portable_android",
+        "tflite_not_portable_ios",
+    ],
     deps = [
         ":framework",
         "//tensorflow/lite/core/api",
@@ -312,7 +326,6 @@ cc_test(
     name = "mutable_op_resolver_test",
     size = "small",
     srcs = ["mutable_op_resolver_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":framework",
         "//tensorflow/lite/testing:util",
@@ -324,7 +337,7 @@ cc_library(
     name = "util",
     srcs = ["util.cc"],
     hdrs = ["util.h"],
-    copts = tflite_copts(),
+    copts = TFLITE_DEFAULT_COPTS + tflite_copts(),
     deps = [
         "//tensorflow/lite/c:c_api_internal",
     ],
@@ -334,27 +347,9 @@ cc_test(
     name = "util_test",
     size = "small",
     srcs = ["util_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":util",
-        "//tensorflow/lite/testing:util",
+        "//tensorflow/lite/c:c_api_internal",
         "@com_google_googletest//:gtest",
     ],
 )
-
-# Test the serialization of a model with optional tensors.
-
-# Model tests
-
-#cc_library(
-#    name = "models_test_utils",
-#    testonly = 1,
-#    hdrs = ["models/test_utils.h"],
-#    deps = select({
-#        "//tensorflow:android": [],
-#        "//conditions:default": [
-#            "@com_google_absl//absl/strings",
-#            "//tensorflow/core:test",
-#        ],
-#    }),
-#)
diff --git a/tensorflow/lite/build_def.bzl b/tensorflow/lite/build_def.bzl
index bc98dc57bc590f1dee4bb0aea3993e1ed20240b3..c17eddf47bc86c9537364117c302df38e390c8da 100644
--- a/tensorflow/lite/build_def.bzl
+++ b/tensorflow/lite/build_def.bzl
@@ -112,7 +112,8 @@ def tflite_jni_binary(
         linkshared = 1,
         linkstatic = 1,
         testonly = 0,
-        deps = []):
+        deps = [],
+        srcs = []):
     """Builds a jni binary for TFLite."""
     linkopts = linkopts + [
         "-Wl,--version-script",  # Export only jni functions & classes.
@@ -124,6 +125,7 @@ def tflite_jni_binary(
         linkshared = linkshared,
         linkstatic = linkstatic,
         deps = deps + [linkscript],
+        srcs = srcs,
         linkopts = linkopts,
         testonly = testonly,
     )
@@ -221,6 +223,7 @@ def json_to_tflite(name, src, out):
 # generated_test_models_failing().
 def generated_test_models():
     return [
+        "abs",
         "add",
         "arg_min_max",
         "avg_pool",
@@ -236,18 +239,21 @@ def generated_test_models():
         "equal",
         "exp",
         "expand_dims",
+        "fill",
         "floor",
         "floor_div",
         "floor_mod",
         "fully_connected",
         "fused_batch_norm",
         "gather",
+        "gather_with_constant",
         "global_batch_norm",
         "greater",
         "greater_equal",
         "sum",
         "l2norm",
         "l2_pool",
+        "leaky_relu",
         "less",
         "less_equal",
         "local_response_norm",
@@ -261,6 +267,7 @@ def generated_test_models():
         "maximum",
         "mean",
         "minimum",
+        "mirror_pad",
         "mul",
         "neg",
         "not_equal",
@@ -268,6 +275,7 @@ def generated_test_models():
         "pack",
         "pad",
         "padv2",
+        "placeholder_with_default",
         "prelu",
         "pow",
         "range",
@@ -290,17 +298,21 @@ def generated_test_models():
         "space_to_depth",
         "sparse_to_dense",
         "split",
+        "splitv",
         "sqrt",
         "square",
+        "squared_difference",
         "squeeze",
         "strided_slice",
         "strided_slice_1d_exhaustive",
+        "strided_slice_buggy",
         "sub",
         "tile",
         "topk",
         "transpose",
         "transpose_conv",
         "unpack",
+        "unroll_batch_matmul",
         "where",
         "zeros_like",
     ]
@@ -447,6 +459,7 @@ def gen_model_coverage_test(model_name, data, failure_type, tags):
         native.py_test(
             name = "model_coverage_test_%s_%s" % (model_name, target_op_sets.lower().replace(",", "_")),
             srcs = ["model_coverage_test.py"],
+            size = "large",
             main = "model_coverage_test.py",
             args = [
                 "--model_name=%s" % model_name,
@@ -457,7 +470,6 @@ def gen_model_coverage_test(model_name, data, failure_type, tags):
             tags = [
                 "no_oss",
                 "no_windows",
-                "notap",
             ] + tags,
             deps = [
                 "//tensorflow/lite/testing/model_coverage:model_coverage_lib",
diff --git a/tensorflow/lite/builtin_ops.h b/tensorflow/lite/builtin_ops.h
index b8c05f57bb59b5770ec2dca00d41e1ebd8ca23c4..f97d3ac4bf0b27cdd9b1f5ab7258a12036c29179 100644
--- a/tensorflow/lite/builtin_ops.h
+++ b/tensorflow/lite/builtin_ops.h
@@ -123,6 +123,11 @@ typedef enum {
   kTfLiteBuiltinFloorMod = 95,
   kTfLiteBuiltinRange = 96,
   kTfLiteBuiltinResizeNearestNeighbor = 97,
+  kTfLiteBuiltinLeakyRelu = 98,
+  kTfLiteBuiltinSquaredDifference = 99,
+  kTfLiteBuiltinMirrorPad = 100,
+  kTfLiteBuiltinAbs = 101,
+  kTfLiteBuiltinSplitV = 102,
 } TfLiteBuiltinOperator;
 
 #ifdef __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data.h b/tensorflow/lite/c/builtin_op_data.h
index 855983d60dfd18d3b35ced7fed93f8fa3dfca80a..6a5a027a9dc94bb2a11081276d269a7007c86cad 100644
--- a/tensorflow/lite/c/builtin_op_data.h
+++ b/tensorflow/lite/c/builtin_op_data.h
@@ -35,11 +35,21 @@ typedef enum {
   kTfLitePaddingValid,
 } TfLitePadding;
 
+typedef enum {
+  kTfLiteMirrorPaddingUnknown = 0,
+  kTfLiteMirrorPaddingReflect,
+  kTfLiteMirrorPaddingSymmetric,
+} TfLiteMirrorPaddingMode;
+
 typedef struct {
   int width;
   int height;
 } TfLitePaddingValues;
 
+typedef struct {
+  TfLiteMirrorPaddingMode mode;
+} TfLiteMirrorPaddingParams;
+
 // Possible fused activation functions.
 // TODO(aselle): rename to TfLiteActivation
 typedef enum {
@@ -267,6 +277,10 @@ typedef struct {
   int num_splits;
 } TfLiteSplitParams;
 
+typedef struct {
+  int num_splits;
+} TfLiteSplitVParams;
+
 typedef struct {
   // TODO(ahentz): We can't have dynamic data in this struct, at least not yet.
   // For now we will fix the maximum possible number of dimensions.
@@ -328,6 +342,10 @@ typedef struct {
   int axis;
 } TfLiteUnpackParams;
 
+typedef struct {
+  float alpha;
+} TfLiteLeakyReluParams;
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif  // __cplusplus
diff --git a/tensorflow/lite/c/builtin_op_data_test.cc b/tensorflow/lite/c/builtin_op_data_test.cc
index 0e33dcd8c8447df76ebb91aa6bad7d2a2035e2a6..4ce7c481e1c26e6fcfdaa680e9ca666b82968d53 100644
--- a/tensorflow/lite/c/builtin_op_data_test.cc
+++ b/tensorflow/lite/c/builtin_op_data_test.cc
@@ -63,6 +63,7 @@ TEST(IntArray, CanCompileStructs) {
   TfLiteTransposeParams transpose_params;
   TfLiteReducerParams reducer_params;
   TfLiteSplitParams split_params;
+  TfLiteSplitVParams split_v_params;
   TfLiteSqueezeParams squeeze_params;
   TfLiteStridedSliceParams strided_slice_params;
   TfLiteArgMaxParams arg_max_params;
diff --git a/tensorflow/lite/c/c_api_internal.c b/tensorflow/lite/c/c_api_internal.c
index b131f0677467b3504bda84d96a95707cb1587884..2923dbad4ef285c497ca2c84d86168954fe8ec99 100644
--- a/tensorflow/lite/c/c_api_internal.c
+++ b/tensorflow/lite/c/c_api_internal.c
@@ -59,7 +59,7 @@ void TfLiteIntArrayPrint(const char* s, TfLiteIntArray* a) {
   printf("]\n");
 }
 
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src) {
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src) {
   if (!src) return NULL;
   TfLiteIntArray* ret = TfLiteIntArrayCreate(src->size);
   if (ret) {
@@ -125,6 +125,8 @@ const char* TfLiteTypeGetName(TfLiteType type) {
       return "INT32";
     case kTfLiteUInt8:
       return "UINT8";
+    case kTfLiteInt8:
+      return "INT8";
     case kTfLiteInt64:
       return "INT64";
     case kTfLiteBool:
@@ -137,3 +139,14 @@ const char* TfLiteTypeGetName(TfLiteType type) {
   return "Unknown type";
 }
 
+TfLiteDelegate TfLiteDelegateCreate() {
+  TfLiteDelegate d = {
+      .data_ = NULL,
+      .Prepare = NULL,
+      .CopyFromBufferHandle = NULL,
+      .CopyToBufferHandle = NULL,
+      .FreeBufferHandle = NULL,
+      .flags = kTfLiteDelegateFlagsNone,
+  };
+  return d;
+}
diff --git a/tensorflow/lite/c/c_api_internal.h b/tensorflow/lite/c/c_api_internal.h
index e05fd19936e89c94f68f89a8268b9d3b4aeca9b6..1cd84eff5c436abb781c74d1ac287b709558133f 100644
--- a/tensorflow/lite/c/c_api_internal.h
+++ b/tensorflow/lite/c/c_api_internal.h
@@ -96,7 +96,7 @@ int TfLiteIntArrayEqualsArray(TfLiteIntArray* a, int b_size, int b_data[]);
 
 // Create a copy of an array passed as `src`.
 // You are expected to free memory with TfLiteIntArrayFree
-TfLiteIntArray* TfLiteIntArrayCopy(TfLiteIntArray* src);
+TfLiteIntArray* TfLiteIntArrayCopy(const TfLiteIntArray* src);
 
 // Free memory of array `v`.
 void TfLiteIntArrayFree(TfLiteIntArray* v);
@@ -179,6 +179,7 @@ typedef enum {
   kTfLiteBool = 6,
   kTfLiteInt16 = 7,
   kTfLiteComplex64 = 8,
+  kTfLiteInt8 = 9,
 } TfLiteType;
 
 // Return the name of a given type, for error reporting purposes.
@@ -203,6 +204,7 @@ typedef union {
   bool* b;
   int16_t* i16;
   TfLiteComplex64* c64;
+  int8_t* int8;
 } TfLitePtrUnion;
 
 // Memory allocation strategies. kTfLiteMmapRo is for read-only memory-mapped
@@ -486,19 +488,20 @@ typedef struct _TfLiteDelegate {
   // delegated subgraphs of the original graph.
   TfLiteStatus (*Prepare)(TfLiteContext* context, TfLiteDelegate* delegate);
 
-  // Copy the data from delegate buffer handle to raw memory.
-  // This can be null if the delegate doesn't use its own buffer.
+  // Copy the data from delegate buffer handle into raw memory of the given
+  // 'tensor'. This cannot be null. The delegate is allowed to allocate the raw
+  // bytes as long as it follows the rules for kTfLiteDynamic tensors.
   TfLiteStatus (*CopyFromBufferHandle)(TfLiteContext* context,
                                        TfLiteDelegate* delegate,
                                        TfLiteBufferHandle buffer_handle,
-                                       void* data, size_t size);
+                                       TfLiteTensor* tensor);
 
-  // Copy the data from raw memory to delegate buffer handle.
-  // This can be null if the delegate doesn't use its own buffer.
+  // Copy the data from raw memory of the given 'tensor' to delegate buffer
+  // handle. This can be null if the delegate doesn't use its own buffer.
   TfLiteStatus (*CopyToBufferHandle)(TfLiteContext* context,
                                      TfLiteDelegate* delegate,
                                      TfLiteBufferHandle buffer_handle,
-                                     void* data, size_t size);
+                                     TfLiteTensor* tensor);
 
   // Free the Delegate Buffer Handle. Note: This only frees the handle, but
   // this doesn't release the underlying resource (e.g. textures). The
@@ -511,6 +514,10 @@ typedef struct _TfLiteDelegate {
   int64_t flags;
 } TfLiteDelegate;
 
+// Build a 'null' delegate, with all the fields properly set to their default
+// values.
+TfLiteDelegate TfLiteDelegateCreate();
+
 // WARNING: This is an experimental interface that is subject to change.
 //
 // Currently, TfLiteDelegateParams has to be allocated in a way that it's
diff --git a/tensorflow/lite/c/c_api_internal_test.cc b/tensorflow/lite/c/c_api_internal_test.cc
index e21823c41f0b43e7395b19f241d6a628b8a78f41..acf0dfc5be8e233b642ccea42f72cbf6af2d4c5d 100644
--- a/tensorflow/lite/c/c_api_internal_test.cc
+++ b/tensorflow/lite/c/c_api_internal_test.cc
@@ -74,6 +74,7 @@ TEST(Types, TestTypeNames) {
   EXPECT_EQ(type_name(kTfLiteInt16), "INT16");
   EXPECT_EQ(type_name(kTfLiteInt32), "INT32");
   EXPECT_EQ(type_name(kTfLiteUInt8), "UINT8");
+  EXPECT_EQ(type_name(kTfLiteInt8), "INT8");
   EXPECT_EQ(type_name(kTfLiteInt64), "INT64");
   EXPECT_EQ(type_name(kTfLiteBool), "BOOL");
   EXPECT_EQ(type_name(kTfLiteComplex64), "COMPLEX64");
diff --git a/tensorflow/lite/core/api/flatbuffer_conversions.cc b/tensorflow/lite/core/api/flatbuffer_conversions.cc
index 8cd3faabb728097832796bb7c9d56e5f2e9632b0..c00a0a3a546b1b2b0167663b5f00c5e25e261f15 100644
--- a/tensorflow/lite/core/api/flatbuffer_conversions.cc
+++ b/tensorflow/lite/core/api/flatbuffer_conversions.cc
@@ -61,6 +61,9 @@ TfLiteStatus ConvertTensorType(TensorType tensor_type, TfLiteType* type,
     case TensorType_UINT8:
       *type = kTfLiteUInt8;
       break;
+    case TensorType_INT8:
+      *type = kTfLiteInt8;
+      break;
     case TensorType_INT64:
       *type = kTfLiteInt64;
       break;
@@ -503,6 +506,14 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_SPLIT_V: {
+      auto* params = allocator->AllocatePOD<TfLiteSplitParams>();
+      if (auto* schema_params = op->builtin_options_as_SplitVOptions()) {
+        params->num_splits = schema_params->num_splits();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
     case BuiltinOperator_SQUEEZE: {
       auto* params = allocator->AllocatePOD<TfLiteSqueezeParams>();
       if (auto* schema_params = op->builtin_options_as_SqueezeOptions()) {
@@ -617,8 +628,31 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
       *builtin_data = reinterpret_cast<void*>(params);
       break;
     }
+    case BuiltinOperator_LEAKY_RELU: {
+      TfLiteLeakyReluParams* params =
+          allocator->AllocatePOD<TfLiteLeakyReluParams>();
+      if (auto* leaky_relu_params = op->builtin_options_as_LeakyReluOptions()) {
+        params->alpha = leaky_relu_params->alpha();
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
+    case BuiltinOperator_MIRROR_PAD: {
+      TfLiteMirrorPaddingParams* params =
+          allocator->AllocatePOD<TfLiteMirrorPaddingParams>();
+      auto* mirror_pad_params = op->builtin_options_as_MirrorPadOptions();
+      if (mirror_pad_params != nullptr) {
+        params->mode =
+            mirror_pad_params->mode() == tflite::MirrorPadMode_REFLECT
+                ? TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect
+                : TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingSymmetric;
+      }
+      *builtin_data = reinterpret_cast<void*>(params);
+      break;
+    }
 
     // Below are the ops with no builtin_data strcture.
+    case BuiltinOperator_ABS:
     case BuiltinOperator_BATCH_TO_SPACE_ND:
     // TODO(aselle): Implement call in BuiltinOptions, but nullptrs are
     // ok for now, since there is no call implementation either.
@@ -668,6 +702,7 @@ TfLiteStatus ParseOpData(const Operator* op, BuiltinOperator op_type,
     case BuiltinOperator_FILL:
     case BuiltinOperator_FLOOR_MOD:
     case BuiltinOperator_RANGE:
+    case BuiltinOperator_SQUARED_DIFFERENCE:
       break;
   }
   return kTfLiteOk;
diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc
new file mode 100644
index 0000000000000000000000000000000000000000..90361faeae3c085fd4bd73a22b64635ce4b2969e
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.cc
@@ -0,0 +1,970 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/core/subgraph.h"
+#include "tensorflow/lite/arena_planner.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/context_util.h"
+#include "tensorflow/lite/graph_info.h"
+#include "tensorflow/lite/nnapi_delegate.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+
+namespace tflite {
+
+namespace {
+TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
+                           const TfLiteRegistration& registration,
+                           int node_index, const char* message) {
+  context->ReportError(
+      context, "Node number %d (%s) %s.\n", node_index,
+      registration.custom_name
+          ? registration.custom_name
+          : EnumNameBuiltinOperator(
+                static_cast<BuiltinOperator>(registration.builtin_code)),
+      message);
+  return kTfLiteError;
+}
+
+// Stub method which returns kTfLiteError when the function is forbidden.
+// We're registrating this function to several different function to save
+// compiled binary size. Please note the restrictions:
+// * The type of first parameter have to be `TfLiteContext*`.
+// * All paramteters must be trivailly destructible. (E.g. No C++ class)
+TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
+  context->ReportError(context,
+                       "The function is forbidden if not calling in delegate.");
+  return kTfLiteError;
+}
+
+// Set the ForbiddenContextFunction to a compatible function pointer.
+template <typename FunctionType>
+void SetForbiddenContextFunction(FunctionType* func) {
+  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
+}
+
+// Returns true if at least one tensor in the given list is kTfLiteDynamic.
+template <typename TensorIntArray>
+bool HasDynamicTensorImpl(const TfLiteContext& context,
+                          const TensorIntArray& int_array) {
+  for (int i : int_array) {
+    const TfLiteTensor& tensor = context.tensors[i];
+    if (tensor.allocation_type == kTfLiteDynamic) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool HasDynamicTensor(const TfLiteContext& context,
+                      const TfLiteIntArray* int_array) {
+  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
+}
+
+}  // namespace
+
+// A trivial implementation of GraphInfo around the Interpreter.
+// NOTE: this interpreter info represents the subset of the
+// graph that is executed according to execution plan. Thus,
+// the indices are execution plan indices rather than raw node
+// indices.
+class InterpreterInfo : public GraphInfo {
+ public:
+  explicit InterpreterInfo(Subgraph* subgraph) : subgraph_(subgraph) {}
+
+  size_t num_tensors() const override { return subgraph_->tensors().size(); }
+  TfLiteTensor* tensor(size_t index) override {
+    return &subgraph_->tensors()[index];
+  }
+  size_t num_nodes() const override {
+    return subgraph_->execution_plan().size();
+  }
+  const TfLiteNode& node(size_t index) const override {
+    int node_index = subgraph_->execution_plan()[index];
+    return subgraph_->nodes_and_registration()[node_index].first;
+  }
+  const std::vector<int>& inputs() const override {
+    return subgraph_->inputs();
+  }
+  const std::vector<int>& outputs() const override {
+    return subgraph_->outputs();
+  }
+  const std::vector<int>& variables() const override {
+    return subgraph_->variables();
+  }
+
+ public:
+  Subgraph* subgraph_;
+};
+
+Subgraph::Subgraph(ErrorReporter* error_reporter,
+                   TfLiteExternalContext** external_contexts,
+                   std::vector<std::unique_ptr<Subgraph>>* subgraphs)
+    : context_(&owned_context_),
+      error_reporter_(error_reporter),
+      next_execution_plan_index_to_prepare_(0),
+      external_contexts_(external_contexts),
+      subgraphs_(subgraphs) {
+  context_->impl_ = static_cast<void*>(this);
+  context_->ResizeTensor = ResizeTensor;
+  context_->ReportError = ReportErrorC;
+  context_->AddTensors = AddTensors;
+  context_->tensors = nullptr;
+  context_->tensors_size = 0;
+  context_->allow_fp32_relax_to_fp16 = false;
+  context_->recommended_num_threads = -1;
+  context_->GetExternalContext = GetExternalContext;
+  context_->SetExternalContext = SetExternalContext;
+
+  // Reserve some space for the tensors to avoid excessive resizing.
+  tensors_.reserve(kTensorsReservedCapacity);
+  nodes_and_registration().reserve(kTensorsReservedCapacity);
+  // Invalid to call these these except from TfLiteDelegate
+  SwitchToKernelContext();
+}
+
+Subgraph::~Subgraph() {
+  for (auto& node_and_reg : nodes_and_registration_) {
+    TfLiteNode& node = node_and_reg.first;
+    TfLiteIntArrayFree(node.inputs);
+    TfLiteIntArrayFree(node.outputs);
+    TfLiteIntArrayFree(node.temporaries);
+    if (node.builtin_data) free(node.builtin_data);
+    OpFree(node_and_reg.second, node.user_data);
+    node.builtin_data = nullptr;
+  }
+
+  for (size_t i = 0; i < context_->tensors_size; i++) {
+    TfLiteTensor* tensor = &context_->tensors[i];
+    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
+        tensor->delegate->FreeBufferHandle != nullptr) {
+      tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
+                                         &tensor->buffer_handle);
+    }
+    TfLiteTensorFree(tensor);
+  }
+}
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteContext* context, TfLiteRegistration registration,
+    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->ReplaceNodeSubsetsWithDelegateKernels(registration, nodes_to_replace,
+                                              delegate);
+}
+
+namespace {
+
+// Copy a std::vector<int> to an existing TfLiteIntArray.
+// This is a low-level data manipulation function, and it's caller's
+// responsibility to ensure TfLiteIntArray has enough size.
+void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
+                                TfLiteIntArray* arr) {
+  arr->size = vec.size();
+  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
+}
+
+// This function allocates a continuous memory space that contains a
+// TfLiteDelegateParams followed by a several TfLiteIntArray.
+// When calling `free` at TfLiteDelegateParams*, all the allocated space
+// will be freed together.
+//
+// +-----------------------------------+
+// | TfLiteDelegateParams              |
+// | TfLiteDelegate* delegate;         |
+// | TfLiteIntArray* nodes_to_replace; |--\
+// | TfLiteIntArray* input_tensors;    |--+--\
+// | TfLiteIntArray* output_tensors;   |--+--+--\
+// +-----------------------------------+  |  |  |
+// | TfLiteIntArray (variable size)    |<-/  |  |
+// +-----------------------------------+     |  |
+// | TfLiteIntArray (variable size)    |<----/  |
+// +-----------------------------------+        |
+// | TfLiteIntArray (variable size)    |<-------/
+// +-----------------------------------+
+TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
+                                           const NodeSubset& node_subset) {
+  // Step 1: Calculate the allocation size.
+  int allocation_size = sizeof(TfLiteDelegateParams);
+
+  int nodes_to_replace_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.nodes.size());
+  allocation_size += nodes_to_replace_size;
+
+  int input_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.input_tensors.size());
+  allocation_size += input_tensors_size;
+
+  int output_tensors_size =
+      TfLiteIntArrayGetSizeInBytes(node_subset.output_tensors.size());
+  allocation_size += output_tensors_size;
+
+  // Step 2: Allocate the memory.
+  // Use `char*` for conveniently step through the allocated space by bytes.
+  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
+
+  // Step 3: Fill all data structures structures.
+  TfLiteDelegateParams* params =
+      reinterpret_cast<TfLiteDelegateParams*>(allocation);
+  params->delegate = delegate;
+  allocation += sizeof(TfLiteDelegateParams);
+
+  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.nodes, params->nodes_to_replace);
+  allocation += nodes_to_replace_size;
+
+  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.input_tensors, params->input_tensors);
+  allocation += input_tensors_size;
+
+  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
+  CopyVectorToTfLiteIntArray(node_subset.output_tensors,
+                             params->output_tensors);
+  allocation += output_tensors_size;
+
+  return params;
+}
+
+}  // namespace
+
+TfLiteStatus Subgraph::ReplaceNodeSubsetsWithDelegateKernels(
+    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+    TfLiteDelegate* delegate) {
+  // Annotate the registration as DELEGATE op.
+  registration.builtin_code = BuiltinOperator_DELEGATE;
+
+  // Analyze the graph to find all independent node_subsets that are either
+  // fully not-this-delegate or this-delegate computation.
+  InterpreterInfo info(this);
+  std::vector<NodeSubset> node_subsets;
+  PartitionGraphIntoIndependentNodeSubsets(&info, nodes_to_replace,
+                                           &node_subsets);
+
+  execution_plan_.clear();
+
+  for (auto& node_subset : node_subsets) {
+    // Subsets calimed by the delegate should have a "macro" op created, the
+    // other node_subsets (kTfNonPartition) just have their nodes added back to
+    // the execution plan.
+    switch (node_subset.type) {
+      case NodeSubset::kTfNonPartition:
+        for (auto it = node_subset.nodes.begin(); it != node_subset.nodes.end();
+             ++it) {
+          execution_plan_.push_back(*it);
+        }
+        break;
+      case NodeSubset::kTfPartition: {
+        int node_index;
+
+        TfLiteDelegateParams* params =
+            CreateDelegateParams(delegate, node_subset);
+        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
+            node_subset.input_tensors, node_subset.output_tensors, nullptr, 0,
+            params, &registration, &node_index));
+
+        // Initialize the output tensors's delegate-related fields.
+        for (int tensor_index : node_subset.output_tensors) {
+          TfLiteTensor* tensor = &tensors_[tensor_index];
+          TF_LITE_ENSURE(context_, tensor->delegate == nullptr ||
+                                       tensor->delegate == delegate);
+          tensor->delegate = delegate;
+        }
+
+        // Associate the node with the delegate.
+        TfLiteNode* node = &nodes_and_registration_[node_index].first;
+        node->delegate = delegate;
+      } break;
+      case NodeSubset::kTfUnexplored:
+        return kTfLiteError;
+        break;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    TfLiteExternalContextType type) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    return external_contexts_[type];
+  }
+  return nullptr;
+}
+
+TfLiteExternalContext* Subgraph::GetExternalContext(
+    struct TfLiteContext* context, TfLiteExternalContextType type) {
+  return static_cast<Subgraph*>(context->impl_)->GetExternalContext(type);
+}
+
+void Subgraph::SetExternalContext(TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
+    external_contexts_[type] = ctx;
+  }
+}
+
+void Subgraph::SetExternalContext(struct TfLiteContext* context,
+                                  TfLiteExternalContextType type,
+                                  TfLiteExternalContext* ctx) {
+  return static_cast<Subgraph*>(context->impl_)->SetExternalContext(type, ctx);
+}
+
+// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
+// this memory and it is only guaranteed to exist during the invocation of the
+// delegate prepare.
+TfLiteStatus Subgraph::GetExecutionPlan(TfLiteIntArray** execution_plan) {
+  // TODO(aselle): Do not make a copy here
+  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
+  *execution_plan = plan_cache_.get();
+  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
+                "TfLiteIntArray and execution_plan do not contain same type.");
+  std::memcpy(plan_cache_->data, execution_plan_.data(),
+              sizeof(plan_cache_->data[0]) * execution_plan_.size());
+  return kTfLiteOk;
+}
+
+// WARNING: This is an experimental interface that is subject to change.
+// Entry point for C node plugin API to get the execution plan
+TfLiteStatus Subgraph::GetExecutionPlan(struct TfLiteContext* context,
+                                        TfLiteIntArray** execution_plan) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetExecutionPlan(execution_plan);
+}
+
+TfLiteStatus Subgraph::SetInputs(std::vector<int> inputs) {
+  TF_LITE_ENSURE_OK(&context_,
+                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
+  inputs_ = std::move(inputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetOutputs(std::vector<int> outputs) {
+  TF_LITE_ENSURE_OK(
+      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
+  outputs_ = std::move(outputs);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetVariables(std::vector<int> variables) {
+  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
+                                                  variables.size()));
+  variables_ = std::move(variables);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::CheckTensorIndices(const char* label, const int* indices,
+                                          int length) {
+  // Making sure kOptionalTensor is not re-defined to something other than -1.
+  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
+
+  for (int i = 0; i < length; i++) {
+    int index = indices[i];
+    // Continue if index == kOptionalTensor before additional comparisons below,
+    // size_t(-1) is always >= context_tensors_size.
+    if (index == kOptionalTensor) {
+      continue;
+    }
+    if (index < 0 || static_cast<size_t>(index) >= context_->tensors_size) {
+      ReportError("Invalid tensor index %d in %s\n", index, label);
+      consistent_ = false;
+      return kTfLiteError;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::BytesRequired(TfLiteType type, const int* dims,
+                                     size_t dims_size, size_t* bytes) {
+  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
+  // MultiplyWithoutOverflow.
+  TF_LITE_ENSURE(context_, bytes != nullptr);
+  size_t count = 1;
+  for (int k = 0; k < dims_size; k++) count *= dims[k];
+  switch (type) {
+    case kTfLiteFloat32:
+      *bytes = sizeof(float) * count;
+      break;
+    case kTfLiteInt16:
+      *bytes = sizeof(int16_t) * count;
+      break;
+    case kTfLiteInt32:
+      *bytes = sizeof(int32_t) * count;
+      break;
+    case kTfLiteUInt8:
+      *bytes = sizeof(uint8_t) * count;
+      break;
+    case kTfLiteInt64:
+      *bytes = sizeof(int64_t) * count;
+      break;
+    case kTfLiteBool:
+      *bytes = sizeof(bool) * count;
+      break;
+    case kTfLiteComplex64:
+      *bytes = sizeof(std::complex<float>) * count;
+      break;
+    case kTfLiteInt8:
+      *bytes = sizeof(int8_t) * count;
+      break;
+    default:
+      ReportError(
+          "Only float32, int8, int16, int32, int64, uint8, bool, complex64 "
+          "supported currently.");
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AllocateTensors() {
+  if (!consistent_) {
+    ReportError("AllocateTensors() called on inconsistent model.");
+    return kTfLiteError;
+  }
+
+  // Explicit (re)allocation is necessary if nodes have been changed or tensors
+  // have been resized. For inputs marked as dynamic, we can't short-circuit the
+  // allocation as the client may have done the resize manually.
+  if (state_ != kStateUninvokable &&
+      !HasDynamicTensorImpl(*context_, inputs())) {
+    return kTfLiteOk;
+  }
+
+  next_execution_plan_index_to_prepare_ = 0;
+  if (memory_planner_) {
+    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
+  }
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+
+  state_ = kStateInvokable;
+
+  // Reset the variable tensors to zero after (re)allocating the tensors.
+  // Developers shouldn't rely on the side effect of this function to reset
+  // variable tesnsors. They should call `ResetVariableTensors` directly
+  // instead.
+  ResetVariableTensors();
+
+  return kTfLiteOk;
+}
+
+// TODO(ycling): Support non-zero default values.
+TfLiteStatus Subgraph::ResetVariableTensors() {
+  for (auto& tensor : tensors_) {
+    if (!tensor.is_variable) {
+      continue;
+    }
+
+    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
+    // allocated after the initial `PrepareOpsAndTensors()` is called.
+    TF_LITE_ENSURE_EQ(context_, tensor.allocation_type,
+                      kTfLiteArenaRwPersistent);
+    TF_LITE_ENSURE(context_, tensor.data.raw != nullptr);
+
+    memset(tensor.data.raw, 0, tensor.bytes);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddNodeWithParameters(
+    const std::vector<int>& inputs, const std::vector<int>& outputs,
+    const char* init_data, size_t init_data_size, void* builtin_data,
+    const TfLiteRegistration* registration, int* node_index) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("AddNodeWithParameters is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  state_ = kStateUninvokable;
+
+  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
+                                                              free);
+
+  TF_LITE_ENSURE_OK(context_, CheckTensorIndices("node inputs", inputs.data(),
+                                                 inputs.size()));
+  TF_LITE_ENSURE_OK(
+      &context_,
+      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
+
+  int new_node_index = nodes_and_registration_.size();
+  if (node_index) *node_index = new_node_index;
+  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
+  auto& node_and_reg = nodes_and_registration_.back();
+  TfLiteNode& node = node_and_reg.first;
+  if (node.inputs) TfLiteIntArrayFree(node.inputs);
+  if (node.outputs) TfLiteIntArrayFree(node.outputs);
+  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
+
+  // NOTE, here we are not using move semantics yet, since our internal
+  // representation isn't std::vector, but in the future we would like to avoid
+  // copies, so we want the interface to take r-value references now.
+  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
+  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
+  node.temporaries = TfLiteIntArrayCreate(0);
+  if (init_data) {
+    node.user_data = OpInit(*registration, init_data, init_data_size);
+  } else {
+    node.user_data =
+        OpInit(*registration,
+               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
+  }
+
+  node.builtin_data = builtin_data_deleter.release();
+  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
+  // properly for nodes generated by ReplaceNodeSubsetsWithDelegateKernels.
+
+  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
+    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
+    // `Operator` table is passed in.
+    node.custom_initial_data = init_data;
+    node.custom_initial_data_size = init_data_size;
+  } else {
+    node.custom_initial_data = nullptr;
+    node.custom_initial_data_size = 0;
+  }
+
+  node.delegate = nullptr;
+  node_and_reg.second = *registration;
+  execution_plan_.push_back(new_node_index);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeInputTensor(int tensor_index,
+                                         const std::vector<int>& dims) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError("ResizeInputTensor is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
+  // checks by casting to unsigned for efficiency. Profile before doing this.
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  TfLiteTensor* tensor = &context_->tensors[tensor_index];
+
+  // Short-circuit the state change if the dimensions don't change, avoiding
+  // unnecessary (re)allocations.
+  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
+    return kTfLiteOk;
+  }
+
+  state_ = kStateUninvokable;
+  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
+}
+
+TfLiteStatus Subgraph::PrepareOpsStartingAt(
+    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
+  if (first_execution_plan_index == 0) {
+    has_dynamic_tensors_ = false;
+  }
+  for (int execution_plan_index = first_execution_plan_index;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    EnsureTensorsVectorCapacity();
+    if (OpPrepare(registration, &node) == kTfLiteError) {
+      return ReportOpError(context_, node, registration, node_index,
+                           "failed to prepare");
+    }
+
+    *last_execution_plan_index_prepared = execution_plan_index;
+
+    // Discontinue if the node has dynamic outputs. Note that we don't
+    // stop for dynamic temporary tensors since they won't affect the
+    // sizes of other tensors in the graph.
+    if (HasDynamicTensor(*context_, node.outputs)) {
+      has_dynamic_tensors_ = true;
+      return kTfLiteOk;
+    }
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::PrepareOpsAndTensors() {
+  if (!memory_planner_) {
+    memory_planner_.reset(new ArenaPlanner(
+        context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
+        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
+    memory_planner_->PlanAllocations();
+  }
+
+  int last_exec_plan_index_prepared = 0;
+
+  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
+      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
+  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
+      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
+
+  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::Invoke() {
+  if (!consistent_) {
+    ReportError("Invoke called on model that is not consistent.");
+    return kTfLiteError;
+  }
+
+  TfLiteStatus status = kTfLiteOk;
+  if (state_ == kStateUninvokable) {
+    ReportError("Invoke called on model that is not ready.");
+    return kTfLiteError;
+  }
+
+  if (nnapi_delegate_) {
+    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
+      TF_LITE_ENSURE_OK(context_, nnapi_delegate_->Invoke(this));
+      return kTfLiteOk;
+    } else {
+      // TODO(aselle): In the future, we would like this to be an
+      // automatic tflite CPU fallback.
+      ReportError(
+          "NNAPI was requested, but dependent sized tensors "
+          "being used.\n");
+      return kTfLiteError;
+    }
+  }
+
+  // Invocations are always done in node order.
+  // Note that calling Invoke repeatedly will cause the original memory plan to
+  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
+  // called.
+  for (int execution_plan_index = 0;
+       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
+    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
+      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
+      TF_LITE_ENSURE(context_, next_execution_plan_index_to_prepare_ >=
+                                   execution_plan_index);
+    }
+    int node_index = execution_plan_[execution_plan_index];
+    TfLiteNode& node = nodes_and_registration_[node_index].first;
+    const TfLiteRegistration& registration =
+        nodes_and_registration_[node_index].second;
+    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
+
+    // TODO(ycling): This is an extra loop through inputs to check if the data
+    // need to be copied from Delegate buffer to raw memory, which is often not
+    // needed. We may want to cache this in prepare to know if this needs to be
+    // done for a node or not.
+    for (int i = 0; i < node.inputs->size; ++i) {
+      int tensor_index = node.inputs->data[i];
+      if (tensor_index == kOptionalTensor) {
+        continue;
+      }
+      TfLiteTensor* tensor = &tensors_[tensor_index];
+      if (tensor->delegate && tensor->delegate != node.delegate &&
+          tensor->data_is_stale) {
+        EnsureTensorDataIsReadable(tensor_index);
+      }
+    }
+
+    EnsureTensorsVectorCapacity();
+    tensor_resized_since_op_invoke_ = false;
+    if (OpInvoke(registration, &node) == kTfLiteError) {
+      status = ReportOpError(context_, node, registration, node_index,
+                             "failed to invoke");
+    }
+
+    // Force execution prep for downstream ops if the latest op triggered the
+    // resize of a dynamic tensor.
+    if (tensor_resized_since_op_invoke_ &&
+        HasDynamicTensor(*context_, node.outputs)) {
+      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
+    }
+  }
+
+  return status;
+}
+
+TfLiteStatus Subgraph::ResizeTensor(TfLiteContext* context,
+                                    TfLiteTensor* tensor,
+                                    TfLiteIntArray* new_size) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function ResizeTensorImpl
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->ResizeTensorImpl(tensor, new_size);
+}
+
+void Subgraph::ReportErrorImpl(const char* format, va_list args) {
+  error_reporter_->Report(format, args);
+}
+
+void Subgraph::ReportErrorC(TfLiteContext* context, const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+// Entry point for C node plugin API to report an error.
+void Subgraph::ReportError(const char* format, ...) {
+  va_list args;
+  va_start(args, format);
+  auto* f = static_cast<Subgraph*>(context_->impl_);
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Subgraph to call into the member function ReportErrorImpl
+  // (this function is static).
+  f->ReportErrorImpl(format, args);
+  va_end(args);
+}
+
+TfLiteStatus Subgraph::AddTensors(int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  const size_t base_index = tensors_.size();
+  if (first_new_tensor_index) *first_new_tensor_index = base_index;
+  tensors_.resize(tensors_.size() + tensors_to_add);
+  for (size_t i = base_index; i < tensors_.size(); i++) {
+    memset(&tensors_[i], 0, sizeof(tensors_[i]));
+    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
+  }
+  context_->tensors = tensors_.data();
+  context_->tensors_size = tensors_.size();
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::AddTensors(TfLiteContext* context, int tensors_to_add,
+                                  int* first_new_tensor_index) {
+  // Note here that context->impl_ is recovering the this pointer for an
+  // instance of Interpreter to call into the member function AddTensors
+  // (this function is static).
+  return static_cast<Subgraph*>(context->impl_)
+      ->AddTensors(tensors_to_add, first_new_tensor_index);
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
+  TF_LITE_ENSURE(context_, node_index >= 0);
+  auto nodes_size = nodes_and_registration_.size();
+  TF_LITE_ENSURE(context_, static_cast<size_t>(node_index) < nodes_size);
+  TF_LITE_ENSURE(context_, node != nullptr && registration != nullptr);
+  auto& node_and_reg = nodes_and_registration_[node_index];
+  *node = &node_and_reg.first;
+  *registration = &node_and_reg.second;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::GetNodeAndRegistration(
+    struct TfLiteContext* context, int node_index, TfLiteNode** node,
+    TfLiteRegistration** registration) {
+  return static_cast<Subgraph*>(context->impl_)
+      ->GetNodeAndRegistration(node_index, node, registration);
+}
+
+TfLiteStatus Subgraph::SetTensorParametersReadOnly(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
+    size_t bytes, const Allocation* allocation) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  // For most tensors we know exactly how much memory is necessary so we can
+  // ensure the buffer is large enough. However, we need to skip string tensors
+  // because their sizes change with the contents of the individual strings.
+  if (type != kTfLiteString) {
+    size_t required_bytes;
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+    TF_LITE_ENSURE_EQ(context_, required_bytes, bytes);
+  }
+
+  TfLiteTensor& tensor = context_->tensors[tensor_index];
+  if (type == tensor.type &&
+      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
+    // Fast path which does not invalidate the invokable property.
+    TfLiteTensorDataFree(&tensor);
+    tensor.data.raw = const_cast<char*>(buffer);
+    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
+    tensor.params = quantization;
+    tensor.allocation_type = kTfLiteMmapRo;
+    tensor.allocation = allocation;
+  } else {
+    state_ = kStateUninvokable;
+    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                      quantization, const_cast<char*>(buffer), bytes,
+                      kTfLiteMmapRo, allocation, false, &tensor);
+  }
+  return kTfLiteOk;
+}
+
+// Set description of inputs/outputs/data/fptrs for node `node_index`.
+// This variant assumes an external buffer has been allocated of size
+// bytes. The lifetime of buffer must be ensured to be greater or equal
+// to Interpreter.
+TfLiteStatus Subgraph::SetTensorParametersReadWrite(
+    int tensor_index, TfLiteType type, const char* name, const size_t rank,
+    const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
+  if (state_ == kStateInvokableAndImmutable) {
+    ReportError(
+        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
+    return kTfLiteError;
+  }
+  TF_LITE_ENSURE(context_,
+                 tensor_index < context_->tensors_size && tensor_index >= 0);
+  size_t required_bytes = 0;
+  if (type != kTfLiteString) {
+    // These types will be allocated in our arena so we need to record how
+    // many bytes we will need based on the dimensions. String tensors are
+    // allocated dynamically and we can't know ahead of time how much space
+    // they will require.
+    TF_LITE_ENSURE_OK(context_,
+                      BytesRequired(type, dims, rank, &required_bytes));
+  }
+
+  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
+  if (type == kTfLiteString) {
+    if (is_variable) {
+      // We don't have a real use case for string variable tensor.
+      ReportError("String variable tensor isn't supported.");
+      return kTfLiteError;
+    }
+    allocation_type = kTfLiteDynamic;
+  } else if (is_variable) {
+    allocation_type = kTfLiteArenaRwPersistent;
+  }
+
+  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
+                    quantization,
+                    /*buffer=*/nullptr, required_bytes, allocation_type,
+                    nullptr, is_variable, &context_->tensors[tensor_index]);
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::SetExecutionPlan(const std::vector<int>& new_plan) {
+  for (int node_index : new_plan) {
+    TF_LITE_ENSURE(context_, node_index >= 0 &&
+                                 node_index < nodes_and_registration_.size());
+  }
+  execution_plan_ = new_plan;
+  return kTfLiteOk;
+}
+
+TfLiteStatus Subgraph::ResizeTensorImpl(TfLiteTensor* tensor,
+                                        TfLiteIntArray* new_size) {
+  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
+  if (tensor->allocation_type == kTfLiteArenaRw ||
+      tensor->allocation_type == kTfLiteDynamic ||
+      tensor->allocation_type == kTfLiteArenaRwPersistent) {
+    tensor_resized_since_op_invoke_ |=
+        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
+    if (tensor->type != kTfLiteString) {
+      size_t bytesRequired;
+      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
+                                          new_size->size, &bytesRequired);
+      if (status != kTfLiteOk) {
+        TfLiteIntArrayFree(new_size);
+        return kTfLiteError;
+      }
+
+      // Realloc space for kTfLiteDynamic tensors.
+      TfLiteTensorRealloc(bytesRequired, tensor);
+      tensor->bytes = bytesRequired;
+    }
+    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
+    tensor->dims = new_size;
+
+    if (tensor->allocation_type != kTfLiteDynamic) {
+      tensor->data.raw = nullptr;
+    }
+  } else {
+    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
+    // of fixed size.
+    TfLiteIntArrayFree(new_size);
+    ReportError("Attempting to resize a fixed-size tensor.");
+    return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+void Subgraph::UseNNAPI(bool enable) {
+  // TODO(aselle): This is a workaround for finding if NNAPI exists.
+  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
+  // prefixed.
+  if (!NNAPIDelegate::IsSupported()) enable = false;
+  if (!enable) {
+    nnapi_delegate_.reset();
+  } else if (!nnapi_delegate_) {
+    nnapi_delegate_.reset(new NNAPIDelegate);
+  }
+}
+
+void Subgraph::SwitchToDelegateContext() {
+  context_->GetNodeAndRegistration = GetNodeAndRegistration;
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      ReplaceNodeSubsetsWithDelegateKernels;
+  context_->GetExecutionPlan = GetExecutionPlan;
+}
+
+void Subgraph::SwitchToKernelContext() {
+  context_->GetNodeAndRegistration = [](struct TfLiteContext* context,
+                                        int node_index, TfLiteNode** node,
+                                        TfLiteRegistration** registration) {
+    return ForbiddenContextFunction(context);
+  };
+  context_->ReplaceNodeSubsetsWithDelegateKernels =
+      [](TfLiteContext* context, TfLiteRegistration registration,
+         const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
+        return ForbiddenContextFunction(context);
+      };
+  context_->GetExecutionPlan = [](struct TfLiteContext* context,
+                                  TfLiteIntArray**) {
+    return ForbiddenContextFunction(context);
+  };
+}
+
+TfLiteStatus Subgraph::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    int last_execution_plan_index_prepared;
+    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
+                                     0, &last_execution_plan_index_prepared));
+    if (has_dynamic_tensors_) {
+      ReportError(
+          "Attempting to use a delegate that only supports static-sized "
+          "tensors with a graph that has dynamic-sized tensors.");
+      return kTfLiteError;
+    }
+  }
+
+  // TODO(aselle): Consider if it is worth storing pointers to delegates.
+  // Setup additional context interface.
+  SwitchToDelegateContext();
+
+  TfLiteStatus status = delegate->Prepare(context_, delegate);
+
+  // Remove additional context info.
+  SwitchToKernelContext();
+
+  TF_LITE_ENSURE_OK(context_, status);
+
+  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
+    // Reset the state to force tensor/op reallocation.
+    state_ = kStateUninvokable;
+    TF_LITE_ENSURE_OK(context_, AllocateTensors());
+    TF_LITE_ENSURE_EQ(context_, state_, kStateInvokable);
+    // After using a delegate which doesn't support dynamic tensors, make the
+    // entire graph immutable.
+    state_ = kStateInvokableAndImmutable;
+  }
+
+  return status;
+}
+
+}  // namespace tflite
diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a7c3a7c322e55500d9edb7d7c1b9763e9a76e88
--- /dev/null
+++ b/tensorflow/lite/core/subgraph.h
@@ -0,0 +1,501 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+#define TENSORFLOW_LITE_CORE_SUBGRAPH_H_
+
+#include <cstdlib>
+#include <vector>
+
+#include "tensorflow/lite/allocation.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/memory_planner.h"
+#include "tensorflow/lite/profiling/profiler.h"
+#include "tensorflow/lite/util.h"
+
+namespace tflite {
+
+// Forward declare since NNAPIDelegate uses Interpreter.
+class NNAPIDelegate;
+
+class Subgraph {
+ public:
+  friend class Interpreter;
+
+  Subgraph(ErrorReporter* error_reporter,
+           TfLiteExternalContext** external_contexts,
+           std::vector<std::unique_ptr<Subgraph>>* subgraphs);
+
+  Subgraph(const Subgraph&) = delete;
+
+  // Subgraphs should be movable but not copyable.
+  Subgraph(Subgraph&&) = default;
+  Subgraph& operator=(const Subgraph&) = delete;
+  virtual ~Subgraph();
+
+  // Provide a list of tensor indexes that are inputs to the model.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetInputs(std::vector<int> inputs);
+
+  // Provide a list of tensor indexes that are outputs to the model
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetOutputs(std::vector<int> outputs);
+
+  // Provide a list of tensor indexes that are variable tensors.
+  // Each index is bound check and this modifies the consistent_ flag of the
+  // interpreter.
+  TfLiteStatus SetVariables(std::vector<int> variables);
+
+  // Adds a node with the given parameters and returns the index of the new
+  // node in `node_index` (optionally). Interpreter will take ownership of
+  // `builtin_data` and destroy it with `free`. Ownership of 'init_data'
+  // remains with the caller.
+  TfLiteStatus AddNodeWithParameters(const std::vector<int>& inputs,
+                                     const std::vector<int>& outputs,
+                                     const char* init_data,
+                                     size_t init_data_size, void* builtin_data,
+                                     const TfLiteRegistration* registration,
+                                     int* node_index);
+
+  // Adds `tensors_to_add` tensors, preserving pre-existing Tensor entries.
+  // The value pointed to by `first_new_tensor_index` will be set to the
+  // index of the first new tensor if `first_new_tensor_index` is non-null.
+  TfLiteStatus AddTensors(int tensors_to_add, int* first_new_tensor_index);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadOnly(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization,
+      const char* buffer, size_t bytes, const Allocation* allocation);
+
+  // Set description of inputs/outputs/data/fptrs for node `node_index`.
+  // This variant assumes an external buffer has been allocated of size
+  // bytes. The lifetime of buffer must be ensured to be greater or equal
+  // to Interpreter.
+  TfLiteStatus SetTensorParametersReadWrite(
+      int tensor_index, TfLiteType type, const char* name, const size_t rank,
+      const int* dims, TfLiteQuantizationParams quantization, bool is_variable);
+
+  // WARNING: Experimental interface, subject to change
+  // Overrides execution plan. This bounds checks indices sent in.
+  TfLiteStatus SetExecutionPlan(const std::vector<int>& new_plan);
+
+  // Get a mutable tensor data structure.
+  // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
+  // read/write access to structure
+  TfLiteTensor* tensor(int tensor_index) {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Get an immutable tensor data structure.
+  const TfLiteTensor* tensor(int tensor_index) const {
+    if (tensor_index < 0 ||
+        static_cast<size_t>(tensor_index) >= context_->tensors_size) {
+      return nullptr;
+    }
+    return &context_->tensors[tensor_index];
+  }
+
+  // Read only access to list of inputs.
+  std::vector<int>& inputs() { return inputs_; }
+
+  // Read only access to list of inputs.
+  const std::vector<int>& inputs() const { return inputs_; }
+
+  // Read only access to list of outputs.
+  std::vector<int>& outputs() { return outputs_; }
+
+  // Read only access to list of outputs.
+  const std::vector<int>& outputs() const { return outputs_; }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& variables() { return variables_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& variables() const { return variables_; }
+
+  size_t tensors_size() const { return tensors_.size(); }
+
+  // Return the number of ops in the model.
+  size_t nodes_size() const { return nodes_and_registration_.size(); }
+
+  // Read only access to list of variable tensors.
+  std::vector<int>& execution_plan() { return execution_plan_; }
+
+  // Read only access to list of variable tensors.
+  const std::vector<int>& execution_plan() const { return execution_plan_; }
+
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<TfLiteTensor>& tensors() { return tensors_; }
+  // Mutable form of tensors (TEMPORARY for refactor).
+  // TODO(b/119495520): remove when refactoring complete.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() {
+    return nodes_and_registration_;
+  }
+
+  const std::vector<std::pair<TfLiteNode, TfLiteRegistration>>&
+  nodes_and_registration() const {
+    return nodes_and_registration_;
+  }
+
+  // Get a pointer to an operation and registration data structure if in bounds.
+  const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
+      int node_index) const {
+    if (node_index < 0 || static_cast<size_t>(node_index) >= nodes_size())
+      return nullptr;
+    return &nodes_and_registration_[node_index];
+  }
+
+  // Change the dimensionality of a given tensor. Note, this is only acceptable
+  // for tensor indices that are inputs.
+  // Returns status of failure or success.
+  // TODO(aselle): Consider implementing ArraySlice equivalent to make this
+  //   more adept at accepting data without an extra copy. Use absl::ArraySlice
+  //   if our partners determine that dependency is acceptable.
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims);
+
+  // Update allocations for all tensors. This will redim dependent tensors using
+  // the input tensor dimensionality as given. This is relatively expensive.
+  // If you know that your sizes are not changing, you need not call this.
+  // Returns status of success or failure.
+  TfLiteStatus AllocateTensors();
+
+  // Invoke the subgraph (run the whole graph in dependency order).
+  //
+  // NOTE: It is possible that the interpreter is not in a ready state
+  // to evaluate (i.e. if a ResizeTensor() has been performed without an
+  // AllocateTensors().
+  // Returns status of success or failure.
+  TfLiteStatus Invoke();
+
+  // Entry point for C node plugin API to report an error.
+  void ReportError(const char* format, ...);
+
+  void UseNNAPI(bool enable);
+
+  // Return the subgraph specific context.
+  TfLiteContext* context() { return context_; }
+
+  // Set the value of an external context.
+  void SetExternalContext(TfLiteExternalContextType type,
+                          TfLiteExternalContext* ctx);
+  // Get the half precision flag.
+  // WARNING: This is an experimental API and subject to change.
+  bool GetAllowFp16PrecisionForFp32() const {
+    return context_->allow_fp32_relax_to_fp16;
+  }
+
+  // Ensure the data in `tensor.data` is readable. In case delegate is used,
+  // it might require to copy the data from delegate buffer to raw memory.
+  // WARNING: This is an experimental API and subject to change.
+  // TODO(b/119495520): make this private when refactoring complete.
+  TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
+    TfLiteTensor* t = &tensors_[tensor_index];
+    TF_LITE_ENSURE(context_, t != nullptr);
+    if (t->data_is_stale) {
+      TF_LITE_ENSURE(context_, t->delegate != nullptr);
+      TF_LITE_ENSURE(context_, t->buffer_handle != kTfLiteNullBufferHandle);
+      TF_LITE_ENSURE(context_, t->delegate->CopyFromBufferHandle != nullptr);
+      // TODO(b/120420546): we must add a test that exercise this code.
+      TF_LITE_ENSURE_STATUS(t->delegate->CopyFromBufferHandle(
+          context_, t->delegate, t->buffer_handle, t));
+      t->data_is_stale = false;
+    }
+    return kTfLiteOk;
+  }
+
+  // The default capacity of `tensors_` vector.
+  static constexpr int kTensorsReservedCapacity = 128;
+  // The capacity headroom of `tensors_` vector before calling ops'
+  // `prepare` and `invoke` function. In these functions, it's guaranteed
+  // allocating up to `kTensorsCapacityHeadroom` more tensors won't invalidate
+  // pointers to existing tensors.
+  static constexpr int kTensorsCapacityHeadroom = 16;
+
+  // Reset all variable tensors to the default value.
+  // If a variable tensor doesn't have a buffer, reset it to zero.
+  // TODO(b/115961645): Implement - If a variable tensor has a buffer, reset it
+  // to the value of the buffer.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ResetVariableTensors();
+
+  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+
+  profiling::Profiler* GetProfiler() { return profiler_; }
+
+  // Returns a pointer to vector of subgraphs.
+  // WARNING: This is an experimental API and subject to change.
+  std::vector<std::unique_ptr<Subgraph>>* GetSubgraphs() { return subgraphs_; }
+
+  // True if all tensors in the graph has static size after calling
+  // `AllocateTensors` function.
+  // Before `AllocateTensors` is called, this will always return true;
+  bool HasDynamicTensors() { return has_dynamic_tensors_; }
+
+ private:
+  // Prevent 'context_' from accessing functions that are only available to
+  // delegated kernels.
+  void SwitchToKernelContext();
+
+  // Add delegate-only functions to 'context_'.
+  void SwitchToDelegateContext();
+
+  // Give 'op_reg' a chance to initialize itself using the contents of
+  // 'buffer'.
+  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
+               size_t length) {
+    if (op_reg.init == nullptr) return nullptr;
+    return op_reg.init(context_, buffer, length);
+  }
+
+  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
+  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
+    if (op_reg.free == nullptr) return;
+    if (buffer) {
+      op_reg.free(context_, buffer);
+    }
+  }
+
+  // Prepare the given 'node' for execution.
+  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.prepare == nullptr) return kTfLiteOk;
+    return op_reg.prepare(context_, node);
+  }
+
+  // Invoke the operator represented by 'node'.
+  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
+    if (op_reg.invoke == nullptr) return kTfLiteError;
+    return op_reg.invoke(context_, node);
+  }
+
+  // Call OpPrepare() for as many ops as possible, allocating memory for their
+  // tensors. If an op containing dynamic tensors is found, preparation will be
+  // postponed until this function is called again. This allows the interpreter
+  // to wait until Invoke() to resolve the sizes of dynamic tensors.
+  TfLiteStatus PrepareOpsAndTensors();
+
+  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
+  // dynamic tensors is found or all ops have been prepared. Fill
+  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
+  // the last in the graph.
+  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
+                                    int* last_execution_plan_index_prepared);
+
+  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
+  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
+  // `context_` whenever this std::vector is reallocated. Currently this
+  // only happens in `AddTensors()`.
+  std::vector<TfLiteTensor> tensors_;
+
+  // Check if an array of tensor indices are valid with respect to the Tensor
+  // array.
+  // NOTE: this changes consistent_ to be false if indices are out of bounds.
+  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
+                                  int length);
+
+  // Compute the number of bytes required to represent a tensor with dimensions
+  // specified by the array dims (of length dims_size). Returns the status code
+  // and bytes.
+  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
+                             size_t* bytes);
+
+  // Request an tensor be resized implementation. If the given tensor is of
+  // type kTfLiteDynamic it will also be allocated new memory.
+  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
+
+  // Report a detailed error string (will be printed to stderr).
+  // TODO(aselle): allow user of class to provide alternative destinations.
+  void ReportErrorImpl(const char* format, va_list args);
+
+  // Entry point for C node plugin API to request an tensor be resized.
+  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
+                                   TfLiteIntArray* new_size);
+  // Entry point for C node plugin API to report an error.
+  static void ReportErrorC(TfLiteContext* context, const char* format, ...);
+
+  // Entry point for C node plugin API to add new tensors.
+  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
+                                 int* first_new_tensor_index);
+
+  // WARNING: This is an experimental API and subject to change.
+  // Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
+  static TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteContext* context, TfLiteRegistration registration,
+      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
+
+  // Update the execution graph to replace some of the nodes with stub
+  // nodes. Specifically any node index that has `nodes[index]==1` will be
+  // slated for replacement with a delegate kernel specified by registration.
+  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
+  // WARNING: This is an experimental interface that is subject to change.
+  TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
+      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
+      TfLiteDelegate* delegate);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets the internal pointer to a TensorFlow lite node by node_index.
+  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
+                                      TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get a node by index.
+  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
+                                             int node_index, TfLiteNode** node,
+                                             TfLiteRegistration** registration);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
+  // owns this memory and it is only guaranteed to exist during the invocation
+  // of the delegate prepare.
+  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // Entry point for C node plugin API to get the execution plan.
+  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
+                                       TfLiteIntArray** execution_plan);
+
+  // Retrieve an existing external context by type.
+  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
+  static TfLiteExternalContext* GetExternalContext(
+      struct TfLiteContext* context, TfLiteExternalContextType type);
+
+  // Set the value of an external context.
+  static void SetExternalContext(struct TfLiteContext* context,
+                                 TfLiteExternalContextType type,
+                                 TfLiteExternalContext* ctx);
+
+  // Allow a delegate to look at the graph and modify the graph to handle
+  // parts of the graph themselves. After this is called, the graph may
+  // contain new nodes that replace 1 more nodes.
+  // WARNING: This is an experimental API and subject to change.
+  TfLiteStatus ModifyGraphWithDelegate(TfLiteDelegate* delegate);
+
+  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
+  // capacity. Calling this function may invalidate existing pointers to
+  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
+  // more tensors won't invalidate the pointer to existing tensors.
+  void EnsureTensorsVectorCapacity() {
+    const size_t required_capacity = tensors_.size() + kTensorsCapacityHeadroom;
+    if (required_capacity > tensors_.capacity()) {
+      tensors_.reserve(required_capacity);
+      context_->tensors = tensors_.data();
+    }
+  }
+
+  // The state of the Interpreter.
+  enum State {
+    // The interpreter isn't ready to be invoked.
+    // `AllocateTensor` need to be called to enter an invokable state.
+    kStateUninvokable = 0,
+    // The interpreter is ready to be invoked.
+    kStateInvokable,
+    // The interpreter is ready to be invoked, and graph can't be further
+    // modified. The interpreter will enter this state when calling
+    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
+    kStateInvokableAndImmutable,
+  };
+  State state_ = kStateUninvokable;
+
+  // A pure C data structure used to communicate with the pure C plugin
+  // interface. To avoid copying tensor metadata, this is also the definitive
+  // structure to store tensors.
+  // TODO(b/119495520): Get rid of owned and just make context_ a instance.
+  TfLiteContext owned_context_;
+  TfLiteContext* context_;
+
+  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
+  // function pointers to actual implementation.
+  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
+      nodes_and_registration_;
+
+  // Whether the model is consistent. That is to say if the inputs and outputs
+  // of every node and the global inputs and outputs are valid indexes into
+  // the tensor array.
+  bool consistent_ = true;
+
+  // Array of indices representing the tensors that are inputs to the
+  // interpreter.
+  std::vector<int> inputs_;
+
+  // Array of indices representing the tensors that are outputs to the
+  // interpreter.
+  std::vector<int> outputs_;
+
+  // Array of indices representing the tensors that are variable tensors.
+  std::vector<int> variables_;
+
+  // The error reporter delegate that tflite will forward queries errors to.
+  ErrorReporter* error_reporter_;
+
+  // Index of the next node to prepare.
+  // During Invoke(), Interpreter will allocate input tensors first, which are
+  // known to be fixed size. Then it will allocate outputs from nodes as many
+  // as possible. When there is a node that produces dynamic sized tensor.
+  // Interpreter will stop allocating tensors, set the value of next allocate
+  // node id, and execute the node to generate the output tensor before continue
+  // to allocate successors. This process repeats until all nodes are executed.
+  // NOTE: this relies on the order of nodes that is in topological order.
+  int next_execution_plan_index_to_prepare_;
+
+  // WARNING: This is an experimental interface that is subject to change.
+  // This is a list of node indices (to index into nodes_and_registration).
+  // This represents a valid topological sort (dependency ordered) execution
+  // plan. In particular, it is valid for this ordering to contain only a
+  // subset of the node indices.
+  std::vector<int> execution_plan_;
+
+  // In the future, we'd like a TfLiteIntArray compatible representation.
+  // TODO(aselle): replace execution_plan_ with this.
+  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
+
+  // Whether to delegate to NN API
+  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
+
+  std::unique_ptr<MemoryPlanner> memory_planner_;
+
+  // Tracking bit for whether a tensor was resized in the course of an op
+  // invocation. This is a useful hint to ensure that dynamic tensor outputs
+  // trigger downstream reallocation after op invocation.
+  bool tensor_resized_since_op_invoke_ = false;
+
+  // External contexts (kTfLiteMaxExternalContexts).
+  TfLiteExternalContext** external_contexts_;
+
+  // Profiler for this interpreter instance.
+  profiling::Profiler* profiler_ = nullptr;
+
+  // A pointer to vector of subgraphs. The vector is owned by the interpreter.
+  std::vector<std::unique_ptr<Subgraph>>* subgraphs_ = nullptr;
+
+  // True if all tensors in the graph has static size after calling
+  // `PrepareOpsStartingAt` function (which is called by the `AllocateTensors`
+  // public function).
+  // The value is invalid before `PrepareOpStartingAt` is called.
+  bool has_dynamic_tensors_ = true;
+};
+
+}  // namespace tflite
+#endif  // TENSORFLOW_LITE_CORE_SUBGRAPH_H_
diff --git a/tensorflow/lite/delegates/flex/BUILD b/tensorflow/lite/delegates/flex/BUILD
index 222a043a88e8804c6cad85150c55261f6bec9973..75083bf95a126fe7a8d1ca92af2cfa0c5a85f371 100644
--- a/tensorflow/lite/delegates/flex/BUILD
+++ b/tensorflow/lite/delegates/flex/BUILD
@@ -83,8 +83,10 @@ cc_library(
         ":delegate_data",
         ":kernel",
         ":util",
+        "@com_google_absl//absl/strings:strings",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite:kernel_api",
+        "//tensorflow/lite:string_util",
         "//tensorflow/lite:util",
     ] + select({
         "//tensorflow:android": [
@@ -116,6 +118,7 @@ cc_library(
     hdrs = ["delegate_data.h"],
     deps = [
         ":buffer_map",
+        "@com_google_absl//absl/memory",
         "//tensorflow/core/common_runtime/eager:context",
     ] + select({
         "//tensorflow:android": [
diff --git a/tensorflow/lite/delegates/flex/buffer_map.cc b/tensorflow/lite/delegates/flex/buffer_map.cc
index 9a6c5e74a7b8d71a04c20bbcb969cfe0b0ce3478..0d0c953636672e33130a991b1a302f410e42f381 100644
--- a/tensorflow/lite/delegates/flex/buffer_map.cc
+++ b/tensorflow/lite/delegates/flex/buffer_map.cc
@@ -26,6 +26,8 @@ namespace flex {
 namespace {
 // A tensor buffer that is allocated, deallocated and populated by TF Lite.
 class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer {
+  using tensorflow::TensorBuffer::TensorBuffer;
+
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(
       tensorflow::AllocationDescription* proto) const override {
@@ -60,31 +62,29 @@ class BaseTfLiteTensorBuffer : public tensorflow::TensorBuffer {
 // representation in TFLITE and TF, so we just need use memcpy().
 class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
  public:
-  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor) {
+  explicit TfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : BaseTfLiteTensorBuffer(tensorflow::cpu_allocator()->AllocateRaw(
+            EIGEN_MAX_ALIGN_BYTES, tensor->bytes)) {
     // TODO(ahentz): if we can guarantee that TF Lite allocated tensors with
     // the same alignment as TensorFlow (EIGEN_MAX_ALIGN_BYTES), then we can
     // potentially eliminate the copy below.
     len_ = tensor->bytes;
-    data_ =
-        tensorflow::cpu_allocator()->AllocateRaw(EIGEN_MAX_ALIGN_BYTES, len_);
 
     LogAllocation();
 
-    if (data_) {
-      std::memcpy(data_, tensor->data.raw, tensor->bytes);
+    if (data()) {
+      std::memcpy(data(), tensor->data.raw, tensor->bytes);
     }
   }
 
   ~TfLiteTensorBuffer() override {
     LogDeallocation();
-    tensorflow::cpu_allocator()->DeallocateRaw(data_);
+    tensorflow::cpu_allocator()->DeallocateRaw(data());
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
 
  private:
-  void* data_;
   size_t len_;
 };
 
@@ -92,14 +92,30 @@ class TfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
 // TF's so we need perform the conversion here.
 class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
  public:
-  explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor) {
-    num_strings_ = GetStringCount(tensor->data.raw);
-    data_ = tensorflow::cpu_allocator()->Allocate<string>(num_strings_);
+  explicit StringTfLiteTensorBuffer(const TfLiteTensor* tensor)
+      : StringTfLiteTensorBuffer(tensor, tensor->data.raw != nullptr
+                                             ? GetStringCount(tensor->data.raw)
+                                             : 0) {}
+
+  ~StringTfLiteTensorBuffer() override {
+    LogDeallocation();
+    tensorflow::cpu_allocator()->Deallocate<string>(
+        static_cast<string*>(data()), num_strings_);
+  }
+
+  size_t size() const override { return num_strings_ * sizeof(string); }
 
+ private:
+  StringTfLiteTensorBuffer(const TfLiteTensor* tensor, int num_strings)
+      : BaseTfLiteTensorBuffer(
+            num_strings != 0
+                ? tensorflow::cpu_allocator()->Allocate<string>(num_strings)
+                : nullptr),
+        num_strings_(num_strings) {
     LogAllocation();
 
-    if (data_) {
-      string* p = data_;
+    if (data()) {
+      string* p = static_cast<string*>(data());
       for (size_t i = 0; i < num_strings_; ++p, ++i) {
         auto ref = GetString(tensor->data.raw, i);
         p->assign(ref.str, ref.len);
@@ -107,16 +123,6 @@ class StringTfLiteTensorBuffer : public BaseTfLiteTensorBuffer {
     }
   }
 
-  ~StringTfLiteTensorBuffer() override {
-    LogDeallocation();
-    tensorflow::cpu_allocator()->Deallocate<string>(data_, num_strings_);
-  }
-
-  void* data() const override { return data_; }
-  size_t size() const override { return num_strings_ * sizeof(string); }
-
- private:
-  string* data_;
   int num_strings_;
 };
 
diff --git a/tensorflow/lite/delegates/flex/delegate.cc b/tensorflow/lite/delegates/flex/delegate.cc
index 4fc2d82b494a4cd8165ae2d070aad1cc9e2440f4..ca7314fbaee6644cf9385a1d7b0b2964d6a2762f 100644
--- a/tensorflow/lite/delegates/flex/delegate.cc
+++ b/tensorflow/lite/delegates/flex/delegate.cc
@@ -16,12 +16,14 @@ limitations under the License.
 
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "tensorflow/core/lib/core/status.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/delegates/flex/buffer_map.h"
 #include "tensorflow/lite/delegates/flex/kernel.h"
 #include "tensorflow/lite/delegates/flex/util.h"
+#include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/util.h"
-#include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
 namespace flex {
@@ -57,8 +59,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteDelegate* delegate) {
 
 TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
                                   TfLiteDelegate* delegate,
-                                  TfLiteBufferHandle buffer_handle, void* data,
-                                  size_t size) {
+                                  TfLiteBufferHandle buffer_handle,
+                                  TfLiteTensor* output) {
   BufferMap* buffer_map =
       reinterpret_cast<DelegateData*>(delegate->data_)->GetBufferMap(context);
 
@@ -68,15 +70,38 @@ TfLiteStatus CopyFromBufferHandle(TfLiteContext* context,
   }
 
   tensorflow::Tensor t = buffer_map->GetTensor(buffer_handle);
+
+  if (output->type == kTfLiteString) {
+    if (t.dtype() != tensorflow::DT_STRING) {
+      context->ReportError(context,
+                           "Inconsistent type for TF string tensor index %d.",
+                           buffer_handle);
+      return kTfLiteError;
+    }
+    DynamicBuffer dynamic_buffer;
+
+    auto tf_data = t.flat<string>();
+    for (int i = 0; i < t.NumElements(); ++i) {
+      dynamic_buffer.AddString(tf_data(i).data(), tf_data(i).size());
+    }
+
+    dynamic_buffer.WriteToTensor(output, /*new_shape=*/nullptr);
+    return kTfLiteOk;
+  }
+
   tensorflow::StringPiece t_data = t.tensor_data();
 
-  if (size != t_data.size()) {
-    context->ReportError(
-        context, "Not enough space to store TensorFlow's aligned buffer.");
+  if (output->bytes != t_data.size()) {
+    context->ReportError(context,
+                         absl::StrCat("The given ", output->bytes,
+                                      " bytes are not enough to store "
+                                      "TensorFlow's aligned buffer of size ",
+                                      t_data.size(), " bytes.")
+                             .c_str());
     return kTfLiteError;
   }
 
-  memcpy(data, t_data.data(), t_data.size());
+  memcpy(output->data.raw, t_data.data(), t_data.size());
   return kTfLiteOk;
 }
 
@@ -104,14 +129,13 @@ std::unique_ptr<FlexDelegate> FlexDelegate::Create() {
 }
 
 FlexDelegate::FlexDelegate(std::unique_ptr<flex::DelegateData> delegate_data)
-    : TfLiteDelegate{
-          /*data_=*/delegate_data.get(),
-          /*nullptr,*/ &flex::delegate::Prepare,
-          /*CopyFromBufferHandle=*/&flex::delegate::CopyFromBufferHandle,
-          /*CopyToBufferHandle=*/nullptr,
-          /*FreeBufferHandle=*/nullptr,
-          /*flags=*/kTfLiteDelegateFlagsAllowDynamicTensors},
-      delegate_data_(std::move(delegate_data)) {}
+    : TfLiteDelegate(TfLiteDelegateCreate()),
+      delegate_data_(std::move(delegate_data)) {
+  data_ = delegate_data_.get();
+  Prepare = &flex::delegate::Prepare;
+  CopyFromBufferHandle = &flex::delegate::CopyFromBufferHandle;
+  flags = kTfLiteDelegateFlagsAllowDynamicTensors;
+}
 
 FlexDelegate::~FlexDelegate() {}
 
diff --git a/tensorflow/lite/delegates/flex/delegate_data.cc b/tensorflow/lite/delegates/flex/delegate_data.cc
index b62479a448073d4cd522464d0f86f8cf9db8eada..1483a530388d1dd48ff6179de4ddc2084ddb3d87 100644
--- a/tensorflow/lite/delegates/flex/delegate_data.cc
+++ b/tensorflow/lite/delegates/flex/delegate_data.cc
@@ -14,20 +14,21 @@ limitations under the License.
 ==============================================================================*/
 #include "tensorflow/lite/delegates/flex/delegate_data.h"
 
+#include "absl/memory/memory.h"
 #include "tensorflow/core/common_runtime/device_factory.h"
 #include "tensorflow/core/lib/core/status.h"
 
 namespace tflite {
 namespace flex {
 tensorflow::Status DelegateData::Create(std::unique_ptr<DelegateData>* data) {
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
 
   TF_RETURN_IF_ERROR(tensorflow::DeviceFactory::AddDevices(
       tensorflow::SessionOptions(), "/job:localhost/replica:0/task:0",
       &devices));
 
-  std::unique_ptr<tensorflow::DeviceMgr> device_mgr(
-      new tensorflow::DeviceMgr(devices));
+  std::unique_ptr<tensorflow::DeviceMgr> device_mgr =
+      absl::make_unique<tensorflow::DeviceMgr>(std::move(devices));
   // Note that Rendezvous is ref-counted so it will be automatically deleted.
   tensorflow::Rendezvous* rendezvous =
       new tensorflow::IntraProcessRendezvous(device_mgr.get());
diff --git a/tensorflow/lite/delegates/flex/delegate_test.cc b/tensorflow/lite/delegates/flex/delegate_test.cc
index e13029d9a514e7207c69a530713d2dcb6ec11ad5..ee37090d94eaadca2a767a0ea9a2ad105618da97 100644
--- a/tensorflow/lite/delegates/flex/delegate_test.cc
+++ b/tensorflow/lite/delegates/flex/delegate_test.cc
@@ -22,7 +22,6 @@ namespace tflite {
 namespace flex {
 namespace {
 
-using ::testing::ContainsRegex;
 using ::testing::ElementsAre;
 
 class DelegateTest : public testing::FlexModelTest {
@@ -93,6 +92,25 @@ TEST_F(DelegateTest, NonFloatTypeInference) {
   ASSERT_EQ(GetType(2), kTfLiteInt32);
 }
 
+TEST_F(DelegateTest, StringInference) {
+  AddTensors(3, {0, 1}, {2}, kTfLiteString, {2});
+
+  AddTfOp(testing::kAdd, {0, 1}, {2});
+
+  ConfigureDelegate();
+
+  SetShape(0, {2, 2});
+  SetStringValues(0, {"1", "2", "3", "4"});
+  SetShape(1, {2, 2});
+  SetStringValues(1, {"4", "3", "2", "1"});
+
+  ASSERT_TRUE(Invoke());
+
+  ASSERT_THAT(GetShape(2), ElementsAre(2, 2));
+  ASSERT_THAT(GetStringValues(2), ElementsAre("14", "23", "32", "41"));
+  ASSERT_EQ(GetType(2), kTfLiteString);
+}
+
 TEST_F(DelegateTest, MixedGraph) {
   AddTensors(9, {0, 3}, {8}, kTfLiteFloat32, {3});
 
diff --git a/tensorflow/lite/delegates/flex/kernel.cc b/tensorflow/lite/delegates/flex/kernel.cc
index c4fe142dff10510a2f05dae9bedfea7b5d604b98..02da1d1a224ee87c34c2a019bff6430fd0e7d88a 100644
--- a/tensorflow/lite/delegates/flex/kernel.cc
+++ b/tensorflow/lite/delegates/flex/kernel.cc
@@ -15,6 +15,12 @@ limitations under the License.
 #include "tensorflow/lite/delegates/flex/kernel.h"
 
 #include "flatbuffers/flexbuffers.h"  // TF:flatbuffers
+#include "tensorflow/core/common_runtime/eager/context.h"
+#include "tensorflow/core/common_runtime/eager/execute.h"
+#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
+#include "tensorflow/core/framework/node_def.pb.h"
+#include "tensorflow/core/framework/node_def_util.h"
+#include "tensorflow/core/lib/core/errors.h"
 #include "tensorflow/lite/builtin_ops.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
@@ -22,11 +28,6 @@ limitations under the License.
 #include "tensorflow/lite/delegates/flex/util.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/string.h"
-#include "tensorflow/core/common_runtime/eager/context.h"
-#include "tensorflow/core/common_runtime/eager/execute.h"
-#include "tensorflow/core/common_runtime/eager/tensor_handle.h"
-#include "tensorflow/core/framework/node_def.pb.h"
-#include "tensorflow/core/framework/node_def_util.h"
 
 // Note: this is part of TF Lite's Flex delegation code which is to be
 // completed soon.
@@ -78,11 +79,18 @@ tensorflow::Status ExecuteFlexOp(tensorflow::EagerContext* eager_context,
                                  const std::vector<int>& inputs,
                                  const std::vector<int>& outputs) {
   const tensorflow::AttrTypeMap* attr_types;
+  bool is_function = false;
   TF_RETURN_WITH_CONTEXT_IF_ERROR(
-      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types),
+      tensorflow::AttrTypeMapForOp(op_name.c_str(), &attr_types, &is_function),
       " (while processing attributes of '", op_name, "')");
-
-  tensorflow::EagerOperation op(eager_context, op_name.c_str(), attr_types);
+  if (is_function) {
+    return tensorflow::errors::NotFound(
+        "Operation '", op_name,
+        "' is not registered.  (while processing attributes of '", op_name,
+        "')");
+  }
+  tensorflow::EagerOperation op(eager_context, op_name.c_str(),
+                                /*is_function=*/false, attr_types);
   for (const auto& attr : nodedef.attr()) {
     op.MutableAttrs()->Set(attr.first, attr.second);
   }
diff --git a/tensorflow/lite/delegates/flex/kernel_test.cc b/tensorflow/lite/delegates/flex/kernel_test.cc
index f55759594df51356986c2a328165c17b3ead2d80..efb7300b0bd9693f93fc4b7fb3078c384130cf65 100644
--- a/tensorflow/lite/delegates/flex/kernel_test.cc
+++ b/tensorflow/lite/delegates/flex/kernel_test.cc
@@ -59,12 +59,12 @@ class KernelTest : public testing::FlexModelTest {
     delegate_.CopyFromBufferHandle = [](TfLiteContext* context,
                                         TfLiteDelegate* delegate,
                                         TfLiteBufferHandle buffer_handle,
-                                        void* data, size_t size) {
+                                        TfLiteTensor* output) {
       auto* delegate_data = reinterpret_cast<DelegateData*>(delegate->data_);
       tensorflow::StringPiece values = delegate_data->GetBufferMap(context)
                                            ->GetTensor(buffer_handle)
                                            .tensor_data();
-      memcpy(data, values.data(), values.size());
+      memcpy(output->data.raw, values.data(), values.size());
       return kTfLiteOk;
     };
     CHECK(interpreter_->ModifyGraphWithDelegate(&delegate_) == kTfLiteOk);
diff --git a/tensorflow/lite/delegates/flex/test_util.cc b/tensorflow/lite/delegates/flex/test_util.cc
index 08feb349e6dbf15dc908c7c4d4fd5694814c8594..aa24675a7b1beab8632435debc8dd1fc04f347e7 100644
--- a/tensorflow/lite/delegates/flex/test_util.cc
+++ b/tensorflow/lite/delegates/flex/test_util.cc
@@ -25,6 +25,29 @@ namespace testing {
 
 bool FlexModelTest::Invoke() { return interpreter_->Invoke() == kTfLiteOk; }
 
+void FlexModelTest::SetStringValues(int tensor_index,
+                                    const std::vector<string>& values) {
+  DynamicBuffer dynamic_buffer;
+  for (const string& s : values) {
+    dynamic_buffer.AddString(s.data(), s.size());
+  }
+  dynamic_buffer.WriteToTensor(interpreter_->tensor(tensor_index),
+                               /*new_shape=*/nullptr);
+}
+
+std::vector<string> FlexModelTest::GetStringValues(int tensor_index) const {
+  std::vector<string> result;
+
+  TfLiteTensor* tensor = interpreter_->tensor(tensor_index);
+  auto num_strings = GetStringCount(tensor->data.raw);
+  for (size_t i = 0; i < num_strings; ++i) {
+    auto ref = GetString(tensor->data.raw, i);
+    result.push_back(string(ref.str, ref.len));
+  }
+
+  return result;
+}
+
 void FlexModelTest::SetShape(int tensor_index, const std::vector<int>& values) {
   ASSERT_EQ(interpreter_->ResizeInputTensor(tensor_index, values), kTfLiteOk);
   ASSERT_EQ(interpreter_->AllocateTensors(), kTfLiteOk);
@@ -95,12 +118,22 @@ void FlexModelTest::AddTfOp(TfOpType op, const std::vector<int>& inputs,
     return " attr{ key: '" + key + "' value {" + value + "}}";
   };
 
-  // Crude type attribution, will need fleshing out as more tests are added.
-  // TODO(b/113613439): Use nodedef string utilities to properly handle
-  // all types.
-  string type_attribute = attr("T", "type: DT_FLOAT");
-  if (interpreter_->tensor(inputs[0])->type == kTfLiteInt32) {
-    type_attribute = attr("T", "type: DT_INT32");
+  string type_attribute;
+  switch (interpreter_->tensor(inputs[0])->type) {
+    case kTfLiteInt32:
+      type_attribute = attr("T", "type: DT_INT32");
+      break;
+    case kTfLiteFloat32:
+      type_attribute = attr("T", "type: DT_FLOAT");
+      break;
+    case kTfLiteString:
+      type_attribute = attr("T", "type: DT_STRING");
+      break;
+    default:
+      // TODO(b/113613439): Use nodedef string utilities to properly handle all
+      // types.
+      LOG(FATAL) << "Type not supported";
+      break;
   }
 
   if (op == kUnpack) {
diff --git a/tensorflow/lite/delegates/flex/test_util.h b/tensorflow/lite/delegates/flex/test_util.h
index 4d3f5ad0968ad34ef9ee673ffacd7d9b2c83cb7f..2cc2dc30e92586535687187105057d41ab5c0350 100644
--- a/tensorflow/lite/delegates/flex/test_util.h
+++ b/tensorflow/lite/delegates/flex/test_util.h
@@ -63,11 +63,13 @@ class FlexModelTest : public ::testing::Test {
   void SetValues(int tensor_index, const std::vector<float>& values) {
     SetTypedValues<float>(tensor_index, values);
   }
+  void SetStringValues(int tensor_index, const std::vector<string>& values);
 
   // Returns the tensor's values at the given index.
   std::vector<float> GetValues(int tensor_index) {
     return GetTypedValues<float>(tensor_index);
   }
+  std::vector<string> GetStringValues(int tensor_index) const;
 
   // Sets the tensor's shape at the given index.
   void SetShape(int tensor_index, const std::vector<int>& values);
diff --git a/tensorflow/lite/delegates/flex/util.cc b/tensorflow/lite/delegates/flex/util.cc
index c786ffa1a2150b24ec9b283f5fb254813d1d4ba2..c995b360f9d5ecfaced217a372af38690aee74f6 100644
--- a/tensorflow/lite/delegates/flex/util.cc
+++ b/tensorflow/lite/delegates/flex/util.cc
@@ -66,6 +66,8 @@ TF_DataType GetTensorFlowDataType(TfLiteType type) {
       return TF_INT32;
     case kTfLiteUInt8:
       return TF_UINT8;
+    case kTfLiteInt8:
+      return TF_INT8;
     case kTfLiteInt64:
       return TF_INT64;
     case kTfLiteComplex64:
@@ -87,6 +89,8 @@ TfLiteType GetTensorFlowLiteType(TF_DataType type) {
       return kTfLiteInt32;
     case TF_UINT8:
       return kTfLiteUInt8;
+    case TF_INT8:
+      return kTfLiteInt8;
     case TF_INT64:
       return kTfLiteInt64;
     case TF_COMPLEX64:
diff --git a/tensorflow/lite/delegates/nnapi/BUILD b/tensorflow/lite/delegates/nnapi/BUILD
index c24f0f71ac4edde456fc67a926ef120da6a50931..fd954ba222627ab0457711b87baf9c3f7573e129 100644
--- a/tensorflow/lite/delegates/nnapi/BUILD
+++ b/tensorflow/lite/delegates/nnapi/BUILD
@@ -23,10 +23,7 @@ tf_cc_test(
     name = "nnapi_delegate_test",
     size = "small",
     srcs = ["nnapi_delegate_test.cc"],
-    tags = [
-        "no_oss",
-        "noasan",  # TODO(b/112326936): re-enable for asan once fixed.
-    ],
+    tags = ["no_oss"],
     deps = [
         ":nnapi_delegate",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
index 9690c6592118474874f3413f00e769eef76dce11..4fe07004a82ff30228d866bcc7a90067e5940aca 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate.cc
@@ -1141,7 +1141,7 @@ class NNAPIDelegateKernel {
 TfLiteDelegate* NnApiDelegate() {
   static TfLiteDelegate delegate = {
       .data_ = nullptr,
-      .flags = kTfLiteDelegateFlagsAllowDynamicTensors,
+      .flags = kTfLiteDelegateFlagsNone,
       .Prepare = [](TfLiteContext* context,
                     TfLiteDelegate* delegate) -> TfLiteStatus {
         // Do not check nodes_ if NN API is unavailable.
diff --git a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
index 84a0a6a1d1cd0cacb9e433b94d41078c963b09ef..ca48af0c95211e644fc7e2a1a1472a2f1b46ad35 100644
--- a/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
+++ b/tensorflow/lite/delegates/nnapi/nnapi_delegate_test.cc
@@ -34,6 +34,11 @@ class SingleOpModelWithNNAPI : public SingleOpModel {
       interpreter->ModifyGraphWithDelegate(NnApiDelegate());
     });
   }
+
+  TfLiteStatus ResizeInputTensor(int tensor_index,
+                                 const std::vector<int>& dims) {
+    return interpreter_->ResizeInputTensor(tensor_index, dims);
+  }
 };
 
 class FloatAddOpModel : public SingleOpModelWithNNAPI {
@@ -97,6 +102,17 @@ TEST(NNAPIDelegate, AddWithRelu) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({0.0, 0.4, 1.0, 1.3}));
 }
 
+// Verify that resize attempts fail.
+// TODO(b/113110851): Verify success after the delegate supports resizing.
+TEST(NNAPIDelegate, ResizeFails) {
+  FloatAddOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {1, 2, 2, 1}},
+                    {TensorType_FLOAT32, {}}, ActivationFunctionType_NONE);
+  m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.7, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.1, 0.2, 0.3, 0.5});
+  EXPECT_EQ(m.ResizeInputTensor(m.input1(), {1, 3, 3, 1}), kTfLiteError);
+}
+
 class FloatMulOpModel : public SingleOpModelWithNNAPI {
  public:
   FloatMulOpModel(const TensorData& input1, const TensorData& input2,
diff --git a/tensorflow/lite/examples/android/BUILD b/tensorflow/lite/examples/android/BUILD
index 761a60314e8fb663d9a60af4116bd96a7e5839e2..80cefd415a579ad053c9f4cfcd59f63a64566931 100644
--- a/tensorflow/lite/examples/android/BUILD
+++ b/tensorflow/lite/examples/android/BUILD
@@ -34,7 +34,7 @@ android_binary(
     # to reduce APK size.
     assets = [
         "//tensorflow/lite/examples/android/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
         "@tflite_conv_actions_frozen//:conv_actions_frozen.tflite",
         "//tensorflow/lite/examples/android/app/src/main/assets:conv_actions_labels.txt",
         "@tflite_mobilenet_ssd//:mobilenet_ssd.tflite",
diff --git a/tensorflow/lite/examples/android/app/download-models.gradle b/tensorflow/lite/examples/android/app/download-models.gradle
index d2f03db5f6373b8f679d55464dbfbf01ab8bd0c0..36bd177a1fd6bb21a27edd6d2b6e82fa7aa5d57b 100644
--- a/tensorflow/lite/examples/android/app/download-models.gradle
+++ b/tensorflow/lite/examples/android/app/download-models.gradle
@@ -8,13 +8,12 @@
  *     3 model files will be downloaded into given folder of ext.ASSET_DIR
  */
 // hard coded model files
-// LINT.IfChange
 
-def models = ['conv_actions_tflite.zip',
-              'mobilenet_ssd_tflite_v1.zip',
-              'mobilenet_v1_224_android_quant_2017_11_08.zip',
-              'coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip']
-// LINT.ThenChange(//tensorflow/lite/examples/android/BUILD)
+def models = ['https://storage.googleapis.com/download.tensorflow.org/models/tflite/conv_actions_tflite.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_ssd_tflite_v1.zip',
+              'https://storage.googleapis.com/download.tensorflow.org/models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz',
+              'http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz']
 
 // Root URL for model archives
 def MODEL_URL = 'https://storage.googleapis.com/download.tensorflow.org/models/tflite'
@@ -30,9 +29,9 @@ buildscript {
 
 import de.undercouch.gradle.tasks.download.Download
 task downloadFile(type: Download){
-    for (f in models) {
-        def modelUrl = MODEL_URL + "/" + f
-        println "Downloading ${f} from ${modelUrl}"
+    for (modelUrl in models) {
+        def localFile = modelUrl.split("/")[-1]
+        println "Downloading ${localFile} from ${modelUrl}"
         src modelUrl
     }
 
@@ -43,7 +42,12 @@ task downloadFile(type: Download){
 task extractModels(type: Copy) {
     for (f in models) {
         def localFile = f.split("/")[-1]
-        from zipTree(project.ext.TMP_DIR + '/' + localFile)
+        def localExt = localFile.split("[.]")[-1]
+        if (localExt == "tgz") {
+            from tarTree(project.ext.TMP_DIR + '/' + localFile)
+        } else {
+            from zipTree(project.ext.TMP_DIR + '/' + localFile)
+        }
     }
 
     into file(project.ext.ASSET_DIR)
@@ -63,6 +67,9 @@ task extractModels(type: Copy) {
     }
 }
 
+
+
+
 tasks.whenTaskAdded { task ->
     if (task.name == 'assembleDebug') {
         task.dependsOn 'extractModels'
diff --git a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
index dcbbefbeab6627b37579902cd25841c0ae257dda..698251d8b4aff3423808126ff490fe277a7ed283 100644
--- a/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
+++ b/tensorflow/lite/examples/android/app/src/main/java/org/tensorflow/demo/ClassifierActivity.java
@@ -65,7 +65,7 @@ public class ClassifierActivity extends CameraActivity implements OnImageAvailab
   // --input_binary=true
   private static final int INPUT_SIZE = 224;
 
-  private static final String MODEL_FILE = "mobilenet_quant_v1_224.tflite";
+  private static final String MODEL_FILE = "mobilenet_v1_1.0_224_quant.tflite";
   private static final String LABEL_FILE = "labels_mobilenet_quant_v1_224.txt";
 
   private static final boolean MAINTAIN_ASPECT = true;
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
index 6bc94e950220b952c5763df8cfda1610a67f89f8..438e6adc79a2eb6ca0ed9a61d278eef79546ce8d 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.h
@@ -17,8 +17,26 @@
 
 #include <vector>
 
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/model.h"
+// TensorFlow Lite was migrated out of `contrib/` directory. The change
+// wasn't reflected in newest CocoaPod release yet (1.12.0).
+// Change this to 0 when using a TFLite version which is newer than 1.12.0.
+// TODO(ycling): Remove the macro when we release the next version.
+#ifndef TFLITE_USE_CONTRIB_LITE
+#define TFLITE_USE_CONTRIB_LITE 1
+#endif
+
+// Set TFLITE_USE_GPU_DELEGATE to 1 to use TFLite GPU Delegate.
+// Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+// on it.
+#ifndef TFLITE_USE_GPU_DELEGATE
+#define TFLITE_USE_GPU_DELEGATE 0
+#endif
+
+#if TFLITE_USE_GPU_DELEGATE && TFLITE_USE_CONTRIB_LITE
+// Sanity check.
+#error "GPU Delegate only works with newer TFLite " \
+    "after migrating out of contrib"
+#endif
 
 @interface CameraExampleViewController
     : UIViewController<UIGestureRecognizerDelegate, AVCaptureVideoDataOutputSampleBufferDelegate> {
@@ -33,10 +51,6 @@
   AVCaptureSession* session;
 
   std::vector<std::string> labels;
-  std::unique_ptr<tflite::FlatBufferModel> model;
-  tflite::ops::builtin::BuiltinOpResolver resolver;
-  std::unique_ptr<tflite::Interpreter> interpreter;
-
   double total_latency;
   int total_count;
 }
diff --git a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
index 1e6725592b0c6b7a9f2883fb51d50d3ad9f6292d..48cd313c9d7a94328d990e45243e2b84c9dc7a62 100644
--- a/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
+++ b/tensorflow/lite/examples/ios/camera/CameraExampleViewController.mm
@@ -23,10 +23,20 @@
 #include <iostream>
 #include <queue>
 
+#if TFLITE_USE_CONTRIB_LITE
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/string_util.h"
+#else
 #include "tensorflow/lite/kernels/register.h"
 #include "tensorflow/lite/model.h"
-#include "tensorflow/lite/string_util.h"
 #include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/lite/string_util.h"
+#if TFLITE_USE_GPU_DELEGATE
+#include "tensorflow/lite/delegates/gpu/metal_delegate.h"
+#endif
+#endif
 
 #define LOG(x) std::cerr
 
@@ -34,7 +44,12 @@ namespace {
 
 // If you have your own model, modify this to the file name, and make sure
 // you've added the file to your app resources too.
+#if TFLITE_USE_GPU_DELEGATE
+// GPU Delegate only supports float model now.
 NSString* model_file_name = @"mobilenet_v1_1.0_224";
+#else
+NSString* model_file_name = @"mobilenet_quant_v1_224.tflite";
+#endif
 NSString* model_file_type = @"tflite";
 // If you have your own model, point this to the labels file.
 NSString* labels_file_name = @"labels";
@@ -151,7 +166,12 @@ void ProcessInputWithQuantizedModel(
 - (void)teardownAVCapture;
 @end
 
-@implementation CameraExampleViewController
+@implementation CameraExampleViewController {
+  std::unique_ptr<tflite::FlatBufferModel> model;
+  tflite::ops::builtin::BuiltinOpResolver resolver;
+  std::unique_ptr<tflite::Interpreter> interpreter;
+  TfLiteDelegate* delegate;
+}
 
 - (void)setupAVCapture {
   NSError* error = nil;
@@ -363,6 +383,11 @@ void ProcessInputWithQuantizedModel(
 }
 
 - (void)dealloc {
+#if TFLITE_USE_GPU_DELEGATE
+  if (delegate) {
+    DeleteGpuDelegate(delegate);
+  }
+#endif
   [self teardownAVCapture];
 }
 
@@ -388,6 +413,15 @@ void ProcessInputWithQuantizedModel(
   LoadLabels(labels_file_name, labels_file_type, &labels);
 
   tflite::InterpreterBuilder(*model, resolver)(&interpreter);
+
+#if TFLITE_USE_GPU_DELEGATE
+  GpuDelegateOptions options;
+  options.allow_precision_loss = true;
+  options.wait_type = GpuDelegateOptions::WaitType::kActive;
+  delegate = NewGpuDelegate(&options);
+  interpreter->ModifyGraphWithDelegate(delegate);
+#endif
+
   // Explicitly resize the input tensor.
   {
     int input = interpreter->inputs()[0];
diff --git a/tensorflow/lite/examples/ios/camera/Podfile b/tensorflow/lite/examples/ios/camera/Podfile
index f460693122af8353286ea7069d5db873fedfc9b3..2e15cc63decb30eb2b8c9bffab3b5d1bff10e9b3 100644
--- a/tensorflow/lite/examples/ios/camera/Podfile
+++ b/tensorflow/lite/examples/ios/camera/Podfile
@@ -1,5 +1,13 @@
 platform :ios, '8.0'
 inhibit_all_warnings!
 
+project 'tflite_camera_example.xcodeproj'
+
 target 'tflite_camera_example'
-       pod 'TensorFlowLite', '1.10.1'
+  # Comment 'TensorFlowLite' pod and un-comment 'TensorFlowLiteGpuExperimental'
+  # to use TFLite GPU Delegate.
+  # Note: TFLite GPU Delegate binary isn't releast yet, and we're working
+  # on it.
+
+  pod 'TensorFlowLite', '1.12.0'
+  # pod 'TensorFlowLiteGpuExperimental', '0.0.1'
diff --git a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
index 9522c41dea0e6609e1b8e1462d9abec8874e3999..9b5c2b32a8f176e58a2d28d11ee3e41ef875e722 100644
--- a/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
+++ b/tensorflow/lite/examples/ios/camera/tflite_camera_example.xcodeproj/project.pbxproj
@@ -15,6 +15,7 @@
 		1CDB2D4A1ED3A9CD007929E9 /* CameraExampleViewController.mm in Sources */ = {isa = PBXBuildFile; fileRef = 1CDB2D451ED3A9CD007929E9 /* CameraExampleViewController.mm */; };
 		54DC6C3C5F734F3A58069F0C /* libPods-tflite_camera_example.a in Frameworks */ = {isa = PBXBuildFile; fileRef = 3BA8BF92C84895BFE59D8236 /* libPods-tflite_camera_example.a */; };
 		AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */ = {isa = PBXBuildFile; fileRef = AC1F82641FBA3CBD0052BA77 /* labels.txt */; };
+		AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */; };
 		AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */ = {isa = PBXBuildFile; fileRef = AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */; };
 /* End PBXBuildFile section */
 
@@ -36,6 +37,7 @@
 		3BC5BE4BBD09374D3E98F082 /* Pods-tflite_camera_example.debug.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.debug.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.debug.xcconfig"; sourceTree = "<group>"; };
 		55ED318E8D29C8AFEF03DF1E /* Pods-tflite_camera_example.release.xcconfig */ = {isa = PBXFileReference; includeInIndex = 1; lastKnownFileType = text.xcconfig; name = "Pods-tflite_camera_example.release.xcconfig"; path = "Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example.release.xcconfig"; sourceTree = "<group>"; };
 		AC1F82641FBA3CBD0052BA77 /* labels.txt */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = labels.txt; sourceTree = "<group>"; };
+		AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_quant_v1_224.tflite; sourceTree = "<group>"; };
 		AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
 /* End PBXFileReference section */
 
@@ -103,6 +105,7 @@
 		59A3CFF31CF4E68100C4259F /* data */ = {
 			isa = PBXGroup;
 			children = (
+				AC31178821BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite */,
 				AC3BB41620114C400084552C /* mobilenet_v1_1.0_224.tflite */,
 				AC1F82641FBA3CBD0052BA77 /* labels.txt */,
 			);
@@ -120,8 +123,6 @@
 				1C564C091ED3A92E00087306 /* Sources */,
 				1C564C0A1ED3A92E00087306 /* Frameworks */,
 				1C564C0B1ED3A92E00087306 /* Resources */,
-				00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */,
-				5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */,
 			);
 			buildRules = (
 			);
@@ -175,42 +176,13 @@
 				AC3BB41720114C400084552C /* mobilenet_v1_1.0_224.tflite in Resources */,
 				1C99111C1ED3B0E600A6BFB9 /* MainStoryboard_iPhone.storyboard in Resources */,
 				AC1F82661FBA3CBD0052BA77 /* labels.txt in Resources */,
+				AC31178921BB3FF900AFF1D2 /* mobilenet_quant_v1_224.tflite in Resources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
 /* End PBXResourcesBuildPhase section */
 
 /* Begin PBXShellScriptBuildPhase section */
-		00E875C3B066535AE6B77101 /* [CP] Embed Pods Frameworks */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Embed Pods Frameworks";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-frameworks.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
-		5C2D02120E3E5E09567AA946 /* [CP] Copy Pods Resources */ = {
-			isa = PBXShellScriptBuildPhase;
-			buildActionMask = 2147483647;
-			files = (
-			);
-			inputPaths = (
-			);
-			name = "[CP] Copy Pods Resources";
-			outputPaths = (
-			);
-			runOnlyForDeploymentPostprocessing = 0;
-			shellPath = /bin/sh;
-			shellScript = "\"${SRCROOT}/Pods/Target Support Files/Pods-tflite_camera_example/Pods-tflite_camera_example-resources.sh\"\n";
-			showEnvVarsInLog = 0;
-		};
 		66DAEAAEE9EF6550C3A061E0 /* [CP] Check Pods Manifest.lock */ = {
 			isa = PBXShellScriptBuildPhase;
 			buildActionMask = 2147483647;
@@ -322,9 +294,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
+				HEADER_SEARCH_PATHS = "$(inherited)";
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = YES;
 				ONLY_ACTIVE_ARCH = YES;
@@ -365,9 +335,7 @@
 				GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
 				GCC_WARN_UNUSED_FUNCTION = YES;
 				GCC_WARN_UNUSED_VARIABLE = YES;
-				HEADER_SEARCH_PATHS = (
-					"$(inherited)",
-				);
+				HEADER_SEARCH_PATHS = "$(inherited)";
 				IPHONEOS_DEPLOYMENT_TARGET = 8.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				SDKROOT = iphoneos;
diff --git a/tensorflow/lite/examples/ios/download_models.sh b/tensorflow/lite/examples/ios/download_models.sh
index ad6ccd1b0ad89aadd8035d5c952164f63f29ccaf..4828617d95e94c1b6ad811e04d3b94b659bd8f74 100755
--- a/tensorflow/lite/examples/ios/download_models.sh
+++ b/tensorflow/lite/examples/ios/download_models.sh
@@ -53,6 +53,6 @@ download_and_extract "${QUANTIZED_MODELS_URL}" "${DOWNLOADS_DIR}/quantized_model
 file ${DOWNLOADS_DIR}/models
 
 cp ${DOWNLOADS_DIR}/models/models/* simple/data/
-cp "${DOWNLOADS_DIR}/quantized_models/labels.txt" camera/data/
+cp ${DOWNLOADS_DIR}/models/models/* camera/data/
 cp "${DOWNLOADS_DIR}/quantized_models/mobilenet_quant_v1_224.tflite" \
-   'camera/data/mobilenet_v1_1.0_224.tflite'
+   'camera/data/mobilenet_quant_v1_224.tflite'
diff --git a/tensorflow/lite/examples/ios/simple/Podfile b/tensorflow/lite/examples/ios/simple/Podfile
index ddb77088d9f16fb55e8060a91504ebc44dd0b73e..931b72c1f5e946e8be61ac6dec3c6106a75b9685 100644
--- a/tensorflow/lite/examples/ios/simple/Podfile
+++ b/tensorflow/lite/examples/ios/simple/Podfile
@@ -2,4 +2,4 @@ platform :ios, '8.0'
 inhibit_all_warnings!
 
 target 'tflite_simple_example'
-       pod 'TensorFlowLite', '1.10.1'
+       pod 'TensorFlowLite', '1.12.0'
diff --git a/tensorflow/lite/examples/ios/simple/RunModelViewController.mm b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
index e5764944f665077f8f97362888b34070de36c0ec..32da7f7e4fce5cafc3c4746e5847315172542fc9 100644
--- a/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
+++ b/tensorflow/lite/examples/ios/simple/RunModelViewController.mm
@@ -22,10 +22,10 @@
 #include <sstream>
 #include <string>
 
-#include "tensorflow/lite/kernels/register.h"
-#include "tensorflow/lite/model.h"
-#include "tensorflow/lite/string_util.h"
-#include "tensorflow/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/kernels/register.h"
+#include "tensorflow/contrib/lite/model.h"
+#include "tensorflow/contrib/lite/op_resolver.h"
+#include "tensorflow/contrib/lite/string_util.h"
 
 #include "ios_image_load.h"
 
diff --git a/tensorflow/lite/examples/label_image/BUILD b/tensorflow/lite/examples/label_image/BUILD
index de1bfd7053256538e4516b681d78b040bf9aec0d..4fc8648d46c4bdefe3865381a23f4d73c87c284b 100644
--- a/tensorflow/lite/examples/label_image/BUILD
+++ b/tensorflow/lite/examples/label_image/BUILD
@@ -63,7 +63,6 @@ cc_test(
     data = [
         "testdata/grace_hopper.bmp",
     ],
-    tags = ["no_oss"],
     deps = [
         ":bitmap_helpers",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/examples/label_image/label_image_test.cc b/tensorflow/lite/examples/label_image/label_image_test.cc
index 6b4ec2a9374ca58a227506ec312c9374c1a7fee3..4db139f048d44a263fa1bbe38099b55ee45fd593 100644
--- a/tensorflow/lite/examples/label_image/label_image_test.cc
+++ b/tensorflow/lite/examples/label_image/label_image_test.cc
@@ -20,8 +20,6 @@ limitations under the License.
 #include "tensorflow/lite/examples/label_image/get_top_n.h"
 #include "tensorflow/lite/examples/label_image/label_image.h"
 
-using ::testing::ElementsAreArray;
-
 namespace tflite {
 namespace label_image {
 
diff --git a/tensorflow/lite/examples/minimal/minimal.cc b/tensorflow/lite/examples/minimal/minimal.cc
index 46f8b09df6cee12cfd7a3767be1e8f501cc5ee4f..9bbfee60851e0d9a1cd1e7549338341b634f0aa6 100644
--- a/tensorflow/lite/examples/minimal/minimal.cc
+++ b/tensorflow/lite/examples/minimal/minimal.cc
@@ -50,7 +50,7 @@ int main(int argc, char* argv[]) {
 
   // Build the interpreter
   tflite::ops::builtin::BuiltinOpResolver resolver;
-  InterpreterBuilder builder(*model.get(), resolver);
+  InterpreterBuilder builder(*model, resolver);
   std::unique_ptr<Interpreter> interpreter;
   builder(&interpreter);
   TFLITE_MINIMAL_CHECK(interpreter != nullptr);
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
index 07fb87641133edb5550844dd5920cf712f0fe262..799b2e5a5dd097c6e017f574449d339992f7c41b 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/BUILD
@@ -10,18 +10,46 @@ load(
     "tflite_micro_cc_test",
 )
 
-tflite_micro_cc_test(
-    name = "micro_speech_test",
+cc_library(
+    name = "model_settings",
+    srcs = [
+        "model_settings.cc",
+    ],
+    hdrs = [
+        "model_settings.h",
+    ],
+)
+
+cc_library(
+    name = "tiny_conv_model_data",
     srcs = [
-        "micro_speech_test.cc",
-        "no_features_data.cc",
-        "no_features_data.h",
         "tiny_conv_model_data.cc",
+    ],
+    hdrs = [
         "tiny_conv_model_data.h",
+    ],
+)
+
+cc_library(
+    name = "features_test_data",
+    srcs = [
+        "no_features_data.cc",
         "yes_features_data.cc",
+    ],
+    hdrs = [
+        "no_features_data.h",
         "yes_features_data.h",
     ],
+)
+
+tflite_micro_cc_test(
+    name = "micro_speech_test",
+    srcs = [
+        "micro_speech_test.cc",
+    ],
     deps = [
+        ":features_test_data",
+        ":tiny_conv_model_data",
         "//tensorflow/lite:schema_fbs_version",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
@@ -31,46 +59,185 @@ tflite_micro_cc_test(
     ],
 )
 
-tflite_micro_cc_test(
-    name = "preprocessor_reference_test",
+cc_library(
+    name = "preprocessor_test_data",
     srcs = [
         "no_30ms_sample_data.cc",
-        "no_30ms_sample_data.h",
         "no_power_spectrum_data.cc",
+        "yes_30ms_sample_data.cc",
+        "yes_power_spectrum_data.cc",
+    ],
+    hdrs = [
+        "no_30ms_sample_data.h",
         "no_power_spectrum_data.h",
+        "yes_30ms_sample_data.h",
+        "yes_power_spectrum_data.h",
+    ],
+)
+
+cc_library(
+    name = "preprocessor_reference",
+    srcs = [
         "preprocessor.cc",
+    ],
+    hdrs = [
         "preprocessor.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "preprocessor_reference_test",
+    srcs = [
         "preprocessor_test.cc",
-        "yes_30ms_sample_data.cc",
-        "yes_30ms_sample_data.h",
-        "yes_power_spectrum_data.cc",
-        "yes_power_spectrum_data.h",
     ],
     deps = [
+        ":model_settings",
+        ":preprocessor_reference",
+        ":preprocessor_test_data",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
 
-tflite_micro_cc_test(
-    name = "preprocessor_fixed_test",
+cc_library(
+    name = "preprocessor_fixed",
     srcs = [
         "fixed_point/preprocessor.cc",
-        "no_30ms_sample_data.cc",
-        "no_30ms_sample_data.h",
-        "no_power_spectrum_data.cc",
-        "no_power_spectrum_data.h",
+    ],
+    hdrs = [
         "preprocessor.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "preprocessor_fixed_test",
+    srcs = [
         "preprocessor_test.cc",
-        "yes_30ms_sample_data.cc",
-        "yes_30ms_sample_data.h",
-        "yes_power_spectrum_data.cc",
-        "yes_power_spectrum_data.h",
     ],
     deps = [
+        ":model_settings",
+        ":preprocessor_fixed",
+        ":preprocessor_test_data",
         "//tensorflow/lite/c:c_api_internal",
         "//tensorflow/lite/experimental/micro:micro_framework",
         "//tensorflow/lite/experimental/micro/testing:micro_test",
     ],
 )
+
+cc_library(
+    name = "audio_provider",
+    srcs = [
+        "audio_provider.cc",
+    ],
+    hdrs = [
+        "audio_provider.h",
+    ],
+    deps = [
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "audio_provider_test",
+    srcs = [
+        "audio_provider_test.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":model_settings",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "feature_provider",
+    srcs = [
+        "feature_provider.cc",
+    ],
+    hdrs = [
+        "feature_provider.h",
+    ],
+    deps = [
+        ":audio_provider",
+        ":model_settings",
+        ":preprocessor_reference",
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "feature_provider_test",
+    srcs = [
+        "feature_provider_test.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":feature_provider",
+        ":model_settings",
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_library(
+    name = "timer",
+    srcs = [
+        "timer.cc",
+    ],
+    hdrs = [
+        "timer.h",
+    ],
+)
+
+tflite_micro_cc_test(
+    name = "timer_test",
+    srcs = [
+        "timer_test.cc",
+    ],
+    deps = [
+        ":timer",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/testing:micro_test",
+    ],
+)
+
+cc_binary(
+    name = "micro_speech",
+    srcs = [
+        "main.cc",
+    ],
+    deps = [
+        ":audio_provider",
+        ":feature_provider",
+        ":features_test_data",
+        ":model_settings",
+        ":preprocessor_reference",
+        ":timer",
+        ":tiny_conv_model_data",
+        "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite/experimental/micro:micro_framework",
+        "//tensorflow/lite/experimental/micro/kernels:all_ops_resolver",
+        "//tensorflow/lite/experimental/micro/kernels:micro_ops",
+        "//tensorflow/lite/schema:schema_fbs",
+    ],
+)
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c0365d56901b503628b323a2fe09a4fa0de9165e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.cc
@@ -0,0 +1,33 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+namespace {
+int16_t g_dummy_audio_data[kMaxAudioSampleSize];
+}  // namespace
+
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples) {
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
+    g_dummy_audio_data[i] = 0;
+  }
+  *audio_samples_size = kMaxAudioSampleSize;
+  *audio_samples = g_dummy_audio_data;
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e2442a5e83ee1f809f82587c816adb01dc09e5e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h
@@ -0,0 +1,36 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// This is an abstraction around an audio source like a microphone, and is
+// expected to return 16-bit PCM sample data for a given point in time. The
+// sample data itself should be used as quickly as possible by the caller, since
+// to allow memory optimizations there are no guarantees that the samples won't
+// be overwritten by new data in the future. In practice, implementations should
+// ensure that there's a reasonable time allowed for clients to access the data
+// before any reuse.
+// The reference implementation can have no platform-specific dependencies, so
+// it just returns an array filled with zeros. For real applications, you should
+// ensure there's a specialized implementation that accesses hardware APIs.
+TfLiteStatus GetAudioSamples(tflite::ErrorReporter* error_reporter,
+                             int start_ms, int duration_ms,
+                             int* audio_samples_size, int16_t** audio_samples);
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_AUDIO_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f7c7605f0feb3fd3179a0edd5e51574b867ce68
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider_test.cc
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestAudioProvider) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  int audio_samples_size = 0;
+  int16_t* audio_samples = nullptr;
+  TfLiteStatus get_status =
+      GetAudioSamples(error_reporter, 0, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, get_status);
+  TF_LITE_MICRO_EXPECT_LE(audio_samples_size, kMaxAudioSampleSize);
+  TF_LITE_MICRO_EXPECT_NE(audio_samples, nullptr);
+
+  // Make sure we can read all of the returned memory locations.
+  int total = 0;
+  for (int i = 0; i < audio_samples_size; ++i) {
+    total += audio_samples[i];
+  }
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c4c52ac0ff3696a05192465f8ac911b5d6a83925
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.cc
@@ -0,0 +1,121 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/audio_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+namespace {
+// Stores the timestamp for the previous fetch of audio data, so that we can
+// avoid recalculating all the features from scratch if some earlier timeslices
+// are still present.
+int32_t g_last_time_in_ms = 0;
+// Make sure we don't try to use cached information if this is the first call
+// into the provider.
+bool g_is_first_run = true;
+}  // namespace
+
+FeatureProvider::FeatureProvider(int feature_size, uint8_t* feature_data)
+    : feature_size_(feature_size), feature_data_(feature_data) {
+  // Initialize the feature data to default values.
+  for (int n = 0; n < feature_size_; ++n) {
+    feature_data_[n] = 0;
+  }
+}
+
+FeatureProvider::~FeatureProvider() {}
+
+TfLiteStatus FeatureProvider::PopulateFeatureData(
+    tflite::ErrorReporter* error_reporter, int* how_many_new_slices) {
+  if (feature_size_ != kFeatureElementCount) {
+    error_reporter->Report("Requested feature_data_ size %d doesn't match %d",
+                           feature_size_, kFeatureElementCount);
+    return kTfLiteError;
+  }
+
+  const int32_t time_in_ms = TimeInMilliseconds();
+  // Quantize the time into steps as long as each window stride, so we can
+  // figure out which audio data we need to fetch.
+  const int last_step = (g_last_time_in_ms / kFeatureSliceStrideMs);
+  const int current_step = (time_in_ms / kFeatureSliceStrideMs);
+  g_last_time_in_ms = time_in_ms;
+
+  int slices_needed = current_step - last_step;
+  // If this is the first call, make sure we don't use any cached information.
+  if (g_is_first_run) {
+    g_is_first_run = false;
+    slices_needed = kFeatureSliceCount;
+  }
+  if (slices_needed > kFeatureSliceCount) {
+    slices_needed = kFeatureSliceCount;
+  }
+  *how_many_new_slices = slices_needed;
+
+  const int slices_to_keep = kFeatureSliceCount - slices_needed;
+  const int slices_to_drop = kFeatureSliceCount - slices_to_keep;
+  // If we can avoid recalculating some slices, just move the existing data
+  // up in the spectrogram, to perform something like this:
+  // last time = 80ms          current time = 120ms
+  // +-----------+             +-----------+
+  // | data@20ms |         --> | data@60ms |
+  // +-----------+       --    +-----------+
+  // | data@40ms |     --  --> | data@80ms |
+  // +-----------+   --  --    +-----------+
+  // | data@60ms | --  --      |  <empty>  |
+  // +-----------+   --        +-----------+
+  // | data@80ms | --          |  <empty>  |
+  // +-----------+             +-----------+
+  if (slices_to_keep > 0) {
+    for (int dest_slice = 0; dest_slice < slices_to_keep; ++dest_slice) {
+      uint8_t* dest_slice_data =
+          feature_data_ + (dest_slice * kFeatureSliceSize);
+      const int src_slice = dest_slice + slices_to_drop;
+      const uint8_t* src_slice_data =
+          feature_data_ + (src_slice * kFeatureSliceSize);
+      for (int i = 0; i < kFeatureSliceSize; ++i) {
+        dest_slice_data[i] = src_slice_data[i];
+      }
+    }
+  }
+  // Any slices that need to be filled in with feature data have their
+  // appropriate audio data pulled, and features calculated for that slice.
+  if (slices_needed > 0) {
+    for (int new_slice = slices_to_keep; new_slice < kFeatureSliceCount;
+         ++new_slice) {
+      const int new_step = (current_step - kFeatureSliceCount + 1) + new_slice;
+      const int32_t slice_start_ms = (new_step * kFeatureSliceStrideMs);
+      int16_t* audio_samples = nullptr;
+      int audio_samples_size = 0;
+      GetAudioSamples(error_reporter, slice_start_ms, kFeatureSliceDurationMs,
+                      &audio_samples_size, &audio_samples);
+      if (audio_samples_size < kMaxAudioSampleSize) {
+        error_reporter->Report("Audio data size %d  too small, want %d",
+                               audio_samples_size, kMaxAudioSampleSize);
+        return kTfLiteError;
+      }
+      uint8_t* new_slice_data = feature_data_ + (new_slice * kFeatureSliceSize);
+      TfLiteStatus preprocess_status =
+          Preprocess(error_reporter, audio_samples, audio_samples_size,
+                     kFeatureSliceSize, new_slice_data);
+      if (preprocess_status != kTfLiteOk) {
+        return preprocess_status;
+      }
+    }
+  }
+  return kTfLiteOk;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
new file mode 100644
index 0000000000000000000000000000000000000000..a86c56ebf053a8807e38c42c6a7088c146a31b9e
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h
@@ -0,0 +1,48 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+
+// Binds itself to an area of memory intended to hold the input features for an
+// audio-recognition neural network model, and fills that data area with the
+// features representing the current audio input, for example from a microphone.
+// The audio features themselves are a two-dimensional array, made up of
+// horizontal slices representing the frequencies at one point in time, stacked
+// on top of each other to form a spectrogram showing how those frequencies
+// changed over time.
+class FeatureProvider {
+ public:
+  // Create the provider, and bind it to an area of memory. This memory should
+  // remain accessible for the lifetime of the provider object, since subsequent
+  // calls will fill it with feature data. The provider does no memory
+  // management of this data.
+  FeatureProvider(int feature_size, uint8_t* feature_data);
+  ~FeatureProvider();
+
+  // Fills the feature data with information from audio inputs, and returns how
+  // many feature slices were updated.
+  TfLiteStatus PopulateFeatureData(tflite::ErrorReporter* error_reporter,
+                                   int* how_many_new_slices);
+
+ private:
+  int feature_size_;
+  uint8_t* feature_data_;
+};
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_FEATURE_PROVIDER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e52aec8d2741678a0f79f643bb7dcf42c848a58
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider_test.cc
@@ -0,0 +1,38 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestFeatureProvider) {
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  uint8_t feature_data[kFeatureElementCount];
+  FeatureProvider feature_provider(kFeatureElementCount, feature_data);
+
+  int how_many_new_slices = 0;
+  TfLiteStatus populate_status = feature_provider.PopulateFeatureData(
+      error_reporter, &how_many_new_slices);
+  TF_LITE_MICRO_EXPECT_EQ(kTfLiteOk, populate_status);
+  TF_LITE_MICRO_EXPECT_EQ(kFeatureSliceCount, how_many_new_slices);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
index de60c982f3a062a6a1f32369f388f5ed3b10f6ac..b623d8d11b75d59600cc6a029527d3957084a328 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/fixed_point/preprocessor.cc
@@ -31,6 +31,8 @@ limitations under the License.
 
 #include <cmath>
 
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
 namespace {
 
 // q format notation: qx.y => 1 sign bit, x-1 integer bits, y fraction bits.
@@ -66,13 +68,6 @@ inline int32_t FloatToFixed_Q2_30(float input) {
   return static_cast<int32_t>(roundf(input * (1 << 30)));
 }
 
-// These constants allow us to allocate fixed-sized arrays on the stack for our
-// working memory.
-constexpr int kInputSize = 512;
-constexpr int kAverageWindowSize = 6;
-constexpr int kOutputSize =
-    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
-
 // Performs a discrete Fourier transform on the real inputs. This corresponds to
 // rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
 // and to kiss_fftr() in KISSFFT at https://github.com/mborgerding/kissfft.
@@ -127,14 +122,14 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
                         const int16_t* input, int input_size, int output_size,
                         uint8_t* output) {
   // Ensure our input and output data arrays are valid.
-  if (input_size > kInputSize) {
+  if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
-                           kInputSize);
+                           kMaxAudioSampleSize);
     return kTfLiteError;
   }
-  if (output_size != kOutputSize) {
+  if (output_size != kFeatureSliceSize) {
     error_reporter->Report("Requested output size %d doesn't match %d",
-                           output_size, kOutputSize);
+                           output_size, kFeatureSliceSize);
     return kTfLiteError;
   }
 
@@ -142,18 +137,17 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   // In a real application, we'd calculate this table once in an initialization
   // function and store it for repeated reuse.
   // q1.15 format.
-  int16_t window_function[kInputSize];
+  int16_t window_function[kMaxAudioSampleSize];
   CalculatePeriodicHann(input_size, window_function);
 
   // Apply the window function to our time series input, and pad it with zeroes
   // to the next power of two.
-  int32_t fixed_input[kInputSize];
-  for (int i = 0; i < kInputSize; ++i) {
+  int32_t fixed_input[kMaxAudioSampleSize];
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
     if (i < input_size) {
       // input is int16_t.  Treat as q1.15 fixed point value in range [-1,1)
       // window_function is also q1.15 fixed point number
-      fixed_input[i] =
-          Q1_15_FixedMultiply_Q2_30(input[i], window_function[i]);
+      fixed_input[i] = Q1_15_FixedMultiply_Q2_30(input[i], window_function[i]);
     } else {
       fixed_input[i] = 0;
     }
@@ -161,31 +155,31 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
 
   // Pull the frequency data from the time series sample.
   // Calculated in q10.22 format from q2.30 inputs.
-  int32_t fourier_values[kInputSize];
-  CalculateDiscreteFourierTransform(fixed_input, kInputSize, fourier_values);
+  int32_t fourier_values[kMaxAudioSampleSize];
+  CalculateDiscreteFourierTransform(fixed_input, kMaxAudioSampleSize,
+                                    fourier_values);
 
   // We have the complex numbers giving us information about each frequency
   // band, but all we want to know is how strong each frequency is, so calculate
   // the squared magnitude by adding together the squares of each component.
-  int32_t power_spectrum[kInputSize / 2];
-  for (int i = 0; i < (kInputSize / 2); ++i) {
+  int32_t power_spectrum[kMaxAudioSampleSize / 2];
+  for (int i = 0; i < (kMaxAudioSampleSize / 2); ++i) {
     const int32_t real = fourier_values[(i * 2) + 0];
     const int32_t imaginary = fourier_values[(i * 2) + 1];
     // q10.22 results
-    power_spectrum[i] =
-        Q10_22_FixedMultiply_Q10_22(real, real) +
-        Q10_22_FixedMultiply_Q10_22(imaginary, imaginary);
+    power_spectrum[i] = Q10_22_FixedMultiply_Q10_22(real, real) +
+                        Q10_22_FixedMultiply_Q10_22(imaginary, imaginary);
   }
 
   // Finally, reduce the size of the output by averaging together six adjacent
   // frequencies into each slot, producing an array of 43 values.
   // Power_spectrum numbers are q10.22.  Divide by kAverageWindowSize inside
   // loop to prevent overflow.
-  for (int i = 0; i < kOutputSize; ++i) {
+  for (int i = 0; i < kFeatureSliceSize; ++i) {
     int32_t average = 0;
     for (int j = 0; j < kAverageWindowSize; ++j) {
       const int index = (i * kAverageWindowSize) + j;
-      if (index < (kInputSize / 2)) {
+      if (index < (kMaxAudioSampleSize / 2)) {
         average += power_spectrum[index] / kAverageWindowSize;
       }
     }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1890c25cf2b44c96c549757b31f88255d4a9ee09
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/main.cc
@@ -0,0 +1,112 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/feature_provider.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/tiny_conv_model_data.h"
+#include "tensorflow/lite/experimental/micro/kernels/all_ops_resolver.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/micro_interpreter.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+int main(int argc, char* argv[]) {
+  // Set up logging.
+  tflite::MicroErrorReporter micro_error_reporter;
+  tflite::ErrorReporter* error_reporter = &micro_error_reporter;
+
+  // Map the model into a usable data structure. This doesn't involve any
+  // copying or parsing, it's a very lightweight operation.
+  const tflite::Model* model = ::tflite::GetModel(g_tiny_conv_model_data);
+  if (model->version() != TFLITE_SCHEMA_VERSION) {
+    error_reporter->Report(
+        "Model provided is schema version %d not equal "
+        "to supported version %d.\n",
+        model->version(), TFLITE_SCHEMA_VERSION);
+    return 1;
+  }
+
+  // This pulls in all the operation implementations we need.
+  tflite::ops::micro::AllOpsResolver resolver;
+
+  // Create an area of memory to use for input, output, and intermediate arrays.
+  // The size of this will depend on the model you're using, and may need to be
+  // determined by experimentation.
+  const int tensor_arena_size = 10 * 1024;
+  uint8_t tensor_arena[tensor_arena_size];
+  tflite::SimpleTensorAllocator tensor_allocator(tensor_arena,
+                                                 tensor_arena_size);
+
+  // Build an interpreter to run the model with.
+  tflite::MicroInterpreter interpreter(model, resolver, &tensor_allocator,
+                                       error_reporter);
+
+  // Get information about the memory area to use for the model's input.
+  TfLiteTensor* model_input = interpreter.input(0);
+  if ((model_input->dims->size != 4) || (model_input->dims->data[0] != 1) ||
+      (model_input->dims->data[1] != kFeatureSliceCount) ||
+      (model_input->dims->data[2] != kFeatureSliceSize) ||
+      (model_input->type != kTfLiteUInt8)) {
+    error_reporter->Report("Bad input tensor parameters in model");
+    return 1;
+  }
+
+  // Prepare to access the audio spectrograms from a microphone or other source
+  // that will provide the inputs to the neural network.
+  FeatureProvider feature_provider(kFeatureElementCount,
+                                   model_input->data.uint8);
+
+  // Keep reading and analysing audio data in an infinite loop.
+  while (true) {
+    // Fetch the spectrogram for the current time.
+    int how_many_new_slices = 0;
+    TfLiteStatus feature_status = feature_provider.PopulateFeatureData(
+        error_reporter, &how_many_new_slices);
+    if (feature_status != kTfLiteOk) {
+      error_reporter->Report("Feature generation failed");
+      return 1;
+    }
+    // If no new audio samples have been received since last time, don't bother
+    // running the network model.
+    if (how_many_new_slices == 0) {
+      continue;
+    }
+
+    // Run the model on the spectrogram input and make sure it succeeds.
+    TfLiteStatus invoke_status = interpreter.Invoke();
+    if (invoke_status != kTfLiteOk) {
+      error_reporter->Report("Invoke failed");
+      return 1;
+    }
+
+    // The output from the model is a vector containing the scores for each
+    // kind of prediction, so figure out what the highest scoring category was.
+    TfLiteTensor* output = interpreter.output(0);
+    uint8_t top_category_score = 0;
+    int top_category_index = 0;
+    for (int category_index = 0; category_index < kCategoryCount;
+         ++category_index) {
+      const uint8_t category_score = output->data.uint8[category_index];
+      if (category_score > top_category_score) {
+        top_category_score = category_score;
+        top_category_index = category_index;
+      }
+    }
+
+    error_reporter->Report("Heard %s", kCategoryLabels[top_category_index]);
+  }
+
+  return 0;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b9b8fb37b19d384fe92edf8ce2292aee19b99b7f
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.cc
@@ -0,0 +1,23 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
+
+const char* kCategoryLabels[kCategoryCount] = {
+    "silence",
+    "unknown",
+    "yes",
+    "no",
+};
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d8f3123a57bc5b807d39151adaf64f29d2f5f95
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
+
+// Keeping these as constant expressions allow us to allocate fixed-sized arrays
+// on the stack for our working memory.
+
+// The size of the input time series data we pass to the FFT to produce the
+// frequency information. This has to be a power of two, and since we're dealing
+// with 30ms of 16KHz inputs, which means 480 samples, this is the next value.
+constexpr int kMaxAudioSampleSize = 512;
+
+// All of these values are derived from the values used during model training,
+// if you change your model you'll need to update these constants.
+constexpr int kAverageWindowSize = 6;
+constexpr int kFeatureSliceSize =
+    ((kMaxAudioSampleSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+constexpr int kFeatureSliceCount = 49;
+constexpr int kFeatureElementCount = (kFeatureSliceSize * kFeatureSliceCount);
+constexpr int kFeatureSliceStrideMs = 20;
+constexpr int kFeatureSliceDurationMs = 30;
+
+constexpr int kCategoryCount = 4;
+constexpr int kSilenceIndex = 0;
+constexpr int kUnknownIndex = 1;
+extern const char* kCategoryLabels[kCategoryCount];
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_MODEL_SETTINGS_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
index 12f9e22038bafa2b4d960b9a9d7b6bcad452bf4c..f4a7f801cc6251b82339509f691fd64012fbe390 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.cc
@@ -28,14 +28,9 @@ limitations under the License.
 
 #include <cmath>
 
-namespace {
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/model_settings.h"
 
-// These constants allow us to allocate fixed-sized arrays on the stack for our
-// working memory.
-constexpr int kInputSize = 512;
-constexpr int kAverageWindowSize = 6;
-constexpr int kOutputSize =
-    ((kInputSize / 2) + (kAverageWindowSize - 1)) / kAverageWindowSize;
+namespace {
 
 // Performs a discrete Fourier transform on the real inputs. This corresponds to
 // rdft() in the FFT package at http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html,
@@ -78,27 +73,27 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
                         const int16_t* input, int input_size, int output_size,
                         uint8_t* output) {
   // Ensure our input and output data arrays are valid.
-  if (input_size > kInputSize) {
+  if (input_size > kMaxAudioSampleSize) {
     error_reporter->Report("Input size %d larger than %d", input_size,
-                           kInputSize);
+                           kMaxAudioSampleSize);
     return kTfLiteError;
   }
-  if (output_size != kOutputSize) {
+  if (output_size != kFeatureSliceSize) {
     error_reporter->Report("Requested output size %d doesn't match %d",
-                           output_size, kOutputSize);
+                           output_size, kFeatureSliceSize);
     return kTfLiteError;
   }
 
   // Pre-calculate the window function we'll be applying to the input data.
   // In a real application, we'd calculate this table once in an initialization
   // function and store it for repeated reuse.
-  float window_function[kInputSize];
+  float window_function[kMaxAudioSampleSize];
   CalculatePeriodicHann(input_size, window_function);
 
   // Apply the window function to our time series input, and pad it with zeroes
   // to the next power of two.
-  float float_input[kInputSize];
-  for (int i = 0; i < kInputSize; ++i) {
+  float float_input[kMaxAudioSampleSize];
+  for (int i = 0; i < kMaxAudioSampleSize; ++i) {
     if (i < input_size) {
       float_input[i] =
           (input[i] * window_function[i]) / static_cast<float>(1 << 15);
@@ -108,14 +103,15 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
   }
 
   // Pull the frequency data from the time series sample.
-  float fourier_values[kInputSize];
-  CalculateDiscreteFourierTransform(float_input, kInputSize, fourier_values);
+  float fourier_values[kMaxAudioSampleSize];
+  CalculateDiscreteFourierTransform(float_input, kMaxAudioSampleSize,
+                                    fourier_values);
 
   // We have the complex numbers giving us information about each frequency
   // band, but all we want to know is how strong each frequency is, so calculate
   // the squared magnitude by adding together the squares of each component.
-  float power_spectrum[kInputSize / 2];
-  for (int i = 0; i < (kInputSize / 2); ++i) {
+  float power_spectrum[kMaxAudioSampleSize / 2];
+  for (int i = 0; i < (kMaxAudioSampleSize / 2); ++i) {
     const float real = fourier_values[(i * 2) + 0];
     const float imaginary = fourier_values[(i * 2) + 1];
     power_spectrum[i] = (real * real) + (imaginary * imaginary);
@@ -123,11 +119,11 @@ TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
 
   // Finally, reduce the size of the output by averaging together six adjacent
   // frequencies into each slot, producing an array of 43 values.
-  for (int i = 0; i < kOutputSize; ++i) {
+  for (int i = 0; i < kFeatureSliceSize; ++i) {
     float total = 0.0f;
     for (int j = 0; j < kAverageWindowSize; ++j) {
       const int index = (i * kAverageWindowSize) + j;
-      if (index < (kInputSize / 2)) {
+      if (index < (kMaxAudioSampleSize / 2)) {
         total += power_spectrum[index];
       }
     }
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
index dede2a864219c3fd677cf1449b8ce51e70bdc976..adff790d6cc527578dbfb9dc481c99c1021b92db 100644
--- a/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/preprocessor.h
@@ -19,6 +19,11 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
 
+// Converts audio sample data into a more compact form that's appropriate for
+// feeding into a neural network. There are reference implementations that use
+// both floating point and fixed point available, but because the calculations
+// involved can be time-consuming, it's recommended that you use or write
+// specialized versions for your platform.
 TfLiteStatus Preprocess(tflite::ErrorReporter* error_reporter,
                         const int16_t* input, int input_size, int output_size,
                         uint8_t* output);
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c96a61ab517487413e875dc7369bddb1c9a0d9a
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.cc
@@ -0,0 +1,22 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+int32_t TimeInMilliseconds() {
+  static int current_time = 0;
+  current_time += 100;
+  return current_time;
+}
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..162952844a832ebd0b0273d13a929fec6fa22892
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer.h
@@ -0,0 +1,31 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+#define TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
+
+#include <cstdint>
+
+// Returns the time in milliseconds. There's no contract about what time zero
+// represents, the accuracy, or the granularity of the result. Subsequent calls
+// will generally not return a lower value, but even that's not guaranteed if
+// there's an overflow  wraparound.
+// The reference implementation of this function just returns a constantly
+// incrementing value for each call, since it would need a non-portable platform
+// call to access time information. For real applications, you'll need to write
+// your own platform-specific implementation.
+int32_t TimeInMilliseconds();
+
+#endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_EXAMPLES_MICRO_SPEECH_TIMER_H_
diff --git a/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0487a12b25fc17208f1d9ab2b51538102f7ec914
--- /dev/null
+++ b/tensorflow/lite/experimental/micro/examples/micro_speech/timer_test.cc
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/experimental/micro/examples/micro_speech/timer.h"
+
+#include <limits>
+
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/experimental/micro/micro_error_reporter.h"
+#include "tensorflow/lite/experimental/micro/testing/micro_test.h"
+
+TF_LITE_MICRO_TESTS_BEGIN
+
+TF_LITE_MICRO_TEST(TestTimer) {
+  // Make sure that the technically-undefined overflow behavior we rely on below
+  // works on this platform. It's still not guaranteed, but at least this is a
+  // sanity check.  Turn off when running with ASan, as it will complain about
+  // the following undefined behavior.
+#ifndef ADDRESS_SANITIZER
+  int32_t overflow_value = std::numeric_limits<int32_t>::max();
+  overflow_value += 1;
+  TF_LITE_MICRO_EXPECT_EQ(std::numeric_limits<int32_t>::min(), overflow_value);
+#endif
+
+  const int32_t first_time = TimeInMilliseconds();
+  const int32_t second_time = TimeInMilliseconds();
+
+  // It's possible that the timer may have wrapped around from +BIG_NUM to
+  // -BIG_NUM between the first and second calls, since we're storing
+  // milliseconds in a 32-bit integer. It's not reasonable that the call itself
+  // would have taken more than 2^31 milliseconds though, so look at the
+  // difference and rely on integer overflow to ensure it's accurate.
+  const int32_t time_delta = (second_time - first_time);
+  TF_LITE_MICRO_EXPECT_LE(0, time_delta);
+}
+
+TF_LITE_MICRO_TESTS_END
diff --git a/tensorflow/lite/experimental/micro/testing/micro_test.h b/tensorflow/lite/experimental/micro/testing/micro_test.h
index 10bab05faec9fd93c91cc8479ed6f72e03600917..2f20dd5ac77dfd3f304c7cc93be0b865a0c2f0cb 100644
--- a/tensorflow/lite/experimental/micro/testing/micro_test.h
+++ b/tensorflow/lite/experimental/micro/testing/micro_test.h
@@ -153,4 +153,22 @@ extern tflite::ErrorReporter* reporter;
     }                                                                        \
   } while (false)
 
+#define TF_LITE_MICRO_EXPECT_GE(x, y)                                         \
+  do {                                                                        \
+    if ((x) < (y)) {                                                          \
+      micro_test::reporter->Report(#x " >= " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
+#define TF_LITE_MICRO_EXPECT_LE(x, y)                                         \
+  do {                                                                        \
+    if ((x) > (y)) {                                                          \
+      micro_test::reporter->Report(#x " <= " #y " failed at %s:%d", __FILE__, \
+                                   __LINE__);                                 \
+      micro_test::did_test_fail = true;                                       \
+    }                                                                         \
+  } while (false)
+
 #endif  // TENSORFLOW_LITE_EXPERIMENTAL_MICRO_TESTING_MICRO_TEST_H_
diff --git a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
index 020d40bc13c1bbd3d950bd505508ef654ac756ec..3ce861707fda767a3ec1c6e2d23e6a70c6131f24 100644
--- a/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
+++ b/tensorflow/lite/experimental/microfrontend/python/kernel_tests/audio_microfrontend_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import tensorflow as tf
 
 from tensorflow.lite.experimental.microfrontend.python.ops import audio_microfrontend_op as frontend_op
+from tensorflow.python.framework import test_util
 
 SAMPLE_RATE = 1000
 WINDOW_SIZE = 25
@@ -33,6 +34,7 @@ SMOOTHING_BITS = 10
 
 class AudioFeatureGenerationTest(tf.test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testSimple(self):
     with self.test_session():
       audio = tf.constant(
@@ -51,6 +53,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
       self.assertAllEqual(filterbanks.eval(),
                           [[479, 425], [436, 378], [410, 350], [391, 325]])
 
+  @test_util.run_v1_only("b/120545219")
   def testSimpleFloatScaled(self):
     with self.test_session():
       audio = tf.constant(
@@ -72,6 +75,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
                           [[7.484375, 6.640625], [6.8125, 5.90625],
                            [6.40625, 5.46875], [6.109375, 5.078125]])
 
+  @test_util.run_v1_only("b/120545219")
   def testStacking(self):
     with self.test_session():
       audio = tf.constant(
@@ -110,10 +114,11 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
           left_context=1,
           right_context=1)
       self.assertAllEqual(
-          filterbanks.eval(),
+          self.evaluate(filterbanks),
           [[479, 425, 479, 425, 436, 378], [479, 425, 436, 378, 410, 350],
            [436, 378, 410, 350, 391, 325], [410, 350, 391, 325, 391, 325]])
 
+  @test_util.run_v1_only("b/120545219")
   def testStackingDropFrame(self):
     with self.test_session():
       audio = tf.constant(
@@ -153,7 +158,7 @@ class AudioFeatureGenerationTest(tf.test.TestCase):
           frame_stride=3,
           zero_padding=True)
       self.assertAllEqual(
-          filterbanks.eval(),
+          self.evaluate(filterbanks),
           [[0, 0, 0, 0, 479, 425], [436, 378, 410, 350, 391, 325],
            [374, 308, 362, 292, 352, 275]])
 
diff --git a/tensorflow/lite/experimental/writer/BUILD b/tensorflow/lite/experimental/writer/BUILD
index 506c668cf2c70f1e294bcf2039fbb88ec9c4fd96..57ce63636714aa616cb50e04fe2c15210cc2eb1c 100644
--- a/tensorflow/lite/experimental/writer/BUILD
+++ b/tensorflow/lite/experimental/writer/BUILD
@@ -1,6 +1,9 @@
-package(default_visibility = [
-    "//visibility:public",
-])
+package(
+    default_visibility = [
+        "//visibility:public",
+    ],
+    features = ["-parse_headers"],
+)
 
 licenses(["notice"])  # Apache 2.0
 
diff --git a/tensorflow/lite/experimental/writer/option_writer_generator.cc b/tensorflow/lite/experimental/writer/option_writer_generator.cc
index 036809e94abcfc20df315b973c855152f923181b..fa360a2f47e3dba34e05d2e32616821294f0e678 100644
--- a/tensorflow/lite/experimental/writer/option_writer_generator.cc
+++ b/tensorflow/lite/experimental/writer/option_writer_generator.cc
@@ -56,6 +56,7 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteTransposeParams",
                                       "TfLiteReducerParams",
                                       "TfLiteSplitParams",
+                                      "TfLiteSplitVParams",
                                       "TfLiteSqueezeParams",
                                       "TfLiteStridedSliceParams",
                                       "TfLiteArgMaxParams",
@@ -66,6 +67,8 @@ static const char* param_structs[] = {"TfLiteConvParams",
                                       "TfLiteFakeQuantParams",
                                       "TfLitePackParams",
                                       "TfLiteOneHotParams",
+                                      "TfLiteLeakyReluParams",
+                                      "TfLiteMirrorPaddingParams",
                                       nullptr};
 }  // namespace
 
@@ -152,6 +155,7 @@ class OpOptionData {
     op_to_option_["BIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
     op_to_option_["UNIDIRECTIONAL_SEQUENCE_RNN"] = "SequenceRNNOptions";
+    op_to_option_["MIRROR_PAD"] = "";  // TODO(karimnosseir): MirrorPadOptions.
     // Manually specified mappings between ops and options (none)
     op_to_option_["EMBEDDING_LOOKUP"] =
         "";  // TODO(aselle): maybe something else.
diff --git a/tensorflow/lite/g3doc/_book.yaml b/tensorflow/lite/g3doc/_book.yaml
index ab0d186848fcac1f11ddfe3b55e4ffa2292b8395..36bf4f4618c42f4e56ce79b73c50c0454644a26d 100644
--- a/tensorflow/lite/g3doc/_book.yaml
+++ b/tensorflow/lite/g3doc/_book.yaml
@@ -3,11 +3,11 @@ upper_tabs:
 - include: /_upper_tabs_left.yaml
 - include: /api_docs/_upper_tabs_api.yaml
 # Dropdown menu
-- name: Ecosystem
-  path: /ecosystem
+- name: Resources
+  path: /resources
   is_default: true
   menu:
-  - include: /ecosystem/_menu_toc.yaml
+  - include: /resources/_menu_toc.yaml
   lower_tabs:
     # Subsite tabs
     other:
@@ -82,3 +82,5 @@ upper_tabs:
       contents:
       - title: API
         path: /api_docs/python/tf/lite
+
+- include: /_upper_tabs_right.yaml
diff --git a/tensorflow/lite/g3doc/_index.yaml b/tensorflow/lite/g3doc/_index.yaml
index 093f86b54202c3a705649b2a2d07d6ca7bceb020..1b3f1d616ae953e3c6a659301d7a7dd6860dcbf2 100644
--- a/tensorflow/lite/g3doc/_index.yaml
+++ b/tensorflow/lite/g3doc/_index.yaml
@@ -189,7 +189,7 @@ landing_page:
       - label: Read more
         path: https://cloud.google.com/blog/products/ai-machine-learning/ai-motion-designing-simple-system-see-understand-and-react-real-world-part-ii
     - heading: "Introducing the Model Optimization Toolkit"
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
+      image_path: /resources/images/tf-logo-card-16x9.png
       path: https://medium.com/tensorflow/introducing-the-model-optimization-toolkit-for-tensorflow-254aca1ba0a3
       buttons:
       - label: Read on TensorFlow blog
@@ -205,7 +205,7 @@ landing_page:
     background: grey
     items:
     - heading: "Using TensorFlow Lite on Android"
-      image_path: /ecosystem/images/tf-logo-card-16x9.png
+      image_path: /resources/images/tf-logo-card-16x9.png
       path: https://medium.com/tensorflow/using-tensorflow-lite-on-android-9bbc9cb7d69d
       buttons:
       - label: Read on TensorFlow blog
@@ -216,7 +216,7 @@ landing_page:
       - label: Watch the video
         path: https://www.youtube.com/watch?v=FAMfy7izB6A
     - heading: "TensorFlow Lite on GitHub"
-      image_path: /ecosystem/images/github-card-16x9.png
+      image_path: /resources/images/github-card-16x9.png
       path: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite
       buttons:
       - label: View on GitHub
diff --git a/tensorflow/lite/g3doc/apis.md b/tensorflow/lite/g3doc/apis.md
index e9fa24bff1d1a3d2b8e6a62f061245289afabcd1..b15159ce4145727863c335126557e06402f8dbd3 100644
--- a/tensorflow/lite/g3doc/apis.md
+++ b/tensorflow/lite/g3doc/apis.md
@@ -304,6 +304,13 @@ one of the following primitive types:
 *   `long`
 *   `byte`
 
+`String` types are also supported, but they are encoded differently than the
+primitive types. In particular, the shape of a string Tensor dictates the number
+and arrangement of strings in the Tensor, with each element itself being a
+variable length string. In this sense, the (byte) size of the Tensor cannot be
+computed from the shape and type alone, and consequently strings cannot be
+provided as a single, flat `ByteBuffer` argument.
+
 If other data types, including boxed types like `Integer` and `Float`, are used,
 an `IllegalArgumentException` will be thrown.
 
@@ -345,13 +352,12 @@ interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
 ```
 
 where each entry in `inputs` corresponds to an input tensor and
-`map_of_indices_to_outputs` maps indices of output tensors to the
-corresponding output data. In both cases the tensor indices should correspond to
-the values given to the [TensorFlow Lite Optimized Converter](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco/g3doc/cmdline_examples.md)
+`map_of_indices_to_outputs` maps indices of output tensors to the corresponding
+output data. In both cases the tensor indices should correspond to the values
+given to the [TensorFlow Lite Optimized Converter](convert/cmdline_examples.md)
 when the model was created. Be aware that the order of tensors in `input` must
 match the order given to the `TensorFlow Lite Optimized Converter`.
 
-
 The Java API also provides convenient functions for app developers to get the
 index of any model input or output using a tensor name:
 
diff --git a/tensorflow/lite/g3doc/convert/cmdline_examples.md b/tensorflow/lite/g3doc/convert/cmdline_examples.md
index 59f26b35051ce2ec410e25a5c877344ffe96dc45..de81e2cfdd41d6232ee1b76985a2e7dc9167e88f 100644
--- a/tensorflow/lite/g3doc/convert/cmdline_examples.md
+++ b/tensorflow/lite/g3doc/convert/cmdline_examples.md
@@ -94,11 +94,12 @@ tflite_convert \
 ### Convert a TensorFlow GraphDef for quantized inference <a name="graphdef_quant"></a>
 
 The TensorFlow Lite Converter is compatible with fixed point quantization models
-described [here](https://www.tensorflow.org/performance/quantization). These are
-float models with `FakeQuant*` ops inserted at the boundaries of fused layers
-to record min-max range information. This generates a quantized inference
-workload that reproduces the quantization behavior that was used during
-training.
+described
+[here](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/contrib/q
+uantize/README.md). These are float models with `FakeQuant*` ops inserted at the
+boundaries of fused layers to record min-max range information. This generates a
+quantized inference workload that reproduces the quantization behavior that was
+used during training.
 
 The following command generates a quantized TensorFlow Lite FlatBuffer from a
 "quantized" TensorFlow GraphDef.
diff --git a/tensorflow/lite/g3doc/convert/index.md b/tensorflow/lite/g3doc/convert/index.md
index bc92a1c1a11a6f3808e44f37d04704ece1627fc3..60fa265c295174453b1a910f5279807dd0be32cb 100644
--- a/tensorflow/lite/g3doc/convert/index.md
+++ b/tensorflow/lite/g3doc/convert/index.md
@@ -6,14 +6,20 @@ file used by the TensorFlow Lite interpreter.
 ## From model training to device deployment
 
 After a TensorFlow model is trained, the TensorFlow Lite converter uses that
-model to generate a TensorFlow Lite [FlatBuffer](https://google.github.io/flatbuffers/)
-file (`.tflite`). The converter supports as input:
+model to generate a TensorFlow Lite
+[FlatBuffer](https://google.github.io/flatbuffers/) file (`.tflite`). The
+converter supports as input:
 [SavedModels](https://www.tensorflow.org/guide/saved_model#using_savedmodel_with_estimators),
 frozen graphs (models generated by
 [freeze_graph.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/tools/freeze_graph.py)),
-and `tf.keras` models.  The TensorFlow Lite `FlatBuffer` file is deployed to a
-client device (generally a mobile or embedded device), and the TensorFlow Lite
+and `tf.keras` HDF5 models. The TensorFlow Lite `FlatBuffer` file is deployed to
+a client device (generally a mobile or embedded device), and the TensorFlow Lite
 interpreter uses the compressed model for on-device inference. This conversion
 process is shown in the diagram below:
 
 ![TFLite converter workflow](../images/convert/workflow.svg)
+
+The TensorFlow Lite Converter can be used either from [Python](python_api.md) or
+from the [command line](cmdline_examples.md). This allows you to integrate the
+conversion step into the model design workflow, ensuring the model is easy to
+convert to a mobile inference graph.
diff --git a/tensorflow/lite/g3doc/convert/python_api.md b/tensorflow/lite/g3doc/convert/python_api.md
index 4bdf0d8cbe8f5732cc9d3b744d3e1820bb42bd5d..4d2c7361c9f399848c161ccc706c71894625725d 100644
--- a/tensorflow/lite/g3doc/convert/python_api.md
+++ b/tensorflow/lite/g3doc/convert/python_api.md
@@ -3,10 +3,9 @@
 This page provides examples on how to use the TensorFlow Lite Converter and the
 TensorFlow Lite interpreter using the Python API.
 
-Note: TFLite recently moved from `tf.contrib.lite` to `tf.lite`. If you are
-using tensorflow `r1.12` or earlier you will need to add `.contrib` to the
-commands below. `tf.lite` works with newer builds, like the nightly build,
-which can be installed with: `pip install tf-nightly`
+Note: These docs describe the converter in the TensorFlow nightly release,
+installed using `pip install tf-nightly`. For docs describing older versions
+reference ["Converting models from TensorFlow 1.12"](#pre_tensorflow_1.12).
 
 [TOC]
 
@@ -20,14 +19,9 @@ be targeted to devices with mobile.
 
 ## API
 
-The API for converting TensorFlow models to TensorFlow Lite as of TensorFlow 1.9
-is `tf.lite.TFLiteConverter`. The API for calling the Python intepreter
-is `tf.lite.Interpreter`.
-
-Note: Reference "Additional Instructions" sections for converting TensorFlow
-models to TensorFlow Lite
-[in TensorFlow 1.9 to TensorFlow 1.11](#pre_tensorflow_1.11) and
-[prior to TensorFlow 1.9](#pre_tensorflow_1.9)
+The API for converting TensorFlow models to TensorFlow Lite is
+`tf.lite.TFLiteConverter`. The API for calling the Python interpreter is
+`tf.lite.Interpreter`.
 
 `TFLiteConverter` provides class methods based on the original format of the
 model. `TFLiteConverter.from_session()` is available for GraphDefs.
@@ -250,14 +244,13 @@ either install the nightly build with
 [Docker](https://www.tensorflow.org/install/docker), or
 [build the pip package from source](https://www.tensorflow.org/install/source).
 
-### Converting models in TensorFlow 1.9 to TensorFlow 1.11 <a name="pre_tensorflow_1.11"></a>
-
-To convert TensorFlow models to TensorFlow Lite in TensorFlow 1.9 through
-TensorFlow 1.11, use `TocoConverter`. `TocoConverter` is semantically
-identically to `TFLiteConverter`.
+### Converting models from TensorFlow 1.12 <a name="pre_tensorflow_1.12"></a>
 
-### Converting models prior to TensorFlow 1.9 <a name="pre_tensorflow_1.9"></a>
+Reference the following table to convert TensorFlow models to TensorFlow Lite in
+and before TensorFlow 1.12. Run `help()` to get details of each API.
 
-To convert TensorFlow models to TensorFlow Lite in TensorFlow 1.7 and TensorFlow
-1.8, use the `toco_convert` function. Run `help(tf.lite.toco_convert)`
-to get details about accepted parameters.
+TensorFlow Version | Python API
+------------------ | ---------------------------------
+1.12               | `tf.contrib.lite.TFLiteConverter`
+1.9-1.11           | `tf.contrib.lite.TocoConverter`
+1.7-1.8            | `tf.contrib.lite.toco_convert`
diff --git a/tensorflow/lite/g3doc/devguide.md b/tensorflow/lite/g3doc/devguide.md
index 270cb8ce378a2bb7e4c0f8f93bce30ac6e740045..fdd02638f9b78e05e77cfeb22644bfb37878a580 100644
--- a/tensorflow/lite/g3doc/devguide.md
+++ b/tensorflow/lite/g3doc/devguide.md
@@ -35,7 +35,7 @@ by suggesting contextually relevant messages. The model is built specifically fo
 memory constrained devices, such as watches and phones, and has been successfully
 used in Smart Replies on Android Wear. Currently, this model is Android-specific.
 
-These pre-trained models are [available for download](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/models.md)
+These pre-trained models are [available for download](models.md).
 
 ### Re-train Inception-V3 or MobileNet for a custom data set
 
@@ -57,51 +57,59 @@ A developer may choose to train a custom model using Tensorflow (see the
 [TensorFlow tutorials](../tutorials/) for examples of building and training
 models). If you have already written a model, the first step is to export this
 to a `tf.GraphDef` file. This is required because some formats do not store the
-model structure outside the code, and we must communicate with other parts of the
-framework. See
-[Exporting the Inference Graph](https://github.com/tensorflow/models/blob/master/research/slim/README.md)
-to create .pb file for the custom model.
+model structure outside the code, and we must communicate with other parts of
+the framework. See
+[Exporting the Inference Graph](https://www.tensorflow.org/tutorials/keras/save_and_restore_models#save_the_entire_model)
+to create file for the custom model.
 
-TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to the
-[TensorFlow Lite & TensorFlow Compatibility Guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/g3doc/tf_ops_compatibility.md)
+TensorFlow Lite currently supports a subset of TensorFlow operators. Refer to
+the [TensorFlow Lite & TensorFlow Compatibility Guide](tf_ops_compatibility.md)
 for supported operators and their usage. This set of operators will continue to
 grow in future Tensorflow Lite releases.
 
-
 ## 2. Convert the model format
 
-The model generated (or downloaded) in the previous step is a *standard*
-Tensorflow model and you should now have a .pb or .pbtxt `tf.GraphDef` file.
-Models generated with transfer learning (re-training) or custom models must be
-converted—but, we must first freeze the graph to convert the model to the
-Tensorflow Lite format. This process uses several model formats:
-
-* `tf.GraphDef` (.pb) —A protobuf that represents the TensorFlow training or
-  computation graph. It contains operators, tensors, and variables definitions.
-* *CheckPoint* (.ckpt) —Serialized variables from a TensorFlow graph. Since this
-  does not contain a graph structure, it cannot be interpreted by itself.
-* `FrozenGraphDef` —A subclass of `GraphDef` that does not contain
-  variables. A `GraphDef` can be converted to a `FrozenGraphDef` by taking a
-  CheckPoint and a `GraphDef`, and converting each variable into a constant
-  using the value retrieved from the CheckPoint.
-* `SavedModel` —A `GraphDef` and CheckPoint with a signature that labels
-  input and output arguments to a model. A `GraphDef` and CheckPoint can be
-  extracted from a `SavedModel`.
-* *TensorFlow Lite model* (.tflite) —A serialized
-  [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
-  Lite operators and tensors for the TensorFlow Lite interpreter, similar to a
-  `FrozenGraphDef`.
-
-### Freeze Graph
-
-To use the `GraphDef` .pb file with TensorFlow Lite, you must have checkpoints
-that contain trained weight parameters. The .pb file only contains the structure
-of the graph. The process of merging the checkpoint values with the graph
-structure is called *freezing the graph*.
-
-You should have a checkpoints folder or download them for a pre-trained model
-(for example,
-[MobileNets](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md)).
+The [TensorFlow Lite Converter](convert/index.md) accepts the following file
+formats:
+
+*   `SavedModel` — A `GraphDef` and checkpoint with a signature that labels
+    input and output arguments to a model. See the documentation for converting
+    SavedModels using [Python](convert/python_api.md#basic_savedmodel) or using
+    the [command line](convert/cmdline_examples.md#savedmodel).
+*   `tf.keras` - A HDF5 file containing a model with weights and input and
+    output arguments generated by `tf.Keras`. See the documentation for
+    converting HDF5 models using
+    [Python](convert/python_api.md#basic_keras_file) or using the
+    [command line](convert/cmdline_examples.md#keras).
+*   `frozen tf.GraphDef` — A subclass of `tf.GraphDef` that does not contain
+    variables. A `GraphDef` can be converted to a `frozen GraphDef` by taking a
+    checkpoint and a `GraphDef`, and converting each variable into a constant
+    using the value retrieved from the checkpoint. Instructions on converting a
+    `tf.GraphDef` to a TensorFlow Lite model are described in the next
+    subsection.
+
+### Converting a tf.GraphDef
+
+TensorFlow models may be saved as a .pb or .pbtxt `tf.GraphDef` file. In order
+to convert the `tf.GraphDef` file to TensorFlow Lite, the model must first be
+frozen. This process invovles several file formats including the `frozen
+GraphDef`:
+
+*   `tf.GraphDef` (.pb or .pbtxt) — A protobuf that represents the TensorFlow
+    training or computation graph. It contains operators, tensors, and variables
+    definitions.
+*   *checkpoint* (.ckpt) — Serialized variables from a TensorFlow graph. Since
+    this does not contain a graph structure, it cannot be interpreted by itself.
+*   *TensorFlow Lite model* (.tflite) — A serialized
+    [FlatBuffer](https://google.github.io/flatbuffers/) that contains TensorFlow
+    Lite operators and tensors for the TensorFlow Lite interpreter.
+
+You must have checkpoints that contain trained weights. The `tf.GraphDef` file
+only contains the structure of the graph. The process of merging the checkpoint
+values with the graph structure is called *freezing the graph*.
+
+`tf.GraphDef` and checkpoint files for MobileNet models are available
+[here](https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet_v1.md).
 
 To freeze the graph, use the following command (changing the arguments):
 
@@ -113,69 +121,53 @@ freeze_graph --input_graph=/tmp/mobilenet_v1_224.pb \
   --output_node_names=MobileNetV1/Predictions/Reshape_1
 ```
 
-The `input_binary` flag must be enabled so the protobuf is read and written in
-a binary format. Set the `input_graph` and `input_checkpoint` files.
+Set the `input_binary` flag to `True` when reading a binary protobuf, a `.pb`
+file. Set to `False` for a `.pbtxt` file.
 
-The `output_node_names` may not be obvious outside of the code that built the
-model. The easiest way to find them is to visualize the graph, either with
-[TensorBoard](https://codelabs.developers.google.com/codelabs/tensorflow-for-poets-2/#3)
-or `graphviz`.
+Set `input_graph` and `input_checkpoint` to the respective filenames. The
+`output_node_names` may not be obvious outside of the code that built the model.
+The easiest way to find them is to visualize the graph, either with
+[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard) or
+`graphviz`.
 
 The frozen `GraphDef` is now ready for conversion to the `FlatBuffer` format
-(.tflite) for use on Android or iOS devices. For Android, the Tensorflow
-Optimizing Converter tool supports both float and quantized models. To convert
-the frozen `GraphDef` to the .tflite format:
+(.tflite) for use on Android or iOS devices. For Android, the TensorFlow Lite
+Converter tool supports both float and quantized models. To convert the frozen
+`GraphDef` to the .tflite format use a command similar to the following:
 
 ```
-toco --input_file=$(pwd)/mobilenet_v1_1.0_224/frozen_graph.pb \
-  --input_format=TENSORFLOW_GRAPHDEF \
-  --output_format=TFLITE \
+tflite_convert \
   --output_file=/tmp/mobilenet_v1_1.0_224.tflite \
-  --inference_type=FLOAT \
-  --input_type=FLOAT \
+  --graph_def_file=/tmp/mobilenet_v1_0.50_128/frozen_graph.pb \
   --input_arrays=input \
-  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
-  --input_shapes=1,224,224,3
+  --output_arrays=MobilenetV1/Predictions/Reshape_1
 ```
 
-The `input_file` argument should reference the frozen `GraphDef` file
-containing the model architecture. The [frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
-file used here is available for download. `output_file` is where the TensorFlow
-Lite model will get generated. The `input_type` and `inference_type`
-arguments should be set to `FLOAT`, unless converting a
-<a href="https://www.tensorflow.org/performance/quantization">quantized model</a>.
-Setting the `input_array`, `output_array`, and `input_shape` arguments are not as
-straightforward. The easiest way to find these values is to explore the graph
-using Tensorboard. Reuse the arguments for specifying the output nodes for
-inference in the `freeze_graph` step.
-
-It is also possible to use the Tensorflow Optimizing Converter with protobufs
-from either Python or from the command line (see the 
-[toco_from_protos.py](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco/python/toco_from_protos.py)
-example). This allows you to integrate the conversion step into the model design
-workflow, ensuring the model is easily convertible to a mobile inference graph.
-For example:
-
-```python
-import tensorflow as tf
-
-img = tf.placeholder(name="img", dtype=tf.float32, shape=(1, 64, 64, 3))
-val = img + tf.constant([1., 2., 3.]) + tf.constant([1., 4., 4.])
-out = tf.identity(val, name="out")
-
-with tf.Session() as sess:
-  tflite_model = tf.lite.toco_convert(sess.graph_def, [img], [out])
-  open("converteds_model.tflite", "wb").write(tflite_model)
-```
+The
+[frozen_graph.pb](https://storage.googleapis.com/download.tensorflow.org/models/mobilenet_v1_1.0_224_frozen.tgz)
+file used here is available for download. Setting the `input_array` and
+`output_array` arguments is not straightforward. The easiest way to find these
+values is to explore the graph using
+[TensorBoard](https://www.tensorflow.org/guide/summaries_and_tensorboard). Reuse
+the arguments for specifying the output nodes for inference in the
+`freeze_graph` step.
+
+### Full converter reference
 
-For usage, see the Tensorflow Optimizing Converter
-[command-line examples](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/toco/g3doc/cmdline_examples.md).
+The [TensorFlow Lite Converter](convert/index.md) can be
+[Python](convert/python_api.md) or from the
+[command line](convert/cmdline_examples.md). This allows you to integrate the
+conversion step into the model design workflow, ensuring the model is easy to
+convert to a mobile inference graph.
 
-Refer to the
-[Ops compatibility guide](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/g3doc/tf_ops_compatibility.md)
-for troubleshooting help, and if that doesn't help, please
+### Ops compatibility
+
+Refer to the [ops compatibility guide](tf_ops_compatibility.md) for
+troubleshooting help, and if that doesn't help, please
 [file an issue](https://github.com/tensorflow/tensorflow/issues).
 
+### Graph vizualization tool
+
 The [development repo](https://github.com/tensorflow/tensorflow) contains a tool
 to visualize TensorFlow Lite models after conversion. To build the
 [visualize.py](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/tools/visualize.py)
@@ -212,8 +204,8 @@ installing TensorFlow on Android and setting up `bazel` and Android Studio.
 ### iOS
 
 To integrate a TensorFlow model in an iOS app, see the
-[TensorFlow Lite for iOS](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/g3doc/ios.md)
-guide and <a href="./demo_ios.md">iOS demo</a> guide.
+[TensorFlow Lite for iOS](ios.md) guide and <a href="./demo_ios.md">iOS demo</a>
+guide.
 
 #### Core ML support
 
@@ -227,6 +219,5 @@ devices. To use the converter, refer to the
 ### Raspberry Pi
 
 Compile Tensorflow Lite for a Raspberry Pi by following the
-[RPi build instructions](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/rpi.md)
-This compiles a static library file (`.a`) used to build your app. There are
-plans for Python bindings and a demo app.
+[RPi build instructions](rpi.md) This compiles a static library file (`.a`) used
+to build your app. There are plans for Python bindings and a demo app.
diff --git a/tensorflow/lite/g3doc/models.md b/tensorflow/lite/g3doc/models.md
index 537e285490f905730d9aa5fc61faefae6556b7d9..62b3f17c79aa3688011a1452da18e098008f414e 100644
--- a/tensorflow/lite/g3doc/models.md
+++ b/tensorflow/lite/g3doc/models.md
@@ -76,8 +76,11 @@ Mobilenet_V1_1.0_128_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tf
 Mobilenet_V1_1.0_160_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_160_quant.tgz)  | 4.3 Mb     | 66.9%          | 86.7%          | 37.4 ms
 Mobilenet_V1_1.0_192_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_192_quant.tgz)  | 4.3 Mb     | 69.1%          | 88.1%          | 51.9 ms
 Mobilenet_V1_1.0_224_quant  | [paper](https://arxiv.org/pdf/1712.05877.pdf), [tflite&pb](http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz)  | 4.3 Mb     | 70.0%          | 89.0%          | 70.2 ms
-Mobilenet_v2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
-Inception_v3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Mobilenet_V2_1.0_224_quant  | [paper](https://arxiv.org/abs/1806.08342), [tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/mobilenet_v2_1.0_224_quant.tgz)              | 3.4 Mb     | 70.8%          | 89.9%          | 80.3 ms
+Inception_V1_quant          | [paper](https://arxiv.org/abs/1409.4842), [tflite&pb](http://download.tensorflow.org/models/inception_v1_224_quant_20181026.tgz)                          | 6.4 Mb     | 70.1%          | 89.8%          | 154.5 ms
+Inception_V2_quant          | [paper](https://arxiv.org/abs/1512.00567), [tflite&pb](http://download.tensorflow.org/models/inception_v2_224_quant_20181026.tgz)                         | 11 Mb      | 73.5%          | 91.4%          | 235.0 ms
+Inception_V3_quant          | [paper](https://arxiv.org/abs/1806.08342),[tflite&pb](http://download.tensorflow.org/models/tflite_11_05_08/inception_v3_quant.tgz)                       | 23 Mb      | 77.5%          | 93.7%          | 637 ms
+Inception_V4_quant          | [paper](https://arxiv.org/abs/1602.07261), [tflite&pb](http://download.tensorflow.org/models/inception_v4_299_quant_20181026.tgz)                         | 41 Mb      | 79.5%          | 93.9%          | 1250.8 ms
 
 ## Other models
 
diff --git a/tensorflow/lite/g3doc/tf_ops_compatibility.md b/tensorflow/lite/g3doc/tf_ops_compatibility.md
index b0dfb0fed1f7a072487a06c11bddf5545911ffdf..dcfda72137cafbc676dec2fb5dbf5da8ab8cb45a 100644
--- a/tensorflow/lite/g3doc/tf_ops_compatibility.md
+++ b/tensorflow/lite/g3doc/tf_ops_compatibility.md
@@ -1,4 +1,3 @@
-
 # TensorFlow Lite & TensorFlow Compatibility Guide
 
 TensorFlow Lite supports a number of TensorFlow operations used in common
@@ -75,6 +74,7 @@ counterparts:
     0D tensor*
 *   [tf.squeeze](https://www.tensorflow.org/api_docs/python/tf/squeeze) - *as
     long as axis is not provided*
+*   [tf.squared_difference](https://www.tensorflow.org/versions/master/api_docs/python/tf/squared_difference)
 *   [tf.strided_slice](https://www.tensorflow.org/api_docs/python/tf/strided_slice) -
     *as long as ellipsis_mask and new_axis_mask are not used*
 *   [tf.transpose](https://www.tensorflow.org/versions/master/api_docs/python/tf/transpose) -
@@ -139,6 +139,17 @@ following common ops are not supported at the moment:
 The following TensorFlow Lite operations are fully supported and used in place
 of the TensorFlow operations listed above:
 
+**ABS**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: elementwise abs of the input
+}
+```
+
 **ADD**
 
 ```
@@ -154,6 +165,30 @@ Options {
 }
 ```
 
+**ARG_MAX**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of maximum values.
+}
+```
+
+**ARG_MIN**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: A tensor of indices of minium values.
+}
+```
+
 **AVERAGE_POOL_2D**
 
 ```
@@ -280,6 +315,18 @@ Outputs {
 }
 ```
 
+**FILL**
+
+```
+Inputs {
+  0: a 1D tensor
+  1: a 0D (scalar) tensor
+}
+Outputs {
+  0: A tensor of shape `tensor 0` filled with the value in `tensor 1`.
+}
+```
+
 **FLOOR**
 
 ```
@@ -291,6 +338,30 @@ outputs: {
 }
 ```
 
+**FLOOR_DIV**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` divided by `tensor 1`.
+}
+```
+
+**FLOOR_MOD**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: result of computing element-wise floor of `tensor 0` modulo `tensor 1`.
+}
+```
+
 **FULLY_CONNECTED**
 
 ```
@@ -378,6 +449,34 @@ Options {
 }
 ```
 
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha: slope of the activation at x < 0 (provided alpha <= 1)
+}
+```
+
+**LEAKY_RELU**
+
+```
+Inputs {
+  0: a tensor
+}
+Outputs {
+  0: a tensor equivalent to max(input, input * alpha)
+}
+Options {
+  alpha
+}
+```
+
 **LESS**
 
 ```
@@ -421,6 +520,18 @@ Options {
 }
 ```
 
+**LOGICAL_OR**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: a list of tensors.
+}
+Outputs {
+  0: A tensor of logical_or output tensors.
+}
+```
+
 **LOGISTIC**
 
 ```
@@ -498,6 +609,18 @@ Outputs {
 }
 ```
 
+**PACK**
+
+```
+Inputs {
+  0: a list of tensors.
+  1: an integer.
+}
+Outputs {
+  0: A tensor of stacked tensors.
+}
+```
+
 **PAD**
 
 ```
@@ -539,6 +662,35 @@ Outputs {
 }
 ```
 
+**POW**
+
+```
+Inputs {
+  0: a tensor
+  1: a tensor
+}
+Outputs {
+  0: elementwise pow of the input tensors
+}
+```
+
+**RANGE**
+
+```
+Inputs {
+  0: a 0D (scalar) tensor
+  1: a 0D (scalar) tensor
+  2: a 0D (scalar) tensor
+}
+Outputs {
+  0: A 1D tensor of type `dtype` defined by a sequence where `tensor 0` is the
+  start, `tensor 1` is the limit, and `tensor 2` is the delta.
+}
+Options {
+  dtype
+}
+```
+
 **RELU**
 
 ```
@@ -587,6 +739,22 @@ Options {
 }
 ```
 
+**RESIZE_NEAREST_NEIGHBOR**
+
+```
+Inputs {
+  0: a 4D tensor
+  1: a 1D tensor with 2 elements
+}
+Outputs {
+  0: A tensor of type `tensor 0` resized according to `tensor 1` heigh/width values
+  using nearest neighbors interpolation.
+}
+Options {
+  align_corners
+}
+```
+
 **RSQRT**
 
 ```
@@ -698,6 +866,22 @@ Options {
 }
 ```
 
+**SPLIT_V**
+
+```
+Inputs {
+  0: tensor (input)
+  1: 1-D tensor (size_splits)
+  2: 0-D tensor (axis)
+}
+Outputs {
+  0-N: subtensors built from the input tensors
+}
+Options {
+  num_splits: Specifies number of outputs
+}
+```
+
 **SQRT**
 
 ```
@@ -781,66 +965,6 @@ Outputs {
 }
 ```
 
-**POW**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: elementwise pow of the input tensors
-}
-```
-
-**ARG_MAX**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of maximum values.
-}
-```
-
-**ARG_MIN**
-
-```
-Inputs {
-  0: a tensor
-  1: a tensor
-}
-Outputs {
-  0: A tensor of indices of minium values.
-}
-```
-
-**PACK**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: an integer.
-}
-Outputs {
-  0: A tensor of stacked tensors.
-}
-```
-
-**LOGICAL_OR**
-
-```
-Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
-}
-Outputs {
-  0: A tensor of logical_or output tensors.
-}
-```
-
 **UNPACK**
 
 ```
@@ -854,26 +978,26 @@ Outputs {
 }
 ```
 
-**FLOOR_DIV**
+**ZEROS_LIKE**
 
 ```
 Inputs {
-  0: a list of tensors.
-  1: a list of tensors.
+  0: a tensor
 }
 Outputs {
-  0: A tensor of floor_div output tensors.
+  0: A tensor of the same shape and type as x but filled with zeros
 }
 ```
 
-**ZEROS_LIKE**
+**FILL**
 
 ```
 Inputs {
-  0: a tensor
+  0: A Tensor. Must be one of the following types: int32, int64. 1-D. Represents the shape of the output tensor.
+  1: A Tensor. 0-D (scalar). Value to fill the returned tensor.
 }
 Outputs {
-  0: A tensor of the same shape and type as x but filled with zeros
+  0: A tensor of the same type as value (input1).
 }
 ```
 
diff --git a/tensorflow/lite/g3doc/using_select_tf_ops.md b/tensorflow/lite/g3doc/using_select_tf_ops.md
new file mode 100644
index 0000000000000000000000000000000000000000..aa51f58baa4ecf01fbe75d2ce9095bb1a5286ae8
--- /dev/null
+++ b/tensorflow/lite/g3doc/using_select_tf_ops.md
@@ -0,0 +1,249 @@
+# [Experimental] Using TensorFlow Lite with select TensorFlow ops
+
+The TensorFlow Lite builtin op library has grown rapidly, and will continue to
+grow, but there remains a long tail of TensorFlow ops that are not yet natively
+supported by TensorFlow Lite . These unsupported ops can be a point of friction
+in the TensorFlow Lite model conversion process. To that end, the team has
+recently been working on an experimental mechanism for reducing this friction.
+
+This document outlines how to use TensorFlow Lite with select TensorFlow ops.
+*Note that this feature is experimental and is under active development.* As you
+use this feature, keep in mind the [known limitations](#known-limitations), and
+please send feedback about models that work and issues you are facing to
+tflite@tensorflow.org.
+
+TensorFlow Lite will continue to have
+[TensorFlow Lite builtin ops](tf_ops_compatibility.md) optimized for mobile and
+embedded devices. However, TensorFlow Lite models can now use a subset of
+TensorFlow ops when TFLite builtin ops are not sufficient.
+
+Models converted with TensorFlow ops will require a TensorFlow Lite interpreter
+that has a larger binary size than the interpreter with only TFLite builtin ops.
+Additionally, performance optimizations will not be available for any TensorFlow
+ops in the TensorFlow Lite model.
+
+This document outlines how to [convert](#converting-the-model) and
+[run](#running-the-model) a TFLite model with TensorFlow ops on your platform of
+choice. It also discusses some [known limitations](#known-limitations), the
+[future plans](#future-plans) for this feature, and basic
+[performance and size metrics](#metrics).
+
+## Converting the model
+
+To convert a TensorFlow model to a TensorFlow Lite model with TensorFlow ops,
+use the `target_ops` argument in the
+[TensorFlow Lite converter](https://www.tensorflow.org/lite/convert/). The
+following values are valid options for `target_ops`:
+
+*   `TFLITE_BUILTINS` - Converts models using TensorFlow Lite builtin ops.
+*   `SELECT_TF_OPS` - Converts models using TensorFlow ops. The exact subset of
+    supported ops can be found in the whitelist at
+    `lite/toco/tflite/whitelisted_flex_ops.cc`.
+
+The recommended approach is to convert the model with `TFLITE_BUILTINS`, then
+with both `TFLITE_BUILTINS,SELECT_TF_OPS`, and finally with only
+`SELECT_TF_OPS`. Using both options (i.e. `TFLITE_BUILTINS,SELECT_TF_OPS`)
+creates models with TensorFlow Lite ops where possible. Using only
+`SELECT_TF_OPS` is useful when the model contains TensorFlow ops that are only
+partially supported by TensorFlow Lite, and one would like to avoid those
+limitations.
+
+The following example shows how to use `target_ops` in the
+[`TFLiteConverter`](https://www.tensorflow.org/lite/convert/python_api) Python
+API.
+
+```
+import tensorflow as tf
+
+converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
+converter.target_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
+                        tf.lite.OpsSet.SELECT_TF_OPS]
+tflite_model = converter.convert()
+open("converted_model.tflite", "wb").write(tflite_model)
+```
+
+The following example shows how to use `target_ops` in the
+[`tflite_convert`](https://www.tensorflow.org/lite/convert/cmdline_examples)
+command line tool.
+
+```
+tflite_convert \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+When building and running `tflite_convert` directly with `bazel`, please pass
+`--define=with_select_tf_ops=true` as an additional argument.
+
+```
+bazel run --define=with_select_tf_ops=true tflite_convert -- \
+  --output_file=/tmp/foo.tflite \
+  --graph_def_file=/tmp/foo.pb \
+  --input_arrays=input \
+  --output_arrays=MobilenetV1/Predictions/Reshape_1 \
+  --target_ops=TFLITE_BUILTINS,SELECT_TF_OPS
+```
+
+## Running the model
+
+When using a TensorFlow Lite model that has been converted with support for
+select TensorFlow ops, the client must also use a TensorFlow Lite runtime that
+includes the necessary library of TensorFlow ops.
+
+### Android AAR
+
+A new Android AAR target with select TensorFlow ops has been added for
+convenience. Assuming a <a href="./demo_android.md">working TensorFlow Lite
+build environment</a>, build the Android AAR with select TensorFlow ops as
+follows:
+
+```sh
+bazel build --cxxopt='--std=c++11' -c opt             \
+  --config=android_arm --config=monolithic          \
+  //tensorflow/lite/java:tensorflow-lite-with-select-tf-ops
+```
+
+This will generate an AAR file in `bazel-genfiles/tensorflow/lite/java/`. From
+there, you can either import the AAR directly into your project, or publish the
+custom AAR to your local Maven repository:
+
+```sh
+mvn install:install-file \
+  -Dfile=bazel-genfiles/tensorflow/lite/java/tensorflow-lite-with-select-tf-ops.aar \
+  -DgroupId=org.tensorflow \
+  -DartifactId=tensorflow-lite-with-select-tf-ops -Dversion=0.1.100 -Dpackaging=aar
+```
+
+Finally, in your app's `build.gradle`, ensure you have the `mavenLocal()`
+dependency and replace the standard TensorFlow Lite dependency with the one that
+has support for select TensorFlow ops:
+
+```
+allprojects {
+    repositories {
+        jcenter()
+        mavenLocal()
+    }
+}
+
+dependencies {
+    compile 'org.tensorflow:tensorflow-lite-with-select-tf-ops:0.1.100'
+}
+```
+
+### iOS
+
+With XCode Command Line Tools installed, TensorFlow Lite with select TensorFlow
+ops support can be built with the following command:
+
+```sh
+tensorflow/contrib/makefile/build_all_ios_with_tflite.sh
+```
+
+This will generate the required static linking libraries in the
+`tensorflow/contrib/makefile/gen/lib/` directory.
+
+The TensorFlow Lite camera example app can be used to test this. A new
+TensorFlow Lite XCode project with support for select TensorFlow ops has been
+added to
+`tensorflow/lite/examples/ios/camera/tflite_camera_example_with_select_tf_ops.xcodeproj`.
+
+To use this feature in a your own project, either clone the example project or
+set the project settings for a new or existing project to the following:
+
+*   In Build Phases -> Link Binary With Libraries, add the static libraries
+    under `tensorflow/contrib/makefile/gen/lib/` directory:
+    *   `libtensorflow-lite.a`
+    *   `libprotobuf.a`
+    *   `nsync.a`
+*   In Build Settings -> Header Search Paths, add the following directories:
+    *   `tensorflow/lite/`
+    *   `tensorflow/contrib/makefile/downloads/flatbuffer/include`
+    *   `tensorflow/contrib/makefile/downloads/eigen`
+*   In Build Settings -> Other Linker Flags, add `-force_load
+    tensorflow/contrib/makefile/gen/lib/libtensorflow-lite.a`.
+
+A CocoaPod with support for select TensorFlow ops will also be released in the
+future.
+
+### C++
+
+When building TensorFlow Lite libraries using the bazel pipeline, the additional
+TensorFlow ops library can be included and enabled as follows:
+
+*   Enable monolithic builds if necessary by adding the `--config=monolithic`
+    build flag.
+*   Do one of the following:
+    *   Include the `--define=with_select_tf_ops=true` build flag in the `bazel
+        build` invocation when building TensorFlow Lite.
+    *   Add the TensorFlow ops delegate library dependency to the build
+        dependencies: `tensorflow/lite/delegates/flex:delegate`.
+
+Note that the necessary `TfLiteDelegate` will be installed automatically when
+creating the interpreter at runtime as long as the delegate is linked into the
+client library. It is not necessary to explicitly install the delegate instance
+as is typically required with other delegate types.
+
+### Python pip Package
+
+Python support is actively under development.
+
+## Metrics
+
+### Performance
+
+When using a mixture of both builtin and select TensorFlow ops, all of the same
+TensorFlow Lite optimizations and optimized builtin kernels will be be available
+and usable with the converted model. For the TensorFlow ops, performance should
+generally be comparable to that of
+[TensorFlow Mobile](https://www.tensorflow.org/lite/tfmobile/).
+
+The following table describes the average time taken to run inference on
+MobileNet on a Pixel 2. The listed times are an average of 100 runs. These
+targets were built for Android using the flags: `--config=android_arm64 -c opt`.
+
+Build                                | Time (milliseconds)
+------------------------------------ | -------------------
+Only built-in ops (`TFLITE_BUILTIN`) | 260.7
+Using only TF ops (`SELECT_TF_OPS`)  | 264.5
+
+### Binary Size
+
+The following table describes the binary size of TensorFlow Lite for each build.
+These targets were built for Android using `--config=android_arm -c opt`.
+
+Build                 | C++ Binary Size | Android APK Size
+--------------------- | --------------- | ----------------
+Only built-in ops     | 796 KB          | 561 KB
+Built-in ops + TF ops | 23.0 MB         | 8.0 MB
+
+## Known Limitations
+
+The following is a list of some of the known limitations:
+
+*   Control flow ops are not yet supported.
+*   The
+    [`post_training_quantization`](https://www.tensorflow.org/performance/post_training_quantization)
+    flag is currently not supported for TensorFlow ops so it will not quantize
+    weights for any TensorFlow ops. In models with both TensorFlow Lite builtin
+    ops and TensorFlow ops, the weights for the builtin ops will be quantized.
+*   Ops that require explicit initialization from resources, like HashTableV2,
+    are not yet supported.
+*   Certain TensorFlow ops may not support the full set of input/output types
+    that are typically available on stock TensorFlow.
+
+## Future Plans
+
+The following is a list of improvements to this pipeline that are in progress:
+
+*   *Selective registration* - There is work being done to make it simple to
+    generate TFLite interpreter binaries that only contain the TensorFlow ops
+    required for a particular set of models.
+*   *Improved usability* - The conversion process will be simplified to only
+    require a single pass through the converter. Additionally, pre-built Android
+    AAR and iOS CocoaPod binaries will be provided.
+*   *Improved performance* - There is work being done to ensure TensorFlow Lite
+    with TensorFlow ops has performance parity to TensorFlow Mobile.
diff --git a/tensorflow/lite/interpreter.cc b/tensorflow/lite/interpreter.cc
index c90fc3be87e18c96db5771a0467972d9bfc4ffab..e2129ed46d94061211e02445a437f7adca51363e 100644
--- a/tensorflow/lite/interpreter.cc
+++ b/tensorflow/lite/interpreter.cc
@@ -20,7 +20,6 @@ limitations under the License.
 #include <cstdint>
 #include <cstring>
 
-#include "tensorflow/lite/arena_planner.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/context_util.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
@@ -32,110 +31,15 @@ limitations under the License.
 #include "tensorflow/lite/util.h"
 
 namespace tflite {
-namespace {
-
-TfLiteStatus ReportOpError(TfLiteContext* context, const TfLiteNode& node,
-                           const TfLiteRegistration& registration,
-                           int node_index, const char* message) {
-  context->ReportError(
-      context, "Node number %d (%s) %s.\n", node_index,
-      registration.custom_name
-          ? registration.custom_name
-          : EnumNameBuiltinOperator(
-                static_cast<BuiltinOperator>(registration.builtin_code)),
-      message);
-  return kTfLiteError;
-}
-
-// Stub method which returns kTfLiteError when the function is forbidden.
-// We're registrating this function to several different function to save
-// compiled binary size. Please note the restrictions:
-// * The type of first parameter have to be `TfLiteContext*`.
-// * All paramteters must be trivailly destructible. (E.g. No C++ class)
-TfLiteStatus ForbiddenContextFunction(TfLiteContext* context, ...) {
-  context->ReportError(context,
-                       "The function is forbidden if not calling in delegate.");
-  return kTfLiteError;
-}
-
-// Set the ForbiddenContextFunction to a compatible function pointer.
-template <typename FunctionType>
-void SetForbiddenContextFunction(FunctionType* func) {
-  *func = reinterpret_cast<FunctionType>(ForbiddenContextFunction);
-}
-
-// Returns true if at least one tensor in the given list is kTfLiteDynamic.
-template <typename TensorIntArray>
-bool HasDynamicTensorImpl(const TfLiteContext& context,
-                          const TensorIntArray& int_array) {
-  for (int i : int_array) {
-    const TfLiteTensor& tensor = context.tensors[i];
-    if (tensor.allocation_type == kTfLiteDynamic) {
-      return true;
-    }
-  }
-  return false;
-}
-
-}  // namespace
-
-// A trivial implementation of GraphInfo around the Interpreter.
-// NOTE: this interpreter info represents the subset of the
-// graph that is executed according to execution plan. Thus,
-// the indices are execution plan indices rather than raw node
-// indices.
-class InterpreterInfo : public GraphInfo {
- public:
-  explicit InterpreterInfo(Interpreter* interpreter)
-      : interpreter_(interpreter) {}
-
-  size_t num_tensors() const override { return interpreter_->tensors_size(); }
-  TfLiteTensor* tensor(size_t index) override {
-    return interpreter_->tensor(index);
-  }
-  size_t num_nodes() const override {
-    return interpreter_->execution_plan().size();
-  }
-  const TfLiteNode& node(size_t index) const override {
-    int node_index = interpreter_->execution_plan()[index];
-    return interpreter_->node_and_registration(node_index)->first;
-  }
-  const std::vector<int>& inputs() const override {
-    return interpreter_->inputs();
-  }
-  const std::vector<int>& outputs() const override {
-    return interpreter_->outputs();
-  }
-  const std::vector<int>& variables() const override {
-    return interpreter_->variables();
-  }
-
- public:
-  Interpreter* interpreter_;
-};
 
 Interpreter::Interpreter(ErrorReporter* error_reporter)
     : error_reporter_(error_reporter ? error_reporter
                                      : DefaultErrorReporter()) {
-  context_.impl_ = static_cast<void*>(this);
-  context_.ResizeTensor = ResizeTensor;
-  context_.ReportError = ReportError;
-  context_.AddTensors = AddTensors;
-  context_.tensors = nullptr;
-  context_.tensors_size = 0;
-  context_.allow_fp32_relax_to_fp16 = false;
-  context_.recommended_num_threads = -1;
-  context_.GetExternalContext = GetExternalContext;
-  context_.SetExternalContext = SetExternalContext;
-
-  // Invalid to call these these except from TfLiteDelegate
-  SwitchToKernelContext();
+  // There's always at least 1 subgraph which is the primary subgraph.
+  AddSubgraphs(1);
+  context_ = primary_subgraph().context();
 
   // Reserve some space for the tensors to avoid excessive resizing.
-  tensors_.reserve(kTensorsReservedCapacity);
-  nodes_and_registration_.reserve(kTensorsReservedCapacity);
-  next_execution_plan_index_to_prepare_ = 0;
-
   for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
     external_contexts_[i] = nullptr;
   }
@@ -143,670 +47,88 @@ Interpreter::Interpreter(ErrorReporter* error_reporter)
   UseNNAPI(false);
 }
 
-Interpreter::~Interpreter() {
-  for (auto& nodeAndReg : nodes_and_registration_) {
-    TfLiteNode& node = nodeAndReg.first;
-    TfLiteIntArrayFree(node.inputs);
-    TfLiteIntArrayFree(node.outputs);
-    TfLiteIntArrayFree(node.temporaries);
-    if (node.builtin_data) free(node.builtin_data);
-    OpFree(nodeAndReg.second, node.user_data);
-    node.builtin_data = nullptr;
-  }
-
-  for (size_t i = 0; i < context_.tensors_size; i++) {
-    TfLiteTensor* tensor = &context_.tensors[i];
-    if (tensor->buffer_handle != kTfLiteNullBufferHandle &&
-        tensor->delegate->FreeBufferHandle != nullptr) {
-      tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
-                                         &tensor->buffer_handle);
-    }
-    TfLiteTensorFree(tensor);
-  }
-}
-
-TfLiteStatus Interpreter::ReplaceNodeSubsetsWithDelegateKernels(
-    TfLiteContext* context, TfLiteRegistration registration,
-    const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->ReplaceNodeSubsetsWithDelegateKernels(registration, nodes_to_replace,
-                                              delegate);
-}
-
-namespace {
-
-// Copy a std::vector<int> to an existing TfLiteIntArray.
-// This is a low-level data manipulation function, and it's caller's
-// responsibility to ensure TfLiteIntArray has enough size.
-void CopyVectorToTfLiteIntArray(const std::vector<int>& vec,
-                                TfLiteIntArray* arr) {
-  arr->size = vec.size();
-  memcpy(arr->data, vec.data(), sizeof(int) * arr->size);
-}
-
-// This function allocates a continuous memory space that contains a
-// TfLiteDelegateParams followed by a several TfLiteIntArray.
-// When calling `free` at TfLiteDelegateParams*, all the allocated space
-// will be freed together.
-//
-// +-----------------------------------+
-// | TfLiteDelegateParams              |
-// | TfLiteDelegate* delegate;         |
-// | TfLiteIntArray* nodes_to_replace; |--\
-// | TfLiteIntArray* input_tensors;    |--+--\
-// | TfLiteIntArray* output_tensors;   |--+--+--\
-// +-----------------------------------+  |  |  |
-// | TfLiteIntArray (variable size)    |<-/  |  |
-// +-----------------------------------+     |  |
-// | TfLiteIntArray (variable size)    |<----/  |
-// +-----------------------------------+        |
-// | TfLiteIntArray (variable size)    |<-------/
-// +-----------------------------------+
-TfLiteDelegateParams* CreateDelegateParams(TfLiteDelegate* delegate,
-                                           const NodeSubset& node_subset) {
-  // Step 1: Calculate the allocation size.
-  int allocation_size = sizeof(TfLiteDelegateParams);
-
-  int nodes_to_replace_size =
-      TfLiteIntArrayGetSizeInBytes(node_subset.nodes.size());
-  allocation_size += nodes_to_replace_size;
-
-  int input_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(node_subset.input_tensors.size());
-  allocation_size += input_tensors_size;
-
-  int output_tensors_size =
-      TfLiteIntArrayGetSizeInBytes(node_subset.output_tensors.size());
-  allocation_size += output_tensors_size;
-
-  // Step 2: Allocate the memory.
-  // Use `char*` for conveniently step through the allocated space by bytes.
-  char* allocation = reinterpret_cast<char*>(malloc(allocation_size));
-
-  // Step 3: Fill all data structures structures.
-  TfLiteDelegateParams* params =
-      reinterpret_cast<TfLiteDelegateParams*>(allocation);
-  params->delegate = delegate;
-  allocation += sizeof(TfLiteDelegateParams);
-
-  params->nodes_to_replace = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(node_subset.nodes, params->nodes_to_replace);
-  allocation += nodes_to_replace_size;
-
-  params->input_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(node_subset.input_tensors, params->input_tensors);
-  allocation += input_tensors_size;
-
-  params->output_tensors = reinterpret_cast<TfLiteIntArray*>(allocation);
-  CopyVectorToTfLiteIntArray(node_subset.output_tensors,
-                             params->output_tensors);
-  allocation += output_tensors_size;
-
-  return params;
-}
-
-}  // namespace
-
-TfLiteStatus Interpreter::ReplaceNodeSubsetsWithDelegateKernels(
-    TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-    TfLiteDelegate* delegate) {
-  // Annotate the registration as DELEGATE op.
-  registration.builtin_code = BuiltinOperator_DELEGATE;
-
-  // Analyze the graph to find all independent node_subsets that are either
-  // fully not-this-delegate or this-delegate computation.
-  InterpreterInfo info(this);
-  std::vector<NodeSubset> node_subsets;
-  PartitionGraphIntoIndependentNodeSubsets(&info, nodes_to_replace,
-                                           &node_subsets);
-
-  execution_plan_.clear();
-  for (auto& node_subset : node_subsets) {
-    // Subsets calimed by the delegate should have a "macro" op created, the
-    // other node_subsets (kTfNonPartition) just have their nodes added back to
-    // the execution plan.
-    switch (node_subset.type) {
-      case NodeSubset::kTfNonPartition:
-        for (auto it = node_subset.nodes.begin(); it != node_subset.nodes.end();
-             ++it) {
-          execution_plan_.push_back(*it);
-        }
-        break;
-      case NodeSubset::kTfPartition: {
-        int node_index;
-
-        TfLiteDelegateParams* params =
-            CreateDelegateParams(delegate, node_subset);
-        TF_LITE_ENSURE_STATUS(AddNodeWithParameters(
-            node_subset.input_tensors, node_subset.output_tensors, nullptr, 0,
-            params, &registration, &node_index));
-
-        // Initialize the output tensors's delegate-related fields.
-        for (int tensor_index : node_subset.output_tensors) {
-          TfLiteTensor* tensor = &tensors_[tensor_index];
-          TF_LITE_ENSURE(&context_, tensor->delegate == nullptr ||
-                                        tensor->delegate == delegate);
-          tensor->delegate = delegate;
-        }
-
-        // Associate the node with the delegate.
-        TfLiteNode* node = &nodes_and_registration_[node_index].first;
-        node->delegate = delegate;
-      } break;
-      case NodeSubset::kTfUnexplored:
-        return kTfLiteError;
-        break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    TfLiteExternalContextType type) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    return external_contexts_[type];
-  }
-  return nullptr;
-}
-
-TfLiteExternalContext* Interpreter::GetExternalContext(
-    struct TfLiteContext* context, TfLiteExternalContextType type) {
-  return static_cast<Interpreter*>(context->impl_)->GetExternalContext(type);
-}
+Interpreter::~Interpreter() {}
 
 void Interpreter::SetExternalContext(TfLiteExternalContextType type,
                                      TfLiteExternalContext* ctx) {
-  if (type >= 0 && type < kTfLiteMaxExternalContexts) {
-    external_contexts_[type] = ctx;
-  }
-}
-
-void Interpreter::SetExternalContext(struct TfLiteContext* context,
-                                     TfLiteExternalContextType type,
-                                     TfLiteExternalContext* ctx) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->SetExternalContext(type, ctx);
-}
-
-// Gets an TfLiteIntArray* representing the execution plan. The interpreter owns
-// this memory and it is only guaranteed to exist during the invocation of the
-// delegate prepare.
-TfLiteStatus Interpreter::GetExecutionPlan(TfLiteIntArray** execution_plan) {
-  // TODO(aselle): Do not make a copy here
-  plan_cache_.reset(TfLiteIntArrayCreate(execution_plan_.size()));
-  *execution_plan = plan_cache_.get();
-  static_assert(sizeof(plan_cache_->data[0]) == sizeof(execution_plan_[0]),
-                "TfLiteIntArray and execution_plan do not contain same type.");
-  std::memcpy(plan_cache_->data, execution_plan_.data(),
-              sizeof(plan_cache_->data[0]) * execution_plan_.size());
-  return kTfLiteOk;
-}
-
-// WARNING: This is an experimental interface that is subject to change.
-// Entry point for C node plugin API to get the execution plan
-TfLiteStatus Interpreter::GetExecutionPlan(struct TfLiteContext* context,
-                                           TfLiteIntArray** execution_plan) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetExecutionPlan(execution_plan);
+  primary_subgraph().SetExternalContext(type, ctx);
 }
 
 TfLiteStatus Interpreter::SetInputs(std::vector<int> inputs) {
-  TF_LITE_ENSURE_OK(&context_,
-                    CheckTensorIndices("inputs", inputs.data(), inputs.size()));
-  inputs_ = std::move(inputs);
-  return kTfLiteOk;
+  return primary_subgraph().SetInputs(inputs);
 }
 
 TfLiteStatus Interpreter::SetOutputs(std::vector<int> outputs) {
-  TF_LITE_ENSURE_OK(
-      &context_, CheckTensorIndices("outputs", outputs.data(), outputs.size()));
-  outputs_ = std::move(outputs);
-  return kTfLiteOk;
+  return primary_subgraph().SetOutputs(outputs);
 }
 
 TfLiteStatus Interpreter::SetVariables(std::vector<int> variables) {
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("variables", variables.data(),
-                                                  variables.size()));
-  variables_ = std::move(variables);
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::CheckTensorIndices(const char* label,
-                                             const int* indices, int length) {
-  // Making sure kOptionalTensor is not re-defined to something other than -1.
-  static_assert(kOptionalTensor == -1, "kOptionalTensor should be defined -1");
-
-  for (int i = 0; i < length; i++) {
-    int index = indices[i];
-    // Continue if index == kOptionalTensor before additional comparisons below,
-    // size_t(-1) is always >= context_tensors_size.
-    if (index == kOptionalTensor) {
-      continue;
-    }
-    if (index < 0 || static_cast<size_t>(index) >= context_.tensors_size) {
-      ReportError(&context_, "Invalid tensor index %d in %s\n", index, label);
-      consistent_ = false;
-      return kTfLiteError;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::BytesRequired(TfLiteType type, const int* dims,
-                                        size_t dims_size, size_t* bytes) {
-  // TODO(aselle): Check for overflow here using overflow.h in TensorFlow
-  // MultiplyWithoutOverflow.
-  TF_LITE_ENSURE(&context_, bytes != nullptr);
-  size_t count = 1;
-  for (int k = 0; k < dims_size; k++) count *= dims[k];
-  switch (type) {
-    case kTfLiteFloat32:
-      *bytes = sizeof(float) * count;
-      break;
-    case kTfLiteInt16:
-      *bytes = sizeof(int16_t) * count;
-      break;
-    case kTfLiteInt32:
-      *bytes = sizeof(int32_t) * count;
-      break;
-    case kTfLiteUInt8:
-      *bytes = sizeof(uint8_t) * count;
-      break;
-    case kTfLiteInt64:
-      *bytes = sizeof(int64_t) * count;
-      break;
-    case kTfLiteBool:
-      *bytes = sizeof(bool) * count;
-      break;
-    case kTfLiteComplex64:
-      *bytes = sizeof(std::complex<float>) * count;
-      break;
-    default:
-      ReportError(&context_,
-                  "Only float32, int16, int32, int64, uint8, bool, complex64 "
-                  "supported currently.");
-      return kTfLiteError;
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetVariables(variables);
 }
 
 TfLiteStatus Interpreter::AllocateTensors() {
-  if (!consistent_) {
-    ReportError(&context_, "AllocateTensors() called on inconsistent model.");
-    return kTfLiteError;
-  }
-
-  // Explicit (re)allocation is necessary if nodes have been changed or tensors
-  // have been resized. For inputs marked as dynamic, we can't short-circuit the
-  // allocation as the client may have done the resize manually.
-  if (state_ != kStateUninvokable && !HasDynamicTensorImpl(context_, inputs_)) {
-    return kTfLiteOk;
-  }
-
-  next_execution_plan_index_to_prepare_ = 0;
-  if (memory_planner_) {
-    TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations());
-  }
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-
-  state_ = kStateInvokable;
-
-  // Reset the variable tensors to zero after (re)allocating the tensors.
-  // Developers shouldn't rely on the side effect of this function to reset
-  // variable tesnsors. They should call `ResetVariableTensors` directly
-  // instead.
-  ResetVariableTensors();
-
-  return kTfLiteOk;
+  return primary_subgraph().AllocateTensors();
 }
 
-// TODO(ycling): Support non-zero default values.
-TfLiteStatus Interpreter::ResetVariableTensors() {
-  for (auto& tensor : tensors_) {
-    if (!tensor.is_variable) {
-      continue;
-    }
+void Interpreter::ReserveNodes(int count) {
+  primary_subgraph().nodes_and_registration().reserve(count);
+}
 
-    // Variable tensors have to be `kTfLiteArenaRwPersistent`, and must be
-    // allocated after the initial `PrepareOpsAndTensors()` is called.
-    TF_LITE_ENSURE_EQ(&context_, tensor.allocation_type,
-                      kTfLiteArenaRwPersistent);
-    TF_LITE_ENSURE(&context_, tensor.data.raw != nullptr);
+void Interpreter::AddSubgraphs(int subgraphs_to_add,
+                               int* first_new_subgraph_index) {
+  const size_t base_index = subgraphs_.size();
+  if (first_new_subgraph_index) *first_new_subgraph_index = base_index;
 
-    memset(tensor.data.raw, 0, tensor.bytes);
+  subgraphs_.reserve(base_index + subgraphs_to_add);
+  for (int i = 0; i < subgraphs_to_add; ++i) {
+    Subgraph* subgraph =
+        new Subgraph(error_reporter_, external_contexts_, &subgraphs_);
+    subgraphs_.emplace_back(subgraph);
   }
-  return kTfLiteOk;
-}
-
-void Interpreter::ReserveNodes(int count) {
-  nodes_and_registration_.reserve(count);
 }
 
 TfLiteStatus Interpreter::AddNodeWithParameters(
     const std::vector<int>& inputs, const std::vector<int>& outputs,
     const char* init_data, size_t init_data_size, void* builtin_data,
     const TfLiteRegistration* registration, int* node_index) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "AddNodeWithParameters is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  state_ = kStateUninvokable;
-
-  std::unique_ptr<void, decltype(free)*> builtin_data_deleter(builtin_data,
-                                                              free);
-
-  TF_LITE_ENSURE_OK(&context_, CheckTensorIndices("node inputs", inputs.data(),
-                                                  inputs.size()));
-  TF_LITE_ENSURE_OK(
-      &context_,
-      CheckTensorIndices("node outputs", outputs.data(), outputs.size()));
-
-  int new_node_index = nodes_and_registration_.size();
-  if (node_index) *node_index = new_node_index;
-  nodes_and_registration_.resize(nodes_and_registration_.size() + 1);
-  auto& node_and_reg = nodes_and_registration_.back();
-  TfLiteNode& node = node_and_reg.first;
-  if (node.inputs) TfLiteIntArrayFree(node.inputs);
-  if (node.outputs) TfLiteIntArrayFree(node.outputs);
-  if (node.temporaries) TfLiteIntArrayFree(node.temporaries);
-
-  // NOTE, here we are not using move semantics yet, since our internal
-  // representation isn't std::vector, but in the future we would like to avoid
-  // copies, so we want the interface to take r-value references now.
-  node.inputs = ConvertVectorToTfLiteIntArray(inputs);
-  node.outputs = ConvertVectorToTfLiteIntArray(outputs);
-  node.temporaries = TfLiteIntArrayCreate(0);
-  if (init_data) {
-    node.user_data = OpInit(*registration, init_data, init_data_size);
-  } else {
-    node.user_data =
-        OpInit(*registration,
-               reinterpret_cast<const char*>(builtin_data_deleter.get()), 0);
-  }
-
-  node.builtin_data = builtin_data_deleter.release();
-  // TODO(ycling): Filling `custom_initial_data` and `custom_initial_data_size`
-  // properly for nodes generated by ReplaceNodeSubsetsWithDelegateKernels.
-
-  if (registration->builtin_code == BuiltinOperator_CUSTOM) {
-    // When it's a CUSTOM op, the `custom_options` field in the Flatbuffer
-    // `Operator` table is passed in.
-    node.custom_initial_data = init_data;
-    node.custom_initial_data_size = init_data_size;
-  } else {
-    node.custom_initial_data = nullptr;
-    node.custom_initial_data_size = 0;
-  }
-
-  node.delegate = nullptr;
-  node_and_reg.second = *registration;
-  execution_plan_.push_back(new_node_index);
-  return kTfLiteOk;
+  return primary_subgraph().AddNodeWithParameters(inputs, outputs, init_data,
+                                                  init_data_size, builtin_data,
+                                                  registration, node_index);
 }
 
 TfLiteStatus Interpreter::ResizeInputTensor(int tensor_index,
                                             const std::vector<int>& dims) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(&context_,
-                "ResizeInputTensor is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  // TODO(aselle): All bounds checks can be implemented as one-sided bounds
-  // checks by casting to unsigned for efficiency. Profile before doing this.
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  TfLiteTensor* tensor = &context_.tensors[tensor_index];
-
-  // Short-circuit the state change if the dimensions don't change, avoiding
-  // unnecessary (re)allocations.
-  if (EqualArrayAndTfLiteIntArray(tensor->dims, dims.size(), dims.data())) {
-    return kTfLiteOk;
-  }
-
-  state_ = kStateUninvokable;
-  return ResizeTensorImpl(tensor, ConvertVectorToTfLiteIntArray(dims));
-}
-
-bool HasDynamicTensor(const TfLiteContext& context,
-                      const TfLiteIntArray* int_array) {
-  return HasDynamicTensorImpl(context, TfLiteIntArrayView{int_array});
-}
-
-TfLiteStatus Interpreter::PrepareOpsStartingAt(
-    int first_execution_plan_index, int* last_execution_plan_index_prepared) {
-  for (int execution_plan_index = first_execution_plan_index;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    EnsureTensorsVectorCapacity();
-    if (OpPrepare(registration, &node) == kTfLiteError) {
-      return ReportOpError(&context_, node, registration, node_index,
-                           "failed to prepare");
-    }
-
-    *last_execution_plan_index_prepared = execution_plan_index;
-
-    // Discontinue if the node has dynamic outputs. Note that we don't
-    // stop for dynamic temporary tensors since they won't affect the
-    // sizes of other tensors in the graph.
-    if (HasDynamicTensor(context_, node.outputs)) {
-      break;
-    }
-  }
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::PrepareOpsAndTensors() {
-  if (!memory_planner_) {
-    memory_planner_.reset(new ArenaPlanner(
-        &context_, std::unique_ptr<GraphInfo>(new InterpreterInfo(this)),
-        /*preserve_inputs=*/true, /*preserve_intermediates*/ false));
-    memory_planner_->PlanAllocations();
-  }
-
-  int last_exec_plan_index_prepared = 0;
-
-  TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt(
-      next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared));
-  TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations(
-      next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared));
-
-  next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1;
-  return kTfLiteOk;
+  return primary_subgraph().ResizeInputTensor(tensor_index, dims);
 }
 
 TfLiteStatus Interpreter::Invoke() {
-  if (!consistent_) {
-    ReportError(&context_, "Invoke called on model that is not consistent.");
-    return kTfLiteError;
-  }
-  if (state_ == kStateUninvokable) {
-    ReportError(&context_, "Invoke called on model that is not ready.");
-    return kTfLiteError;
-  }
-
-  TfLiteStatus status = kTfLiteOk;
-  if (nnapi_delegate_) {
-    if (next_execution_plan_index_to_prepare_ == execution_plan_.size()) {
-      TF_LITE_ENSURE_OK(&context_, nnapi_delegate_->Invoke(this));
-      return kTfLiteOk;
-    } else {
-      // TODO(aselle): In the future, we would like this to be an
-      // automatic tflite CPU fallback.
-      ReportError(&context_,
-                  "NNAPI was requested, but dependent sized tensors "
-                  "being used.\n");
-      return kTfLiteError;
-    }
-  }
-
-  // Invocations are always done in node order.
-  // Note that calling Invoke repeatedly will cause the original memory plan to
-  // be reused, unless either ResizeInputTensor() or AllocateTensors() has been
-  // called.
-  for (int execution_plan_index = 0;
-       execution_plan_index < execution_plan_.size(); execution_plan_index++) {
-    if (execution_plan_index == next_execution_plan_index_to_prepare_) {
-      TF_LITE_ENSURE_STATUS(PrepareOpsAndTensors());
-      TF_LITE_ENSURE(&context_, next_execution_plan_index_to_prepare_ >=
-                                    execution_plan_index);
-    }
-    int node_index = execution_plan_[execution_plan_index];
-    TfLiteNode& node = nodes_and_registration_[node_index].first;
-    const TfLiteRegistration& registration =
-        nodes_and_registration_[node_index].second;
-    SCOPED_OPERATOR_PROFILE(profiler_, node_index);
-
-    // TODO(ycling): This is an extra loop through inputs to check if the data
-    // need to be copied from Delegate buffer to raw memory, which is often not
-    // needed. We may want to cache this in prepare to know if this needs to be
-    // done for a node or not.
-    for (int i = 0; i < node.inputs->size; ++i) {
-      int tensor_index = node.inputs->data[i];
-      if (tensor_index == kOptionalTensor) {
-        continue;
-      }
-      TfLiteTensor* tensor = &tensors_[tensor_index];
-      if (tensor->delegate && tensor->delegate != node.delegate &&
-          tensor->data_is_stale) {
-        EnsureTensorDataIsReadable(tensor_index);
-      }
-    }
-
-    EnsureTensorsVectorCapacity();
-    tensor_resized_since_op_invoke_ = false;
-    if (OpInvoke(registration, &node) == kTfLiteError) {
-      status = ReportOpError(&context_, node, registration, node_index,
-                             "failed to invoke");
-    }
-
-    // Force execution prep for downstream ops if the latest op triggered the
-    // resize of a dynamic tensor.
-    if (tensor_resized_since_op_invoke_ &&
-        HasDynamicTensor(context_, node.outputs)) {
-      next_execution_plan_index_to_prepare_ = execution_plan_index + 1;
-    }
-  }
+  TfLiteStatus status = primary_subgraph().Invoke();
 
   if (!allow_buffer_handle_output_) {
-    for (int tensor_index : outputs_) {
-      EnsureTensorDataIsReadable(tensor_index);
+    for (int tensor_index : outputs()) {
+      primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
     }
   }
 
   return status;
 }
 
-TfLiteStatus Interpreter::ResizeTensor(TfLiteContext* context,
-                                       TfLiteTensor* tensor,
-                                       TfLiteIntArray* new_size) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ResizeTensorImpl
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->ResizeTensorImpl(tensor, new_size);
-}
-
-void Interpreter::ReportErrorImpl(const char* format, va_list args) {
-  error_reporter_->Report(format, args);
-}
-
-void Interpreter::ReportError(TfLiteContext* context, const char* format, ...) {
-  va_list args;
-  va_start(args, format);
-  auto* f = static_cast<Interpreter*>(context->impl_);
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function ReportErrorImpl
-  // (this function is static).
-  f->ReportErrorImpl(format, args);
-  va_end(args);
-}
-
 TfLiteStatus Interpreter::AddTensors(int tensors_to_add,
                                      int* first_new_tensor_index) {
-  const size_t base_index = tensors_.size();
-  if (first_new_tensor_index) *first_new_tensor_index = base_index;
-  tensors_.resize(tensors_.size() + tensors_to_add);
-  for (size_t i = base_index; i < tensors_.size(); i++) {
-    memset(&tensors_[i], 0, sizeof(tensors_[i]));
-    tensors_[i].buffer_handle = kTfLiteNullBufferHandle;
-  }
-  context_.tensors = tensors_.data();
-  context_.tensors_size = tensors_.size();
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::AddTensors(TfLiteContext* context, int tensors_to_add,
-                                     int* first_new_tensor_index) {
-  // Note here that context->impl_ is recovering the this pointer for an
-  // instance of Interpreter to call into the member function AddTensors
-  // (this function is static).
-  return static_cast<Interpreter*>(context->impl_)
-      ->AddTensors(tensors_to_add, first_new_tensor_index);
-}
-
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    int node_index, TfLiteNode** node, TfLiteRegistration** registration) {
-  TF_LITE_ENSURE(&context_, node_index >= 0);
-  TF_LITE_ENSURE(&context_, static_cast<size_t>(node_index) < nodes_size());
-  TF_LITE_ENSURE(&context_, node != nullptr && registration != nullptr);
-  *node = &nodes_and_registration_[node_index].first;
-  *registration = &nodes_and_registration_[node_index].second;
-  return kTfLiteOk;
+  return primary_subgraph().AddTensors(tensors_to_add, first_new_tensor_index);
 }
 
-TfLiteStatus Interpreter::GetNodeAndRegistration(
-    struct TfLiteContext* context, int node_index, TfLiteNode** node,
-    TfLiteRegistration** registration) {
-  return static_cast<Interpreter*>(context->impl_)
-      ->GetNodeAndRegistration(node_index, node, registration);
+TfLiteStatus Interpreter::ResetVariableTensors() {
+  return primary_subgraph().ResetVariableTensors();
 }
 
 TfLiteStatus Interpreter::SetTensorParametersReadOnly(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, const char* buffer,
     size_t bytes, const Allocation* allocation) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadOnly is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  // For most tensors we know exactly how much memory is necessary so we can
-  // ensure the buffer is large enough. However, we need to skip string tensors
-  // because their sizes change with the contents of the individual strings.
-  if (type != kTfLiteString) {
-    size_t required_bytes;
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-    TF_LITE_ENSURE_EQ(&context_, required_bytes, bytes);
-  }
-
-  TfLiteTensor& tensor = context_.tensors[tensor_index];
-  if (type == tensor.type &&
-      EqualArrayAndTfLiteIntArray(tensor.dims, rank, dims)) {
-    // Fast path which does not invalidate the invokable property.
-    TfLiteTensorDataFree(&tensor);
-    tensor.data.raw = const_cast<char*>(buffer);
-    if (!tensor.dims) tensor.dims = ConvertArrayToTfLiteIntArray(rank, dims);
-    tensor.params = quantization;
-    tensor.allocation_type = kTfLiteMmapRo;
-    tensor.allocation = allocation;
-  } else {
-    state_ = kStateUninvokable;
-    TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                      quantization, const_cast<char*>(buffer), bytes,
-                      kTfLiteMmapRo, allocation, false, &tensor);
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetTensorParametersReadOnly(
+      tensor_index, type, name, rank, dims, quantization, buffer, bytes,
+      allocation);
 }
 
 // Set description of inputs/outputs/data/fptrs for node `node_index`.
@@ -816,186 +138,52 @@ TfLiteStatus Interpreter::SetTensorParametersReadOnly(
 TfLiteStatus Interpreter::SetTensorParametersReadWrite(
     int tensor_index, TfLiteType type, const char* name, const size_t rank,
     const int* dims, TfLiteQuantizationParams quantization, bool is_variable) {
-  if (state_ == kStateInvokableAndImmutable) {
-    ReportError(
-        &context_,
-        "SetTensorParametersReadWrite is disallowed when graph is immutable.");
-    return kTfLiteError;
-  }
-  TF_LITE_ENSURE(&context_,
-                 tensor_index < context_.tensors_size && tensor_index >= 0);
-  size_t required_bytes = 0;
-  if (type != kTfLiteString) {
-    // These types will be allocated in our arena so we need to record how
-    // many bytes we will need based on the dimensions. String tensors are
-    // allocated dynamically and we can't know ahead of time how much space
-    // they will require.
-    TF_LITE_ENSURE_OK(&context_,
-                      BytesRequired(type, dims, rank, &required_bytes));
-  }
-
-  TfLiteAllocationType allocation_type = kTfLiteArenaRw;
-  if (type == kTfLiteString) {
-    if (is_variable) {
-      // We don't have a real use case for string variable tensor.
-      ReportError(&context_, "String variable tensor isn't supported.");
-      return kTfLiteError;
-    }
-    allocation_type = kTfLiteDynamic;
-  } else if (is_variable) {
-    allocation_type = kTfLiteArenaRwPersistent;
-  }
-
-  TfLiteTensorReset(type, name, ConvertArrayToTfLiteIntArray(rank, dims),
-                    quantization,
-                    /*buffer=*/nullptr, required_bytes, allocation_type,
-                    nullptr, is_variable, &context_.tensors[tensor_index]);
-  return kTfLiteOk;
+  return primary_subgraph().SetTensorParametersReadWrite(
+      tensor_index, type, name, rank, dims, quantization, is_variable);
 }
 
 TfLiteStatus Interpreter::SetExecutionPlan(const std::vector<int>& new_plan) {
-  for (int node_index : new_plan) {
-    TF_LITE_ENSURE(&context_, node_index >= 0 && node_index < nodes_size());
-  }
-  execution_plan_ = new_plan;
-  return kTfLiteOk;
-}
-
-TfLiteStatus Interpreter::ResizeTensorImpl(TfLiteTensor* tensor,
-                                           TfLiteIntArray* new_size) {
-  // Note that in theory we could resize kTfLiteArenaRwPersistent tensors too.
-  if (tensor->allocation_type == kTfLiteArenaRw ||
-      tensor->allocation_type == kTfLiteDynamic ||
-      tensor->allocation_type == kTfLiteArenaRwPersistent) {
-    tensor_resized_since_op_invoke_ |=
-        TfLiteIntArrayEqual(tensor->dims, new_size) == 0;
-    if (tensor->type != kTfLiteString) {
-      size_t bytesRequired;
-      TfLiteStatus status = BytesRequired(tensor->type, new_size->data,
-                                          new_size->size, &bytesRequired);
-      if (status != kTfLiteOk) {
-        TfLiteIntArrayFree(new_size);
-        return kTfLiteError;
-      }
-
-      // Realloc space for kTfLiteDynamic tensors.
-      TfLiteTensorRealloc(bytesRequired, tensor);
-      tensor->bytes = bytesRequired;
-    }
-    if (tensor->dims) TfLiteIntArrayFree(tensor->dims);
-    tensor->dims = new_size;
-
-    if (tensor->allocation_type != kTfLiteDynamic) {
-      tensor->data.raw = nullptr;
-    }
-  } else {
-    // kTfLiteMmapRo tensors are stored in the flatbuffer and are therefore
-    // of fixed size.
-    TfLiteIntArrayFree(new_size);
-    ReportError(&context_, "Attempting to resize a fixed-size tensor.");
-    return kTfLiteError;
-  }
-  return kTfLiteOk;
+  return primary_subgraph().SetExecutionPlan(new_plan);
 }
 
-void Interpreter::UseNNAPI(bool enable) {
-  // TODO(aselle): This is a workaround for finding if NNAPI exists.
-  // We also need to make sure getLibraryHandle() is renamed to be NNAPI
-  // prefixed.
-  if (!NNAPIDelegate::IsSupported()) enable = false;
-  if (!enable) {
-    nnapi_delegate_.reset();
-  } else if (!nnapi_delegate_) {
-    nnapi_delegate_.reset(new NNAPIDelegate);
-  }
-}
+void Interpreter::UseNNAPI(bool enable) { primary_subgraph().UseNNAPI(enable); }
 
 void Interpreter::SetNumThreads(int num_threads) {
-  context_.recommended_num_threads = num_threads;
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->recommended_num_threads = num_threads;
+  }
 
   for (int i = 0; i < kTfLiteMaxExternalContexts; ++i) {
     auto* c = external_contexts_[i];
     if (c && c->Refresh) {
-      c->Refresh(&context_);
+      c->Refresh(context_);
     }
   }
 }
 
-void Interpreter::SwitchToDelegateContext() {
-  context_.GetNodeAndRegistration = GetNodeAndRegistration;
-  context_.ReplaceNodeSubsetsWithDelegateKernels =
-      ReplaceNodeSubsetsWithDelegateKernels;
-  context_.GetExecutionPlan = GetExecutionPlan;
-}
-
-void Interpreter::SwitchToKernelContext() {
-  SetForbiddenContextFunction(&context_.GetNodeAndRegistration);
-  SetForbiddenContextFunction(&context_.ReplaceNodeSubsetsWithDelegateKernels);
-  SetForbiddenContextFunction(&context_.GetExecutionPlan);
+void Interpreter::SetAllowFp16PrecisionForFp32(bool allow) {
+  for (auto& subgraph : subgraphs_) {
+    subgraph->context()->allow_fp32_relax_to_fp16 = allow;
+  }
 }
 
 TfLiteStatus Interpreter::ModifyGraphWithDelegate(TfLiteDelegate* delegate) {
-  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
-    int last_execution_plan_index_prepared;
-    TF_LITE_ENSURE_OK(&context_, PrepareOpsStartingAt(
-                                     0, &last_execution_plan_index_prepared));
-
-    bool has_dynamic_tensors = true;
-    // Dynamic tensors exist if not all nodes can be prepared.
-    if (last_execution_plan_index_prepared + 1 == execution_plan_.size()) {
-      // If all the nodes can be prepared, check if the last node has dynamic
-      // tensors.
-      int node_index = execution_plan_[last_execution_plan_index_prepared];
-      TfLiteNode& node = nodes_and_registration_[node_index].first;
-      if (!HasDynamicTensor(context_, node.outputs)) {
-        has_dynamic_tensors = false;
-      }
-    }
-    if (has_dynamic_tensors) {
-      ReportError(
-          &context_,
-          "Attempting to use a delegate that only supports static-sized "
-          "tensors with a graph that has dynamic-sized tensors.");
-      return kTfLiteError;
-    }
-  }
-
-  // TODO(aselle): Consider if it is worth storing pointers to delegates.
-  // Setup additional context interface.
-  SwitchToDelegateContext();
-
-  TfLiteStatus status = delegate->Prepare(&context_, delegate);
-
-  // Remove additional context info.
-  SwitchToKernelContext();
-
-  TF_LITE_ENSURE_OK(&context_, status);
-
-  if (!(delegate->flags & kTfLiteDelegateFlagsAllowDynamicTensors)) {
-    // Reset the state to force tensor/op reallocation.
-    state_ = kStateUninvokable;
-    TF_LITE_ENSURE_OK(&context_, AllocateTensors());
-    TF_LITE_ENSURE_EQ(&context_, state_, kStateInvokable);
-    // After using a delegate which doesn't support dynamic tensors, make the
-    // entire graph immutable.
-    state_ = kStateInvokableAndImmutable;
-  }
-
-  return status;
+  return primary_subgraph().ModifyGraphWithDelegate(delegate);
 }
 
 TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
                                           TfLiteBufferHandle buffer_handle,
                                           TfLiteDelegate* delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
 
-  TF_LITE_ENSURE(&context_,
+  TF_LITE_ENSURE(context_,
                  tensor->delegate == nullptr || tensor->delegate == delegate);
   tensor->delegate = delegate;
   if (tensor->buffer_handle != kTfLiteNullBufferHandle) {
-    TF_LITE_ENSURE(&context_, tensor->delegate->FreeBufferHandle != nullptr);
-    tensor->delegate->FreeBufferHandle(&context_, tensor->delegate,
+    TF_LITE_ENSURE(context_, tensor->delegate->FreeBufferHandle != nullptr);
+    tensor->delegate->FreeBufferHandle(context_, tensor->delegate,
                                        &tensor->buffer_handle);
   }
   tensor->buffer_handle = buffer_handle;
@@ -1006,8 +194,9 @@ TfLiteStatus Interpreter::SetBufferHandle(int tensor_index,
 TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
                                           TfLiteBufferHandle* buffer_handle,
                                           TfLiteDelegate** delegate) {
-  TF_LITE_ENSURE(&context_, tensor_index < tensors_size());
-  TfLiteTensor* tensor = &tensors_[tensor_index];
+  TF_LITE_ENSURE(context_, tensor_index < tensors_size());
+  std::vector<TfLiteTensor>& tensors = primary_subgraph().tensors();
+  TfLiteTensor* tensor = &tensors[tensor_index];
 
   *delegate = tensor->delegate;
   *buffer_handle = tensor->buffer_handle;
@@ -1015,4 +204,12 @@ TfLiteStatus Interpreter::GetBufferHandle(int tensor_index,
   return kTfLiteOk;
 }
 
+void Interpreter::SetProfiler(profiling::Profiler* profiler) {
+  for (auto& subgraph : subgraphs_) subgraph->SetProfiler(profiler);
+}
+
+profiling::Profiler* Interpreter::GetProfiler() {
+  return primary_subgraph().GetProfiler();
+}
+
 }  // namespace tflite
diff --git a/tensorflow/lite/interpreter.h b/tensorflow/lite/interpreter.h
index 415c5f0979c9750fad4e128ff665ac870447e3c8..6192d56ca2b5810d7ffaddbf4cc7ae3c1b27c268 100644
--- a/tensorflow/lite/interpreter.h
+++ b/tensorflow/lite/interpreter.h
@@ -25,6 +25,7 @@ limitations under the License.
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
 #include "tensorflow/lite/memory_planner.h"
 #include "tensorflow/lite/profiling/profiler.h"
 #include "tensorflow/lite/stderr_reporter.h"
@@ -57,6 +58,10 @@ constexpr TfLiteType typeToTfLiteType<unsigned char>() {
   return kTfLiteUInt8;
 }
 template <>
+constexpr TfLiteType typeToTfLiteType<int8_t>() {
+  return kTfLiteInt8;
+}
+template <>
 constexpr TfLiteType typeToTfLiteType<bool>() {
   return kTfLiteBool;
 }
@@ -69,9 +74,6 @@ constexpr TfLiteType typeToTfLiteType<string>() {
   return kTfLiteString;
 }
 
-// Forward declare since NNAPIDelegate uses Interpreter.
-class NNAPIDelegate;
-
 // An interpreter for a graph of nodes that input and output from tensors.
 // Each node of the graph processes a set of input tensors and produces a
 // set of output Tensors. All inputs/output tensors are referenced by index.
@@ -100,12 +102,6 @@ class NNAPIDelegate;
 // foo.Invoke();
 //
 
-struct TfLiteIntArrayDeleter {
-  void operator()(TfLiteIntArray* a) {
-    if (a) TfLiteIntArrayFree(a);
-  }
-};
-
 class Interpreter {
  public:
   // Instantiate an interpreter. All errors associated with reading and
@@ -117,6 +113,7 @@ class Interpreter {
 
   ~Interpreter();
 
+  // Interpreters are not copyable as they have non-trivial memory semantics.
   Interpreter(const Interpreter&) = delete;
   Interpreter& operator=(const Interpreter&) = delete;
 
@@ -197,34 +194,40 @@ class Interpreter {
   // Functions to access tensor data
 
   // Read only access to list of inputs.
-  const std::vector<int>& inputs() const { return inputs_; }
+  const std::vector<int>& inputs() const { return primary_subgraph().inputs(); }
 
   // Return the name of a given input. The given index must be between 0 and
   // inputs().size().
   const char* GetInputName(int index) const {
-    return context_.tensors[inputs_[index]].name;
+    return context_->tensors[inputs()[index]].name;
   }
 
   // Read only access to list of outputs.
-  const std::vector<int>& outputs() const { return outputs_; }
+  const std::vector<int>& outputs() const {
+    return primary_subgraph().outputs();
+  }
 
   // Read only access to list of variable tensors.
-  const std::vector<int>& variables() const { return variables_; }
+  const std::vector<int>& variables() const {
+    return primary_subgraph().variables();
+  }
 
   // Return the name of a given output. The given index must be between 0 and
   // outputs().size().
   const char* GetOutputName(int index) const {
-    return context_.tensors[outputs_[index]].name;
+    return context_->tensors[outputs()[index]].name;
   }
 
   // Return the number of tensors in the model.
-  size_t tensors_size() const { return context_.tensors_size; }
+  size_t tensors_size() const { return context_->tensors_size; }
 
   // Return the number of ops in the model.
-  size_t nodes_size() const { return nodes_and_registration_.size(); }
+  size_t nodes_size() const { return primary_subgraph().nodes_size(); }
 
   // WARNING: Experimental interface, subject to change
-  const std::vector<int>& execution_plan() const { return execution_plan_; }
+  const std::vector<int>& execution_plan() const {
+    return primary_subgraph().execution_plan();
+  }
 
   // WARNING: Experimental interface, subject to change
   // Overrides execution plan. This bounds checks indices sent in.
@@ -234,27 +237,18 @@ class Interpreter {
   // TODO(aselle): Create a safe ArrayHandle interface to avoid exposing this
   // read/write access to structure
   TfLiteTensor* tensor(int tensor_index) {
-    if (tensor_index < 0 ||
-        static_cast<size_t>(tensor_index) >= context_.tensors_size)
-      return nullptr;
-    return &context_.tensors[tensor_index];
+    return primary_subgraph().tensor(tensor_index);
   }
 
   // Get an immutable tensor data structure.
   const TfLiteTensor* tensor(int tensor_index) const {
-    if (tensor_index < 0 ||
-        static_cast<size_t>(tensor_index) >= context_.tensors_size)
-      return nullptr;
-    return &context_.tensors[tensor_index];
+    return primary_subgraph().tensor(tensor_index);
   }
 
   // Get a pointer to an operation and registration data structure if in bounds.
   const std::pair<TfLiteNode, TfLiteRegistration>* node_and_registration(
       int node_index) const {
-    if (node_index < 0 ||
-        static_cast<size_t>(node_index) >= nodes_and_registration_.size())
-      return nullptr;
-    return &nodes_and_registration_[node_index];
+    return primary_subgraph().node_and_registration(node_index);
   }
 
   // Perform a checked cast to the appropriate tensor type (mutable pointer
@@ -285,28 +279,28 @@ class Interpreter {
   // index must be between 0 and inputs().size().
   template <class T>
   T* typed_input_tensor(int index) {
-    return typed_tensor<T>(inputs_[index]);
+    return typed_tensor<T>(inputs()[index]);
   }
 
   // Return an immutable pointer into the data of a given input tensor. The
   // given index must be between 0 and inputs().size().
   template <class T>
   const T* typed_input_tensor(int index) const {
-    return typed_tensor<T>(inputs_[index]);
+    return typed_tensor<T>(inputs()[index]);
   }
 
   // Return a mutable pointer into the data of a given output tensor. The given
   // index must be between 0 and outputs().size().
   template <class T>
   T* typed_output_tensor(int index) {
-    return typed_tensor<T>(outputs_[index]);
+    return typed_tensor<T>(outputs()[index]);
   }
 
   // Return an immutable pointer into the data of a given output tensor. The
   // given index must be between 0 and outputs().size().
   template <class T>
   const T* typed_output_tensor(int index) const {
-    return typed_tensor<T>(outputs_[index]);
+    return typed_tensor<T>(outputs()[index]);
   }
 
   // Change the dimensionality of a given tensor. Note, this is only acceptable
@@ -321,7 +315,6 @@ class Interpreter {
   // Update allocations for all tensors. This will redim dependent tensors using
   // the input tensor dimensionality as given. This is relatively expensive.
   // If you know that your sizes are not changing, you need not call this.
-
   // Returns status of success or failure.
   TfLiteStatus AllocateTensors();
 
@@ -342,14 +335,12 @@ class Interpreter {
   // Allow float16 precision for FP32 calculation when possible.
   // default: not allow.
   // WARNING: This is an experimental API and subject to change.
-  void SetAllowFp16PrecisionForFp32(bool allow) {
-    context_.allow_fp32_relax_to_fp16 = allow;
-  }
+  void SetAllowFp16PrecisionForFp32(bool allow);
 
   // Get the half precision flag.
   // WARNING: This is an experimental API and subject to change.
   bool GetAllowFp16PrecisionForFp32() const {
-    return context_.allow_fp32_relax_to_fp16;
+    return context_->allow_fp32_relax_to_fp16;
   }
 
   // Owning handle to a TfLiteDelegate instance.
@@ -366,18 +357,7 @@ class Interpreter {
   // it might require to copy the data from delegate buffer to raw memory.
   // WARNING: This is an experimental API and subject to change.
   TfLiteStatus EnsureTensorDataIsReadable(int tensor_index) {
-    TfLiteTensor* t = tensor(tensor_index);
-    TF_LITE_ENSURE(&context_, t != nullptr);
-    if (t->data_is_stale) {
-      TF_LITE_ENSURE(&context_, t->delegate != nullptr);
-      TF_LITE_ENSURE(&context_, t->buffer_handle != kTfLiteNullBufferHandle);
-      // This can be null if the delegate doesn't use its own buffer.
-      TF_LITE_ENSURE(&context_, t->delegate->CopyFromBufferHandle != nullptr);
-      t->delegate->CopyFromBufferHandle(
-          &context_, t->delegate, t->buffer_handle, t->data.raw, t->bytes);
-      t->data_is_stale = false;
-    }
-    return kTfLiteOk;
+    return primary_subgraph().EnsureTensorDataIsReadable(tensor_index);
   }
 
   // Set the delegate buffer handle to a tensor. It can be called in the
@@ -400,9 +380,9 @@ class Interpreter {
                                TfLiteBufferHandle* buffer_handle,
                                TfLiteDelegate** delegate);
 
-  void SetProfiler(profiling::Profiler* profiler) { profiler_ = profiler; }
+  void SetProfiler(profiling::Profiler* profiler);
 
-  profiling::Profiler* GetProfiler() { return profiler_; }
+  profiling::Profiler* GetProfiler();
 
   // The default capacity of `tensors_` vector.
   static constexpr int kTensorsReservedCapacity = 128;
@@ -435,142 +415,47 @@ class Interpreter {
   const char* OpProfilingString(const TfLiteRegistration& op_reg,
                                 const TfLiteNode* node) const {
     if (op_reg.profiling_string == nullptr) return nullptr;
-    return op_reg.profiling_string(&context_, node);
+    return op_reg.profiling_string(context_, node);
   }
 
   // Set the value of an external context.
   void SetExternalContext(TfLiteExternalContextType type,
                           TfLiteExternalContext* ctx);
 
- private:
-  friend class InterpreterBuilder;
-  friend class InterpreterTest;
-
-  // Prevent 'context_' from accessing functions that are only available to
-  // delegated kernels.
-  void SwitchToKernelContext();
-
-  // Add delegate-only functions to 'context_'.
-  void SwitchToDelegateContext();
+  // Adds `subgraphs_to_add` subgraphs, preserving pre-existing Subgraph
+  // entries. The value pointed to by `first_new_subgraph_index` will be set to
+  // the index of the first new subgraph if `first_new_subgraph_index` is
+  // non-null.
+  // WARNING: This is an experimental API and subject to change.
+  void AddSubgraphs(int subgraphs_to_add,
+                    int* first_new_subgraph_index = nullptr);
 
-  // Give 'op_reg' a chance to initialize itself using the contents of
-  // 'buffer'.
-  void* OpInit(const TfLiteRegistration& op_reg, const char* buffer,
-               size_t length) {
-    if (op_reg.init == nullptr) return nullptr;
-    return op_reg.init(&context_, buffer, length);
-  }
+  // Return the number of subgraphs in the model.
+  // WARNING: This is an experimental API and subject to change.
+  size_t subgraphs_size() const { return subgraphs_.size(); }
 
-  // Let 'op_reg' release any memory it might have allocated via 'OpInit'.
-  void OpFree(const TfLiteRegistration& op_reg, void* buffer) {
-    if (op_reg.free == nullptr) return;
-    if (buffer) {
-      op_reg.free(&context_, buffer);
-    }
+  // Get a pointer to a subgraph if in bounds.
+  // WARNING: This is an experimental API and subject to change.
+  Subgraph* subgraph(int subgraph_index) {
+    if (subgraph_index < 0 ||
+        static_cast<size_t>(subgraph_index) >= subgraphs_size())
+      return nullptr;
+    return &*subgraphs_[subgraph_index];
   }
 
-  // Prepare the given 'node' for execution.
-  TfLiteStatus OpPrepare(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.prepare == nullptr) return kTfLiteOk;
-    return op_reg.prepare(&context_, node);
+  // WARNING: Experimental interface, subject to change
+  Subgraph& primary_subgraph() {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
   }
 
-  // Invoke the operator represented by 'node'.
-  TfLiteStatus OpInvoke(const TfLiteRegistration& op_reg, TfLiteNode* node) {
-    if (op_reg.invoke == nullptr) return kTfLiteError;
-    return op_reg.invoke(&context_, node);
+  // WARNING: Experimental interface, subject to change
+  const Subgraph& primary_subgraph() const {
+    return *subgraphs_.front();  // Safe as subgraphs_ always has 1 entry.
   }
 
-  // Call OpPrepare() for as many ops as possible, allocating memory for their
-  // tensors. If an op containing dynamic tensors is found, preparation will be
-  // postponed until this function is called again. This allows the interpreter
-  // to wait until Invoke() to resolve the sizes of dynamic tensors.
-  TfLiteStatus PrepareOpsAndTensors();
-
-  // Call OpPrepare() for all ops starting at 'first_node'. Stop when a
-  // dynamic tensors is found or all ops have been prepared. Fill
-  // 'last_node_prepared' with the id of the op containing dynamic tensors, or
-  // the last in the graph.
-  TfLiteStatus PrepareOpsStartingAt(int first_execution_plan_index,
-                                    int* last_execution_plan_index_prepared);
-
-  // Tensors needed by the interpreter. Use `AddTensors` to add more blank
-  // tensor entries. Note, `tensors_.data()` needs to be synchronized to the
-  // `context_` whenever this std::vector is reallocated. Currently this
-  // only happens in `AddTensors()`.
-  std::vector<TfLiteTensor> tensors_;
-
-  // Check if an array of tensor indices are valid with respect to the Tensor
-  // array.
-  // NOTE: this changes consistent_ to be false if indices are out of bounds.
-  TfLiteStatus CheckTensorIndices(const char* label, const int* indices,
-                                  int length);
-
-  // Compute the number of bytes required to represent a tensor with dimensions
-  // specified by the array dims (of length dims_size). Returns the status code
-  // and bytes.
-  TfLiteStatus BytesRequired(TfLiteType type, const int* dims, size_t dims_size,
-                             size_t* bytes);
-
-  // Request an tensor be resized implementation. If the given tensor is of
-  // type kTfLiteDynamic it will also be allocated new memory.
-  TfLiteStatus ResizeTensorImpl(TfLiteTensor* tensor, TfLiteIntArray* new_size);
-
-  // Report a detailed error string (will be printed to stderr).
-  // TODO(aselle): allow user of class to provide alternative destinations.
-  void ReportErrorImpl(const char* format, va_list args);
-
-  // Entry point for C node plugin API to request an tensor be resized.
-  static TfLiteStatus ResizeTensor(TfLiteContext* context, TfLiteTensor* tensor,
-                                   TfLiteIntArray* new_size);
-  // Entry point for C node plugin API to report an error.
-  static void ReportError(TfLiteContext* context, const char* format, ...);
-
-  // Entry point for C node plugin API to add new tensors.
-  static TfLiteStatus AddTensors(TfLiteContext* context, int tensors_to_add,
-                                 int* first_new_tensor_index);
-
-  // WARNING: This is an experimental API and subject to change.
-  // Entry point for C API ReplaceNodeSubsetsWithDelegateKernels
-  static TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
-      TfLiteContext* context, TfLiteRegistration registration,
-      const TfLiteIntArray* nodes_to_replace, TfLiteDelegate* delegate);
-
-  // Update the execution graph to replace some of the nodes with stub
-  // nodes. Specifically any node index that has `nodes[index]==1` will be
-  // slated for replacement with a delegate kernel specified by registration.
-  // Ownership of 'nodes_to_replace' and 'delegate' remains with the caller.
-  // WARNING: This is an experimental interface that is subject to change.
-  TfLiteStatus ReplaceNodeSubsetsWithDelegateKernels(
-      TfLiteRegistration registration, const TfLiteIntArray* nodes_to_replace,
-      TfLiteDelegate* delegate);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets the internal pointer to a TensorFlow lite node by node_index.
-  TfLiteStatus GetNodeAndRegistration(int node_index, TfLiteNode** node,
-                                      TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get a node by index.
-  static TfLiteStatus GetNodeAndRegistration(struct TfLiteContext*,
-                                             int node_index, TfLiteNode** node,
-                                             TfLiteRegistration** registration);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Gets an TfLiteIntArray* representing the execution plan. The interpreter
-  // owns this memory and it is only guaranteed to exist during the invocation
-  // of the delegate prepare.
-  TfLiteStatus GetExecutionPlan(TfLiteIntArray** execution_plan);
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // Entry point for C node plugin API to get the execution plan.
-  static TfLiteStatus GetExecutionPlan(struct TfLiteContext* context,
-                                       TfLiteIntArray** execution_plan);
-
-  // Retrieve an existing external context by type.
-  TfLiteExternalContext* GetExternalContext(TfLiteExternalContextType type);
-  static TfLiteExternalContext* GetExternalContext(
-      struct TfLiteContext* context, TfLiteExternalContextType type);
+ private:
+  friend class InterpreterBuilder;
+  friend class InterpreterTest;
 
   // Set the value of an external context.
   static void SetExternalContext(struct TfLiteContext* context,
@@ -587,105 +472,28 @@ class Interpreter {
     return ModifyGraphWithDelegate(owned_delegates_.back().get());
   }
 
-  // Ensures that `tensors_` has at least `kTensorsCapacityHeadroom` extra
-  // capacity. Calling this function may invalidate existing pointers to
-  // tensors. After calling this function, adding `kTensorsCapacityHeadroom`
-  // more tensors won't invalidate the pointer to existing tensors.
-  void EnsureTensorsVectorCapacity() {
-    const size_t required_capacity = tensors_size() + kTensorsCapacityHeadroom;
-    if (required_capacity > tensors_.capacity()) {
-      tensors_.reserve(required_capacity);
-      context_.tensors = tensors_.data();
-    }
-  }
-
-  // The state of the Interpreter.
-  enum State {
-    // The interpreter isn't ready to be invoked.
-    // `AllocateTensor` need to be called to enter an invokable state.
-    kStateUninvokable = 0,
-    // The interpreter is ready to be invoked.
-    kStateInvokable,
-    // The interpreter is ready to be invoked, and graph can't be further
-    // modified. The interpreter will enter this state when calling
-    // `ModifyGraphWithDelegate` with `allow_dynamic_tensors=false`.
-    kStateInvokableAndImmutable,
-  };
-  State state_ = kStateUninvokable;
-
   // A pure C data structure used to communicate with the pure C plugin
   // interface. To avoid copying tensor metadata, this is also the definitive
   // structure to store tensors.
-  TfLiteContext context_;
-
-  // Node inputs/outputs are stored in TfLiteNode and TfLiteRegistration stores
-  // function pointers to actual implementation.
-  std::vector<std::pair<TfLiteNode, TfLiteRegistration>>
-      nodes_and_registration_;
-
-  // Whether the model is consistent. That is to say if the inputs and outputs
-  // of every node and the global inputs and outputs are valid indexes into
-  // the tensor array.
-  bool consistent_ = true;
-
-  // Array of indices representing the tensors that are inputs to the
-  // interpreter.
-  std::vector<int> inputs_;
-
-  // Array of indices representing the tensors that are outputs to the
-  // interpreter.
-  std::vector<int> outputs_;
-
-  // Array of indices representing the tensors that are variable tensors.
-  std::vector<int> variables_;
+  // This is the primary subgraph context.
+  TfLiteContext* context_;
 
   // The error reporter delegate that tflite will forward queries errors to.
   ErrorReporter* error_reporter_;
 
-  // Index of the next node to prepare.
-  // During Invoke(), Interpreter will allocate input tensors first, which are
-  // known to be fixed size. Then it will allocate outputs from nodes as many
-  // as possible. When there is a node that produces dynamic sized tensor.
-  // Interpreter will stop allocating tensors, set the value of next allocate
-  // node id, and execute the node to generate the output tensor before continue
-  // to allocate successors. This process repeats until all nodes are executed.
-  // NOTE: this relies on the order of nodes that is in topological order.
-  int next_execution_plan_index_to_prepare_;
-
-  // WARNING: This is an experimental interface that is subject to change.
-  // This is a list of node indices (to index into nodes_and_registration).
-  // This represents a valid topological sort (dependency ordered) execution
-  // plan. In particular, it is valid for this ordering to contain only a
-  // subset of the node indices.
-  std::vector<int> execution_plan_;
-
-  // In the future, we'd like a TfLiteIntArray compatible representation.
-  // TODO(aselle): replace execution_plan_ with this.
-  std::unique_ptr<TfLiteIntArray, TfLiteIntArrayDeleter> plan_cache_;
-
-  // Whether to delegate to NN API
-  std::unique_ptr<NNAPIDelegate> nnapi_delegate_;
-
   // List of delegates that have been installed and are owned by this
   // interpreter instance. Useful if client delegate ownership is burdensome.
   // WARNING: This is an experimental API and subject to change.
   // TODO(b/116667551): Use TfLiteExternalContext for storing state.
   std::vector<TfLiteDelegatePtr> owned_delegates_;
 
-  std::unique_ptr<MemoryPlanner> memory_planner_;
-
   bool allow_buffer_handle_output_ = false;
 
-  // Tracking bit for whether a tensor was resized in the course of an op
-  // invocation. This is a useful hint to ensure that dynamic tensor outputs
-  // trigger downstream reallocation after op invocation.
-  bool tensor_resized_since_op_invoke_ = false;
-
-  // Profiler for this interpreter instance.
-  profiling::Profiler* profiler_ = nullptr;
-
   // List of active external contexts.
   TfLiteExternalContext* external_contexts_[kTfLiteMaxExternalContexts];
+
+  // Subgraphs
+  std::vector<std::unique_ptr<Subgraph>> subgraphs_;
 };
 
 }  // namespace tflite
diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc
index 7f03c3ceba199ec9ceaff3d8d9b5d5b73f1c05e1..78b5d1b8873b8b3558b098031ffa33c7857a31e5 100644
--- a/tensorflow/lite/interpreter_test.cc
+++ b/tensorflow/lite/interpreter_test.cc
@@ -38,7 +38,7 @@ class InterpreterTest : public ::testing::Test {
   }
 
  protected:
-  TfLiteContext* GetInterpreterContext() { return &interpreter_.context_; }
+  TfLiteContext* GetInterpreterContext() { return interpreter_.context_; }
 
   Interpreter interpreter_;
 };
@@ -566,7 +566,7 @@ TEST(BasicInterpreter, ThreeStepAllocate) {
     DynamicBuffer buf;
     StringRef str_ref = GetString(input, 0);
     buf.AddString(str_ref);
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
     return kTfLiteOk;
   };
 
@@ -1090,17 +1090,17 @@ class TestDelegate : public ::testing::Test {
         TfLiteIntArrayFree(nodes_to_separate);
         return kTfLiteOk;
       };
-      delegate_.CopyToBufferHandle =
-          [](TfLiteContext* context, TfLiteDelegate* delegate,
-             TfLiteBufferHandle buffer_handle, void* data,
-             size_t size) -> TfLiteStatus {
+      delegate_.CopyToBufferHandle = [](TfLiteContext* context,
+                                        TfLiteDelegate* delegate,
+                                        TfLiteBufferHandle buffer_handle,
+                                        TfLiteTensor* tensor) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
       delegate_.CopyFromBufferHandle =
           [](TfLiteContext* context, TfLiteDelegate* delegate,
-             TfLiteBufferHandle buffer_handle, void* data,
-             size_t size) -> TfLiteStatus {
+             TfLiteBufferHandle buffer_handle,
+             TfLiteTensor* output) -> TfLiteStatus {
         // TODO(ycling): Implement tests to test buffer copying logic.
         return kTfLiteOk;
       };
diff --git a/tensorflow/lite/java/demo/app/build.gradle b/tensorflow/lite/java/demo/app/build.gradle
index 05301ebf88c12cc95f71d5efd74062d76e598e1d..b8fc282cb1dfe8a9c80692759e985bf369fc163d 100644
--- a/tensorflow/lite/java/demo/app/build.gradle
+++ b/tensorflow/lite/java/demo/app/build.gradle
@@ -40,6 +40,15 @@ repositories {
         url 'https://google.bintray.com/tensorflow'
     }
 }
+allprojects {
+    repositories {
+        // Uncomment if you want to use a local repo.
+        // mavenLocal()
+        jcenter()
+    }
+}
+
+
 
 dependencies {
     compile fileTree(dir: 'libs', include: ['*.jar'])
@@ -49,31 +58,66 @@ dependencies {
     compile 'com.android.support:support-annotations:25.3.1'
     compile 'com.android.support:support-v13:25.2.0'
 
+    // Build off of nightly TensorFlow Lite
     compile 'org.tensorflow:tensorflow-lite:0.0.0-nightly'
+    // Use local TensorFlow library
+    // compile 'org.tensorflow:tensorflow-lite-local:0.0.0'
 }
 
-def modelDownloadUrl = "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip"
-def localCache = "build/intermediates/mobilenet_v1_224_android_quant_2017_11_08.zip"
 def targetFolder = "src/main/assets"
+def modelFloatDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_02_22/mobilenet_v1_1.0_224.tgz"
+def modelQuantDownloadUrl = "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz"
+def localCacheFloat = "build/intermediates/mobilenet_v1_1.0_224.tgz"
+def localCacheQuant = "build/intermediates/mmobilenet_v1_1.0_224_quant.tgz"
+
 
-task downloadModel(type: DownloadUrlTask) {
+task downloadModelFloat(type: DownloadUrlTask) {
     doFirst {
-        println "Downloading ${modelDownloadUrl}"
+        println "Downloading ${modelFloatDownloadUrl}"
     }
-    sourceUrl = "${modelDownloadUrl}"
-    target = file("${localCache}")
+    sourceUrl = "${modelFloatDownloadUrl}"
+    target = file("${localCacheFloat}")
 }
 
-task unzipModel(type: Copy, dependsOn: 'downloadModel') {
+task downloadModelQuant(type: DownloadUrlTask) {
     doFirst {
-        println "Unzipping ${localCache}"
+        println "Downloading ${modelQuantDownloadUrl}"
     }
-    from zipTree("${localCache}")
+    sourceUrl = "${modelQuantDownloadUrl}"
+    target = file("${localCacheQuant}")
+}
+
+task unzipModelFloat(type: Copy, dependsOn: 'downloadModelFloat') {
+    doFirst {
+        println "Unzipping ${localCacheFloat}"
+    }
+    from tarTree("${localCacheFloat}")
     into "${targetFolder}"
 }
 
+task unzipModelQuant(type: Copy, dependsOn: 'downloadModelQuant') {
+    doFirst {
+        println "Unzipping ${localCacheQuant}"
+    }
+    from tarTree("${localCacheQuant}")
+    into "${targetFolder}"
+}
+
+task cleanUnusedFiles(type: Delete, dependsOn: ['unzipModelFloat', 'unzipModelQuant']) {
+    delete fileTree("${targetFolder}").matching {
+        include "*.pb"
+        include "*.ckpt.*"
+        include "*.pbtxt.*"
+        include "*.quant_info.*"
+        include "*.meta"
+    }
+}
+
+
 // Ensure the model file is downloaded and extracted before every build
-preBuild.dependsOn unzipModel
+preBuild.dependsOn unzipModelFloat
+preBuild.dependsOn unzipModelQuant
+preBuild.dependsOn cleanUnusedFiles
 
 class DownloadUrlTask extends DefaultTask {
     @Input
@@ -87,3 +131,4 @@ class DownloadUrlTask extends DefaultTask {
         ant.get(src: sourceUrl, dest: target)
     }
 }
+
diff --git a/tensorflow/lite/java/demo/app/src/main/BUILD b/tensorflow/lite/java/demo/app/src/main/BUILD
index df8a024a570fe071c808bcd70167221f8c8fd8cc..9a7c1d0b61192c61896813f41b2db1e03ff65ecb 100644
--- a/tensorflow/lite/java/demo/app/src/main/BUILD
+++ b/tensorflow/lite/java/demo/app/src/main/BUILD
@@ -10,7 +10,8 @@ android_binary(
     aapt_version = "aapt",
     assets = [
         "//tensorflow/lite/java/demo/app/src/main/assets:labels_mobilenet_quant_v1_224.txt",
-        "@tflite_mobilenet//:mobilenet_quant_v1_224.tflite",
+        "@tflite_mobilenet_quant//:mobilenet_v1_1.0_224_quant.tflite",
+        "@tflite_mobilenet_float//:mobilenet_v1_1.0_224.tflite",
     ],
     assets_dir = "",
     custom_package = "com.example.android.tflitecamerademo",
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
index 3596e4201150abaecc1cd8fdd736510a0afc97bb..165d33510131ac9c9fc08070f0a4d08653188fae 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/Camera2BasicFragment.java
@@ -56,11 +56,12 @@ import android.view.Surface;
 import android.view.TextureView;
 import android.view.View;
 import android.view.ViewGroup;
-import android.widget.CompoundButton;
+import android.widget.AdapterView;
+import android.widget.ArrayAdapter;
+import android.widget.ListView;
 import android.widget.NumberPicker;
 import android.widget.TextView;
 import android.widget.Toast;
-import android.widget.ToggleButton;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -70,6 +71,7 @@ import java.util.List;
 import java.util.concurrent.Semaphore;
 import java.util.concurrent.TimeUnit;
 
+
 /** Basic fragments for the Camera. */
 public class Camera2BasicFragment extends Fragment
     implements FragmentCompat.OnRequestPermissionsResultCallback {
@@ -87,9 +89,11 @@ public class Camera2BasicFragment extends Fragment
   private boolean runClassifier = false;
   private boolean checkedPermissions = false;
   private TextView textView;
-  private ToggleButton toggle;
   private NumberPicker np;
   private ImageClassifier classifier;
+  private ListView deviceView;
+  private ListView modelView;
+
 
   /** Max preview width that is guaranteed by Camera2 API */
   private static final int MAX_PREVIEW_WIDTH = 1920;
@@ -123,6 +127,15 @@ public class Camera2BasicFragment extends Fragment
         public void onSurfaceTextureUpdated(SurfaceTexture texture) {}
       };
 
+  // Model parameter constants.
+  private String gpu;
+  private String cpu;
+  private String nnApi;
+  private String mobilenetV1Quant;
+  private String mobilenetV1Float;
+
+
+
   /** ID of the current {@link CameraDevice}. */
   private String cameraId;
 
@@ -169,6 +182,14 @@ public class Camera2BasicFragment extends Fragment
         }
       };
 
+  private ArrayList<String> deviceStrings = new ArrayList<String>();
+  private ArrayList<String> modelStrings = new ArrayList<String>();
+
+  /** Current indices of device and model. */
+  int currentDevice = -1;
+
+  int currentModel = -1;
+
   /** An additional thread for running tasks that shouldn't block the UI. */
   private HandlerThread backgroundThread;
 
@@ -298,17 +319,113 @@ public class Camera2BasicFragment extends Fragment
     return inflater.inflate(R.layout.fragment_camera2_basic, container, false);
   }
 
+  private void updateActiveModel() {
+    // Get UI information before delegating to background
+    final int modelIndex = modelView.getCheckedItemPosition();
+    final int deviceIndex = deviceView.getCheckedItemPosition();
+
+    backgroundHandler.post(() -> {
+      if (modelIndex == currentModel && deviceIndex == currentDevice) {
+        return;
+      }
+      currentModel = modelIndex;
+      currentDevice = deviceIndex;
+
+      // Disable classifier while updating
+      if (classifier != null) {
+        classifier.close();
+        classifier = null;
+      }
+
+      // Lookup names of parameters.
+      String model = modelStrings.get(modelIndex);
+      String device = deviceStrings.get(deviceIndex);
+
+      Log.i(TAG, "Changing model to " + model + " device " + device);
+
+      // Try to load model.
+      try {
+        if (model.equals(mobilenetV1Quant)) {
+          classifier = new ImageClassifierQuantizedMobileNet(getActivity());
+        } else if (model.equals(mobilenetV1Float)) {
+          classifier = new ImageClassifierFloatMobileNet(getActivity());
+        } else {
+          showToast("Failed to load model");
+        }
+      } catch (IOException e) {
+        Log.d(TAG, "Failed to load", e);
+        classifier = null;
+      }
+
+      // Customzie the interpreter to the type of device we want to use.
+      if (device.equals(cpu)) {
+      } else if (device.equals(gpu)) {
+        if (!GpuDelegateHelper.isGpuDelegateAvailable()) {
+          showToast("gpu not in this build.");
+          classifier = null;
+        } else if (model.equals(mobilenetV1Quant)) {
+          showToast("gpu requires float model.");
+          classifier = null;
+        } else {
+          classifier.useGpu();
+        }
+      } else if (device.equals(nnApi)) {
+        classifier.useNNAPI();
+      }
+    });
+  }
+
   /** Connect the buttons to their event handler. */
   @Override
   public void onViewCreated(final View view, Bundle savedInstanceState) {
+    gpu = getString(R.string.gpu);
+    cpu = getString(R.string.cpu);
+    nnApi = getString(R.string.nnapi);
+    mobilenetV1Quant = getString(R.string.mobilenetV1Quant);
+    mobilenetV1Float = getString(R.string.mobilenetV1Float);
+
+    // Get references to widgets.
     textureView = (AutoFitTextureView) view.findViewById(R.id.texture);
     textView = (TextView) view.findViewById(R.id.text);
-    toggle = (ToggleButton) view.findViewById(R.id.button);
-
-    toggle.setOnCheckedChangeListener(
-        new CompoundButton.OnCheckedChangeListener() {
-          public void onCheckedChanged(CompoundButton buttonView, boolean isChecked) {
-            backgroundHandler.post(() -> classifier.setUseNNAPI(isChecked));
+    deviceView = (ListView) view.findViewById(R.id.device);
+    modelView = (ListView) view.findViewById(R.id.model);
+
+    // Build list of models
+    modelStrings.add(mobilenetV1Quant);
+    modelStrings.add(mobilenetV1Float);
+
+    // Build list of devices
+    int defaultModelIndex = 0;
+    deviceStrings.add(cpu);
+    if (GpuDelegateHelper.isGpuDelegateAvailable()) {
+      deviceStrings.add(gpu);
+    }
+    deviceStrings.add(nnApi);
+
+    deviceView.setAdapter(
+        new ArrayAdapter<String>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, deviceStrings));
+    deviceView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    deviceView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
+          }
+        });
+    deviceView.setItemChecked(0, true);
+
+    modelView.setChoiceMode(ListView.CHOICE_MODE_SINGLE);
+    ArrayAdapter<String> modelAdapter =
+        new ArrayAdapter<>(
+            getContext(), R.layout.listview_row, R.id.listview_row_text, modelStrings);
+    modelView.setAdapter(modelAdapter);
+    modelView.setItemChecked(defaultModelIndex, true);
+    modelView.setOnItemClickListener(
+        new AdapterView.OnItemClickListener() {
+          @Override
+          public void onItemClick(AdapterView<?> parent, View view, int position, long id) {
+            updateActiveModel();
           }
         });
 
@@ -323,18 +440,14 @@ public class Camera2BasicFragment extends Fragment
             backgroundHandler.post(() -> classifier.setNumThreads(newVal));
           }
         });
+
+    // Start initial model.
   }
 
   /** Load the model and labels. */
   @Override
   public void onActivityCreated(Bundle savedInstanceState) {
     super.onActivityCreated(savedInstanceState);
-    try {
-      // create either a new ImageClassifierQuantizedMobileNet or an ImageClassifierFloatInception
-      classifier = new ImageClassifierQuantizedMobileNet(getActivity());
-    } catch (IOException e) {
-      Log.e(TAG, "Failed to initialize an image classifier.", e);
-    }
     startBackgroundThread();
   }
 
@@ -562,10 +675,12 @@ public class Camera2BasicFragment extends Fragment
     backgroundThread = new HandlerThread(HANDLE_THREAD_NAME);
     backgroundThread.start();
     backgroundHandler = new Handler(backgroundThread.getLooper());
+    // Start the classification train & load an initial model.
     synchronized (lock) {
       runClassifier = true;
     }
     backgroundHandler.post(periodicClassify);
+    updateActiveModel();
   }
 
   /** Stops the background thread and its {@link Handler}. */
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
new file mode 100644
index 0000000000000000000000000000000000000000..8dca17744eb7a3d1e69612abf61deafb6370e4ff
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/GpuDelegateHelper.java
@@ -0,0 +1,49 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import org.tensorflow.lite.Delegate;
+
+/**
+ * Helper class for {@code GpuDelegate}.
+ *
+ * <p>WARNING: This is an experimental API and subject to change.
+ */
+public class GpuDelegateHelper {
+  private GpuDelegateHelper() {}
+
+  /** Checks whether {@code GpuDelegate} is available. */
+  public static boolean isGpuDelegateAvailable() {
+    try {
+      Class.forName("org.tensorflow.lite.experimental.GpuDelegate");
+      return true;
+    } catch (Exception e) {
+      return false;
+    }
+  }
+
+  /** Returns an instance of {@code GpuDelegate} if available. */
+  public static Delegate createGpuDelegate() {
+    try {
+      return Class.forName("org.tensorflow.lite.experimental.GpuDelegate")
+          .asSubclass(Delegate.class)
+          .getDeclaredConstructor()
+          .newInstance();
+    } catch (Exception e) {
+      throw new IllegalStateException(e);
+    }
+  }
+}
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
index 39057aa7768c54fb0f7b48211823730dc6217a70..512f8b64db1637385e7be56db6d0889c44abb2fb 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifier.java
@@ -38,6 +38,7 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Map;
 import java.util.PriorityQueue;
+import org.tensorflow.lite.Delegate;
 import org.tensorflow.lite.Interpreter;
 
 /**
@@ -93,6 +94,9 @@ public abstract class ImageClassifier {
             }
           });
 
+  /** holds a gpu delegate */
+  Delegate gpuDelegate = null;
+
   /** Initializes an {@code ImageClassifier}. */
   ImageClassifier(Activity activity) throws IOException {
     tfliteModel = loadModelFile(activity);
@@ -159,12 +163,27 @@ public abstract class ImageClassifier {
   private void recreateInterpreter() {
     if (tflite != null) {
       tflite.close();
+      // TODO(b/120679982)
+      // gpuDelegate.close();
       tflite = new Interpreter(tfliteModel, tfliteOptions);
     }
   }
 
-  public void setUseNNAPI(Boolean nnapi) {
-    tfliteOptions.setUseNNAPI(nnapi);
+  public void useGpu() {
+    if (gpuDelegate == null && GpuDelegateHelper.isGpuDelegateAvailable()) {
+      gpuDelegate = GpuDelegateHelper.createGpuDelegate();
+      tfliteOptions.addDelegate(gpuDelegate);
+      recreateInterpreter();
+    }
+  }
+
+  public void useCPU() {
+    tfliteOptions.setUseNNAPI(false);
+    recreateInterpreter();
+  }
+
+  public void useNNAPI() {
+    tfliteOptions.setUseNNAPI(true);
     recreateInterpreter();
   }
 
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
new file mode 100644
index 0000000000000000000000000000000000000000..c87ffff8f6c39dc1d87c2cf0c09b5602edd9329c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierFloatMobileNet.java
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package com.example.android.tflitecamerademo;
+
+import android.app.Activity;
+import java.io.IOException;
+
+/** This classifier works with the float MobileNet model. */
+public class ImageClassifierFloatMobileNet extends ImageClassifier {
+
+  /**
+   * An array to hold inference results, to be feed into Tensorflow Lite as outputs. This isn't part
+   * of the super class, because we need a primitive array here.
+   */
+  private float[][] labelProbArray = null;
+
+  /**
+   * Initializes an {@code ImageClassifierFloatMobileNet}.
+   *
+   * @param activity
+   */
+  ImageClassifierFloatMobileNet(Activity activity) throws IOException {
+    super(activity);
+    labelProbArray = new float[1][getNumLabels()];
+  }
+
+  @Override
+  protected String getModelPath() {
+    // you can download this file from
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224.tflite";
+  }
+
+  @Override
+  protected String getLabelPath() {
+    return "labels_mobilenet_quant_v1_224.txt";
+  }
+
+  @Override
+  protected int getImageSizeX() {
+    return 224;
+  }
+
+  @Override
+  protected int getImageSizeY() {
+    return 224;
+  }
+
+  @Override
+  protected int getNumBytesPerChannel() {
+    return 4; // Float.SIZE / Byte.SIZE;
+  }
+
+  @Override
+  protected void addPixelValue(int pixelValue) {
+    imgData.putFloat(((pixelValue >> 16) & 0xFF) / 255.f);
+    imgData.putFloat(((pixelValue >> 8) & 0xFF) / 255.f);
+    imgData.putFloat((pixelValue & 0xFF) / 255.f);
+  }
+
+  @Override
+  protected float getProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void setProbability(int labelIndex, Number value) {
+    labelProbArray[0][labelIndex] = value.floatValue();
+  }
+
+  @Override
+  protected float getNormalizedProbability(int labelIndex) {
+    return labelProbArray[0][labelIndex];
+  }
+
+  @Override
+  protected void runInference() {
+    tflite.run(imgData, labelProbArray);
+  }
+}
diff --git a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
index e164ac75543ebab12e6b1c057c4ed487eb9accdf..6310a5616838ac6b4258ec05028efa12e8cadab5 100644
--- a/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
+++ b/tensorflow/lite/java/demo/app/src/main/java/com/example/android/tflitecamerademo/ImageClassifierQuantizedMobileNet.java
@@ -42,8 +42,9 @@ public class ImageClassifierQuantizedMobileNet extends ImageClassifier {
   @Override
   protected String getModelPath() {
     // you can download this file from
-    // https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip
-    return "mobilenet_quant_v1_224.tflite";
+    // see build.gradle for where to obtain this file. It should be auto
+    // downloaded into assets.
+    return "mobilenet_v1_1.0_224_quant.tflite";
   }
 
   @Override
diff --git a/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
new file mode 100644
index 0000000000000000000000000000000000000000..202c900769fdd3be15d6b1252d5c2c4f7f728d8c
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/drawable/item_selector.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="utf-8"?>
+<selector xmlns:android="http://schemas.android.com/apk/res/android">
+
+    <!-- pressed -->
+    <item android:drawable="@color/selection_highlight" android:state_pressed="true" />
+    <!-- focused -->
+    <item android:drawable="@color/selection_focus" android:state_activated="true" />
+    <!-- default -->
+    <item android:drawable="@color/item_normal" />
+
+</selector>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
index ef8a9e08450d72e392815756606f5ef8301cdd58..ee71ab808f4810ac092b37b0d996331072f44652 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-land/fragment_camera2_basic.xml
@@ -40,12 +40,27 @@
         android:scaleType="centerInside"
         android:src="@drawable/logo"/>
 
-    <ToggleButton
-        android:id="@+id/button"
+    <RadioGroup
+        android:gravity="center"
         android:layout_width="match_parent"
-        android:layout_height="wrap_content"
-        android:textOff="@string/tflite"
-        android:textOn="@string/nnapi"/>
+        android:layout_height="match_parent"
+        android:orientation="horizontal">
+        <RadioButton
+            android:id="@+id/radio_cpu"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/cpu"
+            android:textColor="@android:color/white" />
+        <RadioButton
+            android:id="@+id/radio_nnapi"
+            android:background="#0000000f"
+            android:layout_width="wrap_content"
+            android:layout_height="wrap_content"
+            android:text="@string/nnapi"
+            android:textColor="@android:color/white" />
+        </RadioGroup>
+
     <NumberPicker
         android:id="@+id/np"
         android:layout_width="wrap_content"
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
index ddb099a950c2f83d7b2867f8f35d96885229536d..70eedfdd02ad3ac03f6d413c0d5e2357a320751f 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout-v26/fragment_camera2_basic.xml
@@ -57,38 +57,83 @@
             android:textStyle="bold" />
 
     </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#513400"
-        android:layout_alignParentBottom="true"
 
-        android:layout_width="match_parent"
+    <LinearLayout
         android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
         android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
+        android:background="#513400"
+        android:orientation="horizontal">
+
+        <LinearLayout
             android:layout_width="wrap_content"
             android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
-            android:gravity="center"
-            android:text="Threads:"/>
-        <NumberPicker
-            android:id="@+id/np"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:theme="@style/AppTheme.Picker"
-            android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
+            android:orientation="vertical">
+
+            <TextView
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:gravity="center"
+                android:text="Threads"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <NumberPicker
+                android:id="@+id/np"
+                android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
+                android:layout_marginLeft="10dp"
+                android:theme="@style/AppTheme.Picker"
+                android:visibility="visible" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
+
     </LinearLayout>
 
 
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
index e567009a424ed77384bee193c47d4f4d253f5767..f8312cc0f7567a5298e5b0a851f011e4d0d6c0bb 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/fragment_camera2_basic.xml
@@ -57,22 +57,30 @@
             android:textStyle="bold" />
 
     </LinearLayout>
-    <LinearLayout
-        android:orientation="horizontal"
-        android:background="#aa7700"
-        android:layout_alignParentBottom="true"
 
-        android:layout_width="match_parent"
+    <LinearLayout
         android:id="@+id/bottom_info_view"
+        android:layout_width="match_parent"
+        android:layout_height="200dp"
+
+        android:layout_alignParentBottom="true"
         android:layout_marginBottom="10dp"
-        android:layout_height="50dp">
-        <TextView
+        android:background="#513400"
+        android:orientation="horizontal">
+
+      <LinearLayout
             android:layout_width="wrap_content"
             android:layout_height="match_parent"
-            android:textColor="@android:color/white"
-            android:textAlignment="center"
+            android:orientation="vertical">
+
+        <TextView
+            android:layout_width="wrap_content"
+                android:layout_height="wrap_content"
             android:gravity="center"
-            android:text="@string/threads" />
+                android:text="Threads"
+            android:textAlignment="center"
+            android:textColor="@android:color/white" />
+
         <NumberPicker
             android:id="@+id/np"
             android:layout_width="wrap_content"
@@ -80,15 +88,51 @@
             android:layout_marginLeft="10dp"
             android:theme="@style/AppTheme.Picker"
             android:visibility="visible" />
-        <ToggleButton
-            android:id="@+id/button"
-            android:textOff="@string/tflite"
-            android:textOn="@string/nnapi"
-            android:layout_width="wrap_content"
-            android:layout_height="wrap_content"
-            android:layout_marginLeft="10dp"
-            android:background="#0000000f"
-            android:textColor="@android:color/white" />
+
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/modelLayout"
+            android:layout_width="150dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/modelLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/model"
+                android:layout_width="match_parent"
+                android:layout_height="180dp">
+
+            </ListView>
+        </LinearLayout>
+
+        <LinearLayout
+            android:id="@+id/deviceLayout"
+            android:layout_width="140dp"
+            android:layout_height="match_parent"
+            android:orientation="vertical">
+
+            <TextView
+                android:id="@+id/textView2"
+                android:layout_width="match_parent"
+                android:layout_height="20dp"
+                android:text="@string/deviceLabel"
+                android:textAlignment="center"
+                android:textColor="@android:color/white" />
+
+            <ListView
+                android:id="@+id/device"
+                android:layout_width="match_parent"
+                android:layout_height="180dp" />
+
+        </LinearLayout>
 
     </LinearLayout>
 </RelativeLayout>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
new file mode 100644
index 0000000000000000000000000000000000000000..349b0f63b4dbae11d21dbb0a58c3cda47299cbf0
--- /dev/null
+++ b/tensorflow/lite/java/demo/app/src/main/res/layout/listview_row.xml
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="utf-8"?>
+<LinearLayout xmlns:android="http://schemas.android.com/apk/res/android"
+    android:layout_width="match_parent"
+    android:layout_height="match_parent">
+
+
+    <TextView
+        android:id="@+id/listview_row_text"
+        android:layout_width="match_parent"
+        android:layout_height="match_parent"
+        android:layout_marginRight="2dp"
+        android:background="@drawable/item_selector"
+        android:padding="10dp"
+        android:textSize="18sp"
+        android:textStyle="bold" />
+
+</LinearLayout>
\ No newline at end of file
diff --git a/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
index 4b75d2b2bda0f95166d0442ebae19cedcad162d8..c30f1dc3ac79a7ef33908a625710f7ac96bfc858 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/colors.xml
@@ -16,4 +16,7 @@
 -->
 <resources>
     <color name="control_background">#cc4285f4</color>
+    <color name="selection_highlight">#aaaaaa</color>
+    <color name="selection_focus">#eeaa55</color>
+    <color name="item_normal">#eeeeee</color>
 </resources>
diff --git a/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
index 29a033bcd437c951ef6e8ba78f4fc3a0fcafac96..8cc88f25652256665acbab2855c60ee1a10293c4 100644
--- a/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
+++ b/tensorflow/lite/java/demo/app/src/main/res/values/strings.xml
@@ -23,4 +23,11 @@
     <string name="toggle">Use NNAPI</string>
     <string name="tflite">tflite</string>
     <string name="nnapi">NNAPI</string>
+    <string name="gpu">GPU</string>
+    <string name="cpu">CPU</string>
+    <string name="modelLabel">Model</string>
+    <string name="deviceLabel">Device</string>
+    <string name="mobilenetV1Quant">mobilenet v1 quant</string>;
+    <string name="mobilenetV1Float">mobilenet v1 float</string>;;
+
 </resources>
diff --git a/tensorflow/lite/java/jni/BUILD b/tensorflow/lite/java/jni/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..ce17ac4fa0d37cb0b790617c4258ea469d14a664
--- /dev/null
+++ b/tensorflow/lite/java/jni/BUILD
@@ -0,0 +1,47 @@
+package(default_visibility = ["//tensorflow/lite:__subpackages__"])
+
+licenses(["notice"])  # Apache 2.0
+
+# Helper target for exposing JNI headers across multiple platforms.
+cc_library(
+    name = "jni",
+    hdrs = select({
+        # The Android toolchain makes "jni.h" available in the include path.
+        # For non-Android toolchains, generate jni.h and jni_md.h.
+        "//tensorflow:android": [],
+        "//conditions:default": [
+            ":jni.h",
+            ":jni_md.h",
+        ],
+    }),
+    includes = select({
+        "//tensorflow:android": [],
+        "//conditions:default": ["."],
+    }),
+)
+
+# Silly rules to make
+# #include <jni.h>
+# in the source headers work
+# (in combination with the "includes" attribute of the tf_cuda_library rule
+# above. Not needed when using the Android toolchain).
+#
+# Inspired from:
+# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
+# but hopefully there is a simpler alternative to this.
+genrule(
+    name = "copy_jni_h",
+    srcs = ["@bazel_tools//tools/jdk:jni_header"],
+    outs = ["jni.h"],
+    cmd = "cp -f $< $@",
+)
+
+genrule(
+    name = "copy_jni_md_h",
+    srcs = select({
+        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
+        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
+    }),
+    outs = ["jni_md.h"],
+    cmd = "cp -f $< $@",
+)
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
index 41093e8ffe6407d31659c51e13717ef67014dec5..bd47574f71b28989378eb50faab40e64e543bd1c 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/DataType.java
@@ -27,7 +27,10 @@ public enum DataType {
   UINT8(3),
 
   /** 64-bit signed integer. */
-  INT64(4);
+  INT64(4),
+
+  /** Strings. */
+  STRING(5);
 
   private final int value;
 
@@ -46,6 +49,8 @@ public enum DataType {
         return 1;
       case INT64:
         return 8;
+      case STRING:
+        return -1;
     }
     throw new IllegalArgumentException(
         "DataType error: DataType " + this + " is not supported yet");
@@ -82,6 +87,8 @@ public enum DataType {
         return "byte";
       case INT64:
         return "long";
+      case STRING:
+        return "string";
     }
     throw new IllegalArgumentException(
         "DataType error: DataType " + this + " is not supported yet");
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
index a03d7b567637e306f55b2e161cef162def3550c6..2203d5fbdb260aaf2bf826343343426a5015e889 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Interpreter.java
@@ -43,15 +43,34 @@ import org.checkerframework.checker.nullness.qual.NonNull;
  * <pre>{@code
  * Object[] inputs = {input0, input1, ...};
  * Map<Integer, Object> map_of_indices_to_outputs = new HashMap<>();
- * float[][][] ith_output = new float[3][2][4];
+ * ByteBuffer ith_output = ByteBuffer.allocateDirect(3 * 2 * 4 * 4);  // Float tensor, shape 3x2x4.
+ * ith_output.order(ByteOrder.nativeOrder());
  * map_of_indices_to_outputs.put(i, ith_output);
  * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
  *   interpreter.runForMultipleInputsOutputs(inputs, map_of_indices_to_outputs);
  * }
  * }</pre>
  *
+ * <p>If a model takes or produces string tensors:
+ *
+ * <pre>{@code
+ * String[] input = {"foo", "bar"};  // Input tensor shape is [2].
+ * String[] output = new String[3][2];  // Output tensor shape is [3, 2].
+ * try (Interpreter interpreter = new Interpreter(file_of_a_tensorflowlite_model)) {
+ *   interpreter.runForMultipleInputsOutputs(input, output);
+ * }
+ * }</pre>
+ *
  * <p>Orders of inputs and outputs are determined when converting TensorFlow model to TensorFlowLite
- * model with Toco.
+ * model with Toco, as are the default shapes of the inputs.
+ *
+ * <p>When inputs are provided as (multi-dimensional) arrays, the corresponding input tensor(s) will
+ * be implicitly resized according to that array's shape. When inputs are provided as {@link
+ * ByteBuffer} types, no implicit resizing is done; the caller must ensure that the {@link
+ * ByteBuffer} byte size either matches that of the corresponding tensor, or that they first resize
+ * the tensor via {@link #resizeInput()}. Tensor shape and type information can be obtained via the
+ * {@link Tensor} class, available via {@link #getInputTensor(int)} and {@link
+ * #getOutputTensor(int)}.
  *
  * <p><b>WARNING:</b>Instances of a {@code Interpreter} is <b>not</b> thread-safe. A {@code
  * Interpreter} owns resources that <b>must</b> be explicitly freed by invoking {@link #close()}
@@ -192,12 +211,13 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes only one input, and provides only one output.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param input an array or multidimensional array, or a {@link ByteBuffer} of primitive types
    *     including int, float, long, and byte. {@link ByteBuffer} is the preferred way to pass large
-   *     input data. When {@link ByteBuffer} is used, its content should remain unchanged until
-   *     model inference is done.
+   *     input data for primitive types, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param output a multidimensional array of output data, or a {@link ByteBuffer} of primitive
    *     types including int, float, long, and byte.
    */
@@ -212,13 +232,14 @@ public final class Interpreter implements AutoCloseable {
    * Runs model inference if the model takes multiple inputs, or returns multiple outputs.
    *
    * <p>Warning: The API runs much faster if {@link ByteBuffer} is used as input data type. Please
-   * consider using {@link ByteBuffer} to feed input data for better performance.
+   * consider using {@link ByteBuffer} to feed primitive input data for better performance.
    *
    * @param inputs an array of input data. The inputs should be in the same order as inputs of the
    *     model. Each input can be an array or multidimensional array, or a {@link ByteBuffer} of
    *     primitive types including int, float, long, and byte. {@link ByteBuffer} is the preferred
-   *     way to pass large input data. When {@link ByteBuffer} is used, its content should remain
-   *     unchanged until model inference is done.
+   *     way to pass large input data, whereas string types require using the (multi-dimensional)
+   *     array input path. When {@link ByteBuffer} is used, its content should remain unchanged
+   *     until model inference is done.
    * @param outputs a map mapping output indices to multidimensional arrays of output data or {@link
    *     ByteBuffer}s of primitive types including int, float, long, and byte. It only needs to keep
    *     entries for the outputs to be used.
diff --git a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
index 6ca47aa3edff34ba312754f4cd769e1bebaf4d27..7aa24b4198a110f68680c0f8ec2a527b23c5e1bc 100644
--- a/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
+++ b/tensorflow/lite/java/src/main/java/org/tensorflow/lite/Tensor.java
@@ -162,6 +162,8 @@ public final class Tensor {
         return DataType.UINT8;
       } else if (long.class.equals(c)) {
         return DataType.INT64;
+      } else if (String.class.equals(c)) {
+        return DataType.STRING;
       }
     }
     throw new IllegalArgumentException(
diff --git a/tensorflow/lite/java/src/main/native/BUILD b/tensorflow/lite/java/src/main/native/BUILD
index 2abba24345824c1c47d5a5e9589924b78eca9e64..52194e86db32a259ca1fe640ca72d42010ba1a44 100644
--- a/tensorflow/lite/java/src/main/native/BUILD
+++ b/tensorflow/lite/java/src/main/native/BUILD
@@ -15,15 +15,7 @@ cc_library(
         "nativeinterpreterwrapper_jni.cc",
         "tensor_jni.cc",
         "tensorflow_lite_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            ":jni.h",
-            ":jni_md.h",
-        ],
-    }),
+    ],
     hdrs = [
         "exception_jni.h",
         "nativeinterpreterwrapper_jni.h",
@@ -31,74 +23,31 @@ cc_library(
         "tensorflow_lite_jni.h",
     ],
     copts = tflite_copts(),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["."],
-    }),
     linkopts = [
         "-lm",
         "-ldl",
     ],
     deps = [
-        "//tensorflow/lite:context",
         "//tensorflow/lite:framework",
         "//tensorflow/lite:schema_fbs_version",
+        "//tensorflow/lite:string_util",
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
     ],
     alwayslink = 1,
 )
 
-# Silly rules to make
-# #include <jni.h>
-# in the source headers work
-# (in combination with the "includes" attribute of the tf_cuda_library rule
-# above. Not needed when using the Android toolchain).
-#
-# Inspired from:
-# https://github.com/bazelbuild/bazel/blob/f99a0543f8d97339d32075c7176b79f35be84606/src/main/native/BUILD
-# but hopefully there is a simpler alternative to this.
-genrule(
-    name = "copy_jni_h",
-    srcs = ["@bazel_tools//tools/jdk:jni_header"],
-    outs = ["jni.h"],
-    cmd = "cp -f $< $@",
-)
-
-genrule(
-    name = "copy_jni_md_h",
-    srcs = select({
-        "//tensorflow:darwin": ["@bazel_tools//tools/jdk:jni_md_header-darwin"],
-        "//conditions:default": ["@bazel_tools//tools/jdk:jni_md_header-linux"],
-    }),
-    outs = ["jni_md.h"],
-    cmd = "cp -f $< $@",
-)
-
 cc_library(
     name = "init_tensorflow",
     srcs = [
         "init_tensorflow_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            ":jni.h",
-            ":jni_md.h",
-        ],
-    }),
+    ],
     hdrs = [
         "init_tensorflow_jni.h",
     ],
     copts = tflite_copts(),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["."],
-    }),
-    linkopts = [
-        "-lm",
-        "-ldl",
-    ],
     deps = [
+        "//tensorflow/lite/java/jni",
         "//tensorflow/lite/testing:init_tensorflow",
     ],
     alwayslink = 1,
diff --git a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
index c7389c581100acbef3b53c215b4449753cfd2a68..1e98f942504b7e4f238d8715de1dc75eedf046cf 100644
--- a/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
+++ b/tensorflow/lite/java/src/main/native/nativeinterpreterwrapper_jni.cc
@@ -78,6 +78,8 @@ int getDataType(TfLiteType data_type) {
       return 3;
     case kTfLiteInt64:
       return 4;
+    case kTfLiteString:
+      return 5;
     default:
       return -1;
   }
diff --git a/tensorflow/lite/java/src/main/native/tensor_jni.cc b/tensorflow/lite/java/src/main/native/tensor_jni.cc
index 1d813d50da44de07d4b9238f53dca432330eb3ae..82d2679de9c868694668bca23ce6c8a6fb55dbe8 100644
--- a/tensorflow/lite/java/src/main/native/tensor_jni.cc
+++ b/tensorflow/lite/java/src/main/native/tensor_jni.cc
@@ -16,8 +16,10 @@ limitations under the License.
 #include "tensorflow/lite/java/src/main/native/tensor_jni.h"
 #include <cstring>
 #include <memory>
+#include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/java/src/main/native/exception_jni.h"
+#include "tensorflow/lite/string_util.h"
 
 namespace {
 
@@ -48,7 +50,7 @@ TfLiteTensor* GetTensorFromHandle(JNIEnv* env, jlong handle) {
   return reinterpret_cast<TensorHandle*>(handle)->tensor();
 }
 
-size_t elementByteSize(TfLiteType data_type) {
+size_t ElementByteSize(TfLiteType data_type) {
   // The code in this file makes the assumption that the
   // TensorFlow TF_DataTypes and the Java primitive types
   // have the same byte sizes. Validate that:
@@ -77,11 +79,11 @@ size_t elementByteSize(TfLiteType data_type) {
   }
 }
 
-size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
+size_t WriteOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
                                 void* dst, size_t dst_size) {
   jarray array = static_cast<jarray>(object);
   const int num_elements = env->GetArrayLength(array);
-  size_t to_copy = num_elements * elementByteSize(type);
+  size_t to_copy = num_elements * ElementByteSize(type);
   if (to_copy > dst_size) {
     throwException(env, kIllegalStateException,
                    "Internal error: cannot write Java array of %d bytes to "
@@ -126,10 +128,10 @@ size_t writeOneDimensionalArray(JNIEnv* env, jobject object, TfLiteType type,
   }
 }
 
-size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
+size_t ReadOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
                                const void* src, size_t src_size, jarray dst) {
   const int len = env->GetArrayLength(dst);
-  const size_t size = len * elementByteSize(data_type);
+  const size_t size = len * ElementByteSize(data_type);
   if (size > src_size) {
     throwException(
         env, kIllegalStateException,
@@ -170,17 +172,17 @@ size_t readOneDimensionalArray(JNIEnv* env, TfLiteType data_type,
   return 0;
 }
 
-size_t readMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
+size_t ReadMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
                                  size_t src_size, int dims_left, jarray dst) {
   if (dims_left == 1) {
-    return readOneDimensionalArray(env, data_type, src, src_size, dst);
+    return ReadOneDimensionalArray(env, data_type, src, src_size, dst);
   } else {
     jobjectArray ndarray = static_cast<jobjectArray>(dst);
     int len = env->GetArrayLength(ndarray);
     size_t size = 0;
     for (int i = 0; i < len; ++i) {
       jarray row = static_cast<jarray>(env->GetObjectArrayElement(ndarray, i));
-      size += readMultiDimensionalArray(env, data_type, src + size,
+      size += ReadMultiDimensionalArray(env, data_type, src + size,
                                         src_size - size, dims_left - 1, row);
       env->DeleteLocalRef(row);
       if (env->ExceptionCheck()) return size;
@@ -189,10 +191,43 @@ size_t readMultiDimensionalArray(JNIEnv* env, TfLiteType data_type, char* src,
   }
 }
 
-size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
+// Returns the total number of strings read.
+int ReadMultiDimensionalStringArray(JNIEnv* env, TfLiteTensor* tensor,
+                                    int dims_left, int start_str_index,
+                                    jarray dst) {
+  jobjectArray object_array = static_cast<jobjectArray>(dst);
+  int len = env->GetArrayLength(object_array);
+  int num_strings_read = 0;
+
+  // If dst is a 1-dimensional array, copy the strings into it. Else
+  // recursively call ReadMultiDimensionalStringArray over sub-dimensions.
+  if (dims_left == 1) {
+    for (int i = 0; i < len; ++i) {
+      const tflite::StringRef strref =
+          tflite::GetString(tensor, start_str_index + num_strings_read);
+      jstring string_dest = env->NewStringUTF(strref.str);
+      env->SetObjectArrayElement(object_array, i, string_dest);
+      env->DeleteLocalRef(string_dest);
+      ++num_strings_read;
+    }
+  } else {
+    for (int i = 0; i < len; ++i) {
+      jarray row =
+          static_cast<jarray>(env->GetObjectArrayElement(object_array, i));
+      num_strings_read += ReadMultiDimensionalStringArray(
+          env, tensor, dims_left - 1, start_str_index + num_strings_read, row);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return num_strings_read;
+    }
+  }
+
+  return num_strings_read;
+}
+
+size_t WriteMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
                                   int dims_left, char** dst, int dst_size) {
   if (dims_left <= 1) {
-    return writeOneDimensionalArray(env, src, type, *dst, dst_size);
+    return WriteOneDimensionalArray(env, src, type, *dst, dst_size);
   } else {
     jobjectArray ndarray = static_cast<jobjectArray>(src);
     int len = env->GetArrayLength(ndarray);
@@ -200,7 +235,7 @@ size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
     for (int i = 0; i < len; ++i) {
       jobject row = env->GetObjectArrayElement(ndarray, i);
       char* next_dst = *dst + sz;
-      sz += writeMultiDimensionalArray(env, row, type, dims_left - 1, &next_dst,
+      sz += WriteMultiDimensionalArray(env, row, type, dims_left - 1, &next_dst,
                                        dst_size - sz);
       env->DeleteLocalRef(row);
       if (env->ExceptionCheck()) return sz;
@@ -209,6 +244,44 @@ size_t writeMultiDimensionalArray(JNIEnv* env, jobject src, TfLiteType type,
   }
 }
 
+void PopulateStringDynamicBuffer(JNIEnv* env, jobject src,
+                                 tflite::DynamicBuffer* dst_buffer,
+                                 int dims_left) {
+  jobjectArray object_array = static_cast<jobjectArray>(src);
+  const int num_elements = env->GetArrayLength(object_array);
+
+  // If src is a 1-dimensional array, add the strings into dst_buffer. Else
+  // recursively call populateStringDynamicBuffer over sub-dimensions.
+  if (dims_left <= 1) {
+    for (int i = 0; i < num_elements; ++i) {
+      jstring string_obj =
+          static_cast<jstring>(env->GetObjectArrayElement(object_array, i));
+      const char* chars = env->GetStringUTFChars(string_obj, nullptr);
+      // + 1 for terminating character.
+      const int byte_len = env->GetStringUTFLength(string_obj) + 1;
+      dst_buffer->AddString(chars, byte_len);
+      env->ReleaseStringUTFChars(string_obj, chars);
+      env->DeleteLocalRef(string_obj);
+    }
+  } else {
+    for (int i = 0; i < num_elements; ++i) {
+      jobject row = env->GetObjectArrayElement(object_array, i);
+      PopulateStringDynamicBuffer(env, row, dst_buffer, dims_left - 1);
+      env->DeleteLocalRef(row);
+      if (env->ExceptionCheck()) return;
+    }
+  }
+}
+
+void WriteMultiDimensionalStringArray(JNIEnv* env, jobject src,
+                                      TfLiteTensor* tensor) {
+  tflite::DynamicBuffer dst_buffer;
+  PopulateStringDynamicBuffer(env, src, &dst_buffer, tensor->dims->size);
+  if (!env->ExceptionCheck()) {
+    dst_buffer.WriteToTensor(tensor, /*new_shape=*/nullptr);
+  }
+}
+
 }  // namespace
 
 JNIEXPORT jlong JNICALL Java_org_tensorflow_lite_Tensor_create(
@@ -266,8 +339,14 @@ Java_org_tensorflow_lite_Tensor_readMultiDimensionalArray(JNIEnv* env,
                    "Internal error: Cannot copy empty/scalar Tensors.");
     return;
   }
-  readMultiDimensionalArray(env, tensor->type, tensor->data.raw, tensor->bytes,
-                            num_dims, static_cast<jarray>(value));
+  if (tensor->type == kTfLiteString) {
+    ReadMultiDimensionalStringArray(env, tensor, num_dims, 0,
+                                    static_cast<jarray>(value));
+  } else {
+    ReadMultiDimensionalArray(env, tensor->type, tensor->data.raw,
+                              tensor->bytes, num_dims,
+                              static_cast<jarray>(value));
+  }
 }
 
 JNIEXPORT void JNICALL
@@ -277,7 +356,7 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
                                                            jobject src) {
   TfLiteTensor* tensor = GetTensorFromHandle(env, handle);
   if (tensor == nullptr) return;
-  if (tensor->data.raw == nullptr) {
+  if (tensor->type != kTfLiteString && tensor->data.raw == nullptr) {
     throwException(env, kIllegalArgumentException,
                    "Internal error: Target Tensor hasn't been allocated.");
     return;
@@ -287,8 +366,12 @@ Java_org_tensorflow_lite_Tensor_writeMultiDimensionalArray(JNIEnv* env,
                    "Internal error: Cannot copy empty/scalar Tensors.");
     return;
   }
-  writeMultiDimensionalArray(env, src, tensor->type, tensor->dims->size,
-                             &tensor->data.raw, tensor->bytes);
+  if (tensor->type == kTfLiteString) {
+    WriteMultiDimensionalStringArray(env, src, tensor);
+  } else {
+    WriteMultiDimensionalArray(env, src, tensor->type, tensor->dims->size,
+                               &tensor->data.raw, tensor->bytes);
+  }
 }
 
 JNIEXPORT jint JNICALL Java_org_tensorflow_lite_Tensor_dtype(JNIEnv* env,
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
index 6d6417f895e88584b46f619565a593a61921189d..8412ec0e9dacd5e837286e629603e0e354d2341c 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/DataTypeTest.java
@@ -30,6 +30,7 @@ public final class DataTypeTest {
     assertThat(DataType.INT32.byteSize()).isEqualTo(4);
     assertThat(DataType.UINT8.byteSize()).isEqualTo(1);
     assertThat(DataType.INT64.byteSize()).isEqualTo(8);
+    assertThat(DataType.STRING.byteSize()).isEqualTo(-1);
   }
 
   @Test
diff --git a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
index 07d334c33b233705df167eeafb80496b04730b18..b00efa77cbf60296f0ee3db8059bac01edd6ccea 100644
--- a/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
+++ b/tensorflow/lite/java/src/test/java/org/tensorflow/lite/NativeInterpreterWrapperTest.java
@@ -43,6 +43,9 @@ public final class NativeInterpreterWrapperTest {
   private static final String BYTE_MODEL_PATH =
       "tensorflow/lite/java/src/testdata/uint8.bin";
 
+  private static final String STRING_MODEL_PATH =
+      "tensorflow/lite/java/src/testdata/string.bin";
+
   private static final String QUANTIZED_MODEL_PATH =
       "tensorflow/lite/java/src/testdata/quantized.bin";
 
@@ -224,6 +227,50 @@ public final class NativeInterpreterWrapperTest {
     wrapper.close();
   }
 
+  @Test
+  public void testRunWithString() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(STRING_MODEL_PATH);
+    String[] oneD = {"s1", "s22", "s333"};
+    String[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    String[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    String[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    String[][][][] parsedOutputs = new String[2][4][4][12];
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, parsedOutputs);
+    wrapper.run(inputs, outputs);
+    String[] outputOneD = parsedOutputs[0][0][0];
+    String[] expected = {
+      "s1", "s22", "s333", "s1", "s22", "s333", "s1", "s22", "s333", "s1", "s22", "s333"
+    };
+    assertThat(outputOneD).isEqualTo(expected);
+    wrapper.close();
+  }
+
+  @Test
+  public void testRunWithString_wrongShapeError() {
+    NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(STRING_MODEL_PATH);
+    String[] oneD = {"s1", "s22", "s333"};
+    String[][] twoD = {oneD, oneD, oneD, oneD, oneD, oneD, oneD, oneD};
+    String[][][] threeD = {twoD, twoD, twoD, twoD, twoD, twoD, twoD, twoD};
+    String[][][][] fourD = {threeD, threeD};
+    Object[] inputs = {fourD};
+    String[][][][] parsedOutputs = new String[2][4][4][10];
+    Map<Integer, Object> outputs = new HashMap<>();
+    outputs.put(0, parsedOutputs);
+    try {
+      wrapper.run(inputs, outputs);
+      fail();
+    } catch (IllegalArgumentException e) {
+      assertThat(e)
+          .hasMessageThat()
+          .contains(
+              "Cannot copy between a TensorFlowLite tensor with shape [2, 4, 4, 12] and "
+                  + "a Java object with shape [2, 4, 4, 10]");
+    }
+    wrapper.close();
+  }
+
   @Test
   public void testRunWithByteBufferHavingBytes() {
     NativeInterpreterWrapper wrapper = new NativeInterpreterWrapper(BYTE_MODEL_PATH);
diff --git a/tensorflow/lite/java/src/test/native/BUILD b/tensorflow/lite/java/src/test/native/BUILD
index 4d3e82b1ac14990be13aaba1d917e26dcc00b961..481aea7ecd5dd8f9c26307e3b00992e21e6c2501 100644
--- a/tensorflow/lite/java/src/test/native/BUILD
+++ b/tensorflow/lite/java/src/test/native/BUILD
@@ -12,20 +12,11 @@ cc_library(
     testonly = 1,
     srcs = [
         "interpreter_test_jni.cc",
-    ] + select({
-        # The Android toolchain makes "jni.h" available in the include path.
-        # For non-Android toolchains, generate jni.h and jni_md.h.
-        "//tensorflow:android": [],
-        "//conditions:default": [
-            "//tensorflow/lite/java/src/main/native:jni.h",
-            "//tensorflow/lite/java/src/main/native:jni_md.h",
-        ],
-    }),
-    includes = select({
-        "//tensorflow:android": [],
-        "//conditions:default": ["../../main/native/."],
-    }),
-    deps = ["//tensorflow/lite/c:c_api_internal"],
+    ],
+    deps = [
+        "//tensorflow/lite/c:c_api_internal",
+        "//tensorflow/lite/java/jni",
+    ],
 )
 
 tflite_jni_binary(
diff --git a/tensorflow/lite/java/src/testdata/string.bin b/tensorflow/lite/java/src/testdata/string.bin
new file mode 100644
index 0000000000000000000000000000000000000000..36a2509acdfa17841d0c128674e7b4e382ad00fc
Binary files /dev/null and b/tensorflow/lite/java/src/testdata/string.bin differ
diff --git a/tensorflow/lite/kernels/BUILD b/tensorflow/lite/kernels/BUILD
index 010ba834661f7df7856cd7d2eebe396ba8746987..bad1c4aebf1e9d9c7c6d33f87a6e7ea9cab8d700 100644
--- a/tensorflow/lite/kernels/BUILD
+++ b/tensorflow/lite/kernels/BUILD
@@ -26,7 +26,6 @@ tf_cc_test(
     size = "small",
     srcs = ["optional_tensor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -124,7 +123,6 @@ tf_cc_test(
     size = "small",
     srcs = ["kernel_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -139,7 +137,6 @@ tf_cc_test(
     size = "small",
     srcs = ["test_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -183,6 +180,7 @@ cc_library(
         "exp.cc",
         "expand_dims.cc",
         "fake_quant.cc",
+        "fill.cc",
         "floor.cc",
         "floor_div.cc",
         "floor_mod.cc",
@@ -197,6 +195,7 @@ cc_library(
         "lstm.cc",
         "maximum_minimum.cc",
         "mfcc.cc",
+        "mirror_pad.cc",
         "mul.cc",
         "neg.cc",
         "one_hot.cc",
@@ -219,6 +218,8 @@ cc_library(
         "sparse_output_fully_connected.cc",
         "sparse_to_dense.cc",
         "split.cc",
+        "split_v.cc",
+        "squared_difference.cc",
         "squeeze.cc",
         "strided_slice.cc",
         "sub.cc",
@@ -289,7 +290,6 @@ tf_cc_test(
     size = "small",
     srcs = ["audio_spectrogram_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -306,7 +306,6 @@ tf_cc_test(
     size = "small",
     srcs = ["mfcc_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -323,7 +322,6 @@ tf_cc_test(
     size = "small",
     srcs = ["detection_postprocess_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -340,7 +338,6 @@ tf_cc_test(
     size = "small",
     srcs = ["relu1_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -357,7 +354,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sparse_output_fully_connected_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -400,7 +396,6 @@ tf_cc_test(
     size = "small",
     srcs = ["arg_min_max_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -416,7 +411,6 @@ tf_cc_test(
     size = "small",
     srcs = ["div_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -432,7 +426,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sub_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -448,7 +441,6 @@ tf_cc_test(
     size = "small",
     srcs = ["transpose_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -466,7 +458,6 @@ tf_cc_test(
     size = "small",
     srcs = ["space_to_batch_nd_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -482,7 +473,6 @@ tf_cc_test(
     size = "small",
     srcs = ["batch_to_space_nd_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -498,7 +488,6 @@ tf_cc_test(
     size = "small",
     srcs = ["cast_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -555,7 +544,6 @@ tf_cc_test(
     size = "small",
     srcs = ["dequantize_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -585,7 +573,6 @@ tf_cc_test(
     size = "small",
     srcs = ["bidirectional_sequence_lstm_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -602,7 +589,6 @@ tf_cc_test(
     size = "small",
     srcs = ["floor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -618,7 +604,6 @@ tf_cc_test(
     size = "small",
     srcs = ["elementwise_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -634,7 +619,6 @@ tf_cc_test(
     size = "small",
     srcs = ["unidirectional_sequence_lstm_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -650,7 +634,6 @@ tf_cc_test(
     size = "small",
     srcs = ["bidirectional_sequence_rnn_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -666,7 +649,6 @@ tf_cc_test(
     size = "small",
     srcs = ["unidirectional_sequence_rnn_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -695,7 +677,6 @@ tf_cc_test(
     size = "small",
     srcs = ["exp_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -711,7 +692,6 @@ tf_cc_test(
     size = "small",
     srcs = ["fake_quant_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -727,7 +707,6 @@ tf_cc_test(
     size = "small",
     srcs = ["maximum_minimum_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -743,7 +722,6 @@ tf_cc_test(
     size = "small",
     srcs = ["reduce_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -772,7 +750,6 @@ tf_cc_test(
     size = "small",
     srcs = ["pad_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -801,7 +778,6 @@ tf_cc_test(
     size = "small",
     srcs = ["gather_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -818,7 +794,6 @@ tf_cc_test(
     size = "small",
     srcs = ["topk_v2_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -955,7 +930,6 @@ tf_cc_test(
     size = "small",
     srcs = ["log_softmax_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1053,7 +1027,21 @@ tf_cc_test(
     size = "small",
     srcs = ["split_test.cc"],
     tags = [
-        "no_oss",
+        "tflite_not_portable_ios",
+    ],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "split_v_test",
+    size = "small",
+    srcs = ["split_v_test.cc"],
+    tags = [
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1069,7 +1057,6 @@ tf_cc_test(
     size = "small",
     srcs = ["squeeze_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1085,7 +1072,6 @@ tf_cc_test(
     size = "small",
     srcs = ["strided_slice_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1101,7 +1087,6 @@ tf_cc_test(
     size = "small",
     srcs = ["tile_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1120,7 +1105,6 @@ tf_cc_test(
         "comparisons_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1136,7 +1120,6 @@ tf_cc_test(
     size = "small",
     srcs = ["neg_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1154,7 +1137,6 @@ tf_cc_test(
         "select_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1172,7 +1154,6 @@ tf_cc_test(
         "slice_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1188,7 +1169,6 @@ tf_cc_test(
     size = "small",
     srcs = ["transpose_conv_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1205,7 +1185,6 @@ tf_cc_test(
     size = "small",
     srcs = ["expand_dims_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1222,7 +1201,6 @@ tf_cc_test(
     size = "small",
     srcs = ["sparse_to_dense_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1239,7 +1217,6 @@ tf_cc_test(
     size = "small",
     srcs = ["shape_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1256,7 +1233,6 @@ tf_cc_test(
     size = "small",
     srcs = ["pow_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -1379,6 +1355,32 @@ tf_cc_test(
     ],
 )
 
+tf_cc_test(
+    name = "squared_difference_test",
+    size = "small",
+    srcs = ["squared_difference_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
+tf_cc_test(
+    name = "fill_test",
+    size = "small",
+    srcs = ["fill_test.cc"],
+    tags = ["tflite_not_portable_ios"],
+    deps = [
+        ":builtin_ops",
+        "//tensorflow/lite:framework",
+        "//tensorflow/lite/kernels:test_util",
+        "@com_google_googletest//:gtest",
+    ],
+)
+
 filegroup(
     name = "all_files",
     srcs = glob(
@@ -1392,3 +1394,14 @@ filegroup(
 )
 
 tflite_portable_test_suite()
+
+tf_cc_test(
+    name = "mirror_pad_test",
+    srcs = ["mirror_pad_test.cc"],
+    deps = [
+        ":builtin_ops",
+        ":test_util",
+        "//tensorflow/lite:framework",
+        "@com_google_googletest//:gtest_main",
+    ],
+)
diff --git a/tensorflow/lite/kernels/activations.cc b/tensorflow/lite/kernels/activations.cc
index 9c525d964077eb7007a27a004786961e67ea21dd..a76654256044702736a2855d4bb12d445c90be55 100644
--- a/tensorflow/lite/kernels/activations.cc
+++ b/tensorflow/lite/kernels/activations.cc
@@ -45,6 +45,11 @@ struct LogSoftmaxOpData : public OpData {
   int32_t reverse_scaling_right_shift = 0;
 };
 
+struct PreluOpData : public OpData {
+  int32_t output_multiplier = 0;
+  int output_shift = 0;
+};
+
 void* Init(TfLiteContext* context, const char* buffer, size_t length) {
   // This is a builtin op, so we don't use the contents in 'buffer', if any.
   // Instead, we allocate a new object to carry information from Prepare() to
@@ -57,6 +62,10 @@ void* LogSoftmaxInit(TfLiteContext* context, const char* buffer,
   return new LogSoftmaxOpData;
 }
 
+void* PreluInit(TfLiteContext* context, const char* buffer, size_t length) {
+  return new PreluOpData;
+}
+
 void Free(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<OpData*>(buffer);
 }
@@ -65,6 +74,10 @@ void LogSoftmaxFree(TfLiteContext* context, void* buffer) {
   delete reinterpret_cast<LogSoftmaxOpData*>(buffer);
 }
 
+void PreluFree(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<PreluOpData*>(buffer);
+}
+
 TfLiteStatus GenericPrepare(TfLiteContext* context, TfLiteNode* node) {
   TF_LITE_ENSURE_EQ(context, NumInputs(node), 1);
   TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
@@ -253,13 +266,18 @@ TfLiteStatus PreluPrepare(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   TfLiteTensor* output = GetOutput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
+  PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
 
-  // Currently only Float32 is supported
-  // TODO(ycling): Support other data types.
-  TF_LITE_ENSURE_EQ(context, input->type, kTfLiteFloat32);
-  TF_LITE_ENSURE_EQ(context, alpha->type, kTfLiteFloat32);
+  TF_LITE_ENSURE_EQ(context, input->type, alpha->type);
   output->type = input->type;
 
+  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt16) {
+    double real_multiplier =
+        input->params.scale * alpha->params.scale / output->params.scale;
+    QuantizeMultiplierSmallerThanOneExp(
+        real_multiplier, &data->output_multiplier, &data->output_shift);
+  }
+
   // PRelu (parameteric Relu) shares the same alpha value on "shared axis".
   // This means it's always required to "broadcast" alpha values in PRelu.
   TfLiteIntArray* output_size = nullptr;
@@ -288,8 +306,8 @@ TfLiteStatus ReluEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -309,8 +327,8 @@ TfLiteStatus Relu1Eval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -327,9 +345,24 @@ TfLiteStatus Relu6Eval(TfLiteContext* context, TfLiteNode* node) {
       for (; in < in_end; in++, out++) *out = std::min(std::max(0.f, *in), 6.f);
       return kTfLiteOk;
     } break;
+    case kTfLiteUInt8: {
+      ActivationParams params;
+      params.activation_type = FusedActivationFunctionType::kRelu6;
+      params.quantized_activation_min = std::max(
+          0, output->params.zero_point +
+                 static_cast<int32>(roundf(0.f / output->params.scale)));
+      params.quantized_activation_max = std::min(
+          255, output->params.zero_point +
+                   static_cast<int32>(roundf(6.f / output->params.scale)));
+      optimized_ops::ReluX(params, GetTensorShape(input),
+                           GetTensorData<uint8>(input), GetTensorShape(output),
+                           GetTensorData<uint8>(output));
+      return kTfLiteOk;
+    } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(
+          context, "Only float32 and uint8 supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -367,8 +400,8 @@ TfLiteStatus TanhEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     } break;
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -407,9 +440,8 @@ TfLiteStatus SigmoidEval(TfLiteContext* context, TfLiteNode* node) {
       break;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently, got %d.",
-                           input->type);
-      return kTfLiteError;
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
   }
   return kTfLiteOk;
 }
@@ -604,8 +636,8 @@ TfLiteStatus SoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
     }
     default:
       context->ReportError(
-          context, "Only float32 and uint8_t supported currently, got %d.",
-          input->type);
+          context, "Only float32 and uint8_t supported currently, got %s.",
+          TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -636,8 +668,8 @@ TfLiteStatus LogSoftmaxEval(TfLiteContext* context, TfLiteNode* node) {
       return kTfLiteOk;
     }
     default:
-      context->ReportError(context, "Only float32 supported currently., got %d",
-                           input->type);
+      context->ReportError(context, "Only float32 supported currently., got %s",
+                           TfLiteTypeGetName(input->type));
       return kTfLiteError;
   }
 }
@@ -651,16 +683,57 @@ TfLiteStatus PreluEval(TfLiteContext* context, TfLiteNode* node) {
   const TfLiteTensor* input = GetInput(context, node, 0);
   const TfLiteTensor* alpha = GetInput(context, node, 1);
   TfLiteTensor* output = GetOutput(context, node, 0);
-  if (input->type != kTfLiteFloat32) {
-    context->ReportError(context, "Only float32 supported currently, got %d.",
-                         input->type);
-    return kTfLiteError;
+  const PreluOpData* data = reinterpret_cast<PreluOpData*>(node->user_data);
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
+          GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(alpha), GetTensorData<float>(alpha),
+          GetTensorShape(output), GetTensorData<float>(output),
+          ApplyPrelu<float>);
+      return kTfLiteOk;
+    } break;
+    case kTfLiteUInt8: {
+      PreluParams op_params;
+      op_params.input_offset = -input->params.zero_point;
+      op_params.alpha_offset = -alpha->params.zero_point;
+      op_params.output_offset = output->params.zero_point;
+      op_params.output_multiplier = data->output_multiplier;
+      op_params.output_shift = data->output_shift;
+      reference_ops::BroadcastPrelu4DSlow(
+          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
+          GetTensorShape(alpha), GetTensorData<uint8_t>(alpha),
+          GetTensorShape(output), GetTensorData<uint8_t>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context,
+                           "Only float32, uint8 supported currently, got %d.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
+  }
+}
+
+TfLiteStatus LeakyReluEval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input = GetInput(context, node, 0);
+  TfLiteTensor* output = GetOutput(context, node, 0);
+  const auto* params =
+      reinterpret_cast<TfLiteLeakyReluParams*>(node->builtin_data);
+
+  LeakyReluParams op_params;
+  op_params.alpha = params->alpha;
+  switch (input->type) {
+    case kTfLiteFloat32: {
+      optimized_ops::LeakyRelu(
+          op_params, GetTensorShape(input), GetTensorData<float>(input),
+          GetTensorShape(output), GetTensorData<float>(output));
+      return kTfLiteOk;
+    } break;
+    default:
+      context->ReportError(context, "Only float32 supported currently, got %s.",
+                           TfLiteTypeGetName(input->type));
+      return kTfLiteError;
   }
-  reference_ops::BroadcastBinaryFunction4DSlow<float, float, float>(
-      GetTensorShape(input), GetTensorData<float>(input), GetTensorShape(alpha),
-      GetTensorData<float>(alpha), GetTensorShape(output),
-      GetTensorData<float>(output), ApplyPrelu<float>);
-  return kTfLiteOk;
 }
 
 }  // namespace activations
@@ -715,12 +788,19 @@ TfLiteRegistration* Register_LOG_SOFTMAX() {
 }
 
 TfLiteRegistration* Register_PRELU() {
-  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+  static TfLiteRegistration r = {activations::PreluInit, activations::PreluFree,
                                  activations::PreluPrepare,
                                  activations::PreluEval};
   return &r;
 }
 
+TfLiteRegistration* Register_LEAKY_RELU() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 activations::GenericPrepare,
+                                 activations::LeakyReluEval};
+  return &r;
+}
+
 }  // namespace builtin
 }  // namespace ops
 }  // namespace tflite
diff --git a/tensorflow/lite/kernels/activations_test.cc b/tensorflow/lite/kernels/activations_test.cc
index fff4121dc0c265d9dc3fe50521683b0be4ab4f94..67f137baff29808d7a03571e1880901e44c34712 100644
--- a/tensorflow/lite/kernels/activations_test.cc
+++ b/tensorflow/lite/kernels/activations_test.cc
@@ -170,6 +170,29 @@ TEST(FloatActivationsOpTest, Tanh) {
                              })));
 }
 
+TEST(QuantizedActivationsOpTest, Relu6) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedActivationsOpModel m(
+      BuiltinOperator_RELU6,
+      /*input=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax},
+      /*output=*/{TensorType_UINT8, {1, 2, 4, 1}, 8 * kMin, 8 * kMax});
+  m.SetInput<uint8_t>({
+      0, -6, 2, 4,   //
+      3, -2, 10, 1,  //
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0, 0, 2, 4,  //
+                      3, 0, 6, 1,  //
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(),
+              ElementsAreArray({128, 128, 160, 192, 176, 128, 224, 144}));
+}
+
 TEST(QuantizedActivationsOpTest, Tanh) {
   const float kMin = -1;
   const float kMax = 127.f / 128.f;
@@ -563,15 +586,29 @@ TEST(QuantizedActivationsOpTest, LogSoftmax) {
               ElementsAreArray({189, 93, 221, 253, 142, 63, 255, 111}));
 }
 
-class PReluOpModel : public SingleOpModel {
+// A base class of PRelu op model. It provides the constructor for
+// FloatPReluOpModel and QuantizedPReluOpModel.
+class BasePReluOpModel : public SingleOpModel {
  public:
-  PReluOpModel(const TensorData& input, const TensorData& alpha) {
+  BasePReluOpModel(const TensorData& input, const TensorData& alpha) {
     input_ = AddInput(input);
     alpha_ = AddInput(alpha);
-    output_ = AddOutput(input);
+    output_ = AddOutput({input.type, input.shape, input.min, input.max});
     SetBuiltinOp(BuiltinOperator_PRELU, BuiltinOptions_NONE, 0);
     BuildInterpreter({GetShape(input_), GetShape(alpha_)});
   }
+
+ protected:
+  int input_;
+  int alpha_;
+  int output_;
+};
+
+// The FloatPReluOpModel class handles float input and output.
+class FloatPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
   void SetInput(std::initializer_list<float> data) {
     PopulateTensor(input_, data);
   }
@@ -579,16 +616,35 @@ class PReluOpModel : public SingleOpModel {
     PopulateTensor(alpha_, data);
   }
   std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
 
- protected:
-  int input_;
-  int alpha_;
-  int output_;
+// The QuantizedPReluOpModel class handles quantized input and output.
+class QuantizedPReluOpModel : public BasePReluOpModel {
+ public:
+  using BasePReluOpModel::BasePReluOpModel;
+
+  template <typename T>
+  void SetInput(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(input_, data);
+  }
+  template <typename T>
+  void SetAlpha(std::initializer_list<float> data) {
+    QuantizeAndPopulate<T>(alpha_, data);
+  }
+  template <typename T>
+  std::vector<T> GetOutput() {
+    return ExtractVector<T>(output_);
+  }
+  template <typename T>
+  std::vector<float> GetDequantizedOutput() {
+    return Dequantize<T>(ExtractVector<T>(output_), GetScale(output_),
+                         GetZeroPoint(output_));
+  }
 };
 
 TEST(FloatActivationsOpTest, PRelu) {
-  PReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
-                 {TensorType_FLOAT32, {1, 1, 3}});
+  FloatPReluOpModel m({TensorType_FLOAT32, {1, 2, 2, 3}},
+                      {TensorType_FLOAT32, {1, 1, 3}});
 
   m.SetInput({
       0.0f, 0.0f, 0.0f,     // Row 1, Column 1
@@ -606,6 +662,69 @@ TEST(FloatActivationsOpTest, PRelu) {
                              }));
 }
 
+TEST(QuantizedActivationsOpTest, PRelu) {
+  const float kMin = -1;
+  const float kMax = 127.f / 128.f;
+  QuantizedPReluOpModel m({TensorType_UINT8, {1, 2, 2, 3}, kMin, kMax},
+                          {TensorType_UINT8, {1, 1, 3}, kMin, kMax});
+  m.SetInput<uint8_t>({
+      0.0f, 0.0f, 0.0f,        // Row 1, Column 1
+      0.5f, 0.5f, 0.5f,        // Row 1, Column 2
+      -1.0f, -1.0f, -1.0f,     // Row 2, Column 1
+      -0.25f, -0.25f, -0.25f,  // Row 1, Column 2
+  });
+  m.SetAlpha<uint8_t>({0.0f, 0.5f, -0.5f});
+  m.Invoke();
+  EXPECT_THAT(m.GetDequantizedOutput<uint8_t>(),
+              ElementsAreArray(ArrayFloatNear(
+                  {
+                      0.0f, 0.0f, 0.0f,       // Row 1, Column 1
+                      0.5f, 0.5f, 0.5f,       // Row 1, Column 2
+                      0.0f, -0.5f, 0.5f,      // Row 2, Column 1
+                      0.0f, -0.125f, 0.125f,  // Row 1, Column 2
+                  },
+                  kQuantizedTolerance)));
+  EXPECT_THAT(m.GetOutput<uint8_t>(), ElementsAreArray({
+                                          128, 128, 128,  // Row 1, Column 1
+                                          192, 192, 192,  // Row 1, Column 2
+                                          128, 64, 192,   // Row 2, Column 1
+                                          128, 112, 144,  // Row 1, Column 2
+                                      }));
+}
+
+class LeakyReluOpModel : public SingleOpModel {
+ public:
+  LeakyReluOpModel(const TensorData& input, float alpha) {
+    input_ = AddInput(input);
+    output_ = AddOutput(input);
+    SetBuiltinOp(BuiltinOperator_LEAKY_RELU, BuiltinOptions_LeakyReluOptions,
+                 CreateLeakyReluOptions(builder_, alpha).Union());
+    BuildInterpreter({GetShape(input_)});
+  }
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+
+ protected:
+  int input_;
+  int output_;
+};
+
+TEST(FloatActivationsOpTest, LeakyRelu) {
+  LeakyReluOpModel m({TensorType_FLOAT32, {2, 3}}, 0.5f);
+
+  m.SetInput({
+      0.0f, 1.0f, 3.0f,    // Row 1
+      1.0f, -1.0f, -2.0f,  // Row 2
+  });
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({
+                                 0.0f, 1.0f, 3.0f,    // Row 1
+                                 1.0f, -0.5f, -1.0f,  // Row 2
+                             }));
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/conv.cc b/tensorflow/lite/kernels/conv.cc
index 0c14b9eb65692fa25dbd5784a65ebc4bded8853c..1fd870be93eda12d1c057e29b017d80e2a96412b 100644
--- a/tensorflow/lite/kernels/conv.cc
+++ b/tensorflow/lite/kernels/conv.cc
@@ -24,7 +24,6 @@ limitations under the License.
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/eigen_support.h"
 #include "tensorflow/lite/kernels/gemm_support.h"
-#include "tensorflow/lite/kernels/internal/optimized/cblas_conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
@@ -491,11 +490,10 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
   CalculateActivationRange(params->activation, &output_activation_min,
                            &output_activation_max);
   KernelType effective_kernel_type;
-  if ((kernel_type == kMultithreadOptimized ||
-       kernel_type == kCblasOptimized) &&
+  if ((kernel_type == kMultithreadOptimized) &&
       (params->dilation_width_factor != 1 ||
        params->dilation_height_factor != 1)) {
-    // kMultithreadOptimized and kCblasOptimized do not support dilation.
+    // kMultithreadOptimized does not support dilation.
     // Therefore, fallback to optimized.
     effective_kernel_type = kGenericOptimized;
   } else {
@@ -521,6 +519,7 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
                           GetTensorData<float>(im2col));
       break;
     }
+    case kCblasOptimized:
     case kGenericOptimized: {
       optimized_ops::Conv(op_params, GetTensorShape(input),
                           GetTensorData<float>(input), GetTensorShape(filter),
@@ -546,15 +545,6 @@ void EvalFloat(TfLiteContext* context, TfLiteNode* node,
           GetTensorData<float>(im2col));
       break;
     }
-    case kCblasOptimized: {
-      cblas_ops::Conv(op_params, GetTensorShape(input),
-                      GetTensorData<float>(input), GetTensorShape(filter),
-                      GetTensorData<float>(filter), GetTensorShape(bias),
-                      GetTensorData<float>(bias), GetTensorShape(output),
-                      GetTensorData<float>(output), GetTensorShape(im2col),
-                      GetTensorData<float>(im2col));
-      break;
-    }
   }
 }
 
diff --git a/tensorflow/lite/kernels/dequantize.cc b/tensorflow/lite/kernels/dequantize.cc
index b2825bb9ea5a57789bf6f3aa312b09c43f07bbf7..7f03c73c9c960e3c134e33bf78a572f100405b7a 100644
--- a/tensorflow/lite/kernels/dequantize.cc
+++ b/tensorflow/lite/kernels/dequantize.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/lite/c/builtin_op_data.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
@@ -57,7 +58,8 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
 
   OpContext op_context(context, node);
 
-  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8);
+  TF_LITE_ENSURE(context, op_context.input->type == kTfLiteUInt8 ||
+                              op_context.input->type == kTfLiteInt8);
 
   op_context.output->type = kTfLiteFloat32;
   // If the input tensor is constant, we can persist the dequantized value in
@@ -80,10 +82,25 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   tflite::DequantizationParams op_params;
   op_params.zero_point = op_context.input->params.zero_point;
   op_params.scale = op_context.input->params.scale;
-  optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
-                            GetTensorData<uint8_t>(op_context.input),
-                            GetTensorShape(op_context.output),
-                            GetTensorData<float>(op_context.output));
+  switch (op_context.input->type) {
+    case kTfLiteUInt8:
+      optimized_ops::Dequantize(op_params, GetTensorShape(op_context.input),
+                                GetTensorData<uint8_t>(op_context.input),
+                                GetTensorShape(op_context.output),
+                                GetTensorData<float>(op_context.output));
+      break;
+    case kTfLiteInt8:
+      reference_integer_ops::Dequantize(
+          op_params, GetTensorShape(op_context.input),
+          GetTensorData<int8_t>(op_context.input),
+          GetTensorShape(op_context.output),
+          GetTensorData<float>(op_context.output));
+      break;
+    default:
+      context->ReportError(context, "Type %d not supported.",
+                           op_context.input->type);
+      return kTfLiteError;
+  }
 
   if (IsConstantTensor(op_context.input)) {
     op_data->float_dequantized_weights_initialized = true;
diff --git a/tensorflow/lite/kernels/dequantize_test.cc b/tensorflow/lite/kernels/dequantize_test.cc
index 55265d93e527fdf69d8958c14ab9e347d57b3ce0..bb5f1e74a8b0174209043e14af9c35db32bf14b5 100644
--- a/tensorflow/lite/kernels/dequantize_test.cc
+++ b/tensorflow/lite/kernels/dequantize_test.cc
@@ -25,8 +25,16 @@ using ::testing::ElementsAreArray;
 
 class DequantizeOpModel : public SingleOpModel {
  public:
-  DequantizeOpModel(std::initializer_list<int> shape, float min, float max) {
-    input_ = AddInput({TensorType_UINT8, shape, min, max});
+  DequantizeOpModel(TensorType type, std::initializer_list<int> shape,
+                    float scale, int32_t zero_point) {
+    TensorData input_tensor_data;
+    input_tensor_data.type = type;
+    input_tensor_data.shape = shape;
+    input_tensor_data.min = 0;
+    input_tensor_data.max = 0;
+    input_tensor_data.scale = scale;
+    input_tensor_data.zero_point = zero_point;
+    input_ = AddInput(input_tensor_data);
     output_ = AddOutput({TensorType_FLOAT32, shape});
     SetBuiltinOp(BuiltinOperator_DEQUANTIZE, BuiltinOptions_DequantizeOptions,
                  CreateDequantizeOptions(builder_).Union());
@@ -34,7 +42,8 @@ class DequantizeOpModel : public SingleOpModel {
     BuildInterpreter({GetShape(input_)});
   }
 
-  void SetInput(std::initializer_list<uint8_t> data) {
+  template <typename T>
+  void SetInput(std::initializer_list<T> data) {
     PopulateTensor(input_, data);
   }
 
@@ -45,10 +54,22 @@ class DequantizeOpModel : public SingleOpModel {
   int output_;
 };
 
-TEST(SplitOpTest, FourDimensional) {
-  DequantizeOpModel m({2, 5}, -63.5, 64);
+TEST(DequantizeOpTest, UINT8) {
+  // [-63.5, 64] -> scale=0.5 zero_point=127 for UINT8
+  DequantizeOpModel m(TensorType_UINT8, {2, 5}, 0.5, 127);
 
-  m.SetInput({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.SetInput<uint8>({0, 1, 2, 3, 4, 251, 252, 253, 254, 255});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear(
+                  {-63.5, -63, -62.5, -62, -61.5, 62, 62.5, 63, 63.5, 64})));
+}
+
+TEST(DequantizeOpTest, INT8) {
+  // [-63.5, 64] -> scale=0.5, zero_point=1 for INT8
+  DequantizeOpModel m(TensorType_INT8, {2, 5}, 0.5, -1);
+
+  m.SetInput<int8>({-128, -127, -126, -125, -124, 123, 124, 125, 126, 127});
   m.Invoke();
   EXPECT_THAT(m.GetOutput(),
               ElementsAreArray(ArrayFloatNear(
diff --git a/tensorflow/lite/kernels/eigen_support.cc b/tensorflow/lite/kernels/eigen_support.cc
index 44e0086ad88303a5214161e533313923f9aed301..bad5975a7c187cc4bdcd65721d397897ff2cf09d 100644
--- a/tensorflow/lite/kernels/eigen_support.cc
+++ b/tensorflow/lite/kernels/eigen_support.cc
@@ -34,6 +34,15 @@ static_assert(
     "kDefaultArenaAlignment doesn't comply with Eigen alignment requirement.");
 #endif  // EIGEN_DONT_ALIGN
 
+// Helper routine for updating the global Eigen thread count used for OpenMP.
+void SetEigenNbThreads(int threads) {
+#if defined(EIGEN_HAS_OPENMP)
+  // The global Eigen thread count is only used when OpenMP is enabled. As this
+  // call causes problems with tsan, make it only when OpenMP is available.
+  Eigen::setNbThreads(context->recommended_num_threads);
+#endif  // defined(EIGEN_HAS_OPENMP)
+}
+
 // We have a single global threadpool for all convolution operations. This means
 // that inferences started from different threads may block each other, but
 // since the underlying resource of CPU cores should be consumed by the
@@ -78,7 +87,7 @@ void InitDevice(TfLiteContext* context, RefCountedEigenContext* ptr) {
 }
 
 TfLiteStatus Refresh(TfLiteContext* context) {
-  Eigen::setNbThreads(context->recommended_num_threads);
+  SetEigenNbThreads(context->recommended_num_threads);
 
   auto* ptr = GetEigenContext(context);
   if (ptr != nullptr) {
@@ -94,7 +103,7 @@ void IncrementUsageCounter(TfLiteContext* context) {
   auto* ptr = GetEigenContext(context);
   if (ptr == nullptr) {
     if (context->recommended_num_threads != -1) {
-      Eigen::setNbThreads(context->recommended_num_threads);
+      SetEigenNbThreads(context->recommended_num_threads);
     }
     ptr = new RefCountedEigenContext;
     ptr->type = kTfLiteEigenContext;
diff --git a/tensorflow/lite/kernels/elementwise.cc b/tensorflow/lite/kernels/elementwise.cc
index 416a69eb0ed824783f03975c50b744bfce118117..a79388b900eb89b56a4d18f887dbe52e84fb123f 100644
--- a/tensorflow/lite/kernels/elementwise.cc
+++ b/tensorflow/lite/kernels/elementwise.cc
@@ -15,6 +15,7 @@ limitations under the License.
 
 #include <cmath>
 #include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 
@@ -74,6 +75,10 @@ inline TfLiteStatus EvalLogical(TfLiteContext* context, TfLiteNode* node,
   return EvalImpl<bool>(context, node, bool_func, kTfLiteBool);
 }
 
+TfLiteStatus AbsEval(TfLiteContext* context, TfLiteNode* node) {
+  return EvalNumeric(context, node, std::abs);
+}
+
 TfLiteStatus SinEval(TfLiteContext* context, TfLiteNode* node) {
   return EvalNumeric(context, node, std::sin);
 }
@@ -101,6 +106,14 @@ TfLiteStatus LogicalNotEval(TfLiteContext* context, TfLiteNode* node) {
 }  // namespace
 }  // namespace elementwise
 
+TfLiteRegistration* Register_ABS() {
+  static TfLiteRegistration r = {
+      /*init=*/nullptr, /*free=*/nullptr,
+      elementwise::GenericPrepare<elementwise::IsNumericSupportedType>,
+      elementwise::AbsEval};
+  return &r;
+}
+
 TfLiteRegistration* Register_SIN() {
   static TfLiteRegistration r = {
       /*init=*/nullptr, /*free=*/nullptr,
diff --git a/tensorflow/lite/kernels/elementwise_test.cc b/tensorflow/lite/kernels/elementwise_test.cc
index 52df8dc3cca0b0550702f1904bfd15c282aaed73..7d24320081257925508b2aa53503c1cf71d0e913 100644
--- a/tensorflow/lite/kernels/elementwise_test.cc
+++ b/tensorflow/lite/kernels/elementwise_test.cc
@@ -74,6 +74,19 @@ TEST(ElementWise, Log) {
   EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({1, 1, 4, 1}));
 }
 
+TEST(FloatActivationsOpTest, Abs) {
+  ElementWiseOpFloatModel m(BuiltinOperator_ABS, {1, 2, 4, 1});
+  m.PopulateTensor<float>(m.input(), {
+                                         0.f, -6.2f, 2.f, 4.f,  //
+                                         3.f, -2.f, 10.f, 1.f,  //
+                                     });
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({
+                                                      0.f, 6.2f, 2.f, 4.f,  //
+                                                      3.f, 2.f, 10.f, 1.f,  //
+                                                  }));
+}
+
 TEST(ElementWise, Sqrt) {
   ElementWiseOpFloatModel m(BuiltinOperator_SQRT, {1, 1, 4, 1});
   m.PopulateTensor<float>(m.input(), {0, 1, 2, 4});
diff --git a/tensorflow/lite/kernels/fill.cc b/tensorflow/lite/kernels/fill.cc
new file mode 100644
index 0000000000000000000000000000000000000000..079ee44f3719f9fa283bf617ee3917eb4c377aff
--- /dev/null
+++ b/tensorflow/lite/kernels/fill.cc
@@ -0,0 +1,141 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace fill {
+
+namespace {
+
+constexpr int kDimsTensor = 0;
+constexpr int kValueTensor = 1;
+constexpr int kOutputTensor = 0;
+
+template <typename T>
+TfLiteStatus ResizeOutputImpl(TfLiteContext* context, const TfLiteTensor* dims,
+                              TfLiteTensor* output) {
+  TfLiteIntArray* output_shape = TfLiteIntArrayCreate(dims->dims->data[0]);
+  for (int i = 0; i < output_shape->size; ++i) {
+    T data = GetTensorData<T>(dims)[i];
+    if (data < 0) {
+      context->ReportError(context, "Fill dimensions must be >= 0", dims->type);
+      return kTfLiteError;
+    }
+    output_shape->data[i] = data;
+  }
+  return context->ResizeTensor(context, output, output_shape);
+}
+
+TfLiteStatus ResizeOutput(TfLiteContext* context, const TfLiteTensor* dims,
+                          TfLiteTensor* output) {
+  switch (dims->type) {
+    case kTfLiteInt32:
+      return ResizeOutputImpl<int32_t>(context, dims, output);
+    case kTfLiteInt64:
+      return ResizeOutputImpl<int64_t>(context, dims, output);
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64 for input 0, "
+          "got %d.",
+          dims->type);
+      return kTfLiteError;
+  }
+}
+
+}  // namespace
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  // Make sure the 1st input tensor is 1-D.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(dims), 1);
+
+  // Make sure the 1st input tensor is int32 or int64.
+  const auto dtype = dims->type;
+  TF_LITE_ENSURE(context, dtype == kTfLiteInt32 || dtype == kTfLiteInt64);
+
+  // Make sure the 2nd input tensor is a scalar.
+  TF_LITE_ENSURE_EQ(context, NumDimensions(value), 0);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+  output->type = value->type;
+
+  if (IsConstantTensor(dims)) {
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  } else {
+    SetTensorToDynamic(output);
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* value = GetInput(context, node, kValueTensor);
+
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (IsDynamicTensor(output)) {
+    const TfLiteTensor* dims = GetInput(context, node, kDimsTensor);
+    TF_LITE_ENSURE_OK(context, ResizeOutput(context, dims, output));
+  }
+#define TF_LITE_FILL(data_type)                                               \
+  reference_ops::Fill(GetTensorShape(value), GetTensorData<data_type>(value), \
+                      GetTensorShape(output),                                 \
+                      GetTensorData<data_type>(output))
+  switch (output->type) {
+    case kTfLiteInt32:
+      TF_LITE_FILL(int32_t);
+      break;
+    case kTfLiteInt64:
+      TF_LITE_FILL(int64_t);
+      break;
+    case kTfLiteFloat32:
+      TF_LITE_FILL(float);
+      break;
+    default:
+      context->ReportError(
+          context,
+          "Fill only currently supports int32, int64, float32 for input 1,"
+          "got %d.",
+          value->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_FILL
+  return kTfLiteOk;
+}
+
+}  // namespace fill
+
+TfLiteRegistration* Register_FILL() {
+  static TfLiteRegistration r = {/*init=*/nullptr, /*free=*/nullptr,
+                                 fill::Prepare, fill::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/fill_test.cc b/tensorflow/lite/kernels/fill_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..08044d76f9d95774fa1b0e37ebb6a9716e9809cb
--- /dev/null
+++ b/tensorflow/lite/kernels/fill_test.cc
@@ -0,0 +1,94 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+using ::testing::IsEmpty;
+
+class FillOpModel : public SingleOpModel {
+ public:
+  explicit FillOpModel(const TensorData& input1, const TensorData& input2) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(input1);
+    SetBuiltinOp(BuiltinOperator_FILL, BuiltinOptions_FillOptions,
+                 CreateFillOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+  int output() { return output_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+TEST(FillOpModel, FillInt32) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT32});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 3});
+  m.PopulateTensor<int32_t>(m.input2(), {-11});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int32_t>(m.output()),
+              ElementsAreArray({-11, -11, -11, -11, -11, -11}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 3}));
+}
+
+TEST(FillOpModel, FillInt64) {
+  FillOpModel m({TensorType_INT32, {2}}, {TensorType_INT64});
+  m.PopulateTensor<int32_t>(m.input1(), {2, 4});
+  m.PopulateTensor<int64_t>(m.input2(), {2 ^ 45});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<int64_t>(m.output()),
+              ElementsAreArray({2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45, 2 ^ 45,
+                                2 ^ 45, 2 ^ 45}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 4}));
+}
+
+TEST(FillOpModel, FillFloat) {
+  FillOpModel m({TensorType_INT64, {3}}, {TensorType_FLOAT32});
+  m.PopulateTensor<int64_t>(m.input1(), {2, 2, 2});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()),
+              ElementsAreArray({4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), ElementsAreArray({2, 2, 2}));
+}
+
+TEST(FillOpModel, FillOutputScalar) {
+  FillOpModel m({TensorType_INT64, {0}}, {TensorType_FLOAT32});
+  m.PopulateTensor<float>(m.input2(), {4.0});
+  m.Invoke();
+  EXPECT_THAT(m.ExtractVector<float>(m.output()), ElementsAreArray({4.0}));
+  EXPECT_THAT(m.GetTensorShape(m.output()), IsEmpty());
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/fully_connected.cc b/tensorflow/lite/kernels/fully_connected.cc
index 63cca1cf5427f9c328b68868a4cfbef3fec08bf9..a1eecb284ab647e8b7fc7b18dfd8ad82aedeece3 100644
--- a/tensorflow/lite/kernels/fully_connected.cc
+++ b/tensorflow/lite/kernels/fully_connected.cc
@@ -117,7 +117,7 @@ TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
   // Note that quantized inference requires that all tensors have their
   // parameters set. This is usually done during quantized training.
   TfLiteType data_type = input->type;
-  if (data_type != kTfLiteFloat32) {
+  if (data_type != kTfLiteFloat32 && data_type != kTfLiteInt32) {
     double real_multiplier = 0.0;
     TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler(
         context, input, filter, bias, output, &real_multiplier));
diff --git a/tensorflow/lite/kernels/gather.cc b/tensorflow/lite/kernels/gather.cc
index 61884d6a12c3e150d910244108a357dd34fe8783..f205daae1343cb0abecc95e7d1b280c10f55d897 100644
--- a/tensorflow/lite/kernels/gather.cc
+++ b/tensorflow/lite/kernels/gather.cc
@@ -118,7 +118,7 @@ TfLiteStatus GatherStrings(TfLiteContext* context, const TfLiteTensor* input,
     const auto string_ref = GetString(input, pos);
     buffer.AddString(string_ref.str, string_ref.len);
   }
-  buffer.WriteToTensor(output);
+  buffer.WriteToTensorAsVector(output);
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/kernels/hashtable_lookup.cc b/tensorflow/lite/kernels/hashtable_lookup.cc
index b6ae7a3d1a5479e8ac6996815de9cb02b472acaf..da1116cf858667b1fc35f3f88269b66f81afcdb7 100644
--- a/tensorflow/lite/kernels/hashtable_lookup.cc
+++ b/tensorflow/lite/kernels/hashtable_lookup.cc
@@ -137,7 +137,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
   if (output->type == kTfLiteString) {
-    buf.WriteToTensor(output);
+    buf.WriteToTensorAsVector(output);
   }
 
   return kTfLiteOk;
diff --git a/tensorflow/lite/kernels/internal/BUILD b/tensorflow/lite/kernels/internal/BUILD
index 6d9690ea460bd86ef481d7c82e0f8770969e35d4..69816583f5020843aeff76890f51c6c306f11a4f 100644
--- a/tensorflow/lite/kernels/internal/BUILD
+++ b/tensorflow/lite/kernels/internal/BUILD
@@ -234,8 +234,6 @@ cc_library(
 cc_library(
     name = "optimized",
     hdrs = [
-        "optimized/cblas_conv.h",
-        "optimized/cblas_reference.h",
         "optimized/eigen_spatial_convolutions.h",
         "optimized/eigen_tensor_reduced_instantiations_oss.h",
         "optimized/multithreaded_conv.h",
@@ -256,7 +254,6 @@ cc_test(
     name = "tensor_test",
     srcs = ["tensor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -289,7 +286,6 @@ cc_test(
     name = "quantization_util_test",
     srcs = ["quantization_util_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",  # TODO(b/117786830)
     ],
     deps = [
@@ -317,6 +313,7 @@ cc_library(
         "reference/depthwiseconv_float.h",
         "reference/depthwiseconv_uint8.h",
         "reference/fully_connected.h",
+        "reference/integer_ops/dequantize.h",
         "reference/reference_ops.h",
         "reference/softmax.h",
     ],
@@ -564,7 +561,6 @@ cc_test(
     }),
     linkstatic = 1,
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -578,7 +574,6 @@ cc_test(
 cc_test(
     name = "depthwiseconv_float_test",
     srcs = ["depthwiseconv_float_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -593,7 +588,6 @@ cc_test(
     srcs = ["depthwiseconv_quantized_test.cc"],
     shard_count = 2,
     tags = [
-        "no_oss",
         "tflite_not_portable_ios",
     ],
     deps = [
@@ -609,7 +603,6 @@ cc_test(
     name = "resize_bilinear_test",
     srcs = ["resize_bilinear_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -625,7 +618,6 @@ cc_test(
     name = "resize_nearest_neighbor_test",
     srcs = ["resize_nearest_neighbor_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -643,7 +635,6 @@ cc_test(
     srcs = [
         "softmax_quantized_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":quantization_util",
@@ -661,7 +652,6 @@ cc_test(
         "logsoftmax_quantized_test.cc",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
@@ -677,7 +667,6 @@ cc_test(
 cc_test(
     name = "log_quantized_test",
     srcs = ["log_quantized_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         ":reference_base",
@@ -705,7 +694,6 @@ cc_library(
 cc_test(
     name = "batch_to_space_nd_test",
     srcs = ["batch_to_space_nd_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":optimized_base",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/lite/kernels/internal/optimized/cblas_conv.h b/tensorflow/lite/kernels/internal/optimized/cblas_conv.h
deleted file mode 100644
index 53772050503b2b1947af29ba08903ef3bc92e896..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/internal/optimized/cblas_conv.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
-
-// The Conv implementation based on CBLAS interface. This is only used on iOS
-// for now, utilizing Apple's Accelerate framework.
-
-#if TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
-#include <Accelerate/Accelerate.h>
-#else
-#include "tensorflow/lite/kernels/internal/optimized/cblas_reference.h"
-#endif
-
-#include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
-#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
-
-namespace tflite {
-namespace cblas_ops {
-
-inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
-                 const float* input_data, const RuntimeShape& filter_shape,
-                 const float* filter_data, const RuntimeShape& bias_shape,
-                 const float* bias_data, const RuntimeShape& output_shape,
-                 float* output_data, const RuntimeShape& im2col_shape,
-                 float* im2col_data) {
-  const int stride_width = params.stride_width;
-  const int stride_height = params.stride_height;
-  const int pad_width = params.padding_values.width;
-  const int pad_height = params.padding_values.height;
-  const int dilation_width_factor = params.dilation_width_factor;
-  const int dilation_height_factor = params.dilation_height_factor;
-  const float output_activation_min = params.float_activation_min;
-  const float output_activation_max = params.float_activation_max;
-  TFLITE_DCHECK_EQ(input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(filter_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_EQ(output_shape.DimensionsCount(), 4);
-  gemmlowp::ScopedProfilingLabel label("Conv/cblas");
-
-  const float* gemm_input_data = nullptr;
-  const RuntimeShape* gemm_input_shape = nullptr;
-  const int filter_width = filter_shape.Dims(2);
-  const int filter_height = filter_shape.Dims(1);
-  const bool need_im2col = stride_width != 1 || stride_height != 1 ||
-                           filter_width != 1 || filter_height != 1;
-  if (need_im2col) {
-    TFLITE_DCHECK(im2col_data);
-    ConvParams op_params;
-    op_params.padding_type = PaddingType::kSame;
-    op_params.padding_values.width = pad_width;
-    op_params.padding_values.height = pad_height;
-    op_params.stride_width = stride_width;
-    op_params.stride_height = stride_height;
-    op_params.dilation_width_factor = dilation_width_factor;
-    op_params.dilation_height_factor = dilation_height_factor;
-    optimized_ops::Im2col(op_params, filter_height, filter_width, 0,
-                          input_shape, input_data, im2col_shape, im2col_data);
-
-    gemm_input_data = im2col_data;
-    gemm_input_shape = &im2col_shape;
-  } else {
-    TFLITE_DCHECK(!im2col_data);
-    gemm_input_data = input_data;
-    gemm_input_shape = &input_shape;
-  }
-
-  // The following code computes matrix multiplication c = a * transponse(b)
-  // with CBLAS, where:
-  // * `a` is a matrix with dimensions (m, k).
-  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
-  // * `c` is a matrix with dimensions (m, n).
-  // The naming of variables are aligned with CBLAS specification here.
-  const float* a = gemm_input_data;
-  const float* b = filter_data;
-  float* c = output_data;
-  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
-  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
-  int n = output_shape.Dims(3);
-  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
-  // The stride of matrix a, b and c respectively.
-  int stride_a = k;
-  int stride_b = k;
-  int stride_c = n;
-
-  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
-              stride_a, b, stride_b, 0.0f, c, stride_c);
-
-  optimized_ops::AddBiasAndEvalActivationFunction(
-      output_activation_min, output_activation_max, bias_shape, bias_data,
-      output_shape, output_data);
-}
-
-}  // namespace cblas_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_CONV_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/cblas_reference.h b/tensorflow/lite/kernels/internal/optimized/cblas_reference.h
deleted file mode 100644
index fa07578612aaa56c77c39c9c6280c81e8657d86f..0000000000000000000000000000000000000000
--- a/tensorflow/lite/kernels/internal/optimized/cblas_reference.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License.
-==============================================================================*/
-
-#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
-#define TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
-
-#include "tensorflow/lite/kernels/internal/compatibility.h"
-
-// The reference implementation for a small subset of CBLAS interface.
-// This is only used for testing CBLAS implementation, and should never be used
-// in production code.
-
-namespace tflite {
-namespace cblas_ops {
-
-// The following code follows the original CBLAS specification, and it might
-// conflict with the TensorFlow naming convention.
-// TODO(ycling): Find another way to test CBLAS with bazel, without writing
-// a reference implementation by ourselves.
-enum CBLAS_ORDER { CblasRowMajor = 0, CblasColMajor = 1 };
-
-enum CBLAS_TRANSPOSE { CblasNoTrans = 0, CblasTrans = 1, CblasConjTrans = 2 };
-
-// A reference implementation for matrix multiplication.
-// The following code computes, c = a * transponse(b) matrix multiplication
-// with CBLAS, where:
-// * `a` is a matrix with dimensions (m, k).
-// * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
-// * `c` is a matrix with dimensions (m, n).
-// The naming of variables is aligned with CBLAS specification here.
-void cblas_sgemm(const enum CBLAS_ORDER order,
-                 const enum CBLAS_TRANSPOSE trans_a,
-                 const enum CBLAS_TRANSPOSE trans_b, const int m, const int n,
-                 const int k, const float alpha, const float *a,
-                 const int stride_a, const float *b, const int stride_b,
-                 const float beta, float *c, const int stride_c) {
-  TFLITE_DCHECK(order == CblasRowMajor);
-  TFLITE_DCHECK(trans_a == CblasNoTrans);
-  TFLITE_DCHECK(trans_b == CblasTrans);
-  TFLITE_DCHECK(beta == 0.0f);
-  for (int row = 0; row < m; ++row) {
-    for (int col = 0; col < n; ++col) {
-      // If `beta` non-zero, multiple it with the original values in output.
-      // Otherwise, ignore the original value in output completely.
-      float value = 0.0f;
-      for (int idx = 0; idx < k; ++idx) {
-        value += alpha * a[stride_a * row + idx] * b[stride_b * col + idx];
-      }
-      c[stride_c * row + col] = value;
-    }
-  }
-}
-
-}  // namespace cblas_ops
-}  // namespace tflite
-
-#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_OPTIMIZED_CBLAS_REFERENCE_H_
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
index 25b66d4b5537f58e9a8795e05128e4cb4b3d2890..c77715de57990666b362b08dae7c21b9707d942c 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_float.h
@@ -793,22 +793,26 @@ void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
     int out_x_loop_end_unclampled = 0;
     if (kAllowStrided) {
       if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 1) / 2;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
+            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
       } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 3) / 4;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
+            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
       } else {
         out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
+            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled = (pad_width + input_width -
+                                     dilation_factor * filter_x + stride - 1) /
+                                    stride;
       }
     } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
+      out_x_loop_end_unclampled =
+          pad_width + input_width - dilation_factor * filter_x;
     }
     // The kernel will have to iterate on the segment of the
     // output row that starts at out_x_loop_start and out_x_loop_end.
@@ -819,7 +823,8 @@ void FloatDepthwiseConvAccumRow(int stride, int dilation_factor,
 
     float* acc_buffer_ptr =
         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
     const float* input_ptr = input_data + in_x_origin * input_depth;
     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
     FloatDepthwiseConvKernel<kAllowStrided, kFixedInputDepth,
@@ -936,8 +941,7 @@ inline void DepthwiseConv(
                                         FIXED_DEPTH_MULTIPLIER)           \
   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
-      depth_multiplier == FIXED_DEPTH_MULTIPLIER &&                       \
-      dilation_height_factor == 1 && dilation_width_factor == 1) {        \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
     row_accum_func =                                                      \
         FloatDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,      \
                                    FIXED_DEPTH_MULTIPLIER>;               \
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
index 5317cea88439231a6a4dc24206987f8db6fa7b77..d3dca799a7cca4a3048cd2d19477ba2b57fbcdac 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8.h
@@ -1499,22 +1499,26 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
     int out_x_loop_end_unclampled = 0;
     if (kAllowStrided) {
       if (stride == 2) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 1) / 2;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 1) / 2;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 1) / 2;
+            (pad_width + input_width - dilation_factor * filter_x + 1) / 2;
       } else if (stride == 4) {
-        out_x_loop_start_unclampled = (pad_width - filter_x + 3) / 4;
+        out_x_loop_start_unclampled =
+            (pad_width - dilation_factor * filter_x + 3) / 4;
         out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + 3) / 4;
+            (pad_width + input_width - dilation_factor * filter_x + 3) / 4;
       } else {
         out_x_loop_start_unclampled =
-            (pad_width - filter_x + stride - 1) / stride;
-        out_x_loop_end_unclampled =
-            (pad_width + input_width - filter_x + stride - 1) / stride;
+            (pad_width - dilation_factor * filter_x + stride - 1) / stride;
+        out_x_loop_end_unclampled = (pad_width + input_width -
+                                     dilation_factor * filter_x + stride - 1) /
+                                    stride;
       }
     } else {
-      out_x_loop_start_unclampled = pad_width - filter_x;
-      out_x_loop_end_unclampled = pad_width + input_width - filter_x;
+      out_x_loop_start_unclampled = pad_width - dilation_factor * filter_x;
+      out_x_loop_end_unclampled =
+          pad_width + input_width - dilation_factor * filter_x;
     }
     // The kernel will have to iterate on the segment of the
     // output row that starts at out_x_loop_start and out_x_loop_end.
@@ -1525,7 +1529,8 @@ void QuantizedDepthwiseConvAccumRow(int stride, int dilation_factor,
 
     int32* acc_buffer_ptr =
         acc_buffer + (out_x_loop_start - out_x_buffer_start) * output_depth;
-    const int in_x_origin = (out_x_loop_start * stride) - pad_width + filter_x;
+    const int in_x_origin =
+        (out_x_loop_start * stride) - pad_width + dilation_factor * filter_x;
     const uint8* input_ptr = input_data + in_x_origin * input_depth;
     const int num_output_pixels = out_x_loop_end - out_x_loop_start;
     QuantizedDepthwiseConvKernel<
@@ -1703,8 +1708,7 @@ inline void DepthwiseConvGeneral(
                                         FIXED_DEPTH_MULTIPLIER)           \
   if (!row_accum_func && (stride_width == 1 || ALLOW_STRIDED) &&          \
       (input_depth == FIXED_INPUT_DEPTH || FIXED_INPUT_DEPTH == 0) &&     \
-      depth_multiplier == FIXED_DEPTH_MULTIPLIER &&                       \
-      dilation_width_factor == 1 && dilation_height_factor == 1) {        \
+      depth_multiplier == FIXED_DEPTH_MULTIPLIER) {                       \
     row_accum_func =                                                      \
         QuantizedDepthwiseConvAccumRow<ALLOW_STRIDED, FIXED_INPUT_DEPTH,  \
                                        FIXED_DEPTH_MULTIPLIER>;           \
diff --git a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
index 3f2ed0b1f0eb3c009048550f879ae3726519c518..5859bcaed4ac2b991ca22e7d9c17d34d3267a120 100644
--- a/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
+++ b/tensorflow/lite/kernels/internal/optimized/depthwiseconv_uint8_3x3_filter.h
@@ -23,11 +23,6 @@ limitations under the License.
 namespace tflite {
 namespace optimized_ops {
 
-// clang-format gets confused with this file and ends up formatting lines to
-// be larger than 80 characters. Turn off here and back on at the end of the
-// file.
-// clang-format off
-
 // See CategorizeDotProductKernel for definitive taxonomy.
 enum class DotProduct3x3KernelType {
   kNone = 0,  // Parameter combination is not supported for dot product kernels.
@@ -120,42 +115,58 @@ struct DepthwiseConvParams {
 #define OFFSET_OUTPUT_WIDTH 84
 #define OFFSET_OUTPUT_HEIGHT 88
 
-static_assert(offsetof(DepthwiseConvParams, input_depth) ==
-                  OFFSET_INPUT_DEPTH, "");
+static_assert(offsetof(DepthwiseConvParams, input_depth) == OFFSET_INPUT_DEPTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_row_size) ==
-                  OFFSET_INPUT_ROW_SIZE, "");
+                  OFFSET_INPUT_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_depth) ==
-                  OFFSET_OUTPUT_DEPTH, "");
+                  OFFSET_OUTPUT_DEPTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_row_size) ==
-                  OFFSET_OUTPUT_ROW_SIZE, "");
+                  OFFSET_OUTPUT_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, filter_row_size) ==
-                  OFFSET_FILTER_ROW_SIZE, "");
+                  OFFSET_FILTER_ROW_SIZE,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_offset) ==
-                  OFFSET_INPUT_OFFSET, "");
+                  OFFSET_INPUT_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_offset) ==
-                  OFFSET_OUTPUT_OFFSET, "");
+                  OFFSET_OUTPUT_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, filter_offset) ==
-                  OFFSET_FILTER_OFFSET, "");
+                  OFFSET_FILTER_OFFSET,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_multiplier) ==
-                  OFFSET_OUTPUT_MULTIPLIER, "");
+                  OFFSET_OUTPUT_MULTIPLIER,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_activation_min) ==
-                  OFFSET_OUTPUT_ACTIVATION_MIN, "");
+                  OFFSET_OUTPUT_ACTIVATION_MIN,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_activation_max) ==
-                  OFFSET_OUTPUT_ACTIVATION_MAX, "");
+                  OFFSET_OUTPUT_ACTIVATION_MAX,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_right_shift) ==
-                  OFFSET_OUTPUT_RIGHT_SHIFT, "");
-static_assert(offsetof(DepthwiseConvParams, input_width) ==
-                  OFFSET_INPUT_WIDTH, "");
+                  OFFSET_OUTPUT_RIGHT_SHIFT,
+              "");
+static_assert(offsetof(DepthwiseConvParams, input_width) == OFFSET_INPUT_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, input_height) ==
-                  OFFSET_INPUT_HEIGHT, "");
+                  OFFSET_INPUT_HEIGHT,
+              "");
 static_assert(offsetof(DepthwiseConvParams, stride_width) ==
-                  OFFSET_STRIDE_WIDTH, "");
+                  OFFSET_STRIDE_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, stride_height) ==
-                  OFFSET_STRIDE_HEIGHT, "");
+                  OFFSET_STRIDE_HEIGHT,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_width) ==
-                  OFFSET_OUTPUT_WIDTH, "");
+                  OFFSET_OUTPUT_WIDTH,
+              "");
 static_assert(offsetof(DepthwiseConvParams, output_height) ==
-                  OFFSET_OUTPUT_HEIGHT, "");
+                  OFFSET_OUTPUT_HEIGHT,
+              "");
 
 template <int32 kDepth, int32 kStrideWidth, int32 kStrideHeight>
 struct DepthwiseConvWindow {};
@@ -164,10 +175,10 @@ template <>
 struct DepthwiseConvWindow<8, 1, 1> {
  public:
   static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
-                  const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
-                  int64_t input_row_size, int32 output_window_height,
-                  int32 output_window_width,
-                  const DepthwiseConvParams* params_ptr) {
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
     const int64_t input_width_increment = 2 * input_depth;
     const int64_t input_height_increment = 2 * input_row_size;
     const int64_t output_height_increment = 2 * params_ptr->output_row_size;
@@ -1147,10 +1158,10 @@ struct DepthwiseConvWindow<8, 1, 1> {
 template <>
 struct DepthwiseConvWindow<8, 2, 2> {
   static inline void Run(const uint8* input_ptr, const uint8* filter_ptr,
-                  const int32* bias_ptr, uint8* output_ptr, int64_t input_depth,
-                  int64_t input_row_size, int32 output_window_height,
-                  int32 output_window_width,
-                  const DepthwiseConvParams* params_ptr) {
+                         const int32* bias_ptr, uint8* output_ptr,
+                         int64_t input_depth, int64_t input_row_size,
+                         int32 output_window_height, int32 output_window_width,
+                         const DepthwiseConvParams* params_ptr) {
     const int64_t input_width_increment = 4 * input_depth;
     const int64_t input_height_increment = 4 * input_row_size;
     const int64_t output_height_increment = 2 * params_ptr->output_row_size;
@@ -2990,11 +3001,10 @@ struct ShuffleParams {
   ShuffleParams() = default;
   ShuffleParams(int32 output_width, int32 output_height, int32 stride_width,
                 int32 stride_height)
-  : output_width(output_width)
-  , output_height(output_height)
-  , input_width(get_shuffle_input_size(stride_width, output_width))
-  , input_height(get_shuffle_input_size(stride_height, output_height)) {
-  }
+      : output_width(output_width),
+        output_height(output_height),
+        input_width(get_shuffle_input_size(stride_width, output_width)),
+        input_height(get_shuffle_input_size(stride_height, output_height)) {}
 };
 
 template <int32 kStrideWidth, int32 kStrideHeight>
@@ -3003,10 +3013,10 @@ struct DepthwiseConvThroughDepth {
   // |start_depth| to |end_depth|. Keep this not inlined to maintain a small
   // binary size. We use a DepthwiseConvParams struct for read only params
   // to minimize call overhead.
-  static __attribute__((noinline)) void Run(const uint8* input_ptr,
-      const uint8* filter_ptr, const int32* bias_ptr, uint8* output_ptr,
-      int64_t start_depth, int64_t end_depth, int64_t input_depth,
-      int64_t input_row_size, int32 output_window_height,
+  static __attribute__((noinline)) void Run(
+      const uint8* input_ptr, const uint8* filter_ptr, const int32* bias_ptr,
+      uint8* output_ptr, int64_t start_depth, int64_t end_depth,
+      int64_t input_depth, int64_t input_row_size, int32 output_window_height,
       int32 output_window_width, const DepthwiseConvParams& params) {
     for (; start_depth <= end_depth - 8; start_depth += 8) {
       DepthwiseConvWindow<8, kStrideWidth, kStrideHeight>::Run(
@@ -3029,12 +3039,15 @@ struct DepthwiseConvMultiRow {
                          uint8* output_data, const DepthwiseConvParams& params,
                          const ShuffleParams& shuffle_params,
                          uint8* shuffle_workspace) {
-    TFLITE_DCHECK(shuffle_params.input_height ==
+    TFLITE_DCHECK(
+        shuffle_params.input_height ==
         get_shuffle_input_size(kStrideHeight, shuffle_params.output_height));
-    TFLITE_DCHECK(shuffle_params.input_width ==
+    TFLITE_DCHECK(
+        shuffle_params.input_width ==
         get_shuffle_input_size(kStrideWidth, shuffle_params.output_width));
-    TFLITE_DCHECK(64 * shuffle_params.input_width * shuffle_params.input_height
-                  <= DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
+    TFLITE_DCHECK(64 * shuffle_params.input_width *
+                      shuffle_params.input_height <=
+                  DEPTHWISECONV_SHUFFLE_WORKSPACE_SIZE);
 
     int32 out_x = start_x;
 
@@ -3045,7 +3058,7 @@ struct DepthwiseConvMultiRow {
     if (params.output_depth > 64 ||
         (params.output_depth <= 64 && params.input_width > 150)) {
       for (; out_x <= (end_x - shuffle_params.output_width);
-             out_x += shuffle_params.output_width) {
+           out_x += shuffle_params.output_width) {
         const uint8* input_ptr = input_data;
         const int32* bias_ptr = bias_data;
         const uint8* filter_ptr = filter_data;
@@ -3091,8 +3104,8 @@ struct DepthwiseConvMultiRow {
         }
 
         // Handle leftover depth.
-        ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr,
-                        depth, params.output_depth, params.input_depth,
+        ConvKernel::Run(input_ptr, filter_ptr, bias_ptr, output_ptr, depth,
+                        params.output_depth, params.input_depth,
                         params.input_row_size, shuffle_params.output_height,
                         shuffle_params.output_width, params);
 
@@ -3119,13 +3132,15 @@ struct DepthwiseConvMultiRow {
 //   * Horizontal edges.
 //   * Vertical edges.
 inline void DepthwiseConvHandlePadding(const uint8* input_data,
-    const uint8* filter_data, const int32* bias_data, uint8* output_data,
-    const DepthwiseConvParams& params) {
+                                       const uint8* filter_data,
+                                       const int32* bias_data,
+                                       uint8* output_data,
+                                       const DepthwiseConvParams& params) {
   if (params.input_width == 1 && params.input_height == 1) {
-    const uint8* filter_ptr = filter_data + params.filter_row_size
-        + params.output_depth;
-    DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(input_data, filter_ptr,
-        bias_data, output_data, &params);
+    const uint8* filter_ptr =
+        filter_data + params.filter_row_size + params.output_depth;
+    DepthwiseConvPartial<EdgeType::kCenter, 1, 1>::Run(
+        input_data, filter_ptr, bias_data, output_data, &params);
     return;
   }
 
@@ -3136,27 +3151,27 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
 
   // Handle top row.
   const uint8* input_ptr = input_data;
-  const uint8* filter_ptr = filter_data + params.filter_row_size
-      + params.output_depth;
+  const uint8* filter_ptr =
+      filter_data + params.filter_row_size + params.output_depth;
   uint8* output_ptr = output_data;
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   input_ptr += (params.stride_width - 1) * params.input_depth;
   filter_ptr = filter_data + params.filter_row_size;
   output_ptr += params.output_depth;
 
   for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
-           out_x++) {
+       out_x++) {
     DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_depth;
     output_ptr += params.output_depth;
   }
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   // Handle left side.
   input_ptr = input_data + (params.stride_width - 1) * params.input_row_size;
@@ -3164,7 +3179,7 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   output_ptr = output_data + params.output_row_size;
 
   for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
-           out_y++) {
+       out_y++) {
     DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_row_size;
@@ -3172,14 +3187,14 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   }
 
   // Handle right side.
-  input_ptr = input_data + (params.input_width - 2) * params.input_depth
-      + (params.stride_width - 1) * params.input_row_size;
+  input_ptr = input_data + (params.input_width - 2) * params.input_depth +
+              (params.stride_width - 1) * params.input_row_size;
   filter_ptr = filter_data;
   output_ptr = output_data + params.output_row_size +
-      (params.output_width - 1) * params.output_depth;
+               (params.output_width - 1) * params.output_depth;
 
   for (int32 out_y = out_y_start_corner + 1; out_y < out_y_end_corner;
-         out_y++) {
+       out_y++) {
     DepthwiseConvPartial<EdgeType::kVertical, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_row_size;
@@ -3189,26 +3204,26 @@ inline void DepthwiseConvHandlePadding(const uint8* input_data,
   // Handle bottom row.
   input_ptr = input_data + (params.input_height - 2) * params.input_row_size;
   filter_ptr = filter_data + params.output_depth;
-  output_ptr = output_data +
-      (params.output_height - 1) * params.output_row_size;
+  output_ptr =
+      output_data + (params.output_height - 1) * params.output_row_size;
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 
   input_ptr += (params.stride_width == 1) ? 0 : params.input_depth;
   filter_ptr = filter_data;
   output_ptr += params.output_depth;
 
   for (int32 out_x = out_x_start_corner + 1; out_x < out_x_end_corner;
-           out_x++) {
+       out_x++) {
     DepthwiseConvPartial<EdgeType::kHorizontal, 1, 1>::Run(
         input_ptr, filter_ptr, bias_data, output_ptr, &params);
     input_ptr += params.stride_width * params.input_depth;
     output_ptr += params.output_depth;
   }
 
-  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(input_ptr, filter_ptr,
-      bias_data, output_ptr, &params);
+  DepthwiseConvPartial<EdgeType::kCorner, 1, 1>::Run(
+      input_ptr, filter_ptr, bias_data, output_ptr, &params);
 }
 
 inline bool Fast3x3FilterKernelSupported(
@@ -3383,8 +3398,8 @@ inline void DepthwiseConv3x3Filter(
       const int in_x = (out_x * stride_width) - pad_width;
       const int in_y = (out_y * stride_height) - pad_height;
       input_ptr += in_y * params.input_row_size + in_x * params.input_depth;
-      output_ptr += out_y * params.output_row_size
-          + out_x * params.output_depth;
+      output_ptr +=
+          out_y * params.output_row_size + out_x * params.output_depth;
     }
 
     // Shuffling shapes that maximize width over the shuffle workspace size
@@ -3439,7 +3454,6 @@ inline void DepthwiseConv3x3Filter(
     }
   }
 }
-// clang-format on
 
 #endif  // __aarch64__
 
diff --git a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
index 6f2cd4faab2f267aa9daa0d7bf7c42b77f0051b4..c79b69a22e4dcdac5c32d03c0edd9f3cfb09a0ae 100644
--- a/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
+++ b/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
@@ -25,6 +25,10 @@ limitations under the License.
 #include <tuple>
 #include <type_traits>
 
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+#include <Accelerate/Accelerate.h>
+#endif
+
 #include "third_party/eigen3/Eigen/Core"
 #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
 #include "fixedpoint/fixedpoint.h"
@@ -60,11 +64,13 @@ using reference_ops::DepthConcatenation;
 using reference_ops::Dequantize;
 using reference_ops::Div;
 using reference_ops::FakeQuant;
+using reference_ops::Fill;
 using reference_ops::Gather;
 using reference_ops::Greater;
 using reference_ops::GreaterEqual;
 using reference_ops::GreaterEqualWithScaling;
 using reference_ops::GreaterWithScaling;
+using reference_ops::LeakyRelu;
 using reference_ops::Less;
 using reference_ops::LessEqual;
 using reference_ops::LessEqualWithScaling;
@@ -1867,18 +1873,45 @@ inline void Conv(const ConvParams& params, const RuntimeShape& input_shape,
     gemm_input_shape = &input_shape;
   }
 
-  const auto im2col_matrix_map =
-      MapAsMatrixWithLastDimAsRows(gemm_input_data, *gemm_input_shape);
-  const auto filter_matrix_map =
-      MapAsMatrixWithFirstDimAsCols(filter_data, filter_shape);
-  auto output_matrix_map =
-      MapAsMatrixWithLastDimAsRows(output_data, output_shape);
-
-  Gemm(filter_matrix_map.transpose(), im2col_matrix_map, &output_matrix_map);
-
-  AddBiasAndEvalActivationFunction(output_activation_min, output_activation_max,
-                                   bias_shape, bias_data, output_shape,
-                                   output_data);
+  // The following code computes matrix multiplication c = a * transponse(b)
+  // with CBLAS, where:
+  // * `a` is a matrix with dimensions (m, k).
+  // * `b` is a matrix with dimensions (n, k), so transpose(b) is (k, n).
+  // * `c` is a matrix with dimensions (m, n).
+  // The naming of variables are aligned with CBLAS specification here.
+  const float* a = gemm_input_data;
+  const float* b = filter_data;
+  float* c = output_data;
+  const int gemm_input_dims = gemm_input_shape->DimensionsCount();
+  int m = FlatSizeSkipDim(*gemm_input_shape, gemm_input_dims - 1);
+  int n = output_shape.Dims(3);
+  int k = gemm_input_shape->Dims(gemm_input_dims - 1);
+
+#if defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+  // The stride of matrix a, b and c respectively.
+  int stride_a = k;
+  int stride_b = k;
+  int stride_c = n;
+
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, n, k, 1.0f, a,
+              stride_a, b, stride_b, 0.0f, c, stride_c);
+#else
+  // When an optimized CBLAS implementation is not available, fall back
+  // to using Eigen.
+  typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>
+      Matrix;
+  typedef Eigen::Map<Matrix> MatrixRef;
+  typedef Eigen::Map<const Matrix> ConstMatrixRef;
+
+  MatrixRef matrix_c(c, m, n);
+  ConstMatrixRef matrix_a(a, m, k);
+  ConstMatrixRef matrix_b(b, n, k);
+  matrix_c.noalias() = matrix_a * matrix_b.transpose();
+#endif  //  defined(TF_LITE_USE_CBLAS) && defined(__APPLE__)
+
+  optimized_ops::AddBiasAndEvalActivationFunction(
+      output_activation_min, output_activation_max, bias_shape, bias_data,
+      output_shape, output_data);
 }
 
 inline void HybridConv(const ConvParams& params, float* scaling_factors_ptr,
@@ -3549,8 +3582,8 @@ inline void AveragePool(const PoolParams& params,
             std::min(params.filter_height, input_height - in_y_origin);
         const int filter_count =
             (filter_x_end - filter_x_start) * (filter_y_end - filter_y_start);
-        // 1280 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint16 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
@@ -3715,8 +3748,8 @@ inline void MaxPool(const PoolParams& params, const RuntimeShape& input_shape,
         const int filter_y_start = std::max(0, -in_y_origin);
         const int filter_y_end =
             std::min(params.filter_height, input_height - in_y_origin);
-        // 2048 required by Inception v3
-        static constexpr int kAccBufferMaxSize = 2048;
+        // 2560 is required by MobileNetV2 with depth multiplier 2.
+        static constexpr int kAccBufferMaxSize = 4096;
         TFLITE_DCHECK_LE(depth, kAccBufferMaxSize);
         uint8 acc[kAccBufferMaxSize];
         memset(acc, 0, depth * sizeof(acc[0]));
@@ -4292,7 +4325,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/README.md b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..4b1d3c91d50a4c77865ec25fa9961f745a489aea
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/README.md
@@ -0,0 +1,8 @@
+This directory contains reference implementations for int8 fully integer kernels.
+
+Weight filters of convs are expected to be symmetric per-channel quantized in
+the range [-127, 127].
+Inputs/activations are expected to be asymmetric per-layer quantized in the
+range [-128, 127].
+
+THESE ARE EXPERIMENTAL AND PRONE TO CHANGE.
diff --git a/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
new file mode 100644
index 0000000000000000000000000000000000000000..03dcb6c220d3fcbbd219df3a1a1ea5f3b2b29c81
--- /dev/null
+++ b/tensorflow/lite/kernels/internal/reference/integer_ops/dequantize.h
@@ -0,0 +1,42 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+#define TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
+
+#include "tensorflow/lite/kernels/internal/common.h"
+#include "tensorflow/lite/kernels/internal/types.h"
+
+namespace tflite {
+namespace reference_integer_ops {
+
+inline void Dequantize(const tflite::DequantizationParams& op_params,
+                       const RuntimeShape& input_shape, const int8* input_data,
+                       const RuntimeShape& output_shape, float* output_data) {
+  const int32 zero_point = op_params.zero_point;
+  const double scale = op_params.scale;
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+
+  for (int i = 0; i < flat_size; i++) {
+    const int32 val = input_data[i];
+    const float result = static_cast<float>(scale * (val - zero_point));
+    output_data[i] = result;
+  }
+}
+
+}  // namespace reference_integer_ops
+}  // namespace tflite
+
+#endif  // TENSORFLOW_LITE_KERNELS_INTERNAL_REFERENCE_INTEGER_OPS_DEQUANTIZE_H_
diff --git a/tensorflow/lite/kernels/internal/reference/reference_ops.h b/tensorflow/lite/kernels/internal/reference/reference_ops.h
index 1bd9129488a958ab5ed532779a77433d12184656..ea3ab06da1f775b5ea0771bbb3f32c91c9caacd0 100644
--- a/tensorflow/lite/kernels/internal/reference/reference_ops.h
+++ b/tensorflow/lite/kernels/internal/reference/reference_ops.h
@@ -558,6 +558,19 @@ inline void ReluX(const tflite::ActivationParams& params,
   }
 }
 
+inline void LeakyRelu(const tflite::LeakyReluParams& params,
+                      const RuntimeShape& input_shape, const float* input_data,
+                      const RuntimeShape& output_shape, float* output_data) {
+  gemmlowp::ScopedProfilingLabel label("LeakyRelu (not fused)");
+  const int flat_size = MatchingFlatSize(input_shape, output_shape);
+  for (int i = 0; i < flat_size; ++i) {
+    const float val = input_data[i];
+    // Note that this implementation matches that of TensorFlow, and corresponds
+    // to the traditional LeakyRelu equation only for alpha <= 1.
+    output_data[i] = std::max(val, val * params.alpha);
+  }
+}
+
 inline void L2Normalization(const tflite::L2NormalizationParams& op_params,
                             const RuntimeShape& input_shape,
                             const float* input_data,
@@ -2723,7 +2736,6 @@ inline void LogSoftmax(const SoftmaxParams& params,
   using FixedPointScaledDiff =
       gemmlowp::FixedPoint<int32, kScaledDiffIntegerBits>;
   using FixedPointAccum = gemmlowp::FixedPoint<int32, kAccumulationIntegerBits>;
-  using FixedPoint0 = gemmlowp::FixedPoint<int32, 0>;
 
   const int trailing_dim = input_shape.DimensionsCount() - 1;
   const int outer_size =
@@ -3651,8 +3663,10 @@ inline void Mean(const tflite::MeanParams& op_params,
                  const RuntimeShape& unextended_output_shape, T* output_data) {
   gemmlowp::ScopedProfilingLabel label("Mean");
 
-  TFLITE_DCHECK_LE(unextended_input_shape.DimensionsCount(), 4);
-  TFLITE_DCHECK_LE(unextended_output_shape.DimensionsCount(), 4);
+  // Current implementation only supports dimension equals 4 and simultaneous
+  // reduction over width and height.
+  TFLITE_CHECK_EQ(unextended_input_shape.DimensionsCount(), 4);
+  TFLITE_CHECK_LE(unextended_output_shape.DimensionsCount(), 4);
   const RuntimeShape input_shape =
       RuntimeShape::ExtendedShape(4, unextended_input_shape);
   const RuntimeShape output_shape =
@@ -3666,8 +3680,6 @@ inline void Mean(const tflite::MeanParams& op_params,
   const int input_height = input_shape.Dims(1);
   const int input_width = input_shape.Dims(2);
 
-  // The current implementation only supports simultaneous reduction over
-  // width and height.
   TFLITE_DCHECK_EQ(op_params.axis_count, 2);
   TFLITE_DCHECK((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
                 (op_params.axis[0] == 2 && op_params.axis[1] == 1));
@@ -4554,6 +4566,63 @@ inline void ResizeNearestNeighbor(
   }
 }
 
+inline void BroadcastPrelu4DSlow(const PreluParams& params,
+                                 const RuntimeShape& input_shape,
+                                 const uint8* input_data,
+                                 const RuntimeShape& alpha_shape,
+                                 const uint8* alpha_data,
+                                 const RuntimeShape& output_shape,
+                                 uint8* output_data) {
+  TFLITE_DCHECK_LE(input_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(alpha_shape.DimensionsCount(), 4);
+  TFLITE_DCHECK_LE(output_shape.DimensionsCount(), 4);
+  const RuntimeShape extended_output_shape =
+      RuntimeShape::ExtendedShape(4, output_shape);
+  NdArrayDesc<4> desc1;
+  NdArrayDesc<4> desc2;
+  NdArrayDescsForElementwiseBroadcast(input_shape, alpha_shape, &desc1, &desc2);
+
+  for (int b = 0; b < extended_output_shape.Dims(0); ++b) {
+    for (int y = 0; y < extended_output_shape.Dims(1); ++y) {
+      for (int x = 0; x < extended_output_shape.Dims(2); ++x) {
+        for (int c = 0; c < extended_output_shape.Dims(3); ++c) {
+          int output_index = Offset(extended_output_shape, b, y, x, c);
+          int input_index = SubscriptToIndex(desc1, b, y, x, c);
+          const int32 input_value =
+              params.input_offset + input_data[input_index];
+          if (input_value >= 0) {
+            output_data[output_index] = input_data[input_index];
+          } else {
+            auto alpha_index = SubscriptToIndex(desc2, b, y, x, c);
+            const int32 alpha_value =
+                params.alpha_offset + alpha_data[alpha_index];
+            const int32 unclamped_output =
+                params.output_offset +
+                MultiplyByQuantizedMultiplierSmallerThanOneExp(
+                    input_value * alpha_value, params.output_multiplier,
+                    params.output_shift);
+            const int32 quantized_min = std::numeric_limits<uint8_t>::min();
+            const int32 quantized_max = std::numeric_limits<uint8_t>::max();
+            const int32 clamped_output = std::min(
+                quantized_max, std::max(quantized_min, unclamped_output));
+            output_data[output_index] = static_cast<uint8>(clamped_output);
+          }
+        }
+      }
+    }
+  }
+}
+
+template <typename T>
+void Fill(const RuntimeShape& value_shape, const T* value_data,
+          const RuntimeShape& output_shape, T* output_data) {
+  TFLITE_DCHECK_EQ(value_shape.DimensionsCount(), 0);
+  const int flat_size = output_shape.FlatSize();
+  for (int i = 0; i < flat_size; ++i) {
+    output_data[i] = *value_data;
+  }
+}
+
 }  // namespace reference_ops
 }  // namespace tflite
 
diff --git a/tensorflow/lite/kernels/internal/tensor_ctypes.h b/tensorflow/lite/kernels/internal/tensor_ctypes.h
index d24dca9bfbbee78498f797713dac8b67a232923a..4a94b703f8b299e503305aaa897a2ebc65e50d3b 100644
--- a/tensorflow/lite/kernels/internal/tensor_ctypes.h
+++ b/tensorflow/lite/kernels/internal/tensor_ctypes.h
@@ -53,6 +53,11 @@ inline bool* GetTensorData(TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.b : nullptr;
 }
 
+template <>
+inline int8_t* GetTensorData(TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <typename T>
 inline const T* GetTensorData(const TfLiteTensor* tensor);
 
@@ -66,6 +71,11 @@ inline const uint8_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.uint8 : nullptr;
 }
 
+template <>
+inline const int8_t* GetTensorData(const TfLiteTensor* tensor) {
+  return tensor != nullptr ? tensor->data.int8 : nullptr;
+}
+
 template <>
 inline const int16_t* GetTensorData(const TfLiteTensor* tensor) {
   return tensor != nullptr ? tensor->data.i16 : nullptr;
diff --git a/tensorflow/lite/kernels/internal/types.h b/tensorflow/lite/kernels/internal/types.h
index a05bd5e003386297a0427b9ce56afb9cf8980ae5..859ec8c68252538e3cf6d06ce7864f62d2a236dc 100644
--- a/tensorflow/lite/kernels/internal/types.h
+++ b/tensorflow/lite/kernels/internal/types.h
@@ -904,6 +904,14 @@ struct PadParams {
   ResizingCategory resizing_category;
 };
 
+struct PreluParams {
+  int32 input_offset;
+  int32 alpha_offset;
+  int32 output_offset;
+  int32 output_multiplier;
+  int output_shift;
+};
+
 struct PoolParams {
   FusedActivationFunctionType activation;
   PaddingType padding_type;
@@ -1006,6 +1014,10 @@ struct UnpackParams {
   int16 axis;
 };
 
+struct LeakyReluParams {
+  float alpha;
+};
+
 template <typename P>
 inline void SetActivationParams(float min, float max, P* params) {
   params->float_activation_min = min;
diff --git a/tensorflow/lite/kernels/layer_norm_lstm.cc b/tensorflow/lite/kernels/layer_norm_lstm.cc
index 5b0046a7b31c9c2e805c6de48572776cf8d3883c..49e8a53c829a0c4a8ae355f8e7a6b97e3bbb81e1 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm.cc
@@ -55,7 +55,7 @@ constexpr int kCellToForgetWeightsTensor = 10;  // Optional
 constexpr int kCellToOutputWeightsTensor = 11;  // Optional
 
 // Layer norm weights tensors of size {n_cell}, representing a diagonal matrix.
-constexpr int kInputLayerNormWeightsTensor = 12;
+constexpr int kInputLayerNormWeightsTensor = 12;  // Optional
 constexpr int kForgetLayerNormWeightsTensor = 13;
 constexpr int kCellLayerNormWeightsTensor = 14;
 constexpr int kOutputLayerNormWeightsTensor = 15;
@@ -118,7 +118,8 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* input_to_input_weights =
       GetOptionalInputTensor(context, node, kInputToInputWeightsTensor);
-  if (input_to_input_weights != nullptr) {
+  const bool use_cifg = (input_to_input_weights == nullptr);
+  if (!use_cifg) {
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[0], n_cell);
     TF_LITE_ENSURE_EQ(context, input_to_input_weights->dims->data[1], n_input);
@@ -138,7 +139,9 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   const TfLiteTensor* recurrent_to_input_weights =
       GetOptionalInputTensor(context, node, kRecurrentToInputWeightsTensor);
-  if (recurrent_to_input_weights != nullptr) {
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights, nullptr);
+  } else {
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->size, 2);
     TF_LITE_ENSURE_EQ(context, recurrent_to_input_weights->dims->data[0],
                       n_cell);
@@ -161,15 +164,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   TF_LITE_ENSURE_EQ(context, recurrent_to_cell_weights->dims->data[1],
                     n_output);
 
-  // We make sure the input-gate's parameters are either both present (regular
-  // LSTM) or not at all (CIFG-LSTM).
-  const bool cifg_weights_all_or_none =
-      ((input_to_input_weights != nullptr) &&
-       (recurrent_to_input_weights != nullptr)) ||
-      ((input_to_input_weights == nullptr) &&
-       (recurrent_to_input_weights == nullptr));
-  TF_LITE_ENSURE(context, cifg_weights_all_or_none == true);
-
   const TfLiteTensor* cell_to_input_weights =
       GetOptionalInputTensor(context, node, kCellToInputWeightsTensor);
   if (cell_to_input_weights) {
@@ -192,7 +186,6 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
   }
 
   // Making sure the peephole weights are there all or none.
-  const bool use_cifg = (input_to_input_weights == nullptr);
   const bool peephole_weights_all_or_none =
       ((cell_to_input_weights != nullptr || use_cifg) &&
        (cell_to_forget_weights != nullptr) &&
@@ -204,10 +197,14 @@ TfLiteStatus CheckInputTensorDimensions(TfLiteContext* context,
 
   // Making sure layer norm weights are not null and have the right dimension.
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
-  TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
-  TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
+  if (use_cifg) {
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights, nullptr);
+  } else {
+    TF_LITE_ENSURE(context, input_layer_norm_weights != nullptr);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->size, 1);
+    TF_LITE_ENSURE_EQ(context, input_layer_norm_weights->dims->data[0], n_cell);
+  }
 
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
@@ -978,6 +975,9 @@ TfLiteStatus EvalFloat(
       (projection_weights == nullptr) ? nullptr : projection_weights->data.f;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -990,7 +990,6 @@ TfLiteStatus EvalFloat(
       recurrent_to_cell_weights->data.f;
   const float* recurrent_to_output_weights_ptr =
       recurrent_to_output_weights->data.f;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1115,6 +1114,9 @@ TfLiteStatus EvalHybrid(
       (projection_weights == nullptr) ? 1.0f : projection_weights->params.scale;
   const float* projection_bias_ptr =
       (projection_bias == nullptr) ? nullptr : projection_bias->data.f;
+  const float* input_layer_norm_weight_ptr =
+      (input_layer_norm_weights == nullptr) ? nullptr
+                                            : input_layer_norm_weights->data.f;
 
   // Required tensors, pointers are non-null.
   const float* input_ptr_batch = input->data.f;
@@ -1141,7 +1143,6 @@ TfLiteStatus EvalHybrid(
       reinterpret_cast<int8_t*>(recurrent_to_output_weights->data.uint8);
   const float recurrent_to_output_weights_scale =
       recurrent_to_output_weights->params.scale;
-  const float* input_layer_norm_weight_ptr = input_layer_norm_weights->data.f;
   const float* forget_layer_norm_weight_ptr = forget_layer_norm_weights->data.f;
   const float* cell_layer_norm_weight_ptr = cell_layer_norm_weights->data.f;
   const float* output_layer_norm_weight_ptr = output_layer_norm_weights->data.f;
@@ -1221,7 +1222,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
       GetOptionalInputTensor(context, node, kCellToOutputWeightsTensor);
 
   const TfLiteTensor* input_layer_norm_weights =
-      GetInput(context, node, kInputLayerNormWeightsTensor);
+      GetOptionalInputTensor(context, node, kInputLayerNormWeightsTensor);
   const TfLiteTensor* forget_layer_norm_weights =
       GetInput(context, node, kForgetLayerNormWeightsTensor);
   const TfLiteTensor* cell_layer_norm_weights =
diff --git a/tensorflow/lite/kernels/layer_norm_lstm_test.cc b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
index e89bce50c311eb0bf685a7da487c18704e831c91..1c13cee1c3f66ed2a3459cd2bcc32211c3b1f00e 100644
--- a/tensorflow/lite/kernels/layer_norm_lstm_test.cc
+++ b/tensorflow/lite/kernels/layer_norm_lstm_test.cc
@@ -83,7 +83,11 @@ class LayerNormLSTMOpModel : public SingleOpModel {
       cell_to_output_weights_ = AddNullInput();
     }
 
-    input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    if (use_cifg) {
+      input_layer_norm_weights_ = AddNullInput();
+    } else {
+      input_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
+    }
     forget_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
     cell_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
     output_layer_norm_weights_ = AddInput(TensorType_FLOAT32);
@@ -650,6 +654,223 @@ TEST_F(NoCifgPeepholeProjectionNoClippingLayerNormLstmTest,
                 &layer_norm_lstm);
 }
 
+class CifgPeepholeProjectionNoClippingLayerNormLstmTest
+    : public BaseLayerNormLstmTest {
+  void SetUp() override {
+    input_to_forget_weights_ = {-0.6, -0.1, 0.3,  0.2,  0.9,  -0.5, -0.2,
+                                -0.4, 0.3,  -0.8, -0.4, 0.3,  -0.5, -0.4,
+                                -0.6, 0.3,  -0.4, -0.6, -0.5, -0.5};
+    input_to_cell_weights_ = {-0.4, -0.3, -0.2, -0.1, -0.5, 0.5,  -0.2,
+                              -0.3, -0.2, -0.6, 0.6,  -0.1, -0.4, -0.3,
+                              -0.7, 0.7,  -0.9, -0.5, 0.8,  0.6};
+    input_to_output_weights_ = {-0.8, -0.4, -0.2, -0.9, -0.1, -0.7, 0.3,
+                                -0.3, -0.8, -0.2, 0.6,  -0.2, 0.4,  -0.7,
+                                -0.3, -0.5, 0.1,  0.5,  -0.6, -0.4};
+
+    forget_gate_bias_ = {0.1, -0.3, -0.2, 0.1};
+    cell_gate_bias_ = {-0.05, 0.72, 0.25, 0.08};
+    output_gate_bias_ = {0.05, -0.01, 0.2, 0.1};
+
+    recurrent_to_cell_weights_ = {-0.3, 0.2, 0.1, -0.3, 0.8,  -0.08,
+                                  -0.2, 0.3, 0.8, -0.6, -0.1, 0.2};
+    recurrent_to_forget_weights_ = {-0.5, -0.3, -0.5, -0.2, 0.6, 0.4,
+                                    0.9,  0.3,  -0.1, 0.2,  0.5, 0.2};
+    recurrent_to_output_weights_ = {0.3,  -0.1, 0.1,  -0.2, -0.5, -0.7,
+                                    -0.2, -0.6, -0.1, -0.4, -0.7, -0.2};
+
+    cell_to_forget_weights_ = {-0.02, -0.15, -0.25, -0.03};
+    cell_to_output_weights_ = {0.1, -0.1, -0.5, 0.05};
+
+    forget_layer_norm_weights_ = {0.2, 0.2, 0.4, 0.3};
+    cell_layer_norm_weights_ = {0.7, 0.2, 0.3, 0.8};
+    output_layer_norm_weights_ = {0.6, 0.2, 0.2, 0.5};
+    projection_weights_ = {-0.1, 0.2,  0.01, -0.2, 0.1,  0.5,
+                           0.3,  0.08, 0.07, 0.2,  -0.4, 0.2};
+
+    layer_norm_lstm_input_ = {
+        {// Batch0: 3 (input_sequence_size) * 5 (n_input)
+         0.7, 0.8, 0.1, 0.2, 0.3,   // seq 0
+         0.8, 0.1, 0.2, 0.4, 0.5,   // seq 1
+         0.2, 0.7, 0.7, 0.1, 0.7},  // seq 2
+
+        {// Batch1: 3 (input_sequence_size) * 5 (n_input)
+         0.3, 0.2, 0.9, 0.8, 0.1,   // seq 0
+         0.1, 0.5, 0.2, 0.4, 0.2,   // seq 1
+         0.6, 0.9, 0.2, 0.5, 0.7},  // seq 2
+    };
+  }
+};
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       LayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  LayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.02129706, 0.140816242, 0.0112733059,     // seq 0
+          0.0132302344, 0.152308047, 0.0346313119,   // seq 1
+          -0.0123688057, 0.165790111, 0.0893077999,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0226350538, 0.0916948169, 0.0769175813,  // seq 0
+          -0.0269966982, 0.149707705, 0.094149217,    // seq 1
+          -0.0103429332, 0.173016444, 0.0720508844,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
+TEST_F(CifgPeepholeProjectionNoClippingLayerNormLstmTest,
+       HybridLayerNormLstmBlackBoxTest) {
+  const int n_batch = 2;
+  const int n_input = 5;
+  const int n_cell = 4;
+  const int n_output = 3;
+  const float ceil_clip = 0.0;
+  const float proj_clip = 0.0;
+
+  HybridLayerNormLSTMOpModel layer_norm_lstm(
+      n_batch, n_input, n_cell, n_output,
+      /*use_cifg=*/true, /*use_peephole=*/true,
+      /*use_projection_weights=*/true,
+      /*use_projection_bias=*/false, ceil_clip, proj_clip,
+      {
+          {n_batch, n_input},  // input tensor
+
+          {0, 0},             // input_to_input_weight tensor
+          {n_cell, n_input},  // input_to_forget_weight tensor
+          {n_cell, n_input},  // input_to_cell_weight tensor
+          {n_cell, n_input},  // input_to_output_weight tensor
+
+          {0, 0},              // recurrent_to_input_weight tensor
+          {n_cell, n_output},  // recurrent_to_forget_weight tensor
+          {n_cell, n_output},  // recurrent_to_cell_weight tensor
+          {n_cell, n_output},  // recurrent_to_output_weight tensor
+
+          {0},       // cell_to_input_weight tensor
+          {n_cell},  // cell_to_forget_weight tensor
+          {n_cell},  // cell_to_output_weight tensor
+
+          {0},       // input_layer_norm_weight tensor
+          {n_cell},  // forget_layer_norm_weight tensor
+          {n_cell},  // cell_layer_norm_weight tensor
+          {n_cell},  // output_layer_norm_weight tensor
+
+          {0},       // input_gate_bias tensor
+          {n_cell},  // forget_gate_bias tensor
+          {n_cell},  // cell_bias tensor
+          {n_cell},  // output_gate_bias tensor
+
+          {n_output, n_cell},  // projection_weight tensor
+          {0},                 // projection_bias tensor
+      });
+
+  layer_norm_lstm.SetInputToCellWeights(input_to_cell_weights_);
+  layer_norm_lstm.SetInputToForgetWeights(input_to_forget_weights_);
+  layer_norm_lstm.SetInputToOutputWeights(input_to_output_weights_);
+
+  layer_norm_lstm.SetCellBias(cell_gate_bias_);
+  layer_norm_lstm.SetForgetGateBias(forget_gate_bias_);
+  layer_norm_lstm.SetOutputGateBias(output_gate_bias_);
+
+  layer_norm_lstm.SetRecurrentToCellWeights(recurrent_to_cell_weights_);
+  layer_norm_lstm.SetRecurrentToForgetWeights(recurrent_to_forget_weights_);
+  layer_norm_lstm.SetRecurrentToOutputWeights(recurrent_to_output_weights_);
+
+  layer_norm_lstm.SetCellToForgetWeights(cell_to_forget_weights_);
+  layer_norm_lstm.SetCellToOutputWeights(cell_to_output_weights_);
+
+  layer_norm_lstm.SetForgetLayerNormWeights(forget_layer_norm_weights_);
+  layer_norm_lstm.SetCellLayerNormWeights(cell_layer_norm_weights_);
+  layer_norm_lstm.SetOutputLayerNormWeights(output_layer_norm_weights_);
+
+  layer_norm_lstm.SetProjectionWeights(projection_weights_);
+
+  // Verify the final output.
+  const std::vector<std::vector<float>> layer_norm_lstm_golden_output = {
+      {
+          // Batch0: 3 (input_sequence_size) * 3 (n_output)
+          0.0212250091, 0.140474007, 0.0115012666,   // seq 0
+          0.0130806509, 0.152660668, 0.0347516984,   // seq 1
+          -0.0124010444, 0.166042402, 0.0898982584,  // seq 2
+      },
+      {
+          // Batch1: 3 (input_sequence_size) * 3 (n_output)
+          -0.0228835996, 0.0917588323, 0.0778886303,  // seq 0
+          -0.0275101066, 0.148769245, 0.0938384682,   // seq 1
+          -0.0103605557, 0.172605693, 0.0728750974,   // seq 2
+      }};
+
+  VerifyGoldens(layer_norm_lstm_input_, layer_norm_lstm_golden_output,
+                &layer_norm_lstm);
+}
+
 }  // namespace
 }  // namespace custom
 }  // namespace ops
diff --git a/tensorflow/lite/kernels/mirror_pad.cc b/tensorflow/lite/kernels/mirror_pad.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e74e47f7a37b0f449fb2a63237e95066bb452de6
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad.cc
@@ -0,0 +1,374 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <memory>
+
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace mirror_pad {
+namespace {
+
+// Simple class that represents a mirror padded tensor - which is the output
+// from the Op.
+struct PaddedTensor {
+  // If not null that means this is a scalar value.
+  // Note: This is not owned by default. It will point to the value
+  // in the input tensor.
+  const void* value = nullptr;
+  // If this tensor is not one value, then this vector will have
+  // all the tensors that belongs to this tensor.
+  // Pointers are owned.
+  std::vector<std::unique_ptr<PaddedTensor>> values;
+  // Pointers to PaddedTensors that are padded on the left of the current
+  // tensor.
+  std::vector<PaddedTensor*> left_pad_ptrs;
+  // Pointers to PaddedTensors that are padded on the right of the current
+  // tensor.
+  std::vector<PaddedTensor*> right_pad_ptrs;
+
+  // Returns mutable pointer to the tensor identified by 'indices'.
+  PaddedTensor* GetMutable(const std::vector<int>& indices) {
+    auto* result = this;
+    for (int i = 0; i < indices.size(); ++i) {
+      if (indices[i] >= result->values.size()) {
+        return nullptr;
+      }
+      result = result->values[indices[i]].get();
+      if (result == nullptr) break;
+    }
+    return result;
+  }
+};
+
+// Util method to initialize the memory of the padded tensor.
+void InitializeTensorMemory(const TfLiteIntArray* const dims, int dim_index,
+                            int dims_size, PaddedTensor* padded_tensor) {
+  if (dim_index >= dims_size) {
+    return;
+  }
+  padded_tensor->values.reserve(dims->data[dim_index]);
+  for (int i = 0; i < dims->data[dim_index]; ++i) {
+    padded_tensor->values.emplace_back(new PaddedTensor());
+    InitializeTensorMemory(dims, dim_index + 1, dims_size,
+                           padded_tensor->values.back().get());
+  }
+}
+
+// Returns pointer to the value at the specified index in 'data'.
+inline const void* GetValuePointerAtIndex(const void* data, int index,
+                                          const TfLiteType data_type) {
+  switch (data_type) {
+    case kTfLiteFloat32:
+      return static_cast<const float*>(data) + index;
+    case kTfLiteInt32:
+      return static_cast<const int32_t*>(data) + index;
+    case kTfLiteUInt8:
+      return static_cast<const uint8_t*>(data) + index;
+    case kTfLiteInt64:
+      return static_cast<const int64_t*>(data) + index;
+    case kTfLiteBool:
+      return static_cast<const bool*>(data) + index;
+    case kTfLiteInt16:
+      return static_cast<const int16_t*>(data) + index;
+    case kTfLiteInt8:
+      return static_cast<const int8_t*>(data) + index;
+    // Unsupported types ?
+    default:
+      return nullptr;
+  }
+  return nullptr;
+}
+
+// Util method that increment index in the N-d array.
+void IncrementTensorIndex(const TfLiteIntArray* dims,
+                          std::vector<int>* tensor_index_ptr) {
+  int dimension_index = dims->size - 1;
+  auto& tensor_index = *tensor_index_ptr;
+  tensor_index[dimension_index]++;
+  while (dimension_index >= 0 &&
+         tensor_index[dimension_index] == dims->data[dimension_index]) {
+    tensor_index[dimension_index] = 0;
+    dimension_index--;
+    if (dimension_index >= 0) tensor_index[dimension_index]++;
+  }
+}
+
+// Fills the 'padded_tensor' with data from 'input_tensor'.
+TfLiteStatus InitFromInputTensor(const TfLiteTensor* input_tensor,
+                                 PaddedTensor* padded_tensor) {
+  const auto* dims = input_tensor->dims;
+  const auto data_type = input_tensor->type;
+  const void* data = static_cast<const void*>(input_tensor->data.raw_const);
+  // Either invalid input or unsupported type.+
+  if (data == nullptr) {
+    return kTfLiteError;
+  }
+  // Index of current processing tensor.
+  std::vector<int> tensor_index(dims->size, 0);
+  int flat_index = 0;
+  const int num_elements = NumElements(input_tensor);
+  while (flat_index < num_elements) {
+    auto* tensor = padded_tensor->GetMutable(tensor_index);
+    if (tensor == nullptr) {
+      return kTfLiteError;
+    }
+    tensor->value = GetValuePointerAtIndex(data, flat_index, data_type);
+    IncrementTensorIndex(dims, &tensor_index);
+    ++flat_index;
+  }
+
+  return kTfLiteOk;
+}
+
+template <typename T>
+inline void GetPadding(const T* data, int offset, int64_t* left_pad,
+                       int64_t* right_pad) {
+  *left_pad = static_cast<int64_t>(*(data + offset * 2));
+  *right_pad = static_cast<int64_t>(*(data + offset * 2 + 1));
+}
+
+inline TfLiteStatus GetPadding(const TfLiteTensor* padding_matrix,
+                               int dimension, int64_t* left_pad,
+                               int64_t* right_pad) {
+  switch (padding_matrix->type) {
+    case kTfLiteInt32:
+      GetPadding(padding_matrix->data.i32, dimension, left_pad, right_pad);
+      break;
+    case kTfLiteInt64:
+      GetPadding(padding_matrix->data.i64, dimension, left_pad, right_pad);
+      break;
+    default:
+      return kTfLiteError;
+  }
+  return kTfLiteOk;
+}
+
+TfLiteStatus ValidateTensor(const TfLiteTensor* padding_matrix, int offset,
+                            int dimension_index, PaddedTensor* padded_tensor,
+                            TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) {
+    return kTfLiteOk;
+  }
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+  // If we are not going to include border we must have enough values
+  // to use.
+  if (left_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        left_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (right_pad + offset > padded_tensor->values.size()) {
+    context->ReportError(
+        context, "Not enough values for Mirror Pad, required %d, available %d.",
+        right_pad + offset, padded_tensor->values.size());
+    return kTfLiteError;
+  }
+  if (!padded_tensor->values.empty()) {
+    ValidateTensor(padding_matrix, offset, dimension_index + 1,
+                   padded_tensor->values[0].get(), context);
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'padded_tensor' with the padding information based on
+// 'padding_matrix'.
+// 'dimension_index' represents which dimension the function is operating on.
+TfLiteStatus PadTensor(const TfLiteTensor* padding_matrix, int offset,
+                       int dimension_index, PaddedTensor* padded_tensor,
+                       TfLiteContext* context) {
+  if (dimension_index >= padding_matrix->dims->data[0]) return kTfLiteOk;
+
+  int64_t left_pad = 0, right_pad = 0;
+  TF_LITE_ENSURE_STATUS(
+      GetPadding(padding_matrix, dimension_index, &left_pad, &right_pad));
+
+  for (int i = left_pad + offset - 1; i >= offset && left_pad > 0;
+       --i, --left_pad) {
+    padded_tensor->left_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+  for (int i = padded_tensor->values.size() - (1 + offset);
+       i >= 0 && right_pad > 0; --i, --right_pad) {
+    padded_tensor->right_pad_ptrs.push_back(padded_tensor->values[i].get());
+  }
+
+  for (auto& tensor : padded_tensor->values) {
+    TF_LITE_ENSURE_STATUS(PadTensor(padding_matrix, offset, dimension_index + 1,
+                                    tensor.get(), context));
+  }
+  return kTfLiteOk;
+}
+
+// Fills 'output_data' with data from 'padded_tensor'.
+// The function does this recursively by setting left padding first then
+// original data, followed by the right padding.
+template <typename T>
+int FillOutput(const PaddedTensor* padded_tensor, T* output_data,
+               int index_in_output) {
+  if (padded_tensor == nullptr || output_data == nullptr) {
+    return -1;
+  }
+  if (padded_tensor->value != nullptr) {
+    output_data[index_in_output] = *static_cast<const T*>(padded_tensor->value);
+    return index_in_output + 1;
+  }
+  for (const auto* tensor : padded_tensor->left_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  for (const auto& tensor : padded_tensor->values) {
+    index_in_output = FillOutput(tensor.get(), output_data, index_in_output);
+  }
+  for (const auto* tensor : padded_tensor->right_pad_ptrs) {
+    index_in_output = FillOutput(tensor, output_data, index_in_output);
+  }
+  return index_in_output;
+}
+
+// Returns the shape of the final output after padding.
+std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> GetPaddedOutputShape(
+    const TfLiteTensor* input, const TfLiteTensor* padding_matrix) {
+  const int input_dims = NumDimensions(input);
+  std::unique_ptr<TfLiteIntArray, void (*)(TfLiteIntArray*)> shape(
+      TfLiteIntArrayCreate(input_dims), TfLiteIntArrayFree);
+
+  int64_t left_pad = 0, right_pad = 0;
+  for (int i = 0; i < input_dims; ++i) {
+    GetPadding(padding_matrix, i, &left_pad, &right_pad);
+    shape->data[i] = SizeOfDimension(input, i) + left_pad + right_pad;
+  }
+  return shape;
+}
+
+}  // namespace
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  auto* params =
+      reinterpret_cast<TfLiteMirrorPaddingParams*>(node->builtin_data);
+
+  if (params == nullptr) {
+    return kTfLiteError;
+  }
+  const int input_dims = NumDimensions(input_tensor);
+
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+  if (IsDynamicTensor(output_tensor)) {
+    auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+    if (output_size == nullptr) {
+      return kTfLiteError;
+    }
+    TF_LITE_ENSURE_STATUS(
+        context->ResizeTensor(context, output_tensor, output_size.release()));
+  }
+
+  PaddedTensor padded_tensor;
+  // Initialize memory.
+  InitializeTensorMemory(input_tensor->dims, 0, input_dims, &padded_tensor);
+  // Set the values from the input_tensor.
+  TF_LITE_ENSURE_STATUS(InitFromInputTensor(input_tensor, &padded_tensor));
+
+  const int offset =
+      params->mode != TfLiteMirrorPaddingMode::kTfLiteMirrorPaddingReflect ? 0
+                                                                           : 1;
+  // Make sure padding values are sufficient and valid to use.
+  TF_LITE_ENSURE_STATUS(
+      ValidateTensor(padding_matrix, offset, 0, &padded_tensor, context));
+  // Apply padding.
+  TF_LITE_ENSURE_STATUS(
+      PadTensor(padding_matrix, offset, 0, &padded_tensor, context));
+
+  // Fill the output tensor from the padded tensor.
+  TfLiteStatus status = kTfLiteOk;
+
+#define TF_LITE_MIRROR_PAD(type) \
+  FillOutput(&padded_tensor, GetTensorData<type>(output_tensor), 0);
+
+  switch (output_tensor->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_MIRROR_PAD(float);
+      break;
+    }
+    case kTfLiteInt32: {
+      TF_LITE_MIRROR_PAD(int32_t);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_MIRROR_PAD(uint8_t);
+      break;
+    }
+    case kTfLiteInt64: {
+      TF_LITE_MIRROR_PAD(int64_t);
+      break;
+    }
+    default:
+      status = kTfLiteError;
+      break;
+  }
+#undef TF_LITE_MIRROR_PAD
+  return status;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  return nullptr;
+}
+
+void Free(TfLiteContext* context, void* buffer) {}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  const TfLiteTensor* input_tensor = GetInput(context, node, 0);
+  const TfLiteTensor* padding_matrix = GetInput(context, node, 1);
+  TfLiteTensor* output_tensor = GetOutput(context, node, 0);
+
+  TF_LITE_ENSURE_EQ(context, NumDimensions(padding_matrix), 2);
+  TF_LITE_ENSURE_EQ(context, SizeOfDimension(padding_matrix, 0),
+                    NumDimensions(input_tensor));
+
+  if (!IsConstantTensor(padding_matrix)) {
+    SetTensorToDynamic(output_tensor);
+    return kTfLiteOk;
+  }
+  // We have constant padding, so we can infer output size.
+
+  auto output_size = GetPaddedOutputShape(input_tensor, padding_matrix);
+  if (output_size == nullptr) {
+    return kTfLiteError;
+  }
+  return context->ResizeTensor(context, output_tensor, output_size.release());
+}
+
+}  // namespace mirror_pad
+TfLiteRegistration* Register_MIRROR_PAD() {
+  static TfLiteRegistration r = {mirror_pad::Init, mirror_pad::Free,
+                                 mirror_pad::Prepare, mirror_pad::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/mirror_pad_test.cc b/tensorflow/lite/kernels/mirror_pad_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd09e6e4493d3a29bffecfcd4a4d1946840a4e5e
--- /dev/null
+++ b/tensorflow/lite/kernels/mirror_pad_test.cc
@@ -0,0 +1,189 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+template <typename T>
+class BaseMirrorPadOpModel : public SingleOpModel {
+ public:
+  BaseMirrorPadOpModel(const TensorData& input,
+                       const TensorData& padding_matrix,
+                       const TensorData& output,
+                       const tflite::MirrorPadMode mode) {
+    input_id_ = AddInput(input);
+    padding_matrix_id_ = AddInput(padding_matrix);
+    output_id_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_MIRROR_PAD, BuiltinOptions_MirrorPadOptions,
+                 CreateMirrorPadOptions(builder_, mode).Union());
+    BuildInterpreter({GetShape(input_id_), GetShape(padding_matrix_id_)});
+  }
+
+  int input_tensor_id() { return input_id_; }
+  int padding_matrix_tensor_id() { return padding_matrix_id_; }
+
+  std::vector<T> GetOutput() { return ExtractVector<T>(output_id_); }
+
+ protected:
+  int input_id_;
+  int padding_matrix_id_;
+  int output_id_;
+};
+
+TEST(MirrorPadTest, EmptyPad) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 0, 0, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 2, 4, 5, 6, 5, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 2, 1, 2, 3, 5, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_right_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 1, 0, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 2, 3, 3, 4, 5, 6, 6, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadOneSide_left_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 0, 1, 0});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 1, 1, 2, 3, 4, 4, 5, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({1, 1, 2, 3, 3, 1, 1, 2, 3, 3,
+                                4, 4, 5, 6, 6, 4, 4, 5, 6, 6}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 1, 1});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({5, 4, 5, 6, 5, 2, 1, 2, 3, 2,
+                                5, 4, 5, 6, 5, 2, 1, 2, 3, 2}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Symmetric_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {2, 2, 3, 3});
+  model.Invoke();
+  EXPECT_THAT(
+      model.GetOutput(),
+      ElementsAreArray({6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1,
+                        3, 2, 1, 1, 2, 3, 3, 2, 1, 6, 5, 4, 4, 5, 6, 6, 5, 4,
+                        6, 5, 4, 4, 5, 6, 6, 5, 4, 3, 2, 1, 1, 2, 3, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, PadBothSides_Reflect_Whole) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1,
+                                6, 5, 4, 5, 6, 5, 4, 3, 2, 1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {2, 3}}, {TensorType_INT32, {2, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3, 4, 5, 6});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {1, 1, 2, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(),
+              ElementsAreArray({2, 1, 1, 2, 3, 3, 2, 2, 1, 1, 2, 3, 3, 2,
+                                5, 4, 4, 5, 6, 6, 5, 5, 4, 4, 5, 6, 6, 5}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Reflect) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_REFLECT);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 2, 1}));
+}
+
+TEST(MirrorPadTest, Pad_1D_Symmetric) {
+  BaseMirrorPadOpModel<int> model(
+      {TensorType_INT32, {3}}, {TensorType_INT32, {1, 2}},
+      {TensorType_INT32, {}}, tflite::MirrorPadMode_SYMMETRIC);
+  model.PopulateTensor<int>(model.input_tensor_id(), {1, 2, 3});
+  model.PopulateTensor<int>(model.padding_matrix_tensor_id(), {0, 2});
+  model.Invoke();
+  EXPECT_THAT(model.GetOutput(), ElementsAreArray({1, 2, 3, 3, 2}));
+}
+
+}  // namespace
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/pooling_test.cc b/tensorflow/lite/kernels/pooling_test.cc
index 80eef02509009c3b553b2696ab5fc00aeccea972..98777f1c13ff97551c05cddc1d319918ea6ed69a 100644
--- a/tensorflow/lite/kernels/pooling_test.cc
+++ b/tensorflow/lite/kernels/pooling_test.cc
@@ -67,6 +67,10 @@ class QuantizedPoolingOpModel : public BasePoolingOpModel {
     QuantizeAndPopulate<uint8_t>(input_, data);
   }
 
+  void SetInput(const std::vector<float>& data) {
+    QuantizeAndPopulate<uint8_t>(input_, data);
+  }
+
   std::vector<uint8_t> GetOutput() { return ExtractVector<uint8_t>(output_); }
   std::vector<float> GetDequantizedOutput() {
     return Dequantize<uint8_t>(ExtractVector<uint8_t>(output_),
@@ -106,6 +110,45 @@ TEST(QuantizedPoolingOpTest, AveragePool) {
   EXPECT_THAT(m.GetOutput(), ElementsAreArray({44, 92}));
 }
 
+// Send in a white image, expect a white pixel.
+TEST(QuantizedPoolingOpTest, AveragePoolImageSize16) {
+  int image_size = 16;
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, image_size, image_size, 1}, 0, 16},
+      /*filter_width=*/image_size,
+      /*filter_height=*/image_size,
+      /*output=*/{TensorType_UINT8, {}, 0, 16});
+
+  std::vector<float> input(image_size * image_size, 16.f);
+  m.SetInput(input);
+  m.Invoke();
+
+  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(255));
+  EXPECT_THAT(m.GetDequantizedOutput(), ElementsAreArray(ArrayFloatNear({16})));
+}
+
+// Send in a white image, expect something other than a white pixel, due to
+// overflow.
+TEST(QuantizedPoolingOpTest, AveragePoolImageSize17) {
+  int image_size = 17;
+  QuantizedPoolingOpModel m(
+      BuiltinOperator_AVERAGE_POOL_2D,
+      /*input=*/{TensorType_UINT8, {1, image_size, image_size, 1}, 0, 16},
+      /*filter_width=*/image_size,
+      /*filter_height=*/image_size,
+      /*output=*/{TensorType_UINT8, {}, 0, 16});
+
+  std::vector<float> input(image_size * image_size, 16.f);
+  m.SetInput(input);
+  m.Invoke();
+
+  // Ordinarily we would see '255' here. However, the optimized version of
+  // AveragePool uses a uint16 accumulator which causes it to overflow for
+  // images this large.
+  EXPECT_THAT(m.GetOutput(), ::testing::ElementsAre(28));
+}
+
 TEST(FloatPoolingOpTest, MaxPool) {
   FloatPoolingOpModel m(BuiltinOperator_MAX_POOL_2D,
                         /*input=*/{TensorType_FLOAT32, {1, 2, 4, 1}},
diff --git a/tensorflow/lite/kernels/reduce.cc b/tensorflow/lite/kernels/reduce.cc
index ed2d475f6d7d3809a29da184ef5061e83507697c..336e827ca4c76abf3a08492249dfc0ce9cd81439 100644
--- a/tensorflow/lite/kernels/reduce.cc
+++ b/tensorflow/lite/kernels/reduce.cc
@@ -20,6 +20,8 @@ limitations under the License.
 #include "tensorflow/lite/kernels/internal/quantization_util.h"
 #include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
 #include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
+#include "tensorflow/lite/kernels/internal/types.h"
 #include "tensorflow/lite/kernels/kernel_util.h"
 #include "tensorflow/lite/kernels/op_macros.h"
 
@@ -229,6 +231,17 @@ TfLiteStatus PrepareMeanOrSum(TfLiteContext* context, TfLiteNode* node) {
   return ResizeTempSum(context, &op_context, temp_sum);
 }
 
+void ResolveAxis(const int* axis_data, int axis_count,
+                 tflite::MeanParams* op_params) {
+  int i = 0;
+  for (; i < axis_count; ++i) {
+    op_params->axis[i] = static_cast<int16>(axis_data[i]);
+  }
+  for (; i < 4; ++i) {
+    op_params->axis[i] = 1;
+  }
+}
+
 template <KernelType kernel_type>
 TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
   OpContext op_context(context, node);
@@ -257,9 +270,23 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
 
   if (kernel_type == kReference) {
     switch (op_context.input->type) {
-      case kTfLiteFloat32:
-        TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
-        break;
+      case kTfLiteFloat32: {
+        tflite::MeanParams op_params;
+        op_params.axis_count = num_axis;
+        ResolveAxis(GetTensorData<int>(op_context.axis), num_axis, &op_params);
+        const TfLiteTensor* input = op_context.input;
+        if (op_context.params->keep_dims && NumDimensions(input) == 4 &&
+            op_params.axis_count == 2 &&
+            ((op_params.axis[0] == 1 && op_params.axis[1] == 2) ||
+             (op_params.axis[0] == 2 && op_params.axis[1] == 1))) {
+          reference_ops::Mean(op_params, GetTensorShape(input),
+                              GetTensorData<float>(input),
+                              GetTensorShape(op_context.output),
+                              GetTensorData<float>(op_context.output));
+        } else {
+          TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, float, float));
+        }
+      } break;
       case kTfLiteInt32:
         TF_LITE_ENSURE(context, TF_LITE_MEAN(reference_ops, int, int64_t));
         break;
@@ -286,7 +313,8 @@ TfLiteStatus EvalMean(TfLiteContext* context, TfLiteNode* node) {
                   GetTensorData<int>(op_context.axis), num_axis,
                   op_context.params->keep_dims, GetTensorData<int>(temp_index),
                   GetTensorData<int>(resolved_axis),
-                  GetTensorData<int>(temp_sum), /*compute_sum=*/false));
+                  GetTensorData<int>(temp_sum),
+                  /*compute_sum=*/false));
         }
         break;
       default:
diff --git a/tensorflow/lite/kernels/register.cc b/tensorflow/lite/kernels/register.cc
index c6834537671034b5736c232486dd5eecfea75033..c0e6f6994fd2334917b178d4d3b16d73c27121c4 100644
--- a/tensorflow/lite/kernels/register.cc
+++ b/tensorflow/lite/kernels/register.cc
@@ -31,6 +31,7 @@ TfLiteRegistration* Register_RELU_1();
 
 namespace builtin {
 
+TfLiteRegistration* Register_ABS();
 TfLiteRegistration* Register_RELU();
 TfLiteRegistration* Register_RELU_N1_TO_1();
 TfLiteRegistration* Register_RELU6();
@@ -74,6 +75,7 @@ TfLiteRegistration* Register_GATHER();
 TfLiteRegistration* Register_TRANSPOSE();
 TfLiteRegistration* Register_MEAN();
 TfLiteRegistration* Register_SPLIT();
+TfLiteRegistration* Register_SPLIT_V();
 TfLiteRegistration* Register_SQUEEZE();
 TfLiteRegistration* Register_STRIDED_SLICE();
 TfLiteRegistration* Register_EXP();
@@ -123,6 +125,10 @@ TfLiteRegistration* Register_SQUARE();
 TfLiteRegistration* Register_ZEROS_LIKE();
 TfLiteRegistration* Register_FLOOR_MOD();
 TfLiteRegistration* Register_RANGE();
+TfLiteRegistration* Register_LEAKY_RELU();
+TfLiteRegistration* Register_SQUARED_DIFFERENCE();
+TfLiteRegistration* Register_FILL();
+TfLiteRegistration* Register_MIRROR_PAD();
 
 TfLiteStatus UnsupportedTensorFlowOp(TfLiteContext* context, TfLiteNode* node) {
   context->ReportError(
@@ -152,6 +158,7 @@ const TfLiteRegistration* BuiltinOpResolver::FindOp(const char* op,
 }
 
 BuiltinOpResolver::BuiltinOpResolver() {
+  AddBuiltin(BuiltinOperator_ABS, Register_ABS());
   AddBuiltin(BuiltinOperator_RELU, Register_RELU());
   AddBuiltin(BuiltinOperator_RELU_N1_TO_1, Register_RELU_N1_TO_1());
   AddBuiltin(BuiltinOperator_RELU6, Register_RELU6());
@@ -207,6 +214,7 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_DIV, Register_DIV());
   AddBuiltin(BuiltinOperator_SUB, Register_SUB());
   AddBuiltin(BuiltinOperator_SPLIT, Register_SPLIT());
+  AddBuiltin(BuiltinOperator_SPLIT_V, Register_SPLIT_V());
   AddBuiltin(BuiltinOperator_SQUEEZE, Register_SQUEEZE());
   AddBuiltin(BuiltinOperator_STRIDED_SLICE, Register_STRIDED_SLICE());
   AddBuiltin(BuiltinOperator_EXP, Register_EXP());
@@ -214,7 +222,9 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_LOG, Register_LOG());
   AddBuiltin(BuiltinOperator_LOG_SOFTMAX, Register_LOG_SOFTMAX());
   AddBuiltin(BuiltinOperator_CAST, Register_CAST());
-  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE());
+  AddBuiltin(BuiltinOperator_DEQUANTIZE, Register_DEQUANTIZE(),
+             /* min_version */ 1,
+             /* max_version */ 2);
   AddBuiltin(BuiltinOperator_PRELU, Register_PRELU());
   AddBuiltin(BuiltinOperator_MAXIMUM, Register_MAXIMUM());
   AddBuiltin(BuiltinOperator_MINIMUM, Register_MINIMUM());
@@ -256,6 +266,10 @@ BuiltinOpResolver::BuiltinOpResolver() {
   AddBuiltin(BuiltinOperator_ZEROS_LIKE, Register_ZEROS_LIKE());
   AddBuiltin(BuiltinOperator_FLOOR_MOD, Register_FLOOR_MOD());
   AddBuiltin(BuiltinOperator_RANGE, Register_RANGE());
+  AddBuiltin(BuiltinOperator_LEAKY_RELU, Register_LEAKY_RELU());
+  AddBuiltin(BuiltinOperator_SQUARED_DIFFERENCE, Register_SQUARED_DIFFERENCE());
+  AddBuiltin(BuiltinOperator_FILL, Register_FILL());
+  AddBuiltin(BuiltinOperator_MIRROR_PAD, Register_MIRROR_PAD());
 
   // TODO(andrewharp, ahentz): Move these somewhere more appropriate so that
   // custom ops aren't always included by default.
diff --git a/tensorflow/lite/kernels/register.h b/tensorflow/lite/kernels/register.h
index eb5ce667d4c9ebcc8e392d06b92736ea41432bd6..059c9d165ee8a81096cce3885fc940f5977d7342 100644
--- a/tensorflow/lite/kernels/register.h
+++ b/tensorflow/lite/kernels/register.h
@@ -15,7 +15,6 @@ limitations under the License.
 #ifndef TENSORFLOW_LITE_KERNELS_REGISTER_H_
 #define TENSORFLOW_LITE_KERNELS_REGISTER_H_
 
-#include <unordered_map>
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/model.h"
 #include "tensorflow/lite/mutable_op_resolver.h"
diff --git a/tensorflow/lite/kernels/resize_bilinear_test.cc b/tensorflow/lite/kernels/resize_bilinear_test.cc
index 530bb32b946f07acf60f3ccbeab0248c7c2b5747..d3f4837a287accd93c23e17fa3a361efd4120101 100644
--- a/tensorflow/lite/kernels/resize_bilinear_test.cc
+++ b/tensorflow/lite/kernels/resize_bilinear_test.cc
@@ -26,8 +26,8 @@ using uint8 = std::uint8_t;
 
 class ResizeBilinearOpModel : public SingleOpModel {
  public:
-  ResizeBilinearOpModel(const TensorData& input,
-                        std::initializer_list<int> size_data = {}) {
+  explicit ResizeBilinearOpModel(const TensorData& input,
+                                 std::initializer_list<int> size_data = {}) {
     bool const_size = size_data.size() != 0;
     input_ = AddInput(input);
     if (const_size) {
diff --git a/tensorflow/lite/kernels/skip_gram.cc b/tensorflow/lite/kernels/skip_gram.cc
index f20719ecaf6eda023f9a2826d7a995c1708e9577..265ba18a3e39d3316fef2d41306540e7a170e675 100644
--- a/tensorflow/lite/kernels/skip_gram.cc
+++ b/tensorflow/lite/kernels/skip_gram.cc
@@ -107,7 +107,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
   // Generate n-grams recursively.
   tflite::DynamicBuffer buf;
   if (words.size() < params->ngram_size) {
-    buf.WriteToTensor(GetOutput(context, node, 0));
+    buf.WriteToTensorAsVector(GetOutput(context, node, 0));
     return kTfLiteOk;
   }
 
@@ -145,7 +145,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
     }
   }
 
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 }  // namespace
diff --git a/tensorflow/lite/kernels/split_v.cc b/tensorflow/lite/kernels/split_v.cc
new file mode 100644
index 0000000000000000000000000000000000000000..060e3c5f79c808cd3c8d4b21efd7f2595a68b8e8
--- /dev/null
+++ b/tensorflow/lite/kernels/split_v.cc
@@ -0,0 +1,207 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <vector>
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace split_v {
+
+struct OpContext {
+  OpContext(TfLiteContext* context, TfLiteNode* node) {
+    params = reinterpret_cast<TfLiteSplitVParams*>(node->builtin_data);
+    input = GetInput(context, node, 0);
+    size_splits = GetInput(context, node, 1);
+    axis = GetInput(context, node, 2);
+  }
+  TfLiteSplitVParams* params;
+  const TfLiteTensor* input;
+  const TfLiteTensor* size_splits;
+  const TfLiteTensor* axis;
+};
+
+TfLiteStatus UseDynamicOutputTensors(TfLiteContext* context, TfLiteNode* node) {
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    SetTensorToDynamic(GetOutput(context, node, i));
+  }
+  return kTfLiteOk;
+}
+
+template <typename T>
+void GetSizeSplitsVector(const TfLiteTensor* size_splits,
+                         std::vector<int64_t>* size_splits_vector) {
+  const auto num_elements = NumElements(size_splits);
+  for (int i = 0; i < num_elements; ++i) {
+    size_splits_vector->push_back(GetTensorData<T>(size_splits)[i]);
+  }
+}
+
+TfLiteStatus ResizeOutputTensors(TfLiteContext* context, TfLiteNode* node,
+                                 const TfLiteTensor* input,
+                                 const TfLiteTensor* size_splits,
+                                 const TfLiteTensor* axis) {
+  int axis_value = GetTensorData<int>(axis)[0];
+  if (axis_value < 0) {
+    axis_value += NumDimensions(input);
+  }
+
+  std::vector<int64_t> size_splits_vector;
+  if (size_splits->type == kTfLiteInt32) {
+    GetSizeSplitsVector<int32_t>(size_splits, &size_splits_vector);
+  } else if (size_splits->type == kTfLiteInt64) {
+    GetSizeSplitsVector<int64_t>(size_splits, &size_splits_vector);
+  } else {
+    context->ReportError(context, "size_splits only support type int32|int64.");
+    return kTfLiteError;
+  }
+
+  int minus_one_index = -1;
+  int64_t size_splits_sum = 0;
+
+  for (int i = 0; i < size_splits_vector.size(); ++i) {
+    if (size_splits_vector.at(i) == -1) {
+      if (minus_one_index == -1) {
+        minus_one_index = i;
+      } else {
+        context->ReportError(context,
+                             "The size_splits contains more than one -1.");
+      }
+    } else {
+      size_splits_sum += size_splits_vector.at(i);
+    }
+  }
+
+  const int input_size = SizeOfDimension(input, axis_value);
+
+  if (minus_one_index != -1) {
+    if (size_splits_sum > input_size) {
+      context->ReportError(
+          context,
+          "The sum of size_splits must be less than the dimension of value.");
+    } else {
+      size_splits_vector[minus_one_index] = input_size - size_splits_sum;
+    }
+  } else if (size_splits_sum != input_size) {
+    context->ReportError(
+        context,
+        "The size_splits must sum to the dimension of value along axis.");
+  }
+
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    TfLiteIntArray* output_dims = TfLiteIntArrayCopy(input->dims);
+    output_dims->data[axis_value] = size_splits_vector.at(i);
+    TfLiteTensor* output = GetOutput(context, node, i);
+    TF_LITE_ENSURE_STATUS(context->ResizeTensor(context, output, output_dims));
+  }
+
+  return kTfLiteOk;
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 3);
+
+  OpContext op_context(context, node);
+
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), op_context.params->num_splits);
+
+  auto input_type = op_context.input->type;
+  TF_LITE_ENSURE(context, input_type == kTfLiteFloat32 ||
+                              input_type == kTfLiteUInt8 ||
+                              input_type == kTfLiteInt16);
+  for (int i = 0; i < NumOutputs(node); ++i) {
+    GetOutput(context, node, i)->type = input_type;
+  }
+
+  auto size_splits = op_context.size_splits;
+  TF_LITE_ENSURE_EQ(context, NumDimensions(size_splits), 1);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), NumElements(size_splits));
+
+  // If we know the contents of the 'size_splits' tensor and the 'axis' tensor,
+  // resize all outputs. Otherwise, wait until Eval().
+  if (IsConstantTensor(op_context.size_splits) &&
+      IsConstantTensor(op_context.axis)) {
+    return ResizeOutputTensors(context, node, op_context.input,
+                               op_context.size_splits, op_context.axis);
+  } else {
+    return UseDynamicOutputTensors(context, node);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpContext op_context(context, node);
+
+  // When the 'size_splits' and the 'axis' tensor is non-const we can't resize
+  // output tensors in Prepare(), and we have to do it now.
+  if (!IsConstantTensor(op_context.axis) ||
+      !IsConstantTensor(op_context.size_splits)) {
+    TF_LITE_ENSURE_OK(
+        context, ResizeOutputTensors(context, node, op_context.input,
+                                     op_context.size_splits, op_context.axis));
+  }
+
+  int axis_value = GetTensorData<int>(op_context.axis)[0];
+
+  // Use split function to build the outputs since they share the same logic.
+#define TF_LITE_SPLIT_V(scalar)                                     \
+  VectorOfTensors<scalar> all_outputs(*context, *node->outputs);    \
+  tflite::SplitParams op_params;                                    \
+  op_params.num_split = NumOutputs(node);                           \
+  op_params.axis = axis_value;                                      \
+  reference_ops::Split(op_params, GetTensorShape(op_context.input), \
+                       GetTensorData<scalar>(op_context.input),     \
+                       all_outputs.shapes(), all_outputs.data());
+  switch (op_context.input->type) {
+    case kTfLiteFloat32: {
+      TF_LITE_SPLIT_V(float);
+      break;
+    }
+    case kTfLiteUInt8: {
+      TF_LITE_SPLIT_V(uint8_t);
+      break;
+    }
+    case kTfLiteInt16: {
+      TF_LITE_SPLIT_V(int16_t);
+      break;
+    }
+    default:
+      context->ReportError(
+          context,
+          "Only float32, uint8 and int16 are currently supported, got %d.",
+          op_context.input->type);
+      return kTfLiteError;
+  }
+#undef TF_LITE_SPLIT_V
+
+  return kTfLiteOk;
+}
+
+}  // namespace split_v
+
+TfLiteRegistration* Register_SPLIT_V() {
+  static TfLiteRegistration r = {nullptr, nullptr, split_v::Prepare,
+                                 split_v::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/split_v_test.cc b/tensorflow/lite/kernels/split_v_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d1d36d6851c12d1b05374cda5ef32255e162875
--- /dev/null
+++ b/tensorflow/lite/kernels/split_v_test.cc
@@ -0,0 +1,175 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <initializer_list>
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+constexpr int kAxisIsATensor = -1000;
+
+class SplitVOpModel : public SingleOpModel {
+ public:
+  SplitVOpModel(const TensorData& input, const TensorData& size_splits,
+                int num_splits, int axis) {
+    input_ = AddInput(input);
+    size_splits_ = AddInput(size_splits);
+    if (axis == kAxisIsATensor) {
+      axis_ = AddInput({TensorType_INT32, {1}});
+    } else {
+      axis_ = AddConstInput(TensorType_INT32, {axis}, {1});
+    }
+    for (int i = 0; i < num_splits; ++i) {
+      outputs_.push_back(AddOutput(input.type));
+    }
+    SetBuiltinOp(BuiltinOperator_SPLIT_V, BuiltinOptions_SplitVOptions,
+                 CreateSplitVOptions(builder_, num_splits).Union());
+    if (axis == kAxisIsATensor) {
+      BuildInterpreter(
+          {GetShape(input_), GetShape(size_splits_), GetShape(axis_)});
+    } else {
+      BuildInterpreter({GetShape(input_), GetShape(size_splits_), {}});
+    }
+  }
+
+  void SetInput(std::initializer_list<float> data) {
+    PopulateTensor(input_, data);
+  }
+  void SetSizeSplits(std::initializer_list<int> data) {
+    PopulateTensor(size_splits_, data);
+  }
+  void SetAxis(int axis) { PopulateTensor(axis_, {axis}); }
+
+  std::vector<float> GetOutput(int i) {
+    return ExtractVector<float>(outputs_[i]);
+  }
+  std::vector<int> GetOutputShape(int i) { return GetTensorShape(outputs_[i]); }
+
+ private:
+  int input_;
+  int size_splits_;
+  int axis_;
+  std::vector<int> outputs_;
+};
+
+// TODO(ruic): Add tests to test quantized values. b/119638735
+using TensorValues = std::initializer_list<float>;
+
+void Check(int axis, std::initializer_list<int> input_shape,
+           std::initializer_list<int> size_splits_shape,
+           std::vector<std::initializer_list<int>> output_shapes,
+           const TensorValues& input_data,
+           const std::initializer_list<int>& size_splits_data,
+           const std::vector<TensorValues>& output_data) {
+  int num_splits = size_splits_data.size();
+  SplitVOpModel m({TensorType_FLOAT32, input_shape},
+                  {TensorType_INT32, size_splits_shape}, num_splits,
+                  kAxisIsATensor);
+  m.SetInput(input_data);
+  m.SetSizeSplits(size_splits_data);
+  m.SetAxis(axis);
+  m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
+  }
+
+  SplitVOpModel const_m({TensorType_FLOAT32, input_shape},
+                        {TensorType_INT32, size_splits_shape}, num_splits,
+                        axis);
+  const_m.SetInput(input_data);
+  const_m.SetSizeSplits(size_splits_data);
+  const_m.Invoke();
+  for (int i = 0; i < num_splits; ++i) {
+    EXPECT_THAT(const_m.GetOutput(i), ElementsAreArray(output_data[i]));
+    EXPECT_THAT(const_m.GetOutputShape(i), ElementsAreArray(output_shapes[i]));
+  }
+}
+
+TEST(SplitVOpTest, TwoDimensional) {
+  // Input shape: {4, 3}
+  // size_splits: {1, 1, 3}
+  // axis: 0
+  // We should have 3 outpus with shapes respectively:
+  //  output 0 : {1, 3}
+  //  output 1 : {1, 3}
+  //  output 1 : {2, 3}
+  Check(/*axis=*/0, {4, 3}, {3}, {{1, 3}, {1, 3}, {2, 3}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}, {1, 1, 2},
+        {{1, 2, 3}, {4, 5, 6}, {7, 8, 9, 10, 11, 12}});
+}
+
+TEST(SplitVOpTest, FourDimensional) {
+  Check(/*axis=*/0, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/1, {2, 2, 2, 2}, {2}, {{2, 1, 2, 2}, {2, 1, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, -1},
+        {
+            {1, 2, 3, 4, 9, 10, 11, 12},
+            {5, 6, 7, 8, 13, 14, 15, 16},
+        });
+  Check(/*axis=*/2, {2, 2, 2, 2}, {2}, {{2, 2, 1, 2}, {2, 2, 1, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 5, 6, 9, 10, 13, 14},
+            {3, 4, 7, 8, 11, 12, 15, 16},
+        });
+  Check(/*axis=*/3, {2, 2, 2, 2}, {2}, {{2, 2, 2, 1}, {2, 2, 2, 1}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 3, 5, 7, 9, 11, 13, 15},
+            {2, 4, 6, 8, 10, 12, 14, 16},
+        });
+}
+
+TEST(SplitVOpTest, OneDimensional) {
+  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {1}, {1}},
+        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 1, 1},
+        {{1}, {2}, {3}, {4}, {5}, {6}, {7}, {8}});
+}
+
+TEST(SplitVOpTest, OneDimensional2) {
+  Check(/*axis=*/0, {8}, {8}, {{1}, {1}, {1}, {1}, {1}, {1}, {2}, {0}},
+        {1, 2, 3, 4, 5, 6, 7, 8}, {1, 1, 1, 1, 1, 1, 2, -1},
+        {{1}, {2}, {3}, {4}, {5}, {6}, {7, 8}, {}});
+}
+
+TEST(SplitVOpTest, NegativeAxis) {
+  Check(/*axis=*/-4, {2, 2, 2, 2}, {2}, {{1, 2, 2, 2}, {1, 2, 2, 2}},
+        {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, {1, 1},
+        {
+            {1, 2, 3, 4, 5, 6, 7, 8},
+            {9, 10, 11, 12, 13, 14, 15, 16},
+        });
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/squared_difference.cc b/tensorflow/lite/kernels/squared_difference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..59b53a6287dbbc863a61875be82090c1b9c6d442
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference.cc
@@ -0,0 +1,129 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/c/builtin_op_data.h"
+#include "tensorflow/lite/c/c_api_internal.h"
+#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
+#include "tensorflow/lite/kernels/internal/quantization_util.h"
+#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
+#include "tensorflow/lite/kernels/internal/tensor.h"
+#include "tensorflow/lite/kernels/kernel_util.h"
+#include "tensorflow/lite/kernels/op_macros.h"
+
+namespace tflite {
+namespace ops {
+namespace builtin {
+namespace squared_difference {
+
+constexpr int kInputTensor1 = 0;
+constexpr int kInputTensor2 = 1;
+constexpr int kOutputTensor = 0;
+
+struct OpData {
+  bool requires_broadcast;
+};
+
+template <typename T>
+T SquaredDifference(T input1, T input2) {
+  const T difference = input1 - input2;
+  return difference * difference;
+}
+
+void* Init(TfLiteContext* context, const char* buffer, size_t length) {
+  auto* data = new OpData;
+  data->requires_broadcast = false;
+  return data;
+}
+
+void Free(TfLiteContext* context, void* buffer) {
+  delete reinterpret_cast<OpData*>(buffer);
+}
+
+TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
+  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  TF_LITE_ENSURE_EQ(context, input1->type, input2->type);
+  output->type = input2->type;
+
+  data->requires_broadcast = !HaveSameShapes(input1, input2);
+
+  TfLiteIntArray* output_size = nullptr;
+  if (data->requires_broadcast) {
+    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
+                                   context, input1, input2, &output_size));
+  } else {
+    output_size = TfLiteIntArrayCopy(input1->dims);
+  }
+
+  return context->ResizeTensor(context, output, output_size);
+}
+
+template <typename T>
+void EvalSquaredDifference(TfLiteContext* context, TfLiteNode* node,
+                           const OpData* data, const TfLiteTensor* input1,
+                           const TfLiteTensor* input2, TfLiteTensor* output) {
+  if (data->requires_broadcast) {
+    reference_ops::BroadcastBinaryFunction4DSlow<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  } else {
+    reference_ops::BinaryFunction<T, T, T>(
+        GetTensorShape(input1), GetTensorData<T>(input1),
+        GetTensorShape(input2), GetTensorData<T>(input2),
+        GetTensorShape(output), GetTensorData<T>(output), SquaredDifference<T>);
+  }
+}
+
+TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
+  OpData* data = reinterpret_cast<OpData*>(node->user_data);
+
+  const TfLiteTensor* input1 = GetInput(context, node, kInputTensor1);
+  const TfLiteTensor* input2 = GetInput(context, node, kInputTensor2);
+  TfLiteTensor* output = GetOutput(context, node, kOutputTensor);
+
+  if (output->type == kTfLiteFloat32) {
+    EvalSquaredDifference<float>(context, node, data, input1, input2, output);
+  } else if (output->type == kTfLiteInt32) {
+    EvalSquaredDifference<int32_t>(context, node, data, input1, input2, output);
+  } else {
+    context->ReportError(context,
+                         "SquaredDifference only supports FLOAT32, INT32 and "
+                         "quantized UINT8 now, got %d.",
+                         output->type);
+    return kTfLiteError;
+  }
+
+  return kTfLiteOk;
+}
+
+}  // namespace squared_difference
+
+TfLiteRegistration* Register_SQUARED_DIFFERENCE() {
+  static TfLiteRegistration r = {
+      squared_difference::Init, squared_difference::Free,
+      squared_difference::Prepare, squared_difference::Eval};
+  return &r;
+}
+
+}  // namespace builtin
+}  // namespace ops
+}  // namespace tflite
diff --git a/tensorflow/lite/kernels/squared_difference_test.cc b/tensorflow/lite/kernels/squared_difference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..32bcab3b87f5f0cf5ad47724cc06c98f1a561e4a
--- /dev/null
+++ b/tensorflow/lite/kernels/squared_difference_test.cc
@@ -0,0 +1,157 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
+#include "tensorflow/lite/kernels/register.h"
+#include "tensorflow/lite/kernels/test_util.h"
+#include "tensorflow/lite/model.h"
+
+namespace tflite {
+namespace {
+
+using ::testing::ElementsAreArray;
+
+class BaseSquaredDifferenceOpModel : public SingleOpModel {
+ public:
+  BaseSquaredDifferenceOpModel(const TensorData& input1,
+                               const TensorData& input2,
+                               const TensorData& output) {
+    input1_ = AddInput(input1);
+    input2_ = AddInput(input2);
+    output_ = AddOutput(output);
+    SetBuiltinOp(BuiltinOperator_SQUARED_DIFFERENCE,
+                 BuiltinOptions_SquaredDifferenceOptions,
+                 CreateSquaredDifferenceOptions(builder_).Union());
+    BuildInterpreter({GetShape(input1_), GetShape(input2_)});
+  }
+
+  int input1() { return input1_; }
+  int input2() { return input2_; }
+
+ protected:
+  int input1_;
+  int input2_;
+  int output_;
+};
+
+class FloatSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<float> GetOutput() { return ExtractVector<float>(output_); }
+};
+
+class IntegerSquaredDifferenceOpModel : public BaseSquaredDifferenceOpModel {
+ public:
+  using BaseSquaredDifferenceOpModel::BaseSquaredDifferenceOpModel;
+
+  std::vector<int32_t> GetOutput() { return ExtractVector<int32_t>(output_); }
+};
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_SameShape) {
+  FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {1, 2, 2, 1}},
+                                  {TensorType_FLOAT32, {}});
+  m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, -1.2, 0.8});
+  m.PopulateTensor<float>(m.input2(), {0.5, 0.2, -1.5, 0.5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(),
+              ElementsAreArray(ArrayFloatNear({0.49, 0.0, 0.09, 0.09})));
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m({TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, test_shapes[i]},
+                                    {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-2.0, 0.2, 0.3, 0.8, 1.1, -2.0});
+    m.PopulateTensor<float>(m.input2(), {1.0, 0.2, 0.6, 0.4, -1.0, -0.0});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({9.0, 0.0, 0.09, 0.16, 4.41, 4.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(FloatSquaredDifferenceOpTest, FloatType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    FloatSquaredDifferenceOpModel m(
+        {TensorType_FLOAT32, test_shapes[i]},
+        {TensorType_FLOAT32, {}},  // always a scalar
+        {TensorType_FLOAT32, {}});
+    m.PopulateTensor<float>(m.input1(), {-0.2, 0.2, 0.5, 0.8, 0.11, 1.1});
+    m.PopulateTensor<float>(m.input2(), {0.1});
+    m.Invoke();
+    EXPECT_THAT(
+        m.GetOutput(),
+        ElementsAreArray(ArrayFloatNear({0.09, 0.01, 0.16, 0.49, 0.0001, 1.0})))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_SameShape) {
+  IntegerSquaredDifferenceOpModel m({TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {1, 2, 2, 1}},
+                                    {TensorType_INT32, {}});
+  m.PopulateTensor<int32_t>(m.input1(), {-2, 2, -15, 8});
+  m.PopulateTensor<int32_t>(m.input2(), {5, -2, -3, 5});
+  m.Invoke();
+  EXPECT_THAT(m.GetOutput(), ElementsAreArray({49, 16, 144, 9}));
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_VariousInputShapes) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m({TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, test_shapes[i]},
+                                      {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 2, 3, 8, 11, -20});
+    m.PopulateTensor<int32_t>(m.input2(), {1, 2, 6, 5, -5, -20});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({441, 0, 9, 9, 256, 0}))
+        << "With shape number " << i;
+  }
+}
+
+TEST(IntegerSquaredDifferenceOpTest, IntegerType_WithBroadcast) {
+  std::vector<std::vector<int>> test_shapes = {
+      {6}, {2, 3}, {2, 1, 3}, {1, 3, 1, 2}};
+  for (int i = 0; i < test_shapes.size(); ++i) {
+    IntegerSquaredDifferenceOpModel m(
+        {TensorType_INT32, test_shapes[i]},
+        {TensorType_INT32, {}},  // always a scalar
+        {TensorType_INT32, {}});
+    m.PopulateTensor<int32_t>(m.input1(), {-20, 10, 7, 3, 1, 13});
+    m.PopulateTensor<int32_t>(m.input2(), {3});
+    m.Invoke();
+    EXPECT_THAT(m.GetOutput(), ElementsAreArray({529, 49, 16, 0, 4, 100}))
+        << "With shape number " << i;
+  }
+}
+
+}  // namespace
+}  // namespace tflite
+
+int main(int argc, char** argv) {
+  ::tflite::LogToStderr();
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/tensorflow/lite/kernels/test_util.cc b/tensorflow/lite/kernels/test_util.cc
index 6b2a1f89c37dd3dcccdf5aade53ed0f984263e3a..549ea78f5b45b20139b023552a98c3dcb0d75610 100644
--- a/tensorflow/lite/kernels/test_util.cc
+++ b/tensorflow/lite/kernels/test_util.cc
@@ -129,14 +129,14 @@ void SingleOpModel::BuildInterpreter(std::vector<std::vector<int>> input_shapes,
 
   interpreter_->SetAllowFp16PrecisionForFp32(allow_fp32_relax_to_fp16);
 
+  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
+      << "Cannot allocate tensors";
+  interpreter_->ResetVariableTensors();
+
   // Modify delegate with function.
   if (apply_delegate_fn_) {
     apply_delegate_fn_(interpreter_.get());
   }
-
-  CHECK(interpreter_->AllocateTensors() == kTfLiteOk)
-      << "Cannot allocate tensors";
-  interpreter_->ResetVariableTensors();
 }
 
 void SingleOpModel::Invoke() { CHECK(interpreter_->Invoke() == kTfLiteOk); }
diff --git a/tensorflow/lite/kernels/test_util.h b/tensorflow/lite/kernels/test_util.h
index 43a5137a941d5062fbbc5d89724face9bd0976d9..dadabb86abbe3b13da74fda9224e693d310ada26 100644
--- a/tensorflow/lite/kernels/test_util.h
+++ b/tensorflow/lite/kernels/test_util.h
@@ -199,7 +199,7 @@ class SingleOpModel {
     for (const string& s : content) {
       buf.AddString(s.data(), s.length());
     }
-    buf.WriteToTensor(tensor);
+    buf.WriteToTensor(tensor, /*new_shape=*/nullptr);
   }
 
   // Populate the tensor given its index.
@@ -307,6 +307,7 @@ class SingleOpModel {
 
     if (is_quantized) {
       if (t.min != 0 || t.max != 0) {
+        // TODO(b/119422369): Handle signed int8 here.
         if (t.type == TensorType_UINT8) {
           std::tie(t.scale, t.zero_point) =
               QuantizationParams<uint8_t>(t.min, t.max);
diff --git a/tensorflow/lite/lib_package/create_ios_frameworks.sh b/tensorflow/lite/lib_package/create_ios_frameworks.sh
index fa466ed5bc7ad31f371a7f1b67754d446b45063a..abf40e7dec6c3f14ba38cb3491be5d2d0acc7caa 100755
--- a/tensorflow/lite/lib_package/create_ios_frameworks.sh
+++ b/tensorflow/lite/lib_package/create_ios_frameworks.sh
@@ -1,4 +1,4 @@
-#!/bin/bash -x
+#!/bin/bash
 # Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,17 +20,48 @@ set -e
 echo "Starting"
 TFLITE_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)/.."
 
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-g build with GPU delegate"
+  exit 1
+}
+
+USE_GPU_DELEGATE="false"
+FRAMEWORK_NAME="tensorflow_lite"
+while getopts "g" opt_name; do
+  case "$opt_name" in
+    g)
+        USE_GPU_DELEGATE="true"
+        FRAMEWORK_NAME="tensorflow_lite_gpu"
+        ;;
+    *) usage;;
+  esac
+done
+shift $((OPTIND - 1))
+readonly USE_GPU_DELEGATE
+readonly FRAMEWORK_NAME
+
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  for filename in metal_delegate.h libmetal_delegate.a ; do
+    if [[ ! -f "${TFLITE_DIR}/delegates/gpu/${filename}" ]] ; then
+      echo "File ${TFLITE_DIR}/delegates/gpu/${filename} doesn't exist."
+      echo "It's requried for building TFLite Framework with GPU. Aborting."
+      exit 1
+    fi
+  done
+fi
+
 TMP_DIR=$(mktemp -d)
 echo "Package dir: " $TMP_DIR
 FW_DIR=$TMP_DIR/tensorflow_lite_ios_frameworks
-FW_DIR_TFLITE=$FW_DIR/tensorflow_lite.framework
+FW_DIR_TFLITE=$FW_DIR/$FRAMEWORK_NAME.framework
 FW_DIR_TFLITE_HDRS=$FW_DIR_TFLITE/Headers
 
 echo "Creating target Headers directories"
 mkdir -p $FW_DIR_TFLITE_HDRS
 
 echo "Headers, populating: TensorFlow Lite"
-cd $TFLITE_DIR/../../..
+cd $TFLITE_DIR/../..
 
 find tensorflow/lite -name '*.h' \
     -not -path 'tensorflow/lite/tools/*' \
@@ -51,15 +82,21 @@ cd $FW_DIR_TFLITE_HDRS
 tar xf tmp.tar
 rm -f tmp.tar
 
-cd $TFLITE_DIR/../../..
+cd $TFLITE_DIR/../..
 echo "Generate master LICENSE file and copy to target"
 bazel build //tensorflow/tools/lib_package:clicenses_generate
-cp $TFLITE_DIR/../../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
+cp $TFLITE_DIR/../../bazel-genfiles/tensorflow/tools/lib_package/include/tensorflow/c/LICENSE \
    $FW_DIR_TFLITE
 
 echo "Copying static libraries"
+# Note: There must be a static library with the same name
+# as the framework name.
 cp $TFLITE_DIR/tools/make/gen/lib/libtensorflow-lite.a \
-   $FW_DIR_TFLITE/tensorflow_lite
+    $FW_DIR_TFLITE/$FRAMEWORK_NAME
+if [ $USE_GPU_DELEGATE == "true" ] ; then
+  cp "${TFLITE_DIR}/delegates/gpu/libmetal_delegate.a" \
+      $FW_DIR_TFLITE/libmetal_delegate.a
+fi
 
 # This is required, otherwise they interfere with the documentation of the
 # pod at cocoapods.org.
@@ -71,10 +108,10 @@ find . -type f -name readme\* -exec rm -f {} \;
 TARGET_GEN_LOCATION="$TFLITE_DIR/gen/ios_frameworks"
 echo "Moving results to target: " $TARGET_GEN_LOCATION
 cd $FW_DIR
-zip -q -r tensorflow_lite.framework.zip tensorflow_lite.framework -x .DS_Store
+zip -q -r $FRAMEWORK_NAME.framework.zip $FRAMEWORK_NAME.framework -x .DS_Store
 rm -rf $TARGET_GEN_LOCATION
 mkdir -p $TARGET_GEN_LOCATION
-cp -r tensorflow_lite.framework.zip $TARGET_GEN_LOCATION
+cp -r $FRAMEWORK_NAME.framework.zip $TARGET_GEN_LOCATION
 
 echo "Cleaning up"
 rm -rf $TMP_DIR
diff --git a/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
index b14af4cb20b893f49a0b6145f63b889115f8dbf6..73326e994bcd1bcbbea13e438b7be3ff26d378e6 100644
--- a/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/BUILD
@@ -62,6 +62,7 @@ cc_library(
     ],
     deps = [
         "//tensorflow/lite:framework",
+        "//tensorflow/lite/java/jni",
         "//tensorflow/lite/models/smartreply:predictor_lib",
     ],
     alwayslink = 1,
diff --git a/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
index d5b1ac0ffbc47283aa0c1bf68c0a85ad6228cdcc..fbd75051e714c011ba0cd747905b4ac8aec6ad75 100644
--- a/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
+++ b/tensorflow/lite/models/smartreply/demo/app/src/main/java/com/example/android/smartreply/SmartReplyClient.java
@@ -90,29 +90,26 @@ public class SmartReplyClient implements AutoCloseable {
   }
 
   private MappedByteBuffer loadModelFile() throws IOException {
-    AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
-    FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor());
-    try {
+    try (AssetFileDescriptor fileDescriptor = context.getAssets().openFd(MODEL_PATH);
+        FileInputStream inputStream = new FileInputStream(fileDescriptor.getFileDescriptor())) {
       FileChannel fileChannel = inputStream.getChannel();
       long startOffset = fileDescriptor.getStartOffset();
       long declaredLength = fileDescriptor.getDeclaredLength();
       return fileChannel.map(FileChannel.MapMode.READ_ONLY, startOffset, declaredLength);
-    } finally {
-      inputStream.close();
     }
   }
 
   private String[] loadBackoffList() throws IOException {
     List<String> labelList = new ArrayList<String>();
-    BufferedReader reader =
-        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)));
-    String line;
-    while ((line = reader.readLine()) != null) {
-      if (!line.isEmpty()) {
-        labelList.add(line);
+    try (BufferedReader reader =
+        new BufferedReader(new InputStreamReader(context.getAssets().open(BACKOFF_PATH)))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        if (!line.isEmpty()) {
+          labelList.add(line);
+        }
       }
     }
-    reader.close();
     String[] ans = new String[labelList.size()];
     labelList.toArray(ans);
     return ans;
diff --git a/tensorflow/lite/models/smartreply/ops/normalize.cc b/tensorflow/lite/models/smartreply/ops/normalize.cc
index 8480260f279c0072d09fb883fbd711cac3ea875f..3cb11cc055b269a6230a593617a86055e9d34139 100644
--- a/tensorflow/lite/models/smartreply/ops/normalize.cc
+++ b/tensorflow/lite/models/smartreply/ops/normalize.cc
@@ -92,7 +92,7 @@ TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
 
   tflite::DynamicBuffer buf;
   buf.AddString(result.data(), result.length());
-  buf.WriteToTensor(GetOutput(context, node, 0));
+  buf.WriteToTensorAsVector(GetOutput(context, node, 0));
   return kTfLiteOk;
 }
 
diff --git a/tensorflow/lite/models/smartreply/predictor.cc b/tensorflow/lite/models/smartreply/predictor.cc
index 7db2502977707d66f8b45c91d4191b92b39b75e0..59bf4a3cf1ed964e58a3b3dc9c6fb62139fcd56e 100644
--- a/tensorflow/lite/models/smartreply/predictor.cc
+++ b/tensorflow/lite/models/smartreply/predictor.cc
@@ -49,7 +49,7 @@ void ExecuteTfLite(const std::string& sentence,
     TfLiteTensor* input = interpreter->tensor(interpreter->inputs()[0]);
     tflite::DynamicBuffer buf;
     buf.AddString(sentence.data(), sentence.length());
-    buf.WriteToTensor(input);
+    buf.WriteToTensorAsVector(input);
     interpreter->AllocateTensors();
 
     interpreter->Invoke();
diff --git a/tensorflow/lite/nnapi_delegate.cc b/tensorflow/lite/nnapi_delegate.cc
index 950bdb39425f89e8870ca7f2146641912073a2e0..26d75696a1c889d752f9715358701da6300f49df 100644
--- a/tensorflow/lite/nnapi_delegate.cc
+++ b/tensorflow/lite/nnapi_delegate.cc
@@ -140,13 +140,13 @@ NNAPIDelegate::~NNAPIDelegate() {
   // ANeuralNetworksShutdown();
 }
 
-// Adds the tensors of the interpreter to the NN API model.
-TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
+// Adds the tensors of the subgraph to the NN API model.
+TfLiteStatus addTensorOperands(tflite::Subgraph* subgraph,
                                ANeuralNetworksModel* nn_model,
                                uint32_t* no_of_operands_added,
                                std::vector<int64_t>* nnapi_ids) {
   uint32_t next_id = 0;
-  for (size_t i = 0; i < interpreter->tensors_size(); i++) {
+  for (size_t i = 0; i < subgraph->tensors_size(); i++) {
     // Skip temporaries and RNN back-edges.
     if ((*nnapi_ids)[i] == kOperandNotNeeded) continue;
 
@@ -156,7 +156,7 @@ TfLiteStatus addTensorOperands(tflite::Interpreter* interpreter,
     // NNAPI requires 32-bit float scale to be zero, tflite doesn't care
     float scale = 0.0f;
     int32_t zeroPoint = 0;
-    TfLiteTensor* tensor = interpreter->tensor(i);
+    TfLiteTensor* tensor = subgraph->tensor(i);
     switch (tensor->type) {
       case kTfLiteNoType:
         // Tensors added during initialization of Ops don't have a type yet and
@@ -240,12 +240,12 @@ void MapAndAddTensorIds(const int* from_ids_buf, size_t from_ids_count,
 // Adds the operations and their parameters to the NN API model.
 // 'next-id' is the operand ID of the next operand of the model.
 TfLiteStatus AddOpsAndParams(
-    tflite::Interpreter* interpreter, ANeuralNetworksModel* nn_model,
+    tflite::Subgraph* subgraph, ANeuralNetworksModel* nn_model,
     uint32_t next_id, std::vector<int>* model_state_inputs,
     std::vector<int>* model_state_outputs,
     const std::vector<int64_t>& tensor_id_to_nnapi_id) {
-  for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-    const auto* node_and_registration = interpreter->node_and_registration(i);
+  for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+    const auto* node_and_registration = subgraph->node_and_registration(i);
     const TfLiteNode& node = node_and_registration->first;
     const TfLiteRegistration& registration = node_and_registration->second;
     tflite::BuiltinOperator builtin =
@@ -291,9 +291,9 @@ TfLiteStatus AddOpsAndParams(
     // For each state_out tensor, a corresponding state_in operand needs to be
     // created for NNAPI.
     auto duplicate_state_tensor_float32 =
-        [interpreter, &nn_model, &next_id, &augmented_inputs,
-         &model_state_inputs, &model_state_outputs](int tensor_id) {
-          const TfLiteTensor* tensor = interpreter->tensor(tensor_id);
+        [subgraph, &nn_model, &next_id, &augmented_inputs, &model_state_inputs,
+         &model_state_outputs](int tensor_id) {
+          const TfLiteTensor* tensor = subgraph->tensor(tensor_id);
           ANeuralNetworksOperandType operand_type{
               ANEURALNETWORKS_TENSOR_FLOAT32,
               static_cast<uint32_t>(tensor->dims->size),
@@ -388,11 +388,11 @@ TfLiteStatus AddOpsAndParams(
     };
 
     // LSTM in NNAPI requires scratch tensor as an output operand.
-    auto add_lstm_scratch_tensor_float32 = [interpreter, &node, &nn_model,
+    auto add_lstm_scratch_tensor_float32 = [subgraph, &node, &nn_model,
                                             &next_id, &augmented_outputs]() {
       if (node.temporaries->size == 0) return;
       int scratch_buffer_index = node.temporaries->data[0];
-      const TfLiteTensor* tensor = interpreter->tensor(scratch_buffer_index);
+      const TfLiteTensor* tensor = subgraph->tensor(scratch_buffer_index);
       ANeuralNetworksOperandType operand_type{
           ANEURALNETWORKS_TENSOR_FLOAT32,
           static_cast<uint32_t>(tensor->dims->size),
@@ -584,7 +584,7 @@ TfLiteStatus AddOpsAndParams(
         // The permutation input tensor value dictates the output dimensions.
         // TODO(b/110888333): Support dynamically-sized tensors in delegates.
         if ((node.inputs->size > 1) &&
-            (interpreter->tensor(node.inputs->data[1])->allocation_type !=
+            (subgraph->tensor(node.inputs->data[1])->allocation_type !=
              kTfLiteMmapRo)) {
           logError("NNAPI does not yet support dynamic tensors.");
           return kTfLiteError;
@@ -601,14 +601,13 @@ TfLiteStatus AddOpsAndParams(
           return kTfLiteError;
         }
         if ((node.inputs->size > 0) &&
-            (interpreter->tensor(node.inputs->data[0])->dims->size != 4)) {
+            (subgraph->tensor(node.inputs->data[0])->dims->size != 4)) {
           logError("NNAPI only supports input rank 4 for L2Normalization");
           return kTfLiteError;
         }
         break;
       case tflite::BuiltinOperator_HASHTABLE_LOOKUP:
-        if (interpreter->tensor(node.outputs->data[0])->type !=
-            kTfLiteFloat32) {
+        if (subgraph->tensor(node.outputs->data[0])->type != kTfLiteFloat32) {
           logError("NNAPI only support HASHTABLE_LOOKUP with float32 output",
                    builtin);
           return kTfLiteError;
@@ -682,6 +681,11 @@ TfLiteStatus AddOpsAndParams(
       case tflite::BuiltinOperator_FILL:
       case tflite::BuiltinOperator_FLOOR_MOD:
       case tflite::BuiltinOperator_RANGE:
+      case tflite::BuiltinOperator_LEAKY_RELU:
+      case tflite::BuiltinOperator_SQUARED_DIFFERENCE:
+      case tflite::BuiltinOperator_MIRROR_PAD:
+      case tflite::BuiltinOperator_ABS:
+      case tflite::BuiltinOperator_SPLIT_V:
         logError("Op code %d is currently not delegated to NNAPI", builtin);
         return kTfLiteError;
         break;
@@ -706,7 +710,7 @@ TfLiteStatus AddOpsAndParams(
   return kTfLiteOk;
 }
 
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   if (nn_model_ && nn_compiled_model_) return model_status_;
 
   // TODO(aselle): This is not correct. need to handle resize invalidation.
@@ -718,7 +722,7 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
     // inputs and outputs and mark the mapping in tensor_id_to_nnapi_id with
     // kOperandIdNotSet. addTensorOperands will replace those with the
     // corresponding NNAPI operand ids and skip kOperandNotNeeded entries.
-    std::vector<int64_t> tensor_id_to_nnapi_id(interpreter->tensors_size(),
+    std::vector<int64_t> tensor_id_to_nnapi_id(subgraph->tensors_size(),
                                                kOperandNotNeeded);
     auto set_ids_to_not_set = [&tensor_id_to_nnapi_id](const int* buf,
                                                        size_t count) {
@@ -729,35 +733,31 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
         }
       }
     };
-    for (size_t i = 0; i < interpreter->nodes_size(); i++) {
-      const auto* node_and_registration = interpreter->node_and_registration(i);
+    for (size_t i = 0; i < subgraph->nodes_size(); i++) {
+      const auto* node_and_registration = subgraph->node_and_registration(i);
       const TfLiteNode& node = node_and_registration->first;
       set_ids_to_not_set(node.inputs->data, node.inputs->size);
       set_ids_to_not_set(node.outputs->data, node.outputs->size);
     }
-    set_ids_to_not_set(interpreter->inputs().data(),
-                       interpreter->inputs().size());
-    set_ids_to_not_set(interpreter->outputs().data(),
-                       interpreter->outputs().size());
+    set_ids_to_not_set(subgraph->inputs().data(), subgraph->inputs().size());
+    set_ids_to_not_set(subgraph->outputs().data(), subgraph->outputs().size());
 
     uint32_t next_id = 0;
     RETURN_ERROR_IF_TFLITE_FAILED(addTensorOperands(
-        interpreter, nn_model_, &next_id, &tensor_id_to_nnapi_id));
+        subgraph, nn_model_, &next_id, &tensor_id_to_nnapi_id));
     RETURN_ERROR_IF_TFLITE_FAILED(
-        AddOpsAndParams(interpreter, nn_model_, next_id, &model_states_inputs_,
+        AddOpsAndParams(subgraph, nn_model_, next_id, &model_states_inputs_,
                         &model_states_outputs_, tensor_id_to_nnapi_id));
 
     std::vector<uint32_t> augmented_inputs;
-    MapAndAddTensorIds(interpreter->inputs().data(),
-                       interpreter->inputs().size(), &augmented_inputs,
-                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(subgraph->inputs().data(), subgraph->inputs().size(),
+                       &augmented_inputs, tensor_id_to_nnapi_id);
     augmented_inputs.insert(augmented_inputs.end(),
                             model_states_inputs_.begin(),
                             model_states_inputs_.end());
     std::vector<uint32_t> augmented_outputs;
-    MapAndAddTensorIds(interpreter->outputs().data(),
-                       interpreter->outputs().size(), &augmented_outputs,
-                       tensor_id_to_nnapi_id);
+    MapAndAddTensorIds(subgraph->outputs().data(), subgraph->outputs().size(),
+                       &augmented_outputs, tensor_id_to_nnapi_id);
     MapAndAddTensorIds(model_states_outputs_.data(),
                        model_states_outputs_.size(), &augmented_outputs,
                        tensor_id_to_nnapi_id);
@@ -770,7 +770,7 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
 
     if (GetAndroidSdkVersionCached() >= 28) {
       CHECK_NN(ANeuralNetworksModel_relaxComputationFloat32toFloat16(
-          nn_model_, interpreter->GetAllowFp16PrecisionForFp32()));
+          nn_model_, subgraph->GetAllowFp16PrecisionForFp32()));
     }
     CHECK_NN(ANeuralNetworksModel_finish(nn_model_));
   }
@@ -781,9 +781,9 @@ TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
   return kTfLiteOk;
 }
 
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) {
   if (!nn_model_) {
-    model_status_ = BuildGraph(interpreter);
+    model_status_ = BuildGraph(subgraph);
     if (model_status_ != kTfLiteOk) {
       logError("Failed to build graph for NNAPI");
     }
@@ -796,19 +796,19 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
   CHECK_NN(ANeuralNetworksExecution_create(nn_compiled_model_, &execution));
 
   // Currently perform deep copy of input buffer
-  for (size_t i = 0; i < interpreter->inputs().size(); i++) {
-    int input = interpreter->inputs()[i];
+  for (size_t i = 0; i < subgraph->inputs().size(); i++) {
+    int input = subgraph->inputs()[i];
     // TODO(aselle): Is this what we want or do we want input instead?
     // TODO(aselle): This should be called setInputValue maybe to be cons.
-    TfLiteTensor* tensor = interpreter->tensor(input);
+    TfLiteTensor* tensor = subgraph->tensor(input);
     CHECK_NN(ANeuralNetworksExecution_setInput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
 
   // Tell nn api where to place final data.
-  for (size_t i = 0; i < interpreter->outputs().size(); i++) {
-    int output = interpreter->outputs()[i];
-    TfLiteTensor* tensor = interpreter->tensor(output);
+  for (size_t i = 0; i < subgraph->outputs().size(); i++) {
+    int output = subgraph->outputs()[i];
+    TfLiteTensor* tensor = subgraph->tensor(output);
     CHECK_NN(ANeuralNetworksExecution_setOutput(
         execution, i, nullptr, tensor->data.raw, tensor->bytes));
   }
@@ -817,16 +817,16 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
   // current invocation.
   for (size_t i = 0; i < model_states_outputs_.size(); i++) {
     int state_tensor_idx = model_states_outputs_[i];
-    TfLiteTensor* tensor = interpreter->tensor(state_tensor_idx);
+    TfLiteTensor* tensor = subgraph->tensor(state_tensor_idx);
     // Here we are using a deep copy for state_in tensors so that we are not
     // reading and writing into the same buffer during a invocation.
     // TODO(miaowang): using double shared buffer to minimize the copies.
     CHECK_NN(ANeuralNetworksExecution_setInput(
-        execution, i + interpreter->inputs().size(), nullptr, tensor->data.raw,
+        execution, i + subgraph->inputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
     // Tell NNAPI where to output the state_out.
     CHECK_NN(ANeuralNetworksExecution_setOutput(
-        execution, i + interpreter->outputs().size(), nullptr, tensor->data.raw,
+        execution, i + subgraph->outputs().size(), nullptr, tensor->data.raw,
         tensor->bytes));
   }
 
@@ -839,9 +839,9 @@ TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
 
 #if 0
   printf("From the NN API:\n");
-  TfLiteTensor* tensor = interpreter->tensor(interpreter->outputs()[0]);
+  TfLiteTensor* tensor = subgraph->tensor(subgraph->outputs()[0]);
   if (float* data =
-          interpreter->typed_tensor<float>(interpreter->outputs()[0])) {
+          subgraph->typed_tensor<float>(subgraph->outputs()[0])) {
     size_t num = tensor->bytes / sizeof(float);
     for (float* p = data; p < data + num; p++) {
       printf(" %f", *p);
diff --git a/tensorflow/lite/nnapi_delegate.h b/tensorflow/lite/nnapi_delegate.h
index 63b408c1416ed1c2126cbdb5c376cb3dbb10f789..b4f8e4ecf3935c41346c78647e631651dbcccb3e 100644
--- a/tensorflow/lite/nnapi_delegate.h
+++ b/tensorflow/lite/nnapi_delegate.h
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/lite/allocation.h"
 #include "tensorflow/lite/c/c_api_internal.h"
 #include "tensorflow/lite/core/api/error_reporter.h"
+#include "tensorflow/lite/core/subgraph.h"
 #include "tensorflow/lite/interpreter.h"
 
 class ANeuralNetworksModel;
@@ -50,10 +51,10 @@ class NNAPIDelegate {
   ~NNAPIDelegate();
 
   // Convert a tflite graph to NNAPI
-  TfLiteStatus BuildGraph(Interpreter* interpreter);
+  TfLiteStatus BuildGraph(Subgraph* subgraph);
 
   // Run
-  TfLiteStatus Invoke(Interpreter* interpreter);
+  TfLiteStatus Invoke(Subgraph* subgraph);
 
   // Whether the current platform supports NNAPI delegation.
   static bool IsSupported();
diff --git a/tensorflow/lite/nnapi_delegate_disabled.cc b/tensorflow/lite/nnapi_delegate_disabled.cc
index 44dc21f1b6c2b3e4eb2c31fb19046fca90440428..a8f2c0bfe386f1339c17e34a199cf929c43ecc33 100644
--- a/tensorflow/lite/nnapi_delegate_disabled.cc
+++ b/tensorflow/lite/nnapi_delegate_disabled.cc
@@ -35,13 +35,11 @@ NNAPIDelegate::~NNAPIDelegate() {
 #undef UNUSED_MEMBER
 }
 
-TfLiteStatus NNAPIDelegate::BuildGraph(Interpreter* interpreter) {
+TfLiteStatus NNAPIDelegate::BuildGraph(Subgraph* subgraph) {
   return kTfLiteError;
 }
 
-TfLiteStatus NNAPIDelegate::Invoke(Interpreter* interpreter) {
-  return kTfLiteError;
-}
+TfLiteStatus NNAPIDelegate::Invoke(Subgraph* subgraph) { return kTfLiteError; }
 
 bool NNAPIDelegate::IsSupported() { return false; }
 
diff --git a/tensorflow/lite/optional_debug_tools.cc b/tensorflow/lite/optional_debug_tools.cc
index 5ee1cf6d33d78c5445f1e5e11d6e54b8675e3fc4..1113bf01b175d93d849dbd51abf2f6c677f450d4 100644
--- a/tensorflow/lite/optional_debug_tools.cc
+++ b/tensorflow/lite/optional_debug_tools.cc
@@ -44,6 +44,8 @@ const char* TensorTypeName(TfLiteType type) {
       return "kTfLiteInt32";
     case kTfLiteUInt8:
       return "kTfLiteUInt8";
+    case kTfLiteInt8:
+      return "kTfLiteInt8";
     case kTfLiteInt64:
       return "kTfLiteInt64";
     case kTfLiteString:
diff --git a/tensorflow/lite/profiling/BUILD b/tensorflow/lite/profiling/BUILD
index c7a8e4f06ae19057e6b869d840233613a04a95d3..52ea6fe636247ec0a4d5fedb41c56fc095e6ac61 100644
--- a/tensorflow/lite/profiling/BUILD
+++ b/tensorflow/lite/profiling/BUILD
@@ -58,7 +58,6 @@ cc_test(
     name = "profile_summarizer_test",
     srcs = ["profile_summarizer_test.cc"],
     copts = common_copts,
-    tags = ["no_oss"],
     deps = [
         ":profile_summarizer",
         "//tensorflow/lite:framework",
diff --git a/tensorflow/lite/profiling/profiler_test.cc b/tensorflow/lite/profiling/profiler_test.cc
index 82d053729c900fbb536c59658357f3a5a550646b..addebabe1b1556e3853eb0a2bec65132f743d012 100644
--- a/tensorflow/lite/profiling/profiler_test.cc
+++ b/tensorflow/lite/profiling/profiler_test.cc
@@ -27,11 +27,8 @@ namespace tflite {
 namespace profiling {
 namespace {
 
-void AssertDurationOfEventAroundMs(const ProfileEvent* event,
-                                   double expected_ms, double eps_ms) {
-  double duration_ms =
-      (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
-  EXPECT_NEAR(expected_ms, duration_ms, eps_ms);
+double GetDurationOfEventMs(const ProfileEvent* event) {
+  return (event->end_timestamp_us - event->begin_timestamp_us) / 1e3;
 }
 
 void SleepForQuarterSecond(Profiler* profiler) {
@@ -84,12 +81,17 @@ TEST(ProfilingTest, ProfilesAreCollected) {
 
 #ifndef ADDRESS_SANITIZER
   // ASAN build is sometimes very slow. Set a large epsilon to avoid flakiness.
+  // Due to flakiness, just verify relative values match.
   const int eps_ms = 50;
-  AssertDurationOfEventAroundMs(profile_events[0], /*expected_ms*/ 500, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[1], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[2], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[3], /*expected_ms*/ 250, eps_ms);
-  AssertDurationOfEventAroundMs(profile_events[4], /*expected_ms*/ 250, eps_ms);
+  auto parent_ms = GetDurationOfEventMs(profile_events[0]);
+  double child_ms[2], sleep_for_quarter_ms[2];
+  child_ms[0] = GetDurationOfEventMs(profile_events[1]);
+  child_ms[1] = GetDurationOfEventMs(profile_events[3]);
+  sleep_for_quarter_ms[0] = GetDurationOfEventMs(profile_events[2]);
+  sleep_for_quarter_ms[1] = GetDurationOfEventMs(profile_events[4]);
+  EXPECT_NEAR(parent_ms, child_ms[0] + child_ms[1], eps_ms);
+  EXPECT_NEAR(child_ms[0], sleep_for_quarter_ms[0], eps_ms);
+  EXPECT_NEAR(child_ms[1], sleep_for_quarter_ms[1], eps_ms);
 #endif
 }
 
diff --git a/tensorflow/lite/python/BUILD b/tensorflow/lite/python/BUILD
index 017dd72f78156155ac3afc01275d2d152df3d696..acf827892bfd0081f1bbc7d0c3fa4f65af3a0817 100644
--- a/tensorflow/lite/python/BUILD
+++ b/tensorflow/lite/python/BUILD
@@ -89,6 +89,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/lite/toco:toco_flags_proto_py",
+        "//tensorflow/python:dtypes",
     ],
 )
 
@@ -103,6 +104,7 @@ py_library(
         "//tensorflow/lite/toco:toco_flags_proto_py",
         "//tensorflow/lite/toco/python:tensorflow_wrap_toco",
         "//tensorflow/lite/toco/python:toco_from_protos",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:platform",
     ],
 )
diff --git a/tensorflow/lite/python/convert.py b/tensorflow/lite/python/convert.py
index 9991fb2a7335ddd9c916c35a4378ab3dcfb643bf..9c603998717019ac8624868b16d720e300a30efd 100644
--- a/tensorflow/lite/python/convert.py
+++ b/tensorflow/lite/python/convert.py
@@ -28,6 +28,8 @@ import tempfile as _tempfile
 from tensorflow.lite.python import lite_constants
 from tensorflow.lite.toco import model_flags_pb2 as _model_flags_pb2
 from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
+from tensorflow.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.platform import resource_loader as _resource_loader
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.lazy_loader import LazyLoader
@@ -53,6 +55,18 @@ else:
 if _toco_from_proto_bin and not _os.path.exists(_toco_from_proto_bin):
   _toco_from_proto_bin = "toco_from_protos"
 
+
+# Map of tf.dtypes to TFLite types_flag_pb2.
+_MAP_TF_TO_TFLITE_TYPES = {
+    dtypes.float32: _types_pb2.FLOAT,
+    dtypes.int32: _types_pb2.INT32,
+    dtypes.int64: _types_pb2.INT64,
+    dtypes.string: _types_pb2.STRING,
+    dtypes.uint8: _types_pb2.QUANTIZED_UINT8,
+    dtypes.complex64: _types_pb2.COMPLEX64
+}
+
+
 def _try_convert_to_unicode(output):
   if output is None:
     return u""
@@ -65,6 +79,25 @@ def _try_convert_to_unicode(output):
   return output
 
 
+def convert_dtype_to_tflite_type(tf_dtype):
+  """Converts tf.dtype to TFLite proto type.
+
+  Args:
+    tf_dtype: tf.dtype
+
+  Raises:
+    ValueError: Unsupported tf.dtype.
+
+  Returns:
+    types_flag_pb2.
+  """
+  result = _MAP_TF_TO_TFLITE_TYPES.get(tf_dtype)
+  if result is None:
+    raise ValueError("Unsupported tf.dtype {0}".format(tf_dtype))
+  return result
+
+
+@_tf_export("lite.OpsSet")
 class OpsSet(enum.Enum):
   """Enum class defining the sets of ops available to generate TFLite models.
 
@@ -214,10 +247,10 @@ def build_toco_convert_protos(input_tensors,
       `foo.get_shape()` and `foo.dtype`.
     output_tensors: List of output tensors (only .name is used from this).
     inference_type: Target data type of real-number arrays in the output file.
-      Must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+      Must be `{tf.float32, tf.uint8}`.  (default tf.float32)
     inference_input_type: Target data type of real-number input arrays. Allows
       for a different type for input arrays in the case of quantization.
-      Must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+      Must be `{tf.float32, tf.uint8}`. (default `inference_type`)
     input_format: Type of data to read Currently must be
       `{TENSORFLOW_GRAPHDEF}`. (default TENSORFLOW_GRAPHDEF)
     input_shapes: Input array shape. It needs to be a list of the same length
@@ -269,16 +302,19 @@ def build_toco_convert_protos(input_tensors,
     process.
 
   Raises:
-    ValueError: If the input tensor type is unknown
+    ValueError:
+      If the input tensor type is unknown
+      Missing mean_values or std_dev_values
     RuntimeError: If TOCO fails to convert (in which case the runtime error's
       error text will contain the TOCO error log)
   """
   toco = _toco_flags_pb2.TocoFlags()
   toco.input_format = input_format
   toco.output_format = output_format
-  toco.inference_type = inference_type
+  toco.inference_type = convert_dtype_to_tflite_type(inference_type)
   if inference_input_type:
-    toco.inference_input_type = inference_input_type
+    toco.inference_input_type = convert_dtype_to_tflite_type(
+        inference_input_type)
   else:
     toco.inference_input_type = toco.inference_type
   toco.drop_control_dependency = drop_control_dependency
@@ -302,9 +338,14 @@ def build_toco_convert_protos(input_tensors,
   model.change_concat_input_ranges = change_concat_input_ranges
   for idx, input_tensor in enumerate(input_tensors):
     input_array = model.input_arrays.add()
-    if toco.inference_input_type == lite_constants.QUANTIZED_UINT8:
-      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
     input_array.name = tensor_name(input_tensor)
+    input_array.data_type = convert_dtype_to_tflite_type(input_tensor.dtype)
+
+    if toco.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if not quantized_input_stats:
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
+      input_array.mean_value, input_array.std_value = quantized_input_stats[idx]
     if input_shapes is None:
       shape = input_tensor.get_shape()
     else:
@@ -352,7 +393,11 @@ def toco_convert_graph_def(input_data, input_arrays_with_shape, output_arrays,
 
   for idx, (name, shape) in enumerate(input_arrays_with_shape):
     input_array = model_flags.input_arrays.add()
-    if kwargs["inference_type"] == lite_constants.QUANTIZED_UINT8:
+    if toco_flags.inference_input_type == _types_pb2.QUANTIZED_UINT8:
+      if (("quantized_input_stats" not in kwargs) or
+          (not kwargs["quantized_input_stats"])):
+        raise ValueError("std_dev and mean must be defined when "
+                         "inference_input_type is QUANTIZED_UINT8.")
       input_array.mean_value, input_array.std_value = kwargs[
           "quantized_input_stats"][idx]
     input_array.name = name
diff --git a/tensorflow/lite/python/convert_saved_model.py b/tensorflow/lite/python/convert_saved_model.py
index 3f54d2559c4d85f4a621b3781f3d76856c709576..f8d986b746911c68e0589b587ce0beceafc0c534 100644
--- a/tensorflow/lite/python/convert_saved_model.py
+++ b/tensorflow/lite/python/convert_saved_model.py
@@ -197,12 +197,27 @@ def set_tensor_shapes(tensors, shapes):
     tensors: TensorFlow ops.Tensor.
     shapes: Dict of strings representing input tensor names to list of
       integers representing input shapes (e.g., {"foo": : [1, 16, 16, 3]}).
+
+  Raises:
+    ValueError:
+      `shapes` contains an invalid tensor.
+      `shapes` contains an invalid shape for a valid tensor.
   """
   if shapes:
-    for tensor in tensors:
-      shape = shapes.get(tensor_name(tensor))
+    tensor_names_to_tensor = {tensor_name(tensor): tensor for tensor in tensors}
+    for name, shape in shapes.items():
+      if name not in tensor_names_to_tensor:
+        raise ValueError("Invalid tensor \'{}\' found in tensor shapes "
+                         "map.".format(name))
       if shape is not None:
-        tensor.set_shape(shape)
+        tensor = tensor_names_to_tensor[name]
+        try:
+          tensor.set_shape(shape)
+        except ValueError as error:
+          message = ("The shape of tensor '{0}' cannot be changed from {1} to "
+                     "{2}. {3}".format(name, tensor.get_shape(), shape,
+                                       str(error)))
+          raise ValueError(message)
 
 
 def freeze_saved_model(saved_model_dir, input_arrays, input_shapes,
diff --git a/tensorflow/lite/python/convert_saved_model_test.py b/tensorflow/lite/python/convert_saved_model_test.py
index dff582f1a16d2f228df5253652437e4b5266e502..11bfcdc79548378a0cec8d13a089a8d505ccf7b0 100644
--- a/tensorflow/lite/python/convert_saved_model_test.py
+++ b/tensorflow/lite/python/convert_saved_model_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.saved_model import tag_constants
 
 class TensorFunctionsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorsValid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -49,6 +50,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
         sess.graph, ["Placeholder"])
     self.assertEqual("Placeholder:0", tensors[0].name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorsInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32)
@@ -61,6 +63,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     self.assertEqual("Invalid tensors 'invalid-input' were found.",
                      str(error.exception))
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeValid(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
@@ -68,6 +71,7 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [5, 3, 5]})
     self.assertEqual([5, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeNoneValid(self):
     tensor = array_ops.placeholder(dtype=dtypes.float32)
     self.assertEqual(None, tensor.shape)
@@ -75,14 +79,35 @@ class TensorFunctionsTest(test_util.TensorFlowTestCase):
     convert_saved_model.set_tensor_shapes([tensor], {"Placeholder": [1, 3, 5]})
     self.assertEqual([1, 3, 5], tensor.shape.as_list())
 
-  def testSetTensorShapeInvalid(self):
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeArrayInvalid(self):
+    # Tests set_tensor_shape where the tensor name passed in doesn't exist.
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
-    convert_saved_model.set_tensor_shapes([tensor],
-                                          {"invalid-input": [5, 3, 5]})
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.set_tensor_shapes([tensor],
+                                            {"invalid-input": [5, 3, 5]})
+    self.assertEqual(
+        "Invalid tensor 'invalid-input' found in tensor shapes map.",
+        str(error.exception))
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+  @test_util.run_v1_only("b/120545219")
+  def testSetTensorShapeDimensionInvalid(self):
+    # Tests set_tensor_shape where the shape passed in is incompatiable.
+    tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
+    self.assertEqual([None, 3, 5], tensor.shape.as_list())
+
+    with self.assertRaises(ValueError) as error:
+      convert_saved_model.set_tensor_shapes([tensor],
+                                            {"Placeholder": [1, 5, 5]})
+    self.assertIn(
+        "The shape of tensor 'Placeholder' cannot be changed from "
+        "(?, 3, 5) to [1, 5, 5].", str(error.exception))
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
 
+  @test_util.run_v1_only("b/120545219")
   def testSetTensorShapeEmpty(self):
     tensor = array_ops.placeholder(shape=[None, 3, 5], dtype=dtypes.float32)
     self.assertEqual([None, 3, 5], tensor.shape.as_list())
diff --git a/tensorflow/lite/python/convert_test.py b/tensorflow/lite/python/convert_test.py
index 7a0bce921b599f0dad9012c3148abd7a86496594..cf49ee2b472d2c6617811cde0978eb8ae3a16f8e 100644
--- a/tensorflow/lite/python/convert_test.py
+++ b/tensorflow/lite/python/convert_test.py
@@ -23,6 +23,7 @@ from tensorflow.lite.python import convert
 from tensorflow.lite.python import lite_constants
 from tensorflow.lite.python import op_hint
 from tensorflow.lite.python.interpreter import Interpreter
+from tensorflow.lite.toco import types_pb2 as _types_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -33,6 +34,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class ConvertTest(test_util.TensorFlowTestCase):
 
   def testBasic(self):
@@ -65,6 +67,21 @@ class ConvertTest(test_util.TensorFlowTestCase):
         quantized_input_stats=[(0., 1.)])
     self.assertTrue(tflite_model)
 
+  def testQuantizationInvalid(self):
+    in_tensor = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    out_tensor = array_ops.fake_quant_with_min_max_args(
+        in_tensor + in_tensor, min=0., max=1.)
+    sess = session.Session()
+
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert(
+          sess.graph_def, [in_tensor], [out_tensor],
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
   def testGraphDefBasic(self):
     in_tensor = array_ops.placeholder(
         shape=[1, 16, 16, 3], dtype=dtypes.float32, name="input")
@@ -138,7 +155,29 @@ class ConvertTest(test_util.TensorFlowTestCase):
     self.assertTrue(([1, 16, 16, 3] == output_details[0]["shape"]).all())
     self.assertTrue(output_details[0]["quantization"][0] > 0)  # scale
 
+  def testGraphDefQuantizationInvalid(self):
+    in_tensor_1 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputA")
+    in_tensor_2 = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32, name="inputB")
+    _ = array_ops.fake_quant_with_min_max_args(
+        in_tensor_1 + in_tensor_2, min=0., max=1., name="output")
+    sess = session.Session()
 
+    input_arrays_map = [("inputA", [1, 16, 16, 3]), ("inputB", [1, 16, 16, 3])]
+    output_arrays = ["output"]
+    with self.assertRaises(ValueError) as error:
+      convert.toco_convert_graph_def(
+          sess.graph_def,
+          input_arrays_map,
+          output_arrays,
+          inference_type=lite_constants.QUANTIZED_UINT8)
+    self.assertEqual(
+        "std_dev and mean must be defined when inference_input_type is "
+        "QUANTIZED_UINT8.", str(error.exception))
+
+
+@test_util.run_v1_only("b/120545219")
 class ConvertTestOpHint(test_util.TensorFlowTestCase):
   """Test the hint to stub functionality."""
 
@@ -329,6 +368,27 @@ class ConvertTestOpHint(test_util.TensorFlowTestCase):
               output_nodes=[op_hint._tensor_name_base(output.name)]),
           set(["agg", "Const", "Identity"]))
 
+  def testConvertDtype(self):
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(lite_constants.FLOAT),
+        _types_pb2.FLOAT)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.float32), _types_pb2.FLOAT)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.int32), _types_pb2.INT32)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.int64), _types_pb2.INT64)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.string), _types_pb2.STRING)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.uint8),
+        _types_pb2.QUANTIZED_UINT8)
+    self.assertEqual(
+        convert.convert_dtype_to_tflite_type(dtypes.complex64),
+        _types_pb2.COMPLEX64)
+    with self.assertRaises(ValueError):
+      convert.convert_dtype_to_tflite_type(dtypes.bool)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
index e71752fe6318e8518a8e67d1bb006661b4bdd880..d14af439ec0ab600ea260da17ef0041cca25d629 100644
--- a/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
+++ b/tensorflow/lite/python/interpreter_wrapper/interpreter_wrapper.cc
@@ -124,6 +124,8 @@ int TfLiteTypeToPyArrayType(TfLiteType tf_lite_type) {
       return NPY_INT16;
     case kTfLiteUInt8:
       return NPY_UINT8;
+    case kTfLiteInt8:
+      return NPY_INT8;
     case kTfLiteInt64:
       return NPY_INT64;
     case kTfLiteString:
@@ -150,6 +152,8 @@ TfLiteType TfLiteTypeFromPyArray(PyArrayObject* array) {
       return kTfLiteInt16;
     case NPY_UINT8:
       return kTfLiteUInt8;
+    case NPY_INT8:
+      return kTfLiteInt8;
     case NPY_INT64:
       return kTfLiteInt64;
     case NPY_BOOL:
diff --git a/tensorflow/lite/python/lite.py b/tensorflow/lite/python/lite.py
index 5810553da2cf8e2bb3098e2928cf3c0c8c130d3b..1b20ff2f92b6a84c21972ccccbc27ec6f999d74b 100644
--- a/tensorflow/lite/python/lite.py
+++ b/tensorflow/lite/python/lite.py
@@ -25,8 +25,6 @@ EXPERIMENTAL: APIs here are unstable and likely to change without notice.
 @@convert_op_hints_to_stubs
 @@build_toco_convert_protos
 
-@@FLOAT
-@@QUANTIZED_UINT8
 @@TFLITE
 @@GRAPHVIZ_DOT
 
@@ -78,10 +76,10 @@ class TFLiteConverter(object):
   Attributes:
 
     inference_type: Target data type of real-number arrays in the output file.
-      Must be `{FLOAT, QUANTIZED_UINT8}`.  (default FLOAT)
+      Must be `{tf.float32, tf.uint8}`. (default tf.float32)
     inference_input_type: Target data type of real-number input arrays. Allows
       for a different type for input arrays in the case of quantization.
-      Must be `{FLOAT, QUANTIZED_UINT8}`. (default `inference_type`)
+      Must be `{tf.float32, tf.uint8}`. (default `inference_type`)
     output_format: Output file format. Currently must be `{TFLITE,
       GRAPHVIZ_DOT}`. (default TFLITE)
     quantized_input_stats: Dict of strings representing input tensor names
@@ -402,15 +400,16 @@ class TFLiteConverter(object):
     # Checks dimensions in input tensor.
     if self._has_valid_tensors():
       for tensor in self._input_tensors:
-        if not tensor.get_shape():
+        shape = tensor.get_shape()
+        if not shape or not shape.as_list():
           raise ValueError("Provide an input shape for input array "
                            "'{0}'.".format(_tensor_name(tensor)))
-        shape = tensor.get_shape().as_list()
-        if None in shape[1:]:
+        shape_list = shape.as_list()
+        if None in shape_list[1:]:
           raise ValueError(
               "None is only supported in the 1st dimension. Tensor '{0}' has "
-              "invalid shape '{1}'.".format(_tensor_name(tensor), shape))
-        elif shape[0] is None:
+              "invalid shape '{1}'.".format(_tensor_name(tensor), shape_list))
+        elif shape_list[0] is None:
           self._set_batch_size(batch_size=1)
 
     # Get quantization stats. Ensures there is one stat per name if the stats
diff --git a/tensorflow/lite/python/lite_constants.py b/tensorflow/lite/python/lite_constants.py
index fdefc5e6cf044894d45f6f5050b5c44f5c452acf..f5d6d1037952bed73ffa5adff13b4bdbf264185c 100644
--- a/tensorflow/lite/python/lite_constants.py
+++ b/tensorflow/lite/python/lite_constants.py
@@ -19,26 +19,25 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.lite.toco import types_pb2 as _types_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.util.all_util import remove_undocumented
 from tensorflow.python.util.tf_export import tf_export as _tf_export
 
-# Enum types from the protobuf promoted to the API
-FLOAT = _types_pb2.FLOAT
-INT32 = _types_pb2.INT32
-INT64 = _types_pb2.INT64
-STRING = _types_pb2.STRING
-QUANTIZED_UINT8 = _types_pb2.QUANTIZED_UINT8
-COMPLEX64 = _types_pb2.COMPLEX64
+FLOAT = dtypes.float32
+INT32 = dtypes.int32
+INT64 = dtypes.int64
+STRING = dtypes.string
+QUANTIZED_UINT8 = dtypes.uint8
+COMPLEX64 = dtypes.complex64
 TENSORFLOW_GRAPHDEF = _toco_flags_pb2.TENSORFLOW_GRAPHDEF
 TFLITE = _toco_flags_pb2.TFLITE
 GRAPHVIZ_DOT = _toco_flags_pb2.GRAPHVIZ_DOT
 
-_tf_export("lite.constants.FLOAT").export_constant(__name__, "FLOAT")
-_tf_export("lite.constants.INT32").export_constant(__name__, "INT32")
-_tf_export("lite.constants.INT64").export_constant(__name__, "INT64")
-_tf_export("lite.constants.STRING").export_constant(__name__, "STRING")
-_tf_export("lite.constants.QUANTIZED_UINT8").export_constant(
+_tf_export(v1=["lite.constants.FLOAT"]).export_constant(__name__, "FLOAT")
+_tf_export(v1=["lite.constants.INT32"]).export_constant(__name__, "INT32")
+_tf_export(v1=["lite.constants.INT64"]).export_constant(__name__, "INT64")
+_tf_export(v1=["lite.constants.STRING"]).export_constant(__name__, "STRING")
+_tf_export(v1=["lite.constants.QUANTIZED_UINT8"]).export_constant(
     __name__, "QUANTIZED_UINT8")
 _tf_export("lite.constants.TFLITE").export_constant(__name__, "TFLITE")
 _tf_export("lite.constants.GRAPHVIZ_DOT").export_constant(
diff --git a/tensorflow/lite/python/lite_test.py b/tensorflow/lite/python/lite_test.py
index 5a5697db92b2167de447c1a5b650aa5cd7203ac8..1f9c768b4441cc1385d93285d26eeee9b651ca83 100644
--- a/tensorflow/lite/python/lite_test.py
+++ b/tensorflow/lite/python/lite_test.py
@@ -80,6 +80,7 @@ class FromConstructor(test_util.TensorFlowTestCase):
     self.assertTrue(converter._has_valid_tensors())
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSessionTest(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -177,12 +178,57 @@ class FromSessionTest(test_util.TensorFlowTestCase):
         'Quantization input stats are not available for input tensors '
         '\'inputB\'.', str(error.exception))
 
+  def testIntermediateInputArray(self):
+    """Convert a model from an intermediate input array."""
+    in_tensor_init = array_ops.placeholder(
+        shape=[1, 16, 16, 3], dtype=dtypes.float32)
+    in_tensor_final = in_tensor_init + in_tensor_init
+    out_tensor = in_tensor_final + in_tensor_final
+    sess = session.Session()
+
+    # Convert model and ensure model is not None.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor_final],
+                                                  [out_tensor])
+    tflite_model = converter.convert()
+    self.assertTrue(tflite_model)
+
+    # Check values from converted model.
+    interpreter = Interpreter(model_content=tflite_model)
+    interpreter.allocate_tensors()
+
+    input_details = interpreter.get_input_details()
+    self.assertEqual(1, len(input_details))
+    self.assertEqual('add', input_details[0]['name'])
+    self.assertEqual(np.float32, input_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == input_details[0]['shape']).all())
+    self.assertEqual((0., 0.), input_details[0]['quantization'])
+
+    output_details = interpreter.get_output_details()
+    self.assertEqual(1, len(output_details))
+    self.assertEqual('add_1', output_details[0]['name'])
+    self.assertEqual(np.float32, output_details[0]['dtype'])
+    self.assertTrue(([1, 16, 16, 3] == output_details[0]['shape']).all())
+    self.assertEqual((0., 0.), output_details[0]['quantization'])
+
   def testSizeNoneInvalid(self):
     in_tensor = array_ops.placeholder(dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
     sess = session.Session()
 
-    # Test invalid shape. None after 1st dimension.
+    # Test None as shape.
+    converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
+                                                  [out_tensor])
+    with self.assertRaises(ValueError) as error:
+      converter.convert()
+    self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
+                     str(error.exception))
+
+  def testSizeEmptyInvalid(self):
+    in_tensor = array_ops.placeholder(dtype=dtypes.float32, shape=[])
+    out_tensor = in_tensor + in_tensor
+    sess = session.Session()
+
+    # Test empty shape.
     converter = lite.TFLiteConverter.from_session(sess, [in_tensor],
                                                   [out_tensor])
     with self.assertRaises(ValueError) as error:
@@ -190,7 +236,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     self.assertEqual('Provide an input shape for input array \'Placeholder\'.',
                      str(error.exception))
 
-  def testBatchSizeInvalid(self):
+  def testSizeInvalid(self):
     in_tensor = array_ops.placeholder(
         shape=[1, None, 16, 3], dtype=dtypes.float32)
     out_tensor = in_tensor + in_tensor
@@ -452,6 +498,7 @@ class FromSessionTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromFrozenGraphFile(test_util.TensorFlowTestCase):
 
   def testFloat(self):
@@ -699,6 +746,7 @@ class FromFrozenGraphFile(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromSavedModelTest(test_util.TensorFlowTestCase):
 
   def _createSavedModel(self, shape):
@@ -843,6 +891,7 @@ class FromSavedModelTest(test_util.TensorFlowTestCase):
     interpreter.allocate_tensors()
 
 
+@test_util.run_v1_only('b/120545219')
 class FromKerasFile(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -931,12 +980,13 @@ class FromKerasFile(test_util.TensorFlowTestCase):
     """Test a Sequential tf.keras model testing input shapes argument."""
     keras_file = self._getSequentialModel()
 
-    # Passing in shape of invalid input array has no impact as long as all input
-    # arrays have a shape.
-    converter = lite.TFLiteConverter.from_keras_model_file(
-        keras_file, input_shapes={'invalid-input': [2, 3]})
-    tflite_model = converter.convert()
-    self.assertTrue(tflite_model)
+    # Passing in shape of invalid input array raises error.
+    with self.assertRaises(ValueError) as error:
+      converter = lite.TFLiteConverter.from_keras_model_file(
+          keras_file, input_shapes={'invalid-input': [2, 3]})
+    self.assertEqual(
+        "Invalid tensor 'invalid-input' found in tensor shapes map.",
+        str(error.exception))
 
     # Passing in shape of valid input array.
     converter = lite.TFLiteConverter.from_keras_model_file(
diff --git a/tensorflow/lite/python/op_hint.py b/tensorflow/lite/python/op_hint.py
index 3afce1baf2e3c214eba7e73147e9a946c551bd5b..8d7f9316bfe81255510fc5aca9ffdf9671cd64df 100644
--- a/tensorflow/lite/python/op_hint.py
+++ b/tensorflow/lite/python/op_hint.py
@@ -104,9 +104,9 @@ class OpHint(object):
   that make up the pseudo op. A similar process is done to any output that
   is to be exported from the current op.
 
-  TODO(aselle): When TensorFlow functions functionality works for arbitrary
-  constructs, this mechanism can be retired and changed to use python defun's.
   """
+  # TODO(aselle): When TensorFlow functions functionality works for arbitrary
+  # constructs, this mechanism can be retired and changed to use python defun's.
 
   # Attr constants that are used for representation in the GraphDef. These
   # will be used on every Identity op that is involved in a total OpHint.
@@ -403,7 +403,7 @@ class _LiteOperand(object):
       out_graphdef: A graphdef that is ready to have this input added.
 
     Returns:
-      The the output that the stub should use as an input for this operand.
+      The output that the stub should use as an input for this operand.
 
     Raises:
       RuntimeError: if the method is not implemented.
diff --git a/tensorflow/lite/python/tflite_convert.py b/tensorflow/lite/python/tflite_convert.py
index 00ea6d722e249364200309204120effb413aaaff..341b539bead296ca28c1f5f8c17928e553ebabc4 100644
--- a/tensorflow/lite/python/tflite_convert.py
+++ b/tensorflow/lite/python/tflite_convert.py
@@ -25,7 +25,6 @@ import sys
 from tensorflow.lite.python import lite
 from tensorflow.lite.python import lite_constants
 from tensorflow.lite.toco import toco_flags_pb2 as _toco_flags_pb2
-from tensorflow.lite.toco import types_pb2 as _types_pb2
 from tensorflow.python.platform import app
 
 
@@ -41,6 +40,27 @@ def _parse_set(values):
   return None
 
 
+def _parse_inference_type(value, flag):
+  """Converts the inference type to the value of the constant.
+
+  Args:
+    value: str representing the inference type.
+    flag: str representing the flag name.
+
+  Returns:
+    tf.dtype.
+
+  Raises:
+    ValueError: Unsupported value.
+  """
+  if value == "FLOAT":
+    return lite_constants.FLOAT
+  if value == "QUANTIZED_UINT8":
+    return lite_constants.QUANTIZED_UINT8
+  raise ValueError("Unsupported value for --{0}. Only FLOAT and "
+                   "QUANTIZED_UINT8 are supported.".format(flag))
+
+
 def _get_toco_converter(flags):
   """Makes a TFLiteConverter object based on the flags provided.
 
@@ -101,10 +121,11 @@ def _convert_model(flags):
   # Create converter.
   converter = _get_toco_converter(flags)
   if flags.inference_type:
-    converter.inference_type = _types_pb2.IODataType.Value(flags.inference_type)
+    converter.inference_type = _parse_inference_type(flags.inference_type,
+                                                     "inference_type")
   if flags.inference_input_type:
-    converter.inference_input_type = _types_pb2.IODataType.Value(
-        flags.inference_input_type)
+    converter.inference_input_type = _parse_inference_type(
+        flags.inference_input_type, "inference_input_type")
   if flags.output_format:
     converter.output_format = _toco_flags_pb2.FileFormat.Value(
         flags.output_format)
@@ -115,7 +136,7 @@ def _convert_model(flags):
 
     # In quantized inference, mean_value has to be integer so that the real
     # value 0.0 is exactly representable.
-    if flags.inference_type == lite_constants.QUANTIZED_UINT8:
+    if converter.inference_type == lite_constants.QUANTIZED_UINT8:
       mean_values = _parse_array(flags.mean_values, type_fn=int)
     else:
       mean_values = _parse_array(flags.mean_values, type_fn=float)
@@ -156,7 +177,7 @@ def _convert_model(flags):
 
   if flags.post_training_quantize:
     converter.post_training_quantize = flags.post_training_quantize
-    if flags.inference_type == lite_constants.QUANTIZED_UINT8:
+    if converter.inference_type == lite_constants.QUANTIZED_UINT8:
       print("--post_training_quantize quantizes a graph of inference_type "
             "FLOAT. Overriding inference type QUANTIZED_UINT8 to FLOAT.")
       converter.inference_type = lite_constants.FLOAT
diff --git a/tensorflow/lite/schema/builtin_ops_header/BUILD b/tensorflow/lite/schema/builtin_ops_header/BUILD
index 8a01541d575e288b94f8bb049caa288a777d61d8..52cbd052d6aa8cafcf562eb483638915be297cf7 100644
--- a/tensorflow/lite/schema/builtin_ops_header/BUILD
+++ b/tensorflow/lite/schema/builtin_ops_header/BUILD
@@ -24,7 +24,6 @@ cc_binary(
 cc_test(
     name = "generator_test",
     srcs = ["generator_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":generator",
         "@com_google_googletest//:gtest",
@@ -37,7 +36,6 @@ cc_test(
     data = [
         "//tensorflow/lite:builtin_ops.h",
     ],
-    tags = ["no_oss"],
     deps = [
         ":generator",
         "@com_google_googletest//:gtest",
diff --git a/tensorflow/lite/schema/schema.fbs b/tensorflow/lite/schema/schema.fbs
index 9b0eae74c3bb07d91a254185e5d640dc29fef233..980f13b19b4f6a32fe8b693c560be2b4f4f95fd9 100644
--- a/tensorflow/lite/schema/schema.fbs
+++ b/tensorflow/lite/schema/schema.fbs
@@ -45,7 +45,7 @@ enum TensorType : byte {
 // Custom quantization parameters for experimenting with new quantization
 // techniques.
 table CustomQuantization {
-  custom:[byte];
+  custom:[ubyte] (force_align: 16);
 }
 
 // Represents a specific quantization technique's parameters.
@@ -200,6 +200,11 @@ enum BuiltinOperator : byte {
   FLOOR_MOD = 95,
   RANGE = 96,
   RESIZE_NEAREST_NEIGHBOR = 97,
+  LEAKY_RELU = 98,
+  SQUARED_DIFFERENCE = 99,
+  MIRROR_PAD = 100,
+  ABS = 101,
+  SPLIT_V = 102,
 }
 
 // Options for the builtin operators.
@@ -278,6 +283,11 @@ union BuiltinOptions {
   FloorModOptions,
   RangeOptions,
   ResizeNearestNeighborOptions,
+  LeakyReluOptions,
+  SquaredDifferenceOptions,
+  MirrorPadOptions,
+  AbsOptions,
+  SplitVOptions,
 }
 
 enum Padding : byte { SAME, VALID }
@@ -526,6 +536,10 @@ table SplitOptions {
   num_splits: int;
 }
 
+table SplitVOptions {
+  num_splits: int;
+}
+
 table StridedSliceOptions {
   begin_mask: int;
   end_mask: int;
@@ -629,6 +643,10 @@ table OneHotOptions {
   axis:int;
 }
 
+table AbsOptions {
+}
+
+
 table LogicalAndOptions {
 }
 
@@ -658,6 +676,24 @@ table FloorModOptions {
 table RangeOptions {
 }
 
+table LeakyReluOptions {
+  alpha:float;
+}
+
+table SquaredDifferenceOptions {
+}
+
+enum MirrorPadMode : byte {
+  // Doesn't include borders.
+  REFLECT = 0,
+  // Includes borders.
+  SYMMETRIC = 1,
+}
+
+table MirrorPadOptions {
+  mode:MirrorPadMode;
+}
+
 // An OperatorCode can be an enum value (BuiltinOperator) if the operator is a
 // builtin, or a string if the operator is custom.
 table OperatorCode {
diff --git a/tensorflow/lite/schema/schema_generated.h b/tensorflow/lite/schema/schema_generated.h
index b7885cfcc50fb49e5e77167f71e377bcb3196122..637cbafabdad47892b1e3f4a93837b44d50a5b46 100755
--- a/tensorflow/lite/schema/schema_generated.h
+++ b/tensorflow/lite/schema/schema_generated.h
@@ -148,6 +148,9 @@ struct SqueezeOptionsT;
 struct SplitOptions;
 struct SplitOptionsT;
 
+struct SplitVOptions;
+struct SplitVOptionsT;
+
 struct StridedSliceOptions;
 struct StridedSliceOptionsT;
 
@@ -226,6 +229,9 @@ struct LogicalOrOptionsT;
 struct OneHotOptions;
 struct OneHotOptionsT;
 
+struct AbsOptions;
+struct AbsOptionsT;
+
 struct LogicalAndOptions;
 struct LogicalAndOptionsT;
 
@@ -253,6 +259,15 @@ struct FloorModOptionsT;
 struct RangeOptions;
 struct RangeOptionsT;
 
+struct LeakyReluOptions;
+struct LeakyReluOptionsT;
+
+struct SquaredDifferenceOptions;
+struct SquaredDifferenceOptionsT;
+
+struct MirrorPadOptions;
+struct MirrorPadOptionsT;
+
 struct OperatorCode;
 struct OperatorCodeT;
 
@@ -500,11 +515,16 @@ enum BuiltinOperator {
   BuiltinOperator_FLOOR_MOD = 95,
   BuiltinOperator_RANGE = 96,
   BuiltinOperator_RESIZE_NEAREST_NEIGHBOR = 97,
+  BuiltinOperator_LEAKY_RELU = 98,
+  BuiltinOperator_SQUARED_DIFFERENCE = 99,
+  BuiltinOperator_MIRROR_PAD = 100,
+  BuiltinOperator_ABS = 101,
+  BuiltinOperator_SPLIT_V = 102,
   BuiltinOperator_MIN = BuiltinOperator_ADD,
-  BuiltinOperator_MAX = BuiltinOperator_RESIZE_NEAREST_NEIGHBOR
+  BuiltinOperator_MAX = BuiltinOperator_SPLIT_V
 };
 
-inline const BuiltinOperator (&EnumValuesBuiltinOperator())[97] {
+inline const BuiltinOperator (&EnumValuesBuiltinOperator())[102] {
   static const BuiltinOperator values[] = {
     BuiltinOperator_ADD,
     BuiltinOperator_AVERAGE_POOL_2D,
@@ -602,7 +622,12 @@ inline const BuiltinOperator (&EnumValuesBuiltinOperator())[97] {
     BuiltinOperator_FILL,
     BuiltinOperator_FLOOR_MOD,
     BuiltinOperator_RANGE,
-    BuiltinOperator_RESIZE_NEAREST_NEIGHBOR
+    BuiltinOperator_RESIZE_NEAREST_NEIGHBOR,
+    BuiltinOperator_LEAKY_RELU,
+    BuiltinOperator_SQUARED_DIFFERENCE,
+    BuiltinOperator_MIRROR_PAD,
+    BuiltinOperator_ABS,
+    BuiltinOperator_SPLIT_V
   };
   return values;
 }
@@ -707,6 +732,11 @@ inline const char * const *EnumNamesBuiltinOperator() {
     "FLOOR_MOD",
     "RANGE",
     "RESIZE_NEAREST_NEIGHBOR",
+    "LEAKY_RELU",
+    "SQUARED_DIFFERENCE",
+    "MIRROR_PAD",
+    "ABS",
+    "SPLIT_V",
     nullptr
   };
   return names;
@@ -793,11 +823,16 @@ enum BuiltinOptions {
   BuiltinOptions_FloorModOptions = 72,
   BuiltinOptions_RangeOptions = 73,
   BuiltinOptions_ResizeNearestNeighborOptions = 74,
+  BuiltinOptions_LeakyReluOptions = 75,
+  BuiltinOptions_SquaredDifferenceOptions = 76,
+  BuiltinOptions_MirrorPadOptions = 77,
+  BuiltinOptions_AbsOptions = 78,
+  BuiltinOptions_SplitVOptions = 79,
   BuiltinOptions_MIN = BuiltinOptions_NONE,
-  BuiltinOptions_MAX = BuiltinOptions_ResizeNearestNeighborOptions
+  BuiltinOptions_MAX = BuiltinOptions_SplitVOptions
 };
 
-inline const BuiltinOptions (&EnumValuesBuiltinOptions())[75] {
+inline const BuiltinOptions (&EnumValuesBuiltinOptions())[80] {
   static const BuiltinOptions values[] = {
     BuiltinOptions_NONE,
     BuiltinOptions_Conv2DOptions,
@@ -873,7 +908,12 @@ inline const BuiltinOptions (&EnumValuesBuiltinOptions())[75] {
     BuiltinOptions_UnidirectionalSequenceLSTMOptions,
     BuiltinOptions_FloorModOptions,
     BuiltinOptions_RangeOptions,
-    BuiltinOptions_ResizeNearestNeighborOptions
+    BuiltinOptions_ResizeNearestNeighborOptions,
+    BuiltinOptions_LeakyReluOptions,
+    BuiltinOptions_SquaredDifferenceOptions,
+    BuiltinOptions_MirrorPadOptions,
+    BuiltinOptions_AbsOptions,
+    BuiltinOptions_SplitVOptions
   };
   return values;
 }
@@ -955,6 +995,11 @@ inline const char * const *EnumNamesBuiltinOptions() {
     "FloorModOptions",
     "RangeOptions",
     "ResizeNearestNeighborOptions",
+    "LeakyReluOptions",
+    "SquaredDifferenceOptions",
+    "MirrorPadOptions",
+    "AbsOptions",
+    "SplitVOptions",
     nullptr
   };
   return names;
@@ -1265,6 +1310,26 @@ template<> struct BuiltinOptionsTraits<ResizeNearestNeighborOptions> {
   static const BuiltinOptions enum_value = BuiltinOptions_ResizeNearestNeighborOptions;
 };
 
+template<> struct BuiltinOptionsTraits<LeakyReluOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_LeakyReluOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SquaredDifferenceOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SquaredDifferenceOptions;
+};
+
+template<> struct BuiltinOptionsTraits<MirrorPadOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_MirrorPadOptions;
+};
+
+template<> struct BuiltinOptionsTraits<AbsOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_AbsOptions;
+};
+
+template<> struct BuiltinOptionsTraits<SplitVOptions> {
+  static const BuiltinOptions enum_value = BuiltinOptions_SplitVOptions;
+};
+
 struct BuiltinOptionsUnion {
   BuiltinOptions type;
   void *value;
@@ -1888,6 +1953,46 @@ struct BuiltinOptionsUnion {
     return type == BuiltinOptions_ResizeNearestNeighborOptions ?
       reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value) : nullptr;
   }
+  LeakyReluOptionsT *AsLeakyReluOptions() {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<LeakyReluOptionsT *>(value) : nullptr;
+  }
+  const LeakyReluOptionsT *AsLeakyReluOptions() const {
+    return type == BuiltinOptions_LeakyReluOptions ?
+      reinterpret_cast<const LeakyReluOptionsT *>(value) : nullptr;
+  }
+  SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  const SquaredDifferenceOptionsT *AsSquaredDifferenceOptions() const {
+    return type == BuiltinOptions_SquaredDifferenceOptions ?
+      reinterpret_cast<const SquaredDifferenceOptionsT *>(value) : nullptr;
+  }
+  MirrorPadOptionsT *AsMirrorPadOptions() {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<MirrorPadOptionsT *>(value) : nullptr;
+  }
+  const MirrorPadOptionsT *AsMirrorPadOptions() const {
+    return type == BuiltinOptions_MirrorPadOptions ?
+      reinterpret_cast<const MirrorPadOptionsT *>(value) : nullptr;
+  }
+  AbsOptionsT *AsAbsOptions() {
+    return type == BuiltinOptions_AbsOptions ?
+      reinterpret_cast<AbsOptionsT *>(value) : nullptr;
+  }
+  const AbsOptionsT *AsAbsOptions() const {
+    return type == BuiltinOptions_AbsOptions ?
+      reinterpret_cast<const AbsOptionsT *>(value) : nullptr;
+  }
+  SplitVOptionsT *AsSplitVOptions() {
+    return type == BuiltinOptions_SplitVOptions ?
+      reinterpret_cast<SplitVOptionsT *>(value) : nullptr;
+  }
+  const SplitVOptionsT *AsSplitVOptions() const {
+    return type == BuiltinOptions_SplitVOptions ?
+      reinterpret_cast<const SplitVOptionsT *>(value) : nullptr;
+  }
 };
 
 bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *obj, BuiltinOptions type);
@@ -2085,6 +2190,35 @@ inline const char *EnumNameCombinerType(CombinerType e) {
   return EnumNamesCombinerType()[index];
 }
 
+enum MirrorPadMode {
+  MirrorPadMode_REFLECT = 0,
+  MirrorPadMode_SYMMETRIC = 1,
+  MirrorPadMode_MIN = MirrorPadMode_REFLECT,
+  MirrorPadMode_MAX = MirrorPadMode_SYMMETRIC
+};
+
+inline const MirrorPadMode (&EnumValuesMirrorPadMode())[2] {
+  static const MirrorPadMode values[] = {
+    MirrorPadMode_REFLECT,
+    MirrorPadMode_SYMMETRIC
+  };
+  return values;
+}
+
+inline const char * const *EnumNamesMirrorPadMode() {
+  static const char * const names[] = {
+    "REFLECT",
+    "SYMMETRIC",
+    nullptr
+  };
+  return names;
+}
+
+inline const char *EnumNameMirrorPadMode(MirrorPadMode e) {
+  const size_t index = static_cast<int>(e);
+  return EnumNamesMirrorPadMode()[index];
+}
+
 enum CustomOptionsFormat {
   CustomOptionsFormat_FLEXBUFFERS = 0,
   CustomOptionsFormat_MIN = CustomOptionsFormat_FLEXBUFFERS,
@@ -2113,7 +2247,7 @@ inline const char *EnumNameCustomOptionsFormat(CustomOptionsFormat e) {
 
 struct CustomQuantizationT : public flatbuffers::NativeTable {
   typedef CustomQuantization TableType;
-  std::vector<int8_t> custom;
+  std::vector<uint8_t> custom;
   CustomQuantizationT() {
   }
 };
@@ -2123,8 +2257,8 @@ struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   enum {
     VT_CUSTOM = 4
   };
-  const flatbuffers::Vector<int8_t> *custom() const {
-    return GetPointer<const flatbuffers::Vector<int8_t> *>(VT_CUSTOM);
+  const flatbuffers::Vector<uint8_t> *custom() const {
+    return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM);
   }
   bool Verify(flatbuffers::Verifier &verifier) const {
     return VerifyTableStart(verifier) &&
@@ -2140,7 +2274,7 @@ struct CustomQuantization FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
 struct CustomQuantizationBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
-  void add_custom(flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom) {
+  void add_custom(flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom) {
     fbb_.AddOffset(CustomQuantization::VT_CUSTOM, custom);
   }
   explicit CustomQuantizationBuilder(flatbuffers::FlatBufferBuilder &_fbb)
@@ -2157,7 +2291,7 @@ struct CustomQuantizationBuilder {
 
 inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
     flatbuffers::FlatBufferBuilder &_fbb,
-    flatbuffers::Offset<flatbuffers::Vector<int8_t>> custom = 0) {
+    flatbuffers::Offset<flatbuffers::Vector<uint8_t>> custom = 0) {
   CustomQuantizationBuilder builder_(_fbb);
   builder_.add_custom(custom);
   return builder_.Finish();
@@ -2165,10 +2299,10 @@ inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(
 
 inline flatbuffers::Offset<CustomQuantization> CreateCustomQuantizationDirect(
     flatbuffers::FlatBufferBuilder &_fbb,
-    const std::vector<int8_t> *custom = nullptr) {
+    const std::vector<uint8_t> *custom = nullptr) {
   return tflite::CreateCustomQuantization(
       _fbb,
-      custom ? _fbb.CreateVector<int8_t>(*custom) : 0);
+      custom ? _fbb.CreateVector<uint8_t>(*custom) : 0);
 }
 
 flatbuffers::Offset<CustomQuantization> CreateCustomQuantization(flatbuffers::FlatBufferBuilder &_fbb, const CustomQuantizationT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
@@ -4935,6 +5069,60 @@ inline flatbuffers::Offset<SplitOptions> CreateSplitOptions(
 
 flatbuffers::Offset<SplitOptions> CreateSplitOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct SplitVOptionsT : public flatbuffers::NativeTable {
+  typedef SplitVOptions TableType;
+  int32_t num_splits;
+  SplitVOptionsT()
+      : num_splits(0) {
+  }
+};
+
+struct SplitVOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SplitVOptionsT NativeTableType;
+  enum {
+    VT_NUM_SPLITS = 4
+  };
+  int32_t num_splits() const {
+    return GetField<int32_t>(VT_NUM_SPLITS, 0);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int32_t>(verifier, VT_NUM_SPLITS) &&
+           verifier.EndTable();
+  }
+  SplitVOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SplitVOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SplitVOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SplitVOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_num_splits(int32_t num_splits) {
+    fbb_.AddElement<int32_t>(SplitVOptions::VT_NUM_SPLITS, num_splits, 0);
+  }
+  explicit SplitVOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SplitVOptionsBuilder &operator=(const SplitVOptionsBuilder &);
+  flatbuffers::Offset<SplitVOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SplitVOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    int32_t num_splits = 0) {
+  SplitVOptionsBuilder builder_(_fbb);
+  builder_.add_num_splits(num_splits);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct StridedSliceOptionsT : public flatbuffers::NativeTable {
   typedef StridedSliceOptions TableType;
   int32_t begin_mask;
@@ -6247,6 +6435,46 @@ inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(
 
 flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatBufferBuilder &_fbb, const OneHotOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct AbsOptionsT : public flatbuffers::NativeTable {
+  typedef AbsOptions TableType;
+  AbsOptionsT() {
+  }
+};
+
+struct AbsOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef AbsOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  AbsOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(AbsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<AbsOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct AbsOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit AbsOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  AbsOptionsBuilder &operator=(const AbsOptionsBuilder &);
+  flatbuffers::Offset<AbsOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<AbsOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<AbsOptions> CreateAbsOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  AbsOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<AbsOptions> CreateAbsOptions(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct LogicalAndOptionsT : public flatbuffers::NativeTable {
   typedef LogicalAndOptions TableType;
   LogicalAndOptionsT() {
@@ -6633,6 +6861,154 @@ inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(
 
 flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBufferBuilder &_fbb, const RangeOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
 
+struct LeakyReluOptionsT : public flatbuffers::NativeTable {
+  typedef LeakyReluOptions TableType;
+  float alpha;
+  LeakyReluOptionsT()
+      : alpha(0.0f) {
+  }
+};
+
+struct LeakyReluOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef LeakyReluOptionsT NativeTableType;
+  enum {
+    VT_ALPHA = 4
+  };
+  float alpha() const {
+    return GetField<float>(VT_ALPHA, 0.0f);
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<float>(verifier, VT_ALPHA) &&
+           verifier.EndTable();
+  }
+  LeakyReluOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<LeakyReluOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct LeakyReluOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_alpha(float alpha) {
+    fbb_.AddElement<float>(LeakyReluOptions::VT_ALPHA, alpha, 0.0f);
+  }
+  explicit LeakyReluOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  LeakyReluOptionsBuilder &operator=(const LeakyReluOptionsBuilder &);
+  flatbuffers::Offset<LeakyReluOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<LeakyReluOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    float alpha = 0.0f) {
+  LeakyReluOptionsBuilder builder_(_fbb);
+  builder_.add_alpha(alpha);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct SquaredDifferenceOptionsT : public flatbuffers::NativeTable {
+  typedef SquaredDifferenceOptions TableType;
+  SquaredDifferenceOptionsT() {
+  }
+};
+
+struct SquaredDifferenceOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef SquaredDifferenceOptionsT NativeTableType;
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           verifier.EndTable();
+  }
+  SquaredDifferenceOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<SquaredDifferenceOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct SquaredDifferenceOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  explicit SquaredDifferenceOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  SquaredDifferenceOptionsBuilder &operator=(const SquaredDifferenceOptionsBuilder &);
+  flatbuffers::Offset<SquaredDifferenceOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<SquaredDifferenceOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(
+    flatbuffers::FlatBufferBuilder &_fbb) {
+  SquaredDifferenceOptionsBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
+struct MirrorPadOptionsT : public flatbuffers::NativeTable {
+  typedef MirrorPadOptions TableType;
+  MirrorPadMode mode;
+  MirrorPadOptionsT()
+      : mode(MirrorPadMode_REFLECT) {
+  }
+};
+
+struct MirrorPadOptions FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
+  typedef MirrorPadOptionsT NativeTableType;
+  enum {
+    VT_MODE = 4
+  };
+  MirrorPadMode mode() const {
+    return static_cast<MirrorPadMode>(GetField<int8_t>(VT_MODE, 0));
+  }
+  bool Verify(flatbuffers::Verifier &verifier) const {
+    return VerifyTableStart(verifier) &&
+           VerifyField<int8_t>(verifier, VT_MODE) &&
+           verifier.EndTable();
+  }
+  MirrorPadOptionsT *UnPack(const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  void UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver = nullptr) const;
+  static flatbuffers::Offset<MirrorPadOptions> Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+};
+
+struct MirrorPadOptionsBuilder {
+  flatbuffers::FlatBufferBuilder &fbb_;
+  flatbuffers::uoffset_t start_;
+  void add_mode(MirrorPadMode mode) {
+    fbb_.AddElement<int8_t>(MirrorPadOptions::VT_MODE, static_cast<int8_t>(mode), 0);
+  }
+  explicit MirrorPadOptionsBuilder(flatbuffers::FlatBufferBuilder &_fbb)
+        : fbb_(_fbb) {
+    start_ = fbb_.StartTable();
+  }
+  MirrorPadOptionsBuilder &operator=(const MirrorPadOptionsBuilder &);
+  flatbuffers::Offset<MirrorPadOptions> Finish() {
+    const auto end = fbb_.EndTable(start_);
+    auto o = flatbuffers::Offset<MirrorPadOptions>(end);
+    return o;
+  }
+};
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(
+    flatbuffers::FlatBufferBuilder &_fbb,
+    MirrorPadMode mode = MirrorPadMode_REFLECT) {
+  MirrorPadOptionsBuilder builder_(_fbb);
+  builder_.add_mode(mode);
+  return builder_.Finish();
+}
+
+flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher = nullptr);
+
 struct OperatorCodeT : public flatbuffers::NativeTable {
   typedef OperatorCode TableType;
   BuiltinOperator builtin_code;
@@ -6988,6 +7364,21 @@ struct Operator FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table {
   const ResizeNearestNeighborOptions *builtin_options_as_ResizeNearestNeighborOptions() const {
     return builtin_options_type() == BuiltinOptions_ResizeNearestNeighborOptions ? static_cast<const ResizeNearestNeighborOptions *>(builtin_options()) : nullptr;
   }
+  const LeakyReluOptions *builtin_options_as_LeakyReluOptions() const {
+    return builtin_options_type() == BuiltinOptions_LeakyReluOptions ? static_cast<const LeakyReluOptions *>(builtin_options()) : nullptr;
+  }
+  const SquaredDifferenceOptions *builtin_options_as_SquaredDifferenceOptions() const {
+    return builtin_options_type() == BuiltinOptions_SquaredDifferenceOptions ? static_cast<const SquaredDifferenceOptions *>(builtin_options()) : nullptr;
+  }
+  const MirrorPadOptions *builtin_options_as_MirrorPadOptions() const {
+    return builtin_options_type() == BuiltinOptions_MirrorPadOptions ? static_cast<const MirrorPadOptions *>(builtin_options()) : nullptr;
+  }
+  const AbsOptions *builtin_options_as_AbsOptions() const {
+    return builtin_options_type() == BuiltinOptions_AbsOptions ? static_cast<const AbsOptions *>(builtin_options()) : nullptr;
+  }
+  const SplitVOptions *builtin_options_as_SplitVOptions() const {
+    return builtin_options_type() == BuiltinOptions_SplitVOptions ? static_cast<const SplitVOptions *>(builtin_options()) : nullptr;
+  }
   const flatbuffers::Vector<uint8_t> *custom_options() const {
     return GetPointer<const flatbuffers::Vector<uint8_t> *>(VT_CUSTOM_OPTIONS);
   }
@@ -7315,6 +7706,26 @@ template<> inline const ResizeNearestNeighborOptions *Operator::builtin_options_
   return builtin_options_as_ResizeNearestNeighborOptions();
 }
 
+template<> inline const LeakyReluOptions *Operator::builtin_options_as<LeakyReluOptions>() const {
+  return builtin_options_as_LeakyReluOptions();
+}
+
+template<> inline const SquaredDifferenceOptions *Operator::builtin_options_as<SquaredDifferenceOptions>() const {
+  return builtin_options_as_SquaredDifferenceOptions();
+}
+
+template<> inline const MirrorPadOptions *Operator::builtin_options_as<MirrorPadOptions>() const {
+  return builtin_options_as_MirrorPadOptions();
+}
+
+template<> inline const AbsOptions *Operator::builtin_options_as<AbsOptions>() const {
+  return builtin_options_as_AbsOptions();
+}
+
+template<> inline const SplitVOptions *Operator::builtin_options_as<SplitVOptions>() const {
+  return builtin_options_as_SplitVOptions();
+}
+
 struct OperatorBuilder {
   flatbuffers::FlatBufferBuilder &fbb_;
   flatbuffers::uoffset_t start_;
@@ -8932,6 +9343,32 @@ inline flatbuffers::Offset<SplitOptions> CreateSplitOptions(flatbuffers::FlatBuf
       _num_splits);
 }
 
+inline SplitVOptionsT *SplitVOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SplitVOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SplitVOptions::UnPackTo(SplitVOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = num_splits(); _o->num_splits = _e; };
+}
+
+inline flatbuffers::Offset<SplitVOptions> SplitVOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSplitVOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SplitVOptions> CreateSplitVOptions(flatbuffers::FlatBufferBuilder &_fbb, const SplitVOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SplitVOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _num_splits = _o->num_splits;
+  return tflite::CreateSplitVOptions(
+      _fbb,
+      _num_splits);
+}
+
 inline StridedSliceOptionsT *StridedSliceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new StridedSliceOptionsT();
   UnPackTo(_o, _resolver);
@@ -9593,6 +10030,29 @@ inline flatbuffers::Offset<OneHotOptions> CreateOneHotOptions(flatbuffers::FlatB
       _axis);
 }
 
+inline AbsOptionsT *AbsOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new AbsOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void AbsOptions::UnPackTo(AbsOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<AbsOptions> AbsOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateAbsOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<AbsOptions> CreateAbsOptions(flatbuffers::FlatBufferBuilder &_fbb, const AbsOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const AbsOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateAbsOptions(
+      _fbb);
+}
+
 inline LogicalAndOptionsT *LogicalAndOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new LogicalAndOptionsT();
   UnPackTo(_o, _resolver);
@@ -9806,6 +10266,81 @@ inline flatbuffers::Offset<RangeOptions> CreateRangeOptions(flatbuffers::FlatBuf
       _fbb);
 }
 
+inline LeakyReluOptionsT *LeakyReluOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new LeakyReluOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void LeakyReluOptions::UnPackTo(LeakyReluOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = alpha(); _o->alpha = _e; };
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> LeakyReluOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateLeakyReluOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<LeakyReluOptions> CreateLeakyReluOptions(flatbuffers::FlatBufferBuilder &_fbb, const LeakyReluOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const LeakyReluOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _alpha = _o->alpha;
+  return tflite::CreateLeakyReluOptions(
+      _fbb,
+      _alpha);
+}
+
+inline SquaredDifferenceOptionsT *SquaredDifferenceOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new SquaredDifferenceOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void SquaredDifferenceOptions::UnPackTo(SquaredDifferenceOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> SquaredDifferenceOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateSquaredDifferenceOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<SquaredDifferenceOptions> CreateSquaredDifferenceOptions(flatbuffers::FlatBufferBuilder &_fbb, const SquaredDifferenceOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const SquaredDifferenceOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  return tflite::CreateSquaredDifferenceOptions(
+      _fbb);
+}
+
+inline MirrorPadOptionsT *MirrorPadOptions::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
+  auto _o = new MirrorPadOptionsT();
+  UnPackTo(_o, _resolver);
+  return _o;
+}
+
+inline void MirrorPadOptions::UnPackTo(MirrorPadOptionsT *_o, const flatbuffers::resolver_function_t *_resolver) const {
+  (void)_o;
+  (void)_resolver;
+  { auto _e = mode(); _o->mode = _e; };
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> MirrorPadOptions::Pack(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT* _o, const flatbuffers::rehasher_function_t *_rehasher) {
+  return CreateMirrorPadOptions(_fbb, _o, _rehasher);
+}
+
+inline flatbuffers::Offset<MirrorPadOptions> CreateMirrorPadOptions(flatbuffers::FlatBufferBuilder &_fbb, const MirrorPadOptionsT *_o, const flatbuffers::rehasher_function_t *_rehasher) {
+  (void)_rehasher;
+  (void)_o;
+  struct _VectorArgs { flatbuffers::FlatBufferBuilder *__fbb; const MirrorPadOptionsT* __o; const flatbuffers::rehasher_function_t *__rehasher; } _va = { &_fbb, _o, _rehasher}; (void)_va;
+  auto _mode = _o->mode;
+  return tflite::CreateMirrorPadOptions(
+      _fbb,
+      _mode);
+}
+
 inline OperatorCodeT *OperatorCode::UnPack(const flatbuffers::resolver_function_t *_resolver) const {
   auto _o = new OperatorCodeT();
   UnPackTo(_o, _resolver);
@@ -10360,6 +10895,26 @@ inline bool VerifyBuiltinOptions(flatbuffers::Verifier &verifier, const void *ob
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
       return verifier.VerifyTable(ptr);
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
+      return verifier.VerifyTable(ptr);
+    }
     default: return false;
   }
 }
@@ -10674,6 +11229,26 @@ inline void *BuiltinOptionsUnion::UnPack(const void *obj, BuiltinOptions type, c
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptions *>(obj);
       return ptr->UnPack(resolver);
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptions *>(obj);
+      return ptr->UnPack(resolver);
+    }
     default: return nullptr;
   }
 }
@@ -10976,6 +11551,26 @@ inline flatbuffers::Offset<void> BuiltinOptionsUnion::Pack(flatbuffers::FlatBuff
       auto ptr = reinterpret_cast<const ResizeNearestNeighborOptionsT *>(value);
       return CreateResizeNearestNeighborOptions(_fbb, ptr, _rehasher).Union();
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<const LeakyReluOptionsT *>(value);
+      return CreateLeakyReluOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<const SquaredDifferenceOptionsT *>(value);
+      return CreateSquaredDifferenceOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<const MirrorPadOptionsT *>(value);
+      return CreateMirrorPadOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<const AbsOptionsT *>(value);
+      return CreateAbsOptions(_fbb, ptr, _rehasher).Union();
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<const SplitVOptionsT *>(value);
+      return CreateSplitVOptions(_fbb, ptr, _rehasher).Union();
+    }
     default: return 0;
   }
 }
@@ -11278,6 +11873,26 @@ inline BuiltinOptionsUnion::BuiltinOptionsUnion(const BuiltinOptionsUnion &u) FL
       value = new ResizeNearestNeighborOptionsT(*reinterpret_cast<ResizeNearestNeighborOptionsT *>(u.value));
       break;
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      value = new LeakyReluOptionsT(*reinterpret_cast<LeakyReluOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      value = new SquaredDifferenceOptionsT(*reinterpret_cast<SquaredDifferenceOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      value = new MirrorPadOptionsT(*reinterpret_cast<MirrorPadOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_AbsOptions: {
+      value = new AbsOptionsT(*reinterpret_cast<AbsOptionsT *>(u.value));
+      break;
+    }
+    case BuiltinOptions_SplitVOptions: {
+      value = new SplitVOptionsT(*reinterpret_cast<SplitVOptionsT *>(u.value));
+      break;
+    }
     default:
       break;
   }
@@ -11655,6 +12270,31 @@ inline void BuiltinOptionsUnion::Reset() {
       delete ptr;
       break;
     }
+    case BuiltinOptions_LeakyReluOptions: {
+      auto ptr = reinterpret_cast<LeakyReluOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SquaredDifferenceOptions: {
+      auto ptr = reinterpret_cast<SquaredDifferenceOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_MirrorPadOptions: {
+      auto ptr = reinterpret_cast<MirrorPadOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_AbsOptions: {
+      auto ptr = reinterpret_cast<AbsOptionsT *>(value);
+      delete ptr;
+      break;
+    }
+    case BuiltinOptions_SplitVOptions: {
+      auto ptr = reinterpret_cast<SplitVOptionsT *>(value);
+      delete ptr;
+      break;
+    }
     default: break;
   }
   value = nullptr;
diff --git a/tensorflow/lite/string_util.cc b/tensorflow/lite/string_util.cc
index 1b33f5bcba01bf32d366436812df014c3fbc1390..6efa11d60c55540c099fadc33c7756ae8f77b97f 100644
--- a/tensorflow/lite/string_util.cc
+++ b/tensorflow/lite/string_util.cc
@@ -96,8 +96,7 @@ int DynamicBuffer::WriteToBuffer(char** buffer) {
   return bytes;
 }
 
-void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor) {
-  // Set tensor content pointer to tensor_buffer, and release original data.
+void DynamicBuffer::WriteToTensorAsVector(TfLiteTensor* tensor) {
   auto dims = TfLiteIntArrayCreate(1);
   dims->data[0] = offset_.size() - 1;  // Store number of strings.
   WriteToTensor(tensor, dims);
@@ -108,6 +107,10 @@ void DynamicBuffer::WriteToTensor(TfLiteTensor* tensor,
   char* tensor_buffer;
   int bytes = WriteToBuffer(&tensor_buffer);
 
+  if (new_shape == nullptr) {
+    new_shape = TfLiteIntArrayCopy(tensor->dims);
+  }
+
   // Set tensor content pointer to tensor_buffer, and release original data.
   TfLiteTensorReset(tensor->type, tensor->name, new_shape, tensor->params,
                     tensor_buffer, bytes, kTfLiteDynamic, tensor->allocation,
diff --git a/tensorflow/lite/string_util.h b/tensorflow/lite/string_util.h
index c9b74482f7d04b8cd667c18fc0a2aadc2f5f6490..f076db76f2d4ef416e5f7ec98ac2ec0aa94d95c2 100644
--- a/tensorflow/lite/string_util.h
+++ b/tensorflow/lite/string_util.h
@@ -74,12 +74,18 @@ class DynamicBuffer {
   // The function allocates space for the buffer but does NOT take ownership.
   int WriteToBuffer(char** buffer);
 
-  // Fill content into a string tensor, with the given new_shape. The new
-  // shape must match the number of strings in this object.
+  // Fill content into a string tensor, with the given new_shape. The new shape
+  // must match the number of strings in this object. Caller relinquishes
+  // ownership of new_shape. If 'new_shape' is nullptr, keep the tensor's
+  // existing shape.
   void WriteToTensor(TfLiteTensor* tensor, TfLiteIntArray* new_shape);
 
   // Fill content into a string tensor. Set shape to {num_strings}.
-  void WriteToTensor(TfLiteTensor* tensor);
+  void WriteToTensorAsVector(TfLiteTensor* tensor);
+
+  // Deprecated. Use WriteToTensorAsVector() or pass in the new shpe.
+  // TODO(b/120230709): remove when people migrate away.
+  void WriteToTensor(TfLiteTensor* tensor) { WriteToTensorAsVector(tensor); }
 
  private:
   // Data buffer to store contents of strings, not including headers.
diff --git a/tensorflow/lite/string_util_test.cc b/tensorflow/lite/string_util_test.cc
index 377cdd77eb4651bb057055cc4f7a4ab33cbb5297..cbf1d7b226af20251d5f70a354a21f1eb40ae1c6 100644
--- a/tensorflow/lite/string_util_test.cc
+++ b/tensorflow/lite/string_util_test.cc
@@ -55,7 +55,7 @@ TEST(StringUtil, TestStringUtil) {
   new_shape->data[0] = 2;
   new_shape->data[1] = 1;
   buf0.WriteToTensor(t0, new_shape);
-  buf1.WriteToTensor(t1);
+  buf1.WriteToTensorAsVector(t1);
 
   // Check tensor shapes.
   EXPECT_EQ(t0->dims->size, 2);
@@ -99,7 +99,7 @@ TEST(StringUtil, TestAddJoinedString) {
 
   DynamicBuffer buf;
   buf.AddJoinedString({{s0, 3}, {s1, 4}, {s2, 0}, {s3, 3}}, ' ');
-  buf.WriteToTensor(t0);
+  buf.WriteToTensorAsVector(t0);
 
   ASSERT_EQ(GetStringCount(t0), 1);
   StringRef str_ref;
@@ -115,12 +115,43 @@ TEST(StringUtil, TestEmptyList) {
   t0->type = kTfLiteString;
   t0->allocation_type = kTfLiteDynamic;
   DynamicBuffer buf;
-  buf.WriteToTensor(t0);
+  buf.WriteToTensorAsVector(t0);
 
   ASSERT_EQ(GetStringCount(t0), 0);
   ASSERT_EQ(t0->bytes, 8);
 }
 
+TEST(StringUtil, TestShapes) {
+  Interpreter interpreter;
+  interpreter.AddTensors(1);
+  TfLiteTensor* t0 = interpreter.tensor(0);
+  t0->type = kTfLiteString;
+  t0->allocation_type = kTfLiteDynamic;
+  t0->dims = TfLiteIntArrayCreate(2);
+  t0->dims->data[0] = 2;
+  t0->dims->data[1] = 1;
+
+  // Not setting a new shape: number of strings must match
+  DynamicBuffer buf;
+  buf.AddString("ABC", 3);
+  buf.AddString("X", 1);
+  buf.WriteToTensor(t0, nullptr);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 2);
+  EXPECT_EQ(t0->dims->data[1], 1);
+
+  auto new_shape = TfLiteIntArrayCreate(2);
+  new_shape->data[0] = 1;
+  new_shape->data[1] = 2;
+
+  buf.WriteToTensor(t0, new_shape);
+
+  ASSERT_EQ(t0->dims->size, 2);
+  EXPECT_EQ(t0->dims->data[0], 1);
+  EXPECT_EQ(t0->dims->data[1], 2);
+}
+
 }  // namespace tflite
 
 int main(int argc, char** argv) {
diff --git a/tensorflow/lite/testing/generate_examples.py b/tensorflow/lite/testing/generate_examples.py
index 2b129df766aad431d7612cf5bfc91b09d0bf7ae4..dd7b3d07456fbd9943e9f45b815e6015f4973a94 100644
--- a/tensorflow/lite/testing/generate_examples.py
+++ b/tensorflow/lite/testing/generate_examples.py
@@ -103,8 +103,6 @@ KNOWN_BUGS = {
     r"batch_to_space_nd.*input_shape=\[8,2,2,2,1,1\]": "70594733",
     # Div will use floordiv.
     r"div.*int32": "72051395",
-    # No support for SplitV
-    r"split.*num_or_size_splits=\[2,2\]": "73377559",
 }
 
 
@@ -370,7 +368,8 @@ def make_zip_of_tests(zip_path,
                       make_graph,
                       make_test_inputs,
                       extra_toco_options=ExtraTocoOptions(),
-                      use_frozen_graph=False):
+                      use_frozen_graph=False,
+                      expected_tf_success=None):
   """Helper to make a zip file of a bunch of TensorFlow models.
 
   This does a cartestian product of the dictionary of test_parameters and
@@ -390,6 +389,8 @@ def make_zip_of_tests(zip_path,
       `output_tensors` and returns tuple `(input_values, output_values)`.
     extra_toco_options: Additional toco options.
     use_frozen_graph: Whether or not freeze graph before toco converter.
+    expected_tf_success: Number of times tensorflow is supposed to succeed in
+      executing the input graphs. `None` means "unknown".
 
   Raises:
     RuntimeError: if there are toco errors that can't be ignored.
@@ -550,6 +551,11 @@ def make_zip_of_tests(zip_path,
                    " and %d TOCO converted graphs (%.1f%%"), zip_path,
                   total_conversions, tf_success, toco_success, percent)
 
+  if expected_tf_success is not None and tf_success != expected_tf_success:
+    raise RuntimeError(
+        "Expected TF to succeed %d times, but that happened %d times" %
+        (expected_tf_success, tf_success))
+
   if not FLAGS.ignore_toco_errors and toco_errors > 0:
     raise RuntimeError(
         "Found %d errors while generating toco models" % toco_errors)
@@ -616,6 +622,30 @@ def make_max_pool_tests(zip_path):
   make_pool_tests(tf.nn.max_pool)(zip_path)
 
 
+def make_abs_tests(zip_path):
+  """Make a set of tests to do relu."""
+
+  # Chose a set of parameters
+  test_parameters = [{
+      "input_shape": [[], [1], [2, 3], [1, 1, 1, 1], [1, 3, 4, 3],
+                      [3, 15, 14, 3], [3, 1, 2, 4, 6], [2, 2, 3, 4, 5, 6]],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.abs(input_tensor)
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-10, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_relu_tests(zip_path):
   """Make a set of tests to do relu."""
 
@@ -747,6 +777,34 @@ def make_prelu_tests(zip_path):
       use_frozen_graph=True)
 
 
+def make_leaky_relu_tests(zip_path):
+  """Make a set of tests to do LeakyRelu."""
+
+  test_parameters = [
+      {
+          "input_shape": [[], [1], [5], [1, 10, 10, 3], [3, 3, 3, 3]],
+          "alpha": [0.1, 1.0, 2.0, -0.1, -1.0, -2.0],
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.nn.leaky_relu(input_tensor, alpha=parameters["alpha"])
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    """Build the inputs for the test case."""
+    input_values = create_tensor_data(
+        np.float32, parameters["input_shape"], min_value=-3, max_value=10)
+    return [input_values], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_values])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 # This function tests various TensorFLow functions that generates Const op,
 # including `tf.ones`, `tf.zeros` and random functions.
 def make_constant_tests(zip_path):
@@ -755,6 +813,7 @@ def make_constant_tests(zip_path):
   test_parameters = [{
       "dtype": [tf.float32, tf.int32],
       "input_shape": [[], [1], [2], [1, 1, 1, 1], [2, 2, 2, 2]],
+      "constant_is_also_output": [True, False],
   }]
 
   def build_graph(parameters):
@@ -764,17 +823,19 @@ def make_constant_tests(zip_path):
         shape=parameters["input_shape"])
     constant = tf.constant(
         create_tensor_data(parameters["dtype"], parameters["input_shape"]))
-    # This maximum node is here to avoid the situation where a graph output is
-    # a constant, which is an error in toco.
-    out = tf.maximum(dummy_input, constant)
-    return [dummy_input], [out]
+    out = [tf.maximum(dummy_input, constant)]
+    if parameters["constant_is_also_output"]:
+      out.append(constant)
+
+    return [dummy_input], out
 
   def build_inputs(parameters, sess, inputs, outputs):
     dummy_input = np.zeros(
         parameters["input_shape"], dtype=_TF_TYPE_INFO[parameters["dtype"]][0])
     return [dummy_input], sess.run(outputs, feed_dict={inputs[0]: dummy_input})
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=20)
 
 
 def make_binary_op_tests(zip_path, binary_operator):
@@ -869,34 +930,46 @@ def make_reduce_tests(reduce_op,
   def f(zip_path):
     """Actual function that generates examples."""
 
-    test_parameters = [{
-        "input_dtype": [tf.float32, tf.int32, tf.int64],
-        "input_shape": [[3, 2, 4]],
-        "axis": [
-            0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
-            [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1], [-1, 0],
-            [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
-        ],
-        "const_axis": [True, False],
-        "keepdims": [True, False],
-    }, {
-        "input_dtype": [tf.float32],
-        "input_shape": [[1, 8, 8, 3]],
-        "axis": [
-            0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2, 3],
-            [3, 2, 1, 0], [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2,
-            -3, -4, [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
-            [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
-        ],
-        "const_axis": [True, False],
-        "keepdims": [True, False],
-    }, {
-        "input_dtype": [tf.float32],
-        "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
-        "axis": [None],
-        "const_axis": [True],
-        "keepdims": [True, False],
-    }]
+    test_parameters = [
+        {
+            "input_dtype": [tf.float32, tf.int32, tf.int64],
+            "input_shape": [[3, 3, 2, 4]],
+            "axis": [
+                0, 1, 2, [0, 1], [0, 2], [1, 2], [0, 1, 2], [1, 0], [2, 0],
+                [2, 1], [2, 1, 0], [2, 0, 1], -1, -2, -3, [1, -1], [0, -1],
+                [-1, 0], [-1, -2, -3], [0, 0, 0], [2, 2, 0], [1, 0, -3, -3]
+            ],
+            "const_axis": [True, False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[1, 8, 8, 3]],
+            "axis": [
+                0, 1, 2, 3, [1, 2], [0, 3], [1, 2, 3], [0, 1, 2,
+                                                        3], [3, 2, 1, 0],
+                [3, 1, 0, 2], [2, 0], [3, 0], [3, 1], [1, 0], -1, -2, -3, -4,
+                [0, -2], [2, 3, -1, 0], [3, 1, 2, -3], [3, -4], [2, 2, 2],
+                [2, 2, 3], [-3, -3, -4], [-3, 2, 1]
+            ],
+            "const_axis": [True, False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+            "axis": [[]],  # shape is: [0]
+            "const_axis": [False],
+            "keepdims": [True, False],
+        },
+        {
+            "input_dtype": [tf.float32],
+            "input_shape": [[], [1, 8, 8, 3], [3, 2, 4]],
+            "axis": [None],  # shape is: []
+            "const_axis": [True],
+            "keepdims": [True, False],
+        }
+    ]
 
     def build_graph(parameters):
       """Build the mean op testing graph."""
@@ -1135,6 +1208,10 @@ def make_floor_mod_tests(zip_path):
   make_binary_op_tests(zip_path, tf.floormod)
 
 
+def make_squared_difference_tests(zip_path):
+  make_binary_op_tests(zip_path, tf.squared_difference)
+
+
 def make_gather_tests(zip_path):
   """Make a set of tests to do gather."""
 
@@ -1142,9 +1219,9 @@ def make_gather_tests(zip_path):
       # TODO(mgubin): add string tests when they are supported by Toco.
       # TODO(mgubin): add tests for Nd indices when they are supported by
       # TfLite.
-      "params_dtype": [tf.float32, tf.int32],
+      "params_dtype": [tf.float32, tf.int32, tf.int64],
       "params_shape": [[10], [1, 2, 20]],
-      "indices_dtype": [tf.int32],
+      "indices_dtype": [tf.int32, tf.int64],
       "indices_shape": [[3], [5]],
       "axis": [-1, 0, 1],
   }]
@@ -1172,7 +1249,43 @@ def make_gather_tests(zip_path):
     return [params, indices], sess.run(
         outputs, feed_dict=dict(zip(inputs, [params, indices])))
 
-  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+  # Note that TF can't execute with index=1 and params_shape=[10].
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=60)
+
+
+def make_gather_with_constant_tests(zip_path):
+  """Make a set of test which feed a constant to gather toco."""
+
+  test_parameters = [{
+      "input_shape": [[3]],
+      "reference_shape": [[2]],
+  }, {
+      "input_shape": [[2, 3]],
+      "reference_shape": [[2, 3]],
+  }]
+
+  def build_graph(parameters):
+    """Build a graph where the inputs to Gather are constants."""
+    reference = tf.placeholder(
+        dtype=tf.int32, shape=parameters["reference_shape"])
+    gather_input = tf.constant(
+        create_tensor_data(tf.int32, parameters["input_shape"]))
+    gather_indices = tf.constant([0, 1], tf.int32)
+    out = tf.equal(reference, tf.gather(gather_input, gather_indices))
+    return [reference], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    reference_values = np.zeros(parameters["reference_shape"], dtype=np.int32)
+    return [reference_values], sess.run(
+        outputs, feed_dict={inputs[0]: reference_values})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=2)
 
 
 def make_global_batch_norm_tests(zip_path):
@@ -1340,23 +1453,27 @@ def make_conv_with_shared_weights_tests(zip_path):
     input_shape, filter_shape = get_tensor_shapes(parameters)
     input_tensor = tf.placeholder(
         dtype=tf.float32, name="input", shape=input_shape)
+    input_tensors = [input_tensor]
 
     # Construct a constant weights tensor which will be used by both Conv2D.
     filter_tensor = tf.constant(
         create_tensor_data(np.float32, filter_shape), dtype=tf.float32)
-    input_tensors = [input_tensor]
+
+    # Ensure that FuseBinaryIntoFollowingAffine works with an input which
+    # is shared by multiple affine ops.
+    conv_input = input_tensor + 0.1
 
     # Construct 2 Conv2D operations which use exactly the same input and
     # weights.
     result1 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
         padding=parameters["padding"],
         data_format=parameters["data_format"])
     result2 = tf.nn.conv2d(
-        input_tensor,
+        conv_input,
         filter_tensor,
         strides=parameters["strides"],
         dilations=parameters["dilations"],
@@ -1524,7 +1641,7 @@ def make_split_tests(zip_path):
 
   test_parameters = [{
       "input_shape": [[1, 3, 4, 6], [2, 4, 1], [6, 4], [8]],
-      "num_or_size_splits": [1, 2, 3, 4, 5, [2, 2]],
+      "num_or_size_splits": [1, 2, 3, 4, 5],
       "axis": [0, 1, 2, 3, -4, -3, -2, -1],
   }]
 
@@ -1542,6 +1659,29 @@ def make_split_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_splitv_tests(zip_path):
+  """Make a set of tests to do tf.split_v."""
+
+  test_parameters = [{
+      "input_shape": [[1, 3, 4, 6], [2, 4, 1], [6, 4], [8]],
+      "size_splits": [[2, 2], [1, 3], [4, 2], [5, 3],
+                      [-1, 1], [-1, 2], [-1, 4]],
+      "axis": [0, 1, 2, 3, -4, -3, -2, -1],
+  }]
+
+  def build_graph(parameters):
+    input_tensor = tf.placeholder(
+        dtype=tf.float32, name="input", shape=parameters["input_shape"])
+    out = tf.split(input_tensor, parameters["size_splits"], parameters["axis"])
+    return [input_tensor], [out[0]]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    values = [create_tensor_data(np.float32, parameters["input_shape"])]
+    return values, sess.run(outputs, feed_dict=dict(zip(inputs, values)))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_concat_tests(zip_path):
   """Make a set of tests to do concatenation."""
 
@@ -2468,6 +2608,32 @@ def make_strided_slice_1d_exhaustive_tests(zip_path):
   _make_strided_slice_tests(zip_path, test_parameters)
 
 
+def make_strided_slice_buggy_tests(zip_path):
+  """Make a set of tests to show strided_slice yields incorrect results."""
+
+  test_parameters = [{
+      "unused_iteration_counter": [1],
+  }]
+
+  def build_graph(parameters):
+    """Build the strided_slice op testing graph."""
+    del parameters
+    input_values = tf.placeholder(dtype=tf.float32, shape=[4, 2])
+    data = tf.constant([[0, 1, 2, 3],
+                        [4, 5, 6, 7],
+                        [8, 9, 10, 11],
+                        [12, 13, 14, 15]], tf.float32)
+    return [input_values], [input_values + data[:, :2]]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    del parameters
+    input_values = np.zeros([4, 2], dtype=np.float32)
+    return [input_values], sess.run(
+        outputs, feed_dict={inputs[0]: input_values})
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def make_lstm_tests(zip_path):
   """Make a set of tests to do basic Lstm cell."""
 
@@ -3121,7 +3287,7 @@ def make_transpose_conv_tests(zip_path):
 def make_tile_tests(zip_path):
   """Make a set of tests to do tile."""
   test_parameters = [{
-      "input_dtype": [tf.float32, tf.int32],
+      "input_dtype": [tf.float32, tf.int32, tf.bool],
       "input_shape": [[3, 2, 1], [2, 2, 2]],
       "multiplier_dtype": [tf.int32, tf.int64],
       "multiplier_shape": [[3]]
@@ -3143,8 +3309,10 @@ def make_tile_tests(zip_path):
   def build_inputs(parameters, sess, inputs, outputs):
     input_value = create_tensor_data(parameters["input_dtype"],
                                      parameters["input_shape"])
-    multipliers_value = create_tensor_data(parameters["multiplier_dtype"],
-                                           parameters["multiplier_shape"])
+    multipliers_value = create_tensor_data(
+        parameters["multiplier_dtype"],
+        parameters["multiplier_shape"],
+        min_value=0)
     return [input_value, multipliers_value], sess.run(
         outputs,
         feed_dict={
@@ -3365,6 +3533,36 @@ def make_range_tests(zip_path):
   make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
 
 
+def make_fill_tests(zip_path):
+  """Make a set of tests to do fill."""
+
+  test_parameters = [{
+      "dims_dtype": [tf.int32, tf.int64],
+      "dims_shape": [[], [1], [3], [3, 3]],
+      "value_dtype": [tf.int32, tf.int64, tf.float32],
+  }]
+
+  def build_graph(parameters):
+    """Build the fill op testing graph."""
+    input1 = tf.placeholder(
+        dtype=parameters["dims_dtype"],
+        name="dims",
+        shape=parameters["dims_shape"])
+    input2 = tf.placeholder(
+        dtype=parameters["value_dtype"], name="value", shape=[])
+    out = tf.fill(input1, input2)
+    return [input1, input2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input1 = create_tensor_data(parameters["dims_dtype"],
+                                parameters["dims_shape"], 1)
+    input2 = create_scalar_data(parameters["value_dtype"])
+    return [input1, input2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input1, input2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
 def _make_logical_tests(op):
   """Make a set of tests to do logical operations."""
 
@@ -3416,6 +3614,141 @@ def make_logical_xor_tests(zip_path):
   return _make_logical_tests(tf.logical_xor)(zip_path)
 
 
+def make_mirror_pad_tests(zip_path):
+  """Make a set of tests to do mirror_pad."""
+
+  test_parameters = [
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [1, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[2, 3]],
+          "padding_matrix": [[[1, 1], [2, 1]]],
+          "mode": ["REFLECT"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["placeholder"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["SYMMETRIC"],
+          "type": ["const"]
+      },
+      {
+          "input_shape": [[3]],
+          "padding_matrix": [[[0, 2]]],
+          "mode": ["REFLECT"],
+          "type": ["const"]
+      },
+  ]
+
+  def build_graph(parameters):
+    """Build the graph for the test case."""
+
+    input_tensor = tf.placeholder(
+        dtype=tf.int32, name="input", shape=parameters["input_shape"])
+    if parameters["type"] != "const":
+      padding_matrix = tf.placeholder(
+          dtype=tf.int32,
+          name="padding",
+          shape=[len(parameters["input_shape"]), 2])
+      input_tensors = [input_tensor, padding_matrix]
+    else:
+      padding_matrix = tf.constant(np.array(parameters["padding_matrix"]))
+      input_tensors = [input_tensor]
+    output = tf.pad(
+        input_tensor, paddings=padding_matrix, mode=parameters["mode"])
+
+    return input_tensors, [output]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_values = [create_tensor_data(tf.int32, parameters["input_shape"])]
+    if parameters["type"] != "const":
+      input_values.append(np.array(parameters["padding_matrix"]))
+    return input_values, sess.run(
+        outputs, feed_dict=dict(zip(inputs, input_values)))
+
+  make_zip_of_tests(
+      zip_path,
+      test_parameters,
+      build_graph,
+      build_inputs,
+      expected_tf_success=7)
+
+
+def make_unroll_batch_matmul_tests(zip_path):
+  """Make a set of tests to test unroll_batch_matmul."""
+
+  test_parameters = [{"dtype": [tf.float32], "shape": [[(2, 2, 3), (2, 3, 2)]]}]
+
+  def build_graph(parameters):
+    """Build the batch_matmul op testing graph."""
+    input_tensor1 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][0])
+    input_tensor2 = tf.placeholder(
+        dtype=parameters["dtype"], shape=parameters["shape"][1])
+    # Should be unrolled and replaced with fully_connected ops in the end.
+    out = tf.matmul(input_tensor1, input_tensor2)
+    return [input_tensor1, input_tensor2], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    input_value1 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][0])
+    input_value2 = create_tensor_data(
+        parameters["dtype"], shape=parameters["shape"][1])
+    return [input_value1, input_value2], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value1, input_value2])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs)
+
+
+def make_placeholder_with_default_tests(zip_path):
+  """Make a set of tests to test placeholder_with_default."""
+
+  test_parameters = [{
+      "dtype": [tf.float32, tf.int32, tf.int64],
+  }]
+
+  def build_graph(parameters):
+    """Build the placeholder_with_default testing graph."""
+    const_node = tf.constant(
+        [1, 2, 2, 0], shape=[2, 2], dtype=parameters["dtype"])
+    input_tensor = tf.placeholder_with_default(
+        const_node, shape=[2, 2], name="input")
+    out = tf.equal(input_tensor, const_node, name="output")
+
+    return [input_tensor], [out]
+
+  def build_inputs(parameters, sess, inputs, outputs):
+    numpy_type = _TF_TYPE_INFO[parameters["dtype"]][0]
+    input_value = np.array([[1, 0], [2, 1]], numpy_type)
+    return [input_value], sess.run(
+        outputs, feed_dict=dict(zip(inputs, [input_value])))
+
+  make_zip_of_tests(zip_path, test_parameters, build_graph, build_inputs,
+                    expected_tf_success=3)
+
+
 # Toco binary path provided by the generate rule.
 bin_path = None
 
diff --git a/tensorflow/lite/testing/generated_examples_zip_test.cc b/tensorflow/lite/testing/generated_examples_zip_test.cc
index aedea52065f71e654a52b0ac8b89c266e07cda71..a9a31ad088e6f4b0297ba313c585abbe6189728b 100644
--- a/tensorflow/lite/testing/generated_examples_zip_test.cc
+++ b/tensorflow/lite/testing/generated_examples_zip_test.cc
@@ -101,6 +101,10 @@ std::map<string, string> kBrokenTests = {
     {R"(^\/mul.*dtype=tf\.int64)", "119126484"},
     {R"(^\/add.*dtype=tf\.int64)", "119126484"},
     {R"(^\/floor_div.*dtype=tf\.int64)", "119126484"},
+    {R"(^\/squared_difference.*dtype=tf\.int64)", "119126484"},
+
+    // Strided Slice chooses the wrong dimension.
+    {R"(^\/strided_slice_buggy)", "119786029"},
 };
 
 // Allows test data to be unarchived into a temporary directory and makes
@@ -201,7 +205,7 @@ tensorflow::Status ReadManifest(const string& original_file, const string& dir,
   }
   if (!added) {
     string message = "Test had no examples: " + original_file;
-    return tensorflow::Status(tensorflow::error::UNKNOWN, message.c_str());
+    return tensorflow::Status(tensorflow::error::UNKNOWN, message);
   }
   return tensorflow::Status::OK();
 }
diff --git a/tensorflow/lite/testing/join.h b/tensorflow/lite/testing/join.h
index 7d0040c488a4ce4bf38f35948efb9c0b80777079..d1c314608687f045b346cc5526ea46c8149c2755 100644
--- a/tensorflow/lite/testing/join.h
+++ b/tensorflow/lite/testing/join.h
@@ -16,6 +16,7 @@ limitations under the License.
 #define TENSORFLOW_LITE_TESTING_JOIN_H_
 
 #include <cstdlib>
+#include <iomanip>
 #include <sstream>
 
 #include "tensorflow/lite/string.h"
@@ -30,9 +31,9 @@ string Join(T* data, size_t len, const string& delimiter) {
     return "";
   }
   std::stringstream result;
-  result << data[0];
+  result << std::setprecision(9) << data[0];
   for (int i = 1; i < len; i++) {
-    result << delimiter << data[i];
+    result << std::setprecision(9) << delimiter << data[i];
   }
   return result.str();
 }
diff --git a/tensorflow/lite/testing/join_test.cc b/tensorflow/lite/testing/join_test.cc
index a8d036c547ded369618bf62544dafffcc27bbf0a..0b3c07f37e14e3815ac1eb4acd0aefac3515064c 100644
--- a/tensorflow/lite/testing/join_test.cc
+++ b/tensorflow/lite/testing/join_test.cc
@@ -28,7 +28,7 @@ TEST(JoinTest, JoinInt) {
 
 TEST(JoinTest, JoinFloat) {
   float data[] = {1.0, -3, 2.3, 1e-5};
-  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.3 1e-05");
+  EXPECT_EQ(Join(data, 4, " "), "1 -3 2.29999995 9.99999975e-06");
 }
 
 TEST(JoinTest, JoinNullData) { EXPECT_THAT(Join<int>(nullptr, 3, ","), ""); }
diff --git a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
index 6b4e7427ed9c69b702d37ccc1b6de0b0c414fe5d..4e329ac97d7358edf068329b21f0194c94c57cb0 100644
--- a/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
+++ b/tensorflow/lite/testing/model_coverage/model_coverage_lib_test.py
@@ -28,6 +28,7 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ class EvaluateFrozenGraph(test.TestCase):
     write_graph(sess.graph_def, '', graph_def_file, False)
     return graph_def_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     with session.Session().as_default() as sess:
       in_tensor = array_ops.placeholder(
@@ -51,6 +53,7 @@ class EvaluateFrozenGraph(test.TestCase):
 
     model_coverage.test_frozen_graph(filename, ['Placeholder'], ['add'])
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleOutputs(self):
     with session.Session().as_default() as sess:
       in_tensor_1 = array_ops.placeholder(
@@ -84,15 +87,18 @@ class EvaluateFrozenGraph(test.TestCase):
     filename = self._saveFrozenGraph(sess)
     return filename
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantized(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(filename, ['inputA'], ['output'])
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedInputShapes(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
         filename, ['inputA'], ['output'], input_shapes={'inputA': [33, 33]})
 
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedFlexAll(self):
     filename = self._getQuantizedModel()
     model_coverage.test_frozen_graph_quant(
@@ -102,6 +108,7 @@ class EvaluateFrozenGraph(test.TestCase):
 
 class EvaluateSavedModel(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     saved_model_dir = os.path.join(self.get_temp_dir(), 'simple_savedmodel')
     with session.Session().as_default() as sess:
@@ -139,18 +146,21 @@ class EvaluateKerasModel(test.TestCase):
       os.close(fd)
     return keras_file
 
+  @test_util.run_v1_only('b/120545219')
   def testFloat(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file)
 
+  @test_util.run_v1_only('b/120545219')
   def testPostTrainingQuantize(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
 
     model_coverage.test_keras_model(keras_file, post_training_quantize=True)
 
+  @test_util.run_v1_only('b/120545219')
   def testTargetOps(self):
     model = self._getSingleInputKerasModel()
     keras_file = self._saveKerasModel(model)
diff --git a/tensorflow/lite/testing/tf_driver_test.cc b/tensorflow/lite/testing/tf_driver_test.cc
index 4381fe4c19dc2240ed2335495276e3a5dab91022..363d162d56a1670821d29768bc36411bf22d61e9 100644
--- a/tensorflow/lite/testing/tf_driver_test.cc
+++ b/tensorflow/lite/testing/tf_driver_test.cc
@@ -46,7 +46,7 @@ TEST(TfDriverTest, ReadingAndWrintingValues) {
   TestDriver driver;
   ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_FLOAT, {1, 2, 2},
                                     "0.10,0.20,0.30,0.40"),
-            "0.1,0.2,0.3,0.4");
+            "0.100000001,0.200000003,0.300000012,0.400000006");
   ASSERT_EQ(driver.WriteAndReadBack(tensorflow::DT_INT32, {1, 2, 2},
                                     "10,40,100,-100"),
             "10,40,100,-100");
@@ -111,8 +111,10 @@ TEST(TfDriverTest, SimpleTest) {
   runner->ResetTensor(2);
   runner->Invoke();
 
-  ASSERT_EQ(runner->ReadOutput(0), "0.101,0.202,0.303,0.404");
-  ASSERT_EQ(runner->ReadOutput(1), "0.011,0.022,0.033,0.044");
+  ASSERT_EQ(runner->ReadOutput(0),
+            "0.101000004,0.202000007,0.303000003,0.404000014");
+  ASSERT_EQ(runner->ReadOutput(1),
+            "0.0109999999,0.0219999999,0.0329999998,0.0439999998");
 }
 
 }  // namespace
diff --git a/tensorflow/lite/testing/tflite_driver.cc b/tensorflow/lite/testing/tflite_driver.cc
index 3a0febb780c331178a36fdfa72ba5d59c260a331..4e11d49f252818f9f7024b8bbafa8b17ad77ad48 100644
--- a/tensorflow/lite/testing/tflite_driver.cc
+++ b/tensorflow/lite/testing/tflite_driver.cc
@@ -77,6 +77,13 @@ class TfLiteDriver::Expectation {
     SetTensorData(values, &data_);
   }
 
+  template <>
+  void SetData<string>(const string& csv_values) {
+    string s = absl::HexStringToBytes(csv_values);
+    data_.raw = new char[s.size()];
+    memcpy(data_.raw, s.data(), s.size());
+  }
+
   bool Check(bool verbose, const TfLiteTensor& tensor) {
     switch (tensor.type) {
       case kTfLiteFloat32:
@@ -89,6 +96,8 @@ class TfLiteDriver::Expectation {
         return TypedCheck<uint8_t>(verbose, tensor);
       case kTfLiteBool:
         return TypedCheck<bool>(verbose, tensor);
+      case kTfLiteString:
+        return TypedCheck<string>(verbose, tensor);
       default:
         fprintf(stderr, "Unsupported type %d in Check\n", tensor.type);
         return false;
@@ -135,6 +144,46 @@ class TfLiteDriver::Expectation {
     return good_output;
   }
 
+  template <>
+  bool TypedCheck<string>(bool verbose, const TfLiteTensor& tensor) {
+    if (tensor.data.raw == nullptr) {
+      if (verbose) {
+        std::cerr << "  got empty string" << std::endl;
+      }
+      return false;
+    }
+    int expected_num_strings = GetStringCount(data_.raw);
+    int returned_num_strings = GetStringCount(tensor.data.raw);
+    if (expected_num_strings != returned_num_strings) {
+      if (verbose) {
+        std::cerr << "  string count differ: got " << returned_num_strings
+                  << ", but expected " << expected_num_strings << std::endl;
+      }
+      return false;
+    }
+    for (int i = 0; i < returned_num_strings; ++i) {
+      auto expected_ref = GetString(data_.raw, i);
+      auto returned_ref = GetString(tensor.data.raw, i);
+      if (expected_ref.len != returned_ref.len) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": got string of size "
+                    << returned_ref.len << ", but expected size "
+                    << expected_ref.len << std::endl;
+        }
+        return false;
+      }
+      if (strncmp(expected_ref.str, returned_ref.str, returned_ref.len) != 0) {
+        if (verbose) {
+          std::cerr << "  index " << i << ": strings are different"
+                    << std::endl;
+        }
+        return false;
+      }
+    }
+
+    return true;
+  }
+
   TfLitePtrUnion data_;
   size_t num_elements_;
 };
@@ -147,9 +196,10 @@ TfLiteDriver::TfLiteDriver(bool use_nnapi, const string& delegate_name)
 }
 
 TfLiteDriver::~TfLiteDriver() {
-  for (TfLiteTensor* t : tensors_to_deallocate_) {
-    free(t->data.raw);
+  for (auto t : tensors_to_deallocate_) {
+    DeallocateStringTensor(t.second);
   }
+  interpreter_.reset();
 }
 
 void TfLiteDriver::AllocateTensors() {
@@ -242,17 +292,16 @@ void TfLiteDriver::SetInput(int id, const string& csv_values) {
     case kTfLiteString: {
       string s = absl::HexStringToBytes(csv_values);
 
-      tensor->data.raw = reinterpret_cast<char*>(malloc(s.size()));
-      tensor->bytes = s.size();
+      DeallocateStringTensor(tensors_to_deallocate_[id]);
+      AllocateStringTensor(id, s.size(), tensor);
       memcpy(tensor->data.raw, s.data(), s.size());
 
-      // We must remember to free the memory we allocated above.
-      tensors_to_deallocate_.push_back(tensor);
       break;
     }
     default:
-      fprintf(stderr, "Unsupported type %d in SetInput\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetInput"));
       return;
   }
 }
@@ -261,8 +310,7 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
   if (!IsValid()) return;
   auto* tensor = interpreter_->tensor(id);
   if (expected_output_.count(id) != 0) {
-    fprintf(stderr, "Overridden expectation for tensor %d\n", id);
-    Invalidate("Overridden expectation");
+    Invalidate(absl::StrCat("Overridden expectation for tensor '", id, "'"));
   }
   expected_output_[id].reset(new Expectation);
   switch (tensor->type) {
@@ -281,9 +329,13 @@ void TfLiteDriver::SetExpectation(int id, const string& csv_values) {
     case kTfLiteBool:
       expected_output_[id]->SetData<bool>(csv_values);
       break;
+    case kTfLiteString:
+      expected_output_[id]->SetData<string>(csv_values);
+      break;
     default:
-      fprintf(stderr, "Unsupported type %d in SetExpectation\n", tensor->type);
-      Invalidate("Unsupported tensor data type");
+      Invalidate(absl::StrCat("Unsupported tensor type ",
+                              TfLiteTypeGetName(tensor->type),
+                              " in TfLiteDriver::SetExpectation"));
       return;
   }
 }
diff --git a/tensorflow/lite/testing/tflite_driver.h b/tensorflow/lite/testing/tflite_driver.h
index d8b40565bacd181df9f3ed114d76e6c003e645e5..1da0533c57cf51f442253f28b6d9ba13078ef9a7 100644
--- a/tensorflow/lite/testing/tflite_driver.h
+++ b/tensorflow/lite/testing/tflite_driver.h
@@ -49,6 +49,18 @@ class TfLiteDriver : public TestRunner {
   string ReadOutput(int id) override { return "no-op"; }
 
  private:
+  void DeallocateStringTensor(TfLiteTensor* t) {
+    if (t) {
+      free(t->data.raw);
+      t->data.raw = nullptr;
+    }
+  }
+  void AllocateStringTensor(int id, size_t num_bytes, TfLiteTensor* t) {
+    t->data.raw = reinterpret_cast<char*>(malloc(num_bytes));
+    t->bytes = num_bytes;
+    tensors_to_deallocate_[id] = t;
+  }
+
   void ResetLSTMStateTensors();
 
   class Expectation;
@@ -59,7 +71,7 @@ class TfLiteDriver : public TestRunner {
   std::unique_ptr<Interpreter> interpreter_;
   std::map<int, std::unique_ptr<Expectation>> expected_output_;
   bool must_allocate_tensors_ = true;
-  std::vector<TfLiteTensor*> tensors_to_deallocate_;
+  std::map<int, TfLiteTensor*> tensors_to_deallocate_;
 };
 
 }  // namespace testing
diff --git a/tensorflow/lite/toco/BUILD b/tensorflow/lite/toco/BUILD
index 14302874441c4af49909dd3ba5b3bee78c421c45..93d41fcae14c8130de87471bdce64edad131c11f 100644
--- a/tensorflow/lite/toco/BUILD
+++ b/tensorflow/lite/toco/BUILD
@@ -341,7 +341,6 @@ cc_library(
 tf_cc_test(
     name = "import_tensorflow_test",
     srcs = ["import_tensorflow_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":toco_tooling",
         "//tensorflow/core:framework",
@@ -384,7 +383,6 @@ cc_library(
 tf_cc_test(
     name = "tooling_util_test",
     srcs = ["tooling_util_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":model",
         ":tooling_util",
@@ -395,6 +393,28 @@ tf_cc_test(
 
 # :toco is the main public command-line tool exposing the functionality
 # of the :toco_tooling library.
+cc_library(
+    name = "toco_convert",
+    srcs = ["toco_convert.cc"],
+    hdrs = ["toco_convert.h"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
 tf_cc_binary(
     name = "toco",
     srcs = ["toco.cc"],
@@ -404,6 +424,7 @@ tf_cc_binary(
         ":model_cmdline_flags",
         ":model_flags_proto_cc",
         ":toco_cmdline_flags",
+        ":toco_convert",
         ":toco_flags_proto_cc",
         ":toco_port",
         ":toco_tooling",
@@ -416,13 +437,35 @@ tf_cc_binary(
     ],
 )
 
+tf_cc_test(
+    name = "toco_convert_test",
+    srcs = ["toco_convert_test.cc"],
+    visibility = ["//visibility:public"],
+    deps = [
+        ":model",
+        ":model_cmdline_flags",
+        ":model_flags_proto_cc",
+        ":toco_cmdline_flags",
+        ":toco_convert",
+        ":toco_flags_proto_cc",
+        ":toco_port",
+        ":toco_tooling",
+        ":types_proto_cc",
+        "@com_google_googletest//:gtest_main",
+        "@com_google_absl//absl/strings",
+        "//tensorflow/core:lib",
+        # We cannot embed the core:ops dependency directly into :toco_tooling as
+        # it can conflict with downstream deps when toco is used as a library.
+        "//tensorflow/core:ops",
+    ],
+)
+
 tf_cc_test(
     name = "toco_port_test",
     srcs = ["toco_port_test.cc"],
     data = [
         "toco_port_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":toco_port",
         "@com_google_googletest//:gtest_main",
diff --git a/tensorflow/lite/toco/README.md b/tensorflow/lite/toco/README.md
index 91f6f618a376ff4df7c51dfd285152229f4757cc..fe98a90b38583a368b02bd4b422943f6f6b16c9b 100644
--- a/tensorflow/lite/toco/README.md
+++ b/tensorflow/lite/toco/README.md
@@ -8,9 +8,9 @@ the usage documentation.
 
 Usage information is given in these documents:
 
-*   [Command-line glossary](g3doc/cmdline_reference.md)
-*   [Command-line examples](g3doc/cmdline_examples.md)
-*   [Python API examples](g3doc/python_api.md)
+*   [Command-line glossary](../g3doc/convert/cmdline_reference.md)
+*   [Command-line examples](../g3doc/convert/cmdline_examples.md)
+*   [Python API examples](../g3doc/convert/python_api.md)
 
 ## Where the converter fits in the TensorFlow landscape
 
@@ -26,4 +26,4 @@ to client devices, generally mobile devices, where the TensorFlow Lite
 interpreter handles them on-device.  This flow is represented in the diagram
 below.
 
-![drawing](g3doc/toco_landscape.svg)
+![drawing](../g3doc/images/convert/workflow.svg)
diff --git a/tensorflow/lite/toco/export_tensorflow.cc b/tensorflow/lite/toco/export_tensorflow.cc
index 1752745aaee987e1ef029523ce12d05a4a80cdce..9fff0015527ebadf501f571bdd5ed0a7643d66e0 100644
--- a/tensorflow/lite/toco/export_tensorflow.cc
+++ b/tensorflow/lite/toco/export_tensorflow.cc
@@ -48,7 +48,8 @@ using tensorflow::TensorProto;
 namespace toco {
 namespace {
 
-tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
+tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type,
+                                           const string& error_location) {
   switch (data_type) {
     case ArrayDataType::kBool:
       return tensorflow::DT_BOOL;
@@ -66,14 +67,21 @@ tensorflow::DataType GetTensorFlowDataType(ArrayDataType data_type) {
       return tensorflow::DT_COMPLEX64;
     default:
     case ArrayDataType::kNone:
-      LOG(FATAL) << "Unsupported data type: " << static_cast<int>(data_type);
+      LOG(FATAL) << "Unsupported data type '" << ArrayDataTypeName(data_type)
+                 << "' in " << error_location;
       return tensorflow::DT_INVALID;
   }
 }
 
+tensorflow::DataType GetTensorFlowDataTypeForOp(ArrayDataType data_type,
+                                                const string& op_name) {
+  return GetTensorFlowDataType(data_type, "op '" + op_name + "'");
+}
+
 tensorflow::DataType GetTensorFlowDataType(const Model& model,
                                            const string& array_name) {
-  return GetTensorFlowDataType(model.GetArray(array_name).data_type);
+  return GetTensorFlowDataType(model.GetArray(array_name).data_type,
+                               "array '" + array_name + "'");
 }
 
 // TensorFlow sometimes forbids what it calls "legacy scalars",
@@ -1150,6 +1158,29 @@ void ConvertSplitOperator(const Model& model,
                                   tensorflow_graph);
 }
 
+void ConvertSplitVOperator(const Model& model,
+                           const TensorFlowSplitVOperator& src_op,
+                           GraphDef* tensorflow_graph) {
+  tensorflow::NodeDef* split_v_op = tensorflow_graph->add_node();
+  split_v_op->set_op("SplitV");
+  split_v_op->set_name(src_op.outputs[0]);
+  for (const auto& input : src_op.inputs) {
+    *split_v_op->add_input() = input;
+  }
+  (*split_v_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataType(model, src_op.inputs[0]));
+  (*split_v_op->mutable_attr())["num_split"].set_i(src_op.num_split);
+  const auto& split_dim_array = model.GetArray(src_op.inputs[1]);
+  CHECK(split_dim_array.buffer);
+  CHECK(split_dim_array.data_type == ArrayDataType::kInt32);
+  const auto& split_dim_data =
+      split_dim_array.GetBuffer<ArrayDataType::kInt32>().data;
+  CHECK_EQ(split_dim_data.size(), 1);
+  const int split_dim = split_dim_data[0];
+  CreateDummyConcatDimTensorConst(src_op.inputs[0], split_dim,
+                                  tensorflow_graph);
+}
+
 void ConvertCastOperator(const Model& model, const CastOperator& src_op,
                          GraphDef* tensorflow_graph) {
   tensorflow::NodeDef* cast_op = tensorflow_graph->add_node();
@@ -1285,7 +1316,7 @@ void ConvertRangeOperator(const Model& model, const RangeOperator& src_op,
   *range_op->add_input() = src_op.inputs[1];
   *range_op->add_input() = src_op.inputs[2];
   (*range_op->mutable_attr())["Tidx"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, /*op_name=*/src_op.outputs[0]));
 }
 
 void ConvertPackOperator(const Model& model, const PackOperator& src_op,
@@ -1298,7 +1329,8 @@ void ConvertPackOperator(const Model& model, const PackOperator& src_op,
   }
   (*pack_op->mutable_attr())["axis"].set_i(src_op.axis);
   (*pack_op->mutable_attr())["N"].set_i(src_op.inputs.size());
-  (*pack_op->mutable_attr())["T"].set_type(GetTensorFlowDataType(src_op.dtype));
+  (*pack_op->mutable_attr())["T"].set_type(
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
 }
 
 void ConvertFillOperator(const Model& model, const FillOperator& src_op,
@@ -1887,7 +1919,7 @@ void ConvertRandomUniformOperator(const Model& model,
       GetTensorFlowDataType(model, src_op.inputs[0]);
   (*new_op->mutable_attr())["T"].set_type(shape_type);
   (*new_op->mutable_attr())["dtype"].set_type(
-      GetTensorFlowDataType(src_op.dtype));
+      GetTensorFlowDataTypeForOp(src_op.dtype, src_op.outputs[0]));
   (*new_op->mutable_attr())["seed"].set_i(src_op.seed);
   (*new_op->mutable_attr())["seed2"].set_i(src_op.seed2);
 }
@@ -2124,6 +2156,10 @@ void ConvertOperator(const Model& model, const Operator& src_op,
     ConvertSplitOperator(model,
                          static_cast<const TensorFlowSplitOperator&>(src_op),
                          tensorflow_graph);
+  } else if (src_op.type == OperatorType::kSplitV) {
+    ConvertSplitVOperator(model,
+                          static_cast<const TensorFlowSplitVOperator&>(src_op),
+                          tensorflow_graph);
   } else if (src_op.type == OperatorType::kFakeQuant) {
     ConvertFakeQuantOperator(static_cast<const FakeQuantOperator&>(src_op),
                              tensorflow_graph);
diff --git a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
index 6b4765b23c47d0a8af4ef3995d6e27c978387593..436b639253f2e190fcaab895cd077b06796c1ca1 100644
--- a/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
+++ b/tensorflow/lite/toco/graph_transformations/fuse_binary_into_following_affine.cc
@@ -221,9 +221,8 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   Operator* following_op = GetOpWithInput(*model, binary_op->outputs[0]);
 
   if (!following_op) {
-    AddMessageF(
-        "Not fusing %s because it is not consumed by exactly one other op",
-        LogName(*binary_op));
+    AddMessageF("Not fusing %s because it is not consumed by any op",
+                LogName(*binary_op));
     return ::tensorflow::Status::OK();
   }
 
@@ -288,7 +287,10 @@ void FuseMulOrDivParamsIntoFollowingAffine(Model* model, Operator* following_op,
   AddMessageF("Fusing %s into the following %s", LogName(*binary_op),
               LogName(*following_op));
 
-  model->EraseArray(binary_op->outputs[0]);
+  if (CountOpsWithInput(*model, binary_op->outputs[0]) == 1) {
+    model->EraseArray(binary_op->outputs[0]);
+  }
+
   following_op->inputs[0] = binary_op->inputs[index_of_variable_input];
   const auto& old_constant_param_name =
       binary_op->inputs[index_of_constant_input];
diff --git a/tensorflow/lite/toco/graph_transformations/graph_transformations.h b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
index 73a90c8239b2a24de8bb4d63e711225b4127f19a..187b584b6989cc55894160fc5508c13474a1d2d3 100644
--- a/tensorflow/lite/toco/graph_transformations/graph_transformations.h
+++ b/tensorflow/lite/toco/graph_transformations/graph_transformations.h
@@ -139,7 +139,7 @@ DECLARE_GRAPH_TRANSFORMATION(MakeInitialDequantizeOperator)
 DECLARE_GRAPH_TRANSFORMATION(MoveBinaryOperatorBeforeReshape)
 DECLARE_GRAPH_TRANSFORMATION(PropagateActivationFunctionIntoConstants)
 DECLARE_GRAPH_TRANSFORMATION(PropagateArrayDataTypes)
-DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits);
+DECLARE_GRAPH_TRANSFORMATION(PropagateFakeQuantNumBits)
 DECLARE_GRAPH_TRANSFORMATION(PropagateFixedSizes)
 DECLARE_GRAPH_TRANSFORMATION(HardcodeMinMax)
 DECLARE_GRAPH_TRANSFORMATION(Quantize)
diff --git a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
index df50f31de88cd8114ee66ce417354e33a12a5d8b..2e41767095fb3cde09a7fb5d690ac57b1cfcd762 100644
--- a/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
+++ b/tensorflow/lite/toco/graph_transformations/hardcode_min_max.cc
@@ -208,12 +208,32 @@ bool HardcodeMinMaxForSelect(Model* model, Operator* op) {
   if (output_array.minmax) {
     return false;
   }
-  const auto& input_array_1 = model->GetArray(op->inputs[1]);
-  if (!input_array_1.minmax) {
+
+  auto& input_array_1 = model->GetArray(op->inputs[1]);
+  auto& input_array_2 = model->GetArray(op->inputs[2]);
+
+  if (!input_array_1.minmax && !input_array_2.minmax) {
     return false;
   }
-  const auto& input_array_2 = model->GetArray(op->inputs[2]);
-  if (!input_array_2.minmax) {
+
+  // Propagate up if one input is quantized and the other is constant.
+  if (!input_array_1.minmax &&
+      IsConstantParameterArray(*model, op->inputs[1])) {
+    auto& minmax_1 = input_array_1.GetOrCreateMinMax();
+    const auto& minmax_2 = input_array_2.GetMinMax();
+    minmax_1.min = minmax_2.min;
+    minmax_1.max = minmax_2.max;
+  }
+
+  if (!input_array_2.minmax &&
+      IsConstantParameterArray(*model, op->inputs[2])) {
+    auto& minmax_2 = input_array_2.GetOrCreateMinMax();
+    const auto& minmax_1 = input_array_1.GetMinMax();
+    minmax_2.min = minmax_1.min;
+    minmax_2.max = minmax_1.max;
+  }
+
+  if (!input_array_1.minmax || !input_array_2.minmax) {
     return false;
   }
 
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
index 9a458dccb9cbb372500771bd5321869a9bf1db36..cbae6610d7f4703a898d8d6f35351a09cd70173c 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_array_data_types.cc
@@ -86,6 +86,13 @@ void SetDataTypeForAllOutputs(Model* model, Operator* op,
       SetDataTypeForAllOutputs(model, op, data_type);
       break;
     }
+    case OperatorType::kSplitV: {
+      // These operators produce output with the same type as its 1st input
+      CHECK_GE(op->inputs.size(), 3);
+      const ArrayDataType data_type = model->GetArray(op->inputs[0]).data_type;
+      SetDataTypeForAllOutputs(model, op, data_type);
+      break;
+    }
     case OperatorType::kTransposeConv: {
       // These operators produce an output with the same type as their 3rd input
       CHECK_GE(op->inputs.size(), 3);
diff --git a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
index 78ea54e452b9dd163aa75349162493a2abe72707..0e653f08a04f237c861038639a1469eb62f35dfa 100644
--- a/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
+++ b/tensorflow/lite/toco/graph_transformations/propagate_fixed_sizes.cc
@@ -15,6 +15,7 @@ limitations under the License.
 #include <algorithm>
 #include <iterator>
 #include <memory>
+#include <numeric>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -786,6 +787,97 @@ void ProcessTensorFlowSplitOperator(Model* model, TensorFlowSplitOperator* op) {
   }
 }
 
+void ProcessTensorFlowSplitVOperator(Model* model,
+                                     TensorFlowSplitVOperator* op) {
+  CHECK_EQ(op->inputs.size(), 3);
+
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+  const Shape& input_shape = input_array.shape();
+
+  // Yield until size_splits is constant.
+  if (!IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  const auto& size_array = model->GetArray(op->inputs[1]);
+  // Yield until size_splits dims have been resolved.
+  if (!size_array.has_shape()) {
+    return;
+  }
+  const Shape& size_shape = size_array.shape();
+
+  CHECK(size_array.data_type == ArrayDataType::kInt32 ||
+        size_array.data_type == ArrayDataType::kInt64)
+      << "size_splits must be int32, int64";
+  CHECK_EQ(size_shape.dimensions_count(), 1) << "size_splits must be 1-D";
+
+  std::vector<int64> size_splits_vector;
+  if (size_array.data_type == ArrayDataType::kInt32) {
+    for (const auto each_size :
+         size_array.GetBuffer<ArrayDataType::kInt32>().data) {
+      size_splits_vector.push_back(each_size);
+    }
+  } else {
+    size_splits_vector = size_array.GetBuffer<ArrayDataType::kInt64>().data;
+  }
+
+  // Yield until axis is constant.
+  if (!IsConstantParameterArray(*model, op->inputs[2])) {
+    return;
+  }
+  const auto& axis_array = model->GetArray(op->inputs[2]);
+  // Yield until axis dims have been resolved.
+  if (!axis_array.has_shape()) {
+    return;
+  }
+
+  CHECK(axis_array.data_type == ArrayDataType::kInt32)
+      << "Axis array must be int32.";
+  CHECK_EQ(RequiredBufferSizeForShape(axis_array.shape()), 1)
+      << "Axis array must be scalar.";
+
+  int axis = axis_array.GetBuffer<ArrayDataType::kInt32>().data[0];
+  if (axis < 0) {
+    axis += input_shape.dimensions_count();
+  }
+
+  CHECK_EQ(op->num_split, size_splits_vector.size());
+
+  int64_t minus_one_count = 0, size_splits_sum = 0;
+  for (auto size : size_splits_vector) {
+    if (size == -1) {
+      ++minus_one_count;
+    } else {
+      size_splits_sum += size;
+    }
+  }
+
+  const int input_size = input_shape.dims(axis);
+
+  CHECK_LE(minus_one_count, 1) << "size_splits can contain at most one -1.";
+
+  if (minus_one_count == 1) {
+    CHECK_LE(size_splits_sum, input_size);
+    auto iter =
+        std::find(size_splits_vector.begin(), size_splits_vector.end(), -1);
+    *iter = input_size - size_splits_sum;
+  } else {
+    CHECK_EQ(size_splits_sum, input_size);
+  }
+
+  CHECK_EQ(op->outputs.size(), op->num_split);
+
+  for (int i = 0; i < op->outputs.size(); ++i) {
+    const auto& output = op->outputs[i];
+    Shape output_shape = input_shape;
+    (*output_shape.mutable_dims())[axis] = size_splits_vector.at(i);
+    model->GetArray(output).copy_shape(output_shape);
+  }
+}
+
 void ProcessAveragePoolOperator(Model* model, AveragePoolOperator* op) {
   const string& input_name = op->inputs[0];
   const auto& input_array = model->GetArray(input_name);
@@ -1691,6 +1783,51 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
   }
 }
 
+void ProcessMirrorPadOperator(Model* model, MirrorPadOperator* op) {
+  CHECK_EQ(op->inputs.size(), 2);
+  const auto& input_array = model->GetArray(op->inputs[0]);
+  const auto& padding_matrix = model->GetArray(op->inputs[1]);
+
+  // Yield until input dims have been resolved.
+  if (!input_array.has_shape()) {
+    return;
+  }
+
+  auto& output_array = model->GetArray(op->outputs[0]);
+  // If output already computed or padding matrix is non
+  // const then return.
+  if (output_array.has_shape() ||
+      !IsConstantParameterArray(*model, op->inputs[1])) {
+    return;
+  }
+  Shape output_shape = input_array.shape();
+  std::vector<int>& dims = *output_shape.mutable_dims();
+
+  std::vector<int64_t> padding;
+  if (padding_matrix.data_type == ArrayDataType::kInt32) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt32>().data;
+    for (auto elem : data) {
+      padding.push_back(static_cast<int64_t>(elem));
+    }
+  } else if (padding_matrix.data_type == ArrayDataType::kInt64) {
+    const auto& data = padding_matrix.GetBuffer<ArrayDataType::kInt64>().data;
+    for (auto elem : data) {
+      padding.push_back(elem);
+    }
+  } else {
+    CHECK(padding_matrix.data_type == ArrayDataType::kInt64 ||
+          padding_matrix.data_type == ArrayDataType::kInt32);
+  }
+  CHECK_EQ(padding_matrix.shape().dimensions_count(), 2);
+  CHECK_EQ(input_array.shape().dimensions_count(),
+           padding_matrix.shape().dims(0));
+  for (int i = 0; i < input_array.shape().dimensions_count(); ++i) {
+    dims[i] += padding[i * 2] + padding[i * 2 + 1];
+  }
+
+  output_array.copy_shape(output_shape);
+}
+
 }  // namespace
 
 ::tensorflow::Status PropagateFixedSizes::Run(Model* model,
@@ -1707,6 +1844,7 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
   }
 
   switch (op->type) {
+    case OperatorType::kAbs:
     case OperatorType::kBatchNormalization:
     case OperatorType::kL2Normalization:
     case OperatorType::kDequantize:
@@ -1714,6 +1852,7 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kRelu1:
     case OperatorType::kRelu6:
     case OperatorType::kPRelu:
+    case OperatorType::kLeakyRelu:
     case OperatorType::kSoftmax:
     case OperatorType::kLogSoftmax:
     case OperatorType::kLog:
@@ -1759,6 +1898,7 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kEqual:
     case OperatorType::kNotEqual:
     case OperatorType::kPow:
+    case OperatorType::kSquaredDifference:
       ProcessSimpleBinaryOperator(model, op);
       break;
     case OperatorType::kAddN:
@@ -1834,6 +1974,10 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
       ProcessTensorFlowSplitOperator(model,
                                      static_cast<TensorFlowSplitOperator*>(op));
       break;
+    case OperatorType::kSplitV:
+      ProcessTensorFlowSplitVOperator(
+          model, static_cast<TensorFlowSplitVOperator*>(op));
+      break;
     case OperatorType::kSqueeze:
       ProcessSqueezeOperator(model, static_cast<SqueezeOperator*>(op));
       break;
@@ -1956,6 +2100,9 @@ void ProcessUnpackOperator(Model* model, UnpackOperator* op) {
     case OperatorType::kUnpack:
       ProcessUnpackOperator(model, static_cast<UnpackOperator*>(op));
       break;
+    case OperatorType::kMirrorPad:
+      ProcessMirrorPadOperator(model, static_cast<MirrorPadOperator*>(op));
+      break;
     default:
       // Unimplemented, another graph transformation should drop it.
       LOG(FATAL) << "Unhandled operator type " << OperatorTypeName(op->type);
diff --git a/tensorflow/lite/toco/graph_transformations/quantize.cc b/tensorflow/lite/toco/graph_transformations/quantize.cc
index e28b7288f0102a6b03dff61c3e1b6aeb3dd1adbe..1146078c301fd1b880c99da23e5be8223efe31e3 100644
--- a/tensorflow/lite/toco/graph_transformations/quantize.cc
+++ b/tensorflow/lite/toco/graph_transformations/quantize.cc
@@ -64,7 +64,8 @@ bool SupportsQuantization(const Operator& op) {
          type == OperatorType::kRelu1 || type == OperatorType::kRelu6 ||
          type == OperatorType::kShape || type == OperatorType::kExpandDims ||
          type == OperatorType::kPack || type == OperatorType::kTopK_V2 ||
-         type == OperatorType::kResizeNearestNeighbor;
+         type == OperatorType::kResizeNearestNeighbor ||
+         type == OperatorType::kPRelu;
 }
 
 // The quantized op allows output arrays of type float using
@@ -360,7 +361,7 @@ bool ChooseQuantizationForOperatorOutput(
       op.type == OperatorType::kSpaceToDepth ||
       op.type == OperatorType::kReshape || op.type == OperatorType::kSplit ||
       op.type == OperatorType::kRelu || op.type == OperatorType::kRelu1 ||
-      op.type == OperatorType::kRelu6) {
+      op.type == OperatorType::kRelu6 || op.type == OperatorType::kPRelu) {
     int data_input_index = 0;
     if (op.type == OperatorType::kSplit) {
       data_input_index = 1;
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
index 1149930131e0d6889df50cf67eaed863617eda1f..27836efb0b2ff77d72811205617b721cc7106cf1 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_constant_gather.cc
@@ -25,8 +25,8 @@ namespace {
 
 // Gathers data from axis 0.
 template <ArrayDataType Type>
-inline void Gather(const Array& input_array, int input_rank,
-                   const Array& coords_array, Array* output_array) {
+inline void Gather(const Array& input_array, const Array& coords_array,
+                   Array* output_array) {
   const Shape& input_shape = input_array.shape();
   const std::vector<DataType<Type>>& input_data =
       input_array.GetBuffer<Type>().data;
@@ -39,17 +39,20 @@ inline void Gather(const Array& input_array, int input_rank,
       output_array->GetMutableBuffer<Type>().data;
   output_data.resize(RequiredBufferSizeForShape(output_shape));
 
-  int rev_input_rank = input_shape.dimensions_count() - 1 - (input_rank - 1);
-  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(rev_input_rank));
+  CHECK_EQ(coords_shape.dims(0), output_array->shape().dims(0));
 
   int stride = 1;
-  for (int i = input_shape.dimensions_count() - 1; i >= input_rank - 1; --i) {
+  for (int i = 1; i < input_shape.dimensions_count(); ++i) {
     stride *= input_shape.dims(i);
   }
 
+  // Let's make sure we have enough space for all element in the memcpy()
+  // below, which writes 'stride' elements starting at 'i * stride'.
+  CHECK_EQ(stride * coords_shape.dims(0), output_data.size());
+
   for (int i = 0; i < coords_shape.dims(0); ++i) {
     DCHECK_GE(coords_data[i], 0);
-    DCHECK_LT(coords_data[i], input_shape.dims(rev_input_rank));
+    DCHECK_LT(coords_data[i], input_shape.dims(0));
     DataType<Type>* out = output_data.data() + i * stride;
     const DataType<Type>* in = input_data.data() + coords_data[i] * stride;
     memcpy(out, in, sizeof(DataType<Type>) * stride);
@@ -118,24 +121,20 @@ inline void Gather(const Array& input_array, int input_rank,
   CHECK(!output_array.buffer);
   switch (output_array.data_type) {
     case ArrayDataType::kFloat:
-      Gather<ArrayDataType::kFloat>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kFloat>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kUint8:
-      Gather<ArrayDataType::kUint8>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kUint8>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kInt32:
-      Gather<ArrayDataType::kInt32>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kInt32>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kInt64:
-      Gather<ArrayDataType::kInt64>(input_array, op->input_rank, coords_array,
-                                    &output_array);
+      Gather<ArrayDataType::kInt64>(input_array, coords_array, &output_array);
       break;
     case ArrayDataType::kComplex64:
-      Gather<ArrayDataType::kComplex64>(input_array, op->input_rank,
-                                        coords_array, &output_array);
+      Gather<ArrayDataType::kComplex64>(input_array, coords_array,
+                                        &output_array);
       break;
     default:
       LOG(FATAL) << "Unsupported data type given to Gather op with output \""
diff --git a/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
index ea5d33009b4b602abf4de8b310c456f142737c7d..9ceba45e93fee10c820f2b0ba01a5948be0787b6 100644
--- a/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
+++ b/tensorflow/lite/toco/graph_transformations/resolve_reduce_attributes.cc
@@ -35,6 +35,11 @@ bool ResolveAttributes(Model* model, T* op) {
 
   const Array& indices_array = model->GetArray(op->inputs[1]);
   if (!indices_array.has_shape()) return false;
+
+  // It is ok for indices_array to have a shape for an empty tensor. In that
+  // case, we don't bother setting 'axis'.
+  if (indices_array.buffer->Length() == 0) return false;
+
   op->axis = indices_array.GetBuffer<ArrayDataType::kInt32>().data;
   return true;
 }
diff --git a/tensorflow/lite/toco/graph_transformations/tests/BUILD b/tensorflow/lite/toco/graph_transformations/tests/BUILD
index 2e9b213d0018f547740673e26f2ffe7aac010777..bbbedbe3a93065e3a7007073aad7f6e7600e2651 100644
--- a/tensorflow/lite/toco/graph_transformations/tests/BUILD
+++ b/tensorflow/lite/toco/graph_transformations/tests/BUILD
@@ -10,7 +10,6 @@ load(
 tf_cc_test(
     name = "lstm_utils_test",
     srcs = ["lstm_utils_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
@@ -22,7 +21,6 @@ tf_cc_test(
 tf_cc_test(
     name = "resolve_constant_concatenation_test",
     srcs = ["resolve_constant_concatenation_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
@@ -34,7 +32,6 @@ tf_cc_test(
 tf_cc_test(
     name = "resolve_constant_unary_test",
     srcs = ["resolve_constant_unary_test.cc"],
-    tags = ["no_oss"],
     deps = [
         "//tensorflow/lite/toco:graph_transformations",
         "//tensorflow/lite/toco:model",
diff --git a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
index d59954fc740ed91ce041948ff76a029ea294017b..41a735394d714b65a4c9fc309927e34a7f610431 100644
--- a/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
+++ b/tensorflow/lite/toco/graph_transformations/unroll_batch_matmul.cc
@@ -117,7 +117,8 @@ namespace toco {
     auto* slice_b_op = new SliceOperator;
     slice_b_op->inputs = {
         batch_op->inputs[1],
-        CreateInt32Array(model, batch_name + "/slice_b/slice/begin", {0, 0, 0}),
+        CreateInt32Array(model, batch_name + "/slice_b/slice/begin",
+                         {batch, 0, 0}),
         CreateInt32Array(
             model, batch_name + "/slice_b/slice/size",
             {1, input_array_b.shape().dims(1), input_array_b.shape().dims(2)}),
diff --git a/tensorflow/lite/toco/import_tensorflow.cc b/tensorflow/lite/toco/import_tensorflow.cc
index 96f3c6a6ab94789bf118740eb57b6e81ce48f333..0b2f810394311a33899b9242e73131e109a2b4c0 100644
--- a/tensorflow/lite/toco/import_tensorflow.cc
+++ b/tensorflow/lite/toco/import_tensorflow.cc
@@ -935,6 +935,25 @@ tensorflow::Status ConvertSplitOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertSplitVOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "SplitV");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 3));
+  auto* op = new TensorFlowSplitVOperator;
+  op->inputs.push_back(node.input(0));
+  op->inputs.push_back(node.input(1));
+  op->inputs.push_back(node.input(2));
+  const int num_split = GetIntAttr(node, "num_split");
+  op->outputs.push_back(node.name());
+  for (int i = 1; i < num_split; i++) {
+    op->outputs.push_back(absl::StrCat(node.name(), ":", i));
+  }
+  op->num_split = num_split;
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 tensorflow::Status ConvertSwitchOperator(
     const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
     Model* model) {
@@ -1134,6 +1153,31 @@ tensorflow::Status ConvertConcatOperator(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertMirrorPadOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  if (node.op() != "MirrorPad") {
+    LOG(FATAL) << "Expected MirrorPad.";
+  }
+  const int num_inputs = GetInputsCount(node, tf_import_flags);
+  CHECK_EQ(num_inputs, 2);
+  auto* op = new MirrorPadOperator;
+  for (int i = 0; i < num_inputs; ++i) {
+    op->inputs.push_back(node.input(i));
+  }
+  op->outputs.push_back(node.name());
+  const auto mode = GetStringAttr(node, "mode");
+  if (mode == "REFLECT") {
+    op->mode = toco::MirrorPadMode::kReflect;
+  } else if (mode == "SYMMETRIC") {
+    op->mode = toco::MirrorPadMode::kSymmetric;
+  }
+
+  model->operators.emplace_back(op);
+
+  return tensorflow::Status::OK();
+}
+
 static constexpr int kAnyNumInputs = -1;
 
 enum FlexSupport { kFlexOk, kFlexNotOk };
@@ -1221,7 +1265,7 @@ void GetOutputNamesFromNodeDef(const NodeDef& node,
 void GetOutputTypesFromNodeDef(const NodeDef& node,
                                const tensorflow::OpDef& op_def,
                                TensorFlowUnsupportedOperator* op) {
-  // The the given type to the op, or clear the types if invalid.
+  // The given type to the op, or clear the types if invalid.
   auto add_type = [&node, op](tensorflow::DataType type) {
     if (type == tensorflow::DT_INVALID) {
       LOG(WARNING) << "Op node missing output type attribute: " << node.name();
@@ -2012,13 +2056,13 @@ bool InlineAllFunctions(GraphDef* graphdef) {
   tensorflow::SessionOptions options;
   auto* device_count = options.config.mutable_device_count();
   device_count->insert({"CPU", 1});
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   TF_CHECK_OK(tensorflow::DeviceFactory::AddDevices(
       options, "/job:localhost/replica:0/task:0", &devices));
 
   tensorflow::FunctionLibraryDefinition fld(tensorflow::OpRegistry::Global(),
                                             graphdef_copy.library());
-  tensorflow::DeviceMgr device_mgr(devices);
+  tensorflow::DeviceMgr device_mgr(std::move(devices));
   tensorflow::OptimizerOptions o_opts;
   tensorflow::ProcessFunctionLibraryRuntime pflr(
       &device_mgr, tensorflow::Env::Default(), TF_GRAPH_DEF_VERSION, &fld,
@@ -2220,6 +2264,21 @@ tensorflow::Status ConvertUnidirectionalSequenceLstm(
   return tensorflow::Status::OK();
 }
 
+tensorflow::Status ConvertLeakyReluOperator(
+    const NodeDef& node, const TensorFlowImportFlags& tf_import_flags,
+    Model* model) {
+  CHECK_EQ(node.op(), "LeakyRelu");
+  TF_QCHECK_OK(CheckInputsCount(node, tf_import_flags, 1));
+  CHECK_EQ(GetDataTypeAttr(node, "T"), DT_FLOAT);
+  const auto& input_name = node.input(0);
+  auto* op = new LeakyReluOperator;
+  op->inputs.push_back(input_name);
+  op->outputs.push_back(node.name());
+  op->alpha = GetFloatAttr(node, "alpha");
+  model->operators.emplace_back(op);
+  return tensorflow::Status::OK();
+}
+
 }  // namespace
 
 namespace internal {
@@ -2233,12 +2292,14 @@ ConverterMapType GetTensorFlowNodeConverterMapForFlex() {
   return std::unordered_map<std::string, ConverterType>({
       // We need to let TCO convert Placeholder information into
       // array data, so that the data types are correct.
+      {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Placeholder", ConvertPlaceholderOperator},
   });
 }
 
 ConverterMapType GetTensorFlowNodeConverterMap() {
   return std::unordered_map<std::string, ConverterType>({
+      {"Abs", ConvertSimpleOperator<AbsOperator>},
       {"Add", ConvertSimpleOperator<AddOperator, 2>},
       {"AddN", ConvertSimpleOperatorFlexOk<AddNOperator>},
       {"All", ConvertSimpleOperator<TensorFlowAllOperator>},
@@ -2282,6 +2343,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
        ConvertSimpleOperator<TensorFlowGreaterEqualOperator, 2>},
       {"Identity", ConvertIdentityOperator},
       {"LRN", ConvertLRNOperator},
+      {"LeakyRelu", ConvertLeakyReluOperator},
       {"LegacyFedInput", ConvertPlaceholderOperator},
       {"Less", ConvertSimpleOperator<TensorFlowLessOperator, 2>},
       {"LessEqual", ConvertSimpleOperator<TensorFlowLessEqualOperator, 2>},
@@ -2332,8 +2394,11 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"SpaceToDepth", ConvertSpaceToDepthOperator},
       {"SparseToDense", ConvertSparseToDenseOperator},
       {"Split", ConvertSplitOperator},
+      {"SplitV", ConvertSplitVOperator},
       {"Sqrt", ConvertSimpleOperator<TensorFlowSqrtOperator, 1>},
       {"Square", ConvertSimpleOperator<TensorFlowSquareOperator, 1>},
+      {"SquaredDifference",
+       ConvertSimpleOperator<SquaredDifferenceOperator, 2>},
       {"Squeeze", ConvertSqueezeOperator},
       {"StopGradient", ConvertIdentityOperator},
       {"StridedSlice", ConvertStridedSliceOperator},
@@ -2349,6 +2414,7 @@ ConverterMapType GetTensorFlowNodeConverterMap() {
       {"Unpack", ConvertUnpackOperator},
       {"ZerosLike", ConvertSimpleOperator<TensorFlowZerosLikeOperator, 1>},
       {"UnidirectionalSequenceLstm", ConvertUnidirectionalSequenceLstm},
+      {"MirrorPad", ConvertMirrorPadOperator},
   });
 }
 
diff --git a/tensorflow/lite/toco/model.h b/tensorflow/lite/toco/model.h
index f85e1c287879e636a56ef10bf0f75a781d252ae9..d392535f5c98cdd3532299064f2c6d9305214e71 100644
--- a/tensorflow/lite/toco/model.h
+++ b/tensorflow/lite/toco/model.h
@@ -121,8 +121,10 @@ enum class OperatorType : uint8 {
   kRsqrt,
   kShape,
   kSplit,
+  kSplitV,
   kSqrt,
   kSquare,
+  kSquaredDifference,
   kSum,
   kSwitch,
   kTile,
@@ -152,7 +154,10 @@ enum class OperatorType : uint8 {
   kCTCBeamSearchDecoder,
   kUnpack,
   kZerosLike,
-  kResizeNearestNeighbor
+  kResizeNearestNeighbor,
+  kLeakyRelu,
+  kAbs,
+  kMirrorPad
 };
 
 // Helper to deal with TensorFlow arrays using a different ordering of
@@ -653,6 +658,17 @@ struct MulOperator : Operator {
   MulOperator() : Operator(OperatorType::kMul) {}
 };
 
+// Element-wise Abs operator:
+//   x -> abs(x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: Relu
+struct AbsOperator : Operator {
+  AbsOperator() : Operator(OperatorType::kAbs) {}
+};
+
 // Element-wise Relu operator:
 //   x -> max(0, x)
 //
@@ -699,6 +715,19 @@ struct PReluOperator : Operator {
   PReluOperator() : Operator(OperatorType::kPRelu) {}
 };
 
+// LeakyRelu
+//   x -> max(x, alpha * x)
+//
+// Inputs:
+//   inputs[0]: required: the input array
+//
+// TensorFlow equivalent: LeakyRelu
+struct LeakyReluOperator : Operator {
+  LeakyReluOperator() : Operator(OperatorType::kLeakyRelu) {}
+
+  float alpha = 0.2f;  // 0.2 matches the default value for the TF op attribute.
+};
+
 // Element-wise Logistic operator:
 //   x -> Logistic(x) = 1 / (1 + exp(-x))
 //
@@ -1289,6 +1318,17 @@ struct TensorFlowSquareOperator : Operator {
   TensorFlowSquareOperator() : Operator(OperatorType::kSquare) {}
 };
 
+// Element-wise squared difference ((x-y)*(x-y)) operator.
+//
+// Inputs:
+//   inputs[0]: required: the left-hand side array
+//   inputs[1]: required: the right-hand side array
+//
+// TensorFlow equivalent: SquaredDifference
+struct SquaredDifferenceOperator : Operator {
+  SquaredDifferenceOperator() : Operator(OperatorType::kSquaredDifference) {}
+};
+
 // Transposes a tensor.
 //
 // By default, this operation performs a regular matrix transpose on 2-D input
@@ -1363,6 +1403,12 @@ struct TensorFlowSplitOperator : Operator {
   int num_split = 0;
 };
 
+// TensorFlow SplitV equivalent. Refer to TensorFlow documentation for details.
+struct TensorFlowSplitVOperator : Operator {
+  TensorFlowSplitVOperator() : Operator(OperatorType::kSplitV) {}
+  int num_split = 0;
+};
+
 // TensorFlow Concat equivalent. Refer to TensorFlow documentation for details.
 // Not fully supported, just a placeholder to handle TensorFlow graphs and
 // support graph transformations to other operator types by matching sub-graphs.
@@ -1627,6 +1673,9 @@ struct GatherOperator : Operator {
   // ResolveGatherAttributes. An empty axis indicates that the axis has not yet
   // be resolved.
   absl::optional<int> axis;
+
+  // This field is not used by the standard TF Lite export but it is still need
+  // for legacy Gather implementations.
   int input_rank = 0;
 };
 
@@ -1887,6 +1936,23 @@ struct TensorFlowZerosLikeOperator : Operator {
   TensorFlowZerosLikeOperator() : Operator(OperatorType::kZerosLike) {}
 };
 
+enum class MirrorPadMode { kNone, kSymmetric, kReflect };
+
+// MirrorPad Operator:
+//
+// Inputs:
+// Inputs[0]: required: input tensor to be padded.
+// Inputs[1]: required: 2 Column matrix specifying padding sizes. The number of
+// rows must be the same as the rank of the input.
+// Inputs[2]: required: REFLECT or SYMMETRIC.
+//
+// TensorFlow equivalent: MirrorPad.
+struct MirrorPadOperator : Operator {
+  MirrorPadOperator() : Operator(OperatorType::kMirrorPad) {}
+  // mode is either SYMMETRIC or REFLECT.
+  MirrorPadMode mode;
+};
+
 // Alloc's are used for transient arrays only. An Alloc specifies which interval
 // of the "transient_data" workspace buffer passed to inference functions, is to
 // be used for the transient array at hand. The 'start' and 'end' values are
diff --git a/tensorflow/lite/toco/tensorflow_graph_matching/BUILD b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
index 56acc284cc06d6bb8a277adb15aacfee5b1e781c..ae361bf212daeae5cede941111329b2265962ce6 100644
--- a/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
+++ b/tensorflow/lite/toco/tensorflow_graph_matching/BUILD
@@ -60,7 +60,6 @@ cc_library(
 tf_cc_test(
     name = "resolve_svdf_test",
     srcs = ["resolve_svdf_test.cc"],
-    tags = ["no_oss"],
     deps = [
         ":cluster",
         ":cluster_utils",
diff --git a/tensorflow/lite/toco/tflite/BUILD b/tensorflow/lite/toco/tflite/BUILD
index 99c4f8edebe5186400253d01689520230594b885..36ca638ee8c83f6cc1d887a0efaf2b0676f95bd8 100644
--- a/tensorflow/lite/toco/tflite/BUILD
+++ b/tensorflow/lite/toco/tflite/BUILD
@@ -41,7 +41,6 @@ tf_cc_test(
     srcs = [
         "operator_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":operator",
         "//tensorflow/core:ops",
@@ -72,7 +71,6 @@ tf_cc_test(
     srcs = [
         "types_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":types",
         "//tensorflow/core:ops",
@@ -107,7 +105,6 @@ tf_cc_test(
     srcs = [
         "export_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":export",
         "//tensorflow/core:ops",
@@ -142,7 +139,6 @@ tf_cc_test(
     srcs = [
         "import_test.cc",
     ],
-    tags = ["no_oss"],
     deps = [
         ":import",
         "//tensorflow/core:ops",
diff --git a/tensorflow/lite/toco/tflite/export.cc b/tensorflow/lite/toco/tflite/export.cc
index 489c21295ef8fc805eb6d587a6d84fd36c5ac3ed..8b9448486dfb60695cddda9dc320c4ab616e8217 100644
--- a/tensorflow/lite/toco/tflite/export.cc
+++ b/tensorflow/lite/toco/tflite/export.cc
@@ -126,7 +126,6 @@ OperatorKey::OperatorKey(
     type_ = builtin_ops.at(name);
     return;
   }
-
   // The logic below is all for custom ops or Flex ops.
   is_custom_op_ = true;
   type_ = BuiltinOperator_CUSTOM;
@@ -332,6 +331,11 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     std::set<int32_t>* variable_tensor_indices, const ExportParams& params) {
   variable_tensor_indices->clear();
 
+  auto is_tflite_builtin = [](const BaseOperator* op) {
+    const auto& tflite_builtins = GetBuiltinOpsMap();
+    return (op && tflite_builtins.find(op->name()) != tflite_builtins.end());
+  };
+
   // The operators are in execution order, so we just follow tf.mini order.
   std::vector<Offset<Operator>> op_vector;
   for (const auto& op : model.operators) {
@@ -360,7 +364,19 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
     auto options = Options::Custom(0);
 
     std::vector<bool> mutating_input_variables;
-    if (tflite_op) {
+
+    // It is conceivable that an op is exportable via Serialize() but does not
+    // have a corresponding TFLITE builtin. In that case, when flex mode is
+    // enabled we should export it as a flex op, not as a native.
+    bool export_as_flex_op = !is_tflite_builtin(tflite_op) &&
+                             key.is_flex_op() &&
+                             !op->tensorflow_node_def.empty();
+    if (export_as_flex_op) {
+      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
+      if (fbb) {
+        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
+      }
+    } else if (tflite_op) {
       options = tflite_op->Serialize(*op, builder);
       mutating_input_variables = tflite_op->GetMutatingInputVariables(*op);
 
@@ -373,12 +389,13 @@ Offset<Vector<Offset<Operator>>> ExportOperators(
           variable_tensor_indices->insert(variable_tensor_index);
         }
       }
-    } else if (key.is_flex_op() && !op->tensorflow_node_def.empty()) {
-      auto fbb = WriteFlexOpOptions(op->tensorflow_node_def);
-      if (fbb) {
-        options = Options::Custom(builder->CreateVector(fbb->GetBuffer()));
-      }
+    } else {
+      // We don't know much about this op. It doesn't have a serializer and
+      // it is not supposed to be exported as a flex op. We will treat it as
+      // a regular custom op: we will still create an operator for it, but it
+      // will not have any 'options'.
     }
+
     // The only supported CustomOptionFormat is FLEXBUFFERS now.
     op_vector.push_back(CreateOperator(
         *builder, op_index, builder->CreateVector(inputs),
diff --git a/tensorflow/lite/toco/tflite/export_test.cc b/tensorflow/lite/toco/tflite/export_test.cc
index b6c67772acadccd36ddc37766424d13e86858ffb..b371296784a34e081ae9bc5c1497348d9eb925ba 100644
--- a/tensorflow/lite/toco/tflite/export_test.cc
+++ b/tensorflow/lite/toco/tflite/export_test.cc
@@ -46,6 +46,18 @@ class ExportTest : public ::testing::Test {
         input_model_.operators.emplace_back(new AddOperator);
       } else if (name == "Sub") {
         input_model_.operators.emplace_back(new SubOperator);
+      } else if (name == "Assert") {
+        auto* op = new TensorFlowAssertOperator;
+
+        // Even though assert is known to TOCO, it doesn't have a tflite
+        // serializer, so it has to be exported as a custom op. If we attach a
+        // NodeDef to it, however, it will be exported as a flex op instead.
+        ::tensorflow::NodeDef node_def;
+        node_def.set_name("Assert");
+        node_def.set_op("Assert");
+        node_def.SerializeToString(&op->tensorflow_node_def);
+
+        input_model_.operators.emplace_back(op);
       } else {
         auto* op = new TensorFlowUnsupportedOperator;
         op->tensorflow_op = name;
@@ -232,37 +244,38 @@ class OpSetsTest : public ExportTest {
 TEST_F(OpSetsTest, BuiltinsOnly) {
   // --target_op_set=TFLITE_BUILTINS
   SetAllowedOpSets({kTfLiteBuiltins});
-  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
               ElementsAre());
   EXPECT_THAT(ImportExport({"Add"}), ElementsAre("builtin:ADD"));
 
   // --target_op_set=TFLITE_BUILTINS --allow_custom_ops
   SetAllowedOpSets({kTfLiteBuiltins, kCustomOps});
-  EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
-      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:UnrollAndFold"));
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
+              ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:Assert",
+                          "custom:UnrollAndFold"));
 }
 
 TEST_F(OpSetsTest, TfSelectOnly) {
   // --target_op_set=SELECT_TF_OPS
   SetAllowedOpSets({kSelectTfOps});
-  EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre());
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "RandomUniform",
+                            "UnrollAndFold", "Assert"}),
+              ElementsAre());
   EXPECT_THAT(ImportExport({"Add"}), ElementsAre("custom:FlexAdd"));
 
   // --target_op_set=SELECT_TF_OPS --allow_custom_ops
   SetAllowedOpSets({kSelectTfOps, kCustomOps});
   EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre("custom:AdjustHue", "custom:FlexAdd",
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("custom:AdjustHue", "custom:FlexAdd", "custom:FlexAssert",
                   "custom:FlexRandomUniform", "custom:UnrollAndFold"));
 }
 
 TEST_F(OpSetsTest, BuiltinsAndTfSelect) {
   // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS
   SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps});
-  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold"}),
+  EXPECT_THAT(ImportExport({"Add", "AdjustHue", "UnrollAndFold", "Assert"}),
               ElementsAre());
   EXPECT_THAT(ImportExport({"Add", "RandomUniform"}),
               ElementsAre("builtin:ADD", "custom:FlexRandomUniform"));
@@ -270,9 +283,10 @@ TEST_F(OpSetsTest, BuiltinsAndTfSelect) {
   // --target_op_set=TFLITE_BUILTINS,SELECT_TF_OPS --allow_custom_ops
   SetAllowedOpSets({kTfLiteBuiltins, kSelectTfOps, kCustomOps});
   EXPECT_THAT(
-      ImportExport({"Add", "AdjustHue", "RandomUniform", "UnrollAndFold"}),
-      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:FlexRandomUniform",
-                  "custom:UnrollAndFold"));
+      ImportExport(
+          {"Add", "AdjustHue", "RandomUniform", "UnrollAndFold", "Assert"}),
+      ElementsAre("builtin:ADD", "custom:AdjustHue", "custom:FlexAssert",
+                  "custom:FlexRandomUniform", "custom:UnrollAndFold"));
 }
 
 // This test is based on a hypothetical scenario that dilation is supported
diff --git a/tensorflow/lite/toco/tflite/import.cc b/tensorflow/lite/toco/tflite/import.cc
index 88028aa144f2dcf090153252157a1b9b46e13279..1692f721256090f5a03c4e46dabdbe65be497d16 100644
--- a/tensorflow/lite/toco/tflite/import.cc
+++ b/tensorflow/lite/toco/tflite/import.cc
@@ -165,21 +165,28 @@ void ImportOperators(
   }
 }
 
-void ImportIOTensors(const ::tflite::Model& input_model,
+void ImportIOTensors(const ModelFlags& model_flags,
+                     const ::tflite::Model& input_model,
                      const details::TensorsTable& tensors_table, Model* model) {
-  auto inputs = (*input_model.subgraphs())[0]->inputs();
-  if (inputs) {
-    for (int input : *inputs) {
-      const string& input_name = tensors_table.at(input);
-      model->flags.add_input_arrays()->set_name(input_name);
+  // Import from the first subgraph if input arrays have not been specified.
+  if (model_flags.input_arrays().empty()) {
+    auto inputs = (*input_model.subgraphs())[0]->inputs();
+    if (inputs) {
+      for (int input : *inputs) {
+        const string& input_name = tensors_table.at(input);
+        model->flags.add_input_arrays()->set_name(input_name);
+      }
     }
   }
 
-  auto outputs = (*input_model.subgraphs())[0]->outputs();
-  if (outputs) {
-    for (int output : *outputs) {
-      const string& output_name = tensors_table.at(output);
-      model->flags.add_output_arrays(output_name);
+  // Import from the first subgraph if output arrays have not been specified.
+  if (model_flags.output_arrays().empty()) {
+    auto outputs = (*input_model.subgraphs())[0]->outputs();
+    if (outputs) {
+      for (int output : *outputs) {
+        const string& output_name = tensors_table.at(output);
+        model->flags.add_output_arrays(output_name);
+      }
     }
   }
 }
@@ -219,7 +226,8 @@ std::unique_ptr<Model> Import(const ModelFlags& model_flags,
   ImportTensors(*input_model, model.get());
   ImportOperators(*input_model, ops_by_name, tensors_table, operators_table,
                   model.get());
-  ImportIOTensors(*input_model, tensors_table, model.get());
+
+  ImportIOTensors(model_flags, *input_model, tensors_table, model.get());
 
   UndoWeightsShuffling(model.get());
 
diff --git a/tensorflow/lite/toco/tflite/operator.cc b/tensorflow/lite/toco/tflite/operator.cc
index 84ae4482469053e081114ed66062a4b8d4568538..205af23da57b08c8c62367df1c154bea5e50cc57 100644
--- a/tensorflow/lite/toco/tflite/operator.cc
+++ b/tensorflow/lite/toco/tflite/operator.cc
@@ -978,6 +978,26 @@ class Split
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class SplitV
+    : public BuiltinOperator<TensorFlowSplitVOperator, ::tflite::SplitVOptions,
+                             ::tflite::BuiltinOptions_SplitVOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSplitVOptions(*builder, op.num_split);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->num_split = options.num_splits();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 class StridedSlice
     : public BuiltinOperator<StridedSliceOperator,
                              ::tflite::StridedSliceOptions,
@@ -1218,6 +1238,66 @@ class Unpack : public BuiltinOperator<UnpackOperator, ::tflite::UnpackOptions,
   int GetVersion(const Operator& op) const override { return 1; }
 };
 
+class LeakyRelu
+    : public BuiltinOperator<LeakyReluOperator, ::tflite::LeakyReluOptions,
+                             ::tflite::BuiltinOptions_LeakyReluOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateLeakyReluOptions(*builder, op.alpha);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->alpha = options.alpha();
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class SquaredDifference
+    : public BuiltinOperator<
+          SquaredDifferenceOperator, ::tflite::SquaredDifferenceOptions,
+          ::tflite::BuiltinOptions_SquaredDifferenceOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateSquaredDifferenceOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
+class MirrorPad
+    : public BuiltinOperator<MirrorPadOperator, ::tflite::MirrorPadOptions,
+                             ::tflite::BuiltinOptions_MirrorPadOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateMirrorPadOptions(
+        *builder, op.mode == MirrorPadMode::kReflect
+                      ? ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                      : ::tflite::MirrorPadMode::MirrorPadMode_SYMMETRIC);
+  }
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {
+    op->mode = options.mode() == ::tflite::MirrorPadMode::MirrorPadMode_REFLECT
+                   ? MirrorPadMode::kReflect
+                   : MirrorPadMode::kSymmetric;
+  }
+
+  int GetVersion(const Operator& op) const override { return 1; }
+};
+
 std::unique_ptr<flexbuffers::Builder> WriteFlexOpOptions(
     const string& tensorflow_node_def) {
   auto fbb = absl::make_unique<flexbuffers::Builder>();
@@ -1402,6 +1482,30 @@ class TensorFlowUnsupported : public BaseOperator {
   const bool enable_select_tf_ops_;
 };
 
+class Dequantize
+    : public BuiltinOperator<DequantizeOperator, ::tflite::DequantizeOptions,
+                             ::tflite::BuiltinOptions_DequantizeOptions> {
+ public:
+  using BuiltinOperator::BuiltinOperator;
+
+  flatbuffers::Offset<TfLiteOptions> WriteOptions(
+      const TocoOperator& op,
+      flatbuffers::FlatBufferBuilder* builder) const override {
+    return ::tflite::CreateDequantizeOptions(*builder);
+  }
+
+  void ReadOptions(const TfLiteOptions& options,
+                   TocoOperator* op) const override {}
+
+  int GetVersion(const Operator& op) const override {
+    // TODO(suharshs): Dequantize now supports INT8 in addition to
+    // QUANTIZED_UINT8. When TOCO can create models with INT8, we need
+    // to find a way to see the type here and return version 2. Right now
+    // version 2 will only be added by post training quantization tools.
+    return 1;
+  }
+};
+
 namespace {
 // Build a vector containing all the known operators.
 std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
@@ -1447,6 +1551,7 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                     OperatorType::kMaxPool));
   ops.push_back(
       MakeUnique<Mul>(::tflite::BuiltinOperator_MUL, OperatorType::kMul));
+
   ops.push_back(
       MakeUnique<Pad>(::tflite::BuiltinOperator_PAD, OperatorType::kPad));
   ops.push_back(
@@ -1483,6 +1588,8 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                     OperatorType::kSqueeze));
   ops.push_back(
       MakeUnique<Split>(::tflite::BuiltinOperator_SPLIT, OperatorType::kSplit));
+  ops.push_back(MakeUnique<SplitV>(::tflite::BuiltinOperator_SPLIT_V,
+                                   OperatorType::kSplitV));
   ops.push_back(MakeUnique<StridedSlice>(
       ::tflite::BuiltinOperator_STRIDED_SLICE, OperatorType::kStridedSlice));
   ops.push_back(MakeUnique<TopK_V2>(::tflite::BuiltinOperator_TOPK_V2,
@@ -1516,6 +1623,13 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
                                    OperatorType::kOneHot));
   ops.push_back(MakeUnique<Unpack>(::tflite::BuiltinOperator_UNPACK,
                                    OperatorType::kUnpack));
+  ops.push_back(MakeUnique<LeakyRelu>(::tflite::BuiltinOperator_LEAKY_RELU,
+                                      OperatorType::kLeakyRelu));
+  ops.push_back(MakeUnique<SquaredDifference>(
+      ::tflite::BuiltinOperator_SQUARED_DIFFERENCE,
+      OperatorType::kSquaredDifference));
+  ops.push_back(MakeUnique<MirrorPad>(::tflite::BuiltinOperator_MIRROR_PAD,
+                                      OperatorType::kMirrorPad));
 
   // Custom Operators.
   ops.push_back(
@@ -1600,7 +1714,10 @@ std::vector<std::unique_ptr<BaseOperator>> BuildOperatorList(
       "SQUARE", OperatorType::kSquare));
   ops.push_back(MakeUnique<SimpleOperator<TensorFlowZerosLikeOperator>>(
       "ZEROS_LIKE", OperatorType::kZerosLike));
-
+  ops.push_back(
+      MakeUnique<SimpleOperator<AbsOperator>>("ABS", OperatorType::kAbs));
+  ops.push_back(
+      MakeUnique<SimpleOperator<FillOperator>>("FILL", OperatorType::kFill));
   return ops;
 }
 }  // namespace
diff --git a/tensorflow/lite/toco/tflite/operator_test.cc b/tensorflow/lite/toco/tflite/operator_test.cc
index 8a776cbf0be57d906408afcd7d7d7a687a0d4c17..14ec89cd73f19fcd141640bda7bfba6435f59ac7 100644
--- a/tensorflow/lite/toco/tflite/operator_test.cc
+++ b/tensorflow/lite/toco/tflite/operator_test.cc
@@ -151,6 +151,7 @@ TEST_F(OperatorTest, SimpleOperators) {
                                                    OperatorType::kZerosLike);
   CheckSimpleOperator<FloorModOperator>("FLOOR_MOD", OperatorType::kFloorMod);
   CheckSimpleOperator<RangeOperator>("RANGE", OperatorType::kRange);
+  CheckSimpleOperator<FillOperator>("FILL", OperatorType::kFill);
 }
 
 TEST_F(OperatorTest, BuiltinAdd) {
@@ -310,6 +311,14 @@ TEST_F(OperatorTest, CustomSplit) {
   EXPECT_EQ(op.num_split, output_toco_op->num_split);
 }
 
+TEST_F(OperatorTest, CustomSplitV) {
+  TensorFlowSplitVOperator op;
+  op.num_split = 123;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SPLIT_V", OperatorType::kSplitV), op);
+  EXPECT_EQ(op.num_split, output_toco_op->num_split);
+}
+
 TEST_F(OperatorTest, BuiltinAveragePool) {
   AveragePoolOperator op;
   op.fused_activation_function = FusedActivationFunctionType::kRelu6;
@@ -517,6 +526,21 @@ TEST_F(OperatorTest, BuiltinUnpack) {
   EXPECT_EQ(op.axis, output_toco_op->axis);
 }
 
+TEST_F(OperatorTest, BuiltinLeakyRelu) {
+  LeakyReluOperator op;
+  op.alpha = 3;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("LEAKY_RELU", OperatorType::kLeakyRelu), op);
+  EXPECT_EQ(op.alpha, output_toco_op->alpha);
+}
+
+TEST_F(OperatorTest, BuiltinSquaredDifference) {
+  SquaredDifferenceOperator op;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("SQUARED_DIFFERENCE", OperatorType::kSquaredDifference), op);
+  ASSERT_NE(nullptr, output_toco_op.get());
+}
+
 TEST_F(OperatorTest, CustomCTCBeamSearchDecoder) {
   CTCBeamSearchDecoderOperator op;
   op.beam_width = 3;
@@ -592,6 +616,14 @@ TEST_F(OperatorTest, TestShouldExportAsFlexOp) {
   EXPECT_FALSE(ShouldExportAsFlexOp(true, "RFFT"));
 }
 
+TEST_F(OperatorTest, BuiltinMirrorPad) {
+  MirrorPadOperator op;
+  op.mode = MirrorPadMode::kReflect;
+  auto output_toco_op = SerializeAndDeserialize(
+      GetOperator("MIRROR_PAD", OperatorType::kMirrorPad), op);
+  EXPECT_EQ(op.mode, output_toco_op->mode);
+}
+
 }  // namespace
 }  // namespace tflite
 
diff --git a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
index d251589b4832b4fb2c75830f70d291af9fcf9525..039a918af16019292214f982326fba3eb5695c62 100644
--- a/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
+++ b/tensorflow/lite/toco/tflite/whitelisted_flex_ops.cc
@@ -187,6 +187,7 @@ bool IsWhitelistedFlexOp(const std::string& tensorflow_op_name) {
           "MirrorPad",
           "MirrorPadGrad",
           "Mul",
+          "Multinomial",
           "Neg",
           "NextIteration",
           "NonMaxSuppression",
diff --git a/tensorflow/lite/toco/toco.cc b/tensorflow/lite/toco/toco.cc
index 9740015850a05cdbc2ad2e97c508012e1678d998..4a3d6a5848751f4c1d526153bd6f6d08a9f882af 100644
--- a/tensorflow/lite/toco/toco.cc
+++ b/tensorflow/lite/toco/toco.cc
@@ -16,87 +16,9 @@ limitations under the License.
 #include <memory>
 #include <string>
 
-#include "absl/strings/string_view.h"
-#include "tensorflow/lite/toco/model.h"
 #include "tensorflow/lite/toco/model_cmdline_flags.h"
-#include "tensorflow/lite/toco/model_flags.pb.h"
 #include "tensorflow/lite/toco/toco_cmdline_flags.h"
-#include "tensorflow/lite/toco/toco_flags.pb.h"
-#include "tensorflow/lite/toco/toco_port.h"
-#include "tensorflow/lite/toco/toco_tooling.h"
-#include "tensorflow/lite/toco/toco_types.h"
-#include "tensorflow/core/lib/core/errors.h"
-#include "tensorflow/core/platform/logging.h"
-
-namespace toco {
-namespace {
-
-// Checks the permissions of the output file to ensure it is writeable.
-void CheckOutputFilePermissions(const Arg<string>& output_file) {
-  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
-  QCHECK(port::file::Writable(output_file.value()).ok())
-      << "Specified output_file is not writable: " << output_file.value()
-      << ".\n";
-}
-
-// Checks the permissions of the frozen model file.
-void CheckFrozenModelPermissions(const Arg<string>& input_file) {
-  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
-  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file does not exist: " << input_file.value() << ".\n";
-  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
-      << "Specified input_file exists, but is not readable: "
-      << input_file.value() << ".\n";
-}
-
-// Reads the contents of the GraphDef from either the frozen graph file or the
-// SavedModel directory. If it reads the SavedModel directory, it updates the
-// ModelFlags and TocoFlags accordingly.
-void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
-                   const ParsedModelFlags& parsed_model_flags,
-                   TocoFlags* toco_flags, ModelFlags* model_flags,
-                   string* graph_def_contents) {
-  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
-
-  // Ensure savedmodel_directory is not set.
-  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
-      << "Use `tensorflow/lite/python/tflite_convert` script with "
-      << "SavedModel directories.\n";
-
-  // Checks the input file permissions and reads the contents.
-  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
-  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
-                                graph_def_contents, port::file::Defaults())
-            .ok());
-}
-
-tensorflow::Status ToolMain(const ParsedTocoFlags& parsed_toco_flags,
-                            const ParsedModelFlags& parsed_model_flags) {
-  ModelFlags model_flags;
-  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
-
-  TocoFlags toco_flags;
-  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
-
-  string graph_def_contents;
-  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
-                &model_flags, &graph_def_contents);
-  CheckOutputFilePermissions(parsed_toco_flags.output_file);
-
-  std::unique_ptr<Model> model =
-      Import(toco_flags, model_flags, graph_def_contents);
-  Transform(toco_flags, model.get());
-  string output_file_contents;
-  TF_RETURN_IF_ERROR(Export(toco_flags, *model, toco_flags.allow_custom_ops(),
-                            &output_file_contents));
-  TF_RETURN_IF_ERROR(
-      port::file::SetContents(parsed_toco_flags.output_file.value(),
-                              output_file_contents, port::file::Defaults()));
-  return tensorflow::Status();
-}
-
-}  // namespace
-}  // namespace toco
+#include "tensorflow/lite/toco/toco_convert.h"
 
 int main(int argc, char** argv) {
   toco::string msg;
@@ -126,6 +48,6 @@ int main(int argc, char** argv) {
     return 1;
   }
   toco::port::InitGoogle(argv[0], effective_argc, &effective_argv, true);
-  auto status = toco::ToolMain(parsed_toco_flags, parsed_model_flags);
+  auto status = toco::Convert(parsed_toco_flags, parsed_model_flags);
   return status.ok() ? 0 : -1;
 }
diff --git a/tensorflow/lite/toco/toco_convert.cc b/tensorflow/lite/toco/toco_convert.cc
new file mode 100644
index 0000000000000000000000000000000000000000..28e7b10ecd056815c8ca6d7a74f324a18d307451
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.cc
@@ -0,0 +1,108 @@
+/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include <cstdio>
+#include <memory>
+#include <string>
+
+#include "absl/strings/string_view.h"
+#include "tensorflow/lite/toco/model.h"
+#include "tensorflow/lite/toco/model_cmdline_flags.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_cmdline_flags.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+#include "tensorflow/lite/toco/toco_port.h"
+#include "tensorflow/lite/toco/toco_tooling.h"
+#include "tensorflow/lite/toco/toco_types.h"
+#include "tensorflow/core/lib/core/errors.h"
+#include "tensorflow/core/platform/logging.h"
+
+namespace toco {
+namespace {
+
+// Checks the permissions of the output file to ensure it is writeable.
+void CheckOutputFilePermissions(const Arg<string>& output_file) {
+  QCHECK(output_file.specified()) << "Missing required flag --output_file.\n";
+  QCHECK(port::file::Writable(output_file.value()).ok())
+      << "Specified output_file is not writable: " << output_file.value()
+      << ".\n";
+}
+
+// Checks the permissions of the frozen model file.
+void CheckFrozenModelPermissions(const Arg<string>& input_file) {
+  QCHECK(input_file.specified()) << "Missing required flag --input_file.\n";
+  QCHECK(port::file::Exists(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file does not exist: " << input_file.value() << ".\n";
+  QCHECK(port::file::Readable(input_file.value(), port::file::Defaults()).ok())
+      << "Specified input_file exists, but is not readable: "
+      << input_file.value() << ".\n";
+}
+
+// Reads the contents of the GraphDef from either the frozen graph file or the
+// SavedModel directory. If it reads the SavedModel directory, it updates the
+// ModelFlags and TocoFlags accordingly.
+void ReadInputData(const ParsedTocoFlags& parsed_toco_flags,
+                   const ParsedModelFlags& parsed_model_flags,
+                   TocoFlags* toco_flags, ModelFlags* model_flags,
+                   string* graph_def_contents) {
+  port::CheckInitGoogleIsDone("InitGoogle is not done yet.\n");
+
+  // Ensure savedmodel_directory is not set.
+  QCHECK(!parsed_toco_flags.savedmodel_directory.specified())
+      << "Use `tensorflow/lite/python/tflite_convert` script with "
+      << "SavedModel directories.\n";
+
+  // Checks the input file permissions and reads the contents.
+  CheckFrozenModelPermissions(parsed_toco_flags.input_file);
+  CHECK(port::file::GetContents(parsed_toco_flags.input_file.value(),
+                                graph_def_contents, port::file::Defaults())
+            .ok());
+}
+}  // namespace
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents) {
+  std::unique_ptr<Model> model =
+      Import(toco_flags, model_flags, graph_def_contents);
+  Transform(toco_flags, model.get());
+  return Export(toco_flags, *model, toco_flags.allow_custom_ops(),
+                output_file_contents);
+}
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags) {
+  ModelFlags model_flags;
+  ReadModelFlagsFromCommandLineFlags(parsed_model_flags, &model_flags);
+
+  TocoFlags toco_flags;
+  ReadTocoFlagsFromCommandLineFlags(parsed_toco_flags, &toco_flags);
+
+  string graph_def_contents;
+  ReadInputData(parsed_toco_flags, parsed_model_flags, &toco_flags,
+                &model_flags, &graph_def_contents);
+  CheckOutputFilePermissions(parsed_toco_flags.output_file);
+
+  string output_file_contents;
+  TF_RETURN_IF_ERROR(Convert(graph_def_contents, toco_flags, model_flags,
+                             &output_file_contents));
+
+  TF_RETURN_IF_ERROR(
+      port::file::SetContents(parsed_toco_flags.output_file.value(),
+                              output_file_contents, port::file::Defaults()));
+  return tensorflow::Status();
+}
+
+}  // namespace toco
diff --git a/tensorflow/lite/toco/toco_convert.h b/tensorflow/lite/toco/toco_convert.h
new file mode 100644
index 0000000000000000000000000000000000000000..ebbd336d3f50ae63a106387eadb5888c00ed9064
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert.h
@@ -0,0 +1,34 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#ifndef TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+#define TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
+
+#include "tensorflow/core/lib/core/status.h"
+#include "tensorflow/lite/toco/args.h"
+#include "tensorflow/lite/toco/model_flags.pb.h"
+#include "tensorflow/lite/toco/toco_flags.pb.h"
+
+namespace toco {
+
+tensorflow::Status Convert(const string& graph_def_contents,
+                           const TocoFlags& toco_flags,
+                           const ModelFlags& model_flags,
+                           string* output_file_contents);
+
+tensorflow::Status Convert(const ParsedTocoFlags& parsed_toco_flags,
+                           const ParsedModelFlags& parsed_model_flags);
+}  // namespace toco
+
+#endif  // TENSORFLOW_LITE_TOCO_TOCO_CONVERT_H_
diff --git a/tensorflow/lite/toco/toco_convert_test.cc b/tensorflow/lite/toco/toco_convert_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c3c440db94396def2f8cfd40242642767d11a63a
--- /dev/null
+++ b/tensorflow/lite/toco/toco_convert_test.cc
@@ -0,0 +1,173 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+#include "tensorflow/lite/toco/toco_convert.h"
+#include <gmock/gmock.h>
+#include <gtest/gtest.h>
+
+namespace toco {
+namespace {
+
+TEST(TocoTest, MissingInputFile) {
+  ParsedTocoFlags toco_flags;
+  ParsedModelFlags model_flags;
+  EXPECT_DEATH(Convert(toco_flags, model_flags).ok(),
+               "Missing required flag --input_file");
+}
+
+TEST(TocoTest, BadInputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled input_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, MissingOuputArrays) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "This model does not define output arrays, so a --output_arrays "
+               "flag must be given on the command-line");
+}
+
+TEST(TocoTest, BadOutputArray) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input;
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Specified output array .output1. is not produced by any op "
+               "in this graph. Is it a typo. To silence this message, pass "
+               "this flag:  allow_nonexistent_arrays");
+}
+
+TEST(TocoTest, BadOutputFormat) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_DEATH(Convert(input, toco_flags, model_flags, &output).ok(),
+               "Unhandled output_format='FILE_FORMAT_UNKNOWN'");
+}
+
+TEST(TocoTest, SimpleFloatModel) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+  toco_flags.set_output_format(TENSORFLOW_GRAPHDEF);
+
+  // Inputs are automatically selected (but that might not be a good idea).
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "input2"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      input: "input1"
+      input: "input2"
+      op: "Sub"
+      attr { key: "T" value { type: DT_FLOAT } }
+    }
+  )GraphDef";
+
+  string output;
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+TEST(TocoTest, TransientStringTensors) {
+  TocoFlags toco_flags;
+  ModelFlags model_flags;
+
+  toco_flags.set_input_format(TENSORFLOW_GRAPHDEF);
+
+  // We need to do a couple of things to trigger the transient array
+  // initialization code: output format must support memory planning, and the
+  // input array must have a shape.
+  toco_flags.set_output_format(TFLITE);
+
+  model_flags.add_output_arrays("output1");
+  string input = R"GraphDef(
+    node {
+      name: "input1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_STRING } }
+      attr { key: "shape" value { shape { dim { size:1 }}}}
+    }
+    node {
+      name: "indices1"
+      op: "Placeholder"
+      attr { key: "dtype" value { type: DT_INT64 } }
+    }
+    node {
+      name: "intermediate1"
+      op: "Gather"
+      input: "input1"
+      input: "indices1"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+    node {
+      name: "output1"
+      op: "Gather"
+      input: "intermediate1"
+      input: "indices2"
+      attr { key: "Tparams" value { type: DT_STRING } }
+      attr { key: "Tindices" value { type: DT_INT64 } }
+    }
+  )GraphDef";
+
+  string output;
+
+  EXPECT_TRUE(Convert(input, toco_flags, model_flags, &output).ok());
+  EXPECT_TRUE(!output.empty());
+}
+
+}  // namespace
+}  // namespace toco
diff --git a/tensorflow/lite/toco/toco_tooling.cc b/tensorflow/lite/toco/toco_tooling.cc
index 5f96e833fbf4000f1796ca8efbb62fa960ad9544..55a454e66de4d0afce18421450d875911bea01f4 100644
--- a/tensorflow/lite/toco/toco_tooling.cc
+++ b/tensorflow/lite/toco/toco_tooling.cc
@@ -210,7 +210,8 @@ std::unique_ptr<Model> Import(const TocoFlags& toco_flags,
       CheckInvariants(*model);
       break;
     default:
-      LOG(FATAL) << "Unhandled input_format";
+      LOG(FATAL) << "Unhandled input_format='"
+                 << FileFormat_Name(toco_flags.input_format()) << "'";
   }
 
   LogDump(kLogLevelModelChanged, "AT IMPORT", *model);
@@ -308,6 +309,7 @@ void Transform(const TocoFlags& toco_flags, Model* model) {
   // Fix any issues with IO edges. This must happen after any transform that
   // may modify the structure of the edges.
   FixEdgeArrays(model);
+  FixOperatorOrdering(model);
 
   if (quantize_output) {
     // If the user specified default min/max ranges we need to set all arrays
@@ -424,7 +426,8 @@ tensorflow::Status Export(const TocoFlags& toco_flags, const Model& model,
       DumpGraphviz(model, output_file_contents);
       break;
     default:
-      LOG(FATAL) << "Unhandled output_format";
+      LOG(FATAL) << "Unhandled output_format='"
+                 << FileFormat_Name(toco_flags.output_format()) << "'";
   }
   return tensorflow::Status();
 }
diff --git a/tensorflow/lite/toco/tooling_util.cc b/tensorflow/lite/toco/tooling_util.cc
index e33f7c8452f88d8402f51ef1bc55a8dfe95631ec..af4cd386a209d82cb56a877410abe6fbdbf99c7b 100644
--- a/tensorflow/lite/toco/tooling_util.cc
+++ b/tensorflow/lite/toco/tooling_util.cc
@@ -308,6 +308,7 @@ const char* OperatorTypeName(OperatorType type) {
 #define HANDLE_OPERATORTYPENAME_CASE(c) \
   case OperatorType::k##c:              \
     return #c;
+    HANDLE_OPERATORTYPENAME_CASE(Abs)
     HANDLE_OPERATORTYPENAME_CASE(Add)
     HANDLE_OPERATORTYPENAME_CASE(AddN)
     HANDLE_OPERATORTYPENAME_CASE(AveragePool)
@@ -371,6 +372,7 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(Shape)
     HANDLE_OPERATORTYPENAME_CASE(Slice)
     HANDLE_OPERATORTYPENAME_CASE(Split)
+    HANDLE_OPERATORTYPENAME_CASE(SplitV)
     HANDLE_OPERATORTYPENAME_CASE(Sqrt)
     HANDLE_OPERATORTYPENAME_CASE(Square)
     HANDLE_OPERATORTYPENAME_CASE(Switch)
@@ -411,6 +413,9 @@ const char* OperatorTypeName(OperatorType type) {
     HANDLE_OPERATORTYPENAME_CASE(ZerosLike)
     HANDLE_OPERATORTYPENAME_CASE(UnidirectionalSequenceLstm)
     HANDLE_OPERATORTYPENAME_CASE(ResizeNearestNeighbor)
+    HANDLE_OPERATORTYPENAME_CASE(LeakyRelu)
+    HANDLE_OPERATORTYPENAME_CASE(SquaredDifference)
+    HANDLE_OPERATORTYPENAME_CASE(MirrorPad)
     default:
       LOG(FATAL) << "Unhandled op type";
 #undef HANDLE_OPERATORTYPENAME_CASE
@@ -439,6 +444,7 @@ bool OperatorSupportsFusedActivation(OperatorType type) {
     case OperatorType::kMaxPool:
     case OperatorType::kMul:
     case OperatorType::kSub:
+    case OperatorType::kSquaredDifference:
       return true;
     default:
       return false;
@@ -531,12 +537,12 @@ void DumpGraphvizVideoFrame(const Model& model) {
   if (!dump_hashes.count(hash)) {
     LOG(INFO) << "DUMPING GRAPHVIZ VIDEO FRAME: " << dump_id;
     dump_hashes.insert(hash);
-    CHECK(port::file::SetContents(
-              port::file::JoinPath(
-                  dump_options.dump_graphviz,
-                  toco::port::StringF("toco_video_%05d.dot", dump_id)),
-              graphviz_dump, port::file::Defaults())
-              .ok());
+    const auto result = port::file::SetContents(
+        port::file::JoinPath(
+            dump_options.dump_graphviz,
+            toco::port::StringF("toco_video_%05d.dot", dump_id)),
+        graphviz_dump, port::file::Defaults());
+    QCHECK(result.ok()) << result.error_message();
     dump_id++;
   }
 }
@@ -550,14 +556,13 @@ void LogDump(int log_level, const string& message, const Model& model) {
     string graphviz_dump;
 
     DumpGraphviz(model, &graphviz_dump);
-    CHECK(port::file::SetContents(
-              port::file::JoinPath(
-                  dump_options.dump_graphviz,
-                  absl::StrCat("toco_",
-                               absl::StrReplaceAll(message, {{" ", "_"}}),
-                               ".dot")),
-              graphviz_dump, port::file::Defaults())
-              .ok());
+    const auto result = port::file::SetContents(
+        port::file::JoinPath(
+            dump_options.dump_graphviz,
+            absl::StrCat("toco_", absl::StrReplaceAll(message, {{" ", "_"}}),
+                         ".dot")),
+        graphviz_dump, port::file::Defaults());
+    QCHECK(result.ok()) << result.error_message();
   }
 
   if (!VLOG_IS_ON(log_level)) {
@@ -894,6 +899,9 @@ void CheckNonExistentIOArrays(const Model& model) {
         << "\" is not consumed by any op in this graph. " << general_comment;
   }
   for (const string& output_array : model.flags.output_arrays()) {
+    if (IsConstantParameterArray(model, output_array)) {
+      continue;  // It is OK to request that a constant be an output.
+    }
     QCHECK(GetOpWithOutput(model, output_array))
         << "Specified output array \"" << output_array
         << "\" is not produced by any op in this graph. " << general_comment;
@@ -1032,10 +1040,10 @@ void CheckEachArray(const Model& model) {
     if (colon_pos != string::npos) {
       CHECK_EQ(name.substr(colon_pos + 1).find_first_not_of("0123456789"),
                string::npos)
-          << "Array name must only have digits after colon";
+          << "Array '" << name << "' has non-digit characters after colon.";
     }
-    CHECK_GT(colon_pos, 0)
-        << "First character of array name must not be a colon.";
+    CHECK_GT(colon_pos, 0) << "Array '" << name
+                           << "' must not start with a colon.";
   }
 }
 
@@ -1767,6 +1775,14 @@ bool IsAllocatableTransientArray(const Model& model, const string& array_name) {
   if (!array->has_shape()) {
     return false;
   }
+
+  // The size of string tensors is rarely known ahead of time, so all transient
+  // tensors of this type will need to be dynamically allocated.
+  if (array->final_data_type == ArrayDataType::kString ||
+      array->data_type == ArrayDataType::kString) {
+    return false;
+  }
+
   return true;
 }
 
@@ -2207,6 +2223,8 @@ ArrayDataType ConvertIODataTypeToArrayDataType(IODataType type) {
       return ArrayDataType::kFloat;
     case QUANTIZED_UINT8:
       return ArrayDataType::kUint8;
+    case INT8:
+      return ArrayDataType::kInt8;
     case QUANTIZED_INT16:
       return ArrayDataType::kInt16;
     case INT32:
diff --git a/tensorflow/lite/toco/types.proto b/tensorflow/lite/toco/types.proto
index 12f711fd8a3c7cbed103bcf43206966e3c5f72b9..fa911b8a4c80d96790fa16e34dbc3f114b522e45 100644
--- a/tensorflow/lite/toco/types.proto
+++ b/tensorflow/lite/toco/types.proto
@@ -43,4 +43,7 @@ enum IODataType {
 
   // Complex64, not quantized
   COMPLEX64 = 8;
+
+  // Int8, quantized based on QuantizationParameters in schema.
+  INT8 = 9;
 }
diff --git a/tensorflow/lite/tools/BUILD b/tensorflow/lite/tools/BUILD
index 93725b5de473e43c4f7c398a2ac0bf1a52e0b3f2..1d141b5dd01a4a03c65d0c8a119ad62eea224d52 100644
--- a/tensorflow/lite/tools/BUILD
+++ b/tensorflow/lite/tools/BUILD
@@ -53,7 +53,6 @@ cc_test(
         "//tensorflow/lite:testdata/test_model_broken.bin",
     ],
     tags = [
-        "no_oss",
         "tflite_not_portable_android",
         "tflite_not_portable_ios",
     ],
@@ -80,7 +79,6 @@ cc_test(
     size = "small",
     srcs = ["verifier_test.cc"],
     tags = [
-        "no_oss",
         "tflite_not_portable",
     ],
     deps = [
diff --git a/tensorflow/lite/tools/benchmark/BUILD b/tensorflow/lite/tools/benchmark/BUILD
index 583046ad73d67ba9fba76570299fc1331aef07e4..bc47406cd92d406a0900743986ea67a4ba39240e 100644
--- a/tensorflow/lite/tools/benchmark/BUILD
+++ b/tensorflow/lite/tools/benchmark/BUILD
@@ -70,6 +70,7 @@ cc_test(
     deps = [
         ":benchmark_tflite_model_lib",
         ":command_line_flags",
+        "//tensorflow/lite:framework",
         "//tensorflow/lite/testing:util",
         "@com_google_googletest//:gtest",
     ],
diff --git a/tensorflow/lite/tools/benchmark/README.md b/tensorflow/lite/tools/benchmark/README.md
index a71a2fa1c0ec3c17b49c6acd62feacfb029c43d2..a4d9c879eb645019a7626502207e9a3f4e89b1c1 100644
--- a/tensorflow/lite/tools/benchmark/README.md
+++ b/tensorflow/lite/tools/benchmark/README.md
@@ -11,6 +11,11 @@ The instructions below are for running the binary on Desktop and Android,
 for iOS please use the
 [iOS benchmark app](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/ios).
 
+An experimental Android APK wrapper for the benchmark model utility offers more
+faithful execution behavior on Android (via a foreground Activity). It is
+located
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/android).
+
 ## Parameters
 
 The binary takes the following required parameters:
diff --git a/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
new file mode 100644
index 0000000000000000000000000000000000000000..7cdca2885ddabe89bc846f3099dc055d471874b3
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/AndroidManifest.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<manifest xmlns:android="http://schemas.android.com/apk/res/android"
+    package="org.tensorflow.lite.benchmark">
+
+    <!-- Necessary for loading custom models from disk. -->
+    <uses-permission android:name="android.permission.READ_EXTERNAL_STORAGE"/>
+
+    <!-- Target SDK 21 (<23) to avoid the need for requesting storage
+         permissions. This APK will almost always be used from the command-line
+         anyway, and be expicitly installed by the developer. -->
+    <uses-sdk
+        android:minSdkVersion="21"
+        android:targetSdkVersion="21" />
+
+    <application>
+        <!-- This Activity runs the TensorFlow Lite benchmark at creation, using
+             a provided set of arguments, then immediately terminates. -->
+        <activity android:name="org.tensorflow.lite.benchmark.BenchmarkModelActivity"
+                  android:screenOrientation="portrait"
+                  android:label="TFLite Benchmark"
+                  android:theme="@android:style/Theme.NoDisplay"
+                  android:exported="true"
+                  android:noHistory="true" />
+    </application>
+
+</manifest>
diff --git a/tensorflow/lite/tools/benchmark/android/BUILD b/tensorflow/lite/tools/benchmark/android/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a291effddc91d2abd153e9e8422ec7cbf5725c4b
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/BUILD
@@ -0,0 +1,44 @@
+# Description:
+#   BenchmarkModel Android harness for TensorFlow Lite benchmarks.
+package(default_visibility = ["//visibility:private"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow/lite:build_def.bzl", "tflite_jni_binary")
+load("@build_bazel_rules_android//android:rules.bzl", "android_binary")
+
+# See README.md for details about building and executing this benchmark.
+android_binary(
+    name = "benchmark_model",
+    srcs = glob([
+        "src/**/*.java",
+    ]),
+    custom_package = "org.tensorflow.lite.benchmark",
+    manifest = "AndroidManifest.xml",
+    # In some platforms we don't have an Android SDK/NDK and this target
+    # can't be built. We need to prevent the build system from trying to
+    # use the target in that case.
+    tags = ["manual"],
+    deps = [":tensorflowlite_benchmark_native"],
+)
+
+tflite_jni_binary(
+    name = "libtensorflowlite_benchmark.so",
+    srcs = glob([
+        "jni/**/*.cc",
+        "jni/**/*.h",
+    ]),
+    deps = [
+        "//tensorflow/lite/java/jni",
+        "//tensorflow/lite/tools/benchmark:benchmark_tflite_model_lib",
+        "//tensorflow/lite/tools/benchmark:logging",
+    ],
+)
+
+cc_library(
+    name = "tensorflowlite_benchmark_native",
+    srcs = ["libtensorflowlite_benchmark.so"],
+    visibility = ["//visibility:private"],
+)
diff --git a/tensorflow/lite/tools/benchmark/android/README.md b/tensorflow/lite/tools/benchmark/android/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f5b67e3f79aa669c5424d46c23f053213ad3a101
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/README.md
@@ -0,0 +1,65 @@
+# TFLite Android Model Benchmark Tool
+
+## Description
+
+This Android benchmark app is a simple wrapper around the TensorFlow Lite
+[command-line benchmark utility](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark).
+
+Pushing and executing binaries directly on Android is a valid approach to
+benchmarking, but it can result in subtle (but observable) differences in
+performance relative to execution within an actual Android app. In particular,
+Android's scheduler tailors behavior based on thread and process priorities,
+which differ between a foreground Activity/Application and a regular background
+binary executed via `adb shell ...`. This tailored behavior is most evident when
+enabling multi-threaded CPU execution with TensorFlow Lite.
+
+To that end, this app offers perhaps a more faithful view of runtime performance
+that developers can expected when deploying TensorFlow Lite with their
+application.
+
+## To build/install/run
+
+(0) Refer to
+https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/android
+to edit the `WORKSPACE` to configure the android NDK/SDK.
+
+(1) Build for your specific platform, e.g.:
+
+```
+bazel build -c opt \
+  --config=android_arm64 \
+  --cxxopt='--std=c++11' \
+  tensorflow/lite/tools/benchmark/android:benchmark_model
+```
+
+(2) Connect your phone. Install the benchmark APK to your phone with adb:
+
+```
+adb install -r -d bazel-bin/tensorflow/lite/tools/benchmark/android/benchmark_model.apk
+```
+
+(3) Push the compute graph that you need to test.
+
+```
+adb push mobilenet_quant_v1_224.tflite /data/local/tmp
+```
+
+(4) Run the benchmark. Additional command-line flags are documented
+[here](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/tools/benchmark/README.md)
+and can be appended to the `args` string alongside the required `--graph` flag
+(note that all args must be nested in the single quoted string that follows the
+args key).
+
+```
+adb shell am start -S -n
+  org.tensorflow.lite.benchmark/org.tensorflow.lite.benchmark.BenchmarkModelActivity \
+  --es args '"--graph=/data/local/tmp/mobilenet_quant_v1_224.tflite --num_threads=4"'
+```
+
+(5) The results will be available in Android's logcat, e.g.:
+
+```
+adb logcat | grep "Average inference"
+
+... tflite  : Average inference timings in us: Warmup: 91471, Init: 4108, Inference: 80660.1
+```
diff --git a/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ee67bdafb0d3dd84ca1eaba8062e385887f3eb74
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/jni/benchmark_model_jni.cc
@@ -0,0 +1,92 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#include <jni.h>
+
+#include <sstream>
+#include <string>
+
+#include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
+#include "tensorflow/lite/tools/benchmark/logging.h"
+
+#ifdef __ANDROID__
+#include <android/log.h>
+#endif
+
+namespace tflite {
+namespace benchmark {
+namespace {
+
+class AndroidBenchmarkLoggingListener : public BenchmarkListener {
+  void OnBenchmarkEnd(const BenchmarkResults& results) override {
+    auto inference_us = results.inference_time_us();
+    auto init_us = results.startup_latency_us();
+    auto warmup_us = results.warmup_time_us();
+    std::stringstream results_output;
+    results_output << "Average inference timings in us: "
+                   << "Warmup: " << warmup_us.avg() << ", "
+                   << "Init: " << init_us << ", "
+                   << "Inference: " << inference_us.avg();
+#ifdef __ANDROID__
+    __android_log_print(ANDROID_LOG_ERROR, "tflite", "%s",
+                        results_output.str().c_str());
+#else
+    fprintf(stderr, "%s", results_output.str().c_str());
+#endif
+  }
+};
+
+void Run(int argc, char** argv) {
+  BenchmarkTfLiteModel benchmark;
+  AndroidBenchmarkLoggingListener listener;
+  benchmark.AddListener(&listener);
+  benchmark.Run(argc, argv);
+}
+
+}  // namespace
+}  // namespace benchmark
+}  // namespace tflite
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+JNIEXPORT void JNICALL
+Java_org_tensorflow_lite_benchmark_BenchmarkModel_nativeRun(JNIEnv* env,
+                                                            jclass clazz,
+                                                            jstring args_obj) {
+  const char* args_chars = env->GetStringUTFChars(args_obj, nullptr);
+
+  // Split the args string into individual arg tokens.
+  std::istringstream iss(args_chars);
+  std::vector<std::string> args_split{std::istream_iterator<std::string>(iss),
+                                      {}};
+
+  // Construct a fake argv command-line object for the benchmark.
+  std::vector<char*> argv;
+  std::string arg0 = "(BenchmarkModelAndroid)";
+  argv.push_back(const_cast<char*>(arg0.data()));
+  for (auto& arg : args_split) {
+    argv.push_back(const_cast<char*>(arg.data()));
+  }
+
+  tflite::benchmark::Run(static_cast<int>(argv.size()), argv.data());
+
+  env->ReleaseStringUTFChars(args_obj, args_chars);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
diff --git a/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
new file mode 100644
index 0000000000000000000000000000000000000000..a6cf8d78d5703300b3576ab3221326a2335e602e
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModel.java
@@ -0,0 +1,32 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.benchmark;
+
+/** Helper class for running a native TensorFlow Lite benchmark. */
+class BenchmarkModel {
+  static {
+    System.loadLibrary("tensorflowlite_benchmark");
+  }
+
+  // Executes a standard TensorFlow Lite benchmark according to the provided args.
+  //
+  // Note that {@code args} will be split by the native execution code.
+  public static void run(String args) {
+    nativeRun(args);
+  }
+
+  private static native void nativeRun(String args);
+}
diff --git a/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
new file mode 100644
index 0000000000000000000000000000000000000000..12410adf3d6687ffa514c6ba21981fb19286fe62
--- /dev/null
+++ b/tensorflow/lite/tools/benchmark/android/src/org/tensorflow/lite/benchmark/BenchmarkModelActivity.java
@@ -0,0 +1,44 @@
+/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+package org.tensorflow.lite.benchmark;
+
+import android.app.Activity;
+import android.content.Intent;
+import android.os.Bundle;
+import android.util.Log;
+
+/** Main {@code Activity} class for the benchmark app. */
+public class BenchmarkModelActivity extends Activity {
+
+  private static final String TAG = "tflite_BenchmarkModelActivity";
+
+  private static final String ARGS_INTENT_KEY_0 = "args";
+  private static final String ARGS_INTENT_KEY_1 = "--args";
+
+  @Override
+  public void onCreate(Bundle savedInstanceState) {
+    super.onCreate(savedInstanceState);
+
+    Intent intent = getIntent();
+    Bundle bundle = intent.getExtras();
+    String args = bundle.getString(ARGS_INTENT_KEY_0, bundle.getString(ARGS_INTENT_KEY_1));
+    Log.i(TAG, "Running TensorFlow Lite benchmark with args: " + args);
+
+    BenchmarkModel.run(args);
+
+    finish();
+  }
+}
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.cc b/tensorflow/lite/tools/benchmark/benchmark_model.cc
index 05148aea65b6e5a517f5a538f26909994d53cc4f..e9b485efcaa81b011c598d5dfa39d4f253090dc8 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.cc
@@ -51,11 +51,13 @@ using tensorflow::Stat;
 BenchmarkParams BenchmarkModel::DefaultParams() {
   BenchmarkParams params;
   params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(50));
+  params.AddParam("min_secs", BenchmarkParam::Create<float>(1.0f));
   params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
   params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
   params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
   params.AddParam("output_prefix", BenchmarkParam::Create<std::string>(""));
   params.AddParam("warmup_runs", BenchmarkParam::Create<int32_t>(1));
+  params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
   return params;
 }
 
@@ -73,19 +75,34 @@ void BenchmarkLoggingListener::OnBenchmarkEnd(const BenchmarkResults &results) {
 
 std::vector<Flag> BenchmarkModel::GetFlags() {
   return {
-      CreateFlag<int32_t>("num_runs", &params_, "number of runs"),
+      CreateFlag<int32_t>("num_runs", &params_,
+                          "minimum number of runs, see also min_secs"),
+      CreateFlag<float>(
+          "min_secs", &params_,
+          "minimum number of seconds to rerun for, potentially making the "
+          "actual number of runs to be greater than num_runs"),
       CreateFlag<float>("run_delay", &params_, "delay between runs in seconds"),
       CreateFlag<int32_t>("num_threads", &params_, "number of threads"),
       CreateFlag<std::string>("benchmark_name", &params_, "benchmark name"),
       CreateFlag<std::string>("output_prefix", &params_,
                               "benchmark output prefix"),
-      CreateFlag<int32_t>("warmup_runs", &params_,
-                          "how many runs to initialize model"),
+      CreateFlag<int32_t>(
+          "warmup_runs", &params_,
+          "minimum number of runs performed on initialization, to "
+          "allow performance characteristics to settle, see also "
+          "warmup_min_secs"),
+      CreateFlag<float>(
+          "warmup_min_secs", &params_,
+          "minimum number of seconds to rerun for, potentially making the "
+          "actual number of warm-up runs to be greater than warmup_runs"),
   };
 }
 
 void BenchmarkModel::LogParams() {
-  TFLITE_LOG(INFO) << "Num runs: [" << params_.Get<int32_t>("num_runs") << "]";
+  TFLITE_LOG(INFO) << "Min num runs: [" << params_.Get<int32_t>("num_runs")
+                   << "]";
+  TFLITE_LOG(INFO) << "Min runs duration (seconds): ["
+                   << params_.Get<float>("min_secs") << "]";
   TFLITE_LOG(INFO) << "Inter-run delay (seconds): ["
                    << params_.Get<float>("run_delay") << "]";
   TFLITE_LOG(INFO) << "Num threads: [" << params_.Get<int32_t>("num_threads")
@@ -94,16 +111,24 @@ void BenchmarkModel::LogParams() {
                    << params_.Get<std::string>("benchmark_name") << "]";
   TFLITE_LOG(INFO) << "Output prefix: ["
                    << params_.Get<std::string>("output_prefix") << "]";
-  TFLITE_LOG(INFO) << "Warmup runs: [" << params_.Get<int32_t>("warmup_runs")
-                   << "]";
+  TFLITE_LOG(INFO) << "Min warmup runs: ["
+                   << params_.Get<int32_t>("warmup_runs") << "]";
+  TFLITE_LOG(INFO) << "Min warmup runs duration (seconds): ["
+                   << params_.Get<float>("warmup_min_secs") << "]";
 }
 
 void BenchmarkModel::PrepareInputsAndOutputs() {}
 
-Stat<int64_t> BenchmarkModel::Run(int num_times, RunType run_type) {
+Stat<int64_t> BenchmarkModel::Run(int min_num_times, float min_secs,
+                                  RunType run_type) {
   Stat<int64_t> run_stats;
-  TFLITE_LOG(INFO) << "Running benchmark for " << num_times << " iterations ";
-  for (int run = 0; run < num_times; run++) {
+  TFLITE_LOG(INFO) << "Running benchmark for at least " << min_num_times
+                   << " iterations and at least " << min_secs << " seconds";
+  int64_t min_finish_us =
+      profiling::time::NowMicros() + static_cast<int64_t>(min_secs * 1.e6f);
+  for (int run = 0;
+       run < min_num_times || profiling::time::NowMicros() < min_finish_us;
+       run++) {
     PrepareInputsAndOutputs();
     listeners_.OnSingleRunStart(run_type);
     int64_t start_us = profiling::time::NowMicros();
@@ -145,9 +170,11 @@ void BenchmarkModel::Run() {
 
   uint64_t input_bytes = ComputeInputBytes();
   Stat<int64_t> warmup_time_us =
-      Run(params_.Get<int32_t>("warmup_runs"), WARMUP);
+      Run(params_.Get<int32_t>("warmup_runs"),
+          params_.Get<float>("warmup_min_secs"), WARMUP);
   Stat<int64_t> inference_time_us =
-      Run(params_.Get<int32_t>("num_runs"), REGULAR);
+      Run(params_.Get<int32_t>("num_runs"), params_.Get<float>("min_secs"),
+          REGULAR);
   listeners_.OnBenchmarkEnd(
       {startup_latency_us, input_bytes, warmup_time_us, inference_time_us});
 }
diff --git a/tensorflow/lite/tools/benchmark/benchmark_model.h b/tensorflow/lite/tools/benchmark/benchmark_model.h
index d8a9b05010aba465b58d8b2616df8fce263035ce..31ee5c92aa3f1ffb53f1a39fbc6e1344d92a260c 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_model.h
@@ -150,7 +150,8 @@ class BenchmarkModel {
   bool ParseFlags(int argc, char** argv);
   virtual std::vector<Flag> GetFlags();
   virtual uint64_t ComputeInputBytes() = 0;
-  virtual tensorflow::Stat<int64_t> Run(int num_times, RunType run_type);
+  virtual tensorflow::Stat<int64_t> Run(int min_num_times, float min_secs,
+                                        RunType run_type);
   virtual void PrepareInputsAndOutputs();
   virtual void RunImpl() = 0;
   BenchmarkParams params_;
diff --git a/tensorflow/lite/tools/benchmark/benchmark_test.cc b/tensorflow/lite/tools/benchmark/benchmark_test.cc
index 59d23d90086761422ebbf8f7164a1626cd164dd2..a4f830122f65bcacb0eae4783998cf8bb5611fb9 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_test.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
 
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
+#include "tensorflow/lite/interpreter.h"
 #include "tensorflow/lite/testing/util.h"
 #include "tensorflow/lite/tools/benchmark/benchmark_tflite_model.h"
 #include "tensorflow/lite/tools/benchmark/command_line_flags.h"
@@ -33,6 +34,7 @@ namespace {
 BenchmarkParams CreateParams() {
   BenchmarkParams params;
   params.AddParam("num_runs", BenchmarkParam::Create<int32_t>(2));
+  params.AddParam("min_secs", BenchmarkParam::Create<float>(1.0f));
   params.AddParam("run_delay", BenchmarkParam::Create<float>(-1.0f));
   params.AddParam("num_threads", BenchmarkParam::Create<int32_t>(1));
   params.AddParam("benchmark_name", BenchmarkParam::Create<std::string>(""));
@@ -42,9 +44,19 @@ BenchmarkParams CreateParams() {
   params.AddParam("input_layer", BenchmarkParam::Create<std::string>(""));
   params.AddParam("input_layer_shape", BenchmarkParam::Create<std::string>(""));
   params.AddParam("use_nnapi", BenchmarkParam::Create<bool>(false));
+  params.AddParam("warmup_min_secs", BenchmarkParam::Create<float>(0.5f));
   return params;
 }
 
+class TestBenchmark : public BenchmarkTfLiteModel {
+ public:
+  explicit TestBenchmark(BenchmarkParams params)
+      : BenchmarkTfLiteModel(std::move(params)) {}
+  const tflite::Interpreter* GetInterpreter() { return interpreter.get(); }
+
+  void Prepare() { PrepareInputsAndOutputs(); }
+};
+
 TEST(BenchmarkTest, DoesntCrash) {
   ASSERT_THAT(g_model_path, testing::NotNull());
 
@@ -52,6 +64,37 @@ TEST(BenchmarkTest, DoesntCrash) {
   benchmark.Run();
 }
 
+TEST(BenchmarkTest, ParametersArePopulatedWhenInputShapeIsNotSpecified) {
+  ASSERT_THAT(g_model_path, testing::NotNull());
+
+  TestBenchmark benchmark(CreateParams());
+  benchmark.Init();
+  benchmark.Prepare();
+
+  auto interpreter = benchmark.GetInterpreter();
+  auto inputs = interpreter->inputs();
+  ASSERT_GE(inputs.size(), 1);
+  auto input_tensor = interpreter->tensor(inputs[0]);
+
+  std::vector<char> input_bytes;
+  input_bytes.reserve(input_tensor->bytes);
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    input_bytes.push_back(input_tensor->data.raw_const[i]);
+  }
+  benchmark.Prepare();
+
+  // Expect data is not the same.
+  EXPECT_EQ(input_bytes.size(), input_tensor->bytes);
+  bool is_same = true;
+  for (size_t i = 0; i < input_tensor->bytes; i++) {
+    if (input_bytes[i] != input_tensor->data.raw_const[i]) {
+      is_same = false;
+      break;
+    }
+  }
+  EXPECT_FALSE(is_same);
+}
+
 }  // namespace
 }  // namespace benchmark
 }  // namespace tflite
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
index 777d9dde7dd528168744f2f92751961d549743ff..32cf4e4292a57ebb73abfaeb3d73d5c1e5717f43 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.cc
@@ -30,7 +30,7 @@ limitations under the License.
 #include "tensorflow/lite/tools/benchmark/logging.h"
 
 #ifdef GEMMLOWP_PROFILING
-#include "third_party/gemmlowp/profiling/profiler.h"
+#include "gemmlowp/profiling/profiler.h"
 #endif
 
 #ifdef TFLITE_CUSTOM_OPS_HEADER
@@ -181,7 +181,18 @@ bool PopulateInputLayerInfo(
   return true;
 }
 
-BenchmarkParams GetDefaultParams() {
+std::vector<int> TfLiteIntArrayToVector(const TfLiteIntArray* int_array) {
+  std::vector<int> values;
+  values.reserve(int_array->size);
+  for (size_t i = 0; i < int_array->size; i++) {
+    values.push_back(int_array->data[i]);
+  }
+  return values;
+}
+
+}  // namespace
+
+BenchmarkParams BenchmarkTfLiteModel::DefaultParams() {
   BenchmarkParams default_params = BenchmarkModel::DefaultParams();
   default_params.AddParam("graph", BenchmarkParam::Create<std::string>(""));
   default_params.AddParam("input_layer",
@@ -192,10 +203,8 @@ BenchmarkParams GetDefaultParams() {
   return default_params;
 }
 
-}  // namespace
-
 BenchmarkTfLiteModel::BenchmarkTfLiteModel()
-    : BenchmarkTfLiteModel(GetDefaultParams()) {}
+    : BenchmarkTfLiteModel(DefaultParams()) {}
 
 BenchmarkTfLiteModel::BenchmarkTfLiteModel(BenchmarkParams params)
     : BenchmarkModel(std::move(params)) {
@@ -250,12 +259,10 @@ uint64_t BenchmarkTfLiteModel::ComputeInputBytes() {
 void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
   auto interpreter_inputs = interpreter->inputs();
   // Set the values of the input tensors.
-  for (int j = 0; j < inputs.size(); ++j) {
-    const InputLayerInfo& input = inputs[j];
+  for (int j = 0; j < interpreter_inputs.size(); ++j) {
     int i = interpreter_inputs[j];
     TfLiteTensor* t = interpreter->tensor(i);
-    std::vector<int> sizes = input.shape;
-
+    std::vector<int> sizes = TfLiteIntArrayToVector(t->dims);
     // TODO(ahentz): below we ignore the O-th dimension (number of batches).
     if (t->type == kTfLiteFloat32) {
       FillRandomValue<float>(
@@ -274,12 +281,17 @@ void BenchmarkTfLiteModel::PrepareInputsAndOutputs() {
           interpreter->typed_tensor<uint8_t>(i),
           std::vector<int>(sizes.begin() + 1, sizes.end()),
           []() { return static_cast<uint8_t>(rand()) % 255; });
+    } else if (t->type == kTfLiteInt8) {
+      FillRandomValue<int8_t>(
+          interpreter->typed_tensor<int8_t>(i),
+          std::vector<int>(sizes.begin() + 1, sizes.end()),
+          []() { return static_cast<int8_t>(rand()) % 255 - 127; });
     } else if (t->type == kTfLiteString) {
       tflite::DynamicBuffer buffer;
       FillRandomString(&buffer, sizes, []() {
         return "we're have some friends over saturday to hang out in the yard";
       });
-      buffer.WriteToTensor(interpreter->tensor(i));
+      buffer.WriteToTensor(interpreter->tensor(i), /*new_shape=*/nullptr);
     } else {
       TFLITE_LOG(FATAL) << "Don't know how to populate tensor " << t->name
                         << " of type " << t->type;
@@ -319,6 +331,7 @@ void BenchmarkTfLiteModel::Init() {
   bool use_nnapi = params_.Get<bool>("use_nnapi");
 
   interpreter->UseNNAPI(use_nnapi);
+  ApplyDelegates();
 
   auto interpreter_inputs = interpreter->inputs();
 
diff --git a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
index 401ab5427d3a04be1678e2360d9dc5ca7babce9f..83599e644d1f41f70fd96f3a73f9155d6e62deef 100644
--- a/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
+++ b/tensorflow/lite/tools/benchmark/benchmark_tflite_model.h
@@ -77,11 +77,16 @@ class BenchmarkTfLiteModel : public BenchmarkModel {
   };
 
  protected:
+  static BenchmarkParams DefaultParams();
   void PrepareInputsAndOutputs() override;
 
- private:
+  // Allows installation of custom delegates during initialization
+  virtual void ApplyDelegates() {}
+
   std::unique_ptr<tflite::FlatBufferModel> model;
   std::unique_ptr<tflite::Interpreter> interpreter;
+
+ private:
   std::vector<InputLayerInfo> inputs;
   ProfilingListener profiling_listener_;
   GemmlowpProfilingListener gemmlowp_profiling_listener_;
diff --git a/tensorflow/lite/tools/benchmark/ios/README.md b/tensorflow/lite/tools/benchmark/ios/README.md
index 3dc29d9b94119564344e2060665daf585a3acd2d..fed9e7ea7e8633e00413118fa3e9e4f12d5188a4 100644
--- a/tensorflow/lite/tools/benchmark/ios/README.md
+++ b/tensorflow/lite/tools/benchmark/ios/README.md
@@ -41,3 +41,14 @@ resources that need to be copied.
 
 - Now try running the app. The app has a single button that runs the benchmark
   on the model and displays results in a text view below.
+
+## Profiling
+
+If you want detailed profiling, use the following command:
+
+```bash
+tensorflow/lite/build_ios_universal_lib.sh -p
+```
+
+Then following the same steps above and run the benchmark app. You will see the
+detailed profiling results in the outputs.
diff --git a/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
index 958936a6607effbca927e0c8cd9db7e47617acc8..a5f5bfbbdafc7c11a1340dc26cc2b29d525cca7a 100644
--- a/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
+++ b/tensorflow/lite/tools/benchmark/ios/TFLiteBenchmark/TFLiteBenchmark.xcodeproj/project.pbxproj
@@ -20,7 +20,7 @@
 
 /* Begin PBXFileReference section */
 		6FE7579920D59CE500F01636 /* benchmark_params.json */ = {isa = PBXFileReference; lastKnownFileType = text.json; path = benchmark_params.json; sourceTree = "<group>"; };
-		6FE7579C20D5A5E000F01636 /* benchmark-lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "benchmark-lib.a"; path = "$SRCROOT/../../../../../../../tensorflow/lite/tools/make/gen/lib/benchmark-lib.a"; sourceTree = "<group>"; };
+		6FE7579C20D5A5E000F01636 /* benchmark-lib.a */ = {isa = PBXFileReference; lastKnownFileType = archive.ar; name = "benchmark-lib.a"; path = "$SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib/benchmark-lib.a"; sourceTree = "<group>"; };
 		6FE7579E20D5A6A700F01636 /* Accelerate.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Accelerate.framework; path = System/Library/Frameworks/Accelerate.framework; sourceTree = SDKROOT; };
 		6FE757A020D5AB8000F01636 /* mobilenet_v1_1.0_224.tflite */ = {isa = PBXFileReference; lastKnownFileType = file; path = mobilenet_v1_1.0_224.tflite; sourceTree = "<group>"; };
 		6FE93FF820D592D8008C9FE4 /* TFLiteBenchmark.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = TFLiteBenchmark.app; sourceTree = BUILT_PRODUCTS_DIR; };
@@ -309,19 +309,19 @@
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				CODE_SIGN_STYLE = Automatic;
 				"HEADER_SEARCH_PATHS[arch=*]" = (
-					$SRCROOT/../../../../../../../,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/eigen,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
+					$SRCROOT/../../../../../../,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/eigen,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
 				);
 				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",
 				);
-				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../../tensorflow/lite/tools/make/gen/lib;
+				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib;
 				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
@@ -335,19 +335,19 @@
 				ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
 				CODE_SIGN_STYLE = Automatic;
 				"HEADER_SEARCH_PATHS[arch=*]" = (
-					$SRCROOT/../../../../../../../,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/eigen,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
-					$SRCROOT/../../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
+					$SRCROOT/../../../../../../,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/eigen,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/gemmlowp,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/neon_2_sse,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/farmhash/src,
+					$SRCROOT/../../../../../../tensorflow/lite/tools/make/downloads/flatbuffers/include,
 				);
 				INFOPLIST_FILE = TFLiteBenchmark/Info.plist;
 				LD_RUNPATH_SEARCH_PATHS = (
 					"$(inherited)",
 					"@executable_path/Frameworks",
 				);
-				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../../tensorflow/lite/tools/make/gen/lib;
+				"LIBRARY_SEARCH_PATHS[arch=*]" = $SRCROOT/../../../../../../tensorflow/lite/tools/make/gen/lib;
 				PRODUCT_BUNDLE_IDENTIFIER = example.TFLiteBenchmark;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				TARGETED_DEVICE_FAMILY = "1,2";
diff --git a/tensorflow/lite/tools/make/Makefile b/tensorflow/lite/tools/make/Makefile
index 8f123558545723a603646523aad5dd47cb620e46..994f660dba7742de162525dcf6a8c6a288ee71c6 100644
--- a/tensorflow/lite/tools/make/Makefile
+++ b/tensorflow/lite/tools/make/Makefile
@@ -52,6 +52,7 @@ LIBS := \
 # generate things like the protobuf compiler that require that), so all of
 # these settings are for the target compiler.
 CXXFLAGS := -O3 -DNDEBUG
+CXXFLAGS += $(EXTRA_CXXFLAGS)
 CCFLAGS := ${CXXFLAGS}
 CXXFLAGS += --std=c++11
 CFLAGS :=
@@ -85,6 +86,7 @@ CORE_CC_ALL_SRCS := \
 $(wildcard tensorflow/lite/*.cc) \
 $(wildcard tensorflow/lite/*.c) \
 $(wildcard tensorflow/lite/c/*.c) \
+$(wildcard tensorflow/lite/core/*.cc) \
 $(wildcard tensorflow/lite/core/api/*.cc)
 ifneq ($(BUILD_TYPE),micro)
 CORE_CC_ALL_SRCS += \
@@ -113,6 +115,10 @@ ifeq ($(BUILD_TYPE),micro)
 CORE_CC_EXCLUDE_SRCS += \
 tensorflow/lite/mmap_allocation.cc \
 tensorflow/lite/nnapi_delegate.cc
+else
+CORE_CC_EXCLUDE_SRCS += \
+tensorflow/lite/mmap_allocation_disabled.cc \
+tensorflow/lite/nnapi_delegate_disabled.cc
 endif
 # Filter out all the excluded files.
 TF_LITE_CC_SRCS := $(filter-out $(CORE_CC_EXCLUDE_SRCS), $(CORE_CC_ALL_SRCS))
@@ -208,6 +214,9 @@ $(BENCHMARK_BINARY) : $(BENCHMARK_LIB)
 
 benchmark: $(BENCHMARK_BINARY)
 
+libdir:
+	@echo $(LIBDIR)
+
 # Gets rid of all generated files.
 clean:
 	rm -rf $(MAKEFILE_DIR)/gen
diff --git a/tensorflow/lite/tools/make/build_ios_universal_lib.sh b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
index 6e0d262827f0944918580d073f082d20e0e1803b..8b617ef5937a062261ee23bed3cfd1f40e6a3995 100755
--- a/tensorflow/lite/tools/make/build_ios_universal_lib.sh
+++ b/tensorflow/lite/tools/make/build_ios_universal_lib.sh
@@ -19,20 +19,36 @@ set -e
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 cd "$SCRIPT_DIR/../../../.."
 
+usage() {
+  echo "Usage: $(basename "$0") [-a]"
+  echo "-a [build_arch] build for specified arch comma separate for multiple archs (eg: x86_64 arm64)"
+  echo "  default is [x86_64 armv7 armv7s arm64]"
+  echo "-p enable profiling"
+  exit 1
+}
+
+profiling_args=""
+BUILD_ARCHS="x86_64 armv7 armv7s arm64"
+while getopts "a:p" opt_name; do
+  case "$opt_name" in
+    a) BUILD_ARCHS="${OPTARG}";;
+    p) profiling_args='-DGEMMLOWP_PROFILING,-DTFLITE_PROFILING_ENABLED';;
+    *) usage;;
+  esac
+done
+shift $(($OPTIND - 1))
+
 # Build library for supported architectures and packs them in a fat binary.
 make_library() {
-    for arch in x86_64 armv7 armv7s arm64
+    LIBS=""
+    for arch in $BUILD_ARCHS
     do
         make -f tensorflow/lite/tools/make/Makefile TARGET=ios TARGET_ARCH=${arch} \
-        -j 8
+            EXTRA_CXXFLAGS=$profiling_args -j 8
+        LIBS="${LIBS} tensorflow/lite/tools/make/gen/ios_${arch}/lib/${1}"
     done
     mkdir -p tensorflow/lite/tools/make/gen/lib
-    lipo \
-    tensorflow/lite/tools/make/gen/ios_x86_64/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_armv7/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_armv7s/lib/${1} \
-    tensorflow/lite/tools/make/gen/ios_arm64/lib/${1} \
-    -create \
+    lipo $LIBS -create \
     -output tensorflow/lite/tools/make/gen/lib/${1}
 }
 
diff --git a/tensorflow/lite/tools/make/targets/ios_makefile.inc b/tensorflow/lite/tools/make/targets/ios_makefile.inc
index 7f36b8ecef4715a4b89e74bd9ef17d28bbf72ae2..ae9276f9a6382b744801b01eec031cf9a6047398 100644
--- a/tensorflow/lite/tools/make/targets/ios_makefile.inc
+++ b/tensorflow/lite/tools/make/targets/ios_makefile.inc
@@ -22,7 +22,7 @@ ifeq ($(TARGET), ios)
 	TARGET_ARCH := x86_64
 	CXXFLAGS += -miphoneos-version-min=$(MIN_SDK_VERSION) \
 		-DGEMMLOWP_ALLOW_SLOW_SCALAR_FALLBACK \
-		-DTFLITE_USE_APPLE_ACCELERATE_FOR_CONV \
+		-DTF_LITE_USE_CBLAS \
 		-fembed-bitcode \
 		-Wno-c++11-narrowing \
 		-mno-thumb \
diff --git a/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md b/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
index 2517882c84c3079d134e2b144aae2bcfec4059af..cea164c38f0d78eb5797a97da51b1e2dee861b29 100644
--- a/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
+++ b/tensorflow/lite/tools/optimize/g3doc/quantize_weights.md
@@ -3,7 +3,7 @@
 ## Recommended usage
 
 The Quantize Weights transformation is integrated with
-[tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/toco/g3doc/cmdline_reference.md#transformation-flags).
+[tflite_convert](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/g3doc/convert/cmdline_reference.md#transformation-flags).
 
 The recommended way of invoking this tool is by simply adding the
 `--post_training_quantize` flag to your original tflite_convert invocation. For
diff --git a/tensorflow/lite/tools/pip_package/MANIFEST.in b/tensorflow/lite/tools/pip_package/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..bb574e63a372da96841efbc70b8e213a943213c6
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/MANIFEST.in
@@ -0,0 +1 @@
+recursive-include * *.py
diff --git a/tensorflow/lite/tools/pip_package/README.md b/tensorflow/lite/tools/pip_package/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8190782c39fcb910749fb466b7075dd628cdd554
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/README.md
@@ -0,0 +1,33 @@
+# Building TensorFlow Lite Standalone Pip
+
+Many users would like to deploy TensorFlow lite interpreter and use it from
+Python without requiring the rest of TensorFlow.
+
+## Steps
+
+To build a binary wheel run this script:
+```
+sudo apt install swig libjpeg-dev zlib1g-dev python3-dev python3-numpy
+sh tensorflow/lite/tools/pip_package/build_pip_package.sh
+```
+That will print out some output and a .whl file. You can then install that
+```
+pip install --upgrade <wheel>
+```
+
+Note, unlike tensorflow this will be installed to a tflite_runtime namespace.
+You can then use the Tensorflow Lite interpreter as.
+```
+import tflite_runtime as tflr
+interpreter = tflr.lite.Interpreter(model_path="foo.tflite")
+```
+
+This currently works to build on Linux machines including Raspberry Pi. In
+the future, cross compilation to smaller SOCs like Raspberry Pi from
+bigger host will be supported.
+
+## Caveats
+
+* You cannot use TensorFlow Select ops, only TensorFlow Lite builtins.
+* Currently custom ops and delegates cannot be registered.
+
diff --git a/tensorflow/lite/tools/pip_package/build_pip_package.sh b/tensorflow/lite/tools/pip_package/build_pip_package.sh
new file mode 100644
index 0000000000000000000000000000000000000000..2887ce84712aa75168bd2b5ae77240f25deddf57
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/build_pip_package.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+set -e
+
+# Find where this script lives and then the Tensorflow root.
+MY_DIRECTORY=`dirname $0`
+export TENSORFLOW_SRC_ROOT=`realpath $MY_DIRECTORY/../../../..`
+
+export TENSORFLOW_VERSION=`grep "_VERSION = " $TENSORFLOW_SRC_ROOT/tensorflow/tools/pip_package/setup.py  | cut -d'=' -f 2 | sed "s/[ '-]//g"`;
+
+
+# Build a pip build tree.
+BUILD_ROOT=/tmp/tflite_pip
+rm -rf $BUILD_ROOT
+mkdir -p $BUILD_ROOT/tflite_runtime/lite
+mkdir -p $BUILD_ROOT/tflite_runtime/lite/python
+
+# Build an importable module tree
+cat > $BUILD_ROOT/tflite_runtime/__init__.py <<EOF;
+import tflite_runtime.lite.interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/__init__.py <<EOF;
+from interpreter import Interpreter as Interpreter
+EOF
+
+cat > $BUILD_ROOT/tflite_runtime/lite/python/__init__.py <<EOF;
+# Python module for TensorFlow Lite
+EOF
+
+# Copy necessary source files
+TFLITE_ROOT=$TENSORFLOW_SRC_ROOT/tensorflow/lite
+cp -r  $TFLITE_ROOT/python/interpreter_wrapper $BUILD_ROOT
+cp $TFLITE_ROOT/python/interpreter.py $BUILD_ROOT/tflite_runtime/lite/
+cp $TFLITE_ROOT/tools/pip_package/setup.py $BUILD_ROOT
+cp $TFLITE_ROOT/tools/pip_package/MANIFEST.in $BUILD_ROOT
+
+# Build the Pip
+cd $BUILD_ROOT
+python setup.py bdist_wheel
diff --git a/tensorflow/lite/tools/pip_package/setup.py b/tensorflow/lite/tools/pip_package/setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..64d62ee1f2d5d0cc1fa1d1804c637f8220937128
--- /dev/null
+++ b/tensorflow/lite/tools/pip_package/setup.py
@@ -0,0 +1,150 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""TensorFlow Lite is for mobile and embedded devices.
+
+TensorFlow Lite is the official solution for running machine learning models on
+mobile and embedded devices. It enables on-device machine learning inference
+with low latency and a small binary size on Android, iOS, and other operating
+systems.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import multiprocessing
+import os
+import subprocess
+
+from distutils.command.build_ext import build_ext
+import numpy
+
+from setuptools import Extension
+from setuptools import find_packages
+from setuptools import setup
+from setuptools.command.build_py import build_py
+PACKAGE_NAME = 'tflite-runtime'
+PACKAGE_VERSION = os.environ['TENSORFLOW_VERSION']
+DOCLINES = __doc__.split('\n')
+PACKAGE = 'tflite_runtime.lite.python'
+TENSORFLOW_DIR = os.environ['TENSORFLOW_SRC_ROOT']
+
+# Setup cross compiling
+TARGET = (
+    os.environ['TENSORFLOW_TARGET'] if 'TENSORFLOW_TARGET' in os.environ
+    else None)
+if TARGET == 'rpi':
+  os.environ['CXX'] = 'arm-linux-gnueabihf-g++'
+  os.environ['CC'] = 'arm-linux-gnueabihf-g++'
+MAKE_CROSS_OPTIONS = ['TARGET=%s' % TARGET]  if TARGET else []
+
+RELATIVE_MAKE_DIR = os.path.join('tensorflow', 'lite', 'tools', 'make')
+MAKE_DIR = os.path.join(TENSORFLOW_DIR, RELATIVE_MAKE_DIR)
+DOWNLOADS_DIR = os.path.join(MAKE_DIR, 'downloads')
+RELATIVE_MAKEFILE_PATH = os.path.join(RELATIVE_MAKE_DIR, 'Makefile')
+DOWNLOAD_SCRIPT_PATH = os.path.join(MAKE_DIR, 'download_dependencies.sh')
+
+
+def make_args(target='', quiet=True):
+  """Construct make command line."""
+  args = (['make', 'SHELL=/bin/bash', '-C', TENSORFLOW_DIR]
+          + MAKE_CROSS_OPTIONS +
+          ['-f', RELATIVE_MAKEFILE_PATH, '-j',
+           str(multiprocessing.cpu_count())])
+  if quiet:
+    args.append('--quiet')
+  if target:
+    args.append(target)
+  return args
+
+
+def make_output(target):
+  """Invoke make on the target and return output."""
+  return subprocess.check_output(make_args(target)).decode('utf-8').strip()
+
+
+def make():
+  """Invoke make to build tflite C++ sources.
+
+  Build dependencies:
+     apt-get install swig libjpeg-dev zlib1g-dev python3-dev python3-nump
+  """
+  subprocess.check_call(make_args(quiet=False))
+
+
+def download_dependencies():
+  """Download build dependencies if haven't done yet."""
+  if not os.path.isdir(DOWNLOADS_DIR) or not os.listdir(DOWNLOADS_DIR):
+    subprocess.check_call(DOWNLOAD_SCRIPT_PATH)
+
+
+class CustomBuildExt(build_ext, object):
+
+  def run(self):
+    download_dependencies()
+    make()
+
+    return super(CustomBuildExt, self).run()
+
+
+class CustomBuildPy(build_py, object):
+
+  def run(self):
+    self.run_command('build_ext')
+    return super(CustomBuildPy, self).run()
+
+
+LIB_TFLITE = 'tensorflow-lite'
+LIB_TFLITE_DIR = make_output('libdir')
+
+ext = Extension(
+    name='%s._interpreter_wrapper' % PACKAGE,
+    language='c++',
+    sources=['interpreter_wrapper/interpreter_wrapper.i',
+             'interpreter_wrapper/interpreter_wrapper.cc'],
+    swig_opts=['-c++',
+               '-I%s' % TENSORFLOW_DIR,
+               '-module', 'interpreter_wrapper',
+               '-outdir', '.'],
+    extra_compile_args=['-std=c++11'],
+    include_dirs=[TENSORFLOW_DIR,
+                  os.path.join(TENSORFLOW_DIR, 'tensorflow', 'lite', 'tools',
+                               'pip_package'),
+                  numpy.get_include(),
+                  os.path.join(DOWNLOADS_DIR, 'flatbuffers', 'include'),
+                  os.path.join(DOWNLOADS_DIR, 'absl')],
+    libraries=[LIB_TFLITE],
+    library_dirs=[LIB_TFLITE_DIR])
+
+
+setup(
+    name=PACKAGE_NAME,
+    version=PACKAGE_VERSION,
+    description=DOCLINES[0],
+    long_description='\n'.join(DOCLINES[2:]),
+    url='https://www.tensorflow.org/lite/',
+    author='Google Inc.',
+    author_email='opensource@google.com',
+    license='Apache 2.0',
+    include_package_data=True,
+    keywords='tflite tensorflow tensor machine learning',
+    packages=find_packages(exclude=[]),
+    ext_modules=[ext],
+    package_dir={PACKAGE: '.'},
+    cmdclass={
+        'build_ext': CustomBuildExt,
+        'build_py': CustomBuildPy,
+    }
+)
diff --git a/tensorflow/lite/tutorials/mnist_tflite.py b/tensorflow/lite/tutorials/mnist_tflite.py
index 002365717fce9e98dad6bacaaff6cdc4e6f5280a..6cc5846163594d74cfcbd95ab99ddb6a7b67bdf1 100644
--- a/tensorflow/lite/tutorials/mnist_tflite.py
+++ b/tensorflow/lite/tutorials/mnist_tflite.py
@@ -34,8 +34,8 @@ flags = flags.FLAGS
 def test_image_generator():
   # Generates an iterator over images
   with tf.Session() as sess:
-    input_data = dataset.test(
-        flags.data_dir).make_one_shot_iterator().get_next()
+    input_data = tf.compat.v1.data.make_one_shot_iterator(dataset.test(
+        flags.data_dir)).get_next()
     try:
       while True:
         yield sess.run(input_data)
diff --git a/tensorflow/lite/tutorials/post_training_quant.ipynb b/tensorflow/lite/tutorials/post_training_quant.ipynb
index 3ff145d9ce9291ad4fbc2f49b423d78632019059..394ab0760b5672978e0638c0ff01a8f00442302c 100644
--- a/tensorflow/lite/tutorials/post_training_quant.ipynb
+++ b/tensorflow/lite/tutorials/post_training_quant.ipynb
@@ -235,9 +235,9 @@
         "id": "AT8BgkKmljOy"
       },
       "source": [
-        "Using the python `TocoConverter`, the saved model can be converted into a TFLite model.\n",
+        "Using the python `TFLiteConverter`, the saved model can be converted into a TFLite model.\n",
         "\n",
-        "First load the model using the `TocoConverter`:"
+        "First load the model using the `TFLiteConverter`:"
       ]
     },
     {
@@ -252,7 +252,7 @@
       "source": [
         "import tensorflow as tf\n",
         "tf.enable_eager_execution()\n",
-        "converter = tf.lite.TocoConverter.from_saved_model(saved_model_dir)\n",
+        "converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)\n",
         "tflite_model = converter.convert()"
       ]
     },
@@ -648,7 +648,7 @@
         "graph_def_file = pathlib.Path(archive_path).parent/\"resnet_v2_101_299_frozen.pb\"\n",
         "input_arrays = [\"input\"] \n",
         "output_arrays = [\"output\"]\n",
-        "converter = tf.lite.TocoConverter.from_frozen_graph(\n",
+        "converter = tf.lite.TFLiteConverter.from_frozen_graph(\n",
         "  str(graph_def_file), input_arrays, output_arrays, input_shapes={\"input\":[1,299,299,3]})\n",
         "converter.post_training_quantize = True\n",
         "resnet_tflite_file = graph_def_file.parent/\"resnet_v2_101_quantized.tflite\"\n",
diff --git a/tensorflow/lite/util.h b/tensorflow/lite/util.h
index 64a5b52e2f982bb4f7c4802f9b5b79a6edc0325e..dbb87528d06b6719a29b364711a7c62c273fdb34 100644
--- a/tensorflow/lite/util.h
+++ b/tensorflow/lite/util.h
@@ -52,6 +52,12 @@ bool EqualArrayAndTfLiteIntArray(const TfLiteIntArray* a, const int b_size,
 
 size_t CombineHashes(std::initializer_list<size_t> hashes);
 
+struct TfLiteIntArrayDeleter {
+  void operator()(TfLiteIntArray* a) {
+    if (a) TfLiteIntArrayFree(a);
+  }
+};
+
 }  // namespace tflite
 
 #endif  // TENSORFLOW_LITE_UTIL_H_
diff --git a/tensorflow/opensource_only.files b/tensorflow/opensource_only.files
new file mode 100644
index 0000000000000000000000000000000000000000..688a837dac3fe7db6badfa9688ca7640c7658c7f
--- /dev/null
+++ b/tensorflow/opensource_only.files
@@ -0,0 +1,210 @@
+tensorflow/third_party/mkl/MKL_LICENSE
+tensorflow/third_party/mkl/LICENSE
+tensorflow/third_party/mkl/BUILD
+tensorflow/third_party/mkl/mkl.BUILD
+tensorflow/third_party/mkl/build_defs.bzl
+tensorflow/third_party/backports_weakref.BUILD
+tensorflow/third_party/toolchains/clang6/BUILD
+tensorflow/third_party/toolchains/clang6/README.md
+tensorflow/third_party/toolchains/clang6/repo.bzl
+tensorflow/third_party/toolchains/clang6/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/clang6/clang.BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+tensorflow/third_party/toolchains/preconfig/ubuntu14.04/nccl2/BUILD
+tensorflow/third_party/toolchains/preconfig/generate/workspace.bzl
+tensorflow/third_party/toolchains/preconfig/generate/containers.bzl
+tensorflow/third_party/toolchains/preconfig/generate/generate.bzl
+tensorflow/third_party/toolchains/preconfig/generate/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/bazel_018/dummy_toolchain.bzl
+tensorflow/third_party/toolchains/preconfig/win_1803/py36/BUILD
+tensorflow/third_party/toolchains/preconfig/win_1803/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/build_defs.bzl
+tensorflow/third_party/toolchains/gpus/cuda/BUILD
+tensorflow/third_party/toolchains/gpus/cuda/cuda/cuda_config.h
+tensorflow/third_party/toolchains/gpus/crosstool/BUILD
+tensorflow/third_party/toolchains/gpus/crosstool/CROSSTOOL
+tensorflow/third_party/toolchains/gpus/py/BUILD
+tensorflow/third_party/toolchains/cpus/arm/arm_compiler_configure.bzl
+tensorflow/third_party/toolchains/cpus/arm/CROSSTOOL.tpl
+tensorflow/third_party/toolchains/cpus/arm/BUILD
+tensorflow/third_party/toolchains/cpus/py3/BUILD
+tensorflow/third_party/toolchains/cpus/py/BUILD
+tensorflow/third_party/toolchains/BUILD
+tensorflow/third_party/nccl/remote.BUILD.tpl
+tensorflow/third_party/nccl/archive.BUILD
+tensorflow/third_party/nccl/LICENSE
+tensorflow/third_party/nccl/system.BUILD.tpl
+tensorflow/third_party/nccl/nccl_configure.bzl
+tensorflow/third_party/nccl/build_defs.bzl.tpl
+tensorflow/third_party/nccl/BUILD
+tensorflow/third_party/gpus/BUILD
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_rocm.tpl
+tensorflow/third_party/gpus/crosstool/clang/bin/crosstool_wrapper_driver_is_not_gcc.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL.tpl
+tensorflow/third_party/gpus/crosstool/CROSSTOOL_hipcc.tpl
+tensorflow/third_party/gpus/crosstool/LICENSE
+tensorflow/third_party/gpus/crosstool/remote.BUILD.tpl
+tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.py.tpl
+tensorflow/third_party/gpus/crosstool/windows/msvc_wrapper_for_nvcc.bat.tpl
+tensorflow/third_party/gpus/crosstool/BUILD.tpl
+tensorflow/third_party/gpus/crosstool/BUILD
+tensorflow/third_party/gpus/cuda/LICENSE
+tensorflow/third_party/gpus/cuda/BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD.windows.tpl
+tensorflow/third_party/gpus/cuda/cuda_config.h.tpl
+tensorflow/third_party/gpus/cuda/remote.BUILD.tpl
+tensorflow/third_party/gpus/cuda/BUILD
+tensorflow/third_party/gpus/cuda/build_defs.bzl.tpl
+tensorflow/third_party/gpus/rocm/rocm_config.h.tpl
+tensorflow/third_party/gpus/rocm/BUILD
+tensorflow/third_party/gpus/rocm/BUILD.tpl
+tensorflow/third_party/gpus/rocm/build_defs.bzl.tpl
+tensorflow/third_party/gpus/cuda_configure.bzl
+tensorflow/third_party/gpus/rocm_configure.bzl
+tensorflow/third_party/snappy.BUILD
+tensorflow/third_party/cython.BUILD
+tensorflow/third_party/farmhash.BUILD
+tensorflow/third_party/eigen3/Eigen/Cholesky
+tensorflow/third_party/eigen3/Eigen/QR
+tensorflow/third_party/eigen3/Eigen/LU
+tensorflow/third_party/eigen3/Eigen/Core
+tensorflow/third_party/eigen3/Eigen/SVD
+tensorflow/third_party/eigen3/Eigen/Eigenvalues
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/FixedPoint
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/Tensor
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX512.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/PacketMathAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/FixedPointTypes.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatVecProduct.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductNEON.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/TypeCastingAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/src/FixedPoint/MatMatProductAVX2.h
+tensorflow/third_party/eigen3/unsupported/Eigen/CXX11/ThreadPool
+tensorflow/third_party/eigen3/unsupported/Eigen/SpecialFunctions
+tensorflow/third_party/eigen3/unsupported/Eigen/MatrixFunctions
+tensorflow/third_party/eigen3/LICENSE
+tensorflow/third_party/eigen3/BUILD
+tensorflow/third_party/systemlibs/build_defs.bzl.tpl
+tensorflow/third_party/systemlibs/absl_py.BUILD
+tensorflow/third_party/systemlibs/curl.BUILD
+tensorflow/third_party/systemlibs/termcolor.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.flags.BUILD
+tensorflow/third_party/systemlibs/grpc.BUILD
+tensorflow/third_party/systemlibs/swig.BUILD
+tensorflow/third_party/systemlibs/protobuf.bzl
+tensorflow/third_party/systemlibs/protobuf.BUILD
+tensorflow/third_party/systemlibs/BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.BUILD
+tensorflow/third_party/systemlibs/astor.BUILD
+tensorflow/third_party/systemlibs/six.BUILD
+tensorflow/third_party/systemlibs/absl_py.absl.testing.BUILD
+tensorflow/third_party/systemlibs/boringssl.BUILD
+tensorflow/third_party/systemlibs/nsync.BUILD
+tensorflow/third_party/systemlibs/google_cloud_cpp.google.cloud.bigtable.BUILD
+tensorflow/third_party/systemlibs/gif.BUILD
+tensorflow/third_party/systemlibs/pcre.BUILD
+tensorflow/third_party/systemlibs/BUILD.tpl
+tensorflow/third_party/systemlibs/snappy.BUILD
+tensorflow/third_party/systemlibs/gast.BUILD
+tensorflow/third_party/systemlibs/cython.BUILD
+tensorflow/third_party/systemlibs/double_conversion.BUILD
+tensorflow/third_party/systemlibs/zlib.BUILD
+tensorflow/third_party/systemlibs/jsoncpp.BUILD
+tensorflow/third_party/systemlibs/re2.BUILD
+tensorflow/third_party/systemlibs/lmdb.BUILD
+tensorflow/third_party/systemlibs/googleapis.BUILD
+tensorflow/third_party/systemlibs/png.BUILD
+tensorflow/third_party/systemlibs/syslibs_configure.bzl
+tensorflow/third_party/systemlibs/sqlite.BUILD
+tensorflow/third_party/python_runtime/BUILD
+tensorflow/third_party/sycl/crosstool/BUILD
+tensorflow/third_party/ngraph/LICENSE
+tensorflow/third_party/ngraph/tbb.BUILD
+tensorflow/third_party/ngraph/BUILD
+tensorflow/third_party/ngraph/ngraph.BUILD
+tensorflow/third_party/ngraph/build_defs.bzl
+tensorflow/third_party/ngraph/NGRAPH_LICENSE
+tensorflow/third_party/ngraph/ngraph_tf.BUILD
+tensorflow/third_party/ngraph/nlohmann_json.BUILD
+tensorflow/third_party/clang_toolchain/download_clang.bzl
+tensorflow/third_party/clang_toolchain/BUILD
+tensorflow/third_party/clang_toolchain/cc_configure_clang.bzl
+tensorflow/third_party/gast.BUILD
+tensorflow/third_party/llvm/BUILD
+tensorflow/third_party/llvm/expand_cmake_vars.py
+tensorflow/third_party/llvm/llvm.autogenerated.BUILD
+tensorflow/third_party/llvm/llvm.bzl
+tensorflow/third_party/icu/udata.patch
+tensorflow/third_party/fft2d/BUILD
+tensorflow/third_party/fft2d/fft.h
+tensorflow/third_party/fft2d/LICENSE
+tensorflow/third_party/fft2d/fft2d.BUILD
+tensorflow/third_party/boringssl/BUILD
+tensorflow/third_party/mpi/.gitignore
+tensorflow/third_party/mpi/BUILD
+tensorflow/third_party/tensorrt/LICENSE
+tensorflow/third_party/tensorrt/BUILD
+tensorflow/third_party/tensorrt/build_defs.bzl.tpl
+tensorflow/third_party/tensorrt/BUILD.tpl
+tensorflow/third_party/tensorrt/tensorrt_configure.bzl
+tensorflow/third_party/kafka/config.patch
+tensorflow/third_party/kafka/BUILD
+tensorflow/third_party/android/BUILD
+tensorflow/third_party/android/android.bzl.tpl
+tensorflow/third_party/android/android_configure.bzl
+tensorflow/third_party/android/android_configure.BUILD.tpl
+tensorflow/third_party/tflite_smartreply.BUILD
+tensorflow/third_party/mkl_dnn/LICENSE
+tensorflow/third_party/mkl_dnn/mkldnn.BUILD
+tensorflow/third_party/pcre.BUILD
+tensorflow/third_party/linenoise.BUILD
+tensorflow/third_party/sqlite.BUILD
+tensorflow/third_party/common.bzl
+tensorflow/third_party/com_google_absl.BUILD
+tensorflow/third_party/pprof.BUILD
+tensorflow/third_party/BUILD
+tensorflow/third_party/tflite_mobilenet_quant.BUILD
+tensorflow/third_party/lmdb.BUILD
+tensorflow/third_party/git/BUILD.tpl
+tensorflow/third_party/git/BUILD
+tensorflow/third_party/git/git_configure.bzl
+tensorflow/third_party/protobuf/BUILD
+tensorflow/third_party/tflite_mobilenet.BUILD
+tensorflow/third_party/py/BUILD
+tensorflow/third_party/py/BUILD.tpl
+tensorflow/third_party/py/remote.BUILD.tpl
+tensorflow/third_party/py/numpy/BUILD
+tensorflow/third_party/py/python_configure.bzl
+tensorflow/third_party/termcolor.BUILD
+tensorflow/third_party/png_fix_rpi.patch
+tensorflow/third_party/swig.BUILD
+tensorflow/third_party/astor.BUILD
+tensorflow/third_party/grpc/BUILD
+tensorflow/third_party/curl.BUILD
+tensorflow/third_party/arm_neon_2_x86_sse.BUILD
+tensorflow/third_party/png.BUILD
+tensorflow/third_party/googleapis.BUILD
+tensorflow/third_party/mpi_collectives/BUILD
+tensorflow/third_party/nanopb.BUILD
+tensorflow/third_party/gif.BUILD
+tensorflow/third_party/double_conversion.BUILD
+tensorflow/third_party/six.BUILD
+tensorflow/third_party/tflite_mobilenet_float.BUILD
+tensorflow/third_party/repo.bzl
+tensorflow/third_party/codegen.BUILD
+tensorflow/third_party/cub.BUILD
+tensorflow/third_party/jsoncpp.BUILD
+tensorflow/third_party/tflite_ovic_testdata.BUILD
+tensorflow/third_party/libxsmm.BUILD
+tensorflow/third_party/zlib.BUILD
+tensorflow/third_party/eigen.BUILD
\ No newline at end of file
diff --git a/arm_compiler.BUILD b/tensorflow/opensource_only/arm_compiler.BUILD
similarity index 100%
rename from arm_compiler.BUILD
rename to tensorflow/opensource_only/arm_compiler.BUILD
diff --git a/tensorflow/python/BUILD b/tensorflow/python/BUILD
index 4fe92262ba626f75416557f6aecf4e371815e192..0a3ee65bc48013971c857fc5fb04f397c3edd2aa 100644
--- a/tensorflow/python/BUILD
+++ b/tensorflow/python/BUILD
@@ -102,6 +102,7 @@ py_library(
         ":framework_for_generated_wrappers",
         ":functional_ops",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":graph_util",
         ":histogram_ops",
         ":image_ops",
@@ -124,7 +125,6 @@ py_library(
         ":session_ops",
         ":sets",
         ":sparse_ops",
-        ":spectral_ops",
         ":spectral_ops_test_util",
         ":standard_ops",
         ":state_ops",
@@ -132,6 +132,7 @@ py_library(
         ":subscribe",
         ":summary",
         ":tensor_array_ops",
+        ":tensor_forest_ops",
         ":test_ops",  # TODO: Break testing code out into separate rule.
         ":tf_cluster",
         ":tf_item",
@@ -144,6 +145,7 @@ py_library(
         "//tensorflow/lite/python:lite",
         "//tensorflow/python/compat",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute",
         "//tensorflow/python/distribute:estimator_training",
         "//tensorflow/python/eager:def_function",
         "//tensorflow/python/feature_column:feature_column_py",
@@ -524,6 +526,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "dispatch_test",
+    srcs = ["util/dispatch_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":client_testlib",
+        ":platform",
+        ":util",
+    ],
+)
+
 py_test(
     name = "keyword_args_test",
     srcs = ["util/keyword_args_test.py"],
@@ -854,7 +867,6 @@ py_library(
     deps = [
         ":c_api_util",
         ":control_flow_util",
-        ":cpp_shape_inference_proto_py",
         ":device",
         ":dtypes",
         ":error_interpolation",
@@ -862,6 +874,7 @@ py_library(
         ":platform",
         ":registry",
         ":tensor_shape",
+        ":tf2",
         ":traceable_stack",
         ":util",
         ":versions",
@@ -880,6 +893,8 @@ py_library(
     deps = [
         ":auto_control_deps",
         ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
         "//tensorflow/python/autograph",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:graph_only_ops",
@@ -894,6 +909,8 @@ py_library(
     deps = [
         ":control_flow_ops",
         ":framework_ops",
+        ":sparse_tensor",
+        ":tensor_array_ops",
         ":util",
     ],
 )
@@ -981,6 +998,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":dtypes",
+        ":tf2",
         ":util",
         "//tensorflow/core:protos_all_py",
     ],
@@ -994,6 +1012,7 @@ py_library(
         ":common_shapes",
         ":dtypes",
         ":tensor_shape",
+        ":util",
         "//third_party/py/numpy",
     ],
 )
@@ -1052,6 +1071,7 @@ py_library(
         ":random_seed",
         ":resource_variable_ops",
         ":session",
+        ":tensor_array_ops",
         ":training",
         ":util",
         ":variables",
@@ -1076,10 +1096,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":client",
+        ":cond_v2",
         ":framework_test_lib",
         ":gradient_checker",
+        ":gradient_checker_v2",
         ":platform_test",
         ":util",
+        ":while_v2",
     ],
 )
 
@@ -1384,6 +1407,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = ["no_pip"],  # test_ops_2 is not available in pip.
     deps = [
+        ":cond_v2",
         ":control_flow_ops",
         ":errors",
         ":framework",
@@ -1398,6 +1422,7 @@ py_test(
         ":util",
         ":variable_scope",
         ":variables",
+        ":while_v2",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
@@ -1618,6 +1643,14 @@ tf_gen_op_wrapper_private_py(
     ],
 )
 
+tf_gen_op_wrapper_private_py(
+    name = "tensor_forest_ops_gen",
+    visibility = ["//tensorflow:internal"],
+    deps = [
+        "//tensorflow/core:tensor_forest_ops_op_lib",
+    ],
+)
+
 tf_gen_op_wrapper_private_py(
     name = "summary_ops_gen",
     visibility = ["//tensorflow:__subpackages__"],
@@ -1837,6 +1870,7 @@ tf_gen_op_wrapper_private_py(
 
 tf_gen_op_wrapper_private_py(
     name = "spectral_ops_gen",
+    visibility = ["//tensorflow/python/ops/signal:__pkg__"],
 )
 
 tf_gen_op_wrapper_private_py(
@@ -1941,6 +1975,28 @@ py_library(
     ],
 )
 
+py_library(
+    name = "tensor_forest_ops",
+    srcs = ["ops/tensor_forest_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":ops",
+        ":tensor_forest_ops_gen",
+        ":training",
+        "//tensorflow/core/kernels/boosted_trees:boosted_trees_proto_py",
+    ],
+)
+
+py_library(
+    name = "optional_grad",
+    srcs = ["ops/optional_grad.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":framework_ops",
+    ],
+)
+
 py_library(
     name = "sets",
     srcs = [
@@ -2056,7 +2112,6 @@ py_library(
     srcs = ["ops/control_flow_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "tensor_shape",
         ":array_ops",
         ":array_ops_gen",
         ":constant_op",
@@ -2071,6 +2126,7 @@ py_library(
         ":resource_variable_ops_gen",
         ":sparse_tensor",
         ":tensor_array_ops",
+        ":tensor_shape",
         ":tf2",
         ":tf_should_use",
         ":util",
@@ -2093,7 +2149,9 @@ py_library(
     srcs = ["ops/control_flow_util_v2.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "framework_ops",
+        ":control_flow_util",
+        ":framework_ops",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:function",
     ],
@@ -2118,7 +2176,7 @@ py_library(
         ":graph_to_function_def",
         ":pywrap_tensorflow",
         ":util",
-        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/eager:function",
     ],
 )
@@ -2145,7 +2203,6 @@ py_library(
         ":tensor_shape",
         ":tensor_util",
         ":util",
-        "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:function",
     ],
 )
@@ -2263,10 +2320,10 @@ py_library(
         ":manip_ops",
         ":math_grad",
         ":math_ops",
+        ":optional_grad",
         ":platform",
         ":random_grad",
         ":resource_variable_ops",
-        ":spectral_grad",
         ":tensor_array_ops",
         ":tensor_util",
         ":unconnected_gradients",
@@ -2508,7 +2565,6 @@ py_library(
         ":nn_ops_gen",
         ":sparse_ops_gen",
         ":sparse_tensor",
-        ":spectral_ops_gen",
         ":state_ops",
         ":state_ops_gen",
         ":tensor_shape",
@@ -2814,33 +2870,34 @@ py_test(
         ":framework_test_lib",
         ":sparse_ops",
         ":sparse_tensor",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_library(
-    name = "spectral_grad",
-    srcs = ["ops/spectral_grad.py"],
+    name = "sort_ops",
+    srcs = ["ops/sort_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
         ":framework",
-        ":framework_for_generated_wrappers",
         ":math_ops",
-        ":spectral_ops",
+        ":nn_ops",
         "//third_party/py/numpy",
     ],
 )
 
-py_library(
-    name = "spectral_ops",
-    srcs = ["ops/spectral_ops.py"],
+py_test(
+    name = "sort_ops_test",
+    srcs = ["ops/sort_ops_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":array_ops",
-        ":dtypes",
-        ":framework_ops",
-        ":math_ops",
-        ":spectral_ops_gen",
+        ":client_testlib",
+        ":framework",
+        ":random_ops",
+        ":sort_ops",
+        "//third_party/py/numpy",
     ],
 )
 
@@ -2958,10 +3015,10 @@ py_library(
         ":random_ops",
         ":script_ops",
         ":session_ops",
+        ":sort_ops",
         ":sparse_grad",
         ":sparse_ops",
         ":special_math_ops",
-        ":spectral_grad",
         ":state_grad",
         ":state_ops",
         ":stateless_random_ops",
@@ -2972,6 +3029,7 @@ py_library(
         ":util",
         ":variable_scope",
         ":variables",
+        "//tensorflow/python/eager:wrap_function",
         "//tensorflow/python/ops/distributions",
         "//tensorflow/python/ops/linalg",
     ],
@@ -3066,13 +3124,16 @@ py_library(
     deps = [
         ":array_ops",
         ":constant_op",
+        ":control_flow_ops_gen",
         ":data_flow_ops_gen",
         ":dtypes",
         ":errors",
         ":framework_ops",
+        ":list_ops",
         ":math_ops",
         ":tensor_shape",
         ":tensor_util",
+        ":tf2",
         ":tf_should_use",
         "//tensorflow/python/eager:context",
     ],
@@ -3131,6 +3192,19 @@ py_library(
     ],
 )
 
+py_library(
+    name = "gradient_checker_v2",
+    srcs = ["ops/gradient_checker_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":array_ops",
+        ":framework_for_generated_wrappers",
+        ":gradients",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 # This target is deprecated.
 py_library(
     name = "ops",
@@ -3162,6 +3236,7 @@ cuda_py_test(
     srcs = ["ops/control_flow_ops_test.py"],
     additional_deps = [
         ":array_ops",
+        ":cond_v2",
         ":control_flow_ops",
         ":embedding_ops",
         ":framework_for_generated_wrappers",
@@ -3177,6 +3252,8 @@ cuda_py_test(
         ":util",
         ":variable_scope",
         ":variables",
+        ":while_v2",
+        "//tensorflow/python/eager:def_function",
     ],
 )
 
@@ -3196,6 +3273,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "gradient_checker_v2_test",
+    size = "medium",
+    srcs = ["ops/gradient_checker_v2_test.py"],
+    additional_deps = [
+        ":array_ops",
+        ":client_testlib",
+        ":framework_for_generated_wrappers",
+        ":math_ops",
+        ":nn_grad",
+        ":nn_ops",
+        ":platform",
+        "//third_party/py/numpy",
+    ],
+)
+
 cuda_py_test(
     name = "gradients_test",
     size = "medium",
@@ -3304,6 +3397,9 @@ cuda_py_test(
         ":client_testlib",
         ":framework_for_generated_wrappers",
         ":math_ops",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:execution_callbacks",
         "//third_party/py/numpy",
     ],
     tags = ["no_windows_gpu"],
@@ -3419,13 +3515,13 @@ py_library(
         exclude = [
             "**/*test*",
             "training/checkpointable/**/*.py",
+            "training/saving/**/*.py",
             # The following targets have their own build rules (same name as the
             # file):
             "training/basic_session_run_hooks.py",
             "training/checkpoint_management.py",
             "training/distribute.py",
             "training/distribution_strategy_context.py",
-            "training/saveable_object.py",
             "training/saver.py",
             "training/session_run_hook.py",
             "training/training_util.py",
@@ -3475,6 +3571,7 @@ py_library(
         "@six_archive//:six",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/distribute:reduce_util",
         "//tensorflow/python/distribute:distribute_coordinator_context",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
@@ -3486,10 +3583,17 @@ py_library(
     ],
 )
 
+# Dependency added and used by ClusterResolvers to avoid circular dependency between keras, distribute, and training.
 py_library(
-    name = "saveable_object",
-    srcs = ["training/saveable_object.py"],
+    name = "training_server_lib",
+    srcs = ["training/server_lib.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":framework",
+        ":pywrap_tensorflow",
+        ":util",
+        "//tensorflow/core:protos_all_py",
+    ],
 )
 
 py_library(
@@ -3545,7 +3649,6 @@ py_library(
         ":platform",
         ":pywrap_tensorflow",
         ":resource_variable_ops",
-        ":saveable_object",
         ":session",
         ":state_ops",
         ":string_ops",
@@ -3555,22 +3658,13 @@ py_library(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python/eager:context",
         "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
 )
 
-py_library(
-    name = "device_util",
-    srcs = ["training/device_util.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":device",
-        ":framework_ops",
-        "//tensorflow/python/eager:context",
-    ],
-)
-
 py_library(
     name = "distribute",
     srcs = [
@@ -3579,29 +3673,7 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
-        ":array_ops",
-        ":control_flow_ops",
-        ":device_util",
-        ":framework_ops",
-        ":platform",
-        ":resource_variable_ops",
-        ":state_ops",
-        ":util",
-        ":variable_scope",
-        "//tensorflow/python/data",
-        "//tensorflow/python/ops/losses",
-    ],
-)
-
-py_test(
-    name = "distribute_test",
-    size = "small",
-    srcs = ["training/distribute_test.py"],
-    srcs_version = "PY2AND3",
-    deps = [
-        ":client_testlib",
-        ":distribute",
-        ":variable_scope",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -3761,6 +3833,7 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":util",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python/eager:context",
         "@six_archive//:six",
     ],
@@ -4108,11 +4181,24 @@ genrule(
 
 # Get the import library of  _pywrap_tensorflow_internal.dll
 filegroup(
-    name = "pywrap_tensorflow_import_lib_file",
+    name = "get_pywrap_tensorflow_import_lib_file",
     srcs = [":_pywrap_tensorflow_internal.so"],
     output_group = "interface_library",
 )
 
+# Rename the import library for _pywrap_tensorflow_internal.pyd to _pywrap_tensorflow_internal.lib
+# (It was _pywrap_tensorflow_internal.so.if.lib).
+genrule(
+    name = "pywrap_tensorflow_import_lib_file",
+    srcs = [":get_pywrap_tensorflow_import_lib_file"],
+    outs = ["_pywrap_tensorflow_internal.lib"],
+    cmd = select({
+        "//tensorflow:windows": "cp -f $< $@",
+        "//conditions:default": "touch $@",  # Just a placeholder for Unix platforms
+    }),
+    visibility = ["//visibility:public"],
+)
+
 # Create a cc_import rule for the import library of _pywrap_tensorflow_internal.dll
 # so that custom ops' dynamic libraries can link against it.
 cc_import(
@@ -4590,7 +4676,6 @@ cuda_py_tests(
         "training/basic_loops_test.py",
         "training/coordinator_test.py",
         "training/device_setter_test.py",
-        "training/device_util_test.py",
         "training/ftrl_test.py",
         "training/gradient_descent_test.py",
         "training/learning_rate_decay_test.py",
@@ -4901,7 +4986,7 @@ py_test(
         ":training",
         ":variable_scope",
         ":variables",
-        "//tensorflow/python/feature_column",
+        "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/__init__.py b/tensorflow/python/__init__.py
index 5da304e38cc5c4d6da94479930cc358bc7dda282..b2cc63bd1320700801d4aaf0a9b33c8da7821412 100644
--- a/tensorflow/python/__init__.py
+++ b/tensorflow/python/__init__.py
@@ -78,6 +78,7 @@ from tensorflow.python.ops import initializers_ns as initializers
 
 # Bring in subpackages.
 from tensorflow.python import data
+from tensorflow.python import distribute
 from tensorflow.python import keras
 from tensorflow.python.feature_column import feature_column_lib as feature_column
 from tensorflow.python.layers import layers
@@ -86,12 +87,12 @@ from tensorflow.python.ops import image_ops as image
 from tensorflow.python.ops import manip_ops as manip
 from tensorflow.python.ops import metrics
 from tensorflow.python.ops import nn
+from tensorflow.python.ops import ragged
 from tensorflow.python.ops import sets
-from tensorflow.python.ops import spectral_ops as spectral
 from tensorflow.python.ops.distributions import distributions
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.losses import losses
-from tensorflow.python.ops import signal
+from tensorflow.python.ops.signal import signal
 from tensorflow.python.profiler import profiler
 from tensorflow.python.saved_model import saved_model
 from tensorflow.python.summary import summary
@@ -144,26 +145,26 @@ nn.rnn_cell = rnn_cell
 
 # Export protos
 # pylint: disable=undefined-variable
-tf_export('AttrValue')(AttrValue)
-tf_export('ConfigProto')(ConfigProto)
+tf_export(v1=['AttrValue'])(AttrValue)
+tf_export(v1=['ConfigProto'])(ConfigProto)
 tf_export('Event', 'summary.Event')(Event)
-tf_export('GPUOptions')(GPUOptions)
-tf_export('GraphDef')(GraphDef)
-tf_export('GraphOptions')(GraphOptions)
-tf_export('HistogramProto')(HistogramProto)
-tf_export('LogMessage')(LogMessage)
-tf_export('MetaGraphDef')(MetaGraphDef)
-tf_export('NameAttrList')(NameAttrList)
-tf_export('NodeDef')(NodeDef)
-tf_export('OptimizerOptions')(OptimizerOptions)
-tf_export('RunMetadata')(RunMetadata)
-tf_export('RunOptions')(RunOptions)
-tf_export('SessionLog', 'summary.SessionLog')(SessionLog)
+tf_export(v1=['GPUOptions'])(GPUOptions)
+tf_export(v1=['GraphDef'])(GraphDef)
+tf_export(v1=['GraphOptions'])(GraphOptions)
+tf_export(v1=['HistogramProto'])(HistogramProto)
+tf_export(v1=['LogMessage'])(LogMessage)
+tf_export(v1=['MetaGraphDef'])(MetaGraphDef)
+tf_export(v1=['NameAttrList'])(NameAttrList)
+tf_export(v1=['NodeDef'])(NodeDef)
+tf_export(v1=['OptimizerOptions'])(OptimizerOptions)
+tf_export(v1=['RunMetadata'])(RunMetadata)
+tf_export(v1=['RunOptions'])(RunOptions)
+tf_export(v1=['SessionLog', 'summary.SessionLog'])(SessionLog)
 tf_export('Summary', 'summary.Summary')(Summary)
 tf_export('summary.SummaryDescription')(SummaryDescription)
 tf_export('SummaryMetadata')(SummaryMetadata)
 tf_export('summary.TaggedRunMetadata')(TaggedRunMetadata)
-tf_export('TensorInfo')(TensorInfo)
+tf_export(v1=['TensorInfo'])(TensorInfo)
 # pylint: enable=undefined-variable
 
 # Special dunders that we choose to export:
diff --git a/tensorflow/python/autograph/converters/BUILD b/tensorflow/python/autograph/converters/BUILD
index ced2e4796b17e507ce1139d53b7820831d6467db..3ac446db02c6ef1946e76a8b549a85c67fed2872 100644
--- a/tensorflow/python/autograph/converters/BUILD
+++ b/tensorflow/python/autograph/converters/BUILD
@@ -63,7 +63,6 @@ py_test(
     name = "asserts_test",
     srcs = ["asserts_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":converters",
         "//tensorflow/python:client_testlib",
@@ -239,7 +238,6 @@ py_test(
     name = "error_handlers_test",
     srcs = ["error_handlers_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":converters",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/converters/asserts_test.py b/tensorflow/python/autograph/converters/asserts_test.py
index eef628aeb6fa6abba9a2f6f323cfc1c4adaf2a8e..9ae448892a030b331adc216052ba22d3ca7533df 100644
--- a/tensorflow/python/autograph/converters/asserts_test.py
+++ b/tensorflow/python/autograph/converters/asserts_test.py
@@ -23,12 +23,14 @@ from tensorflow.python.autograph.converters import side_effect_guards
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.platform import test
 
 
 class AssertsTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_basic(self):
 
     def test_fn(a):
@@ -41,7 +43,7 @@ class AssertsTest(converter_testing.TestCase):
         op = result.test_fn(constant_op.constant(False))
         with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                      'test message'):
-          sess.run(op)
+          self.evaluate(op)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/builtin_functions_test.py b/tensorflow/python/autograph/converters/builtin_functions_test.py
index 30cfb13233a73e486cf793074cb3de270b9d0172..2683be16ec7ffa91b1df3cd272336366502d9f4f 100644
--- a/tensorflow/python/autograph/converters/builtin_functions_test.py
+++ b/tensorflow/python/autograph/converters/builtin_functions_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.autograph.converters import builtin_functions
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class BuiltinFunctionsTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_len(self):
 
     def test_fn(a):
@@ -41,6 +43,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
         ops = result.test_fn(p)
         self.assertEqual(sess.run(ops, {p: [0, 0, 0]}), 3)
 
+  @test_util.run_deprecated_v1
   def test_print(self):
 
     if six.PY2:
@@ -54,6 +57,7 @@ class BuiltinFunctionsTest(converter_testing.TestCase):
         with self.assertPrints('a\n'):
           sess.run(result.test_fn('a'))
 
+  @test_util.run_deprecated_v1
   def test_print_multiple_values(self):
 
     if six.PY2:
diff --git a/tensorflow/python/autograph/converters/call_trees.py b/tensorflow/python/autograph/converters/call_trees.py
index 55cea89126a257d553a0bb34a56ac22d775a8b9f..9b85fc8367ceda77ab656bb889c88922cc52e173 100644
--- a/tensorflow/python/autograph/converters/call_trees.py
+++ b/tensorflow/python/autograph/converters/call_trees.py
@@ -22,7 +22,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from collections import namedtuple
+import collections
 
 import gast
 
@@ -35,7 +35,7 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.util import tf_inspect
 
 
-class FunctionInfo(namedtuple('FunctionInfo', ('dtype',))):
+class FunctionInfo(collections.namedtuple('FunctionInfo', ('dtype',))):
   pass
 
 
@@ -116,12 +116,19 @@ class CallTreeTransformer(converter.Base):
   def _function_is_compilable(self, target_entity):
     """Determines whether an entity can be compiled at all."""
     # TODO(mdan): Expand.
+
     if target_entity.__module__ is None:
       # Functions like builtins and NumPy don't expose a module.
       # Those in general should not be compiled.
       return False
+
     if inspect_utils.isbuiltin(target_entity):
       return False
+
+    if inspect_utils.isnamedtuple(target_entity):
+      # namedtuple doesn't expose its source code, making it uncompilable.
+      return False
+
     return True
 
   def _should_compile(self, node, fqn):
@@ -140,6 +147,11 @@ class CallTreeTransformer(converter.Base):
 
     if target_entity is not None:
 
+      # Currently, lambdas are always converted.
+      # TODO(mdan): Allow markers of the kind f = ag.do_not_convert(lambda: ...)
+      if inspect_utils.islambda(target_entity):
+        return True
+
       # This may be reached when "calling" a callable attribute of an object.
       # For example:
       #
@@ -296,7 +308,13 @@ class CallTreeTransformer(converter.Base):
         # safe for graph mode.
         return node
 
+      elif inspect_utils.isnamedtuple(target_entity):
+        # Although not compilable, we assume they are safe for graph mode.
+        node = self.generic_visit(node)
+        return node
+
       else:
+        # TODO(mdan): Instert dynamic conversion here instead.
         raise NotImplementedError(
             'py_func with return values (unknown function)')
     else:
diff --git a/tensorflow/python/autograph/converters/call_trees_test.py b/tensorflow/python/autograph/converters/call_trees_test.py
index 916c736fb4bb3099901e9125b37bb54c7050cecc..454d75d755c7273d11e1f89e4138cd997eb6e49a 100644
--- a/tensorflow/python/autograph/converters/call_trees_test.py
+++ b/tensorflow/python/autograph/converters/call_trees_test.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
+
 import numpy as np
 
 from tensorflow.python.autograph.converters import call_trees
@@ -85,6 +87,34 @@ class CallTreesTest(converter_testing.TestCase):
       tc = TestClass()
       self.assertEquals(3, result.test_fn_2(tc, 1))
 
+  def test_known_called_lambda(self):
+
+    l = lambda x: x
+
+    def test_fn(a):
+      return l(a)
+
+    ns = {'l': l}
+    node, ctx = self.prepare(test_fn, ns)
+    node = call_trees.transform(node, ctx)
+
+    with self.compiled(node, ns) as result:
+      self.assertEquals(1, result.test_fn(1))
+
+  def test_known_called_namedtuple(self):
+
+    nt = collections.namedtuple('TestNamedTuple', ['a'])
+
+    def test_fn(a):
+      return nt(a)
+
+    ns = {'nt': nt}
+    node, ctx = self.prepare(test_fn, ns)
+    node = call_trees.transform(node, ctx)
+
+    with self.compiled(node, ns) as result:
+      self.assertEquals(nt(1), result.test_fn(1))
+
   def test_py_func_known_function(self):
 
     def test_fn():
@@ -94,7 +124,7 @@ class CallTreesTest(converter_testing.TestCase):
                         dtypes.int64) as result:
       with self.cached_session() as sess:
         self.assertTrue(isinstance(result.test_fn(), ops.Tensor))
-        self.assertIn(sess.run(result.test_fn()), (0, 1, 2))
+        self.assertIn(self.evaluate(result.test_fn()), (0, 1, 2))
 
   def test_uncompiled_modules(self):
 
@@ -113,7 +143,7 @@ class CallTreesTest(converter_testing.TestCase):
     with self.compiled(node, ns) as result:
       with self.cached_session() as sess:
         result_tensor = result.test_fn(constant_op.constant(1))
-        self.assertEquals(sess.run(result_tensor), 3)
+        self.assertEquals(self.evaluate(result_tensor), 3)
 
   def test_call_to_decorated_function(self):
 
diff --git a/tensorflow/python/autograph/converters/continue_statements.py b/tensorflow/python/autograph/converters/continue_statements.py
index 584cdc1efd4b2e4327be34e1d9d51de3635fccd5..05e19e59fc6701db618e925e1d305f299b270e33 100644
--- a/tensorflow/python/autograph/converters/continue_statements.py
+++ b/tensorflow/python/autograph/converters/continue_statements.py
@@ -24,94 +24,93 @@ from tensorflow.python.autograph.pyct import templates
 from tensorflow.python.autograph.pyct.static_analysis.annos import NodeAnno
 
 
-# Tags for local state.
-CONTROL_VAR_NAME = 'control_var_name'
-CONTINUE_USED = 'continue_used'
-GUARD_CREATED = 'guard_created'
-CREATE_GUARD_NEXT = 'create_guard_next'
+class _Continue(object):
+
+  def __init__(self):
+    self.used = False
+    self.control_var_name = None
+    self.create_guard = False
+    self.guard_created = False
+
+  def __repr__(self):
+    return 'used: %s, var: %s' % (self.used, self.control_var_name)
 
 
 class ContinueCanonicalizationTransformer(converter.Base):
   """Canonicalizes continue statements into additional conditionals."""
 
   def visit_Continue(self, node):
-    self.set_local(CONTINUE_USED, True)
+    self.state[_Continue].used = True
     template = """
-      var_name = tf.constant(True)
+      var_name = True
     """
     return templates.replace(
-        template, var_name=self.get_local(CONTROL_VAR_NAME))
+        template, var_name=self.state[_Continue].control_var_name)
 
   def _postprocess_statement(self, node):
     # Example of how the state machine below works:
     #
-    #   1| stmt           # State: CONTINUE_USED = False
+    #   1| stmt           # State: Continue_.used = False
     #    |                # Action: none
     #   2| if cond:
-    #   3|   continue     # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = False
-    #    |                # Action: set CREATE_GUARD_NEXT = True
-    #   4| stmt           # State: CONTINUE_USED = True,
-    #    |                #        GUARD_CREATED = False,
-    #    |                #        CREATE_GUARD_NEXT = True
+    #   3|   continue     # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = False
+    #    |                # Action: Continue_.create_guard = True
+    #   4| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = False,
+    #    |                #        Continue_.create_guard = True
     #    |                # Action: create `if not continue_used`,
-    #    |                #         set GUARD_CREATED = True
-    #   5| stmt           # State: CONTINUE_USED = True, GUARD_CREATED = True
+    #    |                #         set Continue_.guard_created = True
+    #   5| stmt           # State: Continue_.used = True,
+    #    |                #        Continue_.guard_created = True
     #    |                # Action: none (will be wrapped under previously
     #    |                #         created if node)
 
-    if self.get_local(CONTINUE_USED, False):
-      if self.get_local(GUARD_CREATED, False):
+    if self.state[_Continue].used:
+      if self.state[_Continue].guard_created:
         return node, None
 
-      elif not self.get_local(CREATE_GUARD_NEXT, False):
-        self.set_local(CREATE_GUARD_NEXT, True)
+      elif not self.state[_Continue].create_guard:
+        self.state[_Continue].create_guard = True
         return node, None
 
       else:
-        self.set_local(GUARD_CREATED, True)
+        self.state[_Continue].guard_created = True
         template = """
           if not var_name:
             original_node
         """
         cond, = templates.replace(
             template,
-            var_name=self.get_local(CONTROL_VAR_NAME),
+            var_name=self.state[_Continue].control_var_name,
             original_node=node)
         return cond, cond.body
     return node, None
 
   def _visit_loop_body(self, node, nodes):
-    self.enter_local_scope()
+    self.state[_Continue].enter()
     scope = anno.getanno(node, NodeAnno.BODY_SCOPE)
     continue_var = self.ctx.namer.new_symbol('continue_', scope.referenced)
-    self.set_local(CONTROL_VAR_NAME, continue_var)
+    self.state[_Continue].control_var_name = continue_var
 
     nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
 
-    if self.get_local(CONTINUE_USED, False):
+    if self.state[_Continue].used:
       template = """
-        var_name = tf.constant(False)
+        var_name = False
       """
       control_var_init = templates.replace(template, var_name=continue_var)
       nodes = control_var_init + nodes
 
-    self.exit_local_scope()
+    self.state[_Continue].exit()
     return nodes
 
-  def _visit_non_loop_body(self, nodes):
-    self.enter_local_scope(inherit=(CONTROL_VAR_NAME,))
-    nodes = self.visit_block(nodes, after_visit=self._postprocess_statement)
-    continue_used = self.get_local(CONTINUE_USED, False)
-    self.exit_local_scope(keep=(CONTINUE_USED,))
-    return nodes, continue_used
-
   def visit_While(self, node):
     node.test = self.visit(node.test)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
   def visit_For(self, node):
@@ -119,21 +118,11 @@ class ContinueCanonicalizationTransformer(converter.Base):
     node.iter = self.generic_visit(node.iter)
     node.body = self._visit_loop_body(node, node.body)
     # A continue in the else clause applies to the containing scope.
-    node.orelse, _ = self._visit_non_loop_body(node.orelse)
-    return node
-
-  def visit_If(self, node):
-    node.test = self.generic_visit(node.test)
-    node.body, continue_used_body = self._visit_non_loop_body(node.body)
-    node.orelse, continue_used_orelse = self._visit_non_loop_body(node.orelse)
-    self.set_local(CONTINUE_USED, continue_used_body or continue_used_orelse)
-    return node
-
-  def visit_With(self, node):
-    node.items = self.visit_block(node.items)
-    node.body, _ = self._visit_non_loop_body(node.body)
+    node.orelse = self.visit_block(node.orelse)
     return node
 
 
 def transform(node, ctx):
-  return ContinueCanonicalizationTransformer(ctx).visit(node)
+  transformer = ContinueCanonicalizationTransformer(ctx)
+  node = transformer.visit(node)
+  return node
diff --git a/tensorflow/python/autograph/converters/control_flow.py b/tensorflow/python/autograph/converters/control_flow.py
index 5853e044c532d24c3327f06da790f85fddcd5700..bef6cae1bb89908bd644115e31ca5662043b060c 100644
--- a/tensorflow/python/autograph/converters/control_flow.py
+++ b/tensorflow/python/autograph/converters/control_flow.py
@@ -106,14 +106,49 @@ class ControlFlowTransformer(converter.Base):
       return 'no variables'
     return ', '.join(map(str, symbol_set))
 
-  def visit_If(self, node):
-    node = self.generic_visit(node)
+  def _determine_aliased_symbols(self, scope, node_defined_in, block):
+    if block:
+      block_live_in = set(anno.getanno(block[0], anno.Static.LIVE_VARS_IN))
+    else:
+      block_live_in = set()
 
+    # For the purpose of aliasing, composite symbols with live owners are live
+    # as well. Otherwise this would leak tensors from the conditional's body.
+    #
+    # For example:
+    #
+    #   obj = some_obj
+    #   if cond:
+    #     obj.a = val
+    #
+    # Thanslating to the code below would be incorrect:
+    #
+    #   def true_fn():
+    #     obj.a = val()  # Wrong! leaks ops owned by true_fn
+    #     return obj.a
+    for s in scope.modified:
+      if s.is_composite():
+        live_parents = block_live_in & s.owner_set
+        if live_parents:
+          block_live_in.add(s)
+    return scope.modified & node_defined_in & block_live_in
+
+  def visit_If(self, node):
     body_scope = anno.getanno(node, annos.NodeAnno.BODY_SCOPE)
     orelse_scope = anno.getanno(node, annos.NodeAnno.ORELSE_SCOPE)
     defined_in = anno.getanno(node, anno.Static.DEFINED_VARS_IN)
     live_out = anno.getanno(node, anno.Static.LIVE_VARS_OUT)
 
+    # Note: this information needs to be extracted before the body conversion
+    # that happens in the call to generic_visit below, because the conversion
+    # generates nodes that lack static analysis annotations.
+    need_alias_in_body = self._determine_aliased_symbols(
+        body_scope, defined_in, node.body)
+    need_alias_in_orelse = self._determine_aliased_symbols(
+        orelse_scope, defined_in, node.orelse)
+
+    node = self.generic_visit(node)
+
     modified_in_cond = body_scope.modified | orelse_scope.modified
     returned_from_cond = set()
     for s in modified_in_cond:
@@ -125,9 +160,6 @@ class ControlFlowTransformer(converter.Base):
         if live_out & s.owner_set:
           returned_from_cond.add(s)
 
-    need_alias_in_body = body_scope.modified & defined_in
-    need_alias_in_orelse = orelse_scope.modified & defined_in
-
     created_in_body = body_scope.modified & returned_from_cond - defined_in
     created_in_orelse = orelse_scope.modified & returned_from_cond - defined_in
 
diff --git a/tensorflow/python/autograph/converters/control_flow_test.py b/tensorflow/python/autograph/converters/control_flow_test.py
index 03fdfc804e497680c205df1945ac7c6079c51a41..034fcbe3865cdd78cdaad19631da98359cb4690d 100644
--- a/tensorflow/python/autograph/converters/control_flow_test.py
+++ b/tensorflow/python/autograph/converters/control_flow_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.autograph.pyct import transformer
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -36,6 +37,7 @@ class ControlFlowTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         self.assertEqual(sess.run(result.test_fn(*inputs)), expected)
 
+  @test_util.run_deprecated_v1
   def test_while_basic(self):
 
     def test_fn(n):
@@ -48,6 +50,7 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(5), (10, 5, 5))
 
+  @test_util.run_deprecated_v1
   def test_while_nested(self):
 
     def test_fn(n):
@@ -66,6 +69,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(5),
                                  (25, 5, 0, 5))
 
+  @test_util.run_deprecated_v1
   def test_while_single_output(self):
 
     def test_fn(n):
@@ -86,6 +90,7 @@ class ControlFlowTest(converter_testing.TestCase):
     with self.assertRaises(NameError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_if_basic(self):
 
     def test_fn(n):
@@ -100,6 +105,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), (-1, 0))
     self.assertTransformedResult(test_fn, constant_op.constant(-1), (0, -2))
 
+  @test_util.run_deprecated_v1
   def test_if_complex_outputs(self):
 
     class TestClass(object):
@@ -124,6 +130,7 @@ class ControlFlowTest(converter_testing.TestCase):
         res_obj = result.test_fn(constant_op.constant(-1), TestClass(0, 0))
         self.assertEqual(sess.run((res_obj.a, res_obj.b)), (0, -2))
 
+  @test_util.run_deprecated_v1
   def test_if_single_output(self):
 
     def test_fn(n):
@@ -133,6 +140,7 @@ class ControlFlowTest(converter_testing.TestCase):
 
     self.assertTransformedResult(test_fn, constant_op.constant(1), -1)
 
+  @test_util.run_deprecated_v1
   def test_if_semi(self):
 
     def test_fn(n):
@@ -143,6 +151,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(2), 3)
     self.assertTransformedResult(test_fn, constant_op.constant(-3), -3)
 
+  @test_util.run_deprecated_v1
   def test_if_local_var(self):
 
     def test_fn(n):
@@ -154,6 +163,7 @@ class ControlFlowTest(converter_testing.TestCase):
     self.assertTransformedResult(test_fn, constant_op.constant(1), 5)
     self.assertTransformedResult(test_fn, constant_op.constant(-1), -1)
 
+  @test_util.run_deprecated_v1
   def test_if_no_outputs(self):
 
     def test_fn(n):
@@ -177,6 +187,7 @@ class ControlFlowTest(converter_testing.TestCase):
     with self.assertRaises(transformer.AutographParseError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_simple_for(self):
 
     def test_fn(l):
@@ -191,6 +202,7 @@ class ControlFlowTest(converter_testing.TestCase):
     empty_vector = constant_op.constant([], shape=(0,), dtype=dtypes.int32)
     self.assertTransformedResult(test_fn, empty_vector, (0, 0))
 
+  @test_util.run_deprecated_v1
   def test_for_single_output(self):
 
     def test_fn(l):
@@ -235,6 +247,7 @@ class ControlFlowTest(converter_testing.TestCase):
     with self.assertRaises(NameError):
       control_flow.transform(node, ctx)
 
+  @test_util.run_deprecated_v1
   def test_for_tuple_unpacking(self):
     def test_fn(x_list):
       z = tf.constant(0)  # pylint:disable=undefined-variable
diff --git a/tensorflow/python/autograph/converters/function_scopes_test.py b/tensorflow/python/autograph/converters/function_scopes_test.py
index e5ce03a1090a072c413a71eda5643530dff025bd..5a1248c8015c36882136421bfe4efc7d3dd58831 100644
--- a/tensorflow/python/autograph/converters/function_scopes_test.py
+++ b/tensorflow/python/autograph/converters/function_scopes_test.py
@@ -22,11 +22,13 @@ from tensorflow.python.autograph.converters import function_scopes
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FunctionBodyTransformerTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_basic(self):
 
     def test_fn(l):
@@ -40,6 +42,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertIn('test_fn/', result_op.op.name)
       self.assertEqual('Docstring.', result.test_fn.__doc__)
 
+  @test_util.run_deprecated_v1
   def test_multiline_docstring(self):
 
     tf = None
@@ -58,6 +61,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertIn('First sentence.', result.test_fn.__doc__)
       self.assertIn('Second sentence.', result.test_fn.__doc__)
 
+  @test_util.run_deprecated_v1
   def test_nested_functions(self):
 
     def test_fn(l):
@@ -74,6 +78,7 @@ class FunctionBodyTransformerTest(converter_testing.TestCase):
       self.assertNotIn('inner_fn', first.op.name)
       self.assertIn('test_fn/inner_fn/', second.op.name)
 
+  @test_util.run_deprecated_v1
   def test_method(self):
 
     class TestClass(object):
diff --git a/tensorflow/python/autograph/converters/lists_test.py b/tensorflow/python/autograph/converters/lists_test.py
index f6da845fcc3f19106073deaa094c0479063c02e7..39843c7d74f7f8e3f9c35d74258df4d3df86355b 100644
--- a/tensorflow/python/autograph/converters/lists_test.py
+++ b/tensorflow/python/autograph/converters/lists_test.py
@@ -68,7 +68,7 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2, 3])
+        self.assertAllEqual(self.evaluate(r), [1, 2, 3])
 
   def test_list_pop(self):
 
@@ -91,8 +91,8 @@ class ListTest(converter_testing.TestCase):
       with self.cached_session() as sess:
         ts, tl = result.test_fn()
         r = list_ops.tensor_list_stack(tl, dtypes.int32)
-        self.assertAllEqual(sess.run(r), [1, 2])
-        self.assertAllEqual(sess.run(ts), 3)
+        self.assertAllEqual(self.evaluate(r), [1, 2])
+        self.assertAllEqual(self.evaluate(ts), 3)
 
   def test_double_list_pop(self):
 
@@ -123,7 +123,7 @@ class ListTest(converter_testing.TestCase):
 
     with self.compiled(node, {}, array_ops.stack, dtypes.int32) as result:
       with self.cached_session() as sess:
-        self.assertAllEqual(sess.run(result.test_fn()), [1, 2, 3])
+        self.assertAllEqual(self.evaluate(result.test_fn()), [1, 2, 3])
 
   # TODO(mdan): Add a test with tf.stack with axis kwarg.
 
diff --git a/tensorflow/python/autograph/converters/logical_expressions_test.py b/tensorflow/python/autograph/converters/logical_expressions_test.py
index 99db04a7751596735fe708cef1c9dd1944edbd93..687412750e0b2d3e7db275f6c25e5923ffaaa831 100644
--- a/tensorflow/python/autograph/converters/logical_expressions_test.py
+++ b/tensorflow/python/autograph/converters/logical_expressions_test.py
@@ -21,11 +21,13 @@ from __future__ import print_function
 from tensorflow.python.autograph.converters import logical_expressions
 from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class LogicalExpressionTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_equals(self):
 
     def test_fn(a, b):
@@ -36,6 +38,7 @@ class LogicalExpressionTest(converter_testing.TestCase):
         self.assertTrue(sess.run(result.test_fn(constant_op.constant(1), 1)))
         self.assertFalse(sess.run(result.test_fn(constant_op.constant(1), 2)))
 
+  @test_util.run_deprecated_v1
   def test_bool_ops(self):
 
     def test_fn(a, b, c):
@@ -48,6 +51,7 @@ class LogicalExpressionTest(converter_testing.TestCase):
         self.assertFalse(
             sess.run(result.test_fn(constant_op.constant(True), False, True)))
 
+  @test_util.run_deprecated_v1
   def test_comparison(self):
 
     def test_fn(a, b, c, d):
diff --git a/tensorflow/python/autograph/converters/side_effect_guards_test.py b/tensorflow/python/autograph/converters/side_effect_guards_test.py
index cef3199169c387194a95df72c26f353ad8f58873..645267e56002a999cd497f11f7507449ab900be6 100644
--- a/tensorflow/python/autograph/converters/side_effect_guards_test.py
+++ b/tensorflow/python/autograph/converters/side_effect_guards_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.autograph.core import converter_testing
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
@@ -34,6 +35,7 @@ tf = None  # Will be replaced by a mock.
 
 class SideEffectGuardsTest(converter_testing.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_side_effect_on_return_only_variable(self):
 
     def test_fn(a):
@@ -48,12 +50,12 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Add support for this use case.
         # Right now the variable `a` is not conditioned on the `assign` because
         # there's no way to add control dependencies to a variable object.
-        self.assertEqual(2, sess.run(v))
+        self.assertEqual(2, self.evaluate(v))
 
   def test_side_effect_on_used_variable(self):
 
@@ -69,12 +71,13 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
         # Right now it's 3 or 4 based on whether the read is synchronized.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
+  @test_util.run_deprecated_v1
   def test_side_effect_on_tensor(self):
 
     def test_fn(a):
@@ -109,10 +112,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
   def test_multiline_nested_block(self):
 
@@ -130,10 +133,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
     with self.compiled(node, {}, state_ops.assign, ops.name_scope) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(3, sess.run(v))
+        self.assertEqual(3, self.evaluate(v))
 
   def test_multiline_block_unsafe(self):
 
@@ -153,10 +156,10 @@ class SideEffectGuardsTest(converter_testing.TestCase):
                        state_ops.assign_add) as result:
       with self.cached_session() as sess:
         v = variable_scope.get_variable('test', initializer=2)
-        sess.run(v.initializer)
-        sess.run(result.test_fn(v))
+        self.evaluate(v.initializer)
+        self.evaluate(result.test_fn(v))
         # TODO(mdan): Ensure the result of test_fn(v) is also deterministic.
-        self.assertEqual(4, sess.run(v))
+        self.assertEqual(4, self.evaluate(v))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/converters/slices_test.py b/tensorflow/python/autograph/converters/slices_test.py
index e190a7cfe8492bef5985f128cf553a0fc17b3b96..bd049afdfcef4c839bcb3d9ba5444d885c3061cc 100644
--- a/tensorflow/python/autograph/converters/slices_test.py
+++ b/tensorflow/python/autograph/converters/slices_test.py
@@ -49,7 +49,7 @@ class SliceTest(converter_testing.TestCase):
         tl = list_ops.tensor_list_from_tensor(
             [1, 2], element_shape=constant_op.constant([], dtype=dtypes.int32))
         y = result.test_fn(tl)
-        self.assertEqual(2, sess.run(y))
+        self.assertEqual(2, self.evaluate(y))
 
   def test_index_access_multiple_definitions(self):
 
diff --git a/tensorflow/python/autograph/core/converter.py b/tensorflow/python/autograph/core/converter.py
index 49e24895a2b6ec31e83e44b4ef89d463b0157c97..e88c4674ee24867dec32d62589afdc2e48dfcace 100644
--- a/tensorflow/python/autograph/core/converter.py
+++ b/tensorflow/python/autograph/core/converter.py
@@ -82,6 +82,7 @@ from tensorflow.python.autograph.pyct.static_analysis import live_values
 from tensorflow.python.autograph.pyct.static_analysis import liveness
 from tensorflow.python.autograph.pyct.static_analysis import reaching_definitions
 from tensorflow.python.autograph.pyct.static_analysis import type_info
+from tensorflow.python.eager import function
 
 # TODO(mdan): These contexts can be refactored into first class objects.
 # For example, we could define Program and Entity abstractions that hold on
@@ -96,7 +97,7 @@ class Verbosity(IntEnum):
   Attributes:
    * BRIEF: No logging, minimal error messages.
    * VERBOSE: Detailed logging of generated code, detailed error messages.
- """
+  """
   BRIEF = 0
   VERBOSE = 1
 
@@ -151,7 +152,7 @@ class ConversionOptions(object):
                optional_features=Feature.ALL):
     self.recursive = recursive
     self.verbose = verbose
-    self.strip_decorators = strip_decorators or ()
+    self._strip_decorators = strip_decorators or ()
     self.force_conversion = force_conversion
     # TODO(mdan): Rename to conversion_recursion_depth?
     self.internal_convert_user_code = internal_convert_user_code
@@ -161,6 +162,12 @@ class ConversionOptions(object):
     optional_features = frozenset(optional_features)
     self.optional_features = optional_features
 
+  @property
+  def strip_decorators(self):
+    # A few decorators are included by default.
+    # TODO(mdan): Revert if function.defun becomes a public symbol.
+    return self._strip_decorators + (function.defun,)
+
   def uses(self, feature):
     return (Feature.ALL in self.optional_features or
             feature in self.optional_features)
@@ -216,7 +223,7 @@ class ConversionOptions(object):
             as_qualified_name(ConversionOptions)),
         recursive_val=parser.parse_expression(str(self.recursive)),
         verbose_val=parser.parse_expression(str(int(self.verbose))),
-        strip_decorators_val=list_of_names(self.strip_decorators),
+        strip_decorators_val=list_of_names(self._strip_decorators),
         force_conversion_val=parser.parse_expression(
             str(self.force_conversion)),
         internal_convert_user_code_val=parser.parse_expression(
diff --git a/tensorflow/python/autograph/core/converter_testing.py b/tensorflow/python/autograph/core/converter_testing.py
index 7b0608d03fccbb45651ad63e36e4377f7d6a1dd3..f1374081d3c6e0dd93c39d331c76404859b2f40a 100644
--- a/tensorflow/python/autograph/core/converter_testing.py
+++ b/tensorflow/python/autograph/core/converter_testing.py
@@ -32,6 +32,7 @@ from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.autograph.lang import special_functions
 from tensorflow.python.autograph.pyct import compiler
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import pretty_printer
@@ -43,7 +44,7 @@ def imported_decorator(f):
   return lambda a: f(a) + 1
 
 
-# TODO(mdan): We might be able to use the real namer here.
+# TODO(mdan): We should use the real namer here.
 class FakeNamer(object):
   """A fake namer that uses a global counter to generate unique names."""
 
@@ -61,7 +62,8 @@ class FakeNamer(object):
                              original_fqn,
                              live_entity=None,
                              owner_type=None):
-    del live_entity
+    if inspect_utils.islambda(live_entity):
+      return None, False
     if owner_type is not None:
       return None, False
     return ('renamed_%s' % '_'.join(original_fqn)), True
diff --git a/tensorflow/python/autograph/core/errors_test.py b/tensorflow/python/autograph/core/errors_test.py
index aa6c293268c86892ea076000a112fc6a3012b2ab..845a28a5222a77d0d2a2ee49f6edb86f57ddb6a6 100644
--- a/tensorflow/python/autograph/core/errors_test.py
+++ b/tensorflow/python/autograph/core/errors_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.autograph.core import errors
 from tensorflow.python.autograph.pyct import origin_info
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors as tf_errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_inspect
@@ -47,6 +48,7 @@ class RuntimeErrorsTest(test.TestCase):
                                     'test_comment')
     return loc, origin
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_basic(self):
     loc, origin = self.fake_origin(zero_div, 2)
     zero_div_caller.ag_source_map = {loc: origin}
@@ -55,13 +57,14 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(errors.TfRuntimeError) as cm:
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
     for frame in cm.exception.custom_traceback:
       _, _, function_name, _ = frame
       self.assertNotEqual('zero_div', function_name)
     self.assertIn(origin.as_frame(), set(cm.exception.custom_traceback))
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_no_matching_lineno(self):
     loc, origin = self.fake_origin(zero_div, -1)
     zero_div_caller.ag_source_map = {loc: origin}
@@ -70,7 +73,7 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(errors.TfRuntimeError) as cm:
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
     all_function_names = set()
     for frame in cm.exception.custom_traceback:
@@ -79,6 +82,7 @@ class RuntimeErrorsTest(test.TestCase):
       self.assertNotEqual('test_function_name', function_name)
     self.assertIn('zero_div', all_function_names)
 
+  @test_util.run_deprecated_v1
   def test_improved_errors_failures(self):
     loc, _ = self.fake_origin(zero_div, 2)
     zero_div_caller.ag_source_map = {loc: 'bogus object'}
@@ -87,7 +91,7 @@ class RuntimeErrorsTest(test.TestCase):
     with self.assertRaises(tf_errors.InvalidArgumentError):
       with errors.improved_errors(zero_div_caller):
         with self.cached_session() as sess:
-          sess.run(ops)
+          self.evaluate(ops)
 
   def test_improved_errors_validation(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/autograph/core/function_wrapping_test.py b/tensorflow/python/autograph/core/function_wrapping_test.py
index 5e217055c7154dbcabed06be157a25e4068f2ddb..7e21b979dbcd24f815f2d7ce88ad9ec1f6690507 100644
--- a/tensorflow/python/autograph/core/function_wrapping_test.py
+++ b/tensorflow/python/autograph/core/function_wrapping_test.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.core import function_wrapping
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class FunctionWrappingTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_function_scope_name(self):
     with function_wrapping.function_scope('test_name'):
       t = constant_op.constant(1)
diff --git a/tensorflow/python/autograph/core/naming.py b/tensorflow/python/autograph/core/naming.py
index 43fcbcfc0302a6472bf3bd153212ba7222083016..b8d79daebaa6d6dcf5f324f637a3b496f3742b92 100644
--- a/tensorflow/python/autograph/core/naming.py
+++ b/tensorflow/python/autograph/core/naming.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.autograph.pyct import inspect_utils
 from tensorflow.python.autograph.pyct import qual_names
-from tensorflow.python.util import tf_inspect
 
 
 class Namer(object):
@@ -77,8 +77,7 @@ class Namer(object):
     if not self.recursive:
       return None, False
 
-    if (live_entity is not None and tf_inspect.isfunction(live_entity) and
-        live_entity.__name__ == '<lambda>'):
+    if (live_entity is not None and inspect_utils.islambda(live_entity)):
       return None, False
 
     if owner_type is not None and owner_type not in self.partial_types:
diff --git a/tensorflow/python/autograph/impl/BUILD b/tensorflow/python/autograph/impl/BUILD
index 2f9037c43b6452407757ccc5ad27bdd8e06d9ea7..201a88875413982b0f1a791f3408b403a3259eb8 100644
--- a/tensorflow/python/autograph/impl/BUILD
+++ b/tensorflow/python/autograph/impl/BUILD
@@ -41,7 +41,6 @@ py_test(
     name = "api_test",
     srcs = ["api_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
@@ -54,7 +53,6 @@ py_test(
     name = "conversion_test",
     srcs = ["conversion_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":impl",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/impl/api.py b/tensorflow/python/autograph/impl/api.py
index 69674b2be3c9e9356349d0670df0548b47be34c0..f7774888c8a5ccb8a64186476d6e78b999e527ba 100644
--- a/tensorflow/python/autograph/impl/api.py
+++ b/tensorflow/python/autograph/impl/api.py
@@ -49,7 +49,10 @@ from tensorflow.python.util import tf_inspect
 # TODO(mdan): This should behave like to_graph (e.g. convert statically).
 # TODO(znado): Make an alias so can write Verbosity directly without needing
 # to write converter.
-def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
+def convert(
+    recursive=False,
+    verbose=converter.Verbosity.BRIEF,
+    optional_features=converter.Feature.ALL):
   """Decorator that compiles a function to use TensorFlow ops.
 
   The decorator is dynamic - it recompiles the target whenever the decorated
@@ -61,6 +64,9 @@ def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
     recursive: bool, whether to recursively convert any functions or classes
       that the converted function may use.
     verbose: converter.Verbosity, the level of verbosity.
+    optional_features: converted.Feature, allows toggling optional or
+      experimental features. When set to None, only the core features are
+      enabled.
 
   Returns:
     Callable, a decorator that converts the given function into an equivalent
@@ -78,7 +84,7 @@ def convert(recursive=False, verbose=converter.Verbosity.VERBOSE):
               recursive=recursive,
               verbose=verbose,
               force_conversion=True,
-              optional_features=converter.Feature.ALL,
+              optional_features=optional_features,
           ), *args, **kwargs)
 
     wrapper = tf_decorator.make_decorator(f, wrapper)
@@ -195,6 +201,17 @@ def converted_call(f, owner, options, *args, **kwargs):
   if not options.internal_convert_user_code:
     return f(*args, **kwargs)
 
+  # Unwrap functools.partial objects
+  # TODO(allenl, mdan): Consider sharing unwrapping logic with tf_inspect.
+  while isinstance(f, functools.partial):
+    args = f.args + args
+    new_kwargs = {}
+    if f.keywords is not None:
+      new_kwargs.update(f.keywords)
+    new_kwargs.update(kwargs)
+    kwargs = new_kwargs
+    f = f.func
+
   if tf_inspect.isfunction(f) or tf_inspect.ismethod(f):
     # Regular functions
     target_entity = f
diff --git a/tensorflow/python/autograph/impl/api_test.py b/tensorflow/python/autograph/impl/api_test.py
index ef577568c4ee767c7ab6f6b138579be8737a9337..d5561ba8249f539e720fa1ecb5800b76c61a8c2f 100644
--- a/tensorflow/python/autograph/impl/api_test.py
+++ b/tensorflow/python/autograph/impl/api_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import gc
 
 import numpy as np
@@ -28,6 +29,7 @@ from tensorflow.python.autograph.impl import api
 from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.utils import py_func
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import sequential
 from tensorflow.python.keras.layers import core
 from tensorflow.python.ops import variables
@@ -43,6 +45,7 @@ class TestResource(str):
 
 class ApiTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_decorator_recurses(self):
 
     class TestClass(object):
@@ -63,8 +66,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_does_not_recurse(self):
 
     class TestClass(object):
@@ -83,8 +87,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_unconverted_graph(self):
 
     class TestClass(object):
@@ -104,8 +109,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_unconverted_py_func(self):
 
     class TestClass(object):
@@ -130,8 +136,9 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_decorator_calls_decorated(self):
 
     class TestClass(object):
@@ -153,7 +160,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_decorator_preserves_argspec(self):
 
@@ -171,6 +178,7 @@ class ApiTest(test.TestCase):
         list(tf_inspect.getfullargspec(tc.called_member)),
         list(tf_inspect.getfullargspec(tc.called_member_converted)))
 
+  @test_util.run_deprecated_v1
   def test_convert_call_site_decorator(self):
 
     class TestClass(object):
@@ -192,7 +200,7 @@ class ApiTest(test.TestCase):
       x = tc.test_method(
           constant_op.constant([2, 4]), constant_op.constant(1),
           constant_op.constant(-2))
-      self.assertListEqual([0, 1], sess.run(x).tolist())
+      self.assertListEqual([0, 1], self.evaluate(x).tolist())
 
   def test_converted_call_builtin(self):
     x = api.converted_call(range, None, converter.ConversionOptions(), 3)
@@ -208,7 +216,28 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       x = api.converted_call(test_fn, None, converter.ConversionOptions(),
                              constant_op.constant(-1))
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
+
+  @test_util.run_v1_only('b/120545219')
+  def test_converted_call_functools_partial(self):
+
+    def test_fn(x, y, z):
+      if x < 0:
+        return -x, -y, -z
+      return x, y, z
+
+    x = api.converted_call(
+        functools.partial(test_fn, constant_op.constant(-1), z=-3),
+        None, converter.ConversionOptions(),
+        constant_op.constant(-2))
+    self.assertEqual((1, 2, 3), self.evaluate(x))
+
+    x = api.converted_call(
+        functools.partial(
+            functools.partial(test_fn, constant_op.constant(-1)), z=-3),
+        None, converter.ConversionOptions(),
+        constant_op.constant(-2))
+    self.assertEqual((1, 2, 3), self.evaluate(x))
 
   def test_converted_call_method_explicit_owner(self):
     # TODO(mdan): Implement.
@@ -234,7 +263,7 @@ class ApiTest(test.TestCase):
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(tc.test_method, None,
                              converter.ConversionOptions(), tc)
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_method_by_class(self):
 
@@ -252,7 +281,7 @@ class ApiTest(test.TestCase):
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(TestClass.test_method, None,
                              converter.ConversionOptions(), tc)
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_callable_object(self):
 
@@ -269,7 +298,7 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       tc = TestClass(constant_op.constant(-1))
       x = api.converted_call(tc, None, converter.ConversionOptions())
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_constructor(self):
 
@@ -288,7 +317,7 @@ class ApiTest(test.TestCase):
                               constant_op.constant(-1))
       # tc is now a converted object.
       x = tc.test_method()
-      self.assertEqual(1, sess.run(x))
+      self.assertEqual(1, self.evaluate(x))
 
   def test_converted_call_already_converted(self):
 
@@ -298,13 +327,14 @@ class ApiTest(test.TestCase):
     with self.cached_session() as sess:
       x = api.converted_call(f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
 
       converted_f = api.to_graph(f)
       x = api.converted_call(converted_f, None, converter.ConversionOptions(),
                              constant_op.constant(0))
-      self.assertTrue(sess.run(x))
+      self.assertTrue(self.evaluate(x))
 
+  @test_util.run_deprecated_v1
   def test_converted_call_no_user_code(self):
 
     def f(x):
@@ -334,8 +364,8 @@ class ApiTest(test.TestCase):
                            constant_op.constant([[0.0]]), training=True)
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], sess.run(x))
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_extra_self(self):
 
@@ -349,8 +379,8 @@ class ApiTest(test.TestCase):
                            model, constant_op.constant([[0.0]]), training=True)
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], sess.run(x))
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_whitelisted_method_via_owner(self):
 
@@ -364,8 +394,8 @@ class ApiTest(test.TestCase):
                            constant_op.constant([[0.0]]), training=True)
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual([[0.0, 0.0]], sess.run(x))
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual([[0.0, 0.0]], self.evaluate(x))
 
   def test_converted_call_lambda(self):
 
@@ -376,9 +406,10 @@ class ApiTest(test.TestCase):
     x = api.converted_call(l, None, opts, constant_op.constant(0))
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      self.assertAllEqual(True, sess.run(x))
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(True, self.evaluate(x))
 
+  @test_util.run_deprecated_v1
   def test_to_graph_basic(self):
 
     def test_fn(x, s):
@@ -390,8 +421,9 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       x = compiled_fn(constant_op.constant([4, 8]), 4)
-      self.assertListEqual([1, 2], sess.run(x).tolist())
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
+  @test_util.run_deprecated_v1
   def test_to_graph_with_defaults(self):
 
     foo = 4
@@ -405,7 +437,7 @@ class ApiTest(test.TestCase):
 
     with self.cached_session() as sess:
       x = compiled_fn(constant_op.constant([4, 8]))
-      self.assertListEqual([1, 2], sess.run(x).tolist())
+      self.assertListEqual([1, 2], self.evaluate(x).tolist())
 
   def test_to_code_basic(self):
 
diff --git a/tensorflow/python/autograph/impl/conversion.py b/tensorflow/python/autograph/impl/conversion.py
index 328a4b5fe48a404f2632f568e3a89c5426395d03..f8decd24e8e2eb5bcad22ba64d1865e8497363e3 100644
--- a/tensorflow/python/autograph/impl/conversion.py
+++ b/tensorflow/python/autograph/impl/conversion.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import imp
 
 import gast
@@ -72,12 +73,31 @@ def is_whitelisted_for_graph(o):
   Returns:
     Boolean
   """
-  m = tf_inspect.getmodule(o)
+  # TODO(b/120224672): Fix this.
+  if isinstance(o, functools.partial):
+    # tf_inspect.getmodule(functools.partial(...)) otherwise returns None since
+    # functools.partial objects do not have a __module__ attribute.
+    m = functools
+  else:
+    m = tf_inspect.getmodule(o)
   for prefix, in config.DEFAULT_UNCOMPILED_MODULES:
     if m.__name__.startswith(prefix):
       return True
+
   if hasattr(o, 'autograph_info__'):
     return True
+
+  if inspect_utils.isnamedtuple(o):
+    # Due to the way they're constructed, namedtuple types cannot be converted
+    # because they don't expose source code. But we assume they are safe for
+    # graph mode since they are just containers.
+    if tf_inspect.isclass(o) and len(o.__bases__) > 1:
+      logging.log_first_n(
+          logging.level_warning(),
+          'Entity {} looks like a namedtuple subclass. If it has any custom'
+          ' methods, they will not be converted by AutoGraph.'.format(o), 1)
+    return True
+
   return False
 
 
@@ -281,11 +301,10 @@ def function_to_graph(f,
   node, source = parser.parse_entity(f)
   node = node.body[0]
 
-  # In general, the output of inspect.getsource is inexact because it uses crude
-  # regex matching methods to search the source file. This is particularly
-  # problematic for lambda functions, where the entire containing lines are
-  # returned. Certain distributions of CPython may also return the enclosing
-  # function for local functions.
+  # In general, the output of inspect.getsource is inexact because it uses
+  # regex matching to adjust the exact location around the line number that
+  # CPython records. This is particularly problematic for lambda functions,
+  # where the entire containing lines are returned.
   nodes = ast_util.find_matching_definitions(node, f)
   if len(nodes) != 1:
     if f.__name__ == '<lambda>':
@@ -298,8 +317,8 @@ def function_to_graph(f,
       raise ValueError(
           'Unable to identify source code of function {}. The source code'
           ' reported by Python did not include exactly one matching signature:'
-          '\n{}\nTo avoid ambiguity, use a unique name for each'
-          ' function.'.format(f, source))
+          '\n{}\n. This is an extremely rare occurrence. Please report it to'
+          ' the TensorFlow team.'.format(f, source))
   node, = nodes
 
   # TODO(znado): Place inside standard_analysis.
diff --git a/tensorflow/python/autograph/lang/special_functions_test.py b/tensorflow/python/autograph/lang/special_functions_test.py
index 123ee65b32663ece3c141261fcf43aa777058c65..8d40f4036c5a1892afca6e5fb2daf891c9487800 100644
--- a/tensorflow/python/autograph/lang/special_functions_test.py
+++ b/tensorflow/python/autograph/lang/special_functions_test.py
@@ -36,7 +36,7 @@ class SpecialFunctionsTest(test.TestCase):
     python_one = special_functions.match_staging_level(1, 1)
     with self.cached_session() as sess:
       self.assertTrue(tensor_util.is_tensor(tensor_one))
-      self.assertAllEqual(sess.run(tensor_one), 1)
+      self.assertAllEqual(self.evaluate(tensor_one), 1)
       self.assertEqual(python_one, 1)
 
   def test_tensor_list_empty_list(self):
@@ -45,21 +45,21 @@ class SpecialFunctionsTest(test.TestCase):
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+      self.assertAllEqual(self.evaluate(sl), [])
 
     l = special_functions.tensor_list((),
                                       element_dtype=dtypes.int32,
                                       element_shape=())
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_tensor(self):
     l = special_functions.tensor_list(
         constant_op.constant([], dtype=dtypes.int32))
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(sl), [])
+      self.assertAllEqual(self.evaluate(sl), [])
 
   def test_tensor_list_unsupported_initializer(self):
     with self.assertRaisesRegexp(ValueError, 'unknown type'):
@@ -76,7 +76,7 @@ class SpecialFunctionsTest(test.TestCase):
     l = special_functions.tensor_list(elements)
     sl = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_tensor_list_array_from_elements(self):
     elements = [constant_op.constant([1, 2]), constant_op.constant([3, 4])]
@@ -84,7 +84,7 @@ class SpecialFunctionsTest(test.TestCase):
     l = special_functions.tensor_list(elements, use_tensor_array=True)
     sl = l.stack()
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(sl), [[1, 2], [3, 4]])
+      self.assertAllEqual(self.evaluate(sl), [[1, 2], [3, 4]])
 
   def test_stack(self):
     self.assertEqual(special_functions.stack(1, strict=False), 1)
diff --git a/tensorflow/python/autograph/operators/control_flow.py b/tensorflow/python/autograph/operators/control_flow.py
index 6eedd695a74c134ad1c7cc3524bef64ba5b7066a..89f7b8522f569542fa935877cdd9de6a9797c2c4 100644
--- a/tensorflow/python/autograph/operators/control_flow.py
+++ b/tensorflow/python/autograph/operators/control_flow.py
@@ -20,7 +20,6 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_math_ops
@@ -61,7 +60,7 @@ def for_stmt(iter_, extra_test, body, init_state):
   """
   if tensor_util.is_tensor(iter_):
     return _known_len_for_stmt(iter_, extra_test, body, init_state)
-  elif isinstance(iter_, dataset_ops.Dataset):
+  elif isinstance(iter_, dataset_ops.DatasetV2):
     return _dataset_for_stmt(iter_, extra_test, body, init_state)
   else:
     return _py_for_stmt(iter_, extra_test, body, init_state)
@@ -100,6 +99,7 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
       extra_deps=(iter_,),
       opts=dict(maximum_iterations=n))
   # Dropping the iteration index because it's not syntactically visible.
+  # TODO(mdan): Don't.
   results = results[1:]
 
   # TODO(mdan): Remove this special case.
@@ -110,40 +110,15 @@ def _known_len_for_stmt(iter_, extra_test, body, init_state):
 
 def _dataset_for_stmt(ds, extra_test, body, init_state):
   """Overload of for_stmt that iterates over TF Datasets."""
-  # Because Datsets only expose get_next, in the style of Python iterators,
-  # we are forced to unpack the loop as:
-  #
-  # epoch_number, iterate = ds.get_next()
-  # while epoch_number < 2:
-  #   <body>
-  #   epoch_number, iterate = ds.get_next()
-  epoch_numbers = dataset_ops.Dataset.range(2)
-  def tag_with(ds, tag):
-    return dataset_ops.Dataset.zip(
-        (dataset_ops.Dataset.from_tensors(tag).repeat(), ds))
-  ds_with_epoch = epoch_numbers.flat_map(lambda i: tag_with(ds, i))
-
-  iterator = ds_with_epoch.make_initializable_iterator()
-  with ops.control_dependencies((iterator.initializer,)):
-    epoch_number, iterate = iterator.get_next()
-
-    def while_body(epoch_number, iterate, *state):
-      new_state = body(iterate, *state)
-      epoch_number, iterate = iterator.get_next()
-      return (epoch_number, iterate) + new_state
-
-    def while_cond(epoch_number, iterate, *state):
-      del iterate
-      return gen_math_ops.logical_and(epoch_number < 1, extra_test(*state))
-
-    results = while_stmt(
-        while_cond,
-        while_body,
-        init_state=(epoch_number, iterate) + init_state,
-        extra_deps=())
-  # Dropping the epoch number and iterate because they are not syntactically
-  # visible.
-  results = results[2:]
+  if extra_test(*init_state) is not True:
+    raise NotImplementedError(
+        'break statements are not yet supported in for/Dataset loops')
+
+  def reduce_body(state, iterate):
+    new_state = body(iterate, *state)
+    return new_state
+
+  results = ds.reduce(init_state, reduce_body)
 
   # TODO(mdan): Remove this special case.
   if len(results) == 1:
diff --git a/tensorflow/python/autograph/operators/control_flow_test.py b/tensorflow/python/autograph/operators/control_flow_test.py
index 2dea18dc5faa250784eff815f216396c353e2014..0a7d4b64022f583bae4effc7d0f7eb04f46cc048 100644
--- a/tensorflow/python/autograph/operators/control_flow_test.py
+++ b/tensorflow/python/autograph/operators/control_flow_test.py
@@ -22,12 +22,14 @@ from tensorflow.python.autograph.operators import control_flow
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
 class ForLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_tensor(self):
     s = control_flow.for_stmt(
         constant_op.constant([1, 2, 3, 4]),
@@ -35,7 +37,7 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
   def test_python(self):
     s = control_flow.for_stmt(
@@ -45,6 +47,7 @@ class ForLoopTest(test.TestCase):
         init_state=(0,))
     self.assertEqual(10, s)
 
+  @test_util.run_deprecated_v1
   def test_dataset(self):
     to_int32 = lambda i: math_ops.cast(i, dtypes.int32)
     s = control_flow.for_stmt(
@@ -53,11 +56,12 @@ class ForLoopTest(test.TestCase):
         body=lambda i, s: (s + i,),
         init_state=(0,))
     with self.cached_session() as sess:
-      self.assertEqual((10,), sess.run(s))
+      self.assertEqual((10,), self.evaluate(s))
 
 
 class WhileLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_tensor(self):
     n = constant_op.constant(5)
     results = control_flow.while_stmt(
@@ -66,7 +70,7 @@ class WhileLoopTest(test.TestCase):
         init_state=(0, 0),
         extra_deps=(n,))
     with self.cached_session() as sess:
-      self.assertEqual((5, 10), sess.run(results))
+      self.assertEqual((5, 10), self.evaluate(results))
 
   def test_python(self):
     n = 5
@@ -87,23 +91,25 @@ class IfStmtTest(test.TestCase):
     return control_flow.if_stmt(
         cond=cond, body=lambda: (1, 2), orelse=lambda: (-1, -2))
 
+  @test_util.run_deprecated_v1
   def test_tensor(self):
     with self.cached_session() as sess:
       t = self.single_return_if_stmt(constant_op.constant(True))
-      self.assertEqual(1, sess.run(t))
+      self.assertEqual(1, self.evaluate(t))
       t = self.single_return_if_stmt(constant_op.constant(False))
-      self.assertEqual(-1, sess.run(t))
+      self.assertEqual(-1, self.evaluate(t))
 
   def test_python(self):
     self.assertEqual(1, self.single_return_if_stmt(True))
     self.assertEqual(-1, self.single_return_if_stmt(False))
 
+  @test_util.run_deprecated_v1
   def test_tensor_multiple_returns(self):
     with self.cached_session() as sess:
       t = self.multi_return_if_stmt(constant_op.constant(True))
-      self.assertAllEqual([1, 2], sess.run(t))
+      self.assertAllEqual([1, 2], self.evaluate(t))
       t = self.multi_return_if_stmt(constant_op.constant(False))
-      self.assertAllEqual([-1, -2], sess.run(t))
+      self.assertAllEqual([-1, -2], self.evaluate(t))
 
   def test_python_multiple_returns(self):
     self.assertEqual((1, 2), self.multi_return_if_stmt(True))
diff --git a/tensorflow/python/autograph/operators/data_structures_test.py b/tensorflow/python/autograph/operators/data_structures_test.py
index 6039b07982c8e4b820acda059c701b8fdb96e295..c5a3a3d1cac998a0fc59163d73288317bd4a3e30 100644
--- a/tensorflow/python/autograph/operators/data_structures_test.py
+++ b/tensorflow/python/autograph/operators/data_structures_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.autograph.operators import data_structures
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
@@ -43,7 +44,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_list_new([3, 4, 5])
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_list_new_empty(self):
     l = data_structures.tf_tensor_list_new([],
@@ -51,14 +52,15 @@ class ListTest(test.TestCase):
                                            element_shape=())
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [])
+      self.assertAllEqual(self.evaluate(t), [])
 
   def test_tf_tensor_list_new_from_tensor(self):
     l = data_structures.tf_tensor_list_new(constant_op.constant([3, 4, 5]))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
+  @test_util.run_deprecated_v1
   def test_tf_tensor_list_new_illegal_input(self):
     with self.assertRaises(ValueError):
       data_structures.tf_tensor_list_new([3, 4.0])
@@ -77,7 +79,7 @@ class ListTest(test.TestCase):
     l = data_structures.tf_tensor_array_new([3, 4, 5])
     t = l.stack()
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4, 5])
+      self.assertAllEqual(self.evaluate(t), [3, 4, 5])
 
   def test_tf_tensor_array_new_illegal_input(self):
     with self.assertRaises(ValueError):
@@ -102,15 +104,16 @@ class ListTest(test.TestCase):
 
     t = list_ops.tensor_list_stack(l, element_dtype=x.dtype)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [[1, 2, 3]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2, 3]])
 
+  @test_util.run_v1_only("b/117943489")
   def test_append_tensorarray(self):
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l1 = data_structures.list_append(l, 1)
     l2 = data_structures.list_append(l1, 2)
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(l1.stack()), [1])
-      self.assertAllEqual(sess.run(l2.stack()), [1, 2])
+      self.assertAllEqual(self.evaluate(l1.stack()), [1])
+      self.assertAllEqual(self.evaluate(l2.stack()), [1, 2])
 
   def test_append_python(self):
     l = []
@@ -131,10 +134,10 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       l, x = data_structures.list_pop(l, None, opts)
-      self.assertAllEqual(sess.run(x), [3, 4])
+      self.assertAllEqual(self.evaluate(x), [3, 4])
 
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[1, 2]])
+      self.assertAllEqual(self.evaluate(t), [[1, 2]])
 
   def test_pop_python(self):
     l = [1, 2, 3]
@@ -152,12 +155,12 @@ class ListTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = data_structures.list_stack(l, opts)
-      self.assertAllEqual(sess.run(t), sess.run(initial_list))
+      self.assertAllEqual(self.evaluate(t), self.evaluate(initial_list))
 
+  @test_util.run_deprecated_v1
   def test_stack_tensor_list_empty(self):
     l = list_ops.empty_tensor_list(
-        element_shape=-1,
-        element_dtype=dtypes.variant)
+        element_shape=None, element_dtype=dtypes.variant)
 
     opts = data_structures.ListStackOpts(
         element_dtype=dtypes.int32, original_call=None)
diff --git a/tensorflow/python/autograph/operators/exceptions_test.py b/tensorflow/python/autograph/operators/exceptions_test.py
index 186535d05b55e14ff0860e7b7610185db230865a..21ba76bb9521132ad3a54eb4d6004dc6d725d03f 100644
--- a/tensorflow/python/autograph/operators/exceptions_test.py
+++ b/tensorflow/python/autograph/operators/exceptions_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.autograph.operators import exceptions
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -30,8 +31,9 @@ class ExceptionsTest(test.TestCase):
     with self.cached_session() as sess:
       t = exceptions.assert_stmt(
           constant_op.constant(True), lambda: constant_op.constant('ignored'))
-      sess.run(t)
+      self.evaluate(t)
 
+  @test_util.run_deprecated_v1
   def test_assert_tf_triggered(self):
     with self.cached_session() as sess:
       t = exceptions.assert_stmt(
@@ -40,8 +42,9 @@ class ExceptionsTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    'test message'):
-        sess.run(t)
+        self.evaluate(t)
 
+  @test_util.run_deprecated_v1
   def test_assert_tf_multiple_printed_values(self):
     two_tensors = [
         constant_op.constant('test message'),
@@ -53,7 +56,7 @@ class ExceptionsTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    'test message.*another message'):
-        sess.run(t)
+        self.evaluate(t)
 
   def test_assert_python_untriggered(self):
     side_effect_trace = []
diff --git a/tensorflow/python/autograph/operators/logical_test.py b/tensorflow/python/autograph/operators/logical_test.py
index d6649f7b2bfccb17b31689fc8ff460c0c58d522c..e22f39932d17397bca22bff8793e7649580d75d3 100644
--- a/tensorflow/python/autograph/operators/logical_test.py
+++ b/tensorflow/python/autograph/operators/logical_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.autograph.operators import logical
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -42,14 +43,15 @@ class LogicalOperatorsTest(test.TestCase):
     self.assertFalse(logical.and_(lambda: False, lambda: True))
     self.assertFalse(logical.and_(lambda: False, self.assertNotCalled))
 
+  @test_util.run_deprecated_v1
   def test_and_tf(self):
     with self.cached_session() as sess:
       t = logical.and_(self._tf_true, self._tf_true)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.and_(self._tf_true, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.and_(self._tf_false, lambda: True)
-      self.assertEqual(sess.run(t), False)
+      self.assertEqual(self.evaluate(t), False)
       # TODO(mdan): Add a test for ops with side effects.
 
   def test_or_python(self):
@@ -60,14 +62,15 @@ class LogicalOperatorsTest(test.TestCase):
     self.assertTrue(logical.or_(lambda: False, lambda: True))
     self.assertTrue(logical.or_(lambda: True, self.assertNotCalled))
 
+  @test_util.run_deprecated_v1
   def test_or_tf(self):
     with self.cached_session() as sess:
       t = logical.or_(self._tf_false, self._tf_true)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.or_(self._tf_false, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       t = logical.or_(self._tf_true, lambda: True)
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
       # TODO(mdan): Add a test for ops with side effects.
 
   def test_not_python(self):
@@ -78,7 +81,7 @@ class LogicalOperatorsTest(test.TestCase):
   def test_not_tf(self):
     with self.cached_session() as sess:
       t = logical.not_(self._tf_false())
-      self.assertEqual(sess.run(t), True)
+      self.assertEqual(self.evaluate(t), True)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/py_builtins.py b/tensorflow/python/autograph/operators/py_builtins.py
index 2f55d538924609f4ad2549acccbc15a57ac13c19..ddf05f73f37821c6ff7e246051cd82a560f370e3 100644
--- a/tensorflow/python/autograph/operators/py_builtins.py
+++ b/tensorflow/python/autograph/operators/py_builtins.py
@@ -174,6 +174,7 @@ def _tf_py_func_print(objects, kwargs):
     override_kwargs['flush'] = True
 
   def print_wrapper(*vals):
+    vals = tuple(v.numpy() if tensor_util.is_tensor(v) else v for v in vals)
     if six.PY3:
       # TensorFlow doesn't seem to generate Unicode when passing strings to
       # py_func. This causes the print to add a "b'" wrapper to the output,
@@ -193,6 +194,7 @@ def range_(start_or_stop, stop=UNDEFINED, step=UNDEFINED):
 
 
 def _tf_range(start_or_stop, stop, step):
+  """Overload of range_ that generates a TF range tensor."""
   # Note: for static inputs (e.g. constants), tf.range errors out at graph
   # construction time, instead of returning an empty tensor. Preventing the
   # graph construction error aligns the semantics with Python.
diff --git a/tensorflow/python/autograph/operators/py_builtins_test.py b/tensorflow/python/autograph/operators/py_builtins_test.py
index 443e30a475d111e08e587c08fb47f42eb776182a..c856e39d141f8479e2b9409b21d6683618a5e645 100644
--- a/tensorflow/python/autograph/operators/py_builtins_test.py
+++ b/tensorflow/python/autograph/operators/py_builtins_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.autograph.operators import py_builtins
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.platform import test
@@ -38,29 +39,29 @@ class PyBuiltinsTest(test.TestCase):
     self.assertEqual(py_builtins.abs_(-1), 1)
     with self.cached_session() as sess:
       t = py_builtins.abs_(constant_op.constant(-1))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       t = py_builtins.abs_(constant_op.constant([-1, 2, -3]))
-      self.assertAllEqual(sess.run(t), [1, 2, 3])
+      self.assertAllEqual(self.evaluate(t), [1, 2, 3])
 
   def test_float(self):
     self.assertEqual(py_builtins.float_(10), 10.0)
     self.assertEqual(py_builtins.float_('10.0'), 10.0)
     with self.cached_session() as sess:
       t = py_builtins.float_(constant_op.constant(1, dtype=dtypes.int64))
-      self.assertEqual(sess.run(t), 1.0)
+      self.assertEqual(self.evaluate(t), 1.0)
       st = py_builtins.float_(constant_op.constant('1.0'))
-      self.assertEqual(sess.run(st), 1.0)
+      self.assertEqual(self.evaluate(st), 1.0)
 
   def test_int(self):
     self.assertEqual(py_builtins.int_(10.0), 10)
     self.assertEqual(py_builtins.int_('11', 2), 3)
     with self.cached_session() as sess:
       t = py_builtins.int_(constant_op.constant(1, dtype=dtypes.float64))
-      self.assertEqual(sess.run(t), 1)
+      self.assertEqual(self.evaluate(t), 1)
       st = py_builtins.int_(constant_op.constant('1'))
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
       st = py_builtins.int_(constant_op.constant('1'), 10)
-      self.assertEqual(sess.run(st), 1)
+      self.assertEqual(self.evaluate(st), 1)
 
   def test_int_unsupported_base(self):
     t = constant_op.constant(1, dtype=dtypes.float64)
@@ -73,14 +74,15 @@ class PyBuiltinsTest(test.TestCase):
       t = py_builtins.len_(constant_op.constant([[1], [2], [3]]))
       self.assertEqual(t, 3)
       ta = py_builtins.len_(tensor_array_ops.TensorArray(dtypes.int32, size=5))
-      self.assertEqual(sess.run(ta), 5)
+      self.assertEqual(self.evaluate(ta), 5)
       tl = py_builtins.len_(data_structures.tf_tensor_list_new([3, 4, 5]))
-      self.assertEqual(sess.run(tl), 3)
+      self.assertEqual(self.evaluate(tl), 3)
 
   def test_len_scalar(self):
     with self.assertRaises(ValueError):
       py_builtins.len_(constant_op.constant(1))
 
+  @test_util.run_deprecated_v1
   def test_len_dynamic_shape(self):
     with self.cached_session() as sess:
       p = array_ops.placeholder(dtype=dtypes.int32, shape=None)
@@ -91,6 +93,7 @@ class PyBuiltinsTest(test.TestCase):
         t = py_builtins.len_(p)
         sess.run(t, {p: 1})
 
+  @test_util.run_deprecated_v1
   def test_print_tensors(self):
     try:
       out_capturer = six.StringIO()
@@ -101,6 +104,7 @@ class PyBuiltinsTest(test.TestCase):
     finally:
       sys.stdout = sys.__stdout__
 
+  @test_util.run_deprecated_v1
   def test_print_complex(self):
     try:
       out_capturer = six.StringIO()
@@ -120,18 +124,18 @@ class PyBuiltinsTest(test.TestCase):
   def test_range_tensor(self):
     with self.cached_session() as sess:
       r = py_builtins.range_(constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [0, 1, 2])
+      self.assertAllEqual(self.evaluate(r), [0, 1, 2])
       r = py_builtins.range_(1, constant_op.constant(3))
-      self.assertAllEqual(sess.run(r), [1, 2])
+      self.assertAllEqual(self.evaluate(r), [1, 2])
       r = py_builtins.range_(2, 0, constant_op.constant(-1))
-      self.assertAllEqual(sess.run(r), [2, 1])
+      self.assertAllEqual(self.evaluate(r), [2, 1])
 
   def test_range_tensor_empty_range(self):
     with self.session() as sess:
       r = py_builtins.range_(constant_op.constant(-3))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
       r = py_builtins.range_(5, constant_op.constant(2))
-      self.assertAllEqual(sess.run(r), [])
+      self.assertAllEqual(self.evaluate(r), [])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/operators/slices_test.py b/tensorflow/python/autograph/operators/slices_test.py
index 9e4865b3c66923815338e70d4104c42318e56eb3..d444054fd772cf68b2e7c028adc87b6623ccffba 100644
--- a/tensorflow/python/autograph/operators/slices_test.py
+++ b/tensorflow/python/autograph/operators/slices_test.py
@@ -34,7 +34,7 @@ class SlicesTest(test.TestCase):
 
     with self.cached_session() as sess:
       t = list_ops.tensor_list_stack(l, element_dtype=initial_list.dtype)
-      self.assertAllEqual(sess.run(t), [[5, 6], [3, 4]])
+      self.assertAllEqual(self.evaluate(t), [[5, 6], [3, 4]])
 
   def test_get_item_tensor_list(self):
     initial_list = constant_op.constant([[1, 2], [3, 4]])
@@ -44,7 +44,7 @@ class SlicesTest(test.TestCase):
         l, 1, slices.GetItemOpts(element_dtype=initial_list.dtype))
 
     with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(t), [3, 4])
+      self.assertAllEqual(self.evaluate(t), [3, 4])
 
   def test_get_item_tensor_string(self):
     initial_str = constant_op.constant('abcd')
@@ -52,14 +52,14 @@ class SlicesTest(test.TestCase):
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'b')
+      self.assertEqual(self.evaluate(t), b'b')
 
     initial_list_str = constant_op.constant(['abcd', 'bcde'])
     t = slices.get_item(initial_list_str, 1,
                         slices.GetItemOpts(element_dtype=initial_str.dtype))
 
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(t), b'bcde')
+      self.assertEqual(self.evaluate(t), b'bcde')
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/pyct/BUILD b/tensorflow/python/autograph/pyct/BUILD
index ddadc6b96e8eb5417bfa1676ae304f7cbdedd92b..ba8ec271394981ec878473205a8dbbd19d255f3b 100644
--- a/tensorflow/python/autograph/pyct/BUILD
+++ b/tensorflow/python/autograph/pyct/BUILD
@@ -80,7 +80,6 @@ py_test(
     name = "compiler_test",
     srcs = ["compiler_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
@@ -154,7 +153,6 @@ py_test(
     name = "transformer_test",
     srcs = ["transformer_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":pyct",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/pyct/inspect_utils.py b/tensorflow/python/autograph/pyct/inspect_utils.py
index 4d56b93671e3305b5099f2ce8976ae629fc087c6..7c819f364fa79d40c0fbb080b3b358b36bfd8c0c 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils.py
@@ -46,6 +46,28 @@ if six.PY2:
   SPECIAL_BUILTINS['xrange'] = xrange
 
 
+def islambda(f):
+  if not tf_inspect.isfunction(f):
+    return False
+  if not hasattr(f, '__name__'):
+    return False
+  return f.__name__ == '<lambda>'
+
+
+def isnamedtuple(f):
+  """Returns True if the argument is a namedtuple-like."""
+  if not (tf_inspect.isclass(f) and issubclass(f, tuple)):
+    return False
+  if not hasattr(f, '_fields'):
+    return False
+  fields = getattr(f, '_fields')
+  if not isinstance(fields, tuple):
+    return False
+  if not all(isinstance(f, str) for f in fields):
+    return False
+  return True
+
+
 def isbuiltin(f):
   """Returns True if the argument is a built-in function."""
   if f in SPECIAL_BUILTINS.values():
diff --git a/tensorflow/python/autograph/pyct/inspect_utils_test.py b/tensorflow/python/autograph/pyct/inspect_utils_test.py
index 622e3bafc0ab3d7dd8876cbbbee45f8055c48056..a2c39056d1b09dbae937915cf17de5c6f55d4886 100644
--- a/tensorflow/python/autograph/pyct/inspect_utils_test.py
+++ b/tensorflow/python/autograph/pyct/inspect_utils_test.py
@@ -18,7 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from functools import wraps
+import collections
+import functools
 import imp
 import types
 import weakref
@@ -46,7 +47,7 @@ def wrapping_decorator():
     def replacement(*_):
       return None
 
-    @wraps(f)
+    @functools.wraps(f)
     def wrapper(*args, **kwargs):
       return replacement(*args, **kwargs)
     return wrapper
@@ -95,6 +96,38 @@ def free_factory():
 
 class InspectUtilsTest(test.TestCase):
 
+  def test_islambda(self):
+    def test_fn():
+      pass
+
+    self.assertTrue(inspect_utils.islambda(lambda x: x))
+    self.assertFalse(inspect_utils.islambda(test_fn))
+
+  def test_isnamedtuple(self):
+    nt = collections.namedtuple('TestNamedTuple', ['a', 'b'])
+
+    class NotANamedTuple(tuple):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(nt))
+    self.assertFalse(inspect_utils.isnamedtuple(NotANamedTuple))
+
+  def test_isnamedtuple_confounder(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleLike(tuple):
+      _fields = ('a', 'b')
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleLike))
+
+  def test_isnamedtuple_subclass(self):
+    """This test highlights false positives when detecting named tuples."""
+
+    class NamedTupleSubclass(collections.namedtuple('Test', ['a', 'b'])):
+      pass
+
+    self.assertTrue(inspect_utils.isnamedtuple(NamedTupleSubclass))
+
   def test_getnamespace_globals(self):
     ns = inspect_utils.getnamespace(factory)
     self.assertEqual(ns['free_function'], free_function)
diff --git a/tensorflow/python/autograph/pyct/parser.py b/tensorflow/python/autograph/pyct/parser.py
index 8f4037c5e286accc600dbac97acd7b5fe045b582..39fc1a7ed05c06da89efe505e439b307badb4b4e 100644
--- a/tensorflow/python/autograph/pyct/parser.py
+++ b/tensorflow/python/autograph/pyct/parser.py
@@ -24,6 +24,7 @@ from __future__ import print_function
 import textwrap
 
 import gast
+import six
 
 from tensorflow.python.util import tf_inspect
 
@@ -91,7 +92,17 @@ def parse_entity(entity):
 def parse_str(src):
   """Returns the AST of given piece of code."""
   # TODO(mdan): This should exclude the module things are autowrapped in.
-  return gast.parse(src)
+
+  if six.PY2 and '.print(' in src:
+    # This special treatment is required because gast.parse is not aware of
+    # whether print_function was present in the original context.
+    src = 'from __future__ import print_function\n' + src
+    parsed_module = gast.parse(src)
+    parsed_module.body = parsed_module.body[1:]
+  else:
+    parsed_module = gast.parse(src)
+
+  return parsed_module
 
 
 def parse_expression(src):
diff --git a/tensorflow/python/autograph/pyct/static_analysis/BUILD b/tensorflow/python/autograph/pyct/static_analysis/BUILD
index 4a4ccdcbd15a592d4a6d2713c192d60e8dc76492..5e260c5730ae855397f3f94664c0ccb409dcbba1 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/BUILD
+++ b/tensorflow/python/autograph/pyct/static_analysis/BUILD
@@ -38,7 +38,6 @@ py_test(
     name = "activity_test",
     srcs = ["activity_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/python:client_testlib",
@@ -51,7 +50,6 @@ py_test(
     name = "live_values_test",
     srcs = ["live_values_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_windows"],
     deps = [
         ":static_analysis",
         "//tensorflow/python:client_testlib",
diff --git a/tensorflow/python/autograph/pyct/static_analysis/liveness.py b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
index 451398f1b70abf56d6c141305930c8a4e1a66a07..f8b8d7fa77c167e0ebf96dd533e3c42b0c30b8e5 100644
--- a/tensorflow/python/autograph/pyct/static_analysis/liveness.py
+++ b/tensorflow/python/autograph/pyct/static_analysis/liveness.py
@@ -161,6 +161,16 @@ class Annotator(transformer.Base):
     self.cross_function_analyzer = cross_function_analyzer
     self.current_analyzer = None
 
+  def visit(self, node):
+    node = super(Annotator, self).visit(node)
+    if (self.current_analyzer is not None and
+        isinstance(node, gast.stmt) and
+        node in self.current_analyzer.graph.index):
+      cfg_node = self.current_analyzer.graph.index[node]
+      anno.setanno(node, anno.Static.LIVE_VARS_IN,
+                   frozenset(self.current_analyzer.in_[cfg_node]))
+    return node
+
   def visit_FunctionDef(self, node):
     parent_analyzer = self.current_analyzer
     self.current_analyzer = self.cross_function_analyzer.analyzers[node]
@@ -198,6 +208,10 @@ class Annotator(transformer.Base):
     node = self._block_statement_live_out(node)
     return self._block_statement_live_in(node, node.test)
 
+  def visit_With(self, node):
+    node = self.generic_visit(node)
+    return self._block_statement_live_in(node, node.items[0])
+
   def visit_Expr(self, node):
     node = self.generic_visit(node)
     cfg_node = self.current_analyzer.graph.index[node]
diff --git a/tensorflow/python/autograph/pyct/templates.py b/tensorflow/python/autograph/pyct/templates.py
index 9901295445c7a77c78ee1c0de9c27724948741c0..2272ea42086ff726eaf02f8fccacc6b661d6207e 100644
--- a/tensorflow/python/autograph/pyct/templates.py
+++ b/tensorflow/python/autograph/pyct/templates.py
@@ -32,6 +32,66 @@ from tensorflow.python.autograph.pyct import parser
 from tensorflow.python.autograph.pyct import qual_names
 
 
+class ContextAdjuster(gast.NodeTransformer):
+  """Adjusts the ctx field of nodes to ensure consistency.
+
+  This transformer can change the ctx fields of a variable, tuple and other
+  AST elements that allow one, based on whether the element is being read or
+  written.
+  """
+
+  def __init__(self, override_value):
+    self._ctx_override = override_value
+
+  def visit(self, node):
+    original_override = self._ctx_override
+    node = super(ContextAdjuster, self).visit(node)
+    if hasattr(node, 'ctx'):
+      assert node.ctx is not None, 'node {} has ctx unset'.format(node)
+    self._ctx_override = original_override
+    return node
+
+  def _apply_override(self, node):
+    if self._ctx_override is not None:
+      node.ctx = self._ctx_override()
+
+  def visit_Attribute(self, node):
+    self._apply_override(node)
+    self._ctx_override = gast.Load
+    node = self.generic_visit(node)
+    return node
+
+  def visit_Tuple(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_List(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Name(self, node):
+    self._apply_override(node)
+    return self.generic_visit(node)
+
+  def visit_Call(self, node):
+    self._apply_override(node)
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Dict(self, node):
+    # We may be able to override these to Load(), but for now it's simpler
+    # to just assert that they're set.
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+  def visit_Subscript(self, node):
+    node.value = self.visit(node.value)
+    self._ctx_override = None
+    return self.generic_visit(node)
+
+
 class ReplaceTransformer(gast.NodeTransformer):
   """Replace AST nodes."""
 
@@ -106,91 +166,6 @@ class ReplaceTransformer(gast.NodeTransformer):
     node.name = repl.id
     return node
 
-  def _check_has_context(self, node):
-    if not node.ctx:
-      raise ValueError('node %s is missing ctx value' % node)
-
-  # TODO(mdan): Rewrite _check and _set using a separate transformer.
-  def _check_inner_children_have_context(self, node):
-    if isinstance(node, gast.Attribute):
-      self._check_inner_children_have_context(node.value)
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._check_inner_children_have_context(e)
-      self._check_has_context(node)
-    elif isinstance(node, gast.Dict):
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Index):
-      self._check_inner_children_have_context(node.value)
-    elif isinstance(node, gast.Subscript):
-      self._check_inner_children_have_context(node.value)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.Slice):
-      self._check_inner_children_have_context(node.lower)
-      if node.upper:
-        self._check_inner_children_have_context(node.upper)
-      if node.step:
-        self._check_inner_children_have_context(node.step)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, gast.Name):
-      self._check_has_context(node)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    elif isinstance(node, gast.Call):
-      self._check_inner_children_have_context(node.func)
-      for a in node.args:
-        self._check_inner_children_have_context(a)
-      for k in node.keywords:
-        self._check_inner_children_have_context(k.value)
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
-  def _set_inner_child_context(self, node, ctx):
-    if isinstance(node, gast.Attribute):
-      self._set_inner_child_context(node.value, gast.Load())
-      node.ctx = ctx
-    elif isinstance(node, (gast.Tuple, gast.List)):
-      for e in node.elts:
-        self._set_inner_child_context(e, ctx)
-      node.ctx = ctx
-    elif isinstance(node, gast.Name):
-      node.ctx = ctx
-    elif isinstance(node, gast.Call):
-      self._set_inner_child_context(node.func, ctx)
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for a in node.args:
-        self._check_inner_children_have_context(a)
-      for k in node.keywords:
-        self._check_inner_children_have_context(k.value)
-    elif isinstance(node, gast.Dict):
-      # We may be able to override these to Load(), but for now it's simpler
-      # to just assert that they're set.
-      for e in node.keys:
-        self._check_inner_children_have_context(e)
-      for e in node.values:
-        self._check_inner_children_have_context(e)
-    elif isinstance(node, gast.Subscript):
-      self._set_inner_child_context(node.value, ctx)
-      self._check_inner_children_have_context(node.slice)
-    elif isinstance(node, gast.BinOp):
-      self._check_inner_children_have_context(node.left)
-      self._check_inner_children_have_context(node.right)
-    elif isinstance(node, gast.UnaryOp):
-      self._check_inner_children_have_context(node.operand)
-    elif isinstance(node, (gast.Str, gast.Num)):
-      pass
-    else:
-      raise ValueError('unexpected node type "%s"' % node)
-
   def visit_Attribute(self, node):
     node = self.generic_visit(node)
     if node.attr not in self.replacements:
@@ -210,16 +185,10 @@ class ReplaceTransformer(gast.NodeTransformer):
     new_nodes = self._prepare_replacement(node, node.id)
 
     # Preserve the target context.
+    adjuster = ContextAdjuster(type(node.ctx))
     for n in new_nodes:
-      if isinstance(n, (gast.Tuple, gast.List)):
-        for e in n.elts:
-          self._set_inner_child_context(e, node.ctx)
-      if isinstance(n, gast.Attribute):
-        # For attributes, the inner Name node receives the context, while the
-        # outer ones have it set to Load.
-        self._set_inner_child_context(n, node.ctx)
-      else:
-        n.ctx = node.ctx
+      if hasattr(n, 'ctx'):
+        adjuster.visit(n)
 
     if len(new_nodes) == 1:
       new_nodes, = new_nodes
diff --git a/tensorflow/python/autograph/pyct/templates_test.py b/tensorflow/python/autograph/pyct/templates_test.py
index 54019ef5f4a20ed4a4d69d9c57c8addd12ee3c75..cdb44b822e84ad5822c78d50c2f958b1fba9ec18 100644
--- a/tensorflow/python/autograph/pyct/templates_test.py
+++ b/tensorflow/python/autograph/pyct/templates_test.py
@@ -134,19 +134,18 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_expression_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo
     """
 
     node = templates.replace(
         template, foo=parser.parse_expression('a + 2 * b / -c'))[0]
-    self.assertIsInstance(node.body[0].ctx, gast.Load)
     self.assertIsInstance(node.body[0].left.ctx, gast.Load)
     self.assertIsInstance(node.body[0].right.left.right.ctx, gast.Load)
 
   def test_replace_complex_context(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
@@ -160,7 +159,7 @@ class TemplatesTest(test.TestCase):
 
   def test_replace_index(self):
     template = """
-      def test_fn(foo):
+      def test_fn():
         foo = 0
     """
 
diff --git a/tensorflow/python/autograph/utils/misc_test.py b/tensorflow/python/autograph/utils/misc_test.py
index 8d2b0d6e13802313abf6751b0e62b2807a866c2f..c78df48d6263b121076c86198670222441e7fec7 100644
--- a/tensorflow/python/autograph/utils/misc_test.py
+++ b/tensorflow/python/autograph/utils/misc_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.utils.misc import alias_tensors
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops.variables import Variable
 from tensorflow.python.platform import test
@@ -26,14 +27,16 @@ from tensorflow.python.platform import test
 
 class MiscTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_alias_single_tensor(self):
     a = constant(1)
 
     new_a = alias_tensors(a)
     self.assertFalse(new_a is a)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
+  @test_util.run_deprecated_v1
   def test_alias_tensors(self):
     a = constant(1)
     v = Variable(2)
@@ -47,7 +50,7 @@ class MiscTest(test.TestCase):
     self.assertTrue(new_s is s)
     self.assertTrue(new_l is l)
     with self.cached_session() as sess:
-      self.assertEqual(1, sess.run(new_a))
+      self.assertEqual(1, self.evaluate(new_a))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/utils/py_func.py b/tensorflow/python/autograph/utils/py_func.py
index 11ebfb2e49f0e762b56ae2cde2b76d2e24032d72..ee8b46b52061f28eacdf2f980cccb07c889e7274 100644
--- a/tensorflow/python/autograph/utils/py_func.py
+++ b/tensorflow/python/autograph/utils/py_func.py
@@ -127,5 +127,6 @@ def wrap_py_func(f, return_dtypes, args, kwargs=None, use_dummy_return=False):
     retval = f(*f_args, **f_kwargs)
     return 1 if use_dummy_return else retval
 
-  return script_ops.py_func(f_wrapper, tensor_args, dtypes.int64
-                            if use_dummy_return else return_dtypes)
+  if use_dummy_return:
+    return_dtypes = dtypes.int32
+  return script_ops.eager_py_func(f_wrapper, tensor_args, return_dtypes)
diff --git a/tensorflow/python/autograph/utils/py_func_test.py b/tensorflow/python/autograph/utils/py_func_test.py
index 1c220d94922be680021bd96c6b7ddbf2593c6125..d17ede77142483208a0954244579b3249f0ffba5 100644
--- a/tensorflow/python/autograph/utils/py_func_test.py
+++ b/tensorflow/python/autograph/utils/py_func_test.py
@@ -32,15 +32,15 @@ class PyFuncTest(test.TestCase):
       return a + b + c
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (1, constant_op.constant(1), 1))
-      self.assertEqual(3, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (1, 1, 1))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (1, 1, 1))
+      self.assertEqual(3, self.evaluate(result))
       result = py_func.wrap_py_func(
-          test_fn, dtypes.int64,
+          test_fn, dtypes.int32,
           (constant_op.constant(1), 1, constant_op.constant(1)))
-      self.assertEqual(3, sess.run(result))
+      self.assertEqual(3, self.evaluate(result))
 
   def test_wrap_py_func_complex_args(self):
 
@@ -53,11 +53,11 @@ class PyFuncTest(test.TestCase):
       return a * b.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass()))
-      self.assertEqual(35, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass()))
+      self.assertEqual(35, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass()))
-      self.assertEqual(35, sess.run(result))
+      self.assertEqual(35, self.evaluate(result))
 
   def test_wrap_py_func_kwargs(self):
 
@@ -70,17 +70,17 @@ class PyFuncTest(test.TestCase):
       return a * b.foo + c * d.foo
 
     with self.cached_session() as sess:
-      result = py_func.wrap_py_func(test_fn, dtypes.int64, (7, TestClass(5)), {
+      result = py_func.wrap_py_func(test_fn, dtypes.int32, (7, TestClass(5)), {
           'c': 11,
           'd': TestClass(13)
       })
-      self.assertEqual(178, sess.run(result))
-      result = py_func.wrap_py_func(test_fn, dtypes.int64,
+      self.assertEqual(178, self.evaluate(result))
+      result = py_func.wrap_py_func(test_fn, dtypes.int32,
                                     (constant_op.constant(7), TestClass(5)), {
                                         'c': constant_op.constant(11),
                                         'd': TestClass(13)
                                     })
-      self.assertEqual(178, sess.run(result))
+      self.assertEqual(178, self.evaluate(result))
 
   def test_wrap_py_func_dummy_return(self):
 
@@ -91,11 +91,11 @@ class PyFuncTest(test.TestCase):
 
     with self.cached_session() as sess:
       result = py_func.wrap_py_func(test_fn, None, (5,), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([1], side_counter)
       result = py_func.wrap_py_func(
           test_fn, None, (constant_op.constant(5),), use_dummy_return=True)
-      self.assertEqual(1, sess.run(result))
+      self.assertEqual(1, self.evaluate(result))
       self.assertEqual([2], side_counter)
 
 
diff --git a/tensorflow/python/autograph/utils/tensor_list_test.py b/tensorflow/python/autograph/utils/tensor_list_test.py
index 697c166eb12c0f3e5b3782259795fcf2e366cb5d..bbbc3bf691818d292d53999c563bcc1112d0703f 100644
--- a/tensorflow/python/autograph/utils/tensor_list_test.py
+++ b/tensorflow/python/autograph/utils/tensor_list_test.py
@@ -19,10 +19,10 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.autograph.utils import tensor_list as tl
-from tensorflow.python.client.session import Session
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.constant_op import constant
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import tensor_array_ops
@@ -34,6 +34,7 @@ class TensorListTest(test.TestCase):
   def _shape(self, shape_tuple):
     return constant(shape_tuple, dtypes.int32)
 
+  @test_util.run_v1_only("b/117943489")
   def test_dynamic_list_append(self):
     l = []
     l = tl.dynamic_list_append(l, 1)
@@ -42,19 +43,16 @@ class TensorListTest(test.TestCase):
     l = list_ops.empty_tensor_list(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
     s = list_ops.tensor_list_stack(l, element_dtype=dtypes.int32)
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tensor_array_ops.TensorArray(dtypes.int32, size=0, dynamic_size=True)
     l = tl.dynamic_list_append(l, 1)
     s = l.stack()
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(s), [1])
+    self.assertAllEqual(s, [1])
 
     l = tl.TensorList(self._shape(()), dtypes.int32)
     l = tl.dynamic_list_append(l, 1)
-    with self.cached_session() as sess:
-      self.assertAllEqual(sess.run(l[0]), 1)
+    self.assertAllEqual(l[0], 1)
 
   def test_list_append_python(self):
     with context.eager_mode():
@@ -80,6 +78,7 @@ class TensorListTest(test.TestCase):
       l[0] = ops.convert_to_tensor(b)
       self.assertEqual(l[0].numpy(), b.numpy())
 
+  @test_util.run_deprecated_v1
   def test_list_append_tf(self):
     a = constant(3.0)
     l = tl.TensorList(a.shape, a.dtype)
@@ -91,13 +90,12 @@ class TensorListTest(test.TestCase):
     c3 = l.count()
     a2 = l.pop()
     c4 = l.count()
-    with Session() as sess:
-      c1, c2, c3, c4, a, a2 = sess.run([c1, c2, c3, c4, a, a2])
-      self.assertEqual(c1, 1)
-      self.assertEqual(c2, 2)
-      self.assertEqual(c3, 1)
-      self.assertEqual(c4, 0)
-      self.assertEqual(a, a2)
+    c1, c2, c3, c4, a, a2 = self.evaluate([c1, c2, c3, c4, a, a2])
+    self.assertEqual(c1, 1)
+    self.assertEqual(c2, 2)
+    self.assertEqual(c3, 1)
+    self.assertEqual(c4, 0)
+    self.assertEqual(a, a2)
 
   def test_list_index_tf(self):
     a = constant(3.0)
@@ -107,10 +105,9 @@ class TensorListTest(test.TestCase):
     l0 = l[0]
     l[0] = b
     l1 = l[0]
-    with self.cached_session() as sess:
-      l0, l1, a, b = sess.run([l0, l1, a, b])
-      self.assertEqual(l0, a)
-      self.assertEqual(l1, b)
+    l0, l1, a, b = self.evaluate([l0, l1, a, b])
+    self.assertEqual(l0, a)
+    self.assertEqual(l1, b)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/autograph/utils/type_check.py b/tensorflow/python/autograph/utils/type_check.py
index 8748abc47bcfb55b4d0b11178a46816249732da9..ccef7dee03982e46a969ec70d4bbd8f61f8ce6d7 100644
--- a/tensorflow/python/autograph/utils/type_check.py
+++ b/tensorflow/python/autograph/utils/type_check.py
@@ -30,4 +30,4 @@ def is_tensor(*args):
   Returns:
     True if any *args are TensorFlow types, False if none are.
   """
-  return any([tensor_util.is_tensor(a) for a in args])
+  return any(tensor_util.is_tensor(a) for a in args)
diff --git a/tensorflow/python/autograph/utils/type_check_test.py b/tensorflow/python/autograph/utils/type_check_test.py
index b3d1304e16ff1f53e3e1686d5973b76f2de91b1a..2521dc9f925625163ffd0caf63a0c6ac17eca969 100644
--- a/tensorflow/python/autograph/utils/type_check_test.py
+++ b/tensorflow/python/autograph/utils/type_check_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.platform import test
 
 class TypeCheckTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_checks(self):
     self.assertTrue(type_check.is_tensor(constant_op.constant([1, 2, 3])))
     self.assertTrue(
diff --git a/tensorflow/python/client/device_lib.i b/tensorflow/python/client/device_lib.i
index 944e855cee2ab9da7a4a801d1b993bec4d8ebc55..3e579152d5170d8c773136f09add59aaa5b89d98 100644
--- a/tensorflow/python/client/device_lib.i
+++ b/tensorflow/python/client/device_lib.i
@@ -48,17 +48,14 @@ static std::vector<string> ListDevicesWithSessionConfig(
   std::vector<string> output;
   SessionOptions options;
   options.config = config;
-  std::vector<Device*> devices;
+  std::vector<std::unique_ptr<Device>> devices;
   Status status = DeviceFactory::AddDevices(
       options, "" /* name_prefix */, &devices);
   if (!status.ok()) {
     Set_TF_Status_from_Status(out_status, status);
   }
 
-  std::vector<std::unique_ptr<Device>> device_holder(devices.begin(),
-                                                     devices.end());
-
-  for (const Device* device : devices) {
+  for (const std::unique_ptr<Device>& device : devices) {
     const DeviceAttributes& attr = device->attributes();
     string attr_serialized;
     if (!attr.SerializeToString(&attr_serialized)) {
diff --git a/tensorflow/python/client/session.py b/tensorflow/python/client/session.py
index 06c66dda9fbc0028fa118c9c18943ba39f8b9b47..87a200ed336735f4b4abd9b0ac2352e36f7b84e4 100644
--- a/tensorflow/python/client/session.py
+++ b/tensorflow/python/client/session.py
@@ -828,7 +828,7 @@ class BaseSession(SessionInterface):
     nested list, tuple, namedtuple, dict, or OrderedDict containing graph
     elements at its leaves.  A graph element can be one of the following types:
 
-    * An `tf.Operation`.
+    * A `tf.Operation`.
       The corresponding fetched value will be `None`.
     * A `tf.Tensor`.
       The corresponding fetched value will be a numpy ndarray containing the
@@ -1097,7 +1097,7 @@ class BaseSession(SessionInterface):
           if isinstance(subfeed_val, ops.Tensor):
             raise TypeError('The value of a feed cannot be a tf.Tensor object. '
                             'Acceptable feed values include Python scalars, '
-                            'strings, lists, numpy ndarrays, or TensorHandles.'
+                            'strings, lists, numpy ndarrays, or TensorHandles. '
                             'For reference, the tensor object was ' +
                             str(feed_val) + ' which was passed to the '
                             'feed with key ' + str(feed) + '.')
diff --git a/tensorflow/python/client/session_clusterspec_prop_test.py b/tensorflow/python/client/session_clusterspec_prop_test.py
index df020f88a88687ac9616d40618aebb8f7eef2858..224f880ed15f1796b08d1db3ea52c52302a9b83f 100644
--- a/tensorflow/python/client/session_clusterspec_prop_test.py
+++ b/tensorflow/python/client/session_clusterspec_prop_test.py
@@ -62,7 +62,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
 
     const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testClusterSpecPropagationWorker2Placement(self):
@@ -106,7 +106,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default() as g, ops.device('/job:worker/task:0'):
       const = constant_op.constant(17)
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(const)
+    output = self.evaluate(const)
     self.assertEqual(17, output)
 
   def testCanonicalDeviceNames(self):
@@ -208,7 +208,7 @@ class SessionClusterSpecPropagationTest(test_util.TensorFlowTestCase):
       with ops.device('/job:worker/task:0/cpu:0'):
         sum3 = sum1 + sum2
     sess = session.Session(server1.target, config=config, graph=g)
-    output = sess.run(sum3)
+    output = self.evaluate(sum3)
     self.assertEqual(40, output)
 
   def testLegacyDeviceNames(self):
diff --git a/tensorflow/python/client/session_partial_run_test.py b/tensorflow/python/client/session_partial_run_test.py
index 92ca47efa9348f4ac77f2b22e684080eccb38617..a97930635af5cee0cea4bcdf6f04a5894d7d3aed 100644
--- a/tensorflow/python/client/session_partial_run_test.py
+++ b/tensorflow/python/client/session_partial_run_test.py
@@ -117,7 +117,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     a = constant_op.constant(2.0, dtypes.float32)
     b = a * 2
     c = b * 3
-    r1 = sess.run([b, c])
+    r1 = self.evaluate([b, c])
     h = sess.partial_run_setup([b, c], [])
     r2 = sess.partial_run(h, [b, c])
     self.assertEqual(r1, r2)
@@ -188,6 +188,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     r = sess.partial_run(h, [b], {})
     self.assertEqual([6.0], r)
 
+  @test_util.run_deprecated_v1
   def testInvalidPartialRunSetup(self):
     sess = session.Session()
     x = array_ops.placeholder(dtypes.float32, shape=[])
@@ -196,6 +197,7 @@ class PartialRunTest(test_util.TensorFlowTestCase):
         'specify at least one target to fetch or execute.'):
       sess.partial_run_setup(fetches=[], feeds=[x])
 
+  @test_util.run_deprecated_v1
   def testPartialRunSetupNoFeedsPassed(self):
     sess = session.Session()
     r1 = constant_op.constant([6.0])
@@ -204,80 +206,102 @@ class PartialRunTest(test_util.TensorFlowTestCase):
     result1 = sess.partial_run(h, r1)
     self.assertEqual([6.0], result1)
 
+  @test_util.run_deprecated_v1
   def testPartialRunDirect(self):
     self.RunTestPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunIncompleteDirect(self):
     self.RunTestPartialRunIncomplete(session.Session())
 
+  @test_util.run_deprecated_v1
   def testConcurrentPartialRunDirect(self):
     self.RunTestConcurrentPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testManyPartialRunDirect(self):
     self.RunTestManyPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testRunAndPartialRunDirect(self):
     self.RunTestRunAndPartialRun(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunMissingPlaceholderFeedExceptionDirect(self):
     self.RunTestPartialRunMissingPlaceholderFeedException(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFeedDirect(self):
     self.RunTestPartialRunUnspecifiedFeed(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFetchDirect(self):
     self.RunTestPartialRunUnspecifiedFetch(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFedDirect(self):
     self.RunTestPartialRunAlreadyFed(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFetchedDirect(self):
     self.RunTestPartialRunAlreadyFetched(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunEmptyFetchesDirect(self):
     self.RunTestPartialRunEmptyFetches(session.Session())
 
+  @test_util.run_deprecated_v1
   def testPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunIncompleteDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunIncomplete(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testConcurrentPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestConcurrentPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testManyPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestManyPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testRunAndPartialRunDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestRunAndPartialRun(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunMissingPlaceholderFeedExceptionDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunMissingPlaceholderFeedException(
         session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFeedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunUnspecifiedFeed(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunUnspecifiedFetchDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunUnspecifiedFetch(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunAlreadyFed(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunAlreadyFetchedDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunAlreadyFetched(session.Session(server.target))
 
+  @test_util.run_deprecated_v1
   def testPartialRunEmptyFetchesDist(self):
     server = server_lib.Server.create_local_server()
     self.RunTestPartialRunEmptyFetches(session.Session(server.target))
diff --git a/tensorflow/python/client/session_test.py b/tensorflow/python/client/session_test.py
index 347833ce8fd095eb4acdef4a8a7e09046b554ba3..c4a118a41406afc52586553b1d3f0b446005c46d 100644
--- a/tensorflow/python/client/session_test.py
+++ b/tensorflow/python/client/session_test.py
@@ -312,6 +312,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, res[2])
       self.assertEqual(44.0, res[1])
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -340,6 +341,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(val3, result.field1)
       self.assertAllEqual(val2, result.field2)
 
+  @test_util.run_v1_only('b/120545219')
   def testFetchNestedAttrs(self):
     if attr is None:
       self.skipTest('attr module is unavailable.')
@@ -1024,6 +1026,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       fed_c_val = c.eval(feed_dict={a.name: [[4.0, 4.0]]})
       self.assertAllEqual([[16.0, 16.0, 16.0]], fed_c_val)
 
+  @test_util.run_v1_only('b/120545219')
   def testOperationRunMethod(self):
     with session.Session():
       a = constant_op.constant(1.0, shape=[1, 2])
@@ -1154,6 +1157,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         else:
           importer.import_graph_def(gdef, name='import')
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndSingleBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1174,6 +1178,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       for t in threads:
         t.join()
 
+  @test_util.run_v1_only('b/120545219')
   def testParallelRunAndParallelBuild(self):
     with session.Session() as sess:
       c = constant_op.constant(5.0)
@@ -1274,6 +1279,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(RuntimeError, 'The Session graph is empty.'):
         sess.run({})
 
+  @test_util.run_v1_only('b/120545219')
   def testNotEntered(self):
     # pylint: disable=protected-access
     self.assertEqual(ops._default_session_stack.get_default(), None)
@@ -1289,6 +1295,7 @@ class SessionTest(test_util.TensorFlowTestCase):
           ValueError, lambda e: 'No default session is registered.' in str(e)):
         c_2.eval()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractive(self):
     with ops.device('/cpu:0'):
       sess = session.InteractiveSession()
@@ -1301,6 +1308,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([[24.0]], e.eval())
       sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testMultipleInteractiveSessionsWarning(self):
     # Reinitialize the global state to ensure that the expected warnings will
     # be emitted.
@@ -1328,6 +1336,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess2.close()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testInteractivePlacePrunedGraph(self):
     sess = session.InteractiveSession()
 
@@ -1349,6 +1358,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       a.eval()
     sess.close()
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultSessionPlacePrunedGraph(self):
     sess = session.Session()
 
@@ -1769,9 +1779,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     sess.run(a, run_metadata=run_metadata)
     self.assertEqual(len(run_metadata.partition_graphs), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDirect(self):
     self.runTestOutputPartitionGraphs(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testOutputPartitionGraphsDistributed(self):
     server = server_lib.Server.create_local_server()
     self.runTestOutputPartitionGraphs(session.Session(server.target))
@@ -1796,6 +1808,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     del sess1
     del sess2
 
+  @test_util.run_v1_only('b/120545219')
   def testAsDefault(self):
     c = constant_op.constant(37)
     sess = session.Session()
@@ -1821,6 +1834,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(TypeError, 'graph must be a tf.Graph'):
       session.Session(graph=37)
 
+  @test_util.run_v1_only('b/120545219')
   def testTimeoutWithShortOperations(self):
     num_epochs = 5
     q = data_flow_ops.FIFOQueue(capacity=50, dtypes=[dtypes.int32], shapes=[()])
@@ -1834,6 +1848,7 @@ class SessionTest(test_util.TensorFlowTestCase):
         sess.run(enqueue_op)
       self.assertEqual(sess.run(q.size()), num_epochs * 2)
 
+  @test_util.run_v1_only('b/120545219')
   def testRegisterFetchAndFeedConversionFunctions(self):
 
     class SquaredTensor(object):
@@ -1865,6 +1880,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       squared_eval = sess.partial_run(partial_run, squared_tensor)
       self.assertAllClose(np2 * np2, squared_eval)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultLogDevicePlacement(self):
 
     class CaptureStderr(str):
@@ -1914,6 +1930,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       self.assertTrue('/job:local/replica:0/task:0/device:CPU:0' in str(log),
                       str(log))
 
+  @test_util.run_v1_only('b/120545219')
   def testLocalMasterSessionTimeout(self):
     # Test that the timeout passed in a config to the session works correctly.
     config = config_pb2.ConfigProto(operation_timeout_in_ms=1000)
@@ -1927,6 +1944,7 @@ class SessionTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.DeadlineExceededError):
         sess.run(dequeued_t)
 
+  @test_util.run_v1_only('b/120545219')
   def testDefaultServerTimeout(self):
     # Test that the default server config timeout gets used when no Session
     # config is provided.
@@ -1952,9 +1970,11 @@ class SessionTest(test_util.TensorFlowTestCase):
     with self.assertRaisesOpError('has inputs from different frames'):
       sess.run(res, feed_dict={data: 1.0})
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDirect(self):
     self.runTestBuildGraphError(session.Session())
 
+  @test_util.run_v1_only('b/120545219')
   def testBuildGraphErrorDist(self):
     server = server_lib.Server.create_local_server()
     self.runTestBuildGraphError(session.Session(server.target))
@@ -1993,9 +2013,11 @@ class SessionTest(test_util.TensorFlowTestCase):
       result = sess.run(f)
       self.assertEqual(result, 2.0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToSession(self):
     self.runTestAddFunctionToSession()
 
+  @test_util.run_v1_only('b/120545219')
   def testAddFunctionToGrpcSession(self):
     server = server_lib.Server.create_local_server()
     self.runTestAddFunctionToSession(server.target)
@@ -2009,6 +2031,7 @@ class SessionTest(test_util.TensorFlowTestCase):
     with session.Session():
       pass
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoConvertAndCheckData(self):
     with self.cached_session() as sess:
       a = array_ops.placeholder(dtype=dtypes.string)
diff --git a/tensorflow/python/client/timeline_test.py b/tensorflow/python/client/timeline_test.py
index dfd0147643017b3a7ae17498ac638b7b5e093022..61c0da01b836843a756c90fee20fbcb0ee94f59c 100644
--- a/tensorflow/python/client/timeline_test.py
+++ b/tensorflow/python/client/timeline_test.py
@@ -57,6 +57,7 @@ class TimelineTest(test.TestCase):
     ctf = tl.generate_chrome_trace_format()
     self._validateTrace(ctf)
 
+  @test_util.run_deprecated_v1
   def testTimelineCpu(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -147,7 +148,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
 
     self.assertTrue(run_metadata.HasField('step_stats'))
@@ -176,7 +177,7 @@ class TimelineTest(test.TestCase):
         num2 = variables.Variable(2.0, name='num2')
       with ops.device('/cpu:2'):
         result = num1 + num2 + num1 * num2
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(result, options=run_options, run_metadata=run_metadata)
     self.assertTrue(run_metadata.HasField('step_stats'))
     step_stats = run_metadata.step_stats
diff --git a/tensorflow/python/client/virtual_gpu_test.py b/tensorflow/python/client/virtual_gpu_test.py
index 5892e0fc845787a3d690b2085f22905306e9a10b..e82ee0666c30f8dcf71d3e6609fc7d7a8ec7eeed 100644
--- a/tensorflow/python/client/virtual_gpu_test.py
+++ b/tensorflow/python/client/virtual_gpu_test.py
@@ -216,7 +216,7 @@ class VirtualGpuTest(test_util.TensorFlowTestCase):
       for d in self._util.devices:
         with ops.device(d):
           var = variables.Variable(random_ops.random_uniform(mat_shape))
-          sess.run(var.initializer)
+          self.evaluate(var.initializer)
           data.append(var)
       s = data[0]
       for i in range(1, len(data)):
diff --git a/tensorflow/python/compat/BUILD b/tensorflow/python/compat/BUILD
index e0a1c8e0571879e9661cdb0714cc6a794b7ea455..9f2ce8c676e77480106c525bdc9c6440c599acec 100644
--- a/tensorflow/python/compat/BUILD
+++ b/tensorflow/python/compat/BUILD
@@ -9,7 +9,10 @@ py_library(
     srcs = ["compat.py"],
     srcs_version = "PY2AND3",
     visibility = ["//tensorflow:internal"],
-    deps = ["//tensorflow/python:util"],
+    deps = [
+        "//tensorflow/python:tf2",
+        "//tensorflow/python:util",
+    ],
 )
 
 tf_py_test(
diff --git a/tensorflow/python/compat/compat.py b/tensorflow/python/compat/compat.py
index 385fd431f4c7fcd4a46915db1caee35e9d7251f3..51cd68436add963e4a08d9ed7ad43400f27b83f0 100644
--- a/tensorflow/python/compat/compat.py
+++ b/tensorflow/python/compat/compat.py
@@ -23,10 +23,16 @@ from __future__ import division
 from __future__ import print_function
 
 import datetime
+
+from tensorflow.python import tf2
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import variable_scope
+
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 11, 13)
+_FORWARD_COMPATIBILITY_HORIZON = datetime.date(2018, 12, 9)
 
 
 @tf_export("compat.forward_compatible")
@@ -132,3 +138,40 @@ def forward_compatibility_horizon(year, month, day):
     yield
   finally:
     _FORWARD_COMPATIBILITY_HORIZON = old_compat_date
+
+
+@tf_export(v1=["enable_v2_behavior"])
+def enable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 2.x.
+
+  This function is called in the main TensorFlow `__init__.py` file, user should
+  not need to call it, except during complex migrations.
+  """
+  tf2.enable()  # Switches TensorArrayV2 and control flow V2
+  ops.enable_eager_execution()
+  tensor_shape.enable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.enable_resource_variables()
+
+
+@tf_export(v1=["disable_v2_behavior"])
+def disable_v2_behavior():
+  """Enables TensorFlow 2.x behaviors.
+
+  This function can be called at the beginning of the program (before `Tensors`,
+  `Graphs` or other structures have been created, and before devices have been
+  initialized. It switches all global behaviors that are different between
+  TensorFlow 1.x and 2.x to behave as intended for 1.x.
+
+  User can call this function to disable 2.x behavior during complex migrations.
+  """
+  tf2.disable()  # Switches TensorArrayV2 and control flow V2
+  ops.disable_eager_execution()
+  tensor_shape.disable_v2_tensorshape()  # Also switched by tf2
+  variable_scope.disable_resource_variables()
+
+
diff --git a/tensorflow/python/data/__init__.py b/tensorflow/python/data/__init__.py
index 7536ba668abf5f3aa62fb73921d14e7ffe5b8c19..75ba88f3034632bd925c7736fe7af42cd3aa274f 100644
--- a/tensorflow/python/data/__init__.py
+++ b/tensorflow/python/data/__init__.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 # pylint: disable=unused-import
 from tensorflow.python.data import experimental
 from tensorflow.python.data.ops.dataset_ops import Dataset
+from tensorflow.python.data.ops.dataset_ops import make_initializable_iterator
+from tensorflow.python.data.ops.dataset_ops import make_one_shot_iterator
 from tensorflow.python.data.ops.iterator_ops import Iterator
 from tensorflow.python.data.ops.readers import FixedLengthRecordDataset
 from tensorflow.python.data.ops.readers import TextLineDataset
diff --git a/tensorflow/python/data/benchmarks/BUILD b/tensorflow/python/data/benchmarks/BUILD
index fd723e0d7120a0b3cdcacca4e194ce3b1eb48ffd..5b0500eae1970b4f183737d4fc0cd4171dd1ea15 100644
--- a/tensorflow/python/data/benchmarks/BUILD
+++ b/tensorflow/python/data/benchmarks/BUILD
@@ -6,6 +6,61 @@ exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_test(
+    name = "batch_benchmark",
+    srcs = ["batch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "filter_benchmark",
+    srcs = ["filter_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "from_tensor_slices_benchmark",
+    srcs = ["from_tensor_slices_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "map_benchmark",
+    srcs = ["map_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "range_benchmark",
     srcs = ["range_benchmark.py"],
diff --git a/tensorflow/python/data/benchmarks/batch_benchmark.py b/tensorflow/python/data/benchmarks/batch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e063849f70381b8244a8a916353a3cc3be15c230
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/batch_benchmark.py
@@ -0,0 +1,85 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class BatchBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.batch()`."""
+
+  def benchmarkBatchSparse(self):
+    non_zeros_per_row_values = [0, 1, 5, 10, 100]
+    batch_size_values = [1, 32, 64, 128, 1024]
+
+    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
+    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
+
+    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
+        ).batch(batch_size_placeholder)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    for non_zeros_per_row in non_zeros_per_row_values:
+
+      sparse_value = sparse_tensor.SparseTensorValue(
+          indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
+          values=np.arange(non_zeros_per_row, dtype=np.int64),
+          dense_shape=[1000])
+
+      for batch_size in batch_size_values:
+
+        with session.Session() as sess:
+          sess.run(iterator.initializer, feed_dict={
+              sparse_placeholder: sparse_value,
+              batch_size_placeholder: batch_size})
+          # Run five steps to warm up the session caches before taking the
+          # first measurement.
+          for _ in range(5):
+            sess.run(next_element.indices.op)
+          deltas = []
+          for _ in range(100):
+            start = time.time()
+            for _ in range(100):
+              sess.run(next_element.indices.op)
+            end = time.time()
+            deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100.0
+
+        print("Batch sparse dataset non-zeros per row: %d batch_size: %d "
+              "wall time: %f"
+              % (non_zeros_per_row, batch_size, median_wall_time))
+        self.report_benchmark(
+            iters=10000, wall_time=median_wall_time,
+            name="batch_sparse_dataset_nnz_%d_batch_size_%d" % (
+                non_zeros_per_row, batch_size))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/filter_benchmark.py b/tensorflow/python/data/benchmarks/filter_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a6d86fe2218aec835e4f09f0c8c708596cf511f8
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/filter_benchmark.py
@@ -0,0 +1,69 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class FilterBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.filter()`."""
+
+  def _benchmark(self, predicate, name):
+    with ops.Graph().as_default():
+      dataset = (
+          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        print("Filter dataset using %s. Median wall time: %f" %
+              (name, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name=name)
+
+  def benchmarkSimpleFunction(self):
+    self._benchmark(array_ops.identity, "simple_function")
+
+  def benchmarkReturnComponentOptimization(self):
+    self._benchmark(lambda x: x, "return_component")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7f1a4e7af5b00569e71900df8f2a7486d7c813b
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/from_tensor_slices_benchmark.py
@@ -0,0 +1,188 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class FromTensorSlicesBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.Dataset.from_tensor_slices()`."""
+
+  def benchmarkSliceRepeatBatch(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        sess.run(next_element)
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          sess.run(next_element)
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_repeat_batch_input_%d_batch_%d" % (input_size, batch_size))
+
+  def benchmarkSliceRepeatBatchCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data)
+        .repeat(num_epochs + 1).batch(batch_size))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
+        " wall time per element: %f" % (input_size, batch_size,
+                                        median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_repeat_batch_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkReshapeSliceRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
+        .repeat(num_epochs + 1))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
+          "Median wall time per element: %f" % (input_size, batch_size,
+                                                median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="reshape_slice_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+  def benchmarkSliceBatchCacheRepeatCallable(self):
+    input_size = 10000
+    batch_size = 100
+    num_epochs = 100
+
+    input_data = np.random.randn(input_size)
+
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
+        .cache().repeat(num_epochs + 1))
+    iterator = dataset_ops.make_initializable_iterator(dataset)
+    next_element = iterator.get_next()
+
+    with session.Session() as sess:
+      sess.run(iterator.initializer)
+      get_next_element = sess.make_callable(next_element)
+      # Run one whole epoch to burn in the computation.
+      for _ in range(input_size // batch_size):
+        get_next_element()
+      deltas = []
+      try:
+        while True:
+          start = time.time()
+          get_next_element()
+          deltas.append(time.time() - start)
+      except errors.OutOfRangeError:
+        pass
+
+    median_wall_time = np.median(deltas)
+    print(
+        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
+        "Median wall time per element: %f"
+        % (input_size, batch_size, median_wall_time))
+    self.report_benchmark(
+        iters=len(deltas),
+        wall_time=median_wall_time,
+        name="slice_batch_cache_repeat_callable_input_%d_batch_%d" %
+        (input_size, batch_size))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/map_benchmark.py b/tensorflow/python/data/benchmarks/map_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..65d945cdae87aedad55351cfb63ad06e3521d570
--- /dev/null
+++ b/tensorflow/python/data/benchmarks/map_benchmark.py
@@ -0,0 +1,135 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Bechmarks for `tf.data.Dataset.map()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+# TODO(b/119837791): Add eager benchmarks.
+class MapBenchmark(test.Benchmark):
+  """Bechmarks for `tf.data.Dataset.map()`."""
+
+  def benchmarkChainOfMaps(self):
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda x: x + 1
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda x: x
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
+        with ops.Graph().as_default():
+          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+          for _ in range(chain_length):
+            dataset = dataset_ops.MapDataset(
+                dataset,
+                map_fn,
+                use_inter_op_parallelism=use_inter_op_parallelism)
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
+          next_element = iterator.get_next()
+
+          with session.Session() as sess:
+            for _ in range(5):
+              sess.run(next_element.op)
+            deltas = []
+            for _ in range(100):
+              start = time.time()
+              for _ in range(100):
+                sess.run(next_element.op)
+              end = time.time()
+              deltas.append(end - start)
+
+            median_wall_time = np.median(deltas) / 100
+            print("Map dataset chain length%s: %d Median wall time: %f" %
+                  (print_label, chain_length, median_wall_time))
+            self.report_benchmark(
+                iters=1000,
+                wall_time=median_wall_time,
+                name="map_dataset_chain_length_%d%s" % (chain_length,
+                                                        benchmark_label))
+
+  def benchmarkMapFanOut(self):
+    fan_outs = [1, 2, 5, 10, 20, 50, 100]
+    for fan_out in fan_outs:
+      for mode in ["general", "single-threaded", "short-circuit"]:
+        if mode == "general":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = True
+          print_label = ""
+          benchmark_label = ""
+        if mode == "single-threaded":
+          map_fn = lambda *xs: [x + 1 for x in xs]
+          use_inter_op_parallelism = False
+          print_label = " (single threaded mode)"
+          benchmark_label = "_single_threaded"
+        if mode == "short-circuit":
+          map_fn = lambda *xs: xs
+          use_inter_op_parallelism = True  # should not have any significance
+          print_label = " (short circuit mode)"
+          benchmark_label = "_short_circuit"
+
+        with ops.Graph().as_default():
+          dataset = dataset_ops.Dataset.from_tensors(
+              tuple(0 for _ in range(fan_out))).repeat(None)
+          dataset = dataset_ops.MapDataset(
+              dataset,
+              map_fn,
+              use_inter_op_parallelism=use_inter_op_parallelism)
+          iterator = dataset_ops.make_one_shot_iterator(dataset)
+          next_element = iterator.get_next()
+
+          with session.Session() as sess:
+            for _ in range(5):
+              sess.run(next_element[0].op)
+            deltas = []
+            for _ in range(100):
+              start = time.time()
+              for _ in range(100):
+                sess.run(next_element[0].op)
+              end = time.time()
+              deltas.append(end - start)
+
+            median_wall_time = np.median(deltas) / 100
+            print("Map dataset fan out%s: %d Median wall time: %f" %
+                  (print_label, fan_out, median_wall_time))
+            self.report_benchmark(
+                iters=1000,
+                wall_time=median_wall_time,
+                name="map_dataset_fan_out_%d%s" % (fan_out, benchmark_label))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/benchmarks/range_benchmark.py b/tensorflow/python/data/benchmarks/range_benchmark.py
index 25f63b79a26e37bd381df7c1f3c0ae91667a70bf..a5020e2873063ea8b01801c0889a23cb60601ec3 100644
--- a/tensorflow/python/data/benchmarks/range_benchmark.py
+++ b/tensorflow/python/data/benchmarks/range_benchmark.py
@@ -39,7 +39,7 @@ class RangeBenchmark(test.Benchmark):
     # costs).
     dataset = dataset_ops.Dataset.range(num_elements).skip(
         num_elements - 1).take(1).with_options(options)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with session.Session() as sess:
diff --git a/tensorflow/python/data/experimental/__init__.py b/tensorflow/python/data/experimental/__init__.py
index 126c2be44209f5697386fe210be853ca676bbd13..ffc2e5ef5fa239beada67687ec700437b2fc44ba 100644
--- a/tensorflow/python/data/experimental/__init__.py
+++ b/tensorflow/python/data/experimental/__init__.py
@@ -25,19 +25,29 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@Counter
 @@CheckpointInputPipelineHook
 @@CsvDataset
+@@DatasetStructure
+@@NestedStructure
+@@OptimizationOptions
 @@Optional
+@@OptionalStructure
 @@RandomDataset
 @@Reducer
+@@SparseTensorStructure
 @@SqlDataset
 @@StatsAggregator
 @@StatsOptions
+@@Structure
 @@TFRecordWriter
+@@TensorStructure
+@@ThreadingOptions
 
 @@bucket_by_sequence_length
+@@cardinality
 @@choose_from_datasets
 @@copy_to_device
 @@dense_to_sparse_batch
 @@enumerate_dataset
+@@filter_for_shard
 @@get_next_as_optional
 @@get_single_element
 @@group_by_reducer
@@ -59,6 +69,8 @@ See [Importing Data](https://tensorflow.org/guide/datasets) for an overview.
 @@unique
 
 @@AUTOTUNE
+@@INFINITE_CARDINALITY
+@@UNKNOWN_CARDINALITY
 """
 
 from __future__ import absolute_import
@@ -70,9 +82,13 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops.batching import dense_to_sparse_batch
 from tensorflow.python.data.experimental.ops.batching import map_and_batch
 from tensorflow.python.data.experimental.ops.batching import unbatch
+from tensorflow.python.data.experimental.ops.cardinality import cardinality
+from tensorflow.python.data.experimental.ops.cardinality import INFINITE as INFINITE_CARDINALITY
+from tensorflow.python.data.experimental.ops.cardinality import UNKNOWN as UNKNOWN_CARDINALITY
 from tensorflow.python.data.experimental.ops.counter import Counter
 from tensorflow.python.data.experimental.ops.enumerate_ops import enumerate_dataset
 from tensorflow.python.data.experimental.ops.error_ops import ignore_errors
+from tensorflow.python.data.experimental.ops.filter_for_shard_ops import filter_for_shard
 from tensorflow.python.data.experimental.ops.get_single_element import get_single_element
 from tensorflow.python.data.experimental.ops.grouping import bucket_by_sequence_length
 from tensorflow.python.data.experimental.ops.grouping import group_by_reducer
@@ -83,10 +99,8 @@ from tensorflow.python.data.experimental.ops.interleave_ops import parallel_inte
 from tensorflow.python.data.experimental.ops.interleave_ops import sample_from_datasets
 from tensorflow.python.data.experimental.ops.iterator_ops import CheckpointInputPipelineHook
 from tensorflow.python.data.experimental.ops.iterator_ops import make_saveable_from_iterator
-
-# Optimization constant that can be used to enable auto-tuning.
 from tensorflow.python.data.experimental.ops.optimization import AUTOTUNE
-
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.experimental.ops.parsing_ops import parse_example_dataset
 from tensorflow.python.data.experimental.ops.prefetching_ops import copy_to_device
 from tensorflow.python.data.experimental.ops.prefetching_ops import prefetch_to_device
@@ -101,10 +115,17 @@ from tensorflow.python.data.experimental.ops.shuffle_ops import shuffle_and_repe
 from tensorflow.python.data.experimental.ops.stats_aggregator import StatsAggregator
 from tensorflow.python.data.experimental.ops.stats_ops import latency_stats
 from tensorflow.python.data.experimental.ops.stats_options import StatsOptions
+from tensorflow.python.data.experimental.ops.threading_options import ThreadingOptions
 from tensorflow.python.data.experimental.ops.unique import unique
 from tensorflow.python.data.experimental.ops.writers import TFRecordWriter
+from tensorflow.python.data.ops.dataset_ops import DatasetStructure
 from tensorflow.python.data.ops.iterator_ops import get_next_as_optional
 from tensorflow.python.data.ops.optional_ops import Optional
+from tensorflow.python.data.ops.optional_ops import OptionalStructure
+from tensorflow.python.data.util.structure import NestedStructure
+from tensorflow.python.data.util.structure import SparseTensorStructure
+from tensorflow.python.data.util.structure import Structure
+from tensorflow.python.data.util.structure import TensorStructure
 # pylint: enable=unused-import
 
 from tensorflow.python.util.all_util import remove_undocumented
diff --git a/tensorflow/python/data/experimental/benchmarks/BUILD b/tensorflow/python/data/experimental/benchmarks/BUILD
index b89fbe7757bb8dacb58ca18ecb6b37244a6bd3d2..8175116c6eddf4a754202a2fbb22499c79a3f5b8 100644
--- a/tensorflow/python/data/experimental/benchmarks/BUILD
+++ b/tensorflow/python/data/experimental/benchmarks/BUILD
@@ -7,36 +7,118 @@ exports_files(["LICENSE"])
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 load("//tensorflow:tensorflow.bzl", "py_test")
 
+py_test(
+    name = "autotune_benchmark",
+    srcs = ["autotune_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "csv_dataset_benchmark",
+    srcs = ["csv_dataset_benchmark.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/experimental/ops:readers",
+        "//tensorflow/python/data/ops:readers",
+        "//third_party/py/numpy",
+    ],
+)
+
 py_test(
     name = "map_and_batch_benchmark",
-    size = "medium",
     srcs = ["map_and_batch_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:session",
         "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
 )
 
 py_test(
-    name = "map_benchmark",
-    size = "medium",
-    srcs = ["map_benchmark.py"],
+    name = "map_vectorization_benchmark",
+    srcs = ["map_vectorization_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "matching_files_benchmark",
+    size = "small",
+    srcs = ["matching_files_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:matching_files",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "optimize_benchmark",
+    srcs = ["optimize_benchmark.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:session",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "unbatch_benchmark",
+    srcs = ["unbatch_benchmark.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python/data/experimental/ops:batching",
-        "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..e713494b526320f2c18774c7198406521c373033
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/autotune_benchmark.py
@@ -0,0 +1,187 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for autotuning performance knobs."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class AutotuneBenchmark(test.Benchmark):
+  """Benchmarks for autotuning performance knobs."""
+
+  def benchmarkMap(self):
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.map(
+        math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(1000):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=1000, wall_time=np.median(deltas), name="map_autotune")
+
+  def benchmarkMapAndBatch(self):
+    self._benchmarkMapAndBatch(numa_aware=False)
+    self._benchmarkMapAndBatch(numa_aware=True)
+
+  def _benchmarkMapAndBatch(self, numa_aware):
+    batch_size = 16
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.apply(
+        batching.map_and_batch(
+            math_ops.matmul,
+            num_parallel_calls=optimization.AUTOTUNE,
+            batch_size=batch_size))
+    options = dataset_ops.Options()
+    options.experimental_numa_aware = numa_aware
+    dataset = dataset.with_options(options)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(100):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+
+    self.report_benchmark(
+        iters=100,
+        wall_time=np.median(deltas),
+        name=("numa_" if numa_aware else "") + "map_and_batch_autotune")
+
+  def benchmarkInterleave(self):
+    k = 1024 * 1024
+    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
+                                                np.random.rand(4 * k,
+                                                               1))).repeat()
+    dataset = dataset.map(math_ops.matmul)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        cycle_length=10,
+        num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next.op)
+      for _ in range(1000):
+        start = time.time()
+        sess.run(get_next.op)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=1000,
+        wall_time=np.median(deltas),
+        name="interleave_autotune")
+
+  def benchmarkMapAndInterleave(self):
+    k = 1024 * 1024
+    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
+    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
+    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
+    dataset_a = dataset_ops.Dataset.from_tensors(a).repeat()
+    dataset_b = dataset_ops.Dataset.from_tensors(b).repeat()
+    dataset_c = dataset_ops.Dataset.from_tensors(c).repeat()
+
+    def f1(x, y):
+      return math_ops.matmul(x, y)
+
+    def f2(a, b):
+      x, y = b
+      return a, math_ops.matmul(x, y)
+
+    dataset = dataset_a
+    dataset = dataset.map(f1, num_parallel_calls=optimization.AUTOTUNE)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        num_parallel_calls=optimization.AUTOTUNE,
+        cycle_length=2)
+
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_b))
+    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
+        lambda _: dataset,
+        num_parallel_calls=optimization.AUTOTUNE,
+        cycle_length=2)
+
+    dataset = dataset_ops.Dataset.zip((dataset, dataset_c))
+    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next()
+
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        sess.run(get_next)
+      for _ in range(100):
+        start = time.time()
+        sess.run(get_next)
+        end = time.time()
+        deltas.append(end - start)
+
+    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
+          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
+           np.max(deltas)))
+    self.report_benchmark(
+        iters=100,
+        wall_time=np.median(deltas),
+        name="map_and_interleave_autotune")
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..03345ce4e6648fecf47348806c55adba10aeed5a
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/csv_dataset_benchmark.py
@@ -0,0 +1,130 @@
+#  Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for `tf.data.experimental.CsvDataset`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import string
+import tempfile
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers as core_readers
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import gfile
+from tensorflow.python.platform import googletest
+from tensorflow.python.platform import test
+
+
+class CsvDatasetBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.CsvDataset`."""
+
+  FLOAT_VAL = '1.23456E12'
+  STR_VAL = string.ascii_letters * 10
+
+  def _setUp(self, str_val):
+    # Since this isn't test.TestCase, have to manually create a test dir
+    gfile.MakeDirs(googletest.GetTempDir())
+    self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
+
+    self._num_cols = [4, 64, 256]
+    self._num_per_iter = 5000
+    self._filenames = []
+    for n in self._num_cols:
+      fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
+      with open(fn, 'wb') as f:
+        # Just write 100 rows and use `repeat`... Assumes the cost
+        # of creating an iterator is not significant
+        row = ','.join([str_val for _ in range(n)])
+        f.write('\n'.join([row for _ in range(100)]))
+      self._filenames.append(fn)
+
+  def _tearDown(self):
+    gfile.DeleteRecursively(self._temp_dir)
+
+  def _runBenchmark(self, dataset, num_cols, prefix):
+    dataset = dataset.skip(self._num_per_iter - 1)
+    deltas = []
+    for _ in range(10):
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+      with session.Session() as sess:
+        start = time.time()
+        # NOTE: This depends on the underlying implementation of skip, to have
+        # the net effect of calling `GetNext` num_per_iter times on the
+        # input dataset. We do it this way (instead of a python for loop, or
+        # batching N inputs in one iter) so that the overhead from session.run
+        # or batch doesn't dominate. If we eventually optimize skip, this has
+        # to change.
+        sess.run(next_element)
+        end = time.time()
+      deltas.append(end - start)
+    # Median wall time per CSV record read and decoded
+    median_wall_time = np.median(deltas) / self._num_per_iter
+    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
+                                                    median_wall_time))
+    self.report_benchmark(
+        iters=self._num_per_iter,
+        wall_time=median_wall_time,
+        name='%s_with_cols_%d' % (prefix, num_cols))
+
+  def benchmarkMapWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv')
+    self._tearDown()
+
+  def benchmarkMapWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
+    self._tearDown()
+
+  def benchmarkCsvDatasetWithFloats(self):
+    self._setUp(self.FLOAT_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [[0.0]] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset')
+    self._tearDown()
+
+  def benchmarkCsvDatasetWithStrings(self):
+    self._setUp(self.STR_VAL)
+    for i in range(len(self._filenames)):
+      num_cols = self._num_cols[i]
+      kwargs = {'record_defaults': [['']] * num_cols}
+      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
+      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
+      self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')
+    self._tearDown()
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
index a90156cd33e9d37383bbeb21a1346a3413dba2f8..b17f2bcd12b2b78c97e7c390d919331ac4ef5386 100644
--- a/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
+++ b/tensorflow/python/data/experimental/benchmarks/map_and_batch_benchmark.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import hashlib
+import itertools
 import time
 
 import numpy as np
@@ -24,12 +26,17 @@ import numpy as np
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
+_NUMPY_RANDOM_SEED = 42
+
 
 class MapAndBatchBenchmark(test.Benchmark):
   """Benchmarks for `tf.data.experimental.map_and_batch()`."""
@@ -48,7 +55,7 @@ class MapAndBatchBenchmark(test.Benchmark):
 
     dataset = dataset.apply(batching.map_and_batch(
         lambda _: dense_value, batch_size_placeholder))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     for shape in shapes:
@@ -89,6 +96,131 @@ class MapAndBatchBenchmark(test.Benchmark):
             name="benchmark_batch_dense_dataset_nnz_%d_batch_size_%d" % (
                 np.prod(shape), batch_size))
 
+  def benchmarkMapAndBatchChainingVersusFusing(self):
+    """Compares the performance of chaining and fusing map and batch.
+
+    NOTE: It is recommended to build the benchmark with
+    `-c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-gmlt`
+    and execute it on a machine with at least 32 CPU cores.
+    """
+
+    # Sequential pipeline configurations.
+    seq_elem_size_series = itertools.product([1], [1], [1, 2, 4, 8], [16])
+    seq_batch_size_series = itertools.product([1], [1], [1], [8, 16, 32, 64])
+
+    # Parallel pipeline configuration.
+    par_elem_size_series = itertools.product([32], [32], [1, 2, 4, 8], [256])
+    par_batch_size_series = itertools.product([32], [32], [1],
+                                              [128, 256, 512, 1024])
+    par_num_calls_series = itertools.product([8, 16, 32, 64], [32], [1], [512])
+    par_inter_op_series = itertools.product([32], [8, 16, 32, 64], [1], [512])
+
+    def name(method, label, num_calls, inter_op, element_size, batch_size):
+      return ("%s_id_%s_num_calls_%d_inter_op_%d_elem_size_%d_batch_size_%d" % (
+          method,
+          hashlib.sha1(label).hexdigest()[:8],
+          num_calls,
+          inter_op,
+          element_size,
+          batch_size,
+      ))
+
+    def benchmark(label, series):
+      """Runs benchmark the given series."""
+
+      print("%s:" % label)
+
+      def make_base_dataset(element_size):
+        k = 1024 * 1024
+        x = constant_op.constant(np.random.rand(element_size, 4 * k))
+        y = constant_op.constant(np.random.rand(4 * k, 1))
+        return dataset_ops.Dataset.range(1000000000000).map(lambda _: (x, y))
+
+      for num_calls, inter_op, element_size, batch_size in series:
+
+        num_iters = 1024 // (
+            (element_size * batch_size) // min(num_calls, inter_op))
+        fused_dataset = make_base_dataset(element_size)
+        fused_dataset = fused_dataset.map(
+            math_ops.matmul,
+            num_parallel_calls=num_calls).batch(batch_size=batch_size)
+
+        fused_iterator = dataset_ops.make_one_shot_iterator(fused_dataset)
+        fused_get_next = fused_iterator.get_next()
+
+        fused_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+
+          for _ in range(5):
+            sess.run(fused_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(fused_get_next.op)
+            end = time.time()
+            fused_deltas.append(end - start)
+
+        # `map_and_batch_fusion` is optimized by default. To get the chained
+        # dataset, with have to disable it.
+        options = dataset_ops.Options()
+        options.experimental_optimization = OptimizationOptions()
+        options.experimental_optimization.map_and_batch_fusion = False
+        chained_dataset = fused_dataset.with_options(options)
+        chained_iterator = dataset_ops.make_one_shot_iterator(chained_dataset)
+        chained_get_next = chained_iterator.get_next()
+
+        chained_deltas = []
+        with session.Session(
+            config=config_pb2.ConfigProto(
+                inter_op_parallelism_threads=inter_op,
+                use_per_session_threads=True)) as sess:
+          for _ in range(5):
+            sess.run(chained_get_next.op)
+          for _ in range(num_iters):
+            start = time.time()
+            sess.run(chained_get_next.op)
+            end = time.time()
+            chained_deltas.append(end - start)
+
+        print(
+            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
+            "element size: %d, num iters: %d\nchained wall time: %f (median), "
+            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
+            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
+            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
+            (batch_size, num_calls, inter_op, element_size, num_iters,
+             np.median(chained_deltas), np.mean(chained_deltas),
+             np.std(chained_deltas), np.min(chained_deltas),
+             np.max(chained_deltas), np.median(fused_deltas),
+             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
+             np.max(fused_deltas),
+             np.median(chained_deltas) / np.median(fused_deltas),
+             np.mean(chained_deltas) / np.mean(fused_deltas)))
+
+        self.report_benchmark(
+            iters=num_iters,
+            wall_time=np.median(chained_deltas),
+            name=name("chained", label, num_calls, inter_op, element_size,
+                      batch_size))
+
+        self.report_benchmark(
+            iters=num_iters,
+            wall_time=np.median(fused_deltas),
+            name=name("fused", label, num_calls, inter_op, element_size,
+                      batch_size))
+
+      print()
+
+    np.random.seed(_NUMPY_RANDOM_SEED)
+    benchmark("Sequential element size evaluation", seq_elem_size_series)
+    benchmark("Sequential batch size evaluation", seq_batch_size_series)
+    benchmark("Parallel element size evaluation", par_elem_size_series)
+    benchmark("Parallel batch size evaluation", par_batch_size_series)
+    benchmark("Transformation parallelism evaluation", par_num_calls_series)
+    benchmark("Threadpool size evaluation", par_inter_op_series)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
deleted file mode 100644
index ad253cffa568b3abe367b661b409348ce9f56fa1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/benchmarks/map_benchmark.py
+++ /dev/null
@@ -1,245 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import hashlib
-import itertools
-import time
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.experimental.ops import optimization
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-_NUMPY_RANDOM_SEED = 42
-
-
-class MapDatasetBenchmark(test.Benchmark):
-
-  # The purpose of this benchmark is to compare the performance of chaining vs
-  # fusing of the map and batch transformations across various configurations.
-  #
-  # NOTE: It is recommended to build the benchmark with
-  # `-c opt --copt=-mavx --copt=-mavx2 --copt=-mfma --copt=-gmlt`
-  # and execute it on a machine with at least 32 CPU cores.
-  def benchmarkMapAndBatch(self):
-
-    # Sequential pipeline configurations.
-    seq_elem_size_series = itertools.product([1], [1], [1, 2, 4, 8], [16])
-    seq_batch_size_series = itertools.product([1], [1], [1], [8, 16, 32, 64])
-
-    # Parallel pipeline configuration.
-    par_elem_size_series = itertools.product([32], [32], [1, 2, 4, 8], [256])
-    par_batch_size_series = itertools.product([32], [32], [1],
-                                              [128, 256, 512, 1024])
-    par_num_calls_series = itertools.product([8, 16, 32, 64], [32], [1], [512])
-    par_inter_op_series = itertools.product([32], [8, 16, 32, 64], [1], [512])
-
-    def name(method, label, num_calls, inter_op, element_size, batch_size):
-      return ("%s_id_%s_num_calls_%d_inter_op_%d_elem_size_%d_batch_size_%d" % (
-          method,
-          hashlib.sha1(label).hexdigest(),
-          num_calls,
-          inter_op,
-          element_size,
-          batch_size,
-      ))
-
-    def benchmark(label, series):
-
-      print("%s:" % label)
-      for num_calls, inter_op, element_size, batch_size in series:
-
-        num_iters = 1024 // (
-            (element_size * batch_size) // min(num_calls, inter_op))
-        k = 1024 * 1024
-        dataset = dataset_ops.Dataset.from_tensors((np.random.rand(
-            element_size, 4 * k), np.random.rand(4 * k, 1))).repeat()
-
-        chained_dataset = dataset.map(
-            math_ops.matmul,
-            num_parallel_calls=num_calls).batch(batch_size=batch_size)
-        chained_iterator = chained_dataset.make_one_shot_iterator()
-        chained_get_next = chained_iterator.get_next()
-
-        chained_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-          for _ in range(5):
-            sess.run(chained_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(chained_get_next.op)
-            end = time.time()
-            chained_deltas.append(end - start)
-
-        fused_dataset = dataset.apply(
-            batching.map_and_batch(
-                math_ops.matmul,
-                num_parallel_calls=num_calls,
-                batch_size=batch_size))
-        fused_iterator = fused_dataset.make_one_shot_iterator()
-        fused_get_next = fused_iterator.get_next()
-
-        fused_deltas = []
-        with session.Session(
-            config=config_pb2.ConfigProto(
-                inter_op_parallelism_threads=inter_op,
-                use_per_session_threads=True)) as sess:
-
-          for _ in range(5):
-            sess.run(fused_get_next.op)
-          for _ in range(num_iters):
-            start = time.time()
-            sess.run(fused_get_next.op)
-            end = time.time()
-            fused_deltas.append(end - start)
-
-        print(
-            "batch size: %d, num parallel calls: %d, inter-op parallelism: %d, "
-            "element size: %d, num iters: %d\nchained wall time: %f (median), "
-            "%f (mean), %f (stddev), %f (min), %f (max)\n  fused wall time: "
-            "%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n    "
-            "chained/fused:    %.2fx (median),    %.2fx (mean)" %
-            (batch_size, num_calls, inter_op, element_size, num_iters,
-             np.median(chained_deltas), np.mean(chained_deltas),
-             np.std(chained_deltas), np.min(chained_deltas),
-             np.max(chained_deltas), np.median(fused_deltas),
-             np.mean(fused_deltas), np.std(fused_deltas), np.min(fused_deltas),
-             np.max(fused_deltas),
-             np.median(chained_deltas) / np.median(fused_deltas),
-             np.mean(chained_deltas) / np.mean(fused_deltas)))
-
-        self.report_benchmark(
-            iters=num_iters,
-            wall_time=np.median(chained_deltas),
-            name=name("chained", label, num_calls, inter_op, element_size,
-                      batch_size))
-
-        self.report_benchmark(
-            iters=num_iters,
-            wall_time=np.median(fused_deltas),
-            name=name("fused", label, num_calls, inter_op, element_size,
-                      batch_size))
-
-      print("")
-
-    np.random.seed(_NUMPY_RANDOM_SEED)
-    benchmark("Sequential element size evaluation", seq_elem_size_series)
-    benchmark("Sequential batch size evaluation", seq_batch_size_series)
-    benchmark("Parallel element size evaluation", par_elem_size_series)
-    benchmark("Parallel batch size evaluation", par_batch_size_series)
-    benchmark("Transformation parallelism evaluation", par_num_calls_series)
-    benchmark("Threadpool size evaluation", par_inter_op_series)
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # maps with and without map fusion.
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkChainOfMaps(chain_length, False)
-      self._benchmarkChainOfMaps(chain_length, True)
-
-  def _benchmarkChainOfMaps(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.map(lambda x: x)
-      if optimize_dataset:
-        dataset = dataset.apply(optimization.optimize(["map_fusion"]))
-
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Map dataset {} chain length: {} Median wall time: {}".format(
-            opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_map_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-class MapAndFilterBenchmark(test.Benchmark):
-
-  # This benchmark compares the performance of pipeline with multiple chained
-  # map + filter with and without map fusion.
-  def benchmarkMapAndFilter(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      self._benchmarkMapAndFilter(chain_length, False)
-      self._benchmarkMapAndFilter(chain_length, True)
-
-  def _benchmarkMapAndFilter(self, chain_length, optimize_dataset):
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-      for _ in range(chain_length):
-        dataset = dataset.map(lambda x: x + 5).filter(
-            lambda x: math_ops.greater_equal(x - 5, 0))
-      if optimize_dataset:
-        dataset = dataset.apply(
-            optimization.optimize(["map_and_filter_fusion"]))
-
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(10):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        opt_mark = "opt" if optimize_dataset else "no-opt"
-        print("Map and filter dataset {} chain length: {} Median wall time: {}".
-              format(opt_mark, chain_length, median_wall_time))
-        self.report_benchmark(
-            iters=1000,
-            wall_time=median_wall_time,
-            name="benchmark_map_and_filter_dataset_chain_latency_{}_{}".format(
-                opt_mark, chain_length))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..a60ba0a857ee18e88e912fc25000a479e4a86e72
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/map_vectorization_benchmark.py
@@ -0,0 +1,205 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for the `MapVectorization` optimization."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.core.example import example_pb2
+from tensorflow.core.example import feature_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.platform import test
+
+
+def _generate_csv_test_case():
+  """Generates a `decode_csv()` test case."""
+
+  def csv_factory():
+    return dataset_ops.Dataset.from_tensor_slices(["1.0:2:a",
+                                                   "2.4:5:c"]).repeat(5)
+
+  def decode_csv_fn(x):
+    return parsing_ops.decode_csv(
+        x,
+        record_defaults=[
+            constant_op.constant([], dtypes.float32),
+            constant_op.constant([], dtypes.int32),
+            constant_op.constant([], dtypes.string)
+        ],
+        field_delim=":")
+
+  return decode_csv_fn, csv_factory
+
+
+def _generate_parse_single_example_test_case():
+  """Generates a `parse_single_example()` test case."""
+
+  def parse_example_factory():
+    """Parse example factory."""
+
+    def _int64_feature(*values):
+      return feature_pb2.Feature(int64_list=feature_pb2.Int64List(value=values))
+
+    def _bytes_feature(*values):
+      return feature_pb2.Feature(
+          bytes_list=feature_pb2.BytesList(
+              value=[v.encode("utf-8") for v in values]))
+
+    return dataset_ops.Dataset.from_tensor_slices(
+        constant_op.constant([
+            example_pb2.Example(
+                features=feature_pb2.Features(
+                    feature={
+                        "dense_int": _int64_feature(i),
+                        "dense_str": _bytes_feature(str(i)),
+                        "sparse_int": _int64_feature(i, i * 2, i * 4, i * 8),
+                        "sparse_str": _bytes_feature(*["abc"] * i)
+                    })).SerializeToString() for i in range(10)
+        ]))
+
+  def parse_single_example_fn(x):
+    features = {
+        "dense_int": parsing_ops.FixedLenFeature((), dtypes.int64, 0),
+        "dense_str": parsing_ops.FixedLenFeature((), dtypes.string, ""),
+        "sparse_int": parsing_ops.VarLenFeature(dtypes.int64),
+        "sparse_str": parsing_ops.VarLenFeature(dtypes.string),
+    }
+    return parsing_ops.parse_single_example(x, features)
+
+  return parse_single_example_fn, parse_example_factory
+
+
+# TODO(rachelim): Add a benchmark for more expensive transformations, such as
+# vgg_preprocessing.
+class MapVectorizationBenchmark(test.Benchmark):
+  """Benchmarks for the `MapVectorization` optimization."""
+
+  def _run(self, x, num_iters=100, name=None):
+    deltas = []
+    with session.Session() as sess:
+      for _ in range(5):
+        # Warm up session...
+        sess.run(x)
+      for _ in range(num_iters):
+        start = time.time()
+        sess.run(x)
+        end = time.time()
+        deltas.append(end - start)
+    median_time = np.median(deltas)
+    self.report_benchmark(iters=num_iters, wall_time=median_time, name=name)
+    return median_time
+
+  def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
+    num_elems = int(np.sum([np.prod(x) for x in input_size]))
+    name_template = "{}__batch_size_{}_input_element_size_{}_{}"
+
+    base_dataset = input_dataset.map(map_fn).batch(batch_size)
+
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    # Disable default map_and_batch_fusion optimization
+    opt_options.map_and_batch_fusion = False
+    options.experimental_optimization = opt_options
+    base_dataset = base_dataset.with_options(options)
+
+    unoptimized_op = dataset_ops.make_one_shot_iterator(base_dataset).get_next()
+
+    optimized_options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_vectorization = True
+    optimized_options.experimental_optimization = opt_options
+    optimized = base_dataset.with_options(optimized_options)
+    optimized_op = dataset_ops.make_one_shot_iterator(optimized).get_next()
+
+    unoptimized_time = self._run(
+        unoptimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
+    optimized_time = self._run(
+        optimized_op,
+        name=name_template.format(str_id, batch_size, num_elems, "optimized"))
+
+    print("Batch size: {}\n"
+          "Input element size: {}\n"
+          "Transformation: {}\n"
+          "Speedup: {}\n".format(batch_size, input_size, str_id,
+                                 (unoptimized_time / optimized_time)))
+
+  # Known cheap functions
+  def benchmarkIdentity(self):
+    self._benchmark_helper(lambda *args: [array_ops.identity(x) for x in args],
+                           "identity")
+
+  def benchmarkAddConst(self):
+    self._benchmark_helper(lambda *args: [x + 1 for x in args], "add_const")
+
+  def benchmarkReturnConst(self):
+    self._benchmark_helper(lambda *args: [constant_op.constant(2)], "ret_const")
+
+  def benchmarkSelect(self):
+    self._benchmark_helper(lambda *args: args[0], "select")
+
+  def benchmarkCast(self):
+    self._benchmark_helper(
+        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
+
+  def benchmarkReshape(self):
+    self._benchmark_helper(
+        lambda *args: [array_ops.reshape(x, (-1, 30)) for x in args], "reshape")
+
+  def benchmarkDecodeCSV(self):
+    csv_fn, csv_factory = _generate_csv_test_case()
+    self._benchmark_helper(csv_fn, "decode_csv", lambda: [csv_factory()])
+
+  def benchmarkParseSingleExample(self):
+    # NOTE: Since we haven't implemented a vectorizer for "SerializeSparse",
+    # this function is only naively vectorized.
+    parse_fn, parse_factory = _generate_parse_single_example_test_case()
+
+    self._benchmark_helper(parse_fn, "parse_single_example",
+                           lambda: [parse_factory()])
+
+  def _default_dataset_factory(self):
+    input_sizes = [(10, 10, 3), (10, 100, 300)]
+    for sz in input_sizes:
+      yield dataset_ops.Dataset.from_tensor_slices(np.random.rand(*sz))
+
+  def _benchmark_helper(self, map_fn, str_id, base_dataset_factory=None):
+    if base_dataset_factory is None:
+      base_dataset_factory = self._default_dataset_factory
+
+    batch_size = 1000
+    for base_dataset in base_dataset_factory():
+      base_dataset = base_dataset.repeat()
+      input_size = [
+          tuple(shape.as_list())
+          for shape in nest.flatten(base_dataset.output_shapes)
+      ]
+      self._compare(base_dataset, map_fn, batch_size, input_size, str_id)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53f8dd7c537fecbfcd551e2a4809aaf5447ff46
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/matching_files_benchmark.py
@@ -0,0 +1,102 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmark for the experimental `MatchingFilesDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import tempfile
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import matching_files
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import test
+
+
+class MatchingFilesBenchmark(test.Benchmark):
+  """Benchmark for the experimental `MatchingFilesDataset`."""
+
+  def benchmarkNestedDirectories(self):
+    tmp_dir = tempfile.mkdtemp()
+    width = 500
+    depth = 10
+    for i in range(width):
+      for j in range(depth):
+        new_base = os.path.join(tmp_dir, str(i),
+                                *[str(dir_name) for dir_name in range(j)])
+        os.makedirs(new_base)
+        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
+        for f in child_files:
+          filename = os.path.join(new_base, f)
+          open(filename, 'w').close()
+
+    patterns = [
+        os.path.join(tmp_dir, os.path.join(*['**'
+                                             for _ in range(depth)]), suffix)
+        for suffix in ['*.txt', '*.log']
+    ]
+
+    deltas = []
+    iters = 3
+    for _ in range(iters):
+      with ops.Graph().as_default():
+        dataset = matching_files.MatchingFilesDataset(patterns)
+        next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
+
+        with session.Session() as sess:
+          sub_deltas = []
+          while True:
+            try:
+              start = time.time()
+              sess.run(next_element)
+              end = time.time()
+              sub_deltas.append(end - start)
+            except errors.OutOfRangeError:
+              break
+          deltas.append(sub_deltas)
+
+    median_deltas = np.median(deltas, axis=0)
+    print('Nested directory size (width*depth): %d*%d Median wall time: '
+          '%fs (read first filename), %fs (read second filename), avg %fs'
+          ' (read %d more filenames)' %
+          (width, depth, median_deltas[0], median_deltas[1],
+           np.average(median_deltas[2:]), len(median_deltas) - 2))
+    self.report_benchmark(
+        iters=iters,
+        wall_time=np.sum(median_deltas),
+        extras={
+            'read first file:':
+                median_deltas[0],
+            'read second file:':
+                median_deltas[1],
+            'avg time for reading %d more filenames:' %
+            (len(median_deltas) - 2):
+                np.average(median_deltas[2:])
+        },
+        name='dataset_nested_directory(%d*%d)' %
+        (width, depth))
+
+    shutil.rmtree(tmp_dir, ignore_errors=True)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f9b89111fcda9230062a4aa7d3477df5d2f36a5
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/optimize_benchmark.py
@@ -0,0 +1,120 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Benchmarks for static optimizations."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+class OptimizationBenchmark(test.Benchmark):
+  """Benchmarks for static optimizations."""
+
+  def benchmarkMapFusion(self):
+    """Evaluates performance map of fusion."""
+
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkMapFusion(chain_length, False)
+      self._benchmarkMapFusion(chain_length, True)
+
+  def _benchmarkMapFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.map(lambda x: x)
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_map_fusion = True
+        dataset = dataset.with_options(options)
+
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(5):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "noopt"
+        print("Map dataset {} chain length: {} Median wall time: {}".format(
+            opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name="map_fusion_{}_chain_length_{}".format(
+                opt_mark, chain_length))
+
+  def benchmarkMapAndFilterFusion(self):
+    """Evaluates performance map of fusion."""
+
+    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
+    for chain_length in chain_lengths:
+      self._benchmarkMapAndFilterFusion(chain_length, False)
+      self._benchmarkMapAndFilterFusion(chain_length, True)
+
+  def _benchmarkMapAndFilterFusion(self, chain_length, optimize_dataset):
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
+      for _ in range(chain_length):
+        dataset = dataset.map(lambda x: x + 5).filter(
+            lambda x: math_ops.greater_equal(x - 5, 0))
+      if optimize_dataset:
+        options = dataset_ops.Options()
+        options.experimental_map_and_filter_fusion = True
+        dataset = dataset.with_options(options)
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for _ in range(10):
+          sess.run(next_element.op)
+        deltas = []
+        for _ in range(100):
+          start = time.time()
+          for _ in range(100):
+            sess.run(next_element.op)
+          end = time.time()
+          deltas.append(end - start)
+
+        median_wall_time = np.median(deltas) / 100
+        opt_mark = "opt" if optimize_dataset else "noopt"
+        print("Map and filter dataset {} chain length: {} Median wall time: {}"
+              .format(opt_mark, chain_length, median_wall_time))
+        self.report_benchmark(
+            iters=100,
+            wall_time=median_wall_time,
+            name="map_and_filter_fusion_{}_chain_length_{}".format(
+                opt_mark, chain_length))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
new file mode 100644
index 0000000000000000000000000000000000000000..c36a32534dddfc29e5f0d4253508e44f9ae4a899
--- /dev/null
+++ b/tensorflow/python/data/experimental/benchmarks/unbatch_benchmark.py
@@ -0,0 +1,107 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.unbatch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import time
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+class UnbatchBenchmark(test.Benchmark):
+  """Benchmarks for `tf.data.experimental.unbatch()`."""
+
+  def benchmarkNativeUnbatch(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.apply(batching.unbatch())
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (native) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="native_batch_size_%d" %
+              batch_size)
+
+  # Include a benchmark of the previous `unbatch()` implementation that uses
+  # a composition of more primitive ops. Eventually we'd hope to generate code
+  # that is as good in both cases.
+  def benchmarkOldUnbatchImplementation(self):
+    batch_sizes = [1, 2, 5, 10, 20, 50]
+    elems_per_trial = 10000
+    with ops.Graph().as_default():
+      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
+      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+      dataset = dataset.batch(batch_size_placeholder)
+      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
+      dataset = dataset.skip(elems_per_trial)
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      next_element = iterator.get_next()
+
+      with session.Session() as sess:
+        for batch_size in batch_sizes:
+          deltas = []
+          for _ in range(5):
+            sess.run(
+                iterator.initializer,
+                feed_dict={batch_size_placeholder: batch_size})
+            start = time.time()
+            sess.run(next_element.op)
+            end = time.time()
+            deltas.append((end - start) / elems_per_trial)
+
+          median_wall_time = np.median(deltas)
+          print("Unbatch (unfused) batch size: %d Median wall time per element:"
+                " %f microseconds" % (batch_size, median_wall_time * 1e6))
+          self.report_benchmark(
+              iters=10000,
+              wall_time=median_wall_time,
+              name="unfused_batch_size_%d" %
+              batch_size)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/BUILD b/tensorflow/python/data/experimental/kernel_tests/BUILD
index 0141ac730fa5d960d92686ab6c81c72ab08cbe83..548eb422ed06de84447494391ad9e54d9b2df0b2 100644
--- a/tensorflow/python/data/experimental/kernel_tests/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/BUILD
@@ -38,6 +38,7 @@ cuda_py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python/compat:compat",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/ops:iterator_ops",
@@ -71,15 +72,11 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:platform_test",
-        "//tensorflow/python:session",
         "//tensorflow/python/data/experimental/ops:error_ops",
         "//tensorflow/python/data/experimental/ops:readers",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:readers",
         "//tensorflow/python/eager:context",
-        "//third_party/py/numpy",
     ],
 )
 
@@ -152,27 +149,6 @@ py_test(
     ],
 )
 
-cuda_py_test(
-    name = "function_buffering_resource_test",
-    size = "small",
-    srcs = ["function_buffering_resource_test.py"],
-    additional_deps = [
-        "//tensorflow/python/data/experimental/ops:prefetching_ops",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python/data/kernel_tests:test_base",
-        "//tensorflow/python/eager:function",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-    ],
-    tags = ["no_windows_gpu"],
-)
-
 py_test(
     name = "get_single_element_test",
     size = "small",
@@ -370,6 +346,37 @@ py_test(
     ],
 )
 
+py_test(
+    name = "matching_files_test",
+    size = "small",
+    srcs = ["matching_files_test.py"],
+    srcs_version = "PY2AND3",
+    tags = ["no_pip"],
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:matching_files",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//third_party/py/numpy",
+    ],
+)
+
+py_test(
+    name = "cardinality_test",
+    srcs = ["cardinality_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python/data/experimental/ops:cardinality",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
 py_test(
     name = "override_threadpool_test",
     size = "small",
@@ -538,6 +545,7 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
@@ -617,7 +625,9 @@ py_test(
     size = "medium",
     srcs = ["stats_dataset_ops_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+    ],
     deps = [
         ":reader_dataset_ops_test_base",
         ":stats_dataset_test_base",
@@ -706,3 +716,14 @@ py_test(
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
+
+cuda_py_test(
+    name = "wrap_unwrap_test",
+    size = "small",
+    srcs = ["wrap_unwrap_test.py"],
+    additional_deps = [
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/kernel_tests:test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
diff --git a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
deleted file mode 100644
index e896752a269c9fee5430f96a32fc13f41098b3ce..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/batch_dataset_op_test.py
+++ /dev/null
@@ -1,688 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import math
-import time
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.experimental.ops import batching
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import script_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def testDenseToSparseBatchDataset(self):
-    components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4,
-                                           [12])).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      for start in range(0, len(components), 4):
-        results = sess.run(get_next)
-        self.assertAllEqual([[i, j]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)], results.indices)
-        self.assertAllEqual(
-            [c for c in components[start:start + 4] for _ in range(c)],
-            results.values)
-        self.assertAllEqual([min(4,
-                                 len(components) - start), 12],
-                            results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDenseToSparseBatchDatasetWithUnknownShape(self):
-    components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(
-            lambda x: array_ops.fill([x, x], x)).apply(
-                batching.dense_to_sparse_batch(
-                    4, [5, None])).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-
-      for start in range(0, len(components), 4):
-        results = sess.run(get_next)
-        self.assertAllEqual([[i, j, z]
-                             for i, c in enumerate(components[start:start + 4])
-                             for j in range(c)
-                             for z in range(c)], results.indices)
-        self.assertAllEqual([
-            c for c in components[start:start + 4] for _ in range(c)
-            for _ in range(c)
-        ], results.values)
-        self.assertAllEqual([
-            min(4,
-                len(components) - start), 5,
-            np.max(components[start:start + 4])
-        ], results.dense_shape)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDenseToSparseBatchDatasetWithInvalidShape(self):
-    input_tensor = array_ops.constant([[1]])
-    with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
-      dataset_ops.Dataset.from_tensors(input_tensor).apply(
-          batching.dense_to_sparse_batch(4,
-                                         [-2])).make_initializable_iterator()
-
-  def testDenseToSparseBatchDatasetShapeErrors(self):
-    input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (
-        dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4,
-                                           [12])).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an input tensor of incompatible rank.
-      sess.run(init_op, feed_dict={input_tensor: [[1]]})
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "incompatible with the row shape"):
-        sess.run(get_next)
-
-      # Initialize with an input tensor that is larger than `row_shape`.
-      sess.run(init_op, feed_dict={input_tensor: range(13)})
-      with self.assertRaisesRegexp(errors.DataLossError,
-                                   "larger than the row shape"):
-        sess.run(get_next)
-
-  def testUnbatchWithUnknownRankInput(self):
-    placeholder = array_ops.placeholder(dtypes.int32)
-    dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
-        batching.unbatch())
-    iterator = dataset.make_initializable_iterator()
-    next_elem = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
-      for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_elem)
-
-  def testUnbatchScalarDataset(self):
-    data = tuple([math_ops.range(10) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = (dtypes.int32,) * 3
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchDatasetWithStrings(self):
-    data = tuple([math_ops.range(10) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    data = data.map(lambda x, y, z: (x, string_ops.as_string(y), z))
-    expected_types = (dtypes.int32, dtypes.string, dtypes.int32)
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchDatasetWithSparseTensor(self):
-    st = sparse_tensor.SparseTensorValue(
-        indices=[[i, i] for i in range(10)],
-        values=list(range(10)),
-        dense_shape=[10, 10])
-    data = dataset_ops.Dataset.from_tensors(st)
-    data = data.apply(batching.unbatch())
-    data = data.batch(5)
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        st_row = sess.run(next_element)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchDatasetWithDenseAndSparseTensor(self):
-    st = sparse_tensor.SparseTensorValue(
-        indices=[[i, i] for i in range(10)],
-        values=list(range(10)),
-        dense_shape=[10, 10])
-    data = dataset_ops.Dataset.from_tensors((list(range(10)), st))
-    data = data.apply(batching.unbatch())
-    data = data.batch(5)
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
-        self.assertEqual(i, dense_elem)
-        self.assertEqual([i], st_row.indices)
-        self.assertEqual([i], st_row.values)
-        self.assertEqual([10], st_row.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchSingleElementTupleDataset(self):
-    data = tuple([(math_ops.range(10),) for _ in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = ((dtypes.int32,),) * 3
-    data = data.batch(2)
-    self.assertEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchMultiElementTupleDataset(self):
-    data = tuple([(math_ops.range(10 * i, 10 * i + 10),
-                   array_ops.fill([10], "hi")) for i in range(3)])
-    data = dataset_ops.Dataset.from_tensor_slices(data)
-    expected_types = ((dtypes.int32, dtypes.string),) * 3
-    data = data.batch(2)
-    self.assertAllEqual(expected_types, data.output_types)
-    data = data.apply(batching.unbatch())
-    self.assertAllEqual(expected_types, data.output_types)
-
-    iterator = data.make_one_shot_iterator()
-    op = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         sess.run(op))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
-
-  def testUnbatchEmpty(self):
-    data = dataset_ops.Dataset.from_tensors(
-        (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
-         constant_op.constant([], shape=[0, 4, 0])))
-    data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testUnbatchStaticShapeMismatch(self):
-    data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
-                                             np.arange(9)))
-    with self.assertRaises(ValueError):
-      data.apply(batching.unbatch())
-
-  def testUnbatchDynamicShapeMismatch(self):
-    ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
-    ph2 = array_ops.placeholder(dtypes.int32, shape=None)
-    data = dataset_ops.Dataset.from_tensors((ph1, ph2))
-    data = data.apply(batching.unbatch())
-    iterator = data.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Mismatch in the 0th dimension.
-      sess.run(
-          iterator.initializer,
-          feed_dict={
-              ph1: np.arange(7).astype(np.int32),
-              ph2: np.arange(8).astype(np.int32)
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-      # No 0th dimension (i.e. scalar value) for one component.
-      sess.run(
-          iterator.initializer,
-          feed_dict={
-              ph1: np.arange(7).astype(np.int32),
-              ph2: 7
-          })
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-  @parameterized.named_parameters(
-      ("Default", None, None),
-      ("SequentialCalls", 1, None),
-      ("ParallelCalls", 2, None),
-      ("ParallelBatches", None, 10),
-  )
-  def testMapAndBatch(self, num_parallel_calls, num_parallel_batches):
-    """Test a dataset that maps a TF function across its input elements."""
-    # The pipeline is TensorSliceDataset ->
-    # RepeatDataset(count) -> MapAndBatchDataset(square_3, batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).repeat(count).apply(
-            batching.map_and_batch(
-                map_func=_map_fn,
-                batch_size=batch_size,
-                num_parallel_calls=num_parallel_calls,
-                num_parallel_batches=num_parallel_batches))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      # Batch of a finite input, where the batch_size divides the
-      # total number of elements.
-      sess.run(init_op, feed_dict={count: 28, batch_size: 14})
-      num_batches = (28 * 7) // 14
-      for i in range(num_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(14):
-            self.assertAllEqual(component[(i * 14 + j) % 7]**2,
-                                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of a finite input, where the batch_size does not
-      # divide the total number of elements.
-      sess.run(init_op, feed_dict={count: 14, batch_size: 8})
-
-      # We expect (num_batches - 1) full-sized batches.
-      num_batches = int(math.ceil((14 * 7) / 8))
-      for i in range(num_batches - 1):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(8):
-            self.assertAllEqual(component[(i * 8 + j) % 7]**2,
-                                result_component[j])
-      result = sess.run(get_next)
-      for component, result_component in zip(components, result):
-        for j in range((14 * 7) % 8):
-          self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
-                              result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Batch of an empty input should fail straight away.
-      sess.run(init_op, feed_dict={count: 0, batch_size: 8})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Empty batch should be an initialization time error.
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(init_op, feed_dict={count: 14, batch_size: 0})
-
-  @parameterized.named_parameters(
-      ("Even", False),
-      ("Uneven", True),
-  )
-  def testMapAndBatchPartialBatch(self, drop_remainder):
-    iterator = (
-        dataset_ops.Dataset.range(10).apply(
-            batching.map_and_batch(
-                lambda x: array_ops.reshape(x * x, [1]),
-                batch_size=4,
-                drop_remainder=drop_remainder)).make_one_shot_iterator())
-    if drop_remainder:
-      self.assertEqual([4, 1], iterator.output_shapes.as_list())
-    else:
-      self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testMapAndBatchYieldsPartialBatch(self):
-    iterator = (
-        dataset_ops.Dataset.range(10).apply(
-            batching.map_and_batch(lambda x: array_ops.reshape(x * x, [1]),
-                                   4)).make_one_shot_iterator())
-    self.assertEqual([None, 1], iterator.output_shapes.as_list())
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testMapAndBatchParallelGetNext(self):
-    iterator = (
-        dataset_ops.Dataset.range(50000).apply(
-            batching.map_and_batch(lambda x: x,
-                                   batch_size=100)).make_one_shot_iterator())
-    elements = []
-    for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(5):
-        got = sess.run(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
-
-  def testMapAndBatchParallelGetNextDropRemainder(self):
-    iterator = (
-        dataset_ops.Dataset.range(49999).apply(
-            batching.map_and_batch(
-                lambda x: x, batch_size=100,
-                drop_remainder=True)).make_one_shot_iterator())
-    elements = []
-    for _ in range(100):
-      elements.append(iterator.get_next())
-    with self.cached_session() as sess:
-      for i in range(4):
-        got = sess.run(elements)
-        got.sort(key=lambda x: x[0])
-        expected = []
-        for j in range(100):
-          expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
-        self.assertAllEqual(got, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
-
-  def testMapAndBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).apply(
-        batching.map_and_batch(_sparse, 5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testMapAndBatchFails(self):
-    """Test a dataset that maps a TF function across its input elements."""
-    dataset = dataset_ops.Dataset.from_tensors(
-        array_ops.check_numerics(
-            constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset.apply(batching.map_and_batch(
-            lambda x: x, batch_size)).make_initializable_iterator())
-    init_op = iterator.initializer
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(errors.InvalidArgumentError, "oops"):
-        sess.run(init_op, feed_dict={batch_size: 14})
-
-  def testMapAndBatchShapeMismatch(self):
-    """Test a dataset that maps a TF function across its input elements."""
-
-    def generator():
-      yield [1]
-      yield [2]
-      yield [3]
-      yield [[4, 5, 6]]
-
-    dataset = dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int32)
-    batch_size = 4
-    iterator = (
-        dataset.apply(batching.map_and_batch(
-            lambda x: x, batch_size)).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                   "number of elements does not match"):
-        sess.run(get_next)
-
-  def testMapAndBatchImplicitDispose(self):
-    # Tests whether a map and batch dataset will be cleaned up correctly when
-    # the pipeline does not run it until exhaustion.
-    # The pipeline is TensorSliceDataset -> RepeatDataset(1000) ->
-    # MapAndBatchDataset(f=square_3, batch_size=100).
-    components = (np.arange(1000),
-                  np.array([[1, 2, 3]]) * np.arange(1000)[:, np.newaxis],
-                  np.array(37.0) * np.arange(1000))
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components).repeat(
-        1000).apply(batching.map_and_batch(_map_fn, batch_size=100))
-    dataset = dataset.prefetch(5)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(3):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", 0),
-      ("2", 5),
-      ("3", 10),
-      ("4", 90),
-      ("5", 95),
-      ("6", 99),
-  )
-  def testMapAndBatchOutOfRangeError(self, threshold):
-
-    def raising_py_fn(i):
-      if i >= threshold:
-        raise StopIteration()
-      else:
-        return i
-
-    iterator = (
-        dataset_ops.Dataset.range(100).apply(
-            batching.map_and_batch(
-                lambda x: script_ops.py_func(raising_py_fn, [x], dtypes.int64),
-                batch_size=10)).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
-      if threshold % 10 != 0:
-        self.assertAllEqual(
-            [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", False, dtypes.bool),
-      ("2", -42, dtypes.int8),
-      ("3", -42, dtypes.int16),
-      ("4", -42, dtypes.int32),
-      ("5", -42, dtypes.int64),
-      ("6", 42, dtypes.uint8),
-      ("7", 42, dtypes.uint16),
-      ("8", 42.0, dtypes.float16),
-      ("9", 42.0, dtypes.float32),
-      ("10", 42.0, dtypes.float64),
-      ("11", b"hello", dtypes.string),
-  )
-  def testMapAndBatchTypes(self, element, dtype):
-
-    def gen():
-      yield element
-
-    dataset = dataset_ops.Dataset.from_generator(gen, dtype).repeat(100).apply(
-        batching.map_and_batch(lambda x: x, batch_size=10))
-
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
-
-
-class UnbatchDatasetBenchmark(test.Benchmark):
-
-  def benchmarkNativeUnbatch(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.apply(batching.unbatch())
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_native_batch_size_%d" %
-              batch_size)
-
-  # Include a benchmark of the previous `unbatch()` implementation that uses
-  # a composition of more primitive ops. Eventually we'd hope to generate code
-  # that is as good in both cases.
-  def benchmarkOldUnbatchImplementation(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
-              batch_size)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
index 3903ec49b98447bc69e37107c359be748818f1f1..8264dee3c15da3e1c10751b9c3db3d1e2bc3f1ee 100644
--- a/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/bucket_by_sequence_length_test.py
@@ -105,14 +105,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
               boundaries,
               batch_sizes,
               no_padding=no_padding))
-      batch, = dataset.make_one_shot_iterator().get_next()
+      batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       with self.cached_session() as sess:
         batches = []
         for _ in range(4):
-          batches.append(sess.run(batch))
+          batches.append(self.evaluate(batch))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(batch)
+          self.evaluate(batch)
       batch_sizes_val = []
       lengths_val = []
       for batch in batches:
@@ -155,14 +155,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       batches = []
       for _ in range(3):
-        batches.append(sess.run(batch))
+        batches.append(self.evaluate(batch))
       with self.assertRaisesOpError("bucket_boundaries"):
-        sess.run(batch)
+        self.evaluate(batch)
     batch_sizes_val = []
     lengths_val = []
     for batch in batches:
@@ -192,14 +192,14 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
             grouping.bucket_by_sequence_length(
                 element_len, boundaries, batch_sizes,
                 pad_to_bucket_boundary=True))
-    batch, = dataset.make_one_shot_iterator().get_next()
+    batch, = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       batches = []
       for _ in range(5):
-        batches.append(sess.run(batch))
+        batches.append(self.evaluate(batch))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(batch)
+        self.evaluate(batch)
 
     self.assertAllEqual(batches[0], [[1, 0],
                                      [1, 1]])
@@ -295,12 +295,12 @@ class BucketBySequenceLengthTest(test_base.DatasetTestBase):
 
     def _compute_batches(dataset):
       """Computes actual batch outputs of dataset and stores in a set."""
-      batch = dataset.make_one_shot_iterator().get_next()
+      batch = dataset_ops.make_one_shot_iterator(dataset).get_next()
       all_sparse_tensors = set()
       with self.cached_session() as sess:
         with self.assertRaises(errors.OutOfRangeError):
           while True:
-            output = sess.run(batch)
+            output = self.evaluate(batch)
             sprs_tensor = (tuple([tuple(idx) for idx in output.indices]),
                            tuple(output.values))
             all_sparse_tensors.add(sprs_tensor)
diff --git a/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..943f0f1f81272b334f0011a301636e9927c15b7c
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/cardinality_test.py
@@ -0,0 +1,158 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.experimental.cardinality()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.python.data.experimental.ops import cardinality
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.platform import test
+
+
+class NumElementsTest(test_base.DatasetTestBase, parameterized.TestCase):
+  """Tests for `tf.data.experimental.cardinality()`."""
+
+  @parameterized.named_parameters(
+      # pylint: disable=g-long-lambda
+      ("Batch1",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=True), 2),
+      ("Batch2",
+       lambda: dataset_ops.Dataset.range(5).batch(2, drop_remainder=False), 3),
+      ("Batch3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).batch(2),
+       cardinality.UNKNOWN),
+      ("Batch4", lambda: dataset_ops.Dataset.range(5).repeat().batch(2),
+       cardinality.INFINITE),
+      ("Cache1", lambda: dataset_ops.Dataset.range(5).cache(), 5),
+      ("Cache2", lambda: dataset_ops.Dataset.range(5).cache("foo"), 5),
+      ("Concatenate1", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5)), 10),
+      ("Concatenate2",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5)), cardinality.UNKNOWN),
+      ("Concatenate3", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5)),
+       cardinality.INFINITE),
+      ("Concatenate4", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate5",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.UNKNOWN),
+      ("Concatenate6", lambda: dataset_ops.Dataset.range(5).repeat().
+       concatenate(dataset_ops.Dataset.range(5).filter(lambda _: True)),
+       cardinality.INFINITE),
+      ("Concatenate7", lambda: dataset_ops.Dataset.range(5).concatenate(
+          dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate8",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("Concatenate9",
+       lambda: dataset_ops.Dataset.range(5).repeat().concatenate(
+           dataset_ops.Dataset.range(5).repeat()), cardinality.INFINITE),
+      ("FlatMap", lambda: dataset_ops.Dataset.range(5).flat_map(
+          lambda _: dataset_ops.Dataset.from_tensors(0)),
+       cardinality.UNKNOWN),
+      ("Filter", lambda: dataset_ops.Dataset.range(5).filter(lambda _: True),
+       cardinality.UNKNOWN),
+      ("FromTensors1", lambda: dataset_ops.Dataset.from_tensors(0), 1),
+      ("FromTensors2", lambda: dataset_ops.Dataset.from_tensors((0, 1)), 1),
+      ("FromTensorSlices1",
+       lambda: dataset_ops.Dataset.from_tensor_slices([0, 0, 0]), 3),
+      ("FromTensorSlices2",
+       lambda: dataset_ops.Dataset.from_tensor_slices(([0, 0, 0], [1, 1, 1])),
+       3),
+      ("Interleave1", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0), cycle_length=1),
+       cardinality.UNKNOWN),
+      ("Interleave2", lambda: dataset_ops.Dataset.range(5).interleave(
+          lambda _: dataset_ops.Dataset.from_tensors(0),
+          cycle_length=1,
+          num_parallel_calls=1), cardinality.UNKNOWN),
+      ("Map1", lambda: dataset_ops.Dataset.range(5).map(lambda x: x), 5),
+      ("Map2", lambda: dataset_ops.Dataset.range(5).map(
+          lambda x: x, num_parallel_calls=1), 5),
+      ("PaddedBatch1", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=True), 2),
+      ("PaddedBatch2", lambda: dataset_ops.Dataset.range(5).padded_batch(
+          2, [], drop_remainder=False), 3),
+      ("PaddedBatch3", lambda: dataset_ops.Dataset.range(5).filter(
+          lambda _: True).padded_batch(2, []), cardinality.UNKNOWN),
+      ("PaddedBatch4",
+       lambda: dataset_ops.Dataset.range(5).repeat().padded_batch(2, []),
+       cardinality.INFINITE),
+      ("Prefetch", lambda: dataset_ops.Dataset.range(5).prefetch(buffer_size=1),
+       5),
+      ("Range1", lambda: dataset_ops.Dataset.range(0), 0),
+      ("Range2", lambda: dataset_ops.Dataset.range(5), 5),
+      ("Range3", lambda: dataset_ops.Dataset.range(5, 10), 5),
+      ("Range4", lambda: dataset_ops.Dataset.range(10, 5), 0),
+      ("Range5", lambda: dataset_ops.Dataset.range(5, 10, 2), 3),
+      ("Range6", lambda: dataset_ops.Dataset.range(10, 5, -2), 3),
+      ("Repeat1", lambda: dataset_ops.Dataset.range(0).repeat(0), 0),
+      ("Repeat2", lambda: dataset_ops.Dataset.range(1).repeat(0), 0),
+      ("Repeat3", lambda: dataset_ops.Dataset.range(0).repeat(5), 0),
+      ("Repeat4", lambda: dataset_ops.Dataset.range(1).repeat(5), 5),
+      ("Repeat5", lambda: dataset_ops.Dataset.range(0).repeat(), 0),
+      ("Repeat6", lambda: dataset_ops.Dataset.range(1).repeat(),
+       cardinality.INFINITE),
+      ("Shuffle", lambda: dataset_ops.Dataset.range(5).shuffle(buffer_size=1),
+       5),
+      ("Skip1", lambda: dataset_ops.Dataset.range(5).skip(2), 3),
+      ("Skip2", lambda: dataset_ops.Dataset.range(5).skip(8), 0),
+      ("Skip3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).skip(2),
+       cardinality.UNKNOWN),
+      ("Skip4", lambda: dataset_ops.Dataset.range(5).repeat().skip(2),
+       cardinality.INFINITE),
+      ("Take1", lambda: dataset_ops.Dataset.range(5).take(2), 2),
+      ("Take2", lambda: dataset_ops.Dataset.range(5).take(8), 5),
+      ("Take3",
+       lambda: dataset_ops.Dataset.range(5).filter(lambda _: True).take(2),
+       cardinality.UNKNOWN),
+      ("Take4", lambda: dataset_ops.Dataset.range(5).repeat().take(2), 2),
+      ("Window1", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=True), 2),
+      ("Window2", lambda: dataset_ops.Dataset.range(5).window(
+          size=2, shift=2, drop_remainder=False), 3),
+      ("Zip1", lambda: dataset_ops.Dataset.zip(dataset_ops.Dataset.range(5)),
+       5),
+      ("Zip2", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5), dataset_ops.Dataset.range(3))), 3),
+      ("Zip3", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).repeat())), 5),
+      ("Zip4", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5).repeat(),
+           dataset_ops.Dataset.range(3).repeat())), cardinality.INFINITE),
+      ("Zip5", lambda: dataset_ops.Dataset.zip(
+          (dataset_ops.Dataset.range(5),
+           dataset_ops.Dataset.range(3).filter(lambda _: True))),
+       cardinality.UNKNOWN),
+      # pylint: enable=g-long-lambda
+  )
+  def testNumElements(self, dataset_fn, expected_result):
+    with self.cached_session() as sess:
+      self.assertEqual(
+          sess.run(cardinality.cardinality(dataset_fn())), expected_result)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
index adfacf1c9f856e08d6bc60f1197391e0d57765bb..b8166fe8334a5117005b7194cd582287eac74dd7 100644
--- a/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/copy_to_device_test.py
@@ -28,18 +28,21 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
+from tensorflow.python.util import compat as util_compat
 
 
 class CopyToDeviceTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testCopyToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -53,19 +56,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceInt32(self):
     host_dataset = dataset_ops.Dataset.from_tensors([0, 1, 2, 3])
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -79,18 +83,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual((4,), next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.test_session(config=worker_config):
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:0"))
 
     with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -104,19 +109,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -130,19 +136,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -156,19 +163,20 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyDictToDeviceWithPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -182,12 +190,13 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopySparseTensorsToDevice(self):
 
     def make_tensor(i):
@@ -200,7 +209,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -213,15 +222,16 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopySparseTensorsToDeviceWithPrefetch(self):
 
     def make_tensor(i):
@@ -234,7 +244,7 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -247,14 +257,14 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpu(self):
     if not test_util.is_gpu_available():
@@ -265,15 +275,16 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithPrefetch(self):
     if not test_util.is_gpu_available():
@@ -284,15 +295,53 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
+
+  def testCopyToDeviceGpuWithMap(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    def generator():
+      for i in range(10):
+        yield i, float(i), str(i)
+
+    host_dataset = dataset_ops.Dataset.from_generator(
+        generator, output_types=(dtypes.int32, dtypes.float32, dtypes.string))
+    device_dataset = host_dataset.apply(
+        prefetching_ops.copy_to_device("/gpu:0"))
+
+    def gpu_map_func(x, y, z):
+      return math_ops.square(x), math_ops.square(y), z
+
+    device_dataset = device_dataset.apply(
+        prefetching_ops.map_on_gpu(gpu_map_func))
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    device_dataset = device_dataset.with_options(options)
+
+    with ops.device("/gpu:0"):
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
+      next_element = iterator.get_next()
+
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      for i in range(10):
+        x, y, z = self.evaluate(next_element)
+        self.assertEqual(i**2, x)
+        self.assertEqual(float(i**2), y)
+        self.assertEqual(util_compat.as_bytes(str(i)), z)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuInt32(self):
     if not test_util.is_gpu_available():
@@ -303,14 +352,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuInt32AndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -321,14 +371,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([0, 1, 2, 3], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuStrings(self):
     if not test_util.is_gpu_available():
@@ -339,14 +390,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuStringsAndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -357,14 +409,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual([b"a", b"b", b"c"], sess.run(next_element))
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
+      self.assertAllEqual([b"a", b"b", b"c"], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDevicePingPongCPUGPU(self):
     if not test_util.is_gpu_available():
@@ -378,23 +431,25 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
           prefetching_ops.copy_to_device("/cpu:0", source_device="/gpu:0"))
 
       with ops.device("/cpu:0"):
-        iterator = back_to_cpu_dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(back_to_cpu_dataset)
         next_element = iterator.get_next()
 
-      with self.cached_session() as sess:
-        sess.run(iterator.initializer)
+      with self.cached_session(
+          config=config_pb2.ConfigProto(allow_soft_placement=False)):
+        self.evaluate(iterator.initializer)
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1"))
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -408,23 +463,24 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testCopyToDeviceWithReInitAndPrefetch(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/cpu:1")).prefetch(1)
 
     with ops.device("/cpu:1"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
@@ -438,15 +494,15 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
@@ -457,18 +513,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0"))
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testCopyToDeviceGpuWithReInitAndPrefetch(self):
     if not test_util.is_gpu_available():
@@ -479,18 +536,19 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
         prefetching_ops.copy_to_device("/gpu:0")).prefetch(1)
 
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testIteratorGetNextAsOptionalOnGPU(self):
     if not test_util.is_gpu_available():
@@ -500,33 +558,35 @@ class CopyToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.copy_to_device("/gpu:0"))
     with ops.device("/gpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
       next_elem = iterator_ops.get_next_as_optional(iterator)
       elem_has_value_t = next_elem.has_value()
       elem_value_t = next_elem.get_value()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
       # Before initializing the iterator, evaluating the optional fails with
       # a FailedPreconditionError.
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_has_value_t)
+        self.evaluate(elem_has_value_t)
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(elem_value_t)
+        self.evaluate(elem_value_t)
 
       # For each element of the dataset, assert that the optional evaluates to
       # the expected value.
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(3):
-        elem_has_value, elem_value = sess.run([elem_has_value_t, elem_value_t])
+        elem_has_value, elem_value = self.evaluate(
+            [elem_has_value_t, elem_value_t])
         self.assertTrue(elem_has_value)
         self.assertEqual(i, elem_value)
 
       # After exhausting the iterator, `next_elem.has_value()` will evaluate to
       # false, and attempting to get the value will fail.
       for _ in range(2):
-        self.assertFalse(sess.run(elem_has_value_t))
+        self.assertFalse(self.evaluate(elem_has_value_t))
         with self.assertRaises(errors.InvalidArgumentError):
-          sess.run(elem_value_t)
+          self.evaluate(elem_value_t)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/counter_test.py b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
index 4e114ac47914f89666f35a9fbc3c4a0099f0e6b1..49e1f2272b7bea8f2d245d678711a3879774ba06 100644
--- a/tensorflow/python/data/experimental/kernel_tests/counter_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/counter_test.py
@@ -19,32 +19,35 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import counter
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class CounterTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testCounter(self):
     """Test dataset construction using `count`."""
-    iterator = (counter.Counter(start=3, step=4)
-                .make_one_shot_iterator())
+    iterator = dataset_ops.make_one_shot_iterator(
+        counter.Counter(start=3, step=4))
     get_next = iterator.get_next()
     self.assertEqual([], get_next.shape.as_list())
     self.assertEqual(dtypes.int64, get_next.dtype)
 
-    negative_iterator = (counter.Counter(start=0, step=-1)
-                         .make_one_shot_iterator())
+    negative_iterator = dataset_ops.make_one_shot_iterator(
+        counter.Counter(start=0, step=-1))
     negative_get_next = negative_iterator.get_next()
 
     with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(get_next))
-      self.assertEqual(3 + 4, sess.run(get_next))
-      self.assertEqual(3 + 2 * 4, sess.run(get_next))
+      self.assertEqual(3, self.evaluate(get_next))
+      self.assertEqual(3 + 4, self.evaluate(get_next))
+      self.assertEqual(3 + 2 * 4, self.evaluate(get_next))
 
-      self.assertEqual(0, sess.run(negative_get_next))
-      self.assertEqual(-1, sess.run(negative_get_next))
-      self.assertEqual(-2, sess.run(negative_get_next))
+      self.assertEqual(0, self.evaluate(negative_get_next))
+      self.assertEqual(-1, self.evaluate(negative_get_next))
+      self.assertEqual(-2, self.evaluate(negative_get_next))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
index fb75be1fbcf1478994d25ea8b1084c6883adbf8d..b2f1b43ecf6f82725143c95af4d6f4df58e41903 100644
--- a/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/csv_dataset_test.py
@@ -20,14 +20,8 @@ from __future__ import print_function
 
 import gzip
 import os
-import string
-import tempfile
-import time
 import zlib
 
-import numpy as np
-
-from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
@@ -38,8 +32,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import gfile
-from tensorflow.python.platform import googletest
 from tensorflow.python.platform import test
 
 
@@ -537,96 +529,5 @@ class CsvDatasetTest(test_base.DatasetTestBase):
           record_defaults=record_defaults)
 
 
-class CsvDatasetBenchmark(test.Benchmark):
-  """Benchmarks for the various ways of creating a dataset from CSV files.
-  """
-  FLOAT_VAL = '1.23456E12'
-  STR_VAL = string.ascii_letters * 10
-
-  def _setUp(self, str_val):
-    # Since this isn't test.TestCase, have to manually create a test dir
-    gfile.MakeDirs(googletest.GetTempDir())
-    self._temp_dir = tempfile.mkdtemp(dir=googletest.GetTempDir())
-
-    self._num_cols = [4, 64, 256]
-    self._num_per_iter = 5000
-    self._filenames = []
-    for n in self._num_cols:
-      fn = os.path.join(self._temp_dir, 'file%d.csv' % n)
-      with open(fn, 'wb') as f:
-        # Just write 100 rows and use `repeat`... Assumes the cost
-        # of creating an iterator is not significant
-        row = ','.join([str_val for _ in range(n)])
-        f.write('\n'.join([row for _ in range(100)]))
-      self._filenames.append(fn)
-
-  def _tearDown(self):
-    gfile.DeleteRecursively(self._temp_dir)
-
-  def _runBenchmark(self, dataset, num_cols, prefix):
-    dataset = dataset.skip(self._num_per_iter - 1)
-    deltas = []
-    for _ in range(10):
-      next_element = dataset.make_one_shot_iterator().get_next()
-      with session.Session() as sess:
-        start = time.time()
-        # NOTE: This depends on the underlying implementation of skip, to have
-        # the net effect of calling `GetNext` num_per_iter times on the
-        # input dataset. We do it this way (instead of a python for loop, or
-        # batching N inputs in one iter) so that the overhead from session.run
-        # or batch doesn't dominate. If we eventually optimize skip, this has
-        # to change.
-        sess.run(next_element)
-        end = time.time()
-      deltas.append(end - start)
-    # Median wall time per CSV record read and decoded
-    median_wall_time = np.median(deltas) / self._num_per_iter
-    print('%s num_cols: %d Median wall time: %f' % (prefix, num_cols,
-                                                    median_wall_time))
-    self.report_benchmark(
-        iters=self._num_per_iter,
-        wall_time=median_wall_time,
-        name='%s_with_cols_%d' % (prefix, num_cols))
-
-  def benchmarkMapWithFloats(self):
-    self._setUp(self.FLOAT_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [[0.0]] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_float_map_decode_csv')
-    self._tearDown()
-
-  def benchmarkMapWithStrings(self):
-    self._setUp(self.STR_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [['']] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = dataset.map(lambda l: parsing_ops.decode_csv(l, **kwargs))  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_strings_map_decode_csv')
-    self._tearDown()
-
-  def benchmarkCsvDatasetWithFloats(self):
-    self._setUp(self.FLOAT_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [[0.0]] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_float_fused_dataset')
-    self._tearDown()
-
-  def benchmarkCsvDatasetWithStrings(self):
-    self._setUp(self.STR_VAL)
-    for i in range(len(self._filenames)):
-      num_cols = self._num_cols[i]
-      kwargs = {'record_defaults': [['']] * num_cols}
-      dataset = core_readers.TextLineDataset(self._filenames[i]).repeat()
-      dataset = readers.CsvDataset(self._filenames[i], **kwargs).repeat()  # pylint: disable=cell-var-from-loop
-      self._runBenchmark(dataset, num_cols, 'csv_strings_fused_dataset')
-    self._tearDown()
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
index 73be6cbcca8a204ac87cfb6ac8ae87f1d84ffa15..22e057a2848fd154de0ad356f2238fb2028cd647 100644
--- a/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/dense_to_sparse_batch_test.py
@@ -24,27 +24,28 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class DenseToSparseBatchTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDataset(self):
     components = np.random.randint(12, size=(100,)).astype(np.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x], x)).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)], results.indices)
@@ -56,23 +57,23 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
                             results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetWithUnknownShape(self):
     components = np.random.randint(5, size=(40,)).astype(np.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([x, x], x)).apply(
-            batching.dense_to_sparse_batch(
-                4, [5, None])).make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [5, None])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       for start in range(0, len(components), 4):
-        results = sess.run(get_next)
+        results = self.evaluate(get_next)
         self.assertAllEqual([[i, j, z]
                              for i, c in enumerate(components[start:start + 4])
                              for j in range(c)
@@ -89,20 +90,22 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
         ], results.dense_shape)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetWithInvalidShape(self):
     input_tensor = array_ops.constant([[1]])
     with self.assertRaisesRegexp(ValueError, "Dimension -2 must be >= 0"):
-      dataset_ops.Dataset.from_tensors(input_tensor).apply(
-          batching.dense_to_sparse_batch(4, [-2])).make_initializable_iterator()
+      dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_tensors(input_tensor).apply(
+              batching.dense_to_sparse_batch(4, [-2])))
 
+  @test_util.run_deprecated_v1
   def testDenseToSparseBatchDatasetShapeErrors(self):
     input_tensor = array_ops.placeholder(dtypes.int32)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensors(input_tensor).apply(
-            batching.dense_to_sparse_batch(4, [12]))
-        .make_initializable_iterator())
+            batching.dense_to_sparse_batch(4, [12])))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -111,13 +114,13 @@ class DenseToSparseBatchTest(test_base.DatasetTestBase):
       sess.run(init_op, feed_dict={input_tensor: [[1]]})
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "incompatible with the row shape"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Initialize with an input tensor that is larger than `row_shape`.
       sess.run(init_op, feed_dict={input_tensor: range(13)})
       with self.assertRaisesRegexp(errors.DataLossError,
                                    "larger than the row shape"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
index 796a692c56ffb3cbd1347270ed31b3abcbef1739..214434206669299cf545d68bdc330b1a548b4710 100644
--- a/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/directed_interleave_dataset_test.py
@@ -24,11 +24,13 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     selector_dataset = dataset_ops.Dataset.range(10).repeat(100)
     input_datasets = [
@@ -36,16 +38,16 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
     ]
     dataset = interleave_ops._DirectedInterleaveDataset(selector_dataset,
                                                         input_datasets)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(100):
         for i in range(10):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def _normalize(self, vec):
     return vec / vec.sum()
@@ -65,18 +67,19 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
         for i in range(num_datasets)
     ], weights)
     dataset = dataset.take(num_samples)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       freqs = np.zeros([num_datasets])
       for _ in range(num_samples):
-        freqs[sess.run(next_element)] += 1
+        freqs[self.evaluate(next_element)] += 1
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
     return freqs
 
+  @test_util.run_deprecated_v1
   def testSampleFromDatasets(self):
     random_seed.set_random_seed(1619)
     num_samples = 5000
@@ -96,20 +99,21 @@ class DirectedInterleaveDatasetTest(test_base.DatasetTestBase):
       freqs = self._testSampleFromDatasetsHelper(probs_ds, classes, num_samples)
       self.assertLess(self._chi2(probs, freqs / num_samples), 1e-2)
 
+  @test_util.run_deprecated_v1
   def testSelectFromDatasets(self):
     words = [b"foo", b"bar", b"baz"]
     datasets = [dataset_ops.Dataset.from_tensors(w).repeat() for w in words]
     choice_array = np.random.randint(3, size=(15,), dtype=np.int64)
     choice_dataset = dataset_ops.Dataset.from_tensor_slices(choice_array)
     dataset = interleave_ops.choose_from_datasets(datasets, choice_dataset)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in choice_array:
-        self.assertEqual(words[i], sess.run(next_element))
+        self.assertEqual(words[i], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testErrors(self):
     with self.assertRaisesRegexp(ValueError,
diff --git a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
index e54235d9f80c2dc0eaf2c30a8e5eda58310b3284..25742098f18787bc1d2e5bfd9c8717a777b8312c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/enumerate_dataset_test.py
@@ -24,17 +24,20 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class EnumerateDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testEnumerateDataset(self):
     components = (["a", "b"], [1, 2], [37.0, 38])
     start = constant_op.constant(20, dtype=dtypes.int64)
 
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).apply(
-        enumerate_ops.enumerate_dataset(start)).make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensor_slices(components).apply(
+            enumerate_ops.enumerate_dataset(start)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -44,12 +47,12 @@ class EnumerateDatasetTest(test_base.DatasetTestBase):
                      [t.shape for t in get_next[1]])
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertEqual((20, (b"a", 1, 37.0)), sess.run(get_next))
-      self.assertEqual((21, (b"b", 2, 38.0)), sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertEqual((20, (b"a", 1, 37.0)), self.evaluate(get_next))
+      self.assertEqual((21, (b"b", 2, 38.0)), self.evaluate(get_next))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
index c6ee88c676df201f022259abe7ed128db3cc2d73..357b5f1b49b9f75e187fc02a5a89907baa445a76 100644
--- a/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/filter_dataset_op_test.py
@@ -47,17 +47,17 @@ class FilterBenchmark(test.Benchmark):
       if optimize_dataset:
         dataset = dataset.apply(optimization.optimize(["filter_fusion"]))
 
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
 
       with session.Session() as sess:
         for _ in range(10):
-          sess.run(next_element.op)
+          self.evaluate(next_element.op)
         deltas = []
         for _ in range(100):
           start = time.time()
           for _ in range(100):
-            sess.run(next_element.op)
+            self.evaluate(next_element.op)
           end = time.time()
           deltas.append(end - start)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py b/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
deleted file mode 100644
index d38452e265a6c48828cee8f9350c3754d1e32210..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/experimental/kernel_tests/function_buffering_resource_test.py
+++ /dev/null
@@ -1,248 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the private `FunctionBufferingResource` used in prefetching."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import threading
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.data.experimental.ops import prefetching_ops
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.eager import function
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_spec
-from tensorflow.python.framework import test_util
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.platform import test
-
-
-class FunctionBufferingResourceTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self._event = threading.Event()
-
-  def _create_ds_and_iterator(self, device0, initializable=False):
-
-    def gen():
-      for i in range(1, 10):
-        yield [float(i)]
-        if i == 6:
-          self._event.set()
-
-    with ops.device(device0):
-      ds = dataset_ops.Dataset.from_generator(gen, (dtypes.float32))
-      if initializable:
-        ds_iterator = ds.make_initializable_iterator()
-      else:
-        ds_iterator = ds.make_one_shot_iterator()
-      return (ds, ds_iterator)
-
-  def _create_ops(self, ds, ds_iterator, buffer_name, device0, device1):
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn.get_concrete_function(),
-          output_types=[dtypes.float32],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name=buffer_name)
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.float32])
-      reset_op = prefetching_ops.function_buffering_resource_reset(
-          function_buffer_resource=buffer_resource_handle)
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    return (prefetch_op, reset_op, destroy_op)
-
-  def _prefetch_fn_helper_one_shot(self, buffer_name, device0, device1):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=False)
-    prefetch_op, _, destroy_op = self._create_ops(ds, ds_iterator, buffer_name,
-                                                  device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testSameDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("same_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:0")
-
-  def testDifferentDeviceCPU(self):
-    self._prefetch_fn_helper_one_shot("diff_device_cpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/cpu:1")
-
-  def testDifferentDeviceCPUGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    self._prefetch_fn_helper_one_shot("cpu_gpu",
-                                      "/job:localhost/replica:0/task:0/cpu:0",
-                                      "/job:localhost/replica:0/task:0/gpu:0")
-
-  def testReinitialization(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      # Lets reset the function buffering resource and reinitialize the
-      # iterator. Should be able to go through this again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [1.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [2.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [3.0])
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [4.0])
-      self._event.wait()
-      elem = sess.run(prefetch_op)
-      self.assertEqual(elem, [5.0])
-      sess.run(destroy_op)
-
-  def testReinitializationOutOfRange(self):
-    worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/cpu:1"
-    ds, ds_iterator = self._create_ds_and_iterator(device0, initializable=True)
-    prefetch_op, reset_op, destroy_op = self._create_ops(
-        ds, ds_iterator, "reinit", device0, device1)
-
-    with self.test_session(config=worker_config) as sess:
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      # Now reset everything and try it out again.
-      self._event.clear()
-      sess.run(reset_op)
-      sess.run(ds_iterator.initializer)
-      for i in range(1, 10):
-        elem = sess.run(prefetch_op)
-        self.assertEqual(elem, [float(i)])
-      # Try fetching after its over twice to test out end of sequence.
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-  def testStringsGPU(self):
-    if not test_util.is_gpu_available():
-      self.skipTest("No GPU available")
-
-    device0 = "/job:localhost/replica:0/task:0/cpu:0"
-    device1 = "/job:localhost/replica:0/task:0/gpu:0"
-
-    ds = dataset_ops.Dataset.from_tensor_slices(["a", "b", "c"])
-    ds_iterator = ds.make_one_shot_iterator()
-    ds_iterator_handle = ds_iterator.string_handle()
-
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
-    def _remote_fn(h):
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          h, ds.output_types, ds.output_shapes)
-      return remote_iterator.get_next()
-
-    target = constant_op.constant(device0)
-    with ops.device(device1):
-      buffer_resource_handle = prefetching_ops.function_buffering_resource(
-          f=_remote_fn.get_concrete_function(),
-          output_types=[dtypes.string],
-          target_device=target,
-          string_arg=ds_iterator_handle,
-          buffer_size=3,
-          shared_name="strings")
-
-    with ops.device(device1):
-      prefetch_op = prefetching_ops.function_buffering_resource_get_next(
-          function_buffer_resource=buffer_resource_handle,
-          output_types=[dtypes.string])
-      destroy_op = resource_variable_ops.destroy_resource_op(
-          buffer_resource_handle, ignore_lookup_error=True)
-
-    with self.cached_session() as sess:
-      self.assertEqual([b"a"], sess.run(prefetch_op))
-      self.assertEqual([b"b"], sess.run(prefetch_op))
-      self.assertEqual([b"c"], sess.run(prefetch_op))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(prefetch_op)
-
-      sess.run(destroy_op)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
index 8c07afbac57944593ba48f2116f876dbe7ab9e76..ef576563a15a7385d450e4f254e1cb579f79ce8c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/get_single_element_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -39,6 +40,7 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("MoreThanOne", 0, 2, errors.InvalidArgumentError,
        "Dataset had more than one element."),
   )
+  @test_util.run_deprecated_v1
   def testGetSingleElement(self, skip, take, error=None, error_msg=None):
     skip_t = array_ops.placeholder(dtypes.int64, shape=[])
     take_t = array_ops.placeholder(dtypes.int64, shape=[])
@@ -67,6 +69,17 @@ class GetSingleElementTest(test_base.DatasetTestBase, parameterized.TestCase):
         with self.assertRaisesRegexp(error, error_msg):
           sess.run(element, feed_dict={skip_t: skip, take_t: take})
 
+  def testWindow(self):
+    """Test that `get_single_element()` can consume a nested dataset."""
+    def flat_map_func(ds):
+      batched = ds.batch(2)
+      element = get_single_element.get_single_element(batched)
+      return dataset_ops.Dataset.from_tensors(element)
+
+    dataset = dataset_ops.Dataset.range(10).window(2).flat_map(flat_map_func)
+    self.assertDatasetProduces(
+        dataset, [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
index 9030328593181c15981c889cd7b0c0dc370f060d..8507df3d3a27ea62c9d866c94af589fbc566317e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_reducer_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -36,14 +37,15 @@ class GroupByReducerTest(test_base.DatasetTestBase):
 
   def checkResults(self, dataset, shapes, values):
     self.assertEqual(shapes, dataset.output_shapes)
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
     with self.cached_session() as sess:
       for expected in values:
-        got = sess.run(get_next)
+        got = self.evaluate(get_next)
         self.assertEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testSum(self):
     reducer = grouping.Reducer(
         init_func=lambda _: np.int64(0),
@@ -55,6 +57,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
 
+  @test_util.run_deprecated_v1
   def testAverage(self):
 
     def reduce_fn(x, y):
@@ -72,6 +75,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[i - 1, i])
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     components = np.array(list("abcdefghijklmnopqrst")).view(np.chararray)
     reducer = grouping.Reducer(
@@ -88,6 +92,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           shapes=tensor_shape.scalar(),
           values=[b"acegikmoqs" [:i], b"bdfhjlnprt" [:i]])
 
+  @test_util.run_deprecated_v1
   def testSparseSum(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -105,6 +110,7 @@ class GroupByReducerTest(test_base.DatasetTestBase):
       self.checkResults(
           dataset, shapes=tensor_shape.scalar(), values=[(i - 1) * i, i * i])
 
+  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
 
     def reduce_fn(x, _):
@@ -124,14 +130,14 @@ class GroupByReducerTest(test_base.DatasetTestBase):
           grouping.group_by_reducer(lambda x: x, reducer))
       self.assertEqual([None], dataset.output_shapes[0].as_list())
       self.assertIs(None, dataset.output_shapes[1].ndims)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       with self.cached_session() as sess:
-        x, y = sess.run(get_next)
+        x, y = self.evaluate(get_next)
         self.assertAllEqual([0] * (2**i), x)
         self.assertAllEqual(np.array(1, ndmin=i), y)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   def testTypeMismatch(self):
     reducer = grouping.Reducer(
@@ -188,9 +194,9 @@ class GroupByReducerTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(10), dataset_ops.Dataset.range(10))).apply(
             grouping.group_by_reducer(lambda x, y: np.int64(0), reducer))
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
     with self.cached_session() as sess:
-      x, y = sess.run(get_next)
+      x, y = self.evaluate(get_next)
       self.assertAllEqual(x, np.asarray([x for x in range(10)]))
       self.assertEqual(y, 45)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
index 557d56e8b9a60ec4cd4fb248dd6dfeb1c2ed4589..cbb79e55f507a41c0522163dc0b68c56835891a6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/group_by_window_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -49,6 +50,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
              32, (tensor_shape.TensorShape([]), tensor_shape.TensorShape(
                  [None]), tensor_shape.TensorShape([3])))))
 
+  @test_util.run_deprecated_v1
   def testSingleBucket(self):
 
     def _map_fn(v):
@@ -63,14 +65,14 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda x, y, z: 0,
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
-      which_bucket, bucketed_values = sess.run(get_next)
+      which_bucket, bucketed_values = self.evaluate(get_next)
 
       self.assertEqual(0, which_bucket)
 
@@ -84,6 +86,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(expected_unk_int64, bucketed_values[1])
       self.assertAllEqual(expected_vec3_str, bucketed_values[2])
 
+  @test_util.run_deprecated_v1
   def testEvenOddBuckets(self):
 
     def _map_fn(v):
@@ -98,16 +101,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda x, y, z: math_ops.cast(x % 2, dtypes.int64),
             lambda k, bucket: self._dynamicPad(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches (one containing even values, one containing odds)
-      which_bucket_even, bucketed_values_even = sess.run(get_next)
-      which_bucket_odd, bucketed_values_odd = sess.run(get_next)
+      which_bucket_even, bucketed_values_even = self.evaluate(get_next)
+      which_bucket_odd, bucketed_values_odd = self.evaluate(get_next)
 
       # Count number of bucket_tensors.
       self.assertEqual(3, len(bucketed_values_even))
@@ -141,6 +144,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(expected_unk_int64, bucketed_values_odd[1])
       self.assertAllEqual(expected_vec3_str, bucketed_values_odd[2])
 
+  @test_util.run_deprecated_v1
   def testEvenOddBucketsFilterOutAllOdd(self):
 
     def _map_fn(v):
@@ -169,16 +173,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
             lambda d: math_ops.cast(d["x"] % 2, dtypes.int64),
             lambda k, bucket: _dynamic_pad_fn(k, bucket, 32), 32))
 
-    iterator = bucketed_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(bucketed_dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
 
       # Get two minibatches ([0, 2, ...] and [64, 66, ...])
-      which_bucket0, bucketed_values_even0 = sess.run(get_next)
-      which_bucket1, bucketed_values_even1 = sess.run(get_next)
+      which_bucket0, bucketed_values_even0 = self.evaluate(get_next)
+      which_bucket1, bucketed_values_even1 = self.evaluate(get_next)
 
       # Ensure that bucket 1 was completely filtered out
       self.assertAllEqual(0, which_bucket0)
@@ -188,6 +192,7 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertAllEqual(
           np.arange(64, 128, 2, dtype=np.int64), bucketed_values_even1["x"])
 
+  @test_util.run_deprecated_v1
   def testDynamicWindowSize(self):
     components = np.arange(100).astype(np.int64)
 
@@ -202,16 +207,16 @@ class GroupByWindowTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.from_tensor_slices(components).apply(
         grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(20),
                                  None, window_size_func))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.OutOfRangeError):
         batches = 0
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           is_even = all(x % 2 == 0 for x in result)
           is_odd = all(x % 2 == 1 for x in result)
           self.assertTrue(is_even or is_odd)
@@ -221,22 +226,23 @@ class GroupByWindowTest(test_base.DatasetTestBase):
 
       self.assertEqual(batches, 15)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(lambda x: x * x)
         .apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          result = sess.run(get_next)
+          result = self.evaluate(get_next)
           self.assertTrue(
               all(x % 2 == 0
                   for x in result) or all(x % 2 == 1)
@@ -248,61 +254,64 @@ class GroupByWindowTest(test_base.DatasetTestBase):
       self.assertGreaterEqual(num_full_batches, 24)
       self.assertTrue(all(c == 4 for c in counts[:num_full_batches]))
 
+  @test_util.run_deprecated_v1
   def testImmediateOutput(self):
     components = np.array(
         [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 0, 0, 2, 2, 0, 0], dtype=np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).repeat(-1).apply(
             grouping.group_by_window(lambda x: x % 3, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       # The input is infinite, so this test demonstrates that:
       # 1. We produce output without having to consume the entire input,
       # 2. Different buckets can produce output at different rates, and
       # 3. For deterministic input, the output is deterministic.
       for _ in range(3):
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-        self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
-        self.assertAllEqual([2, 2, 2, 2], sess.run(get_next))
-        self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+        self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
+        self.assertAllEqual([2, 2, 2, 2], self.evaluate(get_next))
+        self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testSmallGroups(self):
     components = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0], dtype=np.int64)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).apply(
             grouping.group_by_window(lambda x: x % 2, lambda _, xs: xs.batch(4),
-                                     4)).make_initializable_iterator())
+                                     4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual([0, 0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1, 1, 1, 1], sess.run(get_next))
+      self.evaluate(init_op)
+      self.assertAllEqual([0, 0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1, 1, 1, 1], self.evaluate(get_next))
       # The small outputs at the end are deterministically produced in key
       # order.
-      self.assertAllEqual([0, 0, 0], sess.run(get_next))
-      self.assertAllEqual([1], sess.run(get_next))
+      self.assertAllEqual([0, 0, 0], self.evaluate(get_next))
+      self.assertAllEqual([1], self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(4).apply(
-            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0))
-        .make_initializable_iterator())
+            grouping.group_by_window(lambda _: 0, lambda _, xs: xs, 0)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Window size must be greater than zero, but got 0."):
-        print(sess.run(get_next))
+        print(self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testReduceFuncError(self):
     components = np.random.randint(100, size=(200,)).astype(np.int64)
 
@@ -314,19 +323,19 @@ class GroupByWindowTest(test_base.DatasetTestBase):
           padded_shapes=(tensor_shape.TensorShape([]),
                          constant_op.constant([5], dtype=dtypes.int64) * -1))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: (x, ops.convert_to_tensor([x * x]))).apply(
-            grouping.group_by_window(lambda x, _: x % 2, reduce_func,
-                                     32)).make_initializable_iterator())
+            grouping.group_by_window(lambda x, _: x % 2, reduce_func, 32)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testConsumeWindowDatasetMoreThanOnce(self):
     components = np.random.randint(50, size=(200,)).astype(np.int64)
 
@@ -340,22 +349,21 @@ class GroupByWindowTest(test_base.DatasetTestBase):
               4, padded_shapes=ops.convert_to_tensor([(key + 1) * 10])),
       ))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.fill([math_ops.cast(x, dtypes.int32)], x))
         .apply(grouping.group_by_window(
             lambda x: math_ops.cast(array_ops.shape(x)[0] // 10, dtypes.int64),
-            reduce_func, 4))
-        .make_initializable_iterator())
+            reduce_func, 4)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       counts = []
       with self.assertRaises(errors.OutOfRangeError):
         while True:
-          tight_result, multiple_of_10_result = sess.run(get_next)
+          tight_result, multiple_of_10_result = self.evaluate(get_next)
           self.assertEqual(0, multiple_of_10_result.shape[1] % 10)
           self.assertAllEqual(tight_result,
                               multiple_of_10_result[:, :tight_result.shape[1]])
diff --git a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
index c0ec1486ab8d49e8f1fc3a6ac98fe32cefba605b..81f580fccbd6b0053eaa865408b4f8c5f95ba94f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/ignore_errors_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.data.experimental.ops import error_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ _NUMPY_RANDOM_SEED = 42
 
 class IgnoreErrorsTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -42,17 +44,18 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components)
         .map(lambda x: array_ops.check_numerics(x, "message")).apply(
             error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testParallelMapIgnoreError(self):
     components = np.array([1., 2., 3., np.nan, 5.]).astype(np.float32)
 
@@ -60,17 +63,18 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(components).map(
             lambda x: array_ops.check_numerics(x, "message"),
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for x in [1., 2., 3., 5.]:
-        self.assertEqual(x, sess.run(get_next))
+        self.assertEqual(x, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
+  @test_util.run_deprecated_v1
   def testReadFileIgnoreError(self):
 
     def write_string_to_file(value, filename):
@@ -87,28 +91,28 @@ class IgnoreErrorsTest(test_base.DatasetTestBase):
         dataset_ops.Dataset.from_tensor_slices(filenames).map(
             io_ops.read_file,
             num_parallel_calls=2).prefetch(2).apply(error_ops.ignore_errors()))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       # All of the files are present.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Delete one of the files.
       os.remove(filenames[0])
 
       # Attempting to read filenames[0] will fail, but ignore_errors()
       # will catch the error.
-      sess.run(init_op)
+      self.evaluate(init_op)
       for filename in filenames[1:]:
-        self.assertEqual(compat.as_bytes(filename), sess.run(get_next))
+        self.assertEqual(compat.as_bytes(filename), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
index c93a8353ce01063f52ecc68253df7d02a7689603..c3c4ccd07708d2c7cfdc57c2a6fcbf320f1dfb36 100644
--- a/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/indexed_dataset_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.platform import test
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class IndexedDatasetOpsTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testLowLevelIndexedDatasetOps(self):
     identity = ged_ops.experimental_identity_indexed_dataset(
         ops.convert_to_tensor(16, dtype=dtypes.uint64))
@@ -46,14 +48,15 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
         handle, index, output_types=[dtypes.uint64], output_shapes=[[]])
 
     with self.cached_session() as sess:
-      sess.run(materialize)
+      self.evaluate(materialize)
       self.assertEqual([3], sess.run(get_op, feed_dict={index: 3}))
 
+  @test_util.run_deprecated_v1
   def testIdentityIndexedDataset(self):
     ds = indexed_dataset_ops.IdentityIndexedDataset(16)
     materialized = ds.materialize()
     with self.cached_session() as sess:
-      sess.run(materialized.initializer)
+      self.evaluate(materialized.initializer)
       placeholder = array_ops.placeholder(dtypes.uint64, shape=[])
       for i in range(16):
         output = sess.run(
@@ -68,12 +71,13 @@ class IndexedDatasetOpsTest(test_base.DatasetTestBase):
     itr = ds.make_initializable_iterator()
     n = itr.get_next()
     with self.cached_session() as sess:
-      sess.run(itr.initializer)
+      self.evaluate(itr.initializer)
       for i in range(16):
-        output = sess.run(n)
+        output = self.evaluate(n)
         self.assertEqual(i, output)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(n)
+        self.evaluate(n)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
index 91ae8cb1bd24711e56241c88c5f6b2c496527f01..7c78810494866cbd4cac4201d23182e083037e1c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_batched_features_dataset_test.py
@@ -21,11 +21,13 @@ import numpy as np
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -40,11 +42,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from file 0.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[0],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames[0],
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -57,11 +60,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from file 1.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames[1],
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames[1],
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -74,11 +78,12 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                label_key="label",
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames,
+                    label_key="label",
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(
                 sess,
                 batch_size,
@@ -90,14 +95,16 @@ class MakeBatchedFeaturesDatasetTest(
         with ops.Graph().as_default() as g:
           with self.session(graph=g) as sess:
             # Basic test: read from both files.
-            self.outputs = self.make_batch_feature(
-                filenames=self.test_filenames,
-                num_epochs=num_epochs,
-                batch_size=batch_size).make_one_shot_iterator().get_next()
+            self.outputs = dataset_ops.make_one_shot_iterator(
+                self.make_batch_feature(
+                    filenames=self.test_filenames,
+                    num_epochs=num_epochs,
+                    batch_size=batch_size)).get_next()
             self.verify_records(sess, batch_size, num_epochs=num_epochs)
             with self.assertRaises(errors.OutOfRangeError):
               self._next_actual_batch(sess)
 
+  @test_util.run_deprecated_v1
   def testReadWithEquivalentDataset(self):
     features = {
         "file": parsing_ops.FixedLenFeature([], dtypes.int64),
@@ -107,19 +114,19 @@ class MakeBatchedFeaturesDatasetTest(
         core_readers.TFRecordDataset(self.test_filenames)
         .map(lambda x: parsing_ops.parse_single_example(x, features))
         .repeat(10).batch(2))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for file_batch, _, _, _, record_batch, _ in self._next_expected_batch(
           range(self._num_files), 2, 10):
-        actual_batch = sess.run(next_element)
+        actual_batch = self.evaluate(next_element)
         self.assertAllEqual(file_batch, actual_batch["file"])
         self.assertAllEqual(record_batch, actual_batch["record"])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testReadWithFusedShuffleRepeatDataset(self):
     num_epochs = 5
@@ -128,18 +135,18 @@ class MakeBatchedFeaturesDatasetTest(
       # Test that shuffling with same seed produces the same result.
       with ops.Graph().as_default() as g:
         with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5)).get_next()
+          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
+              shuffle_seed=5)).get_next()
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
             batch2 = self._run_actual_batch(outputs2, sess)
@@ -149,18 +156,18 @@ class MakeBatchedFeaturesDatasetTest(
       # Test that shuffling with different seeds produces a different order.
       with ops.Graph().as_default() as g:
         with self.session(graph=g) as sess:
-          outputs1 = self.make_batch_feature(
+          outputs1 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=5).make_one_shot_iterator().get_next()
-          outputs2 = self.make_batch_feature(
+              shuffle_seed=5)).get_next()
+          outputs2 = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               num_epochs=num_epochs,
               batch_size=batch_size,
               shuffle=True,
-              shuffle_seed=15).make_one_shot_iterator().get_next()
+              shuffle_seed=15)).get_next()
           all_equal = True
           for _ in range(total_records // batch_size):
             batch1 = self._run_actual_batch(outputs1, sess)
@@ -176,14 +183,14 @@ class MakeBatchedFeaturesDatasetTest(
         for parser_num_threads in [2, 4]:
           with ops.Graph().as_default() as g:
             with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  label_key="label",
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
+              self.outputs = dataset_ops.make_one_shot_iterator(
+                  self.make_batch_feature(
+                      filenames=self.test_filenames,
+                      label_key="label",
+                      num_epochs=num_epochs,
+                      batch_size=batch_size,
+                      reader_num_threads=reader_num_threads,
+                      parser_num_threads=parser_num_threads)).get_next()
               self.verify_records(
                   sess,
                   batch_size,
@@ -195,13 +202,13 @@ class MakeBatchedFeaturesDatasetTest(
 
           with ops.Graph().as_default() as g:
             with self.session(graph=g) as sess:
-              self.outputs = self.make_batch_feature(
-                  filenames=self.test_filenames,
-                  num_epochs=num_epochs,
-                  batch_size=batch_size,
-                  reader_num_threads=reader_num_threads,
-                  parser_num_threads=parser_num_threads).make_one_shot_iterator(
-                  ).get_next()
+              self.outputs = dataset_ops.make_one_shot_iterator(
+                  self.make_batch_feature(
+                      filenames=self.test_filenames,
+                      num_epochs=num_epochs,
+                      batch_size=batch_size,
+                      reader_num_threads=reader_num_threads,
+                      parser_num_threads=parser_num_threads)).get_next()
               self.verify_records(
                   sess,
                   batch_size,
@@ -215,12 +222,12 @@ class MakeBatchedFeaturesDatasetTest(
       for num_epochs in [1, 10]:
         with ops.Graph().as_default():
           # Basic test: read from file 0.
-          outputs = self.make_batch_feature(
+          outputs = dataset_ops.make_one_shot_iterator(self.make_batch_feature(
               filenames=self.test_filenames[0],
               label_key="label",
               num_epochs=num_epochs,
               batch_size=batch_size,
-              drop_final_batch=True).make_one_shot_iterator().get_next()
+              drop_final_batch=True)).get_next()
           for tensor in nest.flatten(outputs):
             if isinstance(tensor, ops.Tensor):  # Guard against SparseTensor.
               self.assertEqual(tensor.shape[0], batch_size)
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
index e4bf08918420b7b63fbb0d3a0ae56c7395ff9e97..3b7b335e7066175fba6ef190b977362bc461ca1d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_csv_dataset_test.py
@@ -29,10 +29,11 @@ from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _make_csv_dataset(self, filenames, batch_size, num_epochs=1, **kwargs):
@@ -74,7 +75,6 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
   def _verify_output(
       self,
-      sess,
       dataset,
       batch_size,
       num_epochs,
@@ -82,7 +82,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
       expected_output,
       expected_keys,
   ):
-    nxt = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
     for expected_features in self._next_expected_batch(
         expected_output,
@@ -90,7 +90,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         batch_size,
         num_epochs,
     ):
-      actual_features = sess.run(nxt)
+      actual_features = self.evaluate(get_next())
 
       if label_name is not None:
         expected_labels = expected_features.pop(label_name)
@@ -102,7 +102,7 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
         self.assertAllEqual(expected_features[k], actual_features[k])
 
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(nxt)
+      self.evaluate(get_next())
 
   def _test_dataset(self,
                     inputs,
@@ -116,16 +116,14 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
     # Convert str type because py3 tf strings are bytestrings
     filenames = self._setup_files(
         inputs, compression_type=kwargs.get("compression_type", None))
-    with ops.Graph().as_default() as g:
-      with self.session(graph=g) as sess:
-        dataset = self._make_csv_dataset(
-            filenames,
-            batch_size=batch_size,
-            num_epochs=num_epochs,
-            label_name=label_name,
-            **kwargs)
-        self._verify_output(sess, dataset, batch_size, num_epochs, label_name,
-                            expected_output, expected_keys)
+    dataset = self._make_csv_dataset(
+        filenames,
+        batch_size=batch_size,
+        num_epochs=num_epochs,
+        label_name=label_name,
+        **kwargs)
+    self._verify_output(dataset, batch_size, num_epochs, label_name,
+                        expected_output, expected_keys)
 
   def testMakeCSVDataset(self):
     """Tests making a CSV dataset with keys and defaults provided."""
@@ -581,69 +579,65 @@ class MakeCsvDatasetTest(test_base.DatasetTestBase):
 
     total_records = 20
     for batch_size in [1, 2]:
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with the same seed produces the same result
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(sess.run(outputs1))
-            batch2 = nest.flatten(sess.run(outputs2))
-            for i in range(len(batch1)):
-              self.assertAllEqual(batch1[i], batch2[i])
-
-      with ops.Graph().as_default() as g:
-        with self.session(graph=g) as sess:
-          # Test that shuffling with a different seed produces different results
-          dataset1 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=5,
-              num_epochs=2,
-          )
-          dataset2 = self._make_csv_dataset(
-              filenames,
-              column_defaults=record_defaults,
-              column_names=column_names,
-              batch_size=batch_size,
-              header=True,
-              shuffle=True,
-              shuffle_seed=6,
-              num_epochs=2,
-          )
-          outputs1 = dataset1.make_one_shot_iterator().get_next()
-          outputs2 = dataset2.make_one_shot_iterator().get_next()
-          all_equal = False
-          for _ in range(total_records // batch_size):
-            batch1 = nest.flatten(sess.run(outputs1))
-            batch2 = nest.flatten(sess.run(outputs2))
-            for i in range(len(batch1)):
-              all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
-          self.assertFalse(all_equal)
+      # Test that shuffling with the same seed produces the same result
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          self.assertAllEqual(batch1[i], batch2[i])
+
+      # Test that shuffling with a different seed produces different results
+      dataset1 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=5,
+          num_epochs=2,
+      )
+      dataset2 = self._make_csv_dataset(
+          filenames,
+          column_defaults=record_defaults,
+          column_names=column_names,
+          batch_size=batch_size,
+          header=True,
+          shuffle=True,
+          shuffle_seed=6,
+          num_epochs=2,
+      )
+      next1 = self.getNext(dataset1)
+      next2 = self.getNext(dataset2)
+      all_equal = False
+      for _ in range(total_records // batch_size):
+        batch1 = nest.flatten(self.evaluate(next1()))
+        batch2 = nest.flatten(self.evaluate(next2()))
+        for i in range(len(batch1)):
+          all_equal = all_equal and np.array_equal(batch1[i], batch2[i])
+      self.assertFalse(all_equal)
 
   def testIndefiniteRepeatShapeInference(self):
     column_names = ["col%d" % i for i in range(5)]
diff --git a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
index 657cf3c00ee899a9a5718d808ba3d7ee2454bf6b..ab2feb642629eef098162ca445f54e84fc0389a9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/make_tf_record_dataset_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.data.experimental.kernel_tests import reader_dataset_ops_test_base
 from tensorflow.python.data.experimental.ops import readers
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
@@ -105,7 +106,7 @@ class MakeTFRecordDatasetTest(
     for expected_batch in self._next_expected_batch(
         file_indices, batch_size, num_epochs, interleave_cycle_length,
         drop_final_batch, use_parser_fn):
-      actual_batch = sess.run(outputs)
+      actual_batch = self.evaluate(outputs)
       self.assertAllEqual(expected_batch, actual_batch)
 
   def _read_test(self, batch_size, num_epochs, file_index=None,
@@ -122,20 +123,21 @@ class MakeTFRecordDatasetTest(
 
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
-        outputs = readers.make_tf_record_dataset(
-            file_pattern=file_pattern,
-            num_epochs=num_epochs,
-            batch_size=batch_size,
-            parser_fn=fn,
-            num_parallel_reads=num_parallel_reads,
-            drop_final_batch=drop_final_batch,
-            shuffle=False).make_one_shot_iterator().get_next()
+        outputs = dataset_ops.make_one_shot_iterator(
+            readers.make_tf_record_dataset(
+                file_pattern=file_pattern,
+                num_epochs=num_epochs,
+                batch_size=batch_size,
+                parser_fn=fn,
+                num_parallel_reads=num_parallel_reads,
+                drop_final_batch=drop_final_batch,
+                shuffle=False)).get_next()
         self._verify_records(
             sess, outputs, batch_size, file_index, num_epochs=num_epochs,
             interleave_cycle_length=num_parallel_reads,
             drop_final_batch=drop_final_batch, use_parser_fn=parser_fn)
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(outputs)
+          self.evaluate(outputs)
 
   def testRead(self):
     for batch_size in [1, 2]:
@@ -185,22 +187,22 @@ class MakeTFRecordDatasetTest(
             num_parallel_reads=num_parallel_reads,
             shuffle=True,
             shuffle_seed=seed)
-        iterator = dataset.make_initializable_iterator()
+        iterator = dataset_ops.make_initializable_iterator(dataset)
         next_element = iterator.get_next()
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         first_batches = []
         try:
           while True:
-            first_batches.append(sess.run(next_element))
+            first_batches.append(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
 
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         second_batches = []
         try:
           while True:
-            second_batches.append(sess.run(next_element))
+            second_batches.append(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
index 5ead6d1c7547fe2a39ffa826ac3f9a28f3bec90e..5c115f7ae311ddabef1ff6d7279d724bb1e18f85 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_and_batch_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -48,6 +49,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("ParallelCallsNUMA", 2, None, True),
       ("ParallelBatchesNUMA", None, 10, True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatch(self, num_parallel_calls, num_parallel_batches,
                       numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
@@ -76,7 +78,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -89,13 +91,13 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       sess.run(init_op, feed_dict={count: 28, batch_size: 14})
       num_batches = (28 * 7) // 14
       for i in range(num_batches):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(14):
             self.assertAllEqual(component[(i * 14 + j) % 7]**2,
                                 result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Batch of a finite input, where the batch_size does not
       # divide the total number of elements.
@@ -104,23 +106,23 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       # We expect (num_batches - 1) full-sized batches.
       num_batches = int(math.ceil((14 * 7) / 8))
       for i in range(num_batches - 1):
-        result = sess.run(get_next)
+        result = self.evaluate(get_next)
         for component, result_component in zip(components, result):
           for j in range(8):
             self.assertAllEqual(component[(i * 8 + j) % 7]**2,
                                 result_component[j])
-      result = sess.run(get_next)
+      result = self.evaluate(get_next)
       for component, result_component in zip(components, result):
         for j in range((14 * 7) % 8):
           self.assertAllEqual(component[((num_batches - 1) * 8 + j) % 7]**2,
                               result_component[j])
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Batch of an empty input should fail straight away.
       sess.run(init_op, feed_dict={count: 0, batch_size: 8})
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
       # Empty batch should be an initialization time error.
       with self.assertRaises(errors.InvalidArgumentError):
@@ -132,6 +134,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("EvenNUMA", False, True),
       ("UnevenNUMA", True, True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchPartialBatch(self, drop_remainder, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -144,25 +147,26 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     if drop_remainder:
       self.assertEqual([4, 1], iterator.output_shapes.as_list())
     else:
       self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
+    with self.cached_session():
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
       if not drop_remainder:
-        self.assertAllEqual([[64], [81]], sess.run(next_element))
+        self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchYieldsPartialBatch(self, numa_aware):
     dataset = (
         dataset_ops.Dataset.range(10).apply(
@@ -173,20 +177,21 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     self.assertEqual([None, 1], iterator.output_shapes.as_list())
     next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertAllEqual([[0], [1], [4], [9]], sess.run(next_element))
-      self.assertAllEqual([[16], [25], [36], [49]], sess.run(next_element))
-      self.assertAllEqual([[64], [81]], sess.run(next_element))
+    with self.cached_session():
+      self.assertAllEqual([[0], [1], [4], [9]], self.evaluate(next_element))
+      self.assertAllEqual([[16], [25], [36], [49]], self.evaluate(next_element))
+      self.assertAllEqual([[64], [81]], self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchParallelGetNext(self, numa_aware):
     dataset = dataset_ops.Dataset.range(50000).apply(
         batching.map_and_batch(lambda x: x, batch_size=100))
@@ -194,26 +199,27 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     elements = []
     for _ in range(100):
       elements.append(iterator.get_next())
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(5):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
           expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
         self.assertAllEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
+        self.evaluate(elements)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchParallelGetNextDropRemainder(self, numa_aware):
     dataset = dataset_ops.Dataset.range(49999).apply(
         batching.map_and_batch(
@@ -223,26 +229,27 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     elements = []
     for _ in range(100):
       elements.append(iterator.get_next())
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(4):
-        got = sess.run(elements)
+        got = self.evaluate(elements)
         got.sort(key=lambda x: x[0])
         expected = []
         for j in range(100):
           expected.append(range(i * 10000 + j * 100, i * 10000 + (j + 1) * 100))
         self.assertAllEqual(got, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elements)
+        self.evaluate(elements)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchSparse(self, numa_aware):
 
     def _sparse(i):
@@ -255,15 +262,15 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(init_op)
+    with self.cached_session():
+      self.evaluate(init_op)
       for i in range(2):
-        actual = sess.run(get_next)
+        actual = self.evaluate(get_next)
         expected = sparse_tensor.SparseTensorValue(
             indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
             values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
@@ -271,12 +278,13 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
         self.assertTrue(sparse_tensor.is_sparse(actual))
         self.assertSparseValuesEqual(actual, expected)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchFails(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
     dataset = dataset_ops.Dataset.from_tensors(
@@ -288,7 +296,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
 
     init_op = iterator.initializer
     with self.cached_session() as sess:
@@ -299,6 +307,7 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchShapeMismatch(self, numa_aware):
     """Test a dataset that maps a TF function across its input elements."""
 
@@ -316,15 +325,15 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
 
     init_op = iterator.initializer
     get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      sess.run(init_op)
+    with self.cached_session():
+      self.evaluate(init_op)
       with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                    "number of elements does not match"):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
       ("Normal", False),
@@ -349,12 +358,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for _ in range(3):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
       ("1", 0, False),
@@ -370,13 +379,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("5NUMA", 95, True),
       ("6NUMA", 99, True),
   )
-  def testMapAndBatchOutOfRangeError(self, threshold, numa_aware):
+  @test_util.run_deprecated_v1
+  def testMapAndBatchMapError(self, threshold, numa_aware):
 
     def raising_py_fn(i):
-      if i == threshold:
+      if i >= threshold:
         raise StopIteration()
-      elif i > threshold:
-        raise RuntimeError("Alternate error; you shouldn't see me! (i: %s)" % i)
       else:
         return i
 
@@ -388,18 +396,24 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(threshold // 10):
-        self.assertAllEqual([i * 10 + j for j in range(10)], sess.run(get_next))
-      if threshold % 10 != 0:
-        self.assertAllEqual(
-            [threshold // 10 * 10 + j for j in range(threshold % 10)],
-            sess.run(get_next))
+        self.assertAllEqual([i * 10 + j for j in range(10)],
+                            self.evaluate(get_next))
+      if numa_aware:
+        if threshold % 10 != 0:
+          self.assertAllEqual(
+              [threshold // 10 * 10 + j for j in range(threshold % 10)],
+              self.evaluate(get_next))
+      else:
+        for i in range(threshold // 10, 10):
+          with self.assertRaises(errors.InvalidArgumentError):
+            self.evaluate(get_next)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   @parameterized.named_parameters(
       ("1", False, dtypes.bool, False),
@@ -438,11 +452,12 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for _ in range(10):
-        self.assertAllEqual([element for _ in range(10)], sess.run(get_next))
+        self.assertAllEqual([element for _ in range(10)],
+                            self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("Identity", None, lambda x: x, None),
@@ -450,10 +465,11 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("Swap", (None, None), lambda x, y: (y, x), None),
       ("Project", (None, None), lambda x, y: x, None),
   )
+  @test_util.run_deprecated_v1
   def testShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().apply(
         batching.map_and_batch(map_fn, batch_size=10))
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       if isinstance(structure, tuple):
@@ -462,23 +478,25 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         expected = map_fn(
             sess.run(self.structuredElement(structure, shape=[10])))
-      self.assertAllEqual(expected, sess.run(get_next))
+      self.assertAllEqual(expected, self.evaluate(get_next))
 
+  @test_util.run_deprecated_v1
   def testShortCircuitCapturedInput(self):
     captured_t = array_ops.placeholder(dtypes.int64, shape=[])
     dataset = self.structuredDataset(None).repeat().apply(
         batching.map_and_batch(lambda x: captured_t, batch_size=10))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={captured_t: 42})
-      self.assertAllEqual([42] * 10, sess.run(get_next))
+      self.assertAllEqual([42] * 10, self.evaluate(get_next))
 
   @parameterized.named_parameters(
       ("Normal", False),
       ("NUMA", True),
   )
+  @test_util.run_deprecated_v1
   def testMapAndBatchControlFlow(self, numa_aware):
 
     def map_fn(x):
@@ -494,20 +512,19 @@ class MapAndBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
       options = dataset_ops.Options()
       options.experimental_numa_aware = True
       dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(10):
-        print("Case %d" % i)
         if i < 5:
           self.assertAllEqual([i * 10 + j + 1 for j in range(10)],
-                              sess.run(get_next))
+                              self.evaluate(get_next))
         else:
           self.assertAllEqual(
               [((i * 10) + j) * ((i * 10) + j) for j in range(10)],
-              sess.run(get_next))
+              self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
index 11694540fae9996c9249d3a3d3a7c308e2a6f131..6042ca1c63f561a20e58e63e7864e13e847d3b35 100644
--- a/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/map_defun_op_test.py
@@ -218,7 +218,7 @@ class MapDefunTest(test_base.DatasetTestBase):
 
   def _assert_op_cancelled(self, sess, map_defun_op):
     with self.assertRaisesRegexp(errors.CancelledError, "was cancelled"):
-      sess.run(map_defun_op)
+      self.evaluate(map_defun_op)
 
   def testMapDefunWithParentCancellation(self):
     # Checks that a cancellation of the parent graph is threaded through to
@@ -260,10 +260,10 @@ class MapDefunBenchmark(test.Benchmark):
     with session.Session() as sess:
       # Warm up the session
       for _ in range(5):
-        sess.run(op)
+        self.evaluate(op)
       start = time.time()
       for _ in range(num_iters):
-        sess.run(op)
+        self.evaluate(op)
       end = time.time()
       mean_us = (end - start) * 1e6 / num_iters
       self.report_benchmark(
diff --git a/tensorflow/python/data/kernel_tests/matching_files_dataset_op_test.py b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
similarity index 57%
rename from tensorflow/python/data/kernel_tests/matching_files_dataset_op_test.py
rename to tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
index 4d86ec4228a0760da0d1de54f843aac5e16eb35e..0ee7616d35e801743167865d8d8097064ef88126 100644
--- a/tensorflow/python/data/kernel_tests/matching_files_dataset_op_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/matching_files_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for the private `MatchingFilesDataset`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -20,20 +20,17 @@ from __future__ import print_function
 import os
 import shutil
 import tempfile
-import time
 
-import numpy as np
-
-from tensorflow.python.client import session
+from tensorflow.python.data.experimental.ops import matching_files
 from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops.dataset_ops import MatchingFilesDataset
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
-class MatchingFilesDatasetTest(test_base.DatasetTestBase):
+class MatchingFilesTest(test_base.DatasetTestBase):
 
   def setUp(self):
     self.tmp_dir = tempfile.mkdtemp()
@@ -45,34 +42,40 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
     for filename in filenames:
       open(os.path.join(self.tmp_dir, filename), 'a').close()
 
+  @test_util.run_deprecated_v1
   def testNonExistingDirectory(self):
-    """Test the MatchingFiles dataset with a non-existing directory"""
+    """Test the MatchingFiles dataset with a non-existing directory."""
 
     self.tmp_dir = os.path.join(self.tmp_dir, 'nonexistingdir')
-    dataset = MatchingFilesDataset(os.path.join(self.tmp_dir, '*'))
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       with self.assertRaises(errors.NotFoundError):
         sess.run(next_element)
 
+  @test_util.run_deprecated_v1
   def testEmptyDirectory(self):
-    """Test the MatchingFiles dataset with an empty directory"""
+    """Test the MatchingFiles dataset with an empty directory."""
 
-    dataset = MatchingFilesDataset(os.path.join(self.tmp_dir, '*'))
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       with self.assertRaises(errors.NotFoundError):
         sess.run(next_element)
 
+  @test_util.run_deprecated_v1
   def testSimpleDirectory(self):
-    """Test the MatchingFiles dataset with a simple directory"""
+    """Test the MatchingFiles dataset with a simple directory."""
 
     filenames = ['a', 'b', 'c']
     self._touchTempFiles(filenames)
 
-    dataset = MatchingFilesDataset(os.path.join(self.tmp_dir, '*'))
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*'))
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
       expected_filenames = []
       actual_filenames = []
@@ -85,15 +88,17 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  @test_util.run_deprecated_v1
   def testFileSuffixes(self):
-    """Test the MatchingFiles dataset using the suffixes of filename"""
+    """Test the MatchingFiles dataset using the suffixes of filename."""
 
     filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
     self._touchTempFiles(filenames)
 
-    dataset = MatchingFilesDataset(os.path.join(self.tmp_dir, '*.py'))
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, '*.py'))
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       expected_filenames = []
       actual_filenames = []
       for filename in filenames[1:-1]:
@@ -105,15 +110,17 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  @test_util.run_deprecated_v1
   def testFileMiddles(self):
-    """Test the MatchingFiles dataset using the middles of filename"""
+    """Test the MatchingFiles dataset using the middles of filename."""
 
     filenames = ['aa.txt', 'bb.py', 'bbc.pyc', 'cc.pyc']
     self._touchTempFiles(filenames)
 
-    dataset = MatchingFilesDataset(os.path.join(self.tmp_dir, 'b*.py*'))
+    dataset = matching_files.MatchingFilesDataset(
+        os.path.join(self.tmp_dir, 'b*.py*'))
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       expected_filenames = []
       actual_filenames = []
       for filename in filenames[1:3]:
@@ -125,8 +132,9 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(next_element)
 
+  @test_util.run_deprecated_v1
   def testNestedDirectories(self):
-    """Test the MatchingFiles dataset with nested directories"""
+    """Test the MatchingFiles dataset with nested directories."""
 
     filenames = []
     width = 8
@@ -147,9 +155,9 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
                      suffix) for suffix in ['*.txt', '*.log']
     ]
 
-    dataset = MatchingFilesDataset(patterns)
+    dataset = matching_files.MatchingFilesDataset(patterns)
     with self.cached_session() as sess:
-      next_element = dataset.make_one_shot_iterator().get_next()
+      next_element = dataset_ops.make_one_shot_iterator(dataset).get_next()
       expected_filenames = [
           compat.as_bytes(filename)
           for filename in filenames
@@ -165,70 +173,5 @@ class MatchingFilesDatasetTest(test_base.DatasetTestBase):
       self.assertItemsEqual(expected_filenames, actual_filenames)
 
 
-class MatchingFilesDatasetBenchmark(test.Benchmark):
-
-  def benchmarkNestedDirectories(self):
-    tmp_dir = tempfile.mkdtemp()
-    width = 500
-    depth = 10
-    for i in range(width):
-      for j in range(depth):
-        new_base = os.path.join(tmp_dir, str(i),
-                                *[str(dir_name) for dir_name in range(j)])
-        os.makedirs(new_base)
-        child_files = ['a.py', 'b.pyc'] if j < depth - 1 else ['c.txt', 'd.log']
-        for f in child_files:
-          filename = os.path.join(new_base, f)
-          open(filename, 'w').close()
-
-    patterns = [
-        os.path.join(tmp_dir, os.path.join(*['**'
-                                             for _ in range(depth)]), suffix)
-        for suffix in ['*.txt', '*.log']
-    ]
-
-    deltas = []
-    iters = 3
-    for _ in range(iters):
-      with ops.Graph().as_default():
-        dataset = MatchingFilesDataset(patterns)
-        next_element = dataset.make_one_shot_iterator().get_next()
-
-        with session.Session() as sess:
-          sub_deltas = []
-          while True:
-            try:
-              start = time.time()
-              sess.run(next_element)
-              end = time.time()
-              sub_deltas.append(end - start)
-            except errors.OutOfRangeError:
-              break
-          deltas.append(sub_deltas)
-
-    median_deltas = np.median(deltas, axis=0)
-    print('Nested directory size (width*depth): %d*%d Median wall time: '
-          '%fs (read first filename), %fs (read second filename), avg %fs'
-          ' (read %d more filenames)' %
-          (width, depth, median_deltas[0], median_deltas[1],
-           np.average(median_deltas[2:]), len(median_deltas) - 2))
-    self.report_benchmark(
-        iters=iters,
-        wall_time=np.sum(median_deltas),
-        extras={
-            'read first file:':
-                median_deltas[0],
-            'read second file:':
-                median_deltas[1],
-            'avg time for reading %d more filenames:' %
-            (len(median_deltas) - 2):
-                np.average(median_deltas[2:])
-        },
-        name='benchmark_matching_files_dataset_nesteddirectory(%d*%d)' %
-        (width, depth))
-
-    shutil.rmtree(tmp_dir, ignore_errors=True)
-
-
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
index 9946ef5a42f53a56f9f5bf7deb7c41cdff428c33..bf868ebe79339e3c36473711ece064210db5f47f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/BUILD
@@ -42,6 +42,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -68,6 +69,7 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -127,6 +129,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
@@ -148,6 +151,7 @@ py_test(
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -167,6 +171,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -192,6 +197,7 @@ py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
         "@absl_py//absl/testing:parameterized",
@@ -202,6 +208,7 @@ py_test(
     name = "map_vectorization_test",
     size = "medium",
     srcs = ["map_vectorization_test.py"],
+    shard_count = 8,
     srcs_version = "PY2AND3",
     tags = [
         "no_oss",
@@ -220,15 +227,15 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:nn_ops",
+        "//tensorflow/python:nn",
         "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -248,12 +255,9 @@ py_test(
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python/data/experimental/ops:batching",
         "//tensorflow/python/data/experimental/ops:optimization",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
@@ -282,7 +286,7 @@ py_test(
 
 py_test(
     name = "optimize_dataset_test",
-    size = "small",
+    size = "medium",
     srcs = ["optimize_dataset_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -296,10 +300,17 @@ py_test(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:random_ops",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data/experimental/ops:batching",
+        "//tensorflow/python/data/experimental/ops:grouping",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
+        "//tensorflow/python/data/experimental/ops:scan_ops",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -316,6 +327,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/kernel_tests:test_base",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
index ed719a0ce9b2c25f14c40d592f26a46a512c8d83..9b8248a78da11d99e3cf6cd87ab69d30d4d369d6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/assert_next_dataset_test.py
@@ -21,32 +21,27 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class AssertNextDatasetTest(test_base.DatasetTestBase):
 
   def testAssertNext(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Map"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testAssertNextInvalid(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
         optimization.assert_next(["Whoops"])).map(lambda x: x)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted Whoops transformation at offset 0 but encountered "
-          "Map transformation instead."):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted Whoops transformation at offset 0 but encountered "
+            "Map transformation instead."))
 
   def testAssertNextShort(self):
     dataset = dataset_ops.Dataset.from_tensors(0).apply(
@@ -54,14 +49,11 @@ class AssertNextDatasetTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_autotune = False
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          "Asserted next 2 transformations but encountered only 1."):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            "Asserted next 2 transformations but encountered only 1."))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
index 80a0d879dc2088024d1a2a7117f79758c779d5d0..7371cf31dff33a5de18f3268ecdfc91c6a08b29c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/filter_fusion_test.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -58,6 +59,7 @@ def _filter_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_filter_fusion_test_cases())
@@ -70,28 +72,25 @@ class FilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_filter_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.filter_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        r = map_function(x)
-        filtered = False
-        for predicate in predicates:
-          if isinstance(r, tuple):
-            b = predicate(*r)  # Pass tuple as multiple arguments.
-          else:
-            b = predicate(r)
-          if not sess.run(b):
-            filtered = True
-            break
+    expected_output = []
+    for x in range(5):
+      r = map_function(x)
+      filtered = False
+      for predicate in predicates:
+        if isinstance(r, tuple):
+          b = predicate(*r)  # Pass tuple as multiple arguments.
+        else:
+          b = predicate(r)
+        if not self.evaluate(b):
+          filtered = True
+          break
 
-        if not filtered:
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+      if not filtered:
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
index 9f7fbfeba0d0a2d0c503106060985c2e27d6d364..5f3a8683fbb6cb2b43a41ad6d738b4982755bbff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/hoist_random_uniform_test.py
@@ -20,12 +20,15 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -58,23 +61,29 @@ def _hoist_random_uniform_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testDataset(self, dataset):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
     previous_result = 0
-    with self.cached_session() as sess:
-      for _ in range(5):
-        result = sess.run(get_next)
-        self.assertLessEqual(1, result)
-        self.assertLessEqual(result, 10)
-        # This checks if the result is somehow random by checking if we are not
-        # generating the same values.
-        self.assertNotEqual(previous_result, result)
-        previous_result = result
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    if context.executing_eagerly():
+      iterator = dataset.__iter__()
+      get_next = iterator._next_internal  # pylint: disable=protected-access
+    else:
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next
+    for _ in range(5):
+      result = self.evaluate(get_next())
+      self.assertLessEqual(1, result)
+      self.assertLessEqual(result, 10)
+      # This checks if the result is somehow random by checking if we are not
+      # generating the same values.
+      self.assertNotEqual(previous_result, result)
+      previous_result = result
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(*_hoist_random_uniform_test_cases())
   def testHoisting(self, function, will_optimize):
@@ -83,7 +92,8 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
             ["Zip[0]", "Map"] if will_optimize else ["Map"])).map(function)
 
     options = dataset_ops.Options()
-    options.experimental_hoist_random_uniform = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
@@ -99,7 +109,8 @@ class HoistRandomUniformTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(["Zip[0]", "Map"])).map(random_with_capture)
     options = dataset_ops.Options()
-    options.experimental_hoist_random_uniform = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.hoist_random_uniform = True
     dataset = dataset.with_options(options)
     self._testDataset(dataset)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
index 7144d834f9f187c6b2a103d1cf22f4d4db91b429..fc65f52704c3389a24e9f304cfa1cadd5686c7d6 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/latency_all_edges_test.py
@@ -22,10 +22,11 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
 
   def testLatencyStatsOptimization(self):
@@ -39,22 +40,18 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
     options.experimental_stats.latency_all_edges = True
     options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
     summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertEqual(1 * 1, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      summary_str = sess.run(summary_t)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_TensorDataset/_1", 1)
-      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
-                                  1)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_PrefetchDataset/_6", 1)
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
   def testLatencyStatsOptimizationV2(self):
     aggregator = stats_aggregator.StatsAggregator()
@@ -63,24 +60,21 @@ class LatencyAllEdgesTest(stats_dataset_test_base.StatsDatasetTestBase):
             ["LatencyStats", "Map", "LatencyStats", "Prefetch",
              "LatencyStats"])).map(lambda x: x * x).prefetch(1)
     options = dataset_ops.Options()
-    options.experimental_stats = stats_options.StatsOptions(aggregator)
+    options.experimental_stats = stats_options.StatsOptions()
+    options.experimental_stats.aggregator = aggregator
     dataset = dataset.with_options(options)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[1],
+        requires_initialization=True,
+        num_test_iterations=1)
     summary_t = aggregator.get_summary()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertEqual(1, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      summary_str = sess.run(summary_t)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_TensorDataset/_1", 1)
-      self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4",
-                                  1)
-      self._assertSummaryHasCount(summary_str,
-                                  "record_latency_PrefetchDataset/_6", 1)
+    summary_str = self.evaluate(summary_t)
+    self._assertSummaryHasCount(summary_str, "record_latency_TensorDataset/_1",
+                                1)
+    self._assertSummaryHasCount(summary_str, "record_latency_MapDataset/_4", 1)
+    self._assertSummaryHasCount(summary_str,
+                                "record_latency_PrefetchDataset/_6", 1)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
index 6191a7db0840329ecd9f2de0a112aaf1af2ef8b3..2386dd5f116d660eb93213c935b662c05d90011d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/make_numa_aware_test.py
@@ -21,10 +21,11 @@ from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MakeNumaAwareTest(test_base.DatasetTestBase):
 
   def testMakeNumaAware(self):
@@ -34,13 +35,8 @@ class MakeNumaAwareTest(test_base.DatasetTestBase):
     options = dataset_ops.Options()
     options.experimental_numa_aware = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
index ddf3cbbcc358d765beb4bca3ae4ffdf26f2da9ca..e2ff3116eccf2ccfb7ed72085f4727a1e0262164 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_batch_fusion_test.py
@@ -20,26 +20,19 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndBatchFusionTest(test_base.DatasetTestBase):
 
   def testMapAndBatchFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(
             ["MapAndBatch"])).map(lambda x: x * x).batch(10)
-    options = dataset_ops.Options()
-    options.experimental_map_and_batch_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual([x * x for x in range(10)], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset, expected_output=[[x * x for x in range(10)]])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
index 3b4ca62340917de16b2d62d65da2f8cd924e2478..db8f214fbfca1389af70df55518c885610984031 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_and_filter_fusion_test.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -62,23 +63,20 @@ def _map_and_filter_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _testMapAndFilter(self, dataset, function, predicate):
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(10):
-        r = function(x)
-        if isinstance(r, tuple):
-          b = predicate(*r)  # Pass tuple as multiple arguments.
-        else:
-          b = predicate(r)
-        if sess.run(b):
-          result = sess.run(get_next)
-          self.assertAllEqual(r, result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    expected_output = []
+    for x in range(10):
+      r = function(x)
+      if isinstance(r, tuple):
+        b = predicate(*r)  # Pass tuple as multiple arguments.
+      else:
+        b = predicate(r)
+      if self.evaluate(b):
+        expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   @parameterized.named_parameters(*_map_and_filter_fusion_test_cases())
   def testMapFilterFusion(self, function, predicate):
@@ -86,7 +84,8 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(
             ["Map", "FilterByLastComponent"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_map_and_filter_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
@@ -104,7 +103,8 @@ class MapAndFilterFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
         optimization.assert_next(["Map",
                                   "Filter"])).map(function).filter(predicate)
     options = dataset_ops.Options()
-    options.experimental_map_and_filter_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_and_filter_fusion = True
     dataset = dataset.with_options(options)
     self._testMapAndFilter(dataset, function, predicate)
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
index ec63ad72006502afb9f23752c00d5926e8bc9f04..d8d63903749d13b80f662c996ebf5c95f934a0b1 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_fusion_test.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -62,6 +63,7 @@ def _map_fusion_test_cases():
   return tuple(tests)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_fusion_test_cases())
@@ -73,23 +75,19 @@ class MapFusionTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = dataset.cache()
     options = dataset_ops.Options()
-    options.experimental_map_fusion = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_fusion = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        r = x
-        for function in functions:
-          if isinstance(r, tuple):
-            r = function(*r)  # Pass tuple as multiple arguments.
-          else:
-            r = function(r)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    expected_output = []
+    for x in range(5):
+      r = x
+      for function in functions:
+        if isinstance(r, tuple):
+          r = function(*r)  # Pass tuple as multiple arguments.
+        else:
+          r = function(r)
+      expected_output.append(r)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
index c95f7b2eb191e5d1c9abba0605f820568aa3225c..0ff3fff4f8550a4221e54ab2b01ddcaf6c340145 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_parallelization_test.py
@@ -20,11 +20,12 @@ from __future__ import print_function
 from absl.testing import parameterized
 
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -58,6 +59,7 @@ def _map_parallelization_test_cases():
           ("AssertWithRandom", assert_with_random, False))
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(*_map_parallelization_test_cases())
@@ -66,23 +68,12 @@ class MapParallelizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.range(5).apply(
         optimization.assert_next(next_nodes)).map(function)
     options = dataset_ops.Options()
-    options.experimental_map_parallelization = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.map_parallelization = True
     dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        # No need to run the pipeline if it was not optimized.  Also the results
-        # might be hard to check because of random.
-        if not should_optimize:
-          return
-        r = function(x)
-        self.assertAllEqual(r, result)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    if should_optimize:
+      self.assertDatasetProduces(
+          dataset, expected_output=[function(x) for x in range(5)])
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
index f10b66ff69159ef8c88232cfd9eebf545aed1771..adc411bfb5996904a92fd5b565eb59a439303500 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/map_vectorization_test.py
@@ -17,23 +17,21 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
-from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import check_ops
@@ -319,6 +317,7 @@ def _generate_optimization_test_cases():
   } for x in test_cases for num_parallel_calls in (None, 12)]
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _get_test_datasets(self,
@@ -345,17 +344,25 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
       Tuple of (unoptimized dataset, optimized dataset).
     """
     map_node_name = "Map" if num_parallel_calls is None else "ParallelMap"
-    batch_size = 100
 
     def _make_dataset(node_names):
-      return base_dataset.apply(optimization.assert_next(node_names)).map(
-          map_fn, num_parallel_calls=num_parallel_calls).batch(batch_size)
+      dataset = base_dataset.apply(optimization.assert_next(node_names))
+      dataset = dataset.map(map_fn, num_parallel_calls)
+      dataset = dataset.batch(100)
+      options = dataset_ops.Options()
+      opt_options = optimization_options.OptimizationOptions()
+      opt_options.map_and_batch_fusion = False
+      options.experimental_optimization = opt_options
+      dataset = dataset.with_options(options)
+      return dataset
 
     unoptimized = _make_dataset([map_node_name, "Batch"])
     optimized = _make_dataset(["Batch", map_node_name]
                               if expect_optimized else [map_node_name, "Batch"])
     options = dataset_ops.Options()
-    options.experimental_map_vectorization = True
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_vectorization = True
+    options.experimental_optimization = opt_options
     optimized = optimized.with_options(options)
     return unoptimized, optimized
 
@@ -366,7 +373,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
                                                      num_parallel_calls)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationBadMapFn(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationBadMapFn(self):
     # Test map functions that give an error
     def map_fn(x):
       # x has leading dimension 5, this will raise an error
@@ -375,7 +383,7 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
     base_dataset = dataset_ops.Dataset.range(5).repeat(5).batch(
         5, drop_remainder=True)
     _, optimized = self._get_test_datasets(base_dataset, map_fn)
-    nxt = optimized.make_one_shot_iterator().get_next()
+    nxt = dataset_ops.make_one_shot_iterator(optimized).get_next()
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"indices = 10 is not in \[0, 5\)"):
       self.evaluate(nxt)
@@ -394,7 +402,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=True)
     self.assertDatasetsEqual(optimized, unoptimized)
 
-  def testOptimizationIgnoreStateful(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreStateful(self):
 
     def map_fn(x):
       with ops.control_dependencies([check_ops.assert_equal(x, 0)]):
@@ -420,7 +429,8 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
         base_dataset, map_fn, expect_optimized=False)
     self.assertDatasetsEqual(unoptimized, optimized)
 
-  def testOptimizationIgnoreRaggedMap(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationIgnoreRaggedMap(self):
     # Don't optimize when the output of the map fn shapes are unknown.
     def map_fn(x):
       return array_ops.tile(x, x)
@@ -434,102 +444,5 @@ class MapVectorizationTest(test_base.DatasetTestBase, parameterized.TestCase):
          ("IteratorGetNext", "IteratorGetNext_1", 1)])
 
 
-class MapVectorizationBenchmark(test.Benchmark):
-  # TODO(rachelim): Add a benchmark for more expensive transformations, such as
-  # vgg_preprocessing.
-
-  def _run(self, x, num_iters=100, name=None):
-    deltas = []
-    with session.Session() as sess:
-      for _ in range(5):
-        # Warm up session...
-        sess.run(x)
-      for _ in range(num_iters):
-        start = time.time()
-        sess.run(x)
-        end = time.time()
-        deltas.append(end - start)
-    median_time = np.median(deltas)
-    self.report_benchmark(iters=num_iters, wall_time=median_time, name=name)
-    return median_time
-
-  def _compare(self, input_dataset, map_fn, batch_size, input_size, str_id):
-    num_elems = int(np.sum([np.prod(x) for x in input_size]))
-    name_template = "{}__batch_size_{}_input_element_size_{}_{}"
-    unoptimized = input_dataset.map(map_fn).batch(batch_size)
-    unoptimized_op = unoptimized.make_one_shot_iterator().get_next()
-
-    optimized = input_dataset.map(map_fn).batch(batch_size)
-    options = dataset_ops.Options()
-    options.experimental_map_vectorization = True
-    optimized = optimized.with_options(options)
-    optimized_op = optimized.make_one_shot_iterator().get_next()
-
-    unoptimized_time = self._run(
-        unoptimized_op,
-        name=name_template.format(str_id, batch_size, num_elems, "unoptimized"))
-    optimized_time = self._run(
-        optimized_op,
-        name=name_template.format(str_id, batch_size, num_elems, "optimized"))
-
-    print("Batch size: {}\n"
-          "Input element size: {}\n"
-          "Transformation: {}\n"
-          "Speedup: {}\n".format(batch_size, input_size, str_id,
-                                 (unoptimized_time / optimized_time)))
-
-  # Known cheap functions
-  def benchmarkIdentity(self):
-    self._benchmark_helper(lambda *args: [array_ops.identity(x) for x in args],
-                           "identity")
-
-  def benchmarkAddConst(self):
-    self._benchmark_helper(lambda *args: [x + 1 for x in args], "add_const")
-
-  def benchmarkReturnConst(self):
-    self._benchmark_helper(lambda *args: [constant_op.constant(2)], "ret_const")
-
-  def benchmarkSelect(self):
-    self._benchmark_helper(lambda *args: args[0], "select")
-
-  def benchmarkCast(self):
-    self._benchmark_helper(
-        lambda *args: [math_ops.cast(x, dtypes.float64) for x in args], "cast")
-
-  def benchmarkReshape(self):
-    self._benchmark_helper(
-        lambda *args: [array_ops.reshape(x, (-1, 30)) for x in args], "reshape")
-
-  def benchmarkDecodeCSV(self):
-    csv_fn, csv_factory = _generate_csv_test_case()
-    self._benchmark_helper(csv_fn, "decode_csv", lambda: [csv_factory()])
-
-  def benchmarkParseSingleExample(self):
-    # NOTE: Since we haven't implemented a vectorizer for "SerializeSparse",
-    # this function is only naively vectorized.
-    parse_fn, parse_factory = _generate_parse_single_example_test_case()
-
-    self._benchmark_helper(parse_fn, "parse_single_example",
-                           lambda: [parse_factory()])
-
-  def _default_dataset_factory(self):
-    input_sizes = [(10, 10, 3), (10, 100, 300)]
-    for sz in input_sizes:
-      yield dataset_ops.Dataset.from_tensor_slices(np.random.rand(*sz))
-
-  def _benchmark_helper(self, map_fn, str_id, base_dataset_factory=None):
-    if base_dataset_factory is None:
-      base_dataset_factory = self._default_dataset_factory
-
-    batch_size = 1000
-    for base_dataset in base_dataset_factory():
-      base_dataset = base_dataset.repeat()
-      input_size = [
-          tuple(shape.as_list())
-          for shape in nest.flatten(base_dataset.output_shapes)
-      ]
-      self._compare(base_dataset, map_fn, batch_size, input_size, str_id)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
index 5b49bdf453207c70b232cd68bedfaf1a19e08c79..0f0274b41f2da1add8b2361b54e5c32a5974da41 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/model_dataset_test.py
@@ -17,181 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
-
 from absl.testing import parameterized
-import numpy as np
 
-from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager coverage for the following tests.
 class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  def testModelMap(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(math_ops.matmul)
-    dataset = dataset_ops._ModelDataset(dataset)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(100):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelParallelMap(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(
-        math_ops.matmul, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops._ModelDataset(dataset)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(1000):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  @parameterized.named_parameters(
-      ("Default", False),
-      ("NUMA", True),
-  )
-  def testModelMapAndBatch(self, numa_aware):
-    batch_size = 16
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.apply(
-        batching.map_and_batch(
-            math_ops.matmul,
-            num_parallel_calls=optimization.AUTOTUNE,
-            batch_size=batch_size))
-    dataset = dataset_ops._ModelDataset(dataset)
-    options = dataset_ops.Options()
-    options.experimental_numa_aware = numa_aware
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(10):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelParallelInterleave(self):
-    k = 1024 * 1024
-    dataset = dataset_ops.Dataset.from_tensors((np.random.rand(1, 4 * k),
-                                                np.random.rand(4 * k,
-                                                               1))).repeat()
-    dataset = dataset.map(math_ops.matmul)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset,
-        cycle_length=10,
-        num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops._ModelDataset(dataset)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next.op)
-      for _ in range(1000):
-        start = time.time()
-        sess.run(get_next.op)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
-  def testModelNested(self):
-    k = 1024 * 1024
-    a = (np.random.rand(1, 8 * k), np.random.rand(8 * k, 1))
-    b = (np.random.rand(1, 4 * k), np.random.rand(4 * k, 1))
-    c = (np.random.rand(1, 2 * k), np.random.rand(2 * k, 1))
-    dataset = dataset_ops.Dataset.from_tensors((a, b, c)).repeat()
-
-    def f1(a, b, c):
-      x, y = a
-      return math_ops.matmul(x, y), b, c
-
-    def f2(a, b, c):
-      x, y = b
-      return a, math_ops.matmul(x, y), c
-
-    def f3(a, b, c):
-      x, y = c
-      return a, b, math_ops.matmul(x, y)
-
-    dataset = dataset.map(f1, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset, cycle_length=2)
-
-    dataset = dataset.map(f2, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops.Dataset.range(1).repeat().interleave(
-        lambda _: dataset, cycle_length=2)
-
-    dataset = dataset.map(f3, num_parallel_calls=optimization.AUTOTUNE)
-    dataset = dataset_ops._ModelDataset(dataset)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    deltas = []
-    with self.cached_session() as sess:
-      for _ in range(5):
-        sess.run(get_next)
-      for _ in range(100):
-        start = time.time()
-        sess.run(get_next)
-        end = time.time()
-        deltas.append(end - start)
-
-    print("%f (median), %f (mean), %f (stddev), %f (min), %f (max)\n" %
-          (np.median(deltas), np.mean(deltas), np.std(deltas), np.min(deltas),
-           np.max(deltas)))
-
   def testAutotuneOption(self):
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset.map(lambda x: x).apply(
@@ -200,13 +37,13 @@ class ModelDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     options.experimental_autotune = True
     dataset = dataset.with_options(options)
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
+      self.assertEqual(0, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
index ddcd7f4da4bdbcf9d6b871192c73bc1b0239c5dd..8058f53eea240831545444286fb2c6aa404e240a 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/noop_elimination_test.py
@@ -22,11 +22,12 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NoopEliminationTest(test_base.DatasetTestBase):
 
   def testNoopElimination(self):
@@ -40,19 +41,7 @@ class NoopEliminationTest(test_base.DatasetTestBase):
             ["FiniteRepeat", "FiniteSkip", "Prefetch", "MemoryCacheImpl"]))
     dataset = dataset.repeat(some_tensor).skip(5).take(-1).skip(0).repeat(
         1).prefetch(0).prefetch(1).cache()
-    options = dataset_ops.Options()
-    options.experimental_noop_elimination = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.test_session() as sess:
-      for x in range(5):
-        result = sess.run(get_next)
-        self.assertAllEqual(result, x)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(dataset, expected_output=range(5))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
index 739b6a9bf4c059530f1f44cc571d13768eb973b1..230b74e9e8e0e3e26aeabe11faa84c651069c7b8 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/optimize_dataset_test.py
@@ -17,54 +17,125 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import warnings
+
+from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops import optimization_options
+from tensorflow.python.data.experimental.ops import scan_ops
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
 
 
-class OptimizeDatasetTest(test_base.DatasetTestBase):
+def _generate_captured_refvar_test_cases():
+  """Generates testcases.
+
+  Returns:
+    A list of tuples of (testcase_name, make_dataset_fn). make_dataset_fn takes
+    a tf.Variable as input and creates a test dataset that uses that variable.
+  """
+
+  def make_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).map(lambda x: x + var)
+
+  def make_flat_map_dataset(var):
+    return dataset_ops.Dataset.from_tensors(
+        0).flat_map(lambda _: dataset_ops.Dataset.from_tensors(var))
+
+  def make_filter_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).filter(lambda x: x < var)
+
+  def make_map_and_batch_dataset(var):
+
+    def map_fn(x):
+      return x + var
+
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        batching.map_and_batch(map_fn, 1))
+
+  def make_group_by_reducer_dataset(var):
+    reducer = grouping.Reducer(
+        init_func=lambda _: 0,
+        reduce_func=lambda x, y: x,
+        finalize_func=lambda _: var)
+    return dataset_ops.Dataset.range(5).apply(
+        grouping.group_by_reducer(lambda x: x % 2, reducer))
+
+  def make_group_by_window_dataset(var):
+
+    def reduce_fn(key, bucket):
+      del key, bucket
+      return dataset_ops.Dataset.from_tensors(var)
+
+    return dataset_ops.Dataset.from_tensors(0).repeat(10).apply(
+        grouping.group_by_window(lambda _: 0, reduce_fn, 10))
+
+  def make_scan_dataset(var):
+    return dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(
+            0, lambda old_state, elem: (old_state + 1, elem + old_state + var)))
+
+  return [
+      # Core datasets
+      ("Map", make_map_dataset),
+      ("FlatMap", make_flat_map_dataset),
+      ("Filter", make_filter_dataset),
+      # Experimental datasets
+      ("MapAndBatch", make_map_and_batch_dataset),
+      ("GroupByReducer", make_group_by_reducer_dataset),
+      ("GroupByWindow", make_group_by_window_dataset),
+      ("Scan", make_scan_dataset)
+  ]
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class OptimizeDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testOptimizationStatefulFunction(self):
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda _: random_ops.random_uniform([])).batch(10)
+    dataset = dataset_ops.Dataset.range(
+        10).map(lambda _: random_ops.random_uniform([])).batch(10)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(get_next)
+    get_next = self.getNext(dataset)
+    self.evaluate(get_next())
 
-  def testOptimizationLargeInputFromTensor(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensor(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None))
     dataset = dataset_ops.Dataset.from_tensors(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(init_op, {input_t: np.ones([512, 1024, 1025], np.int32)})
-      sess.run(get_next)
+      self.evaluate(get_next)
 
-  def testOptimizationLargeInputFromTensorSlices(self):
+  # TODO(b/117581999): Add eager coverage for the following tests.
+  def testSkipEagerOptimizationLargeInputFromTensorSlices(self):
     input_t = array_ops.placeholder(dtypes.int32, (None, None, None, None))
     dataset = dataset_ops.Dataset.from_tensor_slices(input_t)
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(init_op, {input_t: np.ones([1, 512, 1024, 1025], np.int32)})
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   def testOptimizationNestedDataset(self):
 
@@ -78,13 +149,30 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     dataset = dataset_ops.Dataset.range(1)
     dataset = dataset.flat_map(flat_map_fn)
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
-    with self.cached_session() as sess:
-      self.assertEquals(0, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+  def testOptimizationNestedDatasetWithModifiedRetval(self):
+
+    def flat_map_fn(_):
+      dataset = dataset_ops.Dataset.from_tensors(0)
+      dataset = dataset.apply(optimization.assert_next(["MapAndBatch"]))
+      # Should be fused by map and batch fusion
+      dataset = dataset.map(lambda x: x)
+      dataset = dataset.batch(1)
+      return dataset
+
+    dataset = dataset_ops.Dataset.range(1)
+    dataset = dataset.flat_map(flat_map_fn)
+
+    # TODO(b/120558523): We use Options instead of _OptimizeDataset directly
+    # here because of a bug with chaining _OptimizeDatasets when there are
+    # nested dataset functions
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.map_and_batch_fusion = True
+    options.experimental_optimization = opt_options
+    dataset = dataset.with_options(options)
+    self.assertDatasetProduces(dataset, expected_output=[[0]])
 
   def testOptimizationThreadPoolDataset(self):
     dataset = dataset_ops.Dataset.range(10).batch(10)
@@ -95,14 +183,10 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
             2, display_name="private_thread_pool_%d" % 2))
 
     dataset = dataset_ops._OptimizeDataset(dataset, [])
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      self.assertAllEqual(list(range(10)), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[list(range(10))],
+        requires_initialization=True)
 
   def testOptimizationNonSerializable(self):
     dataset = dataset_ops.Dataset.from_tensors(0)
@@ -113,26 +197,86 @@ class OptimizeDatasetTest(test_base.DatasetTestBase):
     dataset = dataset.skip(0)  # Should be removed by noop elimination
     dataset = dataset.cache()
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEquals(0, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(dataset, expected_output=[0])
 
   def testOptimizationNonSerializableAsDirectInput(self):
-    """Tests that non-serializable dataset can be OptimizeDataset's input.
-    """
+    """Tests that non-serializable dataset can be OptimizeDataset's input."""
     dataset = dataset_ops.Dataset.from_tensors(0)
     dataset = dataset.apply(optimization.non_serializable())
     dataset = dataset_ops._OptimizeDataset(dataset, ["noop_elimination"])
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    with self.cached_session() as sess:
-      self.assertEquals(0, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    self.assertDatasetProduces(dataset, expected_output=[0])
+
+  @parameterized.named_parameters(_generate_captured_refvar_test_cases())
+  # Skip eager because RefVariables are not supported in eager mode.
+  def testSkipEagerOptimizationWithCapturedRefVar(self, dataset_fn):
+    """Tests that default optimizations are disabled with ref variables."""
+    variable = variable_scope.get_variable(
+        "v", initializer=0, use_resource=False)
+    assign_op = variable.assign_add(1)
+
+    unoptimized_dataset = dataset_fn(variable)
+
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.noop_elimination = True
+    opt_options.map_and_batch_fusion = True
+    options.experimental_optimization = opt_options
+    optimized_dataset = unoptimized_dataset.with_options(options)
+
+    # Check that warning is logged.
+    warnings.simplefilter("always")
+    with warnings.catch_warnings(record=True) as w:
+      optimized_it = optimized_dataset.make_initializable_iterator()
+
+    self.assertGreaterEqual(len(w), 1)
+    expected = ("tf.data static optimizations are not compatible with "
+                "tf.Variable. The following optimizations will be disabled: %s."
+                " To enable optimizations, use resource variables instead by "
+                "calling `tf.enable_resource_variables()` at the start of the "
+                "program." % (", ".join(opt_options._static_optimizations())))
+    self.assertTrue(any([expected in str(warning) for warning in w]))
+
+    # Check that outputs are the same in the optimized and unoptimized cases,
+    # when the variable value is changing.
+    unoptimized_it = unoptimized_dataset.make_initializable_iterator()
+    with ops.control_dependencies([assign_op]):
+      unoptimized_output = unoptimized_it.get_next()
+      optimized_output = optimized_it.get_next()
+
+    self.evaluate(variable.initializer)
+    self.evaluate((unoptimized_it.initializer, optimized_it.initializer))
+    while True:
+      try:
+        unoptimized, optimized = self.evaluate((unoptimized_output,
+                                                optimized_output))
+        self.assertEqual(unoptimized, optimized)
+      except errors.OutOfRangeError:
+        break
+
+  def testOptimizationEnabledByDefault(self):
+    """Tests that some optimizations are applied to datasets by default."""
+    options = dataset_ops.Options()
+    expected_optimizations = [
+        "map_and_batch_fusion",
+        "noop_elimination",
+        "shuffle_and_repeat_fusion",
+    ]
+    self.assertEqual(
+        set(options._static_optimizations()), set(expected_optimizations))
+
+  def testOptimizationDisableDefault(self):
+    """Tests that we can disable all static optimizations enabled by default.
+
+    If the `apply_default_optimizations` optimization options flag is False,
+    only explicitly enabled optimizations will be applied.
+    """
+    options = dataset_ops.Options()
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.hoist_random_uniform = True
+    opt_options.apply_default_optimizations = False
+    options.experimental_optimization = opt_options
+    expected_optimizations = ["hoist_random_uniform"]
+    self.assertEqual(options._static_optimizations(), expected_optimizations)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
index 36582f449f3a3de6ef9c8b710348bed21ff83880..594b59375febbba6c939dc5429ff59fe9c971a5f 100644
--- a/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/optimization/shuffle_and_repeat_fusion_test.py
@@ -21,28 +21,27 @@ from tensorflow.python.data.experimental.ops import optimization
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class ShuffleAndRepeatFusionTest(test_base.DatasetTestBase):
 
   def testShuffleAndRepeatFusion(self):
     dataset = dataset_ops.Dataset.range(10).apply(
         optimization.assert_next(["ShuffleAndRepeat"])).shuffle(10).repeat(2)
-    options = dataset_ops.Options()
-    options.experimental_shuffle_and_repeat_fusion = True
-    dataset = dataset.with_options(options)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      for _ in range(2):
-        results = []
-        for _ in range(10):
-          results.append(sess.run(get_next))
-        self.assertAllEqual([x for x in range(10)], sorted(results))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    for _ in range(2):
+      results = []
+      for _ in range(10):
+        results.append(self.evaluate(get_next()))
+      self.assertAllEqual([x for x in range(10)], sorted(results))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
index 5e419a9b2f9e9debef63446263dc51b5c079a495..aa81663a188cfee738acaedfd44e239909a4215e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/override_threadpool_test.py
@@ -22,12 +22,15 @@ import threading
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.core.framework import graph_pb2
+from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.experimental.ops import threadpool
 from tensorflow.python.data.experimental.ops import unique
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
@@ -35,18 +38,7 @@ from tensorflow.python.platform import test
 class OverrideThreadpoolTest(test_base.DatasetTestBase,
                              parameterized.TestCase):
 
-  @parameterized.named_parameters(
-      ("1", 1, None),
-      ("2", 2, None),
-      ("3", 4, None),
-      ("4", 8, None),
-      ("5", 16, None),
-      ("6", 4, -1),
-      ("7", 4, 0),
-      ("8", 4, 1),
-      ("9", 4, 4),
-  )
-  def testNumThreads(self, num_threads, max_intra_op_parallelism):
+  def _testNumThreadsHelper(self, num_threads, override_threadpool_fn):
 
     def get_thread_id(_):
       # Python creates a dummy thread object to represent the current
@@ -60,32 +52,86 @@ class OverrideThreadpoolTest(test_base.DatasetTestBase,
         dataset_ops.Dataset.range(1000).map(
             lambda x: script_ops.py_func(get_thread_id, [x], dtypes.int64),
             num_parallel_calls=32).apply(unique.unique()))
-
-    dataset = threadpool.override_threadpool(
-        dataset,
-        threadpool.PrivateThreadPool(
-            num_threads,
-            max_intra_op_parallelism=max_intra_op_parallelism,
-            display_name="private_thread_pool_%d" % num_threads))
-
-    iterator = dataset.make_initializable_iterator()
+    dataset = override_threadpool_fn(dataset)
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      thread_ids = []
-      try:
-        while True:
-          thread_ids.append(sess.run(next_element))
-      except errors.OutOfRangeError:
-        pass
-      self.assertEqual(len(thread_ids), len(set(thread_ids)))
-      self.assertGreater(len(thread_ids), 0)
+    self.evaluate(iterator.initializer)
+    thread_ids = []
+    try:
+      while True:
+        thread_ids.append(self.evaluate(next_element))
+    except errors.OutOfRangeError:
+      pass
+    self.assertLen(thread_ids, len(set(thread_ids)))
+    self.assertNotEmpty(thread_ids)
+    if num_threads:
       # NOTE(mrry): We don't control the thread pool scheduling, and
       # so cannot guarantee that all of the threads in the pool will
       # perform work.
       self.assertLessEqual(len(thread_ids), num_threads)
 
+  @parameterized.named_parameters(
+      ("1", 1, None),
+      ("2", 2, None),
+      ("3", 4, None),
+      ("4", 8, None),
+      ("5", 16, None),
+      ("6", 4, -1),
+      ("7", 4, 0),
+      ("8", 4, 1),
+      ("9", 4, 4),
+  )
+  @test_util.run_deprecated_v1
+  def testNumThreadsDeprecated(self, num_threads, max_intra_op_parallelism):
+
+    def override_threadpool_fn(dataset):
+      return threadpool.override_threadpool(
+          dataset,
+          threadpool.PrivateThreadPool(
+              num_threads,
+              max_intra_op_parallelism=max_intra_op_parallelism,
+              display_name="private_thread_pool_%d" % num_threads))
+
+    self._testNumThreadsHelper(num_threads, override_threadpool_fn)
+
+  @parameterized.named_parameters(
+      ("1", 1, None),
+      ("2", 2, None),
+      ("3", 4, None),
+      ("4", 8, None),
+      ("5", 16, None),
+      ("6", None, 0),
+      ("7", None, 1),
+      ("8", None, 4),
+      ("9", 4, 0),
+      ("10", 4, 1),
+      ("11", 4, 4),
+      ("12", None, None),
+  )
+  @test_util.run_deprecated_v1
+  def testNumThreads(self, num_threads, max_intra_op_parallelism):
+
+    def override_threadpool_fn(dataset):
+      t_options = threading_options.ThreadingOptions()
+      if max_intra_op_parallelism is not None:
+        t_options.max_intra_op_parallelism = max_intra_op_parallelism
+      if num_threads is not None:
+        t_options.private_threadpool_size = num_threads
+      options = dataset_ops.Options()
+      options.experimental_threading = t_options
+      return dataset.with_options(options)
+
+    self._testNumThreadsHelper(num_threads, override_threadpool_fn)
+
+  def testMaxIntraOpParallelismAsGraphDefInternal(self):
+    dataset = dataset_ops.Dataset.from_tensors(0)
+    dataset = dataset_ops._MaxIntraOpParallelismDataset(dataset, 1)
+    graph = graph_pb2.GraphDef().FromString(
+        self.evaluate(dataset._as_serialized_graph()))
+    self.assertTrue(
+        any([node.op != "MaxIntraOpParallelismDataset" for node in graph.node]))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
index 90ac250df70bfac8c0d73836391900cf83a603e5..113326c028a53be5b6aa3889ace5013fc08843a4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parallel_interleave_test.py
@@ -86,7 +86,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.block_length, self.sloppy,
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -195,9 +195,9 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 1):
         self.write_coordination_events[expected_element].set()
         self.assertEqual(expected_element * expected_element,
-                         sess.run(self.next_element))
+                         self.evaluate(self.next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testSingleThreaded(self):
     self._testSingleThreaded()
@@ -235,10 +235,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       for expected_element in self._interleave(
           [[3] * 3, [7] * 7, [4] * 4] * self.repeat_count, 2, 1):
         self.write_coordination_events[expected_element].set()
-        output = sess.run(self.next_element)
+        output = self.evaluate(self.next_element)
         self.assertEqual(expected_element * expected_element, output)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def _testTwoThreadsNoContention(self, sloppy=False):
     # num_threads > 1.
@@ -262,7 +262,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           self.read_coordination_events[expected_element].acquire()
           done_first_event = True
@@ -270,7 +270,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContention(self):
     self._testTwoThreadsNoContention()
@@ -309,7 +309,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         else:
           self.write_coordination_events[expected_element].set()
         time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.assertTrue(
@@ -318,7 +318,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionWithRaces(self):
     self._testTwoThreadsNoContentionWithRaces()
@@ -348,7 +348,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.read_coordination_events[expected_element].acquire()
@@ -356,7 +356,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionBlockLength(self):
     self._testTwoThreadsNoContentionBlockLength()
@@ -396,7 +396,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         else:
           self.write_coordination_events[expected_element].set()
         time.sleep(0.5)  # Sleep to consistently "avoid" the race condition.
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           done_first_event = True
           self.assertTrue(
@@ -405,7 +405,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testTwoThreadsNoContentionWithRacesAndBlocking(self):
     self._testTwoThreadsNoContentionWithRacesAndBlocking()
@@ -428,7 +428,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
               self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testEmptyInput(self):
     self._testEmptyInput()
@@ -451,7 +451,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
               self.prefetch_input_elements: 0,
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testNonEmptyInputIntoEmptyOutputs(self):
     self._testNonEmptyInputIntoEmptyOutputs()
@@ -484,7 +484,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         # presence of finishing iterators.
         if done_first_event and not (sloppy and (i in race_indices)):
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event or (sloppy and (i in race_indices)):
           done_first_event = True
           self.read_coordination_events[expected_element].acquire()
@@ -520,10 +520,10 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
       ]
       for element in mis_ordering:
         self.write_coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(self.next_element))
+        self.assertEqual(element * element, self.evaluate(self.next_element))
         self.assertTrue(self.read_coordination_events[element].acquire(False))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testBlockLengthWithContentionSloppy(self):
     with self.cached_session() as sess:
@@ -549,7 +549,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
         self.write_coordination_events[expected_element].set()
         if done_first_event:  # First event starts the worker threads.
           self.read_coordination_events[expected_element].acquire()
-        actual_element = sess.run(self.next_element)
+        actual_element = self.evaluate(self.next_element)
         if not done_first_event:
           self.read_coordination_events[expected_element].acquire()
           done_first_event = True
@@ -557,7 +557,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                          "At index %s: %s expected, got: %s" %
                          (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def _testEarlyExit(self, sloppy=False):
     # Exiting without consuming all input should not block
@@ -575,7 +575,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           })
       for i in range(4, 7):
         self.write_coordination_events[i].set()
-      elem = sess.run(self.next_element)  # Start all workers
+      elem = self.evaluate(self.next_element)  # Start all workers
       # Allow the one successful worker to progress beyond the py_func again.
       elem = int(math.sqrt(elem))
       self.write_coordination_events[elem].set()
@@ -603,12 +603,12 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
     dataset = dataset.apply(
         interleave_ops.parallel_interleave(
             interleave_fn, cycle_length=16, block_length=2, sloppy=sloppy))
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     with self.cached_session() as sess:
       output_values = []
       for _ in range(30):
-        output_values.append(sess.run(iterator.get_next()))
+        output_values.append(self.evaluate(iterator.get_next()))
 
     expected_values = self._interleave(
         [[4] * 4, [5] * 5, [6] * 6] * self.repeat_count, 1, 2)
@@ -630,20 +630,19 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
     dataset = dataset_ops.Dataset.range(10).map(_map_fn)
-    iterator = dataset.apply(
-        interleave_ops.parallel_interleave(
-            _interleave_fn, cycle_length=1)).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset.apply(
+        interleave_ops.parallel_interleave(_interleave_fn, cycle_length=1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(init_op)
+      self.evaluate(init_op)
       for i in range(10):
         for j in range(2):
           expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
+          self.assertAllEqual(expected, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   def testErrorsInOutputFn(self):
     with self.cached_session() as sess:
@@ -668,15 +667,15 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self.error = ValueError()
           self.write_coordination_events[expected_element].set()
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
           self.write_coordination_events[expected_element].set()
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element * expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testErrorsInInputFn(self):
 
@@ -701,7 +700,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
 
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -720,14 +719,14 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
         if expected_element == 5:
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testErrorsInInterleaveFn(self):
 
@@ -750,7 +749,7 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
                                                self.buffer_output_elements,
                                                self.prefetch_input_elements)))
 
-    self.iterator = self.dataset.make_initializable_iterator()
+    self.iterator = dataset_ops.make_initializable_iterator(self.dataset)
     self.init_op = self.iterator.initializer
     self.next_element = self.iterator.get_next()
 
@@ -769,14 +768,14 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
           self._interleave([[4] * 4, [5], [6] * 6] * self.repeat_count, 2, 1)):
         if expected_element == 5:
           with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(self.next_element)
+            self.evaluate(self.next_element)
         else:
-          actual_element = sess.run(self.next_element)
+          actual_element = self.evaluate(self.next_element)
           self.assertEqual(expected_element, actual_element,
                            "At index %s: %s expected, got: %s" %
                            (i, expected_element, actual_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.next_element)
+        self.evaluate(self.next_element)
 
   def testShutdownRace(self):
     dataset = dataset_ops.Dataset.range(20)
@@ -789,17 +788,17 @@ class ParallelInterleaveTest(test_base.DatasetTestBase):
             buffer_output_elements=1,
             prefetch_input_elements=0))
     dataset = dataset.batch(32)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     results = []
     with self.cached_session() as sess:
       for _ in range(2):
         elements = []
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         try:
           while True:
-            elements.extend(sess.run(next_element))
+            elements.extend(self.evaluate(next_element))
         except errors.OutOfRangeError:
           pass
         results.append(elements)
diff --git a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
index c74f754fefbc88d685d593c3545d34107f5ca2af..76e0d4d72a6d22f24da9c762770d1592ba67b737 100644
--- a/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/parse_example_dataset_test.py
@@ -144,6 +144,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_values=expected_output,
         create_iterator_twice=True)
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithoutDefaultsShouldFail(self):
     input_features = {
         "st_a":
@@ -177,6 +178,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_err=(errors_impl.InvalidArgumentError,
                       "Feature: c \\(data type: float\\) is required"))
 
+  @test_util.run_deprecated_v1
   def testDenseNotMatchingShapeShouldFail(self):
     original = [
         example(features=features({
@@ -669,6 +671,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
+  @test_util.run_deprecated_v1
   def testSkipEagerSerializedShapeMismatch(self):
     aname = "a"
     bname = "b"
@@ -706,6 +709,7 @@ class ParseExampleDatasetTest(test_base.DatasetTestBase):
         expected_err=(ValueError,
                       "Cannot reshape a tensor with 0 elements to shape"))
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
     aname = "a"
     bname = "b"
diff --git a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
index f73725366c46e1b0dca88e3d1b09147a23966eaf..80bd43e9adee52afefc6a6c9866bab671aa4a731 100644
--- a/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/prefetch_to_device_test.py
@@ -31,17 +31,15 @@ from tensorflow.python.platform import test
 
 class PrefetchToDeviceTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testPrefetchToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -50,29 +48,26 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchToSameDevice(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device(
             "/job:localhost/replica:0/task:0/device:CPU:0"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -81,27 +76,24 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
-    with self.cached_session() as sess:
+    with self.cached_session():
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchDictToDevice(self):
     host_dataset = dataset_ops.Dataset.range(10).map(lambda x: {"a": x})
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -110,17 +102,17 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element["a"].dtype)
     self.assertEqual([], next_element["a"].shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        self.assertEqual({"a": i}, sess.run(next_element))
+        self.assertEqual({"a": i}, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchSparseTensorsToDevice(self):
     def make_tensor(i):
       return sparse_tensor.SparseTensorValue(
@@ -130,12 +122,9 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_one_shot_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_one_shot_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -144,18 +133,17 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
+    with self.test_session(config=worker_config):
       for i in range(10):
-        actual = sess.run(next_element)
+        actual = self.evaluate(next_element)
         self.assertAllEqual([i], actual.values)
         self.assertAllEqual([[0, 0]], actual.indices)
         self.assertAllEqual([2, 2], actual.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testPrefetchToDeviceGpu(self):
     if not test_util.is_gpu_available():
@@ -165,26 +153,26 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testPrefetchToDeviceWithReInit(self):
     host_dataset = dataset_ops.Dataset.range(10)
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/cpu:1"))
 
-    # NOTE(mrry): This device block creates the "host" dataset and iterator on
-    # /cpu:0, and ensures that the prefetching is across devices. In typical use
-    # this would not be necessary, because the GPU device would not support any
-    # of the dataset-related ops.
-    with ops.device("/cpu:0"):
-      iterator = device_dataset.make_initializable_iterator()
+    with ops.device("/cpu:1"):
+      iterator = dataset_ops.make_initializable_iterator(device_dataset)
+      next_element = iterator.get_next()
 
     self.assertEqual(host_dataset.output_types, device_dataset.output_types)
     self.assertEqual(host_dataset.output_types, iterator.output_types)
@@ -193,20 +181,19 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     self.assertEqual(host_dataset.output_classes, device_dataset.output_classes)
     self.assertEqual(host_dataset.output_classes, iterator.output_classes)
 
-    next_element = iterator.get_next()
     self.assertEqual(dtypes.int64, next_element.dtype)
     self.assertEqual([], next_element.shape)
 
     worker_config = config_pb2.ConfigProto(device_count={"CPU": 2})
-    with self.test_session(config=worker_config) as sess:
-      sess.run(iterator.initializer)
+    with self.test_session(config=worker_config):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testPrefetchToDeviceGpuWithReInit(self):
     if not test_util.is_gpu_available():
@@ -216,18 +203,19 @@ class PrefetchToDeviceTest(test_base.DatasetTestBase):
     device_dataset = host_dataset.apply(
         prefetching_ops.prefetch_to_device("/gpu:0"))
 
-    iterator = device_dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(device_dataset)
     next_element = iterator.get_next()
 
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+    with self.cached_session(
+        config=config_pb2.ConfigProto(allow_soft_placement=False)):
+      self.evaluate(iterator.initializer)
       for i in range(5):
-        self.assertEqual(i, sess.run(next_element))
-      sess.run(iterator.initializer)
+        self.assertEqual(i, self.evaluate(next_element))
+      self.evaluate(iterator.initializer)
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
index 4c879dbae68b358e46b2546dc61befe060df4daa..76f68f50c8188e58affc353e62b7ff8c952c4955 100644
--- a/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/rejection_resample_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
@@ -47,7 +48,7 @@ def _time_resampling(
           initial_dist=init_dist,
           seed=142))
 
-  get_next = dataset.make_one_shot_iterator().get_next()
+  get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
   with test_obj.test_session() as sess:
     start_time = time.time()
@@ -63,6 +64,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("InitialDistributionKnown", True),
       ("InitialDistributionUnknown", False))
+  @test_util.run_deprecated_v1
   def testDistribution(self, initial_known):
     classes = np.random.randint(5, size=(20000,))  # Uniformly sampled
     target_dist = [0.9, 0.05, 0.05, 0.0, 0.0]
@@ -71,12 +73,12 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices(classes).shuffle(
         200, seed=21).map(lambda c: (c, string_ops.as_string(c))).repeat()
 
-    get_next = dataset.apply(
+    get_next = dataset_ops.make_one_shot_iterator(dataset.apply(
         resampling.rejection_resample(
             target_dist=target_dist,
             initial_dist=initial_dist,
             class_func=lambda c, _: c,
-            seed=27)).make_one_shot_iterator().get_next()
+            seed=27))).get_next()
 
     with self.cached_session() as sess:
       returned = []
@@ -97,6 +99,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
   @parameterized.named_parameters(
       ("OnlyInitial", True),
       ("NotInitial", False))
+  @test_util.run_deprecated_v1
   def testEdgeCasesSampleFromInitialDataset(self, only_initial_dist):
     init_dist = [0.5, 0.5]
     target_dist = [0.5, 0.5] if only_initial_dist else [0.0, 1.0]
@@ -114,7 +117,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       returned = []
@@ -122,6 +125,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
         while True:
           returned.append(sess.run(get_next))
 
+  @test_util.run_deprecated_v1
   def testRandomClasses(self):
     init_dist = [0.25, 0.25, 0.25, 0.25]
     target_dist = [0.0, 0.0, 0.0, 1.0]
@@ -145,7 +149,7 @@ class RejectionResampleTest(test_base.DatasetTestBase, parameterized.TestCase):
             target_dist=target_dist,
             initial_dist=init_dist))
 
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       returned = []
diff --git a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
index 516e489d043ccec513267ae3d51b639540a4fcd6..658e6120cf9e30d7f79e542c8df726d997b1abb9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/restructured_dataset_test.py
@@ -22,12 +22,14 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class RestructuredDatasetTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testRestructureDataset(self):
     components = (array_ops.placeholder(dtypes.int32),
                   (array_ops.placeholder(dtypes.int32, shape=[None]),
diff --git a/tensorflow/python/data/experimental/kernel_tests/scan_test.py b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
index 0730455431f9a3faaeb22b62f59d45c04d07c208..bd974b21e301806e5282c8970e091df684c85144 100644
--- a/tensorflow/python/data/experimental/kernel_tests/scan_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/scan_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
@@ -40,6 +41,7 @@ class ScanTest(test_base.DatasetTestBase):
     return dataset_ops.Dataset.from_tensors(0).repeat().apply(
         scan_ops.scan(start, scan_fn))
 
+  @test_util.run_deprecated_v1
   def testCount(self):
     def make_scan_fn(step):
       return lambda state, _: (state + step, state)
@@ -47,8 +49,8 @@ class ScanTest(test_base.DatasetTestBase):
     start = array_ops.placeholder(dtypes.int32, shape=[])
     step = array_ops.placeholder(dtypes.int32, shape=[])
     take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        start, make_scan_fn(step)).take(take).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
+        start, make_scan_fn(step)).take(take))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -60,15 +62,15 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element))
+          self.assertEqual(expected, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
   @test_util.run_in_graph_and_eager_modes
   def testFibonacci(self):
-    iterator = dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
-        scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))
-    ).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensors(1).repeat(None).apply(
+            scan_ops.scan([0, 1], lambda a, _: ([a[1], a[0] + a[1]], a[1]))))
 
     if context.executing_eagerly():
       next_element = iterator.get_next
@@ -83,6 +85,7 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertEqual(5, self.evaluate(next_element()))
     self.assertEqual(8, self.evaluate(next_element()))
 
+  @test_util.run_deprecated_v1
   def testSparseCount(self):
     def _sparse(i):
       return sparse_tensor.SparseTensorValue(
@@ -96,9 +99,8 @@ class ScanTest(test_base.DatasetTestBase):
     start = array_ops.placeholder(dtypes.int32, shape=[])
     step = array_ops.placeholder(dtypes.int32, shape=[])
     take = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = self._counting_dataset(
-        _sparse(start),
-        make_scan_fn(step)).take(take).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(self._counting_dataset(
+        _sparse(start), make_scan_fn(step)).take(take))
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -110,10 +112,11 @@ class ScanTest(test_base.DatasetTestBase):
                  feed_dict={start: start_val, step: step_val, take: take_val})
         for expected, _ in zip(
             itertools.count(start_val, step_val), range(take_val)):
-          self.assertEqual(expected, sess.run(next_element).values[0])
+          self.assertEqual(expected, self.evaluate(next_element).values[0])
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testChangingStateShape(self):
     # Test the fixed-point shape invariant calculations: start with
     # initial values with known shapes, and use a scan function that
@@ -131,16 +134,16 @@ class ScanTest(test_base.DatasetTestBase):
     self.assertIs(None, dataset.output_shapes[0][1].ndims)
     self.assertEqual([], dataset.output_shapes[1].as_list())
 
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(5):
-        (longer_vector_val, larger_rank_val), _ = sess.run(next_element)
+        (longer_vector_val, larger_rank_val), _ = self.evaluate(next_element)
         self.assertAllEqual([0] * (2**i), longer_vector_val)
         self.assertAllEqual(np.array(1, ndmin=i), larger_rank_val)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testIncorrectStateType(self):
 
@@ -167,6 +170,21 @@ class ScanTest(test_base.DatasetTestBase):
       dataset.apply(
           scan_ops.scan(constant_op.constant(1, dtype=dtypes.int32), _scan_fn))
 
+  def testPreserveCardinality(self):
+
+    def scan_fn(state, val):
+
+      def py_fn(_):
+        raise StopIteration()
+
+      return state, script_ops.py_func(py_fn, [val], dtypes.int64)
+
+    dataset = dataset_ops.Dataset.from_tensors(0).apply(
+        scan_ops.scan(constant_op.constant(1), scan_fn))
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
index 2cfb57590367e586fb4a3195b11b0eee681d9f61..4a2e28f49649ea698e9d426d86dae4bb42cdebf9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/BUILD
@@ -74,7 +74,11 @@ py_test(
     size = "small",
     srcs = ["checkpoint_input_pipeline_hook_test.py"],
     srcs_version = "PY2AND3",
-    tags = ["no_pip"],
+    tags = [
+        "no_pip",
+        "no_windows",
+        "notsan",
+    ],
     deps = [
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -313,6 +317,7 @@ py_test(
     srcs_version = "PY2AND3",
     tags = [
         "no_pip",
+        "no_windows",
         "notap",
     ],
     deps = [
@@ -355,9 +360,13 @@ py_test(
     size = "small",
     srcs = ["matching_files_dataset_serialization_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":dataset_serialization_test_base",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python/data/experimental/ops:matching_files",
         "//tensorflow/python/data/ops:dataset_ops",
         "//third_party/py/numpy",
     ],
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
index 94393d6d4ba98eae72a29883ebef9c6d075c6fec..8cc66d0c29392b206015ad886780d854fb2b5d5c 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/checkpoint_input_pipeline_hook_test.py
@@ -21,17 +21,18 @@ from __future__ import print_function
 from tensorflow.python.data.experimental.ops import iterator_ops
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.estimator import estimator
-from tensorflow.python.estimator import model_fn
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import saver as saver_lib
 from tensorflow.python.training import training_util
+from tensorflow_estimator.python.estimator import estimator
+from tensorflow_estimator.python.estimator import model_fn
 
 
 class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
@@ -68,6 +69,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
   def _build_iterator_saver_hook(self, est):
     return iterator_ops.CheckpointInputPipelineHook(est)
 
+  @test_util.run_deprecated_v1
   def testReturnDatasetFromInputFn(self):
 
     def _input_fn():
@@ -80,6 +82,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
+  @test_util.run_deprecated_v1
   def testBuildIteratorInInputFn(self):
 
     def _input_fn():
@@ -94,6 +97,7 @@ class CheckpointInputPipelineHookTest(test_base.DatasetTestBase):
     est.train(_input_fn, steps=2, hooks=[self._build_iterator_saver_hook(est)])
     self.assertSequenceEqual(self._read_vars(est.model_dir), (4, 3))
 
+  @test_util.run_deprecated_v1
   def testDoNotRestore(self):
 
     def _input_fn():
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
index 7f435b823975ad7a12661d909f37cebae67a0018..bdbd8702b7f8d315a730c5cd2b000218ea5e19be 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/dataset_serialization_test_base.py
@@ -23,6 +23,8 @@ import os
 import numpy as np
 
 from tensorflow.python.data.experimental.ops import iterator_ops as contrib_iterator_ops
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -73,23 +75,39 @@ class DatasetSerializationTestBase(test.TestCase):
     Raises:
       AssertionError if any test fails.
     """
+    # NOTE: We disable all default optimizations in serialization tests in order
+    # to test the actual dataset in question.
+    options = dataset_ops.Options()
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.apply_default_optimizations = False
+
+    def ds_fn1_no_opt():
+      return ds_fn1().with_options(options)
+
     self.verify_unused_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_fully_used_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_exhausted_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_init_before_restore(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_multiple_breaks(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_reset_restored_iterator(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     self.verify_restore_in_empty_graph(
-        ds_fn1, num_outputs, sparse_tensors=sparse_tensors)
+        ds_fn1_no_opt, num_outputs, sparse_tensors=sparse_tensors)
     if ds_fn2:
+
+      def ds_fn2_no_opt():
+        return ds_fn2().with_options(options)
+
       self.verify_restore_in_modified_graph(
-          ds_fn1, ds_fn2, num_outputs, sparse_tensors=sparse_tensors)
+          ds_fn1_no_opt,
+          ds_fn2_no_opt,
+          num_outputs,
+          sparse_tensors=sparse_tensors)
 
   def verify_unused_iterator(self,
                              ds_fn,
@@ -578,7 +596,7 @@ class DatasetSerializationTestBase(test.TestCase):
     return np.linspace(0, num_outputs, num_samples, dtype=int)
 
   def _build_graph(self, ds_fn, sparse_tensors=False):
-    iterator = ds_fn().make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(ds_fn())
 
     saveable = contrib_iterator_ops.make_saveable_from_iterator(iterator)
     ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
index 225f6cbac01adb22383a2d682886e1f4810871c8..e3ba8ad231b5c5c534ebc632b5f6cc6bf62451ff 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/filter_dataset_serialization_test.py
@@ -17,8 +17,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
-
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import sparse_tensor
@@ -35,7 +33,7 @@ class FilterDatasetSerializationTest(
 
   def testFilterCore(self):
     div = 3
-    num_outputs = np.sum([x % 3 != 2 for x in range(100)])
+    num_outputs = sum(x % 3 != 2 for x in range(100))
     self.run_core_tests(lambda: self._build_filter_range_graph(div),
                         lambda: self._build_filter_range_graph(div * 2),
                         num_outputs)
@@ -47,7 +45,7 @@ class FilterDatasetSerializationTest(
                 lambda d: d["foo"] + d["bar"])
 
   def testFilterDictCore(self):
-    num_outputs = np.sum([(x**2) % 2 == 0 for x in range(10)])
+    num_outputs = sum((x**2) % 2 == 0 for x in range(10))
     self.run_core_tests(self._build_filter_dict_graph, None, num_outputs)
 
   def _build_sparse_filter(self):
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
index 166ffa99ca02eabe8b8b30ba6f1fa8ed99d8b45c..8bfe6ce2f30e02c78f4a5b760849b92dd0a8fc65 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/map_and_batch_dataset_serialization_test.py
@@ -22,6 +22,7 @@ import math
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -83,6 +84,19 @@ class MapAndBatchDatasetSerializationTest(
     self.run_core_tests(lambda: build_ds(10, True), lambda: build_ds(15, True),
                         num_outputs_drop_remainder)
 
+  def testSparse(self):
+
+    def build_dataset():
+
+      def map_fn(i):
+        return sparse_tensor.SparseTensorValue(
+            indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+      return dataset_ops.Dataset.range(10).apply(
+          batching.map_and_batch(map_fn, 5))
+
+    self.run_core_tests(build_dataset, None, 2)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py
index 7edb200d2ec33bc41cad61d0da4fb94bbc5d6962..c026e97835ccf32d0801a5f6eb1a49d1173dffed 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/matching_files_dataset_serialization_test.py
@@ -22,7 +22,7 @@ import shutil
 import tempfile
 
 from tensorflow.python.data.experimental.kernel_tests.serialization import dataset_serialization_test_base
-from tensorflow.python.data.ops.dataset_ops import MatchingFilesDataset
+from tensorflow.python.data.experimental.ops import matching_files
 from tensorflow.python.platform import test
 
 
@@ -30,7 +30,7 @@ class MatchingFilesDatasetSerializationTest(
     dataset_serialization_test_base.DatasetSerializationTestBase):
 
   def _build_iterator_graph(self, test_patterns):
-    return MatchingFilesDataset(test_patterns)
+    return matching_files.MatchingFilesDataset(test_patterns)
 
   def testMatchingFilesCore(self):
     tmp_dir = tempfile.mkdtemp()
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
index ef99d01c73ce164265c06bdf08b76ff67a90dd89..34419a314938560818f3a9f4cdd1979a8dbb44d4 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/range_dataset_serialization_test.py
@@ -56,8 +56,8 @@ class RangeDatasetSerializationTest(
   def testSaveRestore(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -71,36 +71,36 @@ class RangeDatasetSerializationTest(
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, _ = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
 
     with ops.Graph().as_default() as g:
       init_op, get_next, _, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
+        self.evaluate(init_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
     # Saving and restoring in same session.
     with ops.Graph().as_default() as g:
       init_op, get_next, save_op, restore_op = _build_graph(start, stop)
       with self.session(graph=g) as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(init_op)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(init_op)
         for i in range(start, break_point):
-          self.assertEqual(i, sess.run(get_next))
-        sess.run(save_op)
-        sess.run(restore_op)
+          self.assertEqual(i, self.evaluate(get_next))
+        self.evaluate(save_op)
+        self.evaluate(restore_op)
         for i in range(break_point, stop):
-          self.assertEqual(i, sess.run(get_next))
+          self.assertEqual(i, self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   def _build_range_dataset(self, start, stop):
     return dataset_ops.Dataset.range(start, stop)
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
index 88d5c896c9fd9710e41026b321daa1fc90a7c66f..12fa0989d0778a6e7734413789fbc8a00390937d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/serialization_integration_test.py
@@ -60,9 +60,9 @@ class SerializationIntegrationTest(test.TestCase):
       init_ops, get_next_ops, saver = self._build_graph(num_pipelines,
                                                         num_outputs)
       with self.session(graph=g) as sess:
-        sess.run(init_ops)
+        self.evaluate(init_ops)
         for _ in range(break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
         saver.save(sess, self._ckpt_path())
@@ -73,7 +73,7 @@ class SerializationIntegrationTest(test.TestCase):
       with self.session(graph=g) as sess:
         saver.restore(sess, self._ckpt_path())
         for _ in range(num_outputs - break_point):
-          output = sess.run(get_next_ops)
+          output = self.evaluate(get_next_ops)
           for i in range(num_pipelines):
             all_outputs[i].append(output[i])
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
index a04f1ddafce2e386b04694adb81061e99c6b8abd..e753a7a15be4ea609ce69568da1c88847bdc5727 100644
--- a/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/serialization/shuffle_dataset_serialization_test.py
@@ -138,9 +138,9 @@ class ShuffleDatasetSerializationTest(
           saver = saver_lib.Saver(allow_empty=True)
           with self.session(graph=g) as sess:
             self._save(sess, saver)
-            expected = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            expected = [self.evaluate(get_next_ops) for _ in range(num_outputs)]
             self._restore(saver, sess)
-            actual = [sess.run(get_next_ops) for _ in range(num_outputs)]
+            actual = [self.evaluate(get_next_ops) for _ in range(num_outputs)]
             self.match(expected, actual)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
index c208963a8612228ecf9ff8b91328a2d02c0d3890..9528f83291f9e4b752a266499e9ec6d7e5239f7d 100644
--- a/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/shuffle_and_repeat_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -34,16 +35,17 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
         shuffle_ops.shuffle_and_repeat(buffer_size=5, count=count, seed=seed))
 
   def _gen_outputs(self, ds_fn, num_outputs, verify_exhausted=True):
-    get_next = ds_fn().make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(ds_fn()).get_next()
     outputs = []
     with self.cached_session() as sess:
       for _ in range(num_outputs):
-        outputs.append(sess.run(get_next))
+        outputs.append(self.evaluate(get_next))
       if verify_exhausted:
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
     return outputs
 
+  @test_util.run_deprecated_v1
   def testCorrectOutput(self):
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertSequenceEqual(
@@ -52,6 +54,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     for i in range(5):
       self.assertSequenceEqual(sorted(output[i * 20:(i + 1) * 20]), range(20))
 
+  @test_util.run_deprecated_v1
   def testReshuffling(self):
     # Check that the output orders of different epochs are indeed different.
     output = self._gen_outputs(lambda: self._build_ds(10), 100)
@@ -60,17 +63,20 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
       epoch2 = output[(i + 1) * 20:(i + 2) * 20]
       self.assertNotEqual(epoch1, epoch2)
 
+  @test_util.run_deprecated_v1
   def testSameOrderForSameSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(10), 100)
     self.assertEqual(output1, output2)
 
+  @test_util.run_deprecated_v1
   def testDifferentOrderForDifferentSeeds(self):
     output1 = self._gen_outputs(lambda: self._build_ds(10), 100)
     output2 = self._gen_outputs(lambda: self._build_ds(20), 100)
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
+  @test_util.run_deprecated_v1
   def testCountNone(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=None), 100, verify_exhausted=False)
@@ -79,6 +85,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
     self.assertNotEqual(output1, output2)
     self.assertEqual(sorted(output1), sorted(output2))
 
+  @test_util.run_deprecated_v1
   def testCountMinusOne(self):
     output1 = self._gen_outputs(
         lambda: self._build_ds(10, count=-1), 100, verify_exhausted=False)
@@ -108,7 +115,7 @@ class ShuffleAndRepeatTest(test_base.DatasetTestBase):
           shuffle_ops.shuffle_and_repeat(buffer_size=21))
       get_next_op = ds.make_one_shot_iterator().get_next()
       with self.session(graph=g) as sess:
-        sess.run(get_next_op)
+        self.evaluate(get_next_op)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
index bf53acc82a8259e04e470ab5e7b87ec3ab00911f..46b22f80b6d5f918624dcc98b894fbc37e0e46bc 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sleep_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.data.experimental.ops import sleep
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 _NUMPY_RANDOM_SEED = 42
@@ -30,22 +31,23 @@ _NUMPY_RANDOM_SEED = 42
 
 class SleepTest(test_base.DatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testSleep(self):
     sleep_microseconds = 100
     dataset = dataset_ops.Dataset.range(10).apply(
         sleep.sleep(sleep_microseconds))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       start_time = time.time()
       for i in range(10):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       end_time = time.time()
       self.assertGreater(end_time - start_time, (10 * sleep_microseconds) / 1e6)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
index a2c11696387ddbf81546765734854897a279adbf..eb66927ee5c73c67325f3764d29d5c8461c05cbb 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test.py
@@ -39,10 +39,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                             "ORDER BY first_name DESC"
             })
         for _ in range(2):  # Dataset is repeated. See setUp.
-          self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-          self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+          self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+          self.assertEqual((b"Jane", b"Moe", b"Hi again!"),
+                           self.evaluate(get_next))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+          self.evaluate(get_next)
 
   # Test that SqlDataset works on a join query.
   def testReadResultSetJoinQuery(self):
@@ -58,9 +59,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ON students.first_name = people.first_name "
                   "AND students.last_name = people.last_name"
           })
-      self.assertEqual((b"John", b"California", b"Hi!"), sess.run(get_next))
+      self.assertEqual((b"John", b"California", b"Hi!"),
+                       self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that SqlDataset can read a database entry with a null-terminator
   # in the middle of the text and place the entry in a `string` tensor.
@@ -75,10 +77,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, favorite_nonsense_word "
                   "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"n\0nsense"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"n\0nsense"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"nonsense\0"),
+                       self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that SqlDataset works when used on two different queries.
   # Because the output types of the dataset must be determined at graph-creation
@@ -93,21 +96,22 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, last_name, motto FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"Hi!"), sess.run(get_next))
-      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"Hi!"), self.evaluate(get_next))
+      self.assertEqual((b"Jane", b"Moe", b"Hi again!"), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
       sess.run(
           init_op,
           feed_dict={
               self.query: "SELECT first_name, last_name, state FROM people "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", b"Doe", b"California"), sess.run(get_next))
+      self.assertEqual((b"John", b"Doe", b"California"),
+                       self.evaluate(get_next))
       self.assertEqual((b"Benjamin", b"Franklin", b"Pennsylvania"),
-                       sess.run(get_next))
+                       self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an `OutOfRangeError` is raised on the first call to
   # `get_next_str_only` if result set is empty.
@@ -122,7 +126,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "WHERE first_name = 'Nonexistent'"
           })
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when `driver_name` is invalid.
   def testReadResultSetWithInvalidDriverName(self):
@@ -151,7 +155,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.UnknownError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when there is a syntax error in `query`.
   def testReadResultSetOfQueryWithSyntaxError(self):
@@ -166,7 +170,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.UnknownError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that an error is raised when the number of columns in `query`
   # does not match the length of `output_types`.
@@ -181,7 +185,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that no results are returned when `query` is an insert query rather
   # than a select query. In particular, the error refers to the number of
@@ -199,7 +203,7 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "VALUES ('Foo', 'Bar', 'Baz'), ('Fizz', 'Buzz', 'Fizzbuzz')"
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int8` tensor.
@@ -212,10 +216,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int8` tensor.
@@ -230,9 +234,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int8` tensor.
@@ -246,11 +250,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT desk_number, favorite_negative_number FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((9, -2), sess.run(get_next))
+      self.assertEqual((9, -2), self.evaluate(get_next))
       # Max and min values of int8
-      self.assertEqual((127, -128), sess.run(get_next))
+      self.assertEqual((127, -128), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int16` tensor.
@@ -263,10 +267,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int16` tensor.
@@ -281,9 +285,9 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students "
                           "WHERE first_name = 'John' ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0, -2), sess.run(get_next))
+      self.assertEqual((b"John", 0, -2), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int16` tensor.
@@ -297,11 +301,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "FROM students ORDER BY first_name DESC"
           })
       # Max value of int16
-      self.assertEqual((b"John", 32767), sess.run(get_next))
+      self.assertEqual((b"John", 32767), self.evaluate(get_next))
       # Min value of int16
-      self.assertEqual((b"Jane", -32768), sess.run(get_next))
+      self.assertEqual((b"Jane", -32768), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in an `int32` tensor.
@@ -314,8 +318,8 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int32` tensor.
@@ -328,10 +332,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int32` tensor.
@@ -345,11 +349,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Max value of int32
-      self.assertEqual((b"John", 2147483647), sess.run(get_next))
+      self.assertEqual((b"John", 2147483647), self.evaluate(get_next))
       # Min value of int32
-      self.assertEqual((b"Jane", -2147483648), sess.run(get_next))
+      self.assertEqual((b"Jane", -2147483648), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a numeric `varchar` from a SQLite database
   # table and place it in an `int32` tensor.
@@ -362,10 +366,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, school_id FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 123), sess.run(get_next))
-      self.assertEqual((b"Jane", 1000), sess.run(get_next))
+      self.assertEqual((b"John", 123), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 1000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in an `int64` tensor.
@@ -378,10 +382,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a negative or 0-valued integer from a
   # SQLite database table and place it in an `int64` tensor.
@@ -394,10 +398,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, income FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 0), sess.run(get_next))
-      self.assertEqual((b"Jane", -20000), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
+      self.assertEqual((b"Jane", -20000), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a large (positive or negative) integer from
   # a SQLite database table and place it in an `int64` tensor.
@@ -412,11 +416,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name DESC"
           })
       # Max value of int64
-      self.assertEqual((b"John", 9223372036854775807), sess.run(get_next))
+      self.assertEqual((b"John", 9223372036854775807), self.evaluate(get_next))
       # Min value of int64
-      self.assertEqual((b"Jane", -9223372036854775808), sess.run(get_next))
+      self.assertEqual((b"Jane", -9223372036854775808), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table and
   # place it in a `uint8` tensor.
@@ -429,10 +433,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read the minimum and maximum uint8 values from a
   # SQLite database table and place them in `uint8` tensors.
@@ -446,11 +450,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint8
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint8
-      self.assertEqual((b"Jane", 255), sess.run(get_next))
+      self.assertEqual((b"Jane", 255), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer from a SQLite database table
   # and place it in a `uint16` tensor.
@@ -463,10 +467,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, desk_number FROM students "
                           "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", 9), sess.run(get_next))
-      self.assertEqual((b"Jane", 127), sess.run(get_next))
+      self.assertEqual((b"John", 9), self.evaluate(get_next))
+      self.assertEqual((b"Jane", 127), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read the minimum and maximum uint16 values from a
   # SQLite database table and place them in `uint16` tensors.
@@ -480,11 +484,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                           "ORDER BY first_name DESC"
           })
       # Min value of uint16
-      self.assertEqual((b"John", 0), sess.run(get_next))
+      self.assertEqual((b"John", 0), self.evaluate(get_next))
       # Max value of uint16
-      self.assertEqual((b"Jane", 65535), sess.run(get_next))
+      self.assertEqual((b"Jane", 65535), self.evaluate(get_next))
     with self.assertRaises(errors.OutOfRangeError):
-      sess.run(get_next)
+      self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a 0-valued and 1-valued integer from a
   # SQLite database table and place them as `True` and `False` respectively
@@ -499,10 +503,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, registration_complete FROM students "
                   "ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", False), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", False), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read an integer that is not 0-valued or 1-valued
   # from a SQLite database table and place it as `True` in a `bool` tensor.
@@ -515,10 +519,10 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
               self.query: "SELECT first_name, favorite_medium_sized_number "
                           "FROM students ORDER BY first_name DESC"
           })
-      self.assertEqual((b"John", True), sess.run(get_next))
-      self.assertEqual((b"Jane", True), sess.run(get_next))
+      self.assertEqual((b"John", True), self.evaluate(get_next))
+      self.assertEqual((b"Jane", True), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table
   # and place it in a `float64` tensor.
@@ -533,10 +537,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "SELECT first_name, last_name, victories FROM townspeople "
                   "ORDER BY first_name"
           })
-      self.assertEqual((b"George", b"Washington", 20.0), sess.run(get_next))
-      self.assertEqual((b"John", b"Adams", -19.95), sess.run(get_next))
+      self.assertEqual((b"George", b"Washington", 20.0),
+                       self.evaluate(get_next))
+      self.assertEqual((b"John", b"Adams", -19.95), self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table beyond
   # the precision of 64-bit IEEE, without throwing an error. Test that
@@ -555,13 +560,13 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
       self.assertEqual(
           (b"George", b"Washington",
            1331241.321342132321324589798264627463827647382647382643874),
-          sess.run(get_next))
+          self.evaluate(get_next))
       self.assertEqual(
           (b"John", b"Adams",
            1331241321342132321324589798264627463827647382647382643874.0),
-          sess.run(get_next))
+          self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
   # Test that `SqlDataset` can read a float from a SQLite database table,
   # representing the largest integer representable as a 64-bit IEEE float
@@ -579,11 +584,11 @@ class SqlDatasetTest(sql_dataset_test_base.SqlDatasetTestBase):
                   "ORDER BY first_name"
           })
       self.assertNotEqual((b"George", b"Washington", 9007199254740992.0),
-                          sess.run(get_next))
+                          self.evaluate(get_next))
       self.assertNotEqual((b"John", b"Adams", 9007199254740991.0),
-                          sess.run(get_next))
+                          self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+        self.evaluate(get_next)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
index 6aaaa90c651ebab7ce5d98371d45a7f64831e883..809e09c80420979b84dc5e4706398f793466a059 100644
--- a/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/sql_dataset_test_base.py
@@ -24,6 +24,7 @@ import sqlite3
 
 from tensorflow.python.data.experimental.ops import readers
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
@@ -35,7 +36,7 @@ class SqlDatasetTestBase(test_base.DatasetTestBase):
   def _createSqlDataset(self, output_types, num_repeats=1):
     dataset = readers.SqlDataset(self.driver_name, self.data_source_name,
                                  self.query, output_types).repeat(num_repeats)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
     return init_op, get_next
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
index 83028937d3649928cefc7d602be3c4e50d0fa1fa..f19b08a2dde821124b6f5065eed4c825afa9f107 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_ops_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.data.experimental.ops import stats_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -45,84 +46,84 @@ def function_set_stats_aggregator(dataset,
 
 def function_apply_options(dataset, aggregator, prefix="", counter_prefix=""):
   options = dataset_ops.Options()
-  options.experimental_stats = stats_options.StatsOptions(aggregator)
+  options.experimental_stats = stats_options.StatsOptions()
+  options.experimental_stats.aggregator = aggregator
+  options.experimental_stats.prefix = prefix
+  options.experimental_stats.counter_prefix = counter_prefix
   options.experimental_stats.latency_all_edges = False
-  if prefix:
-    options.experimental_stats.prefix = prefix
-  if counter_prefix:
-    options.experimental_stats.counter_prefix = counter_prefix
   return dataset.with_options(options)
 
 
 @parameterized.named_parameters(
-    dict(
-        testcase_name="SetStatsAggregator",
-        dataset_transformation=function_set_stats_aggregator),
-    dict(
-        testcase_name="StatsOptions",
-        dataset_transformation=function_apply_options))
+    ("SetStatsAggregator", function_set_stats_aggregator),
+    ("StatsOptions", function_apply_options),
+)
 class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testBytesProduced(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).apply(
             stats_ops.bytes_produced_stats("bytes_produced"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       expected_sum = 0.0
       for i in range(100):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "bytes_produced", float(i + 1))
         expected_sum += i * 8.0
         self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      summary_str = sess.run(summary_t)
+        self.evaluate(next_element)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "bytes_produced", 100.0)
       self._assertSummaryHasSum(summary_str, "bytes_produced", expected_sum)
 
+  @test_util.run_deprecated_v1
   def testLatencyStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(i + 1))
+            self.evaluate(summary_t), "record_latency", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 100.0)
 
+  @test_util.run_deprecated_v1
   def testPrefetchBufferUtilization(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).map(
         lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(-1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                     float(i + 1))
         self._assertSummaryContains(summary_str, "Prefetch::buffer_capacity")
@@ -130,58 +131,63 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         self._assertSummaryHasRange(summary_str, "Prefetch::buffer_utilization",
                                     0, 1)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      summary_str = sess.run(summary_t)
+        self.evaluate(next_element)
+      summary_str = self.evaluate(summary_t)
       self._assertSummaryHasCount(summary_str, "Prefetch::buffer_utilization",
                                   100)
 
+  @test_util.run_deprecated_v1
   def testPrefetchBufferScalars(self, dataset_transformation):
+    def map_fn(x):
+      return array_ops.tile([x], ops.convert_to_tensor([x]))
     aggregator = stats_aggregator.StatsAggregator()
-    dataset = dataset_ops.Dataset.range(10).map(
-        lambda x: array_ops.tile([x], ops.convert_to_tensor([x]))).prefetch(0)
+    dataset = dataset_ops.Dataset.range(10).map(map_fn).prefetch(1)
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(10):
         self.assertAllEqual(
-            np.array([i] * i, dtype=np.int64), sess.run(next_element))
-        summary_str = sess.run(summary_t)
+            np.array([i] * i, dtype=np.int64), self.evaluate(next_element))
+        summary_str = self.evaluate(summary_t)
         self._assertSummaryHasScalarValue(summary_str,
-                                          "Prefetch::buffer_capacity", 0)
+                                          "Prefetch::buffer_capacity", 1)
         self._assertSummaryHasScalarValue(summary_str, "Prefetch::buffer_size",
-                                          0)
+                                          1)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testFilteredElementsStats(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(101).filter(
         lambda x: math_ops.equal(math_ops.mod(x, 3), 0))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(34):
-        self.assertEqual(i * 3, sess.run(next_element))
+        self.assertEqual(i * 3, self.evaluate(next_element))
         if i is not 0:
           self._assertSummaryHasScalarValue(
-              sess.run(summary_t), "Filter::dropped_elements", float(i * 2))
+              self.evaluate(summary_t), "Filter::dropped_elements",
+              float(i * 2))
         self._assertSummaryHasScalarValue(
-            sess.run(summary_t), "Filter::filtered_elements", float(i + 1))
+            self.evaluate(summary_t), "Filter::filtered_elements", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasScalarValue(
-          sess.run(summary_t), "Filter::dropped_elements", 67.0)
+          self.evaluate(summary_t), "Filter::dropped_elements", 67.0)
       self._assertSummaryHasScalarValue(
-          sess.run(summary_t), "Filter::filtered_elements", 34.0)
+          self.evaluate(summary_t), "Filter::filtered_elements", 34.0)
 
+  @test_util.run_deprecated_v1
   def testMapBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -196,6 +202,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
+  @test_util.run_deprecated_v1
   def testMapAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -213,6 +220,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         dataset_transformation,
         function_processing_time=True)
 
+  @test_util.run_deprecated_v1
   def testInterleaveAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -229,6 +237,7 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     self._testParallelCallsStats(dataset_fn, "ParallelInterleaveV2", 10,
                                  dataset_transformation)
 
+  @test_util.run_deprecated_v1
   def testMapAndBatchAutoTuneBufferUtilization(self, dataset_transformation):
 
     def dataset_fn():
@@ -250,104 +259,114 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
         check_elements=False,
         function_processing_time=True)
 
+  @test_util.run_deprecated_v1
   def testReinitialize(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
       for j in range(5):
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for i in range(100):
-          self.assertEqual(i, sess.run(next_element))
+          self.assertEqual(i, self.evaluate(next_element))
           self._assertSummaryHasCount(
-              sess.run(summary_t), "record_latency", float((j * 100) + i + 1))
+              self.evaluate(summary_t), "record_latency",
+              float((j * 100) + i + 1))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", (j + 1) * 100.0)
+            self.evaluate(summary_t), "record_latency", (j + 1) * 100.0)
 
+  @test_util.run_deprecated_v1
   def testNoAggregatorRegistered(self, dataset_transformation):
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testMultipleTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency_2"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(i + 1))
+            self.evaluate(summary_t), "record_latency", float(i + 1))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency_2", float(i + 1))
+            self.evaluate(summary_t), "record_latency_2", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 100.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 100.0)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_latency_2", 100.0)
+          self.evaluate(summary_t), "record_latency_2", 100.0)
 
+  @test_util.run_deprecated_v1
   def testRepeatedTags(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency")).apply(
             stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for i in range(100):
-        self.assertEqual(i, sess.run(next_element))
+        self.assertEqual(i, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
+  @test_util.run_deprecated_v1
   def testMultipleIteratorsSameAggregator(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset = dataset_transformation(dataset, aggregator)
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset.make_initializable_iterator()
+    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
+    iterator_1 = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator_0.get_next() + iterator_1.get_next()
     summary_t = aggregator.get_summary()
 
     with self.cached_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer])
+      self.evaluate([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "record_latency", float(2 * (i + 1)))
+            self.evaluate(summary_t), "record_latency", float(2 * (i + 1)))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self._assertSummaryHasCount(sess.run(summary_t), "record_latency", 200.0)
+        self.evaluate(next_element)
+      self._assertSummaryHasCount(
+          self.evaluate(summary_t), "record_latency", 200.0)
 
+  @test_util.run_deprecated_v1
   def testMultipleDatasetWithPrefixes(self, dataset_transformation):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_ops.Dataset.range(100).apply(
@@ -356,25 +375,25 @@ class StatsDatasetTest(stats_dataset_test_base.StatsDatasetTestBase):
     dataset2 = dataset_ops.Dataset.range(100).apply(
         stats_ops.latency_stats("record_latency"))
     dataset2 = dataset_transformation(dataset2, aggregator, prefix="dataset2")
-    iterator_0 = dataset.make_initializable_iterator()
-    iterator_1 = dataset2.make_initializable_iterator()
+    iterator_0 = dataset_ops.make_initializable_iterator(dataset)
+    iterator_1 = dataset_ops.make_initializable_iterator(dataset2)
     next_element = iterator_0.get_next() + iterator_1.get_next()
     summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run([iterator_0.initializer, iterator_1.initializer])
+      self.evaluate([iterator_0.initializer, iterator_1.initializer])
       for i in range(100):
-        self.assertEqual(i * 2, sess.run(next_element))
+        self.assertEqual(i * 2, self.evaluate(next_element))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "dataset1_record_latency", float(i + 1))
+            self.evaluate(summary_t), "dataset1_record_latency", float(i + 1))
         self._assertSummaryHasCount(
-            sess.run(summary_t), "dataset2_record_latency", float(i + 1))
+            self.evaluate(summary_t), "dataset2_record_latency", float(i + 1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "dataset1_record_latency", 100.0)
+          self.evaluate(summary_t), "dataset1_record_latency", 100.0)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "dataset2_record_latency", 100.0)
+          self.evaluate(summary_t), "dataset2_record_latency", 100.0)
 
 
 @parameterized.named_parameters(
@@ -388,6 +407,7 @@ class FeatureStatsDatasetTest(
     stats_dataset_test_base.StatsDatasetTestBase,
     reader_dataset_ops_test_base.MakeBatchedFeaturesDatasetTestBase):
 
+  @test_util.run_deprecated_v1
   def testFeaturesStats(self, dataset_transformation):
     num_epochs = 5
     total_records = num_epochs * self._num_records
@@ -416,25 +436,26 @@ class FeatureStatsDatasetTest(
 
     dataset = dataset_transformation(
         dataset_fn(), aggregator, prefix="record_stats")
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
     with self.test_session() as sess:
-      sess.run(iterator.initializer)
+      self.evaluate(iterator.initializer)
       for _ in range(num_output):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_stats_features", total_records)
+          self.evaluate(summary_t), "record_stats_features", total_records)
       self._assertSummaryHasCount(
-          sess.run(summary_t), "record_stats_feature-values", total_records)
+          self.evaluate(summary_t), "record_stats_feature-values",
+          total_records)
       self._assertSummaryHasSum(
-          sess.run(summary_t), "record_stats_features", total_records * 4)
+          self.evaluate(summary_t), "record_stats_features", total_records * 4)
       self._assertSummaryHasSum(
-          sess.run(summary_t), "record_stats_feature-values",
+          self.evaluate(summary_t), "record_stats_feature-values",
           self._sum_keywords(1) * num_epochs + 3 * total_records)
 
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
index c5bf9267590b105bcb681455d9488d09451345b9..ab1d1c3028a4ee99b99145c7296b7b0d5b8ea6b9 100644
--- a/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
+++ b/tensorflow/python/data/experimental/kernel_tests/stats_dataset_test_base.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.data.experimental.ops import stats_aggregator
 from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 
 
@@ -93,7 +94,7 @@ class StatsDatasetTestBase(test_base.DatasetTestBase):
     aggregator = stats_aggregator.StatsAggregator()
     dataset = dataset_fn()
     dataset = dataset_transformation(dataset, aggregator)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
     summary_t = aggregator.get_summary()
 
diff --git a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
index 0278a208cbba5c84cb19732172277cf6685d5520..cef5e8d269ce8d4db861b97efc1a75a1dbf2ff8e 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unbatch_test.py
@@ -17,20 +17,18 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import time
 
 from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.client import session
 from tensorflow.python.data.experimental.ops import batching
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import string_ops
@@ -40,20 +38,22 @@ from tensorflow.python.util import compat
 
 class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testUnbatchWithUnknownRankInput(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     dataset = dataset_ops.Dataset.from_tensors(placeholder).apply(
         batching.unbatch())
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_elem = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(iterator.initializer, feed_dict={placeholder: [0, 1, 2, 3]})
       for i in range(4):
-        self.assertEqual(i, sess.run(next_elem))
+        self.assertEqual(i, self.evaluate(next_elem))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_elem)
+        self.evaluate(next_elem)
 
+  @test_util.run_deprecated_v1
   def testUnbatchScalarDataset(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -63,16 +63,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i,) * 3, sess.run(op))
+        self.assertEqual((i,) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithStrings(self):
     data = tuple([math_ops.range(10) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -83,16 +84,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual((i, compat.as_bytes(str(i)), i), sess.run(op))
+        self.assertEqual((i, compat.as_bytes(str(i)), i), self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -102,18 +104,19 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        st_row = sess.run(next_element)
+        st_row = self.evaluate(next_element)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testUnbatchDatasetWithDenseAndSparseTensor(self):
     st = sparse_tensor.SparseTensorValue(
         indices=[[i, i] for i in range(10)],
@@ -123,19 +126,20 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     data = data.batch(5)
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        dense_elem, st_row = sess.run(next_element)
+        dense_elem, st_row = self.evaluate(next_element)
         self.assertEqual(i, dense_elem)
         self.assertEqual([i], st_row.indices)
         self.assertEqual([i], st_row.values)
         self.assertEqual([10], st_row.dense_shape)
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testUnbatchSingleElementTupleDataset(self):
     data = tuple([(math_ops.range(10),) for _ in range(3)])
     data = dataset_ops.Dataset.from_tensor_slices(data)
@@ -145,16 +149,17 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
-        self.assertEqual(((i,),) * 3, sess.run(op))
+        self.assertEqual(((i,),) * 3, self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchMultiElementTupleDataset(self):
     data = tuple([(math_ops.range(10 * i, 10 * i + 10),
                    array_ops.fill([10], "hi")) for i in range(3)])
@@ -165,28 +170,29 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     data = data.apply(batching.unbatch())
     self.assertAllEqual(expected_types, data.output_types)
 
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     op = iterator.get_next()
 
     with self.cached_session() as sess:
       for i in range(10):
         self.assertEqual(((i, b"hi"), (10 + i, b"hi"), (20 + i, b"hi")),
-                         sess.run(op))
+                         self.evaluate(op))
 
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(op)
+        self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def testUnbatchEmpty(self):
     data = dataset_ops.Dataset.from_tensors(
         (constant_op.constant([]), constant_op.constant([], shape=[0, 4]),
          constant_op.constant([], shape=[0, 4, 0])))
     data = data.apply(batching.unbatch())
-    iterator = data.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
   def testUnbatchStaticShapeMismatch(self):
     data = dataset_ops.Dataset.from_tensors((np.arange(7), np.arange(8),
@@ -194,12 +200,13 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       data.apply(batching.unbatch())
 
+  @test_util.run_deprecated_v1
   def testUnbatchDynamicShapeMismatch(self):
     ph1 = array_ops.placeholder(dtypes.int32, shape=[None])
     ph2 = array_ops.placeholder(dtypes.int32, shape=None)
     data = dataset_ops.Dataset.from_tensors((ph1, ph2))
     data = data.apply(batching.unbatch())
-    iterator = data.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(data)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -211,7 +218,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
               ph2: np.arange(8).astype(np.int32)
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
       # No 0th dimension (i.e. scalar value) for one component.
       sess.run(
@@ -221,79 +228,7 @@ class UnbatchTest(test_base.DatasetTestBase, parameterized.TestCase):
               ph2: 7
           })
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(next_element)
-
-
-class UnbatchBenchmark(test.Benchmark):
-
-  def benchmarkNativeUnbatch(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.apply(batching.unbatch())
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (native) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_native_batch_size_%d" %
-              batch_size)
-
-  # Include a benchmark of the previous `unbatch()` implementation that uses
-  # a composition of more primitive ops. Eventually we'd hope to generate code
-  # that is as good in both cases.
-  def benchmarkOldUnbatchImplementation(self):
-    batch_sizes = [1, 2, 5, 10, 20, 50]
-    elems_per_trial = 10000
-    with ops.Graph().as_default():
-      dataset = dataset_ops.Dataset.from_tensors("element").repeat(None)
-      batch_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-      dataset = dataset.batch(batch_size_placeholder)
-      dataset = dataset.flat_map(dataset_ops.Dataset.from_tensor_slices)
-      dataset = dataset.skip(elems_per_trial)
-      iterator = dataset.make_initializable_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for batch_size in batch_sizes:
-          deltas = []
-          for _ in range(5):
-            sess.run(
-                iterator.initializer,
-                feed_dict={batch_size_placeholder: batch_size})
-            start = time.time()
-            sess.run(next_element.op)
-            end = time.time()
-            deltas.append((end - start) / elems_per_trial)
-
-          median_wall_time = np.median(deltas)
-          print("Unbatch (unfused) batch size: %d Median wall time per element:"
-                " %f microseconds" % (batch_size, median_wall_time * 1e6))
-          self.report_benchmark(
-              iters=10000,
-              wall_time=median_wall_time,
-              name="benchmark_unbatch_dataset_unfused_batch_size_%d" %
-              batch_size)
+        self.evaluate(next_element)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/experimental/kernel_tests/unique_test.py b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
index 847cff26b0d047f852658344529750b908250a19..1d9941d7f4d0729e5e0f62ebbac80d0d4d385f59 100644
--- a/tensorflow/python/data/experimental/kernel_tests/unique_test.py
+++ b/tensorflow/python/data/experimental/kernel_tests/unique_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -43,20 +44,21 @@ class UniqueTest(test_base.DatasetTestBase):
     current_test_case = []
     dataset = dataset_ops.Dataset.from_generator(lambda: current_test_case,
                                                  dtype).apply(unique.unique())
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
       for test_case, expected in test_cases:
         current_test_case = test_case
-        sess.run(iterator.initializer)
+        self.evaluate(iterator.initializer)
         for element in expected:
           if dtype == dtypes.string:
             element = compat.as_bytes(element)
-          self.assertAllEqual(element, sess.run(next_element))
+          self.assertAllEqual(element, self.evaluate(next_element))
         with self.assertRaises(errors.OutOfRangeError):
-          sess.run(next_element)
+          self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testSimpleInt(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       self._testSimpleHelper(dtype, [
@@ -69,6 +71,7 @@ class UniqueTest(test_base.DatasetTestBase):
           ([[1, 1], [1, 1], [2, 2], [3, 3], [1, 1]], [[1, 1], [2, 2], [3, 3]]),
       ])
 
+  @test_util.run_deprecated_v1
   def testSimpleString(self):
     self._testSimpleHelper(dtypes.string, [
         ([], []),
diff --git a/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c734b65e056df954a8597ab6f23489353cc057b
--- /dev/null
+++ b/tensorflow/python/data/experimental/kernel_tests/wrap_unwrap_test.py
@@ -0,0 +1,69 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Wrapping / Unwrapping dataset variants."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.platform import test
+
+
+class WrapDatasetVariantTest(test_base.DatasetTestBase):
+
+  def testBasic(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(wrapped_variant)
+
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    iterator = dataset_ops.make_initializable_iterator(variant_ds)
+    get_next = iterator.get_next()
+
+    with self.cached_session():
+      self.evaluate(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(get_next))
+
+  def testGPU(self):
+    ds = dataset_ops.Dataset.range(100)
+    ds_variant = ds._as_variant_tensor()  # pylint: disable=protected-access
+    wrapped_variant = gen_dataset_ops.wrap_dataset_variant(ds_variant)
+
+    with ops.device("/gpu:0"):
+      gpu_wrapped_variant = array_ops.identity(wrapped_variant)
+
+    unwrapped_variant = gen_dataset_ops.unwrap_dataset_variant(
+        gpu_wrapped_variant)
+    variant_ds = dataset_ops._VariantDataset(unwrapped_variant,
+                                             ds._element_structure)
+    iterator = dataset_ops.make_initializable_iterator(variant_ds)
+    get_next = iterator.get_next()
+
+    with self.cached_session():
+      self.evaluate(iterator.initializer)
+      for i in range(100):
+        self.assertEqual(i, self.evaluate(get_next))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/experimental/ops/BUILD b/tensorflow/python/data/experimental/ops/BUILD
index 170fda90b68a05c7732ce607e26ef06b1a82528c..60c20e0bcf2d875a15ffcc4c42d10cb6e0cc25ea 100644
--- a/tensorflow/python/data/experimental/ops/BUILD
+++ b/tensorflow/python/data/experimental/ops/BUILD
@@ -4,6 +4,16 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+py_library(
+    name = "cardinality",
+    srcs = ["cardinality.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:experimental_dataset_ops_gen",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "counter",
     srcs = ["counter.py"],
@@ -54,14 +64,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:random_seed",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -125,6 +134,7 @@ py_library(
         "//tensorflow/python/data/util:convert",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
         "//third_party/py/numpy",
     ],
 )
@@ -139,6 +149,18 @@ py_library(
     ],
 )
 
+py_library(
+    name = "filter_for_shard_ops",
+    srcs = ["filter_for_shard_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:ops",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "error_ops",
     srcs = ["error_ops.py"],
@@ -165,7 +187,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -188,6 +210,30 @@ py_library(
     ],
 )
 
+py_library(
+    name = "map_defun",
+    srcs = ["map_defun.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+    ],
+)
+
+py_library(
+    name = "matching_files",
+    srcs = ["matching_files.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
+    ],
+)
+
 py_library(
     name = "optimization",
     srcs = ["optimization.py"],
@@ -202,29 +248,28 @@ py_library(
 )
 
 py_library(
-    name = "parsing_ops",
-    srcs = ["parsing_ops.py"],
+    name = "optimization_options",
+    srcs = ["optimization_options.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
     ],
 )
 
 py_library(
-    name = "map_defun",
-    srcs = ["map_defun.py"],
+    name = "parsing_ops",
+    srcs = ["parsing_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -253,12 +298,13 @@ py_library(
     srcs = ["scan_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
         "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -303,6 +349,18 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":stats_aggregator",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
+    ],
+)
+
+py_library(
+    name = "threading_options",
+    srcs = ["threading_options.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/util:options",
     ],
 )
 
@@ -313,9 +371,8 @@ py_library(
     deps = [
         "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/eager:context",
     ],
 )
@@ -378,14 +435,17 @@ py_library(
     name = "dataset_ops",
     deps = [
         ":batching",
+        ":cardinality",
         ":counter",
         ":enumerate_ops",
         ":error_ops",
+        ":filter_for_shard_ops",
         ":get_single_element",
         ":grouping",
         ":indexed_dataset_ops",
         ":interleave_ops",
         ":map_defun",
+        ":matching_files",
         ":optimization",
         ":prefetching_ops",
         ":readers",
diff --git a/tensorflow/python/data/experimental/ops/batching.py b/tensorflow/python/data/experimental/ops/batching.py
index d8985fd13bf3e976764654f83cf02eb464254d18..29df98f4ea4c90d80f3518684febacc101ec2ba5 100644
--- a/tensorflow/python/data/experimental/ops/batching.py
+++ b/tensorflow/python/data/experimental/ops/batching.py
@@ -24,17 +24,18 @@ from tensorflow.python.data.experimental.ops import grouping
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -364,23 +365,19 @@ class _UnbatchDataset(dataset_ops.UnaryDataset):
                          "different batch sizes.")
     self._input_dataset = input_dataset
 
+    self._structure = structure.convert_legacy_structure(
+        input_dataset.output_types,
+        nest.map_structure(lambda s: s[1:], input_dataset.output_shapes),
+        input_dataset.output_classes)
+
   def _as_variant_tensor(self):
-    return gen_dataset_ops.unbatch_dataset(
+    return ged_ops.experimental_unbatch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda s: s[1:],
-                              self._input_dataset.output_shapes)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.unbatch")
@@ -408,21 +405,19 @@ def unbatch():
 
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
-    if not sparse.any_sparse(dataset.output_classes):
-      return _UnbatchDataset(dataset)
-
     # NOTE(mrry): We must ensure that any SparseTensors in `dataset`
     # are normalized to the rank-1 dense representation, so that the
     # sparse-oblivious unbatching logic will slice them
     # appropriately. This leads to a somewhat inefficient re-encoding step
     # for all SparseTensor components.
-    # TODO(mrry): Consider optimizing this in future
-    # if it turns out to be a bottleneck.
+    # TODO(mrry): Consider optimizing this in future if it turns out to be
+    # a bottleneck.
     def normalize(arg, *rest):
+      # pylint: disable=protected-access
       if rest:
-        return sparse.serialize_many_sparse_tensors((arg,) + rest)
+        return dataset._element_structure._to_batched_tensor_list((arg,) + rest)
       else:
-        return sparse.serialize_many_sparse_tensors(arg)
+        return dataset._element_structure._to_batched_tensor_list(arg)
 
     normalized_dataset = dataset.map(normalize)
 
@@ -453,25 +448,20 @@ class _DenseToSparseBatchDataset(dataset_ops.UnaryDataset):
     self._input_dataset = input_dataset
     self._batch_size = batch_size
     self._row_shape = row_shape
+    self._structure = structure.SparseTensorStructure(
+        input_dataset.output_types,
+        tensor_shape.vector(None).concatenate(self._row_shape))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.dense_to_sparse_batch_dataset(
+    return ged_ops.experimental_dense_to_sparse_batch_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._batch_size,
         row_shape=convert.partial_shape_to_tensor(self._row_shape),
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return sparse_tensor.SparseTensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.vector(None).concatenate(self._row_shape)
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class _RestructuredDataset(dataset_ops.UnaryDataset):
@@ -522,13 +512,10 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
             "Dataset with output types %r cannot be restructured to have "
             "output types %r" % (dataset.output_types, output_types))
 
-    self._output_types = output_types
-
     if output_shapes is None:
       # Inherit shapes from the original `dataset`.
-      self._output_shapes = nest.pack_sequence_as(output_types,
-                                                  nest.flatten(
-                                                      dataset.output_shapes))
+      output_shapes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_shapes))
     else:
       if not allow_unsafe_cast:
         # Validate that the shapes are compatible.
@@ -543,39 +530,34 @@ class _RestructuredDataset(dataset_ops.UnaryDataset):
                 "Dataset with output shapes %r cannot be restructured to have "
                 "incompatible output shapes %r" % (dataset.output_shapes,
                                                    output_shapes))
-      self._output_shapes = nest.map_structure_up_to(
+      output_shapes = nest.map_structure_up_to(
           output_types, tensor_shape.as_shape, output_shapes)
     if output_classes is None:
       # Inherit class types from the original `dataset`.
-      self._output_classes = nest.pack_sequence_as(output_types,
-                                                   nest.flatten(
-                                                       dataset.output_classes))
-    else:
-      self._output_classes = output_classes
+      output_classes = nest.pack_sequence_as(
+          output_types, nest.flatten(dataset.output_classes))
+
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
     return self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
 
-
-class _MapAndBatchDataset(dataset_ops.MapDataset):
+class _MapAndBatchDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that maps a function over a batch of elements."""
 
   def __init__(self, input_dataset, map_func, batch_size, num_parallel_calls,
                drop_remainder):
     """See `Dataset.map()` for details."""
-    super(_MapAndBatchDataset, self).__init__(input_dataset, map_func)
+    super(_MapAndBatchDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func, "tf.data.experimental.map_and_batch()", dataset=input_dataset)
     self._batch_size_t = ops.convert_to_tensor(
         batch_size, dtype=dtypes.int64, name="batch_size")
     self._num_parallel_calls_t = ops.convert_to_tensor(
@@ -583,36 +565,33 @@ class _MapAndBatchDataset(dataset_ops.MapDataset):
     self._drop_remainder_t = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-    self._batch_size = batch_size
-    self._drop_remainder = drop_remainder
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder_t)
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = self._map_func.output_structure._batch(  # pylint: disable=protected-access
+          tensor_util.constant_value(self._batch_size_t))
+    else:
+      self._structure = self._map_func.output_structure._batch(None)  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    input_resource = self._input_dataset._as_variant_tensor()
-    return gen_dataset_ops.map_and_batch_dataset_v2(
-        input_resource,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+    return ged_ops.experimental_map_and_batch_dataset(
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         batch_size=self._batch_size_t,
         num_parallel_calls=self._num_parallel_calls_t,
         drop_remainder=self._drop_remainder_t,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
-    # pylint: enable=protected-access
-
-  @property
-  def output_shapes(self):
-    dim = self._batch_size if self._drop_remainder else None
-    return nest.pack_sequence_as(self._output_shapes, [
-        tensor_shape.vector(dim).concatenate(s)
-        for s in nest.flatten(self._output_shapes)
-    ])
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  def _transformation_name(self):
-    return "tf.data.experimental.map_and_batch()"
+  def _element_structure(self):
+    return self._structure
 
 
 @tf_export("data.experimental.map_and_batch")
@@ -644,9 +623,10 @@ def map_and_batch(map_func,
       whether the last batch should be dropped in case its size is smaller than
       desired; the default behavior is not to drop the smaller batch.
     num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
-        representing the number of elements to process in parallel. If not
-        specified, `batch_size * num_parallel_batches` elements will be
-        processed in parallel.
+      representing the number of elements to process in parallel. If not
+      specified, `batch_size * num_parallel_batches` elements will be processed
+      in parallel. If the value `tf.data.experimental.AUTOTUNE` is used, then
+      the number of parallel calls is set dynamically based on available CPU.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/cardinality.py b/tensorflow/python/data/experimental/ops/cardinality.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cf0a8801e8339f233eb61c8e0b1223b8b94358b
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/cardinality.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cardinality analysis of `Dataset` objects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+INFINITE = -1
+UNKNOWN = -2
+tf_export("data.experimental.INFINITE_CARDINALITY").export_constant(
+    __name__, "INFINITE")
+tf_export("data.experimental.UNKNOWN_CARDINALITY").export_constant(
+    __name__, "UNKNOWN")
+
+
+@tf_export("data.experimental.cardinality")
+def cardinality(dataset):
+  """Returns the cardinality of `dataset`, if known.
+
+  The operation returns the cardinality of `dataset`. The operation may return
+  `tf.data.experimental.INFINITE_CARDINALITY` if `dataset` contains an infinite
+  number of elements or `tf.data.experimental.UNKNOWN_CARDINALITY` if the
+  analysis fails to determine the number of elements in `dataset` (e.g. when the
+  dataset source is a file).
+
+  Args:
+    dataset: A `tf.data.Dataset` for which to determine cardinality.
+
+  Returns:
+    A scalar `tf.int64` `Tensor` representing the cardinality of `dataset`. If
+    the cardinality is infinite or unknown, the operation returns the named
+    constant `INFINITE_CARDINALITY` and `UNKNOWN_CARDINALITY` respectively.
+  """
+  return ged_ops.experimental_dataset_cardinality(dataset._as_variant_tensor())  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/experimental/ops/counter.py b/tensorflow/python/data/experimental/ops/counter.py
index 42200eaef9cb078afa0a9f598b6fa21e5e91f04b..652eb9d002992a737f3f8f0018db3a7316d0091e 100644
--- a/tensorflow/python/data/experimental/ops/counter.py
+++ b/tensorflow/python/data/experimental/ops/counter.py
@@ -25,8 +25,8 @@ from tensorflow.python.framework import ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.Counter")
-def Counter(start=0, step=1, dtype=dtypes.int64):
+@tf_export("data.experimental.Counter", v1=[])
+def CounterV2(start=0, step=1, dtype=dtypes.int64):
   """Creates a `Dataset` that counts from `start` in steps of size `step`.
 
   For example:
@@ -53,3 +53,13 @@ def Counter(start=0, step=1, dtype=dtypes.int64):
     step = ops.convert_to_tensor(step, dtype=dtype, name="step")
     return dataset_ops.Dataset.from_tensors(0).repeat(None).apply(
         scan_ops.scan(start, lambda state, _: (state + step, state)))
+
+
+@tf_export(v1=["data.experimental.Counter"])
+def CounterV1(start=0, step=1, dtype=dtypes.int64):
+  return dataset_ops.DatasetV1Adapter(CounterV2(start, step, dtype))
+CounterV1.__doc__ = CounterV2.__doc__
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Counter = CounterV1  # pylint: disable=invalid-name
diff --git a/tensorflow/python/data/experimental/ops/enumerate_ops.py b/tensorflow/python/data/experimental/ops/enumerate_ops.py
index a1af98f552c8e68f458f3e9ab33ff29bc53e6136..04d875c7af238930a673fe744b3912f6ba44b5d2 100644
--- a/tensorflow/python/data/experimental/ops/enumerate_ops.py
+++ b/tensorflow/python/data/experimental/ops/enumerate_ops.py
@@ -26,9 +26,9 @@ from tensorflow.python.util.tf_export import tf_export
 
 @tf_export("data.experimental.enumerate_dataset")
 def enumerate_dataset(start=0):
-  """A transformation that enumerate the elements of a dataset.
+  """A transformation that enumerates the elements of a dataset.
 
-  It is Similar to python's `enumerate`.
+  It is similar to python's `enumerate`.
   For example:
 
   ```python
@@ -44,8 +44,8 @@ def enumerate_dataset(start=0):
   ```
 
   Args:
-    start: A `tf.int64` scalar `tf.Tensor`, representing the start
-      value for enumeration.
+    start: A `tf.int64` scalar `tf.Tensor`, representing the start value for
+      enumeration.
 
   Returns:
     A `Dataset` transformation function, which can be passed to
diff --git a/tensorflow/python/data/experimental/ops/error_ops.py b/tensorflow/python/data/experimental/ops/error_ops.py
index 82e274b70c5b703c62dcc143df371fae3d80065e..879b13ce092f20c2a6cfc911ba4c6e11992e23a8 100644
--- a/tensorflow/python/data/experimental/ops/error_ops.py
+++ b/tensorflow/python/data/experimental/ops/error_ops.py
@@ -52,7 +52,7 @@ def ignore_errors():
   return _apply_fn
 
 
-class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
+class _IgnoreErrorsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that silently ignores errors when computing its input."""
 
   def __init__(self, input_dataset):
@@ -64,15 +64,3 @@ class _IgnoreErrorsDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_ignore_errors_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..91d3dca3e9a883cf5eeacb368bbbf1af4420f3a1
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/filter_for_shard_ops.py
@@ -0,0 +1,106 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Naive shard dataset transformation."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.filter_for_shard")
+def filter_for_shard(num_shards, shard_index):
+  """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+  This dataset operator is very useful when running distributed training, as
+  it allows each worker to read a unique subset.
+
+  When reading a single input file, you can skip elements as follows:
+
+  ```python
+  d = tf.data.TFRecordDataset(FLAGS.input_file)
+  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
+                                               FLAGS.worker_index))
+  d = d.repeat(FLAGS.num_epochs)
+  d = d.shuffle(FLAGS.shuffle_buffer_size)
+  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+  ```
+
+  Important caveats:
+
+  - Be sure to shard before you use any randomizing operator (such as
+    shuffle).
+  - Generally it is best if the shard operator is used early in the dataset
+    pipeline. For example, when reading from a set of TFRecord files, shard
+    before converting the dataset to input samples. This avoids reading every
+    file on every worker. The following is an example of an efficient
+    sharding strategy within a complete pipeline:
+
+  ```python
+  d = Dataset.list_files(FLAGS.pattern)
+  d = d.apply(tf.data.experimental.naive_shard(FLAGS.num_workers,
+                                               FLAGS.worker_index))
+  d = d.repeat(FLAGS.num_epochs)
+  d = d.shuffle(FLAGS.shuffle_buffer_size)
+  d = d.interleave(tf.data.TFRecordDataset,
+                   cycle_length=FLAGS.num_readers, block_length=1)
+  d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+  ```
+
+  Args:
+    num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+      shards operating in parallel.
+    shard_index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+
+  Raises:
+    ValueError: if `num_shards` or `shard_index` are illegal values. Note: error
+      checking is done on a best-effort basis, and errors aren't guaranteed to
+      be caught upon dataset creation. (e.g. providing in a placeholder tensor
+      bypasses the early checking, and will instead result in an error during
+      a session.run call.)
+  """
+  num_shards = ops.convert_to_tensor(
+      num_shards, name="num_shards", dtype=dtypes.int64)
+  num_shards_static = tensor_util.constant_value(num_shards)
+  shard_index = ops.convert_to_tensor(shard_index, name="shard_index",
+                                      dtype=dtypes.int64)
+  shard_index_static = tensor_util.constant_value(shard_index)
+
+  if num_shards_static is not None and num_shards_static < 1:
+    raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
+  if shard_index_static is not None and shard_index_static < 0:
+    raise ValueError("shard_index must be >= 0; got: %s" % shard_index_static)
+  if (shard_index_static is not None and num_shards_static is not None and
+      shard_index_static >= num_shards_static):
+    raise ValueError("shard_index must be < num_shards; %s is not < %s" %
+                     (shard_index_static, num_shards_static))
+
+  def filter_fn(elem_index, _):
+    mod_result = math_ops.mod(elem_index, num_shards)
+    return math_ops.equal(mod_result, shard_index)
+
+  def _apply_fn(dataset):
+    # pylint: disable=protected-access
+    return dataset._enumerate().filter(filter_fn).map(lambda _, elem: elem)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/get_single_element.py b/tensorflow/python/data/experimental/ops/get_single_element.py
index 132526166cfe49e267b1569b9e7851c8256234dd..d649a0701270c55d399af140f5e2bae79484fec2 100644
--- a/tensorflow/python/data/experimental/ops/get_single_element.py
+++ b/tensorflow/python/data/experimental/ops/get_single_element.py
@@ -18,8 +18,6 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -60,13 +58,11 @@ def get_single_element(dataset):
     InvalidArgumentError (at runtime): if `dataset` does not contain exactly
       one element.
   """
-  if not isinstance(dataset, dataset_ops.Dataset):
+  if not isinstance(dataset, dataset_ops.DatasetV2):
     raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
 
-  nested_ret = nest.pack_sequence_as(
-      dataset.output_types, gen_dataset_ops.dataset_to_single_element(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+  # pylint: disable=protected-access
+  return dataset._element_structure._from_compatible_tensor_list(
+      gen_dataset_ops.dataset_to_single_element(
+          dataset._as_variant_tensor(),
           **dataset_ops.flat_structure(dataset)))
-  return sparse.deserialize_sparse_tensors(
-      nested_ret, dataset.output_types, dataset.output_shapes,
-      dataset.output_classes)
diff --git a/tensorflow/python/data/experimental/ops/grouping.py b/tensorflow/python/data/experimental/ops/grouping.py
index 026867d405fc47f12ae251e851bf8669ad29d7d1..ef6b232429b872016842bcf513a851445b4d8a5e 100644
--- a/tensorflow/python/data/experimental/ops/grouping.py
+++ b/tensorflow/python/data/experimental/ops/grouping.py
@@ -21,13 +21,14 @@ import numpy as np
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -236,29 +237,6 @@ def bucket_by_sequence_length(element_length_func,
     return _apply_fn
 
 
-def _map_x_dataset(map_func):
-  """A transformation that maps `map_func` across its input.
-
-  This transformation is similar to `tf.data.Dataset.map`, but in addition to
-  supporting dense and sparse tensor inputs, it also supports dataset inputs.
-
-  Args:
-    map_func: A function mapping a nested structure of tensors and/or datasets
-      (having shapes and types defined by `self.output_shapes` and
-     `self.output_types`) to another nested structure of tensors and/or
-     datasets.
-
-  Returns:
-    Dataset: A `Dataset`.
-  """
-
-  def _apply_fn(dataset):
-    """Function from `Dataset` to `Dataset` that applies the transformation."""
-    return _MapXDataset(dataset, map_func)
-
-  return _apply_fn
-
-
 class _GroupByReducerDataset(dataset_ops.UnaryDataset):
   """A `Dataset` that groups its input and performs a reduction."""
 
@@ -275,50 +253,44 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping defun for key_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
         key_func, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 tensor. "
           "Got type=%s and shape=%s"
-          % (wrapped_func.output_types, wrapped_func.output_shapes))
-    self._key_func = wrapped_func.function
-
+          % (self._key_func.output_types, self._key_func.output_shapes))
   def _make_init_func(self, init_func):
     """Make wrapping defun for init_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._init_func = dataset_ops.StructuredFunctionWrapper(
         init_func,
         self._transformation_name(),
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    self._init_func = wrapped_func.function
-    self._state_classes = wrapped_func.output_classes
-    self._state_shapes = wrapped_func.output_shapes
-    self._state_types = wrapped_func.output_types
+        input_structure=structure.TensorStructure(dtypes.int64, []))
 
   def _make_reduce_func(self, reduce_func, input_dataset):
     """Make wrapping defun for reduce_func."""
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `self._state_structure`.
+    self._state_structure = self._init_func.output_structure
+    state_types = self._init_func.output_types
+    state_shapes = self._init_func.output_shapes
+    state_classes = self._init_func.output_classes
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           reduce_func,
           self._transformation_name(),
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       for new_state_class, state_class in zip(
           nest.flatten(wrapped_func.output_classes),
-          nest.flatten(self._state_classes)):
+          nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
@@ -327,16 +299,15 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
 
       # Extract and validate type information from the returned values.
       for new_state_type, state_type in zip(
-          nest.flatten(wrapped_func.output_types),
-          nest.flatten(self._state_types)):
+          nest.flatten(wrapped_func.output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, wrapped_func.output_types))
+              (self._init_func.output_types, wrapped_func.output_types))
 
       # Extract shape information from the returned values.
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(wrapped_func.output_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -353,48 +324,40 @@ class _GroupByReducerDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        state_shapes = nest.pack_sequence_as(
+            self._init_func.output_shapes, weakened_state_shapes)
+        self._state_structure = structure.convert_legacy_structure(
+            state_types, state_shapes, state_classes)
 
-    self._reduce_func = wrapped_func.function
-    self._reduce_func.add_to_graph(ops.get_default_graph())
+    self._reduce_func = wrapped_func
+    self._reduce_func.function.add_to_graph(ops.get_default_graph())
 
   def _make_finalize_func(self, finalize_func):
     """Make wrapping defun for finalize_func."""
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        finalize_func,
-        self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_func.function
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
+    self._finalize_func = dataset_ops.StructuredFunctionWrapper(
+        finalize_func, self._transformation_name(),
+        input_structure=self._state_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._finalize_func.output_structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [
+        self._key_func, self._init_func, self._reduce_func, self._finalize_func
+    ]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_reducer_dataset(
+    return ged_ops.experimental_group_by_reducer_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._init_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        key_func=self._key_func,
-        init_func=self._init_func,
-        reduce_func=self._reduce_func,
-        finalize_func=self._finalize_func,
+        self._key_func.function.captured_inputs,
+        self._init_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        init_func=self._init_func.function,
+        reduce_func=self._reduce_func.function,
+        finalize_func=self._finalize_func.function,
         **dataset_ops.flat_structure(self))
 
   def _transformation_name(self):
@@ -419,72 +382,59 @@ class _GroupByWindowDataset(dataset_ops.UnaryDataset):
 
     def window_size_func_wrapper(key):
       return ops.convert_to_tensor(window_size_func(key), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._window_size_func = dataset_ops.StructuredFunctionWrapper(
         window_size_func_wrapper,
         self._transformation_name(),
-        input_classes=ops.Tensor,
-        input_shapes=tensor_shape.scalar(),
-        input_types=dtypes.int64)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+        input_structure=structure.TensorStructure(dtypes.int64, []))
+    if not self._window_size_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`window_size_func` must return a single tf.int64 scalar tensor.")
-    self._window_size_func = wrapped_func.function
 
   def _make_key_func(self, key_func, input_dataset):
     """Make wrapping defun for key_func."""
 
     def key_func_wrapper(*args):
       return ops.convert_to_tensor(key_func(*args), dtype=dtypes.int64)
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
+    self._key_func = dataset_ops.StructuredFunctionWrapper(
         key_func_wrapper, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.int64 and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not self._key_func.output_structure.is_compatible_with(
+        structure.TensorStructure(dtypes.int64, [])):
       raise ValueError(
           "`key_func` must return a single tf.int64 scalar tensor.")
-    self._key_func = wrapped_func.function
 
   def _make_reduce_func(self, reduce_func, input_dataset):
     """Make wrapping defun for reduce_func."""
-    nested_dataset = dataset_ops._NestedDatasetComponent(input_dataset)  # pylint: disable=protected-access
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        reduce_func,
-        self._transformation_name(),
-        input_classes=(ops.Tensor, nested_dataset),
-        input_shapes=(tensor_shape.scalar(), nested_dataset),
-        input_types=(dtypes.int64, nested_dataset),
-        experimental_nested_dataset_support=True)
+    nested_dataset = dataset_ops.DatasetStructure(
+        input_dataset._element_structure)  # pylint: disable=protected-access
+    input_structure = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []), nested_dataset))
+    self._reduce_func = dataset_ops.StructuredFunctionWrapper(
+        reduce_func, self._transformation_name(),
+        input_structure=input_structure)
     if not isinstance(
-        wrapped_func.output_classes, dataset_ops._NestedDatasetComponent):  # pylint: disable=protected-access
+        self._reduce_func.output_structure, dataset_ops.DatasetStructure):
       raise TypeError("`reduce_func` must return a `Dataset` object.")
-    self._output_classes = wrapped_func.output_classes.output_classes
-    self._output_types = wrapped_func.output_types.output_types
-    self._output_shapes = wrapped_func.output_shapes.output_shapes
-    self._reduce_func = wrapped_func.function
+    # pylint: disable=protected-access
+    self._structure = (
+        self._reduce_func.output_structure._element_structure)
 
   @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _functions(self):
+    return [self._key_func, self._reduce_func, self._window_size_func]
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.group_by_window_dataset(
+    return ged_ops.experimental_group_by_window_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._key_func.captured_inputs,
-        self._reduce_func.captured_inputs,
-        self._window_size_func.captured_inputs,
-        key_func=self._key_func,
-        reduce_func=self._reduce_func,
-        window_size_func=self._window_size_func,
+        self._key_func.function.captured_inputs,
+        self._reduce_func.function.captured_inputs,
+        self._window_size_func.function.captured_inputs,
+        key_func=self._key_func.function,
+        reduce_func=self._reduce_func.function,
+        window_size_func=self._window_size_func.function,
         **dataset_ops.flat_structure(self))
 
   def _transformation_name(self):
@@ -517,45 +467,3 @@ class Reducer(object):
   @property
   def finalize_func(self):
     return self._finalize_func
-
-
-class _MapXDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` that maps a function over elements in its input."""
-
-  def __init__(self, input_dataset, map_func):
-    """See `map_x_dataset()` for details."""
-    super(_MapXDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-
-    wrapped_func = dataset_ops.StructuredFunctionWrapper(
-        map_func,
-        self._transformation_name(),
-        dataset=input_dataset,
-        experimental_nested_dataset_support=True)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
-
-  def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.map_dataset(
-        input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
-        **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  def _transformation_name(self):
-    return "tf.data.experimental.map_x_dataset()"
diff --git a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
index 9c06474a2f8076d3ded5fd798665ea05930ecfe5..fdf3692420b1943db0b4ff0de826e6203593e2c7 100644
--- a/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
+++ b/tensorflow/python/data/experimental/ops/indexed_dataset_ops.py
@@ -22,9 +22,9 @@ import abc
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 
 
@@ -65,6 +65,7 @@ class MaterializedIndexedDataset(object):
             sparse.as_dense_types(self._output_shapes, self._output_classes)))
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IndexedDataset(dataset_ops.Dataset):
   """IndexedDataset is highly experimental!
   """
@@ -93,11 +94,7 @@ class IndexedDataset(dataset_ops.Dataset):
         ged_ops.experimental_materialized_index_dataset_handle(
             container=container,
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(self.output_types, self.output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_types(self.output_shapes,
-                                      self.output_classes))))
+            **dataset_ops.flat_structure(self)))
 
     with ops.colocate_with(materialized_resource):
       materializer = ged_ops.experimental_indexed_dataset_materialize(
@@ -106,38 +103,6 @@ class IndexedDataset(dataset_ops.Dataset):
                                       self.output_classes, self.output_types,
                                       self.output_shapes)
 
-  @abc.abstractproperty
-  def output_types(self):
-    """Returns the type of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.DType` objects corresponding to each component
-      of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_types")
-
-  @abc.abstractproperty
-  def output_classes(self):
-    """Returns the class of each component of an element of this IndexedDataset.
-
-    The expected values are `tf.Tensor` and `tf.SparseTensor`.
-
-    Returns:
-      A nested structure of Python `type` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_classes")
-
-  @abc.abstractproperty
-  def output_shapes(self):
-    """Returns the shape of each component of an element of this IndexedDataset.
-
-    Returns:
-      A nested structure of `tf.TensorShape` objects corresponding to each
-      component of an element of this IndexedDataset.
-    """
-    raise NotImplementedError("IndexedDataset.output_shapes")
-
   @abc.abstractmethod
   def _as_variant_tensor(self):
     """Creates a `tf.variant` `tf.Tensor` representing this IndexedDataset.
@@ -149,6 +114,7 @@ class IndexedDataset(dataset_ops.Dataset):
     raise NotImplementedError("IndexedDataset._as_variant_tensor")
 
 
+# TODO(saeta): Add a `DatasetV1` wrapper if this is exposed via the public API.
 class IdentityIndexedDataset(IndexedDataset):
   """IdentityIndexedDataset is a trivial indexed dataset used for testing.
   """
@@ -159,16 +125,8 @@ class IdentityIndexedDataset(IndexedDataset):
     self._size = ops.convert_to_tensor(size, dtype=dtypes.uint64, name="size")
 
   @property
-  def output_types(self):
-    return dtypes.uint64
-
-  @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.uint64, [])
 
   def _as_variant_tensor(self):
     return ged_ops.experimental_identity_indexed_dataset(self._size)
diff --git a/tensorflow/python/data/experimental/ops/interleave_ops.py b/tensorflow/python/data/experimental/ops/interleave_ops.py
index a3c094859efb7586b3ddcf1823ab27bf0a733445..5a719f8ed8f0176f628a89eb1b3e535064d9a72e 100644
--- a/tensorflow/python/data/experimental/ops/interleave_ops.py
+++ b/tensorflow/python/data/experimental/ops/interleave_ops.py
@@ -21,6 +21,7 @@ from tensorflow.python.data.experimental.ops import random_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -101,6 +102,18 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
           data_input.output_classes != data_inputs[0].output_classes):
         raise TypeError("All datasets must have the same type and class.")
 
+    output_shapes = self._data_inputs[0].output_shapes
+    for data_input in self._data_inputs[1:]:
+      output_shapes = nest.pack_sequence_as(output_shapes, [
+          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
+              nest.flatten(output_shapes),
+              nest.flatten(data_input.output_shapes))
+      ])
+
+    self._structure = structure.convert_legacy_structure(
+        data_inputs[0].output_types, output_shapes,
+        data_inputs[0].output_classes)
+
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
     return (
@@ -115,26 +128,12 @@ class _DirectedInterleaveDataset(dataset_ops.Dataset):
     return [self._selector_input] + self._data_inputs
 
   @property
-  def output_classes(self):
-    return self._data_inputs[0].output_classes
-
-  @property
-  def output_shapes(self):
-    ret = self._data_inputs[0].output_shapes
-    for data_input in self._data_inputs[1:]:
-      ret = nest.pack_sequence_as(ret, [
-          ts1.most_specific_compatible_shape(ts2) for (ts1, ts2) in zip(
-              nest.flatten(ret), nest.flatten(data_input.output_shapes))
-      ])
-    return ret
-
-  @property
-  def output_types(self):
-    return self._data_inputs[0].output_types
+  def _element_structure(self):
+    return self._structure
 
 
-@tf_export("data.experimental.sample_from_datasets")
-def sample_from_datasets(datasets, weights=None, seed=None):
+@tf_export("data.experimental.sample_from_datasets", v1=[])
+def sample_from_datasets_v2(datasets, weights=None, seed=None):
   """Samples elements at random from the datasets in `datasets`.
 
   Args:
@@ -158,7 +157,7 @@ def sample_from_datasets(datasets, weights=None, seed=None):
       length of the `datasets` element.
   """
   num_datasets = len(datasets)
-  if not isinstance(weights, dataset_ops.Dataset):
+  if not isinstance(weights, dataset_ops.DatasetV2):
     if weights is None:
       # Select inputs with uniform probability.
       logits = [[1.0] * num_datasets]
@@ -217,8 +216,15 @@ def sample_from_datasets(datasets, weights=None, seed=None):
   return _DirectedInterleaveDataset(selector_input, datasets)
 
 
-@tf_export("data.experimental.choose_from_datasets")
-def choose_from_datasets(datasets, choice_dataset):
+@tf_export(v1=["data.experimental.sample_from_datasets"])
+def sample_from_datasets_v1(datasets, weights=None, seed=None):
+  return dataset_ops.DatasetV1Adapter(
+      sample_from_datasets_v2(datasets, weights, seed))
+sample_from_datasets_v1.__doc__ = sample_from_datasets_v2.__doc__
+
+
+@tf_export("data.experimental.choose_from_datasets", v1=[])
+def choose_from_datasets_v2(datasets, choice_dataset):
   """Creates a dataset that deterministically chooses elements from `datasets`.
 
   For example, given the following datasets:
@@ -260,3 +266,16 @@ def choose_from_datasets(datasets, choice_dataset):
     raise TypeError("`choice_dataset` must be a dataset of scalar "
                     "`tf.int64` tensors.")
   return _DirectedInterleaveDataset(choice_dataset, datasets)
+
+
+@tf_export(v1=["data.experimental.choose_from_datasets"])
+def choose_from_datasets_v1(datasets, choice_dataset):
+  return dataset_ops.DatasetV1Adapter(
+      choose_from_datasets_v2(datasets, choice_dataset))
+choose_from_datasets_v1.__doc__ = choose_from_datasets_v2.__doc__
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+choose_from_datasets = choose_from_datasets_v1
+sample_from_datasets = sample_from_datasets_v1
diff --git a/tensorflow/python/data/experimental/ops/matching_files.py b/tensorflow/python/data/experimental/ops/matching_files.py
new file mode 100644
index 0000000000000000000000000000000000000000..63b99cb1e4533d165902893918d5aea2c6f02613
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/matching_files.py
@@ -0,0 +1,41 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for matching input filenames."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
+
+
+class MatchingFilesDataset(dataset_ops.DatasetSource):
+  """A `Dataset` that list the files according to the input patterns."""
+
+  def __init__(self, patterns):
+    super(MatchingFilesDataset, self).__init__()
+    self._patterns = ops.convert_to_tensor(
+        patterns, dtype=dtypes.string, name="patterns")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_matching_files_dataset(self._patterns)
+
+  @property
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
diff --git a/tensorflow/python/data/experimental/ops/optimization.py b/tensorflow/python/data/experimental/ops/optimization.py
index b744db7f1e5fbdf9869721ce8fd6ea5122534d5a..c6c7de9265c32245dfbc348a4e7c4fd06eda653b 100644
--- a/tensorflow/python/data/experimental/ops/optimization.py
+++ b/tensorflow/python/data/experimental/ops/optimization.py
@@ -100,7 +100,7 @@ def optimize(optimizations=None):
   return _apply_fn
 
 
-class _AssertNextDataset(dataset_ops.UnaryDataset):
+class _AssertNextDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that asserts which transformations happen next."""
 
   def __init__(self, input_dataset, transformations):
@@ -118,20 +118,8 @@ class _AssertNextDataset(dataset_ops.UnaryDataset):
         self._transformations,
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _NonSerializableDataset(dataset_ops.UnaryDataset):
+class _NonSerializableDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that performs non-serializable identity transformation."""
 
   def __init__(self, input_dataset):
@@ -143,15 +131,3 @@ class _NonSerializableDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_non_serializable_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/optimization_options.py b/tensorflow/python/data/experimental/ops/optimization_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..11b8b86f64b204782030411cc533d57dcc348bd3
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/optimization_options.py
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling optimizations in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.OptimizationOptions")
+class OptimizationOptions(options.OptionsBase):
+  """Represents options for dataset optimizations.
+
+  You can apply `OptimizationOptions` to a `dataset` object, as follows:
+
+  ```python
+  options = tf.data.Options()
+  options.optimization = tf.data.experimental.OptimizationOptions()
+  options.optimization.map_and_batch_fusion = True
+  dataset = dataset.with_options(options)
+  ```
+  """
+  apply_default_optimizations = options.create_option(
+      name="apply_default_optimizations",
+      ty=bool,
+      docstring=
+      "Whether to apply default static optimizations. If False, only static "
+      "optimizations that have been explicitly enabled will be applied.")
+
+  filter_fusion = options.create_option(
+      name="filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse filter transformations.")
+
+  hoist_random_uniform = options.create_option(
+      name="hoist_random_uniform",
+      ty=bool,
+      docstring=
+      "Whether to hoist `tf.random_uniform()` ops out of map transformations.")
+
+  map_and_batch_fusion = options.create_option(
+      name="map_and_batch_fusion",
+      ty=bool,
+      docstring="Whether to fuse map and batch transformations.")
+
+  map_and_filter_fusion = options.create_option(
+      name="map_and_filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse map and filter transformations.")
+
+  map_fusion = options.create_option(
+      name="map_and_filter_fusion",
+      ty=bool,
+      docstring="Whether to fuse map transformations.")
+
+  map_parallelization = options.create_option(
+      name="map_parallelization",
+      ty=bool,
+      docstring="Whether to parallelize stateless map transformations.")
+
+  map_vectorization = options.create_option(
+      name="map_vectorization",
+      ty=bool,
+      docstring="Whether to vectorize map transformations.")
+
+  noop_elimination = options.create_option(
+      name="noop_elimination",
+      ty=bool,
+      docstring="Whether to eliminate no-op transformations.")
+
+  shuffle_and_repeat_fusion = options.create_option(
+      name="shuffle_and_repeat_fusion",
+      ty=bool,
+      docstring="Whether to fuse shuffle and repeat transformations. If None, "
+      "defaults to True.")
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+    result = []
+    optimizations_to_enable = [
+        "filter_fusion",
+        "hoist_random_uniform",
+        "map_and_filter_fusion",
+        "map_fusion",
+        "map_parallelization",
+        "map_vectorization",
+    ]
+    for optimization in optimizations_to_enable:
+      if getattr(self, optimization):
+        result.append(optimization)
+
+    if self.apply_default_optimizations is not False:
+      # The following optimizations are turned on by default, unless the
+      # user explicitly disables them.
+      optimizations_to_disable = [
+          "map_and_batch_fusion",
+          "noop_elimination",
+          "shuffle_and_repeat_fusion",
+      ]
+      for optimization in optimizations_to_disable:
+        if getattr(self, optimization) is not False:
+          result.append(optimization)
+    return result
diff --git a/tensorflow/python/data/experimental/ops/parsing_ops.py b/tensorflow/python/data/experimental/ops/parsing_ops.py
index 6615b9022a23628fb5c37fb51762c429086b983c..deb20d61888adeeff078997fc8adfede604de8eb 100644
--- a/tensorflow/python/data/experimental/ops/parsing_ops.py
+++ b/tensorflow/python/data/experimental/ops/parsing_ops.py
@@ -18,11 +18,11 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.util.tf_export import tf_export
 
@@ -33,8 +33,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
   def __init__(self, input_dataset, features, num_parallel_calls):
     super(_ParseExampleDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
-    if not all(types == dtypes.string
-               for types in nest.flatten(input_dataset.output_types)):
+    if not input_dataset._element_structure.is_compatible_with(  # pylint: disable=protected-access
+        structure.TensorStructure(dtypes.string, [None])):
       raise TypeError("Input dataset should be a dataset of vectors of strings")
     self._num_parallel_calls = num_parallel_calls
     # pylint: disable=protected-access
@@ -67,20 +67,22 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         for _ in range(len(sparse_keys))
     ]
 
-    self._output_shapes = dict(
+    output_shapes = dict(
         zip(self._dense_keys + self._sparse_keys,
             dense_output_shapes + sparse_output_shapes))
-    self._output_types = dict(
+    output_types = dict(
         zip(self._dense_keys + self._sparse_keys,
             self._dense_types + self._sparse_types))
-    self._output_classes = dict(
+    output_classes = dict(
         zip(self._dense_keys + self._sparse_keys,
             [ops.Tensor for _ in range(len(self._dense_defaults))] +
             [sparse_tensor.SparseTensor for _ in range(len(self._sparse_keys))
             ]))
+    self._structure = structure.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.parse_example_dataset(
+    return gen_experimental_dataset_ops.experimental_parse_example_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._num_parallel_calls,
         self._dense_defaults,
@@ -91,16 +93,8 @@ class _ParseExampleDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 # TODO(b/111553342): add arguments names and example names as well.
@@ -138,10 +132,10 @@ def parse_example_dataset(features, num_parallel_calls=1):
   def _apply_fn(dataset):
     """Function from `Dataset` to `Dataset` that applies the transformation."""
     out_dataset = _ParseExampleDataset(dataset, features, num_parallel_calls)
-    if any([
+    if any(
         isinstance(feature, parsing_ops.SparseFeature)
         for _, feature in features.items()
-    ]):
+    ):
       # pylint: disable=protected-access
       # pylint: disable=g-long-lambda
       out_dataset = out_dataset.map(
diff --git a/tensorflow/python/data/experimental/ops/prefetching_ops.py b/tensorflow/python/data/experimental/ops/prefetching_ops.py
index a55b8bfb769e7adbc3013c81acce447e6d3595fd..e46dfb6568d5d0c29187c233e503cef98eecece1 100644
--- a/tensorflow/python/data/experimental/ops/prefetching_ops.py
+++ b/tensorflow/python/data/experimental/ops/prefetching_ops.py
@@ -17,13 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import warnings
-
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
-from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import device as framework_device
 from tensorflow.python.framework import dtypes
@@ -37,304 +33,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-def function_buffering_resource(string_arg,
-                                target_device,
-                                f,
-                                buffer_size,
-                                output_types,
-                                container="",
-                                shared_name=None,
-                                name=None):
-  """Creates a FunctionBufferingResource.
-
-  A FunctionBufferingResource fills up a buffer by calling a function `f` on
-  `target_device`. `f` should take in only a single string argument as input.
-
-  Args:
-    string_arg: The single string argument to the function.
-    target_device: The device to run `f` on.
-    f: The function to be executed.
-    buffer_size: Size of the buffer to be populated.
-    output_types: The output types generated by the function.
-    container: (Optional) string. Defaults to "".
-    shared_name: (Optional) string.
-    name: (Optional) string to name the op.
-
-  Returns:
-    Handle to a FunctionBufferingResource.
-  """
-  if shared_name is None:
-    shared_name = ""
-  return ged_ops.experimental_function_buffering_resource(
-      string_arg=string_arg,
-      target_device=target_device,
-      shared_name=shared_name,
-      f=f,
-      buffer_size=buffer_size,
-      container=container,
-      name=name,
-      output_types=output_types)
-
-
-def function_buffering_resource_get_next(function_buffer_resource,
-                                         output_types,
-                                         name=None):
-  return ged_ops.experimental_function_buffering_resource_get_next(
-      function_buffer_resource=function_buffer_resource,
-      output_types=output_types,
-      name=name)
-
-
-def function_buffering_resource_reset(function_buffer_resource, name=None):
-  return ged_ops.experimental_function_buffering_resource_reset(
-      function_buffer_resource=function_buffer_resource, name=name)
-
-
-# pylint: disable=protected-access
-class _PrefetchToDeviceIterator(object):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               one_shot,
-               device,
-               buffer_size,
-               shared_name=None):
-    self._input_dataset = input_dataset
-    self._get_next_call_count = 0
-    self._one_shot = one_shot
-    if shared_name is None:
-      shared_name = ""
-
-    if self._one_shot:
-      self._input_iterator = input_dataset.make_one_shot_iterator()
-    else:
-      self._input_iterator = iterator_ops.Iterator.from_structure(
-          self._input_dataset.output_types, self._input_dataset.output_shapes,
-          shared_name, self._input_dataset.output_classes)
-    input_iterator_handle = self._input_iterator.string_handle()
-
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
-    # handle is a scalar `tf.Tensor` of type `tf.string`
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self._input_iterator.output_types,
-          self._input_iterator.output_shapes,
-          self._input_iterator.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    self._prefetch_fn = _prefetch_fn._get_concrete_function_internal()  # pylint: disable=protected-access
-
-    iterator_device = ged_ops.experimental_iterator_get_device(
-        self._input_iterator._iterator_resource)
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=self._prefetch_fn,
-          target_device=iterator_device,
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(self._input_dataset.output_types,
-                                    self._input_dataset.output_classes)))
-
-    if not self._one_shot:
-      reset_op = function_buffering_resource_reset(self._buffering_resource)
-      with ops.control_dependencies([reset_op]):
-        self._initializer = self._input_iterator.make_initializer(
-            self._input_dataset)
-
-  def get_next(self, name=None):
-    """See `tf.data.Iterator.get_next`."""
-    self._get_next_call_count += 1
-    if self._get_next_call_count > iterator_ops.GET_NEXT_CALL_WARNING_THRESHOLD:
-      warnings.warn(iterator_ops.GET_NEXT_CALL_WARNING_MESSAGE)
-
-    flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
-        self._buffering_resource,
-        output_types=nest.flatten(
-            sparse.as_dense_types(self.output_types, self.output_classes)),
-        name=name)
-
-    ret = sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(self.output_types, flat_ret),
-        self.output_types, self.output_shapes, self.output_classes)
-
-    for tensor, shape in zip(
-        nest.flatten(ret), nest.flatten(self.output_shapes)):
-      if isinstance(tensor, ops.Tensor):
-        tensor.set_shape(shape)
-
-    return ret
-
-  @property
-  def initializer(self):
-    if self._one_shot:
-      raise NotImplementedError("Can't initialize a one_shot_iterator")
-    return self._initializer
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-
-class _PrefetchToDeviceEagerIterator(iterator_ops.EagerIterator):
-  """A replacement for `tf.data.Iterator` that prefetches to another device.
-
-  Args:
-    input_dataset: The input dataset
-    one_shot: If true, we make a one shot iterator that's already initialized.
-    device: A fully specified device string where we want to prefetch to
-    buffer_size: Size of the prefetching buffer.
-    shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-  Returns:
-    An Iterator type object.
-  """
-
-  def __init__(self,
-               input_dataset,
-               device,
-               buffer_size):
-    with ops.device("/device:CPU:0"):
-      super(_PrefetchToDeviceEagerIterator, self).__init__(input_dataset)
-      input_iterator_handle = gen_dataset_ops.iterator_to_string_handle(
-          self._resource)
-
-    self._device = device
-
-    @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
-    def _prefetch_fn(handle):
-      """Prefetches one element from `input_iterator`."""
-      remote_iterator = iterator_ops.Iterator.from_string_handle(
-          handle, self.output_types, self.output_shapes, self.output_classes)
-      ret = remote_iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
-
-    self._prefetch_fn = _prefetch_fn._get_concrete_function_internal()  # pylint: disable=protected-access
-
-    with ops.device(device):
-      self._buffering_resource = function_buffering_resource(
-          f=self._prefetch_fn,
-          output_types=self._flat_output_types,
-          target_device=ged_ops.experimental_iterator_get_device(
-              self._resource),
-          string_arg=input_iterator_handle,
-          buffer_size=buffer_size,
-          shared_name=iterator_ops._generate_shared_name(
-              "function_buffer_resource"))
-
-  def _next_internal(self):
-    """Returns a nested structure of `tf.Tensor`s containing the next element.
-    """
-    # This runs in sync mode as iterators use an error status to communicate
-    # that there is no more data to iterate over.
-    # TODO(b/77291417): Fix
-    with context.execution_mode(context.SYNC):
-      with ops.device(self._device):
-        flat_ret = ged_ops.experimental_function_buffering_resource_get_next(
-            function_buffer_resource=self._buffering_resource,
-            output_types=self._flat_output_types)
-      return self._element_structure._from_tensor_list(flat_ret)
-# pylint: enable=protected-access
-
-
-class _PrefetchToDeviceDataset(dataset_ops.UnaryDataset):
-  """A `Dataset` whose iterator prefetches elements to another device."""
-
-  def __init__(self, input_dataset, device, buffer_size):
-    super(_PrefetchToDeviceDataset, self).__init__(input_dataset)
-    self._input_dataset = input_dataset
-    self._device = device
-    self._buffer_size = buffer_size if buffer_size is not None else 1
-
-  # The static analysis cannot tell that the eager iterator's superclass has
-  # a `next()` method.
-  # pylint: disable=non-iterator-returned
-  def __iter__(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    The returned iterator implements the Python iterator protocol and therefore
-    can only be used in eager mode.
-
-    Returns:
-      An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
-    """
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      raise RuntimeError("dataset.__iter__() is only supported when eager "
-                         "execution is enabled.")
-  # pylint: enable=non-iterator-returned
-
-  def make_one_shot_iterator(self):
-    if context.executing_eagerly():
-      return _PrefetchToDeviceEagerIterator(self._input_dataset, self._device,
-                                            self._buffer_size)
-    else:
-      return _PrefetchToDeviceIterator(self._input_dataset, one_shot=True,
-                                       device=self._device,
-                                       buffer_size=self._buffer_size)
-
-  def make_initializable_iterator(self, shared_name=None):
-    return _PrefetchToDeviceIterator(
-        self._input_dataset,
-        one_shot=False,
-        device=self._device,
-        buffer_size=self._buffer_size,
-        shared_name=shared_name)
-
-  def _as_variant_tensor(self):
-    # TODO(mrry): Raise this error earlier (e.g. when one of the Dataset
-    # transformation methods is called.
-    # TODO(mrry): Investigate support for chaining further transformations after
-    # the prefetch, including GPU support.
-    raise NotImplementedError("`prefetch_to_device()` must be the last "
-                              "transformation in a dataset pipeline.")
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-
 @tf_export("data.experimental.prefetch_to_device")
 def prefetch_to_device(device, buffer_size=None):
   """A transformation that prefetches dataset values to the given `device`.
@@ -352,7 +50,8 @@ def prefetch_to_device(device, buffer_size=None):
     `tf.data.Dataset.apply`.
   """
   def _apply_fn(dataset):
-    return _PrefetchToDeviceDataset(dataset, device, buffer_size)
+    return dataset.apply(
+        copy_to_device(target_device=device)).prefetch(buffer_size)
 
   return _apply_fn
 
@@ -371,8 +70,14 @@ def copy_to_device(target_device, source_device="/cpu:0"):
   """
 
   def _apply_fn(dataset):
+    options = dataset_ops.Options()
+    options.experimental_autotune = False
+    opt_options = optimization_options.OptimizationOptions()
+    opt_options.apply_default_optimizations = False
+    options.experimental_optimization = opt_options
     return _CopyToDeviceDataset(
-        dataset, target_device=target_device, source_device=source_device)
+        dataset, target_device=target_device,
+        source_device=source_device).with_options(options)
 
   return _apply_fn
 
@@ -380,7 +85,7 @@ def copy_to_device(target_device, source_device="/cpu:0"):
 # TODO(rohanj): Use the _input_hostmem attr on the RemoteCall ops to indicate
 # all inputs to the Op are in host memory, thereby avoiding some unnecessary
 # Sends and Recvs.
-class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
+class _CopyToDeviceDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that copies elements to another device."""
 
   def __init__(self, input_dataset, target_device, source_device="/cpu:0"):
@@ -399,13 +104,6 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
     self._source_device_string = source_device
     self._source_device = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._input_dataset.output_shapes,
-                               self._input_dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._input_dataset.output_types,
-                              self._input_dataset.output_classes))
-
     @function.defun()
     def _init_func():
       """Creates an iterator for the input dataset.
@@ -416,8 +114,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       # pylint: disable=protected-access
       ds_variant = self._input_dataset._as_variant_tensor()
       resource = gen_dataset_ops.anonymous_iterator(
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies(
           [gen_dataset_ops.make_iterator(ds_variant, resource)]):
         return gen_dataset_ops.iterator_to_string_handle(resource)
@@ -448,8 +145,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
         iterator = iterator_ops.Iterator.from_string_handle(
             string_handle, self.output_types, self.output_shapes,
             self.output_classes)
-      ret = iterator.get_next()
-      return nest.flatten(sparse.serialize_sparse_tensors(ret))
+      return self._element_structure._to_tensor_list(iterator.get_next())  # pylint: disable=protected-access
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
@@ -459,7 +155,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           target=self._source_device,
           args=[string_handle] +
           next_func_concrete.captured_inputs,
-          Tout=self._flat_output_types,
+          Tout=self._input_dataset._element_structure._flat_types,  # pylint: disable=protected-access
           f=next_func_concrete)
 
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
@@ -476,8 +172,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
       """
       iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
           string_handle,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
       with ops.control_dependencies([
           resource_variable_ops.destroy_resource_op(
               iterator_resource, ignore_lookup_error=True)]):
@@ -489,8 +184,7 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=self._source_device,
-          args=[string_handle] +
-          finalize_func_concrete.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
@@ -526,17 +220,63 @@ class _CopyToDeviceDataset(dataset_ops.UnaryDataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self._input_dataset))
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
+class _MapOnGpuDataset(dataset_ops.UnaryDataset):
+  """A `Dataset` that maps a function over elements in its using a GPU."""
+
+  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+    """See `Dataset.map()` for details."""
+    super(_MapOnGpuDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._use_inter_op_parallelism = use_inter_op_parallelism
+
+    self._map_func = dataset_ops.StructuredFunctionWrapper(
+        map_func,
+        self._transformation_name(),
+        dataset=input_dataset,
+        defun_kwargs={"experimental_ints_on_device": True})
+
+  def _functions(self):
+    return [self._map_func]
+
+  def _as_variant_tensor(self):
+    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
+    return ged_ops.experimental_map_dataset(
+        input_t,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
+        use_inter_op_parallelism=self._use_inter_op_parallelism,
+        **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+  def _element_structure(self):
+    return self._map_func.output_structure
+
+  def _transformation_name(self):
+    return "map_on_gpu()"
+
+
+def map_on_gpu(map_func):
+  """Maps `map_func` across the elements of this dataset.
+
+  NOTE: This is a highly experimental version of `tf.data.Dataset.map` that runs
+  `map_func` on GPU. It must be used after applying the
+  `tf.data.experimental.copy_to_device` transformation with a GPU device
+  argument.
+
+  Args:
+    map_func: A function mapping a nested structure of tensors (having shapes
+      and types defined by `self.output_shapes` and `self.output_types`) to
+      another nested structure of tensors.
+
+  Returns:
+    A `Dataset` transformation function, which can be passed to
+    `tf.data.Dataset.apply`.
+  """
+
+  def _apply_fn(dataset):
+    return _MapOnGpuDataset(dataset, map_func)
+
+  return _apply_fn
diff --git a/tensorflow/python/data/experimental/ops/random_ops.py b/tensorflow/python/data/experimental/ops/random_ops.py
index e3a2aeab31ea35ee9636821e3e8b8db35ed72b65..cbdf367db6bd5b4ce27e636c08a19cd4fedda041 100644
--- a/tensorflow/python/data/experimental/ops/random_ops.py
+++ b/tensorflow/python/data/experimental/ops/random_ops.py
@@ -17,38 +17,46 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import random_seed
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.experimental.RandomDataset")
-class RandomDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.RandomDataset", v1=[])
+class RandomDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of pseudorandom values."""
 
   def __init__(self, seed=None):
     """A `Dataset` of pseudorandom values."""
-    super(RandomDataset, self).__init__()
+    super(RandomDatasetV2, self).__init__()
     self._seed, self._seed2 = random_seed.get_seed(seed)
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.random_dataset(
+    return gen_experimental_dataset_ops.experimental_random_dataset(
         seed=self._seed,
         seed2=self._seed2,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.int64, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
 
-  @property
-  def output_types(self):
-    return dtypes.int64
+@tf_export(v1=["data.experimental.RandomDataset"])
+class RandomDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of pseudorandom values."""
+
+  @functools.wraps(RandomDatasetV2.__init__)
+  def __init__(self, seed=None):
+    wrapped = RandomDatasetV2(seed)
+    super(RandomDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+RandomDataset = RandomDatasetV1
diff --git a/tensorflow/python/data/experimental/ops/readers.py b/tensorflow/python/data/experimental/ops/readers.py
index fe601925860b4ed87a682c2124dee31ba8bea266..c2d82aeb59174fb9d35c4cc2c3d850fb351d8a90 100644
--- a/tensorflow/python/data/experimental/ops/readers.py
+++ b/tensorflow/python/data/experimental/ops/readers.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import collections
 import csv
+import functools
 
 import numpy as np
 
@@ -31,12 +32,11 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers as core_readers
 from tensorflow.python.data.util import convert
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.lib.io import file_io
-from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import gfile
@@ -307,8 +307,8 @@ def make_tf_record_dataset(file_pattern,
     return dataset.prefetch(buffer_size=prefetch_buffer_size)
 
 
-@tf_export("data.experimental.make_csv_dataset")
-def make_csv_dataset(
+@tf_export("data.experimental.make_csv_dataset", v1=[])
+def make_csv_dataset_v2(
     file_pattern,
     batch_size,
     column_names=None,
@@ -507,11 +507,42 @@ def make_csv_dataset(
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_csv_dataset"])
+def make_csv_dataset_v1(
+    file_pattern,
+    batch_size,
+    column_names=None,
+    column_defaults=None,
+    label_name=None,
+    select_columns=None,
+    field_delim=",",
+    use_quote_delim=True,
+    na_value="",
+    header=True,
+    num_epochs=None,
+    shuffle=True,
+    shuffle_buffer_size=10000,
+    shuffle_seed=None,
+    prefetch_buffer_size=optimization.AUTOTUNE,
+    num_parallel_reads=1,
+    sloppy=False,
+    num_rows_for_inference=100,
+    compression_type=None,
+):  # pylint: disable=missing-docstring
+  return dataset_ops.DatasetV1Adapter(make_csv_dataset_v2(
+      file_pattern, batch_size, column_names, column_defaults, label_name,
+      select_columns, field_delim, use_quote_delim, na_value, header,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, num_parallel_reads, sloppy, num_rows_for_inference,
+      compression_type))
+make_csv_dataset_v1.__doc__ = make_csv_dataset_v2.__doc__
+
+
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 4 * 1024 * 1024  # 4 MB
 
 
-@tf_export("data.experimental.CsvDataset")
-class CsvDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.CsvDataset", v1=[])
+class CsvDatasetV2(dataset_ops.DatasetSource):
   """A Dataset comprising lines from one or more CSV files."""
 
   def __init__(self,
@@ -541,7 +572,9 @@ class CsvDataset(dataset_ops.DatasetSource):
 
     We can construct a CsvDataset from it as follows:
     ```python
-    dataset = tf.data.experimental.CsvDataset(
+    tf.enable_eager_execution()
+
+     dataset = tf.data.experimental.CsvDataset(
         "my_file*.csv",
         [tf.float32,  # Required field, use dtype or empty tensor
          tf.constant([0.0], dtype=tf.float32),  # Optional field, default to 0.0
@@ -553,13 +586,8 @@ class CsvDataset(dataset_ops.DatasetSource):
 
     The expected output of its iterations is:
     ```python
-    next_element = dataset.make_one_shot_iterator().get_next()
-    with tf.Session() as sess:
-      while True:
-        try:
-          print(sess.run(next_element))
-        except tf.errors.OutOfRangeError:
-          break
+    for element in dataset:
+      print(element)
 
     >> (4.28e10, 5.55e6, 12)
     >> (-5.3e14, 0.0, 2)
@@ -594,7 +622,7 @@ class CsvDataset(dataset_ops.DatasetSource):
         the input data. If specified, only this subset of columns will be
         parsed. Defaults to parsing all columns.
     """
-    super(CsvDataset, self).__init__()
+    super(CsvDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -624,11 +652,9 @@ class CsvDataset(dataset_ops.DatasetSource):
         argument_default=[],
         argument_dtype=dtypes.int64,
     )
-    self._output_shapes = tuple(
-        tensor_shape.scalar() for _ in range(len(record_defaults)))
-    self._output_types = tuple(d.dtype for d in self._record_defaults)
-    self._output_classes = tuple(
-        ops.Tensor for _ in range(len(record_defaults)))
+    self._structure = structure.NestedStructure(
+        tuple(structure.TensorStructure(d.dtype, [])
+              for d in self._record_defaults))
 
   def _as_variant_tensor(self):
     # Constructs graph node for the dataset op.
@@ -637,7 +663,7 @@ class CsvDataset(dataset_ops.DatasetSource):
         record_defaults=self._record_defaults,
         buffer_size=self._buffer_size,
         header=self._header,
-        output_shapes=self._output_shapes,
+        output_shapes=self._structure._flat_shapes,  # pylint: disable=protected-access
         field_delim=self._field_delim,
         use_quote_delim=self._use_quote_delim,
         na_value=self._na_value,
@@ -646,34 +672,47 @@ class CsvDataset(dataset_ops.DatasetSource):
     )
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return self._output_shapes
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-
-@tf_export("data.experimental.make_batched_features_dataset")
-def make_batched_features_dataset(file_pattern,
-                                  batch_size,
-                                  features,
-                                  reader=core_readers.TFRecordDataset,
-                                  label_key=None,
-                                  reader_args=None,
-                                  num_epochs=None,
-                                  shuffle=True,
-                                  shuffle_buffer_size=10000,
-                                  shuffle_seed=None,
-                                  prefetch_buffer_size=optimization.AUTOTUNE,
-                                  reader_num_threads=1,
-                                  parser_num_threads=2,
-                                  sloppy_ordering=False,
-                                  drop_final_batch=False):
+@tf_export(v1=["data.experimental.CsvDataset"])
+class CsvDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A Dataset comprising lines from one or more CSV files."""
+
+  @functools.wraps(CsvDatasetV2.__init__)
+  def __init__(self,
+               filenames,
+               record_defaults,
+               compression_type=None,
+               buffer_size=None,
+               header=False,
+               field_delim=",",
+               use_quote_delim=True,
+               na_value="",
+               select_cols=None):
+    wrapped = CsvDatasetV2(filenames, record_defaults, compression_type,
+                           buffer_size, header, field_delim, use_quote_delim,
+                           na_value, select_cols)
+    super(CsvDatasetV1, self).__init__(wrapped)
+
+
+@tf_export("data.experimental.make_batched_features_dataset", v1=[])
+def make_batched_features_dataset_v2(file_pattern,
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
   """Returns a `Dataset` of feature dictionaries from `Example` protos.
 
   If label_key argument is provided, returns a `Dataset` of tuple
@@ -819,6 +858,31 @@ def make_batched_features_dataset(file_pattern,
   return dataset
 
 
+@tf_export(v1=["data.experimental.make_batched_features_dataset"])
+def make_batched_features_dataset_v1(file_pattern,  # pylint: disable=missing-docstring
+                                     batch_size,
+                                     features,
+                                     reader=core_readers.TFRecordDataset,
+                                     label_key=None,
+                                     reader_args=None,
+                                     num_epochs=None,
+                                     shuffle=True,
+                                     shuffle_buffer_size=10000,
+                                     shuffle_seed=None,
+                                     prefetch_buffer_size=optimization.AUTOTUNE,
+                                     reader_num_threads=1,
+                                     parser_num_threads=2,
+                                     sloppy_ordering=False,
+                                     drop_final_batch=False):
+  return dataset_ops.DatasetV1Adapter(make_batched_features_dataset_v2(
+      file_pattern, batch_size, features, reader, label_key, reader_args,
+      num_epochs, shuffle, shuffle_buffer_size, shuffle_seed,
+      prefetch_buffer_size, reader_num_threads, parser_num_threads,
+      sloppy_ordering, drop_final_batch))
+make_batched_features_dataset_v2.__doc__ = (
+    make_batched_features_dataset_v1.__doc__)
+
+
 def _get_file_names(file_pattern, shuffle):
   """Parse list of file names from pattern, optionally shuffled.
 
@@ -850,8 +914,8 @@ def _get_file_names(file_pattern, shuffle):
   return file_names
 
 
-@tf_export("data.experimental.SqlDataset")
-class SqlDataset(dataset_ops.DatasetSource):
+@tf_export("data.experimental.SqlDataset", v1=[])
+class SqlDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` consisting of the results from a SQL query."""
 
   def __init__(self, driver_name, data_source_name, query, output_types):
@@ -861,17 +925,14 @@ class SqlDataset(dataset_ops.DatasetSource):
     For example:
 
     ```python
+    tf.enable_eager_execution()
+
     dataset = tf.data.experimental.SqlDataset("sqlite", "/foo/bar.sqlite3",
                                               "SELECT name, age FROM people",
                                               (tf.string, tf.int32))
-    iterator = dataset.make_one_shot_iterator()
-    next_element = iterator.get_next()
     # Prints the rows of the result set of the above query.
-    while True:
-      try:
-        print(sess.run(next_element))
-      except tf.errors.OutOfRangeError:
-        break
+    for element in dataset:
+      print(element)
     ```
 
     Args:
@@ -883,30 +944,40 @@ class SqlDataset(dataset_ops.DatasetSource):
       output_types: A tuple of `tf.DType` objects representing the types of the
         columns returned by `query`.
     """
-    super(SqlDataset, self).__init__()
+    super(SqlDatasetV2, self).__init__()
     self._driver_name = ops.convert_to_tensor(
         driver_name, dtype=dtypes.string, name="driver_name")
     self._data_source_name = ops.convert_to_tensor(
         data_source_name, dtype=dtypes.string, name="data_source_name")
     self._query = ops.convert_to_tensor(
         query, dtype=dtypes.string, name="query")
-    self._output_types = output_types
+    self._structure = structure.NestedStructure(
+        nest.map_structure(
+            lambda dtype: structure.TensorStructure(dtype, []), output_types))
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.sql_dataset(self._driver_name,
-                                       self._data_source_name, self._query,
-                                       nest.flatten(self.output_types),
-                                       nest.flatten(self.output_shapes))
+    return gen_experimental_dataset_ops.experimental_sql_dataset(
+        self._driver_name, self._data_source_name, self._query,
+        nest.flatten(self.output_types), nest.flatten(self.output_shapes))
 
   @property
-  def output_classes(self):
-    return nest.map_structure(lambda _: ops.Tensor, self._output_types)
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    return nest.map_structure(lambda _: tensor_shape.TensorShape([]),
-                              self._output_types)
 
-  @property
-  def output_types(self):
-    return self._output_types
+@tf_export(v1=["data.experimental.SqlDataset"])
+class SqlDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` consisting of the results from a SQL query."""
+
+  @functools.wraps(SqlDatasetV2.__init__)
+  def __init__(self, driver_name, data_source_name, query, output_types):
+    wrapped = SqlDatasetV2(driver_name, data_source_name, query, output_types)
+    super(SqlDatasetV1, self).__init__(wrapped)
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+CsvDataset = CsvDatasetV1
+SqlDataset = SqlDatasetV1
+make_batched_features_dataset = make_batched_features_dataset_v1
+make_csv_dataset = make_csv_dataset_v1
diff --git a/tensorflow/python/data/experimental/ops/scan_ops.py b/tensorflow/python/data/experimental/ops/scan_ops.py
index 1194238e2f987f9acc9028955f670df9e0efb4ad..5c77ad734348401ed666c562b36ef52ec8c5525b 100644
--- a/tensorflow/python/data/experimental/ops/scan_ops.py
+++ b/tensorflow/python/data/experimental/ops/scan_ops.py
@@ -21,10 +21,10 @@ import collections
 
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -49,18 +49,7 @@ class _ScanDataset(dataset_ops.UnaryDataset):
     # Compute initial values for the state classes, shapes and types based on
     # the initial state. The shapes may be refined by running `tf_scan_func` one
     # or more times below.
-    self._state_classes = sparse.get_classes(self._initial_state)
-    self._state_shapes = nest.pack_sequence_as(
-        self._initial_state,
-        [t.get_shape() for t in nest.flatten(self._initial_state)])
-    self._state_types = nest.pack_sequence_as(
-        self._initial_state,
-        [t.dtype for t in nest.flatten(self._initial_state)])
-
-    # Will be populated by calling `tf_scan_func`.
-    self._output_classes = None
-    self._output_shapes = None
-    self._output_types = None
+    self._state_structure = structure.Structure.from_value(self._initial_state)
 
     # Iteratively rerun the scan function until reaching a fixed point on
     # `self._state_shapes`.
@@ -70,9 +59,8 @@ class _ScanDataset(dataset_ops.UnaryDataset):
       wrapped_func = dataset_ops.StructuredFunctionWrapper(
           scan_func,
           self._transformation_name(),
-          input_classes=(self._state_classes, input_dataset.output_classes),
-          input_shapes=(self._state_shapes, input_dataset.output_shapes),
-          input_types=(self._state_types, input_dataset.output_types),
+          input_structure=structure.NestedStructure(
+              (self._state_structure, input_dataset._element_structure)),  # pylint: disable=protected-access
           add_to_graph=False)
       if not (
           isinstance(wrapped_func.output_types, collections.Sequence) and
@@ -83,29 +71,35 @@ class _ScanDataset(dataset_ops.UnaryDataset):
       new_state_classes, self._output_classes = wrapped_func.output_classes
 
       # Extract and validate class information from the returned values.
-      for new_state_class, state_class in zip(
+      new_state_classes, output_classes = wrapped_func.output_classes
+      old_state_classes = self._state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
+      for new_state_class, old_state_class in zip(
           nest.flatten(new_state_classes),
-          nest.flatten(self._state_classes)):
-        if not issubclass(new_state_class, state_class):
+          nest.flatten(old_state_classes)):
+        if not issubclass(new_state_class, old_state_class):
           raise TypeError(
               "The element classes for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_classes, new_state_classes))
+              (old_state_classes, new_state_classes))
 
       # Extract and validate type information from the returned values.
-      new_state_types, self._output_types = wrapped_func.output_types
-      for new_state_type, state_type in zip(
-          nest.flatten(new_state_types), nest.flatten(self._state_types)):
-        if new_state_type != state_type:
+      new_state_types, output_types = wrapped_func.output_types
+      old_state_types = self._state_structure._to_legacy_output_types()  # pylint: disable=protected-access
+      for new_state_type, old_state_type in zip(
+          nest.flatten(new_state_types), nest.flatten(old_state_types)):
+        if new_state_type != old_state_type:
           raise TypeError(
               "The element types for the new state must match the initial "
               "state. Expected %s; got %s." %
-              (self._state_types, new_state_types))
+              (old_state_types, new_state_types))
 
       # Extract shape information from the returned values.
-      new_state_shapes, self._output_shapes = wrapped_func.output_shapes
+      new_state_shapes, output_shapes = wrapped_func.output_shapes
+      old_state_shapes = self._state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
+      self._structure = structure.convert_legacy_structure(
+          output_types, output_shapes, output_classes)
 
-      flat_state_shapes = nest.flatten(self._state_shapes)
+      flat_state_shapes = nest.flatten(old_state_shapes)
       flat_new_state_shapes = nest.flatten(new_state_shapes)
       weakened_state_shapes = [
           original.most_specific_compatible_shape(new)
@@ -122,32 +116,34 @@ class _ScanDataset(dataset_ops.UnaryDataset):
           break
 
       if need_to_rerun:
-        self._state_shapes = nest.pack_sequence_as(self._state_shapes,
-                                                   weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # in this method.
+        self._state_structure = structure.convert_legacy_structure(
+            old_state_types,
+            nest.pack_sequence_as(old_state_shapes, weakened_state_shapes),
+            old_state_classes)
 
-    self._scan_func = wrapped_func.function
-    self._scan_func.add_to_graph(ops.get_default_graph())
+    self._scan_func = wrapped_func
+    self._scan_func.function.add_to_graph(ops.get_default_graph())
+
+  def _functions(self):
+    return [self._scan_func]
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
-    return gen_dataset_ops.scan_dataset(
+    # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
+    return gen_experimental_dataset_ops.experimental_scan_dataset(
         input_t,
-        nest.flatten(sparse.serialize_sparse_tensors(self._initial_state)),
-        self._scan_func.captured_inputs,
-        f=self._scan_func,
+        self._state_structure._to_tensor_list(self._initial_state),
+        self._scan_func.function.captured_inputs,
+        f=self._scan_func.function,
+        preserve_cardinality=True,
         **dataset_ops.flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "tf.data.experimental.scan()"
diff --git a/tensorflow/python/data/experimental/ops/shuffle_ops.py b/tensorflow/python/data/experimental/ops/shuffle_ops.py
index a4307212daf488deae986073264911fcf778588f..d12328a7145992880aedd939d7a02a8a12c61d4c 100644
--- a/tensorflow/python/data/experimental/ops/shuffle_ops.py
+++ b/tensorflow/python/data/experimental/ops/shuffle_ops.py
@@ -26,7 +26,7 @@ from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
+class _ShuffleAndRepeatDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that fuses `shuffle` and `repeat`."""
 
   def __init__(self, input_dataset, buffer_size, count=None, seed=None):
@@ -53,18 +53,6 @@ class _ShuffleAndRepeatDataset(dataset_ops.UnaryDataset):
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 @tf_export("data.experimental.shuffle_and_repeat")
 def shuffle_and_repeat(buffer_size, count=None, seed=None):
diff --git a/tensorflow/python/data/experimental/ops/sleep.py b/tensorflow/python/data/experimental/ops/sleep.py
index 7e7d370f702e2c0bc037a1e1455728c52d476327..2da832395b2e665168c1cd9cd7f52fb13e50c830 100644
--- a/tensorflow/python/data/experimental/ops/sleep.py
+++ b/tensorflow/python/data/experimental/ops/sleep.py
@@ -21,7 +21,7 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.ops import gen_experimental_dataset_ops
 
 
-class _SleepDataset(dataset_ops.UnaryDataset):
+class _SleepDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that sleeps before producing each upstream element."""
 
   def __init__(self, input_dataset, sleep_microseconds):
@@ -35,18 +35,6 @@ class _SleepDataset(dataset_ops.UnaryDataset):
         self._sleep_microseconds,
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 def sleep(sleep_microseconds):
   """Sleeps for `sleep_microseconds` before producing each input element.
diff --git a/tensorflow/python/data/experimental/ops/stats_aggregator.py b/tensorflow/python/data/experimental/ops/stats_aggregator.py
index 5274c816a49bf70bf25b18cf7d981b90e100ba10..d5fcc033ab7df34369e0680275df744c431ed069 100644
--- a/tensorflow/python/data/experimental/ops/stats_aggregator.py
+++ b/tensorflow/python/data/experimental/ops/stats_aggregator.py
@@ -17,7 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -47,7 +47,6 @@ class StatsAggregator(object):
   options = dataset_ops.Options()
   options.experimental_stats = tf.data.experimental.StatsOptions(aggregator)
   dataset = dataset.with_options(options)
-  iterator = dataset.make_one_shot_iterator()
   ```
 
   To get a protocol buffer summary of the currently aggregated statistics,
@@ -69,7 +68,7 @@ class StatsAggregator(object):
 
   def __init__(self):
     """Creates a `StatsAggregator`."""
-    self._resource = gen_dataset_ops.stats_aggregator_handle()
+    self._resource = ged_ops.experimental_stats_aggregator_handle()
 
   # TODO(b/116314787): Update this/add support for V2 summary API.
   def get_summary(self):
@@ -81,4 +80,4 @@ class StatsAggregator(object):
     Returns:
       A scalar string `tf.Tensor` that summarizes the aggregated statistics.
     """
-    return gen_dataset_ops.stats_aggregator_summary(self._resource)
+    return ged_ops.experimental_stats_aggregator_summary(self._resource)
diff --git a/tensorflow/python/data/experimental/ops/stats_ops.py b/tensorflow/python/data/experimental/ops/stats_ops.py
index ca2f5f2a887cc5e2650f5c2dd076ba4c84950d45..15a9d24546e950543cc3274dbead26178620b5ed 100644
--- a/tensorflow/python/data/experimental/ops/stats_ops.py
+++ b/tensorflow/python/data/experimental/ops/stats_ops.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -66,8 +66,10 @@ def bytes_produced_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.bytes_produced_stats_dataset,
-                         tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_bytes_produced_stats_dataset,
+        tag)
 
   return _apply_fn
 
@@ -89,12 +91,14 @@ def latency_stats(tag):
   """
 
   def _apply_fn(dataset):
-    return _StatsDataset(dataset, gen_dataset_ops.latency_stats_dataset, tag)
+    return _StatsDataset(
+        dataset,
+        gen_experimental_dataset_ops.experimental_latency_stats_dataset, tag)
 
   return _apply_fn
 
 
-class _StatsDataset(dataset_ops.UnaryDataset):
+class _StatsDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and also records statistics."""
 
   def __init__(self, input_dataset, op_function, tag):
@@ -108,15 +112,3 @@ class _StatsDataset(dataset_ops.UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._tag,
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
diff --git a/tensorflow/python/data/experimental/ops/stats_options.py b/tensorflow/python/data/experimental/ops/stats_options.py
index c088d3d8881a23bc58742aa64b8368601503f058..6e884aa08ae9173df0fda0e81e176644cd342bfa 100644
--- a/tensorflow/python/data/experimental/ops/stats_options.py
+++ b/tensorflow/python/data/experimental/ops/stats_options.py
@@ -20,25 +20,24 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.experimental.ops import stats_aggregator
+from tensorflow.python.data.util import options
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export("data.experimental.StatsOptions")
-class StatsOptions(object):
+class StatsOptions(options.OptionsBase):
   """Represents options for collecting dataset stats using `StatsAggregator`.
 
   To apply `StatsOptions` with a `tf.data.Dataset` object, use the following
   pattern:
 
   ```python
-  aggretator = tf.data.experimental.StatsAggregator()
+  aggregator = tf.data.experimental.StatsAggregator()
 
-  options = dataset_ops.Options()
+  options = tf.data.Options()
   options.experimental_stats = tf.data.experimental.StatsOptions()
   options.experimental_stats.aggregator = aggregator
   dataset = dataset.with_options(options)
-
-  iterator = dataset.make_one_shot_iterator()
   ```
 
   Note: a `StatsAggregator` object can be attached either duing construction or
@@ -52,52 +51,29 @@ class StatsOptions(object):
   ```
   """
 
-  for _name, _ty, _default, _docstring in [
-      ("aggregator", stats_aggregator.StatsAggregator, None,
-       "Associate the given statistics options with the dataset pipeline."),
-      ("prefix", str, "",
-       "Prefix to prepend all statistics recorded for the input `dataset` with."
-      ),
-      ("counter_prefix", str, "",
-       "Prefix for the statistics recorded as counter."),
-      ("latency_all_edges", bool, True,
-       "Whether to add latency measurements on all edges."),
-  ]:
-
-    def _make_getter(name):  # pylint: disable=no-self-argument
-
-      def getter(self):
-        return getattr(self, "_" + name)
-
-      return getter
-
-    def _make_setter(name, ty):  # pylint: disable=no-self-argument
-
-      def setter(self, value):
-        if not isinstance(value, ty):
-          raise TypeError(
-              "Attempting to set the option %s to incompatible value: %r when "
-              "it expects  %r" % (name, value, ty))
-        setattr(self, "_" + name, value)
-
-      return setter
-
-    vars()["_" + _name] = _default
-    vars()[_name] = property(
-        _make_getter(_name), _make_setter(_name, _ty), _default, _docstring)
-
-  def __init__(self, aggregator=None):
-    if aggregator:
-      self.aggregator = aggregator
-
-  def __eq__(self, other):
-    if isinstance(other, self.__class__):
-      return self.__dict__ == other.__dict__
-    else:
-      return False
-
-  def __ne__(self, other):
-    return not self.__eq__(other)
-
-  def __str__(self):
-    return str(self.__dict__)
+  aggregator = options.create_option(
+      name="aggregator",
+      ty=stats_aggregator.StatsAggregator,
+      docstring=
+      "Associates the given statistics aggregator with the dataset pipeline.")
+
+  prefix = options.create_option(
+      name="prefix",
+      ty=str,
+      docstring=
+      "Prefix to prepend all statistics recorded for the input `dataset` with.",
+      default="")
+
+  counter_prefix = options.create_option(
+      name="counter_prefix",
+      ty=str,
+      docstring=
+      "Prefix for the statistics recorded as counter.",
+      default="")
+
+  latency_all_edges = options.create_option(
+      name="latency_all_edges",
+      ty=bool,
+      docstring=
+      "Whether to add latency measurements on all edges.",
+      default=True)
diff --git a/tensorflow/python/data/experimental/ops/threading_options.py b/tensorflow/python/data/experimental/ops/threading_options.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbf662186f818a24a3b19ea678f87351ab45ed6e
--- /dev/null
+++ b/tensorflow/python/data/experimental/ops/threading_options.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Experimental API for controlling threading in `tf.data` pipelines."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+from tensorflow.python.data.util import options
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("data.experimental.ThreadingOptions")
+class ThreadingOptions(options.OptionsBase):
+  """Represents options for dataset threading.
+
+  To apply `ThreadingOptions` to a `dataset` object, use the following pattern:
+
+  ```python
+  options = tf.data.Options()
+  options.experimental_threading = tf.data.experimental.ThreadingOptions()
+  options.experimental_threading.private_threadpool_size = 10
+  dataset = dataset.with_options(options)
+  ```
+  """
+
+  max_intra_op_parallelism = options.create_option(
+      name="max_intra_op_parallelism",
+      ty=int,
+      docstring=
+      "If set, it overrides the maximum degree of intra-op parallelism.")
+
+  private_threadpool_size = options.create_option(
+      name="private_threadpool_size",
+      ty=int,
+      docstring=
+      "If set, the dataset will use a private threadpool of the given size.",
+      default=None)
diff --git a/tensorflow/python/data/experimental/ops/threadpool.py b/tensorflow/python/data/experimental/ops/threadpool.py
index 3ea017c6e80a1a22a6bd82770db1952aebd38849..69e8829d687fb54767bca1716c259efa150b4887 100644
--- a/tensorflow/python/data/experimental/ops/threadpool.py
+++ b/tensorflow/python/data/experimental/ops/threadpool.py
@@ -60,7 +60,7 @@ class PrivateThreadPool(object):
           display_name=display_name)
 
 
-class _ThreadPoolDataset(dataset_ops.UnaryDataset):
+class _ThreadPoolDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and sets a custom threadpool."""
 
   def __init__(self, input_dataset, thread_pool):
@@ -74,18 +74,6 @@ class _ThreadPoolDataset(dataset_ops.UnaryDataset):
         self._thread_pool._resource,  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
 
 # TODO(b/73383364): Properly export in the `tf.data.experimental` API when
 # stable or make private / remove.
diff --git a/tensorflow/python/data/experimental/ops/unique.py b/tensorflow/python/data/experimental/ops/unique.py
index 2a7775c456e86a9339cdfccf1e05f545238bb145..55ed98d8542187b1bd353e2ca581ef2fd2180875 100644
--- a/tensorflow/python/data/experimental/ops/unique.py
+++ b/tensorflow/python/data/experimental/ops/unique.py
@@ -48,7 +48,7 @@ def unique():
   return _apply_fn
 
 
-class _UniqueDataset(dataset_ops.UnaryDataset):
+class _UniqueDataset(dataset_ops.UnaryUnchangedStructureDataset):
   """A `Dataset` contains the unique elements from its input."""
 
   def __init__(self, input_dataset):
@@ -65,15 +65,3 @@ class _UniqueDataset(dataset_ops.UnaryDataset):
     return gen_experimental_dataset_ops.experimental_unique_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **dataset_ops.flat_structure(self))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
diff --git a/tensorflow/python/data/experimental/ops/writers.py b/tensorflow/python/data/experimental/ops/writers.py
index 994447cb4db352432e6f2a672c45ba8242930126..aef6da51409dbe13f59408b650fc5947f088d89d 100644
--- a/tensorflow/python/data/experimental/ops/writers.py
+++ b/tensorflow/python/data/experimental/ops/writers.py
@@ -22,7 +22,7 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -48,7 +48,7 @@ class TFRecordWriter(object):
     Returns:
       A `tf.Operation` that, when run, writes contents of `dataset` to a file.
     """
-    if not isinstance(dataset, dataset_ops.Dataset):
+    if not isinstance(dataset, dataset_ops.DatasetV2):
       raise TypeError("`dataset` must be a `tf.data.Dataset` object.")
     if (dataset.output_types != dtypes.string or
         dataset.output_shapes != tensor_shape.scalar()):
@@ -56,5 +56,5 @@ class TFRecordWriter(object):
           "`dataset` must produce scalar `DT_STRING` tensors whereas it "
           "produces shape {0} and types {1}".format(dataset.output_shapes,
                                                     dataset.output_types))
-    return gen_dataset_ops.dataset_to_tf_record(
+    return gen_experimental_dataset_ops.experimental_dataset_to_tf_record(
         dataset._as_variant_tensor(), self._filename, self._compression_type)  # pylint: disable=protected-access
diff --git a/tensorflow/python/data/kernel_tests/BUILD b/tensorflow/python/data/kernel_tests/BUILD
index 21eed2b070a70c13658246fda5693c8c0a4e9573..3390100bed5c6dbe937d26f008d794c0fbf3a753 100644
--- a/tensorflow/python/data/kernel_tests/BUILD
+++ b/tensorflow/python/data/kernel_tests/BUILD
@@ -10,48 +10,46 @@ load("//tensorflow:tensorflow.bzl", "tf_py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 tf_py_test(
-    name = "batch_dataset_op_test",
+    name = "batch_test",
     size = "small",
-    srcs = ["batch_dataset_op_test.py"],
+    srcs = ["batch_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
 tf_py_test(
-    name = "cache_dataset_op_test",
+    name = "cache_test",
     size = "small",
-    srcs = ["cache_dataset_op_test.py"],
+    srcs = ["cache_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
         "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 tf_py_test(
-    name = "concatenate_dataset_op_test",
+    name = "concatenate_test",
     size = "small",
-    srcs = ["concatenate_dataset_op_test.py"],
+    srcs = ["concatenate_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
@@ -64,89 +62,96 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "dataset_constructor_op_test",
+    name = "dataset_checkpoint_test",
     size = "small",
-    srcs = ["dataset_constructor_op_test.py"],
+    srcs = ["dataset_checkpoint_test.py"],
     additional_deps = [
         ":test_base",
-        "//third_party/py/numpy",
-        "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:random_ops",
-        "//tensorflow/python:resource_variable_ops",
-        "//tensorflow/python:session",
-        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:platform",
         "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:nest",
-        "//tensorflow/python/data/util:sparse",
-    ],
-    tags = [
-        "manual",
-        "nomac",  # b/62040583
+        "//tensorflow/python:variables",
     ],
 )
 
 tf_py_test(
-    name = "dataset_from_generator_op_test",
-    size = "medium",
-    srcs = ["dataset_from_generator_op_test.py"],
+    name = "dataset_test",
+    size = "small",
+    srcs = ["dataset_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/util:sparse",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:sparse_tensor",
     ],
 )
 
 tf_py_test(
-    name = "dataset_ops_test",
+    name = "filter_test",
     size = "small",
-    srcs = ["dataset_ops_test.py"],
+    srcs = ["filter_test.py"],
     additional_deps = [
         ":test_base",
-        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
 tf_py_test(
-    name = "filter_dataset_op_test",
+    name = "fixed_length_record_dataset_test",
     size = "small",
-    srcs = ["filter_dataset_op_test.py"],
+    srcs = ["fixed_length_record_dataset_test.py"],
     additional_deps = [
         ":test_base",
-        "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
     ],
 )
 
 tf_py_test(
-    name = "flat_map_dataset_op_test",
-    size = "small",
-    srcs = ["flat_map_dataset_op_test.py"],
+    name = "flat_map_test",
+    size = "medium",
+    srcs = ["flat_map_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/data/util:nest",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:errors",
         "//tensorflow/python:session",
@@ -159,58 +164,157 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "list_files_dataset_op_test",
+    name = "from_generator_test",
+    size = "medium",
+    srcs = ["from_generator_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
+    ],
+)
+
+tf_py_test(
+    name = "from_sparse_tensor_slices_test",
     size = "small",
-    srcs = ["list_files_dataset_op_test.py"],
+    srcs = ["from_sparse_tensor_slices_test.py"],
     additional_deps = [
         ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:util",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
     ],
 )
 
 tf_py_test(
-    name = "inputs_test",
+    name = "from_tensors_test",
     size = "small",
-    srcs = ["inputs_test.py"],
+    srcs = ["from_tensors_test.py"],
     additional_deps = [
         ":test_base",
-        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+    tags = [
+        "nomac",  # b/62040583
     ],
 )
 
 tf_py_test(
-    name = "interleave_dataset_op_test",
+    name = "from_tensor_slices_test",
     size = "small",
-    srcs = ["interleave_dataset_op_test.py"],
+    srcs = ["from_tensor_slices_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "interleave_test",
+    size = "medium",
+    srcs = ["interleave_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:session",
+        "//tensorflow/python:script_ops",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
-        "//tensorflow/python:training",
+    ],
+)
+
+tf_py_test(
+    name = "iterator_checkpoint_test",
+    size = "medium",
+    srcs = ["iterator_checkpoint_test.py"],
+    additional_deps = [
+        ":test_base",
         "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:util",
+        "//tensorflow/python:checkpoint_management",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
     ],
+    grpc_enabled = True,
 )
 
-cuda_py_test(
-    name = "iterator_ops_test",
+tf_py_test(
+    name = "iterator_cluster_test",
     size = "small",
-    srcs = ["iterator_ops_test.py"],
+    srcs = ["iterator_cluster_test.py"],
+    additional_deps = [
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:function",
+        "//tensorflow/python:functional_ops",
+        "//tensorflow/python:lookup_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:session",
+        "//tensorflow/python:string_ops",
+    ],
+    grpc_enabled = True,
+    tags = [
+        "no_oss",  # Test flaky due to port collisions.
+        "no_windows",
+    ],
+)
+
+cuda_py_test(
+    name = "iterator_test",
+    size = "medium",
+    srcs = ["iterator_test.py"],
     additional_deps = [
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
@@ -249,41 +353,30 @@ cuda_py_test(
 )
 
 tf_py_test(
-    name = "iterator_ops_cluster_test",
+    name = "list_files_test",
     size = "small",
-    srcs = ["iterator_ops_cluster_test.py"],
+    srcs = ["list_files_test.py"],
     additional_deps = [
-        "//tensorflow/core:protos_all_py",
+        ":test_base",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python:function",
-        "//tensorflow/python:functional_ops",
-        "//tensorflow/python:session",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:string_ops",
-        "//tensorflow/python:lookup_ops",
-    ],
-    grpc_enabled = True,
-    tags = [
-        "no_oss",  # Test flaky due to port collisions.
-        "no_windows",
     ],
 )
 
 tf_py_test(
-    name = "map_dataset_op_test",
-    size = "small",
-    srcs = ["map_dataset_op_test.py"],
+    name = "map_test",
+    size = "medium",
+    srcs = ["map_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:constant_op",
@@ -297,27 +390,12 @@ tf_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:script_ops",
+        "//tensorflow/python:session",
         "//tensorflow/python:sparse_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_util",
         "//tensorflow/python:variable_scope",
-        "//tensorflow/python/data/ops:dataset_ops",
-    ],
-)
-
-tf_py_test(
-    name = "matching_files_dataset_op_test",
-    size = "small",
-    srcs = ["matching_files_dataset_op_test.py"],
-    additional_deps = [
-        ":test_base",
-        "//third_party/py/numpy",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:dataset_ops",
     ],
 )
 
@@ -332,6 +410,7 @@ cuda_py_test(
         "//tensorflow/python/data/ops:multi_device_iterator_ops",
         "//tensorflow/python/data/ops:iterator_ops",
         "//tensorflow/python/data/experimental/ops:optimization",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
@@ -345,9 +424,9 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "optional_ops_test",
+    name = "optional_test",
     size = "small",
-    srcs = ["optional_ops_test.py"],
+    srcs = ["optional_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -366,70 +445,58 @@ cuda_py_test(
 )
 
 tf_py_test(
-    name = "prefetch_dataset_op_test",
+    name = "padded_batch_test",
     size = "small",
-    srcs = ["prefetch_dataset_op_test.py"],
+    srcs = ["padded_batch_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:dataset_ops_gen",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
     ],
 )
 
 tf_py_test(
-    name = "range_dataset_op_test",
+    name = "prefetch_test",
     size = "small",
-    srcs = ["range_dataset_op_test.py"],
+    srcs = ["prefetch_test.py"],
     additional_deps = [
         ":test_base",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:platform",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:variables",
-        "//tensorflow/python/data/ops:dataset_ops",
-        "//tensorflow/python/data/ops:iterator_ops",
     ],
 )
 
 tf_py_test(
-    name = "reader_dataset_ops_test",
+    name = "range_test",
     size = "small",
-    srcs = ["reader_dataset_ops_test.py"],
+    srcs = ["range_test.py"],
     additional_deps = [
         ":test_base",
-        "//tensorflow/python:array_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python:client_testlib",
-        "//tensorflow/python:constant_op",
-        "//tensorflow/python:dataset_ops_gen",
-        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
-        "//tensorflow/python:framework_ops",
-        "//tensorflow/python:io_ops",
-        "//tensorflow/python:lib",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:tensor_shape",
-        "//tensorflow/python:util",
-        "//tensorflow/python/data/ops:iterator_ops",
-        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python:framework_test_lib",
     ],
 )
 
 tf_py_test(
-    name = "reduce_dataset_op_test",
+    name = "reduce_test",
     size = "small",
-    srcs = ["reduce_dataset_op_test.py"],
+    srcs = ["reduce_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -437,7 +504,6 @@ tf_py_test(
         "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:dtypes",
-        "//tensorflow/python:errors",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python/data/ops:dataset_ops",
@@ -445,9 +511,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "sequence_dataset_op_test",
+    name = "repeat_test",
     size = "small",
-    srcs = ["sequence_dataset_op_test.py"],
+    srcs = ["repeat_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
@@ -460,9 +526,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "shard_dataset_op_test",
+    name = "shard_test",
     size = "small",
-    srcs = ["shard_dataset_op_test.py"],
+    srcs = ["shard_test.py"],
     additional_deps = [
         ":test_base",
         "//tensorflow/python:client_testlib",
@@ -472,9 +538,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "shuffle_dataset_op_test",
+    name = "shuffle_test",
     size = "small",
-    srcs = ["shuffle_dataset_op_test.py"],
+    srcs = ["shuffle_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -491,21 +557,91 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "skip_test",
+    size = "small",
+    srcs = ["skip_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "take_test",
+    size = "small",
+    srcs = ["take_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/data/ops:dataset_ops",
+    ],
+)
+
+tf_py_test(
+    name = "text_line_dataset_test",
+    size = "small",
+    srcs = ["text_line_dataset_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:util",
+    ],
+)
+
+tf_py_test(
+    name = "tf_record_dataset_test",
+    size = "small",
+    srcs = ["tf_record_dataset_test.py"],
+    additional_deps = [
+        ":test_base",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:iterator_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+    ],
+)
+
 py_library(
     name = "test_base",
     srcs = ["test_base.py"],
     deps = [
+        "//tensorflow/python:array_ops",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python/data/ops:dataset_ops",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/eager:context",
     ],
 )
 
 tf_py_test(
-    name = "window_dataset_op_test",
-    size = "small",
-    srcs = ["window_dataset_op_test.py"],
+    name = "window_test",
+    size = "medium",
+    srcs = ["window_test.py"],
     additional_deps = [
         ":test_base",
         "@absl_py//absl/testing:parameterized",
@@ -521,9 +657,9 @@ tf_py_test(
 )
 
 tf_py_test(
-    name = "zip_dataset_op_test",
+    name = "zip_test",
     size = "small",
-    srcs = ["zip_dataset_op_test.py"],
+    srcs = ["zip_test.py"],
     additional_deps = [
         ":test_base",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
deleted file mode 100644
index e8decb9ad0ecf7768f4bf0f77ff74f9b79bff791..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/batch_dataset_op_test.py
+++ /dev/null
@@ -1,515 +0,0 @@
-# -*- coding: utf-8 -*-
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class BatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('even', 28, 14, False),
-      ('uneven_with_remainder', 28, 15, False),
-      ('uneven_without_remainder', 28, 15, True),
-      ('empty', 0, 14, False),
-  )
-  def testBatchDataset(self, count, batch_size, drop_remainder):
-    """Tests the batch dataset logic for various input configurations.
-
-    Args:
-      count: the number of input elements
-      batch_size: the batch size
-      drop_remainder: whether a smaller batch size should be produced if batch
-        size does not divide number of inputs evenly
-    """
-
-    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
-    # RepeatDataset(count) -> BatchDataset(batch_size).
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count).batch(batch_size,
-                             drop_remainder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    if drop_remainder:
-      dim0 = batch_size
-    else:
-      dim0 = None
-    self.assertEqual([[dim0] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              count_t: count,
-              batch_size_t: batch_size,
-              drop_remainder_t: drop_remainder
-          })
-      num_full_batches = (count * 7) // batch_size
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(batch_size):
-            self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
-                                result_component[j])
-      if not drop_remainder and (count * 7) % batch_size > 0:
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range((count * 7) % batch_size):
-            self.assertAllEqual(
-                component[(num_full_batches * batch_size + j) % 7]**2,
-                result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchDatasetInvalidBatchSize(self):
-    iterator = (dataset_ops.Dataset.range(10).batch(0).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(get_next)
-
-  def testBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchSparseWithDifferentDenseShapes(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=array_ops.expand_dims(
-              math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
-          dense_shape=[i])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(
-        5).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(2):
-        actual = sess.run(get_next)
-        expected_indices = []
-        expected_values = []
-        for j in range(5):
-          for k in range(i * 5 + j):
-            expected_indices.append([j, k])
-            expected_values.append(i * 5 + j)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=expected_indices,
-            values=expected_values,
-            dense_shape=[5, (i + 1) * 5 - 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedBatchSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(
-        2).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
-                   [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
-          values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
-          dense_shape=[2, 5, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testBatchShapeError(self):
-
-    def generator():
-      yield [1.0, 2.0, 3.0]
-      yield [4.0, 5.0, 6.0]
-      yield [7.0, 8.0, 9.0, 10.0]
-
-    iterator = (
-        dataset_ops.Dataset.from_generator(
-            generator, dtypes.float32, output_shapes=[None]).batch(3)
-        .make_initializable_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'Cannot batch tensors with different shapes in component 0. '
-          r'First element had shape \[3\] and element 2 had shape \[4\].'):
-        sess.run(next_element)
-
-
-def _random_seq_lens(count):
-  return np.random.randint(20, size=(count,)).astype(np.int32)
-
-
-class PaddedBatchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ('default_padding', _random_seq_lens(32), 4, [-1], False),
-      ('constant_padding', _random_seq_lens(32), 4, [25], False),
-      ('uneven_with_remainder', _random_seq_lens(34), 4, [-1], False),
-      ('uneven_without_remainder', _random_seq_lens(34), 4, [-1], True),
-  )
-  def testPaddedBatchDataset(self, seq_lens, batch_size, padded_shapes,
-                             drop_remainder):
-    """Tests the padded batch dataset logic for various input configurations.
-
-    Args:
-      seq_lens: the input sequence lengths
-      batch_size: the batch size
-      padded_shapes: the padded shapes to use
-      drop_remainder: whether a smaller batch size should be produced if batch
-        size does not divide number of inputs evenly
-    """
-
-    seq_lens_t = array_ops.placeholder(dtypes.int32, shape=[None])
-    batch_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    padded_shapes_t = array_ops.placeholder(dtypes.int64, shape=[1])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens_t)
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=batch_size_t,
-            drop_remainder=drop_remainder_t,
-            padded_shapes=padded_shapes_t).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              seq_lens_t: seq_lens,
-              batch_size_t: batch_size,
-              padded_shapes_t: padded_shapes,
-              drop_remainder_t: drop_remainder,
-          })
-
-      num_full_batches = len(seq_lens) // batch_size
-
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        padded_len = padded_shapes[0]
-        if padded_len is None or padded_len == -1:
-          padded_len = np.max(result) if result.size > 0 else 0
-        self.assertEqual((batch_size, padded_len), result.shape)
-        for j in range(batch_size):
-          seq_len = seq_lens[(i * batch_size) + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:],
-                              [0] * (padded_len - seq_len))
-
-      if not drop_remainder and len(seq_lens) % batch_size > 0:
-        result = sess.run(get_next)
-        padded_len = np.max(result) if result.size > 0 else 0
-        self.assertEqual((len(seq_lens) % batch_size, padded_len),
-                         result.shape)
-        for j in range(len(seq_lens) % batch_size):
-          seq_len = seq_lens[num_full_batches * batch_size + j]
-          self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[j, seq_len:],
-                              [0] * (padded_len - seq_len))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchShortPadding(self):
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([6, 5, 5, 5, 5])
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=4, padded_shapes=[5]).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.DataLossError):
-        sess.run(get_next)
-
-  def testPaddedBatchEmptyTensors(self):
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([0, 0, 0, 0])
-        .map(lambda x: array_ops.fill([x], x)).padded_batch(
-            batch_size=4, padded_shapes=[-1]).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      result = sess.run(get_next)
-      self.assertAllEqual([[], [], [], []], result)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetNonDefaultPadding(self):
-    seq_lens = array_ops.placeholder(dtypes.int32, shape=[None])
-    padded_shape = array_ops.placeholder(dtypes.int64, shape=[1])
-
-    def fill_tuple(x):
-      filled = array_ops.fill([x], x)
-      return (filled, string_ops.as_string(filled))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(seq_lens).map(fill_tuple)
-        .padded_batch(
-            4,
-            padded_shapes=(padded_shape, padded_shape),
-            padding_values=(-1, '<end>')).make_initializable_iterator())
-
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Test with random sequence lengths, and max padding.
-      random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
-      sess.run(
-          init_op, feed_dict={
-              padded_shape: [-1],
-              seq_lens: random_seq_lens
-          })
-      for i in range(8):
-        result = sess.run(get_next)
-        padded_len = np.max(result[0])
-        self.assertEqual((4, padded_len), result[0].shape)
-        self.assertEqual((4, padded_len), result[1].shape)
-        for j in range(4):
-          seq_len = random_seq_lens[(i * 4) + j]
-          self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
-          self.assertAllEqual(result[0][j, seq_len:],
-                              [-1] * (padded_len - seq_len))
-          self.assertAllEqual(result[1][j, :seq_len],
-                              [compat.as_bytes(str(seq_len))] * seq_len)
-          self.assertAllEqual(result[1][j, seq_len:],
-                              [b'<end>'] * (padded_len - seq_len))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testPaddedBatchDatasetUnicode(self):
-    # See GitHub issue 16149
-    def generator():
-      data = [[u'Простой', u'тест', u'юникода'],
-              [u'никогда', u'не', u'бывает', u'простым']]
-
-      for seq in data:
-        yield seq, [0, 1, 2, 3]
-
-    dataset = dataset_ops.Dataset.from_generator(
-        generator, (dtypes.string, dtypes.int32),
-        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
-    padded_dataset = dataset.padded_batch(
-        2, padded_shapes=([None], [None]), padding_values=('', 0))
-    with self.cached_session() as sess:
-      next_element = padded_dataset.make_one_shot_iterator().get_next()
-      sess.run(next_element)
-
-  def testPaddedBatchDatasetShapeSpecifications(self):
-    int_placeholder = array_ops.placeholder(dtypes.int32)
-    float_placeholder = array_ops.placeholder(dtypes.float32)
-    string_placeholder = array_ops.placeholder(dtypes.string)
-    input_dataset = dataset_ops.Dataset.from_tensors(
-        (int_placeholder, float_placeholder, string_placeholder))
-
-    # Test different ways of specifying the `padded_shapes` argument.
-    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
-        32,
-        padded_shapes=(tensor_shape.TensorShape([None]),
-                       tensor_shape.TensorShape([None, None]),
-                       tensor_shape.TensorShape([37])))
-    dynamic_padding_from_lists = input_dataset.padded_batch(
-        32, padded_shapes=([None], [None, None], [37]))
-    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
-        32, padded_shapes=([-1], [-1, -1], [37]))
-    dynamic_padding_from_tensors = input_dataset.padded_batch(
-        32,
-        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
-                       constant_op.constant([-1, -1], dtype=dtypes.int64),
-                       constant_op.constant([37], dtype=dtypes.int64)))
-
-    for dataset in [
-        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
-        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
-    ]:
-      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
-      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
-      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
-
-  def testPaddedBatchSparseError(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
-
-    with self.assertRaises(TypeError):
-      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
-
-  def testPaddedBatchShapeError(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      _ = dataset_ops.Dataset.range(10).padded_batch(5, padded_shapes=[1])
-
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(3,\).'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=[1])
-
-    with self.assertRaisesRegexp(
-        ValueError, r'Padded shape .* must be a 1-D tensor '
-        r'of tf.int64 values, but its shape was \(2, 2\).'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=[[1, 1], [1, 1]])
-
-    with self.assertRaisesRegexp(
-        TypeError, r'Padded shape .* must be a 1-D tensor '
-        r'of tf.int64 values, but its element type was float32.'):
-      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
-          5, padded_shapes=constant_op.constant([1., 2., 3.]))
-
-    with self.assertRaisesRegexp(
-        ValueError, r'The padded shape \(1,\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      shape_as_tensor = constant_op.constant([1], dtype=dtypes.int64)
-      _ = dataset_ops.Dataset.range(10).padded_batch(
-          5, padded_shapes=shape_as_tensor)
-
-    with self.assertRaisesRegexp(
-        ValueError,
-        r'The padded shape \((\?|None), (\?|None)\) is not compatible with the '
-        r'corresponding input component shape \(\).'):
-      shape_as_tensor = array_ops.placeholder(dtypes.int64, shape=[2])
-      _ = dataset_ops.Dataset.range(10).padded_batch(
-          5, padded_shapes=shape_as_tensor)
-
-
-class BatchDatasetBenchmark(test.Benchmark):
-
-  def benchmarkBatchSparse(self):
-    non_zeros_per_row_values = [0, 1, 5, 10, 100]
-    batch_size_values = [1, 32, 64, 128, 1024]
-
-    sparse_placeholder = array_ops.sparse_placeholder(dtype=dtypes.int64)
-    batch_size_placeholder = array_ops.placeholder(dtype=dtypes.int64, shape=[])
-
-    dataset = dataset_ops.Dataset.from_tensors(sparse_placeholder).repeat(
-        ).batch(batch_size_placeholder)
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    for non_zeros_per_row in non_zeros_per_row_values:
-
-      sparse_value = sparse_tensor.SparseTensorValue(
-          indices=np.arange(non_zeros_per_row, dtype=np.int64)[:, np.newaxis],
-          values=np.arange(non_zeros_per_row, dtype=np.int64),
-          dense_shape=[1000])
-
-      for batch_size in batch_size_values:
-
-        with session.Session() as sess:
-          sess.run(iterator.initializer, feed_dict={
-              sparse_placeholder: sparse_value,
-              batch_size_placeholder: batch_size})
-          # Run five steps to warm up the session caches before taking the
-          # first measurement.
-          for _ in range(5):
-            sess.run(next_element.indices.op)
-          deltas = []
-          for _ in range(100):
-            start = time.time()
-            for _ in range(100):
-              sess.run(next_element.indices.op)
-            end = time.time()
-            deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100.0
-
-        print('Batch sparse dataset non-zeros per row: %d batch_size: %d '
-              'wall time: %f'
-              % (non_zeros_per_row, batch_size, median_wall_time))
-        self.report_benchmark(
-            iters=10000, wall_time=median_wall_time,
-            name='benchmark_batch_sparse_dataset_nnz_%d_batch_size_%d' % (
-                non_zeros_per_row, batch_size))
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/batch_test.py b/tensorflow/python/data/kernel_tests/batch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b035e59173e6ee52be8ec0aab21c761093d07ce
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/batch_test.py
@@ -0,0 +1,173 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('even', 28, 14, False),
+      ('uneven_with_remainder', 28, 15, False),
+      ('uneven_without_remainder', 28, 15, True),
+      ('empty', 0, 14, False),
+  )
+  def testBatchDataset(self, count, batch_size, drop_remainder):
+    """Tests the batch dataset logic for various input configurations.
+
+    Args:
+      count: the number of input elements
+      batch_size: the batch size
+      drop_remainder: whether a smaller batch size should be produced if batch
+        size does not divide number of inputs evenly
+    """
+
+    # The pipeline is TensorSliceDataset -> MapDataset(square_3) ->
+    # RepeatDataset(count) -> BatchDataset(batch_size).
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count).batch(batch_size, drop_remainder)
+    get_next = self.getNext(dataset)
+
+    if drop_remainder:
+      dim0 = batch_size
+    else:
+      dim0 = None
+    self.assertEqual(
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)],
+        [[dim0] + list(c.shape[1:]) for c in components])
+
+    num_full_batches = (count * 7) // batch_size
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(batch_size):
+          self.assertAllEqual(component[(i * batch_size + j) % 7]**2,
+                              result_component[j])
+    if not drop_remainder and (count * 7) % batch_size > 0:
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range((count * 7) % batch_size):
+          self.assertAllEqual(
+              component[(num_full_batches * batch_size + j) % 7]**2,
+              result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      result = self.evaluate(get_next())
+
+  def testBatchDatasetInvalidBatchSize(self):
+    dataset = (dataset_ops.Dataset.range(10).batch(0))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ''))
+
+  def testBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5)
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 5, i * 5 + 1, i * 5 + 2, i * 5 + 3, i * 5 + 4],
+            dense_shape=[5, 1]) for i in range(2)
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testBatchSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5)
+    expected_output = []
+    for i in range(2):
+      expected_indices = []
+      expected_outputs = []
+      for j in range(5):
+        for k in range(i * 5 + j):
+          expected_indices.append([j, k])
+          expected_outputs.append(i * 5 + j)
+      expected_output.append(
+          sparse_tensor.SparseTensorValue(
+              indices=expected_indices,
+              values=expected_outputs,
+              dense_shape=[5, (i + 1) * 5 - 1]))
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testNestedBatchSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).batch(5).batch(2)
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [0, 4, 0],
+                     [1, 0, 0], [1, 1, 0], [1, 2, 0], [1, 3, 0], [1, 4, 0]],
+            values=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
+            dense_shape=[2, 5, 1])
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testBatchShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    dataset = (
+        dataset_ops.Dataset.from_generator(
+            generator, dtypes.float32, output_shapes=[None]).batch(3))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r'Cannot batch tensors with different shapes in component 0. First '
+            r'element had shape \[3\] and element 2 had shape \[4\].'))
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py b/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
deleted file mode 100644
index 63625fac03beeb3f8756bfa5c8e543fdc3488fc4..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/cache_dataset_op_test.py
+++ /dev/null
@@ -1,318 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import variables
-from tensorflow.python.platform import test
-
-
-class FileCacheDatasetTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-    self.cache_prefix = path.join(self.tmp_dir, "cache")
-
-  def tearDown(self):
-    if self.tmp_dir:
-      shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def testCacheDatasetPassthrough(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache(filename_placeholder)
-
-    self.assertEqual(
-        tuple([c.shape[1:] for c in components]), cache_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = iterator_ops.Iterator.from_structure(cache_dataset.output_types,
-                                                    cache_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_cache_op = iterator.make_initializer(cache_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # First run without caching to collect the "ground truth".
-      sess.run(init_fifo_op)
-      elements = []
-      for _ in range(20):
-        elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the cached dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_cache_op, feed_dict={filename_placeholder: self.cache_prefix})
-      cached_elements = []
-      for _ in range(20):
-        cached_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(elements, cached_elements)
-
-      # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
-      # if we didn't use the cache).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix
-          })
-      replayed_elements = []
-      for _ in range(20):
-        replayed_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(cached_elements, replayed_elements)
-
-      # Re-initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(
-          init_cache_op,
-          feed_dict={
-              count_placeholder: 0,
-              filename_placeholder: self.cache_prefix + "nonsense"
-          })
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentWriters(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(get_next1)  # this should succeed
-
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-      with self.assertRaises(errors.AlreadyExistsError):
-        sess.run(get_next2)
-
-      sess.run(get_next1)  # this should continue to succeed
-
-  def testConcurrentReaders(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-
-    cache_dataset1 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-    cache_dataset2 = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .cache(filename_placeholder))
-
-    iterator1 = cache_dataset1.make_initializable_iterator()
-    iterator2 = cache_dataset2.make_initializable_iterator()
-    init_cache_op1 = iterator1.initializer
-    init_cache_op2 = iterator2.initializer
-
-    get_next1 = iterator1.get_next()
-    get_next2 = iterator2.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      elements = []
-      for _ in range(4):
-        elements.append(sess.run(get_next1))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      # Re-initialize
-      sess.run(
-          init_cache_op1, feed_dict={filename_placeholder: self.cache_prefix})
-      sess.run(
-          init_cache_op2, feed_dict={filename_placeholder: self.cache_prefix})
-
-      # Reading concurrently should succeed.
-      elements_itr1 = []
-      elements_itr2 = []
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      # Intentionally reversing the order
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-      elements_itr1.append(sess.run(get_next1))
-      elements_itr2.append(sess.run(get_next2))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next2)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next1)
-
-      self.assertAllEqual(elements, elements_itr1)
-      self.assertAllEqual(elements, elements_itr2)
-
-
-class MemoryCacheDatasetTest(test_base.DatasetTestBase):
-
-  def testCacheDatasetPassthrough(self):
-    with ops.device("cpu:0"):
-      repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
-      dataset = dataset_ops.Dataset.range(3).flat_map(
-          lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
-
-      cached_dataset = dataset.cache().repeat(2)
-      uncached_dataset = dataset.repeat(2)
-
-      # Needs to be initializable to capture the variable.
-      cached_iterator = cached_dataset.make_initializable_iterator()
-      cached_next = cached_iterator.get_next()
-      uncached_iterator = uncached_dataset.make_initializable_iterator()
-      uncached_next = uncached_iterator.get_next()
-
-      with self.cached_session() as sess:
-
-        sess.run(repeat_count.initializer)
-        sess.run(cached_iterator.initializer)
-        sess.run(uncached_iterator.initializer)
-
-        for i in range(3):
-          for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
-            self.assertEqual(sess.run(uncached_next), i)
-
-        sess.run(repeat_count.assign(0))
-
-        # The uncached iterator should now be empty.
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(uncached_next)
-
-        # The cached iterator replays from cache.
-        for i in range(3):
-          for _ in range(10):
-            self.assertEqual(sess.run(cached_next), i)
-
-        # The cached iterator should now be empty.
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(cached_next)
-
-  def testEmptyCacheReading(self):
-    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-                  np.array([9.0, 10.0, 11.0, 12.0]))
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    cache_dataset = repeat_dataset.cache()
-
-    # Create initialization ops for iterators without and with
-    # caching, respectively.
-    iterator = cache_dataset.make_initializable_iterator()
-    init_cache_op = iterator.initializer
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Initialize with an empty upstream and a missing cache file (should
-      # throw errors.OutOfRangeError immediately).
-      sess.run(init_cache_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testConcurrentReaders(self):
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    dataset = dataset_ops.Dataset.range(count_placeholder).cache()
-    d1 = dataset.map(lambda x: x + 1)
-    d2 = dataset.map(lambda x: x + 6)
-
-    i1 = d1.make_initializable_iterator()
-    i2 = d2.make_initializable_iterator()
-
-    with self.cached_session() as sess:
-      sess.run(i1.initializer)
-
-      self.assertEqual(1, sess.run(i1.get_next()))
-      self.assertEqual(2, sess.run(i1.get_next()))
-      self.assertEqual(3, sess.run(i1.get_next()))
-
-      sess.run(i2.initializer, feed_dict={count_placeholder: 3})
-
-      self.assertEqual(6, sess.run(i2.get_next()))
-      self.assertEqual(7, sess.run(i2.get_next()))
-      self.assertEqual(4, sess.run(i1.get_next()))  # interleave execution
-      self.assertEqual([8, 5], sess.run([i2.get_next(), i1.get_next()]))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i1.get_next())
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(i2.get_next())
-
-  def testCacheTakeRepeat(self):
-    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
-    itr = dataset.make_one_shot_iterator()
-    n = itr.get_next()
-
-    expected_values = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
-
-    with self.cached_session() as sess:
-      for i, expected in enumerate(expected_values):
-        self.assertEqual(expected, sess.run(n),
-                         "Unexpected value at index %s" % i)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/cache_test.py b/tensorflow/python/data/kernel_tests/cache_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b561cd58baf732f557d518e7eb237ab00512acc1
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/cache_test.py
@@ -0,0 +1,253 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.cache()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import shutil
+import tempfile
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FileCacheTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+    self.cache_prefix = path.join(self.tmp_dir, "cache")
+
+  def tearDown(self):
+    if self.tmp_dir:
+      shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def testCacheDatasetPassthrough(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    def dataset_fn(count=5, filename=None):
+      repeat_dataset = (
+          dataset_ops.Dataset.from_tensor_slices(components).repeat(count))
+      if filename:
+        return repeat_dataset.cache(filename)
+      else:
+        return repeat_dataset
+
+    self.assertEqual(
+        tuple([c.shape[1:] for c in components]),
+        dataset_fn().output_shapes)
+
+    get_next = self.getNext(dataset_fn())
+
+    # First run without caching to collect the "ground truth".
+    elements = []
+    for _ in range(20):
+      elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Assert that the cached dataset has the same elements as the
+    # "ground truth".
+    get_next = self.getNext(dataset_fn(filename=self.cache_prefix))
+    cached_elements = []
+    for _ in range(20):
+      cached_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(elements, cached_elements)
+
+    # Re-initialize with an empty upstream (to throw errors.OutOfRangeError
+    # if we didn't use the cache).
+    get_next = self.getNext(dataset_fn(count=0, filename=self.cache_prefix))
+    replayed_elements = []
+    for _ in range(20):
+      replayed_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(cached_elements, replayed_elements)
+
+    # Re-initialize with an empty upstream and a missing cache file (should
+    # throw errors.OutOfRangeError immediately).
+    get_next = self.getNext(
+        dataset_fn(count=0, filename=self.cache_prefix + "nonsense"))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testConcurrentWriters(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    cache_dataset1 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+    cache_dataset2 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    self.evaluate(get_next1())  # this should succeed
+
+    with self.assertRaises(errors.AlreadyExistsError):
+      self.evaluate(get_next2())
+
+    self.evaluate(get_next1())  # this should continue to succeed
+
+  def testConcurrentReaders(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    cache_dataset1 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+    cache_dataset2 = (
+        dataset_ops.Dataset.from_tensor_slices(components).cache(
+            self.cache_prefix))
+
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    elements = []
+    for _ in range(4):
+      elements.append(self.evaluate(get_next1()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+    # Re-initialize
+    get_next1 = self.getNext(cache_dataset1)
+    get_next2 = self.getNext(cache_dataset2)
+
+    # Reading concurrently should succeed.
+    elements_itr1 = []
+    elements_itr2 = []
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    # Intentionally reversing the order
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+    elements_itr1.append(self.evaluate(get_next1()))
+    elements_itr2.append(self.evaluate(get_next2()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next2())
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+    self.assertAllEqual(elements, elements_itr1)
+    self.assertAllEqual(elements, elements_itr2)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MemoryCacheTest(test_base.DatasetTestBase):
+
+  def testCacheDatasetPassthrough(self):
+    with ops.device("cpu:0"):
+      repeat_count = variables.Variable(constant_op.constant(10, dtypes.int64))
+      dataset = dataset_ops.Dataset.range(3).flat_map(
+          lambda x: dataset_ops.Dataset.from_tensors(x).repeat(repeat_count))
+
+      cached_dataset = dataset.cache().repeat(2)
+      uncached_dataset = dataset.repeat(2)
+
+      self.evaluate(repeat_count.initializer)
+      # Needs to be initializable to capture the variable.
+      cached_next = self.getNext(cached_dataset, requires_initialization=True)
+      uncached_next = self.getNext(
+          uncached_dataset, requires_initialization=True)
+      for i in range(3):
+        for _ in range(10):
+          self.assertEqual(self.evaluate(cached_next()), i)
+          self.assertEqual(self.evaluate(uncached_next()), i)
+
+      self.evaluate(repeat_count.assign(0))
+
+      # The uncached iterator should now be empty.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(uncached_next())
+
+      # The cached iterator replays from cache.
+      for i in range(3):
+        for _ in range(10):
+          self.assertEqual(self.evaluate(cached_next()), i)
+
+      # The cached iterator should now be empty.
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(cached_next())
+
+  def testEmptyCacheReading(self):
+    components = (np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+                  np.array([9.0, 10.0, 11.0, 12.0]))
+
+    repeat_dataset = (
+        dataset_ops.Dataset.from_tensor_slices(components).repeat(0))
+    cache_dataset = repeat_dataset.cache()
+
+    # Create initialization ops for iterators without and with
+    # caching, respectively.
+    self.assertDatasetProduces(cache_dataset, expected_output=[])
+
+  def testConcurrentReaders(self):
+
+    dataset = dataset_ops.Dataset.range(5).cache()
+    d1 = dataset.map(lambda x: x + 1)
+    d2 = dataset.map(lambda x: x + 6)
+
+    get_next1 = self.getNext(d1)
+
+    self.assertEqual(1, self.evaluate(get_next1()))
+    self.assertEqual(2, self.evaluate(get_next1()))
+    self.assertEqual(3, self.evaluate(get_next1()))
+
+    get_next2 = self.getNext(d2)
+
+    self.assertEqual(6, self.evaluate(get_next2()))
+    self.assertEqual(7, self.evaluate(get_next2()))
+    self.assertEqual(4, self.evaluate(get_next1()))  # interleave execution
+    self.assertEqual([8, 5],
+                     [self.evaluate(get_next2()),
+                      self.evaluate(get_next1())])
+    self.assertEqual(9, self.evaluate(get_next2()))
+    self.assertEqual(10, self.evaluate(get_next2()))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next2())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next1())
+
+  def testCacheTakeRepeat(self):
+    dataset = dataset_ops.Dataset.range(10).cache().take(5).repeat(2)
+
+    expected_output = [0, 1, 2, 3, 4, 0, 1, 2, 3, 4]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py b/tensorflow/python/data/kernel_tests/concatenate_test.py
similarity index 75%
rename from tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/concatenate_test.py
index 83af31f380efabc0d8654668a9a81d5789b8eeb1..5d8bfdc8f3afc2aed265f3907c22ff442ba590c4 100644
--- a/tensorflow/python/data/kernel_tests/concatenate_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/concatenate_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.concatenate()."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -24,10 +24,12 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import nest
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class ConcatenateDatasetTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class ConcatenateTest(test_base.DatasetTestBase):
 
   def testConcatenateDataset(self):
     input_components = (
@@ -46,23 +48,19 @@ class ConcatenateDatasetTest(test_base.DatasetTestBase):
     self.assertEqual(concatenated.output_shapes, (tensor_shape.TensorShape(
         [20]), tensor_shape.TensorShape([15]), tensor_shape.TensorShape([])))
 
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(concatenated)
+
+    for i in range(9):
+      result = self.evaluate(get_next())
+      if i < 4:
+        for component, result_component in zip(input_components, result):
+          self.assertAllEqual(component[i], result_component)
+      else:
+        for component, result_component in zip(to_concatenate_components,
+                                               result):
+          self.assertAllEqual(component[i - 4], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testConcatenateDatasetDifferentShape(self):
     input_components = (
@@ -79,24 +77,18 @@ class ConcatenateDatasetTest(test_base.DatasetTestBase):
     self.assertEqual(
         [ts.as_list()
          for ts in nest.flatten(concatenated.output_shapes)], [[20], [None]])
-
-    iterator = concatenated.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(9):
-        result = sess.run(get_next)
-        if i < 4:
-          for component, result_component in zip(input_components, result):
-            self.assertAllEqual(component[i], result_component)
-        else:
-          for component, result_component in zip(to_concatenate_components,
-                                                 result):
-            self.assertAllEqual(component[i - 4], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    get_next = self.getNext(concatenated)
+    for i in range(9):
+      result = self.evaluate(get_next())
+      if i < 4:
+        for component, result_component in zip(input_components, result):
+          self.assertAllEqual(component[i], result_component)
+      else:
+        for component, result_component in zip(to_concatenate_components,
+                                               result):
+          self.assertAllEqual(component[i - 4], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testConcatenateDatasetDifferentStructure(self):
     input_components = (
diff --git a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
similarity index 84%
rename from tensorflow/python/data/kernel_tests/range_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
index b71e6b2ea43a198cd7c82f6dcb8f0b4926980d3e..6dcd94ea0207a53be1e3444db2a3e6643b8841ed 100644
--- a/tensorflow/python/data/kernel_tests/range_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_checkpoint_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test RangeDataset."""
+"""Checkpoint tests for `tf.data.Dataset`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,7 +26,6 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
@@ -35,51 +34,7 @@ from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
 
 
-@test_util.run_all_in_graph_and_eager_modes
-class RangeDatasetTest(test_base.DatasetTestBase):
-
-  def testStop(self):
-    dataset = dataset_ops.Dataset.range(5)
-    self.assertDatasetProduces(dataset, expected_output=range(5))
-
-  def testStartStop(self):
-    start, stop = 2, 5
-    dataset = dataset_ops.Dataset.range(start, stop)
-    self.assertDatasetProduces(dataset, expected_output=range(2, 5))
-
-  def testStartStopStep(self):
-    start, stop, step = 2, 10, 2
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(dataset, expected_output=range(2, 10, 2))
-
-  def testZeroStep(self):
-    start, stop, step = 2, 10, 0
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(
-        dataset, expected_err=(errors.InvalidArgumentError, ""))
-
-  def testNegativeStep(self):
-    start, stop, step = 2, 10, -1
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(dataset, expected_output=range(2, 10, -1))
-
-  def testStopLessThanStart(self):
-    start, stop = 10, 2
-    dataset = dataset_ops.Dataset.range(start, stop)
-    self.assertDatasetProduces(dataset, expected_output=range(10, 2))
-
-  def testStopLessThanStartWithPositiveStep(self):
-    start, stop, step = 10, 2, 2
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(dataset, expected_output=range(10, 2, 2))
-
-  def testStopLessThanStartWithNegativeStep(self):
-    start, stop, step = 10, 2, -1
-    dataset = dataset_ops.Dataset.range(start, stop, step)
-    self.assertDatasetProduces(dataset, expected_output=range(10, 2, -1))
-
-
-class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
+class DatasetCheckpointTest(test_base.DatasetTestBase):
 
   def tearDown(self):
     # Remove all checkpoint files.
@@ -109,8 +64,8 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
   def testSaveRestore(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -159,7 +114,7 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop, num_epochs):
       dataset = dataset_ops.Dataset.range(start, stop).repeat(num_epochs)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -206,7 +161,7 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop):
       dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -245,7 +200,7 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
 
     def _build_graph(start, stop):
       dataset = dataset_ops.Dataset.range(start, stop)
-      iterator = dataset.make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(dataset)
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -278,8 +233,8 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
   def testMultipleSaves(self):
 
     def _build_graph(start, stop):
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -321,8 +276,8 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
   def testSaveRestoreWithRepeat(self):
 
     def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
@@ -366,8 +321,8 @@ class ExperimentalCheckpointDatasetTest(test_base.DatasetTestBase):
   def testSaveRestoreExhaustedIterator(self):
 
     def _build_graph(start, stop, num_epochs):
-      iterator = dataset_ops.Dataset.range(
-          start, stop).repeat(num_epochs).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop).repeat(num_epochs))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = self._save_op(iterator._iterator_resource)
diff --git a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py b/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
deleted file mode 100644
index bc6b36285aa417e6812e44e97e4f3a30ceb8e6a0..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/dataset_constructor_op_test.py
+++ /dev/null
@@ -1,650 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.platform import test
-
-
-class DatasetConstructorTest(test_base.DatasetTestBase):
-
-  def testFromTensors(self):
-    """Test a dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsSparse(self):
-    """Test a dataset that represents a single tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0]]),
-        values=np.array([0]),
-        dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorsMixed(self):
-    """Test an dataset that represents a single tuple of tensors."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0]]),
-                      values=np.array([0]),
-                      dense_shape=np.array([1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1]]),
-                      values=np.array([-1, 1]),
-                      dense_shape=np.array([2, 2])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape)
-        if sparse_tensor.is_sparse(c) else c.shape for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      results = sess.run(get_next)
-      for component, result_component in zip(components, results):
-        if sparse_tensor.is_sparse(component):
-          self.assertSparseValuesEqual(component, result_component)
-        else:
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlices(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (
-        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
-            np.array([[12], [13], [14], [15]]), 22),
-        np.array([37.0, 38.0, 39.0, 40.0])
-    )
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesSparse(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (sparse_tensor.SparseTensorValue(
-        indices=np.array([[0, 0], [1, 0], [2, 0]]),
-        values=np.array([0, 0, 0]),
-        dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(
-        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
-        [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(expected[i], results):
-          self.assertSparseValuesEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesMixed(self):
-    """Test a dataset that represents the slices from a tuple of tensors."""
-    components = (np.tile(np.array([[1], [2], [3]]), 20),
-                  np.tile(np.array([[12], [13], [14]]), 22),
-                  np.array([37.0, 38.0, 39.0]),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
-                      values=np.array([0, 0, 0]),
-                      dense_shape=np.array([3, 1])),
-                  sparse_tensor.SparseTensorValue(
-                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
-                      values=np.array([1, 2, 3]),
-                      dense_shape=np.array([3, 3])))
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([
-        tensor_shape.TensorShape(c.dense_shape[1:])
-        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
-    ], [shape for shape in iterator.output_shapes])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      expected = [
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[0]]),
-               values=np.array([1]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[1]]),
-               values=np.array([2]),
-               dense_shape=np.array([3]))),
-          (sparse_tensor.SparseTensorValue(
-              indices=np.array([[0]]),
-              values=np.array([0]),
-              dense_shape=np.array([1])),
-           sparse_tensor.SparseTensorValue(
-               indices=np.array([[2]]),
-               values=np.array([3]),
-               dense_shape=np.array([3]))),
-      ]
-      for i in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            (list(zip(*components[:3]))[i] + expected[i]), results):
-          if sparse_tensor.is_sparse(component):
-            self.assertSparseValuesEqual(component, result_component)
-          else:
-            self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromTensorSlicesWithDict(self):
-    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual(dtypes.int32, iterator.output_types["foo"])
-    self.assertEqual(dtypes.float32, iterator.output_types["bar"])
-    self.assertEqual((), iterator.output_shapes["foo"])
-    self.assertEqual((1,), iterator.output_shapes["bar"])
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(3):
-        results = sess.run(get_next)
-        self.assertEqual(components["foo"][i], results["foo"])
-        self.assertEqual(components["bar"][i], results["bar"])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFromSparseTensorSlices(self):
-    """Test a dataset based on slices of a `tf.SparseTensor`."""
-    st = array_ops.sparse_placeholder(dtypes.float64)
-    iterator = (dataset_ops.Dataset.from_sparse_tensor_slices(st)
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
-
-    with self.cached_session() as sess:
-      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
-
-      # Test with sparse tensor in the appropriate order.
-      indices = np.array(
-          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
-      values = np.array([val for s in slices for val in s])
-      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
-      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
-                                                    dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      for i, s in enumerate(slices):
-        results = sess.run(get_next)
-        self.assertAllEqual(s, results.values)
-        expected_indices = np.array(
-            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
-        self.assertAllEqual(expected_indices, results.indices)
-        self.assertAllEqual(dense_shape[1:], results.dense_shape)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test with sparse tensor in the reverse order, which is not
-      # currently supported.
-      reverse_order_indices = indices[::-1, :]
-      reverse_order_values = values[::-1]
-      sparse_feed = sparse_tensor.SparseTensorValue(
-          reverse_order_indices, reverse_order_values, dense_shape)
-      with self.assertRaises(errors.UnimplementedError):
-        sess.run(init_op, feed_dict={st: sparse_feed})
-
-      # Test with an empty sparse tensor.
-      empty_indices = np.empty((0, 4), dtype=np.int64)
-      empty_values = np.empty((0,), dtype=np.float64)
-      empty_dense_shape = [0, 4, 37, 9]
-      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
-                                                    empty_dense_shape)
-      sess.run(init_op, feed_dict={st: sparse_feed})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  # pylint: disable=g-long-lambda,unnecessary-lambda
-  def testNestedStructure(self):
-    components = (np.array([1, 2, 3], dtype=np.int64),
-                  (np.array([4., 5.]), np.array([6., 7.])),
-                  np.array([8, 9, 10], dtype=np.int64))
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.shuffle(10, 10)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.repeat(-1)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.filter(lambda x, y, z: True)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.take(5)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
-
-    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
-                                                       (y[0], y[1])))
-    )
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
-
-    dataset = dataset.batch(32)
-    self.assertEquals(((dtypes.int64, dtypes.int64),
-                       (dtypes.float64, dtypes.float64)), dataset.output_types)
-    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
-                      nest.pack_sequence_as(dataset.output_shapes, [
-                          s.as_list()
-                          for s in nest.flatten(dataset.output_shapes)
-                      ]))
-
-    iterator = dataset.make_one_shot_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    iterator = dataset.make_initializable_iterator()
-    (w, x), (y, z) = iterator.get_next()
-    self.assertEquals(dtypes.int64, w.dtype)
-    self.assertEquals(dtypes.int64, x.dtype)
-    self.assertEquals(dtypes.float64, y.dtype)
-    self.assertEquals(dtypes.float64, z.dtype)
-    self.assertEquals([None, 3], w.shape.as_list())
-    self.assertEquals([None, 3], x.shape.as_list())
-    self.assertEquals([None, 2], y.shape.as_list())
-    self.assertEquals([None, 2], z.shape.as_list())
-
-    # Define a separate set of components with matching leading
-    # dimension for the from-slices constructor.
-    components_for_slices = (np.array([1, 2, 3], dtype=np.int64),
-                             (np.array([4., 5., 6.]),
-                              np.array([7., 8., 9.])),
-                             np.array([10, 11, 12], dtype=np.int64))
-
-    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
-    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
-                       dtypes.int64), dataset.output_types)
-    self.assertEquals(([], ([], []), []), dataset.output_shapes)
-
-  def testNestedDict(self):
-    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
-    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
-    self.assertEquals(dtypes.int32, dataset.output_types["b"])
-    self.assertEquals([], dataset.output_shapes["a"]["aa"])
-    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
-    self.assertEquals([3], dataset.output_shapes["b"])
-
-  def testNonSequenceNestedStructure(self):
-    components = np.array([1, 2, 3], dtype=np.int64)
-
-    dataset = dataset_ops.Dataset.from_tensors(components)
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.filter(
-        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([2, 3], dataset.output_shapes)
-
-    dataset = dataset.flat_map(
-        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
-    self.assertEquals(dtypes.int64, dataset.output_types)
-    self.assertEquals([3], dataset.output_shapes)
-
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-    self.assertEquals(dtypes.int64, get_next.dtype)
-    self.assertEquals([3], get_next.shape)
-
-  def testSplitPipelineFailsWithPlacementError(self):
-    with session.Session(
-        target="",
-        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
-
-      dataset = dataset_ops.Dataset.from_tensors(0)
-
-      # Define a pipeline that attempts to use variables on two
-      # different devices.
-      #
-      # Initialize the variables before creating to iterator, to avoid the
-      # placement algorithm overriding the DT_RESOURCE colocation constraints.
-      with ops.device("/cpu:0"):
-        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_0.read_value())
-      sess.run(var_0.initializer)
-
-      with ops.device("/cpu:1"):
-        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
-        dataset = dataset.map(lambda x: x + var_1.read_value())
-      sess.run(var_1.initializer)
-
-      iterator = dataset.make_initializable_iterator()
-      sess.run(iterator.initializer)
-
-      with self.assertRaisesRegexp(
-          errors.FailedPreconditionError,
-          "Error while reading resource variable Variable"):
-        sess.run(iterator.get_next())
-
-
-class DatasetConstructorBenchmark(test.Benchmark):
-
-  def benchmarkSliceRepeatBatch(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        sess.run(next_element)
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          sess.run(next_element)
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Slice/repeat/batch with sess.run() input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_repeat_batch_input_%d_batch_%d" % (input_size,
-                                                                 batch_size))
-
-  def benchmarkSliceRepeatBatchCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data)
-        .repeat(num_epochs + 1).batch(batch_size))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/repeat/batch with callable input size: %d batch size: %d Median"
-        " wall time per element: %f" % (input_size, batch_size,
-                                        median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_repeat_batch_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkReshapeSliceRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data.reshape(100, 100))
-        .repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print("Reshape/slice/repeat with callable input size: %d batch size: %d "
-          "Median wall time per element: %f" % (input_size, batch_size,
-                                                median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_reshape_slice_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-  def benchmarkSliceBatchCacheRepeatCallable(self):
-    input_size = 10000
-    batch_size = 100
-    num_epochs = 100
-
-    input_data = np.random.randn(input_size)
-
-    dataset = (
-        dataset_ops.Dataset.from_tensor_slices(input_data).batch(batch_size)
-        .cache().repeat(num_epochs + 1))
-    iterator = dataset.make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with session.Session() as sess:
-      sess.run(iterator.initializer)
-      get_next_element = sess.make_callable(next_element)
-      # Run one whole epoch to burn in the computation.
-      for _ in range(input_size // batch_size):
-        get_next_element()
-      deltas = []
-      try:
-        while True:
-          start = time.time()
-          get_next_element()
-          deltas.append(time.time() - start)
-      except errors.OutOfRangeError:
-        pass
-
-    median_wall_time = np.median(deltas)
-    print(
-        "Slice/batch/cache/repeat with callable input size: %d batch size: %d "
-        "Median wall time per element: %f"
-        % (input_size, batch_size, median_wall_time))
-    self.report_benchmark(
-        iters=len(deltas),
-        wall_time=median_wall_time,
-        name="benchmark_slice_batch_cache_repeat_callable_input_%d_batch_%d" %
-        (input_size, batch_size))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/dataset_ops_test.py b/tensorflow/python/data/kernel_tests/dataset_test.py
similarity index 69%
rename from tensorflow/python/data/kernel_tests/dataset_ops_test.py
rename to tensorflow/python/data/kernel_tests/dataset_test.py
index a5324af4d0cf951c9f228758df09b9912532819e..2952c08be02b76fb221ee0f31f4b9fc34a14d659 100644
--- a/tensorflow/python/data/kernel_tests/dataset_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/dataset_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the input pipeline ops."""
+"""Tests for `tf.data.Dataset`."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,21 +24,26 @@ import numpy as np
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import structure
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class DatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testAsSerializedGraph(self):
     dataset = dataset_ops.Dataset.range(10)
-    with self.cached_session() as sess:
-      graph = graph_pb2.GraphDef().FromString(
-          sess.run(dataset._as_serialized_graph()))
-      self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
+    graph = graph_pb2.GraphDef().FromString(
+        self.evaluate(dataset._as_serialized_graph()))
+    self.assertTrue(any([node.op != "RangeDataset" for node in graph.node]))
 
   @staticmethod
   def make_apply_fn(dataset):
@@ -76,7 +81,7 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
        lambda: readers.FixedLengthRecordDataset("", 42)),
       ("FromGenerator",
        lambda: dataset_ops.Dataset.from_generator(
-           DatasetOpsTest.make_gen(), dtypes.int32),
+           DatasetTest.make_gen(), dtypes.int32),
        1),
       ("FromTensors", lambda: dataset_ops.Dataset.from_tensors([42])),
       ("FromTensorSlices", lambda: dataset_ops.Dataset.from_tensors([42])),
@@ -222,12 +227,12 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
     options1 = dataset_ops.Options()
     options1.experimental_autotune = True
     options2 = dataset_ops.Options()
-    options2.experimental_filter_fusion = False
+    options2.experimental_deterministic = False
     ds = dataset_ops.Dataset.range(0).with_options(options1).with_options(
         options2)
     self.assertTrue(ds.options().experimental_autotune)
     # Explicitly check that flag is False since assertFalse allows None
-    self.assertIs(ds.options().experimental_filter_fusion, False)
+    self.assertIs(ds.options().experimental_deterministic, False)
 
   def testOptionsTwiceDifferentError(self):
     options1 = dataset_ops.Options()
@@ -235,20 +240,78 @@ class DatasetOpsTest(test_base.DatasetTestBase, parameterized.TestCase):
     options2 = dataset_ops.Options()
     options2.experimental_autotune = False
     with self.assertRaisesRegexp(ValueError,
-                                 "Cannot merge incompatible values of option"):
+                                 "Cannot merge incompatible values"):
       dataset_ops.Dataset.range(0).with_options(options1).with_options(options2)
 
   def testOptionsMergeOptionsFromMultipleInputs(self):
     options1 = dataset_ops.Options()
     options1.experimental_autotune = True
     options2 = dataset_ops.Options()
-    options2.experimental_filter_fusion = True
+    options2.experimental_deterministic = True
     ds = dataset_ops.Dataset.zip(
         (dataset_ops.Dataset.range(0).with_options(options1),
          dataset_ops.Dataset.range(0).with_options(options2)))
     self.assertTrue(ds.options().experimental_autotune)
-    self.assertTrue(ds.options().experimental_filter_fusion)
+    self.assertTrue(ds.options().experimental_deterministic)
 
+  # TODO(b/119882922): use-after-free bug in eager mode.
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant(37.0),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0]], values=constant_op.constant([0], dtype=dtypes.int32),
+          dense_shape=[1]),
+       structure.SparseTensorStructure(dtypes.int32, [1])),
+      ("Nest", lambda: {
+          "a": constant_op.constant(37.0),
+          "b": (constant_op.constant(["Foo"]), constant_op.constant("Bar"))},
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.TensorStructure(dtypes.string, [1]),
+                 structure.TensorStructure(dtypes.string, []))})),
+      ("Dataset", lambda: dataset_ops.Dataset.from_tensor_slices(
+          constant_op.constant([1, 2, 3])),
+       dataset_ops.DatasetStructure(
+           structure.TensorStructure(dtypes.int32, []))),
+      ("Optional", lambda: optional_ops.Optional.from_value(37.0),
+       optional_ops.OptionalStructure(
+           structure.TensorStructure(dtypes.float32, []))),
+  )
+  def testSkipEagerDatasetStructure(self, tf_value_fn,
+                                    expected_element_structure):
+    dataset = dataset_ops.Dataset.from_tensors(0).map(lambda _: tf_value_fn())
+    dataset_structure = structure.Structure.from_value(dataset)
+    self.assertIsInstance(dataset_structure, dataset_ops.DatasetStructure)
+
+    # TODO(b/110122868): Add a public API to `tf.data.Dataset` for accessing
+    # the element structure.
+    self.assertTrue(expected_element_structure.is_compatible_with(
+        dataset_structure._element_structure))
+    self.assertTrue(dataset_structure._element_structure.is_compatible_with(
+        expected_element_structure))
+
+    self.assertEqual([dtypes.variant], dataset_structure._flat_types)
+    self.assertEqual([tensor_shape.scalar()], dataset_structure._flat_shapes)
+
+    # Assert that the `Dataset` survives a round-trip via _from_tensor_list()
+    # and _to_tensor_list().
+    round_trip_dataset = dataset_structure._from_tensor_list(
+        dataset_structure._to_tensor_list(dataset))
+
+    value = tf_value_fn()
+
+    if isinstance(value, dataset_ops.Dataset):
+      self.assertDatasetsEqual(value, dataset.flat_map(lambda x: x))
+    elif isinstance(value, optional_ops.Optional):
+      self.assertDatasetProduces(
+          round_trip_dataset.map(lambda opt: opt.get_value()),
+          [self.evaluate(value.get_value())],
+          requires_initialization=True)
+    else:
+      self.assertDatasetProduces(
+          round_trip_dataset, [self.evaluate(tf_value_fn())],
+          requires_initialization=True)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py b/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
deleted file mode 100644
index a0c6b37a6dc0c7f4cec829efb26bec08899b8b34..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/filter_dataset_op_test.py
+++ /dev/null
@@ -1,220 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import time
-
-import numpy as np
-
-from tensorflow.python.client import session
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import functional_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class FilterDatasetTest(test_base.DatasetTestBase):
-
-  def testFilterDataset(self):
-    components = (
-        np.arange(7, dtype=np.int64),
-        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
-            7, dtype=np.int64)[:, np.newaxis],
-        np.array(37.0, dtype=np.float64) * np.arange(7)
-    )
-    count = array_ops.placeholder(dtypes.int64, shape=[])
-    modulus = array_ops.placeholder(dtypes.int64)
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(count)
-        .filter(lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Test that we can dynamically feed a different modulus value for each
-      # iterator.
-      def do_test(count_val, modulus_val):
-        sess.run(init_op, feed_dict={count: count_val, modulus: modulus_val})
-        for _ in range(count_val):
-          for i in [x for x in range(7) if x**2 % modulus_val == 0]:
-            result = sess.run(get_next)
-            for component, result_component in zip(components, result):
-              self.assertAllEqual(component[i]**2, result_component)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
-
-      do_test(14, 2)
-      do_test(4, 18)
-
-      # Test an empty dataset.
-      do_test(0, 1)
-
-  def testFilterRange(self):
-    dataset = dataset_ops.Dataset.range(100).filter(
-        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(get_next))
-      self.assertEqual(1, sess.run(get_next))
-      self.assertEqual(3, sess.run(get_next))
-
-  def testFilterDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .filter(lambda d: math_ops.equal(d["bar"] % 2, 0))
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        if (i ** 2) % 2 == 0:
-          self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testUseStepContainerInFilter(self):
-    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
-
-    # Define a predicate that returns true for the first element of
-    # the sequence and not the second, and uses `tf.map_fn()`.
-    def _predicate(xs):
-      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
-      summed = math_ops.reduce_sum(squared_xs)
-      return math_ops.equal(summed, 1 + 4 + 9)
-
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices([[1, 2, 3], [4, 5, 6]])
-        .filter(_predicate)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      self.assertAllEqual(input_data[0], sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSparse(self):
-
-    def _map_fn(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=np.array([[0, 0]]),
-          values=(i * np.array([1])),
-          dense_shape=np.array([1, 1])), i
-
-    def _filter_fn(_, i):
-      return math_ops.equal(i % 2, 0)
-
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
-            lambda x, i: x).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(5):
-        actual = sess.run(get_next)
-        self.assertTrue(isinstance(actual, sparse_tensor.SparseTensorValue))
-        self.assertSparseValuesEqual(actual, _map_fn(i * 2)[0])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testShortCircuit(self):
-    iterator = (
-        dataset_ops.Dataset.zip(
-            (dataset_ops.Dataset.range(10),
-             dataset_ops.Dataset.from_tensors(True).repeat(None)))
-        .filter(lambda x, y: y).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        self.assertEqual((i, True), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testParallelFilters(self):
-    dataset = dataset_ops.Dataset.range(10).filter(
-        lambda x: math_ops.equal(x % 2, 0))
-    iterators = [dataset.make_one_shot_iterator() for _ in range(10)]
-    next_elements = [iterator.get_next() for iterator in iterators]
-    with self.cached_session() as sess:
-      self.assertEqual([0 for _ in range(10)], sess.run(next_elements))
-
-
-class FilterDatasetBenchmark(test.Benchmark):
-
-  def _benchmark(self, predicate, name):
-    with ops.Graph().as_default():
-      dataset = (
-          dataset_ops.Dataset.from_tensors(True).repeat(None).filter(predicate))
-      iterator = dataset.make_one_shot_iterator()
-      next_element = iterator.get_next()
-
-      with session.Session() as sess:
-        for _ in range(5):
-          sess.run(next_element.op)
-        deltas = []
-        for _ in range(100):
-          start = time.time()
-          for _ in range(100):
-            sess.run(next_element.op)
-          end = time.time()
-          deltas.append(end - start)
-
-        median_wall_time = np.median(deltas) / 100
-        print("Filter dataset using %s. Median wall time: %f" %
-              (name, median_wall_time))
-        self.report_benchmark(
-            iters=100,
-            wall_time=median_wall_time,
-            name="benchmark_filter_dataset_%s" % name)
-
-  def benchmarkSimpleFunction(self):
-    self._benchmark(array_ops.identity, "simple_function")
-
-  def benchmarkReturnComponentOptimization(self):
-    self._benchmark(lambda x: x, "return_component")
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/filter_test.py b/tensorflow/python/data/kernel_tests/filter_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..afaf954cbc6a96984239cb22665bbe1f17d6d40d
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/filter_test.py
@@ -0,0 +1,128 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.filter()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FilterTest(test_base.DatasetTestBase):
+
+  def testFilterDataset(self):
+    components = (
+        np.arange(7, dtype=np.int64),
+        np.array([[1, 2, 3]], dtype=np.int64) * np.arange(
+            7, dtype=np.int64)[:, np.newaxis],
+        np.array(37.0, dtype=np.float64) * np.arange(7)
+    )
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def do_test(count, modulus):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+          _map_fn).repeat(count).filter(
+              lambda x, _y, _z: math_ops.equal(math_ops.mod(x, modulus), 0))
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      get_next = self.getNext(dataset)
+      for _ in range(count):
+        for i in [x for x in range(7) if x**2 % modulus == 0]:
+          result = self.evaluate(get_next())
+          for component, result_component in zip(components, result):
+            self.assertAllEqual(component[i]**2, result_component)
+      with self.assertRaises(errors.OutOfRangeError):
+        self.evaluate(get_next())
+
+    do_test(14, 2)
+    do_test(4, 18)
+
+    # Test an empty dataset.
+    do_test(0, 1)
+
+  def testFilterRange(self):
+    dataset = dataset_ops.Dataset.range(4).filter(
+        lambda x: math_ops.not_equal(math_ops.mod(x, 3), 2))
+    self.assertDatasetProduces(dataset, expected_output=[0, 1, 3])
+
+  def testFilterDict(self):
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).filter(
+            lambda d: math_ops.equal(d["bar"] % 2, 0)).map(
+                lambda d: d["foo"] + d["bar"])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[(i * 2 + i**2) for i in range(10) if not (i**2) % 2])
+
+  def testUseStepContainerInFilter(self):
+    input_data = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int64)
+
+    # Define a predicate that returns true for the first element of
+    # the sequence and not the second, and uses `tf.map_fn()`.
+    def _predicate(xs):
+      squared_xs = functional_ops.map_fn(lambda x: x * x, xs)
+      summed = math_ops.reduce_sum(squared_xs)
+      return math_ops.equal(summed, 1 + 4 + 9)
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [[1, 2, 3], [4, 5, 6]]).filter(_predicate)
+    self.assertDatasetProduces(dataset, expected_output=[input_data[0]])
+
+  def testSparse(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=np.array([[0, 0]]),
+          values=(i * np.array([1])),
+          dense_shape=np.array([1, 1])), i
+
+    def _filter_fn(_, i):
+      return math_ops.equal(i % 2, 0)
+
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).filter(_filter_fn).map(
+        lambda x, i: x)
+    self.assertDatasetProduces(
+        dataset, expected_output=[_map_fn(i * 2)[0] for i in range(5)])
+
+  def testShortCircuit(self):
+    dataset = dataset_ops.Dataset.zip(
+        (dataset_ops.Dataset.range(10),
+         dataset_ops.Dataset.from_tensors(True).repeat(None)
+        )).filter(lambda x, y: y)
+    self.assertDatasetProduces(
+        dataset, expected_output=[(i, True) for i in range(10)])
+
+  def testParallelFilters(self):
+    dataset = dataset_ops.Dataset.range(10).filter(
+        lambda x: math_ops.equal(x % 2, 0))
+    next_elements = [self.getNext(dataset) for _ in range(10)]
+    self.assertEqual([0 for _ in range(10)],
+                     self.evaluate(
+                         [next_element() for next_element in next_elements]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py b/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9503e57ca7c1b3e1823b30c80e7785a25b133a24
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/fixed_length_record_dataset_test.py
@@ -0,0 +1,171 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.FixedLengthRecordDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FixedLengthRecordDatasetTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    super(FixedLengthRecordDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+    self._header_bytes = 5
+    self._record_bytes = 3
+    self._footer_bytes = 2
+
+  def _record(self, f, r):
+    return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
+
+  def _createFiles(self, compression_type=None):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
+      filenames.append(fn)
+
+      contents = []
+      contents.append(b"H" * self._header_bytes)
+      for j in range(self._num_records):
+        contents.append(self._record(i, j))
+      contents.append(b"F" * self._footer_bytes)
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
+    return filenames
+
+  def _testFixedLengthRecordDataset(self, compression_type=None):
+    test_filenames = self._createFiles(compression_type=compression_type)
+
+    def dataset_fn(filenames, num_epochs, batch_size=None):
+      repeat_dataset = readers.FixedLengthRecordDataset(
+          filenames,
+          self._record_bytes,
+          self._header_bytes,
+          self._footer_bytes,
+          compression_type=compression_type).repeat(num_epochs)
+      if batch_size:
+        return repeat_dataset.batch(batch_size)
+      return repeat_dataset
+
+    # Basic test: read from file 0.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[0]], 1),
+        expected_output=[
+            self._record(0, i) for i in range(self._num_records)
+        ])
+
+    # Basic test: read from file 1.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[1]], 1),
+        expected_output=[
+            self._record(1, i) for i in range(self._num_records)
+        ])
+
+    # Basic test: read from both files.
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 1), expected_output=expected_output)
+
+    # Test repeated iteration through both files.
+    get_next = self.getNext(dataset_fn(test_filenames, 10))
+    for _ in range(10):
+      for j in range(self._num_files):
+        for i in range(self._num_records):
+          self.assertEqual(self._record(j, i), self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Test batched and repeated iteration through both files.
+    get_next = self.getNext(dataset_fn(test_filenames, 10, self._num_records))
+    for _ in range(10):
+      for j in range(self._num_files):
+        self.assertAllEqual(
+            [self._record(j, i) for i in range(self._num_records)],
+            self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testFixedLengthRecordDatasetNoCompression(self):
+    self._testFixedLengthRecordDataset()
+
+  def testFixedLengthRecordDatasetGzipCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="GZIP")
+
+  def testFixedLengthRecordDatasetZlibCompression(self):
+    self._testFixedLengthRecordDataset(compression_type="ZLIB")
+
+  def testFixedLengthRecordDatasetBuffering(self):
+    test_filenames = self._createFiles()
+    dataset = readers.FixedLengthRecordDataset(
+        test_filenames,
+        self._record_bytes,
+        self._header_bytes,
+        self._footer_bytes,
+        buffer_size=10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testFixedLengthRecordDatasetWrongSize(self):
+    test_filenames = self._createFiles()
+    dataset = readers.FixedLengthRecordDataset(
+        test_filenames,
+        self._record_bytes + 1,  # Incorrect record length.
+        self._header_bytes,
+        self._footer_bytes,
+        buffer_size=10)
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r"Excluding the header \(5 bytes\) and footer \(2 bytes\), input "
+            r"file \".*fixed_length_record.0.txt\" has body length 21 bytes, "
+            r"which is not an exact multiple of the record length \(4 bytes\).")
+        )
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/flat_map_test.py
similarity index 57%
rename from tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/flat_map_test.py
index 68038f9cfc09efcc08c5fa2d8d8af93a4a3c50db..ff52821b10740196286c30d19b0cda3b4b44bae5 100644
--- a/tensorflow/python/data/kernel_tests/flat_map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/flat_map_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.flat_map()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -26,54 +26,42 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
 
-class FlatMapDatasetTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class FlatMapTest(test_base.DatasetTestBase):
 
   # pylint: disable=g-long-lambda
   def testFlatMapDataset(self):
     repeats = [1, 2, 3, 4, 5, 0, 1]
     components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in repeats:
-        for _ in range(i):
-          self.assertEqual(i, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+        lambda x: dataset_ops.Dataset.from_tensors([x]).repeat(x))
+    expected_output = []
+    for i in repeats:
+      expected_output.extend([[i]] * i)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
   def testNestedFlatMapDataset(self):
     repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
     components = np.array(repeats, dtype=np.int64)
-    iterator = (
-        dataset_ops.Dataset.from_tensor_slices(components)
-        .flat_map(lambda x: dataset_ops.Dataset.from_tensor_slices(x)
-                  .flat_map(lambda y: dataset_ops.Dataset.from_tensors(y)
-                            .repeat(y))).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for row in repeats:
-        for i in row:
-          for _ in range(i):
-            self.assertEqual(i, sess.run(get_next))
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSharedResourceNestedFlatMapDataset(self):
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).flat_map(
+        lambda x: dataset_ops.Dataset.from_tensor_slices(x).flat_map(
+            lambda y: dataset_ops.Dataset.from_tensors(y).repeat(y))
+    )
+    expected_output = []
+    for row in repeats:
+      for i in row:
+        expected_output.extend([i] * i)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  # Note: no eager mode coverage, session specific test.
+  @test_util.run_deprecated_v1
+  def testSkipEagerSharedResourceNestedFlatMapDataset(self):
     repeats = [[1, 2], [3, 4], [5, 0], [1, 7]]
     components = np.array(repeats, dtype=np.int64)
     iterator = (
@@ -106,22 +94,16 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
           sess.run(get_next)
 
   def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .flat_map(lambda d: dataset_ops.Dataset.from_tensors(d["foo"])
-                          .repeat(d["bar"]))
-                .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for _ in range(i ** 2):
-          self.assertEqual(i * 2, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-  # pylint: enable=g-long-lambda
+    dataset = dataset_ops.Dataset.range(10).map(
+        lambda x: {"foo": x * 2, "bar": x ** 2}).flat_map(
+            lambda d: dataset_ops.Dataset.from_tensors(
+                d["foo"]).repeat(d["bar"]))
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      for _ in range(i**2):
+        self.assertEqual(i * 2, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testSparse(self):
     def _map_fn(i):
@@ -132,20 +114,12 @@ class FlatMapDatasetTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_tensor_slices(
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
-        .make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).flat_map(_flat_map_fn)
+    expected_output = []
+    for i in range(10):
+      for j in range(2):
+        expected_output.append([i, 0] if j % 2 == 0 else [0, -i])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py b/tensorflow/python/data/kernel_tests/from_generator_test.py
similarity index 85%
rename from tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
rename to tensorflow/python/data/kernel_tests/from_generator_test.py
index cb8cb9a77df0b897a87dfecb96395c1bbee450b0..a6625534e7a1a0efc5e39dc53ef57666f601c05b 100644
--- a/tensorflow/python/data/kernel_tests/dataset_from_generator_op_test.py
+++ b/tensorflow/python/data/kernel_tests/from_generator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for tf.data.Dataset.from_generator()."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,21 +27,21 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import script_ops
 from tensorflow.python.platform import test
 
 
-class DatasetConstructorTest(test_base.DatasetTestBase):
+class FromGeneratorTest(test_base.DatasetTestBase):
 
   def _testFromGenerator(self, generator, elem_sequence, num_repeats,
                          output_types=None):
     if output_types is None:
       output_types = dtypes.int64
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(generator, output_types=output_types)
         .repeat(num_repeats)
-        .prefetch(5)
-        .make_initializable_iterator())
+        .prefetch(5))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -55,11 +55,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
           sess.run(get_next)
 
   def _testFromGeneratorOneShot(self, generator, elem_sequence, num_repeats):
-    iterator = (
+    iterator = dataset_ops.make_one_shot_iterator(
         dataset_ops.Dataset.from_generator(generator, output_types=dtypes.int64)
         .repeat(num_repeats)
-        .prefetch(5)
-        .make_one_shot_iterator())
+        .prefetch(5))
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -69,6 +68,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorUsingFunction(self):
     def generator():
       for i in range(1, 100):
@@ -79,18 +79,21 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     self._testFromGeneratorOneShot(generator, elem_sequence, 1)
     self._testFromGeneratorOneShot(generator, elem_sequence, 5)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorUsingList(self):
     generator = lambda: [[i] * i for i in range(1, 100)]
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorUsingNdarray(self):
     generator = lambda: np.arange(100, dtype=np.int64)
     elem_sequence = list(generator())
     self._testFromGenerator(generator, elem_sequence, 1, output_types=np.int64)
     self._testFromGenerator(generator, elem_sequence, 5, output_types=np.int64)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorUsingGeneratorExpression(self):
     # NOTE(mrry): Generator *expressions* are not repeatable (or in
     # general reusable), because they eagerly evaluate the `for`
@@ -102,6 +105,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
     self._testFromGenerator(generator, elem_sequence, 1)
     self._testFromGenerator(generator, elem_sequence, 5)
 
+  @test_util.run_deprecated_v1
   def testFromMultipleConcurrentGenerators(self):
     num_inner_repeats = 5
     num_outer_repeats = 100
@@ -124,11 +128,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
           output_shapes=([None], [3]))
               .repeat(num_inner_repeats).prefetch(5))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(num_outer_repeats)
         .interleave(interleave_fn, cycle_length=10,
-                    block_length=len(input_list))
-        .make_initializable_iterator())
+                    block_length=len(input_list)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -183,11 +186,10 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       return dataset_ops.Dataset.from_generator(
           generator, output_types=dtypes.int64, output_shapes=[]).prefetch(2)
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(num_parallel_iterators)
         .interleave(
-            interleave_fn, cycle_length=num_parallel_iterators, block_length=1)
-        .make_initializable_iterator())
+            interleave_fn, cycle_length=num_parallel_iterators, block_length=1))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -199,6 +201,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorImplicitConversion(self):
     def generator():
       yield [1]
@@ -206,9 +209,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       yield [3]
 
     for dtype in [dtypes.int8, dtypes.int32, dtypes.int64]:
-      iterator = (dataset_ops.Dataset.from_generator(
-          generator, output_types=dtype, output_shapes=[1])
-                  .make_initializable_iterator())
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.from_generator(
+              generator, output_types=dtype, output_shapes=[1]))
       init_op = iterator.initializer
       get_next = iterator.get_next()
 
@@ -223,15 +226,16 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorString(self):
     def generator():
       yield "foo"
       yield b"bar"
       yield u"baz"
 
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.string, output_shapes=[])
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.string, output_shapes=[]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -243,6 +247,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorTypeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -250,9 +255,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       yield "ERROR"
       yield np.array([7, 8, 9], dtype=np.int64)
 
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64, output_shapes=[3]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -266,6 +271,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorShapeError(self):
     def generator():
       yield np.array([1, 2, 3], dtype=np.int64)
@@ -273,9 +279,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       yield np.array([7, 8, 9, 10], dtype=np.int64)
       yield np.array([11, 12, 13], dtype=np.int64)
 
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=dtypes.int64, output_shapes=[3])
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=dtypes.int64, output_shapes=[3]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -289,6 +295,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorStructureError(self):
     def generator():
       yield 1, 2
@@ -297,9 +304,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       yield 6, 7, 8
       yield 9, 10
 
-    iterator = (dataset_ops.Dataset.from_generator(
-        generator, output_types=(dtypes.int64, dtypes.int64))
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            generator, output_types=(dtypes.int64, dtypes.int64)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -317,14 +324,15 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorHeterogeneous(self):
     def generator():
       yield 1
       yield [2, 3]
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
+            generator, output_types=dtypes.int64))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -335,6 +343,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorStopShort(self):
 
     def generator():
@@ -342,9 +351,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       yield 1
       yield 2
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_generator(
-            generator, output_types=dtypes.int64).make_initializable_iterator())
+            generator, output_types=dtypes.int64))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -353,6 +362,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       self.assertAllEqual(0, sess.run(get_next))
       self.assertAllEqual(1, sess.run(get_next))
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorDestructorCalled(self):
     # Use an `Event` to signal that the generator has been deleted.
     event = threading.Event()
@@ -371,9 +381,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       def __del__(self):
         event.set()
 
-    iterator = dataset_ops.Dataset.from_generator(
-        GeneratorWrapper,
-        output_types=dtypes.int64).take(2).make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_generator(
+            GeneratorWrapper, output_types=dtypes.int64).take(2))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -387,6 +397,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       # iterator terminates (and the generator iterator is deleted).
       self.assertTrue(event.is_set())
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorWithArgs(self):
 
     def flat_map_fn(elem):
@@ -399,10 +410,8 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=dtypes.int64, output_shapes=(),
           args=(elem,))
 
-    iterator = (dataset_ops.Dataset
-                .range(5)
-                .flat_map(flat_map_fn)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(5).flat_map(flat_map_fn))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -414,6 +423,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testFromGeneratorWithTwoArgs(self):
 
     def flat_map_fn(elem, message):
@@ -426,12 +436,11 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
           generator_with_arg, output_types=(dtypes.int64, dtypes.string),
           output_shapes=((), ()), args=(elem, message))
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.zip(
             (dataset_ops.Dataset.range(5),
              dataset_ops.Dataset.from_tensors("Hi!").repeat(None)))
-        .flat_map(flat_map_fn)
-        .make_initializable_iterator())
+        .flat_map(flat_map_fn))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -446,6 +455,7 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testGeneratorDatasetFinalizeFunctionCalled(self):
     # NOTE(mrry): This test tests the internal `_GeneratorDataset`,
     # which affords more control over what the finalize function can do than
@@ -462,10 +472,9 @@ class DatasetConstructorTest(test_base.DatasetTestBase):
                                 stateful=True)
 
     dummy = constant_op.constant(37)
-    iterator = (dataset_ops._GeneratorDataset(dummy, lambda x: x,
-                                              lambda x: x, finalize_fn)
-                .take(2)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops._GeneratorDataset(
+            dummy, lambda x: x, lambda x: x, finalize_fn).take(2))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef608ebb67007c7605e7bea36058d0cd5c5d146f
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_sparse_tensor_slices_test.py
@@ -0,0 +1,86 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_sparse_tensor_slices()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromSparseTensorSlicesTest(test_base.DatasetTestBase):
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerFromSparseTensorSlices(self):
+    """Test a dataset based on slices of a `tf.SparseTensor`."""
+    st = array_ops.sparse_placeholder(dtypes.float64)
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_sparse_tensor_slices(st))
+    init_op = iterator.initializer
+    get_next = sparse_tensor.SparseTensor(*iterator.get_next())
+
+    with self.cached_session() as sess:
+      slices = [[1., 2., 3.], [1.], [1.], [1., 2.], [], [1., 2.], [], [], []]
+
+      # Test with sparse tensor in the appropriate order.
+      indices = np.array(
+          [[i, j] for i in range(len(slices)) for j in range(len(slices[i]))])
+      values = np.array([val for s in slices for val in s])
+      dense_shape = np.array([len(slices), max(len(s) for s in slices) + 1])
+      sparse_feed = sparse_tensor.SparseTensorValue(indices, values,
+                                                    dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      for i, s in enumerate(slices):
+        results = sess.run(get_next)
+        self.assertAllEqual(s, results.values)
+        expected_indices = np.array(
+            [[j] for j in range(len(slices[i]))]).reshape([-1, 1])
+        self.assertAllEqual(expected_indices, results.indices)
+        self.assertAllEqual(dense_shape[1:], results.dense_shape)
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+      # Test with sparse tensor in the reverse order, which is not
+      # currently supported.
+      reverse_order_indices = indices[::-1, :]
+      reverse_order_values = values[::-1]
+      sparse_feed = sparse_tensor.SparseTensorValue(
+          reverse_order_indices, reverse_order_values, dense_shape)
+      with self.assertRaises(errors.UnimplementedError):
+        sess.run(init_op, feed_dict={st: sparse_feed})
+
+      # Test with an empty sparse tensor.
+      empty_indices = np.empty((0, 4), dtype=np.int64)
+      empty_values = np.empty((0,), dtype=np.float64)
+      empty_dense_shape = [0, 4, 37, 9]
+      sparse_feed = sparse_tensor.SparseTensorValue(empty_indices, empty_values,
+                                                    empty_dense_shape)
+      sess.run(init_op, feed_dict={st: sparse_feed})
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9a480e56789aee9198fc88201f0eecb2c2eaab52
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_tensor_slices_test.py
@@ -0,0 +1,177 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_tensor_slices()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromTensorSlicesTest(test_base.DatasetTestBase):
+
+  def testFromTensorSlices(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (
+        np.tile(np.array([[1], [2], [3], [4]]), 20), np.tile(
+            np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    )
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual([c.shape[1:] for c in components],
+                     [shape for shape in dataset.output_shapes])
+
+    for i in range(4):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(components, results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      results = self.evaluate(get_next())
+
+  def testSkipEagerFromTensorSlicesSparse(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0, 0], [1, 0], [2, 0]]),
+        values=np.array([0, 0, 0]),
+        dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape[1:]) for c in components],
+        [shape for shape in dataset.output_shapes])
+
+    expected = [
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[0]]),
+             values=np.array([1]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[1]]),
+             values=np.array([2]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[2]]),
+             values=np.array([3]),
+             dense_shape=np.array([3]))),
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected)
+
+  def testFromTensorSlicesMixed(self):
+    """Test a dataset that represents the slices from a tuple of tensors."""
+    components = (np.tile(np.array([[1], [2], [3]]), 20),
+                  np.tile(np.array([[12], [13], [14]]), 22),
+                  np.array([37.0, 38.0, 39.0]),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 0], [2, 0]]),
+                      values=np.array([0, 0, 0]),
+                      dense_shape=np.array([3, 1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1], [2, 2]]),
+                      values=np.array([1, 2, 3]),
+                      dense_shape=np.array([3, 3])))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape[1:])
+        if sparse_tensor.is_sparse(c) else c.shape[1:] for c in components
+    ], [shape for shape in dataset.output_shapes])
+
+    expected = [
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[0]]),
+             values=np.array([1]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[1]]),
+             values=np.array([2]),
+             dense_shape=np.array([3]))),
+        (sparse_tensor.SparseTensorValue(
+            indices=np.array([[0]]),
+            values=np.array([0]),
+            dense_shape=np.array([1])),
+         sparse_tensor.SparseTensorValue(
+             indices=np.array([[2]]),
+             values=np.array([3]),
+             dense_shape=np.array([3]))),
+    ]
+    for i in range(3):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(
+          (list(zip(*components[:3]))[i] + expected[i]), results):
+        if sparse_tensor.is_sparse(component):
+          self.assertSparseValuesEqual(component, result_component)
+        else:
+          self.assertAllEqual(component, result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testFromTensorSlicesWithDict(self):
+    components = {"foo": [1, 2, 3], "bar": [[4.0], [5.0], [6.0]]}
+    dataset = dataset_ops.Dataset.from_tensor_slices(components)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual(dtypes.int32, dataset.output_types["foo"])
+    self.assertEqual(dtypes.float32, dataset.output_types["bar"])
+    self.assertEqual((), dataset.output_shapes["foo"])
+    self.assertEqual((1,), dataset.output_shapes["bar"])
+
+    for i in range(3):
+      results = self.evaluate(get_next())
+      self.assertEqual(components["foo"][i], results["foo"])
+      self.assertEqual(components["bar"][i], results["bar"])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/from_tensors_test.py b/tensorflow/python/data/kernel_tests/from_tensors_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab3c15263fdaa0829686f90450e0e79081299a2e
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/from_tensors_test.py
@@ -0,0 +1,259 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.from_tensors()."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.client import session
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FromTensorsTest(test_base.DatasetTestBase):
+
+  def testFromTensors(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+
+    self.assertEqual([c.shape for c in components],
+                     nest.flatten(dataset.output_shapes))
+
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  def testSkipEagerFromTensorsSparse(self):
+    """Test a dataset that represents a single tuple of tensors."""
+    components = (sparse_tensor.SparseTensorValue(
+        indices=np.array([[0]]),
+        values=np.array([0]),
+        dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+
+    self.assertEqual(
+        [tensor_shape.TensorShape(c.dense_shape) for c in components],
+        [shape for shape in dataset.output_shapes])
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  def testFromTensorsMixed(self):
+    """Test an dataset that represents a single tuple of tensors."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0]]),
+                      values=np.array([0]),
+                      dense_shape=np.array([1])),
+                  sparse_tensor.SparseTensorValue(
+                      indices=np.array([[0, 0], [1, 1]]),
+                      values=np.array([-1, 1]),
+                      dense_shape=np.array([2, 2])))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEqual([
+        tensor_shape.TensorShape(c.dense_shape)
+        if sparse_tensor.is_sparse(c) else c.shape for c in components
+    ], [shape for shape in dataset.output_shapes])
+
+    self.assertDatasetProduces(dataset, expected_output=[components])
+
+  # pylint: disable=g-long-lambda,unnecessary-lambda
+  def testNestedStructure(self):
+    components = (np.array([1, 2, 3], dtype=np.int64),
+                  (np.array([4., 5.]), np.array([6., 7.])),
+                  np.array([8, 9, 10], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.shuffle(10, 10)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.repeat(-1)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.filter(lambda x, y, z: True)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.take(5)
+    self.assertEquals((dtypes.int64, (dtypes.float64, dtypes.float64),
+                       dtypes.int64), dataset.output_types)
+    self.assertEquals(([3], ([2], [2]), [3]), dataset.output_shapes)
+
+    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x, y: dataset_ops.Dataset.from_tensors(((x[0], x[1]),
+                                                       (y[0], y[1])))
+    )
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([3], [3]), ([2], [2])), dataset.output_shapes)
+
+    dataset = dataset.batch(32)
+    self.assertEquals(((dtypes.int64, dtypes.int64),
+                       (dtypes.float64, dtypes.float64)), dataset.output_types)
+    self.assertEquals((([None, 3], [None, 3]), ([None, 2], [None, 2])),
+                      nest.pack_sequence_as(dataset.output_shapes, [
+                          s.as_list()
+                          for s in nest.flatten(dataset.output_shapes)
+                      ]))
+
+    # Define a separate set of components with matching leading
+    # dimension for the from-slices constructor.
+    components_for_slices = (np.array([1, 2, 3], dtype=np.int64),
+                             (np.array([4., 5., 6.]), np.array([7., 8., 9.])),
+                             np.array([10, 11, 12], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components_for_slices)
+    self.assertEquals((dtypes.int64,
+                       (dtypes.float64, dtypes.float64), dtypes.int64),
+                      dataset.output_types)
+    self.assertEquals(([], ([], []), []), dataset.output_shapes)
+
+  # TODO(b/117581999): more specific shapes in eager mode.
+  @test_util.run_deprecated_v1
+  def testSkipEagerNestedStructure(self):
+    components = (np.array([1, 2, 3], dtype=np.int64), (np.array([4., 5.]),
+                                                        np.array([6., 7.])),
+                  np.array([8, 9, 10], dtype=np.int64))
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    dataset = dataset.map(lambda x, y, z: ((x, z), (y[0], y[1])))
+
+    dataset = dataset.flat_map(
+        lambda x, y: dataset_ops.Dataset.from_tensors(
+            ((x[0], x[1]), (y[0], y[1])))).batch(32)
+
+    get_next = self.getNext(dataset)
+    (w, x), (y, z) = get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+    get_next = self.getNext(dataset)
+    (w, x), (y, z) = get_next()
+    self.assertEquals(dtypes.int64, w.dtype)
+    self.assertEquals(dtypes.int64, x.dtype)
+    self.assertEquals(dtypes.float64, y.dtype)
+    self.assertEquals(dtypes.float64, z.dtype)
+    self.assertEquals([None, 3], w.shape.as_list())
+    self.assertEquals([None, 3], x.shape.as_list())
+    self.assertEquals([None, 2], y.shape.as_list())
+    self.assertEquals([None, 2], z.shape.as_list())
+
+  def testNestedDict(self):
+    components = {"a": {"aa": 1, "ab": [2.0, 2.0]}, "b": [3, 3, 3]}
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals(dtypes.int32, dataset.output_types["a"]["aa"])
+    self.assertEquals(dtypes.float32, dataset.output_types["a"]["ab"])
+    self.assertEquals(dtypes.int32, dataset.output_types["b"])
+    self.assertEquals([], dataset.output_shapes["a"]["aa"])
+    self.assertEquals([2], dataset.output_shapes["a"]["ab"])
+    self.assertEquals([3], dataset.output_shapes["b"])
+
+  def testNonSequenceNestedStructure(self):
+    components = np.array([1, 2, 3], dtype=np.int64)
+
+    dataset = dataset_ops.Dataset.from_tensors(components)
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.filter(
+        lambda x: math_ops.reduce_all(math_ops.equal(x, components)))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    dataset = dataset.map(lambda x: array_ops.stack([x, x]))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([2, 3], dataset.output_shapes)
+
+    dataset = dataset.flat_map(
+        lambda x: dataset_ops.Dataset.from_tensor_slices(x))
+    self.assertEquals(dtypes.int64, dataset.output_types)
+    self.assertEquals([3], dataset.output_shapes)
+
+    get_next = self.getNext(dataset)
+    self.assertEquals(dtypes.int64, get_next().dtype)
+    self.assertEquals([3], get_next().shape)
+
+  def testSkipEagerSplitPipelineFailsWithPlacementError(self):
+    with session.Session(
+        target="",
+        config=config_pb2.ConfigProto(device_count={"CPU": 2})) as sess:
+
+      dataset = dataset_ops.Dataset.from_tensors(0)
+
+      # Define a pipeline that attempts to use variables on two
+      # different devices.
+      #
+      # Initialize the variables before creating to iterator, to avoid the
+      # placement algorithm overriding the DT_RESOURCE colocation constraints.
+      with ops.device("/cpu:0"):
+        var_0 = resource_variable_ops.ResourceVariable(initial_value=0)
+        dataset = dataset.map(lambda x: x + var_0.read_value())
+      sess.run(var_0.initializer)
+
+      with ops.device("/cpu:1"):
+        var_1 = resource_variable_ops.ResourceVariable(initial_value=0)
+        dataset = dataset.map(lambda x: x + var_1.read_value())
+      sess.run(var_1.initializer)
+
+      iterator = dataset_ops.make_initializable_iterator(dataset)
+      sess.run(iterator.initializer)
+
+      with self.assertRaisesRegexp(
+          errors.FailedPreconditionError,
+          "Error while reading resource variable Variable"):
+        sess.run(iterator.get_next())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/inputs_test.py b/tensorflow/python/data/kernel_tests/inputs_test.py
deleted file mode 100644
index d089b49bcc6f80b734ad5e7cb96dfea321504e6f..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/inputs_test.py
+++ /dev/null
@@ -1,149 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.data.util import nest
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.platform import test
-
-
-class InputsTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @staticmethod
-  def make_apply_fn(dataset):
-
-    def apply_fn(dataset):
-
-      def _apply_fn(dataset):
-        return dataset.cache()
-
-      return dataset.apply(_apply_fn)
-
-    return apply_fn
-
-  @staticmethod
-  def make_gen():
-
-    def gen():
-      yield 42
-
-    return gen
-
-  @staticmethod
-  def make_interleave_fn(dataset, num_parallel_calls=None):
-
-    def interleave_fn(dataset):
-      return dataset.interleave(
-          lambda x: dataset_ops.Dataset.range(0),
-          cycle_length=2,
-          num_parallel_calls=num_parallel_calls)
-
-    return interleave_fn
-
-  @parameterized.named_parameters(
-      ("FixedLengthRecord", readers.FixedLengthRecordDataset("", 42)),
-      ("FromGenerator",
-       dataset_ops.Dataset.from_generator(make_gen.__func__(), dtypes.int32),
-       1),
-      ("FromSparseTensorSlices",
-       dataset_ops.Dataset.from_sparse_tensor_slices(
-           sparse_tensor.SparseTensor(
-               indices=np.array([[0, 0], [1, 0], [2, 0]]),
-               values=np.array([0, 0, 0]),
-               dense_shape=np.array([3, 1])))),
-      ("FromTensors", dataset_ops.Dataset.from_tensors([42])),
-      ("FromTensorSlices", dataset_ops.Dataset.from_tensors([42])),
-      ("Range", dataset_ops.Dataset.range(10)),
-      ("TextLine", readers.TextLineDataset("")),
-      ("TFRecord", readers.TFRecordDataset(""), 1),
-  )
-  def testDatasetSourceInputs(self, dataset, num_inputs=0):
-    self.assertEqual(num_inputs, len(dataset._inputs()))
-
-  @parameterized.named_parameters(
-      ("Apply", make_apply_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Batch", lambda x: x.batch(10), dataset_ops.Dataset.range(0)),
-      ("Cache", lambda x: x.cache(), dataset_ops.Dataset.range(0)),
-      ("Filter", lambda x: x.filter(lambda x: True),
-       dataset_ops.Dataset.range(0)),
-      ("FlatMap", lambda x: x.flat_map(lambda x: dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Interleave", make_interleave_fn.__func__(dataset_ops.Dataset.range(0)),
-       dataset_ops.Dataset.range(0)),
-      ("Map", lambda x: x.map(lambda x: x), dataset_ops.Dataset.range(0)),
-      ("PaddedBatch", lambda x: x.padded_batch(10, []),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelInterleave",
-       make_interleave_fn.__func__(dataset_ops.Dataset.range(0), 2),
-       dataset_ops.Dataset.range(0)),
-      ("ParallelMap", lambda x: x.map(lambda x: x, num_parallel_calls=2),
-       dataset_ops.Dataset.range(0)),
-      ("Repeat", lambda x: x.repeat(), dataset_ops.Dataset.range(0)),
-      ("Shuffle", lambda x: x.shuffle(10), dataset_ops.Dataset.range(0)),
-      ("Skip", lambda x: x.skip(1), dataset_ops.Dataset.range(0)),
-      ("Take", lambda x: x.take(1), dataset_ops.Dataset.range(0)),
-      ("Window", lambda x: x.window(10), dataset_ops.Dataset.range(0)),
-  )
-  def testUnaryTransformationInputs(self, dataset_fn, input_dataset):
-    self.assertEqual([input_dataset], dataset_fn(input_dataset)._inputs())
-
-  @parameterized.named_parameters(
-      ("Concatenate", lambda x, y: x.concatenate(y),
-       dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1)))
-  def testBinaryTransformationInputs(self, dataset_fn, input1, input2):
-    self.assertEqual([input1, input2], dataset_fn(input1, input2)._inputs())
-
-  @parameterized.named_parameters(
-      ("ZipOne", dataset_ops.Dataset.zip, (dataset_ops.Dataset.range(0))),
-      ("ZipNest", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0),
-        (dataset_ops.Dataset.range(1), dataset_ops.Dataset.range(2)))),
-      ("ZipTuple", dataset_ops.Dataset.zip,
-       (dataset_ops.Dataset.range(0), dataset_ops.Dataset.range(1))))
-  def testVariadicTransformationInputs(self, dataset_fn, input_datasets):
-    self.assertEqual(
-        nest.flatten(input_datasets),
-        dataset_fn(input_datasets)._inputs())
-
-  def testCollectInputs(self):
-    ds1 = dataset_ops.Dataset.range(0)
-    ds2 = ds1.concatenate(ds1)
-    ds3 = dataset_ops.Dataset.zip((ds2, ds1, ds2))
-
-    inputs = []
-    queue = [ds3]
-    while queue:
-      ds = queue[0]
-      queue = queue[1:]
-      queue.extend(ds._inputs())
-      inputs.append(ds)
-
-    self.assertEqual(5, inputs.count(ds1))
-    self.assertEqual(2, inputs.count(ds2))
-    self.assertEqual(1, inputs.count(ds3))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py b/tensorflow/python/data/kernel_tests/interleave_test.py
similarity index 85%
rename from tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/interleave_test.py
index b911c249ced1286223ae5477df75c71b3fececab..05a211afcc177faaeb1a00ad03d8f117448f8315 100644
--- a/tensorflow/python/data/kernel_tests/interleave_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/interleave_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.interleave()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -27,6 +27,7 @@ from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import sparse_ops
@@ -115,7 +116,7 @@ def _make_coordinated_sloppy_dataset(input_values, cycle_length, block_length,
   dataset = dataset_ops.Dataset.from_tensor_slices(input_values).repeat(
       2).interleave(interleave_fn, cycle_length, block_length,
                     num_parallel_calls).with_options(options)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   get_next = iterator.get_next()
   return get_next, coordination_events
 
@@ -133,7 +134,8 @@ def _repeat(values, count):
   return [[value] * value for value in np.tile(values, count)]
 
 
-class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class InterleaveTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.named_parameters(
       ("1", [4, 5, 6], 1, 1, [
@@ -191,16 +193,11 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         count).interleave(
             lambda x: dataset_ops.Dataset.from_tensors(x).repeat(x),
             cycle_length, block_length, num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      for expected_element in _interleave(
-          _repeat(input_values, count), cycle_length, block_length):
-        self.assertEqual(expected_element, sess.run(get_next))
-
-      for _ in range(2):
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next)
+    expected_output = [
+        element for element in _interleave(
+            _repeat(input_values, count), cycle_length, block_length)
+    ]
+    self.assertDatasetProduces(dataset, expected_output)
 
   @parameterized.named_parameters(
       ("1", np.float32([1., np.nan, 2., np.nan, 3.]), 1, 3, None),
@@ -223,17 +220,16 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         lambda x: array_ops.check_numerics(x, "message")).interleave(
             dataset_ops.Dataset.from_tensors, cycle_length, block_length,
             num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = self.getNext(dataset)
 
-    with self.cached_session() as sess:
-      for value in input_values:
-        if np.isnan(value):
-          with self.assertRaises(errors.InvalidArgumentError):
-            sess.run(get_next)
-        else:
-          self.assertEqual(value, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    for value in input_values:
+      if np.isnan(value):
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(get_next())
+      else:
+        self.assertEqual(value, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   def testInterleaveSparse(self):
 
@@ -245,18 +241,17 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return dataset_ops.Dataset.from_tensor_slices(
           sparse_ops.sparse_to_dense(x.indices, x.dense_shape, x.values))
 
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_map_fn).interleave(
-            _interleave_fn, cycle_length=1).make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      for i in range(10):
-        for j in range(2):
-          expected = [i, 0] if j % 2 == 0 else [0, -i]
-          self.assertAllEqual(expected, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).map(_map_fn).interleave(
+        _interleave_fn, cycle_length=1)
+    get_next = self.getNext(dataset)
+    for i in range(10):
+      for j in range(2):
+        expected = [i, 0] if j % 2 == 0 else [0, -i]
+        self.assertAllEqual(expected, self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
 
   @parameterized.named_parameters(
       ("1", np.int64([4, 5, 6]), 2, 1, 1),
@@ -269,8 +264,9 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("8", np.int64([4, 0, 6]), 2, 3, 1),
       ("9", np.int64([4, 0, 6]), 2, 3, 2),
   )
-  def testSloppyInterleaveInOrder(self, input_values, cycle_length,
-                                  block_length, num_parallel_calls):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveInOrder(self, input_values, cycle_length,
+                                           block_length, num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
         input_values, cycle_length, block_length, num_parallel_calls)
     config = config_pb2.ConfigProto(
@@ -281,7 +277,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           _repeat(input_values, 2), cycle_length, block_length):
         coordination_events[expected_element].set()
         self.assertEqual(expected_element * expected_element,
-                         sess.run(get_next))
+                         self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -291,8 +287,9 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       ("3", np.int64([4, 5, 6]), 3, 2, 3),
       ("4", np.int64([4, 0, 6]), 2, 3, 2),
   )
-  def testSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
-                                     block_length, num_parallel_calls):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerSloppyInterleaveOutOfOrder(self, input_values, cycle_length,
+                                              block_length, num_parallel_calls):
     get_next, coordination_events = _make_coordinated_sloppy_dataset(
         input_values, cycle_length, block_length, num_parallel_calls)
     config = config_pb2.ConfigProto(
@@ -308,7 +305,7 @@ class InterleaveDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
       for element in elements:
         coordination_events[element].set()
-        self.assertEqual(element * element, sess.run(get_next))
+        self.assertEqual(element * element, self.evaluate(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
diff --git a/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..91b356691b75eb337ad61643646ba717e4929ab9
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/iterator_checkpoint_test.py
@@ -0,0 +1,129 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Checkpoint tests for `tf.data.Iterator`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import functools
+import os
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+from tensorflow.python.training import checkpoint_management
+from tensorflow.python.training.checkpointable import util as checkpointable_utils
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class IteratorCheckpointingTest(test_base.DatasetTestBase):
+
+  def testSaveRestoreOneShotIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(
+        math_ops.square).batch(2)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual([1, 4], get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([9, 16], get_next())
+    self.assertAllEqual([25, 36], get_next())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual([9, 16], get_next())
+    self.assertAllEqual([25, 36], get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      get_next()
+
+  def testSaveRestoreMultipleIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.from_tensor_slices(
+        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+    dataset = dataset.map(math_ops.square).batch(2)
+    iterator_1 = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next_1 = iterator_1.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_1.get_next())
+    iterator_2 = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next_2 = iterator_2.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_2.get_next())
+    dataset_2 = dataset_ops.Dataset.range(10)
+    iterator_3 = iter(dataset_2) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset_2)
+    get_next_3 = iterator_3.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator_3.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(
+        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
+    self.assertAllEqual([1, 4], get_next_1())
+    self.assertAllEqual(0, get_next_3())
+    self.assertAllEqual(1, get_next_3())
+    self.assertAllEqual(2, get_next_3())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual([1, 4], get_next_2())
+    self.assertAllEqual([9, 16], get_next_2())
+    self.assertAllEqual(3, get_next_3())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual([9, 16], get_next_1())
+    self.assertAllEqual([1, 4], get_next_2())
+    self.assertAllEqual(3, get_next_3())
+
+  def testRestoreExhaustedIterator(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.range(3)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_one_shot_iterator(dataset)
+    get_next = iterator.get_next if context.executing_eagerly(
+    ) else functools.partial(self.evaluate, iterator.get_next())
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    self.assertAllEqual(0, get_next())
+    self.assertAllEqual(1, get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    self.assertAllEqual(2, get_next())
+    checkpoint.restore(save_path).run_restore_ops()
+    self.assertAllEqual(2, get_next())
+    save_path = checkpoint.save(checkpoint_prefix)
+    checkpoint.restore(save_path).run_restore_ops()
+    with self.assertRaises(errors.OutOfRangeError):
+      get_next()
+
+  def testRestoreInReconstructedIteratorInitializable(self):
+    checkpoint_directory = self.get_temp_dir()
+    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
+    dataset = dataset_ops.Dataset.range(10)
+    iterator = iter(dataset) if context.executing_eagerly(
+    ) else dataset_ops.make_initializable_iterator(dataset)
+    get_next = iterator.get_next
+    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
+    for i in range(5):
+      checkpoint.restore(
+          checkpoint_management.latest_checkpoint(
+              checkpoint_directory)).initialize_or_restore()
+      for j in range(2):
+        self.assertEqual(i * 2 + j, self.evaluate(get_next()))
+      checkpoint.save(file_prefix=checkpoint_prefix)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
similarity index 93%
rename from tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
rename to tensorflow/python/data/kernel_tests/iterator_cluster_test.py
index bf5fd781d65cd11ded307221dc80cb58567a41b6..20088234953b1cdc8f85381ded45cf22aa93c75a 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_cluster_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_cluster_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops that need test_util."""
+"""Tests for `tf.data.Iterator` using distributed sessions."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 
 class IteratorClusterTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorWithoutRemoteCallFail(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -47,7 +48,7 @@ class IteratorClusterTest(test.TestCase):
 
     with ops.device("/job:worker/replica:0/task:0/cpu:1"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     with ops.device("/job:worker/replica:0/task:0/cpu:0"):
@@ -62,7 +63,7 @@ class IteratorClusterTest(test.TestCase):
   def _testRemoteIteratorHelper(self, device0, device1, target):
     with ops.device(device1):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     @function.Defun(dtypes.string)
@@ -92,6 +93,7 @@ class IteratorClusterTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(remote_op, feed_dict={target_placeholder: device1})
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorUsingRemoteCallOp(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -102,6 +104,7 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:0/cpu:1",
                                    worker[0].target)
 
+  @test_util.run_v1_only("b/120545219")
   def testRemoteIteratorUsingRemoteCallOpCrossProcess(self):
     workers, _ = test_util.create_local_cluster(2, 1)
 
@@ -109,6 +112,7 @@ class IteratorClusterTest(test.TestCase):
                                    "/job:worker/replica:0/task:1/cpu:0",
                                    workers[0].target)
 
+  @test_util.run_v1_only("b/120545219")
   def testCaptureHashTableInSharedIterator(self):
     worker, _ = test_util.create_local_cluster(1, 1)
 
@@ -143,6 +147,7 @@ class IteratorClusterTest(test.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_v1_only("b/120545219")
   def testImplicitDisposeParallelMapDataset(self):
     # Tests whether a parallel map dataset will be cleaned up correctly when
     # the pipeline does not run it until exhaustion.
@@ -161,7 +166,7 @@ class IteratorClusterTest(test.TestCase):
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
         .repeat(None).prefetch(10000))
 
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
diff --git a/tensorflow/python/data/kernel_tests/iterator_ops_test.py b/tensorflow/python/data/kernel_tests/iterator_test.py
similarity index 83%
rename from tensorflow/python/data/kernel_tests/iterator_ops_test.py
rename to tensorflow/python/data/kernel_tests/iterator_test.py
index a2a3528cc620df85ea797aaa7657cc79ff320285..916cf8bb45ce7dbf55261d3f67ca17c0cdbb10fd 100644
--- a/tensorflow/python/data/kernel_tests/iterator_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/iterator_test.py
@@ -12,12 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Iterator`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import functools
 import os
 import warnings
 
@@ -50,24 +49,24 @@ from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import script_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
-from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import server_lib
-from tensorflow.python.training.checkpointable import util as checkpointable_utils
 from tensorflow.python.util import compat
 
 
 class IteratorTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoGradients(self):
     component = constant_op.constant([1.])
     side = constant_op.constant(0.)
     add = lambda x: x + side
     dataset = dataset_ops.Dataset.from_tensor_slices(component).map(add)
-    value = dataset.make_one_shot_iterator().get_next()
+    value = dataset_ops.make_one_shot_iterator(dataset).get_next()
     self.assertIsNone(gradients_impl.gradients(value, component)[0])
     self.assertIsNone(gradients_impl.gradients(value, side)[0])
     self.assertIsNone(gradients_impl.gradients(value, [component, side])[0])
 
+  @test_util.run_deprecated_v1
   def testCapturingStateInOneShotRaisesException(self):
     var = variables.Variable(37.0, name="myvar")
     dataset = (
@@ -76,8 +75,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r"`Dataset.make_one_shot_iterator\(\)` does not support "
         "datasets that capture stateful objects.+myvar"):
-      dataset.make_one_shot_iterator()
+      dataset_ops.make_one_shot_iterator(dataset)
 
+  @test_util.run_deprecated_v1
   def testOneShotIterator(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -86,9 +86,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (
+    iterator = dataset_ops.make_one_shot_iterator(
         dataset_ops.Dataset.from_tensor_slices(components).map(_map_fn)
-        .repeat(14).make_one_shot_iterator())
+        .repeat(14))
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -103,6 +103,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testOneShotIteratorCaptureByValue(self):
     components = (np.arange(7),
                   np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
@@ -112,9 +113,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _map_fn(x, y, z):
       return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-    iterator = (
+    iterator = dataset_ops.make_one_shot_iterator(
         dataset_ops.Dataset.from_tensor_slices(tensor_components)
-        .map(_map_fn).repeat(14).make_one_shot_iterator())
+        .map(_map_fn).repeat(14))
     get_next = iterator.get_next()
 
     self.assertEqual([c.shape[1:] for c in components],
@@ -139,9 +140,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       def _map_fn(x, y, z):
         return math_ops.square(x), math_ops.square(y), math_ops.square(z)
 
-      iterator = (
+      iterator = dataset_ops.make_one_shot_iterator(
           dataset_ops.Dataset.from_tensor_slices(components)
-          .map(_map_fn).repeat(14).make_one_shot_iterator())
+          .map(_map_fn).repeat(14))
       return iterator.get_next()
 
     server = server_lib.Server.create_local_server()
@@ -165,9 +166,10 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testOneShotIteratorNonBlocking(self):
     dataset = dataset_ops.Dataset.from_tensors([1, 2, 3]).map(lambda x: x * x)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     # Create a session with a single thread to ensure that the
@@ -203,12 +205,13 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                        len([None for r in results if r is None]))
       self.assertAllEqual([[1, 4, 9]], [r for r in results if r is not None])
 
+  @test_util.run_deprecated_v1
   def testOneShotIteratorInitializerFails(self):
     # Define a dataset whose initialization will always fail.
     dataset = dataset_ops.Dataset.from_tensors(
         array_ops.check_numerics(
             constant_op.constant(1.0) / constant_op.constant(0.0), "oops"))
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     next_element = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -283,11 +286,11 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.OutOfRangeError):
           sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testNotInitializedError(self):
     components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    iterator = (
-        dataset_ops.Dataset.from_tensors(components)
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(components))
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -295,6 +298,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                                    "iterator has not been initialized"):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
   def testReinitializableIterator(self):
     dataset_3 = dataset_ops.Dataset.from_tensors(
         constant_op.constant([1, 2, 3]))
@@ -334,6 +338,33 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @test_util.run_deprecated_v1
+  def testReinitializableIteratorWithFunctions(self):
+
+    def g():
+      for i in range(10):
+        yield i
+
+    iterator = iterator_ops.Iterator.from_structure(dtypes.int64, [])
+    next_element = iterator.get_next()
+
+    with self.cached_session() as sess:
+      dataset_1 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_1))
+      for expected in range(10):
+        self.assertEqual(expected, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
+      dataset_2 = dataset_ops.Dataset.from_generator(
+          g, output_types=dtypes.int64)
+      sess.run(iterator.make_initializer(dataset_2))
+      for expected in range(10):
+        self.assertEqual(expected, sess.run(next_element))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(next_element)
+
   def testReinitializableIteratorStaticErrors(self):
     # Non-matching structure for types and shapes.
     with self.assertRaises(TypeError):
@@ -367,12 +398,13 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
               (constant_op.constant([1, 2, 3], dtype=dtypes.int64),
                constant_op.constant([4., 5., 6., 7.], dtype=dtypes.float64))))
 
+  @test_util.run_deprecated_v1
   def testIteratorStringHandle(self):
     dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
     dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
 
-    iterator_3 = dataset_3.make_one_shot_iterator()
-    iterator_4 = dataset_4.make_one_shot_iterator()
+    iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+    iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
 
     handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
     feedable_iterator = iterator_ops.Iterator.from_string_handle(
@@ -422,13 +454,14 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         sess.run(
             next_element, feed_dict={handle_placeholder: iterator_4_handle})
 
+  @test_util.run_deprecated_v1
   def testIteratorStringHandleFuture(self):
     with forward_compat.forward_compatibility_horizon(2018, 8, 4):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
       dataset_4 = dataset_ops.Dataset.from_tensor_slices([10, 20, 30, 40])
 
-      iterator_3 = dataset_3.make_one_shot_iterator()
-      iterator_4 = dataset_4.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
+      iterator_4 = dataset_ops.make_one_shot_iterator(dataset_4)
 
       handle_placeholder = array_ops.placeholder(dtypes.string, shape=[])
       feedable_iterator = iterator_ops.Iterator.from_string_handle(
@@ -485,10 +518,11 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
           sess.run(
               next_element, feed_dict={handle_placeholder: iterator_4_handle})
 
+  @test_util.run_deprecated_v1
   def testIteratorStringHandleReuseTensorObject(self):
     dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-    one_shot_iterator = dataset.make_one_shot_iterator()
-    initializable_iterator = dataset.make_initializable_iterator()
+    one_shot_iterator = dataset_ops.make_one_shot_iterator(dataset)
+    initializable_iterator = dataset_ops.make_initializable_iterator(dataset)
     structure_iterator = iterator_ops.Iterator.from_structure(
         dataset.output_types)
 
@@ -513,6 +547,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     self.assertEqual("foo_1", handle_with_same_name.op.name)
     self.assertIsNot(handle_with_name, handle_with_same_name)
 
+  @test_util.run_deprecated_v1
   def testIteratorStringHandleError(self):
     dataset_int_scalar = (
         dataset_ops.Dataset.from_tensor_slices([1, 2, 3]).repeat())
@@ -528,10 +563,10 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         handle_placeholder, dtypes.int32)
 
     with self.cached_session() as sess:
-      handle_int_scalar = sess.run(
-          dataset_int_scalar.make_one_shot_iterator().string_handle())
-      handle_float_vector = sess.run(
-          dataset_float_vector.make_one_shot_iterator().string_handle())
+      handle_int_scalar = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_int_scalar).string_handle())
+      handle_float_vector = sess.run(dataset_ops.make_one_shot_iterator(
+          dataset_float_vector).string_handle())
 
       self.assertEqual(1,
                        sess.run(
@@ -553,13 +588,14 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
             feedable_int_vector.get_next(),
             feed_dict={handle_placeholder: handle_float_vector}))
 
+  @test_util.run_deprecated_v1
   def testRemoteIteratorUsingRemoteCallOpDirectSession(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 3
 
     with ops.device("/job:localhost/replica:0/task:0/cpu:1"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     @function.Defun(dtypes.string)
@@ -609,6 +645,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                 target_placeholder: "/job:localhost/replica:0/task:0/cpu:1"
             })
 
+  @test_util.run_deprecated_v1
   def testRemoteIteratorUsingRemoteCallOpMultiWorkers(self):
     s1 = server_lib.Server.create_local_server()
     s2 = server_lib.Server.create_local_server()
@@ -631,7 +668,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     for device in worker_devices:
       with ops.device(device):
         src = dataset_ops.Dataset.from_tensor_slices([device])
-        itr = src.make_one_shot_iterator()
+        itr = dataset_ops.make_one_shot_iterator(src)
         itr_handles.append(itr.string_handle())
 
     targets = dataset_ops.Dataset.from_tensor_slices(worker_devices)
@@ -649,7 +686,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     with ops.device("/job:client"):
       client_dataset = dataset_ops.Dataset.zip((targets, handles)).map(map_fn)
-      itr = client_dataset.make_initializable_iterator()
+      itr = dataset_ops.make_initializable_iterator(client_dataset)
       n = itr.get_next()
 
     with session.Session(s3.target, config=config) as sess:
@@ -667,7 +704,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     with ops.device("/job:localhost/replica:0/task:0/cpu:0"):
       dataset_3 = dataset_ops.Dataset.from_tensor_slices([1, 2, 3])
-      iterator_3 = dataset_3.make_one_shot_iterator()
+      iterator_3 = dataset_ops.make_one_shot_iterator(dataset_3)
       iterator_3_handle = iterator_3.string_handle()
 
     def _encode_raw(byte_array):
@@ -716,6 +753,7 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                 target_placeholder: "/job:localhost/replica:0/task:0/cpu:0"
             })
 
+  @test_util.run_deprecated_v1
   def testIncorrectIteratorRestore(self):
 
     def _path():
@@ -738,8 +776,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
     def _build_range_dataset_graph():
       start = 1
       stop = 10
-      iterator = dataset_ops.Dataset.range(start,
-                                           stop).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(start, stop))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       save_op = _save_op(iterator._iterator_resource)
@@ -748,8 +786,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
     def _build_reader_dataset_graph():
       filenames = ["test"]  # Does not exist but we don't care in this test.
-      iterator = readers.FixedLengthRecordDataset(
-          filenames, 1, 0, 0).make_initializable_iterator()
+      iterator = dataset_ops.make_initializable_iterator(
+          readers.FixedLengthRecordDataset(filenames, 1, 0, 0))
       init_op = iterator.initializer
       get_next_op = iterator.get_next()
       save_op = _save_op(iterator._iterator_resource)
@@ -774,8 +812,9 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
         with self.assertRaises(errors.InvalidArgumentError):
           sess.run(restore_op)
 
+  @test_util.run_deprecated_v1
   def testRepeatedGetNextWarning(self):
-    iterator = dataset_ops.Dataset.range(10).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset_ops.Dataset.range(10))
     warnings.simplefilter("always")
     with warnings.catch_warnings(record=True) as w:
       for _ in range(100):
@@ -818,8 +857,8 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
                             expected_output_classes, expected_output_types,
                             expected_output_shapes):
     tf_value = tf_value_fn()
-    iterator = dataset_ops.Dataset.from_tensors(
-        tf_value).make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.from_tensors(tf_value))
 
     self.assertTrue(expected_element_structure.is_compatible_with(
         iterator._element_structure))
@@ -832,100 +871,11 @@ class IteratorTest(test.TestCase, parameterized.TestCase):
 
   def testIteratorGetNextName(self):
     with ops.Graph().as_default():
-      iterator = dataset_ops.Dataset.from_tensors(37.0).make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(
+          dataset_ops.Dataset.from_tensors(37.0))
       next_element = iterator.get_next(name="overridden_name")
       self.assertEqual("overridden_name", next_element.op.name)
 
 
-class IteratorCheckpointingTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def testSaveRestoreOneShotIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.from_tensor_slices([1, 2, 3, 4, 5, 6]).map(
-        math_ops.square).batch(2)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    self.assertAllEqual([1, 4], get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual([9, 16], get_next())
-    self.assertAllEqual([25, 36], get_next())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual([9, 16], get_next())
-    self.assertAllEqual([25, 36], get_next())
-    with self.assertRaises(errors.OutOfRangeError):
-      get_next()
-
-  @test_util.run_in_graph_and_eager_modes
-  def testSaveRestoreMultipleIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.from_tensor_slices(
-        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
-    dataset = dataset.map(math_ops.square).batch(2)
-    iterator_1 = dataset.make_one_shot_iterator()
-    get_next_1 = iterator_1.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_1.get_next())
-    iterator_2 = dataset.make_one_shot_iterator()
-    get_next_2 = iterator_2.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_2.get_next())
-    dataset_2 = dataset_ops.Dataset.range(10)
-    iterator_3 = dataset_2.make_one_shot_iterator()
-    get_next_3 = iterator_3.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator_3.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(
-        iterator_1=iterator_1, iterator_2=iterator_2, iterator_3=iterator_3)
-    self.assertAllEqual([1, 4], get_next_1())
-    self.assertAllEqual(0, get_next_3())
-    self.assertAllEqual(1, get_next_3())
-    self.assertAllEqual(2, get_next_3())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual([1, 4], get_next_2())
-    self.assertAllEqual([9, 16], get_next_2())
-    self.assertAllEqual(3, get_next_3())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual([9, 16], get_next_1())
-    self.assertAllEqual([1, 4], get_next_2())
-    self.assertAllEqual(3, get_next_3())
-
-  @test_util.run_in_graph_and_eager_modes
-  def testRestoreExhaustedIterator(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.range(3)
-    iterator = dataset.make_one_shot_iterator()
-    get_next = iterator.get_next if context.executing_eagerly(
-    ) else functools.partial(self.evaluate, iterator.get_next())
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    self.assertAllEqual(0, get_next())
-    self.assertAllEqual(1, get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    self.assertAllEqual(2, get_next())
-    checkpoint.restore(save_path).run_restore_ops()
-    self.assertAllEqual(2, get_next())
-    save_path = checkpoint.save(checkpoint_prefix)
-    checkpoint.restore(save_path).run_restore_ops()
-    with self.assertRaises(errors.OutOfRangeError):
-      get_next()
-
-  def testRestoreInReconstructedIteratorInitializable(self):
-    checkpoint_directory = self.get_temp_dir()
-    checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
-    dataset = dataset_ops.Dataset.range(10)
-    iterator = dataset.make_initializable_iterator()
-    get_next = iterator.get_next()
-    checkpoint = checkpointable_utils.Checkpoint(iterator=iterator)
-    for i in range(5):
-      with self.cached_session() as sess:
-        checkpoint.restore(checkpoint_management.latest_checkpoint(
-            checkpoint_directory)).initialize_or_restore(sess)
-        for j in range(2):
-          self.assertEqual(i * 2 + j, sess.run(get_next))
-        checkpoint.save(file_prefix=checkpoint_prefix)
-
-
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py b/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
deleted file mode 100644
index b58c1444daeb03e9fa0b02a7288cbdaebbc0e42e..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/list_files_dataset_op_test.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from os import path
-import shutil
-import tempfile
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-class ListFilesDatasetOpTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    self.tmp_dir = tempfile.mkdtemp()
-
-  def tearDown(self):
-    shutil.rmtree(self.tmp_dir, ignore_errors=True)
-
-  def _touchTempFiles(self, filenames):
-    for filename in filenames:
-      open(path.join(self.tmp_dir, filename), 'a').close()
-
-  def testEmptyDirectory(self):
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testSimpleDirectory(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testSimpleDirectoryNotShuffled(self):
-    filenames = ['b', 'c', 'a']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=False)
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      for filename in sorted(filenames):
-        self.assertEqual(compat.as_bytes(path.join(self.tmp_dir, filename)),
-                         sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFixedSeedResultsInRepeatableOrder(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = [compat.as_bytes(path.join(self.tmp_dir, filename))
-                        for filename in filenames]
-
-      all_produced_filenames = []
-      for _ in range(3):
-        produced_filenames = []
-        sess.run(itr.initializer)
-        try:
-          while True:
-            produced_filenames.append(sess.run(next_element))
-        except errors.OutOfRangeError:
-          pass
-        all_produced_filenames.append(produced_filenames)
-
-      # Each run should produce the same set of filenames, which may be
-      # different from the order of `full_filenames`.
-      self.assertItemsEqual(full_filenames, all_produced_filenames[0])
-      # However, the different runs should produce filenames in the same order
-      # as each other.
-      self.assertEqual(all_produced_filenames[0], all_produced_filenames[1])
-      self.assertEqual(all_produced_filenames[0], all_produced_filenames[2])
-
-  def testEmptyDirectoryInitializer(self):
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError, 'No files matched pattern: '):
-        sess.run(
-            itr.initializer,
-            feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-  def testSimpleDirectoryInitializer(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileSuffixes(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testFileMiddles(self):
-    filenames = ['a.txt', 'b.py', 'c.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(dtypes.string, shape=[])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      sess.run(
-          itr.initializer,
-          feed_dict={filename_placeholder: path.join(self.tmp_dir, '*.py*')})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[1:]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testNoShuffle(self):
-    filenames = ['a', 'b', 'c']
-    self._touchTempFiles(filenames)
-
-    # Repeat the list twice and ensure that the order is the same each time.
-    # NOTE(mrry): This depends on an implementation detail of `list_files()`,
-    # which is that the list of files is captured when the iterator is
-    # initialized. Otherwise, or if e.g. the iterator were initialized more than
-    # once, it's possible that the non-determinism of `tf.matching_files()`
-    # would cause this test to fail. However, it serves as a useful confirmation
-    # that the `shuffle=False` argument is working as intended.
-    # TODO(b/73959787): Provide some ordering guarantees so that this test is
-    # more meaningful.
-    dataset = dataset_ops.Dataset.list_files(
-        path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames * 2:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-      self.assertItemsEqual(full_filenames, produced_filenames)
-      self.assertEqual(produced_filenames[:len(filenames)],
-                       produced_filenames[len(filenames):])
-
-  def testMultiplePatternsAsList(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    patterns = [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']]
-    dataset = dataset_ops.Dataset.list_files(patterns)
-    with self.cached_session() as sess:
-      itr = dataset.make_one_shot_iterator()
-      next_element = itr.get_next()
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-  def testMultiplePatternsAsTensor(self):
-    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
-    self._touchTempFiles(filenames)
-
-    filename_placeholder = array_ops.placeholder(
-        dtypes.string, shape=[
-            2,
-        ])
-    dataset = dataset_ops.Dataset.list_files(filename_placeholder)
-
-    with self.cached_session() as sess:
-      itr = dataset.make_initializable_iterator()
-      next_element = itr.get_next()
-      patterns = [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']]
-      sess.run(itr.initializer, feed_dict={filename_placeholder: patterns})
-
-      full_filenames = []
-      produced_filenames = []
-      for filename in filenames[:-1]:
-        full_filenames.append(
-            compat.as_bytes(path.join(self.tmp_dir, filename)))
-        produced_filenames.append(compat.as_bytes(sess.run(next_element)))
-      self.assertItemsEqual(full_filenames, produced_filenames)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(itr.get_next())
-
-
-if __name__ == '__main__':
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/list_files_test.py b/tensorflow/python/data/kernel_tests/list_files_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a70c4b081d5c710082eb485a1dbb6179a90da2ce
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/list_files_test.py
@@ -0,0 +1,217 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.list_files()`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+import shutil
+import tempfile
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ListFilesTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    self.tmp_dir = tempfile.mkdtemp()
+
+  def tearDown(self):
+    shutil.rmtree(self.tmp_dir, ignore_errors=True)
+
+  def _touchTempFiles(self, filenames):
+    for filename in filenames:
+      open(path.join(self.tmp_dir, filename), 'a').close()
+
+  # Note: eager mode fails in assertion error same as initializer in graph mode.
+  @test_util.run_deprecated_v1
+  def testSkipEagerEmptyDirectory(self):
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(dataset, expected_output=[])
+
+  def testSimpleDirectory(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
+
+  def testSimpleDirectoryNotShuffled(self):
+    filenames = ['b', 'c', 'a']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in sorted(filenames)
+        ])
+
+  def testFixedSeedResultsInRepeatableOrder(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=True, seed=37)
+
+    expected_filenames = [
+        compat.as_bytes(path.join(self.tmp_dir, filename))
+        for filename in filenames
+    ]
+
+    all_actual_filenames = []
+    for _ in range(3):
+      actual_filenames = []
+      next_element = self.getNext(dataset, requires_initialization=True)
+      try:
+        while True:
+          actual_filenames.append(self.evaluate(next_element()))
+      except errors.OutOfRangeError:
+        pass
+      all_actual_filenames.append(actual_filenames)
+
+    # Each run should produce the same set of filenames, which may be
+    # different from the order of `expected_filenames`.
+    self.assertItemsEqual(expected_filenames, all_actual_filenames[0])
+    # However, the different runs should produce filenames in the same order
+    # as each other.
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[1])
+    self.assertEqual(all_actual_filenames[0], all_actual_filenames[2])
+
+  # TODO(b/117581999): eager mode assertion fail wrapped, debug.
+  def tesSkipEagerEmptyDirectoryInitializer(self):
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(errors.InvalidArgumentError,
+                        'No files matched pattern'),
+        requires_initialization=True)
+
+  def testSimpleDirectoryInitializer(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames
+        ],
+        assert_items_equal=True)
+
+  def testFileSuffixes(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*.py'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[1:-1]
+        ],
+        assert_items_equal=True)
+
+  def testFileMiddles(self):
+    filenames = ['a.txt', 'b.py', 'c.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(path.join(self.tmp_dir, '*.py*'))
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[1:]
+        ],
+        assert_items_equal=True)
+
+  def testNoShuffle(self):
+    filenames = ['a', 'b', 'c']
+    self._touchTempFiles(filenames)
+
+    # Repeat the list twice and ensure that the order is the same each time.
+    # NOTE(mrry): This depends on an implementation detail of `list_files()`,
+    # which is that the list of files is captured when the iterator is
+    # initialized. Otherwise, or if e.g. the iterator were initialized more than
+    # once, it's possible that the non-determinism of `tf.matching_files()`
+    # would cause this test to fail. However, it serves as a useful confirmation
+    # that the `shuffle=False` argument is working as intended.
+    # TODO(b/73959787): Provide some ordering guarantees so that this test is
+    # more meaningful.
+    dataset = dataset_ops.Dataset.list_files(
+        path.join(self.tmp_dir, '*'), shuffle=False).repeat(2)
+    next_element = self.getNext(dataset)
+
+    expected_filenames = []
+    actual_filenames = []
+    for filename in filenames * 2:
+      expected_filenames.append(
+          compat.as_bytes(path.join(self.tmp_dir, filename)))
+      actual_filenames.append(compat.as_bytes(self.evaluate(next_element())))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+    self.assertItemsEqual(expected_filenames, actual_filenames)
+    self.assertEqual(actual_filenames[:len(filenames)],
+                     actual_filenames[len(filenames):])
+
+  def testMultiplePatternsAsList(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    patterns = [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']]
+    dataset = dataset_ops.Dataset.list_files(patterns)
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[:-1]
+        ],
+        assert_items_equal=True)
+
+  def testMultiplePatternsAsTensor(self):
+    filenames = ['a.txt', 'b.py', 'c.py', 'd.pyc']
+    self._touchTempFiles(filenames)
+
+    dataset = dataset_ops.Dataset.list_files(
+        [path.join(self.tmp_dir, pat) for pat in ['*.py', '*.txt']])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[
+            compat.as_bytes(path.join(self.tmp_dir, filename))
+            for filename in filenames[:-1]
+        ],
+        assert_items_equal=True)
+
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py b/tensorflow/python/data/kernel_tests/map_test.py
similarity index 80%
rename from tensorflow/python/data/kernel_tests/map_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/map_test.py
index 81ef7d16be2c9d7eb8513ebdcb83d93b750670c8..e07706413dea9932c0b83f9eaedd62707b57e668 100644
--- a/tensorflow/python/data/kernel_tests/map_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/map_test.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 from collections import namedtuple
 import threading
-import time
 import warnings
 
 from absl.testing import parameterized
@@ -27,7 +26,6 @@ import numpy as np
 
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.python.client import session
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.framework import constant_op
@@ -36,6 +34,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
@@ -79,11 +78,12 @@ def _make_coordinated_sloppy_dataset(num_elements, num_parallel_calls):
   options.experimental_deterministic = False
   dataset = dataset_ops.Dataset.range(num_elements).map(
       map_fn, num_parallel_calls).with_options(options)
-  iterator = dataset.make_one_shot_iterator()
+  iterator = dataset_ops.make_one_shot_iterator(dataset)
   next_element = iterator.get_next()
   return next_element, coordination_events
 
 
+@test_util.run_v1_only("b/120545219")
 class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def _buildMapDataset(self, components, count):
@@ -102,7 +102,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     count = array_ops.placeholder(dtypes.int64, shape=[])
 
     dataset = self._buildMapDataset(components, count)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -168,7 +168,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     dataset = self._buildParallelMapDataset(
         components, count, num_parallel_calls, output_buffer_size)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -237,7 +237,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = self._buildParallelMapDataset(components, 1000, 100, 100)
     # NOTE(mrry): Also test that the prefetching thread is cancelled correctly.
     dataset = dataset.prefetch(100)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -252,7 +252,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -267,7 +267,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"),
                     num_parallel_calls=2))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -288,7 +288,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset = (dataset_ops.Dataset.from_tensor_slices(components)
                .map(lambda x: array_ops.check_numerics(x, "message"))
                .prefetch(2))
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -314,8 +314,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return dataset_ops.Dataset.range(10).map(_map_fn)
 
     def _build_graph():
-      captured_iterator = dataset_ops.Dataset.range(
-          10).make_initializable_iterator()
+      captured_iterator = dataset_ops.make_initializable_iterator(
+          dataset_ops.Dataset.range(10))
       ds = _build_ds(captured_iterator)
       iterator = ds.make_initializable_iterator()
       init_op = iterator.initializer
@@ -345,10 +345,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     input_sentences = dataset_ops.Dataset.from_tensor_slices(
         ["brain brain tank salad surgery", "surgery brain"])
 
-    iterator = (input_sentences
-                .map(lambda x: string_ops.string_split([x]).values)
-                .map(table.lookup)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        input_sentences
+        .map(lambda x: string_ops.string_split([x]).values).map(table.lookup))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -365,8 +364,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     queue = data_flow_ops.FIFOQueue(200, dtypes.int64, shapes=[])
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: queue.dequeue()).make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(-1)
+        .map(lambda _: queue.dequeue()))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -389,9 +389,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     enqueue_op = queue.enqueue_many(elements)
     close_op = queue.close()
 
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(-1)
-                .map(lambda _: (queue.dequeue(), queue_2.dequeue()))
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(-1)
+        .map(lambda _: (queue.dequeue(), queue_2.dequeue())))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -408,9 +408,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testCaptureVariable(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: counter_var.assign_add(1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -428,9 +428,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testCaptureUninitializedVariableError(self):
     counter_var = variable_scope.get_variable(
         "counter", (), dtypes.int32, use_resource=True)
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: counter_var.assign_add(1))
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: counter_var.assign_add(1)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -440,9 +440,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         sess.run(get_next)
 
   def testSeededStatefulOperatorIsProperlyStateful(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: random_ops.random_uniform((), seed=11)).batch(2))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -464,11 +464,11 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertAllClose(random_values, random_values_2)
 
   def testStatefulMapKeepsStateAcrossIterators(self):
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10)
-                .map(lambda _: random_ops.random_uniform((), seed=11))
-                .repeat(1000)
-                .batch(10)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.from_tensors(0).repeat(10)
+        .map(lambda _: random_ops.random_uniform((), seed=11))
+        .repeat(1000)
+        .batch(10))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -493,9 +493,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       counter_var.assign_add(1)
       return x
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(increment_fn)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(increment_fn))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -511,17 +510,17 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertEqual(10, sess.run(counter_var))
 
   def testMapDict(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: {"foo": x * 2, "bar": x ** 2})
-                .map(lambda d: d["foo"] + d["bar"])
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10)
+        .map(lambda x: {"foo": x * 2, "bar": x ** 2})
+        .map(lambda d: d["foo"] + d["bar"]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(init_op)
       for i in range(10):
-        self.assertEqual(i * 2 + i ** 2, sess.run(get_next))
+        self.assertEqual(i * 2 + i**2, sess.run(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -546,8 +545,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     dataset_tuple = dataset_tuple.map(preprocess_tuple)
     dataset_namedtuple = dataset_namedtuple.map(preprocess_namedtuple)
 
-    next_tuple = dataset_tuple.make_one_shot_iterator().get_next()
-    next_namedtuple = dataset_namedtuple.make_one_shot_iterator().get_next()
+    next_tuple = dataset_ops.make_one_shot_iterator(dataset_tuple).get_next()
+    next_namedtuple = dataset_ops.make_one_shot_iterator(
+        dataset_namedtuple).get_next()
 
     # make sure both datasets contain the same data
     with self.cached_session() as sess:
@@ -561,16 +561,15 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testUseStepContainerInMap(self):
     row = np.arange(6)
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensors(row)
-        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems))
-        .make_initializable_iterator())
+        .map(lambda elems: functional_ops.map_fn(lambda x: x * x, elems)))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
       sess.run(init_op)
-      self.assertAllEqual(row ** 2, sess.run(get_next))
+      self.assertAllEqual(row**2, sess.run(get_next))
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
@@ -600,9 +599,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           pred_fn_pairs, default=multiply, exclusive=True)
 
     def build_dataset(row, num):
-      iterator = (
+      iterator = dataset_ops.make_initializable_iterator(
           dataset_ops.Dataset.from_tensor_slices(row).map(
-              lambda x: control_map_fn(x, num)).make_initializable_iterator())
+              lambda x: control_map_fn(x, num)))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       return init_op, get_next
@@ -639,11 +638,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     def build_dataset(row, num):
       # pylint: disable=g-long-lambda
-      iterator = (
+      iterator = dataset_ops.make_initializable_iterator(
           dataset_ops.Dataset.from_tensors(row).map(
-              lambda elems: functional_ops.map_fn(lambda x:
-                                                  control_map_fn(x, num), elems)
-              ).make_initializable_iterator())
+              lambda elems: functional_ops.map_fn(
+                  lambda x: control_map_fn(x, num), elems)))
       init_op = iterator.initializer
       get_next = iterator.get_next()
       return init_op, get_next
@@ -687,11 +685,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     row = np.arange(6)
     num = 2
     # pylint: disable=g-long-lambda
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.from_tensors(row).map(
-            lambda elems: functional_ops.map_fn(lambda x:
-                                                control_map_fn(x, num), elems)
-            ).make_initializable_iterator())
+            lambda elems: functional_ops.map_fn(
+                lambda x: control_map_fn(x, num), elems)))
     # pylint: enable=g-long-lambda
     init_op = iterator.initializer
     get_next = iterator.get_next()
@@ -721,11 +718,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return script_ops.py_func(_map_py_func, [x], x.dtype)
 
     buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(100)
         .map(_map_fn)
-        .prefetch(buffer_size_placeholder)
-        .make_initializable_iterator())
+        .prefetch(buffer_size_placeholder))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -761,9 +757,9 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           sess.run(get_next)
 
   def testReturnList(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(lambda x: [x, constant_op.constant(37.0)])
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10)
+        .map(lambda x: [x, constant_op.constant(37.0)]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -782,9 +778,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return script_ops.py_func(
           _map_py_func, [x_tensor], [dtypes.int64, dtypes.float64])
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_map_fn)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_map_fn))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -803,9 +798,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
           values=(i * np.array([1])),
           dense_shape=np.array([1, 1]))
 
-    iterator = (dataset_ops.Dataset.range(10)
-                .map(_sparse)
-                .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -830,9 +824,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertTrue(sparse_tensor.is_sparse(i))
       return sparse_ops.sparse_concat(0, [i, i])
 
-    iterator = (
-        dataset_ops.Dataset.range(10).map(_sparse).map(_check)
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(_sparse).map(_check))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -852,11 +845,10 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       else:
         return i
 
-    iterator = (
+    iterator = dataset_ops.make_initializable_iterator(
         dataset_ops.Dataset.range(105)
         .map(lambda x: script_ops.py_func(raising_py_func, [x], dtypes.int64),
-             num_parallel_calls=2)
-        .make_initializable_iterator())
+             num_parallel_calls=2))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -868,9 +860,8 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         sess.run(get_next)
 
   def testConstantOutput(self):
-    iterator = (
-        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10])
-        .make_initializable_iterator())
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).map(lambda x: [x, "hello", 10]))
     init_op = iterator.initializer
     get_next = iterator.get_next()
 
@@ -901,12 +892,14 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
         break
     self.assertTrue(found_warning)
 
-  def testNestedDatasetError(self):
-    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
-    with self.assertRaisesRegexp(
-        NotImplementedError, r"The Dataset.map\(\) transformation does not "
-        "currently support nested datasets as outputs."):
-      _ = dataset.map(dataset_ops.Dataset.from_tensor_slices)
+  def testNestedDatasetMap(self):
+    # TODO(b/110122868): When iterators can yield a `tf.data.Dataset`, remove
+    # the `get_single_element()` call.
+    dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0]).map(
+        dataset_ops.Dataset.from_tensor_slices).map(
+            lambda ds: ds.batch(3)).flat_map(lambda x: x)
+
+    self.assertDatasetProduces(dataset, [[1.0, 2.0, 3.0]])
 
   def testReturnValueError(self):
     dataset = dataset_ops.Dataset.from_tensors([1.0, 2.0, 3.0])
@@ -939,7 +932,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return const_tensor
 
     dataset = dataset.map(broken_function)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
 
     with self.cached_session() as sess:
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "BrokenConst"):
@@ -966,7 +959,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       return tids
 
     dataset = make_dataset_fn(dataset, _map_fn)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -987,7 +980,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
   def testShortCircuit(self, structure, map_fn, num_parallel_calls):
     dataset = self.structuredDataset(structure).repeat().map(
         map_fn, num_parallel_calls=num_parallel_calls)
-    get_next = dataset.make_one_shot_iterator().get_next()
+    get_next = dataset_ops.make_one_shot_iterator(dataset).get_next()
 
     with self.cached_session() as sess:
       if isinstance(structure, tuple):
@@ -1004,7 +997,7 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     captured_t = array_ops.placeholder(dtypes.int64, shape=[])
     dataset = self.structuredDataset(None).repeat().map(
         lambda x: captured_t, num_parallel_calls=num_parallel_calls)
-    iterator = dataset.make_initializable_iterator()
+    iterator = dataset_ops.make_initializable_iterator(dataset)
     get_next = iterator.get_next()
 
     with self.cached_session() as sess:
@@ -1054,108 +1047,22 @@ class MapDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
       with self.assertRaises(errors.OutOfRangeError):
         sess.run(get_next)
 
+  @parameterized.named_parameters(
+      ("Map", None),
+      ("ParallelMap", 12),
+  )
+  def testPreserveCardinality(self, num_parallel_calls):
+
+    def py_fn(_):
+      raise StopIteration()
+
+    dataset = dataset_ops.DatasetV2.from_tensors(0).map(
+        lambda x: script_ops.py_func(py_fn, [x], dtypes.int64),
+        num_parallel_calls=num_parallel_calls)
+    get_next = self.getNext(dataset)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(get_next())
 
-class MapDatasetBenchmark(test.Benchmark):
-
-  def benchmarkChainOfMaps(self):
-    chain_lengths = [0, 1, 2, 5, 10, 20, 50]
-    for chain_length in chain_lengths:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda x: x + 1
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda x: x
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
-
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(0).repeat(None)
-          for _ in range(chain_length):
-            dataset = dataset_ops.MapDataset(
-                dataset,
-                map_fn,
-                use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
-          next_element = iterator.get_next()
-
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element.op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element.op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset chain length%s: %d Median wall time: %f" %
-                  (print_label, chain_length, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="benchmark_map_dataset_chain_latency_%d%s" %
-                (chain_length, benchmark_label))
-
-  def benchmarkMapFanOut(self):
-    fan_outs = [1, 2, 5, 10, 20, 50, 100]
-    for fan_out in fan_outs:
-      for mode in ["general", "single-threaded", "short-circuit"]:
-        if mode == "general":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = True
-          print_label = ""
-          benchmark_label = ""
-        if mode == "single-threaded":
-          map_fn = lambda *xs: [x + 1 for x in xs]
-          use_inter_op_parallelism = False
-          print_label = " (single threaded mode)"
-          benchmark_label = "_single_threaded"
-        if mode == "short-circuit":
-          map_fn = lambda *xs: xs
-          use_inter_op_parallelism = True  # should not have any significance
-          print_label = " (short circuit mode)"
-          benchmark_label = "_short_circuit"
-
-        with ops.Graph().as_default():
-          dataset = dataset_ops.Dataset.from_tensors(
-              tuple(0 for _ in range(fan_out))).repeat(None)
-          dataset = dataset_ops.MapDataset(
-              dataset,
-              map_fn,
-              use_inter_op_parallelism=use_inter_op_parallelism)
-          iterator = dataset.make_one_shot_iterator()
-          next_element = iterator.get_next()
-
-          with session.Session() as sess:
-            for _ in range(5):
-              sess.run(next_element[0].op)
-            deltas = []
-            for _ in range(100):
-              start = time.time()
-              for _ in range(100):
-                sess.run(next_element[0].op)
-              end = time.time()
-              deltas.append(end - start)
-
-            median_wall_time = np.median(deltas) / 100
-            print("Map dataset fan out%s: %d Median wall time: %f" %
-                  (print_label, fan_out, median_wall_time))
-            self.report_benchmark(
-                iters=1000,
-                wall_time=median_wall_time,
-                name="benchmark_map_dataset_fan_out_%d%s" % (fan_out,
-                                                             benchmark_label))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
index 42ee1e218644291e48bd06757c183c37f9c5e8a4..0322d1f2c604c3f9588eb8eaa39eb9829bb0a26e 100644
--- a/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
+++ b/tensorflow/python/data/kernel_tests/multi_device_iterator_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""MultiDeviceIterator tests."""
+"""Tests for `tf.data.MultiDeviceIterator`."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.data.experimental.ops import optimization
+from tensorflow.python.data.experimental.ops.optimization_options import OptimizationOptions
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
@@ -31,8 +32,10 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
+# TODO(b/117581999): Add eager coverage.
 class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
+  @test_util.run_v1_only("b/120545219")
   def testNoGetNext(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -40,8 +43,9 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -50,14 +54,15 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testOneOnSameDevice(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(10)
@@ -67,14 +72,15 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testRepeatDevices(self):
     with ops.device("/cpu:0"):
       dataset = dataset_ops.Dataset.range(20)
@@ -85,18 +91,19 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 20, 4):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-        self.assertEqual(i + 2, sess.run(elem_on_3))
-        self.assertEqual(i + 3, sess.run(elem_on_4))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+        self.assertEqual(i + 2, self.evaluate(elem_on_3))
+        self.assertEqual(i + 3, self.evaluate(elem_on_4))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
-        sess.run(elem_on_3)
-        sess.run(elem_on_4)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
+        self.evaluate(elem_on_3)
+        self.evaluate(elem_on_4)
 
+  @test_util.run_v1_only("b/120545219")
   def testNotFullyDivisible(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -105,15 +112,16 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
-      self.assertEqual(8, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
+      self.assertEqual(8, self.evaluate(elem_on_1))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetNextAsOptional(self):
     dataset = dataset_ops.Dataset.range(9)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -126,7 +134,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
         elem_on_1_has_value, elem_on_1_value = sess.run(
             [elem_on_1_has_value_t, elem_on_1_t])
@@ -140,13 +148,14 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
           [elem_on_1_has_value_t, elem_on_1_t])
       self.assertTrue(elem_on_1_has_value)
       self.assertEqual(8, elem_on_1_value)
-      self.assertFalse(sess.run(elem_on_1_has_value_t))
-      self.assertFalse(sess.run(elem_on_2_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(elem_on_1_t)
+        self.evaluate(elem_on_1_t)
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(elem_on_2_t)
+        self.evaluate(elem_on_2_t)
 
+  @test_util.run_v1_only("b/120545219")
   def testUneven(self):
     dataset = dataset_ops.Dataset.range(10)
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -155,15 +164,16 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleInitializations(self):
     with ops.device("/cpu:0"):
       epoch = array_ops.placeholder(dtypes.int64, shape=[])
@@ -179,7 +189,8 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     with self.test_session(config=config) as sess:
       for i in range(1000):
         sess.run(init_op, feed_dict={epoch: i})
-        self.assertEqual([(i, 0), (i, 1)], sess.run([elem_on_1, elem_on_2]))
+        self.assertEqual([(i, 0), (i, 1)], self.evaluate([elem_on_1,
+                                                          elem_on_2]))
 
   def testBasicGpu(self):
     if not test_util.is_gpu_available():
@@ -192,13 +203,13 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
   def testUnevenGpu(self):
     if not test_util.is_gpu_available():
@@ -211,14 +222,14 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
+        self.assertEqual(i, self.evaluate(elem_on_1))
       for i in range(0, 10, 2):
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
   def testGetNextAsOptionalGpu(self):
     if not test_util.is_gpu_available():
@@ -235,7 +246,7 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 2, "GPU": 1})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 8, 2):
         elem_on_1_has_value, elem_on_1_value = sess.run(
             [elem_on_1_has_value_t, elem_on_1_t])
@@ -249,13 +260,14 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
           [elem_on_1_has_value_t, elem_on_1_t])
       self.assertTrue(elem_on_1_has_value)
       self.assertEqual(8, elem_on_1_value)
-      self.assertFalse(sess.run(elem_on_1_has_value_t))
-      self.assertFalse(sess.run(elem_on_2_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_1_has_value_t))
+      self.assertFalse(self.evaluate(elem_on_2_has_value_t))
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(elem_on_1_t)
+        self.evaluate(elem_on_1_t)
       with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(elem_on_2_t)
+        self.evaluate(elem_on_2_t)
 
+  @test_util.run_v1_only("b/120545219")
   def testOptimization(self):
     dataset = dataset_ops.Dataset.range(10)
     dataset = dataset.apply(optimization.assert_next(["MemoryCacheImpl"]))
@@ -263,7 +275,8 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
     dataset = dataset.cache()
 
     options = dataset_ops.Options()
-    options.experimental_noop_elimination = True
+    options.experimental_optimization = OptimizationOptions()
+    options.experimental_optimization.noop_elimination = True
     dataset = dataset.with_options(options)
 
     multi_device_iterator = multi_device_iterator_ops.MultiDeviceIterator(
@@ -272,13 +285,13 @@ class MultiDeviceIteratorTest(test_base.DatasetTestBase):
 
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
     with self.test_session(config=config) as sess:
-      sess.run(multi_device_iterator.initializer)
+      self.evaluate(multi_device_iterator.initializer)
       for i in range(0, 10, 2):
-        self.assertEqual(i, sess.run(elem_on_1))
-        self.assertEqual(i + 1, sess.run(elem_on_2))
+        self.assertEqual(i, self.evaluate(elem_on_1))
+        self.assertEqual(i + 1, self.evaluate(elem_on_2))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(elem_on_1)
-        sess.run(elem_on_2)
+        self.evaluate(elem_on_1)
+        self.evaluate(elem_on_2)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/kernel_tests/optional_ops_test.py b/tensorflow/python/data/kernel_tests/optional_test.py
similarity index 64%
rename from tensorflow/python/data/kernel_tests/optional_ops_test.py
rename to tensorflow/python/data/kernel_tests/optional_test.py
index 604e3ad88ec96233771b475705ecac016ac6978c..c2c62e9423e6e082fd6fc42668e2827cc06246e1 100644
--- a/tensorflow/python/data/kernel_tests/optional_ops_test.py
+++ b/tensorflow/python/data/kernel_tests/optional_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the Optional data type wrapper."""
+"""Tests for `tf.data.Optional`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -33,18 +33,18 @@ from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
 
-  @test_util.run_in_graph_and_eager_modes
   def testFromValue(self):
     opt = optional_ops.Optional.from_value(constant_op.constant(37.0))
     self.assertTrue(self.evaluate(opt.has_value()))
     self.assertEqual(37.0, self.evaluate(opt.get_value()))
 
-  @test_util.run_in_graph_and_eager_modes
   def testFromStructuredValue(self):
     opt = optional_ops.Optional.from_value({
         "a": constant_op.constant(37.0),
@@ -56,7 +56,6 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
         "b": ([b"Foo"], b"Bar")
     }, self.evaluate(opt.get_value()))
 
-  @test_util.run_in_graph_and_eager_modes
   def testFromSparseTensor(self):
     st_0 = sparse_tensor.SparseTensorValue(
         indices=np.array([[0]]),
@@ -75,7 +74,7 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
       self.assertAllEqual(expected.dense_shape,
                           self.evaluate(actual.dense_shape))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testFromNone(self):
     value_structure = structure.TensorStructure(dtypes.float32, [])
     opt = optional_ops.Optional.none_from_structure(value_structure)
@@ -90,7 +89,90 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(opt.get_value())
 
-  @test_util.run_in_graph_and_eager_modes
+  def testAddN(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        # With value
+        opt1 = optional_ops.Optional.from_value((1.0, 2.0))
+        opt2 = optional_ops.Optional.from_value((3.0, 4.0))
+
+        add_tensor = math_ops.add_n([opt1._variant_tensor,
+                                     opt2._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor, opt1.value_structure)
+        self.assertAllEqual(self.evaluate(add_opt.get_value()), (4.0, 6.0))
+
+        # Without value
+        opt_none1 = optional_ops.Optional.none_from_structure(
+            opt1.value_structure)
+        opt_none2 = optional_ops.Optional.none_from_structure(
+            opt2.value_structure)
+        add_tensor = math_ops.add_n([opt_none1._variant_tensor,
+                                     opt_none2._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor,
+                                             opt_none1.value_structure)
+        self.assertFalse(self.evaluate(add_opt.has_value()))
+
+  def testNestedAddN(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        opt1 = optional_ops.Optional.from_value([1, 2.0])
+        opt2 = optional_ops.Optional.from_value([3, 4.0])
+        opt3 = optional_ops.Optional.from_value((5.0, opt1._variant_tensor))
+        opt4 = optional_ops.Optional.from_value((6.0, opt2._variant_tensor))
+
+        add_tensor = math_ops.add_n([opt3._variant_tensor,
+                                     opt4._variant_tensor])
+        add_opt = optional_ops._OptionalImpl(add_tensor, opt3.value_structure)
+        self.assertEqual(self.evaluate(add_opt.get_value()[0]), 11.0)
+
+        inner_add_opt = optional_ops._OptionalImpl(add_opt.get_value()[1],
+                                                   opt1.value_structure)
+        self.assertAllEqual(inner_add_opt.get_value(), [4, 6.0])
+
+  def testZerosLike(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        # With value
+        opt = optional_ops.Optional.from_value((1.0, 2.0))
+        zeros_tensor = array_ops.zeros_like(opt._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt.value_structure)
+        self.assertAllEqual(self.evaluate(zeros_opt.get_value()),
+                            (0.0, 0.0))
+
+        # Without value
+        opt_none = optional_ops.Optional.none_from_structure(
+            opt.value_structure)
+        zeros_tensor = array_ops.zeros_like(opt_none._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt_none.value_structure)
+        self.assertFalse(self.evaluate(zeros_opt.has_value()))
+
+  def testNestedZerosLike(self):
+    devices = ["/cpu:0"]
+    if test_util.is_gpu_available():
+      devices.append("/gpu:0")
+    for device in devices:
+      with ops.device(device):
+        opt1 = optional_ops.Optional.from_value(1.0)
+        opt2 = optional_ops.Optional.from_value(opt1._variant_tensor)
+
+        zeros_tensor = array_ops.zeros_like(opt2._variant_tensor)
+        zeros_opt = optional_ops._OptionalImpl(zeros_tensor,
+                                               opt2.value_structure)
+        inner_zeros_opt = optional_ops._OptionalImpl(zeros_opt.get_value(),
+                                                     opt1.value_structure)
+        self.assertEqual(self.evaluate(inner_zeros_opt.get_value()), 0.0)
+
   def testCopyToGPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
@@ -120,6 +202,41 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
                      self.evaluate(gpu_optional_with_value_values))
     self.assertFalse(self.evaluate(gpu_optional_none_has_value))
 
+  def testNestedCopyToGPU(self):
+    if not test_util.is_gpu_available():
+      self.skipTest("No GPU available")
+
+    with ops.device("/cpu:0"):
+      optional_with_value = optional_ops.Optional.from_value(
+          (constant_op.constant(37.0), constant_op.constant("Foo"),
+           constant_op.constant(42)))
+      optional_none = optional_ops.Optional.none_from_structure(
+          structure.TensorStructure(dtypes.float32, []))
+      nested_optional = optional_ops.Optional.from_value(
+          (optional_with_value._variant_tensor, optional_none._variant_tensor,
+           1.0))
+
+    with ops.device("/gpu:0"):
+      gpu_nested_optional = optional_ops._OptionalImpl(
+          array_ops.identity(nested_optional._variant_tensor),
+          nested_optional.value_structure)
+
+      gpu_nested_optional_has_value = gpu_nested_optional.has_value()
+      gpu_nested_optional_values = gpu_nested_optional.get_value()
+
+    self.assertTrue(self.evaluate(gpu_nested_optional_has_value))
+
+    inner_with_value = optional_ops._OptionalImpl(
+        gpu_nested_optional_values[0], optional_with_value.value_structure)
+
+    inner_none = optional_ops._OptionalImpl(
+        gpu_nested_optional_values[1], optional_none.value_structure)
+
+    self.assertEqual((37.0, b"Foo", 42),
+                     self.evaluate(inner_with_value.get_value()))
+    self.assertFalse(self.evaluate(inner_none.has_value()))
+    self.assertEqual(1.0, self.evaluate(gpu_nested_optional_values[2]))
+
   def _assertElementValueEqual(self, expected, actual):
     if isinstance(expected, dict):
       self.assertItemsEqual(list(expected.keys()), list(actual.keys()))
@@ -151,7 +268,9 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
        optional_ops.OptionalStructure(
            structure.TensorStructure(dtypes.float32, []))),
   )
-  def testOptionalStructure(self, tf_value_fn, expected_value_structure):
+  @test_util.run_deprecated_v1
+  def testSkipEagerOptionalStructure(self, tf_value_fn,
+                                     expected_value_structure):
     tf_value = tf_value_fn()
     opt = optional_ops.Optional.from_value(tf_value)
 
@@ -205,7 +324,9 @@ class OptionalTest(test_base.DatasetTestBase, parameterized.TestCase):
                     indices=[[0, 1], [1, 0]], values=[37.0, 42.0],
                     dense_shape=[2, 2])}, False),
   )
-  def testIteratorGetNextAsOptional(self, np_value, tf_value_fn, works_on_gpu):
+  @test_util.run_deprecated_v1
+  def testSkipEagerIteratorGetNextAsOptional(self, np_value, tf_value_fn,
+                                             works_on_gpu):
     if not works_on_gpu and test.is_gpu_available():
       self.skipTest("Test case not yet supported on GPU.")
     ds = dataset_ops.Dataset.from_tensors(np_value).repeat(3)
diff --git a/tensorflow/python/data/kernel_tests/padded_batch_test.py b/tensorflow/python/data/kernel_tests/padded_batch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..dcfb2f507bf1a7d91041eb5f24c95c6de2c18362
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/padded_batch_test.py
@@ -0,0 +1,243 @@
+# -*- coding: utf-8 -*-
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.padded_batch()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+def _random_seq_lens(count):
+  return np.random.randint(20, size=(count,)).astype(np.int32)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PaddedBatchTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('default_padding', _random_seq_lens(32), 4, [-1], False),
+      ('constant_padding', _random_seq_lens(32), 4, [25], False),
+      ('uneven_with_remainder', _random_seq_lens(34), 4, [-1], False),
+      ('uneven_without_remainder', _random_seq_lens(34), 4, [-1], True),
+  )
+  def testPaddedBatchDataset(self, seq_lens, batch_size, padded_shapes,
+                             drop_remainder):
+    """Tests the padded batch dataset logic for various input configurations.
+
+    Args:
+      seq_lens: the input sequence lengths
+      batch_size: the batch size
+      padded_shapes: the padded shapes to use
+      drop_remainder: whether a smaller batch size should be produced if batch
+        size does not divide number of inputs evenly
+    """
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(seq_lens).map(
+        lambda x: array_ops.fill([x], x)).padded_batch(
+            batch_size=batch_size,
+            drop_remainder=drop_remainder,
+            padded_shapes=padded_shapes)
+
+    num_full_batches = len(seq_lens) // batch_size
+    get_next = self.getNext(dataset)
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      padded_len = padded_shapes[0]
+      if padded_len is None or padded_len == -1:
+        padded_len = np.max(result) if result.size > 0 else 0
+      self.assertEqual((batch_size, padded_len), result.shape)
+      for j in range(batch_size):
+        seq_len = seq_lens[(i * batch_size) + j]
+        self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
+
+    if not drop_remainder and len(seq_lens) % batch_size > 0:
+      result = self.evaluate(get_next())
+      padded_len = np.max(result) if result.size > 0 else 0
+      self.assertEqual((len(seq_lens) % batch_size, padded_len), result.shape)
+      for j in range(len(seq_lens) % batch_size):
+        seq_len = seq_lens[num_full_batches * batch_size + j]
+        self.assertAllEqual(result[j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[j, seq_len:], [0] * (padded_len - seq_len))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @test_util.run_deprecated_v1
+  def testPaddedBatchShortPadding(self):
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(
+            [6, 5, 5, 5, 5]).map(lambda x: array_ops.fill([x], x)).padded_batch(
+                batch_size=4, padded_shapes=[5]))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.DataLossError, ''))
+
+  def testPaddedBatchEmptyTensors(self):
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(
+            [0, 0, 0, 0]).map(lambda x: array_ops.fill([x], x)).padded_batch(
+                batch_size=4, padded_shapes=[-1]))
+    self.assertDatasetProduces(dataset, expected_output=[[[], [], [], []]])
+
+  def testPaddedBatchDatasetNonDefaultPadding(self):
+
+    def fill_tuple(x):
+      filled = array_ops.fill([x], x)
+      return (filled, string_ops.as_string(filled))
+
+    random_seq_lens = np.random.randint(20, size=(32,)).astype(np.int32)
+    dataset = (
+        dataset_ops.Dataset.from_tensor_slices(random_seq_lens).map(fill_tuple)
+        .padded_batch(
+            4, padded_shapes=([-1], [-1]), padding_values=(-1, '<end>')))
+
+    get_next = self.getNext(dataset)
+    for i in range(8):
+      result = self.evaluate(get_next())
+      padded_len = np.max(result[0])
+      self.assertEqual((4, padded_len), result[0].shape)
+      self.assertEqual((4, padded_len), result[1].shape)
+      for j in range(4):
+        seq_len = random_seq_lens[(i * 4) + j]
+        self.assertAllEqual(result[0][j, :seq_len], [seq_len] * seq_len)
+        self.assertAllEqual(result[0][j, seq_len:],
+                            [-1] * (padded_len - seq_len))
+        self.assertAllEqual(result[1][j, :seq_len],
+                            [compat.as_bytes(str(seq_len))] * seq_len)
+        self.assertAllEqual(result[1][j, seq_len:],
+                            [b'<end>'] * (padded_len - seq_len))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testPaddedBatchDatasetUnicode(self):
+    # See GitHub issue 16149
+    def generator():
+      data = [[u'Простой', u'тест', u'юникода'],
+              [u'никогда', u'не', u'бывает', u'простым']]
+
+      for seq in data:
+        yield seq, [0, 1, 2, 3]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, (dtypes.string, dtypes.int32),
+        (tensor_shape.TensorShape([None]), tensor_shape.TensorShape([None])))
+    padded_dataset = dataset.padded_batch(
+        2, padded_shapes=([None], [None]), padding_values=('', 0))
+    next_element = self.getNext(padded_dataset)
+    self.evaluate(next_element())
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerPaddedBatchDatasetShapeSpecifications(self):
+    int_placeholder = array_ops.placeholder(dtypes.int32)
+    float_placeholder = array_ops.placeholder(dtypes.float32)
+    string_placeholder = array_ops.placeholder(dtypes.string)
+    input_dataset = dataset_ops.Dataset.from_tensors(
+        (int_placeholder, float_placeholder, string_placeholder))
+
+    # Test different ways of specifying the `padded_shapes` argument.
+    dynamic_padding_from_tensor_shapes = input_dataset.padded_batch(
+        32,
+        padded_shapes=(tensor_shape.TensorShape([None]),
+                       tensor_shape.TensorShape([None, None]),
+                       tensor_shape.TensorShape([37])))
+    dynamic_padding_from_lists = input_dataset.padded_batch(
+        32, padded_shapes=([None], [None, None], [37]))
+    dynamic_padding_from_lists_with_minus_one = input_dataset.padded_batch(
+        32, padded_shapes=([-1], [-1, -1], [37]))
+    dynamic_padding_from_tensors = input_dataset.padded_batch(
+        32,
+        padded_shapes=(constant_op.constant([-1], dtype=dtypes.int64),
+                       constant_op.constant([-1, -1], dtype=dtypes.int64),
+                       constant_op.constant([37], dtype=dtypes.int64)))
+
+    for dataset in [
+        dynamic_padding_from_tensor_shapes, dynamic_padding_from_lists,
+        dynamic_padding_from_lists_with_minus_one, dynamic_padding_from_tensors
+    ]:
+      self.assertEqual([None, None], dataset.output_shapes[0].as_list())
+      self.assertEqual([None, None, None], dataset.output_shapes[1].as_list())
+      self.assertEqual([None, 37], dataset.output_shapes[2].as_list())
+
+  def testPaddedBatchSparseError(self):
+
+    def _map_fn(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0, 0]], values=(i * [1]), dense_shape=[1, 1]), i
+
+    with self.assertRaises(TypeError):
+      _ = dataset_ops.Dataset.range(10).map(_map_fn).padded_batch(10)
+
+  def testPaddedBatchShapeError(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      _ = dataset_ops.Dataset.range(10).padded_batch(5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(3,\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[1])
+
+    with self.assertRaisesRegexp(
+        ValueError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its shape was \(2, 2\).'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=[[1, 1], [1, 1]])
+
+    with self.assertRaisesRegexp(
+        TypeError, r'Padded shape .* must be a 1-D tensor '
+        r'of tf.int64 values, but its element type was float32.'):
+      _ = dataset_ops.Dataset.from_tensors([1, 2, 3]).padded_batch(
+          5, padded_shapes=constant_op.constant([1., 2., 3.]))
+
+    with self.assertRaisesRegexp(
+        ValueError, r'The padded shape \(1,\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = constant_op.constant([1], dtype=dtypes.int64)
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerPaddedBatchShapeError(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'The padded shape \((\?|None), (\?|None)\) is not compatible with the '
+        r'corresponding input component shape \(\).'):
+      shape_as_tensor = array_ops.placeholder(dtypes.int64, shape=[2])
+      _ = dataset_ops.Dataset.range(10).padded_batch(
+          5, padded_shapes=shape_as_tensor)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py b/tensorflow/python/data/kernel_tests/prefetch_test.py
similarity index 52%
rename from tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/prefetch_test.py
index 76e2697b29d368f5607c827fe32d017fbefd5ecd..a143ba0ac63d42667faa4cfdee6fa74cf0a82f57 100644
--- a/tensorflow/python/data/kernel_tests/prefetch_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/prefetch_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Test PrefetchDataset."""
+"""Tests for `tf.data.Dataset.prefetch()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -21,40 +21,24 @@ from absl.testing import parameterized
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class PrefetchDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class PrefetchTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   @parameterized.parameters((-1), (0), (5))
   def testBufferSize(self, buffer_size):
-    buffer_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(10).prefetch(
-        buffer_size=buffer_size_t).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={buffer_size_t: buffer_size})
-      for m in range(10):
-        self.assertEqual(m, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
+    dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
+    self.assertDatasetProduces(dataset, expected_output=range(10))
 
   @parameterized.parameters((-2), (-42))
   def testInvalidBufferSize(self, buffer_size):
-    buffer_size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = dataset_ops.Dataset.range(10).prefetch(
-        buffer_size=buffer_size_t).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.assertRaisesRegexp(errors.InvalidArgumentError, "buffer_size"):
-      with self.cached_session() as sess:
-        sess.run(init_op, feed_dict={buffer_size_t: buffer_size})
-
+    dataset = dataset_ops.Dataset.range(10).prefetch(buffer_size=buffer_size)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, "buffer_size"))
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/range_test.py b/tensorflow/python/data/kernel_tests/range_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f5d25e7f3959eed70754db827052a91fd224dbc
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/range_test.py
@@ -0,0 +1,72 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.range()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RangeTest(test_base.DatasetTestBase):
+
+  def testStop(self):
+    dataset = dataset_ops.Dataset.range(5)
+    self.assertDatasetProduces(dataset, expected_output=range(5))
+
+  def testStartStop(self):
+    start, stop = 2, 5
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 5))
+
+  def testStartStopStep(self):
+    start, stop, step = 2, 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, 2))
+
+  def testZeroStep(self):
+    start, stop, step = 2, 10, 0
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
+
+  def testNegativeStep(self):
+    start, stop, step = 2, 10, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(2, 10, -1))
+
+  def testStopLessThanStart(self):
+    start, stop = 10, 2
+    dataset = dataset_ops.Dataset.range(start, stop)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2))
+
+  def testStopLessThanStartWithPositiveStep(self):
+    start, stop, step = 10, 2, 2
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, 2))
+
+  def testStopLessThanStartWithNegativeStep(self):
+    start, stop, step = 10, 2, -1
+    dataset = dataset_ops.Dataset.range(start, stop, step)
+    self.assertDatasetProduces(dataset, expected_output=range(10, 2, -1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py b/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
deleted file mode 100644
index 4fef4f30bf95e97f3dea491d9a2f69b120b7b8e1..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/reader_dataset_ops_test.py
+++ /dev/null
@@ -1,846 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import gzip
-import os
-import zlib
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.ops import readers
-from tensorflow.python.eager import context
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.lib.io import python_io
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gen_dataset_ops
-from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.platform import test
-from tensorflow.python.util import compat
-
-
-try:
-  import psutil  # pylint: disable=g-import-not-at-top
-  psutil_import_succeeded = True
-except ImportError:
-  psutil_import_succeeded = False
-
-
-class TextLineDatasetTest(test_base.DatasetTestBase):
-
-  def _lineText(self, f, l):
-    return compat.as_bytes("%d: %d" % (f, l))
-
-  def _createFiles(self,
-                   num_files,
-                   num_lines,
-                   crlf=False,
-                   compression_type=None):
-    filenames = []
-    for i in range(num_files):
-      fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
-      filenames.append(fn)
-      contents = []
-      for j in range(num_lines):
-        contents.append(self._lineText(i, j))
-        # Always include a newline after the record unless it is
-        # at the end of the file, in which case we include it
-        if j + 1 != num_lines or i == 0:
-          contents.append(b"\r\n" if crlf else b"\n")
-      contents = b"".join(contents)
-
-      if not compression_type:
-        with open(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "GZIP":
-        with gzip.GzipFile(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "ZLIB":
-        contents = zlib.compress(contents)
-        with open(fn, "wb") as f:
-          f.write(contents)
-      else:
-        raise ValueError("Unsupported compression_type", compression_type)
-
-    return filenames
-
-  def _testTextLineDataset(self, compression_type=None):
-    test_filenames = self._createFiles(
-        2, 5, crlf=True, compression_type=compression_type)
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TextLineDataset(
-        filenames, compression_type=compression_type).repeat(num_epochs)
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(5):
-        self.assertEqual(self._lineText(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(2):
-          for i in range(5):
-            self.assertEqual(self._lineText(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={filenames: test_filenames,
-                     num_epochs: 10,
-                     batch_size: 5})
-      for _ in range(10):
-        self.assertAllEqual([self._lineText(0, i) for i in range(5)],
-                            sess.run(get_next))
-        self.assertAllEqual([self._lineText(1, i) for i in range(5)],
-                            sess.run(get_next))
-
-  def testTextLineDatasetNoCompression(self):
-    self._testTextLineDataset()
-
-  def testTextLineDatasetGzipCompression(self):
-    self._testTextLineDataset(compression_type="GZIP")
-
-  def testTextLineDatasetZlibCompression(self):
-    self._testTextLineDataset(compression_type="ZLIB")
-
-  def testTextLineDatasetBuffering(self):
-    test_filenames = self._createFiles(2, 5, crlf=True)
-
-    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
-    iterator = repeat_dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      for j in range(2):
-        for i in range(5):
-          self.assertEqual(self._lineText(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testIteratorResourceCleanup(self):
-    filename = os.path.join(self.get_temp_dir(), "text.txt")
-    with open(filename, "wt") as f:
-      for i in range(3):
-        f.write("%d\n" % (i,))
-    with context.eager_mode():
-      first_iterator = iter(readers.TextLineDataset(filename))
-      self.assertEqual(b"0", next(first_iterator).numpy())
-      second_iterator = iter(readers.TextLineDataset(filename))
-      self.assertEqual(b"0", next(second_iterator).numpy())
-      # Eager kernel caching is based on op attributes, which includes the
-      # Dataset's output shape. Create a different kernel to test that they
-      # don't create resources with the same names.
-      different_kernel_iterator = iter(
-          readers.TextLineDataset(filename).repeat().batch(16))
-      self.assertEqual([16], next(different_kernel_iterator).shape)
-      # Remove our references to the Python Iterator objects, which (assuming no
-      # reference cycles) is enough to trigger DestroyResourceOp and close the
-      # partially-read files.
-      del first_iterator
-      del second_iterator
-      del different_kernel_iterator
-      if not psutil_import_succeeded:
-        self.skipTest(
-            "psutil is required to check that we've closed our files.")
-      open_files = psutil.Process().open_files()
-      self.assertNotIn(filename, [open_file.path for open_file in open_files])
-
-
-class FixedLengthRecordReaderTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    super(FixedLengthRecordReaderTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-    self._header_bytes = 5
-    self._record_bytes = 3
-    self._footer_bytes = 2
-
-  def _record(self, f, r):
-    return compat.as_bytes(str(f * 2 + r) * self._record_bytes)
-
-  def _createFiles(self, compression_type=None):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "fixed_length_record.%d.txt" % i)
-      filenames.append(fn)
-
-      contents = []
-      contents.append(b"H" * self._header_bytes)
-      for j in range(self._num_records):
-        contents.append(self._record(i, j))
-      contents.append(b"F" * self._footer_bytes)
-      contents = b"".join(contents)
-
-      if not compression_type:
-        with open(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "GZIP":
-        with gzip.GzipFile(fn, "wb") as f:
-          f.write(contents)
-      elif compression_type == "ZLIB":
-        contents = zlib.compress(contents)
-        with open(fn, "wb") as f:
-          f.write(contents)
-      else:
-        raise ValueError("Unsupported compression_type", compression_type)
-
-    return filenames
-
-  def _testFixedLengthRecordDataset(self, compression_type=None):
-    test_filenames = self._createFiles(compression_type=compression_type)
-    filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    num_epochs = array_ops.placeholder(dtypes.int64, shape=[])
-    batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (
-        readers.FixedLengthRecordDataset(
-            filenames,
-            self._record_bytes,
-            self._header_bytes,
-            self._footer_bytes,
-            compression_type=compression_type).repeat(num_epochs))
-    batch_dataset = repeat_dataset.batch(batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    init_op = iterator.make_initializer(repeat_dataset)
-    init_batch_op = iterator.make_initializer(batch_dataset)
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[0]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(0, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          init_op, feed_dict={filenames: [test_filenames[1]],
-                              num_epochs: 1})
-      for i in range(self._num_records):
-        self.assertEqual(self._record(1, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Basic test: read from both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test repeated iteration through both files.
-      sess.run(init_op, feed_dict={filenames: test_filenames, num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertEqual(self._record(j, i), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test batched and repeated iteration through both files.
-      sess.run(
-          init_batch_op,
-          feed_dict={
-              filenames: test_filenames,
-              num_epochs: 10,
-              batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)],
-              sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testFixedLengthRecordDatasetNoCompression(self):
-    self._testFixedLengthRecordDataset()
-
-  def testFixedLengthRecordDatasetGzipCompression(self):
-    self._testFixedLengthRecordDataset(compression_type="GZIP")
-
-  def testFixedLengthRecordDatasetZlibCompression(self):
-    self._testFixedLengthRecordDataset(compression_type="ZLIB")
-
-  def testFixedLengthRecordDatasetBuffering(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes,
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertEqual(self._record(j, i), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
-  def testFixedLengthRecordDatasetWrongSize(self):
-    test_filenames = self._createFiles()
-    dataset = readers.FixedLengthRecordDataset(
-        test_filenames,
-        self._record_bytes + 1,  # Incorrect record length.
-        self._header_bytes,
-        self._footer_bytes,
-        buffer_size=10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"Excluding the header \(5 bytes\) and footer \(2 bytes\), input "
-          r"file \".*fixed_length_record.0.txt\" has body length 21 bytes, "
-          r"which is not an exact multiple of the record length \(4 bytes\)."):
-        sess.run(iterator.get_next())
-
-  def _iterator_checkpoint_path(self):
-    return os.path.join(self.get_temp_dir(), "iterator")
-
-  def _save_op(self, iterator_resource):
-    iterator_state_variant = gen_dataset_ops.serialize_iterator(
-        iterator_resource)
-    save_op = io_ops.write_file(
-        self._iterator_checkpoint_path(),
-        parsing_ops.serialize_tensor(iterator_state_variant))
-    return save_op
-
-  def _restore_op(self, iterator_resource):
-    iterator_state_variant = parsing_ops.parse_tensor(
-        io_ops.read_file(self._iterator_checkpoint_path()), dtypes.variant)
-    restore_op = gen_dataset_ops.deserialize_iterator(iterator_resource,
-                                                      iterator_state_variant)
-    return restore_op
-
-  def _build_iterator_graph(self, num_epochs):
-    filenames = self._createFiles()
-    dataset = (readers.FixedLengthRecordDataset(
-        filenames, self._record_bytes, self._header_bytes, self._footer_bytes)
-               .repeat(num_epochs))
-    iterator = dataset.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next_op = iterator.get_next()
-    save_op = self._save_op(iterator._iterator_resource)
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return init_op, get_next_op, save_op, restore_op
-
-  def _restore_iterator(self):
-    output_types = dtypes.string
-    output_shapes = tensor_shape.scalar()
-    iterator = iterator_ops.Iterator.from_structure(output_types, output_shapes)
-    get_next = iterator.get_next()
-    restore_op = self._restore_op(iterator._iterator_resource)
-    return restore_op, get_next
-
-  def testSaveRestore(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testInitThenRestore(self):
-    # Note: Calling init_op before restore_op is redundant. This test just makes
-    # sure we do not fail if restore is called on an already initialized
-    # iterator resource.
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreInModifiedGraph(self):
-    num_epochs = 10
-    num_epochs_1 = 20
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs_1)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreWithoutBuildingDatasetGraph(self):
-    num_epochs = 10
-    epoch_break = 5
-    file_break = self._num_files // 2
-    record_break = self._num_records // 2
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch == epoch_break and f == file_break and
-                  r == record_break):
-                sess.run(save_op)
-                break
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-            else:
-              continue
-            break
-          else:
-            continue
-          break
-        else:
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(get_next_op)
-
-    with ops.Graph().as_default() as g:
-      restore_op, get_next_op = self._restore_iterator()
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for epoch in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              if (epoch < epoch_break or
-                  (epoch == epoch_break and f < file_break) or
-                  (epoch == epoch_break and f == file_break and
-                   r < record_break)):
-                continue
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreUnusedIterator(self):
-    num_epochs = 10
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        # Save unused iterator.
-        sess.run(save_op)
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        for _ in range(num_epochs * self._num_files * self._num_records):
-          sess.run(get_next_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-  def testRestoreExhaustedIterator(self):
-    num_epochs = 10
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(init_op)
-        # Note: There is no checkpoint saved currently so a NotFoundError is
-        # raised.
-        with self.assertRaises(errors.NotFoundError):
-          sess.run(restore_op)
-        for _ in range(num_epochs):
-          for f in range(self._num_files):
-            for r in range(self._num_records):
-              self.assertEqual(self._record(f, r), sess.run(get_next_op))
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-        sess.run(save_op)
-
-    with ops.Graph().as_default() as g:
-      init_op, get_next_op, save_op, restore_op = self._build_iterator_graph(
-          num_epochs=num_epochs)
-      with self.session(graph=g) as sess:
-        sess.run(restore_op)
-        with self.assertRaises(errors.OutOfRangeError):
-          sess.run(get_next_op)
-
-
-class TFRecordDatasetTest(test_base.DatasetTestBase):
-
-  def setUp(self):
-    super(TFRecordDatasetTest, self).setUp()
-    self._num_files = 2
-    self._num_records = 7
-
-    self.test_filenames = self._createFiles()
-
-    self.filenames = array_ops.placeholder(dtypes.string, shape=[None])
-    self.num_epochs = array_ops.placeholder_with_default(
-        constant_op.constant(1, dtypes.int64), shape=[])
-    self.compression_type = array_ops.placeholder_with_default("", shape=[])
-    self.batch_size = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = readers.TFRecordDataset(self.filenames,
-                                             self.compression_type).repeat(
-                                                 self.num_epochs)
-    batch_dataset = repeat_dataset.batch(self.batch_size)
-
-    iterator = iterator_ops.Iterator.from_structure(batch_dataset.output_types)
-    self.init_op = iterator.make_initializer(repeat_dataset)
-    self.init_batch_op = iterator.make_initializer(batch_dataset)
-    self.get_next = iterator.get_next()
-
-  def _record(self, f, r):
-    return compat.as_bytes("Record %d of file %d" % (r, f))
-
-  def _createFiles(self):
-    filenames = []
-    for i in range(self._num_files):
-      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
-      filenames.append(fn)
-      writer = python_io.TFRecordWriter(fn)
-      for j in range(self._num_records):
-        writer.write(self._record(i, j))
-      writer.close()
-    return filenames
-
-  def testReadOneEpoch(self):
-    with self.cached_session() as sess:
-      # Basic test: read from file 0.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[0]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(0, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from file 1.
-      sess.run(
-          self.init_op,
-          feed_dict={
-              self.filenames: [self.test_filenames[1]],
-              self.num_epochs: 1
-          })
-      for i in range(self._num_records):
-        self.assertAllEqual(self._record(1, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-      # Basic test: read from both files.
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 1})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochs(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: self.test_filenames,
-                     self.num_epochs: 10})
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadTenEpochsOfBatches(self):
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_batch_op,
-          feed_dict={
-              self.filenames: self.test_filenames,
-              self.num_epochs: 10,
-              self.batch_size: self._num_records
-          })
-      for _ in range(10):
-        for j in range(self._num_files):
-          values = sess.run(self.get_next)
-          self.assertAllEqual(
-              [self._record(j, i) for i in range(self._num_records)], values)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadZlibFiles(self):
-    zlib_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        cdata = zlib.compress(f.read())
-
-        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
-        with open(zfn, "wb") as f:
-          f.write(cdata)
-        zlib_files.append(zfn)
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: zlib_files,
-                     self.compression_type: "ZLIB"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadGzipFiles(self):
-    gzip_files = []
-    for i, fn in enumerate(self.test_filenames):
-      with open(fn, "rb") as f:
-        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
-        with gzip.GzipFile(gzfn, "wb") as gzf:
-          gzf.write(f.read())
-        gzip_files.append(gzfn)
-
-    with self.cached_session() as sess:
-      sess.run(
-          self.init_op,
-          feed_dict={self.filenames: gzip_files,
-                     self.compression_type: "GZIP"})
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(self.get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(self.get_next)
-
-  def testReadWithBuffer(self):
-    one_mebibyte = 2**20
-    d = readers.TFRecordDataset(self.test_filenames, buffer_size=one_mebibyte)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testReadFromDatasetOfFiles(self):
-    files = dataset_ops.Dataset.from_tensor_slices(self.test_filenames)
-    d = readers.TFRecordDataset(files)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    with self.cached_session() as sess:
-      for j in range(self._num_files):
-        for i in range(self._num_records):
-          self.assertAllEqual(self._record(j, i), sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testReadTenEpochsFromDatasetOfFilesInParallel(self):
-    files = dataset_ops.Dataset.from_tensor_slices(
-        self.test_filenames).repeat(10)
-    d = readers.TFRecordDataset(files, num_parallel_reads=4)
-    iterator = d.make_one_shot_iterator()
-    next_element = iterator.get_next()
-    expected = []
-    actual = []
-    with self.cached_session() as sess:
-      for _ in range(10):
-        for j in range(self._num_files):
-          for i in range(self._num_records):
-            expected.append(self._record(j, i))
-            actual.append(sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-      self.assertEqual(sorted(expected), sorted(actual))
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py b/tensorflow/python/data/kernel_tests/reduce_test.py
similarity index 72%
rename from tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/reduce_test.py
index 11e07300b9716d60d0d96587018dd63dce3f9d24..14bbc0bf72caa07445ca7d077845e2bc4569cc01 100644
--- a/tensorflow/python/data/kernel_tests/reduce_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/reduce_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.reduce()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -22,21 +22,24 @@ import numpy as np
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
-class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class ReduceTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   def testSum(self):
     for i in range(10):
       ds = dataset_ops.Dataset.range(1, i + 1)
-      result = ds.reduce(np.int64(0), lambda x, y: x + y)
-      with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i) // 2, sess.run(result))
+      result = ds.reduce(
+          constant_op.constant(0, dtype=dtypes.int64), lambda x, y: x + y)
+      self.assertEqual(((i + 1) * i) // 2, self.evaluate(result))
 
   def testSumTuple(self):
 
@@ -47,9 +50,8 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     for i in range(10):
       ds = dataset_ops.Dataset.range(1, i + 1)
       ds = dataset_ops.Dataset.zip((ds, ds))
-      result = ds.reduce(np.int64(0), reduce_fn)
-      with self.cached_session() as sess:
-        self.assertEqual(((i + 1) * i), sess.run(result))
+      result = ds.reduce(constant_op.constant(0, dtype=dtypes.int64), reduce_fn)
+      self.assertEqual(((i + 1) * i), self.evaluate(result))
 
   def testSumAndCount(self):
 
@@ -59,13 +61,15 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
 
     for i in range(10):
       ds = dataset_ops.Dataset.range(1, i + 1)
-      result = ds.reduce((np.int64(0), np.int64(0)), reduce_fn)
-      with self.cached_session() as sess:
-        s, c = sess.run(result)
-        self.assertEqual(((i + 1) * i) // 2, s)
-        self.assertEqual(i, c)
-
-  def testSquareUsingPlaceholder(self):
+      result = ds.reduce((constant_op.constant(0, dtype=dtypes.int64),
+                          constant_op.constant(0, dtype=dtypes.int64)),
+                         reduce_fn)
+      s, c = self.evaluate(result)
+      self.assertEqual(((i + 1) * i) // 2, s)
+      self.assertEqual(i, c)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSquareUsingPlaceholder(self):
     delta = array_ops.placeholder(dtype=dtypes.int64)
 
     def reduce_fn(state, _):
@@ -92,8 +96,7 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     for i in range(10):
       ds = dataset_ops.Dataset.from_tensors(make_sparse_fn(i+1))
       result = ds.reduce(make_sparse_fn(0), reduce_fn)
-      with self.cached_session() as sess:
-        self.assertSparseValuesEqual(make_sparse_fn(i+1), sess.run(result))
+      self.assertSparseValuesEqual(make_sparse_fn(i + 1), self.evaluate(result))
 
   def testNested(self):
 
@@ -115,10 +118,10 @@ class ReduceDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
     for i in range(10):
       ds = dataset_ops.Dataset.range(1, i + 1).map(map_fn)
       result = ds.reduce(map_fn(0), reduce_fn)
-      with self.cached_session() as sess:
-        result = sess.run(result)
-        self.assertEqual(((i + 1) * i) // 2, result["dense"])
-        self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
+      result = self.evaluate(result)
+      self.assertEqual(((i + 1) * i) // 2, result["dense"])
+      self.assertSparseValuesEqual(make_sparse_fn(i), result["sparse"])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/repeat_test.py b/tensorflow/python/data/kernel_tests/repeat_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ef2fc1bfc8fb139cb855305f4e4f2ec70221ce2
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/repeat_test.py
@@ -0,0 +1,84 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.repeat()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RepeatTest(test_base.DatasetTestBase):
+
+  def testRepeatTensorDataset(self):
+    """Test a dataset that repeats its input multiple times."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+    # This placeholder can be fed when dataset-definition subgraph
+    # runs (i.e. `init_op` below) to configure the number of
+    # repetitions used in a particular iterator.
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensors(components).repeat(count)
+      self.assertEqual([c.shape for c in components],
+                       [shape for shape in dataset.output_shapes])
+      self.assertDatasetProduces(dataset, [components] * count)
+
+    # Test a finite repetition.
+    do_test(3)
+
+    # test a different finite repetition.
+    do_test(7)
+
+    # Test an empty repetition.
+    do_test(0)
+
+    # Test an infinite repetition.
+    # NOTE(mrry): There's not a good way to test that the sequence
+    # actually is infinite.
+    dataset = dataset_ops.Dataset.from_tensors(components).repeat(-1)
+    self.assertEqual([c.shape for c in components],
+                     [shape for shape in dataset.output_shapes])
+    get_next = self.getNext(dataset)
+    for _ in range(17):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(components, results):
+        self.assertAllEqual(component, result_component)
+
+  def testRepeatRepeatTensorDataset(self):
+    """Test the composition of repeat datasets."""
+    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
+    inner_count, outer_count = 7, 14
+
+    dataset = dataset_ops.Dataset.from_tensors(components).repeat(
+        inner_count).repeat(outer_count)
+    self.assertEqual([c.shape for c in components],
+                     [shape for shape in dataset.output_shapes])
+    self.assertDatasetProduces(dataset,
+                               [components] * (inner_count * outer_count))
+
+  def testRepeatEmptyDataset(self):
+    """Test that repeating an empty dataset does not hang."""
+    dataset = dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10).repeat(-1)
+    self.assertDatasetProduces(dataset, [])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py b/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
deleted file mode 100644
index e86356dee7c63e062c9dfe945246a0461c3e6526..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/sequence_dataset_op_test.py
+++ /dev/null
@@ -1,210 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class SequenceDatasetTest(test_base.DatasetTestBase):
-
-  def testRepeatTensorDataset(self):
-    """Test a dataset that repeats its input multiple times."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    # This placeholder can be fed when dataset-definition subgraph
-    # runs (i.e. `init_op` below) to configure the number of
-    # repetitions used in a particular iterator.
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components)
-                .repeat(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Test a finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 3})
-      for _ in range(3):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test a different finite repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 7})
-      for _ in range(7):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an empty repetition.
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Test an infinite repetition.
-      # NOTE(mrry): There's not a good way to test that the sequence
-      # actually is infinite.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for _ in range(17):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-
-  def testTakeTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .take(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Take fewer than input size
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take more than input size
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take all of input
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      for i in range(10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Take nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSkipTensorDataset(self):
-    components = (np.arange(10),)
-    count_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components)
-                .skip(count_placeholder).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape[1:] for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      # Skip fewer than input size, we should skip
-      # the first 4 elements and then read the rest.
-      sess.run(init_op, feed_dict={count_placeholder: 4})
-      for i in range(4, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip more than input size: get nothing.
-      sess.run(init_op, feed_dict={count_placeholder: 25})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip exactly input size.
-      sess.run(init_op, feed_dict={count_placeholder: 10})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Set -1 for 'count': skip the entire dataset.
-      sess.run(init_op, feed_dict={count_placeholder: -1})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Skip nothing
-      sess.run(init_op, feed_dict={count_placeholder: 0})
-      for i in range(0, 10):
-        results = sess.run(get_next)
-        self.assertAllEqual(results, components[0][i:i+1])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatRepeatTensorDataset(self):
-    """Test the composition of repeat datasets."""
-    components = (np.array(1), np.array([1, 2, 3]), np.array(37.0))
-    inner_count = array_ops.placeholder(dtypes.int64, shape=[])
-    outer_count = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = (dataset_ops.Dataset.from_tensors(components).repeat(inner_count)
-                .repeat(outer_count).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([c.shape for c in components],
-                     [t.shape for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(init_op, feed_dict={inner_count: 7, outer_count: 14})
-      for _ in range(7 * 14):
-        results = sess.run(get_next)
-        for component, result_component in zip(components, results):
-          self.assertAllEqual(component, result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testRepeatEmptyDataset(self):
-    """Test that repeating an empty dataset does not hang."""
-    iterator = (dataset_ops.Dataset.from_tensors(0).repeat(10).skip(10)
-                .repeat(-1).make_initializable_iterator())
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shard_test.py
similarity index 52%
rename from tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
rename to tensorflow/python/data/kernel_tests/shard_test.py
index b9f3c79da56ee20ba3cb96392d97352988089f81..928550676d5b05c2e5a459af355acebe2f1f1cc4 100644
--- a/tensorflow/python/data/kernel_tests/shard_dataset_op_test.py
+++ b/tensorflow/python/data/kernel_tests/shard_test.py
@@ -12,50 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for the experimental input pipeline ops."""
+"""Tests for `tf.data.Dataset.shard()`."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
-class ShardDatasetOpTest(test_base.DatasetTestBase):
+@test_util.run_all_in_graph_and_eager_modes
+class ShardTest(test_base.DatasetTestBase):
 
   def testSimpleCase(self):
     dataset = dataset_ops.Dataset.range(10).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual(2, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[2, 7])
 
   def testNestedData(self):
     dataset_a = dataset_ops.Dataset.range(10)
     dataset_b = dataset_ops.Dataset.range(10, 0, -1)
     dataset = dataset_ops.Dataset.zip((dataset_a, dataset_b)).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual((2, 8), sess.run(iterator.get_next()))
-      self.assertEqual((7, 3), sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[(2, 8), (7, 3)])
 
   def testOffsetZero(self):
     dataset = dataset_ops.Dataset.range(10).shard(5, 0)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(iterator.get_next()))
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[0, 5])
 
   def testOffsetGreaterNumShards(self):
     with self.assertRaises(ValueError):
@@ -75,38 +58,19 @@ class ShardDatasetOpTest(test_base.DatasetTestBase):
 
   def testIteratorEndsBeforeFirstElem(self):
     dataset = dataset_ops.Dataset.range(1).shard(5, 2)
-    iterator = dataset.make_one_shot_iterator()
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[])
 
   def testLargerWorkerPool(self):
     dataset = dataset_ops.Dataset.range(10).shard(7, 5)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(5, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[5])
 
   def testIndexEqualsNumShards(self):
     dataset = dataset_ops.Dataset.range(10).shard(5, 4)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(4, sess.run(iterator.get_next()))
-      self.assertEqual(9, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
+    self.assertDatasetProduces(dataset, expected_output=[4, 9])
 
   def testIndexEqualsNumShards2(self):
     dataset = dataset_ops.Dataset.range(10).shard(4, 3)
-    iterator = dataset.make_one_shot_iterator()
-    with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(iterator.get_next()))
-      self.assertEqual(7, sess.run(iterator.get_next()))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(iterator.get_next())
-
+    self.assertDatasetProduces(dataset, expected_output=[3, 7])
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py b/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
deleted file mode 100644
index cad28f860e9d04647c510146f73d7d39de774d4a..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/shuffle_dataset_op_test.py
+++ /dev/null
@@ -1,278 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import random_seed
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class ShuffleDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  def testShuffleDataset(self):
-    components = (
-        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
-        np.array([9.0, 10.0, 11.0, 12.0])
-    )
-    count_placeholder = array_ops.placeholder_with_default(
-        constant_op.constant(5, dtypes.int64), shape=[])
-    buffer_size_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-
-    repeat_dataset = (dataset_ops.Dataset.from_tensor_slices(components)
-                      .repeat(count_placeholder))
-
-    shuffle_dataset = repeat_dataset.shuffle(buffer_size_placeholder,
-                                             seed_placeholder)
-
-    self.assertEqual(tuple([c.shape[1:] for c in components]),
-                     shuffle_dataset.output_shapes)
-
-    # Create initialization ops for iterators without and with
-    # shuffling, respectively.
-    iterator = iterator_ops.Iterator.from_structure(
-        shuffle_dataset.output_types, shuffle_dataset.output_shapes)
-    init_fifo_op = iterator.make_initializer(repeat_dataset)
-    init_shuffle_op = iterator.make_initializer(shuffle_dataset)
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      # First run without shuffling to collect the "ground truth".
-      sess.run(init_fifo_op)
-      unshuffled_elements = []
-      for _ in range(20):
-        unshuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth".
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      shuffled_elements = []
-      for _ in range(20):
-        shuffled_elements.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(shuffled_elements))
-
-      # Assert that shuffling twice with the same seeds gives the same sequence.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 37})
-      reshuffled_elements_same_seed = []
-      for _ in range(20):
-        reshuffled_elements_same_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
-
-      # Assert that shuffling twice with a different seed gives a different
-      # permutation of the same elements.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 100,
-                     seed_placeholder: 1037})
-      reshuffled_elements_different_seed = []
-      for _ in range(20):
-        reshuffled_elements_different_seed.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
-      self.assertAllEqual(
-          sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
-
-      # Assert that the shuffled dataset has the same elements as the
-      # "ground truth" when the buffer size is smaller than the input
-      # dataset.
-      sess.run(
-          init_shuffle_op,
-          feed_dict={buffer_size_placeholder: 2,
-                     seed_placeholder: 37})
-      reshuffled_elements_small_buffer = []
-      for _ in range(20):
-        reshuffled_elements_small_buffer.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-      self.assertAllEqual(
-          sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
-
-      # Test the case of shuffling an empty dataset.
-      sess.run(init_shuffle_op, feed_dict={buffer_size_placeholder: 2,
-                                           seed_placeholder: 37,
-                                           count_placeholder: 0})
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testSeedZero(self):
-    """Test for same behavior when the seed is a Python or Tensor zero."""
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=0)
-        .make_one_shot_iterator())
-    get_next = iterator.get_next()
-
-    elems = []
-    with self.cached_session() as sess:
-      for _ in range(10):
-        elems.append(sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
-    iterator = (
-        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder)
-        .make_initializable_iterator())
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
-      for elem in elems:
-        self.assertEqual(elem, sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testDefaultArguments(self):
-    components = [0, 1, 2, 3, 4]
-    iterator = (dataset_ops.Dataset.from_tensor_slices(components).shuffle(5)
-                .repeat().make_one_shot_iterator())
-
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      counts = collections.defaultdict(lambda: 0)
-      for _ in range(10):
-        for _ in range(5):
-          counts[sess.run(get_next)] += 1
-
-    for i in range(5):
-      self.assertEqual(10, counts[i])
-
-  def testShuffleNoReshuffleEachIteration(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .shuffle(10, reshuffle_each_iteration=False)
-                .batch(10)
-                .repeat(3)
-                .make_one_shot_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      initial_permutation = sess.run(next_element)
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
-      self.assertAllEqual(initial_permutation, sess.run(next_element))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  def testShuffleReshuffleEachIteration(self):
-    iterator = (dataset_ops.Dataset.range(10)
-                .shuffle(10, seed=3, reshuffle_each_iteration=True)
-                .batch(10)
-                .repeat(3)
-                .make_one_shot_iterator())
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      initial_permutation = list(sess.run(next_element))
-      for _ in range(2):
-        next_permutation = list(sess.run(next_element))
-        self.assertNotEqual(initial_permutation, next_permutation)
-        self.assertAllEqual(
-            sorted(initial_permutation), sorted(next_permutation))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
-
-  @parameterized.named_parameters(
-      ("ReshuffleGraphLevelSeed", True, 38, None),
-      ("ReshuffleOpLevelSeed", True, None, 42),
-      ("ReshuffleGraphAndOpLevelSeed", True, 38, 42),
-      ("NoReshuffleGraphLevelSeed", False, 38, None),
-      ("NoReshuffleOpLevelSeed", False, None, 42),
-      ("NoReshuffleGraphAndOpLevelSeed", False, 38, 42),
-  )
-  def testShuffleSeed(self, reshuffle, graph_level_seed, op_level_seed):
-    results = []
-    for _ in range(2):
-      with ops.Graph().as_default() as g:
-        random_seed.set_random_seed(graph_level_seed)
-        dataset = dataset_ops.Dataset.range(10).shuffle(
-            10, seed=op_level_seed, reshuffle_each_iteration=reshuffle).repeat(
-                3)
-        iterator = dataset.make_one_shot_iterator()
-        next_element = iterator.get_next()
-
-        run_results = []
-        with self.session(graph=g) as sess:
-          for _ in range(30):
-            run_results.append(sess.run(next_element))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(next_element)
-        results.append(run_results)
-
-    self.assertAllEqual(results[0], results[1])
-
-  @parameterized.named_parameters(
-      ("ReshuffleOneShot", True, False),
-      ("ReshuffleInitializable", True, True),
-      ("NoReshuffleOneShot", False, False),
-      ("NoReshuffleInitializable", False, True),
-  )
-  def testMultipleIterators(self, reshuffle, initializable):
-    with ops.Graph().as_default() as g:
-      dataset = dataset_ops.Dataset.range(100).shuffle(
-          10, reshuffle_each_iteration=reshuffle).repeat(3)
-
-      if initializable:
-        iterators = [dataset.make_initializable_iterator() for _ in range(2)]
-      else:
-        iterators = [dataset.make_one_shot_iterator() for _ in range(2)]
-
-      results = []
-      with self.session(graph=g) as sess:
-        for iterator in iterators:
-          if initializable:
-            sess.run(iterator.initializer)
-          next_element = iterator.get_next()
-          run_results = []
-          for _ in range(300):
-            run_results.append(sess.run(next_element))
-          with self.assertRaises(errors.OutOfRangeError):
-            sess.run(next_element)
-
-          results.append(run_results)
-
-        self.assertNotEqual(results[0], results[1])
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/shuffle_test.py b/tensorflow/python/data/kernel_tests/shuffle_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..13df870938d1cee7b29e0189b9b1db1731bb4114
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/shuffle_test.py
@@ -0,0 +1,249 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.shuffle()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ShuffleTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  def testShuffleDataset(self):
+    components = (
+        np.array([1, 2, 3, 4]), np.array([5, 6, 7, 8]),
+        np.array([9.0, 10.0, 11.0, 12.0])
+    )
+
+    def dataset_fn(count=5, buffer_size=None, seed=0):
+      repeat_dataset = (
+          dataset_ops.Dataset.from_tensor_slices(components).repeat(count))
+      if buffer_size:
+        shuffle_dataset = repeat_dataset.shuffle(buffer_size, seed)
+
+        self.assertEqual(
+            tuple([c.shape[1:] for c in components]),
+            shuffle_dataset.output_shapes)
+        return shuffle_dataset
+      else:
+        return repeat_dataset
+
+    # First run without shuffling to collect the "ground truth".
+    get_next = self.getNext(dataset_fn())
+    unshuffled_elements = []
+    for _ in range(20):
+      unshuffled_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    # Assert that the shuffled dataset has the same elements as the
+    # "ground truth".
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=37))
+    shuffled_elements = []
+    for _ in range(20):
+      shuffled_elements.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(sorted(unshuffled_elements), sorted(shuffled_elements))
+
+    # Assert that shuffling twice with the same seeds gives the same sequence.
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=37))
+    reshuffled_elements_same_seed = []
+    for _ in range(20):
+      reshuffled_elements_same_seed.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertEqual(shuffled_elements, reshuffled_elements_same_seed)
+
+    # Assert that shuffling twice with a different seed gives a different
+    # permutation of the same elements.
+    get_next = self.getNext(dataset_fn(buffer_size=100, seed=137))
+    reshuffled_elements_different_seed = []
+    for _ in range(20):
+      reshuffled_elements_different_seed.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertNotEqual(shuffled_elements, reshuffled_elements_different_seed)
+    self.assertAllEqual(
+        sorted(shuffled_elements), sorted(reshuffled_elements_different_seed))
+
+    # Assert that the shuffled dataset has the same elements as the
+    # "ground truth" when the buffer size is smaller than the input
+    # dataset.
+    get_next = self.getNext(dataset_fn(buffer_size=2, seed=37))
+    reshuffled_elements_small_buffer = []
+    for _ in range(20):
+      reshuffled_elements_small_buffer.append(self.evaluate(get_next()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    self.assertAllEqual(
+        sorted(unshuffled_elements), sorted(reshuffled_elements_small_buffer))
+
+    # Test the case of shuffling an empty dataset.
+    get_next = self.getNext(dataset_fn(count=0, buffer_size=100, seed=37))
+
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSeedZero(self):
+    """Test for same behavior when the seed is a Python or Tensor zero."""
+    iterator = dataset_ops.make_one_shot_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=0))
+    get_next = iterator.get_next()
+
+    elems = []
+    with self.cached_session() as sess:
+      for _ in range(10):
+        elems.append(sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+    seed_placeholder = array_ops.placeholder(dtypes.int64, shape=[])
+    iterator = dataset_ops.make_initializable_iterator(
+        dataset_ops.Dataset.range(10).shuffle(10, seed=seed_placeholder))
+    get_next = iterator.get_next()
+
+    with self.cached_session() as sess:
+      sess.run(iterator.initializer, feed_dict={seed_placeholder: 0})
+      for elem in elems:
+        self.assertEqual(elem, sess.run(get_next))
+      with self.assertRaises(errors.OutOfRangeError):
+        sess.run(get_next)
+
+  def testDefaultArguments(self):
+    components = [0, 1, 2, 3, 4]
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).shuffle(
+        5).repeat()
+    get_next = self.getNext(dataset)
+    counts = collections.defaultdict(lambda: 0)
+    for _ in range(10):
+      for _ in range(5):
+        counts[self.evaluate(get_next())] += 1
+
+    for i in range(5):
+      self.assertEqual(10, counts[i])
+
+  def testShuffleNoReshuffleEachIteration(self):
+    dataset = dataset_ops.Dataset.range(10).shuffle(
+        10, reshuffle_each_iteration=False).batch(10).repeat(3)
+    next_element = self.getNext(dataset)
+
+    initial_permutation = self.evaluate(next_element())
+    self.assertAllEqual(initial_permutation, self.evaluate(next_element()))
+    self.assertAllEqual(initial_permutation, self.evaluate(next_element()))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+  def testShuffleReshuffleEachIteration(self):
+    dataset = dataset_ops.Dataset.range(10).shuffle(
+        10, seed=3, reshuffle_each_iteration=True).batch(10).repeat(3)
+    next_element = self.getNext(dataset)
+
+    initial_permutation = list(self.evaluate(next_element()))
+    for _ in range(2):
+      next_permutation = list(self.evaluate(next_element()))
+      self.assertNotEqual(initial_permutation, next_permutation)
+      self.assertAllEqual(sorted(initial_permutation), sorted(next_permutation))
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(next_element())
+
+  @parameterized.named_parameters(
+      ("ReshuffleGraphLevelSeed", True, 38, None),
+      ("ReshuffleOpLevelSeed", True, None, 42),
+      ("ReshuffleGraphAndOpLevelSeed", True, 38, 42),
+      ("NoReshuffleGraphLevelSeed", False, 38, None),
+      ("NoReshuffleOpLevelSeed", False, None, 42),
+      ("NoReshuffleGraphAndOpLevelSeed", False, 38, 42),
+  )
+  def testSkipEagerShuffleSeed(self, reshuffle, graph_level_seed,
+                               op_level_seed):
+    results = []
+    for _ in range(2):
+      with ops.Graph().as_default() as g:
+        random_seed.set_random_seed(graph_level_seed)
+        dataset = dataset_ops.Dataset.range(10).shuffle(
+            10, seed=op_level_seed, reshuffle_each_iteration=reshuffle).repeat(
+                3)
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+        next_element = iterator.get_next()
+
+        run_results = []
+        with self.session(graph=g) as sess:
+          for _ in range(30):
+            run_results.append(sess.run(next_element))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next_element)
+        results.append(run_results)
+
+    self.assertAllEqual(results[0], results[1])
+
+  # TODO(b/117581999): fails for eager mode with result[0] equal to result[1],
+  # debug.
+  @parameterized.named_parameters(
+      ("ReshuffleOneShot", True, False),
+      ("ReshuffleInitializable", True, True),
+      ("NoReshuffleOneShot", False, False),
+      ("NoReshuffleInitializable", False, True),
+  )
+  def testSkipEagerMultipleIterators(self, reshuffle, initializable):
+    with ops.Graph().as_default() as g:
+      dataset = dataset_ops.Dataset.range(100).shuffle(
+          10, reshuffle_each_iteration=reshuffle).repeat(3)
+
+      if initializable:
+        iterators = [dataset_ops.make_initializable_iterator(dataset)
+                     for _ in range(2)]
+      else:
+        iterators = [dataset_ops.make_one_shot_iterator(dataset)
+                     for _ in range(2)]
+
+      results = []
+      with self.session(graph=g) as sess:
+        for iterator in iterators:
+          if initializable:
+            sess.run(iterator.initializer)
+          next_element = iterator.get_next()
+          run_results = []
+          for _ in range(300):
+            run_results.append(sess.run(next_element))
+          with self.assertRaises(errors.OutOfRangeError):
+            sess.run(next_element)
+
+          results.append(run_results)
+
+        self.assertNotEqual(results[0], results[1])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/skip_test.py b/tensorflow/python/data/kernel_tests/skip_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c22be576921c6d8e569ecb60c90925d004a0e5de
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/skip_test.py
@@ -0,0 +1,62 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.skip()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SkipTest(test_base.DatasetTestBase):
+
+  def testSkipTensorDataset(self):
+    components = (np.arange(10),)
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).skip(count)
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      start_range = min(count, 10) if count != -1 else 10
+      self.assertDatasetProduces(
+          dataset,
+          [tuple(components[0][i:i + 1]) for i in range(start_range, 10)])
+
+    # Skip fewer than input size, we should skip
+    # the first 4 elements and then read the rest.
+    do_test(4)
+
+    # Skip more than input size: get nothing.
+    do_test(25)
+
+    # Skip exactly input size.
+    do_test(10)
+
+    # Set -1 for 'count': skip the entire dataset.
+    do_test(-1)
+
+    # Skip nothing
+    do_test(0)
+
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/take_test.py b/tensorflow/python/data/kernel_tests/take_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..03a7ece2d8c8ea88d4504a4341ae3bb13ee2c3bf
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/take_test.py
@@ -0,0 +1,55 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.take()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TakeTest(test_base.DatasetTestBase):
+
+  def testTakeTensorDataset(self):
+    components = (np.arange(10),)
+
+    def do_test(count):
+      dataset = dataset_ops.Dataset.from_tensor_slices(components).take(count)
+      self.assertEqual([c.shape[1:] for c in components],
+                       [shape for shape in dataset.output_shapes])
+      num_output = min(count, 10) if count != -1 else 10
+      self.assertDatasetProduces(
+          dataset, [tuple(components[0][i:i + 1]) for i in range(num_output)])
+
+    # Take fewer than input size
+    do_test(4)
+
+    # Take more than input size
+    do_test(25)
+
+    # Take all of input
+    do_test(-1)
+
+    # Take nothing
+    do_test(0)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/test_base.py b/tensorflow/python/data/kernel_tests/test_base.py
index edb3eff3c172d9b5f5997fc8de0a189d3758e59a..85f6c9de231a9054a2d7a6f434502dbecce1d601 100644
--- a/tensorflow/python/data/kernel_tests/test_base.py
+++ b/tensorflow/python/data/kernel_tests/test_base.py
@@ -38,56 +38,102 @@ class DatasetTestBase(test.TestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
-  def getNext(self, dataset):
+  def getNext(self, dataset, requires_initialization=False):
     """Returns a callable that returns the next element of the dataset.
 
     Example use:
     ```python
     # In both graph and eager modes
     dataset = ...
-    nxt = self.getNext(dataset)
-    result = self.evaluate(nxt())
+    get_next = self.getNext(dataset)
+    result = self.evaluate(get_next())
     ```
 
     Args:
-      dataset: A dataset whose next element is returned
-
+      dataset: A dataset whose elements will be returned.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
     Returns:
-      A callable that returns the next element of `dataset`
+      A callable that returns the next element of `dataset`.
     """
-    it = dataset.make_one_shot_iterator()
     if context.executing_eagerly():
-      return it.get_next
+      iterator = dataset.__iter__()
+      return iterator._next_internal  # pylint: disable=protected-access
     else:
-      nxt = it.get_next()
-      return lambda: nxt
-
-  def _compare_output_to_expected(self, result_values, expected_values):
-    for i in range(len(result_values)):
-      if sparse_tensor.is_sparse(result_values[i]):
-        self.assertSparseValuesEqual(result_values[i], expected_values[i])
+      if requires_initialization:
+        iterator = dataset_ops.make_initializable_iterator(dataset)
+        self.evaluate(iterator.initializer)
       else:
-        self.assertAllEqual(result_values[i], expected_values[i])
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+      get_next = iterator.get_next()
+      return lambda: get_next
+
+  def _compareOutputToExpected(self, result_values, expected_values,
+                               assert_items_equal):
+    if assert_items_equal:
+      # TODO(shivaniagrawal): add support for nested elements containing sparse
+      # tensors when needed.
+      self.assertItemsEqual(result_values, expected_values)
+      return
+    for i in range(len(result_values)):
+      nest.assert_same_structure(result_values[i], expected_values[i])
+      for result_value, expected_value in zip(
+          nest.flatten(result_values[i]), nest.flatten(expected_values[i])):
+        if sparse_tensor.is_sparse(result_value):
+          self.assertSparseValuesEqual(result_value, expected_value)
+        else:
+          self.assertAllEqual(result_value, expected_value)
 
   def assertDatasetProduces(self,
-                            input_dataset,
+                            dataset,
                             expected_output=None,
-                            expected_err=None,
-                            create_iterator_twice=True):
+                            expected_error=None,
+                            requires_initialization=False,
+                            num_test_iterations=1,
+                            assert_items_equal=False):
+    """Asserts that a dataset produces the expected output / error.
 
-    if expected_err:
-      with self.assertRaisesWithPredicateMatch(expected_err[0],
-                                               expected_err[1]):
-        get_next = self.getNext(input_dataset)
+    Args:
+      dataset: A dataset to check for the expected output / error.
+      expected_output: A list of elements that the dataset is expected to
+        produce.
+      expected_error: A tuple `(type, predicate)` identifying the expected error
+        `dataset` should raise. The `type` should match the expected exception
+        type, while `predicate` should either be 1) a unary function that inputs
+        the raised exception and returns a boolean indicator of success or 2) a
+        regular expression that is expected to match the error message
+        partially.
+      requires_initialization: Indicates that when the test is executed in graph
+        mode, it should use an initializable iterator to iterate through the
+        dataset (e.g. when it contains stateful nodes). Defaults to False.
+      num_test_iterations: Number of times `dataset` will be iterated. Defaults
+        to 2.
+      assert_items_equal: Tests expected_output has (only) the same elements
+        regardless of order.
+    """
+    self.assertTrue(
+        expected_error is not None or expected_output is not None,
+        "Exactly one of expected_output or expected error should be provided.")
+    if expected_error:
+      self.assertTrue(
+          expected_output is None,
+          "Exactly one of expected_output or expected error should be provided."
+      )
+      with self.assertRaisesWithPredicateMatch(expected_error[0],
+                                               expected_error[1]):
+        get_next = self.getNext(
+            dataset, requires_initialization=requires_initialization)
         self.evaluate(get_next())
       return
-    repeated = 2 if create_iterator_twice else 1
-    for _ in range(repeated):
-      get_next = self.getNext(input_dataset)
+    self.assertGreater(num_test_iterations, 0)
+    for _ in range(num_test_iterations):
+      get_next = self.getNext(
+          dataset, requires_initialization=requires_initialization)
       result = []
       for _ in range(len(expected_output)):
         result.append(self.evaluate(get_next()))
-      self._compare_output_to_expected(result, expected_output)
+      self._compareOutputToExpected(result, expected_output, assert_items_equal)
       with self.assertRaises(errors.OutOfRangeError):
         self.evaluate(get_next())
       with self.assertRaises(errors.OutOfRangeError):
@@ -132,7 +178,7 @@ class DatasetTestBase(test.TestCase):
     try:
       self.evaluate(next1())
       raise ValueError(
-          'Expected dataset to raise an error of type %s, but it did not.' %
+          "Expected dataset to raise an error of type %s, but it did not." %
           repr(exception_class))
     except exception_class as e:
       expected_message = e.message
diff --git a/tensorflow/python/data/kernel_tests/text_line_dataset_test.py b/tensorflow/python/data/kernel_tests/text_line_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4db09a98084fb5430a4430da35d8018da3827dae
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/text_line_dataset_test.py
@@ -0,0 +1,165 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.TextLineDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import readers
+from tensorflow.python.eager import context
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+try:
+  import psutil  # pylint: disable=g-import-not-at-top
+  psutil_import_succeeded = True
+except ImportError:
+  psutil_import_succeeded = False
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TextLineDatasetTest(test_base.DatasetTestBase):
+
+  def _lineText(self, f, l):
+    return compat.as_bytes("%d: %d" % (f, l))
+
+  def _createFiles(self,
+                   num_files,
+                   num_lines,
+                   crlf=False,
+                   compression_type=None):
+    filenames = []
+    for i in range(num_files):
+      fn = os.path.join(self.get_temp_dir(), "text_line.%d.txt" % i)
+      filenames.append(fn)
+      contents = []
+      for j in range(num_lines):
+        contents.append(self._lineText(i, j))
+        # Always include a newline after the record unless it is
+        # at the end of the file, in which case we include it
+        if j + 1 != num_lines or i == 0:
+          contents.append(b"\r\n" if crlf else b"\n")
+      contents = b"".join(contents)
+
+      if not compression_type:
+        with open(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "GZIP":
+        with gzip.GzipFile(fn, "wb") as f:
+          f.write(contents)
+      elif compression_type == "ZLIB":
+        contents = zlib.compress(contents)
+        with open(fn, "wb") as f:
+          f.write(contents)
+      else:
+        raise ValueError("Unsupported compression_type", compression_type)
+
+    return filenames
+
+  def _testTextLineDataset(self, compression_type=None):
+    test_filenames = self._createFiles(
+        2, 5, crlf=True, compression_type=compression_type)
+
+    def dataset_fn(filenames, num_epochs, batch_size=None):
+      repeat_dataset = readers.TextLineDataset(
+          filenames, compression_type=compression_type).repeat(num_epochs)
+      if batch_size:
+        return repeat_dataset.batch(batch_size)
+      return repeat_dataset
+
+    # Basic test: read from file 0.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[0]], 1), expected_output=expected_output)
+
+    # Basic test: read from file 1.
+    self.assertDatasetProduces(
+        dataset_fn([test_filenames[1]], 1),
+        expected_output=[self._lineText(1, i) for i in range(5)])
+
+    # Basic test: read from both files.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    expected_output.extend([self._lineText(1, i) for i in range(5)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 1), expected_output=expected_output)
+
+    # Test repeated iteration through both files.
+    expected_output = [self._lineText(0, i) for i in range(5)]
+    expected_output.extend([self._lineText(1, i) for i in range(5)])
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 10), expected_output=expected_output * 10)
+
+    # Test batched and repeated iteration through both files.
+    self.assertDatasetProduces(
+        dataset_fn(test_filenames, 10, 5),
+        expected_output=[[self._lineText(0, i) for i in range(5)],
+                         [self._lineText(1, i) for i in range(5)]] * 10)
+
+  def testTextLineDatasetNoCompression(self):
+    self._testTextLineDataset()
+
+  def testTextLineDatasetGzipCompression(self):
+    self._testTextLineDataset(compression_type="GZIP")
+
+  def testTextLineDatasetZlibCompression(self):
+    self._testTextLineDataset(compression_type="ZLIB")
+
+  def testTextLineDatasetBuffering(self):
+    test_filenames = self._createFiles(2, 5, crlf=True)
+
+    repeat_dataset = readers.TextLineDataset(test_filenames, buffer_size=10)
+    expected_output = []
+    for j in range(2):
+      expected_output.extend([self._lineText(j, i) for i in range(5)])
+    self.assertDatasetProduces(repeat_dataset, expected_output=expected_output)
+
+  def testIteratorResourceCleanup(self):
+    filename = os.path.join(self.get_temp_dir(), "text.txt")
+    with open(filename, "wt") as f:
+      for i in range(3):
+        f.write("%d\n" % (i,))
+    with context.eager_mode():
+      first_iterator = iter(readers.TextLineDataset(filename))
+      self.assertEqual(b"0", next(first_iterator).numpy())
+      second_iterator = iter(readers.TextLineDataset(filename))
+      self.assertEqual(b"0", next(second_iterator).numpy())
+      # Eager kernel caching is based on op attributes, which includes the
+      # Dataset's output shape. Create a different kernel to test that they
+      # don't create resources with the same names.
+      different_kernel_iterator = iter(
+          readers.TextLineDataset(filename).repeat().batch(16))
+      self.assertEqual([16], next(different_kernel_iterator).shape)
+      # Remove our references to the Python Iterator objects, which (assuming no
+      # reference cycles) is enough to trigger DestroyResourceOp and close the
+      # partially-read files.
+      del first_iterator
+      del second_iterator
+      del different_kernel_iterator
+      if not psutil_import_succeeded:
+        self.skipTest(
+            "psutil is required to check that we've closed our files.")
+      open_files = psutil.Process().open_files()
+      self.assertNotIn(filename, [open_file.path for open_file in open_files])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py b/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..13a70aa88d0b427cfc19717bc1202a032b564938
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/tf_record_dataset_test.py
@@ -0,0 +1,170 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.TFRecordDataset`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import gzip
+import os
+import zlib
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import readers
+from tensorflow.python.framework import test_util
+from tensorflow.python.lib.io import python_io
+from tensorflow.python.platform import test
+from tensorflow.python.util import compat
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TFRecordDatasetTest(test_base.DatasetTestBase):
+
+  def setUp(self):
+    super(TFRecordDatasetTest, self).setUp()
+    self._num_files = 2
+    self._num_records = 7
+
+    self.test_filenames = self._createFiles()
+
+  def dataset_fn(self,
+                 filenames,
+                 compression_type="",
+                 num_epochs=1,
+                 batch_size=None):
+
+    repeat_dataset = readers.TFRecordDataset(
+        filenames, compression_type).repeat(num_epochs)
+    if batch_size:
+      return repeat_dataset.batch(batch_size)
+    return repeat_dataset
+
+  def _record(self, f, r):
+    return compat.as_bytes("Record %d of file %d" % (r, f))
+
+  def _createFiles(self):
+    filenames = []
+    for i in range(self._num_files):
+      fn = os.path.join(self.get_temp_dir(), "tf_record.%d.txt" % i)
+      filenames.append(fn)
+      writer = python_io.TFRecordWriter(fn)
+      for j in range(self._num_records):
+        writer.write(self._record(i, j))
+      writer.close()
+    return filenames
+
+  def testReadOneEpoch(self):
+    # Basic test: read from file 0.
+    dataset = self.dataset_fn(self.test_filenames[0])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self._record(0, i) for i in range(self._num_records)])
+
+    # Basic test: read from file 1.
+    dataset = self.dataset_fn(self.test_filenames[1])
+    self.assertDatasetProduces(
+        dataset,
+        expected_output=[self._record(1, i) for i in range(self._num_records)])
+
+    # Basic test: read from both files.
+    dataset = self.dataset_fn(self.test_filenames)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadTenEpochs(self):
+    dataset = self.dataset_fn(self.test_filenames, num_epochs=10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output * 10)
+
+  def testReadTenEpochsOfBatches(self):
+    dataset = self.dataset_fn(
+        self.test_filenames, num_epochs=10, batch_size=self._num_records)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.append(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output * 10)
+
+  def testReadZlibFiles(self):
+    zlib_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        cdata = zlib.compress(f.read())
+
+        zfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.z" % i)
+        with open(zfn, "wb") as f:
+          f.write(cdata)
+        zlib_files.append(zfn)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = self.dataset_fn(zlib_files, compression_type="ZLIB")
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadGzipFiles(self):
+    gzip_files = []
+    for i, fn in enumerate(self.test_filenames):
+      with open(fn, "rb") as f:
+        gzfn = os.path.join(self.get_temp_dir(), "tfrecord_%s.gz" % i)
+        with gzip.GzipFile(gzfn, "wb") as gzf:
+          gzf.write(f.read())
+        gzip_files.append(gzfn)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = self.dataset_fn(gzip_files, compression_type="GZIP")
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadWithBuffer(self):
+    one_mebibyte = 2**20
+    dataset = readers.TFRecordDataset(
+        self.test_filenames, buffer_size=one_mebibyte)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadFromDatasetOfFiles(self):
+    files = dataset_ops.Dataset.from_tensor_slices(self.test_filenames)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = readers.TFRecordDataset(files)
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testReadTenEpochsFromDatasetOfFilesInParallel(self):
+    files = dataset_ops.Dataset.from_tensor_slices(
+        self.test_filenames).repeat(10)
+    expected_output = []
+    for j in range(self._num_files):
+      expected_output.extend(
+          [self._record(j, i) for i in range(self._num_records)])
+    dataset = readers.TFRecordDataset(files, num_parallel_reads=4)
+    self.assertDatasetProduces(
+        dataset, expected_output=expected_output * 10, assert_items_equal=True)
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py b/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
deleted file mode 100644
index 9d067810944c23a19418a4625dae2997d122d119..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/window_dataset_op_test.py
+++ /dev/null
@@ -1,291 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from absl.testing import parameterized
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.platform import test
-
-
-class WindowDatasetTest(test_base.DatasetTestBase, parameterized.TestCase):
-
-  @parameterized.named_parameters(
-      ("1", 20, 14, 7, 1),
-      ("2", 20, 17, 9, 1),
-      ("3", 20, 14, 14, 1),
-      ("4", 20, 10, 14, 1),
-      ("5", 20, 14, 19, 1),
-      ("6", 20, 4, 1, 2),
-      ("7", 20, 2, 1, 6),
-      ("8", 20, 4, 7, 2),
-      ("9", 20, 2, 7, 6),
-      ("10", 1, 10, 4, 1),
-      ("11", 0, 10, 4, 1),
-      ("12", 20, 14, 7, 1, False),
-      ("13", 20, 17, 9, 1, False),
-      ("14", 20, 14, 14, 1, False),
-      ("15", 20, 10, 14, 1, False),
-      ("16", 20, 14, 19, 1, False),
-      ("17", 20, 4, 1, 2, False),
-      ("18", 20, 2, 1, 6, False),
-      ("19", 20, 4, 7, 2, False),
-      ("20", 20, 2, 7, 6, False),
-      ("21", 1, 10, 4, 1, False),
-      ("22", 0, 10, 4, 1, False),
-  )
-  def testWindowDataset(self, count, size, shift, stride, drop_remainder=True):
-    """Tests a dataset that slides a window its input elements."""
-    components = (np.arange(7),
-                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
-                  np.array(37.0) * np.arange(7))
-
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    shift_t = array_ops.placeholder(dtypes.int64, shape=[])
-    stride_t = array_ops.placeholder(dtypes.int64, shape=[])
-    drop_remainder_t = array_ops.placeholder(dtypes.bool, shape=[])
-
-    def _map_fn(x, y, z):
-      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
-
-    def _flat_map_fn(x, y, z):
-      return dataset_ops.Dataset.zip((x.batch(batch_size=size_t),
-                                      y.batch(batch_size=size_t),
-                                      z.batch(batch_size=size_t)))
-
-    iterator = dataset_ops.Dataset.from_tensor_slices(components).map(
-        _map_fn).repeat(count).window(
-            size=size_t,
-            shift=shift_t,
-            stride=stride_t,
-            drop_remainder=drop_remainder_t).flat_map(
-                _flat_map_fn).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([[None] + list(c.shape[1:]) for c in components],
-                     [t.shape.as_list() for t in get_next])
-
-    with self.cached_session() as sess:
-      sess.run(
-          init_op,
-          feed_dict={
-              count_t: count,
-              size_t: size,
-              shift_t: shift,
-              stride_t: stride,
-              drop_remainder_t: drop_remainder
-          })
-      num_full_batches = max(
-          0, (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
-      for i in range(num_full_batches):
-        result = sess.run(get_next)
-        for component, result_component in zip(components, result):
-          for j in range(size):
-            self.assertAllEqual(component[(i * shift + j * stride) % 7]**2,
-                                result_component[j])
-      if not drop_remainder:
-        num_partial_batches = (count * 7) // shift + (
-            (count * 7) % shift > 0) - num_full_batches
-        for i in range(num_partial_batches):
-          result = sess.run(get_next)
-          for component, result_component in zip(components, result):
-            remaining = (count * 7) - ((num_full_batches + i) * shift)
-            num_elements = remaining // stride + ((remaining % stride) > 0)
-            for j in range(num_elements):
-              self.assertAllEqual(
-                  component[((num_full_batches + i) * shift + j * stride) % 7]
-                  **2, result_component[j])
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  @parameterized.named_parameters(
-      ("1", 14, 0, 3, 1),
-      ("2", 14, 3, 0, 1),
-      ("3", 14, 3, 3, 0),
-  )
-  def testWindowDatasetInvalid(self, count, size, shift, stride):
-    count_t = array_ops.placeholder(dtypes.int64, shape=[])
-    size_t = array_ops.placeholder(dtypes.int64, shape=[])
-    shift_t = array_ops.placeholder(dtypes.int64, shape=[])
-    stride_t = array_ops.placeholder(dtypes.int64, shape=[])
-
-    iterator = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(
-        count_t).window(
-            size=size_t, shift=shift_t,
-            stride=stride_t).flat_map(lambda x: x.batch(batch_size=size_t)
-                                     ).make_initializable_iterator()
-    init_op = iterator.initializer
-
-    with self.cached_session() as sess:
-      with self.assertRaises(errors.InvalidArgumentError):
-        sess.run(
-            init_op,
-            feed_dict={
-                count_t: count,
-                size_t: size,
-                shift_t: shift,
-                stride_t: stride
-            })
-
-  def testWindowSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=5, shift=3, drop_remainder=True).flat_map(
-            lambda x: x.batch(batch_size=5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
-            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
-            dense_shape=[5, 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testWindowSparseWithDifferentDenseShapes(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=array_ops.expand_dims(
-              math_ops.range(i, dtype=dtypes.int64), 1),
-          values=array_ops.fill([math_ops.to_int32(i)], i),
-          dense_shape=[i])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=5, shift=3, drop_remainder=True).flat_map(
-            lambda x: x.batch(batch_size=5)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      num_batches = (10 - 5) // 3 + 1
-      for i in range(num_batches):
-        actual = sess.run(get_next)
-        expected_indices = []
-        expected_values = []
-        for j in range(5):
-          for k in range(i * 3 + j):
-            expected_indices.append([j, k])
-            expected_values.append(i * 3 + j)
-        expected = sparse_tensor.SparseTensorValue(
-            indices=expected_indices,
-            values=expected_values,
-            dense_shape=[5, i * 3 + 5 - 1])
-        self.assertTrue(sparse_tensor.is_sparse(actual))
-        self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedWindowSparse(self):
-
-    def _sparse(i):
-      return sparse_tensor.SparseTensorValue(
-          indices=[[0]], values=(i * [1]), dense_shape=[1])
-
-    iterator = dataset_ops.Dataset.range(10).map(_sparse).window(
-        size=4, shift=2,
-        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=4)).window(
-            size=3, shift=1, drop_remainder=True).flat_map(
-                lambda x: x.batch(batch_size=3)).make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(init_op)
-      # Slide: 1st batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
-                   [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
-                   [2, 2, 0], [2, 3, 0]],
-          values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      # Slide: 2nd batch.
-      actual = sess.run(get_next)
-      expected = sparse_tensor.SparseTensorValue(
-          indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
-                   [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
-                   [2, 2, 0], [2, 3, 0]],
-          values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
-          dense_shape=[3, 4, 1])
-      self.assertTrue(sparse_tensor.is_sparse(actual))
-      self.assertSparseValuesEqual(actual, expected)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testWindowShapeError(self):
-
-    def generator():
-      yield [1.0, 2.0, 3.0]
-      yield [4.0, 5.0, 6.0]
-      yield [7.0, 8.0, 9.0, 10.0]
-
-    iterator = dataset_ops.Dataset.from_generator(
-        generator, dtypes.float32, output_shapes=[None]).window(
-            size=3, shift=1).flat_map(
-                lambda x: x.batch(batch_size=3)).make_initializable_iterator()
-    next_element = iterator.get_next()
-
-    with self.cached_session() as sess:
-      sess.run(iterator.initializer)
-      with self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r"Cannot batch tensors with different shapes in component 0. "
-          r"First element had shape \[3\] and element 2 had shape \[4\]."):
-        sess.run(next_element)
-
-  def testWindowIgnoreErrors(self):
-    input_values = np.float32([1., np.nan, 2., np.nan, 3.])
-    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
-        lambda x: array_ops.check_numerics(x, "message")).window(
-            size=2, shift=2, stride=2,
-            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
-    get_next = dataset.make_one_shot_iterator().get_next()
-
-    with self.cached_session() as sess:
-      self.assertAllEqual(np.float32([1., 2.]), sess.run(get_next))
-      self.assertAllEqual(np.float32([2., 3.]), sess.run(get_next))
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/window_test.py b/tensorflow/python/data/kernel_tests/window_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d083142ab6a1f300b9e51b50d0113474053af05e
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/window_test.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.window()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.util import nest
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class WindowTest(test_base.DatasetTestBase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ("1", 20, 14, 7, 1),
+      ("2", 20, 17, 9, 1),
+      ("3", 20, 14, 14, 1),
+      ("4", 20, 10, 14, 1),
+      ("5", 20, 14, 19, 1),
+      ("6", 20, 4, 1, 2),
+      ("7", 20, 2, 1, 6),
+      ("8", 20, 4, 7, 2),
+      ("9", 20, 2, 7, 6),
+      ("10", 1, 10, 4, 1),
+      ("11", 0, 10, 4, 1),
+      ("12", 20, 14, 7, 1, False),
+      ("13", 20, 17, 9, 1, False),
+      ("14", 20, 14, 14, 1, False),
+      ("15", 20, 10, 14, 1, False),
+      ("16", 20, 14, 19, 1, False),
+      ("17", 20, 4, 1, 2, False),
+      ("18", 20, 2, 1, 6, False),
+      ("19", 20, 4, 7, 2, False),
+      ("20", 20, 2, 7, 6, False),
+      ("21", 1, 10, 4, 1, False),
+      ("22", 0, 10, 4, 1, False),
+  )
+  def testWindowDataset(self, count, size, shift, stride, drop_remainder=True):
+    """Tests a dataset that slides a window its input elements."""
+    components = (np.arange(7),
+                  np.array([[1, 2, 3]]) * np.arange(7)[:, np.newaxis],
+                  np.array(37.0) * np.arange(7))
+
+    def _map_fn(x, y, z):
+      return math_ops.square(x), math_ops.square(y), math_ops.square(z)
+
+    def _flat_map_fn(x, y, z):
+      return dataset_ops.Dataset.zip((x.batch(batch_size=size),
+                                      y.batch(batch_size=size),
+                                      z.batch(batch_size=size)))
+
+    dataset = dataset_ops.Dataset.from_tensor_slices(components).map(
+        _map_fn).repeat(count).window(
+            size=size,
+            shift=shift,
+            stride=stride,
+            drop_remainder=drop_remainder).flat_map(_flat_map_fn)
+    get_next = self.getNext(dataset)
+
+    self.assertEqual(
+        [[None] + list(c.shape[1:]) for c in components],
+        [ts.as_list() for ts in nest.flatten(dataset.output_shapes)])
+
+    num_full_batches = max(0,
+                           (count * 7 - ((size - 1) * stride + 1)) // shift + 1)
+    for i in range(num_full_batches):
+      result = self.evaluate(get_next())
+      for component, result_component in zip(components, result):
+        for j in range(size):
+          self.assertAllEqual(component[(i * shift + j * stride) % 7]**2,
+                              result_component[j])
+    if not drop_remainder:
+      num_partial_batches = (count * 7) // shift + (
+          (count * 7) % shift > 0) - num_full_batches
+      for i in range(num_partial_batches):
+        result = self.evaluate(get_next())
+        for component, result_component in zip(components, result):
+          remaining = (count * 7) - ((num_full_batches + i) * shift)
+          num_elements = remaining // stride + ((remaining % stride) > 0)
+          for j in range(num_elements):
+            self.assertAllEqual(
+                component[((num_full_batches + i) * shift + j * stride) % 7]**2,
+                result_component[j])
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  @parameterized.named_parameters(
+      ("1", 14, 0, 3, 1),
+      ("2", 14, 3, 0, 1),
+      ("3", 14, 3, 3, 0),
+  )
+  def testWindowDatasetInvalid(self, count, size, shift, stride):
+    dataset = dataset_ops.Dataset.range(10).map(lambda x: x).repeat(
+        count).window(
+            size=size, shift=shift,
+            stride=stride).flat_map(lambda x: x.batch(batch_size=size))
+    self.assertDatasetProduces(
+        dataset, expected_error=(errors.InvalidArgumentError, ""))
+
+  def testWindowSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=5, shift=3,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=5))
+
+    num_batches = (10 - 5) // 3 + 1
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [1, 0], [2, 0], [3, 0], [4, 0]],
+            values=[i * 3, i * 3 + 1, i * 3 + 2, i * 3 + 3, i * 3 + 4],
+            dense_shape=[5, 1]) for i in range(num_batches)
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testWindowSparseWithDifferentDenseShapes(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=array_ops.expand_dims(
+              math_ops.range(i, dtype=dtypes.int64), 1),
+          values=array_ops.fill([math_ops.to_int32(i)], i),
+          dense_shape=[i])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=5, shift=3,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=5))
+
+    expected_output = []
+    num_batches = (10 - 5) // 3 + 1
+    for i in range(num_batches):
+      expected_indices = []
+      expected_values = []
+      for j in range(5):
+        for k in range(i * 3 + j):
+          expected_indices.append([j, k])
+          expected_values.append(i * 3 + j)
+      expected_output.append(
+          sparse_tensor.SparseTensorValue(
+              indices=expected_indices,
+              values=expected_values,
+              dense_shape=[5, i * 3 + 5 - 1]))
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testNestedWindowSparse(self):
+
+    def _sparse(i):
+      return sparse_tensor.SparseTensorValue(
+          indices=[[0]], values=(i * [1]), dense_shape=[1])
+
+    dataset = dataset_ops.Dataset.range(10).map(_sparse).window(
+        size=4, shift=2,
+        drop_remainder=True).flat_map(lambda x: x.batch(batch_size=4)).window(
+            size=3, shift=1,
+            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=3))
+
+    expected_output = [
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
+                     [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
+                     [2, 2, 0], [2, 3, 0]],
+            values=[0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7],
+            dense_shape=[3, 4, 1]),
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0, 0], [0, 1, 0], [0, 2, 0], [0, 3, 0], [1, 0, 0],
+                     [1, 1, 0], [1, 2, 0], [1, 3, 0], [2, 0, 0], [2, 1, 0],
+                     [2, 2, 0], [2, 3, 0]],
+            values=[2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9],
+            dense_shape=[3, 4, 1])
+    ]
+    self.assertDatasetProduces(dataset, expected_output=expected_output)
+
+  def testWindowShapeError(self):
+
+    def generator():
+      yield [1.0, 2.0, 3.0]
+      yield [4.0, 5.0, 6.0]
+      yield [7.0, 8.0, 9.0, 10.0]
+
+    dataset = dataset_ops.Dataset.from_generator(
+        generator, dtypes.float32, output_shapes=[None]).window(
+            size=3, shift=1).flat_map(lambda x: x.batch(batch_size=3))
+    self.assertDatasetProduces(
+        dataset,
+        expected_error=(
+            errors.InvalidArgumentError,
+            r"Cannot batch tensors with different shapes in component 0. "
+            r"First element had shape \[3\] and element 2 had shape \[4\]."))
+
+  def testWindowIgnoreErrors(self):
+    input_values = np.float32([1., np.nan, 2., np.nan, 3.])
+    dataset = dataset_ops.Dataset.from_tensor_slices(input_values).map(
+        lambda x: array_ops.check_numerics(x, "message")).window(
+            size=2, shift=2, stride=2,
+            drop_remainder=True).flat_map(lambda x: x.batch(batch_size=2))
+    self.assertDatasetProduces(
+        dataset, expected_output=[np.float32([1., 2.]),
+                                  np.float32([2., 3.])])
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py b/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
deleted file mode 100644
index 9d76387a343de6e8652dd595c08bf72680a8197e..0000000000000000000000000000000000000000
--- a/tensorflow/python/data/kernel_tests/zip_dataset_op_test.py
+++ /dev/null
@@ -1,115 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Tests for the experimental input pipeline ops."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.data.kernel_tests import test_base
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.platform import test
-
-
-class ZipDatasetTest(test_base.DatasetTestBase):
-
-  def testZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.int64),
-        array_ops.placeholder(dtypes.float64)
-    ]
-
-    datasets = tuple([
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ])
-    zipped = dataset_ops.Dataset.zip(datasets)
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    with self.cached_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            equal_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-      variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, variable_length_components)})
-      for i in range(2):
-        results = sess.run(get_next)
-        for component, result_component in zip(
-            variable_length_components, results):
-          self.assertAllEqual(component[i], result_component)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-  def testNestedZipDataset(self):
-    component_placeholders = [
-        array_ops.placeholder(dtypes.int64, shape=[4, 20]),
-        array_ops.placeholder(dtypes.int64, shape=[4, 22]),
-        array_ops.placeholder(dtypes.float64, shape=[4])
-    ]
-
-    datasets = [
-        dataset_ops.Dataset.from_tensor_slices(component_placeholder)
-        for component_placeholder in component_placeholders
-    ]
-    zipped = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
-
-    iterator = zipped.make_initializable_iterator()
-    init_op = iterator.initializer
-    get_next = iterator.get_next()
-
-    self.assertEqual([20], get_next[0].shape)
-    self.assertEqual([22], get_next[1][0].shape)
-    self.assertEqual([], get_next[1][1].shape)
-
-    with self.cached_session() as sess:
-      equal_length_components = [
-          np.tile(np.array([[1], [2], [3], [4]]), 20),
-          np.tile(np.array([[12], [13], [14], [15]]), 22),
-          np.array([37.0, 38.0, 39.0, 40.0])
-      ]
-      sess.run(init_op, feed_dict={ph: value for ph, value in zip(
-          component_placeholders, equal_length_components)})
-      for i in range(4):
-        result1, (result2, result3) = sess.run(get_next)
-        self.assertAllEqual(equal_length_components[0][i], result1)
-        self.assertAllEqual(equal_length_components[1][i], result2)
-        self.assertAllEqual(equal_length_components[2][i], result3)
-      with self.assertRaises(errors.OutOfRangeError):
-        sess.run(get_next)
-
-
-if __name__ == "__main__":
-  test.main()
diff --git a/tensorflow/python/data/kernel_tests/zip_test.py b/tensorflow/python/data/kernel_tests/zip_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..477c9fa7da14276f5ad0b503402e24711b139832
--- /dev/null
+++ b/tensorflow/python/data/kernel_tests/zip_test.py
@@ -0,0 +1,101 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for `tf.data.Dataset.zip()`."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.data.kernel_tests import test_base
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class ZipTest(test_base.DatasetTestBase):
+
+  def testZipDataset(self):
+
+    def dataset_fn(components):
+      datasets = tuple([
+          dataset_ops.Dataset.from_tensor_slices(component)
+          for component in components
+      ])
+      return dataset_ops.Dataset.zip(datasets)
+
+    equal_length_components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    ]
+
+    get_next = self.getNext(dataset_fn(equal_length_components))
+    for i in range(4):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(equal_length_components, results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+    variable_length_components = [[1, 2, 3, 4], [1, 2, 3, 4, 5], [1.0, 2.0]]
+    get_next = self.getNext(dataset_fn(variable_length_components))
+    for i in range(2):
+      results = self.evaluate(get_next())
+      for component, result_component in zip(variable_length_components,
+                                             results):
+        self.assertAllEqual(component[i], result_component)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+  def testNestedZipDataset(self):
+
+    equal_length_components = [
+        np.tile(np.array([[1], [2], [3], [4]]), 20),
+        np.tile(np.array([[12], [13], [14], [15]]), 22),
+        np.array([37.0, 38.0, 39.0, 40.0])
+    ]
+    datasets = [
+        dataset_ops.Dataset.from_tensor_slices(component)
+        for component in equal_length_components
+    ]
+    dataset = dataset_ops.Dataset.zip((datasets[0], (datasets[1], datasets[2])))
+
+    self.assertEqual(
+        dataset.output_shapes,
+        (tensor_shape.TensorShape([20]),
+         (tensor_shape.TensorShape([22]), tensor_shape.TensorShape([]))))
+
+    get_next = self.getNext(dataset)
+    for i in range(4):
+      result1, (result2, result3) = self.evaluate(get_next())
+      self.assertAllEqual(equal_length_components[0][i], result1)
+      self.assertAllEqual(equal_length_components[1][i], result2)
+      self.assertAllEqual(equal_length_components[2][i], result3)
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+    with self.assertRaises(errors.OutOfRangeError):
+      self.evaluate(get_next())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/ops/BUILD b/tensorflow/python/data/ops/BUILD
index 18edc0872d766291acc243581b868869d9be65d3..fbff7df9c379e04a2b12a14ed5f5534339cde543 100644
--- a/tensorflow/python/data/ops/BUILD
+++ b/tensorflow/python/data/ops/BUILD
@@ -14,6 +14,7 @@ py_library(
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dataset_ops_gen",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:experimental_dataset_ops_gen",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:function",
         "//tensorflow/python:math_ops",
@@ -25,8 +26,12 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
+        "//tensorflow/python/data/experimental/ops:optimization_options",
         "//tensorflow/python/data/experimental/ops:stats_options",
+        "//tensorflow/python/data/experimental/ops:threading_options",
         "//tensorflow/python/data/util:nest",
+        "//tensorflow/python/data/util:options",
         "//tensorflow/python/data/util:random_seed",
         "//tensorflow/python/data/util:sparse",
         "//tensorflow/python/data/util:structure",
@@ -47,6 +52,7 @@ py_library(
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python/compat",
         "//tensorflow/python/data/util:convert",
+        "//tensorflow/python/data/util:structure",
     ],
 )
 
@@ -81,6 +87,7 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:tensor_shape",
+        "//tensorflow/python:util",
         "//tensorflow/python/data/util:structure",
     ],
 )
diff --git a/tensorflow/python/data/ops/dataset_ops.py b/tensorflow/python/data/ops/dataset_ops.py
index 59389a24f73cd271fbc35bb5b8ef01f2511f8714..bee04aaef2b382ffce179bf7b44a699bd4c7b778 100644
--- a/tensorflow/python/data/ops/dataset_ops.py
+++ b/tensorflow/python/data/ops/dataset_ops.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import abc
+import functools
 import threading
 import warnings
 
@@ -25,11 +26,16 @@ import numpy as np
 import six
 
 from tensorflow.python.compat import compat
+from tensorflow.python.data.experimental.ops import filter_for_shard_ops
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.experimental.ops import stats_options
+from tensorflow.python.data.experimental.ops import threading_options
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.data.util import nest
+from tensorflow.python.data.util import options as options_lib
 from tensorflow.python.data.util import random_seed
 from tensorflow.python.data.util import sparse
+from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -43,6 +49,7 @@ from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import script_ops
@@ -52,17 +59,18 @@ from tensorflow.python.util import function_utils
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("data.Dataset")
+ops.NotDifferentiable("ReduceDataset")
+
+
+@tf_export("data.Dataset", v1=[])
 @six.add_metaclass(abc.ABCMeta)
-class Dataset(object):
+class DatasetV2(object):
   """Represents a potentially large set of elements.
 
   A `Dataset` can be used to represent an input pipeline as a
   collection of elements (nested structures of tensors) and a "logical
   plan" of transformations that act on those elements.
   """
-  def __init__(self):
-    pass
 
   def _as_serialized_graph(self):
     """Produces serialized graph representation of the dataset.
@@ -88,6 +96,37 @@ class Dataset(object):
 
     raise NotImplementedError("Dataset._inputs")
 
+  def _has_captured_ref(self):
+    """Whether this dataset uses a function that captures ref variables.
+
+    Returns:
+      A boolean, which if true indicates that the dataset or one of its inputs
+      uses a function that captures ref variables.
+    """
+    if context.executing_eagerly():
+      # RefVariables are not supported in eager mode
+      return False
+
+    def is_tensor_or_parent_ref(tensor):
+      if tensor.dtype._is_ref_dtype:  # pylint: disable=protected-access
+        return True
+      return any([is_tensor_or_parent_ref(x) for x in tensor.op.inputs])
+
+    for fn in self._functions():
+      if any([is_tensor_or_parent_ref(t) for t in fn.function.captured_inputs]):
+        return True
+
+    return any(
+        [input_dataset._has_captured_ref() for input_dataset in self._inputs()])  # pylint: disable=protected-access
+
+  def _functions(self):
+    """Returns a list of functions associated with this dataset.
+
+    Returns:
+      A list of `StructuredFunctionWrapper` objects.
+    """
+    return []
+
   def options(self):
     """Returns the options for this dataset and its inputs.
 
@@ -106,9 +145,26 @@ class Dataset(object):
 
     dataset = self
     options = self.options()
+    if options.experimental_threading is not None:
+      t_options = options.experimental_threading
+      if t_options.private_threadpool_size is not None:
+        dataset = _PrivateThreadPoolDataset(dataset,
+                                            t_options.private_threadpool_size)
+      if t_options.max_intra_op_parallelism is not None:
+        dataset = _MaxIntraOpParallelismDataset(
+            dataset, t_options.max_intra_op_parallelism)
     static_optimizations = options._static_optimizations()  # pylint: disable=protected-access
     if static_optimizations:
-      dataset = _OptimizeDataset(dataset, static_optimizations)
+      if self._has_captured_ref():
+        warnings.warn(
+            "tf.data static optimizations are not compatible with tf.Variable. "
+            "The following optimizations will be disabled: %s. To enable "
+            "optimizations, use resource variables instead by calling "
+            "`tf.enable_resource_variables()` at the start of the program." %
+            ", ".join(static_optimizations))
+      else:
+        dataset = _OptimizeDataset(dataset, static_optimizations)
+
     if options.experimental_autotune is not False:
       dataset = _ModelDataset(dataset)
     if options.experimental_stats and options.experimental_stats.aggregator:  # pylint: disable=line-too-long
@@ -118,51 +174,6 @@ class Dataset(object):
           options.experimental_stats.counter_prefix)
     return dataset
 
-  def make_initializable_iterator(self, shared_name=None):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    Note: The returned iterator will be in an uninitialized state,
-    and you must run the `iterator.initializer` operation before using it:
-
-    ```python
-    dataset = ...
-    iterator = dataset.make_initializable_iterator()
-    # ...
-    sess.run(iterator.initializer)
-    ```
-
-    Args:
-      shared_name: (Optional.) If non-empty, the returned iterator will be
-        shared under the given name across multiple sessions that share the
-        same devices (e.g. when using a remote server).
-
-    Returns:
-      An `Iterator` over the elements of this dataset.
-
-    Raises:
-      RuntimeError: If eager execution is enabled.
-    """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "dataset.make_initializable_iterator is not supported when eager "
-          "execution is enabled.")
-    dataset = self._apply_options()
-    if shared_name is None:
-      shared_name = ""
-    if compat.forward_compatible(2018, 8, 3):
-      iterator_resource = gen_dataset_ops.iterator_v2(
-          container="", shared_name=shared_name, **flat_structure(self))
-    else:
-      iterator_resource = gen_dataset_ops.iterator(
-          container="", shared_name=shared_name, **flat_structure(self))
-    with ops.colocate_with(iterator_resource):
-      initializer = gen_dataset_ops.make_iterator(
-          dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          iterator_resource)
-    return iterator_ops.Iterator(iterator_resource, initializer,
-                                 dataset.output_types, dataset.output_shapes,
-                                 dataset.output_classes)
-
   def __iter__(self):
     """Creates an `Iterator` for enumerating the elements of this dataset.
 
@@ -176,61 +187,22 @@ class Dataset(object):
       RuntimeError: If eager execution is not enabled.
     """
     if context.executing_eagerly():
-      dataset = self._apply_options()
-      return iterator_ops.EagerIterator(dataset)
+      return iterator_ops.EagerIterator(self)
     else:
       raise RuntimeError("dataset.__iter__() is only supported when eager "
                          "execution is enabled.")
 
-  def make_one_shot_iterator(self):
-    """Creates an `Iterator` for enumerating the elements of this dataset.
-
-    Note: The returned iterator will be initialized automatically.
-    A "one-shot" iterator does not currently support re-initialization.
+  @abc.abstractproperty
+  def _element_structure(self):
+    """The structure of an element of this dataset.
 
     Returns:
-      An `Iterator` over the elements of this dataset.
+      A `Structure` object representing the structure of an element of this
+      dataset.
     """
-    if context.executing_eagerly():
-      dataset = self._apply_options()
-      return iterator_ops.EagerIterator(dataset)
-
-    graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
-
-    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
-    # a 0-argument function.
-    @function.Defun(capture_by_value=True)
-    def _make_dataset():
-      # NOTE(mrry): `Defun` does not capture the graph-level seed from the
-      # enclosing graph, so if a graph-level seed is present we set the local
-      # graph seed based on a combination of the graph- and op-level seeds.
-      if graph_level_seed is not None:
-        assert op_level_seed is not None
-        core_random_seed.set_random_seed(
-            (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
-
-      dataset = self._apply_options()
-      return dataset._as_variant_tensor()  # pylint: disable=protected-access
-
-    try:
-      _make_dataset.add_to_graph(ops.get_default_graph())
-    except ValueError as err:
-      if "Cannot capture a stateful node" in str(err):
-        raise ValueError(
-            "Failed to create a one-shot iterator for a dataset. "
-            "`Dataset.make_one_shot_iterator()` does not support datasets that "
-            "capture stateful objects, such as a `Variable` or `LookupTable`. "
-            "In these cases, use `Dataset.make_initializable_iterator()`. "
-            "(Original error: %s)" % err)
-      else:
-        six.reraise(ValueError, err)
-
-    return iterator_ops.Iterator(
-        gen_dataset_ops.one_shot_iterator(
-            dataset_factory=_make_dataset, **flat_structure(self)),
-        None, self.output_types, self.output_shapes, self.output_classes)
+    raise NotImplementedError("Dataset._element_structure")
 
-  @abc.abstractproperty
+  @property
   def output_classes(self):
     """Returns the class of each component of an element of this dataset.
 
@@ -240,9 +212,9 @@ class Dataset(object):
       A nested structure of Python `type` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_classes")
+    return self._element_structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_shapes(self):
     """Returns the shape of each component of an element of this dataset.
 
@@ -250,9 +222,9 @@ class Dataset(object):
       A nested structure of `tf.TensorShape` objects corresponding to each
       component of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_shapes")
+    return self._element_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
-  @abc.abstractproperty
+  @property
   def output_types(self):
     """Returns the type of each component of an element of this dataset.
 
@@ -260,7 +232,7 @@ class Dataset(object):
       A nested structure of `tf.DType` objects corresponding to each component
       of an element of this dataset.
     """
-    raise NotImplementedError("Dataset.output_types")
+    return self._element_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   def __repr__(self):
     output_shapes = nest.map_structure(str, self.output_shapes)
@@ -277,9 +249,10 @@ class Dataset(object):
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
-    memory and run into byte limits of graph serialization.  If tensors contains
-    one or more large NumPy arrays, consider the alternative described in
-    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    memory and run into byte limits of graph serialization. If `tensors`
+    contains one or more large NumPy arrays, consider the alternative described
+    in [this
+    guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors.
@@ -296,9 +269,10 @@ class Dataset(object):
     Note that if `tensors` contains a NumPy array, and eager execution is not
     enabled, the values will be embedded in the graph as one or more
     `tf.constant` operations. For large datasets (> 1 GB), this can waste
-    memory and run into byte limits of graph serialization.  If tensors contains
-    one or more large NumPy arrays, consider the alternative described in
-    [this guide](https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
+    memory and run into byte limits of graph serialization. If `tensors`
+    contains one or more large NumPy arrays, consider the alternative described
+    in [this guide](
+    https://tensorflow.org/guide/datasets#consuming_numpy_arrays).
 
     Args:
       tensors: A nested structure of tensors, each having the same size in the
@@ -309,19 +283,6 @@ class Dataset(object):
     """
     return TensorSliceDataset(tensors)
 
-  @staticmethod
-  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
-  def from_sparse_tensor_slices(sparse_tensor):
-    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
-
-    Args:
-      sparse_tensor: A `tf.SparseTensor`.
-
-    Returns:
-      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
-    """
-    return SparseTensorSliceDataset(sparse_tensor)
-
   class _GeneratorState(object):
     """Stores outstanding iterators created from a Python generator.
 
@@ -371,17 +332,19 @@ class Dataset(object):
 
     ```python
     import itertools
+    tf.enable_eager_execution()
 
     def gen():
       for i in itertools.count(1):
         yield (i, [1] * i)
 
-    ds = Dataset.from_generator(
+    ds = tf.data.Dataset.from_generator(
         gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
-    value = ds.make_one_shot_iterator().get_next()
 
-    sess.run(value)  # (1, array([1]))
-    sess.run(value)  # (2, array([1, 1]))
+    for value in ds.take(2):
+      print value
+    # (1, array([1]))
+    # (2, array([1, 1]))
     ```
 
     NOTE: The current implementation of `Dataset.from_generator()` uses
@@ -433,7 +396,7 @@ class Dataset(object):
     flattened_types = [dtypes.as_dtype(dt) for dt in nest.flatten(output_types)]
     flattened_shapes = nest.flatten(output_shapes)
 
-    generator_state = Dataset._GeneratorState(generator)
+    generator_state = DatasetV2._GeneratorState(generator)
 
     def get_iterator_id_fn(unused_dummy):
       """Creates a unique `iterator_id` for each pass over the dataset.
@@ -578,7 +541,7 @@ class Dataset(object):
     ```
 
     Args:
-      *args: follow same semantics as python's xrange.
+      *args: follows the same semantics as python's xrange.
         len(args) == 1 -> start = 0, stop = args[0], step = 1
         len(args) == 2 -> start = args[0], stop = args[1], step = 1
         len(args) == 3 -> start = args[0], stop = args[1, stop = args[2]
@@ -818,78 +781,6 @@ class Dataset(object):
     """
     return SkipDataset(self, count)
 
-  def shard(self, num_shards, index):
-    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
-
-    This dataset operator is very useful when running distributed training, as
-    it allows each worker to read a unique subset.
-
-    When reading a single input file, you can skip elements as follows:
-
-    ```python
-    d = tf.data.TFRecordDataset(FLAGS.input_file)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Important caveats:
-
-    - Be sure to shard before you use any randomizing operator (such as
-      shuffle).
-    - Generally it is best if the shard operator is used early in the dataset
-      pipeline. For example, when reading from a set of TFRecord files, shard
-      before converting the dataset to input samples. This avoids reading every
-      file on every worker. The following is an example of an efficient
-      sharding strategy within a complete pipeline:
-
-    ```python
-    d = Dataset.list_files(FLAGS.pattern)
-    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
-    d = d.repeat(FLAGS.num_epochs)
-    d = d.shuffle(FLAGS.shuffle_buffer_size)
-    d = d.interleave(tf.data.TFRecordDataset,
-                     cycle_length=FLAGS.num_readers, block_length=1)
-    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
-    ```
-
-    Args:
-      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel.
-      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
-
-    Returns:
-      Dataset: A `Dataset`.
-
-    Raises:
-      ValueError: if `num_shards` or `index` are illegal values. Note: error
-        checking is done on a best-effort basis, and aren't guaranteed to be
-        caught upon dataset creation. (e.g. providing in a placeholder tensor
-        bypasses the early checking, and will instead result in an error during
-        a session.run call.)
-    """
-    num_shards = ops.convert_to_tensor(
-        num_shards, name="num_shards", dtype=dtypes.int64)
-    num_shards_static = tensor_util.constant_value(num_shards)
-    index = ops.convert_to_tensor(index, name="index", dtype=dtypes.int64)
-    index_static = tensor_util.constant_value(index)
-
-    if num_shards_static is not None and num_shards_static < 1:
-      raise ValueError("num_shards must be >= 1; got: %s" % num_shards_static)
-    if index_static is not None and index_static < 0:
-      raise ValueError("index must be >= 0; got: %s" % index_static)
-    if (index_static is not None and num_shards_static is not None and
-        index_static >= num_shards_static):
-      raise ValueError("index must be <= num_shards; %s is not < %s" %
-                       (index_static, num_shards_static))
-
-    def filter_fn(elem_index, _):
-      mod_result = math_ops.mod(elem_index, num_shards)
-      return math_ops.equal(mod_result, index)
-
-    return self._enumerate().filter(filter_fn).map(lambda _, elem: elem)
-
   def batch(self, batch_size, drop_remainder=False):
     """Combines consecutive elements of this dataset into batches.
 
@@ -904,7 +795,7 @@ class Dataset(object):
       batch_size: A `tf.int64` scalar `tf.Tensor`, representing the number of
         consecutive elements of this dataset to combine in a single batch.
       drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
-        whether the last batch should be dropped in the case its has fewer than
+        whether the last batch should be dropped in the case it has fewer than
         `batch_size` elements; the default behavior is not to drop the smaller
         batch.
 
@@ -961,7 +852,7 @@ class Dataset(object):
         respective components.  Defaults are `0` for numeric types and
         the empty string for string types.
       drop_remainder: (Optional.) A `tf.bool` scalar `tf.Tensor`, representing
-        whether the last batch should be dropped in the case its has fewer than
+        whether the last batch should be dropped in the case it has fewer than
         `batch_size` elements; the default behavior is not to drop the smaller
         batch.
 
@@ -1056,15 +947,18 @@ class Dataset(object):
        `self.output_types`) to another nested structure of tensors.
       num_parallel_calls: (Optional.) A `tf.int32` scalar `tf.Tensor`,
         representing the number elements to process in parallel. If not
-        specified, elements will be processed sequentially.
+        specified, elements will be processed sequentially. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
     """
     if num_parallel_calls is None:
-      return MapDataset(self, map_func)
+      return MapDataset(self, map_func, preserve_cardinality=True)
     else:
-      return ParallelMapDataset(self, map_func, num_parallel_calls)
+      return ParallelMapDataset(
+          self, map_func, num_parallel_calls, preserve_cardinality=True)
 
   def flat_map(self, map_func):
     """Maps `map_func` across this dataset and flattens the result.
@@ -1166,7 +1060,9 @@ class Dataset(object):
       num_parallel_calls: (Optional.) If specified, the implementation creates
         a threadpool, which is used to fetch inputs from cycle elements
         asynchronously and in parallel. The default behavior is to fetch inputs
-        from cycle elements synchronously with no parallelism.
+        from cycle elements synchronously with no parallelism. If the value
+        `tf.data.experimental.AUTOTUNE` is used, then the number of parallel
+        calls is set dynamically based on available CPU.
 
     Returns:
       Dataset: A `Dataset`.
@@ -1215,7 +1111,7 @@ class Dataset(object):
           dataset.
     """
     dataset = transformation_func(self)
-    if not isinstance(dataset, Dataset):
+    if not isinstance(dataset, DatasetV2):
       raise TypeError("`transformation_func` must return a Dataset.")
     dataset._input_datasets = [self]  # pylint: disable=protected-access
     return dataset
@@ -1299,27 +1195,23 @@ class Dataset(object):
 
     # Compute initial values for the state classes, shapes and types based on
     # the initial state.
-    state_classes = sparse.get_classes(initial_state)
-    state_shapes = nest.pack_sequence_as(
-        initial_state, [t.get_shape() for t in nest.flatten(initial_state)])
-    state_types = nest.pack_sequence_as(
-        initial_state, [t.dtype for t in nest.flatten(initial_state)])
+    state_structure = structure_lib.Structure.from_value(initial_state)
 
     # Iteratively rerun the reduce function until reaching a fixed point on
-    # `self._state_shapes`.
+    # `state_structure`.
     need_to_rerun = True
     while need_to_rerun:
 
       wrapped_func = StructuredFunctionWrapper(
           reduce_func,
           "reduce()",
-          input_classes=(state_classes, self.output_classes),
-          input_shapes=(state_shapes, self.output_shapes),
-          input_types=(state_types, self.output_types),
+          input_structure=structure_lib.NestedStructure(
+              (state_structure, self._element_structure)),
           add_to_graph=False)
 
       # Extract and validate class information from the returned values.
       output_classes = wrapped_func.output_classes
+      state_classes = state_structure._to_legacy_output_classes()  # pylint: disable=protected-access
       for new_state_class, state_class in zip(
           nest.flatten(output_classes), nest.flatten(state_classes)):
         if not issubclass(new_state_class, state_class):
@@ -1330,6 +1222,7 @@ class Dataset(object):
 
       # Extract and validate type information from the returned values.
       output_types = wrapped_func.output_types
+      state_types = state_structure._to_legacy_output_types()  # pylint: disable=protected-access
       for new_state_type, state_type in zip(
           nest.flatten(output_types), nest.flatten(state_types)):
         if new_state_type != state_type:
@@ -1340,6 +1233,7 @@ class Dataset(object):
 
       # Extract shape information from the returned values.
       output_shapes = wrapped_func.output_shapes
+      state_shapes = state_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
       flat_state_shapes = nest.flatten(state_shapes)
       flat_new_state_shapes = nest.flatten(output_shapes)
       weakened_state_shapes = [
@@ -1357,35 +1251,33 @@ class Dataset(object):
           break
 
       if need_to_rerun:
-        state_shapes = nest.pack_sequence_as(state_shapes,
-                                             weakened_state_shapes)
+        # TODO(b/110122868): Support a "most specific compatible structure"
+        # method for combining structures, to avoid using legacy structures
+        # here.
+        state_structure = structure_lib.convert_legacy_structure(
+            state_types,
+            nest.pack_sequence_as(state_shapes, weakened_state_shapes),
+            state_classes)
 
     reduce_func = wrapped_func.function
     reduce_func.add_to_graph(ops.get_default_graph())
 
-    return sparse.deserialize_sparse_tensors(
-        nest.pack_sequence_as(
-            output_types,
-            gen_dataset_ops.reduce_dataset(
-                self._as_variant_tensor(),  # pylint: disable=protected-access
-                nest.flatten(sparse.serialize_sparse_tensors(initial_state)),
-                reduce_func.captured_inputs,
-                f=reduce_func,
-                output_shapes=nest.flatten(
-                    sparse.as_dense_shapes(output_shapes, output_classes)),
-                output_types=nest.flatten(
-                    sparse.as_dense_types(output_types, output_classes)))),
-        output_types,
-        output_shapes,
-        output_classes)
+    # pylint: disable=protected-access
+    return state_structure._from_compatible_tensor_list(
+        gen_dataset_ops.reduce_dataset(
+            self._as_variant_tensor(),
+            state_structure._to_tensor_list(initial_state),
+            reduce_func.captured_inputs,
+            f=reduce_func,
+            output_shapes=state_structure._flat_shapes,
+            output_types=state_structure._flat_types))
 
   def with_options(self, options):
     """Returns a new `tf.data.Dataset` with the given options set.
 
-    The options are "global" in the sense they apply to the entire input
-    pipeline in which the `with_options` transformation is used. If options are
-    set multiple times, they are merged if possible (see
-    `tf.data.Options.merge()` for details).
+    The options are "global" in the sense they apply to the entire dataset.
+    If options are set multiple times, they are merged as long as different
+    options do not use different non-default values.
 
     Args:
       options: A `tf.data.Options` that identifies the options the use.
@@ -1394,164 +1286,489 @@ class Dataset(object):
       Dataset: A `Dataset` with the given options.
 
     Raises:
-      ValueError: if options are set more than once
+      ValueError: when an option is set more than once to a non-default value
     """
     return _OptionsDataset(self, options)
 
 
-@tf_export("data.Options")
-class Options(object):
-  """Represents options for tf.data.Dataset.
+@tf_export(v1=["data.Dataset"])
+class DatasetV1(DatasetV2):
+  """Represents a potentially large set of elements.
 
-  An `Options` object can be for instance used to control which static
-  optimizations to apply or whether to use performance modeling to dynamically
-  tune the parallelism of operations such as `tf.data.Dataset.map` or
-  `tf.data.Dataset.interleave`.
+  A `Dataset` can be used to represent an input pipeline as a
+  collection of elements (nested structures of tensors) and a "logical
+  plan" of transformations that act on those elements.
   """
-  for _name, _ty, _docstring in [
-      ("experimental_autotune", bool,
-       "Whether to dynamically adjust the values of tunable parameters (e.g. "
-       "degrees of parallelism)."),
-      ("experimental_deterministic", bool,
-       "Whether the outputs need to be produced in deterministic order."),
-      ("experimental_filter_fusion", bool,
-       "Whether to fuse filter transformations."),
-      ("experimental_hoist_random_uniform", bool,
-       "Whether to hoist `tf.random_uniform()` ops out of map transformations."
-      ),
-      ("experimental_stats", stats_options.StatsOptions,
-       "Associate the given statistics options with the dataset pipeline."),
-      ("experimental_map_and_batch_fusion", bool,
-       "Whether to fuse map and batch transformations."),
-      ("experimental_map_and_filter_fusion", bool,
-       "Whether to fuse map and filter transformations."),
-      ("experimental_map_fusion", bool, "Whether to fuse map transformations."),
-      ("experimental_map_parallelization", bool,
-       "Whether to parallelize stateless map transformations."),
-      ("experimental_map_vectorization", bool,
-       "Whether to vectorize map transformations."),
-      ("experimental_noop_elimination", bool,
-       "Whether to eliminate no-op transformations."),
-      ("experimental_shuffle_and_repeat_fusion", bool,
-       "Whether to fuse shuffle and repeat transformations."),
-      ("experimental_numa_aware", bool,
-       "Whether to use NUMA-aware operations."),
-  ]:
-
-    def _make_getter(name):  # pylint: disable=no-self-argument
-
-      def getter(self):
-        return getattr(self, "_" + name)
-
-      return getter
-
-    def _make_setter(name, ty):  # pylint: disable=no-self-argument
-
-      def setter(self, value):
-        if not isinstance(value, ty):
-          raise TypeError(
-              "Attempting to set the option %s to incompatible value: %r when "
-              "it expects  %r" % (name, value, ty))
-        setattr(self, "_" + name, value)
-
-      return setter
-
-    vars()["_" + _name] = None
-    vars()[_name] = property(
-        _make_getter(_name), _make_setter(_name, _ty), None, _docstring)
 
   def __init__(self):
     pass
 
-  def __eq__(self, other):
-    if isinstance(other, self.__class__):
-      return self.__dict__ == other.__dict__
-    else:
-      return False
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+  def make_one_shot_iterator(self):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
 
-  def __ne__(self, other):
-    return not self.__eq__(other)
+    Note: The returned iterator will be initialized automatically.
+    A "one-shot" iterator does not currently support re-initialization.
 
-  def _static_optimizations(self):
-    """Produces the list of enabled static optimizations."""
-    experimental_optimizations = [
-        "filter_fusion",
-        "hoist_random_uniform",
-        "map_and_batch_fusion",
-        "map_and_filter_fusion",
-        "map_fusion",
-        "map_parallelization",
-        "map_vectorization",
-        "noop_elimination",
-        "shuffle_and_repeat_fusion",
-    ]
-    result = []
-    for exp_opt in experimental_optimizations:
-      if getattr(self, "experimental_" + exp_opt):
-        result.append(exp_opt)
+    Returns:
+      An `Iterator` over the elements of this dataset.
+    """
+    if context.executing_eagerly():
+      return iterator_ops.EagerIterator(self)
 
-    if getattr(self, "experimental_numa_aware"):
-      result.append("make_numa_aware")
-    if getattr(self, "experimental_deterministic") is False:
-      result.append("make_sloppy")
-    experimental_stats_options = getattr(self, "experimental_stats")
-    if experimental_stats_options and getattr(experimental_stats_options,
-                                              "latency_all_edges"):
-      result.append("latency_all_edges")
-    return result
+    graph_level_seed, op_level_seed = core_random_seed.get_seed(None)
 
-  def merge(self, options):
-    """Merges itself with the given `tf.data.Options`.
+    # NOTE(mrry): We capture by value here to ensure that `_make_dataset()` is
+    # a 0-argument function.
+    @function.Defun(capture_by_value=True)
+    def _make_dataset():
+      """Factory function for a dataset."""
+      # NOTE(mrry): `Defun` does not capture the graph-level seed from the
+      # enclosing graph, so if a graph-level seed is present we set the local
+      # graph seed based on a combination of the graph- and op-level seeds.
+      if graph_level_seed is not None:
+        assert op_level_seed is not None
+        core_random_seed.set_random_seed(
+            (graph_level_seed + 87654321 * op_level_seed) % (2 ** 63 - 1))
 
-    The given `tf.data.Options` can be merged as long as there does not exist an
-    attribute that is set to different values in `self` and `options`.
+      dataset = self._apply_options()
+      return dataset._as_variant_tensor()  # pylint: disable=protected-access
 
-    Args:
-      options: a `tf.data.Options` to merge with
+    try:
+      _make_dataset.add_to_graph(ops.get_default_graph())
+    except ValueError as err:
+      if "Cannot capture a stateful node" in str(err):
+        raise ValueError(
+            "Failed to create a one-shot iterator for a dataset. "
+            "`Dataset.make_one_shot_iterator()` does not support datasets that "
+            "capture stateful objects, such as a `Variable` or `LookupTable`. "
+            "In these cases, use `Dataset.make_initializable_iterator()`. "
+            "(Original error: %s)" % err)
+      else:
+        six.reraise(ValueError, err)
 
-    Raises:
-      ValueError: if the given `tf.data.Options` cannot be merged
+    return iterator_ops.Iterator(
+        gen_dataset_ops.one_shot_iterator(
+            dataset_factory=_make_dataset, **flat_structure(self)),
+        None, self.output_types, self.output_shapes, self.output_classes)
 
-    Returns:
-      New `tf.data.Options()` object which is the result of merging self with
-      the input `tf.data.Options`.
-    """
-    result = Options()
-    for other in [self, options]:
-      for name in [
-          "experimental_autotune",
-          "experimental_deterministic",
-          "experimental_filter_fusion",
-          "experimental_hoist_random_uniform",
-          "experimental_map_and_batch_fusion",
-          "experimental_map_and_filter_fusion",
-          "experimental_map_fusion",
-          "experimental_map_parallelization",
-          "experimental_map_vectorization",
-          "experimental_noop_elimination",
-          "experimental_numa_aware",
-          "experimental_shuffle_and_repeat_fusion",
-          "experimental_stats",
-      ]:
-        this = getattr(result, name)
-        that = getattr(other, name)
-        if that is not None:
-          if this is None:
-            setattr(result, name, that)
-          elif this != that:
-            raise ValueError(
-                "Cannot merge incompatible values of option: %s" % (name))
-    return result
+  @deprecation.deprecated(
+      None, "Use `for ... in dataset:` to iterate over a dataset. If using "
+      "`tf.estimator`, return the `Dataset` object directly from your input "
+      "function. As a last resort, you can use "
+      "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+  def make_initializable_iterator(self, shared_name=None):
+    """Creates an `Iterator` for enumerating the elements of this dataset.
 
+    Note: The returned iterator will be in an uninitialized state,
+    and you must run the `iterator.initializer` operation before using it:
 
-class DatasetSource(Dataset):
-  """Abstract class representing a dataset with no inputs."""
+    ```python
+    dataset = ...
+    iterator = dataset.make_initializable_iterator()
+    # ...
+    sess.run(iterator.initializer)
+    ```
+
+    Args:
+      shared_name: (Optional.) If non-empty, the returned iterator will be
+        shared under the given name across multiple sessions that share the
+        same devices (e.g. when using a remote server).
+
+    Returns:
+      An `Iterator` over the elements of this dataset.
+
+    Raises:
+      RuntimeError: If eager execution is enabled.
+    """
+    if context.executing_eagerly():
+      raise RuntimeError(
+          "dataset.make_initializable_iterator is not supported when eager "
+          "execution is enabled.")
+    dataset = self._apply_options()
+    if shared_name is None:
+      shared_name = ""
+    if compat.forward_compatible(2018, 8, 3):
+      iterator_resource = gen_dataset_ops.iterator_v2(
+          container="", shared_name=shared_name, **flat_structure(self))
+    else:
+      iterator_resource = gen_dataset_ops.iterator(
+          container="", shared_name=shared_name, **flat_structure(self))
+    with ops.colocate_with(iterator_resource):
+      initializer = gen_dataset_ops.make_iterator(
+          dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          iterator_resource)
+    return iterator_ops.Iterator(iterator_resource, initializer,
+                                 dataset.output_types, dataset.output_shapes,
+                                 dataset.output_classes)
+
+  @property
+  def _element_structure(self):
+    # TODO(b/110122868): Remove this override once all `Dataset` instances
+    # implement `element_structure`.
+    return structure_lib.convert_legacy_structure(
+        self.output_types, self.output_shapes, self.output_classes)
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensors)
+  def from_tensors(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensors(tensors))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_tensor_slices)
+  def from_tensor_slices(tensors):
+    return DatasetV1Adapter(DatasetV2.from_tensor_slices(tensors))
+
+  @staticmethod
+  @deprecation.deprecated(None, "Use `tf.data.Dataset.from_tensor_slices()`.")
+  def from_sparse_tensor_slices(sparse_tensor):
+    """Splits each rank-N `tf.SparseTensor` in this dataset row-wise.
+
+    Args:
+      sparse_tensor: A `tf.SparseTensor`.
+
+    Returns:
+      Dataset: A `Dataset` of rank-(N-1) sparse tensors.
+    """
+    return DatasetV1Adapter(SparseTensorSliceDataset(sparse_tensor))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.from_generator)
+  def from_generator(generator, output_types, output_shapes=None, args=None):
+    return DatasetV1Adapter(DatasetV2.from_generator(
+        generator, output_types, output_shapes, args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.range)
+  def range(*args):
+    return DatasetV1Adapter(DatasetV2.range(*args))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.zip)
+  def zip(datasets):
+    return DatasetV1Adapter(DatasetV2.zip(datasets))
+
+  @functools.wraps(DatasetV2.concatenate)
+  def concatenate(self, dataset):
+    return DatasetV1Adapter(super(DatasetV1, self).concatenate(dataset))
+
+  @functools.wraps(DatasetV2.prefetch)
+  def prefetch(self, buffer_size):
+    return DatasetV1Adapter(super(DatasetV1, self).prefetch(buffer_size))
+
+  @staticmethod
+  @functools.wraps(DatasetV2.list_files)
+  def list_files(file_pattern, shuffle=None, seed=None):
+    return DatasetV1Adapter(DatasetV2.list_files(file_pattern, shuffle, seed))
+
+  @functools.wraps(DatasetV2.repeat)
+  def repeat(self, count=None):
+    return DatasetV1Adapter(super(DatasetV1, self).repeat(count))
+
+  @functools.wraps(DatasetV2.shuffle)
+  def shuffle(self, buffer_size, seed=None, reshuffle_each_iteration=None):
+    return DatasetV1Adapter(super(DatasetV1, self).shuffle(
+        buffer_size, seed, reshuffle_each_iteration))
+
+  @functools.wraps(DatasetV2.cache)
+  def cache(self, filename=""):
+    return DatasetV1Adapter(super(DatasetV1, self).cache(filename))
+
+  @functools.wraps(DatasetV2.take)
+  def take(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).take(count))
+
+  @functools.wraps(DatasetV2.skip)
+  def skip(self, count):
+    return DatasetV1Adapter(super(DatasetV1, self).skip(count))
+
+  @deprecation.deprecated(
+      None, "Use `dataset.apply(tf.data.experimental.filter_for_shard(...))`.")
+  def shard(self, num_shards, index):
+    """Creates a `Dataset` that includes only 1/`num_shards` of this dataset.
+
+    This dataset operator is very useful when running distributed training, as
+    it allows each worker to read a unique subset.
+
+    When reading a single input file, you can skip elements as follows:
+
+    ```python
+    d = tf.data.TFRecordDataset(FLAGS.input_file)
+    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
+    d = d.repeat(FLAGS.num_epochs)
+    d = d.shuffle(FLAGS.shuffle_buffer_size)
+    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+    ```
+
+    Important caveats:
+
+    - Be sure to shard before you use any randomizing operator (such as
+      shuffle).
+    - Generally it is best if the shard operator is used early in the dataset
+      pipeline. For example, when reading from a set of TFRecord files, shard
+      before converting the dataset to input samples. This avoids reading every
+      file on every worker. The following is an example of an efficient
+      sharding strategy within a complete pipeline:
+
+    ```python
+    d = Dataset.list_files(FLAGS.pattern)
+    d = d.shard(FLAGS.num_workers, FLAGS.worker_index)
+    d = d.repeat(FLAGS.num_epochs)
+    d = d.shuffle(FLAGS.shuffle_buffer_size)
+    d = d.interleave(tf.data.TFRecordDataset,
+                     cycle_length=FLAGS.num_readers, block_length=1)
+    d = d.map(parser_fn, num_parallel_calls=FLAGS.num_map_threads)
+    ```
+
+    Args:
+      num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
+        shards operating in parallel.
+      index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
+
+    Returns:
+      Dataset: A `Dataset`.
+
+    Raises:
+      ValueError: if `num_shards` or `index` are illegal values. Note: error
+        checking is done on a best-effort basis, and errors aren't guaranteed
+        to be caught upon dataset creation. (e.g. providing in a placeholder
+        tensor bypasses the early checking, and will instead result in an error
+        during a session.run call.)
+    """
+    return self.apply(filter_for_shard_ops.filter_for_shard(num_shards, index))
+
+  @functools.wraps(DatasetV2.batch)
+  def batch(self, batch_size, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).batch(
+        batch_size, drop_remainder))
+
+  @functools.wraps(DatasetV2.padded_batch)
+  def padded_batch(self,
+                   batch_size,
+                   padded_shapes,
+                   padding_values=None,
+                   drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).padded_batch(
+        batch_size, padded_shapes, padding_values, drop_remainder))
+
+  @functools.wraps(DatasetV2.map)
+  def map(self, map_func, num_parallel_calls=None):
+    if num_parallel_calls is None:
+      return DatasetV1Adapter(
+          MapDataset(self, map_func, preserve_cardinality=False))
+    else:
+      return DatasetV1Adapter(
+          ParallelMapDataset(
+              self, map_func, num_parallel_calls, preserve_cardinality=False))
+
+  @functools.wraps(DatasetV2.flat_map)
+  def flat_map(self, map_func):
+    return DatasetV1Adapter(super(DatasetV1, self).flat_map(map_func))
+
+  @functools.wraps(DatasetV2.interleave)
+  def interleave(self,
+                 map_func,
+                 cycle_length,
+                 block_length=1,
+                 num_parallel_calls=None):
+    return DatasetV1Adapter(super(DatasetV1, self).interleave(
+        map_func, cycle_length, block_length, num_parallel_calls))
+
+  @functools.wraps(DatasetV2.filter)
+  def filter(self, predicate):
+    return DatasetV1Adapter(super(DatasetV1, self).filter(predicate))
+
+  @functools.wraps(DatasetV2.apply)
+  def apply(self, transformation_func):
+    return DatasetV1Adapter(super(DatasetV1, self).apply(transformation_func))
+
+  @functools.wraps(DatasetV2.window)
+  def window(self, size, shift=None, stride=1, drop_remainder=False):
+    return DatasetV1Adapter(super(DatasetV1, self).window(
+        size, shift, stride, drop_remainder))
+
+  @functools.wraps(DatasetV2.with_options)
+  def with_options(self, options):
+    return DatasetV1Adapter(super(DatasetV1, self).with_options(options))
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# this alias in place.
+Dataset = DatasetV1
+
+
+class DatasetV1Adapter(DatasetV1):
+  """Wraps a V2 `Dataset` object in the `tf.compat.v1.data.Dataset` API."""
+
+  def __init__(self, dataset):
+    super(DatasetV1Adapter, self).__init__()
+    self._dataset = dataset
+
+  def _as_variant_tensor(self):
+    return self._dataset._as_variant_tensor()  # pylint: disable=protected-access
+
+  def _has_captured_ref(self):
+    return self._dataset._has_captured_ref()  # pylint: disable=protected-access
+
+  def _inputs(self):
+    return self._dataset._inputs()  # pylint: disable=protected-access
+
+  def options(self):
+    return self._dataset.options()
+
+  @property
+  def _element_structure(self):
+    return self._dataset._element_structure  # pylint: disable=protected-access
+
+  def __iter__(self):
+    return iter(self._dataset)
+
+
+@tf_export(v1=["data.make_one_shot_iterator"])
+def make_one_shot_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+
+  Note: The returned iterator will be initialized automatically.
+  A "one-shot" iterator does not support re-initialization.
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of this dataset.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_one_shot_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_one_shot_iterator()
+
+
+@tf_export(v1=["data.make_initializable_iterator"])
+def make_initializable_iterator(dataset):
+  """Creates a `tf.data.Iterator` for enumerating the elements of a dataset.
+
+  Note: The returned iterator will be in an uninitialized state,
+  and you must run the `iterator.initializer` operation before using it:
+
+  ```python
+  dataset = ...
+  iterator = dataset.make_initializable_iterator()
+  # ...
+  sess.run(iterator.initializer)
+  ```
+
+  Args:
+    dataset: A `tf.data.Dataset`.
+
+  Returns:
+    A `tf.data.Iterator` over the elements of `dataset`.
+
+  Raises:
+    RuntimeError: If eager execution is enabled.
+  """
+  try:
+    # Call the defined `make_one_shot_iterator()` if there is one, because some
+    # datasets (e.g. for prefetching) override its behavior.
+    return dataset.make_initializable_iterator()
+  except AttributeError:
+    return DatasetV1Adapter(dataset).make_initializable_iterator()
+
+
+@tf_export("data.Options")
+class Options(options_lib.OptionsBase):
+  """Represents options for tf.data.Dataset.
+
+  An `Options` object can be, for instance, used to control which static
+  optimizations to apply or whether to use performance modeling to dynamically
+  tune the parallelism of operations such as `tf.data.Dataset.map` or
+  `tf.data.Dataset.interleave`.
+  """
+
+  experimental_autotune = options_lib.create_option(
+      name="experimental_autotune",
+      ty=bool,
+      docstring=
+      "Whether to dynamically adjust the values of tunable parameters (e.g. "
+      "degrees of parallelism).")
+
+  experimental_deterministic = options_lib.create_option(
+      name="experimental_deterministic",
+      ty=bool,
+      docstring=
+      "Whether the outputs need to be produced in deterministic order."
+  )
+
+  experimental_numa_aware = options_lib.create_option(
+      name="experimental_numa_aware",
+      ty=bool,
+      docstring="Whether to use NUMA-aware operations.")
+
+  experimental_optimization = options_lib.create_option(
+      name="experimental_optimization",
+      ty=optimization_options.OptimizationOptions,
+      docstring="Associates the given optimization options with the dataset.")
+
+  experimental_stats = options_lib.create_option(
+      name="experimental_stats",
+      ty=stats_options.StatsOptions,
+      docstring="Associates the given statistics options with the dataset.")
+
+  experimental_threading = options_lib.create_option(
+      name="experimental_threading",
+      ty=threading_options.ThreadingOptions,
+      docstring="Associates the given threading options with the dataset.")
+
+  def _static_optimizations(self):
+    """Produces the list of enabled static optimizations."""
+
+    result = []
+    exp_optimization_options = (
+        self.experimental_optimization or
+        optimization_options.OptimizationOptions())  # If not set, use default
+    result.extend(exp_optimization_options._static_optimizations())  # pylint: disable=protected-access
+
+    if self.experimental_numa_aware:
+      result.append("make_numa_aware")
+    if self.experimental_deterministic is False:
+      result.append("make_sloppy")
+    exp_stats_options = self.experimental_stats
+    if exp_stats_options and exp_stats_options.latency_all_edges:
+      result.append("latency_all_edges")
+    return result
+
+  def merge(self, options):
+    """Merges itself with the given `tf.data.Options`.
+
+    The given `tf.data.Options` can be merged as long as there does not exist an
+    attribute that is set to different values in `self` and `options`.
+
+    Args:
+      options: a `tf.data.Options` to merge with
+
+    Raises:
+      ValueError: if the given `tf.data.Options` cannot be merged
+
+    Returns:
+      New `tf.data.Options()` object which is the result of merging self with
+      the input `tf.data.Options`.
+    """
+    return options_lib.merge_options(self, options)
+
+
+class DatasetSource(DatasetV2):
+  """Abstract class representing a dataset with no inputs."""
 
   def _inputs(self):
     return []
 
 
-class UnaryDataset(Dataset):
+class UnaryDataset(DatasetV2):
   """Abstract class representing a dataset with one input."""
 
   def __init__(self, input_dataset):
@@ -1562,6 +1779,14 @@ class UnaryDataset(Dataset):
     return [self._input_dataset]
 
 
+class UnaryUnchangedStructureDataset(UnaryDataset):
+  """Represents a unary dataset with the same input and output structure."""
+
+  @property
+  def _element_structure(self):
+    return self._input_dataset._element_structure  # pylint: disable=protected-access
+
+
 class TensorDataset(DatasetSource):
   """A `Dataset` with a single element, viz. a nested structure of tensors."""
 
@@ -1575,31 +1800,16 @@ class TensorDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-
-    self._tensors = sparse.serialize_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape() for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
+    self._structure = structure_lib.Structure.from_value(tensors)
+    self._tensors = self._structure._to_tensor_list(tensors)  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class TensorSliceDataset(DatasetSource):
@@ -1615,37 +1825,26 @@ class TensorSliceDataset(DatasetSource):
               t, name="component_%d" % i)
           for i, t in enumerate(nest.flatten(tensors))
       ])
-      flat_tensors = nest.flatten(tensors)
+
+    batched_structure = structure_lib.Structure.from_value(tensors)
+    # pylint: disable=protected-access
+    self._tensors = batched_structure._to_batched_tensor_list(tensors)
+    self._structure = batched_structure._unbatch()
+    # pylint: enable=protected-access
 
     batch_dim = tensor_shape.Dimension(tensor_shape.dimension_value(
-        flat_tensors[0].get_shape()[0]))
-    for t in flat_tensors[1:]:
+        self._tensors[0].get_shape()[0]))
+    for t in self._tensors[1:]:
       batch_dim.assert_is_compatible_with(tensor_shape.Dimension(
           tensor_shape.dimension_value(t.get_shape()[0])))
-    self._tensors = sparse.serialize_many_sparse_tensors(tensors)
-    self._output_classes = sparse.get_classes(tensors)
-    self._output_shapes = nest.pack_sequence_as(
-        tensors, [t.get_shape()[1:] for t in nest.flatten(tensors)])
-    self._output_types = nest.pack_sequence_as(
-        tensors, [t.dtype for t in nest.flatten(tensors)])
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.tensor_slice_dataset(
-        nest.flatten(self._tensors),
-        output_shapes=nest.flatten(
-            sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+        self._tensors, output_shapes=self._structure._flat_shapes)  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
 class SparseTensorSliceDataset(DatasetSource):
@@ -1658,111 +1857,105 @@ class SparseTensorSliceDataset(DatasetSource):
       raise TypeError("`sparse_tensor` must be a `tf.SparseTensor` object.")
     self._sparse_tensor = sparse_tensor
 
+    indices_shape = self._sparse_tensor.indices.get_shape()
+    shape_shape = self._sparse_tensor.dense_shape.get_shape()
+    rank = (indices_shape.dims[1] - 1).merge_with(shape_shape.dims[0] - 1)
+    self._structure = structure_lib.NestedStructure(
+        (structure_lib.TensorStructure(dtypes.int64, [None, rank]),
+         structure_lib.TensorStructure(self._sparse_tensor.dtype, [None]),
+         structure_lib.TensorStructure(dtypes.int64, [rank])))
+
   def _as_variant_tensor(self):
     return gen_dataset_ops.sparse_tensor_slice_dataset(
         self._sparse_tensor.indices, self._sparse_tensor.values,
         self._sparse_tensor.dense_shape)
 
   @property
-  def output_classes(self):
-    return (ops.Tensor, ops.Tensor, ops.Tensor)
+  def _element_structure(self):
+    return self._structure
 
-  @property
-  def output_shapes(self):
-    indices_shape = self._sparse_tensor.indices.get_shape()
-    shape_shape = self._sparse_tensor.dense_shape.get_shape()
-    rank = (indices_shape.dims[1] - 1).merge_with(shape_shape.dims[0] - 1)
-    num_values = tensor_shape.Dimension(None)
-    return (tensor_shape.TensorShape([num_values, rank]),
-            tensor_shape.TensorShape([num_values]),
-            tensor_shape.TensorShape([rank]))
 
-  @property
-  def output_types(self):
-    return (dtypes.int64, self._sparse_tensor.dtype, dtypes.int64)
+class _VariantDataset(DatasetV2):
+  """A Dataset wrapper around a `tf.variant`-typed function argument."""
 
+  def __init__(self, dataset_variant, structure):
+    super(_VariantDataset, self).__init__()
+    self._dataset_variant = dataset_variant
+    self._structure = structure
 
-class _NestedDatasetComponent(object):
-  """The structure of a `Dataset` nested in a component of another `Dataset`.
+  def _as_variant_tensor(self):
+    return self._dataset_variant
 
-  A `StructuredFunctionWrapper` around a function that returns a `Dataset` as
-  one of its components will have a `NestedDatasetComponent` in the
-  corresponding position in the `output_classes`, `output_shapes`, and
-  `output_types` properties.
+  def _inputs(self):
+    return []
 
-  NOTE(mrry): This class is not currently exposed via the public API. Support
-  for nested datasets can be enabled on a function-by-function basis by setting
-  `experimental_nested_dataset_support=True` in the `StructuredFunctionWrapper`
-  initializer.
+  @property
+  def _element_structure(self):
+    return self._structure
 
-  TODO(b/110122868): Add this class, or something equivalent, to the public API.
-  We are considering revising the public API for accessing Dataset structure
-  (`output_classes` etc.) based on experience with nested datasets and other
-  custom component types.
-  """
 
-  def __init__(self,
-               dataset=None,
-               output_shapes=None,
-               output_types=None,
-               output_classes=None):
-    if dataset is None:
-      if (output_classes is None or output_shapes is None or
-          output_types is None):
-        raise ValueError(
-            "Either `dataset`, or all of `output_classes`, "
-            "`output_shapes`, and `output_types` must be specified.")
-      self._output_classes = output_classes
-      self._output_shapes = output_shapes
-      self._output_types = output_types
-    else:
-      if not (output_classes is None and output_shapes is None and
-              output_types is None):
-        raise ValueError(
-            "Either `dataset`, or all of `output_classes`, "
-            "`output_shapes`, and `output_types` must be specified.")
-      self._output_classes = dataset.output_classes
-      self._output_shapes = dataset.output_shapes
-      self._output_types = dataset.output_types
+@tf_export("data.experimental.DatasetStructure")
+class DatasetStructure(structure_lib.Structure):
+  """Represents a `Dataset` of structured values."""
 
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def __init__(self, element_structure):
+    self._element_structure = element_structure
 
   @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _flat_shapes(self):
+    return [tensor_shape.scalar()]
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _flat_types(self):
+    return [dtypes.variant]
 
+  def is_compatible_with(self, other):
+    # pylint: disable=protected-access
+    return (isinstance(other, DatasetStructure) and
+            self._element_structure.is_compatible_with(
+                other._element_structure))
 
-class _VariantDataset(Dataset):
-  """A Dataset wrapper around a `tf.variant`-typed function argument."""
+  def _to_tensor_list(self, value):
+    return [value._as_variant_tensor()]  # pylint: disable=protected-access
 
-  def __init__(self, dataset_variant, structure):
-    super(_VariantDataset, self).__init__()
-    self._dataset_variant = dataset_variant
-    self._structure = structure
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
 
-  def _as_variant_tensor(self):
-    return self._dataset_variant
+  def _from_tensor_list(self, flat_value):
+    if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
+        not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
+      raise ValueError(
+          "DatasetStructure corresponds to a single tf.variant scalar.")
+    return self._from_compatible_tensor_list(flat_value)
 
-  def _inputs(self):
-    return []
+  def _from_compatible_tensor_list(self, flat_value):
+    # pylint: disable=protected-access
+    return _VariantDataset(flat_value[0], self._element_structure)
 
-  @property
-  def output_classes(self):
-    return self._structure.output_classes
+  @staticmethod
+  def from_value(value):
+    return DatasetStructure(value._element_structure)  # pylint: disable=protected-access
+
+  def _to_legacy_output_types(self):
+    return self
+
+  def _to_legacy_output_shapes(self):
+    return self
+
+  def _to_legacy_output_classes(self):
+    return self
+
+  def _batch(self, batch_size):
+    raise NotImplementedError("Batching for `tf.data.Dataset` objects.")
+
+  def _unbatch(self):
+    raise NotImplementedError("Unbatching for `tf.data.Dataset` objects.")
 
-  @property
-  def output_shapes(self):
-    return self._structure.output_shapes
 
-  @property
-  def output_types(self):
-    return self._structure.output_types
+# pylint: disable=protected-access
+structure_lib.Structure._register_custom_converter(DatasetV2,
+                                                   DatasetStructure.from_value)
+# pylint: enable=protected-access
 
 
 class StructuredFunctionWrapper(object):
@@ -1776,8 +1969,9 @@ class StructuredFunctionWrapper(object):
                input_classes=None,
                input_shapes=None,
                input_types=None,
+               input_structure=None,
                add_to_graph=True,
-               experimental_nested_dataset_support=False):
+               defun_kwargs=None):
     """Creates a new `StructuredFunctionWrapper` for the given function.
 
     Args:
@@ -1794,30 +1988,40 @@ class StructuredFunctionWrapper(object):
         arguments.
       input_types: (Optional.) A nested structure of `tf.DType`. If given, this
         argument defines the element types and structure for `func` arguments.
+      input_structure: (Optional.) A `Structure` object. If given, this argument
+        defines the element types and structure for `func` arguments.
       add_to_graph: (Optional.) If `True`, the function will be added to the
         default graph.
-      experimental_nested_dataset_support: (Optional.) If `True`, the function
-        will support `tf.data.Dataset` objects as arguments and return values.
+      defun_kwargs: (Optional.) A dictionary mapping string argument names to
+        values. If supplied, will be passed to `function.Defun()` as keyword
+        arguments.
 
     Raises:
       ValueError: If an invalid combination of `dataset`, `input_classes`,
         `input_shapes`, and `input_types` is passed.
     """
-    if dataset is None:
-      if input_classes is None or input_shapes is None or input_types is None:
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = input_shapes
-      self._input_types = input_types
-      self._input_classes = input_classes
+    if input_structure is None:
+      if dataset is None:
+        if input_classes is None or input_shapes is None or input_types is None:
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = structure_lib.convert_legacy_structure(
+            input_types, input_shapes, input_classes)
+      else:
+        if not (input_classes is None and input_shapes is None and
+                input_types is None):
+          raise ValueError("Either `dataset`, `input_structure` or all of "
+                           "`input_classes`, `input_shapes`, and `input_types` "
+                           "must be specified.")
+        self._input_structure = dataset._element_structure  # pylint: disable=protected-access
     else:
-      if not (input_classes is None and input_shapes is None and
-              input_types is None):
-        raise ValueError("Either `dataset`, or all of `input_classes`, "
-                         "`input_shapes`, and `input_types` must be specified.")
-      self._input_shapes = dataset.output_shapes
-      self._input_types = dataset.output_types
-      self._input_classes = dataset.output_classes
+      if not (dataset is None and input_classes is None and input_shapes is None
+              and input_types is None):
+        raise ValueError("Either `dataset`, `input_structure`, or all of "
+                         "`input_classes`, `input_shapes`, and `input_types` "
+                         "must be specified.")
+      self._input_structure = input_structure
 
     self._transformation_name = transformation_name
     readable_transformation_name = transformation_name.replace(
@@ -1826,35 +2030,18 @@ class StructuredFunctionWrapper(object):
         readable_transformation_name,
         function_utils.get_func_name(func),
         str(ops.uid())
-
     ])
 
-    # TODO(b/110122868): Enable this support for all `tf.data` functions.
-    self._nested_dataset_support = experimental_nested_dataset_support
+    if defun_kwargs is None:
+      defun_kwargs = {}
 
-    @function.Defun(*self._defun_args(), func_name=self._func_name)
+    @function.Defun(
+        *self._input_structure._flat_types, func_name=self._func_name,  # pylint: disable=protected-access
+        **defun_kwargs)
     def tf_data_structured_function_wrapper(*args):
       """Wrapper for passing nested structures to and from tf.data functions."""
-      flat_args = []
-      for arg, arg_class, arg_shape, arg_type in zip(
-          args,
-          nest.flatten(self._input_classes),
-          nest.flatten(self._input_shapes),
-          nest.flatten(self._input_types)):
-        # TODO(b/110122868): Add a registration mechanism for new component
-        # types.
-        if arg_class is sparse_tensor_lib.SparseTensor:
-          arg = sparse.deserialize_sparse_tensors(
-              arg, arg_type, arg_shape, arg_class)
-          arg.indices.set_shape([None, arg_shape.ndims])
-          arg.dense_shape.set_shape([arg_shape.ndims])
-        elif isinstance(arg_class, _NestedDatasetComponent):
-          assert self._nested_dataset_support
-          arg = _VariantDataset(arg, arg_class)
-        else:
-          arg.set_shape(arg_shape)
-        flat_args.append(arg)
-      nested_args = nest.pack_sequence_as(self._input_classes, flat_args)
+      # pylint: disable=protected-access
+      nested_args = self._input_structure._from_compatible_tensor_list(args)
       if not _should_unpack_args(nested_args):
         nested_args = (nested_args,)
 
@@ -1872,55 +2059,14 @@ class StructuredFunctionWrapper(object):
       if isinstance(ret, list):
         ret = tuple(ret)
 
-      # Convert any `SparseTensorValue`s to `SparseTensor`s and all other
-      # values to tensors.
-      flat_ret = []
-      flat_classes = []
-      flat_shapes = []
-      flat_types = []
-      for t in nest.flatten(ret):
-        # TODO(b/110122868): Add a registration mechanism for new component
-        # types.
-        if sparse_tensor_lib.is_sparse(t):
-          t = sparse_tensor_lib.SparseTensor.from_value(t)
-          flat_ret.append(sparse.serialize_sparse_tensors(t))
-          flat_classes.append(sparse_tensor_lib.SparseTensor)
-          flat_shapes.append(t.get_shape())
-          flat_types.append(t.dtype)
-        elif isinstance(t, Dataset):
-          if not self._nested_dataset_support:
-            raise NotImplementedError(
-                "The %s transformation does not currently support nested "
-                "datasets as outputs." % self._transformation_name)
-
-          flat_ret.append(t._as_variant_tensor())  # pylint: disable=protected-access
-          component = _NestedDatasetComponent(t)
-          flat_classes.append(component)
-          flat_shapes.append(component)
-          flat_types.append(component)
-          if t.options() != Options():
-            warnings.warn("Encountered a nested dataset with non-default "
-                          "options. These options will not be propagated to "
-                          "the outer dataset.")
-        else:
-          try:
-            t = ops.convert_to_tensor(t)
-          except (ValueError, TypeError):
-            raise TypeError("Unsupported return value from function passed to "
-                            "%s: %s." % (transformation_name, t))
-          flat_ret.append(t)
-          flat_classes.append(ops.Tensor)
-          flat_shapes.append(t.get_shape())
-          flat_types.append(t.dtype)
-
-      ret = nest.pack_sequence_as(ret, flat_ret)
-      self._output_classes = nest.pack_sequence_as(ret, flat_classes)
-      self._output_shapes = nest.pack_sequence_as(ret, flat_shapes)
-      self._output_types = nest.pack_sequence_as(ret, flat_types)
+      try:
+        self._output_structure = structure_lib.Structure.from_value(ret)
+      except (ValueError, TypeError):
+        raise TypeError("Unsupported return value from function passed to "
+                        "%s: %s." % (transformation_name, ret))
 
       _warn_if_collections(transformation_name)
-
-      return flat_ret
+      return self._output_structure._to_tensor_list(ret)
 
     self._function = tf_data_structured_function_wrapper
     if add_to_graph:
@@ -1931,36 +2077,21 @@ class StructuredFunctionWrapper(object):
       # in case (e.g.) we need to rerun the function.
       self._function._create_definition_if_needed()  # pylint: disable=protected-access
 
-  def _defun_args(self):
-    """Returns a flat list of `tf.DType` for the input element structure."""
-    ret = []
-    for input_type, input_class in zip(nest.flatten(self._input_types),
-                                       nest.flatten(self._input_classes)):
-      # TODO(b/110122868): Add a registration mechanism for new component types.
-      if input_class is sparse_tensor_lib.SparseTensor:
-        ret.append(dtypes.variant)
-      elif isinstance(input_class, _NestedDatasetComponent):
-        if not self._nested_dataset_support:
-          raise NotImplementedError(
-              "The %s transformation does not currently support nested "
-              "datasets as inputs." % self._transformation_name)
-        ret.append(dtypes.variant)
-      else:
-        assert isinstance(input_type, dtypes.DType)
-        ret.append(input_type)
-    return ret
+  @property
+  def output_structure(self):
+    return self._output_structure
 
   @property
   def output_classes(self):
-    return self._output_classes
+    return self._output_structure._to_legacy_output_classes()  # pylint: disable=protected-access
 
   @property
   def output_shapes(self):
-    return self._output_shapes
+    return self._output_structure._to_legacy_output_shapes()  # pylint: disable=protected-access
 
   @property
   def output_types(self):
-    return self._output_types
+    return self._output_structure._to_legacy_output_types()  # pylint: disable=protected-access
 
   @property
   def function(self):
@@ -1983,30 +2114,11 @@ def flat_structure(dataset):
     A dictionary of keyword arguments that can be passed to many Dataset op
     constructors.
   """
-  output_classes = []
-  output_shapes = []
-  output_types = []
-  for output_class, output_shape, output_type in zip(
-      nest.flatten(dataset.output_classes), nest.flatten(dataset.output_shapes),
-      nest.flatten(dataset.output_types)):
-    if isinstance(output_class, _NestedDatasetComponent):
-      output_classes.append(output_class.output_classes)
-      output_shapes.append(output_shape.output_shapes)
-      output_types.append(output_type.output_types)
-    else:
-      output_classes.append(output_class)
-      output_shapes.append(output_shape)
-      output_types.append(output_type)
-
-  output_classes = nest.pack_sequence_as(dataset.output_classes, output_classes)
-  output_shapes = nest.pack_sequence_as(dataset.output_shapes, output_shapes)
-  output_types = nest.pack_sequence_as(dataset.output_types, output_types)
-
+  # pylint: disable=protected-access
+  structure = dataset._element_structure
   return {
-      "output_shapes":
-          nest.flatten(sparse.as_dense_shapes(output_shapes, output_classes)),
-      "output_types":
-          nest.flatten(sparse.as_dense_types(output_types, output_classes)),
+      "output_shapes": structure._flat_shapes,
+      "output_types": structure._flat_types,
   }
 
 
@@ -2029,83 +2141,52 @@ class _GeneratorDataset(DatasetSource):
         destroyed. The return value is ignored.
     """
     super(_GeneratorDataset, self).__init__()
-    # These members will be initialized by `tf_init_func`.
-    self._state_classes = None
-    self._state_shapes = None
-    self._state_types = None
-
     self._init_args = init_args
 
-    init_args_classes = sparse.get_classes(init_args)
-    init_args_shapes = nest.pack_sequence_as(
-        init_args, [t.get_shape() for t in nest.flatten(init_args)])
-    init_args_types = nest.pack_sequence_as(
-        init_args, [t.dtype for t in nest.flatten(init_args)])
+    self._init_structure = structure_lib.Structure.from_value(init_args)
 
-    wrapped_init_func = StructuredFunctionWrapper(
+    self._init_func = StructuredFunctionWrapper(
         init_func,
         self._transformation_name(),
-        input_classes=init_args_classes,
-        input_shapes=init_args_shapes,
-        input_types=init_args_types)
-    self._state_classes = wrapped_init_func.output_classes
-    self._state_shapes = wrapped_init_func.output_shapes
-    self._state_types = wrapped_init_func.output_types
-    self._init_func = wrapped_init_func.function
-
-    wrapped_next_func = StructuredFunctionWrapper(
+        input_structure=self._init_structure)
+
+    self._next_func = StructuredFunctionWrapper(
         next_func,
         self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._output_classes = wrapped_next_func.output_classes
-    self._output_shapes = wrapped_next_func.output_shapes
-    self._output_types = wrapped_next_func.output_types
-    self._next_func = wrapped_next_func.function
-
-    wrapped_finalize_func = StructuredFunctionWrapper(
+        input_structure=self._init_func.output_structure)
+
+    self._finalize_func = StructuredFunctionWrapper(
         finalize_func,
         self._transformation_name(),
-        input_classes=self._state_classes,
-        input_shapes=self._state_shapes,
-        input_types=self._state_types)
-    self._finalize_func = wrapped_finalize_func.function
+        input_structure=self._init_func.output_structure)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.generator_dataset(
-        nest.flatten(self._init_args) + self._init_func.captured_inputs,
-        self._next_func.captured_inputs,
-        self._finalize_func.captured_inputs,
-        init_func=self._init_func,
-        next_func=self._next_func,
-        finalize_func=self._finalize_func,
+        self._init_structure._to_tensor_list(self._init_args)  # pylint: disable=protected-access
+        + self._init_func.function.captured_inputs,
+        self._next_func.function.captured_inputs,
+        self._finalize_func.function.captured_inputs,
+        init_func=self._init_func.function,
+        next_func=self._next_func.function,
+        finalize_func=self._finalize_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._next_func.output_structure
 
   def _transformation_name(self):
     return "Dataset.from_generator()"
 
 
-class ZipDataset(Dataset):
+class ZipDataset(DatasetV2):
   """A `Dataset` that zips its inputs together."""
 
   def __init__(self, datasets):
     """See `Dataset.zip()` for details."""
     super(ZipDataset, self).__init__()
     for ds in nest.flatten(datasets):
-      if not isinstance(ds, Dataset):
+      if not isinstance(ds, DatasetV2):
         if isinstance(ds, list):
           message = ("The argument to `Dataset.zip()` must be a nested "
                      "structure of `Dataset` objects. Nested structures do not "
@@ -2115,6 +2196,10 @@ class ZipDataset(Dataset):
                      "structure of `Dataset` objects.")
         raise TypeError(message)
     self._datasets = datasets
+    self._structure = structure_lib.NestedStructure(
+        nest.pack_sequence_as(
+            self._datasets,
+            [ds._element_structure for ds in nest.flatten(self._datasets)]))  # pylint: disable=protected-access
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
@@ -2127,25 +2212,11 @@ class ZipDataset(Dataset):
     return nest.flatten(self._datasets)
 
   @property
-  def output_classes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_classes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_shapes(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_shapes for ds in nest.flatten(self._datasets)])
-
-  @property
-  def output_types(self):
-    return nest.pack_sequence_as(
-        self._datasets,
-        [ds.output_types for ds in nest.flatten(self._datasets)])
+  def _element_structure(self):
+    return self._structure
 
 
-class ConcatenateDataset(Dataset):
+class ConcatenateDataset(DatasetV2):
   """A `Dataset` that concatenates its input with given dataset."""
 
   def __init__(self, input_dataset, dataset_to_concatenate):
@@ -2154,26 +2225,29 @@ class ConcatenateDataset(Dataset):
     self._input_dataset = input_dataset
     self._dataset_to_concatenate = dataset_to_concatenate
 
-    self._output_types = input_dataset.output_types
-    if self._output_types != dataset_to_concatenate.output_types:
+    output_types = input_dataset.output_types
+    if output_types != dataset_to_concatenate.output_types:
       raise TypeError(
           "Two datasets to concatenate have different types %s and %s" %
-          (self._output_types, dataset_to_concatenate.output_types))
+          (output_types, dataset_to_concatenate.output_types))
 
-    self._output_classes = input_dataset.output_classes
-    if self._output_classes != dataset_to_concatenate.output_classes:
+    output_classes = input_dataset.output_classes
+    if output_classes != dataset_to_concatenate.output_classes:
       raise TypeError(
           "Two datasets to concatenate have different classes %s and %s" %
-          (self._output_classes, dataset_to_concatenate.output_classes))
+          (output_classes, dataset_to_concatenate.output_classes))
 
     input_shapes = self._input_dataset.output_shapes
-    self._output_shapes = nest.pack_sequence_as(input_shapes, [
+    output_shapes = nest.pack_sequence_as(input_shapes, [
         ts1.most_specific_compatible_shape(ts2)
         for (ts1, ts2) in zip(
             nest.flatten(input_shapes),
             nest.flatten(self._dataset_to_concatenate.output_shapes))
     ])
 
+    self._structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
+
     self._input_datasets = [input_dataset, dataset_to_concatenate]
 
   def _as_variant_tensor(self):
@@ -2188,19 +2262,11 @@ class ConcatenateDataset(Dataset):
     return [self._input_dataset, self._dataset_to_concatenate]
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
-class RepeatDataset(UnaryDataset):
+class RepeatDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that repeats its input several times."""
 
   def __init__(self, input_dataset, count):
@@ -2219,18 +2285,6 @@ class RepeatDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class RangeDataset(DatasetSource):
   """A `Dataset` of a step separated range of values."""
@@ -2268,19 +2322,11 @@ class RangeDataset(DatasetSource):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.int64
+  def _element_structure(self):
+    return structure_lib.TensorStructure(dtypes.int64, [])
 
 
-class CacheDataset(UnaryDataset):
+class CacheDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that caches elements of its input."""
 
   def __init__(self, input_dataset, filename):
@@ -2296,20 +2342,8 @@ class CacheDataset(UnaryDataset):
         filename=self._filename,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class ShuffleDataset(UnaryDataset):
+class ShuffleDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that randomly shuffles the elements of its input."""
 
   def __init__(self,
@@ -2357,20 +2391,8 @@ class ShuffleDataset(UnaryDataset):
         reshuffle_each_iteration=self._reshuffle_each_iteration,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-
-class TakeDataset(UnaryDataset):
+class TakeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` containing the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2385,20 +2407,8 @@ class TakeDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class SkipDataset(UnaryDataset):
+class SkipDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` skipping the first `count` elements from its input."""
 
   def __init__(self, input_dataset, count):
@@ -2413,18 +2423,6 @@ class SkipDataset(UnaryDataset):
         count=self._count,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class BatchDataset(UnaryDataset):
   """A `Dataset` that batches contiguous elements from its input."""
@@ -2438,37 +2436,26 @@ class BatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
-  def _as_variant_tensor(self):
-    # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
-    if smart_cond.smart_constant_value(self._drop_remainder) is False:
-      return gen_dataset_ops.batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          **flat_structure(self))
+    constant_drop_remainder = tensor_util.constant_value(self._drop_remainder)
+    # pylint: disable=protected-access
+    if constant_drop_remainder:
+      # NOTE(mrry): `constant_drop_remainder` may be `None` (unknown statically)
+      # or `False` (explicitly retaining the remainder).
+      self._structure = input_dataset._element_structure._batch(
+          tensor_util.constant_value(self._batch_size))
     else:
-      return gen_dataset_ops.batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-          batch_size=self._batch_size,
-          drop_remainder=self._drop_remainder,
-          **flat_structure(self))
+      self._structure = input_dataset._element_structure._batch(None)
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    input_shapes = self._input_dataset.output_shapes
-    return nest.pack_sequence_as(input_shapes, [
-        tensor_shape.vector(
-            tensor_util.constant_value(self._batch_size) if smart_cond.
-            smart_constant_value(self._drop_remainder) else None).concatenate(s)
-        for s in nest.flatten(self._input_dataset.output_shapes)
-    ])
+  def _as_variant_tensor(self):
+    return gen_dataset_ops.batch_dataset_v2(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        batch_size=self._batch_size,
+        drop_remainder=self._drop_remainder,
+        **flat_structure(self))
 
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _is_padded_shape_compatible_with(padded_shape, input_component_shape):
@@ -2617,22 +2604,34 @@ class PaddedBatchDataset(UnaryDataset):
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
 
+    def _padded_shape_to_batch_shape(s):
+      return tensor_shape.vector(
+          tensor_util.constant_value(self._batch_size) if smart_cond.
+          smart_constant_value(self._drop_remainder) else None).concatenate(
+              tensor_util.constant_value_as_shape(s))
+
+    output_shapes = nest.map_structure(
+        _padded_shape_to_batch_shape, self._padded_shapes)
+    self._structure = structure_lib.convert_legacy_structure(
+        self._input_dataset.output_types, output_shapes,
+        self._input_dataset.output_classes)
+
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     # TODO(jsimsa): Switch to using v2 only any time after 6/30/2018.
     if smart_cond.smart_constant_value(self._drop_remainder) is False:
       return gen_dataset_ops.padded_batch_dataset(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
               for s in nest.flatten(self._padded_shapes)
           ],
           padding_values=nest.flatten(self._padding_values),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
+          output_shapes=self._structure._flat_shapes)
     else:
       return gen_dataset_ops.padded_batch_dataset_v2(
-          self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+          self._input_dataset._as_variant_tensor(),
           batch_size=self._batch_size,
           padded_shapes=[
               ops.convert_to_tensor(s, dtype=dtypes.int64)
@@ -2640,27 +2639,11 @@ class PaddedBatchDataset(UnaryDataset):
           ],
           padding_values=nest.flatten(self._padding_values),
           drop_remainder=self._drop_remainder,
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(self.output_shapes, self.output_classes)))
-
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-
-    def _padded_shape_to_batch_shape(s):
-      return tensor_shape.vector(
-          tensor_util.constant_value(self._batch_size) if smart_cond.
-          smart_constant_value(self._drop_remainder) else None).concatenate(
-              tensor_util.constant_value_as_shape(s))
-
-    return nest.map_structure(_padded_shape_to_batch_shape, self._padded_shapes)
+          output_shapes=self._structure._flat_shapes)
 
   @property
-  def output_types(self):
-    return self._input_dataset.output_types
+  def _element_structure(self):
+    return self._structure
 
 
 def _should_unpack_args(args):
@@ -2689,71 +2672,40 @@ def _warn_if_collections(transformation_name):
 class MapDataset(UnaryDataset):
   """A `Dataset` that maps a function over elements in its input."""
 
-  def __init__(self, input_dataset, map_func, use_inter_op_parallelism=True):
+  def __init__(self,
+               input_dataset,
+               map_func,
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
     super(MapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
     self._use_inter_op_parallelism = use_inter_op_parallelism
-
-    wrapped_func = StructuredFunctionWrapper(
+    self._preserve_cardinality = preserve_cardinality
+    self._map_func = StructuredFunctionWrapper(
         map_func, self._transformation_name(), dataset=input_dataset)
-    self._output_classes = wrapped_func.output_classes
-    self._output_shapes = wrapped_func.output_shapes
-    self._output_types = wrapped_func.output_types
-    self._map_func = wrapped_func.function
 
   def _as_variant_tensor(self):
     input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     return gen_dataset_ops.map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
+  def _functions(self):
+    return [self._map_func]
 
   @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._map_func.output_structure
 
   def _transformation_name(self):
     return "Dataset.map()"
 
 
-class MatchingFilesDataset(Dataset):
-  """A `Dataset` that list the files according to the input patterns."""
-
-  def __init__(self, patterns):
-    super(MatchingFilesDataset, self).__init__()
-    self._patterns = ops.convert_to_tensor(
-        patterns, dtype=dtypes.string, name="patterns")
-
-  def _as_variant_tensor(self):
-    return gen_dataset_ops.matching_files_dataset(self._patterns)
-
-  def _inputs(self):
-    return []
-
-  @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
-
-  @property
-  def output_types(self):
-    return dtypes.string
-
-
 class ParallelMapDataset(MapDataset):
   """A `Dataset` that maps a function over elements in its input in parallel."""
 
@@ -2761,25 +2713,26 @@ class ParallelMapDataset(MapDataset):
                input_dataset,
                map_func,
                num_parallel_calls,
-               use_inter_op_parallelism=True):
+               use_inter_op_parallelism=True,
+               preserve_cardinality=False):
     """See `Dataset.map()` for details."""
-    super(ParallelMapDataset, self).__init__(input_dataset, map_func,
-                                             use_inter_op_parallelism)
+    super(ParallelMapDataset, self).__init__(
+        input_dataset, map_func, use_inter_op_parallelism, preserve_cardinality)
 
     self._num_parallel_calls = ops.convert_to_tensor(
         num_parallel_calls, dtype=dtypes.int32, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
-    input_t = self._input_dataset._as_variant_tensor()  # pylint: disable=protected-access
     # pylint: disable=protected-access
+    input_t = self._input_dataset._as_variant_tensor()
     return gen_dataset_ops.parallel_map_dataset(
         input_t,
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         num_parallel_calls=self._num_parallel_calls,
         use_inter_op_parallelism=self._use_inter_op_parallelism,
+        preserve_cardinality=self._preserve_cardinality,
         **flat_structure(self))
-    # pylint: enable=protected-access
 
 
 class FlatMapDataset(UnaryDataset):
@@ -2790,36 +2743,25 @@ class FlatMapDataset(UnaryDataset):
     super(FlatMapDataset, self).__init__(input_dataset)
     self._input_dataset = input_dataset
 
-    wrapped_func = StructuredFunctionWrapper(
-        map_func,
-        self._transformation_name(),
-        dataset=input_dataset,
-        experimental_nested_dataset_support=True)
-    if not isinstance(wrapped_func.output_classes, _NestedDatasetComponent):
+    self._map_func = StructuredFunctionWrapper(
+        map_func, self._transformation_name(), dataset=input_dataset)
+    if not isinstance(self._map_func.output_structure, DatasetStructure):
       raise TypeError("`map_func` must return a `Dataset` object.")
-    self._output_classes = wrapped_func.output_classes.output_classes
-    self._output_types = wrapped_func.output_types.output_types
-    self._output_shapes = wrapped_func.output_shapes.output_shapes
-    self._map_func = wrapped_func.function
+    self._structure = self._map_func.output_structure._element_structure  # pylint: disable=protected-access
+
+  def _functions(self):
+    return [self._map_func]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.flat_map_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,
-        f=self._map_func,
+        self._map_func.function.captured_inputs,
+        f=self._map_func.function,
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
   def _transformation_name(self):
     return "Dataset.flat_map()"
@@ -2838,12 +2780,13 @@ class InterleaveDataset(FlatMapDataset):
         block_length, dtype=dtypes.int64, name="block_length")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.interleave_dataset(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
@@ -2867,20 +2810,21 @@ class ParallelInterleaveDataset(FlatMapDataset):
         num_parallel_calls, dtype=dtypes.int64, name="num_parallel_calls")
 
   def _as_variant_tensor(self):
+    # pylint: disable=protected-access
     return gen_dataset_ops.parallel_interleave_dataset_v2(
-        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        self._map_func.captured_inputs,  # pylint: disable=protected-access
+        self._input_dataset._as_variant_tensor(),
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._num_parallel_calls,
-        f=self._map_func,  # pylint: disable=protected-access
+        f=self._map_func.function,
         **flat_structure(self))
 
   def _transformation_name(self):
     return "Dataset.interleave()"
 
 
-class FilterDataset(UnaryDataset):
+class FilterDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that filters its input according to a predicate function."""
 
   def __init__(self, input_dataset, predicate):
@@ -2889,36 +2833,26 @@ class FilterDataset(UnaryDataset):
     self._input_dataset = input_dataset
     wrapped_func = StructuredFunctionWrapper(
         predicate, self._transformation_name(), dataset=input_dataset)
-    if not (
-        wrapped_func.output_types == dtypes.bool and
-        wrapped_func.output_shapes.is_compatible_with(tensor_shape.scalar())):
+    if not wrapped_func.output_structure.is_compatible_with(
+        structure_lib.TensorStructure(dtypes.bool, [])):
       raise ValueError("`predicate` must return a scalar boolean tensor.")
-    self._predicate = wrapped_func.function
+    self._predicate = wrapped_func
+
+  def _functions(self):
+    return [self._predicate]
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.filter_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
-        other_arguments=self._predicate.captured_inputs,
-        predicate=self._predicate,
+        other_arguments=self._predicate.function.captured_inputs,
+        predicate=self._predicate.function,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
   def _transformation_name(self):
     return "Dataset.filter()"
 
 
-class PrefetchDataset(UnaryDataset):
+class PrefetchDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that asynchronously prefetches its input."""
 
   def __init__(self, input_dataset, buffer_size):
@@ -2936,18 +2870,6 @@ class PrefetchDataset(UnaryDataset):
         buffer_size=self._buffer_size,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
 class WindowDataset(UnaryDataset):
   """A dataset that creates window datasets from the input elements."""
@@ -2962,20 +2884,17 @@ class WindowDataset(UnaryDataset):
         stride, dtype=dtypes.int64, name="stride")
     self._drop_remainder = ops.convert_to_tensor(
         drop_remainder, dtype=dtypes.bool, name="drop_remainder")
-    self._output_classes = nest.pack_sequence_as(
+    nest_of_structures = nest.pack_sequence_as(
         input_dataset.output_classes,
         [
-            _NestedDatasetComponent(  # pylint: disable=protected-access
-                output_classes=output_class,
-                output_shapes=output_shape,
-                output_types=output_type)
+            DatasetStructure(structure_lib.convert_legacy_structure(
+                output_type, output_shape, output_class))
             for output_class, output_shape, output_type in zip(
                 nest.flatten(input_dataset.output_classes),
                 nest.flatten(input_dataset.output_shapes),
                 nest.flatten(input_dataset.output_types))
         ])
-    self._output_shapes = self._output_classes
-    self._output_types = self._output_classes
+    self._structure = structure_lib.NestedStructure(nest_of_structures)
 
   def _as_variant_tensor(self):
     return gen_dataset_ops.window_dataset(
@@ -2987,19 +2906,11 @@ class WindowDataset(UnaryDataset):
         **flat_structure(self))
 
   @property
-  def output_classes(self):
-    return self._output_classes
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_types(self):
-    return self._output_types
+  def _element_structure(self):
+    return self._structure
 
 
-class _OptionsDataset(UnaryDataset):
+class _OptionsDataset(UnaryUnchangedStructureDataset):
   """An identity `Dataset` that stores options."""
 
   def __init__(self, input_dataset, options):
@@ -3017,20 +2928,8 @@ class _OptionsDataset(UnaryDataset):
   def options(self):
     return self._options
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
 
-
-class _ModelDataset(UnaryDataset):
+class _ModelDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and models performance."""
 
   def __init__(self, input_dataset):
@@ -3043,20 +2942,8 @@ class _ModelDataset(UnaryDataset):
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _OptimizeDataset(UnaryDataset):
+class _OptimizeDataset(UnaryUnchangedStructureDataset):
   """A `Dataset` that acts as an identity, and applies optimizations."""
 
   def __init__(self, input_dataset, optimizations):
@@ -3074,21 +2961,9 @@ class _OptimizeDataset(UnaryDataset):
         self._optimizations,
         **flat_structure(self))
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
-
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
-
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
-
 
-class _SetStatsAggregatorDataset(UnaryDataset):
-  """A `Dataset` that acts as an identity, and sets stats aggregator."""
+class _SetStatsAggregatorDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, and sets a stats aggregator."""
 
   def __init__(self, input_dataset, aggregator, prefix, counter_prefix):
     super(_SetStatsAggregatorDataset, self).__init__(input_dataset)
@@ -3098,21 +2973,43 @@ class _SetStatsAggregatorDataset(UnaryDataset):
     self._counter_prefix = counter_prefix
 
   def _as_variant_tensor(self):
-    return gen_dataset_ops.set_stats_aggregator_dataset(
+    return ged_ops.experimental_set_stats_aggregator_dataset(
         self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
         self._stats_aggregator._resource,  # pylint: disable=protected-access
         self._prefix,
         self._counter_prefix,
         **flat_structure(self))
 
-  @property
-  def output_shapes(self):
-    return self._input_dataset.output_shapes
 
-  @property
-  def output_types(self):
-    return self._input_dataset.output_types
+class _MaxIntraOpParallelismDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, overriding intra-op parallelism."""
 
-  @property
-  def output_classes(self):
-    return self._input_dataset.output_classes
+  def __init__(self, input_dataset, max_intra_op_parallelism):
+    super(_MaxIntraOpParallelismDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._max_intra_op_parallelism = ops.convert_to_tensor(
+        max_intra_op_parallelism,
+        dtype=dtypes.int64,
+        name="max_intra_op_parallelism")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_max_intra_op_parallelism_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._max_intra_op_parallelism,
+        **flat_structure(self))
+
+
+class _PrivateThreadPoolDataset(UnaryUnchangedStructureDataset):
+  """A `Dataset` that acts as an identity, setting a private threadpool."""
+
+  def __init__(self, input_dataset, num_threads):
+    super(_PrivateThreadPoolDataset, self).__init__(input_dataset)
+    self._input_dataset = input_dataset
+    self._num_threads = ops.convert_to_tensor(
+        num_threads, dtype=dtypes.int64, name="num_threads")
+
+  def _as_variant_tensor(self):
+    return ged_ops.experimental_private_thread_pool_dataset(
+        self._input_dataset._as_variant_tensor(),  # pylint: disable=protected-access
+        self._num_threads,
+        **flat_structure(self))
diff --git a/tensorflow/python/data/ops/iterator_ops.py b/tensorflow/python/data/ops/iterator_ops.py
index 68b03ba93be6757636642af3d5e58f43786621f7..d0e91b01f9138470cd2a06a8b353149b74af2497 100644
--- a/tensorflow/python/data/ops/iterator_ops.py
+++ b/tensorflow/python/data/ops/iterator_ops.py
@@ -23,7 +23,6 @@ import warnings
 from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import optional_ops
 from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.data.util import structure as structure_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -68,7 +67,7 @@ def _device_stack_is_empty():
   return not bool(device_stack)
 
 
-@tf_export("data.Iterator")
+@tf_export(v1=["data.Iterator"])
 class Iterator(checkpointable.CheckpointableBase):
   """Represents the state of iterating through a `Dataset`."""
 
@@ -100,10 +99,8 @@ class Iterator(checkpointable.CheckpointableBase):
       raise ValueError("If `structure` is not specified, all of "
                        "`output_types`, `output_shapes`, and `output_classes`"
                        " must be specified.")
-    # pylint: disable=protected-access
-    self._structure = structure_lib.Structure._from_legacy_structure(
+    self._structure = structure_lib.convert_legacy_structure(
         output_types, output_shapes, output_classes)
-    # pylint: enable=protected-access
 
     self._string_handle = gen_dataset_ops.iterator_to_string_handle(
         self._iterator_resource)
@@ -190,34 +187,32 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     if shared_name is None:
       shared_name = ""
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_v2(
               container="",
               shared_name=shared_name,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_v2(
             container="",
             shared_name=shared_name,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator(
           container="",
           shared_name=shared_name,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -280,30 +275,28 @@ class Iterator(checkpointable.CheckpointableBase):
     if output_classes is None:
       output_classes = nest.map_structure(lambda _: ops.Tensor, output_types)
     nest.assert_same_structure(output_types, output_shapes)
+    output_structure = structure_lib.convert_legacy_structure(
+        output_types, output_shapes, output_classes)
     string_handle = ops.convert_to_tensor(string_handle, dtype=dtypes.string)
+    # pylint: disable=protected-access
     if compat.forward_compatible(2018, 8, 3):
       if _device_stack_is_empty():
         with ops.device("/cpu:0"):
           iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
               string_handle,
-              output_types=nest.flatten(
-                  sparse.as_dense_types(output_types, output_classes)),
-              output_shapes=nest.flatten(
-                  sparse.as_dense_shapes(output_shapes, output_classes)))
+              output_types=output_structure._flat_types,
+              output_shapes=output_structure._flat_shapes)
       else:
         iterator_resource = gen_dataset_ops.iterator_from_string_handle_v2(
             string_handle,
-            output_types=nest.flatten(
-                sparse.as_dense_types(output_types, output_classes)),
-            output_shapes=nest.flatten(
-                sparse.as_dense_shapes(output_shapes, output_classes)))
+            output_types=output_structure._flat_types,
+            output_shapes=output_structure._flat_shapes)
     else:
       iterator_resource = gen_dataset_ops.iterator_from_string_handle(
           string_handle,
-          output_types=nest.flatten(
-              sparse.as_dense_types(output_types, output_classes)),
-          output_shapes=nest.flatten(
-              sparse.as_dense_shapes(output_shapes, output_classes)))
+          output_types=output_structure._flat_types,
+          output_shapes=output_structure._flat_shapes)
+    # pylint: enable=protected-access
     return Iterator(iterator_resource, None, output_types, output_shapes,
                     output_classes)
 
@@ -530,8 +523,9 @@ class EagerIterator(checkpointable.CheckpointableBase):
     self._device = context.context().device_name
     with ops.device("/cpu:0"):
       # pylint: disable=protected-access
+      dataset = dataset._apply_options()
       ds_variant = dataset._as_variant_tensor()
-      self._structure = structure_lib.Structure._from_legacy_structure(
+      self._structure = structure_lib.convert_legacy_structure(
           dataset.output_types, dataset.output_shapes, dataset.output_classes)
       self._flat_output_types = self._structure._flat_types
       self._flat_output_shapes = self._structure._flat_shapes
@@ -543,6 +537,7 @@ class EagerIterator(checkpointable.CheckpointableBase):
         # Delete the resource when this object is deleted
         self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
             handle=self._resource, handle_device=self._device)
+      # pylint: enable=protected-access
 
   def __iter__(self):
     return self
diff --git a/tensorflow/python/data/ops/multi_device_iterator_ops.py b/tensorflow/python/data/ops/multi_device_iterator_ops.py
index 0f9add6461aeeb1e1d81dfb75fefb345b659c349..7586012574d39d7409e28f0d830a5fdadb25b61c 100644
--- a/tensorflow/python/data/ops/multi_device_iterator_ops.py
+++ b/tensorflow/python/data/ops/multi_device_iterator_ops.py
@@ -17,10 +17,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.experimental.ops import optimization_options
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.data.util import nest
-from tensorflow.python.data.util import sparse
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -36,16 +35,9 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
   """A `dummy` generator dataset."""
 
   def __init__(self, shard_num, multi_device_iterator_resource, incarnation_id,
-               source_device, target_device, output_shapes, output_types,
-               output_classes):
+               source_device, target_device, element_structure):
     self._target_device = target_device
-    self._output_types = output_types
-    self._output_shapes = output_shapes
-    self._output_classes = output_classes
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._output_shapes, self._output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._output_types, self._output_classes))
+    self._structure = element_structure
 
     multi_device_iterator_string_handle = (
         gen_dataset_ops.multi_device_iterator_to_string_handle(
@@ -70,17 +62,18 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
 
     @function.defun(input_signature=[tensor_spec.TensorSpec([], dtypes.string)])
     def _next_func(string_handle):
+      # pylint: disable=protected-access
       multi_device_iterator = (
           gen_dataset_ops.multi_device_iterator_from_string_handle(
               string_handle=string_handle,
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              output_types=self._structure._flat_types,
+              output_shapes=self._structure._flat_shapes))
       return gen_dataset_ops.multi_device_iterator_get_next_from_shard(
           multi_device_iterator=multi_device_iterator,
           shard_num=shard_num,
           incarnation_id=incarnation_id,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          output_types=self._structure._flat_types,
+          output_shapes=self._structure._flat_shapes)
 
     next_func_concrete = _next_func._get_concrete_function_internal()  # pylint: disable=protected-access
 
@@ -90,9 +83,8 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     def _remote_next_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] +
-          next_func_concrete.captured_inputs,
-          Tout=self._flat_output_types,
+          args=[string_handle] + next_func_concrete.captured_inputs,
+          Tout=self._structure._flat_types,  # pylint: disable=protected-access
           f=next_func_concrete)
 
     self._next_func = _remote_next_func._get_concrete_function_internal()  # pylint: disable=protected-access
@@ -108,8 +100,7 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
     def _remote_finalize_func(string_handle):
       return functional_ops.remote_call(
           target=source_device,
-          args=[string_handle] +
-          finalize_func_concrete.captured_inputs,
+          args=[string_handle] + finalize_func_concrete.captured_inputs,
           Tout=[dtypes.int64],
           f=finalize_func_concrete)
 
@@ -126,24 +117,15 @@ class _PerDeviceGenerator(dataset_ops.Dataset):
           init_func=self._init_func,
           next_func=self._next_func,
           finalize_func=self._finalize_func,
-          output_types=self._flat_output_types,
-          output_shapes=self._flat_output_shapes)
+          **dataset_ops.flat_structure(self))
 
   def _inputs(self):
     # TODO(b/116506223): Determine which datasets should be used as inputs here.
     return []
 
   @property
-  def output_types(self):
-    return self._output_types
-
-  @property
-  def output_shapes(self):
-    return self._output_shapes
-
-  @property
-  def output_classes(self):
-    return self._output_classes
+  def _element_structure(self):
+    return self._structure
 
 
 class MultiDeviceIterator(object):
@@ -183,13 +165,6 @@ class MultiDeviceIterator(object):
     self._source_device = source_device
     self._source_device_tensor = ops.convert_to_tensor(source_device)
 
-    self._flat_output_shapes = nest.flatten(
-        sparse.as_dense_shapes(self._dataset.output_shapes,
-                               self._dataset.output_classes))
-    self._flat_output_types = nest.flatten(
-        sparse.as_dense_types(self._dataset.output_types,
-                              self._dataset.output_classes))
-
     # Create the MultiDeviceIterator.
     with ops.device(self._source_device):
       self._multi_device_iterator_resource = (
@@ -197,8 +172,7 @@ class MultiDeviceIterator(object):
               devices=self._devices,
               shared_name="",
               container="",
-              output_types=self._flat_output_types,
-              output_shapes=self._flat_output_shapes))
+              **dataset_ops.flat_structure(dataset)))
 
       # The incarnation ID is used to ensure consistency between the per-device
       # iterators and the multi-device iterator.
@@ -216,13 +190,16 @@ class MultiDeviceIterator(object):
     for i, device in enumerate(self._devices):
       ds = _PerDeviceGenerator(
           i, self._multi_device_iterator_resource, self._incarnation_id,
-          self._source_device_tensor, device, self._dataset.output_shapes,
-          self._dataset.output_types, self._dataset.output_classes)
+          self._source_device_tensor, device, dataset._element_structure)  # pylint: disable=protected-access
       if prefetch_buffer_size > 0:
         ds = ds.prefetch(prefetch_buffer_size)
-      # TODO(jsimsa): Enable auto-tuning when supported for non-CPU devices.
+      # TODO(jsimsa): Enable auto-tuning and optimizations when supported for
+      # non-CPU devices.
       options = dataset_ops.Options()
       options.experimental_autotune = False
+      opt_options = optimization_options.OptimizationOptions()
+      opt_options.apply_default_optimizations = False
+      options.experimental_optimization = opt_options
       ds = ds.with_options(options)
       with ops.device(device):
         self._device_iterators.append(ds.make_initializable_iterator())
diff --git a/tensorflow/python/data/ops/optional_ops.py b/tensorflow/python/data/ops/optional_ops.py
index 91cf883ce946486fabf09073b5f4790f7a5fd42a..dcb743bee01964baf06543587661bb73b2225abb 100644
--- a/tensorflow/python/data/ops/optional_ops.py
+++ b/tensorflow/python/data/ops/optional_ops.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 @six.add_metaclass(abc.ABCMeta)
@@ -145,6 +146,7 @@ class _OptionalImpl(Optional):
     return self._value_structure
 
 
+@tf_export("data.experimental.OptionalStructure")
 class OptionalStructure(structure.Structure):
   """Represents an optional potentially containing a structured value."""
 
@@ -167,6 +169,10 @@ class OptionalStructure(structure.Structure):
   def _to_tensor_list(self, value):
     return [value._variant_tensor]  # pylint: disable=protected-access
 
+  def _to_batched_tensor_list(self, value):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.scalar())):
@@ -183,19 +189,21 @@ class OptionalStructure(structure.Structure):
     return OptionalStructure(value.value_structure)
 
   def _to_legacy_output_types(self):
-    raise NotImplementedError("The `output_types` property is not supported on "
-                              "structured objects containing an `Optional`. "
-                              "Use the corresponding `structure` property.")
+    return self
 
   def _to_legacy_output_shapes(self):
-    raise NotImplementedError("The `output_shapes` property is not supported on"
-                              " structured objects containing an `Optional`. "
-                              "Use the corresponding `structure` property.")
+    return self
 
   def _to_legacy_output_classes(self):
-    raise NotImplementedError("The `output_classes` property is not supported "
-                              "on structured objects containing an `Optional`. "
-                              "Use the corresponding `structure` property.")
+    return self
+
+  def _batch(self, batch_size):
+    raise NotImplementedError(
+        "Batching for `tf.data.experimental.Optional` objects.")
+
+  def _unbatch(self):
+    raise NotImplementedError(
+        "Unbatching for `tf.data.experimental.Optional` objects.")
 
 
 # pylint: disable=protected-access
diff --git a/tensorflow/python/data/ops/readers.py b/tensorflow/python/data/ops/readers.py
index 7e165a052d7094e9be2969a96daed372ef3794fe..0d6023dea28e3cefa13b32717e2aee87ac2c2bbf 100644
--- a/tensorflow/python/data/ops/readers.py
+++ b/tensorflow/python/data/ops/readers.py
@@ -20,11 +20,13 @@ from __future__ import print_function
 from tensorflow.python.compat import compat
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.util import convert
+from tensorflow.python.data.util import structure
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_dataset_ops
+from tensorflow.python.ops import gen_experimental_dataset_ops as ged_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -32,8 +34,8 @@ from tensorflow.python.util.tf_export import tf_export
 _DEFAULT_READER_BUFFER_SIZE_BYTES = 256 * 1024  # 256 KB
 
 
-@tf_export("data.TextLineDataset")
-class TextLineDataset(dataset_ops.DatasetSource):
+@tf_export("data.TextLineDataset", v1=[])
+class TextLineDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` comprising lines from one or more text files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None):
@@ -47,7 +49,7 @@ class TextLineDataset(dataset_ops.DatasetSource):
         to buffer. A value of 0 results in the default buffering values chosen
         based on the compression type.
     """
-    super(TextLineDataset, self).__init__()
+    super(TextLineDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._compression_type = convert.optional_param_to_tensor(
@@ -63,16 +65,26 @@ class TextLineDataset(dataset_ops.DatasetSource):
         self._filenames, self._compression_type, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+
+@tf_export(v1=["data.TextLineDataset"])
+class TextLineDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising lines from one or more text files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None):
+    wrapped = TextLineDatasetV2(filenames, compression_type, buffer_size)
+    super(TextLineDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TextLineDatasetV2.__init__.__doc__
 
   @property
-  def output_types(self):
-    return dtypes.string
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
 
 
 class _TFRecordDataset(dataset_ops.DatasetSource):
@@ -107,16 +119,8 @@ class _TFRecordDataset(dataset_ops.DatasetSource):
         self._filenames, self._compression_type, self._buffer_size)
 
   @property
-  def output_classes(self):
-    return ops.Tensor
-
-  @property
-  def output_shapes(self):
-    return tensor_shape.TensorShape([])
-
-  @property
-  def output_types(self):
-    return dtypes.string
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
 
 class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
@@ -140,15 +144,15 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
 
   def _as_variant_tensor(self):
     # pylint: disable=protected-access
-    return gen_dataset_ops.parallel_interleave_dataset(
+    return ged_ops.experimental_parallel_interleave_dataset(
         self._input_dataset._as_variant_tensor(),
-        self._map_func.captured_inputs,
+        self._map_func.function.captured_inputs,
         self._cycle_length,
         self._block_length,
         self._sloppy,
         self._buffer_output_elements,
         self._prefetch_input_elements,
-        f=self._map_func,
+        f=self._map_func.function,
         **dataset_ops.flat_structure(self))
     # pylint: enable=protected-access
 
@@ -156,13 +160,13 @@ class ParallelInterleaveDataset(dataset_ops.InterleaveDataset):
     return "tf.data.experimental.parallel_interleave()"
 
 
-@tf_export("data.TFRecordDataset")
-class TFRecordDataset(dataset_ops.Dataset):
+@tf_export("data.TFRecordDataset", v1=[])
+class TFRecordDatasetV2(dataset_ops.DatasetV2):
   """A `Dataset` comprising records from one or more TFRecord files."""
 
   def __init__(self, filenames, compression_type=None, buffer_size=None,
                num_parallel_reads=None):
-    """Creates a `TFRecordDataset` to read for one or more TFRecord files.
+    """Creates a `TFRecordDataset` to read one or more TFRecord files.
 
     NOTE: The `num_parallel_reads` argument can be used to improve performance
     when reading from a remote filesystem.
@@ -182,8 +186,8 @@ class TFRecordDataset(dataset_ops.Dataset):
       TypeError: If any argument does not have the expected type.
       ValueError: If any argument does not have the expected shape.
     """
-    super(TFRecordDataset, self).__init__()
-    if isinstance(filenames, dataset_ops.Dataset):
+    super(TFRecordDatasetV2, self).__init__()
+    if isinstance(filenames, dataset_ops.DatasetV2):
       if filenames.output_types != dtypes.string:
         raise TypeError(
             "`filenames` must be a `tf.data.Dataset` of `tf.string` elements.")
@@ -194,7 +198,7 @@ class TFRecordDataset(dataset_ops.Dataset):
     else:
       filenames = ops.convert_to_tensor(filenames, dtype=dtypes.string)
       filenames = array_ops.reshape(filenames, [-1], name="flat_filenames")
-      filenames = dataset_ops.Dataset.from_tensor_slices(filenames)
+      filenames = dataset_ops.DatasetV2.from_tensor_slices(filenames)
 
     self._filenames = filenames
     self._compression_type = compression_type
@@ -217,10 +221,10 @@ class TFRecordDataset(dataset_ops.Dataset):
              compression_type=None,
              buffer_size=None,
              num_parallel_reads=None):
-    return TFRecordDataset(filenames or self._filenames,
-                           compression_type or self._compression_type,
-                           buffer_size or self._buffer_size,
-                           num_parallel_reads or self._num_parallel_reads)
+    return TFRecordDatasetV2(filenames or self._filenames,
+                             compression_type or self._compression_type,
+                             buffer_size or self._buffer_size,
+                             num_parallel_reads or self._num_parallel_reads)
 
   def _as_variant_tensor(self):
     return self._impl._as_variant_tensor()  # pylint: disable=protected-access
@@ -229,20 +233,44 @@ class TFRecordDataset(dataset_ops.Dataset):
     return self._impl._inputs()  # pylint: disable=protected-access
 
   @property
-  def output_classes(self):
-    return self._impl.output_classes
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return self._impl.output_shapes
+
+@tf_export(v1=["data.TFRecordDataset"])
+class TFRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` comprising records from one or more TFRecord files."""
+
+  def __init__(self, filenames, compression_type=None, buffer_size=None,
+               num_parallel_reads=None):
+    wrapped = TFRecordDatasetV2(
+        filenames, compression_type, buffer_size, num_parallel_reads)
+    super(TFRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = TFRecordDatasetV2.__init__.__doc__
+
+  def _clone(self,
+             filenames=None,
+             compression_type=None,
+             buffer_size=None,
+             num_parallel_reads=None):
+    # pylint: disable=protected-access
+    return TFRecordDatasetV1(
+        filenames or self._dataset._filenames,
+        compression_type or self._dataset._compression_type,
+        buffer_size or self._dataset._buffer_size,
+        num_parallel_reads or self._dataset._num_parallel_reads)
 
   @property
-  def output_types(self):
-    return self._impl.output_types
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
 
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
 
-@tf_export("data.FixedLengthRecordDataset")
-class FixedLengthRecordDataset(dataset_ops.DatasetSource):
+
+@tf_export("data.FixedLengthRecordDataset", v1=[])
+class FixedLengthRecordDatasetV2(dataset_ops.DatasetSource):
   """A `Dataset` of fixed-length records from one or more binary files."""
 
   def __init__(self,
@@ -267,7 +295,7 @@ class FixedLengthRecordDataset(dataset_ops.DatasetSource):
       compression_type: (Optional.) A `tf.string` scalar evaluating to one of
         `""` (no compression), `"ZLIB"`, or `"GZIP"`.
     """
-    super(FixedLengthRecordDataset, self).__init__()
+    super(FixedLengthRecordDatasetV2, self).__init__()
     self._filenames = ops.convert_to_tensor(
         filenames, dtype=dtypes.string, name="filenames")
     self._record_bytes = ops.convert_to_tensor(
@@ -296,15 +324,39 @@ class FixedLengthRecordDataset(dataset_ops.DatasetSource):
           self._filenames, self._header_bytes, self._record_bytes,
           self._footer_bytes, self._buffer_size)
 
-
   @property
-  def output_classes(self):
-    return ops.Tensor
+  def _element_structure(self):
+    return structure.TensorStructure(dtypes.string, [])
 
-  @property
-  def output_shapes(self):
-    return tensor_shape.scalar()
+
+@tf_export(v1=["data.FixedLengthRecordDataset"])
+class FixedLengthRecordDatasetV1(dataset_ops.DatasetV1Adapter):
+  """A `Dataset` of fixed-length records from one or more binary files."""
+
+  def __init__(self,
+               filenames,
+               record_bytes,
+               header_bytes=None,
+               footer_bytes=None,
+               buffer_size=None,
+               compression_type=None):
+    wrapped = FixedLengthRecordDatasetV2(
+        filenames, record_bytes, header_bytes, footer_bytes, buffer_size,
+        compression_type)
+    super(FixedLengthRecordDatasetV1, self).__init__(wrapped)
+  __init__.__doc__ = FixedLengthRecordDatasetV2.__init__.__doc__
 
   @property
-  def output_types(self):
-    return dtypes.string
+  def _filenames(self):
+    return self._dataset._filenames  # pylint: disable=protected-access
+
+  @_filenames.setter
+  def _filenames(self, value):
+    self._dataset._filenames = value  # pylint: disable=protected-access
+
+
+# TODO(b/119044825): Until all `tf.data` unit tests are converted to V2, keep
+# these aliases in place.
+FixedLengthRecordDataset = FixedLengthRecordDatasetV1
+TFRecordDataset = TFRecordDatasetV1
+TextLineDataset = TextLineDatasetV1
diff --git a/tensorflow/python/data/util/BUILD b/tensorflow/python/data/util/BUILD
index 39082ce3707bb11585694e553b840f94209b1029..04e80299e0d57965c21b88bd94250cb62e76d452 100644
--- a/tensorflow/python/data/util/BUILD
+++ b/tensorflow/python/data/util/BUILD
@@ -93,10 +93,28 @@ py_test(
         "//tensorflow/python:sparse_tensor",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:variables",
+        "//tensorflow/python/data/kernel_tests:test_base",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
+py_library(
+    name = "options",
+    srcs = ["options.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_test(
+    name = "options_test",
+    size = "small",
+    srcs = ["options_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":options",
+        "//tensorflow/python:client_testlib",
+    ],
+)
+
 py_library(
     name = "convert",
     srcs = ["convert.py"],
diff --git a/tensorflow/python/data/util/convert_test.py b/tensorflow/python/data/util/convert_test.py
index 89c3afb29691f4f24b7cb4208b16663b616515fa..78ca6e951390b8c248e55dcb7f1ce99f9fa1085f 100644
--- a/tensorflow/python/data/util/convert_test.py
+++ b/tensorflow/python/data/util/convert_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.data.util import convert
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
@@ -30,47 +31,53 @@ class ConvertTest(test.TestCase):
 
   def testInteger(self):
     resp = convert.optional_param_to_tensor("foo", 3)
-    with self.cached_session() as sess:
-      self.assertEqual(3, sess.run(resp))
+    self.assertEqual(3, self.evaluate(resp))
 
   def testIntegerDefault(self):
     resp = convert.optional_param_to_tensor("foo", None)
-    with self.cached_session() as sess:
-      self.assertEqual(0, sess.run(resp))
+    self.assertEqual(0, self.evaluate(resp))
 
   def testStringDefault(self):
     resp = convert.optional_param_to_tensor("bar", None, "default",
                                             dtypes.string)
-    with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("default"), sess.run(resp))
+    self.assertEqual(compat.as_bytes("default"), self.evaluate(resp))
 
   def testString(self):
     resp = convert.optional_param_to_tensor("bar", "value", "default",
                                             dtypes.string)
-    with self.cached_session() as sess:
-      self.assertEqual(compat.as_bytes("value"), sess.run(resp))
+    self.assertEqual(compat.as_bytes("value"), self.evaluate(resp))
 
   def testPartialShapeToTensorKnownDimension(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([1]))))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor((1,))))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor([1])))
-      self.assertAllEqual([1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([1], dtype=dtypes.int64))))
-
+    self.assertAllEqual([1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([1]))))
+    self.assertAllEqual([1], self.evaluate(
+        convert.partial_shape_to_tensor((1,))))
+    self.assertAllEqual([1], self.evaluate(
+        convert.partial_shape_to_tensor([1])))
+    self.assertAllEqual([1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([1], dtype=dtypes.int64))))
+
+  @test_util.run_deprecated_v1
   def testPartialShapeToTensorUnknownDimension(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([None]))))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          (None,))))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          [None])))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          [-1])))
-      self.assertAllEqual([-1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([-1], dtype=dtypes.int64))))
+    self.assertAllEqual([-1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([None]))))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor((None,))))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor([None])))
+    self.assertAllEqual([-1],
+                        self.evaluate(convert.partial_shape_to_tensor([-1])))
+    self.assertAllEqual([-1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([-1],
+                                                     dtype=dtypes.int64))))
 
     with self.assertRaisesRegexp(
         ValueError, r"The given shape .* must be a 1-D tensor of tf.int64 "
@@ -84,42 +91,63 @@ class ConvertTest(test.TestCase):
       convert.partial_shape_to_tensor(constant_op.constant([1., 1.]))
 
   def testPartialShapeToTensorMultipleDimensions(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([3, 6]))))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          (3, 6))))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          [3, 6])))
-      self.assertAllEqual([3, 6], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([3, 6], dtype=dtypes.int64))))
-
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([3, None]))))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          (3, None))))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          [3, None])))
-      self.assertAllEqual([3, -1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([3, -1], dtype=dtypes.int64))))
-
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([None, None]))))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          (None, None))))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          [None, None])))
-      self.assertAllEqual([-1, -1], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([-1, -1], dtype=dtypes.int64))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([3, 6]))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(convert.partial_shape_to_tensor((3, 6))))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(convert.partial_shape_to_tensor([3, 6])))
+    self.assertAllEqual([3, 6],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([3, 6],
+                                                     dtype=dtypes.int64))))
+
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([3, None]))))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor((3, None))))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor([3, None])))
+    self.assertAllEqual([3, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([3, -1],
+                                                     dtype=dtypes.int64))))
+
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([None, None]))))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor((None, None))))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor([None, None])))
+    self.assertAllEqual([-1, -1],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([-1, -1],
+                                                     dtype=dtypes.int64))))
 
   def testPartialShapeToTensorScalar(self):
-    with self.cached_session() as sess:
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
-          tensor_shape.TensorShape([]))))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(())))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor([])))
-      self.assertAllEqual([], sess.run(convert.partial_shape_to_tensor(
-          constant_op.constant([], dtype=dtypes.int64))))
+    self.assertAllEqual([],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                tensor_shape.TensorShape([]))))
+    self.assertAllEqual([], self.evaluate(convert.partial_shape_to_tensor(())))
+    self.assertAllEqual([], self.evaluate(convert.partial_shape_to_tensor([])))
+    self.assertAllEqual([],
+                        self.evaluate(
+                            convert.partial_shape_to_tensor(
+                                constant_op.constant([], dtype=dtypes.int64))))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/data/util/options.py b/tensorflow/python/data/util/options.py
new file mode 100644
index 0000000000000000000000000000000000000000..9badba8e5670c749b833da7f1e2094f4f3548098
--- /dev/null
+++ b/tensorflow/python/data/util/options.py
@@ -0,0 +1,131 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for tf.data options."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+
+def _internal_attr_name(name):
+  return "_" + name
+
+
+class OptionsBase(object):
+  """Base class for representing a set of tf.data options.
+
+  Attributes:
+    _options: Stores the option values.
+  """
+
+  def __init__(self):
+    self._options = {}
+
+  def __eq__(self, other):
+    if not isinstance(other, self.__class__):
+      return NotImplemented
+    for name in set(self._options) | set(other._options):  # pylint: disable=protected-access
+      if getattr(self, name) != getattr(other, name):
+        return False
+    return True
+
+  def __ne__(self, other):
+    if isinstance(other, self.__class__):
+      return not self.__eq__(other)
+    else:
+      return NotImplemented
+
+
+def create_option(name, ty, docstring, default=None):
+  """Creates a type-checked property.
+
+  Args:
+    name: the name to use
+    ty: the type to use
+    docstring: the docstring to use
+    default: the default value to use
+
+  Returns:
+    A type-checked property.
+  """
+
+  def get_fn(self):
+    return self._options.get(name, default)  # pylint: disable=protected-access
+
+  def set_fn(self, value):
+    if not isinstance(value, ty):
+      raise TypeError("Property \"%s\" must be of type %s, got: %r (type: %r)" %
+                      (name, ty, value, type(value)))
+    self._options[name] = value  # pylint: disable=protected-access
+
+  return property(get_fn, set_fn, None, docstring)
+
+
+def merge_options(*options_list):
+  """Merges the given options, returning the result as a new options object.
+
+  The input arguments are expected to have a matching type that derives from
+  `OptionsBase` (and thus each represent a set of options). The method outputs
+  an object of the same type created by merging the sets of options represented
+  by the input arguments.
+
+  The sets of options can be merged as long as there does not exist an option
+  with different non-default values.
+
+  If an option is an instance of `OptionsBase` itself, then this method is
+  applied recursively to the set of options represented by this option.
+
+  Args:
+    *options_list: options to merge
+
+  Raises:
+    TypeError: if the input arguments are incompatible or not derived from
+      `OptionsBase`
+    ValueError: if the given options cannot be merged
+
+  Returns:
+    A new options object which is the result of merging the given options.
+  """
+  if len(options_list) < 1:
+    raise ValueError("At least one options should be provided")
+  result_type = type(options_list[0])
+
+  for options in options_list:
+    if not isinstance(options, result_type):
+      raise TypeError("Incompatible options type: %r vs %r" % (type(options),
+                                                               result_type))
+
+  if not isinstance(options_list[0], OptionsBase):
+    raise TypeError("The inputs should inherit from `OptionsBase`")
+
+  default_options = result_type()
+  result = result_type()
+  for options in options_list:
+    # Iterate over all set options and merge the into the result.
+    for name in options._options:  # pylint: disable=protected-access
+      this = getattr(result, name)
+      that = getattr(options, name)
+      default = getattr(default_options, name)
+      if that == default:
+        continue
+      elif this == default:
+        setattr(result, name, that)
+      elif isinstance(this, OptionsBase):
+        setattr(result, name, merge_options(this, that))
+      elif this != that:
+        raise ValueError(
+            "Cannot merge incompatible values (%r and %r) of option: %s" %
+            (this, that, name))
+  return result
diff --git a/tensorflow/python/data/util/options_test.py b/tensorflow/python/data/util/options_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5169835a322923d7bf2d644717870d87bfab13f
--- /dev/null
+++ b/tensorflow/python/data/util/options_test.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for dataset options utilities."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.data.util import options
+from tensorflow.python.platform import test
+
+
+class _TestOptions(options.OptionsBase):
+  x = options.create_option(
+      name="x", ty=int, docstring="the answer to everything", default=42)
+  y = options.create_option(
+      name="y", ty=float, docstring="a tasty pie", default=3.14)
+
+
+class _NestedTestOptions(options.OptionsBase):
+  opts = options.create_option(
+      name="opts", ty=_TestOptions, docstring="nested options")
+
+
+class OptionsTest(test.TestCase):
+
+  def testDocumentation(self):
+    self.assertEqual(_TestOptions.x.__doc__, "the answer to everything")
+    self.assertEqual(_TestOptions.y.__doc__, "a tasty pie")
+
+  def testCreateOption(self):
+    opts = _TestOptions()
+    self.assertEqual(opts.x, 42)
+    self.assertEqual(opts.y, 3.14)
+    self.assertIsInstance(opts.x, int)
+    self.assertIsInstance(opts.y, float)
+    opts.x = 0
+    self.assertEqual(opts.x, 0)
+    with self.assertRaises(TypeError):
+      opts.x = 3.14
+    opts.y = 0.0
+    self.assertEqual(opts.y, 0.0)
+    with self.assertRaises(TypeError):
+      opts.y = 42
+
+  def testMergeOptions(self):
+    options1, options2 = _TestOptions(), _TestOptions()
+    with self.assertRaises(ValueError):
+      options.merge_options()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.x, 42)
+    self.assertEqual(merged_options.y, 3.14)
+    options1.x = 0
+    options2.y = 0.0
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.x, 0)
+    self.assertEqual(merged_options.y, 0.0)
+
+  def testMergeNestedOptions(self):
+    options1, options2 = _NestedTestOptions(), _NestedTestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, None)
+    options1.opts = _TestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, _TestOptions())
+    options2.opts = _TestOptions()
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts, _TestOptions())
+    options1.opts.x = 0
+    options2.opts.y = 0.0
+    merged_options = options.merge_options(options1, options2)
+    self.assertEqual(merged_options.opts.x, 0)
+    self.assertEqual(merged_options.opts.y, 0.0)
+
+  def testMergeOptionsInvalid(self):
+    with self.assertRaises(TypeError):
+      options.merge_options(0)
+    options1, options2 = _TestOptions(), _NestedTestOptions()
+    with self.assertRaises(TypeError):
+      options.merge_options(options1, options2)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/data/util/sparse.py b/tensorflow/python/data/util/sparse.py
index 5e6d22470978d97c5e73640e86d3f8b82cbc1b60..f2e22fefd31749faf52c5db0b967b936c1c76707 100644
--- a/tensorflow/python/data/util/sparse.py
+++ b/tensorflow/python/data/util/sparse.py
@@ -34,7 +34,7 @@ def any_sparse(classes):
   Returns:
     `True` if `classes` contains a sparse tensor type and `False` otherwise.
   """
-  return any([c is sparse_tensor.SparseTensor for c in nest.flatten(classes)])
+  return any(c is sparse_tensor.SparseTensor for c in nest.flatten(classes))
 
 
 def as_dense_shapes(shapes, classes):
diff --git a/tensorflow/python/data/util/sparse_test.py b/tensorflow/python/data/util/sparse_test.py
index 056b32480f3898726940f3c228c9b9eefa28b237..06acf55ab9d1154ec4972b799538948fa76bdb43 100644
--- a/tensorflow/python/data/util/sparse_test.py
+++ b/tensorflow/python/data/util/sparse_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -292,10 +293,11 @@ class SparseTest(test.TestCase):
       return
     self.assertTrue(isinstance(b, sparse_tensor.SparseTensor))
     with self.cached_session():
-      self.assertAllEqual(a.eval().indices, b.eval().indices)
-      self.assertAllEqual(a.eval().values, b.eval().values)
-      self.assertAllEqual(a.eval().dense_shape, b.eval().dense_shape)
+      self.assertAllEqual(a.eval().indices, self.evaluate(b).indices)
+      self.assertAllEqual(a.eval().values, self.evaluate(b).values)
+      self.assertAllEqual(a.eval().dense_shape, self.evaluate(b).dense_shape)
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserialize(self):
     test_cases = (
         (),
@@ -325,6 +327,7 @@ class SparseTest(test.TestCase):
       for a, e in zip(nest.flatten(actual), nest.flatten(expected)):
         self.assertSparseValuesEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserialize(self):
     test_cases = (
         (),
diff --git a/tensorflow/python/data/util/structure.py b/tensorflow/python/data/util/structure.py
index 9a3118297dbf7154cb25ffd5d52351c0f8f8bd86..9de0c4da0ebe0beec31aa652397f06d6dc665e63 100644
--- a/tensorflow/python/data/util/structure.py
+++ b/tensorflow/python/data/util/structure.py
@@ -28,11 +28,13 @@ from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import sparse_ops
+from tensorflow.python.util.tf_export import tf_export
 
 
 _STRUCTURE_CONVERSION_FUNCTION_REGISTRY = {}
 
 
+@tf_export("data.experimental.Structure")
 @six.add_metaclass(abc.ABCMeta)
 class Structure(object):
   """Represents structural information, such as type and shape, about a value.
@@ -111,6 +113,26 @@ class Structure(object):
     """
     raise NotImplementedError("Structure._to_tensor_list()")
 
+  @abc.abstractmethod
+  def _to_batched_tensor_list(self, value):
+    """Returns a flat list of rank >= 1 `tf.Tensor` representing `value`.
+
+    This method can be used, along with `self._flat_shapes` and
+    `self._flat_types` to represent structured values in lower level APIs
+    (such as plain TensorFlow operations) that do not understand structure,
+    *and* that require that the plain tensors have a rank of at least one
+    (e.g. for the purpose of slicing the tensors).
+
+    Requires: `self.is_compatible_with(Structure.from_value(value))`.
+
+    Args:
+      value: A value with compatible structure.
+
+    Returns:
+      A flat list of `tf.Tensor` representing `value`.
+    """
+    raise NotImplementedError("Structure._to_batched_tensor_list()")
+
   @abc.abstractmethod
   def _from_tensor_list(self, flat_value):
     """Builds a flat list of `tf.Tensor` into a value matching this structure.
@@ -144,6 +166,23 @@ class Structure(object):
     """
     return self._from_tensor_list(flat_value)
 
+  @abc.abstractmethod
+  def _batch(self, batch_size):
+    """Returns a structure representing a batch of objects with this structure.
+
+    Args:
+      batch_size: An `int` representing the number of elements in a batch,
+        or `None` if the batch size may vary.
+
+    Returns:
+      A `Structure` representing a batch of objects with this structure.
+    """
+    raise NotImplementedError("Structure._batch()")
+
+  @abc.abstractmethod
+  def _unbatch(self):
+    raise NotImplementedError("Structure._unbatch()")
+
   @staticmethod
   def from_value(value):
     """Returns a `Structure` that represents the given `value`.
@@ -177,54 +216,6 @@ class Structure(object):
         raise TypeError("Could not build a structure for %r" % value)
       return TensorStructure.from_value(tensor)
 
-  @staticmethod
-  def _from_legacy_structure(output_types, output_shapes, output_classes):
-    """Returns a `Structure` that represents the given legacy structure.
-
-    This method provides a way to convert from the existing `Dataset` and
-    `Iterator` structure-related properties to a `Structure` object.
-
-    TODO(b/110122868): Remove this method once `Structure` is used throughout
-    `tf.data`.
-
-    Args:
-      output_types: A nested structure of `tf.DType` objects corresponding to
-        each component of a structured value.
-      output_shapes: A nested structure of `tf.TensorShape` objects
-        corresponding to each component a structured value.
-      output_classes: A nested structure of Python `type` objects corresponding
-        to each component of a structured value.
-
-    Returns:
-      A `Structure`.
-
-    Raises:
-      TypeError: If a structure cannot be built the arguments, because one of
-        the component classes in `output_classes` is not supported.
-    """
-    flat_types = nest.flatten(output_types)
-    flat_shapes = nest.flatten(output_shapes)
-    flat_classes = nest.flatten(output_classes)
-    flat_ret = []
-    for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
-                                                 flat_classes):
-      if issubclass(flat_class, sparse_tensor_lib.SparseTensor):
-        flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
-      elif issubclass(flat_class, ops.Tensor):
-        flat_ret.append(TensorStructure(flat_type, flat_shape))
-      else:
-        # NOTE(mrry): Since legacy structures produced by iterators only
-        # comprise Tensors, SparseTensors, and nests, we do not need to support
-        # all structure types here.
-        raise TypeError(
-            "Could not build a structure for output class %r" % flat_type)
-
-    ret = nest.pack_sequence_as(output_classes, flat_ret)
-    if isinstance(ret, Structure):
-      return ret
-    else:
-      return NestedStructure(ret)
-
   @staticmethod
   def _register_custom_converter(type_object, converter_fn):
     """Registers `converter_fn` for converting values of the given type.
@@ -250,9 +241,63 @@ class Structure(object):
     raise NotImplementedError("Structure._to_legacy_output_classes()")
 
 
+def convert_legacy_structure(output_types, output_shapes, output_classes):
+  """Returns a `Structure` that represents the given legacy structure.
+
+  This method provides a way to convert from the existing `Dataset` and
+  `Iterator` structure-related properties to a `Structure` object. A "legacy"
+  structure is represented by the `tf.data.Dataset.output_types`,
+  `tf.data.Dataset.output_shapes`, and `tf.data.Dataset.output_classes`
+  properties.
+
+  TODO(b/110122868): Remove this function once `Structure` is used throughout
+  `tf.data`.
+
+  Args:
+    output_types: A nested structure of `tf.DType` objects corresponding to
+      each component of a structured value.
+    output_shapes: A nested structure of `tf.TensorShape` objects
+      corresponding to each component a structured value.
+    output_classes: A nested structure of Python `type` objects corresponding
+      to each component of a structured value.
+
+  Returns:
+    A `Structure`.
+
+  Raises:
+    TypeError: If a structure cannot be built from the arguments, because one of
+      the component classes in `output_classes` is not supported.
+  """
+  flat_types = nest.flatten(output_types)
+  flat_shapes = nest.flatten(output_shapes)
+  flat_classes = nest.flatten(output_classes)
+  flat_ret = []
+  for flat_type, flat_shape, flat_class in zip(flat_types, flat_shapes,
+                                               flat_classes):
+    if isinstance(flat_class, Structure):
+      flat_ret.append(flat_class)
+    elif issubclass(flat_class, sparse_tensor_lib.SparseTensor):
+      flat_ret.append(SparseTensorStructure(flat_type, flat_shape))
+    elif issubclass(flat_class, ops.Tensor):
+      flat_ret.append(TensorStructure(flat_type, flat_shape))
+    else:
+      # NOTE(mrry): Since legacy structures produced by iterators only
+      # comprise Tensors, SparseTensors, and nests, we do not need to
+      # support all structure types here.
+      raise TypeError(
+          "Could not build a structure for output class %r" % flat_type)
+
+  ret = nest.pack_sequence_as(output_classes, flat_ret)
+  if isinstance(ret, Structure):
+    return ret
+  else:
+    return NestedStructure(ret)
+
+
 # NOTE(mrry): The following classes make extensive use of non-public methods of
 # their base class, so we disable the protected-access lint warning once here.
 # pylint: disable=protected-access
+@tf_export("data.experimental.NestedStructure")
 class NestedStructure(Structure):
   """Represents a nested structure in which each leaf is a `Structure`."""
 
@@ -308,21 +353,45 @@ class NestedStructure(Structure):
       ret.extend(structure._to_tensor_list(sub_value))
     return ret
 
+  def _to_batched_tensor_list(self, value):
+    ret = []
+
+    try:
+      flat_value = nest.flatten_up_to(self._nested_structure, value)
+    except (ValueError, TypeError):
+      raise ValueError("The value %r is not compatible with the nested "
+                       "structure %r." % (value, self._nested_structure))
+
+    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
+      if not structure.is_compatible_with(Structure.from_value(sub_value)):
+        raise ValueError("Component value %r is not compatible with the nested "
+                         "structure %r." % (sub_value, structure))
+      ret.extend(structure._to_batched_tensor_list(sub_value))
+    return ret
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != len(self._flat_types):
       raise ValueError("Expected %d flat values in NestedStructure but got %d."
                        % (len(self._flat_types), len(flat_value)))
 
     flat_ret = []
-    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
-      flat_ret.append(structure._from_tensor_list([sub_value]))
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_tensor_list(sub_value))
+      i += num_flat_values
 
     return nest.pack_sequence_as(self._nested_structure, flat_ret)
 
   def _from_compatible_tensor_list(self, flat_value):
     flat_ret = []
-    for sub_value, structure in zip(flat_value, self._flat_nested_structure):
-      flat_ret.append(structure._from_compatible_tensor_list([sub_value]))
+    i = 0
+    for structure in self._flat_nested_structure:
+      num_flat_values = len(structure._flat_types)
+      sub_value = flat_value[i:i + num_flat_values]
+      flat_ret.append(structure._from_compatible_tensor_list(sub_value))
+      i += num_flat_values
 
     return nest.pack_sequence_as(self._nested_structure, flat_ret)
 
@@ -345,7 +414,16 @@ class NestedStructure(Structure):
     return nest.map_structure(
         lambda s: s._to_legacy_output_classes(), self._nested_structure)
 
+  def _batch(self, batch_size):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._batch(batch_size), self._nested_structure))
+
+  def _unbatch(self):
+    return NestedStructure(nest.map_structure(
+        lambda s: s._unbatch(), self._nested_structure))
+
 
+@tf_export("data.experimental.TensorStructure")
 class TensorStructure(Structure):
   """Represents structural information about a `tf.Tensor`."""
 
@@ -372,6 +450,11 @@ class TensorStructure(Structure):
                        "and shape %s." % (value, self._dtype, self._shape))
     return [value]
 
+  def _to_batched_tensor_list(self, value):
+    if self._shape.merge_with(value.shape).ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return [value]
+
   def _from_tensor_list(self, flat_value):
     if len(flat_value) != 1:
       raise ValueError("TensorStructure corresponds to a single tf.Tensor.")
@@ -381,6 +464,13 @@ class TensorStructure(Structure):
     return self._from_compatible_tensor_list(flat_value)
 
   def _from_compatible_tensor_list(self, flat_value):
+    # TODO(b/112266545): It would be cleaner to create a new `ensure_shape()`
+    # op here and return that, instead of mutating the input's shape using
+    # `Tensor.set_shape()`. However, that would add extra ops on the arguments
+    # of each `tf.data` function, which could impact performance. When this
+    # bug is resolved, we should be able to add the `ensure_shape()` ops and
+    # optimize them away using contextual shape information.
+    flat_value[0].set_shape(self._shape)
     return flat_value[0]
 
   @staticmethod
@@ -396,7 +486,18 @@ class TensorStructure(Structure):
   def _to_legacy_output_classes(self):
     return ops.Tensor
 
+  def _batch(self, batch_size):
+    return TensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._shape))
+
+  def _unbatch(self):
+    if self._shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return TensorStructure(self._dtype, self._shape[1:])
 
+
+@tf_export("data.experimental.SparseTensorStructure")
 class SparseTensorStructure(Structure):
   """Represents structural information about a `tf.SparseTensor`."""
 
@@ -406,7 +507,11 @@ class SparseTensorStructure(Structure):
 
   @property
   def _flat_shapes(self):
-    return [tensor_shape.vector(3)]
+    # NOTE(mrry): The default flat shape of a boxed `SparseTensor` is `(3,)`,
+    # but a `SparseTensorStructure` can also represent a batch of boxed
+    # `SparseTensor` objects with shape `(?, 3)` (and batches of batches, etc.),
+    # so the flat shape must be unknown.
+    return [tensor_shape.unknown_shape(None)]
 
   @property
   def _flat_types(self):
@@ -420,6 +525,13 @@ class SparseTensorStructure(Structure):
   def _to_tensor_list(self, value):
     return [sparse_ops.serialize_sparse(value, out_type=dtypes.variant)]
 
+  def _to_batched_tensor_list(self, value):
+    if self._dense_shape.merge_with(
+        tensor_util.constant_value_as_shape(value.dense_shape)).ndims == 0:
+      raise ValueError(
+          "Unbatching a sparse tensor is only supported for rank >= 1")
+    return [sparse_ops.serialize_many_sparse(value, out_type=dtypes.variant)]
+
   def _from_tensor_list(self, flat_value):
     if (len(flat_value) != 1 or flat_value[0].dtype != dtypes.variant or
         not flat_value[0].shape.is_compatible_with(tensor_shape.vector(3))):
@@ -428,8 +540,11 @@ class SparseTensorStructure(Structure):
     return self._from_compatible_tensor_list(flat_value)
 
   def _from_compatible_tensor_list(self, flat_value):
-    return sparse_ops.deserialize_sparse(
+    ret = sparse_ops.deserialize_sparse(
         flat_value[0], dtype=self._dtype, rank=self._dense_shape.ndims)
+    ret.indices.set_shape([None, self._dense_shape.ndims])
+    ret.dense_shape.set_shape([self._dense_shape.ndims])
+    return ret
 
   @staticmethod
   def from_value(value):
@@ -446,3 +561,13 @@ class SparseTensorStructure(Structure):
 
   def _to_legacy_output_classes(self):
     return sparse_tensor_lib.SparseTensor
+
+  def _batch(self, batch_size):
+    return SparseTensorStructure(
+        self._dtype,
+        tensor_shape.TensorShape([batch_size]).concatenate(self._dense_shape))
+
+  def _unbatch(self):
+    if self._dense_shape.ndims == 0:
+      raise ValueError("Unbatching a tensor is only supported for rank >= 1")
+    return SparseTensorStructure(self._dtype, self._dense_shape[1:])
diff --git a/tensorflow/python/data/util/structure_test.py b/tensorflow/python/data/util/structure_test.py
index 630a0c912bcb7cacdd249e29bd6978db1c40cc9b..91dcfa6f6089bf052526e17ca8f0e646f7e86d71 100644
--- a/tensorflow/python/data/util/structure_test.py
+++ b/tensorflow/python/data/util/structure_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.data.kernel_tests import test_base
 from tensorflow.python.data.util import nest
 from tensorflow.python.data.util import structure
 from tensorflow.python.framework import constant_op
@@ -28,12 +29,13 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
-class StructureTest(test.TestCase, parameterized.TestCase):
+class StructureTest(test_base.DatasetTestBase, parameterized.TestCase):
 
   # NOTE(mrry): The arguments must be lifted into lambdas because otherwise they
   # will be executed before the (eager- or graph-mode) test environment has been
@@ -44,7 +46,7 @@ class StructureTest(test.TestCase, parameterized.TestCase):
        [dtypes.float32], [[]]),
       (lambda: sparse_tensor.SparseTensor(
           indices=[[3, 4]], values=[-1], dense_shape=[4, 5]),
-       structure.SparseTensorStructure, [dtypes.variant], [[3]]),
+       structure.SparseTensorStructure, [dtypes.variant], [None]),
       (lambda: (constant_op.constant(37.0), constant_op.constant([1, 2, 3])),
        structure.NestedStructure, [dtypes.float32, dtypes.int32], [[], [3]]),
       (lambda: {
@@ -58,14 +60,17 @@ class StructureTest(test.TestCase, parameterized.TestCase):
                 sparse_tensor.SparseTensor(
                     indices=[[3, 4]], values=[-1], dense_shape=[4, 5]))
       }, structure.NestedStructure,
-       [dtypes.float32, dtypes.variant, dtypes.variant], [[], [3], [3]]))
+       [dtypes.float32, dtypes.variant, dtypes.variant], [[], None, None]))
   def testFlatStructure(self, value_fn, expected_structure, expected_types,
                         expected_shapes):
     value = value_fn()
     s = structure.Structure.from_value(value)
     self.assertIsInstance(s, expected_structure)
     self.assertEqual(expected_types, s._flat_types)
-    self.assertEqual(expected_shapes, s._flat_shapes)
+    for expected, actual in zip(expected_shapes, s._flat_shapes):
+      self.assertTrue(actual.is_compatible_with(expected))
+      self.assertTrue(
+          tensor_shape.as_shape(expected).is_compatible_with(actual))
 
   @parameterized.parameters(
       (lambda: constant_op.constant(37.0), lambda: [
@@ -112,6 +117,7 @@ class StructureTest(test.TestCase, parameterized.TestCase):
                   indices=[[0], [1], [2]], values=[4, 5, 6], dense_shape=[3])
       }, (constant_op.constant(15.0), constant_op.constant([4, 5, 6]))]),
   )
+  @test_util.run_deprecated_v1
   def testIsCompatibleWithStructure(
       self, original_value_fn, compatible_values_fn, incompatible_values_fn):
     original_value = original_value_fn()
@@ -347,12 +353,141 @@ class StructureTest(test.TestCase, parameterized.TestCase):
            "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
                  structure.TensorStructure(dtypes.string, []))})),
   )
-  def testFromLegacyStructure(self, output_types, output_shapes, output_classes,
-                              expected_structure):
-    actual_structure = structure.Structure._from_legacy_structure(
+  def testConvertLegacyStructure(self, output_types, output_shapes,
+                                 output_classes, expected_structure):
+    actual_structure = structure.convert_legacy_structure(
         output_types, output_shapes, output_classes)
     self.assertTrue(expected_structure.is_compatible_with(actual_structure))
     self.assertTrue(actual_structure.is_compatible_with(expected_structure))
 
+  def testNestedNestedStructure(self):
+    # Although `Structure.from_value()` will not construct one, a nested
+    # structure containing nested `NestedStructure` objects can occur if a
+    # structure is constructed manually.
+    s = structure.NestedStructure(
+        (structure.TensorStructure(dtypes.int64, []),
+         structure.NestedStructure(
+             (structure.TensorStructure(dtypes.float32, []),
+              structure.TensorStructure(dtypes.string, [])))))
+
+    int64_t = constant_op.constant(37, dtype=dtypes.int64)
+    float32_t = constant_op.constant(42.0)
+    string_t = constant_op.constant("Foo")
+
+    nested_tensors = (int64_t, (float32_t, string_t))
+
+    tensor_list = s._to_tensor_list(nested_tensors)
+    for expected, actual in zip([int64_t, float32_t, string_t], tensor_list):
+      self.assertIs(expected, actual)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = s._from_tensor_list(
+        tensor_list)
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+    (actual_int64_t, (actual_float32_t, actual_string_t)) = (
+        s._from_compatible_tensor_list(tensor_list))
+    self.assertIs(int64_t, actual_int64_t)
+    self.assertIs(float32_t, actual_float32_t)
+    self.assertIs(string_t, actual_string_t)
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, []), 32,
+       structure.TensorStructure(dtypes.float32, [32])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, []), None,
+       structure.TensorStructure(dtypes.float32, [None])),
+      ("SparseTensor", structure.SparseTensorStructure(dtypes.float32, [None]),
+       32, structure.SparseTensorStructure(dtypes.float32, [32, None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [4]), None,
+       structure.SparseTensorStructure(dtypes.float32, [None, 4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, []),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                structure.TensorStructure(dtypes.string, []))}), 128,
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, [128]),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                 structure.TensorStructure(dtypes.string, [128]))})),
+  )
+  def testBatch(self, element_structure, batch_size,
+                expected_batched_structure):
+    batched_structure = element_structure._batch(batch_size)
+    self.assertTrue(
+        batched_structure.is_compatible_with(expected_batched_structure))
+    self.assertTrue(
+        expected_batched_structure.is_compatible_with(batched_structure))
+
+  @parameterized.named_parameters(
+      ("Tensor", structure.TensorStructure(dtypes.float32, [32]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("TensorUnknown", structure.TensorStructure(dtypes.float32, [None]),
+       structure.TensorStructure(dtypes.float32, [])),
+      ("SparseTensor",
+       structure.SparseTensorStructure(dtypes.float32, [32, None]),
+       structure.SparseTensorStructure(dtypes.float32, [None])),
+      ("SparseTensorUnknown",
+       structure.SparseTensorStructure(dtypes.float32, [None, 4]),
+       structure.SparseTensorStructure(dtypes.float32, [4])),
+      ("Nest", structure.NestedStructure({
+          "a": structure.TensorStructure(dtypes.float32, [128]),
+          "b": (structure.SparseTensorStructure(dtypes.int32, [128, 2, 2]),
+                structure.TensorStructure(dtypes.string, [None]))}),
+       structure.NestedStructure({
+           "a": structure.TensorStructure(dtypes.float32, []),
+           "b": (structure.SparseTensorStructure(dtypes.int32, [2, 2]),
+                 structure.TensorStructure(dtypes.string, []))})),
+  )
+  def testUnbatch(self, element_structure, expected_unbatched_structure):
+    unbatched_structure = element_structure._unbatch()
+    self.assertTrue(
+        unbatched_structure.is_compatible_with(expected_unbatched_structure))
+    self.assertTrue(
+        expected_unbatched_structure.is_compatible_with(unbatched_structure))
+
+  # pylint: disable=g-long-lambda
+  @parameterized.named_parameters(
+      ("Tensor", lambda: constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+       lambda: constant_op.constant([1.0, 2.0])),
+      ("SparseTensor", lambda: sparse_tensor.SparseTensor(
+          indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2]),
+       lambda: sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2])),
+      ("Nest", lambda: (
+          constant_op.constant([[1.0, 2.0], [3.0, 4.0]]),
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 1]], values=[13, 27], dense_shape=[2, 2])),
+       lambda: (constant_op.constant([1.0, 2.0]), sparse_tensor.SparseTensor(
+           indices=[[0]], values=[13], dense_shape=[2]))),
+  )
+  def testToBatchedTensorList(self, value_fn, element_0_fn):
+    batched_value = value_fn()
+    s = structure.Structure.from_value(batched_value)
+    batched_tensor_list = s._to_batched_tensor_list(batched_value)
+
+    # The batch dimension is 2 for all of the test cases.
+    # NOTE(mrry): `tf.shape()` does not currently work for the DT_VARIANT
+    # tensors in which we store sparse tensors.
+    for t in batched_tensor_list:
+      if t.dtype != dtypes.variant:
+        self.assertEqual(2, self.evaluate(array_ops.shape(t)[0]))
+
+    # Test that the 0th element from the unbatched tensor is equal to the
+    # expected value.
+    expected_element_0 = self.evaluate(element_0_fn())
+    unbatched_s = s._unbatch()
+    actual_element_0 = unbatched_s._from_tensor_list(
+        [t[0] for t in batched_tensor_list])
+
+    for expected, actual in zip(
+        nest.flatten(expected_element_0), nest.flatten(actual_element_0)):
+      if sparse_tensor.is_sparse(expected):
+        self.assertSparseValuesEqual(expected, actual)
+      else:
+        self.assertAllEqual(expected, actual)
+
+  # pylint: enable=g-long-lambda
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/debug/BUILD b/tensorflow/python/debug/BUILD
index 79951232097e1c577e009ffa94023e060791c7cc..c6abd476d9d274a3aab270a548f5b0ebd3b6d257 100644
--- a/tensorflow/python/debug/BUILD
+++ b/tensorflow/python/debug/BUILD
@@ -557,6 +557,7 @@ py_test(
         ":source_utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
+        "//tensorflow/python:cond_v2",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:framework_ops",
@@ -566,6 +567,7 @@ py_test(
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
         "//third_party/py/numpy",
     ],
 )
diff --git a/tensorflow/python/debug/cli/analyzer_cli_test.py b/tensorflow/python/debug/cli/analyzer_cli_test.py
index f197a9e4dcefdb528a3a843effa95f7311ca007a..586982dc4bf3511925f46268c537ed53d54ed700 100644
--- a/tensorflow/python/debug/cli/analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/analyzer_cli_test.py
@@ -573,6 +573,7 @@ def create_analyzer_cli(dump):
   return analyzer, registry
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -1583,7 +1584,7 @@ class AnalyzerCLISimpleMulAddTest(test_util.TensorFlowTestCase):
       x = variables.VariableV1([1, 3, 3, 7], name="x")
       _, idx = array_ops.unique(x, name="x_unique")
       idx_times_two = math_ops.multiply(idx, 2, name="idx_times_two")
-      sess.run(x.initializer)
+      self.evaluate(x.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
@@ -1668,6 +1669,7 @@ class AnalyzerCLIPrintLargeTensorTest(test_util.TensorFlowTestCase):
     self.assertNotIn("...,", out.lines[4])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
 
   @classmethod
@@ -1995,6 +1997,7 @@ class AnalyzerCLIControlDepTest(test_util.TensorFlowTestCase):
                      out.font_attr_segs[0])
 
 
+@test_util.run_v1_only("b/120545219")
 class AnalyzerCLIWhileLoopTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/cli/cli_shared_test.py b/tensorflow/python/debug/cli/cli_shared_test.py
index 07b364db9f2aab9c11ecb769a94f36e0809d70a0..66a12efda53470b33edf4788984e632bfe55f2b9 100644
--- a/tensorflow/python/debug/cli/cli_shared_test.py
+++ b/tensorflow/python/debug/cli/cli_shared_test.py
@@ -105,6 +105,7 @@ class TimeToReadableStrTest(test_util.TensorFlowTestCase):
       cli_shared.time_to_readable_str(100, force_time_unit="ks")
 
 
+@test_util.run_v1_only("b/120545219")
 class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -118,6 +119,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
   def tearDown(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testSingleFetchNoFeeds(self):
     run_start_intro = cli_shared.get_run_start_intro(12, self.const_a, None, {})
 
@@ -181,6 +183,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     run_start_intro = cli_shared.get_run_start_intro(1, self.sparse_d, None, {})
     self.assertEqual(str(self.sparse_d), run_start_intro.lines[4].strip())
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesListNoFeeds(self):
     fetches = [self.const_a, self.const_b]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -197,6 +200,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testNestedListAsFetches(self):
     fetches = [self.const_c, [self.const_a, self.const_b]]
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -210,6 +214,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testNestedDictAsFetches(self):
     fetches = {"c": self.const_c, "ab": {"a": self.const_a, "b": self.const_b}}
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -227,6 +232,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 3 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesAsTupleNoFeeds(self):
     fetches = (self.const_a, self.const_b)
     run_start_intro = cli_shared.get_run_start_intro(1, fetches, None, {})
@@ -243,6 +249,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTwoFetchesAsNamedTupleNoFeeds(self):
     fetches_namedtuple = namedtuple("fetches", "x y")
     fetches = fetches_namedtuple(self.const_b, self.const_c)
@@ -260,6 +267,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     description = cli_shared.get_run_short_description(1, fetches, None)
     self.assertEqual("run #1: 2 fetches; 0 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testWithFeedDict(self):
     feed_dict = {
         self.const_a: 10.0,
@@ -283,6 +291,7 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
                                                        feed_dict)
     self.assertEqual("run #1: 1 fetch (c:0); 2 feeds", description)
 
+  @test_util.run_deprecated_v1
   def testTensorFilters(self):
     feed_dict = {self.const_a: 10.0}
     tensor_filters = {
@@ -313,17 +322,20 @@ class GetRunStartIntroAndDescriptionTest(test_util.TensorFlowTestCase):
     command_set.add(annot[2].content)
     self.assertEqual({"run -f filter_a", "run -f filter_b"}, command_set)
 
+  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForTensorFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {self.const_a: 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (a:0)", short_description)
 
+  @test_util.run_deprecated_v1
   def testGetRunShortDescriptionWorksForUnicodeFeedKey(self):
     short_description = cli_shared.get_run_short_description(
         1, self.const_a, {u"foo": 42.0})
     self.assertEqual("run #1: 1 fetch (a:0); 1 feed (foo)", short_description)
 
 
+@test_util.run_v1_only("b/120545219")
 class GetErrorIntroTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
index 60b6047970732f7e3f015216cbf2c91f8241e956..d6d2b58b5f8138643bb4b9886da01b72295b5df7 100644
--- a/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
+++ b/tensorflow/python/debug/cli/profile_analyzer_cli_test.py
@@ -70,6 +70,7 @@ def _assert_no_lines_match(pattern, lines):
         "%s matched at least one line in %s." % (pattern, str(lines)))
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
 
   def testNodeInfoEmpty(self):
@@ -321,6 +322,7 @@ class ProfileAnalyzerListProfileTest(test_util.TensorFlowTestCase):
     _assert_at_least_one_line_matches(r"Device Total.*0\.009ms", prof_output)
 
 
+@test_util.run_v1_only("b/120545219")
 class ProfileAnalyzerPrintSourceTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/cli/stepper_cli_test.py b/tensorflow/python/debug/cli/stepper_cli_test.py
index 7b8a42c25380dde8bc2ce0d34eb79f2ddd54922f..5cf69d0168b70a4d03162512b5024736c50cf23a 100644
--- a/tensorflow/python/debug/cli/stepper_cli_test.py
+++ b/tensorflow/python/debug/cli/stepper_cli_test.py
@@ -129,6 +129,7 @@ def _parse_updated(lines):
   return updated
 
 
+@test_util.run_v1_only("b/120545219")
 class NodeStepperSimpleGraphTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/examples/debug_errors.py b/tensorflow/python/debug/examples/debug_errors.py
index 28abc9734370630b864da4f693cbddd88c382502..e3692072cc558fa11a47daafb6fb0834d70ee654 100644
--- a/tensorflow/python/debug/examples/debug_errors.py
+++ b/tensorflow/python/debug/examples/debug_errors.py
@@ -77,4 +77,5 @@ if __name__ == "__main__":
       default=False,
       help="Use debugger to track down bad values during training")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_fibonacci.py b/tensorflow/python/debug/examples/debug_fibonacci.py
index 3821b393ec6847db71b7c4b7396b1ed448ae9538..777fb089881a069e403eb897f4efabcff815e2bf 100644
--- a/tensorflow/python/debug/examples/debug_fibonacci.py
+++ b/tensorflow/python/debug/examples/debug_fibonacci.py
@@ -100,4 +100,5 @@ if __name__ == "__main__":
       "--debug flag.")
 
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_keras.py b/tensorflow/python/debug/examples/debug_keras.py
index 3272d85ade957b254b2c1a0977156179cd71bb9d..019121fa0a61a4e69ce370bac23c4575a27a72c9 100644
--- a/tensorflow/python/debug/examples/debug_keras.py
+++ b/tensorflow/python/debug/examples/debug_keras.py
@@ -86,4 +86,5 @@ if __name__ == "__main__":
       default=2,
       help="Number of epochs to train the model for.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/examples/debug_mnist.py b/tensorflow/python/debug/examples/debug_mnist.py
index ab1c90371cd18bbaf278b72248bcc7e9e9c34b06..09fb06c9c065f544a4c9bb47b96157704a8306e2 100644
--- a/tensorflow/python/debug/examples/debug_mnist.py
+++ b/tensorflow/python/debug/examples/debug_mnist.py
@@ -190,4 +190,5 @@ if __name__ == "__main__":
       "the gRPC address (e.g., localhost:1234). Mutually exclusive with the "
       "--debug flag.")
   FLAGS, unparsed = parser.parse_known_args()
-  tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
+  with tf.Graph().as_default():
+    tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
diff --git a/tensorflow/python/debug/lib/common_test.py b/tensorflow/python/debug/lib/common_test.py
index 5af0dafcf9fd81763b30eb159a3e21ef8b7f9ac9..f6413f6b7b3dee82ea67ca664e8645152fbb5b83 100644
--- a/tensorflow/python/debug/lib/common_test.py
+++ b/tensorflow/python/debug/lib/common_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.platform import googletest
 
 class CommonTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testOnFeedOneFetch(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
@@ -35,6 +36,7 @@ class CommonTest(test_util.TensorFlowTestCase):
     self.assertItemsEqual(["a:0"], loaded[0])
     self.assertItemsEqual(["b:0"], loaded[1])
 
+  @test_util.run_deprecated_v1
   def testGetRunKeyFlat(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
@@ -43,6 +45,7 @@ class CommonTest(test_util.TensorFlowTestCase):
     self.assertItemsEqual(["a:0"], loaded[0])
     self.assertItemsEqual(["a:0", "b:0"], loaded[1])
 
+  @test_util.run_deprecated_v1
   def testGetRunKeyNestedFetches(self):
     a = constant_op.constant(10.0, name="a")
     b = constant_op.constant(20.0, name="b")
diff --git a/tensorflow/python/debug/lib/debug_gradients_test.py b/tensorflow/python/debug/lib/debug_gradients_test.py
index 01867fc69d0782b34edb1e8eb873b19f5dfc8529..885691c3ef71ba995ec3ab38e2d1bda7e1e30b1a 100644
--- a/tensorflow/python/debug/lib/debug_gradients_test.py
+++ b/tensorflow/python/debug/lib/debug_gradients_test.py
@@ -36,6 +36,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class IdentifyGradientTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
index 1f67f8a0d4e55c7faf8ca65af51169831e731576..34030c0adcab30647d360260741a8dcbb870cc73 100644
--- a/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
+++ b/tensorflow/python/debug/lib/debug_graph_reconstruction_test.py
@@ -126,8 +126,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       u = variables.Variable([12.0], name="u")
       v = variables.Variable([30.0], name="v")
       w = math_ops.add(u, v, name="w")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, w, expected_output=[42.0])
@@ -139,7 +139,7 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
         b = math_ops.add(a, a, name="b")
       with ops.control_dependencies([a, b]):
         c = math_ops.multiply(b, b, name="c")
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, c, expected_output=400.0)
@@ -150,8 +150,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       y = variables.Variable(20.0, name="y")
       cond = control_flow_ops.cond(
           x > y, lambda: math_ops.add(x, 1), lambda: math_ops.add(y, 1))
-      sess.run(x.initializer)
-      sess.run(y.initializer)
+      self.evaluate(x.initializer)
+      self.evaluate(y.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(
           sess, cond, expected_output=21.0)
@@ -173,8 +173,8 @@ class ReconstructNonDebugGraphTest(test_util.TensorFlowTestCase):
       toy_loss = x * (u - v)
       train_op = gradient_descent.GradientDescentOptimizer(
           learning_rate=0.1).minimize(toy_loss, name="train_op")
-      sess.run(u.initializer)
-      sess.run(v.initializer)
+      self.evaluate(u.initializer)
+      self.evaluate(v.initializer)
 
       self._compareOriginalAndReconstructedGraphDefs(sess, train_op)
 
diff --git a/tensorflow/python/debug/lib/debug_utils_test.py b/tensorflow/python/debug/lib/debug_utils_test.py
index 23ab98444cd0777700daaca26ccafe9c68444cb7..9d59cfc1792a8df472998e115dc01387a9ba3cdf 100644
--- a/tensorflow/python/debug/lib/debug_utils_test.py
+++ b/tensorflow/python/debug/lib/debug_utils_test.py
@@ -185,6 +185,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertEqual(["file:///tmp/tfdbg_1", "file:///tmp/tfdbg_2"],
                      watch_0.debug_urls)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_allNodes(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -216,6 +217,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertTrue("p1" in node_names)
     self.assertTrue("s" in node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -230,6 +232,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["a1_init", "a1", "a1/Assign", "a1/read", "p1"]),
         sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -255,6 +258,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["p1"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeWhitelist(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -267,6 +271,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign", "b", "b/Assign"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeWhitelists(self):
     debug_utils.watch_graph(
         self._run_options,
@@ -280,6 +285,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertItemsEqual(["a1", "a1/Assign"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -294,6 +300,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         sorted(["b_init", "b", "b/Assign", "b/read", "c", "s"]),
         sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_opTypeBlacklist(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -306,6 +313,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(sorted(["p1", "s"]), sorted(node_names))
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndOpTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -319,6 +327,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
         ["DebugIdentity"], ["file:///tmp/tfdbg_1"])
     self.assertEqual(["s"], node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_tensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
@@ -335,6 +344,7 @@ class DebugUtilsTest(test_util.TensorFlowTestCase):
     self.assertNotIn("b/Assign", node_names)
     self.assertIn("s", node_names)
 
+  @test_util.run_v1_only("b/120545219")
   def testWatchGraph_nodeNameAndTensorDTypeBlacklists(self):
     debug_utils.watch_graph_with_blacklists(
         self._run_options,
diff --git a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
index 74498c8ea3dd494cd8fc6237b60b11a202497990..2405e29aaa51c2e0c422fa6f950ec46553ae75c0 100644
--- a/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/dist_session_debug_grpc_test.py
@@ -44,6 +44,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class DistributedSessionDebugTest(test_util.TensorFlowTestCase):
   """Test the debugging of distributed sessions."""
 
diff --git a/tensorflow/python/debug/lib/session_debug_file_test.py b/tensorflow/python/debug/lib/session_debug_file_test.py
index 1874160dd637596ffb8a935148baf1f308de0210..16ab815d92ddffe2108776388f668427fd140f06 100644
--- a/tensorflow/python/debug/lib/session_debug_file_test.py
+++ b/tensorflow/python/debug/lib/session_debug_file_test.py
@@ -28,11 +28,13 @@ from tensorflow.python.debug.lib import debug_utils
 from tensorflow.python.debug.lib import session_debug_testlib
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugFileTest(session_debug_testlib.SessionDebugTestBase):
 
   def _debug_urls(self, run_number=None):
diff --git a/tensorflow/python/debug/lib/session_debug_grpc_test.py b/tensorflow/python/debug/lib/session_debug_grpc_test.py
index bfc9a3a382744676fafe9f280ab54f8dee3fedcb..472e2449156fefc2c00bb4079018de224097692e 100644
--- a/tensorflow/python/debug/lib/session_debug_grpc_test.py
+++ b/tensorflow/python/debug/lib/session_debug_grpc_test.py
@@ -91,6 +91,7 @@ class GrpcDebugServerTest(test_util.TensorFlowTestCase):
     server.stop_server().wait()
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcTest(session_debug_testlib.SessionDebugTestBase):
 
   @classmethod
@@ -353,6 +354,7 @@ class SessionDebugConcurrentTest(
     return urls
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
   """Test server gating of debug ops."""
 
@@ -730,6 +732,7 @@ class SessionDebugGrpcGatingTest(test_util.TensorFlowTestCase):
       self.assertEqual("DebugNumericSummary", debug_watch.debug_op)
 
 
+@test_util.run_v1_only("b/120545219")
 class DelayedDebugServerTest(test_util.TensorFlowTestCase):
 
   def testDebuggedSessionRunWorksWithDelayedDebugServerStartup(self):
diff --git a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
index b0dc25851ca3101a48543aeca1325fa155dd29b7..8eef45392f2fb56bc57b6bd6156f9fed8a93cd1f 100644
--- a/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
+++ b/tensorflow/python/debug/lib/session_debug_multi_gpu_test.py
@@ -67,7 +67,7 @@ class SessionDebugMultiGPUTest(test_util.TensorFlowTestCase):
         u1 = math_ops.multiply(v, v, name="u1")
       w = math_ops.subtract(u1, u0, name="w")
 
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(run_options, sess.graph,
diff --git a/tensorflow/python/debug/lib/session_debug_testlib.py b/tensorflow/python/debug/lib/session_debug_testlib.py
index 25ef91b575957164691bccd9d15107d9a4812eac..5165febff52506d07e2d3b0aea361c31567cc419 100644
--- a/tensorflow/python/debug/lib/session_debug_testlib.py
+++ b/tensorflow/python/debug/lib/session_debug_testlib.py
@@ -84,6 +84,7 @@ class _RNNCellForTest(rnn_cell_impl.RNNCell):
     return (math_ops.multiply(self._w, input_), state)
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionDebugTestBase(test_util.TensorFlowTestCase):
   """Base class for unit tests of tfdbg running with tf.Session."""
 
diff --git a/tensorflow/python/debug/lib/source_utils_test.py b/tensorflow/python/debug/lib/source_utils_test.py
index 4a8d4eaa99f28db26f05a00e7759c79699ca9ab4..4f4aea032132d09f025392587038b79d7f0804c5 100644
--- a/tensorflow/python/debug/lib/source_utils_test.py
+++ b/tensorflow/python/debug/lib/source_utils_test.py
@@ -65,6 +65,7 @@ class GuessIsTensorFlowLibraryTest(test_util.TensorFlowTestCase):
     self.assertTrue(
         source_utils.guess_is_tensorflow_py_library(source_utils.__file__))
 
+  @test_util.run_deprecated_v1
   def testFileInPythonKernelsPathReturnsTrue(self):
     x = constant_op.constant(42.0, name="x")
     self.assertTrue(
@@ -109,8 +110,8 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
       self.w = math_ops.matmul(self.u, self.v, name="w")
       self.w_line_number = line_number_above()
 
-      sess.run(self.u.initializer)
-      sess.run(self.v.initializer)
+      self.evaluate(self.u.initializer)
+      self.evaluate(self.v.initializer)
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       debug_utils.watch_graph(
@@ -215,6 +216,7 @@ class SourceHelperTest(test_util.TensorFlowTestCase):
     os.remove(unrelated_source_path)
 
 
+@test_util.run_v1_only("b/120545219")
 class ListSourceAgainstDumpTest(test_util.TensorFlowTestCase):
 
   def createAndRunGraphWithWhileLoop(self):
diff --git a/tensorflow/python/debug/lib/stepper_test.py b/tensorflow/python/debug/lib/stepper_test.py
index 3839c671982f80158273ea40de73ff920306316d..9e78e207b80a99f3812c5909cf3753d90eab3680 100644
--- a/tensorflow/python/debug/lib/stepper_test.py
+++ b/tensorflow/python/debug/lib/stepper_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import gradient_descent
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -443,6 +444,7 @@ class StepperTest(test_util.TensorFlowTestCase):
           self.assertAllClose(-4.0, result["fz"]["z"])
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -577,6 +579,7 @@ class StepperTestWithPlaceHolders(test_util.TensorFlowTestCase):
       self.assertAllClose([[-1.0], [6.0]], stepper.finalize())
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperAssignAddTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -692,6 +695,7 @@ class StepperAssignAddTest(test_util.TensorFlowTestCase):
       self.assertAllClose(12.0, stepper.cont(self.v))
 
 
+@test_util.run_v1_only("b/120545219")
 class StepperBackwardRunTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/disk_usage_test.py b/tensorflow/python/debug/wrappers/disk_usage_test.py
index 0874525966ceb34b9cb99df9affd63cf1865b663..88b1cd540de7a6a56db6e5165be53ae8c9c2df26 100644
--- a/tensorflow/python/debug/wrappers/disk_usage_test.py
+++ b/tensorflow/python/debug/wrappers/disk_usage_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperDiskUsageLimitTest(test_util.TensorFlowTestCase):
 
   @classmethod
diff --git a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
index 11011a5c1342b281ab86c7f861d895f570bd037d..42e3b09382d825840ea12eeaf2baf35f33c17da9 100644
--- a/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/dumping_wrapper_test.py
@@ -41,6 +41,7 @@ from tensorflow.python.platform import googletest
 from tensorflow.python.training import monitored_session
 
 
+@test_util.run_v1_only("b/120545219")
 class DumpingDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/debug/wrappers/framework_test.py b/tensorflow/python/debug/wrappers/framework_test.py
index 73e08ce7d5969de2ae54e2505fa7b449bfaf631a..a50fa7cf4b870868a61ea4df173fc24bc8a8e110 100644
--- a/tensorflow/python/debug/wrappers/framework_test.py
+++ b/tensorflow/python/debug/wrappers/framework_test.py
@@ -141,6 +141,7 @@ class TestDebugWrapperSessionBadAction(framework.BaseDebugWrapperSession):
     return framework.OnRunEndResponse()
 
 
+@test_util.run_v1_only("b/120545219")
 class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def _no_rewrite_session_config(self):
@@ -339,7 +340,7 @@ class DebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
     with wrapper.as_default():
       foo = constant_op.constant(42, name="foo")
-      self.assertEqual(42, foo.eval())
+      self.assertEqual(42, self.evaluate(foo))
       self.assertEqual(foo, self._observer["run_fetches"])
 
   def testWrapperShouldSupportSessionClose(self):
diff --git a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
index 149a7497df8fecc19a665afc1483ad55c890c335..e38df861f5b633baf94c99e4892e1bd90943337d 100644
--- a/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
+++ b/tensorflow/python/debug/wrappers/local_cli_wrapper_test.py
@@ -127,6 +127,7 @@ class LocalCLIDebuggerWrapperSessionForTest(
         return e.exit_token
 
 
+@test_util.run_v1_only("b/120545219")
 class LocalCLIDebugWrapperSessionTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
diff --git a/tensorflow/python/distribute/BUILD b/tensorflow/python/distribute/BUILD
index 1c3d8ea67e524775e26c2fcb41af4349b0706353..887c61cb8fd81c6be4d20ba6b25c2997cea8cb7f 100644
--- a/tensorflow/python/distribute/BUILD
+++ b/tensorflow/python/distribute/BUILD
@@ -7,15 +7,152 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "all_reduce",
+    srcs = [
+        "all_reduce.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
+    ],
+)
+
+tf_py_test(
+    name = "all_reduce_test",
+    srcs = ["all_reduce_test.py"],
+    additional_deps = [
+        ":all_reduce",
+        "//third_party/py/numpy",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:state_ops",
+    ],
+)
+
+py_library(
+    name = "cross_device_ops",
+    srcs = ["cross_device_ops.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":cross_device_utils",
+        ":device_util",
+        ":reduce_util",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:device_lib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python/eager:context",
+        "@six_archive//:six",
+    ],
+)
+
+py_library(
+    name = "cross_device_utils",
+    srcs = ["cross_device_utils.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":all_reduce",
+        ":values",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:collective_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:nccl_ops",
+    ],
+)
+
+py_library(
+    name = "device_util",
+    srcs = ["device_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:device",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/eager:context",
+    ],
+)
+
+cuda_py_test(
+    name = "device_util_test",
+    srcs = ["device_util_test.py"],
+    additional_deps = [
+        ":device_util",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+    ],
+)
 
 py_library(
     name = "distribute",
+    srcs = [
+        "__init__.py",
+    ],
     srcs_version = "PY2AND3",
-    visibility = ["//visibility:public"],
     deps = [
-        ":distribute_config",
-        ":distribute_coordinator",
-        ":distribute_coordinator_context",
+        ":distribute_lib",
+        ":mirrored_strategy",
+    ],
+)
+
+py_library(
+    name = "distribute_lib",
+    srcs = [
+        "distribute_lib.py",
+        "distribution_strategy_context.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":device_util",
+        ":reduce_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/data",
+        "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib",
+        "//tensorflow/python/ops/losses",
+        "//tensorflow/tools/docs:doc_controls",
+    ],
+)
+
+py_test(
+    name = "distribute_lib_test",
+    size = "small",
+    srcs = ["distribute_lib_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":distribute_lib",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:variable_scope",
     ],
 )
 
@@ -45,7 +182,6 @@ py_library(
 
 py_test(
     name = "distribute_coordinator_test",
-    size = "large",
     srcs = ["distribute_coordinator_test.py"],
     srcs_version = "PY2AND3",
     tags = [
@@ -76,6 +212,35 @@ py_library(
     deps = [],
 )
 
+py_library(
+    name = "mirrored_strategy",
+    srcs = ["mirrored_strategy.py"],
+    deps = [
+        ":cross_device_ops",
+        ":device_util",
+        ":distribute_lib",
+        ":multi_worker_util",
+        ":reduce_util",
+        ":shared_variable_creator",
+        ":values",
+        "//tensorflow/core:protos_all_py",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:device",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:pywrap_tensorflow",
+        "//tensorflow/python:tensor_util",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:tape",
+    ],
+)
+
 py_library(
     name = "multi_worker_util",
     srcs = [
@@ -88,6 +253,35 @@ py_library(
     ],
 )
 
+py_library(
+    name = "input_ops",
+    srcs = ["input_ops.py"],
+    deps = [
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python/data/experimental/ops:filter_for_shard_ops",
+        "//tensorflow/python/data/util:nest",
+    ],
+)
+
+cuda_py_test(
+    name = "input_ops_test",
+    srcs = ["input_ops_test.py"],
+    additional_deps = [
+        ":input_ops",
+        "//tensorflow/python/data/ops:dataset_ops",
+        "//tensorflow/python/data/ops:readers",
+        "//tensorflow/python:errors",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:io_ops",
+        "//tensorflow/python:util",
+    ],
+    tags = [
+        "no_pip",
+    ],
+)
+
 py_test(
     name = "multi_worker_util_test",
     srcs = ["multi_worker_util_test.py"],
@@ -120,3 +314,49 @@ py_library(
         "//tensorflow/python:training",
     ],
 )
+
+py_library(
+    name = "reduce_util",
+    srcs = ["reduce_util.py"],
+    deps = [
+        "//tensorflow/python:util",
+        "//tensorflow/python:variable_scope",
+    ],
+)
+
+py_library(
+    name = "shared_variable_creator",
+    srcs = ["shared_variable_creator.py"],
+)
+
+py_test(
+    name = "shared_variable_creator_test",
+    srcs = ["shared_variable_creator_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":shared_variable_creator",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:variable_scope",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "values",
+    srcs = ["values.py"],
+    deps = [
+        ":device_util",
+        ":distribute_lib",
+        ":input_ops",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:training",
+        "//tensorflow/python:util",
+        "//tensorflow/python/data/ops:multi_device_iterator_ops",
+        "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py b/tensorflow/python/distribute/__init__.py
similarity index 53%
rename from tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
rename to tensorflow/python/distribute/__init__.py
index 7894418c4a16063da88710b43bbbbeb191fc1a2d..4ff912ae10d8336cfeeb42d060bd0d9c52e24482 100644
--- a/tensorflow/contrib/estimator/python/estimator/dnn_linear_combined.py
+++ b/tensorflow/python/distribute/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,23 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""dnn_linear_combined python module.
-
-Importing from tensorflow.python.estimator is unsupported
-and will soon break!
-"""
-# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+"""Distribution Strategy library."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow_estimator.contrib.estimator.python.estimator import dnn_linear_combined
-
-# Include attrs that start with single underscore.
-_HAS_DYNAMIC_ATTRIBUTES = True
-dnn_linear_combined.__all__ = [
-    s for s in dir(dnn_linear_combined) if not s.startswith('__')
-]
-
-from tensorflow_estimator.contrib.estimator.python.estimator.dnn_linear_combined import *
+# pylint: disable=unused-import
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import mirrored_strategy
+# pylint: enable=unused-import
diff --git a/tensorflow/python/distribute/all_reduce.py b/tensorflow/python/distribute/all_reduce.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd7c45ae27ac2d093c7feaf9d490ffa074533ddc
--- /dev/null
+++ b/tensorflow/python/distribute/all_reduce.py
@@ -0,0 +1,860 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities to construct a TF subgraph implementing distributed All-Reduce."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import math
+
+from tensorflow.python.framework import device as device_lib
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nccl_ops
+
+
+def _flatten_tensors(tensors):
+  """Check tensors for isomorphism and flatten.
+
+  Args:
+    tensors: list of T `tf.Tensor` which must all have the same shape.
+
+  Returns:
+    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
+    shape: the original shape of each element of input tensors
+
+  Raises:
+    ValueError: tensors are empty or non-isomorphic or have unknown shape.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  for tensor in tensors:
+    shape = shape.merge_with(tensor.shape)
+  if not shape.is_fully_defined():
+    raise ValueError("Tensors must have statically known shape.")
+  if len(shape) != 1:
+    reshaped = []
+    for t in tensors:
+      with ops.colocate_with(t):
+        reshaped.append(array_ops.reshape(t, [-1]))
+    tensors = reshaped
+  return tensors, shape
+
+
+def _reshape_tensors(tensors, shape):
+  """Reshape tensors flattened by _flatten_tensors.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    shape: list of integers describing the desired shape.  Product of
+      the elements must equal the length of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the reshaped inputs.
+  """
+  reshaped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      reshaped.append(array_ops.reshape(t, shape))
+  return reshaped
+
+
+def _padded_split(tensor, pieces):
+  """Like split for 1D tensors but pads-out case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      thin input tensor, in order.  The final tensor may
+      be zero-padded on the end to make its size equal to those of all
+      of the other tensors.
+
+  Raises:
+    ValueError: The input tensor is not 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  with ops.colocate_with(tensor):
+    if tensor_len % pieces != 0:
+      # pad to an even length
+      chunk_size = 1 + tensor_len // pieces
+      if pieces > tensor_len:
+        # This is an edge case that should not come up in practice,
+        # i.e. a different reduction algorithm would be better,
+        # but we'll make it work just for completeness.
+        pad_len = pieces - tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      elif (pieces - 1) * chunk_size >= tensor_len:
+        # Another edge case of limited real interest.
+        pad_len = (pieces * chunk_size) % tensor_len
+        extended_whole = array_ops.concat(
+            [tensor, array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        parts = array_ops.split(extended_whole, pieces)
+        return parts, pad_len
+      else:
+        last_chunk_size = tensor_len - (pieces - 1) * chunk_size
+        pad_len = chunk_size - last_chunk_size
+        piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+        parts = array_ops.split(tensor, piece_lens)
+        parts[-1] = array_ops.concat(
+            [parts[-1], array_ops.zeros([pad_len], dtype=tensor.dtype)], 0)
+        return parts, pad_len
+    else:
+      return array_ops.split(tensor, pieces), 0
+
+
+def _strip_padding(tensors, pad_len):
+  """Strip the suffix padding added by _padded_split.
+
+  Args:
+    tensors: list of T `tf.Tensor` of identical length 1D tensors.
+    pad_len: number of elements to be stripped from the end of each tensor.
+
+  Returns:
+    list of T `tf.Tensor` which are the stripped inputs.
+
+  Raises:
+    ValueError: tensors must be a non-empty list of 1D tensors, and
+      each must be longer than pad_len.
+  """
+  if not tensors:
+    raise ValueError("tensors cannot be empty")
+  shape = tensors[0].shape
+  if len(shape) > 1:
+    raise ValueError("tensors must be 1D")
+  prefix_len = int(shape[0] - pad_len)
+  if prefix_len < 0:
+    raise ValueError("pad_len longer than tensor")
+  stripped = []
+  for t in tensors:
+    with ops.colocate_with(t):
+      stripped.append(array_ops.slice(t, [0], [prefix_len]))
+  return stripped
+
+
+def _ragged_split(tensor, pieces):
+  """Like split for 1D tensors but allows case where len % pieces != 0.
+
+  Args:
+    tensor: T `tf.Tensor` that must be 1D.
+    pieces: a positive integer specifying the number of pieces into which
+      tensor should be split.
+
+  Returns:
+    list of T `tf.Tensor` of length pieces, which hold the values of
+      the input tensor, in order.  The final tensor may be shorter
+      than the others, which will all be of equal length.
+
+  Raises:
+    ValueError: input tensor must be 1D.
+  """
+  shape = tensor.shape
+  if 1 != len(shape):
+    raise ValueError("input tensor must be 1D")
+  tensor_len = shape.dims[0].value
+  chunk_size = tensor_len // pieces
+  with ops.colocate_with(tensor):
+    if tensor_len != (pieces * chunk_size):
+      # last piece will be short
+      assert pieces > 1
+      last_chunk_size = tensor_len - ((pieces - 1) * chunk_size)
+      assert last_chunk_size > 0
+      piece_lens = [chunk_size for _ in range(pieces - 1)] + [last_chunk_size]
+      return array_ops.split(tensor, piece_lens)
+    else:
+      return array_ops.split(tensor, pieces)
+
+
+def _ring_permutations(num_workers, num_subchunks, gpu_perm):
+  """"Generate an array of device index arrays, one for each subchunk.
+
+  In the basic ring reduction algorithm there are size(T)/num_devices
+  data chunks and each device process one chunk per tick, i.e. sending
+  one chunk and receiving one chunk.  The idea of subchunking is that
+  each device processes num_subchunks smaller data regions per tick,
+  and the ring rank permutation is different for each subchunk index
+  so that a device is potentially sending to and receiving from
+  num_subchunks different other devices at each tick.  Where multiple
+  independent data channels exist between devices, this strategy
+  supplies a method of using them in parallel.
+
+  Args:
+    num_workers: number of worker tasks
+    num_subchunks: number of subchunks into which to divide each per-GPU chunk.
+    gpu_perm: an array of integers in [0, num_gpus-1] giving the default
+      ring order of GPUs at each worker.  Other permutations will be generated
+      by rotating this array and splicing together per-worker instances.
+
+  Raises:
+    ValueError: the number of subchunks may not exceed the number of GPUs.
+
+  Returns:
+    pred_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+        preceding device in the permutation for that subchunk.  The
+        device index of GPU i at worker j is i + (j * num_gpus).
+    rank_by_s_d: list of lists that maps (by index) from (subchunk, dev) to
+       local rank of device d in the permutation for that subchunk.
+  """
+  num_gpus = len(gpu_perm)
+  devices = num_workers * num_gpus
+  if devices == 0:
+    return [], []
+  if num_subchunks > num_gpus:
+    raise ValueError(
+        "num_subchunks %d must be <= num_gpus %d" % (num_subchunks, num_gpus))
+  rotation_interval = max(1, int(num_gpus / num_subchunks))
+  perms_by_s = []
+  for s in range(0, num_subchunks):
+    full_order = []
+    offset = s * rotation_interval
+    for w in range(0, num_workers):
+      default_order = [(w * num_gpus) + i for i in gpu_perm]
+      dev_order = default_order[offset:] + default_order[:offset]
+      full_order += dev_order
+    perms_by_s.append(full_order)
+  pred_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  rank_by_s_d = [[-1 for d in range(0, devices)]
+                 for s in range(0, num_subchunks)]
+  for s in range(0, num_subchunks):
+    for d in range(0, devices):
+      for t in range(0, devices):
+        if d == perms_by_s[s][t]:
+          rank_by_s_d[s][d] = t
+          pred_by_s_d[s][d] = perms_by_s[s][(t + devices - 1) % devices]
+          break
+  return (pred_by_s_d, rank_by_s_d)
+
+
+def build_ring_all_reduce(input_tensors, num_workers, num_subchunks,
+                          gpu_perm, red_op, un_op=None):
+  """Construct a subgraph performing a ring-style all-reduce of input_tensors.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` objects, which must all
+      have the same shape and type.
+    num_workers: number of worker tasks spanned by input_tensors.
+    num_subchunks: number of subchunks each device should process in one tick.
+    gpu_perm: a list of ints giving a ring-wise rank ordering of GPUs at
+      each worker.  All workers must have the same number of
+      GPUs with the same rank ordering.  If NVLINK is available, this should
+      be a ring order supported by NVLINK edges.
+    red_op: a binary operator for elementwise reduction.
+    un_op: an optional unary operator to apply to fully reduced values.
+
+  Raises:
+    ValueError: empty input_tensors or they don't all have same
+    size.
+
+  Returns:
+    a list of T `tf.Tensor` identical sum-reductions of input_tensors.
+  """
+  if len(input_tensors) < 2:
+    raise ValueError("input_tensors must be length 2 or longer")
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  (pred_by_s_d, rank_by_s_d) = _ring_permutations(
+      num_workers, num_subchunks, gpu_perm)
+  chunks_by_dev, pad_len = _build_ring_gather(
+      input_tensors, devices,
+      num_subchunks, pred_by_s_d, rank_by_s_d, red_op)
+  if un_op:
+    chunks_by_dev = _apply_unary_to_chunks(un_op, chunks_by_dev)
+  output_tensors = _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                                       chunks_by_dev)
+  if pad_len > 0:
+    output_tensors = _strip_padding(output_tensors, pad_len)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_ring_gather(input_tensors, devices, num_subchunks,
+                       pred_by_s_d, rank_by_s_d, red_op):
+  """Construct a subgraph for the first (reduction) pass of ring all-reduce.
+
+  Args:
+    input_tensors: a list of T `tf.Tensor` 1D input tensors of same
+      shape and type.
+    devices: array of device name strings
+    num_subchunks: number of subchunks each device should process in one tick.
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    red_op: a binary operator for elementwise reduction
+
+  Raises:
+    ValueError: tensors must all be one dimensional.
+
+  Returns:
+    list of list of T `tf.Tensor` of (partially) reduced values where
+    exactly num_subchunks chunks at each device are fully reduced.
+  """
+  num_devices = len(input_tensors)
+  if num_devices == 0:
+    return []
+  if num_devices == 1:
+    return input_tensors
+  shape = input_tensors[0].shape
+  if 1 != len(shape):
+    raise ValueError("input tensors must be 1D")
+  num_chunks = num_devices * num_subchunks
+  num_ticks = num_devices - 1
+  # Initialize chunks_by_dev with splits of the input tensors.
+  chunks_by_dev = []
+  split_pad_len = 0
+  for d in range(0, num_devices):
+    with ops.device(devices[d]):
+      splits, split_pad_len = _padded_split(input_tensors[d], num_chunks)
+      chunks_by_dev.append(splits)
+  # Reduction phase
+  for tick in range(0, num_ticks):
+    # One new partial reduction for every chunk
+    new_partial_reductions = [None for _ in range(0, num_chunks)]
+    # Compute reductions with respect to last tick's values
+    for d in range(0, num_devices):
+      with ops.device(devices[d]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (2 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          new_partial_reductions[chunk_index] = red_op(
+              chunks_by_dev[pred_dev][chunk_index],
+              chunks_by_dev[d][chunk_index])
+    # Update chunks_by_dev with the new values at the end of the tick.
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (2 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = new_partial_reductions[chunk_index]
+  return chunks_by_dev, split_pad_len
+
+
+def _apply_unary_to_chunks(f, chunks_by_dev):
+  """Apply a unary op to each tensor in chunks_by_dev, on same device.
+
+  Args:
+    f: a unary function over T `tf.Tensor`.
+    chunks_by_dev: list of lists of T `tf.Tensor`.
+
+  Returns:
+    new list of lists of T `tf.Tensor` with the same structure as
+    chunks_by_dev containing the derived tensors.
+  """
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append([f(t) for t in x])
+  return output
+
+
+def _build_ring_scatter(pred_by_s_d, rank_by_s_d,
+                        chunks_by_dev):
+  """Construct subgraph for second (scatter) pass of ring all-reduce.
+
+  Args:
+    pred_by_s_d: as produced by _ring_permutations
+    rank_by_s_d: as produced by _ring_permutations
+    chunks_by_dev: list of list of T `tf.Tensor` indexed by ints
+      (device, chunk)
+
+  Raises:
+    ValueError: chunks_by_dev is not well-formed
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device corresponding to the outer dimension of chunks_by_dev.
+  """
+  num_devices = len(chunks_by_dev)
+  num_chunks = len(chunks_by_dev[0])
+  if 0 != num_chunks % num_devices:
+    raise ValueError(
+        "Expect number of chunks per device to be divisible by num_devices")
+  num_subchunks = int(num_chunks / num_devices)
+  num_ticks = num_devices - 1
+  for tick in range(0, num_ticks):
+    passed_values = [None for _ in range(0, num_chunks)]
+    for d in range(0, num_devices):
+      with ops.colocate_with(chunks_by_dev[d][0]):
+        for s in range(0, num_subchunks):
+          rank = rank_by_s_d[s][d]
+          seg_index = (rank + num_devices - (1 + tick)) % num_devices
+          pred_dev = pred_by_s_d[s][d]
+          chunk_index = (seg_index * num_subchunks) + s
+          passed_values[chunk_index] = array_ops.identity(
+              chunks_by_dev[pred_dev][chunk_index])
+    for d in range(0, num_devices):
+      for s in range(0, num_subchunks):
+        rank = rank_by_s_d[s][d]
+        seg_index = (rank + num_devices - (1 + tick)) % num_devices
+        chunk_index = (seg_index * num_subchunks) + s
+        chunks_by_dev[d][chunk_index] = passed_values[chunk_index]
+  # Join chunks at each device.
+  output = []
+  for x in chunks_by_dev:
+    with ops.colocate_with(x[0]):
+      output.append(array_ops.concat(x, 0))
+  return output
+
+
+def build_recursive_hd_all_reduce(input_tensors, red_op, un_op=None):
+  """Construct a subgraph for recursive halving-doubling all-reduce.
+
+  The recursive halving-doubling algorithm is described in
+  http://www.mcs.anl.gov/~thakur/papers/ijhpca-coll.pdf
+
+  The concept is to arrange the participating n devices in
+  a linear sequence where devices exchange data pairwise
+  with one other device in each round.  During the gather
+  phase there are lg(n) rounds where devices exchange
+  increasingly smaller sub-tensors with another device
+  at increasingly greater distances, until at the top
+  each device has 1/n of the fully reduced values.  During the
+  scatter phase each device exchanges its fully reduced
+  sub-tensor (which doubles in length at each round)
+  with one other device at increasingly smaller distances
+  until each device has all of the fully reduced values.
+
+  Note: this preliminary version requires that len(input_tensors) be a
+    power of 2.  TODO(tucker): relax this restriction.  Also, the
+    number of elements in each tensor must be divisible by 2^h where h
+    is the number of hops in each phase.  This will also be relaxed in
+    the future with edge-case specific logic.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    red_op: a binary elementwise reduction Op.
+    un_op: an optional unary elementwise Op to apply to reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors, one
+    at each device of input_tensors.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  devices = [t.device for t in input_tensors]
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  reduced_shards = _build_recursive_hd_gather(input_tensors, devices, red_op)
+  if un_op:
+    reduced_shards = [un_op(t) for t in reduced_shards]
+  output_tensors = _build_recursive_hd_scatter(reduced_shards, devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_recursive_hd_gather(input_tensors, devices, red_op):
+  """Construct the gather phase of recursive halving-doubling all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` to be elementwise reduced.
+    devices: a list of strings naming the devices hosting input_tensors,
+      which will also be used to host the (partial) reduction values.
+    red_op: a binary elementwise reduction Op.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensor shards.
+
+  Raises:
+    ValueError: num_devices not a power of 2, or tensor len not divisible
+    by 2 the proper number of times.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  if num_devices != (2 ** num_hops):
+    raise ValueError("num_devices must be a power of 2")
+  chunks = input_tensors
+  for h in range(0, num_hops):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_dev = devices[d]
+      right_dev = devices[d + span]
+      left_split = array_ops.split(chunks[d], 2)
+      right_split = array_ops.split(chunks[d+span], 2)
+      with ops.device(left_dev):
+        new_chunks[d] = red_op(left_split[0], right_split[0])
+      with ops.device(right_dev):
+        new_chunks[d + span] = red_op(left_split[1], right_split[1])
+    chunks = new_chunks
+  return chunks
+
+
+def _build_recursive_hd_scatter(input_tensors, devices):
+  """Construct the scatter phase of recursive halving-doublng all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` that are fully-reduced shards.
+    devices: a list of strings naming the devices on which the reconstituted
+      full tensors should be placed.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  num_devices = len(devices)
+  num_hops = int(math.log(num_devices, 2))
+  assert num_devices == (2 ** num_hops), "num_devices must be a power of 2"
+  chunks = input_tensors
+  for h in reversed(range(0, num_hops)):
+    span = 2 ** h
+    group_size = span * 2
+    new_chunks = [[] for _ in devices]
+    for d in range(0, num_devices):
+      if (d % group_size) >= (group_size / 2):
+        # skip right half of a pair
+        continue
+      left_idx = d
+      right_idx = d + span
+      left_dev = devices[left_idx]
+      right_dev = devices[right_idx]
+      with ops.device(left_dev):
+        new_chunks[left_idx] = array_ops.concat([chunks[left_idx],
+                                                 chunks[right_idx]], 0)
+      with ops.device(right_dev):
+        new_chunks[right_idx] = array_ops.concat([chunks[left_idx],
+                                                  chunks[right_idx]], 0)
+    chunks = new_chunks
+  return chunks
+
+
+def build_shuffle_all_reduce(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct a subgraph for shuffle all-reduce.
+
+  Shuffle reduce is essentially the algorithm implemented when using
+  parameter servers.  Suppose tensor length is n, there are d devices
+  and g gather shards.  Each device sends a n/g length sub-tensor to
+  each gather shard.  The gather shards perform a reduction across d
+  fragments, then broadcast the result back to each device.  The
+  devices then join the g fully reduced fragments they receive from
+  the shards.  The gather shards could perform d-1 pairwise
+  reductions, or one d-way reduction.  The first is better where
+  reduction Op time is low compared to transmission time, the second
+  better in the other case.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: an n-array elementwise reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced tensors.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  dst_devices = [t.device for t in input_tensors]
+  reduced_shards = _build_shuffle_gather(input_tensors, gather_devices,
+                                         red_op, un_op)
+  output_tensors = _build_shuffle_scatter(reduced_shards, dst_devices)
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
+  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.
+
+  Args:
+    input_tensors: list of T @(tf.Tensor} values to be reduced.
+    gather_devices: list of names of devices on which reduction shards
+      should be placed.
+    red_op: the binary reduction Op
+    un_op: optional elementwise unary Op to be applied to fully-reduced values.
+
+  Returns:
+    list of T `tf.Tensor` which are the fully reduced shards.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  num_source_devices = len(input_tensors)
+  num_gather_devices = len(gather_devices)
+  shape = input_tensors[0].shape
+  if len(shape) != 1:
+    raise ValueError("input_tensors must be 1D")
+  shards_by_source = []
+  for d in range(0, num_source_devices):
+    with ops.colocate_with(input_tensors[d]):
+      shards_by_source.append(
+          _ragged_split(input_tensors[d], num_gather_devices))
+  reduced_shards = []
+  for d in range(0, num_gather_devices):
+    with ops.device(gather_devices[d]):
+      values = [s[d] for s in shards_by_source]
+      red_shard = red_op(values)
+      if un_op:
+        red_shard = un_op(red_shard)
+      reduced_shards.append(red_shard)
+  return reduced_shards
+
+
+def _build_shuffle_scatter(reduced_shards, dst_devices):
+  """Build the scatter phase of shuffle all-reduce.
+
+  Args:
+    reduced_shards:  list of T @(tf.Tensor} fully reduced shards
+    dst_devices: list of names of devices at which the fully-reduced value
+      should be reconstituted.
+
+  Returns:
+    list of T `tf.Tensor` scattered tensors.
+  """
+  num_devices = len(dst_devices)
+  out_tensors = []
+  for d in range(0, num_devices):
+    with ops.device(dst_devices[d]):
+      out_tensors.append(array_ops.concat(reduced_shards, 0))
+  return out_tensors
+
+
+def _split_by_task(devices, values):
+  """Partition devices and values by common task.
+
+  Args:
+    devices: list of device name strings
+    values: list of T `tf.tensor` of same length as devices.
+
+  Returns:
+    (per_task_devices, per_task_values) where both values are
+    lists of lists with isomorphic structure: the outer list is
+    indexed by task, and the inner list has length of the number
+    of values belonging to that task.  per_task_devices contains
+    the specific devices to which the values are local, and
+    per_task_values contains the corresponding values.
+
+  Raises:
+    ValueError: devices must be same length as values.
+  """
+  num_devices = len(devices)
+  if num_devices != len(values):
+    raise ValueError("len(devices) must equal len(values)")
+  per_task_devices = collections.OrderedDict()
+  per_task_values = collections.OrderedDict()
+  for d in range(num_devices):
+    d_spec = device_lib.DeviceSpec.from_string(devices[d])
+    if not hasattr(d_spec, "task") or d_spec.task is None:
+      assert False, "failed to parse device %s" % devices[d]
+    index = (d_spec.job or "localhost", d_spec.replica or 0, d_spec.task)
+    if index not in per_task_devices:
+      per_task_devices[index] = []
+      per_task_values[index] = []
+    per_task_devices[index].append(devices[d])
+    per_task_values[index].append(values[d])
+
+  return (list(per_task_devices.values()), list(per_task_values.values()))
+
+
+def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
+  """Build a subgraph that does one full all-reduce, using NCCL.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.  Must be one of
+      {tf.add}
+    un_op: optional unary elementwise Op to apply to fully-reduce values.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: red_op not supported.
+  """
+  if red_op == math_ops.add:
+    output_tensors = nccl_ops.all_sum(input_tensors)
+  else:
+    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
+  if un_op:
+    un_op_wrapped = []
+    for t in output_tensors:
+      with ops.colocate_with(t):
+        un_op_wrapped.append(un_op(t))
+    output_tensors = un_op_wrapped
+  return output_tensors
+
+
+def _build_nccl_hybrid(input_tensors, red_op, upper_level_f):
+  """Construct a subgraph for NCCL hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = [None for w in range(0, num_workers)]
+  up_devices = up_values[:]
+  down_values = up_values[:]
+  # First stage: reduce within each worker using NCCL
+  for w in range(0, num_workers):
+    worker_values = build_nccl_all_reduce(per_worker_values[w], red_op)
+    # NOTE: these reductions will not run to completion unless
+    # every output value is used.  Since we only need one, we
+    # need to put control dependencies on the rest.
+    with ops.control_dependencies(worker_values):
+      with ops.device(worker_values[0].device):
+        up_values[w] = array_ops.identity(worker_values[0])
+      up_devices[w] = per_worker_devices[w][0]
+  # Second stage: Apply upper_level_f to reduce across first device at
+  # each worker
+  level_2_output = upper_level_f(up_values)
+  # Third stage: propagate within each worker using NCCL Broadcast
+  for w in range(0, num_workers):
+    dst_tensors = []
+    with ops.device(per_worker_devices[w][0]):
+      broadcast_src = nccl_ops.broadcast(array_ops.identity(level_2_output[w]))
+    for d in per_worker_devices[w]:
+      with ops.device(d):
+        dst_tensors.append(array_ops.identity(broadcast_src))
+    down_values[w] = dst_tensors
+  output_tensors = [v for sublist in down_values for v in sublist]
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def _reduce_non_singleton(input_tensors, red_f, un_op):
+  """If len(input_tensors) > 1, apply red_f, else apply un_op."""
+  if len(input_tensors) > 1:
+    return red_f(input_tensors)
+  else:
+    if not un_op:
+      return input_tensors
+    output_tensors = []
+    for t in input_tensors:
+      with ops.colocate_with(t):
+        output_tensors.append(un_op(t))
+    return output_tensors
+
+
+def build_nccl_then_ring(input_tensors, subdiv, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Ring across workers."""
+  def upper_builder(y):
+    return build_ring_all_reduce(y, len(y), subdiv, [0], red_op, un_op)
+  def upper_level_f(x):
+    return _reduce_non_singleton(x, upper_builder, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_recursive_hd(input_tensors, red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Recursive-HD across workers."""
+  upper_level_f = lambda x: build_recursive_hd_all_reduce(x, red_op, un_op)
+  return _build_nccl_hybrid(input_tensors, red_op, upper_level_f)
+
+
+def build_nccl_then_shuffle(input_tensors, gather_devices, nccl_red_op,
+                            shuffle_red_op, un_op=None):
+  """Construct hybrid of NCCL within workers, Shuffle across workers."""
+  def upper_level_f(x):
+    return build_shuffle_all_reduce(x, gather_devices, shuffle_red_op, un_op)
+
+  return _build_nccl_hybrid(input_tensors, nccl_red_op, upper_level_f)
+
+
+def _build_shuffle_hybrid(input_tensors, gather_devices, red_op, upper_level_f):
+  """Construct a subgraph for Shuffle hybrid all-reduce.
+
+  Args:
+    input_tensors: list of T `tf.Tensor` of same-shape and type values to
+      be reduced.
+    gather_devices: list of device names on which to host gather shards.
+    red_op: binary elementwise reduction operator.
+    upper_level_f: function for reducing one value per worker, across
+      workers.
+
+  Returns:
+    list of T `tf.Tensor` of reduced values.
+
+  Raises:
+    ValueError: inputs not well-formed.
+  """
+  input_tensors, shape = _flatten_tensors(input_tensors)
+  # First stage, reduce across each worker using gather_devices.
+  devices = [t.device for t in input_tensors]
+  per_worker_devices, per_worker_values = _split_by_task(devices, input_tensors)
+  num_workers = len(per_worker_devices)
+  up_values = []
+  if len(gather_devices) != num_workers:
+    raise ValueError("For shuffle hybrid, gather_devices must contain one "
+                     "device per worker. ")
+  for w in range(0, num_workers):
+    reduced_shards = _build_shuffle_gather(
+        per_worker_values[w], [gather_devices[w]], red_op)
+    up_values.append(reduced_shards[0])
+  # Second stage, apply upper_level_f.
+  level_2_output = upper_level_f(up_values)
+  # Third stage, apply shuffle scatter at each worker.
+  output_tensors = []
+  for w in range(0, num_workers):
+    output_tensors += _build_shuffle_scatter(
+        [level_2_output[w]], per_worker_devices[w])
+  if len(shape) != 1:
+    output_tensors = _reshape_tensors(output_tensors, shape)
+  return output_tensors
+
+
+def build_shuffle_then_ring(input_tensors, gather_devices, subdiv,
+                            red_n_op, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Ring across workers."""
+  def upper_builder(tensors):
+    return build_ring_all_reduce(tensors, len(tensors), subdiv, [0],
+                                 red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, gather_devices, red_n_op, upper_level_f)
+
+
+def build_shuffle_then_shuffle(input_tensors, first_gather_devices,
+                               second_gather_devices, red_op, un_op=None):
+  """Construct hybrid of Shuffle within workers, Shuffle across workers."""
+  def upper_builder(tensors):
+    return build_shuffle_all_reduce(tensors, second_gather_devices,
+                                    red_op, un_op)
+  def upper_level_f(tensors):
+    return _reduce_non_singleton(tensors, upper_builder, un_op)
+  return _build_shuffle_hybrid(
+      input_tensors, first_gather_devices, red_op, upper_level_f)
diff --git a/tensorflow/contrib/all_reduce/python/all_reduce_test.py b/tensorflow/python/distribute/all_reduce_test.py
similarity index 97%
rename from tensorflow/contrib/all_reduce/python/all_reduce_test.py
rename to tensorflow/python/distribute/all_reduce_test.py
index 304fd7fb8a37f1aab91f47d754eb2efba81304a5..2c6b853124cf838d99da0628d8a610b74429e014 100644
--- a/tensorflow/contrib/all_reduce/python/all_reduce_test.py
+++ b/tensorflow/python/distribute/all_reduce_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tensorflow.contrib.all_reduce.python..all_reduce."""
+"""Tests for all_reduce."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,8 +22,8 @@ import time
 
 import numpy as np
 
-from tensorflow.contrib.all_reduce.python import all_reduce as ar
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import all_reduce as ar
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
@@ -37,6 +37,7 @@ from tensorflow.python.platform import tf_logging
 
 class AllReduceTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFlattenTensorsShapesDefined(self):
     x = array_ops.placeholder(types_pb2.DT_FLOAT, [None])
     with self.assertRaisesRegexp(ValueError,
@@ -100,6 +101,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
           input_tensors.append(array_ops.identity(t8))
     return input_tensors, device_names
 
+  @test_util.run_deprecated_v1
   def testBuildRingGatherPassStructure(self):
     # 1 worker, 1 device
     input_tensors, device_names = self._buildInput(1, 1)
@@ -159,7 +161,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
       output_tensors = build_f(input_tensors, un_op)
       sum_reduced = math_ops.add_n(output_tensors)
       sum_reduced.op.run()
-      self.assertAllClose(sum_reduced.eval(), simple_sum.eval())
+      self.assertAllClose(sum_reduced.eval(), self.evaluate(simple_sum))
 
   def _testRingAllReduce(self, num_workers, num_gpus, shape, subdiv):
     start_time = time.time()
@@ -170,6 +172,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "subdiv=%d elapsed=%f" %
                     (num_workers, num_gpus, shape, subdiv, elapsed))
 
+  @test_util.run_deprecated_v1
   def testRingAllReduce(self):
     self._testRingAllReduce(1, 2, [], 1)
     self._testRingAllReduce(1, 2, [8], 1)
@@ -199,6 +202,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
     tf_logging.info("ShuffleAllReduce num_workers=%d num_gpus=%d shape=%s "
                     "elapsed=%f" % (num_workers, num_gpus, shape, elapsed))
 
+  @test_util.run_deprecated_v1
   def testShuffleAllReduce(self):
     self._testShuffleAllReduce(1, 2, [], 1)
     self._testShuffleAllReduce(1, 2, [8], 1)
@@ -225,6 +229,7 @@ class AllReduceTest(test_util.TensorFlowTestCase):
                     "shape=%s elapsed=%f" %
                     (num_workers, num_gpus, shape, elapsed))
 
+  @test_util.run_deprecated_v1
   def testRecursiveHDAllReduce(self):
     self._testRecursiveHDAllReduce(1, 2, [8])
     self._testRecursiveHDAllReduce(1, 2, [4, 4])
diff --git a/tensorflow/python/distribute/cluster_resolver/BUILD b/tensorflow/python/distribute/cluster_resolver/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..360a2993cd9e01c59551a5a8177c8bec03133c45
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/BUILD
@@ -0,0 +1,180 @@
+# Description: Operations defined for Cluster Resolvers
+
+load("//tensorflow:tensorflow.bzl", "tf_py_test")
+
+package(
+    default_visibility = [
+        "//tensorflow:__subpackages__",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+py_library(
+    name = "cluster_resolver_lib",
+    srcs = [
+        "__init__.py",
+    ],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":base_cluster_resolver_py",
+        ":gce_cluster_resolver_py",
+        ":kubernetes_cluster_resolver_py",
+        ":slurm_cluster_resolver_py",
+        ":tfconfig_cluster_resolver_py",
+        ":tpu_cluster_resolver_py",
+        "//tensorflow/python:util",
+    ],
+)
+
+py_library(
+    name = "base_cluster_resolver_py",
+    srcs = ["cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "gce_cluster_resolver_py",
+    srcs = ["gce_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "tfconfig_cluster_resolver_py",
+    srcs = ["tfconfig_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "tpu_cluster_resolver_py",
+    srcs = ["tpu_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "slurm_cluster_resolver_py",
+    srcs = ["slurm_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+py_library(
+    name = "kubernetes_cluster_resolver_py",
+    srcs = ["kubernetes_cluster_resolver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:training_server_lib",
+    ],
+)
+
+tf_py_test(
+    name = "base_cluster_resolver_py_test",
+    srcs = ["cluster_resolver_test.py"],
+    additional_deps = [
+        ":base_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "gce_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["gce_cluster_resolver_test.py"],
+    additional_deps = [
+        ":gce_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "gce_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "tfconfig_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["tfconfig_cluster_resolver_test.py"],
+    additional_deps = [
+        ":tfconfig_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    grpc_enabled = True,
+    main = "tfconfig_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "tpu_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["tpu_cluster_resolver_test.py"],
+    additional_deps = [
+        ":tpu_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    grpc_enabled = True,
+    main = "tpu_cluster_resolver_test.py",
+)
+
+tf_py_test(
+    name = "slurm_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["slurm_cluster_resolver_test.py"],
+    additional_deps = [
+        ":slurm_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "slurm_cluster_resolver_test.py",
+    tags = [],
+)
+
+tf_py_test(
+    name = "kubernetes_cluster_resolver_py_test",
+    size = "small",
+    srcs = ["kubernetes_cluster_resolver_test.py"],
+    additional_deps = [
+        ":kubernetes_cluster_resolver_py",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:training_server_lib",
+    ],
+    main = "kubernetes_cluster_resolver_test.py",
+)
diff --git a/tensorflow/contrib/cluster_resolver/README.md b/tensorflow/python/distribute/cluster_resolver/README.md
similarity index 100%
rename from tensorflow/contrib/cluster_resolver/README.md
rename to tensorflow/python/distribute/cluster_resolver/README.md
diff --git a/tensorflow/contrib/cluster_resolver/python/training/README.slurm b/tensorflow/python/distribute/cluster_resolver/README.slurm
similarity index 100%
rename from tensorflow/contrib/cluster_resolver/python/training/README.slurm
rename to tensorflow/python/distribute/cluster_resolver/README.slurm
diff --git a/tensorflow/python/distribute/cluster_resolver/__init__.py b/tensorflow/python/distribute/cluster_resolver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef87f59b7fd7ef1774ed97370c75e16f3ec4e295
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/__init__.py
@@ -0,0 +1,57 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library Imports for Cluster Resolvers."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver import cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import gce_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import kubernetes_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import slurm_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tfconfig_cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver.kubernetes_cluster_resolver import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver.tpu_cluster_resolver import TPUClusterResolver
+
+from tensorflow.python.util.all_util import remove_undocumented
+
+_allowed_symbols = [
+    'cluster_resolver',
+    'gce_cluster_resolver',
+    'kubernetes_cluster_resolver',
+    'slurm_cluster_resolver',
+    'tfconfig_cluster_resolver',
+    'tpu_cluster_resolver',
+    'ClusterResolver',
+    'SimpleClusterResolver',
+    'UnionClusterResolver',
+    'GceClusterResolver',
+    'KubernetesClusterResolver',
+    'TFConfigClusterResolver',
+    'TPUClusterResolver',
+    'SlurmClusterResolver',
+]
+
+remove_undocumented(__name__, _allowed_symbols)
+
diff --git a/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca40e60a557d8fb1a5db8565369d1d1ae7e0c136
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver.py
@@ -0,0 +1,403 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Cluster Resolvers are used for dynamic cluster IP/hostname resolution."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import abc
+
+import six
+
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
+@six.add_metaclass(abc.ABCMeta)
+class ClusterResolver(object):
+  """Abstract class for all implementations of ClusterResolvers.
+
+  This defines the skeleton for all implementations of ClusterResolvers.
+  ClusterResolvers are a way for TensorFlow to communicate with various cluster
+  management systems (e.g. GCE, AWS, etc...).
+
+  By letting TensorFlow communicate with these systems, we will be able to
+  automatically discover and resolve IP addresses for various TensorFlow
+  workers. This will eventually allow us to automatically recover from
+  underlying machine failures and scale TensorFlow worker clusters up and down.
+
+  Note to Implementors: In addition to these abstract methods, you must also
+  implement the task_type, task_index, and rpc_layer attributes. You may choose
+  to implement them either as properties with getters or setters or directly
+  set the attributes.
+
+  - task_type is the name of the server's current named job (e.g. 'worker',
+     'ps' in a distributed parameterized training job).
+  - task_index is the ordinal index of the server within the task type.
+  - rpc_layer is the protocol used by TensorFlow to communicate with other
+      TensorFlow servers in a distributed environment.
+  """
+
+  @abc.abstractmethod
+  def cluster_spec(self):
+    """Retrieve the current state of the cluster and returns a ClusterSpec.
+
+    Returns:
+      A ClusterSpec representing the state of the cluster at the moment this
+      function is called.
+
+    Implementors of this function must take care in ensuring that the
+    ClusterSpec returned is up-to-date at the time of calling this function.
+    This usually means retrieving the information from the underlying cluster
+    management system every time this function is invoked and reconstructing
+    a cluster_spec, rather than attempting to cache anything.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Retrieves the name or URL of the session master.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+
+    Implementors of this function must take care in ensuring that the master
+    returned is up-to-date at the time to calling this function. This usually
+    means retrieving the master every time this function is invoked.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractmethod
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    """Returns the number of accelerator cores per worker.
+
+    This returns the number of accelerator cores (such as GPUs and TPUs)
+    available per worker. If workers only has CPU cores available, then this
+    should return 0. This method will query the master for this information
+    if it is not otherwise known.
+
+    Optionally, we allow callers to specify the task_type, task_index, and
+    rpc_layer, if they want to target a specific TensorFlow process to query
+    the number of accelerators. This is to support heterogenous environments,
+    where the number of accelerators cores per host is different.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the machine we
+        want to query.
+      task_index: (Optional) The index of the TensorFlow task of the machine we
+        want to query.
+      accelerator_type: (Optional) The type of accelerator we are trying to
+        query (defaults to 'GPU').
+      config_proto: (Optional) Configuration for starting a new session to
+        query how many accelerator cores it has.
+    """
+    raise NotImplementedError()
+
+  @abc.abstractproperty
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    raise NotImplementedError()
+
+
+class SimpleClusterResolver(ClusterResolver):
+  """Simple implementation of ClusterResolver that accepts a ClusterSpec."""
+
+  def __init__(self, cluster_spec, master='', task_type=None, task_index=None,
+               environment='', num_accelerators=0,
+               rpc_layer=None):
+    """Creates a SimpleClusterResolver from a ClusterSpec."""
+    super(SimpleClusterResolver, self).__init__()
+
+    self._task_type = task_type
+    self._task_index = task_index
+    self._environment = environment
+    self._num_accelerators = num_accelerators
+    self._rpc_layer = rpc_layer
+
+    if not isinstance(cluster_spec, ClusterSpec):
+      raise TypeError('cluster_spec must be a ClusterSpec.')
+    self._cluster_spec = cluster_spec
+
+    if not isinstance(master, str):
+      raise TypeError('master must be a string.')
+    self._master = master
+
+  def cluster_spec(self):
+    """Returns the ClusterSpec passed into the constructor."""
+    return self._cluster_spec
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC used by distributed TensorFlow.
+
+    Returns:
+      The name or URL of the session master.
+
+    If a task_type and task_index is given, this will override the `master`
+    string passed into the initialization function.
+    """
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+    else:
+      master = self._master
+
+    return format_master_url(master, rpc_layer=rpc_layer or self._rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    """Returns the number of accelerator cores per worker.
+
+    The SimpleClusterResolver does not do automatic detection of accelerators,
+    so a TensorFlow session will never be created, and thus all arguments are
+    unused and we simply return whatever was passed in when this object was
+    initialized.
+
+    Args:
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Unused.
+    """
+    # Unused
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+
+class UnionClusterResolver(ClusterResolver):
+  """Performs a union on underlying ClusterResolvers.
+
+  This class performs a union given two or more existing ClusterResolvers. It
+  merges the underlying ClusterResolvers, and returns one unified ClusterSpec
+  when cluster_spec is called. The details of the merge function is
+  documented in the cluster_spec function.
+
+  For additional Cluster Resolver properties such as task type, task index,
+  rpc layer, environment, etc..., we will return the value from the first
+  ClusterResolver in the union.
+  """
+
+  def __init__(self, *args, **kwargs):
+    """Initializes a UnionClusterResolver with other ClusterResolvers.
+
+    Args:
+      *args: `ClusterResolver` objects to be unionized.
+      **kwargs:
+        rpc_layer - (Optional) Override value for the RPC layer used by
+          TensorFlow.
+        task_type - (Optional) Override value for the current task type.
+        task_index - (Optional) Override value for the current task index.
+
+    Raises:
+      TypeError: If any argument is not a subclass of `ClusterResolvers`.
+      ValueError: If there are no arguments passed.
+    """
+    super(UnionClusterResolver, self).__init__()
+
+    self._rpc_layer = kwargs.pop('rpc_layer', None)
+    self._task_type = kwargs.pop('task_type', None)
+    self._task_index = kwargs.pop('task_index', None)
+
+    if kwargs:
+      raise ValueError('Unexpected kwargs provided {!r}'.format(kwargs))
+
+    if not args:
+      raise ValueError('At least one ClusterResolver is required.')
+
+    for cluster_resolver in args:
+      if not isinstance(cluster_resolver, ClusterResolver):
+        raise TypeError('All arguments must be a sub-class of '
+                        '`ClusterResolver.`')
+    self._cluster_resolvers = args
+
+  def cluster_spec(self):
+    """Returns a union of all the ClusterSpecs from the ClusterResolvers.
+
+    Returns:
+      A ClusterSpec containing host information merged from all the underlying
+      ClusterResolvers.
+
+    Raises:
+      KeyError: If there are conflicting keys detected when merging two or
+      more dictionaries, this exception is raised.
+
+    Note: If there are multiple ClusterResolvers exposing ClusterSpecs with the
+    same job name, we will merge the list/dict of workers.
+
+    If *all* underlying ClusterSpecs expose the set of workers as lists, we will
+    concatenate the lists of workers, starting with the list of workers from
+    the first ClusterResolver passed into the constructor.
+
+    If *any* of the ClusterSpecs expose the set of workers as a dict, we will
+    treat all the sets of workers as dicts (even if they are returned as lists)
+    and will only merge them into a dict if there is no conflicting keys. If
+    there is a conflicting key, we will raise a `KeyError`.
+    """
+
+    merged_cluster = {}
+
+    # We figure out whether it is all lists for a particular job, or whether
+    # there are dicts inside.
+    for cluster_resolver in self._cluster_resolvers:
+      cluster_spec = cluster_resolver.cluster_spec()
+      cluster_dict = cluster_spec.as_dict()
+
+      for job_name, tasks in cluster_dict.items():
+        if job_name in merged_cluster:
+          # If we see a dict, then we write a dict out regardless.
+          if isinstance(tasks, dict):
+            merged_cluster[job_name] = {}
+        else:
+          # We take whichever type is present.
+          if isinstance(tasks, list):
+            merged_cluster[job_name] = []
+          else:
+            merged_cluster[job_name] = {}
+
+    # We then do the merge as appropriate in merged_cluster[job].
+    for cluster_resolver in self._cluster_resolvers:
+      cluster_spec = cluster_resolver.cluster_spec()
+      cluster_dict = cluster_spec.as_dict()
+
+      for job_name, tasks in cluster_dict.items():
+        if isinstance(merged_cluster[job_name], list):
+          # We all have lists, we can just concatenate and be done.
+          merged_cluster[job_name].extend(tasks)
+        else:
+          if isinstance(tasks, list):
+            # We convert to a dictionary if the type is a list.
+            task_dict = dict(zip(range(0, len(tasks)), tasks))
+          else:
+            # We can simply make a copy (for update) and be done.
+            task_dict = tasks.copy()
+
+          # We detect if there are duplicates, and raise an error if so.
+          task_keys = set(task_dict)
+          merged_keys = set(merged_cluster[job_name].keys())
+          intersected_keys = task_keys.intersection(merged_keys)
+          if intersected_keys:
+            raise KeyError('Duplicate keys detected when merging two '
+                           'ClusterSpecs: %s' % repr(intersected_keys))
+
+          # We do the merge after all the processing.
+          merged_cluster[job_name].update(task_dict)
+
+    return ClusterSpec(merged_cluster)
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    This usually returns the master from the first ClusterResolver passed in,
+    but you can override this by specifying the task_type and task_index.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+    """
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      return format_master_url(master, rpc_layer or self._rpc_layer)
+
+    return self._cluster_resolvers[0].master(rpc_layer=rpc_layer)
+
+  @property
+  def task_type(self):
+    return self._task_type or self._cluster_resolvers[0].task_type
+
+  @property
+  def task_index(self):
+    return self._task_index or self._cluster_resolvers[0].task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._cluster_resolvers[0].environment
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    return self._cluster_resolvers[0].num_accelerators(
+        task_type, task_index, accelerator_type, config_proto)
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer or self._cluster_resolvers[0].rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
diff --git a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
similarity index 95%
rename from tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
index b94c9612b5bd4d92e84319f22932ce5599ba4b36..3f7b46972746f46ee866a5891ed2ca9ef0722a0c 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/cluster_resolver_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import SimpleClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
+from tensorflow.python.distribute.cluster_resolver import SimpleClusterResolver
+from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -65,13 +65,13 @@ class UnionClusterResolverTest(test.TestCase):
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
                                             task_index=1, environment="cloud",
-                                            num_accelerators_per_worker=8,
+                                            num_accelerators=8,
                                             rpc_layer="grpc")
 
     self.assertEqual(simple_resolver.task_type, "ps")
     self.assertEqual(simple_resolver.task_index, 1)
     self.assertEqual(simple_resolver.environment, "cloud")
-    self.assertEqual(simple_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(simple_resolver.num_accelerators(), 8)
     self.assertEqual(simple_resolver.rpc_layer, "grpc")
 
   def testOverrideSimpleClusterResolver(self):
@@ -82,7 +82,7 @@ class UnionClusterResolverTest(test.TestCase):
 
     simple_resolver = SimpleClusterResolver(base_cluster_spec, task_type="ps",
                                             task_index=1, environment="cloud",
-                                            num_accelerators_per_worker=8,
+                                            num_accelerators=8,
                                             rpc_layer="grpc")
 
     simple_resolver.task_type = "worker"
@@ -130,7 +130,7 @@ class UnionClusterResolverTest(test.TestCase):
     })
     resolver1 = SimpleClusterResolver(cluster_spec_1, task_type="ps",
                                       task_index=1, environment="cloud",
-                                      num_accelerators_per_worker=8,
+                                      num_accelerators=8,
                                       rpc_layer="grpc")
 
     cluster_spec_2 = server_lib.ClusterSpec({
@@ -139,7 +139,7 @@ class UnionClusterResolverTest(test.TestCase):
     })
     resolver2 = SimpleClusterResolver(cluster_spec_2, task_type="worker",
                                       task_index=2, environment="local",
-                                      num_accelerators_per_worker=16,
+                                      num_accelerators=16,
                                       rpc_layer="http")
 
     union_resolver = UnionClusterResolver(resolver1, resolver2)
@@ -147,7 +147,7 @@ class UnionClusterResolverTest(test.TestCase):
     self.assertEqual(union_resolver.task_type, "ps")
     self.assertEqual(union_resolver.task_index, 1)
     self.assertEqual(union_resolver.environment, "cloud")
-    self.assertEqual(union_resolver.num_accelerators_per_worker(), 8)
+    self.assertEqual(union_resolver.num_accelerators(), 8)
     self.assertEqual(union_resolver.rpc_layer, "grpc")
 
     union_resolver.task_type = "worker"
diff --git a/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..2412f6dad095bb2282ba51b7edb1f293f57d428d
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver.py
@@ -0,0 +1,212 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for GCE Instance Groups."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+_GOOGLE_API_CLIENT_INSTALLED = True
+try:
+  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
+  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+def _format_master_url(master, rpc_layer=None):
+  return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+
+class GceClusterResolver(ClusterResolver):
+  """Cluster Resolver for Google Compute Engine.
+
+  This is an implementation of cluster resolvers for the Google Compute Engine
+  instance group platform. By specifying a project, zone, and instance group,
+  this will retrieve the IP address of all the instances within the instance
+  group and return a Cluster Resolver object suitable for use for distributed
+  TensorFlow.
+  """
+
+  def __init__(self,
+               project,
+               zone,
+               instance_group,
+               port,
+               task_type='worker',
+               task_index=0,
+               rpc_layer='grpc',
+               num_accelerators=0,
+               credentials='default',
+               service=None):
+    """Creates a new GceClusterResolver object.
+
+    This takes in a few parameters and creates a GceClusterResolver project. It
+    will then use these parameters to query the GCE API for the IP addresses of
+    each instance in the instance group.
+
+    Args:
+      project: Name of the GCE project.
+      zone: Zone of the GCE instance group.
+      instance_group: Name of the GCE instance group.
+      port: Port of the listening TensorFlow server (default: 8470)
+      task_type: Name of the TensorFlow job this GCE instance group of VM
+        instances belong to.
+      task_index: The task index for this particular VM, within the GCE
+        instance group. In particular, every single instance should be assigned
+        a unique ordinal index within an instance group manually so that they
+        can be distinguished from each other.
+      rpc_layer: The RPC layer TensorFlow should use to communicate across
+        instances.
+      num_accelerators: Number of accelerators (GPUs) present per
+        instance.
+      credentials: GCE Credentials. If nothing is specified, this defaults to
+        GoogleCredentials.get_application_default().
+      service: The GCE API object returned by the googleapiclient.discovery
+        function. (Default: discovery.build('compute', 'v1')). If you specify a
+        custom service object, then the credentials parameter will be ignored.
+
+    Raises:
+      ImportError: If the googleapiclient is not installed.
+    """
+    self._project = project
+    self._zone = zone
+    self._instance_group = instance_group
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
+    self._num_accelerators = num_accelerators
+    self._port = port
+    self._credentials = credentials
+
+    if credentials == 'default':
+      if _GOOGLE_API_CLIENT_INSTALLED:
+        self._credentials = GoogleCredentials.get_application_default()
+
+    if service is None:
+      if not _GOOGLE_API_CLIENT_INSTALLED:
+        raise ImportError('googleapiclient must be installed before using the '
+                          'GCE cluster resolver')
+      self._service = discovery.build(
+          'compute', 'v1',
+          credentials=self._credentials)
+    else:
+      self._service = service
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest instance group info.
+
+    This returns a ClusterSpec object for use based on information from the
+    specified instance group. We will retrieve the information from the GCE APIs
+    every time this method is called.
+
+    Returns:
+      A ClusterSpec containing host information retrieved from GCE.
+    """
+    request_body = {'instanceState': 'RUNNING'}
+    request = self._service.instanceGroups().listInstances(
+        project=self._project,
+        zone=self._zone,
+        instanceGroups=self._instance_group,
+        body=request_body,
+        orderBy='name')
+
+    worker_list = []
+
+    while request is not None:
+      response = request.execute()
+
+      items = response['items']
+      for instance in items:
+        instance_name = instance['instance'].split('/')[-1]
+
+        instance_request = self._service.instances().get(
+            project=self._project,
+            zone=self._zone,
+            instance=instance_name)
+
+        if instance_request is not None:
+          instance_details = instance_request.execute()
+          ip_address = instance_details['networkInterfaces'][0]['networkIP']
+          instance_url = '%s:%s' % (ip_address, self._port)
+          worker_list.append(instance_url)
+
+      request = self._service.instanceGroups().listInstances_next(
+          previous_request=request,
+          previous_response=response)
+
+    worker_list.sort()
+    return ClusterSpec({self._task_type: worker_list})
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    task_type = task_type if task_type is not None else self._task_type
+    task_index = task_index if task_index is not None else self._task_index
+
+    if task_type is not None and task_index is not None:
+      master = self.cluster_spec().task_address(task_type, task_index)
+      if rpc_layer or self._rpc_layer:
+        return '%s://%s' % (rpc_layer or self._rpc_layer, master)
+      else:
+        return master
+
+    return ''
+
+  @property
+  def task_type(self):
+    return self._task_type
+
+  @property
+  def task_index(self):
+    return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    raise RuntimeError(
+        'You cannot reset the task_type of the GceClusterResolver after it has '
+        'been created.')
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the GCE environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  @property
+  def rpc_layer(self):
+    return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # Unused
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
diff --git a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
similarity index 98%
rename from tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
index c691552e86025896e23891a3e8f7da5ed2f9da31..d4f0660c922d593d81c0927dea0d6271e89c53e1 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/gce_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/gce_cluster_resolver_test.py
@@ -18,8 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training.cluster_resolver import UnionClusterResolver
-from tensorflow.contrib.cluster_resolver.python.training.gce_cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import GceClusterResolver
+from tensorflow.python.distribute.cluster_resolver import UnionClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
diff --git a/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b21c3676bee53e785474308435021885dc93377c
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver.py
@@ -0,0 +1,182 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Kubernetes."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.client import device_lib
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.training import server_lib
+
+_KUBERNETES_API_CLIENT_INSTALLED = True
+try:
+  from kubernetes import client as k8sclient  # pylint: disable=g-import-not-at-top
+  from kubernetes import config as k8sconfig  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _KUBERNETES_API_CLIENT_INSTALLED = False
+
+
+class KubernetesClusterResolver(ClusterResolver):
+  """Cluster Resolver for Kubernetes.
+
+  This is an implementation of cluster resolvers for Kubernetes. When given the
+  the Kubernetes namespace and label selector for pods, we will retrieve the
+  pod IP addresses of all running pods matching the selector, and return a
+  ClusterSpec based on that information.
+  """
+
+  def __init__(self,
+               job_to_label_mapping=None,
+               tf_server_port=8470,
+               rpc_layer='grpc',
+               override_client=None):
+    """Initializes a new KubernetesClusterResolver.
+
+    This initializes a new Kubernetes Cluster Resolver. The Cluster Resolver
+    will attempt to talk to the Kubernetes master to retrieve all the instances
+    of pods matching a label selector.
+
+    Args:
+      job_to_label_mapping: A mapping of TensorFlow jobs to label selectors.
+        This allows users to specify many TensorFlow jobs in one Cluster
+        Resolver, and each job can have pods belong with different label
+        selectors. For example, a sample mapping might be
+        ```
+        {'worker': ['job-name=worker-cluster-a', 'job-name=worker-cluster-b'],
+         'ps': ['job-name=ps-1', 'job-name=ps-2']}
+        ```
+      tf_server_port: The port the TensorFlow server is listening on.
+      rpc_layer: (Optional) The RPC layer TensorFlow should use to communicate
+        between tasks in Kubernetes. Defaults to 'grpc'.
+      override_client: The Kubernetes client (usually automatically retrieved
+        using `from kubernetes import client as k8sclient`). If you pass this
+        in, you are responsible for setting Kubernetes credentials manually.
+
+    Raises:
+      ImportError: If the Kubernetes Python client is not installed and no
+        `override_client` is passed in.
+      RuntimeError: If autoresolve_task is not a boolean or a callable.
+    """
+    if _KUBERNETES_API_CLIENT_INSTALLED:
+      k8sconfig.load_kube_config()
+
+    if not job_to_label_mapping:
+      job_to_label_mapping = {'worker': ['job-name=tensorflow']}
+
+    if not override_client and not _KUBERNETES_API_CLIENT_INSTALLED:
+      raise ImportError('The Kubernetes Python client must be installed before'
+                        'using the Kubernetes Cluster Resolver. To install the'
+                        'Kubernetes Python client, run `pip install '
+                        'kubernetes` on your command line.')
+
+    self._job_to_label_mapping = job_to_label_mapping
+    self._tf_server_port = tf_server_port
+    self._override_client = override_client
+
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a session.
+
+    You must have set the task_type and task_index object properties before
+    calling this function, or pass in the `task_type` and `task_index`
+    parameters when using this function. If you do both, the function parameters
+    will override the object properties.
+
+    Args:
+      task_type: (Optional) The type of the TensorFlow task of the master.
+      task_index: (Optional) The index of the TensorFlow task of the master.
+      rpc_layer: (Optional) The RPC protocol for the given cluster.
+
+    Returns:
+      The name or URL of the session master.
+    """
+    if task_type is not None and task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(task_type, task_index),
+          rpc_layer or self.rpc_layer)
+
+    if self.task_type is not None and self.task_index is not None:
+      return format_master_url(
+          self.cluster_spec().task_address(self.task_type, self.task_index),
+          rpc_layer or self.rpc_layer)
+
+    return ''
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest info from Kubernetes.
+
+    We retrieve the information from the Kubernetes master every time this
+    method is called.
+
+    Returns:
+      A ClusterSpec containing host information returned from Kubernetes.
+
+    Raises:
+      RuntimeError: If any of the pods returned by the master is not in the
+        `Running` phase.
+    """
+    if not self._override_client:
+      k8sconfig.load_kube_config()
+
+    client = self._override_client or k8sclient.CoreV1Api()
+    cluster_map = {}
+
+    for tf_job in self._job_to_label_mapping:
+      all_pods = []
+      for selector in self._job_to_label_mapping[tf_job]:
+        ret = client.list_pod_for_all_namespaces(label_selector=selector)
+        selected_pods = []
+
+        # Sort the list by the name to make sure it doesn't change call to call.
+        for pod in sorted(ret.items, key=lambda x: x.metadata.name):
+          if pod.status.phase == 'Running':
+            selected_pods.append(
+                '%s:%s' % (pod.status.host_ip, self._tf_server_port))
+          else:
+            raise RuntimeError('Pod "%s" is not running; phase: "%s"' %
+                               (pod.metadata.name, pod.status.phase))
+        all_pods.extend(selected_pods)
+      cluster_map[tf_job] = all_pods
+
+    return server_lib.ClusterSpec(cluster_map)
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Cloud environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # TODO(frankchn): Make querying non-local accelerators work
+    if task_type is not None or task_index is not None:
+      raise NotImplementedError('Querying non-local accelerators is not yet'
+                                'implemented.')
+
+    local_devices = device_lib.list_local_devices(config_proto)
+    return sum(d.device_type == accelerator_type for d in local_devices)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
similarity index 87%
rename from tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
index fbb26e803d73c96decf57a040a05694a434500f2..a9750fa60b993a3504bbd01f0663cfdf868a2f01 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/kubernetes_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/kubernetes_cluster_resolver_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.cluster_resolver.python.training import KubernetesClusterResolver
+from tensorflow.python.distribute.cluster_resolver import KubernetesClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -109,6 +109,23 @@ class KubernetesClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
 
+  def testGetMasterWithOverrideParameters(self):
+    ret = _create_pod_list(
+        ('worker-0', 'Running', '10.1.2.3'),
+        ('worker-1', 'Running', '10.1.2.4'),
+        ('worker-2', 'Running', '10.1.2.5'))
+
+    cluster_resolver = KubernetesClusterResolver(
+        override_client=_mock_kubernetes_client(
+            {'job-name=tensorflow': ret}))
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 0
+    self.assertEqual(cluster_resolver.task_type, 'worker')
+    self.assertEqual(cluster_resolver.task_index, 0)
+    self.assertEqual(cluster_resolver.master(), 'grpc://10.1.2.3:8470')
+    self.assertEqual(cluster_resolver.master('worker', 2),
+                     'grpc://10.1.2.5:8470')
+
   def testNonRunningPod(self):
     ret = _create_pod_list(('tensorflow-abc123', 'Failed', '10.1.2.3'),)
 
diff --git a/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ab81731b7a111848608068220488a368d9b86ec
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver.py
@@ -0,0 +1,231 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Slurm workload manager."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import subprocess
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+
+class SlurmClusterResolver(ClusterResolver):
+  """Cluster Resolver for system with Slurm workload manager.
+
+  This is an implementation of cluster resolvers for Slurm clusters. This allows
+  the specification of jobs and task counts, number of tasks per node, number of
+  GPUs on each node and number of GPUs for each task, It retrieves system
+  attributes by Slurm environment variables, resolves allocated computing node
+  names, construct a cluster and return a Cluster Resolver object which an be
+  use for distributed TensorFlow.
+  """
+
+  def _resolve_hostnames(self):
+    """Resolve host names of nodes allocated in current jobs.
+
+    Returns:
+      A list of node names as strings.
+    """
+    hostlist = (subprocess.check_output(['scontrol', 'show', 'hostname']).
+                decode('utf-8').strip().split('\n'))
+    return hostlist
+
+  def __init__(self,
+               jobs,
+               port_base=8888,
+               gpus_per_node=1,
+               gpus_per_task=1,
+               tasks_per_node=None,
+               auto_set_gpu=True,
+               rpc_layer='grpc'):
+    """Creates a new SlurmClusterResolver object.
+
+    This takes in parameters and creates a SlurmClusterResolver object. It uses
+    those parameters to check which nodes will processes reside and resolves
+    their hostnames. With the number of the GPUs on each node and number of GPUs
+    for each task it offsets the port number for each processes and allocate
+    GPUs to tasks by setting environment variables. The resolver currently
+    supports homogeneous tasks and default Slurm process allocation.
+
+    Args:
+      jobs: Dictionary with job names as key and number of tasks in the job as
+        value
+      port_base: The first port number to start with for processes on a node.
+      gpus_per_node: Number of GPUs available on each node.
+      gpus_per_task: Number of GPUs to be used for each task.
+      tasks_per_node: Number of tasks to run on each node, if not set defaults
+        to Slurm's output environment variable SLURM_NTASKS_PER_NODE.
+      auto_set_gpu: Set the visible CUDA devices automatically while resolving
+        the cluster by setting CUDA_VISIBLE_DEVICES environment variable.
+        Defaults to True.
+      rpc_layer: (Optional) The protocol TensorFlow uses to communicate between
+        nodes. Defaults to 'grpc'.
+
+    Returns:
+      A ClusterResolver object which can be used with distributed TensorFlow.
+
+    Raises:
+      RuntimeError: If requested more GPUs per node then available or requested
+      more tasks then assigned tasks.
+    """
+
+    # check if launched by mpirun
+    if 'OMPI_COMM_WORLD_RANK' in os.environ:
+      self._rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
+      num_tasks = int(os.environ['OMPI_COMM_WORLD_SIZE'])
+    else:
+      self._rank = int(os.environ['SLURM_PROCID'])
+      num_tasks = int(os.environ['SLURM_NTASKS'])
+
+    self._jobs = collections.OrderedDict(sorted(jobs.items()))
+    self._port_base = port_base
+
+    # user specification overrides SLURM specification
+    if tasks_per_node is not None:
+      self._tasks_per_node = tasks_per_node
+    elif tasks_per_node is None and 'SLURM_NTASKS_PER_NODE' in os.environ:
+      self._tasks_per_node = int(os.environ['SLURM_NTASKS_PER_NODE'])
+    else:
+      raise RuntimeError('Neither `tasks_per_node` or '
+                         'SLURM_NTASKS_PER_NODE is set.')
+
+    self._gpus_per_node = gpus_per_node
+    self._gpus_per_task = gpus_per_task
+
+    self._auto_set_gpu = auto_set_gpu
+    self.task_type = None
+    self.task_index = None
+    self.rpc_layer = rpc_layer
+
+    self._gpu_allocation = []
+    self._cluster_allocation = {}
+
+    if self._tasks_per_node * self._gpus_per_task > self._gpus_per_node:
+      raise RuntimeError('Requested more GPUs per node then available.')
+
+    if sum(self._jobs.values()) != num_tasks:
+      raise RuntimeError('Requested more tasks then assigned tasks.')
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest instance group info.
+
+    This returns a ClusterSpec object for use based on information from the
+    specified initialization parameters and Slurm environment variables. The
+    cluster specification is resolved each time this function is called. The
+    resolver extract hostnames of nodes by scontrol and pack tasks in that
+    order until a node a has number of tasks that is equal to specification.
+    GPUs on nodes are allocated to tasks by specification through setting
+    CUDA_VISIBLE_DEVICES environment variable.
+
+    Returns:
+      A ClusterSpec containing host information retrieved from Slurm's
+        environment variables.
+    """
+    hostlist = self._resolve_hostnames()
+
+    task_list = []
+    self._gpu_allocation = []
+    self._cluster_allocation = {}
+
+    for host in hostlist:
+      for port_offset, gpu_offset in zip(
+          range(self._tasks_per_node),
+          range(0, self._gpus_per_node, self._gpus_per_task)):
+
+        host_addr = '%s:%d' % (host, self._port_base + port_offset)
+        task_list.append(host_addr)
+        gpu_id_list = []
+
+        for gpu_id in range(gpu_offset, gpu_offset + self._gpus_per_task):
+          gpu_id_list.append(str(gpu_id))
+
+        self._gpu_allocation.append(','.join(gpu_id_list))
+
+    cluster_rank_offset_start = 0
+    cluster_rank_offset_end = 0
+
+    for task_type, num_tasks in self._jobs.items():
+      cluster_rank_offset_end = cluster_rank_offset_start + num_tasks
+
+      self._cluster_allocation[task_type] = (
+          task_list[cluster_rank_offset_start:cluster_rank_offset_end])
+
+      if cluster_rank_offset_start <= self._rank < cluster_rank_offset_end:
+        self.task_type = task_type
+        self.task_index = self._rank - cluster_rank_offset_start
+
+      cluster_rank_offset_start = cluster_rank_offset_end
+
+    if self._auto_set_gpu is True:
+      os.environ['CUDA_VISIBLE_DEVICES'] = self._gpu_allocation[self._rank]
+
+    return ClusterSpec(self._cluster_allocation)
+
+  def get_task_info(self):
+    """Returns job name and task_index for the process which calls this.
+
+    This returns the job name and task index for the process which calls this
+    function according to its rank and cluster specification. The job name and
+    task index are set after a cluster is constructed by cluster_spec otherwise
+    defaults to None.
+
+    Returns:
+      A string specifying job name the process belongs to and an integner
+        specifying the task index the process belongs to in that job.
+    """
+    return self.task_type, self.task_index
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master string for connecting to a TensorFlow master.
+
+    Args:
+      task_type: (Optional) Overrides the default auto-selected task type.
+      task_index: (Optional) Overrides the default auto-slected task index.
+      rpc_layer: (Optional) Overrides the default RPC protocol TensorFlow uses
+        to communicate across nodes.
+
+    Returns:
+      A connection string for connecting to a TensorFlow master.
+    """
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+    rpc_layer = rpc_layer or self.rpc_layer
+    master = self.cluster_spec().task_address(task_type, task_index)
+
+    return '%s://%s' % (rpc_layer, master) if rpc_layer else master
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in.
+
+    For users in the Slurm environment, the environment property is always an
+    empty string, and Google users will not use this ClusterResolver for running
+    on internal systems.
+    """
+    return ''
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # Unused, since this is set in __init__ manually.
+    del task_type, task_index, accelerator_type, config_proto
+    return self._gpus_per_node
diff --git a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
similarity index 85%
rename from tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
index 9aa7df745eb8e1c444011485687b213d87c37da5..076539d16f17d64a9a28052960b61a5b99a7c9c6 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/slurm_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/slurm_cluster_resolver_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.cluster_resolver.python.training.slurm_cluster_resolver import SlurmClusterResolver
+from tensorflow.python.distribute.cluster_resolver import SlurmClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -67,6 +67,31 @@ class SlurmClusterResolverTest(test.TestCase):
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
 
+  @mock.patch.dict(os.environ, {'SLURM_PROCID': '0', 'SLURM_NTASKS': '3'})
+  @mock.patch.object(SlurmClusterResolver, '_resolve_hostnames',
+                     mock_resolve_hostnames_output)
+  def testSimpleMasterRetrieval(self):
+    slurm_cluster_resolver = SlurmClusterResolver(
+        jobs={
+            'ps': 1,
+            'worker': 2
+        },
+        port_base=8888,
+        tasks_per_node=1,
+        gpus_per_node=1,
+        gpus_per_task=1,
+        auto_set_gpu=False)
+
+    slurm_cluster_resolver.task_type = 'worker'
+    slurm_cluster_resolver.task_index = 1
+    self.assertEqual(slurm_cluster_resolver.master(), 'grpc://t02n43:8888')
+
+    slurm_cluster_resolver.rpc_layer = 'ab'
+    self.assertEqual(slurm_cluster_resolver.master('ps', 0), 'ab://t02n13:8888')
+    self.assertEqual(
+        slurm_cluster_resolver.master('ps', 0, rpc_layer='test'),
+        'test://t02n13:8888')
+
   @mock.patch.dict(os.environ, {
       'SLURM_PROCID': '0',
       'SLURM_NTASKS': '3',
diff --git a/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4465714b2679f616d8730205c7ad7c020b04da6
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver.py
@@ -0,0 +1,178 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for TF_CONFIG Environment Variables."""
+
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import json
+import os
+
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.training.server_lib import ClusterSpec
+
+_TF_CONFIG_ENV = 'TF_CONFIG'
+_SESSION_MASTER_KEY = 'session_master'
+_RPC_LAYER_KEY = 'rpc_layer'
+_TASK_KEY = 'task'
+
+
+def format_master_url(master, rpc_layer=None):
+  if rpc_layer:
+    return '%s://%s' % (rpc_layer, master)
+  else:
+    return master
+
+
+def _load_tf_config():
+  return json.loads(os.environ.get(_TF_CONFIG_ENV, '{}'))
+
+
+def _get_value_in_tfconfig(key, default=None):
+  tf_config = _load_tf_config()
+  return tf_config[key] if key in tf_config else default
+
+
+class TFConfigClusterResolver(ClusterResolver):
+  """Implementation of a ClusterResolver which reads the TF_CONFIG EnvVar."""
+
+  def __init__(self,
+               task_type=None,
+               task_index=None,
+               rpc_layer=None,
+               environment=None,
+               num_accelerators=0):
+    """Creates a new TFConfigClusterResolver.
+
+    Args:
+      task_type: (String, optional) Overrides the task type specified in the
+        TF_CONFIG environment variable.
+      task_index: (Integer, optional) Overrides the task index specified in the
+        TF_CONFIG environment variable.
+      rpc_layer: (String, optional) Overrides the rpc layer TensorFlow uses.
+      environment: (String, optional) Overrides the environment TensorFlow
+        operates in.
+      num_accelerators: (Integer, optional) Specifies the number of
+        accelerators (e.g. GPUs, TPUs, others) that each node has.
+    """
+    # TODO(frankchn): num_accelerators is a stop-gap and will be removed
+    # in favor of autodetection of devices soon.
+
+    self._task_type = task_type
+    self._task_index = task_index
+    self._rpc_layer = rpc_layer
+    self._environment = environment
+    self._num_accelerators = num_accelerators
+
+  @property
+  def task_type(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['type'] if 'type' in task_info else None
+    else:
+      return self._task_type
+
+  @property
+  def task_index(self):
+    if self._task_type is None:
+      task_info = _get_value_in_tfconfig(_TASK_KEY, {})
+      return task_info['index'] if 'index' in task_info else None
+    else:
+      return self._task_index
+
+  @task_type.setter
+  def task_type(self, task_type):
+    self._task_type = task_type
+
+  @task_index.setter
+  def task_index(self, task_index):
+    self._task_index = task_index
+
+  @property
+  def environment(self):
+    return self._environment
+
+  @property
+  def rpc_layer(self):
+    if self._rpc_layer is None:
+      return _get_value_in_tfconfig(_RPC_LAYER_KEY)
+    else:
+      return self._rpc_layer
+
+  @rpc_layer.setter
+  def rpc_layer(self, rpc_layer):
+    self._rpc_layer = rpc_layer
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='GPU',
+                       config_proto=None):
+    # TODO(frankchn): Connect to server (w/ session_config) in the future.
+    # Unused, we do not connect to another server here right now.
+    del task_type, task_index, accelerator_type, config_proto
+    return self._num_accelerators
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec based on the TF_CONFIG environment variable.
+
+    Returns:
+      A ClusterSpec with information from the TF_CONFIG environment variable.
+    """
+    tf_config = _load_tf_config()
+    if 'cluster' not in tf_config:
+      return ClusterSpec({})
+    return ClusterSpec(tf_config['cluster'])
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Returns the master address to use when creating a TensorFlow session.
+
+    Args:
+      task_type: (String, optional) Overrides and sets the task_type of the
+        master.
+      task_index: (Integer, optional) Overrides and sets the task id of the
+        master.
+      rpc_layer: (String, optional) Overrides and sets the protocol over which
+        TensorFlow nodes communicate with each other.
+
+    Returns:
+      The address of the master.
+
+    Raises:
+      RuntimeError: If the task_type or task_id is not specified and the
+        `TF_CONFIG` environment variable does not contain a task section.
+    """
+
+    # If `session_master` is set, just use that.
+    session_master = _get_value_in_tfconfig(_SESSION_MASTER_KEY)
+    if session_master is not None:
+      return session_master
+
+    # Return an empty string if we are the only job in the ClusterSpec.
+    cluster_spec = self.cluster_spec()
+    if (not cluster_spec.jobs or
+        (len(cluster_spec.jobs) == 1 and
+         len(cluster_spec.job_tasks(cluster_spec.jobs[0])) == 1)):
+      return ''
+
+    # We try to auto-detect the task type and id, but uses the user-supplied one
+    # where available
+    task_type = task_type if task_type is not None else self.task_type
+    task_index = task_index if task_index is not None else self.task_index
+
+    return format_master_url(cluster_spec.task_address(task_type, task_index),
+                             self.rpc_layer)
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
similarity index 73%
rename from tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
index 468161d2aa49129f2ec960b1ccddf49c712f00a7..197eba1739017e8665588618e6b64297b310b513 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tfconfig_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tfconfig_cluster_resolver_test.py
@@ -20,7 +20,7 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.cluster_resolver.python.training.tfconfig_cluster_resolver import TFConfigClusterResolver
+from tensorflow.python.distribute.cluster_resolver import TFConfigClusterResolver
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -133,6 +133,58 @@ class TFConfigClusterResolverTest(test.TestCase):
     cluster_resolver = TFConfigClusterResolver()
     self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
 
+  def testTaskTypeIndexRpcRead(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 0
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver()
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual('grpc', cluster_resolver.rpc_layer)
+
+  def testParameterOverrides(self):
+    os.environ['TF_CONFIG'] = """
+    {
+      "cluster": {
+        "ps": ["ps0:2222", "ps1:2222"],
+        "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
+      },
+      "rpc_layer": "grpc",
+      "task": {
+        "type": "ps",
+        "index": 1
+      }
+    }
+    """
+
+    cluster_resolver = TFConfigClusterResolver(task_type='ps', task_index=0,
+                                               num_accelerators=8)
+
+    self.assertEqual('grpc://ps0:2222', cluster_resolver.master())
+    self.assertEqual('ps', cluster_resolver.task_type)
+    self.assertEqual(0, cluster_resolver.task_index)
+    self.assertEqual(8, cluster_resolver.num_accelerators())
+
+    cluster_resolver.task_type = 'worker'
+    cluster_resolver.task_index = 1
+    cluster_resolver.rpc_layer = 'test'
+
+    self.assertEqual('test://worker1:2222', cluster_resolver.master())
+    self.assertEqual('worker', cluster_resolver.task_type)
+    self.assertEqual(1, cluster_resolver.task_index)
+    self.assertEqual('test', cluster_resolver.rpc_layer)
+
   def testZeroItemsInClusterSpecMasterRead(self):
     os.environ['TF_CONFIG'] = """
     {}
diff --git a/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
new file mode 100644
index 0000000000000000000000000000000000000000..e907d6fde4f7bb63553b85c580149a8cb51c9c3b
--- /dev/null
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver.py
@@ -0,0 +1,502 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Implementation of Cluster Resolvers for Cloud TPUs."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import os
+import re
+
+from six.moves.urllib.request import Request
+from six.moves.urllib.request import urlopen
+
+from tensorflow.python.client import session
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import ClusterResolver
+from tensorflow.python.distribute.cluster_resolver.cluster_resolver import format_master_url
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import server_lib
+from tensorflow.python.util import compat
+
+_GOOGLE_API_CLIENT_INSTALLED = True
+try:
+  from googleapiclient import discovery  # pylint: disable=g-import-not-at-top
+  from oauth2client.client import GoogleCredentials  # pylint: disable=g-import-not-at-top
+except ImportError:
+  _GOOGLE_API_CLIENT_INSTALLED = False
+
+
+_GKE_ENV_VARIABLE = 'KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'
+_ENDPOINTS_SEPARATOR = ','
+_DEFAULT_ENV_VARIABLE = 'TPU_NAME'
+_DISCOVERY_SERVICE_URL_ENV_VARIABLE = 'TPU_API_DISCOVERY_URL'
+
+_TPU_DEVICE_REGEX = re.compile(
+    r'.*task:(?P<host_id>\d+)/.*device:TPU:(?P<core_id>\d+)$')
+_TPU_CONN_RETRIES = 120
+
+DeviceDetails = collections.namedtuple(
+    'DeviceDetails', ['device_map', 'total_cores'])
+
+
+def _get_device_dict_and_cores(devices):
+  """Returns a dict of hosts to cores and total cores given devices names.
+
+  Returns a namedtuple with two attributes:
+    device_map: A map of host_ids to a list of core_ids.
+    total_cores: The total number of cores within the TPU system.
+
+  Args:
+    devices: A list of devices returned by session.list_devices()
+  """
+  device_map = collections.defaultdict(list)
+  num_cores = 0
+  for device in devices:
+    match = _TPU_DEVICE_REGEX.match(device.name)
+    if match:
+      host_id = match.group('host_id')
+      core_id = match.group('core_id')
+      device_map[host_id].append(core_id)
+      num_cores += 1
+  return DeviceDetails(device_map, num_cores)
+
+
+def _verify_and_return_same_core_count(device_dict):
+  """Verifies that every device in device_dict has the same number of cores."""
+  num_cores_per_host_set = (
+      {len(core_ids) for core_ids in device_dict.values()})
+  if len(num_cores_per_host_set) != 1:
+    raise RuntimeError('TPU cores on each device is not the same. This '
+                       'should never happen. Devices: {}'.format(device_dict))
+  return num_cores_per_host_set.pop()
+
+
+class TPUClusterResolver(ClusterResolver):
+  """Cluster Resolver for Google Cloud TPUs.
+
+  This is an implementation of cluster resolvers for the Google Cloud TPU
+  service. As Cloud TPUs are in alpha, you will need to specify a API definition
+  file for this to consume, in addition to a list of Cloud TPUs in your Google
+  Cloud Platform project.
+  """
+
+  def _tpuService(self):
+    """Creates a new Cloud TPU API object.
+
+    This works around an issue where the underlying HTTP connection sometimes
+    times out when the script has been running for too long. Other methods in
+    this object calls this method to get a new API object whenever they need
+    to communicate with the Cloud API.
+
+    Returns:
+      A Google Cloud TPU API object.
+    """
+    if self._service:
+      return self._service
+
+    credentials = self._credentials
+    if credentials is None or credentials == 'default':
+      credentials = GoogleCredentials.get_application_default()
+
+    if self._discovery_url:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials,
+          discoveryServiceUrl=self._discovery_url)
+    else:
+      return discovery.build(
+          'tpu', 'v1alpha1',
+          credentials=credentials)
+
+  def _requestComputeMetadata(self, path):
+    req = Request('http://metadata/computeMetadata/v1/%s' % path,
+                  headers={'Metadata-Flavor': 'Google'})
+    resp = urlopen(req)
+    return compat.as_bytes(resp.read())
+
+  def _shouldResolve(self):
+    if isinstance(self._should_resolve_override, bool):
+      return self._should_resolve_override
+    if (self._tpu == compat.as_bytes('') or
+        self._tpu == compat.as_bytes('local') or
+        self._tpu.startswith(compat.as_bytes('/bns')) or
+        self._tpu.startswith(compat.as_bytes('localhost:')) or
+        self._tpu.startswith(compat.as_bytes('grpc://')) or
+        self._tpu.startswith(compat.as_bytes('uptc://'))):
+      return False
+    return True
+
+  @staticmethod
+  def _inGke():
+    """When running in GKE, the environment variable will be set."""
+    return _GKE_ENV_VARIABLE in os.environ
+
+  @staticmethod
+  def _gkeEndpoints():
+    return os.environ[_GKE_ENV_VARIABLE]
+
+  @staticmethod
+  def _envVarFallback():
+    if _DEFAULT_ENV_VARIABLE in os.environ:
+      return os.environ[_DEFAULT_ENV_VARIABLE]
+    return None
+
+  @staticmethod
+  def _environmentDiscoveryUrl():
+    return os.environ.get(_DISCOVERY_SERVICE_URL_ENV_VARIABLE)
+
+  def __init__(self,
+               tpu=None,
+               zone=None,
+               project=None,
+               job_name='worker',
+               coordinator_name=None,
+               coordinator_address=None,
+               credentials='default',
+               service=None,
+               discovery_url=None):
+    """Creates a new TPUClusterResolver object.
+
+    The ClusterResolver will then use the parameters to query the Cloud TPU APIs
+    for the IP addresses and ports of each Cloud TPU listed.
+
+    Args:
+      tpu: Either a string, or a list of strings corresponding to the TPUs to
+        use. If the single string is the empty string, the string 'local', or a
+        string that begins with 'grpc://' or '/bns', then it is assumed to not
+        correspond with a Cloud TPU and will instead be passed as the session
+        master and no ClusterSpec propagation will be done.
+      zone: Zone where the TPUs are located. If omitted or empty, we will assume
+        that the zone of the TPU is the same as the zone of the GCE VM, which we
+        will try to discover from the GCE metadata service.
+      project: Name of the GCP project containing Cloud TPUs. If omitted or
+        empty, we will try to discover the project name of the GCE VM from the
+        GCE metadata service.
+      job_name: Name of the TensorFlow job the TPUs belong to.
+      coordinator_name: The name to use for the coordinator. Set to None if the
+        coordinator should not be included in the computed ClusterSpec.
+      coordinator_address: The address of the coordinator (typically an ip:port
+        pair). If set to None, a TF server will be started. If coordinator_name
+        is None, a TF server will not be started even if coordinator_address is
+        None.
+      credentials: GCE Credentials. If None, then we use default credentials
+        from the oauth2client
+      service: The GCE API object returned by the googleapiclient.discovery
+        function. If you specify a custom service object, then the credentials
+        parameter will be ignored.
+      discovery_url: A URL template that points to the location of
+        the discovery service. It should have two parameters {api} and
+        {apiVersion} that when filled in produce an absolute URL to the
+        discovery document for that service. The environment variable
+        'TPU_API_DISCOVERY_URL' will override this.
+
+    Raises:
+      ImportError: If the googleapiclient is not installed.
+      ValueError: If no TPUs are specified.
+    """
+    if isinstance(tpu, list):
+      if not tpu:
+        raise ValueError('At least one TPU must be specified.')
+      if len(tpu) != 1:
+        raise NotImplementedError(
+            'Using multiple TPUs in a single session is not yet implemented')
+      tpu = tpu[0]
+
+    in_gke = self._inGke()
+    # When using GKE with Cloud TPUs, the env variable will be set.
+    if tpu is None:
+      if in_gke:
+        tpu = self._gkeEndpoints()
+      else:
+        tpu = self._envVarFallback()
+
+    if tpu is None:
+      raise ValueError('Please provide a TPU Name to connect to.')
+
+    self._tpu = compat.as_bytes(tpu)  # self._tpu is always bytes
+
+    # By default the task_type is 'worker` and the task_index is 0 (which is the
+    # first worker in the task).
+    self.task_type = job_name
+    self.task_index = 0
+
+    if tpu.startswith('grpc://'):
+      # Cloud environment, where we are using GRPC to communicate to TPUs.
+      self._environment = ''
+    elif tpu == 'local' or not tpu:
+      # Google environment, where the TPU is attached to the host.
+      self._environment = 'google'
+    elif tpu.startswith('/bns') or tpu.startswith('uptc://'):
+      # Google environment, where we reach the TPU through BNS.
+      self._environment = 'google'
+
+    # If TPU is in the Google environment or exists locally, we don't use any
+    # RPC layer.
+    if tpu.startswith('/bns') or tpu.startswith(
+        'uptc://') or tpu == 'local' or not tpu:
+      self.rpc_layer = None
+    else:
+      self.rpc_layer = 'grpc'
+
+    # Setting this overrides the return value of self._shouldResolve()
+    self._should_resolve_override = None
+
+    # We strip out the protocol if it is included, and override the
+    # shouldResolve function to never resolve. We are adding the protocol back
+    # in later in self.master().
+    if self.rpc_layer is not None and tpu.startswith(self.rpc_layer + '://'):
+      tpu = tpu[len(self.rpc_layer + '://'):]
+      self._tpu = tpu
+      self._should_resolve_override = False
+
+    # Whether we should actually attempt to contact Cloud APIs
+    should_resolve = self._shouldResolve()
+
+    # We error out if we are in a non-Cloud environment which cannot talk to the
+    # Cloud APIs using the standard class and a special object is not passed in.
+    self._service = service
+    if (self._service is None and should_resolve and
+        not _GOOGLE_API_CLIENT_INSTALLED):
+      raise ImportError('googleapiclient and oauth2client must be installed '
+                        'before using the TPU cluster resolver. Execute: '
+                        '`pip install --upgrade google-api-python-client` '
+                        'and `pip install --upgrade oauth2client` to '
+                        'install with pip.')
+
+    # We save user-passed credentials, unless the user didn't pass in anything.
+    self._credentials = credentials
+    if (credentials == 'default' and should_resolve and
+        _GOOGLE_API_CLIENT_INSTALLED):
+      self._credentials = None
+
+    # Automatically detect project and zone if unspecified.
+    if not project and should_resolve:
+      project = compat.as_str(
+          self._requestComputeMetadata('project/project-id'))
+    if not zone and should_resolve:
+      zone_path = compat.as_str(self._requestComputeMetadata('instance/zone'))
+      zone = zone_path.split('/')[-1]
+    self._project = project
+    self._zone = zone
+
+    self._discovery_url = self._environmentDiscoveryUrl() or discovery_url
+
+    self._coordinator_name = coordinator_name
+    if (coordinator_name and not coordinator_address and
+        (should_resolve or in_gke)):
+      self._start_local_server()
+    else:
+      self._coordinator_address = coordinator_address
+
+  def master(self, task_type=None, task_index=None, rpc_layer=None):
+    """Get the Master string to be used for the session.
+
+    In the normal case, this returns the grpc path (grpc://1.2.3.4:8470) of
+    first instance in the ClusterSpec returned by the cluster_spec function.
+
+    If a non-TPU name is used when constructing a TPUClusterResolver, that will
+    be returned instead (e.g. If the tpus argument's value when constructing
+    this TPUClusterResolver was 'grpc://10.240.1.2:8470',
+    'grpc://10.240.1.2:8470' will be returned).
+
+    Args:
+      task_type: (Optional, string) The type of the TensorFlow task of the
+        master.
+      task_index: (Optional, integer) The index of the TensorFlow task of the
+        master.
+      rpc_layer: (Optional, string) The RPC protocol TensorFlow should use to
+        communicate with TPUs.
+
+    Returns:
+      string, the connection string to use when creating a session.
+
+    Raises:
+      ValueError: If none of the TPUs specified exists.
+    """
+    if self._shouldResolve():
+      # We are going to communicate with the Cloud TPU APIs to get a Cluster.
+      cluster_spec = self.cluster_spec()
+      if task_type is not None and task_index is not None:
+        # task_type and task_index is from the function parameter
+        master = cluster_spec.task_address(task_type, task_index)
+      elif self.task_type is not None and self.task_index is not None:
+        # task_type and task_index is from the object
+        master = cluster_spec.task_address(self.task_type, self.task_index)
+      else:
+        # by default we take the first item in the cluster with the right name
+        job_tasks = cluster_spec.job_tasks(self.task_type)
+        if not job_tasks:
+          raise ValueError('No TPUs with the specified names exist.')
+        master = job_tasks[0]
+    else:
+      if isinstance(self._tpu, (bytes, bytearray)):
+        master = self._tpu.split(compat.as_bytes(_ENDPOINTS_SEPARATOR))[0]
+      else:
+        master = self._tpu.split(_ENDPOINTS_SEPARATOR)[0]
+    return format_master_url(master, rpc_layer or self.rpc_layer)
+
+  def get_master(self):
+    return self.master()
+
+  def get_job_name(self):
+    if self._shouldResolve():
+      return self.task_type
+
+  def cluster_spec(self):
+    """Returns a ClusterSpec object based on the latest TPU information.
+
+    We retrieve the information from the GCE APIs every time this method is
+    called.
+
+    Returns:
+      A ClusterSpec containing host information returned from Cloud TPUs.
+
+    Raises:
+      RuntimeError: If the provided TPU is not healthy.
+    """
+    ############################################################################
+    # There are 5 potential cases this code must handle:
+    #  1. [Normal case.] We should resolve the TPU name to a set of tasks, and
+    #      a. Create a ClusterSpec that includes the coordinator job
+    #      b. Create a ClusterSpec without the coordinator job.
+    #  2. [GKE / No API Access.] We should not resolve the TPU name to a set of
+    #     tasks and
+    #      a. Create a ClusterSpec with the coordinator
+    #      b. Create a ClusterSpec without the coordinator
+    #  3. [Other (legacy non-gRPC).] We should return an empty ClusterSpec.
+    ############################################################################
+
+    if self._shouldResolve():
+      # Case 1.
+      full_name = 'projects/%s/locations/%s/nodes/%s' % (
+          self._project, self._zone, compat.as_text(self._tpu))
+      service = self._tpuService()
+      request = service.projects().locations().nodes().get(name=full_name)
+      response = request.execute()
+
+      if 'state' in response and response['state'] != 'READY':
+        raise RuntimeError('TPU "%s" is not yet ready; state: "%s"' %
+                           (compat.as_text(self._tpu), response['state']))
+
+      if 'health' in response and response['health'] != 'HEALTHY':
+        raise RuntimeError('TPU "%s" is unhealthy: "%s"' %
+                           (compat.as_text(self._tpu), response['health']))
+
+      if 'networkEndpoints' in response:
+        worker_list = [
+            '%s:%s' % (endpoint['ipAddress'], endpoint['port'])
+            for endpoint in response['networkEndpoints']
+        ]
+      else:
+        # Fall back to the deprecated response format
+        instance_url = '%s:%s' % (response['ipAddress'], response['port'])
+        worker_list = [instance_url]
+
+      cluster_spec = {self.task_type: worker_list}
+    else:
+      if self.rpc_layer is None:
+        # Case 3.
+        return None
+      # Case 2.
+      tpus = []
+      for tpu in self._tpu.split(_ENDPOINTS_SEPARATOR):
+        # We are working around the fact that GKE environment variable that is
+        # supplied to us has the protocol string embedded in it, but we want
+        # to strip it out for the ClusterSpec.
+        if (self.rpc_layer is not None and
+            tpu.startswith(self.rpc_layer + '://')):
+          tpus.append(tpu[len(self.rpc_layer + '://'):])
+        else:
+          tpus.append(tpu)
+      cluster_spec = {self.task_type: tpus}
+
+    if self._coordinator_address:
+      # {1, 2}.a
+      cluster_spec[self._coordinator_name] = [self._coordinator_address]
+
+    return server_lib.ClusterSpec(cluster_spec)
+
+  def num_accelerators(self,
+                       task_type=None,
+                       task_index=None,
+                       accelerator_type='TPU',
+                       config_proto=None):
+    """Returns the number of TPU cores per worker.
+
+    Connects to the master and list all the devices present in the master,
+    and counts them up. Also verifies that the device counts per host in the
+    cluster is the same before returning the number of TPU cores per host.
+
+    Args:
+      task_type: Unused.
+      task_index: Unused.
+      accelerator_type: Unused.
+      config_proto: Used to create a connection to a TPU master in order to
+        retrieve the system metadata.
+
+    Raises:
+      RuntimeError: If this is used with a non-TPU accelerator_type.
+    """
+    retry_count = 1
+    # TODO(b/120564445): Replace with standard library for retries.
+    while True:
+      try:
+        with ops.Graph().as_default():
+          with session.Session(self.master(), config=config_proto) as s:
+            devices = s.list_devices()
+            device_details = _get_device_dict_and_cores(devices)
+            break
+      except errors.DeadlineExceededError:
+        error_message = ('Failed to connect to master. The TPU might not be '
+                         'ready (e.g. still scheduling) or the master '
+                         'address is incorrect: got (%s)' % self.master())
+        if retry_count <= _TPU_CONN_RETRIES:
+          logging.warning(error_message)
+          logging.warning('Retrying (%d/%d)...', retry_count, _TPU_CONN_RETRIES)
+          retry_count += 1
+        else:
+          raise RuntimeError(error_message)
+
+    if device_details.total_cores:
+      return _verify_and_return_same_core_count(device_details.device_map)
+    return 0
+
+  @property
+  def environment(self):
+    """Returns the current environment which TensorFlow is running in."""
+    return self._environment
+
+  def _start_local_server(self):
+    address = self._requestComputeMetadata('instance/network-interfaces/0/ip')
+    self._server = server_lib.Server(
+        {
+            'local': ['0.0.0.0:0']
+        }, protocol='grpc', config=None, start=True)
+    # self._server.target is of the form: grpc://ipaddress:port
+    target = compat.as_bytes(self._server.target)
+    splits = target.split(compat.as_bytes(':'))
+    assert len(splits) == 3, self._server.target
+    assert splits[0] == compat.as_bytes('grpc'), self._server.target
+    self._coordinator_port = compat.as_text(splits[2])
+    self._coordinator_address = '%s:%s' % (
+        address, compat.as_text(self._coordinator_port))
+
+  def __deepcopy__(self, memo):
+    # TODO(b/73668574): Remove this once RunConfig avoids performing deepcopy.
+    return self
diff --git a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
similarity index 61%
rename from tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
rename to tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
index 478c82967ba993c0551113a38879f87d872517a3..27d92608fa2db95944c94160d716a033ab2f78a2 100644
--- a/tensorflow/contrib/cluster_resolver/python/training/tpu_cluster_resolver_test.py
+++ b/tensorflow/python/distribute/cluster_resolver/tpu_cluster_resolver_test.py
@@ -20,7 +20,10 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.cluster_resolver.python.training.tpu_cluster_resolver import TPUClusterResolver
+from tensorflow.python.client import session
+from tensorflow.python.distribute import cluster_resolver
+from tensorflow.python.distribute.cluster_resolver import tpu_cluster_resolver
+from tensorflow.python.framework import errors
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 from tensorflow.python.util import compat
@@ -101,7 +104,8 @@ class TPUClusterResolverTest(test.TestCase):
 
     return mock_client
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadata(self):
     tpu_map = {
@@ -112,7 +116,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -120,7 +124,7 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map),
         coordinator_name='coordinator')
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'coordinator'
@@ -130,10 +134,12 @@ class TPUClusterResolverTest(test.TestCase):
       name: 'worker'
       tasks { key: 0 value: '10.1.2.3:8470' }
     }
-    """ % tpu_cluster_resolver._coordinator_port
+    """ % resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testRetrieveProjectAndZoneFromMetadataNoCoordinator(self):
     tpu_map = {
@@ -144,7 +150,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu=['test-tpu-1'],
@@ -152,13 +158,15 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testUnhealthyCloudTpu(self):
     tpu_map = {
@@ -169,7 +177,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -178,9 +186,10 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map))
 
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
+      resolver.cluster_spec()
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testNotReadyCloudTpu(self):
     tpu_map = {
@@ -191,7 +200,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project=None,
         zone=None,
         tpu='test-tpu-1',
@@ -200,7 +209,7 @@ class TPUClusterResolverTest(test.TestCase):
         service=self.mock_service_client(tpu_map=tpu_map))
 
     with self.assertRaises(RuntimeError):
-      tpu_cluster_resolver.cluster_spec()
+      resolver.cluster_spec()
 
   def testSimpleSuccessfulRetrieval(self):
     tpu_map = {
@@ -211,7 +220,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=['test-tpu-1'],
@@ -220,12 +229,13 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
     job { name: 'worker' tasks { key: 0 value: '10.1.2.3:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
 
   def testNewNetworkEndpointFormat(self):
     tpu_map = {
@@ -238,7 +248,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -247,15 +257,16 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job { name: 'coordinator' tasks { key: 0 value: '10.128.1.5:10203' } }
     job { name: 'worker' tasks { key: 0 value: '10.2.3.4:8470' } }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
-    self.assertEqual('grpc://10.2.3.4:8470', tpu_cluster_resolver.master())
+    self.assertEqual('grpc://10.2.3.4:8470', resolver.master())
 
-  @mock.patch.object(TPUClusterResolver, '_requestComputeMetadata',
+  @mock.patch.object(cluster_resolver.TPUClusterResolver,
+                     '_requestComputeMetadata',
                      mock_request_compute_metadata)
   def testPodResolution(self):
     tpu_map = {
@@ -283,13 +294,13 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         tpu='test-tpu-1',
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map),
         coordinator_name='coordinator')
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'coordinator',
@@ -302,8 +313,9 @@ class TPUClusterResolverTest(test.TestCase):
       tasks { key: 2 value: '10.2.3.6:8470' }
       tasks { key: 3 value: '10.2.3.7:8470' }
     }
-    """ % tpu_cluster_resolver._coordinator_port
+    """ % resolver._coordinator_port
     self._verifyClusterSpecEquality(actual_cluster_spec, str(expected_proto))
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testPodResolutionNoCoordinator(self):
     tpu_map = {
@@ -331,7 +343,7 @@ class TPUClusterResolverTest(test.TestCase):
         }
     }
 
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu='test-tpu-1',
@@ -339,7 +351,7 @@ class TPUClusterResolverTest(test.TestCase):
         credentials=None,
         service=self.mock_service_client(tpu_map=tpu_map))
 
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -350,12 +362,13 @@ class TPUClusterResolverTest(test.TestCase):
     }
     """
     self._verifyClusterSpecEquality(actual_cluster_spec, expected_proto)
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
 
   def testGetMasterNoEntries(self):
     tpu_map = {}
 
     with self.assertRaises(ValueError):
-      TPUClusterResolver(
+      cluster_resolver.TPUClusterResolver(
           project='test-project',
           zone='us-central1-c',
           tpu=[],
@@ -365,14 +378,14 @@ class TPUClusterResolverTest(test.TestCase):
 
   # TODO(saeta): Convert to parameterized test when included in OSS TF.
   def verifyShouldResolve(self, tpu, should_resolve):
-    tpu_cluster_resolver = TPUClusterResolver(
+    resolver = cluster_resolver.TPUClusterResolver(
         project='test-project',
         zone='us-central1-c',
         tpu=tpu,
         coordinator_name=None,
         credentials=None,
         service=self.mock_service_client(tpu_map={}))
-    self.assertEqual(should_resolve, tpu_cluster_resolver._shouldResolve(),
+    self.assertEqual(should_resolve, resolver._shouldResolve(),
                      "TPU: '%s'" % tpu)
 
   def testShouldResolveNoName(self):
@@ -397,25 +410,26 @@ class TPUClusterResolverTest(test.TestCase):
     self.verifyShouldResolve('grpctpu', True)
 
   def testNoCallComputeMetadata(self):
-    tpu_cluster_resolver = TPUClusterResolver(tpu='/bns/foo/bar')
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/foo/bar')
     self.assertEqual(
-        compat.as_bytes('/bns/foo/bar'), tpu_cluster_resolver.master())
-    self.assertEqual(None, tpu_cluster_resolver.cluster_spec())
+        compat.as_bytes('/bns/foo/bar'), resolver.master())
+    self.assertEqual(None, resolver.cluster_spec())
 
   def testGkeEnvironmentForDonut(self):
     os.environ['KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS'] = 'grpc://10.120.27.5:8470'
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
 
-    tpu_cluster_resolver = TPUClusterResolver()
+    resolver = cluster_resolver.TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -433,19 +447,19 @@ class TPUClusterResolverTest(test.TestCase):
                                                      'grpc://10.120.27.8:8470')
 
     self.assertIn('KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS', os.environ)
-    self.assertTrue(TPUClusterResolver._inGke())
+    self.assertTrue(cluster_resolver.TPUClusterResolver._inGke())
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470,'
                         'grpc://10.120.27.6:8470,'
                         'grpc://10.120.27.7:8470,'
                         'grpc://10.120.27.8:8470'),
-        compat.as_bytes(TPUClusterResolver._gkeEndpoints()))
+        compat.as_bytes(cluster_resolver.TPUClusterResolver._gkeEndpoints()))
 
-    tpu_cluster_resolver = TPUClusterResolver()
+    resolver = cluster_resolver.TPUClusterResolver()
     self.assertEqual(
         compat.as_bytes('grpc://10.120.27.5:8470'),
-        compat.as_bytes(tpu_cluster_resolver.master()))
-    actual_cluster_spec = tpu_cluster_resolver.cluster_spec()
+        compat.as_bytes(resolver.master()))
+    actual_cluster_spec = resolver.cluster_spec()
     expected_proto = """
     job {
       name: 'worker'
@@ -462,7 +476,153 @@ class TPUClusterResolverTest(test.TestCase):
   def testEnvironmentDiscoveryUrl(self):
     os.environ['TPU_API_DISCOVERY_URL'] = 'https://{api}.internal/{apiVersion}'
     self.assertEqual('https://{api}.internal/{apiVersion}',
-                     TPUClusterResolver._environmentDiscoveryUrl())
+                     (cluster_resolver.TPUClusterResolver.
+                      _environmentDiscoveryUrl()))
+
+  def testEnvironmentAndRpcDetectionForGoogle(self):
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='/bns/ab/cd/ef')
+    self.assertEqual(resolver.environment, 'google')
+    self.assertEqual(resolver.rpc_layer, None)
+
+  def testEnvironmentAndRpcDetectionForGrpcString(self):
+    resolver = cluster_resolver.TPUClusterResolver(
+        tpu='grpc://10.1.2.3:8470')
+    self.assertEqual(resolver.environment, '')
+    self.assertEqual(resolver.rpc_layer, 'grpc')
+    self.assertEqual(resolver.master(), 'grpc://10.1.2.3:8470')
+
+  def testOverrideTaskTypeAndIndexAndGetMaster(self):
+    tpu_map = {
+        'projects/test-project/locations/us-central1-c/nodes/test-tpu-1': {
+            'health':
+                'HEALTHY',
+            'networkEndpoints': [
+                {
+                    'ipAddress': '10.2.3.4',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.5',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.6',
+                    'port': 8470,
+                },
+                {
+                    'ipAddress': '10.2.3.7',
+                    'port': 8470,
+                },
+            ]
+        }
+    }
+
+    resolver = cluster_resolver.TPUClusterResolver(
+        project='test-project',
+        zone='us-central1-c',
+        tpu='test-tpu-1',
+        coordinator_name=None,
+        credentials=None,
+        service=self.mock_service_client(tpu_map=tpu_map))
+
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.4:8470')
+
+    resolver.task_type = 'worker'
+    resolver.task_index = 3
+    self.assertEqual(resolver.master(), 'grpc://10.2.3.7:8470')
+
+    self.assertEqual(
+        resolver.master(
+            task_type='worker', task_index=2, rpc_layer='test'),
+        'test://10.2.3.6:8470')
+
+  def testGetDeviceDictAndCoresWithTPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+
+    device_details = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(device_details.total_cores, 8)
+    self.assertEqual(device_details.device_map,
+                     {'0': ['0', '4'],
+                      '1': ['1', '5'],
+                      '2': ['0', '4'],
+                      '3': ['1', '5']})
+
+  def testGetDeviceDictAndCoresWithCPUsAndGPUs(self):
+    device_names = [
+        '/job:tpu_worker/task:0/device:CPU:0',
+        '/job:tpu_worker/task:1/device:CPU:0',
+        '/job:tpu_worker/task:2/device:CPU:0',
+        '/job:tpu_worker/task:3/device:CPU:0',
+        '/job:tpu_worker/task:0/device:GPU:1',
+        '/job:tpu_worker/task:1/device:GPU:1',
+        '/job:tpu_worker/task:2/device:GPU:1',
+        '/job:tpu_worker/task:3/device:GPU:1',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'XLA', 1024, 0) for name in device_names
+    ]
+
+    device_dict, num_cores = tpu_cluster_resolver._get_device_dict_and_cores(
+        device_list)
+    self.assertEqual(num_cores, 0)
+    self.assertEqual(device_dict, {})
+
+  def testVerifySameCoreCount(self):
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1, 2, 3, 4, 5, 6, 7]}), 8)
+    self.assertEqual(
+        tpu_cluster_resolver._verify_and_return_same_core_count(
+            {0: [0, 1], 1: [2, 3]}), 2)
+    with self.assertRaises(RuntimeError):
+      tpu_cluster_resolver._verify_and_return_same_core_count(
+          {0: [0], 1: [1, 2]})
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsSuccess(self, mock_list_devices):
+    device_names = [
+        '/job:tpu_worker/task:0/device:TPU:0',
+        '/job:tpu_worker/task:1/device:TPU:1',
+        '/job:tpu_worker/task:2/device:TPU:0',
+        '/job:tpu_worker/task:3/device:TPU:1',
+        '/job:tpu_worker/task:0/device:TPU:4',
+        '/job:tpu_worker/task:1/device:TPU:5',
+        '/job:tpu_worker/task:2/device:TPU:4',
+        '/job:tpu_worker/task:3/device:TPU:5',
+    ]
+    device_list = [
+        session._DeviceAttributes(
+            name, 'TPU', 1024, 0) for name in device_names
+    ]
+    mock_list_devices.return_value = device_list
+
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    self.assertEqual(resolver.num_accelerators(), 2)
+
+  @mock.patch.object(session.BaseSession, 'list_devices')
+  def testNumAcceleratorsRetryFailure(self, mock_list_devices):
+    resolver = cluster_resolver.TPUClusterResolver(tpu='')
+    mock_list_devices.side_effect = errors.DeadlineExceededError(
+        None, None, 'timeout')
+    with self.assertRaises(RuntimeError):
+      resolver.num_accelerators()
+
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/contrib/distribute/python/cross_tower_ops.py b/tensorflow/python/distribute/cross_device_ops.py
similarity index 85%
rename from tensorflow/contrib/distribute/python/cross_tower_ops.py
rename to tensorflow/python/distribute/cross_device_ops.py
index 6b2fe0acb2edcebf0beb77a69a662f42e11623a4..57c552ca8f0abd36466932d800d9f1f802d9664c 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_ops.py
+++ b/tensorflow/python/distribute/cross_device_ops.py
@@ -21,17 +21,17 @@ from __future__ import print_function
 import collections
 import six
 
-from tensorflow.contrib.distribute.python import cross_tower_utils
-from tensorflow.contrib.distribute.python import values as value_lib
 from tensorflow.python.client import device_lib
+from tensorflow.python.distribute import cross_device_utils
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import device_util
 
 
 def check_destinations(destinations):
@@ -53,10 +53,10 @@ def validate_destinations(destinations):
   if not isinstance(
       destinations,
       (value_lib.DistributedValues, resource_variable_ops.ResourceVariable,
-       value_lib.AggregatingVariable, six.string_types, list)):
+       value_lib.AggregatingVariable, six.string_types, list, tuple)):
     raise ValueError("destinations must be one of a `DistributedValues` object,"
-                     " a tf.Variable object, a device string, a list of device "
-                     "strings")
+                     " a tf.Variable object, a device string, a list or tuple "
+                     "of device strings")
 
   if not check_destinations(destinations):
     raise ValueError("destinations can not be empty")
@@ -103,10 +103,10 @@ def _validate_value_destination_pairs(value_destination_pairs):
   # pylint: disable=g-missing-docstring
   if not value_destination_pairs: return False
   if not isinstance(value_destination_pairs, (list, tuple)): return False
-  if not all([isinstance(pair, tuple) for pair in value_destination_pairs]):
+  if not all(isinstance(pair, tuple) for pair in value_destination_pairs):
     return False
-  if not all([isinstance(v[0], value_lib.PerReplica)
-              for v in value_destination_pairs]):
+  if not all(isinstance(v[0], value_lib.PerReplica)
+             for v in value_destination_pairs):
     return False
   return True
 
@@ -132,10 +132,10 @@ def _devices_match(left, right):
 
 
 def _all_devices_match(value_destination_pairs):
-  if not all([_devices_match(v, d) for v, d in value_destination_pairs]):
+  if not all(_devices_match(v, d) for v, d in value_destination_pairs):
     return False
-  if not all([_devices_match(v, value_destination_pairs[0][0])
-              for v, _ in value_destination_pairs[1:]]):
+  if not all(_devices_match(v, value_destination_pairs[0][0])
+             for v, _ in value_destination_pairs[1:]):
     return False
   return True
 
@@ -144,42 +144,31 @@ def _simple_broadcast(value, destinations):
   index = {}
   devices = get_devices_from(destinations)
   for d in devices:
-    index[d] = cross_tower_utils.copy_tensor_or_indexed_slices_to_device(
+    index[d] = cross_device_utils.copy_tensor_or_indexed_slices_to_device(
         value, d)
   return value_lib.Mirrored(index)
 
 
 def _simple_reduce(per_replica_value, reduce_to_device, accumulation_fn,
-                   aggregation):
+                   reduce_op):
   # pylint: disable=g-missing-docstring
   all_values = []
   count = 0
   for v in per_replica_value._index.values():  # pylint: disable=protected-access
-    if isinstance(v, value_lib.MapOutput):
-      v_list = v.get()
-      if not v_list:
-        continue
-      count += len(v_list)
-      # Sum within each device before aggregating across devices.
-      # TODO(yuefengz): Check whether it helps to use accumulation_fn here.
-      v = cross_tower_utils.aggregate_tensors_or_indexed_slices(
-          v_list, math_ops.add_n)
-    else:
-      count += 1
+    count += 1
     all_values.append(v)
   if not all_values:
     raise ValueError("`per_replica_value` must be non-empty")
 
   with ops.device(reduce_to_device):
     with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
-      reduced = cross_tower_utils.aggregate_tensors_or_indexed_slices(
+      reduced = cross_device_utils.aggregate_tensors_or_indexed_slices(
           all_values, accumulation_fn)
-      if aggregation == vs.VariableAggregation.MEAN:
-        reduced = cross_tower_utils.divide_by_n_tensors_or_indexed_slices(
+      if reduce_op == reduce_util.ReduceOp.MEAN:
+        reduced = cross_device_utils.divide_by_n_tensors_or_indexed_slices(
             reduced, count)
-      elif aggregation != vs.VariableAggregation.SUM:
-        raise ValueError("`aggregation` must be VariableAggregation.SUM "
-                         "or VariableAggregation.MEAN.")
+      elif reduce_op != reduce_util.ReduceOp.SUM:
+        raise ValueError("`reduce_op` must be Reduce.SUM or Reduce.MEAN.")
   return reduced
 
 
@@ -189,15 +178,15 @@ class CrossDeviceOps(object):
   def __init__(self):
     pass
 
-  def reduce(self, aggregation, per_replica_value, destinations):
+  def reduce(self, reduce_op, per_replica_value, destinations):
     """Reduce `per_replica_value` to `destinations`.
 
-    It runs the reduction operation defined by `aggregation` and put the
+    It runs the reduction operation defined by `reduce_op` and put the
     result on `destinations`.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
       per_replica_value: a PerReplica object or a tensor with device set.
       destinations: the reduction destinations.
 
@@ -211,17 +200,17 @@ class CrossDeviceOps(object):
       per_replica_value = _make_tensor_into_per_replica(per_replica_value)
 
     validate_destinations(destinations)
-    return self._reduce(aggregation, per_replica_value, destinations)
+    return self._reduce(reduce_op, per_replica_value, destinations)
 
-  def batch_reduce(self, aggregation, value_destination_pairs):
+  def batch_reduce(self, reduce_op, value_destination_pairs):
     """Reduce PerReplica objects in a batch.
 
     Reduce each first element in `value_destination_pairs` to each second
     element which indicates the destinations.
 
     Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
+      reduce_op: Indicates how per_replica_value will be reduced. Accepted
+        values are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
       value_destination_pairs: a list or a tuple of tuples of PerReplica objects
         (or tensors with device set if there is one device) and destinations.
 
@@ -241,7 +230,7 @@ class CrossDeviceOps(object):
     for _, d in value_destination_pairs:
       validate_destinations(d)
 
-    return self._batch_reduce(aggregation, value_destination_pairs)
+    return self._batch_reduce(reduce_op, value_destination_pairs)
 
   def broadcast(self, tensor, destinations):
     """Broadcast the `tensor` to destinations.
@@ -256,11 +245,11 @@ class CrossDeviceOps(object):
     validate_destinations(destinations)
     return self._broadcast(tensor, destinations)
 
-  def _reduce(self, aggregation, per_replica_value, destinations):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
     raise NotImplementedError(
         "_reduce method must be implemented in descendants.")
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     raise NotImplementedError(
         "_batch_reduce method must be implemented in descendants.")
 
@@ -286,19 +275,19 @@ class ReductionToOneDeviceCrossDeviceOps(CrossDeviceOps):
     self.accumulation_fn = accumulation_fn
     super(ReductionToOneDeviceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, aggregation, per_replica_value, destinations):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
     if check_destinations(destinations):
       devices = get_devices_from(destinations)
     else:
       devices = get_devices_from(per_replica_value)
     reduce_to_device = self.reduce_to_device or devices[0]
     reduced = _simple_reduce(per_replica_value, reduce_to_device,
-                             self.accumulation_fn, aggregation)
+                             self.accumulation_fn, reduce_op)
     return self.broadcast(reduced, devices)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     return [
-        self._reduce(aggregation, t, destinations=v)
+        self._reduce(reduce_op, t, destinations=v)
         for t, v in value_destination_pairs
     ]
 
@@ -333,20 +322,20 @@ def _group_value_by_device(per_replica_values):
 
 def _ungroup_and_make_mirrored(grouped_reduced,
                                destinations,
-                               aggregation,
+                               reduce_op,
                                num_between_graph_workers=1):
   """Ungroup results from all-reduce and make Mirrored objects.
 
   Each all-reduce result will be divided by the number of destinations before
-  Mirrored objects are created if aggregation is "mean".
+  Mirrored objects are created if reduce_op is "mean".
 
   Args:
     grouped_reduced: a list of lists, each sublist has components for each
       device, paired with a None. It is the result from
-      cross_tower_utils.aggregate_gradients_using*.
+      cross_device_utils.aggregate_gradients_using*.
     destinations: a list of device strings for returned Mirrored objects.
-    aggregation: Indicates how a variable will be aggregated. Accepted values
-      are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`.
+    reduce_op: Indicates how values will be aggregated. Accepted values
+      are `tf.distribute.ReduceOp.SUM`, `tf.distribute.ReduceOp.MEAN`.
     num_between_graph_workers: number of workers in the between-graph
       replication.
 
@@ -356,7 +345,7 @@ def _ungroup_and_make_mirrored(grouped_reduced,
   index = [{} for _ in range(len(grouped_reduced[0]))]
   for d, per_replica_reduced in enumerate(grouped_reduced):
     for i, (v, _) in enumerate(per_replica_reduced):
-      if aggregation == vs.VariableAggregation.MEAN:
+      if reduce_op == reduce_util.ReduceOp.MEAN:
         index[i][destinations[d]] = v / (
             len(destinations) * num_between_graph_workers)
       else:
@@ -412,7 +401,7 @@ class ConcatAndSplitPacker(object):
         # all gradient shapes are defined, we use another method to get the
         # total size.
         # TODO(yuefengz): move this logic to array_ops.size.
-        if all([g.shape.is_fully_defined() for g, _ in device_grads_and_vars]):
+        if all(g.shape.is_fully_defined() for g, _ in device_grads_and_vars):
           total_grad_size = sum(
               [g.shape.num_elements() for g, _ in device_grads_and_vars])
         else:
@@ -496,7 +485,7 @@ class AggregateSmallTensorPacker(object):
     """Aggregate small tensors."""
     if (self.agg_small_grads_max_bytes > 0 and
         self.agg_small_grads_max_group > 0):
-      device_grads, self.packing = cross_tower_utils.pack_small_tensors(
+      device_grads, self.packing = cross_device_utils.pack_small_tensors(
           grouped_grads_and_vars,
           max_bytes=self.agg_small_grads_max_bytes,
           max_group=self.agg_small_grads_max_group)
@@ -504,8 +493,8 @@ class AggregateSmallTensorPacker(object):
 
   def unpack(self, summed_device_grad_packs):
     """Reverse the aggregation process."""
-    return cross_tower_utils.unpack_small_tensors(summed_device_grad_packs,
-                                                  self.packing)
+    return cross_device_utils.unpack_small_tensors(summed_device_grad_packs,
+                                                   self.packing)
 
 
 def _pack_tensors(device_grads,
@@ -567,13 +556,13 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
     self._agg_small_grads_max_group = agg_small_grads_max_group
     super(AllReduceCrossDeviceOps, self).__init__()
 
-  def _reduce(self, aggregation, per_replica_value, destinations):
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         per_replica_value)
     if (_devices_match(per_replica_value, destinations)
         and not context.executing_eagerly()
         and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation, [per_replica_value])[0]
+      return self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     else:
       if contains_indexed_slices:
         logging.log_first_n(
@@ -586,16 +575,16 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
         devices = get_devices_from(per_replica_value)
       reduce_to_device = devices[0]
       reduced = _simple_reduce(per_replica_value, reduce_to_device,
-                               math_ops.add_n, aggregation)
+                               math_ops.add_n, reduce_op)
       return self.broadcast(reduced, devices)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
     all_devices_match = _all_devices_match(value_destination_pairs)
-    contains_indexed_slices = cross_tower_utils.contains_indexed_slices(
+    contains_indexed_slices = cross_device_utils.contains_indexed_slices(
         value_destination_pairs)
     if (all_devices_match and not context.executing_eagerly()
         and not contains_indexed_slices):
-      return self._batch_all_reduce(aggregation,
+      return self._batch_all_reduce(reduce_op,
                                     [v[0] for v in value_destination_pairs])
     else:
       if not all_devices_match:
@@ -605,11 +594,11 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
                             10)
 
       return [
-          self._reduce(aggregation, t, destinations=v)
+          self._reduce(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
-  def _batch_all_reduce(self, aggregation, per_replica_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO, "batch_all_reduce invoked for batches size = %d with "
@@ -629,18 +618,18 @@ class AllReduceCrossDeviceOps(CrossDeviceOps):
     # the balance on num_splits.
     if self._all_reduce_alg == "nccl":
       # TODO(yuefengz): merge this into the all-reduce library.
-      reduced = cross_tower_utils.aggregate_gradients_using_nccl(
+      reduced = cross_device_utils.aggregate_gradients_using_nccl(
           device_grad_packs)
     else:
       # TODO(yuefengz): check that gpu ids in `destinations` are in ascending
       # order.
       reduced = (
-          cross_tower_utils.aggregate_gradients_using_hierarchical_copy(
+          cross_device_utils.aggregate_gradients_using_hierarchical_copy(
               destinations, device_grad_packs))
 
     reduced = _unpack_tensors(reduced, tensor_packer)
     return _ungroup_and_make_mirrored(reduced, per_replica_values[0].devices,
-                                      aggregation)
+                                      reduce_op)
 
 
 # For compatibility with code using the old name of `AllReduceCrossDeviceOps`.
@@ -723,7 +712,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
           validate_and_complete_spec(spec) for spec in all_reduce_spec
       ]
 
-  def _batch_all_reduce(self, aggregation, per_replica_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All reduce algorithm in a batch."""
     logging.log_first_n(
         logging.INFO,
@@ -751,13 +740,13 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
         this_grads = remaining_grads
         remaining_grads = []
       else:
-        (this_grads, remaining_grads) = cross_tower_utils.split_grads_by_size(
+        (this_grads, remaining_grads) = cross_device_utils.split_grads_by_size(
             spec_tuple.limit, remaining_grads)
       if this_grads:
         device_grad_packs, tensor_packer = _pack_tensors(
             this_grads, self._num_packs, self._agg_small_grads_max_bytes,
             self._agg_small_grads_max_group)
-        range_agg_grads = cross_tower_utils.sum_gradients_all_reduce(
+        range_agg_grads = cross_device_utils.sum_gradients_all_reduce(
             self._worker_devices, device_grad_packs, len(self._worker_devices),
             spec_tuple.alg, spec_tuple.shards, range(self._num_gpus_per_worker))
         range_agg_grads = _unpack_tensors(range_agg_grads, tensor_packer)
@@ -771,7 +760,7 @@ class MultiWorkerAllReduce(AllReduceCrossDeviceOps):
     assert not remaining_grads
 
     return _ungroup_and_make_mirrored(aggregated_grads, destinations,
-                                      aggregation)
+                                      reduce_op)
 
 
 # TODO(yuefengz): support in-graph collective all-reduce.
@@ -800,20 +789,20 @@ class CollectiveAllReduce(CrossDeviceOps):
     self._num_workers = num_workers
     self._num_gpus_per_worker = num_gpus_per_worker
     self._all_reduce_merge_scope = all_reduce_merge_scope
-    self._collective_keys = collective_keys or cross_tower_utils.CollectiveKeys(
-    )
+    self._collective_keys = (collective_keys or
+                             cross_device_utils.CollectiveKeys())
     super(CollectiveAllReduce, self).__init__()
 
   # TODO(yuefengz, tucker): is indexed slices supported by collective ops?
-  def _reduce(self, aggregation, per_replica_value, destinations):
-    if cross_tower_utils.contains_indexed_slices(per_replica_value):
+  def _reduce(self, reduce_op, per_replica_value, destinations):
+    if cross_device_utils.contains_indexed_slices(per_replica_value):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
     if context.executing_eagerly():
       raise ValueError(
           "Eager execution is not supported for Collective All-Reduce")
 
-    all_reduced = self._batch_all_reduce(aggregation, [per_replica_value])[0]
+    all_reduced = self._batch_all_reduce(reduce_op, [per_replica_value])[0]
     if _devices_match(per_replica_value, destinations):
       return all_reduced
     else:
@@ -829,8 +818,8 @@ class CollectiveAllReduce(CrossDeviceOps):
 
       return value_lib.Mirrored(index)
 
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    if cross_tower_utils.contains_indexed_slices(value_destination_pairs):
+  def _batch_reduce(self, reduce_op, value_destination_pairs):
+    if cross_device_utils.contains_indexed_slices(value_destination_pairs):
       raise ValueError(
           "`IndexSlices` is not supported for Collective All-Reduce.")
     if context.executing_eagerly():
@@ -839,7 +828,7 @@ class CollectiveAllReduce(CrossDeviceOps):
 
     all_devices_match = _all_devices_match(value_destination_pairs)
     if all_devices_match:
-      return self._batch_all_reduce(aggregation,
+      return self._batch_all_reduce(reduce_op,
                                     [v[0] for v in value_destination_pairs])
     else:
       if not all_devices_match:
@@ -848,11 +837,11 @@ class CollectiveAllReduce(CrossDeviceOps):
             "destinations are different.", 10)
 
       return [
-          self._reduce(aggregation, t, destinations=v)
+          self._reduce(reduce_op, t, destinations=v)
           for t, v in value_destination_pairs
       ]
 
-  def _batch_all_reduce(self, aggregation, per_replica_values):
+  def _batch_all_reduce(self, reduce_op, per_replica_values):
     """All-reduce across all workers in a batch."""
     if context.executing_eagerly():
       raise ValueError(
@@ -881,7 +870,7 @@ class CollectiveAllReduce(CrossDeviceOps):
       with ops.name_scope("allreduce"):
         for grad_and_vars in chunk:
           scaled_grads = [g for g, _ in grad_and_vars]
-          collective_reduced = cross_tower_utils.build_collective_reduce(
+          collective_reduced = cross_device_utils.build_collective_reduce(
               scaled_grads, self._num_workers, self._collective_keys, "Add",
               "Id")
           result = []
@@ -893,7 +882,7 @@ class CollectiveAllReduce(CrossDeviceOps):
     return _ungroup_and_make_mirrored(
         new_device_grads,
         per_replica_values[0].devices,
-        aggregation,
+        reduce_op,
         num_between_graph_workers=self._num_workers)
 
 
@@ -927,15 +916,15 @@ def _choose_all_reduce_algorithm(device_links):
 
 
 def choose_the_best(devices, session_config=None):
-  """Find the best subclass of CrossDeviceOps given a tensorflow session.
+  """Find the best subclass of CrossDeviceOps given a session config.
 
   Args:
-    devices: a list of devices passed for distribute strategy.
-    session_config: a tensorflow session config or None. If None, it will make
-      deciesion based on all local devices.
+    devices: a list of devices passed to `tf.distribute.Strategy`.
+    session_config: a `tf.ConfigProto` or `None`. If `None`, it will make
+      decision based on all local devices.
 
   Returns:
-    a subclass of CrossDeviceOps.
+    A subclass of `CrossDeviceOps`.
   """
   requested_devices = set([device_util.canonicalize(d) for d in devices])
   machine_devices = device_lib.list_local_devices(session_config=session_config)
@@ -948,13 +937,13 @@ def choose_the_best(devices, session_config=None):
           "Device is available but not used by distribute strategy: %s", d.name)
 
   if len(using_devices) != len(requested_devices):
-    logging.warning("Not all devices in distribute strategy are visible by "
-                    "TensorFlow sessions.")
+    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
+                    "to TensorFlow.")
     return ReductionToOneDeviceCrossDeviceOps()
 
-  if any([d.device_type.lower() != "gpu" for d in using_devices]):
-    logging.warning("Not all devices in DistributionStrategy are visible to "
-                    "TensorFlow session.")
+  if any(d.device_type.lower() != "gpu" for d in using_devices):
+    logging.warning("Not all devices in `tf.distribute.Strategy` are visible "
+                    "to TensorFlow.")
     return ReductionToOneDeviceCrossDeviceOps()
 
   device_links = [[] for _ in range(len(using_devices))]
diff --git a/tensorflow/contrib/distribute/python/cross_tower_utils.py b/tensorflow/python/distribute/cross_device_utils.py
similarity index 98%
rename from tensorflow/contrib/distribute/python/cross_tower_utils.py
rename to tensorflow/python/distribute/cross_device_utils.py
index 35324d15d4416364698390468d65d442f442ec50..0faadd7e0cfe69bf8c80399574dd67be53ebcfe0 100644
--- a/tensorflow/contrib/distribute/python/cross_tower_utils.py
+++ b/tensorflow/python/distribute/cross_device_utils.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utilities for cross_tower_ops."""
+"""Utilities for cross_device_ops."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,8 +21,8 @@ from __future__ import print_function
 import collections as pycoll
 import threading
 
-from tensorflow.contrib.all_reduce.python import all_reduce
-from tensorflow.contrib.distribute.python import values as value_lib
+from tensorflow.python.distribute import all_reduce
+from tensorflow.python.distribute import values as value_lib
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -420,7 +420,7 @@ def sum_gradients_all_reduce(dev_prefixes, replica_grads, num_workers, alg,
   Returns:
     list of reduced tensors
   """
-  alg_contains_shuffle = any([n in alg for n in ['pscpu', 'psgpu']])
+  alg_contains_shuffle = any(n in alg for n in ['pscpu', 'psgpu'])
   is_hierarchical = '/' in alg
   if 'pscpu' in alg:
     aux_devices = [prefix + '/cpu:0' for prefix in dev_prefixes]
@@ -667,7 +667,5 @@ def contains_indexed_slices(value):
     return any(contains_indexed_slices(v) for v in value)
   elif isinstance(value, value_lib.DistributedValues):
     return contains_indexed_slices(list(value._index.values()))  # pylint: disable=protected-access
-  elif isinstance(value, value_lib.MapOutput):
-    return contains_indexed_slices(value.get())
   else:
     return False
diff --git a/tensorflow/python/training/device_util.py b/tensorflow/python/distribute/device_util.py
similarity index 98%
rename from tensorflow/python/training/device_util.py
rename to tensorflow/python/distribute/device_util.py
index 70e1ca4b5d77e5e7529cb0d06a9ffb4657dc74fe..34474582adfa8c73c4a7bbbe130dcf6faf88ce0b 100644
--- a/tensorflow/python/training/device_util.py
+++ b/tensorflow/python/distribute/device_util.py
@@ -50,7 +50,7 @@ def canonicalize(d, default=None):
   # Fill in missing device fields using defaults.
   result = tf_device.DeviceSpec(
       replica=0, task=0, device_type="CPU", device_index=0)
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     result.job = "localhost"
   if default:
     result.merge_from(tf_device.DeviceSpec.from_string(default))
diff --git a/tensorflow/python/training/device_util_test.py b/tensorflow/python/distribute/device_util_test.py
similarity index 95%
rename from tensorflow/python/training/device_util_test.py
rename to tensorflow/python/distribute/device_util_test.py
index cdbb08229d2f06c2cfeeb855b32665f7c03ea969..2f0d7ed3b317f59e314148c583a8f1f69240b37b 100644
--- a/tensorflow/python/training/device_util_test.py
+++ b/tensorflow/python/distribute/device_util_test.py
@@ -18,14 +18,16 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import device_util
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
-from tensorflow.python.training import device_util
 
 
 class DeviceUtilTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCurrentDeviceWithGlobalGraph(self):
     with ops.device("/cpu:0"):
       self.assertEqual(device_util.current(), "/device:CPU:0")
@@ -49,6 +51,7 @@ class DeviceUtilTest(test.TestCase):
         self.assertEqual(device_util.current(),
                          "/job:localhost/replica:0/task:0/device:CPU:0")
 
+  @test_util.run_deprecated_v1
   def testCanonicalizeWithoutDefaultDevice(self):
     self.assertEqual(
         device_util.canonicalize("/cpu:0"),
diff --git a/tensorflow/python/distribute/distribute_coordinator.py b/tensorflow/python/distribute/distribute_coordinator.py
index 520413102bec27f762acc242a8e2a99a58ed4ce5..c0f9b8a1fdfdf8bd95375f489058cadcd63c9cb9 100644
--- a/tensorflow/python/distribute/distribute_coordinator.py
+++ b/tensorflow/python/distribute/distribute_coordinator.py
@@ -245,7 +245,7 @@ class _WorkerContext(object):
     else:
       session_config = self._session_config
 
-    if not self._strategy or self._strategy.should_init:
+    if not self._strategy or self._strategy.extended.experimental_should_init:
       logging.info("Creating chief session creator with config: %r", config)
       return monitored_session.ChiefSessionCreator(
           scaffold,
@@ -261,6 +261,10 @@ class _WorkerContext(object):
           config=session_config,
           max_wait_secs=max_wait_secs)
 
+  @property
+  def session_config(self):
+    return copy.deepcopy(self._session_config)
+
   @property
   def has_barrier(self):
     """Whether the barrier is set or not."""
@@ -301,15 +305,20 @@ class _WorkerContext(object):
     """Returns number of workers in the cluster, including chief."""
     return self._num_workers
 
+  @property
+  def experimental_should_init(self):
+    """Whether to run init ops."""
+    return self._strategy.extended.experimental_should_init
+
   @property
   def should_checkpoint(self):
     """Whether to save checkpoint."""
-    return self._strategy.should_checkpoint
+    return self._strategy.extended.should_checkpoint
 
   @property
   def should_save_summary(self):
     """Whether to save summaries."""
-    return self._strategy.should_save_summary
+    return self._strategy.extended.should_save_summary
 
 
 def _run_single_worker(worker_fn,
@@ -623,10 +632,10 @@ def run_distribute_coordinator(worker_fn,
   The `strategy` object is expected to be a DistributionStrategy object which
   has implemented methods needed by distributed coordinator such as
   `configure(session_config, cluster_spec, task_type, task_id)` which configures
-  the strategy object for a specific task and `should_init` property which
-  instructs the distribute coordinator whether to run init ops for a task. The
-  distribute coordinator will make a copy of the `strategy` object, call its
-  `configure` method and pass it to `worker_fn` as an argument.
+  the strategy object for a specific task and `experimental_should_init`
+  property which instructs the distribute coordinator whether to run init ops
+  for a task. The distribute coordinator will make a copy of the `strategy`
+  object, call its `configure` method and pass it to `worker_fn` as an argument.
 
   The `worker_fn` defines the training logic and is called under a its own
   worker context which can be accessed to via `get_current_worker_context`. A
@@ -749,7 +758,7 @@ def run_distribute_coordinator(worker_fn,
     # The client must know the cluster but servers in the cluster don't have to
     # know the client.
     if task_type in [_TaskType.CLIENT, None]:
-      if strategy.between_graph:
+      if strategy.extended.experimental_between_graph:
         return _run_between_graph_client(worker_fn, strategy, eval_fn,
                                          eval_strategy, cluster_spec,
                                          session_config, rpc_layer)
@@ -795,7 +804,7 @@ def run_distribute_coordinator(worker_fn,
         environment=environment)
 
     if task_type in [_TaskType.CHIEF, _TaskType.WORKER]:
-      if strategy.between_graph:
+      if strategy.extended.experimental_between_graph:
         # All jobs run `worker_fn` if between-graph.
         _run_single_worker(worker_fn, strategy, cluster_spec, task_type,
                            task_id, session_config, rpc_layer)
diff --git a/tensorflow/python/distribute/distribute_coordinator_test.py b/tensorflow/python/distribute/distribute_coordinator_test.py
index 5d336648ce97f30dc034b1b42af994830baeffc8..7598c105c2dd763c524e50e139fdd9984f1bd0c0 100644
--- a/tensorflow/python/distribute/distribute_coordinator_test.py
+++ b/tensorflow/python/distribute/distribute_coordinator_test.py
@@ -47,6 +47,7 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
+from tensorflow.python.training import session_manager
 
 
 CHIEF = distribute_coordinator._TaskType.CHIEF
@@ -78,46 +79,53 @@ def _strip_protocol(target):
     return target
 
 
-class MockStrategy(object):
+class MockExtended(object):
 
   def __init__(self,
                between_graph=False,
                should_init=None,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
 
-  @property
-  def between_graph(self):
-    return self._between_graph
+
+class MockStrategy(object):
+
+  def __init__(self,
+               between_graph=False,
+               should_init=None,
+               should_checkpoint=None,
+               should_save_summary=None):
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
   def configure(self,
                 session_config=None,
                 cluster_spec=None,
                 task_type=None,
                 task_id=None):
-    if self._should_init is None:
+    if self.extended.experimental_should_init is None:
       if task_id == 0:
-        self._should_init = True
+        self.extended.experimental_should_init = True
       else:
-        self._should_init = False
-    if self._should_checkpoint is None:
+        self.extended.experimental_should_init = False
+    if self.extended.should_checkpoint is None:
       if task_id == 0:
-        self._should_checkpoint = True
+        self.extended.should_checkpoint = True
       else:
-        self._should_checkpoint = False
-    if self._should_save_summary is None:
+        self.extended.should_checkpoint = False
+    if self.extended.should_save_summary is None:
       if task_id == 0:
-        self._should_save_summary = True
+        self.extended.should_save_summary = True
       else:
-        self._should_save_summary = False
+        self.extended.should_save_summary = False
 
     if session_config:
       if (cluster_spec and task_type and task_id is not None and
-          self._between_graph):
+          self.extended.experimental_between_graph):
         session_config.intra_op_parallelism_threads += 1
         if task_type in ["chief", "worker"]:
           session_config.device_filters.extend(
@@ -126,18 +134,6 @@ class MockStrategy(object):
         session_config.inter_op_parallelism_threads += 1
         session_config.device_filters.append("/job:somejob")
 
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
-
 
 class MockServer(object):
 
@@ -372,9 +368,12 @@ class DistributeCoordinatorTestBase(test.TestCase):
     context = distribute_coordinator_context.get_current_worker_context()
     self.assertTrue(context is not None)
 
-    self.assertEqual(context._strategy.should_init, strategy.should_init)
-    self.assertEqual(context.should_checkpoint, strategy.should_checkpoint)
-    self.assertEqual(context.should_save_summary, strategy.should_save_summary)
+    self.assertEqual(context._strategy.extended.experimental_should_init,
+                     strategy.extended.experimental_should_init)
+    self.assertEqual(context.should_checkpoint,
+                     strategy.extended.should_checkpoint)
+    self.assertEqual(context.should_save_summary,
+                     strategy.extended.should_save_summary)
 
     task_type = str(context.task_type)
     task_id = context.task_id or 0
@@ -384,7 +383,8 @@ class DistributeCoordinatorTestBase(test.TestCase):
       while len(self._strategy_property[task_type]) <= task_id:
         self._strategy_property[task_type].append(None)
       self._strategy_property[task_type][task_id] = (
-          context._strategy.should_init, context.should_checkpoint,
+          context._strategy.extended.experimental_should_init,
+          context.should_checkpoint,
           context.should_save_summary)
 
   def _run_mock_std_server(self,
@@ -427,6 +427,7 @@ class DistributeCoordinatorTestStandaloneMode(DistributeCoordinatorTestBase):
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     """Test monitored session in standalone client mode."""
     distribute_coordinator.run_distribute_coordinator(
@@ -600,6 +601,7 @@ class DistributeCoordinatorTestInpendentWorkerMode(
     # Each finished worker will increment self._result_correct.
     self.assertEqual(self._result_correct, NUM_WORKERS)
 
+  @test_util.run_v1_only("b/120545219")
   def testBetweenGraphWithMonitoredSession(self):
     cluster_spec = self._create_cluster_spec(
         num_workers=NUM_WORKERS, num_ps=NUM_PS)
@@ -930,4 +932,14 @@ class RunStandardTensorflowServerTest(test.TestCase):
 if __name__ == "__main__":
   # TODO(yuefengz): find a smart way to terminite std server threads.
   with test.mock.patch.object(sys, "exit", os._exit):
+    # Reduce `recovery_wait_secs` from 30 seconds so the test completes quickly.
+    orig_init = session_manager.SessionManager.__init__
+
+    def new_init(*args, **kwargs):
+      kwargs.pop("recovery_wait_secs", None)
+      kwargs["recovery_wait_secs"] = 0.5
+      orig_init(*args, **kwargs)
+
+    session_manager.SessionManager.__init__ = new_init
+
     test.main()
diff --git a/tensorflow/python/distribute/distribute_lib.py b/tensorflow/python/distribute/distribute_lib.py
new file mode 100644
index 0000000000000000000000000000000000000000..87bf510ec549f6bf1ccabfba438d2c64fd5a88d9
--- /dev/null
+++ b/tensorflow/python/distribute/distribute_lib.py
@@ -0,0 +1,1682 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Library for running a computation across multiple devices."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import threading
+import weakref
+import enum
+
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.eager import context as eager_context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import tf_logging
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+from tensorflow.tools.docs import doc_controls
+
+
+# ------------------------------------------------------------------------------
+# Context tracking whether in a strategy.update() or .update_non_slot() call.
+
+
+_update_device = threading.local()
+
+
+def get_update_device():
+  """Get the current device if in a `tf.distribute.Strategy.update()` call."""
+  try:
+    return _update_device.current
+  except AttributeError:
+    return None
+
+
+class UpdateContext(object):
+  """Context manager when you are in `update()` or `update_non_slot()`."""
+
+  def __init__(self, device):
+    self._device = device
+    self._old_device = None
+
+  def __enter__(self):
+    self._old_device = get_update_device()
+    _update_device.current = self._device
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+    _update_device.current = self._old_device
+
+
+# ------------------------------------------------------------------------------
+# Public utility functions.
+
+
+@tf_export("distribute.get_loss_reduction")
+def get_loss_reduction():
+  """`tf.distribute.ReduceOp` corresponding to the last loss reduction."""
+  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
+  if (loss_reduction == losses_impl.Reduction.SUM or
+      loss_reduction == losses_impl.ReductionV2.SUM):
+    return reduce_util.ReduceOp.SUM
+  return reduce_util.ReduceOp.MEAN
+
+
+# ------------------------------------------------------------------------------
+# Internal API for validating the current thread mode
+
+
+def _require_cross_replica_context_extended(extended):
+  """Verify in cross-replica context."""
+  context = _get_per_thread_mode()
+  cross_replica = context.cross_replica_context
+  if cross_replica is not None and cross_replica.extended is extended:
+    return
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
+  # We have an error to report, figure out the right message.
+  if context.distribution_strategy is not strategy:
+    _wrong_strategy_scope(strategy, context)
+  assert cross_replica is None
+  raise RuntimeError("Method requires being in cross-replica context, use "
+                     "get_replica_context().merge_call()")
+
+
+def _wrong_strategy_scope(strategy, context):
+  # Figure out the right error message.
+  if not distribution_strategy_context.has_distribution_strategy():
+    raise RuntimeError(
+        'Need to be inside "with strategy.scope()" for %s' %
+        (strategy,))
+  else:
+    raise RuntimeError(
+        "Mixing different tf.distribute.Strategy objects: %s is not %s" %
+        (context.distribution_strategy, strategy))
+
+
+def require_replica_context(replica_ctx):
+  """Verify in `replica_ctx` replica context."""
+  context = _get_per_thread_mode()
+  if context.replica_context is replica_ctx: return
+  # We have an error to report, figure out the right message.
+  if context.replica_context is None:
+    raise RuntimeError("Need to be inside `call_for_each_replica()`")
+  if context.distribution_strategy is replica_ctx.distribution_strategy:
+    # Two different ReplicaContexts with the same tf.distribute.Strategy.
+    raise RuntimeError("Mismatching ReplicaContext.")
+  raise RuntimeError(
+      "Mismatching tf.distribute.Strategy objects: %s is not %s." %
+      (context.distribution_strategy, replica_ctx.distribution_strategy))
+
+
+def _require_distribution_strategy_scope_strategy(strategy):
+  """Verify in a `strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy is strategy: return
+  _wrong_strategy_scope(strategy, context)
+
+
+def _require_distribution_strategy_scope_extended(extended):
+  """Verify in a `distribution_strategy.scope()` in this thread."""
+  context = _get_per_thread_mode()
+  if context.distribution_strategy.extended is extended: return
+  # Report error.
+  strategy = extended._container_strategy()  # pylint: disable=protected-access
+  _wrong_strategy_scope(strategy, context)
+
+
+# ------------------------------------------------------------------------------
+# Internal context managers used to implement the DistributionStrategy
+# base class
+
+
+class _CurrentDistributionContext(object):
+  """Context manager setting the current `tf.distribute.Strategy`.
+
+  Also: overrides the variable creator and optionally the current device.
+  """
+
+  def __init__(self,
+               strategy,
+               var_creator_scope,
+               var_scope=None,
+               default_device=None):
+    self._context = distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
+        strategy)
+    self._var_creator_scope = var_creator_scope
+    self._var_scope = var_scope
+    if default_device:
+      self._device_scope = ops.device(default_device)
+    else:
+      self._device_scope = None
+
+  def __enter__(self):
+    _push_per_thread_mode(self._context)
+    if self._var_scope:
+      self._var_scope.__enter__()
+    self._var_creator_scope.__enter__()
+    if self._device_scope:
+      self._device_scope.__enter__()
+    return self._context.distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    if self._device_scope:
+      self._device_scope.__exit__(exception_type, exception_value, traceback)
+    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
+    if self._var_scope:
+      self._var_scope.__exit__(exception_type, exception_value, traceback)
+    _pop_per_thread_mode()
+
+
+class _SameScopeAgainContext(object):
+  """Trivial context manager when you are already in `scope()`."""
+
+  def __init__(self, strategy):
+    self._distribution_strategy = strategy
+
+  def __enter__(self):
+    return self._distribution_strategy
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    del exception_type, exception_value, traceback
+
+
+# TODO(yuefengz): add more replication modes.
+@tf_export("distribute.InputReplicationMode")
+class InputReplicationMode(enum.Enum):
+  """Replication mode for input function."""
+
+  # The input function will be called on each worker independently, creating as
+  # many input pipelines as number of workers. Replicas will dequeue from the
+  # local Dataset on their worker. Distribution Strategy doesn't manage any
+  # state sharing between such separate input pipelines.
+  PER_WORKER = "PER_WORKER"
+
+
+@tf_export("distribute.InputContext")
+class InputContext(object):
+  """A class wrapping information needed by an input function.
+
+  This is a context class that is passed to the user's input fn and contains
+  information about the compute replicas and input pipelines. The number of
+  compute replicas (in sync training) helps compute per input pipeline batch
+  size from the desired global batch size. Input pipeline information can be
+  used to return a different subset of the input in each input pipeline (for
+  e.g. shard the input pipeline, use a different input source etc).
+  """
+
+  def __init__(self,
+               num_input_pipelines=1,
+               input_pipeline_id=0,
+               num_replicas_in_sync=1):
+    """Initializes an InputContext object.
+
+    Args:
+      num_input_pipelines: the number of input pipelines in a cluster.
+      input_pipeline_id: the current input pipeline id, should be an int in
+        [0,`num_input_pipelines`).
+      num_replicas_in_sync: the number of replicas that are in sync.
+    """
+    self._num_input_pipelines = num_input_pipelines
+    self._input_pipeline_id = input_pipeline_id
+    self._num_replicas_in_sync = num_replicas_in_sync
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns the number of compute replicas in sync."""
+    return self._num_replicas_in_sync
+
+  @property
+  def input_pipeline_id(self):
+    """Returns the input pipeline ID."""
+    return self._input_pipeline_id
+
+  @property
+  def num_input_pipelines(self):
+    """Returns the number of input pipelines."""
+    return self._num_input_pipelines
+
+  def get_per_replica_batch_size(self, global_batch_size):
+    """Returns the per-replica batch size.
+
+    Args:
+      global_batch_size: the global batch size which should be divisible by
+        `num_replicas_in_sync`.
+
+    Returns:
+      the per-replica batch size.
+
+    Raises:
+      ValueError: if `global_batch_size` not divisible by
+        `num_replicas_in_sync`.
+    """
+    if global_batch_size % self._num_replicas_in_sync != 0:
+      raise ValueError("The `global_batch_size` %r is not divisible by "
+                       "`num_replicas_in_sync` %r " %
+                       (global_batch_size, self._num_replicas_in_sync))
+    return global_batch_size // self._num_replicas_in_sync
+
+
+# ------------------------------------------------------------------------------
+# Base classes for all distribution strategies.
+
+
+@tf_export("distribute.Strategy")
+class DistributionStrategy(object):
+  """A list of devices with a state & compute distribution policy.
+
+  See [tensorflow/contrib/distribute/README.md](
+  https://www.tensorflow.org/code/tensorflow/contrib/distribute/README.md)
+  for overview and examples.
+  """
+
+  # TODO(josh11b): Raise an exception if variable partitioning requested before
+  #   we add support.
+  # TODO(josh11b): Also `parameter_device_index` property?
+  # TODO(josh11b): `map()`
+  # TODO(josh11b): ClusterSpec/ClusterResolver
+  # TODO(josh11b): Partitioned computations, state; sharding
+  # TODO(josh11b): Model parallelism: "replicas" with multiple devices; shuffling
+  # TODO(josh11b): List of replicas with their worker and parameter devices
+  #   (where the parameter devices may overlap in the ps case).
+
+  def __init__(self, extended):
+    self._extended = extended
+
+  @property
+  def extended(self):
+    """`tf.distribute.StrategyExtended` with additional methods."""
+    return self._extended
+
+  def scope(self):
+    """Returns a context manager selecting this Strategy as current.
+
+    Inside a `with strategy.scope():` code block, this thread
+    will use a variable creator set by `strategy`, and will
+    enter its "cross-replica context".
+
+    Returns:
+      A context manager.
+    """
+    return self._extended._scope(self)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def read_var(self, v):
+    """DEPRECATED: use extended.read_var() instead."""
+    return self._extended.read_var(v)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def colocate_vars_with(self, colocate_with_variable):
+    """DEPRECATED: use extended.colocate_vars_with() instead."""
+    return self._extended.colocate_vars_with(colocate_with_variable)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED
+  def distribute_dataset(self, dataset_fn):
+    """Return a `dataset` split across all replicas.  DEPRECATED.
+
+    DEPRECATED: Please use `make_dataset_iterator` or
+    `make_input_fn_iterator` instead.
+
+    Suitable for providing input to `extended.call_for_each_replica()` by
+    creating an iterator:
+
+    ```
+    def dataset_fn():
+      return tf.data.Dataset.from_tensors([[1.]]).repeat()
+
+    with strategy.scope():
+      distributed_dataset = strategy.distribute_dataset(dataset_fn)
+      iterator = distributed_dataset.make_initializable_iterator()
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+    ```
+
+    Args:
+      dataset_fn: A function that returns a `tf.data.Dataset`.
+
+    Returns:
+      A `PerReplicaDataset` that will produce data for each replica.
+    """
+    return self._extended._distribute_dataset(dataset_fn)  # pylint: disable=protected-access
+
+  def make_dataset_iterator(self, dataset):
+    """Makes an iterator for input provided via input_dataset.
+
+    Data from the given dataset will be distributed evenly across all the
+    compute replicas. We will assume that the input dataset is batched by the
+    global batch size. With this assumption, we will make a best effort to
+    divide each batch across all the replicas (one or more workers).
+    If this effort fails, an error will be thrown, and the user should instead
+    use `make_input_fn_iterator` which provides more control to the user, and
+    does not try to divide a batch across replicas.
+
+    The user could also use `make_input_fn_iterator` if they want to
+    customize which input is fed to which replica/worker etc.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be distributed evenly across all
+        replicas.
+
+    Returns:
+      An `tf.distribute.InputIterator` which returns inputs for each step of the
+      computation.  User should call `initialize` on the returned iterator.
+    """
+    return self._extended._make_dataset_iterator(dataset)  # pylint: disable=protected-access
+
+  def make_input_fn_iterator(self,
+                             input_fn,
+                             replication_mode=InputReplicationMode.PER_WORKER):
+    """Returns an iterator split across replicas created from an input function.
+
+    The `input_fn` should take an `tf.distribute.InputContext` object where
+    information about input sharding can be accessed:
+
+    ```
+    def input_fn(input_context):
+      d = tf.data.Dataset.from_tensors([[1.]]).repeat()
+      return d.shard(input_context.num_input_pipelines,
+                     input_context.input_pipeline_id)
+    with strategy.scope():
+      iterator = strategy.make_input_fn_iterator(
+          input_fn)
+      replica_results = strategy.extended.call_for_each_replica(
+          replica_fn, iterator.get_next())
+    ```
+
+    Args:
+      input_fn: A function that returns a `tf.data.Dataset`. This function is
+        expected to take an `tf.distribute.InputContext` object.
+      replication_mode: an enum value of `tf.distribute.InputReplicationMode`.
+        Only `PER_WORKER` is supported currently.
+
+    Returns:
+      An iterator object that can be initialized and fetched next element.
+    """
+    if replication_mode != InputReplicationMode.PER_WORKER:
+      raise ValueError(
+          "Input replication mode not supported: %r" % replication_mode)
+    return self.extended._make_input_fn_iterator(  # pylint: disable=protected-access
+        input_fn, replication_mode=replication_mode)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def broadcast(self, tensor, destinations=None):
+    """DEPRECATED: use extended.broadcast_to() instead."""
+    return self._extended.broadcast_to(tensor, destinations)
+
+  @doc_controls.do_not_generate_docs  # Use experimental_initialize() instead.
+  def initialize(self):
+    """DEPRECATED: Use `experimental_initialize()` instead."""
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  def experimental_initialize(self):
+    """Any initialization to be done before running any computations.
+
+    In eager mode, it executes any initialization as a side effect.
+    In graph mode, it creates the initialization ops and returns them.
+
+    For example, TPU initialize_system ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._initialize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # Use experimental_finalize() instead.
+  def finalize(self):
+    """DEPRECATED: Use `experimental_finalize()` instead."""
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  def experimental_finalize(self):
+    """Any final actions to be done at the end of all computations.
+
+    In eager mode, it executes any finalize actions as a side effect.
+    In graph mode, it creates the finalize ops and returns them.
+
+    For example, TPU shutdown ops.
+
+    Returns:
+      A list of ops to execute.
+    """
+    return self._extended._finalize()  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def run_steps_on_dataset(self, fn, iterator, iterations=1,
+                           initial_loop_values=None):
+    """DEPRECATED: use extended.experimental_run_steps_on_iterator() instead."""
+    return self._extended.experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def call_for_each_replica(self, fn, *args, **kwargs):
+    """DEPRECATED: use extended.call_for_each_replica() instead."""
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to call_for_each_replica")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
+      kwargs = k
+    kwargs.pop("run_concurrently", None)  # Ignore old option.
+    return self._extended.call_for_each_replica(fn, args, kwargs)
+
+  def reduce(self, reduce_op, value):
+    """Reduce `value` across replicas.
+
+    Args:
+      reduce_op: A `tf.distribute.ReduceOp` value specifying how values should
+        be combined.
+      value: A "per replica" value to be combined into a single tensor.
+
+    Returns:
+      A `Tensor`.
+    """
+    _require_cross_replica_context_extended(self._extended)
+    return self._extended._reduce(reduce_op, value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def batch_reduce(self, aggregation, value_destination_pairs):
+    """DEPRECATED: use extended.batch_reduce_to() instead."""
+    return self._extended.batch_reduce_to(aggregation, value_destination_pairs)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update(self, var, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update")
+      kwargs = k
+    return self._extended.update(var, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
+    """DEPRECATED: use extended.update_non_slot() instead."""
+    group = kwargs.pop("group", True)
+    # We temporarily support "grouped" in addition to "group" for backward-
+    # compatibility.
+    group = kwargs.pop("grouped", True) and group
+    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
+    # allow transition.
+    a = kwargs.pop("args", None)
+    if a is not None:
+      if args:
+        raise ValueError(
+            "Can't pass *args and args=... to update_non_slot")
+      args = a
+    k = kwargs.pop("kwargs", None)
+    if k is not None:
+      if kwargs:
+        raise ValueError(
+            "Can't pass **kwargs and kwargs=... to update_non_slot")
+      kwargs = k
+    return self._extended.update_non_slot(
+        colocate_with, fn, args, kwargs, group)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def unwrap(self, value):
+    """Returns the list of all per-replica values contained in `value`.
+
+    Args:
+      value: A value returned by `extended.call_for_each_replica()` or a
+        variable created in `scope`.
+
+    Returns:
+      A tuple of values contained in `value`. If `value` represents a single
+      value, this returns `(value,).`
+    """
+    return self._extended._unwrap(value)  # pylint: disable=protected-access
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def value_container(self, value):
+    """DEPRECATED: use extended.value_container() instead."""
+    return self._extended.value_container(value)
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, -> `DistributedValues`
+  def group(self, value, name=None):
+    """Shortcut for `tf.group(self.unwrap(value))`."""
+    return self._extended._group(value, name)  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def require_static_shapes(self):
+    """DEPRECATED: use extended.require_static_shapes instead."""
+    return self._extended.experimental_require_static_shapes
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._extended._num_replicas_in_sync  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def worker_devices(self):
+    """DEPRECATED: use extended.worker_devices instead."""
+    return self._extended.worker_devices
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def parameter_devices(self):
+    """DEPRECATED: use extended.parameter_devices instead."""
+    return self._extended.parameter_devices
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def non_slot_devices(self, var_list):
+    """DEPRECATED: use extended.non_slot_devices instead."""
+    return self._extended.non_slot_devices(var_list)
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def between_graph(self):
+    """DEPRECATED: use extended.experimental_between_graph instead."""
+    return self._extended.experimental_between_graph
+
+  @doc_controls.do_not_generate_docs  # DEPRECATED, being replaced by a new API.
+  def configure(self,
+                session_config=None,
+                cluster_spec=None,
+                task_type=None,
+                task_id=None):
+    # pylint: disable=g-doc-return-or-yield,g-doc-args
+    """DEPRECATED: use `update_config_proto` instead.
+
+    Configures the strategy class.
+
+    DEPRECATED: This method's functionality has been split into the strategy
+    constructor and `update_config_proto`. In the future, we will allow passing
+    cluster and config_proto to the constructor to configure the strategy. And
+    `update_config_proto` can be used to update the config_proto based on the
+    specific strategy.
+    """
+    return self._extended._configure(  # pylint: disable=protected-access
+        session_config, cluster_spec, task_type, task_id)
+
+  def update_config_proto(self, config_proto):
+    """Returns a copy of `config_proto` modified for use with this strategy.
+
+    The updated config has something needed to run a strategy, e.g.
+    configuration to run collective ops, or device filters to improve
+    distributed training performance.
+
+    Args:
+      config_proto: a `tf.ConfigProto` object.
+
+    Returns:
+      The updated copy of the `config_proto`.
+    """
+    return self._extended._update_config_proto(config_proto)  # pylint: disable=protected-access
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_init(self):
+    """DEPRECATED: use extended.should_init instead."""
+    return self._extended.experimental_should_init
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_checkpoint(self):
+    """DEPRECATED: use extended.should_checkpoint instead."""
+    return self._extended.should_checkpoint
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, moving to `extended`
+  def should_save_summary(self):
+    """DEPRECATED: use extended.should_save_summary instead."""
+    return self._extended.should_save_summary
+
+  def __deepcopy__(self, memo):
+    # First do a regular deepcopy of `self`.
+    cls = self.__class__
+    result = cls.__new__(cls)
+    memo[id(self)] = result
+    for k, v in self.__dict__.items():
+      setattr(result, k, copy.deepcopy(v, memo))
+    # One little fix-up: we want `result._extended` to reference `result`
+    # instead of `self`.
+    result._extended._container_strategy_weakref = weakref.ref(result)  # pylint: disable=protected-access
+    return result
+
+  def __copy__(self):
+    raise RuntimeError("Must only deepcopy DistributionStrategy.")
+
+
+@tf_export("distribute.StrategyExtended")
+class DistributionStrategyExtended(object):
+  """Additional APIs for algorithms that need to be distribution-aware.
+
+  The intent is that you can write an algorithm in a stylized way and
+  it will be usable with a variety of different
+  `tf.distribute.Strategy`
+  implementations. Each descendant will implement a different strategy
+  for distributing the algorithm across multiple devices/machines.
+  Furthermore, these changes can be hidden inside the specific layers
+  and other library classes that need special treatment to run in a
+  distributed setting, so that most users' model definition code can
+  run unchanged. The `tf.distribute.Strategy` API works the same way
+  with eager and graph execution.
+
+  First let's introduce a few high-level concepts:
+
+  * _Data parallelism_ is where we run multiple copies of the model
+    on different slices of the input data. This is in contrast to
+    _model parallelism_ where we divide up a single copy of a model
+    across multiple devices.
+    Note: we only support data parallelism for now, but
+    hope to add support for model parallelism in the future.
+  * A _replica_ is one copy of the model, running on one slice of the
+    input data.
+  * _Synchronous_, or more commonly _sync_, training is where the
+    updates from each replica are aggregated together before updating
+    the model variables. This is in contrast to _asynchronous_, or
+    _async_ training, where each replica updates the model variables
+    independently.
+  * Furthermore you might run your computation on multiple devices
+    on one machine (or "host"), or on multiple machines/hosts.
+    If you are running on multiple machines, you might have a
+    single master host that drives computation across all of them,
+    or you might have multiple clients driving the computation
+    asynchronously.
+
+  To distribute an algorithm, we might use some of these ingredients:
+
+  * Parameter servers: These are hosts that hold a single copy of
+    parameters/variables. All replicas that want to operate on a variable
+    retrieve it at the beginning of a step and send an update to be
+    applied at the end of the step. Can support either sync or async
+    training.
+  * Mirrored variables: These are variables that are copied to multiple
+    devices, where we keep the copies in sync by applying the same
+    updates to every copy. Normally would only be used with sync training.
+  * Reductions and Allreduce: A _reduction_ is some method of
+    aggregating multiple values into one value, like "sum" or
+    "mean". If doing sync training, we will perform a reduction on the
+    gradients to a parameter from all replicas before applying the
+    update. Allreduce is an algorithm for performing a reduction on
+    values from multiple devices and making the result available on
+    all of those devices.
+  * In the future we will have support for TensorFlow's partitioned
+    variables, where a single variable is split across multiple
+    devices.
+
+  We have then a few approaches we want to support:
+
+  * Code written (as if) with no knowledge of class `tf.distribute.Strategy`.
+    This code should work as before, even if some of the layers, etc.
+    used by that code are written to be distribution-aware. This is done
+    by having a default `tf.distribute.Strategy` that gives ordinary behavior,
+    and by default being in a single replica context.
+  * Ordinary model code that you want to run using a specific
+    `tf.distribute.Strategy`. This can be as simple as:
+
+    ```
+    with my_strategy.scope():
+      iterator = my_strategy.make_dataset_iterator(dataset)
+      session.run(iterator.initialize())
+      replica_train_ops = my_strategy.extended.call_for_each_replica(
+          replica_fn, args=(iterator.get_next(),))
+      train_op = my_strategy.group(replica_train_ops)
+    ```
+
+    This takes an ordinary `dataset` and `replica_fn` and runs it
+    distributed using a particular `tf.distribute.Strategy` in
+    `my_strategy`. Any variables created in `replica_fn` are created
+    using `my_strategy`'s policy, and library functions called by
+    `replica_fn` can use the `get_replica_context()` API to get enhanced
+    behavior in this case.
+
+  * If you want to write a distributed algorithm, you may use any of
+    the `tf.distribute.Strategy` APIs inside a
+    `with my_strategy.scope():` block of code.
+
+  Lower-level concepts:
+
+  * Wrapped values: In order to represent values parallel across devices
+    (either replicas or the devices associated with a particular value), we
+    wrap them in a "PerReplica" or "Mirrored" object that contains a map
+    from device to values. "PerReplica" is used when the value may be
+    different across replicas, and "Mirrored" when the value are the same.
+  * Unwrapping and merging: Consider calling a function `fn` on multiple
+    replicas, like `extended.call_for_each_replica(fn, args=[w])` with an
+    argument `w` that is a wrapped value. This means `w` will have a map taking
+    replica device `d0` to `w0`, replica device `d1` to `w1`,
+    etc. `extended.call_for_each_replica()` unwraps `w` before calling `fn`, so
+    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges the return
+    values from `fn()`, which can possibly result in wrapped values. For
+    example, let's say `fn()` returns a tuple with three components: `(x, a,
+    v0)` from replica 0, `(x, b, v1)` on replica 1, etc. If the first component
+    is the same object `x` from every replica, then the first component of the
+    merged result will also be `x`. If the second component is different (`a`,
+    `b`, ...)  from each replica, then the merged value will have a wrapped map
+    from replica device to the different values. If the third component is the
+    members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to `v1`, etc.),
+    then the merged result will be that mirrored variable (`v`).
+  * Replica context vs. Cross-replica context: _replica context_ is when we
+    are in some function that is being called once for each replica.
+    Otherwise we are in cross-replica context, which is useful for
+    calling `tf.distribute.Strategy` methods which operate across the
+    replicas (like `reduce_to()`). By default you start in a replica context
+    (the default "single replica context") and then some methods can
+    switch you back and forth, as described below.
+  * Worker devices vs. parameter devices: Most replica computations will
+    happen on worker devices. Since we don't yet support model
+    parallelism, there will be one worker device per replica. When using
+    parameter servers (see above), the set of devices holding
+    variables may be different, otherwise the parameter devices might
+    match the worker devices.
+  * Non-slot devices are some subset of the parameter devices where we
+    put all the non-slot variables. We need to ensure that all
+    non-slot variables are allocated on the same device, or mirrored
+    across the same set of devices. If you have some variable you want
+    to colocate all the non-slot variables with, you can use
+    `colocate_vars_with()` to get the remaining non-slot variables on
+    the same device.  Otherwise you can use `non_slot_devices()` to
+    pick a consistent set of devices to pass to both
+    `colocate_vars_with()` and `update_non_slot()`.
+
+  When using a `tf.distribute.Strategy`, we have a new type dimension
+  called _locality_ that says what values are compatible with which
+  APIs:
+
+  * T: different value for each replica (e.g. a PerReplica-wrapped value).
+  * M: value is "mirrored" across replicas, i.e. there are copies with the
+    same value on each replica (e.g. a Mirrored-wrapped value).
+  * V(`v`): value is "mirrored" across all the devices which have a
+    copy of variable `v` (also a Mirrored-wrapped value, but over
+    parameter devices instead of worker devices).
+  * N: value is "mirrored" across all the "non-slot" devices
+
+  Rules for methods with respect to locality and single-replica vs.
+  cross-replica context:
+
+  * `with d.scope()`: default single-replica context -> cross-replica context
+    for `d`
+  * `with d.extended.colocate_vars_with(v)`: in replica/cross-replica context,
+    variables will be created with locality V(`v`). That is, if we write
+    `with d.extended.colocate_vars_with(v1): v2 = tf.get_variable(...)`,
+    then `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
+    V(`v1`).
+  * `with d.extended.colocate_vars_with(d.extended.non_slot_devices(...))`: in
+    replica/cross-replica context, variables will be created with locality N
+  * `v = tf.get_variable(...)`: in replica/cross-replica context, creates
+    a variable (which by definition will have locality V(`v`), though
+    will match another locality if inside a `colocate_vars_with`
+    scope).
+  * `d.make_dataset_iterator(dataset)` (or the deprecated
+    `d.distribute_dataset(dataset).make_one_shot_iterator()`): in cross-replica
+    context, produces an iterator with locality T
+  * `d.extended.broadcast_to(t)`: in cross-replica context, produces a value
+    with locality M
+  * `d.extended.broadcast_to(t, v)`: in cross-replica context, produces a value
+    with locality V(`v`)
+  * `d.extended.call_for_each_replica(fn, ...)`: in cross-replica context, runs
+    `fn()` in a replica context (and so may call `get_replica_context()` and
+    use its API, including `merge_call()` to get back to cross-replica
+    context), once for each replica. May use values with locality T or
+    M, and any variable.
+  * `d.extended.reduce_to(m, t, t)`: in cross-replica context, accepts t with
+    locality T and produces a value with locality M.
+  * `d.extended.reduce_to(m, t, v)`: in cross-replica context, accepts t with
+    locality T and produces a value with locality V(`v`).
+  * `d.extended.batch_reduce_to(m, [(t, v)]): see `d.extended.reduce_to()`
+  * `d.extended.update(v, fn, ...)`: in cross-replica context, runs `fn()` once
+    for each device `v` is copied to, all inputs should have locality
+    V(`v`), output will have locality V(`v`) as well.
+  * `d.extended.update_non_slot(d.extended.non_slot_devices(), fn)`: in
+    cross-replica context, like `d.extended.update()` except with locality N.
+  * `d.extended.read_var(v)`: Gets the (read-only) value of the variable `v` (on
+    the device determined by the current device scope), aggregating
+    across replicas for replica-local variables. Frequently, this will be
+    done automatically when using `v` in an expression or fetching it in
+    a cross-replica context, but this function can be used to force that
+    conversion happens at a particular point in time (for example, to
+    add the result of the conversion to a graph collection).
+
+  The standard pattern for updating variables is to:
+
+  1. Create an input iterator with `d.make_dataset_iterator()`.
+  2. Define each replica `d.extended.call_for_each_replica()` up to the point of
+     getting a list of gradient, variable pairs.
+  3. Call `d.extended.reduce_to(VariableAggregation.SUM, t, v)` or
+     `d.extended.batch_reduce_to()` to sum the gradients (with locality T)
+     into values with locality V(`v`).
+  4. Call `d.extended.update(v)` for each variable to update its value.
+
+  Steps 3 and 4 are done automatically by class `Optimizer` if you call
+  its `apply_gradients` method in a replica context. Otherwise you can
+  manually call its `_distributed_apply` method in a cross-replica context.
+
+  Another thing you might want to do in the middle of your replica function is
+  an all-reduce of some intermediate value, using `d.extended.reduce_to()` or
+  `d.extended.batch_reduce_to()`. You simply provide the same tensor as the
+  input and destination.
+
+  Layers should expect to be called in a replica context, and can use
+  the `tf.distribute.get_replica_context` function to get a
+  `tf.distribute.ReplicaContext` object. The
+  `ReplicaContext` object has a `merge_call()` method for entering
+  cross-replica context where you can use `reduce_to()` (or
+  `batch_reduce_to()`) and then optionally `update()` to update state.
+
+  You may use this API whether or not a `tf.distribute.Strategy` is
+  being used, since there is a default implementation of
+  `ReplicaContext` and `tf.distribute.Strategy`.
+
+  NOTE for new `tf.distribute.Strategy` implementations: Please put all logic
+  in a subclass of `tf.distribute.StrategyExtended`. The only code needed for
+  the `tf.distribute.Strategy` subclass is for instantiating your subclass of
+  `tf.distribute.StrategyExtended` in the `__init__` method.
+  """
+
+  def __init__(self, container_strategy):
+    self._container_strategy_weakref = weakref.ref(container_strategy)
+    self._default_device = None
+    # This property is used to determine if we should set drop_remainder=True
+    # when creating Datasets from numpy array inputs.
+    self._require_static_shapes = False
+
+  def _container_strategy(self):
+    """Get the containing `DistributionStrategy`.
+
+    This should not generally be needed except when creating a new
+    `ReplicaContext` and to validate that the caller is in the correct
+    `scope()`.
+
+    Returns:
+      The `DistributionStrategy` such that `strategy.extended` is `self`.
+    """
+    container_strategy = self._container_strategy_weakref()
+    assert container_strategy is not None
+    return container_strategy
+
+  def _scope(self, strategy):
+    """Implementation of DistributionStrategy.scope()."""
+    if distribution_strategy_context.has_distribution_strategy():
+      _require_cross_replica_context_extended(self)
+      return _SameScopeAgainContext(strategy)
+
+    def creator_with_resource_vars(*args, **kwargs):
+      _require_distribution_strategy_scope_extended(self)
+      kwargs["use_resource"] = True
+      return self._create_variable(*args, **kwargs)
+
+    def distributed_getter(getter, *args, **kwargs):
+      if not self._allow_variable_partition():
+        if kwargs.pop("partitioner", None) is not None:
+          tf_logging.log_first_n(
+              tf_logging.WARN, "Partitioned variables are disabled when using "
+              "current tf.distribute.Strategy.", 1)
+      return getter(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        strategy,
+        variable_scope.variable_creator_scope(creator_with_resource_vars),
+        variable_scope.variable_scope(
+            variable_scope.get_variable_scope(),
+            custom_getter=distributed_getter), self._default_device)
+
+  def _allow_variable_partition(self):
+    return False
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    # Note: should support "colocate_with" argument.
+    raise NotImplementedError("must be implemented in descendants")
+
+  def read_var(self, v):
+    """Reads the value of a variable.
+
+    Returns the aggregate value of a replica-local variable, or the
+    (read-only) value of any other variable.
+
+    Args:
+      v: A variable allocated within the scope of this `tf.distribute.Strategy`.
+
+    Returns:
+      A tensor representing the value of `v`, aggregated across replicas if
+      necessary.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Scope that controls which devices variables will be created on.
+
+    No operations should be added to the graph inside this scope, it
+    should only be used when creating variables (some implementations
+    work by changing variable creation, others work by using a
+    tf.colocate_with() scope).
+
+    This may only be used inside `self.scope()`.
+
+    Example usage:
+
+    ```
+    with strategy.scope():
+      var1 = tf.get_variable(...)
+      with strategy.extended.colocate_vars_with(v1):
+        # var2 and var3 will be created on the same device(s) as var1
+        var2 = tf.get_variable(...)
+        var3 = tf.get_variable(...)
+
+      def fn(v1, v2, v3):
+        # operates on v1 from var1, v2 from var2, and v3 from var3
+
+      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
+      strategy.extended.update(v1, fn, args=(v2, v3))
+    ```
+
+    Args:
+      colocate_with_variable: A created in `self.scope()`. Variables created
+        while in the returned context manager will be on the same set of
+        devices as `colocate_with_variable`.
+
+    Returns:
+      A context manager.
+    """
+    def create_colocated_variable(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope_extended(self)
+      kwargs["use_resource"] = True
+      kwargs["colocate_with"] = colocate_with_variable
+      return next_creator(*args, **kwargs)
+
+    _require_distribution_strategy_scope_extended(self)
+    return variable_scope.variable_creator_scope(create_colocated_variable)
+
+  def _call_dataset_fn(self, dataset_fn):
+    """Call the `dataset_fn` with `input_context` as argument."""
+    result = dataset_fn()
+    if not isinstance(result, dataset_ops.DatasetV2):
+      raise ValueError(
+          "dataset_fn() must return a tf.data.Dataset when using a "
+          "tf.distribute.Strategy.")
+    return result
+
+  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
+  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
+  # Extend to implement more functionality of datasets.
+  def _distribute_dataset(self, dataset_fn):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _make_dataset_iterator(self, dataset):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _make_input_fn_iterator(self, input_fn, replication_mode):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def broadcast_to(self, tensor, destinations):
+    """Mirror a tensor on one device to all worker devices.
+
+    Args:
+      tensor: A Tensor value to broadcast.
+      destinations: A mirrored variable or device string specifying the
+        destination devices to copy `tensor` to.
+
+    Returns:
+      A value mirrored to `destinations` devices.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+    assert not isinstance(destinations, (list, tuple))
+    return self._broadcast_to(tensor, destinations)
+
+  def _broadcast_to(self, tensor, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _initialize(self):
+    return []
+
+  def _finalize(self):
+    return []
+
+  def experimental_run_steps_on_iterator(self, fn, iterator, iterations=1,
+                                         initial_loop_values=None):
+    """Run `fn` with input from `iterator` for `iterations` times.
+
+    This method can be used to run a step function for training a number of
+    times using input from a dataset.
+
+    Args:
+      fn: function to run using this distribution strategy. The function must
+        have the following signature: `def fn(context, inputs)`.
+        `context` is an instance of `MultiStepContext` that will be passed when
+        `fn` is run. `context` can be used to specify the outputs to be returned
+        from `fn` by calling `context.set_last_step_output`. It can also be used
+        to capture non tensor outputs by `context.set_non_tensor_output`.
+        See `MultiStepContext` documentation for more information.
+        `inputs` will have same type/structure as `iterator.get_next()`.
+        Typically, `fn` will use `call_for_each_replica` method of the strategy
+        to distribute the computation over multiple replicas.
+      iterator: Iterator of a dataset that represents the input for `fn`. The
+        caller is responsible for initializing the iterator as needed.
+      iterations: (Optional) Number of iterations that `fn` should be run.
+        Defaults to 1.
+      initial_loop_values: (Optional) Initial values to be passed into the
+        loop that runs `fn`. Defaults to `None`. # TODO(priyag): Remove
+        initial_loop_values argument when we have a mechanism to infer the
+        outputs of `fn`.
+
+    Returns:
+      Returns the `MultiStepContext` object which has the following properties,
+      among other things:
+        - run_op: An op that runs `fn` `iterations` times.
+        - last_step_outputs: A dictionary containing tensors set using
+        `context.set_last_step_output`. Evaluating this returns the value of
+        the tensors after the last iteration.
+        - non_tensor_outputs: A dictionatry containing anything that was set by
+          `fn` by calling `context.set_non_tensor_output`.
+    """
+    _require_cross_replica_context_extended(self)
+    return self._experimental_run_steps_on_iterator(
+        fn, iterator, iterations, initial_loop_values)
+
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def call_for_each_replica(self, fn, args=(), kwargs=None):
+    """Run `fn` once per replica.
+
+    `fn` may call `tf.get_replica_context()` to access methods such as
+    `replica_id_in_sync_group` and `merge_call()`.
+
+    `merge_call()` is used to communicate between the replicas and
+    re-enter the cross-replica context. All replicas pause their execution
+    having encountered a `merge_call()` call. After that the
+    `merge_fn`-function is executed. Its results are then unwrapped and
+    given back to each replica call. After that execution resumes until
+    `fn` is complete or encounters another `merge_call()`.  Example:
+
+    ```python
+    # Called once in "cross-replica" context.
+    def merge_fn(distribution, three_plus_replica_id):
+      # sum the values across replicas
+      return sum(distribution.unwrap(three_plus_replica_id))
+
+    # Called once per replica in `distribution`, in a "replica" context.
+    def fn(three):
+      replica_ctx = tf.get_replica_context()
+      v = three + replica_ctx.replica_id_in_sync_group
+      # Computes the sum of the `v` values across all replicas.
+      s = replica_ctx.merge_call(merge_fn, args=(v,))
+      return s + v
+
+    with distribution.scope():
+      # in "cross-replica" context
+      ...
+      merged_results = distribution.call_for_each_replica(fn, args=[3])
+      # merged_results has the values from every replica execution of `fn`.
+      print(distribution.unwrap(merged_results))  # Prints a list
+    ```
+
+    Args:
+      fn: function to run (will be run once per replica).
+      args: Tuple or list with positional arguments for `fn`.
+      kwargs: Dict with keyword arguments for `fn`.
+
+    Returns:
+      Merged return value of `fn` across all replicas.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._call_for_each_replica(fn, args, kwargs)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _reduce(self, reduce_op, value):
+    # Default implementation until we have an implementation for each strategy.
+    return self._unwrap(self._reduce_to(
+        reduce_op, value, device_util.current() or "/device:CPU:0"))[0]
+
+  def reduce_to(self, reduce_op, value, destinations):
+    """Combine (via e.g. sum or mean) values across replicas.
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
+      value: A per-replica value with one value per replica.
+      destinations: A mirrored variable, a per-replica tensor, or a device
+        string. The return value will be copied to all destination devices (or
+        all the devices where the `destinations` value resides). To perform an
+        all-reduction, pass `value` to `destinations`.
+
+    Returns:
+      A value mirrored to `destinations`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+    assert not isinstance(destinations, (list, tuple))
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in (
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      )
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    assert (reduce_op == reduce_util.ReduceOp.SUM or
+            reduce_op == reduce_util.ReduceOp.MEAN)
+    return self._reduce_to(reduce_op, value, destinations)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def batch_reduce_to(self, reduce_op, value_destination_pairs):
+    """Combine multiple `reduce_to` calls into one for faster execution.
+
+    Args:
+      reduce_op: Reduction type, an instance of `tf.distribute.ReduceOp` enum.
+        DEPRECATED but still accepted values:
+        `tf.VariableAggregation.SUM`,
+        `tf.VariableAggregation.MEAN`,
+      value_destination_pairs: A sequence of (value, destinations)
+        pairs. See `reduce_to()` for a description.
+
+    Returns:
+      A list of mirrored values, one per pair in `value_destination_pairs`.
+    """
+    # TODO(josh11b): More docstring
+    _require_cross_replica_context_extended(self)
+
+    # TODO(priyag): Remove this when all callers have been updated.
+    if isinstance(reduce_op, variable_scope.VariableAggregation):
+      assert reduce_op in [
+          variable_scope.VariableAggregation.SUM,
+          variable_scope.VariableAggregation.MEAN,
+      ]
+      reduce_op = reduce_util.ReduceOp.from_variable_aggregation(reduce_op)
+    return self._batch_reduce_to(reduce_op, value_destination_pairs)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    return [
+        self.reduce_to(reduce_op, t, destinations=v)
+        for t, v in value_destination_pairs
+    ]
+
+  def update(self, var, fn, args=(), kwargs=None, group=True):
+    """Run `fn` to update `var` using inputs mirrored to the same devices.
+
+    If `var` is mirrored across multiple devices, then this implements
+    logic like:
+
+    ```
+    results = {}
+    for device, v in var:
+      with tf.device(device):
+        # args and kwargs will be unwrapped if they are mirrored.
+        results[device] = fn(v, *args, **kwargs)
+    return merged(results)
+    ```
+
+    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
+
+    Neither `args` nor `kwargs` may contain per-replica values.
+    If they contain mirrored values, they will be unwrapped before
+    calling `fn`.
+
+    Args:
+      var: Variable, possibly mirrored to multiple devices, to operate on.
+      fn: Function to call. Should take the variable as the first argument.
+      args: Tuple or list. Additional positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
+
+    Returns:
+      By default, the merged return value of `fn` across all replicas.  The
+      merged result has dependencies to make sure that if it is evaluated at
+      all, the side effects (updates) will happen on every replica. If instead
+      "group=False" is specified, this function will return a nest of lists
+      where each list has an element per replica, and the caller is responsible
+      for ensuring all elements are executed.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update(var, fn, args, kwargs, group)
+
+  def _update(self, var, fn, args, kwargs, group):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def update_non_slot(
+      self, colocate_with, fn, args=(), kwargs=None, group=True):
+    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
+
+    Args:
+      colocate_with: The return value of `non_slot_devices()`.
+      fn: Function to execute.
+      args: Tuple or list. Positional arguments to pass to `fn()`.
+      kwargs: Dict with keyword arguments to pass to `fn()`.
+      group: Boolean. Defaults to True. If False, the return value will be
+        unwrapped.
+
+    Returns:
+      Return value of `fn`, possibly merged across devices.
+    """
+    _require_cross_replica_context_extended(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._update_non_slot(colocate_with, fn, args, kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _unwrap(self, distributed_value):
+    raise NotImplementedError("must be implemented in descendants")
+
+  def value_container(self, value):
+    """Returns the container that this per-replica `value` belongs to.
+
+    Args:
+      value: A value returned by `call_for_each_replica()` or a variable
+        created in `scope()`.
+
+    Returns:
+      A container that `value` belongs to.
+      If value does not belong to any container (including the case of
+      container having been destroyed), returns the value itself.
+      `value in unwrap(value_container(value))` will always be true.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _group(self, value, name=None):
+    """Shortcut for `tf.group(distribution.unwrap(value))`."""
+    value = nest.flatten(self._unwrap(value))
+
+    if len(value) != 1 or name is not None:
+      return control_flow_ops.group(value, name=name)
+    # Special handling for the common case of one op.
+    v, = value
+    if hasattr(v, "op"):
+      v = v.op
+    return v
+
+  @property
+  def experimental_require_static_shapes(self):
+    return self._require_static_shapes
+
+  @property
+  def _num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def worker_devices(self):
+    """Returns the tuple of all devices used to for compute replica execution.
+    """
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def parameter_devices(self):
+    """Returns the tuple of all devices used to place variables."""
+    # TODO(josh11b): More docstring
+    raise NotImplementedError("must be implemented in descendants")
+
+  def non_slot_devices(self, var_list):
+    """Device(s) for non-slot variables.
+
+    Create variables on these devices in a
+    `with colocate_vars_with(non_slot_devices(...)):` block.
+    Update those using `update_non_slot()`.
+
+    Args:
+      var_list: The list of variables being optimized, needed with the
+        default `tf.distribute.Strategy`.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def experimental_between_graph(self):
+    """Whether the strategy uses between-graph replication or not.
+
+      This is expected to return a constant value that will not be changed
+      throughout its life cycle.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    """Configures the strategy class."""
+    del session_config, cluster_spec, task_type, task_id
+
+  def _update_config_proto(self, config_proto):
+    return copy.deepcopy(config_proto)
+
+  @property
+  def experimental_should_init(self):
+    """Whether initialization is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_checkpoint(self):
+    """Whether checkpointing is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+  @property
+  def should_save_summary(self):
+    """Whether saving summaries is needed."""
+    raise NotImplementedError("must be implemented in descendants")
+
+
+# A note about the difference between the context managers
+# `ReplicaContext` (defined here) and `_CurrentDistributionContext`
+# (defined above) used by `DistributionStrategy.scope()`:
+#
+# * a ReplicaContext is only present during a `call_for_each_replica()`
+#   call (except during a `merge_run` call) and in such a scope it
+#   will be returned by calls to `get_replica_context()`.  Implementers of new
+#   DistributionStrategy descendants will frequently also need to
+#   define a descendant of ReplicaContext, and are responsible for
+#   entering and exiting this context.
+#
+# * DistributionStrategy.scope() sets up a variable_creator scope that
+#   changes variable creation calls (e.g. to make mirrored
+#   variables). This is intended as an outer scope that users enter once
+#   around their model creation and graph definition. There is no
+#   anticipated need to define descendants of _CurrentDistributionContext.
+#   It sets the current DistributionStrategy for purposes of
+#   `get_strategy()` and `has_strategy()`
+#   and switches the thread mode to a "cross-replica context".
+@tf_export("distribute.ReplicaContext")
+class ReplicaContext(object):
+  """`tf.distribute.Strategy` API when in a replica context.
+
+  To be used inside your replicated step function, such as in a
+  `tf.distribute.StrategyExtended.call_for_each_replica` call.
+  """
+
+  def __init__(self, strategy, replica_id_in_sync_group):
+    self._distribution_strategy = strategy
+    self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
+        self)
+    self._replica_id_in_sync_group = replica_id_in_sync_group
+
+  def __enter__(self):
+    _push_per_thread_mode(self._thread_context)
+
+  def __exit__(self, exception_type, exception_value, traceback):
+    _pop_per_thread_mode()
+
+  def merge_call(self, merge_fn, args=(), kwargs=None):
+    """Merge args across replicas and run `merge_fn` in a cross-replica context.
+
+    This allows communication and coordination when there are multiple calls
+    to a model function triggered by a call to
+    `strategy.extended.call_for_each_replica(model_fn, ...)`.
+
+    See `tf.distribute.StrategyExtended.call_for_each_replica` for an
+    explanation.
+
+    If not inside a distributed scope, this is equivalent to:
+
+    ```
+    strategy = tf.distribute.get_strategy()
+    with cross-replica-context(strategy):
+      return merge_fn(strategy, *args, **kwargs)
+    ```
+
+    Args:
+      merge_fn: function that joins arguments from threads that are given as
+        PerReplica. It accepts `tf.distribute.Strategy` object as
+        the first argument.
+      args: List or tuple with positional per-thread arguments for `merge_fn`.
+      kwargs: Dict with keyword per-thread arguments for `merge_fn`.
+
+    Returns:
+      The return value of `merge_fn`, except for `PerReplica` values which are
+      unpacked.
+    """
+    require_replica_context(self)
+    if kwargs is None:
+      kwargs = {}
+    return self._merge_call(merge_fn, args, kwargs)
+
+  def _merge_call(self, merge_fn, args, kwargs):
+    """Default implementation for single replica."""
+    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
+        distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
+            self._distribution_strategy))
+    try:
+      return merge_fn(self._distribution_strategy, *args, **kwargs)
+    finally:
+      _pop_per_thread_mode()
+
+  @property
+  def num_replicas_in_sync(self):
+    """Returns number of replicas over which gradients are aggregated."""
+    return self._distribution_strategy.num_replicas_in_sync
+
+  @property
+  def replica_id_in_sync_group(self):
+    """Which replica is being defined, from 0 to `num_replicas_in_sync - 1`."""
+    require_replica_context(self)
+    return self._replica_id_in_sync_group
+
+  @property
+  @doc_controls.do_not_generate_docs  # DEPRECATED, use `strategy`
+  def distribution_strategy(self):
+    """DEPRECATED: use `self.stratgey` instead."""
+    return self._distribution_strategy
+
+  @property
+  def strategy(self):
+    """The current `tf.distribute.Strategy` object."""
+    return self._distribution_strategy
+
+  @property
+  def devices(self):
+    """The devices this replica is to be executed on, as a tuple of strings."""
+    require_replica_context(self)
+    return (device_util.current(),)
+
+  # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
+  # all-reduce. It would return a function returning the result of reducing `t`
+  # across all replicas. The caller would wait to call this function until they
+  # needed the reduce result, allowing an efficient implementation:
+  # * With eager execution, the reduction could be performed asynchronously
+  #   in the background, not blocking until the result was needed.
+  # * When constructing a graph, it could batch up all reduction requests up
+  #   to that point that the first result is needed. Most likely this can be
+  #   implemented in terms of `merge_call()` and `batch_reduce_to()`.
+
+# ------------------------------------------------------------------------------
+
+
+class _DefaultDistributionStrategy(DistributionStrategy):
+  """Default `tf.distribute.Strategy` if none is explicitly selected."""
+
+  def __init__(self):
+    super(_DefaultDistributionStrategy, self).__init__(
+        _DefaultDistributionExtended(self))
+
+
+class _DefaultDistributionExtended(DistributionStrategyExtended):
+  """Implementation of _DefaultDistributionStrategy."""
+
+  def _scope(self, strategy):
+    """Context manager setting a variable creator and `self` as current."""
+    if distribution_strategy_context.has_distribution_strategy():
+      raise RuntimeError("Must not nest tf.distribute.Strategy scopes.")
+
+    def creator(next_creator, *args, **kwargs):
+      _require_distribution_strategy_scope_strategy(strategy)
+      return next_creator(*args, **kwargs)
+
+    return _CurrentDistributionContext(
+        strategy, variable_scope.variable_creator_scope(creator))
+
+  def colocate_vars_with(self, colocate_with_variable):
+    """Does not require `self.scope`."""
+    _require_distribution_strategy_scope_extended(self)
+    return ops.colocate_with(colocate_with_variable)
+
+  def _distribute_dataset(self, dataset_fn):
+    return self._call_dataset_fn(dataset_fn)
+
+  def _make_dataset_iterator(self, dataset):
+    return _DefaultDistributionExtended.DefaultInputIterator(dataset)
+
+  def _make_input_fn_iterator(self,
+                              input_fn,
+                              replication_mode=InputReplicationMode.PER_WORKER):
+    return input_fn(InputContext()).make_initializable_iterator()
+
+  def _broadcast_to(self, tensor, destinations):
+    if destinations is None:
+      return tensor
+    else:
+      raise NotImplementedError("TODO")
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    with ReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
+      return fn(*args, **kwargs)
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    # TODO(josh11b): Use destinations?
+    del reduce_op, destinations
+    return value
+
+  def _update(self, var, fn, args, kwargs, group):
+    # The implementations of _update() and _update_non_slot() are identical
+    # except _update() passes `var` as the first argument to `fn()`.
+    return self._update_non_slot(var, fn, (var,) + tuple(args), kwargs, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, should_group):
+    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
+    # once that value is used for something.
+    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
+      result = fn(*args, **kwargs)
+      if should_group:
+        return result
+      else:
+        return nest.map_structure(self._unwrap, result)
+
+  def read_var(self, replica_local_var):
+    return array_ops.identity(replica_local_var)
+
+  def _unwrap(self, distributed_value):
+    return (distributed_value,)
+
+  def value_container(self, value):
+    return value
+
+  @property
+  def _num_replicas_in_sync(self):
+    return 1
+
+  @property
+  def worker_devices(self):
+    raise RuntimeError("worker_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
+
+  @property
+  def parameter_devices(self):
+    raise RuntimeError("parameter_devices() method unsupported by default "
+                       "tf.distribute.Strategy.")
+
+  def non_slot_devices(self, var_list):
+    return min(var_list, key=lambda x: x.name)
+
+  # TODO(priyag): This should inherit from `InputIterator`, once dependency
+  # issues have been resolved.
+  class DefaultInputIterator(object):
+    """Default implementation of `InputIterator` for default strategy."""
+
+    def __init__(self, dataset):
+      self._dataset = dataset
+      if eager_context.executing_eagerly():
+        self._iterator = dataset.make_one_shot_iterator()
+      else:
+        self._iterator = dataset.make_initializable_iterator()
+
+    def get_next(self):
+      return self._iterator.get_next()
+
+    def initialize(self):
+      if eager_context.executing_eagerly():
+        self._iterator = self._dataset.make_one_shot_iterator()
+        return []
+      else:
+        return [self._iterator.initializer]
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+
+# ------------------------------------------------------------------------------
+# We haven't yet implemented deserialization for DistributedVariables.
+# So here we catch any attempts to deserialize variables
+# when using distribution strategies.
+# pylint: disable=protected-access
+_original_from_proto = resource_variable_ops._from_proto_fn
+
+
+def _from_proto_fn(v, import_scope=None):
+  if distribution_strategy_context.has_distribution_strategy():
+    raise NotImplementedError(
+        "Deserialization of variables is not yet supported when using a "
+        "tf.distribute.Strategy.")
+  else:
+    return _original_from_proto(v, import_scope=import_scope)
+
+resource_variable_ops._from_proto_fn = _from_proto_fn
+# pylint: enable=protected-access
+
+
+#-------------------------------------------------------------------------------
+# Shorthand for some methods from distribution_strategy_context.
+_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
+_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
+_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
diff --git a/tensorflow/python/training/distribute_test.py b/tensorflow/python/distribute/distribute_lib_test.py
similarity index 75%
rename from tensorflow/python/training/distribute_test.py
rename to tensorflow/python/distribute/distribute_lib_test.py
index 0a7bbd5687047d9549fd9832a8f1a3576a547295..d63d1fe3c323ac1e98afee52cf544c7c7da5fc65 100644
--- a/tensorflow/python/training/distribute_test.py
+++ b/tensorflow/python/distribute/distribute_lib_test.py
@@ -18,13 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import test
-from tensorflow.python.training import distribute
-from tensorflow.python.training import distribution_strategy_context
 
 
-class _TestReplicaContext(distribute.ReplicaContext):
+class _TestReplicaContext(distribute_lib.ReplicaContext):
 
   def merge_call(self, fn, *args, **kwargs):
     return kwargs["test_arg"]
@@ -38,10 +40,18 @@ def _get_test_variable(name, synchronization, aggregation):
   }
 
 
-class _TestStrategy(distribute.DistributionStrategy):
+class _TestStrategy(distribute_lib.DistributionStrategy):
+
+  def __init__(self):
+    super(_TestStrategy, self).__init__(_TestExtended(self))
+
+
+class _TestExtended(distribute_lib.DistributionStrategyExtended):
 
   def _call_for_each_replica(self, fn, args, kwargs):
-    with _TestReplicaContext(self, replica_id=0):
+    with _TestReplicaContext(
+        self._container_strategy(),
+        replica_id_in_sync_group=constant_op.constant(0, dtypes.int32)):
       return fn(*args, **kwargs)
 
   def _create_variable(self, next_creator, *args, **kwargs):
@@ -53,6 +63,7 @@ def _assert_in_default_state(t):
   t.assertIs(distribution_strategy_context._get_default_replica_context(),
              distribution_strategy_context.get_replica_context())
   t.assertIs(None, distribution_strategy_context.get_cross_replica_context())
+  t.assertFalse(distribution_strategy_context.in_cross_replica_context())
   t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
              distribution_strategy_context.get_distribution_strategy())
   t.assertFalse(distribution_strategy_context.has_distribution_strategy())
@@ -69,6 +80,7 @@ class TestStrategyTest(test.TestCase):
       self.assertTrue(replica_context is not None)
       self.assertIs(None,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertFalse(distribution_strategy_context.in_cross_replica_context())
       self.assertTrue(distribution_strategy_context.has_distribution_strategy())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
@@ -80,9 +92,9 @@ class TestStrategyTest(test.TestCase):
                            variable_scope.variable(1.0, name="bar"))
 
     with self.assertRaises(RuntimeError):
-      dist.call_for_each_replica(run_fn)
+      dist.extended.call_for_each_replica(run_fn)
     with dist.scope():
-      dist.call_for_each_replica(run_fn)
+      dist.extended.call_for_each_replica(run_fn)
     _assert_in_default_state(self)
 
   def testScope(self):
@@ -92,6 +104,7 @@ class TestStrategyTest(test.TestCase):
       self.assertIs(None, distribution_strategy_context.get_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
       self.assertTrue(distribution_strategy_context.has_distribution_strategy())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
@@ -131,6 +144,7 @@ class DefaultDistributionStrategyTest(test.TestCase):
       self.assertIs(None, distribution_strategy_context.get_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_cross_replica_context())
+      self.assertTrue(distribution_strategy_context.in_cross_replica_context())
       self.assertIs(dist,
                     distribution_strategy_context.get_distribution_strategy())
       self.assertFalse(
@@ -140,9 +154,26 @@ class DefaultDistributionStrategyTest(test.TestCase):
     replica_ctx = distribution_strategy_context.get_replica_context()
     self.assertIs(distribution_strategy_context._get_default_replica_context(),
                   replica_ctx)
-    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, "bar"))
+    self.assertEqual("foo_bar", replica_ctx.merge_call(merge_fn, args=("bar",)))
     _assert_in_default_state(self)
 
 
+class InputContextTest(test.TestCase):
+
+  def testProperties(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(6, input_context.num_replicas_in_sync)
+    self.assertEqual(1, input_context.input_pipeline_id)
+    self.assertEqual(2, input_context.num_input_pipelines)
+
+  def testPerReplicaBatchSize(self):
+    input_context = distribute_lib.InputContext(
+        num_input_pipelines=2, input_pipeline_id=1, num_replicas_in_sync=6)
+    self.assertEqual(2, input_context.get_per_replica_batch_size(12))
+    with self.assertRaises(ValueError):
+      input_context.get_per_replica_batch_size(13)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/distribution_strategy_context.py b/tensorflow/python/distribute/distribution_strategy_context.py
new file mode 100644
index 0000000000000000000000000000000000000000..78e096e286727664830f18ac0236c3626c5733d9
--- /dev/null
+++ b/tensorflow/python/distribute/distribution_strategy_context.py
@@ -0,0 +1,236 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utility to get distribution strategy related contexts."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util.lazy_loader import LazyLoader
+from tensorflow.python.util.tf_export import tf_export
+
+
+# There is a circular dependency between this and `distribute` module. So we
+# load it lazily to workaround this.
+distribute_lib = LazyLoader(
+    "distribute_lib", globals(),
+    "tensorflow.python.distribute.distribute_lib")
+
+# ------------------------------------------------------------------------------
+# Internal API for setting the current thread mode as being either in a
+# replica or cross-replica context for a particular distribution strategy.
+
+
+class _ThreadMode(object):
+
+  def __init__(self, dist, cross, replica):
+    self.distribution_strategy = dist
+    self.cross_replica_context = cross
+    self.replica_context = replica
+
+
+class _CrossReplicaThreadMode(_ThreadMode):
+
+  def __init__(self, distribution_strategy):
+    _ThreadMode.__init__(
+        self, distribution_strategy, distribution_strategy, None)
+
+
+class _InReplicaThreadMode(_ThreadMode):
+
+  def __init__(self, replica_ctx):
+    _ThreadMode.__init__(
+        self, replica_ctx.distribution_strategy, None, replica_ctx)
+
+
+def _push_per_thread_mode(context):
+  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
+
+
+def _pop_per_thread_mode():
+  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
+
+
+class _DefaultReplicaThreadMode(_ThreadMode):
+  """Type of default value returned by `_get_per_thread_mode()`.
+
+  Used when the thread-local stack is empty.
+  """
+
+  def __init__(self):
+    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
+                         _get_default_replica_context())
+
+
+def _get_per_thread_mode():
+  try:
+    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
+  except (AttributeError, IndexError):
+    return _get_default_replica_mode()
+
+
+# ------------------------------------------------------------------------------
+# Public API for accessing the current thread mode
+
+
+@tf_export("distribute.get_replica_context")
+def get_replica_context():
+  """Returns the current `tf.distribute.ReplicaContext` or `None`.
+
+  Returns `None` if in a cross-replica context.
+
+  Note that execution:
+
+  1. starts in the default (single-replica) replica context (this function
+     will return the default `ReplicaContext` object);
+  2. switches to cross-replica context (in which case this will return
+     `None`) when entering a `with tf.distribute.Strategy.scope():` block;
+  3. switches to a (non-default) replica context inside
+     `extended.call_for_each_replica(fn, ...)`;
+  4. if `fn` calls `get_replica_context().merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-replica context (and again
+     this function will return `None`).
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-replica context for the default `tf.distribute.Strategy`. You may
+  also switch from the cross-replica context of 4 to a replica context by
+  calling `extended.call_for_each_replica()`, jumping back to step 3.
+
+  Most `tf.distribute.Strategy` methods may only be executed in
+  a cross-replica context, in a replica context you should use the
+  `ReplicaContext` API instead.
+
+  Returns:
+    The current `ReplicaContext` object when in a replica context scope,
+    else `None`.
+
+    Within a particular block, exactly one of these two things will be true:
+
+    * `get_replica_context()` returns non-`None`, or
+    * `tf.distribute.is_cross_replica_context()` returns True.
+  """
+  return _get_per_thread_mode().replica_context
+
+
+def get_cross_replica_context():
+  """Returns the current tf.distribute.Strategy if in a cross-replica context.
+
+  DEPRECATED: Please use `in_cross_replica_context()` and
+  `get_distribution_strategy()` instead.
+
+  Note that execution:
+
+  1. starts in the default (single-replica) replica context;
+  2. switches to cross-replica context when entering a
+     `with tf.distribute.Strategy.scope():` block;
+  3. switches to a (non-default) replica context inside
+     `call_for_each_replica(fn, ...)`;
+  4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
+     inside `merge_fn` you are back in the cross-replica context.
+
+  Note that you can also go directly from step 1 to 4 to switch to a
+  cross-replica context for the default `tf.distribute.Strategy`. You may
+  also switch from the cross-replica context of 4 to a replica context by
+  calling `call_for_each_replica()`, jumping back to step 3.
+
+  Most `tf.distribute.Strategy` methods may only be executed in
+  a cross-replica context.
+
+  Returns:
+    Returns the current `tf.distribute.Strategy` object in a cross-replica
+    context, or `None`.
+
+    Exactly one of `get_replica_context()` and `get_cross_replica_context()`
+    will return `None` in a particular block.
+  """
+  return _get_per_thread_mode().cross_replica_context
+
+
+@tf_export("distribute.in_cross_replica_context")
+def in_cross_replica_context():
+  """Returns True if in a cross-replica context.
+
+  See `tf.distribute.get_replica_context` for details.
+
+  Returns:
+    True if in a cross-replica context (`get_replica_context()` returns
+    `None`), or False if in a replica context (`get_replica_context()` returns
+    non-`None`).
+  """
+  return _get_per_thread_mode().cross_replica_context is not None
+
+
+@tf_export("distribute.get_strategy")
+def get_distribution_strategy():
+  """Returns the current `tf.distribute.Strategy` object.
+
+  Typically only used in a cross-replica context:
+
+  ```
+  if tf.distribute.in_cross_replica_context():
+    strategy = tf.distribute.get_strategy()
+    ...
+  ```
+
+  Returns:
+    A `tf.distribute.Strategy` object. Inside a
+    `with distribution_strategy.scope()` block, it returns
+    `distribution_strategy`, otherwise it returns the default
+    (single-replica) `tf.distribute.Strategy` object.
+  """
+  return _get_per_thread_mode().distribution_strategy
+
+
+@tf_export("distribute.has_strategy")
+def has_distribution_strategy():
+  """Return if there is a current non-default `tf.distribute.Strategy`.
+
+  Returns:
+    True if inside a `with strategy.scope():`.
+  """
+  return get_distribution_strategy() is not _get_default_distribution_strategy()
+
+
+# ------------------------------------------------------------------------------
+# Defaults that are used when no distribution strategy is explicitly created.
+# We create them lazily in a function so that we can workaround the circular
+# dependency on distribute_lib. See lazy loader at the top of this file.
+
+_defaults = {
+    "distribution_strategy": None,
+    "replica_context": None,
+    "replica_mode": None
+}
+
+
+def _get_default_distribution_strategy():
+  if _defaults["distribution_strategy"] is None:
+    _defaults["distribution_strategy"] = (
+        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
+  return _defaults["distribution_strategy"]
+
+
+def _get_default_replica_context():
+  if _defaults["replica_context"] is None:
+    _defaults["replica_context"] = distribute_lib.ReplicaContext(
+        _get_default_distribution_strategy(), replica_id_in_sync_group=0)
+  return _defaults["replica_context"]
+
+
+def _get_default_replica_mode():
+  if _defaults["replica_mode"] is None:
+    _defaults["replica_mode"] = _DefaultReplicaThreadMode()
+  return _defaults["replica_mode"]
diff --git a/tensorflow/python/distribute/estimator_training.py b/tensorflow/python/distribute/estimator_training.py
index 227b00fb3e566b9d0adc9a8def9b1785a7128854..7d5f231c37da41f10f945adc468f40ffd0ecc743 100644
--- a/tensorflow/python/distribute/estimator_training.py
+++ b/tensorflow/python/distribute/estimator_training.py
@@ -308,7 +308,7 @@ def estimator_train(estimator, train_distributed_fn, hooks):
     raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
                      '`estimator.train`')
 
-  if estimator._config._train_distribute.between_graph:
+  if estimator._config._train_distribute.extended.experimental_between_graph:
     # TODO(yuefengz): remove this limitation once we figure out how to merge
     # return values from `_worker_fn`s.
     raise ValueError('`Estimator.train` API is not supported for %s with '
@@ -354,9 +354,9 @@ def estimator_evaluate(estimator, evaluate_distributed_fn, hooks):
   if (estimator._config._distribute_coordinator_mode !=
       dc.CoordinatorMode.STANDALONE_CLIENT):
     raise ValueError('Only `STANDALONE_CLIENT` mode is supported when you call '
-                     '`Estimator.train`')
+                     '`Estimator.evaluate`')
 
-  if estimator._config._eval_distribute.between_graph:
+  if estimator._config._eval_distribute.extended.experimental_between_graph:
     # TODO(yuefengz): remove this limitation once we figure out how to merge
     # return values from `_worker_fn`s.
     raise ValueError('`Estimator.evaluate` API is not supported for %s with '
diff --git a/tensorflow/contrib/distribute/python/input_ops.py b/tensorflow/python/distribute/input_ops.py
similarity index 89%
rename from tensorflow/contrib/distribute/python/input_ops.py
rename to tensorflow/python/distribute/input_ops.py
index ac1ccd64b3267645cbe10fdc02892fd4abd61df1..2ded209701e74afe45fc96d66fab65b3ae250596 100644
--- a/tensorflow/contrib/distribute/python/input_ops.py
+++ b/tensorflow/python/distribute/input_ops.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.data.experimental.ops import filter_for_shard_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
 from tensorflow.python.data.util import nest
@@ -41,7 +42,8 @@ def auto_shard_dataset(dataset, num_shards, index):
     dataset: A `tf.data.Dataset` instance, typically the result of a bunch of
       dataset transformations.
     num_shards: A `tf.int64` scalar `tf.Tensor`, representing the number of
-        shards operating in parallel. Same usage as in `Dataset.shard`.
+        shards operating in parallel. Same usage as in
+        `tf.data.experimental.filter_for_shard`.
     index: A `tf.int64` scalar `tf.Tensor`, representing the worker index.
       Same usage as in `Dataset.shard`.
 
@@ -74,13 +76,15 @@ def auto_shard_dataset(dataset, num_shards, index):
         # constructor. Eventually we will change all cases to clone datasets
         # instead of updating in-place.
         return dataset._clone(
-            filenames=dataset._filenames.shard(num_shards, index))
+            filenames=dataset._filenames.apply(
+                filter_for_shard_ops.filter_for_shard(num_shards, index)))
       elif isinstance(dataset, dataset_ops.RangeDataset):
-        return dataset.shard(num_shards, index)
+        return dataset.apply(
+            filter_for_shard_ops.filter_for_shard(num_shards, index))
       elif hasattr(dataset, "_map_func"):
         # TODO(priyag): Make this check more robust by enforcing some common
         # property on all map/flatmap/interleave datasets.
-        map_func_def = dataset._map_func.definition
+        map_func_def = dataset._map_func.function.definition
         for node in map_func_def.node_def:
           if node.op in _READER_DATASET_OPS:
             found_reader_op = True
@@ -102,6 +106,11 @@ def auto_shard_dataset(dataset, num_shards, index):
               dataset._input_dataset, found_reader_op)
           return dataset
 
+    if isinstance(dataset, dataset_ops.DatasetV1Adapter):
+      dataset._dataset = _auto_shard_impl(
+          dataset._dataset, found_reader_op)
+      return dataset
+
     # TODO(priyag): Make _input_dataset(s) a common property of all datasets to
     # make this check more robust.
     if hasattr(dataset, "_input_dataset"):
@@ -137,6 +146,7 @@ def auto_shard_dataset(dataset, num_shards, index):
     # TODO(priyag): This will shard the filenames before any shuffling of the
     # filename dataset. It might be desirable to shard after shuffling
     # filenames? If so, how do we achieve that?
-    return dataset.shard(num_shards, index)
+    return dataset.apply(
+        filter_for_shard_ops.filter_for_shard(num_shards, index))
 
   return _auto_shard_impl(dataset=dataset, found_reader_op=False)
diff --git a/tensorflow/contrib/distribute/python/input_ops_test.py b/tensorflow/python/distribute/input_ops_test.py
similarity index 89%
rename from tensorflow/contrib/distribute/python/input_ops_test.py
rename to tensorflow/python/distribute/input_ops_test.py
index 559de97bb1f93f990ddaf775d9203d5a2d46aa99..dcf946ba477635cda5ee3299abf163a2bb9e5bff 100644
--- a/tensorflow/contrib/distribute/python/input_ops_test.py
+++ b/tensorflow/python/distribute/input_ops_test.py
@@ -20,10 +20,11 @@ from __future__ import print_function
 
 import os
 
-from tensorflow.contrib.distribute.python import input_ops
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import readers
+from tensorflow.python.distribute import input_ops
 from tensorflow.python.framework import errors
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import python_io
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -92,10 +93,11 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(record_fn(r, f), sess.run(next_element))
+          self.assertAllEqual(record_fn(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testTFRecordDataset(self):
     dataset = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset = input_ops.auto_shard_dataset(
@@ -103,6 +105,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
@@ -112,6 +115,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testInterleave(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createTFRecordFiles())
@@ -124,9 +128,10 @@ class AutoShardDatasetTest(test.TestCase):
     # contain records in order of files.
     self._verifySimpleShardingOutput(dataset, self._record)
 
+  @test_util.run_deprecated_v1
   def testListfiles(self):
     filenames = self._createTFRecordFiles()
-    file_pattern = filenames[0].rsplit("/", 1)[0] + "/tf_record.*.txt"
+    file_pattern = filenames[0].rsplit(os.sep, 1)[0] + "/tf_record.*.txt"
     dataset = dataset_ops.Dataset.list_files(file_pattern, shuffle=False)
     dataset = dataset.flat_map(readers.TFRecordDataset)
     dataset = input_ops.auto_shard_dataset(
@@ -138,12 +143,13 @@ class AutoShardDatasetTest(test.TestCase):
       actual, expected = [], []
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          actual.append(sess.run(next_element))
+          actual.append(self.evaluate(next_element))
           expected.append(self._record(r, f))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
       self.assertAllEqual(expected, actual)
 
+  @test_util.run_deprecated_v1
   def testComplexPipeline(self):
     # Setup a complex input pipeline.
     batch_size = 2
@@ -171,9 +177,9 @@ class AutoShardDatasetTest(test.TestCase):
       num_iterations = (self._num_files * self._num_records * num_epochs) // (
           self._num_shards * batch_size)
       for _ in range(num_iterations):
-        actual.extend(sess.run(next_element))
+        actual.extend(self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
       expected = []
       for f in range(0, self._num_files, self._num_shards):
@@ -183,6 +189,7 @@ class AutoShardDatasetTest(test.TestCase):
 
       self.assertAllEqual(sorted(expected), sorted(actual))
 
+  @test_util.run_deprecated_v1
   def testZip(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
@@ -193,6 +200,7 @@ class AutoShardDatasetTest(test.TestCase):
     record_fn = lambda r, f: (self._record(r, f), self._text_line(r, f))
     self._verifySimpleShardingOutput(dataset, record_fn)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     dataset1 = readers.TFRecordDataset(self._createTFRecordFiles())
     dataset2 = readers.TextLineDataset(self._createTextFiles())
@@ -205,13 +213,15 @@ class AutoShardDatasetTest(test.TestCase):
     with self.cached_session() as sess:
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._record(r, f), sess.run(next_element))
+          self.assertAllEqual(self._record(r, f), self.evaluate(next_element))
       for f in range(self._shard_index, self._num_files, self._num_shards):
         for r in range(self._num_records):
-          self.assertAllEqual(self._text_line(r, f), sess.run(next_element))
+          self.assertAllEqual(
+              self._text_line(r, f), self.evaluate(next_element))
       with self.assertRaises(errors.OutOfRangeError):
-        sess.run(next_element)
+        self.evaluate(next_element)
 
+  @test_util.run_deprecated_v1
   def testTextLineReader(self):
     dataset = readers.TextLineDataset(self._createTextFiles())
     dataset = input_ops.auto_shard_dataset(
@@ -219,6 +229,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
+  @test_util.run_deprecated_v1
   def testTextLineReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(self._createTextFiles())
     dataset = dataset.flat_map(readers.TextLineDataset)
@@ -227,6 +238,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._text_line)
 
+  @test_util.run_deprecated_v1
   def testFixedLengthReader(self):
     dataset = readers.FixedLengthRecordDataset(
         self._createFixedLengthRecordFiles(), self._record_bytes)
@@ -235,6 +247,7 @@ class AutoShardDatasetTest(test.TestCase):
 
     self._verifySimpleShardingOutput(dataset, self._fixed_length_record)
 
+  @test_util.run_deprecated_v1
   def testFixedLengthReaderWithFlatMap(self):
     dataset = dataset_ops.Dataset.from_tensor_slices(
         self._createFixedLengthRecordFiles())
diff --git a/tensorflow/python/distribute/mirrored_strategy.py b/tensorflow/python/distribute/mirrored_strategy.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb94dfcfbd206eb81bbb76b36ded23a4f3bc2515
--- /dev/null
+++ b/tensorflow/python/distribute/mirrored_strategy.py
@@ -0,0 +1,919 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Class MirroredStrategy implementing DistributionStrategy."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import contextlib
+import copy
+import functools
+import threading
+
+from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import cross_device_ops as cross_device_ops_lib
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import multi_worker_util
+from tensorflow.python.distribute import reduce_util
+from tensorflow.python.distribute import shared_variable_creator
+from tensorflow.python.distribute import values
+from tensorflow.python.eager import context
+from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import device as tf_device
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.training import coordinator
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+# TODO(josh11b): Replace asserts in this file with if ...: raise ...
+
+
+@contextlib.contextmanager
+def _enter_graph(g):
+  if context.executing_eagerly():
+    with g.as_default(), context.eager_mode():
+      yield
+  else:
+    with g.as_default():
+      yield
+
+
+def _cpu_device(device):
+  cpu_device = tf_device.DeviceSpec.from_string(device)
+  cpu_device.merge_from(tf_device.DeviceSpec(device_type="CPU", device_index=0))
+  return cpu_device.to_string()
+
+
+class _RequestedStop(Exception):  # pylint: disable=g-bad-exception-name
+  pass
+
+
+# _call_for_each_replica and _reduce_non_distributed_value are not members of
+# MirroredStrategy so that they are generally not allowed to use anything
+# specific to MirroredStrategy and thus can be shared with other distribution
+# strategies.
+
+
+# TODO(yuefengz): maybe create a common class for those who need to call this
+# _call_for_each_replica.
+def _call_for_each_replica(distribution, fn, args, kwargs):
+  """Run `fn` in separate threads, once per replica/worker device.
+
+  Args:
+    distribution: the DistributionStrategy object.
+    fn: function to run (will be run once per device, each in its own thread).
+    args: positional arguments for `fn`
+    kwargs: keyword arguments for `fn`.
+
+  Returns:
+    Merged return value of `fn` across all replicas.
+
+  Raises:
+    RuntimeError: If fn() calls get_replica_context().merge_call() a different
+        number of times from the available devices.
+  """
+  # TODO(josh11b): Add this option once we add synchronization to variable
+  # creation. Until then, this is pretty unsafe to use.
+  run_concurrently = False
+  if not context.executing_eagerly():
+    # Needed for per-thread device, etc. contexts in graph mode.
+    ops.get_default_graph().switch_to_thread_local()
+
+  coord = coordinator.Coordinator(clean_stop_exception_types=(_RequestedStop,))
+
+  shared_variable_store = {}
+
+  # TODO(isaprykin): Create these threads once instead of during every run()
+  # call.
+  threads = []
+  for index, d in enumerate(distribution.extended.worker_devices):
+    variable_creator_fn = shared_variable_creator.make_fn(
+        shared_variable_store, index)
+    t = MirroredExtended._MirroredReplicaThread(  # pylint: disable=protected-access
+        distribution, coord, d, variable_creator_fn, fn,
+        *values.select_device(d, args), **values.select_device(d, kwargs))
+    threads.append(t)
+
+  for t in threads:
+    t.start()
+
+  # When `fn` starts `should_run` event is set on _MirroredReplicaThread
+  # (`MRT`) threads. The execution waits until
+  # `MRT.has_paused` is set, which indicates that either `fn` is
+  # complete or a `get_replica_context().merge_call()` is called.  If `fn` is
+  # complete, then `MRT.done` is set to True.  Otherwise, arguments
+  # of `get_replica_context().merge_call` from all paused threads are grouped
+  # and the `merge_fn` is performed.  Results of the
+  # `get_replica_context().merge_call` are then set to `MRT.merge_result`.
+  # Each such `get_replica_context().merge_call` call returns the
+  # `MRT.merge_result` for that thread when `MRT.should_run` event
+  # is reset again. Execution of `fn` resumes.
+
+  try:
+    with coord.stop_on_exception():
+      all_done = False
+      while not all_done and not coord.should_stop():
+        done = []
+        if run_concurrently:
+          for t in threads:
+            t.should_run.set()
+          for t in threads:
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        else:
+          for t in threads:
+            t.should_run.set()
+            t.has_paused.wait()
+            t.has_paused.clear()
+            if coord.should_stop():
+              return None
+            done.append(t.done)
+        if coord.should_stop():
+          return None
+        all_done = all(done)
+        if not all_done:
+          if any(done):
+            raise RuntimeError("Some replicas made a different number of "
+                               "replica_context().merge_call() calls.")
+          # get_replica_context().merge_call() case
+          merge_args = values.regroup({t.device: t.merge_args for t in threads})
+          merge_kwargs = values.regroup(
+              {t.device: t.merge_kwargs for t in threads})
+          # We capture the name_scope of the MRT when we call merge_fn
+          # to ensure that if we have opened a name scope in the MRT,
+          # it will be respected when executing the merge function. We only
+          # capture the name_scope from the first MRT and assume it is
+          # the same for all other MRTs.
+          mtt_captured_name_scope = threads[0].captured_name_scope
+          # Capture and merge the control dependencies from all the threads.
+          mtt_captured_control_deps = set()
+          for t in threads:
+            mtt_captured_control_deps.update(t.captured_control_deps)
+          with ops.name_scope(mtt_captured_name_scope),\
+              ops.control_dependencies(mtt_captured_control_deps):
+            merge_result = threads[0].merge_fn(distribution, *merge_args,
+                                               **merge_kwargs)
+          for t in threads:
+            t.merge_result = values.select_device(t.device, merge_result)
+  finally:
+    for t in threads:
+      t.should_run.set()
+    coord.join(threads)
+
+  return values.regroup({t.device: t.main_result for t in threads})
+
+
+def _reduce_non_distributed_value(extended, reduce_op, value, destinations):
+  """Reduce a non-DistributedValue `value` to `destinations`."""
+  if isinstance(value, values.DistributedValues):
+    raise ValueError("You are passing a `DistributedValue` to "
+                     "`_reduce_non_distributed_value`, which is not allowed.")
+
+  # If the same value is present on all replicas then the PerReplica value will
+  # be a single value. We also handle the case when `value` is a single value
+  # and equal to 0.
+  if value == 0:
+    return 0
+  # If there is only a single value and the reduce op is MEAN,
+  # that value should be on all destinations.
+  if reduce_op == reduce_util.ReduceOp.MEAN:
+    return value
+
+  cross_device_ops_lib.validate_destinations(destinations)
+  # We do not support a reduce op of SUM if the value is the same across
+  # all replicas. We call this as part of assign functions for MirroredVariables
+  # and summing up identical values across replicas is not clearly defined.
+  if (len(extended.worker_devices) != 1 or
+      not cross_device_ops_lib.check_destinations(destinations)):
+    raise ValueError("A non-DistributedValues value %s cannot be reduced with "
+                     "the given reduce op %s." % (value, reduce_op))
+  # TODO(anjalisridhar): Moves these methods to a device utility file?
+  devices = cross_device_ops_lib.get_devices_from(destinations)
+  if len(devices) == 1:
+    with ops.device(devices[0]):
+      return array_ops.identity(value)
+  else:
+    value_updates = {}
+    for d in devices:
+      with ops.device(d):
+        value_updates[d] = array_ops.identity(value)
+    return values.Mirrored(value_updates)
+
+
+def _create_mirrored_variable(devices, real_mirrored_creator, *args, **kwargs):  # pylint: disable=g-missing-docstring
+  # Figure out what collections this variable should be added to.
+  # We'll add the MirroredVariable to those collections instead.
+  collections = kwargs.pop("collections", None)
+  if collections is None:
+    collections = [ops.GraphKeys.GLOBAL_VARIABLES]
+  kwargs["collections"] = []
+
+  # Get synchronization value
+  synchronization = kwargs.get("synchronization",
+                               variable_scope.VariableSynchronization.ON_WRITE)
+  if synchronization == variable_scope.VariableSynchronization.NONE:
+    raise ValueError("`NONE` variable synchronization mode is not "
+                     "supported with `Mirrored` distribution strategy. Please"
+                     " change the `synchronization` for variable: " +
+                     kwargs["name"])
+  elif synchronization == variable_scope.VariableSynchronization.ON_READ:
+    # Variables that are to be synced on read are replica local.
+    is_replica_local = True
+    kwargs["trainable"] = False
+  elif (synchronization == variable_scope.VariableSynchronization.ON_WRITE or
+        synchronization == variable_scope.VariableSynchronization.AUTO):
+    # `AUTO` synchronization for `MirroredStrategy` is `ON_WRITE`.
+    is_replica_local = False
+  else:
+    raise ValueError("Invalid variable synchronization mode: " +
+                     synchronization + " for variable: " + kwargs["name"])
+
+  # Get aggregation value
+  aggregation = kwargs.pop("aggregation",
+                           variable_scope.VariableAggregation.NONE)
+  if aggregation not in (
+      variable_scope.VariableAggregation.NONE,
+      variable_scope.VariableAggregation.SUM,
+      variable_scope.VariableAggregation.MEAN,
+      variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
+  ):
+    raise ValueError("Invalid variable aggregation mode: " + aggregation +
+                     " for variable: " + kwargs["name"])
+
+  # Ignore user-specified caching device, not needed for mirrored variables.
+  kwargs.pop("caching_device", None)
+
+  # TODO(josh11b,apassos): It would be better if variable initialization
+  # was never recorded on the tape instead of having to do this manually
+  # here.
+  with tape.stop_recording():
+    index = real_mirrored_creator(devices, *args, **kwargs)
+
+    if is_replica_local:
+      result = values.ReplicaLocalVariable(
+          index, index[devices[0]], aggregation)
+    else:
+      result = values.MirroredVariable(index, index[devices[0]], aggregation)
+
+  # Add the wrapped variable to the requested collections.
+  # The handling of eager mode and the global step matches
+  # ResourceVariable._init_from_args().
+  if not context.executing_eagerly():
+    g = ops.get_default_graph()
+    # If "trainable" is True, next_creator() will add the member variables
+    # to the TRAINABLE_VARIABLES collection, so we manually remove
+    # them and replace with the MirroredVariable. We can't set
+    # "trainable" to False for next_creator() since that causes functions
+    # like implicit_gradients to skip those variables.
+    if kwargs.get("trainable", True):
+      collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
+      l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
+      for v in index.values():
+        if v in l:
+          l.remove(v)
+    g.add_to_collections(collections, result)
+  elif ops.GraphKeys.GLOBAL_STEP in collections:
+    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, result)
+
+  return result
+
+
+def _is_device_list_local(devices):
+  """Checks whether the devices list is for local or multi-worker.
+
+  Args:
+    devices: a list of device strings, either local for remote devices.
+
+  Returns:
+    a boolean indicating whether these device strings are for local or for
+    remote.
+
+  Raises:
+    ValueError: if device strings are not consistent.
+  """
+  all_local = None
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+    is_local = d_spec.job in (None, "localhost")
+
+    if all_local is None:  # Determine all_local from first device.
+      all_local = is_local
+
+    if all_local:
+      if not is_local:
+        raise ValueError("Local device string cannot have job specified other "
+                         "than 'localhost'")
+    else:
+      if is_local:
+        raise ValueError("Remote device string must have job specified.")
+      if d_spec.task is None:
+        raise ValueError("Remote device string must have task specified.")
+  return all_local
+
+
+def _cluster_spec_to_device_list(cluster_spec, num_gpus_per_worker):
+  """Returns a device list given a cluster spec."""
+  cluster_spec = multi_worker_util.normalize_cluster_spec(cluster_spec)
+  devices = []
+  for task_type in ("chief", "worker"):
+    for task_id in range(len(cluster_spec.as_dict().get(task_type, []))):
+      if num_gpus_per_worker is 0:
+        devices.append("/job:%s/task:%d" % (task_type, task_id))
+      else:
+        devices.extend([
+            "/job:%s/task:%d/device:GPU:%i" % (task_type, task_id, gpu_id)
+            for gpu_id in range(num_gpus_per_worker)
+        ])
+  return devices
+
+
+def _group_device_list(devices):
+  """Groups the devices list by task_type and task_id.
+
+  Args:
+    devices: a list of device strings for remote devices.
+
+  Returns:
+    a dict of list of device strings mapping from task_type to a list of devices
+    for the task_type in the asceding order of task_id.
+  """
+  assert not _is_device_list_local(devices)
+  device_dict = {}
+
+  for d in devices:
+    d_spec = tf_device.DeviceSpec().parse_from_string(d)
+
+    # Create an entry for the task_type.
+    if d_spec.job not in device_dict:
+      device_dict[d_spec.job] = []
+
+    # Fill the device list for task_type until it covers the task_id.
+    while len(device_dict[d_spec.job]) <= d_spec.task:
+      device_dict[d_spec.job].append([])
+
+    device_dict[d_spec.job][d_spec.task].append(d)
+
+  return device_dict
+
+
+def _infer_num_gpus_per_worker(devices):
+  """Infers the number of GPUs on each worker.
+
+  Currently to make multi-worker cross device ops work, we need all workers to
+  have the same number of GPUs.
+
+  Args:
+    devices: a list of device strings, can be either local devices or remote
+      devices.
+
+  Returns:
+    number of GPUs per worker.
+
+  Raises:
+    ValueError if workers have different number of GPUs or GPU indices are not
+    consecutive and starting from 0.
+  """
+  if _is_device_list_local(devices):
+    return len([d for d in devices if "GPU" in d.upper()])
+  else:
+    device_dict = _group_device_list(devices)
+    num_gpus = None
+    for _, devices_in_task in device_dict.items():
+      for device_in_task in devices_in_task:
+        if num_gpus is None:
+          num_gpus = len([d for d in device_in_task if "GPU" in d.upper()])
+
+        # Verify other workers have the same number of GPUs.
+        elif (
+            num_gpus != len([d for d in device_in_task if "GPU" in d.upper()])):
+          raise ValueError("All workers should have the same number of GPUs.")
+
+        for d in device_in_task:
+          d_spec = tf_device.DeviceSpec().parse_from_string(d)
+          if (d_spec.device_type.upper() == "GPU" and
+              d_spec.device_index >= num_gpus):
+            raise ValueError("Device_index on a worker should be consecutive "
+                             "and start from 0.")
+    return num_gpus
+
+
+def all_local_devices(num_gpus=None):
+  if num_gpus is None:
+    num_gpus = context.num_gpus()
+  return (tuple("/device:GPU:%d" % i for i in range(num_gpus)) or
+          ("/device:CPU:0",))
+
+
+@tf_export("distribute.MirroredStrategy")
+class MirroredStrategy(distribute_lib.DistributionStrategy):
+  """Mirrors vars to distribute across multiple devices and machines.
+
+  This strategy uses one replica per device and sync replication for its
+  multi-GPU version.
+
+  The multi-worker version will be added in the fture.
+
+  Args:
+    devices: a list of device strings.
+    cross_device_ops: optional, a descedant of `CrossDeviceOps`. If this is not
+      set, nccl will be use by default.
+  """
+
+  def __init__(self, devices=None, cross_device_ops=None):
+    extended = MirroredExtended(
+        self, devices=devices, cross_device_ops=cross_device_ops)
+    super(MirroredStrategy, self).__init__(extended)
+
+
+class MirroredExtended(distribute_lib.DistributionStrategyExtended):
+  """Implementation of MirroredStrategy."""
+
+  def __init__(self, container_strategy, devices=None, cross_device_ops=None):
+    super(MirroredExtended, self).__init__(container_strategy)
+    if devices is None:
+      devices = all_local_devices()
+    if not devices:
+      raise ValueError("Got an empty `devices` list. Please make sure the "
+                       "`devices` you pass in is not empty.")
+    self._cross_device_ops = cross_device_ops
+    self._initialize_strategy(devices)
+
+  def _initialize_strategy(self, devices):
+    # The _initialize_strategy method is intended to be used by distribute
+    # coordinator as well.
+    if _is_device_list_local(devices):
+      self._initialize_local(devices)
+    else:
+      self._initialize_multi_worker(devices)
+
+  def _initialize_local(self, devices):
+    """Initializes the object for local training."""
+    self._local_mode = True
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = tuple(device_util.resolve(d) for d in devices)
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+    self._inferred_cross_device_ops = cross_device_ops_lib.choose_the_best(
+        devices)
+
+  def _initialize_multi_worker(self, devices):
+    """Initializes the object for multi-worker training."""
+    self._local_mode = False
+
+    assert devices, "Must specify at least one device."
+    assert len(set(devices)) == len(devices), (
+        "No duplicates allowed in `devices` argument.")
+    # TODO(josh11b): Require at least 2 devices?
+    self._devices = tuple(device_util.resolve(d) for d in devices)
+    self._canonical_device_set = set(self._devices)
+    self._device_index = values.PerReplica(
+        {d: i for i, d in enumerate(devices)})
+
+    device_dict = _group_device_list(devices)
+    self._workers = []
+    self._worker_devices = []
+    for job in ["chief", "worker"]:
+      for task in range(len(device_dict.get(job, []))):
+        worker = "/job:%s/task:%d" % (job, task)
+        self._workers.append(worker)
+        self._worker_devices.append((worker, device_dict[job][task]))
+
+    # Setting `_default_device` will add a device scope in the
+    # distribution.scope. We set the default device to the first worker. When
+    # users specify device under distribution.scope by
+    #   with tf.device("/cpu:0"):
+    #     ...
+    # their ops will end up on the cpu device of its first worker, e.g.
+    # "/job:worker/task:0/device:CPU:0". Note this is not used in replica mode.
+    self._default_device = self._workers[0]
+
+    self._inferred_cross_device_ops = cross_device_ops_lib.MultiWorkerAllReduce(
+        self._workers, _infer_num_gpus_per_worker(self._devices))
+
+  def _create_variable(self, next_creator, *args, **kwargs):
+    """Create a mirrored variable. See `DistributionStrategy.scope`."""
+    colocate_with = kwargs.pop("colocate_with", None)
+    devices = self._get_devices_from(colocate_with)
+
+    def _real_mirrored_creator(devices, *args, **kwargs):  # pylint: disable=g-missing-docstring
+      index = {}
+      for i, d in enumerate(devices):
+        with ops.init_scope(), ops.device(d):
+          if i > 0:
+            # Give replicas meaningful distinct names:
+            var0name = index[devices[0]].name.split(":")[0]
+            # We append a / to variable names created on replicas with id > 0 to
+            # ensure that we ignore the name scope and instead use the given
+            # name as the absolute name of the variable.
+            kwargs["name"] = "%s/replica_%d/" % (var0name, i)
+            # Initialize replicas with the same value:
+            def initial_value_fn(device=d):
+              if context.executing_eagerly():
+                init_value = index[devices[0]].value()
+                return array_ops.identity(init_value)
+              else:
+                with ops.device(device):
+                  init_value = index[devices[0]].initial_value
+                  return array_ops.identity(init_value)
+            kwargs["initial_value"] = initial_value_fn
+          with context.context().device_policy(context.DEVICE_PLACEMENT_SILENT):
+            # Don't record operations (e.g. other variable reads) during
+            # variable creation.
+            with tape.stop_recording():
+              v = next_creator(*args, **kwargs)
+          assert not isinstance(v, values.DistributedVariable)
+          index[d] = v
+      return index
+
+    return _create_mirrored_variable(devices, _real_mirrored_creator, *args,
+                                     **kwargs)
+
+  def _distribute_dataset(self, dataset_fn):
+    if self._local_mode:
+      return values.PerReplicaDataset(
+          self._call_dataset_fn(dataset_fn), self._devices)
+    else:
+      return values.MultiWorkerDataset(
+          functools.partial(self._call_dataset_fn, dataset_fn),
+          self._worker_devices,
+          auto_shard=False)
+
+  def _make_dataset_iterator(self, dataset):
+    if self._local_mode:
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
+    else:
+      worker_device_pairs = self._worker_devices
+
+    return values.DatasetIterator(dataset, worker_device_pairs,
+                                  self._num_replicas_in_sync)
+
+  def _make_input_fn_iterator(
+      self,
+      input_fn,
+      replication_mode=distribute_lib.InputReplicationMode.PER_WORKER):
+    input_contexts = []
+    if self._local_mode:
+      num_workers = 1
+      worker = device_util.canonicalize("/device:CPU:0")
+      worker_device_pairs = [(worker, self._devices)]
+    else:
+      num_workers = len(self._worker_devices)
+      worker_device_pairs = self._worker_devices
+
+    for i in range(num_workers):
+      input_contexts.append(distribute_lib.InputContext(
+          num_input_pipelines=num_workers,
+          input_pipeline_id=i,
+          num_replicas_in_sync=self._num_replicas_in_sync))
+    return values.InputFunctionIterator(
+        input_fn, worker_device_pairs, input_contexts)
+
+  # TODO(priyag): Deal with OutOfRange errors once b/111349762 is fixed.
+  def _experimental_run_steps_on_iterator(self, fn, iterator, iterations,
+                                          initial_loop_values=None):
+    if initial_loop_values is None:
+      initial_loop_values = {}
+    initial_loop_values = nest.flatten(initial_loop_values)
+
+    ctx = values.MultiStepContext()
+    def body(i, *args):
+      """A wrapper around `fn` to create the while loop body."""
+      del args
+      fn_inputs = iterator.get_next()
+      if not isinstance(fn_inputs, tuple):
+        fn_inputs = (fn_inputs,)
+      fn_result = fn(ctx, fn_inputs)
+      for (name, output) in ctx.last_step_outputs.items():
+        # Convert all outputs to tensors, potentially from `DistributedValues`.
+        ctx.last_step_outputs[name] = self._unwrap(output)
+      flat_last_step_outputs = nest.flatten(ctx.last_step_outputs)
+      with ops.control_dependencies([fn_result]):
+        return [i + 1] + flat_last_step_outputs
+
+    # We capture the control_flow_context at this point, before we run `fn`
+    # inside a while_loop. This is useful in cases where we might need to exit
+    # these contexts and get back to the outer context to do some things, for
+    # e.g. create an op which should be evaluated only once at the end of the
+    # loop on the host. One such usage is in creating metrics' value op.
+    self._outer_control_flow_context = (
+        ops.get_default_graph()._get_control_flow_context())  # pylint: disable=protected-access
+
+    cond = lambda i, *args: i < iterations
+    i = constant_op.constant(0)
+    loop_result = control_flow_ops.while_loop(
+        cond, body, [i] + initial_loop_values, name="",
+        parallel_iterations=1, back_prop=False, swap_memory=False,
+        return_same_structure=True)
+    del self._outer_control_flow_context
+
+    ctx.run_op = control_flow_ops.group(loop_result)
+
+    # Convert the last_step_outputs from a list to the original dict structure
+    # of last_step_outputs.
+    last_step_tensor_outputs = loop_result[1:]
+    last_step_tensor_outputs_dict = nest.pack_sequence_as(
+        ctx.last_step_outputs, last_step_tensor_outputs)
+
+    for name, reduce_op in ctx._last_step_outputs_reduce_ops.items():  # pylint: disable=protected-access
+      output = last_step_tensor_outputs_dict[name]
+      # For outputs that have already been reduced, wrap them in a Mirrored
+      # container, else in a PerReplica container.
+      if reduce_op is None:
+        last_step_tensor_outputs_dict[name] = values.regroup(
+            {d: t for d, t in zip(self._devices, output)}, values.PerReplica)
+      else:
+        assert len(output) == 1
+        last_step_tensor_outputs_dict[name] = output[0]
+
+    ctx._set_last_step_outputs(last_step_tensor_outputs_dict)  # pylint: disable=protected-access
+    return ctx
+
+  def _broadcast_to(self, tensor, destinations):
+    # This is both a fast path for Python constants, and a way to delay
+    # converting Python values to a tensor until we know what type it
+    # should be converted to. Otherwise we have trouble with:
+    #   global_step.assign_add(1)
+    # since the `1` gets broadcast as an int32 but global_step is int64.
+    if isinstance(tensor, (float, int)):
+      return tensor
+    # TODO(josh11b): In eager mode, use one thread per device, or async mode.
+    return self._get_cross_device_ops().broadcast(
+        tensor, destinations or self._devices)
+
+  def _call_for_each_replica(self, fn, args, kwargs):
+    return _call_for_each_replica(self._container_strategy(), fn, args, kwargs)
+
+  def _configure(self,
+                 session_config=None,
+                 cluster_spec=None,
+                 task_type=None,
+                 task_id=None):
+    del task_type, task_id
+
+    if session_config:
+      session_config.CopyFrom(self._update_config_proto(session_config))
+
+    if cluster_spec:
+      # TODO(yuefengz): remove the following code once cluster_resolver is
+      # added.
+      num_gpus_per_worker = _infer_num_gpus_per_worker(self._devices)
+      multi_worker_devices = _cluster_spec_to_device_list(
+          cluster_spec, num_gpus_per_worker)
+      self._initialize_multi_worker(multi_worker_devices)
+
+  def _update_config_proto(self, config_proto):
+    updated_config = copy.deepcopy(config_proto)
+    updated_config.isolate_session_state = True
+    return updated_config
+
+  def _get_cross_device_ops(self):
+    return self._cross_device_ops or self._inferred_cross_device_ops
+
+  def _reduce_to(self, reduce_op, value, destinations):
+    if (isinstance(value, values.Mirrored) and
+        reduce_op == reduce_util.ReduceOp.MEAN):
+      return value
+    assert not isinstance(value, values.Mirrored)
+    if not isinstance(value, values.DistributedValues):
+      # This function handles reducing values that are not PerReplica or
+      # Mirrored values. For example, the same value could be present on all
+      # replicas in which case `value` would be a single value or value could
+      # be 0.
+      return _reduce_non_distributed_value(self, reduce_op, value,
+                                           destinations)
+    return self._get_cross_device_ops().reduce(
+        reduce_op, value, destinations=destinations)
+
+  def _batch_reduce_to(self, reduce_op, value_destination_pairs):
+    return self._get_cross_device_ops().batch_reduce(reduce_op,
+                                                     value_destination_pairs)
+
+  def _update(self, var, fn, args, kwargs, group):
+    # TODO(josh11b): In eager mode, use one thread per device.
+    assert isinstance(var, values.DistributedVariable)
+    updates = {}
+    for d, v in var._index.items():  # pylint: disable=protected-access
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        # If args and kwargs are not mirrored, the value is returned as is.
+        updates[d] = fn(v,
+                        *values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def _update_non_slot(self, colocate_with, fn, args, kwargs, group):
+    assert isinstance(colocate_with, tuple)
+    # TODO(josh11b): In eager mode, use one thread per device.
+    updates = {}
+    for d in colocate_with:
+      name = "update_%d" % self._device_index.get(d)
+      with ops.device(d), distribute_lib.UpdateContext(d), ops.name_scope(name):
+        updates[d] = fn(*values.select_device_mirrored(d, args),
+                        **values.select_device_mirrored(d, kwargs))
+    return values.update_regroup(self, updates, group)
+
+  def read_var(self, replica_local_var):
+    """Read the aggregate value of a replica-local variable."""
+    if isinstance(replica_local_var, values.ReplicaLocalVariable):
+      return replica_local_var._get_cross_replica()  # pylint: disable=protected-access
+    assert isinstance(replica_local_var, values.Mirrored)
+    return array_ops.identity(replica_local_var.get())
+
+  def _unwrap(self, val):
+    if isinstance(val, values.DistributedValues):
+      # Return in a deterministic order.
+      if set(val.devices) == self._canonical_device_set:
+        return tuple(val.get(device=d) for d in self._devices)
+      return tuple(val.get(device=d) for d in sorted(val.devices))
+    return (val,)
+
+  def value_container(self, val):
+    return values.value_container(val)
+
+  @property
+  def _num_replicas_in_sync(self):
+    return len(self._devices)
+
+  @property
+  def worker_devices(self):
+    return self._devices
+
+  @property
+  def parameter_devices(self):
+    return self._devices
+
+  @property
+  def experimental_between_graph(self):
+    return False
+
+  @property
+  def experimental_should_init(self):
+    return True
+
+  @property
+  def should_checkpoint(self):
+    return True
+
+  @property
+  def should_save_summary(self):
+    return True
+
+  def non_slot_devices(self, var_list):
+    del var_list
+    return tuple(self._devices)
+
+  def _get_devices_from(self, colocate_with=None):
+    if colocate_with is None:
+      return self._devices
+    else:
+      return cross_device_ops_lib.get_devices_from(colocate_with)
+
+  # TODO(priyag): Delete this once all strategies use global batch size.
+  @property
+  def _global_batch_size(self):
+    return True
+
+  class _MirroredReplicaThread(threading.Thread):
+    """A thread that runs() a function on a device."""
+
+    def __init__(self, dist, coord, device, variable_creator_fn, fn, *args,
+                 **kwargs):
+      super(MirroredExtended._MirroredReplicaThread, self).__init__()  # pylint: disable=protected-access
+      self.coord = coord
+      self.distribution = dist
+      self.device = device
+      self.replica_id = dist.extended.worker_devices.index(device)
+      self.variable_creator_fn = variable_creator_fn
+      # State needed to run and return the results of `fn`.
+      self.main_fn = fn
+      self.main_args = args
+      self.main_kwargs = kwargs
+      self.main_result = None
+      self.done = False
+      # State needed to run the next merge_call() (if any) requested via
+      # ReplicaContext.
+      self.merge_fn = None
+      self.merge_args = None
+      self.merge_kwargs = None
+      self.merge_result = None
+      self.captured_name_scope = None
+      # We use a thread.Event for the main thread to signal when this
+      # thread should start running (`should_run`), and another for
+      # this thread to transfer control back to the main thread
+      # (`has_paused`, either when it gets to a
+      # `get_replica_context().merge_call` or when `fn` returns). In
+      # either case the event starts cleared, is signaled by calling
+      # set(). The receiving thread waits for the signal by calling
+      # wait() and then immediately clearing the event using clear().
+      self.should_run = threading.Event()
+      self.has_paused = threading.Event()
+      # These fields have to do with inheriting various contexts from the
+      # parent thread:
+      # pylint: disable=protected-access
+      self.context_mode = context.context()._eager_context.mode
+      if not context.context()._context_handle:
+        context.context()._initialize_handle_and_devices()
+      self.context_device_policy = (
+          pywrap_tensorflow.TFE_ContextGetDevicePlacementPolicy(
+              context.context()._context_handle))
+      self.graph = ops.get_default_graph()
+      self._variable_creator_stack = self.graph._variable_creator_stack[:]
+      self._captured_var_scope = variable_scope.get_variable_scope()
+      # Adding a "/" at end lets us re-enter this scope later.
+      self._name_scope = self.graph.get_name_scope()
+      if self._name_scope:
+        self._name_scope += "/"
+      if self.replica_id > 0:
+        if not self._name_scope:
+          self._name_scope = ""
+        self._name_scope += "replica_%d/" % self.replica_id
+
+    def run(self):
+      # pylint: disable=protected-access
+      self.graph._variable_creator_stack = self._variable_creator_stack
+      self.should_run.wait()
+      self.should_run.clear()
+      try:
+        if self.coord.should_stop():
+          return
+        with self.coord.stop_on_exception(), \
+            context.context()._mode(self.context_mode), \
+            context.context().device_policy(self.context_device_policy), \
+            _enter_graph(self.graph), \
+            MirroredReplicaContext(self.distribution, constant_op.constant(
+                self.replica_id, dtypes.int32)), \
+            ops.device(self.device), \
+            ops.name_scope(self._name_scope), \
+            variable_scope.variable_scope(
+                self._captured_var_scope, reuse=self.replica_id > 0), \
+            variable_scope.variable_creator_scope(self.variable_creator_fn):
+          self.main_result = self.main_fn(*self.main_args, **self.main_kwargs)
+          self.done = True
+      finally:
+        self.has_paused.set()
+
+
+class MirroredReplicaContext(distribute_lib.ReplicaContext):
+  """ReplicaContext used in MirroredStrategy.call_for_each_replica().
+
+  Opened in `_MirroredReplicaThread`, to allow the user to invoke
+  `MirroredStrategy`'s specific implementation of `merge_call()`,
+  which works by delegating the function and its arguments to
+  the main thread (the one that invoked
+  `MirroredStrategy.call_for_each_replica()`).
+  """
+
+  def _merge_call(self, fn, args, kwargs):
+    """Delegate to the main thread to actually perform merge_call()."""
+    t = threading.current_thread()  # a _MirroredReplicaThread
+    t.merge_fn = fn
+    t.merge_args = args
+    t.merge_kwargs = kwargs
+    t.captured_name_scope = t.graph.get_name_scope()
+    # Adding a "/" at end lets us re-enter this scope later.
+    if t.captured_name_scope:
+      t.captured_name_scope += "/"
+
+    t.captured_control_deps = t.graph._current_control_dependencies()  # pylint: disable=protected-access
+    t.has_paused.set()
+    t.should_run.wait()
+    t.should_run.clear()
+    if t.coord.should_stop():
+      raise _RequestedStop()
+    return t.merge_result
+
+  @property
+  def devices(self):
+    distribute_lib.require_replica_context(self)
+    replica_id = tensor_util.constant_value(self._replica_id_in_sync_group)
+    return [self._distribution_strategy.extended.worker_devices[replica_id]]
diff --git a/tensorflow/python/distribute/multi_worker_util.py b/tensorflow/python/distribute/multi_worker_util.py
index 360733eff64606db2c4bde1a83351fb414ff2068..2986a6726a5bc2c837a554892f5aebd09da43c91 100644
--- a/tensorflow/python/distribute/multi_worker_util.py
+++ b/tensorflow/python/distribute/multi_worker_util.py
@@ -45,6 +45,33 @@ def normalize_cluster_spec(cluster_spec):
   return cluster_spec
 
 
+# TODO(yuefengz): add more validations.
+def _validate_cluster_spec(cluster_spec, task_type, task_id):
+  """Validates `cluster_spec`.
+
+  It checks
+  1) whether there is such a task type as `task_type` in the
+  `cluster_spec`.
+  2) whether there is at most one "chief" job.
+  3) whether the `task_id` is smaller than the number of `task_type`.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: task_id: the id of the `task_type` in this cluster.
+  Throws:
+    ValueError: if `cluster_spec` fails any check.
+  """
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+  if task_type and task_type not in cluster_spec:
+    raise ValueError("`task_type` %r not found in cluster_spec." % task_type)
+  if len(cluster_spec.get("chief", [])) > 1:
+    raise ValueError("There must be at most one 'chief' job.")
+  if task_id >= len(cluster_spec[task_type]):
+    raise ValueError(
+        "The `task_id` %d exceeds the maximum id of %s." % (task_id, task_type))
+
+
 def is_chief(cluster_spec, task_type, task_id):
   """Returns whether the given task is chief in the cluster.
 
@@ -61,20 +88,73 @@ def is_chief(cluster_spec, task_type, task_id):
     ValueError: if `task_type` is not in the `cluster_spec` or `task_id` exceeds
       the maximum id of the `task_type`.
   """
-  cluster_spec = normalize_cluster_spec(cluster_spec)
-  if task_type not in cluster_spec.jobs:
-    raise ValueError(
-        "The task_type \"%s\" is not in the `cluster_spec`." % task_type)
-  if task_id >= cluster_spec.num_tasks(task_type):
-    raise ValueError("The `task_id` %d exceeds the maximum id of %s." % (
-        task_id, task_type))
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
 
   if task_type == "chief":
     return True
 
   # If chief not in the cluster_spec, use the first worker as chief. This is
   # common in CollectiveAllReduceStrategy.
-  if ("chief" not in cluster_spec.jobs and task_type == "worker" and
-      task_id == 0):
+  if ("chief" not in cluster_spec and task_type == "worker" and task_id == 0):
     return True
   return False
+
+
+def worker_count(cluster_spec, task_type):
+  """Returns the number of workers in the cluster."""
+  _validate_cluster_spec(cluster_spec, task_type, task_id=0)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # Other jobs such as "ps" shouldn't call this function.
+  if task_type not in ["chief", "worker", "evaluator"]:
+    raise ValueError("Unexpected `task_type` %r" % task_type)
+
+  if task_type == "evaluator":
+    # The "evaluator" is in its own cluster or its own partition of a cluster.
+    # So we don't have to count "chief" or "worker" if the current task is an
+    # "evaluator".
+    return len(cluster_spec["evaluator"])
+  else:
+    # In the non-evaluator case, we return the total number of "chief" and
+    # "worker" tasks as the "chief" is also a worker.
+    return (len(cluster_spec.get("chief", [])) + len(
+        cluster_spec.get("worker", [])))
+
+
+def id_in_cluster(cluster_spec, task_type, task_id):
+  """Returns a unique id for the task in the `task_type`'s cluster.
+
+  It returns an id ranging from [0, `worker_count(task_type, task_id)`).
+
+  Note: this function assumes that "evaluate" job is in its own cluster or its
+  own partition of a cluster.
+
+  Args:
+    cluster_spec: a dict, `ClusterDef` or `ClusterSpec` object to be validated.
+    task_type: string indicating the type of the task.
+    task_id: the id of the `task_type` in this cluster.
+
+  Returns:
+    an int indicating the unique id.
+
+  Throws:
+    ValueError: if `task_type` is not "chief", "worker" or "evaluator".
+  """
+  _validate_cluster_spec(cluster_spec, task_type, task_id)
+  cluster_spec = normalize_cluster_spec(cluster_spec).as_dict()
+
+  # The "chief" job has always id 0 and there is at most one and "worker" jobs
+  # come after it.
+  if task_type == "chief":
+    return 0
+
+  if task_type == "worker":
+    return task_id + len(cluster_spec.get("chief", []))
+
+  # The "evaluator" is in its own cluster or its own partition of a cluster.
+  if task_type == "evaluator":
+    return task_id
+
+  # We currently don't assign ids to other tasks.
+  raise ValueError("There is no id for task_type %r" % task_type)
diff --git a/tensorflow/python/distribute/multi_worker_util_test.py b/tensorflow/python/distribute/multi_worker_util_test.py
index bdc49725c7751873bed665abd3b24b1722b00525..9e1596eefdf6ee83c3b31ef2ccbf1d0637a6027e 100644
--- a/tensorflow/python/distribute/multi_worker_util_test.py
+++ b/tensorflow/python/distribute/multi_worker_util_test.py
@@ -95,7 +95,7 @@ class IsChiefTest(test.TestCase):
     self.assertFalse(multi_worker_util.is_chief(cluster_spec, "worker", 1))
 
     with self.assertRaisesRegexp(
-        ValueError, "The task_type \"chief\" is not in the `cluster_spec`."):
+        ValueError, "`task_type` 'chief' not found in cluster_spec."):
       multi_worker_util.is_chief(cluster_spec, "chief", 0)
 
     with self.assertRaisesRegexp(
@@ -103,5 +103,94 @@ class IsChiefTest(test.TestCase):
       multi_worker_util.is_chief(cluster_spec, "worker", 2)
 
 
+class NumWorkersTest(test.TestCase):
+
+  def testCountWorker(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="chief"), 3)
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="worker"), 3)
+
+  def testCountEvaluator(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.worker_count(cluster_spec, task_type="evaluator"), 1)
+
+  def testTaskTypeNotFound(self):
+    cluster_spec = {}
+    with self.assertRaisesRegexp(
+        ValueError, "`task_type` 'worker' not found in cluster_spec."):
+      multi_worker_util.worker_count(cluster_spec, task_type="worker")
+
+  def testCountPs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    # A "ps" job shouldn't call this method.
+    with self.assertRaisesRegexp(ValueError, "Unexpected `task_type` 'ps'"):
+      multi_worker_util.worker_count(cluster_spec, task_type="ps")
+
+
+class IdInClusterTest(test.TestCase):
+
+  def testChiefId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "chief", 0), 0)
+
+  def testWorkerId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 2)
+
+    cluster_spec = {
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "ps": ["127.0.0.1:1926", "127.0.0.1:3141"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "worker", 1), 1)
+
+  def testEvaluatorId(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:1234"],
+        "worker": ["127.0.0.1:8964", "127.0.0.1:2333"],
+        "evaluator": ["127.0.0.1:7566"]
+    }
+    self.assertEqual(
+        multi_worker_util.id_in_cluster(cluster_spec, "evaluator", 0), 0)
+
+  def testPsId(self):
+    cluster_spec = {"chief": ["127.0.0.1:1234"], "ps": ["127.0.0.1:7566"]}
+    with self.assertRaisesRegexp(ValueError,
+                                 "There is no id for task_type 'ps'"):
+      multi_worker_util.id_in_cluster(cluster_spec, "ps", 0)
+
+  def testMultipleChiefs(self):
+    cluster_spec = {
+        "chief": ["127.0.0.1:8258", "127.0.0.1:7566"],
+    }
+    with self.assertRaisesRegexp(ValueError,
+                                 "There must be at most one 'chief' job."):
+      multi_worker_util.id_in_cluster(cluster_spec, "chief", 0)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/distribute/reduce_util.py b/tensorflow/python/distribute/reduce_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b2a4e9dba81e38e6bb3ea970e390628fe3cb540
--- /dev/null
+++ b/tensorflow/python/distribute/reduce_util.py
@@ -0,0 +1,53 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilites for reduce operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import enum
+
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export("distribute.ReduceOp")
+class ReduceOp(enum.Enum):
+  """Indicates how a set of values should be reduced.
+
+  * `SUM`: Add all the values.
+  * `MEAN`: Take the arithmetic mean ("average") of the values.
+
+  TODO(priyag): Add the following types:
+  * `MIN`: Return the minimum of all values.
+  * `MAX`: Return the maximum of all values.
+  """
+
+  SUM = "SUM"
+  MEAN = "MEAN"
+
+  @staticmethod
+  def from_variable_aggregation(aggregation):
+    mapping = {
+        variable_scope.VariableAggregation.SUM: ReduceOp.SUM,
+        variable_scope.VariableAggregation.MEAN: ReduceOp.MEAN,
+    }
+
+    reduce_op = mapping.get(aggregation)
+    if not reduce_op:
+      raise ValueError("Could not convert from `tf.VariableAggregation` %s to"
+                       "`tf.distribute.ReduceOp` type" % aggregation)
+    return reduce_op
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator.py b/tensorflow/python/distribute/shared_variable_creator.py
similarity index 100%
rename from tensorflow/contrib/distribute/python/shared_variable_creator.py
rename to tensorflow/python/distribute/shared_variable_creator.py
diff --git a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py b/tensorflow/python/distribute/shared_variable_creator_test.py
similarity index 97%
rename from tensorflow/contrib/distribute/python/shared_variable_creator_test.py
rename to tensorflow/python/distribute/shared_variable_creator_test.py
index 2a9ab51fcfd29a8ae5b37b5c513415af29b277dc..4ddc29f256761c2359f0a49415932b53eda066f4 100644
--- a/tensorflow/contrib/distribute/python/shared_variable_creator_test.py
+++ b/tensorflow/python/distribute/shared_variable_creator_test.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.contrib.distribute.python import shared_variable_creator
+from tensorflow.python.distribute import shared_variable_creator
 from tensorflow.python.eager import test
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variable_scope
diff --git a/tensorflow/contrib/distribute/python/values.py b/tensorflow/python/distribute/values.py
similarity index 77%
rename from tensorflow/contrib/distribute/python/values.py
rename to tensorflow/python/distribute/values.py
index 9b0aa9c99005d6a4134389fe618e71cc7f3472d9..01a1680a246b9beb34c4c5c1b6b3dfe6494c33f3 100644
--- a/tensorflow/contrib/distribute/python/values.py
+++ b/tensorflow/python/distribute/values.py
@@ -23,11 +23,18 @@ from __future__ import print_function
 
 import collections
 import contextlib
+import operator
 import weakref
 import six
 
-from tensorflow.contrib.distribute.python import input_ops
+from tensorflow.python.data.experimental.ops import batching
+from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import multi_device_iterator_ops
+from tensorflow.python.distribute import device_util
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import input_ops
+from tensorflow.python.distribute import reduce_util
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as tf_device
@@ -38,10 +45,6 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops import variables as variables_lib
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import saver
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -97,10 +100,21 @@ class DistributedValues(object):
   # DistributionStrategy implementations.
 
 
+# NOTE(josh11b,apassos): It would be great if we could inspect the values this was
+# initialized with and use that to generate the overloaded operators here.
+# Unfortunately, Python's rules for special methods don't allow this, see
+# https://docs.python.org/3/reference/datamodel.html#special-method-names
+# "if a class defines a method named __getitem__(), and x is an instance of
+# this class, then x[i] is roughly equivalent to type(x).__getitem__(x, i)."
+# In particular, these special methods don't go through __getattr__, and
+# it will only use those methods if they are defined in the class, not the
+# object.
 class DistributedDelegate(DistributedValues):
   """A map from device to values; acts as the same type as the values."""
 
   def __getattr__(self, name):
+    # TODO(priyag): This needs to be made robust against pitfalls from mix use
+    # __getattr__ and @property. See b/120402273.
     return getattr(self.get(), name)
 
   # pylint: disable=multiple-statements
@@ -316,6 +330,14 @@ class DistributedVariable(DistributedDelegate):
 ops.register_dense_tensor_like_type(DistributedVariable)
 
 
+def _apply_aggregation(strategy, value, aggregation, destinations):
+  if aggregation == vs.VariableAggregation.ONLY_FIRST_REPLICA:
+    return strategy.broadcast(strategy.unwrap(value)[0],
+                              destinations=destinations)
+  reduce_op = reduce_util.ReduceOp.from_variable_aggregation(aggregation)
+  return strategy.extended.reduce_to(reduce_op, value, destinations)
+
+
 class _MirroredSaveable(saver.BaseSaverBuilder.ResourceVariableSaveable):
   """Class for defining how to restore a MirroredVariable."""
 
@@ -373,14 +395,11 @@ class MirroredVariable(DistributedVariable, Mirrored,
                          "MirroredVariable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
@@ -614,14 +633,11 @@ class TPUMirroredVariable(checkpointable.CheckpointableBase):
                          "TPUMirroredVariable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   @contextlib.contextmanager
   def _handle_graph(self, handle):
@@ -1058,18 +1074,18 @@ def select_device_mirrored(device, structured):
   return nest.map_structure(_get_mirrored, structured)
 
 
-def update_regroup(strategy, updates, should_group):
+def update_regroup(extended, updates, group):
   """Regroup for an update, with dependencies to ensure all updates execute."""
   regrouped = regroup(updates, Mirrored)
-  if not should_group:
-    return nest.map_structure(strategy.unwrap, regrouped)
+  if not group:
+    return nest.map_structure(extended._unwrap, regrouped)  # pylint: disable=protected-access
   grouped_flat = []
   for u in nest.flatten(regrouped):
     if isinstance(u, DistributedValues):
-      g = strategy.group(u)
+      g = extended._group(u)  # pylint: disable=protected-access
       if u.is_tensor_like:
         # Make sure we run all updates. Without this, something like
-        # session.run(strategy.update(...)) may only update one replica.
+        # session.run(extended.update(...)) may only update one replica.
         index = {}
         for d in u.devices:
           with ops.device(d), ops.control_dependencies([g]):
@@ -1155,7 +1171,7 @@ class PerReplicaDataset(object):
     # Eager mode prefetching would error out in constructor. Only remaining
     # case is non-prefetching in eager mode. We delegate to
     # PerReplicaDataIterator to handle that case.
-    dataset_iterator = self._dataset.make_one_shot_iterator()
+    dataset_iterator = dataset_ops.make_one_shot_iterator(self._dataset)
     return PerReplicaDataIterator(
         dataset_iterator, self._devices, prefetch_on_device=False)
 
@@ -1170,7 +1186,7 @@ class PerReplicaDataset(object):
       dataset_iterator = multi_device_iterator_ops.MultiDeviceIterator(
           self._dataset, self._devices)
     else:
-      dataset_iterator = self._dataset.make_initializable_iterator()
+      dataset_iterator = dataset_ops.make_initializable_iterator(self._dataset)
     return PerReplicaDataIterator(
         dataset_iterator,
         self._devices,
@@ -1252,22 +1268,34 @@ class MultiWorkerDataset(object):
     """Initialize the MultiWorkerDataset object.
 
     Args:
-      dataset_fn: a function that returns a `tf.data.Dataset`.
+      dataset_fn: a function or a list of functions that returns a
+        `tf.data.Dataset`.
       worker_device_pairs: a list of (worker, list of devices on that worker)
-        pairs.
+        pairs; it must have same length with `dataset_fn` if `dataset_fn` is a
+        list.
       prefetch_on_device: whether to prefetch to devices.
       auto_shard: whether to auto-shard the dataset.
     """
+    if isinstance(dataset_fn, list):
+      if len(dataset_fn) != len(worker_device_pairs):
+        raise ValueError("If `dataset_fn` is a list, it must have same length "
+                         "as `worker_device_pairs`")
+      if auto_shard:
+        raise ValueError(
+            "If `dataset_fn` is a list, `auto_shard` is not supported.")
     self._worker_device_pairs = worker_device_pairs
     self._datasets = []
     # TODO(yuefengz, priyag): support different set of jobs for input
     # processing.
     for i, (worker, worker_devices) in enumerate(worker_device_pairs):
       with ops.device(worker):
-        worker_input = dataset_fn()
-        if auto_shard:
-          worker_input = input_ops.auto_shard_dataset(
-              worker_input, len(worker_device_pairs), i)
+        if isinstance(dataset_fn, list):
+          worker_input = dataset_fn[i]()
+        else:
+          worker_input = dataset_fn()
+          if auto_shard:
+            worker_input = input_ops.auto_shard_dataset(
+                worker_input, len(worker_device_pairs), i)
         dataset = PerReplicaDataset(
             worker_input, worker_devices, prefetch_on_device=prefetch_on_device)
         self._datasets.append((worker, dataset))
@@ -1276,46 +1304,337 @@ class MultiWorkerDataset(object):
     iterators = []
     for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators.append((worker, dataset.make_one_shot_iterator()))
+        iterators.append((worker, dataset_ops.make_one_shot_iterator(dataset)))
     return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
   def make_initializable_iterator(self):
     iterators = []
     for worker, dataset in self._datasets:
       with ops.device(worker):
-        iterators.append((worker, dataset.make_initializable_iterator()))
+        iterators.append(
+            (worker, dataset_ops.make_initializable_iterator(dataset)))
     return MultiWorkerDataIterator(iterators, self._worker_device_pairs)
 
 
-class MapOutput(object):
-  """Map can result in multiple outputs per device."""
+class InputIterator(object):
+  """An input iterator, intended to be passed to `DistributionStrategy.run`."""
 
-  def __init__(self, l):
-    self._l = l
+  def get_next(self):
+    """Returns the next inputs for all replicas."""
+    raise NotImplementedError("must be implemented in descendants")
 
-  def get(self):
-    return self._l
+  def initialize(self):
+    """Initialize the underlying input dataset, when applicable.
+
+    In eager mode, this will create a new iterator and return it.
+    In graph mode, this will initialize the same underlying iterator(s).
+
+    Users are required to call this if
+    - This iterator was returned from a call to `make_input_fn_iterator` with an
+      input function that returns a dataset.
+    - Or this iterator was returned from a call to `make_dataset_iterator`.
+
+    Returns:
+      A list of initialization ops to be executed.
+    """
+    raise NotImplementedError("must be implemented in descendants")
+
+
+class InputIteratorImpl(InputIterator):
+  """Common implementation for all input iterators."""
+
+  def __init__(self, worker_device_pairs, iterators):
+    if not worker_device_pairs:
+      raise ValueError("Should have at least one worker for input iterator.")
+
+    self._iterators = iterators
+    self._worker_device_pairs = worker_device_pairs
+    self._is_eager = context.executing_eagerly()
+
+  def get_next(self, name=None):
+    """Returns the next input from the iterator for all replicas."""
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
+
+    index = {}
+    for i, (worker, worker_devices) in enumerate(self._worker_device_pairs):
+      if name is not None:
+        d = tf_device.DeviceSpec.from_string(worker)
+        new_name = "%s_%s_%d" % (name, d.job, d.task)
+      else:
+        new_name = None
+      with ops.device(worker):
+        data_per_worker = self._iterators[i].get_next(new_name)
+
+      # Ungroup these per-replica value so as to get a flat map from devices to
+      # values.
+      for d in worker_devices:
+        v = select_device(d, data_per_worker)
+        if d in index:
+          raise ValueError("Duplicated devices in worker_device_pairs: %r" % v)
+        index[d] = v
+
+    return regroup(index)
+
+  def initialize(self):
+    """Initialze underlying iterators.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    assert self._is_eager == context.executing_eagerly(), (
+        "Iterator should be created and used in same execution mode.")
+
+    init_ops = []
+    for it in self._iterators:
+      init_ops.extend(it.initialize())
+    return init_ops
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_classes(self):
+    return self._iterators[0].output_classes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_shapes(self):
+    return self._iterators[0].output_shapes
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  @property
+  def output_types(self):
+    return self._iterators[0].output_types
+
+  # TODO(priyag): Remove when we switch to using `MultiDeviceIterator` for TPUs.
+  def get_iterator(self, worker):
+    for i, (w, _) in enumerate(self._worker_device_pairs):
+      if worker == w:
+        return self._iterators[i]
+    return None
+
+
+class InputFunctionIterator(InputIteratorImpl):
+  """Iterator created from input function."""
+
+  def __init__(self, input_fn, worker_device_pairs, input_contexts):
+    """Make an iterator for input provided via an input function.
+
+    Currently implements PER_WORKER mode, in which the `input_fn` is called
+    once on each worker.
+
+    TODO(priyag): Add other replication modes.
+    TODO(priyag): Allow taking input function that returns a callable that
+    returns nest of tensors.
+
+    Args:
+      input_fn: Input function that returns a `tf.data.Dataset` object.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      input_contexts: A list of `InputContext` instances to be passed to call(s)
+        to `input_fn`. Length and order should match worker order in
+        `worker_device_pairs`.
+    """
+    if len(worker_device_pairs) != len(input_contexts):
+      raise ValueError(
+          "Number of worker_device_pairs (%d) is not same as number of"
+          "input_contexts (%d)" % (
+              len(worker_device_pairs), len(input_contexts)))
+
+    iterators = []
+    for (worker, devices), ctx in zip(worker_device_pairs, input_contexts):
+      # TODO(priyag): We should probably explicitly specify CPU device on worker.
+      with ops.device(worker):
+        result = input_fn(ctx)
+        if not isinstance(result, dataset_ops.DatasetV2):
+          raise ValueError("input_fn must return a tf.data.Dataset.")
+        iterator = _SingleWorkerDatasetIterator(result, worker, devices)
+        iterators.append(iterator)
+
+    super(InputFunctionIterator, self).__init__(
+        worker_device_pairs, iterators)
+
+
+class DatasetIterator(InputIteratorImpl):
+  """Iterator created from input dataset."""
+
+  def __init__(self, dataset, worker_device_pairs, split_batch_by=None):
+    """Make an iterator for the dataset on given devices.
+
+    If `split_batch_by` is not None, we "split" each batch of the
+    dataset by `split_batch_by` value. To achieve this, we first unbatch the
+    input dataset and then rebatch it with the per replica batch size that is
+    calculated using `global_batch_size // split_batch_by`.
+    The currently supported datasets are as follows:
+    `dataset.batch()` is the last operation on the dataset OR
+    `dataset.apply(map_and_batch)` is the last operation on the dataset OR
+    `dataset.batch().prefetch()` are the last 2 operations on the dataset OR
+    `dataset.apply(map_and_batch).prefetch()` are the last 2 operations.
+
+    TODO(priyag): Support multi worker / host cases properly by cloning
+    and sharding the dataset on each worker. Current setup will only work in
+    some cases, such as in-graph multi worker GPU case. If the input pipeline
+    has random shuffling (with a different seed on each worker), each worker
+    will see random input from the same overall dataset in each step. Otherwise,
+    each worker will see the same input in each step.
+
+    Args:
+      dataset: `tf.data.Dataset` that will be used as the input source.
+      worker_device_pairs: A list of (worker, list of devices on that worker)
+        pairs.
+      split_batch_by: Optional integer. If present, we "split" each batch of the
+        dataset by `split_batch_by` value.
+    """
+    if split_batch_by:
+      dataset = _split_dataset_batch(dataset, split_batch_by)
+
+    iterators = []
+    for worker, worker_devices in worker_device_pairs:
+      with ops.device(worker):
+        iterator = _SingleWorkerDatasetIterator(dataset, worker, worker_devices)
+        iterators.append(iterator)
+
+    super(DatasetIterator, self).__init__(worker_device_pairs, iterators)
+
+
+class _SingleWorkerDatasetIterator(object):
+  """Iterator for a single `tf.data.Dataset`."""
+
+  def __init__(self, dataset, worker, devices):
+    """Create iterator for the `dataset` to fetch data to worker's `devices` .
+
+    `MultiDeviceIterator` is used to prefetch input to the devices on the
+    given worker. `MultiDeviceIterator` doesn't work in eager mode yet.
+
+    Args:
+      dataset: A `tf.data.Dataset` instance.
+      worker: Worker on which ops should be created.
+      devices: Distribute data from `dataset` to these devices.
+    """
+    self._dataset = dataset
+    self._worker = worker
+    self._devices = devices
+    self._is_eager = context.executing_eagerly()
+    self._make_iterator()
+
+  def _make_iterator(self):
+    """Make appropriate iterator on the dataset."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # TODO(rohanj): Enable prefetching in eager mode.
+        # TODO(priyag): Measure the performance of this approach vs calling
+        # get_next on the original dataset N times.
+        dataset = self._dataset.batch(len(self._devices), drop_remainder=True)
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
+      else:
+        iterator = multi_device_iterator_ops.MultiDeviceIterator(
+            self._dataset, self._devices)
+    self._iterator = iterator
+
+  def get_next(self, name=None):
+    """Get next element from the underlying iterator."""
+    with ops.device(self._worker):
+      if self._is_eager:
+        # Batched dataset case.
+        batch = self._iterator.get_next(name=name)
+        index = {}
+        for i, d in enumerate(self._devices):
+          index[d] = nest.map_structure(operator.itemgetter(i), batch)
+          with ops.device(d):
+            index[d] = nest.map_structure(array_ops.identity, index[d])
+      else:
+        # MultiDeviceIterator case.
+        data_list = self._iterator.get_next()
+        index = dict(zip(self._devices, data_list))
+
+      return regroup(index)
+
+  def initialize(self):
+    """Initialze underlying iterator.
+
+    In eager execution, this simply recreates the underlying iterator.
+    In graph execution, it returns the initializer ops for the underlying
+    iterator.
+
+    Returns:
+      A list of any initializer ops that should be run.
+    """
+    if self._is_eager:
+      self._make_iterator()
+      return []
+    else:
+      return [self._iterator.initializer]
+
+  @property
+  def output_classes(self):
+    return self._iterator.output_classes
+
+  @property
+  def output_shapes(self):
+    return self._iterator.output_shapes
+
+  @property
+  def output_types(self):
+    return self._iterator.output_types
+
+
+def _split_dataset_batch(dataset, split_batch_by):
+  """Divide a batch-ed dataset's batches into smaller batches."""
+  # TODO(sourabhbajaj): Remove this in lieu of distributed datasets
+  # pylint: disable=protected-access
+  def _get_batch_dataset(d):
+    """Get the underlying batch dataset from the dataset object."""
+    if isinstance(d, dataset_ops.DatasetV1Adapter):
+      d = d._dataset
+
+    if isinstance(d, (dataset_ops.BatchDataset, batching._MapAndBatchDataset)):
+      return d
+    elif isinstance(d, dataset_ops.PrefetchDataset):
+      return _get_batch_dataset(d._input_dataset)
+    raise ValueError(
+        "Unable to get batched dataset from the input dataset. `batch` "
+        "`map_and_batch` need to be the last operations on the dataset. "
+        "The batch operations can be followed by a prefetch.")
+
+  batched_dataset = _get_batch_dataset(dataset)
+  batch_size = batched_dataset._batch_size
+  drop_remainder = batched_dataset._drop_remainder
+  # pylint: enable=protected-access
+
+  if tensor_util.is_tensor(batch_size):
+    batch_size = tensor_util.constant_value(batch_size)
+
+  if tensor_util.is_tensor(drop_remainder):
+    drop_remainder = tensor_util.constant_value(drop_remainder)
+
+  if batch_size % split_batch_by:
+    raise ValueError(
+        "Batch size %s cannot be sharded evenly across replicas %s" % (
+            batch_size, split_batch_by))
+  new_batch_size = batch_size // split_batch_by
+
+  dataset = dataset.apply(batching.unbatch())
+  return dataset.batch(new_batch_size, drop_remainder=drop_remainder)
 
 
 class MultiStepContext(object):
   """A context object that can be used to capture things when running steps.
 
   This context object is useful when running multiple steps at a time using the
-  `run_steps_on_dataset` API. For e.g. it allows the user's step function to
-  specify which outputs to emit at what frequency. Currently it supports
-  capturing output from the last step, as well as capturing non tensor outputs.
-  In the future it will be augmented to support other use cases such as output
-  each N steps.
+  `experimental_run_steps_on_iterator` API. For e.g. it allows the user's step
+  function to specify which outputs to emit at what frequency. Currently it
+  supports capturing output from the last step, as well as capturing non tensor
+  outputs.  In the future it will be augmented to support other use cases such
+  as output each N steps.
   """
 
   def __init__(self):
-    """Initializes an output context.
+    """Initialize an output context.
 
     Returns:
       A context object.
     """
     self._last_step_outputs = {}
-    self._last_step_outputs_aggregations = {}
+    self._last_step_outputs_reduce_ops = {}
     self._non_tensor_outputs = {}
 
   @property
@@ -1325,8 +1644,8 @@ class MultiStepContext(object):
     Keys in the dictionary are names of tensors to be captured, as specified
     when `set_last_step_output` is called.
     Values in the dictionary are the tensors themselves. If
-    `set_last_step_output` was called with an `aggregation` for this output,
-    then the value is the aggregated value.
+    `set_last_step_output` was called with a `reduce_op` for this output,
+    then the value is the reduced value.
 
     Returns:
       A dictionary with last step outputs.
@@ -1339,8 +1658,7 @@ class MultiStepContext(object):
       raise ValueError("Need a dictionary to set last_step_outputs.")
     self._last_step_outputs = outputs
 
-  def set_last_step_output(self, name, output,
-                           aggregation=variables_lib.VariableAggregation.NONE):
+  def set_last_step_output(self, name, output, reduce_op=None):
     """Set `output` with `name` to be outputted from the last step.
 
     Args:
@@ -1348,39 +1666,36 @@ class MultiStepContext(object):
         name.
       output: The tensors that should be outputted with `name`. See below for
         actual types supported.
-      aggregation: Aggregation method to use to aggregate outputs from multiple
+      reduce_op: Reduction method to use to reduce outputs from multiple
         replicas. Required if `set_last_step_output` is called in a replica
         context. Optional in cross_replica_context.
-        When present, the outputs from all the replicas are aggregated using the
+        When present, the outputs from all the replicas are reduced using the
         current distribution strategy's `reduce` method. Hence, the type of
         `output` must be what's supported by the corresponding `reduce` method.
-        For e.g. if using MirroredStrategy and aggregation is set, output
+        For e.g. if using MirroredStrategy and reduction is set, output
         must be a `PerReplica` value.
-        The aggregation method is also recorded in a dictionary
-        `_last_step_outputs_aggregations` for later interpreting of the
+        The reduce method is also recorded in a dictionary
+        `_last_step_outputs_reduce_ops` for later interpreting of the
         outputs as already reduced or not.
-
     """
     if distribution_strategy_context.get_cross_replica_context():
-      self._last_step_outputs_aggregations[name] = aggregation
-      if aggregation is variables_lib.VariableAggregation.NONE:
+      self._last_step_outputs_reduce_ops[name] = reduce_op
+      if reduce_op is None:
         self._last_step_outputs[name] = output
       else:
         distribution = distribution_strategy_context.get_distribution_strategy()
-        self._last_step_outputs[name] = distribution.reduce(
-            aggregation, output, destinations="/device:CPU:0")
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, output)
     else:
-      assert aggregation is not variables_lib.VariableAggregation.NONE
+      assert reduce_op is not None
       def merge_fn(distribution, value):
-        self._last_step_outputs[name] = distribution.reduce(
-            aggregation, value, destinations="/device:CPU:0")
+        self._last_step_outputs[name] = distribution.reduce(reduce_op, value)
         # Setting this inside the `merge_fn` because all replicas share the same
         # context object, so it's more robust to set it only once (even if all
         # the replicas are trying to set the same value).
-        self._last_step_outputs_aggregations[name] = aggregation
+        self._last_step_outputs_reduce_ops[name] = reduce_op
 
       distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, output)
+          merge_fn, args=(output,))
 
   @property
   def non_tensor_outputs(self):
@@ -1394,10 +1709,10 @@ class MultiStepContext(object):
     else:
       def merge_fn(distribution, value):
         # NOTE(priyag): For non tensor outputs, we simply return all the values
-        # in a list as aggregation doesn't make sense on non tensors.
+        # in a list as reduction doesn't make sense on non tensors.
         self._non_tensor_outputs[name] = distribution.unwrap(value)
       distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, output)
+          merge_fn, args=(output,))
 
 
 def value_container(val):
@@ -1462,14 +1777,11 @@ class AggregatingVariable(checkpointable.CheckpointableBase):
                          "a variable in Replica Context.")
 
       def merge_fn(strategy, value, *other_args, **other_kwargs):
-        return strategy.update(
-            self, f,
-            strategy.reduce(
-                aggregation=self._aggregation, value=value, destinations=self),
-            *other_args, **other_kwargs)
+        v = _apply_aggregation(strategy, value, self._aggregation, self)
+        return strategy.update(self, f, v, *other_args, **other_kwargs)
 
       return distribution_strategy_context.get_replica_context().merge_call(
-          merge_fn, *args, **kwargs)
+          merge_fn, args=args, kwargs=kwargs)
 
   def assign_sub(self, *args, **kwargs):
     assign_sub_fn = lambda var, *a, **kw: var.assign_sub(*a, **kw)
diff --git a/tensorflow/python/eager/BUILD b/tensorflow/python/eager/BUILD
index 362e8e3b8329ae0c98dd38603738d5c2a3ca21da..f43cf9327a1ad6b2b83ebcb2482ad3fc27515251 100644
--- a/tensorflow/python/eager/BUILD
+++ b/tensorflow/python/eager/BUILD
@@ -174,6 +174,23 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "function_gradients_test",
+    size = "medium",
+    srcs = ["function_gradients_test.py"],
+    additional_deps = [
+        ":backprop",
+        ":context",
+        ":def_function",
+        ":function",
+        ":test",
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+    ],
+    shard_count = 5,
+)
+
 cuda_py_test(
     name = "function_test",
     size = "medium",
@@ -193,7 +210,7 @@ cuda_py_test(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
     ],
-    shard_count = 20,
+    shard_count = 15,
 )
 
 py_library(
@@ -238,6 +255,18 @@ py_library(
     ],
 )
 
+py_test(
+    name = "execution_callbacks_test",
+    srcs = ["execution_callbacks_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":execution_callbacks",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+    ],
+)
+
 py_library(
     name = "graph_only_ops",
     srcs = ["graph_only_ops.py"],
@@ -318,6 +347,7 @@ py_library(
         "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:execute",
         "//tensorflow/python/eager:tape",
+        "//tensorflow/python/ops/parallel_for:control_flow_ops",
         "@six_archive//:six",
     ],
 )
diff --git a/tensorflow/python/eager/backprop.py b/tensorflow/python/eager/backprop.py
index 844c9b52e7fda6e6582201448ee576d9de752223..29f9b2cda3aa2c6e7fff6c6df10fed81779d02c7 100644
--- a/tensorflow/python/eager/backprop.py
+++ b/tensorflow/python/eager/backprop.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import functools
 import operator
+import sys
 
 import six
 
@@ -33,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import math_ops
@@ -42,9 +44,20 @@ from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.lazy_loader import LazyLoader
 from tensorflow.python.util.tf_export import tf_export
 
 
+# Note that we need to lazy load the following two modules to avoid creating
+# circular dependencies.
+# TODO(b/119775953): fix the circular dependencies.
+pfor_ops = LazyLoader(
+    "pfor_ops", globals(),
+    "tensorflow.python.ops.parallel_for.control_flow_ops")
+
+function = LazyLoader("function", globals(),
+                      "tensorflow.python.eager.function")
+
 _op_attr_type_cache = {}
 
 
@@ -536,11 +549,11 @@ def _aggregate_grads(gradients):
 
   if len(gradients) == 1:
     return gradients[0]
-  if all([isinstance(g, ops.Tensor) for g in gradients]):
+  if all(isinstance(g, ops.Tensor) for g in gradients):
     return gen_math_ops.add_n(gradients)
   else:
-    assert all([isinstance(g, (ops.Tensor, ops.IndexedSlices))
-                for g in gradients])
+    assert all(isinstance(g, (ops.Tensor, ops.IndexedSlices))
+               for g in gradients)
     indexed_slices_list = []
     for grad in gradients:
       # TODO(xpan): Support nested IndexedSlices and core IndexedSlices
@@ -776,6 +789,8 @@ class GradientTape(object):
         context.context().end_step()
       except AttributeError:
         pass
+      except TypeError:
+        pass
 
   def watch(self, tensor):
     """Ensures that `tensor` is being traced by this tape.
@@ -935,3 +950,213 @@ class GradientTape(object):
 
     grad = nest.pack_sequence_as(sources, flat_grad)
     return grad
+
+  def jacobian(self,
+               target,
+               sources,
+               unconnected_gradients=UnconnectedGradients.NONE,
+               parallel_iterations=None,
+               experimental_use_pfor=True):
+    """Computes the jacobian using operations recorded in context of this tape.
+
+    See http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant for the
+    definition of a Jacobian.
+
+    Example usage:
+
+    with tf.GradientTape() as g:
+      x  = tf.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    jacobian = g.jacobian(y, x)
+    # jacobian value is [[2., 0.], [0., 4.]]
+
+    Args:
+      target: Tensor to be differentiated.
+      sources: a list or nested structure of Tensors or Variables. `target`
+        will be differentiated against elements in `sources`.
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
+      parallel_iterations: A knob to control how many iterations are dispatched
+        in parallel. This knob can be used to control the total memory usage.
+      experimental_use_pfor: If true, vectorizes the jacobian computation. Else
+        falls back to a sequential while_loop. Vectorization can sometimes fail
+        or lead to excessive memory usage. This option can be used to disable
+        vectorization in such cases.
+
+    Returns:
+      a list or nested structure of Tensors (or IndexedSlices, or None),
+      one for each element in `sources`. Returned structure is the same as
+      the structure of `sources`.
+
+    Raises:
+      RuntimeError: If called on a non-persistent tape with eager execution
+        enabled and without enabling experimental_use_pfor.
+      ValueError: If vectorization of jacobian computation fails.
+    """
+    flat_sources = nest.flatten(sources)
+    target_static_shape = target.shape
+    target_shape = array_ops.shape(target)
+    # Note that we push and pop the tape here and below. This is needed since we
+    # need gradients through the enclosed operations.
+    self._push_tape()
+    target = array_ops.reshape(target, [-1])
+    self._pop_tape()
+
+    def loop_fn(i):
+      self._push_tape()
+      y = array_ops.gather(target, i)
+      self._pop_tape()
+      return self.gradient(y, flat_sources,
+                           unconnected_gradients=unconnected_gradients)
+
+    try:
+      target_size = int(target.shape[0])
+    except TypeError:
+      target_size = array_ops.shape(target)[0]
+
+    if experimental_use_pfor:
+      try:
+        output = pfor_ops.pfor(loop_fn, target_size,
+                               parallel_iterations=parallel_iterations)
+      except ValueError as err:
+        six.reraise(
+            ValueError,
+            ValueError(
+                str(err) + "\nEncountered an exception while vectorizing the "
+                "jacobian computation. Vectorization can be disabled by setting"
+                " experimental_use_pfor to False."),
+            sys.exc_info()[2])
+    else:
+      if context.executing_eagerly() and not self._persistent:
+        raise RuntimeError(
+            "GradientTape must be created with persistent=True"
+            " to compute the jacobian with eager execution enabled and with "
+            " experimental_use_pfor set to False.")
+      output = pfor_ops.for_loop(
+          loop_fn, [target.dtype] * len(flat_sources), target_size,
+          parallel_iterations=parallel_iterations)
+
+    for i, out in enumerate(output):
+      if out is not None:
+        new_shape = array_ops.concat(
+            [target_shape, array_ops.shape(out)[1:]], axis=0)
+        out = array_ops.reshape(out, new_shape)
+        if context.executing_eagerly():
+          out.set_shape(target_static_shape.concatenate(flat_sources[i].shape))
+      output[i] = out
+
+    return nest.pack_sequence_as(sources, output)
+
+  def batch_jacobian(self,
+                     target,
+                     source,
+                     unconnected_gradients=UnconnectedGradients.NONE,
+                     parallel_iterations=None,
+                     experimental_use_pfor=True):
+    """Computes and stacks per-example jacobians.
+
+    See http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant for the
+    definition of a Jacobian.  This function is essentially an efficient
+    implementation of the following:
+    `tf.stack([self.jacobian(y[i], x[i]) for i in range(x.shape[0])])`.
+
+    Note that compared to `GradientTape.jacobian` which computes gradient of
+    each output value w.r.t each input value, this function is useful when
+    `target[i,...] is independent of `source[j,...]` for `j != i`. This
+    independence assumption allows more efficient computation as compared to
+    `GradientTape.jacobian`. The output, as well as intermediate activations,
+    are lower dimensional and avoid a bunch of redundant zeros which would
+    result in the jacobian computation given the independence assumption.
+
+    Example usage:
+    with tf.GradientTape() as g:
+      x = tf.constant([[1, 2], [3, 4]], dtype=tf.float32)
+      g.watch(x)
+      y = x * x
+    batch_jacobian = g.batch_jacobian(y, x)
+    # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
+
+    Args:
+      target: A tensor with rank 2 or higher and with shape [b, y1, ..., y_n].
+        `target[i,...]` should only depend on `source[i,...]`.
+      source: A tensor with rank 2 or higher and with shape [b, x1, ..., x_m].
+      unconnected_gradients: a value which can either hold 'none' or 'zero' and
+        alters the value which will be returned if the target and sources are
+        unconnected. The possible values and effects are detailed in
+        'UnconnectedGradients' and it defaults to 'none'.
+      parallel_iterations: A knob to control how many iterations are dispatched
+        in parallel. This knob can be used to control the total memory usage.
+      experimental_use_pfor: If true, uses pfor for computing the Jacobian. Else
+        uses a tf.while_loop.
+
+    Returns:
+      A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
+      is the jacobian of `target[i, ...]` w.r.t. `source[i, ...]`, i.e. stacked
+      per-example jacobians.
+
+    Raises:
+      RuntimeError: If called on a non-persistent tape with eager execution
+        enabled and without enabling experimental_use_pfor.
+      ValueError: If vectorization of jacobian computation fails or if first
+        dimension of `target` and `source` do not match.
+    """
+    target_shape = target.shape
+    if not target_shape.with_rank_at_least(2)[0].is_compatible_with(
+        source.shape.with_rank_at_least(2)[0]):
+      raise ValueError(
+          "Need first dimension of target shape (%s) and "
+          "source shape (%s) to match." % (target.shape, source.shape))
+    if target_shape.is_fully_defined():
+      batch_size = int(target_shape[0])
+      target_row_size = target_shape.num_elements() // batch_size
+    else:
+      target_shape = array_ops.shape(target)
+      batch_size = target_shape[0]
+      target_row_size = array_ops.size(target) // batch_size
+    source_shape = array_ops.shape(source)
+    # Flatten target to 2-D.
+    # Note that we push and pop the tape here and below. This is needed since we
+    # need gradients through the enclosed operations.
+    self._push_tape()
+    with ops.control_dependencies(
+        [check_ops.assert_equal(batch_size, source_shape[0])]):
+      target = array_ops.reshape(target, [batch_size, target_row_size])
+    self._pop_tape()
+
+    def loop_fn(i):
+      self._push_tape()
+      y = array_ops.gather(target, i, axis=1)
+      self._pop_tape()
+      return self.gradient(y, source,
+                           unconnected_gradients=unconnected_gradients)
+
+    if experimental_use_pfor:
+      try:
+        output = pfor_ops.pfor(loop_fn, target_row_size,
+                               parallel_iterations=parallel_iterations)
+      except ValueError as err:
+        six.reraise(
+            ValueError,
+            ValueError(
+                str(err) + "\nEncountered an exception while vectorizing the "
+                "batch_jacobian computation. Vectorization can be disabled by "
+                "setting experimental_use_pfor to False."),
+            sys.exc_info()[2])
+    else:
+      if context.executing_eagerly() and not self._persistent:
+        raise RuntimeError(
+            "GradientTape must be created with persistent=True"
+            " to compute the batch_jacobian with eager execution enabled and "
+            " with experimental_use_pfor set to False.")
+      output = pfor_ops.for_loop(loop_fn, target.dtype, target_row_size,
+                                 parallel_iterations=parallel_iterations)
+    if output is None:
+      return None
+    output = array_ops.reshape(output,
+                               [target_row_size, batch_size, -1])
+    output = array_ops.transpose(output, [1, 0, 2])
+    new_shape = array_ops.concat([target_shape, source_shape[1:]], axis=0)
+    return array_ops.reshape(output, new_shape)
diff --git a/tensorflow/python/eager/backprop_test.py b/tensorflow/python/eager/backprop_test.py
index 274d5320df0cdf9210c2f38ad2b25ecc84b31a21..61c47a29fd2427850006cbe2dfe1e6bb69d988ab 100644
--- a/tensorflow/python/eager/backprop_test.py
+++ b/tensorflow/python/eager/backprop_test.py
@@ -74,7 +74,7 @@ class BackpropTest(test.TestCase):
       tf_g1 = embedding_ops.embedding_lookup(tf_var, tf_ind1)
       tf_g2 = embedding_ops.embedding_lookup(tf_var, tf_ind2)
       tf_g3 = embedding_ops.embedding_lookup(tf_var, tf_ind3)
-      tf_g4 = math_ops.reduce_sum(tf_var * 2.0, reduction_indices=(0, 1))
+      tf_g4 = math_ops.reduce_sum(tf_var * 2.0, axis=(0, 1))
       tf_y = tf_g1 * tf_g2 * tf_g3 * tf_g4
       tf_grad = gradients.gradients(tf_y, [tf_var])[0]
 
@@ -215,7 +215,7 @@ class BackpropTest(test.TestCase):
       self.assertAllClose(tf_grad.values.eval(), grad.values)
 
       tf_opt.apply_gradients([(tf_grad, tf_embedding)]).run()
-      expected = tf_embedding.eval()
+      expected = self.evaluate(tf_embedding)
     opt.apply_gradients([(grad, embedding)])
     self.assertAllClose(expected, embedding.read_value())
 
@@ -233,6 +233,68 @@ class BackpropTest(test.TestCase):
     self.assertTrue(ordered_variables[0] is v0)
     self.assertTrue(ordered_variables[1] is v1)
 
+  def testTapeNoOpGradient(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeIdentityGradientIsIdentity(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = array_ops.identity(x)
+    self.assertEqual(t.gradient(y, x).numpy(), 1.0)
+
+  def testTapeGradientMultiTargetOneIsSource(self):
+    x = constant_op.constant(2.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x*x
+    self.assertEqual(t.gradient([x, y], x).numpy(), 5.0)
+
+  def testTapeNoOpGradientWithMultiTargetAllSource(self):
+    x = constant_op.constant(3.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      y = x
+    self.assertEqual(t.gradient([y, y], x).numpy(), 2.0)
+
+  def testTapeNoOpGradientWithMultiTargetMultiSource(self):
+    x = constant_op.constant(3.0)
+    y = constant_op.constant(5.0)
+    with backprop.GradientTape() as t:
+      t.watch(x)
+      t.watch(y)
+      z = y * y
+    self.assertAllEqual(t.gradient([x, y, z], [x, y]), [1.0, 11.0])
+
+  def testTapeNoOpOnVariableIsIdentity(self):
+    v0 = resource_variable_ops.ResourceVariable(1.0)
+    with backprop.GradientTape() as t:
+      y = v0.read_value()
+    self.assertEqual(t.gradient(y, v0).numpy(), 1.0)
+
+  @test_util.assert_no_new_tensors
+  @test_util.assert_no_garbage_created
+  def testTapeNoOpGradient2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient(a_2_by_2, [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(1.0, shape=[2, 2]).numpy())
+
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testTapeNoOpGradientMultiTarget2By2(self):
+    a_2_by_2 = constant_op.constant(2.0, shape=[2, 2])
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(a_2_by_2)
+    dy_dy = tape.gradient([a_2_by_2, a_2_by_2], [a_2_by_2])[0]
+    self.assertAllEqual(dy_dy.numpy(),
+                        constant_op.constant(2.0, shape=[2, 2]).numpy())
+
   def testTapeStopRecording(self):
     with backprop.GradientTape() as t:
       x = resource_variable_ops.ResourceVariable(1.0)
@@ -586,6 +648,7 @@ class BackpropTest(test.TestCase):
       g.gradient(x, y)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithCond(self):
     x = constant_op.constant(3.0)
 
@@ -607,6 +670,7 @@ class BackpropTest(test.TestCase):
       self.assertEqual(self.evaluate(dy), 6.0)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testGradientTapeWithWhileLoop(self):
     i = constant_op.constant(1)
     x = constant_op.constant(2.)
@@ -642,6 +706,7 @@ class BackpropTest(test.TestCase):
 
   @test_util.assert_no_new_tensors
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only('b/120545219')
   def testPersistentTape(self):
     with backprop.GradientTape(persistent=True) as g:
       x = constant_op.constant(3.0)
@@ -1165,5 +1230,208 @@ class BackpropTest(test.TestCase):
       self.assertAllEqual(da[0], tf_da[0].eval())
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class JacobianTest(test.TestCase):
+
+  def _jacobian(self, experimental_use_pfor):
+    persistent = context.executing_eagerly and not experimental_use_pfor
+    with backprop.GradientTape(persistent=persistent) as g:
+      x = constant_op.constant([1., 2.])
+      y = constant_op.constant([3., 4.])
+      g.watch(x)
+      g.watch(y)
+      z = x * x * y
+    jacobian = g.jacobian(z, [x, y],
+                          experimental_use_pfor=experimental_use_pfor)
+    answer = [array_ops.diag(2 * x * y), array_ops.diag(x * x)]
+    return jacobian, answer
+
+  @test_util.run_v1_only('b/120545219')
+  def testPfor(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=True)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoop(self):
+    jacobian, answer = self._jacobian(experimental_use_pfor=False)
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=True)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoopDefun(self):
+
+    @function.defun
+    def _f():
+      return self._jacobian(experimental_use_pfor=False)
+
+    jacobian, answer = _f()
+    for j, a in zip(jacobian, answer):
+      self.assertAllEqual(a, j)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPersistentTape(self):
+    if not context.executing_eagerly():
+      return
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1.0, 2.0])
+      g.watch(x)
+      y = x * x
+    with self.assertRaisesRegexp(RuntimeError, 'persistent'):
+      g.jacobian(y, x, experimental_use_pfor=False)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforException(self):
+    var = variables.Variable([1.])
+
+    @custom_gradient.custom_gradient
+    def op(x):
+      def grad(_):
+        # Note that we perform a stateful operation here that will not be
+        # compatible with parallel for construct.
+        with ops.control_dependencies(
+            [var.assign(random_ops.random_uniform([1]))]):
+          return constant_op.constant(1.)
+      return x, grad
+
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([1., 2.])
+      g.watch(x)
+      y = op(x)
+    with self.assertRaisesRegexp(ValueError, 'No converter'):
+      g.jacobian(y, x, experimental_use_pfor=True)
+
+  @test_util.run_v1_only('b/120545219')
+  def test_parallel_iterations(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant([[1., 2], [3, 4]])
+      g.watch(x)
+      y = math_ops.matmul(x, x)
+    self.assertAllClose(g.jacobian(y, x, parallel_iterations=2),
+                        g.jacobian(y, x, parallel_iterations=3))
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BatchJacobianTest(test.TestCase):
+
+  def _batch_jacobian(self, experimental_use_pfor):
+    persistent = context.executing_eagerly and not experimental_use_pfor
+    with backprop.GradientTape(persistent=persistent) as g:
+      x = constant_op.constant([[1., 2.], [3., 4.]])
+      y = constant_op.constant([[3., 4.], [5., 6.]])
+      g.watch(x)
+      z = x * x * y
+    batch_jacobian = g.batch_jacobian(
+        z, x, experimental_use_pfor=experimental_use_pfor)
+    answer = array_ops.stack([array_ops.diag(2 * x[0] * y[0]),
+                              array_ops.diag(2 * x[1] * y[1])])
+    return batch_jacobian, answer
+
+  @test_util.run_v1_only('b/120545219')
+  def testPfor(self):
+    batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=True)
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoop(self):
+    batch_jacobian, answer = self._batch_jacobian(experimental_use_pfor=False)
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforDefun(self):
+
+    @function.defun
+    def _f():
+      return self._batch_jacobian(experimental_use_pfor=True)
+
+    batch_jacobian, answer = _f()
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testWhileLoopDefun(self):
+
+    @function.defun
+    def _f():
+      return self._batch_jacobian(experimental_use_pfor=False)
+
+    batch_jacobian, answer = _f()
+    self.assertAllEqual(answer, batch_jacobian)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPersistentTape(self):
+    if not context.executing_eagerly():
+      return
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([[1.0, 2.0]])
+      g.watch(x)
+      y = x * x
+    with self.assertRaisesRegexp(RuntimeError, 'persistent'):
+      g.batch_jacobian(y, x, experimental_use_pfor=False)
+
+  @test_util.run_v1_only('b/120545219')
+  def testBadShape(self):
+    x = random_ops.random_uniform([2, 3])
+    with backprop.GradientTape() as g:
+      y = array_ops.concat([x, x], axis=0)
+    with self.assertRaisesRegexp(ValueError, 'Need first dimension'):
+      g.batch_jacobian(y, x)
+
+  @test_util.run_v1_only('b/120545219')
+  def testBadInputRank(self):
+    x = random_ops.random_uniform([2])
+    with backprop.GradientTape() as g:
+      y = random_ops.random_uniform([2, 2])
+    with self.assertRaisesRegexp(ValueError, 'must have rank at least 2'):
+      g.batch_jacobian(y, x)
+
+  def testBadOutputRank(self):
+    x = random_ops.random_uniform([2, 2])
+    with backprop.GradientTape() as g:
+      y = random_ops.random_uniform([2])
+    with self.assertRaisesRegexp(ValueError, 'must have rank at least 2'):
+      g.batch_jacobian(y, x)
+
+  @test_util.run_v1_only('b/120545219')
+  def testPforException(self):
+    var = variables.Variable([1.])
+
+    @custom_gradient.custom_gradient
+    def op(x):
+      def grad(_):
+        # Note that we perform a stateful operation here that will not be
+        # compatible with parallel for construct.
+        with ops.control_dependencies(
+            [var.assign(random_ops.random_uniform([1]))]):
+          return constant_op.constant(1.)
+      return x, grad
+
+    with backprop.GradientTape() as g:
+      x = constant_op.constant([[1.], [2.]])
+      g.watch(x)
+      y = op(x)
+    with self.assertRaisesRegexp(ValueError, 'No converter'):
+      g.batch_jacobian(y, x, experimental_use_pfor=True)
+
+  @test_util.run_v1_only('b/120545219')
+  def test_parallel_iterations(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = constant_op.constant([[1., 2], [3, 4]])
+      g.watch(x)
+      w = constant_op.constant([[1., 2, 3, 4], [5, 6, 7, 8]])
+      y = math_ops.matmul(x, w)
+    self.assertAllClose(g.batch_jacobian(y, x, parallel_iterations=2),
+                        g.batch_jacobian(y, x, parallel_iterations=3))
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/eager/benchmarks_test.py b/tensorflow/python/eager/benchmarks_test.py
index 886715867c8312283811f28e748b14296f668954..31a7efca82b016bc193ab9985ea7603897edc7ac 100644
--- a/tensorflow/python/eager/benchmarks_test.py
+++ b/tensorflow/python/eager/benchmarks_test.py
@@ -80,7 +80,6 @@ class SubclassedKerasModel(keras.Model):
 
   def __init__(self, initializer="ones"):
     super(SubclassedKerasModel, self).__init__()
-    self._can_use_graph_functions = True
     self.layer_a = keras.layers.Dense(
         64, kernel_initializer=initializer, bias_initializer="zeros")
     self.layer_b = keras.layers.Dense(
@@ -733,38 +732,38 @@ class MicroBenchmarks(test.Benchmark):
     assert np.equal(func(), make_keras_model()(data)).all()
     self._run(func, 30000)
 
-  def _benchmark_keras_model_fit(self, model):
+  def _benchmark_keras_model_fit(self, model, run_eagerly=False):
     data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
     labels = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
     dataset = dataset_ops.Dataset.from_tensors((data, labels)).repeat()
     model.compile(
         gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
-        loss="mse")
+        loss="mse", run_eagerly=run_eagerly)
     func = lambda: model.fit(dataset, epochs=1, steps_per_epoch=1000, verbose=0)
     # First call is more expensive (creates variables etc.), discount that.
     model.fit(dataset, epochs=1, steps_per_epoch=1, verbose=0)
 
     self._run(func, 1)
 
-  def _benchmark_keras_model_evaluate(self, model):
+  def _benchmark_keras_model_evaluate(self, model, run_eagerly=False):
     data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
     labels = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
     dataset = dataset_ops.Dataset.from_tensors((data, labels)).repeat()
     model.compile(
         gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
-        loss="mse")
+        loss="mse", run_eagerly=run_eagerly)
     func = lambda: model.evaluate(dataset, steps=1000, verbose=0)
     # First call is more expensive (creates variables etc.), discount that.
     model.evaluate(dataset, steps=1, verbose=0)
 
     self._run(func, 1)
 
-  def _benchmark_keras_model_predict(self, model):
+  def _benchmark_keras_model_predict(self, model, run_eagerly=False):
     data = random_ops.random_uniform((10, 10), minval=-1, maxval=1)
     dataset = dataset_ops.Dataset.from_tensors(tuple([data])).repeat()
     model.compile(
         gradient_descent.GradientDescentOptimizer(learning_rate=0.001),
-        loss="mse")
+        loss="mse", run_eagerly=run_eagerly)
     func = lambda: model.predict(dataset, steps=1000, verbose=0)
     # First call is more expensive (creates variables etc.), discount that.
     model.predict(dataset, steps=1, verbose=0)
@@ -780,10 +779,9 @@ class MicroBenchmarks(test.Benchmark):
       model = SubclassedKerasModel(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
-  def benchmark_keras_model_subclassed_fit_disable_defun(self):
+  def benchmark_keras_model_subclassed_fit_run_model_eagerly(self):
     model = SubclassedKerasModel(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_fit(model)
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
 
   def benchmark_keras_model_functional_fit(self):
     model = make_keras_model(initializer="glorot_uniform")
@@ -794,10 +792,9 @@ class MicroBenchmarks(test.Benchmark):
       model = make_keras_model(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
-  def benchmark_keras_model_functional_fit_disable_defun(self):
+  def benchmark_keras_model_functional_fit_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_fit(model)
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
 
   def benchmark_keras_model_sequential_fit(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
@@ -808,64 +805,57 @@ class MicroBenchmarks(test.Benchmark):
       model = make_sequential_keras_model(initializer="glorot_uniform")
       self._benchmark_keras_model_fit(model)
 
-  def benchmark_keras_model_sequential_fit_disable_defun(self):
+  def benchmark_keras_model_sequential_fit_run_model_eagerly(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_fit(model)
+    self._benchmark_keras_model_fit(model, run_eagerly=True)
 
   def benchmark_keras_model_subclassed_evaluate(self):
     model = SubclassedKerasModel(initializer="glorot_uniform")
     self._benchmark_keras_model_evaluate(model)
 
-  def benchmark_keras_model_subclassed_evaluate_disable_defun(self):
+  def benchmark_keras_model_subclassed_evaluate_run_model_eagerly(self):
     model = SubclassedKerasModel(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_evaluate(model)
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
 
   def benchmark_keras_model_functional_evaluate(self):
     model = make_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_evaluate(model)
 
-  def benchmark_keras_model_functional_evaluate_disable_defun(self):
+  def benchmark_keras_model_functional_evaluate_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_evaluate(model)
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
 
   def benchmark_keras_model_sequential_evaluate(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_evaluate(model)
 
-  def benchmark_keras_model_sequential_evaluate_disable_defun(self):
+  def benchmark_keras_model_sequential_evaluate_run_model_eagerly(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_evaluate(model)
+    self._benchmark_keras_model_evaluate(model, run_eagerly=True)
 
   def benchmark_keras_model_subclassed_predict(self):
     model = SubclassedKerasModel(initializer="glorot_uniform")
     self._benchmark_keras_model_predict(model)
 
-  def benchmark_keras_model_subclassed_predict_disable_defun(self):
+  def benchmark_keras_model_subclassed_predict_run_model_eagerly(self):
     model = SubclassedKerasModel(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_predict(model)
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
 
   def benchmark_keras_model_functional_predict(self):
     model = make_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_predict(model)
 
-  def benchmark_keras_model_functional_predict_disable_defun(self):
+  def benchmark_keras_model_functional_predict_run_model_eagerly(self):
     model = make_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_predict(model)
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
 
   def benchmark_keras_model_sequential_predict(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
     self._benchmark_keras_model_predict(model)
 
-  def benchmark_keras_model_sequential_predict_disable_defun(self):
+  def benchmark_keras_model_sequential_predict_run_model_eagerly(self):
     model = make_sequential_keras_model(initializer="glorot_uniform")
-    model._can_use_graph_functions = False
-    self._benchmark_keras_model_predict(model)
+    self._benchmark_keras_model_predict(model, run_eagerly=True)
 
   def benchmarkScan(self):
     elems = math_ops.range(1600)
diff --git a/tensorflow/python/eager/context.py b/tensorflow/python/eager/context.py
index e3fef524bf9f125839ccc3c6d47e54487b5a5961..cbbe5cf49e20afc63e7710e39dc37ecbc4ac5082 100644
--- a/tensorflow/python/eager/context.py
+++ b/tensorflow/python/eager/context.py
@@ -25,7 +25,6 @@ import random
 import threading
 
 from tensorflow.core.protobuf import config_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python import tf2
 from tensorflow.python.framework import c_api_util
@@ -86,21 +85,21 @@ class FunctionCallOptions(object):
   Eager functions are functions decorated with tf.contrib.eager.defun.
   """
 
-  def __init__(self, executor_type=None, rewriter_config=None):
+  def __init__(self, executor_type=None, config_proto=None):
     """Constructor.
 
     Args:
       executor_type: (optional) name of the executor to be used to execute the
         eager function. If None or an empty string, the default Tensorflow
         executor will be used.
-      rewriter_config: (optional) a rewriter_config_pb2.RewriterConfig proto or
+      config_proto: (optional) a `config_pb2.ConfigProto` proto or
         a serialized string of that proto.
         The config used by Grappler when optimizing the function graph.
         Each concrete function is optimized the first time is called. Changing
-        rewriter_config after the first call has no effect.
-        If rewriter_config is None, an empty RewriterConfig will be used.
+        config_proto after the first call has no effect.
+        If config_proto is None, an empty RewriterConfig will be used.
     """
-    self.rewriter_config_serialized = rewriter_config
+    self.config_proto_serialized = config_proto
     self.executor_type = executor_type
 
   @property
@@ -112,24 +111,22 @@ class FunctionCallOptions(object):
     self._executor_type = executor_type
 
   @property
-  def rewriter_config_serialized(self):
-    return self._rewriter_config_serialized
+  def config_proto_serialized(self):
+    return self._config_proto_serialized
 
-  @rewriter_config_serialized.setter
-  def rewriter_config_serialized(self, config):
-    if isinstance(config, rewriter_config_pb2.RewriterConfig):
-      self._rewriter_config_serialized = config.SerializeToString()
+  @config_proto_serialized.setter
+  def config_proto_serialized(self, config):
+    if isinstance(config, config_pb2.ConfigProto):
+      self._config_proto_serialized = config.SerializeToString()
     elif isinstance(config, str):
-      self._rewriter_config_serialized = config
+      self._config_proto_serialized = config
     elif config is None:
-      self._rewriter_config_serialized = rewriter_config_pb2.RewriterConfig(
-      ).SerializeToString()
+      self._config_proto_serialized = (
+          config_pb2.ConfigProto().SerializeToString())
     else:
-      raise ValueError(
-          "the rewriter config must be either a "
-          "rewriter_config_pb2.RewriterConfig, or a serialized string of that "
-          "proto or None. got: {}"
-          .format(type(config)))
+      raise ValueError("the rewriter config must be either a "
+                       "config_pb2.ConfigProto, or a serialized string of that "
+                       "proto or None. got: {}".format(type(config)))
 
 
 # TODO(agarwal): better name ?
@@ -152,14 +149,12 @@ class _EagerContext(threading.local):
 
     # Default rewriter config corresponds to turning all default grappler
     # optimizations on.
-    base_config = rewriter_config_pb2.RewriterConfig()
+    base_config = config_pb2.ConfigProto()
 
-    if config is not None and config.HasField(
-        "graph_options") and config.graph_options.HasField("rewrite_options"):
-      base_config.Merge(config.graph_options.rewrite_options)
+    if config is not None:
+      base_config.MergeFrom(config)
 
-    self.function_call_options = FunctionCallOptions(
-        rewriter_config=base_config)
+    self.function_call_options = FunctionCallOptions(config_proto=base_config)
 
 
 ContextSwitch = collections.namedtuple(
@@ -483,10 +478,6 @@ class Context(object):
     Raises:
       ValueError: If name is not a string or is an invalid device name.
     """
-    devices = self._context_devices
-    if devices is None:
-      self._initialize_handle_and_devices()
-      devices = self._context_devices
     eager_context = self._eager_context
     old_device_name = eager_context.device_name
     old_device_spec = eager_context.device_spec
@@ -507,7 +498,9 @@ class Context(object):
         if old_device_name:
           new_device_spec = copy.copy(old_device_spec)
         else:
-          new_device_spec = pydev.DeviceSpec.from_string(devices[0])
+          self._initialize_handle_and_devices()
+          new_device_spec = pydev.DeviceSpec.from_string(
+              self._context_devices[0])
         new_device_spec.merge_from(device_spec)
       else:
         new_device_spec = pydev.DeviceSpec.from_string("")
@@ -897,21 +890,21 @@ def export_run_metadata():
   return context().export_run_metadata()
 
 
-def function_rewriter_config(rewriter_config):
+def function_config_proto(config_proto):
   """Context manager for setting the grappler rewrite config.
 
   This config is used by Grappler when optimizing the function graph.
 
   Args:
-    rewriter_config: a rewriter_config_pb2.RewriterConfig proto or
+    config_proto: a `config_pb2.ConfigProto` proto or
       a serialized string of that proto or None. If None, the default instance
-      of rewriter_config_pb2.RewriterConfig will be used.
+      of `config_pb2.ConfigProto` will be used.
 
   Returns:
     A context manager.
   """
   def _set_options_func(options):
-    options.rewriter_config_serialized = rewriter_config
+    options.config_proto_serialized = config_proto
 
   return context().function_call_options(_set_options_func)
 
diff --git a/tensorflow/python/eager/def_function.py b/tensorflow/python/eager/def_function.py
index cad6721c702a9dd70788eb317e21e1cee70f0f4a..6bacd7a962fdefb8caf11189b0681694d23b97f0 100644
--- a/tensorflow/python/eager/def_function.py
+++ b/tensorflow/python/eager/def_function.py
@@ -214,7 +214,7 @@ class PolymorphicFunction(object):
                python_function,
                name,
                input_signature=None,
-               autograph=False,
+               autograph=True,
                experimental_autograph_options=None):
     """Initializes a polymorphic function.
 
@@ -503,7 +503,7 @@ class PolymorphicFunction(object):
 @tf_export("function", v1=[])
 def function(func=None,
              input_signature=None,
-             autograph=False,
+             autograph=True,
              experimental_autograph_options=None):
   """Creates a callable TensorFlow graph from a Python function.
 
@@ -552,9 +552,9 @@ def function(func=None,
     return x + tf.to_float(c)
 
   assert int(c) == 0
-  assert f(1.0) == 3.0
+  assert f(1.0) == 2.0
   assert int(c) == 1
-  assert f(1.0) == 4.0
+  assert f(1.0) == 3.0
   assert int(c) == 2
   ```
 
diff --git a/tensorflow/python/eager/def_function_test.py b/tensorflow/python/eager/def_function_test.py
index f0f71a219e6337ef710ce8df3a2451fd454cdba1..4100a10044c3c39763de8bb3eec645e278d94e19 100644
--- a/tensorflow/python/eager/def_function_test.py
+++ b/tensorflow/python/eager/def_function_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
@@ -149,9 +150,9 @@ class DefFunctionTest(test.TestCase):
 
       result = fn(3.0)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(sess.run(state[0]), 2.0)
-      self.assertAllEqual(sess.run(result), 6.0)
+      self.assertAllEqual(self.evaluate(result), 6.0)
 
   def testLegacyGraphModeVariablesNonTrivialInitializer(self):
     with ops.Graph().as_default(), self.test_session() as sess:
@@ -168,9 +169,9 @@ class DefFunctionTest(test.TestCase):
 
       result = fn(3.0)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertAllEqual(sess.run(state[0]), 6.0)
-      self.assertAllEqual(sess.run(result), 18.0)
+      self.assertAllEqual(self.evaluate(result), 18.0)
 
   def testLegacyGraphModeInputDependentInitializerFails(self):
     with ops.Graph().as_default():
@@ -207,6 +208,18 @@ class DefFunctionTest(test.TestCase):
     m1 = MyModel()
     self.assertAllEqual(m1.apply(3.0), 6.0)
 
+  def test_functools_partial(self):
+    self.assertAllClose(
+        3.,
+        def_function.function(functools.partial(lambda x, y: x + y, 1.))(
+            constant_op.constant(2.)))
+
+  def test_unspecified_default_argument(self):
+    wrapped = def_function.function(
+        lambda x, y=2: x + y,
+        input_signature=[tensor_spec.TensorSpec((), dtypes.int32)])
+    self.assertEqual(3, wrapped(constant_op.constant(1)).numpy())
+
   def test_optimizer(self):
     x = constant_op.constant([[3., 4.]])
     y = constant_op.constant([2.])
diff --git a/tensorflow/python/eager/execution_callbacks.py b/tensorflow/python/eager/execution_callbacks.py
index 80ff4459d60a33d1a02f14acaafb8370a48fb6ca..28b6b84a82c6550cd0e1b893b5002d13b306233d 100644
--- a/tensorflow/python/eager/execution_callbacks.py
+++ b/tensorflow/python/eager/execution_callbacks.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
 import functools
 
 import numpy as np
@@ -28,8 +29,13 @@ from tensorflow.python.eager import core
 from tensorflow.python.eager import execute
 from tensorflow.python.platform import tf_logging as logging
 
-_DEFAULT_CALLBACK_ACTION = "raise"
-_VALID_CALLBACK_ACTIONS = (None, "ignore", "print", "raise", "warn")
+IGNORE = "ignore"
+PRINT = "print"
+RAISE = "raise"
+WARN = "warn"
+
+_DEFAULT_CALLBACK_ACTION = RAISE
+_VALID_CALLBACK_ACTIONS = (None, IGNORE, PRINT, RAISE, WARN)
 
 
 # TODO(cais): Consider moving this exception class to errors_impl.py.
@@ -335,3 +341,38 @@ def seterr(inf_or_nan=None):
           functools.partial(inf_nan_callback, action=inf_or_nan))
 
   return old_settings
+
+
+@contextlib.contextmanager
+def errstate(inf_or_nan=None):
+  """Context manager setting error state.
+
+  Example:
+  ```
+  c = tf.log(0.)  # -inf
+
+  with errstate(inf_or_nan="raise"):
+    tf.log(0.)  # <-- Raises InfOrNanError.
+  ```
+
+  Args:
+    inf_or_nan: Set action for infinity (`inf`) and NaN (`nan`) values.
+      Possible values: `{IGNORE, PRINT, RAISE, WARN}`.
+      `IGNORE`: take no action when `inf` values appear.
+      `PRINT`: print a warning to `stdout`.
+      `RAISE`: raise an `InfOrNanError`.
+      `WARN`: print a warning using `tf.logging.warn`.
+      A value of `None` leads to no change in the action of the condition.
+
+  Yields:
+    None.
+
+  Raises:
+    ValueError: If the value of any keyword arguments is invalid.
+  """
+  if not context.executing_eagerly():
+    yield
+  else:
+    old_settings = seterr(inf_or_nan=inf_or_nan)
+    yield
+    seterr(**old_settings)
diff --git a/tensorflow/python/eager/execution_callbacks_test.py b/tensorflow/python/eager/execution_callbacks_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5594ab5f12abffb1e2b3bb4d1d0fa4251eedf809
--- /dev/null
+++ b/tensorflow/python/eager/execution_callbacks_test.py
@@ -0,0 +1,55 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for eager execution_callbacks."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.eager import execution_callbacks
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import test
+
+
+def log_zero():
+  """Computes `log(0.0)`."""
+  return math_ops.log(constant_op.constant(0.))
+
+
+class ExecutionCallbacksTest(test.TestCase):
+
+  def test_errstate_inf_raise(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+  def test_errstate_inf_ignore(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+      self.assertEqual(-float("inf"), log_zero().numpy())
+
+  def test_errstate_nesting(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      with execution_callbacks.errstate(inf_or_nan=execution_callbacks.IGNORE):
+        self.assertEqual(-float("inf"), log_zero().numpy())
+
+      with self.assertRaises(execution_callbacks.InfOrNanError):
+        log_zero()
+
+
+if __name__ == "__main__":
+  ops.enable_eager_execution()
+  test.main()
diff --git a/tensorflow/python/eager/function.py b/tensorflow/python/eager/function.py
index c429dd359bbf7be07afcb3e8d7b84ed1d0493c30..520c85a2c2093436d8d99b4713f0ad5fcc92321d 100644
--- a/tensorflow/python/eager/function.py
+++ b/tensorflow/python/eager/function.py
@@ -48,6 +48,7 @@ from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
@@ -66,6 +67,11 @@ WHITELIST_FUNCTION_ATTRIBUTE_REGEX = [
     BACKWARD_FUNCTION_ATTRIBUTE_NAME
 ]
 
+CacheKey = collections.namedtuple("CacheKey", [
+    "input_signature", "parent_graph", "device_functions", "colocation_stack",
+    "uses_xla"
+])
+
 
 def _parse_func_attrs(attributes):
   """Convert the keyword arguments into function_def attributes.
@@ -83,8 +89,8 @@ def _parse_func_attrs(attributes):
   """
   attrs = {}
   for key, value in attributes.items():
-    if not any([re.match(reg, key)
-                for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX]):
+    if not any(re.match(reg, key)
+               for reg in WHITELIST_FUNCTION_ATTRIBUTE_REGEX):
       raise ValueError("Attribute name is not whitelisted. "
                        "Whitelisted: prefix %s, got: %s" %
                        (WHITELIST_FUNCTION_ATTRIBUTE_REGEX, key))
@@ -260,7 +266,7 @@ class _EagerDefinedFunction(object):
           f=self,
           tout=self._output_types,
           executing_eagerly=executing_eagerly,
-          config=function_call_options.rewriter_config_serialized,
+          config=function_call_options.config_proto_serialized,
           executor_type=function_call_options.executor_type)
 
     if executing_eagerly:
@@ -418,7 +424,10 @@ class Function(object):
 
     if (tape.should_record(tensor_inputs) or
         tape.should_record(self._captured_inputs)):
-      return self._backprop_call(args)
+      if context.executing_eagerly():
+        return self._eager_backprop_call(args)
+      else:
+        return self._backprop_call_with_delayed_rewrite(args)
 
     # Only need to override the gradient in graph mode and when we have outputs.
     if context.executing_eagerly() or not self.outputs:
@@ -444,37 +453,34 @@ class Function(object):
       name: The name to register the gradient as.
     """
     @ops.RegisterGradient(name)
-    def grad_fn(op, *doutputs):  # pylint: disable=unused-variable
-      """Gradients of this function."""
-      if self._backward_graph_function is None:
-        self._construct_backprop_function()
+    def _registered_grad_fn(op, *doutputs):  # pylint: disable=unused-variable
+      return self._grad_fn(op, *doutputs)
 
-      # pylint: disable=protected-access
-      self._forward_function.add_to_graph(op.graph)
-      num_inference_outputs = self._inference_function._num_outputs
-
-      # Rewrite an inference call op to be a forward call op
-      if op.get_attr("f").name.encode() == self._inference_function.name:
-        func = attr_value_pb2.AttrValue(
-            func=attr_value_pb2.NameAttrList(
-                name=self._forward_function.name))
-        op._set_attr("f", func)
-        types = attr_value_pb2.AttrValue.ListValue(
-            type=self._forward_function._output_types)
-        op._set_attr("Tout", attr_value_pb2.AttrValue(list=types))
-        for i in range(
-            num_inference_outputs, len(self._forward_function._output_types)):
-          t = ops.Tensor(op, i, self._forward_function._output_types[i])
-          t.set_shape(self._forward_function._output_shapes[i])
-          func_graph_output = self._forward_function._func_graph_outputs[i]
-          custom_gradient.copy_handle_data(func_graph_output, t)
-          op._outputs.append(t)
-      # pylint: enable=protected-access
-      # Compute the gradients using the side outputs
-      side_outputs = op.outputs[num_inference_outputs:]
-      args = list(doutputs[:num_inference_outputs]) + list(side_outputs)
-      return self._backward_graph_function._call_flat(  # pylint: disable=protected-access
-          (a for a in args if a is not None))
+  def _grad_fn(self, op, *doutputs):
+    """Gradients of this function."""
+    if self._backward_graph_function is None:
+      self._construct_backprop_function()
+
+    # pylint: disable=protected-access
+    self._forward_function.add_to_graph(op.graph)
+    num_inference_outputs = self._inference_function._num_outputs
+
+    # Rewrite an inference call op to be a forward call op
+    if op.get_attr("f").name.encode() == self._inference_function.name:
+      op._set_func_attr("f", self._forward_function.name)
+      op._set_type_list_attr("Tout", self._forward_function._output_types)
+      op._add_outputs(
+          self._forward_function._output_types[num_inference_outputs:],
+          self._forward_function._output_shapes[num_inference_outputs:])
+      for i in range(num_inference_outputs, len(op.outputs)):
+        func_graph_output = self._forward_function._func_graph_outputs[i]
+        custom_gradient.copy_handle_data(func_graph_output, op.outputs[i])
+    # pylint: enable=protected-access
+    # Compute the gradients using the side outputs
+    side_outputs = op.outputs[num_inference_outputs:]
+    args = list(doutputs[:num_inference_outputs]) + list(side_outputs)
+    return self._backward_graph_function._call_flat(  # pylint: disable=protected-access
+        (a for a in args if a is not None))
 
   @property
   def name(self):
@@ -617,10 +623,13 @@ class Function(object):
         self._func_graph.outputs + backwards_graph_captures,
         forward_function_attr)
 
-  def _backprop_call(self, args):
+  def _eager_backprop_call(self, args):
     """Calls the forward function and records the result on a tape.
 
-    (Only records results on a tape if the function has outputs)
+    This method fully constructs the forward and backward functions before
+    calling the function and recording them on the tape.
+
+    (Only records results on a tape if the function has outputs).
 
     Args:
       args: All inputs to the function, including resolved captured inputs
@@ -662,6 +671,46 @@ class Function(object):
                           args, backward_function)
     return self._build_call_outputs(real_outputs)
 
+  def _backprop_call_with_delayed_rewrite(self, args):
+    """Calls the inference function and records the result on a tape.
+
+    The recorded backwards function will construct the backwards graph and
+    rewrite the inference function to the forward function. This only happens
+    if the recorded backwards function ends up being used to compute gradients.
+
+    This approach avoids constructing unnecessary graphs, but it only works if
+    we are calling this function when not executing eagerly.
+
+    (Only records results on a tape if the function has outputs)
+
+    Args:
+      args: All inputs to the function, including resolved captured inputs
+
+    Returns:
+      The call output.
+    """
+    ctx = context.context()
+
+    if not self._gradient_name:
+      self._gradient_name = "PartitionedCall-%s" % ops.uid()
+      self._register_gradient(self._gradient_name)
+    with ops.get_default_graph().gradient_override_map(
+        {"PartitionedCall": self._gradient_name,
+         "StatefulPartitionedCall": self._gradient_name}):
+      outputs = self._inference_function.call(ctx, args)
+
+    if isinstance(outputs, ops.Operation) or outputs is None:
+      return outputs
+
+    call_op = outputs[0].op
+
+    def backward_function(*args):
+      return self._grad_fn(call_op, *args)
+
+    tape.record_operation(self._inference_function.signature.name, outputs,
+                          args, backward_function)
+    return self._build_call_outputs(outputs)
+
   def _build_call_outputs(self, result):
     """Maps the fdef output list to actual output structure.
 
@@ -724,7 +773,7 @@ class PolymorphicFunction(object):
                name,
                input_signature=None,
                attributes=None,
-               autograph=False):
+               autograph=True):
     """Initializes a polymorphic function.
 
     Args:
@@ -927,17 +976,23 @@ class PolymorphicFunction(object):
     """Computes the cache key given inputs and execution context."""
     if self._input_signature is None:
       inputs = (args, kwargs) if kwargs else args
-      cache_key = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
+      input_signature = pywrap_tensorflow.TFE_Py_EncodeArg(inputs)
     else:
       del args, kwargs
-      cache_key = self._flat_input_signature
+      input_signature = self._flat_input_signature
 
     ctx = context.context()
-    with ops.init_scope():
-      # The graph, or whether we're executing eagerly, should be a part of the
-      # cache key so we don't improperly capture tensors such as variables.
-      executing_eagerly = ctx.executing_eagerly()
-      execution_context = executing_eagerly or ops.get_default_graph()
+
+    # Don't need to open an init_scope if the _cache_key call is in eager mode
+    # already.
+    executing_eagerly = ctx.executing_eagerly()
+    parent_graph = None
+    if not executing_eagerly:
+      with ops.init_scope():
+        # The graph, or whether we're executing eagerly, should be a part of the
+        # cache key so we don't improperly capture tensors such as variables.
+        executing_eagerly = ctx.executing_eagerly()
+        parent_graph = None if executing_eagerly else ops.get_default_graph()
 
     # pylint: disable=protected-access
     default_graph = ops.get_default_graph()
@@ -966,8 +1021,8 @@ class PolymorphicFunction(object):
       else:
         device_functions = ()
     # pylint: enable=protected-access
-    return (cache_key, execution_context, device_functions, colocation_stack,
-            uses_xla)
+    return CacheKey(input_signature, parent_graph, device_functions,
+                    colocation_stack, uses_xla)
 
   def _canonicalize_function_inputs(self, *args, **kwargs):
     """Canonicalizes `args` and `kwargs`.
@@ -1039,16 +1094,21 @@ class PolymorphicFunction(object):
       return inputs, kwargs
     else:
       assert not kwargs
+      signature_relevant_inputs = inputs[:len(self._input_signature)]
       try:
-        nest.assert_same_structure(self._input_signature, inputs)
+        nest.assert_same_structure(self._input_signature,
+                                   signature_relevant_inputs)
       except (ValueError, TypeError):
         raise ValueError("Structure of Python function inputs does not match "
                          "input_signature.")
-      if any(not pywrap_tensorflow.IsTensor(arg) for arg in flat_inputs):
+      signature_inputs_flat = nest.flatten(signature_relevant_inputs)
+      if any(not pywrap_tensorflow.IsTensor(arg)
+             for arg in signature_inputs_flat):
         raise ValueError("When input_signature is provided, all inputs to "
                          "the Python function must be Tensors.")
       if any(not spec.is_compatible_with(other)
-             for spec, other in zip(self._flat_input_signature, flat_inputs)):
+             for spec, other in zip(self._flat_input_signature,
+                                    signature_inputs_flat)):
         raise ValueError("Python inputs incompatible with input_signature: "
                          "inputs (%s), input_signature (%s)" %
                          (str(inputs), str(self._input_signature)))
@@ -1083,6 +1143,9 @@ class PolymorphicFunction(object):
                         "must be hashable.")
 
       if graph_function is None:
+        logging.vlog(1,
+                     "Creating new FuncGraph for Python function %r (key: %r)",
+                     self._python_function, cache_key)
         if self._input_signature is None:
           arglen = len(args)
         else:
@@ -1137,7 +1200,7 @@ def validate_signature(signature):
                     "a possibly nested sequence of TensorSpec objects.")
 
 
-def defun(func=None, input_signature=None, autograph=False):
+def defun(func=None, input_signature=None, autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   `defun` (short for "define function") trace-compiles a Python function
@@ -1470,7 +1533,7 @@ def defun(func=None, input_signature=None, autograph=False):
 def defun_with_attributes(func=None,
                           input_signature=None,
                           attributes=None,
-                          autograph=False):
+                          autograph=True):
   """Compiles a Python function into a callable TensorFlow graph.
 
   This function supports adding extra function attributes. See detailed
diff --git a/tensorflow/python/eager/function_gradients_test.py b/tensorflow/python/eager/function_gradients_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..98dec0b361b76eadbb107a7cd42e4deba6f2ea25
--- /dev/null
+++ b/tensorflow/python/eager/function_gradients_test.py
@@ -0,0 +1,756 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variable_scope
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class FunctionGradientsTest(test.TestCase, parameterized.TestCase):
+
+  def testGraphModeWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    self.assertAllEqual(step(), 2.0)
+
+  def testGraphGradientVariable(self):
+    with ops.Graph().as_default(), self.cached_session():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f():
+        return 2.0 * v
+
+      node = f()
+      grads, = gradients_impl.gradients(node, v)
+      v.initializer.run()
+      self.assertAllEqual(grads.eval(), 2.0)
+      self.assertEqual(grads.shape, v.shape)
+
+  def testSymGradGatherNd(self):
+    with ops.Graph().as_default(), self.cached_session() as sess:
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertAllEqual(self.evaluate(g).values, [[1.0]])
+
+  def testNoSymGradNestedDefun(self):
+
+    @def_function.function
+    def outer():
+
+      @def_function.function
+      def f(x):
+        return array_ops.gather_nd(x, [[0]])
+
+      c = constant_op.constant([[2.]])
+      f_c = f(c)
+      g, = gradients_impl.gradients(f_c, c)
+      self.assertIsInstance(g, ops.IndexedSlices)
+
+    outer()
+
+  def testGraphFunctionWithGradients(self):
+    v = resource_variable_ops.ResourceVariable(1.0, name='v')
+
+    @def_function.function
+    def step():
+      def inner():
+        return v * v
+
+      return backprop.implicit_grad(inner)()[0][0]
+
+    step_op = step.get_concrete_function()
+    self.assertEqual(step_op.output_dtypes, dtypes.float32)
+    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
+    self.assertAllEqual(step_op(), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testDefunCondGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes()
+  def testGraphLoopGradient(self):
+
+    @def_function.function
+    def f(x):
+      return control_flow_ops.while_loop(lambda _, i: i < 2,
+                                         lambda x, i: (2*x, i + 1),
+                                         [x, 0])[0]
+
+    with backprop.GradientTape() as t:
+      x = constant_op.constant(1.0)
+      t.watch(x)
+      y = f(x)
+    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
+
+  def testDefunDifferentiable(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testDefunCanBeDifferentiatedTwice(self):
+    v = resource_variable_ops.ResourceVariable(1.0)
+
+    @def_function.function
+    def f():
+      return v * v
+
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+    # Ensure that v is watched again.
+    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
+
+  def testSymbolicGradientVariableNoneNotZerosLike(self):
+    with ops.Graph().as_default():
+      v = variables.Variable(1.0)
+
+      @def_function.function
+      def f(x, v):
+        v.read_value()
+        return x * x
+
+      x = constant_op.constant(1.0)
+      l = f(x, v)
+      _, dv = gradients_impl.gradients(l, [x, v])
+      with self.cached_session():
+        v.initializer.run()
+        self.assertEqual(dv, None)
+
+  def testDefunCallBackprop(self):
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, x)
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(f, [0])(x)[0]
+
+    self.assertAllEqual(2, g(constant_op.constant(2.)))
+
+  @test_util.run_v1_only('b/120545219')
+  def testGraphModeEagerGradError(self):
+    with context.graph_mode():
+      def f():
+        x = variable_scope.get_variable(
+            'v', initializer=constant_op.constant(1.0))
+        return x * constant_op.constant(2.0)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   'No trainable variables were accessed'):
+        backprop.implicit_val_and_grad(f)()
+
+  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
+
+    @def_function.function
+    def g(x):
+      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
+
+    def np_g(x):
+      return [d.numpy() for d in g(x)]
+
+    x = constant_op.constant(1.)
+    self.assertAllEqual([1., 1.], np_g(x))
+    self.assertAllEqual([1., 1.], np_g(1.))
+
+  def testGradientTensorConversionWithDefun(self):
+    three = resource_variable_ops.ResourceVariable(3.0, name='v')
+
+    @def_function.function
+    def f(x):
+      return math_ops.add(x, three)
+
+    def g(x):
+      return f(x)
+
+    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
+    self.assertAllEqual(g, 1.0)
+
+  def testGradient(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(x, x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+  def testGradientInFunction(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
+
+  def testGradientOfGatherWithDefun(self):
+    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
+
+    def sum_gather():
+      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
+
+    grad_fn = backprop.implicit_grad(sum_gather)
+    gradient = grad_fn()
+    defun_grad_fn = backprop.implicit_grad(def_function.function(sum_gather))
+    defun_gradient = defun_grad_fn()
+    self.assertEqual(len(gradient), len(defun_gradient))
+
+    gradient = gradient[0][0]
+    defun_gradient = defun_gradient[0][0]
+    self.assertAllEqual(gradient.values, defun_gradient.values)
+    self.assertAllEqual(gradient.indices, defun_gradient.indices)
+    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
+
+  def testDifferentiableFunctionNoneOutputs(self):
+
+    @def_function.function
+    def my_function(x):
+      return x, None
+
+    def wrapper(x):
+      return my_function(x)[0]
+
+    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
+    self.assertAllEqual(g[0], 1.)
+
+    @def_function.function
+    def foo(a):
+      return None, a * a
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      none, r = foo(x)
+    g = tp.gradient(r, x)
+
+    self.assertIs(none, None)
+    self.assertAllEqual(r, 25.0)
+    self.assertAllEqual(g, 2 * 5.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return a * math_ops.add(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return inner_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunction(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self):
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, 3.0)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(3.0, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      with backprop.GradientTape() as tp:
+        tp.watch(x)
+        result = middle_fn(x, 1.0)
+      grad = tp.gradient(result, x)
+      return grad
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self):
+    @def_function.function
+    def inner_inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def inner_fn(a, b):
+      return inner_inner_fn(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def almost_outer_fn(x):
+      result = middle_fn(x, 1.0)
+      return gradients_impl.gradients(result, [x])[0]
+
+    @def_function.function
+    def outer_fn(x):
+      return almost_outer_fn(x)
+
+    @def_function.function
+    def outer_outer_fn(x):
+      return outer_fn(x)
+
+    x = constant_op.constant(5.0)
+    grad = outer_outer_fn(x)
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariable(self):
+    var = variables.Variable(constant_op.constant(1.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return a * inner_fn(a, b)
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, var)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleGradCalls(self):
+    v = variables.Variable(constant_op.constant(3.0))
+
+    @def_function.function
+    def inner_fn(a, b):
+      return math_ops.add(a, b)
+
+    @def_function.function
+    def middle_fn(a, b):
+      return math_ops.mul(a, inner_fn(a, b))
+
+    @def_function.function
+    def outer_fn(x):
+      return middle_fn(x, v)
+
+    x = constant_op.constant(5.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+    self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+    with backprop.GradientTape() as tp:
+      tp.watch(x)
+      result = outer_fn(x)
+    grad = tp.gradient(result, x)
+
+    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+    y = constant_op.constant(4.0)
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+    v.assign(constant_op.constant(1.5))
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = outer_fn(y)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+    with backprop.GradientTape() as tp:
+      tp.watch(y)
+      result = inner_fn(y, v)
+    grad = tp.gradient(result, y)
+
+    self.assertAllEqual(grad, 1.0)
+
+  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleTFGrads(self):
+    with context.graph_mode(), self.cached_session():
+      v = resource_variable_ops.ResourceVariable(3.0)
+      v.initializer.run()
+
+      @def_function.function
+      def inner_fn(a, b):
+        return math_ops.add(a, b)
+
+      @def_function.function
+      def middle_fn(a, b):
+        return math_ops.mul(a, inner_fn(a, b))
+
+      @def_function.function
+      def outer_fn(x):
+        return middle_fn(x, v)
+
+      x = constant_op.constant(5.0)
+      self.assertAllEqual(outer_fn(x).eval(), 5.0 * (5.0 + 3.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+      self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
+      self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
+
+      grad, = gradients_impl.gradients(outer_fn(x), x)
+
+      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
+
+      y = constant_op.constant(4.0)
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+      self.assertAllEqual(grad, 2 * 4.0 + 3.0)
+
+      self.evaluate(v.assign(constant_op.constant(1.5)))
+      grad, = gradients_impl.gradients(outer_fn(y), y)
+
+      self.assertAllEqual(grad, 2 * 4.0 + 1.5)
+
+      grad, = gradients_impl.gradients(inner_fn(y, v), y)
+      self.assertAllEqual(grad, 1.0)
+
+  def testNestedDifferentiableFunctionNoneOutputs(self):
+    @def_function.function
+    def foo(a, b):
+      return None, a * math_ops.add(a, b), None, 2*a
+
+    @def_function.function
+    def bar(x):
+      return foo(x, 1.0)
+
+    x = constant_op.constant(5.0)
+    with backprop.GradientTape(persistent=True) as tp:
+      tp.watch(x)
+      none1, r1, none2, r2 = bar(x)
+    g1 = tp.gradient(r1, x)
+    g2 = tp.gradient(r2, x)
+
+    self.assertAllEqual(r1, 30.0)
+    self.assertAllEqual(r2, 10.0)
+    self.assertIs(none1, None)
+    self.assertIs(none2, None)
+    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
+    self.assertAllEqual(g2, 2.0)
+
+  def testGradientWithKeywordArguments(self):
+    matmul = def_function.function(math_ops.matmul)
+
+    def sq(x):
+      return matmul(a=x, b=x, transpose_a=True)
+
+    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
+    grad_t, = backprop.gradients_function(sq, [0])(t)
+    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
+
+    with backprop.GradientTape(persistent=True) as tape:
+      tape.watch(t)
+      one = matmul(t, b=t, transpose_a=True)
+      two = matmul(b=t, a=t, transpose_a=True)
+      three = matmul(a=t, b=t, transpose_a=True)
+
+    for output in [one, two, three]:
+      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
+
+  def testGradientInFunctionWithKeywordArguments(self):
+
+    @def_function.function
+    def f(x):
+      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
+
+    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testBackwardNone(self):
+    model = variables.Variable(1.0, name='model')
+    count = variables.Variable(0)
+
+    @function.defun
+    def forward_pass(value):
+      count.assign_add(1)
+      residuals = value - model
+      loss = 0.5 * math_ops.reduce_mean(math_ops.pow(residuals, 2))
+      # Note: count is an integer, so its doutput will be None
+      return loss, count
+
+    def reduce_fn(x):
+      if context.executing_eagerly():
+        with backprop.GradientTape() as t:
+          loss, count = forward_pass(x)
+        return t.gradient(loss, model), count
+      loss, count = forward_pass(x)
+      grad_only = gradients_impl.gradients(loss, model)
+      return grad_only, count
+
+    g, _ = reduce_fn(constant_op.constant([7.0]))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertAllEqual(nest.flatten(self.evaluate(g)), [-6.0])
+
+
+if __name__ == '__main__':
+  ops.enable_eager_execution(
+      config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  test.main()
diff --git a/tensorflow/python/eager/function_test.py b/tensorflow/python/eager/function_test.py
index 2af08689f8796d53ab0b8d7d041ec55898854ed2..50d1b4b6f77e203e1d9ebb278f1c356024a4226f 100644
--- a/tensorflow/python/eager/function_test.py
+++ b/tensorflow/python/eager/function_test.py
@@ -29,7 +29,6 @@ import numpy
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import keras
-from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
@@ -48,7 +47,6 @@ from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
@@ -102,10 +100,10 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       _ = x * y
       return x + y
 
-    # The default config allows everything.
-    rewrites = rewriter_config_pb2.RewriterConfig()
+    # The default config allows all rewrites.
+    config_proto = config_pb2.ConfigProto()
 
-    with context.function_rewriter_config(rewrites):
+    with context.function_config_proto(config_proto):
       t = constant_op.constant(1.0)
       self.assertAllEqual(add(t, t).numpy(), 2.0)
 
@@ -149,31 +147,22 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     out = a_times_b(pair({'a': t}, {'b': t}))
     self.assertAllEqual(out, math_ops.matmul(t, t).numpy())
 
-  def testGraphModeWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
-
-    @def_function.function
-    def step():
-      def inner():
-        return v * v
-
-      return backprop.implicit_grad(inner)()[0][0]
+  def testNestedOutputsGraphMode(self):
+    matmul = def_function.function(math_ops.matmul)
 
-    self.assertAllEqual(step(), 2.0)
+    pair = collections.namedtuple('pair', ['a', 'b'])
 
-  def testGraphGradientVariable(self):
-    with ops.Graph().as_default(), self.cached_session():
-      v = variables.Variable(1.0)
+    @def_function.function()
+    def pairs_mul(pair_a, pair_b):
+      return pair(matmul(pair_a.a, pair_b.a), matmul(pair_a.b, pair_b.b))
 
-      @def_function.function
-      def f():
-        return 2.0 * v
+    a = constant_op.constant([[1.0, 2.0], [1.0, 2.0]])
+    b = constant_op.constant([[3.0, 4.0], [3.0, 4.0]])
 
-      node = f()
-      grads, = gradients_impl.gradients(node, v)
-      v.initializer.run()
-      self.assertAllEqual(grads.eval(), 2.0)
-      self.assertEqual(grads.shape, v.shape)
+    out = pairs_mul(pair(a, b), pair(b, a))
+    expected = pair(math_ops.matmul(a, b).numpy(),
+                    math_ops.matmul(b, a).numpy())
+    self.assertAllClose(out, expected)
 
   def testGraphEagerIsolation(self):
 
@@ -314,34 +303,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     random_seed.set_random_seed(1)
     self.assertAllEqual(f(), x)
 
-  def testSymGradGatherNd(self):
-    with ops.Graph().as_default(), self.cached_session() as sess:
-
-      @def_function.function
-      def f(x):
-        return array_ops.gather_nd(x, [[0]])
-
-      c = constant_op.constant([[2.]])
-      f_c = f(c)
-      g, = gradients_impl.gradients(f_c, c)
-      self.assertAllEqual(sess.run(g).values, [[1.0]])
-
-  def testNoSymGradNestedDefun(self):
-
-    @def_function.function
-    def outer():
-
-      @def_function.function
-      def f(x):
-        return array_ops.gather_nd(x, [[0]])
-
-      c = constant_op.constant([[2.]])
-      f_c = f(c)
-      g, = gradients_impl.gradients(f_c, c)
-      self.assertIsInstance(g, ops.IndexedSlices)
-
-    outer()
-
   def testNestedInputsGraphFunction(self):
     matmul = def_function.function(math_ops.matmul)
 
@@ -378,21 +339,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(a, math_ops.matmul(t, t).numpy())
     self.assertAllEqual(b['b'].numpy(), 1.0)
 
-  def testGraphFunctionWithGradients(self):
-    v = resource_variable_ops.ResourceVariable(1.0, name='v')
-
-    @def_function.function
-    def step():
-      def inner():
-        return v * v
-
-      return backprop.implicit_grad(inner)()[0][0]
-
-    step_op = step.get_concrete_function()
-    self.assertEqual(step_op.output_dtypes, dtypes.float32)
-    self.assertEqual(step_op.output_shapes, tensor_shape.TensorShape([]))
-    self.assertAllEqual(step_op(), 2.0)
-
   def testGraphFunctionNoneOutput(self):
     @def_function.function
     def fn(unused_a, unused_b):
@@ -404,34 +350,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(fn_op.output_shapes, None)
     self.assertAllEqual(fn_op(x, x), None)
 
-  @test_util.run_in_graph_and_eager_modes()
-  def testDefunCondGradient(self):
-
-    @def_function.function
-    def f(x):
-      return control_flow_ops.cond(x > 0.5, lambda: 2 * x, lambda: 3 * x)
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 2.0)
-
-  @test_util.run_in_graph_and_eager_modes()
-  def testGraphLoopGradient(self):
-
-    @def_function.function
-    def f(x):
-      return control_flow_ops.while_loop(lambda _, i: i < 2,
-                                         lambda x, i: (2*x, i + 1),
-                                         [x, 0])[0]
-
-    with backprop.GradientTape() as t:
-      x = constant_op.constant(1.0)
-      t.watch(x)
-      y = f(x)
-    self.assertAllEqual(self.evaluate(t.gradient(y, x)), 4.0)
-
   def testDefunNumpyArraysConvertedToTensors(self):
 
     def f(x):
@@ -510,20 +428,21 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       self.evaluate(variables.global_variables_initializer())
     self.assertEqual(self.evaluate(value), 2.0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.also_run_as_tf_function
   def testInitScopeTensorInitializationInFunction(self):
 
     @def_function.function
     def tensor_init():
       with ops.init_scope():
         const = constant_op.constant(2.0)
+      # Note: this variable bypasses tf.function's variable creation
+      # requirements by bypassing variable_creator_scope by using
+      # ResourceVariable instead of Variable.
       self.v = resource_variable_ops.ResourceVariable(const)
       return self.v.read_value()
 
     value = tensor_init()
-    if not context.executing_eagerly():
-      self.evaluate(variables.global_variables_initializer())
-    self.assertEqual(self.evaluate(value), 2.0)
+    self.assertAllEqual(value, 2.0)
 
   def testDefunShapeInferenceWithCapturedResourceVariable(self):
     v = resource_variable_ops.ResourceVariable([[1, 2], [3, 4]])
@@ -625,27 +544,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertIsInstance(
         self.v, resource_variable_ops.ResourceVariable)
 
-  def testDefunDifferentiable(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-
-    @def_function.function
-    def f():
-      return v * v
-
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
-  def testDefunCanBeDifferentiatedTwice(self):
-    v = resource_variable_ops.ResourceVariable(1.0)
-
-    @def_function.function
-    def f():
-      return v * v
-
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-    # Ensure that v is watched again.
-    self.assertAllEqual(backprop.implicit_grad(f)()[0][0], 2.0)
-
-  def testRunMetadata(self):
+  def disabled_testRunMetadata(self):
 
     @def_function.function
     def f(x):
@@ -683,23 +582,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       variables.global_variables_initializer().run()
       call = def_function.function(o.call)
       op = call()
-      self.assertAllEqual(sess.run(op), 2.0)
-
-  def testSymbolicGradientVariableNoneNotZerosLike(self):
-    with ops.Graph().as_default():
-      v = variables.Variable(1.0)
-
-      @def_function.function
-      def f(x, v):
-        v.read_value()
-        return x * x
-
-      x = constant_op.constant(1.0)
-      l = f(x, v)
-      _, dv = gradients_impl.gradients(l, [x, v])
-      with self.cached_session():
-        v.initializer.run()
-        self.assertEqual(dv, None)
+      self.assertAllEqual(self.evaluate(op), 2.0)
 
   def testGraphModeManyFunctions(self):
     with ops.Graph().as_default(), self.cached_session():
@@ -742,42 +625,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     self.assertAllEqual(8, g(constant_op.constant(2)))
 
-  def testDefunCallBackprop(self):
-
-    @def_function.function
-    def f(x):
-      return math_ops.add(x, x)
-
-    @def_function.function
-    def g(x):
-      return backprop.gradients_function(f, [0])(x)[0]
-
-    self.assertAllEqual(2, g(constant_op.constant(2.)))
-
-  def testGraphModeEagerGradError(self):
-    with context.graph_mode():
-      def f():
-        x = variable_scope.get_variable(
-            'v', initializer=constant_op.constant(1.0))
-        return x * constant_op.constant(2.0)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   'No trainable variables were accessed'):
-        backprop.implicit_val_and_grad(f)()
-
-  def testDefunCallBackpropUsingSameObjectForMultipleArguments(self):
-
-    @def_function.function
-    def g(x):
-      return backprop.gradients_function(math_ops.multiply, [0, 1])(x, x)
-
-    def np_g(x):
-      return [d.numpy() for d in g(x)]
-
-    x = constant_op.constant(1.)
-    self.assertAllEqual([1., 1.], np_g(x))
-    self.assertAllEqual([1., 1.], np_g(1.))
-
   def testCallShape(self):
 
     @def_function.function
@@ -808,37 +655,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
     g(three)
 
-  def testGradientTensorConversionWithDefun(self):
-    three = resource_variable_ops.ResourceVariable(3.0, name='v')
-
-    @def_function.function
-    def f(x):
-      return math_ops.add(x, three)
-
-    def g(x):
-      return f(x)
-
-    g = backprop.implicit_grad(g)(constant_op.constant(1.0))[0][0]
-    self.assertAllEqual(g, 1.0)
-
-  def testGradient(self):
-    matmul = def_function.function(math_ops.matmul)
-
-    def sq(x):
-      return matmul(x, x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-  def testGradientInFunction(self):
-
-    @def_function.function
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(constant_op.constant(1.0)), 2.0)
-
   def testGatherResourceWithDefun(self):
     with ops.device('cpu:0'):
       v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
@@ -849,24 +665,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     defined = def_function.function(sum_gather)
     self.assertAllEqual(sum_gather(), defined())
 
-  def testGradientOfGatherWithDefun(self):
-    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])
-
-    def sum_gather():
-      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))
-
-    grad_fn = backprop.implicit_grad(sum_gather)
-    gradient = grad_fn()
-    defun_grad_fn = backprop.implicit_grad(def_function.function(sum_gather))
-    defun_gradient = defun_grad_fn()
-    self.assertEqual(len(gradient), len(defun_gradient))
-
-    gradient = gradient[0][0]
-    defun_gradient = defun_gradient[0][0]
-    self.assertAllEqual(gradient.values, defun_gradient.values)
-    self.assertAllEqual(gradient.indices, defun_gradient.indices)
-    self.assertAllEqual(gradient.dense_shape, defun_gradient.dense_shape)
-
   def testReturningIndexedSlicesWithDefun(self):
 
     def validate(indexed_slice):
@@ -1012,440 +810,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     shape = constant_op.constant([2, 1]).gpu()
     reshape(value, shape)  # No error is raised
 
-  def testDifferentiableFunctionNoneOutputs(self):
-
-    @def_function.function
-    def my_function(x):
-      return x, None
-
-    def wrapper(x):
-      return my_function(x)[0]
-
-    g = backprop.gradients_function(wrapper, [0])(constant_op.constant(0.0))
-    self.assertAllEqual(g[0], 1.)
-
-    @def_function.function
-    def foo(a):
-      return None, a * a
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      none, r = foo(x)
-    g = tp.gradient(r, x)
-
-    self.assertIs(none, None)
-    self.assertAllEqual(r, 25.0)
-    self.assertAllEqual(g, 2 * 5.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testNestedDifferentiableFunction(self):
-    @def_function.function
-    def inner_fn(a, b):
-      return a * math_ops.add(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      return inner_fn(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunction(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionWithMultipleGradCalls(self):
-    @def_function.function
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return math_ops.mul(a, inner_fn(a, b))
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, 3.0)
-
-    x = constant_op.constant(5.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-    self.assertAllEqual(middle_fn(3.0, x), 3.0 * (3.0 + 5.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-    y = constant_op.constant(4.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = inner_fn(y, y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInNestedDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def almost_outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    @def_function.function
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionGradientTapeInMultNestedDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def almost_outer_fn(x):
-      with backprop.GradientTape() as tp:
-        tp.watch(x)
-        result = middle_fn(x, 1.0)
-      grad = tp.gradient(result, x)
-      return grad
-
-    @def_function.function
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    @def_function.function
-    def outer_outer_fn(x):
-      return outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInNestedDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def almost_outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    @def_function.function
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  @test_util.run_in_graph_and_eager_modes
-  def testDeeplyNestedDifferentiableFunctionTFGradientInMultNestedDefun(self):
-    @def_function.function
-    def inner_inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def inner_fn(a, b):
-      return inner_inner_fn(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def almost_outer_fn(x):
-      result = middle_fn(x, 1.0)
-      return gradients_impl.gradients(result, [x])[0]
-
-    @def_function.function
-    def outer_fn(x):
-      return almost_outer_fn(x)
-
-    @def_function.function
-    def outer_outer_fn(x):
-      return outer_fn(x)
-
-    x = constant_op.constant(5.0)
-    grad = outer_outer_fn(x)
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariable(self):
-    var = variables.Variable(constant_op.constant(1.0))
-
-    @def_function.function
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return a * inner_fn(a, b)
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, var)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleGradCalls(self):
-    v = variables.Variable(constant_op.constant(3.0))
-
-    @def_function.function
-    def inner_fn(a, b):
-      return math_ops.add(a, b)
-
-    @def_function.function
-    def middle_fn(a, b):
-      return math_ops.mul(a, inner_fn(a, b))
-
-    @def_function.function
-    def outer_fn(x):
-      return middle_fn(x, v)
-
-    x = constant_op.constant(5.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-    self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-    self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
-
-    with backprop.GradientTape() as tp:
-      tp.watch(x)
-      result = outer_fn(x)
-    grad = tp.gradient(result, x)
-
-    self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-    y = constant_op.constant(4.0)
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-    v.assign(constant_op.constant(1.5))
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = outer_fn(y)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 2 * 4.0 + 1.5)
-
-    with backprop.GradientTape() as tp:
-      tp.watch(y)
-      result = inner_fn(y, v)
-    grad = tp.gradient(result, y)
-
-    self.assertAllEqual(grad, 1.0)
-
-  def testDeeplyNestedDifferentiableFunctionWithVariableMultipleTFGrads(self):
-    with context.graph_mode(), self.cached_session():
-      v = resource_variable_ops.ResourceVariable(3.0)
-      v.initializer.run()
-
-      @def_function.function
-      def inner_fn(a, b):
-        return math_ops.add(a, b)
-
-      @def_function.function
-      def middle_fn(a, b):
-        return math_ops.mul(a, inner_fn(a, b))
-
-      @def_function.function
-      def outer_fn(x):
-        return middle_fn(x, v)
-
-      x = constant_op.constant(5.0)
-      self.assertAllEqual(outer_fn(x).eval(), 5.0 * (5.0 + 3.0))
-
-      grad, = gradients_impl.gradients(outer_fn(x), x)
-
-      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-      self.assertAllEqual(outer_fn(x), 5.0 * (5.0 + 3.0))
-      self.assertAllEqual(middle_fn(v, x), 3.0 * (3.0 + 5.0))
-
-      grad, = gradients_impl.gradients(outer_fn(x), x)
-
-      self.assertAllEqual(grad, 2 * 5.0 + 3.0)
-
-      y = constant_op.constant(4.0)
-      grad, = gradients_impl.gradients(outer_fn(y), y)
-      self.assertAllEqual(grad, 2 * 4.0 + 3.0)
-
-      self.evaluate(v.assign(constant_op.constant(1.5)))
-      grad, = gradients_impl.gradients(outer_fn(y), y)
-
-      self.assertAllEqual(grad, 2 * 4.0 + 1.5)
-
-      grad, = gradients_impl.gradients(inner_fn(y, v), y)
-      self.assertAllEqual(grad, 1.0)
-
-  def testNestedDifferentiableFunctionNoneOutputs(self):
-    @def_function.function
-    def foo(a, b):
-      return None, a * math_ops.add(a, b), None, 2*a
-
-    @def_function.function
-    def bar(x):
-      return foo(x, 1.0)
-
-    x = constant_op.constant(5.0)
-    with backprop.GradientTape(persistent=True) as tp:
-      tp.watch(x)
-      none1, r1, none2, r2 = bar(x)
-    g1 = tp.gradient(r1, x)
-    g2 = tp.gradient(r2, x)
-
-    self.assertAllEqual(r1, 30.0)
-    self.assertAllEqual(r2, 10.0)
-    self.assertIs(none1, None)
-    self.assertIs(none2, None)
-    self.assertAllEqual(g1, 2 * 5.0 + 1.0)
-    self.assertAllEqual(g2, 2.0)
-
   def testNoneOutput(self):
 
     @def_function.function
@@ -1599,6 +963,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
   # construction. Eager's configuration is controlled in `__main__`.
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 4}))
+  @test_util.run_v1_only('b/120545219')
   def testDeviceAnnotationsRespected(self):
 
     def multi_device_fn():
@@ -1637,6 +1002,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
   @test_util.run_in_graph_and_eager_modes(
       config=config_pb2.ConfigProto(device_count={'CPU': 2}))
+  @test_util.run_v1_only('b/120545219')
   def testCallingGraphFunctionOnDifferentDevice(self):
 
     def func():
@@ -1925,8 +1291,9 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       defined(array_ops.ones([2, 1]))
 
     # Wrong number of arguments.
-    with self.assertRaisesRegexp(ValueError,
-                                 'Structure of Python function inputs.*'):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Arguments and signature arguments do not match.*'):
       defined(array_ops.ones([2]), array_ops.ones([2]))
     with self.assertRaisesRegexp(ValueError,
                                  'Structure of Python function inputs.*'):
@@ -1945,10 +1312,16 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
       else:
         return -1.0 * a
 
-    signature = [tensor_spec.TensorSpec([], dtypes.float32)] * 2
+    signature = [
+        tensor_spec.TensorSpec([], dtypes.float32),
+        tensor_spec.TensorSpec([], dtypes.bool),
+    ]
     defined = def_function.function(foo, input_signature=signature)
     a = constant_op.constant(1.0)
-    with self.assertRaises(TypeError):
+    with self.assertRaisesRegexp(
+        ValueError,
+        'When input_signature is provided, all inputs to '
+        'the Python function must be Tensors.'):
       defined(a, training=True)
 
   def testInputSignatureWithKeywordPositionalArgs(self):
@@ -2039,33 +1412,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(six, 2.0)
     self.assertAllEqual(seven, 2.0)
 
-  def testGradientWithKeywordArguments(self):
-    matmul = def_function.function(math_ops.matmul)
-
-    def sq(x):
-      return matmul(a=x, b=x, transpose_a=True)
-
-    t = constant_op.constant([[1.0, 2.0], [3.0, 4.0]])
-    grad_t, = backprop.gradients_function(sq, [0])(t)
-    self.assertAllEqual(grad_t, [[6, 6], [14, 14]])
-
-    with backprop.GradientTape(persistent=True) as tape:
-      tape.watch(t)
-      one = matmul(t, b=t, transpose_a=True)
-      two = matmul(b=t, a=t, transpose_a=True)
-      three = matmul(a=t, b=t, transpose_a=True)
-
-    for output in [one, two, three]:
-      self.assertAllEqual(tape.gradient(output, t), [[6, 6], [14, 14]])
-
-  def testGradientInFunctionWithKeywordArguments(self):
-
-    @def_function.function
-    def f(x):
-      return backprop.gradients_function(lambda y: y * y, [0])(x)[0]
-
-    self.assertAllEqual(f(x=constant_op.constant(1.0)), 2.0)
-
   def testDefuningInstanceMethod(self):
 
     integer = constant_op.constant(2, dtypes.int64)
@@ -2339,33 +1685,6 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
         # pylint: disable=protected-access
         self.assertEqual(len(graph._functions), 3)
 
-  @test_util.run_in_graph_and_eager_modes
-  def testBackwardNone(self):
-    model = variables.Variable(1.0, name='model')
-    count = variables.Variable(0)
-
-    @function.defun
-    def forward_pass(value):
-      count.assign_add(1)
-      residuals = value - model
-      loss = 0.5 * math_ops.reduce_mean(math_ops.pow(residuals, 2))
-      # Note: count is an integer, so its doutput will be None
-      return loss, count
-
-    def reduce_fn(x):
-      if context.executing_eagerly():
-        with backprop.GradientTape() as t:
-          loss, count = forward_pass(x)
-        return t.gradient(loss, model), count
-      loss, count = forward_pass(x)
-      grad_only = gradients_impl.gradients(loss, model)
-      return grad_only, count
-
-    g, _ = reduce_fn(constant_op.constant([7.0]))
-
-    self.evaluate(variables.global_variables_initializer())
-    self.assertAllEqual(nest.flatten(self.evaluate(g)), [-6.0])
-
   def testCallingFunctionWithDifferentVariables(self):
 
     @function.defun
@@ -2403,8 +1722,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
                                  'be Tensors;.*'):
       graph_function('Not a Tensor.')
 
-  # TODO(scottzhu): Revive the test once the grappler plugin is updated.
-  def disabled_testSwapImplementationWithGrapplerPlugin(self):
+  def testSwapImplementationWithGrapplerPlugin(self):
     rewrites = rewriter_config_pb2.RewriterConfig()
     # function_optimizer has to be turn off, otherwise it will delete the
     # registered function if it does not get called.
@@ -2441,7 +1759,7 @@ class FunctionTest(test.TestCase, parameterized.TestCase):
 
       function.register(cpu_boost, x)
       y = gpu_boost(x)
-      y_value = sess.run(y)
+      y_value = self.evaluate(y)
 
       if test.is_gpu_available():
         self.assertEqual(y_value, 5.0)
diff --git a/tensorflow/python/eager/graph_only_ops_test.py b/tensorflow/python/eager/graph_only_ops_test.py
index 3cf3a61a62b1b22f092ad505017fd54f278b3f95..914b4d9a95ab307a41d1a3c0dba453475edc3956 100644
--- a/tensorflow/python/eager/graph_only_ops_test.py
+++ b/tensorflow/python/eager/graph_only_ops_test.py
@@ -29,12 +29,14 @@ from tensorflow.python.platform import test
 
 class GraphOnlyOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testGraphZerosLike(self):
     x = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int32)
     z_tf = graph_only_ops.graph_zeros_like(x)
     with self.cached_session():
-      self.assertAllClose(np.zeros((2, 3)), z_tf.eval())
+      self.assertAllClose(np.zeros((2, 3)), self.evaluate(z_tf))
 
+  @test_util.run_deprecated_v1
   def testGraphPlaceholder(self):
     x_tf = graph_only_ops.graph_placeholder(dtypes.int32, shape=(1,))
     y_tf = math_ops.square(x_tf)
diff --git a/tensorflow/python/eager/pywrap_tensor.cc b/tensorflow/python/eager/pywrap_tensor.cc
index 55f0896e3b4c1beb714d53e712584fedc9841e4e..30a93fb0e421e0b26f517a03302d2e96913d8b9a 100644
--- a/tensorflow/python/eager/pywrap_tensor.cc
+++ b/tensorflow/python/eager/pywrap_tensor.cc
@@ -220,6 +220,14 @@ TFE_TensorHandle* ConvertToEagerTensor(PyObject* value, PyObject* dtype) {
       return nullptr;
     }
   }
+  tensorflow::Safe_PyObjectPtr value_decrefer;
+  if (PyArray_IsScalar(value, Generic)) {
+    // Convert numpy scalars to numpy arrays.
+    value = PyArray_FromScalar(value, nullptr);
+    // The returned value needs to be DECREF'd, but the original value was
+    // created in python code, and doesn't need to be DECREF'd.
+    value_decrefer.reset(value);
+  }
   if (PyArray_Check(value)) {
     int desired_np_dtype = -1;
     if (desired_dtype >= 0) {
@@ -439,8 +447,8 @@ int EagerTensor_init(EagerTensor* self, PyObject* args, PyObject* kwds) {
       PyErr_SetString(
           PyExc_TypeError,
           tensorflow::strings::StrCat(
-              "Cannot convert value ", TFE_GetPythonString(value_str.get()),
-              " to EagerTensor with requested dtype: ",
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(value_str.get()), " Requested dtype: ",
               tensorflow::DataTypeString(
                   static_cast<tensorflow::DataType>(desired_dtype)))
               .c_str());
@@ -672,11 +680,29 @@ static PyObject* EagerTensor_device(EagerTensor* self) {
 #endif
 }
 
+// Getter `backing_device`.
+static PyObject* EagerTensor_backing_device(EagerTensor* self) {
+  const char* device =
+      TFE_TensorHandleBackingDeviceName(self->handle, self->status);
+  if (MaybeRaiseExceptionFromTFStatus(self->status, PyExc_ValueError)) {
+    // Cleanup self->status before returning.
+    TF_SetStatus(self->status, TF_OK, "");
+    return nullptr;
+  }
+#if PY_MAJOR_VERSION >= 3
+  return PyUnicode_FromString(device);
+#else
+  return PyBytes_FromString(device);
+#endif
+}
+
 static PyGetSetDef EagerTensor_getseters[] = {
     {const_cast<char*>("_id"), (getter)EagerTensor_getid, nullptr,
      const_cast<char*>("_id"), nullptr},
     {const_cast<char*>("device"), (getter)EagerTensor_device, nullptr,
      const_cast<char*>("device"), nullptr},
+    {const_cast<char*>("backing_device"), (getter)EagerTensor_backing_device,
+     nullptr, const_cast<char*>("backing_device"), nullptr},
     {const_cast<char*>("_handle_data"), (getter)EagerTensor_tensor_handle,
      (setter)EagerTensor_settensor_handle, const_cast<char*>("_tensor_handle"),
      nullptr},
diff --git a/tensorflow/python/eager/pywrap_tfe_src.cc b/tensorflow/python/eager/pywrap_tfe_src.cc
index 6ca8eadbdebbd854565d11d7c35b8cccc8ef7c7c..9ce500bc08e478815f2dbe1d5d5353eefa4f17a8 100644
--- a/tensorflow/python/eager/pywrap_tfe_src.cc
+++ b/tensorflow/python/eager/pywrap_tfe_src.cc
@@ -1645,6 +1645,29 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   if (PyErr_Occurred()) {
     return nullptr;
   }
+  tensorflow::gtl::FlatSet<tensorflow::int64> sources_set(sources_vec.begin(),
+                                                          sources_vec.end());
+
+  tensorflow::Safe_PyObjectPtr seq =
+      tensorflow::make_safe(PySequence_Fast(target, "expected a sequence"));
+  int len = PySequence_Fast_GET_SIZE(seq.get());
+  tensorflow::gtl::FlatMap<tensorflow::int64, PyTapeTensor>
+      source_tensors_that_are_targets;
+  for (int i = 0; i < len; ++i) {
+    tensorflow::int64 target_id = target_vec[i];
+    if (sources_set.find(target_id) != sources_set.end()) {
+      auto tensor = PySequence_Fast_GET_ITEM(seq.get(), i);
+      source_tensors_that_are_targets.insert(
+          std::make_pair(target_id, TapeTensorFromTensor(tensor)));
+    }
+    if (PyErr_Occurred()) {
+      return nullptr;
+    }
+  }
+  if (PyErr_Occurred()) {
+    return nullptr;
+  }
+
   std::vector<PyObject*> outgrad_vec;
   if (output_gradients != Py_None) {
     outgrad_vec = MakeTensorList(output_gradients);
@@ -1659,7 +1682,8 @@ PyObject* TFE_Py_TapeGradient(PyObject* tape, PyObject* target,
   }
   std::vector<PyObject*> result;
   status->status = tape_obj->tape->ComputeGradient(
-      *py_vspace, target_vec, sources_vec, outgrad_vec, &result);
+      *py_vspace, target_vec, sources_vec, source_tensors_that_are_targets,
+      outgrad_vec, &result);
   if (!status->status.ok()) {
     if (PyErr_Occurred()) {
       // Do not propagate the erroneous status as that would swallow the
@@ -2279,8 +2303,10 @@ bool ConvertToTensor(
       PyErr_SetString(
           PyExc_TypeError,
           tensorflow::strings::StrCat(
-              "Cannot convert value ", TFE_GetPythonString(input_str.get()),
-              " to EagerTensor with requested dtype: ", desired_dtype)
+              "Cannot convert provided value to EagerTensor. Provided value: ",
+              TFE_GetPythonString(input_str.get()), " Requested dtype: ",
+              tensorflow::DataTypeString(
+                  static_cast<tensorflow::DataType>(desired_dtype)))
               .c_str());
       return false;
     }
diff --git a/tensorflow/python/eager/tape.py b/tensorflow/python/eager/tape.py
index 1326f09713065503b2bb359c6c997a0801680dc0..e501b403a39144a673e8ac5155edf0498425bcd6 100644
--- a/tensorflow/python/eager/tape.py
+++ b/tensorflow/python/eager/tape.py
@@ -63,7 +63,7 @@ def watch_variable(tape, variable):
   """Marks this variable to be watched by the given tape."""
   strategy = distribution_strategy_context.get_distribution_strategy()
   if distribution_strategy_context.get_replica_context():
-    variables = [strategy.value_container(variable)]
+    variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
   for var in variables:
@@ -78,7 +78,7 @@ def variable_accessed(variable):
   """
   strategy = distribution_strategy_context.get_distribution_strategy()
   if distribution_strategy_context.get_replica_context():
-    variables = [strategy.value_container(variable)]
+    variables = [strategy.extended.value_container(variable)]
   else:
     variables = strategy.unwrap(variable)
   for var in variables:
diff --git a/tensorflow/python/eager/tape_test.py b/tensorflow/python/eager/tape_test.py
index acd0e569f11a90e2cc53e113f59df6f072a6de42..48d3b8ac6ee0fb5b747caf32b034f82959611292 100644
--- a/tensorflow/python/eager/tape_test.py
+++ b/tensorflow/python/eager/tape_test.py
@@ -80,8 +80,8 @@ class TapeTest(test.TestCase):
       tf_e = tf_d + tf_f
       tf_da, tf_db = gradients_impl.gradients(tf_e, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testBasicFunctional(self):
 
@@ -142,8 +142,8 @@ class TapeTest(test.TestCase):
       tf_rr = 2 * math_ops.reduce_sum(tf_mm)
       tf_da, tf_db = gradients_impl.gradients(tf_rr, [tf_a, tf_b])
 
-      self.assertAllEqual(da, tf_da.eval())
-      self.assertAllEqual(db, tf_db.eval())
+      self.assertAllEqual(da, self.evaluate(tf_da))
+      self.assertAllEqual(db, self.evaluate(tf_db))
 
   def testGcTwoOutputs(self):
 
diff --git a/tensorflow/python/eager/tensor_test.py b/tensorflow/python/eager/tensor_test.py
index f61d84781770b04db235e6ec700f3241c4faeabb..0ee2ff68c209aa13aaeb32be610302c11616b9d7 100644
--- a/tensorflow/python/eager/tensor_test.py
+++ b/tensorflow/python/eager/tensor_test.py
@@ -95,6 +95,18 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     t = _create_tensor(values)
     self.assertAllEqual(values, t)
 
+  @test_util.assert_no_new_pyobjects_executing_eagerly
+  def testNumpyDtypeSurvivesThroughTensorConversion(self):
+    scalar_creators = [np.int32, np.int64, np.float32, np.float64]
+    conversion_functions = [ops.convert_to_tensor, constant_op.constant]
+
+    for scalar_creator in scalar_creators:
+      for conversion_function in conversion_functions:
+        np_val = scalar_creator(3)
+        tensor_val = conversion_function(np_val)
+        self.assertEqual(tensor_val.numpy().dtype, np_val.dtype)
+        self.assertEqual(tensor_val.numpy(), np_val)
+
   def testNumpyValueWithCast(self):
     values = np.array([3.0], dtype=np.float32)
     t = _create_tensor(values, dtype=dtypes.float64)
@@ -128,6 +140,23 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     tensor = constant_op.constant(numpy_tensor)
     self.assertAllEqual(numpy_tensor.ndim, tensor.ndim)
 
+  def testLenAgreesWithNumpy(self):
+    numpy_tensor = np.asarray(1.0)
+    tensor = constant_op.constant(numpy_tensor)
+    with self.assertRaises(TypeError):
+      len(numpy_tensor)
+    with self.assertRaisesRegexp(
+        TypeError, r"Scalar tensor has no `len[(][)]`"):
+      len(tensor)
+
+    numpy_tensor = np.asarray([1.0, 2.0, 3.0])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
+    numpy_tensor = np.asarray([[1.0, 2.0, 3.0], [1.0, 2.0, 3.0]])
+    tensor = constant_op.constant(numpy_tensor)
+    self.assertAllEqual(len(numpy_tensor), len(tensor))
+
   def testCopy(self):
     t = constant_op.constant(1.0)
     tt = copy.copy(t)
@@ -158,9 +187,13 @@ class TFETensorTest(test_util.TensorFlowTestCase):
     self.assertEqual(dtypes.float64, t.dtype)
 
   def testBool(self):
-    t = _create_tensor(False)
-    if t:
-      self.assertFalse(True)
+    self.assertFalse(bool(_create_tensor(False)))
+    self.assertFalse(bool(_create_tensor([False])))
+    self.assertFalse(bool(_create_tensor([[False]])))
+    self.assertFalse(bool(_create_tensor([0])))
+    self.assertFalse(bool(_create_tensor([0.])))
+    self.assertTrue(bool(_create_tensor([1])))
+    self.assertTrue(bool(_create_tensor([1.])))
 
   def testIntDowncast(self):
     t = _create_tensor(3)
@@ -306,6 +339,14 @@ class TFETensorTest(test_util.TensorFlowTestCase):
   def testConvertToTensorAllowsOverflow(self):
     _ = ops.convert_to_tensor(123456789, dtype=dtypes.uint8)
 
+  def testEagerTensorError(self):
+    with self.assertRaisesRegexp(
+        TypeError,
+        "Cannot convert provided value to EagerTensor. "
+        "Provided value.*Requested dtype.*"):
+      _ = ops.convert_to_tensor(1., dtype=dtypes.int32)
+
+
 
 class TFETensorUtilTest(test_util.TensorFlowTestCase):
 
diff --git a/tensorflow/python/eager/test.py b/tensorflow/python/eager/test.py
index 33ee797678ed73c52ebb17723f688cec4feca402..a45deac962de931ebd8a8804cea7fef2b3f97629 100644
--- a/tensorflow/python/eager/test.py
+++ b/tensorflow/python/eager/test.py
@@ -24,6 +24,6 @@ from tensorflow.python.platform.test import *  # pylint: disable=wildcard-import
 
 
 # TODO(akshayka): Do away with this file.
-def main(argv=None):
+def main(argv=None):  # pylint: disable=function-redefined
   _ops.enable_eager_execution()
   _test.main(argv)
diff --git a/tensorflow/python/eager/wrap_function.py b/tensorflow/python/eager/wrap_function.py
index 48266437ef510d0439d88f0f6c8fe88c979a9497..2b39e99a4ea5d145f9bb8cef5c5931c306bcaeea 100644
--- a/tensorflow/python/eager/wrap_function.py
+++ b/tensorflow/python/eager/wrap_function.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 class VariableHolder(object):
@@ -45,6 +46,7 @@ class VariableHolder(object):
       return self._fn(*args, **kwargs)
 
 
+# TODO(allenl): make this checkpointable
 class WrappedFunction(function.Function):
   """Wraps a tf V1 piece of code in a function."""
 
@@ -77,6 +79,7 @@ class WrappedFunction(function.Function):
     return pruned_fn
 
 
+@tf_export(v1=["wrap_function"])
 def wrap_function(fn, signature, name=None):
   """Wraps the TF 1.x function fn into a graph function.
 
@@ -109,6 +112,21 @@ def wrap_function(fn, signature, name=None):
   assert float(f_sub(1.0)) == 3.0
   ```
 
+  Both `tf.compat.v1.wrap_function` and `tf.function` create a callable
+  TensorFlow graph. But while `tf.function` runs all stateful operations
+  (e.g. `tf.print`) and sequences operations to provide the same semantics as
+  eager execution, `wrap_function` is closer to the behavior of `session.run` in
+  TensorFlow 1.x. It will not run any operations unless they are required to
+  compute the function's outputs, either through a data dependency or a control
+  dependency. Nor will it sequence operations.
+
+  Unlike `tf.function`, `wrap_function` will only trace the Python function
+  once. As with placeholders in TF 1.x, shapes and dtypes must be provided to
+  `wrap_function`'s `signature` argument.
+
+  Since it is only traced once, variables and state may be created inside the
+  function and owned by the function wrapper object.
+
   Args:
     fn: python function to be wrapped
     signature: the placeholder and python arguments to be passed to the
diff --git a/tensorflow/python/feature_column/feature_column.py b/tensorflow/python/feature_column/feature_column.py
index b7a6a88535fd55467e73eceab6272b15f8411267..a858d92608db1a0d9d00b34f91860b7d4be01d68 100644
--- a/tensorflow/python/feature_column/feature_column.py
+++ b/tensorflow/python/feature_column/feature_column.py
@@ -230,7 +230,7 @@ def _internal_input_layer(features,
       return _get_logits()
 
 
-@tf_export('feature_column.input_layer')
+@tf_export(v1=['feature_column.input_layer'])
 def input_layer(features,
                 feature_columns,
                 weight_collections=None,
@@ -365,7 +365,7 @@ class InputLayer(object):
     return self._input_layer_template.weights
 
 
-@tf_export('feature_column.linear_model')
+@tf_export(v1=['feature_column.linear_model'])
 def linear_model(features,
                  feature_columns,
                  units=1,
@@ -746,7 +746,7 @@ def _transform_features(features, feature_columns):
   return outputs
 
 
-@tf_export('feature_column.make_parse_example_spec')
+@tf_export(v1=['feature_column.make_parse_example_spec'])
 def make_parse_example_spec(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
@@ -807,11 +807,14 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-@tf_export('feature_column.embedding_column')
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
+def _embedding_column(categorical_column,
+                      dimension,
+                      combiner='mean',
+                      initializer=None,
+                      ckpt_to_load_from=None,
+                      tensor_name_in_ckpt=None,
+                      max_norm=None,
+                      trainable=True):
   """`_DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
@@ -919,178 +922,11 @@ def embedding_column(
       trainable=trainable)
 
 
-@tf_export('feature_column.shared_embedding_columns')
-def shared_embedding_columns(
-    categorical_columns, dimension, combiner='mean', initializer=None,
-    shared_embedding_collection_name=None, ckpt_to_load_from=None,
-    tensor_name_in_ckpt=None, max_norm=None, trainable=True):
-  """List of dense columns that convert from sparse, categorical input.
-
-  This is similar to `embedding_column`, except that it produces a list of
-  embedding columns that share the same embedding weights.
-
-  Use this when your inputs are sparse and of the same type (e.g. watched and
-  impression video IDs that share the same vocabulary), and you want to convert
-  them to a dense representation (e.g., to feed to a DNN).
-
-  Inputs must be a list of categorical columns created by any of the
-  `categorical_column_*` function. They must all be of the same type and have
-  the same arguments except `key`. E.g. they can be
-  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
-  all columns could also be weighted_categorical_column.
-
-  Here is an example embedding of two features for a DNNClassifier model:
-
-  ```python
-  watched_video_id = categorical_column_with_vocabulary_file(
-      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-  impression_video_id = categorical_column_with_vocabulary_file(
-      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-  columns = shared_embedding_columns(
-      [watched_video_id, impression_video_id], dimension=10)
-
-  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
-
-  label_column = ...
-  def input_fn():
-    features = tf.parse_example(
-        ..., features=make_parse_example_spec(columns + [label_column]))
-    labels = features.pop(label_column.name)
-    return features, labels
-
-  estimator.train(input_fn=input_fn, steps=100)
-  ```
-
-  Here is an example using `shared_embedding_columns` with model_fn:
-
-  ```python
-  def model_fn(features, ...):
-    watched_video_id = categorical_column_with_vocabulary_file(
-        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
-    impression_video_id = categorical_column_with_vocabulary_file(
-        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
-    columns = shared_embedding_columns(
-        [watched_video_id, impression_video_id], dimension=10)
-    dense_tensor = input_layer(features, columns)
-    # Form DNN layers, calculate loss, and return EstimatorSpec.
-    ...
-  ```
-
-  Args:
-    categorical_columns: List of categorical columns created by a
-      `categorical_column_with_*` function. These columns produce the sparse IDs
-      that are inputs to the embedding lookup. All columns must be of the same
-      type and have the same arguments except `key`. E.g. they can be
-      categorical_column_with_vocabulary_file with the same vocabulary_file.
-      Some or all columns could also be weighted_categorical_column.
-    dimension: An integer specifying dimension of the embedding, must be > 0.
-    combiner: A string specifying how to reduce if there are multiple entries
-      in a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
-      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
-      with bag-of-words columns. Each of this can be thought as example level
-      normalizations on the column. For more information, see
-      `tf.embedding_lookup_sparse`.
-    initializer: A variable initializer function to be used in embedding
-      variable initialization. If not specified, defaults to
-      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
-      `1/sqrt(dimension)`.
-    shared_embedding_collection_name: Optional name of the collection where
-      shared embedding weights are added. If not given, a reasonable name will
-      be chosen based on the names of `categorical_columns`. This is also used
-      in `variable_scope` when creating shared embedding weights.
-    ckpt_to_load_from: String representing checkpoint name/pattern from which to
-      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
-    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from
-      which to restore the column weights. Required if `ckpt_to_load_from` is
-      not `None`.
-    max_norm: If not `None`, each embedding is clipped if its l2-norm is
-      larger than this value, before combining.
-    trainable: Whether or not the embedding is trainable. Default is True.
-
-  Returns:
-    A list of dense columns that converts from sparse input. The order of
-    results follows the ordering of `categorical_columns`.
-
-  Raises:
-    ValueError: if `dimension` not > 0.
-    ValueError: if any of the given `categorical_columns` is of different type
-      or has different arguments than the others.
-    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
-      is specified.
-    ValueError: if `initializer` is specified and is not callable.
-    RuntimeError: if eager execution is enabled.
-  """
-  if context.executing_eagerly():
-    raise RuntimeError('shared_embedding_columns are not supported when eager '
-                       'execution is enabled.')
-
-  if (dimension is None) or (dimension < 1):
-    raise ValueError('Invalid dimension {}.'.format(dimension))
-  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
-    raise ValueError('Must specify both `ckpt_to_load_from` and '
-                     '`tensor_name_in_ckpt` or none of them.')
-
-  if (initializer is not None) and (not callable(initializer)):
-    raise ValueError('initializer must be callable if specified.')
-  if initializer is None:
-    initializer = init_ops.truncated_normal_initializer(
-        mean=0.0, stddev=1. / math.sqrt(dimension))
-
-  # Sort the columns so the default collection name is deterministic even if the
-  # user passes columns from an unsorted collection, such as dict.values().
-  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
-
-  c0 = sorted_columns[0]
-  num_buckets = c0._num_buckets  # pylint: disable=protected-access
-  if not isinstance(c0, _CategoricalColumn):
-    raise ValueError(
-        'All categorical_columns must be subclasses of _CategoricalColumn. '
-        'Given: {}, of type: {}'.format(c0, type(c0)))
-  if isinstance(c0, _WeightedCategoricalColumn):
-    c0 = c0.categorical_column
-  for c in sorted_columns[1:]:
-    if isinstance(c, _WeightedCategoricalColumn):
-      c = c.categorical_column
-    if not isinstance(c, type(c0)):
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same type, or be weighted_categorical_column of the same type. '
-          'Given column: {} of type: {} does not match given column: {} of '
-          'type: {}'.format(c0, type(c0), c, type(c)))
-    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
-      raise ValueError(
-          'To use shared_embedding_column, all categorical_columns must have '
-          'the same number of buckets. Given column: {} with buckets: {} does  '
-          'not match column: {} with buckets: {}'.format(
-              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
-
-  if not shared_embedding_collection_name:
-    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
-    shared_embedding_collection_name += '_shared_embedding'
-
-  result = []
-  for column in categorical_columns:
-    result.append(
-        _SharedEmbeddingColumn(
-            categorical_column=column,
-            initializer=initializer,
-            dimension=dimension,
-            combiner=combiner,
-            shared_embedding_collection_name=shared_embedding_collection_name,
-            ckpt_to_load_from=ckpt_to_load_from,
-            tensor_name_in_ckpt=tensor_name_in_ckpt,
-            max_norm=max_norm,
-            trainable=trainable))
-
-  return result
-
-
-@tf_export('feature_column.numeric_column')
-def numeric_column(key,
-                   shape=(1,),
-                   default_value=None,
-                   dtype=dtypes.float32,
-                   normalizer_fn=None):
+def _numeric_column(key,
+                    shape=(1,),
+                    default_value=None,
+                    dtype=dtypes.float32,
+                    normalizer_fn=None):
   """Represents real valued or numerical features.
 
   Example:
@@ -1161,8 +997,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
-@tf_export('feature_column.bucketized_column')
-def bucketized_column(source_column, boundaries):
+def _bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
   Buckets include the left boundary, and exclude the right boundary. Namely,
@@ -1258,10 +1093,9 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
-@tf_export('feature_column.categorical_column_with_hash_bucket')
-def categorical_column_with_hash_bucket(key,
-                                        hash_bucket_size,
-                                        dtype=dtypes.string):
+def _categorical_column_with_hash_bucket(key,
+                                         hash_bucket_size,
+                                         dtype=dtypes.string):
   """Represents sparse feature where ids are set by hashing.
 
   Use this when your sparse features are in string or integer format, and you
@@ -1317,13 +1151,12 @@ def categorical_column_with_hash_bucket(key,
   return _HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_file')
-def categorical_column_with_vocabulary_file(key,
-                                            vocabulary_file,
-                                            vocabulary_size=None,
-                                            num_oov_buckets=0,
-                                            default_value=None,
-                                            dtype=dtypes.string):
+def _categorical_column_with_vocabulary_file(key,
+                                             vocabulary_file,
+                                             vocabulary_size=None,
+                                             num_oov_buckets=0,
+                                             default_value=None,
+                                             dtype=dtypes.string):
   """A `_CategoricalColumn` with a vocabulary file.
 
   Use this when your inputs are in string or integer format, and you have a
@@ -1437,9 +1270,11 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-@tf_export('feature_column.categorical_column_with_vocabulary_list')
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+def _categorical_column_with_vocabulary_list(key,
+                                             vocabulary_list,
+                                             dtype=None,
+                                             default_value=-1,
+                                             num_oov_buckets=0):
   """A `_CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
@@ -1548,8 +1383,7 @@ def categorical_column_with_vocabulary_list(
       default_value=default_value, num_oov_buckets=num_oov_buckets)
 
 
-@tf_export('feature_column.categorical_column_with_identity')
-def categorical_column_with_identity(key, num_buckets, default_value=None):
+def _categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `_CategoricalColumn` that returns identity values.
 
   Use this when your inputs are integers in the range `[0, num_buckets)`, and
@@ -1616,8 +1450,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, num_buckets=num_buckets, default_value=default_value)
 
 
-@tf_export('feature_column.indicator_column')
-def indicator_column(categorical_column):
+def _indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
   - For DNN model, `indicator_column` can be used to wrap any
@@ -1651,9 +1484,9 @@ def indicator_column(categorical_column):
   return _IndicatorColumn(categorical_column)
 
 
-@tf_export('feature_column.weighted_categorical_column')
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
+def _weighted_categorical_column(categorical_column,
+                                 weight_feature_key,
+                                 dtype=dtypes.float32):
   """Applies weight values to a `_CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
@@ -1726,8 +1559,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
-@tf_export('feature_column.crossed_column')
-def crossed_column(keys, hash_bucket_size, hash_key=None):
+def _crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
   Crossed features will be hashed according to `hash_bucket_size`. Conceptually,
diff --git a/tensorflow/python/feature_column/feature_column_lib.py b/tensorflow/python/feature_column/feature_column_lib.py
index 3b818f18b5b0fce99b81e51ce89e58c72cab0b91..68a2712425c56ae4b3e42c6bd7ae497c0358a074 100644
--- a/tensorflow/python/feature_column/feature_column_lib.py
+++ b/tensorflow/python/feature_column/feature_column_lib.py
@@ -20,4 +20,5 @@ from __future__ import print_function
 
 # pylint: disable=unused-import,line-too-long,wildcard-import
 from tensorflow.python.feature_column.feature_column import *
+from tensorflow.python.feature_column.feature_column_v2 import *
 # pylint: enable=unused-import,line-too-long
diff --git a/tensorflow/python/feature_column/feature_column_test.py b/tensorflow/python/feature_column/feature_column_test.py
index 1ae510250cfd030d965d0480599d4e333fe30b50..daa0a3b3a4bb5fd067681c5ca91eaccdc64d3144 100644
--- a/tensorflow/python/feature_column/feature_column_test.py
+++ b/tensorflow/python/feature_column/feature_column_test.py
@@ -30,7 +30,8 @@ from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_v2 as fc_new
 from tensorflow.python.feature_column.feature_column import _CategoricalColumn
 from tensorflow.python.feature_column.feature_column import _DenseColumn
 from tensorflow.python.feature_column.feature_column import _FeatureColumn
@@ -169,6 +170,7 @@ class LazyColumnTest(test.TestCase):
         TypeError, '"key" must be either a "str" or "_FeatureColumn".'):
       builder.get(NotAFeatureColumn())
 
+  @test_util.run_deprecated_v1
   def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
     # empty 1-D sparse tensor:
     builder = _LazyBuilder(features={'a': sparse_tensor.SparseTensor(
@@ -184,8 +186,9 @@ class LazyColumnTest(test.TestCase):
 
 class NumericColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     self.assertEqual('aaa', a.key)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
@@ -196,53 +199,53 @@ class NumericColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.numeric_column(key=('aaa',))
+      fc._numeric_column(key=('aaa',))
 
   def test_shape_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual((1, 2), a.shape)
 
   def test_default_value_saved_as_tuple(self):
-    a = fc.numeric_column('aaa', default_value=4.)
+    a = fc._numeric_column('aaa', default_value=4.)
     self.assertEqual((4.,), a.default_value)
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3, 2.]])
     self.assertEqual(((3., 2.),), a.default_value)
 
   def test_shape_and_default_value_compatibility(self):
-    fc.numeric_column('aaa', shape=[2], default_value=[1, 2.])
+    fc._numeric_column('aaa', shape=[2], default_value=[1, 2.])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
-    fc.numeric_column(
+      fc._numeric_column('aaa', shape=[2], default_value=[1, 2, 3.])
+    fc._numeric_column(
         'aaa', shape=[3, 2], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 1], default_value=[[2, 3], [1, 2], [2, 3.]])
     with self.assertRaisesRegexp(ValueError, 'The shape of default_value'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[3, 3], default_value=[[2, 3], [1, 2], [2, 3.]])
 
   def test_default_value_type_check(self):
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.float32)
-    fc.numeric_column(
+    fc._numeric_column(
         'aaa', shape=[2], default_value=[1, 2], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError, 'must be compatible with dtype'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[2], default_value=[1, 2.], dtype=dtypes.int32)
     with self.assertRaisesRegexp(TypeError,
                                  'default_value must be compatible with dtype'):
-      fc.numeric_column('aaa', default_value=['string'])
+      fc._numeric_column('aaa', default_value=['string'])
 
   def test_shape_must_be_positive_integer(self):
     with self.assertRaisesRegexp(TypeError, 'shape dimensions must be integer'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               1.0,
           ])
 
     with self.assertRaisesRegexp(ValueError,
                                  'shape dimensions must be greater than 0'):
-      fc.numeric_column(
+      fc._numeric_column(
           'aaa', shape=[
               0,
           ])
@@ -250,20 +253,21 @@ class NumericColumnTest(test.TestCase):
   def test_dtype_is_convertible_to_float(self):
     with self.assertRaisesRegexp(ValueError,
                                  'dtype must be convertible to float'):
-      fc.numeric_column('aaa', dtype=dtypes.string)
+      fc._numeric_column('aaa', dtype=dtypes.string)
 
   def test_scalar_default_value_fills_the_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], default_value=2.)
+    a = fc._numeric_column('aaa', shape=[2, 3], default_value=2.)
     self.assertEqual(((2., 2., 2.), (2., 2., 2.)), a.default_value)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
+    a = fc._numeric_column('aaa', shape=[2, 3], dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
     }, a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_no_default_value(self):
-    price = fc.numeric_column('price', shape=[2])
+    price = fc._numeric_column('price', shape=[2])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -277,8 +281,9 @@ class NumericColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+  @test_util.run_deprecated_v1
   def test_parse_example_with_default_value(self):
-    price = fc.numeric_column('price', shape=[2], default_value=11.)
+    price = fc._numeric_column('price', shape=[2], default_value=11.)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -301,29 +306,31 @@ class NumericColumnTest(test.TestCase):
 
   def test_normalizer_fn_must_be_callable(self):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
-      fc.numeric_column('price', normalizer_fn='NotACallable')
+      fc._numeric_column('price', normalizer_fn='NotACallable')
 
+  @test_util.run_deprecated_v1
   def test_normalizer_fn_transform_feature(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     output = _transform_features({'price': [[1., 2.], [5., 6.]]}, [price])
     with self.cached_session():
       self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
-    price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
+    price = fc._numeric_column('price', shape=[2], normalizer_fn=_increment_two)
     builder = _LazyBuilder({'price': [[1., 2.], [5., 6.]]})
     self.assertEqual(builder.get(price), price._get_dense_tensor(builder))
 
   def test_sparse_tensor_not_supported(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -332,109 +339,113 @@ class NumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       price._transform_feature(builder)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
+    a = fc._numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
     a_copy = copy.deepcopy(a)
     self.assertEqual(a_copy.name, 'aaa')
     self.assertEqual(a_copy.shape, (1, 2))
     self.assertEqual(a_copy.default_value, ((3., 2.),))
 
   def test_numpy_default_value(self):
-    a = fc.numeric_column(
+    a = fc._numeric_column(
         'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
     self.assertEqual(a.default_value, ((3., 2.),))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
 
 class BucketizedColumnTest(test.TestCase):
 
   def test_invalid_source_column_type(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
+    a = fc._categorical_column_with_hash_bucket('aaa', hash_bucket_size=10)
     with self.assertRaisesRegexp(
         ValueError,
         'source_column must be a column generated with numeric_column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_source_column_shape(self):
-    a = fc.numeric_column('aaa', shape=[2, 3])
+    a = fc._numeric_column('aaa', shape=[2, 3])
     with self.assertRaisesRegexp(
         ValueError, 'source_column must be one-dimensional column'):
-      fc.bucketized_column(a, boundaries=[0, 1])
+      fc._bucketized_column(a, boundaries=[0, 1])
 
   def test_invalid_boundaries(self):
-    a = fc.numeric_column('aaa')
+    a = fc._numeric_column('aaa')
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=None)
+      fc._bucketized_column(a, boundaries=None)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=1.)
+      fc._bucketized_column(a, boundaries=1.)
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 0])
+      fc._bucketized_column(a, boundaries=[1, 0])
     with self.assertRaisesRegexp(
         ValueError, 'boundaries must be a sorted list'):
-      fc.bucketized_column(a, boundaries=[1, 1])
+      fc._bucketized_column(a, boundaries=[1, 1])
 
   def test_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('aaa', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual('aaa_bucketized', b._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     self.assertEqual({
         'aaa': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32)
     }, b._parse_example_spec)
 
   def test_variable_shape(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> variable_shape=[2, 3].
     self.assertAllEqual((2, 3), b._variable_shape)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('aaa', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
     # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
     self.assertEqual(6, b._num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -448,9 +459,10 @@ class BucketizedColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       transformed_tensor = _transform_features({
           'price': [[-1., 1.], [5., 6.]]
@@ -461,24 +473,22 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_dense_tensor_one_input_value(self):
     """Tests _get_dense_tensor() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session():
         bucketized_price_tensor = bucketized_price._get_dense_tensor(builder)
         self.assertAllClose(
             # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+             [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session():
@@ -487,12 +497,12 @@ class BucketizedColumnTest(test.TestCase):
             # One-hot tensor.
             [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
              [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+            self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1.], [1.], [5.], [6.]]})
       with _initialized_session() as sess:
@@ -506,8 +516,8 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_two_input_values(self):
     """Tests _get_sparse_tensors() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       builder = _LazyBuilder({'price': [[-1., 1.], [5., 6.]]})
       with _initialized_session() as sess:
@@ -522,8 +532,8 @@ class BucketizedColumnTest(test.TestCase):
         self.assertAllEqual([2, 2], id_tensor_value.dense_shape)
 
   def test_sparse_tensor_input_not_supported(self):
-    price = fc.numeric_column('price')
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 1])
+    price = fc._numeric_column('price')
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 1])
     builder = _LazyBuilder({
         'price':
             sparse_tensor.SparseTensor(
@@ -532,9 +542,10 @@ class BucketizedColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       bucketized_price._transform_feature(builder)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('aaa', shape=[2])
-    a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
+    a = fc._numeric_column('aaa', shape=[2])
+    a_bucketized = fc._bucketized_column(a, boundaries=[0, 1])
     a_bucketized_copy = copy.deepcopy(a_bucketized)
     self.assertEqual(a_bucketized_copy.name, 'aaa_bucketized')
     self.assertAllEqual(a_bucketized_copy._variable_shape, (2, 3))
@@ -542,45 +553,48 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = fc.linear_model(features, [bucketized_price])
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(bucketized_price_var.assign(
             [[10.], [20.], [30.], [40.], [50.],
              [60.], [70.], [80.], [90.], [100.]]))
@@ -590,14 +604,14 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_keras_linear_model_one_input_value(self):
     """Tests _LinearModel for input with shape=[1]."""
-    price = fc.numeric_column('price', shape=[1])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[1])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -605,25 +619,28 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_keras_linear_model_two_input_values(self):
     """Tests _LinearModel for input with shape=[2]."""
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1., 1.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -631,12 +648,12 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
                                          [60.], [70.], [80.], [90.], [100.]]))
@@ -646,15 +663,16 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
 
 class HashedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
     self.assertEqual('aaa', a._var_scope_name)
     self.assertEqual('aaa', a.key)
@@ -663,25 +681,26 @@ class HashedCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_hash_bucket(('key',), 10)
+      fc._categorical_column_with_hash_bucket(('key',), 10)
 
   def test_bucket_size_should_be_given(self):
     with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be set.'):
-      fc.categorical_column_with_hash_bucket('aaa', None)
+      fc._categorical_column_with_hash_bucket('aaa', None)
 
   def test_bucket_size_should_be_positive(self):
     with self.assertRaisesRegexp(ValueError,
                                  'hash_bucket_size must be at least 1'):
-      fc.categorical_column_with_hash_bucket('aaa', 0)
+      fc._categorical_column_with_hash_bucket('aaa', 0)
 
   def test_dtype_should_be_string_or_integer(self):
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
-    fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.string)
+    fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
+      fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_hash_bucket('aaa', 10)
+    original = fc._categorical_column_with_hash_bucket('aaa', 10)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(10, column.hash_bucket_size)
@@ -689,19 +708,20 @@ class HashedCategoricalColumnTest(test.TestCase):
       self.assertEqual(dtypes.string, column.dtype)
 
   def test_parse_spec_string(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.string)
     }, a._parse_example_spec)
 
   def test_parse_spec_int(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.int32)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_hash_bucket('aaa', 10)
+    a = fc._categorical_column_with_hash_bucket('aaa', 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -721,8 +741,9 @@ class HashedCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_strings_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -739,11 +760,11 @@ class HashedCategoricalColumnTest(test.TestCase):
                           output.dense_shape.eval())
 
   def test_tensor_dtype_should_be_string_or_integer(self):
-    string_fc = fc.categorical_column_with_hash_bucket(
+    string_fc = fc._categorical_column_with_hash_bucket(
         'a_string', 10, dtype=dtypes.string)
-    int_fc = fc.categorical_column_with_hash_bucket(
+    int_fc = fc._categorical_column_with_hash_bucket(
         'a_int', 10, dtype=dtypes.int32)
-    float_fc = fc.categorical_column_with_hash_bucket(
+    float_fc = fc._categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
         values=[101],
@@ -768,7 +789,7 @@ class HashedCategoricalColumnTest(test.TestCase):
       builder.get(float_fc)
 
   def test_dtype_should_match_with_tensor(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -776,8 +797,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       builder.get(hashed_sparse)
 
+  @test_util.run_deprecated_v1
   def test_ints_should_be_hashed(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=[101, 201, 301],
@@ -790,8 +812,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
+  @test_util.run_deprecated_v1
   def test_int32_64_is_compatible(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket(
+    hashed_sparse = fc._categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
     wire_tensor = sparse_tensor.SparseTensor(
         values=constant_op.constant([101, 201, 301], dtype=dtypes.int32),
@@ -804,8 +827,9 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.cached_session():
       self.assertAllEqual(expected_values, output.values.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({
         'wire':
             sparse_tensor.SparseTensor(
@@ -818,7 +842,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_hash_bucket('aaa', 10)
+    column = fc._categorical_column_with_hash_bucket('aaa', 10)
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -832,15 +856,17 @@ class HashedCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     builder = _LazyBuilder({'wire': (('omar', ''), ('stringer', 'marlo'))})
     id_weight_pair = hashed_sparse._get_sparse_tensors(builder)
     self.assertIsNone(id_weight_pair.weight_tensor)
     self.assertEqual(builder.get(hashed_sparse), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -852,16 +878,18 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
+    wire_column = fc._categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -874,13 +902,14 @@ class HashedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 3: wire_var[3] = 4
         # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+        self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
 
 class CrossedColumnTest(test.TestCase):
@@ -888,100 +917,102 @@ class CrossedColumnTest(test.TestCase):
   def test_keys_empty(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column([], 10)
+      fc._crossed_column([], 10)
 
   def test_keys_length_one(self):
     with self.assertRaisesRegexp(
         ValueError, 'keys must be a list with length > 1'):
-      fc.crossed_column(['a'], 10)
+      fc._crossed_column(['a'], 10)
 
   def test_key_type_unsupported(self):
     with self.assertRaisesRegexp(ValueError, 'Unsupported key type'):
-      fc.crossed_column(['a', fc.numeric_column('c')], 10)
+      fc._crossed_column(['a', fc._numeric_column('c')], 10)
 
     with self.assertRaisesRegexp(
         ValueError, 'categorical_column_with_hash_bucket is not supported'):
-      fc.crossed_column(
-          ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
+      fc._crossed_column(
+          ['a', fc._categorical_column_with_hash_bucket('c', 10)], 10)
 
   def test_hash_bucket_size_negative(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], -1)
+      fc._crossed_column(['a', 'c'], -1)
 
   def test_hash_bucket_size_zero(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], 0)
+      fc._crossed_column(['a', 'c'], 0)
 
   def test_hash_bucket_size_none(self):
     with self.assertRaisesRegexp(
         ValueError, 'hash_bucket_size must be > 1'):
-      fc.crossed_column(['a', 'c'], None)
+      fc._crossed_column(['a', 'c'], None)
 
   def test_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'c', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'c', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_name_leaf_keys_ordered_alphabetically(self):
     """Tests that the name does not depend on the order of given columns."""
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d2', 'c'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d2', 'c'], 10)
 
-    crossed2 = fc.crossed_column([crossed1, 'd1', b], 10)
+    crossed2 = fc._crossed_column([crossed1, 'd1', b], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_var_scope_name(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
 
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 10)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2._var_scope_name)
 
   def test_parse_spec(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 10)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 10)
     self.assertEqual({
         'a': parsing_ops.FixedLenFeature((2,), dtype=dtypes.int32),
         'c': parsing_ops.VarLenFeature(dtypes.string),
     }, crossed._parse_example_spec)
 
   def test_num_buckets(self):
-    a = fc.numeric_column('a', shape=[2], dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed = fc.crossed_column([b, 'c'], 15)
+    a = fc._numeric_column('a', shape=[2], dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed = fc._crossed_column([b, 'c'], 15)
     self.assertEqual(15, crossed._num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32)
-    b = fc.bucketized_column(a, boundaries=[0, 1])
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32)
+    b = fc._bucketized_column(a, boundaries=[0, 1])
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     crossed2_copy = copy.deepcopy(crossed2)
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
     self.assertEqual(15, crossed2_copy.hash_bucket_size)
     self.assertEqual(5, crossed2_copy.hash_key)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
-    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'], 10)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'price':
@@ -1004,12 +1035,13 @@ class CrossedColumnTest(test.TestCase):
       self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
       self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    price = fc.numeric_column('price', shape=[2])
-    bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
+    price = fc._numeric_column('price', shape=[2])
+    bucketized_price = fc._bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc._crossed_column([bucketized_price, 'wire'],
+                                          hash_bucket_size)
     features = {
         'price': constant_op.constant([[1., 2.], [5., 6.]]),
         'wire': sparse_tensor.SparseTensor(
@@ -1020,18 +1052,19 @@ class CrossedColumnTest(test.TestCase):
     outputs = _transform_features(features, [price_cross_wire])
     output = outputs[price_cross_wire]
     with self.cached_session() as sess:
-      output_val = sess.run(output)
+      output_val = self.evaluate(output)
       self.assertAllEqual(
           [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
       for val in output_val.values:
         self.assertIn(val, list(range(hash_bucket_size)))
       self.assertAllEqual([2, 4], output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed1 = fc.crossed_column(['d1', 'd2'], 10)
-    crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed1 = fc._crossed_column(['d1', 'd2'], 10)
+    crossed2 = fc._crossed_column([b, 'c', crossed1], 15, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1069,9 +1102,9 @@ class CrossedColumnTest(test.TestCase):
 
   def test_get_sparse_tensors_simple(self):
     """Same as test_get_sparse_tensors, but with simpler values."""
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       builder = _LazyBuilder({
           'a':
@@ -1094,14 +1127,15 @@ class CrossedColumnTest(test.TestCase):
         self.assertAllEqual(expected_values, id_tensor_eval.values)
         self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     """Tests linear_model.
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
           'a': constant_op.constant(((-1., .5), (.5, 1.))),
@@ -1113,15 +1147,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
     class _TestColumnWithWeights(_CategoricalColumn):
@@ -1155,7 +1189,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1175,14 +1209,15 @@ class CrossedColumnTest(test.TestCase):
                 dense_shape=(2, 2)),
         }, (crossed,))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     """Tests _LinearModel.
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
-    b = fc.bucketized_column(a, boundaries=(0, 1))
-    crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
+    a = fc._numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    b = fc._bucketized_column(a, boundaries=(0, 1))
+    crossed = fc._crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
           'a':
@@ -1196,15 +1231,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_keras_linear_model_with_weights(self):
 
@@ -1242,7 +1277,7 @@ class CrossedColumnTest(test.TestCase):
             id_tensor=ids_and_weights[0], weight_tensor=ids_and_weights[1])
 
     t = _TestColumnWithWeights()
-    crossed = fc.crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
+    crossed = fc._crossed_column([t, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
       with self.assertRaisesRegexp(
           ValueError,
@@ -1331,31 +1366,31 @@ class LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.linear_model(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1366,15 +1401,16 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1389,7 +1425,7 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -1442,25 +1478,25 @@ class LinearModelTest(test.TestCase):
         sess.run(dense_and_sparse_column_var.assign(
             [[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1471,29 +1507,29 @@ class LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
                 1000., 1100., 1200.
             ], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -1504,7 +1540,7 @@ class LinearModelTest(test.TestCase):
       predictions = fc.linear_model(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1514,7 +1550,7 @@ class LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -1528,11 +1564,11 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    wire_cast_weights = fc.weighted_categorical_column(wire_cast, 'weights')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast_weights = fc._weighted_categorical_column(wire_cast, 'weights')
 
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
@@ -1550,25 +1586,25 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = fc.linear_model(features, [price], units=3)
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -1577,22 +1613,22 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = fc.linear_model(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -1603,18 +1639,18 @@ class LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -1627,8 +1663,8 @@ class LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -1653,13 +1689,13 @@ class LinearModelTest(test.TestCase):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    apple_numeric_column = fc.numeric_column('apple_numeric_column')
-    banana_dense_feature = fc.numeric_column('banana_dense_feature')
-    banana_dense_feature_bucketized = fc.bucketized_column(
+    apple_numeric_column = fc._numeric_column('apple_numeric_column')
+    banana_dense_feature = fc._numeric_column('banana_dense_feature')
+    banana_dense_feature_bucketized = fc._bucketized_column(
         banana_dense_feature, boundaries=[0.])
-    cherry_sparse_column = fc.categorical_column_with_hash_bucket(
+    cherry_sparse_column = fc._categorical_column_with_hash_bucket(
         'cherry_sparse_feature', hash_bucket_size=5)
-    dragonfruit_embedding_column = fc.embedding_column(
+    dragonfruit_embedding_column = fc._embedding_column(
         cherry_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -1684,7 +1720,7 @@ class LinearModelTest(test.TestCase):
       self.assertItemsEqual(input_layer_inputs, output_tensors)
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], weight_collections=['my-vars'])
@@ -1695,7 +1731,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1709,7 +1745,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price])
@@ -1720,7 +1756,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1733,7 +1769,7 @@ class LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       fc.linear_model(features, [price], trainable=False)
@@ -1741,7 +1777,7 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -1751,9 +1787,9 @@ class LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -1787,8 +1823,8 @@ class LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -1800,9 +1836,9 @@ class LinearModelTest(test.TestCase):
       fc.linear_model(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1815,8 +1851,8 @@ class LinearModelTest(test.TestCase):
         fc.linear_model(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -1830,8 +1866,8 @@ class LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -1846,10 +1882,16 @@ class LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1873,14 +1915,21 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(price, boundaries=[0., 10., 100.,])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
+        price, boundaries=[
+            0.,
+            10.,
+            100.,
+        ])
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -1917,8 +1966,9 @@ class LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -1939,7 +1989,7 @@ class LinearModelTest(test.TestCase):
         sess.run(net, feed_dict={features['price']: np.array(1)})
 
   def test_multiple_linear_models(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features1 = {'price': [[1.], [5.]]}
       features2 = {'price': [[2.], [10.]]}
@@ -1950,14 +2000,14 @@ class LinearModelTest(test.TestCase):
       price_var1 = get_linear_model_column_var(price, name='linear_model')
       price_var2 = get_linear_model_column_var(price, name='linear_model_1')
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
 
 class _LinearModelTest(test.TestCase):
@@ -1996,31 +2046,31 @@ class _LinearModelTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.linear_model(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       get_keras_linear_model_predictions(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_dense_bias(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2031,15 +2081,16 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
-    price = fc.numeric_column('price')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2055,7 +2106,7 @@ class _LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -2114,10 +2165,10 @@ class _LinearModelTest(test.TestCase):
             dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
                                                 [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2125,15 +2176,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2145,29 +2196,29 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                   [1000., 1100.,
                                    1200.], [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = array_ops.sparse_placeholder(dtypes.string)
       wire_value = sparse_tensor.SparseTensorValue(
@@ -2178,7 +2229,7 @@ class _LinearModelTest(test.TestCase):
       predictions = get_keras_linear_model_predictions(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -2188,7 +2239,7 @@ class _LinearModelTest(test.TestCase):
             predictions.eval(feed_dict={wire_tensor: wire_value}))
 
   def test_sparse_combiner(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default():
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar', 'stringer', 'marlo'],  # hashed to = [2, 0, 3]
@@ -2202,10 +2253,10 @@ class _LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       predictions = get_keras_linear_model_predictions(
@@ -2213,15 +2264,15 @@ class _LinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2230,22 +2281,22 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price])
 
   def test_dense_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       predictions = get_keras_linear_model_predictions(features, [price])
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       predictions = get_keras_linear_model_predictions(features,
@@ -2254,18 +2305,18 @@ class _LinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       cols_to_vars = {}
@@ -2279,8 +2330,8 @@ class _LinearModelTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[price2], [price2_var])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2', shape=3)
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2', shape=3)
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [6., 7.]],
@@ -2303,7 +2354,7 @@ class _LinearModelTest(test.TestCase):
         self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
 
   def test_dense_collection(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(
@@ -2315,7 +2366,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, my_vars)
 
   def test_sparse_collection(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2329,7 +2380,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, my_vars)
 
   def test_dense_trainable_default(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price])
@@ -2340,7 +2391,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(price_var, trainable_vars)
 
   def test_sparse_trainable_default(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2353,7 +2404,7 @@ class _LinearModelTest(test.TestCase):
       self.assertIn(wire_cast_var, trainable_vars)
 
   def test_dense_trainable_false(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default() as g:
       features = {'price': [[1.], [5.]]}
       get_keras_linear_model_predictions(features, [price], trainable=False)
@@ -2361,7 +2412,7 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_sparse_trainable_false(self):
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       wire_tensor = sparse_tensor.SparseTensor(
           values=['omar'], indices=[[0, 0]], dense_shape=[1, 1])
@@ -2371,9 +2422,9 @@ class _LinearModelTest(test.TestCase):
       self.assertEqual([], trainable_vars)
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
-    wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
+    wire_cast = fc._categorical_column_with_hash_bucket('wire_cast', 4)
     with ops.Graph().as_default() as g:
       features = {
           'price_a': [[1.]],
@@ -2407,8 +2458,8 @@ class _LinearModelTest(test.TestCase):
       self.assertIn('wire_cast', my_vars[2].name)
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2420,9 +2471,9 @@ class _LinearModelTest(test.TestCase):
       get_keras_linear_model_predictions(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2435,8 +2486,8 @@ class _LinearModelTest(test.TestCase):
         get_keras_linear_model_predictions(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2451,8 +2502,8 @@ class _LinearModelTest(test.TestCase):
               predictions, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2468,15 +2519,16 @@ class _LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2506,19 +2558,21 @@ class _LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
-    price = fc.numeric_column('price')
-    price_buckets = fc.bucketized_column(
+    price = fc._numeric_column('price')
+    price_buckets = fc._bucketized_column(
         price, boundaries=[
             0.,
             10.,
             100.,
         ])
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
 
     # Provides 1-dim tensor and dense tensor.
@@ -2554,8 +2608,9 @@ class _LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -2581,7 +2636,7 @@ class InputLayerTest(test.TestCase):
   @test_util.run_in_graph_and_eager_modes
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    input_layer = InputLayer(fc.numeric_column('a'))
+    input_layer = InputLayer(fc._numeric_column('a'))
     inputs = self.evaluate(input_layer(features))
     self.assertAllClose([[0.]], inputs)
 
@@ -2593,8 +2648,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
       def _embedding_column_initializer(shape, dtype, partition_info):
         del shape  # unused
@@ -2605,7 +2660,8 @@ class InputLayerTest(test.TestCase):
             (0, 1),  # id 1
             (1, 1))  # id 2
         return embedding_values
-      embedding_column = fc.embedding_column(
+
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2636,8 +2692,8 @@ class InputLayerTest(test.TestCase):
           dense_shape=(3, 3))
 
       # Create feature columns (categorical and embedding).
-      categorical_column = fc.categorical_column_with_identity(key='a',
-                                                               num_buckets=3)
+      categorical_column = fc._categorical_column_with_identity(
+          key='a', num_buckets=3)
       embedding_dimension = 2
 
       def _embedding_column_initializer(shape, dtype, partition_info):
@@ -2650,7 +2706,7 @@ class InputLayerTest(test.TestCase):
             (1, 1))  # id 2
         return embedding_values
 
-      embedding_column = fc.embedding_column(
+      embedding_column = fc._embedding_column(
           categorical_column,
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
@@ -2687,56 +2743,56 @@ class FunctionalInputLayerTest(test.TestCase):
       fc.input_layer(
           features={'a': [[0]]},
           feature_columns=[
-              fc.categorical_column_with_hash_bucket('wire_cast', 4)
+              fc._categorical_column_with_hash_bucket('wire_cast', 4)
           ])
 
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
       fc.input_layer(
-          features={'a': [[0]]}, feature_columns={'a': fc.numeric_column('a')})
+          features={'a': [[0]]}, feature_columns={'a': fc._numeric_column('a')})
 
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.input_layer(features, fc.numeric_column('a'))
+      net = fc.input_layer(features, fc._numeric_column('a'))
       with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+        self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
-      columns = (fc.numeric_column(key) for key in features)
+      columns = (fc._numeric_column(key) for key in features)
       net = fc.input_layer(features, columns)
       with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+        self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
       fc.input_layer(
           features={'a': [[0]]},
-          feature_columns=[fc.numeric_column('a'),
-                           fc.numeric_column('a')])
+          feature_columns=[fc._numeric_column('a'),
+                           fc._numeric_column('a')])
 
   def test_one_column(self):
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+        self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
-    price = fc.numeric_column('price', shape=2)
+    price = fc._numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       with self.assertRaisesRegexp(
@@ -2745,16 +2801,16 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price])
 
   def test_reshaping(self):
-    price = fc.numeric_column('price', shape=[1, 2])
+    price = fc._numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       net = fc.input_layer(features, [price])
       with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+        self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
-    price1 = fc.numeric_column('price1', shape=2)
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1', shape=2)
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1., 2.], [5., 6.]],
@@ -2762,19 +2818,19 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       net = fc.input_layer(features, [price1, price2])
       with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_fills_cols_to_vars(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
     # _BucketizedColumn, and an _EmbeddingColumn.  Only the _EmbeddingColumn
     # creates a Variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2793,24 +2849,25 @@ class FunctionalInputLayerTest(test.TestCase):
                             variables_lib.Variable)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
+  @test_util.run_deprecated_v1
   def test_fills_cols_to_vars_shared_embedding(self):
     # Provide 5 DenseColumn's to input_layer: a NumericColumn, a
     # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
     # EmbeddingColumn creates a Variable and the two SharedEmbeddingColumns
     # shared one variable.
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
       features = {
@@ -2850,13 +2907,13 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[shared_embedding_a][0].shape, [3, 2])
 
   def test_fills_cols_to_vars_partitioned_variables(self):
-    price1 = fc.numeric_column('price1')
-    dense_feature = fc.numeric_column('dense_feature')
-    dense_feature_bucketized = fc.bucketized_column(
+    price1 = fc._numeric_column('price1')
+    dense_feature = fc._numeric_column('dense_feature')
+    dense_feature_bucketized = fc._bucketized_column(
         dense_feature, boundaries=[0.])
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
     with ops.Graph().as_default():
       features = {
@@ -2883,8 +2940,8 @@ class FunctionalInputLayerTest(test.TestCase):
       self.assertAllEqual(cols_to_vars[some_embedding_column][2].shape, [1, 10])
 
   def test_column_order(self):
-    price_a = fc.numeric_column('price_a')
-    price_b = fc.numeric_column('price_b')
+    price_a = fc._numeric_column('price_a')
+    price_b = fc._numeric_column('price_b')
     with ops.Graph().as_default():
       features = {
           'price_a': [[1.]],
@@ -2893,11 +2950,11 @@ class FunctionalInputLayerTest(test.TestCase):
       net1 = fc.input_layer(features, [price_a, price_b])
       net2 = fc.input_layer(features, [price_b, price_a])
       with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+        self.assertAllClose([[1., 3.]], self.evaluate(net1))
+        self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
-    animal = fc.categorical_column_with_identity('animal', num_buckets=4)
+    animal = fc._categorical_column_with_identity('animal', num_buckets=4)
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -2908,8 +2965,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [animal])
 
   def test_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': [[1.], [5.], [7.]],  # batchsize = 3
@@ -2921,9 +2978,9 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2])
 
   def test_subset_of_static_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
-    price3 = fc.numeric_column('price3')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
+    price3 = fc._numeric_column('price3')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2936,8 +2993,8 @@ class FunctionalInputLayerTest(test.TestCase):
         fc.input_layer(features, [price1, price2, price3])
 
   def test_runtime_batch_size_mismatch(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
@@ -2950,8 +3007,8 @@ class FunctionalInputLayerTest(test.TestCase):
           sess.run(net, feed_dict={features['price1']: [[1.], [5.], [7.]]})
 
   def test_runtime_batch_size_matches(self):
-    price1 = fc.numeric_column('price1')
-    price2 = fc.numeric_column('price2')
+    price1 = fc._numeric_column('price1')
+    price2 = fc._numeric_column('price2')
     with ops.Graph().as_default():
       features = {
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
@@ -2967,9 +3024,9 @@ class FunctionalInputLayerTest(test.TestCase):
             })
 
   def test_multiple_layers_with_same_embedding_column(self):
-    some_sparse_column = fc.categorical_column_with_hash_bucket(
+    some_sparse_column = fc._categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)
-    some_embedding_column = fc.embedding_column(
+    some_embedding_column = fc._embedding_column(
         some_sparse_column, dimension=10)
 
     with ops.Graph().as_default():
@@ -2990,13 +3047,14 @@ class FunctionalInputLayerTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
 
@@ -3023,13 +3081,14 @@ class FunctionalInputLayerTest(test.TestCase):
           ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     all_cols = [embedding_column_a, embedding_column_b]
@@ -3074,6 +3133,7 @@ class FunctionalInputLayerTest(test.TestCase):
           ['input_layer/aaa_bbb_shared_embedding/embedding_weights:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -3085,18 +3145,18 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(country, dimension=5,
-                                           initializer=_initializer)
+    embedded_country = fc._embedding_column(
+        country, dimension=5, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
     features = {
@@ -3124,6 +3184,7 @@ class FunctionalInputLayerTest(test.TestCase):
            [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
           sess.run(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
         (1., 2.),  # id 0
@@ -3135,17 +3196,17 @@ class FunctionalInputLayerTest(test.TestCase):
       return embedding_values
 
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
 
     # one_hot_body_style has 3 dims in input_layer.
-    body_style = fc.categorical_column_with_vocabulary_list(
+    body_style = fc._categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    one_hot_body_style = fc.indicator_column(body_style)
+    one_hot_body_style = fc._indicator_column(body_style)
 
     # embedded_body_style has 5 dims in input_layer.
-    country = fc.categorical_column_with_vocabulary_list(
+    country = fc._categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
-    embedded_country = fc.embedding_column(
+    embedded_country = fc._embedding_column(
         country, dimension=2, initializer=_initializer)
 
     # Provides 1-dim tensor and dense tensor.
@@ -3183,9 +3244,10 @@ class FunctionalInputLayerTest(test.TestCase):
                   features['country']: country_data
               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
-    price = fc.numeric_column('price')
+    price = fc._numeric_column('price')
     features = {
         'price': constant_op.constant(0),
     }
@@ -3313,8 +3375,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         'python/feature_column/testdata/wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column._var_scope_name)
@@ -3326,22 +3389,30 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    column = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+    original = fc._categorical_column_with_vocabulary_file(
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column._num_buckets)
@@ -3351,16 +3422,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_vocabulary_file_none(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file=None, vocabulary_size=3)
 
   def test_vocabulary_file_empty_string(self):
     with self.assertRaisesRegexp(ValueError, 'Missing vocabulary_file'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file='', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_invalid_vocabulary_file(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3373,16 +3445,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
+  @test_util.run_deprecated_v1
   def test_too_large_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size + 1)
@@ -3397,20 +3472,24 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+      fc._categorical_column_with_vocabulary_file(
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_file(
+      fc._categorical_column_with_vocabulary_file(
           key='aaa',
           vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=self._wire_vocabulary_size,
@@ -3418,7 +3497,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3431,7 +3510,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3443,8 +3522,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_file(
+    a = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3465,8 +3545,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3485,8 +3566,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_none_vocabulary_size(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -3503,8 +3585,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                                       dense_shape=inputs.dense_shape),
                                   id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3514,16 +3597,15 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         dense_shape=(2, 2))
     id_tensor = _transform_features({'aaa': inputs}, [column])[column]
     with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+      _assert_sparse_tensor_value(
+          self,
+          sparse_tensor.SparseTensorValue(
+              indices=inputs.indices,
+              values=np.array((2, -1, 0), dtype=np.int64),
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3540,8 +3622,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size)
@@ -3559,8 +3642,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3580,8 +3664,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3601,11 +3686,12 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_small_vocabulary_size(self):
     # 'marlo' is the last entry in our vocabulary file, so be setting
     # `vocabulary_size` to 1 less than number of entries in file, we take
     # 'marlo' out of the vocabulary.
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size - 1)
@@ -3624,8 +3710,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3645,9 +3732,10 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3667,8 +3755,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=(3, 3)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_file(
+    column = fc._categorical_column_with_vocabulary_file(
         key='aaa',
         vocabulary_file=self._warriors_vocabulary_file_name,
         vocabulary_size=self._warriors_vocabulary_size,
@@ -3689,8 +3778,9 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3706,16 +3796,18 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_file(
+    wire_column = fc._categorical_column_with_vocabulary_file(
         key='wire',
         vocabulary_file=self._wire_vocabulary_file_name,
         vocabulary_size=self._wire_vocabulary_size,
@@ -3732,19 +3824,20 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_defaults_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3756,11 +3849,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key=('aaa',), vocabulary_list=('omar', 'stringer', 'marlo'))
 
   def test_defaults_int(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36))
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
@@ -3770,17 +3863,21 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column._num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_vocabulary_list(
+    original = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
@@ -3791,65 +3888,65 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary dtype must be string or integer'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12., 24., 36.))
 
   def test_mismatched_int_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
     with self.assertRaisesRegexp(
         ValueError, r'dtype.*and vocabulary dtype.*do not match'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
 
   def test_none_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=None)
 
   def test_empty_mapping(self):
     with self.assertRaisesRegexp(
         ValueError, r'vocabulary_list.*must be non-empty'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=tuple([]))
 
   def test_duplicate_mapping(self):
     with self.assertRaisesRegexp(ValueError, 'Duplicate keys'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 12))
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
-      fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+      fc._categorical_column_with_vocabulary_list(
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
     with self.assertRaisesRegexp(
         ValueError, 'both num_oov_buckets and default_value'):
-      fc.categorical_column_with_vocabulary_list(
+      fc._categorical_column_with_vocabulary_list(
           key='aaa',
           vocabulary_list=(12, 24, 36),
           num_oov_buckets=100,
           default_value=2)
 
   def test_invalid_input_dtype_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
@@ -3858,9 +3955,8 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
   def test_invalid_input_dtype_string(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -3868,8 +3964,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example_string(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3890,8 +3987,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_parse_example_int(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(11, 21, 31))
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
@@ -3912,10 +4010,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3931,10 +4029,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -3946,13 +4044,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
@@ -3966,10 +4062,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+    column = fc._categorical_column_with_vocabulary_list(
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
@@ -3984,8 +4080,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         default_value=2)
@@ -4004,8 +4101,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=100)
@@ -4024,8 +4122,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32)
@@ -4044,9 +4143,10 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4067,8 +4167,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=(3, 3)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
-    column = fc.categorical_column_with_vocabulary_list(
+    column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=np.array((30, 35, 11, 23, 22), dtype=np.int32),
         dtype=dtypes.int32,
@@ -4088,8 +4189,9 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4104,16 +4206,18 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    wire_column = fc.categorical_column_with_vocabulary_list(
+    wire_column = fc._categorical_column_with_vocabulary_list(
         key='aaa',
         vocabulary_list=('omar', 'stringer', 'marlo'),
         num_oov_buckets=1)
@@ -4129,19 +4233,20 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(wire_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
         # 'marlo' -> 2: wire_var[2] = 3
         # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+        self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
 
 class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_constructor(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual('aaa', column.name)
     self.assertEqual('aaa', column.key)
     self.assertEqual('aaa', column._var_scope_name)
@@ -4152,10 +4257,11 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_key_should_be_string(self):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
-      fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
+      fc._categorical_column_with_identity(key=('aaa',), num_buckets=3)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    original = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(3, column._num_buckets)
@@ -4165,24 +4271,24 @@ class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_invalid_num_buckets_zero(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets 0 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=0)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=0)
 
   def test_invalid_num_buckets_negative(self):
     with self.assertRaisesRegexp(ValueError, 'num_buckets -1 < 1'):
-      fc.categorical_column_with_identity(key='aaa', num_buckets=-1)
+      fc._categorical_column_with_identity(key='aaa', num_buckets=-1)
 
   def test_invalid_default_value_too_small(self):
     with self.assertRaisesRegexp(ValueError, 'default_value -1 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=-1)
 
   def test_invalid_default_value_too_big(self):
     with self.assertRaisesRegexp(ValueError, 'default_value 3 not in range'):
-      fc.categorical_column_with_identity(
+      fc._categorical_column_with_identity(
           key='aaa', num_buckets=3, default_value=3)
 
   def test_invalid_input_dtype(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -4190,8 +4296,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'Invalid input, not integer'):
       column._get_sparse_tensors(_LazyBuilder({'aaa': inputs}))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=30)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4211,8 +4318,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4228,8 +4336,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4241,11 +4350,10 @@ class IdentityCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
 
   def test_get_sparse_tensors_weight_collections(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0, 1, 0),
@@ -4259,8 +4367,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES))
     self.assertItemsEqual([], ops.get_collection('my_weights'))
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column._get_sparse_tensors(
         _LazyBuilder({
             'aaa': ((0, -1), (1, 0))
@@ -4275,8 +4384,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=(2, 2)),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_small(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, -1, 0),
@@ -4288,8 +4398,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
           errors.OpError, 'assert_greater_or_equal_0'):
         id_weight_pair.id_tensor.eval()
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_big(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(1, 99, 0),
@@ -4301,8 +4412,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
           errors.OpError, 'assert_less_than_num_buckets'):
         id_weight_pair.id_tensor.eval()
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
@@ -4319,8 +4431,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               dense_shape=inputs.dense_shape),
           id_weight_pair.id_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
-    column = fc.categorical_column_with_identity(
+    column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
     input_indices = array_ops.placeholder(dtype=dtypes.int64)
     input_values = array_ops.placeholder(dtype=dtypes.int32)
@@ -4344,8 +4457,9 @@ class IdentityCategoricalColumnTest(test.TestCase):
               input_shape: (2, 2),
           }))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -4357,16 +4471,17 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
+    column = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column._num_buckets)
     with ops.Graph().as_default():
       predictions = get_keras_linear_model_predictions({
@@ -4379,13 +4494,13 @@ class IdentityCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] = 1
         # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+        self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
 
 class TransformFeaturesTest(test.TestCase):
@@ -4393,9 +4508,9 @@ class TransformFeaturesTest(test.TestCase):
   # All transform tests are distributed in column test.
   # Here we only test multi column case and naming
   def transform_multi_column(self):
-    bucketized_price = fc.bucketized_column(
-        fc.numeric_column('price'), boundaries=[0, 2, 4, 6])
-    hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
+    bucketized_price = fc._bucketized_column(
+        fc._numeric_column('price'), boundaries=[0, 2, 4, 6])
+    hashed_sparse = fc._categorical_column_with_hash_bucket('wire', 10)
     with ops.Graph().as_default():
       features = {
           'price': [[-1.], [5.]],
@@ -4452,32 +4567,33 @@ class TransformFeaturesTest(test.TestCase):
 class IndicatorColumnTest(test.TestCase):
 
   def test_indicator_column(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    indicator_a = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    indicator_a = fc._indicator_column(a)
     self.assertEqual(indicator_a.categorical_column.name, 'a')
     self.assertEqual(indicator_a.name, 'a_indicator')
     self.assertEqual(indicator_a._var_scope_name, 'a_indicator')
     self.assertEqual(indicator_a._variable_shape, [1, 4])
 
-    b = fc.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
-    indicator_b = fc.indicator_column(b)
+    b = fc._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    indicator_b = fc._indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
     self.assertEqual(indicator_b._var_scope_name, 'b_indicator')
     self.assertEqual(indicator_b._variable_shape, [1, 100])
 
   def test_1D_shape_succeeds(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({'animal': ['fox', 'fox']})
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
-    animal = fc.indicator_column(
-        fc.categorical_column_with_hash_bucket('animal', 4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_hash_bucket('animal', 4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4487,11 +4603,12 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                          self.evaluate(output))
 
   def test_multi_hot(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
 
     builder = _LazyBuilder({
         'animal':
@@ -4500,11 +4617,11 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+      self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     builder = _LazyBuilder({
         'animal':
             sparse_tensor.SparseTensor(
@@ -4512,20 +4629,22 @@ class IndicatorColumnTest(test.TestCase):
     })
     output = builder.get(animal)
     with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
+      self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    a = fc.categorical_column_with_hash_bucket('a', 4)
-    column = fc.indicator_column(a)
+    a = fc._categorical_column_with_hash_bucket('a', 4)
+    column = fc._indicator_column(a)
     column_copy = copy.deepcopy(column)
     self.assertEqual(column_copy.categorical_column.name, 'a')
     self.assertEqual(column.name, 'a_indicator')
     self.assertEqual(column._variable_shape, [1, 4])
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4545,10 +4664,11 @@ class IndicatorColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_indicator = fc.indicator_column(a)
+    a_indicator = fc._indicator_column(a)
     features = {
         'aaa': sparse_tensor.SparseTensorValue(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4557,51 +4677,56 @@ class IndicatorColumnTest(test.TestCase):
     }
     indicator_tensor = _transform_features(features, [a_indicator])[a_indicator]
     with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+      self.assertAllEqual([[0, 0, 1], [1, 0, 0]],
+                          self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_weighted_column(self):
     # Github issue 12557
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
         'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[6., 4., 3.]], indicator_tensor.eval())
+      self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    weights = fc.weighted_categorical_column(ids, 'weights')
-    indicator = fc.indicator_column(weights)
+    weights = fc._weighted_categorical_column(ids, 'weights')
+    indicator = fc._indicator_column(weights)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
-    ids = fc.categorical_column_with_vocabulary_list(
+    ids = fc._categorical_column_with_vocabulary_list(
         key='ids', vocabulary_list=('a', 'b', 'c'))
-    indicator = fc.indicator_column(ids)
+    indicator = fc._indicator_column(ids)
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
     indicator_tensor = _transform_features(features, [indicator])[indicator]
     with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+      self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4613,14 +4738,15 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4632,14 +4758,15 @@ class IndicatorColumnTest(test.TestCase):
       weight_var = get_linear_model_column_var(animal)
       with _initialized_session():
         # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
+        self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+        self.assertAllClose([[0.]], self.evaluate(predictions))
         weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+        self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
-    animal = fc.indicator_column(
-        fc.categorical_column_with_identity('animal', num_buckets=4))
+    animal = fc._indicator_column(
+        fc._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -4648,16 +4775,17 @@ class IndicatorColumnTest(test.TestCase):
       }
       net = fc.input_layer(features, [animal])
       with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+        self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
 
 class EmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column, dimension=embedding_dimension)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
@@ -4674,15 +4802,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -4698,15 +4831,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
-    original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+    original = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column._num_buckets)
@@ -4727,16 +4865,19 @@ class EmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
+      fc._embedding_column(
+          categorical_column, dimension=2, initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a_embedded = fc._embedding_column(a, dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -4756,9 +4897,10 @@ class EmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['aaa'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    a_embedded = fc.embedding_column(a, dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    a_embedded = fc._embedding_column(a, dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -4769,9 +4911,10 @@ class EmbeddingColumnTest(test.TestCase):
     output_a = outputs[a]
     output_embedded = outputs[a_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_embedded))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -4810,10 +4953,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4828,8 +4972,9 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -4870,10 +5015,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4888,8 +5034,9 @@ class EmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_weight_collections(self):
     sparse_input = sparse_tensor.SparseTensorValue(
         # example 0, ids [2]
@@ -4901,9 +5048,9 @@ class EmbeddingColumnTest(test.TestCase):
         dense_shape=(4, 5))
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    embedding_column = fc.embedding_column(categorical_column, dimension=2)
+    embedding_column = fc._embedding_column(categorical_column, dimension=2)
 
     # Provide sparse input and get dense result.
     embedding_column._get_dense_tensor(
@@ -4919,6 +5066,7 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(
         ('embedding_weights:0',), tuple([v.name for v in my_vars]))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -4957,10 +5105,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -4989,6 +5138,7 @@ class EmbeddingColumnTest(test.TestCase):
               input_shape: sparse_input.dense_shape,
           }))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_restore_from_ckpt(self):
     # Inputs.
     vocabulary_size = 3
@@ -5025,10 +5175,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
 
@@ -5044,8 +5195,9 @@ class EmbeddingColumnTest(test.TestCase):
         ('embedding_weights:0',), tuple([v.name for v in global_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -5070,10 +5222,11 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     with ops.Graph().as_default():
@@ -5100,11 +5253,13 @@ class EmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5119,8 +5274,10 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -5146,9 +5303,9 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
+    embedding_column = fc._embedding_column(
         categorical_column,
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5176,11 +5333,13 @@ class EmbeddingColumnTest(test.TestCase):
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5195,8 +5354,10 @@ class EmbeddingColumnTest(test.TestCase):
         # example 3, ids [1], embedding[3] = [3, 5]
         # sum(embeddings * linear_weights)
         # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                            self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -5235,10 +5396,11 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
@@ -5255,8 +5417,9 @@ class EmbeddingColumnTest(test.TestCase):
         tuple([v.name for v in trainable_vars]))
     with _initialized_session():
       self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
+  @test_util.run_deprecated_v1
   def test_input_layer_not_trainable(self):
     # Inputs.
     vocabulary_size = 3
@@ -5295,11 +5458,13 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc.categorical_column_with_identity(
+    categorical_column = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        initializer=_initializer, trainable=False)
+    embedding_column = fc._embedding_column(
+        categorical_column,
+        dimension=embedding_dimension,
+        initializer=_initializer,
+        trainable=False)
 
     # Provide sparse input and get dense result.
     input_layer = fc.input_layer({'aaa': sparse_input}, (embedding_column,))
@@ -5313,18 +5478,19 @@ class EmbeddingColumnTest(test.TestCase):
         [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
     with _initialized_session():
       self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_b, embedding_column_a = fc.shared_embedding_columns(
+    embedding_column_b, embedding_column_a = fc_new.shared_embedding_columns(
         [categorical_column_b, categorical_column_a],
         dimension=embedding_dimension)
     self.assertIs(categorical_column_a, embedding_column_a.categorical_column)
@@ -5362,13 +5528,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5413,13 +5580,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     embedding_dimension = 2
-    original_a, _ = fc.shared_embedding_columns(
+    original_a, _ = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         combiner='my_combiner',
@@ -5427,7 +5595,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
         shared_embedding_collection_name='shared_embedding_collection_name',
         ckpt_to_load_from='my_ckpt',
         tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        max_norm=42.,
+        trainable=False)
     for embedding_column_a in (original_a, copy.deepcopy(original_a)):
       self.assertEqual('aaa', embedding_column_a.categorical_column.name)
       self.assertEqual(3, embedding_column_a.categorical_column._num_buckets)
@@ -5450,55 +5619,60 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column_a._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
-      fc.shared_embedding_columns(
-          [categorical_column_a, categorical_column_b], dimension=2,
+      fc_new.shared_embedding_columns(
+          [categorical_column_a, categorical_column_b],
+          dimension=2,
           initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_incompatible_column_type(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    categorical_column_c = fc.categorical_column_with_hash_bucket(
+    categorical_column_c = fc._categorical_column_with_hash_bucket(
         key='ccc', hash_bucket_size=3)
     with self.assertRaisesRegexp(
         ValueError,
         'all categorical_columns must have the same type.*'
         '_IdentityCategoricalColumn.*_HashedCategoricalColumn'):
-      fc.shared_embedding_columns(
+      fc_new.shared_embedding_columns(
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_weighted_categorical_column_ok(self):
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    weighted_categorical_column_a = fc.weighted_categorical_column(
+    weighted_categorical_column_a = fc._weighted_categorical_column(
         categorical_column_a, weight_feature_key='aaa_weights')
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    weighted_categorical_column_b = fc.weighted_categorical_column(
+    weighted_categorical_column_b = fc._weighted_categorical_column(
         categorical_column_b, weight_feature_key='bbb_weights')
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [categorical_column_a, weighted_categorical_column_b], dimension=2)
-    fc.shared_embedding_columns(
+    fc_new.shared_embedding_columns(
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    b = fc.categorical_column_with_vocabulary_list(
+    b = fc._categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -5529,11 +5703,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['bbb'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
-    a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
-    b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
-    a_embedded, b_embedded = fc.shared_embedding_columns(
-        [a, b], dimension=2)
+    a = fc._categorical_column_with_identity(key='aaa', num_buckets=3)
+    b = fc._categorical_column_with_identity(key='bbb', num_buckets=3)
+    a_embedded, b_embedded = fc_new.shared_embedding_columns([a, b],
+                                                             dimension=2)
     features = {
         'aaa': sparse_tensor.SparseTensor(
             indices=((0, 0), (1, 0), (1, 1)),
@@ -5550,11 +5725,12 @@ class SharedEmbeddingColumnTest(test.TestCase):
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
     with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
+      _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                  self.evaluate(output_a_embedded))
+      _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                  self.evaluate(output_b_embedded))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -5598,13 +5774,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5618,10 +5795,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
     with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
+      self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+      self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+      self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_weight_collections(self):
     # Inputs.
     vocabulary_size = 3
@@ -5651,11 +5829,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5674,6 +5852,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         ('input_layer/aaa_bbb_shared_embedding/embedding_weights:0',),
         tuple(v.name for v in my_vars))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -5712,13 +5891,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     # Provide sparse input and get dense result.
     embedding_lookup_a = embedding_column_a._get_dense_tensor(
@@ -5729,6 +5909,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     with _initialized_session() as sess:
       sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 2
@@ -5752,13 +5933,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer)
+        dimension=embedding_dimension,
+        initializer=_initializer)
 
     with ops.Graph().as_default():
       predictions = fc.linear_model({
@@ -5790,13 +5972,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
+        self.assertAllClose(
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5814,8 +5998,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
     # Inputs.
     batch_size = 2
@@ -5842,11 +6027,11 @@ class SharedEmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
         dimension=embedding_dimension,
         initializer=_initializer)
@@ -5881,13 +6066,15 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_bbb_shared_embedding_1/weights:0']
       with _initialized_session():
         # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
+        self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+        self.assertAllClose(zeros_embedding_values,
+                            self.evaluate(embedding_weights))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
         self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
+            np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
+        self.assertAllClose(
+            np.zeros((batch_size, 1)), self.evaluate(predictions))
 
         # Predictions with all non-zero weights.
         embedding_weights.assign((
@@ -5905,7 +6092,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         # example 1, ids [], embedding[1] = 0, 0]
         # sum(embeddings * linear_weights)
         # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
+        self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
 
   def _test_input_layer(self, trainable=True):
     # Inputs.
@@ -5949,13 +6136,14 @@ class SharedEmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column_a = fc.categorical_column_with_identity(
+    categorical_column_a = fc._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
-    categorical_column_b = fc.categorical_column_with_identity(
+    categorical_column_b = fc._categorical_column_with_identity(
         key='bbb', num_buckets=vocabulary_size)
-    embedding_column_a, embedding_column_b = fc.shared_embedding_columns(
+    embedding_column_a, embedding_column_b = fc_new.shared_embedding_columns(
         [categorical_column_a, categorical_column_b],
-        dimension=embedding_dimension, initializer=_initializer,
+        dimension=embedding_dimension,
+        initializer=_initializer,
         trainable=trainable)
 
     # Provide sparse input and get dense result.
@@ -5978,20 +6166,23 @@ class SharedEmbeddingColumnTest(test.TestCase):
     shared_embedding_vars = global_vars
     with _initialized_session():
       self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, input_layer.eval())
+      self.assertAllEqual(expected_lookups, self.evaluate(input_layer))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     self._test_input_layer()
 
+  @test_util.run_deprecated_v1
   def test_input_layer_no_trainable(self):
     self._test_input_layer(trainable=False)
 
 
 class WeightedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertEqual('ids_weighted_by_values', column.name)
@@ -6002,10 +6193,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
         'values': parsing_ops.VarLenFeature(dtypes.float32)
     }, column._parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
-    original = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    original = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     for column in (original, copy.deepcopy(original)):
@@ -6018,23 +6210,23 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_invalid_dtype_none(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=None)
 
   def test_invalid_dtype_string(self):
     with self.assertRaisesRegexp(ValueError, 'is not convertible to float'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='ids', num_buckets=3),
           weight_feature_key='values',
           dtype=dtypes.string)
 
   def test_invalid_input_dtype(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     strings = sparse_tensor.SparseTensorValue(
@@ -6046,14 +6238,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_column_name_collision(self):
     with self.assertRaisesRegexp(ValueError, r'Parse config.*already exists'):
-      fc.weighted_categorical_column(
-          categorical_column=fc.categorical_column_with_identity(
+      fc._weighted_categorical_column(
+          categorical_column=fc._categorical_column_with_identity(
               key='aaa', num_buckets=3),
           weight_feature_key='aaa')._parse_example_spec()
 
   def test_missing_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6064,10 +6256,12 @@ class WeightedCategoricalColumnTest(test.TestCase):
         ValueError, 'values is not in features dictionary'):
       _transform_features({'ids': inputs}, (column,))
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
-    a = fc.categorical_column_with_vocabulary_list(
+    a = fc._categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
+    a_weighted = fc._weighted_categorical_column(
+        a, weight_feature_key='weights')
     data = example_pb2.Example(features=feature_pb2.Features(
         feature={
             'aaa':
@@ -6098,9 +6292,10 @@ class WeightedCategoricalColumnTest(test.TestCase):
               dense_shape=[1, 2]),
           features['weights'].eval())
 
+  @test_util.run_deprecated_v1
   def test_transform_features(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6121,19 +6316,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_input(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     weights = sparse_tensor.SparseTensorValue(
@@ -6150,19 +6344,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=weights.indices,
               values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
+              dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_weights(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
@@ -6179,19 +6372,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           sparse_tensor.SparseTensorValue(
               indices=inputs.indices,
               values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+              dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
       _assert_sparse_tensor_value(
           self,
           sparse_tensor.SparseTensorValue(
               indices=((0, 0), (1, 0), (1, 1)),
               values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
+              dense_shape=(2, 2)), self.evaluate(weight_tensor))
 
+  @test_util.run_deprecated_v1
   def test_keras_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6210,18 +6402,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_keras_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6241,8 +6433,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_keras_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6263,11 +6455,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_keras_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6282,18 +6474,19 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6310,18 +6503,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6339,8 +6532,8 @@ class WeightedCategoricalColumnTest(test.TestCase):
         }, (column,))
 
   def test_linear_model_mismatched_dense_values(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6361,11 +6554,11 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
-    column = fc.weighted_categorical_column(
-        categorical_column=fc.categorical_column_with_identity(
+    column = fc._weighted_categorical_column(
+        categorical_column=fc._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -6379,14 +6572,14 @@ class WeightedCategoricalColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
       with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         weight_var.assign(((1.,), (2.,), (3.,))).eval()
         # weight_var[0] * weights[0, 0] = 1 * .5 = .5
         # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
         # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+        self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
diff --git a/tensorflow/python/feature_column/feature_column_v2.py b/tensorflow/python/feature_column/feature_column_v2.py
index 9b4a7e882f9acf58915714dc5ceea1ea6682c5a1..6308926494237f3546ddac0b893e4f6a23b116de 100644
--- a/tensorflow/python/feature_column/feature_column_v2.py
+++ b/tensorflow/python/feature_column/feature_column_v2.py
@@ -165,6 +165,7 @@ from tensorflow.python.training import checkpoint_utils
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
 
 
 _FEATURE_COLUMN_DEPRECATION_DATE = '2018-11-30'
@@ -258,7 +259,7 @@ class StateManager(object):
 
 
 class _StateManagerImpl(StateManager):
-  """Manages the state of FeatureLayer and LinearModel."""
+  """Manages the state of DenseFeatures and LinearLayer."""
 
   def __init__(self, layer, trainable):
     """Creates an _StateManagerImpl object.
@@ -302,7 +303,8 @@ class _StateManagerImpl(StateManager):
     raise ValueError('Variable does not exist.')
 
 
-class FeatureLayer(Layer):
+@tf_export('keras.layers.DenseFeatures', v1=[])
+class DenseFeatures(Layer):
   """A layer that produces a dense `Tensor` based on given `feature_columns`.
 
   Generally a single example in training data is described with FeatureColumns.
@@ -318,7 +320,7 @@ class FeatureLayer(Layer):
   keywords_embedded = embedding_column(
       categorical_column_with_hash_bucket("keywords", 10K), dimensions=16)
   columns = [price, keywords_embedded, ...]
-  feature_layer = FeatureLayer(columns)
+  feature_layer = DenseFeatures(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   dense_tensor = feature_layer(features)
@@ -333,7 +335,7 @@ class FeatureLayer(Layer):
                trainable=True,
                name=None,
                **kwargs):
-    """Constructs a FeatureLayer.
+    """Constructs a DenseFeatures.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -344,13 +346,14 @@ class FeatureLayer(Layer):
         `indicator_column`.
       trainable: If `True` also add the variable to the graph collection
         `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
-      name: Name to give to the FeatureLayer.
+      name: Name to give to the DenseFeatures.
       **kwargs: Keyword arguments to construct a layer.
 
     Raises:
       ValueError: if an item in `feature_columns` is not a `DenseColumn`.
     """
-    super(FeatureLayer, self).__init__(name=name, trainable=trainable, **kwargs)
+    super(DenseFeatures, self).__init__(
+        name=name, trainable=trainable, **kwargs)
 
     self._feature_columns = _normalize_feature_columns(feature_columns)
     self._feature_columns = sorted(self._feature_columns, key=lambda x: x.name)
@@ -371,7 +374,7 @@ class FeatureLayer(Layer):
       with variable_scope._pure_variable_scope(self.name):  # pylint: disable=protected-access
         with variable_scope._pure_variable_scope(column.name):  # pylint: disable=protected-access
           column.create_state(self._state_manager)
-      super(FeatureLayer, self).build(None)
+      super(DenseFeatures, self).build(None)
 
   def call(self, features, cols_to_output_tensors=None):
     """Returns a dense tensor corresponding to the `feature_columns`.
@@ -515,6 +518,7 @@ class _LinearModelLayer(Layer):
       return predictions
 
 
+@tf_export('keras.layers.LinearModel', v1=[])
 class LinearModel(training.Model):
   """Produces a linear prediction `Tensor` based on given `feature_columns`.
 
@@ -522,7 +526,7 @@ class LinearModel(training.Model):
   Weighted sum refers to logits in classification problems. It refers to the
   prediction itself for linear regression problems.
 
-  Note on supported columns: `LinearModel` treats categorical columns as
+  Note on supported columns: `LinearLayer` treats categorical columns as
   `indicator_column`s. To be specific, assume the input as `SparseTensor` looks
   like:
 
@@ -547,7 +551,7 @@ class LinearModel(training.Model):
   keywords = categorical_column_with_hash_bucket("keywords", 10K)
   keywords_price = crossed_column('keywords', price_buckets, ...)
   columns = [price_buckets, keywords, keywords_price ...]
-  linear_model = LinearModel(columns)
+  linear_model = LinearLayer(columns)
 
   features = tf.parse_example(..., features=make_parse_example_spec(columns))
   prediction = linear_model(features)
@@ -561,7 +565,7 @@ class LinearModel(training.Model):
                trainable=True,
                name=None,
                **kwargs):
-    """Constructs a LinearModel.
+    """Constructs a LinearLayer.
 
     Args:
       feature_columns: An iterable containing the FeatureColumns to use as
@@ -650,7 +654,7 @@ class LinearModel(training.Model):
     return self.layer.bias
 
 
-def _transform_features(features, feature_columns, state_manager):
+def _transform_features_v2(features, feature_columns, state_manager):
   """Returns transformed features based on features columns passed in.
 
   Please note that most probably you would not need to use this function. Please
@@ -695,7 +699,8 @@ def _transform_features(features, feature_columns, state_manager):
   return outputs
 
 
-def make_parse_example_spec(feature_columns):
+@tf_export('feature_column.make_parse_example_spec', v1=[])
+def make_parse_example_spec_v2(feature_columns):
   """Creates parsing spec dictionary from input feature_columns.
 
   The returned dictionary can be used as arg 'features' in `tf.parse_example`.
@@ -754,10 +759,15 @@ def make_parse_example_spec(feature_columns):
   return result
 
 
-def embedding_column(
-    categorical_column, dimension, combiner='mean', initializer=None,
-    ckpt_to_load_from=None, tensor_name_in_ckpt=None, max_norm=None,
-    trainable=True):
+@tf_export('feature_column.embedding_column')
+def embedding_column(categorical_column,
+                     dimension,
+                     combiner='mean',
+                     initializer=None,
+                     ckpt_to_load_from=None,
+                     tensor_name_in_ckpt=None,
+                     max_norm=None,
+                     trainable=True):
   """`DenseColumn` that converts from sparse, categorical input.
 
   Use this when your inputs are sparse, but you want to convert them to a dense
@@ -854,6 +864,180 @@ def embedding_column(
       trainable=trainable)
 
 
+@tf_export(v1=['feature_column.shared_embedding_columns'])
+def shared_embedding_columns(categorical_columns,
+                             dimension,
+                             combiner='mean',
+                             initializer=None,
+                             shared_embedding_collection_name=None,
+                             ckpt_to_load_from=None,
+                             tensor_name_in_ckpt=None,
+                             max_norm=None,
+                             trainable=True):
+  """List of dense columns that convert from sparse, categorical input.
+
+  This is similar to `embedding_column`, except that it produces a list of
+  embedding columns that share the same embedding weights.
+
+  Use this when your inputs are sparse and of the same type (e.g. watched and
+  impression video IDs that share the same vocabulary), and you want to convert
+  them to a dense representation (e.g., to feed to a DNN).
+
+  Inputs must be a list of categorical columns created by any of the
+  `categorical_column_*` function. They must all be of the same type and have
+  the same arguments except `key`. E.g. they can be
+  categorical_column_with_vocabulary_file with the same vocabulary_file. Some or
+  all columns could also be weighted_categorical_column.
+
+  Here is an example embedding of two features for a DNNClassifier model:
+
+  ```python
+  watched_video_id = categorical_column_with_vocabulary_file(
+      'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+  impression_video_id = categorical_column_with_vocabulary_file(
+      'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+  columns = shared_embedding_columns(
+      [watched_video_id, impression_video_id], dimension=10)
+
+  estimator = tf.estimator.DNNClassifier(feature_columns=columns, ...)
+
+  label_column = ...
+  def input_fn():
+    features = tf.parse_example(
+        ..., features=make_parse_example_spec(columns + [label_column]))
+    labels = features.pop(label_column.name)
+    return features, labels
+
+  estimator.train(input_fn=input_fn, steps=100)
+  ```
+
+  Here is an example using `shared_embedding_columns` with model_fn:
+
+  ```python
+  def model_fn(features, ...):
+    watched_video_id = categorical_column_with_vocabulary_file(
+        'watched_video_id', video_vocabulary_file, video_vocabulary_size)
+    impression_video_id = categorical_column_with_vocabulary_file(
+        'impression_video_id', video_vocabulary_file, video_vocabulary_size)
+    columns = shared_embedding_columns(
+        [watched_video_id, impression_video_id], dimension=10)
+    dense_tensor = input_layer(features, columns)
+    # Form DNN layers, calculate loss, and return EstimatorSpec.
+    ...
+  ```
+
+  Args:
+    categorical_columns: List of categorical columns created by a
+      `categorical_column_with_*` function. These columns produce the sparse IDs
+      that are inputs to the embedding lookup. All columns must be of the same
+      type and have the same arguments except `key`. E.g. they can be
+      categorical_column_with_vocabulary_file with the same vocabulary_file.
+      Some or all columns could also be weighted_categorical_column.
+    dimension: An integer specifying dimension of the embedding, must be > 0.
+    combiner: A string specifying how to reduce if there are multiple entries in
+      a single row. Currently 'mean', 'sqrtn' and 'sum' are supported, with
+      'mean' the default. 'sqrtn' often achieves good accuracy, in particular
+      with bag-of-words columns. Each of this can be thought as example level
+      normalizations on the column. For more information, see
+      `tf.embedding_lookup_sparse`.
+    initializer: A variable initializer function to be used in embedding
+      variable initialization. If not specified, defaults to
+      `tf.truncated_normal_initializer` with mean `0.0` and standard deviation
+      `1/sqrt(dimension)`.
+    shared_embedding_collection_name: Optional name of the collection where
+      shared embedding weights are added. If not given, a reasonable name will
+      be chosen based on the names of `categorical_columns`. This is also used
+      in `variable_scope` when creating shared embedding weights.
+    ckpt_to_load_from: String representing checkpoint name/pattern from which to
+      restore column weights. Required if `tensor_name_in_ckpt` is not `None`.
+    tensor_name_in_ckpt: Name of the `Tensor` in `ckpt_to_load_from` from which
+      to restore the column weights. Required if `ckpt_to_load_from` is not
+      `None`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
+      than this value, before combining.
+    trainable: Whether or not the embedding is trainable. Default is True.
+
+  Returns:
+    A list of dense columns that converts from sparse input. The order of
+    results follows the ordering of `categorical_columns`.
+
+  Raises:
+    ValueError: if `dimension` not > 0.
+    ValueError: if any of the given `categorical_columns` is of different type
+      or has different arguments than the others.
+    ValueError: if exactly one of `ckpt_to_load_from` and `tensor_name_in_ckpt`
+      is specified.
+    ValueError: if `initializer` is specified and is not callable.
+    RuntimeError: if eager execution is enabled.
+  """
+  if context.executing_eagerly():
+    raise RuntimeError('shared_embedding_columns are not supported when eager '
+                       'execution is enabled.')
+
+  if (dimension is None) or (dimension < 1):
+    raise ValueError('Invalid dimension {}.'.format(dimension))
+  if (ckpt_to_load_from is None) != (tensor_name_in_ckpt is None):
+    raise ValueError('Must specify both `ckpt_to_load_from` and '
+                     '`tensor_name_in_ckpt` or none of them.')
+
+  if (initializer is not None) and (not callable(initializer)):
+    raise ValueError('initializer must be callable if specified.')
+  if initializer is None:
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1. / math.sqrt(dimension))
+
+  # Sort the columns so the default collection name is deterministic even if the
+  # user passes columns from an unsorted collection, such as dict.values().
+  sorted_columns = sorted(categorical_columns, key=lambda x: x.name)
+
+  c0 = sorted_columns[0]
+  num_buckets = c0._num_buckets  # pylint: disable=protected-access
+  if not isinstance(c0, fc_old._CategoricalColumn):  # pylint: disable=protected-access
+    raise ValueError(
+        'All categorical_columns must be subclasses of _CategoricalColumn. '
+        'Given: {}, of type: {}'.format(c0, type(c0)))
+  if isinstance(c0,
+                (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
+    c0 = c0.categorical_column
+  for c in sorted_columns[1:]:
+    if isinstance(
+        c, (fc_old._WeightedCategoricalColumn, WeightedCategoricalColumn)):  # pylint: disable=protected-access
+      c = c.categorical_column
+    if not isinstance(c, type(c0)):
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same type, or be weighted_categorical_column of the same type. '
+          'Given column: {} of type: {} does not match given column: {} of '
+          'type: {}'.format(c0, type(c0), c, type(c)))
+    if num_buckets != c._num_buckets:  # pylint: disable=protected-access
+      raise ValueError(
+          'To use shared_embedding_column, all categorical_columns must have '
+          'the same number of buckets. Given column: {} with buckets: {} does  '
+          'not match column: {} with buckets: {}'.format(
+              c0, num_buckets, c, c._num_buckets))  # pylint: disable=protected-access
+
+  if not shared_embedding_collection_name:
+    shared_embedding_collection_name = '_'.join(c.name for c in sorted_columns)
+    shared_embedding_collection_name += '_shared_embedding'
+
+  result = []
+  for column in categorical_columns:
+    result.append(
+        fc_old._SharedEmbeddingColumn(  # pylint: disable=protected-access
+            categorical_column=column,
+            initializer=initializer,
+            dimension=dimension,
+            combiner=combiner,
+            shared_embedding_collection_name=shared_embedding_collection_name,
+            ckpt_to_load_from=ckpt_to_load_from,
+            tensor_name_in_ckpt=tensor_name_in_ckpt,
+            max_norm=max_norm,
+            trainable=trainable))
+
+  return result
+
+
+@tf_export('feature_column.shared_embedding_columns', v1=[])
 def shared_embedding_columns_v2(categorical_columns,
                                 dimension,
                                 combiner='mean',
@@ -1019,6 +1203,7 @@ def shared_embedding_columns_v2(categorical_columns,
   return result
 
 
+@tf_export('feature_column.numeric_column')
 def numeric_column(key,
                    shape=(1,),
                    default_value=None,
@@ -1094,6 +1279,7 @@ def numeric_column(key,
       normalizer_fn=normalizer_fn)
 
 
+@tf_export('feature_column.bucketized_column')
 def bucketized_column(source_column, boundaries):
   """Represents discretized dense input.
 
@@ -1190,6 +1376,7 @@ def _assert_key_is_string(key):
             type(key), key))
 
 
+@tf_export('feature_column.categorical_column_with_hash_bucket')
 def categorical_column_with_hash_bucket(key,
                                         hash_bucket_size,
                                         dtype=dtypes.string):
@@ -1248,6 +1435,7 @@ def categorical_column_with_hash_bucket(key,
   return HashedCategoricalColumn(key, hash_bucket_size, dtype)
 
 
+@tf_export(v1=['feature_column.categorical_column_with_vocabulary_file'])
 def categorical_column_with_vocabulary_file(key,
                                             vocabulary_file,
                                             vocabulary_size=None,
@@ -1325,6 +1513,97 @@ def categorical_column_with_vocabulary_file(key,
   Returns:
     A `CategoricalColumn` with a vocabulary file.
 
+  Raises:
+    ValueError: `vocabulary_file` is missing or cannot be opened.
+    ValueError: `vocabulary_size` is missing or < 1.
+    ValueError: `num_oov_buckets` is a negative integer.
+    ValueError: `num_oov_buckets` and `default_value` are both specified.
+    ValueError: `dtype` is neither string nor integer.
+  """
+  return categorical_column_with_vocabulary_file_v2(
+      key, vocabulary_file, vocabulary_size,
+      dtype, default_value,
+      num_oov_buckets)
+
+
+@tf_export('feature_column.categorical_column_with_vocabulary_file', v1=[])
+def categorical_column_with_vocabulary_file_v2(key,
+                                               vocabulary_file,
+                                               vocabulary_size=None,
+                                               dtype=dtypes.string,
+                                               default_value=None,
+                                               num_oov_buckets=0):
+  """A `CategoricalColumn` with a vocabulary file.
+
+  Use this when your inputs are in string or integer format, and you have a
+  vocabulary file that maps each value to an integer ID. By default,
+  out-of-vocabulary values are ignored. Use either (but not both) of
+  `num_oov_buckets` and `default_value` to specify how to include
+  out-of-vocabulary values.
+
+  For input dictionary `features`, `features[key]` is either `Tensor` or
+  `SparseTensor`. If `Tensor`, missing values can be represented by `-1` for int
+  and `''` for string, which will be dropped by this feature column.
+
+  Example with `num_oov_buckets`:
+  File '/us/states.txt' contains 50 lines, each with a 2-character U.S. state
+  abbreviation. All inputs with values in that file are assigned an ID 0-49,
+  corresponding to its line number. All other values are hashed and assigned an
+  ID 50-54.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=50,
+      num_oov_buckets=5)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction = linear_model(features, columns)
+  ```
+
+  Example with `default_value`:
+  File '/us/states.txt' contains 51 lines - the first line is 'XX', and the
+  other 50 each have a 2-character U.S. state abbreviation. Both a literal 'XX'
+  in input, and other values missing from the file, will be assigned ID 0. All
+  others are assigned the corresponding line number 1-50.
+
+  ```python
+  states = categorical_column_with_vocabulary_file(
+      key='states', vocabulary_file='/us/states.txt', vocabulary_size=51,
+      default_value=0)
+  columns = [states, ...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  linear_prediction, _, _ = linear_model(features, columns)
+  ```
+
+  And to make an embedding with either:
+
+  ```python
+  columns = [embedding_column(states, 3),...]
+  features = tf.parse_example(..., features=make_parse_example_spec(columns))
+  dense_tensor = input_layer(features, columns)
+  ```
+
+  Args:
+    key: A unique string identifying the input feature. It is used as the
+      column name and the dictionary key for feature parsing configs, feature
+      `Tensor` objects, and feature columns.
+    vocabulary_file: The vocabulary file name.
+    vocabulary_size: Number of the elements in the vocabulary. This must be no
+      greater than length of `vocabulary_file`, if less than length, later
+      values are ignored. If None, it is set to the length of `vocabulary_file`.
+    dtype: The type of features. Only string and integer types are supported.
+    default_value: The integer ID value to return for out-of-vocabulary feature
+      values, defaults to `-1`. This can not be specified with a positive
+      `num_oov_buckets`.
+    num_oov_buckets: Non-negative integer, the number of out-of-vocabulary
+      buckets. All out-of-vocabulary inputs will be assigned IDs in the range
+      `[vocabulary_size, vocabulary_size+num_oov_buckets)` based on a hash of
+      the input value. A positive `num_oov_buckets` can not be specified with
+      `default_value`.
+
+  Returns:
+    A `CategoricalColumn` with a vocabulary file.
+
   Raises:
     ValueError: `vocabulary_file` is missing or cannot be opened.
     ValueError: `vocabulary_size` is missing or < 1.
@@ -1367,8 +1646,12 @@ def categorical_column_with_vocabulary_file(key,
       dtype=dtype)
 
 
-def categorical_column_with_vocabulary_list(
-    key, vocabulary_list, dtype=None, default_value=-1, num_oov_buckets=0):
+@tf_export('feature_column.categorical_column_with_vocabulary_list')
+def categorical_column_with_vocabulary_list(key,
+                                            vocabulary_list,
+                                            dtype=None,
+                                            default_value=-1,
+                                            num_oov_buckets=0):
   """A `CategoricalColumn` with in-memory vocabulary.
 
   Use this when your inputs are in string or integer format, and you have an
@@ -1480,6 +1763,7 @@ def categorical_column_with_vocabulary_list(
       num_oov_buckets=num_oov_buckets)
 
 
+@tf_export('feature_column.categorical_column_with_identity')
 def categorical_column_with_identity(key, num_buckets, default_value=None):
   """A `CategoricalColumn` that returns identity values.
 
@@ -1547,6 +1831,7 @@ def categorical_column_with_identity(key, num_buckets, default_value=None):
       key=key, number_buckets=num_buckets, default_value=default_value)
 
 
+@tf_export('feature_column.indicator_column')
 def indicator_column(categorical_column):
   """Represents multi-hot representation of given categorical column.
 
@@ -1581,8 +1866,10 @@ def indicator_column(categorical_column):
   return IndicatorColumn(categorical_column)
 
 
-def weighted_categorical_column(
-    categorical_column, weight_feature_key, dtype=dtypes.float32):
+@tf_export('feature_column.weighted_categorical_column')
+def weighted_categorical_column(categorical_column,
+                                weight_feature_key,
+                                dtype=dtypes.float32):
   """Applies weight values to a `CategoricalColumn`.
 
   Use this when each of your sparse inputs has both an ID and a value. For
@@ -1655,6 +1942,7 @@ def weighted_categorical_column(
       dtype=dtype)
 
 
+@tf_export('feature_column.crossed_column')
 def crossed_column(keys, hash_bucket_size, hash_key=None):
   """Returns a column for performing crosses of categorical features.
 
@@ -2120,7 +2408,7 @@ def _create_categorical_column_weighted_sum(
     weight_tensor = sparse_ops.sparse_reshape(
         weight_tensor, [array_ops.shape(weight_tensor)[0], -1])
 
-  return _safe_embedding_lookup_sparse(
+  return embedding_ops.safe_embedding_lookup_sparse(
       weight_var,
       id_tensor,
       sparse_weights=weight_tensor,
@@ -2731,7 +3019,7 @@ class EmbeddingColumn(
       })
 
     # Return embedding lookup result.
-    return _safe_embedding_lookup_sparse(
+    return embedding_ops.safe_embedding_lookup_sparse(
         embedding_weights=embedding_weights,
         sparse_ids=sparse_ids,
         sparse_weights=sparse_weights,
@@ -2890,7 +3178,7 @@ class EmbeddingColumn(
 def _raise_shared_embedding_column_error():
   raise ValueError('SharedEmbeddingColumns are not supported in '
                    '`linear_model` or `input_layer`. Please use '
-                   '`FeatureLayer` or `LinearModel` instead.')
+                   '`DenseFeatures` or `LinearModel` instead.')
 
 
 class SharedEmbeddingColumnCreator(tracking.Checkpointable):
@@ -3002,7 +3290,7 @@ class SharedEmbeddingColumn(
       embedding_weights = self.shared_embedding_column_creator.embedding_weights
 
       # Return embedding lookup result.
-      return _safe_embedding_lookup_sparse(
+      return embedding_ops.safe_embedding_lookup_sparse(
           embedding_weights=embedding_weights,
           sparse_ids=sparse_ids,
           sparse_weights=sparse_weights,
@@ -3687,9 +3975,13 @@ class WeightedCategoricalColumn(
 
   def transform_feature(self, transformation_cache, state_manager):
     """Applies weights to tensor generated from `categorical_column`'."""
+    print('WeightedCategoricalColumn.transform_feature: ', self.name)
+    print('Weight feature key: ', self.weight_feature_key)
     weight_tensor = transformation_cache.get(self.weight_feature_key,
                                              state_manager)
+    print('Weight tensor before: ', weight_tensor)
     weight_tensor = self._transform_weight_tensor(weight_tensor)
+    print('Weight tensor after: ', weight_tensor)
     return (transformation_cache.get(self.categorical_column, state_manager),
             weight_tensor)
 
@@ -3703,7 +3995,9 @@ class WeightedCategoricalColumn(
 
   def get_sparse_tensors(self, transformation_cache, state_manager):
     """See `CategoricalColumn` base class."""
+    print('WeightedCategoricalColumn.get_sparse_tensors: ', self.name)
     tensors = transformation_cache.get(self, state_manager)
+    print('tensors[1]: ', tensors[1])
     return CategoricalColumn.IdWeightPair(tensors[0], tensors[1])
 
   @deprecation.deprecated(_FEATURE_COLUMN_DEPRECATION_DATE,
@@ -3898,142 +4192,6 @@ def _collect_leaf_level_keys(cross):
   return leaf_level_keys
 
 
-# TODO(zakaria): Move this to embedding_ops and make it public.
-def _safe_embedding_lookup_sparse(embedding_weights,
-                                  sparse_ids,
-                                  sparse_weights=None,
-                                  combiner='mean',
-                                  default_id=None,
-                                  name=None,
-                                  partition_strategy='div',
-                                  max_norm=None):
-  """Lookup embedding results, accounting for invalid IDs and empty features.
-
-  The partitioned embedding in `embedding_weights` must all be the same shape
-  except for the first dimension. The first dimension is allowed to vary as the
-  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
-  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
-  partitioner.
-
-  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
-  with non-positive weight. For an entry with no features, the embedding vector
-  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
-
-  The ids and weights may be multi-dimensional. Embeddings are always aggregated
-  along the last dimension.
-
-  Args:
-    embedding_weights:  A list of `P` float `Tensor`s or values representing
-        partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
-        created by partitioning along dimension 0.  The total unpartitioned
-        shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the
-        vocab size and `e_1, ..., e_m` are the embedding dimensions.
-    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
-        ids. `d_0` is typically batch size.
-    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
-        float weights corresponding to `sparse_ids`, or `None` if all weights
-        are be assumed to be 1.0.
-    combiner: A string specifying how to combine embedding results for each
-        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
-        the default.
-    default_id: The id to use for an entry with no features.
-    name: A name for this operation (optional).
-    partition_strategy: A string specifying the partitioning strategy.
-        Currently `"div"` and `"mod"` are supported. Default is `"div"`.
-    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
-        combining.
-
-
-  Returns:
-    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
-
-  Raises:
-    ValueError: if `embedding_weights` is empty.
-  """
-  if embedding_weights is None:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-  if isinstance(embedding_weights, variables.PartitionedVariable):
-    embedding_weights = list(embedding_weights)  # get underlying Variables.
-  if not isinstance(embedding_weights, list):
-    embedding_weights = [embedding_weights]
-  if len(embedding_weights) < 1:
-    raise ValueError('Missing embedding_weights %s.' % embedding_weights)
-
-  dtype = sparse_weights.dtype if sparse_weights is not None else None
-  # TODO(rohanj): Look into removing this convert_to_tensor call.
-  embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
-  ]
-
-  with ops.name_scope(name, 'embedding_lookup',
-                      embedding_weights + [sparse_ids,
-                                           sparse_weights]) as scope:
-    # Reshape higher-rank sparse ids and weights to linear segment ids.
-    original_shape = sparse_ids.dense_shape
-    original_rank_dim = tensor_shape.dimension_value(
-        sparse_ids.dense_shape.get_shape()[0])
-    original_rank = (
-        array_ops.size(original_shape)
-        if original_rank_dim is None
-        else original_rank_dim)
-    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
-        math_ops.reduce_prod(
-            array_ops.slice(original_shape, [0], [original_rank - 1])),
-        array_ops.gather(original_shape, original_rank - 1)])
-    if sparse_weights is not None:
-      sparse_weights = sparse_tensor_lib.SparseTensor(
-          sparse_ids.indices,
-          sparse_weights.values, sparse_ids.dense_shape)
-
-    # Prune invalid ids and weights.
-    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
-    if combiner != 'sum':
-      sparse_ids, sparse_weights = _prune_invalid_weights(
-          sparse_ids, sparse_weights)
-
-    # Fill in dummy values for empty features, if necessary.
-    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
-                                                                 default_id or
-                                                                 0)
-    if sparse_weights is not None:
-      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)
-
-    result = embedding_ops.embedding_lookup_sparse(
-        embedding_weights,
-        sparse_ids,
-        sparse_weights,
-        combiner=combiner,
-        partition_strategy=partition_strategy,
-        name=None if default_id is None else scope,
-        max_norm=max_norm)
-
-    if default_id is None:
-      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
-      # for use in Select.
-      is_row_empty = array_ops.tile(
-          array_ops.reshape(is_row_empty, [-1, 1]),
-          array_ops.stack([1, array_ops.shape(result)[1]]))
-
-      result = array_ops.where(is_row_empty,
-                               array_ops.zeros_like(result),
-                               result,
-                               name=scope)
-
-    # Reshape back from linear ids back into higher-dimensional dense result.
-    final_result = array_ops.reshape(
-        result,
-        array_ops.concat([
-            array_ops.slice(
-                math_ops.cast(original_shape, dtypes.int32), [0],
-                [original_rank - 1]),
-            array_ops.slice(array_ops.shape(result), [1], [-1])
-        ], 0))
-    final_result.set_shape(tensor_shape.unknown_shape(
-        (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
-            result.get_shape()[1:]))
-    return final_result
-
-
 def _prune_invalid_ids(sparse_ids, sparse_weights):
   """Prune invalid IDs (< 0) from the input ids and weights."""
   is_id_valid = math_ops.greater_equal(sparse_ids.values, 0)
@@ -4089,10 +4247,14 @@ class IndicatorColumn(
           sp_ids=id_tensor,
           sp_values=weight_tensor,
           vocab_size=int(self._variable_shape[-1]))
-      # Remove (?, -1) index
+      # Remove (?, -1) index.
       weighted_column = sparse_ops.sparse_slice(weighted_column, [0, 0],
                                                 weighted_column.dense_shape)
-      return sparse_ops.sparse_tensor_to_dense(weighted_column)
+      # Use scatter_nd to merge duplicated indices if existed,
+      # instead of sparse_tensor_to_dense.
+      return array_ops.scatter_nd(weighted_column.indices,
+                                  weighted_column.values,
+                                  weighted_column.dense_shape)
 
     dense_id_tensor = sparse_ops.sparse_tensor_to_dense(
         id_tensor, default_value=-1)
@@ -4534,7 +4696,10 @@ def deserialize_feature_column(config,
         'Expected FeatureColumn class, instead found: {}'.format(cls))
 
   # Always deserialize the FeatureColumn, in order to get the name.
-  new_instance = cls._from_config(cls_config, columns_by_name=columns_by_name)  # pylint: disable=protected-access
+  new_instance = cls._from_config(  # pylint: disable=protected-access
+      cls_config,
+      custom_objects=custom_objects,
+      columns_by_name=columns_by_name)
 
   # If the name already exists, re-use the column from columns_by_name,
   # (new_instance remains unused).
diff --git a/tensorflow/python/feature_column/feature_column_v2_test.py b/tensorflow/python/feature_column/feature_column_v2_test.py
index a26b8600568fb3ab11497bef78a5f38b542daf43..0755c0b6ac23f5ad73df855ab2bcbce11fec2653 100644
--- a/tensorflow/python/feature_column/feature_column_v2_test.py
+++ b/tensorflow/python/feature_column/feature_column_v2_test.py
@@ -31,7 +31,6 @@ from tensorflow.python import keras
 from tensorflow.python.client import session
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.estimator.inputs import numpy_io
 from tensorflow.python.feature_column import feature_column as fc_old
 from tensorflow.python.feature_column import feature_column_v2 as fc
 from tensorflow.python.framework import constant_op
@@ -50,6 +49,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.training import coordinator
 from tensorflow.python.training import queue_runner_impl
 from tensorflow.python.training import rmsprop
+from tensorflow_estimator.python.estimator.inputs import numpy_io
 
 
 def _initialized_session(config=None):
@@ -218,6 +218,7 @@ class LazyColumnTest(test.TestCase):
         TypeError, '"key" must be either a "str" or "FeatureColumn".'):
       transformation_cache.get(NotAFeatureColumn(), None)
 
+  @test_util.run_deprecated_v1
   def test_expand_dim_rank_1_sparse_tensor_empty_batch(self):
     # empty 1-D sparse tensor:
     transformation_cache = fc.FeatureTransformationCache(
@@ -228,15 +229,16 @@ class LazyColumnTest(test.TestCase):
                     dense_shape=[0],
                     values=np.array([]))
         })
-    with self.cached_session():
-      spv = transformation_cache.get('a', None).eval()
-      self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
-      self.assertAllEqual(
-          np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
+
+    spv = self.evaluate(transformation_cache.get('a', None))
+    self.assertAllEqual(np.array([0, 1], dtype=np.int64), spv.dense_shape)
+    self.assertAllEqual(
+        np.reshape(np.array([], dtype=np.int64), (0, 2)), spv.indices)
 
 
 class NumericColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     a = fc.numeric_column('aaa')
     self.assertEqual('aaa', a.key)
@@ -315,59 +317,67 @@ class NumericColumnTest(test.TestCase):
         'aaa': parsing_ops.FixedLenFeature((2, 3), dtype=dtypes.int32)
     }, a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_no_default_value(self):
     price = fc.numeric_column('price', shape=[2])
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+
+  @test_util.run_deprecated_v1
   def test_parse_example_with_default_value(self):
     price = fc.numeric_column('price', shape=[2], default_value=11.)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
-    no_data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'something_else':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
+    no_data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'something_else':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString(),
                     no_data.SerializeToString()],
-        features=fc.make_parse_example_spec([price]))
+        features=fc.make_parse_example_spec_v2([price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.], [11., 11.]], features['price'].eval())
+
+    self.assertAllEqual([[20., 110.], [11., 11.]],
+                        self.evaluate(features['price']))
 
   def test_normalizer_fn_must_be_callable(self):
     with self.assertRaisesRegexp(TypeError, 'must be a callable'):
       fc.numeric_column('price', normalizer_fn='NotACallable')
 
+  @test_util.run_deprecated_v1
   def test_normalizer_fn_transform_feature(self):
 
     def _increment_two(input_tensor):
       return input_tensor + 2.
 
     price = fc.numeric_column('price', shape=[2], normalizer_fn=_increment_two)
-    output = fc._transform_features({
+    output = fc._transform_features_v2({
         'price': [[1., 2.], [5., 6.]]
     }, [price], None)
-    with self.cached_session():
-      self.assertAllEqual([[3., 4.], [7., 8.]], output[price].eval())
 
+    self.assertAllEqual([[3., 4.], [7., 8.]], self.evaluate(output[price]))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
 
     def _increment_two(input_tensor):
@@ -391,6 +401,7 @@ class NumericColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       price.transform_feature(transformation_cache, None)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('aaa', shape=[1, 2], default_value=[[3., 2.]])
     a_copy = copy.deepcopy(a)
@@ -403,6 +414,7 @@ class NumericColumnTest(test.TestCase):
         'aaa', shape=[1, 2], default_value=np.array([[3., 2.]]))
     self.assertEqual(a.default_value, ((3., 2.),))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
@@ -411,11 +423,11 @@ class NumericColumnTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
   def test_old_linear_model(self):
     price = fc.numeric_column('price')
@@ -425,12 +437,13 @@ class NumericColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[10.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [50.]], self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_serialization(self):
 
     def _increment_two(input_tensor):
@@ -471,17 +484,17 @@ class BucketizedColumnTest(test.TestCase):
 
   def test_invalid_boundaries(self):
     a = fc.numeric_column('aaa')
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=None)
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=1.)
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=[1, 0])
-    with self.assertRaisesRegexp(
-        ValueError, 'boundaries must be a sorted list'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'boundaries must be a sorted list'):
       fc.bucketized_column(a, boundaries=[1, 1])
 
   def test_name(self):
@@ -491,7 +504,7 @@ class BucketizedColumnTest(test.TestCase):
     self.assertEqual('aaa_bucketized', b.name)
 
   def test_is_v2_column_old_numeric(self):
-    a = fc_old.numeric_column('aaa', dtype=dtypes.int32)
+    a = fc_old._numeric_column('aaa', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     self.assertFalse(b._is_v2_column)
     self.assertEqual('aaa_bucketized', b.name)
@@ -515,32 +528,38 @@ class BucketizedColumnTest(test.TestCase):
     # Column 'aaa` has shape [2] times three buckets -> num_buckets=6.
     self.assertEqual(6, b.num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([bucketized_price]))
+        features=fc.make_parse_example_spec_v2([bucketized_price]))
     self.assertIn('price', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
-      transformed_tensor = fc._transform_features({
+      transformed_tensor = fc._transform_features_v2({
           'price': [[-1., 1.], [5., 6.]]
       }, [bucketized_price], None)
-      with _initialized_session():
-        self.assertAllEqual([[0, 1], [3, 4]],
-                            transformed_tensor[bucketized_price].eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllEqual([[0, 1], [3, 4]],
+                          self.evaluate(transformed_tensor[bucketized_price]))
 
   def test_get_dense_tensor_one_input_value(self):
     """Tests _get_dense_tensor() for input with shape=[1]."""
@@ -550,16 +569,17 @@ class BucketizedColumnTest(test.TestCase):
       transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1.], [1.], [5.], [6.]]
       })
-      with _initialized_session():
-        bucketized_price_tensor = bucketized_price.get_dense_tensor(
-            transformation_cache, None)
-        self.assertAllClose(
-            # One-hot tensor.
-            [[[1., 0., 0., 0., 0.]],
-             [[0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.]],
-             [[0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      bucketized_price_tensor = bucketized_price.get_dense_tensor(
+          transformation_cache, None)
+      self.assertAllClose(
+          # One-hot tensor.
+          [[[1., 0., 0., 0., 0.]], [[0., 1., 0., 0., 0.]],
+           [[0., 0., 0., 1., 0.]], [[0., 0., 0., 0., 1.]]],
+          self.evaluate(bucketized_price_tensor))
 
   def test_get_dense_tensor_two_input_values(self):
     """Tests _get_dense_tensor() for input with shape=[2]."""
@@ -569,14 +589,17 @@ class BucketizedColumnTest(test.TestCase):
       transformation_cache = fc.FeatureTransformationCache({
           'price': [[-1., 1.], [5., 6.]]
       })
-      with _initialized_session():
-        bucketized_price_tensor = bucketized_price.get_dense_tensor(
-            transformation_cache, None)
-        self.assertAllClose(
-            # One-hot tensor.
-            [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
-             [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
-            bucketized_price_tensor.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      bucketized_price_tensor = bucketized_price.get_dense_tensor(
+          transformation_cache, None)
+      self.assertAllClose(
+          # One-hot tensor.
+          [[[1., 0., 0., 0., 0.], [0., 1., 0., 0., 0.]],
+           [[0., 0., 0., 1., 0.], [0., 0., 0., 0., 1.]]],
+          self.evaluate(bucketized_price_tensor))
 
   def test_get_sparse_tensors_one_input_value(self):
     """Tests _get_sparse_tensors() for input with shape=[1]."""
@@ -591,8 +614,8 @@ class BucketizedColumnTest(test.TestCase):
             transformation_cache, None)
         self.assertIsNone(id_weight_pair.weight_tensor)
         id_tensor_value = sess.run(id_weight_pair.id_tensor)
-        self.assertAllEqual(
-            [[0, 0], [1, 0], [2, 0], [3, 0]], id_tensor_value.indices)
+        self.assertAllEqual([[0, 0], [1, 0], [2, 0], [3, 0]],
+                            id_tensor_value.indices)
         self.assertAllEqual([0, 1, 3, 4], id_tensor_value.values)
         self.assertAllEqual([4, 1], id_tensor_value.dense_shape)
 
@@ -609,8 +632,8 @@ class BucketizedColumnTest(test.TestCase):
             transformation_cache, None)
         self.assertIsNone(id_weight_pair.weight_tensor)
         id_tensor_value = sess.run(id_weight_pair.id_tensor)
-        self.assertAllEqual(
-            [[0, 0], [0, 1], [1, 0], [1, 1]], id_tensor_value.indices)
+        self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1]],
+                            id_tensor_value.indices)
         # Values 0-4 correspond to the first column of the input price.
         # Values 5-9 correspond to the second column of the input price.
         self.assertAllEqual([0, 6, 3, 9], id_tensor_value.values)
@@ -627,6 +650,7 @@ class BucketizedColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'must be a Tensor'):
       bucketized_price.transform_feature(transformation_cache, None)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('aaa', shape=[2])
     a_bucketized = fc.bucketized_column(a, boundaries=[0, 1])
@@ -645,20 +669,23 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
-        self.assertAllClose(
-            [[0.], [0.], [0.], [0.], [0.]], bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
-        sess.run(bucketized_price_var.assign(
-            [[10.], [20.], [30.], [40.], [50.]]))
+        self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
@@ -670,24 +697,24 @@ class BucketizedColumnTest(test.TestCase):
       predictions = model(features)
       bucketized_price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
-        sess.run(bucketized_price_var.assign(
-            [[10.], [20.], [30.], [40.], [50.],
-             [60.], [70.], [80.], [90.], [100.]]))
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
+        sess.run(
+            bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
+                                         [60.], [70.], [80.], [90.], [100.]]))
         # 1st example:
         #   price -1. is in the 0th bucket, whose weight is 10.
         #   price 1. is in the 6th bucket, whose weight is 70.
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_old_linear_model_one_input_value(self):
     """Tests linear_model() for input with shape=[1]."""
@@ -699,20 +726,23 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
   def test_old_linear_model_two_input_values(self):
     """Tests linear_model() for input with shape=[2]."""
@@ -724,12 +754,12 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight per bucket per input column, all initialized to zero.
         self.assertAllClose(
             [[0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.], [0.]],
-            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.],
                                          [60.], [70.], [80.], [90.], [100.]]))
@@ -739,13 +769,13 @@ class BucketizedColumnTest(test.TestCase):
         # 2nd example:
         #   price 5. is in the 3rd bucket, whose weight is 40.
         #   price 6. is in the 9th bucket, whose weight is 100.
-        self.assertAllClose([[80.], [140.]], predictions.eval())
+        self.assertAllClose([[80.], [140.]], self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[81.], [141.]], predictions.eval())
+        self.assertAllClose([[81.], [141.]], self.evaluate(predictions))
 
   def test_old_linear_model_one_input_value_old_numeric(self):
     """Tests linear_model() for input with shape=[1]."""
-    price = fc_old.numeric_column('price', shape=[1])
+    price = fc_old._numeric_column('price', shape=[1])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
     with ops.Graph().as_default():
       features = {'price': [[-1.], [1.], [5.], [6.]]}
@@ -753,21 +783,25 @@ class BucketizedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       bucketized_price_var = get_linear_model_column_var(bucketized_price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         # One weight variable per bucket, all initialized to zero.
         self.assertAllClose([[0.], [0.], [0.], [0.], [0.]],
-                            bucketized_price_var.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], predictions.eval())
+                            self.evaluate(bucketized_price_var))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(predictions))
         sess.run(
             bucketized_price_var.assign([[10.], [20.], [30.], [40.], [50.]]))
         # price -1. is in the 0th bucket, whose weight is 10.
         # price 1. is in the 1st bucket, whose weight is 20.
         # price 5. is in the 3rd bucket, whose weight is 40.
         # price 6. is in the 4th bucket, whose weight is 50.
-        self.assertAllClose([[10.], [20.], [40.], [50.]], predictions.eval())
+        self.assertAllClose([[10.], [20.], [40.], [50.]],
+                            self.evaluate(predictions))
         sess.run(bias.assign([1.]))
-        self.assertAllClose([[11.], [21.], [41.], [51.]], predictions.eval())
+        self.assertAllClose([[11.], [21.], [41.], [51.]],
+                            self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 2, 4, 6])
@@ -800,6 +834,7 @@ class BucketizedColumnTest(test.TestCase):
 
 class HashedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
     self.assertEqual('aaa', a.name)
@@ -827,6 +862,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_hash_bucket('aaa', 10, dtype=dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_hash_bucket('aaa', 10)
     for column in (original, copy.deepcopy(original)):
@@ -847,45 +883,50 @@ class HashedCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_hash_bucket('aaa', 10)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_strings_should_be_hashed(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     wire_tensor = sparse_tensor.SparseTensor(
         values=['omar', 'stringer', 'marlo'],
         indices=[[0, 0], [1, 0], [1, 1]],
         dense_shape=[2, 2])
-    outputs = fc._transform_features({
+    outputs = fc._transform_features_v2({
         'wire': wire_tensor
     }, [hashed_sparse], None)
     output = outputs[hashed_sparse]
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [6, 4, 1]
-    with self.cached_session():
-      self.assertEqual(dtypes.int64, output.values.dtype)
-      self.assertAllEqual(expected_values, output.values.eval())
-      self.assertAllEqual(wire_tensor.indices.eval(), output.indices.eval())
-      self.assertAllEqual(wire_tensor.dense_shape.eval(),
-                          output.dense_shape.eval())
+
+    self.assertEqual(dtypes.int64, output.values.dtype)
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+    self.assertAllEqual(
+        self.evaluate(wire_tensor.indices), self.evaluate(output.indices))
+    self.assertAllEqual(
+        self.evaluate(wire_tensor.dense_shape),
+        self.evaluate(output.dense_shape))
 
   def test_tensor_dtype_should_be_string_or_integer(self):
     string_fc = fc.categorical_column_with_hash_bucket(
@@ -895,17 +936,11 @@ class HashedCategoricalColumnTest(test.TestCase):
     float_fc = fc.categorical_column_with_hash_bucket(
         'a_float', 10, dtype=dtypes.string)
     int_tensor = sparse_tensor.SparseTensor(
-        values=[101],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
+        values=[101], indices=[[0, 0]], dense_shape=[1, 1])
     string_tensor = sparse_tensor.SparseTensor(
-        values=['101'],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
+        values=['101'], indices=[[0, 0]], dense_shape=[1, 1])
     float_tensor = sparse_tensor.SparseTensor(
-        values=[101.],
-        indices=[[0, 0]],
-        dense_shape=[1, 1])
+        values=[101.], indices=[[0, 0]], dense_shape=[1, 1])
     transformation_cache = fc.FeatureTransformationCache({
         'a_int': int_tensor,
         'a_string': string_tensor,
@@ -925,6 +960,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'dtype must be compatible'):
       transformation_cache.get(hashed_sparse, None)
 
+  @test_util.run_deprecated_v1
   def test_ints_should_be_hashed(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
@@ -936,9 +972,10 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.cached_session():
-      self.assertAllEqual(expected_values, output.values.eval())
 
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+
+  @test_util.run_deprecated_v1
   def test_int32_64_is_compatible(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket(
         'wire', 10, dtype=dtypes.int64)
@@ -950,9 +987,10 @@ class HashedCategoricalColumnTest(test.TestCase):
     output = transformation_cache.get(hashed_sparse, None)
     # Check exact hashed output. If hashing changes this test will break.
     expected_values = [3, 7, 5]
-    with self.cached_session():
-      self.assertAllEqual(expected_values, output.values.eval())
 
+    self.assertAllEqual(expected_values, self.evaluate(output.values))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     transformation_cache = fc.FeatureTransformationCache({
@@ -968,6 +1006,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     hashed_sparse = fc.categorical_column_with_hash_bucket('wire', 10)
     transformation_cache = fc.FeatureTransformationCache({
@@ -979,6 +1018,7 @@ class HashedCategoricalColumnTest(test.TestCase):
     self.assertEqual(
         transformation_cache.get(hashed_sparse, None), id_weight_pair.id_tensor)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(4, wire_column.num_buckets)
@@ -992,14 +1032,17 @@ class HashedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 3: wire_var[3] = 4
-        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 3: wire_var[3] = 4
+      # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+      self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
@@ -1014,15 +1057,19 @@ class HashedCategoricalColumnTest(test.TestCase):
       }, (wire_column,))
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 3: wire_var[3] = 4
-        # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
-        self.assertAllClose(((4.,), (6.,)), predictions.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 3: wire_var[3] = 4
+      # 'skywalker' -> 2, 'omar' -> 2: wire_var[2] + wire_var[2] = 3+3 = 6
+      self.assertAllClose(((4.,), (6.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     wire_column = fc.categorical_column_with_hash_bucket('wire', 4)
     self.assertEqual(['wire'], wire_column.parents)
@@ -1041,13 +1088,13 @@ class HashedCategoricalColumnTest(test.TestCase):
 class CrossedColumnTest(test.TestCase):
 
   def test_keys_empty(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'keys must be a list with length > 1'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'keys must be a list with length > 1'):
       fc.crossed_column([], 10)
 
   def test_keys_length_one(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'keys must be a list with length > 1'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'keys must be a list with length > 1'):
       fc.crossed_column(['a'], 10)
 
   def test_key_type_unsupported(self):
@@ -1060,18 +1107,15 @@ class CrossedColumnTest(test.TestCase):
           ['a', fc.categorical_column_with_hash_bucket('c', 10)], 10)
 
   def test_hash_bucket_size_negative(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], -1)
 
   def test_hash_bucket_size_zero(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], 0)
 
   def test_hash_bucket_size_none(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'hash_bucket_size must be > 1'):
+    with self.assertRaisesRegexp(ValueError, 'hash_bucket_size must be > 1'):
       fc.crossed_column(['a', 'c'], None)
 
   def test_name(self):
@@ -1085,7 +1129,7 @@ class CrossedColumnTest(test.TestCase):
     self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2.name)
 
   def test_is_v2_column(self):
-    a = fc_old.numeric_column('a', dtype=dtypes.int32)
+    a = fc_old._numeric_column('a', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
     self.assertTrue(crossed1._is_v2_column)
@@ -1127,65 +1171,76 @@ class CrossedColumnTest(test.TestCase):
     crossed = fc.crossed_column([b, 'c'], 15)
     self.assertEqual(15, crossed.num_buckets)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.numeric_column('a', dtype=dtypes.int32)
     b = fc.bucketized_column(a, boundaries=[0, 1])
     crossed1 = fc.crossed_column(['d1', 'd2'], 10)
     crossed2 = fc.crossed_column([b, 'c', crossed1], 15, hash_key=5)
     crossed2_copy = copy.deepcopy(crossed2)
-    self.assertEqual('a_bucketized_X_c_X_d1_X_d2', crossed2_copy.name,)
+    self.assertEqual(
+        'a_bucketized_X_c_X_d1_X_d2',
+        crossed2_copy.name,
+    )
     self.assertEqual(15, crossed2_copy.hash_bucket_size)
     self.assertEqual(5, crossed2_copy.hash_key)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
     price_cross_wire = fc.crossed_column([bucketized_price, 'wire'], 10)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'price':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[20., 110.])),
-            'wire':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'price':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[20., 110.])),
+                'wire':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([price_cross_wire]))
+        features=fc.make_parse_example_spec_v2([price_cross_wire]))
     self.assertIn('price', features)
     self.assertIn('wire', features)
-    with self.cached_session():
-      self.assertAllEqual([[20., 110.]], features['price'].eval())
-      wire_sparse = features['wire']
-      self.assertAllEqual([[0, 0], [0, 1]], wire_sparse.indices.eval())
-      # Use byte constants to pass the open-source test.
-      self.assertAllEqual([b'omar', b'stringer'], wire_sparse.values.eval())
-      self.assertAllEqual([1, 2], wire_sparse.dense_shape.eval())
 
+    self.assertAllEqual([[20., 110.]], self.evaluate(features['price']))
+    wire_sparse = features['wire']
+    self.assertAllEqual([[0, 0], [0, 1]], self.evaluate(wire_sparse.indices))
+    # Use byte constants to pass the open-source test.
+    self.assertAllEqual([b'omar', b'stringer'],
+                        self.evaluate(wire_sparse.values))
+    self.assertAllEqual([1, 2], self.evaluate(wire_sparse.dense_shape))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     price = fc.numeric_column('price', shape=[2])
     bucketized_price = fc.bucketized_column(price, boundaries=[0, 50])
     hash_bucket_size = 10
-    price_cross_wire = fc.crossed_column(
-        [bucketized_price, 'wire'], hash_bucket_size)
+    price_cross_wire = fc.crossed_column([bucketized_price, 'wire'],
+                                         hash_bucket_size)
     features = {
-        'price': constant_op.constant([[1., 2.], [5., 6.]]),
-        'wire': sparse_tensor.SparseTensor(
-            values=['omar', 'stringer', 'marlo'],
-            indices=[[0, 0], [1, 0], [1, 1]],
-            dense_shape=[2, 2]),
+        'price':
+            constant_op.constant([[1., 2.], [5., 6.]]),
+        'wire':
+            sparse_tensor.SparseTensor(
+                values=['omar', 'stringer', 'marlo'],
+                indices=[[0, 0], [1, 0], [1, 1]],
+                dense_shape=[2, 2]),
     }
-    outputs = fc._transform_features(features, [price_cross_wire], None)
+    outputs = fc._transform_features_v2(features, [price_cross_wire], None)
     output = outputs[price_cross_wire]
-    with self.cached_session() as sess:
-      output_val = sess.run(output)
-      self.assertAllEqual(
-          [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]], output_val.indices)
-      for val in output_val.values:
-        self.assertIn(val, list(range(hash_bucket_size)))
-      self.assertAllEqual([2, 4], output_val.dense_shape)
-
+    output_val = self.evaluate(output)
+    self.assertAllEqual([[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [1, 3]],
+                        output_val.indices)
+    for val in output_val.values:
+      self.assertIn(val, list(range(hash_bucket_size)))
+    self.assertAllEqual([2, 4], output_val.dense_shape)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
     b = fc.bucketized_column(a, boundaries=(0, 1))
@@ -1212,19 +1267,21 @@ class CrossedColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
       })
       id_weight_pair = crossed2.get_sparse_tensors(transformation_cache, None)
-      with _initialized_session():
-        id_tensor_eval = id_weight_pair.id_tensor.eval()
-        self.assertAllEqual(
-            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
-             (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
-             (1, 14), (1, 15)),
-            id_tensor_eval.indices)
-        # Check exact hashed output. If hashing changes this test will break.
-        # All values are within [0, hash_bucket_size).
-        expected_values = (
-            6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0, 10, 11)
-        self.assertAllEqual(expected_values, id_tensor_eval.values)
-        self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      id_tensor_eval = self.evaluate(id_weight_pair.id_tensor)
+      self.assertAllEqual(
+          ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3), (1, 4), (1, 5),
+           (1, 6), (1, 7), (1, 8), (1, 9), (1, 10), (1, 11), (1, 12), (1, 13),
+           (1, 14), (1, 15)), id_tensor_eval.indices)
+      # Check exact hashed output. If hashing changes this test will break.
+      # All values are within [0, hash_bucket_size).
+      expected_values = (6, 14, 0, 13, 8, 8, 10, 12, 2, 0, 1, 9, 8, 12, 2, 0,
+                         10, 11)
+      self.assertAllEqual(expected_values, id_tensor_eval.values)
+      self.assertAllEqual((2, 16), id_tensor_eval.dense_shape)
 
   def test_get_sparse_tensors_simple(self):
     """Same as test_get_sparse_tensors, but with simpler values."""
@@ -1242,17 +1299,20 @@ class CrossedColumnTest(test.TestCase):
                   dense_shape=(2, 2)),
       })
       id_weight_pair = crossed.get_sparse_tensors(transformation_cache, None)
-      with _initialized_session():
-        id_tensor_eval = id_weight_pair.id_tensor.eval()
-        self.assertAllEqual(
-            ((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
-            id_tensor_eval.indices)
-        # Check exact hashed output. If hashing changes this test will break.
-        # All values are within [0, hash_bucket_size).
-        expected_values = (1, 0, 1, 3, 4, 2)
-        self.assertAllEqual(expected_values, id_tensor_eval.values)
-        self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      id_tensor_eval = self.evaluate(id_weight_pair.id_tensor)
+      self.assertAllEqual(((0, 0), (0, 1), (1, 0), (1, 1), (1, 2), (1, 3)),
+                          id_tensor_eval.indices)
+      # Check exact hashed output. If hashing changes this test will break.
+      # All values are within [0, hash_bucket_size).
+      expected_values = (1, 0, 1, 3, 4, 2)
+      self.assertAllEqual(expected_values, id_tensor_eval.values)
+      self.assertAllEqual((2, 4), id_tensor_eval.dense_shape)
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     """Tests linear_model.
 
@@ -1274,15 +1334,15 @@ class CrossedColumnTest(test.TestCase):
       })
       crossed_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(
-            ((0.,), (0.,), (0.,), (0.,), (0.,)), crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
+        self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_linear_model_with_weights(self):
 
@@ -1301,10 +1361,11 @@ class CrossedColumnTest(test.TestCase):
       @property
       def parse_example_spec(self):
         return {
-            self.name: parsing_ops.VarLenFeature(dtypes.int32),
-            '{}_weights'.format(self.name): parsing_ops.VarLenFeature(
-                dtypes.float32),
-            }
+            self.name:
+                parsing_ops.VarLenFeature(dtypes.int32),
+            '{}_weights'.format(self.name):
+                parsing_ops.VarLenFeature(dtypes.float32),
+        }
 
       @property
       def num_buckets(self):
@@ -1367,15 +1428,15 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
   def test_old_linear_model_with_weights(self):
 
@@ -1461,7 +1522,7 @@ class CrossedColumnTest(test.TestCase):
 
     Uses data from test_get_sparse_tesnsors_simple.
     """
-    a = fc_old.numeric_column('a', dtype=dtypes.int32, shape=(2,))
+    a = fc_old._numeric_column('a', dtype=dtypes.int32, shape=(2,))
     b = fc.bucketized_column(a, boundaries=(0, 1))
     crossed = fc.crossed_column([b, 'c'], hash_bucket_size=5, hash_key=5)
     with ops.Graph().as_default():
@@ -1477,16 +1538,17 @@ class CrossedColumnTest(test.TestCase):
       bias = get_linear_model_bias()
       crossed_var = get_linear_model_column_var(crossed)
       with _initialized_session() as sess:
-        self.assertAllClose((0.,), bias.eval())
+        self.assertAllClose((0.,), self.evaluate(bias))
         self.assertAllClose(((0.,), (0.,), (0.,), (0.,), (0.,)),
-                            crossed_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
+                            self.evaluate(crossed_var))
+        self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
         sess.run(crossed_var.assign(((1.,), (2.,), (3.,), (4.,), (5.,))))
         # Expected ids after cross = (1, 0, 1, 3, 4, 2)
-        self.assertAllClose(((3.,), (14.,)), predictions.eval())
+        self.assertAllClose(((3.,), (14.,)), self.evaluate(predictions))
         sess.run(bias.assign((.1,)))
-        self.assertAllClose(((3.1,), (14.1,)), predictions.eval())
+        self.assertAllClose(((3.1,), (14.1,)), self.evaluate(predictions))
 
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     a = fc.numeric_column('a', dtype=dtypes.int32, shape=(2,))
     b = fc.bucketized_column(a, boundaries=(0, 1))
@@ -1528,7 +1590,6 @@ class CrossedColumnTest(test.TestCase):
     self.assertIs(b, new_crossed.keys[0])
 
 
-
 class LinearModelTest(test.TestCase):
 
   def test_raises_if_empty_feature_columns(self):
@@ -1581,7 +1642,7 @@ class LinearModelTest(test.TestCase):
       features = [[1.], [5.]]
       model = fc.LinearModel([price])
       with self.assertRaisesRegexp(ValueError, 'We expected a dictionary here'):
-        predictions = model(features)
+        model(features)
 
   def test_dense_bias(self):
     price = fc.numeric_column('price')
@@ -1591,10 +1652,10 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1608,11 +1669,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1630,7 +1692,7 @@ class LinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -1682,10 +1744,11 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       dense_and_sparse_column_var, bias = model.variables
       with _initialized_session() as sess:
-        sess.run(dense_and_sparse_column_var.assign(
-            [[10.], [100.], [1000.], [10000.]]))
+        sess.run(
+            dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
+                                                [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
@@ -1695,12 +1758,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1714,15 +1777,15 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
-            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.], [
-                1000., 1100., 1200.
-            ], [10000., 11000., 12000.]]))
+            wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
+                                  [1000., 1100., 1200.],
+                                  [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -1732,9 +1795,9 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1749,7 +1812,7 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       wire_cast_var, _ = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -1772,7 +1835,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -1793,7 +1856,7 @@ class LinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
@@ -1803,12 +1866,12 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -1828,32 +1891,29 @@ class LinearModelTest(test.TestCase):
       predictions = model(features)
       price_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       model = fc.LinearModel([price1, price2])
       predictions = model(features)
       price1_var, price2_var, bias = model.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_dense_trainable_default(self):
     price = fc.numeric_column('price')
@@ -2046,6 +2106,7 @@ class LinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_numpy_input_fn(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -2078,11 +2139,13 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [100 - 10 + 5.]],
+                          self.evaluate(net))
 
       coord.request_stop()
       coord.join(threads)
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -2096,11 +2159,16 @@ class LinearModelTest(test.TestCase):
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([-1., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                -1.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
@@ -2114,8 +2182,10 @@ class LinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -2140,9 +2210,7 @@ class LinearModelTest(test.TestCase):
 
     price_data = np.array([-1., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array(['US', 'CA'])
 
     model = fc.LinearModel([price_buckets, body_style, country])
@@ -2162,6 +2230,7 @@ class LinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
     features = {
@@ -2197,14 +2266,14 @@ class LinearModelTest(test.TestCase):
       price_var1, bias1 = model1.variables
       price_var2, bias2 = model2.variables
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
 
 class OldLinearModelTest(test.TestCase):
@@ -2272,10 +2341,10 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
         sess.run(price_var.assign([[10.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions))
 
   def test_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2289,11 +2358,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.], [0.], [0.]], wire_cast_var.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.], [0.], [0.]],
+                            self.evaluate(wire_cast_var))
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_bias(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2312,7 +2382,7 @@ class OldLinearModelTest(test.TestCase):
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
         sess.run(price_var.assign([[10.]]))
-        self.assertAllClose([[1015.], [10065.]], predictions.eval())
+        self.assertAllClose([[1015.], [10065.]], self.evaluate(predictions))
 
   def test_dense_and_sparse_column(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -2394,7 +2464,7 @@ class OldLinearModelTest(test.TestCase):
             dense_and_sparse_column_var.assign([[10.], [100.], [1000.],
                                                 [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [10015.]], predictions.eval())
+        self.assertAllClose([[1005.], [10015.]], self.evaluate(predictions))
 
   def test_dense_multi_output(self):
     price = fc.numeric_column('price')
@@ -2404,12 +2474,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((1, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((1, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[10., 100., 1000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[15., 106., 1007.], [55., 506., 5007.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_sparse_multi_output(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2423,15 +2493,15 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((4, 3)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((4, 3)), self.evaluate(wire_cast_var))
         sess.run(
             wire_cast_var.assign([[10., 11., 12.], [100., 110., 120.],
                                   [1000., 1100., 1200.],
                                   [10000., 11000., 12000.]]))
         sess.run(bias.assign([5., 6., 7.]))
         self.assertAllClose([[1005., 1106., 1207.], [10015., 11017., 12019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_dense_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
@@ -2440,9 +2510,9 @@ class OldLinearModelTest(test.TestCase):
       predictions = fc_old.linear_model(features, [price])
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([[0.], [0.]], price_var.eval())
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_sparse_multi_rank(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2456,7 +2526,7 @@ class OldLinearModelTest(test.TestCase):
       predictions = fc_old.linear_model(features, [wire_cast])
       wire_cast_var = get_linear_model_column_var(wire_cast)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((4, 1)), wire_cast_var.eval())
+        self.assertAllClose(np.zeros((4, 1)), self.evaluate(wire_cast_var))
         self.assertAllClose(
             np.zeros((2, 1)),
             predictions.eval(feed_dict={wire_tensor: wire_value}))
@@ -2480,7 +2550,7 @@ class OldLinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [5010.]], predictions.eval())
+        self.assertAllClose([[1005.], [5010.]], self.evaluate(predictions))
 
   def test_sparse_combiner_with_negative_weights(self):
     wire_cast = fc.categorical_column_with_hash_bucket('wire_cast', 4)
@@ -2502,7 +2572,7 @@ class OldLinearModelTest(test.TestCase):
       with _initialized_session() as sess:
         sess.run(wire_cast_var.assign([[10.], [100.], [1000.], [10000.]]))
         sess.run(bias.assign([5.]))
-        self.assertAllClose([[1005.], [-9985.]], predictions.eval())
+        self.assertAllClose([[1005.], [-9985.]], self.evaluate(predictions))
 
   def test_dense_multi_dimension_multi_output(self):
     price = fc.numeric_column('price', shape=2)
@@ -2512,12 +2582,12 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose(np.zeros((3,)), bias.eval())
-        self.assertAllClose(np.zeros((2, 3)), price_var.eval())
+        self.assertAllClose(np.zeros((3,)), self.evaluate(bias))
+        self.assertAllClose(np.zeros((2, 3)), self.evaluate(price_var))
         sess.run(price_var.assign([[1., 2., 3.], [10., 100., 1000.]]))
         sess.run(bias.assign([2., 3., 4.]))
         self.assertAllClose([[23., 205., 2007.], [67., 613., 6019.]],
-                            predictions.eval())
+                            self.evaluate(predictions))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -2536,11 +2606,11 @@ class OldLinearModelTest(test.TestCase):
       bias = get_linear_model_bias()
       price_var = get_linear_model_column_var(price)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price_var.assign([[10.], [100.]]))
-        self.assertAllClose([[210.], [650.]], predictions.eval())
+        self.assertAllClose([[210.], [650.]], self.evaluate(predictions))
 
   def test_dense_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -2552,14 +2622,14 @@ class OldLinearModelTest(test.TestCase):
       price1_var = get_linear_model_column_var(price1)
       price2_var = get_linear_model_column_var(price2)
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias.eval())
-        self.assertAllClose([[0.], [0.]], price1_var.eval())
-        self.assertAllClose([[0.]], price2_var.eval())
-        self.assertAllClose([[0.], [0.]], predictions.eval())
+        self.assertAllClose([0.], self.evaluate(bias))
+        self.assertAllClose([[0.], [0.]], self.evaluate(price1_var))
+        self.assertAllClose([[0.]], self.evaluate(price2_var))
+        self.assertAllClose([[0.], [0.]], self.evaluate(predictions))
         sess.run(price1_var.assign([[10.], [100.]]))
         sess.run(price2_var.assign([[1000.]]))
         sess.run(bias.assign([7.]))
-        self.assertAllClose([[3217.], [4657.]], predictions.eval())
+        self.assertAllClose([[3217.], [4657.]], self.evaluate(predictions))
 
   def test_fills_cols_to_vars(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -2589,15 +2659,18 @@ class OldLinearModelTest(test.TestCase):
           partitioner=partitioned_variables.fixed_size_partitioner(2, axis=0)):
         fc_old.linear_model(
             features, [price1, price2], cols_to_vars=cols_to_vars)
-      with _initialized_session():
-        self.assertEqual([0.], cols_to_vars['bias'][0].eval())
-        # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
-        self.assertAllEqual([[0.]], cols_to_vars[price1][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price1][1].eval())
-        # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
-        # a [1, 1] Variable.
-        self.assertAllEqual([[0.], [0.]], cols_to_vars[price2][0].eval())
-        self.assertAllEqual([[0.]], cols_to_vars[price2][1].eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertEqual([0.], self.evaluate(cols_to_vars['bias'][0]))
+      # Partitioning shards the [2, 1] price1 var into 2 [1, 1] Variables.
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price1][0]))
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price1][1]))
+      # Partitioning shards the [3, 1] price2 var into a [2, 1] Variable and
+      # a [1, 1] Variable.
+      self.assertAllEqual([[0.], [0.]], self.evaluate(cols_to_vars[price2][0]))
+      self.assertAllEqual([[0.]], self.evaluate(cols_to_vars[price2][1]))
 
   def test_fills_cols_to_output_tensors(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
@@ -2795,6 +2868,7 @@ class OldLinearModelTest(test.TestCase):
                 features['price2']: [[1.], [5.]],
             })
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -2832,8 +2906,10 @@ class OldLinearModelTest(test.TestCase):
       sess.run(body_style_var.assign([[-10.], [-100.], [-1000.]]))
       sess.run(bias.assign([5.]))
 
-      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]], sess.run(net))
+      self.assertAllClose([[10 - 1000 + 5.], [1000 - 10 + 5.]],
+                          self.evaluate(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     price = fc.numeric_column('price')
     price_buckets = fc.bucketized_column(
@@ -2879,6 +2955,7 @@ class OldLinearModelTest(test.TestCase):
                                   features['country']: country_data
                               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     price = fc.numeric_column('price')
     features = {
@@ -2912,26 +2989,27 @@ class OldLinearModelTest(test.TestCase):
       price_var1 = get_linear_model_column_var(price, name='linear_model')
       price_var2 = get_linear_model_column_var(price, name='linear_model_1')
       with _initialized_session() as sess:
-        self.assertAllClose([0.], bias1.eval())
+        self.assertAllClose([0.], self.evaluate(bias1))
         sess.run(price_var1.assign([[10.]]))
         sess.run(bias1.assign([5.]))
-        self.assertAllClose([[15.], [55.]], predictions1.eval())
-        self.assertAllClose([0.], bias2.eval())
+        self.assertAllClose([[15.], [55.]], self.evaluate(predictions1))
+        self.assertAllClose([0.], self.evaluate(bias2))
         sess.run(price_var2.assign([[10.]]))
         sess.run(bias2.assign([5.]))
-        self.assertAllClose([[25.], [105.]], predictions2.eval())
+        self.assertAllClose([[25.], [105.]], self.evaluate(predictions2))
 
+  @test_util.run_deprecated_v1
   def test_linear_model_v1_shared_embedding_all_other_v2(self):
     price = fc.numeric_column('price')  # v2
     some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)  # v2
     some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v2
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -2954,9 +3032,13 @@ class OldLinearModelTest(test.TestCase):
       }
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
-      with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
   def test_linear_model_v1_shared_embedding_with_v2_cat_all_other_v2(self):
     price = fc.numeric_column('price')  # v2
     some_sparse_column = fc.categorical_column_with_hash_bucket(
@@ -2967,7 +3049,7 @@ class OldLinearModelTest(test.TestCase):
         key='aaa', num_buckets=3)  # v2
     categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -2990,20 +3072,24 @@ class OldLinearModelTest(test.TestCase):
       }
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
-      with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
   def test_linear_model_v1_v2_mix(self):
     price = fc.numeric_column('price')  # v2
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)  # v1
-    some_embedding_column = fc_old.embedding_column(
+    some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v1
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)  # v2
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)  # v1
     all_cols = [
         price, some_embedding_column, shared_embedding_a, shared_embedding_b
@@ -3026,14 +3112,18 @@ class OldLinearModelTest(test.TestCase):
       }
       fc_old.linear_model(features, all_cols)
       bias = get_linear_model_bias()
-      with _initialized_session():
-        self.assertAllClose([0.], bias.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([0.], self.evaluate(bias))
+
+  @test_util.run_deprecated_v1
   def test_linear_model_v2_shared_embedding_all_other_v1(self):
-    price = fc_old.numeric_column('price')  # v1
-    some_sparse_column = fc_old.categorical_column_with_hash_bucket(
+    price = fc.numeric_column('price')  # v1
+    some_sparse_column = fc.categorical_column_with_hash_bucket(
         'sparse_feature', hash_bucket_size=5)  # v1
-    some_embedding_column = fc_old.embedding_column(
+    some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)  # v1
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)  # v2
@@ -3065,13 +3155,13 @@ class OldLinearModelTest(test.TestCase):
         fc_old.linear_model(features, all_cols)
 
 
-class FeatureLayerTest(test.TestCase):
+class DenseFeaturesTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes()
   def test_retrieving_input(self):
     features = {'a': [0.]}
-    feature_layer = fc.FeatureLayer(fc.numeric_column('a'))
-    inputs = self.evaluate(feature_layer(features))
+    dense_features = fc.DenseFeatures(fc.numeric_column('a'))
+    inputs = self.evaluate(dense_features(features))
     self.assertAllClose([[0.]], inputs)
 
   def test_reuses_variables(self):
@@ -3085,6 +3175,7 @@ class FeatureLayerTest(test.TestCase):
       categorical_column = fc.categorical_column_with_identity(
           key='a', num_buckets=3)
       embedding_dimension = 2
+
       def _embedding_column_initializer(shape, dtype, partition_info):
         del shape  # unused
         del dtype  # unused
@@ -3100,11 +3191,11 @@ class FeatureLayerTest(test.TestCase):
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      feature_layer = fc.FeatureLayer([embedding_column])
+      dense_features = fc.DenseFeatures([embedding_column])
       features = {'a': sparse_input}
 
-      inputs = feature_layer(features)
-      variables = feature_layer.variables
+      inputs = dense_features(features)
+      variables = dense_features.variables
 
       # Sanity check: test that the inputs are correct.
       self.assertAllEqual([[1, 0], [0, 1], [1, 1]], inputs)
@@ -3112,13 +3203,13 @@ class FeatureLayerTest(test.TestCase):
       # Check that only one variable was created.
       self.assertEqual(1, len(variables))
 
-      # Check that invoking feature_layer on the same features does not create
+      # Check that invoking dense_features on the same features does not create
       # additional variables
-      _ = feature_layer(features)
+      _ = dense_features(features)
       self.assertEqual(1, len(variables))
-      self.assertEqual(variables[0], feature_layer.variables[0])
+      self.assertEqual(variables[0], dense_features.variables[0])
 
-  def test_feature_column_feature_layer_gradient(self):
+  def test_feature_column_dense_features_gradient(self):
     with context.eager_mode():
       sparse_input = sparse_tensor.SparseTensor(
           indices=((0, 0), (1, 0), (2, 0)),
@@ -3145,11 +3236,11 @@ class FeatureLayerTest(test.TestCase):
           dimension=embedding_dimension,
           initializer=_embedding_column_initializer)
 
-      feature_layer = fc.FeatureLayer([embedding_column])
+      dense_features = fc.DenseFeatures([embedding_column])
       features = {'a': sparse_input}
 
       def scale_matrix():
-        matrix = feature_layer(features)
+        matrix = dense_features(features)
         return 2 * matrix
 
       # Sanity check: Verify that scale_matrix returns the correct output.
@@ -3167,11 +3258,11 @@ class FeatureLayerTest(test.TestCase):
   def test_raises_if_empty_feature_columns(self):
     with self.assertRaisesRegexp(ValueError,
                                  'feature_columns must not be empty'):
-      fc.FeatureLayer(feature_columns=[])(features={})
+      fc.DenseFeatures(feature_columns=[])(features={})
 
   def test_should_be_dense_column(self):
     with self.assertRaisesRegexp(ValueError, 'must be a DenseColumn'):
-      fc.FeatureLayer(feature_columns=[
+      fc.DenseFeatures(feature_columns=[
           fc.categorical_column_with_hash_bucket('wire_cast', 4)
       ])(
           features={
@@ -3181,7 +3272,7 @@ class FeatureLayerTest(test.TestCase):
   def test_does_not_support_dict_columns(self):
     with self.assertRaisesRegexp(
         ValueError, 'Expected feature_columns to be iterable, found dict.'):
-      fc.FeatureLayer(feature_columns={'a': fc.numeric_column('a')})(
+      fc.DenseFeatures(feature_columns={'a': fc.numeric_column('a')})(
           features={
               'a': [[0]]
           })
@@ -3189,22 +3280,28 @@ class FeatureLayerTest(test.TestCase):
   def test_bare_column(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
-      net = fc.FeatureLayer(fc.numeric_column('a'))(features)
-      with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+      net = fc.DenseFeatures(fc.numeric_column('a'))(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
       columns = (fc.numeric_column(key) for key in features)
-      net = fc.FeatureLayer(columns)(features)
-      with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+      net = fc.DenseFeatures(columns)(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
         ValueError, 'Duplicate feature column name found for columns'):
-      fc.FeatureLayer(
+      fc.DenseFeatures(
           feature_columns=[fc.numeric_column('a'),
                            fc.numeric_column('a')])(
                                features={
@@ -3215,17 +3312,23 @@ class FeatureLayerTest(test.TestCase):
     price = fc.numeric_column('price')
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
-      net = fc.FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
-      net = fc.FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_compute_output_shape(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3235,12 +3338,15 @@ class FeatureLayerTest(test.TestCase):
           'price1': [[1., 2.], [5., 6.]],
           'price2': [[3., 4., 5., 6.], [7., 8., 9., 10.]]
       }
-      feature_layer = fc.FeatureLayer([price1, price2])
-      self.assertEqual((None, 6), feature_layer.compute_output_shape((None,)))
-      net = feature_layer(features)
-      with _initialized_session():
-        self.assertAllClose(
-            [[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]], net.eval())
+      dense_features = fc.DenseFeatures([price1, price2])
+      self.assertEqual((None, 6), dense_features.compute_output_shape((None,)))
+      net = dense_features(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3., 4., 5., 6.], [5., 6., 7., 8., 9., 10.]],
+                          self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -3249,27 +3355,30 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           Exception,
           r'Cannot reshape a tensor with 2 elements to shape \[2,2\]'):
-        fc.FeatureLayer([price])(features)
+        fc.DenseFeatures([price])(features)
 
   def test_reshaping(self):
     price = fc.numeric_column('price', shape=[1, 2])
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
-      net = fc.FeatureLayer([price])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+      net = fc.DenseFeatures([price])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
     price2 = fc.numeric_column('price2')
     with ops.Graph().as_default():
-      features = {
-          'price1': [[1., 2.], [5., 6.]],
-          'price2': [[3.], [4.]]
-      }
-      net = fc.FeatureLayer([price1, price2])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+      features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
+      net = fc.DenseFeatures([price1, price2])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_cols_to_output_tensors(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3277,12 +3386,16 @@ class FeatureLayerTest(test.TestCase):
     with ops.Graph().as_default():
       cols_dict = {}
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
-      feature_layer = fc.FeatureLayer([price1, price2])
-      net = feature_layer(features, cols_dict)
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], cols_dict[price1].eval())
-        self.assertAllClose([[3.], [4.]], cols_dict[price2].eval())
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+      dense_features = fc.DenseFeatures([price1, price2])
+      net = dense_features(features, cols_dict)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]],
+                          self.evaluate(cols_dict[price1]))
+      self.assertAllClose([[3.], [4.]], self.evaluate(cols_dict[price2]))
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_column_order(self):
     price_a = fc.numeric_column('price_a')
@@ -3292,11 +3405,14 @@ class FeatureLayerTest(test.TestCase):
           'price_a': [[1.]],
           'price_b': [[3.]],
       }
-      net1 = fc.FeatureLayer([price_a, price_b])(features)
-      net2 = fc.FeatureLayer([price_b, price_a])(features)
-      with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+      net1 = fc.DenseFeatures([price_a, price_b])(features)
+      net2 = fc.DenseFeatures([price_b, price_a])(features)
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 3.]], self.evaluate(net1))
+      self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
     animal = fc.categorical_column_with_identity('animal', num_buckets=4)
@@ -3307,7 +3423,7 @@ class FeatureLayerTest(test.TestCase):
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
       with self.assertRaisesRegexp(Exception, 'must be a DenseColumn'):
-        fc.FeatureLayer([animal])(features)
+        fc.DenseFeatures([animal])(features)
 
   def test_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3320,7 +3436,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.FeatureLayer([price1, price2])(features)
+        fc.DenseFeatures([price1, price2])(features)
 
   def test_subset_of_static_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3335,7 +3451,7 @@ class FeatureLayerTest(test.TestCase):
       with self.assertRaisesRegexp(
           ValueError,
           'Batch size \(first dimension\) of each feature must be same.'):  # pylint: disable=anomalous-backslash-in-string
-        fc.FeatureLayer([price1, price2, price3])(features)
+        fc.DenseFeatures([price1, price2, price3])(features)
 
   def test_runtime_batch_size_mismatch(self):
     price1 = fc.numeric_column('price1')
@@ -3345,7 +3461,7 @@ class FeatureLayerTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 3
           'price2': [[3.], [4.]]  # batchsize = 2
       }
-      net = fc.FeatureLayer([price1, price2])(features)
+      net = fc.DenseFeatures([price1, price2])(features)
       with _initialized_session() as sess:
         with self.assertRaisesRegexp(errors.OpError,
                                      'Dimensions of inputs should match'):
@@ -3359,7 +3475,7 @@ class FeatureLayerTest(test.TestCase):
           'price1': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
           'price2': array_ops.placeholder(dtype=dtypes.int64),  # batchsize = 2
       }
-      net = fc.FeatureLayer([price1, price2])(features)
+      net = fc.DenseFeatures([price1, price2])(features)
       with _initialized_session() as sess:
         sess.run(
             net,
@@ -3379,19 +3495,20 @@ class FeatureLayerTest(test.TestCase):
           'sparse_feature': [['a'], ['x']],
       }
       all_cols = [some_embedding_column]
-      fc.FeatureLayer(all_cols)(features)
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that 2 variables get created in this case.
       self.assertEqual(2, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
       expected_var_names = [
-          'feature_layer/sparse_feature_embedding/embedding_weights:0',
-          'feature_layer_1/sparse_feature_embedding/embedding_weights:0'
+          'dense_features/sparse_feature_embedding/embedding_weights:0',
+          'dense_features_1/sparse_feature_embedding/embedding_weights:0'
       ]
       self.assertItemsEqual(
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -3416,8 +3533,8 @@ class FeatureLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
       all_cols = [embedding_column_a, embedding_column_b]
-      fc.FeatureLayer(all_cols)(features)
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3425,6 +3542,7 @@ class FeatureLayerTest(test.TestCase):
           ['aaa_bbb_shared_embedding:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_multiple_layers_with_same_shared_embedding_column_diff_graphs(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -3449,7 +3567,7 @@ class FeatureLayerTest(test.TestCase):
                   values=(1, 2, 1),
                   dense_shape=(2, 2)),
       }
-      fc.FeatureLayer(all_cols)(features)
+      fc.DenseFeatures(all_cols)(features)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3468,7 +3586,7 @@ class FeatureLayerTest(test.TestCase):
                   dense_shape=(2, 2)),
       }
 
-      fc.FeatureLayer(all_cols)(features1)
+      fc.DenseFeatures(all_cols)(features1)
       # Make sure that only 1 variable gets created in this case.
       self.assertEqual(1, len(
           ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)))
@@ -3476,23 +3594,25 @@ class FeatureLayerTest(test.TestCase):
           ['aaa_bbb_shared_embedding:0'],
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_with_numpy_input_fn(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
         (6., 7., 8., 9., 10.),  # id 1
         (11., 12., 13., 14., 15.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     one_hot_body_style = fc.indicator_column(body_style)
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     embedded_body_style = fc.embedding_column(
         body_style, dimension=5, initializer=_initializer)
 
@@ -3504,7 +3624,7 @@ class FeatureLayerTest(test.TestCase):
         batch_size=2,
         shuffle=False)
     features = input_fn()
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_body_style])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_body_style])(
         features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
@@ -3513,33 +3633,33 @@ class FeatureLayerTest(test.TestCase):
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[11., 12., 13., 14., 15., 0., 0., 1., 11.],
-           [1., 2., 3., 4., 5., 1., 0., 0., 12]],
-          sess.run(net))
+      self.assertAllEqual([[11., 12., 13., 14., 15., 0., 0., 1., 11.],
+                           [1., 2., 3., 4., 5., 1., 0., 0., 12]], sess.run(net))
 
       coord.request_stop()
       coord.join(threads)
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
         (6., 7., 8., 9., 10.),  # id 1
         (11., 12., 13., 14., 15.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -3547,49 +3667,56 @@ class FeatureLayerTest(test.TestCase):
 
     # Provides 1-dim tensor and dense tensor.
     features = {
-        'price': constant_op.constant([11., 12.,]),
-        'body-style': sparse_tensor.SparseTensor(
-            indices=((0,), (1,)),
-            values=('sedan', 'hardtop'),
-            dense_shape=(2,)),
+        'price':
+            constant_op.constant([
+                11.,
+                12.,
+            ]),
+        'body-style':
+            sparse_tensor.SparseTensor(
+                indices=((0,), (1,)),
+                values=('sedan', 'hardtop'),
+                dense_shape=(2,)),
         # This is dense tensor for the categorical_column.
-        'country': constant_op.constant(['CA', 'US']),
+        'country':
+            constant_op.constant(['CA', 'US']),
     }
     self.assertEqual(1, features['price'].shape.ndims)
     self.assertEqual(1, features['body-style'].dense_shape.get_shape()[0])
     self.assertEqual(1, features['country'].shape.ndims)
 
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_country])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
         features)
     self.assertEqual(1 + 3 + 5, net.shape[1])
     with _initialized_session() as sess:
 
       # Each row is formed by concatenating `embedded_body_style`,
       # `one_hot_body_style`, and `price` in order.
-      self.assertAllEqual(
-          [[0., 0., 1., 11., 12., 13., 14., 15., 11.],
-           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
-          sess.run(net))
+      self.assertAllEqual([[0., 0., 1., 11., 12., 13., 14., 15., 11.],
+                           [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
+                          sess.run(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
         (1., 2.),  # id 0
         (6., 7.),  # id 1
         (11., 12.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       del shape, dtype, partition_info
       return embedding_values
 
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
 
-    # one_hot_body_style has 3 dims in feature_layer.
+    # one_hot_body_style has 3 dims in dense_features.
     body_style = fc.categorical_column_with_vocabulary_list(
         'body-style', vocabulary_list=['hardtop', 'wagon', 'sedan'])
     one_hot_body_style = fc.indicator_column(body_style)
 
-    # embedded_body_style has 5 dims in feature_layer.
+    # embedded_body_style has 5 dims in dense_features.
     country = fc.categorical_column_with_vocabulary_list(
         'country', vocabulary_list=['US', 'JP', 'CA'])
     embedded_country = fc.embedding_column(
@@ -3608,12 +3735,10 @@ class FeatureLayerTest(test.TestCase):
 
     price_data = np.array([11., 12.])
     body_style_data = sparse_tensor.SparseTensorValue(
-        indices=((0,), (1,)),
-        values=('sedan', 'hardtop'),
-        dense_shape=(2,))
+        indices=((0,), (1,)), values=('sedan', 'hardtop'), dense_shape=(2,))
     country_data = np.array([['US'], ['CA']])
 
-    net = fc.FeatureLayer([price, one_hot_body_style, embedded_country])(
+    net = fc.DenseFeatures([price, one_hot_body_style, embedded_country])(
         features)
     self.assertEqual(1 + 3 + 2, net.shape[1])
     with _initialized_session() as sess:
@@ -3630,8 +3755,9 @@ class FeatureLayerTest(test.TestCase):
                   features['country']: country_data
               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
-    # price has 1 dimension in feature_layer
+    # price has 1 dimension in dense_features
     price = fc.numeric_column('price')
     features = {
         'price': constant_op.constant(0),
@@ -3640,13 +3766,13 @@ class FeatureLayerTest(test.TestCase):
 
     # Static rank 0 should fail
     with self.assertRaisesRegexp(ValueError, 'Feature .* cannot have rank 0'):
-      fc.FeatureLayer([price])(features)
+      fc.DenseFeatures([price])(features)
 
     # Dynamic rank 0 should fail
     features = {
         'price': array_ops.placeholder(dtypes.float32),
     }
-    net = fc.FeatureLayer([price])(features)
+    net = fc.DenseFeatures([price])(features)
     self.assertEqual(1, net.shape[1])
     with _initialized_session() as sess:
       with self.assertRaisesOpError('Feature .* cannot have rank 0'):
@@ -3779,16 +3905,22 @@ class FunctionalInputLayerTest(test.TestCase):
     with ops.Graph().as_default():
       features = features = {'a': [0.]}
       net = fc_old.input_layer(features, fc.numeric_column('a'))
-      with _initialized_session():
-        self.assertAllClose([[0.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0.]], self.evaluate(net))
 
   def test_column_generator(self):
     with ops.Graph().as_default():
       features = features = {'a': [0.], 'b': [1.]}
       columns = (fc.numeric_column(key) for key in features)
       net = fc_old.input_layer(features, columns)
-      with _initialized_session():
-        self.assertAllClose([[0., 1.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1.]], self.evaluate(net))
 
   def test_raises_if_duplicate_name(self):
     with self.assertRaisesRegexp(
@@ -3803,16 +3935,22 @@ class FunctionalInputLayerTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[1.], [5.]]}
       net = fc_old.input_layer(features, [price])
-      with _initialized_session():
-        self.assertAllClose([[1.], [5.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1.], [5.]], self.evaluate(net))
 
   def test_multi_dimension(self):
     price = fc.numeric_column('price', shape=2)
     with ops.Graph().as_default():
       features = {'price': [[1., 2.], [5., 6.]]}
       net = fc_old.input_layer(features, [price])
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_raises_if_shape_mismatch(self):
     price = fc.numeric_column('price', shape=2)
@@ -3828,8 +3966,11 @@ class FunctionalInputLayerTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price': [[[1., 2.]], [[5., 6.]]]}
       net = fc_old.input_layer(features, [price])
-      with _initialized_session():
-        self.assertAllClose([[1., 2.], [5., 6.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2.], [5., 6.]], self.evaluate(net))
 
   def test_multi_column(self):
     price1 = fc.numeric_column('price1', shape=2)
@@ -3837,8 +3978,11 @@ class FunctionalInputLayerTest(test.TestCase):
     with ops.Graph().as_default():
       features = {'price1': [[1., 2.], [5., 6.]], 'price2': [[3.], [4.]]}
       net = fc_old.input_layer(features, [price1, price2])
-      with _initialized_session():
-        self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 2., 3.], [5., 6., 4.]], self.evaluate(net))
 
   def test_fills_cols_to_vars(self):
     # Provide three _DenseColumn's to input_layer: a _NumericColumn, a
@@ -3869,6 +4013,7 @@ class FunctionalInputLayerTest(test.TestCase):
                             variables_lib.Variable)
       self.assertAllEqual(cols_to_vars[some_embedding_column][0].shape, [5, 10])
 
+  @test_util.run_deprecated_v1
   def test_fills_cols_to_vars_shared_embedding(self):
     # Provide 5 DenseColumn's to input_layer: a NumericColumn, a
     # BucketizedColumn, an EmbeddingColumn, two SharedEmbeddingColumns. The
@@ -3882,11 +4027,11 @@ class FunctionalInputLayerTest(test.TestCase):
         'sparse_feature', hash_bucket_size=5)
     some_embedding_column = fc.embedding_column(
         some_sparse_column, dimension=10)
-    categorical_column_a = fc_old.categorical_column_with_identity(
+    categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
-    categorical_column_b = fc_old.categorical_column_with_identity(
+    categorical_column_b = fc.categorical_column_with_identity(
         key='bbb', num_buckets=3)
-    shared_embedding_a, shared_embedding_b = fc_old.shared_embedding_columns(
+    shared_embedding_a, shared_embedding_b = fc.shared_embedding_columns(
         [categorical_column_a, categorical_column_b], dimension=2)
     with ops.Graph().as_default():
       features = {
@@ -3968,9 +4113,12 @@ class FunctionalInputLayerTest(test.TestCase):
       }
       net1 = fc_old.input_layer(features, [price_a, price_b])
       net2 = fc_old.input_layer(features, [price_b, price_a])
-      with _initialized_session():
-        self.assertAllClose([[1., 3.]], net1.eval())
-        self.assertAllClose([[1., 3.]], net2.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[1., 3.]], self.evaluate(net1))
+      self.assertAllClose([[1., 3.]], self.evaluate(net2))
 
   def test_fails_for_categorical_column(self):
     animal = fc.categorical_column_with_identity('animal', num_buckets=4)
@@ -4066,6 +4214,7 @@ class FunctionalInputLayerTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
 
+  @test_util.run_deprecated_v1
   def test_with_1d_sparse_tensor(self):
     embedding_values = (
         (1., 2., 3., 4., 5.),  # id 0
@@ -4122,6 +4271,7 @@ class FunctionalInputLayerTest(test.TestCase):
                            [1., 0., 0., 1., 2., 3., 4., 5., 12.]],
                           sess.run(net))
 
+  @test_util.run_deprecated_v1
   def test_with_1d_unknown_shape_sparse_tensor(self):
     embedding_values = (
         (1., 2.),  # id 0
@@ -4180,6 +4330,7 @@ class FunctionalInputLayerTest(test.TestCase):
                   features['country']: country_data
               }))
 
+  @test_util.run_deprecated_v1
   def test_with_rank_0_feature(self):
     # price has 1 dimension in input_layer
     price = fc.numeric_column('price')
@@ -4220,12 +4371,19 @@ class MakeParseExampleSpecTest(test.TestCase):
     def transform_feature(self, transformation_cache, state_manager):
       pass
 
+    def _transform_feature(self, inputs):
+      pass
+
     @property
     def parse_example_spec(self):
       return self.parse_spec
 
+    @property
+    def _parse_example_spec(self):
+      return self.parse_spec
+
   def test_no_feature_columns(self):
-    actual = fc.make_parse_example_spec([])
+    actual = fc.make_parse_example_spec_v2([])
     self.assertDictEqual({}, actual)
 
   def test_invalid_type(self):
@@ -4235,15 +4393,17 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'All feature_columns must be FeatureColumn instances.*invalid_column'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}), 'invalid_column'))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), 'invalid_column'))
 
   def test_one_feature_column(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }),))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_two_feature_columns(self):
@@ -4252,9 +4412,11 @@ class MakeParseExampleSpecTest(test.TestCase):
         shape=(2,), dtype=dtypes.float32, default_value=0.)
     key2 = 'key2'
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2
+    })))
     self.assertDictEqual({key1: parse_spec1, key2: parse_spec2}, actual)
 
   def test_equal_keys_different_parse_spec(self):
@@ -4265,17 +4427,21 @@ class MakeParseExampleSpecTest(test.TestCase):
     with self.assertRaisesRegexp(
         ValueError,
         'feature_columns contain different parse_spec for key key1'):
-      fc.make_parse_example_spec(
-          (self._TestFeatureColumn({key1: parse_spec1}),
-           self._TestFeatureColumn({key1: parse_spec2})))
+      fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+          key1: parse_spec1
+      }), self._TestFeatureColumn({
+          key1: parse_spec2
+      })))
 
   def test_equal_keys_equal_parse_spec(self):
     key1 = 'key1'
     parse_spec1 = parsing_ops.FixedLenFeature(
         shape=(2,), dtype=dtypes.float32, default_value=0.)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key1: parse_spec1})))
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key1: parse_spec1
+    })))
     self.assertDictEqual({key1: parse_spec1}, actual)
 
   def test_multiple_features_dict(self):
@@ -4287,11 +4453,17 @@ class MakeParseExampleSpecTest(test.TestCase):
     parse_spec2 = parsing_ops.VarLenFeature(dtype=dtypes.string)
     key3 = 'key3'
     parse_spec3 = parsing_ops.VarLenFeature(dtype=dtypes.int32)
-    actual = fc.make_parse_example_spec(
-        (self._TestFeatureColumn({key1: parse_spec1}),
-         self._TestFeatureColumn({key2: parse_spec2, key3: parse_spec3})))
-    self.assertDictEqual(
-        {key1: parse_spec1, key2: parse_spec2, key3: parse_spec3}, actual)
+    actual = fc.make_parse_example_spec_v2((self._TestFeatureColumn({
+        key1: parse_spec1
+    }), self._TestFeatureColumn({
+        key2: parse_spec2,
+        key3: parse_spec3
+    })))
+    self.assertDictEqual({
+        key1: parse_spec1,
+        key2: parse_spec2,
+        key3: parse_spec3
+    }, actual)
 
 
 def _assert_sparse_tensor_value(test_case, expected, actual):
@@ -4299,7 +4471,8 @@ def _assert_sparse_tensor_value(test_case, expected, actual):
   test_case.assertAllEqual(expected.indices, actual.indices)
 
   test_case.assertEqual(
-      np.array(expected.values).dtype, np.array(actual.values).dtype)
+      np.array(expected.values).dtype,
+      np.array(actual.values).dtype)
   test_case.assertAllEqual(expected.values, actual.values)
 
   test_case.assertEqual(np.int64, np.array(actual.dense_shape).dtype)
@@ -4321,6 +4494,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         'python/feature_column/testdata/wire_vocabulary.txt')
     self._wire_vocabulary_size = 3
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
@@ -4337,19 +4511,27 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       fc.categorical_column_with_vocabulary_file(
           key=('aaa',), vocabulary_file='path_to_file', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     self.assertEqual(7, column.num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_vocabulary_file(
-        key='aaa', vocabulary_file='path_to_file', vocabulary_size=3,
-        num_oov_buckets=4, dtype=dtypes.int32)
+        key='aaa',
+        vocabulary_file='path_to_file',
+        vocabulary_size=3,
+        num_oov_buckets=4,
+        dtype=dtypes.int32)
     for column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', column.name)
       self.assertEqual(7, column.num_buckets)
@@ -4367,6 +4549,7 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       fc.categorical_column_with_vocabulary_file(
           key='aaa', vocabulary_file='', vocabulary_size=3)
 
+  @test_util.run_deprecated_v1
   def test_invalid_vocabulary_file(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='file_does_not_exist', vocabulary_size=10)
@@ -4379,19 +4562,21 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     with self.assertRaisesRegexp(errors.OpError, 'file_does_not_exist'):
-      with self.cached_session():
-        lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
 
   def test_invalid_vocabulary_size(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=-1)
     with self.assertRaisesRegexp(ValueError, 'Invalid vocabulary_size'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file=self._wire_vocabulary_file_name,
+          key='aaa',
+          vocabulary_file=self._wire_vocabulary_file_name,
           vocabulary_size=0)
 
+  @test_util.run_deprecated_v1
   def test_too_large_vocabulary_size(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4406,24 +4591,27 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     with self.assertRaisesRegexp(errors.OpError, 'Invalid vocab_size'):
-      with self.cached_session():
-        lookup_ops.tables_initializer().run()
+      self.evaluate(lookup_ops.tables_initializer())
 
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           num_oov_buckets=-1)
 
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_file(
-          key='aaa', vocabulary_file='path', vocabulary_size=3,
+          key='aaa',
+          vocabulary_file='path',
+          vocabulary_size=3,
           dtype=dtypes.float64)
 
   def test_invalid_buckets_and_default_value(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'both num_oov_buckets and default_value'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'both num_oov_buckets and default_value'):
       fc.categorical_column_with_vocabulary_file(
           key='aaa',
           vocabulary_file=self._wire_vocabulary_file_name,
@@ -4463,28 +4651,31 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file='path_to_file', vocabulary_size=3)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4499,15 +4690,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_none_vocabulary_size(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa', vocabulary_file=self._wire_vocabulary_file_name)
@@ -4520,15 +4715,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4538,16 +4737,21 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(self,
-                                  sparse_tensor.SparseTensorValue(
-                                      indices=inputs.indices,
-                                      values=np.array(
-                                          (2, -1, 0), dtype=np.int64),
-                                      dense_shape=inputs.dense_shape),
-                                  id_tensor.eval())
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4558,15 +4762,18 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4582,15 +4789,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 2, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 2, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4606,15 +4817,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 33, 0, 62), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 33, 0, 62), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_small_vocabulary_size(self):
     # 'marlo' is the last entry in our vocabulary file, so be setting
     # `vocabulary_size` to 1 less than number of entries in file, we take
@@ -4632,15 +4847,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((-1, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((-1, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4656,15 +4875,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
     column = fc.categorical_column_with_vocabulary_file(
@@ -4678,15 +4901,18 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': ((11, -1, -1), (100, 30, -1), (-1, -1, 22))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
-              values=np.array((2, default_value, 0, 4), dtype=np.int64),
-              dense_shape=(3, 3)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+            values=np.array((2, default_value, 0, 4), dtype=np.int64),
+            dense_shape=(3, 3)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_file(
         key='aaa',
@@ -4703,15 +4929,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 60, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 60, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_file(
         key='wire',
@@ -4729,14 +4959,17 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_file(
@@ -4755,15 +4988,19 @@ class VocabularyFileCategoricalColumnTest(test.TestCase):
       }, (wire_column,))
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     wire_column = fc.categorical_column_with_vocabulary_file(
         key='wire',
@@ -4815,15 +5052,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32,
+        key='aaa',
+        vocabulary_list=(12, 24, 36),
+        dtype=dtypes.int32,
         default_value=-99)
     self.assertEqual(3, column.num_buckets)
     self.assertEqual({
         'aaa': parsing_ops.VarLenFeature(dtypes.int32)
     }, column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.int32)
@@ -4837,37 +5078,39 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_dtype(self):
     with self.assertRaisesRegexp(ValueError, 'dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.float32)
 
   def test_invalid_mapping_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary dtype must be string or integer'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary dtype must be string or integer'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12., 24., 36.))
 
   def test_mismatched_int_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'dtype.*and vocabulary dtype.*do not match'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'),
+          key='aaa',
+          vocabulary_list=('omar', 'stringer', 'marlo'),
           dtype=dtypes.int32)
 
   def test_mismatched_string_dtype(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'dtype.*and vocabulary dtype.*do not match'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'dtype.*and vocabulary dtype.*do not match'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=(12, 24, 36), dtype=dtypes.string)
 
   def test_none_mapping(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary_list.*must be non-empty'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary_list.*must be non-empty'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=None)
 
   def test_empty_mapping(self):
-    with self.assertRaisesRegexp(
-        ValueError, r'vocabulary_list.*must be non-empty'):
+    with self.assertRaisesRegexp(ValueError,
+                                 r'vocabulary_list.*must be non-empty'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa', vocabulary_list=tuple([]))
 
@@ -4879,12 +5122,11 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
   def test_invalid_num_oov_buckets(self):
     with self.assertRaisesRegexp(ValueError, 'Invalid num_oov_buckets'):
       fc.categorical_column_with_vocabulary_list(
-          key='aaa', vocabulary_list=(12, 24, 36),
-          num_oov_buckets=-1)
+          key='aaa', vocabulary_list=(12, 24, 36), num_oov_buckets=-1)
 
   def test_invalid_buckets_and_default_value(self):
-    with self.assertRaisesRegexp(
-        ValueError, 'both num_oov_buckets and default_value'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'both num_oov_buckets and default_value'):
       fc.categorical_column_with_vocabulary_list(
           key='aaa',
           vocabulary_list=(12, 24, 36),
@@ -4893,8 +5135,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_input_dtype_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(12, 24, 36),
@@ -4907,8 +5148,7 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
 
   def test_invalid_input_dtype_string(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=(12, 24, 36))
+        key='aaa', vocabulary_list=(12, 24, 36))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
@@ -4919,54 +5159,56 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example_string(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_parse_example_int(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=(11, 21, 31))
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[11, 21]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[11, 21]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=[11, 21],
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]], values=[11, 21], dense_shape=[1, 2]),
+        self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
@@ -4976,51 +5218,61 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     inputs = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=('marlo', 'skywalker', 'omar'),
         dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_vocabulary_list(
-        key='aaa',
-        vocabulary_list=('omar', 'stringer', 'marlo'))
+        key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': (('marlo', ''), ('skywalker', 'omar'))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((2, -1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((2, -1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_default_value_in_vocabulary(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5035,15 +5287,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 2, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 2, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5058,15 +5314,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 33, 0, 62), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 33, 0, 62), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5081,15 +5341,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, -1, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, -1, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_dense_input(self):
     default_value = -100
     column = fc.categorical_column_with_vocabulary_list(
@@ -5104,15 +5368,18 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
                          dtype=np.int32)
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1), (2, 2)),
-              values=np.array((2, default_value, 0, 4), dtype=np.int64),
-              dense_shape=(3, 3)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1), (2, 2)),
+            values=np.array((2, default_value, 0, 4), dtype=np.int64),
+            dense_shape=(3, 3)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_int32_with_oov_buckets(self):
     column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5128,15 +5395,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((2, 60, 0, 4), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((2, 60, 0, 4), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5153,14 +5424,17 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       wire_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
@@ -5178,15 +5452,19 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
       }, (wire_column,))
       bias = get_linear_model_bias()
       wire_var = get_linear_model_column_var(wire_column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), wire_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        wire_var.assign(((1.,), (2.,), (3.,), (4.,))).eval()
-        # 'marlo' -> 2: wire_var[2] = 3
-        # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
-        self.assertAllClose(((3.,), (5.,)), predictions.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,), (0.,)), self.evaluate(wire_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(wire_var.assign(((1.,), (2.,), (3.,), (4.,))))
+      # 'marlo' -> 2: wire_var[2] = 3
+      # 'skywalker' -> 3, 'omar' -> 0: wire_var[3] + wire_var[0] = 4+1 = 5
+      self.assertAllClose(((3.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     wire_column = fc.categorical_column_with_vocabulary_list(
         key='aaa',
@@ -5208,7 +5486,6 @@ class VocabularyListCategoricalColumnTest(test.TestCase):
                      fc.VocabularyListCategoricalColumn._from_config(config))
 
 
-
 class IdentityCategoricalColumnTest(test.TestCase):
 
   def test_constructor(self):
@@ -5225,6 +5502,7 @@ class IdentityCategoricalColumnTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'key must be a string.'):
       fc.categorical_column_with_identity(key=('aaa',), num_buckets=3)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     original = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     for column in (original, copy.deepcopy(original)):
@@ -5264,63 +5542,70 @@ class IdentityCategoricalColumnTest(test.TestCase):
               'aaa': inputs
           }), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=30)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(int64_list=feature_pb2.Int64List(
-                    value=[11, 21]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        int64_list=feature_pb2.Int64List(value=[11, 21]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a]))
+        features=fc.make_parse_example_spec_v2([a]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([11, 21], dtype=np.int64),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([11, 21], dtype=np.int64),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
-    id_tensor = fc._transform_features({'aaa': inputs}, [column], None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
+    id_tensor = fc._transform_features_v2({
+        'aaa': inputs
+    }, [column], None)[column]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_dense_input(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     id_weight_pair = column.get_sparse_tensors(
@@ -5328,47 +5613,53 @@ class IdentityCategoricalColumnTest(test.TestCase):
             'aaa': ((0, -1), (1, 0))
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_small(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, -1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(1, -1, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      with self.assertRaisesRegexp(
-          errors.OpError, 'assert_greater_or_equal_0'):
-        id_weight_pair.id_tensor.eval()
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    with self.assertRaisesRegexp(errors.OpError, 'assert_greater_or_equal_0'):
+      self.evaluate(id_weight_pair.id_tensor)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_inputs_too_big(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(1, 99, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(1, 99, 0), dense_shape=(2, 2))
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      with self.assertRaisesRegexp(
-          errors.OpError, 'assert_less_than_num_buckets'):
-        id_weight_pair.id_tensor.eval()
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    with self.assertRaisesRegexp(errors.OpError,
+                                 'assert_less_than_num_buckets'):
+      self.evaluate(id_weight_pair.id_tensor)
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value(self):
     column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
@@ -5381,15 +5672,19 @@ class IdentityCategoricalColumnTest(test.TestCase):
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array((1, 3, 3), dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_weight_pair.id_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array((1, 3, 3), dtype=np.int64),
+            dense_shape=inputs.dense_shape),
+        self.evaluate(id_weight_pair.id_tensor))
+
+  @test_util.run_deprecated_v1
   def test_get_sparse_tensors_with_default_value_and_placeholder_inputs(self):
     column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=4, default_value=3)
@@ -5397,14 +5692,15 @@ class IdentityCategoricalColumnTest(test.TestCase):
     input_values = array_ops.placeholder(dtype=dtypes.int32)
     input_shape = array_ops.placeholder(dtype=dtypes.int64)
     inputs = sparse_tensor.SparseTensorValue(
-        indices=input_indices,
-        values=input_values,
-        dense_shape=input_shape)
+        indices=input_indices, values=input_values, dense_shape=input_shape)
     id_weight_pair = column.get_sparse_tensors(
         fc.FeatureTransformationCache({
             'aaa': inputs
         }), None)
     self.assertIsNone(id_weight_pair.weight_tensor)
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
     with _initialized_session():
       _assert_sparse_tensor_value(
           self,
@@ -5412,12 +5708,14 @@ class IdentityCategoricalColumnTest(test.TestCase):
               indices=np.array(((0, 0), (1, 0), (1, 1)), dtype=np.int64),
               values=np.array((1, 3, 3), dtype=np.int64),
               dense_shape=np.array((2, 2), dtype=np.int64)),
-          id_weight_pair.id_tensor.eval(feed_dict={
-              input_indices: ((0, 0), (1, 0), (1, 1)),
-              input_values: (1, -1, 99),
-              input_shape: (2, 2),
-          }))
+          id_weight_pair.id_tensor.eval(
+              feed_dict={
+                  input_indices: ((0, 0), (1, 0), (1, 1)),
+                  input_values: (1, -1, 99),
+                  input_shape: (2, 2),
+              }))
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     self.assertEqual(3, column.num_buckets)
@@ -5431,14 +5729,17 @@ class IdentityCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] = 1
-        # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] = 1
+      # weight_var[2] + weight_var[1] = 3+2 = 5
+      self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
@@ -5453,15 +5754,19 @@ class IdentityCategoricalColumnTest(test.TestCase):
       }, (column,))
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] = 1
-        # weight_var[2] + weight_var[1] = 3+2 = 5
-        self.assertAllClose(((1.,), (5.,)), predictions.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] = 1
+      # weight_var[2] + weight_var[1] = 3+2 = 5
+      self.assertAllClose(((1.,), (5.,)), self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     column = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
 
@@ -5494,13 +5799,18 @@ class TransformFeaturesTest(test.TestCase):
                   indices=[[0, 0], [1, 0], [1, 1]],
                   dense_shape=[2, 2])
       }
-      transformed = fc._transform_features(
+      transformed = fc._transform_features_v2(
           features, [bucketized_price, hashed_sparse], None)
-      with _initialized_session():
-        self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
-        self.assertAllEqual([[0], [3]], transformed[bucketized_price].eval())
-        self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
-        self.assertAllEqual([6, 4, 1], transformed[hashed_sparse].values.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertIn(bucketized_price.name, transformed[bucketized_price].name)
+      self.assertAllEqual([[0], [3]],
+                          self.evaluate(transformed[bucketized_price]))
+      self.assertIn(hashed_sparse.name, transformed[hashed_sparse].name)
+      self.assertAllEqual([6, 4, 1],
+                          self.evaluate(transformed[hashed_sparse].values))
 
   def test_column_order(self):
     """When the column is both dense and sparse, uses sparse tensors."""
@@ -5531,12 +5841,12 @@ class TransformFeaturesTest(test.TestCase):
       column1 = _LoggerColumn('1')
       column2 = _LoggerColumn('2')
       call_logger = {'count': 0}
-      fc._transform_features({}, [column1, column2], None)
+      fc._transform_features_v2({}, [column1, column2], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
       call_logger = {'count': 0}
-      fc._transform_features({}, [column2, column1], None)
+      fc._transform_features_v2({}, [column2, column1], None)
       self.assertEqual(0, column1.call_order)
       self.assertEqual(1, column2.call_order)
 
@@ -5551,7 +5861,7 @@ class IndicatorColumnTest(test.TestCase):
     self.assertEqual(indicator_a.variable_shape, [1, 4])
     self.assertTrue(indicator_a._is_v2_column)
 
-    b = fc_old.categorical_column_with_hash_bucket('b', hash_bucket_size=100)
+    b = fc_old._categorical_column_with_hash_bucket('b', hash_bucket_size=100)
     indicator_b = fc.indicator_column(b)
     self.assertEqual(indicator_b.categorical_column.name, 'b')
     self.assertEqual(indicator_b.name, 'b_indicator')
@@ -5565,8 +5875,9 @@ class IndicatorColumnTest(test.TestCase):
         'animal': ['fox', 'fox']
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                        self.evaluate(output))
 
   def test_2D_shape_succeeds(self):
     # TODO(ispir/cassandrax): Swith to categorical_column_with_keys when ready.
@@ -5580,8 +5891,9 @@ class IndicatorColumnTest(test.TestCase):
                 dense_shape=[2, 1])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 0., 1., 0.], [0., 0., 1., 0.]],
+                        self.evaluate(output))
 
   def test_multi_hot(self):
     animal = fc.indicator_column(
@@ -5593,8 +5905,8 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 1], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 2., 0., 0.]], output.eval())
+
+    self.assertAllEqual([[0., 2., 0., 0.]], self.evaluate(output))
 
   def test_multi_hot2(self):
     animal = fc.indicator_column(
@@ -5605,9 +5917,10 @@ class IndicatorColumnTest(test.TestCase):
                 indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
     })
     output = transformation_cache.get(animal, None)
-    with self.cached_session():
-      self.assertAllEqual([[0., 1., 1., 0.]], output.eval())
 
+    self.assertAllEqual([[0., 1., 1., 0.]], self.evaluate(output))
+
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     a = fc.categorical_column_with_hash_bucket('a', 4)
     column = fc.indicator_column(a)
@@ -5616,44 +5929,52 @@ class IndicatorColumnTest(test.TestCase):
     self.assertEqual(column.name, 'a_indicator')
     self.assertEqual(column.variable_shape, [1, 4])
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_indicator = fc.indicator_column(a)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_indicator]))
+        features=fc.make_parse_example_spec_v2([a_indicator]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_transform(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_indicator = fc.indicator_column(a)
     features = {
-        'aaa': sparse_tensor.SparseTensorValue(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=('marlo', 'skywalker', 'omar'),
-            dense_shape=(2, 2))
+        'aaa':
+            sparse_tensor.SparseTensorValue(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=('marlo', 'skywalker', 'omar'),
+                dense_shape=(2, 2))
     }
-    indicator_tensor = fc._transform_features(features, [a_indicator],
-                                              None)[a_indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0, 0, 1], [1, 0, 0]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [a_indicator],
+                                                 None)[a_indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual([[0, 0, 1], [1, 0, 0]], self.evaluate(indicator_tensor))
 
+  @test_util.run_deprecated_v1
   def test_transform_with_weighted_column(self):
     # Github issue 12557
     ids = fc.categorical_column_with_vocabulary_list(
@@ -5661,14 +5982,18 @@ class IndicatorColumnTest(test.TestCase):
     weights = fc.weighted_categorical_column(ids, 'weights')
     indicator = fc.indicator_column(weights)
     features = {
-        'ids': constant_op.constant([['c', 'b', 'a']]),
-        'weights': constant_op.constant([[2., 4., 6.]])
+        'ids': constant_op.constant([['c', 'b', 'a', 'c']]),
+        'weights': constant_op.constant([[2., 4., 6., 1.]])
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[6., 4., 2.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    self.assertAllEqual([[6., 4., 3.]], self.evaluate(indicator_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_weighted_column(self):
     # Github issue 12583
     ids = fc.categorical_column_with_vocabulary_list(
@@ -5679,11 +6004,15 @@ class IndicatorColumnTest(test.TestCase):
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
         'weights': constant_op.constant([[2., 4., 6.]])
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0., 4., 2.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    self.assertAllEqual([[0., 4., 2.]], self.evaluate(indicator_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_with_missing_value_in_categorical_column(self):
     # Github issue 12583
     ids = fc.categorical_column_with_vocabulary_list(
@@ -5692,11 +6021,15 @@ class IndicatorColumnTest(test.TestCase):
     features = {
         'ids': constant_op.constant([['c', 'b', 'unknown']]),
     }
-    indicator_tensor = fc._transform_features(features, [indicator],
-                                              None)[indicator]
-    with _initialized_session():
-      self.assertAllEqual([[0., 1., 1.]], indicator_tensor.eval())
+    indicator_tensor = fc._transform_features_v2(features, [indicator],
+                                                 None)[indicator]
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
 
+    self.assertAllEqual([[0., 1., 1.]], self.evaluate(indicator_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -5710,12 +6043,15 @@ class IndicatorColumnTest(test.TestCase):
       model = fc.LinearModel([animal])
       predictions = model(features)
       weight_var, _ = model.variables
-      with _initialized_session():
-        # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
-        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_old_linear_model(self):
     animal = fc.indicator_column(
@@ -5729,16 +6065,19 @@ class IndicatorColumnTest(test.TestCase):
 
       predictions = fc_old.linear_model(features, [animal])
       weight_var = get_linear_model_column_var(animal)
-      with _initialized_session():
-        # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
-        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     animal = fc.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -5748,14 +6087,18 @@ class IndicatorColumnTest(test.TestCase):
 
       predictions = fc_old.linear_model(features, [animal])
       weight_var = get_linear_model_column_var(animal)
-      with _initialized_session():
-        # All should be zero-initialized.
-        self.assertAllClose([[0.], [0.], [0.], [0.]], weight_var.eval())
-        self.assertAllClose([[0.]], predictions.eval())
-        weight_var.assign([[1.], [2.], [3.], [4.]]).eval()
-        self.assertAllClose([[2. + 3.]], predictions.eval())
-
-  def test_feature_layer(self):
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # All should be zero-initialized.
+      self.assertAllClose([[0.], [0.], [0.], [0.]], self.evaluate(weight_var))
+      self.assertAllClose([[0.]], self.evaluate(predictions))
+      self.evaluate(weight_var.assign([[1.], [2.], [3.], [4.]]))
+      self.assertAllClose([[2. + 3.]], self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
@@ -5764,10 +6107,14 @@ class IndicatorColumnTest(test.TestCase):
               sparse_tensor.SparseTensor(
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
-      net = fc.FeatureLayer([animal])(features)
-      with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+      net = fc.DenseFeatures([animal])(features)
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     animal = fc.indicator_column(
         fc.categorical_column_with_identity('animal', num_buckets=4))
@@ -5778,12 +6125,15 @@ class IndicatorColumnTest(test.TestCase):
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
       net = fc_old.input_layer(features, [animal])
-      with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
 
   def test_input_layer_old_categorical(self):
     animal = fc.indicator_column(
-        fc_old.categorical_column_with_identity('animal', num_buckets=4))
+        fc_old._categorical_column_with_identity('animal', num_buckets=4))
     with ops.Graph().as_default():
       features = {
           'animal':
@@ -5791,9 +6141,13 @@ class IndicatorColumnTest(test.TestCase):
                   indices=[[0, 0], [0, 1]], values=[1, 2], dense_shape=[1, 2])
       }
       net = fc_old.input_layer(features, [animal])
-      with _initialized_session():
-        self.assertAllClose([[0., 1., 1., 0.]], net.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose([[0., 1., 1., 0.]], self.evaluate(net))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     parent = fc.categorical_column_with_identity('animal', num_buckets=4)
     animal = fc.indicator_column(parent)
@@ -5822,7 +6176,6 @@ class IndicatorColumnTest(test.TestCase):
     self.assertIs(parent, new_animal.categorical_column)
 
 
-
 class _TestStateManager(fc.StateManager):
 
   def __init__(self, trainable=True):
@@ -5864,6 +6217,7 @@ class _TestStateManager(fc.StateManager):
 
 class EmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -5885,22 +6239,27 @@ class EmbeddingColumnTest(test.TestCase):
     self.assertTrue(embedding_column._is_v2_column)
 
   def test_is_v2_column(self):
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     embedding_column = fc.embedding_column(
         categorical_column, dimension=embedding_dimension)
     self.assertFalse(embedding_column._is_v2_column)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     self.assertIs(categorical_column, embedding_column.categorical_column)
     self.assertEqual(embedding_dimension, embedding_column.dimension)
     self.assertEqual('my_combiner', embedding_column.combiner)
@@ -5914,15 +6273,20 @@ class EmbeddingColumnTest(test.TestCase):
         'aaa': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     embedding_dimension = 2
     original = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
-        combiner='my_combiner', initializer=lambda: 'my_initializer',
-        ckpt_to_load_from='my_ckpt', tensor_name_in_ckpt='my_ckpt_tensor',
-        max_norm=42., trainable=False)
+        categorical_column,
+        dimension=embedding_dimension,
+        combiner='my_combiner',
+        initializer=lambda: 'my_initializer',
+        ckpt_to_load_from='my_ckpt',
+        tensor_name_in_ckpt='my_ckpt_tensor',
+        max_norm=42.,
+        trainable=False)
     for embedding_column in (original, copy.deepcopy(original)):
       self.assertEqual('aaa', embedding_column.categorical_column.name)
       self.assertEqual(3, embedding_column.categorical_column.num_buckets)
@@ -5942,51 +6306,60 @@ class EmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
     with self.assertRaisesRegexp(ValueError, 'initializer must be callable'):
       fc.embedding_column(categorical_column, dimension=2, initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_embedded = fc.embedding_column(a, dimension=2)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer']))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer']))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded]))
+        features=fc.make_parse_example_spec_v2([a_embedded]))
     self.assertIn('aaa', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     a_embedded = fc.embedding_column(a, dimension=2)
     features = {
-        'aaa': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(0, 1, 0),
-            dense_shape=(2, 2))
+        'aaa':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 1, 0),
+                dense_shape=(2, 2))
     }
-    outputs = fc._transform_features(features, [a, a_embedded], None)
+    outputs = fc._transform_features_v2(features, [a, a_embedded], None)
     output_a = outputs[a]
     output_embedded = outputs[a_embedded]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_embedded.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                self.evaluate(output_embedded))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
@@ -6006,6 +6379,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6028,7 +6402,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -6043,10 +6418,14 @@ class EmbeddingColumnTest(test.TestCase):
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_old_categorical(self):
     # Inputs.
     vocabulary_size = 3
@@ -6086,7 +6465,7 @@ class EmbeddingColumnTest(test.TestCase):
     )
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
         categorical_column,
@@ -6103,10 +6482,14 @@ class EmbeddingColumnTest(test.TestCase):
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_3d(self):
     # Inputs.
     vocabulary_size = 4
@@ -6122,11 +6505,12 @@ class EmbeddingColumnTest(test.TestCase):
     # Embedding variable.
     embedding_dimension = 3
     embedding_values = (
-        (1., 2., 4.),   # id 0
-        (3., 5., 1.),   # id 1
+        (1., 2., 4.),  # id 0
+        (3., 5., 1.),  # id 1
         (7., 11., 2.),  # id 2
-        (2., 7., 12.)   # id 3
+        (2., 7., 12.)  # id 3
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6150,7 +6534,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -6165,10 +6550,14 @@ class EmbeddingColumnTest(test.TestCase):
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
     self.assertItemsEqual(('embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
@@ -6188,6 +6577,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6210,7 +6600,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         initializer=_initializer)
     state_manager = _TestStateManager()
     embedding_column.create_state(state_manager)
@@ -6230,17 +6621,23 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
     with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval(
-          feed_dict={
-              input_indices: sparse_input.indices,
-              input_values: sparse_input.values,
-              input_shape: sparse_input.dense_shape,
-          }))
+      self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+      self.assertAllEqual(
+          expected_lookups,
+          embedding_lookup.eval(
+              feed_dict={
+                  input_indices: sparse_input.indices,
+                  input_values: sparse_input.values,
+                  input_shape: sparse_input.dense_shape,
+              }))
 
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_restore_from_ckpt(self):
     # Inputs.
     vocabulary_size = 3
@@ -6280,7 +6677,8 @@ class EmbeddingColumnTest(test.TestCase):
     categorical_column = fc.categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
-        categorical_column, dimension=embedding_dimension,
+        categorical_column,
+        dimension=embedding_dimension,
         ckpt_to_load_from=ckpt_path,
         tensor_name_in_ckpt=ckpt_tensor)
     state_manager = _TestStateManager()
@@ -6294,12 +6692,16 @@ class EmbeddingColumnTest(test.TestCase):
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(
-        ('embedding_weights:0',), tuple([v.name for v in global_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, embedding_lookup.eval())
+    self.assertItemsEqual(('embedding_weights:0',),
+                          tuple([v.name for v in global_vars]))
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(embedding_lookup))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 4
@@ -6317,6 +6719,7 @@ class EmbeddingColumnTest(test.TestCase):
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6343,39 +6746,45 @@ class EmbeddingColumnTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
       bias = trainable_vars['linear_model/bias_weights:0']
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
-      linear_weights = trainable_vars[
-          'linear_model/aaa_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # example 2, ids [], embedding[2] = [0, 0]
-        # example 3, ids [1], embedding[3] = [3, 5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
-
-  def test_feature_layer(self):
+      linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -6394,6 +6803,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6421,23 +6831,27 @@ class EmbeddingColumnTest(test.TestCase):
         initializer=_initializer)
 
     # Provide sparse input and get dense result.
-    l = fc.FeatureLayer((embedding_column,))
-    feature_layer = l({'aaa': sparse_input})
+    l = fc.DenseFeatures((embedding_column,))
+    dense_features = l({'aaa': sparse_input})
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
     for v in global_vars:
       self.assertTrue(isinstance(v, variables_lib.RefVariable))
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
-  def test_feature_layer_not_trainable(self):
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features_not_trainable(self):
     # Inputs.
     vocabulary_size = 3
     sparse_input = sparse_tensor.SparseTensorValue(
@@ -6456,6 +6870,7 @@ class EmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -6484,18 +6899,24 @@ class EmbeddingColumnTest(test.TestCase):
         trainable=False)
 
     # Provide sparse input and get dense result.
-    feature_layer = fc.FeatureLayer((embedding_column,))({'aaa': sparse_input})
+    dense_features = fc.DenseFeatures((embedding_column,))({
+        'aaa': sparse_input
+    })
 
     # Assert expected embedding variable and lookups.
     global_vars = ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)
-    self.assertItemsEqual(('feature_layer/aaa_embedding/embedding_weights:0',),
+    self.assertItemsEqual(('dense_features/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in global_vars]))
-    self.assertItemsEqual(
-        [], ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, global_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+    self.assertItemsEqual([],
+                          ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(global_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
 
+  @test_util.run_deprecated_v1
   def test_input_layer(self):
     # Inputs.
     vocabulary_size = 3
@@ -6554,9 +6975,12 @@ class EmbeddingColumnTest(test.TestCase):
     trainable_vars = ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
     self.assertItemsEqual(('input_layer/aaa_embedding/embedding_weights:0',),
                           tuple([v.name for v in trainable_vars]))
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, trainable_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
+
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(trainable_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(feature_layer))
 
   def test_old_linear_model(self):
     # Inputs.
@@ -6611,28 +7035,34 @@ class EmbeddingColumnTest(test.TestCase):
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # example 2, ids [], embedding[2] = [0, 0]
-        # example 3, ids [1], embedding[3] = [3, 5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     # Inputs.
@@ -6659,7 +7089,7 @@ class EmbeddingColumnTest(test.TestCase):
       return zeros_embedding_values
 
     # Build columns.
-    categorical_column = fc_old.categorical_column_with_identity(
+    categorical_column = fc_old._categorical_column_with_identity(
         key='aaa', num_buckets=vocabulary_size)
     embedding_column = fc.embedding_column(
         categorical_column,
@@ -6687,29 +7117,36 @@ class EmbeddingColumnTest(test.TestCase):
       embedding_weights = trainable_vars[
           'linear_model/aaa_embedding/embedding_weights:0']
       linear_weights = trainable_vars['linear_model/aaa_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # example 2, ids [], embedding[2] = [0, 0]
-        # example 3, ids [1], embedding[3] = [3, 5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
-        self.assertAllClose(((94.,), (29.,), (0.,), (42.,)), predictions.eval())
 
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # example 2, ids [], embedding[2] = [0, 0]
+      # example 3, ids [1], embedding[3] = [3, 5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5, 4*0 + 6*0, 4*3 + 6*5] = [94, 29, 0, 42]
+      self.assertAllClose(((94.,), (29.,), (0.,), (42.,)),
+                          self.evaluate(predictions))
+
+  @test_util.run_deprecated_v1
   def test_serialization(self):
 
     def _initializer(shape, dtype, partition_info):
@@ -6763,6 +7200,7 @@ class EmbeddingColumnTest(test.TestCase):
 
 class SharedEmbeddingColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6787,6 +7225,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_all_constructor_args(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6818,6 +7257,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         'bbb': parsing_ops.VarLenFeature(dtypes.int64)
     }, embedding_column_b.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6849,6 +7289,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'aaa': parsing_ops.VarLenFeature(dtypes.int64)
       }, embedding_column_a.parse_example_spec)
 
+  @test_util.run_deprecated_v1
   def test_invalid_initializer(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6860,6 +7301,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           dimension=2,
           initializer='not_fn')
 
+  @test_util.run_deprecated_v1
   def test_incompatible_column_type(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6874,6 +7316,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
           [categorical_column_a, categorical_column_b, categorical_column_c],
           dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_weighted_categorical_column_ok(self):
     categorical_column_a = fc.categorical_column_with_identity(
         key='aaa', num_buckets=3)
@@ -6891,82 +7334,90 @@ class SharedEmbeddingColumnTest(test.TestCase):
         [weighted_categorical_column_a, weighted_categorical_column_b],
         dimension=2)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     b = fc.categorical_column_with_vocabulary_list(
         key='bbb', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-            'bbb':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'stringer', b'marlo'])),
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+                'bbb':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'stringer', b'marlo'])),
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_embedded, b_embedded]))
+        features=fc.make_parse_example_spec_v2([a_embedded, b_embedded]))
     self.assertIn('aaa', features)
     self.assertIn('bbb', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'stringer', b'marlo'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['bbb'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'stringer', b'marlo'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['bbb']))
+
+  @test_util.run_deprecated_v1
   def test_transform_feature(self):
     a = fc.categorical_column_with_identity(key='aaa', num_buckets=3)
     b = fc.categorical_column_with_identity(key='bbb', num_buckets=3)
     a_embedded, b_embedded = fc.shared_embedding_columns_v2([a, b], dimension=2)
     features = {
-        'aaa': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(0, 1, 0),
-            dense_shape=(2, 2)),
-        'bbb': sparse_tensor.SparseTensor(
-            indices=((0, 0), (1, 0), (1, 1)),
-            values=(1, 2, 1),
-            dense_shape=(2, 2)),
+        'aaa':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(0, 1, 0),
+                dense_shape=(2, 2)),
+        'bbb':
+            sparse_tensor.SparseTensor(
+                indices=((0, 0), (1, 0), (1, 1)),
+                values=(1, 2, 1),
+                dense_shape=(2, 2)),
     }
-    outputs = fc._transform_features(features, [a, a_embedded, b, b_embedded],
-                                     None)
+    outputs = fc._transform_features_v2(features,
+                                        [a, a_embedded, b, b_embedded], None)
     output_a = outputs[a]
     output_a_embedded = outputs[a_embedded]
     output_b = outputs[b]
     output_b_embedded = outputs[b_embedded]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self, output_a.eval(), output_a_embedded.eval())
-      _assert_sparse_tensor_value(
-          self, output_b.eval(), output_b_embedded.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(self, self.evaluate(output_a),
+                                self.evaluate(output_a_embedded))
+    _assert_sparse_tensor_value(self, self.evaluate(output_b),
+                                self.evaluate(output_b_embedded))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor(self):
     # Inputs.
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
-    input_features = {
-        'aaa': input_a,
-        'bbb': input_b
-    }
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
+    input_features = {'aaa': input_a, 'bbb': input_b}
 
     # Embedding variable.
     embedding_dimension = 2
@@ -6975,6 +7426,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -7016,21 +7468,27 @@ class SharedEmbeddingColumnTest(test.TestCase):
     self.assertItemsEqual(('aaa_bbb_shared_embedding:0',),
                           tuple([v.name for v in global_vars]))
     embedding_var = global_vars[0]
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, embedding_var.eval())
-      self.assertAllEqual(expected_lookups_a, embedding_lookup_a.eval())
-      self.assertAllEqual(expected_lookups_b, embedding_lookup_b.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values, self.evaluate(embedding_var))
+    self.assertAllEqual(expected_lookups_a, self.evaluate(embedding_lookup_a))
+    self.assertAllEqual(expected_lookups_b, self.evaluate(embedding_lookup_b))
+
+  @test_util.run_deprecated_v1
   def test_get_dense_tensor_placeholder_inputs(self):
     # Inputs.
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
     # Specify shape, because dense input must have rank specified.
     input_a_placeholder = array_ops.placeholder(
         dtype=dtypes.int64, shape=[None, 3])
@@ -7052,6 +7510,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -7077,22 +7536,26 @@ class SharedEmbeddingColumnTest(test.TestCase):
     with _initialized_session() as sess:
       sess.run([embedding_lookup_a, embedding_lookup_b], feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     # Inputs.
     batch_size = 2
     vocabulary_size = 3
     # -1 values are ignored.
-    input_a = np.array(
-        [[2, -1, -1],  # example 0, ids [2]
-         [0, 1, -1]])  # example 1, ids [0, 1]
-    input_b = np.array(
-        [[0, -1, -1],  # example 0, ids [0]
-         [-1, -1, -1]])  # example 1, ids []
+    input_a = np.array([
+        [2, -1, -1],  # example 0, ids [2]
+        [0, 1, -1]
+    ])  # example 1, ids [0, 1]
+    input_b = np.array([
+        [0, -1, -1],  # example 0, ids [0]
+        [-1, -1, -1]
+    ])  # example 1, ids []
 
     # Embedding variable.
     embedding_dimension = 2
     embedding_shape = (vocabulary_size, embedding_dimension)
     zeros_embedding_values = np.zeros(embedding_shape)
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual(embedding_shape, shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -7128,8 +7591,8 @@ class SharedEmbeddingColumnTest(test.TestCase):
           expected_var_names,
           [v.name for v in ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)])
       trainable_vars = {
-          v.name: v for v in ops.get_collection(
-              ops.GraphKeys.TRAINABLE_VARIABLES)
+          v.name: v
+          for v in ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)
       }
       self.assertItemsEqual(expected_var_names, trainable_vars.keys())
       bias = trainable_vars['linear_model/bias_weights:0']
@@ -7138,35 +7601,40 @@ class SharedEmbeddingColumnTest(test.TestCase):
           'linear_model/aaa_shared_embedding/weights:0']
       linear_weights_b = trainable_vars[
           'linear_model/bbb_shared_embedding/weights:0']
-      with _initialized_session():
-        # Predictions with all zero weights.
-        self.assertAllClose(np.zeros((1,)), bias.eval())
-        self.assertAllClose(zeros_embedding_values, embedding_weights.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_a.eval())
-        self.assertAllClose(
-            np.zeros((embedding_dimension, 1)), linear_weights_b.eval())
-        self.assertAllClose(np.zeros((batch_size, 1)), predictions.eval())
-
-        # Predictions with all non-zero weights.
-        embedding_weights.assign((
-            (1., 2.),  # id 0
-            (3., 5.),  # id 1
-            (7., 11.)  # id 2
-        )).eval()
-        linear_weights_a.assign(((4.,), (6.,))).eval()
-        # example 0, ids [2], embedding[0] = [7, 11]
-        # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
-        # sum(embeddings * linear_weights)
-        # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
-        linear_weights_b.assign(((3.,), (5.,))).eval()
-        # example 0, ids [0], embedding[0] = [1, 2]
-        # example 1, ids [], embedding[1] = 0, 0]
-        # sum(embeddings * linear_weights)
-        # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
-        self.assertAllClose([[94. + 13.], [29.]], predictions.eval())
-
-  def _test_feature_layer(self, trainable=True):
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      # Predictions with all zero weights.
+      self.assertAllClose(np.zeros((1,)), self.evaluate(bias))
+      self.assertAllClose(zeros_embedding_values,
+                          self.evaluate(embedding_weights))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_a))
+      self.assertAllClose(
+          np.zeros((embedding_dimension, 1)), self.evaluate(linear_weights_b))
+      self.assertAllClose(np.zeros((batch_size, 1)), self.evaluate(predictions))
+
+      # Predictions with all non-zero weights.
+      self.evaluate(
+          embedding_weights.assign((
+              (1., 2.),  # id 0
+              (3., 5.),  # id 1
+              (7., 11.)  # id 2
+          )))
+      self.evaluate(linear_weights_a.assign(((4.,), (6.,))))
+      # example 0, ids [2], embedding[0] = [7, 11]
+      # example 1, ids [0, 1], embedding[1] = mean([1, 2] + [3, 5]) = [2, 3.5]
+      # sum(embeddings * linear_weights)
+      # = [4*7 + 6*11, 4*2 + 6*3.5] = [94, 29]
+      self.evaluate(linear_weights_b.assign(((3.,), (5.,))))
+      # example 0, ids [0], embedding[0] = [1, 2]
+      # example 1, ids [], embedding[1] = 0, 0]
+      # sum(embeddings * linear_weights)
+      # = [3*1 + 5*2, 3*0 +5*0] = [13, 0]
+      self.assertAllClose([[94. + 13.], [29.]], self.evaluate(predictions))
+
+  def _test_dense_features(self, trainable=True):
     # Inputs.
     vocabulary_size = 3
     sparse_input_a = sparse_tensor.SparseTensorValue(
@@ -7201,6 +7669,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
         (3., 5.),  # id 1
         (7., 11.)  # id 2
     )
+
     def _initializer(shape, dtype, partition_info):
       self.assertAllEqual((vocabulary_size, embedding_dimension), shape)
       self.assertEqual(dtypes.float32, dtype)
@@ -7252,7 +7721,7 @@ class SharedEmbeddingColumnTest(test.TestCase):
     }
 
     # Provide sparse input and get dense result.
-    feature_layer = fc.FeatureLayer(
+    dense_features = fc.DenseFeatures(
         feature_columns=(embedding_column_b, embedding_column_a,
                          embedding_column_c, embedding_column_d))(
                              features)
@@ -7272,16 +7741,23 @@ class SharedEmbeddingColumnTest(test.TestCase):
     else:
       self.assertItemsEqual([], tuple([v.name for v in trainable_vars]))
     shared_embedding_vars = global_vars
-    with _initialized_session():
-      self.assertAllEqual(embedding_values, shared_embedding_vars[0].eval())
-      self.assertAllEqual(expected_lookups, feature_layer.eval())
 
-  def test_feature_layer(self):
-    self._test_feature_layer()
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    self.assertAllEqual(embedding_values,
+                        self.evaluate(shared_embedding_vars[0]))
+    self.assertAllEqual(expected_lookups, self.evaluate(dense_features))
+
+  @test_util.run_deprecated_v1
+  def test_dense_features(self):
+    self._test_dense_features()
 
-  def test_feature_layer_no_trainable(self):
-    self._test_feature_layer(trainable=False)
+  @test_util.run_deprecated_v1
+  def test_dense_features_no_trainable(self):
+    self._test_dense_features(trainable=False)
 
+  @test_util.run_deprecated_v1
   def test_serialization(self):
 
     def _initializer(shape, dtype, partition_info):
@@ -7302,9 +7778,9 @@ class SharedEmbeddingColumnTest(test.TestCase):
     # TODO(rohanj): Add tests for (from|get)_config once implemented
 
 
-
 class WeightedCategoricalColumnTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_defaults(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -7320,11 +7796,12 @@ class WeightedCategoricalColumnTest(test.TestCase):
 
   def test_is_v2_column(self):
     column = fc.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+        categorical_column=fc_old._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     self.assertFalse(column._is_v2_column)
 
+  @test_util.run_deprecated_v1
   def test_deep_copy(self):
     """Tests deepcopy of categorical_column_with_hash_bucket."""
     original = fc.weighted_categorical_column(
@@ -7365,7 +7842,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
     with self.assertRaisesRegexp(ValueError, 'Bad dtype'):
-      fc._transform_features({
+      fc._transform_features_v2({
           'ids': strings,
           'values': strings
       }, (column,), None)
@@ -7386,77 +7863,79 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=('omar', 'stringer', 'marlo'),
         dense_shape=(2, 2))
-    with self.assertRaisesRegexp(
-        ValueError, 'values is not in features dictionary'):
-      fc._transform_features({'ids': inputs}, (column,), None)
+    with self.assertRaisesRegexp(ValueError,
+                                 'values is not in features dictionary'):
+      fc._transform_features_v2({'ids': inputs}, (column,), None)
 
+  @test_util.run_deprecated_v1
   def test_parse_example(self):
     a = fc.categorical_column_with_vocabulary_list(
         key='aaa', vocabulary_list=('omar', 'stringer', 'marlo'))
     a_weighted = fc.weighted_categorical_column(a, weight_feature_key='weights')
-    data = example_pb2.Example(features=feature_pb2.Features(
-        feature={
-            'aaa':
-                feature_pb2.Feature(bytes_list=feature_pb2.BytesList(
-                    value=[b'omar', b'stringer'])),
-            'weights':
-                feature_pb2.Feature(float_list=feature_pb2.FloatList(
-                    value=[1., 10.]))
-        }))
+    data = example_pb2.Example(
+        features=feature_pb2.Features(
+            feature={
+                'aaa':
+                    feature_pb2.Feature(
+                        bytes_list=feature_pb2.BytesList(
+                            value=[b'omar', b'stringer'])),
+                'weights':
+                    feature_pb2.Feature(
+                        float_list=feature_pb2.FloatList(value=[1., 10.]))
+            }))
     features = parsing_ops.parse_example(
         serialized=[data.SerializeToString()],
-        features=fc.make_parse_example_spec([a_weighted]))
+        features=fc.make_parse_example_spec_v2([a_weighted]))
     self.assertIn('aaa', features)
     self.assertIn('weights', features)
-    with self.cached_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([b'omar', b'stringer'], dtype=np.object_),
-              dense_shape=[1, 2]),
-          features['aaa'].eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=[[0, 0], [0, 1]],
-              values=np.array([1., 10.], dtype=np.float32),
-              dense_shape=[1, 2]),
-          features['weights'].eval())
 
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([b'omar', b'stringer'], dtype=np.object_),
+            dense_shape=[1, 2]), self.evaluate(features['aaa']))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=[[0, 0], [0, 1]],
+            values=np.array([1., 10.], dtype=np.float32),
+            dense_shape=[1, 2]), self.evaluate(features['weights']))
+
+  @test_util.run_deprecated_v1
   def test_transform_features(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(0, 1, 0),
-        dense_shape=(2, 2))
+        indices=((0, 0), (1, 0), (1, 1)), values=(0, 1, 0), dense_shape=(2, 2))
     weights = sparse_tensor.SparseTensorValue(
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': weights,
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=weights.indices,
-              values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array(inputs.values, dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=weights.indices,
+            values=np.array(weights.values, dtype=np.float32),
+            dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_input(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -7466,55 +7945,57 @@ class WeightedCategoricalColumnTest(test.TestCase):
         indices=((0, 0), (1, 0), (1, 1)),
         values=(0.5, 1.0, 0.1),
         dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': ((0, -1), (1, 0)),
         'values': weights,
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((0, 1, 0), dtype=np.int64),
-              dense_shape=(2, 2)),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=weights.indices,
-              values=np.array(weights.values, dtype=np.float32),
-              dense_shape=weights.dense_shape),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((0, 1, 0), dtype=np.int64),
+            dense_shape=(2, 2)), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=weights.indices,
+            values=np.array(weights.values, dtype=np.float32),
+            dense_shape=weights.dense_shape), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_transform_features_dense_weights(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     inputs = sparse_tensor.SparseTensorValue(
-        indices=((0, 0), (1, 0), (1, 1)),
-        values=(2, 1, 0),
-        dense_shape=(2, 2))
-    id_tensor, weight_tensor = fc._transform_features({
+        indices=((0, 0), (1, 0), (1, 1)), values=(2, 1, 0), dense_shape=(2, 2))
+    id_tensor, weight_tensor = fc._transform_features_v2({
         'ids': inputs,
         'values': ((.5, 0.), (1., .1)),
     }, (column,), None)[column]
-    with _initialized_session():
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=inputs.indices,
-              values=np.array(inputs.values, dtype=np.int64),
-              dense_shape=inputs.dense_shape),
-          id_tensor.eval())
-      _assert_sparse_tensor_value(
-          self,
-          sparse_tensor.SparseTensorValue(
-              indices=((0, 0), (1, 0), (1, 1)),
-              values=np.array((.5, 1., .1), dtype=np.float32),
-              dense_shape=(2, 2)),
-          weight_tensor.eval())
 
+    self.evaluate(variables_lib.global_variables_initializer())
+    self.evaluate(lookup_ops.tables_initializer())
+
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=inputs.indices,
+            values=np.array(inputs.values, dtype=np.int64),
+            dense_shape=inputs.dense_shape), self.evaluate(id_tensor))
+    _assert_sparse_tensor_value(
+        self,
+        sparse_tensor.SparseTensorValue(
+            indices=((0, 0), (1, 0), (1, 1)),
+            values=np.array((.5, 1., .1), dtype=np.float32),
+            dense_shape=(2, 2)), self.evaluate(weight_tensor))
+
+  @test_util.run_deprecated_v1
   def test_linear_model(self):
     column = fc.weighted_categorical_column(
         categorical_column=fc.categorical_column_with_identity(
@@ -7535,15 +8016,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
                   dense_shape=(2, 2))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_linear_model_mismatched_shape(self):
     column = fc.weighted_categorical_column(
@@ -7589,7 +8073,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_linear_model_mismatched_dense_shape(self):
     column = fc.weighted_categorical_column(
@@ -7607,15 +8091,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
           'values': ((.5,), (1.,), (.1,))
       })
       weight_var, bias = model.variables
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model(self):
     column = fc.weighted_categorical_column(
@@ -7637,15 +8124,18 @@ class WeightedCategoricalColumnTest(test.TestCase):
       }, (column,))
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model_mismatched_shape(self):
     column = fc.weighted_categorical_column(
@@ -7690,7 +8180,7 @@ class WeightedCategoricalColumnTest(test.TestCase):
           rewriter_config_pb2.RewriterConfig.OFF)
       with _initialized_session(config):
         with self.assertRaisesRegexp(errors.OpError, 'Incompatible shapes'):
-          predictions.eval()
+          self.evaluate(predictions)
 
   def test_old_linear_model_mismatched_dense_shape(self):
     column = fc.weighted_categorical_column(
@@ -7708,19 +8198,22 @@ class WeightedCategoricalColumnTest(test.TestCase):
       }, (column,))
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   def test_old_linear_model_old_categorical(self):
     column = fc.weighted_categorical_column(
-        categorical_column=fc_old.categorical_column_with_identity(
+        categorical_column=fc_old._categorical_column_with_identity(
             key='ids', num_buckets=3),
         weight_feature_key='values')
     with ops.Graph().as_default():
@@ -7738,18 +8231,22 @@ class WeightedCategoricalColumnTest(test.TestCase):
       }, (column,))
       bias = get_linear_model_bias()
       weight_var = get_linear_model_column_var(column)
-      with _initialized_session():
-        self.assertAllClose((0.,), bias.eval())
-        self.assertAllClose(((0.,), (0.,), (0.,)), weight_var.eval())
-        self.assertAllClose(((0.,), (0.,)), predictions.eval())
-        weight_var.assign(((1.,), (2.,), (3.,))).eval()
-        # weight_var[0] * weights[0, 0] = 1 * .5 = .5
-        # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
-        # = 3*1 + 2*.1 = 3+.2 = 3.2
-        self.assertAllClose(((.5,), (3.2,)), predictions.eval())
+
+      self.evaluate(variables_lib.global_variables_initializer())
+      self.evaluate(lookup_ops.tables_initializer())
+
+      self.assertAllClose((0.,), self.evaluate(bias))
+      self.assertAllClose(((0.,), (0.,), (0.,)), self.evaluate(weight_var))
+      self.assertAllClose(((0.,), (0.,)), self.evaluate(predictions))
+      self.evaluate(weight_var.assign(((1.,), (2.,), (3.,))))
+      # weight_var[0] * weights[0, 0] = 1 * .5 = .5
+      # weight_var[2] * weights[1, 0] + weight_var[1] * weights[1, 1]
+      # = 3*1 + 2*.1 = 3+.2 = 3.2
+      self.assertAllClose(((.5,), (3.2,)), self.evaluate(predictions))
 
   # TODO(ptucker): Add test with embedding of weighted categorical.
 
+  @test_util.run_deprecated_v1
   def test_serialization(self):
     categorical_column = fc.categorical_column_with_identity(
         key='ids', num_buckets=3)
diff --git a/tensorflow/python/framework/auto_control_deps.py b/tensorflow/python/framework/auto_control_deps.py
index 9a9ee46aabb13a2dc9bff153c49814da5724ebf6..30dc959e9a9f717bdb5c56bfbdde5ffa9d48c257 100644
--- a/tensorflow/python/framework/auto_control_deps.py
+++ b/tensorflow/python/framework/auto_control_deps.py
@@ -21,9 +21,11 @@ from __future__ import print_function
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes as dtypes_module
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.util import nest
 from tensorflow.python.util import tf_decorator
 
@@ -70,6 +72,17 @@ class AutomaticControlDependencies(object):
       self._returned_tensors.add(indices)
       self._returned_tensors.add(values)
       return ops.IndexedSlices(values, indices, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, sparse_tensor.SparseTensor):
+      values = array_ops.identity(tensor.values)
+      indices = array_ops.identity(tensor.indices)
+      self._returned_tensors.add(indices)
+      self._returned_tensors.add(values)
+      return sparse_tensor.SparseTensor(
+          indices, values, dense_shape=tensor.dense_shape)
+    elif isinstance(tensor, tensor_array_ops.TensorArray):
+      flow = array_ops.identity(tensor.flow)
+      self._returned_tensors.add(flow)
+      return tensor_array_ops.build_ta_with_new_flow(tensor, flow)
     # We want to make the return values depend on the stateful operations, but
     # we don't want to introduce a cycle, so we make the return value the result
     # of a new identity operation that the stateful operations definitely don't
diff --git a/tensorflow/python/framework/auto_control_deps_test.py b/tensorflow/python/framework/auto_control_deps_test.py
index a1dff9e8349aba3fb16ac57314f0ea34a37f2c5b..5f5de45b9ee44da8a3440b5f3a5d55fbf7b8a02f 100644
--- a/tensorflow/python/framework/auto_control_deps_test.py
+++ b/tensorflow/python/framework/auto_control_deps_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import auto_control_deps as acd
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -46,6 +47,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
         val = c.mark_as_return(val)
       self.assertAllEqual(val.eval(), 4.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondMustRun(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -67,6 +69,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondMustRunSeparateRead(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -90,6 +93,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
       one.eval(feed_dict={p: True})
       self.assertAllEqual(v.read_value().eval(), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondNested(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -124,6 +128,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: True, q: True}), 7.0)
       self.assertAllEqual(val.eval(feed_dict={p: True, q: False}), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranch(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -144,6 +149,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 5.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 5.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranchUpdateBefore(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -165,6 +171,7 @@ class AutomaticControlDependenciesTest(test.TestCase):
       self.assertAllEqual(val.eval(feed_dict={p: False}), 6.0)
       self.assertAllEqual(val.eval(feed_dict={p: True}), 12.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondOneBranchUpdateAfter(self):
     with context.graph_mode(), self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
diff --git a/tensorflow/python/framework/constant_op.py b/tensorflow/python/framework/constant_op.py
index 53d84b2dc760c9a4e1c332ef4aa0e6bf3327662e..ade0797dcdbac0334a7cc7e657922b2d1139be4c 100644
--- a/tensorflow/python/framework/constant_op.py
+++ b/tensorflow/python/framework/constant_op.py
@@ -114,8 +114,9 @@ def convert_to_eager_tensor(value, ctx, dtype=None):
     return ops.EagerTensor(value, handle, device, dtype)
 
 
-@tf_export("constant")
-def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
+@tf_export(v1=["constant"])
+def constant_v1(
+    value, dtype=None, shape=None, name="Const", verify_shape=False):
   """Creates a constant tensor.
 
   The resulting tensor is populated with values of type `dtype`, as
@@ -174,6 +175,79 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   Raises:
     TypeError: if shape is incorrectly specified or unsupported.
   """
+  return _constant_impl(value, dtype, shape, name, verify_shape=verify_shape,
+                        allow_broadcast=False)
+
+
+@tf_export("constant", v1=[])
+def constant(value, dtype=None, shape=None, name="Const"):
+  """Creates a constant tensor.
+
+  The resulting tensor is populated with values of type `dtype`, as
+  specified by arguments `value` and (optionally) `shape` (see examples
+  below).
+
+  The argument `value` can be a constant value, or a list of values of type
+  `dtype`. If `value` is a list, then the length of the list must be less
+  than or equal to the number of elements implied by the `shape` argument (if
+  specified). In the case where the list length is less than the number of
+  elements specified by `shape`, the last element in the list will be used
+  to fill the remaining entries.
+
+  The argument `shape` is optional. If present, it specifies the dimensions of
+  the resulting tensor. If not present, the shape of `value` is used.
+
+  If the argument `dtype` is not specified, then the type is inferred from
+  the type of `value`.
+
+  For example:
+
+  ```python
+  # Constant 1-D Tensor populated with value list.
+  tensor = tf.constant([1, 2, 3, 4, 5, 6]) => [1 2 3 4 5 6]
+
+  # Constant 1-D Tensor populated with value list.
+  tensor = tf.constant([1, 2, 3, 4, 5, 6], shape=(2,3))
+       => [[1 2 3], [4 5 6]]
+
+  # Constant 2-D tensor populated with scalar value -1.
+  tensor = tf.constant(-1.0, shape=[2, 3]) => [[-1. -1. -1.]
+                                               [-1. -1. -1.]]
+  ```
+
+  `tf.constant` differs from `tf.fill` in a few ways:
+
+  *   `tf.constant` supports arbitrary constants, not just uniform scalar
+      Tensors like `tf.fill`.
+  *   `tf.constant` creates a `Const` node in the computation graph with the
+      exact value at graph construction time. On the other hand, `tf.fill`
+      creates an Op in the graph that is expanded at runtime.
+  *   Because `tf.constant` only embeds constant values in the graph, it does
+      not support dynamic shapes based on other runtime Tensors, whereas
+      `tf.fill` does.
+
+  Args:
+    value:          A constant value (or list) of output type `dtype`.
+
+    dtype:          The type of the elements of the resulting tensor.
+
+    shape:          Optional dimensions of resulting tensor.
+
+    name:           Optional name for the tensor.
+
+  Returns:
+    A Constant Tensor.
+
+  Raises:
+    TypeError: if shape is incorrectly specified or unsupported.
+  """
+  return _constant_impl(value, dtype, shape, name, verify_shape=False,
+                        allow_broadcast=True)
+
+
+def _constant_impl(
+    value, dtype, shape, name, verify_shape, allow_broadcast):
+  """Implementation of constant."""
   ctx = context.context()
   if ctx.executing_eagerly():
     t = convert_to_eager_tensor(value, ctx, dtype)
@@ -205,7 +279,8 @@ def constant(value, dtype=None, shape=None, name="Const", verify_shape=False):
   tensor_value = attr_value_pb2.AttrValue()
   tensor_value.tensor.CopyFrom(
       tensor_util.make_tensor_proto(
-          value, dtype=dtype, shape=shape, verify_shape=verify_shape))
+          value, dtype=dtype, shape=shape, verify_shape=verify_shape,
+          allow_broadcast=allow_broadcast))
   dtype_value = attr_value_pb2.AttrValue(type=tensor_value.tensor.dtype)
   const_tensor = g.create_op(
       "Const", [], [dtype_value.type],
diff --git a/tensorflow/python/framework/device.py b/tensorflow/python/framework/device.py
index 7f6e0a75a5c508e35ff5bf3c28d4ab31af205715..e7ac6444a4ac1e116675dbb059cd1953df1213ab 100644
--- a/tensorflow/python/framework/device.py
+++ b/tensorflow/python/framework/device.py
@@ -23,7 +23,7 @@ import threading
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("DeviceSpec")
+@tf_export(v1=["DeviceSpec"])
 class DeviceSpec(object):
   """Represents a (possibly partial) specification for a TensorFlow device.
 
diff --git a/tensorflow/python/framework/dtypes.py b/tensorflow/python/framework/dtypes.py
index 48e9f0524e8e8cb942206357c80f1205b1c8a7b3..9a4fe4e93b32aeedcb74cf0f7b2703f64d9db23a 100644
--- a/tensorflow/python/framework/dtypes.py
+++ b/tensorflow/python/framework/dtypes.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from six.moves import builtins
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.python import pywrap_tensorflow
@@ -346,7 +347,7 @@ tf_export("dtypes.uint32", "uint32").export_constant(__name__, "uint32")
 uint64 = DType(types_pb2.DT_UINT64)
 tf_export("dtypes.uint64", "uint64").export_constant(__name__, "uint64")
 int16 = DType(types_pb2.DT_INT16)
-tf_export("dtypes.uint16", "int16").export_constant(__name__, "int16")
+tf_export("dtypes.int16", "int16").export_constant(__name__, "int16")
 int8 = DType(types_pb2.DT_INT8)
 tf_export("dtypes.int8", "int8").export_constant(__name__, "int8")
 string = DType(types_pb2.DT_STRING)
@@ -548,8 +549,8 @@ _NP_TO_TF = frozenset([
     (np.int8, int8),
     (np.complex64, complex64),
     (np.complex128, complex128),
-    (np.object, string),
-    (np.bool, bool),
+    (np.object_, string),
+    (np.bool_, bool),
     (_np_qint8, qint8),
     (_np_quint8, quint8),
     (_np_qint16, qint16),
@@ -658,8 +659,9 @@ tf_export(
         __name__, "QUANTIZED_DTYPES")
 
 _PYTHON_TO_TF = {
-    float: float32,
-    bool: bool,
+    builtins.float: float32,
+    builtins.bool: bool,
+    builtins.object: string
 }
 
 
diff --git a/tensorflow/python/framework/dtypes_test.py b/tensorflow/python/framework/dtypes_test.py
index a873670e0461884d06cde1db4db2cf2db98fde3c..719fdc0953ae4d5bbe016b3dc2730f5601c3494e 100644
--- a/tensorflow/python/framework/dtypes_test.py
+++ b/tensorflow/python/framework/dtypes_test.py
@@ -81,10 +81,10 @@ class TypesTest(test_util.TensorFlowTestCase):
     self.assertIs(dtypes.int8, dtypes.as_dtype(np.int8))
     self.assertIs(dtypes.complex64, dtypes.as_dtype(np.complex64))
     self.assertIs(dtypes.complex128, dtypes.as_dtype(np.complex128))
-    self.assertIs(dtypes.string, dtypes.as_dtype(np.object))
+    self.assertIs(dtypes.string, dtypes.as_dtype(np.object_))
     self.assertIs(dtypes.string,
                   dtypes.as_dtype(np.array(["foo", "bar"]).dtype))
-    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool))
+    self.assertIs(dtypes.bool, dtypes.as_dtype(np.bool_))
     with self.assertRaises(TypeError):
       dtypes.as_dtype(np.dtype([("f1", np.uint), ("f2", np.int32)]))
 
diff --git a/tensorflow/python/framework/error_interpolation_test.py b/tensorflow/python/framework/error_interpolation_test.py
index 1b77548592cec08ff4fadfe2e740b746c6a9d115..9eaa4a5f2d04c8baaf720d4b9a32c5c707d33772 100644
--- a/tensorflow/python/framework/error_interpolation_test.py
+++ b/tensorflow/python/framework/error_interpolation_test.py
@@ -23,6 +23,7 @@ import os
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import traceable_stack
 from tensorflow.python.platform import test
 from tensorflow.python.util import tf_stack
@@ -112,6 +113,7 @@ class ComputeColocationSummaryFromOpTest(test.TestCase):
     self.assertIn("No node-device colocations", summary)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
 
   def setUp(self):
@@ -193,6 +195,7 @@ class InterpolateFilenamesAndLineNumbersTest(test.TestCase):
     self.assertRegexpMatches(interpolated_string, "constant_op.py:[0-9]+.*")
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateDeviceSummaryTest(test.TestCase):
 
   def _fancy_device_function(self, unused_op):
@@ -236,6 +239,7 @@ class InterpolateDeviceSummaryTest(test.TestCase):
     self.assertRegexpMatches(result, expected_re)
 
 
+@test_util.run_v1_only("b/120545219")
 class InterpolateColocationSummaryTest(test.TestCase):
 
   def setUp(self):
@@ -260,11 +264,13 @@ class InterpolateColocationSummaryTest(test.TestCase):
 
     self.graph = node_three.graph
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeThreeHasColocationInterpolation(self):
     message = "{{colocation_node Three_with_one}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFourHasColocationInterpolationForNodeThreeOnly(self):
     message = "{{colocation_node Four_with_three}}"
     result = error_interpolation.interpolate(message, self.graph)
@@ -273,12 +279,14 @@ class InterpolateColocationSummaryTest(test.TestCase):
         "One", result,
         "Node One should not appear in Four_with_three's summary:\n%s" % result)
 
+  @test_util.run_v1_only("b/120545219")
   def testNodeFiveHasColocationInterpolationForNodeOneAndTwo(self):
     message = "{{colocation_node Five_with_one_with_two}}"
     result = error_interpolation.interpolate(message, self.graph)
     self.assertIn("colocate_with(One)", result)
     self.assertIn("colocate_with(Two)", result)
 
+  @test_util.run_v1_only("b/120545219")
   def testColocationInterpolationForNodeLackingColocation(self):
     message = "{{colocation_node One}}"
     result = error_interpolation.interpolate(message, self.graph)
diff --git a/tensorflow/python/framework/file_system_test.py b/tensorflow/python/framework/file_system_test.py
index 6901715e5d0f40a4cd4c3ba2e2556892210ef8c3..8687bc5a7850b25f363d23451ffeb58a68b5d0ef 100644
--- a/tensorflow/python/framework/file_system_test.py
+++ b/tensorflow/python/framework/file_system_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import load_library
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -36,13 +37,14 @@ class FileSystemTest(test.TestCase):
                                        "test_file_system.so")
     load_library.load_file_system_library(file_system_library)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       reader = io_ops.WholeFileReader("test_reader")
       queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
       queue.enqueue_many([["test://foo"]]).run()
       queue.close().run()
-      key, value = sess.run(reader.read(queue))
+      key, value = self.evaluate(reader.read(queue))
     self.assertEqual(key, compat.as_bytes("test://foo"))
     self.assertEqual(value, compat.as_bytes("AAAAAAAAAA"))
 
diff --git a/tensorflow/python/framework/func_graph.py b/tensorflow/python/framework/func_graph.py
index c7a5d1ee201ac5e6a3fc6b2858b12bb309572829..bd4ed5553e7b0b2445344d5c36c2209e59d64d14 100644
--- a/tensorflow/python/framework/func_graph.py
+++ b/tensorflow/python/framework/func_graph.py
@@ -26,11 +26,13 @@ from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
 from tensorflow.python.eager.graph_only_ops import graph_placeholder
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework.auto_control_deps import AutomaticControlDependencies
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import custom_gradient
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
@@ -111,7 +113,7 @@ class FuncGraph(ops.Graph):
     # this stack from the default graph even in eager mode. Maybe it should be
     # part of the eager context? This would also allow us to remove a
     # get_default_graph() call from the function cache lookup.
-    self._distribution_strategy_stack = graph._distribution_strategy_stack
+    self._distribution_strategy_stack = list(graph._distribution_strategy_stack)
     # We ignore device placements from any outer scopes while tracing the
     # function when possible, to avoid hard-coding them in the function
     # graph. "Default" placements come from the PartitionedCallOp's placement,
@@ -120,7 +122,9 @@ class FuncGraph(ops.Graph):
     # restored.
     if context.executing_eagerly():
       self.seed = context.global_seed()
-      self._xla_compile = (context.context().device_spec.device_type == "TPU")
+      device_type = context.context().device_spec.device_type
+      self._xla_compile = (device_type == "TPU" or device_type == "XLA_GPU"
+                           or device_type == "XLA_CPU")
       if self._distribution_strategy_stack or self._xla_compile:
         self._add_device_to_stack(context.context().device_name)
     else:
@@ -151,6 +155,14 @@ class FuncGraph(ops.Graph):
     self._graph_key = graph._graph_key
     # pylint: enable=protected-access
 
+  @property
+  def output_types(self):
+    return [t.dtype for t in self.outputs]
+
+  @property
+  def output_shapes(self):
+    return [t.shape for t in self.outputs]
+
   @property
   def variables(self):
     """A list of variables accessed by this FuncGraph.
@@ -372,7 +384,7 @@ def func_graph_from_py_func(name,
         # captured Operations).
         with ops.control_dependencies([x]):
           x = array_ops.identity(op_return_value)
-      else:
+      elif not isinstance(x, tensor_array_ops.TensorArray):
         try:
           x = ops.convert_to_tensor_or_indexed_slices(x)
         except (ValueError, TypeError):
@@ -395,9 +407,9 @@ def func_graph_from_py_func(name,
           return autograph.converted_call(
               original_func, None,
               autograph.ConversionOptions(
-                  verbose=True,
+                  verbose=autograph.Verbosity.BRIEF,
                   recursive=True,
-                  strip_decorators=(function.defun, def_function.function),
+                  strip_decorators=(def_function.function,),
                   optional_features=(),
               ), *args, **kwargs)
 
@@ -408,7 +420,8 @@ def func_graph_from_py_func(name,
 
       func_outputs = python_func(*func_args, **func_kwargs)
 
-      # invariant: `func_outputs` contains only Tensors and `None`s.
+      # invariant: `func_outputs` contains only Tensors, IndexedSlices,
+      # SparseTensors, TensorArrays and `None`s.
       func_outputs = nest.map_structure(convert, func_outputs)
 
       check_mutation(func_args_before, func_args)
@@ -495,7 +508,17 @@ def check_mutation(n1, n2):
 
 
 def flatten(sequence):
-  """A wrapper around `nest.flatten` that also unpacks `IndexedSlices`."""
+  """Like `nest.flatten` but also unpacks other Tensor-like objects.
+
+  Flattens non-tensor objects into their constituent tensors.
+
+  Args:
+    sequence: A nested structure of Tensors, IndexedSlices, SparseTensors and
+      TensorArrays.
+
+  Returns:
+    A list of tensors.
+  """
   # TODO(akshayka): Support `SparseTensor` in a similar fashion.
   flat_sequence = nest.flatten(sequence)
   outputs = []
@@ -505,11 +528,58 @@ def flatten(sequence):
         outputs.extend([item.values, item.indices, item.dense_shape])
       else:
         outputs.extend([item.values, item.indices])
+    elif isinstance(item, sparse_tensor.SparseTensor):
+      outputs.extend([item.indices, item.values, item.dense_shape])
+    elif isinstance(item, tensor_array_ops.TensorArray):
+      outputs.append(item.flow)
     else:
       outputs.append(item)
   return outputs
 
 
+def pack_sequence_as(structure, flat_sequence):
+  """Like `nest.pack_sequence_as` but also packs other Tensor-like objects.
+
+  Args:
+    structure: The structure to pack into. May contain Tensors, IndexedSlices,
+      TensorArrays or SparseTensors.
+    flat_sequence: An iterable containing tensors.
+
+  Returns:
+    A nested structure.
+
+  Raises:
+    AssertionError if `structure` and `flat_sequence` are not compatible.
+  """
+  flattened_structure = nest.flatten(structure)
+  flat_sequence_with_slices_and_tas = []
+  index = 0
+  for t in flattened_structure:
+    if isinstance(t, ops.IndexedSlices):
+      if t.dense_shape is not None:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 3]))
+        index += 3
+      else:
+        flat_sequence_with_slices_and_tas.append(
+            ops.IndexedSlices(*flat_sequence[index:index + 2]))
+        index += 2
+    elif isinstance(t, sparse_tensor.SparseTensor):
+      flat_sequence_with_slices_and_tas.append(
+          sparse_tensor.SparseTensor(*flat_sequence[index:index + 3]))
+      index += 3
+    elif isinstance(t, tensor_array_ops.TensorArray):
+      flow = flat_sequence[index]
+      ta = tensor_array_ops.build_ta_with_new_flow(t, flow)
+      flat_sequence_with_slices_and_tas.append(ta)
+      index += 1
+    else:
+      flat_sequence_with_slices_and_tas.append(flat_sequence[index])
+      index += 1
+  assert len(flattened_structure) == len(flat_sequence_with_slices_and_tas)
+  return nest.pack_sequence_as(structure, flat_sequence_with_slices_and_tas)
+
+
 def _create_substitute_placeholder(value, name=None, dtype=None):
   """Creates a placeholder for `value` and propagates shape info to it."""
   # Note: setting ops.control_dependencies(None) ensures we always put
diff --git a/tensorflow/python/framework/function.py b/tensorflow/python/framework/function.py
index 230a55464145ffd9fb27b25af9bedad3c0163750..cfdc915a1b34930b8f5205550c547d0eec331e52 100644
--- a/tensorflow/python/framework/function.py
+++ b/tensorflow/python/framework/function.py
@@ -209,6 +209,7 @@ class _DefinedFunction(object):
                out_names=None,
                shape_func=None,
                capture_by_value=False,
+               whitelisted_stateful_ops=None,
                **kwargs):
     """Creates _DefinedFunction.
 
@@ -229,6 +230,8 @@ class _DefinedFunction(object):
         output shapes.
       capture_by_value: Boolean (defaults to False). If True, captured values
         will be copied into the function body.
+      whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+        copy into the function body, when `capture_by_value` is True.
       **kwargs: The keyword arguments. **kwargs is passed to every call
         site of this function.
 
@@ -244,6 +247,9 @@ class _DefinedFunction(object):
     self._out_names = out_names
     self._shape_func = shape_func
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
+    if self._whitelisted_stateful_ops is None:
+      self._whitelisted_stateful_ops = set()
     self._extra_kwargs = kwargs
     # Constructed only when C API is disabled, lazily
     self._definition = None
@@ -340,8 +346,13 @@ class _DefinedFunction(object):
       return
 
     temp_graph = func_graph_from_py_func(
-        self._func, self._arg_names, self._arg_types, self._func_name,
-        self._capture_by_value, self._caller_device)
+        self._func,
+        self._arg_names,
+        self._arg_types,
+        self._func_name,
+        self._capture_by_value,
+        self._caller_device,
+        whitelisted_stateful_ops=self._whitelisted_stateful_ops)
 
     self._extra_inputs = temp_graph.extra_inputs
     # pylint: disable=protected-access
@@ -625,9 +636,11 @@ class _FuncGraph(ops.Graph):
   function argument and the caller passes in the captured tensor.
   """
 
-  def __init__(self, name, capture_by_value, *args, **kwargs):
+  def __init__(self, name, capture_by_value, whitelisted_stateful_ops, *args,
+               **kwargs):
     super(_FuncGraph, self).__init__(*args, **kwargs)
     self._capture_by_value = capture_by_value
+    self._whitelisted_stateful_ops = whitelisted_stateful_ops
     self._building_function = True
     self._outer_graph = ops.get_default_graph()
     self._vscope = vs.get_variable_scope()
@@ -785,7 +798,7 @@ class _FuncGraph(ops.Graph):
     # pylint: disable=protected-access
     op_def = graph_to_function_def._get_op_def(op)
     # pylint: enable=protected-access
-    if op_def.is_stateful:
+    if op_def.is_stateful and op not in self._whitelisted_stateful_ops:
       raise ValueError("Cannot capture a stateful node (name:%s, type:%s) "
                        "by value." % (op.name, op.type))
     elif op.type in ("Placeholder", "PlaceholderV2"):
@@ -807,10 +820,17 @@ class _FuncGraph(ops.Graph):
     return captured_op
 
 
-def func_graph_from_py_func(func, arg_names, arg_types, name=None,
-                            capture_by_value=False, device=None,
-                            colocation_stack=None, container=None,
-                            collections_ref=None, arg_shapes=None):
+def func_graph_from_py_func(func,
+                            arg_names,
+                            arg_types,
+                            name=None,
+                            capture_by_value=False,
+                            device=None,
+                            colocation_stack=None,
+                            container=None,
+                            collections_ref=None,
+                            arg_shapes=None,
+                            whitelisted_stateful_ops=None):
   """Returns a _FuncGraph generated from `func`.
 
   Args:
@@ -828,6 +848,8 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
     collections_ref: A reference to a collections dict the _FuncGraph should
       use internally.
     arg_shapes: A sequence of the function's argument shapes.
+    whitelisted_stateful_ops: A set of ops that if stateful we ignore and
+      re-create.
 
   Returns:
     A _FuncGraph.
@@ -837,7 +859,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
   """
   if not name:
     name = function_utils.get_func_name(func)
-  func_graph = _FuncGraph(name, capture_by_value)
+  func_graph = _FuncGraph(name, capture_by_value, whitelisted_stateful_ops)
 
   with func_graph.as_default(), ops.device(device):
     # pylint: disable=protected-access
@@ -874,7 +896,7 @@ def func_graph_from_py_func(func, arg_names, arg_types, name=None,
       # If func only returned one value, make it a tuple.
       if not isinstance(outputs, (list, tuple)):
         outputs = (outputs,)
-      if any([_ is None for _ in outputs]):
+      if any(_ is None for _ in outputs):
         raise ValueError("Function %s can not return None." % name)
     # Ensures each output is a Tensor in the function graph.
     outputs = [ops.convert_to_tensor(t) for t in outputs]
@@ -1190,7 +1212,7 @@ def get_extra_args():
 
 
 def _type_list_to_str(types):
-  if any([_ not in _DTYPE_TO_STR for _ in types]):
+  if any(_ not in _DTYPE_TO_STR for _ in types):
     raise ValueError("Unsupported dtypes: %s" % types)
   return "".join([_DTYPE_TO_STR[_] for _ in types])
 
diff --git a/tensorflow/python/framework/function_def_to_graph_test.py b/tensorflow/python/framework/function_def_to_graph_test.py
index b2ef64f873055c00caac2834b5b058e5ca966e48..ddf1a6e74d2f7772c94dc5b39034a28ba0d715b2 100644
--- a/tensorflow/python/framework/function_def_to_graph_test.py
+++ b/tensorflow/python/framework/function_def_to_graph_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import test_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -52,6 +53,7 @@ class FunctionDefToGraphTest(test.TestCase):
     fdef.signature.name = "_whats_in_a_name"
     return fdef
 
+  @test_util.run_deprecated_v1
   def testInputsAndOutputs(self):
     fdef = self._build_function_def()
     g = function_def_to_graph.function_def_to_graph(fdef)
@@ -186,6 +188,7 @@ class FunctionDefToGraphDefTest(test.TestCase):
     self.assertEqual(g.node[0].attr["shape"].shape.unknown_rank, False)
     self.assertFalse("shape" in g.node[2].attr)
 
+  @test_util.run_deprecated_v1
   def testFunctionCallsFromFunction(self):
     x = constant_op.constant(5.0)
     y = constant_op.constant(10.0)
diff --git a/tensorflow/python/framework/function_test.py b/tensorflow/python/framework/function_test.py
index 13ee6c5d2d7bfb9898a491622b6002cfa78f1952..6ec71ba8e9053000629ce0cd0e020494adabfe2d 100644
--- a/tensorflow/python/framework/function_test.py
+++ b/tensorflow/python/framework/function_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import function
 from tensorflow.python.framework import graph_to_function_def
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.errors import InvalidArgumentError
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -102,8 +103,9 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
+  @test_util.run_deprecated_v1
   def testIdentityImplicitDeref(self):
 
     @function.Defun(dtypes.float32, func_name="MyIdentity")
@@ -116,8 +118,8 @@ class FunctionTest(test.TestCase):
       self.assertEqual("MyIdentity", call.op.name)
       for cfg in _OptimizerOptions():
         with session.Session(config=cfg) as sess:
-          sess.run(var.initializer)
-          self.assertAllEqual([18.0], sess.run(call))
+          self.evaluate(var.initializer)
+          self.assertAllEqual([18.0], self.evaluate(call))
 
   def testIdentityOutputName(self):
 
@@ -130,7 +132,7 @@ class FunctionTest(test.TestCase):
       call = MyIdentityFunc([18.0])
       self.assertEqual("MyIdentity", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([18.0], sess.run(call))
+        self.assertAllEqual([18.0], self.evaluate(call))
 
   def testTooManyOutputNames(self):
 
@@ -158,7 +160,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testFunctionWithNoOutput(self):
 
@@ -187,7 +189,7 @@ class FunctionTest(test.TestCase):
       call = APlus2B([1.0], [2.0])
       self.assertEqual("APlus2B", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([5.0], sess.run(call))
+        self.assertAllEqual([5.0], self.evaluate(call))
 
   def testDefineFunctionDuplicateOutputs(self):
 
@@ -224,8 +226,8 @@ class FunctionTest(test.TestCase):
       call_g = XSquarePlusOneGrad([2.0], [0.1])
 
       with session.Session() as sess:
-        self.assertAllClose([5.0], sess.run(call_f))
-        self.assertAllClose([0.4], sess.run(call_g))
+        self.assertAllClose([5.0], self.evaluate(call_f))
+        self.assertAllClose([0.4], self.evaluate(call_g))
 
   def testTanhSymGrad(self):
 
@@ -322,6 +324,7 @@ class FunctionTest(test.TestCase):
       self.assertEqual(x.get_shape(), dx.get_shape())
       self.assertEqual(y.get_shape(), dy.get_shape())
 
+  @test_util.run_deprecated_v1
   def testSymGradAttr(self):
 
     @function.Defun(noinline=True)
@@ -365,7 +368,7 @@ class FunctionTest(test.TestCase):
       else:
         dx, dy = gradients_impl.gradients([z], [x, y])
       with session.Session() as sess:
-        dx_val, dy_val = sess.run([dx, dy])
+        dx_val, dy_val = self.evaluate([dx, dy])
         self.assertEqual([2.0], dx_val)
         self.assertEqual([0.0], dy_val)
 
@@ -387,7 +390,7 @@ class FunctionTest(test.TestCase):
       call = AConstant()
       self.assertEqual("AConstant", call.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([42], sess.run(call))
+        self.assertAllEqual([42], self.evaluate(call))
 
   def testDefineFunctionNames(self):
 
@@ -438,6 +441,7 @@ class FunctionTest(test.TestCase):
                                    "assertion failed.*-3"):
         self.assertAllEqual(Foo(constant_op.constant(-3.0)).eval(), 6.0)
 
+  @test_util.run_deprecated_v1
   def testAssertWrapper(self):
 
     @function.Defun(dtypes.float32)
@@ -452,6 +456,7 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         _ = MyFn(100.0).eval()
 
+  @test_util.run_deprecated_v1
   def testWhileLoopCallsFunc(self):
     with self.session(use_gpu=True) as sess:
 
@@ -468,9 +473,10 @@ class FunctionTest(test.TestCase):
 
       loop = control_flow_ops.while_loop(lambda x: x < 1e5, Body, [1.0])
 
-      ans = sess.run(loop)
+      ans = self.evaluate(loop)
       self.assertAllClose(ans, 131072.)
 
+  @test_util.run_deprecated_v1
   def testControlFlowStrictness(self):
     """Inlined functions must not execute in a untaken control flow branch."""
 
@@ -517,6 +523,7 @@ class FunctionTest(test.TestCase):
                                    "assertion"):
         sess.run(loop, {pred: True, x: 3})
 
+  @test_util.run_deprecated_v1
   def testVar(self):
 
     @function.Defun(dtypes.float32)
@@ -532,6 +539,7 @@ class FunctionTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllEqual(z.eval(), 101.)
 
+  @test_util.run_deprecated_v1
   def testResourceVarAsImplicitInput(self):
     g = ops.Graph()
     with g.as_default(), ops.device("cpu:0"):
@@ -552,8 +560,8 @@ class FunctionTest(test.TestCase):
 
     with self.session(graph=g):
       v.initializer.run()
-      self.assertAllEqual(expected_val.eval(), actual_val.eval())
-      self.assertAllEqual(expected_shape, actual_shape.eval())
+      self.assertAllEqual(expected_val.eval(), self.evaluate(actual_val))
+      self.assertAllEqual(expected_shape, self.evaluate(actual_shape))
 
   def testDefineErrors(self):
     with ops.Graph().as_default():
@@ -650,8 +658,8 @@ class FunctionTest(test.TestCase):
       # pylint: enable=unexpected-keyword-arg
       self.assertEqual("next", call2.op.name)
       with session.Session() as sess:
-        self.assertAllEqual([1], sess.run(call1))
-        self.assertAllEqual([0], sess.run(call2))
+        self.assertAllEqual([1], self.evaluate(call1))
+        self.assertAllEqual([0], self.evaluate(call2))
 
   def testNestedFunction(self):
 
@@ -707,6 +715,7 @@ class FunctionTest(test.TestCase):
     gdef = g.as_graph_def()
     self.assertEqual(0, len(gdef.library.function))
 
+  @test_util.run_deprecated_v1
   def testReduction(self):
     g = ops.Graph()
 
@@ -735,6 +744,7 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(vals[0], vals[1])
       self.assertAllClose(vals[2], vals[3])
 
+  @test_util.run_deprecated_v1
   def testCapture(self):
     g = ops.Graph()
     with g.as_default():
@@ -781,6 +791,7 @@ class FunctionTest(test.TestCase):
         # NOTE: We still do not support capturing control deps.
         _ = Foo(x)
 
+  @test_util.run_deprecated_v1
   def testCaptureInWhileLoop(self):
     g = ops.Graph()
     with g.as_default():
@@ -794,8 +805,9 @@ class FunctionTest(test.TestCase):
       y = Foo()
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 10)
+      self.assertEqual(self.evaluate(y), 10)
 
+  @test_util.run_deprecated_v1
   def testCaptureInCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -809,8 +821,8 @@ class FunctionTest(test.TestCase):
       z = Foo(False)
 
     with self.session(graph=g) as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 2)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 2)
 
   def testStableName(self):
 
@@ -825,6 +837,7 @@ class FunctionTest(test.TestCase):
       self.assertEqual("Foo_aCYSbwBkR5A",
                        Foo.instantiate([dtypes.float32] * 3).name)
 
+  @test_util.run_deprecated_v1
   def testSignatureHash(self):
     # Foo.Inner and Bar.Inner have identical function body but have
     # different signatures. They should be treated as two different functions.
@@ -854,7 +867,7 @@ class FunctionTest(test.TestCase):
       z = Bar(x)
 
     with self.session(graph=g) as sess:
-      v0, v1 = sess.run([y, z])
+      v0, v1 = self.evaluate([y, z])
       self.assertAllEqual(v0, 20.)
       self.assertAllEqual(v1, 20.)
 
@@ -877,6 +890,7 @@ class FunctionTest(test.TestCase):
       y = Bar(array_ops.zeros([1, 2, 3]))
       self.assertAllEqual(y.get_shape().as_list(), [1, 1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testVariableReuse(self):
 
     def LinearWithReuse(input_tensor, reuse=None):
@@ -900,11 +914,12 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "linear/w:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output_val = sess.run(
           output_op, feed_dict={input_op: np.random.rand(32, 100)})
       self.assertEqual(output_val.shape, (32, 100))
 
+  @test_util.run_deprecated_v1
   def testFunctionCallInDifferentVariableScopes(self):
 
     @function.Defun(dtypes.float32)
@@ -928,7 +943,7 @@ class FunctionTest(test.TestCase):
     self.assertEqual(global_vars[0].name, "vs1/var:0")
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       out1, out2 = sess.run(
           [out1_op, out2_op], feed_dict={input_op: np.linspace(1, 10, 10)})
       self.assertAllEqual(out1, np.linspace(2, 11, 10))
@@ -968,6 +983,7 @@ class FunctionTest(test.TestCase):
       self.assertAllClose(
           np.array([1.0, 0.0]).astype(np.float32), sess.run(dinp, {inp: x}))
 
+  @test_util.run_deprecated_v1
   def testFunctionMarkedStateful(self):
 
     @function.Defun(dtypes.int32, dtypes.float32)
@@ -991,10 +1007,11 @@ class FunctionTest(test.TestCase):
     result_2 = Bar(constant_op.constant(100, dtype=dtypes.int64))
 
     with session.Session() as sess:
-      self.assertEqual(4.0, sess.run(result_1))
-      self.assertEqual(100, sess.run(result_2))
+      self.assertEqual(4.0, self.evaluate(result_1))
+      self.assertEqual(100, self.evaluate(result_2))
       self.assertEqual((4.0, 100), sess.run((result_1, result_2)))
 
+  @test_util.run_deprecated_v1
   def testStatefulFunction(self):
 
     @function.Defun()
@@ -1037,6 +1054,29 @@ class FunctionTest(test.TestCase):
         self.assertFalse(all(val3 == val1))
         self.assertFalse(all(val4 == val2))
 
+  def testStatefulFunctionWithWhitelisting(self):
+    t = random_ops.random_uniform([100], maxval=10, dtype=dtypes.int32)
+
+    @function.Defun(capture_by_value=True)
+    def StatefulFn():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    # First time we try to capture a stateful RandomUniform op.
+    with self.assertRaisesRegexp(ValueError, "Cannot capture a stateful node"):
+      res = StatefulFn()
+
+    # This time we whitelist this op, so that its recreated.
+    @function.Defun(capture_by_value=True, whitelisted_stateful_ops=set([t.op]))
+    def StatefulFn2():
+      return t + constant_op.constant(3, dtype=dtypes.int32)
+
+    res = StatefulFn2()
+    with session.Session() as sess:
+      r = sess.run(res)
+      for i in r:
+        self.assertGreaterEqual(i, 3)
+
+  @test_util.run_deprecated_v1
   def testSameFunctionOnTwoDevices(self):
 
     @function.Defun(dtypes.float32)
@@ -1052,10 +1092,11 @@ class FunctionTest(test.TestCase):
     for config in _OptimizerOptions():
       config.device_count["CPU"] = 2
       with session.Session(config=config) as sess:
-        self.assertEqual(42.0, sess.run(f_0))
-        self.assertEqual(44.0, sess.run(f_1))
+        self.assertEqual(42.0, self.evaluate(f_0))
+        self.assertEqual(44.0, self.evaluate(f_1))
         self.assertEqual((42.0, 44.0), sess.run((f_0, f_1)))
 
+  @test_util.run_deprecated_v1
   def testGuaranteedConstsAreCaptured(self):
     var = variables.Variable(1.0)
     const = array_ops.guarantee_const(var)
@@ -1076,9 +1117,10 @@ class FunctionTest(test.TestCase):
       return output
 
     with self.session(use_gpu=False) as sess:
-      sess.run(var.initializer)
+      self.evaluate(var.initializer)
       _ = sess.run(CapturesGuaranteedConst(), {also_not_const: 1.0})
 
+  @test_util.run_deprecated_v1
   def testSameFunctionDifferentGrads(self):
 
     def PartOne(x):
@@ -1127,7 +1169,7 @@ class FunctionTest(test.TestCase):
       dx2, = gradients_impl.gradients(ys=[y2], xs=[x2])
 
     with self.session(graph=g) as sess:
-      v0, v1, v2 = sess.run([dx0, dx1, dx2])
+      v0, v1, v2 = self.evaluate([dx0, dx1, dx2])
 
     self.assertAllEqual(v0, 2.)
     self.assertAllEqual(v1, 101.)
@@ -1150,6 +1192,7 @@ class FunctionsFromProtos(test.TestCase):
     self.assertEqual(func.declared_input_types, new_func.declared_input_types)
     self.assertEqual(func.captured_inputs, new_func.captured_inputs)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
 
     @function.Defun(dtypes.float32, dtypes.float32)
@@ -1359,6 +1402,7 @@ class FunctionsFromProtos(test.TestCase):
 
 class FunctionOverloadTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
 
     @function.Defun()
@@ -1411,6 +1455,7 @@ class FunctionOverloadTest(test.TestCase):
 
 class FunctionCaptureByValueTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCaptureByValue(self):
     g = ops.Graph()
     with g.as_default():
@@ -1532,7 +1577,7 @@ class UnrollLSTMTest(test.TestCase):
       tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                       len(str(gdef)), len(gdef.SerializeToString()))
       with g.as_default(), session.Session(config=cfg) as sess:
-        return sess.run(m)
+        return self.evaluate(m)
 
     mv0 = RunForward("complete")
     for cfg in _OptimizerOptions():
@@ -1561,7 +1606,7 @@ class UnrollLSTMTest(test.TestCase):
       tf_logging.info("time: %f txt size: %d gdef bin size: %d", finish - start,
                       len(str(gdef)), len(gdef.SerializeToString()))
       with g.as_default(), session.Session(config=cfg) as sess:
-        return sess.run(dw)
+        return self.evaluate(dw)
 
     d0 = RunForwardBackward("complete")
     for cfg in _OptimizerOptions():
@@ -1634,6 +1679,7 @@ class FunctionInlineControlTest(test.TestCase):
 
 class ModuleFunctionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
 
     @function.Defun(*[dtypes.float32] * 3)
@@ -1651,8 +1697,8 @@ class ModuleFunctionTest(test.TestCase):
       y = LinearWithCApi(a, b, c)
       z = Linear2WithCApi(a, b, c, d, e)
       with session.Session() as sess:
-        self.assertAllEqual([[1]], sess.run(y))
-        self.assertAllEqual([[5]], sess.run(z))
+        self.assertAllEqual([[1]], self.evaluate(y))
+        self.assertAllEqual([[5]], self.evaluate(z))
 
 
 class VariableHoistingTest(test.TestCase):
@@ -1704,8 +1750,8 @@ class VariableHoistingTest(test.TestCase):
     self.assertEqual("Foo/b", b.op.name)
 
     with self.session(graph=g) as sess:
-      sess.run(variables.global_variables_initializer())
-      w, b, x, y0, loss, dw, db = sess.run([w, b, x, y0, loss, dw, db])
+      self.evaluate(variables.global_variables_initializer())
+      w, b, x, y0, loss, dw, db = self.evaluate([w, b, x, y0, loss, dw, db])
 
     self.assertAllEqual(w.shape, (64, 64))
     self.assertAllClose(np.sum(w), 2050.44)
@@ -1717,10 +1763,12 @@ class VariableHoistingTest(test.TestCase):
     self.assertAllEqual(db.shape, (64,))
     self.assertAllClose(np.sum(db), 0.509, rtol=1e-2)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testSimpleModel(True)
     self._testSimpleModel(False)
 
+  @test_util.run_deprecated_v1
   def testBasicResource(self):
     self._testSimpleModel(True, use_resource=True)
     self._testSimpleModel(False, use_resource=True)
diff --git a/tensorflow/python/framework/graph_util_test.py b/tensorflow/python/framework/graph_util_test.py
index 563a177dd06b3b165335c91c3a92ff8877609efc..dd26b8a78e9d2e13b34770775fcb1219745396e0 100644
--- a/tensorflow/python/framework/graph_util_test.py
+++ b/tensorflow/python/framework/graph_util_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops as math_ops_lib
@@ -102,6 +103,7 @@ class DeviceFunctionsTest(test.TestCase):
     self.assertDeviceEqual(var_5.device, "/device:GPU:0")
     self.assertDeviceEqual(var_6.device, "/device:CPU:0")
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedDeviceFunctions(self):
     with ops.Graph().as_default():
       var_0 = variables.VariableV1(0)
@@ -210,8 +212,8 @@ class DeviceFunctionsTest(test.TestCase):
 
       with session.Session() as sess:
         init = variables.variables_initializer([variable_node])
-        sess.run(init)
-        output = sess.run(output_node)
+        self.evaluate(init)
+        output = self.evaluate(output_node)
         self.assertNear(4.0, output, 0.00001)
         variable_graph_def = sess.graph.as_graph_def()
 
@@ -242,8 +244,8 @@ class DeviceFunctionsTest(test.TestCase):
         output_node = math_ops_lib.multiply(
             variable_node, 2.0, name="output_node")
         with session.Session() as sess:
-          sess.run(variable_node.initializer)
-          output = sess.run(output_node)
+          self.evaluate(variable_node.initializer)
+          output = self.evaluate(output_node)
           self.assertNear(2.0, output, 0.00001)
           variable_graph_def = sess.graph.as_graph_def()
           # First get the constant_graph_def when variable_names_whitelist is
@@ -256,7 +258,7 @@ class DeviceFunctionsTest(test.TestCase):
 
           # Then initialize the unused variable, and get another
           # constant_graph_def when variable_names_whitelist is not set.
-          sess.run(another_variable.initializer)
+          self.evaluate(another_variable.initializer)
           constant_graph_def_without_variable_whitelist = (
               graph_util.convert_variables_to_constants(
                   sess, variable_graph_def, ["output_node"]))
@@ -295,7 +297,7 @@ class DeviceFunctionsTest(test.TestCase):
             ["Variable", "VariableV2", "VarHandleOp", "ReadVariableOp"])
       with session.Session() as sess:
         output_node = sess.graph.get_tensor_by_name("output_node:0")
-        output = sess.run(output_node)
+        output = self.evaluate(output_node)
         self.assertNear(2.0, output, 0.00001)
 
   def create_node_def(self, op, name, inputs):
diff --git a/tensorflow/python/framework/importer.py b/tensorflow/python/framework/importer.py
index c9ac27e788709da5fc5533062694f3b680de9853..98c7aeccc4b19edfc433a6556108ef8b77d12aa4 100644
--- a/tensorflow/python/framework/importer.py
+++ b/tensorflow/python/framework/importer.py
@@ -21,6 +21,7 @@ import contextlib
 
 from tensorflow.core.framework import graph_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python import tf2
 from tensorflow.python.framework import c_api_util
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import errors
@@ -253,7 +254,9 @@ def _ProcessNewOps(graph):
     # Find any device in the list of colocated ops that have a device, if it
     # exists.  We assume that if multiple ops have devices, they refer to the
     # same device.  Otherwise, a runtime error will occur since the colocation
-    # property cannot be guaranteed.
+    # property cannot be guaranteed.  Note in TF2 colocations have been removed
+    # from the public API and will be considered a hint, so there is no runtime
+    # error.
     #
     # One possible improvement is to try to check for compatibility of all
     # devices in this list at import time here, which would require
@@ -262,6 +265,10 @@ def _ProcessNewOps(graph):
       try:
         coloc_op = graph._get_operation_by_name_unsafe(coloc_op_name)  # pylint: disable=protected-access
       except KeyError:
+        # Do not error in TF2 if the colocation cannot be guaranteed
+        if tf2.enabled():
+          continue
+
         raise ValueError('Specified colocation to an op that '
                          'does not exist during import: %s in %s' %
                          (coloc_op_name, op.name))
@@ -431,17 +438,16 @@ def import_graph_def(graph_def,
     #
     # TODO(skyewm): fetch the TF_Functions directly from the TF_Graph
     # TODO(skyewm): avoid sending serialized FunctionDefs back to the TF_Graph
-    # TODO(b/74620627): move this after _ProcessNewOps outside the lock once
-    # _USE_C_SHAPES is removed.
-    if graph_def.library and graph_def.library.function:
-      # pylint: disable=protected-access
-      functions = function._from_library(graph_def.library)
-      for f in functions:
-        f.add_to_graph(graph)
-      # pylint: enable=protected-access
 
     _ProcessNewOps(graph)
 
+  if graph_def.library and graph_def.library.function:
+    # pylint: disable=protected-access
+    functions = function._from_library(graph_def.library)
+    for f in functions:
+      f.add_to_graph(graph)
+    # pylint: enable=protected-access
+
   # Treat input mappings that don't appear in the graph as an error, because
   # they are likely to be due to a typo.
   missing_unused_input_keys = (
diff --git a/tensorflow/python/framework/importer_test.py b/tensorflow/python/framework/importer_test.py
index 2b4d8e7299559b689763e18f204556890a412410..66e80b558523bcab64a1a509aae60d5b9e679e40 100644
--- a/tensorflow/python/framework/importer_test.py
+++ b/tensorflow/python/framework/importer_test.py
@@ -397,11 +397,11 @@ class ImportGraphDefTest(test.TestCase):
       # Run the imported graph.
       # TODO(b/76173421): make this work (currently DCHECKS)
       # with self.cached_session() as sess:
-      #   sess.run(imported_init)
-      #   self.assertEqual(sess.run(imported_var), 1.0)
-      #   self.assertEqual(sess.run(imported_assign), 2.0)
-      #   self.assertEqual(list(sess.run(imported_shape)), [])
-      #   self.assertEqual(list(sess.run(new_var_shape)), [])
+      #   self.evaluate(imported_init)
+      #   self.assertEqual(self.evaluate(imported_var), 1.0)
+      #   self.assertEqual(self.evaluate(imported_assign), 2.0)
+      #   self.assertEqual(list(self.evaluate(imported_shape)), [])
+      #   self.assertEqual(list(self.evaluate(new_var_shape)), [])
 
   def testWhileLoop(self):
     # Produce GraphDef containing while loop.
@@ -418,7 +418,7 @@ class ImportGraphDefTest(test.TestCase):
                                               return_elements=[r.name])
       self.assertEqual(imported_r.name, "import/" + r.name)
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(imported_r), 10)
+        self.assertEqual(self.evaluate(imported_r), 10)
 
   def testImportWhileLoopInCond(self):
     # Produce GraphDef containing while loop.
@@ -458,7 +458,7 @@ class ImportGraphDefTest(test.TestCase):
           lambda i: i < 2, ImportFn, [0],
           shape_invariants=[tensor_shape.TensorShape(None)])
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(out), 10)
+        self.assertEqual(self.evaluate(out), 10)
 
   def testTypeMismatchInGraphDef(self):
     # TODO(skyewm): improve error message
@@ -930,7 +930,7 @@ class ImportGraphDefTest(test.TestCase):
           name="",
           return_elements=["id:0"])
       with self.cached_session():
-        self.assertEqual(5.0, t.eval())
+        self.assertEqual(5.0, self.evaluate(t))
 
   def testInvalidInputForReturnOperations(self):
     with ops.Graph().as_default():
@@ -1071,7 +1071,7 @@ class ImportGraphDefTest(test.TestCase):
       tensor_input = np.ones(input_shape, dtype=np.float32)
       t = constant_op.constant(tensor_input, shape=input_shape)
       g = array_ops.identity(t)
-      g.eval()
+      self.evaluate(g)
 
   def testVersion(self):
     v0 = versions.GRAPH_DEF_VERSION_MIN_CONSUMER
@@ -1255,7 +1255,7 @@ class ImportGraphDefTest(test.TestCase):
     z = TestFunc()
 
     with self.cached_session():
-      z_val = z.eval()
+      z_val = self.evaluate(z)
       self.assertEqual(z_val, -2.0)
 
   def testImportGraphWithFunctionTwice(self):
diff --git a/tensorflow/python/framework/load_library.py b/tensorflow/python/framework/load_library.py
index 908a5f521e15690dee0683ee25dea86e43b5f1f0..727f6aa44c2ed11414e805eb635a9adbc5519da6 100644
--- a/tensorflow/python/framework/load_library.py
+++ b/tensorflow/python/framework/load_library.py
@@ -31,6 +31,7 @@ from tensorflow.core.lib.core import error_codes_pb2  # pylint: disable=unused-i
 from tensorflow.python import pywrap_tensorflow as py_tf
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.util import compat
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -83,7 +84,8 @@ def load_op_library(library_filename):
   return module
 
 
-@tf_export('load_file_system_library')
+@deprecation.deprecated(date=None, instructions='Use tf.load_library instead.')
+@tf_export(v1=['load_file_system_library'])
 def load_file_system_library(library_filename):
   """Loads a TensorFlow plugin, containing file system implementation.
 
diff --git a/tensorflow/python/framework/meta_graph_test.py b/tensorflow/python/framework/meta_graph_test.py
index fc98b91a016cf40b32607320bb2ebb65cc7d6a63..e6e87881649729ca65db8cba9914e29b5a0d064e 100644
--- a/tensorflow/python/framework/meta_graph_test.py
+++ b/tensorflow/python/framework/meta_graph_test.py
@@ -63,6 +63,7 @@ def _TestDir(test_name):
 
 class SimpleMetaGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoVariables(self):
     test_dir = _TestDir("no_variables")
     filename = os.path.join(test_dir, "metafile")
@@ -116,6 +117,7 @@ class SimpleMetaGraphTest(test.TestCase):
                                   {new_input_tensor: input_feed_value})
       self.assertEqual(new_output_value, output_value)
 
+  @test_util.run_deprecated_v1
   def testStrippedOpListNestedFunctions(self):
     with self.cached_session():
       # Square two levels deep
@@ -158,6 +160,7 @@ class SimpleMetaGraphTest(test.TestCase):
     op_list = meta_graph.stripped_op_list_for_graph(graph)
     self.assertEqual(["Const"], [op.name for op in op_list.op])
 
+  @test_util.run_deprecated_v1
   def testDefaultAttrStripping(self):
     """Verifies that default attributes are stripped from a graph def."""
 
@@ -210,6 +213,7 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertEqual(node_def.attr["Tout"].type, dtypes.complex128)
       self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
 
+  @test_util.run_deprecated_v1
   def testDefaultAttrStrippingNestedFunctions(self):
     """Verifies that default attributes are stripped from function node defs."""
     with self.cached_session():
@@ -261,6 +265,7 @@ class SimpleMetaGraphTest(test.TestCase):
       self.assertEqual(node_def.attr["attr_1"].i, 1)
       self.assertTrue(meta_graph_def.meta_info_def.stripped_default_attrs)
 
+  @test_util.run_deprecated_v1
   def testVariableObjectsAreSharedAmongCollections(self):
     with ops.Graph().as_default() as graph1:
       v = variables.Variable(3.0)
@@ -454,6 +459,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
   # Verifies that we can export the subgraph under each layer and import
   # them into new layers in a new graph.
+  @test_util.run_deprecated_v1
   def testScopedExportAndImport(self):
     test_dir = _TestDir("scoped_export_import")
     filenames = [
@@ -492,8 +498,8 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
       grad = gradients_impl.gradients([output], [var])
       with session.Session() as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph with an import scope.
     with ops.Graph().as_default():
@@ -518,10 +524,11 @@ class ScopedMetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session() as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
+  @test_util.run_v1_only("b/120545219")
   def testImportWhileLoopInWhileLoop(self):
     # Create a simple while loop.
     with ops.Graph().as_default():
@@ -544,9 +551,10 @@ class ScopedMetaGraphTest(test.TestCase):
       _, x = control_flow_ops.while_loop(lambda i, x: i < 2, body, [0, 0.0],
                                          name="")
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
-        sess.run(x)
+        self.evaluate(variables.global_variables_initializer())
+        self.evaluate(x)
 
+  @test_util.run_deprecated_v1
   def testScopedImportUnderNameScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -562,6 +570,7 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "foo/bar/myvar:0")
 
+  @test_util.run_deprecated_v1
   def testScopedImportUnderNameScopeNoVarScope(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -590,6 +599,7 @@ class ScopedMetaGraphTest(test.TestCase):
         self.assertEqual(list(imported_variables.values())[0].name,
                          "s" + suffix + "/v:0")
 
+  @test_util.run_deprecated_v1
   def testScopedImportWithSelectedCollections(self):
     meta_graph_filename = os.path.join(
         _TestDir("selected_collections_import"), "meta_graph.pb")
@@ -600,11 +610,11 @@ class ScopedMetaGraphTest(test.TestCase):
     with graph.as_default():
       variables.Variable(initial_value=1.0, trainable=True)
     self.assertTrue(
-        all([
+        all(
             graph.get_collection(key)
             for key in
             [ops.GraphKeys.GLOBAL_VARIABLES, ops.GraphKeys.TRAINABLE_VARIABLES]
-        ]))
+        ))
     meta_graph.export_scoped_meta_graph(
         filename=meta_graph_filename, graph=graph)
 
@@ -687,6 +697,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
   # Verifies that we can export the subgraph containing a FIFOQueue under
   # "queue1" and import it into "new_queue1" in a new graph.
+  @test_util.run_deprecated_v1
   def testScopedWithQueue(self):
     test_dir = _TestDir("scoped_with_queue")
     orig_meta_graph = self._testScopedExportWithQueue(test_dir,
@@ -749,12 +760,15 @@ class ScopedMetaGraphTest(test.TestCase):
     for n, e in zip(nodes, expected):
       self.assertEqual([e], graph2.get_operation_by_name(n).get_attr("_class"))
 
+  @test_util.run_deprecated_v1
   def testExportNestedNames(self):
     self.doTestExportNestedNames(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testExportNestedNamesResource(self):
     self.doTestExportNestedNames(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testPotentialCycle(self):
     graph1 = ops.Graph()
     with graph1.as_default():
@@ -783,6 +797,7 @@ class ScopedMetaGraphTest(test.TestCase):
                   4.0, shape=[2, 2])
           })
 
+  @test_util.run_deprecated_v1
   def testClearDevices(self):
     graph1 = ops.Graph()
     with graph1.as_default():
@@ -842,6 +857,7 @@ class ScopedMetaGraphTest(test.TestCase):
 
 class MetaGraphWithVariableScopeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
 
     def _enqueue_vector(sess, queue, values, shape=None):
@@ -868,8 +884,8 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
       _, update_op = metrics.mean(values)
 
       initializer = variables.local_variables_initializer()
-      sess.run(initializer)
-      sess.run(update_op)
+      self.evaluate(initializer)
+      self.evaluate(update_op)
 
     meta_graph.export_scoped_meta_graph(
         filename=meta_graph_filename, graph=graph)
@@ -880,7 +896,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
     with self.session(graph=graph) as sess:
       meta_graph.import_scoped_meta_graph(meta_graph_filename)
       initializer = variables.local_variables_initializer()
-      sess.run(initializer)
+      self.evaluate(initializer)
 
     # Verifies that importing an old meta_graph where "local_variables"
     # collection is of node_list type works, but cannot build initializer
@@ -899,6 +915,7 @@ class MetaGraphWithVariableScopeTest(test.TestCase):
 
 class ExportImportAcrossScopesTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testPartionedVariables(self):
 
     def make_graph_with_partitioned_variables(use_resource):
diff --git a/tensorflow/python/framework/op_def_library.py b/tensorflow/python/framework/op_def_library.py
index 9955a9a2cdd276d9902c4b56a8340ae57f280ac1..2318b32ef10d67c48950061d2c489f6c7dfb20a0 100644
--- a/tensorflow/python/framework/op_def_library.py
+++ b/tensorflow/python/framework/op_def_library.py
@@ -570,7 +570,7 @@ class OpDefLibrary(object):
                   "than minimum length %d." %
                   (input_name, op_type_name, len(values), num_attr.minimum))
           # All tensors must have the same base type.
-          if any([bt != base_types[0] for bt in base_types]):
+          if any(bt != base_types[0] for bt in base_types):
             raise TypeError(
                 "All tensors passed to '%s' of '%s' Op "
                 "must have the same type." %
diff --git a/tensorflow/python/framework/ops.py b/tensorflow/python/framework/ops.py
index aaa12bf71ffe46a2643f96290625b79f1dab12cf..fa306936d653b233bba3b54d4f9a03ea202684e6 100644
--- a/tensorflow/python/framework/ops.py
+++ b/tensorflow/python/framework/ops.py
@@ -36,14 +36,13 @@ from tensorflow.core.framework import op_def_pb2
 from tensorflow.core.framework import versions_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as c_api
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import core
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import c_api_util
-from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import error_interpolation
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import op_def_registry
 from tensorflow.python.framework import registry
@@ -318,22 +317,13 @@ class Tensor(_TensorLike):
     self._op = op
     self._value_index = value_index
     self._dtype = dtypes.as_dtype(dtype)
-
     # This will be set by self._as_tf_output().
     self._tf_output = None
-
     # This will be set by self.shape().
     self._shape_val = None
-
     # List of operations that use this Tensor as input.  We maintain this list
     # to easily navigate a computation graph.
     self._consumers = []
-
-    if not _USE_C_SHAPES:
-      # Attributes used for C++ shape inference. Not inspected, only forwarded.
-      # If set, will be a HandleData object from cpp_shape_inference.proto.
-      self._handle_data = None
-
     self._id = uid()
 
   @property
@@ -408,17 +398,7 @@ class Tensor(_TensorLike):
 
     """
     if self._shape_val is None:
-      if _USE_C_SHAPES:
-        self._shape_val = self._c_api_shape()
-      else:
-        # Call set_shape_and_handle_data_for_outputs in topological order on all
-        # ops that are needed to compute self.op's shape. We do this instead of
-        # having set_shape_and_handle_data_for_outputs recursively call
-        # Operation.shape on self.op.inputs to overflowing the call stack.
-        need_shapes = self._get_input_ops_without_shapes(self.op)
-        need_shapes.sort(key=lambda op: op._id)
-        for op in need_shapes:
-          set_shape_and_handle_data_for_outputs(op)
+      self._shape_val = self._c_api_shape()
     return self._shape_val
 
   def _get_input_ops_without_shapes(self, target_op):
@@ -533,14 +513,10 @@ class Tensor(_TensorLike):
       ValueError: If `shape` is not compatible with the current shape of
         this tensor.
     """
-    if _USE_C_SHAPES:  # pylint: disable=protected-access
-      # Reset cached shape.
-      self._shape_val = None
-    else:
-      self._shape_val = self.shape.merge_with(shape)
+    # Reset cached shape.
+    self._shape_val = None
 
-    # Update C shape even if _USE_C_SHAPES = False, since we still want
-    # set_shape to be reflected in the C API graph for when we run it.
+    # We want set_shape to be reflected in the C API graph for when we run it.
     if not isinstance(shape, tensor_shape.TensorShape):
       shape = tensor_shape.TensorShape(shape)
     dim_list = []
@@ -634,10 +610,7 @@ class Tensor(_TensorLike):
     return id(self) == id(other)
 
   def __copy__(self):
-    # Make sure _shape_val is computed before we copy.
     # TODO(b/77597810): get rid of Tensor copies.
-    if self._shape_val is None:
-      set_shape_and_handle_data_for_outputs(self.op)
     cls = self.__class__
     result = cls.__new__(cls)
     result.__dict__.update(self.__dict__)
@@ -774,6 +747,18 @@ class _EagerTensorBase(Tensor):
   def _numpy(self):
     raise NotImplementedError()
 
+  @property
+  def backing_device(self):
+    """Returns the name of the device holding this tensor's memory.
+
+    `.backing_device` is usually the same as `.device`, which returns
+    the device on which the kernel of the operation that produced this tensor
+    ran. However, some operations can produce tensors on a different device
+    (e.g., an operation that executes on the GPU but produces output tensors
+    in host memory).
+    """
+    raise NotImplementedError()
+
   def __copy__(self):
     # Eager Tensors are immutable so it's safe to return themselves as a copy.
     return self
@@ -890,6 +875,12 @@ class _EagerTensorBase(Tensor):
     """Returns the number of Tensor dimensions."""
     return self.shape.ndims
 
+  def __len__(self):
+    """Returns the length of the first dimension in the Tensor."""
+    if not self.shape.ndims:
+      raise TypeError("Scalar tensor has no `len()`")
+    return self._shape_tuple()[0]
+
   def _cpu_nograd(self):
     """A copy of this Tensor with contents backed by host memory.
 
@@ -918,13 +909,7 @@ class _EagerTensorBase(Tensor):
     return self._copy(context.context(), "GPU:" + str(gpu_index))
 
   def __bool__(self):
-    if self._shape_tuple() != ():  # pylint: disable=g-explicit-bool-comparison
-      raise ValueError(
-          "Non-scalar tensor %s cannot be converted to boolean." % repr(self))
-    if self.dtype != dtypes.bool:
-      raise ValueError(
-          "Non-boolean tensor %s cannot be converted to boolean." % repr(self))
-    return bool(self.cpu().numpy())
+    return bool(self.numpy())
 
   def __nonzero__(self):
     return self.__bool__()
@@ -1044,12 +1029,12 @@ def convert_to_tensor(value, dtype=None, name=None, preferred_dtype=None):
       `preferred_dtype` is not possible, this argument has no effect.
 
   Returns:
-    An `Output` based on `value`.
+    An `Tensor` based on `value`.
 
   Raises:
-    TypeError: If no conversion function is registered for `value`.
+    TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
-
+    ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
   return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
 
@@ -1097,12 +1082,12 @@ def convert_to_tensor_v2(value, dtype=None, dtype_hint=None, name=None):
     name: Optional name to use if a new `Tensor` is created.
 
   Returns:
-    An `Output` based on `value`.
+    An `Tensor` based on `value`.
 
   Raises:
-    TypeError: If no conversion function is registered for `value`.
+    TypeError: If no conversion function is registered for `value` to `dtype`.
     RuntimeError: If a registered conversion function returns an invalid value.
-
+    ValueError: If the `value` is a tensor not of given `dtype` in graph mode.
   """
   return internal_convert_to_tensor(
       value=value,
@@ -1123,49 +1108,13 @@ def internal_convert_to_tensor(value,
                                preferred_dtype=None,
                                ctx=None,
                                accept_symbolic_tensors=True):
-  """Converts the given `value` to an `Tensor`.
-
-  This function converts Python objects of various types to `Tensor`
-  objects. It accepts `Tensor` objects, numpy arrays, Python lists,
-  and Python scalars. For example:
-
-  This function can be useful when composing a new operation in Python
-  All standard Python op constructors apply this function to each of their
-  Tensor-valued inputs, which allows those ops to accept numpy arrays, Python
-  lists, and scalars in addition to `Tensor` objects.
-
-  Args:
-    value: An object whose type has a registered `Tensor` conversion function.
-    dtype: Optional element type for the returned tensor. If missing, the
-      type is inferred from the type of `value`.
-    name: Optional name to use if a new `Tensor` is created.
-    as_ref: True if we want the mutable view of Variables, if applicable.
-    preferred_dtype: Optional element type for the returned tensor,
-      used when dtype is None. In some cases, a caller may not have a
-      dtype in mind when converting to a tensor, so preferred_dtype
-      can be used as a soft preference.  If the conversion to
-      `preferred_dtype` is not possible, this argument has no effect.
-    ctx: Optional: The value of context.context().
-    accept_symbolic_tensors: Whether Keras graph tensors should be accepted as
-      a valid tensor type during eager execution.
-      If False, this function will raise an exception if it is passed such
-      a tensor during eager eager execution.
-
-  Returns:
-    A `Tensor` based on `value`.
-
-  Raises:
-    TypeError: If no conversion function is registered for `value`.
-    RuntimeError: If a registered conversion function returns an invalid value.
-
-  """
+  """Implementation of the public convert_to_tensor."""
   if ctx is None: ctx = context.context()
   if isinstance(value, EagerTensor):
     if ctx.executing_eagerly():
-      # Fast path for EagerTensors that don't need any conversion.
-      # Note that we don't check that value's dtype matches the dtype
-      # argument.  We expect that the C runtime will do that checking
-      # when we execute the kernel.
+      if dtype is not None:
+        dtype = dtypes.as_dtype(dtype)
+        value = _TensorTensorConversionFunction(value, dtype=dtype)
       return value
     else:
       graph = get_default_graph()
@@ -2129,12 +2078,6 @@ class Operation(object):
       raise TypeError("tensor must be a Tensor: %s" % tensor)
     _assert_same_graph(self, tensor)
 
-    # Make sure output shapes are already computed for this op in case we create
-    # a cycle (we cannot compute shapes for cycles). Usually shapes are computed
-    # lazily upon request.
-    if not _USE_C_SHAPES:
-      set_shape_and_handle_data_for_outputs(self)
-
     # Reset cached inputs.
     self._inputs_val = None
     c_api.UpdateEdge(
@@ -2142,6 +2085,31 @@ class Operation(object):
         tensor._as_tf_output(),  # pylint: disable=protected-access
         self._tf_input(index))
 
+  def _add_while_inputs(self, tensors):
+    """See AddWhileInputHack in python_api.h.
+
+    NOTE: This is for TF internal use only. Please don't use it.
+
+    Args:
+      tensors: list of Tensors
+
+    Raises:
+      TypeError: if tensor is not a Tensor,
+        or if input tensor type is not convertible to dtype.
+      ValueError: if the Tensor is from a different graph.
+    """
+    for tensor in tensors:
+      if not isinstance(tensor, Tensor):
+        raise TypeError("tensor must be a Tensor: %s" % tensor)
+      _assert_same_graph(self, tensor)
+
+      # Reset cached inputs.
+      self._inputs_val = None
+      c_api.AddWhileInputHack(
+          self._graph._c_graph,  # pylint: disable=protected-access
+          tensor._as_tf_output(),  # pylint: disable=protected-access
+          self._c_op)
+
   def _add_control_inputs(self, ops):
     """Add a list of new control inputs to this operation.
 
@@ -2175,6 +2143,23 @@ class Operation(object):
     """Removes any control inputs to this operation."""
     c_api.RemoveAllControlInputs(self._graph._c_graph, self._c_op)  # pylint: disable=protected-access
 
+  def _add_outputs(self, types, shapes):
+    """Adds new Tensors to self.outputs.
+
+    Note: this is generally unsafe to use. This is used in certain situations in
+    conjunction with _set_type_list_attr.
+
+    Arguments:
+      types: list of DTypes
+      shapes: list of TensorShapes
+    """
+    assert len(types) == len(shapes)
+    orig_num_outputs = len(self.outputs)
+    for i in range(len(types)):
+      t = Tensor(self, orig_num_outputs + i, types[i])
+      self._outputs.append(t)
+      t.set_shape(shapes[i])
+
   def __str__(self):
     return str(self.node_def)
 
@@ -2387,6 +2372,25 @@ class Operation(object):
     finally:
       c_api.TF_DeleteBuffer(buf)
 
+  def _set_func_attr(self, attr_name, func_name):
+    """Private method used to set a function attribute in the node_def."""
+    func = attr_value_pb2.NameAttrList(name=func_name)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(func=func))
+
+  def _set_type_list_attr(self, attr_name, types):
+    """Private method used to set a function attribute in the node_def."""
+    if not types: return
+    if isinstance(types[0], dtypes.DType):
+      types = [dt.as_datatype_enum for dt in types]
+    types_list = attr_value_pb2.AttrValue.ListValue(type=types)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(list=types_list))
+
+  def _set_shape_list_attr(self, attr_name, shapes):
+    """Private method used to set a function attribute in the node_def."""
+    shapes = [s.as_proto() for s in shapes]
+    shapes_list = attr_value_pb2.AttrValue.ListValue(shape=shapes)
+    self._set_attr(attr_name, attr_value_pb2.AttrValue(list=shapes_list))
+
   def get_attr(self, name):
     """Returns the value of the attr of this op with the given `name`.
 
@@ -2399,7 +2403,7 @@ class Operation(object):
     Raises:
       ValueError: If this op does not have an attr with the given `name`.
     """
-    fields = ["s", "i", "f", "b", "type", "shape", "tensor", "func"]
+    fields = ("s", "i", "f", "b", "type", "shape", "tensor", "func")
     try:
       with c_api_util.tf_buffer() as buf:
         c_api.TF_OperationGetAttrValueProto(self._c_op, name, buf)
@@ -2410,25 +2414,21 @@ class Operation(object):
     x = attr_value_pb2.AttrValue()
     x.ParseFromString(data)
 
-    # Treat an empty oneof value as an empty list.
-    if not x.WhichOneof("value"):
+    oneof_value = x.WhichOneof("value")
+    if oneof_value is None:
       return []
-    if x.HasField("list"):
+    if oneof_value == "list":
       for f in fields:
         if getattr(x.list, f):
           if f == "type":
-            return [dtypes.as_dtype(x) for x in list(getattr(x.list, f))]
+            return [dtypes.as_dtype(t) for t in x.list.type]
           else:
             return list(getattr(x.list, f))
       return []
-    else:
-      for f in fields:
-        if x.HasField(f):
-          if f == "type":
-            return dtypes.as_dtype(getattr(x, f))
-          else:
-            return getattr(x, f)
-      assert False, "Unsupported field type in " + str(x)
+    if oneof_value == "type":
+      return dtypes.as_dtype(x.type)
+    assert oneof_value in fields, "Unsupported field type in " + str(x)
+    return getattr(x, oneof_value)
 
   def run(self, feed_dict=None, session=None):
     """Runs this operation in a `Session`.
@@ -2608,72 +2608,9 @@ class RegisterShape(object):
     return f
 
 
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def _set_shape_and_handle_data_for_outputs_c_api(op):
-  """Set shapes and resource handle data using info from the C API."""
-  assert not _USE_C_SHAPES
-  for output in op.outputs:
-    output._shape_val = output._c_api_shape()
-    # Set the resource handle data for compatibility with the Python shape
-    # inference code.
-    serialized = c_api.GetHandleShapeAndType(op._graph._c_graph,  # pylint: disable=protected-access
-                                             output._as_tf_output())
-    if serialized:
-      output._handle_data = (
-          cpp_shape_inference_pb2.CppShapeInferenceResult.HandleData
-          .FromString(compat.as_bytes(serialized)))
-    else:
-      output._handle_data = None
-
-
-# TODO(b/74620627): remove when _USE_C_SHAPES is removed
-def set_shape_and_handle_data_for_outputs(op):
-  """Set the shapes and resource handle data for op's outputs.
-
-  When _USE_C_SHAPES = False, this is lazily called when a tensor's shape is
-  first requested. Usually this should work automatically, but some edge cases
-  may require manually calling this first to make sure Tensor._shape_val and
-  Tensor._handle_data are set (e.g. manually overriding _handle_data, copying a
-  Tensor).
-  """
-  if _USE_C_SHAPES: return
-
-  if op.graph._is_function(op.type):
-    for output in op.outputs:
-      output._shape_val = tensor_shape.unknown_shape()
-    return
-
-  try:
-    shape_func = _shape_registry.lookup(op.type)
-  except LookupError:
-    try:
-      shape_func = _default_shape_function_registry.lookup(op.type)
-    except LookupError:
-      shape_func = _call_cpp_shape_fn_and_require_op
-
-  shapes = shape_func(op)
-  if shapes is None:
-    raise RuntimeError(
-        "Shape function for op %s did not return any shapes" % op)
-  elif isinstance(shapes, dict):
-    # Returned by call_cpp_shape_fn
-    shapes_dict = shapes
-    shapes = shapes_dict["shapes"]
-    handle_datas = shapes_dict["handle_data"]
-    for output, handle_data in zip(op.outputs, handle_datas):
-      # Don't override any existing handle data that may have been manually set.
-      # pylint: disable=protected-access
-      if output._handle_data is None:
-        output._handle_data = handle_data
-      # pylint: enable=protected-access
-
-  if len(op.outputs) != len(shapes):
-    raise RuntimeError(
-        "Shape function for op %s returned %d shapes but expected %d %s %s" %
-        (op, len(shapes), len(op.outputs), shape_func.__name__, str(shapes)))
-  for output, s in zip(op.outputs, shapes):
-    output._shape_val = tensor_shape.unknown_shape()
-    output._shape_val = output._shape_val.merge_with(s)
+def set_shape_and_handle_data_for_outputs(_):
+  """No op. TODO(b/74620627): Remove this."""
+  pass
 
 
 class OpStats(object):
@@ -2901,8 +2838,8 @@ class Graph(object):
     self._stack_state_is_thread_local = False
     self._thread_local = threading.local()
     # Functions that will be applied to choose a device if none is specified.
-    # After switch_to_thread_local(), self._thread_local._device_function_stack
-    # is used instead.
+    # In TF2.x or after switch_to_thread_local(),
+    # self._thread_local._device_function_stack is used instead.
     self._graph_device_function_stack = traceable_stack.TraceableStack()
     # Default original_op applied to new ops.
     self._default_original_op = None
@@ -2910,7 +2847,7 @@ class Graph(object):
     # WhileContext defined in ops/control_flow_ops.py
     self._control_flow_context = None
     # A new node will depend of the union of all of the nodes in the stack.
-    # After switch_to_thread_local(),
+    # In TF2.x or after switch_to_thread_local(),
     # self._thread_local._control_dependencies_stack is used instead.
     self._graph_control_dependencies_stack = []
     # Arbitrary collections of objects.
@@ -2934,7 +2871,7 @@ class Graph(object):
         producer=versions.GRAPH_DEF_VERSION,
         min_consumer=versions.GRAPH_DEF_VERSION_MIN_CONSUMER)
     self._building_function = False
-    # Stack of colocate_with ops. After switch_to_thread_local(),
+    # Stack of colocate_with ops. In TF2.x or after switch_to_thread_local(),
     # self._thread_local._colocation_stack is used instead.
     self._graph_colocation_stack = traceable_stack.TraceableStack()
     # Set of tensors that are dangerous to feed!
@@ -2967,6 +2904,8 @@ class Graph(object):
     # requirement (many custom ops do not have shape functions, and we don't
     # want to break these existing cases).
     c_api.SetRequireShapeInferenceFns(self._c_graph, False)
+    if tf2.enabled():
+      self.switch_to_thread_local()
 
   # Note: this method is private because the API of tf.Graph() is public and
   # frozen, and this functionality is still not ready for public visibility.
@@ -3391,36 +3330,6 @@ class Graph(object):
     self._create_op_helper(ret, compute_device=compute_device)
     return ret
 
-  def _make_colocation_conflict_message(self, op, colocation_op):
-    """Return detailed error message about device conflict due to colocation."""
-    # Example error message:
-    #   Tried to colocate op 'a' (defined at file1.py:149) having device
-    #   '/device:GPU:0' with op 'b' (defined at file2:96) which had an
-    #   incompatible device '/device:CPU:0'.
-    #
-    #   No node-device colocations were active during op 'a' creation.
-    #   Device assignments active during op 'a' creation:
-    #     with tf.device(/device:GPU:0): file1.py:148>
-    #
-    #   Node-device colocations active during op 'b' creation:
-    #     with tf.colocate_with(a): file2.py:93>
-    #   Device assignments active during op 'b' creation:
-    #     with tf.device(/cpu:0): file2.py:94
-    op_info = error_interpolation.compute_field_dict(op)
-    coloc_op_info = error_interpolation.compute_field_dict(colocation_op)
-    msg = ("Tried to colocate op '{op_name}'{op_loc} having device '{op_dev}' "
-           "with op '{coloc_op_name}'{coloc_op_loc} which had an incompatible "
-           "device '{coloc_op_dev}'.\n\n{op_summary}\n\n{coloc_op_summary}"
-           .format(op_name=op.name,
-                   op_loc=op_info["defined_at"],
-                   op_dev=op.device,
-                   op_summary=op_info["devs_and_colocs"],
-                   coloc_op_name=colocation_op.name,
-                   coloc_op_loc=coloc_op_info["defined_at"],
-                   coloc_op_dev=colocation_op.device,
-                   coloc_op_summary=coloc_op_info["devs_and_colocs"]))
-    return msg
-
   def _create_op_helper(self, op, compute_device=True):
     """Common logic for creating an op in this graph."""
     # Apply any additional attributes requested. Do not overwrite any existing
@@ -3473,12 +3382,9 @@ class Graph(object):
       for colocation_op in self._colocation_stack.peek_objs():
         all_colocation_groups.extend(colocation_op.colocation_groups())
         if colocation_op.device:
-          if (op.device and pydev.canonical_name(op.device) !=
-              pydev.canonical_name(colocation_op.device)):
-            msg = self._make_colocation_conflict_message(op, colocation_op)
-            logging.warning(msg)
-          else:
-            op._set_device(colocation_op.device)  # pylint: disable=protected-access
+          # pylint: disable=protected-access
+          op._set_device(colocation_op.device)
+          # pylint: enable=protected-access
 
       all_colocation_groups = sorted(set(all_colocation_groups))
       # pylint: disable=protected-access
@@ -3526,11 +3432,6 @@ class Graph(object):
 
     # pylint: disable=protected-access
     for op in new_ops:
-      # Operations created by the C API always retrieve shapes from the C API so
-      # we preserve the shapes of ops created in import_graph_def (from the
-      # "_output_shapes" attr of the imported NodeDef).
-      if not _USE_C_SHAPES:
-        _set_shape_and_handle_data_for_outputs_c_api(op)
       new_control_inputs = self._control_dependencies_for_inputs(op.inputs)
       op._add_control_inputs(new_control_inputs)
       op._control_flow_post_processing()
@@ -5474,6 +5375,10 @@ def init_scope():
 
 def executing_eagerly_outside_functions():
   """Returns True if executing eagerly, even if inside a graph function."""
+  # Fastpath for when this is called eagerly (its not necessary to init_scope).
+  if context.executing_eagerly():
+    return True
+
   with init_scope():
     return context.executing_eagerly()
 
@@ -5482,7 +5387,7 @@ def inside_function():
   return get_default_graph().building_function
 
 
-@tf_export("enable_eager_execution")
+@tf_export(v1=["enable_eager_execution"])
 def enable_eager_execution(config=None,
                            device_policy=None,
                            execution_mode=None):
@@ -5553,6 +5458,17 @@ def enable_eager_execution(config=None,
         server_def=None)
 
 
+@tf_export(v1=["disable_eager_execution"])
+def disable_eager_execution():
+  """Disables eager execution.
+
+  This function can only be called before any Graphs, Ops, or Tensors have been
+  created. It can be used at the beginning of the program for complex migration
+  projects from TensorFlow 1.x to 2.x.
+  """
+  context.default_execution_mode = context.GRAPH_MODE
+
+
 def enable_eager_execution_internal(config=None,
                                     device_policy=None,
                                     execution_mode=None,
@@ -5560,6 +5476,7 @@ def enable_eager_execution_internal(config=None,
   """Enables eager execution for the lifetime of this program.
 
   Most of the doc string for enable_eager_execution is relevant here as well.
+
   Args:
     config: See enable_eager_execution doc string
     device_policy: See enable_eager_execution doc string
@@ -5652,7 +5569,7 @@ def eager_run(main=None, argv=None):
   app.run(main, argv)
 
 
-@tf_export("reset_default_graph")
+@tf_export(v1=["reset_default_graph"])
 def reset_default_graph():
   """Clears the default graph stack and resets the global default graph.
 
@@ -5671,7 +5588,7 @@ def reset_default_graph():
   _default_graph_stack.reset()
 
 
-@tf_export("get_default_graph")
+@tf_export(v1=["get_default_graph"])
 def get_default_graph():
   """Returns the default graph for the current thread.
 
@@ -5798,7 +5715,7 @@ def _get_graph_from_inputs(op_input_list, graph=None):
   return graph or get_default_graph()
 
 
-@tf_export("GraphKeys")
+@tf_export(v1=["GraphKeys"])
 class GraphKeys(object):
   """Standard names to use for graph collections.
 
@@ -6004,7 +5921,7 @@ def add_to_collections(names, value):
   get_default_graph().add_to_collections(names, value)
 
 
-@tf_export("get_collection_ref")
+@tf_export(v1=["get_collection_ref"])
 def get_collection_ref(key):
   """Wrapper for `Graph.get_collection_ref()` using the default graph.
 
@@ -6028,7 +5945,7 @@ def get_collection_ref(key):
   return get_default_graph().get_collection_ref(key)
 
 
-@tf_export("get_collection")
+@tf_export(v1=["get_collection"])
 def get_collection(key, scope=None):
   """Wrapper for `Graph.get_collection()` using the default graph.
 
diff --git a/tensorflow/python/framework/ops_test.py b/tensorflow/python/framework/ops_test.py
index 0fb17081e758a7f43a1fb1e6d415da3ed630aea7..0fcbcd6ee4dd1f103c599dc4db26432b61879e83 100644
--- a/tensorflow/python/framework/ops_test.py
+++ b/tensorflow/python/framework/ops_test.py
@@ -57,11 +57,13 @@ ops._set_call_cpp_shape_fn(common_shapes.call_cpp_shape_fn)
 
 class ResourceTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBuildGraph(self):
     with self.cached_session():
       pt = test_ops.stub_resource_handle_op(container="a", shared_name="b")
       test_ops.resource_create_op(pt).run()
 
+  @test_util.run_deprecated_v1
   def testInitialize(self):
     with self.cached_session():
       handle = test_ops.stub_resource_handle_op(container="a", shared_name="b")
@@ -106,6 +108,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual([2, 3], c.shape)
 
+  @test_util.run_deprecated_v1
   def testUnknownDim(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=[2, None, 3])
@@ -113,6 +116,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual([2, None, 3], c.shape.as_list())
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=None)
@@ -120,6 +124,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual(tensor_shape.unknown_shape(), c.shape)
 
+  @test_util.run_deprecated_v1
   def testScalarShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtype=dtypes.float32, shape=[])
@@ -127,6 +132,7 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
       c = a + b
       self.assertEqual(tensor_shape.scalar(), c.shape)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionError(self):
     with self.cached_session():
       a = array_ops.ones([1, 2, 3])
@@ -140,15 +146,16 @@ class TensorAndShapeTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_in_graph_and_eager_modes
   def testToTensor(self):
-    with self.cached_session():
-      values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
-      indices = constant_op.constant([0, 2])
-      dense_shape = constant_op.constant([3, 2])
-      x = ops.IndexedSlices(values, indices, dense_shape)
-      tensor = ops.convert_to_tensor(x, name="tensor")
-      self.assertAllEqual(tensor.eval(), [[2, 3], [0, 0], [5, 7]])
-
+    values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
+    indices = constant_op.constant([0, 2])
+    dense_shape = constant_op.constant([3, 2])
+    x = ops.IndexedSlices(values, indices, dense_shape)
+    tensor = ops.convert_to_tensor(x, name="tensor")
+    self.assertAllEqual(self.evaluate(tensor), [[2, 3], [0, 0], [5, 7]])
+
+  @test_util.run_deprecated_v1
   def testNegation(self):
     with self.cached_session():
       values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
@@ -157,6 +164,7 @@ class IndexedSlicesTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(x.values.eval(), [[-2, -3], [-5, -7]])
       self.assertAllEqual(x.indices.eval(), [0, 2])
 
+  @test_util.run_deprecated_v1
   def testScalarMul(self):
     with self.cached_session():
       values = constant_op.constant([2, 3, 5, 7], shape=[2, 2])
@@ -190,6 +198,7 @@ def _apply_op(g, *args, **kwargs):
 
 class OperationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testNoInputs(self):
     op = test_ops.float_output_string_output(name="myop").a.op
     self.assertEqual(2, len(op.values()))
@@ -212,6 +221,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertProtoEquals("op:'FloatOutputStringOutput' name:'myop'",
                            op.node_def)
 
+  @test_util.run_deprecated_v1
   def testNoOutputs(self):
     op1 = test_ops.float_output(name="myop1").op
     float_t, = op1.values()
@@ -227,6 +237,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertProtoEquals("op:'FloatInput' name:'myop2' input:'myop1'",
                            op2.node_def)
 
+  @test_util.run_deprecated_v1
   def testInputsAndOutputs(self):
     op1 = test_ops.float_output(name="myop1").op
     self.assertEqual(1, len(op1.values()))
@@ -308,16 +319,17 @@ class OperationTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       ops.Operation(ops._NodeDef("op", "invalid:0"), g)
 
+  @test_util.run_deprecated_v1
   def testNoShapeFunction(self):
     op = test_ops.a()
     self.assertEqual(tensor_shape.unknown_shape(), op.get_shape())
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedArray(self):
-    with self.cached_session():
-      values = [[2], [3], [5], [7]]
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+    values = [[2], [3], [5], [7]]
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
   def testShapeTuple(self):
     with self.cached_session():
@@ -333,57 +345,63 @@ class OperationTest(test_util.TensorFlowTestCase):
       converted = ops.convert_to_tensor(1)
       self.assertTrue(isinstance(converted, ops.EagerTensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTuple(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, ops.convert_to_tensor(values).eval())
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(ops.convert_to_tensor(values)))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedTensors(self):
-    with self.cached_session():
-      values = ((2,), (3,), (5,), (7,))
-      tensor = ops.convert_to_tensor(
-          [constant_op.constant(row) for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
-      tensor = ops.convert_to_tensor(
-          [[constant_op.constant(v) for v in row] for row in values])
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(values, tensor.eval())
+    values = ((2,), (3,), (5,), (7,))
+    tensor = ops.convert_to_tensor(
+        [constant_op.constant(row) for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
+    tensor = ops.convert_to_tensor(
+        [[constant_op.constant(v) for v in row] for row in values])
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(values, self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorNestedMix(self):
-    with self.cached_session():
-      values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
-      tensor = ops.convert_to_tensor(values)
-      self.assertAllEqual((4, 1), tensor.get_shape().as_list())
-      self.assertAllEqual(((2,), (3,), (5,), (7,)), tensor.eval())
+    values = ([2], (3,), [constant_op.constant(5)], constant_op.constant([7]))
+    tensor = ops.convert_to_tensor(values)
+    self.assertAllEqual((4, 1), tensor.get_shape().as_list())
+    self.assertAllEqual(((2,), (3,), (5,), (7,)), self.evaluate(tensor))
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToTensorPreferred(self):
-    with self.cached_session():
-      values = [2, 3, 5, 7]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    values = [2, 3, 5, 7]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.float32)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
-    with self.cached_session():
-      # Convert empty tensor to anything.
-      values = []
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.int64, tensor.dtype)
+    # Convert empty tensor to anything.
+    values = []
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.int64, tensor.dtype)
 
-    with self.cached_session():
-      # The preferred dtype is a type error and will convert to
-      # float32 instead.
-      values = [1.23]
-      tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
-      self.assertEqual(dtypes.float32, tensor.dtype)
+    # The preferred dtype is a type error and will convert to
+    # float32 instead.
+    values = [1.23]
+    tensor = ops.convert_to_tensor(values, preferred_dtype=dtypes.int64)
+    self.assertEqual(dtypes.float32, tensor.dtype)
 
+  @test_util.run_in_graph_and_eager_modes
   def testConvertToInvalidTensorType(self):
     with self.assertRaises(TypeError):
       # Forcing an invalid dtype should fail with a type error.
       values = [1.23]
-      _ = ops.convert_to_tensor(values, dtype=dtypes.int64)
+      ops.convert_to_tensor(values, dtype=dtypes.int64)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testConvertToTensorFromInvalidTensor(self):
+    tensor = constant_op.constant(42.0, dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      ops.convert_to_tensor(tensor, dtype=dtypes.int32)
+
+  @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Operation cannot be converted to Tensor.
     op = control_flow_ops.no_op()
@@ -401,6 +419,7 @@ class OperationTest(test_util.TensorFlowTestCase):
         ops._NodeDef("None", "op1"), ops.Graph(), [], [dtypes.float32])
     self.assertEqual("<tf.Operation 'op1' type=None>", repr(op))
 
+  @test_util.run_deprecated_v1
   def testGetAttr(self):
     op = test_ops.default_attrs()
     self.assertEqual(op.get_attr("string_val"), b"abc")
@@ -446,6 +465,7 @@ class OperationTest(test_util.TensorFlowTestCase):
 
   # TODO(b/65162920): remove this test when users who are directly mutating the
   # node_def have been updated to proper usage.
+  @test_util.run_deprecated_v1
   def testSetAttr(self):
     op = test_ops.int_attr().op
     op._set_attr("foo", attr_value_pb2.AttrValue(i=2))
@@ -466,6 +486,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(z.control_inputs, [x, y])
     self.assertEqual(x._control_outputs, [z])
 
+  @test_util.run_deprecated_v1
   def testRemoveAllControlInputs(self):
     a = constant_op.constant(1)
     with ops.control_dependencies([a]):
@@ -490,6 +511,7 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(f.op.control_inputs, [])
     self.assertEqual(list(f.op.inputs), [d, e])
 
+  @test_util.run_deprecated_v1
   def testControlInputCycle(self):
     graph = ops.Graph()
     with graph.as_default():
@@ -503,7 +525,7 @@ class OperationTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(
           errors.InvalidArgumentError,
           "Graph is invalid, contains a cycle with 2 nodes"):
-        sess.run(x)
+        self.evaluate(x)
 
   def testUpdateInput(self):
     g = ops.Graph()
@@ -517,21 +539,21 @@ class OperationTest(test_util.TensorFlowTestCase):
     self.assertEquals(x.consumers(), [])
     self.assertEquals(y.consumers(), [z.op, z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 4)
+      self.assertEquals(self.evaluate(z), 4)
 
     z.op._update_input(0, x)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
     z.op._update_input(1, y)  # pylint: disable=protected-access
     self.assertEquals(list(z.op.inputs), [x, y])
     self.assertEquals(x.consumers(), [z.op])
     self.assertEquals(y.consumers(), [z.op])
     with session.Session(graph=g) as sess:
-      self.assertEquals(sess.run(z), 3)
+      self.assertEquals(self.evaluate(z), 3)
 
   def testUpdateInputGraphError(self):
     g_0 = ops.Graph()
@@ -557,7 +579,7 @@ class OperationTest(test_util.TensorFlowTestCase):
           errors.InvalidArgumentError,
           "Input 0 of node add was passed string from Const_1:0 incompatible "
           "with expected int32"):
-        sess.run(z)
+        self.evaluate(z)
 
   def testUpdateInputShapeError(self):
     g = ops.Graph()
@@ -582,6 +604,33 @@ class OperationTest(test_util.TensorFlowTestCase):
     ):
       x.op._update_input(1, x)  # pylint: disable=protected-access
 
+  @test_util.enable_control_flow_v2
+  @test_util.run_v1_only("b/120545219")
+  def testAddWhileInput(self):
+    @eager_function.defun
+    def test():
+      output = control_flow_ops.while_loop(lambda x: x < 3, lambda x: x + 1,
+                                           [1])
+      while_op = output.op.inputs[0].op
+      self.assertEqual(while_op.type, "While")
+      orig_num_inputs = len(while_op.inputs)
+
+      new_input1 = constant_op.constant(1.0)
+      new_input2 = constant_op.constant(True)
+
+      while_op._set_type_list_attr("T",
+                                   [t.dtype for t in while_op.inputs] +
+                                   [new_input1.dtype, new_input2.dtype])
+
+      while_op._add_while_inputs([new_input1, new_input2])
+      # Can't add an edge beyond what's specified by "T"
+      with self.assertRaises(errors.OutOfRangeError):
+        while_op._add_while_inputs([new_input2])
+      self.assertEqual(len(while_op.inputs), orig_num_inputs + 2)  # pylint: disable=g-deprecated-assert
+
+    test()
+
+  @test_util.run_deprecated_v1
   def testOpDef(self):
     x = constant_op.constant(0)
     y = constant_op.constant(1)
@@ -681,6 +730,7 @@ class CreateOpTest(test_util.TensorFlowTestCase):
 # the low-level behavior.
 class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
@@ -701,7 +751,6 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(g.get_operation_by_name("myop"), op)
     self.assertEqual(g.get_tensor_by_name("myop:0"), op.outputs[0])
 
-  @test_util.enable_c_shapes
   def testShape(self):
     g = ops.Graph()
     with g.as_default():
@@ -732,6 +781,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     self.assertEqual(op3.name, "myop_2")
     self.assertEqual(op4.name, "myop_1_1")
 
+  @test_util.run_v1_only("b/120545219")
   def testCond(self):
     g = ops.Graph()
     with g.as_default():
@@ -761,6 +811,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "cond/cond_text")
     # pylint: enable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoop(self):
     g = ops.Graph()
     with g.as_default():
@@ -790,6 +841,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
                      "myloop/while_context")
     # pylint: enable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithInternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -813,6 +865,7 @@ class CreateOpFromTFOperationTest(test_util.TensorFlowTestCase):
     # Internal control dep is preserved
     self.assertEqual(op.control_inputs, [c])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileLoopWithExternalControlDep(self):
     g = ops.Graph()
     with g.as_default():
@@ -946,6 +999,7 @@ class NameStackTest(test_util.TensorFlowTestCase):
     self.assertEqual("bar_2", g.unique_name("bar", mark_as_used=False))
     self.assertEqual("bar_2", g.unique_name("bar"))
 
+  @test_util.run_deprecated_v1
   def testNameAndVariableScope(self):
     with self.cached_session() as sess:
       with sess.graph.name_scope("l0"):
@@ -1076,6 +1130,13 @@ class DeviceTest(test_util.TensorFlowTestCase):
       node { name: "FloatOutput" op: "FloatOutput" }
     """, gd)
 
+  def testEagerBackingDevice(self):
+    with context.eager_mode():
+      with ops.device("/device:CPU:0"):
+        t = constant_op.constant(1.0)
+        self.assertRegexpMatches(t.device, "/device:CPU:0")
+        self.assertRegexpMatches(t.backing_device, "/device:CPU:0")
+
   def testDevicePartialString(self):
     g = ops.Graph()
     with g.device("/job:worker/replica:2"):
@@ -1665,6 +1726,7 @@ def _CopyOverrideGrad(op, x_grad):  # pylint: disable=invalid-name
 
 class RegistrationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testRegisterGradients(self):
     x = test_ops.float_output()
     y = test_ops.copy_op(x)
@@ -1704,6 +1766,7 @@ class ComparisonTest(test_util.TensorFlowTestCase):
 
 class ControlDependenciesTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     g = ops.Graph()
     with g.as_default():
@@ -1947,6 +2010,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(None, "default2") as scope2:
         self.assertEqual(scope2, "default/default2/")
 
+  @test_util.run_deprecated_v1
   def testNoScopeName(self):
     g0 = ops.Graph()
     values = [
@@ -1960,6 +2024,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(None, None, values):
         pass
 
+  @test_util.run_deprecated_v1
   def testEmptyScopeName(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -1971,6 +2036,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual("", scope)
       self.assertEqual(g0, ops.get_default_graph())
 
+  @test_util.run_deprecated_v1
   def testDefaultScopeName(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -1995,12 +2061,14 @@ class OpScopeTest(test_util.TensorFlowTestCase):
       with ops.name_scope(scope_name, values=graph_elements + [a]):
         pass
 
+  @test_util.run_deprecated_v1
   def testTensor(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
     b = g0.create_op("B", [], [dtypes.float32])
     self._testGraphElements([a, b])
 
+  @test_util.run_deprecated_v1
   def testSparseTensor(self):
     g0 = ops.Graph()
     a = g0.create_op("A", [], [dtypes.float32])
@@ -2011,6 +2079,7 @@ class OpScopeTest(test_util.TensorFlowTestCase):
         _apply_op(g0, "Int64Output", [], [dtypes.int64]))
     self._testGraphElements([a, sparse, b])
 
+  @test_util.run_deprecated_v1
   def testVariable(self):
     g0 = ops.Graph()
     with g0.as_default():
@@ -2215,6 +2284,7 @@ class InitScopeTest(test_util.TensorFlowTestCase):
       self.assertEqual(4, int(compiled_outer(inner=compiled_inner)))
       self.assertEqual(7, int(compiled_outer(inner=compiled_inner)))
 
+  @test_util.run_v1_only("b/120545219")
   def testFallsBackToGlobalGraphWhenAllGraphsAreBuildingFunctions(self):
     with context.graph_mode():
       ops.reset_default_graph()
@@ -2351,6 +2421,7 @@ class GraphTest(test_util.TensorFlowTestCase):
     g.prevent_feeding(a)
     self.assertFalse(g.is_feedable(a))
 
+  @test_util.run_deprecated_v1
   def testPreventFetching(self):
     g = ops.Graph()
     a = constant_op.constant(2.0)
@@ -2391,7 +2462,7 @@ class GraphTest(test_util.TensorFlowTestCase):
       c = math_ops.add(a, b)
     # Create a session we can delete
     with session.Session(graph=g) as sess:
-      sess.run(c)
+      self.evaluate(c)
     # Delete all references and trigger gc
     del g
     del a
@@ -2407,7 +2478,7 @@ class GraphTest(test_util.TensorFlowTestCase):
         math_ops.add([1, 2], [1, 2, 3])
       a = constant_op.constant(1)
       with session.Session() as sess:
-        sess.run(a)
+        self.evaluate(a)
 
   def testRunnableAfterInvalidShapeWithKernelLabelMap(self):
     g = ops.Graph()
@@ -2417,7 +2488,7 @@ class GraphTest(test_util.TensorFlowTestCase):
           test_ops.kernel_label_required(1)
       a = constant_op.constant(1)
       with session.Session() as sess:
-        sess.run(a)
+        self.evaluate(a)
 
 
 class AttrScopeTest(test_util.TensorFlowTestCase):
@@ -2434,10 +2505,12 @@ class AttrScopeTest(test_util.TensorFlowTestCase):
       b = None
     return (a, b)
 
+  @test_util.run_deprecated_v1
   def testNoLabel(self):
     with self.cached_session():
       self.assertAllEqual((None, None), self._get_test_attrs())
 
+  @test_util.run_deprecated_v1
   def testLabelMap(self):
     with self.cached_session() as sess:
       a1 = self._get_test_attrs()
@@ -2472,11 +2545,13 @@ ops.RegisterShape("KernelLabel")(common_shapes.scalar_shape)
 
 class KernelLabelTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testNoLabel(self):
     with self.cached_session():
       self.assertAllEqual(b"My label is: default",
                           test_ops.kernel_label().eval())
 
+  @test_util.run_deprecated_v1
   def testLabelMap(self):
     with self.cached_session() as sess:
       default_1 = test_ops.kernel_label()
@@ -2491,12 +2566,14 @@ class KernelLabelTest(test_util.TensorFlowTestCase):
       # pylint: enable=protected-access
       default_3 = test_ops.kernel_label()
 
-      self.assertAllEqual(b"My label is: default", default_1.eval())
-      self.assertAllEqual(b"My label is: default", default_2.eval())
-      self.assertAllEqual(b"My label is: default", default_3.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_1.eval())
-      self.assertAllEqual(b"My label is: overload_1", overload_1_2.eval())
-      self.assertAllEqual(b"My label is: overload_2", overload_2.eval())
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_1))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_2))
+      self.assertAllEqual(b"My label is: default", self.evaluate(default_3))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_1))
+      self.assertAllEqual(b"My label is: overload_1",
+                          self.evaluate(overload_1_2))
+      self.assertAllEqual(b"My label is: overload_2", self.evaluate(overload_2))
 
 
 class AsGraphDefTest(test_util.TensorFlowTestCase):
@@ -2591,6 +2668,7 @@ class StatisticsTest(test_util.TensorFlowTestCase):
 
 class DeviceStackTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicDeviceAssignmentMetadata(self):
 
     def device_func(unused_op):
@@ -2622,6 +2700,7 @@ class DeviceStackTest(test_util.TensorFlowTestCase):
     expected_regex = r"device_func<.*ops_test.py, [0-9]+"
     self.assertRegexpMatches(func_description, expected_regex)
 
+  @test_util.run_deprecated_v1
   def testDeviceAssignmentMetadataForGraphDeviceAndTfDeviceFunctions(self):
 
     with ops.device("/cpu"):
@@ -2641,6 +2720,7 @@ class DeviceStackTest(test_util.TensorFlowTestCase):
 
 class ColocationGroupTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2651,6 +2731,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       c.op.get_attr("_class")
 
+  @test_util.run_deprecated_v1
   def testBasicColocationMetadata(self):
     const_two = constant_op.constant([2.0], name="two")
     with ops.colocate_with(const_two.op):
@@ -2663,6 +2744,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     # colocation statement.
     self.assertEqual("ops_test.py", os.path.basename(metadata.filename))
 
+  @test_util.run_deprecated_v1
   def testColocationDeviceInteraction(self):
     with ops.device("/cpu:0"):
       with ops.device("/device:GPU:0"):
@@ -2675,6 +2757,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual(a.op.device, b.op.device)
 
+  @test_util.run_deprecated_v1
   def testColocationCanonicalization(self):
     with ops.device("/device:GPU:0"):
       _ = constant_op.constant(2.0)
@@ -2690,6 +2773,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     # inherits B's device name, after canonicalizing the names.
     self.assertEqual(b.op.device, c.op.device)
 
+  @test_util.run_deprecated_v1
   def testLocationOverrides(self):
     with ops.device("/cpu:0"):
       with ops.device("/device:GPU:0"):
@@ -2711,6 +2795,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual("/device:GPU:0", c.op.device)
     self.assertEqual("/device:CPU:0", d.op.device)
 
+  @test_util.run_deprecated_v1
   def testNestedColocateWith(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2720,6 +2805,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual([b"loc:@a"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testMultiColocationGroups(self):
     a = constant_op.constant([2.0], name="a")
     b = constant_op.constant(3.0, name="b")
@@ -2728,6 +2814,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
         c = constant_op.constant(4.0)
     self.assertEqual(set([b"loc:@a", b"loc:@b"]), set(c.op.colocation_groups()))
 
+  @test_util.run_deprecated_v1
   def testColocationIgnoreStack(self):
     a = constant_op.constant([2.0], name="a")
     b = constant_op.constant(3.0, name="b")
@@ -2736,6 +2823,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
         c = constant_op.constant(4.0)
     self.assertEqual(set([b"loc:@b"]), set(c.op.colocation_groups()))
 
+  @test_util.run_deprecated_v1
   def testColocateWithReset(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2745,6 +2833,7 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
     self.assertEqual([b"loc:@c"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testColocateWithInitialNoneThenNested(self):
     a = constant_op.constant([2.0], name="a")
     with ops.colocate_with(a.op):
@@ -2755,47 +2844,13 @@ class ColocationGroupTest(test_util.TensorFlowTestCase):
     self.assertEqual([b"loc:@b"], b.op.colocation_groups())
     self.assertEqual([b"loc:@b"], c.op.colocation_groups())
 
+  @test_util.run_deprecated_v1
   def testColocateVariables(self):
     a = variables.Variable([2.0], name="a")
     with ops.colocate_with(a.op):
       b = variables.Variable([3.0], name="b")
     self.assertEqual([b"loc:@a"], b.op.colocation_groups())
 
-  def testInconsistentDeviceWithinColocate(self):
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        # This is allowed due to legacy but clearly wrong, since we
-        # should really be colocating with 'a'.  We allow devices to
-        # override colocate_with, but we log warnings to suggest that
-        # this is probably unintentional or misguided.
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-
-    self.assertEqual("/device:CPU:0", b.device)
-
-  def testMakeColocationConflictMessage(self):
-    """Test that provides an example of a complicated error message."""
-    # We could test the message with any ops, but this test will be more
-    # instructive with a real colocation conflict.
-    with ops.device("/device:GPU:0"):
-      a = constant_op.constant([2.0], name="a")
-      with ops.colocate_with(a.op):
-        with ops.device("/cpu:0"):
-          b = constant_op.constant([3.0], name="b")
-    # The definition-location of the nodes will be wrong because of running
-    # from within a TF unittest.  The rest of the info should be correct.
-    message = ops.get_default_graph()._make_colocation_conflict_message(a.op,
-                                                                        b.op)
-    self.assertRegexpMatches(message,
-                             r"Tried to colocate op 'a' \(defined at.*\)")
-    self.assertRegexpMatches(message, "No node-device.*'a'")
-    self.assertRegexpMatches(message, "Device assignments active.*'a'")
-    self.assertRegexpMatches(message, "GPU:0")
-    self.assertRegexpMatches(message, "Node-device colocations active.*'b'")
-    self.assertRegexpMatches(message, "Device assignments active.*'b'")
-    self.assertRegexpMatches(message, "cpu:0")
-
 
 class DeprecatedTest(test_util.TensorFlowTestCase):
 
@@ -2918,6 +2973,7 @@ class NameScopeTest(test_util.TensorFlowTestCase):
 
 class TracebackTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTracebackWithStartLines(self):
     with self.cached_session() as sess:
       a = constant_op.constant(2.0)
@@ -2939,6 +2995,7 @@ class TracebackTest(test_util.TensorFlowTestCase):
 
 class EnableEagerExecutionTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBadArgumentsToEnableEagerExecution(self):
     with self.assertRaisesRegexp(TypeError, "config must be a tf.ConfigProto"):
       ops.enable_eager_execution(context.DEVICE_PLACEMENT_SILENT)
diff --git a/tensorflow/python/framework/python_op_gen.cc b/tensorflow/python/framework/python_op_gen.cc
index 465016b808726f28909013e994b9b23b915d982a..d460168631c3032bb91894c9997b2de29bf026e6 100644
--- a/tensorflow/python/framework/python_op_gen.cc
+++ b/tensorflow/python/framework/python_op_gen.cc
@@ -142,6 +142,7 @@ class GenEagerPythonOp : public python_op_gen_internal::GenPythonOp {
   void AddEagerAttrs(const string& indentation);
   void AddEagerExecute(const string& indentation,
                        const string& num_outputs_expr);
+  void AddDispatch(const string& prefix);
 
   void AddAttrForArg(const string& attr, int arg_index) {
     gtl::InsertIfNotPresent(&inferred_attrs_, attr,
@@ -356,9 +357,14 @@ string GenEagerPythonOp::Code() {
 
 void GenEagerPythonOp::HandleGraphMode(const string& function_setup) {
   strings::StrAppend(&result_, "  # Add nodes to the TensorFlow graph.\n");
-  strings::StrAppend(&result_, function_setup,
-                     "  _, _, _op = _op_def_lib._apply_op_helper(\n");
-  AddBodyNoReturn("        ");
+  strings::StrAppend(&result_, function_setup);
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "  try:\n  ");
+  }
+  strings::StrAppend(&result_, "  _, _, _op = _op_def_lib._apply_op_helper(\n");
+  AddBodyNoReturn(strings::StrCat("        \"", op_def_.name(), "\", "));
+  AddDispatch("  ");
+
   if (num_outs_ > 0) {
     strings::StrAppend(&result_, "  _result = _op.outputs[:]\n");
     // Special case handling for stateful op with single list output
@@ -628,6 +634,9 @@ void GenEagerPythonOp::AddEagerFunctionTeardown(
 bool GenEagerPythonOp::AddEagerFastPathAndGraphCode(
     const string& parameters, const std::vector<string>& output_sizes,
     const string& eager_not_allowed_error) {
+  if (api_def_.visibility() == ApiDef::VISIBLE) {
+    strings::StrAppend(&result_, "@_dispatch.add_dispatch_list\n");
+  }
   AddExport();
   AddDefLine(function_name_, parameters);
   AddDocStringDescription();
@@ -758,6 +767,7 @@ void GenEagerPythonOp::AddEagerFastPathExecute() {
   strings::StrAppend(&result_, "      except _core._SymbolicException:\n");
   strings::StrAppend(&result_,
                      "        pass  # Add nodes to the TensorFlow graph.\n");
+  AddDispatch("      ");
 
   // Any errors thrown from execute need to be unwrapped from
   // _NotOkStatusException.
@@ -898,6 +908,19 @@ void GenEagerPythonOp::AddEagerExecute(const string& indentation,
                      WordWrap(return_prefix, return_args, kRightMargin), "\n");
 }
 
+void GenEagerPythonOp::AddDispatch(const string& prefix) {
+  if (api_def_.visibility() != ApiDef::VISIBLE) return;
+
+  strings::StrAppend(&result_, prefix, "except (TypeError, ValueError):\n");
+  strings::StrAppend(&result_, prefix, "  result = _dispatch.dispatch(\n");
+  AddBodyNoReturn(strings::StrCat(prefix, "        ", function_name_, ", "));
+  strings::StrAppend(&result_, prefix,
+                     "  if result is not "
+                     "_dispatch.OpDispatcher.NOT_SUPPORTED:\n");
+  strings::StrAppend(&result_, prefix, "    return result\n");
+  strings::StrAppend(&result_, prefix, "  raise\n");
+}
+
 string GetPythonOps(const OpList& ops, const ApiDefMap& api_defs,
                     const std::vector<string>& hidden_ops, bool require_shapes,
                     const string& source_file_name = "") {
@@ -937,6 +960,7 @@ from tensorflow.python.framework import op_def_registry as _op_def_registry
 from tensorflow.python.framework import ops as _ops
 from tensorflow.python.framework import op_def_library as _op_def_library
 from tensorflow.python.util.deprecation import deprecated_endpoints
+from tensorflow.python.util import dispatch as _dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 )");
diff --git a/tensorflow/python/framework/python_op_gen_internal.cc b/tensorflow/python/framework/python_op_gen_internal.cc
index 65b9ad5c6a2b5170a70ce376114feff27bb622d2..cbdeecfbfb93ad776ff9d3db755503c47970d330 100644
--- a/tensorflow/python/framework/python_op_gen_internal.cc
+++ b/tensorflow/python/framework/python_op_gen_internal.cc
@@ -804,8 +804,8 @@ void GenPythonOp::AddDocStringOutputs() {
 }
 
 void GenPythonOp::AddBody(const string& prefix) {
-  const string apply_prefix =
-      strings::StrCat(prefix, "_result = _op_def_lib.apply_op(");
+  const string apply_prefix = strings::StrCat(
+      prefix, "_result = _op_def_lib.apply_op(\"", op_def_.name(), "\", ");
   AddBodyNoReturn(apply_prefix);
   if (num_outs_ > 1) {
     strings::StrAppend(&result_, prefix, "_result = _", op_def_.name(),
@@ -815,7 +815,7 @@ void GenPythonOp::AddBody(const string& prefix) {
 }
 
 void GenPythonOp::AddBodyNoReturn(const string& apply_prefix) {
-  string args = strings::StrCat("\"", op_def_.name(), "\", ");
+  string args;
   for (size_t i = 0; i < param_names_.size(); ++i) {
     strings::StrAppend(&args, AvoidPythonReserved(param_names_[i].GetName()),
                        "=", param_names_[i].GetRenameTo(), ", ");
diff --git a/tensorflow/python/framework/random_seed.py b/tensorflow/python/framework/random_seed.py
index 777bb2fe8c544440e2897c73ecabf332b7fd18ee..6b7f56a92cc02fd9f44a541ed3536b35653031d9 100644
--- a/tensorflow/python/framework/random_seed.py
+++ b/tensorflow/python/framework/random_seed.py
@@ -34,7 +34,7 @@ def _truncate_seed(seed):
   return seed % _MAXINT32  # Truncate to fit into 32-bit integer
 
 
-@tf_export('random.get_seed', v1=['random.get_seed', 'get_seed'])
+@tf_export(v1=['random.get_seed', 'get_seed'])
 @deprecation.deprecated_endpoints('get_seed')
 def get_seed(op_seed):
   """Returns the local seeds an operation should use given an op-specific seed.
@@ -45,7 +45,7 @@ def get_seed(op_seed):
   graph, or for only specific operations.
 
   For details on how the graph-level seed interacts with op seeds, see
-  `tf.set_random_seed`.
+  `tf.random.set_random_seed`.
 
   Args:
     op_seed: integer.
@@ -82,7 +82,7 @@ def get_seed(op_seed):
   return seeds
 
 
-@tf_export('random.set_random_seed', 'set_random_seed')
+@tf_export(v1=['random.set_random_seed', 'set_random_seed'])
 def set_random_seed(seed):
   """Sets the graph-level random seed.
 
@@ -154,7 +154,7 @@ def set_random_seed(seed):
   sessions, set a graph-level seed:
 
   ```python
-  tf.set_random_seed(1234)
+  tf.random.set_random_seed(1234)
   a = tf.random_uniform([1])
   b = tf.random_normal([1])
 
@@ -182,3 +182,103 @@ def set_random_seed(seed):
     context.set_global_seed(seed)
   else:
     ops.get_default_graph().seed = seed
+
+
+@tf_export('random.set_seed', v1=[])
+def set_seed(seed):
+  """Sets the graph-level random seed.
+
+  Operations that rely on a random seed actually derive it from two seeds:
+  the graph-level and operation-level seeds. This sets the graph-level seed.
+
+  Its interactions with operation-level seeds is as follows:
+
+    1. If neither the graph-level nor the operation seed is set:
+      A random seed is used for this op.
+    2. If the graph-level seed is set, but the operation seed is not:
+      The system deterministically picks an operation seed in conjunction
+      with the graph-level seed so that it gets a unique random sequence.
+    3. If the graph-level seed is not set, but the operation seed is set:
+      A default graph-level seed and the specified operation seed are used to
+      determine the random sequence.
+    4. If both the graph-level and the operation seed are set:
+      Both seeds are used in conjunction to determine the random sequence.
+
+  To illustrate the user-visible effects, consider these examples:
+
+  To generate different sequences across sessions, set neither
+  graph-level nor op-level seeds:
+
+  ```python
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A3'
+    print(sess2.run(a))  # generates 'A4'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To generate the same repeatable sequence for an op across sessions, set the
+  seed for the op:
+
+  ```python
+  a = tf.random_uniform([1], seed=1)
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequence of values for 'a', but different sequences of values for 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B3'
+    print(sess2.run(b))  # generates 'B4'
+  ```
+
+  To make the random sequences generated by all ops be repeatable across
+  sessions, set a graph-level seed:
+
+  ```python
+  tf.random.set_seed(1234)
+  a = tf.random_uniform([1])
+  b = tf.random_normal([1])
+
+  # Repeatedly running this block with the same graph will generate the same
+  # sequences of 'a' and 'b'.
+  print("Session 1")
+  with tf.Session() as sess1:
+    print(sess1.run(a))  # generates 'A1'
+    print(sess1.run(a))  # generates 'A2'
+    print(sess1.run(b))  # generates 'B1'
+    print(sess1.run(b))  # generates 'B2'
+
+  print("Session 2")
+  with tf.Session() as sess2:
+    print(sess2.run(a))  # generates 'A1'
+    print(sess2.run(a))  # generates 'A2'
+    print(sess2.run(b))  # generates 'B1'
+    print(sess2.run(b))  # generates 'B2'
+  ```
+
+  Args:
+    seed: integer.
+  """
+  # TODO(go/tf2-random): change doc, update to match design doc
+  set_random_seed(seed)
diff --git a/tensorflow/python/framework/registry.py b/tensorflow/python/framework/registry.py
index 2e45acb499581e02c0661aa7cf63187cc213c5cd..4357c76bd6cc8ccac55b5e123fa0ce7cf3c0d19d 100644
--- a/tensorflow/python/framework/registry.py
+++ b/tensorflow/python/framework/registry.py
@@ -23,10 +23,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import traceback
-
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
+from tensorflow.python.util import tf_stack
 
 
 # Registry mechanism below is based on mapreduce.python.mrpython.Register.
@@ -57,15 +56,17 @@ class Registry(object):
     if name in self._registry:
       (filename, line_number, function_name, _) = (
           self._registry[name][_LOCATION_TAG])
-      raise KeyError("Registering two %s with name '%s' !"
+      raise KeyError("Registering two %s with name '%s'! "
                      "(Previous registration was in %s %s:%d)" %
                      (self._name, name, function_name, filename, line_number))
 
     logging.vlog(1, "Registering %s (%s) in %s.", name, candidate, self._name)
     # stack trace is [this_function, Register(), user_function,...]
     # so the user function is #2.
-    stack = traceback.extract_stack()
-    self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: stack[2]}
+    stack = tf_stack.extract_stack()
+    user_function = stack[2]
+    location_tag = tf_stack.convert_stack([user_function])[0]
+    self._registry[name] = {_TYPE_TAG: candidate, _LOCATION_TAG: location_tag}
 
   def list(self):
     """Lists registered items.
diff --git a/tensorflow/python/framework/registry_test.py b/tensorflow/python/framework/registry_test.py
index a821e16f26007632886532bfd868dbf8716eafb6..1a0d3f200d9427363ae36c19b6214ac6c9b75bec 100644
--- a/tensorflow/python/framework/registry_test.py
+++ b/tensorflow/python/framework/registry_test.py
@@ -45,7 +45,9 @@ class RegistryTest(test.TestCase):
   def testDuplicate(self):
     myreg = registry.Registry('testbar')
     myreg.register(bar, 'Bar')
-    with self.assertRaises(KeyError):
+    with self.assertRaisesRegexp(
+        KeyError, r'Registering two testbar with name \'Bar\'! '
+        r'\(Previous registration was in [^ ]+ .*.py:[0-9]+\)'):
       myreg.register(bar, 'Bar')
 
 
diff --git a/tensorflow/python/framework/smart_cond_test.py b/tensorflow/python/framework/smart_cond_test.py
index b8a9672b06da9b24d567a9779fb703ac7178d411..f964c87f0243bd00faf44a10f1468680a2fb272d 100644
--- a/tensorflow/python/framework/smart_cond_test.py
+++ b/tensorflow/python/framework/smart_cond_test.py
@@ -35,6 +35,7 @@ def raise_exception():
 
 class SmartCondTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTrue(self):
     with ops.Graph().as_default():
       with session.Session():
@@ -44,6 +45,7 @@ class SmartCondTest(test_util.TensorFlowTestCase):
                                   lambda: math_ops.multiply(y, 5))
         self.assertEqual(z.eval(), 32)
 
+  @test_util.run_deprecated_v1
   def testFalse(self):
     with ops.Graph().as_default():
       with session.Session():
@@ -99,6 +101,7 @@ class SmartCondTest(test_util.TensorFlowTestCase):
 
 class SmartCaseTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTrue(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(True, lambda: constant_op.constant(1)),
@@ -109,9 +112,10 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               exclusive=True)
     with session.Session() as sess:
       # No feed_dict necessary
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
+  @test_util.run_deprecated_v1
   def testFalse(self):
     conditions = [(False, raise_exception)]
     y = smart_cond.smart_case(conditions,
@@ -121,9 +125,10 @@ class SmartCaseTest(test_util.TensorFlowTestCase):
                               default=lambda: constant_op.constant(1),
                               exclusive=True)
     with session.Session() as sess:
-      self.assertEqual(sess.run(y), 1)
-      self.assertEqual(sess.run(z), 1)
+      self.assertEqual(self.evaluate(y), 1)
+      self.assertEqual(self.evaluate(z), 1)
 
+  @test_util.run_deprecated_v1
   def testMix(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     y = constant_op.constant(10)
diff --git a/tensorflow/python/framework/sparse_tensor.py b/tensorflow/python/framework/sparse_tensor.py
index 3643fc5e00475b8d2ebc2e2fc23fa6fd19bea114..5e1a95a26be034bff0a1f5eb996ac6f16c61e282 100644
--- a/tensorflow/python/framework/sparse_tensor.py
+++ b/tensorflow/python/framework/sparse_tensor.py
@@ -244,7 +244,7 @@ class SparseTensor(_TensorLike):
 
 SparseTensorValue = collections.namedtuple(
     "SparseTensorValue", ["indices", "values", "dense_shape"])
-tf_export("SparseTensorValue")(SparseTensorValue)
+tf_export(v1=["SparseTensorValue"])(SparseTensorValue)
 pywrap_tensorflow.RegisterType("SparseTensorValue", SparseTensorValue)
 
 
diff --git a/tensorflow/python/framework/sparse_tensor_test.py b/tensorflow/python/framework/sparse_tensor_test.py
index 22423c4f58ca510a2e247b9cd783d5596ca65e46..a999c12ca89b0c1746751eb04e9abfe380abf336 100644
--- a/tensorflow/python/framework/sparse_tensor_test.py
+++ b/tensorflow/python/framework/sparse_tensor_test.py
@@ -46,11 +46,11 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
       self.assertEqual(sp.get_shape(), (4, 5))
 
       with self.cached_session() as sess:
-        value = sp.eval()
+        value = self.evaluate(sp)
         self.assertAllEqual(indices, value.indices)
         self.assertAllEqual(values, value.values)
         self.assertAllEqual(shape, value.dense_shape)
-        sess_run_value = sess.run(sp)
+        sess_run_value = self.evaluate(sp)
         self.assertAllEqual(sess_run_value.indices, value.indices)
         self.assertAllEqual(sess_run_value.values, value.values)
         self.assertAllEqual(sess_run_value.dense_shape, value.dense_shape)
@@ -65,6 +65,7 @@ class SparseTensorTest(test_util.TensorFlowTestCase):
         sparse_tensor.is_sparse(
             sparse_tensor.SparseTensorValue([[0]], [0], [1])))
 
+  @test_util.run_deprecated_v1
   def testConsumers(self):
     sp = sparse_tensor.SparseTensor([[0, 0], [1, 2]], [1.0, 3.0], [3, 4])
     w = ops.convert_to_tensor(np.ones([4, 1], np.float32))
@@ -85,8 +86,9 @@ class ConvertToTensorOrSparseTensorTest(test_util.TensorFlowTestCase):
       value = [42, 43]
       from_value = sparse_tensor.convert_to_tensor_or_sparse_tensor(
           value)
-      self.assertAllEqual(value, from_value.eval())
+      self.assertAllEqual(value, self.evaluate(from_value))
 
+  @test_util.run_deprecated_v1
   def test_convert_sparse(self):
     with self.cached_session():
       indices = [[0, 1], [1, 0]]
diff --git a/tensorflow/python/framework/subscribe_test.py b/tensorflow/python/framework/subscribe_test.py
index cab426844d4eed1bfdb5a7978cd8d98eab3cf0cc..a74e96f9d9d6469b66426dd85628f926297afcd0 100644
--- a/tensorflow/python/framework/subscribe_test.py
+++ b/tensorflow/python/framework/subscribe_test.py
@@ -43,6 +43,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertTrue(
         all(subscribe._is_subscribed_identity(x) for x in container))
 
+  @test_util.run_deprecated_v1
   def testSideEffect(self):
     a = constant_op.constant(1)
     b = constant_op.constant(1)
@@ -66,15 +67,16 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertTrue(c.op in d.op.control_inputs)
 
     with self.cached_session() as sess:
-      c_out = sess.run([c])
-      n_out = sess.run([n])
-      d_out = sess.run([d])
+      c_out = self.evaluate([c])
+      n_out = self.evaluate([n])
+      d_out = self.evaluate([d])
 
     self.assertEqual(n_out, [-2])
     self.assertEqual(c_out, [2])
     self.assertEqual(d_out, [42])
     self.assertEqual(shared, [2, 2, 2])
 
+  @test_util.run_deprecated_v1
   def testSupportedTypes(self):
     """Confirm that supported types are correctly detected and handled."""
 
@@ -120,6 +122,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
       subscribe.subscribe(c.name,
                           lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
+  @test_util.run_deprecated_v1
   def testCaching(self):
     """Confirm caching of control output is recalculated between calls."""
     a = constant_op.constant(1)
@@ -145,13 +148,14 @@ class SubscribeTest(test_util.TensorFlowTestCase):
                             lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
     with self.cached_session() as sess:
-      c_out = sess.run([c])
-      d_out = sess.run([d])
+      c_out = self.evaluate([c])
+      d_out = self.evaluate([d])
 
     self.assertEqual(c_out, [42])
     self.assertEqual(d_out, [11])
     self.assertEqual(shared, {2: 1, 1: 1})
 
+  @test_util.run_deprecated_v1
   def testIsSubscribedIdentity(self):
     """Confirm subscribed identity ops are correctly detected."""
     a = constant_op.constant(1)
@@ -165,6 +169,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertFalse(subscribe._is_subscribed_identity(idop))
     self.assertTrue(subscribe._is_subscribed_identity(c_sub))
 
+  @test_util.run_deprecated_v1
   def testSubscribeExtend(self):
     """Confirm side effect are correctly added for different input types."""
     a = constant_op.constant(1)
@@ -205,11 +210,12 @@ class SubscribeTest(test_util.TensorFlowTestCase):
 
     # Expect the three side effect graphs to have been evaluated.
     with self.cached_session() as sess:
-      sess.run([c_sub])
+      self.evaluate([c_sub])
     self.assertIn('graph1', shared)
     self.assertIn('graph2', shared)
     self.assertIn('graph3', shared)
 
+  @test_util.run_v1_only('b/120545219')
   def testSubscribeVariable(self):
     """Confirm that variables can be subscribed."""
     v1 = variables.VariableV1(0.0)
@@ -229,25 +235,26 @@ class SubscribeTest(test_util.TensorFlowTestCase):
 
     with self.cached_session() as sess:
       # Initialize the variables first.
-      sess.run([v1.initializer])
-      sess.run([v2.initializer])
+      self.evaluate([v1.initializer])
+      self.evaluate([v2.initializer])
 
       # Expect the side effects to be triggered when evaluating the add op as
       # it will read the value of the variable.
-      sess.run([add])
+      self.evaluate([add])
       self.assertEqual(1, len(shared))
 
       # Expect the side effect not to be triggered when evaluating the assign
       # op as it will not access the 'read' output of the variable.
-      sess.run([assign_v1])
+      self.evaluate([assign_v1])
       self.assertEqual(1, len(shared))
 
-      sess.run([add])
+      self.evaluate([add])
       self.assertEqual(2, len(shared))
 
       # Make sure the values read from the variable match the expected ones.
       self.assertEqual([0.0, 3.0], shared)
 
+  @test_util.run_v1_only('b/120545219')
   def testResourceType(self):
     """Confirm that subscribe correctly handles tensors with 'resource' type."""
     tensor_array = tensor_array_ops.TensorArray(
@@ -273,9 +280,10 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertFalse(subscribe._is_subscribed_identity(tensor_array.handle))
 
     with self.cached_session() as sess:
-      sess.run([reader])
+      self.evaluate([reader])
     self.assertEqual(0, len(shared))
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputs(self):
     """Handle subscriptions to multiple outputs from the same op."""
     sparse_tensor_1 = sparse_tensor.SparseTensor(
@@ -304,11 +312,12 @@ class SubscribeTest(test_util.TensorFlowTestCase):
                         lambda t: script_ops.py_func(sub, [t], [t.dtype]))
 
     with self.cached_session() as sess:
-      sess.run([neg])
+      self.evaluate([neg])
 
     # All three ops have been processed.
     self.assertEqual(3, len(shared))
 
+  @test_util.run_deprecated_v1
   def test_subscribe_tensors_on_different_devices(self):
     """Side effect ops are added with the same device of the subscribed op."""
     c1 = constant_op.constant(10)
@@ -335,6 +344,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertEqual(add.device, add_sub.device)
     self.assertEqual(mul.device, mul_sub.device)
 
+  @test_util.run_v1_only('b/120545219')
   def test_subscribe_tensors_within_control_flow_context(self):
     """Side effect ops are added with the same control flow context."""
     c1 = constant_op.constant(10)
@@ -375,7 +385,7 @@ class SubscribeTest(test_util.TensorFlowTestCase):
     self.assertIsNot(context(subscriptions[0]), context(subscriptions[1]))
 
     with self.cached_session() as sess:
-      sess.run(cond)
+      self.evaluate(cond)
 
     self.assertEqual(3, len(results))
 
diff --git a/tensorflow/python/framework/tensor_shape.py b/tensorflow/python/framework/tensor_shape.py
index 5a58d271488080eb1ba0036ec60404b5e28adb76..960a3dad7389553955c999e444a9f98c1857f588 100644
--- a/tensorflow/python/framework/tensor_shape.py
+++ b/tensorflow/python/framework/tensor_shape.py
@@ -169,7 +169,7 @@ def dimension_at_index(shape, index):
     return shape.dims[index]
 
 
-@tf_export("Dimension")
+@tf_export(v1=["Dimension"])
 class Dimension(object):
   """Represents the value of one dimension in a TensorShape."""
 
diff --git a/tensorflow/python/framework/tensor_spec.py b/tensorflow/python/framework/tensor_spec.py
index fbea930fe0e6a4545b9a5ac55c0a7684b3cd8e28..c44636edc4ec5101c588766714c98a7da15793e4 100644
--- a/tensorflow/python/framework/tensor_spec.py
+++ b/tensorflow/python/framework/tensor_spec.py
@@ -24,14 +24,15 @@ from tensorflow.python.framework import common_shapes
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.util.tf_export import tf_export
 
 
+@tf_export("TensorSpec")
 class TensorSpec(object):
   """Describes a tf.Tensor.
 
-  A TensorSpec allows an API to describe the Tensors that it accepts or
-  returns, before that Tensor exists. This allows dynamic and flexible graph
-  construction and configuration.
+  Metadata for describing the `tf.Tensor` objects accepted or returned
+  by some TensorFlow APIs.
   """
 
   __slots__ = ["_shape", "_shape_tuple", "_dtype", "_name"]
@@ -69,11 +70,6 @@ class TensorSpec(object):
     else:
       raise ValueError("`tensor` should be a tf.Tensor")
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return False
-
   @property
   def shape(self):
     """Returns the `TensorShape` that represents the shape of the tensor."""
@@ -86,21 +82,21 @@ class TensorSpec(object):
 
   @property
   def name(self):
-    """Returns the name of the described tensor."""
+    """Returns the (optionally provided) name of the described tensor."""
     return self._name
 
-  @property
-  def is_discrete(self):
-    """Whether spec is discrete."""
-    return self.dtype.is_integer
+  def is_compatible_with(self, spec_or_tensor):
+    """Returns True if spec_or_tensor is compatible with this TensorSpec.
 
-  @property
-  def is_continuous(self):
-    """Whether spec is continuous."""
-    return self.dtype.is_floating
+    Two tensors are considered compatible if they have the same dtype
+    and their shapes are compatible (see `tf.TensorShape.is_compatible_with`).
 
-  def is_compatible_with(self, spec_or_tensor):
-    """True if the shape and dtype of `spec_or_tensor` are compatible."""
+    Args:
+      spec_or_tensor: A tf.TensorSpec or a tf.Tensor
+
+    Returns:
+      True if spec_or_tensor is compatible with self.
+    """
     return (self._dtype.is_compatible_with(spec_or_tensor.dtype) and
             self._shape.is_compatible_with(spec_or_tensor.shape))
 
@@ -188,11 +184,6 @@ class BoundedTensorSpec(TensorSpec):
     self._maximum = np.array(maximum, dtype=self.dtype.as_numpy_dtype())
     self._maximum.setflags(write=False)
 
-  @classmethod
-  def is_bounded(cls):
-    del cls
-    return True
-
   @classmethod
   def from_spec(cls, spec):
     dtype = dtypes.as_dtype(spec.dtype)
@@ -223,4 +214,3 @@ class BoundedTensorSpec(TensorSpec):
   def __reduce__(self):
     return BoundedTensorSpec, (self._shape, self._dtype, self._minimum,
                                self._maximum, self._name)
-
diff --git a/tensorflow/python/framework/tensor_spec_test.py b/tensorflow/python/framework/tensor_spec_test.py
index 40611e5f840db224f6343f9fdb3852b58c45f5a6..75c197df09e97b8e5c9ebf15ffb33206f69a172f 100644
--- a/tensorflow/python/framework/tensor_spec_test.py
+++ b/tensorflow/python/framework/tensor_spec_test.py
@@ -45,6 +45,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     desc = tensor_spec.TensorSpec(shape=None, dtype=dtypes.float32)
     self.assertEqual(desc.shape, tensor_shape.TensorShape(None))
 
+  @test_util.run_deprecated_v1
   def testShapeCompatibility(self):
     unknown = array_ops.placeholder(dtypes.int64)
     partial = array_ops.placeholder(dtypes.int64, shape=[None, 1])
@@ -75,6 +76,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertFalse(desc_rank3.is_compatible_with(full))
     self.assertTrue(desc_rank3.is_compatible_with(rank3))
 
+  @test_util.run_deprecated_v1
   def testTypeCompatibility(self):
     floats = array_ops.placeholder(dtypes.float32, shape=[10, 10])
     ints = array_ops.placeholder(dtypes.int32, shape=[10, 10])
@@ -106,6 +108,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     spec_2 = tensor_spec.TensorSpec.from_spec(spec_1)
     self.assertEqual(spec_1, spec_2)
 
+  @test_util.run_deprecated_v1
   def testFromTensor(self):
     zero = constant_op.constant(0)
     spec = tensor_spec.TensorSpec.from_tensor(zero)
@@ -113,6 +116,7 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(spec.shape, [])
     self.assertEqual(spec.name, "Const")
 
+  @test_util.run_deprecated_v1
   def testFromPlaceholder(self):
     unknown = array_ops.placeholder(dtypes.int64, name="unknown")
     partial = array_ops.placeholder(dtypes.float32,
@@ -134,22 +138,6 @@ class TensorSpecTest(test_util.TensorFlowTestCase):
     self.assertEqual(bounded_spec.dtype, spec.dtype)
     self.assertEqual(bounded_spec.name, spec.name)
 
-  def testIsDiscrete(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertTrue(discrete_spec.is_discrete)
-    self.assertFalse(continuous_spec.is_discrete)
-
-  def testIsContinuous(self):
-    discrete_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    continuous_spec = tensor_spec.TensorSpec((1, 2), dtypes.float32)
-    self.assertFalse(discrete_spec.is_continuous)
-    self.assertTrue(continuous_spec.is_continuous)
-
-  def testIsBounded(self):
-    unbounded_spec = tensor_spec.TensorSpec((1, 2), dtypes.int32)
-    self.assertFalse(unbounded_spec.is_bounded())
-
   def testSerialization(self):
     desc = tensor_spec.TensorSpec([1, 5], dtypes.float32, "test")
     self.assertEqual(pickle.loads(pickle.dumps(desc)), desc)
@@ -165,11 +153,6 @@ class BoundedTensorSpecTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "not compatible"):
       tensor_spec.BoundedTensorSpec((3, 5), dtypes.uint8, 0, (1, 1, 1))
 
-  def testIsBounded(self):
-    bounded_spec = tensor_spec.BoundedTensorSpec(
-        (1, 2), dtypes.int32, minimum=0, maximum=1)
-    self.assertTrue(bounded_spec.is_bounded())
-
   def testMinimumMaximumAttributes(self):
     spec = tensor_spec.BoundedTensorSpec(
         (1, 2, 3), dtypes.float32, 0, (5, 5, 5))
diff --git a/tensorflow/python/framework/tensor_util.py b/tensorflow/python/framework/tensor_util.py
index 9db94f5288cc515e5a764a19520c057bffa64a9b..f98f301b38a946146df3051db9b8d26c8b816b33 100644
--- a/tensorflow/python/framework/tensor_util.py
+++ b/tensorflow/python/framework/tensor_util.py
@@ -371,8 +371,10 @@ def _AssertCompatible(values, dtype):
                       (dtype.name, repr(mismatch), type(mismatch).__name__))
 
 
+# pylint: disable=invalid-name
 @tf_export(v1=["make_tensor_proto"])
-def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
+def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False,
+                      allow_broadcast=False):
   """Create a TensorProto.
 
   Args:
@@ -380,6 +382,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
     dtype:          Optional tensor_pb2 DataType value.
     shape:          List of integers representing the dimensions of tensor.
     verify_shape:   Boolean that enables verification of a shape of values.
+    allow_broadcast:Boolean that enables allowing scalars and 1 length vector
+        broadcasting. Cannot be true when verify_shape is true.
 
   Returns:
     A `TensorProto`. Depending on the type, it may contain data in the
@@ -416,6 +420,8 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   can not have more elements than what "shape" specifies.
 
   """
+  if allow_broadcast and verify_shape:
+    raise ValueError("allow_broadcast and verify_shape are not both allowed.")
   if isinstance(values, tensor_pb2.TensorProto):
     return values
 
@@ -504,15 +510,22 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
     shape_size = np.prod(shape, dtype=np.int64)
     is_same_size = shape_size == nparray.size
 
-    if verify_shape:
-      if not nparray.shape == tuple(shape):
+    if allow_broadcast:
+      if nparray.shape == (1,) or nparray.shape == tuple():
+        pass
+      elif nparray.size != shape_size:
         raise TypeError("Expected Tensor's shape: %s, got %s." %
                         (tuple(shape), nparray.shape))
 
-    if nparray.size > shape_size:
-      raise ValueError(
-          "Too many elements provided. Needed at most %d, but received %d" %
-          (shape_size, nparray.size))
+    else:
+      if verify_shape and nparray.shape != tuple(shape):
+        raise TypeError("Expected Tensor's shape: %s, got %s." %
+                        (tuple(shape), nparray.shape))
+
+      if nparray.size > shape_size:
+        raise ValueError(
+            "Too many elements provided. Needed at most %d, but received %d" %
+            (shape_size, nparray.size))
 
   tensor_proto = tensor_pb2.TensorProto(
       dtype=numpy_dtype.as_datatype_enum,
@@ -560,6 +573,7 @@ def make_tensor_proto(values, dtype=None, shape=None, verify_shape=False):
   append_fn(tensor_proto, proto_values)
 
   return tensor_proto
+# pylint: enable=invalid-name
 
 
 @tf_export("make_ndarray")
diff --git a/tensorflow/python/framework/tensor_util_test.py b/tensorflow/python/framework/tensor_util_test.py
index bdf759f22047fe62a7820bc170654fed07f7adc9..00337546186d3a01313a49d11dd266e6dade3227 100644
--- a/tensorflow/python/framework/tensor_util_test.py
+++ b/tensorflow/python/framework/tensor_util_test.py
@@ -758,6 +758,7 @@ class TensorUtilTest(test.TestCase):
     self.assertFalse(tensor_util.ShapeEquals(t, [1, 4]))
     self.assertFalse(tensor_util.ShapeEquals(t, [4]))
 
+  @test_util.run_deprecated_v1
   def testMockArray(self):
 
     class MockArray(object):
@@ -771,7 +772,7 @@ class TensorUtilTest(test.TestCase):
     with self.cached_session() as sess:
       ma = MockArray(np.array([10, 20, 30]))
       t = ops.convert_to_tensor(ma)
-      a = sess.run(t)
+      a = self.evaluate(t)
       self.assertEquals(np.int64, a.dtype)
       self.assertAllClose(np.array([10, 20, 30], dtype=np.int64), a)
 
@@ -787,6 +788,7 @@ class ConstantValueTest(test.TestCase):
     tf_val = constant_op.constant(np_val)
     self.assertAllClose(np_val, tensor_util.constant_value(tf_val))
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     tf_val = gen_state_ops.variable(
         shape=[3, 4, 7],
@@ -815,12 +817,14 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertEqual(6, c_val)
 
+  @test_util.run_deprecated_v1
   def testSizeOfScalar(self):
     tf_val = array_ops.size(constant_op.constant(0.0))
     c_val = tensor_util.constant_value(tf_val)
     self.assertEqual(1, c_val)
     self.assertEqual(np.ndarray, type(c_val))
 
+  @test_util.run_deprecated_v1
   def testRank(self):
     tf_val = array_ops.rank(constant_op.constant(0.0, shape=[1, 2, 3]))
     c_val = tensor_util.constant_value(tf_val)
@@ -852,6 +856,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertAllClose(np_val.astype(np.float64), c_val)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     np_val = np.random.rand(3, 4, 7).astype(np.float32)
     tf_val = array_ops.concat(
@@ -871,6 +876,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Axis0(self):
     inputs = [np.random.rand(4, 7) for _ in range(3)]
     np_val = np.array(inputs)
@@ -883,6 +889,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Axis1(self):
     inputs = [np.random.rand(4, 7) for _ in range(3)]
     tf_val = array_ops.stack(inputs, axis=1)
@@ -894,6 +901,7 @@ class ConstantValueTest(test.TestCase):
     c_val = tensor_util.constant_value(tf_val)
     self.assertIs(None, c_val)
 
+  @test_util.run_deprecated_v1
   def testPack_Partial_Axis0(self):
     input_ = np.random.rand(4, 7)
     tf_val = array_ops.stack([input_, array_ops.placeholder(dtypes.float32)])
@@ -901,6 +909,7 @@ class ConstantValueTest(test.TestCase):
     self.assertAllClose(input_, c_val[0])
     self.assertIsNone(c_val[1])
 
+  @test_util.run_deprecated_v1
   def testPack_Partial_Axis1(self):
     input_ = np.random.rand(4, 7)
     tf_val = array_ops.stack([input_, array_ops.placeholder(dtypes.float32)],
@@ -966,12 +975,14 @@ class ConstantValueAsShapeTest(test.TestCase):
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([None, 1, None], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testPack(self):
     tf_val = array_ops.stack(
         [constant_op.constant(16), 37, array_ops.placeholder(dtypes.int32)])
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([16, 37, None], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     tf_val = array_ops.concat(
         [[16, 37], array_ops.placeholder(
@@ -985,6 +996,7 @@ class ConstantValueAsShapeTest(test.TestCase):
     c_val = tensor_util.constant_value_as_shape(tf_val)
     self.assertEqual([16, 37, None, 48], c_val.as_list())
 
+  @test_util.run_deprecated_v1
   def testSlice(self):
     tf_val = array_ops.placeholder(dtypes.int32, shape=(4,))[0:2]
     c_val = tensor_util.constant_value_as_shape(tf_val)
diff --git a/tensorflow/python/framework/test_util.py b/tensorflow/python/framework/test_util.py
index fd55ad2af9e7527758f8ce6d6d35d7514a58ab5f..df3cebd2e0c2f37711dc41cf60409c2660bf3e2c 100644
--- a/tensorflow/python/framework/test_util.py
+++ b/tensorflow/python/framework/test_util.py
@@ -50,9 +50,11 @@ from tensorflow.core.framework import graph_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python import tf2
 from tensorflow.python.client import device_lib
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import tape
 from tensorflow.python.framework import device as pydev
 from tensorflow.python.framework import dtypes
@@ -66,6 +68,8 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import versions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import script_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import googletest
 from tensorflow.python.platform import tf_logging as logging
@@ -74,6 +78,7 @@ from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import memory
 from tensorflow.python.util import nest
+from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
 from tensorflow.python.util.protobuf import compare
 from tensorflow.python.util.tf_export import tf_export
@@ -114,8 +119,28 @@ def assert_ops_in_graph(expected_ops, graph):
   return actual_ops
 
 
-@tf_export("test.assert_equal_graph_def")
-def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
+@tf_export("test.assert_equal_graph_def", v1=[])
+def assert_equal_graph_def_v2(actual, expected):
+  """Asserts that two `GraphDef`s are (mostly) the same.
+
+  Compares two `GraphDef` protos for equality, ignoring versions and ordering of
+  nodes, attrs, and control inputs.  Node names are used to match up nodes
+  between the graphs, so the naming of nodes must be consistent. This function
+  ignores randomized attribute values that may appear in V2 checkpoints.
+
+  Args:
+    actual: The `GraphDef` we have.
+    expected: The `GraphDef` we expected.
+
+  Raises:
+    AssertionError: If the `GraphDef`s do not match.
+    TypeError: If either argument is not a `GraphDef`.
+  """
+  assert_equal_graph_def(actual, expected, checkpoint_v2=True)
+
+
+@tf_export(v1=["test.assert_equal_graph_def"])
+def assert_equal_graph_def_v1(actual, expected, checkpoint_v2=False):
   """Asserts that two `GraphDef`s are (mostly) the same.
 
   Compares two `GraphDef` protos for equality, ignoring versions and ordering of
@@ -132,6 +157,10 @@ def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
     AssertionError: If the `GraphDef`s do not match.
     TypeError: If either argument is not a `GraphDef`.
   """
+  assert_equal_graph_def(actual, expected, checkpoint_v2)
+
+
+def assert_equal_graph_def(actual, expected, checkpoint_v2=False):
   if not isinstance(actual, graph_pb2.GraphDef):
     raise TypeError(
         "Expected tf.GraphDef for actual, got %s" % type(actual).__name__)
@@ -354,53 +383,12 @@ def skip_if(condition):
 
 
 def enable_c_shapes(fn):
-  """Decorator for enabling C shapes on a test.
-
-  Note this enables the C shapes after running the test class's setup/teardown
-  methods.
-
-  Args:
-    fn: the function to be wrapped
-
-  Returns:
-    The wrapped function
-  """
-
-  # pylint: disable=protected-access
-  def wrapper(*args, **kwargs):
-    prev_value = ops._USE_C_SHAPES
-    ops._USE_C_SHAPES = True
-    try:
-      fn(*args, **kwargs)
-    finally:
-      ops._USE_C_SHAPES = prev_value
-
-  # pylint: enable=protected-access
-
-  return wrapper
+  """No-op. TODO(b/74620627): Remove this."""
+  return fn
 
 
 def with_c_shapes(cls):
-  """Adds methods that call original methods but with C API shapes enabled.
-
-  Note this enables C shapes in new methods after running the test class's
-  setup method.
-
-  Args:
-    cls: class to decorate
-
-  Returns:
-    cls with new test methods added
-  """
-  # If C shapes are already enabled, don't do anything. Some tests break if the
-  # same test is run twice, so this allows us to turn on the C shapes by default
-  # without breaking these tests.
-  if ops._USE_C_SHAPES:
-    return cls
-
-  for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith("test"):
-      setattr(cls, name + "WithCShapes", enable_c_shapes(value))
+  """No-op. TODO(b/74620627): Remove this."""
   return cls
 
 
@@ -423,13 +411,40 @@ def enable_control_flow_v2(fn):
   def wrapper(*args, **kwargs):
     enable_cond_v2_old = control_flow_ops.ENABLE_COND_V2
     enable_while_v2_old = control_flow_ops.ENABLE_WHILE_V2
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
     control_flow_ops.ENABLE_COND_V2 = True
     control_flow_ops.ENABLE_WHILE_V2 = True
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
     try:
       fn(*args, **kwargs)
     finally:
       control_flow_ops.ENABLE_COND_V2 = enable_cond_v2_old
       control_flow_ops.ENABLE_WHILE_V2 = enable_while_v2_old
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
+
+  return wrapper
+
+
+def enable_tensor_array_v2(fn):
+  """Decorator for enabling _GraphTensorArrayV2 on a test.
+
+  Note this enables _GraphTensorArrayV2 after running the test class's
+  setup/teardown methods.
+
+  Args:
+    fn: the function to be wrapped
+
+  Returns:
+    The wrapped function
+  """
+
+  def wrapper(*args, **kwargs):
+    enable_tensor_array_v2_old = tensor_array_ops.ENABLE_TENSOR_ARRAY_V2
+    tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = True
+    try:
+      fn(*args, **kwargs)
+    finally:
+      tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 = enable_tensor_array_v2_old
 
   return wrapper
 
@@ -482,7 +497,8 @@ def with_control_flow_v2(cls):
     return cls
 
   for name, value in cls.__dict__.copy().items():
-    if (callable(value) and name.startswith("test") and
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
         not getattr(value, "_disable_control_flow_v2", False)):
       setattr(cls, name + "WithControlFlowV2", enable_control_flow_v2(value))
   return cls
@@ -881,8 +897,10 @@ def run_all_in_graph_and_eager_modes(cls):
   """Execute all test methods in the given class with and without eager."""
   base_decorator = run_in_graph_and_eager_modes
   for name, value in cls.__dict__.copy().items():
-    if callable(value) and name.startswith(
-        "test") and not name.startswith("testSkipEager"):
+    if (callable(value) and
+        name.startswith(unittest.TestLoader.testMethodPrefix) and
+        not (name.startswith("testSkipEager")
+             or name.startswith("test_skip_eager"))):
       setattr(cls, name, base_decorator(value))
   return cls
 
@@ -949,7 +967,7 @@ def run_in_graph_and_eager_modes(func=None,
   def decorator(f):
     if tf_inspect.isclass(f):
       raise ValueError(
-          "`run_test_in_graph_and_eager_modes` only supports test methods. "
+          "`run_in_graph_and_eager_modes` only supports test methods. "
           "Did you mean to use `run_all_in_graph_and_eager_modes`?")
 
     def decorated(self, *args, **kwargs):
@@ -994,6 +1012,235 @@ def run_in_graph_and_eager_modes(func=None,
   return decorator
 
 
+def py_func_if_in_function(f):
+
+  def decorated(*args, **kwds):
+    if not ops.get_default_graph()._building_function:
+      return f(*args, **kwds)
+
+    tensor_args, tensor_indices = zip(
+        *[(x, i) for i, x in enumerate(args)
+          if isinstance(x, (ops.Tensor, variables.Variable))])
+
+    def inner_f(*inner_tensor_args):
+      my_args = list(args)
+      for i, n in zip(tensor_indices, inner_tensor_args):
+        my_args[i] = n
+      return f(*my_args, **kwds)
+
+    return script_ops.py_func(inner_f, tensor_args, [])
+
+  return tf_decorator.make_decorator(f, decorated)
+
+
+def also_run_as_tf_function(f):
+  """Runs the decorated test twice--once as is, once inside a tf.function.
+
+  This allows you to run a test both in eager execution and inside a
+  tf.function, exercising the two execution modes supported in tf 2.0. The test
+  assertions are automatically done inside tf.py_funcs, and tf.function ensures
+  that they run in the proper order and with the proper side effects.
+
+  Currently variable creation is not supported in tests annotated with this
+  decorator since it's tricky to ensure the variable doesn't get repeatedly
+  created when retracing the tf.function.
+
+  Args:
+    f: the test method to be decorated
+
+  Returns:
+    The decorated test method, which will run both in eager and inside a
+    tf.function.
+  """
+
+  def decorated(*args, **kwds):
+    with context.eager_mode():
+      # Running in eager mode
+      f(*args, **kwds)
+
+      defun_f = def_function.function(f)
+      defun_f(*args, **kwds)
+
+  return decorated
+
+
+def run_deprecated_v1(func=None):
+  """Execute the decorated test in graph mode.
+
+  This function returns a decorator intended to be applied to tests that have
+  not been updated to a style that is compatible with both TensorFlow 1.x and
+  2.x. When this decorated is applied, the test body will be run in
+  an environment where API calls construct graphs instead of executing eagerly.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+  Returns:
+    Returns a decorator that will run the decorated test method in graph mode.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_deprecated_v1` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if tf2.enabled():
+        with context.graph_mode():
+          f(self, *args, **kwargs)
+      else:
+        f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_v1_only(reason, func=None):
+  """Execute the decorated test only if running in v1 mode.
+
+  This function is intended to be applied to tests that exercise v1 only
+  functionality. If the test is run in v2 mode it will simply be skipped.
+
+  Args:
+    reason: string giving a reason for limiting the test to v1 only.
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      setup = f.__dict__.get("setUp")
+      if setup is not None:
+        setattr(f, "setUp", decorator(setup))
+
+      for name, value in f.__dict__.copy().items():
+        if (callable(value) and
+            name.startswith(unittest.TestLoader.testMethodPrefix)):
+          setattr(f, name, decorator(value))
+
+      return f
+
+    def decorated(self, *args, **kwargs):
+      if tf2.enabled():
+        self.skipTest(reason)
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_v2_only(func=None):
+  """Execute the decorated test only if running in v2 mode.
+
+  This function is intended to be applied to tests that exercise v2 only
+  functionality. If the test is run in v1 mode it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_v2_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not tf2.enabled():
+        self.skipTest("Test is only comptaible in v2")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_gpu_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a GPU. If a GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_gpu_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available():
+        self.skipTest("Test requires GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
+def run_cuda_only(func=None):
+  """Execute the decorated test only if a GPU is available.
+
+  This function is intended to be applied to tests that require the precense
+  of a CUDA GPU. If a CUDA GPU is absent, it will simply be skipped.
+
+  Args:
+    func: function to be annotated. If `func` is None, this method returns a
+      decorator the can be applied to a function. If `func` is not None this
+      returns the decorator applied to `func`.
+
+  Returns:
+    Returns a decorator that will conditionally skip the decorated test method.
+  """
+
+  def decorator(f):
+    if tf_inspect.isclass(f):
+      raise ValueError("`run_cuda_only` only supports test methods.")
+
+    def decorated(self, *args, **kwargs):
+      if not is_gpu_available(cuda_only=True):
+        self.skipTest("Test requires CUDA GPU")
+
+      f(self, *args, **kwargs)
+
+    return decorated
+
+  if func is not None:
+    return decorator(func)
+
+  return decorator
+
+
 @tf_export("test.is_gpu_available")
 def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
   """Returns whether TensorFlow can access a GPU.
@@ -1033,7 +1280,7 @@ def is_gpu_available(cuda_only=False, min_cuda_compute_capability=None):
         return True
     return False
   except errors_impl.NotFoundError as e:
-    if not all([x in str(e) for x in ["CUDA", "not find"]]):
+    if not all(x in str(e) for x in ["CUDA", "not find"]):
       raise e
     else:
       logging.error(str(e))
@@ -1051,6 +1298,27 @@ def device(use_gpu):
     yield
 
 
+@contextlib.contextmanager
+def use_gpu():
+  """Uses gpu when requested and available."""
+  with device(use_gpu=True):
+    yield
+
+
+@contextlib.contextmanager
+def force_gpu():
+  """Force the gpu to be used."""
+  with ops.device("/device:GPU:0"):
+    yield
+
+
+@contextlib.contextmanager
+def force_cpu():
+  """Force the cpu to be used."""
+  with ops.device("/device:CPU:0"):
+    yield
+
+
 class CapturedWrites(object):
   """A utility class to load the captured writes made to a stream."""
 
@@ -1064,6 +1332,63 @@ class CapturedWrites(object):
     return output_data
 
 
+class FakeEagerSession(object):
+  """Fake session so tests that conditionally use placeholders can use eager.
+
+  There are a number of tests that conditionally use placeholders for shape
+  inference. The pattern is demonstrated here:
+
+  ```python
+  with self.cached_session() as sess:
+    if static_shape:
+      y = math_ops.matmul(x, ...)
+      feed_dict = {}
+    else:
+      x_ph = array_ops.placeholder(...)
+      y = math_ops.matmul(x_ph, ...)
+      feed_dict = {x_ph: x}
+    val = sess.run(y, feed_dict=feed_dict)
+  ```
+
+  Since the feed_dict is empty when not using placeholders we should be able to
+  call self.evaluate(), however this requires rewriting the test case.
+  This class shold be considered a stop-gap solution to get tests running with
+  eager with minimal changes to the actual test.
+  """
+
+  def __init__(self, test_case):
+    self._test_case = test_case
+
+  def run(self, fetches, *args, **kwargs):
+    """Evalaute `fetches`.
+
+    Fail if additional args are specified.
+
+    Args:
+      fetches: A Tensor or a nested list/tuple of Tensors.
+      *args: Positional arguments
+      **kwargs: Keyword arguments
+
+    Raises:
+      RuntimeError: If args or kwargs are specified.
+
+    Returns:
+      Tensors as numpy values.
+    """
+    feed_dict = kwargs.pop("feed_dict", {})
+    if feed_dict:
+      raise RuntimeError(
+          "feed_dict is not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    if args or kwargs:
+      raise RuntimeError(
+          "Optional args are not supported when eager execution is enabled "
+          "(in this case, sess.run(t) is shorthand for t.numpy()")
+
+    return self._test_case.evaluate(fetches)
+
+
 class ErrorLoggingSession(session.Session):
   """Wrapper around a Session that logs errors in run().
   """
@@ -1105,6 +1430,10 @@ class TensorFlowTestCase(googletest.TestCase):
     ops.reset_default_graph()
     random_seed.set_random_seed(random_seed.DEFAULT_GRAPH_SEED)
 
+    # Avoiding calling setUp() for the poorly named test_session method.
+    if self.id().endswith(".test_session"):
+      self.skipTest("Not a test.")
+
   def tearDown(self):
     for thread in self._threads:
       thread.check_termination()
@@ -1371,7 +1700,7 @@ class TensorFlowTestCase(googletest.TestCase):
       the graph building and execution code in a test case.
     """
     if context.executing_eagerly():
-      yield None
+      yield FakeEagerSession(self)
     else:
       sess = self._get_cached_session(
           graph, config, force_gpu, crash_if_inconsistent_args=True)
@@ -1390,7 +1719,6 @@ class TensorFlowTestCase(googletest.TestCase):
     """Use cached_session instead."""
     if self.id().endswith(".test_session"):
       self.skipTest("Not a test.")
-
     if context.executing_eagerly():
       yield None
     else:
@@ -1513,8 +1841,8 @@ class TensorFlowTestCase(googletest.TestCase):
     return ret
 
 
-# pylint: enable=invalid-name
-
+  # pylint: enable=invalid-name
+  @py_func_if_in_function
   def assertNear(self, f1, f2, err, msg=None):
     """Asserts that two floats are near each other.
 
@@ -1533,6 +1861,7 @@ class TensorFlowTestCase(googletest.TestCase):
         "%f != %f +/- %f%s" % (f1, f2, err, " (%s)" % msg
                                if msg is not None else ""))
 
+  @py_func_if_in_function
   def assertArrayNear(self, farray1, farray2, err, msg=None):
     """Asserts that two float arrays are near each other.
 
@@ -1552,6 +1881,7 @@ class TensorFlowTestCase(googletest.TestCase):
   def _NDArrayNear(self, ndarray1, ndarray2, err):
     return np.linalg.norm(ndarray1 - ndarray2) < err
 
+  @py_func_if_in_function
   def assertNDArrayNear(self, ndarray1, ndarray2, err, msg=None):
     """Asserts that two numpy arrays have near values.
 
@@ -1689,6 +2019,7 @@ class TensorFlowTestCase(googletest.TestCase):
         e.args = ((e.args[0] + " : " + msg,) + e.args[1:])
         raise
 
+  @py_func_if_in_function
   def assertAllClose(self, a, b, rtol=1e-6, atol=1e-6, msg=None):
     """Asserts that two structures of numpy arrays or Tensors, have near values.
 
@@ -1714,6 +2045,7 @@ class TensorFlowTestCase(googletest.TestCase):
     """
     self._assertAllCloseRecursive(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertAllCloseAccordingToType(self,
                                     a,
                                     b,
@@ -1761,6 +2093,7 @@ class TensorFlowTestCase(googletest.TestCase):
 
     self.assertAllClose(a, b, rtol=rtol, atol=atol, msg=msg)
 
+  @py_func_if_in_function
   def assertNotAllClose(self, a, b, **kwargs):
     """Assert that two numpy arrays, or or Tensors, do not have near values.
 
@@ -1779,6 +2112,7 @@ class TensorFlowTestCase(googletest.TestCase):
       return
     raise AssertionError("The two values are close at all elements")
 
+  @py_func_if_in_function
   def assertAllEqual(self, a, b, msg=None):
     """Asserts that two numpy arrays or Tensors have the same values.
 
@@ -1821,6 +2155,7 @@ class TensorFlowTestCase(googletest.TestCase):
       msgs.append("not equal rhs = {}".format(y))
       np.testing.assert_array_equal(a, b, err_msg="\n".join(msgs))
 
+  @py_func_if_in_function
   def assertAllGreater(self, a, comparison_target):
     """Assert element values are all greater than a target value.
 
@@ -1832,6 +2167,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreater(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLess(self, a, comparison_target):
     """Assert element values are all less than a target value.
 
@@ -1843,6 +2179,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertLess(np.max(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllGreaterEqual(self, a, comparison_target):
     """Assert element values are all greater than or equal to a target value.
 
@@ -1854,6 +2191,7 @@ class TensorFlowTestCase(googletest.TestCase):
     a = self._GetNdArray(a)
     self.assertGreaterEqual(np.min(a), comparison_target)
 
+  @py_func_if_in_function
   def assertAllLessEqual(self, a, comparison_target):
     """Assert element values are all less than or equal to a target value.
 
@@ -1896,6 +2234,7 @@ class TensorFlowTestCase(googletest.TestCase):
       lines.append(prefix + "...")
     return lines
 
+  @py_func_if_in_function
   def assertAllInRange(self,
                        target,
                        lower_bound,
@@ -1954,6 +2293,7 @@ class TensorFlowTestCase(googletest.TestCase):
           "Subscript(s) and value(s) of the offending elements:\n" +
           "\n".join(self._format_subscripts(violation_subscripts, target)))
 
+  @py_func_if_in_function
   def assertAllInSet(self, target, expected_set):
     """Assert that elements of a Tensor are all in a given closed set.
 
@@ -1975,6 +2315,7 @@ class TensorFlowTestCase(googletest.TestCase):
       raise AssertionError("%d unique element(s) are not in the set %s: %s" %
                            (np.size(diff), expected_set, diff))
 
+  @py_func_if_in_function
   def assertDTypeEqual(self, target, expected_dtype):
     """Assert ndarray data type is equal to expected.
 
diff --git a/tensorflow/python/framework/test_util_test.py b/tensorflow/python/framework/test_util_test.py
index cbefe86481421396c0d67f042cf876e3b8e39b53..dfdced5a9886089884fede9dea9b69587499e28f 100644
--- a/tensorflow/python/framework/test_util_test.py
+++ b/tensorflow/python/framework/test_util_test.py
@@ -49,6 +49,7 @@ from tensorflow.python.platform import googletest
 
 class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_assert_ops_in_graph(self):
     with self.test_session():
       constant_op.constant(["hello", "taffy"], name="hello")
@@ -60,6 +61,7 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertRaises(ValueError, test_util.assert_ops_in_graph,
                       {"hello": "Variable"}, ops.get_default_graph())
 
+  @test_util.run_deprecated_v1
   def test_session_functions(self):
     with self.test_session() as sess:
       sess_ref = weakref.ref(sess)
@@ -551,6 +553,7 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaises(AssertionError):
       self.assertAllLessEqual(x, 95.0)
 
+  @test_util.run_deprecated_v1
   def testAssertAllInRangeWithNonNumericValuesFails(self):
     s1 = constant_op.constant("Hello, ", name="s1")
     c = constant_op.constant([1 + 2j, -3 + 5j], name="c")
@@ -614,6 +617,7 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaises(AssertionError):
       self.assertAllInSet(x, (42,))
 
+  @test_util.run_deprecated_v1
   def testRandomSeed(self):
     # Call setUp again for WithCApi case (since it makes a new defeault graph
     # after setup).
@@ -681,7 +685,7 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertIsNone(test_util.get_node_def_from_graph("bar", graph_def))
 
   def test_run_in_eager_and_graph_modes_test_class(self):
-    msg = "`run_test_in_graph_and_eager_modes` only supports test methods.*"
+    msg = "`run_in_graph_and_eager_modes` only supports test methods.*"
     with self.assertRaisesRegexp(ValueError, msg):
       @test_util.run_in_graph_and_eager_modes()
       class Foo(object):
@@ -706,6 +710,7 @@ class TestUtilTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     test_util.run_in_graph_and_eager_modes(_test)(self)
     self.assertEqual(modes, ["graph"])
 
+  @test_util.run_deprecated_v1
   def test_run_in_graph_and_eager_modes_setup_in_same_mode(self):
     modes = []
     mode_name = lambda: "eager" if context.executing_eagerly() else "graph"
diff --git a/tensorflow/python/grappler/constant_folding_test.py b/tensorflow/python/grappler/constant_folding_test.py
index ab1d0ed25b9130fabcffbb8da2265c046206da46..30c1e1468146ce58216acbfbb1aef1ab1408027f 100644
--- a/tensorflow/python/grappler/constant_folding_test.py
+++ b/tensorflow/python/grappler/constant_folding_test.py
@@ -61,7 +61,7 @@ class ConstantFoldingTest(test.TestCase):
           back_prop=False,
           parallel_iterations=1)
       with session.Session() as sess:
-        y_v = sess.run(y)
+        y_v = self.evaluate(y)
         self.assertAllEqual(np.zeros([10, 20, 30]), y_v)
 
 
diff --git a/tensorflow/python/grappler/cost_analyzer_test.py b/tensorflow/python/grappler/cost_analyzer_test.py
index b8225b81a52f1a2ee10663544d54f1c9bd7ee785..ee3e289f65d05e96a580a62adb7f39552e6ced1c 100644
--- a/tensorflow/python/grappler/cost_analyzer_test.py
+++ b/tensorflow/python/grappler/cost_analyzer_test.py
@@ -38,6 +38,7 @@ from tensorflow.python.training import adam
 
 class CostAnalysisTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicCost(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name="a")
@@ -62,6 +63,7 @@ class CostAnalysisTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testVerbose(self):
     """Make sure the full report is generated with verbose=True."""
     a = constant_op.constant(10, name="a")
@@ -81,6 +83,7 @@ class CostAnalysisTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testSmallNetworkCost(self):
     image = array_ops.placeholder(dtypes.float32, shape=[1, 28, 28, 1])
     label = array_ops.placeholder(dtypes.float32, shape=[1, 10])
@@ -96,8 +99,8 @@ class CostAnalysisTest(test.TestCase):
     b_fc = variables.Variable(random_ops.truncated_normal([10], stddev=0.1))
     y_conv = nn_ops.softmax(math_ops.matmul(h_conv_flat, w_fc) + b_fc)
 
-    cross_entropy = math_ops.reduce_mean(-math_ops.reduce_sum(
-        label * math_ops.log(y_conv), reduction_indices=[1]))
+    cross_entropy = math_ops.reduce_mean(
+        -math_ops.reduce_sum(label * math_ops.log(y_conv), axis=[1]))
     _ = adam.AdamOptimizer(1e-4).minimize(cross_entropy)
 
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
@@ -129,6 +132,7 @@ class CostAnalysisTest(test.TestCase):
       # self.assertTrue(0 < upper)
       # self.assertTrue(lower <= upper)
 
+  @test_util.run_deprecated_v1
   def testBasicMemory(self):
     """Make sure arguments can be passed correctly."""
     with test_util.device(use_gpu=False):
diff --git a/tensorflow/python/grappler/cost_analyzer_tool.py b/tensorflow/python/grappler/cost_analyzer_tool.py
index e6229e18566d7b6431f77ac32118bb56cda615ec..7dbaf449cad6f65fbf84054f9e2d5a631b46d13b 100644
--- a/tensorflow/python/grappler/cost_analyzer_tool.py
+++ b/tensorflow/python/grappler/cost_analyzer_tool.py
@@ -25,8 +25,8 @@ from google.protobuf import message
 from google.protobuf import text_format
 from tensorflow.contrib.fused_conv.ops import gen_fused_conv2d_bias_activation_op  # pylint: disable=unused-import
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
@@ -79,10 +79,11 @@ def get_metagraph():
 
 def main(_):
   metagraph = get_metagraph()
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   if FLAGS.rewriter_config is not None:
-    text_format.Merge(FLAGS.rewriter_config, rewriter_config)
-  optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+    text_format.Merge(FLAGS.rewriter_config,
+                      config.graph_options.rewrite_options)
+  optimized_graph = tf_optimizer.OptimizeGraph(config, metagraph)
   metagraph.graph_def.CopyFrom(optimized_graph)
 
   report = cost_analyzer.GenerateCostReport(metagraph, FLAGS.per_node_report,
diff --git a/tensorflow/python/grappler/datasets_test.py b/tensorflow/python/grappler/datasets_test.py
index bd870ad8de4b6526f778fa94e8b71cc789dfe99e..6937301ab255b87fa51444b70bc0e2b20d306ea3 100644
--- a/tensorflow/python/grappler/datasets_test.py
+++ b/tensorflow/python/grappler/datasets_test.py
@@ -48,7 +48,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -73,7 +73,7 @@ class GrapplerTest(test.TestCase):
     for test_case in test_cases:
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensor_slices(test_case['tensor'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -109,7 +109,7 @@ class GrapplerTest(test.TestCase):
             make_generator(test_case['tensor']),
             dtypes.int64,
             output_shapes=test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -122,7 +122,7 @@ class GrapplerTest(test.TestCase):
   def testRange(self):
     with ops.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(42)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       get_next = iterator.get_next()
       train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
       train_op.append(get_next)
@@ -148,7 +148,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = fn(dataset, test_case['tensor'], test_case['shape'])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -252,7 +252,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.batch(42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -281,7 +281,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.padded_batch(42, padded_shapes=test_case['shape'][1:])
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -318,7 +318,7 @@ class GrapplerTest(test.TestCase):
           return dataset_fn
 
         dataset = dataset.flat_map(make_dataset(test_case['tensor']))
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -353,7 +353,7 @@ class GrapplerTest(test.TestCase):
 
         dataset = dataset.interleave(
             make_dataset(test_case['tensor']), cycle_length=42)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
@@ -382,7 +382,7 @@ class GrapplerTest(test.TestCase):
       with ops.Graph().as_default() as g:
         dataset = dataset_ops.Dataset.from_tensors(test_case['tensor'])
         dataset = dataset.map(array_ops.transpose)
-        iterator = dataset.make_one_shot_iterator()
+        iterator = dataset_ops.make_one_shot_iterator(dataset)
         get_next = iterator.get_next()
         train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
         train_op.append(get_next)
diff --git a/tensorflow/python/grappler/graph_placer.py b/tensorflow/python/grappler/graph_placer.py
index 654013b23c5811acbd10633d692e2d214d530b26..9c05ad81790d61fe0d19e5738d64e6502ca88915 100644
--- a/tensorflow/python/grappler/graph_placer.py
+++ b/tensorflow/python/grappler/graph_placer.py
@@ -19,8 +19,8 @@ from __future__ import division
 from __future__ import print_function
 
 import time
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
-from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops as tf_ops
 from tensorflow.python.grappler import cluster as gcluster
@@ -54,9 +54,9 @@ def PlaceGraph(metagraph,
     cluster = gcluster.Cluster()
 
   # Optimize the metagraph to speedup the placement
-  rewriter_config = rewriter_config_pb2.RewriterConfig()
+  config = config_pb2.ConfigProto()
   optimized_graph = tf_optimizer.OptimizeGraph(
-      rewriter_config, metagraph, verbose=verbose, cluster=cluster)
+      config, metagraph, verbose=verbose, cluster=cluster)
   optimized_metagraph = meta_graph_pb2.MetaGraphDef()
   optimized_metagraph.CopyFrom(metagraph)
   optimized_metagraph.graph_def.CopyFrom(optimized_graph)
diff --git a/tensorflow/python/grappler/item_test.py b/tensorflow/python/grappler/item_test.py
index d3d96c646cd00ede612ad93cca3975b92389bfa1..c02fd9f55b885c0e8b0647a74547887eff7453f0 100644
--- a/tensorflow/python/grappler/item_test.py
+++ b/tensorflow/python/grappler/item_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import item
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_array_ops
@@ -107,6 +108,7 @@ class ItemTest(test.TestCase):
     newest_tf_item = grappler_item.tf_item
     self.assertEqual(new_tf_item, newest_tf_item)
 
+  @test_util.run_v1_only('b/120545219')
   def testColocationContraints(self):
     with ops.Graph().as_default() as g:
       c = constant_op.constant([10])
diff --git a/tensorflow/python/grappler/layout_optimizer_test.py b/tensorflow/python/grappler/layout_optimizer_test.py
index 8cc971c61d5964d0fad1bfa843c3ef8d3407599f..98f2e6d71816a4b6d8cd3f7fc836b09e5cc058a4 100644
--- a/tensorflow/python/grappler/layout_optimizer_test.py
+++ b/tensorflow/python/grappler/layout_optimizer_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import cluster as gcluster
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.layers import convolutional as conv_layers
@@ -241,7 +242,7 @@ class LayoutOptimizerTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
       np.random.seed(0)
       for _ in range(2):
@@ -262,7 +263,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _two_layer_model(x)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -365,7 +366,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(pad)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -396,7 +397,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -425,7 +426,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(cast)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -456,7 +457,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -486,7 +487,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -516,7 +517,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(squeeze)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -545,7 +546,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -574,7 +575,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -603,7 +604,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -632,7 +633,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -662,7 +663,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -691,7 +692,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reduce_sum)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -724,7 +725,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(concat)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -835,7 +836,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(reverse)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -905,7 +906,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -966,7 +967,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(select)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1179,7 +1180,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1214,7 +1215,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = array_ops.identity(s)
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1347,7 +1348,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1374,7 +1375,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_branch()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1398,7 +1399,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _loop_with_vec_and_4d()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1422,7 +1423,7 @@ class LayoutOptimizerTest(test.TestCase):
       output = _model_with_second_port()
 
       with session.Session(config=_get_config(False)) as sess:
-        output_val_ref = sess.run(output)
+        output_val_ref = self.evaluate(output)
 
       with session.Session(config=_get_config()) as sess:
         metadata = config_pb2.RunMetadata()
@@ -1441,13 +1442,16 @@ class LayoutOptimizerTest(test.TestCase):
       self._assert_trans_nchw_to_nhwc('Add-0-0', nodes)
       self.assertAllClose(output_val_ref, output_val, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     meta_graph = _simple_metagraph()
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
@@ -1456,13 +1460,16 @@ class LayoutOptimizerTest(test.TestCase):
         self.assertEqual(node.attr['data_format'].s, b'NCHW')
     self.assertEqual(found, 5)
 
+  @test_util.run_deprecated_v1
   def testDepthwise(self):
     meta_graph = _simple_metagraph(depthwise=True)
-    rewrite_options = rewriter_config_pb2.RewriterConfig(
-        layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
-        min_graph_nodes=-1)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            layout_optimizer=rewriter_config_pb2.RewriterConfig.ON,
+            min_graph_nodes=-1))
     optimized_graph = tf_optimizer.OptimizeGraph(
-        rewrite_options, meta_graph, cluster=_get_cluster())
+        config, meta_graph, cluster=_get_cluster())
 
     found = 0
     for node in optimized_graph.node:
diff --git a/tensorflow/python/grappler/memory_optimizer_test.py b/tensorflow/python/grappler/memory_optimizer_test.py
index 03b42f6453975c097810b300324f8ab0a2879329..e2864ebb4df646262456f2d04e4a24bdd06482b7 100644
--- a/tensorflow/python/grappler/memory_optimizer_test.py
+++ b/tensorflow/python/grappler/memory_optimizer_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
@@ -37,6 +38,7 @@ from tensorflow.python.training import training as train
 class MemoryOptimizerSwapTest(test.TestCase):
   """Tests the Grappler memory optimizer."""
 
+  @test_util.run_deprecated_v1
   def testNoSwapping(self):
     """Make sure the graph is preserved when there is nothing to swap."""
     a = variables.VariableV1(10, name='a')
@@ -49,15 +51,18 @@ class MemoryOptimizerSwapTest(test.TestCase):
     graph_size = len(mg.graph_def.node)
     nodes = [node.name for node in mg.graph_def.node]
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size)
     self.assertItemsEqual([node.name for node in graph.node], nodes)
 
+  @test_util.run_v1_only('b/120545219')
   def testSimpleSwap(self):
     """Check that the swap annotations are followed."""
     a = variables.VariableV1(10, name='a')
@@ -72,13 +77,15 @@ class MemoryOptimizerSwapTest(test.TestCase):
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
     graph_size = len(mg.graph_def.node)
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig(
-        disable_model_pruning=True,
-        meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
-        constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-        memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
-        min_graph_nodes=-1)
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
+        rewriter_config_pb2.RewriterConfig(
+            disable_model_pruning=True,
+            meta_optimizer_iterations=rewriter_config_pb2.RewriterConfig.ONE,
+            constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL,
+            min_graph_nodes=-1))
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), graph_size + 2)
     self.assertTrue(
@@ -127,7 +134,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
   def testRewritingDefaultGradientNames(self):
     """Tests that rewriting occurs with default gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph()
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -135,8 +143,9 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS), original_metagraph)
+            memory_optimization=(
+                rewriter_config_pb2.RewriterConfig.RECOMPUTATION_HEURISTICS)))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -153,7 +162,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _, _) = self._GetMetaGraph(
         optimizer_scope_name='optimizer')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
@@ -161,11 +171,11 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # Checks that name scope "gradients/" also match sub-scope.
-            memory_optimizer_target_node_name_scope='gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertGreater(
         len(rewritten_graph_def.node),
         len(original_metagraph.graph_def.node))
@@ -182,18 +192,19 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
     """Tests that rewriting occurs with non-standard gradient names."""
     (original_metagraph, _, _,
      _) = self._GetMetaGraph(optimizer_scope_name='foo/bar')
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             disable_model_pruning=True,
             constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
             dependency_optimization=rewriter_config_pb2.RewriterConfig.OFF,
             layout_optimizer=rewriter_config_pb2.RewriterConfig.OFF,
             arithmetic_optimization=rewriter_config_pb2.RewriterConfig.OFF,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.
-            RECOMPUTATION_HEURISTICS,
+            memory_optimization=rewriter_config_pb2.RewriterConfig
+            .RECOMPUTATION_HEURISTICS,
             # This should not match anything.
-            memory_optimizer_target_node_name_scope='r/gradients/'),
-        original_metagraph)
+            memory_optimizer_target_node_name_scope='r/gradients/'))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, original_metagraph)
     self.assertEqual(
         len(rewritten_graph_def.node), len(original_metagraph.graph_def.node))
     self.assertEqual(0,
@@ -223,10 +234,10 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
       train_op = graph.get_operation_by_name(train_op_name)
       loss_op = graph.get_tensor_by_name(loss_op_name)
       with session.Session(config=config, graph=graph) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
-        sess.run(train_op)
-        return sess.run(loss_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
+        self.evaluate(train_op)
+        return self.evaluate(loss_op)
 
   def testRecomputationRewritingNoErrors(self):
     """Tests that graph output is not significantly different with rewriting."""
@@ -287,8 +298,8 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
           rewrite_options=manual_memory_config)
       session_config = config_pb2.ConfigProto(graph_options=graph_options)
       with session.Session(config=session_config) as sess:
-        sess.run(init_op)
-        sess.run(train_op)
+        self.evaluate(init_op)
+        self.evaluate(train_op)
 
   def testHintDoesRewrite(self):
     graph = self._annotated_graph()[0]
@@ -298,11 +309,12 @@ class MemoryOptimizerRecomputeTest(test.TestCase):
         0,
         len([node for node in metagraph.graph_def.node
              if 'Recomputed/' in node.name]))
-    rewritten_graph_def = tf_optimizer.OptimizeGraph(
+    config = config_pb2.ConfigProto()
+    config.graph_options.rewrite_options.CopyFrom(
         rewriter_config_pb2.RewriterConfig(
             min_graph_nodes=-1,
-            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL),
-        metagraph)
+            memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+    rewritten_graph_def = tf_optimizer.OptimizeGraph(config, metagraph)
     self.assertEqual(
         9,
         len([node for node in rewritten_graph_def.node
diff --git a/tensorflow/python/grappler/model_analyzer_test.py b/tensorflow/python/grappler/model_analyzer_test.py
index ec172755f1ae43fc7581e97c6a18471da45f9100..d000cfa1ba2ec6ab2974332b8cc0cae8d6cf821d 100644
--- a/tensorflow/python/grappler/model_analyzer_test.py
+++ b/tensorflow/python/grappler/model_analyzer_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import model_analyzer
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -28,6 +29,7 @@ from tensorflow.python.platform import test
 
 class PyWrapOptimizeGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant([10, 11], name="a")
@@ -49,6 +51,7 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     # Also print the report to make it easier to debug
     print("{}".format(report))
 
+  @test_util.run_deprecated_v1
   def testDebugMode(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant([10, 11], name="a")
diff --git a/tensorflow/python/grappler/tf_optimizer.i b/tensorflow/python/grappler/tf_optimizer.i
index 39ca71e99af06c19fb7fe5bf185c29106729f5e9..b746c3ec261e1bc75f6374d27b52a522a83934b9 100644
--- a/tensorflow/python/grappler/tf_optimizer.i
+++ b/tensorflow/python/grappler/tf_optimizer.i
@@ -34,8 +34,8 @@ limitations under the License.
   $1 = &temp;
 }
 
-%typemap(in) const tensorflow::RewriterConfig& (
-    tensorflow::RewriterConfig temp) {
+%typemap(in) const tensorflow::ConfigProto& (
+    tensorflow::ConfigProto temp) {
   char* c_string;
   Py_ssize_t py_size;
   if (PyBytes_AsStringAndSize($input, &c_string, &py_size) == -1) {
@@ -46,7 +46,7 @@ limitations under the License.
   if (!temp.ParseFromString(string(c_string, py_size))) {
     PyErr_SetString(
         PyExc_TypeError,
-        "The RewriterConfig could not be parsed as a valid protocol buffer");
+        "The ConfigProto could not be parsed as a valid protocol buffer");
     SWIG_fail;
   }
   $1 = &temp;
@@ -67,20 +67,20 @@ limitations under the License.
   #include "tensorflow/core/grappler/clusters/utils.h"
   #include "tensorflow/core/grappler/clusters/virtual_cluster.h"
   #include "tensorflow/core/grappler/optimizers/meta_optimizer.h"
+  #include "tensorflow/core/protobuf/config.pb.h"
   #include "tensorflow/core/protobuf/meta_graph.pb.h"
-  #include "tensorflow/core/protobuf/rewriter_config.pb.h"
   #include "tensorflow/core/public/session_options.h"
 
 
 void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* device_map) {
   tensorflow::SessionOptions options;
-  std::vector<tensorflow::Device*> devices;
+  std::vector<std::unique_ptr<tensorflow::Device>> devices;
   tensorflow::Status status = tensorflow::DeviceFactory::AddDevices(options, "", &devices);
   if (!status.ok()) {
     return;
   }
 
-  for (const tensorflow::Device* device : devices) {
+  for (const std::unique_ptr<tensorflow::Device>& device : devices) {
     tensorflow::DeviceProperties& prop = (*device_map)[device->name()];
     prop = tensorflow::grappler::GetDeviceInfo(device->parsed_name());
 
@@ -88,13 +88,12 @@ void DetectDevices(std::unordered_map<string, tensorflow::DeviceProperties>* dev
     // available device memory.
     const tensorflow::DeviceAttributes& attr = device->attributes();
     prop.set_memory_size(attr.memory_limit());
-    delete device;
   }
 }
 
 PyObject* TF_OptimizeGraph(
       GCluster cluster,
-      const tensorflow::RewriterConfig& rewriter_config,
+      const tensorflow::ConfigProto& config_proto,
       const tensorflow::MetaGraphDef& metagraph,
       bool verbose, const string& graph_id, TF_Status* out_status) {
     tensorflow::grappler::ItemConfig item_config;
@@ -110,7 +109,7 @@ PyObject* TF_OptimizeGraph(
 
     tensorflow::DeviceBase* cpu_device = nullptr;
     tensorflow::GraphDef out_graph;
-    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, rewriter_config);
+    tensorflow::grappler::MetaOptimizer optimizer(cpu_device, config_proto);
     tensorflow::Status status = optimizer.Optimize(cluster.get(), *grappler_item, &out_graph);
     if (verbose) {
       optimizer.PrintResult();
@@ -127,7 +126,7 @@ PyObject* TF_OptimizeGraph(
 // Wrap this function
 PyObject* TF_OptimizeGraph(
     GCluster cluster,
-    const tensorflow::RewriterConfig& rewriter_config,
+    const tensorflow::ConfigProto& config_proto,
     const tensorflow::MetaGraphDef& metagraph, bool verbose,
     const string& graph_id, TF_Status* out_status);
 
diff --git a/tensorflow/python/grappler/tf_optimizer.py b/tensorflow/python/grappler/tf_optimizer.py
index a73a4a98fc5a883cf8681a20ca332f16f3b7f0ce..e72667b6f3184c7f2900fb410102a08220c44e2e 100644
--- a/tensorflow/python/grappler/tf_optimizer.py
+++ b/tensorflow/python/grappler/tf_optimizer.py
@@ -19,22 +19,26 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import graph_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python import pywrap_tensorflow as tf_opt
 from tensorflow.python.framework import errors
 from tensorflow.python.grappler import cluster as gcluster
 
 
-def OptimizeGraph(rewriter_config,
+def OptimizeGraph(config_proto,
                   metagraph,
                   verbose=True,
                   graph_id=b'graph_to_optimize',
                   cluster=None):
   """Optimize the provided metagraph."""
+  if not isinstance(config_proto, config_pb2.ConfigProto):
+    raise TypeError('Expected config_proto to be a ConfigProto, saw type %s' %
+                    type(config_proto))
   with errors.raise_exception_on_not_ok_status() as status:
     if cluster is None:
       cluster = gcluster.Cluster()
     ret_from_swig = tf_opt.TF_OptimizeGraph(cluster.tf_cluster,
-                                            rewriter_config.SerializeToString(),
+                                            config_proto.SerializeToString(),
                                             metagraph.SerializeToString(),
                                             verbose, graph_id, status)
   if ret_from_swig is None:
diff --git a/tensorflow/python/grappler/tf_optimizer_test.py b/tensorflow/python/grappler/tf_optimizer_test.py
index eca0f679829507212608e75f2c792b4bddf9b1da..8186c81378af7c9fdbd39d4001998d2f959d4dd3 100644
--- a/tensorflow/python/grappler/tf_optimizer_test.py
+++ b/tensorflow/python/grappler/tf_optimizer_test.py
@@ -17,12 +17,13 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import item as gitem
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
@@ -34,6 +35,7 @@ from tensorflow.python.platform import test
 
 class PyWrapOptimizeGraphTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     """Make sure arguments can be passed correctly."""
     a = constant_op.constant(10, name='a')
@@ -45,15 +47,17 @@ class PyWrapOptimizeGraphTest(test.TestCase):
     train_op.append(d)
     mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
 
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.optimizers.append('constfold')
     rewriter_config.min_graph_nodes = -1
 
-    graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    graph = tf_optimizer.OptimizeGraph(config, mg)
 
     self.assertEqual(len(graph.node), 1)
     self.assertItemsEqual([node.name for node in graph.node], ['d'])
 
+  @test_util.run_v1_only('b/120545219')
   def testKeepNodes(self):
     g = ops.Graph()
     with g.as_default():
@@ -68,18 +72,21 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
 
     # Check that the nodes referenced in various collections have been preserved
-    self.assertEqual(len(optimized_graph.node), 5)
-    self.assertEqual(d.op.name, optimized_graph.node[0].name)
-    self.assertEqual(a1.op.name, optimized_graph.node[1].name)
-    self.assertEqual('Variable/initial_value', optimized_graph.node[2].name)
-    self.assertEqual(a2.op.name, optimized_graph.node[3].name)
-    self.assertEqual('Variable/Assign', optimized_graph.node[4].name)
-
+    optimized_graph_nodes = [node.name for node in optimized_graph.node]
+    expected_nodes = [
+        d.op.name, a1.op.name, a2.op.name, 'Variable/initial_value',
+        'Variable/Assign'
+    ]
+    self.assertEqual(len(optimized_graph_nodes), len(expected_nodes))
+    self.assertAllInSet(optimized_graph_nodes, expected_nodes)
+
+  @test_util.run_v1_only('b/120545219')
   def testLoops(self):
     g = ops.Graph()
     with g.as_default():
@@ -110,9 +117,10 @@ class PyWrapOptimizeGraphTest(test.TestCase):
 
     # Optimize the graph.
     mg = meta_graph.create_meta_graph_def(graph=g)
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.min_graph_nodes = -1
-    optimized_graph = tf_optimizer.OptimizeGraph(rewriter_config, mg)
+    optimized_graph = tf_optimizer.OptimizeGraph(config, mg)
     mg.graph_def.CopyFrom(optimized_graph)
 
     # Check that the nodes referenced in various collections have been preserved
diff --git a/tensorflow/python/keras/BUILD b/tensorflow/python/keras/BUILD
index 6f38d822e70dda8062bf4c3736b17c875b15724b..36fea36389dc15104cca8a0d421ba50906295e9a 100755
--- a/tensorflow/python/keras/BUILD
+++ b/tensorflow/python/keras/BUILD
@@ -3,10 +3,10 @@
 
 licenses(["notice"])  # Apache 2.0
 
-exports_files(["LICENSE"])
-
 package(default_visibility = ["//visibility:public"])
 
+exports_files(["LICENSE"])
+
 load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
@@ -41,6 +41,7 @@ py_library(
         "datasets/mnist.py",
         "datasets/reuters.py",
         "estimator/__init__.py",
+        "keras_parameterized.py",
         "preprocessing/__init__.py",
         "preprocessing/image.py",
         "preprocessing/sequence.py",
@@ -122,8 +123,10 @@ py_library(
         "constraints.py",
         "engine/__init__.py",
         "engine/base_layer.py",
+        "engine/base_layer_utils.py",
         "engine/distributed_training_utils.py",
         "engine/input_layer.py",
+        "engine/input_spec.py",
         "engine/network.py",
         "engine/saving.py",
         "engine/sequential.py",
@@ -141,11 +144,14 @@ py_library(
         "regularizers.py",
         "utils/data_utils.py",
         "utils/io_utils.py",
+        "utils/losses_utils.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
         ":backend",
         "//tensorflow/python/data",
+        "//tensorflow/python/distribute:reduce_util",
+        "//tensorflow/python/keras/optimizer_v2",
         "//tensorflow/python/training/checkpointable:data_structures",
         "//tensorflow/tools/docs:doc_controls",
         "@six_archive//:six",
@@ -180,7 +186,6 @@ py_library(
         ":engine",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:cudnn_rnn_ops_gen",
-        "//tensorflow/python:distribute",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:embedding_ops",
         "//tensorflow/python:framework_ops",
@@ -194,6 +199,7 @@ py_library(
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:tensor_shape",
         "//tensorflow/python:util",
+        "//tensorflow/python/distribute:distribute_lib",
     ],
 )
 
@@ -209,6 +215,7 @@ py_test(
         "//tensorflow/python:layers",
         "//tensorflow/python:nn",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -221,6 +228,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -233,6 +241,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -246,6 +255,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:init_ops",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -257,6 +267,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -264,6 +275,7 @@ py_test(
     name = "optimizers_test",
     size = "medium",
     srcs = ["optimizers_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
@@ -271,6 +283,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -283,6 +296,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -300,6 +314,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -318,12 +333,13 @@ py_test(
 
 py_test(
     name = "advanced_activations_test",
-    size = "small",
+    size = "medium",
     srcs = ["layers/advanced_activations_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -337,6 +353,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -350,6 +367,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -369,12 +387,13 @@ cuda_py_test(
 
 py_test(
     name = "pooling_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/pooling_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -388,6 +407,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -397,6 +417,7 @@ cuda_py_test(
     srcs = ["layers/embeddings_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//tensorflow/python:client_testlib",
     ],
 )
@@ -412,6 +433,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -424,6 +446,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -435,6 +458,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -448,6 +472,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -461,6 +486,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -474,6 +500,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -491,21 +518,36 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
 py_test(
     name = "recurrent_test",
-    size = "medium",
+    size = "large",
     srcs = ["layers/recurrent_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
+cuda_py_test(
+    name = "unified_lstm_test",
+    size = "medium",
+    srcs = ["layers/unified_lstm_test.py"],
+    additional_deps = [
+        ":keras",
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+    ],
+    shard_count = 4,
+)
+
 py_test(
     name = "serialization_test",
     size = "small",
@@ -514,6 +556,7 @@ py_test(
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -531,6 +574,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -544,6 +588,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -563,6 +608,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -571,6 +617,18 @@ py_test(
     size = "small",
     srcs = ["utils/generic_utils_test.py"],
     srcs_version = "PY2AND3",
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "tf_utils_test",
+    size = "small",
+    srcs = ["utils/tf_utils_test.py"],
+    srcs_version = "PY2AND3",
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
@@ -590,6 +648,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -602,6 +661,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -610,6 +670,7 @@ cuda_py_test(
     srcs = ["utils/multi_gpu_utils_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -625,6 +686,7 @@ cuda_py_test(
     srcs = ["engine/training_gpu_test.py"],
     additional_deps = [
         ":keras",
+        "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
     ],
@@ -652,6 +714,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -664,6 +727,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -676,6 +740,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -689,6 +754,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -696,13 +762,33 @@ py_test(
     name = "training_test",
     size = "medium",
     srcs = ["engine/training_test.py"],
+    shard_count = 16,
+    srcs_version = "PY2AND3",
+    tags = [
+        "manual",  # TODO(b/120560388)
+        "no_oss",  # TODO(b/120560388)
+        "notap",  # TODO(b/120560388)
+        "notsan",
+    ],
+    deps = [
+        ":keras",
+        "//tensorflow/python:client_testlib",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "training_dataset_test",
+    size = "medium",
+    srcs = ["engine/training_dataset_test.py"],
     shard_count = 4,
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -710,12 +796,17 @@ py_test(
     name = "training_generator_test",
     size = "enormous",
     srcs = ["engine/training_generator_test.py"],
+    shard_count = 3,
     srcs_version = "PY2AND3",
-    tags = ["notsan"],
+    tags = [
+        "no_oss",
+        "notsan",
+    ],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -730,6 +821,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python/feature_column:feature_column_py",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -743,6 +835,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -756,6 +849,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -763,12 +857,14 @@ py_test(
     name = "model_subclassing_test",
     size = "medium",
     srcs = ["model_subclassing_test.py"],
+    shard_count = 2,
     srcs_version = "PY2AND3",
     tags = ["notsan"],
     deps = [
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -784,6 +880,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -796,6 +893,7 @@ py_test(
         ":keras",
         "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -837,6 +935,7 @@ py_test(
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:training",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -854,15 +953,16 @@ py_test(
     ],
 )
 
-py_library(
-    name = "testing_utils",
-    srcs = [
-        "testing_utils.py",
-    ],
+py_test(
+    name = "keras_parameterized_test",
+    size = "small",
+    srcs = ["keras_parameterized_test.py"],
     srcs_version = "PY2AND3",
+    tags = ["notsan"],
     deps = [
         ":keras",
-        "//tensorflow/python:util",
+        "//tensorflow/python:client_testlib",
         "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/keras/activations_test.py b/tensorflow/python/keras/activations_test.py
index ad238cb0a9b7858a7a8396fa8247d3d3a94311ae..6b7bfb698b8abef4a3e0ac115f2f247103b92abc 100644
--- a/tensorflow/python/keras/activations_test.py
+++ b/tensorflow/python/keras/activations_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -67,6 +68,7 @@ class KerasActivationsTest(test.TestCase):
     expected = _ref_softmax(test_values[0, 0])
     self.assertAllClose(result[0, 0], expected, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_selu(self):
     x = keras.backend.placeholder(ndim=2)
     f = keras.backend.function([x], [keras.activations.selu(x)])
@@ -124,6 +126,7 @@ class KerasActivationsTest(test.TestCase):
     expected = sigmoid(test_values)
     self.assertAllClose(result, expected, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_hard_sigmoid(self):
     def ref_hard_sigmoid(x):
       x = (x * 0.2) + 0.5
@@ -147,6 +150,7 @@ class KerasActivationsTest(test.TestCase):
     # No negative values in test values...
     self.assertAllClose(result, test_values, rtol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_elu(self):
     with self.cached_session():
       x = keras.backend.placeholder(ndim=2)
diff --git a/tensorflow/python/keras/backend.py b/tensorflow/python/keras/backend.py
index dd9b0c07e70a761c56f8c8a2f7ea578c6f80aa25..420c457a0ca2c74c5a0148a98e281b4663ab3226 100644
--- a/tensorflow/python/keras/backend.py
+++ b/tensorflow/python/keras/backend.py
@@ -25,6 +25,7 @@ import collections
 import itertools
 import json
 import os
+import threading
 import weakref
 
 import numpy as np
@@ -73,9 +74,9 @@ py_sum = sum
 # while executing eagerly (such as the functional API for model-building).
 _GRAPH = None
 
-# This is the default internal TF session used by Keras.
-# It can be set manually via `set_session(sess)`.
-_SESSION = None
+# This is a thread local object that will hold the default internal TF session
+# used by Keras. It can be set manually via `set_session(sess)`.
+_SESSION = threading.local()
 
 # This dictionary holds a mapping {graph: learning_phase}.
 # A learning phase is a bool tensor used to run Keras models in
@@ -337,7 +338,7 @@ def clear_session():
   global _GRAPH_TF_OPTIMIZERS  # pylint: disable=global-variable-not-assigned
   ops.reset_default_graph()
   reset_uids()
-  _SESSION = None
+  _SESSION.session = None
   graph = get_graph()
   with graph.as_default():
     phase = array_ops.placeholder_with_default(
@@ -376,27 +377,22 @@ def learning_phase():
   Returns:
       Learning phase (scalar integer tensor or Python integer).
   """
-  with ops.init_scope():
-    # We always check & set the learning phase inside the init_scope,
-    # otherwise the wrong default_graph will be used to look up the learning
-    # phase inside of functions & defuns.
-    #
-    # This is because functions & defuns (both in graph & in eager mode)
-    # will always execute non-eagerly using a function-specific default
-    # subgraph.
-    if context.executing_eagerly():
-      if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
-        # Fallback to inference mode as default.
-        return 0
-      return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  if context.executing_eagerly():
+    if _DUMMY_EAGER_GRAPH not in _GRAPH_LEARNING_PHASES:
+      # Fallback to inference mode as default.
+      return 0
+    return _GRAPH_LEARNING_PHASES[_DUMMY_EAGER_GRAPH]
+  return symbolic_learning_phase()
 
-    graph = get_graph()
-    with graph.as_default():
-      if graph not in _GRAPH_LEARNING_PHASES:
-        phase = array_ops.placeholder_with_default(
-            False, shape=(), name='keras_learning_phase')
-        _GRAPH_LEARNING_PHASES[graph] = phase
-      return _GRAPH_LEARNING_PHASES[graph]
+
+def symbolic_learning_phase():
+  graph = get_graph()
+  with graph.as_default():
+    if graph not in _GRAPH_LEARNING_PHASES:
+      phase = array_ops.placeholder_with_default(
+          False, shape=(), name='keras_learning_phase')
+      _GRAPH_LEARNING_PHASES[graph] = phase
+    return _GRAPH_LEARNING_PHASES[graph]
 
 
 @tf_export('keras.backend.set_learning_phase')
@@ -449,6 +445,20 @@ def learning_phase_scope(value):
         _GRAPH_LEARNING_PHASES[get_graph()] = previous_value
 
 
+def _get_session():
+  """Returns the session object for the current thread."""
+  global _SESSION
+  default_session = ops.get_default_session()
+  if default_session is not None:
+    session = default_session
+  else:
+    if getattr(_SESSION, 'session', None) is None:
+      _SESSION.session = session_module.Session(
+          config=get_default_session_config())
+    session = _SESSION.session
+  return session
+
+
 @tf_export(v1=['keras.backend.get_session'])
 def get_session():
   """Returns the TF session to be used by the backend.
@@ -466,14 +476,7 @@ def get_session():
   Returns:
       A TensorFlow session.
   """
-  global _SESSION
-  default_session = ops.get_default_session()
-  if default_session is not None:
-    session = default_session
-  else:
-    if _SESSION is None:
-      _SESSION = session_module.Session(config=get_default_session_config())
-    session = _SESSION
+  session = _get_session()
   if not _MANUAL_VAR_INIT:
     with session.graph.as_default():
       _initialize_variables(session)
@@ -498,7 +501,7 @@ def set_session(session):
       session: A TF Session.
   """
   global _SESSION
-  _SESSION = session
+  _SESSION.session = session
 
 
 def get_default_session_config():
@@ -2322,7 +2325,7 @@ def concatenate(tensors, axis=-1):
     else:
       axis = 0
 
-  if py_all([is_sparse(x) for x in tensors]):
+  if py_all(is_sparse(x) for x in tensors):
     return sparse_ops.sparse_concat(axis, tensors)
   else:
     return array_ops.concat([to_dense(x) for x in tensors], axis)
@@ -2552,7 +2555,7 @@ def arange(start, stop=None, step=1, dtype='int32'):
     result = cast(result, dtype)
   return result
 
-
+@tf_export('keras.backend.tile')
 def tile(x, n):
   """Creates a tensor by tiling `x` by `n`.
 
@@ -3123,20 +3126,20 @@ class EagerExecutionFunction(object):
             updates_ops.append(update)
 
       # We set the update ops to run at the end by conditioning it on output[0]
-      if updates and not outputs:
+      if updates and not self.outputs:
         # Edge case; never happens in practice
         raise ValueError('Cannot create a Keras backend function with updates'
                          ' but no outputs during eager execution.')
       with ops.control_dependencies(updates_ops):
-        outputs[0] = array_ops.identity(outputs[0])
+        self.outputs[0] = array_ops.identity(self.outputs[0])
 
     # Prepare graph function
     # TODO(fchollet): can we restrict `captures` to variables actually used in
     # the relevant subgraph?
-    graph.inputs = inputs + list(graph.captures.values())
-    graph.outputs = outputs
+    graph.inputs = self.inputs + list(graph.captures.values())
+    graph.outputs = self.outputs
     graph_fn = eager_function.Function(graph)
-    graph_fn._num_positional_args = len(inputs)
+    graph_fn._num_positional_args = len(self.inputs)
     graph_fn._arg_keywords = []
     self._graph_fn = graph_fn
 
@@ -3158,8 +3161,13 @@ class EagerExecutionFunction(object):
         if value is None:
           raise ValueError(
               'You must feed a value for placeholder %s' % (tensor,))
-      converted_inputs.append(
-          ops.convert_to_tensor(value, dtype=tensor.dtype))
+      if not isinstance(value, ops.Tensor):
+        value = ops.convert_to_tensor(value, dtype=tensor.dtype)
+      if value.dtype != tensor.dtype:
+        # Temporary workaround due to `convert_to_tensor` not casting floats.
+        # See b/119637405
+        value = math_ops.cast(value, tensor.dtype)
+      converted_inputs.append(value)
     outputs = self._graph_fn(*converted_inputs)
     return [x.numpy() for x in outputs]
 
@@ -3181,7 +3189,7 @@ def function(inputs, outputs, updates=None, name=None, **kwargs):
   Raises:
       ValueError: if invalid kwargs are passed in or if in eager execution.
   """
-  if context.executing_eagerly():
+  if ops.executing_eagerly_outside_functions():
     if kwargs:
       raise ValueError('Session keyword arguments are not support during '
                        'eager execution. You passed: %s' % (kwargs,))
@@ -3242,7 +3250,8 @@ def rnn(step_function,
         constants=None,
         unroll=False,
         input_length=None,
-        time_major=False):
+        time_major=False,
+        zero_output_for_mask=False):
   """Iterates over the time dimension of a tensor.
 
   Arguments:
@@ -3280,7 +3289,9 @@ def rnn(step_function,
           RNN calculation. However, most TensorFlow data is batch-major, so by
           default this function accepts input and emits output in batch-major
           form.
-
+      zero_output_for_mask: Boolean. If True, the output for masked timestep
+          will be zeros, whereas in the False case, output from previous
+          timestep is returned.
   Returns:
       A tuple, `(last_output, outputs, new_states)`.
           last_output: the latest output of the rnn, of shape `(samples, ...)`
@@ -3332,14 +3343,14 @@ def rnn(step_function,
   # So we need to broadcast the mask to match the shape of inputs.
   # That's what the tile call does, it just repeats the mask along its
   # second dimension n times.
-  def _expand_mask(mask_t, input_t):
+  def _expand_mask(mask_t, input_t, fixed_dim=1):
     assert not nest.is_sequence(mask_t)
     assert not nest.is_sequence(input_t)
     rank_diff = len(input_t.shape) - len(mask_t.shape)
     for _ in range(rank_diff):
-      mask_t = array_ops.expand_dims(mask_t)
-    expand_dims = [1] + input_t.shape.as_list()[1:]
-    return array_ops.tile(mask_t, expand_dims)
+      mask_t = array_ops.expand_dims(mask_t, -1)
+    multiples = [1] * fixed_dim + input_t.shape.as_list()[fixed_dim:]
+    return array_ops.tile(mask_t, multiples)
 
   if unroll:
     if not time_steps:
@@ -3397,6 +3408,17 @@ def rnn(step_function,
       last_output = successive_outputs[-1]
       new_states = successive_states[-1]
       outputs = array_ops.stack(successive_outputs)
+
+      if zero_output_for_mask:
+        last_output = array_ops.where(
+            _expand_mask(mask_list[-1], last_output),
+            last_output,
+            zeros_like(last_output))
+        outputs = array_ops.where(
+            _expand_mask(mask, outputs, fixed_dim=2),
+            outputs,
+            zeros_like(outputs))
+
     else:
       for i in range(time_steps):
         inp = _get_input_tensor(i)
@@ -3490,11 +3512,12 @@ def rnn(step_function,
                                            tuple(states) + tuple(constants))
         # mask output
         flat_output = nest.flatten(output)
-        flat_previous_output = nest.flatten(prev_output)
+        flat_mask_output = (flat_zero_output if zero_output_for_mask
+                            else nest.flatten(prev_output))
         tiled_mask_t = tuple(_expand_mask(mask_t, o) for o in flat_output)
         flat_new_output = tuple(
-            array_ops.where(m, o, po) for m, o, po in zip(
-                tiled_mask_t, flat_output, flat_previous_output))
+            array_ops.where(m, o, zo) for m, o, zo in zip(
+                tiled_mask_t, flat_output, flat_mask_output))
 
         # mask states
         flat_state = nest.flatten(states)
@@ -3503,8 +3526,8 @@ def rnn(step_function,
           new_state.set_shape(state.shape)
         tiled_mask_t = tuple(_expand_mask(mask_t, s) for s in flat_state)
         flat_final_state = tuple(
-            array_ops.where(m, o, po)
-            for m, o, po in zip(tiled_mask_t, flat_new_state, flat_state))
+            array_ops.where(m, s, ps)
+            for m, s, ps in zip(tiled_mask_t, flat_new_state, flat_state))
         new_states = nest.pack_sequence_as(new_states, flat_final_state)
 
         output_ta_t = tuple(
@@ -3552,12 +3575,12 @@ def rnn(step_function,
           **while_loop_kwargs)
       new_states = final_outputs[2:]
 
-    last_time = final_outputs[0]
     output_ta = final_outputs[1]
 
     outputs = tuple(o.stack() for o in output_ta)
+    last_output = tuple(o[-1] for o in outputs)
+
     outputs = nest.pack_sequence_as(output_time_zero, outputs)
-    last_output = tuple(o.read(last_time - 1) for o in output_ta)
     last_output = nest.pack_sequence_as(output_time_zero, last_output)
 
   # static shape inference
@@ -3662,13 +3685,13 @@ def in_train_phase(x, alt, training=None):
   if training is None:
     training = learning_phase()
 
-  if training is 1 or training is True:
+  if training == 1 or training is True:
     if callable(x):
       return x()
     else:
       return x
 
-  elif training is 0 or training is False:
+  elif training == 0 or training is False:
     if callable(alt):
       return alt()
     else:
diff --git a/tensorflow/python/keras/backend_test.py b/tensorflow/python/keras/backend_test.py
index d8aa3e9b529a4d5d0ed618599707f50299fcffdb..af01b46fa9a4a45201de930cfb7827ac1d2bafbd 100644
--- a/tensorflow/python/keras/backend_test.py
+++ b/tensorflow/python/keras/backend_test.py
@@ -136,7 +136,7 @@ class BackendUtilsTest(test.TestCase):
       x = keras.Input((3,))
       y = keras.layers.BatchNormalization()(x)
       if not context.executing_eagerly():
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         sess.run(y, feed_dict={x: np.random.random((2, 3))})
 
   def test_learning_phase_scope(self):
@@ -1069,13 +1069,13 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
                                                              initial_states,
                                                              **kwargs)
         # check static shape inference
-        self.assertEquals(last_output.get_shape().as_list(),
-                          [num_samples, output_dim])
-        self.assertEquals(outputs.get_shape().as_list(),
-                          [num_samples, timesteps, output_dim])
+        self.assertEqual(last_output.get_shape().as_list(),
+                         [num_samples, output_dim])
+        self.assertEqual(outputs.get_shape().as_list(),
+                         [num_samples, timesteps, output_dim])
         for state in new_states:
-          self.assertEquals(state.get_shape().as_list(),
-                            [num_samples, output_dim])
+          self.assertEqual(state.get_shape().as_list(),
+                           [num_samples, output_dim])
 
         last_output_list[i].append(keras.backend.eval(last_output))
         outputs_list[i].append(keras.backend.eval(outputs))
@@ -1173,7 +1173,7 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
         self.assertEqual(outputs.get_shape().as_list(),
                          [num_samples, timesteps, output_dim])
         # for state in new_states:
-        #   self.assertEquals(state.get_shape().as_list(),
+        #   self.assertEqual(state.get_shape().as_list(),
         #                     [num_samples, output_dim])
         self.assertEqual(new_states[0].get_shape().as_list(),
                          [num_samples, output_dim])
@@ -1223,6 +1223,121 @@ class BackendNNOpsTest(test.TestCase, parameterized.TestCase):
       for s, u_s in zip(additional_state_list[2], additional_state_list[3]):
         self.assertAllClose(s, u_s, atol=1e-04)
 
+  def test_rnn_output_and_state_masking_independent(self):
+    num_samples = 2
+    num_timesteps = 4
+    state_and_io_size = 2
+    mask_last_num_timesteps = 2  # for second sample only
+
+    # a step function that just outputs inputs,
+    # but increments states +1 per timestep
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps,
+                                    state_and_io_size))
+    initial_state_vals = np.random.random((num_samples, state_and_io_size))
+    # masking of two last timesteps for second sample only
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[1, -mask_last_num_timesteps:] = 0
+
+    # outputs expected to be same as inputs for the first sample
+    expected_outputs = inputs_vals.copy()
+    # but for the second sample all outputs in masked region should be the same
+    # as last output before masked region
+    expected_outputs[1, -mask_last_num_timesteps:] = \
+        expected_outputs[1, -(mask_last_num_timesteps + 1)]
+
+    expected_last_state = initial_state_vals.copy()
+    # first state should be incremented for every timestep (no masking)
+    expected_last_state[0] += num_timesteps
+    # second state should not be incremented for last two timesteps
+    expected_last_state[1] += (num_timesteps - mask_last_num_timesteps)
+
+    # verify same expected output for `unroll=true/false`
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
+  def test_rnn_output_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+    num_features = 5
+
+    def step_function(inputs, states):
+      outputs = keras.backend.tile(keras.backend.expand_dims(inputs), [1, 1, 2])
+      return outputs, [keras.backend.identity(s) for s in states]
+      # Note: cannot just return states (which can be a problem) ->
+      # tensorflow/python/ops/resource_variable_ops.py", line 824, in set_shape
+      # NotImplementedError: ResourceVariable does not implement set_shape()
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, num_features))
+    initial_state_vals = np.random.random((num_samples, 6))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[-1, -1] = 0  # final timestep masked for last sample
+
+    expected_outputs = np.repeat(inputs_vals[..., None], repeats=2, axis=-1)
+    # for the last sample, the final timestep (in masked region) should be the
+    # same as the second to final output (before masked region)
+    expected_outputs[-1, -1] = expected_outputs[-1, -2]
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, outputs, _ = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(keras.backend.eval(outputs), expected_outputs)
+
+  def test_rnn_state_num_dim_larger_than_2_masking(self):
+    num_samples = 3
+    num_timesteps = 4
+
+    def step_function(inputs, states):
+      return inputs, [s + 1 for s in states]
+
+    inputs_vals = np.random.random((num_samples, num_timesteps, 5))
+    initial_state_vals = np.random.random((num_samples, 6, 7))
+    mask_vals = np.ones((num_samples, num_timesteps))
+    mask_vals[0, -2:] = 0  # final two timesteps masked for first sample
+
+    expected_last_state = initial_state_vals.copy()
+    expected_last_state[0] += (num_timesteps - 2)
+    expected_last_state[1:] += num_timesteps
+
+    inputs = keras.backend.variable(inputs_vals)
+    initial_states = [keras.backend.variable(initial_state_vals)]
+    mask = keras.backend.variable(mask_vals)
+    for unroll in [True, False]:
+      _, _, last_states = keras.backend.rnn(
+          step_function,
+          inputs,
+          initial_states,
+          mask=mask,
+          unroll=unroll,
+          input_length=num_timesteps if unroll else None)
+
+      self.assertAllClose(
+          keras.backend.eval(last_states[0]), expected_last_state)
+
   def test_normalize_batch_in_training(self):
     val = np.random.random((10, 3, 10, 10))
     x = keras.backend.variable(val)
@@ -1307,6 +1422,7 @@ class TestCTC(test.TestCase):
                 decode_truth[i] == keras.backend.eval(decode_pred_tf[i])))
       self.assertAllClose(log_prob_truth, log_prob_pred)
 
+  @test_util.run_v1_only('b/120545219')
   def test_ctc_batch_cost(self):
     with self.cached_session():
       label_lens = np.expand_dims(np.asarray([5, 4]), 1)
@@ -1392,6 +1508,7 @@ class TestRandomOps(test.TestCase):
 
 class BackendGraphTests(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_is_placeholder(self):
     x = keras.backend.placeholder(shape=(1,))
     self.assertEqual(keras.backend.is_placeholder(x), True)
@@ -1431,6 +1548,7 @@ class BackendGraphTests(test.TestCase):
     output_values = f([None, None])
     self.assertEqual(output_values, [5., 6.])
 
+  @test_util.run_deprecated_v1
   def test_function_tf_feed_symbols(self):
     # Test Keras backend functions with TF tensor inputs.
     with self.cached_session():
@@ -1464,6 +1582,7 @@ class BackendGraphTests(test.TestCase):
       outs = f([y5, y2, None])
       self.assertEqual(outs, [11., 2.])
 
+  @test_util.run_deprecated_v1
   def test_function_tf_fetches(self):
     # Additional operations can be passed to tf.Session().run() via its
     # `fetches` arguments. In contrast to `updates` argument of
@@ -1486,6 +1605,7 @@ class BackendGraphTests(test.TestCase):
       self.assertEqual(keras.backend.get_session().run(fetches=[x, y]),
                        [11., 5.])
 
+  @test_util.run_deprecated_v1
   def test_function_tf_feed_dict(self):
     # Additional substitutions can be passed to `tf.Session().run()` via its
     # `feed_dict` arguments. Note that the feed_dict is passed once in the
@@ -1518,6 +1638,7 @@ class BackendGraphTests(test.TestCase):
       self.assertEqual(keras.backend.get_session().run(fetches=[x, y]),
                        [30., 40.])
 
+  @test_util.run_deprecated_v1
   def test_function_tf_run_options_with_run_metadata(self):
     with self.cached_session():
       x_placeholder = keras.backend.placeholder(shape=())
@@ -1543,6 +1664,7 @@ class BackendGraphTests(test.TestCase):
       self.assertEqual(output1, [30.])
       self.assertEqual(len(run_metadata.partition_graphs), 0)
 
+  @test_util.run_deprecated_v1
   def test_function_fetch_callbacks(self):
 
     class CallbackStub(object):
@@ -1579,6 +1701,7 @@ class BackendGraphTests(test.TestCase):
     x = keras.backend.placeholder(shape=(3, 4), sparse=True)
     self.assertEqual(x.get_shape().as_list(), [3, 4])
 
+  @test_util.run_deprecated_v1
   def test_batch_normalization(self):
     # No eager CPU kernel.
     g_val = np.random.random((3,))
diff --git a/tensorflow/python/keras/callbacks.py b/tensorflow/python/keras/callbacks.py
index fde17cb6bc4b44abb74812b23e0158a062e1b228..2d7d5a415d422cea300ab722ceacdb83803d3db8 100644
--- a/tensorflow/python/keras/callbacks.py
+++ b/tensorflow/python/keras/callbacks.py
@@ -24,7 +24,6 @@ import copy
 import csv
 import io
 import json
-import math
 import os
 import time
 
@@ -35,7 +34,6 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.training_utils import standardize_input_data
 from tensorflow.python.keras.utils.data_utils import Sequence
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
@@ -54,17 +52,14 @@ except ImportError:
   requests = None
 
 
+# pylint: disable=protected-access
 def configure_callbacks(callbacks,
                         model,
                         do_validation=False,
-                        val_inputs=None,
-                        val_targets=None,
-                        val_sample_weights=None,
                         batch_size=None,
                         epochs=None,
                         steps_per_epoch=None,
                         samples=None,
-                        validation_steps=None,
                         verbose=1,
                         count_mode='steps',
                         mode='train'):
@@ -74,17 +69,10 @@ def configure_callbacks(callbacks,
       callbacks: List of Callbacks.
       model: Model being trained.
       do_validation: Whether or not validation loop will be run.
-      val_inputs: Inputs to Model for validation loop. Can be any
-        data format Keras accepts.
-      val_targets: Targets for Model for validation loop. Can be any
-        data format Keras accepts.
-      val_sample_weights: Sample weights for Model for validation loop.
-        Can be any data format Keras accepts.
       batch_size: Number of samples per batch.
       epochs: Number of epoch to train.
       steps_per_epoch: Number of batches to run per training epoch.
       samples: Number of training samples.
-      validation_steps: Number of batches to run per validation epoch.
       verbose: int, 0 or 1. Keras logging verbosity to pass to ProgbarLogger.
       count_mode: One of 'steps' or 'samples'. Per-batch or per-sample count.
       mode: String. One of 'train', 'test', or 'predict'. Which loop mode to
@@ -114,24 +102,17 @@ def configure_callbacks(callbacks,
   callback_list = CallbackList(callbacks)
 
   # Set callback model
-  callback_model = model._get_callback_model()  # pylint: disable=protected-access
-  if do_validation and val_inputs and not context.executing_eagerly():
-    # Need to create the eval_function before start of the first epoch
-    # because TensorBoard callback on_epoch_begin adds summary to the
-    # list of fetches of the eval_function
-    callback_model._make_eval_function()  # pylint: disable=protected-access
+  callback_model = model._get_callback_model()
   callback_list.set_model(callback_model)
 
   # Set callback parameters
   callback_metrics = []
   # When we have deferred build scenario with iterator input, we will compile
   # when we standardize first batch of data.
-  if mode != 'predict' and model._is_compiled:  # pylint: disable=protected-access
+  if mode != 'predict' and hasattr(model, 'metrics_names'):
     callback_metrics = copy.copy(model.metrics_names)
     if do_validation:
       callback_metrics += ['val_' + n for n in model.metrics_names]
-  if validation_steps is None and isinstance(val_inputs, Sequence):
-    validation_steps = len(val_inputs)
   callback_params = {
       'batch_size': batch_size,
       'epochs': epochs,
@@ -140,27 +121,19 @@ def configure_callbacks(callbacks,
       'verbose': verbose,
       'do_validation': do_validation,
       'metrics': callback_metrics,
-      'validation_steps': validation_steps
   }
   callback_list.set_params(callback_params)
 
-  # Pass validation data to callbacks
-  # TODO(omalleyt): remove this once val hooks are ready.
-  if not val_inputs:
-    val_data = []
-  elif _is_generator_like(val_inputs):
-    val_data = val_inputs
-  else:
-    val_data = val_inputs + val_targets
-    if val_sample_weights:
-      val_data += val_sample_weights
-    if not isinstance(K.learning_phase(), int):
-      val_data += [0.]
-  for cbk in callbacks:
-    cbk.validation_data = val_data
+  if (do_validation and not model._distribution_strategy and
+      not model.run_eagerly):
+    # Need to create the eval_function before start of the first epoch
+    # because TensorBoard callback on_epoch_begin adds summary to the
+    # list of fetches of the eval_function
+    callback_model._make_eval_function()
 
   callback_list.model.stop_training = False
   return callback_list
+# pylint: enable=protected-access
 
 
 def _is_generator_like(data):
@@ -491,7 +464,8 @@ class ProgbarLogger(Callback):
       self.progbar = Progbar(
           target=self.target,
           verbose=self.verbose,
-          stateful_metrics=self.stateful_metrics)
+          stateful_metrics=self.stateful_metrics,
+          unit_name='step' if self.use_steps else 'sample')
 
   def on_batch_begin(self, batch, logs=None):
     if self.seen < self.target:
@@ -953,6 +927,7 @@ class TensorBoard(Callback):
     self.batch_size = batch_size
     self._current_batch = 0
     self._total_batches_seen = 0
+    self._total_val_batches_seen = 0
     self.embeddings_freq = embeddings_freq
     self.embeddings_layer_names = embeddings_layer_names
     self.embeddings_metadata = embeddings_metadata
@@ -1041,8 +1016,10 @@ class TensorBoard(Callback):
     # If both embedding_freq and embeddings_data are available, we will
     # visualize embeddings.
     if self.embeddings_freq and self.embeddings_data is not None:
-      self.embeddings_data = standardize_input_data(self.embeddings_data,
-                                                    model.input_names)
+      # Avoid circular dependency.
+      from tensorflow.python.keras.engine import training_utils  # pylint: disable=g-import-not-at-top
+      self.embeddings_data = training_utils.standardize_input_data(
+          self.embeddings_data, model.input_names)
 
       # If embedding_layer_names are not provided, get all of the embedding
       # layers from the model.
@@ -1107,10 +1084,8 @@ class TensorBoard(Callback):
       projector.visualize_embeddings(self.writer, config)
 
   def _fetch_callback(self, summary):
-    self.writer.add_summary(
-        summary,
-        self._epoch + self._current_val_batch / self._validation_batches)
-    self._current_val_batch += 1
+    self.writer.add_summary(summary, self._total_val_batches_seen)
+    self._total_val_batches_seen += 1
 
   def _write_custom_summaries(self, step, logs=None):
     """Writes metrics out as custom scalar summaries.
@@ -1141,22 +1116,6 @@ class TensorBoard(Callback):
         self.writer.add_summary(summary, step)
     self.writer.flush()
 
-  def on_train_begin(self, logs=None):
-    """Checks if histogram summaries can be run."""
-    # will never be set when in eager
-    if self.histogram_freq:
-      if self.params.get('validation_steps', None) is not None:
-        self._validation_batches = self.params['validation_steps']
-      elif self.validation_data:
-        self._validation_batches = math.ceil(
-            self.validation_data[0].shape[0] / self.batch_size)
-      else:
-        raise ValueError('If printing histograms, validation data must be '
-                         'provided.')
-      if self._validation_batches == 0:
-        raise ValueError(
-            'If printing histograms, validation data must have length > 0.')
-
   def on_batch_end(self, batch, logs=None):
     """Writes scalar summaries for metrics on every training batch."""
     # Don't output batch_size and batch number as Tensorboard summaries
@@ -1177,7 +1136,6 @@ class TensorBoard(Callback):
     # check if histogram summary should be run for this epoch
     if self.histogram_freq and epoch % self.histogram_freq == 0:
       self._epoch = epoch
-      self._current_val_batch = 0
       # pylint: disable=protected-access
       # add the histogram summary op if it should run this epoch
       if self.merged not in self.model._eval_function.fetches:
diff --git a/tensorflow/python/keras/callbacks_test.py b/tensorflow/python/keras/callbacks_test.py
index 9d9ede22c018a85d716534848ba65a98f463e4f5..4a65ade33c7f9c6159ab5cb8f50a06124507dbdd 100644
--- a/tensorflow/python/keras/callbacks_test.py
+++ b/tensorflow/python/keras/callbacks_test.py
@@ -36,7 +36,6 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.summary.writer import writer_cache
 from tensorflow.python.training import adam
 
 try:
@@ -404,6 +403,7 @@ class KerasCallbacksTest(test.TestCase):
           float(keras.backend.get_value(
               model.optimizer.lr)) - 0.01 / 4) < keras.backend.epsilon()
 
+  @test_util.run_v1_only('b/120545219')
   def test_ReduceLROnPlateau(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -675,6 +675,7 @@ class KerasCallbacksTest(test.TestCase):
       self.assertEqual(len(loss), 1)
       self.assertEqual(loss[0], np.inf)
 
+  @test_util.run_v1_only('b/120545219')
   def test_TensorBoard(self):
     np.random.seed(1337)
 
@@ -778,78 +779,7 @@ class KerasCallbacksTest(test.TestCase):
           data_generator(True), len(x_train), epochs=2, callbacks=cbks)
       assert os.path.exists(temp_dir)
 
-  def test_TensorBoard_histogram_freq_must_have_validation_data(self):
-    np.random.seed(1337)
-    tmpdir = self.get_temp_dir()
-    self.addCleanup(shutil.rmtree, tmpdir, ignore_errors=True)
-
-    with self.cached_session():
-      filepath = os.path.join(tmpdir, 'logs')
-
-      (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
-          train_samples=TRAIN_SAMPLES,
-          test_samples=TEST_SAMPLES,
-          input_shape=(INPUT_DIM,),
-          num_classes=NUM_CLASSES)
-      y_test = keras.utils.to_categorical(y_test)
-      y_train = keras.utils.to_categorical(y_train)
-
-      def data_generator(train):
-        if train:
-          max_batch_index = len(x_train) // BATCH_SIZE
-        else:
-          max_batch_index = len(x_test) // BATCH_SIZE
-        i = 0
-        while 1:
-          if train:
-            # simulate multi-input/output models
-            yield (x_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_train[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          else:
-            yield (x_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE],
-                   y_test[i * BATCH_SIZE: (i + 1) * BATCH_SIZE])
-          i += 1
-          i %= max_batch_index
-
-      inp = keras.Input((INPUT_DIM,))
-      hidden = keras.layers.Dense(2, activation='relu')(inp)
-      hidden = keras.layers.Dropout(0.1)(hidden)
-      output = keras.layers.Dense(NUM_CLASSES, activation='softmax')(hidden)
-      model = keras.models.Model(inputs=inp, outputs=output)
-      model.compile(loss='categorical_crossentropy',
-                    optimizer='sgd',
-                    metrics=['accuracy'])
-
-      # we must generate new callbacks for each test, as they aren't stateless
-      def callbacks_factory(histogram_freq):
-        return [keras.callbacks.TensorBoard(
-            log_dir=filepath,
-            histogram_freq=histogram_freq,
-            write_images=True, write_grads=True,
-            batch_size=5)]
-
-      # fit w/o validation data should raise ValueError if histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit(
-            x_train, y_train, batch_size=BATCH_SIZE, callbacks=cbs, epochs=3)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # fit generator without validation data should raise ValueError if
-      # histogram_freq > 0
-      cbs = callbacks_factory(histogram_freq=1)
-      with self.assertRaises(ValueError):
-        model.fit_generator(
-            data_generator(True), len(x_train), epochs=2, callbacks=cbs)
-
-      for cb in cbs:
-        cb.on_train_end()
-
-      # Make sure file writer cache is clear to avoid failures during cleanup.
-      writer_cache.FileWriterCache.clear()
-
+  @test_util.run_v1_only('b/120545219')
   def test_TensorBoard_multi_input_output(self):
     np.random.seed(1337)
     tmpdir = self.get_temp_dir()
@@ -921,6 +851,7 @@ class KerasCallbacksTest(test.TestCase):
                           callbacks=callbacks_factory(histogram_freq=1))
       assert os.path.isdir(filepath)
 
+  @test_util.run_v1_only('b/120545219')
   def test_Tensorboard_histogram_summaries_in_test_function(self):
 
     class FileWriterStub(object):
@@ -996,8 +927,9 @@ class KerasCallbacksTest(test.TestCase):
           epochs=3,
           verbose=0)
 
-      self.assertAllEqual(tsb.writer.steps_seen, [0, 0.5, 1, 1.5, 2, 2.5])
+      self.assertAllEqual(tsb.writer.steps_seen, [0, 1, 2, 3, 4, 5])
 
+  @test_util.run_v1_only('b/120545219')
   def test_Tensorboard_histogram_summaries_with_generator(self):
     np.random.seed(1337)
     tmpdir = self.get_temp_dir()
@@ -1129,6 +1061,7 @@ class KerasCallbacksTest(test.TestCase):
 
       assert os.path.exists(temp_dir)
 
+  @test_util.run_deprecated_v1
   def test_Tensorboard_batch_logging(self):
 
     class FileWriterStub(object):
@@ -1163,6 +1096,7 @@ class KerasCallbacksTest(test.TestCase):
     self.assertEqual(tb_cbk.writer.summary_values, [0., 1., 2., 3., 4.])
     self.assertEqual(tb_cbk.writer.summary_tags, ['batch_acc'] * 5)
 
+  @test_util.run_deprecated_v1
   def test_Tensorboard_epoch_and_batch_logging(self):
 
     class FileWriterStub(object):
@@ -1234,6 +1168,7 @@ class KerasCallbacksTest(test.TestCase):
 
     self.assertTrue(os.path.exists(temp_dir))
 
+  @test_util.run_deprecated_v1
   def test_TensorBoard_update_freq(self):
 
     class FileWriterStub(object):
@@ -1325,6 +1260,7 @@ class KerasCallbacksTest(test.TestCase):
             callbacks=cbks,
             epochs=1)
 
+  @test_util.run_deprecated_v1
   def test_fit_generator_with_callback(self):
 
     class TestCallback(keras.callbacks.Callback):
diff --git a/tensorflow/python/keras/engine/__init__.py b/tensorflow/python/keras/engine/__init__.py
index 26aed34766f9e1e2094db7a4c8b66ff057dacc4b..005f6462ffa4e6120c66373f7be9e31d5eac5449 100644
--- a/tensorflow/python/keras/engine/__init__.py
+++ b/tensorflow/python/keras/engine/__init__.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 # TODO(fchollet): Remove hourglass imports once external code is done importing
 # non-public APIs.
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
 
 del absolute_import
diff --git a/tensorflow/python/keras/engine/base_layer.py b/tensorflow/python/keras/engine/base_layer.py
index 23419ae150338f587ea26f0092e3d5d50963d32c..858fa76472b3806f36b76f761043f011a260b66d 100644
--- a/tensorflow/python/keras/engine/base_layer.py
+++ b/tensorflow/python/keras/engine/base_layer.py
@@ -18,8 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import collections as collections_lib
-import enum  # pylint: disable=g-bad-import-order
 import functools
 import inspect  # Necessary supplement to tf_inspect to deal with variadic args.
 
@@ -36,13 +34,14 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
+from tensorflow.python.keras.engine import base_layer_utils
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 # A module that only depends on `keras.layers` import these from here.
 from tensorflow.python.keras.utils.generic_utils import to_snake_case  # pylint: disable=unused-import
 from tensorflow.python.keras.utils.tf_utils import is_tensor_or_tensor_list  # pylint: disable=unused-import
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -54,20 +53,6 @@ from tensorflow.python.util.tf_export import tf_export
 from tensorflow.tools.docs import doc_controls
 
 
-class CallConvention(enum.Enum):
-  """Calling conventions for passing `Layer` inputs to `Layer.call`."""
-  # The Layer takes inputs as its first argument, named "inputs" for
-  # compatibility with the signature of Layer.__call__. This is the mode assumed
-  # for Layers which are not subclassed Models.
-  EXPLICIT_INPUTS_ARGUMENT = 1
-  # The Layer takes a single positional argument, not named "inputs". It's
-  # treated like an "inputs" argument.
-  SINGLE_POSITIONAL_ARGUMENT = 2
-  # The Layer has multiple positional arguments to which its inputs should be
-  # bound.
-  POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
-
-
 @tf_export('keras.layers.Layer')
 class Layer(checkpointable.CheckpointableBase):
   """Base layer class.
@@ -102,10 +87,6 @@ class Layer(checkpointable.CheckpointableBase):
     name: The name of the layer (string).
     dtype: Default dtype of the layer's weights (default of `None` means use the
       type of the first input).
-    trainable_variables: List of trainable variables.
-    non_trainable_variables: List of non-trainable variables.
-    variables: List of all variables of this layer, trainable and
-      non-trainable.
     updates: List of update ops of this layer.
     losses: List of losses added by this layer.
     trainable_weights: List of variables to be included in backprop.
@@ -150,9 +131,9 @@ class Layer(checkpointable.CheckpointableBase):
     self.built = False
     # Provides information about which inputs are compatible with the layer.
     self.input_spec = None
+    self.supports_masking = False
 
     self._init_set_name(name)
-
     self._activity_regularizer = kwargs.pop('activity_regularizer', None)
     self._trainable_weights = []
     self._non_trainable_weights = []
@@ -170,29 +151,25 @@ class Layer(checkpointable.CheckpointableBase):
     # in eager mode or graph mode alternatively, we need to keep track of
     # eager losses and symbolic losses via separate attributes.
     self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # TODO(psv): Remove this property.
+    # A dictionary that maps metric names to metric result tensors. The results
+    # are the running averages of metric values over an epoch.
+    self._metrics_tensors = {}
     self._dtype = None if dtype is None else dtypes.as_dtype(dtype).name
     self._call_fn_args = function_utils.fn_args(self.call)
     self._compute_previous_mask = ('mask' in self._call_fn_args or
                                    hasattr(self, 'compute_mask'))
-    self._call_convention = CallConvention.EXPLICIT_INPUTS_ARGUMENT
+    self._call_convention = (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
 
     # These lists will be filled via successive calls
     # to self._add_inbound_node().
     self._inbound_nodes = []
     self._outbound_nodes = []
 
-    self.supports_masking = False
-
-    # Mark if a layer supports using graph functions in the eager
-    # fit/predict/evaluate loop
-    # TODO(kaftan): merge this with the _static_graph_friendly flag once
-    # enough eager function bugs involving control flow / tensorarrays have
-    # been fixed,  and static-graph-friendly layers will almost always work in
-    # eager graph functions.
-    # We conservatively make this flag opt-in for now to avoid causing existing
-    # custom layers to crash.
-    self._can_use_graph_functions = False
-
     call_argspec = tf_inspect.getfullargspec(self.call)
     if 'training' in call_argspec.args:
       self._expects_training_arg = True
@@ -200,7 +177,7 @@ class Layer(checkpointable.CheckpointableBase):
       self._expects_training_arg = False
 
     # Whether the `call` method can be used to build a TF graph without issues.
-    self._static_graph_friendly = True
+    self._call_is_graph_friendly = True
 
     # Manage input shape information if passed.
     if 'input_shape' in kwargs or 'batch_input_shape' in kwargs:
@@ -222,543 +199,348 @@ class Layer(checkpointable.CheckpointableBase):
     else:
       self._initial_weights = None
 
-  @property
-  def _is_static_graph_friendly(self):
-    return self._static_graph_friendly
-
-  @_is_static_graph_friendly.setter
-  def _is_static_graph_friendly(self, value):
-    if value not in {True, False}:
-      raise ValueError('`static_graph_friendly` requires a boolean value. '
-                       'Received: {}'.format(value))
-    self._static_graph_friendly = value
-
-  def _init_set_name(self, name, zero_based=True):
-    if not name:
-      self._name = unique_layer_name(
-          generic_utils.to_snake_case(self.__class__.__name__),
-          zero_based=zero_based)
-    else:
-      self._name = name
-
-  @property
-  def dtype(self):
-    return self._dtype
-
-  @property
-  def name(self):
-    return self._name
-
-  @property
-  def activity_regularizer(self):
-    """Optional regularizer function for the output of this layer."""
-    return self._activity_regularizer
-
-  @activity_regularizer.setter
-  def activity_regularizer(self, regularizer):
-    """Optional regularizer function for the output of this layer."""
-    self._activity_regularizer = self._no_dependency(regularizer)
+  def build(self, input_shape):
+    """Creates the variables of the layer (optional, for subclass implementers).
 
-  @property
-  def trainable_weights(self):
-    return self._trainable_weights if self.trainable else []
+    This is a method that implementers of subclasses of `Layer` or `Model`
+    can override if they need a state-creation step in-between
+    layer instantiation and layer call.
 
-  @property
-  def non_trainable_weights(self):
-    if self.trainable:
-      return self._non_trainable_weights
-    else:
-      return self._trainable_weights + self._non_trainable_weights
+    This is typically used to create the weights of `Layer` subclasses.
 
-  @property
-  def trainable_variables(self):
-    return self.trainable_weights
+    Arguments:
+      input_shape: Instance of `TensorShape`, or list of instances of
+        `TensorShape` if the layer expects a list of inputs
+        (one instance per input).
+    """
+    self.built = True
 
-  @property
-  def non_trainable_variables(self):
-    return self.non_trainable_weights
+  @doc_controls.for_subclass_implementers
+  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
+    """This is where the layer's logic lives.
 
-  @property
-  def weights(self):
-    """Returns the list of all layer variables/weights.
+    Arguments:
+        inputs: Input tensor, or list/tuple of input tensors.
+        **kwargs: Additional keyword arguments.
 
     Returns:
-      A list of variables.
+        A tensor or list/tuple of tensors.
     """
-    return self.trainable_weights + self.non_trainable_weights
+    return inputs
 
-  @property
-  def variables(self):
-    """Returns the list of all layer variables/weights.
+  @doc_controls.for_subclass_implementers
+  def add_weight(self,
+                 name,
+                 shape,
+                 dtype=None,
+                 initializer=None,
+                 regularizer=None,
+                 trainable=None,
+                 constraint=None,
+                 partitioner=None,
+                 use_resource=None,
+                 synchronization=tf_variables.VariableSynchronization.AUTO,
+                 aggregation=tf_variables.VariableAggregation.NONE,
+                 **kwargs):
+    """Adds a new variable to the layer, or gets an existing one; returns it.
 
-    Returns:
-      A list of variables.
-    """
-    return self.weights
+    Arguments:
+      name: variable name.
+      shape: variable shape.
+      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+      initializer: initializer instance (callable).
+      regularizer: regularizer instance (callable).
+      trainable: whether the variable should be part of the layer's
+        "trainable_variables" (e.g. variables, biases)
+        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+        Note, if the current variable scope is marked as non-trainable
+        then this parameter is ignored and any added variables are also
+        marked as non-trainable. `trainable` defaults to `True` unless
+        `synchronization` is set to `ON_READ`.
+      constraint: constraint instance (callable).
+      partitioner: Partitioner to be passed to the `Checkpointable` API.
+      use_resource: Whether to use `ResourceVariable`.
+      synchronization: Indicates when a distributed a variable will be
+        aggregated. Accepted values are constants defined in the class
+        `tf.VariableSynchronization`. By default the synchronization is set to
+        `AUTO` and the current `DistributionStrategy` chooses
+        when to synchronize. If `synchronization` is set to `ON_READ`,
+        `trainable` must not be set to `True`.
+      aggregation: Indicates how a distributed variable will be aggregated.
+        Accepted values are constants defined in the class
+        `tf.VariableAggregation`.
+      **kwargs: Additional keyword arguments. Accepted values are `getter` and
+        `collections`.
 
-  @property
-  def updates(self):
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.updates not supported in Eager mode.')
-    if not self.trainable and not self.stateful:
-      return []
-    return self._updates
+    Returns:
+      The created variable.  Usually either a `Variable` or `ResourceVariable`
+      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
+      instance is returned.
 
-  @doc_controls.for_subclass_implementers
-  def add_update(self, updates, inputs=None):
-    """Add update op(s), potentially dependent on layer inputs.
+    Raises:
+      RuntimeError: If called with partioned variable regularization and
+        eager execution is enabled.
+      ValueError: When giving unsupported dtype and no initializer or when
+        trainable has been set to True with synchronization set as `ON_READ`.
+    """
+    # Validate optional keyword arguments.
+    for kwarg in kwargs:
+      if kwarg not in ['getter', 'collections']:
+        raise TypeError('Unknown keyword argument:', kwarg)
+    getter = kwargs.pop('getter', None)
+    collections = kwargs.pop('collections', None)
 
-    Weight updates (for instance, the updates of the moving mean and variance
-    in a BatchNormalization layer) may be dependent on the inputs passed
-    when calling a layer. Hence, when reusing the same layer on
-    different inputs `a` and `b`, some entries in `layer.updates` may be
-    dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
+    if dtype is None:
+      dtype = self.dtype or backend.floatx()
+    dtype = dtypes.as_dtype(dtype)
+    initializer = initializers.get(initializer)
+    regularizer = regularizers.get(regularizer)
+    constraint = constraints.get(constraint)
 
-    The `get_updates_for` method allows to retrieve the updates relevant to a
-    specific set of inputs.
+    if synchronization == tf_variables.VariableSynchronization.ON_READ:
+      if trainable:
+        raise ValueError(
+            'Synchronization value can be set to '
+            'VariableSynchronization.ON_READ only for non-trainable variables. '
+            'You have specified trainable=True and '
+            'synchronization=VariableSynchronization.ON_READ.')
+      else:
+        # Set trainable to be false when variable is to be synced on read.
+        trainable = False
+    elif trainable is None:
+      trainable = True
 
-    This call is ignored when eager execution is enabled (in that case, variable
-    updates are run on the fly and thus do not need to be tracked for later
-    execution).
+    # Initialize variable when no initializer provided
+    if initializer is None:
+      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
+      if dtype.is_floating:
+        initializer = initializers.glorot_uniform()
+      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
+      # If dtype is DT_BOOL, provide a default value `FALSE`
+      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
+        initializer = initializers.zeros()
+      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
+      else:
+        raise ValueError('An initializer for variable %s of type %s is required'
+                         ' for layer %s' % (name, dtype.base_dtype, self.name))
 
-    Arguments:
-      updates: Update op, or list/tuple of update ops.
-      inputs: If anything other than None is passed, it signals the updates
-        are conditional on some of the layer's inputs,
-        and thus they should only be run where these inputs are available.
-        This is the case for BatchNormalization updates, for instance.
-        If None, the updates will be taken into account unconditionally,
-        and you are responsible for making sure that any dependency they might
-        have is available at runtime.
-        A step counter might fall into this category.
-    """
-    if context.executing_eagerly():
-      return  # Updates already applied when in eager mode.
+    variable = self._add_variable_with_custom_getter(
+        name=name,
+        shape=shape,
+        # TODO(allenl): a `make_variable` equivalent should be added as a
+        # `Checkpointable` method.
+        getter=getter or base_layer_utils.make_variable,
+        # Manage errors in Layer rather than Checkpointable.
+        overwrite=True,
+        initializer=initializer,
+        dtype=dtype,
+        constraint=constraint,
+        trainable=trainable and self.trainable,
+        partitioner=partitioner,
+        use_resource=use_resource,
+        collections=collections,
+        synchronization=synchronization,
+        aggregation=aggregation)
+    backend.track_variable(variable)
 
-    def process_update(x):
-      if isinstance(x, ops.Operation):
-        return x
-      elif hasattr(x, 'op'):
-        return x.op
-      else:
-        return ops.convert_to_tensor(x)
+    if regularizer is not None:
+      # TODO(fchollet): in the future, this should be handled at the
+      # level of variable creation, and weight regularization losses
+      # should be variable attributes.
+      self._handle_weight_regularization(name, variable, regularizer)
 
-    updates = generic_utils.to_list(updates)
-    updates = [process_update(x) for x in updates]
-    self._updates += updates
-    if inputs is None:
-      for u in updates:
-        u._unconditional_update = True  # pylint: disable=protected-access
+    if trainable:
+      self._trainable_weights.append(variable)
     else:
-      for u in updates:
-        u._unconditional_update = False  # pylint: disable=protected-access
+      self._non_trainable_weights.append(variable)
+    return variable
 
-  def get_updates_for(self, inputs):
-    """Retrieves updates relevant to a specific set of inputs.
+  def get_config(self):
+    """Returns the config of the layer.
 
-    Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
+    A layer config is a Python dictionary (serializable)
+    containing the configuration of a layer.
+    The same layer can be reinstantiated later
+    (without its trained weights) from this configuration.
 
-    Returns:
-      List of update ops of the layer that depend on `inputs`.
+    The config of a layer does not include connectivity
+    information, nor the layer class name. These are handled
+    by `Network` (one layer of abstraction above).
 
-    Raises:
-      RuntimeError: If called in Eager mode.
+    Returns:
+        Python dictionary.
     """
-    if context.executing_eagerly():
-      raise RuntimeError('`get_updates_for()` not supported in Eager mode.')
+    config = {'name': self.name, 'trainable': self.trainable}
+    if hasattr(self, '_batch_input_shape'):
+      config['batch_input_shape'] = self._batch_input_shape
+    if hasattr(self, 'dtype'):
+      config['dtype'] = self.dtype
+    return config
 
-    # Updates disabled if layer is not trainable and not explicitly stateful.
-    if not self.trainable and not self.stateful:
-      return []
+  @classmethod
+  def from_config(cls, config):
+    """Creates a layer from its config.
 
-    if inputs is None:
-      # Requesting unconditional updates.
-      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
+    This method is the reverse of `get_config`,
+    capable of instantiating the same layer from the config
+    dictionary. It does not handle layer connectivity
+    (handled by Network), nor weights (handled by `set_weights`).
 
-    # Requesting input-conditional updates.
-    inputs = nest.flatten(inputs)
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
-    updates = []
-    for update in self.updates:
-      if update in reachable:
-        updates.append(update)
-    return updates
+    Arguments:
+        config: A Python dictionary, typically the
+            output of get_config.
 
-  @property
-  def losses(self):
-    """Losses which are associated with this `Layer`.
+    Returns:
+        A layer instance.
+    """
+    return cls(**config)
 
-    Variable regularization tensors are created when this property is accessed,
-    so it is eager safe: accessing `losses` under a `tf.GradientTape` will
-    propagate gradients back to the corresponding variables.
+  def compute_output_shape(self, input_shape):
+    """Computes the output shape of the layer.
+
+    Assumes that the layer will be built
+    to match that input shape provided.
+
+    Arguments:
+        input_shape: Shape tuple (tuple of integers)
+            or list of shape tuples (one per output tensor of the layer).
+            Shape tuples can include None for free dimensions,
+            instead of an integer.
 
     Returns:
-      A list of tensors.
+        An input shape tuple.
     """
-    collected_losses = []
     if context.executing_eagerly():
-      collected_losses.extend(self._eager_losses)
-    else:
-      collected_losses.extend(self._losses)
-    for regularizer in self._callable_losses:
-      loss_tensor = regularizer()
-      if loss_tensor is not None:
-        collected_losses.append(loss_tensor)
-    return collected_losses
-
-  @doc_controls.for_subclass_implementers
-  def add_loss(self, losses, inputs=None):
-    """Add loss tensor(s), potentially dependent on layer inputs.
-
-    Some losses (for instance, activity regularization losses) may be dependent
-    on the inputs passed when calling a layer. Hence, when reusing the same
-    layer on different inputs `a` and `b`, some entries in `layer.losses` may
-    be dependent on `a` and some on `b`. This method automatically keeps track
-    of dependencies.
+      # In this case we build the model first in order to do shape inference.
+      # This is acceptable because the framework only calls
+      # `compute_output_shape` on shape values that the layer would later be
+      # built for. It would however cause issues in case a user attempts to
+      # use `compute_output_shape` manually (these users will have to
+      # implement `compute_output_shape` themselves).
+      self.build(input_shape)
+      with context.graph_mode():
+        graph = func_graph.FuncGraph('graph')
+        with graph.as_default():
+          if isinstance(input_shape, list):
+            inputs = [base_layer_utils.generate_placeholders_from_shape(shape)
+                      for shape in input_shape]
+          else:
+            inputs = base_layer_utils.generate_placeholders_from_shape(
+                input_shape)
 
-    The `get_losses_for` method allows to retrieve the losses relevant to a
-    specific set of inputs.
+          try:
+            if self._expects_training_arg:
+              outputs = self(inputs, training=False)
+            else:
+              outputs = self(inputs)
+          except TypeError:
+            raise NotImplementedError('We could not automatically infer '
+                                      'the static shape of the layer\'s output.'
+                                      ' Please implement the '
+                                      '`compute_output_shape` method on your '
+                                      'layer (%s).' % self.__class__.__name__)
+      if isinstance(outputs, list):
+        return [output.shape for output in outputs]
+      else:
+        return outputs.shape
+    raise NotImplementedError
 
-    Note that `add_loss` is not supported when executing eagerly. Instead,
-    variable regularizers may be added through `add_variable`. Activity
-    regularization is not supported directly (but such losses may be returned
-    from `Layer.call()`).
+  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
+    """Computes an output mask tensor.
 
     Arguments:
-      losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
-        may also be zero-argument callables which create a loss tensor.
-      inputs: Ignored when executing eagerly. If anything other than None is
-        passed, it signals the losses are conditional on some of the layer's
-        inputs, and thus they should only be run where these inputs are
-        available. This is the case for activity regularization losses, for
-        instance. If `None` is passed, the losses are assumed
-        to be unconditional, and will apply across all dataflows of the layer
-        (e.g. weight regularization losses).
-    """
-    losses = generic_utils.to_list(losses)
-
-    def _tag_unconditional(loss):
-      if callable(loss):
-        loss = loss()
-      if loss is None:
-        return None  # Will be filtered out when computing the .losses property
-      if not tensor_util.is_tensor(loss):
-        loss = ops.convert_to_tensor(loss, dtype=backend.floatx())
-      loss._unconditional_loss = (inputs is None)  # pylint: disable=protected-access
-      return loss
+        inputs: Tensor or list of tensors.
+        mask: Tensor or list of tensors.
 
-    for loss in losses:
-      if callable(loss):
-        self._callable_losses.append(
-            functools.partial(_tag_unconditional, loss))
-      else:
-        if context.executing_eagerly():
-          self._eager_losses.append(_tag_unconditional(loss))
+    Returns:
+        None or a tensor (or list of tensors,
+            one per output tensor of the layer).
+    """
+    if not self.supports_masking:
+      if mask is not None:
+        if isinstance(mask, list):
+          if any(m is not None for m in mask):
+            raise TypeError('Layer ' + self.name + ' does not support masking, '
+                            'but was passed an input_mask: ' + str(mask))
         else:
-          self._losses.append(_tag_unconditional(loss))
+          raise TypeError('Layer ' + self.name + ' does not support masking, '
+                          'but was passed an input_mask: ' + str(mask))
+      # masking not explicitly supported: return None as mask
+      return None
+    # if masking is explicitly supported, by default
+    # carry over the input mask
+    return mask
 
-  def get_losses_for(self, inputs):
-    """Retrieves losses relevant to a specific set of inputs.
+  def __call__(self, inputs, *args, **kwargs):
+    """Wraps `call`, applying pre- and post-processing steps.
 
     Arguments:
-      inputs: Input tensor or list/tuple of input tensors.
+      inputs: input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
 
     Returns:
-      List of loss tensors of the layer that depend on `inputs`.
+      Output tensor(s).
+
+    Note:
+      - The following optional keyword arguments are reserved for specific uses:
+        * `training`: Boolean scalar tensor of Python boolean indicating
+          whether the `call` is meant for training or inference.
+        * `mask`: Boolean input mask.
+      - If the layer's `call` method takes a `mask` argument (as some Keras
+        layers do), its default value will be set to the mask generated
+        for `inputs` by the previous layer (if `input` did come from
+        a layer that generated a corresponding mask, i.e. if it came from
+        a Keras layer with masking support.
 
     Raises:
-      RuntimeError: If called in Eager mode.
+      ValueError: if the layer's `call` method returns None (an invalid value).
     """
-    if context.executing_eagerly():
-      raise RuntimeError('Layer.get_losses_for not supported in Eager mode.')
-
-    if inputs is None:
-      # Requesting unconditional losses.
-      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+    input_list = nest.flatten(inputs)
 
-    # Requesting input-conditional losses.
-    inputs = nest.flatten(inputs)
-    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
-    # The losses we want to return will be part of this set.
-    # To avoid unnecessary work, we stop the search in case all of
-    # `self.losses` have been retrieved.
-    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
-    losses = []
-    for loss in self.losses:
-      if loss in reachable:
-        losses.append(loss)
-    return losses
+    if context.executing_eagerly():
+      # Accept NumPy inputs by converting to Tensors when executing eagerly.
+      if all(isinstance(x, (np.ndarray, float, int)) for x in input_list):
+        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
+        input_list = nest.flatten(inputs)
 
-  def _name_scope(self):
-    return self.name
+    # We will attempt to build a TF graph if & only if all inputs are symbolic.
+    # This is always the case in graph mode. It can also be the case in eager
+    # mode when all inputs can be traced back to `keras.Input()` (when building
+    # models using the functional API).
+    build_graph = tf_utils.are_all_symbolic_tensors(input_list)
+    executing_eagerly = context.executing_eagerly()
 
-  def build(self, input_shape):
-    """Creates the variables of the layer."""
-    self.built = True
+    # Handle Keras mask propagation from previous layer to current layer.
+    previous_mask = None
+    if build_graph and (not hasattr(self, '_compute_previous_mask') or
+                        self._compute_previous_mask):
+      previous_mask = base_layer_utils.collect_previous_mask(inputs)
+      if not hasattr(self, '_call_fn_args'):
+        self._call_fn_args = self._no_dependency(
+            function_utils.fn_args(self.call))
+      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
+          not generic_utils.is_all_none(previous_mask)):
+        # The previous layer generated a mask, and mask was not explicitly pass
+        # to __call__, hence we set previous_mask as the default value.
+        kwargs['mask'] = previous_mask
 
-  @doc_controls.for_subclass_implementers
-  def add_variable(self, *args, **kwargs):
-    """Alias for `add_weight`."""
-    return self.add_weight(*args, **kwargs)
+    input_shapes = None
 
-  @doc_controls.for_subclass_implementers
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer=None,
-                 regularizer=None,
-                 trainable=None,
-                 constraint=None,
-                 partitioner=None,
-                 use_resource=None,
-                 synchronization=tf_variables.VariableSynchronization.AUTO,
-                 aggregation=tf_variables.VariableAggregation.NONE,
-                 **kwargs):
-    """Adds a new variable to the layer, or gets an existing one; returns it.
-
-    Arguments:
-      name: variable name.
-      shape: variable shape.
-      dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-      initializer: initializer instance (callable).
-      regularizer: regularizer instance (callable).
-      trainable: whether the variable should be part of the layer's
-        "trainable_variables" (e.g. variables, biases)
-        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-        Note, if the current variable scope is marked as non-trainable
-        then this parameter is ignored and any added variables are also
-        marked as non-trainable. `trainable` defaults to `True` unless
-        `synchronization` is set to `ON_READ`.
-      constraint: constraint instance (callable).
-      partitioner: Partitioner to be passed to the `Checkpointable` API.
-      use_resource: Whether to use `ResourceVariable`.
-      synchronization: Indicates when a distributed a variable will be
-        aggregated. Accepted values are constants defined in the class
-        `tf.VariableSynchronization`. By default the synchronization is set to
-        `AUTO` and the current `DistributionStrategy` chooses
-        when to synchronize. If `synchronization` is set to `ON_READ`,
-        `trainable` must not be set to `True`.
-      aggregation: Indicates how a distributed variable will be aggregated.
-        Accepted values are constants defined in the class
-        `tf.VariableAggregation`.
-      **kwargs: Additional keyword arguments. Accepted values are `getter` and
-        `collections`.
-
-    Returns:
-      The created variable.  Usually either a `Variable` or `ResourceVariable`
-      instance.  If `partitioner` is not `None`, a `PartitionedVariable`
-      instance is returned.
-
-    Raises:
-      RuntimeError: If called with partioned variable regularization and
-        eager execution is enabled.
-      ValueError: When giving unsupported dtype and no initializer or when
-        trainable has been set to True with synchronization set as `ON_READ`.
-    """
-    # Validate optional keyword arguments.
-    for kwarg in kwargs:
-      if kwarg not in ['getter', 'collections']:
-        raise TypeError('Unknown keyword argument:', kwarg)
-    getter = kwargs.pop('getter', None)
-    collections = kwargs.pop('collections', None)
-
-    if dtype is None:
-      dtype = self.dtype or backend.floatx()
-    dtype = dtypes.as_dtype(dtype)
-    initializer = initializers.get(initializer)
-    regularizer = regularizers.get(regularizer)
-    constraint = constraints.get(constraint)
-
-    if synchronization == tf_variables.VariableSynchronization.ON_READ:
-      if trainable:
-        raise ValueError(
-            'Synchronization value can be set to '
-            'VariableSynchronization.ON_READ only for non-trainable variables. '
-            'You have specified trainable=True and '
-            'synchronization=VariableSynchronization.ON_READ.')
-      else:
-        # Set trainable to be false when variable is to be synced on read.
-        trainable = False
-    elif trainable is None:
-      trainable = True
-
-    # Initialize variable when no initializer provided
-    if initializer is None:
-      # If dtype is DT_FLOAT, provide a uniform unit scaling initializer
-      if dtype.is_floating:
-        initializer = initializers.glorot_uniform()
-      # If dtype is DT_INT/DT_UINT, provide a default value `zero`
-      # If dtype is DT_BOOL, provide a default value `FALSE`
-      elif dtype.is_integer or dtype.is_unsigned or dtype.is_bool:
-        initializer = initializers.zeros()
-      # NOTES:Do we need to support for handling DT_STRING and DT_COMPLEX here?
-      else:
-        raise ValueError('An initializer for variable %s of type %s is required'
-                         ' for layer %s' % (name, dtype.base_dtype, self.name))
-
-    variable = self._add_variable_with_custom_getter(
-        name=name,
-        shape=shape,
-        # TODO(allenl): a `make_variable` equivalent should be added as a
-        # `Checkpointable` method.
-        getter=getter or make_variable,
-        # Manage errors in Layer rather than Checkpointable.
-        overwrite=True,
-        initializer=initializer,
-        dtype=dtype,
-        constraint=constraint,
-        trainable=trainable and self.trainable,
-        partitioner=partitioner,
-        use_resource=use_resource,
-        collections=collections,
-        synchronization=synchronization,
-        aggregation=aggregation)
-    backend.track_variable(variable)
-
-    if regularizer is not None:
-      # TODO(fchollet): in the future, this should be handled at the
-      # level of variable creation, and weight regularization losses
-      # should be variable attributes.
-      self._handle_weight_regularization(name, variable, regularizer)
-
-    if trainable:
-      self._trainable_weights.append(variable)
-    else:
-      self._non_trainable_weights.append(variable)
-    return variable
-
-  def _handle_weight_regularization(self, name, variable, regularizer):
-    """Create lambdas which compute regularization losses."""
-
-    def _loss_for_variable(v):
-      """Creates a regularization loss `Tensor` for variable `v`."""
-      with ops.colocate_with(v):
-        with ops.name_scope(name + '/Regularizer'):
-          regularization = regularizer(v)
-      return regularization
-
-    if isinstance(variable, tf_variables.PartitionedVariable):
-      for v in variable:
-        self.add_loss(functools.partial(_loss_for_variable, v))
-    else:
-      self.add_loss(functools.partial(_loss_for_variable, variable))
-
-  def _handle_activity_regularization(self, inputs, outputs):
-    # Apply activity regularization.
-    # Note that it should be applied every time the layer creates a new
-    # output, since it is output-specific.
-    if self._activity_regularizer:
-      output_list = nest.flatten(outputs)
-      with ops.name_scope('ActivityRegularizer'):
-        for output in output_list:
-          activity_loss = self._activity_regularizer(output)
-          batch_size = math_ops.cast(
-              array_ops.shape(output)[0], activity_loss.dtype)
-          # Make activity regularization strength batch-agnostic.
-          mean_activity_loss = activity_loss / batch_size
-          self.add_loss(mean_activity_loss, inputs=inputs)
-
-  @doc_controls.for_subclass_implementers
-  def call(self, inputs, **kwargs):  # pylint: disable=unused-argument
-    """This is where the layer's logic lives.
-
-    Arguments:
-        inputs: Input tensor, or list/tuple of input tensors.
-        **kwargs: Additional keyword arguments.
-
-    Returns:
-        A tensor or list/tuple of tensors.
-    """
-    return inputs
-
-  def __call__(self, inputs, *args, **kwargs):
-    """Wraps `call`, applying pre- and post-processing steps.
-
-    Arguments:
-      inputs: input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
-
-    Returns:
-      Output tensor(s).
-
-    Note:
-      - The following optional keyword arguments are reserved for specific uses:
-        * `training`: Boolean scalar tensor of Python boolean indicating
-          whether the `call` is meant for training or inference.
-        * `mask`: Boolean input mask.
-      - If the layer's `call` method takes a `mask` argument (as some Keras
-        layers do), its default value will be set to the mask generated
-        for `inputs` by the previous layer (if `input` did come from
-        a layer that generated a corresponding mask, i.e. if it came from
-        a Keras layer with masking support.
-
-    Raises:
-      ValueError: if the layer's `call` method returns None (an invalid value).
-    """
-    input_list = nest.flatten(inputs)
-
-    if context.executing_eagerly():
-      # Accept NumPy inputs by converting to Tensors when executing eagerly.
-      if all([isinstance(x, (np.ndarray, float, int)) for x in input_list]):
-        inputs = nest.map_structure(ops.convert_to_tensor, inputs)
-        input_list = nest.flatten(inputs)
-
-    # We will attempt to build a TF graph if & only if all inputs are symbolic.
-    # This is always the case in graph mode. It can also be the case in eager
-    # mode when all inputs can be traced back to `keras.Input()` (when building
-    # models using the functional API).
-    build_graph = tf_utils.are_all_symbolic_tensors(input_list)
-    executing_eagerly = context.executing_eagerly()
-
-    # Handle Keras mask propagation from previous layer to current layer.
-    previous_mask = None
-    if build_graph and (not hasattr(self, '_compute_previous_mask') or
-                        self._compute_previous_mask):
-      previous_mask = collect_previous_mask(inputs)
-      if not hasattr(self, '_call_fn_args'):
-        self._call_fn_args = self._no_dependency(
-            function_utils.fn_args(self.call))
-      if ('mask' in self._call_fn_args and 'mask' not in kwargs and
-          not generic_utils.is_all_none(previous_mask)):
-        # The previous layer generated a mask, and mask was not explicitly pass
-        # to __call__, hence we set previous_mask as the default value.
-        kwargs['mask'] = previous_mask
-
-    input_shapes = None
-
-    with ops.name_scope(self._name_scope()):
-      if not self.built:
-        # Check input assumptions set before layer building, e.g. input rank.
-        self._assert_input_compatibility(inputs)
-        if input_list and self._dtype is None:
-          try:
-            self._dtype = input_list[0].dtype.base_dtype.name
-          except AttributeError:
-            pass
-
-        if all(hasattr(x, 'shape') for x in input_list):
-          input_shapes = nest.map_structure(lambda x: x.shape, inputs)
-
-        if (not hasattr(self, '_is_graph_network') or
-            self.__class__.__name__ == 'Sequential' or
-            not hasattr(self.build, '_is_default')):
-          # Only if self is a layer, an instance of a sequential model, or
-          # the user has manually overwritten the build method do we need to
-          # build it.
-          self.build(input_shapes)
-        # We must set self.built since user defined build functions are not
-        # constrained to set self.built.
-        self.built = True
+    with ops.name_scope(self._name_scope()):
+      if not self.built:
+        # Build layer if applicable (if the `build` method has been overridden).
+        self._maybe_build(inputs)
+        # We must set self.built since user defined build functions are not
+        # constrained to set self.built.
+        self.built = True
 
       # Check input assumptions set after layer building, e.g. input shape.
       if build_graph:
         # Symbolic execution on symbolic tensors. We will attempt to build
         # the corresponding TF subgraph inside `backend.get_graph()`
-        self._assert_input_compatibility(inputs)
+        input_spec.assert_input_compatibility(
+            self.input_spec, inputs, self.name)
         graph = backend.get_graph()
         with graph.as_default():
           if not executing_eagerly:
@@ -772,10 +554,10 @@ class Layer(checkpointable.CheckpointableBase):
               # Any issue during graph-building means we will later run the
               # model in eager mode, whether the issue was related to
               # graph mode or not. This provides a nice debugging experience.
-              self._is_static_graph_friendly = False
+              self._call_is_graph_friendly = False
               # We will use static shape inference to return symbolic tensors
               # matching the specifications of the layer outputs.
-              # Since we have set `self._is_static_graph_friendly = False`,
+              # Since we have set `self._call_is_graph_friendly = False`,
               # we will never attempt to run the underlying TF graph (which is
               # disconnected).
               # TODO(fchollet): consider py_func as an alternative, which
@@ -792,7 +574,7 @@ class Layer(checkpointable.CheckpointableBase):
                              '(layer: ' + self.name + ').')
           self._handle_activity_regularization(inputs, outputs)
           self._set_mask_metadata(inputs, outputs, previous_mask)
-          if have_all_keras_metadata(inputs):
+          if base_layer_utils.have_all_keras_metadata(inputs):
             inputs, outputs = self._set_connectivity_metadata_(
                 inputs, outputs, args, kwargs)
           if hasattr(self, '_set_inputs') and not self.inputs:
@@ -815,313 +597,299 @@ class Layer(checkpointable.CheckpointableBase):
         del self._initial_weights
     return outputs
 
-  def apply(self, inputs, *args, **kwargs):
-    """Apply the layer on a input.
+  @property
+  def dtype(self):
+    return self._dtype
 
-    This simply wraps `self.__call__`.
+  @property
+  def name(self):
+    return self._name
 
-    Arguments:
-      inputs: Input tensor(s).
-      *args: additional positional arguments to be passed to `self.call`.
-      **kwargs: additional keyword arguments to be passed to `self.call`.
+  @property
+  def activity_regularizer(self):
+    """Optional regularizer function for the output of this layer."""
+    return self._activity_regularizer
+
+  @activity_regularizer.setter
+  def activity_regularizer(self, regularizer):
+    """Optional regularizer function for the output of this layer."""
+    self._activity_regularizer = self._no_dependency(regularizer)
+
+  @property
+  def trainable_weights(self):
+    return self._trainable_weights if self.trainable else []
+
+  @property
+  def non_trainable_weights(self):
+    if self.trainable:
+      return self._non_trainable_weights
+    else:
+      return self._trainable_weights + self._non_trainable_weights
+
+  @property
+  def weights(self):
+    """Returns the list of all layer variables/weights.
 
     Returns:
-      Output tensor(s).
+      A list of variables.
     """
-    return self.__call__(inputs, *args, **kwargs)
+    return self.trainable_weights + self.non_trainable_weights
 
-  def _set_mask_metadata(self, inputs, outputs, previous_mask):
-    # In some cases the mask of the outputs has already been computed by
-    # inner layers and does not need to be recomputed by this layer.
-    mask_already_computed = all(
-        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
-    if hasattr(self, 'compute_mask') and not mask_already_computed:
-      output_mask = self.compute_mask(inputs, previous_mask)
-    else:
-      output_mask = None
-    if isinstance(outputs, (list, tuple)):
-      if output_mask is None:
-        output_mask = [None for _ in range(len(outputs))]
-      for x, m in zip(outputs, output_mask):
-        try:
-          x._keras_mask = m  # pylint: disable=protected-access
-        except AttributeError:
-          pass  # C type such as dict. Masking not supported in this case.
-    else:
-      try:
-        outputs._keras_mask = output_mask  # pylint: disable=protected-access
-      except AttributeError:
-        pass  # C type such as dict. Masking not supported in this case.
+  @property
+  def updates(self):
+    if not self.trainable and not self.stateful:
+      return []
+    return self._updates
 
-  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
-    call_convention = getattr(self, '_call_convention',
-                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
-    if args:
-      if call_convention == CallConvention.EXPLICIT_INPUTS_ARGUMENT:
-        raise TypeError(
-            'This layer ("{}") takes an `inputs` argument in `call()`, '
-            'and only the `inputs` argument may be specified as a positional '
-            'argument. Pass everything else as a keyword argument '
-            '(those arguments will not be tracked '
-            'as inputs to the layer).'.format(self.name))
-      elif call_convention == CallConvention.SINGLE_POSITIONAL_ARGUMENT:
-        raise TypeError(
-            'This layer ("{}") takes a single positional argument in `call()`,'
-            ' which is by convention the `inputs` argument, '
-            'and only this argument may be specified as a positional argument. '
-            'Pass everything else as a keyword argument '
-            '(those arguments will not be tracked '
-            'as inputs to the layer).'.format(self.name))
+  @property
+  def losses(self):
+    """Losses which are associated with this `Layer`.
 
-    # If the layer returns tensors from its inputs, unmodified,
-    # we copy them to avoid loss of tensor metadata.
-    output_ls = nest.flatten(outputs)
-    output_ls_copy = []
-    for x in output_ls:
-      if x in nest.flatten(inputs):
-        with ops.name_scope(self.name):
-          x = array_ops.identity(x)
-      output_ls_copy.append(x)
-    if len(output_ls_copy) == 1:
-      outputs = output_ls_copy[0]
+    Variable regularization tensors are created when this property is accessed,
+    so it is eager safe: accessing `losses` under a `tf.GradientTape` will
+    propagate gradients back to the corresponding variables.
+
+    Returns:
+      A list of tensors.
+    """
+    collected_losses = []
+    if context.executing_eagerly():
+      collected_losses.extend(self._eager_losses)
     else:
-      outputs = output_ls_copy
+      collected_losses.extend(self._losses)
+    for regularizer in self._callable_losses:
+      loss_tensor = regularizer()
+      if loss_tensor is not None:
+        collected_losses.append(loss_tensor)
+    return collected_losses
 
-    inputs, kwargs = self._inputs_from_call_args(
-        call_args=(inputs,) + args, call_kwargs=kwargs)
-    # Add an inbound node to the layer, so it can keep track of this call.
-    # This updates the layer history of the output tensor(s).
-    kwargs.pop('mask', None)  # `mask` should not be serialized.
-    self._add_inbound_node(
-        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
-    return inputs, outputs
+  @doc_controls.for_subclass_implementers
+  def add_loss(self, losses, inputs=None):
+    """Add loss tensor(s), potentially dependent on layer inputs.
 
-  def _inputs_from_call_args(self, call_args, call_kwargs):
-    """Get Layer inputs from __call__ *args and **kwargs.
+    Some losses (for instance, activity regularization losses) may be dependent
+    on the inputs passed when calling a layer. Hence, when reusing the same
+    layer on different inputs `a` and `b`, some entries in `layer.losses` may
+    be dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
 
-    Args:
-      call_args: The positional arguments passed to __call__.
-      call_kwargs: The keyword argument dict passed to __call__.
+    The `get_losses_for` method allows to retrieve the losses relevant to a
+    specific set of inputs.
 
-    Returns:
-      A tuple of (inputs, non_input_kwargs). These may be the same objects as
-      were passed in (call_args and call_kwargs).
+    Note that `add_loss` is not supported when executing eagerly. Instead,
+    variable regularizers may be added through `add_variable`. Activity
+    regularization is not supported directly (but such losses may be returned
+    from `Layer.call()`).
+
+    Arguments:
+      losses: Loss tensor, or list/tuple of tensors. Rather than tensors, losses
+        may also be zero-argument callables which create a loss tensor.
+      inputs: Ignored when executing eagerly. If anything other than None is
+        passed, it signals the losses are conditional on some of the layer's
+        inputs, and thus they should only be run where these inputs are
+        available. This is the case for activity regularization losses, for
+        instance. If `None` is passed, the losses are assumed
+        to be unconditional, and will apply across all dataflows of the layer
+        (e.g. weight regularization losses).
     """
-    call_convention = getattr(self, '_call_convention',
-                              CallConvention.EXPLICIT_INPUTS_ARGUMENT)
-    if (call_convention in (
-        CallConvention.EXPLICIT_INPUTS_ARGUMENT,
-        CallConvention.SINGLE_POSITIONAL_ARGUMENT)):
-      assert len(call_args) == 1  # TypeError raised earlier in __call__.
-      return call_args[0], call_kwargs
-    else:
-      call_arg_spec = tf_inspect.getfullargspec(self.call)
-      # There is no explicit "inputs" argument expected or provided to
-      # call(). Arguments which have default values are considered non-inputs,
-      # and arguments without are considered inputs.
-      if call_arg_spec.defaults:
-        if call_arg_spec.varargs is not None:
-          raise TypeError(
-              'Layers may not accept both positional arguments and '
-              'arguments with default values (unable to determine which '
-              'are inputs to the layer). '
-              'Issue occurred with layer "%s"' % (self.name))
-        keyword_arg_names = set(
-            call_arg_spec.args[-len(call_arg_spec.defaults):])
+    losses = generic_utils.to_list(losses)
+
+    def _tag_unconditional(loss):
+      if callable(loss):
+        loss = loss()
+      if loss is None:
+        return None  # Will be filtered out when computing the .losses property
+      if not tensor_util.is_tensor(loss):
+        loss = ops.convert_to_tensor(loss, dtype=backend.floatx())
+      loss._unconditional_loss = (inputs is None)  # pylint: disable=protected-access
+      return loss
+
+    for loss in losses:
+      if callable(loss):
+        self._callable_losses.append(
+            functools.partial(_tag_unconditional, loss))
       else:
-        keyword_arg_names = set()
-        # Training is never an input argument name, to allow signatures like
-        # call(x, training).
-      keyword_arg_names.add('training')
-      _, unwrapped_call = tf_decorator.unwrap(self.call)
-      bound_args = inspect.getcallargs(
-          unwrapped_call, *call_args, **call_kwargs)
-      if call_arg_spec.varkw is not None:
-        var_kwargs = bound_args.pop(call_arg_spec.varkw)
-        bound_args.update(var_kwargs)
-        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
-      all_args = call_arg_spec.args
-      if all_args and bound_args[all_args[0]] is self:
-        # Ignore the 'self' argument of methods
-        bound_args.pop(call_arg_spec.args[0])
-        all_args = all_args[1:]
-      non_input_arg_values = {}
-      input_arg_values = []
-      remaining_args_are_keyword = False
-      for argument_name in all_args:
-        if argument_name in keyword_arg_names:
-          remaining_args_are_keyword = True
-        else:
-          if remaining_args_are_keyword:
-            raise TypeError(
-                'Found a positional argument in a layer call after a non-input '
-                'argument. All arguments after "training" must be keyword '
-                'arguments, and are not tracked as inputs to the layer. '
-                'Issue occurred with layer "%s"' % (self.name))
-        if remaining_args_are_keyword:
-          non_input_arg_values[argument_name] = bound_args[argument_name]
+        if context.executing_eagerly():
+          self._eager_losses.append(_tag_unconditional(loss))
         else:
-          input_arg_values.append(bound_args[argument_name])
-      if call_arg_spec.varargs is not None:
-        input_arg_values.extend(bound_args[call_arg_spec.varargs])
-      return input_arg_values, non_input_arg_values
+          self._losses.append(_tag_unconditional(loss))
 
-  def compute_output_shape(self, input_shape):
-    """Computes the output shape of the layer.
+  @doc_controls.for_subclass_implementers
+  def add_metric(self, value, aggregation=None, name=None):
+    """Adds metric tensor to the layer.
 
-    Assumes that the layer will be built
-    to match that input shape provided.
+    Args:
+      value: Metric tensor.
+      aggregation: Sample-wise metric reduction function. If `aggregation=None`,
+        it indicates that the metric tensor provided has been aggregated
+        already. eg, `model.add_metric(BinaryAccuracy(name='acc')(y_true,
+        y_pred))`. If aggregation='mean', the given metric tensor will be
+        sample-wise reduced using `mean` function. eg, `model.add_metric(
+        tf.reduce_mean(outputs), name='output_mean', aggregation='mean')`.
+      name: String metric name.
 
-    Arguments:
-        input_shape: Shape tuple (tuple of integers)
-            or list of shape tuples (one per output tensor of the layer).
-            Shape tuples can include None for free dimensions,
-            instead of an integer.
+    Raises:
+      ValueError: If `aggregation` is anything other than None or `mean`.
+    """
+    if aggregation is not None and aggregation != 'mean':
+      raise ValueError(
+          'We currently support only `mean` sample-wise metric aggregation. '
+          'You provided aggregation=`%s`' % aggregation)
 
-    Returns:
-        An input shape tuple.
+    if tf_utils.is_symbolic_tensor(value):
+      self._symbolic_add_metric(value, aggregation, name)
+    else:
+      self._eager_add_metric(value, aggregation, name)
+
+  @doc_controls.for_subclass_implementers
+  def add_update(self, updates, inputs=None):
+    """Add update op(s), potentially dependent on layer inputs.
+
+    Weight updates (for instance, the updates of the moving mean and variance
+    in a BatchNormalization layer) may be dependent on the inputs passed
+    when calling a layer. Hence, when reusing the same layer on
+    different inputs `a` and `b`, some entries in `layer.updates` may be
+    dependent on `a` and some on `b`. This method automatically keeps track
+    of dependencies.
+
+    The `get_updates_for` method allows to retrieve the updates relevant to a
+    specific set of inputs.
+
+    This call is ignored when eager execution is enabled (in that case, variable
+    updates are run on the fly and thus do not need to be tracked for later
+    execution).
+
+    Arguments:
+      updates: Update op, or list/tuple of update ops.
+      inputs: If anything other than None is passed, it signals the updates
+        are conditional on some of the layer's inputs,
+        and thus they should only be run where these inputs are available.
+        This is the case for BatchNormalization updates, for instance.
+        If None, the updates will be taken into account unconditionally,
+        and you are responsible for making sure that any dependency they might
+        have is available at runtime.
+        A step counter might fall into this category.
     """
     if context.executing_eagerly():
-      # In this case we build the model first in order to do shape inference.
-      # This is acceptable because the framework only calls
-      # `compute_output_shape` on shape values that the layer would later be
-      # built for. It would however cause issues in case a user attempts to
-      # use `compute_output_shape` manually (these users will have to
-      # implement `compute_output_shape` themselves).
-      self.build(input_shape)
-
-      with context.graph_mode():
-        graph = func_graph.FuncGraph('graph')
-        with graph.as_default():
-          if isinstance(input_shape, list):
-            inputs = [generate_placeholders_from_shape(shape)
-                      for shape in input_shape]
-          else:
-            inputs = generate_placeholders_from_shape(input_shape)
+      return  # Updates already applied when in eager mode.
 
-          try:
-            if self._expects_training_arg:
-              outputs = self(inputs, training=False)
-            else:
-              outputs = self(inputs)
-          except TypeError:
-            raise NotImplementedError('We could not automatically infer '
-                                      'the static shape of the layer\'s output.'
-                                      ' Please implement the '
-                                      '`compute_output_shape` method on your '
-                                      'layer (%s).' % self.__class__.__name__)
-      if isinstance(outputs, list):
-        return [output.shape for output in outputs]
+    def process_update(x):
+      if isinstance(x, ops.Operation):
+        return x
+      elif hasattr(x, 'op'):
+        return x.op
       else:
-        return outputs.shape
-    raise NotImplementedError
+        return ops.convert_to_tensor(x)
 
-  def compute_mask(self, inputs, mask=None):  # pylint: disable=unused-argument
-    """Computes an output mask tensor.
+    updates = generic_utils.to_list(updates)
+    updates = [process_update(x) for x in updates]
+    self._updates += updates
+    if inputs is None:
+      for u in updates:
+        u._unconditional_update = True  # pylint: disable=protected-access
+    else:
+      for u in updates:
+        u._unconditional_update = False  # pylint: disable=protected-access
+
+  def set_weights(self, weights):
+    """Sets the weights of the layer, from Numpy arrays.
 
     Arguments:
-        inputs: Tensor or list of tensors.
-        mask: Tensor or list of tensors.
+        weights: a list of Numpy arrays. The number
+            of arrays and their shape must match
+            number of the dimensions of the weights
+            of the layer (i.e. it should match the
+            output of `get_weights`).
+
+    Raises:
+        ValueError: If the provided weights list does not match the
+            layer's specifications.
+    """
+    params = self.weights
+    if len(params) != len(weights):
+      raise ValueError('You called `set_weights(weights)` on layer "' +
+                       self.name + '" with a  weight list of length ' +
+                       str(len(weights)) + ', but the layer was expecting ' +
+                       str(len(params)) + ' weights. Provided weights: ' +
+                       str(weights)[:50] + '...')
+    if not params:
+      return
+    weight_value_tuples = []
+    param_values = backend.batch_get_value(params)
+    for pv, p, w in zip(param_values, params, weights):
+      if pv.shape != w.shape:
+        raise ValueError('Layer weight shape ' + str(pv.shape) +
+                         ' not compatible with '
+                         'provided weight shape ' + str(w.shape))
+      weight_value_tuples.append((p, w))
+    backend.batch_set_value(weight_value_tuples)
+
+  def get_weights(self):
+    """Returns the current weights of the layer.
 
     Returns:
-        None or a tensor (or list of tensors,
-            one per output tensor of the layer).
+        Weights values as a list of numpy arrays.
     """
-    if not self.supports_masking:
-      if mask is not None:
-        if isinstance(mask, list):
-          if any(m is not None for m in mask):
-            raise TypeError('Layer ' + self.name + ' does not support masking, '
-                            'but was passed an input_mask: ' + str(mask))
-        else:
-          raise TypeError('Layer ' + self.name + ' does not support masking, '
-                          'but was passed an input_mask: ' + str(mask))
-      # masking not explicitly supported: return None as mask
-      return None
-    # if masking is explicitly supported, by default
-    # carry over the input mask
-    return mask
+    params = self.weights
+    return backend.batch_get_value(params)
 
-  def _add_inbound_node(self,
-                        input_tensors,
-                        output_tensors,
-                        arguments=None):
-    """Internal method to create an inbound node for the layer.
+  def get_updates_for(self, inputs):
+    """Retrieves updates relevant to a specific set of inputs.
 
     Arguments:
-        input_tensors: list of input tensors.
-        output_tensors: list of output tensors.
-        arguments: dictionary of keyword arguments that were passed to the
-            `call` method of the layer at the call that created the node.
-    """
-    input_tensors = nest.flatten(input_tensors)
-    output_tensors = nest.flatten(output_tensors)
+      inputs: Input tensor or list/tuple of input tensors.
 
-    # Collect input tensor(s) coordinates.
-    inbound_layers = []
-    node_indices = []
-    tensor_indices = []
-    for x in input_tensors:
-      assert hasattr(x, '_keras_history')
-      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
-      inbound_layers.append(inbound_layer)
-      node_indices.append(node_index)
-      tensor_indices.append(tensor_index)
+    Returns:
+      List of update ops of the layer that depend on `inputs`.
 
-    # Create node, add it to inbound nodes.
-    Node(
-        self,
-        inbound_layers=inbound_layers,
-        node_indices=node_indices,
-        tensor_indices=tensor_indices,
-        input_tensors=input_tensors,
-        output_tensors=output_tensors,
-        arguments=arguments)
+    Raises:
+      RuntimeError: If called in Eager mode.
+    """
+    # Updates disabled if layer is not trainable and not explicitly stateful.
+    if not self.trainable and not self.stateful:
+      return []
 
-    # Update tensor history metadata.
-    for i in range(len(output_tensors)):
-      # The metadata attribute consists of 1) a layer instance
-      # 2) a node index for the layer, 3) a tensor index for the node.
-      # The allows layer reuse (multiple nodes per layer) and multi-output
-      # or multi-input layers (e.g. a layer can return multiple tensors,
-      # and each can be sent to a different layer).
-      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
+    if inputs is None:
+      # Requesting unconditional updates.
+      return [x for x in self.updates if x._unconditional_update]  # pylint: disable=protected-access
 
-  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
-    """Private utility to retrieves an attribute (e.g. inputs) from a node.
+    # Requesting input-conditional updates.
+    inputs = nest.flatten(inputs)
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.updates)
+    updates = []
+    for update in self.updates:
+      if update in reachable:
+        updates.append(update)
+    return updates
 
-    This is used to implement the methods:
-        - get_input_shape_at
-        - get_output_shape_at
-        - get_input_at
-        etc...
+  def get_losses_for(self, inputs):
+    """Retrieves losses relevant to a specific set of inputs.
 
     Arguments:
-        node_index: Integer index of the node from which
-            to retrieve the attribute.
-        attr: Exact node attribute name.
-        attr_name: Human-readable attribute name, for error messages.
+      inputs: Input tensor or list/tuple of input tensors.
 
     Returns:
-        The layer's attribute `attr` at the node of index `node_index`.
+      List of loss tensors of the layer that depend on `inputs`.
 
     Raises:
-        RuntimeError: If the layer has no inbound nodes, or if called in Eager
-        mode.
-        ValueError: If the index provided does not match any node.
+      RuntimeError: If called in Eager mode.
     """
-    if not self._inbound_nodes:
-      raise RuntimeError('The layer has never been called '
-                         'and thus has no defined ' + attr_name + '.')
-    if not len(self._inbound_nodes) > node_index:
-      raise ValueError('Asked to get ' + attr_name + ' at node ' +
-                       str(node_index) + ', but the layer has only ' +
-                       str(len(self._inbound_nodes)) + ' inbound nodes.')
-    values = getattr(self._inbound_nodes[node_index], attr)
-    if len(values) == 1:
-      return values[0]
-    else:
-      return values
+    if inputs is None:
+      # Requesting unconditional losses.
+      return [x for x in self.losses if x._unconditional_loss]  # pylint: disable=protected-access
+
+    # Requesting input-conditional losses.
+    inputs = nest.flatten(inputs)
+    # Retrieve the set of tensors in the TF graph that depend on `inputs`.
+    # The losses we want to return will be part of this set.
+    # To avoid unnecessary work, we stop the search in case all of
+    # `self.losses` have been retrieved.
+    reachable = tf_utils.get_reachable_from_inputs(inputs, self.losses)
+    losses = []
+    for loss in self.losses:
+      if loss in reachable:
+        losses.append(loss)
+    return losses
 
   def get_input_mask_at(self, node_index):
     """Retrieves the input mask tensor(s) of a layer at a given node.
@@ -1376,8 +1144,7 @@ class Layer(checkpointable.CheckpointableBase):
                          ', but the layer isn\'t built. '
                          'You can build it manually via: `' + self.name +
                          '.build(batch_input_shape)`.')
-    weight_shapes = [w.shape.as_list() for w in self.weights]
-    return int(sum([np.prod(w) for w in weight_shapes]))
+    return int(sum(np.prod(w.shape.as_list()) for w in self.weights))
 
   @property
   def output_shape(self):
@@ -1429,231 +1196,400 @@ class Layer(checkpointable.CheckpointableBase):
     """Deprecated, do NOT use! Only for compatibility with external Keras."""
     return self._outbound_nodes
 
-  def _assert_input_compatibility(self, inputs):
-    """Checks compatibility between the layer and provided inputs.
+  ##############################################################################
+  # Methods & attributes below are public aliases of other methods.            #
+  ##############################################################################
+
+  def apply(self, inputs, *args, **kwargs):
+    """Apply the layer on a input.
+
+    This is an alias of `self.__call__`.
+
+    Arguments:
+      inputs: Input tensor(s).
+      *args: additional positional arguments to be passed to `self.call`.
+      **kwargs: additional keyword arguments to be passed to `self.call`.
+
+    Returns:
+      Output tensor(s).
+    """
+    return self.__call__(inputs, *args, **kwargs)
+
+  @doc_controls.for_subclass_implementers
+  def add_variable(self, *args, **kwargs):
+    """Alias for `add_weight`."""
+    return self.add_weight(*args, **kwargs)
+
+  @property
+  def variables(self):
+    """Returns the list of all layer variables/weights.
+
+    Alias of `self.weights`.
+
+    Returns:
+      A list of variables.
+    """
+    return self.weights
+
+  @property
+  def trainable_variables(self):
+    return self.trainable_weights
+
+  @property
+  def non_trainable_variables(self):
+    return self.non_trainable_weights
+
+  ##############################################################################
+  # Methods & attributes below are all private and only used by the framework. #
+  ##############################################################################
+
+  def _name_scope(self):
+    return self.name
+
+  def _init_set_name(self, name, zero_based=True):
+    if not name:
+      self._name = base_layer_utils.unique_layer_name(
+          generic_utils.to_snake_case(self.__class__.__name__),
+          zero_based=zero_based)
+    else:
+      self._name = name
+
+  def _get_existing_metric(self, name=None):
+    match = [m for m in self._metrics if m.name == name]
+    if not match:
+      return
+    if len(match) > 1:
+      raise ValueError(
+          'Please provide different names for the metrics you have added. '
+          'We found {} metrics with the name: "{}"'.format(len(match), name))
+    return match[0]
+
+  def _eager_add_metric(self, value, aggregation=None, name=None):
+    # If the given metric is available in `metrics` list we just update state
+    # on it, otherwise we create a new metric instance and
+    # add it to the `metrics` list.
+    match = self._get_existing_metric(name)
+    if match:
+      match(value)  # Update the metric state.
+      return
+    else:
+      if aggregation is None:
+        raise ValueError('We do not support adding an aggregated metric tensor '
+                         'in `call` in eager execution.')
+      metric_obj, _ = base_layer_utils.create_mean_metric(value, name)
+      self._metrics.append(metric_obj)
+
+  def _symbolic_add_metric(self, value, aggregation=None, name=None):
+    if aggregation is None:
+      # Iterate over the metrics and check if the given metric exists already.
+      # This can happen when a metric instance is created in subclassed model
+      # layer `__init__` and we have tracked that instance already in
+      # model.__setattr__.
+      match = self._get_existing_metric(name)
+      if match:
+        result_tensor = value
+        if match.name not in self._metrics_tensors:
+          self._metrics_tensors[match.name] = result_tensor
+          return
+        else:
+          raise ValueError(
+              'We currently do not support reusing a metric instance.')
+      else:
+        # We track the instance using the metadata on the result tensor.
+        result_tensor = value
+        metric_obj = result_tensor._metric_obj
+    else:
+      # If a non-aggregated tensor is given as input (ie. `aggregation` is
+      # explicitly set to `mean`), we wrap the tensor in `Mean` metric.
+      metric_obj, result_tensor = base_layer_utils.create_mean_metric(
+          value, name)
+    self._metrics.append(metric_obj)
+    self._metrics_tensors[metric_obj.name] = result_tensor
+
+  def _handle_weight_regularization(self, name, variable, regularizer):
+    """Create lambdas which compute regularization losses."""
+
+    def _loss_for_variable(v):
+      """Creates a regularization loss `Tensor` for variable `v`."""
+      with ops.name_scope(name + '/Regularizer'):
+        regularization = regularizer(v)
+      return regularization
+
+    if isinstance(variable, tf_variables.PartitionedVariable):
+      for v in variable:
+        self.add_loss(functools.partial(_loss_for_variable, v))
+    else:
+      self.add_loss(functools.partial(_loss_for_variable, variable))
+
+  def _handle_activity_regularization(self, inputs, outputs):
+    # Apply activity regularization.
+    # Note that it should be applied every time the layer creates a new
+    # output, since it is output-specific.
+    if self._activity_regularizer:
+      output_list = nest.flatten(outputs)
+      with ops.name_scope('ActivityRegularizer'):
+        for output in output_list:
+          activity_loss = self._activity_regularizer(output)
+          batch_size = math_ops.cast(
+              array_ops.shape(output)[0], activity_loss.dtype)
+          # Make activity regularization strength batch-agnostic.
+          mean_activity_loss = activity_loss / batch_size
+          self.add_loss(mean_activity_loss, inputs=inputs)
+
+  def _set_mask_metadata(self, inputs, outputs, previous_mask):
+    # In some cases the mask of the outputs has already been computed by
+    # inner layers and does not need to be recomputed by this layer.
+    mask_already_computed = all(
+        hasattr(x, '_keras_mask') for x in generic_utils.to_list(outputs))
+    if hasattr(self, 'compute_mask') and not mask_already_computed:
+      output_mask = self.compute_mask(inputs, previous_mask)
+    else:
+      output_mask = None
+    if isinstance(outputs, (list, tuple)):
+      if output_mask is None:
+        output_mask = [None for _ in range(len(outputs))]
+      for x, m in zip(outputs, output_mask):
+        try:
+          x._keras_mask = m  # pylint: disable=protected-access
+        except AttributeError:
+          pass  # C type such as dict. Masking not supported in this case.
+    else:
+      try:
+        outputs._keras_mask = output_mask  # pylint: disable=protected-access
+      except AttributeError:
+        pass  # C type such as dict. Masking not supported in this case.
+
+  def _set_connectivity_metadata_(self, inputs, outputs, args, kwargs):
+    call_convention = getattr(
+        self, '_call_convention',
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if args:
+      if call_convention == (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT):
+        raise TypeError(
+            'This layer ("{}") takes an `inputs` argument in `call()`, '
+            'and only the `inputs` argument may be specified as a positional '
+            'argument. Pass everything else as a keyword argument '
+            '(those arguments will not be tracked '
+            'as inputs to the layer).'.format(self.name))
+      elif call_convention == (base_layer_utils
+                               .CallConvention.SINGLE_POSITIONAL_ARGUMENT):
+        raise TypeError(
+            'This layer ("{}") takes a single positional argument in `call()`,'
+            ' which is by convention the `inputs` argument, '
+            'and only this argument may be specified as a positional argument. '
+            'Pass everything else as a keyword argument '
+            '(those arguments will not be tracked '
+            'as inputs to the layer).'.format(self.name))
+
+    # If the layer returns tensors from its inputs, unmodified,
+    # we copy them to avoid loss of tensor metadata.
+    output_ls = nest.flatten(outputs)
+    output_ls_copy = []
+    for x in output_ls:
+      if x in nest.flatten(inputs):
+        with ops.name_scope(self.name):
+          x = array_ops.identity(x)
+      output_ls_copy.append(x)
+    if len(output_ls_copy) == 1:
+      outputs = output_ls_copy[0]
+    else:
+      outputs = output_ls_copy
+
+    inputs, kwargs = self._inputs_from_call_args(
+        call_args=(inputs,) + args, call_kwargs=kwargs)
+    # Add an inbound node to the layer, so it can keep track of this call.
+    # This updates the layer history of the output tensor(s).
+    kwargs.pop('mask', None)  # `mask` should not be serialized.
+    self._add_inbound_node(
+        input_tensors=inputs, output_tensors=outputs, arguments=kwargs)
+    return inputs, outputs
 
-    This checks that the tensor(s) `inputs` verify the input assumptions
-    of the layer (if any). If not, a clear and actional exception gets raised.
+  def _inputs_from_call_args(self, call_args, call_kwargs):
+    """Get Layer inputs from __call__ *args and **kwargs.
 
-    Arguments:
-        inputs: input tensor or list of input tensors.
+    Args:
+      call_args: The positional arguments passed to __call__.
+      call_kwargs: The keyword argument dict passed to __call__.
 
-    Raises:
-        ValueError: in case of mismatch between
-            the provided inputs and the expectations of the layer.
+    Returns:
+      A tuple of (inputs, non_input_kwargs). These may be the same objects as
+      were passed in (call_args and call_kwargs).
     """
-    if not self.input_spec:
-      return
-    if not isinstance(self.input_spec, (list, tuple)):
-      input_spec = nest.flatten(self.input_spec)
+    call_convention = getattr(
+        self, '_call_convention',
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT)
+    if (call_convention in (
+        base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT,
+        base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT)):
+      assert len(call_args) == 1  # TypeError raised earlier in __call__.
+      return call_args[0], call_kwargs
     else:
-      input_spec = self.input_spec
-    inputs = nest.flatten(inputs)
-    if len(inputs) != len(input_spec):
-      raise ValueError('Layer ' + self.name + ' expects ' +
-                       str(len(input_spec)) + ' inputs, '
-                       'but it received ' + str(len(inputs)) +
-                       ' input tensors. Inputs received: ' + str(inputs))
-    for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
-      if spec is None:
-        continue
-
-      if (spec.ndim is not None or
-          spec.min_ndim is not None or
-          spec.max_ndim is not None):
-        if x.shape.ndims is None:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'its rank is undefined, but the layer requires a '
-                           'defined rank.')
-
-      # Check ndim.
-      if spec.ndim is not None:
-        ndim = x.shape.ndims
-        if ndim != spec.ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected ndim=' + str(spec.ndim) + ', found ndim=' +
-                           str(ndim) + '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      if spec.max_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim > spec.max_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected max_ndim=' + str(spec.max_ndim) +
-                           ', found ndim=' + str(ndim))
-      if spec.min_ndim is not None:
-        ndim = x.shape.ndims
-        if ndim is not None and ndim < spec.min_ndim:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           ': expected min_ndim=' + str(spec.min_ndim) +
-                           ', found ndim=' + str(ndim) +
-                           '. Full shape received: ' +
-                           str(x.shape.as_list()))
-      # Check dtype.
-      if spec.dtype is not None:
-        if x.dtype != spec.dtype:
-          raise ValueError('Input ' + str(input_index) + ' of layer ' +
-                           self.name + ' is incompatible with the layer: '
-                           'expected dtype=' + str(spec.dtype) +
-                           ', found dtype=' + str(x.dtype))
-      # Check specific shape axes.
-      if spec.axes:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for axis, value in spec.axes.items():
-            if hasattr(value, 'value'):
-              value = value.value
-            if value is not None and shape[int(axis)] not in {value, None}:
-              raise ValueError(
-                  'Input ' + str(input_index) + ' of layer ' + self.name + ' is'
-                  ' incompatible with the layer: expected axis ' + str(axis) +
-                  ' of input shape to have value ' + str(value) +
-                  ' but received input with shape ' + str(shape))
-      # Check shape.
-      if spec.shape is not None:
-        shape = x.shape.as_list()
-        if shape is not None:
-          for spec_dim, dim in zip(spec.shape, shape):
-            if spec_dim is not None and dim is not None:
-              if spec_dim != dim:
-                raise ValueError('Input ' + str(input_index) +
-                                 ' is incompatible with layer ' + self.name +
-                                 ': expected shape=' + str(spec.shape) +
-                                 ', found shape=' + str(shape))
+      call_arg_spec = tf_inspect.getfullargspec(self.call)
+      # There is no explicit "inputs" argument expected or provided to
+      # call(). Arguments which have default values are considered non-inputs,
+      # and arguments without are considered inputs.
+      if call_arg_spec.defaults:
+        if call_arg_spec.varargs is not None:
+          raise TypeError(
+              'Layers may not accept both positional arguments and '
+              'arguments with default values (unable to determine which '
+              'are inputs to the layer). '
+              'Issue occurred with layer "%s"' % (self.name))
+        keyword_arg_names = set(
+            call_arg_spec.args[-len(call_arg_spec.defaults):])
+      else:
+        keyword_arg_names = set()
+        # Training is never an input argument name, to allow signatures like
+        # call(x, training).
+      keyword_arg_names.add('training')
+      _, unwrapped_call = tf_decorator.unwrap(self.call)
+      bound_args = inspect.getcallargs(
+          unwrapped_call, *call_args, **call_kwargs)
+      if call_arg_spec.varkw is not None:
+        var_kwargs = bound_args.pop(call_arg_spec.varkw)
+        bound_args.update(var_kwargs)
+        keyword_arg_names = keyword_arg_names.union(var_kwargs.keys())
+      all_args = call_arg_spec.args
+      if all_args and bound_args[all_args[0]] is self:
+        # Ignore the 'self' argument of methods
+        bound_args.pop(call_arg_spec.args[0])
+        all_args = all_args[1:]
+      non_input_arg_values = {}
+      input_arg_values = []
+      remaining_args_are_keyword = False
+      for argument_name in all_args:
+        if argument_name in keyword_arg_names:
+          remaining_args_are_keyword = True
+        else:
+          if remaining_args_are_keyword:
+            raise TypeError(
+                'Found a positional argument in a layer call after a non-input '
+                'argument. All arguments after "training" must be keyword '
+                'arguments, and are not tracked as inputs to the layer. '
+                'Issue occurred with layer "%s"' % (self.name))
+        if remaining_args_are_keyword:
+          non_input_arg_values[argument_name] = bound_args[argument_name]
+        else:
+          input_arg_values.append(bound_args[argument_name])
+      if call_arg_spec.varargs is not None:
+        input_arg_values.extend(bound_args[call_arg_spec.varargs])
+      return input_arg_values, non_input_arg_values
 
-  def set_weights(self, weights):
-    """Sets the weights of the layer, from Numpy arrays.
+  def _add_inbound_node(self,
+                        input_tensors,
+                        output_tensors,
+                        arguments=None):
+    """Internal method to create an inbound node for the layer.
 
     Arguments:
-        weights: a list of Numpy arrays. The number
-            of arrays and their shape must match
-            number of the dimensions of the weights
-            of the layer (i.e. it should match the
-            output of `get_weights`).
-
-    Raises:
-        ValueError: If the provided weights list does not match the
-            layer's specifications.
-    """
-    params = self.weights
-    if len(params) != len(weights):
-      raise ValueError('You called `set_weights(weights)` on layer "' +
-                       self.name + '" with a  weight list of length ' +
-                       str(len(weights)) + ', but the layer was expecting ' +
-                       str(len(params)) + ' weights. Provided weights: ' +
-                       str(weights)[:50] + '...')
-    if not params:
-      return
-    weight_value_tuples = []
-    param_values = backend.batch_get_value(params)
-    for pv, p, w in zip(param_values, params, weights):
-      if pv.shape != w.shape:
-        raise ValueError('Layer weight shape ' + str(pv.shape) +
-                         ' not compatible with '
-                         'provided weight shape ' + str(w.shape))
-      weight_value_tuples.append((p, w))
-    backend.batch_set_value(weight_value_tuples)
-
-  def get_weights(self):
-    """Returns the current weights of the layer.
-
-    Returns:
-        Weights values as a list of numpy arrays.
+        input_tensors: list of input tensors.
+        output_tensors: list of output tensors.
+        arguments: dictionary of keyword arguments that were passed to the
+            `call` method of the layer at the call that created the node.
     """
-    params = self.weights
-    return backend.batch_get_value(params)
-
-  def get_config(self):
-    """Returns the config of the layer.
+    input_tensors = nest.flatten(input_tensors)
+    output_tensors = nest.flatten(output_tensors)
 
-    A layer config is a Python dictionary (serializable)
-    containing the configuration of a layer.
-    The same layer can be reinstantiated later
-    (without its trained weights) from this configuration.
+    # Collect input tensor(s) coordinates.
+    inbound_layers = []
+    node_indices = []
+    tensor_indices = []
+    for x in input_tensors:
+      assert hasattr(x, '_keras_history')
+      inbound_layer, node_index, tensor_index = x._keras_history  # pylint: disable=protected-access
+      inbound_layers.append(inbound_layer)
+      node_indices.append(node_index)
+      tensor_indices.append(tensor_index)
 
-    The config of a layer does not include connectivity
-    information, nor the layer class name. These are handled
-    by `Network` (one layer of abstraction above).
+    # Create node, add it to inbound nodes.
+    Node(
+        self,
+        inbound_layers=inbound_layers,
+        node_indices=node_indices,
+        tensor_indices=tensor_indices,
+        input_tensors=input_tensors,
+        output_tensors=output_tensors,
+        arguments=arguments)
 
-    Returns:
-        Python dictionary.
-    """
-    config = {'name': self.name, 'trainable': self.trainable}
-    if hasattr(self, '_batch_input_shape'):
-      config['batch_input_shape'] = self._batch_input_shape
-    if hasattr(self, 'dtype'):
-      config['dtype'] = self.dtype
-    return config
+    # Update tensor history metadata.
+    for i in range(len(output_tensors)):
+      # The metadata attribute consists of 1) a layer instance
+      # 2) a node index for the layer, 3) a tensor index for the node.
+      # The allows layer reuse (multiple nodes per layer) and multi-output
+      # or multi-input layers (e.g. a layer can return multiple tensors,
+      # and each can be sent to a different layer).
+      output_tensors[i]._keras_history = (self, len(self._inbound_nodes) - 1, i)  # pylint: disable=protected-access
 
-  @classmethod
-  def from_config(cls, config):
-    """Creates a layer from its config.
+  def _get_node_attribute_at_index(self, node_index, attr, attr_name):
+    """Private utility to retrieves an attribute (e.g. inputs) from a node.
 
-    This method is the reverse of `get_config`,
-    capable of instantiating the same layer from the config
-    dictionary. It does not handle layer connectivity
-    (handled by Network), nor weights (handled by `set_weights`).
+    This is used to implement the methods:
+        - get_input_shape_at
+        - get_output_shape_at
+        - get_input_at
+        etc...
 
     Arguments:
-        config: A Python dictionary, typically the
-            output of get_config.
+        node_index: Integer index of the node from which
+            to retrieve the attribute.
+        attr: Exact node attribute name.
+        attr_name: Human-readable attribute name, for error messages.
 
     Returns:
-        A layer instance.
-    """
-    return cls(**config)
-
-
-@tf_export(
-    'keras.layers.InputSpec', v1=['keras.layers.InputSpec', 'layers.InputSpec'])
-class InputSpec(object):
-  """Specifies the ndim, dtype and shape of every input to a layer.
+        The layer's attribute `attr` at the node of index `node_index`.
 
-  Every layer should expose (if appropriate) an `input_spec` attribute:
-  a list of instances of InputSpec (one per input tensor).
+    Raises:
+        RuntimeError: If the layer has no inbound nodes, or if called in Eager
+        mode.
+        ValueError: If the index provided does not match any node.
+    """
+    if not self._inbound_nodes:
+      raise RuntimeError('The layer has never been called '
+                         'and thus has no defined ' + attr_name + '.')
+    if not len(self._inbound_nodes) > node_index:
+      raise ValueError('Asked to get ' + attr_name + ' at node ' +
+                       str(node_index) + ', but the layer has only ' +
+                       str(len(self._inbound_nodes)) + ' inbound nodes.')
+    values = getattr(self._inbound_nodes[node_index], attr)
+    if len(values) == 1:
+      return values[0]
+    else:
+      return values
 
-  A None entry in a shape is compatible with any dimension,
-  a None shape is compatible with any shape.
+  @property
+  def _static_graph_friendly(self):
+    """Whether the layer can be called to create a static graph.
 
-  Arguments:
-      dtype: Expected DataType of the input.
-      shape: Shape tuple, expected shape of the input
-          (may include None for unchecked axes).
-      ndim: Integer, expected rank of the input.
-      max_ndim: Integer, maximum rank of the input.
-      min_ndim: Integer, minimum rank of the input.
-      axes: Dictionary mapping integer axes to
-          a specific dimension value.
-  """
+    Because of nesting, there are two components to being "graph-friendly":
+      1) all inner layers are graph-friendly
+      2) the way they are composed is graph-friendly.
+    We denote the latter as "_call_is_graph_friendly", and define
+    "_static_graph_friendly" as being the combination of
+    "_call_is_graph_friendly" and "all inner layers are _static_graph_friendly".
+    For atomic layers (no inner layers), this is just "_call_is_graph_friendly".
 
-  def __init__(self,
-               dtype=None,
-               shape=None,
-               ndim=None,
-               max_ndim=None,
-               min_ndim=None,
-               axes=None):
-    self.dtype = dtype
-    self.shape = shape
-    if shape is not None:
-      self.ndim = len(shape)
-    else:
-      self.ndim = ndim
-    self.max_ndim = max_ndim
-    self.min_ndim = min_ndim
-    self.axes = axes or {}
+    Returns:
+      Boolean.
+    """
+    return self._call_is_graph_friendly
 
-  def __repr__(self):
-    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
-            ('shape=' + str(self.shape)) if self.shape else '',
-            ('ndim=' + str(self.ndim)) if self.ndim else '',
-            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
-            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
-            ('axes=' + str(self.axes)) if self.axes else '']
-    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+  def _maybe_build(self, inputs):
+    # Check input assumptions set before layer building, e.g. input rank.
+    input_spec.assert_input_compatibility(
+        self.input_spec, inputs, self.name)
+    input_list = nest.flatten(inputs)
+    if input_list and self._dtype is None:
+      try:
+        self._dtype = input_list[0].dtype.base_dtype.name
+      except AttributeError:
+        pass
+    input_shapes = None
+    if all(hasattr(x, 'shape') for x in input_list):
+      input_shapes = nest.map_structure(lambda x: x.shape, inputs)
+    # Only call `build` if the user has manually overridden the build method.
+    if not hasattr(self.build, '_is_default'):
+      self.build(input_shapes)
 
 
 class Node(object):
@@ -1768,192 +1704,12 @@ class Node(object):
     }
 
 
-def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
-                      zero_based=False):
-  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
-
-  Arguments:
-    name: String name to make unique.
-    name_uid_map: An optional defaultdict(int) to use when creating unique
-      names. If None (default), uses a per-Graph dictionary.
-    avoid_names: An optional set or dict with names which should not be used. If
-      None (default) does not avoid any names.
-    namespace: Gets a name which is unique within the (graph, namespace). Layers
-      which are not Networks use a blank namespace and so get graph-global
-      names.
-    zero_based: If True, name sequences start with no suffix (e.g. "dense",
-      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
-
-  Returns:
-    Unique string name.
-
-  Example:
-
-  ```python
-  _unique_layer_name('dense')  # dense_1
-  _unique_layer_name('dense')  # dense_2
-  ```
-  """
-  if name_uid_map is None:
-    name_uid_map = get_default_graph_uid_map()
-  if avoid_names is None:
-    avoid_names = set()
-  proposed_name = None
-  while proposed_name is None or proposed_name in avoid_names:
-    name_key = (namespace, name)
-    if zero_based:
-      number = name_uid_map[name_key]
-      if number:
-        proposed_name = name + '_' + str(number)
-      else:
-        proposed_name = name
-      name_uid_map[name_key] += 1
-    else:
-      name_uid_map[name_key] += 1
-      proposed_name = name + '_' + str(name_uid_map[name_key])
-  return proposed_name
-
-
-def have_all_keras_metadata(iterable_or_element):
-  if not isinstance(iterable_or_element, (list, tuple)):
-    iterable = [iterable_or_element]
-  else:
-    iterable = nest.flatten(iterable_or_element)
-  return all([hasattr(x, '_keras_history') for x in iterable])
-
-
-def collect_previous_mask(input_tensors):
-  """Retrieves the output mask(s) of the previous node.
-
-  Arguments:
-      input_tensors: A tensor or list of tensors.
-
-  Returns:
-      A mask tensor or list of mask tensors.
-  """
-  input_tensors = nest.flatten(input_tensors)
-  masks = []
-  for x in input_tensors:
-    if hasattr(x, '_keras_mask'):
-      mask = x._keras_mask  # pylint: disable=protected-access
-      masks.append(mask)
-    else:
-      masks.append(None)
-  if len(masks) == 1:
-    return masks[0]
-  return masks
-
-
-def get_default_graph_uid_map():
-  # TODO(fchollet): refactor this into backend.
-  graph = ops.get_default_graph()
-  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
-  if name_uid_map is None:
-    name_uid_map = collections_lib.defaultdict(int)
-    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
-  return name_uid_map
-
-
-def make_variable(name,
-                  shape=None,
-                  dtype=dtypes.float32,
-                  initializer=None,
-                  partition_info=None,
-                  trainable=None,
-                  caching_device=None,
-                  validate_shape=True,
-                  constraint=None,
-                  use_resource=None,
-                  collections=None,
-                  synchronization=tf_variables.VariableSynchronization.AUTO,
-                  aggregation=tf_variables.VariableAggregation.NONE,
-                  partitioner=None):  # pylint: disable=unused-argument
-  """Temporary util to create a variable (relies on `variable_scope.variable`).
-
-  Some reuse-related technicalities prevent us from using
-  `variable_scope.get_variable()` directly, so we use a subcomponent
-  that has fewer constraints (`variable_scope.variable()`).
-
-  In the longer term, it seems like a similar "default variable creator" method
-  should exist in `CheckpointableBase` instead. When this happens, we can get
-  rid of this temporary solution.
-
-  TODO(fchollet): remove this method when no longer needed.
-  TODO(fchollet): handle `partitioner` argument.
-
-  Arguments:
-    name: Variable name.
-    shape: Variable shape.
-    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
-    initializer: Initializer instance (callable).
-    partition_info: Not handled at this time.
-    trainable: Whether the variable should be part of the layer's
-      "trainable_variables" (e.g. variables, biases)
-      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
-      Note, if the current variable scope is marked as non-trainable
-      then this parameter is ignored and any added variables are also
-      marked as non-trainable. `trainable` defaults to `True` unless
-      `synchronization` is set to `ON_READ`.
-    caching_device: Passed to `tf.Variable`.
-    validate_shape: Passed to `tf.Variable`.
-    constraint: Constraint instance (callable).
-    use_resource: Whether to use a `ResourceVariable`.
-    collections: List of graph collections keys. The new variable is added to
-      these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
-    synchronization: Indicates when a distributed a variable will be
-      aggregated. Accepted values are constants defined in the class
-      `tf.VariableSynchronization`. By default the synchronization is set to
-      `AUTO` and the current `DistributionStrategy` chooses
-      when to synchronize. If `synchronization` is set to `ON_READ`,
-      `trainable` must not be set to `True`.
-    aggregation: Indicates how a distributed variable will be aggregated.
-      Accepted values are constants defined in the class
-      `tf.VariableAggregation`.
-    partitioner: Not handled at this time.
-
-  Returns:
-    Variable instance.
-  """
-  initializing_from_value = False
-  if initializer is not None and not callable(initializer):
-    initializing_from_value = True
-
-  with ops.init_scope():
-    if initializing_from_value:
-      init_val = initializer
-      variable_dtype = None
-    else:
-      # Instantiate initializer if provided initializer is a type object.
-      if isinstance(initializer, type(init_ops.Initializer)):
-        initializer = initializer(dtype=dtype)
-      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
-          shape, dtype=dtype, partition_info=partition_info)
-      variable_dtype = dtype.base_dtype
-  if use_resource is None:
-    use_resource = True
-
-  # TODO(apassos,rohanj) figure out how to remove collections from here so we
-  # can remove the V1.
-  v = tf_variables.VariableV1(
-      initial_value=init_val,
-      name=name,
-      trainable=trainable,
-      caching_device=caching_device,
-      dtype=variable_dtype,
-      validate_shape=validate_shape,
-      constraint=constraint,
-      use_resource=use_resource,
-      collections=collections,
-      synchronization=synchronization,
-      aggregation=aggregation)
-  return v
-
-
 def default(method):
   """Decorates a method to detect overrides in subclasses."""
   method._is_default = True
   return method
 
 
-def generate_placeholders_from_shape(shape):
-  return array_ops.placeholder(shape=shape, dtype=backend.floatx())
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
+InputSpec = input_spec.InputSpec  # pylint:disable=invalid-name
diff --git a/tensorflow/python/keras/engine/base_layer_test.py b/tensorflow/python/keras/engine/base_layer_test.py
index 704589349a85cf3c810d860e5eca7ad0fe8a73ae..798775b6a5b29aa72a2c766584811aa469db2471 100644
--- a/tensorflow/python/keras/engine/base_layer_test.py
+++ b/tensorflow/python/keras/engine/base_layer_test.py
@@ -81,14 +81,14 @@ class BaseLayerTest(test.TestCase):
     inputs = keras.Input((3,))
     outputs = DynamicLayer1()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
     inputs = keras.Input((3,))
     outputs = DynamicLayer2()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
@@ -102,7 +102,7 @@ class BaseLayerTest(test.TestCase):
     outputs = inner_model(x)
 
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
 
@@ -116,7 +116,7 @@ class BaseLayerTest(test.TestCase):
     inputs = keras.Input((3,))
     outputs = InvalidLayer()(inputs)
     model = keras.Model(inputs, outputs)
-    self.assertEqual(model._is_static_graph_friendly, False)
+    self.assertEqual(model._static_graph_friendly, False)
     model.compile(RMSPropOptimizer(0.001), loss='mse')
     with self.assertRaisesRegexp(ValueError, 'You did something wrong!'):
       model.train_on_batch(np.random.random((2, 3)), np.random.random((2, 3)))
diff --git a/tensorflow/python/keras/engine/base_layer_utils.py b/tensorflow/python/keras/engine/base_layer_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f947f17723fbb01280d7ef09f327dd64fc938e
--- /dev/null
+++ b/tensorflow/python/keras/engine/base_layer_utils.py
@@ -0,0 +1,236 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Contains private utilities used mainly by the base Layer class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections as collections_lib
+import enum
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import variables as tf_variables
+from tensorflow.python.util import nest
+
+
+class CallConvention(enum.Enum):
+  """Calling conventions for passing `Layer` inputs to `Layer.call`."""
+  # The Layer takes inputs as its first argument, named "inputs" for
+  # compatibility with the signature of Layer.__call__. This is the mode assumed
+  # for Layers which are not subclassed Models.
+  EXPLICIT_INPUTS_ARGUMENT = 1
+  # The Layer takes a single positional argument, not named "inputs". It's
+  # treated like an "inputs" argument.
+  SINGLE_POSITIONAL_ARGUMENT = 2
+  # The Layer has multiple positional arguments to which its inputs should be
+  # bound.
+  POSITIONAL_ARGUMENTS_ARE_INPUTS = 3
+
+
+def create_mean_metric(value, name=None):
+  # TODO(psv): Remove this import when b/110718070 is fixed.
+  from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+  metric_obj = metrics_module.Mean(name=name)
+  result = metric_obj(value)
+  return metric_obj, result
+
+
+def make_variable(name,
+                  shape=None,
+                  dtype=dtypes.float32,
+                  initializer=None,
+                  partition_info=None,
+                  trainable=None,
+                  caching_device=None,
+                  validate_shape=True,
+                  constraint=None,
+                  use_resource=None,
+                  collections=None,
+                  synchronization=tf_variables.VariableSynchronization.AUTO,
+                  aggregation=tf_variables.VariableAggregation.NONE,
+                  partitioner=None):  # pylint: disable=unused-argument
+  """Temporary util to create a variable (relies on `variable_scope.variable`).
+
+  Some reuse-related technicalities prevent us from using
+  `variable_scope.get_variable()` directly, so we use a subcomponent
+  that has fewer constraints (`variable_scope.variable()`).
+
+  In the longer term, it seems like a similar "default variable creator" method
+  should exist in `CheckpointableBase` instead. When this happens, we can get
+  rid of this temporary solution.
+
+  TODO(fchollet): remove this method when no longer needed.
+  TODO(fchollet): handle `partitioner` argument.
+
+  Arguments:
+    name: Variable name.
+    shape: Variable shape.
+    dtype: The type of the variable. Defaults to `self.dtype` or `float32`.
+    initializer: Initializer instance (callable).
+    partition_info: Not handled at this time.
+    trainable: Whether the variable should be part of the layer's
+      "trainable_variables" (e.g. variables, biases)
+      or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
+      Note, if the current variable scope is marked as non-trainable
+      then this parameter is ignored and any added variables are also
+      marked as non-trainable. `trainable` defaults to `True` unless
+      `synchronization` is set to `ON_READ`.
+    caching_device: Passed to `tf.Variable`.
+    validate_shape: Passed to `tf.Variable`.
+    constraint: Constraint instance (callable).
+    use_resource: Whether to use a `ResourceVariable`.
+    collections: List of graph collections keys. The new variable is added to
+      these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
+    synchronization: Indicates when a distributed a variable will be
+      aggregated. Accepted values are constants defined in the class
+      `tf.VariableSynchronization`. By default the synchronization is set to
+      `AUTO` and the current `DistributionStrategy` chooses
+      when to synchronize. If `synchronization` is set to `ON_READ`,
+      `trainable` must not be set to `True`.
+    aggregation: Indicates how a distributed variable will be aggregated.
+      Accepted values are constants defined in the class
+      `tf.VariableAggregation`.
+    partitioner: Not handled at this time.
+
+  Returns:
+    Variable instance.
+  """
+  initializing_from_value = False
+  if initializer is not None and not callable(initializer):
+    initializing_from_value = True
+
+  with ops.init_scope():
+    if initializing_from_value:
+      init_val = initializer
+      variable_dtype = None
+    else:
+      # Instantiate initializer if provided initializer is a type object.
+      if isinstance(initializer, type(init_ops.Initializer)):
+        initializer = initializer(dtype=dtype)
+      init_val = lambda: initializer(  # pylint: disable=g-long-lambda
+          shape, dtype=dtype, partition_info=partition_info)
+      variable_dtype = dtype.base_dtype
+  if use_resource is None:
+    use_resource = True
+
+  # TODO(apassos,rohanj) figure out how to remove collections from here so we
+  # can remove the V1.
+  v = tf_variables.VariableV1(
+      initial_value=init_val,
+      name=name,
+      trainable=trainable,
+      caching_device=caching_device,
+      dtype=variable_dtype,
+      validate_shape=validate_shape,
+      constraint=constraint,
+      use_resource=use_resource,
+      collections=collections,
+      synchronization=synchronization,
+      aggregation=aggregation)
+  return v
+
+
+def get_default_graph_uid_map():
+  # TODO(fchollet): refactor this into backend.
+  graph = ops.get_default_graph()
+  name_uid_map = backend.PER_GRAPH_LAYER_NAME_UIDS.get(graph, None)
+  if name_uid_map is None:
+    name_uid_map = collections_lib.defaultdict(int)
+    backend.PER_GRAPH_LAYER_NAME_UIDS[graph] = name_uid_map
+  return name_uid_map
+
+
+def unique_layer_name(name, name_uid_map=None, avoid_names=None, namespace='',
+                      zero_based=False):
+  """Makes a layer name (or arbitrary string) unique within a TensorFlow graph.
+
+  Arguments:
+    name: String name to make unique.
+    name_uid_map: An optional defaultdict(int) to use when creating unique
+      names. If None (default), uses a per-Graph dictionary.
+    avoid_names: An optional set or dict with names which should not be used. If
+      None (default) does not avoid any names.
+    namespace: Gets a name which is unique within the (graph, namespace). Layers
+      which are not Networks use a blank namespace and so get graph-global
+      names.
+    zero_based: If True, name sequences start with no suffix (e.g. "dense",
+      "dense_1"). If False, naming is one-based ("dense_1", "dense_2").
+
+  Returns:
+    Unique string name.
+
+  Example:
+
+  ```python
+  _unique_layer_name('dense')  # dense_1
+  _unique_layer_name('dense')  # dense_2
+  ```
+  """
+  if name_uid_map is None:
+    name_uid_map = get_default_graph_uid_map()
+  if avoid_names is None:
+    avoid_names = set()
+  proposed_name = None
+  while proposed_name is None or proposed_name in avoid_names:
+    name_key = (namespace, name)
+    if zero_based:
+      number = name_uid_map[name_key]
+      if number:
+        proposed_name = name + '_' + str(number)
+      else:
+        proposed_name = name
+      name_uid_map[name_key] += 1
+    else:
+      name_uid_map[name_key] += 1
+      proposed_name = name + '_' + str(name_uid_map[name_key])
+  return proposed_name
+
+
+def collect_previous_mask(input_tensors):
+  """Retrieves the output mask(s) of the previous node.
+
+  Arguments:
+      input_tensors: A tensor or list of tensors.
+
+  Returns:
+      A mask tensor or list of mask tensors.
+  """
+  input_tensors = nest.flatten(input_tensors)
+  masks = []
+  for x in input_tensors:
+    if hasattr(x, '_keras_mask'):
+      mask = x._keras_mask  # pylint: disable=protected-access
+      masks.append(mask)
+    else:
+      masks.append(None)
+  if len(masks) == 1:
+    return masks[0]
+  return masks
+
+
+def have_all_keras_metadata(iterable_or_element):
+  if not isinstance(iterable_or_element, (list, tuple)):
+    iterable = [iterable_or_element]
+  else:
+    iterable = nest.flatten(iterable_or_element)
+  return all(hasattr(x, '_keras_history') for x in iterable)
+
+
+def generate_placeholders_from_shape(shape):
+  return array_ops.placeholder(shape=shape, dtype=backend.floatx())
diff --git a/tensorflow/python/keras/engine/distributed_training_utils.py b/tensorflow/python/keras/engine/distributed_training_utils.py
index f939b7565a88f8add025f1aa8c0db8c1d20de7c5..32129afe64761048ed219a4e0caaae19292b9bc4 100644
--- a/tensorflow/python/keras/engine/distributed_training_utils.py
+++ b/tensorflow/python/keras/engine/distributed_training_utils.py
@@ -22,15 +22,17 @@ import numpy as np
 from tensorflow.python.client import session as session_module
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.distribute import distribute_coordinator_context as dc_context
+from tensorflow.python.distribute import distribute_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
@@ -52,14 +54,18 @@ def set_weights(distribution_strategy, dist_model, weights):
     num_param = len(layer.weights)
     layer_weights = weights[:num_param]
     for sw, w in zip(layer.weights, layer_weights):
-      assign_ops.append(distribution_strategy.unwrap(sw.assign(w)))
-
+      if ops.executing_eagerly_outside_functions():
+        sw.assign(w)
+      else:
+        assign_ops.append(distribution_strategy.unwrap(sw.assign(w)))
     weights = weights[num_param:]
-  K.get_session().run(assign_ops)
+
+  if not ops.executing_eagerly_outside_functions():
+    K.get_session().run(assign_ops)
 
 
 def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
-                  grouped_updates, grouped_session_args,
+                  grouped_updates=None, grouped_session_args=None,
                   with_loss_tensor=False):
   """Unwrap and return the list of values contained in the PerDevice parameters.
 
@@ -92,11 +98,8 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
                                         grouped_inputs)
   if with_loss_tensor:
     # reduce loss tensor before adding it to the list of fetches
-    loss = distribution_strategy.unwrap(
-        distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
-                                     grouped_outputs[0],
-                                     destinations='/device:CPU:0'))[0]
-
+    loss = distribution_strategy.reduce(distribute_lib.get_loss_reduction(),
+                                        grouped_outputs[0])
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs[1:])
     all_outputs = [loss] + all_outputs
@@ -104,20 +107,25 @@ def unwrap_values(distribution_strategy, grouped_inputs, grouped_outputs,
     all_outputs = flatten_perdevice_values(distribution_strategy,
                                            grouped_outputs)
 
-  all_updates = flatten_perdevice_values(distribution_strategy,
-                                         grouped_updates)
+  if grouped_updates:
+    all_updates = flatten_perdevice_values(distribution_strategy,
+                                           grouped_updates)
+  else:
+    all_updates = None
 
   all_session_args = {}
-  grouped_feed_dict = grouped_session_args.get('feed_dict')
-  if grouped_feed_dict:
-    all_session_args['feed_dict'] = flatten_perdevice_values(
-        distribution_strategy, grouped_feed_dict)
-
-  grouped_fetches = grouped_session_args.get('fetches')
-  if grouped_fetches:
-    all_session_args['fetches'] = flatten_perdevice_values(
-        distribution_strategy, grouped_fetches)
-
+  if grouped_session_args:
+    grouped_feed_dict = grouped_session_args.get('feed_dict')
+    if grouped_feed_dict:
+      all_session_args['feed_dict'] = flatten_perdevice_values(
+          distribution_strategy, grouped_feed_dict)
+
+    grouped_fetches = grouped_session_args.get('fetches')
+    if grouped_fetches:
+      all_session_args['fetches'] = flatten_perdevice_values(
+          distribution_strategy, grouped_fetches)
+
+  # TODO(priyag): Return only non empty/None values
   return all_inputs, all_outputs, all_updates, all_session_args
 
 
@@ -144,11 +152,14 @@ def flatten_perdevice_values(distribution_strategy, perdevice_values):
           for e in distribution_strategy.unwrap(flattened)]
 
 
-def validate_callbacks(input_callbacks):
+def validate_callbacks(input_callbacks, optimizer, current_strategy):
   """Validate whether given callbacks are supported by DistributionStrategy.
 
   Args:
     input_callbacks: List of callbacks passed by the user to fit.
+    optimizer: Optimizer instance used to train the model.
+    current_strategy: The DistributionStrategy used to distribute training
+      and validation.
 
   Raises:
     ValueError: If `LearningRateScheduler` or `ReduceLROnPlateau` is one of the
@@ -170,23 +181,37 @@ def validate_callbacks(input_callbacks):
                         'these attributes are not set. You can access each of '
                         'the individual distributed models using the '
                         '`_grouped_model` attribute of your original model.')
-      if isinstance(callback, callbacks.LearningRateScheduler):
-        raise ValueError('LearningRateScheduler callback is not supported with '
-                         'DistributionStrategy.')
-      if isinstance(callback, callbacks.ReduceLROnPlateau):
-        raise ValueError('ReduceLROnPlateau callback is not supported with '
-                         'DistributionStrategy.')
+      if isinstance(callback, (callbacks.LearningRateScheduler,
+                               callbacks.ReduceLROnPlateau)):
+        strategy_name = current_strategy.__class__.__name__
+        # TODO(anjalisridhar): We might need to add a condition for multi
+        # worker strategy when we support it in Keras.
+        if is_tpu_strategy(current_strategy):
+          raise ValueError('%s callback is not supported with %s.' %
+                           (callback, strategy_name))
+
+        if not isinstance(optimizer, optimizer_v2.OptimizerV2):
+          raise ValueError('You must specify a Keras Optimizer V2 when using '
+                           '%s callback with DistributionStrategy.' % callback)
 
       # If users want to use the TensorBoard callback they cannot use certain
       # features of the callback that involve accessing model attributes and
       # running ops.
       if isinstance(callback, callbacks.TensorBoard):
         if callback.__getattribute__('histogram_freq'):
-          raise ValueError('histogram_freq in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`histogram_freq` in the TensorBoard callback is not '
+                  'supported when using DistributionStrategy. Setting '
+                  '`histogram_freq` to `0`.'))
+          callback.histogram_freq = 0
         if callback.__getattribute__('write_grads'):
-          raise ValueError('write_grads in the TensorBoard callback is not '
-                           'supported when using DistributionStrategy.')
+          logging.warning(
+              UserWarning(
+                  '`write_grads` in the TensorBoard callback is not supported '
+                  'when using DistributionStrategy. Setting `write_grads` '
+                  'to `False`.'))
+          callback.histogram_freq = False
 
 
 def validate_distributed_dataset_inputs(distribution_strategy, x, y,
@@ -293,19 +318,64 @@ def validate_all_tensor_shapes(x, x_values):
                        ' inputs {}'.format(x))
 
 
+def _wait_for_variable_initialization(session):
+  """Utility to wait for variables to be initialized."""
+  all_variables = K._get_variables(K.get_graph())  # pylint: disable=protected-access
+  candidate_vars = []
+  for v in all_variables:
+    if not getattr(v, '_keras_initialized', False):
+      candidate_vars.append(v)
+
+  if not candidate_vars:
+    return
+
+  while True:
+    is_initialized = session.run(
+        [variables.is_variable_initialized(v) for v in candidate_vars])
+    uninitialized_vars = []
+    for flag, v in zip(is_initialized, candidate_vars):
+      if not flag:
+        uninitialized_vars.append(v)
+      v._keras_initialized = True  # pylint: disable=protected-access
+    if not uninitialized_vars:
+      break
+
+
+def init_restore_or_wait_for_variables():
+  """Initialize or restore variables or wait for variables to be initialized."""
+  session = K._get_session()  # pylint: disable=protected-access
+  worker_context = dc_context.get_current_worker_context()
+  if not worker_context or worker_context.experimental_should_init:
+    # TODO(yuefengz): if checkpoints exit, restore from checkpoint.
+    K._initialize_variables(session)  # pylint: disable=protected-access
+  else:
+    _wait_for_variable_initialization(session)
+
+
 def configure_and_create_session(distribution_strategy):
   """Configure session config and create a session with it."""
   # TODO(priyag): Throw error if a session already exists.
   session_config = K.get_default_session_config()
-  distribution_strategy.configure(session_config)
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    # TODO(priyag): Remove this workaround when Distributed Coordinator is
-    # integrated with keras and we can create a session from there.
-    master = distribution_strategy._tpu_cluster_resolver.master()  # pylint: disable=protected-access
+  if is_tpu_strategy(distribution_strategy):
+    # TODO(priyag, yuefengz): Remove this workaround when Distribute
+    # Coordinator is integrated with keras and we can create a session from
+    # there.
+    distribution_strategy.configure(session_config)
+    master = distribution_strategy.extended._tpu_cluster_resolver.master()  # pylint: disable=protected-access
     session = session_module.Session(config=session_config, target=master)
   else:
-    session = session_module.Session(config=session_config)
+    worker_context = dc_context.get_current_worker_context()
+    if worker_context:
+      dc_session_config = worker_context.session_config
+      # Merge the default session config to the one from distribute coordinator,
+      # which is fine for now since they don't have conflicting configurations.
+      dc_session_config.MergeFrom(session_config)
+      session = session_module.Session(
+          config=dc_session_config, target=worker_context.master_target)
+    else:
+      distribution_strategy.configure(session_config)
+      session = session_module.Session(config=session_config)
 
   K.set_session(session)
 
@@ -334,11 +404,15 @@ def validate_inputs(x, y, distribution_strategy):
                      'Iterator. You must pass a `tf.data.Dataset` object or a '
                      'numpy array as input.')
 
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
+  if is_tpu_strategy(distribution_strategy):
     for i in [x, y]:
-      if isinstance(i, dataset_ops.Dataset):
+      if isinstance(i, dataset_ops.DatasetV2):
         shapes = nest.flatten(i.output_shapes)
-        if any([not s.is_fully_defined() for s in shapes]):
+        try:
+          s = next(s for s in shapes if not s.is_fully_defined())
+        except StopIteration:
+          continue
+        else:
           raise ValueError(
               'Using TPUs currently requires fully defined shapes. Either use '
               'set_shape() on the input tensors or use '
@@ -346,37 +420,97 @@ def validate_inputs(x, y, distribution_strategy):
               'Found unknown shape {} in input {}.'.format(s, i))
 
 
-def get_input_batch_params(first_x_value, batch_size, distribution_strategy):
+# TODO(b/118776054): Currently we support global batch size for TPUStrategy and
+# core MirroredStrategy only. Remove this check when contrib MirroredStrategy is
+# no longer needed.
+def global_batch_size_supported(distribution_strategy):
+  return distribution_strategy.extended._global_batch_size  # pylint: disable=protected-access
+
+
+# TODO(sourabhbajaj): Remove this once we use the same API for all strategies.
+def is_tpu_strategy(strategy):
+  """We're executing TPU Strategy."""
+  return strategy is not None and strategy.__class__.__name__ == 'TPUStrategy'
+
+
+def get_input_params(distribution_strategy, first_x_value, steps, batch_size,
+                     is_training=False):
   """Calculate the number of batches and steps/steps_per_epoch.
 
   Args:
+    distribution_strategy: The DistributionStrategy used to compile the model.
     first_x_value: This is the first input numpy array that is passed in as the
       model input.
-    batch_size: The specified batch_size or the default batch_size of 32.
-    distribution_strategy: The current DistributionStrategy used to compile the
-      model.
+    steps:  The specified number of steps.
+    batch_size: The specified batch_size.
+    is_training: Boolean to relax the constraints on consuming all the training
+      samples to keep compatibility till we support partial batches.
 
   Returns:
-    The steps or steps_per_epoch argument depending on if a user is
-    calling `fit`, `evaluate` or `predict`.
+    steps: The steps or steps_per_epoch argument depending on if a user is
+        calling `fit`, `evaluate` or `predict`. If the is_training flag is set
+        we don't require the number of samples to be used completely.
+    batch_size: The batch size to be used in model iterations.
 
   Raises:
     ValueError: If the number of batches or steps evaluates to 0.
 
   """
-  num_batches = first_x_value.shape[0] // batch_size
-  if not num_batches:
-    raise ValueError('Please specify a batch_size that is smaller than'
-                     'the number of input samples %d.' % first_x_value.shape[0])
-  steps = num_batches // distribution_strategy.num_replicas_in_sync
-  if not steps:
-    # TODO(anjalisridhar): Number of replicas in the error message may not
-    # convey what we want to the user. Is there another terminology that we can
-    # use that is consistent across different strategies?
-    raise ValueError('The number of batches %d is smaller than the number '
-                     'of replicas %d used for DistributionStrategy. ' %
-                     (num_batches, distribution_strategy.num_replicas_in_sync))
-  return steps
+  num_samples = first_x_value.shape[0]
+  # TODO(b/118776054): Use global batch size for Keras/DS support.
+  # Currently this is only supported in TPUStrategy and CoreMirroredStrategy.
+  use_per_replica_batch = not global_batch_size_supported(
+      distribution_strategy)
+
+  if steps is None:
+    if batch_size is None:
+      # If neither the batch size or number of steps are set. We choose the
+      # global batch size as the minimum of number of samples and 32. 32 is
+      # chosen to provide backward compatibility.
+      global_batch_size = min(num_samples, 32)
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+    if not is_training and num_samples % global_batch_size:
+      raise ValueError('The number of samples %s is not divisible by '
+                       'batch size %s.' % (num_samples, global_batch_size))
+    steps = num_samples // global_batch_size
+  else:
+    if batch_size is None:
+      # We calculate the batch size based on the number of steps specified
+      if num_samples % steps:
+        raise ValueError('The number of samples %s is not divisible by '
+                         'steps %s. Please change the number of steps to a '
+                         'value that can consume all the samples' % (
+                             num_samples, steps))
+      global_batch_size = num_samples // steps
+    else:
+      # If the user provided the batch size we need to handle the case
+      # between different strategies that use the global/per-replica batch size
+      global_batch_size = batch_size
+      if use_per_replica_batch:
+        global_batch_size *= distribution_strategy.num_replicas_in_sync
+
+      if num_samples < (global_batch_size * steps):
+        raise ValueError('Number of samples %s is less than samples required '
+                         'for specified batch_size %s and steps %s' % (
+                             num_samples, global_batch_size, steps))
+
+  # We need to return the per replica or global batch size based on the strategy
+  if use_per_replica_batch:
+    if global_batch_size % distribution_strategy.num_replicas_in_sync:
+      raise ValueError(
+          'The batch size (%s) could not be sharded evenly across the sync '
+          'replicas (%s) in the distribution strategy.' % (
+              global_batch_size, distribution_strategy.num_replicas_in_sync))
+    batch_size = global_batch_size // distribution_strategy.num_replicas_in_sync
+  else:
+    batch_size = global_batch_size
+
+  return steps, batch_size
 
 
 def get_batch_dimension(iterator):
@@ -387,33 +521,6 @@ def get_batch_dimension(iterator):
   return dims[0] if dims else None
 
 
-def get_batch_size(num_replicas, num_samples, steps):
-  """Calculate and return batch size for numpy inputs.
-
-  Args:
-    num_replicas: Number of devices over which the model input is distributed.
-    num_samples: Total number of input samples in the input numpy arrays.
-    steps: Number of steps that we run the model for.
-
-  Returns:
-    batch size used to create the Dataset object from the input numpy arrays.
-
-  """
-  if num_samples % steps != 0:
-    logging.warning('The number of input samples %d is not evenly '
-                    'divisible by the number of steps %d. '
-                    'Some samples will not be processed as expected.' %
-                    (num_samples, steps))
-  global_batch_size = num_samples // steps
-  if global_batch_size % num_replicas != 0:
-    logging.warning('The total number of batches per step %d is not evenly '
-                    'divisible by the number of replicas %d used in '
-                    'DistributionStrategy. Some samples will not be processed '
-                    'as expected.' %
-                    (global_batch_size, num_replicas))
-  return global_batch_size // num_replicas
-
-
 def get_cpu_device(distribution_strategy):
   """Returns the CPU device of the TPU host or the default CPU device string.
 
@@ -429,12 +536,12 @@ def get_cpu_device(distribution_strategy):
     NotImplementedError: We currently don't support copying numpy data to
     multiple hosts in the case of Cloud TPU pods.
   """
-  if distribution_strategy.__class__.__name__ == 'TPUStrategy':
-    if distribution_strategy.num_hosts > 1:
+  if is_tpu_strategy(distribution_strategy):
+    if distribution_strategy.extended.num_hosts > 1:
       raise NotImplementedError('TPUDistributionStrategy does not '
                                 'support numpy inputs when running on Cloud'
                                 'TPU pods.')
-    return distribution_strategy.get_host_cpu_device(0)
+    return distribution_strategy.extended.get_host_cpu_device(0)
   else:
     # For all strategies except TPUDistributionStrategy
     # TODO(anjalisridhar): We may need to modify this when we add support for
diff --git a/tensorflow/python/keras/engine/feature_columns_integration_test.py b/tensorflow/python/keras/engine/feature_columns_integration_test.py
index e0478ee357b7a5e93d73be2c939930172b5943f7..b3f8cfe72585188d631c072b690729054d5db775 100644
--- a/tensorflow/python/keras/engine/feature_columns_integration_test.py
+++ b/tensorflow/python/keras/engine/feature_columns_integration_test.py
@@ -22,9 +22,10 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.feature_column import feature_column_v2 as fc
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.feature_column import feature_column_lib as fc
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
@@ -33,7 +34,7 @@ class TestDNNModel(keras.models.Model):
 
   def __init__(self, feature_columns, units, name=None, **kwargs):
     super(TestDNNModel, self).__init__(name=name, **kwargs)
-    self._input_layer = fc.FeatureLayer(feature_columns, name='input_layer')
+    self._input_layer = fc.DenseFeatures(feature_columns, name='input_layer')
     self._dense_layer = keras.layers.Dense(units, name='dense_layer')
 
   def call(self, features):
@@ -42,23 +43,24 @@ class TestDNNModel(keras.models.Model):
     return net
 
 
-class FeatureColumnsIntegrationTest(test.TestCase):
+class FeatureColumnsIntegrationTest(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
 
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -68,18 +70,19 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     model.evaluate(x, y, batch_size=5)
     model.predict(x, batch_size=5)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model_with_ds_input(self):
     columns = [fc.numeric_column('a')]
     model = keras.models.Sequential([
-        fc.FeatureLayer(columns),
+        fc.DenseFeatures(columns),
         keras.layers.Dense(64, activation='relu'),
         keras.layers.Dense(20, activation='softmax')
     ])
     model.compile(
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -92,7 +95,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     model.evaluate(ds, steps=1)
     model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_subclassed_model_with_feature_columns(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -102,7 +105,8 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'a': np.random.random((10, 1)), 'b': np.random.random((10, 1))}
     y = np.random.randint(20, size=(10, 1))
@@ -112,7 +116,7 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.evaluate(x=x, y=y, batch_size=5)
     dnn_model.predict(x=x, batch_size=5)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_subclassed_model_with_feature_columns_with_ds_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
@@ -122,7 +126,8 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.compile(
         optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
         loss='categorical_crossentropy',
-        metrics=['accuracy'])
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
     y = np.random.randint(20, size=(100, 1))
     y = keras.utils.to_categorical(y, num_classes=20)
@@ -135,15 +140,16 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     dnn_model.evaluate(ds, steps=1)
     dnn_model.predict(ds, steps=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_feature_layer_input(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
 
-    feature_layer = fc.FeatureLayer([col_a, col_b], name='fc')
+    feature_layer = fc.DenseFeatures([col_a, col_b], name='fc')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(feature_layer)
 
@@ -161,17 +167,18 @@ class FeatureColumnsIntegrationTest(test.TestCase):
     data = ({'a': np.arange(10), 'b': np.arange(10)}, np.arange(10, 20))
     print(model.fit(*data, epochs=1))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  # TODO(kaftan) seems to throw an error when enabled.
+  @keras_parameterized.run_all_keras_modes
   def DISABLED_test_function_model_multiple_feature_layer_inputs(self):
     col_a = fc.numeric_column('a')
     col_b = fc.numeric_column('b')
     col_c = fc.numeric_column('c')
 
-    fc1 = fc.FeatureLayer([col_a, col_b], name='fc1')
-    fc2 = fc.FeatureLayer([col_b, col_c], name='fc2')
+    fc1 = fc.DenseFeatures([col_a, col_b], name='fc1')
+    fc2 = fc.DenseFeatures([col_b, col_c], name='fc2')
     dense = keras.layers.Dense(4)
 
-    # This seems problematic.... We probably need something for FeatureLayer
+    # This seems problematic.... We probably need something for DenseFeatures
     # the way Input is for InputLayer.
     output = dense(fc1) + dense(fc2)
 
diff --git a/tensorflow/python/keras/engine/input_layer.py b/tensorflow/python/keras/engine/input_layer.py
index 4e96106004fe83eb325415b77cc3a3e896a3d0bc..9874efe2bccd5e2db370ed54089424063afe88b5 100644
--- a/tensorflow/python/keras/engine/input_layer.py
+++ b/tensorflow/python/keras/engine/input_layer.py
@@ -19,12 +19,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
 from tensorflow.python.keras.utils import tf_utils
-from tensorflow.python.ops import array_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -84,7 +82,6 @@ class InputLayer(base_layer.Layer):
     self.sparse = sparse
     self.batch_size = batch_size
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
     if isinstance(input_shape, tensor_shape.TensorShape):
       input_shape = tuple(input_shape.as_list())
@@ -95,19 +92,19 @@ class InputLayer(base_layer.Layer):
       else:
         batch_input_shape = None
       graph = backend.get_graph()
-      with context.graph_mode():
-        with graph.as_default():
-          # In graph mode, create a graph placeholder to call the layer on.
-          if sparse:
-            input_tensor = array_ops.sparse_placeholder(
-                shape=batch_input_shape,
-                dtype=dtype,
-                name=self.name)
-          else:
-            input_tensor = array_ops.placeholder(
-                shape=batch_input_shape,
-                dtype=dtype,
-                name=self.name)
+      with graph.as_default():
+        # In graph mode, create a graph placeholder to call the layer on.
+        if sparse:
+          input_tensor = backend.placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name,
+              sparse=True)
+        else:
+          input_tensor = backend.placeholder(
+              shape=batch_input_shape,
+              dtype=dtype,
+              name=self.name)
 
       self.is_placeholder = True
       self._batch_input_shape = batch_input_shape
diff --git a/tensorflow/python/keras/engine/input_spec.py b/tensorflow/python/keras/engine/input_spec.py
new file mode 100644
index 0000000000000000000000000000000000000000..7277c16fe51197af3bf0e045814ccc29f7feaf7c
--- /dev/null
+++ b/tensorflow/python/keras/engine/input_spec.py
@@ -0,0 +1,170 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Contains the InputSpec class."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from six.moves import zip  # pylint: disable=redefined-builtin
+
+from tensorflow.python.util import nest
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('keras.layers.InputSpec',
+           v1=['keras.layers.InputSpec', 'layers.InputSpec'])
+class InputSpec(object):
+  """Specifies the ndim, dtype and shape of every input to a layer.
+
+  Every layer should expose (if appropriate) an `input_spec` attribute:
+  a list of instances of InputSpec (one per input tensor).
+
+  A None entry in a shape is compatible with any dimension,
+  a None shape is compatible with any shape.
+
+  Arguments:
+      dtype: Expected DataType of the input.
+      shape: Shape tuple, expected shape of the input
+          (may include None for unchecked axes).
+      ndim: Integer, expected rank of the input.
+      max_ndim: Integer, maximum rank of the input.
+      min_ndim: Integer, minimum rank of the input.
+      axes: Dictionary mapping integer axes to
+          a specific dimension value.
+  """
+
+  def __init__(self,
+               dtype=None,
+               shape=None,
+               ndim=None,
+               max_ndim=None,
+               min_ndim=None,
+               axes=None):
+    self.dtype = dtype
+    self.shape = shape
+    if shape is not None:
+      self.ndim = len(shape)
+    else:
+      self.ndim = ndim
+    self.max_ndim = max_ndim
+    self.min_ndim = min_ndim
+    self.axes = axes or {}
+
+  def __repr__(self):
+    spec = [('dtype=' + str(self.dtype)) if self.dtype else '',
+            ('shape=' + str(self.shape)) if self.shape else '',
+            ('ndim=' + str(self.ndim)) if self.ndim else '',
+            ('max_ndim=' + str(self.max_ndim)) if self.max_ndim else '',
+            ('min_ndim=' + str(self.min_ndim)) if self.min_ndim else '',
+            ('axes=' + str(self.axes)) if self.axes else '']
+    return 'InputSpec(%s)' % ', '.join(x for x in spec if x)
+
+
+def assert_input_compatibility(input_spec, inputs, layer_name):
+  """Checks compatibility between the layer and provided inputs.
+
+  This checks that the tensor(s) `inputs` verify the input assumptions
+  of a layer (if any). If not, a clear and actional exception gets raised.
+
+  Arguments:
+      input_spec: An InputSpec instance, or None.
+      inputs: Input tensor or list of input tensors.
+      layer_name: String, name of the layer (for error message formatting).
+
+  Raises:
+      ValueError: in case of mismatch between
+          the provided inputs and the expectations of the layer.
+  """
+  if not input_spec:
+    return
+  if not isinstance(input_spec, (list, tuple)):
+    input_spec = nest.flatten(input_spec)
+
+  inputs = nest.flatten(inputs)
+  if len(inputs) != len(input_spec):
+    raise ValueError('Layer ' + layer_name + ' expects ' +
+                     str(len(input_spec)) + ' inputs, '
+                     'but it received ' + str(len(inputs)) +
+                     ' input tensors. Inputs received: ' + str(inputs))
+  for input_index, (x, spec) in enumerate(zip(inputs, input_spec)):
+    if spec is None:
+      continue
+
+    if (spec.ndim is not None or
+        spec.min_ndim is not None or
+        spec.max_ndim is not None):
+      if x.shape.ndims is None:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'its rank is undefined, but the layer requires a '
+                         'defined rank.')
+
+    # Check ndim.
+    if spec.ndim is not None:
+      ndim = x.shape.ndims
+      if ndim != spec.ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected ndim=' + str(spec.ndim) + ', found ndim=' +
+                         str(ndim) + '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    if spec.max_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim > spec.max_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected max_ndim=' + str(spec.max_ndim) +
+                         ', found ndim=' + str(ndim))
+    if spec.min_ndim is not None:
+      ndim = x.shape.ndims
+      if ndim is not None and ndim < spec.min_ndim:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         ': expected min_ndim=' + str(spec.min_ndim) +
+                         ', found ndim=' + str(ndim) +
+                         '. Full shape received: ' +
+                         str(x.shape.as_list()))
+    # Check dtype.
+    if spec.dtype is not None:
+      if x.dtype != spec.dtype:
+        raise ValueError('Input ' + str(input_index) + ' of layer ' +
+                         layer_name + ' is incompatible with the layer: '
+                         'expected dtype=' + str(spec.dtype) +
+                         ', found dtype=' + str(x.dtype))
+    # Check specific shape axes.
+    if spec.axes:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for axis, value in spec.axes.items():
+          if hasattr(value, 'value'):
+            value = value.value
+          if value is not None and shape[int(axis)] not in {value, None}:
+            raise ValueError(
+                'Input ' + str(input_index) + ' of layer ' + layer_name + ' is'
+                ' incompatible with the layer: expected axis ' + str(axis) +
+                ' of input shape to have value ' + str(value) +
+                ' but received input with shape ' + str(shape))
+    # Check shape.
+    if spec.shape is not None:
+      shape = x.shape.as_list()
+      if shape is not None:
+        for spec_dim, dim in zip(spec.shape, shape):
+          if spec_dim is not None and dim is not None:
+            if spec_dim != dim:
+              raise ValueError('Input ' + str(input_index) +
+                               ' is incompatible with layer ' + layer_name +
+                               ': expected shape=' + str(spec.shape) +
+                               ', found shape=' + str(shape))
diff --git a/tensorflow/python/keras/engine/network.py b/tensorflow/python/keras/engine/network.py
index 9b58180e3d365bdb5acb8a6128b047bc67c6b8ef..7e6cc7bfeef97f9ad567aed82757a0a18e8c06be 100644
--- a/tensorflow/python/keras/engine/network.py
+++ b/tensorflow/python/keras/engine/network.py
@@ -36,7 +36,9 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import layer_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -112,11 +114,6 @@ class Network(base_layer.Layer):
     self.trainable = True
     self._is_compiled = False
     self._expects_training_arg = False
-    # A list of "extra" variables assigned to attributes of this class, included
-    # in self.weights and self.variables. Always empty for graph networks (but
-    # included in base_init to avoid excessive special casing when retrieving
-    # the value).
-    self._extra_variables = []
     # In many internal cases one needs to compute both the model's output
     # and its output mask without relying on `__call__` (which would do both and
     # set mask metadata), but for models, computing the mask requires to
@@ -134,12 +131,19 @@ class Network(base_layer.Layer):
       self.optimizer = None
 
     # Private attributes to implement compatibility with Layer.
+    self._trainable_weights = []
+    self._non_trainable_weights = []
     self._updates = []  # Used in symbolic mode only.
     self._losses = []
     self._eager_losses = []
+    # A list of metric instances corresponding to the symbolic metric tensors
+    # added using the `add_metric` API.
+    self._metrics = []
+    # A dictionary that maps metric names to metric result tensors.
+    self._metrics_tensors = {}
     self._scope = None  # Never used.
     self._reuse = None  # Never used.
-    self._can_use_graph_functions = False
+    self._call_is_graph_friendly = True
     if context.executing_eagerly():
       self._graph = None
     else:
@@ -160,7 +164,8 @@ class Network(base_layer.Layer):
 
   @checkpointable.no_automatic_dependency_tracking
   def _init_graph_network(self, inputs, outputs, name=None):
-    self._call_convention = base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
+    self._call_convention = (base_layer_utils
+                             .CallConvention.EXPLICIT_INPUTS_ARGUMENT)
     # Normalize and set self.inputs, self.outputs.
     if isinstance(inputs, (list, tuple)):
       self.inputs = list(inputs)  # Tensor or list of tensors.
@@ -170,43 +175,7 @@ class Network(base_layer.Layer):
       self.outputs = list(outputs)
     else:
       self.outputs = [outputs]
-
-    # Check for redundancy in inputs.
-    if len(set(self.inputs)) != len(self.inputs):
-      raise ValueError('The list of inputs passed to the model '
-                       'is redundant. '
-                       'All inputs should only appear once.'
-                       ' Found: ' + str(self.inputs))
-    for x in self.inputs:
-      # Check that x has appropriate `_keras_history` metadata.
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Input tensors to a ' + cls_name + ' ' +
-                         'must come from `tf.keras.Input`. '
-                         'Received: ' + str(x) +
-                         ' (missing previous layer metadata).')
-      # Check that x is an input tensor.
-      # pylint: disable=protected-access
-      layer, node_index, tensor_index = x._keras_history
-      if len(layer._inbound_nodes) > 1 or (
-          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
-        cls_name = self.__class__.__name__
-        logging.warning(cls_name + ' inputs must come from '
-                        '`tf.keras.Input` (thus holding past layer metadata), '
-                        'they cannot be the output of '
-                        'a previous non-Input layer. '
-                        'Here, a tensor specified as '
-                        'input to "' + self.name + '" was not an Input tensor, '
-                        'it was generated by layer ' + layer.name + '.\n'
-                        'Note that input tensors are '
-                        'instantiated via `tensor = tf.keras.Input(shape)`.\n'
-                        'The tensor that caused the issue was: ' + str(x.name))
-    for x in self.outputs:
-      if not hasattr(x, '_keras_history'):
-        cls_name = self.__class__.__name__
-        raise ValueError('Output tensors to a ' + cls_name + ' must be '
-                         'the output of a TensorFlow `Layer` '
-                         '(thus holding past layer metadata). Found: ' + str(x))
+    self._validate_graph_inputs_and_outputs()
 
     self._base_init(name=name)
     self._compute_previous_mask = (
@@ -258,10 +227,6 @@ class Network(base_layer.Layer):
 
     self._track_layers(layers)
 
-    # A Graph network supports defun-ed eager loops if all of its layers do.
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in layers)
-
     # Create the node linking internal inputs to internal outputs.
     base_layer.Node(
         outbound_layer=self,
@@ -282,9 +247,7 @@ class Network(base_layer.Layer):
       if layer.is_placeholder:
         self._feed_input_names.append(layer.name)
         self._feed_input_shapes.append(backend.int_shape(self.inputs[i]))
-        # layer.input gives an error in eager mode
-        if not context.executing_eagerly():
-          self._feed_inputs.append(layer.input)
+        self._feed_inputs.append(layer.input)
     for layer in self._output_layers:
       self.output_names.append(layer.name)
 
@@ -301,16 +264,15 @@ class Network(base_layer.Layer):
     self.outputs = []
     self.inputs = []
     self.built = False
-    self._static_graph_friendly = True
 
   @property
-  def _is_static_graph_friendly(self):
+  def _static_graph_friendly(self):
     if self._is_graph_network:
-      return all(layer._is_static_graph_friendly for layer in self.layers)
-    return self._static_graph_friendly
+      return all(layer._static_graph_friendly for layer in self.layers)
+    return self._call_is_graph_friendly
 
   def _determine_call_convention(self, call_argspec):
-    """Decides how `self.call()` is invoked. See base_layer.CallConvention."""
+    """Decides how `self.call()` is invoked. See `CallConvention`."""
     if call_argspec.varargs:
       may_take_single_argument = False
     else:
@@ -342,11 +304,11 @@ class Network(base_layer.Layer):
               "Model.call() takes a single positional argument (to which "
               "inputs are passed by convention) and a separate 'inputs' "
               "argument. Unable to determine which arguments are inputs.")
-        return base_layer.CallConvention.SINGLE_POSITIONAL_ARGUMENT
+        return base_layer_utils.CallConvention.SINGLE_POSITIONAL_ARGUMENT
     if 'inputs' in call_argspec.args:
-      return base_layer.CallConvention.EXPLICIT_INPUTS_ARGUMENT
+      return base_layer_utils.CallConvention.EXPLICIT_INPUTS_ARGUMENT
     else:
-      return base_layer.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
+      return base_layer_utils.CallConvention.POSITIONAL_ARGUMENTS_ARE_INPUTS
 
   def _track_layers(self, layers):
     """Add Checkpointable dependencies on a list of Layers."""
@@ -415,44 +377,26 @@ class Network(base_layer.Layer):
             # simply by assigning them to attributes.
           not self._is_graph_network
           and isinstance(value, variables.Variable)):
-        self._extra_variables.append(value)
+        if value.trainable:
+          # Could already be added via `add_weight`.
+          if value not in self._trainable_weights:
+            self._trainable_weights.append(value)
+        else:
+          if value not in self._non_trainable_weights:
+            self._non_trainable_weights.append(value)
+
+    # Keeping track of metric instance created in subclassed model/layer.
+    # We do this so that we can maintain the correct order of metrics by adding
+    # the instance to the `metrics` list as soon as it is created.
+    from tensorflow.python.keras import metrics as metrics_module  # pylint: disable=g-import-not-at-top
+    if isinstance(value, metrics_module.Metric):
+      self._metrics.append(value)
     super(Network, self).__setattr__(name, value)
 
-  def add_variable(self, name, shape, dtype=None, initializer=None,
-                   regularizer=None, trainable=True, constraint=None):
-    if self._is_graph_network:
-      raise NotImplementedError('`add_variable` is not supported on Networks.')
-    else:
-      raise NotImplementedError(
-          '`add_variable` is not supported on Networks. However, you may '
-          'assign variables to attributes and they will show up in the weights '
-          'and variables properties.')
-
-  def add_weight(self,
-                 name,
-                 shape,
-                 dtype=None,
-                 initializer=None,
-                 regularizer=None,
-                 trainable=None,
-                 constraint=None,
-                 partitioner=None,
-                 use_resource=None,
-                 synchronization=variables.VariableSynchronization.AUTO,
-                 aggregation=variables.VariableAggregation.NONE,
-                 **kwargs):
-    if self._is_graph_network:
-      raise NotImplementedError('`add_weight` is not supported on Networks.')
-    else:
-      raise NotImplementedError(
-          '`add_weight` is not supported on Networks. However, you may '
-          'assign variables to attributes and they will show up in the weights '
-          'and variables properties.')
-
   @property
   def stateful(self):
-    return any([(hasattr(layer, 'stateful') and layer.stateful)
-                for layer in self.layers])
+    return any((hasattr(layer, 'stateful') and layer.stateful)
+               for layer in self.layers)
 
   def reset_states(self):
     for layer in self.layers:
@@ -557,14 +501,13 @@ class Network(base_layer.Layer):
 
   @property
   def _unfiltered_updates(self):
-    if context.executing_eagerly():
-      return []
     updates = []
     for layer in self.layers:
       if isinstance(layer, Network):
         updates += layer._unfiltered_updates
       else:
         updates += layer.updates
+    updates += self._updates
     return updates
 
   @property
@@ -641,9 +584,6 @@ class Network(base_layer.Layer):
     Returns:
         A list of update ops.
     """
-    if context.executing_eagerly():
-      return []
-
     if not self.trainable and not self.stateful:
       return []
 
@@ -659,7 +599,7 @@ class Network(base_layer.Layer):
       else:
         relevant_inputs.append(inputs)
     if not relevant_inputs:
-      return updates
+      return list(set(updates))
 
     reachable = tf_utils.get_reachable_from_inputs(relevant_inputs, updates)
     relevant_conditional_updates = [x for x in updates if x in reachable]
@@ -667,8 +607,7 @@ class Network(base_layer.Layer):
         x for x in updates if x._unconditional_update]  # pylint: disable=protected-access
     # A layer could be used multiple times in a nested structure,
     # so the updates list must be de-duped.
-    return list(set(
-        relevant_conditional_updates + unconditional_updates + self._updates))
+    return list(set(relevant_conditional_updates + unconditional_updates))
 
   @property
   def losses(self):
@@ -728,14 +667,38 @@ class Network(base_layer.Layer):
     return checkpointable_layer_utils.gather_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._trainable_weights)
 
   @property
   def non_trainable_weights(self):
     return checkpointable_layer_utils.gather_non_trainable_weights(
         trainable=self.trainable,
         sub_layers=self._layers,
-        extra_variables=self._extra_variables)
+        extra_variables=self._non_trainable_weights + self._trainable_weights)
+
+  @property
+  def metrics(self):
+    """Returns the network's symbolic metrics.
+
+    Model overrides this function to include the metrics from `compile` API.
+    """
+    metrics = []
+    for layer in self.layers:
+      metrics += layer._metrics  # pylint: disable=protected-access
+    return metrics + self._metrics
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    # TODO(psv): Remove this property.
+    metrics_tensors = {}
+    for layer in self.layers:
+      if isinstance(layer, Network):
+        metrics_tensors.update(layer._all_metrics_tensors)
+      else:
+        metrics_tensors.update(layer._metrics_tensors)
+    metrics_tensors.update(self._metrics_tensors)
+    return metrics_tensors
 
   @property
   def input_spec(self):
@@ -771,6 +734,11 @@ class Network(base_layer.Layer):
     This is to be used for subclassed models, which do not know at instantiation
     time what their inputs look like.
 
+    This method only exists for users who want to call `model.build()` in a
+    standalone way (as a substitute for calling the model on real data to
+    build it). It will never be called by the framework (and thus it will
+    never throw unexpected errors in an unrelated workflow).
+
     Args:
      input_shape: Single tuple, TensorShape, or list of shapes, where shapes
          are tuples, integers, or TensorShapes.
@@ -807,48 +775,53 @@ class Network(base_layer.Layer):
       # in a Graph. Since tf.Variable is compatible with both eager execution
       # and graph building, the variables created after building the model in
       # a Graph are still valid when executing eagerly.
-      with context.graph_mode():
-        graph = func_graph.FuncGraph('graph')
-        with graph.as_default():
-          if isinstance(input_shape, list):
-            x = [base_layer.generate_placeholders_from_shape(shape)
-                 for shape in input_shape]
+      if context.executing_eagerly():
+        graph = func_graph.FuncGraph('build_graph')
+      else:
+        graph = backend.get_graph()
+      with graph.as_default():
+        if isinstance(input_shape, list):
+          x = [base_layer_utils.generate_placeholders_from_shape(shape)
+               for shape in input_shape]
+        else:
+          x = base_layer_utils.generate_placeholders_from_shape(input_shape)
+
+        kwargs = {}
+        call_signature = tf_inspect.getfullargspec(self.call)
+        call_args = call_signature.args
+        # Exclude `self`, `inputs`, and any argument with a default value.
+        if len(call_args) > 2:
+          if call_signature.defaults:
+            call_args = call_args[2:-len(call_signature.defaults)]
           else:
-            x = base_layer.generate_placeholders_from_shape(input_shape)
-
-          kwargs = {}
-          num_call_args = len(tf_inspect.getfullargspec(self.call).args)
-          if self._expects_training_arg and num_call_args == 3:
-            # Has call signature of call(self, input, training)
-            kwargs['training'] = False
-          elif num_call_args > 2:
-            # Has invalid call signature of call(self, input, *args, **kwargs)
-            raise ValueError('Currently, you cannot build your model if it has '
-                             'positional or keyword arguments that are not '
-                             'inputs to the model, but are required for its '
-                             '`call` method. Instead, in order to instantiate '
-                             'and build your model, `call` your model on real '
-                             'tensor data with all expected call arguments.')
-
-          try:
-            self.call(x, **kwargs)
-          except (errors.InvalidArgumentError, TypeError):
-            raise ValueError('You cannot build your model by calling `build` '
-                             'if your layers do not support float type inputs. '
-                             'Instead, in order to instantiate and build your '
-                             'model, `call` your model on real tensor data (of '
-                             'the correct dtype).')
-
+            call_args = call_args[2:]
+          for arg in call_args:
+            if arg == 'training':
+              # Case where `training` is a positional arg with no default.
+              kwargs['training'] = False
+            else:
+              # Has invalid call signature with unknown positional arguments.
+              raise ValueError(
+                  'Currently, you cannot build your model if it has '
+                  'positional or keyword arguments that are not '
+                  'inputs to the model, but are required for its '
+                  '`call` method. Instead, in order to instantiate '
+                  'and build your model, `call` your model on real '
+                  'tensor data with all expected call arguments.')
+        elif len(call_args) < 2:
+          # Signature without `inputs`.
+          raise ValueError('You can only call `build` on a model if its `call` '
+                           'method accepts an `inputs` argument.')
+        try:
+          self.call(x, **kwargs)
+        except (errors.InvalidArgumentError, TypeError):
+          raise ValueError('You cannot build your model by calling `build` '
+                           'if your layers do not support float type inputs. '
+                           'Instead, in order to instantiate and build your '
+                           'model, `call` your model on real tensor data (of '
+                           'the correct dtype).')
     if self._layers:
       self._track_layers(self._layers)
-    if self.layers:
-      for layer in self.layers:
-        if not layer.built:
-          raise ValueError('Layer: {} was not built in your model. Calling '
-                           '`build` manually on a subclassed model is only '
-                           'allowed for models with a static topology. '
-                           'In this case, you can build your model by '
-                           'calling it on real tensor data.'.format(layer))
     self.built = True
 
   def call(self, inputs, training=None, mask=None):
@@ -895,9 +868,7 @@ class Network(base_layer.Layer):
 
   def compute_output_shape(self, input_shape):
     if not self._is_graph_network:
-      if context.executing_eagerly():
-        return super(Network, self).compute_output_shape(input_shape)
-      raise NotImplementedError
+      return super(Network, self).compute_output_shape(input_shape)
 
     if isinstance(input_shape, list):
       input_shapes = []
@@ -1686,6 +1657,62 @@ class Network(base_layer.Layer):
                               positions=positions,
                               print_fn=print_fn)
 
+  def _validate_graph_inputs_and_outputs(self):
+    """Validates the inputs and outputs of a Graph Network."""
+    # Check for redundancy in inputs.
+    if len(set(self.inputs)) != len(self.inputs):
+      raise ValueError('The list of inputs passed to the model '
+                       'is redundant. '
+                       'All inputs should only appear once.'
+                       ' Found: ' + str(self.inputs))
+
+    for x in self.inputs:
+      # Check that x has appropriate `_keras_history` metadata.
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Input tensors to a ' + cls_name + ' ' +
+                         'must come from `tf.keras.Input`. '
+                         'Received: ' + str(x) +
+                         ' (missing previous layer metadata).')
+      # Check that x is an input tensor.
+      # pylint: disable=protected-access
+      layer, _, _ = x._keras_history
+      if len(layer._inbound_nodes) > 1 or (
+          layer._inbound_nodes and layer._inbound_nodes[0].inbound_layers):
+        cls_name = self.__class__.__name__
+        logging.warning(cls_name + ' inputs must come from '
+                        '`tf.keras.Input` (thus holding past layer metadata), '
+                        'they cannot be the output of '
+                        'a previous non-Input layer. '
+                        'Here, a tensor specified as '
+                        'input to "' + self.name + '" was not an Input tensor, '
+                        'it was generated by layer ' + layer.name + '.\n'
+                        'Note that input tensors are '
+                        'instantiated via `tensor = tf.keras.Input(shape)`.\n'
+                        'The tensor that caused the issue was: ' + str(x.name))
+
+    # Check compatibility of batch sizes of Input Layers.
+    input_batch_sizes = [
+        training_utils.get_static_batch_size(x._keras_history[0])
+        for x in self.inputs
+    ]
+    consistent_batch_size = None
+    for batch_size in input_batch_sizes:
+      if batch_size is not None:
+        if (consistent_batch_size is not None and
+            batch_size != consistent_batch_size):
+          raise ValueError('The specified batch sizes of the Input Layers'
+                           ' are incompatible. Found batch sizes: {}'.format(
+                               input_batch_sizes))
+        consistent_batch_size = batch_size
+
+    for x in self.outputs:
+      if not hasattr(x, '_keras_history'):
+        cls_name = self.__class__.__name__
+        raise ValueError('Output tensors to a ' + cls_name + ' must be '
+                         'the output of a TensorFlow `Layer` '
+                         '(thus holding past layer metadata). Found: ' + str(x))
+
 
 def _is_hdf5_filepath(filepath):
   return (filepath.endswith('.h5') or filepath.endswith('.keras') or
diff --git a/tensorflow/python/keras/engine/saving.py b/tensorflow/python/keras/engine/saving.py
index 61bff7fff23d188117ab6d86dc4ff2940568a055..54d9e32fb258343dfd9b75351015959952893c1a 100644
--- a/tensorflow/python/keras/engine/saving.py
+++ b/tensorflow/python/keras/engine/saving.py
@@ -79,6 +79,10 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
 
   from tensorflow.python.keras import __version__ as keras_version  # pylint: disable=g-import-not-at-top
 
+  # TODO(psv) Add warning when we save models that contain non-serializable
+  # entities like metrics added using `add_metric` and losses added using
+  # `add_loss.`
+
   if not isinstance(filepath, h5py.File):
     # If file exists and should not be overwritten.
     if not overwrite and os.path.isfile(filepath):
@@ -126,8 +130,8 @@ def save_model(model, filepath, overwrite=True, include_optimizer=True):
                     'config': model.optimizer.get_config()
                 },
                 'loss': model.loss,
-                'metrics': model.metrics,
-                'weighted_metrics': model.weighted_metrics,
+                'metrics': model._compile_metrics,
+                'weighted_metrics': model._compile_weighted_metrics,
                 'sample_weight_mode': model.sample_weight_mode,
                 'loss_weights': model.loss_weights,
             },
@@ -913,7 +917,7 @@ def save_attributes_to_hdf5_group(group, name, data):
   chunked_data = np.array_split(data_npy, num_chunks)
 
   # This will never loop forever thanks to the test above.
-  while any([x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data]):
+  while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data):
     num_chunks += 1
     chunked_data = np.array_split(data_npy, num_chunks)
 
diff --git a/tensorflow/python/keras/engine/saving_test.py b/tensorflow/python/keras/engine/saving_test.py
index f376f081cfe4be2bdcb5df1ef080d0546f485a0c..bc33a3ea7f3ef38e9f94854043fe7bdc7a9bfe46 100644
--- a/tensorflow/python/keras/engine/saving_test.py
+++ b/tensorflow/python/keras/engine/saving_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import saving
 from tensorflow.python.keras.engine import training
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -288,6 +289,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
                                  r'element\(s\)\.'):
       saving.load_weights_from_hdf5_group_by_name(f_model, model.layers)
 
+  @test_util.run_deprecated_v1
   def test_sequential_weight_loading_group_name_with_incorrect_shape(self):
     if h5py is None:
       return
@@ -330,6 +332,7 @@ class TestWeightSavingAndLoading(test.TestCase, parameterized.TestCase):
 
 class TestWholeModelSaving(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_sequential_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -382,6 +385,7 @@ class TestWholeModelSaving(test.TestCase):
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_sequential_model_saving_without_input_shape(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -442,6 +446,7 @@ class TestWholeModelSaving(test.TestCase):
       out2 = new_model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_sequential_model_saving_2(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -478,6 +483,7 @@ class TestWholeModelSaving(test.TestCase):
       out2 = model.predict(x)
       self.assertAllClose(out, out2, atol=1e-05)
 
+  @test_util.run_deprecated_v1
   def test_functional_model_saving(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -629,6 +635,7 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  @test_util.run_v1_only('b/120545219')
   def test_saving_model_with_long_weights_names(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -674,6 +681,7 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
+  @test_util.run_deprecated_v1
   def test_model_saving_to_pre_created_h5py_file(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -715,7 +723,6 @@ class TestWholeModelSaving(test.TestCase):
       os.close(fd)
       os.remove(fname)
 
-
   def test_saving_constant_initializer_with_numpy(self):
     if h5py is None:
       self.skipTest('h5py required to run this test')
@@ -749,6 +756,7 @@ class SubclassedModel(training.Model):
 
 class TestWeightSavingAndLoadingTFFormat(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_keras_optimizer_warning(self):
     graph = ops.Graph()
     with graph.as_default(), self.session(graph):
@@ -992,5 +1000,57 @@ class TestWeightSavingAndLoadingTFFormat(test.TestCase):
         AssertionError, 'Nothing except the root object matched'):
       m.load_weights(save_path)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_directory_passed(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    self.evaluate(v.assign(42.))
+    prefix = os.path.join(self.get_temp_dir(), '{}'.format(ops.uid()), 'ckpt/')
+    m.save_weights(prefix)
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_relative_path(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    os.chdir(self.get_temp_dir())
+
+    prefix = 'ackpt'
+    self.evaluate(v.assign(42.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('ackpt.index'))
+    self.evaluate(v.assign(1.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
+    prefix = 'subdir/ackpt'
+    self.evaluate(v.assign(43.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('subdir/ackpt.index'))
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(43., self.evaluate(v))
+
+    prefix = 'ackpt/'
+    self.evaluate(v.assign(44.))
+    m.save_weights(prefix)
+    self.assertTrue(file_io.file_exists('ackpt/.index'))
+    self.evaluate(v.assign(3.))
+    m.load_weights(prefix)
+    self.assertEqual(44., self.evaluate(v))
+
+  @test_util.run_in_graph_and_eager_modes
+  def test_nonexistant_prefix_directory(self):
+    m = keras.Model()
+    v = m.add_weight(name='v', shape=[])
+    self.evaluate(v.assign(42.))
+    prefix = os.path.join(self.get_temp_dir(), '{}'.format(ops.uid()), 'bckpt')
+    m.save_weights(prefix)
+    self.evaluate(v.assign(2.))
+    m.load_weights(prefix)
+    self.assertEqual(42., self.evaluate(v))
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/sequential.py b/tensorflow/python/keras/engine/sequential.py
index 5ce4ca4df41c7a147009f9eabe340518bc02c8dd..3255613f6af07988e874339b96002355e39e6d14 100644
--- a/tensorflow/python/keras/engine/sequential.py
+++ b/tensorflow/python/keras/engine/sequential.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import layers as layer_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
 from tensorflow.python.keras.engine.network import Network
@@ -120,8 +121,8 @@ class Sequential(Model):
     return layers[:]
 
   @property
-  def _is_static_graph_friendly(self):
-    return all(layer._is_static_graph_friendly for layer in self.layers)
+  def _static_graph_friendly(self):
+    return all(layer._static_graph_friendly for layer in self.layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def add(self, layer):
@@ -150,7 +151,7 @@ class Sequential(Model):
         assert len(layer._inbound_nodes[-1].output_tensors) == 1
         set_inputs = True
       else:
-        batch_shape, dtype = get_input_shape_and_dtype(layer)
+        batch_shape, dtype = training_utils.get_input_shape_and_dtype(layer)
         if batch_shape:
           # Instantiate an input layer.
           x = Input(
@@ -190,8 +191,6 @@ class Sequential(Model):
       self._layers.append(layer)
     if self._layers:
       self._track_layers(self._layers)
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in self.layers)
 
   @checkpointable.no_automatic_dependency_tracking
   def pop(self):
@@ -213,23 +212,17 @@ class Sequential(Model):
       self.outputs = [self.layers[-1].output]
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
       self.built = True
-    self._can_use_graph_functions = all(
-        layer._can_use_graph_functions for layer in self.layers)
 
+  @base_layer.default
   def build(self, input_shape=None):
     if self._is_graph_network:
       self._init_graph_network(self.inputs, self.outputs, name=self.name)
     else:
       if input_shape is None:
         raise ValueError('You must provide an `input_shape` argument.')
+      input_shape = tuple(input_shape)
       self._build_input_shape = input_shape
-      shape = input_shape
-      for layer in self.layers:
-        if not layer.built:
-          with ops.name_scope(layer._name_scope()):
-            layer.build(shape)
-          layer.built = True
-        shape = layer.compute_output_shape(shape)
+      super(Sequential, self).build(input_shape)
     self.built = True
 
   def call(self, inputs, training=None, mask=None):
@@ -241,8 +234,8 @@ class Sequential(Model):
     return outputs
 
   def _call_and_compute_mask(self, inputs, training=None, mask=None):
-    if not self.built:
-      self.build(inputs.shape)
+    if not self.built and self._is_graph_network:
+      self._init_graph_network(self.inputs, self.outputs, name=self.name)
 
     x = inputs
     for layer in self.layers:
@@ -255,6 +248,11 @@ class Sequential(Model):
       if isinstance(layer, Network) and layer._compute_output_and_mask_jointly:
         x, mask = layer._call_and_compute_mask(x, **kwargs)
       else:
+        if not layer.built:
+          # Build layer if applicable.
+          with ops.name_scope(layer._name_scope()):
+            layer._maybe_build(x)
+          layer.built = True
         x = layer.call(x, **kwargs)
         if layer.supports_masking:
           mask = layer.compute_mask(x, mask)
@@ -362,38 +360,3 @@ class Sequential(Model):
     if self.layers and hasattr(self.layers[0], 'input_spec'):
       return self.layers[0].input_spec
     return None
-
-
-def get_input_shape_and_dtype(layer):
-  """Retrieve input shape and input dtype of layer if applicable.
-
-  Args:
-    layer: Layer (or model) instance.
-
-  Returns:
-    Tuple (input_shape, input_dtype). Both could be None if the layer
-      does not have a defined input shape.
-
-  Raises:
-    ValueError: in case an empty Sequential or Graph Network is passed.
-  """
-  if ((isinstance(layer, Model) and layer._is_graph_network)
-      or isinstance(layer, Sequential)):
-    # We were passed a model as first layer.
-    # This requires a specific way to figure out the
-    # input shape and dtype.
-    if not layer.layers:
-      raise ValueError('Cannot add an empty model '
-                       'to a `Sequential` model.')
-    # In case of nested models: recover the first layer
-    # of the deepest model to infer input shape and dtype.
-    layer = layer.layers[0]
-    while ((isinstance(layer, Model) and layer._is_graph_network)
-           or isinstance(layer, Sequential)):
-      layer = layer.layers[0]
-
-  if hasattr(layer, '_batch_input_shape'):
-    batch_shape = layer._batch_input_shape
-    dtype = layer.dtype
-    return batch_shape, dtype
-  return None, None
diff --git a/tensorflow/python/keras/engine/sequential_test.py b/tensorflow/python/keras/engine/sequential_test.py
index ea8fdf675a0a32c21c76cdd76045eefa4302fd49..10f69da061c336cd1727ce4d34f1637e21329f3a 100644
--- a/tensorflow/python/keras/engine/sequential_test.py
+++ b/tensorflow/python/keras/engine/sequential_test.py
@@ -26,17 +26,18 @@ from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 from tensorflow.python.training import rmsprop
 
 
-class TestSequential(test.TestCase, parameterized.TestCase):
+class TestSequential(keras_parameterized.TestCase):
   """Most Sequential model API tests are covered in `training_test.py`.
   """
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_basic_methods(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1, input_dim=2))
@@ -47,7 +48,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertEqual(model.get_layer(name='dp').name, 'dp')
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_pop(self):
     num_hidden = 5
     input_dim = 3
@@ -56,14 +57,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
     model = testing_utils.get_small_sequential_mlp(
         num_hidden, num_classes, input_dim)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((batch_size, input_dim))
     y = np.random.random((batch_size, num_classes))
     model.fit(x, y, epochs=1)
     model.pop()
     self.assertEqual(len(model.layers), 1)
     self.assertEqual(model.output_shape, (None, num_hidden))
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     y = np.random.random((batch_size, num_hidden))
     model.fit(x, y, epochs=1)
 
@@ -79,7 +82,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     with self.assertRaises(TypeError):
       model.pop()
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_np_arrays(self):
     num_hidden = 5
     input_dim = 3
@@ -90,7 +93,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -102,7 +106,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertFalse(model._is_graph_network)
     self.assertEqual(len(model.weights), 2 * 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_with_dataset_iterators(self):
     num_hidden = 5
     input_dim = 3
@@ -114,7 +118,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertEqual(len(model.layers), 2)
     self.assertEqual(len(model.weights), 0)
     self.assertFalse(model.built)
@@ -124,14 +129,16 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
 
     model.fit(iterator, epochs=1, steps_per_epoch=steps_per_epoch)
     self.assertTrue(model.built)
     self.assertEqual(len(model.weights), 2 * 2)
     self.assertFalse(model._is_graph_network)
 
+  # TODO(kaftan) This test fails w/ run_with_all_keras_modes. File ticket
   @parameterized.parameters((True,), (False,))
+  @tf_test_util.run_deprecated_v1
   def test_training_and_eval_methods_on_symbolic_tensors(self, deferred):
     with self.cached_session():
 
@@ -174,7 +181,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
           validation_data=(inputs, targets),
           validation_steps=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_use_cases(self):
     # Added objects must be layer instances
     with self.assertRaises(TypeError):
@@ -198,7 +205,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       model.add(keras.layers.Dense(1, input_dim=1))
       model.add(MyLayer())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_nested_sequential_trainability(self):
     input_dim = 20
     num_units = 10
@@ -219,6 +226,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     inner_model.trainable = True
     self.assertEqual(len(model.trainable_weights), 4)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_sequential_update_disabling(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -247,7 +255,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
       x2 = model.predict(val_a)
       assert np.abs(np.sum(x1 - x2)) > 1e-5
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_deferred_build_serialization(self):
     num_hidden = 5
     input_dim = 3
@@ -258,7 +266,8 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     model.compile(
         loss='mse',
         optimizer=rmsprop.RMSPropOptimizer(1e-3),
-        metrics=[keras.metrics.CategoricalAccuracy()])
+        metrics=[keras.metrics.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
     self.assertFalse(model.built)
 
     x = np.random.random((batch_size, input_dim))
@@ -273,13 +282,13 @@ class TestSequential(test.TestCase, parameterized.TestCase):
     self.assertEqual(len(new_model.layers), 2)
     self.assertEqual(len(new_model.weights), 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_shape_inference_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
     output_shape = model.compute_output_shape((None, 7))
     self.assertEqual(tuple(output_shape.as_list()), (None, 5))
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_build_deferred(self):
     model = testing_utils.get_small_sequential_mlp(4, 5)
 
@@ -294,21 +303,21 @@ class TestSequential(test.TestCase, parameterized.TestCase):
 
     model.build((None, 10))
     self.assertTrue(model.built)
-    self.assertTrue(model.layers[-1].built)
     self.assertEqual(len(model.weights), 8)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_nesting(self):
     model = testing_utils.get_small_sequential_mlp(4, 3)
     inner_model = testing_utils.get_small_sequential_mlp(4, 5)
     model.add(inner_model)
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_variable_names(self):
     model = keras.models.Sequential([keras.layers.Dense(3)])
     model.add(keras.layers.Dense(2))
@@ -318,7 +327,7 @@ class TestSequential(test.TestCase, parameterized.TestCase):
          'sequential/dense_1/kernel:0', 'sequential/dense_1/bias:0'],
         [v.name for v in model.variables])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_input_assumptions_propagation(self):
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(1))
@@ -328,9 +337,9 @@ class TestSequential(test.TestCase, parameterized.TestCase):
         model(1.0)
 
 
-class TestSequentialEagerIntegration(test.TestCase):
+class TestSequentialEagerIntegration(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_defun_on_call(self):
     # Check that one can subclass Sequential and place the `call` in a `defun`.
 
@@ -344,17 +353,19 @@ class TestSequentialEagerIntegration(test.TestCase):
     model.add(keras.layers.Dense(4, activation='relu'))
     model.add(keras.layers.Dense(5, activation='softmax'))
 
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.random.random((2, 6))
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_build_before_fit(self):
     # Fix for b/112433577
     model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3))
+    model.compile(loss='mse', optimizer=rmsprop.RMSPropOptimizer(1e-3),
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     model.build((None, 6))
 
@@ -362,30 +373,7 @@ class TestSequentialEagerIntegration(test.TestCase):
     y = np.random.random((2, 5))
     model.fit(x, y, epochs=1)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_sequential_can_use_graph_functions(self):
-    model = testing_utils.get_small_sequential_mlp(4, 3)
-    self.assertTrue(model._can_use_graph_functions)
-    inner_model = testing_utils.get_small_sequential_mlp(4, 5)
-    model.add(inner_model)
-
-    self.assertTrue(model._can_use_graph_functions)
-
-    inner_model_two = testing_utils.get_small_sequential_mlp(5, 7)
-    self.assertTrue(inner_model_two._can_use_graph_functions)
-
-    layer = keras.layers.Lambda(lambda x: x)
-    layer._can_use_graph_functions = False
-    inner_model_two.add(layer)
-    self.assertFalse(inner_model_two._can_use_graph_functions)
-
-    model.add(inner_model_two)
-    self.assertFalse(model._can_use_graph_functions)
-
-    model.pop()
-    self.assertTrue(model._can_use_graph_functions)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_sequential_model_fails_with_dict_inputs(self):
     num_classes = 5
     model = testing_utils.get_small_sequential_mlp(
@@ -394,7 +382,8 @@ class TestSequentialEagerIntegration(test.TestCase):
         rmsprop.RMSPropOptimizer(learning_rate=0.001),
         metrics=['acc'],
         weighted_metrics=['mae'],
-        loss='categorical_crossentropy')
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x = {'dense_input': np.random.random((10, 1))}
     y = np.random.randint(num_classes, size=(10, 1))
diff --git a/tensorflow/python/keras/engine/topology_test.py b/tensorflow/python/keras/engine/topology_test.py
index b4a4babf25924a615472cb11e15b7ddc49253bc3..4071e2c091eede29af9418105e63c157ce2dc101 100644
--- a/tensorflow/python/keras/engine/topology_test.py
+++ b/tensorflow/python/keras/engine/topology_test.py
@@ -42,6 +42,7 @@ except ImportError:
 
 class TopologyConstructionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_get_updates(self):
 
     class MyLayer(keras.layers.Layer):
@@ -106,6 +107,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(network.updates), 5)
     self.assertEqual(len(network.get_updates_for(x4)), 2)
 
+  @test_util.run_v1_only('b/120545219')
   def test_get_updates_bn(self):
     x1 = input_layer_lib.Input(shape=(1,))
     layer = keras.layers.BatchNormalization()
@@ -115,6 +117,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(len(layer.get_updates_for(x1)), 2)
     self.assertEqual(len(layer.get_updates_for(None)), 0)
 
+  @test_util.run_deprecated_v1
   def test_get_losses(self):
 
     class MyLayer(keras.layers.Layer):
@@ -268,6 +271,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertEqual(test_layer.input_shape, [(None, 32), (None, 32)])
     self.assertEqual(test_layer.output_shape, (None, 32))
 
+  @test_util.run_deprecated_v1
   def testBasicNetwork(self):
     # minimum viable network
     x = input_layer_lib.Input(shape=(32,))
@@ -341,6 +345,7 @@ class TopologyConstructionTest(test.TestCase):
     self.assertListEqual(model.trainable_weights, [])
     self.assertListEqual(model.non_trainable_weights, weights)
 
+  @test_util.run_deprecated_v1
   def test_layer_call_arguments(self):
     # Test the ability to pass and serialize arguments to `call`.
     inp = keras.layers.Input(shape=(2,))
@@ -491,6 +496,7 @@ class TopologyConstructionTest(test.TestCase):
       fn_outputs = fn([input_a_np, input_b_np])
       self.assertListEqual([x.shape for x in fn_outputs], [(10, 64), (10, 5)])
 
+  @test_util.run_deprecated_v1
   def test_recursion(self):
     with self.cached_session():
       a = keras.layers.Input(shape=(32,), name='input_a')
@@ -675,6 +681,7 @@ class TopologyConstructionTest(test.TestCase):
     with self.assertRaises(Exception):
       keras.models.Model([j, k], [m, n, 0])
 
+  @test_util.run_deprecated_v1
   def test_raw_tf_compatibility(self):
     # test calling layers/models on TF tensors
     a = keras.layers.Input(shape=(32,), name='input_a')
@@ -719,6 +726,7 @@ class TopologyConstructionTest(test.TestCase):
     model = keras.models.Model(a, b)
     self.assertEqual(model.output_mask.get_shape().as_list(), [None, 10])
 
+  @test_util.run_deprecated_v1
   def testMaskingSingleInput(self):
 
     class MaskedLayer(keras.layers.Layer):
@@ -756,6 +764,7 @@ class TopologyConstructionTest(test.TestCase):
       y_2 = network(x_2)
       self.assertEqual(y_2.get_shape().as_list(), [None, 32])
 
+  @test_util.run_deprecated_v1
   def test_activity_regularization_with_model_composition(self):
 
     def reg(x):
@@ -825,6 +834,7 @@ class TopologyConstructionTest(test.TestCase):
       output_val_2 = m2.predict(x_val)
       self.assertAllClose(output_val, output_val_2, atol=1e-6)
 
+  @test_util.run_v1_only('b/120545219')
   def test_explicit_training_argument(self):
     with self.cached_session():
       a = keras.layers.Input(shape=(2,))
@@ -1145,6 +1155,7 @@ class DefaultShapeInferenceBehaviorTest(test.TestCase):
 
 class GraphUtilsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGetReachableFromInputs(self):
 
     with self.cached_session():
diff --git a/tensorflow/python/keras/engine/training.py b/tensorflow/python/keras/engine/training.py
index cb96e3e5d208594984ea1cc69189309ecd2715a1..462694fda690fbaa2d1474b9b1ddba558a84e201 100644
--- a/tensorflow/python/keras/engine/training.py
+++ b/tensorflow/python/keras/engine/training.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import weakref
 import numpy as np
 
@@ -26,6 +27,7 @@ from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import losses
@@ -40,6 +42,8 @@ from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.engine.network import Network
 from tensorflow.python.keras.utils import data_utils
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -121,11 +125,8 @@ class Model(Network):
     # initializing _distribution_strategy here since it is possible to call
     # predict on a model without compiling it.
     self._distribution_strategy = None
-    # This flag must be disabled upon model mutation, such as changing the model
-    # layers or recompiling the model to use a different optimizer. New function
-    # definitions are generated whenever this flag is disabled, ensuring that
-    # internal graph functions are always using the current model structure.
-    self._built_graph_functions = False
+
+    self.run_eagerly = None
 
   def _set_sample_weight_attributes(self, sample_weight_mode,
                                     skip_target_weighing_indices):
@@ -177,25 +178,66 @@ class Model(Network):
       metric_name = '%s_%s' % (self.output_names[output_index], metric_name)
     j = 1
     base_metric_name = metric_name
-    while metric_name in self.metrics_names:
+    while metric_name in self._compile_metrics_names:
       metric_name = '%s_%d' % (base_metric_name, j)
       j += 1
 
     return metric_name
 
+  @property
+  def metrics(self):
+    """Returns the model's metrics added using `compile`, `add_metric` APIs."""
+    metrics = []
+    if self._is_compiled:
+      metrics += self._compile_stateful_metric_functions
+    return metrics + super(Model, self).metrics
+
+  @property
+  def metrics_names(self):
+    """Returns the model's display labels for all outputs."""
+    metrics_names = []
+    if self._is_compiled:
+      metrics_names += self._compile_metrics_names  # Includes names of losses.
+
+    # Add metric names from layers.
+    for layer in self.layers:
+      metrics_names += [m.name for m in layer._metrics]  # pylint: disable=protected-access
+    metrics_names += [m.name for m in self._metrics]
+    return metrics_names
+
+  @property
+  def _all_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
+  @property
+  def _all_stateful_metrics_tensors(self):
+    """Returns the network's symbolic metric tensors."""
+    metrics_tensors = {}
+    if self._is_compiled:
+      metrics_tensors.update(self._compile_stateful_metrics_tensors)
+    metrics_tensors.update(super(Model, self)._all_metrics_tensors)
+    return metrics_tensors
+
   def _init_metric_attributes(self):
     """Initialized model metric attributes."""
     # List of all metric names in the model.
-    self.metrics_names = ['loss']
-    # List of all aggregated metric result tensors. This includes aggregated
-    # loss result tensors.
-    self._stateful_metrics_tensors = []
-    # List of all metric result tensors (aggregated or not - based on the
-    # values given in compile.)
-    self.metrics_tensors = []
+    self._compile_metrics_names = ['loss']
     # List of stateful metric functions. Used for resetting metric state during
-    # training/eval. This includes loss functions.
-    self.stateful_metric_functions = []
+    # training/eval.
+    # This includes loss functions when there are multiple outputs.
+    self._compile_stateful_metric_functions = []
+    # Dict of all aggregated metric result tensors. This includes aggregated
+    # loss result tensors when there are multiple outputs.
+    self._compile_stateful_metrics_tensors = {}
+    # Dict of all metric result tensors (aggregated or not - based on the
+    # values given in compile.). This includes aggregated loss result tensors
+    # when there are multiple outputs.
+    self._compile_metrics_tensors = {}
 
   def _set_per_output_metric_attributes(self, metrics_dict, output_index):
     """Sets the metric attributes on the model for the given output.
@@ -204,24 +246,39 @@ class Model(Network):
       metrics_dict: A dict with metric names as keys and metric fns as values.
       output_index: The index of the model output for which the metric
         attributes are added.
+
+    Returns:
+      Metrics dict updated with unique metric names as keys.
     """
-    for metric_name, (_, stateful_metric_fn) in metrics_dict.items():
+    updated_metrics_dict = collections.OrderedDict()
+    for metric_name, (metric_fn, stateful_metric_fn) in metrics_dict.items():
       metric_name = self._add_unique_metric_name(metric_name, output_index)
-      # Keep track of metric name.
-      self.metrics_names.append(metric_name)
-
-      # Keep track of stateful metric function.
-      self.stateful_metric_functions.append(stateful_metric_fn)
+      updated_metrics_dict[metric_name] = (metric_fn, stateful_metric_fn)
+      # Keep track of metric name, function and stateful function.
+      self._compile_metrics_names.append(metric_name)
+      self._compile_stateful_metric_functions.append(stateful_metric_fn)
+    return updated_metrics_dict
 
   def _set_metric_attributes(self, outputs, skip_target_indices=None):
     """Sets the metric attributes on the model for all the model outputs."""
     skip_target_indices = skip_target_indices or []
+    updated_per_output_metrics = []
+    updated_per_output_weighted_metrics = []
     for i in range(len(outputs)):
       if i in skip_target_indices:
+        updated_per_output_metrics.append(self._per_output_metrics[i])
+        updated_per_output_weighted_metrics.append(
+            self._per_output_weighted_metrics[i])
         continue
-      self._set_per_output_metric_attributes(self._per_output_metrics[i], i)
-      self._set_per_output_metric_attributes(
-          self._per_output_weighted_metrics[i], i)
+      updated_per_output_metrics.append(
+          self._set_per_output_metric_attributes(self._per_output_metrics[i],
+                                                 i))
+      updated_per_output_weighted_metrics.append(
+          self._set_per_output_metric_attributes(
+              self._per_output_weighted_metrics[i], i))
+
+    self._per_output_metrics = updated_per_output_metrics
+    self._per_output_weighted_metrics = updated_per_output_weighted_metrics
 
   def _handle_per_output_metrics(self,
                                  metrics_dict,
@@ -256,17 +313,17 @@ class Model(Network):
           weighted_metric_fn = training_utils.weighted_masked_objective(fn)
           return weighted_metric_fn(y_true, y_pred, weights=weights, mask=mask)
 
-        def _track_metric_tensors(stateless_result, stateful_result):
-          self.metrics_tensors.append(stateless_result)
-          self._stateful_metrics_tensors.append(stateful_result)
+        def _track_metric_tensors(name, stateless_result, stateful_result):
+          self._compile_metrics_tensors[name] = stateless_result
+          self._compile_stateful_metrics_tensors[name] = stateful_result
 
         if isinstance(metric_fn, metrics_module.Metric):
           # If the given metric fn is stateful, call the fn and return result.
           metric_result = _call_stateful_fn(metric_fn)
           metric_results.append(metric_result)
-          if not context.executing_eagerly():
-            _track_metric_tensors(metric_result, metric_result)
-        elif context.executing_eagerly():
+          if not self.run_eagerly:
+            _track_metric_tensors(metric_name, metric_result, metric_result)
+        elif self.run_eagerly:
           # In eager mode, if the given metric fn is not stateful, we invoke the
           # given fn or its stateful version based on the given flag.
           if return_stateful_result:
@@ -279,7 +336,8 @@ class Model(Network):
           # stateless fns.
           stateful_metric_result = _call_stateful_fn(stateful_fn)
           metric_result = _call_stateless_fn(metric_fn)
-          _track_metric_tensors(metric_result, stateful_metric_result)
+          _track_metric_tensors(metric_name, metric_result,
+                                stateful_metric_result)
 
     return metric_results
 
@@ -307,6 +365,7 @@ class Model(Network):
     skip_target_indices = skip_target_indices or []
     metric_results = []
     with K.name_scope('metrics'):
+      # Invoke all metrics added using `compile`.
       for i in range(len(outputs)):
         if i in skip_target_indices:
           continue
@@ -328,8 +387,48 @@ class Model(Network):
                 output_mask,
                 weights=sample_weights[i],
                 return_stateful_result=return_stateful_result))
+
+    # Add metric results from the `add_metric` metrics in eager mode.
+    if context.executing_eagerly():
+      for m in self.metrics:
+        if m not in self._compile_stateful_metric_functions:
+          metric_results.append(m.result())
     return metric_results
 
+  @property
+  def run_eagerly(self):
+    """Settable attribute indicating whether the model should run eagerly.
+
+    Running eagerly means that your model will be run step by step,
+    like Python code. Your model might run slower, but it should become easier
+    for you to debug it by stepping into individual layer calls.
+
+    By default, we will attempt to compile your model to a static graph to
+    deliver the best execution performance.
+
+    Returns:
+      Boolean, whether the model should run eagerly.
+    """
+    if self._run_eagerly is True and not context.executing_eagerly():
+      raise ValueError('You can only set `run_eagerly=True` if eager execution '
+                       'is enabled.')
+    if self._static_graph_friendly:
+      if self._run_eagerly is None:
+        return False
+      else:
+        return self._run_eagerly
+    else:
+      if self._run_eagerly is False:
+        # TODO(fchollet): consider using py_func to enable this.
+        raise ValueError('Your model contains layers that can only be '
+                         'successfully run in eager execution. '
+                         'You cannot set `run_eagerly=False`.')
+      return context.executing_eagerly()
+
+  @run_eagerly.setter
+  def run_eagerly(self, value):
+    self._run_eagerly = value
+
   @checkpointable.no_automatic_dependency_tracking
   def compile(self,
               optimizer,
@@ -391,9 +490,8 @@ class Model(Network):
         ValueError: In case of invalid arguments for
             `optimizer`, `loss`, `metrics` or `sample_weight_mode`.
     """
-    # The correct graph function may have changed,
-    # already-built ones must be updated
-    self._built_graph_functions = False
+    run_eagerly = kwargs.pop('run_eagerly', None)
+    self._run_eagerly = run_eagerly
 
     # Validate that arguments passed by the user to `compile` are supported by
     # DistributionStrategy.
@@ -403,9 +501,6 @@ class Model(Network):
         raise NotImplementedError(
             'optimizer must be an instance of '
             'tf.train.Optimizer, not a %s' % type(optimizer))
-      if context.executing_eagerly():
-        raise NotImplementedError('DistributionStrategy is not supported '
-                                  'when eager execution is enabled.')
       if sample_weight_mode:
         raise NotImplementedError('sample_weight_mode is not supported with '
                                   'DistributionStrategy.')
@@ -417,11 +512,12 @@ class Model(Network):
                          'DistributionStrategy.')
 
     loss = loss or {}
-    if context.executing_eagerly() and not isinstance(
+    if self.run_eagerly and not isinstance(
         optimizer, (tf_optimizer_module.Optimizer, optimizers.TFOptimizer)):
       raise ValueError(
-          'optimizer must be an instance of tf.train.Optimizer, not '
-          'a %s' % type(optimizer))
+          'When running a model in eager execution, the optimizer must be an '
+          'instance of tf.train.Optimizer. Received: '
+          '%s' % optimizer)
 
     self.optimizer = optimizers.get(optimizer)
     # We've disabled automatic dependency tracking for this method, but do want
@@ -430,12 +526,14 @@ class Model(Network):
       self._track_checkpointable(
           self.optimizer, name='optimizer', overwrite=True)
     self.loss = loss
-    self.metrics = metrics or []
+    self._compile_metrics = metrics or []
     self.loss_weights = loss_weights
     self.sample_weight_mode = sample_weight_mode
-    self.weighted_metrics = weighted_metrics
-    if context.executing_eagerly() and target_tensors is not None:
-      raise ValueError('target_tensors is not supported in Eager mode.')
+    self._compile_weighted_metrics = weighted_metrics
+    if self.run_eagerly and target_tensors is not None:
+      raise ValueError(
+          'target_tensors argument is not supported when '
+          'running a model eagerly.')
     self.target_tensors = target_tensors
 
     # Set DistributionStrategy specific parameters.
@@ -445,6 +543,8 @@ class Model(Network):
     if self._distribution_strategy is not None:
       distributed_training_utils.configure_and_create_session(
           self._distribution_strategy)
+    # Initialize model metric attributes.
+    self._init_metric_attributes()
     if not self.built:
       # Model is not compilable because it does not know its number of inputs
       # and outputs, nor their shapes and names. We will compile after the first
@@ -468,16 +568,16 @@ class Model(Network):
               '" missing from loss dictionary. We assume '
               'this was done on purpose. The fit and evaluate APIs will not be '
               'expecting any data to be passed to "' + name + '".')
-        loss_functions.append(losses.get(loss.get(name)))
+        loss_functions.append(training_utils.get_loss_function(loss.get(name)))
     elif isinstance(loss, list):
       if len(loss) != len(self.outputs):
         raise ValueError('When passing a list as loss, '
                          'it should have one entry per model outputs. '
                          'The model has ' + str(len(self.outputs)) +
                          ' outputs, but you passed loss=' + str(loss))
-      loss_functions = [losses.get(l) for l in loss]
+      loss_functions = [training_utils.get_loss_function(l) for l in loss]
     else:
-      loss_function = losses.get(loss)
+      loss_function = training_utils.get_loss_function(loss)
       loss_functions = [loss_function for _ in range(len(self.outputs))]
     self.loss_functions = loss_functions
 
@@ -493,7 +593,7 @@ class Model(Network):
         skip_target_weighing_indices.append(i)
 
     # Prepare output masks.
-    if not context.executing_eagerly():
+    if not self.run_eagerly:
       masks = [getattr(x, '_keras_mask', None) for x in self.outputs]
       if not isinstance(masks, list):
         masks = [masks]
@@ -524,11 +624,8 @@ class Model(Network):
                       str(loss_weights) + ' - expected a list of dicts.')
     self.loss_weights_list = loss_weights_list
 
-    # Initialize model metric attributes.
-    self._init_metric_attributes()
-
     # Initialization for Eager mode execution.
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # Prepare sample weights.
       self._set_sample_weight_attributes(sample_weight_mode,
                                          skip_target_weighing_indices)
@@ -541,7 +638,7 @@ class Model(Network):
       self.total_loss = None
       for i in range(len(self.outputs)):
         if len(self.outputs) > 1:
-          self.metrics_names.append(self.output_names[i] + '_loss')
+          self._compile_metrics_names.append(self.output_names[i] + '_loss')
 
       # Set metric attributes on model.
       self._set_metric_attributes(
@@ -555,145 +652,167 @@ class Model(Network):
       self._collected_trainable_weights = self.trainable_weights
       return
 
-    # Prepare targets of model.
-    self.targets = []
-    self._feed_targets = []
-    if target_tensors not in (None, []):
-      if isinstance(target_tensors, list):
-        if len(target_tensors) != len(self.outputs):
-          raise ValueError(
-              'When passing a list as `target_tensors`, '
-              'it should have one entry per model output. '
-              'The model has ' + str(len(self.outputs)) +
-              ' outputs, but you passed target_tensors=' + str(target_tensors))
-      elif isinstance(target_tensors, dict):
-        for name in target_tensors:
-          if name not in self.output_names:
+    with K.get_graph().as_default():
+      # Prepare targets of model.
+      self.targets = []
+      self._feed_targets = []
+      if target_tensors not in (None, []):
+        if isinstance(target_tensors, list):
+          if len(target_tensors) != len(self.outputs):
             raise ValueError(
-                'Unknown entry in `target_tensors` '
-                'dictionary: "' + name + '". '
-                'Only expected the following keys: ' + str(self.output_names))
-        tmp_target_tensors = []
-        for name in self.output_names:
-          tmp_target_tensors.append(target_tensors.get(name, None))
-        target_tensors = tmp_target_tensors
-      elif tensor_util.is_tensor(target_tensors):
-        target_tensors = [target_tensors]
-      else:
-        raise TypeError('Expected `target_tensors` to be a list or tuple or '
-                        'dict or a single tensor, but got:', target_tensors)
-
-    for i in range(len(self.outputs)):
-      if i in skip_target_indices:
-        self.targets.append(None)
-      else:
-        shape = K.int_shape(self.outputs[i])
-        name = self.output_names[i]
-        if target_tensors not in (None, []):
-          target = target_tensors[i]
-        else:
-          target = None
-        if target is None or K.is_placeholder(target):
-          if target is None:
-            target = K.placeholder(
-                ndim=len(shape),
-                name=name + '_target',
-                sparse=K.is_sparse(self.outputs[i]),
-                dtype=K.dtype(self.outputs[i]))
-          self._feed_targets.append(target)
-          self._feed_outputs.append(self.outputs[i])
-          self._feed_output_names.append(name)
-          self._feed_output_shapes.append(shape)
-          self._feed_loss_fns.append(self.loss_functions[i])
+                'When passing a list as `target_tensors`, '
+                'it should have one entry per model output. '
+                'The model has %s outputs, but you passed target_tensors=%s' %
+                (len(self.outputs), target_tensors))
+        elif isinstance(target_tensors, dict):
+          for name in target_tensors:
+            if name not in self.output_names:
+              raise ValueError(
+                  'Unknown entry in `target_tensors` '
+                  'dictionary: "' + name + '". '
+                  'Only expected the following keys: ' + str(self.output_names))
+          tmp_target_tensors = []
+          for name in self.output_names:
+            tmp_target_tensors.append(target_tensors.get(name, None))
+          target_tensors = tmp_target_tensors
+        elif tensor_util.is_tensor(target_tensors):
+          target_tensors = [target_tensors]
         else:
-          skip_target_weighing_indices.append(i)
-        self.targets.append(target)
-
-    # Prepare sample weights.
-    self._set_sample_weight_attributes(sample_weight_mode,
-                                       skip_target_weighing_indices)
-    # Save all metric attributes per output of the model.
-    self._cache_output_metric_attributes(metrics, weighted_metrics)
-
-    # Compute total loss.
-    total_loss = None
-    with K.name_scope('loss'):
+          raise TypeError('Expected `target_tensors` to be a list or tuple or '
+                          'dict or a single tensor, but got:', target_tensors)
+
       for i in range(len(self.outputs)):
         if i in skip_target_indices:
-          continue
-        y_true = self.targets[i]
-        y_pred = self.outputs[i]
-        loss_fn = loss_functions[i]
-        sample_weight = self.sample_weights[i]
-        mask = masks[i]
-        loss_weight = loss_weights_list[i]
-        with K.name_scope(self.output_names[i] + '_loss'):
-          weighted_loss = training_utils.weighted_masked_objective(loss_fn)
-          output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+          self.targets.append(None)
+        else:
+          shape = K.int_shape(self.outputs[i])
+          name = self.output_names[i]
+          if target_tensors not in (None, []):
+            target = target_tensors[i]
+          else:
+            target = None
+          if target is None or K.is_placeholder(target):
+            if target is None:
+              target_dtype = losses.LABEL_DTYPES_FOR_LOSSES.get(
+                  self.loss_functions[i],
+                  K.dtype(self.outputs[i]))
+
+              target = K.placeholder(
+                  ndim=len(shape),
+                  name=name + '_target',
+                  sparse=K.is_sparse(self.outputs[i]),
+                  dtype=target_dtype)
+            self._feed_targets.append(target)
+            self._feed_outputs.append(self.outputs[i])
+            self._feed_output_names.append(name)
+            self._feed_output_shapes.append(shape)
+            self._feed_loss_fns.append(self.loss_functions[i])
+          else:
+            skip_target_weighing_indices.append(i)
+          self.targets.append(target)
 
-        if len(self.outputs) > 1:
-          # Keep track of the un-aggregated loss result tensor.
-          self.metrics_tensors.append(output_loss)
-
-          # Keep track of stateful result tensor and function for the loss.
-          mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-              loss_fn, name=loss_fn.__name__)
-          result_tensor = training_utils.call_metric_function(
-              mean_wrapped_loss,
-              y_true,
-              y_pred,
-              weights=sample_weight,
-              mask=mask)
-          self._stateful_metrics_tensors.append(result_tensor)
-          self.stateful_metric_functions.append(mean_wrapped_loss)
-
-          self.metrics_names.append(self.output_names[i] + '_loss')
+      # Prepare sample weights.
+      self._set_sample_weight_attributes(sample_weight_mode,
+                                         skip_target_weighing_indices)
+      # Save all metric attributes per output of the model.
+      self._cache_output_metric_attributes(metrics, weighted_metrics)
+
+      # Compute total loss.
+      total_loss = None
+      with K.name_scope('loss'):
+        for i in range(len(self.outputs)):
+          if i in skip_target_indices:
+            continue
+          y_true = self.targets[i]
+          y_pred = self.outputs[i]
+          loss_fn = loss_functions[i]
+          sample_weight = self.sample_weights[i]
+          mask = masks[i]
+          loss_weight = loss_weights_list[i]
+          with K.name_scope(self.output_names[i] + '_loss'):
+            if isinstance(loss_fn, losses.Loss):
+              if mask is not None:
+                mask = math_ops.cast(mask, y_pred.dtype)
+                # Update weights with mask.
+                if sample_weight is None:
+                  sample_weight = mask
+                else:
+                  # Update dimensions of weights to match with mask if possible.
+                  mask, _, sample_weight = squeeze_or_expand_dimensions(
+                      mask, None, sample_weight)
+                  sample_weight *= mask
+              output_loss = loss_fn(y_true, y_pred, sample_weight=sample_weight)
+            else:
+              weighted_loss = training_utils.weighted_masked_objective(loss_fn)
+              output_loss = weighted_loss(y_true, y_pred, sample_weight, mask)
+
+          if len(self.outputs) > 1:
+            # Keep track of the un-aggregated loss result tensor.
+            self._compile_metrics_tensors[self.output_names[i] +
+                                          '_loss'] = output_loss
+
+            # Keep track of stateful result tensor and function for the loss.
+            loss_name = loss_fn.name if isinstance(
+                loss_fn, losses.Loss) else loss_fn.__name__
+            mean_wrapped_loss = metrics_module.MeanMetricWrapper(
+                loss_fn, name=loss_name)
+            result_tensor = training_utils.call_metric_function(
+                mean_wrapped_loss,
+                y_true,
+                y_pred,
+                weights=sample_weight,
+                mask=mask)
+            self._compile_stateful_metrics_tensors[self.output_names[i] +
+                                                   '_loss'] = result_tensor
+            self._compile_stateful_metric_functions.append(mean_wrapped_loss)
+
+            self._compile_metrics_names.append(self.output_names[i] + '_loss')
+          if total_loss is None:
+            total_loss = loss_weight * output_loss
+          else:
+            total_loss += loss_weight * output_loss
         if total_loss is None:
-          total_loss = loss_weight * output_loss
-        else:
-          total_loss += loss_weight * output_loss
-      if total_loss is None:
-        if not self.losses:
-          raise ValueError('The model cannot be compiled '
-                           'because it has no loss to optimize.')
-        else:
-          total_loss = 0.
-
-      # Add regularization penalties
-      # and other layer-specific losses.
-      for loss_tensor in self.losses:
-        total_loss += loss_tensor
-
-    # Set metric attributes on model.
-    self._set_metric_attributes(
-        self.outputs,
-        skip_target_indices=skip_target_indices,
-    )
-    # Invoke metric functions for all the outputs.
-    self._handle_metrics(
-        self.outputs,
-        masks=masks,
-        targets=self.targets,
-        skip_target_indices=skip_target_indices,
-        sample_weights=self.sample_weights)
-
-    # Prepare gradient updates and state updates.
-    self.total_loss = total_loss
-
-    # Functions for train, test and predict will
-    # be compiled lazily when required.
-    # This saves time when the user is not using all functions.
-    self._function_kwargs = kwargs
-
-    self._fit_function = None
-    self._eval_function = None
-    self.train_function = None
-    self.test_function = None
-    self.predict_function = None
-
-    # Collected trainable weights, sorted in topological order.
-    trainable_weights = self.trainable_weights
-    self._collected_trainable_weights = trainable_weights
+          if not self.losses:
+            raise ValueError('The model cannot be compiled '
+                             'because it has no loss to optimize.')
+          else:
+            total_loss = 0.
+
+        # Add regularization penalties
+        # and other layer-specific losses.
+        for loss_tensor in self.losses:
+          total_loss += loss_tensor
+
+      # Set metric attributes on model.
+      self._set_metric_attributes(
+          self.outputs,
+          skip_target_indices=skip_target_indices,
+      )
+      # Invoke metric functions for all the outputs.
+      self._handle_metrics(
+          self.outputs,
+          masks=masks,
+          targets=self.targets,
+          skip_target_indices=skip_target_indices,
+          sample_weights=self.sample_weights)
+
+      # Prepare gradient updates and state updates.
+      self.total_loss = total_loss
+
+      # Functions for train, test and predict will
+      # be compiled lazily when required.
+      # This saves time when the user is not using all functions.
+      self._function_kwargs = kwargs
+
+      self._fit_function = None
+      self._eval_function = None
+      self.train_function = None
+      self.test_function = None
+      self.predict_function = None
+
+      # Collected trainable weights, sorted in topological order.
+      trainable_weights = self.trainable_weights
+      self._collected_trainable_weights = trainable_weights
 
   def _check_trainable_weights_consistency(self):
     """Check trainable weights count consistency.
@@ -721,21 +840,24 @@ class Model(Network):
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        inputs += [K.symbolic_learning_phase()]
+
+      with K.get_graph().as_default():
+        with K.name_scope('training'):
+          with K.name_scope(self.optimizer.__class__.__name__):
+            # Training updates
+            updates = self.optimizer.get_updates(
+                params=self._collected_trainable_weights, loss=self.total_loss)
+      # Unconditional updates
+      updates += self.get_updates_for(None)
+      # Conditional updates relevant to this model
+      updates += self.get_updates_for(self.inputs)
+      # Add stateful metrics updates.
+      if metric_updates is not None:
+        updates += metric_updates
 
       with K.name_scope('training'):
-        with K.name_scope(self.optimizer.__class__.__name__):
-          # Training updates
-          updates = self.optimizer.get_updates(
-              params=self._collected_trainable_weights, loss=self.total_loss)
-        # Unconditional updates
-        updates += self.get_updates_for(None)
-        # Conditional updates relevant to this model
-        updates += self.get_updates_for(self.inputs)
-        # Add stateful metrics updates.
-        if metric_updates is not None:
-          updates += metric_updates
         # Gets loss and metrics. Updates weights at each call.
         fn = K.function(
             inputs,
@@ -746,18 +868,18 @@ class Model(Network):
         setattr(self, fn_name, fn)
 
   def _make_train_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
     self._make_train_function_helper('train_function',
-                                     [self.total_loss] + self.metrics_tensors)
+                                     [self.total_loss] + metrics_tensors)
 
   def _make_fit_function(self):
-    # TODO(psv/anjalisridhar): Remove updates after we fix b/118841692
-    # Stateful metrics updates
-    metric_updates = []
-    for m in self.stateful_metric_functions:
-      metric_updates += m.updates
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
     self._make_train_function_helper(
-        '_fit_function', [self.total_loss] + self._stateful_metrics_tensors,
-        metric_updates)
+        '_fit_function', [self.total_loss] + metrics_tensors)
 
   def _make_test_function_helper(self, fn_name, outputs, metric_updates=None):
     if not hasattr(self, fn_name):
@@ -766,49 +888,53 @@ class Model(Network):
       inputs = (self._feed_inputs +
                 self._feed_targets +
                 self._feed_sample_weights)
-      if not isinstance(K.learning_phase(), int):
-        inputs += [K.learning_phase()]
-      updates = self.state_updates
-      # Add stateful metrics updates.
-      if metric_updates is not None:
-        updates += metric_updates
-      # Return loss and metrics, no gradient updates.
-      # Does update the network states.
-      fn = K.function(
-          inputs,
-          outputs,
-          updates=updates,
-          name='test_function',
-          **self._function_kwargs)
-      setattr(self, fn_name, fn)
+
+      with K.name_scope('evaluation'):
+        updates = self.state_updates
+        # Add stateful metrics updates.
+        if metric_updates is not None:
+          updates += metric_updates
+        # Return loss and metrics, no gradient updates.
+        # Does update the network states.
+        fn = K.function(
+            inputs,
+            outputs,
+            updates=updates,
+            name='test_function',
+            **self._function_kwargs)
+        setattr(self, fn_name, fn)
 
   def _make_test_function(self):
+    metrics_tensors = [
+        self._all_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
     self._make_test_function_helper('test_function',
-                                    [self.total_loss] + self.metrics_tensors)
+                                    [self.total_loss] + metrics_tensors)
 
   def _make_eval_function(self):
+    metrics_tensors = [
+        self._all_stateful_metrics_tensors[m] for m in self.metrics_names[1:]
+    ]
     self._make_test_function_helper(
-        '_eval_function', [self.total_loss] + self._stateful_metrics_tensors)
+        '_eval_function', [self.total_loss] + metrics_tensors)
 
   def _make_predict_function(self):
     if not hasattr(self, 'predict_function'):
       self.predict_function = None
     if self.predict_function is None:
-      if not isinstance(K.learning_phase(), int):
-        inputs = self._feed_inputs + [K.learning_phase()]
-      else:
-        inputs = self._feed_inputs
+      inputs = self._feed_inputs
       # Gets network outputs. Does not update weights.
       # Does update the network states.
       kwargs = getattr(self, '_function_kwargs', {})
-      self.predict_function = K.function(
-          inputs,
-          self.outputs,
-          updates=self.state_updates,
-          name='predict_function',
-          **kwargs)
+      with K.name_scope('predict'):
+        self.predict_function = K.function(
+            inputs,
+            self.outputs,
+            updates=self.state_updates,
+            name='predict_function',
+            **kwargs)
 
-  def _get_execution_function(self, mode):
+  def _make_execution_function(self, mode):
     if mode == 'train':
       self._make_fit_function()
       return self._fit_function
@@ -873,7 +999,8 @@ class Model(Network):
                                 'when using DistributionStrategy.')
 
     if (sample_weight is not None and sample_weight.all() and
-        self._distribution_strategy.__class__.__name__ == 'TPUStrategy'):
+        distributed_training_utils.is_tpu_strategy(
+            self._distribution_strategy)):
       raise NotImplementedError('`sample_weight` is currently not supported '
                                 'when using TPUStrategy.')
 
@@ -882,18 +1009,13 @@ class Model(Network):
     # TODO(anjalisridhar): Remove this check once we refactor the
     # _standardize_user_data code path. This check is already present elsewhere
     # in the codebase.
-    if check_steps and isinstance(x, dataset_ops.Dataset) and steps is None:
+    if check_steps and isinstance(x, dataset_ops.DatasetV2) and steps is None:
       raise ValueError('When using Datasets as input, '
                        'you should specify the `{steps_name}` argument.'
                        .format(steps_name=steps_name))
 
     first_x_value = nest.flatten(x)[0]
     if isinstance(first_x_value, np.ndarray):
-      assert steps is not None
-      x_shape = first_x_value.shape
-      if batch_size is None:
-        batch_size = distributed_training_utils.get_batch_size(
-            self._distribution_strategy.num_replicas_in_sync, x_shape[0], steps)
       # We need to use the drop_remainder argument to allow for a static
       # input shape which is required for TPUs.
       drop_remainder = self._distribution_strategy.require_static_shapes
@@ -928,19 +1050,15 @@ class Model(Network):
         var_x = distributed_training_utils.get_var_for_numpy(
             self._distribution_strategy, x)
         x = dataset_ops.Dataset.from_tensor_slices(var_x)
-        x = x.repeat()
         x = x.batch(batch_size, drop_remainder=drop_remainder)
 
-    assert isinstance(x, dataset_ops.Dataset)
+    assert isinstance(x, dataset_ops.DatasetV2)
 
-    # TODO(anjalisridhar): We want distribute_dataset() to accept a Dataset or a
-    # function which returns a Dataset. Currently distribute_dataset() only
-    # accepts a function that returns a Dataset. Once we add support for being
-    # able to clone a Dataset on multiple workers we can remove this lambda.
-    result = self._distribution_strategy.distribute_dataset(lambda: x)
-    iterator = result.make_initializable_iterator()
     with self._distribution_strategy.scope():
-      K.get_session().run(iterator.initializer)
+      iterator = self._distribution_strategy.make_dataset_iterator(x)
+      init_op = iterator.initialize()
+      if not context.executing_eagerly():
+        K.get_session().run(init_op)
 
     training_utils.validate_iterator_input(x, y, sample_weight,
                                            validation_split)
@@ -1025,14 +1143,14 @@ class Model(Network):
           shuffle=shuffle)
       return iterator, None, None
 
-    if isinstance(x, dataset_ops.Dataset):
+    if isinstance(x, dataset_ops.DatasetV2):
       if context.executing_eagerly():
-        x = x.make_one_shot_iterator()
+        x = iter(x)
       else:
         if x in self._dataset_iterator_cache:
           x = self._dataset_iterator_cache[x]
         else:
-          iterator = x.make_initializable_iterator()
+          iterator = dataset_ops.make_initializable_iterator(x)
           self._dataset_iterator_cache[x] = iterator
           x = iterator
         K.get_session().run(x.initializer)
@@ -1052,7 +1170,7 @@ class Model(Network):
     # For eager iterators, when we have to process multiple batches of samples,
     # we will standardize the data when we actually loop over iterator and get
     # the batches. For now, we just return the iterator as is.
-    if is_x_eager_iterator and steps is not None:
+    if is_x_eager_iterator:
       return x, y, sample_weight
 
     # If input data is a dataset iterator in graph mode or if it is an eager
@@ -1081,12 +1199,57 @@ class Model(Network):
           x, y, sample_weight = next_element
       else:
         x = next_element
-    x, y, sample_weights = self._standardize_weights(x, y, sample_weight,
-                                                     class_weight, batch_size)
+    x, y, sample_weights = self._standardize_weights(
+        x, y, sample_weight, class_weight, batch_size, is_x_iterator)
     return x, y, sample_weights
 
-  def _standardize_weights(self, x, y, sample_weight=None, class_weight=None,
-                           batch_size=None,):
+  def _standardize_weights(self,
+                           x,
+                           y,
+                           sample_weight=None,
+                           class_weight=None,
+                           batch_size=None,
+                           from_iterator=False):
+    """Standardize input data, target data, and weight values.
+
+    This method reformats all data passed to the model to an ordered list of
+    array/tensors, matching the order expected by the model. This also validates
+    the input and target data shapes.
+
+    Args:
+      x: Input data. It could be:
+        - A Numpy array (or array-like), or a list of arrays
+          (in case the model has multiple inputs).
+        - A TensorFlow tensor, or a list of tensors
+          (in case the model has multiple inputs).
+        - A dict mapping input names to the corresponding array/tensors,
+          if the model has named inputs.
+        x cannot not be an iterator.
+      y: Target data. Like the input data `x`,
+        it could be either Numpy array(s) or TensorFlow tensor(s).
+        It should be consistent with `x` (you cannot have Numpy inputs and
+        tensor targets, or inversely).
+      sample_weight: An optional sample-weight array passed by the user to
+        weight the importance of each sample in `x`.
+      class_weight: An optional class-weight array by the user to
+        weight the importance of samples in `x` based on the class they belong
+        to, as conveyed by `y`.
+      batch_size: Integer batch size. If provided, it is used to run additional
+        validation checks on stateful models.
+      from_iterator: Whether x and y were obtained from an iterator.
+
+    Returns:
+      Tuple of standardized data that will be fed to the model:
+        (input data, target data, sample weights)
+
+    Raises:
+      RuntimeError: If target data is provided, but the model has not yet been
+        compiled.
+      ValueError: If the input data, target data, and batch size have invalid
+        shapes or formats (e.g. the model expects input to be a list of three
+        tensors, but x is a list with two tensors). Error is also raised if the
+        input and target data are not both arrays or tensors.
+    """
     # TODO(sourabhbajaj): Split input validation from weight standardization.
     if sample_weight is not None and class_weight is not None:
       logging.warning(
@@ -1096,6 +1259,8 @@ class Model(Network):
     all_inputs = []
     is_build_called = False
     is_compile_called = False
+    # Whether this is a subclassed model that expects dictionary inputs
+    # rather than list inputs (e.g. FeatureColumn-based models).
     dict_inputs = False
     if not self.inputs:
       # We need to use `x` to set the model inputs.
@@ -1118,13 +1283,23 @@ class Model(Network):
         all_inputs.append(x)
 
       # Build the model using the retrieved inputs (value or symbolic).
-      # If values, then in symbolic-mode placeholders will be created
-      # to match the value shapes.
+      # If values or generated from a dataset, then in symbolic-mode
+      # placeholders will be created to match the value shapes.
       if not self.inputs:
         is_build_called = True
-        self._set_inputs(x)
+        if from_iterator:
+          cast_inputs = nest.map_structure(lambda v: v.shape, x)
+        elif training_utils.has_tensors(x):
+          cast_inputs = training_utils.cast_if_floating_dtype(x)
+        else:
+          cast_inputs = x
+        self._set_inputs(cast_inputs)
     else:
       dict_inputs = isinstance(self.inputs, dict)
+    if dict_inputs and context.executing_eagerly():
+      # No support for graph functions when the model expects dictionary inputs
+      # (i.e. FeatureColumn-based models).
+      self.run_eagerly = True
 
     if y is not None:
       if not self.optimizer:
@@ -1134,6 +1309,8 @@ class Model(Network):
       if not self._is_compiled:
         # On-the-fly compilation of the model.
         # We need to use `y` to set the model targets.
+        if training_utils.has_tensors(y):
+          y = training_utils.cast_if_floating_dtype(y)
         if isinstance(y, (list, tuple)):
           if not all(isinstance(v, np.ndarray) or
                      tensor_util.is_tensor(v) for v in y):
@@ -1158,19 +1335,22 @@ class Model(Network):
                              'TensorFlow tensors. '
                              'You passed: x=' + str(x) + '; y=' + str(y))
 
-        if context.executing_eagerly():
+        if self.run_eagerly or from_iterator:
           target_tensors = None
         else:
           # Handle target tensors if any passed.
           if not isinstance(y, (list, tuple)):
             y = [y]
-          target_tensors = [v for v in y if tensor_util.is_tensor(v)]
+          target_tensors = [v for v in y if _is_symbolic_tensor(v)]
         is_compile_called = True
-        self.compile(optimizer=self.optimizer,
-                     loss=self.loss,
-                     metrics=self.metrics,
-                     loss_weights=self.loss_weights,
-                     target_tensors=target_tensors)
+        self.compile(
+            optimizer=self.optimizer,
+            loss=self.loss,
+            metrics=self._compile_metrics,
+            weighted_metrics=self._compile_weighted_metrics,
+            loss_weights=self.loss_weights,
+            target_tensors=target_tensors,
+            run_eagerly=self.run_eagerly)
 
     # In graph mode, if we had just set inputs and targets as symbolic tensors
     # by invoking build and compile on the model respectively, we do not have to
@@ -1178,15 +1358,14 @@ class Model(Network):
     # part of the graph.
     # Note: in this case, `any` and `all` are equivalent since we disallow
     # mixed symbolic/value inputs.
-    if (not context.executing_eagerly() and is_build_called and
-        is_compile_called and
-        any(tensor_util.is_tensor(v) for v in all_inputs)):
+    if (not self.run_eagerly and is_build_called and is_compile_called and
+        not from_iterator and any(_is_symbolic_tensor(v) for v in all_inputs)):
       return [], [], []
 
     # What follows is input validation and standardization to list format,
     # in the case where all inputs are value arrays.
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       # In eager mode, do not do shape validation
       # since the network has no input nodes (placeholders) to be fed.
       feed_input_names = self.input_names
@@ -1242,7 +1421,9 @@ class Model(Network):
       y = training_utils.standardize_input_data(
           y,
           feed_output_names,
-          feed_output_shapes,
+          # Don't enforce target shapes to match output shapes.
+          # Precise checks will be run in `check_loss_and_target_compatibility`.
+          shapes=None,
           check_batch_axis=False,  # Don't enforce the batch size.
           exception_prefix='target')
 
@@ -1260,7 +1441,7 @@ class Model(Network):
       # Check that all arrays have the same length.
       if not self._distribution_strategy:
         training_utils.check_array_lengths(x, y, sample_weights)
-        if self._is_graph_network and not context.executing_eagerly():
+        if self._is_graph_network and not self.run_eagerly:
           # Additional checks to avoid users mistakenly using improper loss fns.
           training_utils.check_loss_and_target_compatibility(
               y, self._feed_loss_fns, feed_output_shapes)
@@ -1292,12 +1473,12 @@ class Model(Network):
 
     Args:
       inputs: Single array, or list of arrays. The arrays could be placeholders,
-        Numpy arrays, or data tensors.
+        Numpy arrays, data tensors, or TensorShapes.
         - if placeholders: the model is built on top of these placeholders,
           and we expect Numpy data to be fed for them when calling `fit`/etc.
-        - if Numpy data: we create placeholders matching the shape of the Numpy
-          arrays. We expect Numpy data to be fed for these placeholders
-          when calling `fit`/etc.
+        - if Numpy data or TensorShapes: we create placeholders matching the
+          TensorShapes or shapes of the Numpy arrays. We expect Numpy data to be
+          fed for these placeholders when calling `fit`/etc.
         - if data tensors: the model is built on top of these tensors.
           We do not expect any Numpy data to be provided when calling `fit`/etc.
       outputs: None, a data tensor, or a list of tensors. If None, the
@@ -1315,8 +1496,9 @@ class Model(Network):
 
     if self.__class__.__name__ == 'Sequential' and not self.built:
       if tensor_util.is_tensor(inputs):
-        input_shape = (None,) + tuple(inputs.get_shape().as_list()[1:])
-        self.build(input_shape=input_shape)
+        input_shape = (None,) + tuple(inputs.shape.as_list()[1:])
+      elif isinstance(inputs, tensor_shape.TensorShape):
+        input_shape = (None,) + tuple(inputs.as_list()[1:])
       elif isinstance(inputs, dict):
         # We assert that the first layer is a FeatureLayer.
         if not training_utils.is_feature_layer(self.layers[0]):
@@ -1324,10 +1506,9 @@ class Model(Network):
                            'which doesn\'t have FeatureLayer as the first layer'
                            ' is an error.')
         input_shape = (None,)
-        self.build(input_shape=input_shape)
       else:
-        input_shape = (None,) + inputs.shape[1:]
-        self.build(input_shape=input_shape)
+        input_shape = (None,) + tuple(inputs.shape[1:])
+      self._build_input_shape = input_shape
 
     # On-the-fly setting of symbolic model inputs (either by using the tensor
     # provided, or by creating a placeholder if Numpy data was provided).
@@ -1346,10 +1527,11 @@ class Model(Network):
         self._feed_input_names.append(k)
         self._feed_input_shapes.append(K.int_shape(v))
 
+    # TODO(fchollet): consider calling `_maybe_build` before calling the model.
+
     if outputs is None:
       # Obtain symbolic outputs by calling the model.
-      graph = K.get_graph()
-      with graph.as_default():
+      with K.get_graph().as_default():
         if self._expects_training_arg:
           outputs = self.call(inputs, training=training)
         else:
@@ -1509,7 +1691,6 @@ class Model(Network):
     """
     # TODO(fchollet): this method may be creating reference cycles, which would
     # lead to accumulating garbage in memory when called in a loop. Investigate.
-
     if data_utils.is_generator_or_sequence(x):
       training_utils.check_generator_arguments(y, sample_weight)
       return self.fit_generator(
@@ -1527,9 +1708,6 @@ class Model(Network):
           shuffle=shuffle,
           initial_epoch=initial_epoch)
 
-    # Backwards compatibility
-    if batch_size is None and steps_per_epoch is None:
-      batch_size = 32
     # Legacy support
     if 'nb_epoch' in kwargs:
       logging.warning(
@@ -1541,15 +1719,21 @@ class Model(Network):
 
     # Validate and standardize user data.
     if self._distribution_strategy:
-      distributed_training_utils.validate_callbacks(callbacks)
+      distributed_training_utils.validate_callbacks(callbacks, self.optimizer,
+                                                    self._distribution_strategy)
 
       distributed_training_utils.validate_inputs(
           x, y, self._distribution_strategy)
 
       first_x_value = nest.flatten(x)[0]
-      if not steps_per_epoch and isinstance(first_x_value, np.ndarray):
-        steps_per_epoch = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps_per_epoch, batch_size = (
+            distributed_training_utils.get_input_params(
+                self._distribution_strategy, first_x_value, steps_per_epoch,
+                batch_size, is_training=True))
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps_per_epoch,
+                                                    x)
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1567,7 +1751,7 @@ class Model(Network):
     if validation_data:
       if (isinstance(validation_data, iterator_ops.Iterator) or
           isinstance(validation_data, iterator_ops.EagerIterator) or
-          isinstance(validation_data, dataset_ops.Dataset)):
+          isinstance(validation_data, dataset_ops.DatasetV2)):
         val_x = validation_data
         val_y = None
         val_sample_weight = None
@@ -1590,9 +1774,10 @@ class Model(Network):
         distributed_training_utils.validate_inputs(
             val_x, val_y, self._distribution_strategy)
         first_valx_value = nest.flatten(val_x)[0]
-        if not validation_steps and isinstance(first_valx_value, np.ndarray):
-          validation_steps = distributed_training_utils.get_input_batch_params(
-              first_valx_value, batch_size, self._distribution_strategy)
+        if isinstance(first_valx_value, np.ndarray):
+          validation_steps, _ = distributed_training_utils.get_input_params(
+              self._distribution_strategy, first_valx_value, validation_steps,
+              batch_size)
 
       val_x, val_y, val_sample_weights = self._standardize_user_data(
           val_x,
@@ -1622,27 +1807,25 @@ class Model(Network):
       val_y = None
       val_sample_weights = None
 
-    if context.executing_eagerly():
-      return training_eager.fit_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
-          class_weight=class_weight,
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.fit_generator(
+          self, (x, y, sample_weights),
+          steps_per_epoch=steps_per_epoch,
           batch_size=batch_size,
           epochs=epochs,
+          shuffle=shuffle,
           verbose=verbose,
           callbacks=callbacks,
-          val_inputs=val_x,
-          val_targets=val_y,
-          val_sample_weights=val_sample_weights,
-          shuffle=shuffle,
-          initial_epoch=initial_epoch,
-          steps_per_epoch=steps_per_epoch,
-          validation_steps=validation_steps)
-    elif self._distribution_strategy:
-      return training_distributed.fit_loop(
-          self, x,
+          validation_data=validation_data,
+          validation_steps=validation_steps,
+          workers=0,
+          initial_epoch=initial_epoch)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_fit_loop(
+          self,
+          x,
           epochs=epochs,
           verbose=verbose,
           callbacks=callbacks,
@@ -1757,19 +1940,16 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     # Validate and standardize user data.
     if self._distribution_strategy:
       distributed_training_utils.validate_inputs(
           x, y, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
+
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
     x, y, sample_weights = self._standardize_user_data(
         x,
@@ -1780,21 +1960,18 @@ class Model(Network):
         steps_name='steps',
         steps=steps)
 
-    if context.executing_eagerly():
-      return training_eager.test_loop(
-          self,
-          inputs=x,
-          targets=y,
-          sample_weights=sample_weights,
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.evaluate_generator(
+          self, (x, y, sample_weights),
+          steps=steps,
           batch_size=batch_size,
           verbose=verbose,
-          steps=steps)
-    elif self._distribution_strategy:
-      return training_distributed.test_loop(
-          self,
-          iterator=x,
-          verbose=verbose,
-          steps=steps)
+          workers=0)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_test_loop(
+          self, iterator=x, verbose=verbose, steps=steps)
     else:
       return training_arrays.test_loop(
           self,
@@ -1868,37 +2045,59 @@ class Model(Network):
           max_queue_size=max_queue_size,
           workers=workers,
           use_multiprocessing=use_multiprocessing)
-
-    # Backwards compatibility.
-    if batch_size is None and steps is None:
-      batch_size = 32
-
     if self._distribution_strategy:
       distributed_training_utils.validate_inputs(
           x, None, self._distribution_strategy)
       first_x_value = nest.flatten(x)[0]
-      if isinstance(first_x_value, np.ndarray) and not steps:
-        steps = distributed_training_utils.get_input_batch_params(
-            first_x_value, batch_size, self._distribution_strategy)
+      if isinstance(first_x_value, np.ndarray):
+        steps, batch_size = distributed_training_utils.get_input_params(
+            self._distribution_strategy, first_x_value, steps, batch_size)
 
-    # Validate and standardize user data.
-    # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
-    # means that we end up calculating it twice which we should avoid.
-    x, _, _ = self._standardize_user_data(
-        x, check_steps=True, steps_name='steps', steps=steps)
+    batch_size = self._validate_or_infer_batch_size(batch_size, steps, x)
 
-    if context.executing_eagerly():
-      return training_eager.predict_loop(
-          self, x, batch_size=batch_size, verbose=verbose, steps=steps)
-    elif self._distribution_strategy:
-      results = training_distributed.predict_loop(
+    # Validate and standardize user data.
+    if self._distribution_strategy:
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps,
+          batch_size=batch_size)
+    else:
+      # TODO(anjalisridhar): We don't pass batch_size here for some reason. This
+      # means we need to special case distribution strategy which needs the
+      # batch size.
+      x, _, _ = self._standardize_user_data(
+          x, check_steps=True, steps_name='steps', steps=steps)
+
+    if (self.run_eagerly or (isinstance(x, iterator_ops.EagerIterator) and
+                             not self._distribution_strategy)):
+      return training_generator.predict_generator(
+          self,
+          x,
+          steps=steps,
+          batch_size=batch_size,
+          verbose=verbose,
+          workers=0)
+    elif distributed_training_utils.is_tpu_strategy(
+        self._distribution_strategy):
+      return training_distributed.experimental_predict_loop(
           self, x, verbose=verbose, steps=steps)
-      return results
     else:
       return training_arrays.predict_loop(
           self, x, batch_size=batch_size, verbose=verbose, steps=steps)
 
-  def train_on_batch(self, x, y=None, sample_weight=None, class_weight=None):
+  def reset_metrics(self):
+    """Resets the state of metrics."""
+    if hasattr(self, 'metrics'):
+      for m in self.metrics:
+        m.reset_states()
+      if self._distribution_strategy:
+        training_distributed._reset_metrics(self)  # pylint: disable=protected-access
+
+  def train_on_batch(self,
+                     x,
+                     y=None,
+                     sample_weight=None,
+                     class_weight=None,
+                     reset_metrics=True):
     """Runs a single gradient update on a single batch of data.
 
     Arguments:
@@ -1926,6 +2125,9 @@ class Model(Network):
           weight (float) to apply to the model's loss for the samples from this
           class during training. This can be useful to tell the model to "pay
           more attention" to samples from an under-represented class.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
     Returns:
         Scalar training loss
@@ -1944,23 +2146,30 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight, class_weight=class_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.train_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [1]
+      if not isinstance(K.symbolic_learning_phase(), int):
+        ins = x + y + sample_weights + [True]
       else:
         ins = x + y + sample_weights
 
-      self._make_train_function()
-      outputs = self.train_function(ins)  # pylint: disable=not-callable
+      if reset_metrics:
+        self._make_train_function()
+        outputs = self.train_function(ins)  # pylint: disable=not-callable
+      else:
+        self._make_fit_function()
+        outputs = self._fit_function(ins)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
 
     if len(outputs) == 1:
       return outputs[0]
     return outputs
 
-  def test_on_batch(self, x, y=None, sample_weight=None):
+  def test_on_batch(self, x, y=None, sample_weight=None, reset_metrics=True):
     """Test the model on a single batch of samples.
 
     Arguments:
@@ -1986,6 +2195,9 @@ class Model(Network):
             In this case you should make sure to specify
             sample_weight_mode="temporal" in compile(). This argument is not
             supported when `x` is a dataset or a dataset iterator.
+        reset_metrics: If `True`, the metrics returned will be only for this
+          batch. If `False`, the metrics will be statefully accumulated across
+          batches.
 
     Returns:
         Scalar test loss (if the model has a single output and no metrics)
@@ -2003,16 +2215,20 @@ class Model(Network):
     x, y, sample_weights = self._standardize_user_data(
         x, y, sample_weight=sample_weight)
 
-    if context.executing_eagerly():
+    if self.run_eagerly:
       outputs = training_eager.test_on_batch(
           self, x, y, sample_weights=sample_weights)
     else:
-      if not isinstance(K.learning_phase(), int):
-        ins = x + y + sample_weights + [0]
+      inputs = x + y + sample_weights
+      if reset_metrics:
+        self._make_test_function()
+        outputs = self.test_function(inputs)  # pylint: disable=not-callable
       else:
-        ins = x + y + sample_weights
-      self._make_test_function()
-      outputs = self.test_function(ins)  # pylint: disable=not-callable
+        self._make_eval_function()
+        outputs = self._eval_function(inputs)  # pylint: disable=not-callable
+
+    if reset_metrics:
+      self.reset_metrics()
 
     if len(outputs) == 1:
       return outputs[0]
@@ -2041,28 +2257,21 @@ class Model(Network):
                                 'models compiled with DistributionStrategy.')
     # Validate and standardize user data.
     inputs, _, _ = self._standardize_user_data(x)
-    if context.executing_eagerly():
-      if (isinstance(x, iterator_ops.EagerIterator) or
-          (isinstance(x, dataset_ops.Dataset) and context.executing_eagerly())):
+    if self.run_eagerly:
+      if (isinstance(inputs, iterator_ops.EagerIterator) or
+          (isinstance(inputs, dataset_ops.DatasetV2))):
         inputs = training_utils.cast_if_floating_dtype(inputs)
-      else:
+      elif isinstance(inputs, collections.Sequence):
         inputs = [
-            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs
-        ]
+            ops.convert_to_tensor(val, dtype=K.floatx()) for val in inputs]
       return self(inputs)  # pylint: disable=not-callable
 
-    if not context.executing_eagerly():
-      if not isinstance(K.learning_phase(), int):
-        ins = inputs + [0]
-      else:
-        ins = inputs
-
-      self._make_predict_function()
-      outputs = self.predict_function(ins)
+    self._make_predict_function()
+    outputs = self.predict_function(inputs)
 
-      if len(outputs) == 1:
-        return outputs[0]
-      return outputs
+    if len(outputs) == 1:
+      return outputs[0]
+    return outputs
 
   def fit_generator(self,
                     generator,
@@ -2172,11 +2381,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`fit_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`fit_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.fit_generator(
         self,
         generator,
@@ -2243,12 +2447,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`evaluate_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`evaluate_generator` is not yet enabled for '
-          'unbuilt Model subclasses')
-
     return training_generator.evaluate_generator(
         self,
         generator,
@@ -2300,11 +2498,6 @@ class Model(Network):
     if self._distribution_strategy:
       raise NotImplementedError('`predict_generator` is not supported for '
                                 'models compiled with DistributionStrategy.')
-
-    if not self.built and not self._is_graph_network:
-      raise NotImplementedError(
-          '`predict_generator` is not yet enabled for unbuilt Model subclasses')
-
     return training_generator.predict_generator(
         self,
         generator,
@@ -2336,15 +2529,64 @@ class Model(Network):
     self._replicated_model = DistributedCallbackModel(first_replicated_model)
     self._replicated_model.set_original_model(self)
 
+  def _validate_or_infer_batch_size(self, batch_size, steps, x):
+    """Validates that the `batch_size` provided is consistent with InputLayer.
+
+    It's possible that the user specified a static batch size in their
+    InputLayer. If so, this method checks the provided `batch_size` and `x`
+    arguments are consistent with this static batch size. Also, if
+    `batch_size` is `None`, this method will attempt to infer the batch size
+    from the static batch size of the InputLayer.
+
+    Arguments:
+      batch_size: The batch_size provided as an argument to
+        fit/evaluate/predict.
+      steps: The steps provided as an argument to fit/evaluate/predict.
+      x: The data passed as `x` to fit/evaluate/predict.
+
+    Returns:
+      The validated batch_size, auto-inferred from the first layer if not
+      provided.
+    """
+    layers = super(Model, self).layers  # Avoids the override in Sequential.
+    if layers:
+      first_layer = layers[0]
+      static_batch_size = training_utils.get_static_batch_size(first_layer)
+      if static_batch_size is not None:
+
+        # Check `batch_size` argument is consistent with InputLayer.
+        if batch_size is not None and batch_size != static_batch_size:
+          raise ValueError('The `batch_size` argument value {} is incompatible '
+                           'with the specified batch size of your Input Layer: '
+                           '{}'.format(batch_size, static_batch_size))
+
+        # Check Dataset/Iterator batch size is consistent with InputLayer.
+        if isinstance(x, (dataset_ops.DatasetV2, iterator_ops.Iterator,
+                          iterator_ops.EagerIterator)):
+          ds_batch_size = tensor_shape.as_dimension(
+              nest.flatten(x.output_shapes)[0][0]).value
+          if ds_batch_size is not None and ds_batch_size != static_batch_size:
+            raise ValueError('The batch output shape of your `Dataset` is {}, '
+                             'which is incompatible with the specified batch '
+                             'size of your Input Layer: {}'.format(
+                                 ds_batch_size, static_batch_size))
+
+        # Set inferred batch size from the InputLayer.
+        if steps is None:
+          batch_size = static_batch_size
+
+    if batch_size is None and steps is None:
+      # Backwards compatibility
+      batch_size = 32
+    return batch_size
+
 
 class DistributedCallbackModel(Model):
   """Model that is used for callbacks with DistributionStrategy."""
 
   def __init__(self, model):
     super(DistributedCallbackModel, self).__init__()
-    # TODO(anjalisridhar): Right now the only attributes set are the layer and
-    # weights. We may need to set additional attributes as needed since we have
-    # not called compile on this model.
+    self.optimizer = model.optimizer
 
   def set_original_model(self, orig_model):
     self._original_model = orig_model
@@ -2376,3 +2618,7 @@ class DistributedCallbackModel(Model):
       logging.warning('You are accessing attribute ' + item + ' of the '
                       'DistributedCallbackModel that may not have been set '
                       'correctly.')
+
+
+def _is_symbolic_tensor(x):
+  return tensor_util.is_tensor(x) and not isinstance(x, ops.EagerTensor)
diff --git a/tensorflow/python/keras/engine/training_arrays.py b/tensorflow/python/keras/engine/training_arrays.py
index a2a13b9bd60b1721cc8332212b0d0e13c5b544e0..196d48faec23acd42bca33414b4862a5084d18f5 100644
--- a/tensorflow/python/keras/engine/training_arrays.py
+++ b/tensorflow/python/keras/engine/training_arrays.py
@@ -23,9 +23,11 @@ import functools
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import errors
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import callbacks as cbks
+from tensorflow.python.keras.engine import training_distributed
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils.generic_utils import make_batches
 from tensorflow.python.keras.utils.generic_utils import slice_arrays
@@ -37,91 +39,6 @@ except ImportError:
   issparse = None
 
 
-class Aggregator(object):
-  """Abstract base class used to aggregate batch-level outputs of a loop.
-
-  Arguments:
-    use_steps: Whether the loop is using `step` or `batch_size`.
-    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
-  """
-
-  def __init__(self, use_steps, num_samples_or_steps):
-    self.use_steps = use_steps
-    self.num_samples_or_steps = num_samples_or_steps
-    self.results = []
-
-  def create(self, batch_outs):
-    """Create the initial results from the first batch outputs.
-
-    Arguments:
-      batch_outs: A list of batch-level outputs.
-    """
-    raise NotImplementedError
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    """Aggregate batch-level results into total results.
-
-    Arguments:
-      batch_outs: A list of batch-level outputs.
-      batch_start: The start index of this batch. Always `None` if `use_steps`
-        is `True`.
-      batch_end: The end index of this batch. Always `None` if `use_steps` is
-        `True`.
-    """
-    raise NotImplementedError
-
-  def finalize(self):
-    """Prepare the total results to be returned."""
-    raise NotImplementedError
-
-
-class MetricsAggregator(Aggregator):
-  """Aggregator that calculates loss and metrics info."""
-
-  def create(self, batch_outs):
-    self.results = [0.] * len(batch_outs)
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    # Loss.
-    if self.use_steps:
-      self.results[0] += batch_outs[0]
-    else:
-      self.results[0] += batch_outs[0] * (batch_end - batch_start)
-    # Metrics (always stateful, just grab current values.)
-    self.results[1:] = batch_outs[1:]
-
-  def finalize(self):
-    self.results[0] /= self.num_samples_or_steps
-
-
-class OutputsAggregator(Aggregator):
-  """Aggregator that concatenates outputs."""
-
-  def create(self, batch_outs):
-    if self.use_steps:
-      # Cannot pre-allocate the returned NumPy arrays bc
-      # batch sizes are unknown. Concatenate batches at the end.
-      for _ in batch_outs:
-        self.results.append([])
-    else:
-      # Pre-allocate NumPy arrays.
-      for batch_out in batch_outs:
-        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
-        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
-
-  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
-    if self.use_steps:
-      for i, batch_out in enumerate(batch_outs):
-        self.results[i].append(batch_out)
-    else:
-      for i, batch_out in enumerate(batch_outs):
-        self.results[i][batch_start:batch_end] = batch_out
-
-  def finalize(self):
-    if self.use_steps:
-      self.results = [np.concatenate(result, axis=0) for result in self.results]
-
-
 def _get_model_feed(model, mode):
   if mode == 'predict':
     feed = model._feed_inputs
@@ -151,13 +68,6 @@ def _print_train_info(inputs, val_inputs, steps_per_epoch, verbose):
           (inputs[0].shape[0], val_inputs[0].shape[0]))
 
 
-def _get_progbar(model, count_mode):
-  stateful_metric_names = None
-  if hasattr(model, 'metrics_names'):
-    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
-  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
-
-
 def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
   """Returns total number of samples (when training in batch mode) or steps."""
   if steps_per_epoch:
@@ -166,16 +76,50 @@ def _get_num_samples_or_steps(ins, batch_size, steps_per_epoch):
                                           'steps_per_epoch')
 
 
-def _make_logs(model, outputs, mode, prefix=''):
-  """Used to make logs to send to `on_batch_end` methods."""
-  logs = {}
-  # TODO(omalleyt): handle outputs in prediction when Callback
-  # hooks are ready.
-  if mode in ['train', 'test']:
-    if hasattr(model, 'metrics_names'):
-      for label, output in zip(model.metrics_names, outputs):
-        logs[prefix + label] = output
-  return logs
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  if model._distribution_strategy:
+    def get_distributed_inputs():
+      return training_distributed._prepare_feed_values(
+          model, inputs, targets, sample_weights, mode)
+
+    # In the eager case, we want to call the input method per step, so return
+    # a lambda from here that can be called. Note that this is applicable only
+    # in Distribution Strategy case as it follows the same code path for both
+    # eager and graph modes.
+    # TODO(priyag,omalleyt): Either we should move the training DS with
+    # EagerIterator to use training_generator code path, or figure out how to
+    # set a symbolic Iterator out of a Dataset when in eager mode.
+    if context.executing_eagerly():
+      return get_distributed_inputs
+    else:
+      return get_distributed_inputs()
+
+  inputs = training_utils.ModelInputs(inputs).as_list()
+  targets = targets or []
+  sample_weights = sample_weights or []
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of model execution."""
+  if model._distribution_strategy:
+    return training_distributed._make_execution_function(model, mode)
+  return model._make_execution_function(mode)
 
 
 def model_iteration(model,
@@ -194,6 +138,7 @@ def model_iteration(model,
                     steps_per_epoch=None,
                     validation_steps=None,
                     mode='train',
+                    validation_in_fit=False,
                     **kwargs):
   """Loop function for arrays of data with modes 'train'/'test'/'predict'.
 
@@ -220,6 +165,9 @@ def model_iteration(model,
       validation_steps: Number of steps to run validation for (only if doing
         validation from data tensors). Ignored with the default value of `None`.
       mode: One of 'train'/'test'/'predict'.
+      validation_in_fit: if true, then this method is invoked from within
+        training iteration (for validation). In this case, do not copy weights
+        when using a tf.distribute.Strategy.
       **kwargs: Additional arguments for backwards compatibility.
 
   Returns:
@@ -238,19 +186,18 @@ def model_iteration(model,
   if mode == 'train':
     _print_train_info(inputs, val_inputs, steps_per_epoch, verbose)
 
+  # Enter DistributionStrategy scope.
+  if model._distribution_strategy:
+    scope = model._distribution_strategy.scope()
+    scope.__enter__()
+
   # Get step function and loop type.
-  f = model._get_execution_function(mode)
+  f = _make_execution_function(model, mode)
   use_steps = steps_per_epoch is not None
   do_validation = val_inputs is not None
 
   # Prepare input data.
-  inputs = training_utils.ModelInputs(inputs).as_list()
-  targets = targets or []
-  sample_weights = sample_weights or []
-  learning_phase_input = []
-  if not isinstance(K.learning_phase(), int):
-    learning_phase_input = [1] if mode == 'train' else [0]
-  ins = inputs + targets + sample_weights + learning_phase_input
+  ins = _prepare_feed_values(model, inputs, targets, sample_weights, mode)
   num_samples_or_steps = _get_num_samples_or_steps(ins, batch_size,
                                                    steps_per_epoch)
 
@@ -260,24 +207,19 @@ def model_iteration(model,
       callbacks,
       model,
       do_validation=do_validation,
-      val_inputs=val_inputs,
-      val_targets=val_targets,
-      val_sample_weights=val_sample_weights,
       batch_size=batch_size,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
       samples=num_samples_or_steps,
-      validation_steps=validation_steps,
       verbose=0,  # Handle ProgBarLogger separately in this loop.
-      count_mode=count_mode,
       mode=mode)
   # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
-  progbar = _get_progbar(model, count_mode)
+  progbar = training_utils.get_progbar(model, count_mode)
   progbar.params = callbacks.params
   progbar.params['verbose'] = verbose
 
   # Find beforehand arrays that need sparse-to-dense conversion.
-  if issparse is not None:
+  if issparse is not None and not use_steps:
     indices_for_conversion_to_dense = []
     feed = _get_model_feed(model, mode)
     for i, (input_data, feed_tensor) in enumerate(zip(ins, feed)):
@@ -286,23 +228,27 @@ def model_iteration(model,
 
   # Select aggregation method.
   if mode == 'predict':
-    aggregator = OutputsAggregator(use_steps, num_samples_or_steps)
+    aggregator = training_utils.OutputsAggregator(use_steps,
+                                                  num_samples_or_steps)
   else:
-    aggregator = MetricsAggregator(use_steps, num_samples_or_steps)
+    aggregator = training_utils.MetricsAggregator(use_steps,
+                                                  num_samples_or_steps)
+
+  if model._distribution_strategy and not validation_in_fit:
+    training_distributed._copy_weights_to_distributed_model(
+        model, model._grouped_model)
 
   callbacks.model.stop_training = False
   callbacks._call_begin_hook(mode)
   progbar.on_train_begin()
+
   for epoch in range(initial_epoch, epochs):
     if callbacks.model.stop_training:
       break
 
     # Setup work for each epoch
-    results = []
     epoch_logs = {}
-    if hasattr(model, 'stateful_metric_functions'):
-      for m in model.stateful_metric_functions:
-        m.reset_states()
+    model.reset_metrics()
     callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
     progbar.on_epoch_begin(epoch, epoch_logs)
 
@@ -315,26 +261,31 @@ def model_iteration(model,
 
         # Get outputs.
         try:
-          batch_outs = f(ins)
+          # `ins` can be callable in DistributionStrategy + eager case.
+          actual_inputs = ins() if callable(ins) else ins
+          batch_outs = f(actual_inputs)
         except errors.OutOfRangeError:
           logging.warning('Your dataset iterator ran out of data; '
                           'interrupting training. Make sure that your dataset '
                           'can generate at least `steps_per_epoch * epochs` '
                           'batches (in this case, %d batches). You may need to'
                           'use the repeat() function when building your '
-                          'dataset.' %
-                          steps_per_epoch * epochs)
+                          'dataset.' % steps_per_epoch * epochs)
           break
         if not isinstance(batch_outs, list):
           batch_outs = [batch_outs]
 
+        if model._distribution_strategy:
+          batch_outs = training_distributed._per_device_aggregate_batch(
+              batch_outs, model, mode)
+
         # Aggregate results.
         if step == 0:
           aggregator.create(batch_outs)
         aggregator.aggregate(batch_outs)
 
         # Callbacks batch end.
-        batch_logs.update(_make_logs(model, batch_outs, mode))
+        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
         callbacks._call_batch_hook(mode, 'end', step, batch_logs)
         progbar.on_batch_end(step, batch_logs)
 
@@ -365,8 +316,9 @@ def model_iteration(model,
                           'pass shuffle="batch".')
 
         # Sparse to dense conversion.
-        for i in indices_for_conversion_to_dense:
-          ins_batch[i] = ins_batch[i].toarray()
+        if issparse is not None:
+          for i in indices_for_conversion_to_dense:
+            ins_batch[i] = ins_batch[i].toarray()
 
         # Callbacks batch_begin.
         batch_logs = {'batch': batch_index, 'size': len(batch_ids)}
@@ -384,7 +336,7 @@ def model_iteration(model,
         aggregator.aggregate(batch_outs, batch_start, batch_end)
 
         # Callbacks batch end.
-        batch_logs.update(_make_logs(model, batch_outs, mode))
+        batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
         callbacks._call_batch_hook(mode, 'end', batch_index, batch_logs)
         progbar.on_batch_end(batch_index, batch_logs)
 
@@ -393,7 +345,7 @@ def model_iteration(model,
 
     aggregator.finalize()
     results = aggregator.results
-    epoch_logs.update(_make_logs(model, results, mode))
+    epoch_logs.update(training_utils.make_logs(model, results, mode))
     if len(results) == 1:
       results = results[0]
 
@@ -408,15 +360,25 @@ def model_iteration(model,
           steps_per_epoch=validation_steps,
           callbacks=callbacks,
           verbose=0,
-          mode='test')
+          mode='test',
+          validation_in_fit=True)
       if not isinstance(val_results, list):
         val_results = [val_results]
-      epoch_logs.update(_make_logs(model, val_results, mode, prefix='val_'))
+      epoch_logs.update(
+          training_utils.make_logs(model, val_results, mode, prefix='val_'))
 
     callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
     progbar.on_epoch_end(epoch, epoch_logs)
   callbacks._call_end_hook(mode)
 
+  if model._distribution_strategy:
+    # TODO(priyag, psv): Copy back metrics to the original model as well?
+    if not validation_in_fit:
+      training_distributed._copy_weights_to_original_model(
+          model, model._grouped_model, mode)
+
+    scope.__exit__(None, None, None)
+
   if mode == 'train':
     return model.history
   return results
diff --git a/tensorflow/python/keras/engine/training_dataset_test.py b/tensorflow/python/keras/engine/training_dataset_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6cc93d1ef77b14142851e6267158d61edcbc13b
--- /dev/null
+++ b/tensorflow/python/keras/engine/training_dataset_test.py
@@ -0,0 +1,351 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for training routines."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import logging
+
+import numpy as np
+
+from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.ops.losses import losses_impl
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
+
+
+class TestTrainingWithDatasetIterators(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_iterators_single_io(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(iterator, steps=2, verbose=1)
+    model.predict(iterator, steps=2)
+
+    # Test with validation data
+    model.fit(iterator,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=iterator, validation_steps=2)
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(iterator,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          iterator,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(iterator, iterator,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(iterator, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(iterator, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(iterator, verbose=0)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_get_next_op_created_once(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+    # Finalize graph to make sure we are not appending another iterator
+    # get_next op in the graph.
+    ops.get_default_graph().finalize()
+    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_iterators_running_out_of_data(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(2)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+
+    with test.mock.patch.object(logging, 'warning') as mock_log:
+      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
+      self.assertRegexpMatches(
+          str(mock_log.call_args),
+          'dataset iterator ran out of data')
+
+
+class TestTrainingWithDataset(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_calling_model_on_same_dataset(self):
+    if ((not testing_utils.should_run_eagerly())
+        and testing_utils.get_model_type() == 'subclass'
+        and context.executing_eagerly()):
+      self.skipTest('b/120673224')
+
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae']
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    # Call fit with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+    # Finalize the graph to make sure new ops aren't added when calling on the
+    # same dataset
+    ops.get_default_graph().finalize()
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_training_and_eval_methods_on_dataset(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+    # Test with validation data
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=dataset, validation_steps=2)
+
+    # Test with validation split
+    with self.assertRaisesRegexp(
+        ValueError, '`validation_split` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(dataset,
+                epochs=1, steps_per_epoch=2, verbose=0,
+                validation_split=0.5, validation_steps=2)
+
+    # Test with sample weight.
+    sample_weight = np.random.random((10,))
+    with self.assertRaisesRegexp(
+        ValueError, '`sample_weight` argument is not supported '
+        'when input `x` is a dataset or a dataset iterator'):
+      model.fit(
+          dataset,
+          epochs=1,
+          steps_per_epoch=2,
+          verbose=0,
+          sample_weight=sample_weight)
+
+    # Test invalid usage
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should not specify a target'):
+      model.fit(dataset, dataset,
+                epochs=1, steps_per_epoch=2, verbose=0)
+
+    with self.assertRaisesRegexp(
+        ValueError, 'you should specify the `steps_per_epoch` argument'):
+      model.fit(dataset, epochs=1, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.evaluate(dataset, verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'you should specify the `steps` argument'):
+      model.predict(dataset, verbose=0)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_with_sample_weights(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    metrics = ['mae', metrics_module.CategoricalAccuracy()]
+    model.compile(optimizer, loss, metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    inputs = np.zeros((10, 3), np.float32)
+    targets = np.zeros((10, 4), np.float32)
+    sample_weights = np.ones((10), np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
+                                                      sample_weights))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+
+    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+    model.evaluate(dataset, steps=2, verbose=1)
+    model.predict(dataset, steps=2)
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_dataset_with_sparse_labels(self):
+    model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    for loss in ['sparse_categorical_crossentropy',
+                 losses_impl.sparse_softmax_cross_entropy]:
+      model.compile(optimizer, loss,
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      inputs = np.zeros((10, 3), dtype=np.float32)
+      targets = np.random.randint(0, 4, size=10, dtype=np.int32)
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  @tf_test_util.run_deprecated_v1
+  def test_dataset_input_shape_validation(self):
+    with self.cached_session():
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+
+      # User forgets to batch the dataset
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+
+      with self.assertRaisesRegexp(
+          ValueError,
+          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
+      ):
+        model.train_on_batch(dataset)
+
+      # Wrong input shape
+      inputs = np.zeros((10, 5))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      with self.assertRaisesRegexp(ValueError,
+                                   r'expected (.*?) to have shape \(3,\)'):
+        model.train_on_batch(dataset)
+
+
+class TestMetricsWithDatasetIterators(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_with_all_model_types
+  @keras_parameterized.run_all_keras_modes
+  def test_metrics_correctness_with_iterator(self):
+    layers = [
+        keras.layers.Dense(8, activation='relu', input_dim=4,
+                           kernel_initializer='ones'),
+        keras.layers.Dense(1, activation='sigmoid', kernel_initializer='ones')
+    ]
+
+    model = testing_utils.get_model_from_layers(layers, (4,))
+
+    model.compile(
+        loss='binary_crossentropy',
+        metrics=['accuracy', metrics_module.BinaryAccuracy()],
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
+
+    np.random.seed(123)
+    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
+    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
+    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
+
+    y = np.zeros((100, 1), dtype=np.float32)
+    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
+    dataset = dataset.repeat(100)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    outs = model.evaluate(iterator, steps=10)
+    self.assertEqual(outs[1], 0.)
+    self.assertEqual(outs[2], 0.)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/engine/training_distributed.py b/tensorflow/python/keras/engine/training_distributed.py
index 808d7c9f3331df3fdc6613b70b64e94570c6e0de..d20d092d8e61499e4a005f7d6770a3c0a0ee60fc 100644
--- a/tensorflow/python/keras/engine/training_distributed.py
+++ b/tensorflow/python/keras/engine/training_distributed.py
@@ -19,9 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import enum
+import enum  # pylint: disable=g-bad-import-order
 import numpy as np
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -34,9 +37,7 @@ from tensorflow.python.keras.engine import distributed_training_utils
 from tensorflow.python.keras.utils.generic_utils import Progbar
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import variable_scope
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribute as distribute_lib
 from tensorflow.python.util import nest
 
 
@@ -48,180 +49,15 @@ class _Mode(enum.Enum):
 # TODO(priyag, sourabhbajaj): Refactor this file to address code duplication.
 
 
-def fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    val_iterator=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    validation_steps=None):
-  """Fit loop for training with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      epochs: Number of times to iterate over the data
-      verbose: Integer, Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      val_iterator: Iterator for validation data.
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for
-          (only if doing validation from data tensors).
-          Ignored with the default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-      ValueError: in case of invalid arguments.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_fit_loop(
-        model, iterator, epochs, verbose, callbacks, initial_epoch,
-        steps_per_epoch, val_iterator, validation_steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy, make_callback_model=True)
-
-  def _per_device_fit_function(model):
-    model._make_fit_function()
-    return (model._fit_function.inputs, model._fit_function.outputs,
-            model._fit_function.updates_op, model._fit_function.session_kwargs)
-
-  inputs, targets, sample_weights = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    # Create train ops on each of the devices when we call
-    # `_per_device_fit_function`.
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_fit_function, args=(model._grouped_model,))
-    # Unwrap all the per device values returned from `call_for_each_replica`.
-    # Unwrapping per device values gives you a list of values that can be
-    # used to construct a new train function that is composed of update ops on
-    # all the devices over which the model is distributed.
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs,
-         grouped_updates, grouped_session_args, with_loss_tensor=True)
-
-    # Dataset inputs and targets are also per devices values that need to be
-    # unwrapped.
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    # Create a train function that is composed of all the parameters above.
-    distributed_fit_function = K.function(
-        all_inputs,
-        all_outputs,
-        updates=all_updates,
-        name='distributed_fit_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(
-        len(model.outputs) * current_strategy.num_replicas_in_sync)]
-    if not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [1]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    do_validation = False
-    if validation_steps:
-      do_validation = True
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=None,
-        val_targets=None,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-    out_labels = model.metrics_names or []
-    callbacks.on_train_begin()
-
-    assert steps_per_epoch is not None
-
-    for epoch in range(initial_epoch, epochs):
-      # Reset stateful metrics
-      for m in model.stateful_metric_functions:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      for step_index in range(steps_per_epoch):
-        batch_logs = {'batch': step_index, 'size': 1}
-        callbacks.on_batch_begin(step_index, batch_logs)
-        try:
-          outs = distributed_fit_function(ins)
-        except errors.OutOfRangeError:
-          logging.warning('Your dataset iterator ran out of data; '
-                          'interrupting training. Make sure that your dataset '
-                          'can generate at least `steps_per_epoch * epochs` '
-                          'batches (in this case, %d batches).' %
-                          steps_per_epoch * epochs)
-          break
-
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(out_labels, outs):
-          batch_logs[l] = o
-        callbacks.on_batch_end(step_index, batch_logs)
-        if callbacks.model.stop_training:
-          break
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_iterator,
-            steps=validation_steps,
-            verbose=0)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(out_labels, val_outs):
-          epoch_logs['val_' + l] = o
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-    callbacks.on_train_end()
-
-    # Copy the weights back from the replicated model to the original model.
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model)[0].get_weights()
-    model.set_weights(updated_weights)
-    return model.history
-
-
-def _experimental_fit_loop(
-    model,
-    iterator,
-    epochs=100,
-    verbose=1,
-    callbacks=None,
-    initial_epoch=0,
-    steps_per_epoch=None,
-    val_iterator=None,
-    validation_steps=None):
+def experimental_fit_loop(model,
+                          iterator,
+                          epochs=100,
+                          verbose=1,
+                          callbacks=None,
+                          initial_epoch=0,
+                          steps_per_epoch=None,
+                          val_iterator=None,
+                          validation_steps=None):
   """Fit loop for training with TPU DistributionStrategy.
 
   Arguments:
@@ -259,11 +95,12 @@ def _experimental_fit_loop(
   K.set_learning_phase(1)
   out_labels = model.metrics_names or []
 
-  def step_fn(ctx, inputs, targets):
+  def step_fn(ctx, inputs):
     """Clones the model and calls make_fit_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
+    inputs, targets = inputs
     clone_model_on_replicas(
         model,
         current_strategy,
@@ -273,7 +110,7 @@ def _experimental_fit_loop(
         mode=_Mode.TRAIN)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
          _per_device_fit_function, args=(model._grouped_model_train,))
     (all_inputs, all_outputs, all_updates,
      all_session_args) = distributed_training_utils.unwrap_values(
@@ -288,12 +125,12 @@ def _experimental_fit_loop(
 
     for label, output in zip(out_labels, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     # TODO(priyag, sourabhbajaj): Ignoring these things from the combined_fn:
     # feed_dict, session kwargs, run options, run_metadata for now. These should
@@ -303,19 +140,20 @@ def _experimental_fit_loop(
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   if steps_per_epoch is None:
     raise ValueError('`steps_per_epoch` should be specified when calling '
                      '`fit` on the model.')
   steps_per_run = K.variable(
-      value=min(steps_per_epoch, current_strategy.steps_per_run),
+      value=min(steps_per_epoch, current_strategy.extended.steps_per_run),
       dtype='int32',
       name='steps_per_run')
 
   with current_strategy.scope():
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=steps_per_run,
         initial_loop_values=initial_loop_values)
 
@@ -325,29 +163,28 @@ def _experimental_fit_loop(
   do_validation = bool(validation_steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_train)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
+    _copy_weights_to_distributed_model(model, model._grouped_model_train)
+
   callbacks = cbks.configure_callbacks(
       callbacks,
       model,
       do_validation=do_validation,
-      val_inputs=None,
-      val_targets=None,
       epochs=epochs,
       steps_per_epoch=steps_per_epoch,
       verbose=verbose)
 
   # Calculate the steps each time on the device.
-  steps_to_run = [current_strategy.steps_per_run] * (
-      steps_per_epoch // current_strategy.steps_per_run)
-  if steps_per_epoch % current_strategy.steps_per_run:
-    steps_to_run.append(steps_per_epoch % current_strategy.steps_per_run)
+  steps_to_run = [current_strategy.extended.steps_per_run] * (
+      steps_per_epoch // current_strategy.extended.steps_per_run)
+  if steps_per_epoch % current_strategy.extended.steps_per_run:
+    steps_to_run.append(
+        steps_per_epoch % current_strategy.extended.steps_per_run)
 
   callbacks.on_train_begin()
   for epoch in range(initial_epoch, epochs):
+    with current_strategy.scope():
+      _reset_metrics(model, model._grouped_model_train)
     callbacks.on_epoch_begin(epoch)
     epoch_logs = {}
     step_index = 0
@@ -380,11 +217,10 @@ def _experimental_fit_loop(
       # Since we create a new clone from the original model we need to copy
       # the weights back to the original model before we can run validation.
       with current_strategy.scope():
-        updated_weights = current_strategy.unwrap(
-            model._grouped_model_train)[0].get_weights()
-        model.set_weights(updated_weights)
+        _copy_weights_to_original_model(model, model._grouped_model_train,
+                                        'train')
 
-      val_outs = _experimental_test_loop(
+      val_outs = experimental_test_loop(  # pylint: disable=undefined-variable
           model,
           val_iterator,
           steps=validation_steps,
@@ -403,113 +239,17 @@ def _experimental_fit_loop(
 
   # Copy the weights back from the replicated model to the original model.
   with current_strategy.scope():
-    updated_weights = current_strategy.unwrap(
-        model._grouped_model_train)[0].get_weights()
-    model.set_weights(updated_weights)
+    _copy_weights_to_original_model(model, model._grouped_model_train, 'train')
 
   K.get_session().run(current_strategy.finalize())
   return model.history
 
 
-def test_loop(model, iterator, verbose=0, steps=None):
-  """Test loop for evaluating with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the outputs.
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_test_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy)
-
-  def _per_device_eval_function(model):
-    model._make_eval_function()
-    return (model._eval_function.inputs, model._eval_function.outputs,
-            model._eval_function.updates_op,
-            model._eval_function.session_kwargs)
-
-  inputs, targets, sample_weights = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_eval_function, args=(model._grouped_model,))
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args, with_loss_tensor=True)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-    dataset_targets = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, targets)
-
-    distributed_test_function = K.function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_test_function',
-        **all_session_args)
-
-    # We need to set sample_weights to None since there are sample weight
-    # placeholders that are created with default values.
-    sample_weights = [None for _ in range(
-        len(model.outputs) * current_strategy.num_replicas_in_sync)]
-    if not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + dataset_targets + sample_weights + [0]
-    else:
-      ins = dataset_inputs + dataset_targets
-
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-
-    outs = []
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    assert steps is not None
-    for step in range(steps):
-      batch_outs = distributed_test_function(ins)
-      if isinstance(batch_outs, list):
-        if step == 0:
-          outs = [0.] * len(batch_outs)
-        outs[0] += batch_outs[0]  # index 0 = 'loss'
-        outs[1:] = batch_outs[1:]
-      else:
-        if step == 0:
-          outs.append(0.)
-        outs[0] += batch_outs  # index 0 = 'loss'
-      if verbose >= 1:
-        progbar.update(step + 1)
-    outs[0] /= steps  # index 0 = 'loss'
-
-    if len(outs) == 1:
-      return outs[0]
-    return outs
-
-
-def _experimental_test_loop(model, iterator, verbose=0, steps=None,
-                            initialize_finalize_strategy=True):
+def experimental_test_loop(model,
+                           iterator,
+                           verbose=0,
+                           steps=None,
+                           initialize_finalize_strategy=True):
   """Test loop for evaluating with TPU DistributionStrategy.
 
   Arguments:
@@ -541,11 +281,12 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
   # TODO(priyag, sourabhbajaj): This should likely not be hardcoded here.
   K.set_learning_phase(0)
 
-  def step_fn(ctx, inputs, targets):
+  def step_fn(ctx, inputs):
     """Clones the model and calls make_eval_function."""
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
     # fit/test/predict is called. We should look into caching this keyed on
     # input shapes.
+    inputs, targets = inputs
     clone_model_on_replicas(
         model,
         current_strategy,
@@ -555,7 +296,7 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
         mode=_Mode.TEST)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
          _per_device_eval_function, args=(model._grouped_model_test,))
 
     (all_inputs, all_outputs, all_updates,
@@ -571,25 +312,26 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
 
     for label, output in zip(model.metrics_names, combined_fn.outputs):
       if label == 'loss':
-        aggregation = distribute_lib.get_loss_reduction()
+        reduce_op = distribute_lib.get_loss_reduction()
       else:
-        # We aggregate all other metrics using mean for now. This is temporary
+        # We reduce all other metrics using mean for now. This is temporary
         # workaround until new metrics are in place.
-        aggregation = variable_scope.VariableAggregation.MEAN
-      ctx.set_last_step_output(label, output, aggregation)
+        reduce_op = ds_reduce_util.ReduceOp.MEAN
+      ctx.set_last_step_output(label, output, reduce_op)
 
     return combined_fn.updates_op
 
   # Add initial dummy values for loss and other metric tensors.
   initial_loop_values = {}
   initial_loop_values['loss'] = constant_op.constant(1e7)
-  for name, tensor in zip(model.metrics_names[1:], model.metrics_tensors):
+  for name in model.metrics_names[1:]:
+    tensor = model._all_stateful_metrics_tensors[name]
     initial_loop_values[name] = array_ops.zeros(tensor.shape, tensor.dtype)
 
   with current_strategy.scope():
     # TODO(priyag): Use steps_per_run when we use new metrics as they will
     # allow handling metric computation at each step using variables.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -600,22 +342,26 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_test)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_test)
+    _reset_metrics(model, model._grouped_model_test)
   assert steps is not None
   outs = [0.] * len(model.metrics_names)
   for step in range(steps):
     _, batch_outs = K.get_session().run([test_op, output_tensors])
     for i, label in enumerate(model.metrics_names):
-      outs[i] += batch_outs[label]
+      if i == 0:
+        # Loss is stateless metrics.
+        outs[i] += batch_outs[label]
+      else:
+        # For all stateful metrics, the aggregation is handled by mirrored vars.
+        outs[i] = batch_outs[label]
+
     if verbose >= 1:
       progbar.update(step + 1)
-  for i in range(len(outs)):
-    outs[i] /= (steps)
+
+  if len(outs) >= 0:
+    outs[0] /= (steps)
 
   if initialize_finalize_strategy:
     K.get_session().run(current_strategy.finalize())
@@ -625,103 +371,7 @@ def _experimental_test_loop(model, iterator, verbose=0, steps=None,
   return outs
 
 
-def predict_loop(model, iterator, verbose=0, steps=None):
-  """Predict loop for predicting with DistributionStrategy.
-
-  Arguments:
-      model: Keras Model instance.
-      iterator: Iterator for input data.
-      verbose: Integer, Verbosity mode 0 or 1.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  current_strategy = model._distribution_strategy
-
-  # TODO(priyag, sourabhbajaj): Remove this when the codepaths are merged.
-  if current_strategy.__class__.__name__ == 'TPUStrategy':
-    return _experimental_predict_loop(model, iterator, verbose, steps)
-
-  if not model._grouped_model:
-    clone_model_on_replicas(model, current_strategy)
-
-  def _per_device_predict_function(model):
-    model._make_predict_function()
-    return (model.predict_function.inputs,
-            model.predict_function.outputs,
-            model.predict_function.updates_op,
-            model.predict_function.session_kwargs)
-
-  inputs, _, _ = _get_input_from_iterator(iterator, model)
-  with current_strategy.scope():
-    (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
-         _per_device_predict_function, args=(model._grouped_model,))
-
-    (all_inputs, all_outputs, all_updates,
-     all_session_args) = distributed_training_utils.unwrap_values(
-         current_strategy, grouped_inputs, grouped_outputs, grouped_updates,
-         grouped_session_args)
-
-    dataset_inputs = distributed_training_utils.flatten_perdevice_values(
-        current_strategy, inputs)
-
-    distributed_predict_function = K.function(
-        all_inputs, all_outputs,
-        updates=all_updates,
-        name='distributed_predict_function',
-        **all_session_args)
-
-    if not isinstance(K.learning_phase(), int):
-      ins = dataset_inputs + [0]
-    else:
-      ins = dataset_inputs
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    # Copy the weights from the original model to each of the replicated models.
-    orig_model_weights = model.get_weights()
-    distributed_model = current_strategy.unwrap(model._grouped_model)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
-    num_replicas = current_strategy.num_replicas_in_sync
-    # Since we do not know how many samples we will see, we cannot
-    # pre-allocate the returned Numpy arrays. Instead, we store one array per
-    # batch seen and concatenate them upon returning.
-    unconcatenated_outs = []
-    assert steps is not None
-    for step in range(steps):
-      batch_outs = distributed_predict_function(ins)
-      if not isinstance(batch_outs, list):
-        batch_outs = [batch_outs]
-      if step == 0:
-        # batch_outs gives you the number of model outputs. In the distributed
-        # case this will be number of model_outputs * num_replicas.
-        for _ in range(len(model.outputs)):
-          unconcatenated_outs.append([])
-      for i in range(len(model.outputs)):
-        nested_outs = batch_outs[i * num_replicas:
-                                 i * num_replicas + num_replicas]
-        outs = nest.flatten(nested_outs)
-        unconcatenated_outs[i].extend(outs)
-      if verbose >= 1:
-        progbar.update(step + 1)
-    if len(unconcatenated_outs) == 1:
-      return np.concatenate(unconcatenated_outs[0], axis=0)
-    return [
-        np.concatenate(unconcatenated_outs[i], axis=0)
-        for i in range(len(unconcatenated_outs))
-    ]
-
-
-def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
+def experimental_predict_loop(model, iterator, verbose=0, steps=None):
   """Predict loop for predicting with TPU DistributionStrategy.
 
   Arguments:
@@ -750,7 +400,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
             model.predict_function.updates_op,
             model.predict_function.session_kwargs)
 
-  def step_fn(ctx, *inputs):
+  def step_fn(ctx, inputs):
     """Clones the model and calls make_predict_function."""
 
     # TODO(priyag, sourabhbajaj): The model gets cloned every time
@@ -764,7 +414,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
         mode=_Mode.PREDICT)
 
     (grouped_inputs, grouped_outputs, grouped_updates,
-     grouped_session_args) = current_strategy.call_for_each_replica(
+     grouped_session_args) = current_strategy.extended.call_for_each_replica(
          _per_device_predict_function, args=(model._grouped_model_predict,))
 
     (all_inputs, all_outputs, all_updates,
@@ -795,7 +445,7 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
 
   with current_strategy.scope():
     # TODO(priyag, sourabhbajaj): Support steps_per_run if/when we add outfeed.
-    ctx = current_strategy.run_steps_on_dataset(
+    ctx = current_strategy.extended.experimental_run_steps_on_iterator(
         step_fn, iterator, iterations=1,
         initial_loop_values=initial_loop_values)
 
@@ -806,12 +456,9 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
     progbar = Progbar(target=steps)
 
   # Copy the weights from the original model to each of the replicated models.
-  orig_model_weights = model.get_weights()
   with current_strategy.scope():
-    distributed_model = current_strategy.unwrap(model._grouped_model_predict)[0]
-    distributed_training_utils.set_weights(
-        current_strategy, distributed_model, orig_model_weights)
-
+    _copy_weights_to_distributed_model(model, model._grouped_model_predict)
+    _reset_metrics(model, model._grouped_model_predict)
   assert steps is not None
   # Since we do not know how many samples we will see, we cannot pre-allocate
   # the returned Numpy arrays. Instead, we store one array per batch seen
@@ -835,7 +482,17 @@ def _experimental_predict_loop(model, iterator, verbose=0, steps=None):
   ]
 
 
-def _clone_and_build_model(model, inputs=None, targets=None):
+def _custom_compile_for_predict(model):
+  """Custom compile for TPU predict mode."""
+  model.total_loss = None
+  model._fit_function = None
+  model._eval_function = None
+  model.train_function = None
+  model.test_function = None
+  model.predict_function = None
+
+
+def _clone_and_build_model(model, inputs=None, targets=None, mode=None):
   """Clone and build the given keras_model."""
   # We need to set the import here since we run into a circular dependency
   # error.
@@ -862,23 +519,27 @@ def _clone_and_build_model(model, inputs=None, targets=None):
 
   if isinstance(targets, tuple):
     targets = nest.flatten(targets)
-  cloned_model.compile(
-      optimizer,
-      model.loss,
-      metrics=metrics_module.clone_metrics(model.metrics),
-      loss_weights=model.loss_weights,
-      sample_weight_mode=model.sample_weight_mode,
-      weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
-      target_tensors=targets)
+  if mode == _Mode.PREDICT:
+    _custom_compile_for_predict(cloned_model)
+  else:
+    cloned_model.compile(
+        optimizer,
+        model.loss,
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
+        loss_weights=model.loss_weights,
+        sample_weight_mode=model.sample_weight_mode,
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
+        target_tensors=targets)
   return cloned_model
 
 
 def clone_model_on_replicas(model, strategy, make_callback_model=False,
                             inputs=None, targets=None, mode=None):
   """Create a cloned model on each replica."""
-  with strategy.scope():
-    grouped_model = strategy.call_for_each_replica(
-        _clone_and_build_model, args=(model, inputs, targets))
+  with K.get_graph().as_default(), strategy.scope():
+    grouped_model = strategy.extended.call_for_each_replica(
+        _clone_and_build_model, args=(model, inputs, targets, mode))
     if mode is _Mode.TRAIN:
       model._grouped_model_train = grouped_model
     elif mode is _Mode.TEST:
@@ -915,3 +576,158 @@ def _get_input_from_iterator(iterator, model):
   model._standardize_weights(x_values, y_values,
                              sample_weight=sample_weights_values)
   return x, y, sample_weights
+
+
+def _make_execution_function(model, mode):
+  """Makes function to run one step of distributed model execution."""
+  if context.executing_eagerly():
+    return _make_eager_execution_function(model, mode)
+
+  strategy = model._distribution_strategy
+  if not model._grouped_model:
+    clone_model_on_replicas(
+        model, strategy, make_callback_model=(mode == 'train'))
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs, f.updates_op, f.session_kwargs)
+
+  with strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs, grouped_updates,
+     grouped_session_args) = strategy.extended.call_for_each_replica(
+         _per_device_function, args=(model._grouped_model,))
+
+    if mode == 'train':
+      # Initialize the variables in the replicated model. This is necessary for
+      # multi-worker training because on some workers, initialization is not
+      # needed. This method does initialization or waiting for initialization
+      # according to the context object of distribute coordinator.
+      distributed_training_utils.init_restore_or_wait_for_variables()
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of update ops on
+    # all the devices over which the model is distributed.
+    (all_inputs, all_outputs, all_updates,
+     all_session_args) = distributed_training_utils.unwrap_values(
+         strategy,
+         grouped_inputs,
+         grouped_outputs,
+         grouped_updates,
+         grouped_session_args,
+         with_loss_tensor=(mode != 'predict'))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        updates=all_updates,
+        name='distributed_{}_function'.format(mode),
+        **all_session_args)
+
+
+def _make_eager_execution_function(model, mode):
+  """Makes function to run one step of distributed model eager execution."""
+  strategy = model._distribution_strategy
+  if not model._grouped_model:
+    clone_model_on_replicas(
+        model, strategy, make_callback_model=(mode == 'train'))
+
+  def _per_device_function(model):
+    f = model._make_execution_function(mode)
+    return (f.inputs, f.outputs)
+
+  # NOTE(priyag): Try creating a new FuncGraph within DS scope instead of using
+  # the global one.
+  with K.get_graph().as_default(), strategy.scope():
+    # Create train ops on each of the devices when we call
+    # `_per_device_fit_function`.
+    (grouped_inputs, grouped_outputs) = strategy.call_for_each_replica(
+        _per_device_function, args=(model._grouped_model,))
+
+    # Unwrap all the per device values returned from `call_for_each_replica`.
+    # Unwrapping per device values gives you a list of values that can be
+    # used to construct a new train function that is composed of inptus/outputs
+    # on all the devices over which the model is distributed.
+    (all_inputs, all_outputs, _, _) = distributed_training_utils.unwrap_values(
+        strategy,
+        grouped_inputs,
+        grouped_outputs,
+        with_loss_tensor=(mode != 'predict'))
+
+    return K.function(
+        all_inputs,
+        all_outputs,
+        name='eager_distributed_{}_function'.format(mode))
+
+
+def _prepare_feed_values(model, inputs, targets, sample_weights, mode):
+  """Prepare feed values to the model execution function.
+
+  Arguments:
+    model: Model to prepare feed values for.
+    inputs: List or dict of model inputs.
+    targets: Optional list of model targets.
+    sample_weights: Optional list of sample weight arrays.
+    mode: One of 'train'/'test'/'predict'.
+
+  Returns:
+    Feed values for the model in the given mode.
+  """
+  strategy = model._distribution_strategy
+  inputs, targets, sample_weights = _get_input_from_iterator(inputs, model)
+  inputs = distributed_training_utils.flatten_perdevice_values(strategy, inputs)
+  targets = distributed_training_utils.flatten_perdevice_values(
+      strategy, targets)
+  if mode == 'predict':
+    sample_weights = []
+    targets = []
+  else:
+    sample_weights = [
+        None for _ in range(len(model.outputs) * strategy.num_replicas_in_sync)
+    ]
+  ins = inputs + targets + sample_weights
+  if mode == 'train' and not isinstance(K.symbolic_learning_phase(), int):
+    ins += [True]
+  return ins
+
+
+def _copy_weights_to_distributed_model(original_model, grouped_model):
+  """Copies weights from original model to distributed models."""
+  strategy = original_model._distribution_strategy
+  if strategy:
+    # Copy the weights from the original model to each of the replicated
+    # models.
+    orig_model_weights = original_model.get_weights()
+    distributed_model = strategy.unwrap(grouped_model)[0]
+    distributed_training_utils.set_weights(strategy, distributed_model,
+                                           orig_model_weights)
+
+
+def _copy_weights_to_original_model(model, grouped_model, mode):
+  """Copies weights from first distributed model back to original model."""
+  if model._distribution_strategy and mode == 'train':
+    updated_weights = model._distribution_strategy.unwrap(
+        grouped_model)[0].get_weights()
+    model.set_weights(updated_weights)
+
+
+def _per_device_aggregate_batch(batch_outs, model, mode):
+  """Aggregates the per-device batch-level outputs from a distributed step."""
+  if model._distribution_strategy is not None and mode == 'predict':
+    total_batch_outs = []
+    for i in range(len(model.outputs)):
+      num_replicas = model._distribution_strategy.num_replicas_in_sync
+      nested_outs = batch_outs[i * num_replicas:i * num_replicas + num_replicas]
+      total_batch_outs.append(np.concatenate(nest.flatten(nested_outs)))
+    return total_batch_outs
+  return batch_outs
+
+
+def _reset_metrics(model, distributed_model=None):
+  if model._distribution_strategy:
+    distributed_model = (
+        distributed_model or
+        model._distribution_strategy.unwrap(model._grouped_model)[0])
+    distributed_model.reset_metrics()
diff --git a/tensorflow/python/keras/engine/training_eager.py b/tensorflow/python/keras/engine/training_eager.py
index 9131df5cd0a35502a5786ea40becd7ae1387b68a..895db5bc633669641b0493b8bfb918094f312513 100644
--- a/tensorflow/python/keras/engine/training_eager.py
+++ b/tensorflow/python/keras/engine/training_eager.py
@@ -19,30 +19,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
-import threading
+import collections
 
-import numpy as np
-
-from tensorflow.python.data.ops import iterator_ops
-from tensorflow.python.eager import function as eager_function
 from tensorflow.python.eager.backprop import GradientTape
-from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend
-from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import losses as losses_module
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
 
 
-# A lock for assigning polymorphic functions to models in a thread-safe way
-_graph_function_building_lock = threading.Lock()
-
-
 def _eager_loss_fn(outputs, targets, loss_fn, output_name):
   with backend.name_scope(output_name + '_loss'):
     loss = loss_fn(targets, outputs)
@@ -133,11 +123,24 @@ def _model_loss(model,
       else:
         weights = None
       mask = masks[i]
-
-      weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
       with backend.name_scope(model.output_names[i] + '_loss'):
-        output_loss = weighted_masked_fn(
-            targets[i], outs[i], weights, mask=mask)
+        if isinstance(loss_fn, losses_module.Loss):
+          if mask is not None:
+            mask = math_ops.cast(mask, outs[i].dtype)
+            # Update weights with mask.
+            if weights is None:
+              weights = mask
+            else:
+              # Update dimensions of weights to match with mask if possible.
+              mask, _, weights = squeeze_or_expand_dimensions(
+                  mask, None, weights)
+              weights *= mask
+          output_loss = loss_fn(targets[i], outs[i], sample_weight=weights)
+        else:
+          weighted_masked_fn = training_utils.weighted_masked_objective(loss_fn)
+          output_loss = weighted_masked_fn(
+              targets[i], outs[i], weights, mask=mask)
+
       # If the number of outputs is 1 then we don't append the loss metric
       # associated with each model output. When there are multiple outputs
       # associated with a model, each output's loss is calculated and returned
@@ -171,412 +174,6 @@ def _model_loss(model,
   return outs, total_loss, loss_metrics, aggregated_loss_metrics, masks
 
 
-def _maybe_build_graph_functions(model):
-  """Constructs polymorphic functions to use for fit, evaluate and predict."""
-  # We lock this function to ensure thread-safety in case users are
-  # hypothetically trying to call '.predict' on a model in multiple threads
-  # at once when the graph functions were never previously built.
-  with _graph_function_building_lock:
-    if not model._built_graph_functions:
-      model._eager_process_single_batch_graph_function = eager_function.defun(
-          _process_single_batch
-      )
-      model._eager_model_loss_graph_function = eager_function.defun(_model_loss)
-      model._eager_call_graph_function = eager_function.defun(model.call)
-      model._built_graph_functions = True
-
-
-def _maybe_graph_function_model_loss(model,
-                                     inputs,
-                                     targets,
-                                     output_loss_metrics=None,
-                                     sample_weights=None,
-                                     training=False):
-  """Compute model loss, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_model_loss_graph_function(
-        model,
-        inputs,
-        targets,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=training)
-  else:
-    return _model_loss(
-        model,
-        inputs,
-        targets,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=training)
-
-
-def _maybe_graph_function_model_call(model, *args, **kwargs):
-  """Compute model loss, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_call_graph_function(*args, **kwargs)
-  else:
-    return model.call(*args, **kwargs)
-
-
-def iterator_fit_loop(model,
-                      inputs,
-                      class_weight,
-                      steps_per_epoch,
-                      epoch_logs,
-                      val_inputs=None,
-                      val_targets=None,
-                      val_sample_weights=None,
-                      epochs=1,
-                      verbose=1,
-                      callbacks=None,
-                      validation_steps=None,
-                      do_validation=False,
-                      batch_size=None,
-                      output_loss_metrics=None):
-  """Fit function for eager execution when input is given as dataset iterator.
-
-  Updates the given epoch logs.
-
-  Arguments:
-      model: Instance of the `Model`.
-      inputs: Input dataset iterator.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          the targets from the `inputs` iterator.
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch.
-      epoch_logs: Dictionary of logs from every epoch.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: CallbackList instance. Controls callbacks during training.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-      do_validation: Boolean value indicating whether we should do validation.
-      batch_size: int, val_inputs and val_targets will be evaled batch by
-        batch with size batch_size if they are array.
-      output_loss_metrics: List of metrics that are used to aggregated output
-        loss values.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
-      len(inputs.output_shapes) not in (2, 3)):
-    raise ValueError('Please provide either inputs and targets '
-                     'or inputs, targets, and sample_weights')
-
-  for step_index in range(steps_per_epoch):
-    batch_logs = {'batch': step_index, 'size': 1}
-    callbacks.on_batch_begin(step_index, batch_logs)
-
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting training. Make '
-          'sure that your dataset can generate at least '
-          '`steps_per_epoch * epochs` batches (in this case, %d batches). You '
-          'may need to use the repeat() function when building your '
-          'dataset.' % steps_per_epoch * epochs)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights, class_weight=class_weight)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    # Set stateful_metrics in callbacks. We do not do this before the
-    # `steps_per_epoch` loop because model will be compiled only in the first
-    # iteration of this loop in the deferred build scenario.
-    if step_index == 0:
-      for cbk in callbacks:
-        if (isinstance(cbk, cbks.BaseLogger) or
-            isinstance(cbk, cbks.ProgbarLogger)):
-          cbk.stateful_metrics = model.metrics_names[1:]  # Exclude `loss`
-
-    if step_index == 0 and not callbacks.params['metrics']:
-      callback_metrics = copy.copy(model.metrics_names)
-      if do_validation:
-        callback_metrics += ['val_' + n for n in model.metrics_names]
-      callbacks.set_params({
-          'batch_size': batch_size,
-          'epochs': epochs,
-          'steps': steps_per_epoch,
-          'verbose': verbose,
-          'do_validation': do_validation,
-          'metrics': callback_metrics or [],
-          'validation_steps': validation_steps
-      })
-
-    # Train model.
-    outs, loss, _, aggregated_loss_metrics, masks = \
-      _maybe_graph_function_process_single_batch(
-          model,
-          x,
-          y,
-          output_loss_metrics=output_loss_metrics,
-          sample_weights=sample_weights,
-          training=True)
-    outs = generic_utils.to_list(outs)
-
-    # Calculate metrics.
-    for l, o in zip(model.metrics_names, outs):
-      batch_logs[l] = o
-    metrics_results = _eager_metrics_fn(
-        model, outs, y, sample_weights=sample_weights, masks=masks)
-    batch_logs['loss'] = tensor_util.constant_value(backend.mean(loss))
-
-    for k, v in zip(
-        model.metrics_names,
-        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
-      batch_logs[k] = tensor_util.constant_value(v)
-    callbacks.on_batch_end(step_index, batch_logs)
-    if callbacks.model.stop_training:
-      break
-
-    if step_index == steps_per_epoch - 1:
-      if do_validation:
-        val_outs = test_loop(
-            model,
-            val_inputs,
-            val_targets,
-            sample_weights=val_sample_weights,
-            steps=validation_steps,
-            verbose=0,
-            batch_size=batch_size)
-        if not isinstance(val_outs, list):
-          val_outs = [val_outs]
-        # Same labels assumed.
-        for l, o in zip(model.metrics_names, val_outs):
-          epoch_logs['val_' + l] = o
-
-
-def iterator_test_loop(model, inputs, steps, verbose=0):
-  """Test function for eager execution when input is given as dataset iterator.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-      predictions finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  # make sure either x,y or x,y,sample_weights is provided
-  if (not isinstance(inputs.output_shapes, (list, tuple)) or
-      len(inputs.output_shapes) < 2 or len(inputs.output_shapes) > 3):
-    raise ValueError('Please provide either inputs and targets'
-                     'or inputs, targets, and sample_weights')
-  outs = []
-
-  # Create metric wrapper for the losses.
-  output_loss_metrics = []
-  for i in range(len(model.outputs)):
-    loss_fn = model.loss_functions[i]
-    mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-        loss_fn, name=loss_fn.__name__)
-    output_loss_metrics.append(mean_wrapped_loss)
-
-  num_samples = 0
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data interrupting testing. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    if len(inputs.output_shapes) == 2:
-      x, y = next_element
-      sample_weights = None
-    else:
-      x, y, sample_weights = next_element
-
-    # Validate and standardize data.
-    x, y, sample_weights = model._standardize_user_data(
-        x, y, sample_weight=sample_weights)
-    x = training_utils.cast_if_floating_dtype(x)
-    y = training_utils.cast_if_floating_dtype(y)
-    if sample_weights:
-      sample_weights = [
-          training_utils.cast_if_floating_dtype(
-              ops.convert_to_tensor(val, dtype=backend.floatx()))
-          if val is not None else None for val in sample_weights
-      ]
-
-    if step_index == 0:
-      # Get stateful metrics indices. We do not do this before the `steps` loop
-      # because model will be compiled only in the first iteration of this loop
-      # in the deferred build scenario.
-      if hasattr(model, 'metrics'):
-        for m in model.stateful_metric_functions:
-          m.reset_states()
-      for m in output_loss_metrics:
-        m.reset_states()
-
-    # Calculate model output, loss values.
-    loss_outs, loss, _, aggregated_loss_metrics, masks = \
-      _maybe_graph_function_model_loss(
-          model,
-          x,
-          y,
-          output_loss_metrics=output_loss_metrics,
-          sample_weights=sample_weights,
-          training=False)
-    metrics_results = _eager_metrics_fn(
-        model, loss_outs, y, sample_weights=sample_weights, masks=masks)
-    batch_outs = []
-    for _, v in zip(
-        model.metrics_names,
-        [backend.mean(loss)] + aggregated_loss_metrics + metrics_results):
-      batch_outs.append(tensor_util.constant_value(v))
-
-    # Get current step size.
-    if isinstance(x, list):
-      step_size = x[0].get_shape().as_list()[0]
-    elif isinstance(x, dict):
-      step_size = list(x.values())[0].get_shape().as_list()[0]
-    else:
-      step_size = x.get_shape().as_list()[0]
-
-    # Accumulate results in output array.
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-    if step_index == 0:
-      for _ in enumerate(batch_outs):
-        outs.append(0.)
-    outs[0] += batch_outs[0] * step_size  # index 0 = 'loss'
-    outs[1:] = batch_outs[1:]
-
-    # Calculate sample size.
-    num_samples += step_size
-    if verbose == 1:
-      progbar.update(step_index + 1)
-
-  outs[0] /= num_samples  # index 0 = 'loss'
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
-def iterator_predict_loop(model, inputs, steps, verbose=0):
-  """Predict function for eager execution when input is dataset iterator.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: Input dataset iterator.
-      steps: Total number of steps (batches of samples) before declaring
-          `_predict_loop` finished.
-      verbose: Verbosity mode.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions (if the model has multiple outputs).
-
-  Raises:
-      ValueError: In case of mismatch between given number of inputs and
-        expectations of the model.
-  """
-  assert isinstance(inputs, iterator_ops.EagerIterator)
-  if not isinstance(inputs.output_shapes,
-                    (list, tuple)) or len(inputs.output_shapes) > 3:
-    raise ValueError(
-        'Please provide data as a list or tuple of 1, 2, or 3 elements '
-        ' - `(input)`, or `(input, target)`, or `(input, target,'
-        'sample_weights)`. Received %s. We do not use the `target` or'
-        '`sample_weights` value here.' % inputs.output_shapes)
-  outs = []
-  if verbose == 1:
-    progbar = generic_utils.Progbar(target=steps)
-
-  for step_index in range(steps):
-    # Get data from the iterator.
-    try:
-      next_element = inputs.get_next()
-    except errors.OutOfRangeError:
-      logging.warning(
-          'Your dataset iterator ran out of data; interrupting prediction. '
-          'Make sure that your dataset can generate at least `steps` batches '
-          '(in this case, %d batches). You may need to use the repeat() '
-          'function when building your dataset.', steps)
-      break
-
-    # expects a tuple, where first element of tuple represents inputs
-    x = next_element[0]
-
-    # Validate and standardize data.
-    x, _, _ = model._standardize_user_data(x)
-    x = training_utils.cast_if_floating_dtype(x)
-
-    if isinstance(x, list) and len(x) == 1:
-      x = x[0]
-
-    if model._expects_training_arg:
-      batch_outs = _maybe_graph_function_model_call(model, x, training=False)
-    else:
-      batch_outs = _maybe_graph_function_model_call(model, x)
-    if not isinstance(batch_outs, list):
-      batch_outs = [batch_outs]
-
-    # We collect the results from every step and then concatenate them once
-    # in the end. This is an expensive process. We are doing this because we
-    # do not know the number of samples beforehand.
-    if step_index == 0:
-      for _ in batch_outs:
-        outs.append([])
-    for i, batch_out in enumerate(batch_outs):
-      outs[i].append(backend.get_value(batch_out))
-
-    if verbose == 1:
-      progbar.update(step_index + 1)
-  for i, out in enumerate(outs):
-    outs[i] = np.concatenate(tuple(out), axis=0)
-  if len(outs) == 1:
-    return outs[0]
-  return outs
-
-
 def _process_single_batch(model,
                           inputs,
                           targets,
@@ -630,32 +227,6 @@ def _process_single_batch(model,
     return outs, loss, loss_metrics, aggregated_loss_metrics, masks
 
 
-def _maybe_graph_function_process_single_batch(model,
-                                               inputs,
-                                               targets,
-                                               output_loss_metrics=None,
-                                               sample_weights=None,
-                                               training=False):
-  """Process a single batch, using defun if the model supports it."""
-  if model._can_use_graph_functions:
-    _maybe_build_graph_functions(model)
-    return model._eager_process_single_batch_graph_function(
-        model,
-        inputs,
-        targets,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=training)
-  else:
-    return _process_single_batch(
-        model,
-        inputs,
-        targets,
-        output_loss_metrics=output_loss_metrics,
-        sample_weights=sample_weights,
-        training=training)
-
-
 def train_on_batch(model, inputs, targets, sample_weights=None):
   """Calculates the loss and gradient updates for one input batch.
 
@@ -668,25 +239,25 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss and the loss associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
 
-  outs, loss, loss_metrics, _, masks = \
-    _maybe_graph_function_process_single_batch(
-        model, inputs, targets, sample_weights=sample_weights, training=True)
+  outs, loss, loss_metrics, _, masks = _process_single_batch(
+      model, inputs, targets, sample_weights=sample_weights, training=True)
   if not isinstance(outs, list):
     outs = [outs]
   metrics_results = _eager_metrics_fn(
@@ -695,7 +266,7 @@ def train_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=False)
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
@@ -716,22 +287,23 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
   Returns:
       total loss, loss and metrics associated with each output.
   """
-  if len(inputs) and tensor_util.is_tensor(inputs[0]):
-    inputs = training_utils.cast_if_floating_dtype(inputs)
-    targets = training_utils.cast_if_floating_dtype(targets)
-  else:
-    inputs = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in inputs
-    ]
-    targets = [
-        ops.convert_to_tensor(val, dtype=backend.floatx()) for val in targets
-    ]
+  if isinstance(inputs, collections.Sequence):
+    if len(inputs) and tensor_util.is_tensor(inputs[0]):
+      inputs = training_utils.cast_if_floating_dtype(inputs)
+      targets = training_utils.cast_if_floating_dtype(targets)
+    else:
+      inputs = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in inputs
+      ])
+      targets = training_utils.cast_if_floating_dtype([
+          ops.convert_to_tensor(val) for val in targets
+      ])
   if sample_weights:
     sample_weights = [
-        ops.convert_to_tensor(val, dtype=backend.floatx())
+        training_utils.cast_if_floating_dtype(ops.convert_to_tensor(val))
         if val is not None else None for val in sample_weights
     ]
-  outs, loss, loss_metrics, _, masks = _maybe_graph_function_model_loss(
+  outs, loss, loss_metrics, _, masks = _model_loss(
       model, inputs, targets, sample_weights=sample_weights, training=False)
   if not isinstance(outs, list):
     outs = [outs]
@@ -741,184 +313,10 @@ def test_on_batch(model, inputs, targets, sample_weights=None):
       targets,
       sample_weights=sample_weights,
       masks=masks,
-      return_stateful_result=False)
+      return_stateful_result=True)
   loss = generic_utils.to_list(loss)
 
   return [
       tensor_util.constant_value(v)
       for v in loss + loss_metrics + metrics_results
   ]
-
-
-def fit_loop(model,
-             inputs,
-             targets,
-             sample_weights=None,
-             class_weight=None,
-             val_inputs=None,
-             val_targets=None,
-             val_sample_weights=None,
-             batch_size=None,
-             epochs=1,
-             verbose=1,
-             callbacks=None,
-             shuffle=True,
-             initial_epoch=0,
-             steps_per_epoch=None,
-             validation_steps=None):
-  """Fit function for eager execution.
-
-  Arguments:
-      model: Instance of the model that is being executed in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      class_weight: Optional class-weight array to weight the importance of
-          samples in `inputs` based on the class they belong to, as conveyed by
-          `targets`.
-      val_inputs: Input data for validation.
-      val_targets: Target data for validation.
-      val_sample_weights: Sample weight data for validation.
-      batch_size: Integer batch size or None if unknown.
-      epochs: Number of times to iterate over the data
-      verbose: Verbosity mode, 0, 1 or 2
-      callbacks: List of callbacks to be called during training
-      shuffle: Whether to shuffle the data at the beginning of each epoch
-      initial_epoch: Epoch at which to start training
-          (useful for resuming a previous training run)
-      steps_per_epoch: Total number of steps (batches of samples)
-          before declaring one epoch finished and starting the
-          next epoch. Ignored with the default value of `None`.
-      validation_steps: Number of steps to run validation for (only if doing
-        validation from data tensors). Ignored with default value of `None`.
-
-  Returns:
-      `History` object.
-
-  Raises:
-    ValueError: In case of invalid argument values.
-  """
-  # Convert training inputs to an EagerIterator
-  inputs, steps_per_epoch = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps_per_epoch,
-      epochs=epochs,
-      shuffle=shuffle)
-  # Required for eager execution
-  with backend.learning_phase_scope(1):
-    do_validation = val_inputs is not None
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        batch_size=batch_size,
-        epochs=epochs,
-        steps_per_epoch=steps_per_epoch,
-        val_inputs=val_inputs,
-        val_targets=val_targets,
-        val_sample_weights=val_sample_weights,
-        validation_steps=validation_steps,
-        verbose=verbose)
-
-    # Create metric wrapper for the losses.
-    output_loss_metrics = []
-    for i in range(len(model.outputs)):
-      loss_fn = model.loss_functions[i]
-      mean_wrapped_loss = metrics_module.MeanMetricWrapper(
-          loss_fn, name=loss_fn.__name__)
-      output_loss_metrics.append(mean_wrapped_loss)
-
-    callbacks.on_train_begin()
-    for epoch in range(initial_epoch, epochs):
-      if model._is_compiled:  # Model may not be compiled the first time.
-        # Reset stateful metrics
-        for m in model.stateful_metric_functions:
-          m.reset_states()
-
-      for m in output_loss_metrics:
-        m.reset_states()
-
-      callbacks.on_epoch_begin(epoch)
-      epoch_logs = {}
-      iterator_fit_loop(
-          model,
-          inputs,
-          class_weight,
-          steps_per_epoch=steps_per_epoch,
-          epoch_logs=epoch_logs,
-          val_inputs=val_inputs,
-          val_targets=val_targets,
-          val_sample_weights=val_sample_weights,
-          epochs=epochs,
-          verbose=verbose,
-          callbacks=callbacks,
-          validation_steps=validation_steps,
-          do_validation=do_validation,
-          batch_size=batch_size,
-          output_loss_metrics=output_loss_metrics)
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      if callbacks.model.stop_training:
-        break
-  callbacks.on_train_end()
-  return model.history
-
-
-def test_loop(model, inputs, targets,
-              sample_weights=None,
-              batch_size=None,
-              verbose=0,
-              steps=None):
-  """Test function for eager execution.
-
-  Arguments:
-      model: Model instance that is being evaluated in Eager mode.
-      inputs: List of input arrays.
-      targets: List of target arrays.
-      sample_weights: Optional list of sample weight arrays.
-      batch_size: integer batch size or `None`.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring predictions finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Scalar loss (if the model has a single output and no metrics)
-      or list of scalars (if the model has multiple outputs
-      and/or metrics). The attribute `model.metrics_names` will give you
-      the display labels for the scalar outputs.
-  """
-  inputs, steps = training_utils.convert_to_iterator(
-      x=inputs,
-      y=targets,
-      sample_weights=sample_weights,
-      batch_size=batch_size,
-      steps_per_epoch=steps,
-      is_validation=True)
-  with backend.learning_phase_scope(0):
-    return iterator_test_loop(model, inputs, steps, verbose=verbose)
-
-
-def predict_loop(model, inputs, batch_size=32, verbose=0, steps=None):
-  """Predict function for eager execution.
-
-  Arguments:
-      model: Instance of `Model`.
-      inputs: List of input arrays.
-      batch_size: integer batch size.
-      verbose: verbosity mode.
-      steps: Total number of steps (batches of samples)
-          before declaring `_predict_loop` finished.
-          Ignored with the default value of `None`.
-
-  Returns:
-      Array of predictions (if the model has a single output)
-      or list of arrays of predictions
-      (if the model has multiple outputs).
-  """
-  with backend.learning_phase_scope(0):
-    inputs, steps = training_utils.convert_to_iterator(
-        x=inputs, batch_size=batch_size, steps_per_epoch=steps)
-    return iterator_predict_loop(model, inputs, steps, verbose=verbose)
diff --git a/tensorflow/python/keras/engine/training_eager_test.py b/tensorflow/python/keras/engine/training_eager_test.py
index 76aaf1643b07e54c9b2b60e937d1f4146a9a1a6a..3fabbb17edc05138c57bf61c16a94c6647813963 100644
--- a/tensorflow/python/keras/engine/training_eager_test.py
+++ b/tensorflow/python/keras/engine/training_eager_test.py
@@ -20,10 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
@@ -51,6 +51,7 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=metrics,
         loss_weights=loss_weights,
+        run_eagerly=True,
         sample_weight_mode=None)
 
     input_a = keras.backend.zeros(shape=(10, 3))
@@ -111,7 +112,7 @@ class TrainingTest(test.TestCase):
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
     metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
+    model.compile(optimizer, loss, metrics=metrics, run_eagerly=True)
 
     inputs = keras.backend.zeros(shape=(10, 3))
     targets = keras.backend.zeros(shape=(10, 4))
@@ -129,29 +130,34 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input')
     y = keras.layers.Dense(4, name='dense')(x)
     model = keras.Model(x, y)
-    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
+    model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  loss='mse',
+                  run_eagerly=True)
 
     x = keras.backend.zeros(shape=(10, 3))
     y = keras.backend.zeros(shape=(10, 4))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y)).repeat(10).batch(5)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     validation_dataset = dataset_ops.Dataset.from_tensor_slices(
         (x, y)).repeat(10).batch(5)
-    validation_iterator = validation_dataset.make_one_shot_iterator()
+    validation_iterator = dataset_ops.make_one_shot_iterator(validation_dataset)
 
     with self.assertRaisesRegexp(
         ValueError, r'specify .* `steps_per_epoch`'):
       model.fit(iterator, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
-      model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
-                validation_data=(x, y))
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+    if not context.executing_eagerly():
+      # In eager execution, `keras.backend.zeros` returns value tensors
+      # which can be used for validation without a `validation_steps` argument.
+      with self.assertRaisesRegexp(
+          ValueError, r'provide either `batch_size` or `validation_steps`'):
+        model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
+                  validation_data=(x, y))
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_dataset)
-    with self.assertRaisesRegexp(
-        ValueError, r'provide either `batch_size` or `validation_steps`'):
+    with self.assertRaisesRegexp(ValueError,
+                                 'specify the `validation_steps` argument.'):
       model.fit(iterator, steps_per_epoch=2, epochs=1, verbose=0,
                 validation_data=validation_iterator)
 
@@ -160,25 +166,31 @@ class TrainingTest(test.TestCase):
     model.add(keras.layers.Dense(4, input_shape=(3,)))
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     model.compile(
-        optimizer, 'mse', metrics=['mae',
-                                   metrics_module.CategoricalAccuracy()])
+        optimizer,
+        loss='mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=True)
 
     x = np.random.random((10, 3))
     y = np.random.random((10, 4))
 
-    def iterator():
+    def numpy_iterator():
       while True:
         yield x, y
 
-    model.fit_generator(iterator(), steps_per_epoch=3, epochs=1)
-    model.evaluate_generator(iterator(), steps=3)
-    out = model.predict_generator(iterator(), steps=3)
+    model.fit_generator(numpy_iterator(), steps_per_epoch=3, epochs=1)
+    model.evaluate_generator(numpy_iterator(), steps=3)
+
+    def inference_numpy_iterator():
+      while True:
+        yield x
+
+    out = model.predict_generator(inference_numpy_iterator(), steps=3)
     self.assertEqual(out.shape, (30, 4))
 
 
 class CorrectnessTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -191,14 +203,14 @@ class CorrectnessTest(test.TestCase):
                                  activation='softmax',
                                  kernel_initializer='ones'))
     model.compile(loss='sparse_categorical_crossentropy',
-                  optimizer=RMSPropOptimizer(learning_rate=0.001))
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  run_eagerly=False)
     x = np.ones((100, 4))
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     history = model.fit(x, y, epochs=1, batch_size=10)
     self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_loss_correctness_with_iterator(self):
     # Test that training loss is the same in eager and graph
     # (by comparing it to a reference value in a deterministic case)
@@ -210,14 +222,15 @@ class CorrectnessTest(test.TestCase):
         keras.layers.Dense(2, activation='softmax', kernel_initializer='ones'))
     model.compile(
         loss='sparse_categorical_crossentropy',
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=True)
     x = np.ones((100, 4), dtype=np.float32)
     np.random.seed(123)
     y = np.random.randint(0, 1, size=(100, 1))
     dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
     dataset = dataset.repeat(100)
     dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
     history = model.fit(iterator, epochs=1, steps_per_epoch=10)
     self.assertAlmostEqual(history.history['loss'][-1], 0.6173, 4)
 
diff --git a/tensorflow/python/keras/engine/training_generator.py b/tensorflow/python/keras/engine/training_generator.py
index b5e3a039767d74b92a04cb58f7164a9ffd91789e..0abf0b8270915a37f1d59803cacd11bdf9abe132 100644
--- a/tensorflow/python/keras/engine/training_generator.py
+++ b/tensorflow/python/keras/engine/training_generator.py
@@ -19,412 +19,433 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
+import math
+
 import numpy as np
 
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import errors
+from tensorflow.python.keras import backend
 from tensorflow.python.keras import callbacks as cbks
-from tensorflow.python.keras.utils.data_utils import GeneratorEnqueuer
-from tensorflow.python.keras.utils.data_utils import iter_sequence_infinite
-from tensorflow.python.keras.utils.data_utils import OrderedEnqueuer
-from tensorflow.python.keras.utils.data_utils import Sequence
-from tensorflow.python.keras.utils.generic_utils import Progbar
+from tensorflow.python.keras.engine import training_utils
+from tensorflow.python.keras.utils import data_utils
+from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import nest
+
+
+def model_iteration(model,
+                    data,
+                    steps_per_epoch=None,
+                    epochs=1,
+                    verbose=1,
+                    callbacks=None,
+                    validation_data=None,
+                    validation_steps=None,
+                    class_weight=None,
+                    max_queue_size=10,
+                    workers=1,
+                    use_multiprocessing=False,
+                    shuffle=True,
+                    initial_epoch=0,
+                    mode='train',
+                    batch_size=None,
+                    **kwargs):
+  """Loop function for arrays of data with modes 'train'/'test'/'predict'.
+
+  Arguments:
+      model: Keras Model instance.
+      data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x, y)` or
+        `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      steps_per_epoch: Total number of steps (batches of samples) before
+        declaring one epoch finished and starting the next epoch. Ignored with
+        the default value of `None`.
+      epochs: Number of times to iterate over the data.
+      verbose: Verbosity mode, 0, 1 or 2.
+      callbacks: List of callbacks to be called during training.
+      validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or
+        `(x, y)` or `(x, y, sample_weights)`) or a generator or
+        `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+      validation_steps: Total number of steps (batches of samples) before
+        declaring validation finished.
+      class_weight: Dictionary mapping class indices to a weight for the class.
+      max_queue_size: Integer. Maximum size for the generator queue. If
+        unspecified, `max_queue_size` will default to 10.
+      workers: Integer. Maximum number of processes to spin up when using
+        process-based threading. If unspecified, `workers` will default to 1. If
+        0, will execute the generator on the main thread.
+      use_multiprocessing: Boolean. If `True`, use process-based threading. If
+        unspecified, `use_multiprocessing` will default to `False`. Note that
+        because this implementation relies on multiprocessing, you should not
+        pass non-picklable arguments to the generator as they can't be passed
+        easily to children processes.
+      shuffle: Boolean. Whether to shuffle the order of the batches at the
+        beginning of each epoch. Only used with instances of `Sequence`
+        (`keras.utils.Sequence`). Has no effect when `steps_per_epoch` is not
+        `None`.
+      initial_epoch: Epoch at which to start training (useful for resuming a
+        previous training run).
+      mode: One of 'train'/'test'/'predict'.
+      batch_size: Integer batch size or None if unknown. Will only be used if
+        `data` is in NumPy/Tensor format.
+      **kwargs: Additional arguments for backwards compatibility. `steps` is
+        accepted as an alias for `steps_per_epoch`.
+
+  Returns:
+      - In 'train' mode: `History` object.
+      - In 'test' mode: Evaluation metrics.
+      - In 'predict' mode: Outputs of the Model called on inputs.
+
+  Raises:
+      ValueError: in case of invalid arguments.
+  """
+  if 'steps' in kwargs:
+    steps_per_epoch = kwargs['steps']
+
+  # Convert to a format that supports `next(generator)`.
+  generator, steps_per_epoch = convert_to_generator_like(
+      data,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      epochs=epochs - initial_epoch,
+      shuffle=shuffle)
+
+  do_validation = validation_data is not None
+  should_set_learning_phase = context.executing_eagerly() and model.run_eagerly
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  _validate_arguments(is_sequence, use_multiprocessing, workers,
+                      steps_per_epoch, validation_data, validation_steps, mode,
+                      kwargs)
+
+  batch_function = _make_execution_function(
+      model, mode, class_weight=class_weight)
+
+  # Create the queue for the generator.
+  output_generator, enqueuer = _make_enqueued_generator(
+      generator,
+      workers=workers,
+      use_multiprocessing=use_multiprocessing,
+      max_queue_size=max_queue_size,
+      shuffle=shuffle)
+
+  num_samples_or_steps, use_steps = _get_num_samples_or_steps(
+      data, steps_per_epoch)
+
+  count_mode = 'steps' if use_steps else 'samples'
+  callbacks = cbks.configure_callbacks(
+      callbacks,
+      model,
+      do_validation=do_validation,
+      epochs=epochs,
+      steps_per_epoch=steps_per_epoch,
+      batch_size=batch_size,
+      samples=num_samples_or_steps,
+      verbose=0,  # Handle ProgBar as part of Callbacks once hooks are ready.
+      mode=mode)
+  # TODO(omalleyt): Handle ProgBar as part of Callbacks once hooks are ready.
+  progbar = training_utils.get_progbar(model, count_mode)
+  progbar.params = callbacks.params
+  progbar.params['verbose'] = verbose
+
+  if mode == 'predict':
+    aggregator = training_utils.OutputsAggregator(True, steps_per_epoch)
+  else:
+    aggregator = training_utils.MetricsAggregator(True, steps_per_epoch)
 
+  if should_set_learning_phase:
+    old_learning_phase = backend.learning_phase()
+    backend.set_learning_phase(1 if mode == 'train' else 0)
 
-def fit_generator(model,
-                  generator,
-                  steps_per_epoch=None,
-                  epochs=1,
-                  verbose=1,
-                  callbacks=None,
-                  validation_data=None,
-                  validation_steps=None,
-                  class_weight=None,
-                  max_queue_size=10,
-                  workers=1,
-                  use_multiprocessing=False,
-                  shuffle=True,
-                  initial_epoch=0):
-  """See docstring for `Model.fit_generator`."""
-  epoch = initial_epoch
-
-  do_validation = bool(validation_data)
-  if not context.executing_eagerly():
-    model._make_train_function()
-    if do_validation:
-      model._make_test_function()
-
-  is_sequence = isinstance(generator, Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps_per_epoch is None:
-    if is_sequence:
-      steps_per_epoch = len(generator)
-    else:
-      raise ValueError('`steps_per_epoch=None` is only valid for a'
-                       ' generator based on the `keras.utils.Sequence`'
-                       ' class. Please specify `steps_per_epoch` or use'
-                       ' the `keras.utils.Sequence` class.')
+  callbacks.model.stop_training = False
+  callbacks._call_begin_hook(mode)
+  progbar.on_train_begin()
+  for epoch in range(initial_epoch, epochs):
+    if callbacks.model.stop_training:
+      break
 
-  # python 2 has 'next', 3 has '__next__'
-  # avoid any explicit version checks
-  val_gen = (
-      hasattr(validation_data, 'next') or
-      hasattr(validation_data, '__next__') or
-      isinstance(validation_data, Sequence))
-  if (val_gen and not isinstance(validation_data, Sequence) and
-      not validation_steps):
-    raise ValueError('`validation_steps=None` is only valid for a'
-                     ' generator based on the `keras.utils.Sequence`'
-                     ' class. Please specify `validation_steps` or use'
-                     ' the `keras.utils.Sequence` class.')
+    # Setup work for each epoch.
+    model.reset_metrics()
+    epoch_logs = {}
+    callbacks.on_epoch_begin(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_begin(epoch, epoch_logs)
 
-  enqueuer = None
-  val_enqueuer = None
+    for step in range(steps_per_epoch):
+      batch_data = _get_next_batch(output_generator, mode)
+      if batch_data is None:
+        callbacks.model.stop_training = True
+        break
 
-  try:
-    val_x, val_y, val_sample_weights = validation_data, None, None
-    if do_validation and not val_gen:
-      # Prepare data for validation
-      if len(validation_data) == 2:
-        val_x, val_y = validation_data  # pylint: disable=unpacking-non-sequence
-        val_sample_weights = None
-      elif len(validation_data) == 3:
-        val_x, val_y, val_sample_weights = validation_data  # pylint: disable=unpacking-non-sequence
-      else:
-        raise ValueError(
-            '`validation_data` should be a tuple '
-            '`(val_x, val_y, val_sample_weight)` '
-            'or `(val_x, val_y)`. Found: ' + str(validation_data))
-      val_x, val_y, val_sample_weights = model._standardize_user_data(
-          val_x, val_y, val_sample_weights)
-
-    callbacks = cbks.configure_callbacks(
-        callbacks,
-        model,
-        do_validation=do_validation,
-        val_inputs=val_x,
-        val_targets=val_y,
-        val_sample_weights=val_sample_weights,
-        epochs=epochs,
-        validation_steps=validation_steps,
-        steps_per_epoch=steps_per_epoch,
-        verbose=verbose)
-
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing,
-            shuffle=shuffle)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
-    else:
-      if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
+      # `batch_size` used for validation data if validation
+      # data is NumPy/EagerTensors.
+      batch_size = int(nest.flatten(batch_data)[0].shape[0])
+
+      # Callbacks batch begin.
+      batch_logs = {'batch': step, 'size': batch_size}
+      callbacks._call_batch_hook(mode, 'begin', step, batch_logs)
+      progbar.on_batch_begin(step, batch_logs)
+
+      batch_outs = batch_function(*batch_data)
+      if not isinstance(batch_outs, list):
+        batch_outs = [batch_outs]
+
+      # Aggregate results.
+      if step == 0:
+        aggregator.create(batch_outs)
+      aggregator.aggregate(batch_outs)
+
+      # Callbacks batch end.
+      batch_logs.update(training_utils.make_logs(model, batch_outs, mode))
+      callbacks._call_batch_hook(mode, 'end', step, batch_logs)
+      progbar.on_batch_end(step, batch_logs)
 
-    callbacks.on_train_begin()
-    # Construct epoch logs.
-    epoch_logs = {}
-    while epoch < epochs:
-      for m in model.stateful_metric_functions:
-        m.reset_states()
-      callbacks.on_epoch_begin(epoch)
-      steps_done = 0
-      batch_index = 0
-      while steps_done < steps_per_epoch:
-        generator_output = next(output_generator)
-
-        if not hasattr(generator_output, '__len__'):
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-
-        if len(generator_output) == 2:
-          x, y = generator_output
-          sample_weight = None
-        elif len(generator_output) == 3:
-          x, y, sample_weight = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-        # build batch logs
-        batch_logs = {}
-        if isinstance(x, list):
-          batch_size = x[0].shape[0]
-        elif isinstance(x, dict):
-          batch_size = list(x.values())[0].shape[0]
-        else:
-          batch_size = x.shape[0]
-        batch_logs['batch'] = batch_index
-        batch_logs['size'] = batch_size
-        callbacks.on_batch_begin(batch_index, batch_logs)
-
-        outs = model.train_on_batch(
-            x, y, sample_weight=sample_weight, class_weight=class_weight)
-
-        if not isinstance(outs, list):
-          outs = [outs]
-        for l, o in zip(model.metrics_names, outs):
-          batch_logs[l] = o
-
-        callbacks.on_batch_end(batch_index, batch_logs)
-
-        batch_index += 1
-        steps_done += 1
-
-        # Epoch finished.
-        if steps_done >= steps_per_epoch and do_validation:
-          if val_gen:
-            val_outs = evaluate_generator(
-                model,
-                validation_data,
-                validation_steps,
-                workers=workers,
-                use_multiprocessing=use_multiprocessing,
-                max_queue_size=max_queue_size)
-          else:
-            # No need for try/except because
-            # data has already been validated.
-            val_outs = model.evaluate(
-                val_x,
-                val_y,
-                batch_size=batch_size,
-                sample_weight=val_sample_weights,
-                verbose=0)
-          if not isinstance(val_outs, list):
-            val_outs = [val_outs]
-          # Same labels assumed.
-          for l, o in zip(model.metrics_names, val_outs):
-            epoch_logs['val_' + l] = o
-
-        if callbacks.model.stop_training:
-          break
-
-      callbacks.on_epoch_end(epoch, epoch_logs)
-      epoch += 1
       if callbacks.model.stop_training:
         break
 
-  finally:
-    try:
-      if enqueuer is not None:
-        enqueuer.stop()
-    finally:
-      if val_enqueuer is not None:
-        val_enqueuer.stop()
-
-  callbacks.on_train_end()
-  return model.history
-
-
-def evaluate_generator(model,
-                       generator,
-                       steps=None,
-                       max_queue_size=10,
-                       workers=1,
-                       use_multiprocessing=False,
-                       verbose=0):
-  """See docstring for `Model.evaluate_generator`."""
-  if not context.executing_eagerly():
-    model._make_test_function()
-
-  if hasattr(model, 'metrics'):
-    for m in model.stateful_metric_functions:
-      m.reset_states()
-
-  steps_done = 0
-  all_outs = []
-  batch_sizes = []
-  is_sequence = isinstance(generator, Sequence)
-  if not is_sequence and use_multiprocessing and workers > 1:
-    logging.warning(
-        UserWarning('Using a generator with `use_multiprocessing=True`'
-                    ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
-                    ' class.'))
-  if steps is None:
-    if is_sequence:
-      steps = len(generator)
-    else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
-  enqueuer = None
-
+    aggregator.finalize()
+    results = aggregator.results
+    epoch_logs.update(training_utils.make_logs(model, results, mode))
+    if len(results) == 1:
+      results = results[0]
+
+    # Run the test loop every epoch during training.
+    if do_validation and not callbacks.model.stop_training:
+      val_results = model_iteration(
+          model,
+          validation_data,
+          steps_per_epoch=validation_steps,
+          batch_size=batch_size,
+          class_weight=class_weight,
+          workers=workers,
+          use_multiprocessing=use_multiprocessing,
+          max_queue_size=max_queue_size,
+          mode='test')
+
+      if not isinstance(val_results, list):
+        val_results = [val_results]
+      epoch_logs.update(
+          training_utils.make_logs(model, val_results, mode, prefix='val_'))
+
+    callbacks.on_epoch_end(epoch, epoch_logs, mode=mode)
+    progbar.on_epoch_end(epoch, epoch_logs)
+  callbacks._call_end_hook(mode)
+
+  if enqueuer is not None:
+    enqueuer.stop()
+
+  if should_set_learning_phase:
+    backend.set_learning_phase(old_learning_phase)
+
+  if mode == 'train':
+    return model.history
+  return results
+
+
+# Maintain compatibility with the existing names.
+fit_generator = functools.partial(model_iteration, mode='train')
+evaluate_generator = functools.partial(model_iteration, mode='test')
+predict_generator = functools.partial(model_iteration, mode='predict')
+
+
+def _get_next_batch(output_generator, mode):
+  """Retrieves the next batch of input data."""
   try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+    generator_output = next(output_generator)
+  except (errors.OutOfRangeError, StopIteration):
+    # Returning `None` will trigger looping to stop.
+    logging.warning('Your dataset iterator ran out of data.')
+    return None
+  if not isinstance(generator_output, tuple):
+    if mode == 'predict':
+      # Always wrap in a tuple.
+      return (generator_output,)
     else:
-      if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if not hasattr(generator_output, '__len__'):
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      if len(generator_output) == 2:
-        x, y = generator_output
-        sample_weight = None
-      elif len(generator_output) == 3:
-        x, y, sample_weight = generator_output
-      else:
-        raise ValueError('Output of generator should be a tuple '
-                         '(x, y, sample_weight) '
-                         'or (x, y). Found: ' + str(generator_output))
-      outs = model.test_on_batch(x, y, sample_weight=sample_weight)
-
-      if isinstance(x, list):
-        batch_size = x[0].shape[0]
-      elif isinstance(x, dict):
-        batch_size = list(x.values())[0].shape[0]
-      else:
-        batch_size = x.shape[0]
-      if batch_size == 0:
-        raise ValueError('Received an empty batch. '
-                         'Batches should at least contain one item.')
-      all_outs.append(outs)
-
-      steps_done += 1
-      batch_sizes.append(batch_size)
-      if verbose == 1:
-        progbar.update(steps_done)
-
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
-
-  if not isinstance(outs, list):
-    return np.average(np.asarray(all_outs), weights=batch_sizes)
-  else:
-    averages = [float(all_outs[-1][0])]  # index 0 = 'loss'
-    averages.extend([
-        np.average([out[i]
-                    for out in all_outs], weights=batch_sizes)
-        for i in range(1, len(outs))
-    ])
-    return averages
-
-
-def predict_generator(model,
-                      generator,
-                      steps=None,
-                      max_queue_size=10,
-                      workers=1,
-                      use_multiprocessing=False,
-                      verbose=0):
-  """See docstring for `Model.predict_generator`."""
-  if not context.executing_eagerly():
-    model._make_predict_function()
-
-  steps_done = 0
-  all_outs = []
-  is_sequence = isinstance(generator, Sequence)
+      raise ValueError('Output of generator should be '
+                       'a tuple `(x, y, sample_weight)` '
+                       'or `(x, y)`. Found: ' + str(generator_output))
+
+  if len(generator_output) < 1 or len(generator_output) > 3:
+    raise ValueError('Output of generator should be '
+                     'a tuple `(x, y, sample_weight)` '
+                     'or `(x, y)` or (x,). Found: ' + str(generator_output))
+  return generator_output
+
+
+def _validate_arguments(is_sequence, use_multiprocessing, workers,
+                        steps_per_epoch, validation_data, validation_steps,
+                        mode, kwargs):
+  """Raises errors if arguments are invalid.
+
+  Arguments:
+    is_sequence: Boolean, whether data is a `keras.utils.data_utils.Sequence`
+      instance.
+    use_multiprocessing: Boolean. If `True`, use process-based threading. If
+      unspecified, `use_multiprocessing` will default to `False`. Note that
+      because this implementation relies on multiprocessing, you should not pass
+      non-picklable arguments to the generator as they can't be passed easily to
+      children processes.
+    workers: Integer. Maximum number of processes to spin up when using
+      process-based threading. If unspecified, `workers` will default to 1. If
+      0, will execute the generator on the main thread.
+    steps_per_epoch: Total number of steps (batches of samples) before declaring
+      one epoch finished and starting the next epoch. Ignored with the default
+      value of `None`.
+    validation_data: Either a tuple of NumPy/Tensor inputs (i.e. `(x,)` or `(x,
+      y)` or `(x, y, sample_weights)`) or a generator or
+      `keras.utils.data_utils.Sequence` object or Eager Iterator or Dataset.
+    validation_steps: Total number of steps (batches of samples) before
+      declaring validation finished.
+    mode: One of 'train'/'test'/'predict'.
+    kwargs: Additional arguments for backwards compatibility.
+
+  Raises:
+    ValueError: If `steps_per_epoch` or `validation_steps` are not passed
+      for data types that require them, or if unrecognized keyword
+      arguments are passed.
+  """
   if not is_sequence and use_multiprocessing and workers > 1:
     logging.warning(
         UserWarning('Using a generator with `use_multiprocessing=True`'
                     ' and multiple workers may duplicate your data.'
-                    ' Please consider using the`keras.utils.Sequence'
+                    ' Please consider using the `keras.utils.Sequence`'
                     ' class.'))
-  if steps is None:
-    if is_sequence:
-      steps = len(generator)
-    else:
-      raise ValueError('`steps=None` is only valid for a generator'
-                       ' based on the `keras.utils.Sequence` class.'
-                       ' Please specify `steps` or use the'
-                       ' `keras.utils.Sequence` class.')
-  enqueuer = None
 
-  try:
-    if workers > 0:
-      if is_sequence:
-        enqueuer = OrderedEnqueuer(
-            generator, use_multiprocessing=use_multiprocessing)
-      else:
-        enqueuer = GeneratorEnqueuer(
-            generator,
-            use_multiprocessing=use_multiprocessing)
-      enqueuer.start(workers=workers, max_queue_size=max_queue_size)
-      output_generator = enqueuer.get()
+  if steps_per_epoch is None:
+    arg_name = 'steps_per_epoch' if mode == 'train' else 'steps'
+    raise ValueError('Please specify the number of steps via the '
+                     '`{}` argument.'.format(arg_name))
+
+  val_gen = (
+      data_utils.is_generator_or_sequence(validation_data) or
+      isinstance(validation_data, iterator_ops.EagerIterator) or
+      isinstance(validation_data, dataset_ops.DatasetV2))
+  if (val_gen and not isinstance(validation_data, data_utils.Sequence) and
+      not validation_steps):
+    raise ValueError('Please specify the `validation_steps` argument.')
+
+  if any(k != 'steps' for k in kwargs):
+    raise ValueError('Invalid arguments passed: {}'.format(
+        [k for k in kwargs if k != 'steps']))
+
+
+def convert_to_generator_like(data,
+                              batch_size=None,
+                              steps_per_epoch=None,
+                              epochs=1,
+                              shuffle=False):
+  """Make a generator out of NumPy or EagerTensor inputs.
+
+  Arguments:
+    data: Either a generator or `keras.utils.data_utils.Sequence` object or
+      `Dataset` or `EagerIterator` or a {1,2,3}-tuple of NumPy arrays or
+      EagerTensors. If a tuple, the elements represent `(x, y, sample_weights)`
+      and may be `None` or `[None]`.
+    batch_size: Used when creating a generator out of tuples of NumPy arrays or
+      EagerTensors.
+    steps_per_epoch: Steps of the generator to run each epoch.
+    epochs: Total number of epochs to run.
+    shuffle: Whether the data should be shuffled.
+
+  Returns:
+    - Generator or `keras.utils.data_utils.Sequence` or EagerIterator.
+
+  Raises:
+    - ValueError: If `batch_size` is not provided for NumPy or EagerTensor
+      inputs.
+  """
+  if isinstance(data, tuple):
+    # Scrub `Nones` that might have been passed for `targets`, `sample_weights`.
+    data = tuple(
+        ele for ele in data if not all(e is None for e in nest.flatten(ele)))
+    if len(data) == 1:
+      data = data[0]
+
+  if data_utils.is_generator_or_sequence(data) or isinstance(
+      data, iterator_ops.EagerIterator):
+    if isinstance(data, data_utils.Sequence):
+      steps_per_epoch = len(data)
+    return data, steps_per_epoch
+  if isinstance(data, dataset_ops.DatasetV2):
+    return dataset_ops.make_one_shot_iterator(data), steps_per_epoch
+
+  # Create generator from NumPy or EagerTensor Input.
+  num_samples = int(nest.flatten(data)[0].shape[0])
+  if batch_size is None:
+    raise ValueError('You must specify `batch_size`')
+  steps_per_epoch = int(math.ceil(num_samples / batch_size))
+
+  def _gen(data):
+    """Makes a generator out of a structure of NumPy/EagerTensors."""
+    index_array = np.arange(num_samples)
+    for _ in range(epochs):
+      if shuffle:
+        np.random.shuffle(index_array)
+      batches = generic_utils.make_batches(num_samples, batch_size)
+      for (batch_start, batch_end) in batches:
+        batch_ids = index_array[batch_start:batch_end]
+        flat_batch_data = training_utils.slice_arrays(
+            nest.flatten(data), batch_ids, contiguous=(not shuffle))
+        yield nest.pack_sequence_as(data, flat_batch_data)
+
+  return _gen(data), steps_per_epoch
+
+
+def _make_enqueued_generator(generator,
+                             workers=1,
+                             use_multiprocessing=False,
+                             max_queue_size=10,
+                             shuffle=False):
+  """Create a buffered queue of next elements of the generator."""
+  is_sequence = isinstance(generator, data_utils.Sequence)
+  enqueuer = None
+  if workers > 0:
+    if is_sequence:
+      enqueuer = data_utils.OrderedEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing, shuffle=shuffle)
     else:
-      if is_sequence:
-        output_generator = iter_sequence_infinite(generator)
-      else:
-        output_generator = generator
-
-    if verbose == 1:
-      progbar = Progbar(target=steps)
-
-    while steps_done < steps:
-      generator_output = next(output_generator)
-      if isinstance(generator_output, tuple):
-        # Compatibility with the generators
-        # used for training.
-        if len(generator_output) == 2:
-          x, _ = generator_output
-        elif len(generator_output) == 3:
-          x, _, _ = generator_output
-        else:
-          raise ValueError('Output of generator should be '
-                           'a tuple `(x, y, sample_weight)` '
-                           'or `(x, y)`. Found: ' + str(generator_output))
-      else:
-        # Assumes a generator that only
-        # yields inputs (not targets and sample weights).
-        x = generator_output
-
-      outs = model.predict_on_batch(x)
-      if not isinstance(outs, list):
-        outs = [outs]
-
-      if not all_outs:
-        for out in outs:
-          all_outs.append([])
-
-      for i, out in enumerate(outs):
-        all_outs[i].append(out)
-      steps_done += 1
-      if verbose == 1:
-        progbar.update(steps_done)
-
-  finally:
-    if enqueuer is not None:
-      enqueuer.stop()
-
-  if len(all_outs) == 1:
-    if steps_done == 1:
-      return all_outs[0][0]
+      enqueuer = data_utils.GeneratorEnqueuer(
+          generator, use_multiprocessing=use_multiprocessing)
+    enqueuer.start(workers=workers, max_queue_size=max_queue_size)
+    output_generator = enqueuer.get()
+  else:
+    if is_sequence:
+      output_generator = data_utils.iter_sequence_infinite(generator)
     else:
-      return np.concatenate(all_outs[0])
-  if steps_done == 1:
-    return [out[0] for out in all_outs]
+      output_generator = generator
+  return output_generator, enqueuer
+
+
+def _make_execution_function(model, mode, class_weight=None):
+  """Makes function to run one step of model execution."""
+  if mode == 'train':
+    if not context.executing_eagerly():
+      model._make_fit_function()
+    f = functools.partial(model.train_on_batch, class_weight=class_weight)
+  elif mode == 'test':
+    if not context.executing_eagerly():
+      model._make_eval_function()
+    f = model.test_on_batch
   else:
-    return [np.concatenate(out) for out in all_outs]
+    # Match signature of other modes to allow
+    # 1, 2, or 3-tuples from generator
+    def predict_on_batch(x, y=None, sample_weights=None):  # pylint: disable=unused-argument
+      return model.predict_on_batch(x)
+
+    f = predict_on_batch
+
+  # Maintain stateful metrics across batch-level calls.
+  if mode != 'predict':
+    f = functools.partial(f, reset_metrics=False)
+
+  return f
+
+
+def _get_num_samples_or_steps(data, steps_per_epoch):
+  """Returns number of samples or steps, and whether to use steps count mode."""
+  flat_inputs = nest.flatten(data)
+  if hasattr(flat_inputs[0], 'shape'):
+    return int(flat_inputs[0].shape[0]), False
+  return steps_per_epoch, True
diff --git a/tensorflow/python/keras/engine/training_generator_test.py b/tensorflow/python/keras/engine/training_generator_test.py
index 88e89434242a7d7334e025acb5da530675d3f054..8941428e43ac5d7b4b439d86795e93a70fd270f0 100644
--- a/tensorflow/python/keras/engine/training_generator_test.py
+++ b/tensorflow/python/keras/engine/training_generator_test.py
@@ -21,220 +21,274 @@ from __future__ import print_function
 import os
 import unittest
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.data.ops import dataset_ops
+from tensorflow.python.data.ops import iterator_ops
+from tensorflow.python.eager import context
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import metrics as metrics_module
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import training_generator
 from tensorflow.python.platform import test
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import nest
+
+
+def custom_generator(mode=2):
+  batch_size = 10
+  num_samples = 50
+  arr_data = np.random.random((num_samples, 2))
+  arr_labels = np.random.random((num_samples, 4))
+  arr_weights = np.random.random((num_samples,))
+  i = 0
+  while True:
+    batch_index = i * batch_size % num_samples
+    i += 1
+    start = batch_index
+    end = start + batch_size
+    x = arr_data[start: end]
+    y = arr_labels[start: end]
+    w = arr_weights[start: end]
+    if mode == 1:
+      yield x
+    elif mode == 2:
+      yield x, y
+    else:
+      yield x, y, w
+
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestGeneratorMethods(test.TestCase, parameterized.TestCase):
 
-
-class TestGeneratorMethods(test.TestCase):
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
+  @parameterized.parameters('sequential', 'functional')
+  def test_fit_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        workers=4,
+                        use_multiprocessing=True)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False,
+                        validation_data=custom_generator(),
+                        validation_steps=10)
+    model.fit_generator(custom_generator(),
+                        steps_per_epoch=5,
+                        validation_data=custom_generator(),
+                        validation_steps=1,
+                        workers=0)
 
   @unittest.skipIf(
       os.name == 'nt',
       'use_multiprocessing=True does not work on windows properly.')
-  def test_generator_methods(self):
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
+  @parameterized.parameters('sequential', 'functional')
+  def test_evaluate_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+    model.summary()
+
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             workers=2,
+                             verbose=1,
+                             use_multiprocessing=True)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False)
+    model.evaluate_generator(custom_generator(),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False,
+                             workers=0)
 
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        yield x, y
-
-    with self.cached_session():
-      x = keras.Input((2,))
-      y = keras.layers.Dense(1)(x)
-      fn_model = keras.models.Model(x, y)
-      fn_model.compile(
-          loss='mse',
-          optimizer='sgd',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-
-      seq_model = keras.models.Sequential()
-      seq_model.add(keras.layers.Dense(1, input_shape=(2,)))
-      seq_model.compile(loss='mse', optimizer='sgd')
-
-      for model in [fn_model, seq_model]:
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
+  @unittest.skipIf(
+      os.name == 'nt',
+      'use_multiprocessing=True does not work on windows properly.')
+  @parameterized.parameters('sequential', 'functional')
+  def test_predict_generator_method(self, model_type):
+    if model_type == 'sequential':
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    else:
+      model = testing_utils.get_small_functional_mlp(
+          num_hidden=3, num_classes=4, input_dim=2)
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=2,
+                            use_multiprocessing=True)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            use_multiprocessing=False)
+    model.predict_generator(custom_generator(),
+                            steps=5,
+                            max_queue_size=10,
+                            workers=0)
+    # Test generator with just inputs (no targets)
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
                             max_queue_size=10,
-                            workers=4,
+                            workers=2,
                             use_multiprocessing=True)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
                             max_queue_size=10,
                             use_multiprocessing=False)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
+    model.predict_generator(custom_generator(mode=1),
+                            steps=5,
                             max_queue_size=10,
-                            use_multiprocessing=False,
-                            validation_data=custom_generator(),
-                            validation_steps=10)
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            validation_data=custom_generator(),
-                            validation_steps=1,
                             workers=0)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                workers=2,
-                                use_multiprocessing=True)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                use_multiprocessing=False)
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                workers=0)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 workers=2,
-                                 verbose=1,
-                                 use_multiprocessing=True)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False)
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False,
-                                 workers=0)
 
   def test_generator_methods_with_sample_weights(self):
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
-    arr_sample_weights = np.random.random((50,))
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(
+        loss='mse',
+        optimizer='sgd',
+        metrics=['mae', metrics_module.CategoricalAccuracy()])
+
+    model.fit_generator(custom_generator(mode=3),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False)
+    model.fit_generator(custom_generator(mode=3),
+                        steps_per_epoch=5,
+                        epochs=1,
+                        verbose=1,
+                        max_queue_size=10,
+                        use_multiprocessing=False,
+                        validation_data=custom_generator(mode=3),
+                        validation_steps=10)
+    model.predict_generator(custom_generator(mode=3),
+                            steps=5,
+                            max_queue_size=10,
+                            use_multiprocessing=False)
+    model.evaluate_generator(custom_generator(mode=3),
+                             steps=5,
+                             max_queue_size=10,
+                             use_multiprocessing=False)
 
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        w = arr_sample_weights[start: end]
-        yield x, y, w
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(
-          loss='mse',
-          optimizer='sgd',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
+  def test_generator_methods_invalid_use_case(self):
 
-      model.fit_generator(custom_generator(),
+    def invalid_generator():
+      while 1:
+        yield 0
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(loss='mse', optimizer='sgd')
+
+    with self.assertRaises(ValueError):
+      model.fit_generator(invalid_generator(),
                           steps_per_epoch=5,
                           epochs=1,
                           verbose=1,
                           max_queue_size=10,
                           use_multiprocessing=False)
+    with self.assertRaises(ValueError):
       model.fit_generator(custom_generator(),
                           steps_per_epoch=5,
                           epochs=1,
                           verbose=1,
                           max_queue_size=10,
                           use_multiprocessing=False,
-                          validation_data=custom_generator(),
+                          validation_data=invalid_generator(),
                           validation_steps=10)
-      model.predict_generator(custom_generator(),
+    with self.assertRaises(AttributeError):
+      model.predict_generator(invalid_generator(),
                               steps=5,
                               max_queue_size=10,
                               use_multiprocessing=False)
-      model.evaluate_generator(custom_generator(),
+    with self.assertRaises(ValueError):
+      model.evaluate_generator(invalid_generator(),
                                steps=5,
                                max_queue_size=10,
                                use_multiprocessing=False)
 
-  def test_generator_methods_invalid_use_case(self):
+  def test_generator_input_to_fit_eval_predict(self):
+    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
-    def custom_generator():
-      while 1:
-        yield 0
+    def ones_generator():
+      while True:
+        yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
+
+    inputs = keras.layers.Input(shape=(10,))
+    x = keras.layers.Dense(10, activation='relu')(inputs)
+    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
+    model = keras.Model(inputs, outputs)
 
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(loss='mse', optimizer='sgd')
+    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
+    model.fit(
+        ones_generator(),
+        steps_per_epoch=2,
+        validation_data=val_data,
+        epochs=2)
+    model.evaluate(ones_generator(), steps=2)
+    model.predict(ones_generator(), steps=2)
 
-      with self.assertRaises(ValueError):
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False)
-      with self.assertRaises(ValueError):
-        model.fit_generator(custom_generator(),
-                            steps_per_epoch=5,
-                            epochs=1,
-                            verbose=1,
-                            max_queue_size=10,
-                            use_multiprocessing=False,
-                            validation_data=custom_generator(),
-                            validation_steps=10)
-      with self.assertRaises(AttributeError):
-        model.predict_generator(custom_generator(),
-                                steps=5,
-                                max_queue_size=10,
-                                use_multiprocessing=False)
-      with self.assertRaises(ValueError):
-        model.evaluate_generator(custom_generator(),
-                                 steps=5,
-                                 max_queue_size=10,
-                                 use_multiprocessing=False)
+
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestGeneratorMethodsWithSequences(test.TestCase):
 
   def test_training_with_sequences(self):
 
     class DummySequence(keras.utils.Sequence):
 
       def __getitem__(self, idx):
-        return np.zeros([10, 2]), np.ones([10])
+        return np.zeros([10, 2]), np.ones([10, 4])
 
       def __len__(self):
         return 10
 
-    arr_data = np.random.random((50, 2))
-    arr_labels = np.random.random((50,))
-    arr_sample_weights = np.random.random((50,))
-
-    def custom_generator():
-      batch_size = 10
-      num_samples = 50
-      while True:
-        batch_index = np.random.randint(0, num_samples - batch_size)
-        start = batch_index
-        end = start + batch_size
-        x = arr_data[start: end]
-        y = arr_labels[start: end]
-        w = arr_sample_weights[start: end]
-        yield x, y, w
-
-    with self.cached_session():
-      model = keras.models.Sequential()
-      model.add(keras.layers.Dense(1, input_shape=(2,)))
-      model.compile(loss='mse', optimizer='sgd')
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(4, input_shape=(2,)))
+    model.compile(loss='mse', optimizer='sgd')
 
     model.fit_generator(DummySequence(),
                         steps_per_epoch=10,
@@ -251,29 +305,6 @@ class TestGeneratorMethods(test.TestCase):
                         workers=0,
                         use_multiprocessing=False)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_generator_input_to_fit_eval_predict(self):
-    val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-    def custom_generator():
-      while True:
-        yield np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
-
-    inputs = keras.layers.Input(shape=(10,))
-    x = keras.layers.Dense(10, activation='relu')(inputs)
-    outputs = keras.layers.Dense(1, activation='sigmoid')(x)
-    model = keras.Model(inputs, outputs)
-
-    model.compile(RMSPropOptimizer(0.001), 'binary_crossentropy')
-    model.fit(
-        custom_generator(),
-        steps_per_epoch=2,
-        validation_data=val_data,
-        epochs=2)
-    model.evaluate(custom_generator(), steps=2)
-    model.predict(custom_generator(), steps=2)
-
-  @tf_test_util.run_in_graph_and_eager_modes
   def test_sequence_input_to_fit_eval_predict(self):
     val_data = np.ones([10, 10], np.float32), np.ones([10, 1], np.float32)
 
@@ -303,5 +334,56 @@ class TestGeneratorMethods(test.TestCase):
       model.fit(CustomSequence(), sample_weight=np.ones([10, 1]))
 
 
+@tf_test_util.run_all_in_graph_and_eager_modes
+class TestConvertToGeneratorLike(test.TestCase, parameterized.TestCase):
+  simple_inputs = (np.ones((10, 10)), np.ones((10, 1)))
+  nested_inputs = ((np.ones((10, 10)), np.ones((10, 20))), (np.ones((10, 1)),
+                                                            np.ones((10, 3))))
+
+  def _make_dataset(self, inputs, batches):
+    return dataset_ops.DatasetV2.from_tensors(inputs).repeat(batches)
+
+  def _make_iterator(self, inputs, batches):
+    return dataset_ops.make_one_shot_iterator(
+        self._make_dataset(inputs, batches))
+
+  def _make_generator(self, inputs, batches):
+
+    def _gen():
+      for _ in range(batches):
+        yield inputs
+
+    return _gen()
+
+  def _make_numpy(self, inputs, _):
+    return inputs
+
+  @parameterized.named_parameters(
+      ('simple_dataset', _make_dataset, simple_inputs),
+      ('simple_iterator', _make_iterator, simple_inputs),
+      ('simple_generator', _make_generator, simple_inputs),
+      ('simple_numpy', _make_numpy, simple_inputs),
+      ('nested_dataset', _make_dataset, nested_inputs),
+      ('nested_iterator', _make_iterator, nested_inputs),
+      ('nested_generator', _make_generator, nested_inputs),
+      ('nested_numpy', _make_numpy, nested_inputs))
+  def test_convert_to_generator_like(self, input_fn, inputs):
+    expected_batches = 5
+    data = input_fn(self, inputs, expected_batches)
+
+    # Dataset and Iterator not supported in Legacy Graph mode.
+    if (not context.executing_eagerly() and
+        isinstance(data, (dataset_ops.DatasetV2, iterator_ops.Iterator))):
+      return
+
+    generator, steps = training_generator.convert_to_generator_like(
+        data, batch_size=2, steps_per_epoch=expected_batches)
+    self.assertEqual(steps, expected_batches)
+
+    for _ in range(expected_batches):
+      outputs = next(generator)
+    nest.assert_same_structure(outputs, inputs)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_gpu_test.py b/tensorflow/python/keras/engine/training_gpu_test.py
index 596d085f3fa4c49c7506c35fa1f4ce776bc8f691..45dcfe43995b280072395b11a573e20d57bcadc7 100644
--- a/tensorflow/python/keras/engine/training_gpu_test.py
+++ b/tensorflow/python/keras/engine/training_gpu_test.py
@@ -69,7 +69,7 @@ class TrainingGPUTest(test.TestCase):
       return simple_model
 
     if test.is_gpu_available(cuda_only=True):
-      with self.session(use_gpu=True):
+      with test_util.use_gpu():
         losses_to_test = ['sparse_categorical_crossentropy',
                           'categorical_crossentropy', 'binary_crossentropy']
 
diff --git a/tensorflow/python/keras/engine/training_test.py b/tensorflow/python/keras/engine/training_test.py
index 3cb24255d1569c254e4e912b700c6012c55fe142..91a0c7cc2f2dc5cf3e76eafdaaa79cfe6bc10336 100644
--- a/tensorflow/python/keras/engine/training_test.py
+++ b/tensorflow/python/keras/engine/training_test.py
@@ -32,11 +32,13 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.keras.callbacks import Callback
 from tensorflow.python.keras.engine.training_utils import weighted_masked_objective
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import test
@@ -49,19 +51,20 @@ except ImportError:
   scipy_sparse = None
 
 
-class TrainingTest(test.TestCase):
+class TrainingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_with_all_model_types(exclude_models='sequential')
+  @keras_parameterized.run_all_keras_modes
   def test_fit_on_arrays(self):
-    a = keras.layers.Input(shape=(3,), name='input_a')
-    b = keras.layers.Input(shape=(3,), name='input_b')
+    input_a = keras.layers.Input(shape=(3,), name='input_a')
+    input_b = keras.layers.Input(shape=(3,), name='input_b')
 
     dense = keras.layers.Dense(4, name='dense')
-    c = dense(a)
-    d = dense(b)
-    e = keras.layers.Dropout(0.5, name='dropout')(c)
+    dropout = keras.layers.Dropout(0.5, name='dropout')
+    branch_a = [input_a, dense]
+    branch_b = [input_b, dense, dropout]
 
-    model = keras.models.Model([a, b], [d, e])
+    model = testing_utils.get_multi_io_model(branch_a, branch_b)
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     loss = 'mse'
@@ -70,7 +73,8 @@ class TrainingTest(test.TestCase):
         optimizer,
         loss,
         metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
-        loss_weights=loss_weights)
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -134,61 +138,63 @@ class TrainingTest(test.TestCase):
         verbose=0,
         validation_split=0.2)
 
-    # Test with dictionary inputs
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        epochs=1,
-        batch_size=5,
-        verbose=1)
-    model.fit(
-        {
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        },
-        validation_data=({
-            'input_a': input_a_np,
-            'input_b': input_b_np
-        }, {
-            'dense': output_d_np,
-            'dropout': output_e_np
-        }),
-        epochs=1,
-        batch_size=5,
-        verbose=0)
-    model.train_on_batch({
-        'input_a': input_a_np,
-        'input_b': input_b_np
-    }, {
-        'dense': output_d_np,
-        'dropout': output_e_np
-    })
+    if testing_utils.get_model_type() == 'functional':
+      # Test with dictionary inputs
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          epochs=1,
+          batch_size=5,
+          verbose=1)
+      model.fit(
+          {
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          },
+          validation_data=({
+              'input_a': input_a_np,
+              'input_b': input_b_np
+          }, {
+              'dense': output_d_np,
+              'dropout': output_e_np
+          }),
+          epochs=1,
+          batch_size=5,
+          verbose=0)
+      model.train_on_batch({
+          'input_a': input_a_np,
+          'input_b': input_b_np
+      }, {
+          'dense': output_d_np,
+          'dropout': output_e_np
+      })
 
     # Test with lists for loss, metrics
     loss = ['mae', 'mse']
     model.compile(
         optimizer,
         loss,
-        metrics=[metrics_module.CategoricalAccuracy(), 'mae'])
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -196,13 +202,15 @@ class TrainingTest(test.TestCase):
         verbose=0)
 
     # Test with dictionaries for loss, metrics, loss weights
-    loss = {'dense': 'mse', 'dropout': 'mae'}
-    loss_weights = {'dense': 1., 'dropout': 0.5}
-    metrics = {
-        'dense': 'mse',
-        'dropout': metrics_module.CategoricalAccuracy()
-    }
-    model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights)
+    if testing_utils.get_model_type() == 'functional':
+      loss = {'dense': 'mse', 'dropout': 'mae'}
+      loss_weights = {'dense': 1., 'dropout': 0.5}
+      metrics = {
+          'dense': 'mse',
+          'dropout': metrics_module.CategoricalAccuracy()
+      }
+      model.compile(optimizer, loss, metrics=metrics, loss_weights=loss_weights,
+                    run_eagerly=testing_utils.should_run_eagerly())
     model.fit(
         [input_a_np, input_b_np], [output_d_np, output_e_np],
         epochs=1,
@@ -238,7 +246,8 @@ class TrainingTest(test.TestCase):
     x = keras.layers.Input(shape=(3,), name='input_a')
     y = keras.layers.Dense(4)(x)
     model = keras.models.Model(x, y)
-    model.compile(optimizer, loss='mse')
+    model.compile(optimizer, loss='mse',
+                  run_eagerly=testing_utils.should_run_eagerly())
     # This will work
     model.fit([input_a_np], output_d_np, epochs=1)
     with self.assertRaises(ValueError):
@@ -254,7 +263,7 @@ class TrainingTest(test.TestCase):
               batch_size=5,
               verbose=2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_evaluate_predict_on_arrays(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -274,7 +283,8 @@ class TrainingTest(test.TestCase):
         loss,
         metrics=['mae', metrics_module.CategoricalAccuracy()],
         loss_weights=loss_weights,
-        sample_weight_mode=None)
+        sample_weight_mode=None,
+        run_eagerly=testing_utils.should_run_eagerly())
 
     input_a_np = np.random.random((10, 3))
     input_b_np = np.random.random((10, 3))
@@ -335,7 +345,7 @@ class TrainingTest(test.TestCase):
     })
     self.assertEqual(len(out), 2)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_fit(self):
     loss = {}
     for reg in [None, 'l2']:
@@ -351,12 +361,13 @@ class TrainingTest(test.TestCase):
       y = np.ones((10, 1), 'float32')
 
       optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(optimizer, 'binary_crossentropy')
+      model.compile(optimizer, 'binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, batch_size=2, epochs=5)
       loss[reg] = model.evaluate(x, y)
     self.assertLess(loss[None], loss['l2'])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_loss_value(self):
     inputs = keras.layers.Input(shape=(10,))
     outputs = keras.layers.Dense(
@@ -369,11 +380,12 @@ class TrainingTest(test.TestCase):
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'binary_crossentropy')
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
     loss = model.test_on_batch(x, y)
     self.assertAlmostEqual(0.01, loss, places=4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_batch_independent(self):
     inputs = keras.layers.Input(shape=(10,))
     x = keras.layers.Dense(
@@ -383,7 +395,8 @@ class TrainingTest(test.TestCase):
     model = keras.Model(inputs, outputs)
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
-    model.compile(optimizer, 'binary_crossentropy')
+    model.compile(optimizer, 'binary_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
 
     x = np.ones((10, 10), 'float32')
     y = np.ones((10, 1), 'float32')
@@ -395,7 +408,7 @@ class TrainingTest(test.TestCase):
 
     self.assertAlmostEqual(loss_small_batch, loss_big_batch, places=4)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_activity_regularizer_in_model_call(self):
 
     class MyModel(keras.Model):
@@ -409,45 +422,56 @@ class TrainingTest(test.TestCase):
     _ = model(x)
     self.assertEqual(1, len(model.losses))
 
+  @keras_parameterized.run_all_keras_modes
   def test_training_on_sparse_data_with_dense_placeholders(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
     if scipy_sparse is None:
       return
 
-    with self.cached_session():
-      test_inputs = [
-          scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
-      ]
-      test_outputs = [
-          scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
-      ]
-      in1 = keras.layers.Input(shape=(3,))
-      in2 = keras.layers.Input(shape=(3,))
-      out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
-      out2 = keras.layers.Dense(4, name='dense_1')(in2)
-      model = keras.Model([in1, in2], [out1, out2])
-      model.predict(test_inputs, batch_size=2)
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      model.compile(
-          optimizer,
-          'mse',
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
-      model.fit(test_inputs, test_outputs,
-                epochs=1, batch_size=2, validation_split=0.5)
-      model.evaluate(test_inputs, test_outputs, batch_size=2)
+    test_inputs = [
+        scipy_sparse.random(6, 3, density=0.25).tocsr() for _ in range(2)
+    ]
+    test_outputs = [
+        scipy_sparse.random(6, i, density=0.25).tocsr() for i in range(3, 5)
+    ]
+    in1 = keras.layers.Input(shape=(3,))
+    in2 = keras.layers.Input(shape=(3,))
+    out1 = keras.layers.Dropout(0.5, name='dropout')(in1)
+    out2 = keras.layers.Dense(4, name='dense_1')(in2)
+    model = keras.Model([in1, in2], [out1, out2])
+    model.predict(test_inputs, batch_size=2)
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    model.compile(
+        optimizer,
+        'mse',
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
+    model.fit(test_inputs, test_outputs,
+              epochs=1, batch_size=2, validation_split=0.5)
+    model.evaluate(test_inputs, test_outputs, batch_size=2)
 
+  @keras_parameterized.run_all_keras_modes
   def test_compile_with_sparse_placeholders(self):
-    with self.cached_session():
-      input_layer = keras.layers.Input(shape=(10,), sparse=True)
-      weights = variables_lib.Variable(
-          np.ones((10, 1)).astype(np.float32), name='weights')
-      weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
-      output_layer = keras.layers.Lambda(weights_mult)(input_layer)
-      model = keras.Model([input_layer], output_layer)
-      model.compile(
-          loss='binary_crossentropy',
-          optimizer=keras.optimizers.Adam(lr=0.0001),
-          metrics=['accuracy'])
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
+    input_layer = keras.layers.Input(shape=(10,), sparse=True)
+    weights = variables_lib.Variable(
+        np.ones((10, 1)).astype(np.float32), name='weights')
+    weights_mult = lambda x: sparse_ops.sparse_tensor_dense_matmul(x, weights)
+    output_layer = keras.layers.Lambda(weights_mult)(input_layer)
+    model = keras.Model([input_layer], output_layer)
+    model.compile(
+        loss='binary_crossentropy',
+        optimizer=keras.optimizers.Adam(lr=0.0001),
+        metrics=['accuracy'],
+        run_eagerly=testing_utils.should_run_eagerly())
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_that_trainable_disables_updates(self):
     val_a = np.random.random((10, 4))
     val_out = np.random.random((10, 4))
@@ -544,10 +568,173 @@ class TrainingTest(test.TestCase):
               'val_loss', 'val_weighted_mean_absolute_error'
           ]))
 
+  @keras_parameterized.run_all_keras_modes
+  def test_mismatched_output_shape_and_target_shape(self):
+    model = keras.Sequential([
+        keras.layers.Dense(2, input_shape=(3, 4)),
+        keras.layers.Dense(5),
+    ])
+    model.compile(RMSPropOptimizer(learning_rate=0.001),
+                  loss='sparse_categorical_crossentropy',
+                  run_eagerly=testing_utils.should_run_eagerly())
+    # Test with Numpy data
+    x_train = np.random.random((10, 3, 4))
+    y_train = np.random.randint(0, 5, size=(10, 3))
+    model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+    # Test with iterator
+    dataset = dataset_ops.Dataset.from_tensor_slices((x_train, y_train))
+    dataset = dataset.repeat(10)
+    dataset = dataset.batch(10)
+    iterator = dataset_ops.make_one_shot_iterator(dataset)
+    model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+    if context.executing_eagerly():
+      # Test with eager execution
+      model.compile(RMSPropOptimizer(learning_rate=0.001),
+                    loss='sparse_categorical_crossentropy',
+                    run_eagerly=True)
+      model.fit(x_train, y_train, batch_size=5, epochs=1)
+
+      # Test with eager execution and iterator
+      model.fit(iterator, epochs=1, steps_per_epoch=2)
+
+  def test_losses_in_defun(self):
+    with context.eager_mode():
+      layer = keras.layers.Dense(1, kernel_regularizer='l1')
+      layer(array_ops.ones([1, 10]))
+
+      @function.defun
+      def get_losses():
+        return layer.losses
+
+      self.assertAllEqual(
+          self.evaluate(layer.losses), self.evaluate(get_losses()))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_logging(self):
+    mock_stdout = io.BytesIO() if six.PY2 else io.StringIO()
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(10, activation='relu'))
+    model.add(keras.layers.Dense(1, activation='sigmoid'))
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
+    with test.mock.patch.object(sys, 'stdout', mock_stdout):
+      model.fit(
+          np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10)
+    self.assertTrue('Epoch 5/10' in mock_stdout.getvalue())
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_training_with_loss_instance(self):
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
+
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
+
+    model = keras.models.Model([a, b], [d, e])
+    loss_weights = [1., 0.5]
+    model.compile(
+        RMSPropOptimizer(learning_rate=0.001),
+        loss=keras.losses.MeanSquaredError(),
+        metrics=[metrics_module.CategoricalAccuracy(), 'mae'],
+        loss_weights=loss_weights)
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    model.fit([input_a_np, input_b_np], [output_d_np, output_e_np],
+              epochs=1,
+              batch_size=5)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer(self):
+
+    class Counter(keras.callbacks.Callback):
 
-class TestExceptionsAndWarnings(test.TestCase):
+      def __init__(self):
+        self.batches = 0
+
+      def on_batch_end(self, batch, logs=None):
+        self.batches += 1
+
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    for batch_size, expected_batches in [(None, 2), (4, 16)]:
+      inputs = keras.Input(batch_size=batch_size, shape=(10,))
+      outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+      model = keras.Model(inputs, outputs)
+
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
+
+      model = keras.Sequential(
+          [keras.layers.Dense(1, batch_input_shape=(batch_size, 10))])
+      model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+      counter = Counter()
+      model.fit(x, y, callbacks=[counter])
+      self.assertEqual(counter.batches, expected_batches)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_static_batch_in_input_layer_consistency_checks(self):
+    x, y = np.ones((64, 10), 'float32'), np.ones((64, 1), 'float32')
+
+    inputs = keras.Input(batch_size=2, shape=(10,))
+    outputs = keras.layers.Dense(1, activation='sigmoid')(inputs)
+    model = keras.Model(inputs, outputs)
+    model.compile(keras.optimizer_v2.adam.Adam(0.001), 'binary_crossentropy')
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(x, y, batch_size=4)
+
+    data = dataset_ops.DatasetV2.from_tensor_slices((x, y))
+    data = data.batch(4, drop_remainder=True)
+    with self.assertRaisesRegexp(ValueError,
+                                 'incompatible with the specified batch size'):
+      model.fit(data, steps_per_epoch=16)
 
   @tf_test_util.run_in_graph_and_eager_modes
+  def test_compatible_batch_size_functional_model(self):
+
+    class MyLayer(keras.layers.Layer):
+
+      def call(self, inputs):
+        return array_ops.concat(inputs, axis=0)
+
+    input1 = keras.Input(batch_size=2, shape=(10,))
+    input2 = keras.Input(batch_size=3, shape=(10,))
+    outputs = MyLayer()([input1, input2])
+    with self.assertRaisesRegexp(ValueError,
+                                 'specified batch sizes of the Input Layers'):
+      keras.Model([input1, input2], outputs)
+
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_calling_subclass_model_on_different_datasets(self):
+
+    class SubclassedModel(keras.models.Model):
+
+      def call(self, inputs):
+        return inputs * 2
+
+    model = SubclassedModel()
+    dataset_one = dataset_ops.Dataset.range(2).batch(2)
+    dataset_two = dataset_ops.Dataset.range(3, 10).batch(2)
+    self.assertAllEqual([[0], [2]], model.predict(dataset_one, steps=1))
+    self.assertAllEqual([[6], [8], [10], [12]],
+                        model.predict(dataset_two, steps=2))
+
+
+class TestExceptionsAndWarnings(keras_parameterized.TestCase):
+
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_loss(self):
     num_classes = 5
     train_samples = 1000
@@ -574,9 +761,10 @@ class TestExceptionsAndWarnings(test.TestCase):
         model.fit(x_train, y_train)
 
       with self.assertRaises(ValueError):
-        model.compile(optimizer, loss=None)
+        model.compile(optimizer, loss=None,
+                      run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_compile_warning_for_loss_missing_output(self):
     with self.cached_session():
       inp = keras.layers.Input(shape=(16,), name='input_a')
@@ -594,17 +782,19 @@ class TestExceptionsAndWarnings(test.TestCase):
             metrics={
                 'dense_2': 'categorical_accuracy',
                 'dense_1': metrics_module.CategoricalAccuracy(),
-            })
+            },
+            run_eagerly=testing_utils.should_run_eagerly())
         msg = ('Output "dense_1" missing from loss dictionary. We assume this '
                'was done on purpose. The fit and evaluate APIs will not be '
                'expecting any data to be passed to "dense_1".')
         self.assertRegexpMatches(str(mock_log.call_args), msg)
 
 
-class LossWeightingTest(test.TestCase):
+class LossWeightingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_class_weights(self):
+  @keras_parameterized.run_all_keras_modes
+  # TODO(b/120562577): Test failing with assertion error.
+  def DISABLED_test_class_weights(self):
     num_classes = 5
     batch_size = 5
     epochs = 5
@@ -620,7 +810,8 @@ class LossWeightingTest(test.TestCase):
         loss='categorical_crossentropy',
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=learning_rate))
+        optimizer=RMSPropOptimizer(learning_rate=learning_rate),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(1337)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -672,7 +863,8 @@ class LossWeightingTest(test.TestCase):
         x_test[test_ids, :], y_test[test_ids, :], verbose=0)
     self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
+  @tf_test_util.run_v1_only('b/120545219')
   def test_sample_weights(self):
     num_classes = 5
     batch_size = 5
@@ -689,7 +881,8 @@ class LossWeightingTest(test.TestCase):
         RMSPropOptimizer(learning_rate=learning_rate),
         metrics=['acc', metrics_module.CategoricalAccuracy()],
         weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-        loss='categorical_crossentropy')
+        loss='categorical_crossentropy',
+        run_eagerly=testing_utils.should_run_eagerly())
 
     np.random.seed(43)
     (x_train, y_train), (x_test, y_test) = testing_utils.get_test_data(
@@ -737,13 +930,15 @@ class LossWeightingTest(test.TestCase):
           x_test[test_ids, :], y_test[test_ids, :], verbose=0)
       self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_warning_for_concurrent_sample_and_class_weights(self):
+
     model = keras.models.Sequential()
     model.add(keras.layers.Dense(10, input_shape=(3,)))
     model.compile(
         loss='mse',
-        optimizer=RMSPropOptimizer(learning_rate=0.01))
+        optimizer=RMSPropOptimizer(learning_rate=0.01),
+        run_eagerly=testing_utils.should_run_eagerly())
     x_train = np.random.random((10, 3))
     y_train = np.random.random((10, 10))
     sample_weight = np.ones((y_train.shape[0]))
@@ -757,11 +952,19 @@ class LossWeightingTest(test.TestCase):
           verbose=0,
           sample_weight=sample_weight,
           class_weight=class_weight)
-      msg = ('The `class_weight` argument will be ignored.')
-      self.assertRegexpMatches(str(mock_log.call_args), msg)
+      msg = 'The `class_weight` argument will be ignored.'
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_temporal_sample_weights(self):
+      msg_found = False
+      for call_args in mock_log.call_args_list:
+        if msg in str(call_args):
+          msg_found = True
+
+      self.assertTrue(msg_found)
+
+  @keras_parameterized.run_all_keras_modes
+  @tf_test_util.run_v1_only('b/120545219')
+  # TODO(b/120562577): Test failing with assertion error.
+  def DISABLED_test_temporal_sample_weights(self):
     num_classes = 5
     batch_size = 5
     epochs = 5
@@ -818,7 +1021,8 @@ class LossWeightingTest(test.TestCase):
           loss='binary_crossentropy',
           metrics=['acc', metrics_module.CategoricalAccuracy()],
           weighted_metrics=['mae', metrics_module.CategoricalAccuracy()],
-          sample_weight_mode='temporal')
+          sample_weight_mode='temporal',
+          run_eagerly=testing_utils.should_run_eagerly())
 
       model.fit(
           temporal_x_train,
@@ -850,7 +1054,7 @@ class LossWeightingTest(test.TestCase):
             temporal_x_test[test_ids], temporal_y_test[test_ids], verbose=0)
         self.assertLess(score[0], ref_score[0])
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_class_weight_invalid_use_case(self):
     num_classes = 5
     train_samples = 1000
@@ -867,7 +1071,8 @@ class LossWeightingTest(test.TestCase):
               input_shape=(timesteps, input_dim)))
       model.add(keras.layers.Activation('softmax'))
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
-      model.compile(optimizer, loss='binary_crossentropy')
+      model.compile(optimizer, loss='binary_crossentropy',
+                    run_eagerly=testing_utils.should_run_eagerly())
 
       (x_train, y_train), _ = testing_utils.get_test_data(
           train_samples=train_samples,
@@ -885,14 +1090,16 @@ class LossWeightingTest(test.TestCase):
 
       with self.assertRaises(ValueError):
         model.compile(
-            optimizer, loss='binary_crossentropy', sample_weight_mode=[])
+            optimizer, loss='binary_crossentropy', sample_weight_mode=[],
+            run_eagerly=testing_utils.should_run_eagerly())
 
       # Build multi-output model
       x = keras.Input((3,))
       y1 = keras.layers.Dense(4, name='1')(x)
       y2 = keras.layers.Dense(4, name='2')(x)
       model = keras.models.Model(x, [y1, y2])
-      model.compile(optimizer, loss='mse')
+      model.compile(optimizer, loss='mse',
+                    run_eagerly=testing_utils.should_run_eagerly())
       x_np = np.random.random((10, 3))
       y_np = np.random.random((10, 4))
       w_np = np.random.random((10,))
@@ -919,7 +1126,7 @@ class LossWeightingTest(test.TestCase):
         model.fit(x_np, [y_np, y_np], epochs=1,
                   sample_weight={'1': bad_w_np})
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_default_sample_weight(self):
     """Verifies that fit works without having to set sample_weight."""
 
@@ -940,38 +1147,46 @@ class LossWeightingTest(test.TestCase):
       optimizer = RMSPropOptimizer(learning_rate=learning_rate)
 
       # sample_weight_mode is a list and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=[None])
+      model.compile(optimizer, loss='mse', sample_weight_mode=[None],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a list and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'])
+      model.compile(optimizer, loss='mse', sample_weight_mode=['temporal'],
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is None
       model.compile(
-          optimizer, loss='mse', sample_weight_mode={'time_distributed': None})
+          optimizer, loss='mse', sample_weight_mode={'time_distributed': None},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a dict and mode value is `temporal`
       model.compile(
           optimizer,
           loss='mse',
-          sample_weight_mode={'time_distributed': 'temporal'})
+          sample_weight_mode={'time_distributed': 'temporal'},
+          run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is None
-      model.compile(optimizer, loss='mse', sample_weight_mode=None)
+      model.compile(optimizer, loss='mse', sample_weight_mode=None,
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
       # sample_weight_mode is a not a list/dict and mode value is `temporal`
-      model.compile(optimizer, loss='mse', sample_weight_mode='temporal')
+      model.compile(optimizer, loss='mse', sample_weight_mode='temporal',
+                    run_eagerly=testing_utils.should_run_eagerly())
       model.fit(x, y, epochs=1, batch_size=10)
 
 
-class LossMaskingTest(test.TestCase):
+class LossMaskingTest(keras_parameterized.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_graph_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -979,13 +1194,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_deferred_sequential(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       model = keras.models.Sequential()
@@ -993,13 +1211,16 @@ class LossMaskingTest(test.TestCase):
       model.add(
           keras.layers.TimeDistributed(
               keras.layers.Dense(1, kernel_initializer='one')))
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_masking_functional(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       x = np.array([[[1], [1]], [[0], [0]]])
       inputs = keras.layers.Input((2, 1))
@@ -1007,12 +1228,13 @@ class LossMaskingTest(test.TestCase):
       outputs = keras.layers.TimeDistributed(
           keras.layers.Dense(1, kernel_initializer='one'))(outputs)
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.array([[[1], [1]], [[1], [1]]])
       loss = model.train_on_batch(x, y)
       self.assertEqual(float(loss), 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_mask_argument_in_layer(self):
     # Test that the mask argument gets correctly passed to a layer in the
     # functional API.
@@ -1037,7 +1259,8 @@ class LossMaskingTest(test.TestCase):
       outputs = CustomMaskedLayer()(masked)
 
       model = keras.Model(inputs, outputs)
-      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
+      model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001),
+                    run_eagerly=testing_utils.should_run_eagerly())
       y = np.random.random((5, 3))
       model.train_on_batch(x, y)
 
@@ -1060,8 +1283,9 @@ class LossMaskingTest(test.TestCase):
               keras.backend.variable(weights), keras.backend.variable(mask)))
 
 
-class TestDynamicTrainability(test.TestCase):
+class TestDynamicTrainability(keras_parameterized.TestCase):
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_warning(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1075,6 +1299,7 @@ class TestDynamicTrainability(test.TestCase):
       model.train_on_batch(x, y)
       self.assertRaises(Warning)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_trainable_argument(self):
     with self.cached_session():
       x = np.random.random((5, 3))
@@ -1203,143 +1428,154 @@ class TestDynamicTrainability(test.TestCase):
       self.assertListEqual(outer_model.trainable_weights, [])
 
 
-class TestTrainingWithDataTensors(test.TestCase):
+class TestTrainingWithDataTensors(keras_parameterized.TestCase):
 
+  @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_symbolic_tensors_single_io(self):
-    with self.cached_session():
-      x = keras.layers.Input(shape=(3,), name='input')
-      y = keras.layers.Dense(4, name='dense')(x)
-      model = keras.Model(x, y)
+    # TODO(kaftan) Test seems to not work, file ticket
+    if  context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-      optimizer = RMSPropOptimizer(learning_rate=0.001)
-      loss = 'mse'
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()])
+    x = keras.layers.Input(shape=(3,), name='input')
+    y = keras.layers.Dense(4, name='dense')(x)
+    model = keras.Model(x, y)
+
+    optimizer = RMSPropOptimizer(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      inputs = keras.backend.zeros(shape=(10, 3))
-      targets = keras.backend.zeros(shape=(10, 4))
+    inputs = keras.backend.zeros(shape=(10, 3))
+    targets = keras.backend.zeros(shape=(10, 4))
 
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
-
-      # Test with dynamic shape
-      inputs = array_ops.placeholder_with_default(
-          np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
-      targets = array_ops.placeholder_with_default(
-          np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
-      self.assertEqual(inputs.shape.dims[0].value, None)
-      model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
-      model.evaluate(inputs, targets, steps=2, verbose=0)
-      model.predict(inputs, steps=2)
-      model.train_on_batch(inputs, targets)
-      model.test_on_batch(inputs, targets)
-      model.fit(inputs, targets,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_data=(inputs, targets), validation_steps=2)
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
+
+    # Test with dynamic shape
+    inputs = array_ops.placeholder_with_default(
+        np.zeros((2, 3)), shape=tensor_shape.TensorShape([None, 3]))
+    targets = array_ops.placeholder_with_default(
+        np.zeros((2, 4)), shape=tensor_shape.TensorShape([None, 4]))
+    self.assertEqual(inputs.shape.dims[0].value, None)
+    model.fit(inputs, targets, epochs=1, steps_per_epoch=2, verbose=0)
+    model.evaluate(inputs, targets, steps=2, verbose=0)
+    model.predict(inputs, steps=2)
+    model.train_on_batch(inputs, targets)
+    model.test_on_batch(inputs, targets)
+    model.fit(inputs, targets,
+              epochs=1, steps_per_epoch=2, verbose=0,
+              validation_data=(inputs, targets), validation_steps=2)
 
+  @keras_parameterized.run_all_keras_modes
   def test_training_and_eval_methods_on_symbolic_tensors_multi_io(self):
-    with self.cached_session():
-      a = keras.layers.Input(shape=(3,), name='input_a')
-      b = keras.layers.Input(shape=(3,), name='input_b')
+    # TODO(kaftan) Test seems to not work, file ticket
+    if context.executing_eagerly():
+      self.skipTest('Skipping eager execution.')
 
-      dense = keras.layers.Dense(4, name='dense')
-      c = dense(a)
-      d = dense(b)
-      e = keras.layers.Dropout(0.5, name='dropout')(c)
+    a = keras.layers.Input(shape=(3,), name='input_a')
+    b = keras.layers.Input(shape=(3,), name='input_b')
 
-      model = keras.models.Model([a, b], [d, e])
+    dense = keras.layers.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = keras.layers.Dropout(0.5, name='dropout')(c)
 
-      optimizer = 'rmsprop'
-      loss = 'mse'
-      loss_weights = [1., 0.5]
-      model.compile(
-          optimizer,
-          loss,
-          metrics=['mae', metrics_module.CategoricalAccuracy()],
-          loss_weights=loss_weights)
+    model = keras.models.Model([a, b], [d, e])
+
+    optimizer = 'rmsprop'
+    loss = 'mse'
+    loss_weights = [1., 0.5]
+    model.compile(
+        optimizer,
+        loss,
+        metrics=['mae', metrics_module.CategoricalAccuracy()],
+        loss_weights=loss_weights,
+        run_eagerly=testing_utils.should_run_eagerly())
 
-      input_a_tf = keras.backend.zeros(shape=(10, 3))
-      input_b_tf = keras.backend.zeros(shape=(10, 3))
+    input_a_tf = keras.backend.zeros(shape=(10, 3))
+    input_b_tf = keras.backend.zeros(shape=(10, 3))
 
-      output_d_tf = keras.backend.zeros(shape=(10, 4))
-      output_e_tf = keras.backend.zeros(shape=(10, 4))
+    output_d_tf = keras.backend.zeros(shape=(10, 4))
+    output_e_tf = keras.backend.zeros(shape=(10, 4))
 
+    model.fit(
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=0)
+    with self.assertRaisesRegexp(ValueError,
+                                 'should specify the `steps_per_epoch`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
           epochs=1,
-          steps_per_epoch=2,
+          batch_size=5,
           verbose=0)
-      with self.assertRaisesRegexp(ValueError,
-                                   'should specify the `steps_per_epoch`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=1,
-            batch_size=5,
-            verbose=0)
-      model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+    model.train_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
 
-      # Test with dictionary inputs
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0)
-      model.fit(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf},
-          validation_data=({'input_a': input_a_tf,
-                            'input_b': input_b_tf},
-                           {'dense': output_d_tf,
-                            'dropout': output_e_tf}),
-          epochs=1,
-          steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      model.train_on_batch(
-          {'input_a': input_a_tf,
-           'input_b': input_b_tf},
-          {'dense': output_d_tf,
-           'dropout': output_e_tf})
+    # Test with dictionary inputs
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        epochs=1,
+        steps_per_epoch=2,
+        verbose=0)
+    model.fit(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf},
+        validation_data=({'input_a': input_a_tf,
+                          'input_b': input_b_tf},
+                         {'dense': output_d_tf,
+                          'dropout': output_e_tf}),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    model.train_on_batch(
+        {'input_a': input_a_tf,
+         'input_b': input_b_tf},
+        {'dense': output_d_tf,
+         'dropout': output_e_tf})
 
-      # Test with validation data
+    # Test with validation data
+    model.fit(
+        [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+        validation_data=([input_a_tf, input_b_tf],
+                         [output_d_tf, output_e_tf]),
+        epochs=1,
+        steps_per_epoch=2,
+        validation_steps=2,
+        verbose=0)
+    # Test with validation split
+    with self.assertRaisesRegexp(ValueError,
+                                 'you cannot use `validation_split`'):
       model.fit(
           [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-          validation_data=([input_a_tf, input_b_tf],
-                           [output_d_tf, output_e_tf]),
-          epochs=1,
+          epochs=2,
           steps_per_epoch=2,
-          validation_steps=2,
-          verbose=0)
-      # Test with validation split
-      with self.assertRaisesRegexp(ValueError,
-                                   'you cannot use `validation_split`'):
-        model.fit(
-            [input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-            epochs=2,
-            steps_per_epoch=2,
-            verbose=0,
-            validation_split=0.2,
-            validation_steps=2)
-
-      # Test evaluation / prediction methods
-      model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
-                     steps=2, verbose=0)
-      model.predict([input_a_tf, input_b_tf], steps=2)
-      model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+          verbose=0,
+          validation_split=0.2,
+          validation_steps=2)
 
+    # Test evaluation / prediction methods
+    model.evaluate([input_a_tf, input_b_tf], [output_d_tf, output_e_tf],
+                   steps=2, verbose=0)
+    model.predict([input_a_tf, input_b_tf], steps=2)
+    model.test_on_batch([input_a_tf, input_b_tf], [output_d_tf, output_e_tf])
+
+  @tf_test_util.run_deprecated_v1
   def test_model_with_input_feed_tensor(self):
     """We test building a model with a TF variable as input.
 
@@ -1518,6 +1754,7 @@ class TestTrainingWithDataTensors(test.TestCase):
       # evaluate
       _ = model.evaluate(input_a_np, [output_a_np])
 
+  @tf_test_util.run_deprecated_v1
   def test_model_with_external_loss(self):
     with self.cached_session():
       # None loss, only regularization loss.
@@ -1713,6 +1950,7 @@ class TestTrainingWithDataTensors(test.TestCase):
       model.train_on_batch(input_val, None,
                            sample_weight={'dense_a': np.random.random((10,))})
 
+  @tf_test_util.run_deprecated_v1
   def test_model_custom_target_tensors(self):
     with self.cached_session():
       a = keras.Input(shape=(3,), name='input_a')
@@ -1774,268 +2012,10 @@ class TestTrainingWithDataTensors(test.TestCase):
                            [output_a_np, output_b_np])
 
 
-class TestTrainingWithDatasetIterators(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_iterators_single_io(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(iterator, steps=2, verbose=1)
-    model.predict(iterator, steps=2)
-    model.train_on_batch(iterator)
-    model.test_on_batch(iterator)
-    model.predict_on_batch(iterator)
-
-    # Test with validation data
-    model.fit(iterator,
-              epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=iterator, validation_steps=2)
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(iterator,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          iterator,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(iterator, iterator,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(iterator, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(iterator, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(iterator, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_get_next_op_created_once(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-    # Finalize graph to make sure we are not appending another iterator
-    # get_next op in the graph.
-    ops.get_default_graph().finalize()
-    model.fit(iterator, epochs=1, steps_per_epoch=2, verbose=1)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_iterators_running_out_of_data(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(2)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-
-    with test.mock.patch.object(logging, 'warning') as mock_log:
-      model.fit(iterator, epochs=1, steps_per_epoch=3, verbose=0)
-      self.assertRegexpMatches(
-          str(mock_log.call_args),
-          'dataset iterator ran out of data')
-
-
-class TestTrainingWithDataset(test.TestCase):
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_calling_model_on_same_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae']
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    # Call fit with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-    # Finalize the graph to make sure new ops aren't added when calling on the
-    # same dataset
-    ops.get_default_graph().finalize()
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_training_and_eval_methods_on_dataset(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3))
-    targets = np.zeros((10, 4))
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-    # Test with validation data
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=0,
-              validation_data=dataset, validation_steps=2)
-
-    # Test with validation split
-    with self.assertRaisesRegexp(
-        ValueError, '`validation_split` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(dataset,
-                epochs=1, steps_per_epoch=2, verbose=0,
-                validation_split=0.5, validation_steps=2)
-
-    # Test with sample weight.
-    sample_weight = np.random.random((10,))
-    with self.assertRaisesRegexp(
-        ValueError, '`sample_weight` argument is not supported '
-        'when input `x` is a dataset or a dataset iterator'):
-      model.fit(
-          dataset,
-          epochs=1,
-          steps_per_epoch=2,
-          verbose=0,
-          sample_weight=sample_weight)
-
-    # Test invalid usage
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should not specify a target'):
-      model.fit(dataset, dataset,
-                epochs=1, steps_per_epoch=2, verbose=0)
-
-    with self.assertRaisesRegexp(
-        ValueError, 'you should specify the `steps_per_epoch` argument'):
-      model.fit(dataset, epochs=1, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.evaluate(dataset, verbose=0)
-    with self.assertRaisesRegexp(ValueError,
-                                 'you should specify the `steps` argument'):
-      model.predict(dataset, verbose=0)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sample_weights(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'mse'
-    metrics = ['mae', metrics_module.CategoricalAccuracy()]
-    model.compile(optimizer, loss, metrics=metrics)
-
-    inputs = np.zeros((10, 3), np.float32)
-    targets = np.zeros((10, 4), np.float32)
-    sample_weights = np.ones((10), np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets,
-                                                      sample_weights))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-    model.evaluate(dataset, steps=2, verbose=1)
-    model.predict(dataset, steps=2)
-    model.train_on_batch(dataset)
-    model.predict_on_batch(dataset)
-
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_dataset_with_sparse_labels(self):
-    model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-    optimizer = RMSPropOptimizer(learning_rate=0.001)
-    loss = 'sparse_categorical_crossentropy'
-    model.compile(optimizer, loss)
-
-    inputs = np.zeros((10, 3))
-    targets = np.random.randint(0, 4, size=10, dtype=np.int32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-
-    model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
-
-  def test_dataset_input_shape_validation(self):
-    with self.cached_session():
-      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
-      model.compile(optimizer=RMSPropOptimizer(learning_rate=0.001), loss='mse')
-
-      # User forgets to batch the dataset
-      inputs = np.zeros((10, 3))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-
-      with self.assertRaisesRegexp(
-          ValueError,
-          r'expected (.*?) to have shape \(3,\) but got array with shape \(1,\)'
-      ):
-        model.train_on_batch(dataset)
-
-      # Wrong input shape
-      inputs = np.zeros((10, 5))
-      targets = np.zeros((10, 4))
-      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
-      dataset = dataset.repeat(100)
-      dataset = dataset.batch(10)
-
-      with self.assertRaisesRegexp(ValueError,
-                                   r'expected (.*?) to have shape \(3,\)'):
-        model.train_on_batch(dataset)
-
-
-class TestTrainingWithMetrics(test.TestCase):
+class TestTrainingWithMetrics(keras_parameterized.TestCase):
   """Training tests related to metrics."""
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_names(self):
     a = keras.layers.Input(shape=(3,), name='input_a')
     b = keras.layers.Input(shape=(3,), name='input_b')
@@ -2049,7 +2029,8 @@ class TestTrainingWithMetrics(test.TestCase):
 
     optimizer = RMSPropOptimizer(learning_rate=0.001)
     metrics = ['mse', metrics_module.BinaryAccuracy()]
-    model.compile(optimizer, loss='mae', metrics=metrics)
+    model.compile(optimizer, loss='mae', metrics=metrics,
+                  run_eagerly=testing_utils.should_run_eagerly())
     reference_metric_names = [
         'loss', 'dense_loss', 'dropout_loss', 'dense_mean_squared_error',
         'dense_binary_accuracy', 'dropout_mean_squared_error',
@@ -2069,7 +2050,7 @@ class TestTrainingWithMetrics(test.TestCase):
               batch_size=5)
     self.assertEqual(reference_metric_names, model.metrics_names)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness(self):
     model = keras.Sequential()
     model.add(
@@ -2081,7 +2062,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     # verify correctness of stateful and stateless metrics.
     x = np.ones((100, 4))
@@ -2095,40 +2077,7 @@ class TestTrainingWithMetrics(test.TestCase):
     self.assertEqual(outs[1], 0.)
     self.assertEqual(outs[2], 0.)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_metrics_correctness_with_iterator(self):
-    model = keras.Sequential()
-    model.add(
-        keras.layers.Dense(
-            8, activation='relu', input_dim=4, kernel_initializer='ones'))
-    model.add(
-        keras.layers.Dense(
-            1, activation='sigmoid', kernel_initializer='ones'))
-    model.compile(
-        loss='binary_crossentropy',
-        metrics=['accuracy', metrics_module.BinaryAccuracy()],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
-
-    np.random.seed(123)
-    x = np.random.randint(10, size=(100, 4)).astype(np.float32)
-    y = np.random.randint(2, size=(100, 1)).astype(np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(np.around(outs[1], decimals=1), 0.5)
-    self.assertEqual(np.around(outs[2], decimals=1), 0.5)
-
-    y = np.zeros((100, 1), dtype=np.float32)
-    dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
-    dataset = dataset.repeat(100)
-    dataset = dataset.batch(10)
-    iterator = dataset.make_one_shot_iterator()
-    outs = model.evaluate(iterator, steps=10)
-    self.assertEqual(outs[1], 0.)
-    self.assertEqual(outs[2], 0.)
-
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_correctness_with_weighted_metrics(self):
     np.random.seed(1337)
     x = np.array([[[1.], [1.]], [[0.], [0.]]])
@@ -2141,7 +2090,8 @@ class TestTrainingWithMetrics(test.TestCase):
         RMSPropOptimizer(learning_rate=0.001),
         loss='mse',
         sample_weight_mode='temporal',
-        weighted_metrics=['accuracy', 'mse'])
+        weighted_metrics=['accuracy', 'mse'],
+        run_eagerly=testing_utils.should_run_eagerly())
     y = np.array([[[1.], [1.]], [[1.], [1.]]])
 
     outs = model.evaluate(x, y)
@@ -2153,7 +2103,7 @@ class TestTrainingWithMetrics(test.TestCase):
 
     w = np.array([[3., 4.], [1., 2.]])
     outs = model.evaluate(x, y, sample_weight=w)
-    self.assertArrayNear(outs, [0.3, 0.7, 0.3], .001)
+    self.assertArrayNear(outs, [0.75, 0.7, 0.3], .001)
 
     # Verify that metric value is same with arbitrary weights and batch size.
     x = np.random.random((50, 2, 1))
@@ -2163,7 +2113,7 @@ class TestTrainingWithMetrics(test.TestCase):
     mse2 = model.evaluate(x, y, sample_weight=w, batch_size=10)[2]
     self.assertNear(mse1, mse2, err=1e-7)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metric_state_reset_between_fit_and_evaluate(self):
     model = keras.Sequential()
     model.add(keras.layers.Dense(3, activation='relu', input_dim=4))
@@ -2172,7 +2122,8 @@ class TestTrainingWithMetrics(test.TestCase):
     model.compile(
         loss='mae',
         metrics=[acc_obj],
-        optimizer=RMSPropOptimizer(learning_rate=0.001))
+        optimizer=RMSPropOptimizer(learning_rate=0.001),
+        run_eagerly=testing_utils.should_run_eagerly())
 
     x_train = np.random.random((100, 4))
     y_train = np.random.random((100, 1))
@@ -2184,7 +2135,7 @@ class TestTrainingWithMetrics(test.TestCase):
     model.evaluate(x_test, y_test, batch_size=5)
     self.assertEqual(self.evaluate(acc_obj.count), 10)
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_invalid_metrics(self):
     num_classes = 5
     input_dim = 5
@@ -2198,10 +2149,13 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='categorical_crossentropy',
-          metrics=metrics_module.CategoricalAccuracy())
+          metrics=metrics_module.CategoricalAccuracy(),
+          run_eagerly=testing_utils.should_run_eagerly())
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @keras_parameterized.run_all_keras_modes
   def test_metrics_masking(self):
+    if testing_utils.should_run_eagerly():
+      self.skipTest('b/120495761')
     with self.cached_session():
       np.random.seed(1337)
       model = keras.models.Sequential()
@@ -2212,7 +2166,8 @@ class TestTrainingWithMetrics(test.TestCase):
       model.compile(
           RMSPropOptimizer(learning_rate=0.001),
           loss='mse',
-          weighted_metrics=['accuracy'])
+          weighted_metrics=['accuracy'],
+          run_eagerly=testing_utils.should_run_eagerly())
 
       # verify that masking is applied.
       x = np.array([[[1], [1]], [[1], [1]], [[0], [0]]])
@@ -2223,32 +2178,274 @@ class TestTrainingWithMetrics(test.TestCase):
       # verify that masking is combined with sample weights.
       w = np.array([3, 2, 4])
       scores = model.train_on_batch(x, y, sample_weight=w)
-      self.assertArrayNear(scores, [0.2, 0.8], 0.1)
+      self.assertArrayNear(scores, [0.3328, 0.8], 0.001)
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_logging(self):
-    mock_stdout = io.BytesIO() if six.PY2 else io.StringIO()
-    model = keras.models.Sequential()
-    model.add(keras.layers.Dense(10, activation='relu'))
-    model.add(keras.layers.Dense(1, activation='sigmoid'))
-    model.compile(
-        RMSPropOptimizer(learning_rate=0.001), loss='binary_crossentropy')
-    with test.mock.patch.object(sys, 'stdout', mock_stdout):
-      model.fit(
-          np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'), epochs=10)
-    self.assertTrue('Epoch 5/10' in mock_stdout.getvalue())
+  @tf_test_util.run_deprecated_v1
+  def test_add_metric_with_tensor_on_model_in_graph_mode(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+
+      # test with a metric which does not have the standard signature:
+      # (y_true, y_pred, sample_Weight)
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse')
+
+      inputs = np.ones(shape=(10, 1))
+      targets = np.ones(shape=(10, 1))
+      history = model.fit(
+          inputs,
+          targets,
+          epochs=2,
+          batch_size=5,
+          validation_data=(inputs, targets))
+      self.assertEqual(history.history['metric_1'][-1], 5)
+      self.assertEqual(history.history['metric_2'][-1], 1)
+      self.assertEqual(history.history['val_metric_1'][-1], 5)
+      self.assertEqual(history.history['val_metric_2'][-1], 1)
+
+      eval_results = model.evaluate(inputs, targets, batch_size=5)
+      self.assertEqual(eval_results[-1], 1)
+      self.assertEqual(eval_results[-2], 5)
+
+      model.predict(inputs, batch_size=5)
+      model.train_on_batch(inputs, targets)
+      model.test_on_batch(inputs, targets)
 
-  def test_losses_in_defun(self):
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_in_model_call(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_2', aggregation='mean')
+        # Provide same name as in the instance created in __init__
+        # for eager mode
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 5, 0)
+    self.assertAlmostEqual(history.history['val_metric_2'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertAlmostEqual(eval_results[1], 1, 0)
+    self.assertAlmostEqual(eval_results[2], 5, 0)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  @keras_parameterized.run_all_keras_modes
+  def test_add_metric_in_layer_call(self):
+
+    class TestLayer(keras.layers.Layer):
+
+      def build(self, input_shape):
+        self.a = self.add_variable(
+            'a', (1, 1), initializer='ones', trainable=False)
+        self.built = True
+
+      def call(self, inputs):
+        self.add_metric(
+            math_ops.reduce_sum(inputs), name='metric_1', aggregation='mean')
+        return inputs + 1
+
+    model = keras.Sequential()
+    model.add(TestLayer(input_shape=(1,)))
+    model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual(history.history['metric_1'][-1], 5)
+    self.assertAlmostEqual(history.history['val_metric_1'][-1], 5, 0)
+
+  @tf_test_util.run_deprecated_v1
+  def test_model_metrics_list(self):
+    with self.cached_session():
+      x = keras.layers.Input(shape=(1,))
+      y = keras.layers.Dense(1, kernel_initializer='ones')(x)
+      model = keras.models.Model(x, y)
+      model.add_metric(
+          math_ops.reduce_sum(y), name='metric_1', aggregation='mean')
+      model.add_metric(metrics_module.Mean(name='metric_2')(y))
+      model.compile('sgd', loss='mse', metrics=['acc'])
+
+      # Verify that the metrics added using `compile` and `add_metric` API are
+      # included
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['binary_accuracy', 'metric_1', 'metric_2'])
+
+  def test_model_eager_metrics_list(self):
     with context.eager_mode():
-      layer = keras.layers.Dense(1, kernel_regularizer='l1')
-      layer(array_ops.ones([1, 10]))
 
-      @function.defun
-      def get_losses():
-        return layer.losses
+      class TestModel(keras.Model):
+
+        def __init__(self):
+          super(TestModel, self).__init__(name='test_model')
+          self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+        def call(self, x):
+          self.add_metric(
+              math_ops.reduce_sum(x), name='metric_1', aggregation='mean')
+          return self.dense1(x)
+
+      model = TestModel()
+      model.compile(
+          loss='mse',
+          optimizer=RMSPropOptimizer(0.01),
+          metrics=['acc'],
+          run_eagerly=True)
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+      self.assertEqual(model._compile_metrics, ['acc'])
+      names = []
+      for m in model.metrics:
+        if isinstance(m, metrics_module.Metric):
+          names.append(m.name)
+        else:
+          names.append(m.__name__)
+      self.assertEqual(names, ['categorical_accuracy', 'metric_1'])
+
+  @keras_parameterized.run_all_keras_modes
+  def test_multiple_add_metric_calls(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean1 = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_2')
+
+      def call(self, x):
+        self.add_metric(self.mean2(x), name='metric_2')
+        self.add_metric(self.mean1(x), name='metric_1')
+        self.add_metric(
+            math_ops.reduce_sum(x), name='metric_3', aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    history = model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertAlmostEqual(history.history['metric_1'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_2'][-1], 1, 0)
+    self.assertAlmostEqual(history.history['metric_3'][-1], 5, 0)
+
+    eval_results = model.evaluate(x, y, batch_size=5)
+    self.assertArrayNear(eval_results[1:4], [1, 1, 5], 0.1)
+
+    model.predict(x, batch_size=5)
+    model.train_on_batch(x, y)
+    model.test_on_batch(x, y)
+
+  def test_invalid_metric_tensor_in_call(self):
+    with context.eager_mode():
+
+      class TestLayer(keras.layers.Layer):
+
+        def call(self, inputs):
+          self.add_metric(metrics_module.Mean(name='metric_1')(inputs))
+          return inputs + 1
+
+      model = keras.Sequential()
+      model.add(TestLayer(input_shape=(1,)))
+      model.add(keras.layers.Dense(2, kernel_initializer='ones'))
+      model.compile(
+          loss='mse', optimizer=RMSPropOptimizer(0.01), run_eagerly=True)
+
+      x = np.ones(shape=(10, 1))
+      y = np.ones(shape=(10, 2))
+      with self.assertRaisesRegexp(
+          ValueError,
+          'We do not support adding an aggregated metric tensor in `call` in '
+          'eager execution.'):
+        model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_duplicate_metric_name_in_add_metric(self):
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+        self.mean = metrics_module.Mean(name='metric_1')
+        self.mean2 = metrics_module.Mean(name='metric_1')
+
+      def call(self, x):
+        self.add_metric(self.mean(x), name='metric_1')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    with self.assertRaisesRegexp(
+        ValueError,
+        'Please provide different names for the metrics you have added. '
+        'We found 2 metrics with the name: "metric_1"'):
+      model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+
+  @keras_parameterized.run_all_keras_modes
+  def test_multiple_no_name_input_to_add_metric(self):
+    # TODO(kaftan) Test seems to not work, file ticket
+    if testing_utils.should_run_eagerly() and context.executing_eagerly():
+      self.skipTest('Skipping running model eagerly.')
+
+    class TestModel(keras.Model):
+
+      def __init__(self):
+        super(TestModel, self).__init__(name='test_model')
+        self.dense1 = keras.layers.Dense(2, kernel_initializer='ones')
+
+      def call(self, x):
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        self.add_metric(math_ops.reduce_sum(x), aggregation='mean')
+        return self.dense1(x)
+
+    model = TestModel()
+    model.compile(loss='mse', optimizer=RMSPropOptimizer(0.01),
+                  run_eagerly=testing_utils.should_run_eagerly())
+    x = np.ones(shape=(10, 1))
+    y = np.ones(shape=(10, 2))
+    model.fit(x, y, epochs=2, batch_size=5, validation_data=(x, y))
+    self.assertEqual([m.name for m in model.metrics], ['mean', 'mean_1'])
 
-      self.assertAllEqual(self.evaluate(layer.losses),
-                          self.evaluate(get_losses()))
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/engine/training_utils.py b/tensorflow/python/keras/engine/training_utils.py
index e563b7a23dfa4bc58bdf8ee69a9e289aa284da23..01a09eb031eef20538d587e3f17a31ecbb5e5f9a 100644
--- a/tensorflow/python/keras/engine/training_utils.py
+++ b/tensorflow/python/keras/engine/training_utils.py
@@ -18,163 +18,176 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
 from collections import OrderedDict
 import copy
-import math
 
 import numpy as np
 import six
 
-from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.data.ops import iterator_ops
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras import callbacks as cbks
 from tensorflow.python.keras import losses
 from tensorflow.python.keras import metrics as metrics_module
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.utils import generic_utils
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.util import nest
 
 
-def _map_nested(data, func):
-  """Maps each nested element using func."""
-  if isinstance(data, list):
-    return [_map_nested(nested_data, func) for nested_data in data]
-  elif isinstance(data, tuple):
-    return tuple(_map_nested(nested_data, func) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _map_nested(nested_data, func) for k, nested_data in data.items()
-    }
-  else:
-    return func(data)
+@six.add_metaclass(abc.ABCMeta)
+class Aggregator(object):
+  """Abstract base class used to aggregate batch-level outputs of a loop.
 
+  Attributes:
+    use_steps: Whether the loop is using `step` or `batch_size`.
+    num_samples_or_steps: Either `batch_size*num_batches` or `steps`.
+    results: What to return at the end of the aggregation loop.
+  """
 
-def _nested_all(data, cond_func):
-  """Checks if all elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return all([_nested_all(nested_data, cond_func) for nested_data in data])
-  elif isinstance(data, dict):
-    return all(
-        [_nested_all(nested_data, cond_func) for nested_data in data.values()])
-  else:
-    return cond_func(data)
+  def __init__(self, use_steps, num_samples_or_steps):
+    self.use_steps = use_steps
+    self.num_samples_or_steps = num_samples_or_steps
+    self.results = []
 
+  @abc.abstractmethod
+  def create(self, batch_outs):
+    """Creates the initial results from the first batch outputs.
 
-def _nested_any(data, cond_func):
-  """Checks if any nested_elements in a nested structure satisfy cond_func."""
-  if isinstance(data, (tuple, list)):
-    return any([_nested_any(nested_data, cond_func) for nested_data in data])
-  elif isinstance(data, dict):
-    return any(
-        [_nested_any(nested_data, cond_func) for nested_data in data.values()])
-  else:
-    return cond_func(data)
-
-
-def _convert_lists_to_tuples(data):
-  """Converts all lists to tuples, since Datasets expect tuples."""
-  if isinstance(data, (tuple, list)):
-    return tuple(_convert_lists_to_tuples(nested_data) for nested_data in data)
-  elif isinstance(data, dict):
-    return {
-        k: _convert_lists_to_tuples(nested_data)
-        for k, nested_data in data.items()
-    }
-  else:
-    return data
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    """Aggregates batch-level results into total results.
 
-def _get_batch_axis_size(data):
-  """Returns batch axis shape for nested data."""
-  if isinstance(data, (tuple, list)):
-    return _get_batch_axis_size(data[0])
-  elif isinstance(data, dict):
-    return _get_batch_axis_size(list(data.values()))
-  else:
-    return int(data.shape[0])
+    Arguments:
+      batch_outs: A list of batch-level outputs.
+      batch_start: The start index of this batch. Always `None` if `use_steps`
+        is `True`.
+      batch_end: The end index of this batch. Always `None` if `use_steps` is
+        `True`.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
 
+  @abc.abstractmethod
+  def finalize(self):
+    """Prepares the total results to be returned."""
+    NotImplementedError('Must be implemented in subclasses.')
 
-def convert_to_iterator(x=None,
-                        y=None,
-                        sample_weights=None,
-                        batch_size=None,
-                        steps_per_epoch=None,
-                        epochs=1,
-                        shuffle=False,
-                        is_validation=False):
-  """Converts NumPy arrays or EagerTensors to an EagerIterator.
 
-  Combines all provided data into a single EagerIterator.
+class MetricsAggregator(Aggregator):
+  """Aggregator that calculates loss and metrics info."""
 
-  Arguments:
-      x: NumPy array or EagerTensor,  or list of Numpy arrays or EagerTensors
-        representing inputs to a model.
-      y: Optional. NumPy array or EagerTensor, or list of Numpy arrays or
-        EagerTensors representing targets of a model.
-      sample_weights: Optional NumPy array or EagerTensor representing sample
-        weights.
-      batch_size: Used to batch data and calculate how many steps EagerIterator
-        should take per epoch.
-      steps_per_epoch: If provided, how many steps EagerIterator should take per
-        epoch.
-      epochs: Epochs to repeat iterator for.
-      shuffle: Whether to shuffle data after each epoch.
-      is_validation: Whether this call is for validation during a training
-        (e.g., `fit()`) call. This info is used to construct error messages
-        (if any).
+  def create(self, batch_outs):
+    self.results = [0.] * len(batch_outs)
 
-  Raises:
-      ValueError: if steps_per_epoch cannot be calculated from the data
-      provided.
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    # Loss.
+    if self.use_steps:
+      self.results[0] += batch_outs[0]
+    else:
+      self.results[0] += batch_outs[0] * (batch_end - batch_start)
+    # Metrics (always stateful, just grab current values.)
+    self.results[1:] = batch_outs[1:]
 
-  Returns:
-      (Iterator, steps_per_epoch).
+  def finalize(self):
+    self.results[0] /= self.num_samples_or_steps
 
-  """
-  if isinstance(x, iterator_ops.EagerIterator):
-    return x, steps_per_epoch
 
-  if not _nested_any(sample_weights, lambda x: x is None):
-    data = (x, y, sample_weights)
-  elif not _nested_any(y, lambda x: x is None):
-    data = (x, y)
-  else:
-    # always wrap in a tuple, so we know y, sample_weights weren't set
-    # even when x has multiple elements
-    data = (x,)
-
-  data = _convert_lists_to_tuples(data)
-  if steps_per_epoch is None and batch_size is not None:
-    num_samples = _get_batch_axis_size(data)
-    steps_per_epoch = int(math.ceil(num_samples / batch_size))
-
-  if steps_per_epoch is None:
-    alternative_arg_name = (
-        'validation_steps' if is_validation else 'steps_per_epoch')
-    raise ValueError(
-        'Could not determine how to convert EagerTensors into EagerIterator. '
-        'Please provide either `batch_size` or '
-        '`%s`.' % alternative_arg_name)
+class OutputsAggregator(Aggregator):
+  """Aggregator that concatenates outputs."""
+
+  def create(self, batch_outs):
+    if self.use_steps:
+      # Cannot pre-allocate the returned NumPy arrays bc
+      # batch sizes are unknown. Concatenate batches at the end.
+      for _ in batch_outs:
+        self.results.append([])
+    else:
+      # Pre-allocate NumPy arrays.
+      for batch_out in batch_outs:
+        shape = (self.num_samples_or_steps,) + batch_out.shape[1:]
+        self.results.append(np.zeros(shape, dtype=batch_out.dtype))
+
+  def aggregate(self, batch_outs, batch_start=None, batch_end=None):
+    if self.use_steps:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i].append(batch_out)
+    else:
+      for i, batch_out in enumerate(batch_outs):
+        self.results[i][batch_start:batch_end] = batch_out
 
-  # TODO(omalleyt) for NumPy arrays in graph mode
-  # placeholder ops should be used
-  # this is only ideal for eager mode
-  dataset = dataset_ops.Dataset.from_tensor_slices(data)
+  def finalize(self):
+    if self.use_steps:
+      self.results = [np.concatenate(result, axis=0) for result in self.results]
 
-  if batch_size is not None:
-    dataset = dataset.batch(batch_size)
-  if shuffle:
-    dataset = dataset.shuffle(buffer_size=10000)
-  dataset = dataset.repeat(epochs)
-  iterator = dataset.make_one_shot_iterator()
 
-  return iterator, steps_per_epoch
+def make_logs(model, outputs, mode, prefix=''):
+  """Computes logs for sending to `on_batch_end` methods."""
+  logs = {}
+  # TODO(omalleyt): handle outputs in prediction when Callback
+  # hooks are ready.
+  if mode in ['train', 'test']:
+    if hasattr(model, 'metrics_names'):
+      for label, output in zip(model.metrics_names, outputs):
+        logs[prefix + label] = output
+  return logs
+
+
+def get_progbar(model, count_mode):
+  """Get Progbar."""
+  stateful_metric_names = None
+  if hasattr(model, 'metrics_names'):
+    stateful_metric_names = model.metrics_names[1:]  # Exclude `loss`
+  return cbks.ProgbarLogger(count_mode, stateful_metrics=stateful_metric_names)
+
+
+def slice_arrays(arrays, indices, contiguous=True):
+  """Slices batches out of provided arrays (workaround for eager tensors).
+
+  Unfortunately eager tensors don't have the same slicing behavior as
+  Numpy arrays (they follow the same slicing behavior as symbolic TF tensors),
+  hence we cannot use `generic_utils.slice_arrays` directly
+  and we have to implement this workaround based on `concat`. This has a
+  performance cost.
+
+  Arguments:
+    arrays: Single array or list of arrays.
+    indices: List of indices in the array that should be included in the output
+      batch.
+    contiguous: Boolean flag indicating whether the indices are contiguous.
+
+  Returns:
+    Slice of data (either single array or list of arrays).
+  """
+  converted_to_list = False
+  if not isinstance(arrays, list):
+    converted_to_list = True
+    arrays = [arrays]
+  if any(tensor_util.is_tensor(x) for x in arrays):
+    if not contiguous:
+      entries = [[x[i:i + 1] for i in indices] for x in arrays]
+      slices = [array_ops.concat(x, axis=0) for x in entries]
+    else:
+      slices = [x[indices[0]:indices[-1] + 1] for x in arrays]
+  else:
+    slices = generic_utils.slice_arrays(arrays, indices)
+
+  if converted_to_list:
+    slices = slices[0]
+  return slices
 
 
 def check_num_samples(ins,
@@ -219,14 +232,18 @@ def check_num_samples(ins,
   return None  # Edge case where ins == [static_learning_phase]
 
 
-def standardize_single_array(x):
+def standardize_single_array(x, expected_shape=None):
+  """Expand data of shape (x,) to (x, 1), unless len(expected_shape)==1."""
   if x is None:
     return None
-  if x.shape is not None and len(x.shape) == 1:
+
+  if (x.shape is not None
+      and len(x.shape) == 1
+      and (expected_shape is None or len(expected_shape) != 1)):
     if tensor_util.is_tensor(x):
-      return array_ops.expand_dims(x, axis=1)
+      x = array_ops.expand_dims(x, axis=1)
     else:
-      return np.expand_dims(x, 1)
+      x = np.expand_dims(x, 1)
   return x
 
 
@@ -288,7 +305,11 @@ def standardize_input_data(data,
   else:
     data = data.values if data.__class__.__name__ == 'DataFrame' else data
     data = [data]
-  data = [standardize_single_array(x) for x in data]
+  if shapes is not None:
+    data = [standardize_single_array(x, shape)
+            for (x, shape) in zip(data, shapes)]
+  else:
+    data = [standardize_single_array(x) for x in data]
 
   if len(data) != len(names):
     if data and hasattr(data[0], 'shape'):
@@ -629,15 +650,14 @@ def weighted_masked_objective(fn):
         weights = mask
       else:
         # Update dimensions of weights to match with mask if possible.
-        mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
-            mask, None, weights)
+        mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
         weights *= mask
 
     # Apply sample weighting.
     if weights is not None:
 
       # Update dimensions of weights to match with values if possible.
-      score_array, _, weights = metrics_module.squeeze_or_expand_dimensions(
+      score_array, _, weights = squeeze_or_expand_dimensions(
           score_array, None, weights)
       try:
         # Broadcast weights if possible.
@@ -651,7 +671,7 @@ def weighted_masked_objective(fn):
       score_array = math_ops.multiply(score_array, weights)
       score_array = math_ops.reduce_sum(score_array)
       weights = math_ops.reduce_sum(weights)
-      score_array = metrics_module.safe_div(score_array, weights)
+      score_array = math_ops.div_no_nan(score_array, weights)
     return K.mean(score_array)
 
   return weighted
@@ -835,12 +855,22 @@ def call_metric_function(metric_fn, y_true, y_pred, weights=None, mask=None):
     return metric_fn(y_true, y_pred, sample_weight=mask)
 
   # Update dimensions of weights to match with mask.
-  mask, _, weights = metrics_module.squeeze_or_expand_dimensions(
-      mask, None, weights)
+  mask, _, weights = squeeze_or_expand_dimensions(mask, None, weights)
   weights *= mask
   return metric_fn(y_true, y_pred, sample_weight=weights)
 
 
+def get_loss_function(loss):
+  """Returns the loss function corresponding to the given loss input."""
+  if loss is None or isinstance(loss, losses.Loss):
+    return loss
+
+  # TODO(psv): After we have added all V2 losses, update this function.
+  if loss in ['mse', 'MSE', 'mean_squared_error']:
+    return losses.MeanSquaredError()
+  return losses.get(loss)
+
+
 def validate_iterator_input(x, y, sample_weight, validation_split=None):
   """Validates user input arguments when a dataset iterator is passed.
 
@@ -1053,9 +1083,11 @@ class ModelInputs(object):
     self._inputs = inputs
     self._is_dict = isinstance(self._inputs, dict)
     self._is_single_input = not isinstance(self._inputs, (list, tuple, dict))
+
     self._flattened_inputs = []
     self._input_names = []
-    if isinstance(self._inputs, dict):
+
+    if self._is_dict:
       for k in sorted(self._inputs.keys()):
         self._flattened_inputs.append(self._inputs[k])
         self._input_names.append(k)
@@ -1064,7 +1096,6 @@ class ModelInputs(object):
       self._input_names = [
           'input_%d' % (i + 1) for i in range(len(self._flattened_inputs))
       ]
-    assert len(self._input_names) == len(self._flattened_inputs)
 
   def get_input_names(self):
     """Returns keys to name inputs by.
@@ -1074,56 +1105,32 @@ class ModelInputs(object):
     """
     return self._input_names
 
-  def _get(self, return_single_as_list=False):
-    """Returns provided inputs, potentially transformed.
-
-    Inputs are returned in the same format they were provided i.e. lists
-    are returned as lists, single entries as single entries (unless
-    `return_single_as_list` is true), dictionaries as dictionaries.
-
-    Args:
-      return_single_as_list: Returns a list of size 1 for single entry case.
-    """
-    if self._is_dict:
-      return dict(zip(self._input_names, self._flattened_inputs))
-    if self._is_single_input and not return_single_as_list:
-      return self._flattened_inputs[0]
-    return self._flattened_inputs
-
-  def get_input_values(self):
-    """Returns input values passed in."""
-    if context.executing_eagerly():
-      for i in range(len(self._flattened_inputs)):
-        v = self._flattened_inputs[i]
-        if tensor_util.is_tensor(v):
-          v = cast_single_tensor(v)
-        else:
-          v = ops.convert_to_tensor(v, dtype=K.floatx())
-        self._flattened_inputs[i] = v
-    return self._get(return_single_as_list=False)
-
   def get_symbolic_inputs(self, return_single_as_list=False):
     """Returns inputs to be set as self.inputs for a model."""
     for i in range(len(self._flattened_inputs)):
       k = self._input_names[i]
       v = self._flattened_inputs[i]
-      if context.executing_eagerly():
-        v = K.placeholder((None,) + tuple(v.shape[1:]), name=k)
-      else:
-        if isinstance(v, list):
-          v = np.asarray(v)
-          if v.ndim == 1:
-            v = np.expand_dims(v, 1)
-        if isinstance(v, (np.ndarray)):
-          # We fix the placeholder shape except the batch size.
-          # This is suboptimal, but it is the best we can do with the info
-          # we have. The user should call `model._set_inputs(placeholders)`
-          # to specify custom placeholders if the need arises.
-          shape = (None,) + v.shape[1:]
-          v = K.placeholder(shape=shape, name=k)
+      if isinstance(v, (list, float, int)):
+        v = np.asarray(v)
+        if v.ndim == 1:
+          v = np.expand_dims(v, 1)
+      if isinstance(v, (np.ndarray, ops.EagerTensor)):
+        # We fix the placeholder shape except the batch size.
+        # This is suboptimal, but it is the best we can do with the info
+        # we have. The user should call `model._set_inputs(placeholders)`
+        # to specify custom placeholders if the need arises.
+        shape = (None,) + tuple(v.shape[1:])
+        v = K.placeholder(shape=shape, name=k)
+      elif isinstance(v, tensor_shape.TensorShape):
+        shape = (None,) + tuple(v.as_list()[1:])
+        v = K.placeholder(shape=shape, name=k)
       self._flattened_inputs[i] = v
 
-    return self._get(return_single_as_list)
+    if self._is_dict:
+      return dict(zip(self._input_names, self._flattened_inputs))
+    if self._is_single_input and not return_single_as_list:
+      return self._flattened_inputs[0]
+    return self._flattened_inputs
 
   def as_dict(self):
     """An iterable over a dictionary version of inputs."""
@@ -1133,3 +1140,54 @@ class ModelInputs(object):
   def as_list(self):
     """Returning the inputs as a list."""
     return self._flattened_inputs
+
+
+# Allow use of methods not exposed to the user.
+# pylint: disable=protected-access
+def get_input_shape_and_dtype(layer):
+  """Retrieves input shape and input dtype of layer if applicable.
+
+  Args:
+    layer: Layer (or model) instance.
+
+  Returns:
+    Tuple (input_shape, input_dtype). Both could be None if the layer
+      does not have a defined input shape.
+
+  Raises:
+    ValueError: in case an empty Sequential or Graph Network is passed.
+  """
+
+  def _is_graph_model(layer):
+    return ((hasattr(layer, '_is_graph_network') and layer._is_graph_network) or
+            layer.__class__.__name__ == 'Sequential')
+
+  # In case of nested models: recover the first layer
+  # of the deepest model to infer input shape and dtype.
+  # Subclassed Models may not have been built so can't be checked.
+  while _is_graph_model(layer):
+    if not layer.layers:
+      raise ValueError('An empty Model cannot be used as a Layer.')
+    layer = layer.layers[0]
+
+  if hasattr(layer, '_batch_input_shape'):
+    return layer._batch_input_shape, layer.dtype
+  return None, None
+
+
+# pylint: enable=protected-access
+
+
+def get_static_batch_size(layer):
+  """Gets the static batch size of a Layer.
+
+  Arguments:
+    layer: a `Layer` instance.
+
+  Returns:
+    The static batch size of a Layer.
+  """
+  batch_input_shape, _ = get_input_shape_and_dtype(layer)
+  if batch_input_shape is not None:
+    return tensor_shape.as_dimension(batch_input_shape[0]).value
+  return None
diff --git a/tensorflow/python/keras/engine/training_utils_test.py b/tensorflow/python/keras/engine/training_utils_test.py
index 7b217cf37323e7013013817035fe60ab295b8c4f..44ea23998fe6f3b614fb09b9667add179cf3fd85 100644
--- a/tensorflow/python/keras/engine/training_utils_test.py
+++ b/tensorflow/python/keras/engine/training_utils_test.py
@@ -21,185 +21,39 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.eager import context
-from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
-from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import training_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.platform import test
 
 
-class TrainingUtilTest(test.TestCase):
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_numpy(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_single_tensor(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_y(self):
-    batch_size = 2
-    a = np.ones([10, 100])
-    b = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    actual_x, actual_y = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_sample_weights(self):
-    batch_size = 2
-    a = ops.convert_to_tensor(np.ones([10, 100]))
-    b = ops.convert_to_tensor(np.ones([10, 10]))
-    sw = ops.convert_to_tensor(np.ones([10]))
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, y=b, sample_weights=sw, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x = a[:batch_size, :]
-    expected_y = b[:batch_size, :]
-    expected_sw = sw[:batch_size]
-    actual_x, actual_y, actual_sw = iterator.get_next()
-    self.assertAllEqual(expected_x, actual_x)
-    self.assertAllEqual(expected_y, actual_y)
-    self.assertAllEqual(expected_sw, actual_sw)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_nested(self):
-    batch_size = 2
-    x = {'1': np.ones([10, 100]), '2': [np.zeros([10, 10]), np.ones([10, 20])]}
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=x, batch_size=batch_size)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_x1 = x['1'][:batch_size, :]
-    expected_x2_0 = x['2'][0][:batch_size, :]
-    expected_x2_1 = x['2'][1][:batch_size, :]
-
-    actual_x, = iterator.get_next()
-    actual_x1 = actual_x['1'][:batch_size, :]
-    actual_x2_0 = actual_x['2'][0][:batch_size, :]
-    actual_x2_1 = actual_x['2'][1][:batch_size, :]
-
-    self.assertAllEqual(expected_x1, actual_x1)
-    self.assertAllEqual(expected_x2_0, actual_x2_0)
-    self.assertAllEqual(expected_x2_1, actual_x2_1)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_epochs(self):
-    batch_size = 2
-    a = np.ones([10, 10])
-    iterator, steps_per_epoch = training_utils.convert_to_iterator(
-        x=a, batch_size=batch_size, epochs=2)
-    self.assertEquals(steps_per_epoch, 5)
-
-    expected_batch = a[:batch_size, :]
-    # loop through one whole epoch
-    for _ in range(6):
-      actual_batch, = iterator.get_next()
-    self.assertAllEqual(expected_batch, actual_batch)
-
-  @test_util.run_in_graph_and_eager_modes
-  def test_convert_to_iterator_insufficient_info(self):
-    # with batch_size and steps_per_epoch not set
-    with self.assertRaises(ValueError):
-      a = np.ones([10, 10])
-      _ = training_utils.convert_to_iterator(x=a)
-
-  def test_nested_all(self):
-    nested_data = {'a': True, 'b': [True, True, (False, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, False)
-
-    nested_data = {'a': True, 'b': [True, True, (True, True)]}
-    all_true = training_utils._nested_all(nested_data, lambda x: x)
-    self.assertEquals(all_true, True)
-
-  def test_nested_any(self):
-    nested_data = [False, {'a': False, 'b': (False, True)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, True)
-
-    nested_data = [False, {'a': False, 'b': (False, False)}]
-    any_true = training_utils._nested_any(nested_data, lambda x: x)
-    self.assertEquals(any_true, False)
-
-  def test_check_array_lengths(self):
-    training_utils.check_array_lengths(None, None, None)
-    a_np = np.random.random((4, 3, 3))
-    training_utils.check_array_lengths(a_np, a_np, a_np)
-    training_utils.check_array_lengths(
-        [a_np, a_np], [a_np, a_np], [a_np, a_np])
-    training_utils.check_array_lengths([None], [None], [None])
-
-    b_np = np.random.random((3, 4))
-    with self.assertRaises(ValueError):
-      training_utils.check_array_lengths([a_np], [b_np], None)
-
-
 class ModelInputsTest(test.TestCase):
 
   def test_single_thing(self):
     a = np.ones(10)
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(10), vals)
-    self.assertFalse(tensor_util.is_tensor(vals))
+    self.assertEqual(['input_1'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals))
     vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-    self.assertEquals(1, len(vals))
+    self.assertEqual(1, len(vals))
     self.assertTrue(tensor_util.is_tensor(vals[0]))
 
   def test_single_thing_eager(self):
     with context.eager_mode():
       a = np.ones(10)
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1'], model_inputs.get_input_names())
-      val = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(10), val)
-      self.assertTrue(tensor_util.is_tensor(val))
+      self.assertEqual(['input_1'], model_inputs.get_input_names())
       val = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(val))
       vals = model_inputs.get_symbolic_inputs(return_single_as_list=True)
-      self.assertEquals(1, len(vals))
+      self.assertEqual(1, len(vals))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
 
   def test_list(self):
     a = [np.ones(10), np.ones(20)]
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertEqual(2, len(vals))
-    self.assertAllEqual(np.ones(10), vals[0])
-    self.assertAllEqual(np.ones(20), vals[1])
-    self.assertFalse(tensor_util.is_tensor(vals[0]))
-    self.assertFalse(tensor_util.is_tensor(vals[1]))
+    self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals[0]))
     self.assertTrue(tensor_util.is_tensor(vals[1]))
@@ -208,13 +62,7 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = [np.ones(10), np.ones(20)]
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['input_1', 'input_2'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertEqual(2, len(vals))
-      self.assertAllEqual(np.ones(10), vals[0])
-      self.assertAllEqual(np.ones(20), vals[1])
-      self.assertTrue(tensor_util.is_tensor(vals[0]))
-      self.assertTrue(tensor_util.is_tensor(vals[1]))
+      self.assertEqual(['input_1', 'input_2'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[0]))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals[1]))
@@ -222,12 +70,7 @@ class ModelInputsTest(test.TestCase):
   def test_dict(self):
     a = {'b': np.ones(10), 'a': np.ones(20)}
     model_inputs = training_utils.ModelInputs(a)
-    self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-    vals = model_inputs.get_input_values()
-    self.assertAllEqual(np.ones(20), vals['a'])
-    self.assertAllEqual(np.ones(10), vals['b'])
-    self.assertFalse(tensor_util.is_tensor(vals['a']))
-    self.assertFalse(tensor_util.is_tensor(vals['b']))
+    self.assertEqual(['a', 'b'], model_inputs.get_input_names())
     vals = model_inputs.get_symbolic_inputs()
     self.assertTrue(tensor_util.is_tensor(vals['a']))
     self.assertTrue(tensor_util.is_tensor(vals['b']))
@@ -236,12 +79,7 @@ class ModelInputsTest(test.TestCase):
     with context.eager_mode():
       a = {'b': np.ones(10), 'a': np.ones(20)}
       model_inputs = training_utils.ModelInputs(a)
-      self.assertEquals(['a', 'b'], model_inputs.get_input_names())
-      vals = model_inputs.get_input_values()
-      self.assertAllEqual(np.ones(20), vals['a'])
-      self.assertAllEqual(np.ones(10), vals['b'])
-      self.assertTrue(tensor_util.is_tensor(vals['a']))
-      self.assertTrue(tensor_util.is_tensor(vals['b']))
+      self.assertEqual(['a', 'b'], model_inputs.get_input_names())
       vals = model_inputs.get_symbolic_inputs()
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['a']))
       self.assertTrue(tf_utils.is_symbolic_tensor(vals['b']))
diff --git a/tensorflow/python/keras/estimator/__init__.py b/tensorflow/python/keras/estimator/__init__.py
index b244beb5b58cf339a4687216b87418c88b953c17..dcd0600897005f1905b5f6b65cdc0f225172fa1b 100644
--- a/tensorflow/python/keras/estimator/__init__.py
+++ b/tensorflow/python/keras/estimator/__init__.py
@@ -24,23 +24,54 @@ from tensorflow.python.util.tf_export import tf_export
 # As long as you depend //third_party/py/tensorflow:tensorflow target
 # everything will work as normal.
 
-try:
-  from tensorflow.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      keras_lib.model_to_estimator)
-except Exception:  # pylint: disable=broad-except
-
-  # pylint: disable=unused-argument
-  def stub_model_to_estimator(keras_model=None,
-                              keras_model_path=None,
-                              custom_objects=None,
-                              model_dir=None,
-                              config=None):
+
+# LINT.IfChange
+@tf_export('keras.estimator.model_to_estimator')
+def model_to_estimator(
+    keras_model=None,
+    keras_model_path=None,
+    custom_objects=None,
+    model_dir=None,
+    config=None):
+  """Constructs an `Estimator` instance from given keras model.
+
+  For usage example, please see:
+  [Creating estimators from Keras
+  Models](https://tensorflow.org/guide/estimators#model_to_estimator).
+
+  Args:
+    keras_model: A compiled Keras model object. This argument is mutually
+      exclusive with `keras_model_path`.
+    keras_model_path: Path to a compiled Keras model saved on disk, in HDF5
+      format, which can be generated with the `save()` method of a Keras model.
+      This argument is mutually exclusive with `keras_model`.
+    custom_objects: Dictionary for custom objects.
+    model_dir: Directory to save `Estimator` model parameters, graph, summary
+      files for TensorBoard, etc.
+    config: `RunConfig` to config `Estimator`.
+
+  Returns:
+    An Estimator from given keras model.
+
+  Raises:
+    ValueError: if neither keras_model nor keras_model_path was given.
+    ValueError: if both keras_model and keras_model_path was given.
+    ValueError: if the keras_model_path is a GCS URI.
+    ValueError: if keras_model has not been compiled.
+  """
+  try:
+    from tensorflow_estimator.python.estimator import keras as keras_lib  # pylint: disable=g-import-not-at-top
+  except ImportError:
     raise NotImplementedError(
         'tf.keras.estimator.model_to_estimator function not available in your '
         'installation.')
-  # pylint: enable=unused-argument
+  return keras_lib.model_to_estimator(
+      keras_model=keras_model,
+      keras_model_path=keras_model_path,
+      custom_objects=custom_objects,
+      model_dir=model_dir,
+      config=config)
+
+# LINT.ThenChange(//third_party/tensorflow_estimator/python/estimator/keras.py)
 
-  model_to_estimator = tf_export('keras.estimator.model_to_estimator')(
-      stub_model_to_estimator)
 
diff --git a/tensorflow/python/keras/initializers_test.py b/tensorflow/python/keras/initializers_test.py
index 2b758a98f30fee7cb9385db93a97e7a132c3b816..4f91bea1e331f0b52a4f34fc848b3d51509e1360 100644
--- a/tensorflow/python/keras/initializers_test.py
+++ b/tensorflow/python/keras/initializers_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.ops import init_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -38,6 +39,7 @@ class KerasInitializersTest(test.TestCase):
     output_2 = keras.backend.get_value(variable)
     self.assertAllClose(output, output_2, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def test_uniform(self):
     tensor_shape = (9, 6, 7)
     with self.cached_session():
@@ -47,6 +49,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_max=1, target_min=-1)
 
+  @test_util.run_deprecated_v1
   def test_normal(self):
     tensor_shape = (8, 12, 99)
     with self.cached_session():
@@ -54,6 +57,7 @@ class KerasInitializersTest(test.TestCase):
                    tensor_shape,
                    target_mean=0., target_std=1)
 
+  @test_util.run_deprecated_v1
   def test_truncated_normal(self):
     tensor_shape = (12, 99, 7)
     with self.cached_session():
@@ -69,6 +73,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.Constant(2), tensor_shape,
                    target_mean=2, target_max=2, target_min=2)
 
+  @test_util.run_deprecated_v1
   def test_lecun_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -77,6 +82,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.lecun_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_glorot_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -85,6 +91,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.glorot_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_he_uniform(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -93,6 +100,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.he_uniform(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_lecun_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -101,6 +109,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.lecun_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_glorot_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -109,6 +118,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.glorot_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_he_normal(self):
     tensor_shape = (5, 6, 4, 2)
     with self.cached_session():
@@ -117,6 +127,7 @@ class KerasInitializersTest(test.TestCase):
       self._runner(keras.initializers.he_normal(seed=123), tensor_shape,
                    target_mean=0., target_std=std)
 
+  @test_util.run_deprecated_v1
   def test_orthogonal(self):
     tensor_shape = (20, 20)
     with self.cached_session():
diff --git a/tensorflow/python/keras/integration_test.py b/tensorflow/python/keras/integration_test.py
index 3c0f73b1c3aab037164f612e0e9b3a2fc7b32385..c516514f63270a9507101209680c1be221ba3f99 100644
--- a/tensorflow/python/keras/integration_test.py
+++ b/tensorflow/python/keras/integration_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python import keras
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.layers import core as tf_core_layers
 from tensorflow.python.ops import nn
@@ -34,6 +35,7 @@ class KerasIntegrationTest(test.TestCase):
   def test_version(self):
     self.assertTrue(keras.__version__.endswith('-tf'))
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -59,6 +61,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_vector_classification_functional(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -83,6 +86,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_temporal_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -105,6 +109,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_deprecated_v1
   def test_temporal_classification_sequential_tf_rnn(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -129,6 +134,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_image_classification_sequential(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -163,6 +169,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_video_classification_functional(self):
     with self.cached_session():
       np.random.seed(1337)
@@ -191,6 +198,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_shared_sequential(self):
     # Test that Sequential models that feature internal updates
     # and internal losses can be shared.
@@ -225,6 +233,7 @@ class KerasIntegrationTest(test.TestCase):
                           verbose=2)
       self.assertGreater(history.history['val_acc'][-1], 0.7)
 
+  @test_util.run_v1_only('b/120545219')
   def test_vector_classification_shared_model(self):
     # Test that functional models that feature internal updates
     # and internal losses can be shared.
diff --git a/tensorflow/python/keras/keras_parameterized.py b/tensorflow/python/keras/keras_parameterized.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76bbadeb3613a8e71b1a6fc313fb7e68630de93
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized.py
@@ -0,0 +1,298 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for unit-testing Keras."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import functools
+import itertools
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import test
+from tensorflow.python.util import nest
+
+
+class TestCase(test.TestCase, parameterized.TestCase):
+
+  def tearDown(self):
+    keras.backend.clear_session()
+    super(TestCase, self).tearDown()
+
+
+# TODO(kaftan): Possibly enable 'subclass_custom_build' when tests begin to pass
+# it. Or perhaps make 'subclass' always use a custom build method.
+def run_with_all_model_types(
+    test_or_class=None,
+    exclude_models=None):
+  """Execute the decorated test with all Keras model types.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times - once
+  for each Keras model type.
+
+  The Keras model types are: ['functional', 'subclass', 'sequential']
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  Various methods in `testing_utils` to get models will auto-generate a model
+  of the currently active Keras model type. This allows unittests to confirm
+  the equivalence between different Keras models.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_with_all_model_types(
+      exclude_models = ['sequential'])
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test tries building a small mlp as both a functional model and as a
+  subclass model.
+
+  We can also annotate the whole class if we want this to apply to all tests in
+  the class:
+  ```python
+  @testing_utils.run_with_all_model_types(exclude_models = ['sequential'])
+  class MyTests(testing_utils.KerasTestCase):
+
+    def test_foo(self):
+      model = testing_utils.get_small_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics)
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    exclude_models: A collection of Keras model types to not run.
+      (May also be a single model type not wrapped in a collection).
+      Defaults to None.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times:
+    once for each desired Keras model type.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  model_types = ['functional', 'subclass', 'sequential']
+  params = [('_%s' % model, model) for model in model_types
+            if model not in nest.flatten(exclude_models)]
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, model_type, *args, **kwargs):
+      """A run of a single test case w/ the specified model type."""
+      with testing_utils.model_type_scope(model_type):
+        f(self, *args, **kwargs)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def run_all_keras_modes(
+    test_or_class=None,
+    config=None,
+    always_skip_v1=False):
+  """Execute the decorated test with all keras execution modes.
+
+  This decorator is intended to be applied either to individual test methods in
+  a `keras_parameterized.TestCase` class, or directly to a test class that
+  extends it. Doing so will cause the contents of the individual test
+  method (or all test methods in the class) to be executed multiple times -
+  once executing in legacy graph mode, once running eagerly and with
+  `should_run_eagerly` returning True, and once running eagerly with
+  `should_run_eagerly` returning False.
+
+  If Tensorflow v2 behavior is enabled, legacy graph mode will be skipped, and
+  the test will only run twice.
+
+  Note: if stacking this decorator with absl.testing's parameterized decorators,
+  those should be at the bottom of the stack.
+
+  For example, consider the following unittest:
+
+  ```python
+  class MyTests(testing_utils.KerasTestCase):
+
+    @testing_utils.run_all_keras_modes
+    def test_foo(self):
+      model = testing_utils.get_small_functional_mlp(1, 4, input_dim=3)
+      optimizer = RMSPropOptimizer(learning_rate=0.001)
+      loss = 'mse'
+      metrics = ['mae']
+      model.compile(optimizer, loss, metrics=metrics,
+                    run_eagerly=testing_utils.should_run_eagerly())
+
+      inputs = np.zeros((10, 3))
+      targets = np.zeros((10, 4))
+      dataset = dataset_ops.Dataset.from_tensor_slices((inputs, targets))
+      dataset = dataset.repeat(100)
+      dataset = dataset.batch(10)
+
+      model.fit(dataset, epochs=1, steps_per_epoch=2, verbose=1)
+
+  if __name__ == "__main__":
+    tf.test.main()
+  ```
+
+  This test will try compiling & fitting the small functional mlp using all
+  three Keras execution modes.
+
+  Args:
+    test_or_class: test method or class to be annotated. If None,
+      this method returns a decorator that can be applied to a test method or
+      test class. If it is not None this returns the decorator applied to the
+      test or class.
+    config: An optional config_pb2.ConfigProto to use to configure the
+      session when executing graphs.
+    always_skip_v1: If True, does not try running the legacy graph mode even
+      when Tensorflow v2 behavior is not enabled.
+
+  Returns:
+    Returns a decorator that will run the decorated test method multiple times.
+
+  Raises:
+    ImportError: If abseil parameterized is not installed or not included as
+      a target dependency.
+  """
+  params = [('_v2_eager', 'v2_eager'),
+            ('_v2_function', 'v2_function')]
+  if not (always_skip_v1 or tf2.enabled()):
+    params.append(('_v1_graph', 'v1_graph'))
+
+  def single_method_decorator(f):
+    """Decorator that constructs the test cases."""
+
+    # Use named_parameters so it can be individually run from the command line
+    @parameterized.named_parameters(*params)
+    @functools.wraps(f)
+    def decorated(self, run_mode, *args, **kwargs):
+      """A run of a single test case w/ specified run mode."""
+      if run_mode == 'v1_graph':
+        with context.graph_mode(), testing_utils.run_eagerly_scope(False):
+          with self.test_session(use_gpu=True, config=config):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_function':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(False):
+            f(self, *args, **kwargs)
+      elif run_mode == 'v2_eager':
+        with context.eager_mode():
+          with testing_utils.run_eagerly_scope(True):
+            f(self, *args, **kwargs)
+      else:
+        return ValueError('Unknown run mode %s' % run_mode)
+
+    return decorated
+
+  return _test_or_class_decorator(test_or_class, single_method_decorator)
+
+
+def _test_or_class_decorator(test_or_class, single_method_decorator):
+  """Decorate a test or class with a decorator intended for one method.
+
+  If the test_or_class is a class:
+    This will apply the decorator to all test methods in the class.
+
+  If the test_or_class is an iterable of already-parameterized test cases:
+    This will apply the decorator to all the cases, and then flatten the
+    resulting cross-product of test cases. This allows stacking the Keras
+    parameterized decorators w/ each other, and to apply them to test methods
+    that have already been marked with an absl parameterized decorator.
+
+  Otherwise, treat the obj as a single method and apply the decorator directly.
+
+  Args:
+    test_or_class: A test method (that may have already been decorated with a
+      parameterized decorator, or a test class that extends
+      keras_parameterized.TestCase
+    single_method_decorator:
+      A parameterized decorator intended for a single test method.
+  Returns:
+    The decorated result.
+  """
+  def _decorate_test_or_class(obj):
+    if isinstance(obj, collections.Iterable):
+      return itertools.chain.from_iterable(
+          single_method_decorator(method) for method in obj)
+    if isinstance(obj, type):
+      cls = obj
+      for name, value in cls.__dict__.copy().items():
+        if callable(value) and name.startswith(
+            unittest.TestLoader.testMethodPrefix):
+          setattr(cls, name, single_method_decorator(value))
+
+      cls = type(cls).__new__(type(cls), cls.__name__, cls.__bases__,
+                              cls.__dict__.copy())
+      return cls
+
+    return single_method_decorator(obj)
+
+  if test_or_class is not None:
+    return _decorate_test_or_class(test_or_class)
+
+  return _decorate_test_or_class
diff --git a/tensorflow/python/keras/keras_parameterized_test.py b/tensorflow/python/keras/keras_parameterized_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff40cfc7a17114fad20a51f29a6aed89b56015
--- /dev/null
+++ b/tensorflow/python/keras/keras_parameterized_test.py
@@ -0,0 +1,552 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras testing_utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+
+from absl.testing import parameterized
+
+from tensorflow.python import keras
+from tensorflow.python import tf2
+from tensorflow.python.eager import context
+from tensorflow.python.keras import keras_parameterized
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.platform import googletest
+
+
+class KerasParameterizedTest(keras_parameterized.TestCase):
+
+  def test_run_with_all_model_types(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_functional()
+    e.testBody_subclass()
+    e.testBody_sequential()
+
+    self.assertLen(model_types, 3)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass",
+        "sequential"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 6)
+
+  def test_run_with_all_model_types_and_extra_params(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        model_types.append((with_brackets, testing_utils.get_model_type()))
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    e.testBody_0_functional()
+    e.testBody_0_subclass()
+    e.testBody_0_sequential()
+    e.testBody_1_functional()
+    e.testBody_1_subclass()
+    e.testBody_1_sequential()
+
+    self.assertLen(model_types, 6)
+    self.assertAllEqual(model_types, [
+        ("with_brackets", "functional"),
+        ("with_brackets", "subclass"),
+        ("with_brackets", "sequential"),
+        ("without_brackets", "functional"),
+        ("without_brackets", "subclass"),
+        ("without_brackets", "sequential"),
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+    self.assertIsInstance(models[2], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 12)
+
+  def test_run_with_all_model_types_exclude_one(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(exclude_models="sequential")
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 2)
+    self.assertAllEqual(model_types, [
+        "functional",
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertTrue(models[0]._is_graph_network)
+    self.assertFalse(models[1]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+    self.assertNotIsInstance(models[1], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 4)
+
+  def test_run_with_all_model_types_exclude_multiple(self):
+    model_types = []
+    models = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types(
+          exclude_models=["sequential", "functional"])
+      def testBody(self):
+        model_types.append(testing_utils.get_model_type())
+        models.append(testing_utils.get_small_mlp(1, 4, input_dim=3))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_functional"):
+      e.testBody_functional()
+    if hasattr(e, "testBody_subclass"):
+      e.testBody_subclass()
+    if hasattr(e, "testBody_sequential"):
+      e.testBody_sequential()
+
+    self.assertLen(model_types, 1)
+    self.assertAllEqual(model_types, [
+        "subclass"
+    ])
+
+    # Validate that the models are what they should be
+    self.assertFalse(models[0]._is_graph_network)
+    self.assertNotIsInstance(models[0], keras.models.Sequential)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(model_types, 2)
+
+  def test_run_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_v1_graph()
+    e.testBody_v2_eager()
+    e.testBody_v2_function()
+
+    if not tf2.enabled():
+      self.assertLen(l, 3)
+      self.assertAllEqual(l, [
+          ("graph", False),
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 6)
+    else:
+      self.assertLen(l, 2)
+      self.assertAllEqual(l, [
+          ("eager", True),
+          ("eager", False),
+      ])
+
+      ts = unittest.makeSuite(ExampleTest)
+      res = unittest.TestResult()
+      ts.run(res)
+      self.assertLen(l, 4)
+
+  def test_run_all_keras_modes_extra_params(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(
+          [dict(testcase_name="_0", with_brackets=True),
+           dict(testcase_name="_1", with_brackets=False)])
+      def testBody(self, with_brackets):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        with_brackets = "with_brackets" if with_brackets else "without_brackets"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((with_brackets, mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if not tf2.enabled():
+      e.testBody_0_v1_graph()
+      e.testBody_1_v1_graph()
+
+    e.testBody_0_v2_eager()
+    e.testBody_0_v2_function()
+    e.testBody_1_v2_eager()
+    e.testBody_1_v2_function()
+
+    expected_combinations = {
+        ("with_brackets", "eager", True),
+        ("with_brackets", "eager", False),
+        ("without_brackets", "eager", True),
+        ("without_brackets", "eager", False),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("with_brackets", "graph", False),
+          ("without_brackets", "graph", False),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_always_skip_v1(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes(always_skip_v1=True)
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly))
+
+    e = ExampleTest()
+    if hasattr(e, "testBody_v1_graph"):
+      e.testBody_v1_graph()
+    if hasattr(e, "testBody_v2_eager"):
+      e.testBody_v2_eager()
+    if hasattr(e, "testBody_v2_function"):
+      e.testBody_v2_function()
+
+    self.assertLen(l, 2)
+    self.assertEqual(set(l), {
+        ("eager", True),
+        ("eager", False),
+    })
+
+  def test_run_all_keras_modes_with_all_model_types(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_with_all_model_types
+      @keras_parameterized.run_all_keras_modes
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_v2_eager_functional()
+    e.testBody_v2_function_functional()
+    e.testBody_v2_eager_sequential()
+    e.testBody_v2_function_sequential()
+    e.testBody_v2_eager_subclass()
+    e.testBody_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_v1_graph_functional()
+      e.testBody_v1_graph_sequential()
+      e.testBody_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_model_types_with_all_keras_modes(self):
+    l = []
+
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @keras_parameterized.run_with_all_model_types
+      def testBody(self):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_functional_v2_eager()
+    e.testBody_functional_v2_function()
+    e.testBody_sequential_v2_eager()
+    e.testBody_sequential_v2_function()
+    e.testBody_subclass_v2_eager()
+    e.testBody_subclass_v2_function()
+
+    if not tf2.enabled():
+      e.testBody_functional_v1_graph()
+      e.testBody_sequential_v1_graph()
+      e.testBody_subclass_v1_graph()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    @keras_parameterized.run_all_keras_modes
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  def test_run_all_keras_modes_with_all_model_types_annotate_class_2(self):
+    l = []
+
+    @keras_parameterized.run_with_all_model_types
+    class ExampleTest(keras_parameterized.TestCase):
+
+      def runTest(self):
+        pass
+
+      @keras_parameterized.run_all_keras_modes
+      @parameterized.named_parameters(dict(testcase_name="_arg",
+                                           arg=True))
+      def testBody(self, arg):
+        mode = "eager" if context.executing_eagerly() else "graph"
+        should_run_eagerly = testing_utils.should_run_eagerly()
+        l.append((mode, should_run_eagerly, testing_utils.get_model_type()))
+
+    e = ExampleTest()
+    e.testBody_arg_v2_eager_functional()
+    e.testBody_arg_v2_function_functional()
+    e.testBody_arg_v2_eager_sequential()
+    e.testBody_arg_v2_function_sequential()
+    e.testBody_arg_v2_eager_subclass()
+    e.testBody_arg_v2_function_subclass()
+
+    if not tf2.enabled():
+      e.testBody_arg_v1_graph_functional()
+      e.testBody_arg_v1_graph_sequential()
+      e.testBody_arg_v1_graph_subclass()
+
+    expected_combinations = {
+        ("eager", True, "functional"),
+        ("eager", False, "functional"),
+        ("eager", True, "sequential"),
+        ("eager", False, "sequential"),
+        ("eager", True, "subclass"),
+        ("eager", False, "subclass"),
+    }
+
+    if not tf2.enabled():
+      expected_combinations = expected_combinations.union({
+          ("graph", False, "functional"),
+          ("graph", False, "sequential"),
+          ("graph", False, "subclass"),
+      })
+
+    self.assertLen(l, len(expected_combinations))
+    self.assertEqual(set(l), expected_combinations)
+
+    ts = unittest.makeSuite(ExampleTest)
+    res = unittest.TestResult()
+    ts.run(res)
+
+    self.assertLen(l, len(expected_combinations) * 2)
+
+  @keras_parameterized.run_all_keras_modes
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_all_keras_modes_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+  @keras_parameterized.run_with_all_model_types
+  @parameterized.named_parameters(dict(testcase_name="argument",
+                                       arg=True))
+  def test_run_with_all_model_types_extra_params_2(self, arg):
+    self.assertEqual(arg, True)
+
+if __name__ == "__main__":
+  googletest.main()
diff --git a/tensorflow/python/keras/layers/__init__.py b/tensorflow/python/keras/layers/__init__.py
index 7268040b0287fcfd7c0bd291b0ff7a75e154534e..df7571e5d5fc862c29016fc0e12d1d33059405ad 100644
--- a/tensorflow/python/keras/layers/__init__.py
+++ b/tensorflow/python/keras/layers/__init__.py
@@ -22,7 +22,7 @@ from __future__ import print_function
 # pylint: disable=g-bad-import-order
 from tensorflow.python.keras.engine.input_layer import Input
 from tensorflow.python.keras.engine.input_layer import InputLayer
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
 
 # Advanced activations.
@@ -149,6 +149,7 @@ from tensorflow.python.keras.layers.recurrent import PeepholeLSTMCell
 from tensorflow.python.keras.layers.recurrent import SimpleRNN
 from tensorflow.python.keras.layers.recurrent import GRU
 from tensorflow.python.keras.layers.recurrent import LSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
 
 # Convolutional-recurrent layers.
 from tensorflow.python.keras.layers.convolutional_recurrent import ConvLSTM2D
diff --git a/tensorflow/python/keras/layers/advanced_activations.py b/tensorflow/python/keras/layers/advanced_activations.py
index b0dffced3eada824fc3c5656363c94deb00eaa96..35ac7830b2e2f37ffc270227d44450d730a9149c 100644
--- a/tensorflow/python/keras/layers/advanced_activations.py
+++ b/tensorflow/python/keras/layers/advanced_activations.py
@@ -22,8 +22,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util.tf_export import tf_export
@@ -54,7 +54,6 @@ class LeakyReLU(Layer):
     super(LeakyReLU, self).__init__(**kwargs)
     self.supports_masking = True
     self.alpha = K.cast_to_floatx(alpha)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.relu(inputs, alpha=self.alpha)
@@ -118,7 +117,6 @@ class PReLU(Layer):
       self.shared_axes = [shared_axes]
     else:
       self.shared_axes = list(shared_axes)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -193,7 +191,6 @@ class ELU(Layer):
     super(ELU, self).__init__(**kwargs)
     self.supports_masking = True
     self.alpha = K.cast_to_floatx(alpha)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.elu(inputs, self.alpha)
@@ -233,7 +230,6 @@ class ThresholdedReLU(Layer):
     super(ThresholdedReLU, self).__init__(**kwargs)
     self.supports_masking = True
     self.theta = K.cast_to_floatx(theta)
-    self._can_use_graph_functions = True
 
   def call(self, inputs, mask=None):
     return inputs * math_ops.cast(
@@ -269,7 +265,6 @@ class Softmax(Layer):
     super(Softmax, self).__init__(**kwargs)
     self.supports_masking = True
     self.axis = axis
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return K.softmax(inputs, axis=self.axis)
@@ -324,7 +319,6 @@ class ReLU(Layer):
     self.max_value = max_value
     self.negative_slope = K.cast_to_floatx(negative_slope)
     self.threshold = K.cast_to_floatx(threshold)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     # alpha is used for leaky relu slope in activations instead of
diff --git a/tensorflow/python/keras/layers/advanced_activations_test.py b/tensorflow/python/keras/layers/advanced_activations_test.py
index 4aadf535e0cd2161b37cb26eb4cdd9a1da457a68..f32bb457c825d9769c6dccf625d9318c07843237 100644
--- a/tensorflow/python/keras/layers/advanced_activations_test.py
+++ b/tensorflow/python/keras/layers/advanced_activations_test.py
@@ -20,13 +20,13 @@ from __future__ import print_function
 
 from tensorflow.python import keras
 from tensorflow.python.eager import context
-from tensorflow.python.framework import test_util as tf_test_util
+from tensorflow.python.keras import keras_parameterized
 from tensorflow.python.keras import testing_utils
 from tensorflow.python.platform import test
 
 
-@tf_test_util.run_all_in_graph_and_eager_modes
-class AdvancedActivationsTest(test.TestCase):
+@keras_parameterized.run_all_keras_modes
+class AdvancedActivationsTest(keras_parameterized.TestCase):
 
   def test_leaky_relu(self):
     for alpha in [0., .5, -1.]:
diff --git a/tensorflow/python/keras/layers/convolutional.py b/tensorflow/python/keras/layers/convolutional.py
index 0671a5a36d6fc2a4c5a763505548c54eb7568039..6564d6e8fdba6d6f8b384b06125032d16f34e28a 100644
--- a/tensorflow/python/keras/layers/convolutional.py
+++ b/tensorflow/python/keras/layers/convolutional.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 # imports for backwards namespace compatibility
 # pylint: disable=unused-import
 from tensorflow.python.keras.layers.pooling import AveragePooling1D
@@ -120,7 +120,6 @@ class Conv(Layer):
         name=name,
         activity_regularizer=regularizers.get(activity_regularizer),
         **kwargs)
-    self._can_use_graph_functions = True
     self.rank = rank
     self.filters = filters
     self.kernel_size = conv_utils.normalize_tuple(
@@ -1916,7 +1915,6 @@ class UpSampling1D(Layer):
     super(UpSampling1D, self).__init__(**kwargs)
     self.size = int(size)
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -1983,7 +1981,6 @@ class UpSampling2D(Layer):
                        'or `"bilinear"`.')
     self.interpolation = interpolation
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2054,7 +2051,6 @@ class UpSampling3D(Layer):
     self.size = conv_utils.normalize_tuple(size, 3, 'size')
     self.input_spec = InputSpec(ndim=5)
     super(UpSampling3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2109,7 +2105,6 @@ class ZeroPadding1D(Layer):
 
   def __init__(self, padding=1, **kwargs):
     super(ZeroPadding1D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.padding = conv_utils.normalize_tuple(padding, 2, 'padding')
     self.input_spec = InputSpec(ndim=3)
 
@@ -2175,7 +2170,6 @@ class ZeroPadding2D(Layer):
 
   def __init__(self, padding=(1, 1), data_format=None, **kwargs):
     super(ZeroPadding2D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     if isinstance(padding, int):
       self.padding = ((padding, padding), (padding, padding))
@@ -2280,7 +2274,6 @@ class ZeroPadding3D(Layer):
 
   def __init__(self, padding=(1, 1, 1), data_format=None, **kwargs):
     super(ZeroPadding3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     if isinstance(padding, int):
       self.padding = ((padding, padding), (padding, padding), (padding,
@@ -2375,7 +2368,6 @@ class Cropping1D(Layer):
     super(Cropping1D, self).__init__(**kwargs)
     self.cropping = conv_utils.normalize_tuple(cropping, 2, 'cropping')
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2475,7 +2467,6 @@ class Cropping2D(Layer):
                        '((top_crop, bottom_crop), (left_crop, right_crop)). '
                        'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -2609,7 +2600,6 @@ class Cropping3D(Layer):
           ' (left_dim3_crop, right_dim2_crop)). '
           'Found: ' + str(cropping))
     self.input_spec = InputSpec(ndim=5)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
diff --git a/tensorflow/python/keras/layers/convolutional_recurrent.py b/tensorflow/python/keras/layers/convolutional_recurrent.py
index 100542129bf810b0722e93dce17cc20830141bf0..cf3861da21858d0ef0ab4e7567795edbf41635b8 100644
--- a/tensorflow/python/keras/layers/convolutional_recurrent.py
+++ b/tensorflow/python/keras/layers/convolutional_recurrent.py
@@ -26,8 +26,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _generate_dropout_mask
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.layers.recurrent import RNN
diff --git a/tensorflow/python/keras/layers/core.py b/tensorflow/python/keras/layers/core.py
index e5c37be0aa16e562b6c60a6cbbf727090e841d95..854774c569e3c86d1665f39fcdec74960df2928b 100644
--- a/tensorflow/python/keras/layers/core.py
+++ b/tensorflow/python/keras/layers/core.py
@@ -34,8 +34,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -81,7 +81,6 @@ class Masking(Layer):
     super(Masking, self).__init__(**kwargs)
     self.supports_masking = True
     self.mask_value = mask_value
-    self._can_use_graph_functions = True
 
   def compute_mask(self, inputs, mask=None):
     return K.any(math_ops.not_equal(inputs, self.mask_value), axis=-1)
@@ -125,7 +124,6 @@ class Dropout(Layer):
     self.noise_shape = noise_shape
     self.seed = seed
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
   def _get_noise_shape(self, inputs):
     # Subclasses of `Dropout` may implement `_get_noise_shape(self, inputs)`,
@@ -326,7 +324,6 @@ class Activation(Layer):
     super(Activation, self).__init__(**kwargs)
     self.supports_masking = True
     self.activation = activations.get(activation)
-    self._can_use_graph_functions = True
 
   def call(self, inputs):
     return self.activation(inputs)
@@ -379,7 +376,6 @@ class Reshape(Layer):
   def __init__(self, target_shape, **kwargs):
     super(Reshape, self).__init__(**kwargs)
     self.target_shape = tuple(target_shape)
-    self._can_use_graph_functions = True
 
   def _fix_unknown_dimension(self, input_shape, output_shape):
     """Find and replace a missing dimension in an output shape.
@@ -488,7 +484,6 @@ class Permute(Layer):
           'The set of indices in `dims` must be consecutive and start from 1.' %
           (dims,))
     self.input_spec = InputSpec(ndim=len(self.dims) + 1)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -511,6 +506,9 @@ class Permute(Layer):
 class Flatten(Layer):
   """Flattens the input. Does not affect the batch size.
 
+  If inputs are shaped `(batch,)` without a channel dimension, then flattening
+  adds an extra channel dimension and output shapes are `(batch, 1)`.
+
   Arguments:
       data_format: A string,
           one of `channels_last` (default) or `channels_first`.
@@ -539,24 +537,27 @@ class Flatten(Layer):
   def __init__(self, data_format=None, **kwargs):
     super(Flatten, self).__init__(**kwargs)
     self.data_format = conv_utils.normalize_data_format(data_format)
-    self.input_spec = InputSpec(min_ndim=2)
-    self._can_use_graph_functions = True
+    self.input_spec = InputSpec(min_ndim=1)
 
   def call(self, inputs):
-    if self.data_format == 'channels_first':
+    if (self.data_format == 'channels_first'
+        and K.ndim(inputs) is not None and K.ndim(inputs) > 1):
       permutation = [0]
       permutation.extend([i for i in
                           range(2, K.ndim(inputs))])
       permutation.append(1)
       inputs = array_ops.transpose(inputs, perm=permutation)
 
-    outputs = array_ops.reshape(inputs, (array_ops.shape(inputs)[0], -1))
+    outputs = array_ops.reshape(
+        inputs, (inputs.shape[0].value or array_ops.shape(inputs)[0], -1))
     if not context.executing_eagerly():
       outputs.set_shape(self.compute_output_shape(inputs.get_shape()))
     return outputs
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
+    if not input_shape:
+      output_shape = tensor_shape.TensorShape([1])
     output_shape = [input_shape[0]]
     if all(input_shape[1:]):
       output_shape += [np.prod(input_shape[1:])]
@@ -600,7 +601,6 @@ class RepeatVector(Layer):
     super(RepeatVector, self).__init__(**kwargs)
     self.n = n
     self.input_spec = InputSpec(ndim=2)
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape).as_list()
@@ -929,7 +929,6 @@ class Dense(Layer):
 
     self.supports_masking = True
     self.input_spec = InputSpec(min_ndim=2)
-    self._can_use_graph_functions = True
 
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
@@ -1029,7 +1028,6 @@ class ActivityRegularization(Layer):
     self.supports_masking = True
     self.l1 = l1
     self.l2 = l2
-    self._can_use_graph_functions = True
 
   def compute_output_shape(self, input_shape):
     return input_shape
diff --git a/tensorflow/python/keras/layers/core_test.py b/tensorflow/python/keras/layers/core_test.py
index aad6ab8171ee6e7ff2d0d24b6dc37f556ddc6476..f138adf76026b116b2a4d771e8ae90194e065bef 100644
--- a/tensorflow/python/keras/layers/core_test.py
+++ b/tensorflow/python/keras/layers/core_test.py
@@ -149,6 +149,20 @@ class CoreLayersTest(test.TestCase):
         np.transpose(inputs, (0, 2, 3, 1)), (-1, 5 * 5 * 3))
     self.assertAllClose(outputs, target_outputs)
 
+  @tf_test_util.run_in_graph_and_eager_modes
+  def test_flatten_scalar_channels(self):
+    testing_utils.layer_test(
+        keras.layers.Flatten, kwargs={}, input_shape=(3,))
+
+    # Test channels_first
+    inputs = np.random.random((10,)).astype('float32')
+    outputs = testing_utils.layer_test(
+        keras.layers.Flatten,
+        kwargs={'data_format': 'channels_first'},
+        input_data=inputs)
+    target_outputs = np.expand_dims(inputs, -1)
+    self.assertAllClose(outputs, target_outputs)
+
   @tf_test_util.run_in_graph_and_eager_modes
   def test_repeat_vector(self):
     testing_utils.layer_test(
diff --git a/tensorflow/python/keras/layers/cudnn_recurrent.py b/tensorflow/python/keras/layers/cudnn_recurrent.py
index beacdf2515633cae2cb16b49fbf8b66b11522e73..16692753afbc83d55349f5b3843952f1b8c8d2bf 100644
--- a/tensorflow/python/keras/layers/cudnn_recurrent.py
+++ b/tensorflow/python/keras/layers/cudnn_recurrent.py
@@ -25,7 +25,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
+from tensorflow.python.keras.layers import recurrent
 from tensorflow.python.keras.layers.recurrent import RNN
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_cudnn_rnn_ops
@@ -80,11 +81,6 @@ class _CuDNNRNN(RNN):
     self._num_inputs = None
     self._vector_shape = constant_op.constant([-1])
 
-  def _canonical_to_params(self, weights, biases):
-    weights = [array_ops.reshape(x, self._vector_shape) for x in weights]
-    biases = [array_ops.reshape(x, self._vector_shape) for x in biases]
-    return array_ops.concat(weights + biases, axis=0)
-
   def call(self, inputs, mask=None, training=None, initial_state=None):
     if isinstance(mask, list):
       mask = mask[0]
@@ -279,7 +275,7 @@ class CuDNNGRU(_CuDNNRNN):
     input_h = initial_state[0]
     input_h = array_ops.expand_dims(input_h, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, self.units:self.units * 2],
             self.kernel[:, :self.units],
@@ -296,7 +292,7 @@ class CuDNNGRU(_CuDNNRNN):
             self.bias[self.units * 3:self.units * 4],
             self.bias[self.units * 5:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, _, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
@@ -474,7 +470,7 @@ class CuDNNLSTM(_CuDNNRNN):
     input_h = array_ops.expand_dims(input_h, axis=0)
     input_c = array_ops.expand_dims(input_c, axis=0)
 
-    params = self._canonical_to_params(
+    params = recurrent._canonical_to_params(    # pylint: disable=protected-access
         weights=[
             self.kernel[:, :self.units],
             self.kernel[:, self.units:self.units * 2],
@@ -495,7 +491,7 @@ class CuDNNLSTM(_CuDNNRNN):
             self.bias[self.units * 6:self.units * 7],
             self.bias[self.units * 7:],
         ],
-    )
+        shape=self._vector_shape)
 
     outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
         inputs,
diff --git a/tensorflow/python/keras/layers/embeddings.py b/tensorflow/python/keras/layers/embeddings.py
index 5d805ea684f7442e8ea1551d7b5c8329a751fabc..e8a8575705ab5c412ae4a793faaa89ef8918130c 100644
--- a/tensorflow/python/keras/layers/embeddings.py
+++ b/tensorflow/python/keras/layers/embeddings.py
@@ -45,11 +45,11 @@ class Embedding(Layer):
     model = Sequential()
     model.add(Embedding(1000, 64, input_length=10))
     # the model will take as input an integer matrix of size (batch,
-    input_length).
+    # input_length).
     # the largest integer (i.e. word index) in the input should be no larger
-    than 999 (vocabulary size).
+    # than 999 (vocabulary size).
     # now model.output_shape == (None, 10, 64), where None is the batch
-    dimension.
+    # dimension.
 
     input_array = np.random.randint(1000, size=(32, 10))
 
@@ -116,7 +116,6 @@ class Embedding(Layer):
     self.mask_zero = mask_zero
     self.supports_masking = mask_zero
     self.input_length = input_length
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
diff --git a/tensorflow/python/keras/layers/local.py b/tensorflow/python/keras/layers/local.py
index 30b83eaf50c2d239503856298dd9a02ae1f1733c..d2c4aaa125e7f1415c4e33224056c18418670769 100644
--- a/tensorflow/python/keras/layers/local.py
+++ b/tensorflow/python/keras/layers/local.py
@@ -23,8 +23,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.util.tf_export import tf_export
@@ -154,7 +154,6 @@ class LocallyConnected1D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.implementation = implementation
     self.input_spec = InputSpec(ndim=3)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
@@ -430,7 +429,6 @@ class LocallyConnected2D(Layer):
     self.bias_constraint = constraints.get(bias_constraint)
     self.implementation = implementation
     self.input_spec = InputSpec(ndim=4)
-    self._can_use_graph_functions = True
 
   @tf_utils.shape_type_conversion
   def build(self, input_shape):
diff --git a/tensorflow/python/keras/layers/local_test.py b/tensorflow/python/keras/layers/local_test.py
index 300e7c96545b3339e480a96f1c218131b1138e99..e4f4d0a639a6bac4605b3f03e23c6f14a2fdaa88 100644
--- a/tensorflow/python/keras/layers/local_test.py
+++ b/tensorflow/python/keras/layers/local_test.py
@@ -28,39 +28,43 @@ from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class LocallyConnected1DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected1D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @tf_test_util.run_deprecated_v1
   def test_locallyconnected_1d(self):
-    num_samples = 2
-    num_steps = 8
-    input_dim = 5
-    filter_length = 3
-    filters = 4
-
-    for padding in ['valid', 'same']:
-      for strides in [1]:
-        if padding == 'same' and strides != 1:
-          continue
-        for data_format in ['channels_first', 'channels_last']:
-          for implementation in [1, 2]:
-            kwargs = {
-                'filters': filters,
-                'kernel_size': filter_length,
-                'padding': padding,
-                'strides': strides,
-                'data_format': data_format,
-                'implementation': implementation
-            }
+    with self.cached_session():
+      num_samples = 2
+      num_steps = 8
+      input_dim = 5
+      filter_length = 3
+      filters = 4
 
-            if padding == 'same' and implementation == 1:
-              self.assertRaises(ValueError,
-                                keras.layers.LocallyConnected1D,
-                                **kwargs)
-            else:
-              testing_utils.layer_test(
-                  keras.layers.LocallyConnected1D,
-                  kwargs=kwargs,
-                  input_shape=(num_samples, num_steps, input_dim))
+      for padding in ['valid', 'same']:
+        for strides in [1]:
+          if padding == 'same' and strides != 1:
+            continue
+          for data_format in ['channels_first', 'channels_last']:
+            for implementation in [1, 2]:
+              kwargs = {
+                  'filters': filters,
+                  'kernel_size': filter_length,
+                  'padding': padding,
+                  'strides': strides,
+                  'data_format': data_format,
+                  'implementation': implementation
+              }
+
+              if padding == 'same' and implementation == 1:
+                self.assertRaises(ValueError,
+                                  keras.layers.LocallyConnected1D,
+                                  **kwargs)
+              else:
+                testing_utils.layer_test(
+                    keras.layers.LocallyConnected1D,
+                    kwargs=kwargs,
+                    input_shape=(num_samples, num_steps, input_dim))
 
   def test_locallyconnected_1d_regularization(self):
     num_samples = 2
@@ -113,30 +117,63 @@ class LocallyConnected1DLayersTest(test.TestCase):
 
 
 class LocallyConnected2DLayersTest(test.TestCase):
+  # TODO(fchollet): investigate why LocallyConnected2D
+  # fails inside a graph function in an eager context (fails with error
+  # "Incompatible shapes between op input and calculated input gradient").
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @tf_test_util.run_deprecated_v1
   def test_locallyconnected_2d(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
 
-    for padding in ['valid', 'same']:
-      for strides in [(1, 1), (2, 2)]:
-        for implementation in [1, 2]:
-          if padding == 'same' and strides != (1, 1):
-            continue
+      for padding in ['valid', 'same']:
+        for strides in [(1, 1), (2, 2)]:
+          for implementation in [1, 2]:
+            if padding == 'same' and strides != (1, 1):
+              continue
+
+            kwargs = {
+                'filters': filters,
+                'kernel_size': 3,
+                'padding': padding,
+                'kernel_regularizer': 'l2',
+                'bias_regularizer': 'l2',
+                'strides': strides,
+                'data_format': 'channels_last',
+                'implementation': implementation
+            }
 
+            if padding == 'same' and implementation == 1:
+              self.assertRaises(ValueError,
+                                keras.layers.LocallyConnected2D,
+                                **kwargs)
+            else:
+              testing_utils.layer_test(
+                  keras.layers.LocallyConnected2D,
+                  kwargs=kwargs,
+                  input_shape=(num_samples, num_row, num_col, stack_size))
+
+  @tf_test_util.run_deprecated_v1
+  def test_locallyconnected_2d_channels_first(self):
+    with self.cached_session():
+      num_samples = 8
+      filters = 3
+      stack_size = 4
+      num_row = 6
+      num_col = 10
+
+      for implementation in [1, 2]:
+        for padding in ['valid', 'same']:
           kwargs = {
               'filters': filters,
               'kernel_size': 3,
-              'padding': padding,
-              'kernel_regularizer': 'l2',
-              'bias_regularizer': 'l2',
-              'strides': strides,
-              'data_format': 'channels_last',
-              'implementation': implementation
+              'data_format': 'channels_first',
+              'implementation': implementation,
+              'padding': padding
           }
 
           if padding == 'same' and implementation == 1:
@@ -149,34 +186,6 @@ class LocallyConnected2DLayersTest(test.TestCase):
                 kwargs=kwargs,
                 input_shape=(num_samples, num_row, num_col, stack_size))
 
-  @tf_test_util.run_in_graph_and_eager_modes
-  def test_locallyconnected_2d_channels_first(self):
-    num_samples = 8
-    filters = 3
-    stack_size = 4
-    num_row = 6
-    num_col = 10
-
-    for implementation in [1, 2]:
-      for padding in ['valid', 'same']:
-        kwargs = {
-            'filters': filters,
-            'kernel_size': 3,
-            'data_format': 'channels_first',
-            'implementation': implementation,
-            'padding': padding
-        }
-
-        if padding == 'same' and implementation == 1:
-          self.assertRaises(ValueError,
-                            keras.layers.LocallyConnected2D,
-                            **kwargs)
-        else:
-          testing_utils.layer_test(
-              keras.layers.LocallyConnected2D,
-              kwargs=kwargs,
-              input_shape=(num_samples, num_row, num_col, stack_size))
-
   def test_locallyconnected_2d_regularization(self):
     num_samples = 2
     filters = 3
@@ -226,64 +235,67 @@ class LocallyConnected2DLayersTest(test.TestCase):
 
 class LocallyConnectedImplementationModeTest(test.TestCase):
 
-  @tf_test_util.run_in_graph_and_eager_modes
+  @tf_test_util.run_deprecated_v1
   def test_locallyconnected_implementation(self):
-    num_samples = 4
-    num_classes = 3
-    num_epochs = 2
-
-    np.random.seed(1)
-    targets = np.random.randint(0, num_classes, (num_samples,))
-
-    for width in [1, 6]:
-      for height in [7]:
-        for filters in [2]:
-          for data_format in ['channels_first', 'channels_last']:
-            inputs = get_inputs(
-                data_format, filters, height, num_samples, width)
-
-            for kernel_x in [(3,)]:
-              for kernel_y in [()] if width == 1 else [(2,)]:
-                for stride_x in [(1,)]:
-                  for stride_y in [()] if width == 1 else [(3,)]:
-                    for layers in [2]:
-                      kwargs = {
-                          'layers': layers,
-                          'filters': filters,
-                          'kernel_size': kernel_x + kernel_y,
-                          'strides': stride_x + stride_y,
-                          'data_format': data_format,
-                          'num_classes': num_classes,
-                          'input_shape': inputs.shape
-                      }
-
-                      model_1 = get_model(implementation=1, **kwargs)
-                      model_2 = get_model(implementation=2, **kwargs)
-
-                      copy_model_weights(model_2, model_1)
-
-                      # Compare outputs at initialization.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
-
-                      # Train.
-                      model_1.fit(x=inputs,
-                                  y=targets,
-                                  epochs=num_epochs,
-                                  batch_size=num_samples)
-
-                      model_2.fit(x=inputs,
-                                  y=targets,
-                                  epochs=num_epochs,
-                                  batch_size=num_samples)
-
-                      # Compare outputs after a few training steps.
-                      out_1 = model_1.call(inputs)
-                      out_2 = model_2.call(inputs)
-                      self.assertAllCloseAccordingToType(out_1, out_2,
-                                                         rtol=1e-5, atol=1e-5)
+    with self.cached_session():
+      num_samples = 4
+      num_classes = 3
+      num_epochs = 2
+
+      np.random.seed(1)
+      targets = np.random.randint(0, num_classes, (num_samples,))
+
+      for width in [1, 6]:
+        for height in [7]:
+          for filters in [2]:
+            for data_format in ['channels_first', 'channels_last']:
+              inputs = get_inputs(
+                  data_format, filters, height, num_samples, width)
+
+              for kernel_x in [(3,)]:
+                for kernel_y in [()] if width == 1 else [(2,)]:
+                  for stride_x in [(1,)]:
+                    for stride_y in [()] if width == 1 else [(3,)]:
+                      for layers in [2]:
+                        kwargs = {
+                            'layers': layers,
+                            'filters': filters,
+                            'kernel_size': kernel_x + kernel_y,
+                            'strides': stride_x + stride_y,
+                            'data_format': data_format,
+                            'num_classes': num_classes
+                        }
+                        model_1 = get_model(implementation=1, **kwargs)
+                        model_2 = get_model(implementation=2, **kwargs)
+
+                        # Build models.
+                        model_1.train_on_batch(inputs, targets)
+                        model_2.train_on_batch(inputs, targets)
+
+                        # Copy weights.
+                        copy_model_weights(model_2, model_1)
+
+                        # Compare outputs at initialization.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(out_1, out_2,
+                                                           rtol=1e-5, atol=1e-5)
+
+                        # Train.
+                        model_1.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+                        model_2.fit(x=inputs,
+                                    y=targets,
+                                    epochs=num_epochs,
+                                    batch_size=num_samples)
+
+                        # Compare outputs after a few training steps.
+                        out_1 = model_1.call(inputs)
+                        out_2 = model_2.call(inputs)
+                        self.assertAllCloseAccordingToType(
+                            out_1, out_2, atol=2e-4)
 
   @tf_test_util.run_in_graph_and_eager_modes
   def test_make_2d(self):
@@ -360,8 +372,7 @@ def get_model(implementation,
               strides,
               layers,
               num_classes,
-              data_format,
-              input_shape):
+              data_format):
   model = keras.Sequential()
 
   if len(kernel_size) == 1:
@@ -390,7 +401,6 @@ def get_model(implementation,
       metrics=[keras.metrics.categorical_accuracy],
       loss=xent
   )
-  model.build(input_shape)
   return model
 
 
diff --git a/tensorflow/python/keras/layers/lstm_test.py b/tensorflow/python/keras/layers/lstm_test.py
index 9db697871fe27af65cb697f47b8cccf434ad72cd..aea426150260cf4c7b849b18319789eaf4f5da5a 100644
--- a/tensorflow/python/keras/layers/lstm_test.py
+++ b/tensorflow/python/keras/layers/lstm_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
@@ -30,7 +31,7 @@ from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 @tf_test_util.run_all_in_graph_and_eager_modes
-class LSTMLayerTest(test.TestCase):
+class LSTMLayerTest(test.TestCase, parameterized.TestCase):
 
   def test_return_sequences_LSTM(self):
     num_samples = 2
@@ -56,7 +57,7 @@ class LSTMLayerTest(test.TestCase):
     layer = keras.layers.LSTM(units, return_sequences=True)
     model.add(layer)
     outputs = model.layers[-1].output
-    self.assertEquals(outputs.get_shape().as_list(), [None, timesteps, units])
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
 
   def test_dynamic_behavior_LSTM(self):
     num_samples = 2
@@ -83,17 +84,17 @@ class LSTMLayerTest(test.TestCase):
                 'recurrent_dropout': 0.1},
         input_shape=(num_samples, timesteps, embedding_dim))
 
-  def test_implementation_mode_LSTM(self):
+  @parameterized.parameters([0, 1, 2])
+  def test_implementation_mode_LSTM(self, implementation_mode):
     num_samples = 2
     timesteps = 3
     embedding_dim = 4
     units = 2
-    for mode in [0, 1, 2]:
-      testing_utils.layer_test(
-          keras.layers.LSTM,
-          kwargs={'units': units,
-                  'implementation': mode},
-          input_shape=(num_samples, timesteps, embedding_dim))
+    testing_utils.layer_test(
+        keras.layers.LSTM,
+        kwargs={'units': units,
+                'implementation': implementation_mode},
+        input_shape=(num_samples, timesteps, embedding_dim))
 
   def test_constraints_LSTM(self):
     embedding_dim = 4
@@ -114,6 +115,7 @@ class LSTMLayerTest(test.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_with_masking_layer_LSTM(self):
     layer_class = keras.layers.LSTM
     inputs = np.random.random((2, 3, 4))
@@ -126,6 +128,7 @@ class LSTMLayerTest(test.TestCase):
                   optimizer=RMSPropOptimizer(0.01))
     model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_masking_with_stacking_LSTM(self):
     inputs = np.random.random((2, 3, 4))
     targets = np.abs(np.random.random((2, 3, 5)))
@@ -311,6 +314,7 @@ class LSTMLayerTest(test.TestCase):
 
 class LSTMLayerGraphOnlyTest(test.TestCase):
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_LSTM(self):
     num_samples = 2
     timesteps = 3
@@ -374,6 +378,7 @@ class LSTMLayerGraphOnlyTest(test.TestCase):
 
       self.assertAllClose(out7, out6, atol=1e-5)
 
+  @tf_test_util.run_deprecated_v1
   def test_regularizers_LSTM(self):
     embedding_dim = 4
     layer_class = keras.layers.LSTM
diff --git a/tensorflow/python/keras/layers/merge.py b/tensorflow/python/keras/layers/merge.py
index 0ded0e42ed329ed1abb7a7c60362e366ec130e63..45e705c69606c4dd839429597aa9903a9442234a 100644
--- a/tensorflow/python/keras/layers/merge.py
+++ b/tensorflow/python/keras/layers/merge.py
@@ -40,7 +40,6 @@ class _Merge(Layer):
 
   def __init__(self, **kwargs):
     super(_Merge, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.supports_masking = True
 
   def _merge_function(self, inputs):
@@ -213,7 +212,7 @@ class _Merge(Layer):
     if len(mask) != len(inputs):
       raise ValueError('The lists `inputs` and `mask` '
                        'should have the same length.')
-    if all([m is None for m in mask]):
+    if all(m is None for m in mask):
       return None
     masks = [array_ops.expand_dims(m, axis=0) for m in mask if m is not None]
     return K.all(K.concatenate(masks, axis=0), axis=0, keepdims=False)
@@ -369,7 +368,6 @@ class Concatenate(_Merge):
 
   def __init__(self, axis=-1, **kwargs):
     super(Concatenate, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.axis = axis
     self.supports_masking = True
     self._reshape_required = False
@@ -380,7 +378,7 @@ class Concatenate(_Merge):
     if not isinstance(input_shape, list) or len(input_shape) < 2:
       raise ValueError('A `Concatenate` layer should be called '
                        'on a list of at least 2 inputs')
-    if all([shape is None for shape in input_shape]):
+    if all(shape is None for shape in input_shape):
       return
     reduced_inputs_shapes = [list(shape) for shape in input_shape]
     shape_set = set()
@@ -420,7 +418,7 @@ class Concatenate(_Merge):
     if len(mask) != len(inputs):
       raise ValueError('The lists `inputs` and `mask` '
                        'should have the same length.')
-    if all([m is None for m in mask]):
+    if all(m is None for m in mask):
       return None
     # Make a list of masks while making sure
     # the dimensionality of each mask
@@ -467,7 +465,6 @@ class Dot(_Merge):
 
   def __init__(self, axes, normalize=False, **kwargs):
     super(Dot, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     if not isinstance(axes, int):
       if not isinstance(axes, (list, tuple)):
         raise TypeError('Invalid type for `axes` - '
diff --git a/tensorflow/python/keras/layers/merge_test.py b/tensorflow/python/keras/layers/merge_test.py
index 698c5662b6f3b055dd6b88960c7805128ec83cd0..fcb161ae20a4caeaa9514477529c2885d6e5bd41 100644
--- a/tensorflow/python/keras/layers/merge_test.py
+++ b/tensorflow/python/keras/layers/merge_test.py
@@ -207,6 +207,7 @@ class MergeLayersGraphOnlyTest(test.TestCase):
       mask = layer.output_mask
       self.assertListEqual(mask.get_shape().as_list(), [None, 4])
 
+  @tf_test_util.run_deprecated_v1
   def test_merge_add_dynamic_shape(self):
     with self.cached_session():
       i1 = array_ops.placeholder(shape=(4, None), dtype='float32')
diff --git a/tensorflow/python/keras/layers/noise.py b/tensorflow/python/keras/layers/noise.py
index e7c0478513d2974b853497be0fa221aca96567ee..cb7cee3ebc3ebd2413836b876f2aaf21985f1d9c 100644
--- a/tensorflow/python/keras/layers/noise.py
+++ b/tensorflow/python/keras/layers/noise.py
@@ -55,7 +55,6 @@ class GaussianNoise(Layer):
     super(GaussianNoise, self).__init__(**kwargs)
     self.supports_masking = True
     self.stddev = stddev
-    self._can_use_graph_functions = True
 
   def call(self, inputs, training=None):
 
@@ -100,7 +99,6 @@ class GaussianDropout(Layer):
     super(GaussianDropout, self).__init__(**kwargs)
     self.supports_masking = True
     self.rate = rate
-    self._can_use_graph_functions = True
 
   def call(self, inputs, training=None):
     if 0 < self.rate < 1:
@@ -155,7 +153,6 @@ class AlphaDropout(Layer):
     self.noise_shape = noise_shape
     self.seed = seed
     self.supports_masking = True
-    self._can_use_graph_functions = True
 
   def _get_noise_shape(self, inputs):
     return self.noise_shape if self.noise_shape else array_ops.shape(inputs)
diff --git a/tensorflow/python/keras/layers/normalization.py b/tensorflow/python/keras/layers/normalization.py
index 3d3bf647e6f1a7fe342250a58f1eeea5a601aab8..75b10222edd19ea59361d1312ead727e02431cac 100644
--- a/tensorflow/python/keras/layers/normalization.py
+++ b/tensorflow/python/keras/layers/normalization.py
@@ -18,6 +18,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import contextlib
+
+from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -26,8 +30,8 @@ from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -36,12 +40,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('keras.layers.BatchNormalization')
-class BatchNormalization(Layer):
+@tf_export('keras.layers.BatchNormalization', v1=[])
+class BatchNormalizationV2(Layer):
   """Batch normalization layer (Ioffe and Szegedy, 2014).
 
   Normalize the activations of the previous layer at each batch,
@@ -84,8 +87,10 @@ class BatchNormalization(Layer):
       and should be neither too small (which would add noise) nor too large
       (which would give stale estimates). Note that `momentum` is still applied
       to get the means and variances for inference.
-    fused: if `None` or `True`, use a faster, fused implementation if possible.
-      If `False`, use the system recommended implementation.
+    fused: if `True`, use a faster, fused implementation, or raise a ValueError
+      if the fused implementation cannot be used. If `None`, use the faster
+      implementation if possible. If False, do not used the fused
+      implementation.
     trainable: Boolean, if `True` also add variables to the graph collection
       `GraphKeys.TRAINABLE_VARIABLES` (see tf.Variable).
     virtual_batch_size: An `int`. By default, `virtual_batch_size` is `None`,
@@ -120,6 +125,9 @@ class BatchNormalization(Layer):
         Internal Covariate Shift](https://arxiv.org/abs/1502.03167)
   """
 
+  # The BatchNormalizationV1 subclass sets this to False to use the V1 behavior.
+  _USE_V2_BEHAVIOR = True
+
   def __init__(self,
                axis=-1,
                momentum=0.99,
@@ -143,13 +151,15 @@ class BatchNormalization(Layer):
                adjustment=None,
                name=None,
                **kwargs):
-    super(BatchNormalization, self).__init__(
+    super(BatchNormalizationV2, self).__init__(
         name=name, trainable=trainable, **kwargs)
-    self._can_use_graph_functions = True
     if isinstance(axis, list):
       self.axis = axis[:]
-    else:
+    elif isinstance(axis, int):
       self.axis = axis
+    else:
+      raise TypeError('axis must be int or list, type given: %s'
+                      % type(self.axis))
     self.momentum = momentum
     self.epsilon = epsilon
     self.center = center
@@ -166,7 +176,14 @@ class BatchNormalization(Layer):
     self.renorm = renorm
     self.virtual_batch_size = virtual_batch_size
     self.adjustment = adjustment
-    if fused is None:
+    if self._USE_V2_BEHAVIOR:
+      if fused:
+        self._raise_if_fused_cannot_be_used()
+      # We leave fused as None if self._fused_can_be_used()==True, since we
+      # still may set it to False in self.build() if the input rank is not 4.
+      elif fused is None and not self._fused_can_be_used():
+        fused = False
+    elif fused is None:
       fused = True
     self.supports_masking = True
 
@@ -182,6 +199,38 @@ class BatchNormalization(Layer):
       self.renorm_clipping = renorm_clipping
       self.renorm_momentum = renorm_momentum
 
+  def _raise_if_fused_cannot_be_used(self):
+    """Raises a ValueError if fused implementation cannot be used.
+
+    In addition to the checks done in this function, the input tensors rank must
+    be 4. The input rank check can only be done once the input shape is known.
+    """
+    # Currently fused batch norm doesn't support renorm. It also only supports a
+    # channel dimension on axis 1 or 3, when no virtual batch size or adjustment
+    # is used.
+    if self.renorm:
+      raise ValueError('Passing both fused=True and renorm=True is '
+                       'unsupported')
+    axis = [self.axis] if isinstance(self.axis, int) else self.axis
+    # Axis -3 is equivalent to 1, and axis -1 is equivalent to 3, because the
+    # input rank is required to be 4 (which is checked later).
+    if len(axis) > 1 or axis[0] not in (-3, -1, 1, 3):
+      raise ValueError('Passing fused=True is only supported when axis is 1 '
+                       'or 3')
+    if self.virtual_batch_size is not None:
+      raise ValueError('Passing fused=True is unsupported when '
+                       'virtual_batch_size is specified.')
+    if self.adjustment is not None:
+      raise ValueError('Passing fused=True is unsupported when '
+                       'adjustment is specified.')
+
+  def _fused_can_be_used(self):
+    try:
+      self._raise_if_fused_cannot_be_used()
+      return True
+    except ValueError:
+      return False
+
   def build(self, input_shape):
     input_shape = tensor_shape.TensorShape(input_shape)
     if not input_shape.ndims:
@@ -192,10 +241,6 @@ class BatchNormalization(Layer):
     if isinstance(self.axis, int):
       self.axis = [self.axis]
 
-    if not isinstance(self.axis, list):
-      raise TypeError('axis must be int or list, type given: %s'
-                      % type(self.axis))
-
     for idx, x in enumerate(self.axis):
       if x < 0:
         self.axis[idx] = ndims + x
@@ -220,16 +265,18 @@ class BatchNormalization(Layer):
         raise ValueError('When using virtual_batch_size, adjustment cannot '
                          'be specified')
 
-    if self.fused:
-      # Currently fused batch norm doesn't support renorm. It also only supports
-      # an input tensor of rank 4 and a channel dimension on axis 1 or 3.
+    if self.fused in (None, True):
       # TODO(yaozhang): if input is not 4D, reshape it to 4D and reshape the
       # output back to its original shape accordingly.
-      self.fused = (not self.renorm and
-                    ndims == 4 and
-                    self.axis in [[1], [3]] and
-                    self.virtual_batch_size is None and
-                    self.adjustment is None)
+      if self._USE_V2_BEHAVIOR:
+        if self.fused is None:
+          self.fused = (ndims == 4)
+        elif self.fused and ndims != 4:
+          raise ValueError('Batch normalization layers with fused=True only '
+                           'support 4D input tensors.')
+      else:
+        assert self.fused is not None
+        self.fused = (ndims == 4 and self._fused_can_be_used())
       # TODO(chrisying): fused batch norm is currently not supported for
       # multi-axis batch norm and by extension virtual batches. In some cases,
       # it might be possible to use fused batch norm but would require reshaping
@@ -367,11 +414,19 @@ class BatchNormalization(Layer):
   def _assign_moving_average(self, variable, value, momentum):
     with ops.name_scope(None, 'AssignMovingAvg',
                         [variable, value, momentum]) as scope:
-      with ops.colocate_with(variable):
+      # TODO(apassos,srbs,skyewm): the colocation constraints here are disabled
+      # because of a bug which leads cond_v2 to skip rewriting them creating
+      # conflicts.
+      if tf2.enabled():
+        cm = contextlib.contextmanager(lambda: (yield))
+      else:
+        cm = ops.colocate_with(variable)
+      with cm:
         decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
         if decay.dtype != variable.dtype.base_dtype:
           decay = math_ops.cast(decay, variable.dtype.base_dtype)
-        update_delta = (variable - math_ops.cast(value, variable.dtype)) * decay
+        update_delta = (
+            variable - math_ops.cast(value, variable.dtype)) * decay
         return state_ops.assign_sub(variable, update_delta, name=scope)
 
   def _fused_batch_norm(self, inputs, training):
@@ -492,6 +547,9 @@ class BatchNormalization(Layer):
 
     return (r, d, new_mean, new_variance)
 
+  def _moments(self, inputs, reduction_axes, keep_dims):
+    return nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+
   def call(self, inputs, training=None):
     if training is None:
       training = K.learning_phase()
@@ -563,7 +621,8 @@ class BatchNormalization(Layer):
       # Some of the computations here are not necessary when training==False
       # but not a constant. However, this makes the code simpler.
       keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
-      mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims)
+      mean, variance = self._moments(
+          inputs, reduction_axes, keep_dims=keep_dims)
 
       moving_mean = self.moving_mean
       moving_variance = self.moving_variance
@@ -669,5 +728,36 @@ class BatchNormalization(Layer):
                       'layer cannot be serialized and has been omitted from '
                       'the layer config. It will not be included when '
                       're-creating the layer from the saved config.')
-    base_config = super(BatchNormalization, self).get_config()
+    base_config = super(BatchNormalizationV2, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
+
+
+def _replace_in_v2_docstring(old, new):
+  string = BatchNormalizationV2.__doc__
+  if old not in string:
+    raise ValueError('Could not find following string in BatchNormalizationV2 '
+                     'docstring: "{}"'.format(old))
+  return string.replace(old, new)
+
+
+@tf_export(v1=['keras.layers.BatchNormalization'])  # pylint: disable=missing-docstring
+class BatchNormalizationV1(BatchNormalizationV2):
+
+  __doc__ = _replace_in_v2_docstring(
+      '''
+    fused: if `True`, use a faster, fused implementation, or raise a ValueError
+      if the fused implementation cannot be used. If `None`, use the faster
+      implementation if possible. If False, do not used the fused
+      implementation.''',
+
+      '''
+    fused: if `None` or `True`, use a faster, fused implementation if possible.
+      If `False`, use the system recommended implementation.''')
+
+  _USE_V2_BEHAVIOR = False
+
+
+if tf2.enabled():
+  BatchNormalization = BatchNormalizationV2
+else:
+  BatchNormalization = BatchNormalizationV1
diff --git a/tensorflow/python/keras/layers/normalization_test.py b/tensorflow/python/keras/layers/normalization_test.py
index 92e412870773f3b89751df30a3b9250e016fb60c..c1acc2eb3a3a463f4f71d5a010a3388029cb82f4 100644
--- a/tensorflow/python/keras/layers/normalization_test.py
+++ b/tensorflow/python/keras/layers/normalization_test.py
@@ -23,11 +23,13 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers import normalization
 from tensorflow.python.platform import test
 from tensorflow.python.training import gradient_descent
 
 
 @tf_test_util.run_all_in_graph_and_eager_modes
+@tf_test_util.run_v1_only('b/120545219')
 class NormalizationLayersTest(test.TestCase):
 
   def test_basic_batchnorm(self):
@@ -54,6 +56,14 @@ class NormalizationLayersTest(test.TestCase):
         kwargs={'scale': False,
                 'center': False},
         input_shape=(3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': True},
+        input_shape=(3, 3, 3, 3))
+    testing_utils.layer_test(
+        normalization.BatchNormalizationV2,
+        kwargs={'fused': None},
+        input_shape=(3, 3, 3))
 
   def test_batchnorm_weights(self):
     layer = keras.layers.BatchNormalization(scale=False, center=False)
@@ -78,15 +88,18 @@ class NormalizationLayersTest(test.TestCase):
     self.assertEqual(layer.gamma.constraint, max_norm)
     self.assertEqual(layer.beta.constraint, max_norm)
 
-  def test_batchnorm_correctness(self):
+  def _test_batchnorm_correctness(self, dtype, use_v2=True, fused=False):
     model = keras.models.Sequential()
-    norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
+    layer_ctor = (normalization.BatchNormalizationV2 if use_v2
+                  else normalization.BatchNormalizationV1)
+    norm = layer_ctor(input_shape=(2, 2, 2), momentum=0.8, fused=fused)
     model.add(norm)
     model.compile(loss='mse',
                   optimizer=gradient_descent.GradientDescentOptimizer(0.01))
 
     # centered on 5.0, variance 10.0
-    x = np.random.normal(loc=5.0, scale=10.0, size=(1000, 10))
+    x = (np.random.normal(loc=5.0, scale=10.0, size=(1000, 2, 2, 2))
+         .astype(dtype))
     model.fit(x, x, epochs=4, verbose=0)
     out = model.predict(x)
     out -= keras.backend.eval(norm.beta)
@@ -95,23 +108,15 @@ class NormalizationLayersTest(test.TestCase):
     np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
     np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
 
-  def test_batchnorm_mixed_precision(self):
-    model = keras.models.Sequential()
-    norm = keras.layers.BatchNormalization(input_shape=(10,), momentum=0.8)
-    model.add(norm)
-    model.compile(loss='mse',
-                  optimizer=gradient_descent.GradientDescentOptimizer(0.01))
-
-    # centered on 5.0, variance 10.0
-    x = np.random.normal(
-        loc=5.0, scale=10.0, size=(1000, 10)).astype(np.float16)
-    model.fit(x, x, epochs=4, verbose=0)
-    out = model.predict(x)
-    out -= keras.backend.eval(norm.beta)
-    out /= keras.backend.eval(norm.gamma)
+  def test_batchnorm_correctness(self):
+    self._test_batchnorm_correctness(np.float32)
+    self._test_batchnorm_correctness(np.float32, fused=True)
+    self._test_batchnorm_correctness(np.float32, use_v2=False)
 
-    np.testing.assert_allclose(out.mean(), 0.0, atol=1e-1)
-    np.testing.assert_allclose(out.std(), 1.0, atol=1e-1)
+  def test_batchnorm_mixed_precision(self):
+    self._test_batchnorm_correctness(np.float16)
+    self._test_batchnorm_correctness(np.float16, fused=True)
+    self._test_batchnorm_correctness(np.float16, use_v2=False)
 
   def test_batchnorm_convnet(self):
     if test.is_gpu_available(cuda_only=True):
@@ -151,7 +156,79 @@ class NormalizationLayersTest(test.TestCase):
     np.testing.assert_allclose(np.mean(out, axis=(0, 1, 2)), 0.0, atol=1e-1)
     np.testing.assert_allclose(np.std(out, axis=(0, 1, 2)), 1.0, atol=1e-1)
 
-
+  def test_v1_fused_attribute(self):
+    norm = normalization.BatchNormalizationV1()
+    inp = keras.layers.Input((4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    norm = normalization.BatchNormalizationV1(fused=False)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV1(virtual_batch_size=2)
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(2, 2, 2))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+  def test_v2_fused_attribute(self):
+    norm = normalization.BatchNormalizationV2()
+    self.assertEqual(norm.fused, None)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    norm = normalization.BatchNormalizationV2()
+    self.assertEqual(norm.fused, None)
+    inp = keras.layers.Input(shape=(4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(virtual_batch_size=2)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(fused=False)
+    self.assertEqual(norm.fused, False)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, False)
+
+    norm = normalization.BatchNormalizationV2(fused=True, axis=[3])
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(4, 4, 4))
+    norm(inp)
+    self.assertEqual(norm.fused, True)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*renorm'):
+      normalization.BatchNormalizationV2(fused=True, renorm=True)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
+      normalization.BatchNormalizationV2(fused=True, axis=2)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*when axis is 1 or 3'):
+      normalization.BatchNormalizationV2(fused=True, axis=[1, 3])
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*virtual_batch_size'):
+      normalization.BatchNormalizationV2(fused=True, virtual_batch_size=2)
+
+    with self.assertRaisesRegexp(ValueError, 'fused.*adjustment'):
+      normalization.BatchNormalizationV2(fused=True,
+                                         adjustment=lambda _: (1, 0))
+
+    norm = normalization.BatchNormalizationV2(fused=True)
+    self.assertEqual(norm.fused, True)
+    inp = keras.layers.Input(shape=(4, 4))
+    with self.assertRaisesRegexp(ValueError, '4D input tensors'):
+      norm(inp)
+
+
+@tf_test_util.run_v1_only('b/120545219')
 class NormalizationLayersGraphModeOnlyTest(test.TestCase):
 
   def test_shared_batchnorm(self):
diff --git a/tensorflow/python/keras/layers/pooling.py b/tensorflow/python/keras/layers/pooling.py
index b8d6b03664f48a8aa699cab7cb5e372dfd71830f..a0744cddad682fdcae18f571413b668d7767cb2f 100644
--- a/tensorflow/python/keras/layers/pooling.py
+++ b/tensorflow/python/keras/layers/pooling.py
@@ -22,8 +22,8 @@ import functools
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import conv_utils
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -58,7 +58,6 @@ class Pooling1D(Layer):
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling1D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -231,7 +230,6 @@ class Pooling2D(Layer):
                padding='valid', data_format=None,
                name=None, **kwargs):
     super(Pooling2D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -427,7 +425,6 @@ class Pooling3D(Layer):
                padding='valid', data_format='channels_last',
                name=None, **kwargs):
     super(Pooling3D, self).__init__(name=name, **kwargs)
-    self._can_use_graph_functions = True
     if data_format is None:
       data_format = backend.image_data_format()
     if strides is None:
@@ -599,7 +596,6 @@ class GlobalPooling1D(Layer):
 
   def __init__(self, data_format='channels_last', **kwargs):
     super(GlobalPooling1D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.input_spec = InputSpec(ndim=3)
     self.data_format = conv_utils.normalize_data_format(data_format)
 
@@ -705,7 +701,6 @@ class GlobalPooling2D(Layer):
 
   def __init__(self, data_format=None, **kwargs):
     super(GlobalPooling2D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=4)
 
@@ -804,7 +799,6 @@ class GlobalPooling3D(Layer):
 
   def __init__(self, data_format=None, **kwargs):
     super(GlobalPooling3D, self).__init__(**kwargs)
-    self._can_use_graph_functions = True
     self.data_format = conv_utils.normalize_data_format(data_format)
     self.input_spec = InputSpec(ndim=5)
 
diff --git a/tensorflow/python/keras/layers/recurrent.py b/tensorflow/python/keras/layers/recurrent.py
index 979187c719f642324a622609585e74487a082e18..86a69e45d900bfd037a9d39076c22d9bd2d11c43 100644
--- a/tensorflow/python/keras/layers/recurrent.py
+++ b/tensorflow/python/keras/layers/recurrent.py
@@ -19,20 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import uuid
+
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.eager import function
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import constraints
 from tensorflow.python.keras import initializers
 from tensorflow.python.keras import regularizers
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_cudnn_rnn_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -446,6 +453,9 @@ class RNN(Layer):
                        'an attribute `state_size` '
                        '(tuple of integers, '
                        'one integer per RNN state).')
+    # If True, the output for masked timestep will be zeros, whereas in the
+    # False case, output from previous timestep is returned for masked timestep.
+    self.zero_output_for_mask = kwargs.pop('zero_output_for_mask', False)
     super(RNN, self).__init__(**kwargs)
     self.cell = cell
     if isinstance(cell, checkpointable.CheckpointableBase):
@@ -743,35 +753,12 @@ class RNN(Layer):
            training=None,
            initial_state=None,
            constants=None):
-    # input shape: `(samples, time (padded with zeros), input_dim)`
-    # note that the .build() method of subclasses MUST define
-    # self.input_spec and self.state_spec with complete input shapes.
-    if isinstance(inputs, list):
-      # get initial_state from full input spec
-      # as they could be copied to multiple GPU.
-      if self._num_constants is None:
-        initial_state = inputs[1:]
-      else:
-        initial_state = inputs[1:-self._num_constants]
-        constants = inputs[-self._num_constants:]
-      if len(initial_state) == 0:
-        initial_state = None
-      inputs = inputs[0]
-    if initial_state is not None:
-      pass
-    elif self.stateful:
-      initial_state = self.states
-    else:
-      initial_state = self.get_initial_state(inputs)
+    inputs, initial_state, constants = self._process_inputs(
+        inputs, initial_state, constants)
 
     if isinstance(mask, list):
       mask = mask[0]
 
-    if len(initial_state) != len(self.states):
-      raise ValueError(
-          'Layer has ' + str(len(self.states)) + ' states but was passed ' +
-          str(len(initial_state)) + ' initial states.')
-
     if nest.is_sequence(inputs):
       # In the case of nested input, use the first element for shape check.
       input_shape = K.int_shape(nest.flatten(inputs)[0])
@@ -829,7 +816,8 @@ class RNN(Layer):
         mask=mask,
         unroll=self.unroll,
         input_length=timesteps,
-        time_major=self.time_major)
+        time_major=self.time_major,
+        zero_output_for_mask=self.zero_output_for_mask)
     if self.stateful:
       updates = []
       for i in range(len(states)):
@@ -850,6 +838,34 @@ class RNN(Layer):
     else:
       return output
 
+  def _process_inputs(self, inputs, initial_state, constants):
+    # input shape: `(samples, time (padded with zeros), input_dim)`
+    # note that the .build() method of subclasses MUST define
+    # self.input_spec and self.state_spec with complete input shapes.
+    if isinstance(inputs, list):
+      # get initial_state from full input spec
+      # as they could be copied to multiple GPU.
+      if self._num_constants is None:
+        initial_state = inputs[1:]
+      else:
+        initial_state = inputs[1:-self._num_constants]
+        constants = inputs[-self._num_constants:]
+      if len(initial_state) == 0:
+        initial_state = None
+      inputs = inputs[0]
+    if initial_state is not None:
+      pass
+    elif self.stateful:
+      initial_state = self.states
+    else:
+      initial_state = self.get_initial_state(inputs)
+
+    if len(initial_state) != len(self.states):
+      raise ValueError('Layer has ' + str(len(self.states)) +
+                       ' states but was passed ' + str(len(initial_state)) +
+                       ' initial states.')
+    return inputs, initial_state, constants
+
   def reset_states(self, states=None):
     if not self.stateful:
       raise AttributeError('Layer must be stateful.')
@@ -923,6 +939,8 @@ class RNN(Layer):
     }
     if self._num_constants is not None:
       config['num_constants'] = self._num_constants
+    if self.zero_output_for_mask:
+      config['zero_output_for_mask'] = self.zero_output_for_mask
 
     cell_config = self.cell.get_config()
     config['cell'] = {
@@ -2515,6 +2533,427 @@ class LSTM(RNN):
     return cls(**config)
 
 
+class UnifiedLSTM(LSTM):
+  """Long Short-Term Memory layer - Hochreiter 1997.
+
+  `UnifiedLSTM` unifies the implementations between standard `LSTM` layer and
+  `CuDNNLSTM` layer. Based on available runtime hardware and constrains,
+  `UnifiedLSTM` will choose different implementations to maximize the
+  performance. For instance, if GPU is available and all the parameters meet the
+  requirement of CuDNN kernel, `UnifiedLSTM` will use CuDNN kernel for the
+  calculation.
+
+  Arguments:
+    units: Positive integer, dimensionality of the output space.
+    activation: Activation function to use.
+        Default: hyperbolic tangent (`tanh`). If you pass `None`, no activation
+          is applied
+        (ie. "linear" activation: `a(x) = x`).
+    recurrent_activation: Activation function to use for the recurrent step.
+        Default: hard sigmoid (`hard_sigmoid`). If you pass `None`, no
+          activation is applied
+        (ie. "linear" activation: `a(x) = x`).
+    use_bias: Boolean, whether the layer uses a bias vector.
+    kernel_initializer: Initializer for the `kernel` weights matrix, used for
+      the linear transformation of the inputs..
+    recurrent_initializer: Initializer for the `recurrent_kernel` weights
+      matrix, used for the linear transformation of the recurrent state..
+    bias_initializer: Initializer for the bias vector.
+    unit_forget_bias: Boolean. If True, add 1 to the bias of the forget gate at
+      initialization. Setting it to true will also force
+      `bias_initializer="zeros"`. This is recommended in [Jozefowicz et
+          al.](http://www.jmlr.org/proceedings/papers/v37/jozefowicz15.pdf)
+    kernel_regularizer: Regularizer function applied to the `kernel` weights
+      matrix.
+    recurrent_regularizer: Regularizer function applied to the
+      `recurrent_kernel` weights matrix.
+    bias_regularizer: Regularizer function applied to the bias vector.
+    activity_regularizer: Regularizer function applied to the output of the
+      layer (its "activation")..
+    kernel_constraint: Constraint function applied to the `kernel` weights
+      matrix.
+    recurrent_constraint: Constraint function applied to the `recurrent_kernel`
+      weights matrix.
+    bias_constraint: Constraint function applied to the bias vector.
+    dropout: Float between 0 and 1. Fraction of the units to drop for the linear
+      transformation of the inputs.
+    recurrent_dropout: Float between 0 and 1. Fraction of the units to drop for
+      the linear transformation of the recurrent state.
+    implementation: Implementation mode, either 1 or 2. Mode 1 will structure
+      its operations as a larger number of smaller dot products and additions,
+      whereas mode 2 will batch them into fewer, larger operations. These modes
+      will have different performance profiles on different hardware and for
+      different applications.
+    return_sequences: Boolean. Whether to return the last output. in the output
+      sequence, or the full sequence.
+    return_state: Boolean. Whether to return the last state in addition to the
+      output.
+    go_backwards: Boolean (default False). If True, process the input sequence
+      backwards and return the reversed sequence.
+    stateful: Boolean (default False). If True, the last state for each sample
+      at index i in a batch will be used as initial state for the sample of
+      index i in the following batch.
+    unroll: Boolean (default False). If True, the network will be unrolled, else
+      a symbolic loop will be used. Unrolling can speed-up a RNN, although it
+      tends to be more memory-intensive. Unrolling is only suitable for short
+      sequences.
+  """
+
+  def __init__(self,
+               units,
+               activation='tanh',
+               recurrent_activation='hard_sigmoid',
+               use_bias=True,
+               kernel_initializer='glorot_uniform',
+               recurrent_initializer='orthogonal',
+               bias_initializer='zeros',
+               unit_forget_bias=True,
+               kernel_regularizer=None,
+               recurrent_regularizer=None,
+               bias_regularizer=None,
+               activity_regularizer=None,
+               kernel_constraint=None,
+               recurrent_constraint=None,
+               bias_constraint=None,
+               dropout=0.,
+               recurrent_dropout=0.,
+               implementation=1,
+               return_sequences=False,
+               return_state=False,
+               go_backwards=False,
+               stateful=False,
+               time_major=False,
+               unroll=False,
+               **kwargs):
+    # return_runtime is a flag for testing, which shows the real backend
+    # implementation chosen by grappler in graph mode.
+    self.return_runtime = kwargs.pop('return_runtime', False)
+
+    super(UnifiedLSTM, self).__init__(
+        units,
+        activation=activation,
+        recurrent_activation=recurrent_activation,
+        use_bias=use_bias,
+        kernel_initializer=kernel_initializer,
+        recurrent_initializer=recurrent_initializer,
+        bias_initializer=bias_initializer,
+        unit_forget_bias=unit_forget_bias,
+        kernel_regularizer=kernel_regularizer,
+        recurrent_regularizer=recurrent_regularizer,
+        bias_regularizer=bias_regularizer,
+        activity_regularizer=activity_regularizer,
+        kernel_constraint=kernel_constraint,
+        recurrent_constraint=recurrent_constraint,
+        bias_constraint=bias_constraint,
+        dropout=dropout,
+        recurrent_dropout=recurrent_dropout,
+        implementation=implementation,
+        return_sequences=return_sequences,
+        return_state=return_state,
+        go_backwards=go_backwards,
+        stateful=stateful,
+        time_major=time_major,
+        unroll=unroll,
+        **kwargs)
+
+    self.state_spec = [
+        InputSpec(shape=(None, dim)) for dim in (self.units, self.units)
+    ]
+    self._num_constants = None
+    self._num_inputs = None
+    self._dropout_mask = None
+    self.could_use_cudnn = (
+        activation == 'tanh' and recurrent_dropout == 0 and
+        not unroll and use_bias and bias_regularizer is None)
+
+  def call(self, inputs, mask=None, training=None, initial_state=None):
+    # LSTM does not support constants. Ignore it during process.
+    inputs, initial_state, _ = self._process_inputs(inputs, initial_state, None)
+
+    if isinstance(mask, list):
+      mask = mask[0]
+
+    input_shape = K.int_shape(inputs)
+    timesteps = input_shape[0] if self.time_major else input_shape[1]
+
+    if mask is not None or not self.could_use_cudnn:
+      # CuDNN does not support masking, fall back to use the normal LSTM.
+      kwargs = {'training': training}
+
+      def step(inputs, states):
+        return self.cell.call(inputs, states, **kwargs)
+
+      last_output, outputs, states = K.rnn(
+          step,
+          inputs,
+          initial_state,
+          constants=None,
+          go_backwards=self.go_backwards,
+          mask=mask,
+          unroll=self.unroll,
+          input_length=timesteps,
+          time_major=self.time_major,
+          zero_output_for_mask=self.zero_output_for_mask)
+      runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+    else:
+      # Use the new defun approach for backend implementation swap.
+      # Note that different implementations need to have same function
+      # signature, eg, the tensor parameters need to have same shape and dtypes.
+      # Since the CuDNN has an extra set of bias, those bias will be passed to
+      # both normal and CuDNN implementations.
+      if self.go_backwards:
+        # Reverse time axis.
+        inputs = K.reverse(inputs, 0 if self.time_major else 1)
+
+      if 0 < self.dropout < 1:
+        if self._dropout_mask is None:
+          self._dropout_mask = _generate_dropout_mask(
+              array_ops.ones_like(inputs),
+              self.dropout,
+              training=training,
+              count=4)
+
+        inputs *= self._dropout_mask[0]
+
+      # Each time a defun function is called, we will give a unique identifiable
+      # API name, so that the grappler won't get confused when it sees multiple
+      # LSTM layer added into same graph, and it will be able to pair up the
+      # different implementations across them.
+      experimental_api_name = 'lstm_' + str(uuid.uuid4())
+      standard_lstm_attributes = {
+          'experimental_api_implements': experimental_api_name,
+          'experimental_api_preferred_device': 'CPU',
+      }
+      cudnn_lstm_attributes = {
+          'experimental_api_implements': experimental_api_name,
+          'experimental_api_preferred_device': 'GPU',
+      }
+      defun_standard_lstm = function.defun_with_attributes(
+          standard_lstm, attributes=standard_lstm_attributes)
+      defun_cudnn_lstm = function.defun_with_attributes(
+          cudnn_lstm, attributes=cudnn_lstm_attributes)
+
+      if ops.executing_eagerly_outside_functions():
+        # Under eager context, the device placement is already known. Prefer the
+        # GPU implementation here.
+        if context.num_gpus() > 0:
+          last_output, outputs, new_h, new_c, runtime = defun_cudnn_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.time_major)
+        else:
+          last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+              inputs, initial_state[0], initial_state[1], self.cell.kernel,
+              self.cell.recurrent_kernel, self.cell.bias, self.activation,
+              self.recurrent_activation, self.time_major)
+      else:
+        # Call the normal LSTM impl and register the CuDNN impl function. The
+        # grappler will kick in during session execution to optimize the graph.
+        last_output, outputs, new_h, new_c, runtime = defun_standard_lstm(
+            inputs, initial_state[0], initial_state[1], self.cell.kernel,
+            self.cell.recurrent_kernel, self.cell.bias, self.activation,
+            self.recurrent_activation, self.time_major)
+
+        function.register(defun_cudnn_lstm, inputs, initial_state[0],
+                          initial_state[1], self.cell.kernel,
+                          self.cell.recurrent_kernel, self.cell.bias,
+                          self.time_major)
+      states = [new_h, new_c]
+
+    if self.stateful:
+      updates = []
+      for i in range(len(states)):
+        updates.append(state_ops.assign(self.states[i], states[i]))
+      self.add_update(updates, inputs)
+
+    if self.return_sequences:
+      output = outputs
+    else:
+      output = last_output
+
+    if self.return_state:
+      return [output] + states
+    elif self.return_runtime:
+      return output, runtime
+    else:
+      return output
+
+  @property
+  def trainable_weights(self):
+    if self.trainable:
+      weights = []
+      weights += self.cell.trainable_weights
+      return weights
+    return []
+
+  @property
+  def non_trainable_weights(self):
+    if not self.trainable:
+      weights = []
+      weights += self.cell.non_trainable_weights
+      return weights
+    return []
+
+  @property
+  def losses(self):
+    losses = []
+    losses += self.cell.losses
+    return losses + self._losses
+
+  @property
+  def updates(self):
+    updates = []
+    updates += self.cell.updates
+    return updates + self._updates
+
+  def get_weights(self):
+    weights = []
+    weights += self.cell.weights
+    return K.batch_get_value(weights)
+
+  def set_weights(self, weights):
+    tuples = []
+    cell_weights = weights[:len(self.cell.weights)]
+    if cell_weights:
+      tuples.append((self.cell.weights, cell_weights))
+    K.batch_set_value(tuples)
+
+
+def _canonical_to_params(weights, biases, shape, transpose_weights=False):
+  """Utility function convert variable to CuDNN compatible parameter.
+
+  Note that Keras weights for kernels are different from the CuDNN format. Eg.:
+
+  ```
+    Keras                 CuDNN
+    [[0, 1, 2],  <--->  [[0, 2, 4],
+     [3, 4, 5]]          [1, 3, 5]]
+  ```
+
+  If the input weights need to be in a unified format, then set
+  `transpose_weights=True` to convert the weights.
+
+  Args:
+    weights: list of weights for the individual kernels and recurrent kernels.
+    biases: list of biases for individual gate.
+    shape: the shape for the converted variables that will be feed to CuDNN.
+    transpose_weights: boolean, whether to transpose the weights.
+
+  Returns:
+    The converted weights that can be feed to CuDNN ops as param.
+  """
+  def convert(w):
+    return array_ops.transpose(w) if transpose_weights else w
+
+  weights = [array_ops.reshape(convert(x), shape) for x in weights]
+  biases = [array_ops.reshape(x, shape) for x in biases]
+  return array_ops.concat(weights + biases, axis=0)
+
+
+def standard_lstm(inputs, init_h, init_c, kernel, recurrent_kernel, bias,
+                  activation, recurrent_activation, time_major):
+  """LSTM with standard kernel implementation.
+
+  This implementation can be run on all types for hardware.
+
+  This implementation lifts out all the layer weights and make them function
+  parameters. It has same number of tensor input params as the CuDNN
+  counterpart. The RNN step logic has been simplified, eg dropout and mask is
+  removed since CuDNN implementation does not support that.
+
+  Note that the first half of the bias tensor should be ignored by this impl.
+  The CuDNN impl need an extra set of input gate bias. In order to make the both
+  function take same shape of parameter, that extra set of bias is also feed
+  here.
+
+  Args:
+    inputs: input tensor of LSTM layer.
+    init_h: initial state tensor for the cell output.
+    init_c: initial state tensor for the cell hidden state.
+    kernel: weights for cell kernel.
+    recurrent_kernel: weights for cell recurrent kernel.
+    bias: weights for cell kernel bias and recurrent bias. Only recurrent bias
+      is used in this case.
+    activation: Activation function to use for output.
+    recurrent_activation: Activation function to use for hidden recurrent state.
+    time_major: boolean, whether the inputs are in the format of
+      [time, batch, feature] or [batch, time, feature].
+
+  Returns:
+    last_output: output tensor for the last timestep, which has shape
+      [batch, units].
+    outputs: output tensor for all timesteps, which has shape
+      [batch, time, units].
+    state_0: the cell output, which has same shape as init_h.
+    state_1: the cell hidden state, which has same shape as init_c.
+    runtime: constant string tensor which indicate real runtime hardware. This
+      value is for testing purpose and should be used by user.
+  """
+  input_shape = K.int_shape(inputs)
+  timesteps = input_shape[0] if time_major else input_shape[1]
+
+  def step(cell_inputs, cell_states):
+    """Step function that will be used by Keras RNN backend."""
+    h_tm1 = cell_states[0]  # previous memory state
+    c_tm1 = cell_states[1]  # previous carry state
+
+    z = K.dot(cell_inputs, kernel)
+    z += K.dot(h_tm1, recurrent_kernel)
+    z = K.bias_add(z, bias)
+
+    z0, z1, z2, z3 = array_ops.split(z, 4, axis=1)
+
+    i = recurrent_activation(z0)
+    f = recurrent_activation(z1)
+    c = f * c_tm1 + i * activation(z2)
+    o = recurrent_activation(z3)
+
+    h = o * activation(c)
+    return h, [h, c]
+
+  last_output, outputs, new_states = K.rnn(
+      step,
+      inputs, [init_h, init_c],
+      constants=None,
+      unroll=False,
+      time_major=time_major,
+      input_length=timesteps)
+  return last_output, outputs, new_states[0], new_states[
+      1], constant_op.constant('cpu', dtype=dtypes.string, name='runtime')
+
+
+def cudnn_lstm(inputs, input_h, input_c, kernel, recurrent_kernel, bias,
+               time_major):
+  """LSTM with CuDNN implementation which is only available for GPU."""
+  if not time_major:
+    inputs = array_ops.transpose(inputs, perm=(1, 0, 2))
+  input_h = array_ops.expand_dims(input_h, axis=0)
+  input_c = array_ops.expand_dims(input_c, axis=0)
+
+  weights = array_ops.split(kernel, 4, axis=1)
+  weights += array_ops.split(recurrent_kernel, 4, axis=1)
+  # CuDNN has an extra set of bias for inputs, we disable them (setting to 0),
+  # so that mathematically it is same as the canonical LSTM implementation.
+  full_bias = array_ops.concat((array_ops.zeros_like(bias), bias), 0)
+
+  params = _canonical_to_params(
+      weights=weights,
+      biases=array_ops.split(full_bias, 8),
+      shape=constant_op.constant([-1]),
+      transpose_weights=True)
+
+  outputs, h, c, _ = gen_cudnn_rnn_ops.cudnn_rnn(
+      inputs, input_h=input_h, input_c=input_c, params=params, is_training=True)
+  last_output = outputs[-1]
+  if not time_major:
+    outputs = array_ops.transpose(outputs, perm=[1, 0, 2])
+  h = h[0]
+  c = c[0]
+
+  return last_output, outputs, h, c, constant_op.constant(
+      'cudnn', dtype=dtypes.string, name='runtime')
+
+
 def _generate_dropout_mask(ones, rate, training=None, count=1):
   def dropped_inputs():
     return K.dropout(ones, rate)
diff --git a/tensorflow/python/keras/layers/recurrent_test.py b/tensorflow/python/keras/layers/recurrent_test.py
index bb14a7a5056de7890d1447646d3e9c4b891f2ffa..b1449069e3279e27b08ecc383e72aed63525e521 100644
--- a/tensorflow/python/keras/layers/recurrent_test.py
+++ b/tensorflow/python/keras/layers/recurrent_test.py
@@ -1013,8 +1013,8 @@ class RNNTest(test.TestCase):
         inputs, _ = cell(inputs, initial_state)
         output = inputs
         if not context.executing_eagerly():
-          sess.run(variables_lib.global_variables_initializer())
-          output = sess.run(output)
+          self.evaluate(variables_lib.global_variables_initializer())
+          output = self.evaluate(output)
         return output
 
     random_seed.set_random_seed(12345)
@@ -1079,6 +1079,32 @@ class RNNTest(test.TestCase):
     # Expect last output to be the same as last output before masking
     self.assertAllClose(y_np, x_np[:, 1, :])
 
+  def test_zero_output_for_masking(self):
+
+    for unroll in [True, False]:
+      cell = keras.layers.SimpleRNNCell(5)
+      x = keras.Input((5, 5))
+      mask = keras.layers.Masking()
+      layer = keras.layers.RNN(
+          cell, return_sequences=True, zero_output_for_mask=True, unroll=unroll)
+      masked_input = mask(x)
+      y = layer(masked_input)
+      model = keras.models.Model(x, y)
+      model.compile(optimizer=rmsprop.RMSPropOptimizer(learning_rate=0.001),
+                    loss='mse')
+
+      np_x = np.ones((6, 5, 5))
+      result_1 = model.predict(np_x)
+
+      # set the time 4 and 5 for last record to be zero (masked).
+      np_x[5, 3:] = 0
+      result_2 = model.predict(np_x)
+
+      # expect the result_2 has same output, except the time 4,5 for last
+      # record.
+      result_1[5, 3:] = 0
+      self.assertAllClose(result_1, result_2)
+
 
 class Minimal2DRNNCell(keras.layers.Layer):
   """The minimal 2D RNN cell is a simple combination of 2 1-D RNN cell.
diff --git a/tensorflow/python/keras/layers/simplernn_test.py b/tensorflow/python/keras/layers/simplernn_test.py
index 93456b5e3a7e1ed9ad82cc779609a14e0c204983..bb3fea26926959c15e76556b836a120c02905c6f 100644
--- a/tensorflow/python/keras/layers/simplernn_test.py
+++ b/tensorflow/python/keras/layers/simplernn_test.py
@@ -98,6 +98,7 @@ class SimpleRNNLayerTest(test.TestCase):
     self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
     self.assertEqual(layer.cell.bias.constraint, b_constraint)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_with_masking_layer_SimpleRNN(self):
     layer_class = keras.layers.SimpleRNN
     inputs = np.random.random((2, 3, 4))
@@ -120,6 +121,7 @@ class SimpleRNNLayerTest(test.TestCase):
 
 class SimpleRNNLayerGraphOnlyTest(test.TestCase):
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_statefulness_SimpleRNN(self):
     num_samples = 2
     timesteps = 3
@@ -183,6 +185,7 @@ class SimpleRNNLayerGraphOnlyTest(test.TestCase):
 
       np.testing.assert_allclose(out7, out6, atol=1e-5)
 
+  @tf_test_util.run_deprecated_v1
   def test_regularizers_SimpleRNN(self):
     embedding_dim = 4
     layer_class = keras.layers.SimpleRNN
diff --git a/tensorflow/python/keras/layers/unified_lstm_test.py b/tensorflow/python/keras/layers/unified_lstm_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..932b2d331dcb60c6ff3a70ec418d47424d4b8575
--- /dev/null
+++ b/tensorflow/python/keras/layers/unified_lstm_test.py
@@ -0,0 +1,918 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnifiedLSTM layer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import shutil
+import time
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.core.protobuf import config_pb2
+from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.python import keras
+from tensorflow.python.client import session as session_lib
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.layers.cudnn_recurrent import CuDNNLSTM
+from tensorflow.python.keras.layers.recurrent import UnifiedLSTM
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.losses import losses
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.training import gradient_descent
+
+
+# Global config for grappler setting that is used for graph mode test.
+_rewrites = rewriter_config_pb2.RewriterConfig()
+_rewrites.function_optimization = rewriter_config_pb2.RewriterConfig.OFF
+_customer_optimizer = _rewrites.custom_optimizers.add()
+_customer_optimizer.name = 'ExperimentalImplementationSelector'
+_rewrites.min_graph_nodes = -1
+_graph_options = config_pb2.GraphOptions(rewrite_options=_rewrites)
+_config = config_pb2.ConfigProto(graph_options=_graph_options)
+
+
+@test_util.run_v1_only('b/120545219')
+class UnifiedLSTMTest(test.TestCase, parameterized.TestCase):
+
+  def test_unifiedLSTM(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      outputs, runtime = layer(inputs)
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  def test_unifiedLSTM_with_cond(self):
+    # This test is to demonstrate the graph rewrite of grappler plugin under
+    # the condition that the function returns different number of internal
+    # states.
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 1
+
+    with self.cached_session(config=_config, use_gpu=True) as sess:
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=output_shape)
+      y_train = keras.utils.to_categorical(y_train, output_shape)
+
+      layer = UnifiedLSTM(rnn_state_size, return_runtime=True)
+
+      inputs = array_ops.placeholder(
+          dtypes.float32, shape=(None, timestep, input_shape), name='inputs')
+      predict = array_ops.placeholder(
+          dtypes.float32, shape=(None, output_shape), name='predict')
+
+      zeros = array_ops.zeros([batch, output_shape])
+      dummy_runtime = constant_op.constant(
+          'unknown', dtype=dtypes.string, name='runtime')
+      a = constant_op.constant(0)
+      b = constant_op.constant(1)
+      # Will always run the lstm layer.
+      outputs, runtime = control_flow_ops.cond(
+          gen_math_ops.less(a, b),
+          lambda: layer(inputs),
+          lambda: (zeros, dummy_runtime))
+      loss = losses.softmax_cross_entropy(predict, outputs)
+      optimizer = gradient_descent.GradientDescentOptimizer(0.001)
+      train_op = optimizer.minimize(loss)
+
+      sess.run([variables.global_variables_initializer()])
+      existing_loss = 0
+
+      for _ in range(epoch):
+        loss_value, _, runtime_value = sess.run([loss, train_op, runtime], {
+            inputs: x_train,
+            predict: y_train
+        })
+        if test.is_gpu_available():
+          self.assertEqual(runtime_value, b'cudnn')
+        else:
+          self.assertEqual(runtime_value, b'cpu')
+        # Make sure the loss is updated for every epoch
+        # (layer weights properly updated).
+        self.assertNotEqual(existing_loss, loss_value)
+        existing_loss = loss_value
+
+  @parameterized.named_parameters(
+      ('_non_tan_activation', 'relu', 0, False, True, None),
+      ('_use_recurrent_dropout', 'tanh', 0.1, False, True, None),
+      ('_unroll', 'tanh', 0, True, True, None),
+      ('_not_use_bias', 'tanh', 0, False, False, None),
+      ('_use_bias_regularizer', 'tanh', 0, False, True, 'l2')
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_could_use_defun_backend(self, activation, recurrent_dropout,
+                                   unroll, use_bias, bias_regularizer):
+    layer = UnifiedLSTM(1,
+                        activation=activation,
+                        recurrent_dropout=recurrent_dropout,
+                        unroll=unroll,
+                        use_bias=use_bias,
+                        bias_regularizer=bias_regularizer)
+    self.assertFalse(layer.could_use_cudnn)
+
+  def test_unified_lstm_feature_parity_with_canonical_lstm(self):
+    with context.eager_mode():
+      # Run this test under eager only due to b/120160788 for model.set_weights.
+      input_shape = 10
+      rnn_state_size = 8
+      timestep = 4
+      batch = 20
+
+      (x_train, y_train), _ = testing_utils.get_test_data(
+          train_samples=batch,
+          test_samples=0,
+          input_shape=(timestep, input_shape),
+          num_classes=rnn_state_size)
+      y_train = keras.utils.to_categorical(y_train, rnn_state_size)
+
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      lstm_layer = keras.layers.LSTM(rnn_state_size,
+                                     recurrent_activation='sigmoid')
+      output = lstm_layer(inputs)
+      lstm_model = keras.models.Model(inputs, output)
+      weights = lstm_model.get_weights()
+      y_1 = lstm_model.predict(x_train)
+      lstm_model.compile('rmsprop', 'mse')
+      lstm_model.fit(x_train, y_train)
+      y_2 = lstm_model.predict(x_train)
+
+      with test_util.device(use_gpu=True):
+        cudnn_layer = keras.layers.UnifiedLSTM(rnn_state_size,
+                                               recurrent_activation='sigmoid')
+        cudnn_model = keras.models.Model(inputs, cudnn_layer(inputs))
+      cudnn_model.set_weights(weights)
+      y_3 = cudnn_model.predict(x_train)
+      cudnn_model.compile('rmsprop', 'mse')
+      cudnn_model.fit(x_train, y_train)
+      y_4 = cudnn_model.predict(x_train)
+
+      self.assertAllClose(y_1, y_3)
+      self.assertAllClose(y_2, y_4)
+
+  @parameterized.named_parameters(
+      # test_name, use_bias, bias_initializer, activation
+      ('normal', True, 'zeros'),
+      ('no_bias', False, 'zeros'),
+      ('random_bias', True, 'random_uniform'),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_model_save_load(self, use_bias, bias_initializer):
+    temp_dir = self.get_temp_dir()
+    self.addCleanup(shutil.rmtree, temp_dir)
+    h5_path = os.path.join(temp_dir, 'test.h5')
+
+    batch = 10
+    timestep = 3
+    input_dim = 5
+    units = 2
+
+    x = np.random.random((batch, timestep, input_dim))
+
+    def build_model():
+      inputs = keras.layers.Input(
+          shape=[timestep, input_dim], dtype=dtypes.float32)
+      layer = keras.layers.UnifiedLSTM(
+          units,
+          use_bias=use_bias,
+          bias_initializer=bias_initializer)
+      output = layer(inputs)
+      return keras.models.Model(inputs, output), layer
+
+    model, layer = build_model()
+    y_ref = model.predict(x)
+    model.save_weights(h5_path)
+
+    cloned_model, new_layer = build_model()
+    cloned_model.load_weights(h5_path)
+    y = cloned_model.predict(x)
+
+    self.assertAllClose(y, y_ref)
+    self.assertAllClose(layer.get_weights(), new_layer.get_weights())
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_unified_lstm_output_on_multiple_kernel(self):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+    with test_util.device(use_gpu=False):
+      # Note that CuDNN use 'sigmoid' as activation. Force the CPU
+      # implementation to use 'sigmoid' so that it will generate same output as
+      # CuDNN implementation.
+      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      cpu_model = keras.models.Model(inputs, output)
+      weights = cpu_model.get_weights()
+      y_1 = cpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = UnifiedLSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      gpu_model = keras.models.Model(inputs, output)
+      gpu_model.set_weights(weights)
+      y_2 = gpu_model.predict(x_train)
+
+    with test_util.device(use_gpu=True):
+      layer = keras.layers.LSTM(rnn_state_size, recurrent_activation='sigmoid')
+      output = layer(inputs)
+      canonical_model = keras.models.Model(inputs, output)
+      # Remove the extra cudnn bias since canonical lstm will not use it.
+      canonical_model.set_weights(weights[:3])
+      y_3 = canonical_model.predict(x_train)
+
+    self.assertAllClose(y_1, y_2)
+    self.assertAllClose(y_2, y_3)
+
+  @parameterized.named_parameters(
+      # test_name, time_major, go_backwards
+      ('normal', False, False),
+      ('time_major', True, False),
+      ('go_backwards', False, True),
+      ('both', True, True),
+  )
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_time_major_and_go_backward(self, time_major, go_backwards):
+    input_shape = 10
+    rnn_state_size = 8
+    timestep = 4
+    batch = 100
+
+    x_train = np.random.random((batch, timestep, input_shape))
+
+    def build_model(layer_cls):
+      inputs = keras.layers.Input(
+          shape=[timestep, input_shape], dtype=dtypes.float32)
+      layer = layer_cls(rnn_state_size,
+                        recurrent_activation='sigmoid',
+                        time_major=time_major,
+                        return_sequences=True,
+                        go_backwards=go_backwards)
+      if time_major:
+        converted_input = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(inputs)
+        outputs = layer(converted_input)
+        outputs = keras.layers.Lambda(
+            lambda t: array_ops.transpose(t, [1, 0, 2]))(outputs)
+      else:
+        outputs = layer(inputs)
+      return keras.models.Model(inputs, outputs)
+
+    lstm_model = build_model(keras.layers.LSTM)
+    y_ref = lstm_model.predict(x_train)
+    weights = lstm_model.get_weights()
+
+    unified_lstm_model = build_model(keras.layers.UnifiedLSTM)
+    unified_lstm_model.set_weights(weights)
+    y = unified_lstm_model.predict(x_train)
+
+    self.assertAllClose(y, y_ref)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_keras_model_with_lstm(self):
+    input_shape = 10
+    rnn_state_size = 8
+    output_shape = 8
+    timestep = 4
+    batch = 100
+    epoch = 10
+
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=batch,
+        test_samples=0,
+        input_shape=(timestep, input_shape),
+        num_classes=output_shape)
+    y_train = keras.utils.to_categorical(y_train, output_shape)
+
+    layer = UnifiedLSTM(rnn_state_size)
+
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('rmsprop', loss='mse')
+    model.fit(x_train, y_train, epochs=epoch)
+    model.evaluate(x_train, y_train)
+    model.predict(x_train)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_sequences_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'return_sequences': True
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_static_shape_inference_LSTM(self):
+    # Github issue: 15165
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+
+    model = keras.models.Sequential()
+    inputs = keras.layers.Dense(
+        embedding_dim, input_shape=(timesteps, embedding_dim))
+    model.add(inputs)
+    layer = UnifiedLSTM(units, return_sequences=True)
+    model.add(layer)
+    outputs = model.layers[-1].output
+    self.assertEqual(outputs.get_shape().as_list(), [None, timesteps, units])
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dynamic_behavior_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer = UnifiedLSTM(units, input_shape=(None, embedding_dim))
+    model = keras.models.Sequential()
+    model.add(layer)
+    model.compile(gradient_descent.GradientDescentOptimizer(0.001), 'mse')
+    x = np.random.random((num_samples, timesteps, embedding_dim))
+    y = np.random.random((num_samples, units))
+    model.train_on_batch(x, y)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_dropout_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'dropout': 0.1,
+            'recurrent_dropout': 0.1
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @parameterized.parameters([0, 1, 2])
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_implementation_mode_LSTM(self, implementation_mode):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    testing_utils.layer_test(
+        UnifiedLSTM,
+        kwargs={
+            'units': units,
+            'implementation': implementation_mode
+        },
+        input_shape=(num_samples, timesteps, embedding_dim))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_constraints_LSTM(self):
+    embedding_dim = 4
+    layer_class = UnifiedLSTM
+    k_constraint = keras.constraints.max_norm(0.01)
+    r_constraint = keras.constraints.max_norm(0.01)
+    b_constraint = keras.constraints.max_norm(0.01)
+    layer = layer_class(
+        5,
+        return_sequences=False,
+        weights=None,
+        input_shape=(None, embedding_dim),
+        kernel_constraint=k_constraint,
+        recurrent_constraint=r_constraint,
+        bias_constraint=b_constraint)
+    layer.build((None, None, embedding_dim))
+    self.assertEqual(layer.cell.kernel.constraint, k_constraint)
+    self.assertEqual(layer.cell.recurrent_kernel.constraint, r_constraint)
+    self.assertEqual(layer.cell.bias.constraint, b_constraint)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_with_masking_layer_LSTM(self):
+    layer_class = UnifiedLSTM
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(layer_class(units=5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_masking_with_stacking_LSTM(self):
+    inputs = np.random.random((2, 3, 4))
+    targets = np.abs(np.random.random((2, 3, 5)))
+    targets /= targets.sum(axis=-1, keepdims=True)
+    model = keras.models.Sequential()
+    model.add(keras.layers.Masking(input_shape=(3, 4)))
+    model.add(UnifiedLSTM(10, return_sequences=True, unroll=False))
+    model.add(UnifiedLSTM(5, return_sequences=True, unroll=False))
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+    model.fit(inputs, targets, epochs=1, batch_size=2, verbose=1)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_from_config_LSTM(self):
+    layer_class = UnifiedLSTM
+    for stateful in (False, True):
+      l1 = layer_class(units=1, stateful=stateful)
+      l2 = layer_class.from_config(l1.get_config())
+      assert l1.get_config() == l2.get_config()
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_specify_initial_state_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    layer = UnifiedLSTM(units)
+    if len(initial_state) == 1:
+      output = layer(inputs, initial_state=initial_state[0])
+    else:
+      output = layer(inputs, initial_state=initial_state)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def DISABLED_test_specify_initial_state_non_keras_tensor(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    # Test with non-Keras tensor
+    inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [
+        keras.backend.random_normal_variable((num_samples, units), 0, 1)
+        for _ in range(num_states)
+    ]
+    layer = UnifiedLSTM(units)
+    output = layer(inputs, initial_state=initial_state)
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch(inputs, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_reset_states_with_values(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    layer = UnifiedLSTM(units, stateful=True)
+    layer.build((num_samples, timesteps, embedding_dim))
+    layer.reset_states()
+    assert len(layer.states) == num_states
+    assert layer.states[0] is not None
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.zeros(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+    state_shapes = [keras.backend.int_shape(state) for state in layer.states]
+    values = [np.ones(shape) for shape in state_shapes]
+    if len(values) == 1:
+      values = values[0]
+    layer.reset_states(values)
+    self.assertAllClose(
+        keras.backend.eval(layer.states[0]),
+        np.ones(keras.backend.int_shape(layer.states[0])),
+        atol=1e-4)
+
+    # Test with invalid data
+    with self.assertRaises(ValueError):
+      layer.reset_states([1] * (len(layer.states) + 1))
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_specify_state_with_masking(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input((timesteps, embedding_dim))
+    _ = keras.layers.Masking()(inputs)
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    output = UnifiedLSTM(units)(inputs, initial_state=initial_state)
+
+    model = keras.models.Model([inputs] + initial_state, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([inputs] + initial_state, targets)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_return_state(self):
+    num_states = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = UnifiedLSTM(units, return_state=True, stateful=True)
+    outputs = layer(inputs)
+    state = outputs[1:]
+    assert len(state) == num_states
+    model = keras.models.Model(inputs, state[0])
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    state = model.predict(inputs)
+    self.assertAllClose(keras.backend.eval(layer.states[0]), state, atol=1e-4)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_state_reuse(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+
+    inputs = keras.Input(batch_shape=(num_samples, timesteps, embedding_dim))
+    layer = UnifiedLSTM(units, return_state=True, return_sequences=True)
+    outputs = layer(inputs)
+    output, state = outputs[0], outputs[1:]
+    output = UnifiedLSTM(units)(output, initial_state=state)
+    model = keras.models.Model(inputs, output)
+
+    inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    model.predict(inputs)
+
+  @test_util.run_in_graph_and_eager_modes(config=_config)
+  def test_initial_states_as_other_inputs(self):
+    timesteps = 3
+    embedding_dim = 4
+    units = 3
+    num_samples = 2
+    num_states = 2
+    layer_class = UnifiedLSTM
+
+    # Test with Keras tensor
+    main_inputs = keras.Input((timesteps, embedding_dim))
+    initial_state = [keras.Input((units,)) for _ in range(num_states)]
+    inputs = [main_inputs] + initial_state
+
+    layer = layer_class(units)
+    output = layer(inputs)
+    assert initial_state[0] in layer._inbound_nodes[0].input_tensors
+
+    model = keras.models.Model(inputs, output)
+    model.compile(
+        loss='categorical_crossentropy',
+        optimizer=gradient_descent.GradientDescentOptimizer(0.01))
+
+    main_inputs = np.random.random((num_samples, timesteps, embedding_dim))
+    initial_state = [
+        np.random.random((num_samples, units)) for _ in range(num_states)
+    ]
+    targets = np.random.random((num_samples, units))
+    model.train_on_batch([main_inputs] + initial_state, targets)
+
+
+@test_util.run_v1_only('b/120545219')
+class LSTMLayerGraphOnlyTest(test.TestCase):
+
+  def test_statefulness_LSTM(self):
+    num_samples = 2
+    timesteps = 3
+    embedding_dim = 4
+    units = 2
+    layer_class = UnifiedLSTM
+    with self.cached_session(config=_config):
+      model = keras.models.Sequential()
+      model.add(
+          keras.layers.Embedding(
+              4,
+              embedding_dim,
+              mask_zero=True,
+              input_length=timesteps,
+              batch_input_shape=(num_samples, timesteps)))
+      layer = layer_class(
+          units, return_sequences=False, stateful=True, weights=None)
+      model.add(layer)
+      model.compile(
+          optimizer=gradient_descent.GradientDescentOptimizer(0.01), loss='mse')
+      out1 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertEqual(out1.shape, (num_samples, units))
+
+      # train once so that the states change
+      model.train_on_batch(
+          np.ones((num_samples, timesteps)), np.ones((num_samples, units)))
+      out2 = model.predict(np.ones((num_samples, timesteps)))
+
+      # if the state is not reset, output should be different
+      self.assertNotEqual(out1.max(), out2.max())
+
+      # check that output changes after states are reset
+      # (even though the model itself didn't change)
+      layer.reset_states()
+      out3 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out2.max(), out3.max())
+
+      # check that container-level reset_states() works
+      model.reset_states()
+      out4 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertAllClose(out3, out4, atol=1e-5)
+
+      # check that the call to `predict` updated the states
+      out5 = model.predict(np.ones((num_samples, timesteps)))
+      self.assertNotEqual(out4.max(), out5.max())
+
+      # Check masking
+      layer.reset_states()
+
+      left_padded_input = np.ones((num_samples, timesteps))
+      left_padded_input[0, :1] = 0
+      left_padded_input[1, :2] = 0
+      out6 = model.predict(left_padded_input)
+
+      layer.reset_states()
+
+      right_padded_input = np.ones((num_samples, timesteps))
+      right_padded_input[0, -1:] = 0
+      right_padded_input[1, -2:] = 0
+      out7 = model.predict(right_padded_input)
+
+      self.assertAllClose(out7, out6, atol=1e-5)
+
+  def test_regularizers_LSTM(self):
+    embedding_dim = 4
+    layer_class = UnifiedLSTM
+    with self.cached_session(config=_config):
+      layer = layer_class(
+          5,
+          return_sequences=False,
+          weights=None,
+          input_shape=(None, embedding_dim),
+          kernel_regularizer=keras.regularizers.l1(0.01),
+          recurrent_regularizer=keras.regularizers.l1(0.01),
+          bias_regularizer='l2',
+          activity_regularizer='l1')
+      layer.build((None, None, 2))
+      self.assertEqual(len(layer.losses), 3)
+      x = keras.backend.variable(np.ones((2, 3, 2)))
+      layer(x)
+      self.assertEqual(len(layer.get_losses_for(x)), 1)
+
+
+class UnifiedLSTMPerformanceTest(test.Benchmark):
+
+  def _measure_performance(self, test_config, model, x_train, y_train):
+    batch = test_config['batch']
+    epoch = test_config['epoch']
+    warmup_epoch = test_config['warmup_epoch']
+
+    # warm up the model
+    model.fit(x_train, y_train, batch_size=batch, epochs=warmup_epoch)
+    start_time = time.time()
+    model.fit(x_train, y_train, batch_size=batch, epochs=epoch - warmup_epoch)
+    end_time = time.time()
+    return (end_time - start_time) / (epoch - warmup_epoch)
+
+  def _time_performance_run_cudnn_lstm(self, test_config, x_train, y_train):
+    # Get the performance number for standard Cudnn LSTM
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    cudnn_lstm_layer = CuDNNLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = cudnn_lstm_layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'CuDNN LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_unifed_lstm_gpu(
+      self, test_config, x_train, y_train):
+    # Get performance number for Unified_LSTM with grappler swap the impl
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = UnifiedLSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Unified LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _time_performance_run_normal_lstm(
+      self, test_config, x_train, y_train):
+    # Get performance number for standard LSTM on GPU.
+    input_shape = test_config['input_shape']
+    rnn_state_size = test_config['rnn_state_size']
+    timestep = test_config['timestep']
+
+    layer = keras.layers.LSTM(rnn_state_size)
+    inputs = keras.layers.Input(
+        shape=[timestep, input_shape], dtype=dtypes.float32)
+
+    outputs = layer(inputs)
+    model = keras.models.Model(inputs, outputs)
+    model.compile('sgd', 'mse')
+
+    sec_per_epoch = self._measure_performance(
+        test_config, model, x_train, y_train)
+    logging.info('Average performance for %s per epoch is: %s',
+                 'Normal LSTM', sec_per_epoch)
+    return sec_per_epoch
+
+  def _benchmark_performance_with_standard_cudnn_impl(self):
+    if not test.is_gpu_available():
+      self.skipTest('performance test will only run on GPU')
+
+    mode = 'eager' if context.executing_eagerly() else 'graph'
+    batch = 64
+    num_batch = 10
+    test_config = {
+        'input_shape': 128,
+        'rnn_state_size': 64,
+        'output_shape': 64,
+        'timestep': 50,
+        'batch': batch,
+        'epoch': 20,
+        # The performance for warmup epoch is ignored.
+        'warmup_epoch': 1,
+    }
+    (x_train, y_train), _ = testing_utils.get_test_data(
+        train_samples=(batch * num_batch),
+        test_samples=0,
+        input_shape=(test_config['timestep'], test_config['input_shape']),
+        num_classes=test_config['output_shape'])
+    y_train = keras.utils.to_categorical(y_train, test_config['output_shape'])
+
+    cudnn_sec_per_epoch = self._time_performance_run_cudnn_lstm(
+        test_config, x_train, y_train)
+    unified_lstm_sec_per_epoch = self._time_performance_run_unifed_lstm_gpu(
+        test_config, x_train, y_train)
+    normal_lstm_sec_per_epoch = self._time_performance_run_normal_lstm(
+        test_config, x_train, y_train)
+
+    cudnn_vs_unified = cudnn_sec_per_epoch / unified_lstm_sec_per_epoch
+    unified_vs_normal = normal_lstm_sec_per_epoch / unified_lstm_sec_per_epoch
+
+    self.report_benchmark(name='keras_cudnn_lstm_' + mode,
+                          wall_time=cudnn_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_unified_lstm_' + mode,
+                          wall_time=unified_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+    self.report_benchmark(name='keras_canonical_lstm_' + mode,
+                          wall_time=normal_lstm_sec_per_epoch,
+                          iters=test_config['epoch'],
+                          extras=test_config)
+
+    logging.info('Expect the performance of Unified LSTM is within 80% of '
+                 'CuDNN LSTM, got {0:.2f}%'.format(cudnn_vs_unified * 100))
+    logging.info('Expect the performance of Unified LSTM is more than 5 times'
+                 ' of normal LSTM, got {0:.2f}'.format(unified_vs_normal))
+
+  def benchmark_performance_graph(self):
+    with context.graph_mode(), session_lib.Session(config=_config):
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+  def benchmark_performance_eager(self):
+    with context.eager_mode():
+      self._benchmark_performance_with_standard_cudnn_impl()
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/keras/layers/wrappers.py b/tensorflow/python/keras/layers/wrappers.py
index d40f7a2e8099f81fa1253e4fa9323812eca33a0b..67b154141efc036b5fa7920c8179b35f5eb38cc1 100644
--- a/tensorflow/python/keras/layers/wrappers.py
+++ b/tensorflow/python/keras/layers/wrappers.py
@@ -23,8 +23,8 @@ import copy
 
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.keras import backend as K
-from tensorflow.python.keras.engine.base_layer import InputSpec
 from tensorflow.python.keras.engine.base_layer import Layer
+from tensorflow.python.keras.engine.input_spec import InputSpec
 from tensorflow.python.keras.layers.recurrent import _standardize_args
 from tensorflow.python.keras.utils import generic_utils
 from tensorflow.python.keras.utils import tf_utils
@@ -389,6 +389,10 @@ class Bidirectional(Wrapper):
       raise ValueError('Invalid merge mode. '
                        'Merge mode should be one of '
                        '{"sum", "mul", "ave", "concat", None}')
+    if getattr(layer, 'zero_output_for_mask', None) is not None:
+      # Force the zero_output_for_mask to be True if it presents.
+      layer.zero_output_for_mask = True
+
     self.forward_layer = copy.copy(layer)
     config = layer.get_config()
     config['go_backwards'] = not config['go_backwards']
diff --git a/tensorflow/python/keras/layers/wrappers_test.py b/tensorflow/python/keras/layers/wrappers_test.py
index 9584b0186c476055d0416fe3da29b3eb19829a2a..727f33dadc8abf113e9af76ef63e3e016de319ce 100644
--- a/tensorflow/python/keras/layers/wrappers_test.py
+++ b/tensorflow/python/keras/layers/wrappers_test.py
@@ -165,6 +165,7 @@ class TimeDistributedTest(test.TestCase):
       y = model.predict(np.random.random((10, 3, 2)))
       self.assertAllClose(np.mean(y), 0., atol=1e-1, rtol=1e-1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_batchnorm(self):
     with self.cached_session():
       # test that wrapped BN updates still work.
@@ -187,13 +188,14 @@ class TimeDistributedTest(test.TestCase):
       # Verify input_map has one mapping from inputs to reshaped inputs.
       self.assertEqual(len(td._input_map.keys()), 1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_trainable(self):
     # test layers that need learning_phase to be set
     x = keras.layers.Input(shape=(3, 2))
     layer = keras.layers.TimeDistributed(keras.layers.BatchNormalization())
     _ = layer(x)
-    self.assertEquals(len(layer.updates), 2)
-    self.assertEquals(len(layer.trainable_weights), 2)
+    self.assertEqual(len(layer.updates), 2)
+    self.assertEqual(len(layer.trainable_weights), 2)
     layer.trainable = False
     assert not layer.updates
     assert not layer.trainable_weights
@@ -201,6 +203,7 @@ class TimeDistributedTest(test.TestCase):
     assert len(layer.updates) == 2
     assert len(layer.trainable_weights) == 2
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masked_embedding_and_unspecified_shape(self):
     with self.cached_session():
       # test with unspecified shape and Embeddings with mask_zero
@@ -233,6 +236,7 @@ class TimeDistributedTest(test.TestCase):
         self.assertAllEqual(mask_outputs_val[i], ref_mask_val[i])
       self.assertIs(mask_outputs[-1], None)  # final layer
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_TimeDistributed_with_masking_layer(self):
     with self.cached_session():
       # test with Masking layer
@@ -375,6 +379,7 @@ class BidirectionalTest(test.TestCase):
       model.compile(loss='mse', optimizer='sgd')
       model.fit(x, y, epochs=1, batch_size=1)
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_merged_value(self):
     rnn = keras.layers.LSTM
     samples = 2
@@ -505,6 +510,7 @@ class BidirectionalTest(test.TestCase):
       layer.trainable = True
       assert len(layer.trainable_weights) == 6
 
+  @tf_test_util.run_v1_only('b/120545219')
   def test_Bidirectional_updates(self):
     with self.cached_session():
       x = keras.layers.Input(shape=(3, 2))
@@ -635,6 +641,34 @@ class BidirectionalTest(test.TestCase):
       y_np_3 = model.predict([x_np, s_fw_np, s_bk_np, c_np])
       self.assertAllClose(y_np, y_np_3, atol=1e-4)
 
+  def test_Bidirectional_with_masking(self):
+    rnn = keras.layers.LSTM
+    samples = 2
+    dim = 5
+    timesteps = 3
+    units = 3
+    merge_mode = 'concat'
+    x = np.random.rand(samples, timesteps, dim)
+    # clear the first record's timestep 2, and expect the output of timestep 2
+    # is also 0s.
+    x[0, 2] = 0
+
+    with self.cached_session():
+      inputs = keras.Input((timesteps, dim))
+      masked_inputs = keras.layers.Masking()(inputs)
+      wrapped = keras.layers.Bidirectional(
+          rnn(units, return_sequences=True),
+          merge_mode=merge_mode)
+      outputs = _to_list(wrapped(masked_inputs, training=True))
+      self.assertEqual(len(outputs), 1)
+      self.assertEqual(outputs[0].get_shape().as_list(),
+                       [None, timesteps, units * 2])
+
+      model = keras.Model(inputs, outputs)
+      y = _to_list(model.predict(x))
+      self.assertEqual(len(y), 1)
+      self.assertAllClose(y[0][0, 2], np.zeros(units * 2))
+
 
 def _to_list(ls):
   if isinstance(ls, list):
diff --git a/tensorflow/python/keras/losses.py b/tensorflow/python/keras/losses.py
index 9f548bfe0408d5c053c25b9ae14810d582b83e1e..4c584d0ff059ba8eabd3de06ebb06b2703400a73 100644
--- a/tensorflow/python/keras/losses.py
+++ b/tensorflow/python/keras/losses.py
@@ -19,16 +19,382 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import abc
+
 import six
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.losses_utils import compute_weighted_loss
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.util.tf_export import tf_export
 
 
+class Loss(object):
+  """Loss base class.
+
+  To be implemented by subclasses:
+  * `call()`: Contains the logic for loss calculation using `y_true`, `y_pred`.
+
+  Example subclass implementation:
+  ```
+  class MeanSquaredError(Loss):
+    def call(self, y_true, y_pred):
+      y_pred = ops.convert_to_tensor(y_pred)
+      y_true = math_ops.cast(y_true, y_pred.dtype)
+      return K.mean(math_ops.square(y_pred - y_true), axis=-1)
+  ```
+
+  Args:
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    self.reduction = reduction
+    self.name = name
+
+  def __call__(self, y_true, y_pred, sample_weight=None):
+    """Invokes the `Loss` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional `Tensor` whose rank is either 0, or the same rank
+        as `y_true`, or is broadcastable to `y_true`. `sample_weight` acts as a
+        coefficient for the loss. If a scalar is provided, then the loss is
+        simply scaled by the given value. If `sample_weight` is a tensor of size
+        `[batch_size]`, then the total loss for each sample of the batch is
+        rescaled by the corresponding element in the `sample_weight` vector. If
+        the shape of `sample_weight` matches the shape of `y_pred`, then the
+        loss of each measurable element of `y_pred` is scaled by the
+        corresponding value of `sample_weight`.
+
+    Returns:
+      Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
+        shape as `y_true`; otherwise, it is scalar.
+
+    Raises:
+      ValueError: If the shape of `sample_weight` is invalid.
+    """
+    with ops.name_scope(self.name, format(self.__class__.__name__),
+                        (y_pred, y_true, sample_weight)):
+      losses = self.call(y_true, y_pred)
+      return compute_weighted_loss(
+          losses, sample_weight, reduction=self.reduction)
+
+  @classmethod
+  def from_config(cls, config):
+    """Instantiates a `Loss` from its config (output of `get_config()`).
+
+    Args:
+        config: Output of `get_config()`.
+
+    Returns:
+        A `Loss` instance.
+    """
+    return cls(**config)
+
+  def get_config(self):
+    return {'reduction': self.reduction, 'name': self.name}
+
+  @abc.abstractmethod
+  def call(self, y_true, y_pred):
+    """Invokes the `Loss` instance.
+
+    Args:
+      y_true: Ground truth values, with the same shape as 'y_pred'.
+      y_pred: The predicted values.
+    """
+    NotImplementedError('Must be implemented in subclasses.')
+
+
+@tf_export('keras.losses.MeanSquaredError')
+class MeanSquaredError(Loss):
+  """Computes the mean of squares of errors between labels and predictions.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean squared error value is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  mse = tf.keras.losses.MeanSquaredError()
+  loss = mse([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanSquaredError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanSquaredError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean squared error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_squared_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanAbsoluteError')
+class MeanAbsoluteError(Loss):
+  """Computes the mean of absolute difference between labels and predictions.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean absolute error value is 3/4 (0.75).
+
+  Usage:
+
+  ```python
+  mae = tf.keras.losses.MeanAbsoluteError()
+  loss = mae([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanAbsoluteError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanAbsoluteError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean absolute error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_absolute_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanAbsolutePercentageError')
+class MeanAbsolutePercentageError(Loss):
+  """Computes the mean absolute percentage error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean absolute percentage error value is 5e+08.
+
+  Usage:
+
+  ```python
+  mape = tf.keras.losses.MeanAbsolutePercentageError()
+  loss = mape([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 5e+08
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanAbsolutePercentageError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanAbsolutePercentageError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean absolute percentage error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_absolute_percentage_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.MeanSquaredLogarithmicError')
+class MeanSquaredLogarithmicError(Loss):
+  """Computes the mean squared logarithmic error between `y_true` and `y_pred`.
+
+  For example, if `y_true` is [0., 0., 1., 1.] and `y_pred` is [1., 1., 1., 0.]
+  then the mean squared logarithmic error value is 0.36034.
+
+  Usage:
+
+  ```python
+  msle = tf.keras.losses.MeanSquaredLogarithmicError()
+  loss = msle([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 0.36034
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.MeanSquaredLogarithmicError())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Invokes the `MeanSquaredLogarithmicError` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Mean squared logarithmic error losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return mean_squared_logarithmic_error(y_true, y_pred)
+
+
+@tf_export('keras.losses.BinaryCrossentropy')
+class BinaryCrossentropy(Loss):
+  """Computes the binary cross entropy loss between the labels and predictions.
+
+  Usage:
+
+  ```python
+  bce = tf.keras.losses.BinaryCrossentropy()
+  loss = bce([0., 0., 1., 1.], [1., 1., 1., 0.])
+  print('Loss: ', loss.numpy())  # Loss: 12.007
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.BinaryCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(BinaryCrossentropy, self).__init__(reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `BinaryCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Binary cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+
+    if self.label_smoothing > 0:
+      y_true = y_true * (1 - self.label_smoothing) + 0.5 * self.label_smoothing
+
+    return binary_crossentropy(y_true, y_pred, from_logits=self.from_logits)
+
+
+@tf_export('keras.losses.CategoricalCrossentropy')
+class CategoricalCrossentropy(Loss):
+  """Computes categorical cross entropy loss between the `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  cce = tf.keras.losses.CategoricalCrossentropy()
+  loss = cce(
+    [[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+    [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]])
+  print('Loss: ', loss.numpy())  # Loss: 0.3239
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.keras.losses.CategoricalCrossentropy())
+  ````
+
+  Args:
+    from_logits: Whether `output` is expected to be a logits tensor. By default,
+      we consider that `output` encodes a probability distribution.
+    label_smoothing: If greater than `0` then smooth the labels. This option is
+      currently not supported when `y_pred` is a sparse input (not one-hot).
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+  """
+
+  def __init__(self,
+               from_logits=False,
+               label_smoothing=0,
+               reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+               name=None):
+    super(CategoricalCrossentropy, self).__init__(
+        reduction=reduction, name=name)
+    self.from_logits = from_logits
+    self.label_smoothing = label_smoothing
+
+  def call(self, y_true, y_pred):
+    """Invokes the `CategoricalCrossentropy` instance.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Categorical cross entropy losses.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = ops.convert_to_tensor(y_true)
+    is_sparse = y_pred.shape != y_true.shape
+
+    if is_sparse:
+      return sparse_categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+    else:
+      y_true = math_ops.cast(y_true, y_pred.dtype)
+      if self.label_smoothing > 0:
+        num_classes = math_ops.cast(array_ops.shape(y_true)[1], y_pred.dtype)
+        smooth_positives = 1.0 - self.label_smoothing
+        smooth_negatives = self.label_smoothing / num_classes
+        y_true = y_true * smooth_positives + smooth_negatives
+
+      return categorical_crossentropy(
+          y_true, y_pred, from_logits=self.from_logits)
+
+
 @tf_export('keras.metrics.mean_squared_error',
            'keras.metrics.mse',
            'keras.metrics.MSE',
@@ -116,20 +482,22 @@ def logcosh(y_true, y_pred):
 
 @tf_export('keras.metrics.categorical_crossentropy',
            'keras.losses.categorical_crossentropy')
-def categorical_crossentropy(y_true, y_pred):
-  return K.categorical_crossentropy(y_true, y_pred)
+def categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.sparse_categorical_crossentropy',
            'keras.losses.sparse_categorical_crossentropy')
-def sparse_categorical_crossentropy(y_true, y_pred):
-  return K.sparse_categorical_crossentropy(y_true, y_pred)
+def sparse_categorical_crossentropy(y_true, y_pred, from_logits=False):
+  return K.sparse_categorical_crossentropy(
+      y_true, y_pred, from_logits=from_logits)
 
 
 @tf_export('keras.metrics.binary_crossentropy',
            'keras.losses.binary_crossentropy')
-def binary_crossentropy(y_true, y_pred):
-  return K.mean(K.binary_crossentropy(y_true, y_pred), axis=-1)
+def binary_crossentropy(y_true, y_pred, from_logits=False):
+  return K.mean(
+      K.binary_crossentropy(y_true, y_pred, from_logits=from_logits), axis=-1)
 
 
 @tf_export('keras.metrics.kullback_leibler_divergence',
@@ -159,6 +527,40 @@ def cosine_proximity(y_true, y_pred):
   return -math_ops.reduce_sum(y_true * y_pred, axis=-1)
 
 
+class CosineProximity(Loss):
+  """Computes the cosine distance between `y_true` and `y_pred`.
+
+  Usage:
+
+  ```python
+  cosine_loss = tf.losses.CosineProximity()
+  loss = cosine_loss([0., 1., 1.], [1., 0., 1.])
+  print('Loss: ', loss.numpy())  # Loss: -0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss=tf.losses.CosineProximity())
+  ```
+  """
+
+  def call(self, y_true, y_pred):
+    """Calculates the cosine proximity loss.
+
+    Args:
+      y_true: Ground truth values.
+      y_pred: The predicted values.
+
+    Returns:
+      Cosine distance loss.
+    """
+    y_pred = ops.convert_to_tensor(y_pred)
+    y_true = math_ops.cast(y_true, y_pred.dtype)
+    return cosine_proximity(y_true, y_pred)
+
+
 # Aliases.
 
 mse = MSE = mean_squared_error
@@ -197,3 +599,9 @@ def get(identifier):
   else:
     raise ValueError('Could not interpret '
                      'loss function identifier:', identifier)
+
+
+LABEL_DTYPES_FOR_LOSSES = {
+    losses_impl.sparse_softmax_cross_entropy: 'int32',
+    sparse_categorical_crossentropy: 'int32'
+}
diff --git a/tensorflow/python/keras/losses_test.py b/tensorflow/python/keras/losses_test.py
index c7015270accc9f8244f8650d7edd78d609a47f09..d2791cdcd3bdac799c92112174f9edf2dbdf87ee 100644
--- a/tensorflow/python/keras/losses_test.py
+++ b/tensorflow/python/keras/losses_test.py
@@ -24,6 +24,11 @@ import shutil
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.losses import losses_impl
 from tensorflow.python.platform import test
 
 try:
@@ -138,5 +143,633 @@ class KerasLossesTest(test.TestCase):
         loaded_model.predict(np.random.rand(128, 2))
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredErrorTest(test.TestCase):
+
+  def test_config(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.SUM, name='mse_1')
+    self.assertEqual(mse_obj.name, 'mse_1')
+    self.assertEqual(mse_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3))
+    loss = mse_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 49.5, 3)
+
+  def test_scalar_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 113.85, 3)
+
+  def test_sample_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 767.8 / 6, 3)
+
+  def test_timestep_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mse_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 587 / 6, 3)
+
+  def test_zero_weighted(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_invalid_sample_weight(self):
+    mse_obj = keras.losses.MeanSquaredError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0], shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, r'Shapes \(2, 2\) and \(2, 3\) are incompatible'):
+      mse_obj(y_true, y_pred, sample_weight=sample_weight)
+
+  def test_no_reduction(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.NONE)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    loss = self.evaluate(loss)
+    self.assertArrayNear(loss, [84.3333, 143.3666], 1e-3)
+
+  def test_sum_reduction(self):
+    mse_obj = keras.losses.MeanSquaredError(
+        reduction=losses_impl.ReductionV2.SUM)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mse_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 227.69998, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsoluteErrorTest(test.TestCase):
+
+  def test_config(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.SUM, name='mae_1')
+    self.assertEqual(mae_obj.name, 'mae_1')
+    self.assertEqual(mae_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3))
+    loss = mae_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 5.5, 3)
+
+  def test_scalar_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 12.65, 3)
+
+  def test_sample_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 81.4 / 6, 3)
+
+  def test_timestep_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mae_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 83 / 6, 3)
+
+  def test_zero_weighted(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_invalid_sample_weight(self):
+    mae_obj = keras.losses.MeanAbsoluteError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3], shape=(2, 3, 1))
+    sample_weight = constant_op.constant([3, 6, 5, 0], shape=(2, 2))
+    with self.assertRaisesRegexp(
+        ValueError, r'Shapes \(2, 2\) and \(2, 3\) are incompatible'):
+      mae_obj(y_true, y_pred, sample_weight=sample_weight)
+
+  def test_no_reduction(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.NONE)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    loss = self.evaluate(loss)
+    self.assertArrayNear(loss, [10.7333, 14.5666], 1e-3)
+
+  def test_sum_reduction(self):
+    mae_obj = keras.losses.MeanAbsoluteError(
+        reduction=losses_impl.ReductionV2.SUM)
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mae_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 25.29999, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanAbsolutePercentageErrorTest(test.TestCase):
+
+  def test_config(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError(
+        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+    self.assertEqual(mape_obj.name, 'mape_1')
+    self.assertEqual(mape_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 211.8518, 3)
+
+  def test_scalar_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 487.259, 3)
+
+  def test_sample_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 422.8888, 3)
+
+  def test_timestep_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = mape_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 694.4445, 3)
+
+  def test_zero_weighted(self):
+    mape_obj = keras.losses.MeanAbsolutePercentageError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = mape_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MeanSquaredLogarithmicErrorTest(test.TestCase):
+
+  def test_config(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError(
+        reduction=losses_impl.ReductionV2.SUM, name='mape_1')
+    self.assertEqual(msle_obj.name, 'mape_1')
+    self.assertEqual(msle_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 1.4370, 3)
+
+  def test_scalar_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 3.3051, 3)
+
+  def test_sample_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 3.7856, 3)
+
+  def test_timestep_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = msle_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 2.6473, 3)
+
+  def test_zero_weighted(self):
+    msle_obj = keras.losses.MeanSquaredLogarithmicError()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = msle_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CosineProximityTest(test.TestCase):
+
+  def test_config(self):
+    cosine_obj = keras.losses.CosineProximity(
+        reduction=losses_impl.ReductionV2.SUM, name='cosine_loss')
+    self.assertEqual(cosine_obj.name, 'cosine_loss')
+    self.assertEqual(cosine_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_unweighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), -0.18722, 3)
+
+  def test_scalar_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), -0.43060, 3)
+
+  def test_sample_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.15599, 3)
+
+  def test_timestep_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3, 1))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3, 1),
+                                  dtype=dtypes.float32)
+    sample_weight = constant_op.constant([3, 6, 5, 0, 4, 2], shape=(2, 3))
+    loss = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), -2.0000, 3)
+
+  def test_zero_weighted(self):
+    cosine_obj = keras.losses.CosineProximity()
+    y_true = constant_op.constant([1, 9, 2, -5, -2, 6], shape=(2, 3))
+    y_pred = constant_op.constant([4, 8, 12, 8, 1, 3],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = cosine_obj(y_true, y_pred, sample_weight=0)
+    self.assertAlmostEqual(self.evaluate(loss), 0., 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class BinaryCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    bce_obj = keras.losses.BinaryCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(bce_obj.name, 'bce_1')
+    self.assertEqual(bce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy()
+    loss = bce_obj(y_true, y_true)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[100.0, -100.0, -100.0],
+                                   [-100.0, 100.0, -100.0],
+                                   [-100.0, -100.0, 100.0]])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 8.0004, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([10., 10., 10., -10., 10, -10],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 5., 3)
+
+  def test_scalar_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float32)
+    loss = bce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 18.4010, 3)
+
+    # Test with logits.
+    y_true = array_ops.ones((32, 1))
+    logits = array_ops.ones((32, 1), dtype=dtypes.float32)
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), 0.7205, 3)
+
+  def test_sample_weighted(self):
+    bce_obj = keras.losses.BinaryCrossentropy()
+    y_true = constant_op.constant([1, 0, 1, 0, 0, 1], shape=(2, 3))
+    y_pred = constant_op.constant([1, 1, 1, 0, 1, 0],
+                                  shape=(2, 3),
+                                  dtype=dtypes.float64)
+    sample_weight = constant_op.constant([1.2, 3.4], shape=(2, 1))
+    loss = bce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 21.4907, 3)
+
+    # Test with logits.
+    y_true = constant_op.constant([[0, 0, 1], [1, 0, 0], [0, 1, 0]])
+    logits = constant_op.constant(
+        [[100.0, -100.0, -100.0], [-100.0, 100.0, -100.0],
+         [-100.0, -100.0, 100.0]],
+        dtype=dtypes.float64)
+    weights = constant_op.constant([3, 2, 8])
+    bce_obj = keras.losses.BinaryCrossentropy(from_logits=True)
+    loss = bce_obj(y_true, logits, sample_weight=weights)
+    self.assertAlmostEqual(self.evaluate(loss), 288.8888, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant(((1, 0, 1), (1, 1, 0), (0, 1, 1)))
+    logits = constant_op.constant(((100.0, -100.0, 100.0),
+                                   (100.0, -100.0, 100.0),
+                                   (100.0, 100.0, -100.0)))
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = bce_obj(y_true, logits)
+    self.assertAllClose((0., 66.6666, 66.6666), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 1]])
+    label_smoothing = 0.1
+    # Loss: max(x, 0) - x * z + log(1 + exp(-abs(x)))
+    # Label smoothing: z' = z * (1 - L) + 0.5L
+    #                  1  = 1 - 0.5L
+    #                  0  = 0.5L
+    # Applying the above two fns to the given input:
+    # (100 - 100 * (1 - 0.5 L)  + 0 +
+    #  0   + 100 * (0.5 L)      + 0 +
+    #  0   + 100 * (1 - 0.5 L)  + 0) * (1/3)
+    #  = (100 + 50L) * 1/3
+    bce_obj = keras.losses.BinaryCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = bce_obj(y_true, logits)
+    expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CategoricalCrossentropyTest(test.TestCase):
+
+  def test_config(self):
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        reduction=losses_impl.ReductionV2.SUM, name='bce_1')
+    self.assertEqual(cce_obj.name, 'bce_1')
+    self.assertEqual(cce_obj.reduction, losses_impl.ReductionV2.SUM)
+
+  def test_all_correct_unweighted(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]],
+                                  dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction(self):
+    y_true = constant_op.constant([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+  def test_label_smoothing(self):
+    logits = constant_op.constant([[100.0, -100.0, -100.0]])
+    y_true = constant_op.constant([[1, 0, 0]])
+    label_smoothing = 0.1
+    # Softmax Cross Entropy Loss: -\sum_i p_i \log q_i
+    # where for a softmax activation
+    # \log q_i = x_i - \log \sum_j \exp x_j
+    #          = x_i - x_max - \log \sum_j \exp (x_j - x_max)
+    # For our activations, [100, -100, -100]
+    # \log ( exp(0) + exp(-200) + exp(-200) ) = 0
+    # so our log softmaxes become: [0, -200, -200]
+    # Label smoothing: z' = z * (1 - L) + L/n
+    #                  1  = 1 - L + L/n
+    #                  0  = L/n
+    # Applying the above two fns to the given input:
+    # -0 * (1 - L + L/n) + 200 * L/n + 200 * L/n = 400 L/n
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, label_smoothing=label_smoothing)
+    loss = cce_obj(y_true, logits)
+    expected_value = 400.0 * label_smoothing / 3.0
+    self.assertAlmostEqual(self.evaluate(loss), expected_value, 3)
+
+  def test_all_correct_unweighted_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]], dtype=dtypes.int64)
+    y_pred = constant_op.constant([[1., 0., 0.], [0., 1., 0.], [0., 0., 1.]],
+                                  dtype=dtypes.float32)
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[10., 0., 0.], [0., 10., 0.], [0., 0., 10.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), 0.0, 3)
+
+  def test_unweighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([0, 1, 2])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred)
+    self.assertAlmostEqual(self.evaluate(loss), .3239, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits)
+    self.assertAlmostEqual(self.evaluate(loss), .0573, 3)
+
+  def test_scalar_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    loss = cce_obj(y_true, y_pred, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .7449, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=2.3)
+    self.assertAlmostEqual(self.evaluate(loss), .1317, 3)
+
+  def test_sample_weighted_sparse(self):
+    cce_obj = keras.losses.CategoricalCrossentropy()
+    y_true = constant_op.constant([[0], [1], [2]])
+    y_pred = constant_op.constant(
+        [[.9, .05, .05], [.5, .89, .6], [.05, .01, .94]], dtype=dtypes.float32)
+    sample_weight = constant_op.constant([[1.2], [3.4], [5.6]], shape=(3, 1))
+    loss = cce_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 1.0696, 3)
+
+    # Test with logits.
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(from_logits=True)
+    loss = cce_obj(y_true, logits, sample_weight=sample_weight)
+    self.assertAlmostEqual(self.evaluate(loss), 0.31829, 3)
+
+  def test_no_reduction_sparse(self):
+    y_true = constant_op.constant([[0], [1], [2]])
+    logits = constant_op.constant([[8., 1., 1.], [0., 9., 1.], [2., 3., 5.]])
+    cce_obj = keras.losses.CategoricalCrossentropy(
+        from_logits=True, reduction=losses_impl.ReductionV2.NONE)
+    loss = cce_obj(y_true, logits)
+    self.assertAllClose((0.001822, 0.000459, 0.169846), self.evaluate(loss), 3)
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/metrics.py b/tensorflow/python/keras/metrics.py
index 2ea64055979456cc1765c23ac58a41805bf3b2d0..331a8636d1c93ce9c8ee03a8d6c0f486617bf6dd 100644
--- a/tensorflow/python/keras/metrics.py
+++ b/tensorflow/python/keras/metrics.py
@@ -24,9 +24,11 @@ import functools
 import sys
 import types
 import weakref
+from enum import Enum
+import numpy as np
 import six
 
-from tensorflow.python.compat import compat
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
@@ -48,8 +50,10 @@ from tensorflow.python.keras.losses import sparse_categorical_crossentropy
 from tensorflow.python.keras.losses import squared_hinge
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
+from tensorflow.python.keras.utils.generic_utils import to_list
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
@@ -57,19 +61,11 @@ from tensorflow.python.ops import nn
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.ops import weights_broadcast_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util.tf_export import tf_export
 from tensorflow.tools.docs import doc_controls
 
 
-def check_is_tensor_or_operation(x, name):
-  """Raises type error if the given input is not a tensor or operation."""
-  if not (isinstance(x, ops.Tensor) or isinstance(x, ops.Operation)):
-    raise TypeError('{0} must be a Tensor or Operation, given: {1}'.format(
-        name, x))
-
-
 def clone_metric(metric):
   """Returns a clone of the metric if stateful, otherwise returns it as is."""
   if isinstance(metric, Metric):
@@ -102,8 +98,6 @@ def update_state_wrapper(update_state_fn):
     update_op = update_state_fn(*args, **kwargs)
     if update_op is not None:  # update_op will be None in eager execution.
       metric_obj.add_update(update_op, inputs=True)
-      check_is_tensor_or_operation(
-          update_op, 'Metric {0}\'s update'.format(metric_obj.name))
     return update_op
 
   return tf_decorator.make_decorator(update_state_fn, decorated)
@@ -128,7 +122,7 @@ def result_wrapper(result_fn):
     `merge_call()`.
   """
 
-  def decorated(metric_obj, *args):
+  def decorated(_, *args):
     """Decorated function with merge_call."""
     replica_context = distribution_strategy_context.get_replica_context()
     if replica_context is None:  # if in cross replica context already
@@ -147,9 +141,8 @@ def result_wrapper(result_fn):
 
       # Wrapping result in merge_call. merge_call is used when we want to leave
       # replica mode and compute a value in cross replica mode.
-      result_t = replica_context.merge_call(merge_fn_wrapper, result_fn, *args)
-    check_is_tensor_or_operation(result_t,
-                                 'Metric {0}\'s result'.format(metric_obj.name))
+      result_t = replica_context.merge_call(
+          merge_fn_wrapper, args=(result_fn,) + args)
     return result_t
 
   return tf_decorator.make_decorator(result_fn, decorated)
@@ -170,108 +163,169 @@ def weakmethod(method):
   return inner
 
 
-def safe_div(numerator, denominator):
-  """Computes a safe divide which returns 0 if the denominator is zero.
+class _ConfusionMatrix(Enum):
+  TRUE_POSITIVES = 'tp'
+  FALSE_POSITIVES = 'fp'
+  TRUE_NEGATIVES = 'tn'
+  FALSE_NEGATIVES = 'fn'
 
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
 
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
+def _assert_thresholds_range(thresholds):
+  invalid_thresholds = [t for t in thresholds if t < 0 or t > 1]
+  if any(invalid_thresholds):
+    raise ValueError('Threshold values must be in [0, 1]. Invalid values: {}'
+                     .format(invalid_thresholds))
 
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator))
 
+def _update_confusion_matrix_variables(variables_to_update,
+                                       y_true,
+                                       y_pred,
+                                       thresholds,
+                                       sample_weight=None):
+  """Returns op to update the given confusion matrix variables.
+
+  For every pair of values in y_true and y_pred:
 
-def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
-  """Squeeze or expand last dimension if needed.
+  true_positive: y_true == True and y_pred > thresholds
+  false_negatives: y_true == True and y_pred <= thresholds
+  true_negatives: y_true == False and y_pred <= thresholds
+  false_positive: y_true == False and y_pred > thresholds
 
-  1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1
-  (using `confusion_matrix.remove_squeezable_dimensions`).
-  2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1
-  from the new rank of `y_pred`.
-  If `sample_weight` is scalar, it is kept scalar.
+  The results will be weighted and added together. When multiple thresholds are
+  provided, we will repeat the same for every threshold.
 
-  This will use static shape if available. Otherwise, it will add graph
-  operations, which could result in a performance hit.
+  For estimation of these metrics over a stream of data, the function creates an
+  `update_op` operation that updates the given variables.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use weights of 0 to mask values.
 
   Args:
-    y_pred: Predicted values, a `Tensor` of arbitrary dimensions.
-    y_true: Optional label `Tensor` whose dimensions match `y_pred`.
-    sample_weight: Optional weight scalar or `Tensor` whose dimensions match
-      `y_pred`.
+    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
+      and corresponding variables to update as values.
+    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
+    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
+      the range `[0, 1]`.
+    thresholds: A float value or a python list or tuple of float thresholds in
+      `[0, 1]`.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
+      be either `1`, or the same as the corresponding `y_true` dimension).
 
   Returns:
-    Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has
-    the last dimension squeezed,
-    `sample_weight` could be extended by one dimension.
+    Update op.
+
+  Raises:
+    ValueError: If `y_pred` and `y_true` have mismatched shapes, or if
+      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
+      `variables_to_update` contains invalid keys.
   """
-  if y_true is not None:
-    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
-    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
-        y_true, y_pred)
-
-  if sample_weight is None:
-    return y_pred, y_true, None
-
-  sample_weight = ops.convert_to_tensor(sample_weight)
-  weights_shape = sample_weight.get_shape()
-  weights_rank = weights_shape.ndims
-  if weights_rank == 0:  # If weights is scalar, do nothing.
-    return y_pred, y_true, sample_weight
-
-  y_pred_shape = y_pred.get_shape()
-  y_pred_rank = y_pred_shape.ndims
-  if (y_pred_rank is not None) and (weights_rank is not None):
-    # Use static rank.
-    if weights_rank - y_pred_rank == 1:
-      sample_weight = array_ops.squeeze(sample_weight, [-1])
-    elif y_pred_rank - weights_rank == 1:
-      sample_weight = array_ops.expand_dims(sample_weight, [-1])
-    return y_pred, y_true, sample_weight
-
-  # Use dynamic rank.
-  weights_rank_tensor = array_ops.rank(sample_weight)
-  rank_diff = weights_rank_tensor - array_ops.rank(y_pred)
-  maybe_squeeze_weights = lambda: array_ops.squeeze(sample_weight, [-1])
-
-  def _maybe_expand_weights():
-    return control_flow_ops.cond(
-        math_ops.equal(rank_diff,
-                       -1), lambda: array_ops.expand_dims(sample_weight, [-1]),
-        lambda: sample_weight)
-
-  def _maybe_adjust_weights():
-    return control_flow_ops.cond(
-        math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
-        _maybe_expand_weights)
-
-  # squeeze or expand last dim of `sample_weight` if its rank differs by 1
-  # from the new rank of `y_pred`.
-  sample_weight = control_flow_ops.cond(
-      math_ops.equal(weights_rank_tensor, 0), lambda: sample_weight,
-      _maybe_adjust_weights)
-  return y_pred, y_true, sample_weight
+  if variables_to_update is None:
+    return
+  y_true = ops.convert_to_tensor(y_true)
+  y_pred = ops.convert_to_tensor(y_pred)
+  y_pred.shape.assert_is_compatible_with(y_true.shape)
+
+  if not any(
+      key for key in variables_to_update if key in list(_ConfusionMatrix)):
+    raise ValueError(
+        'Please provide at least one valid confusion matrix '
+        'variable to update. Valid variable key options are: "{}". '
+        'Received: "{}"'.format(
+            list(_ConfusionMatrix), variables_to_update.keys()))
+
+  invalid_keys = [
+      key for key in variables_to_update if key not in list(_ConfusionMatrix)
+  ]
+  if invalid_keys:
+    raise ValueError(
+        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
+            invalid_keys, list(_ConfusionMatrix)))
+
+  with ops.control_dependencies([
+      check_ops.assert_greater_equal(
+          y_pred,
+          math_ops.cast(0.0, dtype=y_pred.dtype),
+          message='predictions must be >= 0'),
+      check_ops.assert_less_equal(
+          y_pred,
+          math_ops.cast(1.0, dtype=y_pred.dtype),
+          message='predictions must be <= 1')
+  ]):
+    y_pred, y_true, sample_weight = squeeze_or_expand_dimensions(
+        math_ops.cast(y_pred, dtype=dtypes.float32),
+        math_ops.cast(y_true, dtype=dtypes.bool), sample_weight)
+
+  thresholds = to_list(thresholds)
+  num_thresholds = len(thresholds)
+  num_predictions = array_ops.size(y_pred)
+
+  # Reshape predictions and labels.
+  predictions_2d = array_ops.reshape(y_pred, [1, -1])
+  labels_2d = array_ops.reshape(
+      math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])
+
+  # Tile the thresholds for every prediction.
+  thresh_tiled = array_ops.tile(
+      array_ops.expand_dims(array_ops.constant(thresholds), 1),
+      array_ops.stack([1, num_predictions]))
+
+  # Tile the predictions for every threshold.
+  preds_tiled = array_ops.tile(predictions_2d, [num_thresholds, 1])
+
+  # Compare predictions and threshold.
+  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)
+
+  # Tile labels by number of thresholds
+  label_is_pos = array_ops.tile(labels_2d, [num_thresholds, 1])
+
+  if sample_weight is not None:
+    weights = weights_broadcast_ops.broadcast_weights(
+        math_ops.cast(sample_weight, dtype=dtypes.float32), y_pred)
+    weights_tiled = array_ops.tile(
+        array_ops.reshape(weights, [1, -1]), [num_thresholds, 1])
+  else:
+    weights_tiled = None
+
+  update_ops = []
+
+  def weighted_assign_add(label, pred, weights, var):
+    label_and_pred = math_ops.cast(
+        math_ops.logical_and(label, pred), dtype=dtypes.float32)
+    if weights is not None:
+      label_and_pred *= weights
+    return state_ops.assign_add(var, math_ops.reduce_sum(label_and_pred, 1))
+
+  loop_vars = {
+      _ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
+  }
+  update_tn = _ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
+  update_fp = _ConfusionMatrix.FALSE_POSITIVES in variables_to_update
+  update_fn = _ConfusionMatrix.FALSE_NEGATIVES in variables_to_update
+
+  if update_fn or update_tn:
+    pred_is_neg = math_ops.logical_not(pred_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)
+
+  if update_fp or update_tn:
+    label_is_neg = math_ops.logical_not(label_is_pos)
+    loop_vars[_ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
+    if update_tn:
+      loop_vars[_ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)
+
+  for matrix_cond, (label, pred) in loop_vars.items():
+    if matrix_cond in variables_to_update:
+      update_ops.append(
+          weighted_assign_add(label, pred, weights_tiled,
+                              variables_to_update[matrix_cond]))
+  return control_flow_ops.group(update_ops)
 
 
 @six.add_metaclass(abc.ABCMeta)
 class Metric(Layer):
   """Encapsulates metric logic and state.
 
-  Usage with eager execution:
+  Usage:
 
   ```python
   m = SomeMetric(...)
@@ -280,19 +334,6 @@ class Metric(Layer):
   print('Final result: ', m.result().numpy())
   ```
 
-  Usage with graph execution:
-
-  ```python
-  m = SomeMetric(...)
-  init_op = tf.variables_initializer(m.variables)  # Initialize variables
-  with tf.Session() as sess:
-    sess.run(init_op)
-    for input in ...:
-      update_op = m.update_state(input)
-      sess.run(update_op)
-    print('Final result: ', sess.run(m.result()))
-  ```
-
   Usage with tf.keras API:
 
   ```python
@@ -388,9 +429,20 @@ class Metric(Layer):
     Returns:
       The metric value tensor.
     """
-    update_op = self.update_state(*args, **kwargs)  # pylint: disable=not-callable
+    update_op = self.update_state(*args, **kwargs)
     with ops.control_dependencies([update_op]):
-      return self.result()  # pylint: disable=not-callable
+      result_t = self.result()
+
+      # We are adding the metric object as metadata on the result tensor.
+      # This is required when we want to use a metric with `add_metric` API on
+      # a Model/Layer in graph mode. This metric instance will later be used
+      # to reset variable state after each epoch of training.
+      # Example:
+      #   model = Model()
+      #   model.add_metric(Mean()(values), name='mean')
+      if not context.executing_eagerly():
+        result_t._metric_obj = self  # pylint: disable=protected-access
+      return result_t
 
   def reset_states(self):
     """Resets all of the metric state variables.
@@ -459,15 +511,35 @@ class Metric(Layer):
   ### End: For use by subclasses ###
 
 
+@tf_export('keras.metrics.Mean')
 class Mean(Metric):
   """Computes the (weighted) mean of the given values.
 
+  For example, if values is [1, 3, 5, 7] then the mean is 4.
+  If the weights were specified as [1, 1, 0, 0] then the mean would be 2.
+
   This metric creates two variables, `total` and `count` that are used to
   compute the average of `values`. This average is ultimately returned as `mean`
   which is an idempotent operation that simply divides `total` by `count`.
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Mean()
+  m.update_state([1, 3, 5, 7])
+  print('Final result: ', m.result().numpy())  # Final result: 4.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.add_metric(tf.keras.metrics.Mean(name='mean_1')(outputs))
+  model.compile('sgd', loss='mse')
+  ```
   """
 
   def __init__(self, name='mean', dtype=None):
@@ -525,11 +597,10 @@ class Mean(Metric):
     # updated.
     update_total_op = state_ops.assign_add(self.total, values)
     with ops.control_dependencies([update_total_op]):
-      update_count_op = state_ops.assign_add(self.count, num_values)
-      return ops.convert_to_tensor(update_count_op)
+      return state_ops.assign_add(self.count, num_values)
 
   def result(self):
-    return safe_div(self.total, self.count)
+    return math_ops.div_no_nan(self.total, self.count)
 
 
 class MeanMetricWrapper(Mean):
@@ -574,14 +645,62 @@ class MeanMetricWrapper(Mean):
         matches, sample_weight=sample_weight)
 
   def get_config(self):
-    config = self._fn_kwargs
+    config = {'fn': self._fn}
+    config.update(self._fn_kwargs)
     base_config = super(MeanMetricWrapper, self).get_config()
     return dict(list(base_config.items()) + list(config.items()))
 
 
+@tf_export('keras.metrics.Accuracy')
+class Accuracy(MeanMetricWrapper):
+  """Calculates how often predictions matches labels.
+
+  For example, if `y_true` is [1, 2, 3, 4] and `y_pred` is [0, 2, 3, 4]
+  then the accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 1, 0, 0] then the accuracy would be 1/2 or .5.
+
+  This metric creates two local variables, `total` and `count` that are used to
+  compute the frequency with which `y_pred` matches `y_true`. This frequency is
+  ultimately returned as `binary accuracy`: an idempotent operation that simply
+  divides `total` by `count`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Accuracy()
+  m.update_state([1, 2, 3, 4], [0, 2, 3, 4])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Accuracy()])
+  ```
+  """
+
+  def __init__(self, name='accuracy', dtype=None):
+    super(Accuracy, self).__init__(accuracy, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(Accuracy, cls).from_config(config)
+
+
+@tf_export('keras.metrics.BinaryAccuracy')
 class BinaryAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [1, 1, 0, 0] and `y_pred` is [0.98, 1, 0, 0.6]
+  then the binary accuracy is 3/4 or .75.  If the weights were specified as
+  [1, 0, 0, 1] then the binary accuracy would be 1/2 or .5.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `binary accuracy`: an idempotent operation that simply
@@ -589,6 +708,21 @@ class BinaryAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.BinaryAccuracy()
+  m.update_state([1, 1, 0, 0], [0.98, 1, 0, 0.6])
+  print('Final result: ', m.result().numpy())  # Final result: 0.75
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.BinaryAccuracy()])
+  ```
   """
 
   def __init__(self, name='binary_accuracy', dtype=None, threshold=0.5):
@@ -603,17 +737,50 @@ class BinaryAccuracy(MeanMetricWrapper):
     super(BinaryAccuracy, self).__init__(
         binary_accuracy, name, dtype=dtype, threshold=threshold)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(BinaryAccuracy, cls).from_config(config)
+
 
+@tf_export('keras.metrics.CategoricalAccuracy')
 class CategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches labels.
 
+  For example, if `y_true` is [[0, 0, 1], [0, 1, 0]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `categorical accuracy`: an idempotent operation that
   simply divides `total` by `count`.
 
+  `y_pred` and `y_true` should be passed in as vectors of probabilities, rather
+  than as labels. If necessary, use `tf.one_hot` to expand `y_true` as a vector.
+
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.CategoricalAccuracy()
+  m.update_state([[0, 0, 1], [0, 1, 0]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+    'sgd',
+    loss='mse',
+    metrics=[tf.keras.metrics.CategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='categorical_accuracy', dtype=None):
@@ -626,10 +793,22 @@ class CategoricalAccuracy(MeanMetricWrapper):
     super(CategoricalAccuracy, self).__init__(
         categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(CategoricalAccuracy, cls).from_config(config)
+
 
+@tf_export('keras.metrics.SparseCategoricalAccuracy')
 class SparseCategoricalAccuracy(MeanMetricWrapper):
   """Calculates how often predictions matches integer labels.
 
+  For example, if `y_true` is [[2], [1]] and `y_pred` is
+  [[0.1, 0.9, 0.8], [0.05, 0.95, 0]] then the categorical accuracy is 1/2 or .5.
+  If the weights were specified as [0.7, 0.3] then the categorical accuracy
+  would be .3.
+
   This metric creates two local variables, `total` and `count` that are used to
   compute the frequency with which `y_pred` matches `y_true`. This frequency is
   ultimately returned as `sparse categorical accuracy`: an idempotent operation
@@ -637,12 +816,712 @@ class SparseCategoricalAccuracy(MeanMetricWrapper):
 
   If `sample_weight` is `None`, weights default to 1.
   Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SparseCategoricalAccuracy()
+  m.update_state([[2], [1]], [[0.1, 0.9, 0.8], [0.05, 0.95, 0]])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])
+  ```
   """
 
   def __init__(self, name='sparse_categorical_accuracy', dtype=None):
     super(SparseCategoricalAccuracy, self).__init__(
         sparse_categorical_accuracy, name, dtype=dtype)
 
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(SparseCategoricalAccuracy, cls).from_config(config)
+
+
+class _ConfusionMatrixConditionCount(Metric):
+  """Calculates the number of the given confusion matrix condition."""
+
+  def __init__(self,
+               confusion_matrix_cond,
+               thresholds=None,
+               name=None,
+               dtype=None):
+    """Creates a `_ConfusionMatrixConditionCount` instance.
+
+    Args:
+      confusion_matrix_cond: One of `_ConfusionMatrix` conditions.
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(_ConfusionMatrixConditionCount, self).__init__(name=name, dtype=dtype)
+    self._confusion_matrix_cond = confusion_matrix_cond
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.accumulator = self.add_weight(
+        'accumulator',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates the given confusion matrix condition statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        self._confusion_matrix_cond: self.accumulator
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    if isinstance(self.thresholds, (list, tuple)):
+      result = self.accumulator
+    else:
+      result = self.accumulator[0]
+    return ops.convert_to_tensor(result)
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.FalsePositives')
+class FalsePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false positives.
+
+  For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [0, 0, 1, 1]
+  then the false positives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the false positives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false positives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.FalsePositives()
+  m.update_state([0, 1, 0, 0], [0, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalsePositives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalsePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalsePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.FalseNegatives')
+class FalseNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of false negatives.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [0, 1, 0, 0]
+  then the false negatives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the false negatives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  false negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of false negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.FalseNegatives()
+  m.update_state([0, 1, 1, 1], [0, 1, 0, 0])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.FalseNegatives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `FalseNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(FalseNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.FALSE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.TrueNegatives')
+class TrueNegatives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true negatives.
+
+  For example, if `y_true` is [0, 1, 0, 0] and `y_pred` is [1, 1, 0, 0]
+  then the true negatives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the true negatives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true negatives. This metric creates one local variable, `accumulator`
+  that is used to keep track of the number of true negatives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TrueNegatives()
+  m.update_state([0, 1, 0, 0], [1, 1, 0, 0])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TrueNegatives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TrueNegatives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TrueNegatives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_NEGATIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.TruePositives')
+class TruePositives(_ConfusionMatrixConditionCount):
+  """Calculates the number of true positives.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the true positives value is 2.  If the weights were specified as
+  [0, 0, 1, 0] then the true positives value would be 1.
+
+  If `sample_weight` is given, calculates the sum of the weights of
+  true positives. This metric creates one local variable, `true_positives`
+  that is used to keep track of the number of true positives.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.TruePositives()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 2
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.TruePositives()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `TruePositives` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(TruePositives, self).__init__(
+        confusion_matrix_cond=_ConfusionMatrix.TRUE_POSITIVES,
+        thresholds=thresholds,
+        name=name,
+        dtype=dtype)
+
+
+@tf_export('keras.metrics.Precision')
+class Precision(Metric):
+  """Computes the precision of the predictions with respect to the labels.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the precision value is 2/(2+1) ie. 0.66. If the weights were specified as
+  [0, 0, 1, 0] then the precision value would be 1.
+
+  The metric creates two local variables, `true_positives` and `false_positives`
+  that are used to compute the precision. This value is ultimately returned as
+  `precision`, an idempotent operation that simply divides `true_positives`
+  by the sum of `true_positives` and `false_positives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Precision()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Precision()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Precision` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Precision, self).__init__(name=name, dtype=dtype)
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fp = self.add_weight(
+        'false_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false positive statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_POSITIVES: self.fp
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    result = math_ops.div_no_nan(self.tp, self.tp + self.fp)
+    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.Recall')
+class Recall(Metric):
+  """Computes the recall of the predictions with respect to the labels.
+
+  For example, if `y_true` is [0, 1, 1, 1] and `y_pred` is [1, 0, 1, 1]
+  then the recall value is 2/(2+1) ie. 0.66. If the weights were specified as
+  [0, 0, 1, 0] then the recall value would be 1.
+
+  This metric creates two local variables, `true_positives` and
+  `false_negatives`, that are used to compute the recall. This value is
+  ultimately returned as `recall`, an idempotent operation that simply divides
+  `true_positives` by the sum of `true_positives` and `false_negatives`.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.Recall()
+  m.update_state([0, 1, 1, 1], [1, 0, 1, 1])
+  print('Final result: ', m.result().numpy())  # Final result: 0.66
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile('sgd', loss='mse', metrics=[tf.keras.metrics.Recall()])
+  ```
+  """
+
+  def __init__(self, thresholds=None, name=None, dtype=None):
+    """Creates a `Recall` instance.
+
+    Args:
+      thresholds: (Optional) Defaults to 0.5. A float value or a python
+        list/tuple of float threshold values in [0, 1]. A threshold is compared
+        with prediction values to determine the truth value of predictions
+        (i.e., above the threshold is `true`, below is `false`). One metric
+        value is generated for each threshold value.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    super(Recall, self).__init__(name=name, dtype=dtype)
+    self.thresholds = 0.5 if thresholds is None else thresholds
+    thresholds = to_list(thresholds)
+    _assert_thresholds_range(thresholds)
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+    self.fn = self.add_weight(
+        'false_negatives',
+        shape=(len(thresholds),),
+        initializer=init_ops.zeros_initializer)
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates true positive and false negative statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.FALSE_NEGATIVES: self.fn
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def result(self):
+    result = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+    return result if isinstance(self.thresholds, (list, tuple)) else result[0]
+
+  def reset_states(self):
+    num_thresholds = len(to_list(self.thresholds))
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@six.add_metaclass(abc.ABCMeta)
+class SensitivitySpecificityBase(Metric):
+  """Abstract base class for computing sensitivity and specificity.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+  """
+
+  def __init__(self, value, num_thresholds=200, name=None, dtype=None):
+    super(SensitivitySpecificityBase, self).__init__(name=name, dtype=dtype)
+    if num_thresholds <= 0:
+      raise ValueError('`num_thresholds` must be > 0.')
+    self.value = value
+    self.tp = self.add_weight(
+        'true_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.tn = self.add_weight(
+        'true_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.fp = self.add_weight(
+        'false_positives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+    self.fn = self.add_weight(
+        'false_negatives',
+        shape=(num_thresholds,),
+        initializer=init_ops.zeros_initializer)
+
+    # Compute `num_thresholds` thresholds in [0, 1]
+    if num_thresholds == 1:
+      self.thresholds = [0.5]
+    else:
+      thresholds = [(i + 1) * 1.0 / (num_thresholds - 1)
+                    for i in range(num_thresholds - 2)]
+      self.thresholds = [0.0] + thresholds + [1.0]
+
+  def update_state(self, y_true, y_pred, sample_weight=None):
+    """Accumulates confusion matrix statistics.
+
+    Args:
+      y_true: The ground truth values.
+      y_pred: The predicted values.
+      sample_weight: Optional weighting of each example. Defaults to 1. Can be a
+        `Tensor` whose rank is either 0, or the same rank as `y_true`, and must
+        be broadcastable to `y_true`.
+
+    Returns:
+      Update op.
+    """
+    return _update_confusion_matrix_variables({
+        _ConfusionMatrix.TRUE_POSITIVES: self.tp,
+        _ConfusionMatrix.TRUE_NEGATIVES: self.tn,
+        _ConfusionMatrix.FALSE_POSITIVES: self.fp,
+        _ConfusionMatrix.FALSE_NEGATIVES: self.fn,
+    }, y_true, y_pred, self.thresholds, sample_weight)
+
+  def reset_states(self):
+    num_thresholds = len(self.thresholds)
+    for v in self.variables:
+      K.set_value(v, np.zeros((num_thresholds,)))
+
+
+@tf_export('keras.metrics.SensitivityAtSpecificity')
+class SensitivityAtSpecificity(SensitivitySpecificityBase):
+  """Computes the sensitivity at a given specificity.
+
+  `Sensitivity` measures the proportion of actual positives that are correctly
+  identified as such (tp / (tp + fn)).
+  `Specificity` measures the proportion of actual negatives that are correctly
+  identified as such (tn / (tn + fp)).
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the
+  sensitivity at the given specificity. The threshold for the given specificity
+  value is computed and used to evaluate the corresponding sensitivity.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SensitivityAtSpecificity(0.4, num_thresholds=1)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+  print('Final result: ', m.result().numpy())  # Final result: 0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SensitivityAtSpecificity()])
+  ```
+  """
+
+  def __init__(self, specificity, num_thresholds=200, name=None, dtype=None):
+    """Creates a `SensitivityAtSpecificity` instance.
+
+    Args:
+      specificity: A scalar value in range `[0, 1]`.
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use for matching the given specificity.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    if specificity < 0 or specificity > 1:
+      raise ValueError('`specificity` must be in the range [0, 1].')
+    super(SensitivityAtSpecificity, self).__init__(
+        specificity, num_thresholds=num_thresholds, name=name, dtype=dtype)
+
+  def result(self):
+    # Calculate specificities at all the thresholds.
+    specificities = math_ops.div_no_nan(self.tn, self.tn + self.fp)
+
+    # Find the index of the threshold where the specificity is closest to the
+    # given specificity.
+    min_index = math_ops.argmin(
+        math_ops.abs(specificities - self.value), axis=0)
+    min_index = math_ops.cast(min_index, dtypes.int32)
+
+    # Compute sensitivity at that index.
+    return math_ops.div_no_nan(self.tp[min_index],
+                               self.tp[min_index] + self.fn[min_index])
+
+
+@tf_export('keras.metrics.SpecificityAtSensitivity')
+class SpecificityAtSensitivity(SensitivitySpecificityBase):
+  """Computes the specificity at a given sensitivity.
+
+  `Sensitivity` measures the proportion of actual positives that are correctly
+  identified as such (tp / (tp + fn)).
+  `Specificity` measures the proportion of actual negatives that are correctly
+  identified as such (tn / (tn + fp)).
+
+  This metric creates four local variables, `true_positives`, `true_negatives`,
+  `false_positives` and `false_negatives` that are used to compute the
+  specificity at the given sensitivity. The threshold for the given sensitivity
+  value is computed and used to evaluate the corresponding specificity.
+
+  If `sample_weight` is `None`, weights default to 1.
+  Use `sample_weight` of 0 to mask values.
+
+  For additional information about specificity and sensitivity, see the
+  following: https://en.wikipedia.org/wiki/Sensitivity_and_specificity
+
+  Usage:
+
+  ```python
+  m = tf.keras.metrics.SpecificityAtSensitivity(0.8, num_thresholds=1)
+  m.update_state([0, 0, 1, 1], [0, 0.5, 0.3, 0.9])
+  print('Final result: ', m.result().numpy())  # Final result: 1.0
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.keras.metrics.SpecificityAtSensitivity()])
+  ```
+  """
+
+  def __init__(self, sensitivity, num_thresholds=200, name=None, dtype=None):
+    """Creates a `SpecificityAtSensitivity` instance.
+
+    Args:
+      sensitivity: A scalar value in range `[0, 1]`.
+      num_thresholds: (Optional) Defaults to 200. The number of thresholds to
+        use for matching the given specificity.
+      name: (Optional) string name of the metric instance.
+      dtype: (Optional) data type of the metric result.
+    """
+    if sensitivity < 0 or sensitivity > 1:
+      raise ValueError('`sensitivity` must be in the range [0, 1].')
+    super(SpecificityAtSensitivity, self).__init__(
+        sensitivity, num_thresholds=num_thresholds, name=name, dtype=dtype)
+
+  def result(self):
+    # Calculate sensitivities at all the thresholds.
+    sensitivities = math_ops.div_no_nan(self.tp, self.tp + self.fn)
+
+    # Find the index of the threshold where the sensitivity is closest to the
+    # given specificity.
+    min_index = math_ops.argmin(
+        math_ops.abs(sensitivities - self.value), axis=0)
+    min_index = math_ops.cast(min_index, dtypes.int32)
+
+    # Compute specificity at that index.
+    return math_ops.div_no_nan(self.tn[min_index],
+                               self.tn[min_index] + self.fp[min_index])
+
+
+class CosineProximity(MeanMetricWrapper):
+  """Computes the cosine distance between the labels and predictions.
+
+  For example, if `y_true` is [0, 1, 1], and `y_pred` is [1, 0, 1], the cosine
+  proximity is -0.5.
+
+  This metric keeps the average cosine distance between `predictions` and
+  `labels` over a stream of data.
+
+  Usage:
+  ```python
+  m = tf.metrics.CosineProximity()
+  m.update_state([0, 1, 1], [1, 0, 1])
+  print('Final result: ', m.result().numpy())  # Final result: -0.5
+  ```
+
+  Usage with tf.keras API:
+
+  ```python
+  model = keras.models.Model(inputs, outputs)
+  model.compile(
+      'sgd',
+      loss='mse',
+      metrics=[tf.metrics.CosineProximity()])
+  ```
+  """
+
+  def __init__(self, name='cosine_proximity', dtype=None):
+    super(CosineProximity, self).__init__(cosine, name, dtype=dtype)
+
+  @classmethod
+  def from_config(cls, config):
+    if 'fn' in config:
+      config.pop('fn')
+    return super(CosineProximity, cls).from_config(config)
+
+
+def accuracy(y_true, y_pred):
+  y_pred.get_shape().assert_is_compatible_with(y_true.get_shape())
+  if y_true.dtype != y_pred.dtype:
+    y_pred = math_ops.cast(y_pred, y_true.dtype)
+  return math_ops.cast(math_ops.equal(y_true, y_pred), K.floatx())
+
 
 @tf_export('keras.metrics.binary_accuracy')
 def binary_accuracy(y_true, y_pred, threshold=0.5):
diff --git a/tensorflow/python/keras/metrics_test.py b/tensorflow/python/keras/metrics_test.py
index 5f5565d4d5a547d640217cf799a20d0050584ed6..92398acd8e6dc683e37cf759c667c4665961b356 100644
--- a/tensorflow/python/keras/metrics_test.py
+++ b/tensorflow/python/keras/metrics_test.py
@@ -19,22 +19,25 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import layers
 from tensorflow.python.keras import metrics
-from tensorflow.python.keras.engine.training import Model
+from tensorflow.python.keras.models import Sequential
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training.checkpointable import util as checkpointable_utils
+from tensorflow.python.training.rmsprop import RMSPropOptimizer
 
 
 class KerasMetricsTest(test.TestCase):
@@ -47,7 +50,7 @@ class KerasMetricsTest(test.TestCase):
         output = metric(y_a, y_b)
         self.assertEqual(K.eval(output).shape, (6,))
 
-  def test_sparse_categorical_accuracy(self):
+  def test_sparse_categorical_accuracy_int(self):
     with self.cached_session():
       metric = metrics.sparse_categorical_accuracy
       y_true = K.variable(np.random.randint(0, 7, (6,)))
@@ -128,116 +131,6 @@ class KerasMetricsTest(test.TestCase):
       result = K.eval(metrics.top_k_categorical_accuracy(y_true, y_pred, k=1))
       self.assertEqual(result, 0.)
 
-  def test_stateful_metrics(self):
-    with self.cached_session():
-      np.random.seed(1334)
-
-      class BinaryTruePositives(layers.Layer):
-        """Stateful Metric to count the total true positives over all batches.
-
-        Assumes predictions and targets of shape `(samples, 1)`.
-
-        Arguments:
-            threshold: Float, lower limit on prediction value that counts as a
-                positive class prediction.
-            name: String, name for the metric.
-        """
-
-        def __init__(self, name='true_positives', **kwargs):
-          super(BinaryTruePositives, self).__init__(name=name, **kwargs)
-          self.true_positives = K.variable(value=0, dtype='int32')
-          self.stateful = True
-
-        def reset_states(self):
-          K.set_value(self.true_positives, 0)
-
-        def __call__(self, y_true, y_pred):
-          """Computes the number of true positives in a batch.
-
-          Args:
-              y_true: Tensor, batch_wise labels
-              y_pred: Tensor, batch_wise predictions
-
-          Returns:
-              The total number of true positives seen this epoch at the
-                  completion of the batch.
-          """
-          y_true = math_ops.cast(y_true, 'int32')
-          y_pred = math_ops.cast(math_ops.round(y_pred), 'int32')
-          correct_preds = math_ops.cast(math_ops.equal(y_pred, y_true), 'int32')
-          true_pos = math_ops.cast(
-              math_ops.reduce_sum(correct_preds * y_true), 'int32')
-          current_true_pos = self.true_positives * 1
-          self.add_update(
-              state_ops.assign_add(self.true_positives, true_pos),
-              inputs=[y_true, y_pred])
-          return current_true_pos + true_pos
-
-      metric_fn = BinaryTruePositives()
-      config = metrics.serialize(metric_fn)
-      metric_fn = metrics.deserialize(
-          config, custom_objects={'BinaryTruePositives': BinaryTruePositives})
-
-      # Test on simple model
-      inputs = layers.Input(shape=(2,))
-      outputs = layers.Dense(1, activation='sigmoid')(inputs)
-      model = Model(inputs, outputs)
-      model.compile(optimizer='sgd',
-                    loss='binary_crossentropy',
-                    metrics=['acc', metric_fn])
-
-      # Test fit, evaluate
-      samples = 100
-      x = np.random.random((samples, 2))
-      y = np.random.randint(2, size=(samples, 1))
-      val_samples = 10
-      val_x = np.random.random((val_samples, 2))
-      val_y = np.random.randint(2, size=(val_samples, 1))
-
-      history = model.fit(x, y,
-                          epochs=1,
-                          batch_size=10,
-                          validation_data=(val_x, val_y))
-      outs = model.evaluate(x, y, batch_size=10)
-      preds = model.predict(x)
-
-      def ref_true_pos(y_true, y_pred):
-        return np.sum(np.logical_and(y_pred > 0.5, y_true == 1))
-
-      # Test correctness (e.g. updates should have been run)
-      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
-
-      # Test correctness of the validation metric computation
-      val_preds = model.predict(val_x)
-      val_outs = model.evaluate(val_x, val_y, batch_size=10)
-      self.assertAllClose(
-          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
-      self.assertAllClose(
-          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
-
-      # Test with generators
-      gen = [(np.array([x0]), np.array([y0])) for x0, y0 in zip(x, y)]
-      val_gen = [(np.array([x0]), np.array([y0]))
-                 for x0, y0 in zip(val_x, val_y)]
-      history = model.fit_generator(iter(gen),
-                                    epochs=1,
-                                    steps_per_epoch=samples,
-                                    validation_data=iter(val_gen),
-                                    validation_steps=val_samples)
-      outs = model.evaluate_generator(iter(gen), steps=samples)
-      preds = model.predict_generator(iter(gen), steps=samples)
-
-      # Test correctness of the metric results
-      self.assertAllClose(outs[2], ref_true_pos(y, preds), atol=1e-5)
-
-      # Test correctness of the validation metric computation
-      val_preds = model.predict_generator(iter(val_gen), steps=val_samples)
-      val_outs = model.evaluate_generator(iter(val_gen), steps=val_samples)
-      self.assertAllClose(
-          val_outs[2], ref_true_pos(val_y, val_preds), atol=1e-5)
-      self.assertAllClose(
-          val_outs[2], history.history['val_true_positives'][-1], atol=1e-5)
-
   @test_util.run_in_graph_and_eager_modes(assert_no_eager_garbage=True)
   def test_mean(self):
     m = metrics.Mean(name='my_mean')
@@ -319,19 +212,19 @@ class KerasMetricsTest(test.TestCase):
       m = metrics.Mean()
       v = array_ops.placeholder(dtypes.float32)
       w = array_ops.placeholder(dtypes.float32)
-      sess.run(variables.variables_initializer(m.variables))
+      self.evaluate(variables.variables_initializer(m.variables))
 
       # check __call__()
       result_t = m(v, sample_weight=w)
       result = sess.run(result_t, feed_dict=({v: 100, w: 0.5}))
-      self.assertEqual(sess.run(m.total), 50)
-      self.assertEqual(sess.run(m.count), 0.5)
+      self.assertEqual(self.evaluate(m.total), 50)
+      self.assertEqual(self.evaluate(m.count), 0.5)
       self.assertEqual(result, 50 / 0.5)
 
       # check update_state() and result()
       result = sess.run(result_t, feed_dict=({v: [1, 5], w: [1, 0.2]}))
-      self.assertAlmostEqual(sess.run(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
-      self.assertAlmostEqual(sess.run(m.count), 1.7, 2)  # 0.5 + 1.2
+      self.assertAlmostEqual(self.evaluate(m.total), 52, 2)  # 50 + 1 + 5 * 0.2
+      self.assertAlmostEqual(self.evaluate(m.count), 1.7, 2)  # 0.5 + 1.2
       self.assertAlmostEqual(result, 52 / 1.7, 2)
 
   @test_util.run_in_graph_and_eager_modes
@@ -365,6 +258,28 @@ class KerasMetricsTest(test.TestCase):
     self.assertEqual(200., self.evaluate(restore_mean.result()))
     self.assertEqual(3, self.evaluate(restore_mean.count))
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_accuracy(self):
+    acc_obj = metrics.Accuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
+
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[1], [2], [3], [4]], [[1], [2], [3], [4]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
+
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[2], [0]], sample_weight=[[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.96, 2)  # 4.5/4.7
+
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy(self):
     acc_obj = metrics.BinaryAccuracy(name='my acc')
@@ -398,11 +313,6 @@ class KerasMetricsTest(test.TestCase):
     result = self.evaluate(result_t)
     self.assertAlmostEqual(result, 0.67, 2)  # 4.5/6.7
 
-    # check incompatible shapes
-    with self.assertRaisesRegexp(ValueError,
-                                 r'Shapes \(1,\) and \(2,\) are incompatible'):
-      acc_obj.update_state([1, 1], [1])
-
   @test_util.run_in_graph_and_eager_modes
   def test_binary_accuracy_threshold(self):
     acc_obj = metrics.BinaryAccuracy(threshold=0.7)
@@ -436,47 +346,830 @@ class KerasMetricsTest(test.TestCase):
     self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
   @test_util.run_in_graph_and_eager_modes
-  def test_invalid_result(self):
+  def test_sparse_categorical_accuracy(self):
+    acc_obj = metrics.SparseCategoricalAccuracy(name='my acc')
+
+    # check config
+    self.assertEqual(acc_obj.name, 'my acc')
+    self.assertTrue(acc_obj.stateful)
+    self.assertEqual(len(acc_obj.variables), 2)
+    self.assertEqual(acc_obj.dtype, dtypes.float32)
+    self.evaluate(variables.variables_initializer(acc_obj.variables))
 
-    class InvalidResult(metrics.Metric):
+    # verify that correct value is returned
+    update_op = acc_obj.update_state([[2], [1]],
+                                     [[0.1, 0.1, 0.8], [0.05, 0.95, 0]])
+    self.evaluate(update_op)
+    result = self.evaluate(acc_obj.result())
+    self.assertEqual(result, 1)  # 2/2
 
-      def __init__(self, name='invalid-result', dtype=dtypes.float64):
-        super(InvalidResult, self).__init__(name=name, dtype=dtype)
+    # check with sample_weight
+    result_t = acc_obj([[2], [1]], [[0.1, 0.1, 0.8], [0.05, 0, 0.95]],
+                       [[0.5], [0.2]])
+    result = self.evaluate(result_t)
+    self.assertAlmostEqual(result, 0.93, 2)  # 2.5/2.7
 
-      def update_state(self, *args, **kwargs):
-        pass
 
-      def result(self):
-        return 1
+def _get_simple_sequential_model(compile_metrics):
+  model = Sequential()
+  model.add(
+      layers.Dense(
+          3, activation='relu', input_dim=4, kernel_initializer='ones'))
+  model.add(layers.Dense(1, activation='sigmoid', kernel_initializer='ones'))
+  model.compile(
+      loss='mae',
+      metrics=compile_metrics,
+      optimizer=RMSPropOptimizer(learning_rate=0.001))
+  return model
 
-    invalid_result_obj = InvalidResult()
-    with self.assertRaisesRegexp(
-        TypeError,
-        'Metric invalid-result\'s result must be a Tensor or Operation, given:'
-    ):
-      invalid_result_obj.result()
 
-  @test_util.run_in_graph_and_eager_modes
-  def test_invalid_update(self):
+@test_util.run_all_in_graph_and_eager_modes
+class FalsePositivesTest(test.TestCase):
 
-    class InvalidUpdate(metrics.Metric):
+  def test_config(self):
+    fp_obj = metrics.FalsePositives(name='my_fp', thresholds=[0.4, 0.9])
+    self.assertEqual(fp_obj.name, 'my_fp')
+    self.assertEqual(len(fp_obj.variables), 1)
+    self.assertEqual(fp_obj.thresholds, [0.4, 0.9])
 
-      def __init__(self, name='invalid-update', dtype=dtypes.float64):
-        super(InvalidUpdate, self).__init__(name=name, dtype=dtype)
+  def test_unweighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
 
-      def update_state(self, *args, **kwargs):
-        return [1]
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
 
-      def result(self):
-        pass
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    fp_obj = metrics.FalsePositives()
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(14., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fp_obj.result()
+    self.assertAllClose([7., 4., 2.], result)
 
-    invalid_update_obj = InvalidUpdate()
-    with self.assertRaisesRegexp(
-        TypeError,
-        'Metric invalid-update\'s update must be a Tensor or Operation, given:'
-    ):
-      invalid_update_obj.update_state()
+  def test_weighted_with_thresholds(self):
+    fp_obj = metrics.FalsePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((1.0, 2.0, 3.0, 5.0), (7.0, 11.0, 13.0, 17.0),
+                     (19.0, 23.0, 29.0, 31.0), (5.0, 15.0, 10.0, 0))
 
+    result = fp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([125., 42., 12.], self.evaluate(result))
+
+  def test_threshold_limit(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        r'Threshold values must be in \[0, 1\]. Invalid values: \[-1, 2\]'):
+      metrics.FalsePositives(thresholds=[-1, 0.5, 2])
+
+  def test_reset_states(self):
+    fp_obj = metrics.FalsePositives()
+    model = _get_simple_sequential_model([fp_obj])
+    x = np.ones((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fp_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class FalseNegativesTest(test.TestCase):
+
+  def test_config(self):
+    fn_obj = metrics.FalseNegatives(name='my_fn', thresholds=[0.4, 0.9])
+    self.assertEqual(fn_obj.name, 'my_fn')
+    self.assertEqual(len(fn_obj.variables), 1)
+    self.assertEqual(fn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    fn_obj = metrics.FalseNegatives()
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(5., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = fn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = fn_obj.result()
+    self.assertAllClose([1., 4., 6.], result)
+
+  def test_weighted_with_thresholds(self):
+    fn_obj = metrics.FalseNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(fn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((3.0,), (5.0,), (7.0,), (4.0,))
+
+    result = fn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([4., 16., 23.], self.evaluate(result))
+
+  def test_reset_states(self):
+    fn_obj = metrics.FalseNegatives()
+    model = _get_simple_sequential_model([fn_obj])
+    x = np.zeros((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(fn_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TrueNegativesTest(test.TestCase):
+
+  def test_config(self):
+    tn_obj = metrics.TrueNegatives(name='my_tn', thresholds=[0.4, 0.9])
+    self.assertEqual(tn_obj.name, 'my_tn')
+    self.assertEqual(len(tn_obj.variables), 1)
+    self.assertEqual(tn_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose(3., result)
+
+  def test_weighted(self):
+    tn_obj = metrics.TrueNegatives()
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(4., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tn_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tn_obj.result()
+    self.assertAllClose([2., 5., 7.], result)
+
+  def test_weighted_with_thresholds(self):
+    tn_obj = metrics.TrueNegatives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tn_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+    sample_weight = ((0.0, 2.0, 3.0, 5.0),)
+
+    result = tn_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose([5., 15., 23.], self.evaluate(result))
+
+  def test_reset_states(self):
+    tn_obj = metrics.TrueNegatives()
+    model = _get_simple_sequential_model([tn_obj])
+    x = np.zeros((100, 4))
+    y = np.zeros((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tn_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TruePositivesTest(test.TestCase):
+
+  def test_config(self):
+    tp_obj = metrics.TruePositives(name='my_tp', thresholds=[0.4, 0.9])
+    self.assertEqual(tp_obj.name, 'my_tp')
+    self.assertEqual(len(tp_obj.variables), 1)
+    self.assertEqual(tp_obj.thresholds, [0.4, 0.9])
+
+  def test_unweighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose(7., result)
+
+  def test_weighted(self):
+    tp_obj = metrics.TruePositives()
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = tp_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(12., self.evaluate(result))
+
+  def test_unweighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    update_op = tp_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = tp_obj.result()
+    self.assertAllClose([6., 3., 1.], result)
+
+  def test_weighted_with_thresholds(self):
+    tp_obj = metrics.TruePositives(thresholds=[0.15, 0.5, 0.85])
+    self.evaluate(variables.variables_initializer(tp_obj.variables))
+
+    y_pred = constant_op.constant(((0.9, 0.2, 0.8, 0.1), (0.2, 0.9, 0.7, 0.6),
+                                   (0.1, 0.2, 0.4, 0.3), (0, 1, 0.7, 0.3)))
+    y_true = constant_op.constant(((0, 1, 1, 0), (1, 0, 0, 0), (0, 0, 0, 0),
+                                   (1, 1, 1, 1)))
+
+    result = tp_obj(y_true, y_pred, sample_weight=37.)
+    self.assertAllClose([222., 111., 37.], self.evaluate(result))
+
+  def test_reset_states(self):
+    tp_obj = metrics.TruePositives()
+    model = _get_simple_sequential_model([tp_obj])
+    x = np.ones((100, 4))
+    y = np.ones((100, 1))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(tp_obj.accumulator), 100.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class PrecisionTest(test.TestCase):
+
+  def test_config(self):
+    p_obj = metrics.Precision(name='my_precision', thresholds=[0.4, 0.9])
+    self.assertEqual(p_obj.name, 'my_precision')
+    self.assertLen(p_obj.variables, 2)
+    self.assertEqual([v.name for v in p_obj.variables],
+                     ['true_positives:0', 'false_positives:0'])
+    self.assertEqual(p_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    p_obj = metrics.Precision(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = p_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_precision = self.evaluate(p_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_precision, self.evaluate(p_obj.result()),
+                           1e-3)
+
+  def test_unweighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    p_obj = metrics.Precision(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 4.0
+    weighted_positives = (1.0 + 3.0) + (4.0 + 2.0)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertAlmostEqual(expected_precision, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    p_obj = metrics.Precision()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    result = p_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    p_obj = metrics.Precision(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[4, 0], [3, 1]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(p_obj.variables))
+    update_op = p_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_precision = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_precision, 0], self.evaluate(p_obj.result()),
+                         1e-3)
+
+  def test_reset_states(self):
+    p_obj = metrics.Precision()
+    model = _get_simple_sequential_model([p_obj])
+    x = np.concatenate((np.ones((50, 4)), np.ones((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.zeros((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(p_obj.tp), 50.)
+    self.assertEqual(self.evaluate(p_obj.fp), 50.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RecallTest(test.TestCase):
+
+  def test_config(self):
+    r_obj = metrics.Recall(name='my_recall', thresholds=[0.4, 0.9])
+    self.assertEqual(r_obj.name, 'my_recall')
+    self.assertLen(r_obj.variables, 2)
+    self.assertEqual([v.name for v in r_obj.variables],
+                     ['true_positives:0', 'false_negatives:0'])
+    self.assertEqual(r_obj.thresholds, [0.4, 0.9])
+
+  def test_value_is_idempotent(self):
+    r_obj = metrics.Recall(thresholds=[0.3, 0.72])
+    y_pred = random_ops.random_uniform(shape=(10, 3))
+    y_true = random_ops.random_uniform(shape=(10, 3))
+    update_op = r_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_recall = self.evaluate(r_obj.result())
+    for _ in range(10):
+      self.assertArrayNear(initial_recall, self.evaluate(r_obj.result()), 1e-3)
+
+  def test_unweighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([1, 0, 1, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.5, self.evaluate(result))
+
+  def test_unweighted_all_incorrect(self):
+    r_obj = metrics.Recall(thresholds=[0.5])
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs)
+    y_true = constant_op.constant(1 - inputs)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertAlmostEqual(0, self.evaluate(result))
+
+  def test_weighted(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
+    y_true = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(
+        y_true,
+        y_pred,
+        sample_weight=constant_op.constant([[1, 2, 3, 4], [4, 3, 2, 1]]))
+    weighted_tp = 3.0 + 1.0
+    weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
+    expected_recall = weighted_tp / weighted_t
+    self.assertAlmostEqual(expected_recall, self.evaluate(result))
+
+  def test_div_by_zero(self):
+    r_obj = metrics.Recall()
+    y_pred = constant_op.constant([0, 0, 0, 0])
+    y_true = constant_op.constant([0, 0, 0, 0])
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertEqual(0, self.evaluate(result))
+
+  def test_unweighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 0.7])
+    y_pred = constant_op.constant([1, 0, 0.6, 0], shape=(1, 4))
+    y_true = constant_op.constant([0, 1, 1, 0], shape=(1, 4))
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred)
+    self.assertArrayNear([0.5, 0.], self.evaluate(result), 0)
+
+  def test_weighted_with_threshold(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    result = r_obj(y_true, y_pred, sample_weight=weights)
+    weighted_tp = 0 + 3.
+    weighted_positives = (0 + 3.) + (4. + 0.)
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(result), 1e-3)
+
+  def test_multiple_updates(self):
+    r_obj = metrics.Recall(thresholds=[0.5, 1.])
+    y_true = constant_op.constant([[0, 1], [1, 0]], shape=(2, 2))
+    y_pred = constant_op.constant([[1, 0], [0.6, 0]],
+                                  shape=(2, 2),
+                                  dtype=dtypes.float32)
+    weights = constant_op.constant([[1, 4], [3, 2]],
+                                   shape=(2, 2),
+                                   dtype=dtypes.float32)
+    self.evaluate(variables.variables_initializer(r_obj.variables))
+    update_op = r_obj.update_state(y_true, y_pred, sample_weight=weights)
+    for _ in range(2):
+      self.evaluate(update_op)
+
+    weighted_tp = (0 + 3.) + (0 + 3.)
+    weighted_positives = ((0 + 3.) + (4. + 0.)) + ((0 + 3.) + (4. + 0.))
+    expected_recall = weighted_tp / weighted_positives
+    self.assertArrayNear([expected_recall, 0], self.evaluate(r_obj.result()),
+                         1e-3)
+
+  def test_reset_states(self):
+    r_obj = metrics.Recall()
+    model = _get_simple_sequential_model([r_obj])
+    x = np.concatenate((np.ones((50, 4)), np.zeros((50, 4))))
+    y = np.concatenate((np.ones((50, 1)), np.ones((50, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(r_obj.tp), 50.)
+    self.assertEqual(self.evaluate(r_obj.fn), 50.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SensitivityAtSpecificityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SensitivityAtSpecificity(
+        0.4, num_thresholds=100, name='sensitivity_at_specificity_1')
+    self.assertEqual(s_obj.name, 'sensitivity_at_specificity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.value, 0.4)
+    self.assertLen(s_obj.thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_sensitivity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_sensitivity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.8, self.evaluate(result))
+
+  def test_unweighted_low_specificity(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SensitivityAtSpecificity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.675, self.evaluate(result))
+
+  def test_invalid_specificity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`specificity` must be in the range \[0, 1\].'):
+      metrics.SensitivityAtSpecificity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SensitivityAtSpecificity(0.4, num_thresholds=-1)
+
+  def test_reset_states(self):
+    s_obj = metrics.SensitivityAtSpecificity(0.5, num_thresholds=1)
+    model = _get_simple_sequential_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class SpecificityAtSensitivityTest(test.TestCase, parameterized.TestCase):
+
+  def test_config(self):
+    s_obj = metrics.SpecificityAtSensitivity(
+        0.4, num_thresholds=100, name='specificity_at_sensitivity_1')
+    self.assertEqual(s_obj.name, 'specificity_at_sensitivity_1')
+    self.assertLen(s_obj.variables, 4)
+    self.assertEqual(s_obj.value, 0.4)
+    self.assertLen(s_obj.thresholds, 100)
+
+  def test_value_is_idempotent(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    y_pred = random_ops.random_uniform((10, 3),
+                                       maxval=1,
+                                       dtype=dtypes.float32,
+                                       seed=1)
+    y_true = random_ops.random_uniform((10, 3),
+                                       maxval=2,
+                                       dtype=dtypes.int64,
+                                       seed=1)
+    update_op = s_obj.update_state(y_true, y_pred)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+
+    # Run several updates.
+    for _ in range(10):
+      self.evaluate(update_op)
+
+    # Then verify idempotency.
+    initial_specificity = self.evaluate(s_obj.result())
+    for _ in range(10):
+      self.assertAlmostEqual(initial_specificity, self.evaluate(s_obj.result()),
+                             1e-3)
+
+  def test_unweighted_all_correct(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.7)
+    inputs = np.random.randint(0, 2, size=(100, 1))
+    y_pred = constant_op.constant(inputs, dtype=dtypes.float32)
+    y_true = constant_op.constant(inputs)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(1, self.evaluate(result))
+
+  def test_unweighted_high_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.8)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_unweighted_low_sensitivity(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = constant_op.constant(label_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred)
+    self.assertAlmostEqual(0.6, self.evaluate(result))
+
+  @parameterized.parameters([dtypes.bool, dtypes.int32, dtypes.float32])
+  def test_weighted(self, label_dtype):
+    s_obj = metrics.SpecificityAtSensitivity(0.4)
+    pred_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
+    label_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
+    weight_values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+
+    y_pred = constant_op.constant(pred_values, dtype=dtypes.float32)
+    y_true = math_ops.cast(label_values, dtype=label_dtype)
+    weights = constant_op.constant(weight_values)
+    self.evaluate(variables.variables_initializer(s_obj.variables))
+    result = s_obj(y_true, y_pred, sample_weight=weights)
+    self.assertAlmostEqual(0.4, self.evaluate(result))
+
+  def test_invalid_sensitivity(self):
+    with self.assertRaisesRegexp(
+        ValueError, r'`sensitivity` must be in the range \[0, 1\].'):
+      metrics.SpecificityAtSensitivity(-1)
+
+  def test_invalid_num_thresholds(self):
+    with self.assertRaisesRegexp(ValueError, '`num_thresholds` must be > 0.'):
+      metrics.SpecificityAtSensitivity(0.4, num_thresholds=-1)
+
+  def test_reset_states(self):
+    s_obj = metrics.SpecificityAtSensitivity(0.5, num_thresholds=1)
+    model = _get_simple_sequential_model([s_obj])
+    x = np.concatenate((np.ones((25, 4)), np.zeros((25, 4)), np.zeros((25, 4)),
+                        np.ones((25, 4))))
+    y = np.concatenate((np.ones((25, 1)), np.zeros((25, 1)), np.ones((25, 1)),
+                        np.zeros((25, 1))))
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+    model.evaluate(x, y)
+    self.assertEqual(self.evaluate(s_obj.tp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fp), 25.)
+    self.assertEqual(self.evaluate(s_obj.fn), 25.)
+    self.assertEqual(self.evaluate(s_obj.tn), 25.)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class CosineProximityTest(test.TestCase):
+
+  def test_config(self):
+    cosine_obj = metrics.CosineProximity(name='my_cos', dtype=dtypes.int32)
+    self.assertEqual(cosine_obj.name, 'my_cos')
+    self.assertEqual(cosine_obj._dtype, dtypes.int32)
+
+  def test_unweighted(self):
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+
+    update_op = cosine_obj.update_state(y_true, y_pred)
+    self.evaluate(update_op)
+    result = cosine_obj.result()
+    self.assertAllClose(-0.60723, result, atol=1e-5)
+
+  def test_weighted(self):
+    cosine_obj = metrics.CosineProximity()
+    self.evaluate(variables.variables_initializer(cosine_obj.variables))
+    y_true = constant_op.constant(((0, 1, 0, 1, 0), (0, 0, 1, 1, 1),
+                                   (1, 1, 1, 1, 0), (0, 0, 0, 0, 1)))
+    y_pred = constant_op.constant(((0, 0, 1, 1, 0), (1, 1, 1, 1, 1),
+                                   (0, 1, 0, 1, 0), (1, 1, 1, 1, 1)))
+    sample_weight = constant_op.constant((1., 1.5, 2., 2.5))
+    result = cosine_obj(y_true, y_pred, sample_weight=sample_weight)
+    self.assertAllClose(-0.59916, self.evaluate(result), atol=1e-5)
 
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/model_subclassing_test.py b/tensorflow/python/keras/model_subclassing_test.py
index aca058b1111f64ddfdcbc16cab355ca1f33a2a7e..553c7fb00969fd8c1e042b84ffff37bc82981d02 100644
--- a/tensorflow/python/keras/model_subclassing_test.py
+++ b/tensorflow/python/keras/model_subclassing_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.data.ops import dataset_ops
 from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -186,6 +187,7 @@ def get_nested_model_3(input_dim, num_classes):
 
 
 @test_util.run_all_in_graph_and_eager_modes
+@test_util.run_v1_only('b/120545219')
 class ModelSubclassingTest(test.TestCase):
 
   def test_custom_build(self):
@@ -455,12 +457,12 @@ class ModelSubclassingTest(test.TestCase):
       model = SimpleTestModel(num_classes=num_classes, use_dp=True, use_bn=True)
       model.compile(loss='mse', optimizer=RMSPropOptimizer(learning_rate=0.001))
 
-      x = np.ones((num_samples, input_dim))
-      y = np.zeros((num_samples, num_classes))
+      x = np.ones((num_samples, input_dim), dtype=np.float32)
+      y = np.zeros((num_samples, num_classes), dtype=np.float32)
       dataset = dataset_ops.Dataset.from_tensor_slices((x, y))
       dataset = dataset.repeat(100)
       dataset = dataset.batch(10)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
 
       model.fit(iterator, epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(iterator, steps=10, verbose=0)
@@ -725,10 +727,41 @@ class ModelSubclassingTest(test.TestCase):
     _ = model.evaluate(x, y, verbose=0)
 
     self.assertEqual(len(model.weights), 16)
-    self.assertEqual(
-        len(model.non_trainable_weights), 4)
+    self.assertEqual(len(model.non_trainable_weights), 4)
     self.assertEqual(len(model.trainable_weights), 12)
 
+  def test_subclass_nested_in_sequential(self):
+    num_classes = 2
+    num_samples = 100
+    input_dim = 50
+
+    class Inner(keras.Model):
+
+      def __init__(self):
+        super(Inner, self).__init__()
+        self.dense1 = keras.layers.Dense(32, activation='relu')
+        self.dense2 = keras.layers.Dense(num_classes, activation='relu')
+        self.bn = keras.layers.BatchNormalization()
+
+      def call(self, inputs):
+        x = self.dense1(inputs)
+        x = self.dense2(x)
+        return self.bn(x)
+
+    model = keras.Sequential([Inner()])
+    model.compile(loss='mse',
+                  optimizer=RMSPropOptimizer(learning_rate=0.001),
+                  metrics=['acc'])
+
+    x = np.ones((num_samples, input_dim))
+    y = np.zeros((num_samples, num_classes))
+    model.fit(x, y, epochs=2, batch_size=32, verbose=0)
+    _ = model.evaluate(x, y, verbose=0)
+
+    self.assertEqual(len(model.weights), 8)
+    self.assertEqual(len(model.non_trainable_weights), 2)
+    self.assertEqual(len(model.trainable_weights), 6)
+
   def test_support_for_manual_training_arg(self):
     # In most cases, the `training` argument is left unspecified, in which
     # case it defaults to value corresponding to the Model method being used
@@ -819,9 +852,74 @@ class ModelSubclassingTest(test.TestCase):
     self.assertEqual([m.dense.kernel, m.dense.bias, m.not_trainable_var],
                      m.non_trainable_variables)
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_add_weight_in_model(self):
+
+    class MyModel(keras.Model):
 
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+    class MyModelCustomBuild(keras.Model):
+
+      def build(self, input_shape):
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,), trainable=False)
+
+      def call(self, inputs):
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModelCustomBuild()
+    model(x)
+    self.assertEqual(1, len(model.trainable_weights))
+    self.assertEqual(1, len(model.non_trainable_weights))
+    self.assertEqual(2, len(model.weights))
+
+  def test_add_update_in_model(self):
+
+    class MyModel(keras.Model):
+
+      def __init__(self):
+        super(MyModel, self).__init__()
+        self.b = self.add_weight('bias', (10,))
+        self.c = self.add_weight('bias2', (10,))
+
+      def call(self, inputs):
+        # Unconditional
+        self.add_update(self.b.assign(self.b * 2))
+        # Conditional
+        self.add_update(self.c.assign(inputs[1, :]), inputs)
+        return inputs + self.b + self.c
+
+    x = ops.convert_to_tensor(np.ones((10, 10), 'float32'))
+    model = MyModel()
+    model(x)
+
+    if context.executing_eagerly():
+      self.assertEqual(0, len(model.updates))
+    else:
+      self.assertEqual(2, len(model.updates))
+      self.assertEqual(1, len(model.get_updates_for(None)))
+      self.assertEqual(1, len(model.get_updates_for(x)))
+
+
+@test_util.run_v1_only('b/120545219')
 class GraphSpecificModelSubclassingTests(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_single_io_workflow_with_tensors(self):
     num_classes = 2
     num_samples = 10
@@ -839,6 +937,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model.fit(x, y, epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(steps=10, verbose=0)
 
+  @test_util.run_deprecated_v1
   def test_multi_io_workflow_with_tensors(self):
     num_classes = (2, 3)
     num_samples = 10
@@ -858,6 +957,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       model.fit([x1, x2], [y1, y2], epochs=2, steps_per_epoch=10, verbose=0)
       _ = model.evaluate(steps=10, verbose=0)
 
+  @test_util.run_deprecated_v1
   def test_updates_and_losses_for_nested_models_in_subclassed_model(self):
 
     # Case 1: deferred-build sequential nested in subclass.
@@ -925,6 +1025,7 @@ class GraphSpecificModelSubclassingTests(test.TestCase):
       self.assertEqual(len(model.get_updates_for(x)), 2)
       self.assertEqual(len(model.get_losses_for(x)), 1)
 
+  @test_util.run_deprecated_v1
   def test_multi_io_workflow_with_numpy_arrays_and_custom_placeholders(self):
     num_classes = (2, 3)
     num_samples = 1000
@@ -974,6 +1075,16 @@ class TrainingNoDefaultModel(keras.Model):
     return self.dense1(x)
 
 
+class TrainingMaskingModel(keras.Model):
+
+  def __init__(self):
+    super(TrainingMaskingModel, self).__init__()
+    self.dense1 = keras.layers.Dense(1)
+
+  def call(self, x, training=False, mask=None):
+    return self.dense1(x)
+
+
 class CustomCallSignatureTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -1003,6 +1114,19 @@ class CustomCallSignatureTests(test.TestCase):
                                     'has been properly built.'))
     self.assertTrue(model.built, 'Model should be built after calling `build`.')
 
+  @test_util.run_in_graph_and_eager_modes
+  def test_training_and_mask_args_call_build(self):
+    input_dim = 2
+
+    model = TrainingMaskingModel()
+    self.assertFalse(model.built, 'Model should not have been built')
+    self.assertFalse(model.weights, ('Model should have no weights since it '
+                                     'has not been built.'))
+    model.build((None, input_dim))
+    self.assertTrue(model.weights, ('Model should have weights now that it '
+                                    'has been properly built.'))
+    self.assertTrue(model.built, 'Model should be built after calling `build`.')
+
   @test_util.run_in_graph_and_eager_modes
   def test_custom_call_kwargs_and_build(self):
     first_input_shape = (2, 3)
diff --git a/tensorflow/python/keras/models.py b/tensorflow/python/keras/models.py
index 225c6c6af8e7c52c8a5ff1976df0ce55bfd80dc2..2637191bb75b357341376a703b2620243bd925bf 100644
--- a/tensorflow/python/keras/models.py
+++ b/tensorflow/python/keras/models.py
@@ -100,17 +100,19 @@ def _clone_functional_model(model, input_tensors=None):
       input_tensors = list(input_tensors)
     input_tensors = generic_utils.to_list(input_tensors)
     input_tensors_ = []
-    for i, x in enumerate(input_tensors):
-      if not K.is_keras_tensor(x):
-        name = model._input_layers[i].name
-        input_tensor = Input(tensor=x, name='input_wrapper_for_' + name)
+    for i in range(len(input_tensors)):
+      input_tensor = input_tensors[i]
+      if not K.is_keras_tensor(input_tensor):
+        original_input_layer = model._input_layers[i]
+        name = original_input_layer.name
+        input_tensor = Input(tensor=input_tensor,
+                             name='input_wrapper_for_' + name)
         input_tensors_.append(input_tensor)
         # Cache newly created input layer.
-        original_input_layer = x._keras_history[0]
         newly_created_input_layer = input_tensor._keras_history[0]
         layer_map[original_input_layer] = newly_created_input_layer
       else:
-        input_tensors_.append(x)
+        input_tensors_.append(input_tensor)
     input_tensors = input_tensors_
 
   for x, y in zip(model.inputs, input_tensors):
@@ -209,14 +211,17 @@ def _clone_sequential_model(model, input_tensors=None):
   # Use model._layers to ensure that all layers are cloned. The model's layers
   # property will exclude the initial InputLayer (if it exists) in the model,
   # resulting in a different Sequential model structure.
-  layers = [clone(layer) for layer in model._layers]
   if input_tensors is None:
+    layers = [clone(layer) for layer in model._layers]
     return Sequential(layers=layers, name=model.name)
   else:
     # If input tensors are provided, the original model's InputLayer is
     # overwritten with a different InputLayer.
-    if isinstance(layers[0], InputLayer):
-      layers = layers[1:]
+    layers = [
+        clone(layer)
+        for layer in model._layers
+        if not isinstance(layer, InputLayer)
+    ]
     if len(generic_utils.to_list(input_tensors)) != 1:
       raise ValueError('To clone a `Sequential` model, we expect '
                        ' at most one tensor '
@@ -304,8 +309,9 @@ def _in_place_subclassed_model_reset(model):
       attributes_cache[name] = value
       assert value in model._layers
     elif isinstance(
-        value, (list, tuple)) and name not in ('layers', '_layers',
-                                               'stateful_metric_functions'):
+        value,
+        (list, tuple)) and name not in ('layers', '_layers', 'metrics',
+                                        '_compile_stateful_metric_functions'):
       # Handle case: list/tuple of layers (also tracked by the Network API).
       if value and all(isinstance(val, Layer) for val in value):
         raise ValueError('We do not support the use of list-of-layers '
@@ -345,9 +351,6 @@ def _in_place_subclassed_model_reset(model):
           'targets',
           '_feed_targets',
           'sample_weight_modes',
-          'weighted_metrics',
-          'metrics_names',
-          'metrics_tensors',
           'total_loss',
           'sample_weights',
           '_feed_sample_weights',
@@ -495,10 +498,11 @@ def clone_and_build_model(
     clone.compile(
         optimizer,
         model.loss,
-        metrics=metrics_module.clone_metrics(model.metrics),
+        metrics=metrics_module.clone_metrics(model._compile_metrics),
         loss_weights=model.loss_weights,
         sample_weight_mode=model.sample_weight_mode,
-        weighted_metrics=metrics_module.clone_metrics(model.weighted_metrics),
+        weighted_metrics=metrics_module.clone_metrics(
+            model._compile_weighted_metrics),
         target_tensors=target_tensors)
 
   return clone
diff --git a/tensorflow/python/keras/models_test.py b/tensorflow/python/keras/models_test.py
index bf778f14971587d22d9d6724d4d10fb486674836..c466d94fed8f34e0ca9e25425f88d6028c806131 100644
--- a/tensorflow/python/keras/models_test.py
+++ b/tensorflow/python/keras/models_test.py
@@ -26,10 +26,12 @@ import numpy as np
 from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras import backend as K
 from tensorflow.python.keras import metrics
 from tensorflow.python.keras import models
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.platform import test
@@ -67,6 +69,7 @@ def sequential_model(add_input_layer, include_input_shape=True):
 
 class TestModelCloning(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_clone_sequential_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -81,25 +84,27 @@ class TestModelCloning(test.TestCase):
       # With placeholder creation
       new_model = keras.models.clone_model(model)
       # update ops from batch norm needs to be included
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new tensor
       input_a = keras.Input(shape=(4,))
       new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(val_a, val_out)
 
       # On top of new, non-Keras tensor
       input_a = keras.backend.variable(val_a)
       new_model = keras.models.clone_model(model, input_tensors=input_a)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
+  @test_util.run_v1_only('b/120545219')
   def test_clone_sequential_model_input_layer(self):
+
     def test_input_layer(include_inputs):
       with self.cached_session():
         val_a = np.random.random((10, 4))
@@ -136,6 +141,7 @@ class TestModelCloning(test.TestCase):
     test_input_layer(True)
     test_input_layer(False)
 
+  @test_util.run_v1_only('b/120545219')
   def test_clone_functional_model(self):
     with self.cached_session():
       val_a = np.random.random((10, 4))
@@ -161,7 +167,7 @@ class TestModelCloning(test.TestCase):
     with self.cached_session():
       # With placeholder creation
       new_model = keras.models.clone_model(model)
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -170,7 +176,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.Input(shape=(4,), name='b')
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch([val_a, val_b], val_out)
 
@@ -179,7 +185,7 @@ class TestModelCloning(test.TestCase):
       input_b = keras.backend.variable(val_b)
       new_model = keras.models.clone_model(
           model, input_tensors=[input_a, input_b])
-      self.assertEquals(len(new_model.get_updates_for(new_model.inputs)), 2)
+      self.assertEqual(len(new_model.get_updates_for(new_model.inputs)), 2)
       new_model.compile('rmsprop', 'mse')
       new_model.train_on_batch(None, val_out)
 
@@ -219,6 +225,34 @@ class TestModelCloning(test.TestCase):
     with self.assertRaises(ValueError):
       keras.models._clone_sequential_model(seq_model, input_tensors=y)
 
+  def test_functional_cloning_does_not_create_unnecessary_placeholders(self):
+    with ops.Graph().as_default():
+      x = keras.Input((4,))
+      y = keras.layers.Dense(4)(x)
+      model = keras.models.Model(x, y)
+    graph = ops.Graph()
+    with graph.as_default():
+      x = array_ops.ones((10, 4))
+      _ = keras.models.clone_model(model, input_tensors=[x])
+      has_placeholder = _has_placeholder(graph)
+      self.assertFalse(has_placeholder)
+
+  def test_sequential_cloning_does_not_create_unnecessary_placeholders(self):
+    with ops.Graph().as_default():
+      model = keras.models.Sequential()
+      model.add(keras.layers.Dense(4, input_shape=(4,)))
+    graph = ops.Graph()
+    with graph.as_default():
+      x = array_ops.ones((10, 4))
+      _ = keras.models.clone_model(model, input_tensors=[x])
+      has_placeholder = _has_placeholder(graph)
+      self.assertFalse(has_placeholder)
+
+
+def _has_placeholder(graph):
+  ops_types = [op.type for op in graph.get_operations()]
+  return any('Placeholder' in s for s in ops_types)
+
 
 class CheckpointingTests(test.TestCase):
 
@@ -283,6 +317,7 @@ class TestModelDeepCopy(test.TestCase):
                       model_copy.get_weights()[0]))
 
 
+@test_util.run_v1_only('b/120545219')
 class TestCloneAndBuildModel(test.TestCase):
 
   def test_clone_and_build_non_compiled_model(self):
@@ -330,8 +365,11 @@ class TestCloneAndBuildModel(test.TestCase):
 
     self.assertEqual('mse', model.loss)
     self.assertTrue(
-        isinstance(model.optimizer, keras.optimizers.RMSprop))
-    self.assertEqual(['acc', metrics.categorical_accuracy], model.metrics)
+        isinstance(model.optimizer,
+                   (keras.optimizers.RMSprop,
+                    keras.optimizer_v2.rmsprop.RMSprop)))
+    self.assertEqual(['acc', metrics.categorical_accuracy],
+                     model._compile_metrics)
 
   def _clone_and_build_test_helper(self, model, is_subclassed=False):
     inp = np.random.random((10, 4))
diff --git a/tensorflow/python/keras/optimizer_v2/BUILD b/tensorflow/python/keras/optimizer_v2/BUILD
index fb43775fdc491695c20ea34e07afc45d16d7995b..b8f01249419c595a735442310c735bc10648cba6 100644
--- a/tensorflow/python/keras/optimizer_v2/BUILD
+++ b/tensorflow/python/keras/optimizer_v2/BUILD
@@ -7,14 +7,21 @@ licenses(["notice"])  # Apache 2.0
 
 exports_files(["LICENSE"])
 
+load("//tensorflow:tensorflow.bzl", "py_test")
 load("//tensorflow:tensorflow.bzl", "cuda_py_test")
 
 py_library(
     name = "optimizer_v2",
     srcs = [
+        "adadelta.py",
+        "adagrad.py",
         "adam.py",
+        "adamax.py",
+        "ftrl.py",
         "gradient_descent.py",
+        "nadam.py",
         "optimizer_v2.py",
+        "rmsprop.py",
     ],
     srcs_version = "PY2AND3",
     deps = [
@@ -24,12 +31,107 @@ py_library(
         "//tensorflow/python:math_ops",
         "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:state_ops",
-        "//tensorflow/python:training",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python/distribute:reduce_util",
     ],
 )
 
+cuda_py_test(
+    name = "adagrad_test",
+    size = "medium",
+    srcs = ["adagrad_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adam_test",
+    size = "medium",
+    srcs = ["adam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adamax_test",
+    size = "medium",
+    srcs = ["adamax_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "adadelta_test",
+    size = "medium",
+    srcs = ["adadelta_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+cuda_py_test(
+    name = "ftrl_test",
+    size = "medium",
+    srcs = ["ftrl_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
 cuda_py_test(
     name = "gradient_descent_test",
     size = "medium",
@@ -50,9 +152,53 @@ cuda_py_test(
 )
 
 cuda_py_test(
-    name = "optimizer_v2_test",
+    name = "nadam_test",
     size = "medium",
+    srcs = ["nadam_test.py"],
+    additional_deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:embedding_ops",
+        "//tensorflow/python:platform_test",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:resources",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:context",
+    ],
+    shard_count = 4,
+)
+
+py_test(
+    name = "optimizer_v2_test",
+    size = "large",
     srcs = ["optimizer_v2_test.py"],
+    shard_count = 4,
+    tags = [
+        "no_windows",
+    ],
+    deps = [
+        ":optimizer_v2",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:gradients",
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:state_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/keras",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+cuda_py_test(
+    name = "rmsprop_test",
+    size = "medium",
+    srcs = ["rmsprop_test.py"],
     additional_deps = [
         ":optimizer_v2",
         "//tensorflow/python/eager:def_function",
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta.py b/tensorflow/python/keras/optimizer_v2/adadelta.py
new file mode 100644
index 0000000000000000000000000000000000000000..55b4eba1051287420b8ab1adeea1598eb4647c36
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta.py
@@ -0,0 +1,148 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adadelta for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class Adadelta(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adadelta algorithm.
+
+  Adadelta optimization is a stochastic gradient descent method that is based on
+  adaptive learning rate per dimension to address two drawbacks:
+    1) the continual decay of learning rates throughout training
+    2) the need for a manually selected global learning rate
+
+  Two accumulation steps are required:
+    1) the accumulation of gradients squared,
+    2) the accumulation of updates squared.
+
+  Initialization:
+
+  $$accum_g_0 := 0 \text{(Initialize gradient 2nd order moment vector)}$$
+  $$accum_x_0 := 0 \text{(Initialize variable update 2nd order moment vector)}$$
+
+  $$t := t + 1$$
+  $$accum_g_t := rho * accum_g_{t-1} + (1 - rho) * g * g$$
+  $$delta = -\sqrt{accum_x_{t-1}} / (\sqrt{accum_g_{t-1}} + \epsilon)$$
+  $$accum_x_t := rho * accum_x_{t-1} + (1 - rho) * delta * delta$$
+
+  References
+    See [M. D. Zeiler](http://arxiv.org/abs/1212.5701)
+      ([pdf](http://arxiv.org/pdf/1212.5701v1.pdf))
+
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.95,
+               epsilon=1e-7,
+               name='Adadelta',
+               **kwargs):
+    """Construct a new Adadelta optimizer.
+
+    Adadelta is a more robust extension of Adagrad that adapts learning rates
+    based on a moving window of gradient updates, instead of accumulating all
+    past gradients. This way, Adadelta continues learning even when many updates
+    have been done. Compared to Adagrad, in the original version of Adadelta you
+    don't have to set an initial learning rate. In this version, initial
+    learning rate can be set, as in most other Keras optimizers.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value. The learning rate.
+        To match the exact form in the original paper use 1.0.
+      rho: A `Tensor` or a floating point value. The decay rate.
+      epsilon: A `Tensor` or a floating point value.  A constant epsilon used
+               to better conditioning the grad update.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adadelta".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate`, `rho`, and `epsilon` can
+    each be a callable that takes no arguments and returns the actual value to
+    use. This can be useful for changing these values across different
+    invocations of optimizer functions.
+    @end_compatibility
+    """
+    super(Adadelta, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('rho', rho)
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    # Separate for-loops to respect the ordering of slot variables from v1.
+    for v in var_list:
+      self.add_slot(v, 'accum_grad')
+    for v in var_list:
+      self.add_slot(v, 'accum_var')
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adadelta, self).set_weights(weights)
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        lr_t,
+        self._get_hyper('rho', var_dtype),
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    accum_grad = self.get_slot(var, 'accum_grad')
+    accum_var = self.get_slot(var, 'accum_var')
+    return training_ops.resource_sparse_apply_adadelta(
+        var.handle,
+        accum_grad.handle,
+        accum_var.handle,
+        lr_t,
+        self._get_hyper('rho', var_dtype),
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        indices,
+        use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Adadelta, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'rho': self._serialize_hyperparameter('rho'),
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adadelta_test.py b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..0fb67d0cd1675fa0d02db7b78f6d90d86b64888f
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adadelta_test.py
@@ -0,0 +1,170 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adadelta Optimizer."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adadelta
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+class AdadeltaOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_resource=False, use_callable_params=False):
+    num_updates = 4  # number of ADADELTA steps to perform
+    for dtype in [dtypes.half, dtypes.float32]:
+      for grad in [0.2, 0.1, 0.01]:
+        for lr in [1.0, 0.5, 0.1]:
+          var0_init = [1.0, 2.0]
+          var1_init = [3.0, 4.0]
+          if use_resource:
+            var0 = resource_variable_ops.ResourceVariable(
+                var0_init, dtype=dtype)
+            var1 = resource_variable_ops.ResourceVariable(
+                var1_init, dtype=dtype)
+          else:
+            var0 = variables.Variable(var0_init, dtype=dtype)
+            var1 = variables.Variable(var1_init, dtype=dtype)
+
+          grads = constant_op.constant([grad, grad], dtype=dtype)
+
+          accum = 0.0
+          accum_update = 0.0
+
+          # ADADELTA gradient optimizer
+          rho = 0.95
+          epsilon = 1e-8
+          if use_callable_params:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lambda: lr,  # pylint: disable=cell-var-from-loop
+                rho=lambda: rho,  # pylint: disable=cell-var-from-loop
+                epsilon=lambda: epsilon)  # pylint: disable=cell-var-from-loop
+          else:
+            adadelta_opt = adadelta.Adadelta(
+                learning_rate=lr, rho=rho, epsilon=epsilon)
+          if not context.executing_eagerly():
+            adadelta_update = adadelta_opt.apply_gradients(
+                zip([grads, grads], [var0, var1]))
+            self.evaluate(variables.global_variables_initializer())
+
+            # Assign slots
+            slot = [None] * 2
+            slot_update = [None] * 2
+            slot[0] = adadelta_opt.get_slot(var0, "accum_grad")
+            self.assertEqual(slot[0].get_shape(), var0.get_shape())
+
+            slot_update[0] = adadelta_opt.get_slot(var0, "accum_var")
+            self.assertEqual(slot_update[0].get_shape(), var0.get_shape())
+
+            slot[1] = adadelta_opt.get_slot(var1, "accum_grad")
+            self.assertEqual(slot[1].get_shape(), var1.get_shape())
+
+            slot_update[1] = adadelta_opt.get_slot(var1, "accum_var")
+            self.assertEqual(slot_update[1].get_shape(), var1.get_shape())
+
+          # Fetch params to validate initial values
+          self.assertAllClose(var0_init, self.evaluate(var0))
+          self.assertAllClose(var1_init, self.evaluate(var1))
+
+          update = [None] * num_updates
+          tot_update = 0
+          for step in range(num_updates):
+            # Run adadelta update for comparison
+            if not context.executing_eagerly():
+              self.evaluate(adadelta_update)
+            else:
+              adadelta_opt.apply_gradients(zip([grads, grads], [var0, var1]))
+
+            # Perform initial update without previous accum values
+            accum = accum * rho + (grad**2) * (1 - rho)
+            update[step] = (
+                np.sqrt(accum_update + epsilon) *
+                (1. / np.sqrt(accum + epsilon)) * grad)
+            accum_update = (
+                accum_update * rho + (update[step]**2) * (1.0 - rho))
+            tot_update += update[step] * lr
+
+            if not context.executing_eagerly():
+              # Check that the accumulators have been updated
+              # TODO(lxuechen): This is hard to test in eager mode
+              for slot_idx in range(2):
+                self.assertAllCloseAccordingToType(
+                    np.array([accum, accum], dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot[slot_idx]),
+                    rtol=1e-5)
+
+                self.assertAllCloseAccordingToType(
+                    np.array(
+                        [accum_update, accum_update],
+                        dtype=dtype.as_numpy_dtype()),
+                    self.evaluate(slot_update[slot_idx]),
+                    rtol=1e-5)
+
+              # Check that the parameters have been updated
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var0_init[0] - tot_update, var0_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var0),
+                  rtol=1e-5)
+
+              self.assertAllCloseAccordingToType(
+                  np.array(
+                      [var1_init[0] - tot_update, var1_init[1] - tot_update],
+                      dtype=dtype.as_numpy_dtype()),
+                  self.evaluate(var1),
+                  rtol=1e-5)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_resource=True, use_callable_params=True)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adadelta.Adadelta(1.0, 1.0, 1.0).minimize(
+            loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad.py b/tensorflow/python/keras/optimizer_v2/adagrad.py
new file mode 100644
index 0000000000000000000000000000000000000000..670cad70e63354650aeb47ed2324e2c1756e12c1
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad.py
@@ -0,0 +1,171 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adagrad for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+
+
+class Adagrad(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the Adagrad algorithm.
+
+  Adagrad is an optimizer with parameter-specific learning rates,
+  which are adapted relative to how frequently a parameter gets
+  updated during training. The more updates a parameter receives,
+  the smaller the updates.
+
+  Initialization:
+
+  $$accum_g_0 := initial_accumulator_value$$
+
+  $$t := t + 1$$
+  $$accum_g_t := accum_g_{t-1} + g * g$$
+  $$theta_t := theta_{t-1} - lr * g / (\sqrt{accum_g_t} + \epsilon)$$
+
+  References
+    See [paper]
+      (http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+    or this
+      [intro](https://ppasupat.github.io/a9online/uploads/proximal_notes.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               initial_accumulator_value=0.1,
+               epsilon=1e-7,
+               name='Adagrad',
+               **kwargs):
+    """Construct a new Adagrad optimizer.
+
+    Args:
+      learning_rate: A `Tensor` or a floating point value.  The learning rate.
+      initial_accumulator_value: A floating point value.
+        Starting value for the accumulators, must be positive.
+      epsilon: A floating point value.
+        Starting value for the accumulators, must be positive.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Adagrad".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    Raises:
+      ValueError: If the `initial_accumulator_value` or `epsilon` is invalid.
+
+    @compatibility(eager)
+    When eager execution is enabled, `learning_rate` can be a callable that
+    takes no arguments and returns the actual value to use. This can be useful
+    for changing these values across different invocations of optimizer
+    functions.
+    @end_compatibility
+    """
+    if initial_accumulator_value < 0.0:
+      raise ValueError('initial_accumulator_value must be non-negative: %s' %
+                       initial_accumulator_value)
+    if epsilon < 1e-7:
+      raise ValueError('epsilon must be larger than 1e-7: %s' % epsilon)
+    super(Adagrad, self).__init__(name, **kwargs)
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._set_hyper('epsilon', epsilon)
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+
+  def set_weights(self, weights):
+    params = self.weights
+    # Override set_weights for backward compatibility of Keras V1 optimizer
+    # since it does not include iteration at head of the weight list. Set
+    # iteration to 0.
+    if len(params) == len(weights) + 1:
+      weights = [np.array(0)] + weights
+    super(Adagrad, self).set_weights(weights)
+
+  @classmethod
+  def from_config(cls, config, custom_objects=None):
+    """Creates an optimizer from its config.
+
+    This method is the reverse of `get_config`,
+    capable of instantiating the same optimizer from the config
+    dictionary.
+
+    Arguments:
+        config: A Python dictionary, typically the output of get_config.
+        custom_objects: A Python dictionary mapping names to additional Python
+          objects used to create this optimizer, such as a function used for a
+          hyperparameter.
+
+    Returns:
+        An optimizer instance.
+    """
+    if 'initial_accumulator_value' not in config:
+      config['initial_accumulator_value'] = 0.
+    if 'lr' in config:
+      config['learning_rate'] = config.pop('lr')
+    return cls(**config)
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = state_ops.assign_add(
+        acc, math_ops.square(grad), use_locking=self._use_locking)
+    var_update = state_ops.assign_sub(
+        var, lr_t * grad / (math_ops.sqrt(acc_t) + epsilon))
+    return var_update
+
+  def _resource_apply_sparse(self, grad, var, indices):
+
+    def _resource_scatter_add(x, i, v):
+      with ops.control_dependencies(
+          [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+        return x.value()
+
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    acc = self.get_slot(var, 'accumulator')
+
+    acc_t = _resource_scatter_add(acc, indices, math_ops.square(grad))
+    acc_t_slice = array_ops.gather(acc_t, indices)
+    var_update = _resource_scatter_add(
+        var, indices, -lr_t * grad / (math_ops.sqrt(acc_t_slice) + epsilon))
+    return var_update
+
+  def get_config(self):
+    config = super(Adagrad, self).get_config()
+    config.update({
+        'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
+        'initial_accumulator_value': self._initial_accumulator_value,
+        'epsilon': self._serialize_hyperparameter('epsilon'),
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/adagrad_test.py b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2c290178fe8a62d1c7240df1d6c04f7b62456e1
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adagrad_test.py
@@ -0,0 +1,400 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for aggregate operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adagrad
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adagrad_update_numpy(param, accum, g_t, lr=0.001, epsilon=1e-7):
+  accum_t = accum + g_t * g_t
+  param_t = param - lr * g_t / (np.sqrt(accum_t) + epsilon)
+  return param_t, accum_t
+
+
+def sparse_adagrad_update_numpy(param,
+                                accum,
+                                gindexs,
+                                gvalues,
+                                lr=0.001,
+                                epsilon=1e-7):
+  accum_t = copy.deepcopy(accum)
+  param_t = copy.deepcopy(param)
+  # first loop accumulates repeated indices if necessary.
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    accum_t[gindex] = accum_t[gindex] + gvalue * gvalue
+  for i in range(len(gindexs)):
+    gindex = gindexs[i]
+    gvalue = gvalues[i]
+    param_t[gindex] = param_t[gindex] - lr * gvalue / (
+        np.sqrt(accum_t[gindex]) + epsilon)
+  return param_t, accum_t
+
+
+class AdagradOptimizerTest(test.TestCase):
+
+  def doTestBasic(self, use_callable_params=False):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 3.0
+        if not use_callable_params:
+          learning_rate = learning_rate()
+
+        ada_opt = adagrad.Adagrad(learning_rate)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, 3.0)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, 3.0)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        decay = 0.5
+
+        ada_opt = adagrad.Adagrad(learning_rate, decay=decay)
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        if not context.executing_eagerly():
+          ada_update = ada_opt.apply_gradients(
+              zip([grads0, grads1], [var0, var1]))
+          self.evaluate(variables.global_variables_initializer())
+
+        # Fetch params to validate initial values
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([1.0, 2.0], v0_val)
+        self.assertAllClose([3.0, 4.0], v1_val)
+
+        # Run 3 steps of adagrad
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(ada_update)
+          else:
+            ada_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+          lr_np = learning_rate / (1 + decay * t)
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, lr_np)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, lr_np)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable(
+            [[1.0, 2.0], [3.0, 4.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = adagrad.Adagrad(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType(
+            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = constant_op.constant(3.0)
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        # Run 3 steps of adagrad
+        for _ in range(3):
+          ada_update.run()
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseBasic(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        ada_update = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        accum0_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1, 0.1], dtype=dtype.as_numpy_dtype)
+
+        # Run 3 step of sgd
+        for _ in range(3):
+          ada_update.run()
+
+          var0_np, accum0_np = sparse_adagrad_update_numpy(
+              var0_np, accum0_np, grads0_np_indices,
+              grads0_np[grads0_np_indices], learning_rate)
+          var1_np, accum1_np = sparse_adagrad_update_numpy(
+              var1_np, accum1_np, grads1_np_indices,
+              grads1_np[grads1_np_indices], learning_rate)
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+
+        repeated_index_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        aggregated_update_var = resource_variable_ops.ResourceVariable(
+            var_np, dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adagrad.Adagrad(3.0).apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndicesByEmbeddingLookUp(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var_repeated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_repeated = math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_repeated, [0, 0]))
+        var_aggregated = resource_variable_ops.ResourceVariable(
+            [1.0, 2.0], dtype=dtype)
+        loss_aggregated = 2 * math_ops.reduce_sum(
+            embedding_ops.embedding_lookup(var_aggregated, [0]))
+        update_op_repeated = adagrad.Adagrad(2.0).minimize(
+            loss_repeated, var_list=[var_repeated])
+        update_op_aggregated = adagrad.Adagrad(2.0).minimize(
+            loss_aggregated, var_list=[var_aggregated])
+        variables.global_variables_initializer().run()
+        self.assertAllCloseAccordingToType(
+            var_repeated.eval(), var_aggregated.eval())
+        for _ in range(3):
+          update_op_repeated.run()
+          update_op_aggregated.run()
+          self.assertAllCloseAccordingToType(
+              var_repeated.eval(), var_aggregated.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparseStability(self):
+    for dtype in [dtypes.half]:
+      with self.cached_session():
+        shape = [1, 6]
+        var0_np = np.array([[
+            0.00872496, -0.106952, 0.110467, 0.226505, -0.0147257, -0.0105945
+        ]],
+                           dtype=dtype.as_numpy_dtype)
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        grads0_np = np.array([[
+            -5.91278e-05, 5.31673e-05, -2.5779e-06, 4.29153e-05, -8.4877e-05,
+            -9.48906e-05
+        ]],
+                             dtype=dtype.as_numpy_dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np), constant_op.constant([0]),
+            constant_op.constant(shape))
+        ada_opt = adagrad.Adagrad(1.0)
+        ada_update = ada_opt.apply_gradients(zip([grads0], [var0]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        init = variables.global_variables_initializer()
+        for _ in range(100):
+          init.run()
+          ada_update.run()
+          self.assertAllCloseAccordingToType(
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+          self.assertAllCloseAccordingToType(
+              np.array([[
+                  0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
+                  -0.01029443
+              ]]), var0.eval())
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 3.0
+        ada_opt = adagrad.Adagrad(learning_rate)
+        # Apply the optimizer twice.  Both applications will use
+        # the same accums.
+        ada_update1 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        ada_update2 = ada_opt.apply_gradients(
+            zip([grads0, grads1], [var0, var1]))
+        slot0 = ada_opt.get_slot(var0, "accumulator")
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
+        slot1 = ada_opt.get_slot(var1, "accumulator")
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values.
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+        # Mix the first and the second adagrad for 3 steps.
+        ada_update1.run()
+        ada_update2.run()
+        ada_update1.run()
+
+        accum0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        accum1_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        for _ in range(3):
+          var0_np, accum0_np = adagrad_update_numpy(var0_np, accum0_np,
+                                                    grads0_np, learning_rate)
+          var1_np, accum1_np = adagrad_update_numpy(var1_np, accum1_np,
+                                                    grads1_np, learning_rate)
+        self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+        self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adam.py b/tensorflow/python/keras/optimizer_v2/adam.py
index b05811c419fa8ee435e635feb66408a4cd2ab06a..ef3d783f8910e791cf8591e0604935102c2b52cf 100644
--- a/tensorflow/python/keras/optimizer_v2/adam.py
+++ b/tensorflow/python/keras/optimizer_v2/adam.py
@@ -1,4 +1,4 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -17,8 +17,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
 from tensorflow.python.training import training_ops
 
 
@@ -31,36 +35,62 @@ class Adam(optimizer_v2.OptimizerV2):
   requirement, invariant to diagonal rescaling of gradients, and is well suited
   for problems that are large in terms of data/parameters'.
 
+  Note, amsgrad is currently not supported and the argument can only be False.
+
   # References
       See [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
         ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+      For AMSGrad see [Reddi et al., 2-18]
+        (https://openreview.net/pdf?id=ryQu7f-RZ)
   """
 
   def __init__(self,
                learning_rate=0.001,
                beta_1=0.9,
                beta_2=0.999,
-               epsilon=1e-8,
-               name='Adam'):
+               epsilon=1e-7,
+               amsgrad=False,
+               name='Adam',
+               **kwargs):
     r"""Construct a new Adam optimizer.
 
-    Initialization:
+    If amsgrad = False:
+      Initialization:
+
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
+
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section2 of the paper:
+
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
 
-    $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
-    $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
-    $$t := 0 \text{(Initialize timestep)}$$
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
 
-    The update rule for `variable` with gradient `g` uses an optimization
-    described at the end of section2 of the paper:
+    If amsgrad = True:
+      Initialization:
 
-    $$t := t + 1$$
-    $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+      $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+      $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$v_hat_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+      $$t := 0 \text{(Initialize timestep)}$$
 
-    $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
-    $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
-    $$variable := variable - lr_t * m_t / (\sqrt{v_t} + \epsilon)$$
+      The update rule for `variable` with gradient `g` uses an optimization
+      described at the end of section2 of the paper:
 
-    The default value of 1e-8 for epsilon might not be a good default in
+      $$t := t + 1$$
+      $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+
+      $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+      $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+      $$v_hat_t := max(v_hat_{t-1}, v_t)
+      $$variable := variable - lr_t * m_t / (\sqrt{v_hat_t} + \epsilon)$$
+
+    The default value of 1e-7 for epsilon might not be a good default in
     general. For example, when training an Inception network on ImageNet a
     current good choice is 1.0 or 0.1. Note that since AdamOptimizer uses the
     formulation just before Section 2.1 of the Kingma and Ba paper rather than
@@ -85,50 +115,142 @@ class Adam(optimizer_v2.OptimizerV2):
       epsilon: A small constant for numerical stability. This epsilon is
         "epsilon hat" in the Kingma and Ba paper (in the formula just before
         Section 2.1), not the epsilon in Algorithm 1 of the paper.
+      amsgrad: boolean. Whether to apply AMSGrad variant of this algorithm from
+        the paper "On the Convergence of Adam and beyond".
       name: Optional name for the operations created when applying gradients.
         Defaults to "Adam".  @compatibility(eager) When eager execution is
         enabled, `learning_rate`, `beta_1`, `beta_2`, and `epsilon` can each be
         a callable that takes no arguments and returns the actual value to use.
         This can be useful for changing these values across different
         invocations of optimizer functions. @end_compatibility
+      **kwargs: keyword arguments. Allowed to be {`decay`}
     """
 
-    super(Adam, self).__init__(name)
+    super(Adam, self).__init__(name, **kwargs)
     self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self._set_hyper('epsilon', epsilon)
+    self._amsgrad = amsgrad
 
   def _create_slots(self, var_list):
     # Create slots for the first and second moments.
+    # Separate for-loops to respect the ordering of slot variables from v1.
     for var in var_list:
       self.add_slot(var, 'm')
+    for var in var_list:
       self.add_slot(var, 'v')
+    if self._amsgrad:
+      for var in var_list:
+        self.add_slot(var, 'vhat')
+
+  def set_weights(self, weights):
+    params = self.weights
+    # If the weights are generated by Keras V1 optimizer, it includes vhats
+    # even without amsgrad, i.e, V1 optimizer has 3x + 1 variables, while V2
+    # optimizer has 2x + 1 variables. Filter vhats out for compatibility.
+    num_vars = int((len(params) - 1) / 2)
+    if len(weights) == 3 * num_vars + 1:
+      weights = weights[:len(params)]
+    super(Adam, self).set_weights(weights)
 
   def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
     m = self.get_slot(var, 'm')
     v = self.get_slot(var, 'v')
-    # TODO(tanzheny): let optimizer have its own step counter, and let
-    # beta1_power and beta2_power depend on it.
-    return training_ops.resource_apply_adam(
-        var.handle,
-        m.handle,
-        v.handle,
-        math_ops.cast(self._get_hyper('beta_1'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_2'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('learning_rate'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_1'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('beta_2'), grad.dtype.base_dtype),
-        math_ops.cast(self._get_hyper('epsilon'), grad.dtype.base_dtype),
-        grad,
-        use_locking=self._use_locking)
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    epsilon = self._get_hyper('epsilon', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    if not self._amsgrad:
+      return training_ops.resource_apply_adam(
+          var.handle,
+          m.handle,
+          v.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      vhat = self.get_slot(var, 'vhat')
+      return training_ops.resource_apply_adam_with_amsgrad(
+          var.handle,
+          m.handle,
+          v.handle,
+          vhat.handle,
+          beta_1_power,
+          beta_2_power,
+          lr_t,
+          beta_1_t,
+          beta_2_t,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    if not self._amsgrad:
+      v_sqrt = math_ops.sqrt(v_t)
+      var_update = state_ops.assign_sub(
+          var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t])
+    else:
+      v_hat = self.get_slot(var, 'vhat')
+      v_hat_t = math_ops.maximum(v_hat, v_t)
+      with ops.control_dependencies([v_hat_t]):
+        v_hat_t = state_ops.assign(
+            v_hat, v_hat_t, use_locking=self._use_locking)
+      v_hat_sqrt = math_ops.sqrt(v_hat_t)
+      var_update = state_ops.assign_sub(
+          var,
+          lr * m_t / (v_hat_sqrt + epsilon_t),
+          use_locking=self._use_locking)
+      return control_flow_ops.group(*[var_update, m_t, v_t, v_hat_t])
+
+  def _resource_scatter_add(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_add(x.handle, i, v)]):
+      return x.value()
 
   def get_config(self):
     config = super(Adam, self).get_config()
     config.update({
         'learning_rate': self._serialize_hyperparameter('learning_rate'),
+        'decay': self._serialize_hyperparameter('decay'),
         'beta_1': self._serialize_hyperparameter('beta_1'),
         'beta_2': self._serialize_hyperparameter('beta_2'),
         'epsilon': self._serialize_hyperparameter('epsilon'),
+        'amsgrad': self._amsgrad,
     })
     return config
diff --git a/tensorflow/python/keras/optimizer_v2/adam_test.py b/tensorflow/python/keras/optimizer_v2/adam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..3bbafe12f8e27df9bcc158ae6b50cba2fb086914
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adam_test.py
@@ -0,0 +1,508 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adam_update_numpy(param,
+                      g_t,
+                      t,
+                      m,
+                      v,
+                      lr=0.001,
+                      beta1=0.9,
+                      beta2=0.999,
+                      epsilon=1e-7):
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  param_t = param - lr_t * m_t / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+def adam_update_numpy_amsgrad(param,
+                              g_t,
+                              t,
+                              m,
+                              v,
+                              vhat,
+                              lr=0.001,
+                              beta1=0.9,
+                              beta2=0.999,
+                              epsilon=1e-7):
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+  vhat_t = np.maximum(vhat, v_t)
+
+  param_t = param - lr_t * m_t / (np.sqrt(vhat_t) + epsilon)
+  return param_t, m_t, v_t, vhat_t
+
+
+def adam_sparse_update_numpy_amsgrad(param,
+                                     indices,
+                                     g_t,
+                                     t,
+                                     m,
+                                     v,
+                                     vhat,
+                                     lr=0.001,
+                                     beta1=0.9,
+                                     beta2=0.999,
+                                     epsilon=1e-7):
+  m_t, v_t, vhat_t, param_t = (np.copy(m), np.copy(v), np.copy(vhat),
+                               np.copy(param))
+  lr_t = lr * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = beta2 * v[indices] + (1 - beta2) * g_t * g_t
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  v_hat_t = np.maximum(vhat_t, v_t)
+  v_hat_t_slice = v_hat_t[indices]
+  param_t_slice = param[indices] - (
+      lr_t * (m_t_slice / (np.sqrt(v_hat_t_slice) + epsilon)))
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t, vhat_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+class AdamOptimizerTest(test.TestCase):
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adam.Adam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 3.0, 4.0], self.evaluate(var1))
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adam.Adam(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adam.Adam().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adam.Adam().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            self.evaluate(repeated_index_update_var))
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              self.evaluate(repeated_index_update_var))
+
+  def doTestBasic(self, use_callable_params=False):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = lambda: 0.001
+        beta1 = lambda: 0.9
+        beta2 = lambda: 0.999
+        epsilon = lambda: 1e-8
+        if not use_callable_params:
+          learning_rate = learning_rate()
+          beta1 = beta1()
+          beta2 = beta2()
+          epsilon = epsilon()
+
+        opt = adam.Adam(learning_rate=learning_rate)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testResourceBasic(self):
+    self.doTestBasic()
+
+  def testBasicCallableParams(self):
+    with context.eager_mode():
+      self.doTestBasic(use_callable_params=True)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasicWithAmsgrad(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, v0hat, m1, v1, v1hat = 0.0, 0.0, 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0, v0hat = adam_update_numpy_amsgrad(
+              var0_np, grads0_np, t, m0, v0, v0hat)
+          var1_np, m1, v1, v1hat = adam_update_numpy_amsgrad(
+              var1_np, grads1_np, t, m1, v1, v1hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_in_graph_and_eager_modes
+  def testSparseWithAmsgrad(self):
+    # dtypes.half does not work on gpu + eager.
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        m0 = np.array([[0.0], [0.0]])
+        v0 = np.array([[0.0], [0.0]])
+        v0hat = np.array([[0.0], [0.0]])
+        indices_np = np.array([1])
+        indices = constant_op.constant(indices_np, dtype=dtypes.int32)
+        var0_np = np.array([[1.0], [2.0]], dtype=dtype.as_numpy_dtype)
+        repeated_index_update_var = variables.Variable(var0_np, dtype=dtype)
+        aggregated_update_var = variables.Variable(var0_np, dtype=dtype)
+        grads0_np = np.array([[0.2]], dtype=dtype.as_numpy_dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant([0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]), constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(grads0_np, indices,
+                                            constant_op.constant([2, 1]))
+        opt_repeated = adam.Adam(amsgrad=True)
+        opt_aggregated = adam.Adam(amsgrad=True)
+        if not context.executing_eagerly():
+          repeated_update = opt_repeated.apply_gradients(
+              [(grad_repeated_index, repeated_index_update_var)])
+          aggregated_update = opt_aggregated.apply_gradients(
+              [(grad_aggregated, aggregated_update_var)])
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllClose(
+            self.evaluate(aggregated_update_var),
+            self.evaluate(repeated_index_update_var))
+        for t in range(3):
+          if not context.executing_eagerly():
+            self.evaluate(repeated_update)
+            self.evaluate(aggregated_update)
+          else:
+            opt_repeated.apply_gradients(
+                [(grad_repeated_index, repeated_index_update_var)])
+            opt_aggregated.apply_gradients(
+                [(grad_aggregated, aggregated_update_var)])
+
+          var0_np, m0, v0, v0hat = adam_sparse_update_numpy_amsgrad(
+              var0_np, indices_np, grads0_np, t, m0, v0, v0hat)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(aggregated_update_var))
+          self.assertAllCloseAccordingToType(
+              self.evaluate(aggregated_update_var),
+              self.evaluate(repeated_index_update_var))
+
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        beta_1 = 0.9
+        beta_2 = 0.999
+        epsilon = 1e-7
+        decay = 0.5
+
+        opt = adam.Adam(
+            learning_rate=learning_rate,
+            beta_1=beta_1,
+            beta_2=beta_2,
+            epsilon=epsilon,
+            decay=decay)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        self.evaluate(variables.global_variables_initializer())
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.evaluate(update)
+          lr_np = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, lr=lr_np)
+          var1_np, m1, v1 = adam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, lr=lr_np)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+        # Run 3 steps of Adam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          update.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adam.Adam()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta_1_power, beta_2_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of intertwined Adam1 and Adam2.
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          self.assertAllCloseAccordingToType(0.999**(t + 1),
+                                             self.evaluate(beta_2_power))
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adam.Adam(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+      self.assertEqual(
+          self.evaluate(opt.variables()[0]), self.evaluate(opt.iterations))
+
+  def testSetWeightsFromV1AdamWithoutMinimize(self):
+    keras_v1_adam = optimizers.Adam()
+    keras_v2_adam = adam.Adam()
+    keras_v2_adam.set_weights(keras_v1_adam.get_weights())
+    keras_v1_iteration = keras_v1_adam.iterations
+    keras_v2_iteration = keras_v2_adam.iterations
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(
+        self.evaluate(keras_v1_iteration), self.evaluate(keras_v2_iteration))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax.py b/tensorflow/python/keras/optimizer_v2/adamax.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddd78584f852f24f9da6277888d1883bb44db327
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax.py
@@ -0,0 +1,159 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Adamax for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training import training_ops
+
+
+class Adamax(adam.Adam):
+  """Optimizer that implements the Adamax algorithm.
+
+  It is a variant of Adam based on the infinity norm.
+  Default parameters follow those provided in the paper.
+  Adamax is sometimes superior to adam, specially in models with embeddings.
+
+  References
+    see Section 7 of [Kingma et al., 2014](http://arxiv.org/abs/1412.6980)
+      ([pdf](http://arxiv.org/pdf/1412.6980.pdf)).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               name='Adamax',
+               **kwargs):
+    """Construct a new Adamax optimizer.
+
+    Initialization:
+
+    ```
+    m_0 <- 0 (Initialize initial 1st moment vector)
+    v_0 <- 0 (Initialize the exponentially weighted infinity norm)
+    t <- 0 (Initialize timestep)
+    ```
+
+    The update rule for `variable` with gradient `g` uses an optimization
+    described at the end of section 7.1 of the paper:
+
+    ```
+    t <- t + 1
+
+    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
+    v_t <- max(beta2 * v_{t-1}, abs(g))
+    variable <- variable - learning_rate / (1 - beta1^t) * m_t / (v_t + epsilon)
+    ```
+
+    Similar to AdamOptimizer, the epsilon is added for numerical stability
+    (especially to get rid of division by zero when v_t = 0).
+
+    Contrast to AdamOptimizer, the sparse implementation of this algorithm
+    (used when the gradient is an IndexedSlices object, typically because of
+    `tf.gather` or an embedding lookup in the forward pass) only updates
+    variable slices and corresponding `m_t`, `v_t` terms when that part of
+    the variable was used in the forward pass. This means that the sparse
+    behavior is contrast to the dense behavior (similar to some momentum
+    implementations which ignore momentum unless a variable slice was actually
+    used).
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adamax".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+    # pylint: disable=useless-super-delegation
+    super(Adamax, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name,
+        **kwargs)
+    # pylint: enable=useless-super-delegation
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    return training_ops.resource_apply_ada_max(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        lr_t,
+        beta_1_t,
+        beta_2_t,
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_slice = array_ops.gather(m, indices)
+    m_t_slice = m_slice * beta_1_t + grad * (1 - beta_1_t)
+    with ops.control_dependencies([m_t_slice]):
+      m_t = self._resource_scatter_update(m, indices, m_t_slice)
+
+    # u_t = max(beta2 * u, abs(g_t))
+    v = self.get_slot(var, 'v')
+    v_slice = array_ops.gather(v, indices)
+    v_t_slice = math_ops.maximum(v_slice * beta_2_t, math_ops.abs(grad))
+    with ops.control_dependencies([v_t_slice]):
+      v_t = self._resource_scatter_update(v, indices, v_t_slice)
+    # theta_t = theta - lr / (1 - beta1^t) * m_t / u_t
+    var_slice = -lr_t / (1 - beta_1_power) * (
+        m_t_slice / (v_t_slice + epsilon_t))
+    with ops.control_dependencies([var_slice]):
+      var_update = self._resource_scatter_add(var, indices, var_slice)
+    return control_flow_ops.group(*[var_update, m_t, v_t])
+
+  def _resource_scatter_update(self, x, i, v):
+    with ops.control_dependencies(
+        [resource_variable_ops.resource_scatter_update(
+            x.handle, i, v)]):
+      return x.value()
diff --git a/tensorflow/python/keras/optimizer_v2/adamax_test.py b/tensorflow/python/keras/optimizer_v2/adamax_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..baf131fbb0ce5bd4ab6c7d9b8c49e0519290dcef
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/adamax_test.py
@@ -0,0 +1,367 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Adamax."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import adamax
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def adamax_update_numpy(param,
+                        g_t,
+                        t,
+                        m,
+                        v,
+                        alpha=0.001,
+                        beta1=0.9,
+                        beta2=0.999,
+                        epsilon=1e-8):
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = np.maximum(beta2 * v, np.abs(g_t))
+  param_t = param - (alpha / (1 - beta1**(t + 1))) * (m_t / (v_t + epsilon))
+  return param_t, m_t, v_t
+
+
+def adamax_sparse_update_numpy(param,
+                               indices,
+                               g_t,
+                               t,
+                               m,
+                               v,
+                               alpha=0.001,
+                               beta1=0.9,
+                               beta2=0.999,
+                               epsilon=1e-8):
+  m_t, v_t, param_t = np.copy(m), np.copy(v), np.copy(param)
+  m_t_slice = beta1 * m[indices] + (1 - beta1) * g_t
+  v_t_slice = np.maximum(beta2 * v[indices], np.abs(g_t))
+  param_t_slice = param[indices] - (
+      (alpha / (1 - beta1**(t + 1))) * (m_t_slice / (v_t_slice + epsilon)))
+  m_t[indices] = m_t_slice
+  v_t[indices] = v_t_slice
+  param_t[indices] = param_t_slice
+  return param_t, m_t, v_t
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  return beta_1_power
+
+
+class AdamaxOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        zero_slots = lambda: np.zeros((3), dtype=dtype.as_numpy_dtype)  # pylint: disable=cell-var-from-loop
+        m0, v0, m1, v1 = zero_slots(), zero_slots(), zero_slots(), zero_slots()
+        var0_np = np.array([1.0, 2.0, 3.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([4.0, 5.0, 6.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+
+        grads0_np_indices = np.array([0, 1], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([2, 1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = adamax.Adamax()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0, 3.0], var0.eval())
+        self.assertAllClose([4.0, 5.0, 6.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_sparse_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_sparse_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testSparseDevicePlacement(self):
+    for index_dtype in [dtypes.int32, dtypes.int64]:
+      with self.cached_session(force_gpu=test.is_gpu_available()):
+        # If a GPU is available, tests that all optimizer ops can be placed on
+        # it (i.e. they have GPU kernels).
+        var = variables.Variable([[1.0], [2.0]])
+        indices = constant_op.constant([0, 1], dtype=index_dtype)
+        gathered_sum = math_ops.reduce_sum(array_ops.gather(var, indices))
+        optimizer = adamax.Adamax(3.0)
+        minimize_op = optimizer.minimize(gathered_sum, var_list=[var])
+        variables.global_variables_initializer().run()
+        minimize_op.run()
+
+  @test_util.run_deprecated_v1
+  def testSparseRepeatedIndices(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        repeated_index_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        aggregated_update_var = variables.Variable(
+            [[1.0], [2.0]], dtype=dtype)
+        grad_repeated_index = ops.IndexedSlices(
+            constant_op.constant(
+                [0.1, 0.1], shape=[2, 1], dtype=dtype),
+            constant_op.constant([1, 1]),
+            constant_op.constant([2, 1]))
+        grad_aggregated = ops.IndexedSlices(
+            constant_op.constant(
+                [0.2], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]),
+            constant_op.constant([2, 1]))
+        repeated_update = adamax.Adamax().apply_gradients(
+            [(grad_repeated_index, repeated_index_update_var)])
+        aggregated_update = adamax.Adamax().apply_gradients(
+            [(grad_aggregated, aggregated_update_var)])
+        variables.global_variables_initializer().run()
+        self.assertAllClose(aggregated_update_var.eval(),
+                            repeated_index_update_var.eval())
+        for _ in range(3):
+          repeated_update.run()
+          aggregated_update.run()
+          self.assertAllClose(aggregated_update_var.eval(),
+                              repeated_index_update_var.eval())
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasic(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        opt = adamax.Adamax()
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          beta_1_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(
+              var0_np, self.evaluate(var0), rtol=1e-2)
+          self.assertAllCloseAccordingToType(
+              var1_np, self.evaluate(var1), rtol=1e-2)
+
+  @test_util.run_in_graph_and_eager_modes(reset_test=True)
+  def testBasicWithLearningRateDecay(self):
+    for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
+      with self.session(graph=ops.Graph()):
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(
+            var0_np, name="var0_%d" % i)
+        var1 = resource_variable_ops.ResourceVariable(
+            var1_np, name="var1_%d" % i)
+
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+
+        learning_rate = 0.001
+        decay = 0.002
+        opt = adamax.Adamax(learning_rate=learning_rate, decay=decay)
+        if not context.executing_eagerly():
+          update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+        if not context.executing_eagerly():
+          self.evaluate(variables.global_variables_initializer())
+          # Fetch params to validate initial values
+          self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+          self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          beta_1_power = get_beta_accumulators(opt, dtype)
+          self.assertAllCloseAccordingToType(0.9**(t + 1),
+                                             self.evaluate(beta_1_power))
+          if not context.executing_eagerly():
+            self.evaluate(update)
+          else:
+            opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+
+          lr = learning_rate / (1 + decay * t)
+
+          var0_np, m0, v0 = adamax_update_numpy(
+              var0_np, grads0_np, t, m0, v0, alpha=lr)
+          var1_np, m1, v1 = adamax_update_numpy(
+              var1_np, grads1_np, t, m1, v1, alpha=lr)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0),
+                                             rtol=1e-2)
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1),
+                                             rtol=1e-2)
+
+  @test_util.run_deprecated_v1
+  def testTensorLearningRate(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax(constant_op.constant(0.001))
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Adamax
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testSharing(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = adamax.Adamax()
+        update1 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        update2 = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        beta1_power = get_beta_accumulators(opt, dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        # Run 3 steps of intertwined Adamax1 and Adamax2.
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          if t % 2 == 0:
+            update1.run()
+          else:
+            update2.run()
+
+          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  def testSlotsUniqueEager(self):
+    with context.eager_mode():
+      v1 = resource_variable_ops.ResourceVariable(1.)
+      v2 = resource_variable_ops.ResourceVariable(1.)
+      opt = adamax.Adamax(1.)
+      opt.minimize(lambda: v1 + v2, var_list=[v1, v2])
+      # There should be iteration, and two unique slot variables for v1 and v2.
+      self.assertEqual(5, len(set(opt.variables())))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl.py b/tensorflow/python/keras/optimizer_v2/ftrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..e278e352f551a12718f6b400b16f9d7e05d0c02e
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl.py
@@ -0,0 +1,210 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ftrl-proximal for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.ops import init_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.training import training_ops
+
+
+class Ftrl(optimizer_v2.OptimizerV2):
+  """Optimizer that implements the FTRL algorithm.
+
+  See this [paper](
+  https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf).
+  This version has support for both online L2 (the L2 penalty given in the paper
+  above) and shrinkage-type L2 (which is the addition of an L2 penalty to the
+  loss function).
+  """
+
+  def __init__(self,
+               learning_rate,
+               learning_rate_power=-0.5,
+               initial_accumulator_value=0.1,
+               l1_regularization_strength=0.0,
+               l2_regularization_strength=0.0,
+               name='Ftrl',
+               l2_shrinkage_regularization_strength=0.0,
+               **kwargs):
+    r"""Construct a new FTRL optimizer.
+
+    Args:
+      learning_rate: A float value or a constant float `Tensor`.
+      learning_rate_power: A float value, must be less or equal to zero.
+        Controls how the learning rate decreases during training. Use zero for
+        a fixed learning rate.
+      initial_accumulator_value: The starting value for accumulators.
+        Only zero or positive values are allowed.
+      l1_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      l2_regularization_strength: A float value, must be greater than or
+        equal to zero.
+      name: Optional name prefix for the operations created when applying
+        gradients.  Defaults to "Ftrl".
+      l2_shrinkage_regularization_strength: A float value, must be greater than
+        or equal to zero. This differs from L2 above in that the L2 above is a
+        stabilization penalty, whereas this L2 shrinkage is a magnitude penalty.
+        The FTRL formulation can be written as:
+        w_{t+1} = argmin_w(\hat{g}_{1:t}w + L1*||w||_1 + L2*||w||_2^2), where
+        \hat{g} = g + (2*L2_shrinkage*w), and g is the gradient of the loss
+        function w.r.t. the weights w.
+        Specifically, in the absence of L1 regularization, it is equivalent to
+        the following update rule:
+        w_{t+1} = w_t - lr_t / (1 + 2*L2*lr_t) * g_t -
+                  2*L2_shrinkage*lr_t / (1 + 2*L2*lr_t) * w_t
+        where lr_t is the learning rate at t.
+        When input is sparse shrinkage will only happen on the active weights.\
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+
+    Raises:
+      ValueError: If one of the arguments is invalid.
+
+    References
+      See [paper]
+        (https://www.eecs.tufts.edu/~dsculley/papers/ad-click-prediction.pdf)
+    """
+    super(Ftrl, self).__init__(name, **kwargs)
+
+    if initial_accumulator_value < 0.0:
+      raise ValueError(
+          'initial_accumulator_value %f needs to be positive or zero' %
+          initial_accumulator_value)
+    if learning_rate_power > 0.0:
+      raise ValueError('learning_rate_power %f needs to be negative or zero' %
+                       learning_rate_power)
+    if l1_regularization_strength < 0.0:
+      raise ValueError(
+          'l1_regularization_strength %f needs to be positive or zero' %
+          l1_regularization_strength)
+    if l2_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_regularization_strength %f needs to be positive or zero' %
+          l2_regularization_strength)
+    if l2_shrinkage_regularization_strength < 0.0:
+      raise ValueError(
+          'l2_shrinkage_regularization_strength %f needs to be positive'
+          ' or zero' % l2_shrinkage_regularization_strength)
+
+    self._set_hyper('learning_rate', learning_rate)
+    self._set_hyper('decay', self._initial_decay)
+    self._set_hyper('learning_rate_power', learning_rate_power)
+    self._set_hyper('l1_regularization_strength', l1_regularization_strength)
+    self._set_hyper('l2_regularization_strength', l2_regularization_strength)
+    self._initial_accumulator_value = initial_accumulator_value
+    self._l2_shrinkage_regularization_strength = (
+        l2_shrinkage_regularization_strength)
+
+  def _create_slots(self, var_list):
+    # Create the "accum" and "linear" slots.
+    for var in var_list:
+      dtype = var.dtype.base_dtype
+      init = init_ops.constant_initializer(
+          self._initial_accumulator_value, dtype=dtype)
+      self.add_slot(var, 'accumulator', init)
+      self.add_slot(var, 'linear')
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    learning_rate_power = self._get_hyper('learning_rate_power', var_dtype)
+    l1_regularization_strength = self._get_hyper('l1_regularization_strength',
+                                                 var_dtype)
+    l2_regularization_strength = self._get_hyper('l2_regularization_strength',
+                                                 var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    learning_rate_power = self._get_hyper('learning_rate_power', var_dtype)
+    l1_regularization_strength = self._get_hyper('l1_regularization_strength',
+                                                 var_dtype)
+    l2_regularization_strength = self._get_hyper('l2_regularization_strength',
+                                                 var_dtype)
+    accum = self.get_slot(var, 'accumulator')
+    linear = self.get_slot(var, 'linear')
+    if self._l2_shrinkage_regularization_strength <= 0.0:
+      return training_ops.resource_sparse_apply_ftrl(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          learning_rate_power,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_ftrl_v2(
+          var.handle,
+          accum.handle,
+          linear.handle,
+          grad,
+          indices,
+          lr_t,
+          l1_regularization_strength,
+          l2_regularization_strength,
+          math_ops.cast(self._l2_shrinkage_regularization_strength, var_dtype),
+          learning_rate_power,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(Ftrl, self).get_config()
+    config.update({
+        'learning_rate':
+            self._serialize_hyperparameter('learning_rate'),
+        'decay':
+            self._serialize_hyperparameter('decay'),
+        'initial_accumulator_value':
+            self._initial_accumulator_value,
+        'learning_rate_power':
+            self._serialize_hyperparameter('learning_rate_power'),
+        'l1_regularization_strength':
+            self._serializer_hyperparameter('l1_regularization_strength'),
+        'l2_regularization_strength':
+            self._serializer_hyperparameter('l2_regularization_strength'),
+        'l2_shrinkage_regularization_strength':
+            self._l2_shrinkage_regularization_strength,
+    })
+    return config
diff --git a/tensorflow/python/keras/optimizer_v2/ftrl_test.py b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..bec400e8cbba2654decaf520a24800095e4d16f5
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/ftrl_test.py
@@ -0,0 +1,440 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Functional tests for Ftrl operations."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import ftrl
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+from tensorflow.python.training import adagrad
+from tensorflow.python.training import gradient_descent
+
+
+class FtrlOptimizerTest(test.TestCase):
+
+  def doTestFtrlwithoutRegularization(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+          var1 = resource_variable_ops.ResourceVariable([0.0, 0.0], dtype=dtype)
+        else:
+          var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+          var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllClose([0.0, 0.0], v0_val)
+        self.assertAllClose([0.0, 0.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.60260963, -4.29698515]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28432083, -0.56694895]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=False)
+
+  @test_util.run_deprecated_v1
+  def testResourceFtrlWithoutRegularization(self):
+    self.doTestFtrlwithoutRegularization(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testFtrlwithoutRegularization2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.0,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 3 steps FTRL
+        for _ in range(3):
+          update.run()
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.55607247, -3.98729396]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.28232238, -0.56096673]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = ftrl.Ftrl(1.0).minimize(loss, var_list=[var0])
+        variables.global_variables_initializer().run()
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        sgd_op.run()
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=0.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-7.66718769, -10.91273689]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.93460727, -1.86147261]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.24059935, -0.46829352]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02406147, -0.04830509]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2_L2Shrinkage(self):
+    """Test the new FTRL op with support for l2 shrinkage.
+
+    The addition of this parameter which places a constant pressure on weights
+    towards the origin causes the gradient descent trajectory to differ. The
+    weights will tend to have smaller magnitudes with this parameter set.
+    """
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([4.0, 3.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType(
+            np.array([-0.22578995, -0.44345796]), v0_val)
+        self.assertAllCloseAccordingToType(
+            np.array([-0.14378493, -0.13229476]), v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL1_L2_L2ShrinkageSparse(self):
+    """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[4.0], [3.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+
+        opt = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
+        self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val)
+        self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val)
+
+  @test_util.run_deprecated_v1
+  def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
+    """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session() as sess:
+        var0 = variables.Variable([1.0, 2.0], dtype=dtype)
+        var1 = variables.Variable([1.0, 2.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+        grads1 = constant_op.constant([0.1, 0.2], dtype=dtype)
+
+        opt0 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0,
+            l2_shrinkage_regularization_strength=0.1)
+        opt1 = ftrl.Ftrl(
+            3.0,
+            initial_accumulator_value=0.1,
+            l1_regularization_strength=0.001,
+            l2_regularization_strength=2.0)
+        update0 = opt0.apply_gradients([(grads0, var0)])
+        update1 = opt1.apply_gradients([(grads1, var1)])
+        variables.global_variables_initializer().run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
+        self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
+
+        # Run 10 steps FTRL
+        for _ in range(10):
+          update0.run()
+          update1.run()
+
+        v0_val, v1_val = self.evaluate([var0, var1])
+        # var0 is experiencing L2 shrinkage so it should be smaller than var1
+        # in magnitude.
+        self.assertTrue((v0_val**2 < v1_val**2).all())
+        accum0 = sess.run(opt0.get_slot(var0, "accumulator"))
+        accum1 = sess.run(opt1.get_slot(var1, "accumulator"))
+        # L2 shrinkage should not change how we update grad accumulator.
+        self.assertAllCloseAccordingToType(accum0, accum1)
+
+  def applyOptimizer(self, opt, dtype, steps=5, is_sparse=False):
+    if is_sparse:
+      var0 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      var1 = variables.Variable([[0.0], [0.0]], dtype=dtype)
+      grads0 = ops.IndexedSlices(
+          constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+          constant_op.constant([0]), constant_op.constant([2, 1]))
+      grads1 = ops.IndexedSlices(
+          constant_op.constant([0.02], shape=[1, 1], dtype=dtype),
+          constant_op.constant([1]), constant_op.constant([2, 1]))
+    else:
+      var0 = variables.Variable([0.0, 0.0], dtype=dtype)
+      var1 = variables.Variable([0.0, 0.0], dtype=dtype)
+      grads0 = constant_op.constant([0.1, 0.2], dtype=dtype)
+      grads1 = constant_op.constant([0.01, 0.02], dtype=dtype)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    variables.global_variables_initializer().run()
+
+    sess = ops.get_default_session()
+    v0_val, v1_val = self.evaluate([var0, var1])
+    if is_sparse:
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
+      self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
+    else:
+      self.assertAllCloseAccordingToType([0.0, 0.0], v0_val)
+      self.assertAllCloseAccordingToType([0.0, 0.0], v1_val)
+
+    # Run Ftrl for a few steps
+    for _ in range(steps):
+      update.run()
+
+    v0_val, v1_val = self.evaluate([var0, var1])
+    return v0_val, v1_val
+
+  # When variables are initialized with Zero, FTRL-Proximal has two properties:
+  # 1. Without L1&L2 but with fixed learning rate, FTRL-Proximal is identical
+  # with GradientDescent.
+  # 2. Without L1&L2 but with adaptive learning rate, FTRL-Proximal is identical
+  # with Adagrad.
+  # So, basing on these two properties, we test if our implementation of
+  # FTRL-Proximal performs same updates as Adagrad or GradientDescent.
+  @test_util.run_deprecated_v1
+  def testEquivAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivSparseAdagradwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Adagrad learning rate
+                learning_rate_power=-0.5,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivSparseGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype,
+            is_sparse=True)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0),
+            dtype,
+            is_sparse=True)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+  @test_util.run_deprecated_v1
+  def testEquivGradientDescentwithoutRegularization(self):
+    for dtype in [dtypes.half, dtypes.float32]:
+      with self.cached_session():
+        val0, val1 = self.applyOptimizer(
+            ftrl.Ftrl(
+                3.0,
+                # Fixed learning rate
+                learning_rate_power=-0.0,
+                initial_accumulator_value=0.1,
+                l1_regularization_strength=0.0,
+                l2_regularization_strength=0.0),
+            dtype)
+
+      with self.cached_session():
+        val2, val3 = self.applyOptimizer(
+            gradient_descent.GradientDescentOptimizer(3.0), dtype)
+
+      self.assertAllCloseAccordingToType(val0, val2)
+      self.assertAllCloseAccordingToType(val1, val3)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent.py b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
index 90106c941cca5b1a22d81e5492b750deafea33b0..2b82b5e78dedce5ff68b860d143b1ecadd18e0bd 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent.py
@@ -19,7 +19,6 @@ from __future__ import print_function
 
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.optimizer_v2 import optimizer_v2
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training import training_ops
 
@@ -62,7 +61,8 @@ class SGD(optimizer_v2.OptimizerV2):
                learning_rate=0.001,
                momentum=0.0,
                nesterov=False,
-               name="SGD"):
+               name="SGD",
+               **kwargs):
     """Construct a new Stochastic Gradient Descent or Momentum optimizer.
 
     Arguments:
@@ -72,9 +72,11 @@ class SGD(optimizer_v2.OptimizerV2):
       nesterov: boolean. Whether to apply Nesterov momentum.
       name: Optional name prefix for the operations created when applying
         gradients.  Defaults to 'SGD'.
+      **kwargs: keyword arguments. Allowed to be {`decay`}
     """
-    super(SGD, self).__init__(name)
+    super(SGD, self).__init__(name, **kwargs)
     self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", self._initial_decay)
 
     self._momentum = False
     if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
@@ -91,44 +93,44 @@ class SGD(optimizer_v2.OptimizerV2):
         self.add_slot(var, "momentum")
 
   def _resource_apply_dense(self, grad, var):
-    learning_rate = self._get_hyper("learning_rate")
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
     if self._momentum:
       momentum_var = self.get_slot(var, "momentum")
-      return training_ops.resource_apply_momentum(
+      return training_ops.resource_apply_keras_momentum(
           var.handle,
           momentum_var.handle,
-          math_ops.cast(learning_rate, grad.dtype.base_dtype),
+          lr_t,
           grad,
-          math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
+          self._get_hyper("momentum", var_dtype),
           use_locking=self._use_locking,
           use_nesterov=self._nesterov)
     else:
       return training_ops.resource_apply_gradient_descent(
-          var.handle,
-          math_ops.cast(learning_rate, grad.dtype.base_dtype),
-          grad,
-          use_locking=self._use_locking)
+          var.handle, lr_t, grad, use_locking=self._use_locking)
 
   def _resource_apply_sparse_duplicate_indices(self, grad, var, indices):
     if self._momentum:
       return super(SGD, self)._resource_apply_sparse_duplicate_indices(
           grad, var, indices)
     else:
-      return resource_variable_ops.resource_scatter_add(
-          var.handle, indices, -grad * math_ops.cast(
-              self._get_hyper("learning_rate"), grad.dtype.base_dtype))
+      var_dtype = var.dtype.base_dtype
+      lr_t = self._decayed_lr(var_dtype)
+      return resource_variable_ops.resource_scatter_add(var.handle, indices,
+                                                        -grad * lr_t)
 
   def _resource_apply_sparse(self, grad, var, indices):
     # This method is only needed for momentum optimization.
-    learning_rate = self._get_hyper("learning_rate")
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
     momentum_var = self.get_slot(var, "momentum")
-    return training_ops.resource_sparse_apply_momentum(
+    return training_ops.resource_sparse_apply_keras_momentum(
         var.handle,
         momentum_var.handle,
-        math_ops.cast(learning_rate, grad.dtype.base_dtype),
+        lr_t,
         grad,
         indices,
-        math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
+        self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._nesterov)
 
@@ -136,6 +138,7 @@ class SGD(optimizer_v2.OptimizerV2):
     config = super(SGD, self).get_config()
     config.update({
         "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "decay": self._serialize_hyperparameter("decay"),
         "momentum": self._serialize_hyperparameter("momentum"),
         "nesterov": self._nesterov,
     })
diff --git a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
index b84bf1a6ecc073740dcff02996b7777d905eb339..0c64202da81c36e4140be7ca7719e9d426c549cc 100644
--- a/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
+++ b/tensorflow/python/keras/optimizer_v2/gradient_descent_test.py
@@ -47,7 +47,6 @@ class GradientDescentOptimizerTest(test.TestCase):
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
         grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
         sgd = gradient_descent.SGD(3.0)
-        # self.assertFalse(sgd._initial_decay)
         sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
         self.evaluate(variables.global_variables_initializer())
         # Run 1 step of sgd
@@ -58,6 +57,43 @@ class GradientDescentOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                            self.evaluate(var1))
 
+  @test_util.run_in_graph_and_eager_modes
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+        learning_rate = 3.0
+        decay = 0.5
+        sgd = gradient_descent.SGD(learning_rate=learning_rate, decay=decay)
+        if not context.executing_eagerly():
+          sgd_op = sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 2 steps of sgd
+        if not context.executing_eagerly():
+          self.evaluate(sgd_op)
+        else:
+          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Validate updated params
+        self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
+                                           self.evaluate(var1))
+
+        if not context.executing_eagerly():
+          self.evaluate(sgd_op)
+        else:
+          sgd.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [1.0 - 3.0 * 0.1 - 2.0 * 0.1, 2.0 - 3.0 * 0.1 - 2.0 * 0.1],
+            self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            [3.0 - 3.0 * 0.01 - 2.0 * 0.01, 4.0 - 3.0 * 0.01 - 2.0 * 0.01],
+            self.evaluate(var1))
+
   @test_util.run_in_graph_and_eager_modes
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
@@ -98,6 +134,7 @@ class GradientDescentOptimizerTest(test.TestCase):
                                            self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 1.0], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -137,6 +174,7 @@ class GradientDescentOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
                                            self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -170,6 +208,37 @@ class GradientDescentOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
                                            self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
+  def testSparseBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = variables.Variable([[1.0], [2.0]], dtype=dtype)
+        var1 = variables.Variable([[3.0], [4.0]], dtype=dtype)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant([0.1], shape=[1, 1], dtype=dtype),
+            constant_op.constant([0]), constant_op.constant([2, 1]))
+        grads1 = ops.IndexedSlices(
+            constant_op.constant([0.01], shape=[1, 1], dtype=dtype),
+            constant_op.constant([1]), constant_op.constant([2, 1]))
+        sgd_op = gradient_descent.SGD(
+            3.0, decay=0.5).apply_gradients(
+                zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+        # Run 2 steps of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
+                                           self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
+                                           self.evaluate(var1))
+
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType(
+            [[1.0 - 3.0 * 0.1 - 2.0 * 0.1], [2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            [[3.0], [4.0 - 3.0 * 0.01 - 2.0 * 0.01]], self.evaluate(var1))
+
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
       optimizer = gradient_descent.SGD(1.0)
@@ -194,10 +263,8 @@ class GradientDescentOptimizerTest(test.TestCase):
 class MomentumOptimizerTest(test.TestCase):
 
   def _update_nesterov_momentum_numpy(self, var, accum, g, lr, momentum):
-    var = var + accum * lr * momentum
-    accum = accum * momentum + g
-    var = var - lr * accum
-    var = var - accum * lr * momentum
+    accum = accum * momentum - g * lr
+    var += (accum * momentum - g * lr)
     return var, accum
 
   @test_util.run_in_graph_and_eager_modes
@@ -222,9 +289,9 @@ class MomentumOptimizerTest(test.TestCase):
 
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
@@ -232,9 +299,9 @@ class MomentumOptimizerTest(test.TestCase):
         self.evaluate(mom_update)
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([0.1, 0.1]), self.evaluate(slot0))
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([0.01, 0.01]), self.evaluate(slot1))
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
@@ -248,11 +315,11 @@ class MomentumOptimizerTest(test.TestCase):
           mom_opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
             self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            self.evaluate(slot1))
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -265,6 +332,7 @@ class MomentumOptimizerTest(test.TestCase):
                 3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
             ]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -289,9 +357,10 @@ class MomentumOptimizerTest(test.TestCase):
               var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
               var1_np, accum1_np, 3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -329,10 +398,11 @@ class MomentumOptimizerTest(test.TestCase):
               var0_np, accum0_np, var0_np * 10, 2.0, 0.9)
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(
               var1_np, accum1_np, 3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       # This test invokes the ResourceSparseApplyMomentum operation, which
@@ -386,6 +456,7 @@ class MomentumOptimizerTest(test.TestCase):
     self.evaluate(sgd_op)
     self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -401,43 +472,50 @@ class MomentumOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                 3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -456,57 +534,65 @@ class MomentumOptimizerTest(test.TestCase):
 
         # Check we have slots
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([0, 0], var0.eval()[0])
-        self.assertAllClose([0, 0], var0.eval()[1])
-        self.assertAllClose([1, 1], var1.eval()[2])
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
 
         # Step 1: the momentum accumulators are 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
-        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
         self.assertAllCloseAccordingToType(
-            np.array([.01, .01]),
-            slot1.eval()[2])
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.0 * .1, -2.0 * .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([-2.0 * .01, -2.0 * .01]),
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
+        self.assertAllCloseAccordingToType(
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
             np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
-            var0.eval()[1])
+            self.evaluate(var0)[1])
         self.assertAllCloseAccordingToType(
             np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
-            var1.eval()[2])
+            self.evaluate(var1)[2])
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
-            slot0.eval()[1])
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            slot1.eval()[2])
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]),
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
             np.array([
                 -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
             ]),
-            var0.eval()[1])
+            self.evaluate(var0)[1])
         self.assertAllCloseAccordingToType(
             np.array([
                 0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                 0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
             ]),
-            var1.eval()[2])
+            self.evaluate(var1)[2])
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -522,42 +608,48 @@ class MomentumOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         slot0 = mom_opt.get_slot(var0, "momentum")
-        self.assertEquals(slot0.get_shape(), var0.get_shape())
+        self.assertEqual(slot0.get_shape(), var0.get_shape())
         slot1 = mom_opt.get_slot(var1, "momentum")
-        self.assertEquals(slot1.get_shape(), var1.get_shape())
+        self.assertEqual(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update1.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([-0.2, -0.2]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([-0.02, -0.02]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the second momentum accumulators contain the previous update.
         mom_update2.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * (-0.2) - 2.0 * 0.1), (0.9 * (-0.2) - 2.0 * 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * (-0.02) - 2.0 * 0.01),
+                      (0.9 * (-0.02) - 2.0 * 0.01)]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
                 3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testConfig(self):
diff --git a/tensorflow/python/keras/optimizer_v2/nadam.py b/tensorflow/python/keras/optimizer_v2/nadam.py
new file mode 100644
index 0000000000000000000000000000000000000000..00b095e0dc950c7e68414c1657847b891652a5ba
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam.py
@@ -0,0 +1,143 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Nadam for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import adam
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.training import training_ops
+
+
+class Nadam(adam.Adam):
+  r"""Optimizer that implements the NAdam algorithm.
+
+  Much like Adam is essentially RMSprop with momentum, Nadam is Adam with
+  Nesterov momentum.
+
+  Initialization:
+
+  $$m_0 := 0 \text{(Initialize initial 1st moment vector)}$$
+  $$v_0 := 0 \text{(Initialize initial 2nd moment vector)}$$
+  $$t := 0 \text{(Initialize timestep)}$$
+
+  Computes:
+  $$t := t + 1$$
+  $$lr_t := \text{learning\_rate} * \sqrt{1 - beta_2^t} / (1 - beta_1^t)$$
+  $$m_t := beta_1 * m_{t-1} + (1 - beta_1) * g$$
+  $$v_t := beta_2 * v_{t-1} + (1 - beta_2) * g * g$$
+  $$m_bar_t := beta_1 * v_t + (1 - beta_1) * g$$
+  $$theta_t := theta_{t-1} - lr_t * m_bar_t / (\sqrt{v_t} + \epsilon)$$
+
+  gradient is evaluated at theta(t) + momentum * v(t), and the variables always
+  store theta + beta_1 * m / sqrt(v) instead of theta.
+
+  References
+    See [Dozat, T., 2015](http://cs229.stanford.edu/proj2015/054_report.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               beta_1=0.9,
+               beta_2=0.999,
+               epsilon=1e-7,
+               name='Nadam',
+               **kwargs):
+    """Construct a new Nadam optimizer.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      beta_1: A float value or a constant float tensor. The exponential decay
+        rate for the 1st moment estimates.
+      beta_2: A float value or a constant float tensor. The exponential decay
+        rate for the exponentially weighted infinity norm.
+      epsilon: A small constant for numerical stability.
+      name: Optional name for the operations created when applying gradients.
+        Defaults to "Adamax".
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+
+    # pylint: disable=useless-super-delegation
+    super(Nadam, self).__init__(
+        learning_rate=learning_rate,
+        beta_1=beta_1,
+        beta_2=beta_2,
+        epsilon=epsilon,
+        amsgrad=False,
+        name=name,
+        **kwargs)
+    # pylint: enable=useless-super-delegation
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    m = self.get_slot(var, 'm')
+    v = self.get_slot(var, 'v')
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    return training_ops.resource_apply_adam(
+        var.handle,
+        m.handle,
+        v.handle,
+        beta_1_power,
+        beta_2_power,
+        lr_t,
+        beta_1_t,
+        beta_2_t,
+        self._get_hyper('epsilon', var_dtype),
+        grad,
+        use_locking=self._use_locking,
+        use_nesterov=True)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    beta_1_t = self._get_hyper('beta_1', var_dtype)
+    beta_2_t = self._get_hyper('beta_2', var_dtype)
+    local_step = math_ops.cast(self.iterations + 1, var_dtype)
+    beta_1_power = math_ops.pow(beta_1_t, local_step)
+    beta_2_power = math_ops.pow(beta_2_t, local_step)
+    epsilon_t = self._get_hyper('epsilon', var_dtype)
+    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))
+
+    # m_t = beta1 * m + (1 - beta1) * g_t
+    m = self.get_slot(var, 'm')
+    m_scaled_g_values = grad * (1 - beta_1_t)
+    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
+    with ops.control_dependencies([m_t]):
+      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)
+      # m_bar = (1 - beta1) * g_t + beta1 * m_t
+      m_bar = m_scaled_g_values + beta_1_t * array_ops.gather(m_t, indices)
+
+    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
+    v = self.get_slot(var, 'v')
+    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
+    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
+    with ops.control_dependencies([v_t]):
+      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)
+
+    v_t_slice = array_ops.gather(v_t, indices)
+    v_sqrt = math_ops.sqrt(v_t_slice)
+    var_update = self._resource_scatter_add(var, indices,
+                                            -lr * m_bar / (v_sqrt + epsilon_t))
+    return control_flow_ops.group(*[var_update, m_bar, v_t])
diff --git a/tensorflow/python/keras/optimizer_v2/nadam_test.py b/tensorflow/python/keras/optimizer_v2/nadam_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d991e3117cad4530ffb1f3a4315b49dc46d26bfc
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/nadam_test.py
@@ -0,0 +1,213 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Nadam."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import nadam
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+def get_beta_accumulators(opt, dtype):
+  local_step = math_ops.cast(opt.iterations + 1, dtype)
+  beta_1_t = math_ops.cast(opt._get_hyper("beta_1"), dtype)
+  beta_1_power = math_ops.pow(beta_1_t, local_step)
+  beta_2_t = math_ops.cast(opt._get_hyper("beta_2"), dtype)
+  beta_2_power = math_ops.pow(beta_2_t, local_step)
+  return (beta_1_power, beta_2_power)
+
+
+def nadam_update_numpy(param,
+                       g_t,
+                       t,
+                       m,
+                       v,
+                       alpha=0.001,
+                       beta1=0.9,
+                       beta2=0.999,
+                       epsilon=1e-8):
+  alpha_t = alpha * np.sqrt(1 - beta2**(t + 1)) / (1 - beta1**(t + 1))
+
+  m_t = beta1 * m + (1 - beta1) * g_t
+  v_t = beta2 * v + (1 - beta2) * g_t * g_t
+
+  m_bar = (1 - beta1) * g_t + beta1 * m_t
+
+  param_t = param - alpha_t * m_bar / (np.sqrt(v_t) + epsilon)
+  return param_t, m_t, v_t
+
+
+class NadamOptimizerTest(test.TestCase):
+
+  def doTestSparse(self, use_resource=False):
+    sparse_epsilon = 1e-7
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0, 2], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np[grads0_np_indices]),
+            constant_op.constant(grads0_np_indices), constant_op.constant([3]))
+        grads1_np_indices = np.array([0, 2], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np[grads1_np_indices]),
+            constant_op.constant(grads1_np_indices), constant_op.constant([3]))
+        opt = nadam.Nadam(epsilon=sparse_epsilon)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, epsilon=sparse_epsilon)
+          var1_np, m1, v1 = nadam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, epsilon=sparse_epsilon)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    self.doTestSparse(use_resource=False)
+
+  @test_util.run_deprecated_v1
+  def testResourceSparse(self):
+    self.doTestSparse(use_resource=True)
+
+  def doTestBasic(self, use_resource=False):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        if use_resource:
+          var0 = resource_variable_ops.ResourceVariable(var0_np)
+          var1 = resource_variable_ops.ResourceVariable(var1_np)
+        else:
+          var0 = variables.Variable(var0_np)
+          var1 = variables.Variable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        opt = nadam.Nadam()
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          var0_np, m0, v0 = nadam_update_numpy(var0_np, grads0_np, t, m0, v0)
+          var1_np, m1, v1 = nadam_update_numpy(var1_np, grads1_np, t, m1, v1)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+  @test_util.run_deprecated_v1
+  def testResourceBasic(self):
+    self.doTestBasic(use_resource=True)
+
+  @test_util.run_deprecated_v1
+  def testBasicWithLearningRateDecay(self):
+    for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        # Initialize variables for numpy implementation.
+        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np)
+        var1 = resource_variable_ops.ResourceVariable(var1_np)
+        grads0 = constant_op.constant(grads0_np)
+        grads1 = constant_op.constant(grads1_np)
+        learning_rate = 0.001
+        decay = 0.5
+        opt = nadam.Nadam(learning_rate=learning_rate, decay=decay)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        variables.global_variables_initializer().run()
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], var0.eval())
+        self.assertAllClose([3.0, 4.0], var1.eval())
+
+        beta1_power, beta2_power = get_beta_accumulators(opt, dtype)
+
+        # Run 3 steps of Nadam
+        for t in range(3):
+          self.assertAllCloseAccordingToType(0.9**(t + 1), beta1_power.eval())
+          self.assertAllCloseAccordingToType(0.999**(t + 1), beta2_power.eval())
+          update.run()
+
+          lr = learning_rate / (1 + decay * t)
+          var0_np, m0, v0 = nadam_update_numpy(
+              var0_np, grads0_np, t, m0, v0, alpha=lr)
+          var1_np, m1, v1 = nadam_update_numpy(
+              var1_np, grads1_np, t, m1, v1, alpha=lr)
+
+          # Validate updated params
+          self.assertAllCloseAccordingToType(var0_np, var0.eval())
+          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
index c6e1d57c5e4be96d2bc654314f0e6f49baa8d10f..15f3009a4af4270f2f845f6c5bf945f330efe545 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2.py
@@ -24,18 +24,20 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend
 from tensorflow.python.keras import initializers
-from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import gradients
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops import variables
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as optimizer_v1
 from tensorflow.python.util import nest
 
@@ -114,7 +116,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
   """
 
-  def __init__(self, name):
+  def __init__(self, name, **kwargs):
     """Create a new Optimizer.
 
     This must be called by the constructors of subclasses.
@@ -128,6 +130,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Args:
       name: A non-empty string.  The name to use for accumulators created
         for the optimizer.
+      **kwargs: keyword arguments. Allowed to be {`decay`}
 
     Raises:
       ValueError: If name is malformed.
@@ -140,6 +143,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
     # dict: {variable name : {slot name : variable}}
     self._slots = {}
     self._weights = []
+
+    decay = kwargs.pop("decay", 0.0)
+    if decay < 0.:
+      raise ValueError("decay cannot be less than 0: {}".format(decay))
+    self._initial_decay = decay
+
     self._prepared = False
 
   def minimize(self,
@@ -244,12 +253,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
       with backprop.GradientTape() as tape:
         tape.watch(var_list)
         loss_value = loss()
+        loss_value = self._scale_loss(loss_value)
       grads = tape.gradient(loss_value, var_list, grad_loss)
     else:
       if context.executing_eagerly():
         raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
                            "should be a function when eager execution is "
                            "enabled.")
+      loss = self._scale_loss(loss)
       self._assert_valid_dtypes([loss])
       if grad_loss is not None:
         self._assert_valid_dtypes([grad_loss])
@@ -269,6 +280,15 @@ class OptimizerV2(optimizer_v1.Optimizer):
 
     return grads_and_vars
 
+  @staticmethod
+  def _scale_loss(loss_value):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
+      num_replicas = \
+        distribute_ctx.get_distribution_strategy().num_replicas_in_sync
+      if num_replicas > 1:
+        loss_value *= (1. / num_replicas)
+    return loss_value
+
   def apply_gradients(self, grads_and_vars, name=None):
     """Apply gradients to variables.
 
@@ -291,11 +311,12 @@ class OptimizerV2(optimizer_v1.Optimizer):
     """
     grads_and_vars = _filter_grads(grads_and_vars)
     var_list = [v for (_, v) in grads_and_vars]
-    if distribution_strategy_context.has_distribution_strategy():
+    if distribute_ctx.has_distribution_strategy():
       reduced_grads = merge_grads(grads_and_vars)
       grads_and_vars = zip(reduced_grads, var_list)
 
     with ops.init_scope():
+      self._prepare()
       self._create_slots(var_list)
     update_ops = []
 
@@ -317,19 +338,21 @@ class OptimizerV2(optimizer_v1.Optimizer):
         return update_op
 
     with ops.name_scope(name, self._name) as name:
-      self._prepare()
       for grad, var in grads_and_vars:
         scope_name = ("" if ops.executing_eagerly_outside_functions() else
                       "_" + var.op.name)
-        with ops.name_scope("update" + scope_name), ops.colocate_with(var):
+        with ops.name_scope("update" + scope_name):
           update_ops.append(update_grad_to_var(grad, var))
       # control dependencies does not work in per replica mode, please change
       # this once b/118841692 is fixed.
       # with ops.control_dependencies(update_ops):
       #   apply_updates = self._iterations.assign_add(1).op
-      apply_updates = merge_update_step(update_ops, self.iteration)
+      apply_updates = merge_update_step(update_ops, self.iterations)
       return apply_updates
 
+  def get_updates(self, loss, params):
+    return [self.minimize(loss, params)]
+
   def _set_hyper(self, name, value):
     """set hyper `name` to value. value can be callable, tensor, numeric."""
     if name not in self._hyper:
@@ -342,9 +365,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
       else:
         backend.set_value(self._hyper[name], value)
 
-  def _get_hyper(self, name):
+  def _get_hyper(self, name, dtype=None):
     value = self._hyper[name]
-    return self._call_if_callable(value)
+    if callable(value):
+      value = value()
+    if dtype:
+      return math_ops.cast(value, dtype)
+    else:
+      return value
 
   def __getattribute__(self, name):
     """Overridden to support hyperparameter access."""
@@ -371,12 +399,16 @@ class OptimizerV2(optimizer_v1.Optimizer):
     else:
       super(OptimizerV2, self).__setattr__(name, value)
 
-  def add_slot(self, var, slot_name):
+  def add_slot(self, var, slot_name, initializer="zeros"):
     var_key = _var_key(var)
     slot_dict = self._slots.setdefault(var_key, {})
     if slot_name not in slot_dict:
       slot_key = _get_slot_key_from_var(var, slot_name)
-      weight = self.add_weight(name=slot_key, shape=var.shape, dtype=var.dtype)
+      weight = self.add_weight(
+          name=slot_key,
+          shape=var.shape,
+          dtype=var.dtype,
+          initializer=initializer)
       slot_dict[slot_name] = weight
       self._weights.append(weight)
 
@@ -392,8 +424,10 @@ class OptimizerV2(optimizer_v1.Optimizer):
       self._iterations = self.add_weight(
           "iter",
           shape=[],
+          dtype=dtypes.int64,
           trainable=False,
-          aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+          aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
+      self._weights.append(self._iterations)
     for name, value in self._hyper.items():
       if isinstance(value, ops.Tensor) or callable(value):
         pass
@@ -403,15 +437,24 @@ class OptimizerV2(optimizer_v1.Optimizer):
             shape=[],
             trainable=False,
             initializer=value,
-            aggregation=variables.VariableAggregation.ONLY_FIRST_REPLICA)
+            aggregation=tf_variables.VariableAggregation.ONLY_FIRST_REPLICA)
     self._prepared = True
 
   @property
-  def iteration(self):
+  def iterations(self):
     if not self._prepared:
       self._prepare()
     return self._iterations
 
+  def _decayed_lr(self, var_dtype):
+    """Get decayed learning rate as a Tensor with dtype=var_dtype."""
+    lr_t = self._get_hyper("learning_rate", var_dtype)
+    if self._initial_decay > 0.:
+      local_step = math_ops.cast(self.iterations, var_dtype)
+      decay_t = self._get_hyper("decay", var_dtype)
+      lr_t = lr_t / (1. + decay_t * local_step)
+    return lr_t
+
   @abc.abstractmethod
   def get_config(self):
     """Returns the config of the optimimizer.
@@ -443,6 +486,8 @@ class OptimizerV2(optimizer_v1.Optimizer):
     Returns:
         An optimizer instance.
     """
+    if "lr" in config:
+      config["learning_rate"] = config.pop("lr")
     return cls(**config)
 
   def _serialize_hyperparameter(self, hyperparameter_name):
@@ -450,10 +495,14 @@ class OptimizerV2(optimizer_v1.Optimizer):
     value = self._get_hyper(hyperparameter_name)
     if callable(value):
       return value()
-    if isinstance(value, (ops.Tensor, variables.Variable)):
+    if isinstance(value, (ops.Tensor, tf_variables.Variable)):
       return backend.get_value(value)
     return value
 
+  def variables(self):
+    """Returns variables of this Optimizer based on the order created."""
+    return self._weights
+
   @property
   def weights(self):
     """Returns variables of this Optimizer based on the order created."""
@@ -490,15 +539,15 @@ class OptimizerV2(optimizer_v1.Optimizer):
                  dtype=None,
                  initializer="zeros",
                  trainable=None,
-                 synchronization=variables.VariableSynchronization.AUTO,
-                 aggregation=variables.VariableAggregation.NONE):
+                 synchronization=tf_variables.VariableSynchronization.AUTO,
+                 aggregation=tf_variables.VariableAggregation.NONE):
 
     if dtype is None:
       dtype = dtypes.float32
     if isinstance(initializer, six.string_types) or callable(initializer):
       initializer = initializers.get(initializer)
 
-    if synchronization == variables.VariableSynchronization.ON_READ:
+    if synchronization == tf_variables.VariableSynchronization.ON_READ:
       if trainable:
         raise ValueError(
             "Synchronization value can be set to "
@@ -514,7 +563,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
     variable = self._add_variable_with_custom_getter(
         name=name,
         shape=shape,
-        getter=base_layer.make_variable,
+        getter=base_layer_utils.make_variable,
         overwrite=True,
         initializer=initializer,
         dtype=dtype,
@@ -522,6 +571,7 @@ class OptimizerV2(optimizer_v1.Optimizer):
         use_resource=True,
         synchronization=synchronization,
         aggregation=aggregation)
+    backend.track_variable(variable)
 
     return variable
 
@@ -560,20 +610,20 @@ def merge_update_step(update_ops, local_step):
       incre_op = local_step.assign_add(1).op
     return incre_op
 
-  return distribution_strategy_context.get_replica_context().merge_call(
-      merge_update_step_fn, update_ops, local_step)
+  return distribute_ctx.get_replica_context().merge_call(
+      merge_update_step_fn, args=(update_ops, local_step))
 
 
 def merge_grads(grads_and_vars):
   """Merge gradients from different replicas."""
 
   def merge_grad_fn(strategy, grads_and_vars):
-    reduced_grads = strategy.batch_reduce(
-        variable_scope.VariableAggregation.MEAN, grads_and_vars)
+    reduced_grads = strategy.batch_reduce(ds_reduce_util.ReduceOp.SUM,
+                                          grads_and_vars)
     return reduced_grads
 
-  return distribution_strategy_context.get_replica_context().merge_call(
-      merge_grad_fn, grads_and_vars)
+  return distribute_ctx.get_replica_context().merge_call(
+      merge_grad_fn, args=(grads_and_vars,))
 
 
 def _var_key(var):
@@ -591,7 +641,7 @@ def _var_key(var):
   """
 
   # pylint: disable=protected-access
-  if distribution_strategy_context.has_distribution_strategy() and hasattr(
+  if distribute_ctx.has_distribution_strategy() and hasattr(
       var, "_primary_var"):
     var = var._primary_var
   if hasattr(var, "op"):
diff --git a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
index 682deda23f02ee51605770d7c8832d84d3c5d7d5..158577fe64afefaff28ee644caf084cb40d429ea 100644
--- a/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
+++ b/tensorflow/python/keras/optimizer_v2/optimizer_v2_test.py
@@ -18,6 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+import tempfile
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python import keras
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import function
@@ -25,15 +32,27 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras import backend
+from tensorflow.python.keras import callbacks
+from tensorflow.python.keras import optimizers
+from tensorflow.python.keras import testing_utils
+from tensorflow.python.keras.engine import input_layer
+from tensorflow.python.keras.engine import saving
+from tensorflow.python.keras.engine import sequential
+from tensorflow.python.keras.engine import training
+from tensorflow.python.keras.layers import core
 from tensorflow.python.keras.optimizer_v2 import adam
 from tensorflow.python.keras.optimizer_v2 import gradient_descent
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
+from tensorflow.python.training import momentum
 
 
 class OptimizerTest(test.TestCase):
@@ -279,8 +298,8 @@ class OptimizerTest(test.TestCase):
   def testIterationWithoutMinimize(self):
     with self.cached_session():
       sgd = gradient_descent.SGD(3.0)
-      self.evaluate(sgd.iteration.initializer)
-      self.assertEqual(0, self.evaluate(sgd.iteration))
+      self.evaluate(sgd.iterations.initializer)
+      self.assertEqual(0, self.evaluate(sgd.iterations))
 
   @test_util.run_in_graph_and_eager_modes
   def testSerializationWithinDefun(self):
@@ -341,8 +360,8 @@ class OptimizerTest(test.TestCase):
       opt2.set_weights(weights)
       self.evaluate([opt_op_1, opt_op_2])
       self.assertAllClose(self.evaluate(var1), self.evaluate(var2))
-      self.assertEqual(1, self.evaluate(opt1.iteration))
-      self.assertEqual(1, self.evaluate(opt2.iteration))
+      self.assertEqual(1, self.evaluate(opt1.iterations))
+      self.assertEqual(1, self.evaluate(opt2.iterations))
 
       var3 = resource_variable_ops.ResourceVariable([1.0, 2.0, 3.0],
                                                     dtype=dtypes.float32)
@@ -394,7 +413,231 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(AttributeError):
       opt.not_an_attr += 3
 
-  def testOptimizerWithFunction(self):
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithKerasModel(self):
+    a = input_layer.Input(shape=(3,), name='input_a')
+    b = input_layer.Input(shape=(3,), name='input_b')
+
+    dense = core.Dense(4, name='dense')
+    c = dense(a)
+    d = dense(b)
+    e = core.Dropout(0.5, name='dropout')(c)
+
+    model = training.Model([a, b], [d, e])
+
+    optimizer = gradient_descent.SGD(learning_rate=0.001)
+    loss = 'mse'
+    model.compile(optimizer, loss, metrics=['mae'])
+
+    input_a_np = np.random.random((10, 3))
+    input_b_np = np.random.random((10, 3))
+
+    output_d_np = np.random.random((10, 4))
+    output_e_np = np.random.random((10, 4))
+
+    model.fit([input_a_np, input_b_np], [output_d_np, output_e_np],
+              epochs=1,
+              batch_size=5)
+
+  @test_util.run_in_graph_and_eager_modes
+  def testOptimizerWithCallbacks(self):
+    input_np = np.random.random((10, 3))
+    output_np = np.random.random((10, 4))
+    a = input_layer.Input(shape=(3,), name='input_a')
+    model = sequential.Sequential()
+    model.add(core.Dense(4, name='dense'))
+    model.add(core.Dropout(0.5, name='dropout'))
+    model(a)
+    optimizer = gradient_descent.SGD(learning_rate=0.1)
+    model.compile(optimizer, loss='mse', metrics=['mae'])
+    # This does not reduce the LR after the first epoch (due to low delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss', factor=0.1, min_delta=0, patience=1, cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=0)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.1, atol=1e-4)
+
+    # This should reduce the LR after the first epoch (due to high delta).
+    cbks = [
+        callbacks.ReduceLROnPlateau(
+            monitor='val_loss',
+            factor=0.1,
+            min_delta=10,
+            patience=1,
+            cooldown=5)
+    ]
+    model.fit(
+        input_np,
+        output_np,
+        batch_size=10,
+        validation_data=(input_np, output_np),
+        callbacks=cbks,
+        epochs=5,
+        verbose=2)
+    self.assertAllClose(
+        float(backend.get_value(model.optimizer.lr)), 0.01, atol=1e-4)
+
+
+class OptimizersCompatibilityTest(test.TestCase, parameterized.TestCase):
+
+  # TODO(tanzheny): remove test_numeric after algorithm for Momentum, Adam and
+  # NAdam has been unified: currently these three algorithms behave differently.
+  @parameterized.named_parameters(
+      ('adadelta', 'adadelta', True, True), ('adagrad', 'adagrad', True, True),
+      ('adam', 'adam', True, True), ('adamax', 'adamax', True, True),
+      ('nadam', 'nadam', True, False), ('momentum', 'momentum', True, True),
+      ('sgd', 'sgd', False, True))
+  def testOptimizersCompatibility(self, opt_str, test_weights, test_numeric):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+
+      old_mode = os.environ.get('TF2_BEHAVIOR', None)
+      # Disable tf2 to create V1 optimizer.
+      disable_tf2()
+      if opt_str == 'momentum':
+        opt_v1 = optimizers.SGD(momentum=0.9)
+      else:
+        opt_v1 = optimizers.get(opt_str)
+
+      # Test compile and fit with v1 optimizer.
+      model.compile(opt_v1, loss='categorical_crossentropy', metrics=[])
+      model.fit(x, y, batch_size=5, epochs=1)
+      model_dir = tempfile.mkdtemp()
+      gfile.MakeDirs(model_dir)
+      file_name = os.path.join(model_dir, 'model.h5')
+      model.save(file_name)
+
+      enable_tf2()
+      # Test load and fit with v2 optimizer.
+      model_2 = saving.load_model(file_name)
+      opt_v2 = model_2.optimizer
+      self.assertIsInstance(opt_v2, optimizer_v2.OptimizerV2)
+      # set_weights is called inside load_model but exception is swallowed,
+      # this call checks the weights can be set correctly.
+      if test_weights:
+        opt_v2.set_weights(opt_v1.get_weights())
+      if test_numeric:
+        hist_1 = model.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        hist_2 = model_2.fit(x, y, batch_size=5, epochs=1, shuffle=False)
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(model.get_weights(), model_2.get_weights())
+        self.assertAllClose(hist_1.history['loss'], hist_2.history['loss'])
+
+      if old_mode is not None:
+        os.environ['TF2_BEHAVIOR'] = old_mode
+
+  def testNumericEquivalenceForNesterovMomentum(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+      model_tf = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_tf.set_weights(model_k_v2.get_weights())
+
+      opt_k_v1 = optimizers.SGD(lr=0.001, momentum=0.9, nesterov=True)
+      opt_k_v2 = gradient_descent.SGD(momentum=0.9, nesterov=True)
+      opt_tf = momentum.MomentumOptimizer(
+          learning_rate=0.001, momentum=0.9, use_nesterov=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+      model_tf.compile(opt_tf, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_tf = model_tf.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_tf.get_weights())
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_tf.history['loss'])
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+  def testNumericEquivalenceForAmsgrad(self):
+    np.random.seed(1331)
+    with self.cached_session():
+      train_samples = 20
+      input_dim = 3
+      num_classes = 2
+      (x, y), _ = testing_utils.get_test_data(
+          train_samples=train_samples,
+          test_samples=10,
+          input_shape=(input_dim,),
+          num_classes=num_classes)
+      y = keras.utils.to_categorical(y)
+
+      num_hidden = 5
+      model_k_v1 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2 = testing_utils.get_small_sequential_mlp(
+          num_hidden=num_hidden, num_classes=num_classes, input_dim=input_dim)
+      model_k_v2.set_weights(model_k_v1.get_weights())
+
+      opt_k_v1 = optimizers.Adam(amsgrad=True)
+      opt_k_v2 = adam.Adam(amsgrad=True)
+
+      model_k_v1.compile(opt_k_v1, loss='categorical_crossentropy', metrics=[])
+      model_k_v2.compile(opt_k_v2, loss='categorical_crossentropy', metrics=[])
+
+      hist_k_v1 = model_k_v1.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+      hist_k_v2 = model_k_v2.fit(x, y, batch_size=5, epochs=10, shuffle=False)
+
+      self.assertAllClose(model_k_v1.get_weights(), model_k_v2.get_weights())
+      self.assertAllClose(opt_k_v1.get_weights(), opt_k_v2.get_weights())
+      self.assertAllClose(hist_k_v1.history['loss'], hist_k_v2.history['loss'])
+
+
+def disable_tf2():
+  if 'TF2_BEHAVIOR' in os.environ:
+    del os.environ['TF2_BEHAVIOR']
+
+
+def enable_tf2():
+  os.environ['TF2_BEHAVIOR'] = 'enabled'
+
+
+# Note: These tests are kept in a separate class to avoid bugs in some
+# distributions of Python that break AutoGraph which is used by tf.function.
+class OptimizerWithFunctionTest(test.TestCase):
+
+  def testBasic(self):
     with context.eager_mode():
       var = resource_variable_ops.ResourceVariable([1.0, 2.0],
                                                    dtype=dtypes.float32)
@@ -406,10 +649,8 @@ class OptimizerTest(test.TestCase):
         opt.minimize(loss, [var])
         return var
 
-      self.assertAllClose([0., 1.], fn())
-      # This is just to test tf.function. The values needs to be updated
-      # when adam updates beta_1_power.
-      self.assertAllClose([-1.343838, -0.343838], fn())
+      self.assertAllClose([0., 1.], fn(), atol=1e-4)
+      self.assertAllClose([-1, 0.], fn(), atol=1e-4)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop.py b/tensorflow/python/keras/optimizer_v2/rmsprop.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a5b334fc46f6ae76f48cce29bc119cdc8f0eaf2
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop.py
@@ -0,0 +1,196 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""RMSprop for TensorFlow."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.training import training_ops
+
+
+class RMSprop(optimizer_v2.OptimizerV2):
+  r"""Optimizer that implements the RMSprop algorithm.
+
+  A detailed description of rmsprop.
+
+    - maintain a moving (discounted) average of the square of gradients
+    - divide gradient by the root of this average
+
+  $$mean_square_t = rho * mean_square{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient / \sqrt{ /
+      mean_square_t + \epsilon}$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  This implementation of RMSprop uses plain momentum, not Nesterov momentum.
+
+  The centered version additionally maintains a moving average of the
+  gradients, and uses that average to estimate the variance:
+
+  $$mean_grad_t = rho * mean_grad_{t-1} + (1-rho) * gradient$$
+  $$mean_square_t = rho * mean_square_{t-1} + (1-rho) * gradient ** 2$$
+  $$mom_t = momentum * mom_{t-1} + learning_rate * gradient /
+      sqrt(mean_square_t - mean_grad_t**2 + epsilon)$$
+  $$variable_t := variable_{t-1} - mom_t
+
+  References
+    See ([pdf]
+      http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf).
+  """
+
+  def __init__(self,
+               learning_rate=0.001,
+               rho=0.9,
+               momentum=0.0,
+               epsilon=1e-7,
+               centered=False,
+               name="RMSprop",
+               **kwargs):
+    """Construct a new RMSprop optimizer.
+
+    Note that in the dense implementation of this algorithm, variables and their
+    corresponding accumulators (momentum, gradient moving average, square
+    gradient moving average) will be updated even if the gradient is zero
+    (i.e. accumulators will decay, momentum will be applied). The sparse
+    implementation (used when the gradient is an `IndexedSlices` object,
+    typically because of `tf.gather` or an embedding lookup in the forward pass)
+    will not update variable slices or their accumulators unless those slices
+    were used in the forward pass (nor is there an "eventual" correction to
+    account for these omitted updates). This leads to more efficient updates for
+    large embedding lookup tables (where most of the slices are not accessed in
+    a particular graph execution), but differs from the published algorithm.
+
+    Args:
+      learning_rate: A Tensor or a floating point value.  The learning rate.
+      rho: Discounting factor for the history/coming gradient
+      momentum: A scalar tensor.
+      epsilon: Small value to avoid zero denominator.
+      centered: If True, gradients are normalized by the estimated variance of
+        the gradient; if False, by the uncentered second moment. Setting this to
+        True may help with training, but is slightly more expensive in terms of
+        computation and memory. Defaults to False.
+      name: Optional name prefix for the operations created when applying
+        gradients. Defaults to "RMSprop".  @compatibility(eager) When eager
+        execution is enabled, `learning_rate`, `decay`, `momentum`, and
+        `epsilon` can each be a callable that takes no arguments and returns the
+        actual value to use. This can be useful for changing these values across
+        different invocations of optimizer functions. @end_compatibility
+      **kwargs: keyword arguments. Allowed to be {`decay`}
+    """
+    super(RMSprop, self).__init__(name, **kwargs)
+    self._set_hyper("learning_rate", learning_rate)
+    self._set_hyper("decay", self._initial_decay)
+    self._set_hyper("rho", rho)
+
+    self._momentum = False
+    if isinstance(momentum, ops.Tensor) or callable(momentum) or momentum > 0:
+      self._momentum = True
+    if isinstance(momentum, (int, float)) and (momentum < 0 or momentum > 1):
+      raise ValueError("`momentum` must be between [0, 1].")
+    self._set_hyper("momentum", momentum)
+
+    self._set_hyper("epsilon", epsilon)
+    self._centered = centered
+
+  def _create_slots(self, var_list):
+    for var in var_list:
+      self.add_slot(var, "rms")
+      self.add_slot(var, "momentum")
+      if self._centered:
+        self.add_slot(var, "mg")
+
+  def _resource_apply_dense(self, grad, var):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    rho = self._get_hyper("rho", var_dtype)
+    momentum = self._get_hyper("momentum", var_dtype)
+    epsilon = self._get_hyper("epsilon", var_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          use_locking=self._use_locking)
+
+  def _resource_apply_sparse(self, grad, var, indices):
+    var_dtype = var.dtype.base_dtype
+    lr_t = self._decayed_lr(var_dtype)
+    rms = self.get_slot(var, "rms")
+    mom = self.get_slot(var, "momentum")
+    rho = self._get_hyper("rho", var_dtype)
+    momentum = self._get_hyper("momentum", var_dtype)
+    epsilon = self._get_hyper("epsilon", var_dtype)
+    if self._centered:
+      mg = self.get_slot(var, "mg")
+      return training_ops.resource_sparse_apply_centered_rms_prop(
+          var.handle,
+          mg.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+    else:
+      return training_ops.resource_sparse_apply_rms_prop(
+          var.handle,
+          rms.handle,
+          mom.handle,
+          lr_t,
+          rho,
+          momentum,
+          epsilon,
+          grad,
+          indices,
+          use_locking=self._use_locking)
+
+  def get_config(self):
+    config = super(RMSprop, self).get_config()
+    config.update({
+        "learning_rate": self._serialize_hyperparameter("learning_rate"),
+        "decay": self._serialize_hyperparameter("decay"),
+        "rho": self._serialize_hyperparameter("rho"),
+        "momentum": self._serialize_hyperparameter("momentum"),
+        "epsilon": self._serialize_hyperparameter("epsilon"),
+        "centered": self._centered,
+    })
+    return config
+
+
+RMSProp = RMSprop
diff --git a/tensorflow/python/keras/optimizer_v2/rmsprop_test.py b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a8658a8550760a04c6031e26721038b88fad0ebd
--- /dev/null
+++ b/tensorflow/python/keras/optimizer_v2/rmsprop_test.py
@@ -0,0 +1,410 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for rmsprop."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import copy
+import itertools
+import math
+
+import numpy as np
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.optimizer_v2 import rmsprop
+from tensorflow.python.ops import embedding_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+_DATA_TYPES = [dtypes.half, dtypes.float32]
+
+_TEST_PARAM_VALUES = [
+    # learning_rate, rho, momentum, epsilon, centered
+    [0.05, 0.9, 0.0, 1e-3, True],
+    [0.05, 0.9, 0.0, 1e-3, False],
+    [0.1, 0.9, 0.0, 1e-3, True],
+    [0.01, 0.9, 0.0, 1e-5, True],
+    [0.01, 0.9, 0.9, 1e-5, True],
+]
+
+_TESTPARAMS = [
+    [data_type] + values
+    for data_type, values in itertools.product(_DATA_TYPES, _TEST_PARAM_VALUES)
+]
+
+
+class RMSpropOptimizerTest(test.TestCase):
+
+  def _rmsprop_update_numpy(self, var, g, mg, rms, mom, lr, rho, momentum,
+                            epsilon, centered):
+    rms_t = rms * rho + (1 - rho) * g * g
+    denom_t = rms_t + epsilon
+    if centered:
+      mg_t = mg * rho + (1 - rho) * g
+      denom_t -= mg_t * mg_t
+    else:
+      mg_t = mg
+    mom_t = momentum * mom + lr * g / np.sqrt(denom_t, dtype=denom_t.dtype)
+    var_t = var - mom_t
+    return var_t, mg_t, rms_t, mom_t
+
+  def _sparse_rmsprop_update_numpy(self, var, gindexs, gvalues, mg, rms, mom,
+                                   lr, rho, momentum, epsilon, centered):
+    mg_t = copy.deepcopy(mg)
+    rms_t = copy.deepcopy(rms)
+    mom_t = copy.deepcopy(mom)
+    var_t = copy.deepcopy(var)
+    for i in range(len(gindexs)):
+      gindex = gindexs[i]
+      gvalue = gvalues[i]
+      rms_t[gindex] = rms[gindex] * rho + (1 - rho) * gvalue * gvalue
+      denom_t = rms_t[gindex] + epsilon
+      if centered:
+        mg_t[gindex] = mg_t[gindex] * rho + (1 - rho) * gvalue
+        denom_t -= mg_t[gindex] * mg_t[gindex]
+      mom_t[gindex] = momentum * mom[gindex] + lr * gvalue / np.sqrt(denom_t)
+      var_t[gindex] = var[gindex] - mom_t[gindex]
+    return var_t, mg_t, rms_t, mom_t
+
+  @test_util.run_deprecated_v1
+  def testDense(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01, 0.2], dtype=dtype.as_numpy_dtype)
+
+        var0 = resource_variable_ops.ResourceVariable(var0_np, dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable(var1_np, dtype=dtype)
+        grads0 = constant_op.constant(grads0_np, dtype=dtype)
+        grads1 = constant_op.constant(grads1_np, dtype=dtype)
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          mg1 = opt.get_slot(var1, "mg")
+        else:
+          mg0 = None
+          mg1 = None
+
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+              var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate, rho,
+              momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+              var1_np, grads1_np, mg1_np, rms1_np, mom1_np, learning_rate, rho,
+              momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testDenseWithLearningRateDecay(self):
+    var0_np = np.array([1.0, 2.0])
+    grads0_np = np.array([0.1, 0.2])
+    var1_np = np.array([3.0, 4.0])
+    grads1_np = np.array([0.01, 0.2])
+
+    var0 = resource_variable_ops.ResourceVariable(var0_np)
+    var1 = resource_variable_ops.ResourceVariable(var1_np)
+    grads0 = constant_op.constant(grads0_np)
+    grads1 = constant_op.constant(grads1_np)
+    learning_rate = 0.01
+    rho = 0.9
+    momentum = 0.0
+    epsilon = 1e-7
+    centered = False
+    decay = 0.5
+    opt = rmsprop.RMSprop(
+        learning_rate=learning_rate,
+        rho=rho,
+        momentum=momentum,
+        epsilon=epsilon,
+        centered=centered,
+        decay=decay)
+
+    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+    self.evaluate(variables.global_variables_initializer())
+
+    rms0 = opt.get_slot(var0, "rms")
+    self.assertTrue(rms0 is not None)
+    rms1 = opt.get_slot(var1, "rms")
+    self.assertTrue(rms1 is not None)
+    mom0 = opt.get_slot(var0, "momentum")
+    self.assertTrue(mom0 is not None)
+    mom1 = opt.get_slot(var1, "momentum")
+    self.assertTrue(mom1 is not None)
+
+    mg0_np = np.array([0.0, 0.0])
+    mg1_np = np.array([0.0, 0.0])
+    rms0_np = np.array([0.0, 0.0])
+    rms1_np = np.array([0.0, 0.0])
+    mom0_np = np.array([0.0, 0.0])
+    mom1_np = np.array([0.0, 0.0])
+
+    # Fetch params to validate initial values
+    self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+    self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+    # Run 4 steps of RMSprop
+    for t in range(2):
+      self.evaluate(update)
+
+      lr = learning_rate / (1 + decay * t)
+      var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
+          var0_np, grads0_np, mg0_np, rms0_np, mom0_np, lr, rho, momentum,
+          epsilon, centered)
+      var1_np, mg1_np, rms1_np, mom1_np = self._rmsprop_update_numpy(
+          var1_np, grads1_np, mg1_np, rms1_np, mom1_np, lr, rho, momentum,
+          epsilon, centered)
+
+      # Validate updated params
+      self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+      self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+      self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+      self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+      self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+      self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariable(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=0.0,
+            centered=False).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testMinimizeSparseResourceVariableCentered(self):
+    for dtype in [dtypes.float32, dtypes.float64]:
+      with self.cached_session():
+        var0 = resource_variable_ops.ResourceVariable([[1.0, 2.0]], dtype=dtype)
+        x = constant_op.constant([[4.0], [5.0]], dtype=dtype)
+        pred = math_ops.matmul(embedding_ops.embedding_lookup([var0], [0]), x)
+        loss = pred * pred
+        sgd_op = rmsprop.RMSprop(
+            learning_rate=1.0,
+            rho=0.0,
+            momentum=0.0,
+            epsilon=1.0,
+            centered=True).minimize(
+                loss, var_list=[var0])
+        self.evaluate(variables.global_variables_initializer())
+        # Fetch params to validate initial values
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        # Run 1 step of sgd
+        self.evaluate(sgd_op)
+        # Validate updated params
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
+
+  @test_util.run_deprecated_v1
+  def testSparse(self):
+    for (dtype, learning_rate, rho, momentum, epsilon, centered) in _TESTPARAMS:
+      with test_util.use_gpu():
+        # Initialize variables for numpy implementation.
+        var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
+        grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
+        var1_np = np.array([3.0, 4.0], dtype=dtype.as_numpy_dtype)
+        grads1_np = np.array([0.01], dtype=dtype.as_numpy_dtype)
+
+        var0 = variables.Variable(var0_np)
+        var1 = variables.Variable(var1_np)
+        grads0_np_indices = np.array([0], dtype=np.int32)
+        grads0 = ops.IndexedSlices(
+            constant_op.constant(grads0_np),
+            constant_op.constant(grads0_np_indices), constant_op.constant([1]))
+        grads1_np_indices = np.array([1], dtype=np.int32)
+        grads1 = ops.IndexedSlices(
+            constant_op.constant(grads1_np),
+            constant_op.constant(grads1_np_indices), constant_op.constant([1]))
+        opt = rmsprop.RMSprop(
+            learning_rate=learning_rate,
+            rho=rho,
+            momentum=momentum,
+            epsilon=epsilon,
+            centered=centered)
+        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        self.evaluate(variables.global_variables_initializer())
+
+        if centered:
+          mg0 = opt.get_slot(var0, "mg")
+          self.assertEqual(mg0 is not None, centered)
+          mg1 = opt.get_slot(var1, "mg")
+          self.assertEqual(mg1 is not None, centered)
+        else:
+          mg0 = None
+          mg1 = None
+        rms0 = opt.get_slot(var0, "rms")
+        self.assertTrue(rms0 is not None)
+        rms1 = opt.get_slot(var1, "rms")
+        self.assertTrue(rms1 is not None)
+        mom0 = opt.get_slot(var0, "momentum")
+        self.assertTrue(mom0 is not None)
+        mom1 = opt.get_slot(var1, "momentum")
+        self.assertTrue(mom1 is not None)
+
+        mg0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mg1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        rms1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom0_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+        mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+
+        # Run 4 steps of RMSprop
+        for _ in range(1, 5):
+          self.evaluate(update)
+
+          var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
+              var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
+              learning_rate, rho, momentum, epsilon, centered)
+          var1_np, mg1_np, rms1_np, mom1_np = self._sparse_rmsprop_update_numpy(
+              var1_np, grads1_np_indices, grads1_np, mg1_np, rms1_np, mom1_np,
+              learning_rate, rho, momentum, epsilon, centered)
+
+          # Validate updated params
+          if centered:
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  def testCallableParams(self):
+    with context.eager_mode():
+      for dtype in [dtypes.half, dtypes.float32]:
+        var0 = resource_variable_ops.ResourceVariable([1.0, 2.0], dtype=dtype)
+        var1 = resource_variable_ops.ResourceVariable([3.0, 4.0], dtype=dtype)
+        grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
+        grads1 = constant_op.constant([0.01, 0.01], dtype=dtype)
+
+        learning_rate = lambda: 2.0
+        rho = lambda: 0.9
+        momentum = lambda: 0.0
+        epsilon = lambda: 1.0
+        opt = rmsprop.RMSprop(learning_rate, rho, momentum, epsilon)
+
+        # Fetch params to validate initial values
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
+        # Step 1: the rms accumulators where 1. So we should see a normal
+        # update: v -= grad * learning_rate
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0))
+            ]), self.evaluate(var1))
+        # Step 2: the root mean square accumulators contain the previous update.
+        opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
+        # Check the parameters.
+        self.assertAllCloseAccordingToType(
+            np.array([
+                1.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0)),
+                2.0 - (0.1 * 2.0 / math.sqrt(0.001 + 1.0)) -
+                (0.1 * 2.0 / math.sqrt(0.001 * 0.9 + 0.001 + 1.0))
+            ]), self.evaluate(var0))
+        self.assertAllCloseAccordingToType(
+            np.array([
+                3.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0)),
+                4.0 - (0.01 * 2.0 / math.sqrt(0.00001 + 1.0)) -
+                (0.01 * 2.0 / math.sqrt(0.00001 * 0.9 + 1e-5 + 1.0))
+            ]), self.evaluate(var1))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/keras/optimizers.py b/tensorflow/python/keras/optimizers.py
index 715d80a116c0869291d0ce2d7514a31f07114fe3..ee6dbba5ad62ee4b35101d1496a77ae91412fd64 100644
--- a/tensorflow/python/keras/optimizers.py
+++ b/tensorflow/python/keras/optimizers.py
@@ -22,13 +22,23 @@ from __future__ import print_function
 import six
 from six.moves import zip  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.framework import ops
 from tensorflow.python.keras import backend as K
+from tensorflow.python.keras.optimizer_v2 import adadelta as adadelta_v2
+from tensorflow.python.keras.optimizer_v2 import adagrad as adagrad_v2
+from tensorflow.python.keras.optimizer_v2 import adam as adam_v2
+from tensorflow.python.keras.optimizer_v2 import adamax as adamax_v2
+from tensorflow.python.keras.optimizer_v2 import gradient_descent as gradient_descent_v2
+from tensorflow.python.keras.optimizer_v2 import nadam as nadam_v2
+from tensorflow.python.keras.optimizer_v2 import optimizer_v2
+from tensorflow.python.keras.optimizer_v2 import rmsprop as rmsprop_v2
 from tensorflow.python.keras.utils.generic_utils import deserialize_keras_object
 from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import optimizer as tf_optimizer_module
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
@@ -473,7 +483,7 @@ class Adam(Optimizer):
 
   def get_updates(self, loss, params):
     grads = self.get_gradients(loss, params)
-    self.updates = [state_ops.assign_add(self.iterations, 1)]
+    self.updates = []
 
     lr = self.lr
     if self.initial_decay > 0:
@@ -481,7 +491,8 @@ class Adam(Optimizer):
           1. / (1. + self.decay * math_ops.cast(self.iterations,
                                                 K.dtype(self.decay))))
 
-    t = math_ops.cast(self.iterations, K.floatx()) + 1
+    with ops.control_dependencies([state_ops.assign_add(self.iterations, 1)]):
+      t = math_ops.cast(self.iterations, K.floatx())
     lr_t = lr * (
         K.sqrt(1. - math_ops.pow(self.beta_2, t)) /
         (1. - math_ops.pow(self.beta_1, t)))
@@ -795,16 +806,27 @@ def deserialize(config, custom_objects=None):
   Returns:
       A Keras Optimizer instance.
   """
-  all_classes = {
-      'sgd': SGD,
-      'rmsprop': RMSprop,
-      'adagrad': Adagrad,
-      'adadelta': Adadelta,
-      'adam': Adam,
-      'adamax': Adamax,
-      'nadam': Nadam,
-      'tfoptimizer': TFOptimizer,
-  }
+  if tf2.enabled():
+    all_classes = {
+        'adadelta': adadelta_v2.Adadelta,
+        'adagrad': adagrad_v2.Adagrad,
+        'adam': adam_v2.Adam,
+        'adamax': adamax_v2.Adamax,
+        'nadam': nadam_v2.Nadam,
+        'rmsprop': rmsprop_v2.RMSprop,
+        'sgd': gradient_descent_v2.SGD
+    }
+  else:
+    all_classes = {
+        'adadelta': Adadelta,
+        'adagrad': Adagrad,
+        'adam': Adam,
+        'adamax': Adamax,
+        'nadam': Nadam,
+        'rmsprop': RMSprop,
+        'sgd': SGD,
+        'tfoptimizer': TFOptimizer
+    }
   # Make deserialization case-insensitive for built-in optimizers.
   if config['class_name'].lower() in all_classes:
     config['class_name'] = config['class_name'].lower()
@@ -833,17 +855,17 @@ def get(identifier):
   Raises:
       ValueError: If `identifier` cannot be interpreted.
   """
+  if isinstance(identifier, (Optimizer, optimizer_v2.OptimizerV2)):
+    return identifier
   # Wrap TF optimizer instances
-  if isinstance(identifier, tf_optimizer_module.Optimizer):
+  elif isinstance(identifier, tf_optimizer_module.Optimizer):
     opt = TFOptimizer(identifier)
     K.track_tf_optimizer(opt)
     return opt
-  if isinstance(identifier, dict):
+  elif isinstance(identifier, dict):
     return deserialize(identifier)
   elif isinstance(identifier, six.string_types):
     config = {'class_name': str(identifier), 'config': {}}
     return deserialize(config)
-  if isinstance(identifier, Optimizer):
-    return identifier
   else:
     raise ValueError('Could not interpret optimizer identifier:', identifier)
diff --git a/tensorflow/python/keras/optimizers_test.py b/tensorflow/python/keras/optimizers_test.py
index 9664f09fff0ad872c40b58e3ff2347a2a595d429..77104a5d4d526792dde209b3c7cce2262a138dce 100644
--- a/tensorflow/python/keras/optimizers_test.py
+++ b/tensorflow/python/keras/optimizers_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import gc
+import os
 import weakref
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
@@ -88,22 +91,26 @@ def _test_optimizer(optimizer, target=0.75):
 
 class KerasOptimizersTest(test.TestCase):
 
+  @test_util.run_v1_only('b/120545219')
   def test_sgd(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            nesterov=True))
 
+  @test_util.run_v1_only('b/120545219')
   def test_rmsprop(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.RMSprop())
       _test_optimizer(keras.optimizers.RMSprop(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adagrad(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adagrad())
       _test_optimizer(keras.optimizers.Adagrad(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adadelta(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adadelta(), target=0.6)
@@ -112,27 +119,32 @@ class KerasOptimizersTest(test.TestCase):
       # the accuracy.
       _test_optimizer(keras.optimizers.Adadelta(decay=1e-3), target=0.4)
 
+  @test_util.run_v1_only('b/120545219')
   def test_adam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adam())
       _test_optimizer(keras.optimizers.Adam(decay=1e-3))
       _test_optimizer(keras.optimizers.Adam(amsgrad=True))
 
+  @test_util.run_v1_only('b/120545219')
   def test_adamax(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Adamax())
       _test_optimizer(keras.optimizers.Adamax(decay=1e-3))
 
+  @test_util.run_v1_only('b/120545219')
   def test_nadam(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.Nadam())
 
+  @test_util.run_v1_only('b/120545219')
   def test_clipnorm(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
                                            momentum=0.9,
                                            clipnorm=0.5))
 
+  @test_util.run_v1_only('b/120545219')
   def test_clipvalue(self):
     with self.cached_session():
       _test_optimizer(keras.optimizers.SGD(lr=0.01,
@@ -208,5 +220,40 @@ class KerasOptimizersTest(test.TestCase):
       _ = keras.optimizers.Adam(clipnorm=-2.0)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class KerasV2OptimizersTest(test.TestCase, parameterized.TestCase):
+
+  @parameterized.named_parameters(
+      ('adadelta_tf2', 'adadelta', True), ('adadelta_tf1', 'adadelta', False),
+      ('adagrad_tf2', 'adagrad', True), ('adagrad_tf1', 'adagrad', False),
+      ('adam_tf2', 'adam', True), ('adam_tf1', 'adam', False),
+      ('adamax_tf2', 'adamax', True), ('adamax_tf1', 'adamax', False),
+      ('sgd_tf2', 'sgd', True), ('sgd_tf1', 'sgd', False),
+      ('nadam_tf2', 'nadam', True), ('nadam_tf1', 'nadam', False),
+      ('rmsprop_tf2', 'rmsprop', True), ('rmsprop_tf1', 'rmsprop', False))
+  def test_load_from_string(self, optimizer_string, tf2mode):
+    old_mode = os.environ.get('TF2_BEHAVIOR', None)
+    if tf2mode:
+      os.environ['TF2_BEHAVIOR'] = 'enabled'
+    else:
+      if 'TF2_BEHAVIOR' in os.environ:
+        del os.environ['TF2_BEHAVIOR']
+
+    # Sanity check.
+    self.assertEqual(tf2.enabled(), tf2mode)
+
+    model = keras.models.Sequential()
+    model.add(keras.layers.Dense(1, input_shape=(10,)))
+    model.compile(optimizer_string, 'binary_crossentropy')
+
+    self.assertEqual(optimizer_string,
+                     model.optimizer.__class__.__name__.lower())
+
+    model.fit(np.ones((10, 10), 'float32'), np.ones((10, 1), 'float32'))
+
+    if old_mode is not None:
+      os.environ['TF2_BEHAVIOR'] = old_mode
+
+
 if __name__ == '__main__':
   test.main()
diff --git a/tensorflow/python/keras/regularizers_test.py b/tensorflow/python/keras/regularizers_test.py
index bba4ebb287b2bd3e8509abd215dc5be4cbcdd929..3d6b259d87de8b6533d008a839f0df2226d71ed4 100644
--- a/tensorflow/python/keras/regularizers_test.py
+++ b/tensorflow/python/keras/regularizers_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python import keras
 from tensorflow.python.keras import testing_utils
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
@@ -61,6 +62,7 @@ class KerasRegularizersTest(test.TestCase):
         model.fit(x_train, y_train, batch_size=10,
                   epochs=1, verbose=0)
 
+  @test_util.run_deprecated_v1
   def test_activity_regularization(self):
     with self.cached_session():
       (x_train, y_train), _ = get_data()
diff --git a/tensorflow/python/keras/testing_utils.py b/tensorflow/python/keras/testing_utils.py
index d342131a521a90399090e48cf578f37c2a2e566c..fd062b0ab337aa6fa62a7603a36749cde315c3da 100644
--- a/tensorflow/python/keras/testing_utils.py
+++ b/tensorflow/python/keras/testing_utils.py
@@ -18,11 +18,15 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import threading
+
 import numpy as np
 
 from tensorflow.python import keras
+from tensorflow.python.eager import context
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.training.rmsprop import RMSPropOptimizer
+from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_inspect
 
 
@@ -73,9 +77,13 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   Returns:
     The output data (Numpy array) returned by the layer, for additional
     checks to be done by the calling code.
+
+  Raises:
+    ValueError: if `input_shape is None`.
   """
   if input_data is None:
-    assert input_shape
+    if input_shape is None:
+      raise ValueError('input_shape is None')
     if not input_dtype:
       input_dtype = 'float32'
     input_data_shape = list(input_shape)
@@ -149,7 +157,15 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
     np.testing.assert_allclose(output, actual_output, rtol=1e-3)
 
   # test training mode (e.g. useful for dropout tests)
-  model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
+  # Rebuild the model to avoid the graph being reused between predict() and
+  # train(). This was causing some error for layer with Defun as it body.
+  # See b/120160788 for more details. This should be mitigated after 2.0.
+  model = keras.models.Model(x, layer(x))
+  if _thread_local_data.run_eagerly is not None:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'],
+                  run_eagerly=should_run_eagerly())
+  else:
+    model.compile(RMSPropOptimizer(0.01), 'mse', weighted_metrics=['acc'])
   model.train_on_batch(input_data, actual_output)
 
   # test as first layer in Sequential API
@@ -190,6 +206,74 @@ def layer_test(layer_cls, kwargs=None, input_shape=None, input_dtype=None,
   return actual_output
 
 
+_thread_local_data = threading.local()
+_thread_local_data.model_type = None
+_thread_local_data.run_eagerly = None
+
+
+@tf_contextlib.contextmanager
+def model_type_scope(value):
+  """Provides a scope within which the model type to test is equal to `value`.
+
+  The model type gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: model type value
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.model_type
+  try:
+    _thread_local_data.model_type = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.model_type = previous_value
+
+
+@tf_contextlib.contextmanager
+def run_eagerly_scope(value):
+  """Provides a scope within which we compile models to run eagerly or not.
+
+  The boolean gets restored to its original value upon exiting the scope.
+
+  Arguments:
+     value: Bool specifying if we should run models eagerly in the active test.
+     Should be True or False.
+
+  Yields:
+    The provided value.
+  """
+  previous_value = _thread_local_data.run_eagerly
+  try:
+    _thread_local_data.run_eagerly = value
+    yield value
+  finally:
+    # Restore model type to initial value.
+    _thread_local_data.run_eagerly = previous_value
+
+
+def should_run_eagerly():
+  """Returns whether the models we are testing should be run eagerly."""
+  if _thread_local_data.run_eagerly is None:
+    raise ValueError('Cannot call `should_run_eagerly()` outside of a '
+                     '`run_eagerly_scope()` or `run_all_keras_modes` '
+                     'decorator.')
+
+  return _thread_local_data.run_eagerly and context.executing_eagerly()
+
+
+def get_model_type():
+  """Gets the model type that should be tested."""
+  if _thread_local_data.model_type is None:
+    raise ValueError('Cannot call `get_model_type()` outside of a '
+                     '`model_type_scope()` or `run_with_all_model_types` '
+                     'decorator.')
+
+  return _thread_local_data.model_type
+
+
 def get_small_sequential_mlp(num_hidden, num_classes, input_dim=None):
   model = keras.models.Sequential()
   if input_dim:
@@ -208,3 +292,337 @@ def get_small_functional_mlp(num_hidden, num_classes, input_dim):
   activation = 'sigmoid' if num_classes == 1 else 'softmax'
   outputs = keras.layers.Dense(num_classes, activation=activation)(outputs)
   return keras.Model(inputs, outputs)
+
+
+class _SmallSubclassMLP(keras.Model):
+  """A subclass model based small MLP."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLP, self).__init__()
+    self.layer_a = keras.layers.Dense(num_hidden, activation='relu')
+    activation = 'sigmoid' if num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+class _SmallSubclassMLPCustomBuild(keras.Model):
+  """A subclass model small MLP that uses a custom build method."""
+
+  def __init__(self, num_hidden, num_classes):
+    super(_SmallSubclassMLPCustomBuild, self).__init__()
+    self.layer_a = None
+    self.layer_b = None
+    self.num_hidden = num_hidden
+    self.num_classes = num_classes
+
+  def build(self, input_shape):
+    self.layer_a = keras.layers.Dense(self.num_hidden, activation='relu')
+    activation = 'sigmoid' if self.num_classes == 1 else 'softmax'
+    self.layer_b = keras.layers.Dense(self.num_classes, activation=activation)
+
+  def call(self, inputs, **kwargs):
+    x = self.layer_a(inputs)
+    return self.layer_b(x)
+
+
+def get_small_subclass_mlp(num_hidden, num_classes):
+  return _SmallSubclassMLP(num_hidden, num_classes)
+
+
+def get_small_subclass_mlp_with_custom_build(num_hidden, num_classes):
+  return _SmallSubclassMLPCustomBuild(num_hidden, num_classes)
+
+
+def get_small_mlp(num_hidden, num_classes, input_dim):
+  """Get a small mlp of the model type specified by `get_model_type`."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return get_small_subclass_mlp(num_hidden, num_classes)
+  if model_type == 'subclass_custom_build':
+    return get_small_subclass_mlp_with_custom_build(num_hidden, num_classes)
+  if model_type == 'sequential':
+    return get_small_sequential_mlp(num_hidden, num_classes, input_dim)
+  if model_type == 'functional':
+    return get_small_functional_mlp(num_hidden, num_classes, input_dim)
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _SubclassModel(keras.Model):
+  """A Keras subclass model."""
+
+  def __init__(self, layers):
+    super(_SubclassModel, self).__init__()
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+class _SubclassModelCustomBuild(keras.Model):
+  """A Keras subclass model that uses a custom build method."""
+
+  def __init__(self, layer_generating_func):
+    super(_SubclassModelCustomBuild, self).__init__()
+    self.all_layers = None
+    self._layer_generating_func = layer_generating_func
+
+  def build(self, input_shape):
+    layers = []
+    for layer in self._layer_generating_func():
+      layers.append(layer)
+    self.all_layers = layers
+
+  def call(self, inputs, **kwargs):
+    x = inputs
+    for layer in self.all_layers:
+      x = layer(x)
+    return x
+
+
+def get_model_from_layers(layers, input_shape=None):
+  """Builds a model from a sequence of layers."""
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _SubclassModel(layers)
+
+  if model_type == 'subclass_custom_build':
+    layer_generating_func = lambda: layers
+    return _SubclassModelCustomBuild(layer_generating_func)
+
+  if model_type == 'sequential':
+    model = keras.models.Sequential()
+    if input_shape:
+      model.add(keras.layers.InputLayer(input_shape=input_shape))
+    for layer in layers:
+      model.add(layer)
+    return model
+
+  if model_type == 'functional':
+    if not input_shape:
+      raise ValueError('Cannot create a functional model from layers with no '
+                       'input shape.')
+    inputs = keras.Input(shape=input_shape)
+    outputs = inputs
+    for layer in layers:
+      outputs = layer(outputs)
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
+
+
+class _MultiIOSubclassModel(keras.Model):
+  """Multi IO Keras subclass model."""
+
+  def __init__(self, branch_a, branch_b, shared_input_branch=None,
+               shared_output_branch=None):
+    super(_MultiIOSubclassModel, self).__init__()
+    self._shared_input_branch = shared_input_branch
+    self._branch_a = branch_a
+    self._branch_b = branch_b
+    self._shared_output_branch = shared_output_branch
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = [a, b]
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+class _MultiIOSubclassModelCustomBuild(keras.Model):
+  """Multi IO Keras subclass model that uses a custom build method."""
+
+  def __init__(self, branch_a_func, branch_b_func,
+               shared_input_branch_func=None,
+               shared_output_branch_func=None):
+    super(_MultiIOSubclassModelCustomBuild, self).__init__()
+    self._shared_input_branch_func = shared_input_branch_func
+    self._branch_a_func = branch_a_func
+    self._branch_b_func = branch_b_func
+    self._shared_output_branch_func = shared_output_branch_func
+
+    self._shared_input_branch = None
+    self._branch_a = None
+    self._branch_b = None
+    self._shared_output_branch = None
+
+  def build(self, input_shape):
+    if self._shared_input_branch_func():
+      self._shared_input_branch = self._shared_input_branch_func()
+    self._branch_a = self._branch_a_func()
+    self._branch_b = self._branch_b_func()
+
+    if self._shared_output_branch_func():
+      self._shared_output_branch = self._shared_output_branch_func()
+
+  def call(self, inputs, **kwargs):
+    if self._shared_input_branch:
+      for layer in self._shared_input_branch:
+        inputs = layer(inputs)
+      a = inputs
+      b = inputs
+    else:
+      a, b = inputs
+
+    for layer in self._branch_a:
+      a = layer(a)
+    for layer in self._branch_b:
+      b = layer(b)
+    outs = a, b
+
+    if self._shared_output_branch:
+      for layer in self._shared_output_branch:
+        outs = layer(outs)
+
+    return outs
+
+
+def get_multi_io_model(
+    branch_a,
+    branch_b,
+    shared_input_branch=None,
+    shared_output_branch=None):
+  """Builds a multi-io model that contains two branches.
+
+  The produced model will be of the type specified by `get_model_type`.
+
+  To build a two-input, two-output model:
+    Specify a list of layers for branch a and branch b, but do not specify any
+    shared input branch or shared output branch. The resulting model will apply
+    each branch to a different input, to produce two outputs.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+
+    model = get_multi_io_model(branch_a, branch_b)
+    ```
+
+  To build a two-input, one-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared output branch. The resulting model will apply
+    each branch to a different input. It will then apply the shared output
+    branch to a tuple containing the intermediate outputs of each branch,
+    to produce a single output. The first layer in the shared_output_branch
+    must be able to merge a tuple of two tensors.
+
+    The first value in branch_a must be the Keras 'Input' layer for branch a,
+    and the first value in branch_b must be the Keras 'Input' layer for
+    branch b.
+
+    example usage:
+    ```
+    input_branch_a = [Input(shape=(2,), name='a'), Dense(), Dense()]
+    input_branch_b = [Input(shape=(3,), name='b'), Dense(), Dense()]
+    shared_output_branch = [Concatenate(), Dense(), Dense()]
+
+    model = get_multi_io_model(input_branch_a, input_branch_b,
+                               shared_output_branch=shared_output_branch)
+    ```
+  To build a one-input, two-output model:
+    Specify a list of layers for branch a and branch b, and specify a
+    shared input branch. The resulting model will take one input, and apply
+    the shared input branch to it. It will then respectively apply each branch
+    to that intermediate result in parallel, to produce two outputs.
+
+    The first value in the shared_input_branch must be the Keras 'Input' layer
+    for the whole model. Branch a and branch b should not contain any Input
+    layers.
+
+    example usage:
+    ```
+    shared_input_branch = [Input(shape=(2,), name='in'), Dense(), Dense()]
+    output_branch_a = [Dense(), Dense()]
+    output_branch_b = [Dense(), Dense()]
+
+
+    model = get_multi_io_model(output__branch_a, output_branch_b,
+                               shared_input_branch=shared_input_branch)
+    ```
+
+  Args:
+    branch_a: A sequence of layers for branch a of the model.
+    branch_b: A sequence of layers for branch b of the model.
+    shared_input_branch: An optional sequence of layers to apply to a single
+      input, before applying both branches to that intermediate result. If set,
+      the model will take only one input instead of two. Defaults to None.
+    shared_output_branch: An optional sequence of layers to merge the
+      intermediate results produced by branch a and branch b. If set,
+      the model will produce only one output instead of two. Defaults to None.
+
+  Returns:
+    A multi-io model of the type specified by `get_model_type`, specified
+    by the different branches.
+  """
+  # Extract the functional inputs from the layer lists
+  if shared_input_branch:
+    inputs = shared_input_branch[0]
+    shared_input_branch = shared_input_branch[1:]
+  else:
+    inputs = branch_a[0], branch_b[0]
+    branch_a = branch_a[1:]
+    branch_b = branch_b[1:]
+
+  model_type = get_model_type()
+  if model_type == 'subclass':
+    return _MultiIOSubclassModel(branch_a, branch_b, shared_input_branch,
+                                 shared_output_branch)
+
+  if model_type == 'subclass_custom_build':
+    return _MultiIOSubclassModelCustomBuild((lambda: branch_a),
+                                            (lambda: branch_b),
+                                            (lambda: shared_input_branch),
+                                            (lambda: shared_output_branch))
+
+  if model_type == 'sequential':
+    raise ValueError('Cannot use `get_multi_io_model` to construct '
+                     'sequential models')
+
+  if model_type == 'functional':
+    if shared_input_branch:
+      a_and_b = inputs
+      for layer in shared_input_branch:
+        a_and_b = layer(a_and_b)
+      a = a_and_b
+      b = a_and_b
+    else:
+      a, b = inputs
+
+    for layer in branch_a:
+      a = layer(a)
+    for layer in branch_b:
+      b = layer(b)
+    outputs = a, b
+
+    if shared_output_branch:
+      for layer in shared_output_branch:
+        outputs = layer(outputs)
+
+    return keras.Model(inputs, outputs)
+
+  raise ValueError('Unknown model type {}'.format(model_type))
diff --git a/tensorflow/python/keras/utils/__init__.py b/tensorflow/python/keras/utils/__init__.py
index 8939044f71d05d762869d3123eab379362781242..61940ad789c4009fca5462079014482fb8bfec1b 100644
--- a/tensorflow/python/keras/utils/__init__.py
+++ b/tensorflow/python/keras/utils/__init__.py
@@ -34,6 +34,7 @@ from tensorflow.python.keras.utils.generic_utils import serialize_keras_object
 from tensorflow.python.keras.utils.io_utils import HDF5Matrix
 from tensorflow.python.keras.utils.layer_utils import convert_all_kernels_in_model
 from tensorflow.python.keras.utils.layer_utils import get_source_inputs
+from tensorflow.python.keras.utils.losses_utils import squeeze_or_expand_dimensions
 from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
 from tensorflow.python.keras.utils.np_utils import normalize
 from tensorflow.python.keras.utils.np_utils import to_categorical
diff --git a/tensorflow/python/keras/utils/generic_utils.py b/tensorflow/python/keras/utils/generic_utils.py
index 375bd9d196c6296e627b968ff2006fd216e3c68e..c331ce430bd761ca4beb2d6f8ab2e314e2e3178c 100644
--- a/tensorflow/python/keras/utils/generic_utils.py
+++ b/tensorflow/python/keras/utils/generic_utils.py
@@ -319,14 +319,16 @@ class Progbar(object):
           will be displayed as-is. All others will be averaged
           by the progbar before display.
       interval: Minimum visual progress update interval (in seconds).
+      unit_name: Display name for step counts (usually "step" or "sample").
   """
 
   def __init__(self, target, width=30, verbose=1, interval=0.05,
-               stateful_metrics=None):
+               stateful_metrics=None, unit_name='step'):
     self.target = target
     self.width = width
     self.verbose = verbose
     self.interval = interval
+    self.unit_name = unit_name
     if stateful_metrics:
       self.stateful_metrics = set(stateful_metrics)
     else:
@@ -425,12 +427,12 @@ class Progbar(object):
 
         info = ' - ETA: %s' % eta_format
       else:
-        if time_per_unit >= 1:
-          info += ' %.0fs/step' % time_per_unit
+        if time_per_unit >= 1 or time_per_unit == 0:
+          info += ' %.0fs/%s' % (time_per_unit, self.unit_name)
         elif time_per_unit >= 1e-3:
-          info += ' %.0fms/step' % (time_per_unit * 1e3)
+          info += ' %.0fms/%s' % (time_per_unit * 1e3, self.unit_name)
         else:
-          info += ' %.0fus/step' % (time_per_unit * 1e6)
+          info += ' %.0fus/%s' % (time_per_unit * 1e6, self.unit_name)
 
       for k in self._values_order:
         info += ' - %s:' % k
diff --git a/tensorflow/python/keras/utils/layer_utils.py b/tensorflow/python/keras/utils/layer_utils.py
index 158a9a5e76d214eef1f853f964aafe00b030b112..60677be73512c921f9fbbc96911655f28de29638 100644
--- a/tensorflow/python/keras/utils/layer_utils.py
+++ b/tensorflow/python/keras/utils/layer_utils.py
@@ -77,7 +77,7 @@ def count_params(weights):
   Returns:
       The total number of scalars composing the weights
   """
-  return int(np.sum([np.prod(p.get_shape().as_list()) for p in set(weights)]))
+  return int(sum(np.prod(p.get_shape().as_list()) for p in set(weights)))
 
 
 def print_summary(model, line_length=None, positions=None, print_fn=None):
diff --git a/tensorflow/python/keras/utils/losses_utils.py b/tensorflow/python/keras/utils/losses_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..fc4b4ac7dfd0966af5f4c21d4b78ba8ecd6bf46a
--- /dev/null
+++ b/tensorflow/python/keras/utils/losses_utils.py
@@ -0,0 +1,189 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=protected-access
+"""Utilities related to loss functions."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.keras import backend as K
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import confusion_matrix
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import weights_broadcast_ops
+from tensorflow.python.ops.losses import losses_impl
+
+
+def squeeze_or_expand_dimensions(y_pred, y_true, sample_weight):
+  """Squeeze or expand last dimension if needed.
+
+  1. Squeezes last dim of `y_pred` or `y_true` if their rank differs by 1
+  (using `confusion_matrix.remove_squeezable_dimensions`).
+  2. Squeezes or expands last dim of `sample_weight` if its rank differs by 1
+  from the new rank of `y_pred`.
+  If `sample_weight` is scalar, it is kept scalar.
+
+  This will use static shape if available. Otherwise, it will add graph
+  operations, which could result in a performance hit.
+
+  Args:
+    y_pred: Predicted values, a `Tensor` of arbitrary dimensions.
+    y_true: Optional label `Tensor` whose dimensions match `y_pred`.
+    sample_weight: Optional weight scalar or `Tensor` whose dimensions match
+      `y_pred`.
+
+  Returns:
+    Tuple of `y_pred`, `y_true` and `sample_weight`. Each of them possibly has
+    the last dimension squeezed,
+    `sample_weight` could be extended by one dimension.
+  """
+  if y_true is not None:
+    # squeeze last dim of `y_pred` or `y_true` if their rank differs by 1
+    y_true, y_pred = confusion_matrix.remove_squeezable_dimensions(
+        y_true, y_pred)
+
+  if sample_weight is None:
+    return y_pred, y_true, None
+
+  sample_weight = ops.convert_to_tensor(sample_weight)
+  weights_shape = sample_weight.get_shape()
+  weights_rank = weights_shape.ndims
+  if weights_rank == 0:  # If weights is scalar, do nothing.
+    return y_pred, y_true, sample_weight
+
+  y_pred_shape = y_pred.get_shape()
+  y_pred_rank = y_pred_shape.ndims
+  if (y_pred_rank is not None) and (weights_rank is not None):
+    # Use static rank.
+    if weights_rank - y_pred_rank == 1:
+      sample_weight = array_ops.squeeze(sample_weight, [-1])
+    elif y_pred_rank - weights_rank == 1:
+      sample_weight = array_ops.expand_dims(sample_weight, [-1])
+    return y_pred, y_true, sample_weight
+
+  # Use dynamic rank.
+  weights_rank_tensor = array_ops.rank(sample_weight)
+  rank_diff = weights_rank_tensor - array_ops.rank(y_pred)
+  maybe_squeeze_weights = lambda: array_ops.squeeze(sample_weight, [-1])
+
+  def _maybe_expand_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff,
+                       -1), lambda: array_ops.expand_dims(sample_weight, [-1]),
+        lambda: sample_weight)
+
+  def _maybe_adjust_weights():
+    return control_flow_ops.cond(
+        math_ops.equal(rank_diff, 1), maybe_squeeze_weights,
+        _maybe_expand_weights)
+
+  # squeeze or expand last dim of `sample_weight` if its rank differs by 1
+  # from the new rank of `y_pred`.
+  sample_weight = control_flow_ops.cond(
+      math_ops.equal(weights_rank_tensor, 0), lambda: sample_weight,
+      _maybe_adjust_weights)
+  return y_pred, y_true, sample_weight
+
+
+def _safe_mean(losses, num_present):
+  """Computes a safe mean of the losses.
+
+  Args:
+    losses: `Tensor` whose elements contain individual loss measurements.
+    num_present: The number of measurable elements in `losses`.
+
+  Returns:
+    A scalar representing the mean of `losses`. If `num_present` is zero,
+      then zero is returned.
+  """
+  total_loss = math_ops.reduce_sum(losses)
+  return math_ops.div_no_nan(total_loss, num_present, name='value')
+
+
+def _num_elements(losses):
+  """Computes the number of elements in `losses` tensor."""
+  with ops.name_scope(None, 'num_elements', values=[losses]) as scope:
+    return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
+
+
+def _reduce_weighted_loss(
+    weighted_losses, reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE):
+  """Reduces the individual weighted loss measurements."""
+  if reduction == losses_impl.ReductionV2.NONE:
+    loss = weighted_losses
+  else:
+    loss = math_ops.reduce_sum(weighted_losses)
+    if reduction == losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE:
+      loss = _safe_mean(loss, _num_elements(weighted_losses))
+  return loss
+
+
+def compute_weighted_loss(losses,
+                          sample_weight=None,
+                          reduction=losses_impl.ReductionV2.SUM_OVER_BATCH_SIZE,
+                          name=None):
+  """Computes the weighted loss.
+
+  Args:
+    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
+    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
+      `losses`, or be broadcastable to `losses`.
+    reduction: Type of `tf.losses.Reduction` to apply to loss. Default value is
+      `SUM_OVER_BATCH_SIZE`.
+    name: Optional name for the op.
+
+  Raises:
+    ValueError: If the shape of `sample_weight` is not compatible with `losses`.
+
+  Returns:
+    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
+    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.
+  """
+  losses_impl.ReductionV2.validate(reduction)
+  if sample_weight is None:
+    sample_weight = 1.0
+  with ops.name_scope(name, 'weighted_loss', (losses, sample_weight)):
+    # Save the `reduction` argument for loss normalization when distributing
+    # to multiple replicas.
+    # TODO(josh11b): Associate it with the returned op for more precision.
+    ops.get_default_graph()._last_loss_reduction = reduction  # pylint: disable=protected-access
+
+    # Update dimensions of `sample_weight` to match with `losses` if possible.
+    losses, _, sample_weight = squeeze_or_expand_dimensions(
+        losses, None, sample_weight)
+    losses = ops.convert_to_tensor(losses)
+    input_dtype = losses.dtype
+    losses = math_ops.to_float(losses)
+    sample_weight = math_ops.to_float(sample_weight)
+
+    try:
+      # Broadcast weights if possible.
+      sample_weight = weights_broadcast_ops.broadcast_weights(
+          sample_weight, losses)
+    except ValueError:
+      # Reduce values to same ndim as weight array.
+      ndim = K.ndim(losses)
+      weight_ndim = K.ndim(sample_weight)
+      losses = K.mean(losses, axis=list(range(weight_ndim, ndim)))
+
+    sample_weight.get_shape().assert_is_compatible_with(losses.get_shape())
+    weighted_losses = math_ops.multiply(losses, sample_weight)
+    # Apply reduction function to the individual weighted losses.
+    loss = _reduce_weighted_loss(weighted_losses, reduction)
+    # Convert the result back to the input type.
+    loss = math_ops.cast(loss, input_dtype)
+    return loss
diff --git a/tensorflow/python/keras/utils/multi_gpu_utils_test.py b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
index 1780ab65871b1cbb712c612ea252298aadefb265..8c1abd632484273a01fd99cbd72ee73b66e46f27 100644
--- a/tensorflow/python/keras/utils/multi_gpu_utils_test.py
+++ b/tensorflow/python/keras/utils/multi_gpu_utils_test.py
@@ -158,7 +158,7 @@ class TestMultiGPUModel(test.TestCase):
       dataset = data.Dataset.from_tensor_slices((x_train, y_train))
       dataset = dataset.repeat()
       dataset = dataset.batch(4)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = data.make_one_shot_iterator(dataset)
 
       inputs, targets = iterator.get_next()
 
diff --git a/tensorflow/python/keras/utils/tf_utils.py b/tensorflow/python/keras/utils/tf_utils.py
index 6b7c6c34a263c9a0abeb19c85488106a87fc3b2b..7b4c9e7239e2f097e0351b160bd7520ee587a8b3 100644
--- a/tensorflow/python/keras/utils/tf_utils.py
+++ b/tensorflow/python/keras/utils/tf_utils.py
@@ -161,6 +161,9 @@ def are_all_symbolic_tensors(tensors):
   return all(is_symbolic_tensor(tensor) for tensor in tensors)
 
 
+_user_convertible_tensor_types = set()
+
+
 def is_symbolic_tensor(tensor):
   """Returns whether a tensor is symbolic (from a TF graph) or an eager tensor.
 
@@ -176,9 +179,40 @@ def is_symbolic_tensor(tensor):
   if isinstance(tensor, variables.Variable):
     return not context.executing_eagerly()
   if isinstance(tensor, (ops.Tensor, sparse_tensor.SparseTensor)):
-    try:
-      _ = tensor.graph
-      return True
-    except AttributeError:
-      return False
+    return hasattr(tensor, 'graph')
+  if isinstance(tensor, tuple(_user_convertible_tensor_types)):
+    return hasattr(ops.convert_to_tensor(tensor), 'graph')
   return False
+
+
+def register_symbolic_tensor_type(cls):
+  """Allows users to specify types regarded as symbolic `Tensor`s.
+
+  Used in conjunction with `tf.register_tensor_conversion_function`, calling
+  `tf.keras.utils.register_symbolic_tensor_type(cls)` allows non-`Tensor`
+  objects to be plumbed through Keras layers.
+
+  Example:
+
+  ```python
+  # One-time setup.
+  class Foo(object):
+    def __init__(self, input_):
+      self._input = input_
+    def value(self):
+      return tf.constant(42.)
+
+  tf.register_tensor_conversion_function(
+      Foo, lambda x, *args, **kwargs: x.value())
+
+  tf.keras.utils.register_symbolic_tensor_type(Foo)
+
+  # User-land.
+  layer = tf.keras.layers.Lambda(lambda input_: Foo(input_))
+  ```
+
+  Arguments:
+    cls: A `class` type which shall be regarded as a symbolic `Tensor`.
+  """
+  global _user_convertible_tensor_types
+  _user_convertible_tensor_types.add(cls)
diff --git a/tensorflow/python/keras/utils/tf_utils_test.py b/tensorflow/python/keras/utils/tf_utils_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9833a492993feb3a989d09160919fbf85c3a21e7
--- /dev/null
+++ b/tensorflow/python/keras/utils/tf_utils_test.py
@@ -0,0 +1,134 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for Keras TF utils."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import keras
+from tensorflow.python.eager import context
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.keras.utils import tf_utils
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import test
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class TestIsSymbolicTensor(test.TestCase):
+
+  def test_default_behavior(self):
+    if context.executing_eagerly():
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+    else:
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+
+  def test_works_with_registered(self):
+
+    class CustomClass(object):
+
+      def value(self):
+        return ops.convert_to_tensor(42.)
+
+    ops.register_tensor_conversion_function(
+        CustomClass, lambda value, **_: value.value())
+
+    tf_utils.register_symbolic_tensor_type(CustomClass)
+
+    if context.executing_eagerly():
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertFalse(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+      self.assertFalse(tf_utils.is_symbolic_tensor(CustomClass()))
+    else:
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          variables.Variable(name='blah', initial_value=0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          ops.convert_to_tensor(0.)))
+      self.assertTrue(tf_utils.is_symbolic_tensor(
+          sparse_tensor.SparseTensor(
+              indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4])))
+      self.assertTrue(tf_utils.is_symbolic_tensor(CustomClass()))
+
+  def test_enables_nontensor_plumbing(self):
+    # Setup.
+
+    class Foo(object):
+
+      def __init__(self, input_):
+        self._input = input_
+        self.value = ops.convert_to_tensor(42.)
+
+    ops.register_tensor_conversion_function(
+        Foo, lambda x, *args, **kwargs: x.value)
+    tf_utils.register_symbolic_tensor_type(Foo)
+
+    class PlumbingLayer(keras.layers.Lambda):
+
+      def __init__(self, fn, **kwargs):
+        def _fn(*fargs, **fkwargs):
+          d = fn(*fargs, **fkwargs)
+          x = ops.convert_to_tensor(d)
+          d.shape = x.shape
+          d.get_shape = x.get_shape
+          return d, x
+        super(PlumbingLayer, self).__init__(_fn, **kwargs)
+        self._enter_dunder_call = False
+
+      def __call__(self, inputs, *args, **kwargs):
+        self._enter_dunder_call = True
+        d, _ = super(PlumbingLayer, self).__call__(inputs, *args, **kwargs)
+        self._enter_dunder_call = False
+        return d
+
+      def call(self, inputs, *args, **kwargs):
+        d, v = super(PlumbingLayer, self).call(inputs, *args, **kwargs)
+        if self._enter_dunder_call:
+          return d, v
+        return d
+
+    # User-land.
+    model = keras.Sequential([
+        keras.layers.InputLayer([]),
+        PlumbingLayer(Foo),  # Makes a `Foo` object.
+    ])
+    # Let's ensure Keras graph history is preserved by composing the models.
+    model = keras.Model(model.inputs, model(model.outputs))
+    # Now we instantiate the model and verify we have a `Foo` object, not a
+    # `Tensor`.
+    y = model(ops.convert_to_tensor(7.))
+    self.assertIsInstance(y, Foo)
+
+
+if __name__ == '__main__':
+  test.main()
diff --git a/tensorflow/python/kernel_tests/BUILD b/tensorflow/python/kernel_tests/BUILD
index 6b990d3a9267a1980e8ce2d7d8964635957e06e2..df8c14970a0af7e2b1bd19162b344ff4329d385f 100644
--- a/tensorflow/python/kernel_tests/BUILD
+++ b/tensorflow/python/kernel_tests/BUILD
@@ -121,8 +121,10 @@ cuda_py_test(
         "@absl_py//absl/testing:parameterized",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
-        "//tensorflow/python:math_ops",
+        "//tensorflow/python:gradients_impl",
         "//tensorflow/python:list_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_shape",
         "//tensorflow/python/eager:context",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
@@ -268,9 +270,9 @@ tf_py_test(
     ],
 )
 
-tf_py_test(
+cuda_py_test(
     name = "ctc_loss_op_test",
-    size = "small",
+    size = "medium",
     srcs = ["ctc_loss_op_test.py"],
     additional_deps = [
         "//third_party/py/numpy",
@@ -659,6 +661,18 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "matrix_square_root_op_test",
+    size = "medium",
+    srcs = ["matrix_square_root_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+    ],
+)
+
 cuda_py_test(
     name = "matrix_solve_op_test",
     size = "medium",
@@ -817,6 +831,7 @@ tf_py_test(
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
         "//tensorflow/python:io_ops",
         "//tensorflow/python:io_ops_gen",
     ],
@@ -1142,6 +1157,21 @@ cuda_py_test(
     ],
 )
 
+tf_py_test(
+    name = "unicode_encode_op_test",
+    size = "small",
+    srcs = ["unicode_encode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:errors",
+        "//tensorflow/python/ops/ragged:ragged_factory_ops",
+        "//tensorflow/python/ops/ragged:ragged_string_ops",
+    ],
+)
+
 tf_py_test(
     name = "unicode_transcode_op_test",
     size = "small",
@@ -1154,6 +1184,18 @@ tf_py_test(
     ],
 )
 
+tf_py_test(
+    name = "unicode_decode_op_test",
+    size = "small",
+    srcs = ["unicode_decode_op_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:string_ops",
+    ],
+)
+
 tf_py_test(
     name = "unique_op_test",
     size = "small",
@@ -1338,6 +1380,7 @@ cuda_py_test(
         "//tensorflow/python:test_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
     ],
     shard_count = 10,
     tags = [
@@ -1449,6 +1492,7 @@ cuda_py_test(
     additional_deps = [
         "//third_party/py/numpy",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:check_ops",
         "//tensorflow/python:math_ops",
@@ -1736,9 +1780,11 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_grad",
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/data/ops:iterator_ops",
     ],
     grpc_enabled = True,
+    shard_count = 2,
     tags = ["no_windows"],
 )
 
@@ -1842,6 +1888,22 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "lu_op_test",
+    size = "small",
+    srcs = ["lu_op_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:linalg_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform",
+        "//tensorflow/python/ops/linalg",
+    ],
+)
+
 cuda_py_test(
     name = "manip_ops_test",
     size = "small",
@@ -2027,12 +2089,13 @@ cuda_py_test(
         "//third_party/py/numpy",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:gradients",
         "//tensorflow/python:nn_grad",
         "//tensorflow/python:nn_ops",
         "//tensorflow/python:random_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "//tensorflow/python/eager:backprop",
+        "//tensorflow/python:tf2",
     ],
 )
 
@@ -2371,6 +2434,8 @@ cuda_py_test(
         "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:variables",
         "//tensorflow/python:variable_scope",
+        "//tensorflow/python:cond_v2",
+        "//tensorflow/python:while_v2",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
     ],
@@ -2614,34 +2679,6 @@ cuda_py_test(
     tags = ["manual"],
 )
 
-cuda_py_test(
-    name = "dct_ops_test",
-    srcs = ["dct_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-)
-
-cuda_py_test(
-    name = "fft_ops_test",
-    size = "medium",
-    srcs = ["fft_ops_test.py"],
-    additional_deps = [
-        "//third_party/py/numpy",
-        "//tensorflow/python:client_testlib",
-        "//tensorflow/python:framework_for_generated_wrappers",
-        "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
-        "//tensorflow/python:spectral_ops_test_util",
-    ],
-    shard_count = 4,
-    tags = ["optonly"],
-)
-
 cuda_py_test(
     name = "pooling_ops_3d_test",
     size = "medium",
@@ -3330,7 +3367,9 @@ cuda_py_test(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:gradients",
+        "//tensorflow/python:tensor_array_ops",
         "//tensorflow/python:training",
+        "//tensorflow/python:while_v2",
     ],
     grpc_enabled = True,
 )
diff --git a/tensorflow/python/kernel_tests/accumulate_n_test.py b/tensorflow/python/kernel_tests/accumulate_n_test.py
index 7889edc198f48a0a91ad3c3153b0eb1ecbad76b8..5eece9c94137c190331b4c39aea72dc96551d0bb 100644
--- a/tensorflow/python/kernel_tests/accumulate_n_test.py
+++ b/tensorflow/python/kernel_tests/accumulate_n_test.py
@@ -32,6 +32,7 @@ from tensorflow.python.platform import googletest
 class AccumulateNV2Test(test_util.TensorFlowTestCase):
   """Tests of the new, differentiable version of accumulate_n."""
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
@@ -41,6 +42,7 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       self.assertAllClose(x[0] * 5,
                           math_ops.accumulate_n([tf_x[0]] * 5).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     x = [np.random.randint(-128, 128, (5, 4, 3, 2, 1)) for _ in range(6)]
@@ -50,12 +52,14 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       self.assertAllEqual(x[0] * 6,
                           math_ops.accumulate_n([tf_x[0]] * 6).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.session(use_gpu=True):
       x0 = array_ops.placeholder(dtype=dtypes_lib.int32, shape=[None])
       acc = math_ops.accumulate_n([x0, x0], shape=[None])
       self.assertAllEqual([2, 4], acc.eval(feed_dict={x0: [1, 2]}))
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     np.random.seed(42)
     for num_inputs in range(1, 10):
@@ -65,7 +69,7 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
             for _ in range(0, num_inputs)
         ]
         accum_n = math_ops.accumulate_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         accum_n_grad = gradients.gradients(accum_n, input_vars)
         self.assertAllEqual(
             np.repeat(1.0, num_inputs),  # d/dx (x + y + ...) = 1
@@ -88,13 +92,13 @@ class AccumulateNV2Test(test_util.TensorFlowTestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/ackermann_test.py b/tensorflow/python/kernel_tests/ackermann_test.py
index d267e4975272b9df9e189c92695eea2a9500f27b..6c20b19be9e6353e40dedd84db9edda9de8cc827 100644
--- a/tensorflow/python/kernel_tests/ackermann_test.py
+++ b/tensorflow/python/kernel_tests/ackermann_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import load_library
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
 
 
 class AckermannTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'ackermann_op.so')
diff --git a/tensorflow/python/kernel_tests/aggregate_ops_test.py b/tensorflow/python/kernel_tests/aggregate_ops_test.py
index 0f15319cb598c775f97631c1e54c9cb2df7b9f4c..d9787cc3bf6b6bdbdc917c9d40b8ebdfed9eb3bb 100644
--- a/tensorflow/python/kernel_tests/aggregate_ops_test.py
+++ b/tensorflow/python/kernel_tests/aggregate_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.core.framework import tensor_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.ops import math_ops
@@ -61,12 +62,13 @@ class AddNTest(test.TestCase):
       for dtype in self._supported_types():
         for count in range(1, self._MAX_N + 1):
           data = [self._buildData((2, 2), dtype) for _ in range(count)]
-          actual = sess.run(math_ops.add_n(data))
+          actual = self.evaluate(math_ops.add_n(data))
           expected = np.sum(np.vstack(
               [np.expand_dims(d, 0) for d in data]), axis=0)
           tol = 5e-3 if dtype == dtypes.float16 else 5e-7
           self.assertAllClose(expected, actual, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapes(self):
     np.random.seed(12345)
     with self.session(use_gpu=True) as sess:
@@ -80,6 +82,7 @@ class AddNTest(test.TestCase):
           tol = 5e-3 if dtype == dtypes.float16 else 5e-7
           self.assertAllClose(expected, actual, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testVariant(self):
 
     def create_constant_variant(value):
diff --git a/tensorflow/python/kernel_tests/argmax_op_test.py b/tensorflow/python/kernel_tests/argmax_op_test.py
index fa370c17b462b899b44a9ec8c5970526222b5eaa..06ec0948c25006c06039bfde9ef9e3e6da760889 100644
--- a/tensorflow/python/kernel_tests/argmax_op_test.py
+++ b/tensorflow/python/kernel_tests/argmax_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -37,14 +38,14 @@ class ArgMaxTest(test.TestCase):
     with self.session(use_gpu=use_gpu):
       ans = method(x, axis=axis)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         # Defaults to int64 output.
         self.assertEqual(np.int64, tf_ans.dtype)
         self.assertAllEqual(tf_ans, expected_values)
         self.assertShapeEqual(expected_values, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothArg(self,
                    method,
@@ -79,7 +80,7 @@ class ArgMaxTest(test.TestCase):
     expected_values = x.argmax()
     with self.session(use_gpu=True):
       ans = math_ops.argmax(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       # The values are equal when comparing int32 to int64 because
       # the values don't have a range that exceeds 32-bit integers.
@@ -87,7 +88,7 @@ class ArgMaxTest(test.TestCase):
     expected_values = x.argmin()
     with self.session(use_gpu=True):
       ans = math_ops.argmin(x, axis=0, output_type=dtypes.int32)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
       self.assertEqual(np.int32, tf_ans.dtype)
       self.assertAllEqual(tf_ans, expected_values)
 
@@ -110,12 +111,14 @@ class ArgMaxTest(test.TestCase):
             r"Reduction axis 0 is empty in shape \[0\]"):
           op([], 0).eval()
 
+  @test_util.run_deprecated_v1
   def testDefaultAxis(self):
     with self.cached_session():
       for op in math_ops.argmin, math_ops.argmax:
         ans = op([1]).eval()
         self.assertAllEqual(ans, 0)
 
+  @test_util.run_deprecated_v1
   def testOutputEmpty(self):
     with self.cached_session():
       for op in math_ops.argmin, math_ops.argmax:
diff --git a/tensorflow/python/kernel_tests/array_ops_test.py b/tensorflow/python/kernel_tests/array_ops_test.py
index c90794c789210dd49c5326f0609535ab4a7043e7..f4c442b7b1932c3ddab0d255f57c3fac5a23954a 100644
--- a/tensorflow/python/kernel_tests/array_ops_test.py
+++ b/tensorflow/python/kernel_tests/array_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -32,6 +33,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
@@ -46,24 +48,23 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test as test_lib
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
 
   def testNonBatchMatrix(self):
     matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(matrix)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(matrix)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testConjugate(self):
     m = [[1 + 1j, 2 + 2j, 3 + 3j], [4 + 4j, 5 + 5j, 6 + 6j]]
     expected_transposed = [[1 - 1j, 4 - 4j], [2 - 2j, 5 - 5j], [3 - 3j, 6 - 6j]]
-    with self.cached_session():
-      matrix = ops.convert_to_tensor(m)
-      transposed = array_ops.matrix_transpose(matrix, conjugate=True)
-      self.assertEqual((3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    matrix = ops.convert_to_tensor(m)
+    transposed = array_ops.matrix_transpose(matrix, conjugate=True)
+    self.assertEqual((3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testBatchMatrix(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
@@ -72,43 +73,44 @@ class BatchMatrixTransposeTest(test_util.TensorFlowTestCase):
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
     batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      transposed = array_ops.matrix_transpose(batch_matrix)
-      self.assertEqual((2, 3, 2), transposed.get_shape())
-      self.assertAllEqual(expected_transposed, transposed.eval())
+    transposed = array_ops.matrix_transpose(batch_matrix)
+    self.assertEqual((2, 3, 2), transposed.get_shape())
+    self.assertAllEqual(expected_transposed, transposed)
 
   def testNonBatchMatrixDynamicallyDefined(self):
-    matrix = [[1, 2, 3], [4, 5, 6]]  # Shape (2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    matrix = constant_op.constant([[1, 2, 3], [4, 5, 6]])  # Shape (2, 3)
     expected_transposed = [[1, 4], [2, 5], [3, 6]]  # Shape (3, 2)
-    with self.cached_session():
-      matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(matrix_ph)
-      self.assertAllEqual(
-          expected_transposed, transposed.eval(feed_dict={
-              matrix_ph: matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(matrix))
 
   def testBatchMatrixDynamicallyDefined(self):
     matrix_0 = [[1, 2, 3], [4, 5, 6]]
     matrix_0_t = [[1, 4], [2, 5], [3, 6]]
     matrix_1 = [[11, 22, 33], [44, 55, 66]]
     matrix_1_t = [[11, 44], [22, 55], [33, 66]]
-    batch_matrix = [matrix_0, matrix_1]  # Shape (2, 2, 3)
+    # needs explicit `constant` because lists are not automatically
+    # converted to sensors when applying `transpose` below
+    batch_matrix = constant_op.constant([matrix_0, matrix_1])  # Shape (2, 2, 3)
     expected_transposed = [matrix_0_t, matrix_1_t]  # Shape (2, 3, 2)
-    with self.cached_session():
-      batch_matrix_ph = array_ops.placeholder(dtypes.int32)
-      transposed = array_ops.matrix_transpose(batch_matrix_ph)
-      self.assertAllEqual(
-          expected_transposed,
-          transposed.eval(feed_dict={
-              batch_matrix_ph: batch_matrix
-          }))
+    @def_function.function(input_signature=
+                           [tensor_spec.TensorSpec
+                            (shape=None, dtype=dtypes.int32)])
+    def transpose(matrix):
+      self.assertIs(matrix.shape.ndims, None)
+      return array_ops.matrix_transpose(matrix)
+    self.assertAllEqual(expected_transposed, transpose(batch_matrix))
 
   def testTensorWithStaticRankLessThanTwoRaisesBecauseNotAMatrix(self):
     vector = [1, 2, 3]
-    with self.cached_session():
-      with self.assertRaisesRegexp(ValueError, "should be a "):
-        array_ops.matrix_transpose(vector)
+    with self.assertRaisesRegexp(ValueError, "should be a "):
+      array_ops.matrix_transpose(vector)
 
 
 class BooleanMaskTest(test_util.TensorFlowTestCase):
@@ -141,36 +143,43 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
 
       self.assertAllClose(masked_arr, masked_tensor.eval())
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim2Axis1(self):
     ndims_mask = 1
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim2Axis1(self):
     ndims_mask = 2
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape, axis=1)
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim1(self):
     ndims_mask = 1
     for arr_shape in [(1,), (2,), (3,), (10,)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim1ArrDim2(self):
     ndims_mask = 1
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim2(self):
     ndims_mask = 2
     for arr_shape in [(1, 1), (2, 2), (2, 5)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testMaskDim2ArrDim3(self):
     ndims_mask = 2
     for arr_shape in [(1, 1, 1), (1, 2, 2), (2, 2, 1)]:
       self.CheckVersusNumpy(ndims_mask, arr_shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyInput2D(self):
     mask = np.array([True, False])
     arr = np.array([[], []]).astype(np.float32)
@@ -189,6 +198,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
     with self.cached_session():
       self.assertAllClose(numpy_result, tf_result.eval())
 
+  @test_util.run_deprecated_v1
   def testEmptyOutput(self):
     make_mask = lambda shape: np.zeros(shape, dtype=bool)
     for ndims_mask in range(1, 4):
@@ -197,6 +207,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
           arr_shape = np.random.randint(1, 5, size=ndims_arr)
           self.CheckVersusNumpy(ndims_mask, arr_shape, make_mask=make_mask)
 
+  @test_util.run_deprecated_v1
   def testWorksWithDimensionsEqualToNoneDuringGraphBuild(self):
     # The rank of the mask tensor must be specified. This is explained
     # in the docstring as well.
@@ -215,6 +226,7 @@ class BooleanMaskTest(test_util.TensorFlowTestCase):
           })
       np.testing.assert_allclose(masked_tensor, arr[mask])
 
+  @test_util.run_deprecated_v1
   def testMaskDimensionsSetToNoneRaises(self):
     # The rank of the mask tensor must be specified. This is explained
     # in the docstring as well.
@@ -281,6 +293,7 @@ class OperatorShapeTest(test_util.TensorFlowTestCase):
 
 class ReverseV2Test(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testReverse0DimAuto(self):
     x_np = 4
     for use_gpu in [False, True]:
@@ -325,6 +338,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
 
   # This test covers the axis validation in the shape function
   # (no eval())
+  @test_util.run_deprecated_v1
   def testInvalidAxis(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
     with self.assertRaisesRegexp(ValueError,
@@ -343,6 +357,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
   #
   # Note: this test passes placeholder as constant axis is validated
   # in shape function (see testInvalidAxis)
+  @test_util.run_deprecated_v1
   def testInvalid(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.float32)
     axis = array_ops.placeholder(dtypes.int32)
@@ -357,6 +372,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
                                    "axis 0 specified more than once"):
         array_ops.reverse_v2(x_np, axis).eval(feed_dict={axis: [0, -2]})
 
+  @test_util.run_deprecated_v1
   def testReverse1DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
@@ -365,6 +381,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     ]:
       self._reverse1DimAuto(dtype)
 
+  @test_util.run_deprecated_v1
   def testReverse2DimAuto(self):
     for dtype in [
         np.uint8, np.int8, np.uint16, np.int16, np.int32, np.int64, np.bool,
@@ -373,6 +390,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     ]:
       self._reverse2DimAuto(dtype)
 
+  @test_util.run_deprecated_v1
   def testUnknownDims(self):
     reverse_v2 = array_ops.reverse_v2
     data_t = array_ops.placeholder(dtypes.float32)
@@ -390,6 +408,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
     reverse_2d_t = reverse_v2(data_2d_t, axis_2d_t)
     self.assertEqual(2, reverse_2d_t.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testReverseRowsOf3Channels(self):
     """Tests optimized code for reversing rows with last dim size = 3."""
     with self.session(use_gpu=True):
@@ -403,6 +422,7 @@ class ReverseV2Test(test_util.TensorFlowTestCase):
             np_answer = x_np[:, ::-1, :]
             self.assertAllEqual(x_tf, np_answer)
 
+  @test_util.run_deprecated_v1
   def testReverseRowsOf4Channels(self):
     with self.session(use_gpu=True):
       for reverse_f in [array_ops.reverse_v2, array_ops.reverse]:
@@ -452,6 +472,7 @@ class MeshgridTest(test_util.TensorFlowTestCase):
         for x_np, x_tf in zip(numpy_out, tf_out):
           self.assertAllEqual(x_np, x_tf.eval())
 
+  @test_util.run_deprecated_v1
   def testCompare(self):
     for t in (np.float16, np.float32, np.float64, np.int32, np.int64,
               np.complex64, np.complex128):
@@ -524,6 +545,7 @@ STRIDED_SLICE_TYPES = [
 class StridedSliceTest(test_util.TensorFlowTestCase):
   """Test the strided slice operation with variants of slices."""
 
+  @test_util.run_deprecated_v1
   def test_basic_slice(self):
     for tensor_type in STRIDED_SLICE_TYPES:
       with self.cached_session(use_gpu=True):
@@ -554,7 +576,8 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
   def testInt64GPU(self):
     if not test_util.is_gpu_available():
       self.skipTest("No GPU available")
-    with self.session(use_gpu=True, force_gpu=True):
+
+    with test_util.force_gpu():
       x = constant_op.constant([1., 2., 3.])
       begin = constant_op.constant([2], dtype=dtypes.int64)
       end = constant_op.constant([3], dtype=dtypes.int64)
@@ -578,6 +601,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       v = variables.Variable([1., 2.])
       v[0]  # pylint: disable=pointless-statement
 
+  @test_util.run_deprecated_v1
   def testDegenerateSlices(self):
     with self.session(use_gpu=True):
       checker = StridedSliceChecker(self, StridedSliceChecker.REF_TENSOR)
@@ -588,6 +612,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       # empty interval in every dimension
       _ = checker[-1:0, 2:2, 2:3:-1]
 
+  @test_util.run_deprecated_v1
   def testEllipsis(self):
     with self.session(use_gpu=True):
       raw = [[[[[1, 2], [3, 4], [5, 6]]], [[[7, 8], [9, 10], [11, 12]]]]]
@@ -608,6 +633,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "Multiple ellipses"):
         _ = checker[..., :, ...].eval()
 
+  @test_util.run_deprecated_v1
   def testShrink(self):
     with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
@@ -618,6 +644,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       _ = checker[:, 0]
       _ = checker[:, :, 0]
 
+  @test_util.run_deprecated_v1
   def testBothNewAxisAndShrink(self):
     with self.session(use_gpu=True):
       ones = array_ops.placeholder(shape=[2, 2], dtype=dtypes.int16)
@@ -626,6 +653,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
               feed_dict={ones: [[1, 1], [1, 1]]}),
           [[1, 1]])
 
+  @test_util.run_deprecated_v1
   def testTensorIndexing(self):
     with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
@@ -636,6 +664,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       _ = checker[..., bar:bar2]
       _ = checker[..., bar]
       _ = checker[..., 3]
+      _ = checker[..., 2 ** 64 // 2**63]  # Test longs in Python 2
 
   def testTensorIndexingTypeError(self):
     with self.session(use_gpu=True):
@@ -650,6 +679,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(TypeError, expected):
         _ = checker[constant_op.constant(0.0)]
 
+  @test_util.run_deprecated_v1
   def testExpand(self):
     with self.session(use_gpu=True):
       raw = [[[[[1, 2, 4, 5], [5, 6, 7, 8], [9, 10, 11, 12]]],
@@ -667,6 +697,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       # Ellipsis in middle of two newaxis
       _ = checker[np.newaxis, ..., np.newaxis]
 
+  @test_util.run_deprecated_v1
   def testExpandVariable(self):
     with self.session(use_gpu=True):
       x = variables.Variable(7, dtype=dtypes.int32)
@@ -675,6 +706,7 @@ class StridedSliceTest(test_util.TensorFlowTestCase):
       self.assertEqual(y.shape, (1,))
       self.assertAllEqual(y, (7,))
 
+  @test_util.run_deprecated_v1
   def testOptimizedCases(self):
     with self.session(use_gpu=True):
       checker = StridedSliceChecker(self,
@@ -704,6 +736,7 @@ class StridedSliceShapeChecker(object):
 class StridedSliceShapeTest(test_util.TensorFlowTestCase):
   """Test the shape inference of StridedSliceShapes."""
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     with self.session(use_gpu=True):
       uncertain_tensor = array_ops.placeholder(dtypes.float32)
@@ -715,6 +748,7 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
     self.assertTrue(x is not None and y is not None or x is None and y is None)
     self.assertEqual(x.as_list(), y.as_list())
 
+  @test_util.run_deprecated_v1
   def testTensorShapeUncertain(self):
     with self.session(use_gpu=True):
       uncertain_tensor = array_ops.placeholder(
@@ -738,6 +772,7 @@ class StridedSliceShapeTest(test_util.TensorFlowTestCase):
       self.tensorShapeEqual(a[::-1, :, array_ops.newaxis, ::-2],
                             tensor_shape.TensorShape([5, None, 1, 4]))
 
+  @test_util.run_deprecated_v1
   def testTensorValuedIndexShape(self):
     with self.session(use_gpu=True):
       defined_shape_tensor = array_ops.placeholder(
@@ -794,6 +829,7 @@ class GradSliceChecker(object):
 class StridedSliceGradTest(test_util.TensorFlowTestCase):
   """Test that strided slice's custom gradient produces correct gradients."""
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.session(use_gpu=True) as sess:
       var = variables.Variable(
@@ -823,18 +859,20 @@ class StridedSliceGradTest(test_util.TensorFlowTestCase):
       grad = GradSliceChecker(self, sess, var, np.array(8))
       _ = grad[tuple()]
 
+  @test_util.run_deprecated_v1
   def testInt64Indices(self):
     with self.session(use_gpu=True) as sess:
       a = math_ops.range(3, dtype=dtypes.float32)
       index = constant_op.constant(1, dtype=dtypes.int64)
       b = 2. * a[index]
       grad, = gradients_impl.gradients(b, a)
-      self.assertAllEqual(sess.run(grad), [0., 2., 0.])
+      self.assertAllEqual(self.evaluate(grad), [0., 2., 0.])
 
 
 class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
   """Test varied index types and host located memory."""
 
+  @test_util.run_deprecated_v1
   def testHostVsDevice(self):
     with self.session(use_gpu=True) as sess:
       var2 = variables.Variable(
@@ -842,20 +880,21 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
               math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
               shape=(4, 1, 1)))
       varshape = variables.Variable([6, 4, 4], dtype=dtypes.int32)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0])
       end = constant_op.constant([4, 1, 1])
       strides = constant_op.constant([1, 1, 1])
       foo = array_ops.strided_slice_grad(varshape, begin, end, strides, var2)
       sess.run(foo)
 
+  @test_util.run_deprecated_v1
   def testInt64Shape(self):
     with self.session(use_gpu=True) as sess:
       original_dy = array_ops.reshape(
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int64)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -863,13 +902,14 @@ class StridedSliceGradTypeTest(test_util.TensorFlowTestCase):
                                         original_dy)
       sess.run(dx)
 
+  @test_util.run_deprecated_v1
   def testMixedIndexTypes(self):
     with self.session(use_gpu=True) as sess:
       original_dy = array_ops.reshape(
           math_ops.cast(math_ops.range(1, 5, 1), dtypes.float32),
           shape=(4, 1, 1))
       original_shape = constant_op.constant([6, 4, 4], dtype=dtypes.int64)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       begin = constant_op.constant([0, 0, 0], dtype=dtypes.int32)
       end = constant_op.constant([4, 1, 1], dtype=dtypes.int64)
       strides = constant_op.constant([1, 1, 1], dtype=dtypes.int64)
@@ -971,6 +1011,7 @@ class StridedSliceAssignChecker(object):
 
 class SliceAssignTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvalidSlice(self):
     with self.cached_session() as sess:
       foo = constant_op.constant([1, 2, 3])
@@ -1008,12 +1049,15 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     checker2[...] = 6  # ellipsis
     checker2[None] = [6]  # new axis
 
+  @test_util.run_deprecated_v1
   def testSliceAssign(self):
     self.doTestSliceAssign(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testSliceAssignResource(self):
     self.doTestSliceAssign(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testUninitialized(self):
     with self.assertRaisesRegexp(
         errors.FailedPreconditionError,
@@ -1032,13 +1076,14 @@ class SliceAssignTest(test_util.TensorFlowTestCase):
     with self.assertRaises(TypeError):
       v[:].assign(too_large_val)
 
+  @test_util.run_deprecated_v1
   def testTypeErrorResource(self):
     init_val = constant_op.constant([1, 2], dtype=dtypes.int32)
     too_small_val = constant_op.constant([3, 4], dtype=dtypes.int8)
     too_large_val = constant_op.constant([3, 4], dtype=dtypes.int64)
     v = resource_variable_ops.ResourceVariable(init_val)
     with self.cached_session() as sess:
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       with self.assertRaises(ValueError):
         sess.run(v[:].assign(too_large_val))
       with self.assertRaises(ValueError):
@@ -1088,6 +1133,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(ValueError, "maxlen must be scalar"):
         array_ops.sequence_mask([10, 20], [10, 20])
 
+  @test_util.run_deprecated_v1
   def testOneDimensionalWithMaxlen(self):
     with self.cached_session():
       res = array_ops.sequence_mask(constant_op.constant([1, 3, 2]), 5)
@@ -1097,7 +1143,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[True, False, False, False, False], [True, True, True, False, False],
            [True, True, False, False, False]])
 
-  @test_util.enable_c_shapes
+  @test_util.run_deprecated_v1
   def testOneDimensionalDtypeWithoutMaxlen(self):
     with self.cached_session():
       # test dtype and default maxlen:
@@ -1108,7 +1154,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           res.eval(),
           [[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]])
 
-  @test_util.enable_c_shapes
+  @test_util.run_deprecated_v1
   def testOneDimensionalWithoutMaxlen(self):
     with self.cached_session():
       res = array_ops.sequence_mask(
@@ -1120,7 +1166,6 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
            [True, False, False, False],
            [True, True, True, True]])
 
-  @test_util.enable_c_shapes
   def testTwoDimensional(self):
     with self.cached_session():
       res = array_ops.sequence_mask(constant_op.constant([[1, 3, 2]]), 5)
@@ -1138,11 +1183,13 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
           [[[0.0, 0.0, 0.0, 0.0], [1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 1.0, 1.0]],
            [[1.0, 0.0, 0.0, 0.0], [1.0, 1.0, 0.0, 0.0], [1.0, 1.0, 1.0, 0.0]]])
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     lengths = array_ops.placeholder(dtype=dtypes.int32)
     res = array_ops.sequence_mask(lengths)
     self.assertEqual(res.shape, None)
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
 
     def check_dtypes(lengths_dtype, maxlen_dtype):
@@ -1165,6 +1212,7 @@ class SequenceMaskTest(test_util.TensorFlowTestCase):
 class ConcatSliceResourceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testConcatSlice(self):
     r1 = test_ops.stub_resource_handle_op(container="a", shared_name="b")
     r2 = test_ops.stub_resource_handle_op(container="a", shared_name="c")
@@ -1187,18 +1235,18 @@ class IdentityTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(x.numpy(), y.numpy())
         self.assertTrue(device in y.device.lower())
 
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         a = constant_op.constant([[2], [3]], dtype=dtypes.float32)
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         b = array_ops.identity(a)
         _test(a, b, "gpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         c = array_ops.identity(b)
         _test(b, c, "cpu")
-      with ops.device("cpu:0"):
+      with test_util.force_cpu():
         d = array_ops.identity(c)
         _test(c, d, "cpu")
-      with ops.device("gpu:0"):
+      with test_util.force_gpu():
         e = array_ops.identity(d)
         _test(d, e, "gpu")
 
@@ -1220,6 +1268,7 @@ class PadTest(test_util.TensorFlowTestCase):
 
 class InvertPermutationTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvertPermutation(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       with self.cached_session(use_gpu=True):
@@ -1254,12 +1303,14 @@ class UnravelIndexTest(test_util.TensorFlowTestCase):
 
 class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.cached_session():
       a = array_ops.constant(10)
       guarantee_a = array_ops.guarantee_const(a)
       self.assertEqual(10, guarantee_a.eval())
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
     with self.cached_session() as sess:
       for use_resource in [False, True]:
@@ -1268,9 +1319,10 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
             initializer=init_ops.constant_initializer(10.0),
             use_resource=use_resource)
         guarantee_a = array_ops.guarantee_const(a)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertEqual(10.0, guarantee_a.eval())
 
+  @test_util.run_deprecated_v1
   def testResourceRejection(self):
     with self.cached_session() as sess:
       a = variable_scope.get_variable(
@@ -1278,7 +1330,7 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
           initializer=init_ops.constant_initializer(10.0),
           use_resource=True)
       guarantee_a = array_ops.guarantee_const(a.handle)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaisesWithPredicateMatch(errors.InvalidArgumentError,
                                                "cannot be a resource variable"):
         guarantee_a.eval()
@@ -1286,6 +1338,7 @@ class GuaranteeConstOpTest(test_util.TensorFlowTestCase):
 
 class SnapshotOpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testInvertPermutation(self):
     for dtype in [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64]:
       with self.cached_session(use_gpu=True):
diff --git a/tensorflow/python/kernel_tests/as_string_op_test.py b/tensorflow/python/kernel_tests/as_string_op_test.py
index dd4a90e5f65bc66f23bf4d1fb469afb4916fb815..287701a73e43464a5cee4334d0a011de2d3746ba 100644
--- a/tensorflow/python/kernel_tests/as_string_op_test.py
+++ b/tensorflow/python/kernel_tests/as_string_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -27,6 +28,7 @@ from tensorflow.python.platform import test
 
 class AsStringOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     float_inputs_ = [
         0, 1, -1, 0.5, 0.25, 0.125, float("INF"), float("NAN"), float("-INF")
@@ -78,6 +80,7 @@ class AsStringOpTest(test.TestCase):
         output = string_ops.as_string(input_, fill="ab")
         output.eval(feed_dict={input_: float_inputs_})
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     # Cannot use values outside -128..127 for test, because we're also
     # testing int8
@@ -112,6 +115,7 @@ class AsStringOpTest(test.TestCase):
         output = string_ops.as_string(input_, precision=0)
         output.eval(feed_dict={input_: int_inputs_})
 
+  @test_util.run_deprecated_v1
   def testLargeInt(self):
     # Cannot use values outside -128..127 for test, because we're also
     # testing int8
@@ -130,6 +134,7 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
+  @test_util.run_deprecated_v1
   def testHalfInt(self):
     s = lambda strs: [x.decode("ascii") for x in strs]
 
@@ -140,6 +145,7 @@ class AsStringOpTest(test.TestCase):
       result = output.eval(feed_dict={input_: int_inputs_})
       self.assertAllEqual(s(result), ["%d" % x for x in int_inputs_])
 
+  @test_util.run_deprecated_v1
   def testBool(self):
     bool_inputs_ = [False, True]
     s = lambda strs: [x.decode("ascii") for x in strs]
@@ -152,6 +158,7 @@ class AsStringOpTest(test.TestCase):
         result = output.eval(feed_dict={input_: bool_inputs_})
         self.assertAllEqual(s(result), ["false", "true"])
 
+  @test_util.run_deprecated_v1
   def testComplex(self):
     float_inputs_ = [
         0, 1, -1, 0.5, 0.25, 0.125, complex("INF"), complex("NAN"),
diff --git a/tensorflow/python/kernel_tests/atrous_conv2d_test.py b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
index 1d82b3d058834c7d56668e975a0969e32283a69b..a13e325835cfd343eda61037b8392e83bed0f1c2 100644
--- a/tensorflow/python/kernel_tests/atrous_conv2d_test.py
+++ b/tensorflow/python/kernel_tests/atrous_conv2d_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
@@ -58,6 +59,7 @@ def _upsample_filters(filters, rate):
 
 class AtrousConv2DTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousConv2DForward(self):
     with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
@@ -79,8 +81,10 @@ class AtrousConv2DTest(test.TestCase):
                 y1 = nn_ops.atrous_conv2d(x, f, rate, padding=padding)
                 y2 = nn_ops.conv2d(
                     x, f_up, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAtrousSequence(self):
     """Tests optimization of sequence of atrous convolutions.
 
@@ -131,8 +135,10 @@ class AtrousConv2DTest(test.TestCase):
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = nn_ops.conv2d(y2, f, strides=[1, 1, 1, 1], padding=padding)
               y2 = array_ops.batch_to_space(y2, crops=pad, block_size=rate)
-              self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-2, atol=1e-2)
+              self.assertAllClose(
+                  y1.eval(), self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
@@ -160,6 +166,7 @@ class AtrousConv2DTest(test.TestCase):
 
 class AtrousConv2DTransposeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousConv2DTransposeForward(self):
     with self.session(use_gpu=True):
       # Input: [batch, height, width, input_depth]
@@ -193,11 +200,13 @@ class AtrousConv2DTransposeTest(test.TestCase):
                                                     padding)
                 y2 = nn_ops.conv2d_transpose(
                     x, f_up, y_shape, strides=[1, 1, 1, 1], padding=padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 class AtrousDepthwiseConv2DTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAtrousDepthwiseConv2DForward(self):
     strides = [1, 1, 1, 1]
     with self.session(use_gpu=True):
@@ -220,7 +229,8 @@ class AtrousDepthwiseConv2DTest(test.TestCase):
                 y1 = nn_impl.depthwise_conv2d(
                     x, f, strides, padding, rate=[rate, rate])
                 y2 = nn_impl.depthwise_conv2d(x, f_up, strides, padding)
-                self.assertAllClose(y1.eval(), y2.eval(), rtol=1e-3, atol=1e-3)
+                self.assertAllClose(
+                    y1.eval(), self.evaluate(y2), rtol=1e-3, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/atrous_convolution_test.py b/tensorflow/python/kernel_tests/atrous_convolution_test.py
index 6b16fca29d0277e0e5f1f52f6c4a48343a441f67..2fb8a37e2b94bd81409970eb3c485362a17634b6 100644
--- a/tensorflow/python/kernel_tests/atrous_convolution_test.py
+++ b/tensorflow/python/kernel_tests/atrous_convolution_test.py
@@ -110,6 +110,7 @@ class AtrousConvolutionTest(test.TestCase):
 
     add_check(check, y1, y2)
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_last_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, None, None, 10])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -117,6 +118,7 @@ class AtrousConvolutionTest(test.TestCase):
         x, w, "VALID", dilation_rate=[2, 2], data_format="NHWC")
     self.assertEqual(y.shape.as_list(), [1, None, None, 20])
 
+  @test_util.run_v1_only("b/120545219")
   def test_unknown_spatial_dims_for_channel_first_format(self):
     x = array_ops.placeholder(dtypes.float32, [1, 10, None, None])
     w = array_ops.zeros([3, 3, 10, 20])
@@ -262,6 +264,7 @@ class AtrousConvolutionTest(test.TestCase):
     err_tolerance = 1e-3
     self.assertLess(err, err_tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/attention_ops_test.py b/tensorflow/python/kernel_tests/attention_ops_test.py
index 1e09ba5b65cee3b74d350e0d2433c6a459517e5e..00dba9996dd909786301d56da41fa037328ba3e5 100644
--- a/tensorflow/python/kernel_tests/attention_ops_test.py
+++ b/tensorflow/python/kernel_tests/attention_ops_test.py
@@ -85,7 +85,7 @@ class ExtractGlimpseTest(test.TestCase):
 
     # Evaluate the TensorFlow Graph.
     with self.cached_session() as sess:
-      value_rows, value_cols = sess.run([glimpse_rows, glimpse_cols])
+      value_rows, value_cols = self.evaluate([glimpse_rows, glimpse_cols])
 
     # Check dimensions of returned glimpse.
     self.assertEqual(value_rows.shape[1], glimpse_sizes[0])
@@ -121,8 +121,7 @@ class ExtractGlimpseTest(test.TestCase):
     with self.cached_session():
       result = image_ops.extract_glimpse(empty_image, [1, 1], offsets)
       self.assertAllEqual(
-          np.zeros(
-              (0, 1, 1, 0), dtype=np.float32), result.eval())
+          np.zeros((0, 1, 1, 0), dtype=np.float32), self.evaluate(result))
 
   def testLargeCenterGlimpse(self):
     self._VerifyValues(
diff --git a/tensorflow/python/kernel_tests/barrier_ops_test.py b/tensorflow/python/kernel_tests/barrier_ops_test.py
index 4d36b3a4658121729bcde440b1c25b3849a5a818..60fe6f0eecdd597ea78c006b3b5552e118a0eacb 100644
--- a/tensorflow/python/kernel_tests/barrier_ops_test.py
+++ b/tensorflow/python/kernel_tests/barrier_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
@@ -66,6 +67,7 @@ class BarrierTest(test.TestCase):
       attr { key: 'shared_name' value: { s: 'B' } }
       """, b.barrier_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testInsertMany(self):
     with self.cached_session():
       b = data_flow_ops.Barrier(
@@ -90,6 +92,7 @@ class BarrierTest(test.TestCase):
         data_flow_ops.Barrier(
             (dtypes.float32, dtypes.float32), shapes=((1,), (0,)), name="B")
 
+  @test_util.run_deprecated_v1
   def testInsertManyEmptyTensorUnknown(self):
     with self.cached_session():
       b = data_flow_ops.Barrier((dtypes.float32, dtypes.float32), name="B")
@@ -102,6 +105,7 @@ class BarrierTest(test.TestCase):
           ".*Tensors with no elements are not supported.*"):
         insert_0_op.run()
 
+  @test_util.run_deprecated_v1
   def testTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -127,6 +131,7 @@ class BarrierTest(test.TestCase):
       self.assertEqual(values_0_val[idx], v0)
       self.assertEqual(values_1_val[idx], v1)
 
+  @test_util.run_deprecated_v1
   def testTakeManySmallBatch(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -191,6 +196,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed"):
         insert_1_3_op.run()
 
+  @test_util.run_deprecated_v1
   def testUseBarrierWithShape(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -220,6 +226,7 @@ class BarrierTest(test.TestCase):
       self.assertAllEqual(values_0_val[idx], v0)
       self.assertAllEqual(values_1_val[idx], v1)
 
+  @test_util.run_deprecated_v1
   def testParallelInsertMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -229,7 +236,7 @@ class BarrierTest(test.TestCase):
       insert_ops = [b.insert_many(0, [k], [v]) for k, v in zip(keys, values)]
       take_t = b.take_many(10)
 
-      sess.run(insert_ops)
+      self.evaluate(insert_ops)
       self.assertEquals(size_t.eval(), [10])
 
       indices_val, keys_val, values_val = sess.run(
@@ -240,6 +247,7 @@ class BarrierTest(test.TestCase):
       idx = keys_val.tolist().index(k)
       self.assertEqual(values_val[idx], v)
 
+  @test_util.run_deprecated_v1
   def testParallelTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -274,6 +282,7 @@ class BarrierTest(test.TestCase):
     self.assertItemsEqual(
         zip(keys, values), [(k[0], v[0]) for k, v in zip(key_vals, value_vals)])
 
+  @test_util.run_deprecated_v1
   def testBlockingTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(dtypes.float32, shapes=())
@@ -296,6 +305,7 @@ class BarrierTest(test.TestCase):
         insert_op.run()
       t.join()
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeMany(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -375,6 +385,7 @@ class BarrierTest(test.TestCase):
              2 + outer_indices_from_keys + inner_indices_from_keys)).T
         self.assertAllEqual(taken_i["values_1"], expected_values_1)
 
+  @test_util.run_deprecated_v1
   def testClose(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -433,6 +444,7 @@ class BarrierTest(test.TestCase):
       with self.assertRaisesOpError("is closed and has insufficient elements"):
         sess.run(take_t[0])
 
+  @test_util.run_deprecated_v1
   def testCancel(self):
     with self.cached_session() as sess:
       b = data_flow_ops.Barrier(
@@ -491,10 +503,11 @@ class BarrierTest(test.TestCase):
       b = data_flow_ops.Barrier(
           (dtypes.float32, dtypes.float32), shapes=((), ()), name="B")
       take_t = b.take_many(1, allow_small_batch=True)
-      sess.run(b.close(cancel))
+      self.evaluate(b.close(cancel))
       with self.assertRaisesOpError("is closed and has insufficient elements"):
-        sess.run(take_t)
+        self.evaluate(take_t)
 
+  @test_util.run_deprecated_v1
   def testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(self):
     self._testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(cancel=False)
     self._testClosedEmptyBarrierTakeManyAllowSmallBatchRaises(cancel=True)
@@ -569,9 +582,11 @@ class BarrierTest(test.TestCase):
           sorted(taken),
           [0] * (num_iterations // 2) + [10] * (num_iterations // 2))
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeManyCloseHalfwayThrough(self):
     self._testParallelInsertManyTakeManyCloseHalfwayThrough(cancel=False)
 
+  @test_util.run_deprecated_v1
   def testParallelInsertManyTakeManyCancelHalfwayThrough(self):
     self._testParallelInsertManyTakeManyCloseHalfwayThrough(cancel=True)
 
@@ -669,12 +684,15 @@ class BarrierTest(test.TestCase):
       else:
         self.assertEqual(taken, [10] * num_iterations)
 
+  @test_util.run_deprecated_v1
   def testParallelPartialInsertManyTakeManyCloseHalfwayThrough(self):
     self._testParallelPartialInsertManyTakeManyCloseHalfwayThrough(cancel=False)
 
+  @test_util.run_deprecated_v1
   def testParallelPartialInsertManyTakeManyCancelHalfwayThrough(self):
     self._testParallelPartialInsertManyTakeManyCloseHalfwayThrough(cancel=True)
 
+  @test_util.run_deprecated_v1
   def testIncompatibleSharedBarrierErrors(self):
     with self.cached_session():
       # Do component types and shapes.
diff --git a/tensorflow/python/kernel_tests/base64_ops_test.py b/tensorflow/python/kernel_tests/base64_ops_test.py
index 1b399942efbcef227f24de9737f2fc0f6a427c7f..381f190b8df6d65afaa80654e3d98377a69b9ae3 100644
--- a/tensorflow/python/kernel_tests/base64_ops_test.py
+++ b/tensorflow/python/kernel_tests/base64_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class Base64OpsTest(test_util.TensorFlowTestCase):
 
   def setUp(self):
@@ -93,7 +94,7 @@ class Base64OpsTest(test_util.TensorFlowTestCase):
         decoded = string_ops.decode_base64(encoded)
 
         with self.cached_session() as sess:
-          encoded_value, decoded_value = sess.run([encoded, decoded])
+          encoded_value, decoded_value = self.evaluate([encoded, decoded])
 
         self.assertEqual(encoded_value.shape, msg.shape)
         self.assertEqual(decoded_value.shape, msg.shape)
diff --git a/tensorflow/python/kernel_tests/basic_gpu_test.py b/tensorflow/python/kernel_tests/basic_gpu_test.py
index 225c1b35ae5fb9c5f30fa4966d691c6274a2120d..1a8513d022d43e3bd206bc0ab607012d05aef6a9 100644
--- a/tensorflow/python/kernel_tests/basic_gpu_test.py
+++ b/tensorflow/python/kernel_tests/basic_gpu_test.py
@@ -44,13 +44,13 @@ class GPUBinaryOpsTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = sess.run(out)
+      tf_gpu = self.evaluate(out)
 
     with self.cached_session(use_gpu=False) as sess:
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = sess.run(out)
+      tf_cpu = self.evaluate(out)
 
     self.assertAllClose(tf_cpu, tf_gpu)
 
@@ -96,7 +96,7 @@ class MathBuiltinUnaryTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       inx = ops.convert_to_tensor(x)
       ofunc = tf_func(inx)
-      tf_out = sess.run(ofunc)
+      tf_out = self.evaluate(ofunc)
     self.assertAllClose(np_out, tf_out)
 
   def _inv(self, x):
@@ -148,7 +148,7 @@ class MathBuiltinUnaryTest(test.TestCase):
       iny = ops.convert_to_tensor(y + 0.1)
       ofunc = inx / iny
       out_func2 = math_ops.floor(ofunc)
-      tf_out = sess.run(out_func2)
+      tf_out = self.evaluate(out_func2)
 
     self.assertAllClose(np_out, tf_out)
 
@@ -159,6 +159,7 @@ class BroadcastSimpleTest(test.TestCase):
     with self.cached_session(use_gpu=True) as sess:
       return sess.run(broadcast_gradient_args(xs, ys))
 
+  @test_util.run_deprecated_v1
   def testBroadcast(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
     self.assertAllEqual(r0, [])
@@ -214,11 +215,12 @@ class BroadcastSimpleTest(test.TestCase):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x = (1 + np.linspace(0, 5, np.prod([1, 3, 2]))).astype(np.float32).reshape(
         [1, 3, 2])
@@ -255,6 +257,7 @@ class GpuMultiSessionMemoryTest(test_util.TensorFlowTestCase):
           if len(results) != 1:
             break
 
+  @test_util.run_deprecated_v1
   def testConcurrentSessions(self):
     n_threads = 4
     threads = []
diff --git a/tensorflow/python/kernel_tests/batch_gather_op_test.py b/tensorflow/python/kernel_tests/batch_gather_op_test.py
index 547506d844d3d453f79af895046d51b57721cb73..7e0b3e1b5eadc7fe5541612fc607aeb9a135ceb4 100644
--- a/tensorflow/python/kernel_tests/batch_gather_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_gather_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -52,7 +53,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([3, 7])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
@@ -68,7 +69,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         gather_t = array_ops.batch_gather(params, indices_tf)
         expected_result = np.array([[3], [15]])
         np_val = self._buildParams(expected_result, dtype)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
@@ -81,12 +82,13 @@ class GatherTest(test.TestCase, parameterized.TestCase):
         params = constant_op.constant(params_np)
         indices_tf = constant_op.constant(indices)
         gather_t = array_ops.batch_gather(params, indices_tf)
-        gather_val = gather_t.eval()
+        gather_val = self.evaluate(gather_t)
         expected_result = np.array([[[2, 0], [7, 5]], [[10, 8], [11, 15]]])
         np_val = self._buildParams(expected_result, dtype)
         self.assertAllEqual(np_val, gather_val)
         self.assertEqual(np_val.shape, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
     with self.cached_session():
@@ -94,6 +96,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
       self.assertAllEqual([[b"qwer", b"uiop"]],
                           array_ops.batch_gather(params, indices_tf).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32, shape=[None, None])
@@ -106,6 +109,7 @@ class GatherTest(test.TestCase, parameterized.TestCase):
       with self.assertRaisesOpError(r"indices\[0\] = 7 is not in \[0, 2\)"):
         array_ops.batch_gather(params, [7]).eval()
 
+  @test_util.run_deprecated_v1
   def testEmptySlices(self):
     with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
diff --git a/tensorflow/python/kernel_tests/batch_matmul_op_test.py b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
index 8f6c089b423632aa9acec746cbdd76cb691e3700..c32a6c7e41759ac9abade06bb83be19a7392f2da 100644
--- a/tensorflow/python/kernel_tests/batch_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/batch_matmul_op_test.py
@@ -20,9 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.python.framework import constant_op
+from tensorflow.python import tf2
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -86,7 +87,7 @@ class BatchMatmulOpTest(test.TestCase):
     with self.cached_session(use_gpu=is_floating) as sess:
       if static_shape:
         z0 = math_ops.matmul(x, y, adjoint_a=adjoint_a, adjoint_b=adjoint_b)
-        z0_val = z0.eval()
+        z0_val = self.evaluate(z0)
       else:
         x_ph = array_ops.placeholder(x.dtype)
         y_ph = array_ops.placeholder(y.dtype)
@@ -105,36 +106,37 @@ class BatchMatmulOpTest(test.TestCase):
 
   def _testNonEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareNonEmpty(self, a_shape, b_shape):
+    def CompareNonEmpty(self, a_shape, b_shape):
       self._compare(
           self._rand(a_shape, dtype),
           self._rand(b_shape, dtype), adjoint_a, adjoint_b, use_static_shape)
 
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 1])
-    compareNonEmpty(self, [1, 1, 3], [1, 3, 5])
-    compareNonEmpty(self, [1, 2, 3], [1, 3, 5])
-    compareNonEmpty(self, [7, 1, 3], [7, 3, 5])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 1])
-    compareNonEmpty(self, [7, 2, 3], [7, 3, 5])
-    compareNonEmpty(self, [10, 64, 75], [10, 75, 30])
-    compareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 1])
+    CompareNonEmpty(self, [1, 1, 3], [1, 3, 5])
+    CompareNonEmpty(self, [1, 2, 3], [1, 3, 5])
+    CompareNonEmpty(self, [7, 1, 3], [7, 3, 5])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 1])
+    CompareNonEmpty(self, [7, 2, 3], [7, 3, 5])
+    CompareNonEmpty(self, [10, 64, 75], [10, 75, 30])
+    CompareNonEmpty(self, [5, 7, 2, 3], [5, 7, 3, 5])
 
   def _testEmpty(self, dtype, adjoint_a, adjoint_b, use_static_shape):
 
-    def compareEmpty(self, a_shape, b_shape):
+    def CompareEmpty(self, a_shape, b_shape):
       self._compare(
           np.zeros(a_shape).astype(dtype),
           np.zeros(b_shape).astype(dtype), adjoint_a, adjoint_b,
           use_static_shape)
 
-    compareEmpty(self, [0, 3, 2], [0, 2, 4])
-    compareEmpty(self, [3, 0, 2], [3, 2, 5])
-    compareEmpty(self, [3, 3, 2], [3, 2, 0])
+    CompareEmpty(self, [0, 3, 2], [0, 2, 4])
+    CompareEmpty(self, [3, 0, 2], [3, 2, 5])
+    CompareEmpty(self, [3, 3, 2], [3, 2, 0])
 
 
 def _GetBatchMatmulOpTest(dtype, adjoint_a, adjoint_b, use_static_shape):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     self._testNonEmpty(dtype, adjoint_a, adjoint_b, use_static_shape)
@@ -154,17 +156,13 @@ class BatchMatmulGradientTest(test.TestCase):
     y = y_in if not adjoint_b else y_in.reshape(y_t_shape)
     epsilon = np.finfo(x.dtype).eps
     delta = epsilon**(1.0 / 3.0)
+    def Loss(x, y):
+      z = math_ops.matmul(x, y, adjoint_a, adjoint_b)
+      return math_ops.reduce_sum(z)
     with self.cached_session(use_gpu=True):
-      inx = constant_op.constant(x)
-      iny = constant_op.constant(y)
-      z = math_ops.matmul(inx, iny, adjoint_a, adjoint_b)
-      loss = math_ops.reduce_sum(z)
-      ((x_jacob_t, x_jacob_n),
-       (y_jacob_t, y_jacob_n)) = gradient_checker.compute_gradient(
-           [inx, iny], [x.shape, y.shape],
-           loss, [1],
-           x_init_value=[x, y],
-           delta=delta)
+      ((x_jacob_t, y_jacob_t),
+       (x_jacob_n, y_jacob_n)) = gradient_checker_v2.compute_gradient(
+           Loss, [x, y], delta=delta)
       tol = 20 * delta
       self.assertAllClose(x_jacob_t, x_jacob_n, rtol=tol, atol=tol)
       self.assertAllClose(y_jacob_t, y_jacob_n, rtol=tol, atol=tol)
@@ -188,6 +186,7 @@ class BatchMatmulGradientTest(test.TestCase):
 
 def _GetBatchMatmulGradientTest(dtype, adjoint_a, adjoint_b):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     self._compare(1, 2, 3, 5, dtype, adjoint_a, adjoint_b)
     self._compare(3, 4, 7, 10, dtype, adjoint_a, adjoint_b)
@@ -202,11 +201,12 @@ if __name__ == "__main__":
     for adjoint_a_ in False, True:
       for adjoint_b_ in False, True:
         name = "%s_%s_%s" % (dtype_.__name__, adjoint_a_, adjoint_b_)
-        for use_static_shape in True, False:
+        # TF2 does not support placeholders under eager so we skip it
+        for use_static_shape_ in set([True, tf2.enabled()]):
           setattr(BatchMatmulOpTest,
-                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape),
+                  "testBatchMatmulOp_" + name + ("_%s" % use_static_shape_),
                   _GetBatchMatmulOpTest(dtype_, adjoint_a_, adjoint_b_,
-                                        use_static_shape))
+                                        use_static_shape_))
         if dtype_ is not np.int32:
           setattr(BatchMatmulGradientTest, "testBatchMatmulGradient_" + name,
                   _GetBatchMatmulGradientTest(dtype_, adjoint_a_, adjoint_b_))
diff --git a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
index 742a204883128b2af3abf91b27f089f8b4410e7c..f70fb93da9d51c1f9838f67977dbbd4aef65562e 100644
--- a/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/batch_scatter_ops_test.py
@@ -22,6 +22,8 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -49,7 +51,8 @@ class ScatterTest(test.TestCase):
                         vtype,
                         itype,
                         repeat_indices=False,
-                        updates_are_scalar=False):
+                        updates_are_scalar=False,
+                        method=False):
     np.random.seed(8)
     with self.cached_session(use_gpu=False):
       for indices_shape in (2,), (3, 7), (3, 4, 7):
@@ -70,9 +73,13 @@ class ScatterTest(test.TestCase):
           # Scatter via tensorflow
           ref = variables.Variable(old)
           ref.initializer.run()
-          tf_scatter(ref, indices, updates).eval()
+          if method:
+            ref.batch_scatter_update(ops.IndexedSlices(indices, updates))
+          else:
+            tf_scatter(ref, indices, updates).eval()
           self.assertAllClose(ref.eval(), new)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     vtypes = [np.float32, np.float64]
     for vtype in vtypes:
@@ -80,6 +87,7 @@ class ScatterTest(test.TestCase):
         self._VariableRankTest(
             state_ops.batch_scatter_update, vtype, itype)
 
+  @test_util.run_deprecated_v1
   def testBooleanScatterUpdate(self):
     with self.session(use_gpu=False) as session:
       var = variables.Variable([True, False])
@@ -91,8 +99,9 @@ class ScatterTest(test.TestCase):
 
       session.run([update0, update1])
 
-      self.assertAllEqual([False, True], var.eval())
+      self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testScatterOutOfRange(self):
     params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
     updates = np.array([-3, -4, -5]).astype(np.float32)
diff --git a/tensorflow/python/kernel_tests/batchtospace_op_test.py b/tensorflow/python/kernel_tests/batchtospace_op_test.py
index 03f3f64353d8367afad18a3fd07750b575b1fafa..c422df8806f595f2926bc603ffa1f40064664df0 100644
--- a/tensorflow/python/kernel_tests/batchtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/batchtospace_op_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -50,6 +51,7 @@ class CppOpImpl(object):
 class BatchToSpaceDepthToSpace(test.TestCase, PythonOpImpl):
 
   # Verifies that: batch_to_space(x) = transpose(depth_to_space(transpose(x)))
+  @test_util.run_deprecated_v1
   def testDepthToSpaceTranspose(self):
     x = np.arange(20 * 5 * 8 * 7, dtype=np.float32).reshape([20, 5, 8, 7])
     block_size = 2
@@ -70,6 +72,7 @@ class BatchToSpaceDepthToSpaceCpp(BatchToSpaceDepthToSpace, CppOpImpl):
 
 class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -78,6 +81,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.batch_to_space(x_np, crops, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -87,6 +91,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -96,6 +101,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -105,6 +111,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.batch_to_space(x_np, crops, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeSquaredNotDivisibleBatch(self):
     # The block size squared does not divide the batch.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -113,6 +120,7 @@ class BatchToSpaceErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.batch_to_space(x_np, crops, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = self.batch_to_space(
         array_ops.placeholder(dtypes.float32),
@@ -160,28 +168,35 @@ class BatchToSpaceNDErrorHandlingTest(test.TestCase):
     self._testStaticShape(input_shape, block_shape, paddings, error)
     self._testDynamicShape(input_shape, block_shape, paddings)
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     self._testShape([2, 2], [2, 2], [[0, 0], [0, 0]], ValueError)
     self._testShape([2, 2, 3], [2, 2, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     self._testShape([1, 2, 2, 1], [0, 1], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNegative(self):
     self._testShape([1, 2, 2, 1], [-1, 1], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testNegativePadding(self):
     self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testCropTooLarge(self):
     # The amount to crop exceeds the padded size.
     self._testShape([1 * 2 * 2, 2, 3, 1], [2, 2], [[3, 2], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeSquaredNotDivisibleBatch(self):
     # The batch dimension is not divisible by the product of the block_shape.
     self._testShape([3, 1, 1, 1], [2, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     # Verify that input shape and paddings shape can be unknown.
     _ = array_ops.batch_to_space_nd(
@@ -263,18 +278,21 @@ class BatchToSpaceGradientTest(test.TestCase, PythonOpImpl):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     crop_beg = 0
     crop_end = 0
     self._compare(1, 2, 3, 5, block_size, crop_beg, crop_end)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     crop_beg = 0
     crop_end = 0
     self._compare(2, 4, 3, 2, block_size, crop_beg, crop_end)
 
+  @test_util.run_deprecated_v1
   def testSmallCrop1x1(self):
     block_size = 2
     crop_beg = 1
@@ -316,14 +334,17 @@ class BatchToSpaceNDGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([1, 2, 3, 5], [2, 2], [[0, 0], [0, 0]], dtype)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([2, 4, 3, 2], [2, 2], [[0, 0], [0, 0]], dtype)
 
+  @test_util.run_deprecated_v1
   def testSmallCrop1x1(self):
     for dtype in [dtypes.int64, dtypes.int32]:
       self._compare([1, 2, 3, 5], [2, 2], [[1, 1], [1, 1]], dtype)
diff --git a/tensorflow/python/kernel_tests/bcast_ops_test.py b/tensorflow/python/kernel_tests/bcast_ops_test.py
index 3ec820aeadadf361e9291c5431e21cd7b2ba52be..ae00955ac29001c5748705d8b94c9f560ac60c26 100644
--- a/tensorflow/python/kernel_tests/bcast_ops_test.py
+++ b/tensorflow/python/kernel_tests/bcast_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.gen_array_ops import broadcast_args
 from tensorflow.python.ops.gen_array_ops import broadcast_gradient_args
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ class BcastOpsTest(test.TestCase):
     with self.cached_session() as sess:
       return sess.run(broadcast_gradient_args(xs, ys))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     r = self._GetBroadcastShape([2, 3, 5], [1])
     self.assertAllEqual(r, [2, 3, 5])
@@ -66,6 +68,7 @@ class BcastOpsTest(test.TestCase):
     r = self._GetBroadcastShape([3, 1], [2, 1, 5])
     self.assertAllEqual(r, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testBasicGradient(self):
     r0, r1 = self._GetGradientArgs([2, 3, 5], [1])
     self.assertAllEqual(r0, [])
@@ -107,6 +110,7 @@ class BcastOpsTest(test.TestCase):
     self.assertAllEqual(r0, [0, 2])
     self.assertAllEqual(r1, [1])
 
+  @test_util.run_deprecated_v1
   def testZeroDims(self):
     r = self._GetBroadcastShape([2, 0, 3, 0, 5], [3, 0, 5])
     self.assertAllEqual(r, [2, 0, 3, 0, 5])
@@ -120,6 +124,7 @@ class BcastOpsTest(test.TestCase):
     r = self._GetBroadcastShape([3, 1, 5], [2, 0, 3, 0, 5])
     self.assertAllEqual(r, [2, 0, 3, 0, 5])
 
+  @test_util.run_deprecated_v1
   def testZeroDimsGradient(self):
     r0, r1 = self._GetGradientArgs([2, 0, 3, 0, 5], [3, 0, 5])
     self.assertAllEqual(r0, [])
@@ -137,6 +142,7 @@ class BcastOpsTest(test.TestCase):
     self.assertAllEqual(r0, [0, 1, 3])
     self.assertAllEqual(r1, [])
 
+  @test_util.run_deprecated_v1
   def testDataTypes(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       r = self._GetBroadcastShape(
diff --git a/tensorflow/python/kernel_tests/benchmark_test.py b/tensorflow/python/kernel_tests/benchmark_test.py
index 5777a5d0970702783d6fdb47ace7c3de1654f23f..bffa5e6e8f4d9125f5021eb531319f67fd6e77bb 100644
--- a/tensorflow/python/kernel_tests/benchmark_test.py
+++ b/tensorflow/python/kernel_tests/benchmark_test.py
@@ -21,9 +21,12 @@ import json
 import os
 import random
 
+import numpy as np
+
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import session
-from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import benchmark
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -64,11 +67,17 @@ class TestReportingBenchmark(test.Benchmark):
                 "other_key": "string"})
 
   def benchmark_times_an_op(self):
+    input_size = 5
     with session.Session(config=benchmark.benchmark_config()) as sess:
-      a = constant_op.constant(0.0)
+      a = array_ops.placeholder(dtype=dtypes.float32, shape=(input_size))
       a_plus_a = a + a
       return self.run_op_benchmark(
-          sess, a_plus_a, min_iters=1000, store_trace=True, name="op_benchmark")
+          sess,
+          a_plus_a,
+          feed_dict={a: np.arange(input_size)},
+          min_iters=1000,
+          store_trace=True,
+          name="op_benchmark")
 
 
 class BenchmarkTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/betainc_op_test.py b/tensorflow/python/kernel_tests/betainc_op_test.py
index 92d21462d52f40c22aa60dac1a0c3d6b74ab2f3f..9dc34a606282e03cd5729c91bb9c4cffb10afc1c 100644
--- a/tensorflow/python/kernel_tests/betainc_op_test.py
+++ b/tensorflow/python/kernel_tests/betainc_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -48,7 +49,7 @@ class BetaincTest(test.TestCase):
       tf_x_s = constant_op.constant(x_s, dtype=dtype)
       tf_out_t = math_ops.betainc(tf_a_s, tf_b_s, tf_x_s)
       with self.cached_session():
-        tf_out = tf_out_t.eval()
+        tf_out = self.evaluate(tf_out_t)
       scipy_out = special.betainc(a_s, b_s, x_s).astype(np_dt)
 
       # the scipy version of betainc uses a double-only implementation.
@@ -109,36 +110,42 @@ class BetaincTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testBetaIncFloat(self):
     a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDouble(self):
     a_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 30)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDoubleVeryLargeValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e15)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncDoubleVerySmallValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e-16)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testBetaIncFloatVerySmallValues(self):
     a_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
     b_s = np.abs(np.random.randn(10, 10) * 1e-8)  # in (0, infty)
     x_s = np.random.rand(10, 10)  # in (0, 1)
     self._testBetaInc(a_s, b_s, x_s, dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testBetaIncFpropAndBpropAreNeverNAN(self):
     with self.cached_session() as sess:
       space = np.logspace(-8, 5).tolist()
@@ -159,6 +166,7 @@ class BetaincTest(test.TestCase):
       self.assertAllEqual(np.zeros_like(grads_x).astype(np.bool),
                           np.isnan(grads_x))
 
+  @test_util.run_deprecated_v1
   def testBetaIncGrads(self):
     err_tolerance = 1e-3
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/bias_op_test.py b/tensorflow/python/kernel_tests/bias_op_test.py
index 749d6a791e3c4ee3f7b2750526d45285e19df694..66f442dbddb5f609e7525ba0db9809dc3943ee25 100644
--- a/tensorflow/python/kernel_tests/bias_op_test.py
+++ b/tensorflow/python/kernel_tests/bias_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -89,10 +90,12 @@ class BiasAddTest(test.TestCase):
       self._testBiasNCHW(np_inputs, np_bias, use_gpu=True)
 
 
+  @test_util.run_deprecated_v1
   def testInputDims(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add([1, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBiasVec(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add(
@@ -101,6 +104,7 @@ class BiasAddTest(test.TestCase):
           array_ops.reshape(
               [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testBiasInputsMatch(self):
     with self.assertRaises(ValueError):
       nn_ops.bias_add(
@@ -109,23 +113,27 @@ class BiasAddTest(test.TestCase):
           array_ops.reshape(
               [1], shape=[1]))
 
+  @test_util.run_deprecated_v1
   def testIntTypes(self):
     for t in [np.int8, np.int16, np.int32, np.int64]:
       self._testAll(
           np.array([[10, 20, 30], [40, 50, 60]]).astype(t),
           np.array([1, 2, 3]).astype(t))
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testAll(
           np.random.rand(4, 3, 3).astype(t), np.random.rand(3).astype(t))
 
+  @test_util.run_deprecated_v1
   def test4DFloatTypes(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testAll(
           np.random.rand(4, 3, 2, 3).astype(t),
           np.random.rand(3).astype(t))
 
+  @test_util.run_deprecated_v1
   def test5DFloatTypes(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testAll(
@@ -187,6 +195,7 @@ class BiasAddTest(test.TestCase):
       self.assertAllClose(bias_jacob_t, bias_jacob_n, threshold, threshold)
       self.assertAllClose(grad_jacob_t, grad_jacob_n, threshold, threshold)
 
+  @test_util.run_deprecated_v1
   def testGradientTensor(self):
     # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
     # all dimensions are supported.
@@ -198,6 +207,7 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testGradientTensor4D(self):
     # BiasAddGrad with NCHW support 4D so all are enabled.
     for (data_format, use_gpu) in [("NHWC", False), ("NHWC", True),
@@ -209,11 +219,13 @@ class BiasAddTest(test.TestCase):
         bias = np.array([1.3, 2.4], dtype=dtype.as_numpy_dtype)
         self._testGradient(np_input, bias, dtype, data_format, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     np.random.seed(7)
     for shape in (0, 0), (2, 0), (0, 2), (4, 3, 0), (4, 0, 3), (0, 4, 3):
       self._testAll(np.random.randn(*shape), np.random.randn(shape[-1]))
 
+  @test_util.run_deprecated_v1
   def testEmptyGradient(self):
     # TODO(yongtang): BiasAddGrad with NCHW only works 4D. Reenable once
     # all dimensions are supported.
diff --git a/tensorflow/python/kernel_tests/bincount_op_test.py b/tensorflow/python/kernel_tests/bincount_op_test.py
index 49eb835847eb6f7b5e61709c1b0ad61cb21251f4..d064d736cf253ddf6ebf3ef0f416f449fcf7f565 100644
--- a/tensorflow/python/kernel_tests/bincount_op_test.py
+++ b/tensorflow/python/kernel_tests/bincount_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.platform import googletest
 
 class BincountTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def test_empty(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(
@@ -43,6 +44,7 @@ class BincountTest(test_util.TensorFlowTestCase):
           math_ops.bincount([], minlength=3, dtype=np.float64).eval().dtype,
           np.float64)
 
+  @test_util.run_deprecated_v1
   def test_values(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(
@@ -58,12 +60,14 @@ class BincountTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(
           math_ops.bincount(np.arange(10000)).eval(), np.ones(10000))
 
+  @test_util.run_deprecated_v1
   def test_maxlength(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(math_ops.bincount([5], maxlength=3).eval(), [0, 0, 0])
       self.assertAllEqual(math_ops.bincount([1], maxlength=3).eval(), [0, 1])
       self.assertAllEqual(math_ops.bincount([], maxlength=3).eval(), [])
 
+  @test_util.run_deprecated_v1
   def test_random_with_weights(self):
     num_samples = 10000
     with self.session(use_gpu=True):
@@ -77,6 +81,7 @@ class BincountTest(test_util.TensorFlowTestCase):
         self.assertAllClose(
             math_ops.bincount(arr, weights).eval(), np.bincount(arr, weights))
 
+  @test_util.run_deprecated_v1
   def test_random_without_weights(self):
     num_samples = 10000
     with self.session(use_gpu=True):
@@ -87,6 +92,7 @@ class BincountTest(test_util.TensorFlowTestCase):
         self.assertAllClose(
             math_ops.bincount(arr, None).eval(), np.bincount(arr, weights))
 
+  @test_util.run_deprecated_v1
   def test_zero_weights(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(
@@ -99,6 +105,7 @@ class BincountTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         math_ops.bincount([1, 2, 3, -1, 6, 8]).eval()
 
+  @test_util.run_deprecated_v1
   def test_shape_function(self):
     # size must be scalar.
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/bitcast_op_test.py b/tensorflow/python/kernel_tests/bitcast_op_test.py
index 79e0f36d242bdc828d4216d0e7a868bbccc849a9..b4f9a21a899c9207811e3c72a58180e4370c140a 100644
--- a/tensorflow/python/kernel_tests/bitcast_op_test.py
+++ b/tensorflow/python/kernel_tests/bitcast_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -28,9 +29,9 @@ from tensorflow.python.platform import test
 class BitcastTest(test.TestCase):
 
   def _testBitcast(self, x, datatype, shape):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = array_ops.bitcast(x, datatype)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       buff_after = memoryview(out).tobytes()
       buff_before = memoryview(x).tobytes()
       self.assertEqual(buff_before, buff_after)
@@ -59,6 +60,7 @@ class BitcastTest(test.TestCase):
     shape = [3, 4]
     self._testBitcast(x, dtypes.int64, shape)
 
+  @test_util.run_deprecated_v1
   def testErrors(self):
     x = np.zeros([1, 1], np.int8)
     datatype = dtypes.int32
@@ -71,6 +73,7 @@ class BitcastTest(test.TestCase):
     shape = [4]
     self._testBitcast(x, datatype, shape)
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     x = array_ops.placeholder(dtypes.float32)
     datatype = dtypes.int8
diff --git a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
index 7cdc67f83f0af65ff76b7109f088023220ab2b15..6b04e8abf40dc6fc396581e82b59bc6c4dec2a41 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/prediction_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.platform import googletest
 class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
   """Tests prediction ops for training."""
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionOnEmptyEnsemble(self):
     """Tests that prediction on a dummy ensemble does not fail."""
     with self.cached_session() as session:
@@ -61,6 +62,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(cached_node_ids, new_node_ids)
       self.assertAllClose([[0], [0]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testNoCachedPredictionButTreeExists(self):
     """Tests that predictions are updated once trees are added."""
     with self.cached_session() as session:
@@ -127,6 +129,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([2, 1], new_node_ids)
       self.assertAllClose([[0.1 * 8.79], [0.1 * 1.14]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionIsCurrent(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -199,6 +202,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(cached_node_ids, new_node_ids)
       self.assertAllClose([[0], [0]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromTheSameTree(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -313,6 +317,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       # 1.65 and -3.875, and then multiply them by 0.1 (lr)
       self.assertAllClose([[0.1 * 1.65], [0.1 * -3.875]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromPreviousTree(self):
     """Tests the predictions work when we have cache from previous trees."""
     with self.cached_session() as session:
@@ -445,6 +450,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       #            change= 0.1(1.14+7.0-7.0)
       self.assertAllClose([[1], [0.114]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the training prediction work for categorical splits."""
     with self.cached_session() as session:
@@ -517,6 +523,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([3, 4, 2], new_node_ids)
       self.assertAllClose([[5.], [6.], [7.]], logits_updates)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromTheSameTreeWithPostPrunedNodes(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -647,6 +654,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose([[0.01], [0.01], [0.0553], [0.0783], [0.01], [0.01]],
                           logits_updates + cached_values)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionFromThePreviousTreeWithPostPrunedNodes(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -792,6 +800,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
                            [root + 0.0783], [root + 0.01], [root + 0.01]],
                           logits_updates + cached_values)
 
+  @test_util.run_deprecated_v1
   def testCachedPredictionTheWholeTreeWasPruned(self):
     """Tests that prediction based on previous node in the tree works."""
     with self.cached_session() as session:
@@ -864,6 +873,7 @@ class TrainingPredictionOpsTest(test_util.TensorFlowTestCase):
 class PredictionOpsTest(test_util.TensorFlowTestCase):
   """Tests prediction ops for inference."""
 
+  @test_util.run_deprecated_v1
   def testPredictionOnEmptyEnsemble(self):
     """Tests that prediction on a empty ensemble does not fail."""
     with self.cached_session() as session:
@@ -886,6 +896,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
   def testPredictionMultipleTree(self):
     """Tests the predictions work when we have multiple trees."""
     with self.cached_session() as session:
@@ -996,6 +1007,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
       logits = session.run(predict_op)
       self.assertAllClose(expected_logits, logits)
 
+  @test_util.run_deprecated_v1
   def testCategoricalSplits(self):
     """Tests the predictions work for categorical splits."""
     with self.cached_session() as session:
@@ -1062,6 +1074,7 @@ class PredictionOpsTest(test_util.TensorFlowTestCase):
 class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
   """Tests feature contribs ops for model understanding."""
 
+  @test_util.run_deprecated_v1
   def testContribsForOnlyABiasNode(self):
     """Tests case when, after training, only left with a bias node.
 
@@ -1122,6 +1135,7 @@ class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(feature_ids, expected_feature_ids)
       self.assertAllClose(logits_paths, expected_logits_paths)
 
+  @test_util.run_deprecated_v1
   def testContribsMultipleTreeWhenFirstTreeIsABiasNode(self):
     """Tests case when, after training, first tree contains only a bias node."""
     with self.cached_session() as session:
@@ -1219,6 +1233,7 @@ class FeatureContribsOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(feature_ids, expected_feature_ids)
       self.assertAllClose(logits_paths, expected_logits_paths)
 
+  @test_util.run_deprecated_v1
   def testContribsMultipleTree(self):
     """Tests that the contribs work when we have multiple trees."""
     with self.cached_session() as session:
diff --git a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
index 12afb6a2ad8afded852704d3605c6cfcfe65ae8e..2b9863fb89bac80f6a2f012a3f25c23f993d03ad 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/quantile_ops_test.py
@@ -82,6 +82,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
     self.max_elements = 1 << 16
     self.num_quantiles = constant_op.constant(3, dtype=dtypes.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsSingleResource(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle = self.create_resource("floats", self.eps,
@@ -98,14 +99,15 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
           quantile_accumulator_handle, num_features=2)
       quantiles = boosted_trees_ops.boosted_trees_bucketize(
           [self._feature_0, self._feature_1], buckets)
-      sess.run(summary_op)
-      sess.run(flush_op)
+      self.evaluate(summary_op)
+      self.evaluate(flush_op)
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicQuantileBucketsMultipleResources(self):
     with self.cached_session() as sess:
       quantile_accumulator_handle_0 = self.create_resource("float_0", self.eps,
@@ -132,14 +134,15 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
           quantile_accumulator_handle_1, num_features=1)
       quantiles = boosted_trees_ops.boosted_trees_bucketize(
           [self._feature_0, self._feature_1], bucket_0 + bucket_1)
-      sess.run([summary_op_0, summary_op_1])
-      sess.run([flush_op_0, flush_op_1])
+      self.evaluate([summary_op_0, summary_op_1])
+      self.evaluate([flush_op_0, flush_op_1])
       self.assertAllClose(self._feature_0_boundaries, bucket_0[0].eval())
       self.assertAllClose(self._feature_1_boundaries, bucket_1[0].eval())
 
       self.assertAllClose(self._feature_0_quantiles, quantiles[0].eval())
       self.assertAllClose(self._feature_1_quantiles, quantiles[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreAfterFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
@@ -158,7 +161,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
                                             self._example_weights)
       with ops.control_dependencies([summaries]):
         flush = accumulator.flush()
-      sess.run(flush)
+      self.evaluate(flush)
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
       save.save(sess, save_path)
@@ -172,6 +175,7 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testSaveRestoreBeforeFlush(self):
     save_dir = os.path.join(self.get_temp_dir(), "save_restore")
     save_path = os.path.join(tempfile.mkdtemp(prefix=save_dir), "hash")
@@ -185,12 +189,12 @@ class QuantileOpsTest(test_util.TensorFlowTestCase):
 
       summaries = accumulator.add_summaries([self._feature_0, self._feature_1],
                                             self._example_weights)
-      sess.run(summaries)
+      self.evaluate(summaries)
       buckets = accumulator.get_bucket_boundaries()
       self.assertAllClose([], buckets[0].eval())
       self.assertAllClose([], buckets[1].eval())
       save.save(sess, save_path)
-      sess.run(accumulator.flush())
+      self.evaluate(accumulator.flush())
       self.assertAllClose(self._feature_0_boundaries, buckets[0].eval())
       self.assertAllClose(self._feature_1_boundaries, buckets[1].eval())
 
diff --git a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
index 65bb9ab55f00c0ad9506122bf357484c7a4acd5f..0a34277bbdb43ca449923550000970e63ca14905 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/resource_ops_test.py
@@ -30,19 +30,21 @@ from tensorflow.python.platform import googletest
 class ResourceOpsTest(test_util.TensorFlowTestCase):
   """Tests resource_ops."""
 
+  @test_util.run_deprecated_v1
   def testCreate(self):
     with self.cached_session():
       ensemble = boosted_trees_ops.TreeEnsemble('ensemble')
       resources.initialize_resources(resources.shared_resources()).run()
       stamp_token = ensemble.get_stamp_token()
-      self.assertEqual(0, stamp_token.eval())
+      self.assertEqual(0, self.evaluate(stamp_token))
       (_, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
+  @test_util.run_deprecated_v1
   def testCreateWithProto(self):
     with self.cached_session():
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
@@ -154,12 +156,13 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(7, stamp_token.eval())
-      self.assertEqual(2, num_trees.eval())
-      self.assertEqual(1, num_finalized_trees.eval())
-      self.assertEqual(6, num_attempted_layers.eval())
-      self.assertAllEqual([16, 19], nodes_range.eval())
+      self.assertEqual(7, self.evaluate(stamp_token))
+      self.assertEqual(2, self.evaluate(num_trees))
+      self.assertEqual(1, self.evaluate(num_finalized_trees))
+      self.assertEqual(6, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([16, 19], self.evaluate(nodes_range))
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserialize(self):
     with self.cached_session():
       # Initialize.
@@ -167,11 +170,11 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       resources.initialize_resources(resources.shared_resources()).run()
       (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
        nodes_range) = ensemble.get_states()
-      self.assertEqual(5, stamp_token.eval())
-      self.assertEqual(0, num_trees.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertEqual(0, num_attempted_layers.eval())
-      self.assertAllEqual([0, 1], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(stamp_token))
+      self.assertEqual(0, self.evaluate(num_trees))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertEqual(0, self.evaluate(num_attempted_layers))
+      self.assertAllEqual([0, 1], self.evaluate(nodes_range))
 
       # Deserialize.
       ensemble_proto = boosted_trees_pb2.TreeEnsemble()
@@ -219,18 +222,18 @@ class ResourceOpsTest(test_util.TensorFlowTestCase):
       ]):
         (stamp_token, num_trees, num_finalized_trees, num_attempted_layers,
          nodes_range) = ensemble.get_states()
-      self.assertEqual(3, stamp_token.eval())
-      self.assertEqual(1, num_trees.eval())
+      self.assertEqual(3, self.evaluate(stamp_token))
+      self.assertEqual(1, self.evaluate(num_trees))
       # This reads from metadata, not really counting the layers.
-      self.assertEqual(5, num_attempted_layers.eval())
-      self.assertEqual(0, num_finalized_trees.eval())
-      self.assertAllEqual([3, 7], nodes_range.eval())
+      self.assertEqual(5, self.evaluate(num_attempted_layers))
+      self.assertEqual(0, self.evaluate(num_finalized_trees))
+      self.assertAllEqual([3, 7], self.evaluate(nodes_range))
 
 
       # Serialize.
       new_ensemble_proto = boosted_trees_pb2.TreeEnsemble()
       new_stamp_token, new_serialized = ensemble.serialize()
-      self.assertEqual(3, new_stamp_token.eval())
+      self.assertEqual(3, self.evaluate(new_stamp_token))
       new_ensemble_proto.ParseFromString(new_serialized.eval())
       self.assertProtoEquals(ensemble_proto, new_ensemble_proto)
 
diff --git a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
index 09e9cfa3affb9750938f2292e6e2dc3edddecedb..e2e23486b5a9fb93e11971147b0784a62e636a7b 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/stats_ops_test.py
@@ -65,16 +65,16 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0.004775, 0.41184], [0.02823, 0.41184]],
-                          sess.run(gains_list))
-      self.assertAllEqual([[1, 1], [1, 1]], sess.run(thresholds_list))
+                          self.evaluate(gains_list))
+      self.assertAllEqual([[1, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[-.416667], [.568966]], [[-.6], [-.75]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.592593], [-.75]], [[-.076923], [.568966]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithL2(self):
     """Testing Gain calculation with L2."""
@@ -113,16 +113,16 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[0., 0.33931375], [0.01879096, 0.33931375]],
-                          sess.run(gains_list))
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+                          self.evaluate(gains_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithL1(self):
     """Testing Gain calculation with L1."""
@@ -162,18 +162,18 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
       self.assertAllClose([[[0.0], [0.3965517]], [[-0.4], [-0.5]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
 
       self.assertAllClose([[[-0.3333333], [-0.5]], [[0.0], [0.396552]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
       # Gain should also include an adjustment of the gradient by l1.
       self.assertAllClose([[0.0, 0.191207], [0.01, 0.191207]],
-                          sess.run(gains_list))
+                          self.evaluate(gains_list))
 
   def testCalculateBestGainsWithTreeComplexity(self):
     """Testing Gain calculation with L2."""
@@ -214,18 +214,18 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           min_node_weight=0,
           max_splits=max_splits)
 
-      self.assertAllEqual([[1, 2], [1, 2]], sess.run(node_ids_list))
+      self.assertAllEqual([[1, 2], [1, 2]], self.evaluate(node_ids_list))
 
       self.assertAllClose([[-3., -2.66068625], [-2.98120904, -2.66068625]],
-                          sess.run(gains_list))
+                          self.evaluate(gains_list))
 
-      self.assertAllEqual([[0, 1], [1, 1]], sess.run(thresholds_list))
+      self.assertAllEqual([[0, 1], [1, 1]], self.evaluate(thresholds_list))
       # The left node contrib will be later added to the previous node value to
       # make the left node value, and the same for right node contrib.
       self.assertAllClose([[[0.], [.485294]], [[-.5], [-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-.424658], [-.6]], [[-.043478], [.485294]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithMinNodeWeight(self):
     """Testing Gain calculation without any regularization."""
@@ -266,13 +266,13 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
 
       # We can't split node 1 on feature 1 and node 2 on feature 2 because of
       # the min node weight.
-      self.assertAllEqual([[2], [1]], sess.run(node_ids_list))
-      self.assertAllClose([[0.384314], [0.098013]], sess.run(gains_list))
-      self.assertAllEqual([[1], [1]], sess.run(thresholds_list))
+      self.assertAllEqual([[2], [1]], self.evaluate(node_ids_list))
+      self.assertAllClose([[0.384314], [0.098013]], self.evaluate(gains_list))
+      self.assertAllEqual([[1], [1]], self.evaluate(thresholds_list))
       self.assertAllClose([[[0.4852941]], [[-.6]]],
-                          sess.run(left_node_contribs_list))
+                          self.evaluate(left_node_contribs_list))
       self.assertAllClose([[[-0.75]], [[-0.014925]]],
-                          sess.run(right_node_contribs_list))
+                          self.evaluate(right_node_contribs_list))
 
   def testCalculateBestGainsWithMinNodeWeightNoSplitOnFeturePossible(self):
     """Testing Gain calculation without any regularization."""
@@ -311,9 +311,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            max_splits=max_splits)
 
       # We can't split either of the nodes on the first feature
-      self.assertEqual(2, len(sess.run(node_ids_list)))
-      self.assertAllEqual([], sess.run(node_ids_list)[0])
-      self.assertAllEqual([1], sess.run(node_ids_list)[1])
+      self.assertEqual(2, len(self.evaluate(node_ids_list)))
+      self.assertAllEqual([], self.evaluate(node_ids_list)[0])
+      self.assertAllEqual([1], self.evaluate(node_ids_list)[1])
 
       # Now check when we can't split on any feature
       (node_ids_list, _, _, _,
@@ -325,8 +325,9 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
            tree_complexity=0.0,
            min_node_weight=10,
            max_splits=max_splits)
-      self.assertAllEqual([[], []], sess.run(node_ids_list))
+      self.assertAllEqual([[], []], self.evaluate(node_ids_list))
 
+  @test_util.run_deprecated_v1
   def testMakeStatsSummarySimple(self):
     """Simple test for MakeStatsSummary."""
     with self.cached_session():
@@ -359,7 +360,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
               [[0., 0.], [.15, .36], [.06, .07], [.1, .2]],  # node 1
               [[-.33, .58], [0., 0.], [.3, .4], [0., 0.]],  # node 2
           ]],
-          result.eval())
+          self.evaluate(result))
 
   def testMakeStatsSummaryMultipleFeatures(self):
     """Tests that MakeStatsSummary works for multiple features."""
@@ -389,7 +390,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
                   [[.3, .4], [0., 0.], [-.4, .5], [.07, .08]],  # node 2
               ],  # feature 1
           ],
-          result.eval())
+          self.evaluate(result))
 
   def _verify_precision(self, length):
     with self.cached_session():
@@ -408,7 +409,7 @@ class StatsOpsTest(test_util.TensorFlowTestCase):
           node_ids, gradients, hessians, [bucketized_features], max_splits,
           num_buckets)  # shape=[max_splits, num_buckets, num_features, 2]
 
-      self.assertAllClose([[[[2., 0.2]]]], result.eval())
+      self.assertAllClose([[[[2., 0.2]]]], self.evaluate(result))
 
   def testMakeStatsSummaryNumericalPrecisionSmallBatch(self):
     """Tests numeric precision."""
diff --git a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
index ea022820e44e5461585c35a5bd4b9e256d923d13..afc0564fc5a7939d6a7ec7b3f4c3f2108c00ac92 100644
--- a/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
+++ b/tensorflow/python/kernel_tests/boosted_trees/training_ops_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.platform import googletest
 class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
   """Tests for growing tree ensemble from split candidates."""
 
+  @test_util.run_deprecated_v1
   def testGrowWithEmptyEnsemble(self):
     """Test growing an empty ensemble."""
     with self.cached_session() as session:
@@ -139,6 +140,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testBiasCenteringOnEmptyEnsemble(self):
     """Test growing with bias centering on an empty ensemble."""
     with self.cached_session() as session:
@@ -182,6 +184,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testGrowExistingEnsembleTreeNotFinalized(self):
     """Test growing an existing ensemble with the last tree not finalized."""
     with self.cached_session() as session:
@@ -366,6 +369,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testGrowExistingEnsembleTreeFinalized(self):
     """Test growing an existing ensemble with the last tree finalized."""
     with self.cached_session() as session:
@@ -515,6 +519,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPrePruning(self):
     """Test growing an existing ensemble with pre-pruning."""
     with self.cached_session() as session:
@@ -671,6 +676,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testMetadataWhenCantSplitDueToEmptySplits(self):
     """Test that the metadata is updated even though we can't split."""
     with self.cached_session() as session:
@@ -782,6 +788,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testMetadataWhenCantSplitDuePrePruning(self):
     """Test metadata is updated correctly when no split due to prepruning."""
     with self.cached_session() as session:
@@ -917,6 +924,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 1)
       self.assertProtoEquals(expected_result, tree_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningOfSomeNodes(self):
     """Test growing an ensemble with post-pruning."""
     with self.cached_session() as session:
@@ -1251,6 +1259,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       self.assertEqual(new_stamp, 3)
       self.assertProtoEquals(expected_result, res_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningOfAllNodes(self):
     """Test growing an ensemble with post-pruning, with all nodes are pruned."""
     with self.cached_session() as session:
@@ -1434,6 +1443,7 @@ class UpdateTreeEnsembleOpTest(test_util.TensorFlowTestCase):
       }
       """, res_ensemble)
 
+  @test_util.run_deprecated_v1
   def testPostPruningChangesNothing(self):
     """Test growing an ensemble with post-pruning with all gains >0."""
     with self.cached_session() as session:
diff --git a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
index 233c166405224cdfcdeaf5373d4638c4de6820a5..b9eb2391b490f659bd20e26a2c5b290ab4bfea1b 100644
--- a/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
+++ b/tensorflow/python/kernel_tests/broadcast_to_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.platform import test as test_lib
 
 class BroadcastToTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBroadcastToBasic(self):
     for dtype in [np.uint8, np.uint16, np.int8, np.int16, np.int32, np.int64]:
       with self.session(use_gpu=True):
@@ -37,6 +38,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
         v_np = np.broadcast_to(x, [3, 3])
         self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToString(self):
     with self.session(use_gpu=True):
       x = np.array([b"1", b"2", b"3"])
@@ -44,6 +46,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToBool(self):
     with self.session(use_gpu=True):
       x = np.array([True, False, True], dtype=np.bool)
@@ -51,6 +54,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToShape(self):
     for input_dim in range(1, 6):
       for output_dim in range(input_dim, 6):
@@ -62,6 +66,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
           v_np = np.broadcast_to(x, output_shape)
           self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToScalar(self):
     with self.session(use_gpu=True):
       x = np.array(1, dtype=np.int32)
@@ -69,6 +74,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
       v_np = np.broadcast_to(x, [3, 3])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastScalarToNonScalar(self):
     with self.session(use_gpu=True):
       x = np.array(1.0, dtype=np.float)
@@ -76,6 +82,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
       v_np = np.broadcast_to(x, [2, 3, 4])
       self.assertAllEqual(v_tf.eval(), v_np)
 
+  @test_util.run_deprecated_v1
   def testBroadcastToShapeTypeAndInference(self):
     for dtype in [dtypes.int32, dtypes.int64]:
       with self.cached_session(use_gpu=True):
@@ -89,6 +96,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
         # check shape inference when shape input is constant
         self.assertAllEqual(shape, v_np.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientForScalar(self):
     x = constant_op.constant(1, dtype=dtypes.float32)
     v = array_ops.broadcast_to(x, [2, 4, 3])
@@ -98,6 +106,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
                                                     out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSameRank(self):
     x = constant_op.constant(np.reshape(np.arange(6), (2, 1, 3)),
                              dtype=dtypes.float32)
@@ -108,6 +117,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithIncreasingRank(self):
     x = constant_op.constant([[1], [2]],
                              dtype=dtypes.float32)
@@ -118,6 +128,7 @@ class BroadcastToTest(test_util.TensorFlowTestCase):
                                                     out, out.get_shape())
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithBroadcastAllDimensions(self):
     x = constant_op.constant([[1, 2, 3], [4, 5, 6]], dtype=dtypes.float32)
     v = array_ops.broadcast_to(x, [5, 4, 6])
diff --git a/tensorflow/python/kernel_tests/bucketize_op_test.py b/tensorflow/python/kernel_tests/bucketize_op_test.py
index 57413e6af500f1c8fbecbfa46e3bb5e846d02d95..95df6943705d3bfcc1e6674782526d3d68fc577a 100644
--- a/tensorflow/python/kernel_tests/bucketize_op_test.py
+++ b/tensorflow/python/kernel_tests/bucketize_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -32,7 +33,7 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0, 3, 8, 11])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def testFloat(self):
     op = math_ops._bucketize(
@@ -40,7 +41,7 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0., 3., 8., 11.])
     expected_out = [0, 1, 1, 2, 2, 3, 3, 4, 4]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
   def test2DInput(self):
     op = math_ops._bucketize(
@@ -48,15 +49,16 @@ class BucketizationOpTest(test.TestCase):
         boundaries=[0, 3, 8, 11])
     expected_out = [[0, 1, 1, 2, 2], [3, 3, 4, 4, 1]]
     with self.session(use_gpu=True) as sess:
-      self.assertAllEqual(expected_out, sess.run(op))
+      self.assertAllEqual(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def testInvalidBoundariesOrder(self):
     op = math_ops._bucketize(
         constant_op.constant([-5, 0]), boundaries=[0, 8, 3, 11])
     with self.session(use_gpu=True) as sess:
       with self.assertRaisesRegexp(
           errors_impl.InvalidArgumentError, "Expected sorted boundaries"):
-        sess.run(op)
+        self.evaluate(op)
 
   def testBoundariesNotList(self):
     with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
index b19077db560363a22ab3c4c5400541edb9ab4600..fa6eb5c968965f0bd1f4e38ae8eec1d8f632d086 100644
--- a/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
+++ b/tensorflow/python/kernel_tests/candidate_sampler_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import candidate_sampling_ops
 from tensorflow.python.ops import math_ops
@@ -37,6 +38,7 @@ class RangeSamplerOpsTest(test.TestCase):
 
   TRUE_LABELS = [[1, 2], [0, 4], [3, 3]]
 
+  @test_util.run_deprecated_v1
   def testTrueCandidates(self):
     with self.cached_session() as sess:
       indices = constant_op.constant([0, 0, 1, 1, 2, 2])
@@ -55,7 +57,7 @@ class RangeSamplerOpsTest(test.TestCase):
           [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
       sampled_candidates, _, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
-      result = sampled_candidates.eval()
+      result = self.evaluate(sampled_candidates)
 
     expected_ids = [0, 1, 2, 3, 4]
     self.assertAllEqual(result, expected_ids)
@@ -68,7 +70,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, true_expected_count, _ = candidate_sampling_ops.all_candidate_sampler(
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       true_log_expected_count = math_ops.log(true_expected_count)
-      result = true_log_expected_count.eval()
+      result = self.evaluate(true_log_expected_count)
 
     self.assertAllEqual(result, [[0.0] * self.NUM_TRUE] * self.BATCH_SIZE)
     self.assertEqual(true_expected_count.get_shape(),
@@ -83,7 +85,7 @@ class RangeSamplerOpsTest(test.TestCase):
       _, _, sampled_expected_count = candidate_sampling_ops.all_candidate_sampler(  # pylint: disable=line-too-long
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       sampled_log_expected_count = math_ops.log(sampled_expected_count)
-      result = sampled_log_expected_count.eval()
+      result = self.evaluate(sampled_log_expected_count)
 
     self.assertAllEqual(result, [0.0] * self.NUM_SAMPLED)
     self.assertEqual(sampled_expected_count.get_shape(), [self.NUM_SAMPLED])
@@ -97,7 +99,7 @@ class RangeSamplerOpsTest(test.TestCase):
           true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True)
       accidental_hits = candidate_sampling_ops.compute_accidental_hits(
           true_classes, sampled_candidates, self.NUM_TRUE)
-      indices, ids, weights = sess.run(accidental_hits)
+      indices, ids, weights = self.evaluate(accidental_hits)
 
     self.assertEqual(1, accidental_hits[0].get_shape().ndims)
     self.assertEqual(1, accidental_hits[1].get_shape().ndims)
@@ -106,6 +108,7 @@ class RangeSamplerOpsTest(test.TestCase):
       self.assertTrue(id_ in self.TRUE_LABELS[index])
       self.assertLess(weight, -1.0e37)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
 
     def draw(seed):
@@ -114,7 +117,7 @@ class RangeSamplerOpsTest(test.TestCase):
             [[1, 2], [0, 4], [3, 3]], dtype=dtypes.int64)
         sampled, _, _ = candidate_sampling_ops.log_uniform_candidate_sampler(
             true_classes, self.NUM_TRUE, self.NUM_SAMPLED, True, 5, seed=seed)
-        return sampled.eval()
+        return self.evaluate(sampled)
 
     # Non-zero seed. Repeatable.
     for seed in [1, 12, 123, 1234]:
diff --git a/tensorflow/python/kernel_tests/cast_op_test.py b/tensorflow/python/kernel_tests/cast_op_test.py
index a5dff5df629900c5d6848d5cd10f5b727b96aaf0..b3187e1637193a8b34f7f3668220d94d783b6170 100644
--- a/tensorflow/python/kernel_tests/cast_op_test.py
+++ b/tensorflow/python/kernel_tests/cast_op_test.py
@@ -25,6 +25,7 @@ import platform
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -90,10 +91,12 @@ class CastOpTest(test.TestCase):
     if x.dtype == np.float32 or x.dtype == np.float64:
       self._testTypes(x, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testAll(np.arange(-10, 10).reshape(2, 10))
     self._testAll(np.linspace(-10, 10, 17))
 
+  @test_util.run_deprecated_v1
   def testSmallValues(self):
     f4 = np.finfo(np.float32)
     f8 = np.finfo(np.float64)
@@ -107,11 +110,12 @@ class CastOpTest(test.TestCase):
     a = np.random.uniform(-100, 100, 100).astype(np.float32)
     with self.cached_session(use_gpu=False):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
     with self.cached_session(use_gpu=True):
       b = math_ops.cast(math_ops.cast(a, dtypes.bfloat16), dtypes.float32)
-      self.assertAllClose(a, b.eval(), rtol=1 / 128.)
+      self.assertAllClose(a, self.evaluate(b), rtol=1 / 128.)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     self._testAll(np.random.normal(0, 10, 210).reshape([2, 3, 5, 7]))
     self._testAll(np.random.normal(0, 1e6, 210).reshape([2, 3, 5, 7]))
@@ -124,6 +128,7 @@ class CastOpTest(test.TestCase):
         self._cast(
             x, dst_dtype, use_gpu=use_gpu), dst_dtype(expected))
 
+  @test_util.run_deprecated_v1
   def testIntToFloatBoundary(self):
     i4 = np.iinfo(np.int32)
     i8 = np.iinfo(np.int64)
@@ -138,6 +143,7 @@ class CastOpTest(test.TestCase):
     self._compare(i8.max, np.float64, i8.max, False)
     # NOTE: GPU does not support int32/int64 for casting.
 
+  @test_util.run_deprecated_v1
   def testInfNan(self):
     i4 = np.iinfo(np.int32)
     i8 = np.iinfo(np.int64)
@@ -181,14 +187,16 @@ class CastOpTest(test.TestCase):
   def testNotImplemented(self):
     self._OpError(np.arange(0, 10), dtypes.string, "Cast.*int64.*string.*")
 
+  @test_util.run_deprecated_v1
   def testCastToTypeOfVariable(self):
     with self.cached_session() as sess:
       x = variables.Variable(5, dtype=dtypes.float32)
       y = variables.Variable(True, dtype=dtypes.bool)
       cast = math_ops.cast(y, x.dtype)
       variables.global_variables_initializer().run()
-      self.assertEqual(1.0, sess.run(cast))
+      self.assertEqual(1.0, self.evaluate(cast))
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     t = [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
     for src_t in t:
@@ -203,6 +211,7 @@ class CastOpTest(test.TestCase):
 
 class SparseTensorCastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCast(self):
     indices = constant_op.constant([[0], [1], [2]], dtypes.int64)
     values = constant_op.constant(np.array([1, 2, 3], np.int64))
@@ -229,7 +238,7 @@ class SaturateCastTest(test.TestCase):
               [lo, lo + 1, lo // 2, hi // 2, hi - 1, hi], dtype=in_type)
           y = math_ops.saturate_cast(x, dtype=out_type)
           self.assertEqual(y.dtype, out_type)
-          x, y = sess.run([x, y])
+          x, y = self.evaluate([x, y])
           correct = np.maximum(out_type.min, np.minimum(out_type.max, x))
           self.assertAllEqual(correct, y)
 
diff --git a/tensorflow/python/kernel_tests/check_ops_test.py b/tensorflow/python/kernel_tests/check_ops_test.py
index 88f5cd6f22339dbac4c6f9ec6ea2490e9bd8e7c1..95bac85027bd1709420dcfc7f96f92195f8f2472 100644
--- a/tensorflow/python/kernel_tests/check_ops_test.py
+++ b/tensorflow/python/kernel_tests/check_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -39,6 +40,69 @@ from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
+class AssertV2Asserts(test.TestCase):
+
+  def test_passes_when_it_should(self):
+    # This is a v2 test and need to run eagerly
+    with context.eager_mode():
+      c1 = constant_op.constant(-1, name="minus_one", dtype=dtypes.int32)
+      c2 = constant_op.constant(2, name="two", dtype=dtypes.int32)
+      c3 = constant_op.constant([3., 3.], name="three", dtype=dtypes.float32)
+      c4 = constant_op.constant([3., 3.5], name="three_and_a_half",
+                                dtype=dtypes.float32)
+      scalar = c1
+      non_scalar = c3
+      integer = c1
+      non_integer = c3
+      positive = c2
+      negative = c1
+      cases = [
+          (check_ops.assert_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_less_v2, (c1, c2), (c1, c1)),
+          (check_ops.assert_near_v2, (c3, c3), (c3, c4)),
+          (check_ops.assert_greater_v2, (c2, c1), (c1, c1)),
+          (check_ops.assert_negative_v2, (negative,), (positive,)),
+          (check_ops.assert_positive_v2, (positive,), (negative,)),
+          (check_ops.assert_less_equal_v2, (c1, c1), (c2, c1)),
+          (check_ops.assert_none_equal_v2, (c1, c2), (c3, c4)),
+          (check_ops.assert_non_negative_v2, (positive,), (negative,)),
+          (check_ops.assert_non_positive_v2, (negative,), (positive,)),
+          (check_ops.assert_greater_equal_v2, (c1, c1), (c1, c2)),
+          (check_ops.assert_type_v2, (c1, dtypes.int32), (c1, dtypes.float32),
+           TypeError),
+          (check_ops.assert_integer_v2, (integer,), (non_integer,),
+           TypeError),
+          (check_ops.assert_scalar_v2, (scalar,), (non_scalar,),
+           ValueError),
+          (check_ops.assert_rank_v2, (c1, 0), (c3, 2), ValueError),
+          (check_ops.assert_rank_in_v2, (c1, [0, 1]), (c1, [1, 2]),
+           ValueError),
+          (check_ops.assert_rank_at_least_v2, (non_scalar, 1), (scalar, 1),
+           ValueError),
+      ]
+
+      for case in cases:
+        fn = case[0]
+        passing_args = case[1]
+        failing_args = case[2]
+        error = errors.InvalidArgumentError if len(case) < 4 else case[3]
+
+        print("Testing %s passing properly." % fn)
+
+        fn(*passing_args)
+
+        print("Testing %s failing properly." % fn)
+
+        @def_function.function
+        def failing_fn():
+          fn(*failing_args, message="fail")  # pylint: disable=cell-var-from-loop
+
+        with self.assertRaisesRegexp(error, "fail"):
+          failing_fn()
+
+        del failing_fn
+
+
 class AssertProperIterableTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -109,6 +173,7 @@ class AssertEqualTest(test.TestCase):
       assert x is None
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     # Static check
     static_small = constant_op.constant([1, 2], name="small")
@@ -116,6 +181,7 @@ class AssertEqualTest(test.TestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
+  @test_util.run_deprecated_v1
   def test_raises_when_greater_dynamic(self):
     with self.cached_session():
       small = array_ops.placeholder(dtypes.int32, name="small")
@@ -187,6 +253,7 @@ First 2 elements of y:
                                summarize=2)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     # Static check
     static_small = constant_op.constant([3, 1], name="small")
@@ -194,6 +261,7 @@ First 2 elements of y:
     with self.assertRaisesRegexp(errors.InvalidArgumentError, "fail"):
       check_ops.assert_equal(static_big, static_small, message="fail")
 
+  @test_util.run_deprecated_v1
   def test_raises_when_less_dynamic(self):
     with self.cached_session():
       small = array_ops.placeholder(dtypes.int32, name="small")
@@ -253,6 +321,7 @@ class AssertNoneEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([3, 1], name="small")
     with self.assertRaisesOpError("x != y did not hold"):
@@ -442,6 +511,7 @@ class AssertAllCloseTest(test.TestCase):
 class AssertLessTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([1, 2], name="small")
     with self.assertRaisesOpError("failure message.*\n*.* x < y did not hold"):
@@ -452,6 +522,7 @@ class AssertLessTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -518,6 +589,7 @@ class AssertLessEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_greater(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -573,6 +645,7 @@ class AssertLessEqualTest(test.TestCase):
 class AssertGreaterTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_equal(self):
     small = constant_op.constant([1, 2], name="small")
     with self.assertRaisesOpError("fail"):
@@ -583,6 +656,7 @@ class AssertGreaterTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -642,6 +716,7 @@ class AssertGreaterEqualTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_less(self):
     small = constant_op.constant([1, 2], name="small")
     big = constant_op.constant([3, 4], name="big")
@@ -706,6 +781,7 @@ class AssertNegativeTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_positive(self):
     doug = constant_op.constant([1, 2], name="doug")
     with self.assertRaisesOpError("fail"):
@@ -716,6 +792,7 @@ class AssertNegativeTest(test.TestCase):
       self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_zero(self):
     claire = constant_op.constant([0], name="claire")
     with self.assertRaisesOpError("x < 0 did not hold"):
@@ -738,6 +815,7 @@ class AssertNegativeTest(test.TestCase):
 class AssertPositiveTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_negative(self):
     freddie = constant_op.constant([-1, -2], name="freddie")
     with self.assertRaisesOpError("fail"):
@@ -755,6 +833,7 @@ class AssertPositiveTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_zero(self):
     meechum = constant_op.constant([0], name="meechum")
     with self.assertRaisesOpError("x > 0 did not hold"):
@@ -777,26 +856,31 @@ class AssertPositiveTest(test.TestCase):
 class EnsureShapeTest(test.TestCase):
 
   # Static shape inference
+  @test_util.run_deprecated_v1
   def testStaticShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     ensure_shape_op = check_ops.ensure_shape(placeholder, (3, 3, 3))
     self.assertEqual(ensure_shape_op.get_shape(), (3, 3, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_MergesShapes(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     ensure_shape_op = check_ops.ensure_shape(placeholder, (5, 4, None))
     self.assertEqual(ensure_shape_op.get_shape(), (5, 4, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_RaisesErrorWhenRankIncompatible(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     with self.assertRaises(ValueError):
       check_ops.ensure_shape(placeholder, (2, 3))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_RaisesErrorWhenDimIncompatible(self):
     placeholder = array_ops.placeholder(dtypes.int32, shape=(None, None, 3))
     with self.assertRaises(ValueError):
       check_ops.ensure_shape(placeholder, (2, 2, 4))
 
+  @test_util.run_deprecated_v1
   def testStaticShape_CanSetUnknownShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -804,6 +888,7 @@ class EnsureShapeTest(test.TestCase):
     self.assertEqual(ensure_shape_op.get_shape(), None)
 
   # Dynamic shape check
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_RaisesError(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = math_ops.divide(placeholder, 3, name="MyDivide")
@@ -816,6 +901,7 @@ class EnsureShapeTest(test.TestCase):
           r"expected shape \[3,3,3\]."):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_RaisesErrorDimUnknown(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -828,6 +914,7 @@ class EnsureShapeTest(test.TestCase):
           r"expected shape \[\?,\?,3\]."):
         sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -836,6 +923,7 @@ class EnsureShapeTest(test.TestCase):
     with self.cached_session() as sess:
       sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testEnsuresDynamicShape_WithUnknownDims(self):
     placeholder = array_ops.placeholder(dtypes.int32)
     derived = placeholder / 3
@@ -844,6 +932,7 @@ class EnsureShapeTest(test.TestCase):
     with self.cached_session() as sess:
       sess.run(derived, feed_dict={placeholder: feed_val})
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     placeholder = array_ops.placeholder(dtypes.float32)
     derived = check_ops.ensure_shape(placeholder, (None, None))
@@ -939,6 +1028,7 @@ class AssertRankTest(test.TestCase):
               tensor, desired_rank, message="fail")]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -957,6 +1047,7 @@ class AssertRankTest(test.TestCase):
         [check_ops.assert_rank(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -974,6 +1065,7 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_large_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -991,6 +1083,7 @@ class AssertRankTest(test.TestCase):
         [check_ops.assert_rank(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1008,6 +1101,7 @@ class AssertRankTest(test.TestCase):
           [check_ops.assert_rank(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1023,6 +1117,7 @@ class AssertRankTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
       check_ops.assert_rank(tensor, np.array([], dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1040,6 +1135,7 @@ class AssertRankTest(test.TestCase):
                                  "must be of type <dtype: 'int32'>"):
       check_ops.assert_rank(tensor, .5)
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1063,6 +1159,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank0, (1, 2), message="fail")]):
         self.evaluate(array_ops.identity(tensor_rank0))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_mismatch_dynamic_rank(self):
     with self.cached_session():
       tensor_rank0 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1079,6 +1176,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank0, desired_ranks)]):
         self.evaluate(array_ops.identity(tensor_rank0))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank0 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1095,6 +1193,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank1, desired_ranks)]):
         self.evaluate(array_ops.identity(tensor_rank1))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_matches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank1 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1113,6 +1212,7 @@ class AssertRankInTest(test.TestCase):
           check_ops.assert_rank_in(tensor_rank1, (0, 2))]):
         self.evaluate(array_ops.identity(tensor_rank1))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_mismatches_dynamic_rank(self):
     with self.cached_session():
       tensor_rank1 = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1132,6 +1232,7 @@ class AssertRankInTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Rank must be a scalar"):
       check_ops.assert_rank_in(tensor, desired_ranks)
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_scalar_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1154,6 +1255,7 @@ class AssertRankInTest(test.TestCase):
                                  "must be of type <dtype: 'int32'>"):
       check_ops.assert_rank_in(tensor, (1, .5,))
 
+  @test_util.run_deprecated_v1
   def test_raises_if_rank_is_not_integer_dynamic(self):
     with self.cached_session():
       tensor = constant_op.constant(
@@ -1177,6 +1279,7 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1194,6 +1297,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_zero_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1210,6 +1314,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_ten_doesnt_raise_if_rank_too_large_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1226,6 +1331,7 @@ class AssertRankAtLeastTest(test.TestCase):
         [check_ops.assert_rank_at_least(tensor, desired_rank)]):
       self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_doesnt_raise_if_rank_just_right_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1243,6 +1349,7 @@ class AssertRankAtLeastTest(test.TestCase):
           [check_ops.assert_rank_at_least(tensor, desired_rank)]):
         self.evaluate(array_ops.identity(tensor))
 
+  @test_util.run_deprecated_v1
   def test_rank_one_tensor_raises_if_rank_too_small_dynamic_rank(self):
     with self.cached_session():
       tensor = array_ops.placeholder(dtypes.float32, name="my_tensor")
@@ -1256,6 +1363,7 @@ class AssertRankAtLeastTest(test.TestCase):
 class AssertNonNegativeTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_negative(self):
     zoe = constant_op.constant([-1, -2], name="zoe")
     with self.assertRaisesOpError("x >= 0 did not hold"):
@@ -1292,6 +1400,7 @@ class AssertNonPositiveTest(test.TestCase):
     self.evaluate(out)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_raises_when_positive(self):
     rachel = constant_op.constant([0, 2], name="rachel")
     with self.assertRaisesOpError("x <= 0 did not hold"):
diff --git a/tensorflow/python/kernel_tests/checkpoint_ops_test.py b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
index 51611b75afb051b2f69abb1749c18b3cbf1f66a0..6e289bf9b780ae2ba16f400cc001ddce59f547b3 100644
--- a/tensorflow/python/kernel_tests/checkpoint_ops_test.py
+++ b/tensorflow/python/kernel_tests/checkpoint_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_checkpoint_ops
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
@@ -48,6 +49,7 @@ class GenerateVocabRemappingTest(test.TestCase):
     with open(self.old_vocab_file, 'w') as f:
       f.write('\n'.join(['knitting', 'eminem', 'MISSING']) + '\n')
 
+  @test_util.run_deprecated_v1
   def test_generate_remapping_with_no_vocab_changes(self):
     """Tests where vocab does not change at all."""
     remapping, num_present = gen_checkpoint_ops.generate_vocab_remapping(
@@ -58,8 +60,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = range(0, 3)
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_shifted_vocab(self):
     """Tests where vocab is the same, but shifted / ordered differently."""
@@ -71,8 +73,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [2, 0, 1]
     expected_num_present = 3
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_offset(self):
     """Tests offset and num_new_vocab logic."""
@@ -84,8 +86,8 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [0]
     expected_num_present = 1
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
   def test_generate_remapping_with_old_vocab_size(self):
     """Tests where old_vocab_size is specified."""
@@ -99,10 +101,11 @@ class GenerateVocabRemappingTest(test.TestCase):
     expected_remapping = [-1, 0, 1]
     expected_num_present = 2
     with self.cached_session():
-      self.assertAllEqual(expected_remapping, remapping.eval())
-      self.assertAllEqual(expected_num_present, num_present.eval())
+      self.assertAllEqual(expected_remapping, self.evaluate(remapping))
+      self.assertAllEqual(expected_num_present, self.evaluate(num_present))
 
 
+@test_util.run_v1_only('b/120545219')
 class LoadAndRemapMatrixTest(test.TestCase):
   """Tests for the load_and_remap_matrix() op."""
 
@@ -142,7 +145,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=self.old_num_cols)
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # No row remapping, new weight matrix has third col, then first col.
     row_remapping = list(range(self.old_num_rows))
@@ -157,7 +160,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
     # Both row and column remappings.
     row_remapping = [1, 0, 4]
@@ -172,7 +175,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_cols=len(col_remapping))
     with self.cached_session():
       self.assertAllClose(self.matrix_value[row_remapping][:, col_remapping],
-                          remapped_matrix.eval())
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_with_init(self):
     """Tests the op's load and remap where there are missing entries."""
@@ -190,7 +193,8 @@ class LoadAndRemapMatrixTest(test.TestCase):
         [33, init_val, init_val, init_val, 1, init_val], [3, 2])
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows(self):
     """Tests when all the rows are missing and need to be initialized."""
@@ -207,7 +211,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, self.old_num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_all_missing_rows_and_cols(self):
     """Tests when all the rows & cols are missing and need to be initialized."""
@@ -225,7 +229,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
     with self.cached_session():
       self.assertAllClose(
           np.reshape(initializing_values, (num_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_invalid_remapping(self):
     """Tests that errors are raised when an ID maps to multiple new IDs.
@@ -244,7 +248,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=len(invalid_remapping),
         num_cols=self.old_num_cols)
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     # Invalid column remapping.
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
@@ -256,7 +260,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=self.old_num_rows,
         num_cols=len(invalid_remapping))
     with self.cached_session(), self.assertRaises(errors.UnimplementedError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
   def test_load_and_remap_incorrect_initializing_values(self):
     """Tests that errors are raised with incorrect number of init values."""
@@ -273,7 +277,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
     remapped_matrix = gen_checkpoint_ops.load_and_remap_matrix(
         ckpt_path=[self.bundle_file],
@@ -285,7 +289,7 @@ class LoadAndRemapMatrixTest(test.TestCase):
         num_rows=3,
         num_cols=2)
     with self.cached_session(), self.assertRaises(errors.InvalidArgumentError):
-      remapped_matrix.eval()
+      self.evaluate(remapped_matrix)
 
 
 class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
@@ -324,7 +328,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           num_rows=num_rows,
           num_cols=num_cols,
           max_rows_in_memory=max_rows_in_memory)
-      self.assertAllClose(np_value[::-1], remapped_matrix.eval())
+      self.assertAllClose(np_value[::-1], self.evaluate(remapped_matrix))
 
       # Tests loading the tensor (except for the first and last rows), with
       # uninitialized values. Requires num_rows to be at least 3 since we're
@@ -348,7 +352,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           np.vstack([
               np.tile(42, [prefix_rows, num_cols]), np_value[1:-1],
               np.tile(42, [suffix_rows, num_cols])
-          ]), remapped_matrix.eval())
+          ]), self.evaluate(remapped_matrix))
 
       # Tests when everything is taken from initializing_values.
       new_rows = 7
@@ -365,8 +369,9 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
           max_rows_in_memory=max_rows_in_memory)
       self.assertAllClose(
           np.reshape(initializing_values, (new_rows, num_cols)),
-          remapped_matrix.eval())
+          self.evaluate(remapped_matrix))
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_divisible_by_max_rows(self):
     """Tests loading normal var when rows are evenly divisible by max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -375,6 +380,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 9 is evenly divisible by 3.
         max_rows_in_memory=3)
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_not_divisible_by_max_rows(self):
     """Tests loading normal var when rows aren't divisible by max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -383,6 +389,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 9 is not evenly divisible by 4.
         max_rows_in_memory=4)
 
+  @test_util.run_deprecated_v1
   def test_loading_rows_less_than_max_rows(self):
     """Tests loading normal var as a single slice.
 
@@ -394,6 +401,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # 10 > 9.
         max_rows_in_memory=10)
 
+  @test_util.run_deprecated_v1
   def test_loading_no_max_rows(self):
     """Tests loading normal var as a single slice with no valid max_rows."""
     self._test_loading_variable_with_max_rows(
@@ -401,6 +409,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         partitioner=None,
         max_rows_in_memory=-1)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_equals_max_rows(self):
     """Tests loading partitioned var sliced on partition boundary."""
     self._test_loading_variable_with_max_rows(
@@ -410,6 +419,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # exactly 3 rows.
         max_rows_in_memory=3)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_greater_than_max_rows(self):
     """Tests loading partitioned var with more slices than partitions."""
     self._test_loading_variable_with_max_rows(
@@ -419,6 +429,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         # row at a time.
         max_rows_in_memory=1)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_less_than_max_rows(self):
     """Tests loading partitioned var as a single slice.
 
@@ -429,6 +440,7 @@ class LoadAndRemapMatrixWithMaxRowsTest(test.TestCase):
         partitioner=partitioned_variables.fixed_size_partitioner(3),
         max_rows_in_memory=10)
 
+  @test_util.run_deprecated_v1
   def test_loading_partitions_no_max_rows(self):
     """Tests loading partitioned var as single slice with no valid max_rows."""
     self._test_loading_variable_with_max_rows(
diff --git a/tensorflow/python/kernel_tests/cholesky_op_test.py b/tensorflow/python/kernel_tests/cholesky_op_test.py
index e96b2772665815a6ea8da5d0a12ac132e69f36e8..a08cfe960d005451ab5a02aff02e90a0fbcb92a0 100644
--- a/tensorflow/python/kernel_tests/cholesky_op_test.py
+++ b/tensorflow/python/kernel_tests/cholesky_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
@@ -97,7 +98,7 @@ def TriAngInvCompositeGrad(l, grad):
 class CholeskyOpTest(test.TestCase):
 
   def _verifyCholeskyBase(self, sess, x, chol, verification):
-    chol_np, verification_np = sess.run([chol, verification])
+    chol_np, verification_np = self.evaluate([chol, verification])
     self.assertAllClose(x, verification_np)
     self.assertShapeEqual(x, chol)
     # Check that the cholesky is lower triangular, and has positive diagonal
@@ -145,6 +146,7 @@ class CholeskyOpTest(test.TestCase):
       matrices[i] = np.dot(matrices[i].T.conj(), matrices[i])
     self._verifyCholesky(matrices)
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     with self.assertRaises(ValueError):
       linalg_ops.cholesky(np.array([[1., 2., 3.], [3., 4., 5.]]))
@@ -153,6 +155,7 @@ class CholeskyOpTest(test.TestCase):
           np.array([[[1., 2., 3.], [3., 4., 5.]], [[1., 2., 3.], [3., 4., 5.]]
                    ]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     tensor3 = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
@@ -175,6 +178,7 @@ class CholeskyOpTest(test.TestCase):
     self._verifyCholesky(np.empty([0, 2, 2]))
     self._verifyCholesky(np.empty([2, 0, 0]))
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
@@ -183,8 +187,8 @@ class CholeskyOpTest(test.TestCase):
       matrix2 = math_ops.matmul(matrix2, matrix2, adjoint_a=True)
       c1 = linalg_ops.cholesky(matrix1)
       c2 = linalg_ops.cholesky(matrix2)
-      c1_val, c2_val = sess.run([c1, c2])
-      self.assertAllEqual(c1_val, c2_val)
+      c1_val, c2_val = self.evaluate([c1, c2])
+      self.assertAllClose(c1_val, c2_val)
 
 
 class CholeskyGradTest(test.TestCase):
@@ -193,18 +197,21 @@ class CholeskyGradTest(test.TestCase):
   def getShapes(self, shapeList):
     return ((elem, int(np.floor(1.2 * elem))) for elem in shapeList)
 
+  @test_util.run_deprecated_v1
   def testSmallMatrices(self):
     np.random.seed(0)
     shapes = self.getShapes([1, 2, 10])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float32, dtypes_lib.float64))
 
+  @test_util.run_deprecated_v1
   def testSmallMatricesComplex(self):
     np.random.seed(0)
     shapes = self.getShapes([1, 2, 10])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.complex64, dtypes_lib.complex128))
 
+  @test_util.run_deprecated_v1
   def testOneBlockMatrices(self):
     np.random.seed(0)
     shapes = self.getShapes([self._backprop_block_size + 1])
@@ -213,24 +220,28 @@ class CholeskyGradTest(test.TestCase):
         dtypes=(dtypes_lib.float32, dtypes_lib.float64),
         scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixFloat(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float32,), scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixDouble(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.float64,), scalarTest=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testTwoBlockMatrixComplexFloat(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
     self.runFiniteDifferences(
         shapes, dtypes=(dtypes_lib.complex64,), scalarTest=True)
 
+  @test_util.run_deprecated_v1
   def testTwoBlockMatrixComplexDouble(self):
     np.random.seed(0)
     shapes = self.getShapes([2 * self._backprop_block_size + 1])
diff --git a/tensorflow/python/kernel_tests/clip_ops_test.py b/tensorflow/python/kernel_tests/clip_ops_test.py
index efd7eee84743f36bae7ed224759b5c7a5a2bcb9d..45f1e6152a2a335a83dec1f385354df123a192bf 100644
--- a/tensorflow/python/kernel_tests/clip_ops_test.py
+++ b/tensorflow/python/kernel_tests/clip_ops_test.py
@@ -24,10 +24,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
 
@@ -55,7 +57,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-4.4, 2.0, 3.0], [4.0, 4.4, 4.4]]
       clip_value = 4.4
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -71,7 +73,7 @@ class ClipTest(test.TestCase):
         clip_value_min = 2
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -88,7 +90,7 @@ class ClipTest(test.TestCase):
             [2, 2, 2, 3, 3, 3], shape=[2, 3], dtype=dtype)
         clip_value_max = 4
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -105,7 +107,7 @@ class ClipTest(test.TestCase):
         clip_value_max = constant_op.constant(
             [6, 6, 6, 6, 6, 6], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -123,7 +125,7 @@ class ClipTest(test.TestCase):
         clip_value_max = constant_op.constant(
             [5, 5, 5, 7, 7, 7], shape=[2, 3], dtype=dtype)
         ans = clip_ops.clip_by_value(x, clip_value_min, clip_value_max)
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
 
       self.assertAllClose(np_ans, tf_ans)
 
@@ -144,7 +146,7 @@ class ClipTest(test.TestCase):
       np_ans = [float('NaN'), 4.0, -4.0]
       clip_value = 4.0
       ans = clip_ops.clip_by_value(x, -clip_value, clip_value)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -157,14 +159,15 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 0.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans_tensor = ans.eval()
+      tf_ans_tensor = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
     self.assertAllClose(np_ans, tf_ans_tensor)
 
+  @test_util.run_deprecated_v1
   def testClipByNormGradientZeros(self):
     with self.session(use_gpu=True):
       x = array_ops.zeros([3])
@@ -188,7 +191,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -200,7 +203,7 @@ class ClipTest(test.TestCase):
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -212,7 +215,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.4, 0.0, 0.0], [3.2, 0.0, 3.0]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [0])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -224,7 +227,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [3.2, 0.0, 2.4]]
       clip_norm = 4.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -236,11 +239,12 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 3.0]]
       clip_norm = 6.0
       ans = clip_ops.clip_by_norm(x, clip_norm, [1])
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
   # ClipByGlobalNorm tests
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormClipped(self):
     # Norm clipping when clip_norm < 5
     with self.session(use_gpu=True):
@@ -256,12 +260,13 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormClippedTensor(self):
     # Norm clipping when clip_norm < 5
     with self.session(use_gpu=True):
@@ -277,12 +282,13 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm((x0, x1), clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormSupportsNone(self):
     # Norm clipping when clip_norm < 5
     with self.session(use_gpu=True):
@@ -300,12 +306,13 @@ class ClipTest(test.TestCase):
       self.assertTrue(ans[3] is None)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[2].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormWithIndexedSlicesClipped(self):
     # Norm clipping when clip_norm < 5
     with self.session(use_gpu=True):
@@ -322,7 +329,7 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].values.eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
@@ -339,6 +346,7 @@ class ClipTest(test.TestCase):
     self.assertEqual(dense_shape, slices.dense_shape)
     self.assertEqual(dense_shape, modified_slices.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormNotClipped(self):
     # No norm clipping when clip_norm >= 5
     with self.session(use_gpu=True):
@@ -352,12 +360,13 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 5.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormZero(self):
     # No norm clipping when norm = 0
     with self.session(use_gpu=True):
@@ -371,12 +380,13 @@ class ClipTest(test.TestCase):
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       tf_ans_1 = ans[0].eval()
       tf_ans_2 = ans[1].eval()
-      tf_norm = norm.eval()
+      tf_norm = self.evaluate(norm)
 
     self.assertAllClose(tf_norm, 0.0)
     self.assertAllClose(np_ans_0, tf_ans_1)
     self.assertAllClose(np_ans_1, tf_ans_2)
 
+  @test_util.run_deprecated_v1
   def testClipByGlobalNormInf(self):
     with self.session(use_gpu=True):
       x0 = constant_op.constant([-2.0, 0.0, np.inf, 4.0, 0.0, 0.0],
@@ -386,7 +396,7 @@ class ClipTest(test.TestCase):
 
       ans, norm = clip_ops.clip_by_global_norm([x0, x1], clip_norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
-        norm.eval()
+        self.evaluate(norm)
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
         ans[0].eval()
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "global norm"):
@@ -400,7 +410,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = 0.8
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -412,7 +422,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
       clip_norm = constant_op.constant(0.8)
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -424,7 +434,7 @@ class ClipTest(test.TestCase):
       np_ans = [[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
@@ -436,10 +446,26 @@ class ClipTest(test.TestCase):
       np_ans = [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]]
       clip_norm = 0.9
       ans = clip_ops.clip_by_average_norm(x, clip_norm)
-      tf_ans = ans.eval()
+      tf_ans = self.evaluate(ans)
 
     self.assertAllClose(np_ans, tf_ans)
 
+  def testClipByAverageNormReplacedWithClipByNorm(self):
+    # Check clip_by_average_norm(t) is the same as
+    # clip_by_norm(t, clip_norm * tf.to_float(tf.size(t)))
+    with self.session(use_gpu=True):
+      x = constant_op.constant([-3.0, 0.0, 0.0, 4.0, 0.0, 0.0], shape=[2, 3])
+      # Average norm of x = sqrt(3^2 + 4^2) / 6 = 0.83333333
+      # expected answer [[-2.88, 0.0, 0.0], [3.84, 0.0, 0.0]]
+      clip_norm = constant_op.constant(0.8)
+      with_norm = clip_ops.clip_by_average_norm(x, clip_norm)
+      without_norm = clip_ops.clip_by_norm(
+          x, clip_norm * math_ops.to_float(array_ops.size(x)))
+      clip_by_average_norm_ans = self.evaluate(with_norm)
+      clip_by_norm_ans = self.evaluate(without_norm)
+      self.assertAllClose(clip_by_average_norm_ans, clip_by_norm_ans)
+
+  @test_util.run_deprecated_v1
   def testClipByValueEmptyTensor(self):
     # Test case for GitHub issue 19337
     zero = array_ops.placeholder(dtype=dtypes.float32, shape=None)
diff --git a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
index f27a0fc47221d7b200e91b5510c99d9dde3f7d57..215ea97f36d5fc72581f1ad96e7e68166e12e08c 100644
--- a/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
+++ b/tensorflow/python/kernel_tests/compare_and_bitpack_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -30,15 +31,15 @@ class CompareAndBitpackTest(test.TestCase):
                              x, threshold,
                              truth,
                              expected_err_re=None):
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       ans = math_ops.compare_and_bitpack(x, threshold)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertShapeEqual(truth, ans)
         self.assertAllEqual(tf_ans, truth)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBasic(self, dtype):
     rows = 371
diff --git a/tensorflow/python/kernel_tests/concat_op_test.py b/tensorflow/python/kernel_tests/concat_op_test.py
index 92d09986e6cf191acaf956fa5c4606155b9cfd0d..474760a93ff84be698388a7784f66445c21cd8ca 100644
--- a/tensorflow/python/kernel_tests/concat_op_test.py
+++ b/tensorflow/python/kernel_tests/concat_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -34,6 +35,7 @@ from tensorflow.python.platform import test
 
 class ConcatOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHStack(self):
     with self.session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
@@ -49,6 +51,7 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[:4, :], params[p1])
     self.assertAllEqual(result[4:, :], params[p2])
 
+  @test_util.run_deprecated_v1
   def testVStack(self):
     with self.session(use_gpu=True):
       p1 = array_ops.placeholder(dtypes.float32, shape=[4, 4])
@@ -65,25 +68,25 @@ class ConcatOpTest(test.TestCase):
     self.assertAllEqual(result[:, 4:], params[p2])
 
   def testInt32GPU(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       p1 = np.random.rand(2, 3).astype("i")
       p2 = np.random.rand(2, 3).astype("i")
       x1 = constant_op.constant(p1)
       x2 = constant_op.constant(p2)
       c = array_ops.concat([x1, x2], 0)
-      result = c.eval()
+      result = self.evaluate(c)
     self.assertAllEqual(result[:2, :], p1)
     self.assertAllEqual(result[2:, :], p2)
 
   def testRefType(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       p1 = np.random.rand(4, 4).astype("f")
       p2 = np.random.rand(4, 4).astype("f")
       v1 = variables.Variable(p1)
       v2 = variables.Variable(p2)
       c = array_ops.concat([v1, v2], 0)
-      variables.global_variables_initializer().run()
-      result = c.eval()
+      self.evaluate(variables.global_variables_initializer())
+      result = self.evaluate(c)
 
     self.assertEqual(result.shape, c.get_shape())
     self.assertAllEqual(result[:4, :], p1)
@@ -137,6 +140,7 @@ class ConcatOpTest(test.TestCase):
       else:
         self.assertAllClose(result[ind], params[p[i]], 0.01)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     self._testRandom(dtypes.bool)
     self._testRandom(dtypes.float32)
@@ -147,6 +151,7 @@ class ConcatOpTest(test.TestCase):
     self._testRandom(dtypes.complex64)
     self._testRandom(dtypes.complex128)
 
+  @test_util.run_deprecated_v1
   def testInvalidConcatDimTypeAndShape(self):
     a = variables.Variable(constant_op.constant(1.0, shape=[1]))
     b = variables.Variable(constant_op.constant(2.0, shape=[1]))
@@ -172,7 +177,7 @@ class ConcatOpTest(test.TestCase):
     # Test both positive and negative concat axis.
     # -2 and 1 correspond to the same axis for 3-dimensional tensors.
     for axis in [-2, 1]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
@@ -195,15 +200,17 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsSimple(self):
     self._testGradientsSimple(dtypes.float32)
     self._testGradientsSimple(dtypes.complex64)
 
+  @test_util.run_deprecated_v1
   def testGradientsFirstDim(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       inp = []
       inp_tensors = []
       for x in [1, 2, 6]:
@@ -222,15 +229,16 @@ class ConcatOpTest(test.TestCase):
           grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, 0)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsLastDim(self):
     # Test both positive and negative concat axis.
     # -1 and 2 correspond to the same axis for 3-dimensional tensors.
     for axis in [-1, 2]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         inp = []
         inp_tensors = []
         for x in [1, 2, 6]:
@@ -249,7 +257,7 @@ class ConcatOpTest(test.TestCase):
             grad_inp.flatten(), shape=output_shape)
         grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
         concated_grad = array_ops.concat(grad, axis)
-        result = concated_grad.eval()
+        result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
@@ -261,7 +269,7 @@ class ConcatOpTest(test.TestCase):
     # Random dim to concat on
     concat_dim = np.random.randint(5)
     concat_dim_sizes = np.random.randint(1, 5, size=num_tensors)
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       inp = []
       inp_tensors = []
       for x in concat_dim_sizes:
@@ -279,14 +287,16 @@ class ConcatOpTest(test.TestCase):
       grad_tensor = constant_op.constant(grad_inp.flatten(), shape=output_shape)
       grad = gradients_impl.gradients([c], inp_tensors, [grad_tensor])
       concated_grad = array_ops.concat(grad, concat_dim)
-      result = concated_grad.eval()
+      result = self.evaluate(concated_grad)
 
     self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsRandom(self):
     for _ in range(5):
       self._RunAndVerifyGradientsRandom()
 
+  @test_util.run_deprecated_v1
   def testGradientWithUnknownInputDim(self):
     with self.session(use_gpu=True):
       x = array_ops.placeholder(dtypes.float32)
@@ -308,6 +318,7 @@ class ConcatOpTest(test.TestCase):
 
       self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testShapeError(self):
     # Rank doesn't match.
     with self.assertRaises(ValueError):
@@ -337,6 +348,7 @@ class ConcatOpTest(test.TestCase):
            constant_op.constant(20.0, shape=[4, 4, 4])
           ], -4)
 
+  @test_util.run_deprecated_v1
   def testShapeWithUnknownConcatDim(self):
     p1 = array_ops.placeholder(dtypes.float32)
     c1 = constant_op.constant(10.0, shape=[4, 4, 4, 4])
@@ -355,10 +367,11 @@ class ConcatOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.concat([p1, c1, p2, c3], dim)
 
+  @test_util.run_deprecated_v1
   def testZeroSize(self):
     # Verify that concat doesn't crash and burn for zero size inputs
     np.random.seed(7)
-    with self.session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       for shape0 in (), (2,):
         axis = len(shape0)
         for shape1 in (), (3,):
@@ -370,12 +383,13 @@ class ConcatOpTest(test.TestCase):
               # TODO(irving): Make tf.concat handle map, then drop list().
               xs = list(map(constant_op.constant, [x0, x1]))
               c = array_ops.concat(xs, axis)
-              self.assertAllEqual(c.eval(), correct)
+              self.assertAllEqual(self.evaluate(c), correct)
               # Check gradients
               dc = np.random.randn(*c.get_shape().as_list())
-              dxs = sess.run(gradients_impl.gradients(c, xs, dc))
+              dxs = self.evaluate(gradients_impl.gradients(c, xs, dc))
               self.assertAllEqual(dc, np.concatenate(dxs, axis=axis))
 
+  @test_util.run_deprecated_v1
   def testTensorConcatDim0Grad(self):
     x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]]
     output_shape = [44, 7, 3]
@@ -390,6 +404,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testTensorConcatDim1Grad(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [20, 11, 3]
@@ -404,6 +419,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim0Grad(self):
     x_shapes = [[20, 7, 3], [10, 7, 3], [14, 7, 3]]
     output_shape = [4, 7, 3]
@@ -419,6 +435,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim1Grad(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [4, 11, 3]
@@ -434,6 +451,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim2Grad(self):
     x_shapes = [[20, 7, 3], [20, 7, 1], [20, 7, 2]]
     output_shape = [4, 7, 6]
@@ -449,6 +467,7 @@ class ConcatOpTest(test.TestCase):
                                                     output_shape)
     self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesConcatDim1Grad_UnknownInputDim(self):
     x_shapes = [[20, 7, 3], [20, 3, 3], [20, 1, 3]]
     output_shape = [4, 11, 3]
@@ -473,21 +492,22 @@ class ConcatOpTest(test.TestCase):
   def testConcatTuple(self):
     c1 = np.random.rand(4, 4)
     c2 = np.random.rand(4, 4)
-    with self.cached_session():
-      concat_list_t = array_ops.concat([c1, c2], 0)
-      concat_tuple_t = array_ops.concat((c1, c2), 0)
-      self.assertAllEqual(concat_list_t.eval(), concat_tuple_t.eval())
+    concat_list_t = array_ops.concat([c1, c2], 0)
+    concat_tuple_t = array_ops.concat((c1, c2), 0)
+    self.assertAllEqual(
+        self.evaluate(concat_list_t), self.evaluate(concat_tuple_t))
 
+  @test_util.run_deprecated_v1
   def testConcatNoScalars(self):
-    with self.cached_session():
-      scalar = constant_op.constant(7)
-      dim = array_ops.placeholder(dtypes.int32)
-      with self.assertRaisesRegexp(
-          ValueError, r"Can't concatenate scalars \(use tf\.stack instead\)"):
-        array_ops.concat([scalar, scalar, scalar], dim)
+    scalar = constant_op.constant(7)
+    dim = array_ops.placeholder(dtypes.int32)
+    with self.assertRaisesRegexp(
+        ValueError, r"Can't concatenate scalars \(use tf\.stack instead\)"):
+      array_ops.concat([scalar, scalar, scalar], dim)
 
   # important as gpu implementation could fail if
   # shared memory is not large for all the inputs
+  @test_util.run_deprecated_v1
   def testConcatLargeNumberOfTensors(self):
     with self.session(use_gpu=True):
       for concat_dim in range(2):
@@ -523,33 +543,34 @@ class ConcatOpTest(test.TestCase):
           self.assertAllEqual(result[index], params[p[i]])
 
   def testConcatEmpty(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       t1 = []
       t2 = []
-      output = gen_array_ops.concat_v2([t1, t2], 0).eval()
-      self.assertFalse(output)  # Checks that output is empty
+      output = gen_array_ops.concat_v2([t1, t2], 0)
+      self.assertFalse(self.evaluate(output))  # Checks that output is empty
 
+  @test_util.run_deprecated_v1
   def testConcatInvalidAxis(self):
     with self.assertRaises(ValueError):
-      with self.session(use_gpu=True):
+      with test_util.use_gpu():
         t1 = [1]
         t2 = [2]
         gen_array_ops.concat_v2([t1, t2], 1).eval()
 
   def testConcatNegativeAxis(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       t1 = [[1, 2, 3], [4, 5, 6]]
       t2 = [[7, 8, 9], [10, 11, 12]]
 
       c = gen_array_ops.concat_v2([t1, t2], -2)
       self.assertEqual([4, 3], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]],
                           output)
 
       c = gen_array_ops.concat_v2([t1, t2], -1)
       self.assertEqual([2, 6], c.get_shape().as_list())
-      output = c.eval()
+      output = self.evaluate(c)
       self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
   def _testGradientsForAxis(
@@ -578,6 +599,7 @@ class ConcatOpTest(test.TestCase):
       result = concated_grad.eval(feed_dict=feed_dict)
       self.assertAllEqual(result, grad_inp)
 
+  @test_util.run_deprecated_v1
   def testGradientsNegativeAxis(self):
     x1 = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]
     x2 = [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]]
@@ -608,78 +630,78 @@ class ConcatOpTest(test.TestCase):
 
   def testConcatAxisType(self):
     for dtype in [dtypes.int32, dtypes.int64]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         t1 = [[1, 2, 3], [4, 5, 6]]
         t2 = [[7, 8, 9], [10, 11, 12]]
 
         c = gen_array_ops.concat_v2([t1, t2],
                                     constant_op.constant(1, dtype=dtype))
         self.assertEqual([2, 6], c.get_shape().as_list())
-        output = c.eval()
+        output = self.evaluate(c)
         self.assertAllEqual([[1, 2, 3, 7, 8, 9], [4, 5, 6, 10, 11, 12]], output)
 
 class ConcatOffsetTest(test.TestCase):
 
   def testBasic(self):
-    with self.session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       cdim = constant_op.constant(1, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
+  @test_util.run_deprecated_v1
   def testNotVector(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([[2, 3, 5]], dtypes.int32)
-      s1 = constant_op.constant([[2, 7, 5]], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"should be a vector"):
-        sess.run(off)
-
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([[2, 3, 5]], dtypes.int32)
+    s1 = constant_op.constant([[2, 7, 5]], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"should be a vector"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testConcatDimOutOfRange(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(4, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 5], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"Concat dim is out of range: 4 vs. 3"):
-        sess.run(off)
-
+    cdim = constant_op.constant(4, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 5], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"Concat dim is out of range: 4 vs. 3"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testDimMismatch(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 5, 10], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
-                                   r"should contain 3 elem"):
-        sess.run(off)
-
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 5, 10], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
+                                 r"should contain 3 elem"):
+      self.evaluate(off)
+
+  @test_util.run_deprecated_v1
   def testSizeMismatch(self):
-    with self.cached_session() as sess:
-      cdim = constant_op.constant(1, dtypes.int32)
-      s0 = constant_op.constant([2, 3, 5], dtypes.int32)
-      s1 = constant_op.constant([2, 7, 10], dtypes.int32)
-      off = gen_array_ops.concat_offset(cdim, [s0, s1])
-      with self.assertRaisesRegexp(
-          errors_impl.InvalidArgumentError,
-          r"All dimensions except 1 must match. Input 1 has shape \[2 7 10\] "
-          r"and doesn't match input 0 with shape \[2 3 5\]."):
-        sess.run(off)
+    cdim = constant_op.constant(1, dtypes.int32)
+    s0 = constant_op.constant([2, 3, 5], dtypes.int32)
+    s1 = constant_op.constant([2, 7, 10], dtypes.int32)
+    off = gen_array_ops.concat_offset(cdim, [s0, s1])
+    with self.assertRaisesRegexp(
+        errors_impl.InvalidArgumentError,
+        r"All dimensions except 1 must match. Input 1 has shape \[2 7 10\] "
+        r"and doesn't match input 0 with shape \[2 3 5\]."):
+      self.evaluate(off)
 
   def testNegativeDim(self):
-    with self.session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       cdim = constant_op.constant(-2, dtypes.int32)
       s0 = constant_op.constant([2, 3, 5], dtypes.int32)
       s1 = constant_op.constant([2, 7, 5], dtypes.int32)
       s2 = constant_op.constant([2, 20, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [0, 3, 0], [0, 10, 0]])
 
       cdim = constant_op.constant(-3, dtypes.int32)
@@ -687,7 +709,7 @@ class ConcatOffsetTest(test.TestCase):
       s1 = constant_op.constant([1, 3, 5], dtypes.int32)
       s2 = constant_op.constant([3, 3, 5], dtypes.int32)
       off = gen_array_ops.concat_offset(cdim, [s0, s1, s2])
-      ans = sess.run(off)
+      ans = self.evaluate(off)
       self.assertAllEqual(ans, [[0, 0, 0], [2, 0, 0], [3, 0, 0]])
 
 
diff --git a/tensorflow/python/kernel_tests/cond_v2_test.py b/tensorflow/python/kernel_tests/cond_v2_test.py
index b077b853edb5ca725bf41d04577b153e15b17924..8fe3ba41e27aa101fd4f2e3b41b0a0b226471047 100644
--- a/tensorflow/python/kernel_tests/cond_v2_test.py
+++ b/tensorflow/python/kernel_tests/cond_v2_test.py
@@ -20,6 +20,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.protobuf import config_pb2
+from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -32,6 +33,7 @@ from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -66,6 +68,7 @@ class CondV2Test(test.TestCase):
       self.assertEqual(expected_val, actual_val)
       self.assertEqual(expected_grad_val, actual_grad_val)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -80,6 +83,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputs(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(3.0, name="y")
@@ -94,6 +98,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testBasic2(self):
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
@@ -108,6 +113,7 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testNoInputs(self):
     with self.cached_session() as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -124,7 +130,7 @@ class CondV2Test(test.TestCase):
       self.assertEqual(sess.run(out, {pred: False}), (2.0,))
 
   def _createCond(self, name):
-    """Helper function for testDefaultName."""
+    """Creates a cond_v2 call and returns the output tensor and the cond op."""
     pred = constant_op.constant(True, name="pred")
     x = constant_op.constant(1.0, name="x")
 
@@ -137,11 +143,11 @@ class CondV2Test(test.TestCase):
     output = cond_v2.cond_v2(pred, true_fn, false_fn, name=name)
     cond_op = output.op.inputs[0].op
     self.assertEqual(cond_op.type, "If")
-    return cond_op
+    return output, cond_op
 
   def testDefaultName(self):
     with ops.Graph().as_default():
-      cond_op = self._createCond(None)
+      _, cond_op = self._createCond(None)
       self.assertEqual(cond_op.name, "cond")
       self.assertRegexpMatches(
           cond_op.get_attr("then_branch").name, r"cond_true_\d*")
@@ -150,22 +156,22 @@ class CondV2Test(test.TestCase):
 
     with ops.Graph().as_default():
       with ops.name_scope("foo"):
-        cond1_op = self._createCond("")
+        _, cond1_op = self._createCond("")
         self.assertEqual(cond1_op.name, "foo/cond")
         self.assertRegexpMatches(
             cond1_op.get_attr("then_branch").name, r"foo_cond_true_\d*")
         self.assertRegexpMatches(
             cond1_op.get_attr("else_branch").name, r"foo_cond_false_\d*")
 
-        cond2_op = self._createCond(None)
+        _, cond2_op = self._createCond(None)
         self.assertEqual(cond2_op.name, "foo/cond_1")
         self.assertRegexpMatches(
             cond2_op.get_attr("then_branch").name, r"foo_cond_1_true_\d*")
         self.assertRegexpMatches(
             cond2_op.get_attr("else_branch").name, r"foo_cond_1_false_\d*")
 
+  @test_util.run_v1_only("b/120545219")
   def testDefunInCond(self):
-    self.skipTest("b/117293122")
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -184,9 +190,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testNestedDefunInCond(self):
-    self.skipTest("b/117284369")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -210,9 +215,8 @@ class CondV2Test(test.TestCase):
     self._testCond(true_fn, false_fn, [x, y])
     self._testCond(true_fn, false_fn, [y])
 
+  @test_util.run_deprecated_v1
   def testDoubleNestedDefunInCond(self):
-    self.skipTest("b/117284369")
-
     x = constant_op.constant(1.0, name="x")
     y = constant_op.constant(2.0, name="y")
 
@@ -538,6 +542,7 @@ class CondV2Test(test.TestCase):
               pred_inner: False
           }), [5., 0.])
 
+  @test_util.run_deprecated_v1
   def testSecondDerivative(self):
     with self.cached_session() as sess:
       pred = array_ops.placeholder(dtypes.bool, name="pred")
@@ -610,11 +615,11 @@ class CondV2Test(test.TestCase):
   def testLowering(self):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
-        out_cond = self._createCond("cond")
+        cond_output, _ = self._createCond("cond")
 
         run_options = config_pb2.RunOptions(output_partition_graphs=True)
         run_metadata = config_pb2.RunMetadata()
-        sess.run(out_cond, options=run_options, run_metadata=run_metadata)
+        sess.run(cond_output, options=run_options, run_metadata=run_metadata)
 
         # If lowering was enabled, there should be a `Switch` node
         switch_found = any(
@@ -634,17 +639,18 @@ class CondV2Test(test.TestCase):
         self.assertFalse(if_found,
                          "An `If` op was found, but it should be lowered.")
 
+  @test_util.run_deprecated_v1
   def testLoweringDisabledInXLA(self):
     with self.session(graph=ops.Graph()) as sess:
       # Build the cond_v2 in an XLA context
       xla_context = control_flow_ops.XLAControlFlowContext()
       xla_context.Enter()
-      out_cond = self._createCond("cond")
+      cond_output, _ = self._createCond("cond")
       xla_context.Exit()
 
       run_options = config_pb2.RunOptions(output_partition_graphs=True)
       run_metadata = config_pb2.RunMetadata()
-      sess.run(out_cond, options=run_options, run_metadata=run_metadata)
+      sess.run(cond_output, options=run_options, run_metadata=run_metadata)
 
       # Lowering disabled in XLA, there should be no `Switch` node
       switch_found = any(
@@ -666,6 +672,130 @@ class CondV2Test(test.TestCase):
           if_found,
           "An `If` op was not found, but the graph should not be lowered.")
 
+  @test_util.run_deprecated_v1
+  def testLoweringDisabledWithSingleThreadedExecutorContext(self):
+    with self.session(graph=ops.Graph()) as sess:
+      @function.defun
+      def _add_cond(x):
+        return cond_v2.cond_v2(
+            constant_op.constant(True, name="pred"),
+            lambda: x,
+            lambda: x + 1)
+
+      x = array_ops.placeholder(shape=None, dtype=dtypes.float32)
+      with context.function_executor_type("SINGLE_THREADED_EXECUTOR"):
+        out_cond = _add_cond(x)
+
+      # The fact that sess.run() succeeds means lowering is disabled, because
+      # the single threaded executor does not support cond v1 ops.
+      sess.run(out_cond, feed_dict={x: 1.0})
+
+  @test_util.enable_control_flow_v2
+  def testStructuredOutputs(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return ((x * y,), y)
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    output = control_flow_ops.cond(
+        constant_op.constant(False), true_fn, false_fn)
+    self.assertEqual(self.evaluate(output[0][0]), 1.)
+    self.assertEqual(self.evaluate(output[1]), 9.)
+
+  @test_util.enable_control_flow_v2
+  @test_util.run_deprecated_v1
+  def testRaisesOutputStructuresMismatch(self):
+    x = constant_op.constant(1.0, name="x")
+    y = constant_op.constant(3.0, name="y")
+
+    def true_fn():
+      return x * y, y
+
+    def false_fn():
+      return ((x,), y * 3.0)
+
+    with self.assertRaisesRegexp(
+        ValueError, "Outputs of true_fn and false_fn must"
+        " have the same structure"):
+      control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArray(self):
+    x = math_ops.range(-5, 5)
+    output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+    def loop_body(i, output):
+
+      def if_true():
+        return output.write(i, x[i]**2)
+
+      def if_false():
+        return output.write(i, x[i])
+
+      output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+      return i + 1, output
+
+    _, output = control_flow_ops.while_loop(
+        lambda i, arr: i < x.shape[0],
+        loop_body,
+        loop_vars=(constant_op.constant(0), output))
+    output_t = output.stack()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+
+  @test_util.enable_control_flow_v2
+  def testCondAndTensorArrayInDefun(self):
+
+    @function.defun
+    def f():
+      x = math_ops.range(-5, 5)
+      output = tensor_array_ops.TensorArray(dtype=dtypes.int32, size=x.shape[0])
+
+      def loop_body(i, output):
+
+        def if_true():
+          return output.write(i, x[i]**2)
+
+        def if_false():
+          return output.write(i, x[i])
+
+        output = control_flow_ops.cond(x[i] > 0, if_true, if_false)
+        return i + 1, output
+
+      _, output = control_flow_ops.while_loop(
+          lambda i, arr: i < x.shape[0],
+          loop_body,
+          loop_vars=(constant_op.constant(0), output))
+      return output.stack()
+
+    output_t = f()
+    self.assertAllEqual(
+        self.evaluate(output_t), [-5, -4, -3, -2, -1, 0, 1, 4, 9, 16])
+
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = cond_v2.cond_v2(constant_op.constant(True),
+                             lambda: x * 2.0,
+                             lambda: x)
+    if_op = output.op.inputs[0].op
+    self.assertEqual(if_op.type, "If")
+    # pylint: disable=g-deprecated-assert
+    self.assertEqual(len(if_op.outputs), 1)
+
+    gradients_impl.gradients(output, x)
+    # if_op should have been rewritten to output 2.0 intermediate.
+    self.assertEqual(len(if_op.outputs), 2)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite if_op again.
+    self.assertEqual(len(if_op.outputs), 2)
+    # pylint: enable=g-deprecated-assert
+
 
 class CondV2CollectionTest(test.TestCase):
 
diff --git a/tensorflow/python/kernel_tests/conditional_accumulator_test.py b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
index 97ab23fe49b6eea388b61876b99495486e17d9f9..ce34201706492ca488afbec95cddf436f38c820d 100644
--- a/tensorflow/python/kernel_tests/conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/conditional_accumulator_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -79,11 +80,13 @@ class ConditionalAccumulatorTest(test.TestCase):
       attr { key: 'reduction_type' value {s: 'MEAN'} }
       """, q.accumulator_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeEmpty(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(dtypes_lib.float32, name="Q")
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -91,6 +94,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       set_global_step_op = q.set_global_step(1)
       set_global_step_op.run()
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradFloat32(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -98,6 +102,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       accum_op = q.apply_grad((10.0,))
       accum_op.run()
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
     with self.cached_session() as sess:
       dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64]
@@ -111,10 +116,11 @@ class ConditionalAccumulatorTest(test.TestCase):
         for e in elems:
           q.apply_grad((e,)).run()
 
-        result = sess.run(q.take_grad(1))
+        result = self.evaluate(q.take_grad(1))
 
         self.assertEqual(sum(elems) / len(elems), result)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorMultipleAccumulators(self):
     with self.cached_session():
       q_f32_0 = data_flow_ops.ConditionalAccumulator(
@@ -134,6 +140,7 @@ class ConditionalAccumulatorTest(test.TestCase):
         result = accums[i].take_grad(1).eval()
         self.assertEqual(result, i + 10.0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyAndTakeGradWithShape(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -149,12 +156,13 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
       self.assertTrue(is_all_equal)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradWithWrongShape(self):
     q = data_flow_ops.ConditionalAccumulator(
         dtypes_lib.float32, name="Q", shape=(3, 2))
@@ -165,6 +173,7 @@ class ConditionalAccumulatorTest(test.TestCase):
     with self.assertRaises(ValueError):
       q.apply_grad([[1.0], [2.0], [3.0]])
 
+  @test_util.run_deprecated_v1
   def testAccumulatorDynamicShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -184,12 +193,13 @@ class ConditionalAccumulatorTest(test.TestCase):
         sess.run(accum_op, feed_dict={x: elem})
 
       is_all_equal = True
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       for i in range(len(val)):
         for j in range(len(val[i])):
           is_all_equal &= (val[i][j] == elems_ave[i][j])
       self.assertTrue(is_all_equal)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorWrongDynamicShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -208,6 +218,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         sess.run(accum_op, feed_dict={x: [[1.0], [2.0], [3.0]]})
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeAfterApplyGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -219,6 +230,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 2)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeAfterApplyGradAndTakeGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -247,6 +259,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       extract_t.op.run()
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradMean(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -259,7 +272,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -268,9 +281,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(15.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradSum(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -286,7 +300,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
       accum_ops = [q.apply_grad((x,), local_step=1) for x in elems]
@@ -295,9 +309,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(30.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradInvalidReductionType(self):
     with self.assertRaises(ValueError):
       data_flow_ops.ConditionalAccumulator(
@@ -306,6 +321,7 @@ class ConditionalAccumulatorTest(test.TestCase):
           shape=tensor_shape.TensorShape([1]),
           reduction_type="Invalid")
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorInvalidTakeGrad(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -319,8 +335,9 @@ class ConditionalAccumulatorTest(test.TestCase):
         accum_op.run()
 
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        takeg_t.eval()
+        self.evaluate(takeg_t)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGradMean(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -334,7 +351,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave, val)
 
       elems = [20.0, 30.0]
@@ -345,9 +362,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_ave + 0.0, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGradSum(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -364,7 +382,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
       elems = [20.0, 30.0]
@@ -375,9 +393,10 @@ class ConditionalAccumulatorTest(test.TestCase):
       for accum_op in accum_ops:
         accum_op.run()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
       self.assertEqual(elems_sum, val)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorIncrementGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -392,8 +411,9 @@ class ConditionalAccumulatorTest(test.TestCase):
       variables.global_variables_initializer().run()
       for _ in range(3):
         set_global_step_op.run()
-        inc_global_step.eval()
+        self.evaluate(inc_global_step)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStepPreventsAccumulation(self):
     with self.cached_session():
       q = data_flow_ops.ConditionalAccumulator(
@@ -410,11 +430,12 @@ class ConditionalAccumulatorTest(test.TestCase):
           accum_op.run()
         takeg_t = q.take_grad(1)
 
-        val = takeg_t.eval()
+        val = self.evaluate(takeg_t)
         self.assertEqual(0.0 + sum(x for x in local_steps
                                    if x >= ls) / sum(1 for x in local_steps
                                                      if x >= ls), val)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -424,7 +445,7 @@ class ConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_grad(1)
 
       def apply_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -436,10 +457,11 @@ class ConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = takeg_t.eval()
+      val = self.evaluate(takeg_t)
 
       self.assertEqual(val, sum(elems) / len(elems))
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -451,14 +473,14 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_grad_thread = self.checkedThread(target=apply_grad)
 
       results = []
 
       def take_grad():
-        results.append(sess.run(takeg_t))
+        results.append(self.evaluate(takeg_t))
 
       threads = [self.checkedThread(target=take_grad) for _ in range(10)]
 
@@ -472,6 +494,7 @@ class ConditionalAccumulatorTest(test.TestCase):
 
       self.assertItemsEqual(elems, results)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
@@ -485,12 +508,12 @@ class ConditionalAccumulatorTest(test.TestCase):
       def apply_grad():
         time.sleep(1.0)
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       return_array = []
 
       def take_grad():
-        return_array.append(sess.run(takeg_t))
+        return_array.append(self.evaluate(takeg_t))
 
       accum_thread = self.checkedThread(target=apply_grad)
       takeg_thread = self.checkedThread(target=take_grad)
@@ -503,8 +526,9 @@ class ConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.ConditionalAccumulator(
diff --git a/tensorflow/python/kernel_tests/confusion_matrix_test.py b/tensorflow/python/kernel_tests/confusion_matrix_test.py
index bc24345261e5bb7beaa0aa2273ec277b53ea01fb..ae13c8e32e5ed5c8f3e6b670835db66d1e7dad0f 100644
--- a/tensorflow/python/kernel_tests/confusion_matrix_test.py
+++ b/tensorflow/python/kernel_tests/confusion_matrix_test.py
@@ -71,9 +71,11 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32Basic(self):
     self._testBasic(dtype=np.int32)
 
+  @test_util.run_deprecated_v1
   def testInt64Basic(self):
     self._testBasic(dtype=np.int64)
 
@@ -111,9 +113,11 @@ class ConfusionMatrixTest(test.TestCase):
       self.assertEqual(cm_out.dtype, np_dtype)
       self.assertAllClose(cm_out, truth, atol=1e-10)
 
+  @test_util.run_deprecated_v1
   def testOnTensors_int32(self):
     self._testConfMatrixOnTensors(dtypes.int32, np.int32)
 
+  @test_util.run_deprecated_v1
   def testOnTensors_int64(self):
     self._testConfMatrixOnTensors(dtypes.int64, np.int64)
 
@@ -133,9 +137,11 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32DifferentLabels(self, dtype=np.int32):
     self._testDifferentLabelsInPredictionAndTarget(dtype)
 
+  @test_util.run_deprecated_v1
   def testInt64DifferentLabels(self, dtype=np.int64):
     self._testDifferentLabelsInPredictionAndTarget(dtype)
 
@@ -155,12 +161,15 @@ class ConfusionMatrixTest(test.TestCase):
 
     self._testConfMatrix(labels=labels, predictions=predictions, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testInt32MultipleLabels(self, dtype=np.int32):
     self._testMultipleLabels(dtype)
 
+  @test_util.run_deprecated_v1
   def testInt64MultipleLabels(self, dtype=np.int64):
     self._testMultipleLabels(dtype)
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = np.arange(5, dtype=np.int32)
     predictions = np.arange(5, dtype=np.int32)
@@ -177,6 +186,7 @@ class ConfusionMatrixTest(test.TestCase):
     self._testConfMatrix(
         labels=labels, predictions=predictions, weights=weights, truth=truth)
 
+  @test_util.run_deprecated_v1
   def testLabelsTooLarge(self):
     labels = np.asarray([1, 1, 0, 3, 5], dtype=np.int32)
     predictions = np.asarray([2, 1, 0, 2, 2], dtype=np.int32)
@@ -191,6 +201,7 @@ class ConfusionMatrixTest(test.TestCase):
       self._testConfMatrix(
           labels=labels, predictions=predictions, num_classes=3, truth=None)
 
+  @test_util.run_deprecated_v1
   def testPredictionsTooLarge(self):
     labels = np.asarray([1, 1, 0, 2, 2], dtype=np.int32)
     predictions = np.asarray([2, 1, 0, 3, 5], dtype=np.int32)
@@ -205,6 +216,7 @@ class ConfusionMatrixTest(test.TestCase):
       self._testConfMatrix(
           labels=labels, predictions=predictions, num_classes=3, truth=None)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank_predictionsTooBig(self):
     labels = np.asarray([1, 2, 3])
     predictions = np.asarray([[1, 2, 3]])
@@ -212,6 +224,7 @@ class ConfusionMatrixTest(test.TestCase):
                             confusion_matrix.confusion_matrix, predictions,
                             labels)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank_predictionsTooSmall(self):
     labels = np.asarray([[1, 2, 3]])
     predictions = np.asarray([1, 2, 3])
@@ -219,6 +232,7 @@ class ConfusionMatrixTest(test.TestCase):
                             confusion_matrix.confusion_matrix, predictions,
                             labels)
 
+  @test_util.run_deprecated_v1
   def testInputDifferentSize(self):
     labels = np.asarray([1, 2])
     predictions = np.asarray([1, 2, 3])
@@ -232,7 +246,7 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int32)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int32)
 
   def testOutputIsInt64(self):
@@ -241,12 +255,13 @@ class ConfusionMatrixTest(test.TestCase):
     with self.cached_session():
       cm = confusion_matrix.confusion_matrix(
           labels, predictions, dtype=dtypes.int64)
-      tf_cm = cm.eval()
+      tf_cm = self.evaluate(cm)
     self.assertEqual(tf_cm.dtype, np.int64)
 
 
 class RemoveSqueezableDimensionsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBothScalarShape(self):
     label_values = 1.0
     prediction_values = 0.0
@@ -261,8 +276,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -272,6 +287,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSameShape(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros_like(label_values)
@@ -286,8 +302,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -297,6 +313,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSameShapeExpectedRankDiff0(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros_like(label_values)
@@ -311,8 +328,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
             labels_placeholder, predictions_placeholder, expected_rank_diff=0))
 
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -322,6 +339,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezableLabels(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros(shape=(2, 3))
@@ -337,8 +355,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -348,6 +366,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezableLabelsExpectedRankDiffPlus1(self):
     label_values = np.ones(shape=(2, 3, 1))
     prediction_values = np.zeros(shape=(2, 3, 5))
@@ -363,8 +382,8 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_label_values = np.reshape(label_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(expected_label_values, static_labels.eval())
-      self.assertAllEqual(prediction_values, static_predictions.eval())
+      self.assertAllEqual(expected_label_values, self.evaluate(static_labels))
+      self.assertAllEqual(prediction_values, self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -374,6 +393,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezablePredictions(self):
     label_values = np.ones(shape=(2, 3))
     prediction_values = np.zeros(shape=(2, 3, 1))
@@ -389,8 +409,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -401,6 +422,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           expected_prediction_values,
           dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testSqueezablePredictionsExpectedRankDiffMinus1(self):
     label_values = np.ones(shape=(2, 3, 5))
     prediction_values = np.zeros(shape=(2, 3, 1))
@@ -416,8 +438,9 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
 
     expected_prediction_values = np.reshape(prediction_values, newshape=(2, 3))
     with self.cached_session():
-      self.assertAllEqual(label_values, static_labels.eval())
-      self.assertAllEqual(expected_prediction_values, static_predictions.eval())
+      self.assertAllEqual(label_values, self.evaluate(static_labels))
+      self.assertAllEqual(expected_prediction_values,
+                          self.evaluate(static_predictions))
       feed_dict = {
           labels_placeholder: label_values,
           predictions_placeholder: prediction_values
@@ -428,6 +451,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
           expected_prediction_values,
           dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testUnsqueezableLabels(self):
     label_values = np.ones(shape=(2, 3, 2))
     prediction_values = np.zeros(shape=(2, 3))
@@ -453,6 +477,7 @@ class RemoveSqueezableDimensionsTest(test.TestCase):
       self.assertAllEqual(
           prediction_values, dynamic_predictions.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testUnsqueezablePredictions(self):
     label_values = np.ones(shape=(2, 3))
     prediction_values = np.zeros(shape=(2, 3, 2))
diff --git a/tensorflow/python/kernel_tests/constant_op_test.py b/tensorflow/python/kernel_tests/constant_op_test.py
index 38b8c0c146f0e8137240d67e2c6de4831a90543f..583082c2aa283e326a933d2beaf88f711b7a280f 100644
--- a/tensorflow/python/kernel_tests/constant_op_test.py
+++ b/tensorflow/python/kernel_tests/constant_op_test.py
@@ -70,6 +70,7 @@ class ConstantTest(test.TestCase):
     with self.assertRaises(TypeError):
       constant_op.constant(dtypes_lib.string, "[,]")
 
+  @test_util.run_deprecated_v1
   def testBFloat16(self):
     bfloat16 = dtypes_lib.bfloat16.as_numpy_dtype
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(bfloat16))
@@ -77,36 +78,42 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(bfloat16))
     self._testAll(np.empty((2, 0, 5)).astype(bfloat16))
 
+  @test_util.run_deprecated_v1
   def testHalf(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float16))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float16))
     self._testAll(np.empty((2, 0, 5)).astype(np.float16))
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float32))
     self._testAll(np.empty((2, 0, 5)).astype(np.float32))
 
+  @test_util.run_deprecated_v1
   def testDouble(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.float64))
     self._testAll(
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.float64))
     self._testAll(np.empty((2, 0, 5)).astype(np.float64))
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.int32))
     self._testAll((100 * np.random.normal(size=30)).reshape([2, 3, 5]).astype(
         np.int32))
     self._testAll(np.empty((2, 0, 5)).astype(np.int32))
 
+  @test_util.run_deprecated_v1
   def testInt64(self):
     self._testAll(np.arange(-15, 15).reshape([2, 3, 5]).astype(np.int64))
     self._testAll((100 * np.random.normal(size=30)).reshape([2, 3, 5]).astype(
         np.int64))
     self._testAll(np.empty((2, 0, 5)).astype(np.int64))
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     self._testAll(
         np.complex(1, 2) *
@@ -116,6 +123,7 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex64))
     self._testAll(np.empty((2, 0, 5)).astype(np.complex64))
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     self._testAll(
         np.complex(1, 2) *
@@ -125,12 +133,14 @@ class ConstantTest(test.TestCase):
         np.random.normal(size=30).reshape([2, 3, 5]).astype(np.complex128))
     self._testAll(np.empty((2, 0, 5)).astype(np.complex128))
 
+  @test_util.run_deprecated_v1
   def testString(self):
     self._testCpu(
         np.array([compat.as_bytes(str(x)) for x in np.arange(-15, 15)]).reshape(
             [2, 3, 5]))
     self._testCpu(np.empty((2, 0, 5)).astype(np.str_))
 
+  @test_util.run_deprecated_v1
   def testVariant(self):
     # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
     # copying between CPU and GPU is supported.
@@ -161,6 +171,7 @@ class ConstantTest(test.TestCase):
           message="Variant storing an int, decoded const value:").op
       logging_const_op.run()
 
+  @test_util.run_deprecated_v1
   def testStringWithNulls(self):
     with self.cached_session():
       val = ops.convert_to_tensor(b"\0\0\0\0").eval()
@@ -219,16 +230,28 @@ class ConstantTest(test.TestCase):
 
   def testShapeInconsistent(self):
     with ops.Graph().as_default():
-      c = constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[10])
+      c = constant_op.constant_v1([1, 2, 3, 4, 5, 6, 7], shape=[10])
+    self.assertEqual(c.get_shape(), [10])
+
+    with ops.Graph().as_default():
+      with self.assertRaisesRegexp(
+          TypeError, "Expected Tensor's shape"):
+        c = constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[10])
+
+  def testPromotionShapes(self):
+    with ops.Graph().as_default():
+      c = constant_op.constant([7], shape=[10])
+    self.assertEqual(c.get_shape(), [10])
+    with ops.Graph().as_default():
+      c = constant_op.constant(3, shape=[10])
     self.assertEqual(c.get_shape(), [10])
 
   # pylint: disable=g-long-lambda
   def testShapeWrong(self):
     with ops.Graph().as_default():
-      with self.assertRaisesWithPredicateMatch(
-          ValueError,
-          lambda e: ("Too many elements provided. Needed at most 5, "
-                     "but received 7" == str(e))):
+      with self.assertRaisesRegexp(ValueError, "Too many elements provided."):
+        constant_op.constant_v1([1, 2, 3, 4, 5, 6, 7], shape=[5])
+      with self.assertRaisesRegexp(TypeError, "Expected Tensor's shape"):
         constant_op.constant([1, 2, 3, 4, 5, 6, 7], shape=[5])
 
   # pylint: enable=g-long-lambda
@@ -253,6 +276,7 @@ class ConstantTest(test.TestCase):
                                    "GraphDef cannot be larger than 2GB."):
         g.as_graph_def()
 
+  @test_util.run_deprecated_v1
   def testSparseValuesRaiseErrors(self):
     with self.assertRaisesRegexp(ValueError,
                                  "setting an array element with a sequence"):
@@ -282,29 +306,29 @@ class AsTensorTest(test.TestCase):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([], x.eval())
+      self.assertAllEqual([], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]))
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31-1, 2, 3]),
                                 dtype=dtypes_lib.int32)
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual([2**31-1, 2, 3], x.eval())
+      self.assertAllEqual([2**31 - 1, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]))
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       x = ops.convert_to_tensor(tensor_shape.TensorShape([2**31, 2, 3]),
                                 dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([2**31, 2, 3], x.eval())
+      self.assertAllEqual([2**31, 2, 3], self.evaluate(x))
 
       with self.assertRaisesRegexp(
           ValueError, "a dimension is too large .2147483648."):
@@ -314,11 +338,11 @@ class AsTensorTest(test.TestCase):
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual([1, 2, 3], x.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(x))
 
       x = array_ops.reshape(
           array_ops.zeros([6]), tensor_shape.TensorShape([2, 3]))
-      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], x.eval())
+      self.assertAllEqual([[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], self.evaluate(x))
 
     with self.assertRaisesRegexp(ValueError, "partially known"):
       ops.convert_to_tensor(tensor_shape.TensorShape(None))
@@ -330,16 +354,17 @@ class AsTensorTest(test.TestCase):
       ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3]), dtype=dtypes_lib.float32)
 
+  @test_util.run_deprecated_v1
   def testAsTensorForDimensionInput(self):
     with self.cached_session():
       x = ops.convert_to_tensor(tensor_shape.TensorShape([1, 2, 3])[1])
       self.assertEqual(dtypes_lib.int32, x.dtype)
-      self.assertAllEqual(2, x.eval())
+      self.assertAllEqual(2, self.evaluate(x))
 
       x = ops.convert_to_tensor(
           tensor_shape.TensorShape([1, 2, 3])[1], dtype=dtypes_lib.int64)
       self.assertEqual(dtypes_lib.int64, x.dtype)
-      self.assertAllEqual(2, x.eval())
+      self.assertAllEqual(2, self.evaluate(x))
 
     shape = tensor_shape.TensorShape(None)
     if shape._v2_behavior:
@@ -372,7 +397,7 @@ class ZerosTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.zeros(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(
@@ -383,7 +408,7 @@ class ZerosTest(test.TestCase):
     self.assertEqual(0, self._Zeros(()))
     with self.cached_session():
       scalar = array_ops.zeros(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(0, scalar.eval())
+      self.assertEqual(0, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[0] * 3] * 2)
@@ -392,11 +417,12 @@ class ZerosTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of zeros of the same dimensions as "d".
       z = array_ops.zeros(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       d = array_ops.fill([2, 3], 12., name="fill")
@@ -420,13 +446,13 @@ class ZerosTest(test.TestCase):
         z = array_ops.zeros([2, 3], dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
         z = array_ops.zeros(array_ops.shape(d), dtype=dtype)
         self.assertEqual(z.dtype, dtype)
         self.assertEqual([2, 3], z.get_shape())
-        z_value = z.eval()
+        z_value = self.evaluate(z)
         self.assertFalse(np.any(z_value))
         self.assertEqual((2, 3), z_value.shape)
 
@@ -465,6 +491,7 @@ class ZerosLikeTest(test.TestCase):
       self.assertFalse(np.any(z_value))
       self.assertEqual((2, 3), z_value.shape)
 
+  @test_util.run_deprecated_v1
   def testZerosLikeCPU(self):
     for dtype in [
         dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
@@ -475,6 +502,7 @@ class ZerosLikeTest(test.TestCase):
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=False)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=False)
 
+  @test_util.run_deprecated_v1
   def testZerosLikeGPU(self):
     for dtype in [
         dtypes_lib.half, dtypes_lib.float32, dtypes_lib.float64,
@@ -484,11 +512,13 @@ class ZerosLikeTest(test.TestCase):
       self._compareZeros(dtype, fully_defined_shape=False, use_gpu=True)
       self._compareZeros(dtype, fully_defined_shape=True, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testZerosLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
     z = array_ops.zeros_like(d)
     self.assertEqual(d.get_shape().as_list(), z.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testZerosLikeDtype(self):
     # Make sure zeros_like works even for dtypes that cannot be cast between
     with self.cached_session():
@@ -502,6 +532,7 @@ class ZerosLikeTest(test.TestCase):
           self.assertEqual(y.shape, shape)
           self.assertAllEqual(y, np.zeros(shape, dtype=out_type))
 
+  @test_util.run_deprecated_v1
   def testZerosLikeVariant(self):
     # TODO(ebrevdo): Re-enable use_gpu=True once non-DMA Variant
     # copying between CPU and GPU is supported AND we register a
@@ -538,7 +569,7 @@ class OnesTest(test.TestCase):
     with self.cached_session():
       ret = array_ops.ones(shape)
       self.assertEqual(shape, ret.get_shape())
-      return ret.eval()
+      return self.evaluate(ret)
 
   def testConst(self):
     self.assertTrue(np.array_equal(self._Ones([2, 3]), np.array([[1] * 3] * 2)))
@@ -548,7 +579,7 @@ class OnesTest(test.TestCase):
     self.assertEqual(1, self._Ones(()))
     with self.cached_session():
       scalar = array_ops.ones(constant_op.constant([], dtype=dtypes_lib.int32))
-      self.assertEqual(1, scalar.eval())
+      self.assertEqual(1, self.evaluate(scalar))
 
   def testDynamicSizes(self):
     np_ans = np.array([[1] * 3] * 2)
@@ -557,11 +588,12 @@ class OnesTest(test.TestCase):
       d = array_ops.fill([2, 3], 12., name="fill")
       # Constructs a tensor of ones of the same dimensions as "d".
       z = array_ops.ones(array_ops.shape(d))
-      out = z.eval()
+      out = self.evaluate(z)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, d)
     self.assertShapeEqual(np_ans, z)
 
+  @test_util.run_deprecated_v1
   def testAutoPack(self):
     with self.cached_session():
       h = array_ops.placeholder(dtypes_lib.int32, shape=[])
@@ -570,6 +602,7 @@ class OnesTest(test.TestCase):
       out = z.eval(feed_dict={h: 4, w: 16})
     self.assertAllEqual(out, np.array([[1] * 16] * 4))
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       d = array_ops.fill([2, 3], 12., name="fill")
@@ -617,12 +650,13 @@ class OnesLikeTest(test.TestCase):
         z_var = array_ops.ones_like(d)
         # Test that the type is correct
         self.assertEqual(z_var.dtype, dtype)
-        z_value = z_var.eval()
+        z_value = self.evaluate(z_var)
 
       # Test that the value is correct
       self.assertTrue(np.array_equal(z_value, np.array([[1] * 3] * 2)))
       self.assertEqual([2, 3], z_var.get_shape())
 
+  @test_util.run_deprecated_v1
   def testOnesLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
     z = array_ops.ones_like(d)
@@ -634,7 +668,7 @@ class FillTest(test.TestCase):
   def _compare(self, dims, val, np_ans, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.fill(dims, val, name="fill")
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     # Fill does not set the shape.
     # self.assertShapeEqual(np_ans, tf_ans)
@@ -667,12 +701,14 @@ class FillTest(test.TestCase):
     np_ans = np.array([[0.15 + 0.3j] * 3] * 2).astype(np.complex128)
     self._compareAll([2, 3], np_ans[0][0], np_ans)
 
+  @test_util.run_deprecated_v1
   def testFillString(self):
     np_ans = np.array([[b"yolo"] * 3] * 2)
     with self.session(use_gpu=False):
       tf_ans = array_ops.fill([2, 3], np_ans[0][0], name="fill").eval()
     self.assertAllEqual(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testFillNegative(self):
     with self.cached_session():
       for shape in (-1,), (2, -1), (-1, 2), (-2), (-3):
@@ -686,6 +722,7 @@ class FillTest(test.TestCase):
         with self.assertRaises(errors_impl.InvalidArgumentError):
           fill_t.eval({dims: shape})
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Non-vector dimensions.
     with self.assertRaises(ValueError):
@@ -704,6 +741,7 @@ class FillTest(test.TestCase):
             dtypes_lib.int32, shape=()), 17], 1.0)
     self.assertEqual([None, 17], f.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       in_v = constant_op.constant(5.0)
@@ -716,6 +754,7 @@ class FillTest(test.TestCase):
 
 class PlaceholderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDtype(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
@@ -726,8 +765,9 @@ class PlaceholderTest(test.TestCase):
 
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=(10, 10), name="p")
@@ -739,12 +779,13 @@ class PlaceholderTest(test.TestCase):
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float and "
           r"shape \[10,10\]"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
       with self.assertRaisesWithPredicateMatch(
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
         p_identity.eval(feed_dict={p: feed_array[:5, :5]})
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=None, name="p")
@@ -757,12 +798,14 @@ class PlaceholderTest(test.TestCase):
       self.assertAllClose(
           p_identity.eval(feed_dict={p: feed_array}), feed_array)
 
+  @test_util.run_deprecated_v1
   def testScalarShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[], name="p")
       p_identity = array_ops.identity(p)
       self.assertAllClose(p_identity.eval(feed_dict={p: 5}), 5)
 
+  @test_util.run_deprecated_v1
   def testPartialShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
@@ -775,6 +818,7 @@ class PlaceholderTest(test.TestCase):
           ValueError, lambda e: "Cannot feed value of shape" in str(e)):
         p_identity.eval(feed_dict={p: feed_array[:5, :2]})
 
+  @test_util.run_deprecated_v1
   def testPartialShapeWhenNotFed(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.float32, shape=[None, 3], name="p")
@@ -783,8 +827,9 @@ class PlaceholderTest(test.TestCase):
       # Should trigger an operator error, not a shape error.
       with self.assertRaisesOpError(
           "must feed a value for placeholder tensor 'p' with dtype float"):
-        p_identity.eval()
+        self.evaluate(p_identity)
 
+  @test_util.run_deprecated_v1
   def testControlDependency(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes_lib.int32, shape=[], name="p")
@@ -794,10 +839,12 @@ class PlaceholderTest(test.TestCase):
       val = np.array(2).astype(np.int)
       self.assertEqual(10, d.eval(feed_dict={p: val}))
 
+  @test_util.run_deprecated_v1
   def testBadShape(self):
     with self.assertRaises(ValueError):
       array_ops.placeholder(dtypes_lib.float32, shape=(-1, 10))
 
+  @test_util.run_deprecated_v1
   def testTensorStr(self):
     a = array_ops.placeholder(dtypes_lib.float32, shape=None, name="a")
     self.assertEqual("<tf.Tensor 'a:0' shape=<unknown> dtype=float32>", repr(a))
@@ -813,6 +860,7 @@ class PlaceholderTest(test.TestCase):
       self.assertEqual(
           "<tf.Tensor 'c:0' shape=(32, ?, 2) dtype=qint32>", repr(c))
 
+  @test_util.run_deprecated_v1
   def testOldGraph(self):
     # Load graph generated from earlier version of TF where
     # placeholder shape was not set.
@@ -892,36 +940,40 @@ versions {
 
 class PlaceholderWithDefaultTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFullShape(self):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([[2, 2], [2, 2]], shape=[2, 2])
       a = array_ops.identity(p)
-      self.assertAllEqual([[2, 2], [2, 2]], a.eval())
+      self.assertAllEqual([[2, 2], [2, 2]], self.evaluate(a))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
       with self.assertRaises(ValueError):
         a.eval(feed_dict={p: [[6, 6, 6], [6, 6, 6]]})
 
+  @test_util.run_deprecated_v1
   def testPartialShape(self):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([1, 2, 3], shape=[None])
       a = array_ops.identity(p)
-      self.assertAllEqual([1, 2, 3], a.eval())
+      self.assertAllEqual([1, 2, 3], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
 
       with self.assertRaises(ValueError):
         a.eval(feed_dict={p: [[2, 2], [2, 2]]})
 
+  @test_util.run_deprecated_v1
   def testNoShape(self):
     with self.session(force_gpu=test_util.is_gpu_available()):
       p = array_ops.placeholder_with_default([17], shape=None)
       a = array_ops.identity(p)
-      self.assertAllEqual([17], a.eval())
+      self.assertAllEqual([17], self.evaluate(a))
       self.assertAllEqual([3, 37], a.eval(feed_dict={p: [3, 37]}))
       self.assertAllEqual(
           [[3, 3], [3, 3]], a.eval(feed_dict={p: [[3, 3], [3, 3]]}))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.session(force_gpu=test_util.is_gpu_available()):
       x = array_ops.placeholder(dtypes_lib.float32, [5, 7])
diff --git a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
index 19b067e4499a862a49ec0a4a37ac683817827109..0fd293ebba3044097453c18fb625fc0dee19b19f 100644
--- a/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_ops_py_test.py
@@ -129,6 +129,7 @@ def isum(s, maximum_iterations=None):
 @test_util.with_control_flow_v2
 class ControlFlowTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentity(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -139,8 +140,9 @@ class ControlFlowTest(test.TestCase):
 
       self.assertTrue(isinstance(v2, ops.Tensor))
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
+  @test_util.run_v1_only("b/120545219")
   def testRefEnter(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -152,8 +154,9 @@ class ControlFlowTest(test.TestCase):
       v2 = control_flow_ops.with_dependencies([op], enter_v)
       v3 = control_flow_ops.exit(v2)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v3.eval())
+      self.assertEqual(9, self.evaluate(v3))
 
+  @test_util.run_v1_only("b/120545219")
   def testRefSwitch(self):
     with self.cached_session():
       v = variables.VariableV1(7)
@@ -162,7 +165,7 @@ class ControlFlowTest(test.TestCase):
       v1 = control_flow_ops._SwitchRefOrTensor(v._ref(), p)  # pylint: disable=protected-access
       v2 = state_ops.assign(v1[1], 9)
       variables.global_variables_initializer().run()
-      self.assertEqual(9, v2.eval())
+      self.assertEqual(9, self.evaluate(v2))
 
   def testEnterMulExit(self):
     with self.cached_session():
@@ -173,9 +176,10 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(enter_data, enter_five)
       exit_op = control_flow_ops.exit(mul_op)
 
-      result = exit_op.eval()
+      result = self.evaluate(exit_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_deprecated_v1
   def testEnterShapePropagation(self):
     with self.cached_session():
       v = variables.Variable([0.0, 0.0], dtype=dtypes.float32)
@@ -190,6 +194,7 @@ class ControlFlowTest(test.TestCase):
           v, "frame2", is_constant=False)
       self.assertEqual(enter_v_non_constant.shape, None)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([1, 2, 3, 4, 5, 6])
@@ -204,6 +209,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(np.arange(1, 7), val)
     self.assertAllEqual(np.arange(0, 12, 2), ind)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchDeadBranch(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -214,8 +220,9 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesWithPredicateMatch(
           errors_impl.InvalidArgumentError,
           lambda e: "Retval[0] does not have value" in str(e)):
-        dead_branch.eval()
+        self.evaluate(dead_branch)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeLess(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -225,9 +232,10 @@ class ControlFlowTest(test.TestCase):
       switch_op = control_flow_ops.switch(data, less_op)
       merge_op = control_flow_ops.merge(switch_op)[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.arange(1, 7), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddIdentity(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -238,9 +246,10 @@ class ControlFlowTest(test.TestCase):
       id_op = array_ops.identity(switch_op[1])
       merge_op = control_flow_ops.merge([add_op, id_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x + 1 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testSwitchMergeAddMul(self):
     with self.cached_session():
       data = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")
@@ -252,9 +261,10 @@ class ControlFlowTest(test.TestCase):
       mul_op = math_ops.multiply(switch_op[1], five)
       merge_op = control_flow_ops.merge([add_op, mul_op])[0]
 
-      result = merge_op.eval()
+      result = self.evaluate(merge_op)
     self.assertAllEqual(np.array([x * 5 for x in [1, 2, 3, 4, 5, 6]]), result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_false(self):
     with self.cached_session():
       false = ops.convert_to_tensor(False)
@@ -269,9 +279,10 @@ class ControlFlowTest(test.TestCase):
       next_n = control_flow_ops.next_iteration(switch_n[0])
       merge_n.op._update_input(1, next_n)
 
-      result = exit_n.eval()
+      result = self.evaluate(exit_n)
     self.assertAllEqual(10, result)
 
+  @test_util.run_deprecated_v1
   def testLoop_1(self):
     with self.cached_session():
       zero = constant_op.constant(0)
@@ -295,9 +306,10 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoop_2(self):
     with self.cached_session():
       zero = constant_op.constant(0)
@@ -321,9 +333,10 @@ class ControlFlowTest(test.TestCase):
       merge_i.op._update_input(1, next_i)
 
       exit_i = control_flow_ops.exit(switch_i[0])
-      result = exit_i.eval()
+      result = self.evaluate(exit_i)
     self.assertAllEqual(10, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testDifferentFrame(self):
     with self.cached_session():
       data = array_ops.placeholder(dtypes.float32, shape=[])
@@ -333,6 +346,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesOpError("has inputs from different frames"):
         res.eval(feed_dict={data: 1.0})
 
+  @test_util.run_deprecated_v1
   def testCondBool(self):
     values = constant_op.constant(10)
     fn1 = lambda: math_ops.add(values, 1)
@@ -340,6 +354,7 @@ class ControlFlowTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, "must not be a Python bool"):
       _ = control_flow_ops.cond(False, fn1, fn2)
 
+  @test_util.run_deprecated_v1
   def testCondInt(self):
     p = array_ops.placeholder(dtypes.bool, shape=[])
     v = constant_op.constant(10)
@@ -356,6 +371,7 @@ class ControlFlowTest(test.TestCase):
         lambda: math_ops.subtract(x, 1.))
     self.assertEqual(b.shape, tensor_shape.scalar())
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchable(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -372,6 +388,7 @@ class ControlFlowTest(test.TestCase):
               sess.run(t, feed_dict={x: 3})
 
   @test_util.disable_control_flow_v2("Not relevant")
+  @test_util.run_v1_only("b/120545219")
   def testFeedable(self):
     with self.cached_session() as sess:
       c = constant_op.constant(2)
@@ -389,7 +406,7 @@ class ControlFlowTest(test.TestCase):
             with self.assertRaisesRegexp(ValueError, "may not be fed"):
               sess.run(r, feed_dict={t: 3})
 
-  @test_util.disable_control_flow_v2("b/113296180 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -405,7 +422,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(11, val)
     self.assertAllEqual(0, ind)
 
-  @test_util.disable_control_flow_v2("b/113296161 (SparseTensors)")
+  @test_util.run_v1_only("b/120545219")
   def testCondSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -423,6 +440,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual([[1], [4]], r.indices.eval())
       self.assertAllEqual(r.values.get_shape(), (2,))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondResource(self):
 
     with self.cached_session():
@@ -437,7 +455,22 @@ class ControlFlowTest(test.TestCase):
 
       self.assertEqual(1.0, control_flow_ops.cond(rv, case, lambda: t).eval())
 
+  @test_util.run_v1_only("b/120545219")
+  def testCondWithTensorArrayGrad(self):
+    with self.cached_session() as sess:
+      with ops.device(test.gpu_device_name()):
+        pred = array_ops.placeholder(dtypes.bool, [])
+        x = constant_op.constant([1.0, 2.0, 3.0])
+        y = control_flow_ops.cond(
+            pred, lambda: functional_ops.map_fn(lambda z: z * 2.0, x),
+            lambda: constant_op.constant([1.0, 1.0, 1.0]))
+        g = gradients_impl.gradients(y, x)[0]
+
+      self.assertAllEqual(sess.run(g, {pred: True}), [2.0, 2.0, 2.0])
+      self.assertAllEqual(sess.run(g, {pred: False}), [0.0, 0.0, 0.0])
+
   @test_util.disable_control_flow_v2("b/113293074")
+  @test_util.run_v1_only("b/120545219")
   def testCondIndexedSlicesDifferentTypes(self):
     with self.cached_session():
       values = constant_op.constant(10)
@@ -455,6 +488,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(0, ind)
     self.assertTrue(ind.dtype == np.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondColocation(self):
     with self.session(use_gpu=True):
       with ops.device("/cpu:0"):
@@ -478,7 +512,7 @@ class ControlFlowTest(test.TestCase):
       fn2 = lambda: math_ops.subtract(x, 1)
       r = control_flow_ops.cond(pred, fn1, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(11, result)
 
   def testCond_1(self):
@@ -494,7 +528,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(x, 1),
           lambda: math_ops.subtract(x, 1))
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(9, result)
 
   def testCond_3(self):
@@ -507,7 +541,7 @@ class ControlFlowTest(test.TestCase):
       fn3 = lambda: math_ops.add(control_flow_ops.cond(pred, fn1, fn2), 1)
       r = control_flow_ops.cond(pred, fn3, fn2)
 
-      result = r.eval()
+      result = self.evaluate(r)
     self.assertAllEqual(12, result)
 
   @test_util.run_in_graph_and_eager_modes
@@ -534,9 +568,9 @@ class ControlFlowTest(test.TestCase):
         result = f().eval()
         self.assertEqual(True, result)
         # Only second cond result was fetched, so v1 assign shouldn't run.
-        self.assertEqual(7, v1.eval())
-        self.assertEqual(2, v2.eval())
-        self.assertEqual(7, v3.eval())
+        self.assertEqual(7, self.evaluate(v1))
+        self.assertEqual(2, self.evaluate(v2))
+        self.assertEqual(7, self.evaluate(v3))
 
     result = f_defun()
     self.assertEqual(True, self.evaluate(result))
@@ -557,10 +591,10 @@ class ControlFlowTest(test.TestCase):
 
       for i in range(10):
         alive, count = body(i)
-      self.assertAllEqual(4, count.eval())
+      self.assertAllEqual(4, self.evaluate(count))
 
+  @test_util.run_v1_only("b/120545219")
   def testCond_6(self):
-
     with self.cached_session():
       v1 = variables.Variable([7])
 
@@ -571,7 +605,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       variables.global_variables_initializer().run()
-      result = r.eval()
+      result = self.evaluate(r)
       self.assertAllEqual(np.array([7]), result)
 
   def testCond_7(self):
@@ -582,8 +616,95 @@ class ControlFlowTest(test.TestCase):
       fn1 = lambda: [math_ops.add(x, 1), math_ops.add(x, 2)]
       fn2 = lambda: [y, y]
       r = control_flow_ops.cond(pred, fn1, fn2)
-      self.assertAllEqual([11, 12], sess.run(r))
+      self.assertAllEqual([11, 12], self.evaluate(r))
+
+  def testCondListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [math_ops.add(x, y), math_ops.add(x, y)]
+      fn2 = lambda: [y, y]
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertListEqual([210, 210], test_result)
+
+  def testTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: (math_ops.add(x, y), math_ops.add(x, y))
+      fn2 = lambda: (y, y)
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual((210, 210), test_result)
+
+  def testDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"a": y, "b": y}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": 210, "b": 210}, test_result)
+
+  @test_util.run_deprecated_v1
+  def testEmbeddedListOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: [[math_ops.add(x, y), math_ops.add(x, y)]]
+      fn2 = lambda: [[y, y]]
+      # Pass strict=True flag as cond_v2 allows for tensors to be
+      # in nested output structures as singletons
+      r = control_flow_ops.cond(pred, fn1, fn2, strict=True)
+      test_result = self.evaluate(r)
+      self.assertListEqual([[210, 210]], test_result)
+
+  def testEmbeddedTupleOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: ((math_ops.add(x, y), math_ops.add(x, y)))
+      fn2 = lambda: ((y, y))
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertTupleEqual(((210, 210)), test_result)
+
+  def testEmbeddedDictOutput(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": {"c": math_ops.add(x, y)},
+                     "b": {"d": math_ops.add(x, y)}}
+      fn2 = lambda: {"a": {"c": y},
+                     "b": {"d": y}}
+      r = control_flow_ops.cond(pred, fn1, fn2)
+      test_result = self.evaluate(r)
+      self.assertDictEqual({"a": {"c": 210}, "b": {"d": 210}}, test_result)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCheckNestedOutputStruct(self):
+    with self.cached_session() as sess:
+      x = constant_op.constant(10)
+      y = constant_op.constant(200)
+      pred = math_ops.less(1, 2)
+      fn1 = lambda: {"a": math_ops.add(x, y), "b": math_ops.add(x, y)}
+      fn2 = lambda: {"c": y, "d": y}
+      v1_msg = "The two structures don't have the same nested structure"
+      v2_msg = "Outputs of true_fn and false_fn must have the same structure"
+      with self.assertRaisesRegexp(
+          ValueError, v2_msg if control_flow_ops.ENABLE_COND_V2 else v1_msg):
+        r = control_flow_ops.cond(pred, fn1, fn2)
+        self.evaluate(r)
 
+  @test_util.run_deprecated_v1
   def testCondRef(self):
 
     with self.cached_session():
@@ -596,9 +717,10 @@ class ControlFlowTest(test.TestCase):
       true_fn = lambda: x
       false_fn = lambda: constant_op.constant([2.0])
       r = control_flow_ops.cond(constant_op.constant(False), true_fn, false_fn)
-      self.assertAllEqual([2.0], r.eval())
+      self.assertAllEqual([2.0], self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testCondWithControl(self):
     with self.cached_session():
       control_holder = array_ops.placeholder(dtypes.float32, shape=())
@@ -612,8 +734,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           constant_op.constant(True), true_branch,
           lambda: constant_op.constant(1))
-      self.assertEqual(5, r.eval())
+      self.assertEqual(5, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testUninitializedRefIdentity(self):
     with self.cached_session() as sess:
       v = gen_state_ops.variable(
@@ -636,7 +759,7 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([v_t_op]):
         orig_v = array_ops.identity(v)
       merged_op = control_flow_ops.merge([assign_v, orig_v])
-      self.assertAllEqual([1.0], sess.run(merged_op.output))
+      self.assertAllEqual([1.0], self.evaluate(merged_op.output))
 
   def testCondSwitchIdentity(self):
     # Make sure the recv identity is not removed by optimization.
@@ -650,7 +773,7 @@ class ControlFlowTest(test.TestCase):
         return control_flow_ops.Assert(False, ["Wrong branch!!!"])
 
       r = control_flow_ops.cond(pred, fn1, fn2)
-      sess.run(r)
+      self.evaluate(r)
 
   def testCondRecvIdentity(self):
     # Make sure the switch identity is not removed by optimization.
@@ -666,8 +789,9 @@ class ControlFlowTest(test.TestCase):
           return control_flow_ops.Assert(False, ["Wrong branch!!!"])
 
       r = control_flow_ops.cond(pred, fn1, fn2)
-      sess.run(r)
+      self.evaluate(r)
 
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_1(self):
     with self.cached_session():
       x = constant_op.constant(10.0, name="x")
@@ -677,8 +801,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(pred, fn1, fn2)
 
       grad = gradients_impl.gradients(r, [x])[0]
-      self.assertAllEqual(1.0, grad.eval())
+      self.assertAllEqual(1.0, self.evaluate(grad))
 
+  @test_util.run_deprecated_v1
   def testCondGrad_2(self):
     with self.cached_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -694,6 +819,7 @@ class ControlFlowTest(test.TestCase):
 
   @test_util.disable_control_flow_v2(
       "b/110550782 (gradient w.r.t external variable)")
+  @test_util.run_deprecated_v1
   def testCondGrad_3(self):
     with self.cached_session():
       c = array_ops.placeholder(dtypes.int32, shape=[])
@@ -711,6 +837,36 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(980.0, r.eval(feed_dict={c: 1}))
       self.assertAllEqual(30.0, r.eval(feed_dict={c: 3}))
 
+  @test_util.run_deprecated_v1
+  def testCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x = array_ops.placeholder(dtypes.float32)
+      y = array_ops.placeholder(dtypes.float32)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.cond(pred, lambda: x * y * 2.0, lambda: 2.0)
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x)[0]
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True, x: 1.0, y: 2.0}), 4.0)
+      self.assertEqual(sess.run(grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
+      # v1 control flow gets None second derivative for some reason.
+      if not control_flow_ops.ENABLE_COND_V2:
+        self.assertIsNone(grad_grad)
+        return
+
+      self.assertEqual(sess.run(grad_grad, {pred: True, x: 1.0, y: 2.0}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False, x: 1.0, y: 2.0}), 0.0)
+
+  @test_util.run_v1_only("b/120545219")
   def testNestedCond_Simple(self):
     with self.cached_session():
       x = constant_op.constant(0., name="X")
@@ -718,15 +874,16 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(True), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(y, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
       z = control_flow_ops.cond(
           constant_op.constant(False), lambda: x,
           lambda: control_flow_ops.cond(x < 1., lambda: x, lambda: x))
       result = gradients_impl.gradients(z, x)[0]
-      self.assertEqual(1.0, result.eval())
+      self.assertEqual(1.0, self.evaluate(result))
 
   @test_util.disable_control_flow_v2("b/113327884")
+  @test_util.run_v1_only("b/120545219")
   def testCondGrad_Gather(self):
     with self.cached_session() as sess:
       v1 = variables.Variable([1.0, 42.0])
@@ -740,16 +897,27 @@ class ControlFlowTest(test.TestCase):
       # Should just be [1, 1], but possibly a sparse representation
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 1})
       dense_gv = [
-          sum([y for (x, y) in zip(gi, gv) if x == i]) for i in range(2)
+          sum(y for (x, y) in zip(gi, gv) if x == i) for i in range(2)
       ]
       self.assertAllEqual(dense_gv, [1.0, 1.0])
       # Should be [0, 2], as the else forwards v1[1] twice
       gv, gi = sess.run([grad.values, grad.indices], feed_dict={c: 3})
       dense_gv = [
-          sum([y for (x, y) in zip(gi, gv) if x == i]) for i in range(2)
+          sum(y for (x, y) in zip(gi, gv) if x == i) for i in range(2)
       ]
       self.assertAllEqual(dense_gv, [0.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
+  def testCondPredicateTensor(self):
+    """Regression test for lowering predicate from non-first output of an op."""
+
+    @eager_function.defun
+    def foo():
+      return constant_op.constant("foo"), constant_op.constant(True)
+
+    r = control_flow_ops.cond(foo()[1], lambda: 1.0, lambda: 2.0)
+    self.assertEqual(self.evaluate(r), 1.0)
+
   # TODO(b/117945658): reenable
   @test_util.run_in_graph_and_eager_modes
   def DISABLED_testCondAutoControlDeps(self):
@@ -863,9 +1031,10 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10000)
       b = lambda x: math_ops.add(x, 1)
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependencies(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -882,6 +1051,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllEqual(v.eval(), 1.0)
 
   @test_util.disable_control_flow_v2("b/79881896 (control deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileExternalControlDependenciesNoInput(self):
     with self.cached_session():
       v = variables.Variable(0.0)
@@ -894,10 +1064,11 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(cond=lambda i: i < 5,
                                            body=body_fn, loop_vars=[0])
-      result.eval()
+      self.evaluate(result)
       self.assertAllEqual(v.eval(), 1.0)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefs_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0)._ref()  # pylint: disable=protected-access
@@ -917,7 +1088,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r[0].dtype, dtypes.int32)
       self.assertEqual(r[1].dtype, dtypes.int32_ref)
 
-      value_i, value_x = sess.run(r)
+      value_i, value_x = self.evaluate(r)
 
     self.assertEqual(100, value_i)
     self.assertEqual(0, value_x)
@@ -926,21 +1097,23 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session():
       s = constant_op.constant(0)
       r = isum(s)
-      self.assertAllEqual(45, r.eval())
+      self.assertAllEqual(45, self.evaluate(r))
 
   def testWhileWithMaximumIterations(self):
     with self.cached_session():
       s = constant_op.constant([1, 2, 3, 4, 5])
       r = isum(s, maximum_iterations=3)
-      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], r.eval())
+      self.assertAllEqual([1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3], self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with self.cached_session():
       r = control_flow_ops.while_loop(
           lambda i: i < 3, lambda i: i + 1, [0], maximum_iterations=1)
-      self.assertEqual(1, r.eval())
+      self.assertEqual(1, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115776323 (max_iters)")
+  @test_util.run_v1_only("b/120545219")
   def testSingleNestedMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -966,6 +1139,7 @@ class ControlFlowTest(test.TestCase):
     # Should execute without issue.
     self.assertEqual(3, self.evaluate(loop_execute))
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsWhileLoopGradientInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1026,6 +1200,7 @@ class ControlFlowTest(test.TestCase):
           r"context '.*' \(currently defined in '.*'\)"):
         _ = gradients_impl.gradients(loop_with_maxiter, v)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidMaximumIterationsFromSiblingContextWhileLoopInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1069,6 +1244,7 @@ class ControlFlowTest(test.TestCase):
         _ = gradients_impl.gradients(loop, v)
 
   @test_util.disable_control_flow_v2("b/118457764")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileLoopWithMaxItersFromOuterContextInXLAContext(self):
     v = constant_op.constant(1.0)
 
@@ -1137,6 +1313,7 @@ class ControlFlowTest(test.TestCase):
 
   # Have more than 10 parallel iterations and hence exercise k-bound
   # most of the time.
+  @test_util.run_deprecated_v1
   def testWhile_3(self):
     with self.cached_session():
 
@@ -1157,6 +1334,7 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(10100, result)
 
+  @test_util.run_deprecated_v1
   def testWhile_4(self):
     with self.cached_session():
 
@@ -1178,6 +1356,7 @@ class ControlFlowTest(test.TestCase):
       result = r[3].eval()
     self.assertAllEqual(42, result)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhile_5(self):
     with self.cached_session():
 
@@ -1203,6 +1382,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllEqual(np.array([0, 1, 2, 3, 4, 5, 6]), result)
 
   @test_util.disable_control_flow_v2("b/116338794 (buffer_reuse)")
+  @test_util.run_v1_only("b/120545219")
   def testBufferForwarding(self):
     run_options = config_pb2.RunOptions(
         trace_level=config_pb2.RunOptions.FULL_TRACE)
@@ -1231,7 +1411,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10.0)
       b = lambda x: math_ops.add(x, 1.0)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_1(self):
     self._testWhile_Gpu_1(use_gpu=False)
@@ -1247,7 +1427,7 @@ class ControlFlowTest(test.TestCase):
           return math_ops.add(x, 1.0)
 
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllClose(10.0, r.eval())
+      self.assertAllClose(10.0, self.evaluate(r))
 
   def testWhile_Gpu_2(self):
     self._testWhile_Gpu_2(use_gpu=False)
@@ -1268,15 +1448,16 @@ class ControlFlowTest(test.TestCase):
           c, _b, [i, m],
           [i.get_shape(), tensor_shape.unknown_shape()])
       r = r[1] * array_ops.ones([8, 8])
-      self.assertAllEqual(np.ones((8, 8)), r.eval())
+      self.assertAllEqual(np.ones((8, 8)), self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testWhileWithNonTensorInput_Scalar(self):
     with self.cached_session():
       n = 0
       c = lambda x: x < 10000
       b = lambda x: x + 1
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual(10000, r.eval())
+      self.assertEqual(10000, self.evaluate(r))
 
   def testWhileWithNonTensorInput_Vector(self):
     with self.cached_session():
@@ -1284,8 +1465,9 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: x[0] < 10000
       b = lambda x: array_ops.stack([x[0] + 1])
       r = control_flow_ops.while_loop(c, b, [n], parallel_iterations=20)
-      self.assertEqual([10000], r.eval())
+      self.assertEqual([10000], self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInference(self):
     with self.cached_session():
       i = constant_op.constant(0)
@@ -1312,6 +1494,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(c, b, [i, m])
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceSparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -1344,6 +1527,7 @@ class ControlFlowTest(test.TestCase):
             [i.get_shape(), tensor_shape.TensorShape([5])])
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileShapeInferenceIndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([[2.0, 4.0], [3.0, 5.0]], name="values")
@@ -1396,7 +1580,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 200)
       b = lambda x: math_ops.add(x, cpu_sum(n))
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertEqual(225, r.eval())
+      self.assertEqual(225, self.evaluate(r))
 
   def testNestedWhile_1(self):
     self._testNestedWhile_1(use_gpu=False)
@@ -1428,12 +1612,13 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(
           outer_c, outer_b, [s0], parallel_iterations=1)
-      self.assertEqual(1048576.0, r.eval())
+      self.assertEqual(1048576.0, self.evaluate(r))
 
   def testNestedWhile_2(self):
     self._testNestedWhile_2(use_gpu=False)
     self._testNestedWhile_2(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_1(self):
     with self.cached_session():
       n = constant_op.constant(0)
@@ -1450,6 +1635,7 @@ class ControlFlowTest(test.TestCase):
           condition, body, [n, r], parallel_iterations=1)
       self.assertAllEqual(12, res[1].eval())
 
+  @test_util.run_deprecated_v1
   def testWhileWithControl_2(self):
     with self.cached_session():
       r = constant_op.constant(0)
@@ -1462,8 +1648,9 @@ class ControlFlowTest(test.TestCase):
 
       res = control_flow_ops.while_loop(
           condition, body, [r], parallel_iterations=1)
-      self.assertAllEqual(12, res.eval())
+      self.assertAllEqual(12, self.evaluate(res))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_3(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1473,6 +1660,7 @@ class ControlFlowTest(test.TestCase):
         r = control_flow_ops.while_loop(lambda x: x < 10, lambda x: x + c, [x0])
       self.assertEqual(10, sess.run(r, {b: True}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_4(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1484,6 +1672,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(10, sess.run(r, {b: True}))
 
   @test_util.disable_control_flow_v2("b/79881896 (control_deps)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithControl_5(self):
     with self.cached_session() as sess:
       b = array_ops.placeholder(dtypes.bool)
@@ -1509,9 +1698,10 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([control_flow_ops.no_op()]):
         loop = control_flow_ops.while_loop(cond, body,
                                            (constant_op.constant(5),))
-      self.assertEqual(0, sess.run(loop))
+      self.assertEqual(0, self.evaluate(loop))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondWithControl_1(self):
     with self.cached_session():
       v = variable_scope.get_variable(
@@ -1531,10 +1721,11 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(loop_condition, loop_body, (i0,))
       variables.global_variables_initializer().run()
-      self.assertEqual(4, r.eval())
-      self.assertAllClose(65536.0, v.eval())
+      self.assertEqual(4, self.evaluate(r))
+      self.assertAllClose(65536.0, self.evaluate(v))
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileCondExitControl(self):
 
     with self.cached_session():
@@ -1556,8 +1747,8 @@ class ControlFlowTest(test.TestCase):
           constant_op.constant(False), lambda: constant_op.constant(1.0),
           false_branch)
       variables.global_variables_initializer().run()
-      self.assertEqual(6.0, r.eval())
-      self.assertEqual(99, v.eval())
+      self.assertEqual(6.0, self.evaluate(r))
+      self.assertEqual(99, self.evaluate(v))
 
   def testCondWhile_1(self):
 
@@ -1568,7 +1759,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(0, 1), lambda: control_flow_ops.while_loop(c, b, [n]),
           lambda: n)
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testCondWhile_2(self):
 
@@ -1579,7 +1770,7 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(
           math_ops.less(1, 0), lambda: math_ops.add(n, 1),
           lambda: control_flow_ops.while_loop(c, b, [n]))
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def _testCondWhile_3(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu) as sess:
@@ -1604,6 +1795,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([2.0], sess.run(r1, {p: False}))
 
   @test_util.disable_control_flow_v2("b/116743589")
+  @test_util.run_deprecated_v1
   def testCondWhile_3(self):
     self._testCondWhile_3(use_gpu=False)
     self._testCondWhile_3(use_gpu=True)
@@ -1622,7 +1814,7 @@ class ControlFlowTest(test.TestCase):
           lambda: math_ops.add(x, one), lambda: math_ops.subtract(x, one))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [i])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testWhileCond_2(self):
 
@@ -1631,7 +1823,7 @@ class ControlFlowTest(test.TestCase):
       c = lambda x: math_ops.less(x, 10)
       b = lambda x: control_flow_ops.cond(constant_op.constant(True), lambda: math_ops.add(x, 1), lambda: n)
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
 
   def testWhileCond_3(self):
 
@@ -1645,10 +1837,41 @@ class ControlFlowTest(test.TestCase):
                                           lambda: math_ops.subtract(x, 1))
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [n])
-      self.assertAllEqual(10, r.eval())
+      self.assertAllEqual(10, self.evaluate(r))
+
+  @test_util.run_deprecated_v1
+  def testWhileCondGradMultiDevice(self):
+    config = config_pb2.ConfigProto(device_count={"CPU": 2},
+                                    allow_soft_placement=True)
+    with self.cached_session(use_gpu=True, config=config) as sess:
+      pred = array_ops.placeholder(dtypes.bool, [])
+      x_init = constant_op.constant(1.0)
+
+      with ops.device("/cpu:0"):
+        z = control_flow_ops.while_loop(
+            lambda i, _: i < 3,
+            lambda i, x: (i + 1, control_flow_ops.cond(
+                pred, lambda: x * 2.0, lambda: 10.0)),
+            [0, x_init])
+
+      with ops.device("/cpu:1"):
+        grad = gradients_impl.gradients(z, x_init)[0]
+
+      with ops.device("/cpu:0"):
+        grad_grad = gradients_impl.gradients(grad, x_init)[0]
+
+      self.assertEqual(sess.run(grad, {pred: True}), 8.0)
+      self.assertEqual(sess.run(grad, {pred: False}), 0.0)
+
+      if not control_flow_ops.ENABLE_WHILE_V2:
+        return
+
+      self.assertEqual(sess.run(grad_grad, {pred: True}), 0.0)
+      self.assertEqual(sess.run(grad_grad, {pred: False}), 0.0)
 
   # NOTE: It is ok to have parallel_iterations > 1
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileUpdateVariable_1(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1667,11 +1890,12 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result = select.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result = self.evaluate(select)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_2(self):
     with self.cached_session():
       select1 = variables.Variable([3.0, 4.0, 5.0])
@@ -1692,13 +1916,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(
           loop_iterator, loop_body, [n], parallel_iterations=1)
       variables.global_variables_initializer().run()
-      self.assertEqual(3, r.eval())
-      result1 = select1.eval()
+      self.assertEqual(3, self.evaluate(r))
+      result1 = self.evaluate(select1)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result1)
-      result2 = select2.eval()
+      result2 = self.evaluate(select2)
       self.assertAllClose(np.array([10.0, 10.0, 10.0]), result2)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileUpdateVariable_3(self):
     with self.cached_session():
       select = variables.Variable([3.0, 4.0, 5.0])
@@ -1721,6 +1946,7 @@ class ControlFlowTest(test.TestCase):
     self.assertAllClose(np.array([10.0, 10.0, 10.0]), result)
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_4(self):
     with self.cached_session():
       var_a = variables.Variable(0, name="a")
@@ -1744,11 +1970,12 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1)
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_5(self):
     with self.cached_session():
       # Create some variables.
@@ -1773,12 +2000,13 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [var_b], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(10, var_a.eval())
-      self.assertEqual(10, var_b.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(10, self.evaluate(var_a))
+      self.assertEqual(10, self.evaluate(var_b))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileUpdateVariable_6(self):
     with self.cached_session():
       # Create some variables.
@@ -1803,11 +2031,12 @@ class ControlFlowTest(test.TestCase):
       lpa = control_flow_ops.while_loop(
           pred, loop_body, [c], parallel_iterations=1, name="loop")
 
-      self.assertEqual(0, var_b.eval())
-      lpa.eval()  # Run the loop
-      self.assertEqual(55, var_b.eval())
-      self.assertEqual(10, var_a.eval())
+      self.assertEqual(0, self.evaluate(var_b))
+      self.evaluate(lpa)  # Run the loop
+      self.assertEqual(55, self.evaluate(var_b))
+      self.assertEqual(10, self.evaluate(var_a))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileQueue_1(self):
     with self.cached_session():
       q = data_flow_ops.FIFOQueue(-1, dtypes.int32)
@@ -1822,11 +2051,23 @@ class ControlFlowTest(test.TestCase):
         return ni
 
       r = control_flow_ops.while_loop(c, b, [i], parallel_iterations=1)
-      self.assertEqual([10], r.eval())
+      self.assertEqual([10], self.evaluate(r))
       for i in xrange(10):
         self.assertEqual([i], q.dequeue().eval())
 
+  @test_util.run_v1_only("b/120545219")
+  def testWhileTimeOut(self):
+    run_options = config_pb2.RunOptions(timeout_in_ms=1)
+    with self.cached_session() as sess:
+      n = constant_op.constant(0)
+      c = lambda x: True
+      b = lambda x: math_ops.add(x, 1)
+      r = control_flow_ops.while_loop(c, b, [n])
+      with self.assertRaises(errors_impl.DeadlineExceededError):
+        sess.run(r, options=run_options)
+
   @test_util.disable_control_flow_v2("b/117119329 (stack)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileStack_1(self):
     with self.cached_session():
       s = gen_data_flow_ops.stack_v2(-1, dtypes.int32, stack_name="foo")
@@ -1858,7 +2099,7 @@ class ControlFlowTest(test.TestCase):
           b1, [r, x],
           [r.get_shape(), tensor_shape.unknown_shape()],
           parallel_iterations=1)
-      self.assertEqual(45, rx.eval())
+      self.assertEqual(45, self.evaluate(rx))
 
   def _testWhileGrad_ColocateGradients(self, colocate):
     gpu_dev_name = test.gpu_device_name() if test.is_gpu_available(
@@ -1893,13 +2134,15 @@ class ControlFlowTest(test.TestCase):
         self.assertFalse(gpu_dev_name in dev)
 
     with self.session(graph=graph) as sess:
-      self.assertAllClose(1024.0, sess.run(r))
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116351701 (colocation)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ColocateGradients(self):
     self._testWhileGrad_ColocateGradients(colocate=False)
     self._testWhileGrad_ColocateGradients(colocate=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Square(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1909,8 +2152,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.cond(math_ops.less(1, 2), lambda: r, lambda: v)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Shape(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -1928,6 +2172,7 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual([None], r.get_shape().as_list())
       self.assertAllClose([810.0, 2560.0], r.eval(feed_dict={x: [3.0, 4.0]}))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_BaseShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32, [None])
@@ -1940,6 +2185,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([r, y], x)[0]
       self.assertAllClose([2.0, 4.0], sess.run(r, feed_dict={x: [1.0, 2.0]}))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_MultipleUses(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1949,8 +2195,9 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.multiply(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertEqual(524288.0, r.eval())
+      self.assertEqual(524288.0, self.evaluate(r))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_LoopAdd(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -1960,7 +2207,7 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(r, r)
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(2048.0, r.eval())
+      self.assertAllClose(2048.0, self.evaluate(r))
 
   def _testWhileGrad_Mul(self, use_gpu, p_iters):
     with self.cached_session(use_gpu=use_gpu) as sess:
@@ -1971,11 +2218,12 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v], parallel_iterations=p_iters)
 
       grad_a, grad_v = gradients_impl.gradients(r, [a, v])
-      grad_a_val, grad_v_val = sess.run([grad_a, grad_v])
+      grad_a_val, grad_v_val = self.evaluate([grad_a, grad_v])
       self.assertAllClose(216.0, grad_a_val)
       self.assertAllClose(81.0, grad_v_val)
 
   @test_util.disable_control_flow_v2("b/116630618 (parallel_iters: times out)")
+  @test_util.run_deprecated_v1
   def testWhileGrad_Mul(self):
     self._testWhileGrad_Mul(use_gpu=False, p_iters=1)
     self._testWhileGrad_Mul(use_gpu=False, p_iters=10)
@@ -2003,17 +2251,17 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testNestedWhileCondWhileGrad(self):
-    if control_flow_ops.ENABLE_WHILE_V2 and test_util.is_gpu_available():
-      self.skipTest("b/118459209")
     self._testNestedWhileCondWhileGrad(use_gpu=False)
 
-  @test_util.disable_control_flow_v2("b/118459209")
+  @test_util.run_deprecated_v1
   def testNestedWhileCondWhileGradGpu(self):
     self._testNestedWhileCondWhileGrad(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_Variable(self):
     with self.cached_session():
       a = variables.Variable(3.0)
@@ -2026,6 +2274,7 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(216.0, r[0].eval())
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_ResourceVariable(self):
     with self.cached_session():
       a = resource_variable_ops.ResourceVariable(3.0)
@@ -2038,6 +2287,7 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(216.0, g[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInCond(self):
 
     with self.cached_session():
@@ -2049,12 +2299,13 @@ class ControlFlowTest(test.TestCase):
       def fn1():
         r = control_flow_ops.while_loop(c, b, [n],
                                         [tensor_shape.unknown_shape()])
-        return gradients_impl.gradients(r, x)
+        return gradients_impl.gradients(r, x)[0]
 
       r = control_flow_ops.cond(math_ops.less(1, 2), fn1, lambda: x)
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
   @test_util.disable_control_flow_v2("b/116340060")
+  @test_util.run_v1_only("b/120545219")
   def testGradInWhileWrtInitialLoopVal(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=(), name="x")
@@ -2072,6 +2323,7 @@ class ControlFlowTest(test.TestCase):
           "loop invariants or wrt the input parameters to the loop body."):
         control_flow_ops.while_loop(lambda i, x: i < 3, body, [0, y])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradInWhile(self):
     with self.cached_session():
       n = ops.convert_to_tensor(1.0, name="n")
@@ -2088,6 +2340,7 @@ class ControlFlowTest(test.TestCase):
                                       [tensor_shape.unknown_shape()])
       self.assertAllClose(9.0, r.eval(feed_dict={x: 1.0}))
 
+  @test_util.run_v1_only("b/120545219")
   def testCondGradInNestedWhiles(self):
 
     def outer_body(i, x):
@@ -2102,10 +2355,53 @@ class ControlFlowTest(test.TestCase):
     i, x = control_flow_ops.while_loop(lambda i, x: i < 3, outer_body, [0, 0.0])
 
     with self.cached_session() as sess:
-      i_val, x_val = sess.run([i, x])
+      i_val, x_val = self.evaluate([i, x])
       self.assertEqual(i_val, 3)
       self.assertAllClose(x_val, 1.0)
 
+  def testNestedResourceAccess(self):
+    var = resource_variable_ops.ResourceVariable(constant_op.constant(3.0))
+
+    @eager_function.defun
+    def test_fn():
+      x = constant_op.constant(0.0)
+      r = control_flow_ops.while_loop(
+          # Outer loop condition
+          lambda i, y: i < 2,
+          # Outer loop body
+          lambda i, y: (i + 1, y + control_flow_ops.cond(
+              constant_op.constant(True),
+              # True branch
+              lambda: control_flow_ops.while_loop(
+                  # Inner loop condition
+                  lambda j, z: j < 3,
+                  # Inner loop body
+                  lambda j, z: (j + 1, z + math_ops.square(var)),
+                  # Inner initial loop value
+                  [0, y])[1],
+              # False branch
+              lambda: (0.0))),
+          # Outer initial loop value
+          [0, x])[1]
+
+      grad = gradients_impl.gradients(r, x)[0]
+      return r, grad
+
+    self.evaluate(variables.global_variables_initializer())
+    r, grad = self.evaluate(test_fn())
+    # 2 * 3 * 3^2
+    self.assertEqual(r, 81.0)
+    # v1 control flow gets the wrong answer!!!
+    # Gradient computation:
+    #   f(x) = x + 3^2
+    #   inner_loop(x) = f(f(f(x))) = x + 3*3^2 = x + 27
+    #   g(x) = x + inner_loop(x) = 2x + 27
+    #   outer_loop(x) = g(g(x)) = 4x + 81
+    #   outer_loop'(x) = 4
+    # Note that v1 control flow gets 4.0 as well if the cond is removed.
+    if control_flow_ops.ENABLE_WHILE_V2 and control_flow_ops.ENABLE_COND_V2:
+      self.assertEqual(grad, 4.0)
+
   def testWhile_NestedInput(self):
     with self.cached_session() as sess:
       named = collections.namedtuple("named", ("a", "b"))
@@ -2131,8 +2427,9 @@ class ControlFlowTest(test.TestCase):
 
       r_flattened = nest.flatten(r)
       self.assertEqual([100.0, 1.0, 102.0, 3.0, 4.0 + 100 * 2.0],
-                       sess.run(r_flattened))
+                       self.evaluate(r_flattened))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhile_NestedBadArityFails(self):
     with self.cached_session():
       named = collections.namedtuple("named", ("a", "b"))
@@ -2149,6 +2446,7 @@ class ControlFlowTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "the same number of elements"):
         control_flow_ops.while_loop(c, b, loop_vars)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ys_xs(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2172,6 +2470,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], y)
       self.assertAllClose(120.0, r[0].eval())
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Dependency(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2192,6 +2491,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1024.0, r[0].eval())
 
   @test_util.disable_control_flow_v2("b/116355153 (back_prop flag)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoGradient(self):
     with self.cached_session():
       v = constant_op.constant(2.0, name="v")
@@ -2203,6 +2503,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(1.0, r[0].eval())
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_NoDependency(self):
     with self.cached_session() as sess:
       variable = variables.Variable(array_ops.ones([2, 3]))
@@ -2223,6 +2524,7 @@ class ControlFlowTest(test.TestCase):
       variables.global_variables_initializer().run()
       self.assertAllClose(np.ones([2, 3]), sess.run(grad[0]))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Const(self):
     with self.cached_session() as sess:
       c0 = constant_op.constant(0.0, name="c0")
@@ -2242,6 +2544,7 @@ class ControlFlowTest(test.TestCase):
       grad = gradients_impl.gradients(cost, [c0])
       self.assertAllClose(0.0, sess.run(grad[0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SerialTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2260,6 +2563,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], x)
       self.assertAllClose(1024.0, r[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_ParallelTwoLoops(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2279,6 +2583,7 @@ class ControlFlowTest(test.TestCase):
       r = gradients_impl.gradients([rx], x)
       self.assertAllClose(64.0, r[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_OneOutputWithControlDependencyOnSecond(self):
     with self.cached_session():
       i = constant_op.constant(0, name="i")
@@ -2295,10 +2600,11 @@ class ControlFlowTest(test.TestCase):
       with ops.control_dependencies([x_f]):
         y_f_d = array_ops.identity(y_f, name="y_f_d")
 
-      self.assertAllClose(2.0, y_f_d.eval())  # y_f_d = 1.0 + 1.0
+      self.assertAllClose(2.0, self.evaluate(y_f_d))  # y_f_d = 1.0 + 1.0
       g = gradients_impl.gradients([y_f_d], [x])[0]
       self.assertTrue(g is not None)
-      self.assertAllClose(1.0, g.eval())  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
+      self.assertAllClose(1.0,
+                          self.evaluate(g))  # y_f_d = x + 1.0, dy_f_d/dx = 1.0
 
   def _testNestedWhileGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2314,12 +2620,14 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(8.0, r.eval())
+      self.assertAllClose(8.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testNestedWhileGrad_Simple(self):
     self._testNestedWhileGrad_Simple(use_gpu=False)
     self._testNestedWhileGrad_Simple(use_gpu=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_SerialInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2341,8 +2649,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(256.0, r.eval())
+      self.assertAllClose(256.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testNestedWhileGrad_ParallelInner(self):
     with self.cached_session():
       v = constant_op.constant(1.0)
@@ -2364,10 +2673,9 @@ class ControlFlowTest(test.TestCase):
       r = control_flow_ops.while_loop(c, b, [v])
 
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(512.0, r.eval())
+      self.assertAllClose(512.0, self.evaluate(r))
 
-  @test_util.disable_control_flow_v2("unsupported: resource creation in body. "
-                                     "Enable with new TAs b/117675481")
+  @test_util.run_v1_only("b/120545219")
   def testNestedWhileGrad_ParallelIterations(self):
     # Make sure the stack pushes and pops of an inner loop are executed in
     # the sequential order of the iterations of its outer loop.
@@ -2386,9 +2694,9 @@ class ControlFlowTest(test.TestCase):
       res = outer_loop(inp)
       optimizer = adam.AdamOptimizer(learning_rate=0.001)
       train_op = optimizer.minimize(math_ops.reduce_mean(math_ops.square(res)))
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(2.999, var.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(train_op)
+      self.assertAllClose(2.999, self.evaluate(var))
 
   def _testWhileCondGrad_Simple(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -2404,14 +2712,16 @@ class ControlFlowTest(test.TestCase):
       # pylint: enable=undefined-variable
       r = control_flow_ops.while_loop(c, b, [v])
       r = gradients_impl.gradients(r, v)[0]
-      self.assertAllClose(1024.0, r.eval())
+      self.assertAllClose(1024.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/117519152")
+  @test_util.run_deprecated_v1
   def testWhileCondGrad_Simple(self):
     self._testWhileCondGrad_Simple(use_gpu=False)
     self._testWhileCondGrad_Simple(use_gpu=True)
 
   @test_util.disable_control_flow_v2("b/117276490")
+  @test_util.run_deprecated_v1
   def testWhileCondGrad_UnknownShape(self):
     with self.cached_session() as sess:
       v = array_ops.placeholder(dtypes.float32)
@@ -2429,6 +2739,7 @@ class ControlFlowTest(test.TestCase):
       r = sess.run(r, feed_dict={v: 2.0})
       self.assertAllClose(1024.0, r)
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_Concat(self):
     with self.cached_session() as sess:
       x = variable_scope.get_variable("x", initializer=[[1., 2.]])
@@ -2446,13 +2757,15 @@ class ControlFlowTest(test.TestCase):
           [i0.get_shape(), tensor_shape.TensorShape([None, 2])])
       s = math_ops.reduce_sum(h)
 
-      sess.run(variables.global_variables_initializer())
       optimizer = gradient_descent.GradientDescentOptimizer(0.01)
       op = optimizer.minimize(s)
-      sess.run(op)
-      self.assertAllClose([[0.98000002, 1.98000002]], sess.run(x))
+
+      self.evaluate(variables.global_variables_initializer())
+      self.evaluate(op)
+      self.assertAllClose([[0.98000002, 1.98000002]], self.evaluate(x))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithRefsWithGradients_1(self):
     with self.cached_session() as sess:
       x = variables.VariableV1(0.)._ref()  # pylint: disable=protected-access
@@ -2482,6 +2795,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual(73, value_x_grad)
 
   @test_util.disable_control_flow_v2("b/116282023 (IndexedSlices)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_IndexedSlices(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2501,9 +2815,10 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/116328420 (SparseTensor)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGrad_SparseTensor(self):
     with self.cached_session():
       values = constant_op.constant([2.0, 4.0], name="values")
@@ -2524,9 +2839,10 @@ class ControlFlowTest(test.TestCase):
 
       _, r = control_flow_ops.while_loop(c, b, [i, x])
       r = gradients_impl.gradients(r.values, values)[0]
-      self.assertAllClose(np.array([1024.0, 1024.0]), r.eval())
+      self.assertAllClose(np.array([1024.0, 1024.0]), self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testCallGradInLoop(self):
     with self.cached_session() as sess:
       i0 = constant_op.constant(0)
@@ -2544,10 +2860,9 @@ class ControlFlowTest(test.TestCase):
 
       output_grad = control_flow_ops.while_loop(
           c, b, [i0, constant_op.constant(0.0)])
-      self.assertAllClose(600.0, sess.run(output_grad)[1])
+      self.assertAllClose(600.0, self.evaluate(output_grad)[1])
 
-  @test_util.disable_control_flow_v2("unsupported: resource creation in body. "
-                                     "Enable with new TAs b/117675481")
+  @test_util.run_deprecated_v1
   def testWhileAndTensorArray(self):
     with self.cached_session() as sess:
       param = constant_op.constant(2.0)
@@ -2565,8 +2880,9 @@ class ControlFlowTest(test.TestCase):
 
       r = control_flow_ops.while_loop(c, b, [n0, y0], parallel_iterations=1)
       r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(107520.0, sess.run(r))
+      self.assertAllClose(107520.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testWhileGrad_StopGrad(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2582,9 +2898,9 @@ class ControlFlowTest(test.TestCase):
       rx, ry = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(ry, y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
       r = gradients_impl.gradients(array_ops.stop_gradient(rx), y)[0]
       self.assertEqual(r, None)
@@ -2602,14 +2918,16 @@ class ControlFlowTest(test.TestCase):
       self.assertEqual(r, None)
 
       r = gradients_impl.gradients(math_ops.add(rx, ry), y)[0]
-      self.assertEqual(168.0, r.eval())
+      self.assertEqual(168.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(rx, array_ops.stop_gradient(ry)), y)[0]
-      self.assertEqual(136.0, r.eval())
+      self.assertEqual(136.0, self.evaluate(r))
       r = gradients_impl.gradients(
           math_ops.add(array_ops.stop_gradient(rx), ry), y)[0]
-      self.assertEqual(32.0, r.eval())
+      self.assertEqual(32.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInside(self):
     with self.cached_session():
       x = constant_op.constant(3.0, name="x")
@@ -2625,10 +2943,12 @@ class ControlFlowTest(test.TestCase):
       rx, _ = control_flow_ops.while_loop(c, b, [x, y])
 
       r = gradients_impl.gradients(rx, y)[0]
-      self.assertAllClose(0.0, r.eval())
+      self.assertAllClose(0.0, self.evaluate(r))
       r = gradients_impl.gradients(rx, x)[0]
-      self.assertAllClose(156.0, r.eval())
+      self.assertAllClose(156.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
+  @test_util.disable_control_flow_v2("b/118712257")
   def testWhileGrad_StopGradInsideNoShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -2650,9 +2970,10 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose([156.0, 400.0], sess.run(r, feed_dict=feed_dict))
       name = "gradients/while/stopped_grad"
       all_ops = x.graph.get_operations()
-      self.assertFalse(any([name in op.name for op in all_ops]))
+      self.assertFalse(any(name in op.name for op in all_ops))
 
   @test_util.disable_control_flow_v2("b/117954949")
+  @test_util.run_deprecated_v1
   def testWhileGradGradFail(self):
     theta = variables.Variable(initial_value=1.)
 
@@ -2667,6 +2988,7 @@ class ControlFlowTest(test.TestCase):
     grad_theta_stopped = array_ops.stop_gradient(grad_theta)
     gradients_impl.gradients(grad_theta_stopped, theta)
 
+  @test_util.run_deprecated_v1
   def testStopGradOnWhileGrad(self):
     with self.cached_session():
       x = constant_op.constant(2.0, name="x")
@@ -2681,9 +3003,10 @@ class ControlFlowTest(test.TestCase):
       r = math_ops.add(math_ops.square(y), rx)
       r = math_ops.add(r, rg)
       r = gradients_impl.gradients(r, y)[0]
-      self.assertEqual(388.0, r.eval())
+      self.assertEqual(388.0, self.evaluate(r))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_deprecated_v1
   def testWhileGradientWithNontrainablePath1(self):
     q = variables.Variable([7., 8.])
 
@@ -2698,10 +3021,11 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([0., 0.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([0., 0.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/113324949 (RefVariable)")
+  @test_util.run_v1_only("b/120545219")
   def testWhileGradientWithNontrainablePath2(self):
     q = variables.Variable([7., 8.])
 
@@ -2716,10 +3040,11 @@ class ControlFlowTest(test.TestCase):
     dy_dq, = gradients_impl.gradients(y, q)
     self.assertIsNotNone(dy_dq)
     with self.cached_session() as sess:
-      sess.run(q.initializer)
-      self.assertAllClose([1., 1.], sess.run(dy_dq))
+      self.evaluate(q.initializer)
+      self.assertAllClose([1., 1.], self.evaluate(dy_dq))
 
   @test_util.disable_control_flow_v2("b/115920078 (gradients)")
+  @test_util.run_v1_only("b/120545219")
   def testIssue16504(self):
     c = constant_op.constant(np.arange(100), dtype=dtypes.float32)
     w = variables.Variable(
@@ -2743,6 +3068,7 @@ class ControlFlowTest(test.TestCase):
     grad, = gradients_impl.gradients(w, c)
     self.assertIsNotNone(grad)
 
+  @test_util.run_v1_only("b/120545219")
   def testStopGradMultiFlows(self):
     with self.cached_session():
 
@@ -2767,8 +3093,9 @@ class ControlFlowTest(test.TestCase):
       z = math_ops.add(r, array_ops.stop_gradient(math_ops.reduce_sum(grads)))
       result = gradients_impl.gradients(z, vars_)[0]
       variables.global_variables_initializer().run()
-      self.assertEqual(5.0, result.eval())
+      self.assertEqual(5.0, self.evaluate(result))
 
+  @test_util.run_v1_only("b/120545219")
   def testOneValueCond(self):
 
     with self.cached_session():
@@ -2785,6 +3112,7 @@ class ControlFlowTest(test.TestCase):
       # False case: c = 0 is not >= 1
       self.assertEqual([2], i.eval(feed_dict={c: 0}))
 
+  @test_util.run_deprecated_v1
   def testExampleCond(self):
 
     with self.cached_session():
@@ -2801,6 +3129,7 @@ class ControlFlowTest(test.TestCase):
       self.assertAllClose(4.0, i.eval(feed_dict={d: 1}))
       self.assertAllClose(2.0 * math.sqrt(2), i.eval(feed_dict={d: 2}))
 
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with self.cached_session():
       x = constant_op.constant(1)
@@ -2828,7 +3157,7 @@ class ControlFlowTest(test.TestCase):
       r4 = control_flow_ops.case(
           [(x < y, f1), (x < y, f2)], default=f3, exclusive=True)
       with self.assertRaisesOpError("Input error:"):
-        r4.eval()
+        self.evaluate(r4)
 
       # Check that the default is called if none of the others are
       r5 = control_flow_ops.case({x > y: f1}, default=f3)
@@ -2853,6 +3182,7 @@ class ControlFlowTest(test.TestCase):
 
       self.assertAllEqual(r6.eval(), 0)
 
+  @test_util.run_v1_only("b/120545219")
   def testCaseSideEffects(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(-1)
@@ -2874,21 +3204,22 @@ class ControlFlowTest(test.TestCase):
           ((x > y, a), (x > y, b)), default=c, exclusive=True)
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(2, r2.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1, -1, 2])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(2, self.evaluate(r2))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, -1, 2])
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(1, r1.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1, 1, -1])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(1, self.evaluate(r1))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1, 1, -1])
 
       variables.global_variables_initializer().run()
-      self.assertAllEqual(sess.run([v0, v1, v2]), [-1] * 3)
-      self.assertEqual(0, r0.eval())
-      self.assertAllEqual(sess.run([v0, v1, v2]), [0, -1, -1])
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [-1] * 3)
+      self.assertEqual(0, self.evaluate(r0))
+      self.assertAllEqual(self.evaluate([v0, v1, v2]), [0, -1, -1])
 
   @test_util.disable_control_flow_v2("b/113324949 (ref vars)")
+  @test_util.run_v1_only("b/120545219")
   def testOneOpCond(self):
     with self.cached_session():
       v = variables.Variable(0)
@@ -2907,16 +3238,17 @@ class ControlFlowTest(test.TestCase):
       self.assertTrue(isinstance(i, ops.Tensor))
       variables.global_variables_initializer().run()
 
-      self.assertEqual(0, v.eval())
+      self.assertEqual(0, self.evaluate(v))
 
       # True case: c = 2 is >= 1, v is set to 1.
       self.assertEqual(1, i.eval(feed_dict={c.name: 2}))
-      self.assertEqual(1, v.eval())
+      self.assertEqual(1, self.evaluate(v))
 
       # False case: c = 0 is not >= 1, v is set to 2.
       self.assertEqual(2, i.eval(feed_dict={c.name: 0}))
-      self.assertEqual(2, v.eval())
+      self.assertEqual(2, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithOpsDependencies(self):
     with self.cached_session() as sess:
       v = variables.VariableV1(0.0)
@@ -2924,7 +3256,7 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching v directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        sess.run([c, v])
+        self.evaluate([c, v])
 
       # Use a control dependency to ensure init_variable is run
       # while asking for c
@@ -2932,7 +3264,7 @@ class ControlFlowTest(test.TestCase):
           name="real_tensor",
           output_tensor=v._ref(),  # pylint: disable=protected-access
           dependencies=[v.initializer])
-      c_val, real_v_val = sess.run([c, real_v])
+      c_val, real_v_val = self.evaluate([c, real_v])
 
     # Ensure the result of 'real_c' is the same as 'c'
     self.assertAllEqual(10, c_val)
@@ -2940,6 +3272,7 @@ class ControlFlowTest(test.TestCase):
     # Ensure that 'v' is initialized
     self.assertAllClose(0.0, real_v_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testWithTensorDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(0.0)
@@ -2957,15 +3290,16 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching v directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v.eval()
+        self.evaluate(v)
 
       # Get the value of 'c2_with_c1_dep', which should cause 'v'
       # to be initialized.
-      self.assertAllEqual(20, c2_with_c1_dep.eval())
+      self.assertAllEqual(20, self.evaluate(c2_with_c1_dep))
 
       # Ensure that 'v' is initialized
-      self.assertAllClose(0.0, v.eval())
+      self.assertAllClose(0.0, self.evaluate(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWithIndexedSlicesDependencies(self):
     with self.cached_session():
       v = variables.VariableV1(
@@ -2979,13 +3313,15 @@ class ControlFlowTest(test.TestCase):
 
       # Fetching gather_v_at_1 will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        gather_v_at_1.eval()
+        self.evaluate(gather_v_at_1)
 
       # Getting gather_v_at_1_after_init will work, and initialize v.
-      self.assertAllEqual([[10.0, 11.0]], gather_v_at_1_after_init.eval())
+      self.assertAllEqual([[10.0, 11.0]],
+                          self.evaluate(gather_v_at_1_after_init))
 
       # Double check that 'v' is initialized
-      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]], v.eval())
+      self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
+                          self.evaluate(v))
 
   def testDependenciesDevice(self):
     with ops.Graph().as_default():
@@ -3010,6 +3346,7 @@ class ControlFlowTest(test.TestCase):
         self.assertDeviceEqual("", with_vdef_dep.device)
         self.assertEqual([b"loc:@vdef"], with_vdef_dep.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testGroup(self):
     with self.cached_session() as sess:
       v1 = variables.VariableV1([0.0])
@@ -3019,21 +3356,23 @@ class ControlFlowTest(test.TestCase):
       init = control_flow_ops.group(v1.initializer, v2.initializer)
       # Fetching v1 directly will result in an uninitialized error
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        v1.eval()
+        self.evaluate(v1)
 
       # Runs "init" before fetching v1 and v2.
       init.run()
-      v1_val, v2_val = sess.run([v1, v2])
+      v1_val, v2_val = self.evaluate([v1, v2])
 
     # Ensure that v1 and v2 are initialized
     self.assertAllClose([0.0], v1_val)
     self.assertAllClose([1.0], v2_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testGroupEmpty(self):
     op = control_flow_ops.group()
     self.assertEqual(op.type, "NoOp")
     self.assertEqual(op.control_inputs, [])
 
+  @test_util.run_deprecated_v1
   def testMergeShapes(self):
     # All inputs unknown.
     p1 = array_ops.placeholder(dtypes.float32)
@@ -3088,6 +3427,7 @@ class ControlFlowTest(test.TestCase):
     self.assertEqual([None, None], m.get_shape().as_list())
     self.assertEqual([], index.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testRefSelect(self):
     index = array_ops.placeholder(dtypes.int32)
 
@@ -3121,6 +3461,7 @@ class ControlFlowTest(test.TestCase):
     s = control_flow_ops.ref_select(index, [v1, v2])
     self.assertEqual(None, s.get_shape())
 
+  @test_util.run_deprecated_v1
   def testRunLoopTensor(self):
     with self.cached_session() as sess:
       tensor_list = []
@@ -3134,13 +3475,14 @@ class ControlFlowTest(test.TestCase):
 
       result = control_flow_ops.while_loop(condition, body,
                                            [constant_op.constant(4)])
-      self.assertEqual(10, sess.run(result))
+      self.assertEqual(10, self.evaluate(result))
 
       # Ensure that we cannot run a tensor that escapes the loop body
       # accidentally.
       with self.assertRaises(ValueError):
         sess.run(tensor_list[0])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhilePyFuncBasic(self):
 
     def func(x):
@@ -3154,6 +3496,7 @@ class ControlFlowTest(test.TestCase):
           [tensor_shape.unknown_shape(), tensor_shape.unknown_shape()])
       self.assertEqual(r[1].eval(), 65536.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileFuncBasic(self):
 
     @function.Defun(dtypes.float32)
@@ -3167,10 +3510,9 @@ class ControlFlowTest(test.TestCase):
           [constant_op.constant(0), x],
           [tensor_shape.unknown_shape(),
            tensor_shape.unknown_shape()])
+      grad = gradients_impl.gradients(r, x)[0]
       self.assertEqual(r[1].eval(), 65536.0)
-
-      r = gradients_impl.gradients(r, x)[0]
-      self.assertEqual(r.eval(), 524288.0)
+      self.assertEqual(grad.eval(), 524288.0)
       # while_v2 does not have stacks.
       if not control_flow_ops.ENABLE_WHILE_V2:
         self.assertEqual(
@@ -3178,27 +3520,39 @@ class ControlFlowTest(test.TestCase):
                 ]), 1)
 
 
+  @test_util.run_v1_only("b/120545219")
   def testQIntSwitchMerge(self):
     with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
       constant_qint = constant_op.constant(np.array([42]), dtypes.qint8)
       cond = constant_op.constant(True, dtypes.bool)
       v_f, v_t = control_flow_ops.switch(constant_qint, cond)
       result = control_flow_ops.merge([v_f, v_t])
-      sess.run(result)
+      self.evaluate(result)
 
+  @test_util.run_v1_only("b/120545219")
   def testQIntRefSwitchMerge(self):
     with self.cached_session(use_gpu=test.is_gpu_available()) as sess:
       var_qint = gen_state_ops.variable(
           shape=[1], dtype=dtypes.qint8, name="v", container="", shared_name="")
       assign_op = state_ops.assign(
           var_qint, constant_op.constant(np.array([42]), dtypes.qint8))
-      sess.run(assign_op)
+      self.evaluate(assign_op)
 
       cond = constant_op.constant(True, dtypes.bool)
       v_f, v_t = control_flow_ops.ref_switch(var_qint, cond)
       result = control_flow_ops.ref_merge([v_f, v_t])
-      sess.run(result)
+      self.evaluate(result)
+
+  @test_util.run_v1_only("b/120545219")
+  def testUInt64SwitchMerge(self):
+    with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
+      constant_uint64 = constant_op.constant(np.array([42]), dtypes.uint64)
+      cond = constant_op.constant(True, dtypes.bool)
+      v_f, v_t = control_flow_ops.switch(constant_uint64, cond)
+      result = control_flow_ops.merge([v_f, v_t])
+      self.evaluate(result)
 
+  @test_util.run_deprecated_v1
   def testQIntArgAndRet(self):
 
     @function.Defun(dtypes.qint8)
@@ -3208,7 +3562,7 @@ class ControlFlowTest(test.TestCase):
     with self.cached_session(force_gpu=test.is_gpu_available()) as sess:
       qint = constant_op.constant(np.array([42]), dtypes.qint8)
       result = func(qint)
-      sess.run(result)
+      self.evaluate(result)
 
 
 class ControlFlowContextCheckTest(test.TestCase):
@@ -3237,6 +3591,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         math_ops.less(1, 2), true_fn, lambda: constant_op.constant(0))
     return cond_tensor[0]
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContext(self):
     # Accessing a while loop tensor outside of control flow is illegal.
     while_tensor = self._getWhileTensor()
@@ -3246,6 +3601,7 @@ class ControlFlowContextCheckTest(test.TestCase):
         "is in a while loop. See info log for more details."):
       math_ops.add(1, while_tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInCond(self):
     # Accessing a while loop tensor in cond is illegal.
     while_tensor = self._getWhileTensor()
@@ -3258,6 +3614,7 @@ class ControlFlowContextCheckTest(test.TestCase):
           math_ops.less(1, 2), lambda: math_ops.add(1, while_tensor),
           lambda: constant_op.constant(0))
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidContextInWhile(self):
     # Accessing a while loop tensor in a different while loop is illegal.
     while_tensor = self._getWhileTensor()
@@ -3292,6 +3649,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.cond(math_ops.less(1, 2), branch_fn, branch_fn)
 
+  @test_util.run_v1_only("b/120545219")
   def testValidWhileContext(self):
     # Accessing a tensor in a nested while is OK.
     def body(_):
@@ -3300,6 +3658,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
+  @test_util.run_v1_only("b/120545219")
   def testValidNestedContexts(self):
     # Accessing a tensor from a cond context in a while context, all inside an
     # outer while context, is OK.
@@ -3314,6 +3673,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
     control_flow_ops.while_loop(lambda i: i < 5, body, [0])
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidNestedContexts(self):
     # Accessing a tensor from a while context in a different while context, all
     # inside a cond context, is illegal.
@@ -3332,6 +3692,7 @@ class ControlFlowContextCheckTest(test.TestCase):
 
 class TupleTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testTensors(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3347,21 +3708,22 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting t1 initializes v2.
-          self.assertAllClose([3.0], t1.eval())
-          self.assertAllClose([10.0], v2.eval())
+          self.assertAllClose([3.0], self.evaluate(t1))
+          self.assertAllClose([10.0], self.evaluate(v2))
         else:
           # Getting t2 initializes v1.
-          self.assertAllClose([30.0], t2.eval())
-          self.assertAllClose([1.0], v1.eval())
+          self.assertAllClose([30.0], self.evaluate(t2))
+          self.assertAllClose([1.0], self.evaluate(v1))
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlices(self):
     for v1_first in [True, False]:
       with self.cached_session():
@@ -3385,22 +3747,22 @@ class TupleTest(test.TestCase):
 
         # v1 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v1.eval()
+          self.evaluate(v1)
 
         # v2 is not initialized.
         with self.assertRaisesOpError("Attempting to use uninitialized value"):
-          v2.eval()
+          self.evaluate(v2)
 
         if v1_first:
           # Getting g1 initializes v2.
-          self.assertAllClose([[10.0, 11.0]], g1.eval())
+          self.assertAllClose([[10.0, 11.0]], self.evaluate(g1))
           self.assertAllClose([[0.1, 1.1], [10.1, 11.1], [20.1, 21.1]],
-                              v2.eval())
+                              self.evaluate(v2))
         else:
           # Getting g2 initializes v1.
-          self.assertAllClose([[10.1, 11.1]], g2.eval())
+          self.assertAllClose([[10.1, 11.1]], self.evaluate(g2))
           self.assertAllClose([[0.0, 1.0], [10.0, 11.0], [20.0, 21.0]],
-                              v1.eval())
+                              self.evaluate(v1))
 
   def testAcceptTensorsAsControlInputs(self):
     with self.cached_session():
@@ -3410,13 +3772,14 @@ class TupleTest(test.TestCase):
           [constant_op.constant(0)], control_inputs=[assign])
 
       # Should trigger the assign.
-      t.eval()
+      self.evaluate(t)
 
-      self.assertEquals(1, var.eval())
+      self.assertEquals(1, self.evaluate(var))
 
 
 class AssertTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGuardedAssertDoesNotCopyWhenTrue(self):
     with self.session(use_gpu=True) as sess:
       with ops.device(test.gpu_device_name()):
@@ -3513,7 +3876,7 @@ class WhileOpBenchmark(test.Benchmark):
     with session.Session() as sess, ops.device(default_device):
       # Get the initial id i, input x, and kernel.
       i, x, kernel = self._getInitVariables()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       if static_unroll:
         for _ in xrange(steps):
@@ -3532,11 +3895,11 @@ class WhileOpBenchmark(test.Benchmark):
 
       for _ in xrange(3):
         # exclude warm up time
-        sess.run(r)
+        self.evaluate(r)
 
       start_time = time.time()
       for _ in xrange(num_iters):
-        sess.run(r)
+        self.evaluate(r)
       return (time.time() - start_time) / num_iters
 
   def benchmarkWhileOpCrossDevicePlacement(self):
@@ -3612,6 +3975,7 @@ class EagerTest(test.TestCase):
           isum(tensor, maximum_iterations=3).numpy(),
           [1 + 3, 2 + 3, 3 + 3, 4 + 3, 5 + 3])
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileWithMaximumIterationsAndSingleArgument(self):
     with context.eager_mode():
       tensor = constant_op.constant(0)
@@ -3634,6 +3998,7 @@ class EagerTest(test.TestCase):
       self.assertAllEqual(t1.numpy(), tup1.numpy())
       self.assertAllEqual(t2.numpy(), tup2.numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCase(self):
     with context.eager_mode():
       x = constant_op.constant(1)
diff --git a/tensorflow/python/kernel_tests/control_flow_util_test.py b/tensorflow/python/kernel_tests/control_flow_util_test.py
index 762c445da05008a78fec1ec9e1cc7186e1539134..573f4b0d250ba5ff75118ed5738c3de2a8711a2f 100644
--- a/tensorflow/python/kernel_tests/control_flow_util_test.py
+++ b/tensorflow/python/kernel_tests/control_flow_util_test.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import control_flow_util
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 
 class ControlFlowUtilTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIsSwitch(self):
     switch_false, _ = control_flow_ops.switch(1, True)
     switch = switch_false.op
@@ -44,6 +46,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsSwitch(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopEnter(self):
     enter = gen_control_flow_ops.enter(1, frame_name="name").op
     self.assertTrue(control_flow_util.IsLoopEnter(enter))
@@ -61,6 +64,7 @@ class ControlFlowUtilTest(test.TestCase):
 
     self.assertFalse(control_flow_util.IsLoopEnter(test_ops.int_output().op))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsLoopExit(self):
     exit_op = control_flow_ops.exit(1).op
     self.assertTrue(control_flow_util.IsLoopExit(exit_op))
diff --git a/tensorflow/python/kernel_tests/conv1d_test.py b/tensorflow/python/kernel_tests/conv1d_test.py
index 8540875d75e19b967aa4da9b4499b030df10dd7e..e8463323df90bd37d927f88bd41b09bef45de541 100644
--- a/tensorflow/python/kernel_tests/conv1d_test.py
+++ b/tensorflow/python/kernel_tests/conv1d_test.py
@@ -43,7 +43,7 @@ class Conv1DTest(test.TestCase):
         with self.cached_session(use_gpu=test.is_gpu_available()):
           c = nn_ops.conv1d(x, filters, stride, padding="VALID")
           reduced = array_ops.squeeze(c)
-          output = reduced.eval()
+          output = self.evaluate(reduced)
           if stride == 1:
             self.assertEqual(len(output), 3)
             self.assertAllClose(output,
@@ -69,7 +69,7 @@ class Conv1DTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv1d_transpose(
           x, f, y_shape, stride=stride, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
diff --git a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
index af6ffc1d195239b25c68326e87422b592fb631db..7b3b560b24005e4fdbac78245ac425865d98dd0b 100644
--- a/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_backprop_filter_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Conv2DBackpropFilterGradTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/conv2d_transpose_test.py b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
index 6f9992a317f44268c772aa6a3316120f0577eeb3..c603c08630661083a65c4c1f6f399925efa537a6 100644
--- a/tensorflow/python/kernel_tests/conv2d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv2d_transpose_test.py
@@ -53,7 +53,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells=kernel_height * kernel_width
@@ -91,7 +91,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[2]):
@@ -124,7 +124,7 @@ class Conv2DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv2d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -155,6 +155,7 @@ class Conv2DTransposeTest(test.TestCase):
 
     self.assertAllClose(cache_values, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 6, 4, 3]
     f_shape = [3, 3, 2, 3]
@@ -195,7 +196,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -230,7 +231,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="SAME", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         for n in xrange(x_shape[0]):
           for k in xrange(f_shape[2]):
             for w in xrange(y_shape[3]):
@@ -265,7 +266,7 @@ class Conv2DTransposeTest(test.TestCase):
         output = nn_ops.conv2d_transpose(
             x, f, y_shape, strides=strides, padding="VALID", data_format="NCHW")
 
-        value = output.eval()
+        value = self.evaluate(output)
         cache_values = np.zeros(y_shape, dtype=np.float32)
         # The amount of padding added
         pad = 1
@@ -293,7 +294,6 @@ class Conv2DTransposeTest(test.TestCase):
 
         self.assertAllClose(cache_values, value)
 
-  @test_util.enable_c_shapes
   def testConv2DTransposeShapeInference(self):
     # Test case for 8972
     initializer = random_ops.truncated_normal(
diff --git a/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py b/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
index 89b64068ace5803ed7d92cfb6425940b494159cc..7e913febed3dc8f4f698a0ede6ed8670e0b69a50 100644
--- a/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_backprop_filter_v2_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Conv3DBackpropFilterV2GradTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/conv3d_transpose_test.py b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
index 2527b837692b5e31126499db85224d2a8d3b5321..22ba5b90375c61ae7e1c426d88f0c19a546b2bbc 100644
--- a/tensorflow/python/kernel_tests/conv3d_transpose_test.py
+++ b/tensorflow/python/kernel_tests/conv3d_transpose_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -48,7 +49,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       # We count the number of cells being added at the locations in the output.
       # At the center, #cells = kernel_depth * kernel_height * kernel_width
@@ -98,7 +99,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="SAME")
-      value = output.eval()
+      value = self.evaluate(output)
 
       for n in xrange(x_shape[0]):
         for k in xrange(f_shape[3]):
@@ -119,6 +120,7 @@ class Conv3DTransposeTest(test.TestCase):
                   target = 3.0
                 self.assertAllClose(target, value[n, d, h, w, k])
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeShapeMismatch(self):
     # Test case for GitHub issue 18460
     x_shape = [2, 2, 3, 4, 3]
@@ -146,7 +148,7 @@ class Conv3DTransposeTest(test.TestCase):
         output = nn_ops.conv3d_transpose(
             x_value, f_value, constant_op.constant(y_shape, dtype=dtype),
             strides=strides, padding="SAME")
-        output.eval()
+        self.evaluate(output)
 
   def testConv3DTransposeValid(self):
     with self.cached_session():
@@ -165,7 +167,7 @@ class Conv3DTransposeTest(test.TestCase):
           1.0, shape=f_shape, name="filter", dtype=dtypes.float32)
       output = nn_ops.conv3d_transpose(
           x, f, y_shape, strides=strides, padding="VALID")
-      value = output.eval()
+      value = self.evaluate(output)
 
       cache_values = np.zeros(y_shape, dtype=np.float32)
 
@@ -201,6 +203,7 @@ class Conv3DTransposeTest(test.TestCase):
 
     self.assertAllClose(cache_values, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 3, 4, 3, 2]
     f_shape = [3, 3, 3, 2, 2]
diff --git a/tensorflow/python/kernel_tests/conv_ops_3d_test.py b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
index c4a9cdcf8e0b465cd5c048200b0372aaffd77c03..4a689b3fdfa5f43c8b6a4c67b7ebb31104d83db7 100644
--- a/tensorflow/python/kernel_tests/conv_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_3d_test.py
@@ -52,11 +52,11 @@ class Conv3DTest(test.TestCase):
   def _DtypesToTest(self, use_gpu):
     if use_gpu:
       if not test_util.CudaSupportsHalfMatMulAndConv():
-        return [dtypes.float32]
+        return [dtypes.float64, dtypes.float32]
       else:
         # It is important that float32 comes before float16 here,
         # as we will be using its gradients as reference for fp16 gradients.
-        return [dtypes.float32, dtypes.float16]
+        return [dtypes.float64, dtypes.float32, dtypes.float16]
     else:
       return [dtypes.float64, dtypes.float32, dtypes.float16]
 
@@ -109,7 +109,7 @@ class Conv3DTest(test.TestCase):
         results.append(result)
 
       with self.cached_session() as sess:
-        values = sess.run(results)
+        values = self.evaluate(results)
         for value in values:
           print("expected = ", expected)
           print("actual = ", value)
@@ -184,8 +184,8 @@ class Conv3DTest(test.TestCase):
         computed_results.append(computed)
         tolerance = 1e-2 if use_gpu else 1e-5
         with self.cached_session() as sess:
-          expected_values = sess.run(expected_results)
-          computed_values = sess.run(computed_results)
+          expected_values = self.evaluate(expected_results)
+          computed_values = self.evaluate(computed_results)
           for e_value, c_value in zip(expected_values, computed_values):
             print("expected = ", e_value)
             print("actual = ", c_value)
@@ -462,6 +462,7 @@ class Conv3DTest(test.TestCase):
       self._ConstructAndTestGradientForConfig(data_format=data_format,
                                               use_gpu=use_gpu, **kwargs)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -473,6 +474,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=4,
@@ -484,6 +486,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -495,6 +498,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -506,6 +510,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -517,6 +522,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -528,6 +534,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -539,6 +546,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideOne(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -550,6 +558,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -561,6 +570,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideTwo(self):
     self.ConstructAndTestGradient(
         batch=4,
@@ -572,6 +582,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -583,6 +594,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientSamePaddingStrideThree(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -594,6 +606,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientSamePaddingDifferentStrides(self):
     self.ConstructAndTestGradient(
         batch=1,
@@ -605,6 +618,7 @@ class Conv3DTest(test.TestCase):
         padding="SAME",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientKernelSizeMatchesInputSize(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -616,6 +630,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=False)
 
+  @test_util.run_deprecated_v1
   def testInputGradientKernelSizeMatchesInputSize(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -640,6 +655,7 @@ class Conv3DTest(test.TestCase):
 
   # Test the fast path in gemm_pack_rhs/mkldnn_gemm_pack, when channel
   # dimension is a multiple of packet size.
+  @test_util.run_deprecated_v1
   def testInputGradientValidPaddingStrideOneFastPath(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -651,6 +667,7 @@ class Conv3DTest(test.TestCase):
         padding="VALID",
         test_input=True)
 
+  @test_util.run_deprecated_v1
   def testFilterGradientValidPaddingStrideOneFastPath(self):
     self.ConstructAndTestGradient(
         batch=2,
@@ -715,8 +732,8 @@ class Conv3DTest(test.TestCase):
         expected_grad = gradients_impl.gradients(expected, t1
                                                  if mode == "input" else t2)[0]
         # "values" consists of two tensors for two backprops
-        actual_value = sess.run(actual_grad)
-        expected_value = sess.run(expected_grad)
+        actual_value = self.evaluate(actual_grad)
+        expected_value = self.evaluate(expected_grad)
         self.assertShapeEqual(actual_value, actual_grad)
         self.assertShapeEqual(expected_value, expected_grad)
       print("expected = ", expected_value)
diff --git a/tensorflow/python/kernel_tests/conv_ops_test.py b/tensorflow/python/kernel_tests/conv_ops_test.py
index 0ccbbf155c53196043c2c4597168c3b2ace72b20..2f6f3bb383b381de1dac78cc72882fe5fe4291c9 100644
--- a/tensorflow/python/kernel_tests/conv_ops_test.py
+++ b/tensorflow/python/kernel_tests/conv_ops_test.py
@@ -908,8 +908,8 @@ class Conv2DTest(test.TestCase):
         conv = gradients_impl.gradients(conv_forward, t1)[0]
         conv_2 = gradients_impl.gradients(conv_forward_2, t1)[0]
         # "values" consists of two tensors for two backprops
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -961,8 +961,8 @@ class Conv2DTest(test.TestCase):
           conv_forward_2 = test_util.NCHWToNHWC(conv_forward_2)
         conv = gradients_impl.gradients(conv_forward, t2)[0]
         conv_2 = gradients_impl.gradients(conv_forward, t2)[0]
-        value = sess.run(conv)
-        value_2 = sess.run(conv_2)
+        value = self.evaluate(conv)
+        value_2 = self.evaluate(conv_2)
         self.assertShapeEqual(value, conv)
         self.assertShapeEqual(value_2, conv_2)
       tf_logging.info("expected = ", value_2)
@@ -1545,7 +1545,7 @@ class DepthwiseConv2DTest(test.TestCase):
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -1667,9 +1667,9 @@ class SeparableConv2DTest(test.TestCase):
       if data_format == "NCHW":
         conv = array_ops.transpose(conv, [0, 2, 3, 1])
 
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = ", value)
-    self.assertArrayNear(expected, np.ravel(value), 1e-5)
+    self.assertArrayNear(expected, np.ravel(value), 1e-3)
     self.assertShapeEqual(value, conv)
 
   def _testSeparableConv2D(self, data_format):
@@ -1774,10 +1774,10 @@ class DeepConv2DTest(test.TestCase):
       conv = nn_ops.conv2d(t1, t2, strides=strides, padding=padding)
 
       os.environ["TF_USE_DEEP_CONV2D"] = "0"
-      values_expect = sess.run([conv])
+      values_expect = self.evaluate([conv])
 
       os.environ["TF_USE_DEEP_CONV2D"] = "1"
-      values_test = sess.run([conv])
+      values_test = self.evaluate([conv])
 
       self.assertAllClose(values_expect, values_test, rtol=1e-5, atol=1e-5)
 
diff --git a/tensorflow/python/kernel_tests/cross_grad_test.py b/tensorflow/python/kernel_tests/cross_grad_test.py
index 0bd4006d6ac1e922ed2935ad70d7aa60e87dedf3..b397133fd7328efa137910f4ea503849e23c6abe 100644
--- a/tensorflow/python/kernel_tests/cross_grad_test.py
+++ b/tensorflow/python/kernel_tests/cross_grad_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -26,6 +27,7 @@ from tensorflow.python.platform import test
 
 class CrossOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGradientRandomValues(self):
     with self.cached_session():
       us = [2, 3]
diff --git a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
index d818fbd75ce9069ba31f34ec0c343cbd924b6131..0d86d13c7159bf577c1cca882964fe62b0586e2a 100644
--- a/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
+++ b/tensorflow/python/kernel_tests/ctc_decoder_ops_test.py
@@ -25,6 +25,7 @@ from six.moves import zip_longest
 
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ctc_ops
 from tensorflow.python.platform import test
@@ -94,6 +95,7 @@ class CTCGreedyDecoderTest(test.TestCase):
         with self.assertRaisesOpError(expected_err_re):
           sess.run(decoded_unwrapped + [log_probability])
 
+  @test_util.run_deprecated_v1
   def testCTCGreedyDecoder(self):
     """Test two batch entries - best path decoder."""
     max_time_steps = 6
@@ -170,6 +172,7 @@ class CTCGreedyDecoderTest(test.TestCase):
     self._testCTCDecoder(ctc_ops.ctc_greedy_decoder, inputs, seq_lens,
                          log_prob_truth, decode_truth)
 
+  @test_util.run_deprecated_v1
   def testCTCDecoderBeamSearch(self):
     """Test one batch, two beams - hibernating beam search."""
     # max_time_steps == 8
diff --git a/tensorflow/python/kernel_tests/ctc_loss_op_test.py b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
index cfc7cb98aa678190ae81a2d9ee40ef4984453a91..e24f304c1b80787f43885055cad1de8cf43bb4db 100644
--- a/tensorflow/python/kernel_tests/ctc_loss_op_test.py
+++ b/tensorflow/python/kernel_tests/ctc_loss_op_test.py
@@ -23,9 +23,16 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ctc_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
@@ -52,6 +59,24 @@ def SimpleSparseTensorFrom(x):
   return sparse_tensor.SparseTensor(x_ix, x_val, x_shape)
 
 
+def _ctc_loss_v2(labels, inputs, sequence_length,
+                 preprocess_collapse_repeated=False,
+                 ctc_merge_repeated=True,
+                 ignore_longer_outputs_than_inputs=False,
+                 time_major=True):
+  """Call ctc_loss_v2 with v1 args."""
+  assert not preprocess_collapse_repeated
+  assert ctc_merge_repeated
+  assert not ignore_longer_outputs_than_inputs
+  return ctc_ops.ctc_loss_v2(
+      labels=labels,
+      logits=inputs,
+      logit_length=sequence_length,
+      label_length=None,
+      blank_index=-1,
+      logits_time_major=time_major)
+
+
 class CTCLossTest(test.TestCase):
 
   def _testCTCLoss(self,
@@ -66,7 +91,7 @@ class CTCLossTest(test.TestCase):
     inputs_t = constant_op.constant(inputs)
 
     with self.cached_session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       grad = gradients_impl.gradients(loss, [inputs_t])[0]
 
@@ -74,13 +99,14 @@ class CTCLossTest(test.TestCase):
       self.assertShapeEqual(grad_truth, grad)
 
       if expected_err_re is None:
-        (tf_loss, tf_grad) = sess.run([loss, grad])
+        (tf_loss, tf_grad) = self.evaluate([loss, grad])
         self.assertAllClose(tf_loss, loss_truth, atol=1e-6)
         self.assertAllClose(tf_grad, grad_truth, atol=1e-6)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          sess.run([loss, grad])
+          self.evaluate([loss, grad])
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     """Test two batch entries."""
     # Input and ground truth from Alex Graves' implementation.
@@ -216,6 +242,7 @@ class CTCLossTest(test.TestCase):
 
     self._testCTCLoss(inputs, seq_lens, labels, loss_truth, grad_truth)
 
+  @test_util.run_v1_only("b/120545219")
   def test_time_major(self):
     """Testing time_major param.
 
@@ -234,17 +261,18 @@ class CTCLossTest(test.TestCase):
     inputs_t_transposed = constant_op.constant(inputs.transpose(1, 0, 2))
 
     with self.session(use_gpu=False) as sess:
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
-      loss_transposed = ctc_ops.ctc_loss(
+      loss_transposed = _ctc_loss_v2(
           inputs=inputs_t_transposed,
           labels=labels,
           sequence_length=seq_lens,
           time_major=False)
 
-      (tf_loss, tf_loss_transposed) = sess.run([loss, loss_transposed])
+      (tf_loss, tf_loss_transposed) = self.evaluate([loss, loss_transposed])
       self.assertAllEqual(tf_loss, tf_loss_transposed)
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidSecondGradient(self):
     inputs = np.random.randn(2, 2, 3).astype(np.float32)
     inputs_t = constant_op.constant(inputs)
@@ -253,7 +281,7 @@ class CTCLossTest(test.TestCase):
     v = [1.0]
 
     with self.session(use_gpu=False):
-      loss = ctc_ops.ctc_loss(
+      loss = _ctc_loss_v2(
           inputs=inputs_t, labels=labels, sequence_length=seq_lens)
       # Taking ths second gradient should fail, since it is not
       # yet supported.
@@ -261,6 +289,7 @@ class CTCLossTest(test.TestCase):
                                    "explicitly disabled"):
         _ = gradients_impl._hessian_vector_product(loss, [inputs_t], v)
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyBatch(self):
     inputs = constant_op.constant([], dtype=dtypes.float32, shape=(1, 0, 2))
     sequence_lengths = constant_op.constant([], dtype=dtypes.int32)
@@ -272,7 +301,546 @@ class CTCLossTest(test.TestCase):
     with self.session(use_gpu=False) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                    "batch_size must not be 0"):
-        sess.run(ctc_ops.ctc_loss(labels, inputs, sequence_lengths))
+        sess.run(_ctc_loss_v2(labels, inputs, sequence_lengths))
+
+
+class CTCLossTestV2(test.TestCase):
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossV2(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    max_label_length = 5
+    num_frames = 12
+
+    labels = random_ops.random_uniform(
+        [batch_size, max_label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+
+    label_length = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=max_label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_length, maxlen=max_label_length, dtype=label_length.dtype)
+    labels *= label_mask
+    logit_length = [num_frames] * batch_size
+
+    ref_loss = ctc_ops.ctc_loss_v2(
+        labels=labels,
+        logits=logits,
+        label_length=label_length,
+        logit_length=logit_length)
+    ref_grad = gradients_impl.gradients(ref_loss, [logits])
+
+    sparse_labels = ctc_ops.dense_labels_to_sparse(labels, label_length)
+
+    def assert_same_loss_and_grads(loss):
+      with self.cached_session() as sess:
+        self.assertAllClose(*self.evaluate([loss, ref_loss]))
+        grad = gradients_impl.gradients(loss, [logits])
+        self.assertAllClose(
+            *self.evaluate([grad, ref_grad]), rtol=2e-06, atol=2e-06)
+
+    assert_same_loss_and_grads(
+        ctc_ops.ctc_loss_v2(
+            labels=sparse_labels,
+            logits=logits,
+            label_length=label_length,
+            logit_length=logit_length,
+            blank_index=0))
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=1, maxval=num_labels,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      # Shift labels down by one (move blank from 0 to num_labels -1)
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+      tf_nn_ctc_logits = array_ops.concat([
+          logits[:, :, 1:],
+          logits[:, :, 0:1],
+      ], axis=2)
+
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=tf_nn_ctc_logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(
+              *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+              rtol=2e-06,
+              atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseUniqueFastPathIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=1, maxval=num_labels,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=labels,
+        logits=logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        unique=ctc_ops.ctc_unique_labels(labels))
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    # Shift labels down by one (move blank from 0 to num_labels -1)
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32) - 1
+    tf_nn_ctc_logits = array_ops.concat([
+        logits[:, :, 1:],
+        logits[:, :, 0:1],
+    ], axis=2)
+
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=tf_nn_ctc_logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(
+            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+            rtol=2e-06,
+            atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseWithBlankIndexIsSameAsCtcLoss(self):
+    random_seed.set_random_seed(5)
+
+    batch_size = 8
+    num_labels = 6
+    label_length = 5
+    num_frames = 12
+    logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+    labels = random_ops.random_uniform(
+        [batch_size, label_length], minval=0, maxval=num_labels-1,
+        dtype=dtypes.int64)
+
+    label_lengths = random_ops.random_uniform(
+        [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+    label_mask = array_ops.sequence_mask(
+        label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+    labels *= label_mask
+
+    logit_lengths = [num_frames] * batch_size
+
+    tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+    tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+        tf_ctc_loss_labels, label_lengths)
+
+    tf_nn_ctc_loss = ctc_ops.ctc_loss(
+        labels=tf_ctc_loss_labels,
+        inputs=logits,
+        sequence_length=logit_lengths,
+        time_major=True)
+    tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+    # Shift the blank logits/labels to be somewhere in the middle.
+    blank_index = 2
+    shifted_logits = array_ops.concat([
+        logits[:, :, :blank_index],
+        logits[:, :, -1:],
+        logits[:, :, blank_index:-1],
+    ], axis=2)
+    shifted_labels = array_ops.where(labels < blank_index, labels, labels + 1)
+
+    ctc_loss = ctc_ops.ctc_loss_dense(
+        labels=shifted_labels,
+        logits=shifted_logits,
+        label_length=label_lengths,
+        logit_length=logit_lengths,
+        blank_index=blank_index)
+    ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+    with self.cached_session() as sess:
+      for _ in range(32):
+        self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+        self.assertAllClose(
+            *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+            rtol=2e-06,
+            atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCtcLossDenseWithNegativeBlankIndexIsSameAsCtcLoss(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      random_seed.set_random_seed(5)
+
+      batch_size = 8
+      num_labels = 6
+      label_length = 5
+      num_frames = 12
+      logits = random_ops.random_uniform([num_frames, batch_size, num_labels])
+      labels = random_ops.random_uniform(
+          [batch_size, label_length], minval=0, maxval=num_labels-1,
+          dtype=dtypes.int64)
+
+      label_lengths = random_ops.random_uniform(
+          [batch_size], minval=2, maxval=label_length, dtype=dtypes.int64)
+      label_mask = array_ops.sequence_mask(
+          label_lengths, maxlen=label_length, dtype=label_lengths.dtype)
+      labels *= label_mask
+
+      logit_lengths = [num_frames] * batch_size
+
+      ctc_loss = ctc_ops.ctc_loss_dense(
+          labels=labels,
+          logits=logits,
+          label_length=label_lengths,
+          logit_length=logit_lengths,
+          blank_index=-1)
+      ctc_loss_grads = gradients_impl.gradients(ctc_loss, [logits])[0]
+
+      tf_ctc_loss_labels = math_ops.cast(labels, dtypes.int32)
+      tf_ctc_loss_labels = ctc_ops.dense_labels_to_sparse(
+          tf_ctc_loss_labels, label_lengths)
+
+      tf_nn_ctc_loss = ctc_ops.ctc_loss(
+          labels=tf_ctc_loss_labels,
+          inputs=logits,
+          sequence_length=logit_lengths,
+          time_major=True)
+      tf_nn_ctc_grads = gradients_impl.gradients(tf_nn_ctc_loss, [logits])[0]
+
+      with self.cached_session() as sess:
+        for _ in range(32):
+          self.assertAllClose(*self.evaluate([ctc_loss, tf_nn_ctc_loss]))
+          self.assertAllClose(
+              *self.evaluate([ctc_loss_grads, tf_nn_ctc_grads]),
+              rtol=2e-06,
+              atol=2e-06)
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeated(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0],
+                [1, 4, 4, 4, 0],
+                [4, 2, 2, 9, 4]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedPreservesDtypes(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=constant_op.constant(
+            [[1, 3, 3, 3, 0],
+             [1, 4, 4, 4, 0],
+             [4, 2, 2, 9, 4]],
+            dtype=dtypes.int64),
+        seq_length=constant_op.constant([4, 5, 5], dtype=dtypes.int64))
+    self.assertEqual(new_seq_lengths.dtype, dtypes.int64)
+    self.assertEqual(collapsed.dtype, dtypes.int64)
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedExtraPadding(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 3, 3, 3, 0, 0, 0],
+                [1, 4, 4, 4, 0, 1, 2],
+                [4, 2, 2, 9, 4, 0, 0]],
+        seq_length=[4, 5, 5])
+    self.assertAllEqual(new_seq_lengths, [2, 3, 4])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 3, 0, 0],
+         [1, 4, 0, 0],
+         [4, 2, 9, 4]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedFrontRepeats(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2],
+                [1, 1, 1, 2, 2]],
+        seq_length=[5, 4, 3])
+    self.assertAllEqual(new_seq_lengths, [2, 2, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1, 2],
+         [1, 2],
+         [1, 0]])
+
+  @test_util.run_v1_only("b/120545219")
+  def testCollapseRepeatedAllLabelsTheSame(self):
+    collapsed, new_seq_lengths = ctc_ops.collapse_repeated(
+        labels=[[1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1],
+                [1, 1, 1, 1, 1]],
+        seq_length=[4, 5, 1])
+    self.assertAllEqual(new_seq_lengths, [1, 1, 1])
+    self.assertAllEqual(
+        collapsed,
+        [[1],
+         [1],
+         [1]])
+
+  def testDenseSequencesToSparse(self):
+    labels = [[1, 3, 3, 3, 0],
+              [1, 4, 4, 4, 0],
+              [4, 2, 2, 9, 4]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(labels, length)
+    new_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(labels, new_dense)
+
+    padded_labels = [[1, 3, 3, 3, 0, 0, 0, 0],
+                     [1, 4, 4, 4, 0, 0, 0, 0],
+                     [4, 2, 2, 9, 4, 0, 0, 0]]
+    length = [4, 5, 5]
+    sparse = ctc_ops.dense_labels_to_sparse(padded_labels, length)
+    padded_dense = sparse_ops.sparse_tensor_to_dense(sparse)
+
+    self.assertAllEqual(padded_dense, new_dense)
+
+  @test_util.run_v1_only("b/120545219")
+  def testUnique(self):
+    labels = [
+        [3, 4, 4, 3],
+        [1, 1, 1, 0],
+    ]
+    unique, idx = ctc_ops.ctc_unique_labels(labels)
+    self.assertAllEqual([
+        [3, 4, 0, 0],
+        [1, 0, 0, 0],
+    ], unique)
+    self.assertAllEqual([
+        [0, 1, 1, 0],
+        [0, 0, 0, 1],
+    ], idx)
+
+  @test_util.run_v1_only("b/120545219")
+  def testSumStates(self):
+    idx = [
+        [0, 1, 0, 1],
+        [0, 0, 0, 1],
+    ]
+    states = math_ops.log([
+        [[1.0, 2.0, 3.0, 4.0],
+         [5.0, 6.0, 7.0, 8.0]],
+        [[0.1, 0.2, 0.3, 0.4],
+         [0.5, 0.6, 0.7, 0.8]],
+    ])
+    sum_of_states = math_ops.exp(ctc_ops._sum_states(idx, states))
+    self.assertAllClose([
+        [[4.0, 6.0, 0.0, 0.0],
+         [18.0, 8.0, 0.0, 0.0]],
+        [[0.4, 0.6, 0.0, 0.0],
+         [1.8, 0.8, 0.0, 0.0]]
+    ], sum_of_states)
+
+  @test_util.run_v1_only("b/120545219")
+  def testStateToOlabel(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel(labels, num_labels, states)
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]
+    ])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  @test_util.run_v1_only("b/120545219")
+  def testStateToOlabelUnique(self):
+    labels = [
+        [3, 4, 3, 4],
+        [1, 1, 1, 0],
+    ]
+    num_labels = 8
+
+    # 3 frames, 2 batch, 10 states (5 label, 5 blank).
+    states = [
+        [[0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20],
+         [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30]],
+        [[1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0],
+         [2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0]],
+        [[11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0],
+         [21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0]],
+    ]
+    labels = ops.convert_to_tensor(labels)
+    states = math_ops.log(states)
+    olabel = ctc_ops._state_to_olabel_unique(
+        labels, num_labels, states, ctc_ops.ctc_unique_labels(labels))
+    olabel = math_ops.exp(olabel)
+    blank = olabel[:, :, 0]
+    self.assertAllClose(blank, [
+        [0.16 + 0.17 + 0.18 + 0.19 + 0.20,
+         0.26 + 0.27 + 0.28 + 0.29 + 0.30],
+        [1.6 + 1.7 + 1.8 + 1.9 + 2.0,
+         2.6 + 2.7 + 2.8 + 2.9 + 3.0],
+        [16.0 + 17.0 + 18.0 + 19.0 + 20.0,
+         26.0 + 27.0 + 28.0 + 29.0 + 30.0]])
+    self.assertAllClose(olabel[:, :, 1:], [
+        [[0.0, 0.0, 0.12 + 0.14, 0.13 + 0.15, 0.0, 0.0, 0.0],
+         [0.22 + 0.23 + 0.24, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 1.2 + 1.4, 1.3 + 1.5, 0.0, 0.0, 0.0],
+         [2.2 + 2.3 + 2.4, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+        [[0.0, 0.0, 12.0 + 14.0, 13.0 + 15.0, 0.0, 0.0, 0.0],
+         [22.0 + 23.0 + 24.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]],
+    ])
+
+  @test_util.run_deprecated_v1
+  def testScan(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      out = ctc_ops._scan(
+          lambda accum, elem: accum + elem,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0)
+      self.assertAllEqual([24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          inclusive=True)
+      self.assertAllEqual([23.0, 24.0, 26.0, 29.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True)
+      self.assertAllEqual([29.0, 28.0, 26.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([1.0, 2.0, 3.0]), 23.0,
+          reverse=True,
+          inclusive=True)
+      self.assertAllEqual([29.0, 28.0, 26.0, 23.0], out)
+
+      out = ctc_ops._scan(
+          lambda a, e: a + e,
+          constant_op.constant([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]]),
+          constant_op.constant([23.0, 24.0]))
+      self.assertAllEqual([[23.0, 25.0], [25.0, 28.0], [29.0, 33.0]], out)
+
+  @test_util.run_deprecated_v1
+  def testScanCapturesVariables(self):
+    with self.cached_session() as sess:
+      x = random_ops.random_uniform([])
+      fn = lambda accum, elem: accum + x * elem
+      out = ctc_ops._scan(fn, constant_op.constant([0.0, 1.0, 2.0]), 23.0)
+      self.assertAllEqual(*sess.run([
+          [23.0 + x * 0.0, 23.0 + x * 1.0, 23.0 + x * 3.0], out
+      ]))
+
+  @test_util.run_deprecated_v1
+  def testScanMultipleAccumulators(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        accum_a, accum_b = accum
+        return accum_a + elem, accum_b * elem
+      out = ctc_ops._scan(
+          fn, constant_op.constant([1.0, 2.0, 3.0]),
+          (23.0, constant_op.constant([1.0, 2.0])))
+      a, b = out
+      self.assertAllEqual([24.0, 26.0, 29.0], a)
+      self.assertAllEqual([[1.0, 2.0], [2.0, 4.0], [6.0, 12.0]], b)
+
+  @test_util.run_deprecated_v1
+  def testScanMultipleElements(self):
+    with ops.device("/GPU:0" if test.is_gpu_available() else "/CPU:0"):
+      def fn(accum, elem):
+        elem_a, elem_b = elem
+        return accum + (elem_a * elem_b)
+      elems_a = constant_op.constant([1.0, 2.0, 3.0])
+      elems_b = constant_op.constant([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0]])
+      out = ctc_ops._scan(
+          fn, (elems_a, elems_b),
+          initial=constant_op.constant([0.0, 0.0]))
+      self.assertAllEqual(
+          [[1.0, 2.0], [5.0, 8.0], [14.0, 20.0]], out)
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
index 8028f93a8c561c4e5d416240469c5da1724dd1ab..49dbbb125a162bd5e1abaa4e8e2dc0907ca920ae 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_binary_test.py
@@ -77,23 +77,23 @@ class BinaryOpTest(test.TestCase):
 
   def _compareCpu(self, x, y, np_func, tf_func, also_compare_variables=False):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_cpu = out.eval()
+      tf_cpu = self.evaluate(out)
       # Test that the op takes precedence over numpy operators.
-      np_left = tf_func(x, iny).eval()
-      np_right = tf_func(inx, y).eval()
+      np_left = self.evaluate(tf_func(x, iny))
+      np_right = self.evaluate(tf_func(inx, y))
 
       if also_compare_variables:
         var_x = variables.Variable(x)
         var_y = variables.Variable(y)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         print(type(x), type(y), type(var_x), type(var_y))
         print(type(tf_func(x, var_y)), type(tf_func(var_x, y)))
-        np_var_left = tf_func(x, var_y).eval()
-        np_var_right = tf_func(var_x, y).eval()
+        np_var_left = self.evaluate(tf_func(x, var_y))
+        np_var_right = self.evaluate(tf_func(var_x, y))
 
     if np_ans.dtype != np.object:
       self.assertAllClose(np_ans, tf_cpu)
@@ -174,11 +174,11 @@ class BinaryOpTest(test.TestCase):
 
   def _compareGpu(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_gpu = out.eval()
+      tf_gpu = self.evaluate(out)
     self.assertAllClose(np_ans, tf_gpu)
     self.assertShapeEqual(np_ans, out)
     # TODO(zhifengc/ke): make gradient checker work on GPU.
@@ -196,6 +196,7 @@ class BinaryOpTest(test.TestCase):
         self._compareGradientY(x, y, np_func, tf_func)
       self._compareGpu(x, y, np_func, tf_func)
 
+  @test_util.run_deprecated_v1
   def testFloatBasic(self):
     x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float32)
     y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float32)
@@ -233,6 +234,7 @@ class BinaryOpTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test special functions: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testFloatDifferentShapes(self):
     x = np.array([1, 2, 3, 4]).reshape(2, 2).astype(np.float32)
     y = np.array([1, 2]).reshape(2, 1).astype(np.float32)
@@ -252,14 +254,17 @@ class BinaryOpTest(test.TestCase):
     y = np.array([1, 2]).reshape(2, 1).astype(np.int32)
     var_x = variables.Variable(x)
     var_y = variables.Variable(y)
+
     with self.cached_session() as sess:
-      sess.run([var_x.initializer, var_y.initializer])
-      left_result = (var_x * y).eval()
-      right_result = (x * var_y).eval()
+      self.evaluate([var_x.initializer, var_y.initializer])
+      left_result = self.evaluate(var_x * y)
+      right_result = self.evaluate(x * var_y)
+
     np_result = x * y
     self.assertAllEqual(np_result, left_result)
     self.assertAllEqual(np_result, right_result)
 
+  @test_util.run_deprecated_v1
   def testDoubleBasic(self):
     x = np.linspace(-5, 20, 15).reshape(1, 3, 5).astype(np.float64)
     y = np.linspace(20, -5, 15).reshape(1, 3, 5).astype(np.float64)
@@ -351,6 +356,7 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.floor_divide, _FLOORDIV)
     self._compareBoth(x, y, np.mod, _MOD)
 
+  @test_util.run_deprecated_v1
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
         np.complex64)
@@ -365,6 +371,7 @@ class BinaryOpTest(test.TestCase):
     self._compareBoth(x, y, np.multiply, _MUL)
     self._compareBoth(x, y + 0.1, np.true_divide, _TRUEDIV)
 
+  @test_util.run_deprecated_v1
   def testComplex128Basic(self):
     x = np.complex(1, 1) * np.linspace(-10, 10, 6).reshape(1, 3, 2).astype(
         np.complex128)
@@ -382,10 +389,10 @@ class BinaryOpTest(test.TestCase):
   def testStringComparison(self):
     x = np.array([["abc", "bh"], ["c", ""]])
     y = np.array([["abc", "bh"], ["def", "hi"]])
-    with self.test_session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       cmp_eq = math_ops.equal(x, y)
       cmp_not_eq = math_ops.not_equal(x, y)
-      values = sess.run([cmp_eq, cmp_not_eq])
+      values = self.evaluate([cmp_eq, cmp_not_eq])
       self.assertAllEqual([[True, True], [False, False]], values[0])
       self.assertAllEqual([[False, False], [True, True]], values[1])
 
@@ -478,198 +485,263 @@ class BinaryOpTest(test.TestCase):
     ]
     self._testBCastByFunc(funcs, xs, ys)
 
+  @test_util.run_deprecated_v1
   def testBCast_0A(self):
     self._testBCastA([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0B(self):
     self._testBCastB([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0C(self):
     self._testBCastC([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_0D(self):
     self._testBCastD([1, 3, 2], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_1A(self):
     self._testBCastA([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1B(self):
     self._testBCastB([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1C(self):
     self._testBCastC([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_1D(self):
     self._testBCastD([1, 3, 2], [2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2A(self):
     self._testBCastA([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2B(self):
     self._testBCastB([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2C(self):
     self._testBCastC([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_2D(self):
     self._testBCastD([1, 3, 2], [3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_3A(self):
     self._testBCastA([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3B(self):
     self._testBCastB([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3C(self):
     self._testBCastC([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_3D(self):
     self._testBCastD([1, 3, 2], [3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_4A(self):
     self._testBCastA([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4B(self):
     self._testBCastB([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4C(self):
     self._testBCastC([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_4D(self):
     self._testBCastD([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_5A(self):
     self._testBCastA([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5B(self):
     self._testBCastB([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5C(self):
     self._testBCastC([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_5D(self):
     self._testBCastD([1, 3, 2], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6A(self):
     self._testBCastA([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6B(self):
     self._testBCastB([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6C(self):
     self._testBCastC([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_6D(self):
     self._testBCastD([1, 3, 2], [2, 1, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7A(self):
     self._testBCastA([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7B(self):
     self._testBCastB([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7C(self):
     self._testBCastC([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_7D(self):
     self._testBCastD([1, 3, 2], [1, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8A(self):
     self._testBCastA([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8B(self):
     self._testBCastB([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8C(self):
     self._testBCastC([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_8D(self):
     self._testBCastD([2, 1, 5], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9A(self):
     self._testBCastA([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9B(self):
     self._testBCastB([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9C(self):
     self._testBCastC([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_9D(self):
     self._testBCastD([2, 0, 5], [2, 0, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10A(self):
     self._testBCastA([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10B(self):
     self._testBCastB([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10C(self):
     self._testBCastC([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_10D(self):
     self._testBCastD([2, 3, 0], [2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testBCast_11A(self):
     self._testBCastA([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11B(self):
     self._testBCastB([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11C(self):
     self._testBCastC([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_11D(self):
     self._testBCastD([1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12A(self):
     self._testBCastA([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12B(self):
     self._testBCastB([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12C(self):
     self._testBCastC([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_12D(self):
     self._testBCastD([1, 1, 1, 1, 3, 2], [1, 3, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_13A(self):
     self._testBCastA([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13B(self):
     self._testBCastB([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13C(self):
     self._testBCastC([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_13D(self):
     self._testBCastD([1, 3, 2, 1, 1], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14A(self):
     self._testBCastA([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14B(self):
     self._testBCastB([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14C(self):
     self._testBCastC([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_14D(self):
     self._testBCastD([2, 3, 1, 1, 5], [1])
 
+  @test_util.run_deprecated_v1
   def testBCast_15A(self):
     self._testBCastA([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15B(self):
     self._testBCastB([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15C(self):
     self._testBCastC([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testBCast_15D(self):
     self._testBCastD([10, 3, 1, 2], [3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testMismatchedDimensions(self):
     for func in [
         math_ops.add, math_ops.subtract, math_ops.multiply, math_ops.div, _ADD,
@@ -681,6 +753,7 @@ class BinaryOpTest(test.TestCase):
             ops.convert_to_tensor([10.0, 20.0, 30.0]),
             ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
+  @test_util.run_deprecated_v1
   def testZeroPowGrad(self):
     with self.cached_session():
       for dtype in (np.float16, np.float32, np.float64, np.complex64,
@@ -691,6 +764,7 @@ class BinaryOpTest(test.TestCase):
         error = gradient_checker.compute_gradient_error(y, [], z, [])
         self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testComplexPowGrad(self):
     with self.cached_session():
       for dtype in np.complex64, np.complex128:
@@ -716,39 +790,39 @@ class BinaryOpTest(test.TestCase):
 
   def testPowNegativeExponent(self):
     for dtype in [np.int32, np.int64]:
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = np.array([-2, 3]).astype(dtype)
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = np.array([2, -3]).astype(dtype)
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
-      with self.test_session(use_gpu=False) as sess:
+      with test_util.force_cpu():
         with self.assertRaisesRegexp(
             errors_impl.InvalidArgumentError,
             "Integers to negative integer powers are not allowed"):
           x = np.array([5, 2]).astype(dtype)
           y = -3
-          sess.run(math_ops.pow(x, y))
+          self.evaluate(math_ops.pow(x, y))
 
 
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -777,9 +851,9 @@ class ComparisonOpTest(test.TestCase):
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
@@ -859,6 +933,7 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(
         np.not_equal, math_ops.not_equal, include_complex=True)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
     funcs = [
diff --git a/tensorflow/python/kernel_tests/cwise_ops_test.py b/tensorflow/python/kernel_tests/cwise_ops_test.py
index c5311ad834a700bf3341b5c25fb8a22f837eae62..9bb7d8b8b12baafe15fe9150e58c4e03749e7261 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_test.py
@@ -84,11 +84,11 @@ def _default_tolerance(dtype):
 class ComparisonOpTest(test.TestCase):
 
   def _compareScalar(self, func, x, y, dtype):
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = func(
           ops.convert_to_tensor(np.array([x]).astype(dtype)),
           ops.convert_to_tensor(np.array([y]).astype(dtype)))
-      ret = out.eval()
+      ret = self.evaluate(out)
     return ret[0]
 
   def testScalarCompareScalar(self):
@@ -117,9 +117,9 @@ class ComparisonOpTest(test.TestCase):
 
   def _compare(self, x, y, np_func, tf_func):
     np_ans = np_func(x, y)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       out = tf_func(ops.convert_to_tensor(x), ops.convert_to_tensor(y))
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
 
   def testTensorCompareTensor(self):
@@ -199,6 +199,7 @@ class ComparisonOpTest(test.TestCase):
     self._testBCastByFunc(
         np.not_equal, math_ops.not_equal, include_complex=True)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     dtypes = [np.float16, np.float32, np.float64, np.int32, np.int64]
     funcs = [
@@ -218,22 +219,20 @@ class LogicalOpTest(test.TestCase):
 
   def _compareBinary(self, x, y, np_func, tf_func, use_gpu=False):
     np_ans = np_func(x, y)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       out = tf_func(inx, iny)
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
 
   def _not(self, x, use_gpu=False):
     np_ans = np.logical_not(x)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = math_ops.logical_not(ops.convert_to_tensor(x))
-      tf_val = out.eval()
+      tf_val = self.evaluate(out)
     self.assertEqual(out.dtype, dtypes_lib.bool)
     self.assertAllEqual(np_ans, tf_val)
     self.assertShapeEqual(np_ans, out)
@@ -282,6 +281,7 @@ class LogicalOpTest(test.TestCase):
         self._compareBinary(x, y, np.logical_or, math_ops.logical_or, use_gpu)
         self._compareBinary(x, y, np.logical_xor, math_ops.logical_xor, use_gpu)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     x = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     y = np.random.randint(0, 2, 6).astype(np.bool).reshape(3, 2, 1)
@@ -290,6 +290,7 @@ class LogicalOpTest(test.TestCase):
           ValueError, lambda e: "Dimensions must" in str(e)):
         f(x, y)
 
+  @test_util.run_deprecated_v1
   def testUsingAsPythonValueFails(self):
     # Ensure that we raise an error when the user attempts to treat a
     # `Tensor` as a Python `bool`.
@@ -316,10 +317,9 @@ class SelectOpTest(test.TestCase):
 
   def _compare(self, c, x, y, use_gpu):
     np_ans = np.where(c, x, y)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -399,6 +399,7 @@ class SelectOpTest(test.TestCase):
       if t in [np.float16, np.float32, np.float64]:
         self._compare(c, xt, yt, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     x = np.random.rand(1, 3, 2) * 100
@@ -418,6 +419,7 @@ class SelectOpTest(test.TestCase):
         self._compareGradientX(c, xt, yt)
         self._compareGradientY(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     c = np.random.randint(0, 2, 6).astype(np.bool).reshape(1, 3, 2)
     x = np.random.rand(1, 3, 2) * 100
@@ -431,6 +433,7 @@ class SelectOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         array_ops.where(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testEmptyTensor(self):
     c = np.random.randint(0, 3, 0).astype(np.bool).reshape(1, 3, 0)
     x = np.random.rand(1, 3, 0) * 100
@@ -442,6 +445,7 @@ class SelectOpTest(test.TestCase):
       z = array_ops.where(c, xt, yt).eval()
       self.assertAllEqual(z_expected, z)
 
+  @test_util.run_deprecated_v1
   def testNan(self):
     """Verify that nans don't propagate where they shouldn't."""
     with self.cached_session():
@@ -460,10 +464,9 @@ class BatchSelectOpTest(test.TestCase):
     np_ans = np.dstack(
         [x_i if c_i else y_i for c_i, x_i, y_i in zip(c, x, y)]).transpose(
             [2, 0, 1])
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       out = array_ops.where(c, x, y)
-      tf_ans = out.eval()
+      tf_ans = self.evaluate(out)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, out)
 
@@ -529,6 +532,7 @@ class BatchSelectOpTest(test.TestCase):
       if t in [np.float16, np.float32, np.float64]:
         self._compare(c, xt, yt, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     c = np.random.randint(0, 2, 16).astype(np.bool)
     x = np.random.rand(16, 2, 8) * 100
@@ -548,6 +552,7 @@ class BatchSelectOpTest(test.TestCase):
         self._compareGradientX(c, xt, yt)
         self._compareGradientY(c, xt, yt)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     c = np.random.randint(0, 2, 8).astype(np.bool)
     x = np.random.rand(16, 3, 2) * 100
@@ -566,13 +571,11 @@ class MinMaxOpTest(test.TestCase):
 
   def _compare(self, x, y, use_gpu):
     np_min, np_max = np.minimum(x, y), np.maximum(x, y)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       iny = ops.convert_to_tensor(y)
       omin, omax = math_ops.minimum(inx, iny), math_ops.maximum(inx, iny)
-      tf_min, tf_max = sess.run([omin, omax])
+      tf_min, tf_max = self.evaluate([omin, omax])
     self.assertAllEqual(np_min, tf_min)
     self.assertAllEqual(np_max, tf_max)
 
@@ -628,6 +631,7 @@ class MinMaxOpTest(test.TestCase):
     elif x.dtype == np.float64:
       self.assertAllClose(jacob_t, jacob_n, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     x = np.random.rand(1, 3, 2) * 100.
     # ensure x != y
@@ -641,16 +645,16 @@ class MinMaxOpTest(test.TestCase):
 class MathOpsOverloadTest(test.TestCase):
 
   def _computeTensorAndLiteral(self, x, y, dtype, func):
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       inx = ops.convert_to_tensor(x, dtype=dtype)
       z = func(inx, y)  # Should use __add__, __sub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _computeLiteralAndTensor(self, x, y, dtype, func):
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       iny = ops.convert_to_tensor(y, dtype=dtype)
       z = func(x, iny)  # Should use __radd__, __rsub__, etc.
-      return z.eval()
+      return self.evaluate(z)
 
   def _compareBinary(self, x, y, dtype, np_func, tf_func):
     np_ans = np_func(x, y).astype(dtype.as_numpy_dtype)
@@ -661,9 +665,9 @@ class MathOpsOverloadTest(test.TestCase):
 
   def _compareUnary(self, x, dtype, np_func, tf_func):
     np_ans = np_func(x).astype(dtype.as_numpy_dtype)
-    with self.test_session(use_gpu=False):
-      self.assertAllClose(np_ans,
-                          tf_func(ops.convert_to_tensor(x, dtype=dtype)).eval())
+    with test_util.force_cpu():
+      self.assertAllClose(
+          np_ans, self.evaluate(tf_func(ops.convert_to_tensor(x, dtype=dtype))))
 
   def testOverload(self):
     dtypes = [
@@ -730,13 +734,11 @@ class IsFiniteInfNanTest(test.TestCase):
 
   def _compare(self, x, use_gpu):
     np_finite, np_inf, np_nan = np.isfinite(x), np.isinf(x), np.isnan(x)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(x)
       ofinite, oinf, onan = math_ops.is_finite(inx), math_ops.is_inf(
           inx), math_ops.is_nan(inx)
-      tf_finite, tf_inf, tf_nan = sess.run([ofinite, oinf, onan])
+      tf_finite, tf_inf, tf_nan = self.evaluate([ofinite, oinf, onan])
     self.assertAllEqual(np_inf, tf_inf)
     self.assertAllEqual(np_nan, tf_nan)
     self.assertAllEqual(np_finite, tf_finite)
@@ -773,31 +775,33 @@ class IsFiniteInfNanTest(test.TestCase):
           x = np.full((size,), value, dtype=dtype)
           np_y = np.sqrt(x)
           np_nan = np.isnan(np_y)
-          with self.test_session(force_gpu=test_util.is_gpu_available()):
+          with test_util.use_gpu():
             tf_y = math_ops.sqrt(x)
             tf_nan = math_ops.is_nan(tf_y)
             if value < 0:
-              self.assertAllEqual(np_nan, tf_nan.eval())
+              self.assertAllEqual(np_nan, self.evaluate(tf_nan))
             else:
-              self.assertAllCloseAccordingToType(np_y, tf_y.eval())
+              self.assertAllCloseAccordingToType(np_y, self.evaluate(tf_y))
 
 
 class RoundingTest(test.TestCase):
 
   def _compare_values(self, x, y=None):
     y = np.rint(x) if y is None else np.asarray(y)
-    with self.cached_session() as sess:
-      tf_rint = math_ops.rint(x)
-      np_rint = sess.run(tf_rint)
+
+    tf_rint = math_ops.rint(x)
+    np_rint = self.evaluate(tf_rint)
+
     self.assertAllEqual(y, np_rint)
     self.assertShapeEqual(y, tf_rint)
 
   def _compare(self, x):
     np_floor, np_ceil = np.floor(x), np.ceil(x)
-    with self.cached_session() as sess:
-      inx = ops.convert_to_tensor(x)
-      ofloor, oceil = math_ops.floor(inx), math_ops.ceil(inx)
-      tf_floor, tf_ceil = sess.run([ofloor, oceil])
+
+    inx = ops.convert_to_tensor(x)
+    ofloor, oceil = math_ops.floor(inx), math_ops.ceil(inx)
+    tf_floor, tf_ceil = self.evaluate([ofloor, oceil])
+
     self.assertAllEqual(np_floor, tf_floor)
     self.assertAllEqual(np_ceil, tf_ceil)
     self.assertShapeEqual(np_floor, ofloor)
@@ -828,12 +832,13 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareMake(self, real, imag, use_gpu):
     np_ans = real + (1j) * imag
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+
+    with test_util.device(use_gpu=use_gpu):
       real = ops.convert_to_tensor(real)
       imag = ops.convert_to_tensor(imag)
       tf_ans = math_ops.complex(real, imag)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
+
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -848,17 +853,17 @@ class ComplexMakeRealImagTest(test.TestCase):
   def _compareRealImag(self, cplx, use_gpu):
     np_real, np_imag = np.real(cplx), np.imag(cplx)
     np_zeros = np_real * 0
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_real = math_ops.real(inx)
       tf_imag = math_ops.imag(inx)
       tf_real_real = math_ops.real(tf_real)
       tf_imag_real = math_ops.imag(tf_real)
-      self.assertAllEqual(np_real, tf_real.eval())
-      self.assertAllEqual(np_imag, tf_imag.eval())
-      self.assertAllEqual(np_real, tf_real_real.eval())
-      self.assertAllEqual(np_zeros, tf_imag_real.eval())
+      self.assertAllEqual(np_real, self.evaluate(tf_real))
+      self.assertAllEqual(np_imag, self.evaluate(tf_imag))
+      self.assertAllEqual(np_real, self.evaluate(tf_real_real))
+      self.assertAllEqual(np_zeros, self.evaluate(tf_imag_real))
 
   def testRealImag64(self):
     real = (np.arange(-3, 3) / 4.).reshape([1, 3, 2]).astype(np.float32)
@@ -876,12 +881,12 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareAngle(self, cplx, use_gpu):
     np_angle = np.angle(cplx)
-    with self.test_session(
-        use_gpu=use_gpu,
-        force_gpu=use_gpu and test_util.is_gpu_available()) as sess:
+
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_angle = math_ops.angle(inx)
-      tf_angle_val = sess.run(tf_angle)
+      tf_angle_val = self.evaluate(tf_angle)
+
     self.assertAllEqual(np_angle, tf_angle_val)
     self.assertShapeEqual(np_angle, tf_angle)
 
@@ -903,6 +908,7 @@ class ComplexMakeRealImagTest(test.TestCase):
     # build failures on GPU (See #10643 for context).
     # self._compareAngle(cplx, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testRealReal(self):
     for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float32,
                   dtypes_lib.float64):
@@ -912,11 +918,10 @@ class ComplexMakeRealImagTest(test.TestCase):
 
   def _compareConj(self, cplx, use_gpu):
     np_ans = np.conj(cplx)
-    with self.test_session(use_gpu=use_gpu,
-                           force_gpu=use_gpu and test_util.is_gpu_available()):
+    with test_util.device(use_gpu=use_gpu):
       inx = ops.convert_to_tensor(cplx)
       tf_conj = math_ops.conj(inx)
-      tf_ans = tf_conj.eval()
+      tf_ans = self.evaluate(tf_conj)
     self.assertAllEqual(np_ans, tf_ans)
     self.assertShapeEqual(np_ans, tf_conj)
 
@@ -934,6 +939,7 @@ class ComplexMakeRealImagTest(test.TestCase):
     self._compareConj(cplx, use_gpu=False)
     self._compareConj(cplx, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testConjReal(self):
     for dtype in (dtypes_lib.int32, dtypes_lib.int64, dtypes_lib.float16,
                   dtypes_lib.float32, dtypes_lib.float64):
@@ -941,6 +947,7 @@ class ComplexMakeRealImagTest(test.TestCase):
       y = math_ops.conj(x)
       self.assertEqual(x, y)
 
+  @test_util.run_deprecated_v1
   def testConjString(self):
     x = array_ops.placeholder(dtypes_lib.string)
     with self.assertRaisesRegexp(TypeError,
@@ -977,6 +984,7 @@ class ComplexMakeRealImagTest(test.TestCase):
             x_, list(x.shape), z, [1], x_init_value=x, delta=epsilon)
         self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     # complex64
     data = np.arange(1, 2, 0.10).reshape([5, 2]).astype(np.float32)
@@ -1012,6 +1020,7 @@ class ComplexMakeRealImagTest(test.TestCase):
           inp, list(data.shape), loss, [1], x_init_value=data, delta=epsilon)
     self.assertAllClose(jacob_t, jacob_n, rtol=epsilon, atol=epsilon)
 
+  @test_util.run_deprecated_v1
   def testMulGradient(self):
     data = np.arange(1, 2, 0.125).reshape([2, 4]).astype(np.float32)
     self._compareMulGradient(data)
@@ -1032,13 +1041,13 @@ class AccumulateTest(test.TestCase):
       np_val = random_arrays[0]
       for random_array in random_arrays[1:]:
         np_val += random_array
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testZeroArgs(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         tf_val = math_ops.accumulate_n([])
-        tf_val.eval()
+        self.evaluate(tf_val)
 
   def testWrongShape(self):
     with self.cached_session():
@@ -1070,7 +1079,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testSimple(self):
     for dtype in [
@@ -1093,7 +1102,7 @@ class PolyvalTest(test.TestCase):
         np_val = np.polyval(coeffs, x)
         with self.cached_session():
           tf_val = math_ops.polyval(coeffs, x)
-          self.assertAllClose(np_val, tf_val.eval())
+          self.assertAllClose(np_val, self.evaluate(tf_val))
 
   def testEmpty(self):
     x = np.random.rand(2, 2).astype(np.float32)
@@ -1101,7 +1110,7 @@ class PolyvalTest(test.TestCase):
     np_val = np.polyval(coeffs, x)
     with self.cached_session():
       tf_val = math_ops.polyval(coeffs, x)
-      self.assertAllClose(np_val, tf_val.eval())
+      self.assertAllClose(np_val, self.evaluate(tf_val))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
index 77f182784ebb0a149762e291c4e0bdd937bf8dfa..709a20f3d0da0ea73924589699d5ecb24f963bf2 100644
--- a/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
+++ b/tensorflow/python/kernel_tests/cwise_ops_unary_test.py
@@ -76,7 +76,7 @@ class UnaryOpTest(test.TestCase):
     if grad_atol is None:
       grad_atol = _default_tolerance(x.dtype)
     np_ans = np_func(x)
-    with self.test_session(use_gpu=False):
+    with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
       if x.dtype in (np.float32, np.float64,
                      dtypes_lib.bfloat16.as_numpy_dtype):
@@ -84,7 +84,7 @@ class UnaryOpTest(test.TestCase):
         np_ans *= 1.1
       else:
         y = tf_func(inx)
-      tf_cpu = y.eval()
+      tf_cpu = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       if x.dtype == np.float16:
         self.assertAllClose(np_ans, tf_cpu, rtol=1e-3, atol=1e-3)
@@ -121,26 +121,24 @@ class UnaryOpTest(test.TestCase):
   def _check(self, result_tensor, result_np, input_sp_t, tol):
     self.assertTrue(isinstance(result_tensor, sparse_tensor.SparseTensor))
     self.assertTrue(isinstance(input_sp_t, sparse_tensor.SparseTensor))
-    self.assertAllEqual(input_sp_t.indices.eval(), result_tensor.indices.eval())
-    self.assertAllEqual(input_sp_t.dense_shape.eval(),
-                        result_tensor.dense_shape.eval())
+    self.assertAllEqual(input_sp_t.indices, result_tensor.indices)
+    self.assertAllEqual(input_sp_t.dense_shape, result_tensor.dense_shape)
     if tol is None:
-      self.assertAllClose(result_np, result_tensor.values.eval())
+      self.assertAllClose(result_np, result_tensor.values)
     else:
-      self.assertAllClose(
-          result_np, result_tensor.values.eval(), rtol=tol, atol=tol)
+      self.assertAllClose(result_np, result_tensor.values, rtol=tol, atol=tol)
 
   def _compareSparseCpu(self, x, np_func, tf_func, tol):
     x_sp, x_sp_vals = _sparsify(x)
     res_np = np_func(x_sp_vals)
-    with self.test_session(use_gpu=False):
+    with test_util.force_cpu():
       self._check(tf_func(x_sp), res_np, x_sp, tol)
 
   def _compareGpu(self, x, np_func, tf_func):
     np_ans = np_func(x)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       result = tf_func(ops.convert_to_tensor(x))
-      tf_gpu = result.eval()
+      tf_gpu = self.evaluate(result)
     if x.dtype == np.float16:
       self.assertAllClose(np_ans, tf_gpu, rtol=1e-3, atol=1e-3)
     else:
@@ -150,7 +148,7 @@ class UnaryOpTest(test.TestCase):
   def _compareSparseGpu(self, x, np_func, tf_func, tol):
     x_sp, x_sp_vals = _sparsify(x)
     res_np = np_func(x_sp_vals)
-    with self.test_session(force_gpu=test_util.is_gpu_available()):
+    with test_util.use_gpu():
       self._check(tf_func(x_sp), res_np, x_sp, tol)
 
   def _compareBoth(self, x, np_func, tf_func):
@@ -186,6 +184,7 @@ class UnaryOpTest(test.TestCase):
 
     return func
 
+  @test_util.run_deprecated_v1
   def testFloatBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float32)
     w = x - x.min() + 1.02  # all greater than 1
@@ -240,12 +239,14 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testFloatTanhEdge(self):
     x = np.arange(40, 40 + 6).reshape(6).astype(np.float32)
     self._compareBoth(x, np.tanh, math_ops.tanh)
     x = np.arange(-40, -40 + 6).reshape(6).astype(np.float32)
     self._compareBoth(x, np.tanh, math_ops.tanh)
 
+  @test_util.run_deprecated_v1
   def testFloatEmpty(self):
     x = np.empty((2, 0, 5), dtype=np.float32)
     self._compareBoth(x, np.abs, math_ops.abs)
@@ -291,6 +292,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(x, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.sign, math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testDoubleBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float64)
     w = x - x.min() + 1.02  # all greater than 1
@@ -344,6 +346,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBothSparse(y, np.sign, math_ops.sign)
     self._compareBothSparse(x, np.vectorize(math.erf), math_ops.erf)
 
+  @test_util.run_deprecated_v1
   def testHalfBasic(self):
     x = np.arange(-3, 3).reshape(1, 3, 2).astype(np.float16)
     y = (x + .5).astype(np.float16)  # no zero
@@ -416,6 +419,7 @@ class UnaryOpTest(test.TestCase):
     self._compareCpu(x, np.square, math_ops.square)
     self._compareBothSparse(x, np.square, math_ops.square)
 
+  @test_util.run_deprecated_v1
   def testComplex64Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex64)
@@ -460,6 +464,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
+  @test_util.run_deprecated_v1
   def testComplex128Basic(self):
     x = np.complex(1, 1) * np.arange(-3, 3).reshape(1, 3, 2).astype(
         np.complex128)
@@ -499,6 +504,7 @@ class UnaryOpTest(test.TestCase):
     self._compareBoth(y, complex_sign, math_ops.sign)
     self._compareBothSparse(y, complex_sign, math_ops.sign)
 
+  @test_util.run_deprecated_v1
   def testGradGrad(self):
     np.random.seed(7)
     shape = (5,)
diff --git a/tensorflow/python/kernel_tests/decode_bmp_op_test.py b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
index eebaffbe13ab1afbc9c6e36c2e5710dcf56e4b15..5e7991382ed14ed401edd38c6ab28af6630e1099 100644
--- a/tensorflow/python/kernel_tests/decode_bmp_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_bmp_op_test.py
@@ -61,7 +61,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = array_ops.squeeze(image_ops.decode_bmp(img_in))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
   def testGrayscale(self):
@@ -136,7 +136,7 @@ class DecodeBmpOpTest(test.TestCase):
     decode = image_ops.decode_bmp(img_in)
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/decode_compressed_op_test.py b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
index 1cc1c7da30a9c73935aa11ac9226c15aa2cf7954..fd871c0090699f36df41d1e7f7423bf273c4bba7 100644
--- a/tensorflow/python/kernel_tests/decode_compressed_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_compressed_op_test.py
@@ -24,6 +24,7 @@ import zlib
 from six import BytesIO
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -42,6 +43,7 @@ class DecodeCompressedOpTest(test.TestCase):
         f.write(bytes_in)
       return out.getvalue()
 
+  @test_util.run_deprecated_v1
   def testDecompress(self):
     for compression_type in ["ZLIB", "GZIP", ""]:
       with self.cached_session():
@@ -55,6 +57,7 @@ class DecodeCompressedOpTest(test.TestCase):
                                   self._compress(b"bBbb", compression_type)]})
         self.assertAllEqual([b"AaAA", b"bBbb"], result)
 
+  @test_util.run_deprecated_v1
   def testDecompressWithRaw(self):
     for compression_type in ["ZLIB", "GZIP", ""]:
       with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/decode_image_op_test.py b/tensorflow/python/kernel_tests/decode_image_op_test.py
index 0975f964b5898d9e100e2fdcd2af98029e28be95..ba5770001ad30eb9b2b0c084faa483dbcb4728b9 100644
--- a/tensorflow/python/kernel_tests/decode_image_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_image_op_test.py
@@ -23,6 +23,7 @@ import os.path
 import numpy as np
 
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import image_ops
 from tensorflow.python.ops import io_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -40,10 +41,11 @@ class DecodeImageOpTest(test.TestCase):
       bmp0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(bmp0)
       image1 = image_ops.decode_bmp(bmp0)
-      bmp0, image0, image1 = sess.run([bmp0, image0, image1])
+      bmp0, image0, image1 = self.evaluate([bmp0, image0, image1])
       self.assertEqual(len(bmp0), 4194)
       self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testGif(self):
     # Read some real GIFs
     path = os.path.join(prefix_path, "gif", "testdata", "scan.gif")
@@ -56,7 +58,7 @@ class DecodeImageOpTest(test.TestCase):
       gif0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(gif0)
       image1 = image_ops.decode_gif(gif0)
-      gif0, image0, image1 = sess.run([gif0, image0, image1])
+      gif0, image0, image1 = self.evaluate([gif0, image0, image1])
 
       self.assertEqual(image0.shape, shape)
       self.assertAllEqual(image0, image1)
@@ -76,8 +78,9 @@ class DecodeImageOpTest(test.TestCase):
 
         bad_channels = image_ops.decode_image(gif0, channels=1)
         with self.assertRaises(errors_impl.InvalidArgumentError):
-          bad_channels.eval()
+          self.evaluate(bad_channels)
 
+  @test_util.run_deprecated_v1
   def testJpeg(self):
     # Read a real jpeg and verify shape
     path = os.path.join(prefix_path, "jpeg", "testdata", "jpeg_merge_test1.jpg")
@@ -85,14 +88,14 @@ class DecodeImageOpTest(test.TestCase):
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_image(jpeg0)
       image1 = image_ops.decode_jpeg(jpeg0)
-      jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
+      jpeg0, image0, image1 = self.evaluate([jpeg0, image0, image1])
       self.assertEqual(len(jpeg0), 3771)
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertAllEqual(image0, image1)
 
       bad_channels = image_ops.decode_image(jpeg0, channels=4)
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        bad_channels.eval()
+        self.evaluate(bad_channels)
 
   def testPng(self):
     # Read some real PNGs, converting to different channel numbers
@@ -104,16 +107,17 @@ class DecodeImageOpTest(test.TestCase):
           png0 = io_ops.read_file(path)
           image0 = image_ops.decode_image(png0, channels=channels)
           image1 = image_ops.decode_png(png0, channels=channels)
-          png0, image0, image1 = sess.run([png0, image0, image1])
+          png0, image0, image1 = self.evaluate([png0, image0, image1])
           self.assertEqual(image0.shape, (26, 51, channels or channels_in))
           self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testInvalidBytes(self):
     image_bytes = b"ThisIsNotAnImage!"
     decode = image_ops.decode_image(image_bytes)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        decode.eval()
+        self.evaluate(decode)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
index 66b3e0f22fd2ab07311895da5df5448ee4e6e6f0..f8fc28062f4d9cd846a5b124611b56c35f652442 100644
--- a/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_jpeg_op_test.py
@@ -80,7 +80,7 @@ class DecodeJpegBenchmark(test.Benchmark):
           initializer=image_ops.encode_jpeg(tiled_image))
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       images = []
       for _ in xrange(parallelism):
         if crop_window is None:
@@ -105,11 +105,11 @@ class DecodeJpegBenchmark(test.Benchmark):
 
       for _ in xrange(3):
         # Skip warm up time.
-        sess.run(r)
+        self.evaluate(r)
 
       start_time = time.time()
       for _ in xrange(num_iters):
-        sess.run(r)
+        self.evaluate(r)
       end_time = time.time()
     return end_time - start_time
 
diff --git a/tensorflow/python/kernel_tests/decode_png_op_test.py b/tensorflow/python/kernel_tests/decode_png_op_test.py
index 8f36343667f72b410f14a1934c93a61debaff59e..5a0b742a6a46aa994eb555f09ab3fb75c8a03b15 100644
--- a/tensorflow/python/kernel_tests/decode_png_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_png_op_test.py
@@ -47,7 +47,7 @@ class DecodePngOpTest(test.TestCase):
             img_in, dtype=dtypes.uint16))
 
     with self.cached_session():
-      decoded = decode.eval()
+      decoded = self.evaluate(decode)
       self.assertAllEqual(decoded, img_bytes)
 
 
diff --git a/tensorflow/python/kernel_tests/decode_raw_op_test.py b/tensorflow/python/kernel_tests/decode_raw_op_test.py
index dcc984811cbcfef206befde7a94b3c948a07c15d..008e59ba3e64915d8642243d335701e8adea19c0 100644
--- a/tensorflow/python/kernel_tests/decode_raw_op_test.py
+++ b/tensorflow/python/kernel_tests/decode_raw_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -28,6 +29,7 @@ from tensorflow.python.platform import test
 
 class DecodeRawOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testToUint8(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[2])
@@ -46,6 +48,7 @@ class DecodeRawOpTest(test.TestCase):
           "element 1 has size 5 != 6"):
         decode.eval(feed_dict={in_bytes: ["short", "longer"]})
 
+  @test_util.run_deprecated_v1
   def testToInt16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -61,6 +64,7 @@ class DecodeRawOpTest(test.TestCase):
           "size of int16"):
         decode.eval(feed_dict={in_bytes: ["123", "456"]})
 
+  @test_util.run_deprecated_v1
   def testEndianness(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -73,6 +77,7 @@ class DecodeRawOpTest(test.TestCase):
       result = decode_be.eval(feed_dict={in_bytes: ["\x01\x02\x03\x04"]})
       self.assertAllEqual([[0x01020304]], result)
 
+  @test_util.run_deprecated_v1
   def testToFloat16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -84,6 +89,7 @@ class DecodeRawOpTest(test.TestCase):
 
       self.assertAllEqual(expected_result, result)
 
+  @test_util.run_deprecated_v1
   def testEmptyStringInput(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
@@ -93,6 +99,7 @@ class DecodeRawOpTest(test.TestCase):
         result = decode.eval(feed_dict={in_bytes: [""] * num_inputs})
         self.assertEqual((num_inputs, 0), result.shape)
 
+  @test_util.run_deprecated_v1
   def testToUInt16(self):
     with self.cached_session():
       in_bytes = array_ops.placeholder(dtypes.string, shape=[None])
diff --git a/tensorflow/python/kernel_tests/denormal_test.py b/tensorflow/python/kernel_tests/denormal_test.py
index 71a528c4aa2ff5ae966e60aff7cd92e4b8a0c5e8..d824e95f213acf5480be9bf2c431a4c4b89d106a 100644
--- a/tensorflow/python/kernel_tests/denormal_test.py
+++ b/tensorflow/python/kernel_tests/denormal_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import platform
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -35,8 +36,9 @@ class DenormalTest(test.TestCase):
       self.assertEqual(tiny, tiny / 16 * 16)
 
   def _flushDenormalsTest(self, use_gpu, dtypes):
-    if platform.machine() == "ppc64le" or platform.machine() == "s390x":
-      # Disabled denormal_test on power/s390x platform
+    if platform.machine() == "ppc64le" or platform.machine(
+    ) == "s390x" or platform.machine() == "aarch64":
+      # Disabled denormal_test on power/s390x/aarch64 platform
       # Check relevant discussion - https://github.com/tensorflow/tensorflow/issues/11902
       return
     with self.cached_session(use_gpu=use_gpu):
@@ -50,10 +52,12 @@ class DenormalTest(test.TestCase):
           # Make sure the flags don't leak out
           self.testPythonHasDenormals()
 
+  @test_util.run_deprecated_v1
   def testFlushDenormalsCPU(self):
     # On CPUs, the processor flags flush for both single and double precision.
     self._flushDenormalsTest(use_gpu=False, dtypes=(np.float32, np.float64))
 
+  @test_util.run_deprecated_v1
   def testFlushDenormalsGPU(self):
     # On GPUs, only single precision can flush to zero.
     self._flushDenormalsTest(use_gpu=True, dtypes=(np.float32,))
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
index affbaf159d82e15d6c15a83ae509851ae1219c7f..4e3da068b8927c324bf9b17fb8e19e1038470777 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_no_tsan_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -32,6 +33,7 @@ class AssignOpTest(test.TestCase):
   # NOTE(mrry): We exclude thess tests from the TSAN TAP target, because they
   #   contain benign and deliberate data races when multiple threads update
   #   the same parameters without a lock.
+  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], 1.0)
@@ -43,7 +45,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -54,11 +56,12 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertTrue((vals >= ones).all())
       self.assertTrue((vals <= ones * 20).all())
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithoutLocking(self):
     with self.cached_session() as sess:
       ones_t = array_ops.fill([1024, 1024], float(1))
@@ -70,7 +73,7 @@ class AssignOpTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -81,7 +84,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is taken from one of the assignments.
       self.assertTrue((vals > 0).all())
@@ -91,6 +94,7 @@ class AssignOpTest(test.TestCase):
   # contain non-benign but known data races between the variable assignment and
   # returning the output tensors. This issue will be resolved with the new
   # resource variables.
+  @test_util.run_v1_only("b/120545219")
   def testParallelUpdateWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -103,7 +107,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_add(add_op):
-        sess.run(add_op)
+        self.evaluate(add_op)
 
       threads = [
           self.checkedThread(
@@ -114,10 +118,11 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
       ones = np.ones((1024, 1024)).astype(np.float32)
       self.assertAllEqual(vals, ones * 20)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelAssignWithLocking(self):
     with self.cached_session() as sess:
       zeros_t = array_ops.fill([1024, 1024], 0.0)
@@ -131,7 +136,7 @@ class AssignOpTest(test.TestCase):
       p.initializer.run()
 
       def run_assign(assign_op):
-        sess.run(assign_op)
+        self.evaluate(assign_op)
 
       threads = [
           self.checkedThread(
@@ -142,7 +147,7 @@ class AssignOpTest(test.TestCase):
       for t in threads:
         t.join()
 
-      vals = p.eval()
+      vals = self.evaluate(p)
 
       # Assert every element is the same, and taken from one of the assignments.
       self.assertTrue(vals[0, 0] > 0)
diff --git a/tensorflow/python/kernel_tests/dense_update_ops_test.py b/tensorflow/python/kernel_tests/dense_update_ops_test.py
index 3e0a03d634f13f182dcd142f188c6721f18aa4a5..545de87ca10deb6c01ab889f331aa61dc815e19e 100644
--- a/tensorflow/python/kernel_tests/dense_update_ops_test.py
+++ b/tensorflow/python/kernel_tests/dense_update_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -36,8 +37,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       assign = state_ops.assign(p, y)
       p.initializer.run()
-      new_value = assign.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(assign)
+      return self.evaluate(p), new_value
 
   def _initAssignAddFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param += y."""
@@ -45,8 +46,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       add = state_ops.assign_add(p, y)
       p.initializer.run()
-      new_value = add.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(add)
+      return self.evaluate(p), new_value
 
   def _initAssignSubFetch(self, x, y, use_gpu=False):
     """Initialize a param to init, and compute param -= y."""
@@ -54,8 +55,8 @@ class AssignOpTest(test.TestCase):
       p = variables.Variable(x)
       sub = state_ops.assign_sub(p, y)
       p.initializer.run()
-      new_value = sub.eval()
-      return p.eval(), new_value
+      new_value = self.evaluate(sub)
+      return self.evaluate(p), new_value
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -81,23 +82,26 @@ class AssignOpTest(test.TestCase):
         self.assertAllEqual(x - y, var_value)
         self.assertAllEqual(x - y, op_value)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testTypes(np.arange(0, 20).reshape([4, 5]))
 
+  @test_util.run_v1_only("b/120545219")
   def testAssignNonStrictShapeChecking(self):
     with self.cached_session():
       data = array_ops.fill([1024, 1024], 0)
       p = variables.VariableV1([1])
       a = state_ops.assign(p, data, validate_shape=False)
       a.op.run()
-      self.assertAllEqual(p.eval(), data.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data))
 
       # Assign to yet another shape
       data2 = array_ops.fill([10, 10], 1)
       a2 = state_ops.assign(p, data2, validate_shape=False)
       a2.op.run()
-      self.assertAllEqual(p.eval(), data2.eval())
+      self.assertAllEqual(p.eval(), self.evaluate(data2))
 
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignAdd(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
@@ -105,6 +109,7 @@ class AssignOpTest(test.TestCase):
       with self.assertRaisesOpError("use uninitialized"):
         a.op.run()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitRequiredAssignSub(self):
     with self.cached_session():
       p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/depthtospace_op_test.py b/tensorflow/python/kernel_tests/depthtospace_op_test.py
index 13a28caf1fd8f3217490da5e594224d493f60850..96c9b5258e2a4a103a3d981a3340f67a01bbec94 100644
--- a/tensorflow/python/kernel_tests/depthtospace_op_test.py
+++ b/tensorflow/python/kernel_tests/depthtospace_op_test.py
@@ -53,12 +53,14 @@ class DepthToSpaceTest(test.TestCase):
         output_nhwc = test_util.NCHWToNHWC(output_nchw)
         self.assertAllEqual(output_nhwc.eval(), outputs)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x_np = [[[[1, 2, 3, 4]]]]
     block_size = 2
     x_out = [[[[1], [2]], [[3], [4]]]]
     self._testOne(x_np, block_size, x_out)
 
+  @test_util.run_deprecated_v1
   def testBasicFloat16(self):
     x_np = [[[[1, 2, 3, 4]]]]
     block_size = 2
@@ -67,6 +69,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
+  @test_util.run_deprecated_v1
   def testBlockSize2(self):
     x_np = [[[[1, 2, 3, 4],
               [5, 6, 7, 8]],
@@ -79,6 +82,7 @@ class DepthToSpaceTest(test.TestCase):
               [[11], [12], [15], [16]]]]
     self._testOne(x_np, block_size, x_out)
 
+  @test_util.run_deprecated_v1
   def testBlockSize2Batch10(self):
     block_size = 2
     def batch_input_elt(i):
@@ -106,15 +110,16 @@ class DepthToSpaceTest(test.TestCase):
       # test NHWC (default) on CPU
       x_tf = array_ops.depth_to_space(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
     if test.is_gpu_available():
       with self.cached_session(use_gpu=True):
         # test NHWC (default) on GPU
         x_tf = array_ops.depth_to_space(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
+  @test_util.run_deprecated_v1
   def testNonSquare(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]],
              [[5, 50, 6, 60, 7, 70, 8, 80]],
@@ -130,6 +135,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input dimensions. To make sure elements are
   # correctly ordered spatially.
+  @test_util.run_deprecated_v1
   def testBlockSize4FlatInput(self):
     x_np = [[[[1, 2, 5, 6, 3, 4, 7, 8, 9, 10, 13, 14, 11, 12, 15, 16]]]]
     block_size = 4
@@ -141,6 +147,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleaved(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40]]]]
     block_size = 2
@@ -150,6 +157,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths. Here an odd depth.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleavedDepth3(self):
     x_np = [[[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]]]
     block_size = 2
@@ -159,6 +167,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for larger input depths.
   # To make sure elements are properly interleaved in depth.
+  @test_util.run_deprecated_v1
   def testDepthInterleavedLarger(self):
     x_np = [[[[1, 10, 2, 20, 3, 30, 4, 40],
               [5, 50, 6, 60, 7, 70, 8, 80]],
@@ -175,6 +184,7 @@ class DepthToSpaceTest(test.TestCase):
 
   # Tests for a block larger for the depth. In this case should raise an
   # exception.
+  @test_util.run_deprecated_v1
   def testBlockSizeTooLarge(self):
     x_np = [[[[1, 2, 3, 4],
               [5, 6, 7, 8]],
@@ -185,18 +195,20 @@ class DepthToSpaceTest(test.TestCase):
     # divisible by 16.
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 0.
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     x_np = [[[[1], [2]],
              [[3], [4]]]]
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
   # Test when the block size is 1. The block size should be > 1.
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     x_np = [[[[1, 1, 1, 1],
               [2, 2, 2, 2]],
@@ -205,8 +217,9 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.depth_to_space(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLargerThanInput(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]],
@@ -214,8 +227,9 @@ class DepthToSpaceTest(test.TestCase):
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleDepth(self):
     # The depth is not divisible by the square of the block size.
     x_np = [[[[1, 1, 1, 1],
@@ -226,6 +240,7 @@ class DepthToSpaceTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = array_ops.depth_to_space(
         array_ops.placeholder(dtypes.float32), block_size=4)
@@ -277,7 +292,7 @@ class DepthToSpaceTest(test.TestCase):
       actual = array_ops.depth_to_space(t, block_size, data_format=data_format)
 
     with self.session(use_gpu=use_gpu) as sess:
-      actual_vals, expected_vals = sess.run([actual, expected])
+      actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
   def testAgainstTranspose(self):
@@ -343,11 +358,13 @@ class DepthToSpaceGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here, as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     self._compare(3, 2, 5, 3, block_size, "NHWC")
     self._compare(3, 2, 5, 3, block_size, "NCHW")
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 3
     self._compare(1, 2, 3, 2, block_size, "NHWC")
diff --git a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
index 77b27c6c7e01bde8fa005142e7e5c00110bd628f..5b1a47fb03563f3c104e0d0ca158a0918dcb39b6 100644
--- a/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/depthwise_conv_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_impl
@@ -162,7 +163,7 @@ class DepthwiseConv2DTest(test.TestCase):
         conv_native = array_ops.transpose(conv_native, [0, 2, 3, 1])
 
       try:
-        native_result = sess.run(conv_native)
+        native_result = self.evaluate(conv_native)
       except errors.InvalidArgumentError as e:
         # Grouped convolution kernel is only registered for cuDNN 7. Silently
         # return when we are running on an earlier version or without GPU.
@@ -174,7 +175,7 @@ class DepthwiseConv2DTest(test.TestCase):
 
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      interface_result = sess.run(conv_interface)
+      interface_result = self.evaluate(conv_interface)
 
     tf_logging.info(
         "data_type: %r, use_gpu: %r, grouped_conv: %r, max diff = %f",
@@ -185,6 +186,7 @@ class DepthwiseConv2DTest(test.TestCase):
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2D(self):
     for index, (input_size, filter_size, _, stride,
                 padding) in enumerate(ConfigsToTest()):
@@ -269,7 +271,7 @@ class DepthwiseConv2DTest(test.TestCase):
       t2 = constant_op.constant(x2, shape=filter_in_sizes)
       conv = nn_ops.depthwise_conv2d_native(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     tf_logging.info("value = %r", value)
     self.assertArrayNear(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
@@ -428,6 +430,7 @@ class DepthwiseConv2DTest(test.TestCase):
           use_gpu, grouped_conv, err)
       self.assertLess(err, tolerance)
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DInputGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -477,6 +480,7 @@ class DepthwiseConv2DTest(test.TestCase):
             use_gpu=True,
             data_format="NCHW")
 
+  @test_util.run_v1_only("b/120545219")
   def testDepthwiseConv2DFilterGrad(self):
     for index, (input_size, filter_size, output_size, stride,
                 padding) in enumerate(CheckGradConfigsToTest()):
@@ -528,7 +532,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -548,7 +552,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_input(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -580,7 +584,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
@@ -600,7 +604,7 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=output_sizes)
         backprop = nn_ops.depthwise_conv2d_native_backprop_filter(
             t0, t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        ret = backprop.eval()
+        ret = self.evaluate(backprop)
         self.assertShapeEqual(ret, backprop)
         return ret
 
diff --git a/tensorflow/python/kernel_tests/determinant_op_test.py b/tensorflow/python/kernel_tests/determinant_op_test.py
index da33b2848b738e54ab03297e7754ad0f59deb4a4..dbfda385ed221cda8c42843326bccb08a10e0689 100644
--- a/tensorflow/python/kernel_tests/determinant_op_test.py
+++ b/tensorflow/python/kernel_tests/determinant_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import linalg_ops
@@ -35,7 +36,7 @@ from tensorflow.python.platform import test
 class DeterminantOpTest(test.TestCase):
 
   def _compareDeterminantBase(self, matrix_x, tf_ans):
-    out = tf_ans.eval()
+    out = self.evaluate(tf_ans)
     shape = matrix_x.shape
     if shape[-1] == 0 and shape[-2] == 0:
       np_ans = np.ones(shape[:-2]).astype(matrix_x.dtype)
@@ -54,15 +55,15 @@ class DeterminantOpTest(test.TestCase):
       np_ans = np_ans.astype(matrix_x.dtype)
 
     self.assertShapeEqual(np_ans, abs_log_det_tf)
-    sign_tf_val = sign_tf.eval()
-    abs_log_det_tf_val = abs_log_det_tf.eval()
+    sign_tf_val = self.evaluate(sign_tf)
+    abs_log_det_tf_val = self.evaluate(abs_log_det_tf)
     self.assertAllClose(
         sign_tf_val * np.exp(abs_log_det_tf_val),
         np_sign * np.exp(np_ans),
         atol=5e-5)
 
   def _compareDeterminant(self, matrix_x):
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       self._compareDeterminantBase(matrix_x,
                                    linalg_ops.matrix_determinant(matrix_x))
       self._compareLogDeterminantBase(
@@ -132,6 +133,7 @@ class DeterminantOpTest(test.TestCase):
     huge_matrix = np.array([[max_double, 0.0], [0.0, max_double]])
     self._compareDeterminant(huge_matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the determinant of a non-square matrix is attempted we should return
     # an error
@@ -139,6 +141,7 @@ class DeterminantOpTest(test.TestCase):
       linalg_ops.matrix_determinant(
           np.array([[1., 2., 3.], [3., 5., 4.]]).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the determinant should be a 2-dimensional tensor.
     tensor1 = constant_op.constant([1., 2.])
@@ -149,13 +152,14 @@ class DeterminantOpTest(test.TestCase):
     self._compareDeterminant(np.empty([0, 2, 2]))
     self._compareDeterminant(np.empty([2, 0, 0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
       det1 = linalg_ops.matrix_determinant(matrix1)
       det2 = linalg_ops.matrix_determinant(matrix2)
-      det1_val, det2_val = sess.run([det1, det2])
+      det1_val, det2_val = self.evaluate([det1, det2])
       self.assertEqual(det1_val, det2_val)
 
 
diff --git a/tensorflow/python/kernel_tests/diag_op_test.py b/tensorflow/python/kernel_tests/diag_op_test.py
index 9e43258fa2d0f82cabed85b32b7fe2a8ee5e11f8..ed2a9e8e47e961549dbaa99a78624e22af146937 100644
--- a/tensorflow/python/kernel_tests/diag_op_test.py
+++ b/tensorflow/python/kernel_tests/diag_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -31,6 +32,7 @@ from tensorflow.python.platform import tf_logging
 
 class MatrixDiagTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testVector(self):
     with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
@@ -49,6 +51,7 @@ class MatrixDiagTest(test.TestCase):
       self.assertEqual((2, 3, 3), v_batch_diag.get_shape())
       self.assertAllEqual(v_batch_diag.eval(), mat_batch)
 
+  @test_util.run_deprecated_v1
   def testBatchVector(self):
     self._testBatchVector(np.float32)
     self._testBatchVector(np.float64)
@@ -56,16 +59,19 @@ class MatrixDiagTest(test.TestCase):
     self._testBatchVector(np.int64)
     self._testBatchVector(np.bool)
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.matrix_diag(0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       with self.assertRaisesOpError("input must be at least 1-dim"):
         array_ops.matrix_diag(v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3,), (7, 4))
     with self.session(use_gpu=True):
@@ -81,6 +87,7 @@ class MatrixDiagTest(test.TestCase):
 
 class MatrixSetDiagTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSquare(self):
     with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
@@ -89,8 +96,9 @@ class MatrixSetDiagTest(test.TestCase):
                                [1.0, 1.0, 3.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag, output.eval())
+      self.assertAllEqual(mat_set_diag, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testRectangular(self):
     with self.session(use_gpu=True):
       v = np.array([3.0, 4.0])
@@ -98,14 +106,14 @@ class MatrixSetDiagTest(test.TestCase):
       expected = np.array([[3.0, 1.0, 0.0], [1.0, 4.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((2, 3), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
       v = np.array([3.0, 4.0])
       mat = np.array([[0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
       expected = np.array([[3.0, 1.0], [1.0, 4.0], [1.0, 1.0]])
       output = array_ops.matrix_set_diag(mat, v)
       self.assertEqual((3, 2), output.get_shape())
-      self.assertAllEqual(expected, output.eval())
+      self.assertAllEqual(expected, self.evaluate(output))
 
   def _testSquareBatch(self, dtype):
     with self.cached_session(use_gpu=True):
@@ -121,8 +129,9 @@ class MatrixSetDiagTest(test.TestCase):
 
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 3, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testSquareBatch(self):
     self._testSquareBatch(np.float32)
     self._testSquareBatch(np.float64)
@@ -130,6 +139,7 @@ class MatrixSetDiagTest(test.TestCase):
     self._testSquareBatch(np.int64)
     self._testSquareBatch(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRectangularBatch(self):
     with self.session(use_gpu=True):
       v_batch = np.array([[-1.0, -2.0], [-4.0, -5.0]])
@@ -140,14 +150,16 @@ class MatrixSetDiagTest(test.TestCase):
                                      [[-4.0, 0.0, 4.0], [0.0, -5.0, 0.0]]])
       output = array_ops.matrix_set_diag(mat_batch, v_batch)
       self.assertEqual((2, 2, 3), output.get_shape())
-      self.assertAllEqual(mat_set_diag_batch, output.eval())
+      self.assertAllEqual(mat_set_diag_batch, self.evaluate(output))
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
       array_ops.matrix_set_diag(0, [0])
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.matrix_set_diag([[0]], 0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -157,6 +169,7 @@ class MatrixSetDiagTest(test.TestCase):
           r"but received input shape: \[1,1\] and diagonal shape: \[\]"):
         array_ops.matrix_set_diag([[v]], v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3, 4, 4), (3, 3, 4), (3, 4, 3), (7, 4, 8, 8))
     with self.session(use_gpu=True):
@@ -178,6 +191,7 @@ class MatrixSetDiagTest(test.TestCase):
             y.get_shape().as_list())
         self.assertLess(error_x_diag, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradWithNoShapeInformation(self):
     with self.session(use_gpu=True) as sess:
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -200,6 +214,7 @@ class MatrixSetDiagTest(test.TestCase):
 
 class MatrixDiagPartTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSquare(self):
     with self.session(use_gpu=True):
       v = np.array([1.0, 2.0, 3.0])
@@ -208,6 +223,7 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertEqual((3,), mat_diag.get_shape())
       self.assertAllEqual(mat_diag.eval(), v)
 
+  @test_util.run_deprecated_v1
   def testRectangular(self):
     with self.session(use_gpu=True):
       mat = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
@@ -228,6 +244,7 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertEqual((2, 3), mat_batch_diag.get_shape())
       self.assertAllEqual(mat_batch_diag.eval(), v_batch)
 
+  @test_util.run_deprecated_v1
   def testSquareBatch(self):
     self._testSquareBatch(np.float32)
     self._testSquareBatch(np.float64)
@@ -235,6 +252,7 @@ class MatrixDiagPartTest(test.TestCase):
     self._testSquareBatch(np.int64)
     self._testSquareBatch(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRectangularBatch(self):
     with self.session(use_gpu=True):
       v_batch = np.array([[1.0, 2.0], [4.0, 5.0]])
@@ -245,16 +263,19 @@ class MatrixDiagPartTest(test.TestCase):
       self.assertEqual((2, 2), mat_batch_diag.get_shape())
       self.assertAllEqual(mat_batch_diag.eval(), v_batch)
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 2"):
       array_ops.matrix_diag_part(0)
 
+  @test_util.run_deprecated_v1
   def testInvalidShapeAtEval(self):
     with self.session(use_gpu=True):
       v = array_ops.placeholder(dtype=dtypes_lib.float32)
       with self.assertRaisesOpError("input must be at least 2-dim"):
         array_ops.matrix_diag_part(v).eval(feed_dict={v: 0.0})
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     shapes = ((3, 3), (2, 3), (3, 2), (5, 3, 3))
     with self.session(use_gpu=True):
@@ -273,9 +294,9 @@ class DiagTest(test.TestCase):
   def _diagOp(self, diag, dtype, expected_ans, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.diag(ops.convert_to_tensor(diag.astype(dtype)))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       tf_ans_inv = array_ops.diag_part(expected_ans)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(out, expected_ans)
     self.assertAllClose(inv_out, diag)
     self.assertShapeEqual(expected_ans, tf_ans)
@@ -407,6 +428,7 @@ class DiagTest(test.TestCase):
           dtype=dtype)
       self.diagOp(x, dtype, expected_ans)
 
+  @test_util.run_deprecated_v1
   def testInvalidRank(self):
     with self.assertRaisesRegexp(ValueError, "must be at least rank 1"):
       array_ops.diag(0.0)
@@ -421,7 +443,7 @@ class DiagPartOpTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       tensor = ops.convert_to_tensor(tensor.astype(dtype))
       tf_ans_inv = array_ops.diag_part(tensor)
-      inv_out = tf_ans_inv.eval()
+      inv_out = self.evaluate(tf_ans_inv)
     self.assertAllClose(inv_out, expected_ans)
     self.assertShapeEqual(expected_ans, tf_ans_inv)
 
@@ -445,7 +467,7 @@ class DiagPartOpTest(test.TestCase):
         t = ops.convert_to_tensor(x.astype(np.float32))
         t.set_shape(shape)
         tf_ans = array_ops.diag_part(t)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
       self.assertAllClose(out, expected_ans)
       self.assertShapeEqual(expected_ans, tf_ans)
 
@@ -476,6 +498,7 @@ class DiagPartOpTest(test.TestCase):
     self.diagPartOp(x, np.complex64, expected_ans)
     self.diagPartOp(x, np.complex128, expected_ans)
 
+  @test_util.run_deprecated_v1
   def testOddRank(self):
     w = np.random.rand(2)
     x = np.random.rand(2, 2, 2)
@@ -484,6 +507,7 @@ class DiagPartOpTest(test.TestCase):
     with self.assertRaises(ValueError):
       array_ops.diag_part(0.0)
 
+  @test_util.run_deprecated_v1
   def testUnevenDimensions(self):
     w = np.random.rand(2, 5)
     x = np.random.rand(2, 1, 2, 3)
@@ -493,6 +517,7 @@ class DiagPartOpTest(test.TestCase):
 
 class DiagGradOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDiagGrad(self):
     np.random.seed(0)
     shapes = ((3,), (3, 3), (3, 3, 3))
@@ -513,6 +538,7 @@ class DiagGradOpTest(test.TestCase):
 
 class DiagGradPartOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testDiagPartGrad(self):
     np.random.seed(0)
     shapes = ((3, 3), (3, 3, 3, 3))
diff --git a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
index 37b35ba51a884b9f8568be4a3c93e2144271730d..e6d560b4bc4c79885a4529427f5b427b39a166e6 100644
--- a/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bernoulli_test.py
@@ -151,6 +151,7 @@ class BernoulliTest(test.TestCase):
       self.assertAllClose(self.evaluate(dist.prob(x)), expected_pmf)
       self.assertAllClose(self.evaluate(dist.log_prob(x)), np.log(expected_pmf))
 
+  @test_util.run_deprecated_v1
   def testPmfCorrectBroadcastDynamicShape(self):
     with self.cached_session():
       p = array_ops.placeholder(dtype=dtypes.float32)
@@ -167,6 +168,7 @@ class BernoulliTest(test.TestCase):
           }), [[0.2, 0.7, 0.4]])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testPmfInvalid(self):
     p = [0.1, 0.2, 0.7]
     dist = bernoulli.Bernoulli(probs=p, validate_args=True)
@@ -193,6 +195,7 @@ class BernoulliTest(test.TestCase):
         self.evaluate(
             bernoulli.Bernoulli(probs=p, validate_args=False).log_prob(samps)))
 
+  @test_util.run_deprecated_v1
   def testBroadcasting(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes.float32)
@@ -207,6 +210,7 @@ class BernoulliTest(test.TestCase):
               p: [0.5, 0.5, 0.5]
           }))
 
+  @test_util.run_deprecated_v1
   def testPmfShapes(self):
     with self.cached_session():
       p = array_ops.placeholder(dtypes.float32, shape=[None, 1])
@@ -276,6 +280,7 @@ class BernoulliTest(test.TestCase):
     grad_p = tape.gradient(samples, p)
     self.assertIsNone(grad_p)
 
+  @test_util.run_deprecated_v1
   def testSampleActsLikeSampleN(self):
     with self.cached_session() as sess:
       p = [0.2, 0.6]
diff --git a/tensorflow/python/kernel_tests/distributions/bijector_test.py b/tensorflow/python/kernel_tests/distributions/bijector_test.py
index e20f59f48ac7aacaf650195ade7a50228b80e75c..a0e0a36fecc33b155c309dd9ac0dfda65ef698b8 100644
--- a/tensorflow/python/kernel_tests/distributions/bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/bijector_test.py
@@ -132,6 +132,7 @@ class BijectorTestEventNdims(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Expected scalar"):
       bij.inverse_log_det_jacobian(1., event_ndims=(1, 2))
 
+  @test_util.run_deprecated_v1
   def testBijectorDynamicEventNdims(self):
     bij = BrokenBijector(validate_args=True)
     event_ndims = array_ops.placeholder(dtype=np.int32, shape=None)
@@ -301,6 +302,7 @@ class BijectorReduceEventDimsTest(test.TestCase):
         8.,
         self.evaluate(bij.inverse_log_det_jacobian(x, event_ndims=2)))
 
+  @test_util.run_deprecated_v1
   def testHandlesNonStaticEventNdims(self):
     x_ = [[[1., 2.], [3., 4.]]]
     x = array_ops.placeholder_with_default(x_, shape=None)
diff --git a/tensorflow/python/kernel_tests/distributions/categorical_test.py b/tensorflow/python/kernel_tests/distributions/categorical_test.py
index c6bb06eab3090a103f4a7da92a7f1f5354d9020a..ec1d4ed20703e151876c9e315343b10baa76f760 100644
--- a/tensorflow/python/kernel_tests/distributions/categorical_test.py
+++ b/tensorflow/python/kernel_tests/distributions/categorical_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -44,6 +45,7 @@ def make_categorical(batch_shape, num_classes, dtype=dtypes.int32):
 
 class CategoricalTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testP(self):
     p = [0.2, 0.8]
     dist = categorical.Categorical(probs=p)
@@ -51,6 +53,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(p, dist.probs.eval())
       self.assertAllEqual([2], dist.logits.get_shape())
 
+  @test_util.run_deprecated_v1
   def testLogits(self):
     p = np.array([0.2, 0.8], dtype=np.float32)
     logits = np.log(p) - 50.
@@ -61,6 +64,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.probs.eval(), p)
       self.assertAllClose(dist.logits.eval(), logits)
 
+  @test_util.run_deprecated_v1
   def testShapes(self):
     with self.cached_session():
       for batch_shape in ([], [1], [2, 3, 4]):
@@ -107,6 +111,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertEqual(dist.dtype, dtype)
       self.assertEqual(dist.dtype, dist.sample(5).dtype)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     with self.cached_session():
       logits = array_ops.placeholder(dtype=dtypes.float32)
@@ -121,18 +126,21 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           feed_dict={logits: [[-1000.0, 1000.0], [1000.0, -1000.0]]})
       self.assertAllEqual([1, 0], sample_value_batch)
 
+  @test_util.run_deprecated_v1
   def testPMFWithBatch(self):
     histograms = [[0.2, 0.8], [0.6, 0.4]]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
     with self.cached_session():
       self.assertAllClose(dist.prob([0, 1]).eval(), [0.2, 0.4])
 
+  @test_util.run_deprecated_v1
   def testPMFNoBatch(self):
     histograms = [0.2, 0.8]
     dist = categorical.Categorical(math_ops.log(histograms) - 50.)
     with self.cached_session():
       self.assertAllClose(dist.prob(0).eval(), 0.2)
 
+  @test_util.run_deprecated_v1
   def testCDFWithDynamicEventShapeKnownNdims(self):
     """Test that dynamically-sized events with unknown shape work."""
     batch_size = 2
@@ -184,6 +192,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     actual_cdf = self.evaluate(cdf_op)
     self.assertAllClose(actual_cdf, expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFWithBatch(self):
     histograms = [[0.1, 0.2, 0.3, 0.25, 0.15],
                   [0.0, 0.75, 0.2, 0.05, 0.0]]
@@ -195,6 +204,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     with self.cached_session():
       self.assertAllClose(cdf_op.eval(), expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFNoBatch(self):
     histogram = [0.1, 0.2, 0.3, 0.4]
     event = 2
@@ -205,6 +215,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     with self.cached_session():
       self.assertAlmostEqual(cdf_op.eval(), expected_cdf)
 
+  @test_util.run_deprecated_v1
   def testCDFBroadcasting(self):
     # shape: [batch=2, n_bins=3]
     histograms = [[0.2, 0.1, 0.7],
@@ -287,7 +298,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     }
 
     with self.cached_session() as sess:
-      run_result = sess.run(to_run)
+      run_result = self.evaluate(to_run)
 
     self.assertAllEqual(run_result["cat_prob"].shape,
                         run_result["norm_prob"].shape)
@@ -298,6 +309,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertAllEqual(run_result["cat_log_cdf"].shape,
                         run_result["norm_log_cdf"].shape)
 
+  @test_util.run_deprecated_v1
   def testLogPMF(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
@@ -305,6 +317,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.log_prob([0, 1]).eval(), np.log([0.2, 0.4]))
       self.assertAllClose(dist.log_prob([0.0, 1.0]).eval(), np.log([0.2, 0.4]))
 
+  @test_util.run_deprecated_v1
   def testEntropyNoBatch(self):
     logits = np.log([0.2, 0.8]) - 50.
     dist = categorical.Categorical(logits)
@@ -312,6 +325,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       self.assertAllClose(dist.entropy().eval(),
                           -(0.2 * np.log(0.2) + 0.8 * np.log(0.8)))
 
+  @test_util.run_deprecated_v1
   def testEntropyWithBatch(self):
     logits = np.log([[0.2, 0.8], [0.6, 0.4]]) - 50.
     dist = categorical.Categorical(logits)
@@ -321,6 +335,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           -(0.6 * np.log(0.6) + 0.4 * np.log(0.4))
       ])
 
+  @test_util.run_deprecated_v1
   def testEntropyGradient(self):
     with self.cached_session() as sess:
       logits = constant_op.constant([[1., 2., 3.], [2., 5., 1.]])
@@ -355,7 +370,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       samples = dist.sample(n, seed=123)
       samples.set_shape([n, 1, 2])
       self.assertEqual(samples.dtype, dtypes.int32)
-      sample_values = samples.eval()
+      sample_values = self.evaluate(samples)
       self.assertFalse(np.any(sample_values < 0))
       self.assertFalse(np.any(sample_values > 1))
       self.assertAllClose(
@@ -371,7 +386,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       samples = dist.sample((100, 100), seed=123)
       prob = dist.prob(samples)
-      prob_val = prob.eval()
+      prob_val = self.evaluate(prob)
       self.assertAllClose(
           [0.2**2 + 0.8**2], [prob_val[:, :, :, 0].mean()], atol=1e-2)
       self.assertAllClose(
@@ -393,26 +408,26 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
 
       prob = dist.prob(1)
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([1])
-      self.assertAllClose([[0.8, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([0, 1])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[0, 1]])
-      self.assertAllClose([[0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[0, 1]]])
-      self.assertAllClose([[[0.2, 0.6]]], prob.eval())
+      self.assertAllClose([[[0.2, 0.6]]], self.evaluate(prob))
 
       prob = dist.prob([[1, 0], [0, 1]])
-      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], prob.eval())
+      self.assertAllClose([[0.8, 0.4], [0.2, 0.6]], self.evaluate(prob))
 
       prob = dist.prob([[[1, 1], [1, 0]], [[1, 0], [0, 1]]])
       self.assertAllClose([[[0.8, 0.6], [0.8, 0.4]], [[0.8, 0.4], [0.2, 0.6]]],
-                          prob.eval())
+                          self.evaluate(prob))
 
   def testLogPMFShape(self):
     with self.cached_session():
@@ -440,12 +455,14 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
     self.assertEqual(3, log_prob.get_shape().ndims)
     self.assertAllEqual([2, 2, 2], log_prob.get_shape())
 
+  @test_util.run_deprecated_v1
   def testMode(self):
     with self.cached_session():
       histograms = [[[0.2, 0.8], [0.6, 0.4]]]
       dist = categorical.Categorical(math_ops.log(histograms) - 50.)
       self.assertAllEqual(dist.mode().eval(), [[1, 0]])
 
+  @test_util.run_deprecated_v1
   def testCategoricalCategoricalKL(self):
 
     def np_softmax(logits):
@@ -462,7 +479,7 @@ class CategoricalTest(test.TestCase, parameterized.TestCase):
           b = categorical.Categorical(logits=b_logits)
 
           kl = kullback_leibler.kl_divergence(a, b)
-          kl_val = sess.run(kl)
+          kl_val = self.evaluate(kl)
           # Make sure KL(a||a) is 0
           kl_same = sess.run(kullback_leibler.kl_divergence(a, a))
 
diff --git a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
index d558ca09cc64b1337d2e5f47fc742282eaf7307f..c530037e1edc0437231cd5e968e48028cc4828ff 100644
--- a/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/dirichlet_multinomial_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import dirichlet_multinomial
@@ -36,6 +37,7 @@ class DirichletMultinomialTest(test.TestCase):
   def setUp(self):
     self._rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def testSimpleShapes(self):
     with self.cached_session():
       alpha = np.random.rand(3)
@@ -45,6 +47,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
+  @test_util.run_deprecated_v1
   def testComplexShapes(self):
     with self.cached_session():
       alpha = np.random.rand(3, 2, 2)
@@ -55,6 +58,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
+  @test_util.run_deprecated_v1
   def testNproperty(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -63,6 +67,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual([1, 1], dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
+  @test_util.run_deprecated_v1
   def testAlphaProperty(self):
     alpha = [[1., 2, 3]]
     with self.cached_session():
@@ -70,6 +75,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertEqual([1, 3], dist.concentration.get_shape())
       self.assertAllClose(alpha, dist.concentration.eval())
 
+  @test_util.run_deprecated_v1
   def testPmfNandCountsAgree(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -83,6 +89,7 @@ class DirichletMultinomialTest(test.TestCase):
           "last-dimension must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
+  @test_util.run_deprecated_v1
   def testPmfNonIntegerCounts(self):
     alpha = [[1., 2, 3]]
     n = [[5.]]
@@ -110,7 +117,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [1., 0]
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 3., pmf.eval())
+      self.assertAllClose(1 / 3., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -122,7 +129,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [3., 2]
       dist = ds.DirichletMultinomial(5., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1 / 7., pmf.eval())
+      self.assertAllClose(1 / 7., self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesMultidimensionalN(self):
@@ -134,7 +141,7 @@ class DirichletMultinomialTest(test.TestCase):
       n = np.full([4, 3], 5., dtype=np.float32)
       dist = ds.DirichletMultinomial(n, alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, pmf.eval())
+      self.assertAllClose([[1 / 7., 1 / 7., 1 / 7.]] * 4, self.evaluate(pmf))
       self.assertEqual((4, 3), pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenSameRank(self):
@@ -145,7 +152,7 @@ class DirichletMultinomialTest(test.TestCase):
       counts = [[1., 0], [0., 1]]
       dist = ds.DirichletMultinomial([1.], alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfAlphaStretchedInBroadcastWhenLowerRank(self):
@@ -155,7 +162,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [1., 2]
       counts = [[1., 0], [0., 1]]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 3.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 3.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
@@ -165,7 +172,7 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [[1., 0]]
       pmf = ds.DirichletMultinomial([1., 1.], alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
@@ -175,9 +182,10 @@ class DirichletMultinomialTest(test.TestCase):
       alpha = [[1., 2], [2., 3]]
       counts = [1., 0]
       pmf = ds.DirichletMultinomial(1., alpha).prob(counts)
-      self.assertAllClose([1 / 3., 2 / 5.], pmf.eval())
+      self.assertAllClose([1 / 3., 2 / 5.], self.evaluate(pmf))
       self.assertAllEqual([2], pmf.get_shape())
 
+  @test_util.run_deprecated_v1
   def testPmfForOneVoteIsTheMeanWithOneRecordInput(self):
     # The probabilities of one vote falling into class k is the mean for class
     # k.
@@ -194,6 +202,7 @@ class DirichletMultinomialTest(test.TestCase):
         self.assertAllEqual([3], mean.shape)
         self.assertAllEqual([], pmf.shape)
 
+  @test_util.run_deprecated_v1
   def testMeanDoubleTwoVotes(self):
     # The probabilities of two votes falling into class k for
     # DirichletMultinomial(2, alpha) is twice as much as the probability of one
@@ -215,6 +224,7 @@ class DirichletMultinomialTest(test.TestCase):
         self.assertAllClose(mean2[class_num], 2 * mean1[class_num])
         self.assertAllEqual([3], mean1.shape)
 
+  @test_util.run_deprecated_v1
   def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a DirichletMultinomial constructed
     # via broadcast between alpha, n.
@@ -289,7 +299,7 @@ class DirichletMultinomialTest(test.TestCase):
         expected_covariance = n * (n + alpha_0) / (1 + alpha_0) * shared_matrix
 
         self.assertEqual([2, 2], covariance.get_shape())
-        self.assertAllClose(expected_covariance, covariance.eval())
+        self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceNAlphaBroadcast(self):
     alpha_v = [1., 2, 3]
@@ -327,7 +337,7 @@ class DirichletMultinomialTest(test.TestCase):
           ns * (ns + alpha_0) / (1 + alpha_0))[..., array_ops.newaxis]
 
       self.assertEqual([4, 3, 3], covariance.get_shape())
-      self.assertAllClose(expected_covariance, covariance.eval())
+      self.assertAllClose(expected_covariance, self.evaluate(covariance))
 
   def testCovarianceMultidimensional(self):
     alpha = np.random.rand(3, 5, 4).astype(np.float32)
@@ -353,7 +363,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(0., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(1.0, pmf.eval())
+      self.assertAllClose(1.0, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testLargeTauGivesPreciseProbabilities(self):
@@ -368,7 +378,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8, pmf.eval(), atol=1e-4)
+      self.assertAllClose(0.8, self.evaluate(pmf), atol=1e-4)
       self.assertEqual((), pmf.get_shape())
 
     # Two (three sided) coin flips.  Prob[coin 3] = 0.8.
@@ -376,7 +386,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(2., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.8**2, pmf.eval(), atol=1e-2)
+      self.assertAllClose(0.8**2, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
     # Three (three sided) coin flips.
@@ -384,7 +394,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(3., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, pmf.eval(), atol=1e-2)
+      self.assertAllClose(3 * 0.1 * 0.8 * 0.8, self.evaluate(pmf), atol=1e-2)
       self.assertEqual((), pmf.get_shape())
 
   def testSmallTauPrefersCorrelatedResults(self):
@@ -399,7 +409,7 @@ class DirichletMultinomialTest(test.TestCase):
     with self.cached_session():
       dist = ds.DirichletMultinomial(1., alpha)
       pmf = dist.prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
     # If there are two draws, it is much more likely that they are the same.
@@ -409,9 +419,10 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(2., alpha)
       pmf_same = dist.prob(counts_same)
       pmf_different = dist.prob(counts_different)
-      self.assertLess(5 * pmf_different.eval(), pmf_same.eval())
+      self.assertLess(5 * self.evaluate(pmf_different), self.evaluate(pmf_same))
       self.assertEqual((), pmf_same.get_shape())
 
+  @test_util.run_deprecated_v1
   def testNonStrictTurnsOffAllChecks(self):
     # Make totally invalid input.
     with self.cached_session():
@@ -421,6 +432,7 @@ class DirichletMultinomialTest(test.TestCase):
       dist = ds.DirichletMultinomial(n, alpha, validate_args=False)
       dist.prob(counts).eval()  # Should not raise.
 
+  @test_util.run_deprecated_v1
   def testSampleUnbiasedNonScalarBatch(self):
     with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
@@ -450,6 +462,7 @@ class DirichletMultinomialTest(test.TestCase):
       self.assertAllClose(
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
+  @test_util.run_deprecated_v1
   def testSampleUnbiasedScalarBatch(self):
     with self.cached_session() as sess:
       dist = ds.DirichletMultinomial(
diff --git a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
index e35a8e1cdd7087dbf0ce7520412b4f773468c9e5..62b562387d0ebfbb895f4602e24c8af823f0bb4f 100644
--- a/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
+++ b/tensorflow/python/kernel_tests/distributions/identity_bijector_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.distributions import bijector_test_util
 from tensorflow.python.ops.distributions import identity_bijector
 from tensorflow.python.platform import test
@@ -41,6 +42,7 @@ class IdentityBijectorTest(test.TestCase):
         self.evaluate(
             bijector.forward_log_det_jacobian(x, event_ndims=3)))
 
+  @test_util.run_deprecated_v1
   def testScalarCongruency(self):
     with self.cached_session():
       bijector = identity_bijector.Identity()
diff --git a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
index e77e1117d493511748dea2dc1aff46ea8e7658e6..1e967de570f2fa012c84be50d8ecdf9a49a89dc3 100644
--- a/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
+++ b/tensorflow/python/kernel_tests/distributions/kullback_leibler_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.distributions import kullback_leibler
 from tensorflow.python.ops.distributions import normal
@@ -45,6 +46,7 @@ class KLTest(test.TestCase):
     a = MyDist(loc=0.0, scale=1.0)
     self.assertEqual("OK", kullback_leibler.kl_divergence(a, a, name="OK"))
 
+  @test_util.run_deprecated_v1
   def testDomainErrorExceptions(self):
 
     class MyDistException(normal.Normal):
@@ -63,17 +65,17 @@ class KLTest(test.TestCase):
       kl = kullback_leibler.kl_divergence(a, a, allow_nan_stats=False)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
-        kl.eval()
+        self.evaluate(kl)
       with self.assertRaisesOpError(
           "KL calculation between .* and .* returned NaN values"):
         a.kl_divergence(a).eval()
       a = MyDistException(loc=0.0, scale=1.0, allow_nan_stats=True)
       kl_ok = kullback_leibler.kl_divergence(a, a)
-      self.assertAllEqual([float("nan")], kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(kl_ok))
       self_kl_ok = a.kl_divergence(a)
-      self.assertAllEqual([float("nan")], self_kl_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(self_kl_ok))
       cross_ok = a.cross_entropy(a)
-      self.assertAllEqual([float("nan")], cross_ok.eval())
+      self.assertAllEqual([float("nan")], self.evaluate(cross_ok))
 
   def testRegistrationFailures(self):
 
diff --git a/tensorflow/python/kernel_tests/distributions/multinomial_test.py b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
index 3840d7331cacf588218e3c7dfea85662d545a13a..187ddd4cf417a54acbdd7bcd5fc60459336f11c9 100644
--- a/tensorflow/python/kernel_tests/distributions/multinomial_test.py
+++ b/tensorflow/python/kernel_tests/distributions/multinomial_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.distributions import multinomial
@@ -33,6 +34,7 @@ class MultinomialTest(test.TestCase):
   def setUp(self):
     self._rng = np.random.RandomState(42)
 
+  @test_util.run_v1_only("b/120545219")
   def testSimpleShapes(self):
     with self.cached_session():
       p = [.1, .3, .6]
@@ -42,6 +44,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([3]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testComplexShapes(self):
     with self.cached_session():
       p = 0.5 * np.ones([3, 2, 2], dtype=np.float32)
@@ -52,6 +55,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual(tensor_shape.TensorShape([2]), dist.event_shape)
       self.assertEqual(tensor_shape.TensorShape([3, 2]), dist.batch_shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testN(self):
     p = [[0.1, 0.2, 0.7], [0.2, 0.3, 0.5]]
     n = [[3.], [4]]
@@ -60,6 +64,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((2, 1), dist.total_count.get_shape())
       self.assertAllClose(n, dist.total_count.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testP(self):
     p = [[0.1, 0.2, 0.7]]
     with self.cached_session():
@@ -68,6 +73,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((1, 3), dist.logits.get_shape())
       self.assertAllClose(p, dist.probs.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testLogits(self):
     p = np.array([[0.1, 0.2, 0.7]], dtype=np.float32)
     logits = np.log(p) - 50.
@@ -78,6 +84,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(p, multinom.probs.eval())
       self.assertAllClose(logits, multinom.logits.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfUnderflow(self):
     logits = np.array([[-200, 0]], dtype=np.float32)
     with self.cached_session():
@@ -85,6 +92,7 @@ class MultinomialTest(test.TestCase):
       lp = dist.log_prob([1., 0.]).eval()[0]
       self.assertAllClose(-200, lp, atol=0, rtol=1e-6)
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfandCountsAgree(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -97,6 +105,7 @@ class MultinomialTest(test.TestCase):
       with self.assertRaisesOpError("counts must sum to `self.total_count`"):
         dist.prob([3., 3, 0]).eval()
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfNonIntegerCounts(self):
     p = [[0.1, 0.2, 0.7]]
     n = [[5.]]
@@ -127,7 +136,7 @@ class MultinomialTest(test.TestCase):
       p = [0.5, 0.5]
       counts = [1., 0]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose(0.5, pmf.eval())
+      self.assertAllClose(0.5, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfBothZeroBatchesNontrivialN(self):
@@ -138,7 +147,7 @@ class MultinomialTest(test.TestCase):
       dist = multinomial.Multinomial(total_count=5., probs=p)
       pmf = dist.prob(counts)
       # 5 choose 3 = 5 choose 2 = 10. 10 * (.9)^2 * (.1)^3 = 81/10000.
-      self.assertAllClose(81. / 10000, pmf.eval())
+      self.assertAllClose(81. / 10000, self.evaluate(pmf))
       self.assertEqual((), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenSameRank(self):
@@ -146,7 +155,7 @@ class MultinomialTest(test.TestCase):
       p = [[0.1, 0.9]]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
   def testPmfPStretchedInBroadcastWhenLowerRank(self):
@@ -154,9 +163,10 @@ class MultinomialTest(test.TestCase):
       p = [0.1, 0.9]
       counts = [[1., 0], [0, 1]]
       pmf = multinomial.Multinomial(total_count=1., probs=p).prob(counts)
-      self.assertAllClose([0.1, 0.9], pmf.eval())
+      self.assertAllClose([0.1, 0.9], self.evaluate(pmf))
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenSameRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -165,6 +175,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(pmf.eval(), [0.1, 0.7])
       self.assertEqual((2), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testPmfCountsStretchedInBroadcastWhenLowerRank(self):
     with self.cached_session():
       p = [[0.1, 0.9], [0.7, 0.3]]
@@ -182,7 +193,7 @@ class MultinomialTest(test.TestCase):
       # [2]
       counts = [2., 1]
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual(pmf.get_shape(), (2, 2))
 
   def testPmfShapeCountsPStretchedN(self):
@@ -191,9 +202,10 @@ class MultinomialTest(test.TestCase):
       counts = [3., 2]
       n = np.full([4, 3], 5., dtype=np.float32)
       pmf = multinomial.Multinomial(total_count=n, probs=p).prob(counts)
-      pmf.eval()
+      self.evaluate(pmf)
       self.assertEqual((4, 3), pmf.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialMean(self):
     with self.cached_session():
       n = 5.
@@ -203,6 +215,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3,), dist.mean().get_shape())
       self.assertAllClose(expected_means, dist.mean().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovariance(self):
     with self.cached_session():
       n = 5.
@@ -214,6 +227,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 3), dist.covariance().get_shape())
       self.assertAllClose(expected_covariances, dist.covariance().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultinomialCovarianceBatch(self):
     with self.cached_session():
       # Shape [2]
@@ -246,6 +260,7 @@ class MultinomialTest(test.TestCase):
       self.assertEqual((3, 5, 4, 4), covariance.get_shape())
       self.assertEqual((6, 3, 3, 3), covariance2.get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testCovarianceFromSampling(self):
     # We will test mean, cov, var, stddev on a DirichletMultinomial constructed
     # via broadcast between alpha, n.
@@ -288,6 +303,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(sample_var_, analytic_var, atol=0.01, rtol=0.01)
       self.assertAllClose(sample_stddev_, analytic_stddev, atol=0.01, rtol=0.01)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedNonScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
@@ -317,6 +333,7 @@ class MultinomialTest(test.TestCase):
       self.assertAllClose(
           actual_covariance_, sample_covariance_, atol=0., rtol=0.20)
 
+  @test_util.run_v1_only("b/120545219")
   def testSampleUnbiasedScalarBatch(self):
     with self.cached_session() as sess:
       dist = multinomial.Multinomial(
diff --git a/tensorflow/python/kernel_tests/distributions/normal_test.py b/tensorflow/python/kernel_tests/distributions/normal_test.py
index 6625a88843f1ca489799bd19172db437d965a182..f2a193e69bd4393bda3817a45f7a27db70c73115 100644
--- a/tensorflow/python/kernel_tests/distributions/normal_test.py
+++ b/tensorflow/python/kernel_tests/distributions/normal_test.py
@@ -511,6 +511,7 @@ class NormalTest(test.TestCase):
     self.assertAllEqual(self.evaluate(normal.event_shape_tensor()), [])
     self.assertEqual(normal.event_shape, tensor_shape.TensorShape([]))
 
+  @test_util.run_deprecated_v1
   def testNormalShapeWithPlaceholders(self):
     mu = array_ops.placeholder(dtype=dtypes.float32)
     sigma = array_ops.placeholder(dtype=dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/distributions/special_math_test.py b/tensorflow/python/kernel_tests/distributions/special_math_test.py
index cc43e12168697c4f5a0cda48896b3d7d3c108ae4..d97fcfa655f2728b04ee0a2eb7ed71ef07ea1fbf 100644
--- a/tensorflow/python/kernel_tests/distributions/special_math_test.py
+++ b/tensorflow/python/kernel_tests/distributions/special_math_test.py
@@ -104,6 +104,7 @@ class NdtriTest(test.TestCase):
     x = special_math.ndtri(p)
     self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
+  @test_util.run_deprecated_v1
   def testNdtriDynamicShape(self):
     """Verifies that ndtri computation is correct."""
     with self.cached_session() as sess:
@@ -213,9 +214,11 @@ class NdtrTest(test.TestCase):
         rtol=error_spec.rtol,
         atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32(self):
     self._test_grid(np.float32, self._grid32, self._error32)
 
+  @test_util.run_deprecated_v1
   def test_float64(self):
     self._test_grid(np.float64, self._grid64, self._error64)
 
@@ -338,10 +341,12 @@ class NdtrGradientTest(test.TestCase):
           rtol=error_spec.rtol,
           atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32(self):
     self._test_grad_accuracy(np.float32, self._grid, self._error32)
     self._test_grad_finite(np.float32)
 
+  @test_util.run_deprecated_v1
   def test_float64(self):
     self._test_grad_accuracy(np.float64, self._grid, self._error64)
     self._test_grad_finite(np.float64)
@@ -362,7 +367,7 @@ class ErfInvTest(test.TestCase):
 
       expected_x = special.erfinv(x)
       x = special_math.erfinv(x)
-      self.assertAllClose(expected_x, x.eval(), atol=0.)
+      self.assertAllClose(expected_x, self.evaluate(x), atol=0.)
 
   def testErfInvIntegerInput(self):
     with self.cached_session():
@@ -418,6 +423,7 @@ class LogCDFLaplaceTest(test.TestCase):
           rtol=error_spec.rtol,
           atol=error_spec.atol)
 
+  @test_util.run_deprecated_v1
   def test_float32_lower_and_mid_segment_scipy_float32_ok(self):
     # Choose values mild enough that we can use scipy in float32, which will
     # allow for a high accuracy match to scipy (since we both use float32).
@@ -427,6 +433,7 @@ class LogCDFLaplaceTest(test.TestCase):
         GridSpec(min=-10, max=self.CUTOFF_FLOAT32_UPPER - 5, shape=[100]),
         ErrorSpec(rtol=5e-4, atol=0))
 
+  @test_util.run_deprecated_v1
   def test_float32_all_segments_with_scipy_float64_ok(self):
     # Choose values outside the range where scipy float32 works.
     # Let scipy use float64.  This means we
@@ -437,6 +444,7 @@ class LogCDFLaplaceTest(test.TestCase):
         GridSpec(min=-50, max=self.CUTOFF_FLOAT32_UPPER + 5, shape=[100]),
         ErrorSpec(rtol=0.05, atol=0))
 
+  @test_util.run_deprecated_v1
   def test_float32_extreme_values_result_and_gradient_finite_and_nonzero(self):
     with self.cached_session() as sess:
       # On the lower branch, log_cdf_laplace(x) = x, so we know this will be
@@ -448,7 +456,7 @@ class LogCDFLaplaceTest(test.TestCase):
       actual = sm.log_cdf_laplace(grid)
       grad = gradients_impl.gradients(actual, grid)[0]
 
-      actual_, grad_ = sess.run([actual, grad])
+      actual_, grad_ = self.evaluate([actual, grad])
 
       # isfinite checks for NaN and Inf.
       self.assertAllTrue(np.isfinite(actual_))
@@ -456,6 +464,7 @@ class LogCDFLaplaceTest(test.TestCase):
       self.assertFalse(np.any(actual_ == 0))
       self.assertFalse(np.any(grad_ == 0))
 
+  @test_util.run_deprecated_v1
   def test_float64_extreme_values_result_and_gradient_finite_and_nonzero(self):
     with self.cached_session() as sess:
       # On the lower branch, log_cdf_laplace(x) = x, so we know this will be
@@ -467,7 +476,7 @@ class LogCDFLaplaceTest(test.TestCase):
       actual = sm.log_cdf_laplace(grid)
       grad = gradients_impl.gradients(actual, grid)[0]
 
-      actual_, grad_ = sess.run([actual, grad])
+      actual_, grad_ = self.evaluate([actual, grad])
 
       # isfinite checks for NaN and Inf.
       self.assertAllTrue(np.isfinite(actual_))
diff --git a/tensorflow/python/kernel_tests/distributions/util_test.py b/tensorflow/python/kernel_tests/distributions/util_test.py
index f4e651b25bb36cb412f45dcf44bbc2431b8d18c8..030ad601bf4754ebda7b896b14051440adc170d2 100644
--- a/tensorflow/python/kernel_tests/distributions/util_test.py
+++ b/tensorflow/python/kernel_tests/distributions/util_test.py
@@ -59,6 +59,7 @@ def _logit(x):
 
 class AssertCloseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssertIntegerForm(self):
     # This should only be detected as an integer.
     x = array_ops.placeholder(dtypes.float32)
@@ -112,6 +113,7 @@ class MaybeGetStaticTest(test.TestCase):
     self.assertAllClose(
         np.array(2.), du.maybe_get_static_value(x, dtype=np.float64))
 
+  @test_util.run_deprecated_v1
   def testGetStaticPlaceholder(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[1])
     self.assertEqual(None, du.maybe_get_static_value(x))
@@ -235,6 +237,7 @@ class GetLogitsAndProbsTest(test.TestCase):
         probs=p4, multidimensional=True, validate_args=False)
     self.evaluate(prob)
 
+  @test_util.run_deprecated_v1
   def testProbsMultidimShape(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -249,6 +252,7 @@ class GetLogitsAndProbsTest(test.TestCase):
             probs=p, multidimensional=True, validate_args=True)
         prob.eval(feed_dict={p: np.ones([int(2**11+1)])})
 
+  @test_util.run_deprecated_v1
   def testLogitsMultidimShape(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -266,6 +270,7 @@ class GetLogitsAndProbsTest(test.TestCase):
 
 class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTooSmall(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -280,6 +285,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
             param)
         checked_param.eval(feed_dict={param: np.ones([1])})
 
+  @test_util.run_deprecated_v1
   def testTooLarge(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -305,6 +311,7 @@ class EmbedCheckCategoricalEventShapeTest(test.TestCase):
 
 class EmbedCheckIntegerCastingClosedTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsNonnegative(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements must be non-negative"):
@@ -313,6 +320,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, -1], dtype=np.float16)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssersIntegerForm(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements must be int16-equivalent."):
@@ -321,6 +329,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, 1.5], dtype=np.float16)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsLargestPossibleInteger(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements cannot exceed 32767."):
@@ -329,6 +338,7 @@ class EmbedCheckIntegerCastingClosedTest(test.TestCase):
             x, target_dtype=dtypes.int16)
         x_checked.eval(feed_dict={x: np.array([1, 2**15], dtype=np.int32)})
 
+  @test_util.run_deprecated_v1
   def testCorrectlyAssertsSmallestPossibleInteger(self):
     with self.cached_session():
       with self.assertRaisesOpError("Elements cannot be smaller than 0."):
@@ -369,6 +379,7 @@ class LogCombinationsTest(test.TestCase):
 
 class DynamicShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSameDynamicShape(self):
     with self.cached_session():
       scalar = constant_op.constant(2.0)
@@ -493,6 +504,7 @@ class RotateTransposeTest(test.TestCase):
             self._np_rotate_transpose(x, shift), self.evaluate(y))
         self.assertAllEqual(np.roll(x.shape, shift), y.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testRollDynamic(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtypes.float32)
@@ -511,6 +523,7 @@ class RotateTransposeTest(test.TestCase):
 
 class PickVectorTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCorrectlyPicksVector(self):
     with self.cached_session():
       x = np.arange(10, 12)
@@ -529,36 +542,42 @@ class PickVectorTest(test.TestCase):
 
 class PreferStaticRankTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(3, rank)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(1, rank)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     rank = du.prefer_static_rank(x)
     self.assertIsInstance(rank, np.ndarray)
     self.assertEqual(0, rank)
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     rank = du.prefer_static_rank(x)
     with self.cached_session():
       self.assertAllEqual(2, rank.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     rank = du.prefer_static_rank(x)
     with self.cached_session():
       self.assertAllEqual(1, rank.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicRankEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     rank = du.prefer_static_rank(x)
@@ -568,36 +587,42 @@ class PreferStaticRankTest(test.TestCase):
 
 class PreferStaticShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([2, 3, 4]), shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([0]), shape)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     shape = du.prefer_static_shape(x)
     self.assertIsInstance(shape, np.ndarray)
     self.assertAllEqual(np.array([]), shape)
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     shape = du.prefer_static_shape(x)
     with self.cached_session():
       self.assertAllEqual((2, 3), shape.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     shape = du.prefer_static_shape(x)
     with self.cached_session():
       self.assertAllEqual(np.array([0]), shape.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicShapeEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     shape = du.prefer_static_shape(x)
@@ -607,24 +632,28 @@ class PreferStaticShapeTest(test.TestCase):
 
 class PreferStaticValueTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNonEmptyConstantTensor(self):
     x = array_ops.zeros((2, 3, 4))
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.zeros((2, 3, 4)), value)
 
+  @test_util.run_deprecated_v1
   def testEmptyConstantTensor(self):
     x = constant_op.constant([])
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.array([]), value)
 
+  @test_util.run_deprecated_v1
   def testScalarTensor(self):
     x = constant_op.constant(1.)
     value = du.prefer_static_value(x)
     self.assertIsInstance(value, np.ndarray)
     self.assertAllEqual(np.array(1.), value)
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingNonEmpty(self):
     x = array_ops.placeholder(np.float64, shape=None)
     value = du.prefer_static_value(x)
@@ -632,12 +661,14 @@ class PreferStaticValueTest(test.TestCase):
       self.assertAllEqual(np.zeros((2, 3)),
                           value.eval(feed_dict={x: np.zeros((2, 3))}))
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingEmpty(self):
     x = array_ops.placeholder(np.int32, shape=None)
     value = du.prefer_static_value(x)
     with self.cached_session():
       self.assertAllEqual(np.array([]), value.eval(feed_dict={x: []}))
 
+  @test_util.run_deprecated_v1
   def testDynamicValueEndsUpBeingScalar(self):
     x = array_ops.placeholder(np.int32, shape=None)
     value = du.prefer_static_value(x)
@@ -698,43 +729,55 @@ class FillTriangularTest(test.TestCase):
     self.assertAllClose(expected, actual_, rtol=1e-8, atol=1e-9)
     self.assertAllClose(x_, grad_actual_, rtol=1e-8, atol=1e-9)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakes1x1TriLower(self):
     self._run_test(self._rng.randn(3, int(1*2/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesNoBatchTriLower(self):
     self._run_test(self._rng.randn(int(4*5/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriLower(self):
     self._run_test(self._rng.randn(2, 3, int(3*4/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriLowerUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(3*4/2)), use_deferred_shape=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriLowerUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), use_deferred_shape=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriLower(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)))
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakes1x1TriUpper(self):
     self._run_test(self._rng.randn(3, int(1*2/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesNoBatchTriUpper(self):
     self._run_test(self._rng.randn(int(4*5/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriUpper(self):
     self._run_test(self._rng.randn(2, 2, int(3*4/2)), upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatchTriUpperUnknownShape(self):
     self._run_test(self._rng.randn(2, 2, int(3*4/2)),
                    use_deferred_shape=True,
                    upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriUpperUnknownShape(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)),
                    use_deferred_shape=True,
                    upper=True)
 
+  @test_util.run_deprecated_v1
   def testCorrectlyMakesBatch7x7TriUpper(self):
     self._run_test(self._rng.randn(2, 3, int(7*8/2)), upper=True)
 
@@ -773,6 +816,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       m = np.squeeze(m, axis=axis)
     return m + np.log(sgn * sum_), sgn
 
+  @test_util.run_deprecated_v1
   def testNoWeights(self):
     logx_ = np.array([[0., -1, 1000.],
                       [0, 1, -1000.],
@@ -805,7 +849,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       w = constant_op.constant(w_)
       actual, actual_sgn = du.reduce_weighted_logsumexp(
           logx, w, axis=-1, return_sign=True)
-      [actual_, actual_sgn_] = sess.run([actual, actual_sgn])
+      [actual_, actual_sgn_] = self.evaluate([actual, actual_sgn])
     self.assertAllEqual(expected, actual_)
     self.assertAllEqual([-1., -1, 1], actual_sgn_)
 
@@ -823,7 +867,7 @@ class ReduceWeightedLogSumExp(test.TestCase):
       w = constant_op.constant(w_)
       actual, actual_sgn = du.reduce_weighted_logsumexp(
           logx, w, axis=-1, return_sign=True, keep_dims=True)
-      [actual_, actual_sgn_] = sess.run([actual, actual_sgn])
+      [actual_, actual_sgn_] = self.evaluate([actual, actual_sgn])
     self.assertAllEqual(expected, actual_)
     self.assertAllEqual([[-1.], [-1], [1]], actual_sgn_)
 
@@ -903,6 +947,7 @@ class SoftplusTest(test.TestCase):
     self.assertAllEqual(np.ones_like(tf_softplus_inverse).astype(np.bool),
                         np.isfinite(tf_softplus_inverse))
 
+  @test_util.run_deprecated_v1
   def testNumbers(self):
     for t in [np.float16, np.float32, np.float64]:
       lower = {np.float16: -15, np.float32: -50, np.float64: -50}.get(t, -100)
@@ -933,6 +978,7 @@ class SoftplusTest(test.TestCase):
           ],
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -949,6 +995,7 @@ class SoftplusTest(test.TestCase):
     tf_logging.vlog(2, "softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testInverseSoftplusGradientNeverNan(self):
     with self.cached_session():
       # Note that this range contains both zero and inf.
@@ -958,6 +1005,7 @@ class SoftplusTest(test.TestCase):
       # Equivalent to `assertAllFalse` (if it existed).
       self.assertAllEqual(np.zeros_like(grads).astype(np.bool), np.isnan(grads))
 
+  @test_util.run_deprecated_v1
   def testInverseSoftplusGradientFinite(self):
     with self.cached_session():
       # This range of x is all finite, and so is 1 / x.  So the
diff --git a/tensorflow/python/kernel_tests/division_future_test.py b/tensorflow/python/kernel_tests/division_future_test.py
index e477bdc73b90eb104011f476fcfa9b4cf39a628a..85c85809d3f96a22b7994bedef34b10b700a2815 100644
--- a/tensorflow/python/kernel_tests/division_future_test.py
+++ b/tensorflow/python/kernel_tests/division_future_test.py
@@ -65,7 +65,7 @@ class DivisionTestCase(test.TestCase):
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
       # Do only one sess.run for speed
-      for f, (x, y) in zip(checks, sess.run(tensors)):
+      for f, (x, y) in zip(checks, self.evaluate(tensors)):
         f(x, y)
 
 
diff --git a/tensorflow/python/kernel_tests/division_past_test.py b/tensorflow/python/kernel_tests/division_past_test.py
index 63951b5b382947ff17029bc7b7062cf5808f220e..38bb18631ab7be4e191ceca801e8d68b0c3bdd61 100644
--- a/tensorflow/python/kernel_tests/division_past_test.py
+++ b/tensorflow/python/kernel_tests/division_past_test.py
@@ -64,7 +64,7 @@ class DivisionTestCase(test.TestCase):
                 tf_floordiv = tf_x // tf_y
                 check(floordiv, tf_floordiv)
       # Do only one sess.run for speed
-      for f, (x, y) in zip(checks, sess.run(tensors)):
+      for f, (x, y) in zip(checks, self.evaluate(tensors)):
         f(x, y)
 
 
diff --git a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
index c6558762809d22ae11be8f485232a9a2972ba1a4..6aa757e293ef69040266d194aef85370b86e5b2b 100644
--- a/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
+++ b/tensorflow/python/kernel_tests/draw_bounding_box_op_test.py
@@ -87,7 +87,7 @@ class DrawBoundingBoxOpTest(test.TestCase):
       image = array_ops.expand_dims(image, 0)
       image = image_ops.draw_bounding_boxes(image, bboxes)
       with self.cached_session(use_gpu=False) as sess:
-        op_drawn_image = np.squeeze(sess.run(image), 0)
+        op_drawn_image = np.squeeze(self.evaluate(image), 0)
         self.assertAllEqual(test_drawn_image, op_drawn_image)
 
   def testDrawBoundingBoxRGBColorCycling(self):
diff --git a/tensorflow/python/kernel_tests/duplicate_op_test.py b/tensorflow/python/kernel_tests/duplicate_op_test.py
index 654267a58252060db890891cc4a0d7f8d0b2afdd..fef3127d4a84e5be59bb5a8e50dd60944fe57606 100644
--- a/tensorflow/python/kernel_tests/duplicate_op_test.py
+++ b/tensorflow/python/kernel_tests/duplicate_op_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import load_library
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
@@ -27,6 +28,7 @@ from tensorflow.python.platform import test
 
 class DuplicateOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     library_filename = os.path.join(resource_loader.get_data_files_path(),
                                     'duplicate_op.so')
diff --git a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
index 07da855a0174d7b217ac383758e358922b7e18e4..8c448194076ba72cc5f8efb66dbfd3d75bd7c7ef 100644
--- a/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_partition_op_test.py
@@ -25,6 +25,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import gradients_impl
@@ -34,13 +35,14 @@ from tensorflow.python.platform import test
 
 class DynamicPartitionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimpleOneDimensional(self):
     with self.session(use_gpu=True) as sess:
       data = constant_op.constant([0, 13, 2, 39, 4, 17], dtype=dtypes.float32)
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([0, 13], partition_vals[0])
@@ -54,6 +56,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual([None], partitions[2].get_shape().as_list())
     self.assertEqual([None], partitions[3].get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testSimpleTwoDimensional(self):
     with self.session(use_gpu=True) as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
@@ -62,7 +65,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant([0, 0, 2, 3, 2, 1])
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([[0, 1, 2], [3, 4, 5]], partition_vals[0])
@@ -87,7 +90,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual(part1, partition_vals[0])
@@ -109,7 +112,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=num_partitions)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(num_partitions, len(partition_vals))
     for i in range(num_partitions):
@@ -125,7 +128,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([3 + 4j, 7 + 8j], partition_vals[0])
@@ -138,7 +141,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = 3
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual(np.array([], dtype=np.float64).reshape(-1, 4),
@@ -151,6 +154,7 @@ class DynamicPartitionTest(test.TestCase):
                                  dtype=np.float64).reshape(-1, 4),
                         partition_vals[3])
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     np.random.seed(7)
     with self.session(use_gpu=True) as sess:
@@ -164,7 +168,7 @@ class DynamicPartitionTest(test.TestCase):
             outputs = data_flow_ops.dynamic_partition(
                 data_t, partitions_t, num_partitions=n)
             self.assertEqual(n, len(outputs))
-            outputs_val = sess.run(outputs)
+            outputs_val = self.evaluate(outputs)
             for i, output in enumerate(outputs_val):
               self.assertAllEqual(output, data[partitions == i])
 
@@ -183,7 +187,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(4, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -199,7 +203,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=3)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(3, len(partition_vals))
     self.assertAllEqual([[]], partition_vals[0])
@@ -215,7 +219,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([], partition_vals[0])
@@ -236,7 +240,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=2)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(2, len(partition_vals))
     self.assertAllEqual([6], partition_vals[0])
@@ -257,7 +261,7 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=5)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(5, len(partition_vals))
     self.assertAllEqual([5], partition_vals[0])
@@ -281,12 +285,13 @@ class DynamicPartitionTest(test.TestCase):
       indices = constant_op.constant(indices_list, dtype=dtypes.int32)
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=40)
-      partition_vals = sess.run(partitions)
+      partition_vals = self.evaluate(partitions)
 
     self.assertEqual(40, len(partition_vals))
     for i in range(40):
       self.assertAllEqual([], partition_vals[i])
 
+  @test_util.run_deprecated_v1
   def testErrorIndexOutOfRange(self):
     with self.cached_session() as sess:
       data = constant_op.constant([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11],
@@ -295,16 +300,18 @@ class DynamicPartitionTest(test.TestCase):
       partitions = data_flow_ops.dynamic_partition(
           data, indices, num_partitions=4)
       with self.assertRaisesOpError(r"partitions\[2\] = 99 is not in \[0, 4\)"):
-        sess.run(partitions)
+        self.evaluate(partitions)
 
+  @test_util.run_deprecated_v1
   def testScalarIndexOutOfRange(self):
     with self.cached_session() as sess:
       bad = 17
       data = np.zeros(5)
       partitions = data_flow_ops.dynamic_partition(data, bad, num_partitions=7)
       with self.assertRaisesOpError(r"partitions = 17 is not in \[0, 7\)"):
-        sess.run(partitions)
+        self.evaluate(partitions)
 
+  @test_util.run_deprecated_v1
   def testHigherRankIndexOutOfRange(self):
     with self.cached_session() as sess:
       shape = (2, 3)
@@ -320,6 +327,7 @@ class DynamicPartitionTest(test.TestCase):
               r"partitions\[%d,%d\] = 17 is not in \[0, 7\)" % (i, j)):
             sess.run(partitions, feed_dict={indices: bad})
 
+  @test_util.run_deprecated_v1
   def testErrorWrongDimsIndices(self):
     data = constant_op.constant([[0], [1], [2]])
     indices = constant_op.constant([[0], [0]])
@@ -335,7 +343,7 @@ class DynamicPartitionTest(test.TestCase):
     self.assertEqual(len(inds), x.shape[0])
     partitioned = data_flow_ops.dynamic_partition(x, inds, 16)
     with self.cached_session() as sess:
-      res = sess.run(partitioned)
+      res = self.evaluate(partitioned)
     self.assertEqual(res[-1].shape[0], 192)
 
 
diff --git a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
index c3f67d29aa4cf3fd12d9c4b8c990b065aaa401ab..4f338880aa3564c4bf37102c7d01c8768ef07d58 100644
--- a/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
+++ b/tensorflow/python/kernel_tests/dynamic_stitch_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -36,18 +37,19 @@ class DynamicStitchTestBase(object):
     self.stitch_op = stitch_op
 
   def testScalar(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40), constant_op.constant(60)]
       for step in -1, 1:
         stitched_t = self.stitch_op(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40, 60][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceForScalarWithNonConstantIndices(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           array_ops.placeholder(dtype=dtypes.int32),
           constant_op.constant(1)
@@ -61,7 +63,7 @@ class DynamicStitchTestBase(object):
         self.assertEqual([None], stitched_t.get_shape().as_list())
 
   def testSimpleOneDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       # Test various datatypes in the simple case to ensure that the op was
       # registered under those types.
       dtypes_to_test = [
@@ -78,23 +80,23 @@ class DynamicStitchTestBase(object):
                 constant_op.constant([10, 60, 20, 30, 50]), dtype=dtype)
         ]
         stitched_t = self.stitch_op(indices, data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testOneListOneDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant([1, 6, 2, 3, 5, 0, 4, 7])]
       data = [constant_op.constant([10, 60, 20, 30, 50, 0, 40, 70])]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([0, 10, 20, 30, 40, 50, 60, 70], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8], stitched_t.get_shape().as_list())
 
   def testSimpleTwoDimensional(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -106,14 +108,14 @@ class DynamicStitchTestBase(object):
           constant_op.constant([[20, 21], [30, 31], [50, 51]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
   def testZeroSizeTensor(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [
           constant_op.constant([0, 4, 7]),
           constant_op.constant([1, 6]),
@@ -127,12 +129,13 @@ class DynamicStitchTestBase(object):
           array_ops.zeros([0, 2], dtype=dtypes.int32)
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       self.assertAllEqual([[0, 1], [10, 11], [20, 21], [30, 31], [40, 41],
                            [50, 51], [60, 61], [70, 71]], stitched_val)
       # Dimension 0 is max(flatten(indices))+1.
       self.assertEqual([8, 2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     with self.session(use_gpu=True) as sess:
       indices = [
@@ -147,7 +150,7 @@ class DynamicStitchTestBase(object):
                                 [[1., 2.], [31., 32.]]])
       ]
       stitched_t = self.stitch_op(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10. * np.arange(7)[:, None] + [1., 2.]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -157,8 +160,9 @@ class DynamicStitchTestBase(object):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7. * datum.eval(), grad)
+        self.assertAllEqual(7. * self.evaluate(datum), grad)
 
+  @test_util.run_deprecated_v1
   def testErrorIndicesMultiDimensional(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -171,6 +175,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataNumDimsMismatch(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -183,6 +188,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataDimSizeMismatch(self):
     indices = [
         constant_op.constant([0, 4, 5]),
@@ -195,6 +201,7 @@ class DynamicStitchTestBase(object):
     with self.assertRaises(ValueError):
       self.stitch_op(indices, data)
 
+  @test_util.run_deprecated_v1
   def testErrorDataAndIndicesSizeMismatch(self):
     indices = [
         constant_op.constant([0, 4, 7]),
@@ -222,16 +229,17 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
     DynamicStitchTestBase.__init__(self, data_flow_ops.parallel_dynamic_stitch)
 
   def testScalar(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       indices = [constant_op.constant(0), constant_op.constant(1)]
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     with self.session(use_gpu=True) as sess:
       indices = [
@@ -246,7 +254,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -256,7 +264,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
   # GPU version unit tests
   def testScalarGPU(self):
@@ -265,11 +273,12 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
       data = [constant_op.constant(40.0), constant_op.constant(60.0)]
       for step in -1, 1:
         stitched_t = data_flow_ops.dynamic_stitch(indices[::step], data)
-        stitched_val = stitched_t.eval()
+        stitched_val = self.evaluate(stitched_t)
         self.assertAllEqual([40.0, 60.0][::step], stitched_val)
         # Dimension 0 is max(flatten(indices))+1.
         self.assertEqual([2], stitched_t.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testHigherRankGPU(self):
     with self.cached_session() as sess:
       indices = [
@@ -284,7 +293,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
               [[[51, 52], [21, 22]], [[1, 2], [31, 32]]], dtype=dtypes.float32)
       ]
       stitched_t = data_flow_ops.dynamic_stitch(indices, data)
-      stitched_val = stitched_t.eval()
+      stitched_val = self.evaluate(stitched_t)
       correct = 10 * np.arange(7)[:, None] + [1.0, 2.0]
       self.assertAllEqual(correct, stitched_val)
       self.assertEqual([7, 2], stitched_t.get_shape().as_list())
@@ -294,7 +303,7 @@ class ParallelDynamicStitchTest(DynamicStitchTestBase, test.TestCase):
                                        stitched_grad)
       self.assertEqual(grads[:3], [None] * 3)  # Indices have no gradients
       for datum, grad in zip(data, sess.run(grads[3:])):
-        self.assertAllEqual(7.0 * datum.eval(), grad)
+        self.assertAllEqual(7.0 * self.evaluate(datum), grad)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/edit_distance_op_test.py b/tensorflow/python/kernel_tests/edit_distance_op_test.py
index dab5eee7f508bbff3af185299e508536e1f23908..4a06ab770aaa072c8858e0f527f21dcbc10bbbdd 100644
--- a/tensorflow/python/kernel_tests/edit_distance_op_test.py
+++ b/tensorflow/python/kernel_tests/edit_distance_op_test.py
@@ -49,11 +49,11 @@ class EditDistanceTest(test.TestCase):
 
     if expected_err_re is None:
       self.assertEqual(edit_distance.get_shape(), expected_shape)
-      output = edit_distance.eval()
+      output = self.evaluate(edit_distance)
       self.assertAllClose(output, expected_output)
     else:
       with self.assertRaisesOpError(expected_err_re):
-        edit_distance.eval()
+        self.evaluate(edit_distance)
 
   def _testEditDistance(self,
                         hypothesis,
diff --git a/tensorflow/python/kernel_tests/embedding_ops_test.py b/tensorflow/python/kernel_tests/embedding_ops_test.py
index 008d6fbf577ac86553a5c6e58769c4f60d178334..3ea2071e13a24fb804924081add2f2b41f314716 100644
--- a/tensorflow/python/kernel_tests/embedding_ops_test.py
+++ b/tensorflow/python/kernel_tests/embedding_ops_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import embedding_ops
@@ -76,7 +77,7 @@ class ScatterAddSubTest(test.TestCase):
       # p = init
       variables.global_variables_initializer().run()
       # p += vals
-      result = p2.eval()
+      result = self.evaluate(p2)
     # Compute the expected 'p' using numpy operations.
     for i, ind in enumerate(indices):
       if scatter_op == state_ops.scatter_add:
@@ -87,16 +88,19 @@ class ScatterAddSubTest(test.TestCase):
             vals_shape[0], -1)[i, :])
     self.assertTrue(all((p_init == result).ravel()))
 
+  @test_util.run_deprecated_v1
   def testNoRepetitions(self):
     self._TestCase([2, 2], [1])
     self._TestCase([4, 4, 4], [2, 0])
     self._TestCase([43, 20, 10, 10], [42, 5, 6, 1, 3, 5, 7, 9])
 
+  @test_util.run_deprecated_v1
   def testWithRepetitions(self):
     self._TestCase([2, 2], [1, 1])
     self._TestCase([5, 3, 9, 5], [2, 0, 4, 1, 3, 1, 4, 0, 4, 3])
     self._TestCase([32, 4, 4], [31] * 8)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     # Random shapes of rank 4, random indices
     for _ in range(5):
@@ -104,6 +108,7 @@ class ScatterAddSubTest(test.TestCase):
       indices = np.random.randint(shape[0], size=2 * shape[0])
       self._TestCase(_AsLong(list(shape)), list(indices))
 
+  @test_util.run_deprecated_v1
   def testSubRandom(self):
     # Random shapes of rank 4, random indices
     for _ in range(5):
@@ -111,6 +116,7 @@ class ScatterAddSubTest(test.TestCase):
       indices = np.random.randint(shape[0], size=2 * shape[0])
       self._TestCase(_AsLong(list(shape)), list(indices), state_ops.scatter_sub)
 
+  @test_util.run_deprecated_v1
   def testWrongShape(self):
     # Indices and values mismatch.
     var = variables.Variable(
@@ -241,6 +247,7 @@ class EmbeddingLookupTest(test.TestCase):
   # both the ids are in the first shard, one of the resulting lookup
   # vector is going to be empty. The subsequent DivOp fails because of that.
   # TODO(keveman): Disabling the test until the underlying problem is fixed.
+  @test_util.run_deprecated_v1
   def testSimpleSharded(self):
     with self.cached_session():
       num_shards = 2
@@ -257,6 +264,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testMaxNorm(self):
     with self.cached_session():
       embeddings = constant_op.constant([[2.0]])
@@ -267,6 +275,7 @@ class EmbeddingLookupTest(test.TestCase):
 
       self.assertAllEqual(embedding.eval(), [[1.0]])
 
+  @test_util.run_deprecated_v1
   def testMaxNormNontrivial(self):
     with self.cached_session():
       embeddings = constant_op.constant([[2.0, 4.0], [3.0, 1.0]])
@@ -278,8 +287,9 @@ class EmbeddingLookupTest(test.TestCase):
       norms = math_ops.sqrt(
           math_ops.reduce_sum(embeddings * embeddings, axis=1))
       normalized = embeddings / array_ops.stack([norms, norms], axis=1)
-      self.assertAllEqual(embedding.eval(), 2 * normalized.eval())
+      self.assertAllEqual(embedding.eval(), 2 * self.evaluate(normalized))
 
+  @test_util.run_deprecated_v1
   def testSimpleShardedPartitionedVariable(self):
     with self.cached_session() as sess:
       num_shards = 2
@@ -294,7 +304,7 @@ class EmbeddingLookupTest(test.TestCase):
       variables.global_variables_initializer().run()
       params_values = [params[p_i.name] for p_i in p]
       # Test that the PartitionedVariable components equal the list in p
-      p_var_val = sess.run(list(p_variable))
+      p_var_val = self.evaluate(list(p_variable))
       # Actual test
       tf_result = embedding.eval(feed_dict=feed_dict)
     np_result, _, _ = _EmbeddingResult(params, id_vals, num_shards, vocab_size)
@@ -302,6 +312,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testSimpleShardedPartitionedResourceVariable(self):
     with self.cached_session() as sess:
       num_shards = 2
@@ -316,15 +327,16 @@ class EmbeddingLookupTest(test.TestCase):
       variables.global_variables_initializer().run()
       params_values = [params[p_i.name] for p_i in p]
       # Test that the PartitionedVariable components equal the list in p
-      p_var_val = sess.run(list(p_variable))
+      p_var_val = self.evaluate(list(p_variable))
       # Actual test
       print(ops.get_default_graph().as_graph_def())
-      tf_result = embedding.eval()
+      tf_result = self.evaluate(embedding)
     np_result, _, _ = _EmbeddingResult(params, id_vals, num_shards, vocab_size)
     self.assertAllEqual(params_values, p_var_val)
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedModPartitioningInt32Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -347,6 +359,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedModPartitioningInt64Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -369,6 +382,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt32Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -393,6 +407,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt32IdsPartitionedVariable(self):
     with self.cached_session():
       num_shards = 5
@@ -418,6 +433,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningInt64Ids(self):
     with self.cached_session():
       num_shards = 5
@@ -442,6 +458,7 @@ class EmbeddingLookupTest(test.TestCase):
     self.assertAllEqual(np_result, tf_result)
     self.assertShapeEqual(np_result, embedding)
 
+  @test_util.run_deprecated_v1
   def testShardedDivPartitioningUnknownParamShape(self):
     with self.cached_session():
       num_shards = 5
@@ -468,6 +485,7 @@ class EmbeddingLookupTest(test.TestCase):
         params, id_vals, num_shards, vocab_size, partition_strategy="div")
     self.assertAllEqual(np_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookup(self):
     vocab_size = 9
     num_ids = 10
@@ -488,6 +506,7 @@ class EmbeddingLookupTest(test.TestCase):
               x, x_shape, y, y_shape, x_init_value=x_init_value)
         self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookupWithComputedParams(self):
     vocab_size = 9
     num_ids = 5
@@ -526,6 +545,7 @@ class EmbeddingLookupTest(test.TestCase):
         ids = constant_op.constant([0, 1, 1, 17], dtype=dtypes.int32)
       embedding_ops.embedding_lookup(p, ids)
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     np.random.seed(8)
     with self.cached_session():
@@ -546,6 +566,7 @@ class EmbeddingLookupTest(test.TestCase):
             sharded = embedding_ops.embedding_lookup(split_params, ids).eval()
             self.assertAllEqual(simple, sharded)
 
+  @test_util.run_deprecated_v1
   def testHigherRankMaxNorm(self):
     np.random.seed(8)
     with self.cached_session():
@@ -574,6 +595,7 @@ class EmbeddingLookupTest(test.TestCase):
                 split_params, ids, max_norm=1.0).eval()
             self.assertAllEqual(simple, sharded)
 
+  @test_util.run_deprecated_v1
   def testTransform(self):
     # This tests all combinations of:
     #   - ids rank 0, 1, >1
@@ -648,6 +670,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
       index += num_val
     return grouped_vals
 
+  @test_util.run_deprecated_v1
   def testEmbeddingLookupSparse(self):
     vocab_size = 13
     batch_size = 10
@@ -706,6 +729,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
         atol = rtol
         self.assertAllClose(np_embedding_sum, tf_embedding_sum, rtol, atol)
 
+  @test_util.run_deprecated_v1
   def testGradientsEmbeddingLookupSparse(self):
     vocab_size = 12
     batch_size = 4
@@ -733,6 +757,7 @@ class EmbeddingLookupSparseTest(test.TestCase):
             x, x_shape, y, y_shape, x_init_value=x_init_value)
       self.assertLess(err, 1e-5 if dtype == dtypes.float64 else 2e-3)
 
+  @test_util.run_deprecated_v1
   def testIncompatibleShapes(self):
     with self.cached_session():
       x, _, _ = _EmbeddingParams(1, 10, dtype=dtypes.float32)
@@ -758,11 +783,13 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
     assert num_shards > 0
     assert num_shards <= vocab_size
 
-    embedding_weights = partitioned_variables.create_partitioned_variables(
+    initializer = init_ops.truncated_normal_initializer(
+        mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32)
+    embedding_weights = list(variable_scope.get_variable(
+        name="embedding_weights",
         shape=[vocab_size, embed_dim],
-        slicing=[num_shards, 1],
-        initializer=init_ops.truncated_normal_initializer(
-            mean=0.0, stddev=1.0 / math.sqrt(vocab_size), dtype=dtypes.float32))
+        partitioner=partitioned_variables.fixed_size_partitioner(num_shards),
+        initializer=initializer))
     for w in embedding_weights:
       w.initializer.run()
     embedding_weights = [w.eval() for w in embedding_weights]
@@ -818,26 +845,31 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
 
     return sparse_ids, sparse_weights
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_return_zero_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
           [(1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) /
            3.0, [0] * 4, [0] * 4, embedding_weights[0][2], [0] * 4])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_return_special_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -845,13 +877,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
            3.0, embedding_weights[0][3], embedding_weights[0][3],
            embedding_weights[0][2], embedding_weights[0][3]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_no_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -859,13 +893,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
            [0] * 4, embedding_weights[0][2], (
                embedding_weights[0][0] + embedding_weights[0][1]) / 2.0])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_partitioned(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_2d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result,
@@ -873,6 +909,7 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                            [0] * 4, [0] * 4, embedding_weights[2],
                            (embedding_weights[0] + embedding_weights[1]) / 2.0])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_partitioned_inconsistent_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
@@ -888,26 +925,31 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
       self.assertRaises(ValueError, embedding_ops.safe_embedding_lookup_sparse,
                         embedding_weights, sparse_ids, sparse_weights)
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_return_zero_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights).eval())
 
       self.assertAllClose(embedding_lookup_result, [[
           (1.0 * embedding_weights[0][0] + 2.0 * embedding_weights[0][1]) / 3.0,
           [0] * 4, [0] * 4
       ], [embedding_weights[0][2], [0] * 4, [0] * 4]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_return_special_vector(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, sparse_weights = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, sparse_weights, default_id=3).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, sparse_weights,
+              default_id=3).eval())
 
       self.assertAllClose(
           embedding_lookup_result,
@@ -917,13 +959,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
                 embedding_weights[0][3]
             ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_no_weights(self):
     with self.cached_session():
       embedding_weights = self._random_weights()
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       self.assertAllClose(embedding_lookup_result, [[(
           embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4, [
@@ -933,13 +977,15 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
               (embedding_weights[0][0] + embedding_weights[0][1]) / 2.0, [0] * 4
           ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_partitioned(self):
     with self.cached_session():
       embedding_weights = self._random_weights(num_shards=3)
       sparse_ids, _ = self._ids_and_weights_3d()
 
-      embedding_lookup_result = (embedding_ops.safe_embedding_lookup_sparse(
-          embedding_weights, sparse_ids, None).eval())
+      embedding_lookup_result = (
+          embedding_ops.safe_embedding_lookup_sparse_v2(
+              embedding_weights, sparse_ids, None).eval())
 
       embedding_weights = list(itertools.chain(*embedding_weights))
       self.assertAllClose(embedding_lookup_result, [[
@@ -949,6 +995,7 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
           (embedding_weights[0] + embedding_weights[1]) / 2.0, [0] * 4
       ]])
 
+  @test_util.run_deprecated_v1
   def test_safe_embedding_lookup_sparse_3d_partitioned_inconsistent_weights(
       self):
     with self.cached_session():
@@ -968,6 +1015,7 @@ class SafeEmbeddingLookupSparseTest(test.TestCase):
 
 class DynamicStitchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCint32Cpu(self):
     with self.session(use_gpu=False):
       indices = [
@@ -981,6 +1029,7 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testCint32Gpu(self):
     with self.session(use_gpu=True):
       indices = [
@@ -994,6 +1043,7 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testInt32Cpu(self):
     with self.session(use_gpu=False):
       indices = [
@@ -1007,6 +1057,7 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testInt32Gpu(self):
     with self.session(use_gpu=True):
       indices = [
@@ -1020,6 +1071,7 @@ class DynamicStitchOpTest(test.TestCase):
       self.assertAllEqual(
           data_flow_ops.dynamic_stitch(indices, values).eval(), [12, 23, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testSumGradArgs(self):
     with self.session(use_gpu=False):
       indices = [
@@ -1034,6 +1086,7 @@ class DynamicStitchOpTest(test.TestCase):
           data_flow_ops.dynamic_stitch(indices, values).eval(), [2, 3, 1, 1])
 
   # We expect that the values are merged in order.
+  @test_util.run_deprecated_v1
   def testStitchOrder(self):
     with self.cached_session():
       indices = []
@@ -1049,6 +1102,7 @@ class DynamicStitchOpTest(test.TestCase):
 
 class ParallelDynamicStitchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCint32Cpu(self):
     with self.session(use_gpu=False):
       indices = [
@@ -1063,6 +1117,7 @@ class ParallelDynamicStitchOpTest(test.TestCase):
           data_flow_ops.parallel_dynamic_stitch(indices, values).eval(),
           [12, 23, 1, 2, 34, 3, 45])
 
+  @test_util.run_deprecated_v1
   def testInt32Cpu(self):
     with self.session(use_gpu=False):
       indices = [
@@ -1077,6 +1132,7 @@ class ParallelDynamicStitchOpTest(test.TestCase):
           data_flow_ops.parallel_dynamic_stitch(indices, values).eval(),
           [12, 23, 1, 2, 3, 34, 45, 56])
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.session(use_gpu=False):
       indices = [ops.convert_to_tensor([0, 1]), ops.convert_to_tensor([2, 3])]
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
index 7d9d4e517527e457c0da73d4f4b2a8763359a693..7ba2dc6c20951d00994978790a26c17c59233d0a 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_grad_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed as random_seed_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -78,6 +79,7 @@ class ExtractImagePatchesGradTest(test.TestCase):
       },
   ]
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     # Set graph seed for determinism.
     random_seed = 42
@@ -102,6 +104,7 @@ class ExtractImagePatchesGradTest(test.TestCase):
           print('extract_image_patches gradient err: %.4e' % err)
           self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testConstructGradientWithLargeImages(self):
     batch_size = 4
     height = 1024
diff --git a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
index 61436f24cfed348712e3ccfba4fe009932133c12..bb3c0ae80694035dd362f5024ecdddeb0e364bb0 100644
--- a/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_image_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -43,7 +44,7 @@ class ExtractImagePatches(test.TestCase):
     strides = [1] + strides + [1]
     rates = [1] + rates + [1]
 
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_image_patches(
           constant_op.constant(image),
           ksizes=ksizes,
@@ -51,7 +52,7 @@ class ExtractImagePatches(test.TestCase):
           rates=rates,
           padding=padding,
           name="im2col")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   def testKsize1x1Stride1x1Rate1x1(self):
     """Verifies that for 1x1 kernel the output equals the input."""
diff --git a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
index bbb3fef85b4cc7f4423c6c3414607db10732fa0b..88f7df8fbb64512c9ca362ec7c310a5805c9c728 100644
--- a/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
+++ b/tensorflow/python/kernel_tests/extract_volume_patches_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
@@ -45,14 +46,14 @@ class ExtractVolumePatches(test.TestCase):
     ksizes = [1] + ksizes + [1]
     strides = [1] + strides + [1]
 
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       out_tensor = array_ops.extract_volume_patches(
           constant_op.constant(image),
           ksizes=ksizes,
           strides=strides,
           padding=padding,
           name="im2col_3d")
-      self.assertAllClose(patches, out_tensor.eval())
+      self.assertAllClose(patches, self.evaluate(out_tensor))
 
   # pylint: disable=bad-whitespace
   def testKsize1x1x1Stride1x1x1(self):
diff --git a/tensorflow/python/kernel_tests/fifo_queue_test.py b/tensorflow/python/kernel_tests/fifo_queue_test.py
index 8961c4b13c25269671fdc16fc425516d01970892..0579dddb70264199a53c140ab60ad2ddf9b00bb9 100644
--- a/tensorflow/python/kernel_tests/fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/fifo_queue_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import compat
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -159,7 +160,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -191,7 +192,7 @@ class FIFOQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -211,7 +212,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testDequeueHalf(self):
@@ -225,7 +226,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -240,13 +241,13 @@ class FIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -269,7 +270,7 @@ class FIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -288,9 +289,9 @@ class FIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -302,7 +303,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -313,9 +314,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -323,9 +324,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -333,9 +334,9 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -356,7 +357,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -369,8 +370,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -381,8 +382,8 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -399,17 +400,17 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -429,13 +430,13 @@ class FIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
@@ -518,7 +519,7 @@ class FIFOQueueTest(test.TestCase):
                                    r"Expected \[2,3,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -529,7 +530,7 @@ class FIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -552,7 +553,7 @@ class FIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -576,7 +577,7 @@ class FIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -596,11 +597,11 @@ class FIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -632,7 +633,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -663,7 +664,7 @@ class FIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -672,7 +673,7 @@ class FIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -701,10 +702,10 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -728,10 +729,10 @@ class FIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -778,12 +779,12 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -797,11 +798,11 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -821,7 +822,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -842,11 +843,11 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -867,11 +868,11 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -892,8 +893,8 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -913,16 +914,16 @@ class FIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+          self.evaluate(dequeued_t)
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -955,7 +956,7 @@ class FIFOQueueTest(test.TestCase):
 
       def dequeue():
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run([dequeued_a_t, dequeued_b_t])
+          self.evaluate([dequeued_a_t, dequeued_b_t])
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -968,7 +969,7 @@ class FIFOQueueTest(test.TestCase):
       # Test that the elements in the partially-dequeued batch are
       # restored in the correct order.
       for elem_a, elem_b in zip(elems_a, elems_b):
-        val_a, val_b = sess.run([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
+        val_a, val_b = self.evaluate([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
         self.assertEqual(elem_a, val_a)
         self.assertEqual(elem_b, val_b)
       self.assertEqual(0, q.size().eval())
@@ -983,7 +984,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1003,7 +1004,7 @@ class FIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1051,7 +1052,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1059,8 +1060,8 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1074,7 +1075,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1082,10 +1083,10 @@ class FIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1103,7 +1104,7 @@ class FIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1113,18 +1114,18 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1138,7 +1139,7 @@ class FIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1148,17 +1149,17 @@ class FIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1266,19 +1267,19 @@ class FIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1321,7 +1322,7 @@ class FIFOQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1331,14 +1332,14 @@ class FIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1364,7 +1365,7 @@ class FIFOQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1373,7 +1374,7 @@ class FIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
@@ -1405,7 +1406,7 @@ class FIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
@@ -1423,6 +1424,7 @@ class FIFOQueueTest(test.TestCase):
         session.run([a, c])
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueDictTest(test.TestCase):
 
   def testConstructor(self):
@@ -1507,10 +1509,10 @@ class FIFOQueueDictTest(test.TestCase):
       enqueue_op4 = q.enqueue_many({"f": [40.0, 50.0]})
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
-      sess.run(enqueue_op2)
-      sess.run(enqueue_op3)
-      sess.run(enqueue_op4)
+      self.evaluate(enqueue_op)
+      self.evaluate(enqueue_op2)
+      self.evaluate(enqueue_op3)
+      self.evaluate(enqueue_op4)
       f = sess.run(dequeue["f"])
       self.assertEqual(10.0, f)
       f = sess.run(dequeue_2["f"])
@@ -1565,10 +1567,10 @@ class FIFOQueueDictTest(test.TestCase):
       })
       dequeue = q.dequeue()
       dequeue_2 = q.dequeue_many(2)
-      sess.run(enqueue_op)
-      sess.run(enqueue_op2)
-      sess.run(enqueue_op3)
-      sess.run(enqueue_op4)
+      self.evaluate(enqueue_op)
+      self.evaluate(enqueue_op2)
+      self.evaluate(enqueue_op3)
+      self.evaluate(enqueue_op4)
       i, f, s = sess.run([dequeue["i"], dequeue["f"], dequeue["s"]])
       self.assertEqual(123, i)
       self.assertEqual(10.0, f)
@@ -1583,6 +1585,7 @@ class FIFOQueueDictTest(test.TestCase):
       self.assertTrue([compat.as_bytes("dd"), compat.as_bytes("ee")], list(s))
 
 
+@test_util.run_v1_only("b/120545219")
 class FIFOQueueWithTimeoutTest(test.TestCase):
 
   def testDequeueWithTimeout(self):
@@ -1597,7 +1600,7 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
       # until operation_timeout_in_ms.
       with self.assertRaisesRegexp(errors_impl.DeadlineExceededError,
                                    "Timed out waiting for notification"):
-        sess.run(dequeued_t)
+        self.evaluate(dequeued_t)
 
   def testReusableAfterTimeout(self):
     with self.cached_session() as sess:
@@ -1613,10 +1616,11 @@ class FIFOQueueWithTimeoutTest(test.TestCase):
                                    "Timed out waiting for notification"):
         sess.run(dequeued_t, options=config_pb2.RunOptions(timeout_in_ms=10))
 
-      sess.run(enqueue_op)
-      self.assertEqual(37, sess.run(dequeued_t))
+      self.evaluate(enqueue_op)
+      self.assertEqual(37, self.evaluate(dequeued_t))
 
 
+@test_util.run_v1_only("b/120545219")
 class QueueContainerTest(test.TestCase):
 
   def testContainer(self):
diff --git a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
index f89d2062f1e736068a50344234b05aad423a17e7..0d5928aefacf5a395c0f1c61ef997914aca000e8 100644
--- a/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_avg_pool_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -37,7 +38,6 @@ class FractionalAvgTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261000)
   _SEED = 341261001
-  _SEED2 = 341261002
 
   def _AvgPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform average pool along row of a 2-D matrix based on row_seq.
@@ -128,15 +128,13 @@ class FractionalAvgTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual, row_seq, col_seq = sess.run([p, r, c])
+          seed=self._SEED)
+      actual, row_seq, col_seq = self.evaluate([p, r, c])
       expected = self._GetExpectedFractionalAvgPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
       self.assertShapeEqual(expected, p)
@@ -161,15 +159,13 @@ class FractionalAvgTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_avg_pool(
+        p, r, c = nn_ops.fractional_avg_pool_v2(
             rand_mat.astype(np.float32),
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
-        tensor_output, row_seq, col_seq = sess.run([p, r, c])
+            seed=self._SEED)
+        tensor_output, row_seq, col_seq = self.evaluate([p, r, c])
         expected_result = self._GetExpectedFractionalAvgPoolResult(
             rand_mat.astype(np.float32), row_seq, col_seq, overlapping)
         print("row sequence:")
@@ -214,12 +210,6 @@ class FractionalAvgTest(test.TestCase):
 
   def testIntegerTensorInput(self):
     """Test FractionalAvgPool works fine when input tensor is integer type.
-
-    I would have used _ValidateFractionalAvgPoolResult function to automate this
-    process, however, there's rounding issue. It is caused by numpy.mean cast
-    integer input to numpy.float64 for intermediate use. While for
-    fractional_avg_pool, the mean operation is integer division (trucated).  So,
-    for this test case, I will hard code a simple matrix.
     """
     pseudo_random = True
     overlapping = True
@@ -234,29 +224,9 @@ class FractionalAvgTest(test.TestCase):
         [4, 4, 5, 9, 7, 2]
     ])
     # pyformat: enable
-    with self.cached_session() as sess:
-      # Since deterministic = True, seed and seed2 are fixed. Therefore r, and c
-      # are the same each time. We can have an expected result precomputed.
-      # r = [0, 2, 4, 6]
-      # c = [0, 1, 3, 4, 6]
-
-      # pyformat: disable
-      expected = np.array([
-          [6, 5, 3, 5],
-          [5, 5, 4, 5],
-          [5, 4, 7, 5]
-      ]).reshape((1, 3, 4, 1))
-      # pyformat: enable
-      p, unused_r, unused_c = nn_ops.fractional_avg_pool(
-          mat.reshape(tensor_shape), [1, math.sqrt(3), math.sqrt(2), 1],
-          pseudo_random,
-          overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual = sess.run(p)
-      self.assertShapeEqual(expected, p)
-      self.assertAllClose(expected, actual)
+    self._ValidateFractionalAvgPoolResult(mat.reshape(tensor_shape),
+                                          [1, math.sqrt(3), math.sqrt(2), 1],
+                                          pseudo_random, overlapping)
 
   def testDifferentTensorShapes(self):
     """Test different shapes of input tensor.
@@ -312,6 +282,7 @@ class FractionalAvgTest(test.TestCase):
     self._ValidateFractionalAvgPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  @test_util.run_deprecated_v1
   def testDifferentInputTensorShape(self):
     """Runs the operation in one session with different input tensor shapes."""
     with self.cached_session() as sess:
@@ -320,14 +291,12 @@ class FractionalAvgTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_avg_pool(
+      p, r, c = nn_ops.fractional_avg_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -372,7 +341,6 @@ class FractionalAvgPoolGradTest(test.TestCase):
   """
   _PRNG = np.random.RandomState(341261004)
   _SEED = 341261005
-  _SEED2 = 341261006
 
   def _GenerateRandomInputTensor(self, shape):
     num_elements = 1
@@ -398,7 +366,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -407,7 +375,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fap_input_backprop_tensor = gen_nn_ops.fractional_avg_pool_grad(
@@ -416,7 +384,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
@@ -437,7 +405,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.avg_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               num_elements = 1
               for dim_size in output_data.shape:
                 num_elements *= dim_size
@@ -446,7 +414,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               input_backprop_tensor = gen_nn_ops.avg_pool_grad(
                   input_tensor.get_shape(), output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -457,10 +425,11 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fap_input_backprop = fap_input_backprop_tensor.eval()
+              fap_input_backprop = self.evaluate(fap_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fap_input_backprop_tensor)
               self.assertAllClose(input_backprop, fap_input_backprop)
 
+  @test_util.run_deprecated_v1
   def testAllInputOptionsThroughGradientError(self):
     input_shape = (1, 7, 13, 1)
     input_data = self._GenerateRandomInputTensor(input_shape)
@@ -470,15 +439,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to avg_pool_grad.
           error_margin = 1e-4
@@ -491,6 +458,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
               delta=1e-2)
           self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testDifferentTensorShapesThroughGradientError(self):
     pseudo_random = True
     overlapping = True
@@ -503,15 +471,13 @@ class FractionalAvgPoolGradTest(test.TestCase):
             input_data = self._GenerateRandomInputTensor(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to avg_pool_grad.
               error_margin = 1e-4
@@ -524,6 +490,7 @@ class FractionalAvgPoolGradTest(test.TestCase):
                   delta=1e-2)
               self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testLargePoolingRatioThroughGradientError(self):
     input_shape = (1, 17, 23, 1)
     input_data = self._GenerateRandomInputTensor(input_shape)
@@ -534,14 +501,12 @@ class FractionalAvgPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_avg_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to avg_pool_grad.
       error_margin = 1e-4
       gradient_error = gradient_checker.compute_gradient_error(
diff --git a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
index 9b94ca85547590600306bf8aef2caa0f3c3eac8e..fa886cc215a7d814bbc13cb3be0c8712100f81d7 100644
--- a/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
+++ b/tensorflow/python/kernel_tests/fractional_max_pool_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -37,7 +38,6 @@ class FractionalMaxPoolTest(test.TestCase):
   # Random number generate with seed.
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _MaxPoolAlongRows(self, input_matrix, row_seq, overlapping):
     """Perform max pool along row of a 2-D matrix based on row_seq.
@@ -128,15 +128,13 @@ class FractionalMaxPoolTest(test.TestCase):
       None
     """
     with self.cached_session() as sess:
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
-      actual, row_seq, col_seq = sess.run([p, r, c])
+          seed=self._SEED)
+      actual, row_seq, col_seq = self.evaluate([p, r, c])
       expected = self._GetExpectedFractionalMaxPoolResult(input_tensor, row_seq,
                                                           col_seq, overlapping)
       self.assertShapeEqual(expected, p)
@@ -161,15 +159,13 @@ class FractionalMaxPoolTest(test.TestCase):
       rand_mat = self._PRNG.randint(10, size=tensor_shape)
       pooling_ratio = [1, math.sqrt(2), math.sqrt(2), 1]
       with self.cached_session() as sess:
-        p, r, c = nn_ops.fractional_max_pool(
+        p, r, c = nn_ops.fractional_max_pool_v2(
             rand_mat,
             pooling_ratio,
             pseudo_random,
             overlapping,
-            deterministic=True,
-            seed=self._SEED,
-            seed2=self._SEED2)
-        tensor_output, row_seq, col_seq = sess.run([p, r, c])
+            seed=self._SEED)
+        tensor_output, row_seq, col_seq = self.evaluate([p, r, c])
         expected_result = self._GetExpectedFractionalMaxPoolResult(rand_mat,
                                                                    row_seq,
                                                                    col_seq,
@@ -283,6 +279,7 @@ class FractionalMaxPoolTest(test.TestCase):
     self._ValidateFractionalMaxPoolResult(rand_mat, [1, 2, 2, 1], pseudo_random,
                                           overlapping)
 
+  @test_util.run_deprecated_v1
   def testDifferentInputTensorShape(self):
     """Runs the operation in one session with different input tensor shapes."""
     with self.cached_session() as sess:
@@ -291,14 +288,12 @@ class FractionalMaxPoolTest(test.TestCase):
       pooling_ratio = [1, 1.5, 1.5, 1]
       pseudo_random = False
       overlapping = False
-      p, r, c = nn_ops.fractional_max_pool(
+      p, r, c = nn_ops.fractional_max_pool_v2(
           input_holder,
           pooling_ratio,
           pseudo_random,
           overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # First run.
       input_a = np.zeros([3, 32, 32, 3])
       actual, row_seq, col_seq = sess.run([p, r, c], {input_holder: input_a})
@@ -344,7 +339,6 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
   _PRNG = np.random.RandomState(341261)
   _SEED = 123456
-  _SEED2 = 654321
 
   def _GenerateUniqueRandomInputTensor(self, shape):
     """Generate 'unqiue' random input tensor.
@@ -382,12 +376,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows + 1, row_window_size))
               col_seq = list(range(0, num_cols + 1, col_window_size))
               fmp_input_backprop_tensor = gen_nn_ops.fractional_max_pool_grad(
@@ -397,7 +391,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=False)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
@@ -417,12 +411,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
               padding = "VALID"
               output_tensor = nn_ops.max_pool(input_tensor, window_size,
                                               stride_size, padding)
-              output_data = output_tensor.eval()
+              output_data = self.evaluate(output_tensor)
               output_backprop = self._PRNG.randint(100, size=output_data.shape)
               input_backprop_tensor = gen_nn_ops.max_pool_grad(
                   input_tensor, output_tensor, output_backprop, window_size,
                   stride_size, padding)
-              input_backprop = input_backprop_tensor.eval()
+              input_backprop = self.evaluate(input_backprop_tensor)
               row_seq = list(range(0, num_rows, row_window_size - 1))
               col_seq = list(range(0, num_cols, col_window_size - 1))
               row_seq[-1] += 1
@@ -434,10 +428,11 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   row_seq,
                   col_seq,
                   overlapping=True)
-              fmp_input_backprop = fmp_input_backprop_tensor.eval()
+              fmp_input_backprop = self.evaluate(fmp_input_backprop_tensor)
               self.assertShapeEqual(input_backprop, fmp_input_backprop_tensor)
               self.assertAllClose(input_backprop, fmp_input_backprop)
 
+  @test_util.run_deprecated_v1
   def testAllInputOptionsThroughGradientError(self):
     input_shape = (1, 7, 13, 1)
     input_data = self._GenerateUniqueRandomInputTensor(input_shape)
@@ -449,15 +444,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
       for overlapping in True, False:
         with self.cached_session() as _:
           input_tensor = constant_op.constant(input_data, shape=input_shape)
-          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+          output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
               input_tensor,
               pooling_ratio,
               pseudo_random=pseudo_random,
               overlapping=overlapping,
-              deterministic=True,
-              seed=self._SEED,
-              seed2=self._SEED2)
-          output_data = output_tensor.eval()
+              seed=self._SEED)
+          output_data = self.evaluate(output_tensor)
           output_shape = output_data.shape
           # error_margin and delta setting is similar to max_pool_grad.
           error_margin = 1e-3
@@ -470,6 +463,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
               delta=1e-2)
           self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testDifferentTensorShapesThroughGradientError(self):
     pseudo_random = True
     overlapping = True
@@ -484,15 +478,13 @@ class FractionalMaxPoolGradTest(test.TestCase):
             input_data += self._PRNG.random_sample(input_shape)
             with self.cached_session() as _:
               input_tensor = constant_op.constant(input_data, shape=input_shape)
-              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+              output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
                   input_tensor,
                   pooling_ratio,
                   pseudo_random=pseudo_random,
                   overlapping=overlapping,
-                  deterministic=True,
-                  seed=self._SEED,
-                  seed2=self._SEED2)
-              output_data = output_tensor.eval()
+                  seed=self._SEED)
+              output_data = self.evaluate(output_tensor)
               output_shape = output_data.shape
               # error_margin and delta setting is similar to max_pool_grad.
               error_margin = 1e-3
@@ -505,6 +497,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
                   delta=1e-2)
               self.assertLess(gradient_error, error_margin)
 
+  @test_util.run_deprecated_v1
   def testLargePoolingRatioThroughGradientError(self):
     input_shape = (1, 17, 23, 1)
     input_data = self._GenerateUniqueRandomInputTensor(input_shape)
@@ -517,14 +510,12 @@ class FractionalMaxPoolGradTest(test.TestCase):
 
     with self.cached_session() as _:
       input_tensor = constant_op.constant(input_data, shape=input_shape)
-      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool(
+      output_tensor, unused_a, unused_b = nn_ops.fractional_max_pool_v2(
           input_tensor,
           pooling_ratio,
           pseudo_random=pseudo_random,
           overlapping=overlapping,
-          deterministic=True,
-          seed=self._SEED,
-          seed2=self._SEED2)
+          seed=self._SEED)
       # error_margin and delta setting is similar to max_pool_grad.
       error_margin = 1e-3
       gradient_error = gradient_checker.compute_gradient_error(
@@ -592,7 +583,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           row_seq,
           col_seq,
           overlapping=False)
-      input_backprop_not_overlapping = r.eval()
+      input_backprop_not_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_not_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_not_overlapping,
@@ -602,7 +593,7 @@ class FractionalMaxPoolGradTest(test.TestCase):
           output_data_overlapping, shape=output_size)
       r = gen_nn_ops.fractional_max_pool_grad(
           input_tensor, output_tensor, grad, row_seq, col_seq, overlapping=True)
-      input_backprop_overlapping = r.eval()
+      input_backprop_overlapping = self.evaluate(r)
       self.assertShapeEqual(
           np.reshape(expected_input_backprop_overlapping, input_size), r)
       self.assertAllClose(expected_input_backprop_overlapping,
diff --git a/tensorflow/python/kernel_tests/functional_ops_test.py b/tensorflow/python/kernel_tests/functional_ops_test.py
index 04c1032722caac235c8bdc684dbdaa5f934d090a..95ee454614e6edb633b981e9173b2035550259c3 100644
--- a/tensorflow/python/kernel_tests/functional_ops_test.py
+++ b/tensorflow/python/kernel_tests/functional_ops_test.py
@@ -56,6 +56,7 @@ def simple_scoped_fn(a, x):
     return math_ops.multiply(math_ops.add(a, x), two)
 
 
+@test_util.with_control_flow_v2
 class FunctionalOpsTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
@@ -100,6 +101,7 @@ class FunctionalOpsTest(test.TestCase):
                              (elems, other_elems), initializer)
     self.assertAllEqual([1.0, 2.0, 3.0], self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testFoldl_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -152,6 +154,7 @@ class FunctionalOpsTest(test.TestCase):
                              initializer)
     self.assertAllEqual(1, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testFoldr_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -172,6 +175,7 @@ class FunctionalOpsTest(test.TestCase):
         self.assertAllEqual(1282, self.evaluate(r))
 
   # pylint: disable=unnecessary-lambda
+  @test_util.run_deprecated_v1
   def testFold_Grad(self):
     with self.cached_session():
       elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -213,6 +217,7 @@ class FunctionalOpsTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "not a scalar"):
       functional_ops.map_fn(lambda x: x, 1)
 
+  @test_util.run_deprecated_v1
   def testMap_Scoped(self):
     with self.cached_session() as sess:
 
@@ -244,6 +249,7 @@ class FunctionalOpsTest(test.TestCase):
         self.assertEqual(len(variables.trainable_variables()), 1)
         self.assertAllEqual(doubles, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testMap_Grad(self):
     with self.cached_session():
       param = constant_op.constant(2.0)
@@ -380,6 +386,7 @@ class FunctionalOpsTest(test.TestCase):
         ValueError, "two structures don't have the same nested structure"):
       functional_ops.scan(lambda a, x: (a, -a), elems, initializer)
 
+  @test_util.run_deprecated_v1
   def testScan_Scoped(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope("root") as varscope:
@@ -424,6 +431,7 @@ class FunctionalOpsTest(test.TestCase):
     #   t_1 == 1, b == 4.5,       y == 0.5, returns b * y * x = 9
     self.assertAllClose([1., 1., 2.25, 9.], self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testScan_Control(self):
     with self.cached_session() as sess:
       s = array_ops.placeholder(dtypes.float32, shape=[None])
@@ -435,6 +443,7 @@ class FunctionalOpsTest(test.TestCase):
           np.array([1.0, 3.0, 9.0]), sess.run(c, {s: [1, 3, 3],
                                                   b: True}))
 
+  @test_util.run_deprecated_v1
   def testScan_Grad(self):
     with self.cached_session():
       elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
@@ -447,6 +456,7 @@ class FunctionalOpsTest(test.TestCase):
       r = gradients_impl.gradients(r, v)[0]
       self.assertAllEqual(873.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testScanGradientWithPartStopGradient(self):
     a = variables.Variable(0.0, name="a")
     b = variables.Variable(0.0, name="b")
@@ -457,7 +467,7 @@ class FunctionalOpsTest(test.TestCase):
     grad = gradients_impl.gradients(ys=[loss], xs=[a, b])
     with self.test_session(use_gpu=True) as sess:
       variables.global_variables_initializer().run()
-      sess.run(grad)
+      self.evaluate(grad)
 
   @test_util.run_in_graph_and_eager_modes
   def testFoldShape(self):
@@ -476,12 +486,15 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.map_fn(lambda e: e, x)
     self.assertAllEqual(y.get_shape(), self.evaluate(y).shape)
 
+  @test_util.run_deprecated_v1
   def testMapUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     y = functional_ops.map_fn(lambda e: e, x)
     self.assertIs(None, y.get_shape().dims)
 
+  @test_util.disable_control_flow_v2("b/119323354")
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyScalar(self):
     map_return = functional_ops.map_fn(lambda x: 1, constant_op.constant([]))
     self.assertAllEqual([0], map_return.get_shape().dims)
@@ -489,6 +502,8 @@ class FunctionalOpsTest(test.TestCase):
 
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
+  @test_util.disable_control_flow_v2("b/119323354")
+  @test_util.run_v1_only("b/120545219")
   def testMapEmptyTensor(self):
     with self.cached_session():
       map_return = functional_ops.map_fn(lambda x: array_ops.zeros([3, 2]),
@@ -509,6 +524,7 @@ class FunctionalOpsTest(test.TestCase):
 
   # TODO(akshayka): this test fails in eager: the iterable is of length 0 so
   # so the body of the while loop never executes
+  @test_util.run_deprecated_v1
   def testScanEmptyTensor(self):
     with self.cached_session():
       x = functional_ops.scan(
@@ -516,6 +532,7 @@ class FunctionalOpsTest(test.TestCase):
       self.assertAllEqual([0, 2, 4], x.get_shape())
       self.assertAllEqual(x.get_shape(), self.evaluate(x).shape)
 
+  @test_util.run_deprecated_v1
   def testScanUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     initializer = array_ops.placeholder(dtypes.float32)
@@ -526,6 +543,7 @@ class FunctionalOpsTest(test.TestCase):
     y = functional_ops.scan(fn, x, initializer=initializer)
     self.assertIs(None, y.get_shape().dims)
 
+  @test_util.run_deprecated_v1
   def testScanVaryingShape(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(dtype=dtypes.float32, shape=[None, 2])
@@ -542,6 +560,7 @@ class FunctionalOpsTest(test.TestCase):
       sess.run([result, result_t, result_grad, result_t_grad],
                feed_dict={x: [[1.0, 2.0]]})
 
+  @test_util.run_deprecated_v1
   def testRemoteFunction(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -564,10 +583,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:0/cpu:1")
 
     with session.Session(worker[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionDirectSession(self):
     worker_config = config_pb2.ConfigProto()
     worker_config.device_count["CPU"] = 2
@@ -588,10 +608,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:1")
 
     with self.test_session(config=worker_config) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionSameDeviceDirectSession(self):
 
     @function.Defun(dtypes.int32, dtypes.int32)
@@ -607,8 +628,8 @@ class FunctionalOpsTest(test.TestCase):
           args=[a, b], Tout=[dtypes.int32], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, [6])
 
   def testRemoteFunctionCPUGPU(self):
@@ -631,8 +652,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/device:GPU:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPU(self):
@@ -655,8 +676,8 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:localhost/replica:0/task:0/cpu:0")[0] + 3.0
 
     with self.cached_session() as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9.0)
 
   def testRemoteFunctionGPUCPUStrings(self):
@@ -674,9 +695,10 @@ class FunctionalOpsTest(test.TestCase):
           args=[a], Tout=[dtypes.string], f=_remote_fn, target="/cpu:0")
 
     with self.cached_session() as sess:
-      ret = sess.run(remote_op)
+      ret = self.evaluate(remote_op)
       self.assertAllEqual(ret, [b"a"])
 
+  @test_util.run_deprecated_v1
   def testRemoteFunctionCrossProcess(self):
     workers, _ = test_util.create_local_cluster(2, 1)
 
@@ -696,10 +718,11 @@ class FunctionalOpsTest(test.TestCase):
           target="/job:worker/replica:0/task:1/cpu:0")[0] + 3.0
 
     with session.Session(workers[0].target) as sess:
-      sess.run(variables.global_variables_initializer())
-      mul = sess.run(remote_op)
+      self.evaluate(variables.global_variables_initializer())
+      mul = self.evaluate(remote_op)
       self.assertEqual(mul, 9)
 
+  @test_util.run_deprecated_v1
   def testIf(self):
 
     @function.Defun(dtypes.float32)
@@ -739,6 +762,7 @@ class FunctionalOpsTest(test.TestCase):
           self.assertAllEqual(Run(sess, 20.), 210.)
           self.assertAllEqual(Run(sess, 100.), 5050.)
 
+  @test_util.run_deprecated_v1
   def testWhileLowering(self):
 
     def Run(n, fetch_by_name):
@@ -766,13 +790,14 @@ class FunctionalOpsTest(test.TestCase):
           else:
             fetch = "my_while:1"
         with self.session(graph=g, use_gpu=use_gpu) as sess:
-          return sess.run(fetch)
+          return self.evaluate(fetch)
 
     self.assertAllEqual(Run(20., False), 210.)
     self.assertAllEqual(Run(20., True), 210.)
     self.assertAllEqual(Run(100., False), 5050.)
     self.assertAllEqual(Run(100., True), 5050.)
 
+  @test_util.run_v1_only("b/120545219")
   def testWhileError(self):
     for use_gpu in (True, False):
       with ops.Graph().as_default() as g:
@@ -854,11 +879,11 @@ class FunctionalOpsTest(test.TestCase):
           result_binary = functional_ops.While(
               [1.0, 0., 0.],
               function.Defun(*[dtypes.float32] * 3)(TestCond), TestBinary)
-          sess.run(variables.global_variables_initializer())
+          self.evaluate(variables.global_variables_initializer())
           assert len(result_unary) == 2
-          self.assertEqual([10.0, 54.0], sess.run(result_unary))
+          self.assertEqual([10.0, 54.0], self.evaluate(result_unary))
           assert len(result_binary) == 3
-          self.assertEqual([10.0, 54.0, 9.0], sess.run(result_binary))
+          self.assertEqual([10.0, 54.0, 9.0], self.evaluate(result_binary))
 
           def TestCondCapture(n, *args):
             del args
@@ -889,7 +914,7 @@ class FunctionalOpsTest(test.TestCase):
                 100, 0, -1, [0.], Body, rewrite_with_while=rewrite_with_while)
             [0],
         ]
-        xvals = sess.run(xs)
+        xvals = self.evaluate(xs)
       self.assertAllEqual(210, xvals[0])
       self.assertAllEqual(5050, xvals[1])
 
@@ -919,6 +944,7 @@ class FunctionalOpsTest(test.TestCase):
     self.assertTrue("TestBody_Cond" in names)
     self.assertTrue("TestBody_Body" in names)
 
+  @test_util.run_deprecated_v1
   def testForCapturedInputs(self):
     v = variables.Variable(1.0)
 
@@ -946,16 +972,16 @@ class FunctionalOpsTest(test.TestCase):
         result_binary = functional_ops.For(
             1, 10, 1, [0., 0.], TestBinary,
             rewrite_with_while=rewrite_with_while)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         assert not result_nullary
         # The nullary variant doesn't return anything so we can't easily run it.
         # As a total hack, fetch the operation by name and run it.
         sess.run(ops.get_default_graph().get_operation_by_name(
             "While" if rewrite_with_while else "For"))
         assert len(result_unary) == 1
-        self.assertEqual([54.0], sess.run(result_unary))
+        self.assertEqual([54.0], self.evaluate(result_unary))
         assert len(result_binary) == 2
-        self.assertEqual([54.0, 9.0], sess.run(result_binary))
+        self.assertEqual([54.0, 9.0], self.evaluate(result_binary))
 
   def _tfMLP(self, xval, wsval, bsval, rewrite_with_while):
     # On GPU, don't rewrite using a while loop.
@@ -974,7 +1000,7 @@ class FunctionalOpsTest(test.TestCase):
           MLP,
           rewrite_with_while=rewrite_with_while)[0]
 
-      return ret.eval()
+      return self.evaluate(ret)
 
   def _npMLP(self, xval, wsval, bsval):
     for i in range(wsval.shape[0]):
@@ -993,12 +1019,15 @@ class FunctionalOpsTest(test.TestCase):
     tf_for_ans = self._tfMLP(xval, wsval, bsval, rewrite_with_while)
     self.assertAllClose(np_ans, tf_for_ans)
 
+  @test_util.run_deprecated_v1
   def testForMLP(self):
     self._testForMLP(False)
 
+  @test_util.run_deprecated_v1
   def testForMLPWhile(self):
     self._testForMLP(True)
 
+  @test_util.run_v1_only("b/120545219")
   def testForError(self):
 
     @function.Defun(dtypes.int32, dtypes.float32)
@@ -1021,6 +1050,7 @@ class FunctionalOpsTest(test.TestCase):
           "For loop body returned 2 arguments. Expected: 1"):
         functional_ops.For(0, 10, 1, [0.0], ReturnsTooManyArgs)[0].eval()
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
 
     @function.Defun(dtypes.float32)
@@ -1038,14 +1068,15 @@ class FunctionalOpsTest(test.TestCase):
       avals = [Poly(a), Grad(a)]
       b = constant_op.constant(1.)
       bvals = [Poly(b), Grad(b)]
-      self.assertAllEqual(sess.run(avals), [8., 4.])
-      self.assertAllEqual(sess.run(bvals), [17., 16.])
+      self.assertAllEqual(self.evaluate(avals), [8., 4.])
+      self.assertAllEqual(self.evaluate(bvals), [17., 16.])
 
 
 # TODO(akshayka): Replace `function.Defun` with tf.contrib.eager.defun` in the
 # below test cases.
 class PartitionedCallTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicSingleDevice(self):
 
     @function.Defun(*[dtypes.float32] * 2)
@@ -1061,6 +1092,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testBasicMultiDevice(self):
     config = config_pb2.ConfigProto(device_count={"CPU": 3})
 
@@ -1104,6 +1136,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testBasicNoDeviceAnnotations(self):
 
     @function.Defun(*[dtypes.float32] * 2)
@@ -1118,6 +1151,7 @@ class PartitionedCallTest(test.TestCase):
                   constant_op.constant(2.)], f=Body))
     self.assertEqual(output, 6.)
 
+  @test_util.run_deprecated_v1
   def testShardsRunOnRequestedDevices(self):
     config = config_pb2.ConfigProto(device_count={"CPU": 4})
 
@@ -1147,6 +1181,7 @@ class PartitionedCallTest(test.TestCase):
     self.assertIn(compat.as_bytes("CPU:1"), outputs[1])
     self.assertIn(compat.as_bytes("CPU:2"), outputs[2])
 
+  @test_util.run_deprecated_v1
   def testAssignAddResourceVariable(self):
 
     v = resource_variable_ops.ResourceVariable(1.0)
@@ -1190,14 +1225,15 @@ class PartitionedCallTest(test.TestCase):
             allow_soft_placement=False,
             log_device_placement=True,
             device_count={"CPU": 2})) as sess:
-      sess.run(variables.global_variables_initializer())
-      expected = sess.run(sum_gather())
+      self.evaluate(variables.global_variables_initializer())
+      expected = self.evaluate(sum_gather())
       result = sess.run(
           functional_ops.partitioned_call(
               args=defined.captured_inputs, f=defined))
       self.assertAllEqual(expected, result)
 
   # Use an invalid executor name to test the plumbing of the executor_type attr.
+  @test_util.run_v1_only("b/120545219")
   def testExecutorTypeAttrExecutorNotFound(self):
     @function.Defun(dtypes.int32)
     def AddFive(x):
diff --git a/tensorflow/python/kernel_tests/gather_nd_op_test.py b/tensorflow/python/kernel_tests/gather_nd_op_test.py
index ee761435d84677f35c00f1e146316d59e656e872..320ffc9674bd2e0ce601084ab8fc375c4cbdc3e2 100644
--- a/tensorflow/python/kernel_tests/gather_nd_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_nd_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import variables
@@ -40,7 +41,7 @@ class GatherNdTest(test.TestCase):
       params = constant_op.constant(np.array([8, 1, 2, 3, 7, 5], dtype=dtype))
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertAllEqual(np.array([7, 7, 8], dtype=dtype), gather_nd_val)
     self.assertEqual([3], gather_nd_t.get_shape())
@@ -54,26 +55,27 @@ class GatherNdTest(test.TestCase):
     self._testSimpleDtype(np.complex128)
     self._testSimpleDtype("|S")  # byte strings in python2 + 3
 
+  @test_util.run_deprecated_v1
   def testEmptyIndicesAndParamsOKButJustEmptyParamsFails(self):
     with self.session(use_gpu=True):
       params = np.ones((3, 3), dtype=np.float32)
 
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
       indices_empty = np.empty((0, 1), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0, 3], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0, 3), dtype=np.float32), gather_nd_ok_val)
 
       params_empty = np.empty((0, 3), dtype=np.float32)
       indices_empty = np.empty((0, 2), dtype=np.int32)
       gather_nd_ok_t = array_ops.gather_nd(params_empty, indices_empty)
-      gather_nd_ok_val = gather_nd_ok_t.eval()
+      gather_nd_ok_val = self.evaluate(gather_nd_ok_t)
       self.assertEqual([0], gather_nd_ok_t.get_shape())
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
@@ -82,7 +84,7 @@ class GatherNdTest(test.TestCase):
       gather_nd_break_t = array_ops.gather_nd(params_empty, indices_nonempty)
       with self.assertRaisesOpError(
           r"Requested more than 0 entries, but params is empty."):
-        gather_nd_break_t.eval()
+        self.evaluate(gather_nd_break_t)
       self.assertAllClose(np.empty((0,), dtype=np.float32), gather_nd_ok_val)
 
   def testIndexScalar(self):
@@ -91,7 +93,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4, 1])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([], gather_nd_t.get_shape())
       self.assertAllEqual(np.array(7), gather_nd_val)
 
@@ -101,7 +103,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([4])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
       self.assertEqual([2], gather_nd_t.get_shape())
       self.assertAllEqual(np.array([-7, 7]), gather_nd_val)
 
@@ -111,7 +113,7 @@ class GatherNdTest(test.TestCase):
           [[-8, -1, -2, -3, -7, -5], [8, 1, 2, 3, 7, 5]], dtype=np.float32).T
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2], gather_nd_t.get_shape())
     self.assertAllEqual(np.array([[-7, 7], [-7, 7], [-8, 8]]), gather_nd_val)
@@ -125,7 +127,7 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[4], [4], [0]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[4, 4, 0]], gather_nd_val)
@@ -140,7 +142,7 @@ class GatherNdTest(test.TestCase):
       indices = constant_op.constant(
           [[], []], dtype=dtypes.int32)  # Size (2, 0)
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 6, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(
@@ -156,7 +158,7 @@ class GatherNdTest(test.TestCase):
       params_t = constant_op.constant(params)
       indices = constant_op.constant([[[3], [2], [1]], [[4], [4], [0]]])
       gather_nd_t = array_ops.gather_nd(params_t, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     self.assertEqual([2, 3, 2, 2], gather_nd_t.get_shape())
     self.assertAllEqual(params[[3, 2, 1, 4, 4, 0]].reshape(2, 3, 2, 2),
@@ -168,7 +170,7 @@ class GatherNdTest(test.TestCase):
       params = np.random.rand(*shape)
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       gather_nd_t = array_ops.gather_nd(params, indices)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected, gather_nd_val)
@@ -181,7 +183,7 @@ class GatherNdTest(test.TestCase):
       indices = np.vstack([np.random.randint(0, s, size=2000) for s in shape]).T
       indices_reshaped = indices.reshape([10, 10, 20, 5])
       gather_nd_t = array_ops.gather_nd(params, indices_reshaped)
-      gather_nd_val = gather_nd_t.eval()
+      gather_nd_val = self.evaluate(gather_nd_t)
 
     expected = params[tuple(indices.T)]
     self.assertAllEqual(expected.reshape([10, 10, 20]), gather_nd_val)
@@ -190,6 +192,7 @@ class GatherNdTest(test.TestCase):
   def assertIndexedSlices(self, t):
     self.assertIsInstance(t, ops.IndexedSlices)
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
@@ -198,6 +201,7 @@ class GatherNdTest(test.TestCase):
     self.assertEqual(None, shape.ndims)
     self.assertEqual(None, tensor_shape.dimension_value(shape[0]))
 
+  @test_util.run_deprecated_v1
   def testBadIndicesCPU(self):
     with self.session(use_gpu=False):
       params = [0, 1, 2]
@@ -205,7 +209,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
@@ -218,8 +222,9 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,1\] = \[7\] does not index into param shape \[3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
+  @test_util.run_deprecated_v1
   def testBadIndicesWithSlicesCPU(self):
     with self.session(use_gpu=False):
       params = [[0, 1, 2]]
@@ -227,7 +232,7 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
   def _disabledTestBadIndicesWithSlicesGPU(self):
     # TODO disabled due to different behavior on GPU and CPU
@@ -240,8 +245,9 @@ class GatherNdTest(test.TestCase):
       gather_nd = array_ops.gather_nd(params, indices)
       with self.assertRaisesOpError(
           r"indices\[0,2\] = \[1\] does not index into param shape \[1,3\]"):
-        gather_nd.eval()
+        self.evaluate(gather_nd)
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2Elements(self):
     indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
     inputs = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
@@ -251,8 +257,9 @@ class GatherNdTest(test.TestCase):
     grads = gradients_impl.gradients([outputs], [inputs], [grad_vals])[0]
     expected_grads = np.array([[1, 0], [0, 2]], dtype=np.float64)
     with self.session(use_gpu=True):
-      assert np.array_equal(expected_grads, grads.eval())
+      assert np.array_equal(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2Slices(self):
     indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
     inputs = constant_op.constant([[1, 2], [3, 4]], dtype=dtypes.float64)
@@ -265,6 +272,7 @@ class GatherNdTest(test.TestCase):
       self.assertIndexedSlices(grads)
       self.assertAllEqual(expected_grads, ops.convert_to_tensor(grads).eval())
 
+  @test_util.run_deprecated_v1
   def testGradientsRank3Elements(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int32)
@@ -278,8 +286,9 @@ class GatherNdTest(test.TestCase):
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank7Elements(self):
     # Shape [1,1,2,1,1,2,2]
     indices = constant_op.constant(
@@ -307,8 +316,9 @@ class GatherNdTest(test.TestCase):
             [[[[3, 4], [7, 8]]]]
         ]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsInt64Indices(self):
     indices = constant_op.constant(
         [[[0, 1], [1, 0]], [[0, 0], [1, 1]]], dtype=dtypes.int64)
@@ -322,8 +332,9 @@ class GatherNdTest(test.TestCase):
     expected_grads = np.array(
         [[[5, 6], [1, 2]], [[3, 4], [7, 8]]], dtype=np.float64)
     with self.session(use_gpu=True):
-      self.assertAllEqual(expected_grads, grads.eval())
+      self.assertAllEqual(expected_grads, self.evaluate(grads))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2SlicesWithEmptySpace(self):
     indices = constant_op.constant([[2], [0], [5]], dtype=dtypes.int32)
     inputs = constant_op.constant(
@@ -361,10 +372,10 @@ class GatherNdOpBenchmark(test.Benchmark):
       gather_op = array_ops.gather_nd(t_params, t_indices)
       variables.global_variables_initializer().run()
       for _ in range(10):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t1 = time.time()
       for _ in range(1000):
-        gather_op.eval()
+        self.evaluate(gather_op)
       t2 = time.time()
       self.report_benchmark(iters=1000, wall_time=(t2 - t1) / 1000.0)
 
diff --git a/tensorflow/python/kernel_tests/gather_op_test.py b/tensorflow/python/kernel_tests/gather_op_test.py
index bdafc52ab5ec3bd6157b098712cdd35122bb17af..fc86068c3fc08d1ad01ba8dfa9bb4c5bc6c429f2 100644
--- a/tensorflow/python/kernel_tests/gather_op_test.py
+++ b/tensorflow/python/kernel_tests/gather_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.platform import test
@@ -50,7 +51,7 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices_tf = constant_op.constant(indices)
           gather_t = array_ops.gather(params, indices_tf)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           np_val = params_np[indices]
           self.assertAllEqual(np_val, gather_val)
           self.assertEqual(np_val.shape, gather_t.get_shape())
@@ -65,7 +66,7 @@ class GatherTest(test.TestCase):
           params = constant_op.constant(params_np)
           indices = constant_op.constant(2)
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, 2, axis=axis), gather_val)
           expected_shape = data.shape[:axis] + data.shape[axis + 1:]
           self.assertEqual(expected_shape, gather_t.get_shape())
@@ -81,12 +82,13 @@ class GatherTest(test.TestCase):
           # The indices must be in bounds for any axis.
           indices = constant_op.constant([0, 1, 0, 2])
           gather_t = array_ops.gather(params, indices, axis=axis)
-          gather_val = gather_t.eval()
+          gather_val = self.evaluate(gather_t)
           self.assertAllEqual(np.take(params_np, [0, 1, 0, 2], axis=axis),
                               gather_val)
           expected_shape = data.shape[:axis] + (4,) + data.shape[axis + 1:]
           self.assertEqual(expected_shape, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     # We check that scalar and empty indices shapes work as well
     shape = (2, 1, 3, 2)
@@ -142,9 +144,13 @@ class GatherTest(test.TestCase):
               source_slice = ((slice(None),) * outer_dims + (source_index,) +
                               (slice(None),) * inner_dims)
               correct_params_grad[dest_slice] += gather_grad[source_slice]
-            self.assertAllClose(correct_params_grad, params_grad.eval(),
-                                atol=2e-6, rtol=2e-6)
+            self.assertAllClose(
+                correct_params_grad,
+                self.evaluate(params_grad),
+                atol=2e-6,
+                rtol=2e-6)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     params = np.array([[b"asdf", b"zxcv"], [b"qwer", b"uiop"]])
     with self.cached_session():
@@ -153,6 +159,7 @@ class GatherTest(test.TestCase):
       self.assertAllEqual([b"asdf", b"qwer"],
                           array_ops.gather(params, 0, axis=1).eval())
 
+  @test_util.run_deprecated_v1
   def testUInt32AndUInt64(self):
     for unsigned_type in (dtypes.uint32, dtypes.uint64):
       params = self._buildParams(
@@ -162,12 +169,14 @@ class GatherTest(test.TestCase):
                             array_ops.gather(params, 1, axis=0).eval())
         self.assertAllEqual([1, 7], array_ops.gather(params, 0, axis=1).eval())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = array_ops.placeholder(dtypes.int32)
     gather_t = array_ops.gather(params, indices)
     self.assertEqual(None, gather_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownAxis(self):
     params = constant_op.constant([[0, 1, 2]])
     indices = constant_op.constant([[0, 0], [0, 0]])
@@ -201,6 +210,7 @@ class GatherTest(test.TestCase):
       with self.assertRaisesOpError(r"indices\[0,0\] = 7 is not in \[0, 3\)"):
         array_ops.gather(params, [[7]], axis=1).eval()
 
+  @test_util.run_deprecated_v1
   def testBadAxis(self):
     with self.session(use_gpu=True):
       params = [0, 1, 2]
@@ -217,6 +227,7 @@ class GatherTest(test.TestCase):
           array_ops.gather(params_ph, indices, axis=bad_axis).eval(
               feed_dict={params_ph: params})
 
+  @test_util.run_deprecated_v1
   def testEmptySlices(self):
     with self.session(use_gpu=True):
       for dtype in _TEST_TYPES:
diff --git a/tensorflow/python/kernel_tests/gradient_correctness_test.py b/tensorflow/python/kernel_tests/gradient_correctness_test.py
index 291a69ebac6625ea9b50a54d2e0e28083b463d85..0148de5047afe6144433d69beb03e066ae395865 100644
--- a/tensorflow/python/kernel_tests/gradient_correctness_test.py
+++ b/tensorflow/python/kernel_tests/gradient_correctness_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -29,37 +30,42 @@ from tensorflow.python.platform import test
 
 class GradientCorrectnessTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMultipleOutputChainedGradients(self):
     with self.cached_session() as sess:
       x = constant_op.constant(1.0, dtype=dtypes.float32)
       yexp = math_ops.exp(x)
       yexplog = math_ops.log(yexp)
       grads = gradients_impl.gradients([yexp, yexplog], [x])
-      grad_vals = sess.run(grads)
+      grad_vals = self.evaluate(grads)
       exp1_plus_one = (1.0 + np.exp(1.0)).astype(np.float32)
       # [dexp(x)/dx + d(log(exp(x)))/dx] @ x=1 == exp(1) + 1
       self.assertAllClose(grad_vals[0], exp1_plus_one)
 
+  @test_util.run_deprecated_v1
   def testIdentityGradient(self):
     x = constant_op.constant(3.)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1., sess.run(dx_dx))
+      self.assertAllClose(1., self.evaluate(dx_dx))
 
+  @test_util.run_deprecated_v1
   def testIntegerIdentityGradient(self):
     x = constant_op.constant(3)
     dx_dx, = gradients_impl.gradients(x, x)
     with self.cached_session() as sess:
-      self.assertAllClose(1, sess.run(dx_dx))
+      self.assertAllClose(1, self.evaluate(dx_dx))
 
+  @test_util.run_deprecated_v1
   def testGradientWithIntegerPath(self):
     x = constant_op.constant([3.9, 4.1])
     k = math_ops.to_float(math_ops.to_int32(x))
     y = x * k
     dy_dx, = gradients_impl.gradients(y, x)
     with self.cached_session() as sess:
-      self.assertAllClose([3., 4.], sess.run(dy_dx))
+      self.assertAllClose([3., 4.], self.evaluate(dy_dx))
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient1(self):
     x = constant_op.constant([3.9, 4.1])
     k = math_ops.to_float(math_ops.to_int32(x))
@@ -67,6 +73,7 @@ class GradientCorrectnessTest(test.TestCase):
     dy_dx, = gradients_impl.gradients(y, x)
     self.assertIsNone(dy_dx)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient2(self):
     k = constant_op.constant([3, 4])
     x = math_ops.to_float(k)
@@ -74,18 +81,21 @@ class GradientCorrectnessTest(test.TestCase):
     dy_dk, = gradients_impl.gradients(y, k)
     self.assertIsNone(dy_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient3(self):
     k = constant_op.constant([3, 4])
     m = k * k
     dm_dk, = gradients_impl.gradients(m, k)
     self.assertIsNone(dm_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient4(self):
     k = constant_op.constant([3, 4])
     m = k * k * k
     dm_dk, = gradients_impl.gradients(m, k)
     self.assertIsNone(dm_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient5(self):
     k = constant_op.constant([3, 4])
     m = k * k
@@ -93,6 +103,7 @@ class GradientCorrectnessTest(test.TestCase):
     dn_dk, = gradients_impl.gradients(n, k)
     self.assertIsNone(dn_dk)
 
+  @test_util.run_deprecated_v1
   def testNoIntegerGradient6(self):
     k = constant_op.constant(3)
     x = math_ops.to_float(k)
diff --git a/tensorflow/python/kernel_tests/huge_slice_op_test.py b/tensorflow/python/kernel_tests/huge_slice_op_test.py
index 8646d74c96f179cde41184eab3af1f72583360fa..4074946350aa5ce753a39fb173346d1d4f7fe3c7 100644
--- a/tensorflow/python/kernel_tests/huge_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/huge_slice_op_test.py
@@ -33,11 +33,11 @@ class SliceTest(test.TestCase):
       a_large = array_ops.tile(
           constant_op.constant(np.array([False, True] * 4)), [2**29 + 3])
       slice_t = array_ops.slice(a_large, np.asarray([3]).astype(np.int64), [3])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([True, False, True], slice_val)
 
       slice_t = array_ops.slice(
           a_large, constant_op.constant([long(2)**32 + 3], dtype=dtypes.int64),
           [3])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([True, False, True], slice_val)
diff --git a/tensorflow/python/kernel_tests/identity_n_op_py_test.py b/tensorflow/python/kernel_tests/identity_n_op_py_test.py
index 518733cd8e9064cc5d4637225295571c072a0fc6..a1110d640f01dd5cdfe51fa26c85760ada705b8d 100644
--- a/tensorflow/python/kernel_tests/identity_n_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_n_op_py_test.py
@@ -21,12 +21,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class IdentityNOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt32String_6(self):
     with self.cached_session() as sess:
       [value0, value1] = sess.run(
@@ -36,6 +38,7 @@ class IdentityNOpTest(test.TestCase):
     self.assertAllEqual(
         np.array([b"a", b"b", b"C", b"d", b"E", b"f", b"g"]), value1)
 
+  @test_util.run_deprecated_v1
   def testInt32_shapes(self):
     with self.cached_session() as sess:
       inp0 = constant_op.constant([10, 20, 30, 40, 50, 60], shape=[2, 3])
@@ -50,6 +53,7 @@ class IdentityNOpTest(test.TestCase):
         np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12], [13, 14, 15]]),
         value2)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     source = [b"A", b"b", b"C", b"d", b"E", b"f"]
     with self.cached_session() as sess:
diff --git a/tensorflow/python/kernel_tests/identity_op_py_test.py b/tensorflow/python/kernel_tests/identity_op_py_test.py
index 88ea10c22a30742f62b56b490f00e9fe387cbfa0..40ec9db4226a89305732683118f7f906db1ba965 100644
--- a/tensorflow/python/kernel_tests/identity_op_py_test.py
+++ b/tensorflow/python/kernel_tests/identity_op_py_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import variables
@@ -30,17 +31,20 @@ from tensorflow.python.platform import test
 
 class IdentityOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt32_6(self):
     with self.cached_session():
       value = array_ops.identity([1, 2, 3, 4, 5, 6]).eval()
     self.assertAllEqual(np.array([1, 2, 3, 4, 5, 6]), value)
 
+  @test_util.run_deprecated_v1
   def testInt32_2_3(self):
     with self.cached_session():
       inp = constant_op.constant([10, 20, 30, 40, 50, 60], shape=[2, 3])
       value = array_ops.identity(inp).eval()
     self.assertAllEqual(np.array([[10, 20, 30], [40, 50, 60]]), value)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     source = [b"A", b"b", b"C", b"d", b"E", b"f"]
     with self.cached_session():
@@ -58,6 +62,7 @@ class IdentityOpTest(test.TestCase):
       self.assertEquals(shape,
                         array_ops.identity(np.array(array_2x3)).get_shape())
 
+  @test_util.run_v1_only("b/120545219")
   def testRefIdentityShape(self):
     with self.cached_session():
       shape = [2, 3]
diff --git a/tensorflow/python/kernel_tests/in_topk_op_test.py b/tensorflow/python/kernel_tests/in_topk_op_test.py
index 6fdb497bc6f8d15d54b9d35ed8c15ed9caceb1db..507822b3142a77a3782be52a3d19bb9bd664b684 100644
--- a/tensorflow/python/kernel_tests/in_topk_op_test.py
+++ b/tensorflow/python/kernel_tests/in_topk_op_test.py
@@ -32,7 +32,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array(expected)
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
@@ -77,7 +77,7 @@ class InTopKTest(test.TestCase):
     np_ans = np.array([False, True])
     with self.cached_session():
       precision = nn_ops.in_top_k(predictions, target, k)
-      out = precision.eval()
+      out = self.evaluate(precision)
       self.assertAllClose(np_ans, out)
       self.assertShapeEqual(np_ans, precision)
 
diff --git a/tensorflow/python/kernel_tests/init_ops_test.py b/tensorflow/python/kernel_tests/init_ops_test.py
index 70bfbf8544a8a8689d6f48c730ee90479236b2a9..09b9944baa1d92bfbcd484f5dba45cea28e6eafe 100644
--- a/tensorflow/python/kernel_tests/init_ops_test.py
+++ b/tensorflow/python/kernel_tests/init_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -106,6 +107,7 @@ def _init_sampler(tc, init, num):
 
 class ConstantInitializersTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testZerosInitializer(self):
     with self.session(use_gpu=True):
       shape = [2, 3]
@@ -114,6 +116,7 @@ class ConstantInitializersTest(test.TestCase):
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.zeros(shape))
 
+  @test_util.run_deprecated_v1
   def testOnesInitializer(self):
     with self.session(use_gpu=True):
       shape = [2, 3]
@@ -122,6 +125,7 @@ class ConstantInitializersTest(test.TestCase):
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.ones(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantZeroInitializer(self):
     with self.session(use_gpu=True):
       shape = [2, 3]
@@ -130,6 +134,7 @@ class ConstantInitializersTest(test.TestCase):
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.zeros(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantOneInitializer(self):
     with self.session(use_gpu=True):
       shape = [2, 3]
@@ -138,6 +143,7 @@ class ConstantInitializersTest(test.TestCase):
       x.initializer.run()
       self.assertAllEqual(x.eval(), np.ones(shape))
 
+  @test_util.run_deprecated_v1
   def testConstantIntInitializer(self):
     with self.session(use_gpu=True):
       shape = [2, 3]
@@ -150,6 +156,7 @@ class ConstantInitializersTest(test.TestCase):
       self.assertEqual(x.dtype.base_dtype, dtypes.int32)
       self.assertAllEqual(x.eval(), 7 * np.ones(shape, dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def testConstantTupleInitializer(self):
     with self.session(use_gpu=True):
       shape = [3]
@@ -173,6 +180,7 @@ class ConstantInitializersTest(test.TestCase):
       for a, e in zip(actual, expected):
         self.assertEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializer(self):
     value = [0, 1, 2, 3, 4, 5]
     shape = [2, 3]
@@ -199,6 +207,7 @@ class ConstantInitializersTest(test.TestCase):
         e = expected[i] if i < len(expected) else expected[-1]
         self.assertEqual(a, e)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializerLessValues(self):
     value = [0, 1, 2, 3, 4, 5]
     shape = [2, 4]
@@ -222,6 +231,7 @@ class ConstantInitializersTest(test.TestCase):
           shape=shape,
           initializer=init)
 
+  @test_util.run_deprecated_v1
   def testNDimConstantInitializerMoreValues(self):
     value = [0, 1, 2, 3, 4, 5, 6, 7]
     shape = [2, 3]
@@ -243,18 +253,21 @@ class ConstantInitializersTest(test.TestCase):
 
 class RandomNormalInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.random_normal_initializer(0.0, 1.0, seed=1, dtype=dtype)
       init2 = init_ops.random_normal_initializer(0.0, 1.0, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.random_normal_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -270,6 +283,7 @@ class RandomNormalInitializationTest(test.TestCase):
 
 class TruncatedNormalInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.truncated_normal_initializer(
@@ -278,6 +292,7 @@ class TruncatedNormalInitializationTest(test.TestCase):
           0.0, 1.0, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.truncated_normal_initializer(
@@ -286,6 +301,7 @@ class TruncatedNormalInitializationTest(test.TestCase):
           0.0, 1.0, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.truncated_normal_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -301,18 +317,21 @@ class TruncatedNormalInitializationTest(test.TestCase):
 
 class RandomUniformInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64, dtypes.int64]:
       init1 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       init2 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64]:
       init1 = init_ops.random_uniform_initializer(0, 7, seed=1, dtype=dtype)
       init2 = init_ops.random_uniform_initializer(0, 7, seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.random_uniform_initializer(0.0, 1.0)
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -320,6 +339,7 @@ class RandomUniformInitializationTest(test.TestCase):
 
 class UniformUnitScalingInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.uniform_unit_scaling_initializer(seed=1, dtype=dtype)
@@ -331,6 +351,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
           1.5, seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init3, init4))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.uniform_unit_scaling_initializer(seed=1, dtype=dtype)
@@ -341,6 +362,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
       self.assertFalse(identicaltest(self, init1, init3))
       self.assertFalse(identicaltest(self, init2, init3))
 
+  @test_util.run_deprecated_v1
   def testZeroSize(self):
     shape = [0, 2]
     with self.cached_session():
@@ -349,8 +371,9 @@ class UniformUnitScalingInitializationTest(test.TestCase):
           shape=shape,
           initializer=init_ops.uniform_unit_scaling_initializer())
       variables.global_variables_initializer().run()
-      self.assertAllEqual(shape, x.eval().shape)
+      self.assertAllEqual(shape, self.evaluate(x).shape)
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.uniform_unit_scaling_initializer()
     self.assertFalse(duplicated_initializer(self, init, 1))
@@ -364,6 +387,7 @@ class UniformUnitScalingInitializationTest(test.TestCase):
 
 class VarianceScalingInitializationTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTruncatedNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -381,6 +405,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -397,6 +422,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testUntruncatedNormalDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -414,6 +440,7 @@ class VarianceScalingInitializationTest(test.TestCase):
     self.assertNear(np.mean(x), expect_mean, err=1e-2)
     self.assertNear(np.var(x), expect_var, err=1e-2)
 
+  @test_util.run_deprecated_v1
   def testUniformDistribution(self):
     shape = [100, 100]
     expect_mean = 0.
@@ -435,7 +462,7 @@ class RangeTest(test.TestCase):
       tf_ans = math_ops.range(start, limit, delta, name="range")
       self.assertEqual([len(np.arange(start, limit, delta))],
                        tf_ans.get_shape())
-      return tf_ans.eval()
+      return self.evaluate(tf_ans)
 
   def testBasic(self):
     self.assertTrue(
@@ -449,6 +476,7 @@ class RangeTest(test.TestCase):
             self._Range(100, 500, 100), np.array([100, 200, 300, 400])))
     self.assertEqual(math_ops.range(0, 5, 1).dtype, dtypes.int32)
 
+  @test_util.run_deprecated_v1
   def testLimitOnly(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(np.arange(5), math_ops.range(5).eval())
@@ -524,7 +552,7 @@ class LinSpaceTest(test.TestCase):
       with self.session(graph=graph, force_gpu=self.force_gpu):
         tf_ans = math_ops.linspace(start, stop, num, name="linspace")
         self.assertEqual([num], tf_ans.get_shape())
-        return tf_ans.eval()
+        return self.evaluate(tf_ans)
 
   def testPositive(self):
     for self.force_gpu in self._gpu_modes():
@@ -583,18 +611,21 @@ class DeviceTest(test.TestCase):
 
 class OrthogonalInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       init2 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.orthogonal_initializer(seed=1, dtype=dtype)
       init2 = init_ops.orthogonal_initializer(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.orthogonal_initializer()
     self.assertFalse(duplicated_initializer(self, init, 1, (10, 10)))
@@ -608,6 +639,7 @@ class OrthogonalInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -616,8 +648,9 @@ class OrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       for shape in [(10, 10), (10, 9, 8), (100, 5, 5), (50, 40), (40, 50)]:
@@ -639,18 +672,21 @@ class OrthogonalInitializerTest(test.TestCase):
 
 class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_delta_orthogonal(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_delta_orthogonal(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_delta_orthogonal()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
@@ -665,6 +701,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -674,8 +711,9 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     gain = 3.14
     for dtype in [dtypes.float32]:
@@ -704,14 +742,14 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
         ratio = outputs_2norm / inputs_2norm
         my_ops = variables.global_variables_initializer()
         with self.session(use_gpu=True) as sess:
-          sess.run(my_ops)
+          self.evaluate(my_ops)
           # Check the shape of the outputs
-          t = outputs.eval()
+          t = self.evaluate(outputs)
           self.assertAllEqual(t.shape, outputs_shape)
           # Check isometry of the delta-orthogonal kernel.
-          self.assertAllClose(sess.run(ratio), np.sqrt(gain),
-                              rtol=tol, atol=tol)
+          self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
@@ -724,7 +762,7 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
                                         initializer=
                                         init_ops.convolutional_delta_orthogonal)
         x.initializer.run()
-        y = x.eval()[1, 1, :, :]
+        y = self.evaluate(x)[1, 1, :, :]
         determinant = np.linalg.det(y)
         value += determinant
         abs_value += np.abs(determinant)
@@ -739,18 +777,21 @@ class ConvolutionDeltaOrthogonalInitializerTest(test.TestCase):
 
 class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_1d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_1d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_1d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 10, 10)))
@@ -765,6 +806,7 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -774,8 +816,9 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
@@ -800,6 +843,7 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       # Compute the sum of the absolute values of 'count' determinants
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Pad input_ for computing (circular) convolution.
@@ -843,28 +887,31 @@ class ConvolutionOrthogonal1dInitializerTest(test.TestCase):
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
       with self.session(use_gpu=True) as sess:
-        sess.run(my_ops)
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_2d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_2d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_2d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 10, 10)))
@@ -879,6 +926,7 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -888,8 +936,9 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Pad input_ for computing (circular) convolution.
@@ -938,28 +987,31 @@ class ConvolutionOrthogonal2dInitializerTest(test.TestCase):
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
       with self.session(use_gpu=True) as sess:
-        sess.run(my_ops)
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInitializerIdentical(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       self.assertTrue(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testInitializerDifferent(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       init1 = init_ops.convolutional_orthogonal_3d(seed=1, dtype=dtype)
       init2 = init_ops.convolutional_orthogonal_3d(seed=2, dtype=dtype)
       self.assertFalse(identicaltest(self, init1, init2, (3, 3, 3, 10, 10)))
 
+  @test_util.run_deprecated_v1
   def testDuplicatedInitializer(self):
     init = init_ops.convolutional_orthogonal_3d()
     self.assertFalse(duplicated_initializer(self, init, 1, (3, 3, 3, 10, 10)))
@@ -974,6 +1026,7 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertRaises(ValueError, init1, shape=[3, 3, 3, 6, 5])
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (3, 3, 3, 10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -983,8 +1036,9 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         t1 = init1(shape).eval()
         t2 = init2(shape).eval()
-      return np.allclose(t1, t2 / 3.14, rtol=1e-15, atol=1e-15)
+      self.assertAllClose(t1, t2 / 3.14)
 
+  @test_util.run_deprecated_v1
   def testNonuniformity(self):
     value = 0
     abs_value = 0
@@ -1009,6 +1063,7 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       # Compute the sum of the absolute values of 'count' determinants
       self.assertAllClose(abs_value, count, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testShapesValues(self):
     def circular_pad(input_, width, kernel_size):
       """Padding input_ for computing circular convolution.
@@ -1063,12 +1118,12 @@ class ConvolutionOrthogonal3dInitializerTest(test.TestCase):
       ratio = outputs_2norm / inputs_2norm
       my_ops = variables.global_variables_initializer()
       with self.cached_session(use_gpu=True) as sess:
-        sess.run(my_ops)
+        self.evaluate(my_ops)
         # Check the shape of the outputs
-        t = outputs.eval()
+        t = self.evaluate(outputs)
         self.assertAllEqual(t.shape, outputs_shape)
         # Check isometry of the orthogonal kernel.
-        self.assertAllClose(sess.run(ratio), np.sqrt(gain), rtol=tol, atol=tol)
+        self.assertAllClose(self.evaluate(ratio), gain, rtol=tol, atol=tol)
 
 
 class IdentityInitializerTest(test.TestCase):
@@ -1084,12 +1139,14 @@ class IdentityInitializerTest(test.TestCase):
       self.assertRaises(ValueError, init, shape=[5])
       self.assertRaises(ValueError, init, shape=[])
 
+  @test_util.run_deprecated_v1
   def testNonSquare(self):
     init = init_ops.identity_initializer()
     shape = (10, 5)
     with self.session(graph=ops.Graph(), use_gpu=True):
       self.assertAllClose(init(shape).eval(), np.eye(*shape))
 
+  @test_util.run_deprecated_v1
   def testGain(self):
     shape = (10, 10)
     for dtype in [dtypes.float32, dtypes.float64]:
@@ -1100,6 +1157,7 @@ class IdentityInitializerTest(test.TestCase):
       with self.session(graph=ops.Graph(), use_gpu=True):
         self.assertAllClose(init_custom(shape).eval(), np.eye(*shape) * 0.9)
 
+  @test_util.run_deprecated_v1
   def testPartitions(self):
     shape = (10, 10)
     init = init_ops.identity_initializer()
diff --git a/tensorflow/python/kernel_tests/inplace_ops_test.py b/tensorflow/python/kernel_tests/inplace_ops_test.py
index 51d16861dd85e6500cd0fd72e6873de98ecab5ff..9eaaac7a24849600a54a755b80f4418ec905a0bf 100644
--- a/tensorflow/python/kernel_tests/inplace_ops_test.py
+++ b/tensorflow/python/kernel_tests/inplace_ops_test.py
@@ -31,6 +31,7 @@ from tensorflow.python.platform import test as test_lib
 
 class InplaceOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicUpdate(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
       with self.session(use_gpu=True):
@@ -48,6 +49,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[5, :] = 7
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicUpdateBool(self):
     with self.session(use_gpu=True):
       x = array_ops.ones([7, 3], dtypes.bool)
@@ -65,6 +67,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
       y[5, :] = False
       self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicAdd(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
       with self.cached_session(use_gpu=True):
@@ -84,6 +87,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[:, :] += 99
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testBasicSub(self):
     for dtype in [dtypes.float32, dtypes.int32, dtypes.int64]:
       with self.cached_session(use_gpu=True):
@@ -103,6 +107,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
         y[:, :] -= 99
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     with self.session(use_gpu=True):
       d0, d1, d2 = 100, 3, 5
@@ -123,6 +128,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
           y[idx, :] -= val
         self.assertAllClose(x.eval(), y)
 
+  @test_util.run_deprecated_v1
   def testRandom1D(self):
     with self.session(use_gpu=True):
       d0 = 100
@@ -149,7 +155,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
       y = inplace_ops.alias_inplace_add(x, [0], [[1, 2, 3]])
       with ops.control_dependencies([y]):
         z = array_ops.identity(x)
-        _, vy, vz = sess.run([x, y, z])
+        _, vy, vz = self.evaluate([x, y, z])
       self.assertAllClose(vy, vz)
 
   def testError(self):
@@ -164,6 +170,7 @@ class InplaceOpsTest(test_util.TensorFlowTestCase):
                                    "i and x shape doesn't match"):
         _ = inplace_ops.inplace_update([[1.]], [0, 1], [[10]]).eval()
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in [
         dtypes.float32, dtypes.float64, dtypes.int32, dtypes.int64, dtypes.bool,
diff --git a/tensorflow/python/kernel_tests/io_ops_test.py b/tensorflow/python/kernel_tests/io_ops_test.py
index afa24195cb3fe08d7ed474242d90276861b87f85..c5df5231bf6fd945c41ac1c99fe6a613ca05fca6 100644
--- a/tensorflow/python/kernel_tests/io_ops_test.py
+++ b/tensorflow/python/kernel_tests/io_ops_test.py
@@ -23,6 +23,7 @@ import os
 import shutil
 import tempfile
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
 from tensorflow.python.util import compat
@@ -30,6 +31,7 @@ from tensorflow.python.util import compat
 
 class IoOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testReadFile(self):
     cases = ['', 'Some contents', 'Неки садржаји на српском']
     for contents in cases:
@@ -53,7 +55,7 @@ class IoOpsTest(test.TestCase):
         pass
       with self.cached_session() as sess:
         w = io_ops.write_file(temp.name, contents)
-        sess.run(w)
+        self.evaluate(w)
         with open(temp.name, 'rb') as f:
           file_contents = f.read()
         self.assertEqual(file_contents, contents)
@@ -67,7 +69,7 @@ class IoOpsTest(test.TestCase):
       filepath = os.path.join(subdir, 'subdir2', 'filename')
       with self.cached_session() as sess:
         w = io_ops.write_file(filepath, contents)
-        sess.run(w)
+        self.evaluate(w)
         with open(filepath, 'rb') as f:
           file_contents = f.read()
         self.assertEqual(file_contents, contents)
@@ -78,6 +80,7 @@ class IoOpsTest(test.TestCase):
         compat.as_bytes(files[i].name) for i in range(len(files))
         if i in indices)
 
+  @test_util.run_deprecated_v1
   def testMatchingFiles(self):
     cases = [
         'ABcDEF.GH', 'ABzDEF.GH', 'ABasdfjklDEF.GH', 'AB3DEF.GH', 'AB4DEF.GH',
diff --git a/tensorflow/python/kernel_tests/large_concat_op_test.py b/tensorflow/python/kernel_tests/large_concat_op_test.py
index 1b23e747764c65eaf3820a8832df0d657170ca7b..bf6fa9ea71f391287a7c21d042ae67ed57c9fc2b 100644
--- a/tensorflow/python/kernel_tests/large_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/large_concat_op_test.py
@@ -35,7 +35,7 @@ class LargeConcatOpTest(test.TestCase):
     with self.session(use_gpu=False):
       # TODO(dga):  Add more depth to this test to validate correctness,
       # not just non-crashingness, once other large tensor fixes have gone in.
-      _ = onezeros.eval()
+      _ = self.evaluate(onezeros)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/BUILD b/tensorflow/python/kernel_tests/linalg/BUILD
index be2e31cb5adec71f7c55633441f7eca23f3ec2b5..ba9e64979a48ccce82a283e74a1a024c4bcceda8 100644
--- a/tensorflow/python/kernel_tests/linalg/BUILD
+++ b/tensorflow/python/kernel_tests/linalg/BUILD
@@ -40,6 +40,44 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_adjoint_test",
+    size = "medium",
+    srcs = ["linear_operator_adjoint_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
+cuda_py_test(
+    name = "linear_operator_algebra_test",
+    size = "small",
+    srcs = ["linear_operator_algebra_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_block_diag_test",
     size = "medium",
@@ -89,7 +127,6 @@ cuda_py_test(
     size = "medium",
     srcs = ["linear_operator_circulant_test.py"],
     additional_deps = [
-        "//tensorflow/python/ops/linalg",
         "//third_party/py/numpy",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:spectral_ops_test_util",
@@ -99,6 +136,8 @@ cuda_py_test(
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python/ops/linalg",
+        "//tensorflow/python/ops/signal",
     ],
     shard_count = 5,
     tags = [
@@ -150,6 +189,28 @@ cuda_py_test(
     ],
 )
 
+cuda_py_test(
+    name = "linear_operator_inversion_test",
+    size = "medium",
+    srcs = ["linear_operator_inversion_test.py"],
+    additional_deps = [
+        "//tensorflow/python/ops/linalg",
+        "//third_party/py/numpy",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:platform_test",
+    ],
+    shard_count = 5,
+    tags = [
+        "noasan",  # times out, b/63678675
+        "optonly",  # times out
+    ],
+)
+
 cuda_py_test(
     name = "linear_operator_full_matrix_test",
     size = "medium",
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
index 628ed998c54ab52771db210bd1a9aa363f29237b..627349c69b315297d6832576200b28c5b5e2d12f 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_addition_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_addition
@@ -69,6 +70,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
     with self.assertRaisesRegexp(TypeError, "contain only LinearOperator"):
       add_operators([1, 2])
 
+  @test_util.run_deprecated_v1
   def test_two_diag_operators(self):
     op_a = linalg.LinearOperatorDiag(
         [1., 1.], is_positive_definite=True, name="A")
@@ -89,6 +91,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Enforce particular name for this simple case
       self.assertEqual("Add/B__A/", op.name)
 
+  @test_util.run_deprecated_v1
   def test_three_diag_operators(self):
     op1 = linalg.LinearOperatorDiag(
         [1., 1.], is_positive_definite=True, name="op1")
@@ -109,6 +112,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Positive definite ==> non-singular
       self.assertTrue(op.is_non_singular)
 
+  @test_util.run_deprecated_v1
   def test_diag_tril_diag(self):
     op1 = linalg.LinearOperatorDiag(
         [1., 1.], is_non_singular=True, name="diag_a")
@@ -134,6 +138,7 @@ class LinearOperatorAdditionCorrectnessTest(test.TestCase):
       # Since no custom hint was provided, we default to None (unknown).
       self.assertEqual(None, op.is_non_singular)
 
+  @test_util.run_deprecated_v1
   def test_matrix_diag_tril_diag_uses_custom_name(self):
     op0 = linalg.LinearOperatorFullMatrix(
         [[-1., -1.], [-1., -1.]], name="matrix")
@@ -217,6 +222,7 @@ class LinearOperatorOrderOfAdditionTest(test.TestCase):
     self.assertEqual(1, len(op_sum))
     self.assertIsInstance(op_sum[0], linalg.LinearOperatorLowerTriangular)
 
+  @test_util.run_deprecated_v1
   def test_cannot_add_everything_so_return_more_than_one_operator(self):
     diag1 = linalg.LinearOperatorDiag([1.])
     diag2 = linalg.LinearOperatorDiag([2.])
@@ -261,6 +267,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnScaledIdentity()
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_identity(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2)
     id2 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
@@ -279,6 +286,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_scaled_identity(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
     id2 = linalg.LinearOperatorScaledIdentity(num_rows=2, multiplier=2.2)
@@ -297,6 +305,7 @@ class AddAndReturnScaledIdentityTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_scaled_identity_plus_scaled_identity(self):
     id1 = linalg.LinearOperatorScaledIdentity(
         num_rows=2, multiplier=[2.2, 2.2, 2.2])
@@ -322,6 +331,7 @@ class AddAndReturnDiagTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnDiag()
 
+  @test_util.run_deprecated_v1
   def test_identity_plus_identity_returns_diag(self):
     id1 = linalg.LinearOperatorIdentity(num_rows=2)
     id2 = linalg.LinearOperatorIdentity(num_rows=2, batch_shape=[3])
@@ -340,6 +350,7 @@ class AddAndReturnDiagTest(test.TestCase):
     self.assertTrue(operator.is_non_singular)
     self.assertEqual("my_operator", operator.name)
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_diag(self):
     diag1 = rng.rand(2, 3, 4)
     diag2 = rng.rand(4)
@@ -366,6 +377,7 @@ class AddAndReturnTriLTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnTriL()
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_tril(self):
     diag = linalg.LinearOperatorDiag([1., 2.])
     tril = linalg.LinearOperatorLowerTriangular([[10., 0.], [30., 0.]])
@@ -389,6 +401,7 @@ class AddAndReturnMatrixTest(test.TestCase):
   def setUp(self):
     self._adder = linear_operator_addition._AddAndReturnMatrix()
 
+  @test_util.run_deprecated_v1
   def test_diag_plus_diag(self):
     diag1 = linalg.LinearOperatorDiag([1., 2.])
     diag2 = linalg.LinearOperatorDiag([-1., 3.])
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1bed4b5268e8d27a25ab735f7e3e1a6c9e4d5d95
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_adjoint_test.py
@@ -0,0 +1,118 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_adjoint
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorAdjoint = linear_operator_adjoint.LinearOperatorAdjoint  # pylint: disable=invalid-name
+
+
+class LinearOperatorAdjointTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorAdjoint(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.adjoint(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_adjoint = LinearOperatorAdjoint(operator)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_adjoint = LinearOperatorAdjoint(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_adjoint.is_positive_definite)
+    self.assertTrue(operator_adjoint.is_non_singular)
+    self.assertFalse(operator_adjoint.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorAdjoint(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorAdjoint(operator, is_self_adjoint=True)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorAdjoint(operator)
+
+    self.assertEqual("my_operator_adjoint", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e296c026c09b36afd39b891befb767a222f5f19
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_algebra_test.py
@@ -0,0 +1,133 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for registration mechanisms."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops.linalg import cholesky_registrations  # pylint: disable=unused-import
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+
+# pylint: disable=protected-access
+_CHOLESKY_DECOMPS = linear_operator_algebra._CHOLESKY_DECOMPS
+_MATMUL = linear_operator_algebra._MATMUL
+_registered_cholesky = linear_operator_algebra._registered_cholesky
+_registered_matmul = linear_operator_algebra._registered_matmul
+# pylint: enable=protected-access
+
+
+class CholeskyTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Cholesky to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterCholesky(CustomLinOp)
+    def _cholesky(a):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    with self.assertRaisesRegexp(ValueError, "positive definite"):
+      CustomLinOp(dtype=None, is_self_adjoint=True).cholesky()
+
+    with self.assertRaisesRegexp(ValueError, "self adjoint"):
+      CustomLinOp(dtype=None, is_positive_definite=True).cholesky()
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.cholesky())
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterCholesky(CustomLinOp)(lambda a: None)
+
+  def testExactCholeskyRegistrationsAllMatch(self):
+    for (k, v) in _CHOLESKY_DECOMPS.items():
+      self.assertEqual(v, _registered_cholesky(k[0]))
+
+
+class MatmulTest(test.TestCase):
+
+  def testRegistration(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+
+      def _matmul(self, a):
+        pass
+
+      def _shape(self):
+        return tensor_shape.TensorShape([1, 1])
+
+      def _shape_tensor(self):
+        pass
+
+    # Register Matmul to a lambda that spits out the name parameter
+    @linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)
+    def _matmul(a, b):  # pylint: disable=unused-argument,unused-variable
+      return "OK"
+
+    custom_linop = CustomLinOp(
+        dtype=None, is_self_adjoint=True, is_positive_definite=True)
+    self.assertEqual("OK", custom_linop.matmul(custom_linop))
+
+  def testRegistrationFailures(self):
+
+    class CustomLinOp(linear_operator.LinearOperator):
+      pass
+
+    with self.assertRaisesRegexp(TypeError, "must be callable"):
+      linear_operator_algebra.RegisterMatmul(CustomLinOp, CustomLinOp)("blah")
+
+    # First registration is OK
+    linear_operator_algebra.RegisterMatmul(
+        CustomLinOp, CustomLinOp)(lambda a: None)
+
+    # Second registration fails
+    with self.assertRaisesRegexp(ValueError, "has already been registered"):
+      linear_operator_algebra.RegisterMatmul(
+          CustomLinOp, CustomLinOp)(lambda a: None)
+
+  def testExactMatmulRegistrationsAllMatch(self):
+    for (k, v) in _MATMUL.items():
+      self.assertEqual(v, _registered_matmul(k[0], k[1]))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
index 30951b1b0eb27d95c0ce2f7d266ac6aa84da8dd4..f0cc5d709f9bfec2e3dcfadecc8f949bb6ce6e6d 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_block_diag_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_block_diag as block_diag
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -78,7 +79,9 @@ class SquareLinearOperatorBlockDiagTest(
         build_info((2, 1, 5, 5), blocks=[(2, 1, 2, 2), (1, 3, 3)]),
     ]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     expected_blocks = (
         build_info.__dict__["blocks"] if "blocks" in build_info.__dict__
@@ -98,7 +101,11 @@ class SquareLinearOperatorBlockDiagTest(
 
     operator = block_diag.LinearOperatorBlockDiag(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+            is_positive_definite=True if ensure_self_adjoint_and_pd else None)
+         for l in lin_op_matrices])
 
     # Should be auto-set.
     self.assertTrue(operator.is_square)
@@ -129,6 +136,40 @@ class SquareLinearOperatorBlockDiagTest(
     self.assertTrue(operator.is_non_singular)
     self.assertFalse(operator.is_self_adjoint)
 
+  def test_block_diag_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = block_diag.LinearOperatorBlockDiag(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        block_diag.LinearOperatorBlockDiag))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
   def test_is_non_singular_auto_set(self):
     # Matrix with two positive eigenvalues, 11 and 8.
     # The matrix values do not effect auto-setting of the flags.
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
index f1e151ebd862ffdbb0a266060dfc6ae7d5a24ef2..6366083ac5b1601c0e71a13a310c6761015bcc45 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_circulant_test.py
@@ -21,12 +21,14 @@ import contextlib
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import spectral_ops_test_util
 from tensorflow.python.ops.linalg import linalg
 from tensorflow.python.ops.linalg import linear_operator_circulant
 from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 rng = np.random.RandomState(0)
@@ -75,8 +77,8 @@ class LinearOperatorCirculantBaseTest(object):
       x = np.zeros([domain_dimension])
       # x is a basis vector.
       x[m] = 1.0
-      fft_x = math_ops.fft(x.astype(np.complex64))
-      h_convolve_x = math_ops.ifft(spectrum * fft_x)
+      fft_x = fft_ops.fft(x.astype(np.complex64))
+      h_convolve_x = fft_ops.ifft(spectrum * fft_x)
       matrix_rows.append(h_convolve_x)
     matrix = array_ops.stack(matrix_rows, axis=-1)
     return math_ops.cast(matrix, dtype)
@@ -97,7 +99,9 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # real, the matrix will not be real.
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating real spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -105,6 +109,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
     # spectrum is bounded away from zero.
     spectrum = linear_operator_test_util.random_sign_uniform(
         shape=self._shape_to_spectrum_shape(shape), minval=1., maxval=2.)
+    if ensure_self_adjoint_and_pd:
+      spectrum = math_ops.abs(spectrum)
     # If dtype is complex, cast spectrum to complex.  The imaginary part will be
     # zero, so the operator will still be self-adjoint.
     spectrum = math_ops.cast(spectrum, dtype)
@@ -115,12 +121,16 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, is_self_adjoint=True, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_self_adjoint=True,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -129,7 +139,8 @@ class LinearOperatorCirculantTestSelfAdjointOperator(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestHermitianSpectrum(
@@ -146,7 +157,9 @@ class LinearOperatorCirculantTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -160,14 +173,14 @@ class LinearOperatorCirculantTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft(pre_spectrum_c)
+    pre_h = fft_ops.ifft(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft(h_c)
+    spectrum = fft_ops.fft(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -175,12 +188,17 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        input_output_dtype=dtype,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+    )
 
     mat = self._spectrum_to_circulant_1d(spectrum, shape, dtype=dtype)
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -189,7 +207,8 @@ class LinearOperatorCirculantTestHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
 
 class LinearOperatorCirculantTestNonHermitianSpectrum(
@@ -205,7 +224,16 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  # Skip Cholesky since we are explicitly testing non-hermitian
+  # spectra.
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -226,6 +254,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_simple_hermitian_spectrum_gives_operator_with_zero_imag_part(self):
     with self.cached_session():
       spectrum = math_ops.cast([1., 1j, -1j], dtypes.complex64)
@@ -234,8 +263,10 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
-      np.testing.assert_allclose(0, imag_matrix.eval(), rtol=0, atol=eps * 3)
+      np.testing.assert_allclose(
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3)
 
+  @test_util.run_deprecated_v1
   def test_simple_positive_real_spectrum_gives_self_adjoint_pos_def_oper(self):
     with self.cached_session() as sess:
       spectrum = math_ops.cast([6., 4, 2], dtypes.complex64)
@@ -248,10 +279,11 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       operator.assert_positive_definite().run()  # Should not fail
       operator.assert_self_adjoint().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_defining_operator_using_real_convolution_kernel(self):
     with self.cached_session():
       convolution_kernel = [1., 2., 1.]
-      spectrum = math_ops.fft(
+      spectrum = fft_ops.fft(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is shape [3] ==> operator is shape [3, 3]
@@ -269,15 +301,16 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       # Make spectrum the FFT of a real convolution kernel h.  This ensures that
       # spectrum is Hermitian.
       h = linear_operator_test_util.random_normal(shape=(3, 4))
-      spectrum = math_ops.fft(math_ops.cast(h, dtypes.complex64))
+      spectrum = fft_ops.fft(math_ops.cast(h, dtypes.complex64))
       operator = linalg.LinearOperatorCirculant(
           spectrum, input_output_dtype=dtypes.complex64)
       matrix = operator.to_dense()
       imag_matrix = math_ops.imag(matrix)
       eps = np.finfo(np.float32).eps
       np.testing.assert_allclose(
-          0, imag_matrix.eval(), rtol=0, atol=eps * 3 * 4)
+          0, self.evaluate(imag_matrix), rtol=0, atol=eps * 3 * 4)
 
+  @test_util.run_deprecated_v1
   def test_convolution_kernel_same_as_first_row_of_to_dense(self):
     spectrum = [[3., 2., 1.], [2., 1.5, 1.]]
     with self.cached_session():
@@ -287,8 +320,9 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
 
       self.assertAllEqual((2, 3), h.get_shape())
       self.assertAllEqual((2, 3, 3), c.get_shape())
-      self.assertAllClose(h.eval(), c.eval()[:, :, 0])
+      self.assertAllClose(h.eval(), self.evaluate(c)[:, :, 0])
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_fails_for_singular_operator(self):
     spectrum = math_ops.cast([0, 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -296,12 +330,14 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
     spectrum = math_ops.cast([-3j, 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
     with self.cached_session():
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_fails_for_non_positive_definite(self):
     spectrum = math_ops.cast([6., 4, 2j], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -309,6 +345,7 @@ class LinearOperatorCirculantTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_fail_when_pos_def(self):
     spectrum = math_ops.cast([6., 4, 2j + 2], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant(spectrum)
@@ -397,8 +434,8 @@ class LinearOperatorCirculant2DBaseTest(object):
         x = np.zeros(block_shape)
         # x is a basis vector.
         x[n0, n1] = 1.0
-        fft_x = math_ops.fft2d(x.astype(np.complex64))
-        h_convolve_x = math_ops.ifft2d(spectrum * fft_x)
+        fft_x = fft_ops.fft2d(x.astype(np.complex64))
+        h_convolve_x = fft_ops.ifft2d(spectrum * fft_x)
         # We want the flat version of the action of the operator on a basis
         # vector, not the block version.
         h_convolve_x = array_ops.reshape(h_convolve_x, shape[:-1])
@@ -421,7 +458,9 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = build_info.shape
     # For this test class, we are creating Hermitian spectrums.
     # We also want the spectrum to have eigenvalues bounded away from zero.
@@ -435,14 +474,14 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
     #  = IFFT[EvenPartOf[pre_spectrum]]
     # is the IFFT of something that is also bounded away from zero.
     # Therefore, FFT[pre_h] would be a well-conditioned spectrum.
-    pre_h = math_ops.ifft2d(pre_spectrum_c)
+    pre_h = fft_ops.ifft2d(pre_spectrum_c)
 
     # A spectrum is Hermitian iff it is the DFT of a real convolution kernel.
     # So we will make spectrum = FFT[h], for real valued h.
     h = math_ops.real(pre_h)
     h_c = _to_complex(h)
 
-    spectrum = math_ops.fft2d(h_c)
+    spectrum = fft_ops.fft2d(h_c)
 
     lin_op_spectrum = spectrum
 
@@ -450,7 +489,10 @@ class LinearOperatorCirculant2DTestHermitianSpectrum(
       lin_op_spectrum = array_ops.placeholder_with_default(spectrum, shape=None)
 
     operator = linalg.LinearOperatorCirculant2D(
-        lin_op_spectrum, input_output_dtype=dtype)
+        lin_op_spectrum,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        input_output_dtype=dtype)
 
     mat = self._spectrum_to_circulant_2d(spectrum, shape, dtype=dtype)
 
@@ -470,7 +512,14 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
   def _dtypes_to_test(self):
     return [dtypes.complex64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     shape = build_info.shape
     # Will be well conditioned enough to get accurate solves.
     spectrum = linear_operator_test_util.random_sign_uniform(
@@ -491,6 +540,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_real_hermitian_spectrum_gives_real_symmetric_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -508,6 +558,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       np.testing.assert_allclose(0, imag_matrix, atol=1e-6)
       self.assertAllClose(matrix, matrix_transpose, atol=0)
 
+  @test_util.run_deprecated_v1
   def test_real_spectrum_gives_self_adjoint_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -519,9 +570,10 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       self.assertEqual(matrix_tensor.dtype,
                        linear_operator_circulant._DTYPE_COMPLEX)
       matrix_h = linalg.adjoint(matrix_tensor)
-      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
       self.assertAllClose(matrix, matrix_h, atol=0)
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_fails_for_singular_operator(self):
     spectrum = math_ops.cast([[0, 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -529,12 +581,14 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_fail_for_non_singular_operator(self):
     spectrum = math_ops.cast([[-3j, 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
     with self.cached_session():
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_fails_for_non_positive_definite(self):
     spectrum = math_ops.cast([[6., 4], [2j, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -542,6 +596,7 @@ class LinearOperatorCirculant2DTestNonHermitianSpectrum(
       with self.assertRaisesOpError("Not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_fail_when_pos_def(self):
     spectrum = math_ops.cast([[6., 4], [2j + 2, 3.]], dtypes.complex64)
     operator = linalg.LinearOperatorCirculant2D(spectrum)
@@ -580,6 +635,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       with spectral_ops_test_util.fft_kernel_label_map():
         yield sess
 
+  @test_util.run_deprecated_v1
   def test_real_spectrum_gives_self_adjoint_operator(self):
     with self.cached_session() as sess:
       # This is a real and hermitian spectrum.
@@ -593,16 +649,17 @@ class LinearOperatorCirculant3DTest(test.TestCase):
                        linear_operator_circulant._DTYPE_COMPLEX)
       matrix_h = linalg.adjoint(matrix_tensor)
 
-      matrix, matrix_h = sess.run([matrix_tensor, matrix_h])
+      matrix, matrix_h = self.evaluate([matrix_tensor, matrix_h])
       self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
       self.assertAllClose(matrix, matrix_h)
 
+  @test_util.run_deprecated_v1
   def test_defining_operator_using_real_convolution_kernel(self):
     with self.cached_session():
       convolution_kernel = linear_operator_test_util.random_normal(
           shape=(2, 2, 3, 5), dtype=dtypes.float32)
       # Convolution kernel is real ==> spectrum is Hermitian.
-      spectrum = math_ops.fft3d(
+      spectrum = fft_ops.fft3d(
           math_ops.cast(convolution_kernel, dtypes.complex64))
 
       # spectrum is Hermitian ==> operator is real.
@@ -615,6 +672,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       self.assertAllEqual((2, 2 * 3 * 5, 2 * 3 * 5), matrix.shape)
       np.testing.assert_allclose(0, np.imag(matrix), atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def test_defining_spd_operator_by_taking_real_part(self):
     with self.cached_session() as sess:
       # S is real and positive.
@@ -634,7 +692,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       #         =      H1  +      H2
       # where H1 is real since it is Hermitian,
       # and H2 is imaginary since it is anti-Hermitian.
-      ifft_s = math_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
+      ifft_s = fft_ops.ifft3d(math_ops.cast(s, dtypes.complex64))
 
       # Throw away H2, keep H1.
       real_ifft_s = math_ops.real(ifft_s)
@@ -642,7 +700,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # This is the perfect spectrum!
       # spectrum = DFT[H1]
       #          = S1,
-      fft_real_ifft_s = math_ops.fft3d(
+      fft_real_ifft_s = fft_ops.fft3d(
           math_ops.cast(real_ifft_s, dtypes.complex64))
 
       # S1 is Hermitian ==> operator is real.
@@ -665,7 +723,7 @@ class LinearOperatorCirculant3DTest(test.TestCase):
       # S2 is anti-Hermitian ==> operator is imaginary.
       # S2 is real ==> operator is self-adjoint.
       imag_ifft_s = math_ops.imag(ifft_s)
-      fft_imag_ifft_s = math_ops.fft3d(
+      fft_imag_ifft_s = fft_ops.fft3d(
           1j * math_ops.cast(imag_ifft_s, dtypes.complex64))
       operator_imag = linalg.LinearOperatorCirculant3D(fft_imag_ifft_s)
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
index 02f56db5962748ce6fc247f7e672044aeb5e4b3e..214b73aa2f34d436e3430e4e7489c90adb6d52f9 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_composition_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -42,8 +43,12 @@ class SquareLinearOperatorCompositionTest(
     self._rtol[dtypes.float32] = 1e-4
     self._rtol[dtypes.complex64] = 1e-4
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky not implemented.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
-    sess = ops.get_default_session()
     shape = list(build_info.shape)
 
     # Either 1 or 2 matrices, depending.
@@ -175,6 +180,7 @@ class NonSquareLinearOperatorCompositionTest(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_static_shapes(self):
     operators = [
         linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 4)),
@@ -183,6 +189,7 @@ class NonSquareLinearOperatorCompositionTest(
     operator = linalg.LinearOperatorComposition(operators)
     self.assertAllEqual((2, 3, 5), operator.shape)
 
+  @test_util.run_deprecated_v1
   def test_shape_tensors_when_statically_available(self):
     operators = [
         linalg.LinearOperatorFullMatrix(rng.rand(2, 3, 4)),
@@ -192,6 +199,7 @@ class NonSquareLinearOperatorCompositionTest(
     with self.cached_session():
       self.assertAllEqual((2, 3, 5), operator.shape_tensor().eval())
 
+  @test_util.run_deprecated_v1
   def test_shape_tensors_when_only_dynamically_available(self):
     mat_1 = rng.rand(1, 2, 3, 4)
     mat_2 = rng.rand(1, 2, 4, 5)
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
index 0758349531e2da9d29342cbe149933b2fa30bfd9..dcbc0dd7c97184df150fc7094a28441fcfaa1257 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_diag_test.py
@@ -17,6 +17,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -32,17 +33,26 @@ class LinearOperatorDiagTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
     diag = linear_operator_test_util.random_sign_uniform(
         shape[:-1], minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      diag = math_ops.cast(math_ops.abs(diag), dtype=dtype)
+
     lin_op_diag = diag
 
     if use_placeholder:
       lin_op_diag = array_ops.placeholder_with_default(diag, shape=None)
 
-    operator = linalg.LinearOperatorDiag(lin_op_diag)
+    operator = linalg.LinearOperatorDiag(
+        lin_op_diag,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     matrix = array_ops.matrix_diag(diag)
 
@@ -71,6 +81,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("non-positive real.*not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_raise_if_pd_and_complex(self):
     with self.cached_session():
       x = [1., 2.]
@@ -87,6 +98,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("Singular operator"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_raise_for_complex_nonsingular(self):
     with self.cached_session():
       x = [1., 0.]
@@ -104,6 +116,7 @@ class LinearOperatorDiagTest(
       with self.assertRaisesOpError("imaginary.*not self-adjoint"):
         operator.assert_self_adjoint().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint_does_not_raise_for_diag_with_zero_imag(self):
     with self.cached_session():
       x = [1., 0.]
@@ -138,12 +151,52 @@ class LinearOperatorDiagTest(
       operator_matmul = operator.matmul(x)
       mat_matmul = math_ops.matmul(mat, x)
       self.assertAllEqual(operator_matmul.get_shape(), mat_matmul.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, mat_matmul]))
+      self.assertAllClose(*self.evaluate([operator_matmul, mat_matmul]))
 
       operator_solve = operator.solve(x)
       mat_solve = linalg_ops.matrix_solve(mat, x)
       self.assertAllEqual(operator_solve.get_shape(), mat_solve.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, mat_solve]))
+      self.assertAllClose(*self.evaluate([operator_solve, mat_solve]))
+
+  def test_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorDiag([2., 3.])
+    operator2 = linalg_lib.LinearOperatorDiag([1., 2.])
+    operator3 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([2., 6.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator1.matmul(operator3)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+    operator_matmul = operator3.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorDiag))
+    self.assertAllClose([6., 9.], self.evaluate(operator_matmul.diag))
+
+  def test_diag_cholesky_type(self):
+    diag = [1., 3., 5., 8.]
+    operator = linalg.LinearOperatorDiag(
+        diag,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg.LinearOperatorDiag))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
index 8c2d2cf0774b682835f521f2e434d87fbc2aec84..aff0b1ae14ce5bfb62ba9984f60cf30f9b553ea7 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_full_matrix_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
@@ -33,7 +34,9 @@ class SquareLinearOperatorFullMatrixTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -44,7 +47,12 @@ class SquareLinearOperatorFullMatrixTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    # Set the hints to none to test non-symmetric PD code paths.
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     return operator, matrix
 
@@ -62,6 +70,7 @@ class SquareLinearOperatorFullMatrixTest(
     # Auto-detected.
     self.assertTrue(operator.is_square)
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_raises_if_cond_too_big_but_finite(self):
     with self.cached_session():
       tril = linear_operator_test_util.random_tril_matrix(
@@ -123,7 +132,13 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
   def _dtypes_to_test(self):
     return [dtypes.float32, dtypes.float64]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
+    # Matrix is always symmetric and positive definite in this class.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
 
     matrix = linear_operator_test_util.random_positive_definite_matrix(
@@ -134,7 +149,11 @@ class SquareLinearOperatorFullMatrixSymmetricPositiveDefiniteTest(
     if use_placeholder:
       lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
 
-    operator = linalg.LinearOperatorFullMatrix(lin_op_matrix, is_square=True)
+    operator = linalg.LinearOperatorFullMatrix(
+        lin_op_matrix,
+        is_square=True,
+        is_self_adjoint=True,
+        is_positive_definite=True)
 
     return operator, matrix
 
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
index 465a8194dd98aa9ed704635d14c1315ccf211b0e..2da5e712d77b88ca6bb20a5f0920335f00c7b594 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_identity_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
@@ -41,7 +43,12 @@ class LinearOperatorIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Identity matrix is already Hermitian Positive Definite.
+    del ensure_self_adjoint_and_pd
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -54,16 +61,19 @@ class LinearOperatorIdentityTest(
 
     return operator, mat
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_positive_definite().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
       operator.assert_non_singular().run()  # Should not fail
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorIdentity(num_rows=2)
@@ -77,7 +87,7 @@ class LinearOperatorIdentityTest(
           num_rows=2, dtype=dtypes.float16)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(x, y.eval())
+      self.assertAllClose(x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     with self.assertRaisesRegexp(ValueError, "must be a 0-D Tensor"):
@@ -103,6 +113,7 @@ class LinearOperatorIdentityTest(
     with self.assertRaisesRegexp(ValueError, "must be non-negative"):
       linalg_lib.LinearOperatorIdentity(num_rows=2, batch_shape=[-2])
 
+  @test_util.run_deprecated_v1
   def test_non_scalar_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -111,6 +122,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be a 0-D Tensor"):
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
+  @test_util.run_deprecated_v1
   def test_negative_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -119,6 +131,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be non-negative"):
         operator.to_dense().eval(feed_dict={num_rows: -2})
 
+  @test_util.run_deprecated_v1
   def test_non_1d_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -127,6 +140,7 @@ class LinearOperatorIdentityTest(
       with self.assertRaisesOpError("must be a 1-D"):
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
+  @test_util.run_deprecated_v1
   def test_negative_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -141,6 +155,7 @@ class LinearOperatorIdentityTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -164,8 +179,9 @@ class LinearOperatorIdentityTest(
       expected = x
 
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
+  @test_util.run_deprecated_v1
   def test_default_batch_shape_broadcasts_with_everything_dynamic(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
@@ -201,8 +217,9 @@ class LinearOperatorIdentityTest(
 
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
+  @test_util.run_deprecated_v1
   def test_broadcast_matmul_dynamic_shapes(self):
     # These cannot be done in the automated (base test class) tests since they
     # test shapes that tf.batch_matmul cannot handle.
@@ -242,6 +259,16 @@ class LinearOperatorIdentityTest(
           is_non_singular=None,
       )
 
+  def test_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorIdentity(
+        num_rows=2,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorIdentity))
+
 
 class LinearOperatorScaledIdentityTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
@@ -253,7 +280,10 @@ class LinearOperatorScaledIdentityTest(
     # 16bit.
     return [dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
 
@@ -266,6 +296,9 @@ class LinearOperatorScaledIdentityTest(
     multiplier = linear_operator_test_util.random_sign_uniform(
         shape=batch_shape, minval=1., maxval=2., dtype=dtype)
 
+    if ensure_self_adjoint_and_pd:
+      # Abs on complex64 will result in a float32, so we cast back up.
+      multiplier = math_ops.cast(math_ops.abs(multiplier), dtype=dtype)
 
     # Nothing to feed since LinearOperatorScaledIdentity takes no Tensor args.
     lin_op_multiplier = multiplier
@@ -275,7 +308,10 @@ class LinearOperatorScaledIdentityTest(
           multiplier, shape=None)
 
     operator = linalg_lib.LinearOperatorScaledIdentity(
-        num_rows, lin_op_multiplier)
+        num_rows,
+        lin_op_multiplier,
+        is_self_adjoint=True if ensure_self_adjoint_and_pd else None,
+        is_positive_definite=True if ensure_self_adjoint_and_pd else None)
 
     multiplier_matrix = array_ops.expand_dims(
         array_ops.expand_dims(multiplier, -1), -1)
@@ -284,6 +320,7 @@ class LinearOperatorScaledIdentityTest(
 
     return operator, matrix
 
+  @test_util.run_deprecated_v1
   def test_assert_positive_definite_does_not_raise_when_positive(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -297,6 +334,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("not positive definite"):
         operator.assert_positive_definite().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_non_singular_does_not_raise_when_non_singular(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -310,6 +348,7 @@ class LinearOperatorScaledIdentityTest(
       with self.assertRaisesOpError("was singular"):
         operator.assert_non_singular().run()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint_does_not_raise_when_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -332,7 +371,7 @@ class LinearOperatorScaledIdentityTest(
           num_rows=2, multiplier=multiplier)
       x = rng.randn(2, 3).astype(np.float16)
       y = operator.matmul(x)
-      self.assertAllClose(multiplier[..., None, None] * x, y.eval())
+      self.assertAllClose(multiplier[..., None, None] * x, self.evaluate(y))
 
   def test_non_scalar_num_rows_raises_static(self):
     # Many "test_...num_rows" tests are performed in LinearOperatorIdentity.
@@ -347,6 +386,7 @@ class LinearOperatorScaledIdentityTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -378,13 +418,13 @@ class LinearOperatorScaledIdentityTest(
       expected = x * 2.2 + zeros
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2 + zeros
       operator_solve = operator.solve(x)
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, expected]))
+      self.assertAllClose(*self.evaluate([operator_solve, expected]))
 
   def test_broadcast_matmul_and_solve_scalar_scale_multiplier(self):
     # These cannot be done in the automated (base test class) tests since they
@@ -404,13 +444,13 @@ class LinearOperatorScaledIdentityTest(
       expected = x * 2.2
       operator_matmul = operator.matmul(x)
       self.assertAllEqual(operator_matmul.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_matmul, expected]))
+      self.assertAllClose(*self.evaluate([operator_matmul, expected]))
 
       # Test solve
       expected = x / 2.2
       operator_solve = operator.solve(x)
       self.assertAllEqual(operator_solve.get_shape(), expected.get_shape())
-      self.assertAllClose(*sess.run([operator_solve, expected]))
+      self.assertAllClose(*self.evaluate([operator_solve, expected]))
 
   def test_is_x_flags(self):
     operator = linalg_lib.LinearOperatorScaledIdentity(
@@ -420,6 +460,41 @@ class LinearOperatorScaledIdentityTest(
     self.assertTrue(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint is None)
 
+  def test_identity_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2, multiplier=3.)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    self.assertTrue(isinstance(
+        operator1.matmul(operator1),
+        linalg_lib.LinearOperatorIdentity))
+
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorScaledIdentity))
+    self.assertAllClose(3., self.evaluate(operator_matmul.multiplier))
+
+  def test_scaled_identity_cholesky_type(self):
+    operator = linalg_lib.LinearOperatorScaledIdentity(
+        num_rows=2,
+        multiplier=3.,
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    self.assertTrue(isinstance(
+        operator.cholesky(),
+        linalg_lib.LinearOperatorScaledIdentity))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9344c526ee8ce3bd68de6876626a86a9ad6ab0d8
--- /dev/null
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_inversion_test.py
@@ -0,0 +1,130 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.linalg import linalg as linalg_lib
+from tensorflow.python.ops.linalg import linear_operator_inversion
+from tensorflow.python.ops.linalg import linear_operator_test_util
+from tensorflow.python.platform import test
+
+linalg = linalg_lib
+
+LinearOperatorInversion = linear_operator_inversion.LinearOperatorInversion  # pylint: disable=invalid-name
+
+
+class LinearOperatorInversionTest(
+    linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
+  """Most tests done in the base class LinearOperatorDerivedClassTest."""
+
+  def setUp(self):
+    self._atol[dtypes.complex64] = 1e-5
+    self._rtol[dtypes.complex64] = 1e-5
+
+  def _operator_and_matrix(self,
+                           build_info,
+                           dtype,
+                           use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
+    shape = list(build_info.shape)
+
+    if ensure_self_adjoint_and_pd:
+      matrix = linear_operator_test_util.random_positive_definite_matrix(
+          shape, dtype, force_well_conditioned=True)
+    else:
+      matrix = linear_operator_test_util.random_tril_matrix(
+          shape, dtype, force_well_conditioned=True, remove_upper=True)
+
+    lin_op_matrix = matrix
+
+    if use_placeholder:
+      lin_op_matrix = array_ops.placeholder_with_default(matrix, shape=None)
+
+    if ensure_self_adjoint_and_pd:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorFullMatrix(
+              lin_op_matrix, is_positive_definite=True, is_self_adjoint=True))
+    else:
+      operator = LinearOperatorInversion(
+          linalg.LinearOperatorLowerTriangular(lin_op_matrix))
+
+    return operator, linalg.inv(matrix)
+
+  def test_base_operator_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    operator_inv = LinearOperatorInversion(operator)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_supplied_hint_used(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    operator_inv = LinearOperatorInversion(
+        operator,
+        is_positive_definite=True,
+        is_non_singular=True,
+        is_self_adjoint=False)
+    self.assertTrue(operator_inv.is_positive_definite)
+    self.assertTrue(operator_inv.is_non_singular)
+    self.assertFalse(operator_inv.is_self_adjoint)
+
+  def test_contradicting_hints_raise(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 0.], [1., 1.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, is_positive_definite=False)
+    with self.assertRaisesRegexp(ValueError, "positive-definite"):
+      LinearOperatorInversion(operator, is_positive_definite=True)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_self_adjoint=False)
+    with self.assertRaisesRegexp(ValueError, "self-adjoint"):
+      LinearOperatorInversion(operator, is_self_adjoint=True)
+
+  def test_singular_raises(self):
+    # The matrix values do not effect auto-setting of the flags.
+    matrix = [[1., 1.], [1., 1.]]
+
+    operator = linalg.LinearOperatorFullMatrix(matrix, is_non_singular=False)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator)
+
+    operator = linalg.LinearOperatorFullMatrix(matrix)
+    with self.assertRaisesRegexp(ValueError, "is_non_singular"):
+      LinearOperatorInversion(operator, is_non_singular=False)
+
+  def test_name(self):
+    matrix = [[11., 0.], [1., 8.]]
+    operator = linalg.LinearOperatorFullMatrix(
+        matrix, name="my_operator", is_non_singular=True)
+
+    operator = LinearOperatorInversion(operator)
+
+    self.assertEqual("my_operator_inv", operator.name)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
index f039b60f6480e7921fc776ccc223c43b8573e8f0..513b246803233f1117b48f1a3d413be42f15238a 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_kronecker_test.py
@@ -21,9 +21,11 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_kronecker as kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular as lower_triangular
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import test
@@ -52,6 +54,7 @@ def _kronecker_dense(factors):
 
 class KroneckerDenseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testKroneckerDenseMatrix(self):
     x = ops.convert_to_tensor([[2., 3.], [1., 2.]], dtype=dtypes.float32)
     y = ops.convert_to_tensor([[1., 2.], [5., -1.]], dtype=dtypes.float32)
@@ -69,8 +72,8 @@ class KroneckerDenseTest(test.TestCase):
         [5., 10., -1., -2.]], dtype=dtypes.float32)
 
     with self.cached_session():
-      self.assertAllClose(_kronecker_dense([x, y]).eval(), z.eval())
-      self.assertAllClose(_kronecker_dense([y, x]).eval(), w.eval())
+      self.assertAllClose(_kronecker_dense([x, y]).eval(), self.evaluate(z))
+      self.assertAllClose(_kronecker_dense([y, x]).eval(), self.evaluate(w))
 
 
 class SquareLinearOperatorKroneckerTest(
@@ -99,7 +102,12 @@ class SquareLinearOperatorKroneckerTest(
   def _tests_to_skip(self):
     return ["det", "solve", "solve_with_broadcast"]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    # Kronecker products constructed below will be from symmetric
+    # positive-definite matrices.
+    del ensure_self_adjoint_and_pd
     shape = list(build_info.shape)
     expected_factors = build_info.__dict__["factors"]
     matrices = [
@@ -116,7 +124,11 @@ class SquareLinearOperatorKroneckerTest(
 
     operator = kronecker.LinearOperatorKronecker(
         [linalg.LinearOperatorFullMatrix(
-            l, is_square=True) for l in lin_op_matrices])
+            l,
+            is_square=True,
+            is_self_adjoint=True,
+            is_positive_definite=True)
+         for l in lin_op_matrices])
 
     matrices = linear_operator_util.broadcast_matrix_batch_dims(matrices)
 
@@ -180,6 +192,40 @@ class SquareLinearOperatorKroneckerTest(
     with self.assertRaisesRegexp(ValueError, ">=1 operators"):
       kronecker.LinearOperatorKronecker([])
 
+  def test_kronecker_cholesky_type(self):
+    matrix = [[1., 0.], [0., 1.]]
+    operator = kronecker.LinearOperatorKronecker(
+        [
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+            linalg.LinearOperatorFullMatrix(
+                matrix,
+                is_positive_definite=True,
+                is_self_adjoint=True,
+            ),
+        ],
+        is_positive_definite=True,
+        is_self_adjoint=True,
+    )
+    cholesky_factor = operator.cholesky()
+    self.assertTrue(isinstance(
+        cholesky_factor,
+        kronecker.LinearOperatorKronecker))
+    self.assertEqual(2, len(cholesky_factor.operators))
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[0],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+    self.assertTrue(
+        isinstance(
+            cholesky_factor.operators[1],
+            lower_triangular.LinearOperatorLowerTriangular)
+    )
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
index 207e5edf818f988f1e87a3c21c320d57841145d1..2920f3ae7ebc549ae960215445fc933bb30913dd 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_low_rank_update_test.py
@@ -69,7 +69,8 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     return linear_operator_test_util.random_uniform(
         diag_shape, minval=1e-4, maxval=1., dtype=dtype)
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(self, build_info, dtype, use_placeholder,
+                           ensure_self_adjoint_and_pd=False):
     # Recall A = L + UDV^H
     shape = list(build_info.shape)
     diag_shape = shape[:-1]
@@ -93,7 +94,7 @@ class BaseLinearOperatorLowRankUpdatetest(object):
     lin_op_v = v
 
     # D
-    if self._is_diag_update_positive:
+    if self._is_diag_update_positive or ensure_self_adjoint_and_pd:
       diag_update = self._gen_positive_diag(dtype, diag_update_shape)
     else:
       diag_update = linear_operator_test_util.random_normal(
@@ -178,6 +179,10 @@ class LinearOperatorLowRankUpdatetestWithDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UDU^H, D !> 0, L > 0 ==> A !> 0 and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = True
   _is_diag_update_positive = False
   _use_v = False
@@ -217,6 +222,10 @@ class LinearOperatorLowRankUpdatetestNoDiagCannotUseCholesky(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """A = L + UV^H, L > 0 ==> A is not symmetric and we cannot use a Cholesky."""
 
+  @property
+  def _tests_to_skip(self):
+    return ["cholesky"]
+
   _use_diag_update = False
   _is_diag_update_positive = None
   _use_v = True
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
index e3c8f5cb688553bad4cbfcfc7fb5e92130ac76a2..bd41f9ed9d335f6f7e77cb7a19c5db1e59482d48 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_lower_triangular_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
 from tensorflow.python.platform import test
@@ -29,6 +30,11 @@ class LinearOperatorLowerTriangularTest(
     linear_operator_test_util.SquareLinearOperatorDerivedClassTest):
   """Most tests done in the base class LinearOperatorDerivedClassTest."""
 
+  @property
+  def _tests_to_skip(self):
+    # Cholesky does not make sense for triangular matrices.
+    return ["cholesky"]
+
   def _operator_and_matrix(self, build_info, dtype, use_placeholder):
     shape = list(build_info.shape)
     # Upper triangle will be nonzero, but ignored.
@@ -71,6 +77,30 @@ class LinearOperatorLowerTriangularTest(
     with self.assertRaisesRegexp(ValueError, "at least 2 dimensions"):
       linalg.LinearOperatorLowerTriangular([1.])
 
+  def test_triangular_diag_matmul(self):
+    operator1 = linalg_lib.LinearOperatorLowerTriangular(
+        [[1., 0., 0.], [2., 1., 0.], [2., 3., 3.]])
+    operator2 = linalg_lib.LinearOperatorDiag([2., 2., 3.])
+    operator_matmul = operator1.matmul(operator2)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator1.to_dense(),
+            operator2.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
+    operator_matmul = operator2.matmul(operator1)
+    self.assertTrue(isinstance(
+        operator_matmul,
+        linalg_lib.LinearOperatorLowerTriangular))
+    self.assertAllClose(
+        math_ops.matmul(
+            operator2.to_dense(),
+            operator1.to_dense()),
+        self.evaluate(operator_matmul.to_dense()))
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
index 819347343b1d22257e9f3579caced56128596723..18e13a76a097f72887cacc5d3de40b8d6babcb52 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_test.py
@@ -22,6 +22,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -107,6 +108,7 @@ class LinearOperatorTest(test.TestCase):
     self.assertAllEqual(4, operator.domain_dimension)
     self.assertAllEqual(3, operator.range_dimension)
 
+  @test_util.run_deprecated_v1
   def test_all_shape_methods_defined_by_the_one_method_shape(self):
     with self.cached_session():
       shape = (1, 2, 3, 4)
@@ -134,8 +136,9 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       operator_dense = operator.to_dense()
       self.assertAllEqual((2, 3, 4), operator_dense.get_shape())
-      self.assertAllClose(matrix, operator_dense.eval())
+      self.assertAllClose(matrix, self.evaluate(operator_dense))
 
+  @test_util.run_deprecated_v1
   def test_generic_to_dense_method_non_square_matrix_tensor(self):
     matrix = rng.randn(2, 3, 4)
     matrix_ph = array_ops.placeholder(dtypes.float64)
@@ -152,7 +155,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       y = operator.matvec(x)
       self.assertAllEqual((2,), y.get_shape())
-      self.assertAllClose([1., 2.], y.eval())
+      self.assertAllClose([1., 2.], self.evaluate(y))
 
   def test_solvevec(self):
     matrix = [[1., 0], [0., 2.]]
@@ -161,7 +164,7 @@ class LinearOperatorTest(test.TestCase):
     with self.cached_session():
       x = operator.solvevec(y)
       self.assertAllEqual((2,), x.get_shape())
-      self.assertAllClose([1., 1 / 2.], x.eval())
+      self.assertAllClose([1., 1 / 2.], self.evaluate(x))
 
   def test_is_square_set_to_true_for_square_static_shapes(self):
     operator = LinearOperatorShape(shape=(2, 4, 4))
@@ -175,6 +178,7 @@ class LinearOperatorTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "but.*was square"):
       _ = LinearOperatorShape(shape=(2, 4, 4), is_square=False).is_square
 
+  @test_util.run_deprecated_v1
   def test_is_square_set_inconsistent_with_other_hints_raises(self):
     with self.assertRaisesRegexp(ValueError, "is always square"):
       matrix = array_ops.placeholder(dtypes.float32)
@@ -185,6 +189,7 @@ class LinearOperatorTest(test.TestCase):
       LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
+  @test_util.run_deprecated_v1
   def test_non_square_operators_raise_on_determinant_and_solve(self):
     operator = LinearOperatorShape((2, 3))
     with self.assertRaisesRegexp(NotImplementedError, "not be square"):
@@ -199,6 +204,7 @@ class LinearOperatorTest(test.TestCase):
       LinearOperatorMatmulSolve(
           matrix, is_positive_definite=True, is_square=False)
 
+  @test_util.run_deprecated_v1
   def test_is_square_manual_set_works(self):
     matrix = array_ops.placeholder(dtypes.float32)
     # Default is None.
@@ -208,6 +214,80 @@ class LinearOperatorTest(test.TestCase):
     operator = LinearOperatorMatmulSolve(matrix, is_square=True)
     self.assertTrue(operator.is_square)
 
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hints_closed(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(matrix)
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=True,
+        is_self_adjoint=True,
+        is_positive_definite=True,
+        is_square=True,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertTrue(operator_matmul.is_non_singular)
+    self.assertTrue(operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hints_false(self):
+    matrix = array_ops.placeholder(dtypes.float32)
+    operator1 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=True,
+    )
+
+    operator_matmul = operator1.matmul(operator1)
+
+    self.assertTrue(operator_matmul.is_square)
+    self.assertFalse(operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+    operator2 = LinearOperatorMatmulSolve(
+        matrix,
+        is_non_singular=False,
+        is_self_adjoint=False,
+        is_positive_definite=False,
+        is_square=False,
+    )
+
+    operator_matmul = operator2.matmul(operator2)
+
+    self.assertEqual(None, operator_matmul.is_square)
+    self.assertEqual(None, operator_matmul.is_non_singular)
+    self.assertEqual(None, operator_matmul.is_self_adjoint)
+    self.assertEqual(None, operator_matmul.is_positive_definite)
+
+  @test_util.run_v1_only("b/120545219")
+  def test_linear_operator_matmul_hint_infer_square(self):
+    matrix1 = array_ops.placeholder(shape=[2, 3], dtype=dtypes.float32)
+    matrix2 = array_ops.placeholder(shape=[3, 2], dtype=dtypes.float32)
+    matrix3 = array_ops.placeholder(shape=[3, 4], dtype=dtypes.float32)
+
+    operator1 = LinearOperatorMatmulSolve(matrix1, is_square=False)
+    operator2 = LinearOperatorMatmulSolve(matrix2, is_square=False)
+    operator3 = LinearOperatorMatmulSolve(matrix3, is_square=False)
+
+    self.assertTrue(operator1.matmul(operator2).is_square)
+    self.assertTrue(operator2.matmul(operator1).is_square)
+    self.assertFalse(operator1.matmul(operator3).is_square)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
index 31fb19e4a69b6847e06cc0aca2e86f91f78e3762..d1e6c37e35af8664454c20f60e712ed6ff7c6fe6 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_util_test.py
@@ -21,6 +21,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -32,12 +33,14 @@ rng = np.random.RandomState(0)
 
 class AssertZeroImagPartTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([0., 2, 3])
     with self.cached_session():
       # Should not raise.
       linear_operator_util.assert_zero_imag_part(x, message="ABC123").run()
 
+  @test_util.run_deprecated_v1
   def test_complex_tensor_with_imag_zero_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([0., 0, 0])
@@ -57,6 +60,7 @@ class AssertZeroImagPartTest(test.TestCase):
 
 class AssertNoEntriesWithModulusZeroTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_nonzero_real_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 2, 3])
     with self.cached_session():
@@ -64,6 +68,7 @@ class AssertNoEntriesWithModulusZeroTest(test.TestCase):
       linear_operator_util.assert_no_entries_with_modulus_zero(
           x, message="ABC123").run()
 
+  @test_util.run_deprecated_v1
   def test_nonzero_complex_tensor_doesnt_raise(self):
     x = ops.convert_to_tensor([1., 0, 3])
     y = ops.convert_to_tensor([1., 2, 0])
@@ -102,8 +107,9 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     self.assertTrue(isinstance(tensor, ops.Tensor))
 
     with self.cached_session():
-      self.assertAllClose(arr, tensor.eval())
+      self.assertAllClose(arr, self.evaluate(tensor))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast(self):
     # x.batch_shape = [3, 1, 2]
     # y.batch_shape = [4, 1]
@@ -119,7 +125,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
-      x_bc_, y_bc_ = sess.run([x_bc, y_bc])
+      x_bc_, y_bc_ = self.evaluate([x_bc, y_bc])
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
@@ -138,10 +144,11 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
     with self.cached_session() as sess:
       self.assertAllEqual(x_bc_expected.shape, x_bc.get_shape())
       self.assertAllEqual(y_bc_expected.shape, y_bc.get_shape())
-      x_bc_, y_bc_ = sess.run([x_bc, y_bc])
+      x_bc_, y_bc_ = self.evaluate([x_bc, y_bc])
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_32bit(self):
     # x.batch_shape = [3, 1, 2]
     # y.batch_shape = [4, 1]
@@ -162,6 +169,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
       self.assertAllClose(x_bc_expected, x_bc_)
       self.assertAllClose(y_bc_expected, y_bc_)
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_32bit_second_arg_higher_rank(self):
     # x.batch_shape =    [1, 2]
     # y.batch_shape = [3, 4, 1]
@@ -195,6 +203,7 @@ class BroadcastMatrixBatchDimsTest(test.TestCase):
 
 class CholeskySolveWithBroadcastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast(self):
     # batch_shape = [2]
     chol = rng.rand(3, 3)
@@ -205,8 +214,9 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
       result = linear_operator_util.cholesky_solve_with_broadcast(chol, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.cholesky_solve(chol_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
     chol = rng.rand(2, 3, 3)
@@ -233,6 +243,7 @@ class CholeskySolveWithBroadcastTest(test.TestCase):
 
 class MatmulWithBroadcastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_x_has_extra_dims(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
@@ -244,8 +255,9 @@ class MatmulWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 1, 7), result.get_shape())
       expected = math_ops.matmul(x, y_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_y_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -261,8 +273,9 @@ class MatmulWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matmul_with_broadcast(x, y)
       self.assertAllEqual((2, 3, 5, 5), result.get_shape())
       expected = math_ops.matmul(x_broadcast, y)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_y_has_extra_dims_transpose_a_and_b(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -280,8 +293,9 @@ class MatmulWithBroadcastTest(test.TestCase):
       self.assertAllEqual((2, 3, 5, 1), result.get_shape())
       expected = math_ops.matmul(
           x_broadcast, y, transpose_a=True, transpose_b=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_y_has_extra_dims_transpose_dynamic(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -308,6 +322,7 @@ class MatmulWithBroadcastTest(test.TestCase):
                               y_ph: y
                           }))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     # for each batch member, we have a 1x3 matrix times a 3x7 matrix ==> 1x7
@@ -333,6 +348,7 @@ class MatmulWithBroadcastTest(test.TestCase):
 
 class MatrixSolveWithBroadcastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_matrix_has_extra_dims(self):
     # batch_shape = [2]
     matrix = rng.rand(2, 3, 3)
@@ -344,8 +360,9 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix, rhs_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_rhs_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -362,8 +379,9 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
       result = linear_operator_util.matrix_solve_with_broadcast(matrix, rhs)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_rhs_has_extra_dims_dynamic(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -385,12 +403,13 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
       self.assertAllEqual(3, result.shape.ndims)
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs)
       self.assertAllClose(
-          expected.eval(),
+          self.evaluate(expected),
           result.eval(feed_dict={
               matrix_ph: matrix,
               rhs_ph: rhs
           }))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -408,8 +427,9 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
           matrix, rhs, adjoint=True)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_solve(matrix_broadcast, rhs, adjoint=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2, 2]
     matrix = rng.rand(2, 3, 3)
@@ -436,6 +456,7 @@ class MatrixSolveWithBroadcastTest(test.TestCase):
 
 class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_matrix_has_extra_dims(self):
     # batch_shape = [2]
     matrix = rng.rand(2, 3, 3)
@@ -447,8 +468,9 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 7), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(matrix, rhs_broadcast)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_rhs_has_extra_dims(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -466,8 +488,9 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
           matrix, rhs)
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(matrix_broadcast, rhs)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_static_dims_broadcast_rhs_has_extra_dims_and_adjoint(self):
     # Since the second arg has extra dims, and the domain dim of the first arg
     # is larger than the number of linear equations, code will "flip" the extra
@@ -486,8 +509,9 @@ class MatrixTriangularSolveWithBroadcastTest(test.TestCase):
       self.assertAllEqual((2, 3, 2), result.get_shape())
       expected = linalg_ops.matrix_triangular_solve(
           matrix_broadcast, rhs, adjoint=True)
-      self.assertAllClose(expected.eval(), result.eval())
+      self.assertAllClose(expected.eval(), self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def test_dynamic_dims_broadcast_64bit(self):
     # batch_shape = [2]
     matrix = rng.rand(2, 3, 3)
@@ -522,6 +546,7 @@ class DomainDimensionStubOperator(object):
 
 class AssertCompatibleMatrixDimensionsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_compatible_dimensions_do_not_raise(self):
     with self.cached_session():
       x = ops.convert_to_tensor(rng.rand(2, 3, 4))
diff --git a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
index ad97d1a93ea68ce3f76b78eddb615fca01d8c74a..eb0b8ef127749e9e5709861d14b143877790bffd 100644
--- a/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
+++ b/tensorflow/python/kernel_tests/linalg/linear_operator_zeros_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.linalg import linalg as linalg_lib
 from tensorflow.python.ops.linalg import linear_operator_test_util
@@ -35,7 +36,7 @@ class LinearOperatorZerosTest(
 
   @property
   def _tests_to_skip(self):
-    return ["log_abs_det", "solve", "solve_with_broadcast"]
+    return ["cholesky", "log_abs_det", "solve", "solve_with_broadcast"]
 
   @property
   def _operator_build_infos(self):
@@ -46,7 +47,10 @@ class LinearOperatorZerosTest(
         build_info((3, 4, 4)),
         build_info((2, 1, 4, 4))]
 
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
+    del ensure_self_adjoint_and_pd
     del use_placeholder
     shape = list(build_info.shape)
     assert shape[-1] == shape[-2]
@@ -70,6 +74,7 @@ class LinearOperatorZerosTest(
       operator = linalg_lib.LinearOperatorZeros(num_rows=2)
       operator.assert_non_singular()
 
+  @test_util.run_deprecated_v1
   def test_assert_self_adjoint(self):
     with self.cached_session():
       operator = linalg_lib.LinearOperatorZeros(num_rows=2)
@@ -105,6 +110,7 @@ class LinearOperatorZerosTest(
     with self.assertRaisesRegexp(ValueError, "must be non-negative"):
       linalg_lib.LinearOperatorZeros(num_rows=2, batch_shape=[-2])
 
+  @test_util.run_deprecated_v1
   def test_non_scalar_num_rows_raises_dynamic(self):
     with self.cached_session():
       num_rows = array_ops.placeholder(dtypes.int32)
@@ -113,6 +119,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be a 0-D Tensor"):
         operator.to_dense().eval(feed_dict={num_rows: [2]})
 
+  @test_util.run_deprecated_v1
   def test_negative_num_rows_raises_dynamic(self):
     with self.cached_session():
       n = array_ops.placeholder(dtypes.int32)
@@ -126,6 +133,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be non-negative"):
         operator.to_dense().eval(feed_dict={n: -2})
 
+  @test_util.run_deprecated_v1
   def test_non_1d_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -134,6 +142,7 @@ class LinearOperatorZerosTest(
       with self.assertRaisesOpError("must be a 1-D"):
         operator.to_dense().eval(feed_dict={batch_shape: 2})
 
+  @test_util.run_deprecated_v1
   def test_negative_batch_shape_raises_dynamic(self):
     with self.cached_session():
       batch_shape = array_ops.placeholder(dtypes.int32)
@@ -148,6 +157,7 @@ class LinearOperatorZerosTest(
     with self.assertRaisesRegexp(ValueError, "Dimensions.*not compatible"):
       operator.matmul(x)
 
+  @test_util.run_deprecated_v1
   def test_wrong_matrix_dimensions_raises_dynamic(self):
     num_rows = array_ops.placeholder(dtypes.int32)
     x = array_ops.placeholder(dtypes.float32)
@@ -166,6 +176,17 @@ class LinearOperatorZerosTest(
     self.assertFalse(operator.is_non_singular)
     self.assertTrue(operator.is_self_adjoint)
 
+  def test_zeros_matmul(self):
+    operator1 = linalg_lib.LinearOperatorIdentity(num_rows=2)
+    operator2 = linalg_lib.LinearOperatorZeros(num_rows=2)
+    self.assertTrue(isinstance(
+        operator1.matmul(operator2),
+        linalg_lib.LinearOperatorZeros))
+
+    self.assertTrue(isinstance(
+        operator2.matmul(operator1),
+        linalg_lib.LinearOperatorZeros))
+
 
 class LinearOperatorZerosNotSquareTest(
     linear_operator_test_util.NonSquareLinearOperatorDerivedClassTest):
diff --git a/tensorflow/python/kernel_tests/linalg_grad_test.py b/tensorflow/python/kernel_tests/linalg_grad_test.py
index 03b640a85a3ba0bc3617be2d8ae8ec5a438343ff..ff84221611813cf37537b843087faa70ae1d3e8e 100644
--- a/tensorflow/python/kernel_tests/linalg_grad_test.py
+++ b/tensorflow/python/kernel_tests/linalg_grad_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -39,6 +40,7 @@ def _AddTest(test, op_name, testcase_name, fn):
 
 class ShapeTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBatchGradientUnknownSize(self):
     with self.cached_session():
       batch_size = constant_op.constant(3)
@@ -50,7 +52,7 @@ class ShapeTest(test_lib.TestCase):
       determinants = linalg_ops.matrix_determinant(batch_identity)
       reduced = math_ops.reduce_sum(determinants)
       sum_grad = gradients_impl.gradients(reduced, batch_identity)[0]
-      self.assertAllClose(batch_identity.eval(), sum_grad.eval())
+      self.assertAllClose(batch_identity.eval(), self.evaluate(sum_grad))
 
 
 class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
@@ -59,6 +61,7 @@ class MatrixUnaryFunctorGradientTest(test_lib.TestCase):
 
 def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
     with self.session(use_gpu=True):
       np.random.seed(1)
@@ -69,7 +72,7 @@ def _GetMatrixUnaryFunctorGradientTest(functor_, dtype_, shape_, **kwargs_):
       if functor_.__name__ == 'matrix_square_root':
         # Square the input matrix to ensure that its matrix square root exists
         a = math_ops.matmul(a, a)
-        a_np = a.eval()
+        a_np = self.evaluate(a)
       b = functor_(a, **kwargs_)
 
       # Optimal stepsize for central difference is O(epsilon^{1/3}).
@@ -101,6 +104,7 @@ def _GetMatrixBinaryFunctorGradientTest(functor_,
                                         float32_tol_fudge=1.0,
                                         **kwargs_):
 
+  @test_util.run_v1_only('b/120545219')
   def Test(self):
     # TODO(rmlarsen): Debug illegal address bug on CUDA and re-enable
     # GPU test for matrix_solve.
diff --git a/tensorflow/python/kernel_tests/linalg_ops_test.py b/tensorflow/python/kernel_tests/linalg_ops_test.py
index 28391aaa878a81a2d29d2cdba455b631e141d61c..028167a78603b7f2c00ae19ca76f721d38e200c9 100644
--- a/tensorflow/python/kernel_tests/linalg_ops_test.py
+++ b/tensorflow/python/kernel_tests/linalg_ops_test.py
@@ -25,6 +25,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
@@ -52,6 +53,7 @@ class CholeskySolveTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(0)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_type, atol in [(np.float32, 0.05), (np.float64, 1e-5)]:
@@ -73,6 +75,7 @@ class LogdetTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -85,7 +88,7 @@ class LogdetTest(test.TestCase):
           #     [_RandomPDMatrix(n, self.rng, np_dtype),
           #      _RandomPDMatrix(n, self.rng, np_dtype)]).astype(np_dtype)
           logdet_tf = linalg.logdet(matrix)
-          self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+          self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -94,7 +97,7 @@ class LogdetTest(test.TestCase):
       _, logdet_np = np.linalg.slogdet(matrix)
       with self.session(use_gpu=True):
         logdet_tf = linalg.logdet(matrix)
-        self.assertAllClose(logdet_np, logdet_tf.eval(), atol=atol)
+        self.assertAllClose(logdet_np, self.evaluate(logdet_tf), atol=atol)
 
 
 class SlogdetTest(test.TestCase):
@@ -102,6 +105,7 @@ class SlogdetTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(42)
 
+  @test_util.run_deprecated_v1
   def test_works_with_five_different_random_pos_def_matrices(self):
     for n in range(1, 6):
       for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -110,8 +114,9 @@ class SlogdetTest(test.TestCase):
         sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
         with self.session(use_gpu=True):
           sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-          self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-          self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+          self.assertAllClose(
+              log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+          self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
   def test_works_with_underflow_case(self):
     for np_dtype, atol in [(np.float32, 0.05), (np.float64, 1e-5),
@@ -120,8 +125,9 @@ class SlogdetTest(test.TestCase):
       sign_np, log_abs_det_np = np.linalg.slogdet(matrix)
       with self.session(use_gpu=True):
         sign_tf, log_abs_det_tf = linalg.slogdet(matrix)
-        self.assertAllClose(log_abs_det_np, log_abs_det_tf.eval(), atol=atol)
-        self.assertAllClose(sign_np, sign_tf.eval(), atol=atol)
+        self.assertAllClose(
+            log_abs_det_np, self.evaluate(log_abs_det_tf), atol=atol)
+        self.assertAllClose(sign_np, self.evaluate(sign_tf), atol=atol)
 
 
 class AdjointTest(test.TestCase):
@@ -135,7 +141,7 @@ class AdjointTest(test.TestCase):
         matrix = ops.convert_to_tensor(matrix_np)
         transposed = linalg.adjoint(matrix)
         self.assertEqual((3, 2), transposed.get_shape())
-        self.assertAllEqual(expected_transposed, transposed.eval())
+        self.assertAllEqual(expected_transposed, self.evaluate(transposed))
 
 
 class EyeTest(parameterized.TestCase, test.TestCase):
@@ -230,6 +236,7 @@ class EyeTest(parameterized.TestCase, test.TestCase):
               dtypes.complex128
           ])
       )
+  @test_util.run_deprecated_v1
   def test_eye_with_placeholder(
       self, num_rows, num_columns, batch_shape, dtype):
     eye_np = np.eye(num_rows, M=num_columns, dtype=dtype.as_numpy_dtype)
diff --git a/tensorflow/python/kernel_tests/list_ops_test.py b/tensorflow/python/kernel_tests/list_ops_test.py
index 92552854aa66ef7290f5521d5eecdebb5a638f80..489f6c9b00471e6c10a8a04830613e9c5b99661a 100644
--- a/tensorflow/python/kernel_tests/list_ops_test.py
+++ b/tensorflow/python/kernel_tests/list_ops_test.py
@@ -29,9 +29,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
@@ -39,17 +41,13 @@ from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.platform import test
 
 
-def scalar_shape():
-  return ops.convert_to_tensor([], dtype=dtypes.int32)
-
-
 @test_util.run_all_in_graph_and_eager_modes
 class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def _testPushPop(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=scalar_shape(),
+        element_shape=[],
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
@@ -68,11 +66,10 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with context.device("gpu:0"):
       self._testPushPop(max_num_elements)
 
+  @test_util.run_deprecated_v1
   def testPushInFullListFails(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32,
-        element_shape=scalar_shape(),
-        max_num_elements=1)
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=1)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "Tried to push item into a full list"):
@@ -81,10 +78,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
+  @test_util.run_deprecated_v1
   def testPopFromEmptyTensorListFails(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=scalar_shape(),
+        element_shape=[],
         max_num_elements=max_num_elements)
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  "Trying to pop from an empty list"):
@@ -94,11 +92,13 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def _testStack(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=scalar_shape(),
+        element_shape=[],
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
     t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
+    if not context.executing_eagerly():
+      self.assertAllEqual(t.shape.as_list(), [None])
     self.assertAllEqual(self.evaluate(t), [1.0, 2.0])
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
@@ -116,10 +116,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
   def testStackWithUnknownElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=-1,
+        element_shape=None,
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
@@ -136,10 +137,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
   def testStackWithPartiallyDefinedElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=[-1],
+        element_shape=[None],
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0]))
@@ -156,6 +158,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 2))
+  @test_util.run_deprecated_v1
   def testStackEmptyList(self, max_num_elements):
     # Should be able to stack empty lists with fully defined element_shape.
     l = list_ops.empty_tensor_list(
@@ -171,7 +174,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
           element_dtype=dtypes.float32,
-          element_shape=[-1, 2],
+          element_shape=[None, 2],
           max_num_elements=max_num_elements)
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -181,7 +184,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
           element_dtype=dtypes.float32,
-          element_shape=-1,
+          element_shape=None,
           max_num_elements=max_num_elements)
       t = list_ops.tensor_list_stack(l, element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -192,7 +195,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with backprop.GradientTape() as tape:
       l = list_ops.empty_tensor_list(
           element_dtype=dtypes.float32,
-          element_shape=scalar_shape(),
+          element_shape=[],
           max_num_elements=max_num_elements)
       c0 = constant_op.constant(1.0)
       tape.watch(c0)
@@ -206,10 +209,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
   def testGatherWithUnknownElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=-1,
+        element_shape=None,
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant(2.0))
@@ -229,10 +233,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
   def testGatherWithPartiallyDefinedElementShape(self, max_num_elements):
     l = list_ops.empty_tensor_list(
         element_dtype=dtypes.float32,
-        element_shape=[-1],
+        element_shape=[None],
         max_num_elements=max_num_elements)
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0]))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([2.0, 3.0]))
@@ -252,6 +257,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   @parameterized.named_parameters(("NoMaxNumElements", None),
                                   ("WithMaxNumElements", 3))
+  @test_util.run_deprecated_v1
   def testGatherEmptyList(self, max_num_elements):
     # Should be able to gather from empty lists with fully defined
     # element_shape.
@@ -268,7 +274,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
           element_dtype=dtypes.float32,
-          element_shape=[-1, 2],
+          element_shape=[None, 2],
           max_num_elements=max_num_elements)
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -279,7 +285,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                  "non-fully-defined"):
       l = list_ops.empty_tensor_list(
           element_dtype=dtypes.float32,
-          element_shape=-1,
+          element_shape=None,
           max_num_elements=max_num_elements)
       t = list_ops.tensor_list_gather(l, [], element_dtype=dtypes.float32)
       self.evaluate(t)
@@ -300,7 +306,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testTensorListFromTensor(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e), 2.0)
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
@@ -315,7 +321,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testGetSetItem(self):
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     e0 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
     self.assertAllEqual(self.evaluate(e0), 1.0)
     l = list_ops.tensor_list_set_item(l, 0, 3.0)
@@ -333,19 +339,16 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       t = constant_op.constant(5.)
       tape.watch(t)
       l = list_ops.tensor_list_reserve(
-          element_dtype=dtypes.float32,
-          element_shape=scalar_shape(),
-          num_elements=3)
+          element_dtype=dtypes.float32, element_shape=[], num_elements=3)
       l = list_ops.tensor_list_set_item(l, 1, 2. * t)
       e = list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32)
       self.assertAllEqual(self.evaluate(e), 10.0)
     self.assertAllEqual(self.evaluate(tape.gradient(e, t)), 2.0)
 
+  @test_util.run_deprecated_v1
   def testSetOnEmptyListWithMaxNumElementsFails(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32,
-        element_shape=scalar_shape(),
-        max_num_elements=3)
+        element_dtype=dtypes.float32, element_shape=[], max_num_elements=3)
     with self.assertRaisesRegexp(
         errors.InvalidArgumentError,
         "Trying to modify element 0 in a list with 0 elements."):
@@ -354,7 +357,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testUnknownShape(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32, element_shape=None)
     l = list_ops.tensor_list_push_back(l, constant_op.constant(1.0))
     l = list_ops.tensor_list_push_back(l, constant_op.constant([1.0, 2.0]))
     l, e = list_ops.tensor_list_pop_back(l, element_dtype=dtypes.float32)
@@ -366,7 +369,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     with context.device("gpu:0"):
       l_gpu = array_ops.identity(l)
       self.assertAllEqual(
@@ -383,7 +386,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if not context.num_gpus():
       return
     t = constant_op.constant([1.0, 2.0])
-    child_l = list_ops.tensor_list_from_tensor(t, element_shape=scalar_shape())
+    child_l = list_ops.tensor_list_from_tensor(t, element_shape=[])
     l = list_ops.empty_tensor_list(
         element_shape=constant_op.constant([], dtype=dtypes.int32),
         element_dtype=dtypes.variant)
@@ -495,9 +498,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
         l = list_ops.tensor_list_reserve(
-            element_dtype=dtypes.float32,
-            element_shape=scalar_shape(),
-            num_elements=2)
+            element_dtype=dtypes.float32, element_shape=[], num_elements=2)
         l = list_ops.tensor_list_set_item(l, 0, 1.)
       with ops.device("/job:ps"):
         l_ps = array_ops.identity(l)
@@ -512,7 +513,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
         t = constant_op.constant([[1.0], [2.0]])
-        l = list_ops.tensor_list_from_tensor(t, element_shape=-1)
+        l = list_ops.tensor_list_from_tensor(t, element_shape=None)
       with ops.device("/job:ps"):
         l_ps = array_ops.identity(l)
         element_shape = list_ops.tensor_list_element_shape(
@@ -529,7 +530,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with ops.Graph().as_default(), session.Session(target=worker.target):
       with ops.device("/job:worker"):
         l = list_ops.empty_tensor_list(
-            element_shape=-1, element_dtype=dtypes.float32, max_num_elements=2)
+            element_shape=None,
+            element_dtype=dtypes.float32,
+            max_num_elements=2)
         l = list_ops.tensor_list_push_back(l, 1.)
       with ops.device("/job:ps"):
         l_ps = array_ops.identity(l)
@@ -543,8 +546,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testPushPopGradients(self):
     with backprop.GradientTape() as tape:
-      l = list_ops.empty_tensor_list(element_dtype=dtypes.float32,
-                                     element_shape=scalar_shape())
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=[])
       c = constant_op.constant(1.0)
       tape.watch(c)
       l = list_ops.tensor_list_push_back(l, c)
@@ -556,7 +559,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = list_ops.tensor_list_stack(
           l, element_dtype=dtypes.float32, num_elements=2)
       result = c2 * 2.0
@@ -567,7 +570,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with backprop.GradientTape() as tape:
       c = constant_op.constant([1.0, 2.0])
       tape.watch(c)
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       c2 = constant_op.constant(3.0)
       tape.watch(c2)
       l = list_ops.tensor_list_set_item(l, 0, c2)
@@ -578,17 +581,19 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(self.evaluate(grad_c), [0.0, 4.0])
     self.assertAllEqual(self.evaluate(grad_c2), 6.0)
 
+  @test_util.run_deprecated_v1
   def testSetOutOfBounds(self):
     c = constant_op.constant([1.0, 2.0])
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(list_ops.tensor_list_set_item(l, 20, 3.0))
 
+  @test_util.run_deprecated_v1
   def testSkipEagerSetItemWithMismatchedShapeFails(self):
     with self.cached_session() as sess:
       ph = array_ops.placeholder(dtypes.float32)
       c = constant_op.constant([1.0, 2.0])
-      l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+      l = list_ops.tensor_list_from_tensor(c, element_shape=[])
       # Set a placeholder with unknown shape to satisfy the shape inference
       # at graph building time.
       l = list_ops.tensor_list_set_item(l, 0, ph)
@@ -599,7 +604,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testResourceVariableScatterGather(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
+    l = list_ops.tensor_list_from_tensor(c, element_shape=[])
     v = vs.get_variable("var", initializer=[l] * 10, use_resource=True)
     v_r_0_stacked = list_ops.tensor_list_stack(v[0], dtypes.float32)
     self.evaluate(v.initializer)
@@ -607,10 +612,8 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     v_r_sparse_stacked = list_ops.tensor_list_stack(
         v.sparse_read(0), dtypes.float32)
     self.assertAllEqual([1.0, 2.0], self.evaluate(v_r_sparse_stacked))
-    l_new_0 = list_ops.tensor_list_from_tensor(
-        [3.0, 4.0], element_shape=scalar_shape())
-    l_new_1 = list_ops.tensor_list_from_tensor(
-        [5.0, 6.0], element_shape=scalar_shape())
+    l_new_0 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l_new_1 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
     updated_v = state_ops.scatter_update(v, [3, 5], [l_new_0, l_new_1])
     updated_v_elems = array_ops.unstack(updated_v)
     updated_v_stacked = [
@@ -620,10 +623,11 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                 [[1.0, 2.0]] * 4)
     self.assertAllEqual(self.evaluate(updated_v_stacked), expected)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch_0 = array_ops.stack([l0, l1])
     l_batch_1 = array_ops.stack([l1, l0])
 
@@ -659,7 +663,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.evaluate(
           list_ops.tensor_list_concat_lists(
               l_batch_0,
-              list_ops.empty_tensor_list(scalar_shape(), dtypes.float32),
+              list_ops.empty_tensor_list([], dtypes.float32),
               element_dtype=dtypes.float32))
 
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
@@ -673,16 +677,16 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(errors.InvalidArgumentError,
                                  r"input_b\[0\].dtype != element_dtype."):
       l_batch_of_int_tls = array_ops.stack(
-          [list_ops.tensor_list_from_tensor([1], element_shape=scalar_shape())]
-          * 2)
+          [list_ops.tensor_list_from_tensor([1], element_shape=[])] * 2)
       self.evaluate(
           list_ops.tensor_list_concat_lists(l_batch_0, l_batch_of_int_tls,
                                             element_dtype=dtypes.float32))
 
+  @test_util.run_deprecated_v1
   def testPushBackBatch(self):
     c = constant_op.constant([1.0, 2.0], dtype=dtypes.float32)
-    l0 = list_ops.tensor_list_from_tensor(c, element_shape=scalar_shape())
-    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=scalar_shape())
+    l0 = list_ops.tensor_list_from_tensor(c, element_shape=[])
+    l1 = list_ops.tensor_list_from_tensor([-1.0], element_shape=[])
     l_batch = array_ops.stack([l0, l1])
     l_push = list_ops.tensor_list_push_back_batch(l_batch, [3.0, 4.0])
     l_unstack = array_ops.unstack(l_push)
@@ -726,7 +730,7 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l_empty = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+          element_dtype=dtype, element_shape=[])
       l_empty_zeros = array_ops.zeros_like(l_empty)
       t_empty_zeros = list_ops.tensor_list_stack(
           l_empty_zeros, element_dtype=dtype)
@@ -750,10 +754,9 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                   dtypes.float64, dtypes.complex64, dtypes.complex128,
                   dtypes.bool):
       l = list_ops.empty_tensor_list(
-          element_dtype=dtypes.variant, element_shape=scalar_shape())
+          element_dtype=dtypes.variant, element_shape=[])
 
-      sub_l = list_ops.empty_tensor_list(
-          element_dtype=dtype, element_shape=scalar_shape())
+      sub_l = list_ops.empty_tensor_list(element_dtype=dtype, element_shape=[])
       l = list_ops.tensor_list_push_back(l, sub_l)
       sub_l = list_ops.tensor_list_push_back(sub_l, math_ops.cast(
           1, dtype=dtype))
@@ -786,13 +789,12 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testElementShape(self):
     l = list_ops.empty_tensor_list(
-        element_dtype=dtypes.float32, element_shape=-1)
+        element_dtype=dtypes.float32, element_shape=None)
     shape = list_ops.tensor_list_element_shape(l, shape_type=dtypes.int32)
     self.assertEqual(self.evaluate(shape), -1)
 
   def testZerosLikeUninitialized(self):
-    l0 = list_ops.tensor_list_reserve(
-        scalar_shape(), 3, element_dtype=dtypes.float32)
+    l0 = list_ops.tensor_list_reserve([], 3, element_dtype=dtypes.float32)
     l1 = list_ops.tensor_list_set_item(l0, 0, 1.)  # [1., _, _]
     zeros_1 = array_ops.zeros_like(l1)  # [0., _, _]
     l2 = list_ops.tensor_list_set_item(l1, 2, 2.)  # [1., _, 2.]
@@ -808,6 +810,292 @@ class ListOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(self.evaluate(res_1), [0.])
     self.assertAllEqual(self.evaluate(res_2), [0., 0.])
 
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorListGetItemGradAggregation(self):
+    l = list_ops.tensor_list_reserve(
+        element_shape=[], num_elements=1, element_dtype=dtypes.float32)
+    x = constant_op.constant(1.0)
+    l = list_ops.tensor_list_set_item(l, 0, x)
+    l_read1 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    l_read2 = list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32)
+    grad = gradients_impl.gradients([l_read1, l_read2], [x])
+    with self.cached_session() as sess:
+      self.assertSequenceEqual(self.evaluate(grad), [2.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerBuildElementShape(self):
+    fn = list_ops._build_element_shape
+    # Unknown shape -> -1.
+    self.assertEqual(fn(None), -1)
+    self.assertEqual(fn(tensor_shape.unknown_shape()), -1)
+    # Scalar shape -> [] with type int32.
+    self.assertEqual(fn([]).dtype, dtypes.int32)
+    self.assertEqual(fn(tensor_shape.scalar()).dtype, dtypes.int32)
+    self.assertAllEqual(self.evaluate(fn([])), np.array([], np.int32))
+    self.assertAllEqual(
+        self.evaluate(fn(tensor_shape.scalar())), np.array([], np.int32))
+    # Tensor -> Tensor
+    shape = constant_op.constant(1)
+    self.assertIs(fn(shape), shape)
+    # Shape with unknown dims -> shape list with -1's.
+    shape = [None, 5]
+    self.assertAllEqual(fn(shape), [-1, 5])
+    self.assertAllEqual(fn(tensor_shape.TensorShape(shape)), [-1, 5])
+    # Shape with unknown dims and tensor dims -> shape list with -1's and tensor
+    # dims.
+    t = array_ops.placeholder(dtypes.int32)
+    shape = [None, 5, t]
+    result = fn(shape)
+    self.assertAllEqual(result[:2], [-1, 5])
+    self.assertIs(result[2], t)
+
+  def testAddN(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    result = math_ops.add_n((l1, l2, l3))
+    result_t = list_ops.tensor_list_stack(result, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_t), [9., 12.])
+
+  def testAddNNestedList(self):
+    l1 = list_ops.tensor_list_from_tensor([1.0, 2.0], element_shape=[])
+    l2 = list_ops.tensor_list_from_tensor([3.0, 4.0], element_shape=[])
+    l3 = list_ops.tensor_list_from_tensor([5.0, 6.0], element_shape=[])
+    l4 = list_ops.tensor_list_from_tensor([7.0, 8.0], element_shape=[])
+    a = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    a = list_ops.tensor_list_push_back(a, l1)
+    a = list_ops.tensor_list_push_back(a, l2)
+    b = list_ops.empty_tensor_list(
+        element_dtype=dtypes.variant, element_shape=[])
+    b = list_ops.tensor_list_push_back(b, l3)
+    b = list_ops.tensor_list_push_back(b, l4)
+    result = math_ops.add_n((a, b))
+    result_0 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 0, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    result_1 = list_ops.tensor_list_stack(
+        list_ops.tensor_list_get_item(result, 1, element_dtype=dtypes.variant),
+        element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(result_0), [6., 8.])
+    self.assertAllEqual(self.evaluate(result_1), [10., 12.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerConcatShapeInference(self):
+
+    def BuildTensor(element_shape):
+      l = list_ops.empty_tensor_list(
+          element_dtype=dtypes.float32, element_shape=element_shape)
+      return list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+
+    self.assertIsNone(BuildTensor(None).shape.rank)
+    self.assertAllEqual(BuildTensor([None, 2, 3]).shape.as_list(), [None, 2, 3])
+    self.assertAllEqual(
+        BuildTensor([None, 2, None]).shape.as_list(), [None, 2, None])
+    self.assertAllEqual(BuildTensor([1, 2, 3]).shape.as_list(), [None, 2, 3])
+
+  def testConcatWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.], [2., 3.]])
+    l = list_ops.tensor_list_push_back(l, [[4., 5.], [6., 7.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(
+        self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.], [6., 7.]])
+
+  def testConcatWithNonFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2., 3.], [4., 5.]])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t), [[0., 1.], [2., 3.], [4., 5.]])
+
+  def testConcatWithMismatchingTensorShapesFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l = list_ops.tensor_list_push_back(l, [[0., 1.]])
+    l = list_ops.tensor_list_push_back(l, [[2.], [4.]])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Tried to concat tensors with unequal shapes: "
+        r"\[2\] vs \[1\]"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithFullyDefinedElementShape(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[5, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[None, 2])
+    t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+    self.assertAllEqual(self.evaluate(t).shape, (0, 2))
+
+  def testConcatEmptyListWithUnknownElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatEmptyListWithPartiallyDefinedElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=[2, None])
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "All except the first dimension must be fully"
+        " defined when concating an empty tensor list"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementShapeFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=tensor_shape.scalar())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Concat requires elements to be at least vectors, "
+        "found scalars instead"):
+      t = list_ops.tensor_list_concat(l, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testConcatListWithScalarElementsFails(self):
+    l = list_ops.empty_tensor_list(
+        element_dtype=dtypes.float32, element_shape=None)
+    l1 = list_ops.tensor_list_push_back(l, 1.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 0"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+    l1 = list_ops.tensor_list_push_back(l, [1.])
+    l1 = list_ops.tensor_list_push_back(l1, 2.)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError, "Concat saw a scalar shape at index 1"
+        " but requires at least vectors"):
+      t = list_ops.tensor_list_concat(l1, element_dtype=dtypes.float32)
+      self.evaluate(t)
+
+  def testEvenSplit(self):
+
+    def RunTest(input_tensor, lengths, expected_stacked_output):
+      l = list_ops.tensor_list_split(
+          input_tensor, element_shape=None, lengths=lengths)
+      self.assertAllEqual(
+          list_ops.tensor_list_stack(l, element_dtype=dtypes.float32),
+          expected_stacked_output)
+
+    RunTest([1., 2., 3.], [1, 1, 1], [[1.], [2.], [3.]])
+    RunTest([1., 2., 3., 4.], [2, 2], [[1., 2.], [3., 4.]])
+    RunTest([[1., 2.], [3., 4.]], [1, 1], [[[1., 2.]], [[3., 4.]]])
+
+  def testUnevenSplit(self):
+    l = list_ops.tensor_list_split([1., 2., 3., 4., 5],
+                                   element_shape=None,
+                                   lengths=[3, 2])
+    self.assertAllEqual(list_ops.tensor_list_length(l), 2)
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 0, element_dtype=dtypes.float32),
+        [1., 2., 3.])
+    self.assertAllEqual(
+        list_ops.tensor_list_get_item(l, 1, element_dtype=dtypes.float32),
+        [4., 5.])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidTensorShapeFails(self):
+    with self.cached_session():
+      tensor = array_ops.placeholder(dtype=dtypes.float32)
+      l = list_ops.tensor_list_split(tensor, element_shape=None, lengths=[1])
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Tensor must be at least a vector, but saw shape: \[\]"):
+        l.eval({tensor: 1})
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithInvalidLengthsShapeFails(self):
+    with self.cached_session():
+      lengths = array_ops.placeholder(dtype=dtypes.int64)
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=lengths)
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"Expected lengths to be a vector, received shape: \[\]"):
+        l.eval({lengths: 1})
+
+  def testSplitWithInvalidLengthsFails(self):
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r"Invalid value in lengths: -1"):
+      l = list_ops.tensor_list_split([1., 2.],
+                                     element_shape=None,
+                                     lengths=[1, -1])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Attempting to slice \[0, 3\] from tensor with length 2"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[3])
+      self.evaluate(l)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        r"Unused values in tensor. Length of tensor: 2 Values used: 1"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=None, lengths=[1])
+      self.evaluate(l)
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithScalarElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 1 and 0"):
+      l = list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([1., 2.],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: []})
+
+  def testEagerOnlySplitWithScalarElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"TensorListSplit requires element_shape to be at least of rank 1, "
+          r"but saw: \[\]"):
+        list_ops.tensor_list_split([1., 2.], element_shape=[], lengths=[1, 1])
+
+  @test_util.run_deprecated_v1
+  def testSkipEagerSplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    with self.assertRaisesRegexp(ValueError,
+                                 r"Shapes must be equal rank, but are 2 and 1"):
+      l = list_ops.tensor_list_split([[1.], [2.]],
+                                     element_shape=[1],
+                                     lengths=[1, 1])
+
+    with self.cached_session():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        element_shape = array_ops.placeholder(dtype=dtypes.int32)
+        l = list_ops.tensor_list_split([[1.], [2.]],
+                                       element_shape=element_shape,
+                                       lengths=[1, 1])
+        l.eval({element_shape: [1]})
+
+  def testEagerOnlySplitWithIncompatibleTensorShapeAndElementShapeFails(self):
+    if context.executing_eagerly():
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          r"tensor shape \[2,1\] is not compatible with element_shape \[1\]"):
+        list_ops.tensor_list_split([[1.], [2.]],
+                                   element_shape=[1],
+                                   lengths=[1, 1])
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/listdiff_op_test.py b/tensorflow/python/kernel_tests/listdiff_op_test.py
index baeb40dd63584bb47db7914b789d6ce869f09b25..28657107980e2c1ea3356da89b97df624477260d 100644
--- a/tensorflow/python/kernel_tests/listdiff_op_test.py
+++ b/tensorflow/python/kernel_tests/listdiff_op_test.py
@@ -47,7 +47,7 @@ class ListDiffTest(test.TestCase):
             y_tensor = ops.convert_to_tensor(y, dtype=dtype)
             out_tensor, idx_tensor = diff_func(x_tensor, y_tensor,
                                                index_dtype=index_dtype)
-            tf_out, tf_idx = sess.run([out_tensor, idx_tensor])
+            tf_out, tf_idx = self.evaluate([out_tensor, idx_tensor])
           self.assertAllEqual(tf_out, out)
           self.assertAllEqual(tf_idx, idx)
           self.assertEqual(1, out_tensor.get_shape().ndims)
diff --git a/tensorflow/python/kernel_tests/logging_ops_test.py b/tensorflow/python/kernel_tests/logging_ops_test.py
index 8e9b87f6512e975584af2baf9fc4afb0547625ea..85035e5f7d308c323786bc9fd9017fda89dbec13 100644
--- a/tensorflow/python/kernel_tests/logging_ops_test.py
+++ b/tensorflow/python/kernel_tests/logging_ops_test.py
@@ -39,6 +39,7 @@ from tensorflow.python.platform import test
 
 class LoggingOpsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssertDivideByZero(self):
     with self.cached_session() as sess:
       epsilon = ops.convert_to_tensor(1e-20)
@@ -52,7 +53,7 @@ class LoggingOpsTest(test.TestCase):
               math_ops.less(epsilon, y), ["Divide-by-zero"])
       ]):
         out = math_ops.div(z, y)
-      self.assertAllEqual(2.0, out.eval())
+      self.assertAllEqual(2.0, self.evaluate(out))
       # assert(epsilon < x)
       # z / x
       #
@@ -63,7 +64,7 @@ class LoggingOpsTest(test.TestCase):
       ]):
         out = math_ops.div(z, x)
       with self.assertRaisesOpError("less than x"):
-        out.eval()
+        self.evaluate(out)
 
 
 class PrintV2Test(test.TestCase):
@@ -305,12 +306,14 @@ class PrintV2Test(test.TestCase):
             tensor, output_stream="unknown")
         self.evaluate(print_op)
 
+  @test_util.run_deprecated_v1
   def testPrintOpName(self):
     with self.cached_session():
       tensor = math_ops.range(10)
       print_op = logging_ops.print_v2(tensor, name="print_name")
       self.assertEqual(print_op.name, "print_name")
 
+  @test_util.run_deprecated_v1
   def testNoDuplicateFormatOpGraphModeAfterExplicitFormat(self):
     with self.cached_session():
       tensor = math_ops.range(10)
@@ -379,6 +382,7 @@ class PrintGradientTest(test.TestCase):
     inp_printed = logging_ops.Print(inp, ["hello"])
     self.assertEqual(inp.get_shape(), inp_printed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testPrintGradient(self):
     with self.cached_session():
       inp = constant_op.constant(2.0, shape=[100, 32], name="in")
@@ -387,8 +391,8 @@ class PrintGradientTest(test.TestCase):
       wx_print = logging_ops.Print(wx, [w, w, w])
       wx_grad = gradients_impl.gradients(wx, w)[0]
       wx_print_grad = gradients_impl.gradients(wx_print, w)[0]
-      wxg = wx_grad.eval()
-      wxpg = wx_print_grad.eval()
+      wxg = self.evaluate(wx_grad)
+      wxpg = self.evaluate(wx_print_grad)
     self.assertAllEqual(wxg, wxpg)
 
 
diff --git a/tensorflow/python/kernel_tests/lookup_ops_test.py b/tensorflow/python/kernel_tests/lookup_ops_test.py
index bd93942efbd016d5c456c761f26397cddc9a598c..ad81e0be649f17fe97691b1c5739dbe0bf4a63d2 100644
--- a/tensorflow/python/kernel_tests/lookup_ops_test.py
+++ b/tensorflow/python/kernel_tests/lookup_ops_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.training import server_lib
 
 class HashTableOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testHashTable(self):
     with self.cached_session():
       default_val = -1
@@ -52,15 +53,16 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
       self.assertAllEqual([3], output.get_shape())
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
       exported_keys_tensor, exported_values_tensor = table.export()
 
       self.assertItemsEqual([b"brain", b"salad", b"surgery"],
-                            exported_keys_tensor.eval())
-      self.assertItemsEqual([0, 1, 2], exported_values_tensor.eval())
+                            self.evaluate(exported_keys_tensor))
+      self.assertItemsEqual([0, 1, 2], self.evaluate(exported_values_tensor))
 
+  @test_util.run_deprecated_v1
   def testHashTableFindHighRank(self):
     with self.cached_session():
       default_val = -1
@@ -76,9 +78,10 @@ class HashTableOpTest(test.TestCase):
           [["brain", "salad"], ["tank", "tarkus"]])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([[0, 1], [-1, -1]], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableInitWithPythonArrays(self):
     with self.cached_session():
       default_val = -1
@@ -94,9 +97,10 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableInitWithNumPyArrays(self):
     with self.cached_session():
       default_val = -1
@@ -111,9 +115,10 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testMultipleHashTables(self):
     with self.cached_session() as sess:
       default_val = -1
@@ -137,11 +142,12 @@ class HashTableOpTest(test.TestCase):
       output2 = table2.lookup(input_string)
       output3 = table3.lookup(input_string)
 
-      out1, out2, out3 = sess.run([output1, output2, output3])
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
       self.assertAllEqual([0, 1, -1], out1)
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
 
+  @test_util.run_deprecated_v1
   def testHashTableWithTensorDefault(self):
     with self.cached_session():
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -154,9 +160,10 @@ class HashTableOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testHashTableWithSparseTensorInput(self):
     with self.cached_session() as sess:
       default_val = constant_op.constant(-1, dtypes.int64)
@@ -174,12 +181,13 @@ class HashTableOpTest(test.TestCase):
           constant_op.constant(sp_shape, dtypes.int64))
       output = table.lookup(input_tensor)
 
-      out_indices, out_values, out_shape = sess.run(output)
+      out_indices, out_values, out_shape = self.evaluate(output)
 
       self.assertAllEqual([0, 1, -1], out_values)
       self.assertAllEqual(sp_indices, out_indices)
       self.assertAllEqual(sp_shape, out_shape)
 
+  @test_util.run_deprecated_v1
   def testSignatureMismatch(self):
     with self.cached_session():
       default_val = -1
@@ -210,6 +218,7 @@ class HashTableOpTest(test.TestCase):
             lookup_ops.KeyValueTensorInitializer(["a"], [1], [dtypes.string],
                                                  dtypes.int64), default_val)
 
+  @test_util.run_deprecated_v1
   def testNotInitialized(self):
     with self.cached_session():
       default_val = -1
@@ -221,8 +230,9 @@ class HashTableOpTest(test.TestCase):
       output = table.lookup(input_string)
 
       with self.assertRaisesOpError("Table not initialized"):
-        output.eval()
+        self.evaluate(output)
 
+  @test_util.run_deprecated_v1
   def testInitializeTwice(self):
     with self.cached_session():
       default_val = -1
@@ -235,6 +245,7 @@ class HashTableOpTest(test.TestCase):
       with self.assertRaisesOpError("Table already initialized"):
         table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def testInitializationWithInvalidDimensions(self):
     with self.cached_session():
       default_val = -1
@@ -245,6 +256,7 @@ class HashTableOpTest(test.TestCase):
         lookup_ops.HashTable(
             lookup_ops.KeyValueTensorInitializer(keys, values), default_val)
 
+  @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     # Start a server
     server = server_lib.Server(
@@ -274,6 +286,7 @@ class HashTableOpTest(test.TestCase):
       table.initializer.run()
       self.assertAllEqual(3, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testHashTableInt32String(self):
     with self.cached_session():
       default_val = "n/a"
@@ -286,7 +299,7 @@ class HashTableOpTest(test.TestCase):
       input_tensor = constant_op.constant([0, 1, -1])
       output = table.lookup(input_tensor)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"n/a"], result)
 
 
@@ -298,6 +311,7 @@ class IndexTableFromFile(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -306,10 +320,11 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -322,10 +337,11 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -339,10 +355,11 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_tensor_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -352,12 +369,13 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(1,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
+  @test_util.run_deprecated_v1
   def test_string_index_table_from_file_placeholder_filename(self):
     vocabulary_file = self._createVocabFile("f2i_vocab1.txt")
     with self.cached_session():
@@ -367,14 +385,15 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
 
       feed_dict = {vocabulary_placeholder.name: vocabulary_file}
       lookup_ops.tables_initializer().run(feed_dict=feed_dict)
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
       self.assertEqual(0,
                        len(ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)))
 
+  @test_util.run_deprecated_v1
   def test_int32_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab2.txt", values=("42", "1", "-1000"))
@@ -387,10 +406,11 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int64_index_table_from_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab3.txt", values=("42", "1", "-1000"))
@@ -403,10 +423,11 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_default_value(self):
     default_value = -42
     vocabulary_file = self._createVocabFile("f2i_vocab4.txt")
@@ -416,10 +437,11 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_oov_buckets(self):
     vocabulary_file = self._createVocabFile("f2i_vocab5.txt")
     with self.cached_session():
@@ -429,7 +451,7 @@ class IndexTableFromFile(test.TestCase):
           constant_op.constant(["salad", "surgery", "tarkus", "toccata"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual(
           (
@@ -437,7 +459,7 @@ class IndexTableFromFile(test.TestCase):
               2,  # From vocabulary file.
               867,  # 3 + fingerprint("tarkus") mod 300.
               860),  # 3 + fingerprint("toccata") mod 300.
-          ids.eval())
+          self.evaluate(ids))
 
   def test_index_table_from_file_fails_with_empty_vocabulary_file_name(self):
     self.assertRaises(
@@ -468,6 +490,7 @@ class IndexTableFromFile(test.TestCase):
         vocabulary_file=vocabulary_file,
         vocab_size=0)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_small(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -476,11 +499,12 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, -1, -1), ids.eval())
+      self.assertAllEqual((1, -1, -1), self.evaluate(ids))
       self.assertEqual(2, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
@@ -489,6 +513,7 @@ class IndexTableFromFile(test.TestCase):
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", table.initializer.run)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_file_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab8.txt")
 
@@ -504,9 +529,9 @@ class IndexTableFromFile(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, -1), ids.eval())
+      self.assertAllEqual((1, 2, -1), self.evaluate(ids))
       self.assertEqual(3, table.size().eval())
 
   def test_index_table_from_file_with_invalid_hashers(self):
@@ -577,6 +602,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
       table = lookup_ops.HashTable(init, default_value=-1)
       table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def test_int32(self):
     with ops.Graph().as_default(), self.cached_session():
       init = lookup_ops.KeyValueTensorInitializer((42, 1, -1000), (0, 1, 2),
@@ -590,6 +616,7 @@ class KeyValueTensorInitializerTest(test.TestCase):
 class IndexTableFromTensor(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_tensor_init(self):
     table = lookup_ops.index_table_from_tensor(
         vocabulary_list=("brain", "salad", "surgery"), num_oov_buckets=1)
@@ -606,6 +633,7 @@ class IndexTableFromTensor(test.TestCase):
     ids = table.lookup(constant_op.constant(("salad", "surgery", "tarkus")))
     self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int32_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -614,10 +642,11 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int32))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_int64_index_table_from_tensor_with_tensor_init(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
@@ -626,10 +655,11 @@ class IndexTableFromTensor(test.TestCase):
           constant_op.constant((1, -1000, 11), dtype=dtypes.int64))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, 3), ids.eval())
+      self.assertAllEqual((1, 2, 3), self.evaluate(ids))
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_with_default_value(self):
     default_value = -42
     with self.cached_session():
@@ -639,9 +669,9 @@ class IndexTableFromTensor(test.TestCase):
       ids = table.lookup(constant_op.constant(["salad", "surgery", "tarkus"]))
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        ids.eval()
+        self.evaluate(ids)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((1, 2, default_value), ids.eval())
+      self.assertAllEqual((1, 2, default_value), self.evaluate(ids))
 
   def test_index_table_from_tensor_missing_vocabulary_list(self):
     with self.cached_session():
@@ -650,13 +680,14 @@ class IndexTableFromTensor(test.TestCase):
         lookup_ops.index_table_from_tensor(
             vocabulary_list=None, num_oov_buckets=1)
 
+  @test_util.run_deprecated_v1
   def test_index_table_from_tensor_empty_vocabulary_list(self):
     with self.cached_session():
       table = lookup_ops.index_table_from_tensor(
           vocabulary_list=np.array([], dtype=np.str_), num_oov_buckets=1)
       ids = table.lookup(constant_op.constant(["salad", "surgery", "brain"]))
       with self.assertRaises(errors_impl.OpError):
-        ids.eval()
+        self.evaluate(ids)
       with self.assertRaisesRegexp(
           errors_impl.OpError, "keys and values cannot be empty"):
         lookup_ops.tables_initializer().run()
@@ -686,6 +717,7 @@ class IndexToStringTableFromFileTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table(self):
     vocabulary_path = self._createVocabFile("i2f_vocab1.txt")
     # vocabulary_file supports string and tensor
@@ -698,11 +730,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
         features = table.lookup(
             constant_op.constant([0, 1, 2, 3], dtypes.int64))
         with self.assertRaises(errors_impl.OpError):
-          features.eval()
+          self.evaluate(features)
         lookup_ops.tables_initializer().run()
         self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                            features.eval())
+                            self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain\t300", "salad\t20", "surgery\t1"))
@@ -713,11 +746,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
           value_column_index=0)
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_multicolumn_file_custom_delimiter(self):
     vocabulary_file = self._createVocabFile(
         "f2i_vocab1.txt", values=("brain 300", "salad 20", "surgery 1"))
@@ -729,11 +763,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
           delimiter=" ")
       features = table.lookup(constant_op.constant([0, 1, 2, 3], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_default_value(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -742,11 +777,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
           vocabulary_file=vocabulary_file, default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_small(self):
     default_value = b"NONE"
     vocabulary_file = self._createVocabFile("f2i_vocab2.txt")
@@ -757,11 +793,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
           default_value=default_value)
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", default_value, default_value),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size_too_large(self):
     vocabulary_file = self._createVocabFile("f2i_vocab6.txt")
     with self.cached_session():
@@ -770,11 +807,12 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       init = lookup_ops.tables_initializer()
       self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                               "Invalid vocab_size", init.run)
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_with_vocab_size(self):
     vocabulary_file = self._createVocabFile("f2i_vocab7.txt")
     with self.cached_session():
@@ -783,13 +821,15 @@ class IndexToStringTableFromFileTest(test.TestCase):
       features = table.lookup(constant_op.constant([1, 2, 4], dtypes.int64))
 
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"salad", b"surgery", b"UNK"), features.eval())
+      self.assertAllEqual((b"salad", b"surgery", b"UNK"),
+                          self.evaluate(features))
 
 
 class IndexToStringTableFromTensorTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_table_from_tensor(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["brain", "salad", "surgery"])
@@ -799,12 +839,13 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       features = table.lookup(indices)
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
       lookup_ops.tables_initializer().run()
 
       self.assertAllEqual((b"brain", b"salad", b"surgery", b"UNK"),
-                          features.eval())
+                          self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_duplicate_entries(self):
     with self.cached_session():
       vocabulary_list = constant_op.constant(["hello", "hello"])
@@ -813,8 +854,9 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([0, 1, 4], dtypes.int64)
       features = table.lookup(indices)
       lookup_ops.tables_initializer().run()
-      self.assertAllEqual((b"hello", b"hello", b"UNK"), features.eval())
+      self.assertAllEqual((b"hello", b"hello", b"UNK"), self.evaluate(features))
 
+  @test_util.run_deprecated_v1
   def test_index_to_string_with_default_value(self):
     default_value = b"NONE"
     with self.cached_session():
@@ -824,11 +866,11 @@ class IndexToStringTableFromTensorTest(test.TestCase):
       indices = constant_op.constant([1, 2, 4], dtypes.int64)
       features = table.lookup(indices)
       with self.assertRaises(errors_impl.OpError):
-        features.eval()
+        self.evaluate(features)
 
       lookup_ops.tables_initializer().run()
       self.assertAllEqual((b"salad", b"surgery", default_value),
-                          features.eval())
+                          self.evaluate(features))
 
 
 class InitializeTableFromFileOpTest(test.TestCase):
@@ -854,6 +896,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
     result = self.evaluate(output)
     self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInitializeInt64Table(self):
     vocabulary_file = self._createVocabFile(
         "one_column_int64.txt", values=("42", "1", "-1000"))
@@ -870,9 +913,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
       output = table.lookup(
           constant_op.constant((42, 1, 11), dtype=dtypes.int64))
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInitializeIndexTable(self):
     vocabulary_file = self._createVocabFile("one_column_2.txt")
 
@@ -889,9 +933,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
       output = table.lookup(input_values)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)
 
+  @test_util.run_deprecated_v1
   def testMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -911,9 +956,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([1, 5, 6], result)
 
+  @test_util.run_deprecated_v1
   def testInvalidDataTypeInMultiColumn(self):
     vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
     with open(vocabulary_file, "w") as f:
@@ -944,6 +990,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                                            key_index, dtypes.string,
                                            value_index), default_value)
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     vocabulary_file = self._createVocabFile("one_column_4.txt")
     with self.cached_session():
@@ -958,6 +1005,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       with self.assertRaisesOpError("Invalid number of columns"):
         table.initializer.run()
 
+  @test_util.run_deprecated_v1
   def testInitializeSameTableWithMultipleNodes(self):
     vocabulary_file = self._createVocabFile("one_column_5.txt")
 
@@ -994,7 +1042,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       output2 = table2.lookup(input_string)
       output3 = table3.lookup(input_string)
 
-      out1, out2, out3 = sess.run([output1, output2, output3])
+      out1, out2, out3 = self.evaluate([output1, output2, output3])
       self.assertAllEqual([0, 1, -1], out1)
       self.assertAllEqual([0, 1, -1], out2)
       self.assertAllEqual([0, 1, -1], out3)
@@ -1009,6 +1057,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
+  @test_util.run_deprecated_v1
   def testInitializeWithVocabSize(self):
     with self.cached_session():
       default_value = -1
@@ -1055,6 +1104,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
       table3.initializer.run()
       self.assertEquals(vocab_size, table3.size().eval())
 
+  @test_util.run_deprecated_v1
   def testFeedVocabularyName(self):
     vocabulary_file = self._createVocabFile("feed_vocabulary.txt")
 
@@ -1078,9 +1128,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "tank"])
       output = table.lookup(input_string)
 
-      result = output.eval()
+      result = self.evaluate(output)
       self.assertAllEqual([0, 1, -1], result)
 
+  @test_util.run_deprecated_v1
   def testInvalidFilenames(self):
     vocabulary_file = self._createVocabFile("filename_shape.txt")
 
@@ -1105,6 +1156,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
                 dtypes.int64, lookup_ops.TextFileIndex.LINE_NUMBER),
             default_value)
 
+  @test_util.run_deprecated_v1
   def testIdToStringTable(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
@@ -1119,9 +1171,11 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
 
       out = table.lookup(input_values)
-      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], out.eval())
+      self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"],
+                          self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testStringToIdTable(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt")
     with self.cached_session():
@@ -1135,9 +1189,10 @@ class InitializeTableFromFileOpTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, -1], out.eval())
+      self.assertAllEqual([0, 1, 2, -1], self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt64ToIdTable(self):
     vocab_file = self._createVocabFile(
         "feat_to_id_3.txt", values=("42", "1", "-1000"))
@@ -1152,7 +1207,7 @@ class InitializeTableFromFileOpTest(test.TestCase):
 
       out = table.lookup(
           constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64))
-      self.assertAllEqual((0, 1, 2, -1), out.eval())
+      self.assertAllEqual((0, 1, 2, -1), self.evaluate(out))
       self.assertEquals(vocab_size, table.size().eval())
 
 
@@ -1164,6 +1219,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       f.write("\n".join(values) + "\n")
     return vocabulary_file
 
+  @test_util.run_deprecated_v1
   def testStringIdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_1.txt")
     with self.cached_session():
@@ -1181,9 +1237,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
       input_string = constant_op.constant(["brain", "salad", "surgery", "UNK"])
 
       out = table.lookup(input_string)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt32IdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_2.txt", ("42", "1", "-1000"))
     with self.cached_session():
@@ -1203,9 +1260,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int32)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt64IdTableWithHashBuckets(self):
     vocab_file = self._createVocabFile("feat_to_id_3.txt", ("42", "1", "-1000"))
     with self.cached_session():
@@ -1223,9 +1281,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
       values = constant_op.constant((42, 1, -1000, 11), dtype=dtypes.int64)
 
       out = table.lookup(values)
-      self.assertAllEqual([0, 1, 2, 3], out.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out))
       self.assertEquals(vocab_size + oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testStringIdTableWithOnlyHashBucket(self):
     with self.cached_session():
       oov_buckets = 5
@@ -1244,9 +1303,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
               1,  # fingerprint("salad") mod 5.
               4  # fingerprint("surgery") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
+  @test_util.run_deprecated_v1
   def testInt32IdTableWithOnlyHashBucket(self):
     with self.cached_session():
       oov_buckets = 5
@@ -1266,7 +1326,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
               4,  # fingerprint("1") mod 5.
               2  # fingerprint("-1000") mod 5
           ],
-          out.eval())
+          self.evaluate(out))
       self.assertEquals(oov_buckets, table.size().eval())
 
   def testFloat64IdTableWithOnlyHashBucket(self):
@@ -1281,6 +1341,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
         lookup_ops.IdTableWithHashBuckets(
             None, num_oov_buckets=5, key_dtype=dtypes.bool)
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializers(self):
     vocab_file = self._createVocabFile("feat_to_id_4.txt")
     with self.cached_session() as sess:
@@ -1311,7 +1372,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string)
       out2 = table2.lookup(input_string)
 
-      out1, out2 = sess.run([out1, out2])
+      out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([5, 0, 1, 2, 5], out1)
       self.assertAllEqual([5, 0, 1, 2, 3], out2)
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
@@ -1321,6 +1382,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
           "table2_Lookup/hash_bucket": "StringToHashBucketStrong",
       }, sess.graph)
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsInitializationAcrossSessions(self):
     vocab_file = self._createVocabFile("feat_to_id_5.txt")
     shared_name = "across-sessions"
@@ -1342,7 +1404,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out1 = table1.lookup(input_string_1)
 
-      self.assertAllEqual([0, 1, 2, 3], out1.eval())
+      self.assertAllEqual([0, 1, 2, 3], self.evaluate(out1))
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
 
     with self.cached_session():
@@ -1363,9 +1425,10 @@ class IdTableWithHashBucketsTest(test.TestCase):
 
       out2 = table2.lookup(input_string_2)
 
-      self.assertAllEqual([3, 1, 3], out2.eval())
+      self.assertAllEqual([3, 1, 3], self.evaluate(out2))
       self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
 
+  @test_util.run_deprecated_v1
   def testIdTableWithHashBucketsWithMultipleInitializersDifferentDefault(self):
     vocab_file = self._createVocabFile("feat_to_id_6.txt")
     with self.cached_session() as sess:
@@ -1394,12 +1457,13 @@ class IdTableWithHashBucketsTest(test.TestCase):
       out1 = table1.lookup(input_string_1)
       out2 = table2.lookup(input_string_2)
 
-      out1, out2 = sess.run([out1, out2])
+      out1, out2 = self.evaluate([out1, out2])
       self.assertAllEqual([0, 1, 2, -1], out1)
       self.assertAllEqual([-2, 1, -2], out2)
       self.assertEquals(vocab_size + oov_buckets, table1.size().eval())
       self.assertEquals(vocab_size + oov_buckets, table2.size().eval())
 
+  @test_util.run_deprecated_v1
   def testSparseTensor(self):
     vocab_file = self._createVocabFile("feat_to_id_7.txt")
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
@@ -1428,6 +1492,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
       self.assertAllEqual(input_shape, sp_ids_shape)
 
+  @test_util.run_deprecated_v1
   def testInt32SparseTensor(self):
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
     input_shape = [4, 4]
@@ -1456,6 +1521,7 @@ class IdTableWithHashBucketsTest(test.TestCase):
       self.assertAllEqual([0, 1, 0, 2, 3], sp_ids_val)
       self.assertAllEqual(input_shape, sp_ids_shape)
 
+  @test_util.run_deprecated_v1
   def testInt64SparseTensor(self):
     input_indices = [[0, 0], [0, 1], [2, 0], [2, 2], [3, 0]]
     input_shape = [4, 4]
diff --git a/tensorflow/python/kernel_tests/losses_test.py b/tensorflow/python/kernel_tests/losses_test.py
index b04996f78893da4042c364933aab26b09e029cd1..4584a27e6227bf53e4de5f74730cc9b737214cd5 100644
--- a/tensorflow/python/kernel_tests/losses_test.py
+++ b/tensorflow/python/kernel_tests/losses_test.py
@@ -51,58 +51,62 @@ class AbsoluteDifferenceLossTest(test.TestCase):
         losses.absolute_difference(
             self._predictions, self._predictions, weights=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.absolute_difference(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.absolute_difference(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(5.5, loss.eval(), 3)
+      self.assertAlmostEqual(5.5, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.absolute_difference(self._labels, self._predictions,
                                       constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(5.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(5.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant((1.2, 0.0), shape=(2, 1))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 0.0], shape=[2, 1])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(5.6, loss.eval(), 3)
+      self.assertAlmostEqual(5.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(16.6, loss.eval(), 3)
+      self.assertAlmostEqual(16.6, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(6.0, loss.eval(), 3)
+      self.assertAlmostEqual(6.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.absolute_difference(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   @test_util.assert_no_new_pyobjects_executing_eagerly
   def testEagerNoMemoryLeaked(self):
@@ -123,6 +127,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.softmax_cross_entropy(labels, logits, weights=None)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -132,6 +137,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals('softmax_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -142,6 +148,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -149,8 +156,9 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -159,7 +167,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits,
                                           constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -168,7 +176,8 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant((1.2, 3.4, 5.6))
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -177,7 +186,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -186,7 +195,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=[3])
     with self.cached_session():
       loss = losses.softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
   def testSoftmaxWithMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
@@ -199,6 +208,7 @@ class SoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.softmax_cross_entropy(labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testSoftmaxLabelSmoothing(self):
     with self.cached_session():
       # Softmax Cross Entropy Loss is:
@@ -231,6 +241,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.sparse_softmax_cross_entropy(labels, logits, weights=None)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectInt32Labels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -247,6 +258,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     labels = constant_op.constant([[0], [1], [2]], dtype=dtypes.int32)
     losses.sparse_softmax_cross_entropy(labels, logits)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectInt64Labels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -256,6 +268,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllCorrectNonColumnLabels(self):
     with self.cached_session():
       logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -265,6 +278,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 0.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongInt32Labels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -275,6 +289,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongInt64Labels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -285,6 +300,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongNonColumnLabels(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -295,6 +311,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
       self.assertEquals(loss.op.name, 'sparse_softmax_cross_entropy_loss/value')
       self.assertAlmostEqual(loss.eval(), 10.0, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -302,8 +319,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = 2.3
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
                                    [0.0, 0.0, 10.0]])
@@ -312,7 +330,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits,
                                                  constant_op.constant(weights))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWith1DTensorWeight(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -322,8 +340,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(
           labels, logits, constant_op.constant((weights,)))
-      self.assertAlmostEqual(weights * 10.0, loss.eval(), 3)
+      self.assertAlmostEqual(weights * 10.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPlaceholderForWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0],
                                    [0.0, 10.0, 0.0],
@@ -336,6 +355,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                           feed_dict={weights: ((1.2,), (3.4,), (5.6,))})
       self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss_val, 3)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapePlaceholderForLogitsLabelsButScalarWeights(self):
     logits = array_ops.placeholder(dtypes.float32)
     labels = array_ops.placeholder(dtypes.int32)
@@ -351,6 +371,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
                           })
       self.assertAlmostEqual((1.0 + 1.0 + 1.0) * 10.0 / 3.0, loss_val, 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithPlaceholderForLogitsLabelsAndWeights(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 3))
     labels = array_ops.placeholder(dtypes.int32, shape=(None, 1))
@@ -374,7 +395,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 3.4, 5.6], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithColumnWeights(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -383,7 +405,8 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([[1.2], [3.4], [5.6]])
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual((1.2 + 3.4 + 5.6) * 10.0 / 3.0,
+                             self.evaluate(loss), 3)
 
   def testAllWrongAllWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -392,7 +415,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([0, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSomeWeightsMissing(self):
     logits = constant_op.constant([[10.0, 0.0, 0.0], [0.0, 10.0, 0.0],
@@ -401,8 +424,9 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
     weights = constant_op.constant([1.2, 0, 0], shape=(3, 1))
     with self.cached_session():
       loss = losses.sparse_softmax_cross_entropy(labels, logits, weights)
-      self.assertAlmostEqual(12.0, loss.eval(), 3)
+      self.assertAlmostEqual(12.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testMeasurementSpecificWeightsRaisesException(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -441,6 +465,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testInconsistentWeightShapeRaisesException(self):
     """The weight tensor has incorrect shape."""
     with self.cached_session():
@@ -455,6 +480,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
         losses.sparse_softmax_cross_entropy(
             labels, logits, weights=weights).eval()
 
+  @test_util.run_deprecated_v1
   def testInconsistentLabelShapeRaisesException(self):
     """The label tensor has incorrect shape."""
     with self.cached_session():
@@ -472,6 +498,7 @@ class SparseSoftmaxCrossEntropyLossTest(test.TestCase):
 
 class SigmoidCrossEntropyLossTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAllCorrectSigmoid(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -481,8 +508,9 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testLossWithSingleDimPlaceholderForLogitsAndWeights1(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 1))
     labels = array_ops.placeholder(dtypes.float32, shape=(None, 1))
@@ -499,6 +527,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                       })
       self.assertAlmostEqual(0.313, loss, 3)
 
+  @test_util.run_deprecated_v1
   def testLossWithSingleDimPlaceholderForLogitsAndWeights2(self):
     logits = array_ops.placeholder(dtypes.float32, shape=(None, 2))
     labels = array_ops.placeholder(dtypes.float32, shape=(None, 2))
@@ -515,6 +544,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
                       })
       self.assertAlmostEqual(0.313, loss, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongSigmoid(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -526,6 +556,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
       self.assertAlmostEqual(loss.eval(), 600.0 / 9.0, 3)
 
+  @test_util.run_deprecated_v1
   def testAllWrongSigmoidWithMeasurementSpecificWeights(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0],
@@ -536,8 +567,9 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       loss = losses.sigmoid_cross_entropy(labels, logits, weights)
       self.assertEquals(logits.dtype, loss.dtype)
       self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
-      self.assertAlmostEqual(1700.0 / 7.0, loss.eval(), 3)
+      self.assertAlmostEqual(1700.0 / 7.0, self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testMultiCorrectSigmoid(self):
     logits = constant_op.constant([[100.0, -100.0, 100.0],
                                    [100.0, 100.0, -100.0],
@@ -548,7 +580,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals('sigmoid_cross_entropy_loss/value', loss.op.name)
 
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
   def testSigmoidFloat64(self):
     logits = constant_op.constant((
@@ -563,7 +595,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAlmostEqual(44.444, loss.eval(), 3)
+      self.assertAlmostEqual(44.444, self.evaluate(loss), 3)
 
   def testSigmoidNoReduction(self):
     logits = constant_op.constant((
@@ -576,12 +608,10 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
     self.assertEquals(logits.dtype, loss.dtype)
 
     with self.cached_session():
-      self.assertAllClose((
-          (0., 0., 0.),
-          (0., 100., 100.),
-          (100., 0., 100.)
-      ), loss.eval(), 3)
+      self.assertAllClose(((0., 0., 0.), (0., 100., 100.), (100., 0., 100.)),
+                          self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testSigmoidLabelSmoothingCorrect(self):
     with self.cached_session():
       logits = constant_op.constant([[100.0, -100.0, -100.0]])
@@ -605,6 +635,7 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       expected_value = (100.0 + 50.0 * label_smoothing) / 3.0
       self.assertAlmostEqual(loss.eval(), expected_value, 3)
 
+  @test_util.run_deprecated_v1
   def testSigmoidLabelSmoothingEqualsSoftmaxTwoLabel(self):
     with self.cached_session():
       label_smoothing = 0.1
@@ -619,7 +650,8 @@ class SigmoidCrossEntropyLossTest(test.TestCase):
       softmax_labels = constant_op.constant([[0, 1], [1, 0], [0, 1]])
       softmax_loss = losses.softmax_cross_entropy(
           softmax_labels, softmax_logits, label_smoothing=label_smoothing)
-      self.assertAlmostEqual(sigmoid_loss.eval(), softmax_loss.eval(), 3)
+      self.assertAlmostEqual(sigmoid_loss.eval(), self.evaluate(softmax_loss),
+                             3)
 
 
 class LogLossTest(test.TestCase):
@@ -645,11 +677,13 @@ class LogLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         losses.log_loss(self._labels, self._labels, weights=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.log_loss(self._labels, self._labels)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeightWithPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_labels.shape)
@@ -658,27 +692,31 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(
           0.0, loss.eval(feed_dict={tf_predictions: self._np_labels}), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.log_loss(self._labels, self._predictions)
     with self.cached_session():
       self.assertAlmostEqual(-np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.log_loss(self._labels, self._predictions,
                            constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholder(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._np_predictions.shape)
@@ -690,6 +728,7 @@ class LogLossTest(test.TestCase):
       self.assertAlmostEqual(weights * -np.sum(self._expected_losses) / 6.0,
                              loss, 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeightAndPlaceholderWithRankOnly(self):
     tf_predictions = array_ops.placeholder(dtypes.float32, shape=[None, None])
     weights = 2.3
@@ -707,7 +746,8 @@ class LogLossTest(test.TestCase):
         np.asarray([1.2, 1.2, 1.2, 3.4, 3.4, 3.4]).reshape((2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 6.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant((1.2, 0), shape=(2, 1))
@@ -716,7 +756,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeightsSomeZero(self):
     weights = constant_op.constant([1.2, 0], shape=[2, 1])
@@ -725,7 +766,8 @@ class LogLossTest(test.TestCase):
                                       (2, 3)))
     loss = losses.log_loss(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 3.0,
+                             self.evaluate(loss), 3)
 
   def testWeightsWithSameNumDimsButWrongShapeThrowsException(self):
     weights = constant_op.constant(np.random.normal(size=(2, 4)), shape=[2, 4])
@@ -743,8 +785,10 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses) / 5.0,
+                             self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithMeasurementSpecificWeightsWithPlaceholder(self):
     weights = np.array([3, 6, 5, 0, 4, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -770,8 +814,9 @@ class LogLossTest(test.TestCase):
         constant_op.constant(
             weights, shape=(2, 3)))
     with self.cached_session():
-      self.assertAlmostEqual(-np.sum(expected_losses), loss.eval(), 3)
+      self.assertAlmostEqual(-np.sum(expected_losses), self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithSampleSpecificWeightsMostZeroWithPlaceholder(self):
     weights = np.array([0, 0, 0, 0, 0, 2]).reshape((2, 3))
     expected_losses = np.multiply(self._expected_losses, weights)
@@ -788,7 +833,7 @@ class LogLossTest(test.TestCase):
     tf_weights = array_ops.zeros(shape=(2, 3))
     loss = losses.log_loss(self._labels, self._predictions, tf_weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class HingeLossTest(test.TestCase):
@@ -800,6 +845,7 @@ class HingeLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = losses.hinge_loss(labels, logits).eval()
 
+  @test_util.run_deprecated_v1
   def testAllOutsideMargin(self):
     with self.cached_session():
       logits = constant_op.constant([1.2, -1.4, -1.0, 2.1])
@@ -807,6 +853,7 @@ class HingeLossTest(test.TestCase):
       loss = losses.hinge_loss(labels, logits)
       self.assertAllClose(loss.eval(), 0.0, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testSomeInsideMargin(self):
     with self.cached_session():
       logits = constant_op.constant([[-0.7], [-1.4], [1.4], [0.6]])
@@ -816,6 +863,7 @@ class HingeLossTest(test.TestCase):
       # the margin so they incur some (small) loss.
       self.assertAllClose(loss.eval(), 0.175, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testSomeMisclassified(self):
     with self.cached_session():
       logits = constant_op.constant([[[1.2], [0.4], [-1.0], [-1.1]]])
@@ -835,6 +883,7 @@ class HuberLossTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = losses.huber_loss(labels, predictions).eval()
 
+  @test_util.run_deprecated_v1
   def testAllQuadratic(self):
     with self.cached_session():
       predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
@@ -843,6 +892,7 @@ class HuberLossTest(test.TestCase):
       self.assertAllClose(loss.eval(),
                           0.5 * (0.25 + 0.16 + 1.0 + 0.25) / 4., atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testAllLinear(self):
     with self.cached_session():
       predictions = constant_op.constant([1.5, -1.4, -1.0, 0.0])
@@ -851,6 +901,7 @@ class HuberLossTest(test.TestCase):
       self.assertAllClose(loss.eval(),
                           (1.5 + 2.4 + 1.0 + 1.5) / 4. - 0.5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testMixedQuadraticLinear(self):
     with self.cached_session():
       predictions = constant_op.constant([[1.5, -1.4, -1.0, 0.0],
@@ -870,7 +921,7 @@ class HuberLossTest(test.TestCase):
       labels = constant_op.constant([1.0, -1.0, 0.0, 0.5])
       expected = 0.5 * np.array([0.5**2, 0.4**2, 0.5**2, 0.5**2]).mean()
       loss = losses.huber_loss(labels, predictions, delta=delta)
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
   def testAllLinearDelta(self):
     delta = 0.5
@@ -880,7 +931,7 @@ class HuberLossTest(test.TestCase):
     expected -= 0.5 * delta**2
     loss = losses.huber_loss(labels, predictions, delta=delta)
     with self.cached_session():
-      self.assertAllClose(expected, loss.eval(), atol=1e-5)
+      self.assertAllClose(expected, self.evaluate(loss), atol=1e-5)
 
 
 class MeanSquaredErrorTest(test.TestCase):
@@ -896,6 +947,7 @@ class MeanSquaredErrorTest(test.TestCase):
         losses.mean_squared_error(
             self._predictions, self._predictions, weights=None)
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     with self.cached_session():
       self.assertEqual(
@@ -903,58 +955,62 @@ class MeanSquaredErrorTest(test.TestCase):
           losses.mean_squared_error(predictions=constant_op.constant(0),
                                     labels=constant_op.constant(0)).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     loss = losses.mean_squared_error(self._predictions, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     loss = losses.mean_squared_error(self._labels, self._predictions)
     with self.cached_session():
-      self.assertAlmostEqual(49.5, loss.eval(), 3)
+      self.assertAlmostEqual(49.5, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_squared_error(self._labels, self._predictions,
                                      constant_op.constant(weights))
     with self.cached_session():
-      self.assertAlmostEqual(49.5 * weights, loss.eval(), 3)
+      self.assertAlmostEqual(49.5 * weights, self.evaluate(loss), 3)
 
   def testNonZeroLossWithOneDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=(2, 1))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithTwoDimBatchSpecificWeights(self):
     weights = constant_op.constant([1.2, 3.4], shape=[2, 1])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(767.8 / 6.0, loss.eval(), 3)
+      self.assertAlmostEqual(767.8 / 6.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeights(self):
     weights = constant_op.constant([3, 6, 5, 0, 4, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(587 / 5.0, loss.eval(), 3)
+      self.assertAlmostEqual(587 / 5.0, self.evaluate(loss), 3)
 
   def testNonZeroLossWithSampleSpecificWeightsMostZero(self):
     weights = constant_op.constant([0, 0, 0, 0, 0, 2], shape=[2, 3])
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(18.0, loss.eval(), 3)
+      self.assertAlmostEqual(18.0, self.evaluate(loss), 3)
 
   def testLossWithSampleSpecificWeightsAllZero(self):
     weights = array_ops.zeros((2, 3))
     loss = losses.mean_squared_error(self._labels, self._predictions, weights)
     with self.cached_session():
-      self.assertAlmostEqual(0.0, loss.eval(), 3)
+      self.assertAlmostEqual(0.0, self.evaluate(loss), 3)
 
 
 class MeanPairwiseSquaredErrorTest(test.TestCase):
@@ -991,7 +1047,8 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     with self.cached_session():
       static_inputs_op = losses.mean_pairwise_squared_error(
           predictions=predictions, labels=labels, weights=weights)
-      self.assertAlmostEqual(expected_loss, static_inputs_op.eval(), places=3)
+      self.assertAlmostEqual(
+          expected_loss, self.evaluate(static_inputs_op), places=3)
 
       predictions_placeholder = array_ops.placeholder(
           dtypes.float32, shape=np.asarray(predictions.shape))
@@ -1011,10 +1068,12 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       self.assertAlmostEqual(
           expected_loss, dynamic_inputs_op.eval(feed_dict=feed_dict), places=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testAllCorrectNoLossWeight(self):
     self._test_valid_weights(
         self._labels, self._labels, expected_loss=0.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLoss(self):
     self._test_valid_weights(
         self._labels, self._predictions,
@@ -1040,11 +1099,12 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with self.cached_session() as sess:
-        sess.run(init_op)
+        self.evaluate(init_op)
         for grad, _ in gradients_to_variables:
-          np_grad = sess.run(grad)
+          np_grad = self.evaluate(grad)
           self.assertFalse(np.isnan(np_grad).any())
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithPythonScalarWeight(self):
     weight = 2.3
     self._test_valid_weights(
@@ -1052,6 +1112,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=weight * np.sum(self._expected_losses),
         weights=weight)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonZeroLossWithScalarTensorWeight(self):
     weights = 2.3
     loss = losses.mean_pairwise_squared_error(
@@ -1060,12 +1121,14 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         weights=constant_op.constant(weights))
     with self.cached_session():
       self.assertAlmostEqual(weights * np.sum(self._expected_losses),
-                             loss.eval(), 3)
+                             self.evaluate(loss), 3)
 
+  @test_util.run_deprecated_v1
   def testNonZeroLossWithScalarZeroWeight(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0, weights=0.0)
 
+  @test_util.run_deprecated_v1
   def test3d(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1077,6 +1140,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     ])
     self._test_valid_weights(labels, predictions, expected_loss=137.5)
 
+  @test_util.run_deprecated_v1
   def test3dWeightedScalar(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1115,6 +1179,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
             weights_placeholder: weights,
         })
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalid3dWeighted2x0(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1127,6 +1192,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
     self._test_invalid_weights(
         labels, predictions, weights=np.asarray((1.2, 3.4)))
 
+  @test_util.run_deprecated_v1
   def test3dWeighted2x3x3(self):
     labels = np.array([
         [[1, 9, 2], [12, 11, 10], [9, 8, 7]],
@@ -1143,6 +1209,7 @@ class MeanPairwiseSquaredErrorTest(test.TestCase):
         expected_loss=9 * 137.5,
         weights=np.ones((2, 3, 3)))
 
+  @test_util.run_deprecated_v1
   def testLossWithAllZeroBatchSpecificWeights(self):
     self._test_valid_weights(
         self._labels, self._predictions, expected_loss=0.0,
@@ -1215,7 +1282,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(0, loss.eval(), 5)
+      self.assertAlmostEqual(0, self.evaluate(loss), 5)
 
   def testPartiallyCorrectWithIntegerValues(self):
     loss = losses.cosine_distance(
@@ -1223,7 +1290,7 @@ class CosineDistanceLossTest(test.TestCase):
         labels=constant_op.constant(self._labels),
         dim=2)
     with self.cached_session():
-      self.assertAlmostEqual(1, loss.eval(), 5)
+      self.assertAlmostEqual(1, self.evaluate(loss), 5)
 
   def testPartiallyCorrectFloatingPointValues(self):
     predictions = np.matrix(
@@ -1241,7 +1308,7 @@ class CosineDistanceLossTest(test.TestCase):
     loss = losses.cosine_distance(tf_labels, tf_preds, dim=2)
 
     with self.cached_session():
-      self.assertAlmostEqual(1.0, loss.eval(), 5)
+      self.assertAlmostEqual(1.0, self.evaluate(loss), 5)
 
   def testSampleSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1250,7 +1317,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=np.asarray((1, 0, 0)).reshape((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(1.0, loss.eval())
+      self.assertEqual(1.0, self.evaluate(loss))
 
   def testMeasurementSpecificWeights(self):
     loss = losses.cosine_distance(
@@ -1260,8 +1327,9 @@ class CosineDistanceLossTest(test.TestCase):
         weights=constant_op.constant(
             [1, 0, 0, 1, 1, 1], shape=(3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(3.0 / 4.0, loss.eval())
+      self.assertEqual(3.0 / 4.0, self.evaluate(loss))
 
+  @test_util.run_deprecated_v1
   def testMeasurementSpecificWeightsWithPlaceholderWithShape(self):
     tf_predictions = array_ops.placeholder(
         dtypes.float32, shape=self._labels.shape)
@@ -1282,7 +1350,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 1, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
   def testZeroLossWhenAllMeasurementSpecificWeightsAreZero(self):
     loss = losses.cosine_distance(
@@ -1291,7 +1359,7 @@ class CosineDistanceLossTest(test.TestCase):
         dim=2,
         weights=array_ops.zeros((3, 2, 1)))
     with self.cached_session():
-      self.assertEqual(0, loss.eval())
+      self.assertEqual(0, self.evaluate(loss))
 
 
 class AddLossTest(test.TestCase):
@@ -1351,15 +1419,16 @@ class ComputeWeightedLossTest(test.TestCase):
         with self.session(g):
           for unweighted_loss in unweighted_losses:
             if reduction == losses.Reduction.NONE:
-              self.assertAllClose(self._raw_losses, unweighted_loss.eval())
+              self.assertAllClose(self._raw_losses,
+                                  self.evaluate(unweighted_loss))
             elif reduction == losses.Reduction.SUM:
               self.assertAllClose(
-                  np.sum(self._raw_losses), unweighted_loss.eval())
+                  np.sum(self._raw_losses), self.evaluate(unweighted_loss))
             else:
               # reduction one of MEAN, SUM_OVER_NONZERO_WEIGHTS,
               # SUM_BY_NONZERO_WEIGHTS or SUM_OVER_BATCH_SIZE.
               self.assertAllClose(
-                  np.mean(self._raw_losses), unweighted_loss.eval())
+                  np.mean(self._raw_losses), self.evaluate(unweighted_loss))
 
   def testUnweightedFromPlaceholder(self):
     for reduction in losses.Reduction.all():
@@ -1398,7 +1467,7 @@ class ComputeWeightedLossTest(test.TestCase):
       self.assertEqual(1, len(util.get_losses()))
       with self.cached_session():
         self.assertAllClose(
-            np.mean(weight * self._raw_losses), weighted_loss.eval())
+            np.mean(weight * self._raw_losses), self.evaluate(weighted_loss))
 
   def _test_invalid_weights(self, weights):
     with ops.Graph().as_default():
@@ -1470,24 +1539,22 @@ class ComputeWeightedLossTest(test.TestCase):
           weighted_losses = weights * self._raw_losses
           weighted_sum = np.sum(weighted_losses)
           if reduction == losses.Reduction.NONE:
-            self.assertAllClose(weighted_losses, weighted_loss.eval())
+            self.assertAllClose(weighted_losses, self.evaluate(weighted_loss))
           elif reduction == losses.Reduction.SUM:
-            self.assertAllClose(weighted_sum, weighted_loss.eval())
+            self.assertAllClose(weighted_sum, self.evaluate(weighted_loss))
           else:
             broadcast_weights = weights * np.ones_like(self._raw_losses)
             if reduction == losses.Reduction.MEAN:
-              self.assertAllClose(
-                  weighted_sum / np.sum(broadcast_weights),
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / np.sum(broadcast_weights),
+                                  self.evaluate(weighted_loss))
             elif (reduction == losses.Reduction.SUM_OVER_NONZERO_WEIGHTS or
                   reduction == losses.Reduction.SUM_BY_NONZERO_WEIGHTS):
               self.assertAllClose(
                   weighted_sum / np.count_nonzero(broadcast_weights),
-                  weighted_loss.eval())
+                  self.evaluate(weighted_loss))
             elif reduction == losses.Reduction.SUM_OVER_BATCH_SIZE:
-              self.assertAllClose(
-                  weighted_sum / self._raw_losses.size,
-                  weighted_loss.eval())
+              self.assertAllClose(weighted_sum / self._raw_losses.size,
+                                  self.evaluate(weighted_loss))
 
   def test1x1x1Weight(self):
     self._test_valid_weights((((17.0,),),))
diff --git a/tensorflow/python/kernel_tests/lrn_op_test.py b/tensorflow/python/kernel_tests/lrn_op_test.py
index 7ebeb91d90e8beff1cd1d27d280dce85bdd9124f..fbe628c3944f80b10012cb10f6c43336a5380019 100644
--- a/tensorflow/python/kernel_tests/lrn_op_test.py
+++ b/tensorflow/python/kernel_tests/lrn_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -92,6 +93,7 @@ class LRNOpTest(test.TestCase):
       self.assertTrue(err < 1e-2)
     self.assertShapeEqual(expected, lrn_t)
 
+  @test_util.run_deprecated_v1
   def testCompute(self):
     for _ in range(2):
       self._RunAndVerify(dtypes.float32)
@@ -99,6 +101,7 @@ class LRNOpTest(test.TestCase):
       if not test.is_gpu_available():
         self._RunAndVerify(dtypes.float16)
 
+  @test_util.run_deprecated_v1
   def testGradientsZeroInput(self):
     with self.session(use_gpu=True):
       shape = [4, 4, 4, 4]
@@ -147,6 +150,7 @@ class LRNOpTest(test.TestCase):
     else:
       self.assertLess(err, 1.0)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     for _ in range(2):
       self._RunAndVerifyGradients(dtypes.float32)
diff --git a/tensorflow/python/kernel_tests/lu_op_test.py b/tensorflow/python/kernel_tests/lu_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..06deb0e1c82175c33b028e017a5f54cc2549253b
--- /dev/null
+++ b/tensorflow/python/kernel_tests/lu_op_test.py
@@ -0,0 +1,288 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tensorflow.ops.tf.Lu."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.client import session
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import functional_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.platform import benchmark
+from tensorflow.python.platform import test
+
+
+class LuOpTest(test.TestCase):
+
+  @property
+  def float_types(self):
+    return set((np.float64, np.float32, np.complex64, np.complex128))
+
+  def _verifyLuBase(self, x, lower, upper, perm, verification,
+                    output_idx_type):
+    lower_np, upper_np, perm_np, verification_np = self.evaluate(
+        [lower, upper, perm, verification])
+
+    self.assertAllClose(x, verification_np)
+    self.assertShapeEqual(x, lower)
+    self.assertShapeEqual(x, upper)
+
+    self.assertAllEqual(x.shape[:-1], perm.shape.as_list())
+
+    # Check dtypes are as expected.
+    self.assertEqual(x.dtype, lower_np.dtype)
+    self.assertEqual(x.dtype, upper_np.dtype)
+    self.assertEqual(output_idx_type.as_numpy_dtype, perm_np.dtype)
+
+    # Check that the permutation is valid.
+    if perm_np.shape[-1] > 0:
+      perm_reshaped = np.reshape(perm_np, (-1, perm_np.shape[-1]))
+      for perm_vector in perm_reshaped:
+        self.assertAllClose(np.arange(len(perm_vector)), np.sort(perm_vector))
+
+  def _verifyLu(self, x, output_idx_type=dtypes.int64):
+    # Verify that Px = LU.
+    with test_util.use_gpu():
+
+      lu, perm = linalg_ops.lu(x, output_idx_type=output_idx_type)
+
+      # Prepare the lower factor of shape num_rows x num_rows
+      lu_shape = np.array(lu.shape.as_list())
+      batch_shape = lu_shape[:-2]
+      num_rows = lu_shape[-2]
+      num_cols = lu_shape[-1]
+
+      lower = array_ops.matrix_band_part(lu, -1, 0)
+
+      if num_rows > num_cols:
+        eye = linalg_ops.eye(
+            num_rows, batch_shape=batch_shape, dtype=lower.dtype)
+        lower = array_ops.concat([lower, eye[..., num_cols:]], axis=-1)
+      elif num_rows < num_cols:
+        lower = lower[..., :num_rows]
+
+      # Fill the diagonal with ones.
+      ones_diag = array_ops.ones(
+          np.append(batch_shape, num_rows), dtype=lower.dtype)
+      lower = array_ops.matrix_set_diag(lower, ones_diag)
+
+      # Prepare the upper factor.
+      upper = array_ops.matrix_band_part(lu, 0, -1)
+
+      verification = math_ops.matmul(lower, upper)
+
+      # Permute the rows of product of the Cholesky factors.
+      if num_rows > 0:
+        # Reshape the product of the triangular factors and permutation indices
+        # to a single batch dimension. This makes it easy to apply
+        # invert_permutation and gather_nd ops.
+        perm_reshaped = array_ops.reshape(perm, [-1, num_rows])
+        verification_reshaped = array_ops.reshape(verification,
+                                                  [-1, num_rows, num_cols])
+        # Invert the permutation in each batch.
+        inv_perm_reshaped = functional_ops.map_fn(array_ops.invert_permutation,
+                                                  perm_reshaped)
+        batch_size = perm_reshaped.shape.as_list()[0]
+        # Prepare the batch indices with the same shape as the permutation.
+        # The corresponding batch index is paired with each of the `num_rows`
+        # permutation indices.
+        batch_indices = math_ops.cast(
+            array_ops.broadcast_to(
+                math_ops.range(batch_size)[:, None], perm_reshaped.shape),
+            dtype=output_idx_type)
+        permuted_verification_reshaped = array_ops.gather_nd(
+            verification_reshaped,
+            array_ops.stack([batch_indices, inv_perm_reshaped], axis=-1))
+
+        # Reshape the verification matrix back to the original shape.
+        verification = array_ops.reshape(permuted_verification_reshaped,
+                                         lu_shape)
+
+      self._verifyLuBase(x, lower, upper, perm, verification,
+                         output_idx_type)
+
+  def testBasic(self):
+    data = np.array([[4., -1., 2.], [-1., 6., 0], [10., 0., 5.]])
+
+    for dtype in (np.float32, np.float64):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        self._verifyLu(data.astype(dtype), output_idx_type=output_idx_type)
+
+    for dtype in (np.complex64, np.complex128):
+      for output_idx_type in (dtypes.int32, dtypes.int64):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data, output_idx_type=output_idx_type)
+
+  def testPivoting(self):
+    with test_util.use_gpu():
+      # This matrix triggers partial pivoting because the first diagonal entry
+      # is small.
+      data = np.array([[1e-9, 1., 0.], [1., 0., 0], [0., 1., 5]])
+      self._verifyLu(data.astype(np.float32))
+
+      for dtype in (np.float32, np.float64):
+        self._verifyLu(data.astype(dtype))
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+      for dtype in (np.complex64, np.complex128):
+        complex_data = np.tril(1j * data, -1).astype(dtype)
+        complex_data += np.triu(-1j * data, 1).astype(dtype)
+        complex_data += data
+        self._verifyLu(complex_data)
+        _, p = linalg_ops.lu(data)
+        p_val = self.evaluate([p])
+        # Make sure p_val is not the identity permutation.
+        self.assertNotAllClose(np.arange(3), p_val)
+
+  def testInvalidMatrix(self):
+    # LU factorization gives an error when the input is singular.
+    # Note: A singular matrix may return without error but it won't be a valid
+    # factorization.
+    with test_util.use_gpu():
+      for dtype in self.float_types:
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[1., 2., 3.], [2., 4., 6.], [2., 3., 4.]],
+                           dtype=dtype)))
+        with self.assertRaises(errors.InvalidArgumentError):
+          self.evaluate(
+              linalg_ops.lu(
+                  np.array([[[1., 2., 3.], [2., 4., 6.], [1., 2., 3.]],
+                            [[1., 2., 3.], [3., 4., 5.], [5., 6., 7.]]],
+                           dtype=dtype)))
+
+  def testBatch(self):
+    simple_array = np.array([[[1., -1.], [2., 5.]]])  # shape (1, 2, 2)
+    self._verifyLu(simple_array)
+    self._verifyLu(np.vstack((simple_array, simple_array)))
+    odd_sized_array = np.array([[[4., -1., 2.], [-1., 6., 0], [2., 0., 5.]]])
+    self._verifyLu(np.vstack((odd_sized_array, odd_sized_array)))
+
+    batch_size = 200
+
+    # Generate random matrices.
+    np.random.seed(42)
+    matrices = np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+    # Generate random complex valued matrices.
+    np.random.seed(52)
+    matrices = np.random.rand(batch_size, 5,
+                              5) + 1j * np.random.rand(batch_size, 5, 5)
+    self._verifyLu(matrices)
+
+  def testLargeMatrix(self):
+    # Generate random matrices.
+    n = 500
+    np.random.seed(64)
+    data = np.random.rand(n, n)
+    self._verifyLu(data)
+
+    # Generate random complex valued matrices.
+    np.random.seed(129)
+    data = np.random.rand(n, n) + 1j * np.random.rand(n, n)
+    self._verifyLu(data)
+
+  @test_util.run_v1_only("b/120545219")
+  def testEmpty(self):
+    self._verifyLu(np.empty([0, 2, 2]))
+    self._verifyLu(np.empty([2, 0, 0]))
+
+  @test_util.run_deprecated_v1
+  def testConcurrentExecutesWithoutError(self):
+    with test_util.use_gpu():
+      matrix1 = random_ops.random_normal([5, 5], seed=42)
+      matrix2 = random_ops.random_normal([5, 5], seed=42)
+      lu1, p1 = linalg_ops.lu(matrix1)
+      lu2, p2 = linalg_ops.lu(matrix2)
+      lu1_val, p1_val, lu2_val, p2_val = self.evaluate([lu1, p1, lu2, p2])
+      self.assertAllEqual(lu1_val, lu2_val)
+      self.assertAllEqual(p1_val, p2_val)
+
+
+class LuBenchmark(test.Benchmark):
+  shapes = [
+      (4, 4),
+      (10, 10),
+      (16, 16),
+      (101, 101),
+      (256, 256),
+      (1000, 1000),
+      (1024, 1024),
+      (2048, 2048),
+      (4096, 4096),
+      (513, 2, 2),
+      (513, 8, 8),
+      (513, 256, 256),
+      (4, 513, 2, 2),
+  ]
+
+  def _GenerateMatrix(self, shape):
+    batch_shape = shape[:-2]
+    shape = shape[-2:]
+    assert shape[0] == shape[1]
+    n = shape[0]
+    matrix = np.ones(shape).astype(np.float32) / (2.0 * n) + np.diag(
+        np.ones(n).astype(np.float32))
+    return np.tile(matrix, batch_shape + (1, 1))
+
+  def benchmarkLuOp(self):
+    for shape in self.shapes:
+      with ops.Graph().as_default(), \
+          session.Session(config=benchmark.benchmark_config()) as sess, \
+          ops.device("/cpu:0"):
+        matrix = variables.Variable(self._GenerateMatrix(shape))
+        lu, p = linalg_ops.lu(matrix)
+        variables.global_variables_initializer().run()
+        self.run_op_benchmark(
+            sess,
+            control_flow_ops.group(lu, p),
+            min_iters=25,
+            name="lu_cpu_{shape}".format(shape=shape))
+
+      if test.is_gpu_available(True):
+        with ops.Graph().as_default(), \
+            session.Session(config=benchmark.benchmark_config()) as sess, \
+            ops.device("/device:GPU:0"):
+          matrix = variables.Variable(self._GenerateMatrix(shape))
+          lu, p = linalg_ops.lu(matrix)
+          variables.global_variables_initializer().run()
+          self.run_op_benchmark(
+              sess,
+              control_flow_ops.group(lu, p),
+              min_iters=25,
+              name="lu_gpu_{shape}".format(shape=shape))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/manip_ops_test.py b/tensorflow/python/kernel_tests/manip_ops_test.py
index f71857a3cbaddb52fc4da082f504fcbc5c405bd9..5700db4b950995c5bc59adb84a8e0f81655850cc 100644
--- a/tensorflow/python/kernel_tests/manip_ops_test.py
+++ b/tensorflow/python/kernel_tests/manip_ops_test.py
@@ -62,6 +62,7 @@ class RollTest(test_util.TensorFlowTestCase):
     if np_input.dtype == np.float32:
       self._testGradient(np_input, shift, axis)
 
+  @test_util.run_deprecated_v1
   def testIntTypes(self):
     for t in [np.int32, np.int64]:
       self._testAll(np.random.randint(-100, 100, (5)).astype(t), 3, 0)
@@ -73,6 +74,7 @@ class RollTest(test_util.TensorFlowTestCase):
             np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t), [0, 1, -2],
             [1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
       self._testAll(np.random.rand(5).astype(t), 2, 0)
@@ -80,6 +82,7 @@ class RollTest(test_util.TensorFlowTestCase):
         self._testAll(np.random.rand(3, 4).astype(t), [1, 2], [1, 0])
         self._testAll(np.random.rand(1, 3, 4).astype(t), [1, 0, -3], [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testComplexTypes(self):
     for t in [np.complex64, np.complex128]:
       x = np.random.rand(4, 4).astype(t)
@@ -90,6 +93,7 @@ class RollTest(test_util.TensorFlowTestCase):
         x = np.random.rand(3, 2, 1, 1).astype(t)
         self._testAll(x + 1j * x, [2, 1, 1, 0], [0, 3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testNegativeAxis(self):
     self._testAll(np.random.randint(-100, 100, (5)).astype(np.int32), 3, -1)
     self._testAll(np.random.randint(-100, 100, (4, 4)).astype(np.int32), 3, -2)
@@ -100,12 +104,14 @@ class RollTest(test_util.TensorFlowTestCase):
         manip_ops.roll(np.random.randint(-100, 100, (4, 4)).astype(np.int32),
                        3, -10).eval()
 
+  @test_util.run_deprecated_v1
   def testInvalidInputShape(self):
     # The input should be 1-D or higher, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at least rank 1 but is rank 0"):
       manip_ops.roll(7, 1, 0)
 
+  @test_util.run_deprecated_v1
   def testRollInputMustVectorHigherRaises(self):
     # The input should be 1-D or higher, checked in kernel.
     tensor = array_ops.placeholder(dtype=dtypes.int32)
@@ -116,12 +122,14 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "input must be 1-D or higher"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={tensor: 7})
 
+  @test_util.run_deprecated_v1
   def testInvalidAxisShape(self):
     # The axis should be a scalar or 1-D, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at most rank 1 but is rank 2"):
       manip_ops.roll([[1, 2], [3, 4]], 1, [[0, 1]])
 
+  @test_util.run_deprecated_v1
   def testRollAxisMustBeScalarOrVectorRaises(self):
     # The axis should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
@@ -132,12 +140,14 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "axis must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={axis: [[0, 1]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidShiftShape(self):
     # The shift should be a scalar or 1-D, checked in shape function.
     with self.assertRaisesRegexp(
         ValueError, "Shape must be at most rank 1 but is rank 2"):
       manip_ops.roll([[1, 2], [3, 4]], [[0, 1]], 1)
 
+  @test_util.run_deprecated_v1
   def testRollShiftMustBeScalarOrVectorRaises(self):
     # The shift should be a scalar or 1-D, checked in kernel.
     tensor = [[1, 2], [3, 4]]
@@ -148,11 +158,13 @@ class RollTest(test_util.TensorFlowTestCase):
                                    "shift must be a scalar or a 1-D vector"):
         manip_ops.roll(tensor, shift, axis).eval(feed_dict={shift: [[0, 1]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidShiftAndAxisNotEqualShape(self):
     # The shift and axis must be same size, checked in shape function.
     with self.assertRaisesRegexp(ValueError, "both shapes must be equal"):
       manip_ops.roll([[1, 2], [3, 4]], [1], [0, 1])
 
+  @test_util.run_deprecated_v1
   def testRollShiftAndAxisMustBeSameSizeRaises(self):
     # The shift and axis must be same size, checked in kernel.
     tensor = [[1, 2], [3, 4]]
diff --git a/tensorflow/python/kernel_tests/map_stage_op_test.py b/tensorflow/python/kernel_tests/map_stage_op_test.py
index d503f3d7c9f5625b89e5e4168b3eaed9ab98612c..dd16fad690470e0ca77c31102b8ef2000f0a15d5 100644
--- a/tensorflow/python/kernel_tests/map_stage_op_test.py
+++ b/tensorflow/python/kernel_tests/map_stage_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -29,6 +30,7 @@ TIMEOUT = 1
 
 class MapStageTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -50,6 +52,7 @@ class MapStageTest(test.TestCase):
         _, yval = sess.run([stage, y], feed_dict={x: i, pi: i + 1, gi: i})
         self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testMultiple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -72,6 +75,7 @@ class MapStageTest(test.TestCase):
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testDictionary(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -121,6 +125,7 @@ class MapStageTest(test.TestCase):
 
     G.finalize()
 
+  @test_util.run_deprecated_v1
   def testPeek(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -150,6 +155,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 10)
 
+  @test_util.run_deprecated_v1
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -176,6 +182,7 @@ class MapStageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
+  @test_util.run_deprecated_v1
   def testCapacity(self):
     capacity = 3
 
@@ -239,6 +246,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
     chunk = 200 * 1024  # 256K
@@ -303,6 +311,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testOrdering(self):
     import six
     import random
@@ -341,6 +350,7 @@ class MapStageTest(test.TestCase):
 
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testPartialDictInsert(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -400,6 +410,7 @@ class MapStageTest(test.TestCase):
               'v': 3
           }])
 
+  @test_util.run_deprecated_v1
   def testPartialIndexInsert(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -443,6 +454,7 @@ class MapStageTest(test.TestCase):
       # We can now obtain tuple associated with key 1
       self.assertTrue(sess.run([key, ret], feed_dict={gi: 1}) == [1, [1, 3, 2]])
 
+  @test_util.run_deprecated_v1
   def testPartialDictGetsAndPeeks(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -540,6 +552,7 @@ class MapStageTest(test.TestCase):
       # Nothing is left
       self.assertTrue(sess.run([size, isize]) == [0, 0])
 
+  @test_util.run_deprecated_v1
   def testPartialIndexGets(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
diff --git a/tensorflow/python/kernel_tests/matmul_op_test.py b/tensorflow/python/kernel_tests/matmul_op_test.py
index 1c2822180ac986453159d07c330195dab87c97cf..d31ecbcd3f1d57386fa629cd533f5f698176ca76 100644
--- a/tensorflow/python/kernel_tests/matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/matmul_op_test.py
@@ -21,11 +21,12 @@ from __future__ import print_function
 import operator
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -41,11 +42,9 @@ class MatVecTest(test_lib.TestCase):
   def testTwoByTwoCase(self):
     a = np.array([[1, 2], [3, 4]])
     b = np.array([5, 6])
-    with self.cached_session():
-      c = math_ops.matvec(a, b)
-      self.assertAllEqual((2,), c.shape)
-      c_ = c.eval()
-    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c_)
+    c = math_ops.matvec(a, b)
+    self.assertAllEqual((2,), c.shape)
+    self.assertAllEqual([5 + 2 * 6, 3 * 5 + 4 * 6], c)
 
 
 def _AddTest(test, op_name, testcase_name, fn):
@@ -85,12 +84,12 @@ def _GetMatMulTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     # np.matrix(a_np_) * np.matrix(b_np_)
     effective_a_np = _GetTransposedMatrices(a_np_, "a", kwargs_)
     effective_b_np = _GetTransposedMatrices(b_np_, "b", kwargs_)
-    with self.session(use_gpu=use_gpu) as sess:
+    with self.cached_session() as sess, test_util.device(use_gpu):
       if use_static_shape_:
         a = constant_op.constant(effective_a_np)
         b = constant_op.constant(effective_b_np)
         res = math_ops.matmul(a, b, **kwargs_)
-        tf_val = res.eval()
+        tf_val = self.evaluate(res)
       else:
         a = array_ops.placeholder(a_np_.dtype)
         b = array_ops.placeholder(b_np_.dtype)
@@ -128,45 +127,45 @@ def _GetMatMulGradientTest(a_np_, b_np_, use_static_shape_, **kwargs_):
     epsilon = np.finfo(a_np_.dtype).eps
     delta = epsilon**(1.0 / 3.0)
     tol = 20 * delta
-    with self.session(use_gpu=True):
-      a = constant_op.constant(effective_a_np)
-      b = constant_op.constant(effective_b_np)
-      res = math_ops.matmul(a, b, **kwargs_)
-      for x, x_init in [a, effective_a_np], [b, effective_b_np]:
-        theoretical, numerical = gradient_checker.compute_gradient(
-            x,
-            x_init.shape,
-            res, [a_np_.shape[0], b_np_.shape[1]],
-            x_init_value=x_init,
-            delta=delta)
-        self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+    with self.session(), test_util.use_gpu():
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(x, effective_b_np, **kwargs_),
+          [effective_a_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
+
+      theoretical, numerical = gradient_checker_v2.compute_gradient(
+          lambda x: math_ops.matmul(effective_a_np, x, **kwargs_),
+          [effective_b_np],
+          delta=delta)
+      self.assertAllClose(theoretical, numerical, rtol=tol, atol=tol)
 
   return Test
 
 
 class MatMulStatsTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testSimpleStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([25, 16]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
-
+    a = variables.Variable(random_ops.random_normal([25, 16]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
+
+  @test_util.run_v1_only("Test requires a Graph and NodeDef inspection")
   def testTransposedStatistics(self):
-    g = ops.Graph()
-    with g.as_default():
-      a = variables.Variable(random_ops.random_normal([16, 25]))
-      b = variables.Variable(random_ops.random_normal([16, 9]))
-      math_ops.matmul(a, b, transpose_a=True)
-      for op in g.get_operations():
-        flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
-        if op.name == "MatMul":
-          self.assertEqual(7200, flops)
+    a = variables.Variable(random_ops.random_normal([16, 25]))
+    b = variables.Variable(random_ops.random_normal([16, 9]))
+    math_ops.matmul(a, b, transpose_a=True)
+    g = ops.get_default_graph()
+    for op in g.get_operations():
+      flops = ops.get_stats_for_node_def(g, op.node_def, "flops").value
+      if op.name == "MatMul":
+        self.assertEqual(7200, flops)
 
 
 try:
@@ -195,19 +194,20 @@ except AttributeError:
 class MatMulInfixOperatorTest(test_lib.TestCase):
 
   def testMismatchedShape(self):
-    with self.assertRaisesWithPredicateMatch(ValueError,
-                                             lambda e: "Shape must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Shape must be rank 2 but is rank 1|is not a matrix)"):
       infix_matmul(
           ops.convert_to_tensor([10.0, 20.0, 30.0]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
   def testMismatchedDimensions(self):
-    with self.assertRaisesWithPredicateMatch(
-        ValueError, lambda e: "Dimensions must" in str(e)):
+    with self.assertRaisesRegexp(
+        Exception, "(Dimensions must be equal|Matrix size-incompatible)"):
       infix_matmul(
           ops.convert_to_tensor([[10.0, 20.0, 30.0]]),
           ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0]]))
 
+  @test_util.run_v1_only("Tensor.op is generally not applicable in TF 2")
   def testInfixMatmulIsTfMatmul(self):
     a = ops.convert_to_tensor([[10.0, 20.0, 30.0]])
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
@@ -219,14 +219,14 @@ class MatMulInfixOperatorTest(test_lib.TestCase):
     b = ops.convert_to_tensor([[40.0, 50.0], [60.0, 70.0], [80.0, 90.0]])
     c = infix_matmul(a, b)
     d = math_ops.matmul(a, b)
-    with self.cached_session():
-      self.assertAllEqual(c.eval(), d.eval())
+    self.assertAllEqual(c, d)
 
 
 if __name__ == "__main__":
   sizes = [1, 3, 5]
   trans_options = [[False, False], [True, False], [False, True]]
-  for use_static_shape in [False, True]:
+  # TF2 does not support placeholders under eager so we skip it
+  for use_static_shape in set([True, tf2.enabled()]):
     for dtype in (np.int32, np.int64, np.float16, np.float32, np.float64,
                   np.complex64, np.complex128):
       if not use_static_shape and (dtype == np.int32 or dtype == np.int64):
diff --git a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
index 93a668f12598e2029143a241b52888aa6fe52a7c..fdb7e4a1a4e54883afd66e6a856a977b61ff8aaf 100644
--- a/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_band_part_op_test.py
@@ -23,6 +23,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gradient_checker
@@ -44,6 +45,7 @@ class MatrixBandPartTest(test_lib.TestCase):
 
 def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     mat = np.ones(shape_).astype(dtype_)
     batch_mat = np.tile(mat, batch_shape_ + (1, 1))
@@ -62,7 +64,7 @@ def _GetMatrixBandPartTest(dtype_, batch_shape_, shape_):
                 batch_mat,
                 constant_op.constant(lower, index_dtype),
                 constant_op.constant(upper, index_dtype))
-            self.assertAllEqual(band_np, band.eval())
+            self.assertAllEqual(band_np, self.evaluate(band))
 
   return Test
 
@@ -73,6 +75,7 @@ class MatrixBandPartGradTest(test_lib.TestCase):
 
 def _GetMatrixBandPartGradTest(dtype_, batch_shape_, shape_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     shape = batch_shape_ + shape_
     x = constant_op.constant(np.random.rand(*shape), dtype=dtype_)
diff --git a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
index 3abdf50ece5eaf94d01c3f20c2b6a3ec0009b86f..372b6dc17f4d080f3a59705611e05f0f0865c50d 100644
--- a/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_exponential_op_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
@@ -50,7 +51,7 @@ class ExponentialOpTest(test.TestCase):
 
   def _verifyExponential(self, x, np_type):
     inp = x.astype(np_type)
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       tf_ans = linalg_impl.matrix_exponential(inp)
       if x.size == 0:
         np_ans = np.empty(x.shape, dtype=np_type)
@@ -61,7 +62,7 @@ class ExponentialOpTest(test.TestCase):
             np_ans[i] = np_expm(inp[i])
         else:
           np_ans = np_expm(inp)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
 
   def _verifyExponentialReal(self, x):
@@ -121,12 +122,14 @@ class ExponentialOpTest(test.TestCase):
     # Complex batch
     self._verifyExponentialComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # When the exponential of a non-square matrix is attempted we should return
     # an error
     with self.assertRaises(ValueError):
       linalg_impl.matrix_exponential(np.array([[1., 2., 3.], [3., 4., 5.]]))
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The input to the exponential should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.])
@@ -137,6 +140,7 @@ class ExponentialOpTest(test.TestCase):
     self._verifyExponentialReal(np.empty([0, 2, 2]))
     self._verifyExponentialReal(np.empty([2, 0, 0]))
 
+  @test_util.run_deprecated_v1
   def testDynamic(self):
     with self.session(use_gpu=True) as sess:
       inp = array_ops.placeholder(ops.dtypes.float32)
@@ -144,13 +148,14 @@ class ExponentialOpTest(test.TestCase):
       matrix = np.array([[1., 2.], [3., 4.]])
       sess.run(expm, feed_dict={inp: matrix})
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
       expm1 = linalg_impl.matrix_exponential(matrix1)
       expm2 = linalg_impl.matrix_exponential(matrix2)
-      expm = sess.run([expm1, expm2])
+      expm = self.evaluate([expm1, expm2])
       self.assertAllEqual(expm[0], expm[1])
 
 
diff --git a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
index 2247f1541e2cc74a171aebd8ee5e40c2dedf32fc..5cef4b79a32b85e3366ce018d1d8634867c20a75 100644
--- a/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_inverse_op_test.py
@@ -46,7 +46,7 @@ class InverseOpTest(test.TestCase):
           tiling = list(y.shape)
           tiling[-2:] = [1, 1]
           np_ans = np.tile(np_ans, tiling)
-        out = tf_ans.eval()
+        out = self.evaluate(tf_ans)
         self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-3)
         self.assertShapeEqual(y, tf_ans)
 
@@ -146,7 +146,7 @@ class InverseOpTest(test.TestCase):
         inv1 = linalg_ops.matrix_inverse(matrix1, adjoint=adjoint_)
         inv2 = linalg_ops.matrix_inverse(matrix2, adjoint=adjoint_)
         all_ops += [inv1, inv2]
-      inv = sess.run(all_ops)
+      inv = self.evaluate(all_ops)
       self.assertAllEqual(inv[0], inv[1])
       self.assertAllEqual(inv[2], inv[3])
 
diff --git a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
index 2010a4b2a86c245c83176c800af85b1f43cf405e..682ac12adc6acef378ccbb256066cbd2b099e1b9 100644
--- a/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_logarithm_op_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
@@ -39,11 +40,11 @@ class LogarithmOpTest(test.TestCase):
 
   def _verifyLogarithm(self, x, np_type):
     inp = x.astype(np_type)
-    with self.cached_session(use_gpu=True):
+    with test_util.use_gpu():
       # Verify that expm(logm(A)) == A.
       tf_ans = linalg_impl.matrix_exponential(
           gen_linalg_ops.matrix_logarithm(inp))
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertAllClose(inp, out, rtol=1e-4, atol=1e-3)
 
   def _verifyLogarithmComplex(self, x):
@@ -83,6 +84,7 @@ class LogarithmOpTest(test.TestCase):
     # Complex batch
     self._verifyLogarithmComplex(self._makeBatch(matrix1, matrix2))
 
+  @test_util.run_v1_only("b/120545219")
   def testNonSquareMatrix(self):
     # When the logarithm of a non-square matrix is attempted we should return
     # an error
@@ -90,6 +92,7 @@ class LogarithmOpTest(test.TestCase):
       gen_linalg_ops.matrix_logarithm(
           np.array([[1., 2., 3.], [3., 4., 5.]], dtype=np.complex64))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the logarithm should be at least a 2-dimensional tensor.
     tensor3 = constant_op.constant([1., 2.], dtype=dtypes.complex64)
@@ -120,6 +123,7 @@ class LogarithmOpTest(test.TestCase):
             size=np.prod(shape)).reshape(shape).astype(np.complex128)
         self._verifyLogarithmComplex(matrix)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       matrix1 = math_ops.cast(
@@ -128,7 +132,7 @@ class LogarithmOpTest(test.TestCase):
           random_ops.random_normal([5, 5], seed=42), dtypes.complex64)
       logm1 = gen_linalg_ops.matrix_logarithm(matrix1)
       logm2 = gen_linalg_ops.matrix_logarithm(matrix2)
-      logm = sess.run([logm1, logm2])
+      logm = self.evaluate([logm1, logm2])
       self.assertAllEqual(logm[0], logm[1])
 
 
diff --git a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
index 13a7df7f95dad8bc96f8c27937527ff577135fdb..463477a6a2cb5cf174b461c1fbffd2024f7ce21e 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_ls_op_test.py
@@ -20,10 +20,12 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
@@ -133,6 +135,7 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
       self.assertEqual(np_ans.shape, tf_ans_val.shape)
       self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The matrix and right-hand sides should have the same number of rows.
     with self.session(use_gpu=True):
@@ -147,15 +150,20 @@ class MatrixSolveLsOpTest(test_lib.TestCase):
     empty1 = np.empty([0, 2])
     for fast in [True, False]:
       with self.cached_session(use_gpu=True):
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty0, full, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty0, full, fast=fast))
         self.assertEqual(tf_ans.shape, (0, 2))
-        tf_ans = linalg_ops.matrix_solve_ls(full, empty0, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(full, empty0, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 0))
-        tf_ans = linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast).eval()
+        tf_ans = self.evaluate(
+            linalg_ops.matrix_solve_ls(empty1, empty1, fast=fast))
         self.assertEqual(tf_ans.shape, (2, 2))
 
+  @test_util.run_v1_only("b/120545219")
   def testBatchResultSize(self):
     # 3x3x3 matrices, 3x3x1 right-hand sides.
     matrix = np.array([1., 2., 3., 4., 5., 6., 7., 8., 9.] * 3).reshape(3, 3, 3)
@@ -346,7 +354,8 @@ class MatrixSolveLsBenchmark(test_lib.Benchmark):
 
 if __name__ == "__main__":
   for dtype_ in [np.float32, np.float64, np.complex64, np.complex128]:
-    for use_placeholder_ in [True, False]:
+    # TF2 does not support placeholders under eager so we skip it
+    for use_placeholder_ in set([False, not tf2.enabled()]):
       for fast_ in [True, False]:
         l2_regularizers = [0] if dtype_ == np.complex128 else [0, 0.1]
         for l2_regularizer_ in l2_regularizers:
diff --git a/tensorflow/python/kernel_tests/matrix_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
index 9e30ae162899c8a82c103feed42d69090704c93a..db7c4802f69227627f00565c7398b12af87e3651 100644
--- a/tensorflow/python/kernel_tests/matrix_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_solve_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import linalg_ops
@@ -63,7 +64,7 @@ class MatrixSolveOpTest(test.TestCase):
               out = sess.run(tf_ans, {a_ph: a, b_ph: b})
             else:
               tf_ans = linalg_ops.matrix_solve(a, b, adjoint=adjoint)
-              out = tf_ans.eval()
+              out = self.evaluate(tf_ans)
               self.assertEqual(tf_ans.get_shape(), out.shape)
             self.assertEqual(np_ans.shape, out.shape)
             self.assertAllClose(np_ans, out, atol=tol, rtol=tol)
@@ -75,6 +76,7 @@ class MatrixSolveOpTest(test.TestCase):
         [m, n]))
     return matrix
 
+  @test_util.run_deprecated_v1
   def testSolve(self):
     for n in 1, 2, 4, 9:
       matrix = self._generateMatrix(n, n)
@@ -82,6 +84,7 @@ class MatrixSolveOpTest(test.TestCase):
         rhs = self._generateMatrix(n, nrhs)
         self._verifySolve(matrix, rhs)
 
+  @test_util.run_deprecated_v1
   def testSolveBatch(self):
     for n in 2, 5:
       matrix = self._generateMatrix(n, n)
@@ -90,6 +93,7 @@ class MatrixSolveOpTest(test.TestCase):
         for batch_dims in [[2], [2, 2], [7, 4]]:
           self._verifySolve(matrix, rhs, batch_dims=batch_dims)
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # When the solve of a non-square matrix is attempted we should return
     # an error
@@ -98,6 +102,7 @@ class MatrixSolveOpTest(test.TestCase):
         matrix = constant_op.constant([[1., 2., 3.], [3., 4., 5.]])
         linalg_ops.matrix_solve(matrix, matrix)
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The matrix and right-hand sides should have the same number of rows.
     with self.session(use_gpu=True):
@@ -115,6 +120,7 @@ class MatrixSolveOpTest(test.TestCase):
                                        [0., -1., 1.]])
         linalg_ops.matrix_solve(matrix, matrix).eval()
 
+  @test_util.run_deprecated_v1
   def testConcurrent(self):
     with self.session(use_gpu=True) as sess:
       all_ops = []
@@ -126,7 +132,7 @@ class MatrixSolveOpTest(test.TestCase):
         s1 = linalg_ops.matrix_solve(lhs1, rhs1, adjoint=adjoint_)
         s2 = linalg_ops.matrix_solve(lhs2, rhs2, adjoint=adjoint_)
         all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[1])
       self.assertAllEqual(val[2], val[3])
 
diff --git a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
index 9212580313cf6552d6b5de5b8a882aaa21056357..3edb390c724b6c71cd8849efc2b22a579e87247f 100644
--- a/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_square_root_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -31,7 +32,7 @@ class SquareRootOpTest(test.TestCase):
 
   def _verifySquareRoot(self, matrix, np_type):
     matrix = matrix.astype(np_type)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       # Verify that matmul(sqrtm(A), sqrtm(A)) = A
       sqrt = gen_linalg_ops.matrix_square_root(matrix)
       square = math_ops.matmul(sqrt, sqrt)
@@ -89,26 +90,30 @@ class SquareRootOpTest(test.TestCase):
     self._verifySquareRootReal(np.empty([0, 2, 2]))
     self._verifySquareRootReal(np.empty([2, 0, 0]))
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to the square root should be at least a 2-dimensional tensor.
     tensor = constant_op.constant([1., 2.])
     with self.assertRaises(ValueError):
       gen_linalg_ops.matrix_square_root(tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testNotSquare(self):
-    with self.test_session():
-      with self.assertRaises(ValueError):
-        tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]])
-        gen_linalg_ops.matrix_square_root(tensor).eval()
+    with self.assertRaises(ValueError):
+      tensor = constant_op.constant([[1., 0., -1.], [-1., 1., 0.]])
+      self.evaluate(gen_linalg_ops.matrix_square_root(tensor))
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       matrix1 = random_ops.random_normal([5, 5], seed=42)
       matrix2 = random_ops.random_normal([5, 5], seed=42)
-      sqrt1 = gen_linalg_ops.matrix_square_root(matrix1)
-      sqrt2 = gen_linalg_ops.matrix_square_root(matrix2)
+      square1 = math_ops.matmul(matrix1, matrix1)
+      square2 = math_ops.matmul(matrix2, matrix2)
+      sqrt1 = gen_linalg_ops.matrix_square_root(square1)
+      sqrt2 = gen_linalg_ops.matrix_square_root(square2)
       all_ops = [sqrt1, sqrt2]
-      sqrt = sess.run(all_ops)
+      sqrt = self.evaluate(all_ops)
       self.assertAllEqual(sqrt[0], sqrt[1])
 
 
diff --git a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
index 445faca3ee2f32d867a6039314d784db1e5f95ae..dde83f12f3cee1882d921be292f6a33b8c7f1b48 100644
--- a/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
+++ b/tensorflow/python/kernel_tests/matrix_triangular_solve_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test
@@ -87,12 +88,13 @@ class MatrixTriangularSolveOpTest(test.TestCase):
           b_tf = constant_op.constant(b)
           tf_ans = linalg_ops.matrix_triangular_solve(
               a_tf, b_tf, lower=lower, adjoint=adjoint)
-          tf_val = tf_ans.eval()
+          tf_val = self.evaluate(tf_ans)
           np_ans = np.linalg.solve(a_np, b)
           self.assertEqual(np_ans.shape, tf_ans.get_shape())
         self.assertEqual(np_ans.shape, tf_val.shape)
         self.assertAllClose(np_ans, tf_val)
 
+  @test_util.run_deprecated_v1
   def testSolve(self):
     # 1x1 matrix, single rhs.
     matrix = np.array([[0.1]])
@@ -106,6 +108,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     rhs1 = np.array([[1., 0., 1.], [0., 1., 1.]])
     self._verifySolveAllWaysReal(matrix, rhs1)
 
+  @test_util.run_deprecated_v1
   def testSolveComplex(self):
     # 1x1 matrix, single rhs.
     matrix = np.array([[0.1 + 1j * 0.1]])
@@ -122,6 +125,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     rhs1 += 1j * rhs1
     self._verifySolveAllWaysComplex(matrix, rhs1)
 
+  @test_util.run_deprecated_v1
   def testSolveBatch(self):
     matrix = np.array([[1., 2.], [3., 4.]])
     rhs = np.array([[1., 0., 1.], [0., 1., 1.]])
@@ -130,6 +134,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides.
     self._verifySolveAllWaysReal(matrix, rhs, batch_dims=[3, 2])
 
+  @test_util.run_deprecated_v1
   def testSolveBatchComplex(self):
     matrix = np.array([[1., 2.], [3., 4.]]).astype(np.complex64)
     matrix += 1j * matrix
@@ -140,6 +145,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
     # Batch of 3x2x2x2 matrices, 3x2x2x3 right-hand sides.
     self._verifySolveAllWaysComplex(matrix, rhs, batch_dims=[3, 2])
 
+  @test_util.run_deprecated_v1
   def testNonSquareMatrix(self):
     # A non-square matrix should cause an error.
     matrix = np.array([[1., 2., 3.], [3., 4., 5.]])
@@ -149,6 +155,7 @@ class MatrixTriangularSolveOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         self._verifySolve(matrix, matrix, batch_dims=[2, 3])
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The matrix should have the same number of rows as the
     # right-hand sides.
diff --git a/tensorflow/python/kernel_tests/metrics_test.py b/tensorflow/python/kernel_tests/metrics_test.py
index 5dcdb9e4205e209091bb54474aa5c672f29cd081..64dd5914552d276e91ccaa4eed63e93b0eac37c1 100644
--- a/tensorflow/python/kernel_tests/metrics_test.py
+++ b/tensorflow/python/kernel_tests/metrics_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -175,22 +176,26 @@ class MeanTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean(array_ops.ones([4, 3]))
     _assert_metric_variables(self, ('mean/count:0', 'mean/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean(
         array_ops.ones([4, 3]), metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean(
         array_ops.ones([4, 3]), updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -203,11 +208,12 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -220,15 +226,16 @@ class MeanTest(test.TestCase):
 
       mean, update_op = metrics.mean(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.475, sess.run(update_op), 5)
-      self.assertAlmostEqual(12.4 / 6.0, sess.run(update_op), 5)
-      self.assertAlmostEqual(1.65, sess.run(update_op), 5)
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.475, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(12.4 / 6.0, self.evaluate(update_op), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(update_op), 5)
 
-      self.assertAlmostEqual(1.65, sess.run(mean), 5)
+      self.assertAlmostEqual(1.65, self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     values = _test_values((3, 2, 4, 1))
     mean_results = (
@@ -271,37 +278,44 @@ class MeanTest(test.TestCase):
       self.assertAlmostEqual(expected, update_op.eval(), places=5)
       self.assertAlmostEqual(expected, mean.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def test1x1x1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5,)).reshape((1, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def test1x1xNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11, 3)).reshape((1, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 11)).reshape((1, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def test1xNxNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1Weighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((5, 7, 11)).reshape((3, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def testNx1xNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
         weights=np.asarray((
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def testNxNxNWeighted(self):
     self._test_3d_weighted(
         _test_values((3, 2, 4)),
@@ -309,6 +323,7 @@ class MeanTest(test.TestCase):
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3,
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidWeights(self):
     values_placeholder = array_ops.placeholder(dtype=dtypes_lib.float32)
     values = _test_values((3, 2, 4, 1))
@@ -341,23 +356,27 @@ class MeanTensorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_tensor(array_ops.ones([4, 3]))
     _assert_metric_variables(self,
                              ('mean/total_tensor:0', 'mean/count_tensor:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_tensor(
         array_ops.ones([4, 3]), metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_tensor(
         array_ops.ones([4, 3]), updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -370,11 +389,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean))
 
+  @test_util.run_deprecated_v1
   def testMultiDimensional(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -391,11 +411,13 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(2):
-        sess.run(update_op)
-      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]], sess.run(mean))
+        self.evaluate(update_op)
+      self.assertAllClose([[[1, 2], [1, 2]], [[2, 3], [5, 6]]],
+                          self.evaluate(mean))
 
+  @test_util.run_deprecated_v1
   def testUpdateOpsReturnsCurrentValue(self):
     with self.cached_session() as sess:
       values_queue = data_flow_ops.FIFOQueue(
@@ -408,15 +430,16 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAllClose([[0, 1]], sess.run(update_op), 5)
-      self.assertAllClose([[-2.1, 5.05]], sess.run(update_op), 5)
-      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], sess.run(update_op), 5)
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(update_op), 5)
+      self.assertAllClose([[0, 1]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-2.1, 5.05]], self.evaluate(update_op), 5)
+      self.assertAllClose([[2.3 / 3., 10.1 / 3.]], self.evaluate(update_op), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(update_op), 5)
 
-      self.assertAllClose([[-0.9 / 4., 3.525]], sess.run(mean), 5)
+      self.assertAllClose([[-0.9 / 4., 3.525]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testBinaryWeighted1d(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -439,11 +462,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[3.25, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[3.25, 0.5]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -466,11 +490,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0.8, 3.52]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0.8, 3.52]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_1(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -493,11 +518,12 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[-2.1, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[-2.1, 0.5]], self.evaluate(mean), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_2(self):
     with self.cached_session() as sess:
       # Create the queue that populates the values.
@@ -520,10 +546,10 @@ class MeanTensorTest(test.TestCase):
 
       mean, update_op = metrics.mean_tensor(values, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(4):
-        sess.run(update_op)
-      self.assertAllClose([[0, 0.5]], sess.run(mean), 5)
+        self.evaluate(update_op)
+      self.assertAllClose([[0, 0.5]], self.evaluate(mean), 5)
 
 
 class AccuracyTest(test.TestCase):
@@ -531,6 +557,7 @@ class AccuracyTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.accuracy(
         predictions=array_ops.ones((10, 1)),
@@ -539,6 +566,7 @@ class AccuracyTest(test.TestCase):
     _assert_metric_variables(self,
                              ('my_accuracy/count:0', 'my_accuracy/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.accuracy(
@@ -547,6 +575,7 @@ class AccuracyTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.accuracy(
@@ -555,12 +584,14 @@ class AccuracyTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones((10, 3))
     labels = array_ops.ones((10, 4))
     with self.assertRaises(ValueError):
       metrics.accuracy(labels, predictions)
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones((10, 3))
     labels = array_ops.ones((10, 3))
@@ -568,6 +599,7 @@ class AccuracyTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.accuracy(labels, predictions, weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=3, dtype=dtypes_lib.int64, seed=1)
@@ -576,17 +608,18 @@ class AccuracyTest(test.TestCase):
     accuracy, update_op = metrics.accuracy(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_accuracy = accuracy.eval()
       for _ in range(10):
         self.assertEqual(initial_accuracy, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdates(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -609,32 +642,35 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(0.5, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(0.5, self.evaluate(update_op))
       self.assertEqual(0.5, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizes(self):
     predictions = array_ops.ones((40, 1))
     labels = array_ops.ones((40,))
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithScalarWeight(self):
     predictions = array_ops.ones((40, 1))
     labels = array_ops.ones((40,))
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights=2.0)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval())
       self.assertEqual(1.0, accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithStaticShapedWeight(self):
     predictions = ops.convert_to_tensor([1, 1, 1])  # shape 3,
     labels = array_ops.expand_dims(ops.convert_to_tensor([1, 0, 0]),
@@ -645,13 +681,14 @@ class AccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
       self.assertGreater(update_op.eval(), .95)
       self.assertGreater(accuracy.eval(), .95)
 
+  @test_util.run_deprecated_v1
   def testEffectivelyEquivalentSizesWithDynamicallyShapedWeight(self):
     predictions = ops.convert_to_tensor([1, 1, 1])  # shape 3,
     labels = array_ops.expand_dims(ops.convert_to_tensor([1, 0, 0]),
@@ -666,13 +703,14 @@ class AccuracyTest(test.TestCase):
       accuracy, update_op = metrics.accuracy(labels, predictions,
                                              weights_placeholder)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # if streaming_accuracy does not flatten the weight, accuracy would be
       # 0.33333334 due to an intended broadcast of weight. Due to flattening,
       # it will be higher than .95
       self.assertGreater(update_op.eval(feed_dict=feed_dict), .95)
       self.assertGreater(accuracy.eval(feed_dict=feed_dict), .95)
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeightedValues(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -704,10 +742,10 @@ class AccuracyTest(test.TestCase):
 
       accuracy, update_op = metrics.accuracy(labels, predictions, weights)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in xrange(3):
-        sess.run(update_op)
-      self.assertEqual(1.0, sess.run(update_op))
+        self.evaluate(update_op)
+      self.assertEqual(1.0, self.evaluate(update_op))
       self.assertEqual(1.0, accuracy.eval())
 
 
@@ -717,12 +755,14 @@ class PrecisionTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.precision(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(self, ('precision/false_positives/count:0',
                                     'precision/true_positives/count:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.precision(
@@ -731,6 +771,7 @@ class PrecisionTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.precision(
@@ -739,6 +780,7 @@ class PrecisionTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
@@ -747,17 +789,18 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_precision = precision.eval()
       for _ in range(10):
         self.assertEqual(initial_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -766,10 +809,11 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op))
       self.assertAlmostEqual(1, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleInputDtypes(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions = math_ops.cast(
@@ -779,10 +823,11 @@ class PrecisionTest(test.TestCase):
       precision, update_op = metrics.precision(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -797,6 +842,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeightedScalar_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -816,6 +862,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testWeighted1d_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -836,6 +883,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [1, 0, 1, 0]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -852,6 +900,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d_placeholders(self):
     predictions = array_ops.placeholder(dtype=dtypes_lib.float32)
     labels = array_ops.placeholder(dtype=dtypes_lib.float32)
@@ -874,6 +923,7 @@ class PrecisionTest(test.TestCase):
       self.assertAlmostEqual(
           expected_precision, precision.eval(feed_dict=feed_dict))
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -882,18 +932,19 @@ class PrecisionTest(test.TestCase):
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertAlmostEqual(0, precision.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTrueAndFalsePositivesGivesZeroPrecision(self):
     predictions = constant_op.constant([0, 0, 0, 0])
     labels = constant_op.constant([0, 0, 0, 0])
     precision, update_op = metrics.precision(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0.0, precision.eval())
 
 
@@ -903,6 +954,7 @@ class RecallTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.recall(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
@@ -910,6 +962,7 @@ class RecallTest(test.TestCase):
         self,
         ('recall/false_negatives/count:0', 'recall/true_positives/count:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.recall(
@@ -918,6 +971,7 @@ class RecallTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.recall(
@@ -926,6 +980,7 @@ class RecallTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.int64, seed=1)
@@ -934,17 +989,18 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_recall = recall.eval()
       for _ in range(10):
         self.assertEqual(initial_recall, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     np_inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -953,10 +1009,11 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(1, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleInputDtypes(self):
     for dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions = math_ops.cast(
@@ -966,10 +1023,11 @@ class RecallTest(test.TestCase):
       recall, update_op = metrics.recall(labels, predictions)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         self.assertAlmostEqual(0.5, update_op.eval())
         self.assertAlmostEqual(0.5, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -977,13 +1035,14 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 2.0 + 5.0
       weighted_t = (2.0 + 2.0) + (5.0 + 5.0)
       expected_precision = weighted_tp / weighted_t
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions = constant_op.constant([[1, 0, 1, 0], [0, 1, 0, 1]])
     labels = constant_op.constant([[0, 1, 1, 0], [1, 0, 0, 1]])
@@ -991,13 +1050,14 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       weighted_tp = 3.0 + 1.0
       weighted_t = (2.0 + 3.0) + (4.0 + 1.0)
       expected_precision = weighted_tp / weighted_t
       self.assertAlmostEqual(expected_precision, update_op.eval())
       self.assertAlmostEqual(expected_precision, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     np_inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1006,18 +1066,19 @@ class RecallTest(test.TestCase):
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTruePositivesAndFalseNegativesGivesZeroRecall(self):
     predictions = array_ops.zeros((1, 4))
     labels = array_ops.zeros((1, 4))
     recall, update_op = metrics.recall(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
       self.assertEqual(0, recall.eval())
 
 
@@ -1027,6 +1088,7 @@ class AUCTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.auc(predictions=array_ops.ones((10, 1)),
                 labels=array_ops.ones((10, 1)))
@@ -1034,6 +1096,7 @@ class AUCTest(test.TestCase):
                              ('auc/true_positives:0', 'auc/false_negatives:0',
                               'auc/false_positives:0', 'auc/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.auc(predictions=array_ops.ones((10, 1)),
@@ -1041,6 +1104,7 @@ class AUCTest(test.TestCase):
                           metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.auc(predictions=array_ops.ones((10, 1)),
@@ -1048,6 +1112,7 @@ class AUCTest(test.TestCase):
                                updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1056,17 +1121,18 @@ class AUCTest(test.TestCase):
     auc, update_op = metrics.auc(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_auc = auc.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_auc, auc.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     self.allCorrectAsExpected('ROC')
 
@@ -1078,11 +1144,12 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(inputs)
       auc, update_op = metrics.auc(labels, predictions, curve=curve)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
 
       self.assertEqual(1, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleLabelDtypes(self):
     with self.cached_session() as sess:
       for label_dtype in (
@@ -1093,11 +1160,12 @@ class AUCTest(test.TestCase):
             constant_op.constant([0, 1, 1, 0], shape=(1, 4)), dtype=label_dtype)
         auc, update_op = metrics.auc(labels, predictions)
 
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.5, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.5, self.evaluate(update_op))
 
         self.assertAlmostEqual(0.5, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1106,11 +1174,12 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([2], shape=(1, 1))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.5, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.5, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.5, auc.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1119,13 +1188,14 @@ class AUCTest(test.TestCase):
       weights = constant_op.constant([1, 2, 3, 4], shape=(1, 4))
       auc, update_op = metrics.auc(labels, predictions, weights=weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.7, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.7, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(0.7, auc.eval(), 5)
 
   # Regarding the AUC-PR tests: note that the preferred method when
   # calculating AUC-PR is summation_method='careful_interpolation'.
+  @test_util.run_deprecated_v1
   def testCorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1134,12 +1204,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.79726744594
       expected = 1 - math.log(1.5) / 2
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testCorrectAnotherAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1150,12 +1221,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.61350593198
       expected = (2.5 - 2 * math.log(4./3) - 0.25 * math.log(7./5)) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testThirdCorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1166,12 +1238,13 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='careful_interpolation')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       # expected ~= 0.90410597584
       expected = 1 - math.log(4./3) / 3
-      self.assertAlmostEqual(expected, sess.run(update_op), delta=1e-3)
+      self.assertAlmostEqual(expected, self.evaluate(update_op), delta=1e-3)
       self.assertAlmostEqual(expected, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1180,11 +1253,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.79166, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.79166, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.79166, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAnotherIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1195,11 +1269,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.610317, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.610317, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.610317, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testThirdIncorrectAUCPRSpecialCase(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1210,11 +1285,12 @@ class AUCTest(test.TestCase):
       auc, update_op = metrics.auc(labels, predictions, curve='PR',
                                    summation_method='trapezoidal')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.90277, sess.run(update_op), delta=1e-3)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.90277, self.evaluate(update_op), delta=1e-3)
 
       self.assertAlmostEqual(0.90277, auc.eval(), delta=1e-3)
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1223,30 +1299,32 @@ class AUCTest(test.TestCase):
       labels = constant_op.constant(1 - inputs, dtype=dtypes_lib.float32)
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0, self.evaluate(update_op))
 
       self.assertAlmostEqual(0, auc.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroTruePositivesAndFalseNegativesGivesOneAUC(self):
     with self.cached_session() as sess:
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
       labels = array_ops.zeros([4])
       auc, update_op = metrics.auc(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
+  @test_util.run_deprecated_v1
   def testRecallOneAndPrecisionOneGivesOnePRAUC(self):
     with self.cached_session() as sess:
       predictions = array_ops.ones([4], dtype=dtypes_lib.float32)
       labels = array_ops.ones([4])
       auc, update_op = metrics.auc(labels, predictions, curve='PR')
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 6)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 6)
 
       self.assertAlmostEqual(1, auc.eval(), 6)
 
@@ -1277,6 +1355,7 @@ class AUCTest(test.TestCase):
     tp = np.cumsum(sorted_weights * is_positive) / num_positives
     return np.sum((sorted_weights * tp)[~is_positive]) / num_negatives
 
+  @test_util.run_deprecated_v1
   def testWithMultipleUpdates(self):
     num_samples = 1000
     batch_size = 10
@@ -1317,9 +1396,9 @@ class AUCTest(test.TestCase):
                                      num_thresholds=500,
                                      weights=tf_weights)
 
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
         for i in range(num_batches):
-          sess.run(update_op)
+          self.evaluate(update_op)
 
         # Since this is only approximate, we can't expect a 6 digits match.
         # Although with higher number of samples/thresholds we should see the
@@ -1333,6 +1412,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.specificity_at_sensitivity(
         predictions=array_ops.ones((10, 1)),
@@ -1344,6 +1424,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
                               'specificity_at_sensitivity/false_positives:0',
                               'specificity_at_sensitivity/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.specificity_at_sensitivity(
@@ -1353,6 +1434,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.specificity_at_sensitivity(
@@ -1362,6 +1444,7 @@ class SpecificityAtSensitivityTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1371,17 +1454,18 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_specificity = specificity.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_specificity, specificity.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1391,10 +1475,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectHighSensitivity(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.45, 0.5, 0.8, 0.9]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1406,10 +1491,11 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op))
       self.assertAlmostEqual(1.0, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectLowSensitivity(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1421,11 +1507,12 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted1d_multipleLabelDtypes(self):
     for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
@@ -1440,11 +1527,12 @@ class SpecificityAtSensitivityTest(test.TestCase):
           labels, predictions, weights=weights, sensitivity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
+        self.evaluate(variables.local_variables_initializer())
 
-        self.assertAlmostEqual(0.6, sess.run(update_op))
+        self.assertAlmostEqual(0.6, self.evaluate(update_op))
         self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted2d(self):
     predictions_values = [0.1, 0.2, 0.4, 0.3, 0.0, 0.1, 0.2, 0.2, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1458,9 +1546,9 @@ class SpecificityAtSensitivityTest(test.TestCase):
         labels, predictions, weights=weights, sensitivity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
-      self.assertAlmostEqual(8.0 / 15.0, sess.run(update_op))
+      self.assertAlmostEqual(8.0 / 15.0, self.evaluate(update_op))
       self.assertAlmostEqual(8.0 / 15.0, specificity.eval())
 
 
@@ -1470,6 +1558,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.sensitivity_at_specificity(
         predictions=array_ops.ones((10, 1)),
@@ -1481,6 +1570,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
                               'sensitivity_at_specificity/false_positives:0',
                               'sensitivity_at_specificity/true_negatives:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.sensitivity_at_specificity(
@@ -1490,6 +1580,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.sensitivity_at_specificity(
@@ -1499,6 +1590,7 @@ class SensitivityAtSpecificityTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1508,17 +1600,18 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_sensitivity = sensitivity.eval()
       for _ in range(10):
         self.assertAlmostEqual(initial_sensitivity, sensitivity.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1528,10 +1621,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.7)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(1, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(1, self.evaluate(update_op))
       self.assertEqual(1, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectHighSpecificity(self):
     predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.1, 0.45, 0.5, 0.8, 0.9]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1543,10 +1637,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.8)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.8, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.8, self.evaluate(update_op))
       self.assertAlmostEqual(0.8, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrectLowSpecificity(self):
     predictions_values = [0.0, 0.1, 0.2, 0.3, 0.4, 0.01, 0.02, 0.25, 0.26, 0.26]
     labels_values = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
@@ -1558,10 +1653,11 @@ class SensitivityAtSpecificityTest(test.TestCase):
         labels, predictions, specificity=0.4)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(0.6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(0.6, self.evaluate(update_op))
       self.assertAlmostEqual(0.6, specificity.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted_multipleLabelDtypes(self):
     for label_dtype in (dtypes_lib.bool, dtypes_lib.int32, dtypes_lib.float32):
       predictions_values = [
@@ -1577,8 +1673,8 @@ class SensitivityAtSpecificityTest(test.TestCase):
           labels, predictions, weights=weights, specificity=0.4)
 
       with self.cached_session() as sess:
-        sess.run(variables.local_variables_initializer())
-        self.assertAlmostEqual(0.675, sess.run(update_op))
+        self.evaluate(variables.local_variables_initializer())
+        self.assertAlmostEqual(0.675, self.evaluate(update_op))
         self.assertAlmostEqual(0.675, specificity.eval())
 
 
@@ -1589,6 +1685,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.precision_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -1599,6 +1696,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         'precision_at_thresholds/false_positives:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     prec, _ = metrics.precision_at_thresholds(
@@ -1613,6 +1711,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [prec, rec])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, precision_op = metrics.precision_at_thresholds(
@@ -1628,6 +1727,7 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     self.assertListEqual(
         ops.get_collection(my_collection_name), [precision_op, recall_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_uniform(
         (10, 3), maxval=1, dtype=dtypes_lib.float32, seed=1)
@@ -1639,18 +1739,19 @@ class PrecisionRecallThresholdsTest(test.TestCase):
     rec, rec_op = metrics.recall_at_thresholds(labels, predictions, thresholds)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates, then verify idempotency.
-      sess.run([prec_op, rec_op])
+      self.evaluate([prec_op, rec_op])
       initial_prec = prec.eval()
       initial_rec = rec.eval()
       for _ in range(10):
-        sess.run([prec_op, rec_op])
+        self.evaluate([prec_op, rec_op])
         self.assertAllClose(initial_prec, prec.eval())
         self.assertAllClose(initial_rec, rec.eval())
 
   # TODO(nsilberman): fix tests (passing but incorrect).
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1663,12 +1764,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertEqual(1, prec.eval())
       self.assertEqual(1, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testSomeCorrect_multipleLabelDtypes(self):
     with self.cached_session() as sess:
       for label_dtype in (
@@ -1683,12 +1785,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
         rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                    thresholds)
 
-        sess.run(variables.local_variables_initializer())
-        sess.run([prec_op, rec_op])
+        self.evaluate(variables.local_variables_initializer())
+        self.evaluate([prec_op, rec_op])
 
         self.assertAlmostEqual(0.5, prec.eval())
         self.assertAlmostEqual(0.5, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testAllIncorrect(self):
     inputs = np.random.randint(0, 2, size=(100, 1))
 
@@ -1701,12 +1804,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval())
       self.assertAlmostEqual(0, rec.eval())
 
+  @test_util.run_deprecated_v1
   def testWeights1d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1729,14 +1833,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, prec_high.eval(), places=5)
       self.assertAlmostEqual(1.0, rec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, rec_high.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def testWeights2d(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1759,14 +1864,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec_low = array_ops.reshape(rec_low, shape=())
       rec_high = array_ops.reshape(rec_high, shape=())
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(1.0, prec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, prec_high.eval(), places=5)
       self.assertAlmostEqual(1.0, rec_low.eval(), places=5)
       self.assertAlmostEqual(0.0, rec_high.eval(), places=5)
 
+  @test_util.run_deprecated_v1
   def testExtremeThresholds(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -1783,14 +1889,15 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       [rec_low, rec_high] = array_ops.split(
           value=rec, num_or_size_splits=2, axis=0)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0.75, prec_low.eval())
       self.assertAlmostEqual(0.0, prec_high.eval())
       self.assertAlmostEqual(1.0, rec_low.eval())
       self.assertAlmostEqual(0.0, rec_high.eval())
 
+  @test_util.run_deprecated_v1
   def testZeroLabelsPredictions(self):
     with self.cached_session() as sess:
       predictions = array_ops.zeros([4], dtype=dtypes_lib.float32)
@@ -1801,12 +1908,13 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(labels, predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([prec_op, rec_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([prec_op, rec_op])
 
       self.assertAlmostEqual(0, prec.eval(), 6)
       self.assertAlmostEqual(0, rec.eval(), 6)
 
+  @test_util.run_deprecated_v1
   def testWithMultipleUpdates(self):
     num_samples = 1000
     batch_size = 10
@@ -1869,9 +1977,9 @@ class PrecisionRecallThresholdsTest(test.TestCase):
       rec, rec_op = metrics.recall_at_thresholds(tf_labels, tf_predictions,
                                                  thresholds)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(int(num_samples / batch_size)):
-        sess.run([prec_op, rec_op])
+        self.evaluate([prec_op, rec_op])
       # Since this is only approximate, we can't expect a 6 digits match.
       # Although with higher number of samples/thresholds we should see the
       # accuracy improving
@@ -1989,6 +2097,7 @@ class SingleLabelPrecisionAtKTest(test.TestCase):
     self._test_average_precision_at_k = functools.partial(
         _test_average_precision_at_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_nan(self):
     for labels in self._labels:
       # Classes 0,1,2 have 0 predictions, classes -1 and 4 are out of range.
@@ -1998,6 +2107,7 @@ class SingleLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
@@ -2025,6 +2135,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_average_precision_at_k = functools.partial(
         _test_average_precision_at_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_average_precision(self):
     # Example 1.
     # Matches example here:
@@ -2100,6 +2211,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
           expected=streaming_average_precision[i],
           weights=weights)
 
+  @test_util.run_deprecated_v1
   def test_average_precision_some_labels_out_of_range(self):
     """Tests that labels outside the [0, n_classes) range are ignored."""
     labels_ex1 = (-1, 0, 1, 2, 3, 4, 7)
@@ -2119,6 +2231,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_average_precision_at_k(
           predictions, labels, k, expected=avg_precision_ex1[i])
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_no_predictions(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2135,6 +2248,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_no_labels(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2151,6 +2265,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
         self._test_precision_at_top_k(
             predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5(self):
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                    [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]]
@@ -2184,6 +2299,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=3.0 / 10)
 
+  @test_util.run_deprecated_v1
   def test_three_labels_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) range are ignored."""
     predictions = [[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
@@ -2220,6 +2336,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_precision_at_top_k(
         predictions_idx, sp_labels, k=5, expected=3.0 / 10)
 
+  @test_util.run_deprecated_v1
   def test_3d_nan(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2238,6 +2355,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d_no_labels(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2256,6 +2374,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
       self._test_precision_at_top_k(
           predictions_idx, labels, k=5, expected=0.0, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2291,6 +2410,7 @@ class MultiLabelPrecisionAtKTest(test.TestCase):
     self._test_precision_at_top_k(
         predictions_idx, labels, k=5, expected=7.0 / 20)
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_some(self):
     predictions = [[[0.5, 0.1, 0.6, 0.3, 0.8, 0.0, 0.7, 0.2, 0.4, 0.9],
                     [0.3, 0.0, 0.7, 0.2, 0.4, 0.9, 0.5, 0.8, 0.1, 0.6]],
@@ -2432,6 +2552,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_nan(self):
     # Classes 0,1 have 0 labels, 0 predictions, classes -1 and 4 are out of
     # range.
@@ -2442,6 +2563,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
         self._test_recall_at_top_k(
             self._predictions_idx, labels, k=1, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k1_no_predictions(self):
     for labels in self._labels:
       # Class 2: 0 predictions.
@@ -2450,6 +2572,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=1, expected=0.0, class_id=2)
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1(self):
     for labels in self._labels:
       # Class 3: 1 label, 2 predictions, 1 correct.
@@ -2463,6 +2586,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=1, expected=1.0 / 2)
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1_weighted_class_id3(self):
     predictions = self._predictions
     predictions_idx = self._predictions_idx
@@ -2504,6 +2628,7 @@ class SingleLabelRecallAtKTest(test.TestCase):
           predictions_idx, labels, k=1, expected=2.0 / 2, class_id=3,
           weights=(2.0, 3.0))
 
+  @test_util.run_deprecated_v1
   def test_one_label_at_k1_weighted(self):
     predictions = self._predictions
     predictions_idx = self._predictions_idx
@@ -2553,6 +2678,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_nan(self):
     for labels in self._labels:
       # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
@@ -2562,6 +2688,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
         self._test_recall_at_top_k(
             self._predictions_idx, labels, k=5, expected=NAN, class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_no_predictions(self):
     for labels in self._labels:
       # Class 8: 1 label, no predictions.
@@ -2570,6 +2697,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=5, expected=0.0 / 1, class_id=8)
 
+  @test_util.run_deprecated_v1
   def test_at_k5(self):
     for labels in self._labels:
       # Class 2: 2 labels, both correct.
@@ -2595,6 +2723,7 @@ class MultiLabel2dRecallAtKTest(test.TestCase):
       self._test_recall_at_top_k(
           self._predictions_idx, labels, k=5, expected=3.0 / 6)
 
+  @test_util.run_deprecated_v1
   def test_at_k5_some_out_of_range(self):
     """Tests that labels outside the [0, n_classes) count in denominator."""
     labels = sparse_tensor.SparseTensorValue(
@@ -2647,6 +2776,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k = functools.partial(
         _test_recall_at_top_k, test_case=self)
 
+  @test_util.run_deprecated_v1
   def test_3d_nan(self):
     # Classes 0,3,4,6,9 have 0 labels, class 10 is out of range.
     for class_id in (0, 3, 4, 6, 9, 10):
@@ -2656,6 +2786,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
           self._predictions_idx, self._labels, k=5, expected=NAN,
           class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d_no_predictions(self):
     # Classes 1,8 have 0 predictions, >=1 label.
     for class_id in (1, 8):
@@ -2665,6 +2796,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
           self._predictions_idx, self._labels, k=5, expected=0.0,
           class_id=class_id)
 
+  @test_util.run_deprecated_v1
   def test_3d(self):
     # Class 2: 4 labels, all correct.
     self._test_recall_at_k(
@@ -2693,6 +2825,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
     self._test_recall_at_top_k(
         self._predictions_idx, self._labels, k=5, expected=7.0 / 12)
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_all(self):
     for class_id in xrange(10):
       self._test_recall_at_k(
@@ -2719,6 +2852,7 @@ class MultiLabel3dRecallAtKTest(test.TestCase):
         self._predictions_idx, self._labels, k=5, expected=NAN,
         weights=[[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def test_3d_ignore_some(self):
     # Class 2: 2 labels, both correct.
     self._test_recall_at_k(
@@ -2774,12 +2908,14 @@ class MeanAbsoluteErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_absolute_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(
         self, ('mean_absolute_error/count:0', 'mean_absolute_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_absolute_error(
@@ -2788,6 +2924,7 @@ class MeanAbsoluteErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_absolute_error(
@@ -2796,23 +2933,25 @@ class MeanAbsoluteErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_absolute_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     predictions = constant_op.constant(
         [2, 4, 6, 8], shape=(1, 4), dtype=dtypes_lib.float32)
@@ -2823,8 +2962,8 @@ class MeanAbsoluteErrorTest(test.TestCase):
     error, update_op = metrics.mean_absolute_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(3, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(3, self.evaluate(update_op))
       self.assertEqual(3, error.eval())
 
 
@@ -2833,6 +2972,7 @@ class MeanRelativeErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_relative_error(
         predictions=array_ops.ones((10, 1)),
@@ -2841,6 +2981,7 @@ class MeanRelativeErrorTest(test.TestCase):
     _assert_metric_variables(
         self, ('mean_relative_error/count:0', 'mean_relative_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_relative_error(
@@ -2850,6 +2991,7 @@ class MeanRelativeErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_relative_error(
@@ -2859,6 +3001,7 @@ class MeanRelativeErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
@@ -2867,17 +3010,18 @@ class MeanRelativeErrorTest(test.TestCase):
                                                    normalizer)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateNormalizedByLabels(self):
     np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32)
     np_labels = np.asarray([1, 3, 2, 3], dtype=np.float32)
@@ -2892,10 +3036,11 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=labels)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(expected_error, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(expected_error, self.evaluate(update_op))
       self.assertEqual(expected_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateNormalizedByZeros(self):
     np_predictions = np.asarray([2, 4, 6, 8], dtype=np.float32)
 
@@ -2908,8 +3053,8 @@ class MeanRelativeErrorTest(test.TestCase):
         labels, predictions, normalizer=array_ops.zeros_like(labels))
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0.0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0.0, self.evaluate(update_op))
       self.assertEqual(0.0, error.eval())
 
 
@@ -2918,12 +3063,14 @@ class MeanSquaredErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
     _assert_metric_variables(
         self, ('mean_squared_error/count:0', 'mean_squared_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_squared_error(
@@ -2932,6 +3079,7 @@ class MeanSquaredErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_squared_error(
@@ -2940,23 +3088,25 @@ class MeanSquaredErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     predictions = array_ops.zeros((1, 3), dtype=dtypes_lib.float32)
     labels = array_ops.zeros((1, 3), dtype=dtypes_lib.float32)
@@ -2964,10 +3114,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError(self):
     predictions = constant_op.constant(
         [2, 4, 6], shape=(1, 3), dtype=dtypes_lib.float32)
@@ -2977,10 +3128,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(6, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(6, self.evaluate(update_op))
       self.assertEqual(6, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     predictions = constant_op.constant(
         [2, 4, 6, 8], shape=(1, 4), dtype=dtypes_lib.float32)
@@ -2991,10 +3143,11 @@ class MeanSquaredErrorTest(test.TestCase):
     error, update_op = metrics.mean_squared_error(labels, predictions, weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(13, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(13, self.evaluate(update_op))
       self.assertEqual(13, error.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleBatchesOfSizeOne(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -3013,12 +3166,13 @@ class MeanSquaredErrorTest(test.TestCase):
 
       error, update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run(update_op)
-      self.assertAlmostEqual(208.0 / 6, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate(update_op)
+      self.assertAlmostEqual(208.0 / 6, self.evaluate(update_op), 5)
 
       self.assertAlmostEqual(208.0 / 6, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testMetricsComputedConcurrently(self):
     with self.cached_session() as sess:
       # Create the queue that populates one set of predictions.
@@ -3054,14 +3208,15 @@ class MeanSquaredErrorTest(test.TestCase):
       mse1, update_op1 = metrics.mean_squared_error(
           labels1, predictions1, name='msd1')
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([update_op0, update_op1])
-      sess.run([update_op0, update_op1])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([update_op0, update_op1])
+      self.evaluate([update_op0, update_op1])
 
-      mse0, mse1 = sess.run([mse0, mse1])
+      mse0, mse1 = self.evaluate([mse0, mse1])
       self.assertAlmostEqual(208.0 / 6, mse0, 5)
       self.assertAlmostEqual(79.0 / 6, mse1, 5)
 
+  @test_util.run_deprecated_v1
   def testMultipleMetricsOnMultipleBatchesOfSizeOne(self):
     with self.cached_session() as sess:
       # Create the queue that populates the predictions.
@@ -3081,9 +3236,9 @@ class MeanSquaredErrorTest(test.TestCase):
       mae, ma_update_op = metrics.mean_absolute_error(labels, predictions)
       mse, ms_update_op = metrics.mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([ma_update_op, ms_update_op])
-      sess.run([ma_update_op, ms_update_op])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([ma_update_op, ms_update_op])
+      self.evaluate([ma_update_op, ms_update_op])
 
       self.assertAlmostEqual(32.0 / 6, mae.eval(), 5)
       self.assertAlmostEqual(208.0 / 6, mse.eval(), 5)
@@ -3094,6 +3249,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.root_mean_squared_error(
         predictions=array_ops.ones((10, 1)), labels=array_ops.ones((10, 1)))
@@ -3101,6 +3257,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
         self,
         ('root_mean_squared_error/count:0', 'root_mean_squared_error/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.root_mean_squared_error(
@@ -3109,6 +3266,7 @@ class RootMeanSquaredErrorTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.root_mean_squared_error(
@@ -3117,23 +3275,25 @@ class RootMeanSquaredErrorTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.root_mean_squared_error(labels, predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3142,11 +3302,12 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
 
       self.assertEqual(0, rmse.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3156,10 +3317,11 @@ class RootMeanSquaredErrorTest(test.TestCase):
 
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAlmostEqual(math.sqrt(6), update_op.eval(), 5)
       self.assertAlmostEqual(math.sqrt(6), rmse.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights(self):
     with self.cached_session() as sess:
       predictions = constant_op.constant(
@@ -3171,8 +3333,8 @@ class RootMeanSquaredErrorTest(test.TestCase):
       rmse, update_op = metrics.root_mean_squared_error(labels, predictions,
                                                         weights)
 
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(math.sqrt(13), sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(math.sqrt(13), self.evaluate(update_op))
 
       self.assertAlmostEqual(math.sqrt(13), rmse.eval(), 5)
 
@@ -3187,6 +3349,7 @@ class MeanCosineDistanceTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_cosine_distance(
         predictions=array_ops.ones((10, 3)),
@@ -3197,6 +3360,7 @@ class MeanCosineDistanceTest(test.TestCase):
         'mean_cosine_distance/total:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.mean_cosine_distance(
@@ -3206,6 +3370,7 @@ class MeanCosineDistanceTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_cosine_distance(
@@ -3215,23 +3380,25 @@ class MeanCosineDistanceTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     predictions = random_ops.random_normal((10, 3), seed=1)
     labels = random_ops.random_normal((10, 3), seed=2)
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=1)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_error = error.eval()
       for _ in range(10):
         self.assertEqual(initial_error, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateZeroError(self):
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
 
@@ -3243,10 +3410,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError1(self):
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
@@ -3259,10 +3427,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithError2(self):
     np_predictions = np.matrix(
         ('0.819031913261206 0.567041924552012 0.087465312324590;'
@@ -3280,10 +3449,11 @@ class MeanCosineDistanceTest(test.TestCase):
     error, update_op = metrics.mean_cosine_distance(labels, predictions, dim=2)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertAlmostEqual(1.0, sess.run(update_op), 5)
+      self.evaluate(variables.local_variables_initializer())
+      self.assertAlmostEqual(1.0, self.evaluate(update_op), 5)
       self.assertAlmostEqual(1.0, error.eval(), 5)
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights1(self):
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
@@ -3299,10 +3469,11 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
-      self.assertEqual(0, sess.run(update_op))
+      self.evaluate(variables.local_variables_initializer())
+      self.assertEqual(0, self.evaluate(update_op))
       self.assertEqual(0, error.eval())
 
+  @test_util.run_deprecated_v1
   def testSingleUpdateWithErrorAndWeights2(self):
     np_predictions = np.matrix(('1 0 0;' '0 0 -1;' '1 0 0'))
     np_labels = np.matrix(('1 0 0;' '0 0 1;' '0 1 0'))
@@ -3318,7 +3489,7 @@ class MeanCosineDistanceTest(test.TestCase):
         labels, predictions, dim=2, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.5, update_op.eval())
       self.assertEqual(1.5, error.eval())
 
@@ -3328,6 +3499,7 @@ class PcntBelowThreshTest(test.TestCase):
   def setUp(self):
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.percentage_below(values=array_ops.ones((10,)), threshold=2)
     _assert_metric_variables(self, (
@@ -3335,6 +3507,7 @@ class PcntBelowThreshTest(test.TestCase):
         'percentage_below_threshold/total:0',
     ))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollection(self):
     my_collection_name = '__metrics__'
     mean, _ = metrics.percentage_below(
@@ -3343,6 +3516,7 @@ class PcntBelowThreshTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.percentage_below(
@@ -3351,6 +3525,7 @@ class PcntBelowThreshTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testOneUpdate(self):
     with self.cached_session() as sess:
       values = constant_op.constant(
@@ -3360,14 +3535,15 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt1, update_op1 = metrics.percentage_below(values, 7, name='medium')
       pcnt2, update_op2 = metrics.percentage_below(values, 1, name='low')
 
-      sess.run(variables.local_variables_initializer())
-      sess.run([update_op0, update_op1, update_op2])
+      self.evaluate(variables.local_variables_initializer())
+      self.evaluate([update_op0, update_op1, update_op2])
 
-      pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2])
+      pcnt0, pcnt1, pcnt2 = self.evaluate([pcnt0, pcnt1, pcnt2])
       self.assertAlmostEqual(1.0, pcnt0, 5)
       self.assertAlmostEqual(0.75, pcnt1, 5)
       self.assertAlmostEqual(0.0, pcnt2, 5)
 
+  @test_util.run_deprecated_v1
   def testSomePresentOneUpdate(self):
     with self.cached_session() as sess:
       values = constant_op.constant(
@@ -3382,11 +3558,11 @@ class PcntBelowThreshTest(test.TestCase):
       pcnt2, update_op2 = metrics.percentage_below(
           values, 1, weights=weights, name='low')
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertListEqual([1.0, 0.5, 0.0],
-                           sess.run([update_op0, update_op1, update_op2]))
+                           self.evaluate([update_op0, update_op1, update_op2]))
 
-      pcnt0, pcnt1, pcnt2 = sess.run([pcnt0, pcnt1, pcnt2])
+      pcnt0, pcnt1, pcnt2 = self.evaluate([pcnt0, pcnt1, pcnt2])
       self.assertAlmostEqual(1.0, pcnt0, 5)
       self.assertAlmostEqual(0.5, pcnt1, 5)
       self.assertAlmostEqual(0.0, pcnt2, 5)
@@ -3398,6 +3574,7 @@ class MeanIOUTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_iou(
         predictions=array_ops.ones([10, 1]),
@@ -3405,6 +3582,7 @@ class MeanIOUTest(test.TestCase):
         num_classes=2)
     _assert_metric_variables(self, ('mean_iou/total_confusion_matrix:0',))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
     mean_iou, _ = metrics.mean_iou(
@@ -3414,6 +3592,7 @@ class MeanIOUTest(test.TestCase):
         metrics_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [mean_iou])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_iou(
@@ -3423,12 +3602,14 @@ class MeanIOUTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10, 3])
     labels = array_ops.ones([10, 4])
     with self.assertRaises(ValueError):
       metrics.mean_iou(labels, predictions, num_classes=2)
 
+  @test_util.run_deprecated_v1
   def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10])
     labels = array_ops.ones([10])
@@ -3436,6 +3617,7 @@ class MeanIOUTest(test.TestCase):
     with self.assertRaises(ValueError):
       metrics.mean_iou(labels, predictions, num_classes=2, weights=weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     num_classes = 3
     predictions = random_ops.random_uniform(
@@ -3446,17 +3628,18 @@ class MeanIOUTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_iou = mean_iou.eval()
       for _ in range(10):
         self.assertEqual(initial_mean_iou, mean_iou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdates(self):
     num_classes = 3
     with self.cached_session() as sess:
@@ -3482,12 +3665,13 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 1.0 / 4.0, 0.])
       self.assertEqual(desired_output, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeights(self):
     num_classes = 2
     with self.cached_session() as sess:
@@ -3529,10 +3713,11 @@ class MeanIOUTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 3.0, 1.0 / 2.0])
       self.assertAlmostEqual(desired_output, mean_iou.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithMissingClass(self):
     # Test the case where there are no predicions and labels for
     # one class, and thus there is one row and one column with
@@ -3563,12 +3748,13 @@ class MeanIOUTest(test.TestCase):
 
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 3.0, 2.0 / 4.0])
       self.assertAlmostEqual(desired_output, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testUpdateOpEvalIsAccumulatedConfusionMatrix(self):
     predictions = array_ops.concat(
         [
@@ -3587,32 +3773,35 @@ class MeanIOUTest(test.TestCase):
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       confusion_matrix = update_op.eval()
       self.assertAllEqual([[3, 0], [2, 5]], confusion_matrix)
       desired_miou = np.mean([3. / 5., 5. / 7.])
       self.assertAlmostEqual(desired_miou, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
     num_classes = 1
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(40, update_op.eval()[0])
       self.assertEqual(1.0, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.ones([40])
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[0, 0], [40, 0]], update_op.eval())
       self.assertEqual(0., miou.eval())
 
+  @test_util.run_deprecated_v1
   def testResultsWithSomeMissing(self):
     predictions = array_ops.concat(
         [
@@ -3640,11 +3829,12 @@ class MeanIOUTest(test.TestCase):
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[2, 0], [2, 4]], update_op.eval())
       desired_miou = np.mean([2. / 4., 4. / 6.])
       self.assertAlmostEqual(desired_miou, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassInLabels(self):
     labels = constant_op.constant([
         [[0, 0, 1, 1, 0, 0],
@@ -3659,22 +3849,24 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[7, 4, 3], [3, 5, 2], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 3 * (7 / (7 + 3 + 7) + 5 / (5 + 4 + 5) + 0 / (0 + 5 + 0)),
           miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassOverallSmall(self):
     labels = constant_op.constant([0])
     predictions = constant_op.constant([0])
     num_classes = 2
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[1, 0], [0, 0]], update_op.eval())
       self.assertAlmostEqual(1, miou.eval())
 
+  @test_util.run_deprecated_v1
   def testMissingClassOverallLarge(self):
     labels = constant_op.constant([
         [[0, 0, 1, 1, 0, 0],
@@ -3689,7 +3881,7 @@ class MeanIOUTest(test.TestCase):
     num_classes = 3
     with self.cached_session() as sess:
       miou, update_op = metrics.mean_iou(labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([[9, 5, 0], [3, 7, 0], [0, 0, 0]], update_op.eval())
       self.assertAlmostEqual(
           1 / 2 * (9 / (9 + 3 + 5) + 7 / (7 + 5 + 3)), miou.eval())
@@ -3701,6 +3893,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.mean_per_class_accuracy(
         predictions=array_ops.ones([10, 1]),
@@ -3709,6 +3902,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     _assert_metric_variables(self, ('mean_accuracy/count:0',
                                     'mean_accuracy/total:0'))
 
+  @test_util.run_deprecated_v1
   def testMetricsCollections(self):
     my_collection_name = '__metrics__'
     mean_accuracy, _ = metrics.mean_per_class_accuracy(
@@ -3719,6 +3913,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     self.assertListEqual(
         ops.get_collection(my_collection_name), [mean_accuracy])
 
+  @test_util.run_deprecated_v1
   def testUpdatesCollection(self):
     my_collection_name = '__updates__'
     _, update_op = metrics.mean_per_class_accuracy(
@@ -3728,12 +3923,14 @@ class MeanPerClassAccuracyTest(test.TestCase):
         updates_collections=[my_collection_name])
     self.assertListEqual(ops.get_collection(my_collection_name), [update_op])
 
+  @test_util.run_deprecated_v1
   def testPredictionsAndLabelsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10, 3])
     labels = array_ops.ones([10, 4])
     with self.assertRaises(ValueError):
       metrics.mean_per_class_accuracy(labels, predictions, num_classes=2)
 
+  @test_util.run_deprecated_v1
   def testLabelsAndWeightsOfDifferentSizeRaisesValueError(self):
     predictions = array_ops.ones([10])
     labels = array_ops.ones([10])
@@ -3742,6 +3939,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
       metrics.mean_per_class_accuracy(
           labels, predictions, num_classes=2, weights=weights)
 
+  @test_util.run_deprecated_v1
   def testValueTensorIsIdempotent(self):
     num_classes = 3
     predictions = random_ops.random_uniform(
@@ -3752,11 +3950,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
         labels, predictions, num_classes=num_classes)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
 
       # Run several updates.
       for _ in range(10):
-        sess.run(update_op)
+        self.evaluate(update_op)
 
       # Then verify idempotency.
       initial_mean_accuracy = mean_accuracy.eval()
@@ -3788,12 +3986,13 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0, 1.0 / 3.0, 0.0])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithWeights(self):
     num_classes = 2
     with self.cached_session() as sess:
@@ -3835,10 +4034,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
 
       variables.local_variables_initializer().run()
       for _ in range(6):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([2.0 / 2.0, 0.5 / 1.5])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testMultipleUpdatesWithMissingClass(self):
     # Test the case where there are no predicions and labels for
     # one class, and thus there is one row and one column with
@@ -3870,12 +4070,13 @@ class MeanPerClassAccuracyTest(test.TestCase):
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
 
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       for _ in range(5):
-        sess.run(update_op)
+        self.evaluate(update_op)
       desired_output = np.mean([1.0 / 2.0, 2.0 / 3.0, 0.])
       self.assertAlmostEqual(desired_output, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testAllCorrect(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.zeros([40])
@@ -3883,10 +4084,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertEqual(1.0, update_op.eval()[0])
       self.assertEqual(1.0, mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testAllWrong(self):
     predictions = array_ops.zeros([40])
     labels = array_ops.ones([40])
@@ -3894,10 +4096,11 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual([0.0, 0.0], update_op.eval())
       self.assertEqual(0., mean_accuracy.eval())
 
+  @test_util.run_deprecated_v1
   def testResultsWithSomeMissing(self):
     predictions = array_ops.concat([
         constant_op.constant(0, shape=[5]), constant_op.constant(1, shape=[5])
@@ -3913,7 +4116,7 @@ class MeanPerClassAccuracyTest(test.TestCase):
     with self.cached_session() as sess:
       mean_accuracy, update_op = metrics.mean_per_class_accuracy(
           labels, predictions, num_classes, weights=weights)
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       desired_accuracy = np.array([2. / 2., 4. / 6.], dtype=np.float32)
       self.assertAllEqual(desired_accuracy, update_op.eval())
       desired_mean_accuracy = np.mean(desired_accuracy)
@@ -3926,12 +4129,14 @@ class FalseNegativesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_negatives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('false_negatives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -3945,11 +4150,12 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -3964,7 +4170,7 @@ class FalseNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(5., tn_update_op.eval())
       self.assertAllClose(5., tn.eval())
@@ -3976,6 +4182,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_negatives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -3983,6 +4190,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('false_negatives/false_negatives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -3994,11 +4202,12 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fn.eval())
       self.assertAllEqual((0, 2, 3), fn_update_op.eval())
       self.assertAllEqual((0, 2, 3), fn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4013,7 +4222,7 @@ class FalseNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fn.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn_update_op.eval())
       self.assertAllEqual((0.0, 8.0, 11.0), fn.eval())
@@ -4025,12 +4234,14 @@ class FalsePositivesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_positives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('false_positives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4044,11 +4255,12 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4063,7 +4275,7 @@ class FalsePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(14., tn_update_op.eval())
       self.assertAllClose(14., tn.eval())
@@ -4075,6 +4287,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.false_positives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4082,6 +4295,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('false_positives/false_positives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4093,11 +4307,12 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), fp.eval())
       self.assertAllEqual((7, 4, 2), fp_update_op.eval())
       self.assertAllEqual((7, 4, 2), fp.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4114,7 +4329,7 @@ class FalsePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), fp.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp_update_op.eval())
       self.assertAllEqual((125.0, 42.0, 12.0), fp.eval())
@@ -4126,12 +4341,14 @@ class TrueNegativesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_negatives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('true_negatives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4145,11 +4362,12 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(3., tn_update_op.eval())
       self.assertAllClose(3., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4164,7 +4382,7 @@ class TrueNegativesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(4., tn_update_op.eval())
       self.assertAllClose(4., tn.eval())
@@ -4176,6 +4394,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_negatives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4183,6 +4402,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('true_negatives/true_negatives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4194,11 +4414,12 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tn.eval())
       self.assertAllEqual((2, 5, 7), tn_update_op.eval())
       self.assertAllEqual((2, 5, 7), tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4213,7 +4434,7 @@ class TrueNegativesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tn.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn_update_op.eval())
       self.assertAllEqual((5.0, 15.0, 23.0), tn.eval())
@@ -4225,12 +4446,14 @@ class TruePositivesTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_positives(
         labels=(0, 1, 0, 1),
         predictions=(0, 0, 1, 1))
     _assert_metric_variables(self, ('true_positives/count:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4244,11 +4467,12 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(7., tn_update_op.eval())
       self.assertAllClose(7., tn.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     labels = constant_op.constant(((0, 1, 0, 1, 0),
                                    (0, 0, 1, 1, 1),
@@ -4263,7 +4487,7 @@ class TruePositivesTest(test.TestCase):
         labels=labels, predictions=predictions, weights=weights)
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllClose(0., tn.eval())
       self.assertAllClose(12., tn_update_op.eval())
       self.assertAllClose(12., tn.eval())
@@ -4275,6 +4499,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
     np.random.seed(1)
     ops.reset_default_graph()
 
+  @test_util.run_deprecated_v1
   def testVars(self):
     metrics.true_positives_at_thresholds(
         predictions=array_ops.ones((10, 1)),
@@ -4282,6 +4507,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
     _assert_metric_variables(self, ('true_positives/true_positives:0',))
 
+  @test_util.run_deprecated_v1
   def testUnweighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4293,11 +4519,12 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         predictions=predictions, labels=labels, thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0, 0, 0), tp.eval())
       self.assertAllEqual((3, 1, 0), tp_update_op.eval())
       self.assertAllEqual((3, 1, 0), tp.eval())
 
+  @test_util.run_deprecated_v1
   def testWeighted(self):
     predictions = constant_op.constant(((0.9, 0.2, 0.8, 0.1),
                                         (0.2, 0.9, 0.7, 0.6),
@@ -4310,7 +4537,7 @@ class TruePositivesAtThresholdsTest(test.TestCase):
         thresholds=[0.15, 0.5, 0.85])
 
     with self.cached_session() as sess:
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       self.assertAllEqual((0.0, 0.0, 0.0), tp.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp_update_op.eval())
       self.assertAllEqual((111.0, 37.0, 0.0), tp.eval())
diff --git a/tensorflow/python/kernel_tests/morphological_ops_test.py b/tensorflow/python/kernel_tests/morphological_ops_test.py
index 6d601554b80408ff6f419b164cd12bcb493a2f61..f54aaf30d0a928f2ff5f86ec1ec07658f272f8f7 100644
--- a/tensorflow/python/kernel_tests/morphological_ops_test.py
+++ b/tensorflow/python/kernel_tests/morphological_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -52,7 +53,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testDilationValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -216,7 +217,7 @@ class DilationTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="dilation2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
@@ -291,6 +292,7 @@ class DilationTest(test.TestCase):
         padding="SAME",
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testDilationGrad(self):
     for use_gpu in True, False:
       self._testDilationGradValidPadding_1x1x1(use_gpu)
@@ -327,7 +329,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      self.assertAllClose(out, out_tensor.eval())
+      self.assertAllClose(out, self.evaluate(out_tensor))
 
   def _testErosionValidPadding(self, use_gpu):
     # [1, 2, 2, 1]
@@ -491,7 +493,7 @@ class ErosionTest(test.TestCase):
           rates=rates,
           padding=padding,
           name="erosion2d")
-      out_shape = out_tensor.eval().shape
+      out_shape = self.evaluate(out_tensor).shape
 
       # Small delta is necessary for argmax to remain the same.
       err = gradient_checker.compute_gradient_error(
@@ -566,6 +568,7 @@ class ErosionTest(test.TestCase):
         padding="SAME",
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testErosionGrad(self):
     for use_gpu in True, False:
       self._testErosionGradValidPadding_1x1x1(use_gpu)
diff --git a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
index 15e38265421a30277fa46d362afe57249a11a4e7..380d2860da4771faf1c22fe870e38b8c13edd896 100644
--- a/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
+++ b/tensorflow/python/kernel_tests/neon_depthwise_conv_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_impl
 from tensorflow.python.ops import nn_ops
@@ -142,8 +143,8 @@ class DepthwiseConv2DTest(test.TestCase):
       conv_interface = nn_impl.depthwise_conv2d(
           t1, t2, strides=[1, stride, stride, 1], padding=padding)
 
-      native_result = sess.run(conv_native)
-      interface_result = sess.run(conv_interface)
+      native_result = self.evaluate(conv_native)
+      interface_result = self.evaluate(conv_interface)
 
     print("depthwise conv_2d: ", tensor_in_sizes, "*", filter_in_sizes,
           ", stride:", stride, ", padding: ", padding, ", max diff: ",
@@ -153,6 +154,7 @@ class DepthwiseConv2DTest(test.TestCase):
     self.assertShapeEqual(native_result, conv_native)
     self.assertShapeEqual(native_result, conv_interface)
 
+  @test_util.run_deprecated_v1
   def testDepthwiseConv2D(self):
     for index, (input_size, filter_size, _, stride,
                 padding) in enumerate(ConfigsToTest()):
@@ -211,11 +213,12 @@ class DepthwiseConv2DTest(test.TestCase):
         t2 = constant_op.constant(x2, shape=filter_in_sizes)
         conv = nn_ops.depthwise_conv2d_native(
             t1, t2, strides=[1, stride, stride, 1], padding=padding)
-        value = sess.run(conv)
+        value = self.evaluate(conv)
     print("value = ", value)
     self.assertAllClose(expected, np.ravel(value), 1e-5)
     self.assertShapeEqual(value, conv)
 
+  @test_util.run_deprecated_v1
   def testConv2D2x2Filter(self):
     # The inputs look like this (it's a 3 x 2 matrix, each of depth 2):
     #
diff --git a/tensorflow/python/kernel_tests/norm_op_test.py b/tensorflow/python/kernel_tests/norm_op_test.py
index e202b6e8a43b27cc6896a3a5d7e6d2f47b3bed5b..20b9ad95c8be7aa59a2a1b70d59341e2f3ec8fa4 100644
--- a/tensorflow/python/kernel_tests/norm_op_test.py
+++ b/tensorflow/python/kernel_tests/norm_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.platform import test as test_lib
@@ -35,6 +36,7 @@ def _AddTest(test, test_name, fn):
 
 class NormOpTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBadOrder(self):
     matrix = [[0., 1.], [2., 3.]]
     for ord_ in "fro", -7, -1.1, 0:
@@ -52,6 +54,7 @@ class NormOpTest(test_lib.TestCase):
                                    "'ord' must be a supported matrix norm"):
         linalg_ops.norm(matrix, ord=ord_, axis=[-2, -1])
 
+  @test_util.run_v1_only("b/120545219")
   def testInvalidAxis(self):
     matrix = [[0., 1.], [2., 3.]]
     for axis_ in [], [1, 2, 3], [[1]], [[1], [2]], [3.1415], [1, 1]:
@@ -70,7 +73,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_matrix = constant_op.constant(matrix)
         tf_norm = linalg_ops.norm(
             tf_matrix, ord=ord_, axis=axis_, keepdims=keep_dims_)
-        tf_norm_val = sess.run(tf_norm)
+        tf_norm_val = self.evaluate(tf_norm)
       else:
         tf_matrix = array_ops.placeholder(dtype_)
         tf_norm = linalg_ops.norm(
@@ -78,6 +81,7 @@ def _GetNormOpTest(dtype_, shape_, ord_, axis_, keep_dims_, use_static_shape_):
         tf_norm_val = sess.run(tf_norm, feed_dict={tf_matrix: matrix})
     self.assertAllClose(np_norm, tf_norm_val, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_matrix_norm = (isinstance(axis_, tuple) or
                       isinstance(axis_, list)) and len(axis_) == 2
diff --git a/tensorflow/python/kernel_tests/nth_element_op_test.py b/tensorflow/python/kernel_tests/nth_element_op_test.py
index 338b6cec0102c7149c0af4f8295c8c7263a5f2f6..4be78b2d5ca57d96e691215cf0f17f3a48fce130 100644
--- a/tensorflow/python/kernel_tests/nth_element_op_test.py
+++ b/tensorflow/python/kernel_tests/nth_element_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
@@ -35,7 +36,7 @@ class NthElementTest(test.TestCase):
     with self.cached_session(use_gpu=False) as sess:
       inputs_op = ops.convert_to_tensor(inputs, dtype=dtype)
       values_op = nn_ops.nth_element(inputs_op, n, reverse=reverse)
-      values = sess.run(values_op)
+      values = self.evaluate(values_op)
 
       self.assertShapeEqual(np_expected_values, values_op)
       self.assertAllClose(np_expected_values, values)
@@ -111,17 +112,20 @@ class NthElementTest(test.TestCase):
     self._testEnumerateN([10, 10, 10])
     self._testEnumerateN([10, 10, 10, 10])
 
+  @test_util.run_deprecated_v1
   def testInvalidInput(self):
     with self.assertRaisesRegexp(ValueError,
                                  "at least rank 1 but is rank 0"):
       nn_ops.nth_element(5, 0)
 
+  @test_util.run_deprecated_v1
   def testInvalidInputAtEval(self):
     with self.session(use_gpu=False):
       v = array_ops.placeholder(dtype=dtypes.float32)
       with self.assertRaisesOpError("Input must be >= 1-D"):
         nn_ops.nth_element(v, 0).eval(feed_dict={v: 5.0})
 
+  @test_util.run_deprecated_v1
   def testInvalidN(self):
     with self.assertRaisesRegexp(ValueError,
                                  "non-negative but is -1"):
@@ -130,6 +134,7 @@ class NthElementTest(test.TestCase):
                                  "scalar but has rank 1"):
       nn_ops.nth_element([5, 6, 3], [1])
 
+  @test_util.run_deprecated_v1
   def testInvalidNAtEval(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.session(use_gpu=False):
@@ -138,12 +143,14 @@ class NthElementTest(test.TestCase):
       with self.assertRaisesOpError("Need n >= 0, got -7"):
         values.eval(feed_dict={n: -7})
 
+  @test_util.run_deprecated_v1
   def testNTooLarge(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.assertRaisesRegexp(ValueError,
                                  "must have last dimension > n = 2"):
       nn_ops.nth_element(inputs, 2)
 
+  @test_util.run_deprecated_v1
   def testNTooLargeAtEval(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.session(use_gpu=False):
@@ -152,6 +159,7 @@ class NthElementTest(test.TestCase):
       with self.assertRaisesOpError(r"Input must have at least n\+1 columns"):
         values.eval(feed_dict={n: 2})
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     with self.session(use_gpu=False) as sess:
       inputs = array_ops.placeholder(dtypes.float32, shape=[3, 5])
diff --git a/tensorflow/python/kernel_tests/numerics_test.py b/tensorflow/python/kernel_tests/numerics_test.py
index 5db591ed304a60011a24347e12da45b523c6c305..f13f9d68062e7874222b5bc67d6fcc8378af0714 100644
--- a/tensorflow/python/kernel_tests/numerics_test.py
+++ b/tensorflow/python/kernel_tests/numerics_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -35,11 +36,11 @@ class VerifyTensorAllFiniteTest(test.TestCase):
   def testVerifyTensorAllFiniteSucceeds(self):
     x_shape = [5, 4]
     x = np.random.random_sample(x_shape).astype(np.float32)
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
       t_verified = numerics.verify_tensor_all_finite(t,
                                                      "Input is not a number.")
-      self.assertAllClose(x, t_verified.eval())
+      self.assertAllClose(x, self.evaluate(t_verified))
 
   def testVerifyTensorAllFiniteFails(self):
     x_shape = [5, 4]
@@ -48,21 +49,22 @@ class VerifyTensorAllFiniteTest(test.TestCase):
 
     # Test NaN.
     x[0] = np.nan
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
     # Test Inf.
     x[0] = np.inf
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       with self.assertRaisesOpError(my_msg):
         t = constant_op.constant(x, shape=x_shape, dtype=dtypes.float32)
         t_verified = numerics.verify_tensor_all_finite(t, my_msg)
-        t_verified.eval()
+        self.evaluate(t_verified)
 
 
+@test_util.run_v1_only("b/120545219")
 class NumericsTest(test.TestCase):
 
   def testInf(self):
@@ -73,7 +75,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf"):
-        a.eval()
+        self.evaluate(a)
 
   def testNaN(self):
     with self.session(graph=ops.Graph()):
@@ -83,7 +85,7 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testBoth(self):
     with self.session(graph=ops.Graph()):
@@ -93,13 +95,13 @@ class NumericsTest(test.TestCase):
       check = numerics.add_check_numerics_ops()
       a = control_flow_ops.with_dependencies([check], a)
       with self.assertRaisesOpError("Inf and NaN"):
-        a.eval()
+        self.evaluate(a)
 
   def testPassThrough(self):
     with self.session(graph=ops.Graph()):
       t1 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], shape=[2, 3])
       checked = array_ops.check_numerics(t1, message="pass through test")
-      value = checked.eval()
+      value = self.evaluate(checked)
       self.assertAllEqual(np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]]), value)
       self.assertEqual([2, 3], checked.get_shape())
 
diff --git a/tensorflow/python/kernel_tests/one_hot_op_test.py b/tensorflow/python/kernel_tests/one_hot_op_test.py
index 377d545c9cdd30fefd1d66d2138716bbb0b153f4..856ba7bb7f3c5fb340a80c88b7c4ff2c33277568 100644
--- a/tensorflow/python/kernel_tests/one_hot_op_test.py
+++ b/tensorflow/python/kernel_tests/one_hot_op_test.py
@@ -41,12 +41,12 @@ class OneHotTest(test.TestCase):
       else:
         ans = array_ops.one_hot(**inputs)
         if expected_err_re is None:
-          tf_ans = ans.eval()
+          tf_ans = self.evaluate(ans)
           self.assertAllEqual(tf_ans, truth)
           self.assertEqual(tf_ans.shape, ans.get_shape())
         else:
           with self.assertRaisesOpError(expected_err_re):
-            ans.eval()
+            self.evaluate(ans)
 
   def _testBothOneHot(self, truth, expected_err_re=None, raises=None, **inputs):
     self._testOneHot(truth, True, expected_err_re, raises, **inputs)
diff --git a/tensorflow/python/kernel_tests/pad_op_test.py b/tensorflow/python/kernel_tests/pad_op_test.py
index fc302c4141af776c25d1d4883765d4bc4989e482..7b1b054ae0656ef8ae988c1a3220a2a643afbcab 100644
--- a/tensorflow/python/kernel_tests/pad_op_test.py
+++ b/tensorflow/python/kernel_tests/pad_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -88,7 +89,7 @@ class PadOpTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       tf_val = array_ops.pad(np_inputs, paddings, mode=mode,
                              constant_values=constant_values)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(np_val, out)
     self.assertShapeEqual(np_val, tf_val)
 
@@ -116,6 +117,7 @@ class PadOpTest(test.TestCase):
           self._testGradient(np_inputs, paddings, mode=mode,
                              constant_values=constant_values)
 
+  @test_util.run_deprecated_v1
   def testInputDims(self):
     with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
@@ -124,6 +126,7 @@ class PadOpTest(test.TestCase):
                       array_ops.reshape(
                           [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim(self):
     with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
@@ -132,6 +135,7 @@ class PadOpTest(test.TestCase):
                       array_ops.reshape(
                           [1, 2], shape=[2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim2(self):
     with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
@@ -140,6 +144,7 @@ class PadOpTest(test.TestCase):
                       array_ops.reshape(
                           [1, 2], shape=[2, 1]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim3(self):
     with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
@@ -148,6 +153,7 @@ class PadOpTest(test.TestCase):
                       array_ops.reshape(
                           [1, 2], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsDim4(self):
     with self.session(use_gpu=True):
       with self.assertRaises(ValueError):
@@ -156,6 +162,7 @@ class PadOpTest(test.TestCase):
                       array_ops.reshape(
                           [1, 2, 3, 4, 5, 6], shape=[3, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsNonNegative(self):
     with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, "must be non-negative"):
@@ -164,6 +171,7 @@ class PadOpTest(test.TestCase):
                       constant_op.constant(
                           [-1, 0], shape=[1, 2]))
 
+  @test_util.run_deprecated_v1
   def testPaddingsNonNegative2(self):
     with self.session(use_gpu=True):
       with self.assertRaisesRegexp(ValueError, "must be non-negative"):
@@ -208,7 +216,7 @@ class PadOpTest(test.TestCase):
                                  constant_op.constant(paddings, padding_dtype),
                                  mode=mode,
                                  constant_values=0)
-          out = tf_val.eval()
+          out = self.evaluate(tf_val)
         self.assertAllEqual(np_val, out)
         self.assertShapeEqual(np_val, tf_val)
 
@@ -223,6 +231,7 @@ class PadOpTest(test.TestCase):
           np.random.randint(-100, 100, (4, 2, 1, 3)).astype(t),
           [[0, 0], [0, 0], [0, 0], [0, 0]], -123)
 
+  @test_util.run_deprecated_v1
   def testFloatTypes(self):
     for t in [np.float32, np.float64]:
       self._testAll(np.random.rand(2, 5).astype(t), [[1, 0], [2, 0]], 0.0)
@@ -250,17 +259,18 @@ class PadOpTest(test.TestCase):
     symmetric = array_ops.pad(x, [[1, 0], [0, 1]], mode="SYMMETRIC",
                               constant_values="PAD")
     with self.session(use_gpu=True):
-      self.assertAllEqual([[b"PAD", b"PAD", b"PAD"],
-                           [b"Hello", b"World", b"PAD"],
-                           [b"Goodnight", b"Moon", b"PAD"]], constant.eval())
+      self.assertAllEqual(
+          [[b"PAD", b"PAD", b"PAD"], [b"Hello", b"World", b"PAD"],
+           [b"Goodnight", b"Moon", b"PAD"]], self.evaluate(constant))
       self.assertAllEqual([[b"Goodnight", b"Moon", b"Goodnight"],
                            [b"Hello", b"World", b"Hello"],
                            [b"Goodnight", b"Moon", b"Goodnight"]],
-                          reflect.eval())
-      self.assertAllEqual([[b"Hello", b"World", b"World"],
-                           [b"Hello", b"World", b"World"],
-                           [b"Goodnight", b"Moon", b"Moon"]], symmetric.eval())
+                          self.evaluate(reflect))
+      self.assertAllEqual(
+          [[b"Hello", b"World", b"World"], [b"Hello", b"World", b"World"],
+           [b"Goodnight", b"Moon", b"Moon"]], self.evaluate(symmetric))
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Unknown paddings shape.
     inp = constant_op.constant(0.0, shape=[4, 4, 4, 4])
@@ -277,6 +287,7 @@ class PadOpTest(test.TestCase):
     padded = array_ops.pad(inp, array_ops.placeholder(dtypes.int32))
     self.assertAllEqual(None, padded.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testPartialShapeInformation(self):
     unknown = array_ops.placeholder(dtypes.int32)
 
@@ -327,7 +338,7 @@ class PadOpTest(test.TestCase):
     inp = np.asarray(7)
     with self.session(use_gpu=True):
       tf_val = array_ops.pad(inp, paddings)
-      out = tf_val.eval()
+      out = self.evaluate(tf_val)
     self.assertAllEqual(inp, out)
     self.assertShapeEqual(inp, tf_val)
 
@@ -337,10 +348,11 @@ class PadOpTest(test.TestCase):
       inp = np.asarray(7)
       with self.cached_session(use_gpu=True):
         tf_val = array_ops.pad(inp, constant_op.constant(paddings, dtype=dtype))
-        out = tf_val.eval()
+        out = self.evaluate(tf_val)
       self.assertAllEqual(inp, out)
       self.assertShapeEqual(inp, tf_val)
 
+  @test_util.run_deprecated_v1
   def testCollapseAdjacentNonPaddedDimensions(self):
     # pyformat: disable
     paddings_values = [[[0, 0], [0, 0], [0, 0], [0, 1]],
@@ -361,11 +373,12 @@ class PadOpTest(test.TestCase):
             [paddings_value[i][0] + inp.shape.dims[i].value for i in range(4)],
             [-1, -1, -1, -1])
         with self.cached_session(use_gpu=True):
-          self.assertAllEqual(inp.eval(), middle.eval())
+          self.assertAllEqual(inp.eval(), self.evaluate(middle))
           self.assertAllEqual(
-              np.zeros([row[0] for row in paddings_value]), left.eval())
+              np.zeros([row[0] for row in paddings_value]), self.evaluate(left))
           self.assertAllEqual(
-              np.zeros([row[1] for row in paddings_value]), right.eval())
+              np.zeros([row[1] for row in paddings_value]),
+              self.evaluate(right))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
index 95f3dcceeaa14909b706b1f1c0676c5df28b8427..e3999695d0605f49d1440c3305f020e4871940a3 100644
--- a/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
+++ b/tensorflow/python/kernel_tests/padding_fifo_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class PaddingFIFOQueueTest(test.TestCase):
 
   def testConstructor(self):
@@ -126,7 +128,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -158,7 +160,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -178,7 +180,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i]], vals)
 
   def testEnqueueAndBlockingDequeue(self):
@@ -193,13 +195,13 @@ class PaddingFIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -224,7 +226,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         enqueue_op.run()
 
       for i in xrange(len(elems)):
-        x_val, y_val = sess.run(dequeued_t)
+        x_val, y_val = self.evaluate(dequeued_t)
         x, y = elems[i]
         self.assertEqual([x], x_val)
         self.assertEqual([y], y_val)
@@ -243,9 +245,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual(1, size.eval())
+      self.assertEqual(1, self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual(0, size.eval())
+      self.assertEqual(0, self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -257,7 +259,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        vals = dequeued_t.eval()
+        vals = self.evaluate(dequeued_t)
         self.assertEqual([elems[i % 4]], vals)
 
   def testEmptyEnqueueMany(self):
@@ -269,9 +271,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual([0], size_t.eval())
+      self.assertEqual([0], self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -279,9 +281,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithDynamicShape(self):
     with self.cached_session():
@@ -290,9 +292,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpToWithDynamicShape(self):
     with self.cached_session():
@@ -301,9 +303,9 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op = q.enqueue(([10.0],))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testConstructPaddingFIFOQueueWithNoShape(self):
     with self.cached_session():
@@ -327,7 +329,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -344,7 +346,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       for i in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         self.assertEqual(float_elems[i % 4], float_val)
         self.assertAllEqual(int_elems[i % 4], int_val)
 
@@ -357,8 +359,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testDequeueUpToNoBlocking(self):
     with self.cached_session():
@@ -369,8 +371,8 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      self.assertAllEqual(elems[0:4], dequeued_t.eval())
-      self.assertAllEqual(elems[4:8], dequeued_t.eval())
+      self.assertAllEqual(elems[0:4], self.evaluate(dequeued_t))
+      self.assertAllEqual(elems[4:8], self.evaluate(dequeued_t))
 
   def testMultiDequeueMany(self):
     with self.cached_session() as sess:
@@ -387,17 +389,17 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
@@ -418,7 +420,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[0:4], float_val)
       self.assertAllEqual(int_elems[0:4], int_val)
       self.assertTrue(
@@ -428,11 +430,11 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertAllEqual(float_elems[4:8], float_val)
       self.assertAllEqual(int_elems[4:8], int_val)
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual(float_elems[8], float_val)
       self.assertAllEqual(int_elems[8], int_val)
       self.assertTrue(
@@ -459,7 +461,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -473,7 +475,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -500,7 +502,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       for enqueue_op in enqueue_ops:
         enqueue_op.run()
-      string_val, int_val = sess.run(dequeued_t)
+      string_val, int_val = self.evaluate(dequeued_t)
 
       self.assertAllEqual([[b"a", b"", b""], [b"ab", b"", b""],
                            [b"abc", b"", b""], [b"abc", b"d", b""],
@@ -514,7 +516,7 @@ class PaddingFIFOQueueTest(test.TestCase):
           tensor_shape.TensorShape(int_val.shape).is_compatible_with(dequeued_t[
               1].get_shape()))
 
-      string_val, int_val = sess.run(dequeued_single_t)
+      string_val, int_val = self.evaluate(dequeued_single_t)
       self.assertAllEqual([b"abc", b"d", b"e", b"f"], string_val)
       self.assertAllEqual([[1, 2, 3, 4]], int_val)
       self.assertTrue(
@@ -622,7 +624,7 @@ class PaddingFIFOQueueTest(test.TestCase):
                                    r"Expected \[2,\?,3\], got \[2,3,4\]"):
         sess.run([enqueue_op],
                  feed_dict={elems_bad: np.array([1] * 24).reshape((2, 3, 4))})
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testParallelEnqueueMany(self):
     with self.cached_session() as sess:
@@ -633,7 +635,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -656,7 +658,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -680,7 +682,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -700,11 +702,11 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def enqueue():
         for _ in xrange(100):
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       def dequeue():
         for _ in xrange(100):
-          self.assertTrue(sess.run(dequeued_t) in (10.0, 20.0))
+          self.assertTrue(self.evaluate(dequeued_t) in (10.0, 20.0))
 
       enqueue_threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       dequeue_threads = [self.checkedThread(target=dequeue) for _ in range(10)]
@@ -736,7 +738,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for i in xrange(250):
-          self.assertEqual(i, sess.run(dequeued_t))
+          self.assertEqual(i, self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -767,7 +769,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       dequeuemany_t = q.dequeue_many(count_placeholder)
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -776,7 +778,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       while elements_dequeued < 250:
         # With equal probability, run Dequeue or dequeue_many.
         if random.random() > 0.5:
-          self.assertEqual(elements_dequeued, dequeued_t.eval())
+          self.assertEqual(elements_dequeued, self.evaluate(dequeued_t))
           elements_dequeued += 1
         else:
           count = random.randint(0, min(20, 250 - elements_dequeued))
@@ -805,10 +807,10 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -832,10 +834,10 @@ class PaddingFIFOQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -882,12 +884,12 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
       close_op.run()
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
 
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -901,11 +903,11 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         for elem in elems:
-          self.assertEqual([elem], sess.run(dequeued_t))
+          self.assertEqual([elem], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -926,8 +928,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
-        self.assertAllEqual(elems[3:], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
+        self.assertAllEqual(elems[3:], self.evaluate(dequeued_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -947,7 +949,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -968,11 +970,11 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems, sess.run(dequeued_t))
+        self.assertAllEqual(elems, self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -993,11 +995,11 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def dequeue():
-        self.assertAllEqual(elems[:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[:3], self.evaluate(dequeued_t))
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1017,16 +1019,16 @@ class PaddingFIFOQueueTest(test.TestCase):
       cleanup_dequeue_t = q.dequeue()
 
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        self.assertAllEqual(elems[0:3], sess.run(dequeued_t))
+        self.assertAllEqual(elems[0:3], self.evaluate(dequeued_t))
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run(dequeued_t)
-        self.assertEqual(elems[3], sess.run(cleanup_dequeue_t))
+          self.evaluate(dequeued_t)
+        self.assertEqual(elems[3], self.evaluate(cleanup_dequeue_t))
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       enqueue_thread = self.checkedThread(target=enqueue)
       enqueue_thread.start()
@@ -1059,7 +1061,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def dequeue():
         with self.assertRaises(errors_impl.OutOfRangeError):
-          sess.run([dequeued_a_t, dequeued_b_t])
+          self.evaluate([dequeued_a_t, dequeued_b_t])
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1072,7 +1074,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       # Test that the elements in the partially-dequeued batch are
       # restored in the correct order.
       for elem_a, elem_b in zip(elems_a, elems_b):
-        val_a, val_b = sess.run([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
+        val_a, val_b = self.evaluate([cleanup_dequeue_a_t, cleanup_dequeue_b_t])
         self.assertEqual(elem_a, val_a)
         self.assertEqual(elem_b, val_b)
       self.assertEqual(0, q.size().eval())
@@ -1087,7 +1089,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1107,7 +1109,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -1155,7 +1157,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1163,8 +1165,8 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
-      self.assertEqual([50.0], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
       thread.join()
 
   def testBlockingEnqueueManyToFullQueue(self):
@@ -1178,7 +1180,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -1186,10 +1188,10 @@ class PaddingFIFOQueueTest(test.TestCase):
       # TODO(mrry): Figure out how to do this without sleeping.
       time.sleep(0.1)
       for elem in elems:
-        self.assertEqual([elem], dequeued_t.eval())
+        self.assertEqual([elem], self.evaluate(dequeued_t))
         time.sleep(0.01)
-      self.assertEqual([50.0], dequeued_t.eval())
-      self.assertEqual([60.0], dequeued_t.eval())
+      self.assertEqual([50.0], self.evaluate(dequeued_t))
+      self.assertEqual([60.0], self.evaluate(dequeued_t))
 
       # Make sure the thread finishes before exiting.
       thread.join()
@@ -1207,7 +1209,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # Expect the operation to succeed once the dequeue op runs.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1217,18 +1219,18 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
 
       for elem in [20.0, 30.0, 40.0, 50.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
       self.assertEqual(0, q.size().eval())
 
   def testBlockingEnqueueManyBeforeClose(self):
@@ -1242,7 +1244,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       enqueue_thread = self.checkedThread(target=blocking_enqueue)
       enqueue_thread.start()
@@ -1252,17 +1254,17 @@ class PaddingFIFOQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       close_thread = self.checkedThread(target=close)
       close_thread.start()
 
       # The dequeue will unblock both threads.
-      self.assertEqual(10.0, dequeued_t.eval())
+      self.assertEqual(10.0, self.evaluate(dequeued_t))
       enqueue_thread.join()
       close_thread.join()
       for elem in [20.0, 30.0, 50.0, 60.0]:
-        self.assertEqual(elem, dequeued_t.eval())
+        self.assertEqual(elem, self.evaluate(dequeued_t))
 
   def testDoesNotLoseValue(self):
     with self.cached_session():
@@ -1379,20 +1381,21 @@ class PaddingFIFOQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
+  @test_util.run_deprecated_v1
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
       q_empty = data_flow_ops.PaddingFIFOQueue(5, dtypes_lib.float32, ((),))
@@ -1434,7 +1437,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1444,14 +1447,14 @@ class PaddingFIFOQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1477,7 +1480,7 @@ class PaddingFIFOQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1486,7 +1489,7 @@ class PaddingFIFOQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
@@ -1517,7 +1520,7 @@ class PaddingFIFOQueueTest(test.TestCase):
       q.enqueue_many(input_tuple).run()
 
       output_tuple_t = q.dequeue_many(32)
-      output_tuple = sess.run(output_tuple_t)
+      output_tuple = self.evaluate(output_tuple_t)
 
       for (input_elem, output_elem) in zip(input_tuple, output_tuple):
         self.assertAllEqual(input_elem, output_elem)
diff --git a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
index c9221f8c209fce0b4a53b89fae043fc858a61953..f87f5170539eab9b3599b271fef9c7cce7cd150f 100644
--- a/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
+++ b/tensorflow/python/kernel_tests/parameterized_truncated_normal_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -166,30 +167,39 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot test truncated normal op: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testDefaults(self):
     self.validateMoments([10**5], 0.0, 1.0, -2.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def testShifted(self):
     self.validateMoments([10**5], -1.0, 1.0, -2.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def testRightTail(self):
     self.validateMoments([10**5], 0.0, 1.0, 4.0, np.infty)
 
+  @test_util.run_deprecated_v1
   def testLeftTail(self):
     self.validateMoments([10**5], 0.0, 1.0, -np.infty, -4.0)
 
+  @test_util.run_deprecated_v1
   def testLeftTailTwoSidedBounds(self):
     self.validateMoments([10**5], 0.0, 1.0, -6.0, -3.0)
 
+  @test_util.run_deprecated_v1
   def testTwoSidedLeftTailShifted(self):
     self.validateKolmogorovSmirnov([10**5], 6.0, 1.0, -1.0, 1.0)
 
+  @test_util.run_deprecated_v1
   def testRightTailShifted(self):
     self.validateMoments([10**5], -5.0, 1.0, 2.0, np.infty)
 
+  @test_util.run_deprecated_v1
   def testSmallStddev(self):
     self.validateKolmogorovSmirnov([10**5], 0.0, 0.1, 0.05, 0.10)
 
+  @test_util.run_deprecated_v1
   def testSamplingWithSmallStdDevFarFromBound(self):
     sample_op = random_ops.parameterized_truncated_normal(
         shape=(int(1e5),), means=0.8, stddevs=0.05, minvals=-1., maxvals=1.)
@@ -202,6 +212,7 @@ class ParameterizedTruncatedNormalTest(test.TestCase):
       no_neg_samples = np.sum(samples < 0.)
       self.assertEqual(no_neg_samples, 0.)
 
+  @test_util.run_deprecated_v1
   def testSamplingAtRandnSwitchover(self):
     # The randn sampler is used as the bounds are moved farther from the mean,
     # and the probability of accepting a sample increases the farther the
diff --git a/tensorflow/python/kernel_tests/parse_single_example_op_test.py b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
index a84895a287eeb0d67cce563254e2383e390c9e2c..43c8fa4ab5c5c9d71a4ac67fd0e90c34b36b45b4 100644
--- a/tensorflow/python/kernel_tests/parse_single_example_op_test.py
+++ b/tensorflow/python/kernel_tests/parse_single_example_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
@@ -107,7 +108,7 @@ class ParseExampleTest(test.TestCase):
         for result_dict in [out, out_with_example_name]:
           result = flatten_values_tensors_or_sparse(result_dict.values())
           # Check values.
-          tf_result = sess.run(result)
+          tf_result = self.evaluate(result)
           _compare_output_to_expected(self, result_dict, expected_values,
                                       tf_result)
 
@@ -121,6 +122,7 @@ class ParseExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
     a_name = "a"
@@ -229,6 +231,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_err=(ValueError, "Missing shape for feature a"))
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparse(self):
     original = [
         example(features=features({
@@ -552,6 +555,7 @@ class ParseExampleTest(test.TestCase):
           }
       }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
     original = [
         example(features=features({
@@ -618,6 +622,7 @@ class ParseExampleTest(test.TestCase):
           },
           expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
     original = [
         example(features=features({
@@ -658,6 +663,7 @@ class ParseExampleTest(test.TestCase):
           }
       }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
     aname = "a"
     bname = "b"
@@ -869,6 +875,7 @@ class ParseSingleExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
     original = example(features=features({
         "c": float_feature([3, 4]),
diff --git a/tensorflow/python/kernel_tests/parsing_ops_test.py b/tensorflow/python/kernel_tests/parsing_ops_test.py
index 71d8b60d3ccf9fafaa16fa705c3261e008d8409c..af76e09f3931004063a5faa2070058ee2e4a0fc5 100644
--- a/tensorflow/python/kernel_tests/parsing_ops_test.py
+++ b/tensorflow/python/kernel_tests/parsing_ops_test.py
@@ -33,6 +33,7 @@ from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
@@ -101,15 +102,15 @@ class ParseExampleTest(test.TestCase):
         out = parsing_ops.parse_example(**kwargs)
         result = flatten_values_tensors_or_sparse(out.values())
         # Check values.
-        tf_result = sess.run(result)
+        tf_result = self.evaluate(result)
         _compare_output_to_expected(self, out, expected_values, tf_result)
 
       # Check shapes; if serialized is a Tensor we need its size to
       # properly check.
       serialized = kwargs["serialized"]
       batch_size = (
-          serialized.eval().size if isinstance(serialized, ops.Tensor) else
-          np.asarray(serialized).size)
+          self.evaluate(serialized).size if isinstance(serialized, ops.Tensor)
+          else np.asarray(serialized).size)
       for k, f in kwargs["features"].items():
         if isinstance(f, parsing_ops.FixedLenFeature) and f.shape is not None:
           self.assertEqual(
@@ -121,6 +122,7 @@ class ParseExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (2,))
 
+  @test_util.run_deprecated_v1
   def testEmptySerializedWithAllDefaults(self):
     sparse_name = "st_a"
     a_name = "a"
@@ -243,6 +245,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_err=(ValueError, "Missing shape for feature a"))
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparse(self):
     original = [
         example(features=features({
@@ -571,6 +574,7 @@ class ParseExampleTest(test.TestCase):
         }
     }, expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureAndDenseWithNoDefault(self):
     expected_st_a = (  # indices, values, shape
         np.empty((0, 2), dtype=np.int64),  # indices
@@ -631,6 +635,7 @@ class ParseExampleTest(test.TestCase):
         },
         expected_output)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingSparseAndSparseFeatureWithReuse(self):
     expected_idx = (  # indices, values, shape
         np.array([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.int64),
@@ -740,6 +745,7 @@ class ParseExampleTest(test.TestCase):
     for batch_size in (1, 10, 20, 100, 256):
       self._testSerializedContainingVarLenDenseLargerBatch(batch_size)
 
+  @test_util.run_deprecated_v1
   def testSerializedContainingVarLenDense(self):
     aname = "a"
     bname = "b"
@@ -962,6 +968,7 @@ class ParseSingleExampleTest(test.TestCase):
           self.assertEqual(
               tuple(out[k].dense_shape.get_shape().as_list()), (1,))
 
+  @test_util.run_deprecated_v1
   def testSingleExampleWithSparseAndSparseFeatureAndDense(self):
     original = example(
         features=features({
@@ -1180,6 +1187,7 @@ class ParseSequenceExampleTest(test.TestCase):
         expected_err=expected_err,
         batch=True)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithSparseAndDenseContext(self):
     original = sequence_example(
         context=features({
@@ -1223,6 +1231,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_context_values=expected_context_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithMultipleSizeFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1286,6 +1295,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithoutDebugName(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1343,6 +1353,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithSparseAndDenseFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1401,6 +1412,7 @@ class ParseSequenceExampleTest(test.TestCase):
         },
         expected_feat_list_values=expected_feature_list_output)
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleWithEmptyFeatureInFeatureLists(self):
     original = sequence_example(
         feature_lists=feature_lists({
@@ -1541,6 +1553,7 @@ class ParseSequenceExampleTest(test.TestCase):
             " feature_list_dense_missing_assumed_empty or"
             " feature_list_dense_defaults?"))
 
+  @test_util.run_deprecated_v1
   def testSequenceExampleBatch(self):
     first = sequence_example(
         feature_lists=feature_lists({
@@ -1614,7 +1627,7 @@ class DecodeJSONExampleTest(test.TestCase):
           shape=examples.shape,
           dtype=dtypes.string)
       binary_tensor = parsing_ops.decode_json_example(json_tensor)
-      binary_val = sess.run(binary_tensor)
+      binary_val = self.evaluate(binary_tensor)
 
       if examples.shape:
         self.assertShapeEqual(binary_val, json_tensor)
@@ -1695,16 +1708,18 @@ class DecodeJSONExampleTest(test.TestCase):
             })),
     ])
 
+  @test_util.run_deprecated_v1
   def testInvalidSyntax(self):
     with self.cached_session() as sess:
       json_tensor = constant_op.constant(["{]"])
       binary_tensor = parsing_ops.decode_json_example(json_tensor)
       with self.assertRaisesOpError("Error while parsing JSON"):
-        sess.run(binary_tensor)
+        self.evaluate(binary_tensor)
 
 
 class ParseTensorOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testToFloat32(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.float32)
@@ -1718,6 +1733,7 @@ class ParseTensorOpTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @test_util.run_deprecated_v1
   def testToUint8(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.uint8)
@@ -1731,6 +1747,7 @@ class ParseTensorOpTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @test_util.run_deprecated_v1
   def testTypeMismatch(self):
     with self.cached_session():
       expected = np.random.rand(3, 4, 5).astype(np.uint8)
@@ -1744,6 +1761,7 @@ class ParseTensorOpTest(test.TestCase):
           r"\(uint16\)"):
         tensor.eval(feed_dict={serialized: tensor_proto.SerializeToString()})
 
+  @test_util.run_deprecated_v1
   def testInvalidInput(self):
     with self.cached_session():
       serialized = array_ops.placeholder(dtypes.string)
diff --git a/tensorflow/python/kernel_tests/partitioned_variables_test.py b/tensorflow/python/kernel_tests/partitioned_variables_test.py
index d1f0c6c2a056dc85e8ac038ddb0cf14ef00ccf0d..da79b4ecfc0a3972f610c1ed39cdd0201716bee4 100644
--- a/tensorflow/python/kernel_tests/partitioned_variables_test.py
+++ b/tensorflow/python/kernel_tests/partitioned_variables_test.py
@@ -26,6 +26,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
@@ -322,17 +323,19 @@ class PartitionedVariablesTestCase(test.TestCase):
     for i in xrange(len(expected_specs)):
       self.assertEquals(expected_specs[i], slices[i]._save_slice_info.spec)
 
+  @test_util.run_deprecated_v1
   def testVecConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([1, 2, 3, 4])
       vs = partitioned_variables.create_partitioned_variables([4], [4], rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 4, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["4 0,1", "4 1,1", "4 2,1", "4 3,1"])
 
+  @test_util.run_deprecated_v1
   def testConstantInit(self):
     with self.cached_session():
       rnd_par = constant_op.constant([[1, 2, 3, 4], [5, 6, 7, 8]])
@@ -340,7 +343,7 @@ class PartitionedVariablesTestCase(test.TestCase):
                                                               rnd_par)
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd_par.eval()
+      rnd = self.evaluate(rnd_par)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.int32] * 2, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, ["2 4 0,2:0,2", "2 4 0,2:2,2"])
@@ -401,12 +404,15 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertEqual(var2_name + "/part_0:0", vs2[0].name)
       self.assertEqual(var2_name + "/part_1:0", vs2[1].name)
 
+  @test_util.run_deprecated_v1
   def testName(self):
     self._testNameHelper(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceName(self):
     self._testNameHelper(use_resource=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomInitValue(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([200, 40]))
@@ -414,7 +420,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 10], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 1).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self.assertEqual([dtypes.float32] * 10, [v.dtype.base_dtype for v in vs])
       self._TestSaveSpec(vs, [
@@ -424,6 +430,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           "200 40 0,200:36,4"
       ])
 
+  @test_util.run_v1_only("b/120545219")
   def testRandomInitUnevenPartitions(self):
     with self.cached_session():
       rnd = variables.Variable(
@@ -434,7 +441,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           for i in xrange(1, 10)
       ]
       variables.global_variables_initializer().run()
-      rnd_val = rnd.eval()
+      rnd_val = self.evaluate(rnd)
       # Only check the slice save specs for the first 5 tf.
       save_specs = [
           # One slice
@@ -462,6 +469,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         if i < len(save_specs):
           self._TestSaveSpec(vs, save_specs[i])
 
+  @test_util.run_v1_only("b/120545219")
   def testDegenerate(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
@@ -469,10 +477,11 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [1, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, ["10 43 0,10:0,43"])
 
+  @test_util.run_v1_only("b/120545219")
   def testSliceSizeOne(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([10, 43]))
@@ -480,7 +489,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           rnd.get_shape(), [10, 1], rnd.initialized_value())
       variables.global_variables_initializer().run()
       val = array_ops.concat(vs, 0).eval()
-      rnd = rnd.eval()
+      rnd = self.evaluate(rnd)
       self.assertAllClose(rnd, val)
       self._TestSaveSpec(vs, [
           "10 43 0,1:0,43", "10 43 1,1:0,43", "10 43 2,1:0,43",
@@ -488,6 +497,7 @@ class PartitionedVariablesTestCase(test.TestCase):
           "10 43 6,1:0,43", "10 43 7,1:0,43", "10 43 8,1:0,43", "10 43 9,1:0,43"
       ])
 
+  @test_util.run_deprecated_v1
   def testIotaInitializer(self):
     self.assertAllClose([0., 1., 2., 3.], _IotaInitializer([4]))
     self.assertAllClose([[0., 1.], [0., 10.], [0., 100.], [0., 1000.]],
@@ -503,6 +513,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       self.assertAllClose(slice0 + slice1 + slice2, val)
       self._TestSaveSpec(vs, ["13 5 0,5:0,5", "13 5 5,4:0,5", "13 5 9,4:0,5"])
 
+  @test_util.run_deprecated_v1
   def testRandomInitializer(self):
     # Sanity check that the slices uses a different seed when using a random
     # initializer function.
@@ -510,7 +521,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer())
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertTrue(np.linalg.norm(val0 - val1) > 1e-6)
     # Negative test that proves that slices have the same values if
     # the random initializer uses a seed.
@@ -518,7 +529,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       var0, var1 = partitioned_variables.create_partitioned_variables(
           [20, 12], [1, 2], init_ops.random_uniform_initializer(seed=201))
       variables.global_variables_initializer().run()
-      val0, val1 = var0.eval().flatten(), var1.eval().flatten()
+      val0, val1 = self.evaluate(var0).flatten(), self.evaluate(var1).flatten()
       self.assertAllClose(val0, val1)
 
   def testSomeErrors(self):
@@ -546,6 +557,7 @@ class PartitionedVariablesTestCase(test.TestCase):
         partitioned_variables.create_partitioned_variables(
             [10, 43], [1, 50], rnd.initialized_value())
 
+  @test_util.run_deprecated_v1
   def testControlDepsNone(self):
     with self.cached_session() as session:
       c = constant_op.constant(1.0)
@@ -572,6 +584,7 @@ class PartitionedVariablesTestCase(test.TestCase):
       for op in reading_ops:
         self.assertEqual([], op.control_inputs)
 
+  @test_util.run_deprecated_v1
   def testConcat(self):
     with self.cached_session() as session:
       var_x = variable_scope.get_variable(
diff --git a/tensorflow/python/kernel_tests/pool_test.py b/tensorflow/python/kernel_tests/pool_test.py
index 372861297fb9243254fcba6f7064ce5cc63a6086..78e786f01ca9c167b5b175fcd833a83281c078de 100644
--- a/tensorflow/python/kernel_tests/pool_test.py
+++ b/tensorflow/python/kernel_tests/pool_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -151,7 +152,7 @@ class PoolingTest(test.TestCase):
         np.prod(input_shape), dtype=np.float32).reshape(input_shape) - 1
     y1 = pool_direct(input=x, **kwargs)
     y2 = nn_ops.pool(input=x, **kwargs)
-    self.assertAllClose(y1, y2.eval(), rtol=1e-2, atol=1e-2)
+    self.assertAllClose(y1, self.evaluate(y2), rtol=1e-2, atol=1e-2)
 
   def testPoolSimple(self):
     with self.session(use_gpu=test.is_gpu_available()):
@@ -301,6 +302,7 @@ class PoolingTest(test.TestCase):
     err_tolerance = 1e-2
     self.assertLess(err, err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testGradient1D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
@@ -327,6 +329,7 @@ class PoolingTest(test.TestCase):
                     dilation_rate=[1],
                     strides=strides)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
@@ -353,6 +356,7 @@ class PoolingTest(test.TestCase):
                     dilation_rate=[1, 1],
                     strides=strides)
 
+  @test_util.run_deprecated_v1
   def testGradient3D(self):
     with self.session(use_gpu=test.is_gpu_available()):
       for padding in ["SAME", "VALID"]:
diff --git a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
index e393c7a0229a907fbccc09b77a788e96069cf825..347e092dee3b964b3abba5fae2a46c80d80f79bf 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_3d_test.py
@@ -81,7 +81,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format)
       if data_format == "NCDHW":
         t = test_util.NCHWToNHWC(t)
-      vals = sess.run(t)
+      vals = self.evaluate(t)
     # Verifies values.
     actual = vals.flatten()
     self.assertAllClose(expected, actual)
@@ -253,6 +253,7 @@ class PoolingTest(test.TestCase):
         ksize = test_util.NHWCToNCHW(ksize)
         strides = test_util.NHWCToNCHW(strides)
         t = test_util.NHWCToNCHW(t)
+        output_sizes = test_util.NHWCToNCHW(output_sizes)
 
       t = pool_func(
           t,
@@ -294,6 +295,7 @@ class PoolingTest(test.TestCase):
                                               use_gpu=use_gpu,
                                               **kwargs)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -303,6 +305,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_1_6_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -312,6 +315,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_1_7_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -321,6 +325,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -330,6 +335,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradValidPadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -339,6 +345,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -348,6 +355,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -357,6 +365,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -366,6 +375,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -375,6 +385,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradSamePadding3_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.max_pool3d,
@@ -384,6 +395,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -393,6 +405,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -402,6 +415,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -411,6 +425,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradValidPadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -420,6 +435,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="VALID")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding1_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -429,6 +445,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding1_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -438,6 +455,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding2_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -447,6 +465,7 @@ class PoolingTest(test.TestCase):
         strides=(1, 1, 1),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding2_2_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
@@ -456,6 +475,7 @@ class PoolingTest(test.TestCase):
         strides=(2, 2, 2),
         padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGradSamePadding3_1_3d(self):
     self._ConstructAndTestGradient(
         nn_ops.avg_pool3d,
diff --git a/tensorflow/python/kernel_tests/pooling_ops_test.py b/tensorflow/python/kernel_tests/pooling_ops_test.py
index 53003a7f284d684bf51f3757043ff330c3066eb8..c33b59bb99b716b7164c82f6e640a8a3f4680351 100644
--- a/tensorflow/python/kernel_tests/pooling_ops_test.py
+++ b/tensorflow/python/kernel_tests/pooling_ops_test.py
@@ -166,7 +166,7 @@ class PoolingTest(test.TestCase):
             strides_placeholder: strides
         })
       else:
-        actual = t.eval()
+        actual = self.evaluate(t)
         self.assertShapeEqual(actual, t)
       self.assertAllCloseAccordingToType(expected, actual.flatten())
 
@@ -384,6 +384,7 @@ class PoolingTest(test.TestCase):
         expected=[],
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testAvgPooling(self):
     for use_gpu in True, False:
       self._testAvgPoolValidPadding(use_gpu)
@@ -577,6 +578,7 @@ class PoolingTest(test.TestCase):
         expected=[],
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPooling(self):
     for use_gpu in True, False:
       self._testMaxPoolValidPadding(use_gpu)
@@ -588,6 +590,7 @@ class PoolingTest(test.TestCase):
       self._testMaxPoolEmptyInput(use_gpu)
 
   # Tests for DepthwiseMaxPooling on CPU only.
+  @test_util.run_deprecated_v1
   def testDepthwiseMaxPool1x1DepthWindow1(self):
     # input is:
     # [1.0, ..., 10.0] along depth,
@@ -613,6 +616,7 @@ class PoolingTest(test.TestCase):
           use_gpu=False,
           v2=v2)
 
+  @test_util.run_deprecated_v1
   def testDepthwiseMaxPool2x2DepthWindow3(self):
     # input is:
     #
@@ -639,6 +643,7 @@ class PoolingTest(test.TestCase):
           use_gpu=False,
           v2=v2)
 
+  @test_util.run_deprecated_v1
   def testKernelSmallerThanStrideValid(self):
     for use_gpu in [True, False]:
       self._VerifyValues(
@@ -670,6 +675,7 @@ class PoolingTest(test.TestCase):
           expected=[5, 8, 26, 29],
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testKernelSmallerThanStrideSame(self):
     for use_gpu in [True, False]:
       for pool_func in [nn_ops.max_pool, nn_ops.avg_pool]:
@@ -750,11 +756,11 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op, _ = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
       self.assertAllCloseAccordingToType(cpu_val, gpu_val)
 
   def _CompareMaxPoolingBk(self, input_shape, output_shape, ksize, strides,
@@ -767,20 +773,20 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad_with_argmax(t, grad_in, argmax, ksize,
                                                       strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_output, shape=output_shape)
         out_op = gen_nn_ops.max_pool_grad(t, orig_out, grad_in, ksize, strides,
                                           padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -796,20 +802,20 @@ class PoolingTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         t = constant_op.constant(tensor_input, shape=input_shape)
         _, argmax_op = nn_ops.max_pool_with_argmax(t, ksize, strides, padding)
-        argmax = argmax_op.eval()
+        argmax = self.evaluate(argmax_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad_with_argmax(
             t, grad_in, argmax, ksize, strides, padding)
-        gpu_val = out_op.eval()
+        gpu_val = self.evaluate(out_op)
         self.assertShapeEqual(gpu_val, out_op)
       with self.cached_session(use_gpu=False):
         t = constant_op.constant(tensor_input, shape=input_shape)
         out_op = nn_ops.max_pool(t, ksize, strides, padding)
-        orig_out = out_op.eval()
+        orig_out = self.evaluate(out_op)
         grad_in = constant_op.constant(tensor_input, shape=input_shape)
         out_op = gen_nn_ops.max_pool_grad_grad(t, orig_out, grad_in, ksize,
                                                strides, padding)
-        cpu_val = out_op.eval()
+        cpu_val = self.evaluate(out_op)
         self.assertShapeEqual(cpu_val, out_op)
       # The CPU version accumulates its gradient on fp16, so it's less
       # accurate than the GPU version that does the accumulation on fp32
@@ -826,7 +832,7 @@ class PoolingTest(test.TestCase):
           strides=[1, 1, 1, 1],
           Targmax=dtypes.int64,
           padding="VALID")
-      out, argmax = sess.run([out_op, argmax_op])
+      out, argmax = self.evaluate([out_op, argmax_op])
       self.assertShapeEqual(out, out_op)
       self.assertShapeEqual(argmax, argmax_op)
       self.assertAllClose(out.ravel(), [1.0, 1.0, 1.0, 1.0])
@@ -848,7 +854,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out,
                           [11.0, 12.0, 0.0, 13.0, 0.0, 14.0, 0.0, 0.0, 0.0])
 
@@ -871,7 +877,7 @@ class PoolingTest(test.TestCase):
           ksize=[1, 2, 2, 1],
           strides=[1, 1, 1, 1],
           padding="VALID")
-      out = out_op.eval().flatten()
+      out = self.evaluate(out_op).flatten()
       self.assertAllClose(out, [11.0, 12.0, 14.0, 16.0])
 
   def _ConstructAndTestGradient(self,
@@ -1167,6 +1173,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1221,12 +1228,12 @@ class PoolingTest(test.TestCase):
           input_tensor, output_tensor, output_backprop_tensor, window_rows,
           window_cols, row_stride, col_stride, padding, v2)
 
-      actual_input_backprop = input_backprop_tensor.eval()
+      actual_input_backprop = self.evaluate(input_backprop_tensor)
       self.assertShapeEqual(actual_input_backprop, input_backprop_tensor)
       actual_input_backprop = actual_input_backprop.flatten()
       actual_input_backprop = self._GetNdArray(actual_input_backprop)
 
-      actual_output = output_tensor.eval().flatten()
+      actual_output = self.evaluate(output_tensor).flatten()
       actual_output = self._GetNdArray(actual_output)
 
       self.assertAllClose(
@@ -1497,6 +1504,7 @@ class PoolingTest(test.TestCase):
     else:
       del os.environ["TF_ENABLE_MAXPOOL_NANPROP"]
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradDirect(self):
     self._testMaxPoolGradDirect1_1()
     self._testMaxPoolGradDirect1_2()
@@ -1616,6 +1624,7 @@ class PoolingTest(test.TestCase):
           data_format=data_format,
           use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testMaxPoolGradGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testMaxPoolGradGradValidPadding1_1(data_format, use_gpu)
@@ -1649,6 +1658,7 @@ class PoolingTest(test.TestCase):
         orig_input, orig_output, grad, [1, window_rows, window_cols, 1],
         [1, row_stride, col_stride, 1], padding)
 
+  @test_util.run_deprecated_v1
   def testAvgPoolGrad(self):
     for (data_format, use_gpu) in GetTestConfigs():
       self._testAvgPoolGradValidPadding1_1(data_format, use_gpu)
@@ -1778,6 +1788,7 @@ class PoolingTest(test.TestCase):
         data_format=data_format,
         use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # All shapes unknown.
     for pool_func in [nn_ops.max_pool, nn_ops.avg_pool]:
@@ -1806,6 +1817,7 @@ class PoolingTest(test.TestCase):
             strides=[1, 1, 1, 1],
             padding="SAME")
 
+  @test_util.run_deprecated_v1
   def testOpEdgeCases(self):
     with self.session(use_gpu=test.is_gpu_available()) as sess:
       pool_funcs = [nn_ops.max_pool, nn_ops.avg_pool]
diff --git a/tensorflow/python/kernel_tests/priority_queue_test.py b/tensorflow/python/kernel_tests/priority_queue_test.py
index 73a9c81638259486f28a37755db86e4fe055f738..49ec7ee4836d40719971822aff9e063b7235dc8b 100644
--- a/tensorflow/python/kernel_tests/priority_queue_test.py
+++ b/tensorflow/python/kernel_tests/priority_queue_test.py
@@ -27,6 +27,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -35,6 +36,7 @@ from tensorflow.python.platform import test
 
 class PriorityQueueTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -50,7 +52,7 @@ class PriorityQueueTest(test.TestCase):
         enq.run()
 
       deq = q.dequeue_many(100)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       missed = set()
@@ -81,7 +83,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -93,7 +95,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -112,6 +114,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripFillsCapacityMultiThreadedEnqueueAndDequeue(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(10, (dtypes.int64), (()))
@@ -132,12 +135,12 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeued = []
 
       def dequeue(dequeue_op):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeued.extend(dequeue_indices)
 
@@ -184,10 +187,10 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue(dequeue_op, dequeued):
-        (dequeue_indices, dequeue_values) = sess.run(dequeue_op)
+        (dequeue_indices, dequeue_values) = self.evaluate(dequeue_op)
         self.assertAllEqual(dequeue_indices, dequeue_values)
         dequeue_wait.acquire()
         dequeued.extend(dequeue_indices)
@@ -215,7 +218,7 @@ class PriorityQueueTest(test.TestCase):
 
       # We can't guarantee full sorting because we can't guarantee
       # that the dequeued.extend() call runs immediately after the
-      # sess.run() call.  Here we're just happy everything came out.
+      # self.evaluate() call.  Here we're just happy everything came out.
       self.assertAllEqual(set(dequeued), set(all_enqueued_values))
 
   def testRoundTripInsertManyMultiThreadedReadOnceSorts(self):
@@ -236,7 +239,7 @@ class PriorityQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       dequeue_op = q.dequeue_many(100)
 
@@ -248,7 +251,7 @@ class PriorityQueueTest(test.TestCase):
       for t in enqueue_threads:
         t.start()
 
-      deq_elem, deq_value_0, deq_value_1 = sess.run(dequeue_op)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(dequeue_op)
 
       for t in enqueue_threads:
         t.join()
@@ -267,6 +270,7 @@ class PriorityQueueTest(test.TestCase):
         missed.remove((dv0, dv1))
       self.assertEqual(missed, set())
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceSorts(self):
     with self.cached_session() as sess:
       q = data_flow_ops.PriorityQueue(2000, (dtypes.string, dtypes.string), (
@@ -276,7 +280,7 @@ class PriorityQueueTest(test.TestCase):
       side_value_1 = np.random.rand(1000).astype(bytes)
       q.enqueue_many((elem, side_value_0, side_value_1)).run()
       deq = q.dequeue_many(1000)
-      deq_elem, deq_value_0, deq_value_1 = sess.run(deq)
+      deq_elem, deq_value_0, deq_value_1 = self.evaluate(deq)
 
       allowed = {}
       for e, v0, v1 in zip(elem, side_value_0, side_value_1):
@@ -288,6 +292,7 @@ class PriorityQueueTest(test.TestCase):
       for e, dv0, dv1 in zip(deq_elem, deq_value_0, deq_value_1):
         self.assertTrue((dv0, dv1) in allowed[e])
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadManySorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -296,6 +301,7 @@ class PriorityQueueTest(test.TestCase):
       deq_values = np.hstack((q.dequeue_many(100)[0].eval() for _ in range(10)))
       self.assertAllEqual(deq_values, sorted(elem))
 
+  @test_util.run_v1_only("b/120545219")
   def testRoundTripInsertOnceReadOnceLotsSorts(self):
     with self.cached_session():
       q = data_flow_ops.PriorityQueue(2000, (dtypes.int64), (()))
@@ -311,6 +317,7 @@ class PriorityQueueTest(test.TestCase):
       with self.assertRaises(TypeError):
         q.enqueue_many((["a", "b", "c"], ["a", "b", "c"])).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testInsertingNonScalarFails(self):
     with self.cached_session() as sess:
       input_priority = array_ops.placeholder(dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/py_func_test.py b/tensorflow/python/kernel_tests/py_func_test.py
index 837f1ec054ff4980cf4868c26dbdbe43cc1d1726..482633d539dfb0d1b0737846ba44ff3e0826ad43 100644
--- a/tensorflow/python/kernel_tests/py_func_test.py
+++ b/tensorflow/python/kernel_tests/py_func_test.py
@@ -102,6 +102,7 @@ class PyFuncTest(test.TestCase):
           script_ops.eager_py_func(np_func, [x, y], [dtypes.float32]))
       self.assertEqual(z[0], np_func(1.0, 2.0).astype(np.float32))
 
+  @test_util.run_v1_only("b/120545219")
   def testArray(self):
     with self.cached_session():
       x = constant_op.constant([1.0, 2.0], dtypes.float64)
@@ -168,6 +169,7 @@ class PyFuncTest(test.TestCase):
                              (dtypes.float64, dtypes.float64)))
       self.assertAllClose(y, [0.0, 1.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testStrings(self):
 
     def read_fixed_length_numpy_strings():
@@ -185,6 +187,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringsAreConvertedToBytes(self):
 
     def read_fixed_length_numpy_strings():
@@ -202,6 +205,7 @@ class PyFuncTest(test.TestCase):
           script_ops.py_func(read_and_return_strings, [x, y], dtypes.string))
       self.assertAllEqual(z, [b"hello there", b"hi there"])
 
+  @test_util.run_v1_only("b/120545219")
   def testObjectArraysAreConvertedToBytes(self):
 
     def read_object_array():
@@ -217,12 +221,14 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(read_and_return_strings, [x, y], [dtypes.string])
       self.assertListEqual(list(z.eval()), [b"hello there", b"hi ya"])
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPadding(self):
     correct = [b"this", b"is", b"a", b"test"]
     with self.cached_session():
       s, = script_ops.py_func(lambda: [correct], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testStringPaddingAreConvertedToBytes(self):
     inp = ["this", "is", "a", "test"]
     correct = [b"this", b"is", b"a", b"test"]
@@ -230,6 +236,7 @@ class PyFuncTest(test.TestCase):
       s, = script_ops.py_func(lambda: [inp], [], [dtypes.string])
       self.assertAllEqual(s.eval(), correct)
 
+  @test_util.run_v1_only("b/120545219")
   def testLarge(self):
     with self.cached_session() as sess:
       x = array_ops.zeros([1000000], dtype=np.float32)
@@ -243,6 +250,7 @@ class PyFuncTest(test.TestCase):
       x = self.evaluate(script_ops.py_func(lambda: 42.0, [], dtypes.float64))
       self.assertAllClose(x, 42.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testAlias(self):
     with self.cached_session():
       np_array = np.array([1.0, 2.0], dtype=np.float32)
@@ -251,6 +259,7 @@ class PyFuncTest(test.TestCase):
       value.op.run()
       self.assertAllEqual(np_array, [1.0, 2.0])
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnUnicodeString(self):
     with self.cached_session():
       correct = u"你好 世界"
@@ -261,6 +270,7 @@ class PyFuncTest(test.TestCase):
       z, = script_ops.py_func(unicode_string, [], [dtypes.string])
       self.assertEqual(z.eval(), correct.encode("utf8"))
 
+  @test_util.run_v1_only("b/120545219")
   def testBadNumpyReturnType(self):
     with self.cached_session():
 
@@ -272,8 +282,9 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported numpy type"):
-        y.eval()
+        self.evaluate(y)
 
+  @test_util.run_v1_only("b/120545219")
   def testBadReturnType(self):
     with self.cached_session():
 
@@ -285,8 +296,9 @@ class PyFuncTest(test.TestCase):
 
       with self.assertRaisesRegexp(errors.UnimplementedError,
                                    "Unsupported object type"):
-        z.eval()
+        self.evaluate(z)
 
+  @test_util.run_v1_only("b/120545219")
   def testReturnInput(self):
     with self.cached_session():
 
@@ -307,9 +319,9 @@ class PyFuncTest(test.TestCase):
     with session_lib.Session() as sess:
       producer = iter(range(3))
       x, = script_ops.py_func(lambda: next(producer), [], [dtypes.int64])
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 1)
-      self.assertEqual(sess.run(x), 2)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 1)
+      self.assertEqual(self.evaluate(x), 2)
 
   def testStateless(self):
     # Not using self.cached_session(), which disables optimization.
@@ -317,10 +329,11 @@ class PyFuncTest(test.TestCase):
       producer = iter(range(3))
       x, = script_ops.py_func(
           lambda: next(producer), [], [dtypes.int64], stateful=False)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
-      self.assertEqual(sess.run(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
+      self.assertEqual(self.evaluate(x), 0)
 
+  @test_util.run_v1_only("b/120545219")
   def testGradientFunction(self):
     # Input to tf.py_func is necessary, otherwise get_gradient_function()
     # returns None per default.
@@ -330,13 +343,15 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(None, ops.get_gradient_function(x.op))
     self.assertEqual(None, ops.get_gradient_function(y.op))
 
+  @test_util.run_v1_only("b/120545219")
   def testCOrder(self):
     with self.cached_session():
       val = [[1, 2], [3, 4]]
       x, = script_ops.py_func(lambda: np.array(val, order="F"), [],
                               [dtypes.int64])
-      self.assertAllEqual(val, x.eval())
+      self.assertAllEqual(val, self.evaluate(x))
 
+  @test_util.run_v1_only("b/120545219")
   def testParallel(self):
     # Tests that tf.py_func's can run in parallel if they release the GIL.
     with self.cached_session() as session:
@@ -382,6 +397,7 @@ class PyFuncTest(test.TestCase):
       self.assertIsNone(ret)
       self.assertAllEqual([3], s.value)
 
+  @test_util.run_v1_only("b/120545219")
   def testNoReturnValueStateless(self):
 
     def do_nothing(unused_x):
@@ -390,7 +406,7 @@ class PyFuncTest(test.TestCase):
     f = script_ops.py_func(
         do_nothing, [constant_op.constant(3, dtypes.int64)], [], stateful=False)
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(f), [])
+      self.assertEqual(self.evaluate(f), [])
 
   def _testExceptionHandling(self, py_exp, tf_exp, eager=False):
 
@@ -420,6 +436,7 @@ class PyFuncTest(test.TestCase):
     with self.assertRaisesWithPredicateMatch(tf_exp, expected_error_check):
       self.evaluate(f)
 
+  @test_util.run_v1_only("b/120545219")
   def testExceptionHandling(self):
     with self.cached_session():
       self._testExceptionHandling(ValueError, errors.InvalidArgumentError)
@@ -514,6 +531,7 @@ class PyFuncTest(test.TestCase):
       self.assertAllEqual(ret, [[3.0], [3.0], [3.0]])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testEagerExceptionHandling(self):
     with test_util.device(use_gpu=True):
       self._testExceptionHandling(
@@ -533,6 +551,7 @@ class PyFuncTest(test.TestCase):
       self._testExceptionHandling(WeirdError, errors.UnknownError, eager=True)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testEagerReturningVariableRaisesError(self):
     def return_variable():
       return resource_variable_ops.ResourceVariable(0.0)
@@ -556,6 +575,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = tape.gradient(y, x)
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraph(self):
 
     def f(x):
@@ -566,6 +586,7 @@ class PyFuncTest(test.TestCase):
     dy_dx = gradients_impl.gradients(y, x)[0]
     self.assertEqual(self.evaluate(dy_dx), 6.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphTwoOutputs(self):
 
     def f(x, y):
@@ -595,6 +616,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphMultipleArgs(self):
 
     def f(x, y):
@@ -608,6 +630,7 @@ class PyFuncTest(test.TestCase):
     self.assertEqual(self.evaluate(dz_dx), 6.0)
     self.assertEqual(self.evaluate(dz_dy), 8.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerGradientGraphLogHuber(self):
 
     def log_huber(x, m):
@@ -629,6 +652,7 @@ class PyFuncTest(test.TestCase):
       self.assertEqual(y, 1.0)
       self.assertEqual(dy_dx, 2.0)
 
+  @test_util.run_v1_only("b/120545219")
   def testEagerRespectsDevicePlacmentOfOp(self):
 
     def f(x):
diff --git a/tensorflow/python/kernel_tests/qr_op_test.py b/tensorflow/python/kernel_tests/qr_op_test.py
index a60237fb25a0ca5c2a26797452f0ce08e530f830..5adb95c7d60e88e43f6f171f6594c8542ef53143 100644
--- a/tensorflow/python/kernel_tests/qr_op_test.py
+++ b/tensorflow/python/kernel_tests/qr_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import linalg_ops
@@ -38,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class QrOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to qr should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -49,6 +52,7 @@ class QrOpTest(test.TestCase):
                                  "Shape must be at least rank 2 but is rank 1"):
       linalg_ops.qr(vector)
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       all_ops = []
@@ -60,7 +64,7 @@ class QrOpTest(test.TestCase):
             q1, r1 = linalg_ops.qr(matrix1, full_matrices=full_matrices_)
             q2, r2 = linalg_ops.qr(matrix2, full_matrices=full_matrices_)
             all_ops += [q1, r1, q2, r2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(8):
         q = 4 * i
         self.assertAllEqual(val[q], val[q + 2])  # q1 == q2
@@ -100,7 +104,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-14
     # Tests that a ~= q*r.
     a_recon = math_ops.matmul(q, r)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
@@ -110,8 +114,9 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       tol = 1e-5
     else:
       tol = 1e-14
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(1)
     x_np = np.random.uniform(
@@ -129,7 +134,7 @@ def _GetQrOpTest(dtype_, shape_, full_matrices_, use_static_shape_):
       q_tf, r_tf = linalg_ops.qr(x_tf, full_matrices=full_matrices_)
 
       if use_static_shape_:
-        q_tf_val, r_tf_val = sess.run([q_tf, r_tf])
+        q_tf_val, r_tf_val = self.evaluate([q_tf, r_tf])
       else:
         q_tf_val, r_tf_val = sess.run([q_tf, r_tf], feed_dict={x_tf: x_np})
 
@@ -160,6 +165,7 @@ class QrGradOpTest(test.TestCase):
 
 def _GetQrGradOpTest(dtype_, shape_, full_matrices_):
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -200,7 +206,8 @@ if __name__ == "__main__":
       for cols in 1, 2, 5, 10, 32, 100:
         for full_matrices in False, True:
           for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
-            for use_static_shape in True, False:
+            # TF2 does not support placeholders under eager so we skip it
+            for use_static_shape in set([True, tf2.enabled()]):
               shape = batch_dims + (rows, cols)
               name = "%s_%s_full_%s_static_%s" % (dtype.__name__,
                                                   "_".join(map(str, shape)),
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
index 0023506b77aeb561da2f65183ce7efb60402ba4c..576720528e20d5b4595f106ed7203462e57b2ac7 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_big_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
@@ -39,7 +40,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -57,7 +58,7 @@ class MultinomialTest(test.TestCase):
           num_samples=1000000,
           seed=15)
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
@@ -66,6 +67,7 @@ class MultinomialTest(test.TestCase):
             counts_by_indices[index] = count
     self.assertEqual(counts_by_indices[0], 100000000)
 
+  @test_util.run_deprecated_v1
   def testLargeDynamicRange3(self):
     random_seed.set_random_seed(10)
     counts_by_indices = {}
@@ -79,7 +81,7 @@ class MultinomialTest(test.TestCase):
       # we'll run out of memory if we try to draw 1e9 samples directly
       # really should fit in 12GB of memory...
       for _ in range(100):
-        x = sess.run(samples)
+        x = self.evaluate(samples)
         indices, counts = np.unique(x, return_counts=True)
         for index, count in zip(indices, counts):
           if index in counts_by_indices.keys():
diff --git a/tensorflow/python/kernel_tests/random/multinomial_op_test.py b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
index bd64d61af8e793e71a319b6ac1af95bd7dd16a3d..5d123307a8e62c072949d31d3c6b52a8fc39666b 100644
--- a/tensorflow/python/kernel_tests/random/multinomial_op_test.py
+++ b/tensorflow/python/kernel_tests/random/multinomial_op_test.py
@@ -66,12 +66,13 @@ class MultinomialTest(test.TestCase):
             logits, num_samples, output_dtype=output_dtype))
         self.assertAllEqual([[1] * num_samples, [2] * num_samples], samples)
 
+  @test_util.run_deprecated_v1
   def testOneOpMultipleStepsIndependent(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, _ = self._make_ops(10)
       # Consecutive runs shouldn't yield identical output.
-      sample1a = sess.run(sample_op1)
-      sample1b = sess.run(sample_op1)
+      sample1a = self.evaluate(sample_op1)
+      sample1b = self.evaluate(sample_op1)
       self.assertFalse(np.equal(sample1a, sample1b).all())
 
   def testEagerOneOpMultipleStepsIndependent(self):
@@ -81,26 +82,27 @@ class MultinomialTest(test.TestCase):
       self.assertFalse(np.equal(sample1.numpy(), sample2.numpy()).all())
 
   def testTwoOpsIndependent(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, sample_op2 = self._make_ops(32)
-      sample1, sample2 = sess.run([sample_op1, sample_op2])
+      sample1, sample2 = self.evaluate([sample_op1, sample_op2])
       # We expect sample1 and sample2 to be independent.
       # 1 in 2^32 chance of this assertion failing.
       self.assertFalse(np.equal(sample1, sample2).all())
 
+  @test_util.run_deprecated_v1
   def testTwoOpsSameSeedDrawSameSequences(self):
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       sample_op1, sample_op2 = self._make_ops(1000, seed=1)
-      sample1, sample2 = sess.run([sample_op1, sample_op2])
+      sample1, sample2 = self.evaluate([sample_op1, sample_op2])
       self.assertAllEqual(sample1, sample2)
 
   def testLargeLogits(self):
     for neg in [True, False]:
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         logits = np.array([[1000.] * 5])
         if neg:
           logits *= -1
-        samples = random_ops.multinomial(logits, 10).eval()
+        samples = self.evaluate(random_ops.multinomial(logits, 10))
       # Sampled classes should be in-range.
       self.assertTrue((samples >= 0).all())
       self.assertTrue((samples < 5).all())
@@ -157,10 +159,10 @@ class MultinomialTest(test.TestCase):
     Returns:
       Frequencies from sampled classes; shape [batch_size, num_classes].
     """
-    with self.test_session(use_gpu=True) as sess:
+    with test_util.use_gpu():
       random_seed.set_random_seed(1618)
       op = sampler(constant_op.constant(logits), num_samples)
-      d = sess.run(op)
+      d = self.evaluate(op)
 
     batch_size, num_classes = logits.shape
     freqs_mat = []
@@ -186,25 +188,27 @@ class MultinomialTest(test.TestCase):
 
   def testEmpty(self):
     classes = 5
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       for batch in 0, 3:
         for samples in 0, 7:
-          x = random_ops.multinomial(
-              array_ops.zeros([batch, classes]), samples).eval()
+          x = self.evaluate(
+              random_ops.multinomial(
+                  array_ops.zeros([batch, classes]), samples))
           self.assertEqual(x.shape, (batch, samples))
 
+  @test_util.run_deprecated_v1
   def testEmptyClasses(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       x = random_ops.multinomial(array_ops.zeros([5, 0]), 7)
       with self.assertRaisesOpError("num_classes should be positive"):
-        x.eval()
+        self.evaluate(x)
 
   def testNegativeMinLogits(self):
     random_seed.set_random_seed(78844)
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       logits = constant_op.constant([[np.finfo(np.float32).min] * 1023 + [0]])
       num_samples = 1000
-      samples = random_ops.multinomial(logits, num_samples).eval()
+      samples = self.evaluate(random_ops.multinomial(logits, num_samples))
       self.assertAllEqual([[1023] * num_samples], samples)
 
 
diff --git a/tensorflow/python/kernel_tests/random/random_crop_test.py b/tensorflow/python/kernel_tests/random/random_crop_test.py
index 8ded522320b730955e08b43cbf6da537f437b095..724bee07157181fd40b3b0c9ca4a9afac0688e7d 100644
--- a/tensorflow/python/kernel_tests/random/random_crop_test.py
+++ b/tensorflow/python/kernel_tests/random/random_crop_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
 
 
 class RandomCropTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     # No random cropping is performed since the size is value.shape.
     for shape in (2, 1, 1), (2, 1, 3), (4, 5, 3):
@@ -44,10 +46,11 @@ class RandomCropTest(test.TestCase):
           for i in range(2) for j in range(3) for k in range(4))
       crop = random_ops.random_crop(value, size=target)
       for _ in range(20):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, target)
         self.assertTrue(tuple(y.ravel()) in value_set)
 
+  @test_util.run_deprecated_v1
   def testRandomization(self):
     # Run 1x1 crop num_samples times in an image and ensure that one finds each
     # pixel 1/size of the time.
@@ -61,7 +64,7 @@ class RandomCropTest(test.TestCase):
       crop = random_ops.random_crop(value, single, seed=7)
       counts = np.zeros(size, dtype=np.int32)
       for _ in range(num_samples):
-        y = crop.eval()
+        y = self.evaluate(crop)
         self.assertAllEqual(y.shape, single)
         counts[y] += 1
 
diff --git a/tensorflow/python/kernel_tests/random/random_gamma_test.py b/tensorflow/python/kernel_tests/random/random_gamma_test.py
index 606e8862c47af0683d7c2695a1e8d4088c6e7afe..a5952a21968c79c8bfbcbfef2b09852f24f29923 100644
--- a/tensorflow/python/kernel_tests/random/random_gamma_test.py
+++ b/tensorflow/python/kernel_tests/random/random_gamma_test.py
@@ -26,6 +26,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -48,14 +49,16 @@ class RandomGammaTest(test.TestCase):
             [num], alpha, beta=beta, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
 
+  @test_util.run_deprecated_v1
   def testMomentsFloat32(self):
     self._testMoments(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testMomentsFloat64(self):
     self._testMoments(dtypes.float64)
 
@@ -208,6 +211,7 @@ class RandomGammaTest(test.TestCase):
         sy = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=use_gpu, seed=345)
         self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     """CSE = constant subexpression eliminator.
 
@@ -222,6 +226,7 @@ class RandomGammaTest(test.TestCase):
           diff = rnd2 - rnd1
           self.assertGreater(np.linalg.norm(diff.eval()), 0.1)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     # Fully known shape.
     rnd = random_ops.random_gamma([150], 2.0)
@@ -253,6 +258,7 @@ class RandomGammaTest(test.TestCase):
     rnd = random_ops.random_gamma([50], array_ops.placeholder(dtypes.float32))
     self.assertIs(None, rnd.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testPositive(self):
     n = int(10e3)
     for dt in [dtypes.float16, dtypes.float32, dtypes.float64]:
diff --git a/tensorflow/python/kernel_tests/random/random_grad_test.py b/tensorflow/python/kernel_tests/random/random_grad_test.py
index d89056c485a3d68a0ea5527391196b41d5fc0090..aac6eeac06abca3148947901b92b43058fe76e3c 100644
--- a/tensorflow/python/kernel_tests/random/random_grad_test.py
+++ b/tensorflow/python/kernel_tests/random/random_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -45,6 +46,7 @@ class AddLeadingUnitDimensionsTest(test.TestCase):
     ret = random_grad.add_leading_unit_dimensions(1.0, 2)
     self.assertAllEqual(ret.shape, [1, 1])
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     x = array_ops.placeholder(dtypes.float32)
     num_dimensions = array_ops.placeholder(dtypes.int32)
@@ -72,6 +74,7 @@ class RandomGammaGradTest(test.TestCase):
   some statistical properties of the derivative.
   """
 
+  @test_util.run_deprecated_v1
   def testGradientsShape(self):
     shape = [2, 3]
     alpha = array_ops.ones([2, 2])
@@ -81,6 +84,7 @@ class RandomGammaGradTest(test.TestCase):
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsShapeWithOneSamplePerParameter(self):
     shape = []
     alpha = array_ops.ones([2, 2])
@@ -90,6 +94,7 @@ class RandomGammaGradTest(test.TestCase):
     self.assertAllEqual(grads_alpha.shape, alpha.shape)
     self.assertAllEqual(grads_beta.shape, beta.shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsUnknownShape(self):
     shape = array_ops.placeholder(dtypes.int32)
     alpha = array_ops.placeholder(dtypes.float32)
@@ -138,9 +143,11 @@ class RandomGammaGradTest(test.TestCase):
     except ImportError as e:
       tf_logging.warn("Cannot use special functions in a test: %s" % str(e))
 
+  @test_util.run_deprecated_v1
   def testCompareToExplicitDerivativeFloat(self):
     self._testCompareToExplicitDerivative(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testCompareToExplicitDerivativeDouble(self):
     self._testCompareToExplicitDerivative(dtypes.float64)
 
@@ -182,12 +189,15 @@ class RandomGammaGradTest(test.TestCase):
 
     self.assertAllClose(actual_val, expected_val, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareToImplicitDerivativeFloat(self):
     self._testCompareToImplicitDerivative(dtypes.float32)
 
+  @test_util.run_deprecated_v1
   def testCompareToImplicitDerivativeDouble(self):
     self._testCompareToImplicitDerivative(dtypes.float64)
 
+  @test_util.run_deprecated_v1
   def testAverageAlphaGradient(self):
     """Statistical test for the gradient.
 
@@ -207,6 +217,7 @@ class RandomGammaGradTest(test.TestCase):
     dsample_dalpha_val = self.evaluate(dsample_dalpha)
     self.assertAllClose(dsample_dalpha_val, [1.0] * 3, atol=1e-1, rtol=1e-1)
 
+  @test_util.run_deprecated_v1
   def testQuadraticLoss(self):
     """Statistical test for the gradient.
 
diff --git a/tensorflow/python/kernel_tests/random/random_ops_test.py b/tensorflow/python/kernel_tests/random/random_ops_test.py
index 6de894846bcac7ccee43fd5e6b843d45a773a9ef..1384c3f446f97a76792a27cfc7f679e80402cbf0 100644
--- a/tensorflow/python/kernel_tests/random/random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/random_ops_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -49,9 +50,9 @@ class RandomOpTestCommon(test.TestCase):
         random_seed.set_random_seed(graph_seed)
       x = rng_func([num], min_or_mean, max_or_stddev, dtype=dtype, seed=op_seed)
 
-      y = sess.run(x)
-      z = sess.run(x)
-      w = sess.run(x)
+      y = self.evaluate(x)
+      z = self.evaluate(x)
+      w = self.evaluate(x)
 
       # We use exact equality here. If the random-number generator is producing
       # the same output, all three outputs will be bitwise identical.
@@ -69,7 +70,7 @@ class RandomNormalTest(RandomOpTestCommon):
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -92,6 +93,7 @@ class RandomNormalTest(RandomOpTestCommon):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       results = {}
@@ -104,12 +106,14 @@ class RandomNormalTest(RandomOpTestCommon):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
       sy = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
       self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     for use_gpu in [False, True]:
       with self.session(use_gpu=use_gpu):
@@ -119,12 +123,14 @@ class RandomNormalTest(RandomOpTestCommon):
         diff = rnd2 - rnd1
         self.assertTrue(np.linalg.norm(diff.eval()) > 0.1)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
         self._testSingleSessionNotConstant(
             random_ops.random_normal, 100, dt, 0.0, 1.0, use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionOpSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
@@ -137,6 +143,7 @@ class RandomNormalTest(RandomOpTestCommon):
             use_gpu=use_gpu,
             op_seed=1345)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionGraphSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in dtypes.float16, dtypes.float32, dtypes.float64:
@@ -160,7 +167,7 @@ class TruncatedNormalTest(test.TestCase):
             [num], mean=mu, stddev=sigma, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -185,6 +192,7 @@ class TruncatedNormalTest(test.TestCase):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     # Skip the test if there is no GPU.
     if not test.is_gpu_available():
@@ -203,6 +211,7 @@ class TruncatedNormalTest(test.TestCase):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 0.0, 1.0, dt, use_gpu=True, seed=345)
@@ -219,6 +228,7 @@ class TruncatedNormalTest(test.TestCase):
       print("std(x)", np.std(x), abs(np.std(x) / stddev - 0.85))
       self.assertTrue(abs(np.std(x) / stddev - 0.85) < 0.04)
 
+  @test_util.run_deprecated_v1
   def testLargeShape(self):
     with self.session(use_gpu=True):
       v = variables.Variable(
@@ -226,6 +236,7 @@ class TruncatedNormalTest(test.TestCase):
       n = random_ops.truncated_normal(v.shape)
       self.assertEqual([8589934592, 1], n.shape.as_list())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     with self.session(use_gpu=True):
       shape = [2, 3, 4]
@@ -256,7 +267,7 @@ class RandomUniformTest(RandomOpTestCommon):
             [num], minval=minv, maxval=maxv, dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -287,6 +298,7 @@ class RandomUniformTest(RandomOpTestCommon):
         print("count = ", count)
       self.assertTrue(count < count_limit)
 
+  @test_util.run_deprecated_v1
   def testUniformIntsWithInvalidShape(self):
     for dtype in dtypes.int32, dtypes.int64:
       with self.assertRaisesRegexp(
@@ -299,6 +311,7 @@ class RandomUniformTest(RandomOpTestCommon):
             [1000], minval=1, maxval=[2, 3], dtype=dtype)
 
   # Check that uniform ints actually follow a uniform distribution.
+  @test_util.run_deprecated_v1
   def testUniformInts(self):
     minv = -2
     maxv = 15
@@ -331,6 +344,7 @@ class RandomUniformTest(RandomOpTestCommon):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
                dtypes.int64):
@@ -342,6 +356,7 @@ class RandomUniformTest(RandomOpTestCommon):
         results[use_gpu] = sampler()
       self.assertAllEqual(results[False], results[True])
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
                dtypes.int64):
@@ -350,6 +365,7 @@ class RandomUniformTest(RandomOpTestCommon):
         sy = self._Sampler(1000, 0, 17, dtype=dt, use_gpu=True, seed=seed)
         self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     shape = [2, 3, 4]
     for dtype in dtypes.float16, dtypes.float32, dtypes.int32:
@@ -359,6 +375,7 @@ class RandomUniformTest(RandomOpTestCommon):
         diff = (rnd2 - rnd1).eval()
         self.assertTrue(np.linalg.norm(diff) > 0.1)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -366,6 +383,7 @@ class RandomUniformTest(RandomOpTestCommon):
         self._testSingleSessionNotConstant(
             random_ops.random_uniform, 100, dt, 0, 17, use_gpu=use_gpu)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionOpSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -379,6 +397,7 @@ class RandomUniformTest(RandomOpTestCommon):
             use_gpu=use_gpu,
             op_seed=1345)
 
+  @test_util.run_deprecated_v1
   def testSingleSessionGraphSeedNotConstant(self):
     for use_gpu in [False, True]:
       for dt in (dtypes.float16, dtypes.float32, dtypes.float64, dtypes.int32,
@@ -395,6 +414,7 @@ class RandomUniformTest(RandomOpTestCommon):
 
 class RandomShapeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTruncatedNormal(self):
     # Fully known shape.
     rnd1 = random_ops.truncated_normal([1, 2, 3])
@@ -407,6 +427,7 @@ class RandomShapeTest(test.TestCase):
     rnd3 = random_ops.truncated_normal(array_ops.placeholder(dtypes.int32))
     self.assertIs(None, rnd3.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testRandomNormal(self):
     # Fully known shape.
     rnd1 = random_ops.random_normal([1, 2, 3])
@@ -419,6 +440,7 @@ class RandomShapeTest(test.TestCase):
     rnd3 = random_ops.random_normal(array_ops.placeholder(dtypes.int32))
     self.assertIs(None, rnd3.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testRandomUniform(self):
     # Fully known shape.
     rnd1 = random_ops.random_uniform([1, 2, 3])
diff --git a/tensorflow/python/kernel_tests/random/random_poisson_test.py b/tensorflow/python/kernel_tests/random/random_poisson_test.py
index 417588f8a391ee73d9f944fe785db3c591a5b450..0a6b004d682e5d810a5a3e09ca6dce867e5f41f1 100644
--- a/tensorflow/python/kernel_tests/random/random_poisson_test.py
+++ b/tensorflow/python/kernel_tests/random/random_poisson_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.platform import test
@@ -43,7 +44,7 @@ class RandomPoissonTest(test.TestCase):
         rng = random_ops.random_poisson(lam, [num], dtype=dtype, seed=seed)
         ret = np.empty([10, num])
         for i in xrange(10):
-          ret[i, :] = sess.run(rng)
+          ret[i, :] = self.evaluate(rng)
       return ret
 
     return func
@@ -104,6 +105,7 @@ class RandomPoissonTest(test.TestCase):
 
   # Checks that the CPU and GPU implementation returns the same results,
   # given the same random seed
+  @test_util.run_deprecated_v1
   def testCPUGPUMatch(self):
     for dt in _SUPPORTED_DTYPES:
       results = {}
@@ -115,12 +117,14 @@ class RandomPoissonTest(test.TestCase):
       else:
         self.assertAllClose(results[False], results[True], rtol=1e-6, atol=1e-6)
 
+  @test_util.run_deprecated_v1
   def testSeed(self):
     for dt in dtypes.float16, dtypes.float32, dtypes.float64:
       sx = self._Sampler(1000, 1.0, dt, use_gpu=True, seed=345)
       sy = self._Sampler(1000, 1.0, dt, use_gpu=True, seed=345)
       self.assertAllEqual(sx(), sy())
 
+  @test_util.run_deprecated_v1
   def testNoCSE(self):
     """CSE = constant subexpression eliminator.
 
@@ -140,8 +144,9 @@ class RandomPoissonTest(test.TestCase):
     with self.cached_session():
       rnd = random_ops.random_poisson([], [], seed=12345)
       self.assertEqual([0], rnd.get_shape().as_list())
-      self.assertAllClose(np.array([], dtype=np.float32), rnd.eval())
+      self.assertAllClose(np.array([], dtype=np.float32), self.evaluate(rnd))
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     # Fully known shape
     rnd = random_ops.random_poisson(2.0, [150], seed=12345)
@@ -184,6 +189,7 @@ class RandomPoissonTest(test.TestCase):
         seed=12345)
     self.assertIs(None, rnd.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testDTypeCombinationsV2(self):
     """Tests random_poisson_v2() for all supported dtype combinations."""
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
index 0d85a072d4a2ff168f5e1c3233c7f7faf5c69a32..dd814a22b4e59261b33e1a57fd9014147792858b 100644
--- a/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
+++ b/tensorflow/python/kernel_tests/random/random_shuffle_queue_test.py
@@ -29,11 +29,13 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class RandomShuffleQueueTest(test.TestCase):
 
   def setUp(self):
@@ -84,9 +86,9 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeue_t = q.dequeue()
       results = []
       for _ in range(2):
-        a, b = sess.run(dequeue_t)
+        a, b = self.evaluate(dequeue_t)
         results.append((a, b))
-      a, b = sess.run(q.dequeue_many(3))
+      a, b = self.evaluate(q.dequeue_many(3))
       for i in range(3):
         results.append((a[i], b[i]))
       self.assertItemsEqual([(1, [5]), (2, [6]), (3, [7]), (4, [8]), (9, [10])],
@@ -101,7 +103,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Run one producer thread for each element in elems.
       def enqueue(enqueue_op):
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [
           self.checkedThread(
@@ -133,7 +135,7 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in enqueue_ops]
       for thread in threads:
@@ -167,13 +169,13 @@ class RandomShuffleQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         for enqueue_op in enqueue_ops:
-          sess.run(enqueue_op)
+          self.evaluate(enqueue_op)
 
       results = []
 
       def dequeue():
         for _ in xrange(len(elems)):
-          results.append(sess.run(dequeued_t))
+          results.append(self.evaluate(dequeued_t))
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -197,7 +199,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in xrange(len(elems)):
-        x, y = sess.run(dequeued_t)
+        x, y = self.evaluate(dequeued_t)
         results.append((x, y))
       self.assertItemsEqual(elems, results)
 
@@ -215,9 +217,9 @@ class RandomShuffleQueueTest(test.TestCase):
       self.assertEqual([], size.get_shape())
 
       enqueue_op.run()
-      self.assertEqual([1], size.eval())
+      self.assertEqual([1], self.evaluate(size))
       dequeued_t.op.run()
-      self.assertEqual([0], size.eval())
+      self.assertEqual([0], self.evaluate(size))
 
   def testEnqueueMany(self):
     with self.cached_session():
@@ -241,9 +243,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue_many((empty_t,))
       size_t = q.size()
 
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
       enqueue_op.run()
-      self.assertEqual(0, size_t.eval())
+      self.assertEqual(0, self.evaluate(size_t))
 
   def testEmptyDequeueMany(self):
     with self.cached_session():
@@ -251,9 +253,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_many(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueUpTo(self):
     with self.cached_session():
@@ -261,9 +263,9 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op = q.enqueue((10.0,))
       dequeued_t = q.dequeue_up_to(0)
 
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
       enqueue_op.run()
-      self.assertEqual([], dequeued_t.eval().tolist())
+      self.assertEqual([], self.evaluate(dequeued_t).tolist())
 
   def testEmptyDequeueManyWithNoShape(self):
     with self.cached_session():
@@ -275,7 +277,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -284,7 +286,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testEmptyDequeueUpToWithNoShape(self):
     with self.cached_session():
@@ -296,7 +298,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the shape not being constrained.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
       enqueue_op.run()
 
@@ -305,7 +307,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # elements enqueued.
       with self.assertRaisesOpError(
           "require the components to have specified shapes"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testMultiEnqueueMany(self):
     with self.cached_session() as sess:
@@ -321,7 +323,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       results = []
       for _ in range(8):
-        float_val, int_val = sess.run(dequeued_t)
+        float_val, int_val = self.evaluate(dequeued_t)
         results.append((float_val, [int_val[0], int_val[1]]))
       expected = list(zip(float_elems, int_elems)) * 2
       self.assertItemsEqual(expected, results)
@@ -335,7 +337,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -348,7 +350,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       enqueue_op.run()
 
-      results = dequeued_t.eval().tolist()
+      results = self.evaluate(dequeued_t).tolist()
       results.extend(dequeued_t.eval())
       self.assertItemsEqual(elems, results)
 
@@ -368,20 +370,20 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       self.assertEqual(float_val.shape, dequeued_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_t[1].get_shape())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -402,21 +404,21 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       results = []
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       # dequeue_up_to has undefined shape.
       self.assertEqual([None], dequeued_t[0].get_shape().as_list())
       self.assertEqual([None, 2], dequeued_t[1].get_shape().as_list())
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_t)
+      float_val, int_val = self.evaluate(dequeued_t)
       results.extend(zip(float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       self.assertEqual(float_val.shape, dequeued_single_t[0].get_shape())
       self.assertEqual(int_val.shape, dequeued_single_t[1].get_shape())
       results.append((float_val, int_val.tolist()))
 
-      float_val, int_val = sess.run(dequeued_single_t)
+      float_val, int_val = self.evaluate(dequeued_single_t)
       results.append((float_val, int_val.tolist()))
 
       self.assertItemsEqual(zip(float_elems, int_elems), results)
@@ -442,7 +444,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       # Enqueue 100 items in parallel on 10 threads.
       def enqueue():
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       threads = [self.checkedThread(target=enqueue) for _ in range(10)]
       for thread in threads:
@@ -466,7 +468,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -489,7 +491,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t))
+        dequeued_elems.extend(self.evaluate(dequeued_t))
 
       threads = [self.checkedThread(target=dequeue) for _ in range(10)]
       for thread in threads:
@@ -515,7 +517,7 @@ class RandomShuffleQueueTest(test.TestCase):
       dequeued_elems = []
 
       def dequeue(dequeue_op):
-        dequeued_elems.extend(sess.run(dequeue_op))
+        dequeued_elems.extend(self.evaluate(dequeue_op))
 
       threads = []
       for dequeue_op in dequeue_ops:
@@ -539,10 +541,10 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -566,10 +568,10 @@ class RandomShuffleQueueTest(test.TestCase):
         # The enqueue_op should run after the dequeue op has blocked.
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
-        sess.run(enqueue_op)
+        self.evaluate(enqueue_op)
 
       def dequeue():
-        dequeued_elems.extend(sess.run(dequeued_t).tolist())
+        dequeued_elems.extend(self.evaluate(dequeued_t).tolist())
 
       enqueue_thread = self.checkedThread(target=enqueue)
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -649,7 +651,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # Expect the operation to fail due to the queue being closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                    "is closed and has insufficient"):
-        dequeued_t.eval()
+        self.evaluate(dequeued_t)
 
   def testBlockingDequeueFromClosedQueue(self):
     with self.cached_session() as sess:
@@ -665,18 +667,18 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       # Manually dequeue until we hit min_size.
-      results.append(sess.run(dequeued_t))
-      results.append(sess.run(dequeued_t))
+      results.append(self.evaluate(dequeued_t))
+      results.append(self.evaluate(dequeued_t))
 
       def blocking_dequeue():
-        results.append(sess.run(dequeued_t))
-        results.append(sess.run(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
+        results.append(self.evaluate(dequeued_t))
 
         self.assertItemsEqual(elems, results)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=blocking_dequeue)
       dequeue_thread.start()
@@ -701,7 +703,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         finished.append(True)
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -727,12 +729,12 @@ class RandomShuffleQueueTest(test.TestCase):
       progress = []  # Must be mutable
 
       def dequeue():
-        self.assertItemsEqual(elems, sess.run(dequeued_t))
+        self.assertItemsEqual(elems, self.evaluate(dequeued_t))
         progress.append(1)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         progress.append(2)
 
       self.assertEqual(len(progress), 0)
@@ -763,9 +765,9 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(3, len(results))
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(4, len(results))
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -794,11 +796,11 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(3, len(results))
         # min_after_dequeue is 2, we ask for 3 elements, and we end up only
         # getting the remaining 1.
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEquals(4, len(results))
 
       dequeue_thread = self.checkedThread(target=dequeue)
@@ -824,16 +826,16 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
 
       def dequeue():
-        results.extend(sess.run(dequeued_t))
+        results.extend(self.evaluate(dequeued_t))
         self.assertEqual(len(results), 3)
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
         # While the last dequeue failed, we want to insure that it returns
         # any elements that it potentially reserved to dequeue. Thus the
         # next cleanup should return a single element.
-        results.extend(sess.run(cleanup_dequeue_t))
+        results.extend(self.evaluate(cleanup_dequeue_t))
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -854,7 +856,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -874,7 +876,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.OutOfRangeError,
                                      "is closed and has insufficient"):
-          sess.run(dequeued_t)
+          self.evaluate(dequeued_t)
 
       dequeue_thread = self.checkedThread(target=dequeue)
       dequeue_thread.start()
@@ -922,7 +924,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -950,7 +952,7 @@ class RandomShuffleQueueTest(test.TestCase):
       enqueue_op.run()
 
       def blocking_enqueue():
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread = self.checkedThread(target=blocking_enqueue)
       thread.start()
@@ -987,11 +989,11 @@ class RandomShuffleQueueTest(test.TestCase):
       def blocking_enqueue():
         # Expect the operation to succeed since it will complete
         # before the queue is closed.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
         # Expect the operation to fail due to the queue being closed.
         with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-          sess.run(blocking_enqueue_op)
+          self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1001,7 +1003,7 @@ class RandomShuffleQueueTest(test.TestCase):
       time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1032,7 +1034,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       def blocking_enqueue():
         # This will block until the dequeue after the close.
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
       thread1 = self.checkedThread(target=blocking_enqueue)
       thread1.start()
@@ -1040,7 +1042,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # First blocking_enqueue_op of blocking_enqueue has enqueued 1 of 2
       # elements, and is blocked waiting for one more element to be dequeue.
       for i in range(50):
-        queue_size = size_t.eval()
+        queue_size = self.evaluate(size_t)
         if queue_size == 4:
           break
         elif i == 49:
@@ -1050,7 +1052,7 @@ class RandomShuffleQueueTest(test.TestCase):
         time.sleep(0.1)
 
       def blocking_close():
-        sess.run(close_op)
+        self.evaluate(close_op)
 
       thread2 = self.checkedThread(target=blocking_close)
       thread2.start()
@@ -1064,7 +1066,7 @@ class RandomShuffleQueueTest(test.TestCase):
       # At this point the close operation will complete, so the next enqueue
       # will fail.
       with self.assertRaisesRegexp(errors_impl.CancelledError, "closed"):
-        sess.run(blocking_enqueue_op)
+        self.evaluate(blocking_enqueue_op)
 
   def testSharedQueueSameSession(self):
     with self.cached_session():
@@ -1216,23 +1218,23 @@ class RandomShuffleQueueTest(test.TestCase):
 
   def _blockingDequeue(self, sess, dequeue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_op)
+      self.evaluate(dequeue_op)
 
   def _blockingDequeueMany(self, sess, dequeue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_many_op)
+      self.evaluate(dequeue_many_op)
 
   def _blockingDequeueUpTo(self, sess, dequeue_up_to_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(dequeue_up_to_op)
+      self.evaluate(dequeue_up_to_op)
 
   def _blockingEnqueue(self, sess, enqueue_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_op)
+      self.evaluate(enqueue_op)
 
   def _blockingEnqueueMany(self, sess, enqueue_many_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(enqueue_many_op)
+      self.evaluate(enqueue_many_op)
 
   def testResetOfBlockingOperation(self):
     with self.cached_session() as sess:
@@ -1383,7 +1385,7 @@ class RandomShuffleQueueTest(test.TestCase):
       def blocking_enqueue():
         enq_done.append(False)
         # This will fill the queue and then block until enough dequeues happen.
-        sess.run(enq)
+        self.evaluate(enq)
         enq_done.append(True)
 
       thread = self.checkedThread(target=blocking_enqueue)
@@ -1393,14 +1395,14 @@ class RandomShuffleQueueTest(test.TestCase):
       results = []
       results.append(deq.eval())  # Will only complete after the enqueue starts.
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       for _ in range(3):
         results.append(deq.eval())
 
       time.sleep(0.1)
       self.assertEqual(len(enq_done), 1)
-      self.assertEqual(sess.run(size_op), 5)
+      self.assertEqual(self.evaluate(size_op), 5)
 
       # This dequeue will unblock the thread.
       results.append(deq.eval())
@@ -1415,6 +1417,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       self.assertItemsEqual(elem, results)
 
+  @test_util.run_v1_only("b/120545219")
   def testBigDequeueMany(self):
     with self.cached_session() as sess:
       q = data_flow_ops.RandomShuffleQueue(2, 0, dtypes_lib.int32, ((),))
@@ -1426,7 +1429,7 @@ class RandomShuffleQueueTest(test.TestCase):
 
       def blocking_dequeue():
         # Will only complete after 4 enqueues complete.
-        results.extend(sess.run(deq))
+        results.extend(self.evaluate(deq))
 
       thread = self.checkedThread(target=blocking_dequeue)
       thread.start()
@@ -1435,7 +1438,7 @@ class RandomShuffleQueueTest(test.TestCase):
         # TODO(mrry): Figure out how to do this without sleeping.
         time.sleep(0.1)
         self.assertEqual(len(results), 0)
-        sess.run(enq)
+        self.evaluate(enq)
 
       # Enough enqueued to unblock the dequeue
       thread.join()
diff --git a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
index d57db3c5126059c27cf23b493c4cb09d4987459d..898f38444b7fcab52129c4e53761cdb78c2fd825 100644
--- a/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
+++ b/tensorflow/python/kernel_tests/random/stateless_random_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import random_seed
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import stateless_random_ops as stateless
@@ -58,11 +59,11 @@ class StatelessOpsTest(test.TestCase):
       preseed = invert_philox(key, (seed[0], 0, seed[1], 0)).astype(np.uint64)
       preseed = preseed[::2] | preseed[1::2] << 32
       random_seed.set_random_seed(seed[0])
-      with self.test_session(use_gpu=True):
+      with test_util.use_gpu():
         for stateless_op, stateful_op in cases:
           stateful = stateful_op(seed=seed[1])
           pure = stateless_op(seed=preseed)
-          self.assertAllEqual(stateful.eval(), pure.eval())
+          self.assertAllEqual(self.evaluate(stateful), self.evaluate(pure))
 
   def _test_determinism(self, cases):
     # Stateless values should be equal iff the seeds are equal (roughly)
@@ -128,23 +129,29 @@ class StatelessOpsTest(test.TestCase):
           yield (functools.partial(stateless.stateless_multinomial, **kwds),
                  functools.partial(random_ops.multinomial, **kwds))
 
+  @test_util.run_deprecated_v1
   def testMatchFloat(self):
     self._test_match(self._float_cases())
 
+  @test_util.run_deprecated_v1
   def testMatchInt(self):
     self._test_match(self._int_cases())
 
+  @test_util.run_deprecated_v1
   def testMatchMultinomial(self):
     self._test_match(self._multinomial_cases())
 
+  @test_util.run_deprecated_v1
   def testDeterminismFloat(self):
     self._test_determinism(
         self._float_cases(shape_dtypes=(dtypes.int32, dtypes.int64)))
 
+  @test_util.run_deprecated_v1
   def testDeterminismInt(self):
     self._test_determinism(
         self._int_cases(shape_dtypes=(dtypes.int32, dtypes.int64)))
 
+  @test_util.run_deprecated_v1
   def testDeterminismMultinomial(self):
     self._test_determinism(self._multinomial_cases())
 
diff --git a/tensorflow/python/kernel_tests/reader_ops_test.py b/tensorflow/python/kernel_tests/reader_ops_test.py
index ac9be56d63fce302928b8de84ac9c1bf7ea6e55e..43d15817e97e37a372dee940ef2c6baa35d8be24 100644
--- a/tensorflow/python/kernel_tests/reader_ops_test.py
+++ b/tensorflow/python/kernel_tests/reader_ops_test.py
@@ -28,6 +28,7 @@ import zlib
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import io_ops
@@ -140,147 +141,147 @@ class TFCompressionTestCase(test.TestCase):
 
 class IdentityReaderTest(test.TestCase):
 
-  def _ExpectRead(self, sess, key, value, expected):
-    k, v = sess.run([key, value])
+  def _ExpectRead(self, key, value, expected):
+    k, v = self.evaluate([key, value])
     self.assertAllEqual(expected, k)
     self.assertAllEqual(expected, v)
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      work_completed = reader.num_work_units_completed()
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queued_length = queue.size()
-      key, value = reader.read(queue)
+    reader = io_ops.IdentityReader("test_reader")
+    work_completed = reader.num_work_units_completed()
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    queued_length = queue.size()
+    key, value = reader.read(queue)
 
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self.assertAllEqual(0, self.evaluate(work_completed))
+    self.assertAllEqual(0, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
-      queue.enqueue_many([["A", "B", "C"]]).run()
-      queue.close().run()
-      self.assertAllEqual(3, queued_length.eval())
+    self.evaluate(queue.enqueue_many([["A", "B", "C"]]))
+    self.evaluate(queue.close())
+    self.assertAllEqual(3, self.evaluate(queued_length))
 
-      self._ExpectRead(sess, key, value, b"A")
-      self.assertAllEqual(1, produced.eval())
+    self._ExpectRead(key, value, b"A")
+    self.assertAllEqual(1, self.evaluate(produced))
 
-      self._ExpectRead(sess, key, value, b"B")
+    self._ExpectRead(key, value, b"B")
 
-      self._ExpectRead(sess, key, value, b"C")
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self._ExpectRead(key, value, b"C")
+    self.assertAllEqual(3, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
 
-      self.assertAllEqual(3, work_completed.eval())
-      self.assertAllEqual(3, produced.eval())
-      self.assertAllEqual(0, queued_length.eval())
+    self.assertAllEqual(3, self.evaluate(work_completed))
+    self.assertAllEqual(3, self.evaluate(produced))
+    self.assertAllEqual(0, self.evaluate(queued_length))
 
+  @test_util.run_deprecated_v1
   def testMultipleEpochs(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      enqueue = queue.enqueue_many([["DD", "EE"]])
-      key, value = reader.read(queue)
-
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      enqueue.run()
-      self._ExpectRead(sess, key, value, b"DD")
-      self._ExpectRead(sess, key, value, b"EE")
-      queue.close().run()
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
-
+    reader = io_ops.IdentityReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    enqueue = queue.enqueue_many([["DD", "EE"]])
+    key, value = reader.read(queue)
+
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, b"DD")
+    self._ExpectRead(key, value, b"EE")
+    self.evaluate(queue.close())
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testSerializeRestore(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queue.enqueue_many([["X", "Y", "Z"]]).run()
-      key, value = reader.read(queue)
-
-      self._ExpectRead(sess, key, value, b"X")
-      self.assertAllEqual(1, produced.eval())
-      state = reader.serialize_state().eval()
-
-      self._ExpectRead(sess, key, value, b"Y")
-      self._ExpectRead(sess, key, value, b"Z")
-      self.assertAllEqual(3, produced.eval())
-
-      queue.enqueue_many([["Y", "Z"]]).run()
-      queue.close().run()
-      reader.restore_state(state).run()
-      self.assertAllEqual(1, produced.eval())
-      self._ExpectRead(sess, key, value, b"Y")
-      self._ExpectRead(sess, key, value, b"Z")
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
-      self.assertAllEqual(3, produced.eval())
-
-      self.assertEqual(bytes, type(state))
-
-      with self.assertRaises(ValueError):
-        reader.restore_state([])
-
-      with self.assertRaises(ValueError):
-        reader.restore_state([state, state])
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state[1:]).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state[:-1]).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(state + b"ExtraJunk").run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(b"PREFIX" + state).run()
-
-      with self.assertRaisesOpError(
-          "Could not parse state for IdentityReader 'test_reader'"):
-        reader.restore_state(b"BOGUS" + state[5:]).run()
-
+    reader = io_ops.IdentityReader("test_reader")
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    self.evaluate(queue.enqueue_many([["X", "Y", "Z"]]))
+    key, value = reader.read(queue)
+
+    self._ExpectRead(key, value, b"X")
+    self.assertAllEqual(1, self.evaluate(produced))
+    state = self.evaluate(reader.serialize_state())
+
+    self._ExpectRead(key, value, b"Y")
+    self._ExpectRead(key, value, b"Z")
+    self.assertAllEqual(3, self.evaluate(produced))
+
+    self.evaluate(queue.enqueue_many([["Y", "Z"]]))
+    self.evaluate(queue.close())
+    self.evaluate(reader.restore_state(state))
+    self.assertAllEqual(1, self.evaluate(produced))
+    self._ExpectRead(key, value, b"Y")
+    self._ExpectRead(key, value, b"Z")
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
+    self.assertAllEqual(3, self.evaluate(produced))
+
+    self.assertEqual(bytes, type(state))
+
+    with self.assertRaises(ValueError):
+      reader.restore_state([])
+
+    with self.assertRaises(ValueError):
+      reader.restore_state([state, state])
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state[1:]))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state[:-1]))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(state + b"ExtraJunk"))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(b"PREFIX" + state))
+
+    with self.assertRaisesOpError(
+        "Could not parse state for IdentityReader 'test_reader'"):
+      self.evaluate(reader.restore_state(b"BOGUS" + state[5:]))
+
+  @test_util.run_deprecated_v1
   def testReset(self):
-    with self.cached_session() as sess:
-      reader = io_ops.IdentityReader("test_reader")
-      work_completed = reader.num_work_units_completed()
-      produced = reader.num_records_produced()
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queued_length = queue.size()
-      key, value = reader.read(queue)
+    reader = io_ops.IdentityReader("test_reader")
+    work_completed = reader.num_work_units_completed()
+    produced = reader.num_records_produced()
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    queued_length = queue.size()
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([["X", "Y", "Z"]]).run()
-      self._ExpectRead(sess, key, value, b"X")
-      self.assertLess(0, queued_length.eval())
-      self.assertAllEqual(1, produced.eval())
+    self.evaluate(queue.enqueue_many([["X", "Y", "Z"]]))
+    self._ExpectRead(key, value, b"X")
+    self.assertLess(0, self.evaluate(queued_length))
+    self.assertAllEqual(1, self.evaluate(produced))
 
-      self._ExpectRead(sess, key, value, b"Y")
-      self.assertLess(0, work_completed.eval())
-      self.assertAllEqual(2, produced.eval())
+    self._ExpectRead(key, value, b"Y")
+    self.assertLess(0, self.evaluate(work_completed))
+    self.assertAllEqual(2, self.evaluate(produced))
 
-      reader.reset().run()
-      self.assertAllEqual(0, work_completed.eval())
-      self.assertAllEqual(0, produced.eval())
-      self.assertAllEqual(1, queued_length.eval())
-      self._ExpectRead(sess, key, value, b"Z")
+    self.evaluate(reader.reset())
+    self.assertAllEqual(0, self.evaluate(work_completed))
+    self.assertAllEqual(0, self.evaluate(produced))
+    self.assertAllEqual(1, self.evaluate(queued_length))
+    self._ExpectRead(key, value, b"Z")
 
-      queue.enqueue_many([["K", "L"]]).run()
-      self._ExpectRead(sess, key, value, b"K")
+    self.evaluate(queue.enqueue_many([["K", "L"]]))
+    self._ExpectRead(key, value, b"K")
 
 
 class WholeFileReaderTest(test.TestCase):
@@ -301,44 +302,44 @@ class WholeFileReaderTest(test.TestCase):
       os.remove(fn)
     super(WholeFileReaderTest, self).tearDown()
 
-  def _ExpectRead(self, sess, key, value, index):
-    k, v = sess.run([key, value])
+  def _ExpectRead(self, key, value, index):
+    k, v = self.evaluate([key, value])
     self.assertAllEqual(compat.as_bytes(self._filenames[index]), k)
     self.assertAllEqual(self._content[index], v)
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
-    with self.cached_session() as sess:
-      reader = io_ops.WholeFileReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      queue.enqueue_many([self._filenames]).run()
-      queue.close().run()
-      key, value = reader.read(queue)
+    reader = io_ops.WholeFileReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    self.evaluate(queue.enqueue_many([self._filenames]))
+    self.evaluate(queue.close())
+    key, value = reader.read(queue)
 
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      self._ExpectRead(sess, key, value, 2)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self._ExpectRead(key, value, 2)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testInfiniteEpochs(self):
-    with self.cached_session() as sess:
-      reader = io_ops.WholeFileReader("test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      enqueue = queue.enqueue_many([self._filenames])
-      key, value = reader.read(queue)
-
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 2)
-      self._ExpectRead(sess, key, value, 0)
-      self._ExpectRead(sess, key, value, 1)
-      enqueue.run()
-      self._ExpectRead(sess, key, value, 2)
-      self._ExpectRead(sess, key, value, 0)
+    reader = io_ops.WholeFileReader("test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    enqueue = queue.enqueue_many([self._filenames])
+    key, value = reader.read(queue)
+
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 2)
+    self._ExpectRead(key, value, 0)
+    self._ExpectRead(key, value, 1)
+    self.evaluate(enqueue)
+    self._ExpectRead(key, value, 2)
+    self._ExpectRead(key, value, 0)
 
 
 class TextLineReaderTest(test.TestCase):
@@ -366,47 +367,48 @@ class TextLineReaderTest(test.TestCase):
     return filenames
 
   def _testOneEpoch(self, files):
-    with self.cached_session() as sess:
-      reader = io_ops.TextLineReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TextLineReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_lines):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j + 1), compat.as_text(k))
-          self.assertAllEqual(self._LineText(i, j), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_lines):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j + 1), compat.as_text(k))
+        self.assertAllEqual(self._LineText(i, j), v)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testOneEpochLF(self):
     self._testOneEpoch(self._CreateFiles(crlf=False))
 
+  @test_util.run_deprecated_v1
   def testOneEpochCRLF(self):
     self._testOneEpoch(self._CreateFiles(crlf=True))
 
+  @test_util.run_deprecated_v1
   def testSkipHeaderLines(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TextLineReader(skip_header_lines=1, name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TextLineReader(skip_header_lines=1, name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_lines - 1):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j + 2), compat.as_text(k))
-          self.assertAllEqual(self._LineText(i, j + 1), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_lines - 1):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j + 2), compat.as_text(k))
+        self.assertAllEqual(self._LineText(i, j + 1), v)
 
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
 
 class FixedLengthRecordReaderTest(TFCompressionTestCase):
@@ -522,56 +524,55 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
   # gap_bytes=hop_bytes-record_bytes
   def _TestOneEpoch(self, files, num_records, gap_bytes, encoding=None):
     hop_bytes = 0 if gap_bytes == 0 else self._record_bytes + gap_bytes
-    with self.cached_session() as sess:
-      reader = io_ops.FixedLengthRecordReader(
-          header_bytes=self._header_bytes,
-          record_bytes=self._record_bytes,
-          footer_bytes=self._footer_bytes,
-          hop_bytes=hop_bytes,
-          encoding=encoding,
-          name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(num_records):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
-          self.assertAllEqual(self._Record(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.FixedLengthRecordReader(
+        header_bytes=self._header_bytes,
+        record_bytes=self._record_bytes,
+        footer_bytes=self._footer_bytes,
+        hop_bytes=hop_bytes,
+        encoding=encoding,
+        name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(num_records):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
+        self.assertAllEqual(self._Record(i, j), v)
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
   def _TestOneEpochWithHopBytes(self,
                                 files,
                                 num_overlapped_records,
                                 encoding=None):
-    with self.cached_session() as sess:
-      reader = io_ops.FixedLengthRecordReader(
-          header_bytes=self._header_bytes,
-          record_bytes=self._record_bytes,
-          footer_bytes=self._footer_bytes,
-          hop_bytes=self._hop_bytes,
-          encoding=encoding,
-          name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(num_overlapped_records):
-          k, v = sess.run([key, value])
-          self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
-          self.assertAllEqual(self._OverlappedRecord(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.FixedLengthRecordReader(
+        header_bytes=self._header_bytes,
+        record_bytes=self._record_bytes,
+        footer_bytes=self._footer_bytes,
+        hop_bytes=self._hop_bytes,
+        encoding=encoding,
+        name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(num_overlapped_records):
+        k, v = self.evaluate([key, value])
+        self.assertAllEqual("%s:%d" % (files[i], j), compat.as_text(k))
+        self.assertAllEqual(self._OverlappedRecord(i, j), v)
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -580,6 +581,7 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes)
 
+  @test_util.run_deprecated_v1
   def testGzipOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -588,6 +590,7 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateGzipFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes, encoding="GZIP")
 
+  @test_util.run_deprecated_v1
   def testZlibOneEpoch(self):
     for num_records in [0, 7]:
       # gap_bytes=0: hop_bytes=0
@@ -596,17 +599,20 @@ class FixedLengthRecordReaderTest(TFCompressionTestCase):
         files = self._CreateZlibFiles(num_records, gap_bytes)
         self._TestOneEpoch(files, num_records, gap_bytes, encoding="ZLIB")
 
+  @test_util.run_deprecated_v1
   def testOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateOverlappedRecordFiles(num_overlapped_records)
       self._TestOneEpochWithHopBytes(files, num_overlapped_records)
 
+  @test_util.run_deprecated_v1
   def testGzipOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateGzipOverlappedRecordFiles(num_overlapped_records,)
       self._TestOneEpochWithHopBytes(
           files, num_overlapped_records, encoding="GZIP")
 
+  @test_util.run_deprecated_v1
   def testZlibOneEpochWithHopBytes(self):
     for num_overlapped_records in [0, 2]:
       files = self._CreateZlibOverlappedRecordFiles(num_overlapped_records)
@@ -619,90 +625,91 @@ class TFRecordReaderTest(TFCompressionTestCase):
   def setUp(self):
     super(TFRecordReaderTest, self).setUp()
 
+  @test_util.run_deprecated_v1
   def testOneEpoch(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
+    reader = io_ops.TFRecordReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
 
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadUpTo(self):
     files = self._CreateFiles()
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      batch_size = 3
-      key, value = reader.read_up_to(queue, batch_size)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      num_k = 0
-      num_v = 0
-
-      while True:
-        try:
-          k, v = sess.run([key, value])
-          # Test reading *up to* batch_size records
-          self.assertLessEqual(len(k), batch_size)
-          self.assertLessEqual(len(v), batch_size)
-          num_k += len(k)
-          num_v += len(v)
-        except errors_impl.OutOfRangeError:
-          break
-
-      # Test that we have read everything
-      self.assertEqual(self._num_files * self._num_records, num_k)
-      self.assertEqual(self._num_files * self._num_records, num_v)
-
+    reader = io_ops.TFRecordReader(name="test_reader")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    batch_size = 3
+    key, value = reader.read_up_to(queue, batch_size)
+
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    num_k = 0
+    num_v = 0
+
+    while True:
+      try:
+        k, v = self.evaluate([key, value])
+        # Test reading *up to* batch_size records
+        self.assertLessEqual(len(k), batch_size)
+        self.assertLessEqual(len(v), batch_size)
+        num_k += len(k)
+        num_v += len(v)
+      except errors_impl.OutOfRangeError:
+        break
+
+    # Test that we have read everything
+    self.assertEqual(self._num_files * self._num_records, num_k)
+    self.assertEqual(self._num_files * self._num_records, num_v)
+
+  @test_util.run_deprecated_v1
   def testReadZlibFiles(self):
     options = tf_record.TFRecordOptions(TFRecordCompressionType.ZLIB)
     files = self._CreateFiles(options)
 
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader", options=options)
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
+    reader = io_ops.TFRecordReader(name="test_reader", options=options)
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
+
+  @test_util.run_deprecated_v1
   def testReadGzipFiles(self):
     options = tf_record.TFRecordOptions(TFRecordCompressionType.GZIP)
     files = self._CreateFiles(options)
 
-    with self.cached_session() as sess:
-      reader = io_ops.TFRecordReader(name="test_reader", options=options)
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
+    reader = io_ops.TFRecordReader(name="test_reader", options=options)
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
 
-      queue.enqueue_many([files]).run()
-      queue.close().run()
-      for i in range(self._num_files):
-        for j in range(self._num_records):
-          k, v = sess.run([key, value])
-          self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
-          self.assertAllEqual(self._Record(i, j), v)
+    self.evaluate(queue.enqueue_many([files]))
+    self.evaluate(queue.close())
+    for i in range(self._num_files):
+      for j in range(self._num_records):
+        k, v = self.evaluate([key, value])
+        self.assertTrue(compat.as_text(k).startswith("%s:" % files[i]))
+        self.assertAllEqual(self._Record(i, j), v)
 
 
 class AsyncReaderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoDeadlockFromQueue(self):
     """Tests that reading does not block main execution threads."""
     config = config_pb2.ConfigProto(
@@ -724,7 +731,7 @@ class AsyncReaderTest(test.TestCase):
         thread_data.append(thread_data_t(t, queue, output))
 
       # Start all readers. They are all blocked waiting for queue entries.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for d in thread_data:
         d.thread.start()
 
@@ -733,7 +740,7 @@ class AsyncReaderTest(test.TestCase):
         fname = os.path.join(self.get_temp_dir(), "deadlock.%s.txt" % i)
         with open(fname, "wb") as f:
           f.write(("file-%s" % i).encode())
-        d.queue.enqueue_many([[fname]]).run()
+        self.evaluate(d.queue.enqueue_many([[fname]]))
         d.thread.join()
         self.assertEqual([[("file-%s" % i).encode()]], d.output)
 
@@ -751,24 +758,25 @@ class LMDBReaderTest(test.TestCase):
     self.db_path = os.path.join(self.get_temp_dir(), "data.mdb")
     shutil.copy(path, self.db_path)
 
+  @test_util.run_deprecated_v1
   def testReadFromFile(self):
-    with self.cached_session() as sess:
-      reader = io_ops.LMDBReader(name="test_read_from_file")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue([self.db_path]).run()
-      queue.close().run()
-      for i in range(10):
-        k, v = sess.run([key, value])
-        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-        self.assertAllEqual(
-            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
-
+    reader = io_ops.LMDBReader(name="test_read_from_file")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue([self.db_path]))
+    self.evaluate(queue.close())
+    for i in range(10):
+      k, v = self.evaluate([key, value])
+      self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+      self.assertAllEqual(
+          compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadFromSameFile(self):
     with self.cached_session() as sess:
       reader1 = io_ops.LMDBReader(name="test_read_from_same_file1")
@@ -782,30 +790,31 @@ class LMDBReaderTest(test.TestCase):
       threads = queue_runner_impl.start_queue_runners(sess, coord=coord)
       for _ in range(3):
         for _ in range(10):
-          k1, v1, k2, v2 = sess.run([key1, value1, key2, value2])
+          k1, v1, k2, v2 = self.evaluate([key1, value1, key2, value2])
           self.assertAllEqual(compat.as_bytes(k1), compat.as_bytes(k2))
           self.assertAllEqual(compat.as_bytes(v1), compat.as_bytes(v2))
       coord.request_stop()
       coord.join(threads)
 
+  @test_util.run_deprecated_v1
   def testReadFromFolder(self):
-    with self.cached_session() as sess:
-      reader = io_ops.LMDBReader(name="test_read_from_folder")
-      queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
-      key, value = reader.read(queue)
-
-      queue.enqueue([self.db_path]).run()
-      queue.close().run()
-      for i in range(10):
-        k, v = sess.run([key, value])
-        self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
-        self.assertAllEqual(
-            compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
-
-      with self.assertRaisesOpError("is closed and has insufficient elements "
-                                    "\\(requested 1, current size 0\\)"):
-        k, v = sess.run([key, value])
-
+    reader = io_ops.LMDBReader(name="test_read_from_folder")
+    queue = data_flow_ops.FIFOQueue(99, [dtypes.string], shapes=())
+    key, value = reader.read(queue)
+
+    self.evaluate(queue.enqueue([self.db_path]))
+    self.evaluate(queue.close())
+    for i in range(10):
+      k, v = self.evaluate([key, value])
+      self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(i)))
+      self.assertAllEqual(
+          compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + i))))
+
+    with self.assertRaisesOpError("is closed and has insufficient elements "
+                                  "\\(requested 1, current size 0\\)"):
+      k, v = self.evaluate([key, value])
+
+  @test_util.run_deprecated_v1
   def testReadFromFileRepeatedly(self):
     with self.cached_session() as sess:
       reader = io_ops.LMDBReader(name="test_read_from_file_repeated")
@@ -819,7 +828,7 @@ class LMDBReaderTest(test.TestCase):
       for _ in range(3):
         # Go over all 10 records each time.
         for j in range(10):
-          k, v = sess.run([key, value])
+          k, v = self.evaluate([key, value])
           self.assertAllEqual(compat.as_bytes(k), compat.as_bytes(str(j)))
           self.assertAllEqual(
               compat.as_bytes(v), compat.as_bytes(str(chr(ord("a") + j))))
diff --git a/tensorflow/python/kernel_tests/record_input_test.py b/tensorflow/python/kernel_tests/record_input_test.py
index ebb9872f226f35c4642f99c8aa161845657e4a73..ad8188b372fc5e4ac627098cbbfd8fac73359272 100644
--- a/tensorflow/python/kernel_tests/record_input_test.py
+++ b/tensorflow/python/kernel_tests/record_input_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 import os
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.errors_impl import NotFoundError
 from tensorflow.python.lib.io import tf_record
 from tensorflow.python.ops import data_flow_ops
@@ -54,7 +55,7 @@ class RecordInputOpTest(test.TestCase):
           batch_size=1,
           name="record_input").get_yield_op()
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleGzip(self):
     with self.cached_session() as sess:
@@ -73,7 +74,7 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.GZIP).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
   def testRecordInputSimpleZlib(self):
     with self.cached_session() as sess:
@@ -92,8 +93,9 @@ class RecordInputOpTest(test.TestCase):
           compression_type=tf_record.TFRecordCompressionType.ZLIB).get_yield_op(
           )
 
-      self.assertEqual(sess.run(yield_op), b"0000000000")
+      self.assertEqual(self.evaluate(yield_op), b"0000000000")
 
+  @test_util.run_deprecated_v1
   def testRecordInputEpochs(self):
     files = 100
     records_per_file = 100
@@ -117,7 +119,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
@@ -138,16 +140,18 @@ class RecordInputOpTest(test.TestCase):
 
         yield_op = records.get_yield_op()
         for _ in range(50):
-          sess.run(yield_op)
+          self.evaluate(yield_op)
 
+  @test_util.run_deprecated_v1
   def testEmptyGlob(self):
     with self.cached_session() as sess:
       record_input = data_flow_ops.RecordInput(file_pattern="foo")
       yield_op = record_input.get_yield_op()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       with self.assertRaises(NotFoundError):
-        sess.run(yield_op)
+        self.evaluate(yield_op)
 
+  @test_util.run_deprecated_v1
   def testBufferTooSmall(self):
     files = 10
     records_per_file = 10
@@ -171,7 +175,7 @@ class RecordInputOpTest(test.TestCase):
       for _ in range(3):
         epoch_set = set()
         for _ in range(int(files * records_per_file / batches)):
-          op_list = sess.run(yield_op)
+          op_list = self.evaluate(yield_op)
           self.assertTrue(len(op_list) is batches)
           for r in op_list:
             self.assertTrue(r[0] not in epoch_set)
diff --git a/tensorflow/python/kernel_tests/reduce_benchmark_test.py b/tensorflow/python/kernel_tests/reduce_benchmark_test.py
index 3a2fb81157d923f39daf77ab23f0f20162e592e7..ef9c4c350fdeafd3ea872fc648f13e1fb246a513 100644
--- a/tensorflow/python/kernel_tests/reduce_benchmark_test.py
+++ b/tensorflow/python/kernel_tests/reduce_benchmark_test.py
@@ -81,7 +81,7 @@ class ReduceBenchmarks(test.Benchmark):
       grad, = gradients_impl.gradients(reduction, tensor)
 
       def fn():
-        sess.run(grad.op)
+        self.evaluate(grad.op)
 
       self._run(fn, 10000)
 
@@ -98,7 +98,7 @@ class ReduceBenchmarks(test.Benchmark):
         grad, = gradients_impl.gradients(reduction, tensor)
 
       def fn():
-        sess.run(grad.op)
+        self.evaluate(grad.op)
 
       self._run(fn, 10000)
 
diff --git a/tensorflow/python/kernel_tests/reduce_join_op_test.py b/tensorflow/python/kernel_tests/reduce_join_op_test.py
index 3bb4986313db74ba439991566ab2947722ab890d..49b6620779e13caa635af493914f13a65a6e0257 100644
--- a/tensorflow/python/kernel_tests/reduce_join_op_test.py
+++ b/tensorflow/python/kernel_tests/reduce_join_op_test.py
@@ -25,6 +25,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -119,7 +120,7 @@ class ReduceJoinTest(UnicodeTestCase):
           axis=axis,
           keep_dims=keep_dims,
           separator=separator)
-      output_array = output.eval()
+      output_array = self.evaluate(output)
 
     self.assertAllEqualUnicode(truth, output_array)
     self.assertAllEqual(truth_shape, output.get_shape())
@@ -149,10 +150,10 @@ class ReduceJoinTest(UnicodeTestCase):
       if not axis:
         truth = constant_op.constant(truth)
       truth_squeezed = array_ops.squeeze(truth, axis=axis)
-      output_array = output.eval()
-      output_keep_dims_array = output_keep_dims.eval()
-      truth_array = truth.eval()
-      truth_squeezed_array = truth_squeezed.eval()
+      output_array = self.evaluate(output)
+      output_keep_dims_array = self.evaluate(output_keep_dims)
+      truth_array = self.evaluate(truth)
+      truth_squeezed_array = self.evaluate(truth_squeezed)
     self.assertAllEqualUnicode(truth_array, output_keep_dims_array)
     self.assertAllEqualUnicode(truth_squeezed_array, output_array)
     self.assertAllEqual(truth.get_shape(), output_keep_dims.get_shape())
@@ -230,6 +231,7 @@ class ReduceJoinTest(UnicodeTestCase):
         axis=1,
         separator="  ")
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     input_array = [["a"], ["b"]]
     truth = ["ab"]
@@ -241,6 +243,7 @@ class ReduceJoinTest(UnicodeTestCase):
       self.assertAllEqualUnicode(truth, output_array)
       self.assertAllEqual(truth_shape, reduced.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownIndices(self):
     input_array = [["this", "is", "a", "test"],
                    ["please", "do", "not", "panic"]]
@@ -297,6 +300,7 @@ class ReduceJoinTest(UnicodeTestCase):
       for permutation in itertools.permutations(xrange(num_dims), i):
         self._testMultipleReduceJoin(input_array, axis=permutation)
 
+  @test_util.run_deprecated_v1
   def testInvalidReductionIndices(self):
     with self.cached_session():
       with self.assertRaisesRegexp(ValueError, "Invalid reduction dim"):
@@ -318,13 +322,14 @@ class ReduceJoinTest(UnicodeTestCase):
 
       # Reduction that drops the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=0)
-      self.assertAllEqualUnicode([""], output.eval())
+      self.assertAllEqualUnicode([""], self.evaluate(output))
 
       # Reduction that keeps the dim of size 0.
       output = string_ops.reduce_join(inputs=inputs, axis=1)
-      output_shape = output.eval().shape
+      output_shape = self.evaluate(output).shape
       self.assertAllEqual([0], output_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidArgsUnknownShape(self):
     with self.cached_session():
       placeholder = array_ops.placeholder(dtypes.string, name="placeholder")
@@ -335,6 +340,7 @@ class ReduceJoinTest(UnicodeTestCase):
       with self.assertRaisesOpError("Duplicate reduction dimension 1"):
         duplicate_index.eval(feed_dict={placeholder.name: [[""]]})
 
+  @test_util.run_deprecated_v1
   def testInvalidArgsUnknownIndices(self):
     with self.cached_session():
       placeholder = array_ops.placeholder(dtypes.int32, name="placeholder")
diff --git a/tensorflow/python/kernel_tests/reduction_ops_test.py b/tensorflow/python/kernel_tests/reduction_ops_test.py
index 2ac3996e25b1754e829c2744357396c8ddccc07a..67a89461f3a885056f47c62af40bf6cfccd60583 100644
--- a/tensorflow/python/kernel_tests/reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/reduction_ops_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -60,6 +61,7 @@ class ReducedShapeTest(test.TestCase):
     output = math_ops.reduced_shape(shape, axes=axes)
     self.assertAllEqual(output.eval(), result)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.cached_session():
       self._check([3], [], [3])
@@ -69,6 +71,7 @@ class ReducedShapeTest(test.TestCase):
       self._check([5, 3], [1], [5, 1])
       self._check([5, 3], [0, 1], [1, 1])
 
+  @test_util.run_deprecated_v1
   def testZeros(self):
     """Check that reduced_shape does the right thing with zero dimensions."""
     with self.cached_session():
@@ -83,6 +86,7 @@ class ReducedShapeTest(test.TestCase):
       self._check([3, 0], [1], [3, 1])
       self._check([3, 0], [0, 1], [1, 1])
 
+  @test_util.run_deprecated_v1
   def testNegAxes(self):
     with self.cached_session():
       self._check([10, 10, 10], [-1], [10, 10, 1])
@@ -94,6 +98,7 @@ class ReducedShapeTest(test.TestCase):
 
 class ReductionUnknownShape(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session():
       for dtype, reductions in [(dtypes.float32,
@@ -185,9 +190,10 @@ class SumReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_sum([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -195,11 +201,13 @@ class SumReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat16(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float16)
@@ -216,9 +224,10 @@ class SumReductionTest(BaseReductionTest):
       tf_arr = variables.Variable(arr)
       variables.global_variables_initializer().run()
       tf_mean = math_ops.reduce_mean(tf_arr, 0, False)
-      tf_out_mean = sess.run(tf_mean)
+      tf_out_mean = self.evaluate(tf_mean)
     self.assertAllClose(tf_out_mean, 1.)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
@@ -238,7 +247,7 @@ class SumReductionTest(BaseReductionTest):
       with self.session(graph=ops.Graph(), use_gpu=True) as sess:
         tf_row_sum = self._tf_reduce(arr, 1, False)
         tf_col_sum = self._tf_reduce(arr, 0, False)
-        tf_out_row, tf_out_col = sess.run([tf_row_sum, tf_col_sum])
+        tf_out_row, tf_out_col = self.evaluate([tf_row_sum, tf_col_sum])
       self.assertAllClose(col_sum, tf_out_col)
       self.assertAllClose(row_sum, tf_out_row)
 
@@ -252,25 +261,29 @@ class SumReductionTest(BaseReductionTest):
           with self.session(graph=ops.Graph(), use_gpu=True) as sess:
             tf_sum_xz = self._tf_reduce(arr, [0, 2], False)
             tf_sum_y = self._tf_reduce(arr, 1, False)
-            tf_out_sum_xz, tf_out_sum_y = sess.run([tf_sum_xz, tf_sum_y])
+            tf_out_sum_xz, tf_out_sum_y = self.evaluate([tf_sum_xz, tf_sum_y])
           self.assertAllClose(sum_y, tf_out_sum_y)
           self.assertAllClose(sum_xz, tf_out_sum_xz)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     np_arr = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(np_arr)
@@ -284,6 +297,7 @@ class SumReductionTest(BaseReductionTest):
         ValueError, lambda e: "Invalid reduction dimension" in str(e)):
       math_ops.reduce_sum(input_tensor, [0, 2])
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     np.random.seed(1618)
 
@@ -317,6 +331,7 @@ class SumReductionTest(BaseReductionTest):
         c_unknown_indices, unknown_indices, keepdims=True)
     self.assertEqual(2, s_unknown_indices_keep.get_shape().rank)
 
+  @test_util.run_deprecated_v1
   def testWrongShapeForReductionIndices(self):
     reduction_axes = [[1], [2]]
     c_unknown = array_ops.placeholder(dtypes.float32)
@@ -326,6 +341,7 @@ class SumReductionTest(BaseReductionTest):
 
   # Int64??
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for dtype in [
         dtypes.float32, dtypes.float64, dtypes.complex64, dtypes.complex128
@@ -333,6 +349,7 @@ class SumReductionTest(BaseReductionTest):
       x = self._makeIncremental([2, 3, 4, 2], dtype)
       self._compareGradientAxes(x)
 
+  @test_util.run_deprecated_v1
   def testHighRank(self):
     # Do a bunch of random high dimensional reductions
     np.random.seed(42)
@@ -350,11 +367,13 @@ class SumReductionTest(BaseReductionTest):
                    np.arange(1, rank, 2)):
         self._compareAll(data, axes)
 
+  @test_util.run_deprecated_v1
   def testExpand(self):
     # Reduce an empty tensor to a nonempty tensor
     x = np.zeros((5, 0))
     self._compareAll(x, [1])
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
@@ -362,6 +381,7 @@ class SumReductionTest(BaseReductionTest):
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
     with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
@@ -400,9 +420,10 @@ class MeanReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_mean([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -410,37 +431,44 @@ class MeanReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     for dtype in [dtypes.float32, dtypes.float64]:
       x = self._makeIncremental(s, dtype)
       self._compareGradientAxes(x, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
@@ -448,6 +476,7 @@ class MeanReductionTest(BaseReductionTest):
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
     with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
@@ -473,9 +502,10 @@ class ProdReductionTest(BaseReductionTest):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_prod([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -483,6 +513,7 @@ class ProdReductionTest(BaseReductionTest):
           np_arr = np.array([special_value_x, special_value_y]).astype(dtype)
           self._compareAll(np_arr, None)
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     # Numpy automatically upgrades the type of np.prod from int32 to int64, so
     # Numpy does not overflow an int32 np.prod while TensorFlow does. To avoid
@@ -491,26 +522,31 @@ class ProdReductionTest(BaseReductionTest):
       np_arr = self._makeIncremental((2,) * rank, dtypes.int32) / 2
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat32(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float32)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testFloat64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.float64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex64(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex64)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testComplex128(self):
     for rank in range(1, _MAX_RANK + 1):
       np_arr = self._makeIncremental((2,) * rank, dtypes.complex128)
       self._compareAllAxes(np_arr)
 
+  @test_util.run_deprecated_v1
   def testGradientWithZeros(self):
     s = [2, 3, 4, 2]
     x = self._makeIncremental(s, dtypes.float32) / 20.
@@ -533,6 +569,7 @@ class ProdReductionTest(BaseReductionTest):
     x4[:, :, :, :] = 0
     self._compareGradientAxes(x4, rtol=1e-3, atol=1e-3)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.session(use_gpu=True):
       x = array_ops.zeros([0, 3])
@@ -540,6 +577,7 @@ class ProdReductionTest(BaseReductionTest):
       error = gradient_checker.compute_gradient_error(x, [0, 3], y, [0])
       self.assertEqual(error, 0)
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
     with self.session(use_gpu=True):
       for dtype in (dtypes.float16, dtypes.float32, dtypes.float64):
@@ -562,7 +600,7 @@ class MinReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_min(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -576,9 +614,10 @@ class MinReductionTest(test.TestCase):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_min([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -614,6 +653,7 @@ class MinReductionTest(test.TestCase):
     self._compareAll(np_arr, [0, 2])
     self._compareAll(np_arr, [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -624,6 +664,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient2(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -634,6 +675,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 4, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient3(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -644,6 +686,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [2, 3, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient4(self):
     s = [2, 3, 4, 2]
     x = np.arange(1.0, 49.0).reshape(s).astype(np.float64)
@@ -654,6 +697,7 @@ class MinReductionTest(test.TestCase):
           t, s, su, [1], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.cached_session():
       x = array_ops.zeros([0, 3])
@@ -675,7 +719,7 @@ class MaxReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_max(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllClose(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -689,9 +733,10 @@ class MaxReductionTest(test.TestCase):
     for dtype in [dtypes.int64, dtypes.int32]:
       with self.cached_session(use_gpu=True) as sess:
         v = math_ops.reduce_max([0, 0], constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, 0)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     for dtype in [np.float32, np.float64]:
       for special_value_x in [-np.inf, np.inf]:
@@ -741,6 +786,7 @@ class MaxReductionTest(test.TestCase):
     self._compareAll(np_arr, [0, 2])
     self._compareAll(np_arr, [0, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -751,6 +797,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient2(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -761,6 +808,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 4, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient3(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -771,6 +819,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [2, 3, 2], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient4(self):
     s = [2, 3, 4, 2]
     x = np.arange(-49.0, -1.0).reshape(s).astype(np.float64)
@@ -781,6 +830,7 @@ class MaxReductionTest(test.TestCase):
           t, s, su, [1], x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testEmptyGradients(self):
     with self.cached_session():
       x = array_ops.zeros([0, 3])
@@ -802,7 +852,7 @@ class AllReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_all(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -817,7 +867,7 @@ class AllReductionTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_all([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -851,7 +901,7 @@ class AnyReductionTest(test.TestCase):
       if reduction_axes is not None:
         reduction_axes = np.array(reduction_axes).astype(np.int32)
       tf_ans = math_ops.reduce_any(x, reduction_axes, keepdims)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, out)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -866,7 +916,7 @@ class AnyReductionTest(test.TestCase):
       with self.session(use_gpu=True) as sess:
         v = math_ops.reduce_any([True, True],
                                 constant_op.constant(0, dtype=dtype))
-        tf_v = sess.run(v)
+        tf_v = self.evaluate(v)
       self.assertAllEqual(tf_v, True)
 
   def testAll3D(self):
@@ -913,6 +963,7 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, reduction_axes, True, use_gpu=True, feed_dict=feed_dict)
     self._compare(x, reduction_axes, True, use_gpu=False, feed_dict=feed_dict)
 
+  @test_util.run_deprecated_v1
   def testBoolReduce1D(self):
     # Create a 1D array of floats
     np_arr = np.asarray([False, False, True, False, False, True])
@@ -920,11 +971,13 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compareAll(np_arr, [])
     self._compareAll(np_arr, [0])
 
+  @test_util.run_deprecated_v1
   def testFloatReduce1D(self):
     # Create a 1D array of floats
     np_arr = np.asarray([0.0, 1.0, -1.0, 0.0, 0.0, 3.0]).astype(np.float32)
     self._compareAll(np_arr, [0])
 
+  @test_util.run_deprecated_v1
   def testFloatReduce4D(self):
     # Create a 4D array of floats and reduce across some
     # dimensions
@@ -944,11 +997,13 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compareAll(np_arr, [1, 2, 3])
     self._compareAll(np_arr, [0, 1, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testExpand(self):
     # Reduce an empty tensor to a nonempty tensor
     x = np.zeros((5, 0))
     self._compareAll(x, [1])
 
+  @test_util.run_deprecated_v1
   def testDegenerate(self):
     for use_gpu in False, True:
       with self.cached_session(use_gpu=use_gpu):
@@ -962,8 +1017,9 @@ class CountNonzeroReductionTest(test.TestCase):
     # Test case for GitHub issue 18712
     with self.cached_session() as sess:
       v = math_ops.count_nonzero(constant_op.constant(["test"]))
-      self.assertAllClose(sess.run(v), 1)
+      self.assertAllClose(self.evaluate(v), 1)
 
+  @test_util.run_deprecated_v1
   def testStringReduce1D(self):
     # Create a 1D array of strings
     x = np.asarray(["", "", "a", "", "", "b"])
@@ -974,6 +1030,7 @@ class CountNonzeroReductionTest(test.TestCase):
     self._compare(x, [], keepdims=True, zero=np.str(""))
     self._compare(x, [0], keepdims=True, zero=np.str(""))
 
+  @test_util.run_deprecated_v1
   def testStringReduce2D(self):
     # Create a 2D array of strings
     x = np.asarray([["", "", "a", "", "", "b"],
diff --git a/tensorflow/python/kernel_tests/regex_full_match_op_test.py b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
index 98746e7d9b19e5ba52a73b7ca3d9967cc813c133..488ec85ab2cae79d23c0434b075edaaee6869da6 100644
--- a/tensorflow/python/kernel_tests/regex_full_match_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_full_match_op_test.py
@@ -23,6 +23,7 @@ from absl.testing import parameterized
 from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ from tensorflow.python.platform import test
     (gen_string_ops.static_regex_full_match))
 class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatch(self, op):
     values = ["abaaba", "abcdabcde"]
     with self.cached_session():
@@ -40,6 +42,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "a.*a").eval()
       self.assertAllEqual([True, False], matched)
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatchTwoDims(self, op):
     values = [["abaaba", "abcdabcde"], ["acdcba", "ebcda"]]
     with self.cached_session():
@@ -47,6 +50,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "a.*a").eval()
       self.assertAllEqual([[True, False], [True, False]], matched)
 
+  @test_util.run_deprecated_v1
   def testEmptyMatch(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -54,6 +58,7 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       matched = op(input_tensor, "").eval()
       self.assertAllEqual([False, False], matched)
 
+  @test_util.run_deprecated_v1
   def testInvalidPattern(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -61,11 +66,12 @@ class RegexFullMatchOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       matched = op(input_tensor, invalid_pattern)
       with self.assertRaisesOpError("Invalid pattern"):
-        matched.eval()
+        self.evaluate(matched)
 
 
 class RegexFullMatchOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRegexFullMatchDelegation(self):
     with compat.forward_compatibility_horizon(2018, 11, 1):
       with self.cached_session():
@@ -78,6 +84,7 @@ class RegexFullMatchOpTest(test.TestCase):
         op_tensor = string_ops.regex_full_match(input_tensor, pattern_tensor)
         self.assertTrue(op_tensor.name.startswith("RegexFullMatch"), op.name)
 
+  @test_util.run_deprecated_v1
   def testStaticRegexFullMatchDelegation(self):
     with compat.forward_compatibility_horizon(2018, 11, 20):
       with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/regex_replace_op_test.py b/tensorflow/python/kernel_tests/regex_replace_op_test.py
index d9b7ed28d21652e964977c1938cd5d2cefb17825..6c7dfee7b401ee317d77367538a5fb41bc62d540 100644
--- a/tensorflow/python/kernel_tests/regex_replace_op_test.py
+++ b/tensorflow/python/kernel_tests/regex_replace_op_test.py
@@ -22,6 +22,7 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
     (gen_string_ops.static_regex_replace))
 class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testForwarding(self, op):
     with self.cached_session():
       # Generate an input that is uniquely consumed by the regex op.
@@ -45,6 +47,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(inp, "\\p{Ll}", ".").eval()
       self.assertAllEqual([b"A.C.E", b"H.J.L"], stripped)
 
+  @test_util.run_deprecated_v1
   def testRemovePrefix(self, op):
     values = ["a:foo", "a:bar", "a:foo", "b:baz", "b:qux", "ca:b"]
     with self.cached_session():
@@ -53,6 +56,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       self.assertAllEqual([b"foo", b"bar", b"foo", b"baz", b"qux", b"ca:b"],
                           stripped)
 
+  @test_util.run_deprecated_v1
   def testRegexReplace(self, op):
     values = ["aba\naba", "abcdabcde"]
     with self.cached_session():
@@ -60,6 +64,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(input_vector, "a.*a", "(\\0)").eval()
       self.assertAllEqual([b"(aba)\n(aba)", b"(abcda)bcde"], stripped)
 
+  @test_util.run_deprecated_v1
   def testEmptyMatch(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -67,6 +72,7 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       stripped = op(input_vector, "", "x").eval()
       self.assertAllEqual([b"xaxbxcx", b"x1x"], stripped)
 
+  @test_util.run_deprecated_v1
   def testInvalidPattern(self, op):
     values = ["abc", "1"]
     with self.cached_session():
@@ -74,8 +80,9 @@ class RegexReplaceOpVariantsTest(test.TestCase, parameterized.TestCase):
       invalid_pattern = "A["
       replace = op(input_vector, invalid_pattern, "x")
       with self.assertRaisesOpError("Invalid pattern"):
-        replace.eval()
+        self.evaluate(replace)
 
+  @test_util.run_deprecated_v1
   def testGlobal(self, op):
     values = ["ababababab", "abcabcabc", ""]
     with self.cached_session():
@@ -98,6 +105,7 @@ class RegexReplaceTest(test.TestCase, parameterized.TestCase):
       (as_string, as_tensor),
       (as_tensor, as_string),
       (as_tensor, as_tensor))
+  @test_util.run_deprecated_v1
   def testRegexReplaceDelegation(self, pattern_fn, rewrite_fn):
     with self.cached_session():
       input_vector = constant_op.constant("foo", dtypes.string)
@@ -106,6 +114,7 @@ class RegexReplaceTest(test.TestCase, parameterized.TestCase):
       op = string_ops.regex_replace(input_vector, pattern, replace)
       self.assertTrue(op.name.startswith("RegexReplace"))
 
+  @test_util.run_deprecated_v1
   def testStaticRegexReplaceDelegation(self):
     with self.cached_session():
       input_vector = constant_op.constant("foo", dtypes.string)
diff --git a/tensorflow/python/kernel_tests/relu_op_test.py b/tensorflow/python/kernel_tests/relu_op_test.py
index b0f2796ede176b9d3ea7e69fa5da6394d74c258e..d4ba1ad77d5547ccb9fe4e2154d145751cf63514 100644
--- a/tensorflow/python/kernel_tests/relu_op_test.py
+++ b/tensorflow/python/kernel_tests/relu_op_test.py
@@ -21,13 +21,15 @@ from __future__ import print_function
 import numpy as np
 from six.moves import xrange  # pylint: disable=redefined-builtin
 
+from tensorflow.python import tf2
 from tensorflow.python.compat import compat
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
-from tensorflow.python.ops import gradients_impl
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
@@ -55,162 +57,163 @@ class ReluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testRelu(self, np_features, use_gpu=False):
+  def _testRelu(self, np_features):
     np_relu = self._npRelu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      relu = nn_ops.relu(np_features)
-      tf_relu = relu.eval()
+    tf_relu = nn_ops.relu(np_features)
     self.assertAllClose(np_relu, tf_relu)
-    self.assertShapeEqual(np_relu, relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
-  def _testReluInt8x4(self, np_inputs):
-    if not test.is_gpu_available(cuda_only=True):
-      return
-    np_relu = self._npRelu(np_inputs)
-    with self.cached_session(use_gpu=True):
-      relu = nn_ops.relu(constant_op.constant(np_inputs, dtypes.qint8))
-      if np_inputs.size % 4 == 0:
-        tf_relu = relu.eval()
-        self.assertAllClose(np_relu, tf_relu)
-        self.assertShapeEqual(np_relu, relu)
-      else:
-        with self.assertRaisesRegexp(
-            errors.InvalidArgumentError,
-            "Tensor size must be a multiple of 4 for Relu<qint8>. Got %d" %
-            np_inputs.size):
-          tf_relu = relu.eval()
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testReluInt8x4GoodShape(self):
-    self._testReluInt8x4(np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]]))
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = np.array([[-50, 7, 23, 0], [-1, -5, 6, 11]])
+    np_relu = self._npRelu(inputs)
+    tf_relu = nn_ops.relu(constant_op.constant(inputs, dtypes.qint8))
+    self.assertAllClose(np_relu, tf_relu)
+    self.assertShapeEqual(np_relu, tf_relu)
 
   def testReluInt8x4BadShape(self):
-    np_inputs = np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]])
-    self.assertEqual(np_inputs.size, 9)
-    self._testReluInt8x4(np_inputs)
-    np_inputs = np.array(
-        [1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1])
-    self.assertEqual(np_inputs.size, 17)
-    self._testReluInt8x4(np_inputs)
+    if not test.is_gpu_available(cuda_only=True):
+      self.skipTest("No GPU available")
+    inputs = constant_op.constant(
+        np.array([[-50, 7, 23], [0, 1, -5], [6, -2, 11]]), dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 9"):
+      self.evaluate(nn_ops.relu(inputs))
+
+    inputs = constant_op.constant(
+        np.array([1, -2, 3, -4, 5, -6, 7, -8, 9, -8, 7, -6, 5, -4, 3, -2, 1]),
+        dtypes.qint8)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "Tensor size must be a multiple of 4 for Relu<qint8>. Got 17"):
+      self.evaluate(nn_ops.relu(inputs))
 
   # The gradient test for ReLU is a bit tricky as the derivative is not well
   # defined at around zero and we want to avoid that in terms of input values.
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   # The gradient for fp16 is inaccurate due to the low-precision.
-  # Instead of relying on compute_gradient_error, we compare the fp16 analytical
-  # gradient against their fp32 counterpart.
+  # We compare the fp16 analytical gradient against their fp32 counterpart.
   def testGradientFloat16(self):
-    with self.session(use_gpu=True) as sess:
-      # Randomly construct a 1D shape from [1, 40)
-      shape = random_ops.random_uniform(
-          [1], minval=1, maxval=40, dtype=dtypes.int32)
-
-      # Construct the fp32 graph and its gradient.
-      x = random_ops.random_uniform(shape, minval=-1, maxval=1, name="x")
-      y1 = nn_ops.relu(x, name="relu_fp32")
-      l1 = nn_ops.l2_loss(y1)
-      dx_f32 = gradients_impl.gradients(l1, x)
-
-      # Construct the fp16 graph and its gradient.
-      # It starts with the same x, in fp32. But before it reaches Relu, it is
-      # cast into fp16. So during backprop, the gradient computation is in fp16.
-      x2 = math_ops.cast(x, dtype=dtypes.float16, name="cast")
-      y2 = nn_ops.relu(x2, name="relu_fp16")
-      l2 = nn_ops.l2_loss(y2)
-      dx_f16 = gradients_impl.gradients(l2, x)
-
-      # Repeat the experiment for 100 times. All tensor shapes and its tensor
-      # values are randomly generated for each run.
-      for _ in xrange(100):
-        dx_f32_v, dx_f16_v = sess.run([dx_f32, dx_f16])
-        self.assertAllClose(dx_f32_v, dx_f16_v, atol=3e-4)
+
+    def grad(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = nn_ops.l2_loss(nn_ops.relu(x))
+      return tape.gradient(y, x)
+
+    def f():
+      with test_util.use_gpu():
+        # Randomly construct a 1D shape from [1, 40)
+        shape = random_ops.random_uniform([1],
+                                          minval=1,
+                                          maxval=40,
+                                          dtype=dtypes.int32)
+        x32 = random_ops.random_uniform(shape, minval=-1, maxval=1)
+        x16 = math_ops.cast(x32, dtype=dtypes.float16)
+        return grad(x32), grad(x16)
+
+    # We're going to ensure that the fp16 and fp32 gradients
+    # are "close" to each other for ~100 random values.
+    #
+    # In TensorFlow 1.x, invoking f() (without eager execution enabled)
+    # would construct a graph. Instead of construct a graph with O(100) nodes,
+    # we construct a single graph to be executed ~100 times in a Session.
+    if not tf2.enabled():
+      d32_tensor, d16_tensor = f()
+      with self.cached_session() as sess:
+        f = lambda: sess.run([d32_tensor, d16_tensor])
+
+    # Repeat the experiment for 100 times. All tensor shapes and its tensor
+    # values are randomly generated for each run.
+    for _ in xrange(100):
+      d32, d16 = f()
+      self.assertAllClose(d32, d16, atol=3e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu, [x]))
     print("relu (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu(x, name="relu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.relu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("relu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
-    with self.cached_session() as sess:
-      x = variables.Variable(100.)
-      y = nn_ops.relu(x)
-      loss = y**2
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
-      train_op = optimizer.minimize(loss)
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(x.eval(), 50.0)
+    x = variables.Variable(100.)
+
+    def loss():
+      return nn_ops.relu(x)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.25)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), 50.0)
 
 
 class Relu6Test(test.TestCase):
@@ -228,57 +231,48 @@ class Relu6Test(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, 6.0], [0.1, -0.3, 6.5, -0.7,
                                                     0.9]])))
 
-  def _testRelu6(self, np_features, use_gpu=False):
+  def _testRelu6(self, np_features):
     np_relu6 = self._npRelu6(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      relu6 = nn_ops.relu6(np_features)
-      tf_relu6 = relu6.eval()
+    tf_relu6 = nn_ops.relu6(np_features)
     self.assertAllClose(np_relu6, tf_relu6)
-    self.assertShapeEqual(np_relu6, relu6)
+    self.assertShapeEqual(np_relu6, tf_relu6)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testRelu6(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float, np.double]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testRelu6(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float, np.double]:
+      self._testRelu6(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   # The gradient test for ReLU6 is a bit tricky as the derivative is
   # not well defined at around zero and six and we want to avoid that
   # in terms of input values.
   def testGradientFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 6.1, 6.3, 6.5, 6.7, 6.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.relu6(x, name="relu6")
-      x_init = np.asarray(
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [6.1, 6.3, 6.5, 6.7, 6.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.relu6, [x]))
     print("relu6 (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
@@ -297,109 +291,103 @@ class LeakyReluTest(test.TestCase):
                                                      0.9]]),
             alpha=0.1))
 
-  def _testLeakyRelu(self, np_features, alpha, use_gpu=False):
+  def _testLeakyRelu(self, np_features, alpha):
     np_leaky_relu = self._npLeakyRelu(np_features, alpha)
-    with self.test_session(use_gpu=use_gpu):
-      leaky_relu = nn_ops.leaky_relu(np_features, alpha)
-      tf_leaky_relu = leaky_relu.eval()
+    tf_leaky_relu = nn_ops.leaky_relu(np_features, alpha)
     self.assertAllClose(np_leaky_relu, tf_leaky_relu)
-    self.assertShapeEqual(np_leaky_relu, leaky_relu)
+    self.assertShapeEqual(np_leaky_relu, tf_leaky_relu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testLeakyRelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          alpha=0.2,
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testLeakyRelu(
             np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            alpha=0.1,
-            use_gpu=True)
+            alpha=0.2)
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testLeakyRelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
+          alpha=0.1)
 
   # The gradient test for Leaky ReLU is a bit tricky as the derivative is not
   # well defined at around zero and we want to avoid that in terms of input
   # values.
   def testGradientFloat32(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-      x_init = np.asarray(
+    with self.cached_session():
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
     print("leaky_relu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
-    with self.test_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.leaky_relu(x, alpha=0.2, name="leaky_relu")
-      x_init = np.asarray(
+    with self.cached_session():
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.leaky_relu, [x]))
     print("leaky_relu (float64) gradient err = ", err)
     self.assertLess(err, 1e-10)
 
   def testGradGradFloat32(self):
     with compat.forward_compatibility_horizon(2018, 11, 2):
-      with self.test_session():
-        x = constant_op.constant(
-            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-            shape=[2, 5],
-            name="x")
-        y = nn_ops.leaky_relu(x, alpha=0.1, name="leaky_relu")
-        z = gradients_impl.gradients(y, x)
-        x_init = np.asarray(
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float32
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
             [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
             dtype=np.float32,
             order="F")
-        err = gradient_checker.compute_gradient_error(
-            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
       print("leaky_relu (float32) gradient of gradient err = ", err)
       self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with compat.forward_compatibility_horizon(2018, 11, 2):
-      with self.test_session():
-        x = constant_op.constant(
-            [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-            shape=[2, 5],
-            dtype=dtypes.float64,
-            name="x")
-        y = nn_ops.leaky_relu(x, alpha=0.02, name="leaky_relu")
-        z = gradients_impl.gradients(y, x)
-        x_init = np.asarray(
+      with self.cached_session():
+
+        def f(x):
+          assert x.dtype == dtypes.float64
+          with backprop.GradientTape() as tape:
+            tape.watch(x)
+            y = nn_ops.leaky_relu(x)
+          return tape.gradient(y, x)
+
+        x = np.asarray(
             [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
             dtype=np.float64,
             order="F")
-        err = gradient_checker.compute_gradient_error(
-            x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+        err = gradient_checker_v2.max_error(
+            *gradient_checker_v2.compute_gradient(f, [x]))
       print("leaky_relu (float64) gradient of gradient err = ", err)
       self.assertLess(err, 1e-10)
 
   def testGradientScalar(self):
-    with self.test_session() as sess:
-      x = variables.Variable(-100.)
-      y = nn_ops.leaky_relu(x, 0.05)
-      loss = y**2
-      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
-      train_op = optimizer.minimize(loss)
-      sess.run(variables.global_variables_initializer())
-      sess.run(train_op)
-      self.assertAllClose(x.eval(), -99.9)
+    x = variables.Variable(-100.)
+
+    def loss():
+      return nn_ops.leaky_relu(x, 0.05)**2
+
+    optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=0.2)
+    self.evaluate(variables.global_variables_initializer())
+    self.evaluate(optimizer.minimize(loss))
+    self.assertAllClose(x.read_value(), -99.9)
 
 
 class EluTest(test.TestCase):
@@ -415,88 +403,94 @@ class EluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testElu(self, np_features, use_gpu=False):
+  def _testElu(self, np_features):
     np_elu = self._npElu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      elu = nn_ops.elu(np_features)
-      tf_elu = elu.eval()
+    tf_elu = nn_ops.elu(np_features)
     self.assertAllClose(np_elu, tf_elu)
-    self.assertShapeEqual(np_elu, elu)
+    self.assertShapeEqual(np_elu, tf_elu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.float16, np.float32, np.float64]:
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testElu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
+        self._testElu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testElu(np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.elu(x, name="elu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.elu, [x]))
     print("elu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
   def testGradGrad(self):
     with self.cached_session():
-      x = array_ops.placeholder(dtype=dtypes.float32)
-      elu = nn_ops.elu(x)
-      g, = gradients_impl.gradients(elu, x)
-      gg, = gradients_impl.gradients(g, x)
 
-      for x_val in [-1, -0.5, 0.5, 1]:
-        err = np.abs(gg.eval(feed_dict={x: x_val}) - _elu_grad_grad(x_val))
+      def f(x):
+        with backprop.GradientTape(persistent=True) as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+          dy = tape.gradient(y, x)
+        return tape.gradient(dy, x)
+
+      for x in [-1., -0.5, 0.5, 1.]:
+        got = self.evaluate(f(constant_op.constant(x)))
+        want = _elu_grad_grad(x)
+        err = np.abs(got - want)
         self.assertLess(err, 1e-4)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.elu(x, name="elu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.elu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("elu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
@@ -517,77 +511,74 @@ class SeluTest(test.TestCase):
             np.array([[-0.9, 0.7, -0.5, 0.3, -0.1], [0.1, -0.3, 0.5, -0.7,
                                                      0.9]])))
 
-  def _testSelu(self, np_features, use_gpu=False):
+  def _testSelu(self, np_features):
     np_selu = self._npSelu(np_features)
-    with self.cached_session(use_gpu=use_gpu):
-      selu = nn_ops.selu(np_features)
-      tf_selu = selu.eval()
+    tf_selu = nn_ops.selu(np_features)
     self.assertAllClose(np_selu, tf_selu)
-    self.assertShapeEqual(np_selu, selu)
+    self.assertShapeEqual(np_selu, tf_selu)
 
   def testNumbers(self):
     for t in [np.float16, np.float32, np.float64]:
       self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      self._testSelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=True)
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+      # Force executed on CPU in case GPU kernels are avaiable.
+      with ops.device("/device:CPU:0"):
+        self._testSelu(
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testGradientFloat32(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float32, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float32, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float32) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradientFloat64(self):
     with self.cached_session():
       x_val = [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]]
-      x = constant_op.constant(x_val, dtype=dtypes.float64, name="x")
-      y = nn_ops.selu(x, name="selu")
-      x_init = np.asarray(x_val, dtype=np.float64, order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], y, [2, 5], x_init_value=x_init)
+      x = np.asarray(x_val, dtype=np.float64, order="F")
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(nn_ops.selu, [x]))
     print("selu (float64) gradient err = ", err)
     self.assertLess(err, 1e-6)
 
   def testGradGradFloat32(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float32
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float32,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float32) gradient of gradient err = ", err)
     self.assertLess(err, 1e-4)
 
   def testGradGradFloat64(self):
     with self.cached_session():
-      x = constant_op.constant(
-          [-0.9, -0.7, -0.5, -0.3, -0.1, 0.1, 0.3, 0.5, 0.7, 0.9],
-          shape=[2, 5],
-          dtype=dtypes.float64,
-          name="x")
-      y = nn_ops.selu(x, name="selu")
-      z = gradients_impl.gradients(y, x)
-      x_init = np.asarray(
+
+      def f(x):
+        assert x.dtype == dtypes.float64
+        with backprop.GradientTape() as tape:
+          tape.watch(x)
+          y = nn_ops.selu(x)
+        return tape.gradient(y, x)
+
+      x = np.asarray(
           [[-0.9, -0.7, -0.5, -0.3, -0.1], [0.1, 0.3, 0.5, 0.7, 0.9]],
           dtype=np.float64,
           order="F")
-      err = gradient_checker.compute_gradient_error(
-          x, [2, 5], z[0], [2, 5], x_init_value=x_init)
+      err = gradient_checker_v2.max_error(
+          *gradient_checker_v2.compute_gradient(f, [x]))
     print("selu (float64) gradient of gradient err = ", err)
     self.assertLess(err, 1e-6)
 
@@ -599,46 +590,44 @@ class CreluTest(test.TestCase):
     t = nn_ops.crelu(f)
     self.assertEqual([50, 5, 7, 20], t.get_shape())
 
-  def _testCrelu(self, np_features, use_gpu=False):
+  def _testCrelu(self, np_features):
     np_relu = np.maximum(np_features, np.zeros_like(np_features))
     np_neg_relu = np.maximum(-np_features, np.zeros_like(np_features))
     np_crelu = np.concatenate((np_relu, np_neg_relu),
                               len(np_features.shape) - 1)
 
-    with self.cached_session(use_gpu=use_gpu):
-      crelu = nn_ops.crelu(np_features)
-      tf_relu = crelu.eval()
+    tf_crelu = nn_ops.crelu(np_features)
 
-    self.assertAllClose(np_crelu, tf_relu)
-    self.assertShapeEqual(np_crelu, crelu)
+    self.assertAllClose(np_crelu, tf_crelu)
+    self.assertShapeEqual(np_crelu, tf_crelu)
 
-  def testNumbers(self):
+  def testNumbersCPU(self):
     for t in [np.int32, np.int64, np.float16, np.float32, np.float64]:
-      self._testCrelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-          use_gpu=False)
-      if t in [np.float16, np.float32, np.float64]:
+      # Force execution on CPU even if a GPU kernel is available for the type.
+      with ops.device("/device:CPU:0"):
         self._testCrelu(
-            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
-            use_gpu=True)
+            np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
+
+  def testNumbersGPU(self):
+    if not test.is_gpu_available():
+      self.skipTest("No GPU available")
+    for t in [np.float16, np.float32, np.float64]:
+      self._testCrelu(
+          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t))
 
   def testNumbersWithAxis0(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
-                           [0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=0)
+    np_crelu = np.array([[0, 7, 0, 3, 0], [1, 0, 5, 0, 9], [9, 0, 5, 0, 1],
+                         [0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
   def testNumbersWithAxis1(self):
-    with self.cached_session():
-      crelu = nn_ops.crelu(
-          np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
-      tf_relu = crelu.eval()
-      np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
-                           [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
-      self.assertAllEqual(np_crelu, tf_relu)
+    tf_crelu = nn_ops.crelu(
+        np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]), axis=1)
+    np_crelu = np.array([[0, 7, 0, 3, 0, 9, 0, 5, 0, 1],
+                         [1, 0, 5, 0, 9, 0, 3, 0, 7, 0]])
+    self.assertAllEqual(np_crelu, tf_crelu)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/reshape_op_test.py b/tensorflow/python/kernel_tests/reshape_op_test.py
index 14cdae18370fb047d68eb31f62e92d20ad263146..db3e88a104f44fbea4df757a10203eea7ebcb278 100644
--- a/tensorflow/python/kernel_tests/reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/reshape_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -33,14 +34,14 @@ class ReshapeTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       np_ans = x.reshape(y)
       tf_ans = array_ops.reshape(x, y)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
       # Repeat with an int64 shape tensor.
       y64 = constant_op.constant(y, dtype=dtypes.int64)
       tf_ans = array_ops.reshape(x, y64)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       self.assertEqual(tf_ans.get_shape(), out.shape)
       self.assertShapeEqual(np_ans, tf_ans)
 
@@ -91,6 +92,7 @@ class ReshapeTest(test.TestCase):
   # TODO(vrv): Add tests for failure conditions once python test_util
   # reports errors.
 
+  @test_util.run_deprecated_v1
   def testFloatReshapeGradThreeDimensions(self):
     x = np.arange(1., 25.).reshape([2, 3, 4]).astype(np.float32)
     s = list(np.shape(x))
@@ -111,6 +113,7 @@ class ReshapeTest(test.TestCase):
     self._testBothReshape(x, [0, 0, 0])
     self._testBothReshape(x, [1, -1, 5])
 
+  @test_util.run_deprecated_v1
   def testErrors(self):
     y = constant_op.constant(0.0, shape=[23, 29, 31])
     with self.assertRaisesRegexp(ValueError, "must be evenly divisible by 17"):
@@ -121,6 +124,7 @@ class ReshapeTest(test.TestCase):
                                  "Cannot reshape a tensor with 4096 elements"):
       array_ops.reshape(z, [4095])
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     x = array_ops.placeholder(dtypes.float32)
 
diff --git a/tensorflow/python/kernel_tests/resource_variable_ops_test.py b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
index c8227dc117f316c1fa1cb7780c764fc7766f1a2d..433957fd1d38890c0952c443097e4955e1eb99cb 100644
--- a/tensorflow/python/kernel_tests/resource_variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/resource_variable_ops_test.py
@@ -29,9 +29,12 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -53,6 +56,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     # involving objects with __del__ defined.
     self.assertEqual(0, len(gc.garbage))
 
+  @test_util.run_deprecated_v1
   def testHandleDtypeShapeMatch(self):
     with self.cached_session():
       handle = resource_variable_ops.var_handle_op(dtype=dtypes.int32, shape=[])
@@ -122,6 +126,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       # values.
       self.assertFalse(np.allclose(variable.numpy(), copied_variable.numpy()))
 
+  @test_util.run_deprecated_v1
   def testGraphDeepCopy(self):
     with self.cached_session():
       init_value = np.ones((4, 4, 4))
@@ -137,6 +142,15 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(v[0].assign(2.0))
     self.assertAllEqual(self.evaluate(v), [2.0, 2.0])
 
+  @test_util.run_in_graph_and_eager_modes
+  def testVariableShape(self):
+    v = resource_variable_ops.ResourceVariable([1., 1.])
+    self.assertAllEqual(
+        tensor_util.constant_value(
+            resource_variable_ops.variable_shape(v.handle)),
+        [2])
+
+  @test_util.run_deprecated_v1
   def testDifferentAssignGraph(self):
     with ops.Graph().as_default():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -144,16 +158,18 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.assign(2.0)  # Note: this fails if we run convert_to_tensor on not the
     # variable graph.
 
+  @test_util.run_deprecated_v1
   def testFetchHandle(self):
     with self.cached_session():
       handle = resource_variable_ops.var_handle_op(
           dtype=dtypes.int32, shape=[1], name="foo")
       self.assertGreater(len(handle.eval()), 0)
 
+  @test_util.run_deprecated_v1
   def testCachedValueReadBeforeWrite(self):
     with self.cached_session() as sess:
       v = resource_variable_ops.ResourceVariable(0.0, caching_device="cpu:0")
-      sess.run(v.initializer)
+      self.evaluate(v.initializer)
       value, _ = sess.run([v, v.assign_add(1.0)])
       self.assertAllEqual(value, 0.0)
 
@@ -426,6 +442,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     read = resource_variable_ops.read_variable_op(handle, dtype=dtypes.int32)
     self.assertEqual(self.evaluate(read), [[6]])
 
+  @test_util.run_deprecated_v1
   def testScatterUpdateString(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -437,6 +454,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(compat.as_bytes(self.evaluate(read)[0][0]),
                      compat.as_bytes("b"))
 
+  @test_util.run_deprecated_v1
   def testScatterUpdateStringScalar(self):
     handle = resource_variable_ops.var_handle_op(
         dtype=dtypes.string, shape=[1, 1])
@@ -456,7 +474,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
 
   # TODO(alive): get this to work in Eager mode.
   def testGPU(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       abc = variable_scope.get_variable(
           "abc",
           shape=[1],
@@ -491,6 +509,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           initial_value=lambda: 1, constraint=constraint, name="var1")
 
   # TODO(alive): how should this work in Eager mode?
+  @test_util.run_deprecated_v1
   def testInitFn(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(
@@ -568,6 +587,48 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     v.load(2.0)
     self.assertEqual(2.0, self.evaluate(v.value()))
 
+  def testShapePassedToGradient(self):
+    with ops.Graph().as_default():
+      @custom_gradient.custom_gradient
+      def differentiable_scatter_update(handle, indices, values):
+        with ops.control_dependencies([
+            resource_variable_ops.resource_scatter_update(
+                handle, indices, values)]):
+          new_handle = array_ops.identity(handle)
+
+        def grad(dresult):
+          self.assertIsNotNone(
+              tensor_util.constant_value(dresult.dense_shape))
+          return [dresult, None, None]
+
+        return new_handle, grad
+
+      var = variable_scope.get_variable(
+          "foo", shape=[20], initializer=init_ops.zeros_initializer,
+          dtype=dtypes.float64, use_resource=True)
+
+      indices = math_ops.range(10)
+      updates = math_ops.range(9, -1, -1, dtype=dtypes.float64)
+      new_handle = differentiable_scatter_update(var.handle, indices, updates)
+      gathered = resource_variable_ops.resource_gather(
+          new_handle, indices, dtype=var.dtype)
+      gradients_impl.gradients([gathered], [updates])
+
+  def testToFromProtoCachedValue(self):
+    with ops.Graph().as_default():
+      v_def = resource_variable_ops.ResourceVariable(
+          initial_value=constant_op.constant(3.0)).to_proto()
+      v_prime = resource_variable_ops.ResourceVariable(variable_def=v_def)
+      self.assertTrue(getattr(v_prime, "_cached_value", None) is None)
+
+      other_v_def = resource_variable_ops.ResourceVariable(
+          caching_device="cpu:0",
+          initial_value=constant_op.constant(3.0)).to_proto()
+      other_v_prime = resource_variable_ops.ResourceVariable(
+          variable_def=other_v_def)
+      self.assertTrue(other_v_prime._cached_value is not None)
+
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = resource_variable_ops.ResourceVariable(
@@ -576,11 +637,11 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     with ops.Graph().as_default(), self.cached_session() as sess:
       # v describes a VariableDef-based variable without an initial value.
       v = resource_variable_ops.ResourceVariable(variable_def=v_def)
-      self.assertEqual(3.0, sess.run(v.initialized_value()))
+      self.assertEqual(3.0, self.evaluate(v.initialized_value()))
 
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
-      sess.run(v.assign(1.0))
+      self.evaluate(v.assign(1.0))
       self.assertEqual(1.0, v.initialized_value().eval())
 
     v_def.ClearField("initial_value_name")
@@ -592,7 +653,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertProtoEquals(v_def, v.to_proto())
       # But attempts to use initialized_value will result in errors.
       with self.assertRaises(ValueError):
-        sess.run(v.initialized_value())
+        self.evaluate(v.initialized_value())
 
   def testTrainableInProto(self):
     with ops.Graph().as_default():
@@ -623,6 +684,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     value = self.evaluate(v.sparse_read([0, 3, 1, 2]))
     self.assertAllEqual(init_value[[0, 3, 1, 2], ...], value)
 
+  @test_util.run_deprecated_v1
   def testToFromProto(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(1.0)
@@ -671,6 +733,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(0.0, self.evaluate(v.value()))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testDestroyResource(self):
     v = resource_variable_ops.ResourceVariable(3.0, name="var0")
     self.evaluate(variables.global_variables_initializer())
@@ -684,6 +747,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.evaluate(resource_variable_ops.destroy_resource_op(
         handle, ignore_lookup_error=True))
 
+  @test_util.run_deprecated_v1
   def testAssignDifferentShapes(self):
     with self.cached_session() as sess, variable_scope.variable_scope(
         "foo", use_resource=True):
@@ -704,6 +768,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           assign = var.assign(np.zeros(shape=[2, 2]))
           self.evaluate(assign)
 
+  @test_util.run_deprecated_v1
   def testDtypeAfterFromProto(self):
     v = resource_variable_ops.ResourceVariable(2.0)
     w = resource_variable_ops.ResourceVariable.from_proto(v.to_proto())
@@ -711,6 +776,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
     self.assertEqual(v.dtype, w.dtype)
 
   # TODO(alive): get caching to work in eager mode.
+  @test_util.run_deprecated_v1
   def testCachingDevice(self):
     with ops.device("/job:server/task:1"):
       v = resource_variable_ops.ResourceVariable(
@@ -726,6 +792,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         _ = w.value().op.get_attr("_class")
 
+  @test_util.run_v1_only("b/120545219")
   def testSharedName(self):
     with self.cached_session():
       v = resource_variable_ops.ResourceVariable(300.0, name="var4")
@@ -736,7 +803,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           # Needed in Eager since we get a unique container name by default.
           container=ops.get_default_graph()._container)
       w_read = resource_variable_ops.read_variable_op(w, v.dtype.base_dtype)
-      self.assertEqual(300.0, w_read.eval())
+      self.assertEqual(300.0, self.evaluate(w_read))
 
       x = resource_variable_ops.var_handle_op(
           dtype=v.dtype.base_dtype, shape=v.get_shape(), shared_name="var5",
@@ -744,6 +811,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("Resource .*/var5/.* does not exist"):
         resource_variable_ops.read_variable_op(x, v.dtype.base_dtype).eval()
 
+  @test_util.run_deprecated_v1
   def testSharedNameWithNamescope(self):
     with self.cached_session():
       with ops.name_scope("foo"):
@@ -772,6 +840,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
           "<unknown>",
           str(v.sparse_read(array_ops.placeholder(dtypes.int32)).shape))
 
+  @test_util.run_deprecated_v1
   def testSetInitialValue(self):
     with self.cached_session():
       # Initialize variable with a value different from the initial value passed
@@ -780,6 +849,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       v.initializer.run(feed_dict={v.initial_value: 3.0})
       self.assertEqual(3.0, v.value().eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
 
@@ -916,6 +986,7 @@ class ResourceVariableOpsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(self.evaluate(v.assign_add(1)), [1, 2, 3, 4])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testCopyToGraphUninitialized(self):
     v = resource_variable_ops.ResourceVariable([0, 1, 2, 3])
     copy_to_graph = ops.Graph()
diff --git a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
index 56609bd0a5ea8a2ce161f317b4a6977987b5821d..05307c9834ad2ab05bb5a2b557466255e92c6d1e 100644
--- a/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
+++ b/tensorflow/python/kernel_tests/reverse_sequence_op_test.py
@@ -23,6 +23,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.platform import test
@@ -42,12 +43,12 @@ class ReverseSequenceTest(test.TestCase):
       ans = array_ops.reverse_sequence(
           x, batch_axis=batch_axis, seq_axis=seq_axis, seq_lengths=seq_lengths)
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
         self.assertShapeEqual(truth, ans)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def _testBothReverseSequence(self,
                                x,
@@ -107,6 +108,7 @@ class ReverseSequenceTest(test.TestCase):
   def testComplex128Basic(self):
     self._testBasic(np.complex128)
 
+  @test_util.run_deprecated_v1
   def testFloatReverseSequenceGrad(self):
     x = np.asarray(
         [[[1, 2, 3, 4], [5, 6, 7, 8]], [[9, 10, 11, 12], [13, 14, 15, 16]],
@@ -133,6 +135,7 @@ class ReverseSequenceTest(test.TestCase):
     print("ReverseSequence gradient error = %g" % err)
     self.assertLess(err, 1e-8)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     t = array_ops.reverse_sequence(
         array_ops.placeholder(
diff --git a/tensorflow/python/kernel_tests/rnn_test.py b/tensorflow/python/kernel_tests/rnn_test.py
index 0090b7332f9184e1d513108bb68a41ccb016a5f4..a49496e4ef15bc2772fe7abdac4d801b77787079 100644
--- a/tensorflow/python/kernel_tests/rnn_test.py
+++ b/tensorflow/python/kernel_tests/rnn_test.py
@@ -262,6 +262,7 @@ class RNNTest(test.TestCase):
       rnn.dynamic_rnn(cell, inputs, dtype=dtypes.float32, sequence_length=[4])
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayStateIsAccepted(self):
     cell = TensorArrayStateRNNCell()
     in_eager_mode = context.executing_eagerly()
@@ -285,6 +286,7 @@ class RNNTest(test.TestCase):
     self.assertAllEqual(4, state[0])
     self.assertAllEqual([[[1]], [[2]], [[3]], [[4]]], state[1])
 
+  @test_util.run_deprecated_v1
   def testCellGetInitialState(self):
     cell = rnn_cell_impl.BasicRNNCell(5)
     with self.assertRaisesRegexp(
@@ -345,6 +347,7 @@ class RNNTest(test.TestCase):
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f32, 5, 7, 3)
     self._assert_cell_builds(contrib_rnn.IndyLSTMCell, f64, 5, 7, 3)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasSimpleRNNCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -378,6 +381,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasGRUCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -411,6 +415,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithKerasLSTMCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -448,6 +453,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(state[0]), batch)
       self.assertEqual(len(state[1]), batch)
 
+  @test_util.run_deprecated_v1
   def testRNNWithStackKerasCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -491,6 +497,7 @@ class RNNTest(test.TestCase):
       for s in state:
         self.assertEqual(len(s), batch)
 
+  @test_util.run_deprecated_v1
   def testStaticRNNWithKerasSimpleRNNCell(self):
     with self.cached_session() as sess:
       input_shape = 10
@@ -529,6 +536,7 @@ class RNNTest(test.TestCase):
       self.assertEqual(len(outputs[0]), batch)
       self.assertEqual(len(state), batch)
 
+  @test_util.run_deprecated_v1
   def testKerasAndTFRNNLayerOutputComparison(self):
     input_shape = 10
     output_shape = 5
@@ -562,6 +570,7 @@ class RNNTest(test.TestCase):
     self.assertAllClose(tf_out, k_out)
     self.assertAllClose(tf_state, k_state)
 
+  @test_util.run_deprecated_v1
   def testSimpleRNNCellAndBasicRNNCellComparison(self):
     input_shape = 10
     output_shape = 5
@@ -601,6 +610,7 @@ class RNNTest(test.TestCase):
     self.assertAllClose(tf_out, k_out, atol=1e-5)
     self.assertAllClose(tf_state, k_state, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testBasicLSTMCellInterchangeWithLSTMCell(self):
     with self.session(graph=ops_lib.Graph()) as sess:
       basic_cell = rnn_cell_impl.BasicLSTMCell(1)
diff --git a/tensorflow/python/kernel_tests/save_restore_ops_test.py b/tensorflow/python/kernel_tests/save_restore_ops_test.py
index cb9aa1e34d6eb82efa94e60e7b56c26b181cef04..fecc9a3800fd85958d204144613a3f239ea43404 100644
--- a/tensorflow/python/kernel_tests/save_restore_ops_test.py
+++ b/tensorflow/python/kernel_tests/save_restore_ops_test.py
@@ -17,14 +17,30 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import os
+
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.platform import test
 
 
+class SaveTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testRelativePath(self):
+    os.chdir(self.get_temp_dir())
+    self.evaluate(io_ops.save_v2(
+        "ckpt", ["x"], [""], [constant_op.constant(100.)]))
+    self.assertAllEqual([100.],
+                        self.evaluate(io_ops.restore_v2(
+                            "ckpt", ["x"], [""], [dtypes.float32])))
+
+
 class ShardedFileOpsTest(test.TestCase):
 
   def testShardedFileName(self):
@@ -39,6 +55,7 @@ class ShardedFileOpsTest(test.TestCase):
 
 class ShapeInferenceTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRestoreV2WithSliceInput(self):
     op = io_ops.restore_v2("model", ["var1", "var2"], ["", "3 4 0,1:-"],
                            [dtypes.float32, dtypes.float32])
@@ -46,11 +63,13 @@ class ShapeInferenceTest(test.TestCase):
     self.assertFalse(op[0].get_shape().is_fully_defined())
     self.assertEqual([1, 4], op[1].get_shape())
 
+  @test_util.run_deprecated_v1
   def testRestoreV2NumSlicesNotMatch(self):
     with self.assertRaises(ValueError):
       io_ops.restore_v2("model", ["var1", "var2", "var3"], ["", "3 4 0,1:-"],
                         [dtypes.float32, dtypes.float32])
 
+  @test_util.run_deprecated_v1
   def testRestoreSlice(self):
     op = gen_io_ops.restore_slice("model", "var", "3 4 0,1:-", dtypes.float32)
     self.assertEqual([1, 4], op.get_shape())
diff --git a/tensorflow/python/kernel_tests/scan_ops_test.py b/tensorflow/python/kernel_tests/scan_ops_test.py
index b36922256525f55e4958fa9b0dabfe9215580212..33e491fee1dadbcce225dfa70310d47a21b6893c 100644
--- a/tensorflow/python/kernel_tests/scan_ops_test.py
+++ b/tensorflow/python/kernel_tests/scan_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -88,12 +89,14 @@ class CumsumTest(test.TestCase):
       for reverse in [True, False]:
         self._compare(x, axis, exclusive, reverse)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in self.valid_dtypes:
       x = np.zeros([0]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def testAxisType(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
@@ -102,30 +105,40 @@ class CumsumTest(test.TestCase):
           axis = constant_op.constant(0, axis_dtype)
           tf_out = math_ops.cumsum(x, axis).eval()
 
+  @test_util.run_deprecated_v1
   def test1D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test2D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(0, 10).reshape([2, 5]).astype(dtype)
       for axis in (-2, -1, 0, 1):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test3D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(0, 20).reshape([2, 2, 5]).astype(dtype)
       for axis in (-3, -2, -1, 0, 1, 2):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test6D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
       for axis in range(-6, 6, 3):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
+  def testLarge(self):
+    for dtype in self.valid_dtypes:
+      x = np.ones([1000000], dtype=dtype) / 1024
+      self._compareAll(x, 0)
+
   def testInvalidAxis(self):
     x = np.arange(0, 10).reshape([2, 5]).astype(np.float32)
     input_tensor = ops.convert_to_tensor(x)
@@ -152,22 +165,27 @@ class CumsumTest(test.TestCase):
           t, shape, result, shape, x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, False, False)
 
+  @test_util.run_deprecated_v1
   def testGradientReverse(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, False, True)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusive(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, True, False)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusiveReverse(self):
     for axis in (-1, 0):
       self._compareGradient([50], axis, True, True)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
     for axis in (-1, 0, 1):
       for exclusive in [True, False]:
@@ -194,12 +212,14 @@ class CumprodTest(test.TestCase):
       for reverse in [True, False]:
         self._compare(x, axis, exclusive, reverse)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     for dtype in self.valid_dtypes:
       x = np.zeros([0]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def testAxisType(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
@@ -208,24 +228,28 @@ class CumprodTest(test.TestCase):
           axis = constant_op.constant(0, axis_dtype)
           tf_out = math_ops.cumprod(x, axis).eval()
 
+  @test_util.run_deprecated_v1
   def test1D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 6).reshape([5]).astype(dtype)
       for axis in (-1, 0):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test2D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 11).reshape([2, 5]).astype(dtype)
       for axis in (-2, -1, 0, 1):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test3D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 21).reshape([2, 2, 5]).astype(dtype)
       for axis in (-3, -2, -1, 0, 1, 2):
         self._compareAll(x, axis)
 
+  @test_util.run_deprecated_v1
   def test6D(self):
     for dtype in self.valid_dtypes:
       x = np.arange(1, 145).reshape([2, 2, 3, 3, 2, 2]).astype(dtype)
@@ -258,22 +282,27 @@ class CumprodTest(test.TestCase):
           t, shape, result, shape, x_init_value=x, delta=1)
     self.assertAllClose(jacob_t, jacob_n, rtol=1e-8, atol=1e-8)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, False, False)
 
+  @test_util.run_deprecated_v1
   def testGradientReverse(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, False, True)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusive(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, True, False)
 
+  @test_util.run_deprecated_v1
   def testGradientExclusiveReverse(self):
     for axis in (-1, 0):
       self._compareGradient([8], axis, True, True)
 
+  @test_util.run_deprecated_v1
   def testGradient2D(self):
     for axis in (-2, -1, 0, 1):
       for exclusive in [True, False]:
diff --git a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
index 0ed508b9fe2e4c575a2053af3da099ad64fcaa3e..8510a08f0c96dd9ae08a2ca3e782cc7d28e86264 100644
--- a/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_nd_ops_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
@@ -144,7 +145,7 @@ class StatefulScatterNdTest(test.TestCase):
         tf_scatter(ref_var, indices, updates).eval()
 
         # Compare
-        self.assertAllClose(new, ref_var.eval())
+        self.assertAllClose(new, self.evaluate(ref_var))
 
   def _VariableRankTests(self, np_scatter, tf_scatter):
     for vtype in (np.int32, np.float16, np.float32, np.float64, np.complex64,
@@ -161,10 +162,11 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with self.session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testSimpleResource(self):
     indices = constant_op.constant([[4], [3], [1], [7]], dtype=dtypes.int32)
     updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
@@ -175,8 +177,8 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with self.session(use_gpu=True) as sess:
-      sess.run(init)
-      sess.run(scatter)
+      self.evaluate(init)
+      self.evaluate(scatter)
       self.assertAllClose(ref.eval(), expected)
 
   def testSimple2(self):
@@ -189,8 +191,8 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with self.session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
   def testSimple3(self):
@@ -203,16 +205,19 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with self.session(use_gpu=True) as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     self._VariableRankTests(_NumpyUpdate, state_ops.scatter_nd_update)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAdd(self):
     self._VariableRankTests(_NumpyAdd, state_ops.scatter_nd_add)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableRankSub(self):
     self._VariableRankTests(_NumpySub, state_ops.scatter_nd_sub)
 
@@ -230,6 +235,7 @@ class StatefulScatterNdTest(test.TestCase):
         self._VariableRankTest(
             np_scatter, tf_scatter, vtype, itype, repeat_indices=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testScatterRepeatIndices(self):
     """This tests scatter_add using indices that repeat."""
     self._ScatterRepeatIndicesTest(_NumpyAdd, state_ops.scatter_nd_add)
@@ -249,8 +255,9 @@ class StatefulScatterNdTest(test.TestCase):
   #             [[0]], dtype=tf.int64), [False])
   #     var.initializer.run()
   #     session.run([update0, update1])
-  #     self.assertAllEqual([False, True], var.eval())
+  #     self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testScatterOutOfRangeCpu(self):
     # TODO(simister): Re-enable once binary size increase due to
     # scatter_nd ops is under control.
@@ -287,6 +294,7 @@ class StatefulScatterNdTest(test.TestCase):
         state_ops.scatter_nd_update(ref, indices,
                                     updates).get_shape().as_list(), shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testResVarInvalidOutputShape(self):
     res = variables.Variable(
         initial_value=lambda: array_ops.zeros(shape=[], dtype=dtypes.float32),
@@ -296,6 +304,7 @@ class StatefulScatterNdTest(test.TestCase):
       with self.assertRaisesOpError("Output must be at least 1-D"):
         state_ops.scatter_nd_update(res, [[0]], [0.22]).eval()
 
+  @test_util.run_deprecated_v1
   def testExtraIndicesDimensions(self):
     indices = array_ops.zeros([1, 1, 2], dtypes.int32)
     updates = array_ops.zeros([1, 1], dtypes.int32)
@@ -307,8 +316,9 @@ class StatefulScatterNdTest(test.TestCase):
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
       ref.initializer.run()
-      self.assertAllEqual(expected_result, scatter_update.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter_update))
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape1(self):
     indices = array_ops.zeros([3, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -318,6 +328,7 @@ class StatefulScatterNdTest(test.TestCase):
         ValueError, "The outer \\d+ dimensions of indices\\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape2(self):
     indices = array_ops.zeros([2, 2, 1], dtypes.int32)
     updates = array_ops.zeros([2, 2], dtypes.int32)
@@ -327,6 +338,7 @@ class StatefulScatterNdTest(test.TestCase):
         ValueError, "The inner \\d+ dimensions of input\\.shape="):
       state_ops.scatter_nd_update(ref, indices, updates)
 
+  @test_util.run_deprecated_v1
   def testConcurrentUpdates(self):
     num_updates = 10000
     update_values = np.random.rand(num_updates)
@@ -341,8 +353,8 @@ class StatefulScatterNdTest(test.TestCase):
     init = variables.global_variables_initializer()
 
     with session.Session() as sess:
-      sess.run(init)
-      result = sess.run(scatter)
+      self.evaluate(init)
+      result = self.evaluate(scatter)
       assert np.allclose(result, expected_result)
 
   # TODO(fpmc): Re-enable this test when gpu_pip test actually runs on a GPU.
@@ -421,7 +433,7 @@ class ScatterNdTest(test.TestCase):
                          b"", b"", b"seven"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by same value.
@@ -432,7 +444,7 @@ class ScatterNdTest(test.TestCase):
     expected = np.array([b"", b"", b"", b"bb", b"a", b"", b"", b"c"])
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertAllEqual(expected, result)
 
     # Same indice is updated twice by different value.
@@ -444,7 +456,7 @@ class ScatterNdTest(test.TestCase):
                 np.array([b"", b"", b"", b"cb", b"a", b"", b"", b"d"])]
     scatter = self.scatter_nd(indices, updates, shape=(8,))
     with self.cached_session() as sess:
-      result = sess.run(scatter)
+      result = self.evaluate(scatter)
       self.assertTrue(np.array_equal(result, expected[0]) or
                       np.array_equal(result, expected[1]))
 
@@ -455,6 +467,7 @@ class ScatterNdTest(test.TestCase):
     self.assertAllEqual(
         self.scatter_nd(indices, updates, shape).get_shape().as_list(), shape)
 
+  @test_util.run_deprecated_v1
   def testExtraIndicesDimensions(self):
     indices = array_ops.zeros([1, 1, 2], dtypes.int32)
     updates = array_ops.zeros([1, 1], dtypes.int32)
@@ -463,26 +476,30 @@ class ScatterNdTest(test.TestCase):
     self.assertAllEqual(scatter.get_shape().as_list(), shape)
     expected_result = np.zeros([2, 2], dtype=np.int32)
     with self.cached_session():
-      self.assertAllEqual(expected_result, scatter.eval())
+      self.assertAllEqual(expected_result, self.evaluate(scatter))
 
+  @test_util.run_deprecated_v1
   def testUndefinedIndicesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testUndefinedUpdatesShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=None)
     shape = constant_op.constant([2, 2, 2], dtypes.int32)
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testUndefinedOutputShape(self):
     indices = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     updates = array_ops.placeholder(dtypes.int32, shape=[2, 2, 2])
     shape = array_ops.placeholder(dtypes.int32, shape=[None])
     self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testEmptyOutputShape1(self):
     indices = array_ops.zeros([2, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -492,6 +509,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "Indices and updates specified for empty output shape"):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyOutputShape2(self):
     indices = array_ops.placeholder(dtypes.int32, shape=None)
     updates = array_ops.placeholder(dtypes.int32, shape=None)
@@ -505,6 +523,7 @@ class ScatterNdTest(test.TestCase):
             updates: np.zeros([2, 2, 2], dtype=np.int32)
         })
 
+  @test_util.run_deprecated_v1
   def testEmptyOutputShape3(self):
     indices = array_ops.zeros([0], dtypes.int32)
     updates = array_ops.zeros([0], dtypes.int32)
@@ -514,6 +533,7 @@ class ScatterNdTest(test.TestCase):
     with self.cached_session():
       self.assertEqual(scatter.eval().size, 0)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape1(self):
     indices = array_ops.zeros([3, 2, 2], dtypes.int32)
     updates = array_ops.zeros([2, 2, 2], dtypes.int32)
@@ -522,6 +542,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "The outer \\d+ dimensions of indices\\.shape="):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testRank3InvalidShape2(self):
     indices = array_ops.zeros([2, 2, 1], dtypes.int32)
     updates = array_ops.zeros([2, 2], dtypes.int32)
@@ -530,6 +551,7 @@ class ScatterNdTest(test.TestCase):
         ValueError, "The inner \\d+ dimensions of (input|output)\\.shape="):
       self.scatter_nd(indices, updates, shape)
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2ElementUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
       indices = constant_op.constant([[0, 0], [1, 1]], dtype=dtypes.int32)
@@ -545,10 +567,11 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[1, 2], [3, 4]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank2SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
       indices = constant_op.constant([[1], [0]], dtype=dtypes.int32)
@@ -565,10 +588,11 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[3, 4], [1, 2]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank3SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
       indices = constant_op.constant([[[0, 1], [1, 0]], [[0, 0], [1, 1]]],
@@ -588,10 +612,11 @@ class ScatterNdTest(test.TestCase):
       expected_input_grad = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
                                      dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
+  @test_util.run_deprecated_v1
   def testGradientsRank7SliceUpdate(self):
     for dtype in GRADIENT_TESTS_DTYPES:
       indices = constant_op.constant(
@@ -615,10 +640,11 @@ class ScatterNdTest(test.TestCase):
           [[[[[[[1, 2], [3, 4]]]], [[[[5, 6], [7, 8]]]]]]],
           dtype=dtype.as_numpy_dtype())
       with self.cached_session():
-        self.assertAllEqual(expected_updates_grad, updates_grad.eval())
+        self.assertAllEqual(expected_updates_grad, self.evaluate(updates_grad))
         if self.non_aliasing_add_test:
-          self.assertAllEqual(expected_input_grad, input_grad.eval())
+          self.assertAllEqual(expected_input_grad, self.evaluate(input_grad))
 
+  @test_util.run_deprecated_v1
   def testScatterNdRepatedIndicesAdd(self):
     indices = array_ops.zeros([100000, 1], dtypes.int32)
     values = np.random.randn(100000)
@@ -627,6 +653,7 @@ class ScatterNdTest(test.TestCase):
       val = self.scatter_nd(indices, values, shape).eval()
     self.assertAllClose([np.sum(values)], val)
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch2DSliceDim2(self):
     with self.cached_session():
       indices = array_ops.zeros([3, 5, 2], dtype=dtypes.int32)
@@ -634,6 +661,7 @@ class ScatterNdTest(test.TestCase):
       shape = [4, 6, 7]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch1DSliceDim2(self):
     with self.cached_session():
       indices = array_ops.zeros([0, 2], dtype=dtypes.int32)
@@ -641,6 +669,7 @@ class ScatterNdTest(test.TestCase):
       shape = [4, 6, 7]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch1DSliceDim3ShapeRank7(self):
     with self.cached_session():
       indices = array_ops.zeros([1, 3], dtype=dtypes.int32)
@@ -648,6 +677,7 @@ class ScatterNdTest(test.TestCase):
       shape = [3, 4, 5, 6, 7, 8, 9]
       self.scatter_nd(indices, values, shape).eval()
 
+  @test_util.run_deprecated_v1
   def testSmokeScatterNdBatch2DSliceDim3ShapeRank7(self):
     with self.cached_session():
       indices = array_ops.zeros([1, 2, 3], dtype=dtypes.int32)
@@ -669,5 +699,56 @@ class ScatterNdNonAliasingAddTest(ScatterNdTest):
     pass
 
 
+class ScatterNdTensorTest(test.TestCase):
+
+  @test_util.run_in_graph_and_eager_modes
+  def testUpdateAddSub(self):
+    indices = constant_op.constant([[4], [3], [1], [7]])
+    updates = constant_op.constant([9, 10, 11, 12], dtype=dtypes.float32)
+    t = array_ops.ones([8], dtype=dtypes.float32)
+    assigned = array_ops.tensor_scatter_update(t, indices, updates)
+    added = array_ops.tensor_scatter_add(t, indices, updates)
+    subbed = array_ops.tensor_scatter_sub(t, indices, updates)
+
+    self.assertAllEqual(assigned,
+                        constant_op.constant([1, 11, 1, 10, 9, 1, 1, 12]))
+    self.assertAllEqual(added,
+                        constant_op.constant([1, 12, 1, 11, 10, 1, 1, 13]))
+    self.assertAllEqual(subbed,
+                        constant_op.constant([1, -10, 1, -9, -8, 1, 1, -11]))
+
+  @test_util.run_v1_only("b/120545219")
+  def testUpdateAddSubGradients(self):
+
+    with self.cached_session():
+      indices = constant_op.constant([[3], [1]])
+      updates = constant_op.constant([9, 10], dtype=dtypes.float32)
+      x = array_ops.ones([4], dtype=dtypes.float32)
+
+      assigned = array_ops.tensor_scatter_update(x, indices, updates)
+      added = array_ops.tensor_scatter_add(x, indices, updates)
+      subbed = array_ops.tensor_scatter_sub(x, indices, updates)
+
+      err_assigned = gradient_checker.compute_gradient_error(
+          x, [4], assigned, [4])
+      err_added = gradient_checker.compute_gradient_error(x, [4], added, [4])
+      err_subbed = gradient_checker.compute_gradient_error(x, [4], subbed, [4])
+
+      self.assertLess(err_assigned, 2e-4)
+      self.assertLess(err_added, 2e-4)
+      self.assertLess(err_subbed, 2e-4)
+
+      err_assigned_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], assigned, [4])
+      err_added_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], added, [4])
+      err_subbed_wrt_updates = gradient_checker.compute_gradient_error(
+          updates, [2], subbed, [4])
+
+      self.assertLess(err_assigned_wrt_updates, 2e-4)
+      self.assertLess(err_added_wrt_updates, 2e-4)
+      self.assertLess(err_subbed_wrt_updates, 2e-4)
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/scatter_ops_test.py b/tensorflow/python/kernel_tests/scatter_ops_test.py
index 87c345245c1d982b21afd52cf3e3da89fdff20ad..623c17d373cc7231d7191b715a77b6a3cf8701fc 100644
--- a/tensorflow/python/kernel_tests/scatter_ops_test.py
+++ b/tensorflow/python/kernel_tests/scatter_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -196,84 +197,111 @@ class ScatterTest(test.TestCase):
         self._VariableRankTest(tf_scatter, vtype, itype, repeat_indices,
                                updates_are_scalar)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdate(self):
     self._VariableRankTests(state_ops.scatter_update, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAdd(self):
     self._VariableRankTests(state_ops.scatter_add, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankSub(self):
     self._VariableRankTests(state_ops.scatter_sub, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMul(self):
     self._VariableRankTests(state_ops.scatter_mul, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankDiv(self):
     self._VariableRankTests(state_ops.scatter_div, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMin(self):
     self._VariableRankTests(state_ops.scatter_min, False)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMax(self):
     self._VariableRankTests(state_ops.scatter_max, False)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesAdd(self):
     self._VariableRankTests(state_ops.scatter_add, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesSub(self):
     self._VariableRankTests(state_ops.scatter_sub, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMul(self):
     self._VariableRankTests(state_ops.scatter_mul, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesDiv(self):
     self._VariableRankTests(state_ops.scatter_div, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMin(self):
     self._VariableRankTests(state_ops.scatter_min, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMax(self):
     self._VariableRankTests(state_ops.scatter_max, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankUpdateScalar(self):
     self._VariableRankTests(state_ops.scatter_update, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankAddScalar(self):
     self._VariableRankTests(state_ops.scatter_add, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankSubScalar(self):
     self._VariableRankTests(state_ops.scatter_sub, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMulScalar(self):
     self._VariableRankTests(state_ops.scatter_mul, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankDivScalar(self):
     self._VariableRankTests(state_ops.scatter_div, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMinScalar(self):
     self._VariableRankTests(state_ops.scatter_min, False, True)
 
+  @test_util.run_deprecated_v1
   def testVariableRankMaxScalar(self):
     self._VariableRankTests(state_ops.scatter_max, False, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesAddScalar(self):
     self._VariableRankTests(state_ops.scatter_add, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesSubScalar(self):
     self._VariableRankTests(state_ops.scatter_sub, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMulScalar(self):
     self._VariableRankTests(state_ops.scatter_mul, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesDivScalar(self):
     self._VariableRankTests(state_ops.scatter_div, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMinScalar(self):
     self._VariableRankTests(state_ops.scatter_min, True, True)
 
+  @test_util.run_deprecated_v1
   def testRepeatIndicesMaxScalar(self):
     self._VariableRankTests(state_ops.scatter_max, True, True)
 
+  @test_util.run_deprecated_v1
   def testBooleanScatterUpdate(self):
     if not test.is_gpu_available():
       with self.session(use_gpu=False) as session:
@@ -286,8 +314,9 @@ class ScatterTest(test.TestCase):
 
         session.run([update0, update1])
 
-        self.assertAllEqual([False, True], var.eval())
+        self.assertAllEqual([False, True], self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testScatterOutOfRangeCpu(self):
     for op, _ in _TF_OPS_TO_NUMPY.items():
       params = np.array([1, 2, 3, 4, 5, 6]).astype(np.float32)
@@ -320,19 +349,19 @@ class ScatterTest(test.TestCase):
       updates = np.array([-3, -4, -5]).astype(np.float32)
       # With GPU, the code ignores indices that are out of range.
       # We don't test the implementation; just test there's no failures.
-      with self.cached_session(force_gpu=True):
+      with test_util.force_gpu():
         ref = variables.Variable(params)
         ref.initializer.run()
 
         # Indices all in range, no problem.
         indices = np.array([2, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
         # Indicies out of range should not fail.
         indices = np.array([-1, 0, 5])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
         indices = np.array([2, 0, 6])
-        op(ref, indices, updates).eval()
+        self.evaluate(op(ref, indices, updates))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
index 3f7e43b5335f37651914d95091094ddd4000e1b5..8af1b47e83c94ba117d4f4f9168da7b91b606dbf 100644
--- a/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
+++ b/tensorflow/python/kernel_tests/segment_reduction_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -118,7 +119,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
           for np_op1, np_op2, tf_op in curr_ops_list:
             np_ans = self._segmentReduce(indices, np_x, np_op1, np_op2)
             s = tf_op(data=tf_x, segment_ids=indices)
-            tf_ans = s.eval()
+            tf_ans = self.evaluate(s)
             self.assertAllClose(np_ans, tf_ans)
             # NOTE(mrry): The static shape inference that computes
             # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -126,6 +127,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
             # and may therefore vary dynamically.
             self.assertAllEqual(np_ans.shape[1:], tf_ans.shape[1:])
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsShape(self):
     shape = [4, 4]
     tf_x, _ = self._input(shape)
@@ -133,6 +135,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
     with self.assertRaises(ValueError):
       math_ops.segment_sum(data=tf_x, segment_ids=indices)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsSize(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
@@ -141,8 +144,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment_ids should be the same size"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsValid(self):
     # This is a baseline for the following SegmentIdsInvalid* tests.
     shape = [4, 4]
@@ -161,7 +165,7 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [1, 1, 2, 2]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testSegmentIdsHole(self):
@@ -172,9 +176,10 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 3, 3]
         np_ans = self._segmentReduce(indices, np_x, np.add)
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid1(self):
     shape = [4, 4]
     with self.cached_session():
@@ -184,8 +189,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id -1 out of range \[0, 1\), possibly because "
           "'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid2(self):
     shape = [4, 4]
     with self.cached_session():
@@ -193,8 +199,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       indices = [0, 1, 0, 1]
       s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
       with self.assertRaisesOpError("segment ids are not increasing"):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid3(self):
     shape = [4, 4]
     with self.cached_session():
@@ -204,8 +211,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
       with self.assertRaisesOpError(
           r"Segment id 1 out of range \[0, 1\), possibly "
           "because 'segment_ids' input is not sorted."):
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid4(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
@@ -214,8 +222,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 0, -1]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentIdsInvalid5(self):
     shape = [4, 4]
     for use_gpu in [True, False]:
@@ -224,8 +233,9 @@ class SegmentReductionOpTest(SegmentReductionHelper):
         indices = [0, 0, 0, -2]
         s = math_ops.segment_sum(data=tf_x, segment_ids=indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     shape = [4, 4]
     indices = [0, 1, 2, 2]
@@ -297,7 +307,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                   indices, np_x, np_op1, np_op2, num_segments=num_segments,
                   initial_value=init_op(dtype))
               s = tf_op(tf_x, segment_ids=indices, num_segments=num_segments)
-              tf_ans = s.eval()
+              tf_ans = self.evaluate(s)
               if dtype is dtypes_lib.bfloat16:
                 tf_ans = tf_ans.astype(np.float32)
               self.assertAllCloseAccordingToType(np_ans, tf_ans)
@@ -320,10 +330,11 @@ class UnsortedSegmentTest(SegmentReductionHelper):
               data=tf_x,
               segment_ids=indices,
               num_segments=num_segments_constant)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     num_cols = 2
     indices_flat = np.array([0, 4, 0, -1, 3, -1, 4, 7, 7, 3])
@@ -346,6 +357,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
                   delta=1)
             self.assertAllClose(jacob_t, jacob_n)
 
+  @test_util.run_deprecated_v1
   def testProdGrad(self):
     # additional test for the prod gradient to ensure correct handling of zeros
     values = np.array([0, 0, 1, 0, 2, 2, 3, 3, 3], dtype=np.float32)
@@ -370,6 +382,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
           self.assertAllClose(jacob_t, jacob_n)
           self.assertAllClose(jacob_t, grad_gt)
 
+  @test_util.run_deprecated_v1
   def testGradientMatchesSegmentSum(self):
     # Strategy: compute the gradient for UnsortedSegmentSum and SegmentSum
     # and compare the outputs, which should be identical.
@@ -403,6 +416,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
       self.assertAllClose(unsorted_jacob_t, sorted_jacob_t)
       self.assertAllClose(unsorted_jacob_n, sorted_jacob_n)
 
+  @test_util.run_deprecated_v1
   def testBadIndices(self):
     # Note: GPU kernel does not return the out-of-range error needed for this
     # test, so this test is marked as cpu-only.
@@ -412,8 +426,9 @@ class UnsortedSegmentTest(SegmentReductionHelper):
         unsorted = math_ops.unsorted_segment_sum([[17]], bad, num_segments=2)
         with self.assertRaisesOpError(
             r"segment_ids\[0,0\] = %d is out of range \[0, 2\)" % bad[0][0]):
-          unsorted.eval()
+          self.evaluate(unsorted)
 
+  @test_util.run_deprecated_v1
   def testEmptySecondDimension(self):
     dtypes = [np.float16, np.float32, np.float64, np.int64, np.int32,
               np.complex64, np.complex128]
@@ -443,7 +458,7 @@ class UnsortedSegmentTest(SegmentReductionHelper):
           np.place(indices, indices == 8, [-1])
           s = math_ops.unsorted_segment_sum(
               data=tf_x, segment_ids=indices, num_segments=num_segments)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, s)
 
@@ -499,7 +514,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
           np_ans = self._sparseSegmentReduce(np_x, np_indices, segment_indices,
                                              np_op1, np_op2)
           s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-          tf_ans = s.eval()
+          tf_ans = self.evaluate(s)
           self.assertAllClose(np_ans, tf_ans)
           # NOTE(mrry): The static shape inference that computes
           # `tf_ans.shape` can only infer that sizes from dimension 1
@@ -518,7 +533,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithNumSegments(self):
@@ -543,7 +558,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testWithEmptySegments(self):
@@ -562,7 +577,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np.zeros([5, 4]), tf_ans)
 
   def testSegmentIdsGreaterThanZero(self):
@@ -576,7 +591,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         np_ans = self._sparseSegmentReduce(np_x, tf_indices, segment_indices,
                                            np_op1, np_op2)
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        tf_ans = s.eval()
+        tf_ans = self.evaluate(s)
         self.assertAllClose(np_ans, tf_ans)
 
   def testValid(self):
@@ -588,8 +603,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testIndicesInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -600,8 +616,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[1\] == -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testIndicesInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -612,8 +629,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError(
             r"indices\[3\] == 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -623,8 +641,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids are not increasing"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid3(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -636,8 +655,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError(
             r"Segment id 1 out of range \[0, 1\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid4(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -649,8 +669,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
         with self.assertRaisesOpError(
             r"Segment id -1 out of range \[0, 2\), possibly because "
             "'segment_ids' input is not sorted"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid6(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -660,8 +681,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentsInvalid7(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [math_ops.sparse_segment_sum, math_ops.sparse_segment_mean]
@@ -671,7 +693,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(data=tf_x, indices=tf_indices, segment_ids=segment_indices)
         with self.assertRaisesOpError("segment ids must be >= 0"):
-          s.eval()
+          self.evaluate(s)
 
   def testSegmentWithNumSegmentsValid(self):
     # Baseline for the test*WithNumSegmentsInvalid* methods below.
@@ -690,8 +712,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             indices=tf_indices,
             segment_ids=segment_indices,
             num_segments=num_segments)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentWithNumSegmentsInvalid1(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -709,8 +732,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             segment_ids=segment_indices,
             num_segments=num_segments)
         with self.assertRaisesOpError("segment ids must be < num_segments"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testSegmentWithNumSegmentsInvalid2(self):
     tf_x, _ = self._input([10, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -730,6 +754,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
               segment_ids=segment_indices,
               num_segments=num_segments)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     shape = [10, 4]
 
@@ -748,6 +773,7 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
             delta=1)
       self.assertAllClose(jacob_t, jacob_n)
 
+  @test_util.run_deprecated_v1
   def testGradientWithEmptySegmentsAtEnd(self):
     shape = [10, 4]
 
@@ -785,8 +811,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
     with self.session(use_gpu=False):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
-        s.eval()
+        self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientIndicesInvalid1(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -798,8 +825,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index 10 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientIndicesInvalid2(self):
     tf_x, _ = self._input([3, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -811,8 +839,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Index -1 out of range \[0, 10\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid1(self):
     tf_x, _ = self._input(
         [3, 4], dtype=dtypes_lib.float32)  # expecting 3 segments
@@ -825,8 +854,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError("Invalid number of segments"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid2(self):
     tf_x, _ = self._input([1, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -838,8 +868,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 1 out of range \[0, 1\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid3(self):
     tf_x, _ = self._input([2, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -851,8 +882,9 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id -1 out of range \[0, 2\)"):
-          s.eval()
+          self.evaluate(s)
 
+  @test_util.run_deprecated_v1
   def testGradientSegmentsInvalid4(self):
     tf_x, _ = self._input([0, 4], dtype=dtypes_lib.float32)
     ops_list = [
@@ -864,7 +896,8 @@ class SparseSegmentReductionOpTest(SparseSegmentReductionHelper):
       for tf_op in ops_list:
         s = tf_op(tf_x, tf_indices, segment_indices, 10)
         with self.assertRaisesOpError(r"Segment id 0 out of range \[0, 0\)"):
-          s.eval()
+          self.evaluate(s)
+
 
 class SegmentReductionOpBenchmark(test.Benchmark):
   outer_dim_options = [2**x for x in range(9, 14, 2)]
diff --git a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
index 1b4aff8c9cae8c387a517c448bb1c7aee1ed6094..47b22ec29673f31c3216d4b4a39687a40bc95a95 100644
--- a/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
+++ b/tensorflow/python/kernel_tests/self_adjoint_eig_op_test.py
@@ -22,8 +22,9 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes as dtypes_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import gradient_checker
+from tensorflow.python.ops import gradient_checker_v2
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
@@ -39,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SelfAdjointEigTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testWrongDimensions(self):
     # The input to self_adjoint_eig should be a tensor of
     # at least rank 2.
@@ -49,6 +51,7 @@ class SelfAdjointEigTest(test.TestCase):
     with self.assertRaises(ValueError):
       linalg_ops.self_adjoint_eig(vector)
 
+  @test_util.run_deprecated_v1
   def testConcurrentExecutesWithoutError(self):
     all_ops = []
     with self.session(use_gpu=True) as sess:
@@ -63,7 +66,7 @@ class SelfAdjointEigTest(test.TestCase):
           e1 = linalg_ops.self_adjoint_eigvals(matrix1)
           e2 = linalg_ops.self_adjoint_eigvals(matrix2)
           all_ops += [e1, e2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       self.assertAllEqual(val[0], val[2])
       # The algorithm is slightly different for compute_v being True and False,
       # so require approximate equality only here.
@@ -81,7 +84,7 @@ class SelfAdjointEigTest(test.TestCase):
     self.assertEqual(matrix.shape, (32, 32))
     matrix_tensor = constant_op.constant(matrix)
     with self.session(use_gpu=True) as sess:
-      (e, v) = sess.run(linalg_ops.self_adjoint_eig(matrix_tensor))
+      (e, v) = self.evaluate(linalg_ops.self_adjoint_eig(matrix_tensor))
       self.assertEqual(e.size, 32)
       self.assertAllClose(
           np.matmul(v, v.transpose()), np.eye(32, dtype=np.float32), atol=2e-3)
@@ -161,15 +164,15 @@ def _GetSelfAdjointEigTest(dtype_, shape_, compute_v_):
             math_ops.matmul(tf_v, array_ops.matrix_diag(tf_e)),
             tf_v,
             adjoint_b=True)
-        self.assertAllClose(a_ev.eval(), a, atol=atol)
+        self.assertAllClose(self.evaluate(a_ev), a, atol=atol)
 
         # Compare to numpy.linalg.eigh.
-        CompareEigenDecompositions(self, np_e, np_v,
-                                   tf_e.eval(), tf_v.eval(), atol)
+        CompareEigenDecompositions(self, np_e, np_v, self.evaluate(tf_e),
+                                   self.evaluate(tf_v), atol)
       else:
         tf_e = linalg_ops.self_adjoint_eigvals(constant_op.constant(a))
         self.assertAllClose(
-            np.sort(np_e, -1), np.sort(tf_e.eval(), -1), atol=atol)
+            np.sort(np_e, -1), np.sort(self.evaluate(tf_e), -1), atol=atol)
 
   return Test
 
@@ -185,53 +188,51 @@ def _GetSelfAdjointEigGradTest(dtype_, shape_, compute_v_):
     n = shape_[-1]
     batch_shape = shape_[:-2]
     np_dtype = dtype_.as_numpy_dtype
-    a = np.random.uniform(
-        low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    if dtype_.is_complex:
-      a += 1j * np.random.uniform(
+
+    def RandomInput():
+      a = np.random.uniform(
           low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-    a += np.conj(a.T)
-    a = np.tile(a, batch_shape + (1, 1))
+      if dtype_.is_complex:
+        a += 1j * np.random.uniform(
+            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
+      a += np.conj(a.T)
+      a = np.tile(a, batch_shape + (1, 1))
+      return a
+
     # Optimal stepsize for central difference is O(epsilon^{1/3}).
     epsilon = np.finfo(np_dtype).eps
     delta = 0.1 * epsilon**(1.0 / 3.0)
     # tolerance obtained by looking at actual differences using
     # np.linalg.norm(theoretical-numerical, np.inf) on -mavx build
+    # after discarding one random input sample
+    _ = RandomInput()
     if dtype_ in (dtypes_lib.float32, dtypes_lib.complex64):
       tol = 1e-2
     else:
       tol = 1e-7
     with self.session(use_gpu=True):
-      tf_a = constant_op.constant(a)
-      if compute_v_:
-        tf_e, tf_v = linalg_ops.self_adjoint_eig(tf_a)
+      def Compute(x):
+        e, v = linalg_ops.self_adjoint_eig(x)
         # (complex) Eigenvectors are only unique up to an arbitrary phase
         # We normalize the vectors such that the first component has phase 0.
-        top_rows = tf_v[..., 0:1, :]
-        if tf_a.dtype.is_complex:
+        top_rows = v[..., 0:1, :]
+        if dtype_.is_complex:
           angle = -math_ops.angle(top_rows)
           phase = math_ops.complex(math_ops.cos(angle), math_ops.sin(angle))
         else:
           phase = math_ops.sign(top_rows)
-        tf_v *= phase
-        outputs = [tf_e, tf_v]
+        v *= phase
+        return e, v
+
+      if compute_v_:
+        funcs = [lambda x: Compute(x)[0], lambda x: Compute(x)[1]]
       else:
-        tf_e = linalg_ops.self_adjoint_eigvals(tf_a)
-        outputs = [tf_e]
-      for b in outputs:
-        x_init = np.random.uniform(
-            low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        if dtype_.is_complex:
-          x_init += 1j * np.random.uniform(
-              low=-1.0, high=1.0, size=n * n).reshape([n, n]).astype(np_dtype)
-        x_init += np.conj(x_init.T)
-        x_init = np.tile(x_init, batch_shape + (1, 1))
-        theoretical, numerical = gradient_checker.compute_gradient(
-            tf_a,
-            tf_a.get_shape().as_list(),
-            b,
-            b.get_shape().as_list(),
-            x_init_value=x_init,
+        funcs = [linalg_ops.self_adjoint_eigvals]
+
+      for f in funcs:
+        theoretical, numerical = gradient_checker_v2.compute_gradient(
+            f,
+            [RandomInput()],
             delta=delta)
         self.assertAllClose(theoretical, numerical, atol=tol, rtol=tol)
 
@@ -245,7 +246,7 @@ if __name__ == "__main__":
       for size in 1, 2, 5, 10:
         for batch_dims in [(), (3,)] + [(3, 2)] * (max(size, size) < 10):
           shape = batch_dims + (size, size)
-          name = "%s_%s_%s" % (dtype, "_".join(map(str, shape)), compute_v)
+          name = "%s_%s_%s" % (dtype.name, "_".join(map(str, shape)), compute_v)
           _AddTest(SelfAdjointEigTest, "SelfAdjointEig", name,
                    _GetSelfAdjointEigTest(dtype, shape, compute_v))
           _AddTest(SelfAdjointEigGradTest, "SelfAdjointEigGrad", name,
diff --git a/tensorflow/python/kernel_tests/session_ops_test.py b/tensorflow/python/kernel_tests/session_ops_test.py
index 03e1ae852fc5b4ce4297b70b37964310f02306e5..bc5d8e81511494ea82bbf703544ec36448b5e982 100644
--- a/tensorflow/python/kernel_tests/session_ops_test.py
+++ b/tensorflow/python/kernel_tests/session_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import session_ops
@@ -28,6 +29,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 
 
+@test_util.run_v1_only("b/120545219")
 class SessionOpsTest(test.TestCase):
 
   def testHandleBasic(self):
@@ -37,7 +39,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -51,7 +53,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Get the tensor from its handle.
       self.assertEqual(50, h.eval())
@@ -64,7 +66,7 @@ class SessionOpsTest(test.TestCase):
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
       v = math_ops.multiply(a, c)
-      h, v = sess.run([h, v])
+      h, v = self.evaluate([h, v])
 
       self.assertEqual(50, h.eval())
       self.assertEqual(500, v)
@@ -77,7 +79,7 @@ class SessionOpsTest(test.TestCase):
       p = math_ops.less(a, b)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      p, h = sess.run([p, h])
+      p, h = self.evaluate([p, h])
 
       # Run by feeding a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -94,7 +96,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -111,7 +113,7 @@ class SessionOpsTest(test.TestCase):
       # Initialize a handle.
       a = constant_op.constant(0)
       h = session_ops.get_session_handle(a)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Do some computation.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -133,7 +135,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Feed a tensor handle.
       f, x = session_ops.get_session_tensor(h.handle, dtypes.int32)
@@ -144,7 +146,7 @@ class SessionOpsTest(test.TestCase):
       with ops.device(test.gpu_device_name()):
         a = constant_op.constant(10)
         h = session_ops.get_session_handle(a)
-        h = sess.run(h)
+        h = self.evaluate(h)
         self.assertEqual(100, sess.run(y, feed_dict={f: h.handle}))
 
   def testHandleDelete(self):
@@ -154,7 +156,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      sess.run(h).delete()
+      self.evaluate(h).delete()
 
   def testHandleDeleteRaw(self):
     with self.cached_session() as sess:
@@ -163,7 +165,7 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(5)
       c = math_ops.multiply(a, b)
       h = session_ops.get_session_handle(c)
-      h = sess.run(h)
+      h = self.evaluate(h)
 
       # Delete using a raw tensor handle.
       raw_h = h.get_raw_handle()
@@ -174,10 +176,10 @@ class SessionOpsTest(test.TestCase):
     with self.cached_session() as sess:
       with ops.device(test.gpu_device_name()):
         a = constant_op.constant(1.0)
-        a_handle = sess.run(session_ops.get_session_handle(a))
+        a_handle = self.evaluate(session_ops.get_session_handle(a))
       with ops.device("/cpu:0"):
         b = constant_op.constant(2.0)
-        b_handle = sess.run(session_ops.get_session_handle(b))
+        b_handle = self.evaluate(session_ops.get_session_handle(b))
 
       a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32)
       b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32)
@@ -193,8 +195,8 @@ class SessionOpsTest(test.TestCase):
       # initial values live on CPU
       with ops.device("/cpu:0"):
         one = constant_op.constant(1, dtype=dtypes.float32)
-        one_handle = sess.run(session_ops.get_session_handle(one))
-        x_handle = sess.run(session_ops.get_session_handle(one))
+        one_handle = self.evaluate(session_ops.get_session_handle(one))
+        x_handle = self.evaluate(session_ops.get_session_handle(one))
 
       # addition lives on GPU
       with ops.device(test.gpu_device_name()):
@@ -219,8 +221,8 @@ class SessionOpsTest(test.TestCase):
       b = constant_op.constant(2.0)
       b_handle_op = session_ops.get_session_handle(b)
 
-      a_handle = sess.run(a_handle_op)
-      b_handle = sess.run(b_handle_op)
+      a_handle = self.evaluate(a_handle_op)
+      b_handle = self.evaluate(b_handle_op)
 
       a_p, a_t = session_ops.get_session_tensor(a_handle.handle, dtypes.float32)
       b_p, b_t = session_ops.get_session_tensor(b_handle.handle, dtypes.float32)
@@ -232,6 +234,7 @@ class SessionOpsTest(test.TestCase):
                      b_p: b_handle.handle})
       self.assertEqual(3.0, c_handle.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testFeedOneHandleDirectly(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
@@ -239,16 +242,17 @@ class SessionOpsTest(test.TestCase):
       c = math_ops.multiply(a, b)
       d = math_ops.multiply(c, c)
 
-      h_c = sess.run(session_ops.get_session_handle(c))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
 
       self.assertAllClose(2500.0, sess.run(d, feed_dict={c: h_c}))
 
+  @test_util.run_v1_only("b/120545219")
   def testDirectHandleFeedOverlappingWithFetches(self):
     with self.cached_session() as sess:
       a = constant_op.constant(10.0)
       b = constant_op.constant(5.0)
       c = math_ops.multiply(a, b)
-      h_c = sess.run(session_ops.get_session_handle(c))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
       d = array_ops.identity(c)
 
       c_val = sess.run(c, feed_dict={c: h_c})
@@ -277,24 +281,25 @@ class SessionOpsTest(test.TestCase):
       d = math_ops.div(a, b)
       e = math_ops.subtract(c, d)
 
-      h_c = sess.run(session_ops.get_session_handle(c))
-      h_d = sess.run(session_ops.get_session_handle(d))
+      h_c = self.evaluate(session_ops.get_session_handle(c))
+      h_d = self.evaluate(session_ops.get_session_handle(d))
 
       self.assertAllClose(48.0, sess.run(e, feed_dict={c: h_c, d: h_d}))
       self.assertAllClose(-48.0, sess.run(e, feed_dict={c: h_d, d: h_c}))
 
+  @test_util.run_v1_only("b/120545219")
   def testFeedHandleToVariableDirectly(self):
     with self.cached_session() as sess:
       a = variables.Variable(12.0)
       inc_a = state_ops.assign_add(a, 2.0)
       b = math_ops.add(a, 5.0)
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
 
       h_a_read = sess.run(session_ops.get_session_handle(a.read_value()))
-      self.assertAllClose(12.0, sess.run(a))
+      self.assertAllClose(12.0, self.evaluate(a))
 
       self.assertAllClose(17.0, sess.run(b, feed_dict={a: h_a_read}))
-      sess.run(inc_a)
+      self.evaluate(inc_a)
       self.assertAllClose(19.0, sess.run(b, feed_dict={a: h_a_read}))
 
 
diff --git a/tensorflow/python/kernel_tests/sets_test.py b/tensorflow/python/kernel_tests/sets_test.py
index 8335e9c139a581a22e06bd2fbfc5c027956d1714..b4f232293482b08b31fefa0f3b2a61ba115d1c47 100644
--- a/tensorflow/python/kernel_tests/sets_test.py
+++ b/tensorflow/python/kernel_tests/sets_test.py
@@ -70,6 +70,7 @@ def _dense_to_sparse(dense, dtype):
 
 class SetOpsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def test_set_size_2d(self):
     for dtype in _DTYPES:
       self._test_set_size_2d(dtype)
@@ -83,6 +84,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(
         [0, 3], self._set_size(_dense_to_sparse([[], [1, 9, 2]], dtype)))
 
+  @test_util.run_deprecated_v1
   def test_set_size_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_size_duplicates_2d(dtype)
@@ -96,6 +98,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
                                 6, 7, 8, 8, 6, 7, 5, 3, 3, 0, 6, 6, 9, 0, 0, 0
                             ], [999, 1, -1000], [], [-1]], dtype)))
 
+  @test_util.run_deprecated_v1
   def test_set_size_3d(self):
     for dtype in _DTYPES:
       self._test_set_size_3d(dtype)
@@ -159,10 +162,11 @@ class SetOpsTest(test_util.TensorFlowTestCase):
       self.assertEqual(None, op.get_shape().dims)
       self.assertEqual(dtypes.int32, op.dtype)
     with self.cached_session() as sess:
-      results = sess.run(ops)
+      results = self.evaluate(ops)
     self.assertAllEqual(results[0], results[1])
     return results[0]
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_multirow_2d(dtype)
@@ -199,6 +203,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_intersection_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_intersection_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_intersection_multirow_2d(dtype)
@@ -223,6 +228,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_intersection_count(a, b))
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_duplicates_2d(dtype)
@@ -270,6 +276,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_intersection_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_set_intersection_3d(self):
     for dtype in _DTYPES:
       self._test_set_intersection_3d(dtype=dtype)
@@ -534,8 +541,9 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_intersection_count(self, a, b):
     op = sets.set_size(sets.set_intersection(a, b))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def test_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_difference_multirow_2d(dtype)
@@ -604,6 +612,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(sp_a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_difference_multirow_2d(dtype)
@@ -647,6 +656,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(a, b, False))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_difference_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_difference_multirow_2d(dtype)
@@ -688,6 +698,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(sp_a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_set_difference_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_difference_duplicates_2d(dtype)
@@ -755,6 +766,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(expected_counts,
                         self._set_difference_count(a, sp_b, False))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_difference_3d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_difference_3d(dtype)
@@ -972,8 +984,9 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_difference_count(self, a, b, aminusb=True):
     op = sets.set_size(sets.set_difference(a, b, aminusb))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
+  @test_util.run_deprecated_v1
   def test_set_union_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_set_union_multirow_2d(dtype)
@@ -1001,6 +1014,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_union_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_dense_set_union_multirow_2d(self):
     for dtype in _DTYPES:
       self._test_dense_set_union_multirow_2d(dtype)
@@ -1021,6 +1035,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual(expected_counts, self._set_union_count(a, b))
 
+  @test_util.run_deprecated_v1
   def test_set_union_duplicates_2d(self):
     for dtype in _DTYPES:
       self._test_set_union_duplicates_2d(dtype)
@@ -1047,6 +1062,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
         expected_indices, expected_values, expected_shape, union, dtype=dtype)
     self.assertAllEqual([2], self._set_union_count(sp_a, sp_b))
 
+  @test_util.run_deprecated_v1
   def test_sparse_set_union_3d(self):
     for dtype in _DTYPES:
       self._test_sparse_set_union_3d(dtype)
@@ -1221,7 +1237,7 @@ class SetOpsTest(test_util.TensorFlowTestCase):
   def _set_union_count(self, a, b):
     op = sets.set_size(sets.set_union(a, b))
     with self.cached_session() as sess:
-      return sess.run(op)
+      return self.evaluate(op)
 
   def _assert_set_operation(self, expected_indices, expected_values,
                             expected_shape, sparse_tensor_value, dtype):
diff --git a/tensorflow/python/kernel_tests/shape_ops_test.py b/tensorflow/python/kernel_tests/shape_ops_test.py
index ee813e5ffd91d6a83e665dfb013c8a082ed2ad32..c8e7c143ade2ca740833ea5f9bd18ab5c7b4a2e6 100644
--- a/tensorflow/python/kernel_tests/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/shape_ops_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
@@ -53,8 +54,8 @@ class ShapeOpsTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x)
       tf_ans_64 = array_ops.shape(x, out_type=dtypes.int64)
-      result = tf_ans.eval()
-      result_64 = tf_ans_64.eval()
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -64,7 +65,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.shape(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -73,8 +74,8 @@ class ShapeOpsTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       tf_ans = array_ops.shape_n([x, x, x])
       tf_ans_64 = array_ops.shape_n([x, x, x], out_type=dtypes.int64)
-      result = sess.run(tf_ans)
-      result_64 = sess.run(tf_ans_64)
+      result = self.evaluate(tf_ans)
+      result_64 = self.evaluate(tf_ans_64)
     for i in range(3):
       self.assertAllEqual(np_ans, result[i])
       self.assertAllEqual(np_ans, result_64[i])
@@ -84,7 +85,7 @@ class ShapeOpsTest(test.TestCase):
     np_ans = np.asarray(np.ndim(x))
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -93,7 +94,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.rank(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -101,9 +102,9 @@ class ShapeOpsTest(test.TestCase):
     np_ans = np.asarray(np.size(x))
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
       tf_ans_64 = array_ops.size(x, out_type=dtypes.int64)
-      result_64 = tf_ans_64.eval()
+      result_64 = self.evaluate(tf_ans_64)
     self.assertAllEqual(np_ans, result)
     self.assertAllEqual(np_ans, result_64)
     self.assertShapeEqual(np_ans, tf_ans)
@@ -113,7 +114,7 @@ class ShapeOpsTest(test.TestCase):
     x_tf, unused_nnz = _sparsify(x_np)
     with self.cached_session(use_gpu=use_gpu):
       tf_ans = array_ops.size(x_tf)
-      result = tf_ans.eval()
+      result = self.evaluate(tf_ans)
     self.assertAllEqual(np_ans, result)
     self.assertShapeEqual(np_ans, tf_ans)
 
@@ -162,7 +163,7 @@ class ShapeOpsTest(test.TestCase):
       inp = array_ops.zeros([2**31])
       num_elements = array_ops.size_internal(
           inp, optimize=False, out_type=dtypes.int64)
-      self.assertEqual(2**31, num_elements.eval())
+      self.assertEqual(2**31, self.evaluate(num_elements))
 
     # Too large for tf.int32 output.
     with self.assertRaises(errors_impl.InvalidArgumentError):
@@ -170,13 +171,13 @@ class ShapeOpsTest(test.TestCase):
         inp = array_ops.zeros([2**31])
         num_elements = array_ops.size_internal(
             inp, optimize=False, out_type=dtypes.int32)
-        self.assertEqual(2**31, num_elements.eval())
+        self.assertEqual(2**31, self.evaluate(num_elements))
 
   def _compareExpandDims(self, x, dim, use_gpu):
     np_ans = np.expand_dims(x, axis=dim)
     with self.cached_session(use_gpu=use_gpu):
       tensor = array_ops.expand_dims(x, dim)
-      tf_ans = tensor.eval()
+      tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -227,6 +228,7 @@ class ShapeOpsTest(test.TestCase):
     self._compareExpandDimsAll(choice([2, 3, 5]), -3)
     self._compareExpandDimsAll(choice([2, 3, 5]), -4)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsErrors(self):
     with self.cached_session():
       self.assertRaises(ValueError, array_ops.expand_dims,
@@ -238,6 +240,7 @@ class ShapeOpsTest(test.TestCase):
       self.assertRaises(ValueError, array_ops.expand_dims,
                         [False, True, True], 4)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsGradient(self):
     with self.cached_session():
       inp = constant_op.constant(
@@ -248,6 +251,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 1, 2])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testExpandDimsScalar(self):
     with self.cached_session():
       inp = constant_op.constant(7)
@@ -264,7 +268,7 @@ class ShapeOpsTest(test.TestCase):
       np_ans = np.expand_dims(x, axis=0)
       with self.cached_session(use_gpu=True):
         tensor = array_ops.expand_dims(x, constant_op.constant(0, dtype))
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       self.assertShapeEqual(np_ans, tensor)
       self.assertAllEqual(np_ans, tf_ans)
 
@@ -273,11 +277,11 @@ class ShapeOpsTest(test.TestCase):
       if squeeze_dims:
         np_ans = np.squeeze(x, axis=tuple(squeeze_dims))
         tensor = array_ops.squeeze(x, squeeze_dims)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
       else:
         np_ans = np.squeeze(x)
         tensor = array_ops.squeeze(x)
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
     self.assertShapeEqual(np_ans, tensor)
     self.assertAllEqual(np_ans, tf_ans)
 
@@ -340,7 +344,7 @@ class ShapeOpsTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze(np.zeros([1, 1, 1]), [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
   def testSqueezeAllOnesBool(self):
@@ -350,9 +354,10 @@ class ShapeOpsTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         tensor = array_ops.squeeze([[[False]]], [])
         self.assertEqual(np.shape(1), tensor.get_shape())
-        tf_ans = tensor.eval()
+        tf_ans = self.evaluate(tensor)
         self.assertEqual(np.shape(1), tf_ans.shape)
 
+  @test_util.run_deprecated_v1
   def testSqueezeOnlyOnes(self):
     for use_gpu in [False, True]:
       with self.cached_session(use_gpu=use_gpu):
@@ -362,6 +367,7 @@ class ShapeOpsTest(test.TestCase):
         self._compareSqueezeAll(input_1x1x3, [1])
         self.assertRaises(ValueError, array_ops.squeeze, input_1x1x3, [2])
 
+  @test_util.run_deprecated_v1
   def testSqueezeErrors(self):
     for use_gpu in [False, True]:
       with self.cached_session(use_gpu=use_gpu):
@@ -374,6 +380,7 @@ class ShapeOpsTest(test.TestCase):
         self.assertRaises(ValueError, array_ops.squeeze,
                           np.zeros([1, 2, 1]), [2, 3])
 
+  @test_util.run_deprecated_v1
   def testSqueezeGradient(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -384,6 +391,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 2])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testSqueezeGradientWithSqueezeDims(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -394,6 +402,7 @@ class ShapeOpsTest(test.TestCase):
                                                     [4, 2, 1])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testSqueezeWithUnknownShape(self):
     with self.cached_session():
       a = array_ops.placeholder(dtypes.float32, shape=[2, None])
@@ -415,7 +424,7 @@ class TileTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu):
         a = constant_op.constant(7, shape=[], dtype=dtypes.float32)
         tiled = array_ops.tile(a, [])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, ())
       self.assertEqual([], tiled.get_shape())
       self.assertEqual(7, result)
@@ -427,7 +436,7 @@ class TileTest(test.TestCase):
         inp = np.random.rand(4, 1).astype(np.float32)
         a = constant_op.constant(inp)
         tiled = array_ops.tile(a, constant_op.constant([1, 4], dtype=dtype))
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertTrue((result == np.tile(inp, (1, 4))).all())
@@ -437,7 +446,7 @@ class TileTest(test.TestCase):
       inp = np.random.rand(4, 1).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [1, 1])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (4, 1))
     self.assertEqual([4, 1], tiled.get_shape())
     self.assertTrue((result == np.tile(inp, (1, 1))).all())
@@ -447,10 +456,11 @@ class TileTest(test.TestCase):
       inp = np.random.rand(2, 3).astype(np.float32)
       a = constant_op.constant(inp)
       tiled = array_ops.tile(a, [5, 0])
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertEqual(result.shape, (10, 0))
     self.assertEqual([10, 0], tiled.get_shape())
 
+  @test_util.run_deprecated_v1
   def testUnknownInputShape(self):
     """Importing can call _TileShape without shape of <multiples> known."""
     with self.cached_session():
@@ -497,11 +507,12 @@ class TileTest(test.TestCase):
             shape=[4, 1],
             dtype=dtype_tf)
         tiled = array_ops.tile(a, [1, 4])
-        result = tiled.eval()
+        result = self.evaluate(tiled)
       self.assertEqual(result.shape, (4, 4))
       self.assertEqual([4, 4], tiled.get_shape())
       self.assertAllEqual(result, np.tile(inp, (1, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidDim(self):
     with self.cached_session():
       inp = np.random.rand(4, 1).astype("f")
@@ -527,7 +538,7 @@ class TileTest(test.TestCase):
           dtype=dtypes.float32)
       multiples = np.random.randint(1, 4, size=rank).astype(np.int32)
       tiled = array_ops.tile(a, multiples)
-      result = tiled.eval()
+      result = self.evaluate(tiled)
     self.assertTrue((np.array(multiples) * np.array(inp.shape) == np.array(
         result.shape)).all())
     self.assertAllEqual(result, np.tile(inp, tuple(multiples)))
@@ -545,6 +556,7 @@ class TileTest(test.TestCase):
     for _ in range(5):
       self._RunAndVerifyResult(10, use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradientSimpleReduction(self):
     with self.cached_session():
       inp = np.random.rand(4, 1).astype("f")
@@ -557,9 +569,10 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReduction(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -572,13 +585,14 @@ class TileTest(test.TestCase):
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
       self.assertShapeEqual(inp, grad)
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
     expected[:, 1] = grad_inp[:, 1] + grad_inp[:, 3]
     self.assertTrue((np.abs(expected - result) < 1e-3).all())
 
+  @test_util.run_deprecated_v1
   def testGradientSimpleReductionOnGPU(self):
     with self.session(use_gpu=True):
       inp = np.random.rand(4, 1).astype("f")
@@ -590,9 +604,10 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     self.assertAllClose(np.sum(grad_inp, axis=1).reshape(4, 1), result, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReductionOnGPU(self):
     with self.session(use_gpu=True):
       inp = np.random.rand(4, 2).astype("f")
@@ -604,7 +619,7 @@ class TileTest(test.TestCase):
       grad_tensor = constant_op.constant(
           [float(x) for x in grad_inp.flatten()], shape=grad_shape)
       grad = gradients_impl.gradients([tiled], [a], [grad_tensor])[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     expected_shape = [4, 2]
     expected = np.zeros(expected_shape)
     expected[:, 0] = grad_inp[:, 0] + grad_inp[:, 2]
@@ -624,15 +639,18 @@ class TileTest(test.TestCase):
       print("tile(float) error = ", err)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientRandomScalar(self):
     self._RunAndVerifyGradientResult([], [])
 
+  @test_util.run_deprecated_v1
   def testGradientRandom(self):
     self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 1, 1, 1, 1])
     self._RunAndVerifyGradientResult([2, 2, 1, 1, 3], [1, 2, 1, 3, 1])
     self._RunAndVerifyGradientResult([2, 3, 1, 1, 3], [3, 1, 1, 2, 2])
     self._RunAndVerifyGradientResult([2, 1, 3, 3, 2], [1, 3, 3, 1, 2])
 
+  @test_util.run_deprecated_v1
   def testGradientStridedReductionGC(self):
     with self.cached_session():
       inp = np.random.rand(4, 2).astype("f")
@@ -642,6 +660,7 @@ class TileTest(test.TestCase):
       err = gradient_checker.compute_gradient_error(a, [4, 2], tiled, [4, 4])
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSparseGradWithRank1(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0],
                                   dtype=dtypes.float32)
@@ -653,6 +672,7 @@ class TileTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithSparseGradWithRank3(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0],
                                   dtype=dtypes.float32)
@@ -665,6 +685,7 @@ class TileTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # Unknown multiples shape.
     inp = constant_op.constant(0.0, shape=[4, 4, 4, 4])
diff --git a/tensorflow/python/kernel_tests/signal/BUILD b/tensorflow/python/kernel_tests/signal/BUILD
index 56d4d46d462726e8e6448f2ee2248303257b6e4b..8f4e31abe3c90af01029be719ee83c7c7dc42f0c 100644
--- a/tensorflow/python/kernel_tests/signal/BUILD
+++ b/tensorflow/python/kernel_tests/signal/BUILD
@@ -18,6 +18,35 @@ py_library(
     ],
 )
 
+cuda_py_tests(
+    name = "dct_ops_test",
+    srcs = ["dct_ops_test.py"],
+    additional_deps = [
+        "@absl_py//absl/testing:parameterized",
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+)
+
+cuda_py_tests(
+    name = "fft_ops_test",
+    size = "medium",
+    srcs = ["fft_ops_test.py"],
+    additional_deps = [
+        "//third_party/py/numpy",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
+    ],
+    shard_count = 4,
+    tags = ["optonly"],
+)
+
 cuda_py_tests(
     name = "mel_ops_test",
     srcs = ["mel_ops_test.py"],
@@ -91,9 +120,9 @@ cuda_py_tests(
         "//tensorflow/python:framework",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_test_lib",
-        "//tensorflow/python/ops/signal",
         "//tensorflow/python:platform_test",
         "//tensorflow/python:spectral_ops_test_util",
+        "//tensorflow/python/ops/signal",
     ],
     tags = ["nomac"],
 )
diff --git a/tensorflow/python/kernel_tests/dct_ops_test.py b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
similarity index 67%
rename from tensorflow/python/kernel_tests/dct_ops_test.py
rename to tensorflow/python/kernel_tests/signal/dct_ops_test.py
index c9d0167608ed0447d1a0fcdcc8e054fd8c1fe863..a3ac15bab8a7b8223bd1ea085386b965b7fdd62e 100644
--- a/tensorflow/python/kernel_tests/dct_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/dct_ops_test.py
@@ -20,10 +20,12 @@ from __future__ import print_function
 
 import importlib
 
+from absl.testing import parameterized
 import numpy as np
 
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import dct_ops
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 
@@ -40,6 +42,20 @@ def try_import(name):  # pylint: disable=invalid-name
 fftpack = try_import("scipy.fftpack")
 
 
+def _np_dct1(signals, norm=None):
+  """Computes the DCT-I manually with NumPy."""
+  # X_k = (x_0 + (-1)**k * x_{N-1} +
+  #       2 * sum_{n=0}^{N-2} x_n * cos(\frac{pi}{N-1} * n * k)  k=0,...,N-1
+  del norm
+  dct_size = signals.shape[-1]
+  dct = np.zeros_like(signals)
+  for k in range(dct_size):
+    phi = np.cos(np.pi * np.arange(1, dct_size - 1) * k / (dct_size - 1))
+    dct[..., k] = 2 * np.sum(signals[..., 1:-1] * phi, axis=-1) + (
+        signals[..., 0] + (-1) ** k * signals[..., -1])
+  return dct
+
+
 def _np_dct2(signals, norm=None):
   """Computes the DCT-II manually with NumPy."""
   # X_k = sum_{n=0}^{N-1} x_n * cos(\frac{pi}{N} * (n + 0.5) * k)  k=0,...,N-1
@@ -81,19 +97,19 @@ def _np_dct3(signals, norm=None):
   return dct
 
 
-NP_DCT = {2: _np_dct2, 3: _np_dct3}
-NP_IDCT = {2: _np_dct3, 3: _np_dct2}
+NP_DCT = {1: _np_dct1, 2: _np_dct2, 3: _np_dct3}
+NP_IDCT = {1: _np_dct1, 2: _np_dct3, 3: _np_dct2}
 
 
-class DCTOpsTest(test.TestCase):
+class DCTOpsTest(parameterized.TestCase, test.TestCase):
 
   def _compare(self, signals, norm, dct_type, atol=5e-4, rtol=5e-4):
     """Compares (I)DCT to SciPy (if available) and a NumPy implementation."""
     np_dct = NP_DCT[dct_type](signals, norm)
-    tf_dct = spectral_ops.dct(signals, type=dct_type, norm=norm).eval()
+    tf_dct = dct_ops.dct(signals, type=dct_type, norm=norm).eval()
     self.assertAllClose(np_dct, tf_dct, atol=atol, rtol=rtol)
     np_idct = NP_IDCT[dct_type](signals, norm)
-    tf_idct = spectral_ops.idct(signals, type=dct_type, norm=norm).eval()
+    tf_idct = dct_ops.idct(signals, type=dct_type, norm=norm).eval()
     self.assertAllClose(np_idct, tf_idct, atol=atol, rtol=rtol)
     if fftpack:
       scipy_dct = fftpack.dct(signals, type=dct_type, norm=norm)
@@ -101,38 +117,52 @@ class DCTOpsTest(test.TestCase):
       scipy_idct = fftpack.idct(signals, type=dct_type, norm=norm)
       self.assertAllClose(scipy_idct, tf_idct, atol=atol, rtol=rtol)
     # Verify inverse(forward(s)) == s, up to a normalization factor.
-    tf_idct_dct = spectral_ops.idct(
+    tf_idct_dct = dct_ops.idct(
         tf_dct, type=dct_type, norm=norm).eval()
-    tf_dct_idct = spectral_ops.dct(
+    tf_dct_idct = dct_ops.dct(
         tf_idct, type=dct_type, norm=norm).eval()
     if norm is None:
-      tf_idct_dct *= 0.5 / signals.shape[-1]
-      tf_dct_idct *= 0.5 / signals.shape[-1]
+      if dct_type == 1:
+        tf_idct_dct *= 0.5 / (signals.shape[-1] - 1)
+        tf_dct_idct *= 0.5 / (signals.shape[-1] - 1)
+      else:
+        tf_idct_dct *= 0.5 / signals.shape[-1]
+        tf_dct_idct *= 0.5 / signals.shape[-1]
     self.assertAllClose(signals, tf_idct_dct, atol=atol, rtol=rtol)
     self.assertAllClose(signals, tf_dct_idct, atol=atol, rtol=rtol)
 
-  def test_random(self):
+  @parameterized.parameters([
+      [[2]], [[3]], [[10]], [[2, 20]], [[2, 3, 25]]])
+  @test_util.run_deprecated_v1
+  def test_random(self, shape):
     """Test randomly generated batches of data."""
     with spectral_ops_test_util.fft_kernel_label_map():
       with self.session(use_gpu=True):
-        for shape in ([1], [2], [3], [10], [2, 20], [2, 3, 25]):
-          signals = np.random.rand(*shape).astype(np.float32)
-          for norm in (None, "ortho"):
-            self._compare(signals, norm, 2)
-            self._compare(signals, norm, 3)
+        signals = np.random.rand(*shape).astype(np.float32)
+        # Normalization not implemented for orthonormal.
+        self._compare(signals, norm=None, dct_type=1)
+        for norm in (None, "ortho"):
+          self._compare(signals, norm, 2)
+          self._compare(signals, norm, 3)
 
   def test_error(self):
     signals = np.random.rand(10)
     # Unsupported type.
     with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, type=1)
+      dct_ops.dct(signals, type=5)
+    # DCT-I normalization not implemented.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(signals, type=1, norm="ortho")
+    # DCT-I requires at least two inputs.
+    with self.assertRaises(ValueError):
+      dct_ops.dct(np.random.rand(1), type=1)
     # Unknown normalization.
     with self.assertRaises(ValueError):
-      spectral_ops.dct(signals, norm="bad")
+      dct_ops.dct(signals, norm="bad")
     with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, n=10)
+      dct_ops.dct(signals, n=10)
     with self.assertRaises(NotImplementedError):
-      spectral_ops.dct(signals, axis=0)
+      dct_ops.dct(signals, axis=0)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/fft_ops_test.py b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
similarity index 96%
rename from tensorflow/python/kernel_tests/fft_ops_test.py
rename to tensorflow/python/kernel_tests/signal/fft_ops_test.py
index 8592550f99a8da997de5d8abd4dee0ca541259db..5b1053428c0096c15fce7c4fa7b46d5999602057 100644
--- a/tensorflow/python/kernel_tests/fft_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/fft_ops_test.py
@@ -25,12 +25,13 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_spectral_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
 from tensorflow.python.ops import spectral_ops_test_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.platform import test
 
 VALID_FFT_RANKS = (1, 2, 3)
@@ -139,24 +140,25 @@ class FFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.fft
+      return fft_ops.fft
     elif rank == 2:
-      return spectral_ops.fft2d
+      return fft_ops.fft2d
     elif rank == 3:
-      return spectral_ops.fft3d
+      return fft_ops.fft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.ifft
+      return fft_ops.ifft
     elif rank == 2:
-      return spectral_ops.ifft2d
+      return fft_ops.ifft2d
     elif rank == 3:
-      return spectral_ops.ifft3d
+      return fft_ops.ifft3d
     else:
       raise ValueError("invalid rank")
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -166,6 +168,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self.assertEqual(x.shape, self._tfFFT(x, rank).shape)
             self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -194,6 +197,7 @@ class FFTOpsTest(BaseFFTOpsTest):
   #           np.mod(np.arange(np.power(128, dims)), 64).reshape(
   #               (128,) * dims).astype(np.complex64), rank)
 
+  @test_util.run_deprecated_v1
   def testBasicPlaceholder(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 1e-8)):
@@ -204,6 +208,7 @@ class FFTOpsTest(BaseFFTOpsTest):
                     (4,) * dims).astype(np_type),
                 rank, use_placeholder=True, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.complex64, 1e-4), (np.complex128, 5e-6)):
@@ -218,6 +223,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self._compare(gen((4,) * dims).astype(np_type), rank,
                           rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testRandom1D(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type in (np.complex64, np.complex128):
@@ -240,6 +246,7 @@ class FFTOpsTest(BaseFFTOpsTest):
         for dim in (127, 255, 511, 1023):
           self._compare(gen((dim,)).astype(np_type), 1, rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testError(self):
     for rank in VALID_FFT_RANKS:
       for dims in xrange(0, rank):
@@ -251,6 +258,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             ValueError, "Shape must be .*rank {}.*".format(rank)):
           self._tfIFFT(x, rank)
 
+  @test_util.run_deprecated_v1
   def testGrad_Simple(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.float32, 1e-4), (np.float64, 1e-10)):
@@ -263,6 +271,7 @@ class FFTOpsTest(BaseFFTOpsTest):
             self._checkGradComplex(self._tfIFFTForRank(rank), re, im,
                                    rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testGrad_Random(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for np_type, tol in ((np.float32, 1e-2), (np.float64, 1e-10)):
@@ -312,24 +321,25 @@ class RFFTOpsTest(BaseFFTOpsTest):
 
   def _tfFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.rfft
+      return fft_ops.rfft
     elif rank == 2:
-      return spectral_ops.rfft2d
+      return fft_ops.rfft2d
     elif rank == 3:
-      return spectral_ops.rfft3d
+      return fft_ops.rfft3d
     else:
       raise ValueError("invalid rank")
 
   def _tfIFFTForRank(self, rank):
     if rank == 1:
-      return spectral_ops.irfft
+      return fft_ops.irfft
     elif rank == 2:
-      return spectral_ops.irfft2d
+      return fft_ops.irfft2d
     elif rank == 3:
-      return spectral_ops.irfft3d
+      return fft_ops.irfft3d
     else:
       raise ValueError("invalid rank")
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -339,6 +349,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           x = np.zeros((0,) * dims).astype(np.complex64)
           self.assertEqual(x.shape, self._tfIFFT(x, rank).shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -366,6 +377,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                        10).reshape((size,) * (dims - 1) + (inner_dim,))
           self._compareBackward(c2r.astype(np.complex64), rank, (size,) * rank)
 
+  @test_util.run_deprecated_v1
   def testBasicPlaceholder(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -427,6 +439,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
                   fft_length,
                   use_placeholder=True)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       def gen_real(shape):
@@ -451,6 +464,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._compareBackward(
                 gen_complex(complex_dims), rank, (size,) * rank)
 
+  @test_util.run_deprecated_v1
   def testError(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -507,6 +521,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
           with self.cached_session():
             irfft_fn(x, fft_length).eval()
 
+  @test_util.run_deprecated_v1
   def testGrad_Simple(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
@@ -521,6 +536,7 @@ class RFFTOpsTest(BaseFFTOpsTest):
             self._checkGradComplex(
                 self._tfIFFTForRank(rank), re, im, result_is_complex=False)
 
+  @test_util.run_deprecated_v1
   def testGrad_Random(self):
     with spectral_ops_test_util.fft_kernel_label_map():
       for rank in VALID_FFT_RANKS:
diff --git a/tensorflow/python/kernel_tests/signal/mel_ops_test.py b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
index 1ed4429b42a9c15f4b4b33ba0bd3121b4811ed8b..3134503daec4d3ebeeb014f7ea99123cb4a0f694 100644
--- a/tensorflow/python/kernel_tests/signal/mel_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mel_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops.signal import mel_ops
@@ -141,14 +142,16 @@ class LinearToMelTest(test.TestCase):
       for config in configs:
         mel_matrix_np = spectrogram_to_mel_matrix(*config)
         mel_matrix = mel_ops.linear_to_mel_weight_matrix(*config)
-        self.assertAllClose(mel_matrix_np, mel_matrix.eval(), atol=3e-6)
+        self.assertAllClose(mel_matrix_np, self.evaluate(mel_matrix), atol=3e-6)
 
+  @tf_test_util.run_deprecated_v1
   def test_dtypes(self):
     # LinSpace is not supported for tf.float16.
     for dtype in (dtypes.bfloat16, dtypes.float32, dtypes.float64):
       self.assertEqual(dtype,
                        mel_ops.linear_to_mel_weight_matrix(dtype=dtype).dtype)
 
+  @tf_test_util.run_deprecated_v1
   def test_error(self):
     with self.assertRaises(ValueError):
       mel_ops.linear_to_mel_weight_matrix(num_mel_bins=0)
@@ -177,6 +180,7 @@ class LinearToMelTest(test.TestCase):
         rewritten_graph = test_util.grappler_optimize(g, [mel_matrix])
         self.assertEqual(1, len(rewritten_graph.node))
 
+  @tf_test_util.run_deprecated_v1
   def test_num_spectrogram_bins_dynamic(self):
     with self.session(use_gpu=True):
       num_spectrogram_bins = array_ops.placeholder(shape=(),
diff --git a/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
index 79d23d77d1e3112822bc5968d957b095ae378188..935922657cd4dd088c30dae7c74997339b3cb7f1 100644
--- a/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/mfcc_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import spectral_ops_test_util
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 # HTK conventions.
 class MFCCTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_error(self):
     # num_mel_bins must be positive.
     with self.assertRaises(ValueError):
@@ -43,6 +45,7 @@ class MFCCTest(test.TestCase):
       signal = array_ops.zeros((2, 3, 5), dtype=dtypes.float64)
       mfcc_ops.mfccs_from_log_mel_spectrograms(signal)
 
+  @test_util.run_deprecated_v1
   def test_basic(self):
     """A basic test that the op runs on random input."""
     with spectral_ops_test_util.fft_kernel_label_map():
@@ -50,6 +53,7 @@ class MFCCTest(test.TestCase):
         signal = random_ops.random_normal((2, 3, 5))
         mfcc_ops.mfccs_from_log_mel_spectrograms(signal).eval()
 
+  @test_util.run_deprecated_v1
   def test_unknown_shape(self):
     """A test that the op runs when shape and rank are unknown."""
     with spectral_ops_test_util.fft_kernel_label_map():
diff --git a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
index c4e5b6f67408f7ff2de38ceb013e0e97c9624fd3..e0ce06418a457eee9a45b172f9cc5887d1167153 100644
--- a/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/reconstruction_ops_test.py
@@ -20,8 +20,10 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -52,16 +54,78 @@ class ReconstructionOpsTest(test.TestCase):
                             "100000000000000"]
 
   def test_all_ones(self):
-    signal = constant_op.constant(np.ones((3, 5)), dtype=dtypes.int64)
+    signal = array_ops.ones([3, 5])
     reconstruction = reconstruction_ops.overlap_and_add(signal, 2)
 
-    with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+    self.assertEqual(reconstruction.shape.as_list(), [9])
+
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
 
       expected_output = np.array([1, 1, 2, 2, 3, 2, 2, 1, 1])
 
       self.assertAllClose(output, expected_output)
 
+  @test_util.run_deprecated_v1
+  def test_unknown_shapes(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=[None, None, None])
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape.as_list(), [None, None])
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_unknown_rank(self):
+    # This test uses placeholders and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.placeholder(dtype=dtypes.int32, shape=None)
+    frame_step = array_ops.placeholder(dtype=dtypes.int32, shape=[])
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.shape, None)
+
+    with self.session(use_gpu=True) as sess:
+      output = sess.run(reconstruction,
+                        feed_dict={signal: np.ones([4, 3, 5]), frame_step: 2})
+
+      expected_output = np.array([[1, 1, 2, 2, 3, 2, 2, 1, 1]] * 4)
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
+  def test_fast_path(self):
+    # This test uses tensor names and does not work in eager mode.
+    if context.executing_eagerly():
+      return
+
+    signal = array_ops.ones([3, 5])
+    frame_step = 5
+    reconstruction = reconstruction_ops.overlap_and_add(signal, frame_step)
+
+    self.assertEqual(reconstruction.name, "overlap_and_add/fast_path:0")
+
+    with self.session(use_gpu=True) as sess:
+      output = self.evaluate(reconstruction)
+
+      expected_output = np.ones([15])
+
+      self.assertAllClose(output, expected_output)
+
+  @test_util.run_deprecated_v1
   def test_simple(self):
     def make_input(frame_length, num_frames=3):
       """Generate a tensor of num_frames frames of frame_length."""
@@ -98,8 +162,8 @@ class ReconstructionOpsTest(test.TestCase):
                                   dtype=dtypes.int64)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
       string_output = [np.base_repr(x, self.bases[0]) for x in output]
 
       self.assertEqual(string_output, self.expected_string)
@@ -108,8 +172,8 @@ class ReconstructionOpsTest(test.TestCase):
     signal = constant_op.constant(self.powers, dtype=dtypes.int64)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
 
       accumulator = True
       for i in range(self.batch_size):
@@ -124,8 +188,8 @@ class ReconstructionOpsTest(test.TestCase):
     signal = constant_op.constant(input_matrix, dtype=dtypes.float32)
     reconstruction = reconstruction_ops.overlap_and_add(signal, self.frame_hop)
 
-    with self.session(use_gpu=True) as sess:
-      output = sess.run(reconstruction)
+    with self.session(use_gpu=True):
+      output = self.evaluate(reconstruction)
 
       string_output = [np.base_repr(int(x), self.bases[0]) for x in
                        np.squeeze(output)]
@@ -133,6 +197,7 @@ class ReconstructionOpsTest(test.TestCase):
       self.assertEqual(output.shape, (1, 9))
       self.assertEqual(string_output, self.expected_string)
 
+  @test_util.run_deprecated_v1
   def test_gradient(self):
     configurations = [
         ((1, 128), 1),
@@ -154,6 +219,7 @@ class ReconstructionOpsTest(test.TestCase):
         gradient = sess.run(gradients_impl.gradients([loss], [signal])[0])
         self.assertTrue((gradient == 1.0).all())
 
+  @test_util.run_deprecated_v1
   def test_gradient_batch(self):
     with self.session(use_gpu=True) as sess:
       signal = array_ops.zeros((2, 10, 10))
@@ -176,6 +242,7 @@ class ReconstructionOpsTest(test.TestCase):
           np.reshape(np.arange(100).astype(np.float32), (10, 10))])
       self.assertAllEqual(expected_gradient, gradient)
 
+  @test_util.run_deprecated_v1
   def test_gradient_numerical(self):
     with self.session(use_gpu=True):
       shape = (2, 10, 10)
diff --git a/tensorflow/python/kernel_tests/signal/shape_ops_test.py b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
index 398fba8b6df6d86cfaf01c36acb4d835ac0043a6..32ac76e80d00660e0784ee44cda7e325862c7816 100644
--- a/tensorflow/python/kernel_tests/signal/shape_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/shape_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
@@ -32,6 +33,7 @@ from tensorflow.python.platform import test
 
 class FrameTest(test.TestCase):
 
+  @tf_test_util.run_deprecated_v1
   def test_mapping_of_indices_without_padding(self):
     with self.session(use_gpu=True):
       tensor = constant_op.constant(np.arange(9152), dtypes.int32)
@@ -47,6 +49,7 @@ class FrameTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_mapping_of_indices_with_padding(self):
     with self.session(use_gpu=True):
       tensor = constant_op.constant(np.arange(10000), dtypes.int32)
@@ -64,6 +67,7 @@ class FrameTest(test.TestCase):
 
       self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_invalid_inputs(self):
     # Rank 0 input signal.
     with self.assertRaises(ValueError):
@@ -84,6 +88,7 @@ class FrameTest(test.TestCase):
     with self.assertRaises(ValueError):
       shape_ops.frame([1], 1, 1, pad_end=True, pad_value=[1])
 
+  @tf_test_util.run_deprecated_v1
   def test_length_zero(self):
     signal = constant_op.constant([], dtype=dtypes.float32)
     frame_length = 2
@@ -98,6 +103,7 @@ class FrameTest(test.TestCase):
                                pad_end=False).eval()
       self.assertEqual((0, 2), result.shape)
 
+  @tf_test_util.run_deprecated_v1
   def test_shape_inference(self):
     signal = array_ops.placeholder(dtypes.int32, shape=[1, 1])
     frame_length = 2
@@ -150,9 +156,10 @@ class FrameTest(test.TestCase):
           op = shape_ops.frame(signal, frame_length, frame_step,
                                pad_end=pad_end, pad_value=99)
           with self.cached_session(use_gpu=True):
-            result = op.eval()
+            result = self.evaluate(op)
           self.assertEqual(op.shape.as_list(), list(result.shape))
 
+  @tf_test_util.run_deprecated_v1
   def test_basic_mono(self):
     signal = np.arange(6)
     frame_length = 3
@@ -178,6 +185,7 @@ class FrameTest(test.TestCase):
                                  pad_end=False).eval()
         self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_basic_stereo(self):
     signal = np.vstack([np.arange(6),
                         np.arange(6) + 10])
@@ -207,6 +215,7 @@ class FrameTest(test.TestCase):
                                  pad_end=False).eval()
         self.assertAllEqual(expected, result)
 
+  @tf_test_util.run_deprecated_v1
   def test_complex_shape(self):
     signal = np.vstack([np.arange(6),
                         np.arange(6) + 10,
@@ -248,7 +257,7 @@ class FrameTest(test.TestCase):
       result = shape_ops.frame(signal, frame_length=2, frame_step=2,
                                pad_end=True, axis=1)
       expected = np.reshape(np.arange(16), (2, 2, 2, 2))
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=2, frame_step=1,
                                pad_end=True, axis=1)
@@ -260,7 +269,7 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13]],
                    [[12, 13], [14, 15]],
                    [[14, 15], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
       result = shape_ops.frame(signal, frame_length=3, frame_step=1,
                                pad_end=True, axis=1)
@@ -272,8 +281,9 @@ class FrameTest(test.TestCase):
                    [[10, 11], [12, 13], [14, 15]],
                    [[12, 13], [14, 15], [0, 0]],
                    [[14, 15], [0, 0], [0, 0]]]]
-      self.assertAllEqual(expected, result.eval())
+      self.assertAllEqual(expected, self.evaluate(result))
 
+  @tf_test_util.run_deprecated_v1
   def test_window_larger_than_signal(self):
     signal = constant_op.constant([[1, 2], [11, 12]], dtype=dtypes.float32)
     frame_length = 4
@@ -307,6 +317,7 @@ class FrameTest(test.TestCase):
       result = shape_ops.frame(signal, frame_length, frame_step)
       self.assertEqual(result.dtype, signal.dtype)
 
+  @tf_test_util.run_deprecated_v1
   def test_dynamic_tensor(self):
     # Show that frame works even when the dimensions of its input are
     # not known at graph creation time.
@@ -325,6 +336,7 @@ class FrameTest(test.TestCase):
                            [[10, 11], [12, 13]],
                            [[20, 21], [22, 23]]], result)
 
+  @tf_test_util.run_deprecated_v1
   def test_gradient_numerical(self):
     with self.session(use_gpu=True):
       signal_shape = (2, 128)
diff --git a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
index 26cb1270639b1ee4dfa5c27592d2ab4ae4159fc0..7b9748c7f260b60d7322a6de68e35970513ac969 100644
--- a/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/spectral_ops_test.py
@@ -125,22 +125,22 @@ class SpectralOpsTest(test.TestCase):
       stft = spectral_ops.stft(signal, frame_length=7, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=8, frame_step=8,
                                fft_length=16, pad_end=True)
       self.assertAllEqual([64, 9], stft.shape.as_list())
-      self.assertAllEqual([64, 9], stft.eval().shape)
+      self.assertAllEqual([64, 9], self.evaluate(stft).shape)
 
       stft = spectral_ops.stft(signal, frame_length=16, frame_step=8,
                                fft_length=8, pad_end=True)
       self.assertAllEqual([64, 5], stft.shape.as_list())
-      self.assertAllEqual([64, 5], stft.eval().shape)
+      self.assertAllEqual([64, 5], self.evaluate(stft).shape)
 
       stft = np.zeros((32, 9)).astype(np.complex64)
 
@@ -148,7 +148,7 @@ class SpectralOpsTest(test.TestCase):
                                                fft_length=16, frame_step=8)
       expected_length = (stft.shape[0] - 1) * 8 + 8
       self.assertAllEqual([256], inverse_stft.shape.as_list())
-      self.assertAllEqual([expected_length], inverse_stft.eval().shape)
+      self.assertAllEqual([expected_length], self.evaluate(inverse_stft).shape)
 
   def test_stft_and_inverse_stft(self):
     """Test that spectral_ops.stft/inverse_stft match a NumPy implementation."""
@@ -235,7 +235,8 @@ class SpectralOpsTest(test.TestCase):
       inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
 
       with self.cached_session(use_gpu=True) as sess:
-        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+        hann_window, inverse_window = self.evaluate(
+            [hann_window, inverse_window])
 
       # Expect unit gain at each phase of the window.
       product_window = hann_window * inverse_window
@@ -263,7 +264,8 @@ class SpectralOpsTest(test.TestCase):
       inverse_window = inverse_window_fn(frame_length, dtype=dtypes.float32)
 
       with self.cached_session(use_gpu=True) as sess:
-        hann_window, inverse_window = sess.run([hann_window, inverse_window])
+        hann_window, inverse_window = self.evaluate(
+            [hann_window, inverse_window])
 
       self.assertAllClose(hann_window, inverse_window * 1.5)
 
@@ -293,7 +295,7 @@ class SpectralOpsTest(test.TestCase):
       # the sum of the magnitude STFT.
       sinusoid = math_ops.sin(
           2 * np.pi * math_ops.linspace(0.0, 1.0, signal_length))
-      sinusoid_gradient = sess.run(self._compute_stft_gradient(sinusoid))
+      sinusoid_gradient = self.evaluate(self._compute_stft_gradient(sinusoid))
       self.assertFalse((sinusoid_gradient == 0.0).all())
 
   def test_gradients_numerical(self):
diff --git a/tensorflow/python/kernel_tests/signal/test_util.py b/tensorflow/python/kernel_tests/signal/test_util.py
index f2c4d0dc8f8dcf75385a5d8e66a9513777b41ae4..0a8a621c3eeee1b943a55aced138a6abad233059 100644
--- a/tensorflow/python/kernel_tests/signal/test_util.py
+++ b/tensorflow/python/kernel_tests/signal/test_util.py
@@ -18,12 +18,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.training import saver
 
 
-def grappler_optimize(graph, fetches=None, rewriter_config=None):
+def grappler_optimize(graph, fetches=None, config_proto=None):
   """Tries to optimize the provided graph using grappler.
 
   Args:
@@ -31,17 +31,17 @@ def grappler_optimize(graph, fetches=None, rewriter_config=None):
     fetches: An optional list of `Tensor`s to fetch (i.e. not optimize away).
       Grappler uses the 'train_op' collection to look for fetches, so if not
       provided this collection should be non-empty.
-    rewriter_config: An optional `tf.RewriterConfig` to use when rewriting the
+    config_proto: An optional `tf.ConfigProto` to use when rewriting the
       graph.
 
   Returns:
     A `tf.GraphDef` containing the rewritten graph.
   """
-  if rewriter_config is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
-    rewriter_config.min_graph_nodes = -1
+  if config_proto is None:
+    config_proto = config_pb2.ConfigProto()
+    config_proto.graph_options.rewrite_options.min_graph_nodes = -1
   if fetches is not None:
     for fetch in fetches:
       graph.add_to_collection('train_op', fetch)
   metagraph = saver.export_meta_graph(graph_def=graph.as_graph_def())
-  return tf_optimizer.OptimizeGraph(rewriter_config, metagraph)
+  return tf_optimizer.OptimizeGraph(config_proto, metagraph)
diff --git a/tensorflow/python/kernel_tests/signal/window_ops_test.py b/tensorflow/python/kernel_tests/signal/window_ops_test.py
index 2f19134f5a8c1b43852c14a857a709c61fef30a9..a72cdb288bb93d96237fa84261a7bc1e9dcdf118 100644
--- a/tensorflow/python/kernel_tests/signal/window_ops_test.py
+++ b/tensorflow/python/kernel_tests/signal/window_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util as tf_test_util
 from tensorflow.python.kernel_tests.signal import test_util
 from tensorflow.python.ops.signal import window_ops
 from tensorflow.python.platform import test
@@ -75,6 +76,7 @@ class WindowOpsTest(test.TestCase):
                                   dtype=tf_dtype).eval()
             self.assertAllClose(expected, actual, tol, tol)
 
+  @tf_test_util.run_deprecated_v1
   def test_hann_window(self):
     """Check that hann_window matches scipy.signal.hann behavior."""
     # The Hann window is a raised cosine window with parameters alpha=0.5 and
@@ -84,6 +86,7 @@ class WindowOpsTest(test.TestCase):
         functools.partial(_scipy_raised_cosine, a=0.5, b=0.5),
         window_ops.hann_window)
 
+  @tf_test_util.run_deprecated_v1
   def test_hamming_window(self):
     """Check that hamming_window matches scipy.signal.hamming's behavior."""
     # The Hamming window is a raised cosine window with parameters alpha=0.54
diff --git a/tensorflow/python/kernel_tests/slice_op_test.py b/tensorflow/python/kernel_tests/slice_op_test.py
index 41f040ab739451cf5a42287702ee4cffe1d8d3fa..8f7245214a20d88caf426558b9699fec9f9c908f 100644
--- a/tensorflow/python/kernel_tests/slice_op_test.py
+++ b/tensorflow/python/kernel_tests/slice_op_test.py
@@ -24,6 +24,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
@@ -38,7 +39,7 @@ class SliceTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.float32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
   def testInt32(self):
@@ -47,7 +48,7 @@ class SliceTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         a = constant_op.constant(inp, shape=[4, 4], dtype=dtypes.int32)
         slice_t = a[2, k:k]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[2, k:k])
 
   def testSlicingWithInt64Index(self):
@@ -57,33 +58,33 @@ class SliceTest(test.TestCase):
       # Slice using int64 Tensor.
       i = constant_op.constant(1, dtype=dtypes.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       # Slice using int64 integer.
       i = np.asarray(1).astype(np.int64)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i+1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       a_int32 = constant_op.constant([0, 1, 2], dtype=dtypes.int32)
       slice_t = array_ops.slice(a_int32,
                                 np.asarray([1]).astype(np.int64),
                                 np.asarray([2]).astype(np.int64))
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
       a_float32 = constant_op.constant([0, 1, 2], dtype=dtypes.float32)
       slice_t = array_ops.slice(a_float32,
                                 np.asarray([1]).astype(np.int64),
                                 np.asarray([2]).astype(np.int64))
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
   def testSlicingInt64Tensor(self):
@@ -93,23 +94,23 @@ class SliceTest(test.TestCase):
       # Slice using int32 Tensor.
       i = constant_op.constant(1, dtype=dtypes.int32)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i + 1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       # Slice using int32 integer.
       i = np.asarray(1).astype(np.int32)
       slice_t = a[i]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual(1, slice_val)
       slice_t = a[i:i + 1]
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1], slice_val)
 
       slice_t = array_ops.slice(a, [1], [2])
-      slice_val = slice_t.eval()
+      slice_val = self.evaluate(slice_t)
       self.assertAllEqual([1, 2], slice_val)
 
   def testSelectAll(self):
@@ -121,8 +122,8 @@ class SliceTest(test.TestCase):
         slice_explicit_t = array_ops.slice(a, [0, 0, 0, 0], [-1, -1, -1, -1])
         slice_implicit_t = a[:, :, :, :]
 
-        self.assertAllEqual(inp, slice_explicit_t.eval())
-        self.assertAllEqual(inp, slice_implicit_t.eval())
+        self.assertAllEqual(inp, self.evaluate(slice_explicit_t))
+        self.assertAllEqual(inp, self.evaluate(slice_implicit_t))
         self.assertEqual(inp.shape, slice_explicit_t.get_shape())
         self.assertEqual(inp.shape, slice_implicit_t.get_shape())
 
@@ -134,7 +135,7 @@ class SliceTest(test.TestCase):
 
         hi = np.random.randint(0, 9)
         scalar_t = a[hi]
-        scalar_val = scalar_t.eval()
+        scalar_val = self.evaluate(scalar_t)
         self.assertAllEqual(scalar_val, inp[hi])
 
         if hi > 0:
@@ -142,9 +143,10 @@ class SliceTest(test.TestCase):
         else:
           lo = 0
         slice_t = a[lo:hi]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
         self.assertAllEqual(slice_val, inp[lo:hi])
 
+  @test_util.run_deprecated_v1
   def testScalarInput(self):
     input_val = 0
     with self.cached_session() as sess:
@@ -159,6 +161,7 @@ class SliceTest(test.TestCase):
                                                "out of range"):
         sess.run([slice_t], feed_dict={input_t: input_val})
 
+  @test_util.run_deprecated_v1
   def testInvalidIndex(self):
     input_val = [1, 2]
     with self.cached_session() as sess:
@@ -179,6 +182,7 @@ class SliceTest(test.TestCase):
     np_ans = x[begin:begin + size, :]
     self.assertAllEqual(tf_ans, np_ans)
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixDim0(self):
     x = np.random.rand(8, 4).astype("f")
     self._testSliceMatrixDim0(x, 1, 2)
@@ -195,7 +199,7 @@ class SliceTest(test.TestCase):
 
         x, y = np.random.randint(0, 3, size=2).tolist()
         slice_t = a[x, 0:y]
-        slice_val = slice_t.eval()
+        slice_val = self.evaluate(slice_t)
       self.assertAllEqual(slice_val, inp[x, 0:y])
 
   def testSimple(self):
@@ -207,12 +211,13 @@ class SliceTest(test.TestCase):
           dtype=dtypes.float32)
       slice_t = array_ops.slice(a, [0, 0], [2, 2])
       slice2_t = a[:2, :2]
-      slice_val, slice2_val = sess.run([slice_t, slice2_t])
+      slice_val, slice2_val = self.evaluate([slice_t, slice2_t])
     self.assertAllEqual(slice_val, inp[:2, :2])
     self.assertAllEqual(slice2_val, inp[:2, :2])
     self.assertEqual(slice_val.shape, slice_t.get_shape())
     self.assertEqual(slice2_val.shape, slice2_t.get_shape())
 
+  @test_util.run_deprecated_v1
   def testComplex(self):
     with self.session(use_gpu=True):
       inp = np.random.rand(4, 10, 10, 4).astype("f")
@@ -247,7 +252,7 @@ class SliceTest(test.TestCase):
                    + sizes[3], indices[4]:indices[4] + sizes[4], indices[5]:
                    indices[5] + sizes[5]]
 
-      slice_val, slice2_val = sess.run([slice_t, slice2_t])
+      slice_val, slice2_val = self.evaluate([slice_t, slice2_t])
 
     expected_val = inp[indices[0]:indices[0] + sizes[0], indices[1]:indices[
         1] + sizes[1], indices[2]:indices[2] + sizes[2], indices[3]:indices[
@@ -282,7 +287,7 @@ class SliceTest(test.TestCase):
       grads = np.random.rand(num_grads).astype("f").reshape(slice_size)
       grad_tensor = constant_op.constant(grads)
       grad = gradients_impl.gradients(slice_t, [a], grad_tensor)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
 
     # Create a zero tensor of the input shape ane place
     # the grads into the right location to compare against TensorFlow.
@@ -313,9 +318,10 @@ class SliceTest(test.TestCase):
       g1 = gradients_impl.gradients(loss1, x)[0]
       g2 = gradients_impl.gradients(loss2, x)[0]
 
-      g1_val, g2_val = sess.run([g1, g2])
+      g1_val, g2_val = self.evaluate([g1, g2])
     self.assertAllEqual(g1_val, g2_val)
 
+  @test_util.run_deprecated_v1
   def testGradientsAll(self):
     # Slice the middle square out of a 4x4 input
     self._testGradientSlice([4, 4], [1, 1], [2, 2])
@@ -335,6 +341,7 @@ class SliceTest(test.TestCase):
     # Use -1 as a slice dimension on a 2D tensor.
     self._testGradientVariableSize2D()
 
+  @test_util.run_deprecated_v1
   def testNotIterable(self):
     # NOTE(mrry): If we register __getitem__ as an overloaded
     # operator, Python will valiantly attempt to iterate over the
@@ -346,6 +353,7 @@ class SliceTest(test.TestCase):
       for _ in c:
         pass
 
+  @test_util.run_deprecated_v1
   def testComputedShape(self):
     # NOTE(mrry): We cannot currently handle partially-known values,
     # because `tf.slice()` uses -1 to specify a wildcard size, and
@@ -368,7 +376,7 @@ class SliceTest(test.TestCase):
       c = b[:-1, :]
       d = c[1, :]
       res = 2 * d - c[1, :] + a[2, :] - 2 * b[-2, :]
-      self.assertAllEqual([0, 0, 0], res.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(res))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/kernel_tests/softmax_op_test.py b/tensorflow/python/kernel_tests/softmax_op_test.py
index ef9301d4e35385d78a0487d7557b50ca6dbcd4e0..707b8a429f2be1fcce39516d368e2b7a05570652 100644
--- a/tensorflow/python/kernel_tests/softmax_op_test.py
+++ b/tensorflow/python/kernel_tests/softmax_op_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import nn_ops
 from tensorflow.python.platform import test
@@ -64,7 +65,7 @@ class SoftmaxTest(test.TestCase):
         tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
       else:
         tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
-      out = tf_softmax.eval()
+      out = self.evaluate(tf_softmax)
     self.assertAllCloseAccordingToType(np_softmax, out)
     self.assertShapeEqual(np_softmax, tf_softmax)
     if not log:
@@ -113,7 +114,7 @@ class SoftmaxTest(test.TestCase):
     features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
     with self.cached_session(use_gpu=use_gpu):
       tf_log_softmax = nn_ops.log_softmax(features)
-      out = tf_log_softmax.eval()
+      out = self.evaluate(tf_log_softmax)
     self.assertAllClose(
         np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                   [0, -max, -max, -max]]),
@@ -206,6 +207,7 @@ class SoftmaxTest(test.TestCase):
                          [[5., 4., 3., 2.], [1., 2., 3., 4.]]])
     self.assertEqual([3, 2, 4], op.get_shape())
 
+  @test_util.run_deprecated_v1
   def testEmptyInput(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
@@ -229,6 +231,7 @@ class SoftmaxTest(test.TestCase):
       with self.assertRaises(errors_impl.InvalidArgumentError):
         nn_ops.softmax(ones, axis=2).eval()
 
+  @test_util.run_deprecated_v1
   def testLargeDims(self):
     # Make sure that we properly handle large inputs. See
     # https://github.com/tensorflow/tensorflow/issues/4425 for details
diff --git a/tensorflow/python/kernel_tests/softplus_op_test.py b/tensorflow/python/kernel_tests/softplus_op_test.py
index 50a8291ea88f0046d6c94eebf89e7bd79bd97659..5273dd7ffc7cec6807bdcdf2ad2a4e9c18a573d1 100644
--- a/tensorflow/python/kernel_tests/softplus_op_test.py
+++ b/tensorflow/python/kernel_tests/softplus_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -39,7 +40,7 @@ class SoftplusTest(test.TestCase):
     np_softplus = self._npSoftplus(np_features)
     with self.cached_session(use_gpu=use_gpu):
       softplus = nn_ops.softplus(np_features)
-      tf_softplus = softplus.eval()
+      tf_softplus = self.evaluate(softplus)
     self.assertAllCloseAccordingToType(np_softplus, tf_softplus)
     self.assertTrue(np.all(tf_softplus > 0))
     self.assertShapeEqual(np_softplus, softplus)
@@ -70,6 +71,7 @@ class SoftplusTest(test.TestCase):
           ],
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -86,6 +88,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradGrad(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -103,6 +106,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) gradient of gradient err = ", err)
     self.assertLess(err, 5e-5)
 
+  @test_util.run_deprecated_v1
   def testGradGradGrad(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -121,6 +125,7 @@ class SoftplusTest(test.TestCase):
     print("softplus (float) third-order gradient err = ", err)
     self.assertLess(err, 5e-5)
 
+  @test_util.run_deprecated_v1
   def testNoInts(self):
     with self.cached_session():
       with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/softsign_op_test.py b/tensorflow/python/kernel_tests/softsign_op_test.py
index ee2e2e03032aca6afc9aaa73e56598920932bb55..5554240c82621e5bceb89fab17f6d824f61252f3 100644
--- a/tensorflow/python/kernel_tests/softsign_op_test.py
+++ b/tensorflow/python/kernel_tests/softsign_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import nn_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
@@ -36,7 +37,7 @@ class SoftsignTest(test.TestCase):
     np_softsign = self._npSoftsign(np_features)
     with self.cached_session(use_gpu=use_gpu):
       softsign = nn_ops.softsign(np_features)
-      tf_softsign = softsign.eval()
+      tf_softsign = self.evaluate(softsign)
     self.assertAllClose(np_softsign, tf_softsign)
     self.assertShapeEqual(np_softsign, softsign)
 
@@ -49,6 +50,7 @@ class SoftsignTest(test.TestCase):
           np.array([[-9, 7, -5, 3, -1], [1, -3, 5, -7, 9]]).astype(t),
           use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session():
       x = constant_op.constant(
@@ -65,6 +67,7 @@ class SoftsignTest(test.TestCase):
     print("softsign (float) gradient err = ", err)
     self.assertLess(err, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testNoInts(self):
     with self.cached_session():
       with self.assertRaisesRegexp(
diff --git a/tensorflow/python/kernel_tests/spacetobatch_op_test.py b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
index 21134adf2cac172c43ba1776b7208f89de388efd..8641156604c98e2737f8854db3a218905cfd9281 100644
--- a/tensorflow/python/kernel_tests/spacetobatch_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetobatch_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_array_ops
 from tensorflow.python.ops import gradient_checker
@@ -115,6 +116,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
     self._testPad(inputs, paddings, block_size, outputs)
 
   # [1, 2, 2, 1] <-> [4, 1, 1, 1]
+  @test_util.run_deprecated_v1
   def testSmallInput2x2(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 2
@@ -122,6 +124,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
     self._testOne(x_np, block_size, x_out)
 
   # [1, 2, 2, 1] <-> [1, 3, 3, 1] (padding) <-> [9, 1, 1, 1]
+  @test_util.run_deprecated_v1
   def testSmallInput2x2Pad1x0(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
     paddings = np.array([[1, 0], [1, 0]], dtype=np.int32)
@@ -132,6 +135,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test with depth larger than 1.
   # [1, 2, 2, 3] <-> [4, 1, 1, 3]
+  @test_util.run_deprecated_v1
   def testDepthInput2x2(self):
     x_np = [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]]
     block_size = 2
@@ -140,6 +144,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test for larger input dimensions.
   # [1, 4, 4, 1] <-> [4, 2, 2, 1]
+  @test_util.run_deprecated_v1
   def testLargerInput2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
              [[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
@@ -150,6 +155,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
 
   # Test with batch larger than 1.
   # [2, 2, 4, 1] <-> [8, 1, 2, 1]
+  @test_util.run_deprecated_v1
   def testBatchInput2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]]],
             [[[9], [10], [11], [12]], [[13], [14], [15], [16]]]]
@@ -162,6 +168,7 @@ class SpaceToBatchTest(test.TestCase, PythonOpImpl):
   # that elements are correctly laid out spatially and properly interleaved
   # along the batch dimension.
   # [2, 4, 4, 1] <-> [8, 2, 2, 1]
+  @test_util.run_deprecated_v1
   def testLargerInputBatch2x2(self):
     x_np = [[[[1], [2], [3], [4]], [[5], [6], [7], [8]],
              [[9], [10], [11], [12]], [[13], [14], [15], [16]]],
@@ -206,6 +213,7 @@ class SpaceToBatchNDTest(test.TestCase):
     self._testPad(inputs, block_shape, paddings,
                   space_to_batch_direct(inputs, block_shape, paddings))
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsZeroRemainingDims(self):
     self._testPad(
         inputs=[1, 2],
@@ -213,6 +221,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[],
         outputs=[1, 2],)
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsOneRemainingDim(self):
     self._testPad(
         inputs=[[1, 2], [3, 4]],
@@ -227,6 +236,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[[0, 0]],
         outputs=[[1, 2], [3, 4]])
 
+  @test_util.run_deprecated_v1
   def testZeroBlockDimsTwoRemainingDims(self):
     self._testPad(
         inputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
@@ -248,6 +258,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[[0, 0], [0, 0]],
         outputs=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
 
+  @test_util.run_deprecated_v1
   def testOneBlockDimZeroRemainingDims(self):
     self._testPad(
         inputs=[[1, 2, 3], [4, 5, 6]],
@@ -255,6 +266,7 @@ class SpaceToBatchNDTest(test.TestCase):
         paddings=[1, 0],
         outputs=[[0, 2], [0, 5], [1, 3], [4, 6]])
 
+  @test_util.run_deprecated_v1
   def testOneBlockDimOneRemainingDim(self):
     self._testPad(
         inputs=[[[1, 11], [2, 21], [3, 31]], [[4, 41], [5, 51], [6, 61]]],
@@ -263,6 +275,7 @@ class SpaceToBatchNDTest(test.TestCase):
         outputs=[[[0, 0], [2, 21]], [[0, 0], [5, 51]], [[1, 11], [3, 31]],
                  [[4, 41], [6, 61]]])
 
+  @test_util.run_deprecated_v1
   def testDirect(self):
     # Test with zero-size remaining dimension.
     self._testDirect(
@@ -300,6 +313,7 @@ class SpaceToBatchNDTest(test.TestCase):
 class SpaceToBatchSpaceToDepth(test.TestCase, PythonOpImpl):
 
   # Verifies that: space_to_batch(x) = transpose(space_to_depth(transpose(x)))
+  @test_util.run_deprecated_v1
   def testSpaceToDepthTranspose(self):
     x = np.arange(5 * 10 * 16 * 7, dtype=np.float32).reshape([5, 10, 16, 7])
     block_size = 2
@@ -319,6 +333,7 @@ class SpaceToBatchSpaceToDepthCpp(SpaceToBatchSpaceToDepth, CppOpImpl):
 
 class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -327,6 +342,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -336,6 +352,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -345,6 +362,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -354,6 +372,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
       out_tf = self.space_to_batch(x_np, paddings, block_size)
       out_tf.eval()
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleWidth(self):
     # The block size divides width but not height.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -362,6 +381,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleHeight(self):
     # The block size divides height but not width.
     x_np = [[[[1], [2]], [[3], [4]], [[5], [6]]]]
@@ -370,6 +390,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleBoth(self):
     # The block size does not divide neither width or height.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -378,6 +399,7 @@ class SpaceToBatchErrorHandlingTest(test.TestCase, PythonOpImpl):
     with self.assertRaises(ValueError):
       _ = self.space_to_batch(x_np, paddings, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = self.space_to_batch(
         array_ops.placeholder(dtypes.float32),
@@ -424,25 +446,31 @@ class SpaceToBatchNDErrorHandlingTest(test.TestCase):
     self._testStaticShape(input_shape, block_shape, paddings, error)
     self._testDynamicShape(input_shape, block_shape, paddings)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     self._testShape([1, 2, 2], [0, 2], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNegative(self):
     self._testShape([1, 2, 2], [-1, 2], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testNegativePadding(self):
     # The padding is negative.
     self._testShape([1, 2, 2], [1, 1], [[0, -1], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisible(self):
     # The padded size is not divisible by the block size.
     self._testShape([1, 2, 3, 1], [3, 3], [[0, 0], [0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testBlockDimsMismatch(self):
     # Shape of block_shape does not match shape of paddings.
     self._testStaticShape([1, 3, 3, 1], [3, 3], [[0, 0]], ValueError)
 
+  @test_util.run_deprecated_v1
   def testUnknown(self):
     # Verify that input shape and paddings shape can be unknown.
     _ = array_ops.space_to_batch_nd(
@@ -522,18 +550,21 @@ class SpaceToBatchGradientTest(test.TestCase, PythonOpImpl):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     pad_beg = 0
     pad_end = 0
     self._compare(1, 2, 3, 5, block_size, pad_beg, pad_end)
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     pad_beg = 0
     pad_end = 0
     self._compare(2, 4, 3, 2, block_size, pad_beg, pad_end)
 
+  @test_util.run_deprecated_v1
   def testSmallPad1x1(self):
     block_size = 2
     pad_beg = 1
@@ -572,15 +603,19 @@ class SpaceToBatchNDGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     self._compare([1, 4, 6, 5], [2, 2], [[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     self._compare([2, 8, 6, 2], [2, 2], [[0, 0], [0, 0]])
 
+  @test_util.run_deprecated_v1
   def testSmallPad1(self):
     self._compare([2, 4, 6, 2], [2, 2], [[1, 1], [1, 1]])
 
+  @test_util.run_deprecated_v1
   def testSmallPadThreeBlockDims(self):
     self._compare([2, 2, 4, 3, 2], [2, 2, 2], [[1, 1], [1, 1], [1, 0]])
 
@@ -644,6 +679,7 @@ class RequiredSpaceToBatchPaddingsTest(test.TestCase):
     self.assertAllEqual(paddings_result, paddings_const)
     self.assertAllEqual(crops_result, crops_const)
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     self._test(
         input_shape=np.zeros((0,), np.int32),
diff --git a/tensorflow/python/kernel_tests/spacetodepth_op_test.py b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
index b05f14f7381bca60fdd0fae51b20f5968a44973c..e96bc09f3652aaa4d41bddac6ad06daaff8bfbd6 100644
--- a/tensorflow/python/kernel_tests/spacetodepth_op_test.py
+++ b/tensorflow/python/kernel_tests/spacetodepth_op_test.py
@@ -36,21 +36,22 @@ class SpaceToDepthTest(test.TestCase):
 
   def _testOne(self, inputs, block_size, outputs, dtype=dtypes.float32):
     input_nhwc = math_ops.cast(inputs, dtype)
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-      self.assertAllEqual(x_tf.eval(), outputs)
-    if test.is_gpu_available():
-      with self.session(force_gpu=True):
+      self.assertAllEqual(self.evaluate(x_tf), outputs)
+
+    if test_util.is_gpu_available():
+      with test_util.force_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
-        self.assertAllEqual(x_tf.eval(), outputs)
+        self.assertAllEqual(self.evaluate(x_tf), outputs)
         # test NCHW on GPU
         input_nchw = test_util.NHWCToNCHW(input_nhwc)
         output_nchw = array_ops.space_to_depth(
             input_nchw, block_size, data_format="NCHW")
         output_nhwc = test_util.NCHWToNHWC(output_nchw)
-        self.assertAllEqual(output_nhwc.eval(), outputs)
+        self.assertAllEqual(self.evaluate(output_nhwc), outputs)
 
   def testBasic(self):
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -134,17 +135,18 @@ class SpaceToDepthTest(test.TestCase):
     input_nhwc = array_ops.ones([batch_size, 4, 6, 3])
     x_out = array_ops.ones([batch_size, 2, 3, 12])
 
-    with self.session(use_gpu=False):
+    with test_util.force_cpu():
       # test NHWC (default) on CPU
       x_tf = array_ops.space_to_depth(input_nhwc, block_size)
       self.assertAllEqual(x_tf.shape, x_out.shape)
-      x_tf.eval()
+      self.evaluate(x_tf)
+
     if test.is_gpu_available():
-      with self.session(use_gpu=True):
+      with test_util.use_gpu():
         # test NHWC (default) on GPU
         x_tf = array_ops.space_to_depth(input_nhwc, block_size)
         self.assertAllEqual(x_tf.shape, x_out.shape)
-        x_tf.eval()
+        self.evaluate(x_tf)
 
   # Tests for different width and height.
   def testNonSquare(self):
@@ -157,14 +159,16 @@ class SpaceToDepthTest(test.TestCase):
 
   # Error handling:
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingDepth(self):
     # The input is missing the last dimension ("depth")
     x_np = [[[1, 2], [3, 4]]]
     block_size = 2
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testInputWrongDimMissingBatch(self):
     # The input is missing the first dimension ("batch")
     x_np = [[[1], [2]], [[3], [4]]]
@@ -172,30 +176,34 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSize0(self):
     # The block size is 0.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 0
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeOne(self):
     # The block size is 1. The block size needs to be > 1.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 1
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeLarger(self):
     # The block size is too large for this input.
     x_np = [[[[1], [2]], [[3], [4]]]]
     block_size = 10
     with self.assertRaises(ValueError):
       out_tf = array_ops.space_to_depth(x_np, block_size)
-      out_tf.eval()
+      self.evaluate(out_tf)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleWidth(self):
     # The block size divides width but not height.
     x_np = [[[[1], [2], [3]], [[3], [4], [7]]]]
@@ -203,6 +211,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleHeight(self):
     # The block size divides height but not width.
     x_np = [[[[1], [2]], [[3], [4]], [[5], [6]]]]
@@ -210,6 +219,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testBlockSizeNotDivisibleBoth(self):
     # The block size does not divide neither width or height.
     x_np = [[[[1], [2]], [[3], [4]]]]
@@ -217,6 +227,7 @@ class SpaceToDepthTest(test.TestCase):
     with self.assertRaises(ValueError):
       _ = array_ops.space_to_depth(x_np, block_size)
 
+  @test_util.run_deprecated_v1
   def testUnknownShape(self):
     t = array_ops.space_to_depth(
         array_ops.placeholder(dtypes.float32), block_size=4)
@@ -271,7 +282,7 @@ class SpaceToDepthTest(test.TestCase):
       actual = array_ops.space_to_depth(t, block_size, data_format=data_format)
 
     with self.cached_session(use_gpu=use_gpu) as sess:
-      actual_vals, expected_vals = sess.run([actual, expected])
+      actual_vals, expected_vals = self.evaluate([actual, expected])
       self.assertTrue(np.array_equal(actual_vals, expected_vals))
 
   def testAgainstTranspose(self):
@@ -332,11 +343,13 @@ class SpaceToDepthGradientTest(test.TestCase):
 
   # Don't use very large numbers as dimensions here as the result is tensor
   # with cartesian product of the dimensions.
+  @test_util.run_deprecated_v1
   def testSmall(self):
     block_size = 2
     self._compare(1, 2, 3, 5, block_size, "NHWC")
     self._compare(1, 2, 3, 5, block_size, "NCHW")
 
+  @test_util.run_deprecated_v1
   def testSmall2(self):
     block_size = 2
     self._compare(2, 4, 3, 2, block_size, "NHWC")
diff --git a/tensorflow/python/kernel_tests/sparse_add_op_test.py b/tensorflow/python/kernel_tests/sparse_add_op_test.py
index a746830afb377c5708640abd6a7590381d213b3f..00eff54077caa4c60c8d910439a73704159a6ee6 100644
--- a/tensorflow/python/kernel_tests/sparse_add_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_add_op_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
@@ -85,13 +86,13 @@ class SparseAddTest(test.TestCase):
         constant_op.constant(shape, dtypes.int64))
 
   def testAddSelf(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       for sp_a in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
         for sp_b in (self._SparseTensorValue_3x3(), self._SparseTensor_3x3()):
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
           self.assertAllEqual((3, 3), sp_sum.get_shape())
 
-          sum_out = sess.run(sp_sum)
+          sum_out = self.evaluate(sp_sum)
 
           self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
           self.assertAllEqual(sum_out.indices, [[0, 1], [1, 0], [2, 0], [2, 1]])
@@ -99,12 +100,12 @@ class SparseAddTest(test.TestCase):
           self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
   def testAddSelfAndNegation(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x3(negate=True)
 
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, 0.1)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, np.empty([0, 2]))
@@ -112,7 +113,7 @@ class SparseAddTest(test.TestCase):
       self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
   def testSmallValuesShouldVanish(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       sp_a = self._SparseTensor_3x3()
       sp_b = self._SparseTensor_3x3_v2()
 
@@ -123,7 +124,7 @@ class SparseAddTest(test.TestCase):
 
       # two values should vanish: |.1| < .21, and |-.2| < .21
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.21)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0]])
@@ -132,13 +133,14 @@ class SparseAddTest(test.TestCase):
 
       # only .1 vanishes
       sp_sum = sparse_ops.sparse_add(sp_a, sp_b, thresh=0.11)
-      sum_out = sess.run(sp_sum)
+      sum_out = self.evaluate(sp_sum)
 
       self.assertEqual(sp_sum.dense_shape.get_shape(), [2])
       self.assertAllEqual(sum_out.indices, [[0, 1], [2, 0], [2, 1]])
       self.assertAllClose(sum_out.values, [2, 6, -.2])
       self.assertAllEqual(sum_out.dense_shape, [3, 3])
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(1618)  # Make it reproducible.
     with self.session(use_gpu=False):
@@ -147,7 +149,7 @@ class SparseAddTest(test.TestCase):
           sp_a, nnz_a = self._randomTensor([n, m], np.float32)
           sp_b, nnz_b = self._randomTensor([n, m], np.float32)
           sp_sum = sparse_ops.sparse_add(sp_a, sp_b)
-          nnz_sum = len(sp_sum.values.eval())
+          nnz_sum = len(self.evaluate(sp_sum.values))
 
           err = gradient_checker.compute_gradient_error(
               [sp_a.values, sp_b.values], [(nnz_a,), (nnz_b,)], sp_sum.values,
@@ -162,19 +164,20 @@ class SparseAddTest(test.TestCase):
         rand_vals_np = np.random.randn(n, m).astype(dtype)
         dense_np = np.random.randn(n, m).astype(dtype)
 
-        with self.cached_session(use_gpu=False):
+        with test_util.force_cpu():
           sparse, unused_nnz = _sparsify(rand_vals_np, index_dtype=index_dtype)
-          s = sparse_ops.sparse_add(sparse,
-                                    constant_op.constant(dense_np)).eval()
+          s = self.evaluate(
+              sparse_ops.sparse_add(sparse, constant_op.constant(dense_np)))
           self.assertAllEqual(dense_np + rand_vals_np, s)
           self.assertTrue(s.dtype == dtype)
 
           # check commutativity
-          s = sparse_ops.sparse_add(constant_op.constant(dense_np),
-                                    sparse).eval()
+          s = self.evaluate(
+              sparse_ops.sparse_add(constant_op.constant(dense_np), sparse))
           self.assertAllEqual(dense_np + rand_vals_np, s)
           self.assertTrue(s.dtype == dtype)
 
+  @test_util.run_deprecated_v1
   def testSparseTensorDenseAddGradients(self):
     np.random.seed(1618)  # Make it reproducible.
     n, m = np.random.randint(30, size=2)
@@ -190,8 +193,9 @@ class SparseAddTest(test.TestCase):
                                                     [(nnz,), (n, m)], s, (n, m))
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testInvalidSparseTensor(self):
-    with self.session(use_gpu=False) as sess:
+    with test_util.force_cpu():
       shape = [2, 2]
       val = [0]
       dense = constant_op.constant(np.zeros(shape, dtype=np.int32))
@@ -205,7 +209,7 @@ class SparseAddTest(test.TestCase):
 
         with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
                                      "invalid index"):
-          sess.run(s)
+          self.evaluate(s)
 
 ######################## Benchmarking code
 
diff --git a/tensorflow/python/kernel_tests/sparse_concat_op_test.py b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
index 402c5eb4ea3c52752ed8ff8494014aab6cf15d33..04b6b9b8d20fe683add967e2877ae3766caf1c4f 100644
--- a/tensorflow/python/kernel_tests/sparse_concat_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_concat_op_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -147,7 +148,7 @@ class SparseConcatTest(test.TestCase):
           self.assertEqual(sp_concat.values.get_shape(), [4])
           self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-          concat_out = sess.run(sp_concat)
+          concat_out = self.evaluate(sp_concat)
 
           self.assertAllEqual(concat_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2]])
@@ -169,7 +170,7 @@ class SparseConcatTest(test.TestCase):
             self.assertEqual(sp_concat.values.get_shape(), [8])
             self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-            concat_out = sess.run(sp_concat)
+            concat_out = self.evaluate(sp_concat)
 
             self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4],
                                                      [2, 0], [2, 2], [2, 3],
@@ -195,7 +196,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [7])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -220,7 +221,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [10])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(concat_out.indices, [[0, 2], [1, 0], [1, 4], [1, 8],
                                                  [2, 0], [2, 2], [2, 3], [2, 6],
@@ -244,7 +245,7 @@ class SparseConcatTest(test.TestCase):
         self.assertEqual(sp_concat.values.get_shape(), [8])
         self.assertEqual(sp_concat.dense_shape.get_shape(), [2])
 
-        concat_out = sess.run(sp_concat)
+        concat_out = self.evaluate(sp_concat)
 
         self.assertAllEqual(
             concat_out.indices,
@@ -253,6 +254,7 @@ class SparseConcatTest(test.TestCase):
                             [b"a", b"b", b"e", b"c", b"d", b"f", b"g", b"h"])
         self.assertAllEqual(concat_out.dense_shape, [3, 8])
 
+  @test_util.run_deprecated_v1
   def testMismatchedRank(self):
     with self.session(use_gpu=False):
       sp_a = self._SparseTensor_3x3()
@@ -263,6 +265,7 @@ class SparseConcatTest(test.TestCase):
         with self.assertRaises(ValueError):
           sparse_ops.sparse_concat(concat_dim, [sp_a, sp_e])
 
+  @test_util.run_deprecated_v1
   def testMismatchedRankExpandNonconcatDim(self):
     with self.session(use_gpu=False):
       sp_a = self._SparseTensor_3x3()
@@ -275,6 +278,7 @@ class SparseConcatTest(test.TestCase):
           sparse_ops.sparse_concat(
               concat_dim, [sp_a, sp_e], expand_nonconcat_dim=True)
 
+  @test_util.run_deprecated_v1
   def testMismatchedShapes(self):
     with self.session(use_gpu=False) as sess:
       sp_a = self._SparseTensor_3x3()
@@ -287,7 +291,7 @@ class SparseConcatTest(test.TestCase):
 
         # Shape mismatches can only be caught when the op is run
         with self.assertRaisesOpError("Input shapes must match"):
-          sess.run(sp_concat)
+          self.evaluate(sp_concat)
 
   def testMismatchedShapesExpandNonconcatDim(self):
     with self.session(use_gpu=False) as sess:
@@ -302,8 +306,8 @@ class SparseConcatTest(test.TestCase):
           sp_concat_dim1 = sparse_ops.sparse_concat(
               concat_dim1, [sp_a, sp_b, sp_c, sp_d], expand_nonconcat_dim=True)
 
-          sp_concat_dim0_out = sess.run(sp_concat_dim0)
-          sp_concat_dim1_out = sess.run(sp_concat_dim1)
+          sp_concat_dim0_out = self.evaluate(sp_concat_dim0)
+          sp_concat_dim1_out = self.evaluate(sp_concat_dim1)
 
           self.assertAllEqual(sp_concat_dim0_out.indices,
                               [[0, 2], [1, 0], [2, 0], [2, 2], [4, 1], [5, 0],
@@ -321,6 +325,7 @@ class SparseConcatTest(test.TestCase):
                               [1, 1, 2, 1, 1, 1, 2, 3, 4, 2, 1, 0, 2])
           self.assertAllEqual(sp_concat_dim1_out.dense_shape, [3, 13])
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceUnknownShapes(self):
     with self.session(use_gpu=False):
       sp_inputs = [
diff --git a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
index a824d5c826305a04bdc8c96d67837a39ae2dd5de..4a967b656285a1094b8eef17fb0b7f41f83cd8e7 100644
--- a/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
+++ b/tensorflow/python/kernel_tests/sparse_conditional_accumulator_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.platform import test
@@ -98,12 +99,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       attr { key: 'reduction_type' value {s: 'MEAN'} }
       """, q.accumulator_ref.op.node_def)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSizeEmpty(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
           dtypes_lib.float32, name="Q")
       self.assertEqual(q.num_accumulated().eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorSetGlobalStep(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -111,6 +114,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       set_global_step_op = q.set_global_step(1)
       set_global_step_op.run()
 
+  @test_util.run_deprecated_v1
   def testAccumulatorApplyGradFloat32(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -122,6 +126,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 1)
 
+  @test_util.run_deprecated_v1
   def testDtypes(self):
     with self.cached_session() as sess:
       dtypes = [dtypes_lib.float16, dtypes_lib.float32, dtypes_lib.float64]
@@ -140,10 +145,11 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           t = _indexedslice(mat_to_add)
           q.apply_indexed_slices_grad(t).run()
 
-        result = sess.run(q.take_indexed_slices_grad(1))
+        result = self.evaluate(q.take_indexed_slices_grad(1))
 
         self._assertEqual_nparray(sum_elems / len(elems), result, sess)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorMultipleAccumulators(self):
     with self.cached_session() as sess:
       q_f32_0 = data_flow_ops.SparseConditionalAccumulator(
@@ -174,6 +180,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         result = sess.run(accums[i].take_indexed_slices_grad(1))
         self._assertEqual_indexedslices(expected_tensors[i], result)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradMean(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -189,11 +196,12 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[0.5, 0.5], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradSum(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -209,16 +217,18 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual([0, 1, 2], val.indices)
       self.assertAllEqual([[1, 1], [0, 2], [3, 0]], val.values)
       self.assertAllEqual([-1, 2], val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testAccumulatorTakeGradInvalidReductionType(self):
     with self.assertRaises(ValueError):
       data_flow_ops.SparseConditionalAccumulator(
           dtypes_lib.float32, name="Q", shape=(), reduction_type="Invalid")
 
+  @test_util.run_deprecated_v1
   def testAccumulatorRepeatedTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -235,7 +245,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[0.5, 0.5], [0, 2], [3, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
@@ -252,11 +262,12 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
 
       takeg_t = q.take_indexed_slices_grad(1)
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
       self.assertAllEqual(val.indices, [0, 1, 2])
       self.assertAllEqual(val.values, [[5, 5], [0, 20], [30, 0]])
       self.assertAllEqual(val.dense_shape, [-1, 2])
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradMean(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -269,7 +280,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(
@@ -281,13 +292,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = sum(elems) / len(elems)
       self._assertEqual_nparray(
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelApplyGradSum(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -303,7 +315,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       takeg_t = q.take_indexed_slices_grad(1)
 
       def apply_indexed_slices_grad(accum_op):
-        sess.run(accum_op)
+        self.evaluate(accum_op)
 
       threads = [
           self.checkedThread(target=apply_indexed_slices_grad, args=(o,))
@@ -315,13 +327,14 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       for thread in threads:
         thread.join()
 
-      val = sess.run(takeg_t)
+      val = self.evaluate(takeg_t)
 
       expected_val = 550.0
       self._assertEqual_nparray(
           np.array([[expected_val, 0], [0, expected_val]]).astype(np.float32),
           val, sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testParallelTakeGrad(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -338,13 +351,13 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
           time.sleep(1.0)
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       apply_indexed_slices_grad_thread = self.checkedThread(
           target=apply_indexed_slices_grad)
 
       def take_grad():
-        t = sess.run(takeg_t)
+        t = self.evaluate(takeg_t)
         results.append(t)
 
       threads = [self.checkedThread(target=take_grad) for _ in range(10)]
@@ -361,6 +374,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         self._assertEqual_nparray(
             np.array([[0, 0], [elems[i], 0]]), results[i], sess)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorApplyAndBlockingTake(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -378,10 +392,10 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       def apply_indexed_slices_grad():
         for accum_op in accum_ops:
-          sess.run(accum_op)
+          self.evaluate(accum_op)
 
       def take_grad():
-        results.append(sess.run(takeg_t))
+        results.append(self.evaluate(takeg_t))
 
       accum_thread = self.checkedThread(target=apply_indexed_slices_grad)
       takeg_thread = self.checkedThread(target=take_grad)
@@ -394,8 +408,9 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
   def _blocking_takeg(self, sess, takeg_op):
     with self.assertRaisesOpError("was cancelled"):
-      sess.run(takeg_op)
+      self.evaluate(takeg_op)
 
+  @test_util.run_v1_only("b/120545219")
   def testAccumulatorCancel(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -415,6 +430,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
 
       takeg_thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testNonVectorIndices(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -427,6 +443,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[[0, 1], [1, 0]],
             grad_values=np.array([1, 2]).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testZeroDimensionValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -437,6 +454,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
         q.apply_grad(
             grad_indices=[0], grad_values=np.array(1).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongNonEmptyInputValues(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -448,6 +466,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
             grad_indices=[0, 1],
             grad_values=np.array([[0, 1, 1]]).astype(np.float32)).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicNonVectorIndices(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -467,6 +486,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([1, 2]).astype(np.float32)
                  })
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicWrongNonEmptyInputValues(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -485,6 +505,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                      x_values: np.array([[0, 1, 1]]).astype(np.float32)
                  })
 
+  @test_util.run_v1_only("b/120545219")
   def testEmptyShapeApply(self):
     with self.cached_session():
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -510,6 +531,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       q.apply_grad(grad_indices=[0], grad_values=[1.0], grad_shape=[]).run()
       q.apply_grad(grad_indices=[0], grad_values=[1.0]).run()
 
+  @test_util.run_v1_only("b/120545219")
   def testValidateShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -585,7 +607,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                     np.float32)).run()
 
       # After take grad, constraints on accumulated gradient are removed
-      sess.run(q.take_grad(1))
+      self.evaluate(q.take_grad(1))
 
       # First successful gradient imposes new constraints.
       # Hereafter, shape will additionally constrained to [None,2,2,3]
@@ -605,6 +627,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
                 [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]]).astype(np.float32),
             local_step=1).run()
 
+  @test_util.run_deprecated_v1
   def testReturnShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -615,7 +638,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
           grad_values=np.array(
               [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]]]).astype(np.float32)).run()
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.dense_shape, [2, 2, 2, 2])
 
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -627,9 +650,10 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
               [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]]).astype(
                   np.float32)).run()
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.dense_shape, [-1, 2, 2, 3])
 
+  @test_util.run_deprecated_v1
   def testApplyGradtInt32IndicesAndShape(self):
     with self.cached_session() as sess:
       q = data_flow_ops.SparseConditionalAccumulator(
@@ -653,7 +677,7 @@ class IndexedSlicesConditionalAccumulatorTest(test.TestCase):
       accum_op.run()
       self.assertEqual(q.num_accumulated().eval(), 2)
 
-      val = sess.run(q.take_indexed_slices_grad(1))
+      val = self.evaluate(q.take_indexed_slices_grad(1))
       self.assertAllEqual(val.indices, [0, 2])
       self.assertAllEqual(val.values, [[0, 0, 1], [3, 0, 4]])
       self.assertAllEqual(val.dense_shape, [3, 3])
diff --git a/tensorflow/python/kernel_tests/sparse_cross_op_test.py b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
index 6e0714da702a09735ca10f7bb8658ecb25cbe8fb..566bbb56f007eacfd11bce3f1f2d78a8b22755a1 100644
--- a/tensorflow/python/kernel_tests/sparse_cross_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_cross_op_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
 
 class SparseCrossOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_simple(self):
     """Tests a simple scenario."""
     op = sparse_ops.sparse_cross([
@@ -43,8 +45,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_dense(self):
     """Tests only dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -63,8 +66,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_mixed_string_sparse(self):
     """Tests mixed type."""
     op = sparse_ops.sparse_cross([
@@ -77,8 +81,9 @@ class SparseCrossOpTest(test.TestCase):
         '55555_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_mixed_string_dense(self):
     """Tests mixed dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -95,8 +100,9 @@ class SparseCrossOpTest(test.TestCase):
         '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_sparse_cross_dense(self):
     """Tests sparse and dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -112,8 +118,9 @@ class SparseCrossOpTest(test.TestCase):
             'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_integer_sparse_input(self):
     """Tests mixed type sparse and dense inputs."""
     op = sparse_ops.sparse_cross([
@@ -128,8 +135,9 @@ class SparseCrossOpTest(test.TestCase):
             '5555_X_batch2-FC2-F1', '5555_X_batch2-FC2-F2'
         ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_permutation_3x3x3(self):
     """Tests 3x3x3 permutation."""
     op = sparse_ops.sparse_cross([
@@ -170,8 +178,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F3_X_batch1-FC3-F3'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_permutation_3x1x2(self):
     """Tests 3x1x2 permutation."""
     op = sparse_ops.sparse_cross([
@@ -189,8 +198,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F3_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_large_batch(self):
     """Tests with large batch size to force multithreading."""
     batch_size = 5000
@@ -222,8 +232,9 @@ class SparseCrossOpTest(test.TestCase):
 
     expected_out = self._sparse_tensor(col_out)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_one_column_empty(self):
     """Tests when one column is empty.
 
@@ -235,8 +246,9 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([['batch1-FC3-F1', 'batch1-FC3-F2']])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_some_columns_empty(self):
     """Tests when more than one columns are empty.
 
@@ -254,8 +266,9 @@ class SparseCrossOpTest(test.TestCase):
         'batch1-FC1-F2_X_batch1-FC2-F1_X_batch1-FC3-F2'
     ]], 2)
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_all_columns_empty(self):
     """Tests when all columns are empty.
 
@@ -267,8 +280,9 @@ class SparseCrossOpTest(test.TestCase):
         self._sparse_tensor([])
     ])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_empty(sess.run(op))
+      self._assert_sparse_tensor_empty(self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_zero_bucket_no_hash_key(self):
     op = sparse_ops.sparse_cross_hashed([
         self._sparse_tensor([['batch1-FC1-F1']]),
@@ -278,8 +292,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[1971693436396284976]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_zero_bucket(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -291,9 +306,10 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[4847552627144134031]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
   # TODO(sibyl-Aix6ihai): Add benchmark to compare Hashed vs Non-hashed.
+  @test_util.run_deprecated_v1
   def test_hashed_no_hash_key(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -305,8 +321,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[83]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed_output(self):
     op = sparse_ops.sparse_cross_hashed(
         [
@@ -319,8 +336,9 @@ class SparseCrossOpTest(test.TestCase):
     # Check actual hashed output to prevent unintentional hashing changes.
     expected_out = self._sparse_tensor([[31]])
     with self.cached_session() as sess:
-      self._assert_sparse_tensor_equals(expected_out, sess.run(op))
+      self._assert_sparse_tensor_equals(expected_out, self.evaluate(op))
 
+  @test_util.run_deprecated_v1
   def test_hashed__has_no_collision(self):
     """Tests that fingerprint concatenation has no collisions."""
     # Although the last 10 bits of 359 and 1024+359 are identical.
@@ -331,7 +349,7 @@ class SparseCrossOpTest(test.TestCase):
         [t2, t1], num_buckets=1024, hash_key=sparse_ops._DEFAULT_HASH_KEY + 1)
     cross_dense = sparse_ops.sparse_tensor_to_dense(cross)
     with session.Session():
-      values = cross_dense.eval()
+      values = self.evaluate(cross_dense)
       self.assertTrue(numpy.not_equal(values[0], values[1]).all())
 
   def test_hashed_3x1x2(self):
@@ -345,7 +363,7 @@ class SparseCrossOpTest(test.TestCase):
         ],
         num_buckets=1000)
     with self.cached_session() as sess:
-      out = sess.run(op)
+      out = self.evaluate(op)
       self.assertEqual(6, len(out.values))
       self.assertAllEqual([[0, i] for i in range(6)], out.indices)
       self.assertTrue(all(x < 1000 and x >= 0 for x in out.values))
diff --git a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
index 541463e76bbc7b5569bb4deabd86872dd75c9533..2e17a9c608fcf64ad7e8eb48476bbfa0215ce178 100644
--- a/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_matmul_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -58,7 +59,7 @@ class SparseMatMulTest(test.TestCase):
           transpose_b=tr_b,
           a_is_sparse=sp_a,
           b_is_sparse=sp_b)
-      out = tf_ans.eval()
+      out = self.evaluate(tf_ans)
       np_x = math_ops.cast(tf_x, dtypes.float32).eval()
       np_y = math_ops.cast(tf_y, dtypes.float32).eval()
 
@@ -71,6 +72,7 @@ class SparseMatMulTest(test.TestCase):
     self.assertShapeEqual(np_ans, tf_ans)
     self.assertAllCloseAccordingToType(np_ans, out, rtol=1e-4, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x = np.arange(0., 4.).reshape([4, 1]).astype(np.float32)
     y = np.arange(-1., 1.).reshape([1, 2]).astype(np.float32)
@@ -78,6 +80,7 @@ class SparseMatMulTest(test.TestCase):
       for y_dtype in (dtypes.float32, dtypes.bfloat16):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
+  @test_util.run_deprecated_v1
   def testZeroDim(self):
     x = np.ones((4, 0)).astype(np.float32)
     y = np.ones((0, 3)).astype(np.float32)
@@ -85,6 +88,7 @@ class SparseMatMulTest(test.TestCase):
       for y_dtype in (dtypes.float32, dtypes.bfloat16):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
+  @test_util.run_deprecated_v1
   def testEmpty(self):
     x = np.ones((0, 0)).astype(np.float32)
     y = np.ones((0, 0)).astype(np.float32)
@@ -93,6 +97,7 @@ class SparseMatMulTest(test.TestCase):
         self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
   # Tests setting one dimension to be a high value.
+  @test_util.run_deprecated_v1
   def testLarge(self):
     r1 = np.random.randint(6000, 20000)
     r2 = np.random.randint(1, 10)
@@ -105,6 +110,7 @@ class SparseMatMulTest(test.TestCase):
           self._testCpuMatmul(x, y, x_dtype=x_dtype, y_dtype=y_dtype)
 
   # Tests random sized matrices.
+  @test_util.run_deprecated_v1
   def testRandom(self):
     for tr_a in [True, False]:
       for tr_b in [True, False]:
@@ -159,6 +165,7 @@ class MatMulGradientTest(test.TestCase):
               delta=delta))
     self.assertLessEqual(err, delta / 2.)
 
+  @test_util.run_deprecated_v1
   def testGradientInput(self):
     for tr_a in [True, False]:
       for tr_b in [True, False]:
diff --git a/tensorflow/python/kernel_tests/sparse_ops_test.py b/tensorflow/python/kernel_tests/sparse_ops_test.py
index a45ce2e13b471994b7fceece3536ed43ce9add86..7598991489ce6019352e19cb6c50819d91085b0d 100644
--- a/tensorflow/python/kernel_tests/sparse_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
@@ -71,6 +72,7 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtype),
         constant_op.constant(shape, dtypes.int64))
 
+  @test_util.run_deprecated_v1
   def testInt32(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x6(dtypes.int32)
@@ -83,6 +85,7 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
+  @test_util.run_deprecated_v1
   def testInt64(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x6(dtypes.int64)
@@ -95,6 +98,7 @@ class SparseToIndicatorTest(test_util.TensorFlowTestCase):
 
       self.assertAllEqual(output, expected_output)
 
+  @test_util.run_deprecated_v1
   def testHigherRank(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_2x3x4(dtypes.int64)
@@ -154,7 +158,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
                        sparse_tensor.SparseTensor.from_value(values_v)):
           sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-          output = sess.run(sp_output)
+          output = self.evaluate(sp_output)
           self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat32(self):
@@ -163,7 +167,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
@@ -172,7 +176,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt32AndFloat32NonCanonicalOrder(self):
@@ -182,7 +186,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat32NonCanonicalOrder(self):
@@ -192,7 +196,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testInt64AndFloat64NonCanonicalOrder(self):
@@ -203,7 +207,7 @@ class SparseMergeTest(test_util.TensorFlowTestCase):
       sp_output = sparse_ops.sparse_merge(
           indices, values, vocab_size_tensor, already_sorted=True)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsNotSorted(output, vocab_size)
 
   def testShouldSetLastDimensionInDynamicShape(self):
@@ -261,7 +265,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float32)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64(self):
@@ -270,7 +274,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
   def testInt64AndFloat64Shape(self):
@@ -279,7 +283,7 @@ class SparseMergeHighDimTest(test_util.TensorFlowTestCase):
       indices, values = self._SparseTensor_3x50(np.int64, np.float64)
       sp_output = sparse_ops.sparse_merge(indices, values, vocab_size)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
       self._AssertResultsSorted(output, vocab_size)
 
 
@@ -296,13 +300,14 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
   def _SparseTensor_5x6(self):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_5x6())
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.session(use_gpu=False) as sess:
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
         to_retain = np.array([1, 0, 0, 1, 1, 0], dtype=np.bool)
         sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-        output = sess.run(sp_output)
+        output = self.evaluate(sp_output)
 
         self.assertAllEqual(output.indices, [[0, 0], [1, 4], [3, 2]])
         self.assertAllEqual(output.values, [0, 14, 32])
@@ -314,7 +319,7 @@ class SparseRetainTest(test_util.TensorFlowTestCase):
       to_retain = np.zeros((6,), dtype=np.bool)
       sp_output = sparse_ops.sparse_retain(sp_input, to_retain)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, np.array([]).reshape((0, 2)))
       self.assertAllEqual(output.values, [])
@@ -353,38 +358,42 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     return sparse_tensor.SparseTensorValue(self._IND_2_5_6, self._VAL_2_5_6,
                                            self._SHP_2_5_6)
 
+  @test_util.run_deprecated_v1
   def testStaticShapeInfoPreservedWhenNewShapeIsProvidedAndStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 6, 7], dtype=np.int64)
     sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
     self.assertAllEqual([3, 6, 7], sp_output.get_shape())
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testInputUnavailableInGraphConstructionOk(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorValue_2x5x6()
       new_shape = np.array([3, 6, 7], dtype=np.int64)
       sp_output = sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
       self.assertAllEqual(output.values, [0, 10, 13, 14, 32, 33])
       self.assertAllEqual(output.dense_shape, [3, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testFeedInputUnavailableInGraphConstructionOk(self):
     with self.session(use_gpu=False) as sess:
       sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
@@ -404,7 +413,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       sp_input = self._SparseTensor_2x5x6()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices, [[0, 0, 0], [0, 1, 0], [0, 1, 3],
                                            [1, 1, 4], [1, 3, 2], [1, 3, 3]])
@@ -416,12 +425,13 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       sp_input = self._SparseTensor_2x5x6_Empty()
       sp_output = sparse_ops.sparse_reset_shape(sp_input)
 
-      output = sess.run(sp_output)
+      output = self.evaluate(sp_output)
 
       self.assertAllEqual(output.indices.shape, [0, 3])
       self.assertAllEqual(output.values.shape, [0])
       self.assertAllEqual(output.dense_shape, [0, 0, 0])
 
+  @test_util.run_deprecated_v1
   def testInvalidRank(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_2x5x6()
@@ -430,6 +440,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaises(ValueError):
         sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidRankNewShapeUnavailableInGraphConstruction(self):
     with self.session(use_gpu=False) as sess:
       new_shape = array_ops.placeholder(dtype=dtypes.int64)
@@ -439,6 +450,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x == y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: np.array([3, 7], dtype=np.int64)})
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeStatic(self):
     sp_input = self._SparseTensor_2x5x6()
     new_shape = np.array([3, 7, 5], dtype=np.int64)
@@ -446,6 +458,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
     with self.assertRaisesRegexp(ValueError, "should have dimension sizes"):
       sparse_ops.sparse_reset_shape(sp_input, new_shape)
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeDynamic(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x5x6()
@@ -455,6 +468,7 @@ class SparseResetShapeTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("x <= y did not hold element-wise"):
         sess.run(out, feed_dict={new_shape: [3, 7, 5]})
 
+  @test_util.run_deprecated_v1
   def testInvalidDimensionSizeInputUnavailableInGraphConstruction(self):
     sp_input = array_ops.sparse_placeholder(dtype=dtypes.int32)
     with self.session(use_gpu=False) as sess:
@@ -496,6 +510,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         constant_op.constant(val, dtypes.int32),
         constant_op.constant(shape, dtypes.int64))
 
+  @test_util.run_deprecated_v1
   def testFillNumber(self):
     with self.session(use_gpu=False) as sess:
       for sp_input in (self._SparseTensorValue_5x6(), self._SparseTensor_5x6()):
@@ -513,6 +528,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(empty_row_indicator_out,
                             np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
+  @test_util.run_deprecated_v1
   def testFillFloat(self):
     with self.session(use_gpu=False) as sess:
       values = constant_op.constant(
@@ -547,6 +563,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertGreater(default_value_grad_err, 0)
       self.assertLess(default_value_grad_err, 1e-8)
 
+  @test_util.run_deprecated_v1
   def testFillString(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_String5x6()
@@ -565,6 +582,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
       self.assertAllEqual(empty_row_indicator_out,
                           np.array([0, 0, 1, 0, 1]).astype(np.bool))
 
+  @test_util.run_deprecated_v1
   def testNoEmptyRows(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensor_2x6()
@@ -582,6 +600,7 @@ class SparseFillEmptyRowsTest(test_util.TensorFlowTestCase):
 
 class SparseAddTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testValuesInVariable(self):
     indices = constant_op.constant([[1]], dtype=dtypes.int64)
     values = variables.Variable([1], trainable=False, dtype=dtypes.float32)
@@ -591,8 +610,8 @@ class SparseAddTest(test_util.TensorFlowTestCase):
     sp_output = sparse_ops.sparse_add(sp_input, sp_input)
 
     with self.session(use_gpu=False) as sess:
-      sess.run(variables.global_variables_initializer())
-      output = sess.run(sp_output)
+      self.evaluate(variables.global_variables_initializer())
+      output = self.evaluate(sp_output)
       self.assertAllEqual(output.values, [2])
 
 
@@ -635,7 +654,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       else:
         tf_dense_ans = sparse_ops.sparse_reduce_max(sp_t, reduction_axes,
                                                     keep_dims)
-      out_dense = tf_dense_ans.eval()
+      out_dense = self.evaluate(tf_dense_ans)
 
       if do_sum:
         tf_sparse_ans = sparse_ops.sparse_reduce_sum_sparse(sp_t,
@@ -657,6 +676,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
     self._compare(sp_t, reduction_axes, ndims, True, False)
     self._compare(sp_t, reduction_axes, ndims, True, True)
 
+  @test_util.run_deprecated_v1
   def testSimpleAndRandomInputs(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -696,6 +716,7 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
       with self.assertRaisesOpError("Invalid reduction dimension 2"):
         sparse_ops.sparse_reduce_max(sp_t, 2).eval()
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -710,18 +731,59 @@ class SparseReduceTest(test_util.TensorFlowTestCase):
           axes = np.random.choice(len(dims), size=d, replace=False).tolist()
           reduced = sparse_ops.sparse_reduce_sum(sp_t, axes)
 
-          err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                        reduced,
-                                                        reduced.eval().shape)
+          err = gradient_checker.compute_gradient_error(
+              sp_t.values, (nnz,), reduced,
+              self.evaluate(reduced).shape)
           self.assertLess(err, 1e-3)
 
         # Tests for negative axes.
         reduced = sparse_ops.sparse_reduce_sum(sp_t, -1)
-        err = gradient_checker.compute_gradient_error(sp_t.values, (nnz,),
-                                                      reduced,
-                                                      reduced.eval().shape)
+        err = gradient_checker.compute_gradient_error(
+            sp_t.values, (nnz,), reduced,
+            self.evaluate(reduced).shape)
         self.assertLess(err, 1e-3)
 
+  def _testSparseReduceShape(self, sp_t, reduction_axes, ndims, keep_dims,
+                             do_sum):
+    densified = sparse_ops.sparse_tensor_to_dense(sp_t).eval()
+
+    np_op = np.sum
+    tf_op = sparse_ops.sparse_reduce_sum
+    if not do_sum:
+      np_op = np.max
+      tf_op = sparse_ops.sparse_reduce_max
+
+    np_ans = densified
+    if reduction_axes is None:
+      np_ans = np_op(np_ans, keepdims=keep_dims)
+    else:
+      if not isinstance(reduction_axes, list):  # Single scalar.
+        reduction_axes = [reduction_axes]
+      reduction_axes = np.array(reduction_axes).astype(np.int32)
+      # Handles negative axes.
+      reduction_axes = (reduction_axes + ndims) % ndims
+      # Loop below depends on sorted.
+      reduction_axes.sort()
+      for ra in reduction_axes.ravel()[::-1]:
+        np_ans = np_op(np_ans, axis=ra, keepdims=keep_dims)
+
+    tf_ans = tf_op(sp_t, reduction_axes, keep_dims)
+    self.assertAllEqual(np_ans.shape, tf_ans.get_shape().as_list())
+
+  def testSparseReduceSumOrMaxShape(self):
+    sp_t = sparse_tensor.SparseTensor(self.ind, self.vals, self.dense_shape)
+
+    with self.session(use_gpu=False):
+      for do_sum in [True, False]:
+        for keep_dims in [True, False]:
+          self._testSparseReduceShape(sp_t, None, 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, 0, 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [0, 1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1, 0], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [-1], 2, keep_dims, do_sum)
+          self._testSparseReduceShape(sp_t, [1, -2], 2, keep_dims, do_sum)
+
 
 class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
@@ -737,6 +799,20 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
                                                result_tensor.values).eval()
     self.assertAllEqual(result_np, res_densified)
 
+  @test_util.run_deprecated_v1
+  def testCwiseShapeValidation(self):
+    # Test case for GitHub 24072.
+    with self.session(use_gpu=False):
+      a = array_ops.ones([3, 4, 1], dtype=dtypes.int32)
+      b = sparse_tensor.SparseTensor([[0, 0, 1, 0], [0, 0, 3, 0]], [10, 20],
+                                     [1, 1, 4, 2])
+      c = a * b
+      with self.assertRaisesRegexp(
+          errors.InvalidArgumentError,
+          "broadcasts dense to sparse only; got incompatible shapes"):
+        c.eval()
+
+  @test_util.run_deprecated_v1
   def testCwiseDivAndMul(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
@@ -760,6 +836,7 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
             res = sp_t / dense_t  # should invoke "__truediv__"
             self.assertEqual(res.values.eval().dtype, np.float64)
 
+  @test_util.run_deprecated_v1
   def testCwiseAdd(self):
     with self.session(use_gpu=False):
       # Identity(2) + AllOnes(2,2).  Should be equal to 2 * Identity(2).
@@ -779,6 +856,7 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
           sparse_ops.sparse_dense_cwise_add(sp_t, dense_t),
           np.identity(2) * 2, sp_t)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(1618)
     sp_shapes = [(10, 10, 10), (5, 5), (1618,), (3, 3, 7)]
@@ -812,6 +890,7 @@ class SparseMathOpsTest(test_util.TensorFlowTestCase):
 
 class SparseSoftmaxTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testEquivalentToDensified(self):
     np.random.seed(1618)
     n, m = np.random.choice(20, size=2)
@@ -831,6 +910,7 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
 
         self.assertAllClose(dense_result.eval(), sp_result)
 
+  @test_util.run_deprecated_v1
   def testHigherRanks(self):
     # For the first shape:
     # First batch:
@@ -860,6 +940,7 @@ class SparseSoftmaxTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(sp_t.indices.eval(), result.indices)
         self.assertAllEqual(shape, result.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [2, 5, 10]
     with self.cached_session(use_gpu=False):
@@ -879,6 +960,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
     self.assertAllEqual(a.values, b.values)
     self.assertAllEqual(a.dense_shape, b.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     with self.cached_session(use_gpu=False):
       # 1-D, values at index 0.
@@ -898,6 +980,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
       self._assertSparseTensorValueEqual(expected.eval(), max_tf)
       self._assertSparseTensorValueEqual(expected.eval(), min_tf)
 
+  @test_util.run_deprecated_v1
   def testRandom(self):
     np.random.seed(1618)
     shapes = [(13,), (6, 8), (1, 7, 1)]
@@ -939,6 +1022,7 @@ class SparseMinimumMaximumTest(test_util.TensorFlowTestCase):
 
 class SparseTransposeTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testTranspose(self):
     if np.__version__ == "1.13.0":
       self.skipTest("numpy 1.13.0 bug")
@@ -961,16 +1045,19 @@ class SparseTransposeTest(test.TestCase):
 
 class SparsePlaceholderTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testPlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=(10, 47))
     self.assertAllEqual([10, 47], foo.get_shape())
     self.assertAllEqual([None, 2], foo.indices.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testPartialShapePlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=(None, 47))
     self.assertAllEqual([None, None], foo.get_shape().as_list())
     self.assertAllEqual([None, 2], foo.indices.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testNoShapePlaceholder(self):
     foo = array_ops.sparse_placeholder(dtypes.float32, shape=None)
     self.assertAllEqual(None, foo.get_shape())
diff --git a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
index 7b83ae51779143377ef3ca6b9c909731f7829ca9..93fcc6a18e615d43d8145633e5720ce785ad017c 100644
--- a/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reorder_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
@@ -60,11 +61,12 @@ class SparseReorderTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       sp_output = sparse_ops.sparse_reorder(input_val)
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedAlreadyInOrder(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -83,12 +85,13 @@ class SparseReorderTest(test.TestCase):
         input_val = self._SparseTensorValue_5x6(np.random.permutation(6))
         sp_output = sparse_ops.sparse_reorder(input_val)
 
-        output_val = sess.run(sp_output)
+        output_val = self.evaluate(sp_output)
         self.assertAllEqual(output_val.indices, expected_output_val.indices)
         self.assertAllEqual(output_val.values, expected_output_val.values)
         self.assertAllEqual(output_val.dense_shape,
                             expected_output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedOutOfOrder(self):
     expected_output_val = self._SparseTensorValue_5x6(np.arange(6))
     with self.session(use_gpu=False) as sess:
@@ -103,6 +106,7 @@ class SparseReorderTest(test.TestCase):
         self.assertAllEqual(output_val.dense_shape,
                             expected_output_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     with self.session(use_gpu=False):
       for _ in range(5):  # To test various random permutations
diff --git a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
index f7be397c333e3012e994a942b3428b92b0f7c54d..9341228d57e6cea8651b13c70f53ebd229c65b7e 100644
--- a/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_reshape_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -64,12 +65,14 @@ class SparseReshapeTest(test.TestCase):
     sp_output = sparse_ops.sparse_reshape(sp_input, shape=(2, -1))
     self.assertAllEqual((2, 3 * 4), sp_output.get_shape())
 
+  @test_util.run_deprecated_v1
   def testRaisesIfMoreThanOneInferredDim(self):
     sp_input = sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_2x3x4())
     with self.assertRaisesRegexp(ValueError, "At most one dimension can"):
       sparse_ops.sparse_reshape(sp_input, shape=(-1, 2, -1))
 
+  @test_util.run_deprecated_v1
   def testRaisesIfInferredShapeNotPossible(self):
     sp_input = sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_2x3x4())
@@ -81,11 +84,12 @@ class SparseReshapeTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [5, 6])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices, input_val.indices)
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedSameShape(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -97,6 +101,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testWorksWellWithTfShape(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -109,6 +114,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedSameShapeWithInferredDim(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -120,6 +126,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, input_val.dense_shape)
 
+  @test_util.run_deprecated_v1
   def testFeedNewShapeSameRank(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -133,6 +140,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [3, 10])
 
+  @test_util.run_deprecated_v1
   def testFeedNewShapeSameRankWithInferredDim(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -151,13 +159,14 @@ class SparseReshapeTest(test.TestCase):
       input_val = self._SparseTensorValue_5x6()
       sp_output = sparse_ops.sparse_reshape(input_val, [2, 3, 5])
 
-      output_val = sess.run(sp_output)
+      output_val = self.evaluate(sp_output)
       self.assertAllEqual(output_val.indices,
                           np.array([[0, 0, 0], [0, 1, 1], [0, 1, 4], [0, 2, 0],
                                     [1, 1, 0], [1, 1, 1]]))
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedUpRank(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -171,6 +180,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedUpRankWithInferredDim(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -184,6 +194,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [2, 3, 5])
 
+  @test_util.run_deprecated_v1
   def testFeedDownRank(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -197,6 +208,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [6, 4])
 
+  @test_util.run_deprecated_v1
   def testFeedDownRankWithInferredDim(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -210,6 +222,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertAllEqual(output_val.values, input_val.values)
       self.assertAllEqual(output_val.dense_shape, [6, 4])
 
+  @test_util.run_deprecated_v1
   def testFeedMultipleInferredDims(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -218,12 +231,14 @@ class SparseReshapeTest(test.TestCase):
       with self.assertRaisesOpError("only one output dimension may be -1"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testProvideStaticallyMismatchedSizes(self):
     input_val = self._SparseTensorValue_5x6()
     sp_input = sparse_tensor.SparseTensor.from_value(input_val)
     with self.assertRaisesRegexp(ValueError, "Cannot reshape"):
       sparse_ops.sparse_reshape(sp_input, [4, 7])
 
+  @test_util.run_deprecated_v1
   def testFeedMismatchedSizes(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -233,6 +248,7 @@ class SparseReshapeTest(test.TestCase):
           "Input to reshape is a tensor with 30 dense values"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testFeedMismatchedSizesWithInferredDim(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -241,6 +257,7 @@ class SparseReshapeTest(test.TestCase):
       with self.assertRaisesOpError("requested shape requires a multiple"):
         sess.run(sp_output, {sp_input: input_val})
 
+  @test_util.run_deprecated_v1
   def testFeedPartialShapes(self):
     with self.session(use_gpu=False):
       # Incorporate new rank into shape information if known
@@ -266,6 +283,7 @@ class SparseReshapeTest(test.TestCase):
       self.assertListEqual(sp_output.indices.get_shape().as_list(), [5, None])
       self.assertListEqual(sp_output.dense_shape.get_shape().as_list(), [None])
 
+  @test_util.run_deprecated_v1
   def testFeedDenseReshapeSemantics(self):
     with self.session(use_gpu=False) as sess:
       # Compute a random rank-5 initial shape and new shape, randomly sparsify
diff --git a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
index b24a08696991dda7051f1348c4afc1675362b6d8..5a48eb825dbfa8231062be2d2db33fc0756a690f 100644
--- a/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_serialization_ops_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -73,7 +74,7 @@ class SerializeSparseTest(test.TestCase):
       serialized = serialize_fn(sp_input, out_type=out_type)
       sp_deserialized = deserialize_fn(serialized, dtype=dtypes.int32)
 
-      indices, values, shape = sess.run(sp_deserialized)
+      indices, values, shape = self.evaluate(sp_deserialized)
 
       self.assertAllEqual(indices, sp_input[0])
       self.assertAllEqual(values, sp_input[1])
@@ -110,14 +111,17 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeBatch(self):
     self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeManyBatch(self):
     self._testSerializeDeserializeBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeBatch(self):
     self._testSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse,
@@ -145,10 +149,12 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeBatchInconsistentShape(self):
     self._testSerializeDeserializeBatchInconsistentShapeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeBatchInconsistentShape(self):
     self._testSerializeDeserializeBatchInconsistentShapeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -188,10 +194,12 @@ class SerializeSparseTest(test.TestCase):
 
       self.assertAllEqual(combined_shape, [2, 2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testSerializeDeserializeNestedBatch(self):
     self._testSerializeDeserializeNestedBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeNestedBatch(self):
     self._testSerializeDeserializeNestedBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -224,14 +232,17 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testFeedSerializeDeserializeBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                                   sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testFeedSerializeDeserializeManyBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testFeedVariantSerializeDeserializeBatch(self):
     self._testFeedSerializeDeserializeBatchHelper(sparse_ops.serialize_sparse,
                                                   sparse_ops.deserialize_sparse,
@@ -256,6 +267,7 @@ class SerializeSparseTest(test.TestCase):
           })
       self.assertEqual(serialized_value.shape, (4, 3))
 
+  @test_util.run_deprecated_v1
   def testSerializeManyShape(self):
     self._testSerializeManyShapeHelper(sparse_ops.serialize_many_sparse)
 
@@ -287,19 +299,23 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserializeBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testSerializeManyDeserializeManyBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeManyDeserializeBatch(self):
     self._testSerializeManyDeserializeBatchHelper(
         sparse_ops.serialize_many_sparse, sparse_ops.deserialize_sparse,
         dtypes.variant)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeScalar(self):
     with self.session(use_gpu=False) as sess:
       indices_value = np.array([[]], dtype=np.int64)
@@ -321,6 +337,7 @@ class SerializeSparseTest(test.TestCase):
       self.assertAllEqual(deserialized_value.values, values_value)
       self.assertAllEqual(deserialized_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testVariantSerializeDeserializeScalarBatch(self):
     with self.session(use_gpu=False) as sess:
       indices_value = np.array([[]], dtype=np.int64)
@@ -367,14 +384,17 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantDeserializeFailsWrongType(self):
     self._testDeserializeFailsWrongTypeHelper(sparse_ops.serialize_sparse,
                                               sparse_ops.deserialize_sparse,
@@ -402,14 +422,17 @@ class SerializeSparseTest(test.TestCase):
                  {sp_input0: input0_val,
                   sp_input1: input1_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
 
+  @test_util.run_deprecated_v1
   def testVariantDeserializeFailsInconsistentRank(self):
     self._testDeserializeFailsInconsistentRankHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_sparse,
@@ -431,10 +454,12 @@ class SerializeSparseTest(test.TestCase):
       with self.assertRaisesOpError(r"Could not parse serialized proto"):
         sess.run(sp_deserialized, {sp_input0: input0_val})
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInvalidProto(self):
     self._testDeserializeFailsInvalidProtoHelper(sparse_ops.serialize_sparse,
                                                  sparse_ops.deserialize_sparse)
 
+  @test_util.run_deprecated_v1
   def testDeserializeManyFailsInvalidProto(self):
     self._testDeserializeFailsInvalidProtoHelper(
         sparse_ops.serialize_sparse, sparse_ops.deserialize_many_sparse)
diff --git a/tensorflow/python/kernel_tests/sparse_slice_op_test.py b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
index 098353741f3cd6a606c314a2fba704023a4a1bda..7f8c91bde6748369211f66b50ed253cdcd513a2a 100644
--- a/tensorflow/python/kernel_tests/sparse_slice_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_slice_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
 import tensorflow.python.ops.sparse_grad  # pylint: disable=unused-import
@@ -79,6 +80,7 @@ class SparseSliceOpTest(test.TestCase):
     return sparse_tensor.SparseTensor.from_value(
         self._SparseTensorValue_3x4x2())
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixRows(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
@@ -96,6 +98,7 @@ class SparseSliceOpTest(test.TestCase):
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensor1.dense_shape.eval(), [2, 6])
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixUnevenCols(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x7()
@@ -137,6 +140,7 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor3.values.eval(), [16, 46])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [5, 1])
 
+  @test_util.run_deprecated_v1
   def testSliceMatrixUnevenRows(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_5x7()
@@ -173,6 +177,7 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor2.dense_shape.eval(), [1, 7])
     return
 
+  @test_util.run_deprecated_v1
   def testSliceAllRows(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
@@ -195,6 +200,7 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sp_tensor3.values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensor3.dense_shape.eval(), [1, 6])
 
+  @test_util.run_deprecated_v1
   def testSliceColumns(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
@@ -215,6 +221,7 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor2.values.eval(), [4, 5, 14, 25, 35])
       self.assertAllEqual(sparse_tensor2.dense_shape.eval(), [4, 2])
 
+  @test_util.run_deprecated_v1
   def testSliceAllColumns(self):
     with self.session(use_gpu=False):
       sp_input = self._SparseTensor_4x6()
@@ -246,6 +253,7 @@ class SparseSliceOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensor5.values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensor5.dense_shape.eval(), [4, 1])
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     sp_input = self._SparseTensor_4x6(val_dtype=np.float32)
     start_and_size = [([0, 0], [4, 2]),
diff --git a/tensorflow/python/kernel_tests/sparse_split_op_test.py b/tensorflow/python/kernel_tests/sparse_split_op_test.py
index 95661ded4bedd797677b9786b05d9670fbfa4a39..f4bb7498b02f91abb2f93fb16a7e77b65e27257f 100644
--- a/tensorflow/python/kernel_tests/sparse_split_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_split_op_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
 
@@ -75,6 +76,7 @@ class SparseSplitOpTest(test.TestCase):
     return sparse_tensor.SparseTensor.from_value(self._SparseTensorValue_3x4x2(
     ))
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixRows(self):
     with self.session(use_gpu=False):
       sp_tensors = sparse_ops.sparse_split(
@@ -92,6 +94,7 @@ class SparseSplitOpTest(test.TestCase):
                           [20, 23, 25, 30, 32, 33, 35])
       self.assertAllEqual(sp_tensors[1].dense_shape.eval(), [2, 6])
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixUnevenCols(self):
     with self.session(use_gpu=False):
       sp_tensors_3 = sparse_ops.sparse_split(
@@ -131,6 +134,7 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors_4[3].values.eval(), [16, 46])
       self.assertAllEqual(sp_tensors_4[3].dense_shape.eval(), [5, 1])
 
+  @test_util.run_deprecated_v1
   def testSplitMatrixUnevenRows(self):
     with self.session(use_gpu=False):
       sp_tensors_2 = sparse_ops.sparse_split(
@@ -167,6 +171,7 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors_3[2].dense_shape.eval(), [1, 7])
     return
 
+  @test_util.run_deprecated_v1
   def testSplitAllRows(self):
     with self.session(use_gpu=False):
       sp_tensors = sparse_ops.sparse_split(
@@ -189,6 +194,7 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sp_tensors[3].values.eval(), [30, 32, 33, 35])
       self.assertAllEqual(sp_tensors[3].dense_shape.eval(), [1, 6])
 
+  @test_util.run_deprecated_v1
   def testSplitColumns(self):
     with self.session(use_gpu=False):
       sparse_tensors = sparse_ops.sparse_split(
@@ -207,6 +213,7 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensors[2].values.eval(), [4, 5, 14, 25, 35])
       self.assertAllEqual(sparse_tensors[2].dense_shape.eval(), [4, 2])
 
+  @test_util.run_deprecated_v1
   def testSplitAllColumns(self):
     with self.session(use_gpu=False):
       sparse_tensors = sparse_ops.sparse_split(
@@ -234,6 +241,7 @@ class SparseSplitOpTest(test.TestCase):
       self.assertAllEqual(sparse_tensors[5].values.eval(), [5, 25, 35])
       self.assertAllEqual(sparse_tensors[5].dense_shape.eval(), [4, 1])
 
+  @test_util.run_deprecated_v1
   def testSliceConcat(self):
     for sp_input in (self._SparseTensorValue_3x4x2(),
                      self._SparseTensor_3x4x2()):
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
index b8f33d6a813ad34a9bd0b383a558f0ee810a430d..fa2bab1fca68000ec54c93bc9cb2ab1cf5b98a4f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import sparse_ops
 import tensorflow.python.ops.sparse_grad  # pylint: disable=unused-import
@@ -89,6 +90,7 @@ class SparseTensorDenseMatMulGradientTest(test.TestCase):
         self._testGradients(adjoint_a, adjoint_b, name, values_dtype,
                             indices_dtype)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
     self._testGradientsType(np.float32, np.int64)
diff --git a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
index fe334045afe5ff4d034528c815e3485e1e98f8f9..637cfaec9907a59f7559053792e513739aad293f 100644
--- a/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensor_dense_matmul_op_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
@@ -80,7 +81,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
       self.assertEqual(tf_value_ans.get_shape()[1], np_ans.shape[1])
       self.assertEqual(tf_tensor_ans.get_shape()[1], np_ans.shape[1])
 
-      for out in (tf_value_ans.eval(), tf_tensor_ans.eval()):
+      for out in (tf_value_ans.eval(), self.evaluate(tf_tensor_ans)):
         if x.dtype == np.float32:
           self.assertAllClose(np_ans, out, rtol=1e-4, atol=1e-4)
         elif x.dtype == np.float64:
@@ -96,6 +97,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
 
     self._testMatmul(x, y, indices_dtype=indices_dtype)
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     np.random.seed(127)  # Repeatable results
     self._testBasic(np.int32)
@@ -106,6 +108,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     self._testBasic(np.int32, indices_dtype=np.int32)
     self._testBasic(np.float32, indices_dtype=np.int32)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     x = np.random.rand(10, 10)
     x[np.abs(x) < 0.5] = 0  # Make it sparse
@@ -229,6 +232,7 @@ class SparseTensorDenseMatMulTest(test.TestCase):
     self._testLarge(np.complex128)
 
   # Tests random sized matrices.
+  @test_util.run_deprecated_v1
   def testFloatRandom(self):
     np.random.seed(127)  # Repeatable results
     for _ in range(8):
diff --git a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
index e08464a701c98f24d4ca977b824cdf1e7c329763..6039ff1afa74ed3d56dd3974bd10312c4c9870ca 100644
--- a/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
+++ b/tensorflow/python/kernel_tests/sparse_tensors_map_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor as sparse_tensor_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops import variables
@@ -76,6 +77,7 @@ class SparseTensorsMapTest(test.TestCase):
     shape = np.array([3, 4, 5]).astype(np.int64)
     return sparse_tensor_lib.SparseTensorValue(ind, val, shape)
 
+  @test_util.run_deprecated_v1
   def testAddTakeMany(self):
     with self.session(graph=ops.Graph(), use_gpu=False) as sess:
       sp_input0 = self._SparseTensorValue_5x6(np.arange(6))
@@ -88,7 +90,7 @@ class SparseTensorsMapTest(test.TestCase):
       sp_out = take_many_sparse_from_tensors_map(
           sparse_map_op=handle0.op, sparse_handles=handles_concat)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_out)
+      combined_indices, combined_values, combined_shape = self.evaluate(sp_out)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], sp_input0[0])
@@ -98,6 +100,7 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], sp_input1[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testFeedAddTakeMany(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -114,7 +117,8 @@ class SparseTensorsMapTest(test.TestCase):
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=sparse_handles)
 
-      combined_indices, combined_values, combined_shape = sess.run(sp_roundtrip)
+      combined_indices, combined_values, combined_shape = self.evaluate(
+          sp_roundtrip)
 
       self.assertAllEqual(combined_indices[:6, 0], [0] * 6)  # minibatch 0
       self.assertAllEqual(combined_indices[:6, 1:], input0_val[0])
@@ -124,6 +128,7 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(combined_values[6:], input1_val[1])
       self.assertAllEqual(combined_shape, [2, 5, 6])
 
+  @test_util.run_deprecated_v1
   def testAddManyTakeManyRoundTrip(self):
     with self.session(use_gpu=False) as sess:
       # N == 4 because shape_value == [4, 5]
@@ -146,6 +151,7 @@ class SparseTensorsMapTest(test.TestCase):
       self.assertAllEqual(roundtrip_value.values, values_value)
       self.assertAllEqual(roundtrip_value.dense_shape, shape_value)
 
+  @test_util.run_deprecated_v1
   def testDeserializeFailsInconsistentRank(self):
     with self.session(use_gpu=False) as sess:
       sp_input = self._SparseTensorPlaceholder()
@@ -165,19 +171,20 @@ class SparseTensorsMapTest(test.TestCase):
       with self.assertRaisesOpError(
           r"Inconsistent rank across SparseTensors: rank prior to "
           r"SparseTensor\[1\] was: 3 but rank of SparseTensor\[1\] is: 4"):
-        sess.run(sp_roundtrip)
+        self.evaluate(sp_roundtrip)
 
+  @test_util.run_deprecated_v1
   def testTakeManyFailsWrongInputOp(self):
     with self.session(use_gpu=False) as sess:
       input_val = self._SparseTensorValue_5x6(np.arange(6))
       handle = add_sparse_to_tensors_map(input_val)
-      handle_value = sess.run(handle)
+      handle_value = self.evaluate(handle)
       bad_handle = handle_value + 10
       sp_roundtrip = take_many_sparse_from_tensors_map(
           sparse_map_op=handle.op, sparse_handles=[handle_value, bad_handle])
 
       with self.assertRaisesOpError(r"Unable to find SparseTensor: 10"):
-        sess.run(sp_roundtrip)
+        self.evaluate(sp_roundtrip)
 
 
 class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
@@ -212,8 +219,8 @@ class BenchmarkSparseTensorsMapVsSerialization(test.Benchmark):
 
         variables.global_variables_initializer().run()
 
-        st_roundtrip_values = sess.run(st_roundtrip)
-        st_deserialized_values = sess.run(st_deserialized)
+        st_roundtrip_values = self.evaluate(st_roundtrip)
+        st_deserialized_values = self.evaluate(st_deserialized)
         np.testing.assert_equal(st_roundtrip_values.values,
                                 st_deserialized_values.values)
         np.testing.assert_equal(st_roundtrip_values.indices,
diff --git a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
index 7f63532e10de88b65f6c7fecb2eaf4f42d6519e4..c6c45db4f9ac50d6986516fc18860d162b70b29c 100644
--- a/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
+++ b/tensorflow/python/kernel_tests/sparse_to_dense_op_py_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import test
@@ -41,36 +42,42 @@ def _SparseToDense(sparse_indices,
 
 class SparseToDenseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1, 0).eval()
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1.0, 0.0).eval()
     np_ans = np.array([0, 1, 0, 1, 0]).astype(np.float32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testString(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], "a", "b").eval()
     np_ans = np.array(["b", "a", "b", "a", "b"]).astype(np.string_)
     self.assertAllEqual(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSetValue(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], [1, 2], -1).eval()
     np_ans = np.array([-1, 1, -1, 2, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testSetSingleValue(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([1, 3], [5], 1, -1).eval()
     np_ans = np.array([-1, 1, -1, 1, -1]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def test2d(self):
     # pylint: disable=bad-whitespace
     with self.session(use_gpu=False):
@@ -80,11 +87,13 @@ class SparseToDenseTest(test.TestCase):
                        [ 1, -1, -1, -1]]).astype(np.int32)
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testZeroDefault(self):
     with self.cached_session():
       x = sparse_ops.sparse_to_dense(2, [4], 7).eval()
       self.assertAllEqual(x, [0, 0, 7, 0])
 
+  @test_util.run_deprecated_v1
   def test3d(self):
     with self.session(use_gpu=False):
       tf_ans = _SparseToDense([[1, 3, 0], [2, 0, 1]], [3, 4, 2], 1, -1).eval()
@@ -93,32 +102,37 @@ class SparseToDenseTest(test.TestCase):
     np_ans[2, 0, 1] = 1
     self.assertAllClose(np_ans, tf_ans)
 
+  @test_util.run_deprecated_v1
   def testBadShape(self):
     with self.cached_session():
       with self.assertRaisesWithPredicateMatch(ValueError, "must be rank 1"):
         _SparseToDense([1, 3], [[5], [3]], 1, -1)
 
+  @test_util.run_deprecated_v1
   def testBadValue(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [[5], [3]], -1)
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[2,1\], "
           r"should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testBadNumValues(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2, 3], -1)
       with self.assertRaisesOpError(
           r"sparse_values has incorrect shape \[3\], should be \[\] or \[2\]"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testBadDefault(self):
     with self.cached_session():
       dense = _SparseToDense([1, 3], [5], [1, 2], [0])
       with self.assertRaisesOpError("default_value should be a scalar"):
-        dense.eval()
+        self.evaluate(dense)
 
+  @test_util.run_deprecated_v1
   def testOutOfBoundsIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -128,7 +142,7 @@ class SparseToDenseTest(test.TestCase):
           default_value=0.0)
       with self.assertRaisesOpError(
           r"indices\[1\] = \[10\] is out of bounds: need 0 <= index < \[5\]"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks, the allocation should still fail.
       with self.assertRaisesOpError("out of bounds"):
         dense_without_validation = _SparseToDense(
@@ -137,8 +151,9 @@ class SparseToDenseTest(test.TestCase):
             sparse_values=[-1.0, 1.0],
             default_value=0.0,
             validate_indices=False)
-        dense_without_validation.eval()
+        self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testRepeatingIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -147,7 +162,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is repeated"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[1], [1]],
@@ -155,8 +170,9 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testUnsortedIndicesWithWithoutValidation(self):
     with self.cached_session():
       dense = _SparseToDense(
@@ -165,7 +181,7 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0)
       with self.assertRaisesOpError(r"indices\[1\] = \[1\] is out of order"):
-        dense.eval()
+        self.evaluate(dense)
       # Disable checks
       dense_without_validation = _SparseToDense(
           sparse_indices=[[2], [1]],
@@ -173,8 +189,9 @@ class SparseToDenseTest(test.TestCase):
           sparse_values=[-1.0, 1.0],
           default_value=0.0,
           validate_indices=False)
-      dense_without_validation.eval()
+      self.evaluate(dense_without_validation)
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceKnownShape(self):
     with self.session(use_gpu=False):
       indices = array_ops.placeholder(dtypes.int64)
@@ -187,6 +204,7 @@ class SparseToDenseTest(test.TestCase):
       output = sparse_ops.sparse_to_dense(indices, shape, 1, 0)
       self.assertEqual(output.get_shape().as_list(), [None, None, None])
 
+  @test_util.run_deprecated_v1
   def testShapeInferenceUnknownShape(self):
     with self.session(use_gpu=False):
       indices = array_ops.placeholder(dtypes.int64)
diff --git a/tensorflow/python/kernel_tests/sparse_xent_op_test.py b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
index 0510bc5321445e3db0dcfef169100fbc4dd013da..8f0842f7f50c61ea23361fa255ae45deae2ebfc1 100644
--- a/tensorflow/python/kernel_tests/sparse_xent_op_test.py
+++ b/tensorflow/python/kernel_tests/sparse_xent_op_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops as ops_lib
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -66,7 +67,7 @@ class SparseXentTest(test.TestCase):
     with self.cached_session(use_gpu=True) as sess:
       loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
           np_features, np_labels)
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
@@ -76,10 +77,11 @@ class SparseXentTest(test.TestCase):
         loss, backprop = gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(np.float32),
             np.array([0, 0, 0]).astype(label_dtype))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllClose([0.0, 0.0, 0.0], tf_loss)
       self.assertAllClose([[0.0], [0.0], [0.0]], tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testInvalidLabel(self):
     features = [[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 2., 3., 4.],
                 [1., 2., 3., 4.]]
@@ -90,7 +92,7 @@ class SparseXentTest(test.TestCase):
         loss, backprop = (
             gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
                 features, labels))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
         self.assertAllClose(
             [[np.nan] * 4, [0.25, 0.25, 0.25, -0.75],
              [-0.968, 0.087, 0.237, 0.6439], [np.nan] * 4],
@@ -104,7 +106,7 @@ class SparseXentTest(test.TestCase):
       loss, backprop = (
           gen_nn_ops.sparse_softmax_cross_entropy_with_logits(features, labels))
       with self.assertRaisesOpError("Received a label value of"):
-        sess.run([loss, backprop])
+        self.evaluate([loss, backprop])
 
   def testNpXent(self):
     # We create 2 batches of logits for testing.
@@ -152,6 +154,7 @@ class SparseXentTest(test.TestCase):
         nn_ops.sparse_softmax_cross_entropy_with_logits(
             labels=constant_op.constant(0), logits=constant_op.constant(1.0))
 
+  @test_util.run_deprecated_v1
   def testLabelsPlaceholderScalar(self):
     with self.session(use_gpu=True):
       labels = array_ops.placeholder(np.int32)
@@ -164,7 +167,7 @@ class SparseXentTest(test.TestCase):
     with self.session(use_gpu=True):
       loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=constant_op.constant(0), logits=constant_op.constant([1.0]))
-      self.assertAllClose(0.0, loss.eval())
+      self.assertAllClose(0.0, self.evaluate(loss))
 
   def testFloat(self):
     for label_dtype in np.int32, np.int64:
@@ -187,6 +190,7 @@ class SparseXentTest(test.TestCase):
   def testEmpty(self):
     self._testXent(np.zeros((0, 3)), np.zeros((0,), dtype=np.int32))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.session(use_gpu=True):
       l = constant_op.constant([3, 0, 1], name="l")
@@ -201,6 +205,7 @@ class SparseXentTest(test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testSecondGradient(self):
     images_placeholder = array_ops.placeholder(dtypes.float32, shape=(3, 2))
     labels_placeholder = array_ops.placeholder(dtypes.int32, shape=(3))
@@ -226,21 +231,24 @@ class SparseXentTest(test.TestCase):
       loss = nn_ops.sparse_softmax_cross_entropy_with_logits(
           labels=labels, logits=features)
       backprop = loss.op.inputs[0].op.outputs[1]
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testHighDim(self):
     features = [[[1., 1., 1., 1.]], [[1., 2., 3., 4.]]]
     labels = [[3], [0]]
     self._testHighDim(features, labels)
 
+  @test_util.run_deprecated_v1
   def testHighDim2(self):
     features = [[[1., 1., 1., 1.], [2., 2., 2., 2.]],
                 [[1., 2., 3., 4.], [5., 6., 7., 8.]]]
     labels = [[3, 2], [0, 3]]
     self._testHighDim(features, labels)
 
+  @test_util.run_deprecated_v1
   def testScalarHandling(self):
     with self.session(use_gpu=False) as sess:
       with self.assertRaisesRegexp(errors_impl.InvalidArgumentError,
@@ -318,7 +326,7 @@ def sparse_vs_dense_xent_benchmark(batch_size, num_entries, use_gpu):
   # Using sparse_softmax_cross_entropy_with_logits
   with session.Session(config=config) as sess:
     if not use_gpu:
-      with ops_lib.device("/cpu:0"):
+      with test_util.device("/cpu:0"):
         ops = _sparse_vs_dense_xent_benchmark_sparse(labels, logits)
     else:
       ops = _sparse_vs_dense_xent_benchmark_sparse(labels, logits)
diff --git a/tensorflow/python/kernel_tests/sparsemask_op_test.py b/tensorflow/python/kernel_tests/sparsemask_op_test.py
index 6f5dd45b616c13133a70d82823d4e5030d4e41ea..b1cd0227bc0a71db05c120cff7f70afc7ef1f10e 100644
--- a/tensorflow/python/kernel_tests/sparsemask_op_test.py
+++ b/tensorflow/python/kernel_tests/sparsemask_op_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.platform import test
 
 
 class SparseMaskTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     values = np.random.rand(4, 4).astype(np.single)
     indices = np.array([0, 2, 3, 4], dtype=np.int32)
diff --git a/tensorflow/python/kernel_tests/split_op_test.py b/tensorflow/python/kernel_tests/split_op_test.py
index 944b0e59b12a1382d64941ebda8018c9f30acdfe..517db3450f3c43ea0989b59db5ccc7c089e9cec3 100644
--- a/tensorflow/python/kernel_tests/split_op_test.py
+++ b/tensorflow/python/kernel_tests/split_op_test.py
@@ -42,6 +42,7 @@ class SplitOpTest(test.TestCase):
       data -= 1j * data
     return data
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     model_input = array_ops.placeholder(dtypes.float32, shape=(1, 10))
 
@@ -85,6 +86,7 @@ class SplitOpTest(test.TestCase):
     with self.cached_session(use_gpu=True) as sess:
       sess.run(result, feed_dict={model_input2: np.ones([4, 2])})
 
+  @test_util.run_deprecated_v1
   def testFailWithoutExplicitNum(self):
     size_splits = array_ops.placeholder(dtype=dtypes.int32, shape=[None])
 
@@ -209,6 +211,7 @@ class SplitOpTest(test.TestCase):
     self.assertAllEqual(result[:, 0:1], inp_grads[0])
     self.assertAllEqual(result[:, 1:4], inp_grads[1])
 
+  @test_util.run_deprecated_v1
   def testOutputShape(self):
     for axis in [1, -1]:
       with self.cached_session(use_gpu=True):
@@ -318,15 +321,17 @@ class SplitOpTest(test.TestCase):
       inp_grads = [self._makeData((4, 1), dtype)for _ in range(4)]
       grad_tensors = [constant_op.constant(x) for x in inp_grads]
       grad = gradients_impl.gradients(s, [inp_tensor], grad_tensors)[0]
-      result = grad.eval()
+      result = self.evaluate(grad)
     for i in range(4):
       self.assertAllEqual(result[:, i:i + 1], inp_grads[i])
 
+  @test_util.run_deprecated_v1
   def testGradientsAll(self):
     for dtype in _TEST_DTYPES:
       self._testGradientsSimple(dtype)
       self._testGradientsSimpleVariable(dtype)
 
+  @test_util.run_deprecated_v1
   def testShapeFunctionEdgeCases(self):
     # split_dim greater than rank of input.
     with self.assertRaises(ValueError):
@@ -356,6 +361,7 @@ class SplitOpTest(test.TestCase):
     for s in splits:
       self.assertEqual(None, s.get_shape().ndims)
 
+  @test_util.run_deprecated_v1
   def testVariableShapeFunction(self):
     # size_splits too big
     with self.assertRaises(ValueError):
@@ -366,6 +372,7 @@ class SplitOpTest(test.TestCase):
     assert s0.shape.as_list() == [2]
     assert s1.shape.as_list() == [1]
 
+  @test_util.run_deprecated_v1
   def testNonexistentDimTensor(self):
     x = array_ops.placeholder(dtypes.int32)
     values = np.zeros([5, 30])
diff --git a/tensorflow/python/kernel_tests/stack_op_test.py b/tensorflow/python/kernel_tests/stack_op_test.py
index 4b355620bf93bc100b6ce399a183b485d3ccd32f..ca3357a0ed8f87cfcccd08a62c5b8526a898b664 100644
--- a/tensorflow/python/kernel_tests/stack_op_test.py
+++ b/tensorflow/python/kernel_tests/stack_op_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import variables
@@ -41,6 +42,7 @@ def np_split_squeeze(array, axis):
 
 class StackOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     np.random.seed(7)
     with self.session(use_gpu=True):
@@ -54,6 +56,7 @@ class StackOpTest(test.TestCase):
           c = array_ops.stack(xs)
           self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testSimpleParallelCPU(self):
     np.random.seed(7)
     with self.session(use_gpu=False):
@@ -63,6 +66,7 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(xs)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testSimpleParallelGPU(self):
     np.random.seed(7)
     with self.session(use_gpu=True):
@@ -72,6 +76,7 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(xs)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testConst(self):
     np.random.seed(7)
     with self.session(use_gpu=True):
@@ -96,6 +101,7 @@ class StackOpTest(test.TestCase):
         b = array_ops.reshape(a, array_ops.stack([2, 3]))
         self.assertAllEqual(b.get_shape(), [2, 3])
 
+  @test_util.run_deprecated_v1
   def testConstParallelCPU(self):
     np.random.seed(7)
     with self.session(use_gpu=False):
@@ -110,6 +116,7 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(data)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testConstParallelGPU(self):
     np.random.seed(7)
     with self.session(use_gpu=True):
@@ -124,6 +131,7 @@ class StackOpTest(test.TestCase):
         c = array_ops.parallel_stack(data)
         self.assertAllEqual(c.eval(), data)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis0(self):
     np.random.seed(7)
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
@@ -136,6 +144,7 @@ class StackOpTest(test.TestCase):
         err = gradient_checker.compute_gradient_error(xs, shapes, c, shape)
         self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis1(self):
     np.random.seed(7)
     for shape in (2, 3), (3, 2), (4, 3, 2):
@@ -150,6 +159,7 @@ class StackOpTest(test.TestCase):
         err = gradient_checker.compute_gradient_error(xs, shapes, c, out_shape)
         self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testZeroSizeCPU(self):
     # Verify that stack doesn't crash for zero size inputs
     with self.session(use_gpu=False):
@@ -161,6 +171,7 @@ class StackOpTest(test.TestCase):
         p = array_ops.parallel_stack(list(x)).eval()
         self.assertAllEqual(p, x)
 
+  @test_util.run_deprecated_v1
   def testZeroSizeGPU(self):
     # Verify that stack doesn't crash for zero size inputs
     with self.session(use_gpu=True):
@@ -172,6 +183,7 @@ class StackOpTest(test.TestCase):
         p = array_ops.parallel_stack(list(x)).eval()
         self.assertAllEqual(p, x)
 
+  @test_util.run_deprecated_v1
   def testAxis0DefaultCPU(self):
     with self.session(use_gpu=False):
       t = [constant_op.constant([1, 2, 3]), constant_op.constant([4, 5, 6])]
@@ -182,6 +194,7 @@ class StackOpTest(test.TestCase):
     self.assertAllEqual(stacked, expected)
     self.assertAllEqual(parallel_stacked, expected)
 
+  @test_util.run_deprecated_v1
   def testAxis0DefaultGPU(self):
     with self.session(use_gpu=True):
       t = [constant_op.constant([1, 2, 3]), constant_op.constant([4, 5, 6])]
@@ -204,11 +217,11 @@ class StackOpTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           actual_pack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_pack.get_shape())
-          actual_pack = actual_pack.eval()
+          actual_pack = self.evaluate(actual_pack)
 
           actual_stack = array_ops.stack(test_arrays, axis=j)
           self.assertEqual(expected.shape, actual_stack.get_shape())
-          actual_stack = actual_stack.eval()
+          actual_stack = self.evaluate(actual_stack)
 
         self.assertNDArrayNear(expected, actual_stack, 1e-6)
 
@@ -225,6 +238,7 @@ class StackOpTest(test.TestCase):
 
 class AutomaticStackingTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with self.session(use_gpu=True):
       self.assertAllEqual(
@@ -253,17 +267,20 @@ class AutomaticStackingTest(test.TestCase):
                                           [[2., 2.], [3., 3.]],
                                           dtype=np.float32)])
       self.assertAllEqual([[[0., 0.], [1., 1.]], [[2., 2.], [3., 3.]]],
-                          result.eval())
+                          self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def testVariable(self):
     with self.session(use_gpu=True):
       v = variables.Variable(17)
       result = ops.convert_to_tensor([[0, 0, 0], [0, v, 0], [0, 0, 0]])
       v.initializer.run()
-      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 17, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
       v.assign(38).op.run()
-      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]], result.eval())
+      self.assertAllEqual([[0, 0, 0], [0, 38, 0], [0, 0, 0]],
+                          self.evaluate(result))
 
   def testDtype(self):
     t_0 = ops.convert_to_tensor([[0., 0., 0.], [0., 0., 0.], [0., 0., 0.]])
@@ -306,6 +323,7 @@ class AutomaticStackingTest(test.TestCase):
     t_2 = ops.convert_to_tensor([t_0, t_0, t_1], dtype=dtypes.float64)
     self.assertEqual(dtypes.float64, t_2.dtype)
 
+  @test_util.run_deprecated_v1
   def testPlaceholder(self):
     with self.session(use_gpu=True):
       # Test using placeholder with a defined shape.
@@ -324,6 +342,7 @@ class AutomaticStackingTest(test.TestCase):
       self.assertAllEqual(
           [[0, 0, 0], [0, 2, 0], [0, 0, 0]], result_1.eval(feed_dict={ph_1: 2}))
 
+  @test_util.run_deprecated_v1
   def testShapeErrors(self):
     # Static shape error.
     ph_0 = array_ops.placeholder(dtypes.int32, shape=[1])
diff --git a/tensorflow/python/kernel_tests/stack_ops_test.py b/tensorflow/python/kernel_tests/stack_ops_test.py
index 1aa12009ea5e1aa1bad3d1b4a3696178831d6a03..1930d2484fdc986ba8c5ab50df55769aa4fdc45a 100644
--- a/tensorflow/python/kernel_tests/stack_ops_test.py
+++ b/tensorflow/python/kernel_tests/stack_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -39,8 +40,9 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
     self._testStackPushPop(use_gpu=True)
@@ -54,8 +56,9 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop_v2(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
     self._testStackPushPopSwap(use_gpu=True)
@@ -91,8 +94,9 @@ class StackOpTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
@@ -110,8 +114,9 @@ class StackOpTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
+  @test_util.run_deprecated_v1
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
     self._testMultiStack(use_gpu=True)
@@ -131,10 +136,11 @@ class StackOpTest(test.TestCase):
         pop1 = gen_data_flow_ops.stack_pop_v2(h1, dtypes.float32)
         pop2 = gen_data_flow_ops.stack_pop_v2(h2, dtypes.float32)
 
-      out1, out2 = sess.run([pop1, pop2])
+      out1, out2 = self.evaluate([pop1, pop2])
       self.assertAllClose(out1, 4.0)
       self.assertAllClose(out2, 5.0)
 
+  @test_util.run_deprecated_v1
   def testSameNameStacks(self):
     self._testSameNameStacks(use_gpu=False)
     self._testSameNameStacks(use_gpu=True)
@@ -144,8 +150,9 @@ class StackOpTest(test.TestCase):
       h = gen_data_flow_ops.stack_v2(
           -1, elem_type=dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_close_v2(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testCloseStack(self):
     self._testCloseStack(use_gpu=False)
     self._testCloseStack(use_gpu=True)
@@ -157,8 +164,9 @@ class StackOpTest(test.TestCase):
       c = gen_data_flow_ops.stack_push_v2(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_close_v2(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testPushCloseStack(self):
     self._testPushCloseStack(use_gpu=False)
     self._testPushCloseStack(use_gpu=True)
@@ -173,8 +181,9 @@ class StackOpRefTest(test.TestCase):
       c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose([[4.0, 5.0]], c1.eval())
+      self.assertAllClose([[4.0, 5.0]], self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPop(self):
     self._testStackPushPop(use_gpu=False)
     self._testStackPushPop(use_gpu=True)
@@ -187,8 +196,9 @@ class StackOpRefTest(test.TestCase):
       c = gen_data_flow_ops.stack_push(h, x, swap_memory=True)
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_pop(h, dtypes.float32)
-      self.assertAllClose(a, c1.eval())
+      self.assertAllClose(a, self.evaluate(c1))
 
+  @test_util.run_deprecated_v1
   def testStackPushPopSwap(self):
     self._testStackPushPopSwap(use_gpu=False)
     self._testStackPushPopSwap(use_gpu=True)
@@ -204,7 +214,7 @@ class StackOpRefTest(test.TestCase):
       with ops.control_dependencies([c2]):
         c2 = gen_data_flow_ops.stack_pop(h2, dtypes.float32)
       r = c1 + c2
-      self.assertAllClose(9.0, r.eval())
+      self.assertAllClose(9.0, self.evaluate(r))
 
   def _testStackWhileSwap(self, use_gpu):
     with self.cached_session(use_gpu=use_gpu):
@@ -236,12 +246,14 @@ class StackOpRefTest(test.TestCase):
 
       _, ry = control_flow_ops.while_loop(
           c1, b1, [r, v], [r.get_shape(), tensor_shape.unknown_shape()])
-      self.assertAllClose(np.ones(2000) * 10.0, ry.eval())
+      self.assertAllClose(np.ones(2000) * 10.0, self.evaluate(ry))
 
+  @test_util.run_v1_only("b/120545219")
   def testStackWhileSwap(self):
     self._testStackWhileSwap(use_gpu=False)
     self._testStackWhileSwap(use_gpu=True)
 
+  @test_util.run_deprecated_v1
   def testMultiStack(self):
     self._testMultiStack(use_gpu=False)
     self._testMultiStack(use_gpu=True)
@@ -253,8 +265,9 @@ class StackOpRefTest(test.TestCase):
       h2 = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c2 = gen_data_flow_ops.stack_push(h2, 5.0)
       _ = c1 + c2
-      self.assertNotEqual(h1.eval()[1], h2.eval()[1])
+      self.assertNotEqual(h1.eval()[1], self.evaluate(h2)[1])
 
+  @test_util.run_deprecated_v1
   def testSameNameStacks(self):
     self._testSameNameStacks(use_gpu=False)
     self._testSameNameStacks(use_gpu=True)
@@ -263,8 +276,9 @@ class StackOpRefTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       h = gen_data_flow_ops._stack(dtypes.float32, stack_name="foo")
       c1 = gen_data_flow_ops.stack_close(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testCloseStack(self):
     self._testCloseStack(use_gpu=False)
     self._testCloseStack(use_gpu=True)
@@ -275,8 +289,9 @@ class StackOpRefTest(test.TestCase):
       c = gen_data_flow_ops.stack_push(h, [[4.0, 5.0]])
       with ops.control_dependencies([c]):
         c1 = gen_data_flow_ops.stack_close(h)
-      sess.run(c1)
+      self.evaluate(c1)
 
+  @test_util.run_deprecated_v1
   def testPushCloseStack(self):
     self._testPushCloseStack(use_gpu=False)
     self._testPushCloseStack(use_gpu=True)
diff --git a/tensorflow/python/kernel_tests/stage_op_test.py b/tensorflow/python/kernel_tests/stage_op_test.py
index b814843b86c7321ffdb98072d96909d569b32a62..83e06ba48bdbbe3189eafde7d0f42c2e4ced68ab 100644
--- a/tensorflow/python/kernel_tests/stage_op_test.py
+++ b/tensorflow/python/kernel_tests/stage_op_test.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -28,6 +29,7 @@ TIMEOUT = 1
 
 class StageTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSimple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -47,6 +49,7 @@ class StageTest(test.TestCase):
         _, yval = sess.run([stage, y], feed_dict={x: i})
         self.assertAllClose(4 * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testMultiple(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -67,6 +70,7 @@ class StageTest(test.TestCase):
         self.assertAllClose(
             4 * (i - 1) * (i - 1) * (i - 1) * 128, yval, rtol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testDictionary(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -110,6 +114,7 @@ class StageTest(test.TestCase):
 
     G.finalize()
 
+  @test_util.run_deprecated_v1
   def testPeek(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -133,6 +138,7 @@ class StageTest(test.TestCase):
       for i in range(10):
         self.assertTrue(sess.run(peek, feed_dict={p: i}) == [i])
 
+  @test_util.run_deprecated_v1
   def testSizeAndClear(self):
     with ops.Graph().as_default() as G:
       with ops.device('/cpu:0'):
@@ -158,6 +164,7 @@ class StageTest(test.TestCase):
       sess.run(clear)
       self.assertEqual(sess.run(size), 0)
 
+  @test_util.run_deprecated_v1
   def testCapacity(self):
     capacity = 3
 
@@ -219,6 +226,7 @@ class StageTest(test.TestCase):
       # It should now be empty
       self.assertTrue(sess.run(size) == 0)
 
+  @test_util.run_deprecated_v1
   def testMemoryLimit(self):
     memory_limit = 512 * 1024  # 512K
     chunk = 200 * 1024  # 256K
diff --git a/tensorflow/python/kernel_tests/string_join_op_test.py b/tensorflow/python/kernel_tests/string_join_op_test.py
index e4371ab5b933a9bd2cf891f24a254bd14e584e3d..2548e8695fe5861644dbac6481bb01ef18515b3e 100644
--- a/tensorflow/python/kernel_tests/string_join_op_test.py
+++ b/tensorflow/python/kernel_tests/string_join_op_test.py
@@ -17,12 +17,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
 class StringJoinOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testStringJoin(self):
     input0 = ["a", "b"]
     input1 = "a"
diff --git a/tensorflow/python/kernel_tests/string_length_op_test.py b/tensorflow/python/kernel_tests/string_length_op_test.py
index 57db7302b155991c966e66aa77152d297ce0cf2b..bfa6ac2454a3fba97abdd4ed8376661a0bc6fd70 100644
--- a/tensorflow/python/kernel_tests/string_length_op_test.py
+++ b/tensorflow/python/kernel_tests/string_length_op_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -29,9 +30,10 @@ class StringLengthOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       lengths = string_ops.string_length(strings)
-      values = sess.run(lengths)
+      values = self.evaluate(lengths)
       self.assertAllEqual(values, [[[1, 2], [3, 4], [5, 6]]])
 
+  @test_util.run_deprecated_v1
   def testUnit(self):
     unicode_strings = [u"H\xc3llo", u"\U0001f604"]
     utf8_strings = [s.encode("utf-8") for s in unicode_strings]
@@ -43,14 +45,15 @@ class StringLengthOpTest(test.TestCase):
       utf8_char_lengths = string_ops.string_length(
           utf8_strings, unit="UTF8_CHAR")
       self.assertAllEqual(
-          sess.run(utf8_byte_lengths), expected_utf8_byte_lengths)
+          self.evaluate(utf8_byte_lengths), expected_utf8_byte_lengths)
       self.assertAllEqual(
-          sess.run(utf8_char_lengths), expected_utf8_char_lengths)
+          self.evaluate(utf8_char_lengths), expected_utf8_char_lengths)
       with self.assertRaisesRegexp(
           ValueError, "Attr 'unit' of 'StringLength' Op passed string 'XYZ' "
           'not in: "BYTE", "UTF8_CHAR"'):
         string_ops.string_length(utf8_strings, unit="XYZ")
 
+  @test_util.run_deprecated_v1
   def testLegacyPositionalName(self):
     # Code that predates the 'unit' parameter may have used a positional
     # argument for the 'name' parameter.  Check that we don't break such code.
diff --git a/tensorflow/python/kernel_tests/string_split_op_test.py b/tensorflow/python/kernel_tests/string_split_op_test.py
index b968e885edafcbdebd3b32e11c6bdf35e65e7616..0c91deb5220bf268366bbc65dbd001617439fa12 100644
--- a/tensorflow/python/kernel_tests/string_split_op_test.py
+++ b/tensorflow/python/kernel_tests/string_split_op_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -34,17 +35,18 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
 
+  @test_util.run_deprecated_v1
   def testStringSplitEmptyDelimiter(self):
     strings = ["hello", "hola", b"\xF0\x9F\x98\x8E"]  # Last string is U+1F60E
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter="")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [0, 4],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [2, 0],
                                     [2, 1], [2, 2], [2, 3]])
@@ -62,7 +64,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
@@ -74,13 +76,14 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, delimiter=" .")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices,
           [[1, 0], [2, 0], [3, 0], [5, 0], [6, 0], [7, 0], [8, 0]])
       self.assertAllEqual(values, [b"a", b"b", b"c", b"d", b"e", b"f", b"g"])
       self.assertAllEqual(shape, [10, 1])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimiter(self):
     strings = ["hello|world", "hello world"]
 
@@ -92,17 +95,18 @@ class StringSplitOpTest(test.TestCase):
           ValueError, string_ops.string_split, strings, delimiter=["a"])
 
       tokens = string_ops.string_split(strings, delimiter="|")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
       self.assertAllEqual(shape, [2, 2])
 
       tokens = string_ops.string_split(strings, delimiter="| ")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"hello", b"world", b"hello", b"world"])
       self.assertAllEqual(shape, [2, 2])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimiterTensor(self):
     strings = ["hello|world", "hello world"]
 
@@ -121,6 +125,7 @@ class StringSplitOpTest(test.TestCase):
       self.assertAllEqual(values, [b"hello", b"world", b"hello world"])
       self.assertAllEqual(shape, [2, 2])
 
+  @test_util.run_deprecated_v1
   def testStringSplitWithDelimitersTensor(self):
     strings = ["hello.cruel,world", "hello cruel world"]
 
@@ -145,7 +150,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#", skip_empty=False)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1],
                                     [2, 0], [2, 1], [2, 2]])
@@ -154,7 +159,7 @@ class StringSplitOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split(strings, "#")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(values, [b"a", b"b", b"c"])
       self.assertAllEqual(indices, [[0, 0], [1, 0], [2, 0]])
       self.assertAllEqual(shape, [3, 1])
@@ -167,7 +172,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2], [0, 3], [1, 0]])
       self.assertAllEqual(values, [b"pigs", b"on", b"the", b"wing", b"animals"])
       self.assertAllEqual(shape, [2, 4])
@@ -182,7 +187,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep="<>")
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(
           indices, [[0, 0], [0, 1], [0, 2],
                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4], [1, 5], [1, 6]])
@@ -200,7 +205,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',')
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2], [1, 3], [1, 4]])
       self.assertAllEqual(values, [b"1", b"2", b"3",
@@ -217,7 +222,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1], [0, 2],
                                     [1, 0], [1, 1], [1, 2]])
       self.assertAllEqual(values, [b"1", b"2", b"3", b"4", b"5", b"6"])
@@ -233,7 +238,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, sep=',', maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2,3", b"4", b"5,,6,"])
@@ -249,7 +254,7 @@ class StringSplitV2OpTest(test.TestCase):
 
     with self.cached_session() as sess:
       tokens = string_ops.string_split_v2(strings, maxsplit=1)
-      indices, values, shape = sess.run(tokens)
+      indices, values, shape = self.evaluate(tokens)
       self.assertAllEqual(indices, [[0, 0], [0, 1],
                                     [1, 0], [1, 1]])
       self.assertAllEqual(values, [b"1", b"2 3", b"4", b"5    6  "])
diff --git a/tensorflow/python/kernel_tests/string_strip_op_test.py b/tensorflow/python/kernel_tests/string_strip_op_test.py
index 1e404b7146220761c764be62234cd31d38c6349f..edff3862ff6984393c497f76943dc460d6f2541c 100644
--- a/tensorflow/python/kernel_tests/string_strip_op_test.py
+++ b/tensorflow/python/kernel_tests/string_strip_op_test.py
@@ -30,7 +30,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"pigs on the wing", b"animals"])
 
   def test_string_strip_2d(self):
@@ -39,7 +39,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [[b"pigs on the wing", b"animals"],
                                    [b"hello", b"world"]])
 
@@ -48,7 +48,7 @@ class StringStripOpTest(test.TestCase):
 
     with self.cached_session() as sess:
       output = string_ops.string_strip(strings)
-      output = sess.run(output)
+      output = self.evaluate(output)
       self.assertAllEqual(output, [b"hello", b"", b"world", b""])
 
 
diff --git a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
index 9cb0c9d18f32803aff5b5c7d1d5643d0742fee05..25f573fc144a6252ce8de3b88adf3874ab7f9bab 100644
--- a/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
+++ b/tensorflow/python/kernel_tests/string_to_hash_bucket_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
@@ -26,6 +27,7 @@ from tensorflow.python.platform import test
 
 class StringToHashBucketOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testStringToOneHashBucketFast(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -34,6 +36,7 @@ class StringToHashBucketOpTest(test.TestCase):
 
       self.assertAllEqual([0, 0, 0], result)
 
+  @test_util.run_deprecated_v1
   def testStringToHashBucketsFast(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -46,6 +49,7 @@ class StringToHashBucketOpTest(test.TestCase):
       # Fingerprint64('d') -> 4470636696479570465 -> mod 10 -> 5
       self.assertAllEqual([9, 2, 2, 5], result)
 
+  @test_util.run_deprecated_v1
   def testStringToOneHashBucketLegacyHash(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -54,6 +58,7 @@ class StringToHashBucketOpTest(test.TestCase):
 
       self.assertAllEqual([0, 0, 0], result)
 
+  @test_util.run_deprecated_v1
   def testStringToHashBucketsLegacyHash(self):
     with self.cached_session():
       input_string = array_ops.placeholder(dtypes.string)
@@ -70,7 +75,7 @@ class StringToHashBucketOpTest(test.TestCase):
       input_string = constant_op.constant(['a', 'b', 'c'])
       output = string_ops.string_to_hash_bucket_strong(
           input_string, 1, key=[123, 345])
-      self.assertAllEqual([0, 0, 0], output.eval())
+      self.assertAllEqual([0, 0, 0], self.evaluate(output))
 
   def testStringToHashBucketsStrong(self):
     with self.cached_session():
@@ -81,7 +86,7 @@ class StringToHashBucketOpTest(test.TestCase):
       # StrongKeyedHash(key, 'a') -> 7157389809176466784 -> mod 10 -> 4
       # StrongKeyedHash(key, 'b') -> 15805638358933211562 -> mod 10 -> 2
       # StrongKeyedHash(key, 'c') -> 18100027895074076528 -> mod 10 -> 8
-      self.assertAllEqual([4, 2, 8], output.eval())
+      self.assertAllEqual([4, 2, 8], self.evaluate(output))
 
   def testStringToHashBucketsStrongInvalidKey(self):
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/string_to_number_op_test.py b/tensorflow/python/kernel_tests/string_to_number_op_test.py
index 99ee25e1253740653f9c92d3722ecf2f682ca003..49ccfd1028fa5b6dd290a949a841ea7653517431 100644
--- a/tensorflow/python/kernel_tests/string_to_number_op_test.py
+++ b/tensorflow/python/kernel_tests/string_to_number_op_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import parsing_ops
 from tensorflow.python.platform import test
@@ -45,6 +46,7 @@ class StringToNumberOpTest(test.TestCase):
         with self.assertRaisesOpError(outstr):
           output.eval(feed_dict={input_string: [instr]})
 
+  @test_util.run_deprecated_v1
   def testToFloat(self):
     self._test(dtypes.float32,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -58,6 +60,7 @@ class StringToNumberOpTest(test.TestCase):
                 ("INF", float("INF"))],
                [("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToDouble(self):
     self._test(dtypes.float64,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -71,6 +74,7 @@ class StringToNumberOpTest(test.TestCase):
                 ("INF", float("INF"))],
                [("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToInt32(self):
     self._test(dtypes.int32,
                [("0", 0), ("3", 3), ("-1", -1),
@@ -84,6 +88,7 @@ class StringToNumberOpTest(test.TestCase):
                    ("2.9", _ERROR_MESSAGE + "2.9"),
                    ("10foobar", _ERROR_MESSAGE + "10foobar")])
 
+  @test_util.run_deprecated_v1
   def testToInt64(self):
     self._test(dtypes.int64,
                [("0", 0), ("3", 3), ("-1", -1),
diff --git a/tensorflow/python/kernel_tests/substr_op_test.py b/tensorflow/python/kernel_tests/substr_op_test.py
index 37aa624b07e86c68a48d3859bf88d8ef0ce93253..9302152e82bfa9c807a644f73ef1e705594b45f8 100644
--- a/tensorflow/python/kernel_tests/substr_op_test.py
+++ b/tensorflow/python/kernel_tests/substr_op_test.py
@@ -22,6 +22,7 @@ from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -51,7 +52,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -71,7 +72,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Full string
@@ -83,7 +84,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Full string (Negative)
@@ -95,7 +96,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, test_string)
 
     # Length is larger in magnitude than a negative position
@@ -111,7 +112,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(5, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_string)
 
   @parameterized.parameters(
@@ -138,7 +139,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     length = np.array(3, dtype)
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -173,7 +174,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     position = np.array(-3, dtype)
@@ -188,7 +189,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -229,7 +230,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -271,7 +272,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Broadcast input string onto pos/len
@@ -294,7 +295,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
     # Test 1D broadcast
@@ -310,7 +311,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     }[unit]
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
-      substr = substr_op.eval()
+      substr = self.evaluate(substr_op)
       self.assertAllEqual(substr, expected_value)
 
   @parameterized.parameters(
@@ -319,6 +320,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testBadBroadcast(self, dtype, unit):
     test_string = [[b"ten", b"eleven", b"twelve"],
                    [b"thirteen", b"fourteen", b"fifteen"],
@@ -338,6 +340,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, -6, "UTF8_CHAR"),
       (np.int64, -6, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_Scalar(self, dtype, pos, unit):
     # Scalar/Scalar
     test_string = {
@@ -349,7 +352,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, 4, "BYTE"),
@@ -361,6 +364,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, -4, "UTF8_CHAR"),
       (np.int64, -4, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_VectorScalar(self, dtype, pos, unit):
     # Vector/Scalar
     test_string = {
@@ -373,7 +377,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -381,6 +385,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_MatrixMatrix(self, dtype, unit):
     # Matrix/Matrix
     test_string = {
@@ -398,7 +403,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Matrix/Matrix (with negative)
     position = np.array([[1, 2, -3], [1, 2, -4], [1, 2, -3]], dtype)
@@ -406,7 +411,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -414,6 +419,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testOutOfRangeError_Broadcast(self, dtype, unit):
     # Broadcast
     test_string = {
@@ -428,7 +434,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
     # Broadcast (with negative)
     position = np.array([-1, -2, -4], dtype)
@@ -436,7 +442,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     substr_op = string_ops.substr(test_string, position, length, unit=unit)
     with self.cached_session():
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        substr_op.eval()
+        self.evaluate(substr_op)
 
   @parameterized.parameters(
       (np.int32, "BYTE"),
@@ -444,6 +450,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       (np.int32, "UTF8_CHAR"),
       (np.int64, "UTF8_CHAR"),
   )
+  @test_util.run_deprecated_v1
   def testMismatchPosLenShapes(self, dtype, unit):
     test_string = {
         "BYTE": [[b"ten", b"eleven", b"twelve"],
@@ -471,6 +478,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
     with self.assertRaises(ValueError):
       string_ops.substr(test_string, position, length)
 
+  @test_util.run_deprecated_v1
   def testWrongDtype(self):
     with self.cached_session():
       with self.assertRaises(TypeError):
@@ -478,6 +486,7 @@ class SubstrOpTest(test.TestCase, parameterized.TestCase):
       with self.assertRaises(TypeError):
         string_ops.substr(b"test", 3, 1.0)
 
+  @test_util.run_deprecated_v1
   def testInvalidUnit(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
diff --git a/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
index 63ce77b9d55b04aaaba60469d1da62a903bc6a0d..1547c55f8b0b112325c6049f2052091228c171bf 100644
--- a/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_audio_op_test.py
@@ -60,7 +60,7 @@ class SummaryV1AudioOpTest(test.TestCase):
         sample_rate = 8000
         summ = summary.audio(
             "snd", const, max_outputs=3, sample_rate=sample_rate)
-        value = sess.run(summ)
+        value = self.evaluate(summ)
         self.assertEqual([], summ.get_shape())
         audio_summ = self._AsSummary(value)
 
diff --git a/tensorflow/python/kernel_tests/summary_v1_image_op_test.py b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
index 094606944ffa48be434ed11904219cbe6c1c24c6..56de2e933db6498d274d9463e89d01ac3c06b2bc 100644
--- a/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_image_op_test.py
@@ -24,6 +24,7 @@ from six.moves import xrange  # pylint: disable=redefined-builtin
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import image_ops
 import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
 from tensorflow.python.platform import test
@@ -49,6 +50,7 @@ class SummaryV1ImageOpTest(test.TestCase):
         }""" % ((i,) + shape[1:]) for i in xrange(3))
     self.assertProtoEquals(expected, image_summ)
 
+  @test_util.run_deprecated_v1
   def testImageSummary(self):
     for depth in (1, 3, 4):
       for positive in False, True:
@@ -70,7 +72,7 @@ class SummaryV1ImageOpTest(test.TestCase):
 
           # Summarize
           summ = summary.image("img", const)
-          value = sess.run(summ)
+          value = self.evaluate(summ)
           self.assertEqual([], summ.get_shape())
           image_summ = self._AsSummary(value)
 
@@ -84,6 +86,7 @@ class SummaryV1ImageOpTest(test.TestCase):
           # Check the rest of the proto
           self._CheckProto(image_summ, shape)
 
+  @test_util.run_deprecated_v1
   def testImageSummaryUint8(self):
     np.random.seed(7)
     for depth in (1, 3, 4):
@@ -97,7 +100,7 @@ class SummaryV1ImageOpTest(test.TestCase):
 
         # Summarize
         summ = summary.image("img", tf_images)
-        value = sess.run(summ)
+        value = self.evaluate(summ)
         self.assertEqual([], summ.get_shape())
         image_summ = self._AsSummary(value)
 
diff --git a/tensorflow/python/kernel_tests/summary_v1_ops_test.py b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
index 6c4e106b11889f9211b882febd4489297e3ce310..e070f5bf6f5d892aab2df630a3f1e1b96ee2dfce 100644
--- a/tensorflow/python/kernel_tests/summary_v1_ops_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_ops_test.py
@@ -26,6 +26,7 @@ from __future__ import print_function
 from tensorflow.core.framework import summary_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import logging_ops
 from tensorflow.python.platform import test
 from tensorflow.python.summary import summary
@@ -42,7 +43,7 @@ class SummaryV1OpsTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant([10.0, 20.0])
       summ = logging_ops.scalar_summary(["c1", "c2"], const, name="mysumm")
-      value = sess.run(summ)
+      value = self.evaluate(summ)
     self.assertEqual([], summ.get_shape())
     self.assertProtoEquals("""
       value { tag: "c1" simple_value: 10.0 }
@@ -53,20 +54,21 @@ class SummaryV1OpsTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant([10.0, 20.0])
       summ = logging_ops.scalar_summary(["c1", "c2"], const)
-      value = sess.run(summ)
+      value = self.evaluate(summ)
     self.assertEqual([], summ.get_shape())
     self.assertProtoEquals("""
       value { tag: "c1" simple_value: 10.0 }
       value { tag: "c2" simple_value: 20.0 }
       """, self._AsSummary(value))
 
+  @test_util.run_deprecated_v1
   def testMergeSummary(self):
     with self.cached_session() as sess:
       const = constant_op.constant(10.0)
       summ1 = summary.histogram("h", const)
       summ2 = logging_ops.scalar_summary("c", const)
       merge = summary.merge([summ1, summ2])
-      value = sess.run(merge)
+      value = self.evaluate(merge)
     self.assertEqual([], merge.get_shape())
     self.assertProtoEquals("""
       value {
diff --git a/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
index 34f771679ae0032679117035ea85815080ca229d..b8e5b5b882a3090620ecdb14292ae8d73f2c8bcd 100644
--- a/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
+++ b/tensorflow/python/kernel_tests/summary_v1_tensor_op_test.py
@@ -50,7 +50,7 @@ class SummaryV1TensorOpTest(test.TestCase):
         with ops.name_scope("zod"):
           s3 = summary_lib.tensor_summary("s3", c)
           s4 = summary_lib.tensor_summary("TensorSummary", c)
-      summ1, summ2, summ3, summ4 = sess.run([s1, s2, s3, s4])
+      summ1, summ2, summ3, summ4 = self.evaluate([s1, s2, s3, s4])
 
     v1 = self._SummarySingleValue(summ1)
     self.assertEqual(v1.tag, "s1")
@@ -68,7 +68,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant(10.0)
       summ = summary_lib.tensor_summary("foo", const)
-      result = sess.run(summ)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -79,7 +79,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant(s)
       summ = summary_lib.tensor_summary("foo", const)
-      result = sess.run(summ)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -89,7 +89,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
       const = array_ops.ones([5, 5, 5])
       summ = summary_lib.tensor_summary("foo", const)
-      result = sess.run(summ)
+      result = self.evaluate(summ)
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
     self._AssertNumpyEq(n, np.ones([5, 5, 5]))
@@ -99,7 +99,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant(strings)
       summ = summary_lib.tensor_summary("foo", const)
-      result = sess.run(summ)
+      result = self.evaluate(summ)
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
     self._AssertNumpyEq(n, strings)
@@ -109,7 +109,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
       const = constant_op.constant(bools)
       summ = summary_lib.tensor_summary("foo", const)
-      result = sess.run(summ)
+      result = self.evaluate(summ)
 
     value = self._SummarySingleValue(result)
     n = tensor_util.MakeNdarray(value.tensor)
@@ -119,7 +119,7 @@ class SummaryV1TensorOpTest(test.TestCase):
     with self.cached_session() as sess:
 
       def get_description(summary_op):
-        summ_str = sess.run(summary_op)
+        summ_str = self.evaluate(summary_op)
         summ = summary_pb2.Summary()
         summ.ParseFromString(summ_str)
         return summ.value[0].metadata
diff --git a/tensorflow/python/kernel_tests/svd_op_test.py b/tensorflow/python/kernel_tests/svd_op_test.py
index 57298c0fecca859ffdc581560cda4e3a0423d762..cfa9f122d1fcee1748cd30bdc4212d81a5709ae6 100644
--- a/tensorflow/python/kernel_tests/svd_op_test.py
+++ b/tensorflow/python/kernel_tests/svd_op_test.py
@@ -20,7 +20,9 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import linalg_ops
@@ -38,6 +40,7 @@ def _AddTest(test_class, op_name, testcase_name, fn):
 
 class SvdOpTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testWrongDimensions(self):
     # The input to svd should be a tensor of at least rank 2.
     scalar = constant_op.constant(1.)
@@ -49,6 +52,7 @@ class SvdOpTest(test.TestCase):
                                  "Shape must be at least rank 2 but is rank 1"):
       linalg_ops.svd(vector)
 
+  @test_util.run_v1_only("b/120545219")
   def testConcurrentExecutesWithoutError(self):
     with self.session(use_gpu=True) as sess:
       all_ops = []
@@ -68,7 +72,7 @@ class SvdOpTest(test.TestCase):
             s2 = linalg_ops.svd(
                 matrix2, compute_uv=compute_uv_, full_matrices=full_matrices_)
             all_ops += [s1, s2]
-      val = sess.run(all_ops)
+      val = self.evaluate(all_ops)
       for i in range(2):
         s = 6 * i
         self.assertAllEqual(val[s], val[s + 3])  # s1 == s2
@@ -117,14 +121,15 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         diag_s = array_ops.concat([diag_s, zeros], a.ndim - 1)
     a_recon = math_ops.matmul(u, diag_s)
     a_recon = math_ops.matmul(a_recon, v, adjoint_b=True)
-    self.assertAllClose(a_recon.eval(), a, rtol=tol, atol=tol)
+    self.assertAllClose(a_recon, a, rtol=tol, atol=tol)
 
   def CheckUnitary(self, x, tol):
     # Tests that x[...,:,:]^H * x[...,:,:] is close to the identity.
     xx = math_ops.matmul(x, x, adjoint_a=True)
     identity = array_ops.matrix_band_part(array_ops.ones_like(xx), 0, 0)
-    self.assertAllClose(identity.eval(), xx.eval(), atol=tol)
+    self.assertAllClose(identity, xx, atol=tol)
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     is_complex = dtype_ in (np.complex64, np.complex128)
     is_single = dtype_ in (np.float32, np.complex64)
@@ -150,7 +155,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         s_tf, u_tf, v_tf = linalg_ops.svd(
             x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
         if use_static_shape_:
-          s_tf_val, u_tf_val, v_tf_val = sess.run([s_tf, u_tf, v_tf])
+          s_tf_val, u_tf_val, v_tf_val = self.evaluate([s_tf, u_tf, v_tf])
         else:
           s_tf_val, u_tf_val, v_tf_val = sess.run(
               [s_tf, u_tf, v_tf], feed_dict={x_tf: x_np})
@@ -158,7 +163,7 @@ def _GetSvdOpTest(dtype_, shape_, use_static_shape_, compute_uv_,
         s_tf = linalg_ops.svd(
             x_tf, compute_uv=compute_uv_, full_matrices=full_matrices_)
         if use_static_shape_:
-          s_tf_val = sess.run(s_tf)
+          s_tf_val = self.evaluate(s_tf)
         else:
           s_tf_val = sess.run(s_tf, feed_dict={x_tf: x_np})
 
@@ -213,6 +218,7 @@ def _GetSvdGradOpTest(dtype_, shape_, compute_uv_, full_matrices_):
     tf_v *= phase[..., :n]
     return tf_s, tf_u, tf_v
 
+  @test_util.run_v1_only("b/120545219")
   def Test(self):
     np.random.seed(42)
     a = np.random.uniform(low=-1.0, high=1.0, size=shape_).astype(dtype_)
@@ -263,7 +269,8 @@ if __name__ == "__main__":
           for cols in 1, 2, 5, 10, 32, 100:
             for batch_dims in [(), (3,)] + [(3, 2)] * (max(rows, cols) < 10):
               shape = batch_dims + (rows, cols)
-              for use_static_shape in True, False:
+              # TF2 does not support placeholders under eager so we skip it
+              for use_static_shape in set([True, tf2.enabled()]):
                 name = "%s_%s_static_shape_%s__compute_uv_%s_full_%s" % (
                     dtype.__name__, "_".join(map(str, shape)), use_static_shape,
                     compute_uv, full_matrices)
diff --git a/tensorflow/python/kernel_tests/template_test.py b/tensorflow/python/kernel_tests/template_test.py
index 9dcdaa61ed2c0c12940817ccb311e27d1a19fa0c..3b2a56bd1ff6ef81ae17773fd5a23bc96778ce63 100644
--- a/tensorflow/python/kernel_tests/template_test.py
+++ b/tensorflow/python/kernel_tests/template_test.py
@@ -72,6 +72,7 @@ def variable_scoped_function_with_local_variable():
 
 class TemplateTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_end_to_end(self):
     """This test shows a very simple line model with test_loss.
 
@@ -104,10 +105,10 @@ class TemplateTest(test.TestCase):
     train_op = optimizer.minimize(train_loss)
 
     with session.Session() as sess:
-      sess.run(variables.global_variables_initializer())
-      initial_test_loss = sess.run(test_loss)
-      sess.run(train_op)
-      final_test_loss = sess.run(test_loss)
+      self.evaluate(variables.global_variables_initializer())
+      initial_test_loss = self.evaluate(test_loss)
+      self.evaluate(train_op)
+      final_test_loss = self.evaluate(test_loss)
 
     # Parameters are tied, so the loss should have gone down when we trained it.
     self.assertLess(final_test_loss, initial_test_loss)
@@ -172,6 +173,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/dummy:0", v1.name)
     self.assertEqual("s1_1/dummy:0", v3.name)
 
+  @test_util.run_deprecated_v1
   def test_same_unique_name_raise_error(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
@@ -190,6 +192,7 @@ class TemplateTest(test.TestCase):
         template.make_template(
             "_", variable_scoped_function, unique_name_="s1")
 
+  @test_util.run_deprecated_v1
   def test_unique_name_and_reuse(self):
     tmpl1 = template.make_template(
         "_", variable_scoped_function, unique_name_="s1")
@@ -260,6 +263,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual("s1/test/dummy:0", v1.name)
     self.assertEqual("s1_1/test/dummy:0", v3.name)
 
+  @test_util.run_deprecated_v1
   def test_enforces_no_extra_trainable_variables(self):
     tmpl = template.make_template("s", function_with_create, trainable=True)
 
@@ -675,6 +679,7 @@ class TemplateTest(test.TestCase):
     self.assertEqual(1, len(tb.variables))
 
   # TODO(apassos) handle local variables in Eager
+  @test_util.run_deprecated_v1
   def test_local_variables(self):
     # Make sure trainable_variables are created.
     with variable_scope.variable_scope("foo3"):
diff --git a/tensorflow/python/kernel_tests/tensor_array_ops_test.py b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
index 0188eb246f0637b4029351d2966f66a1234729a2..88625841bcc982bf477b619f3da0b70498f0542f 100644
--- a/tensorflow/python/kernel_tests/tensor_array_ops_test.py
+++ b/tensorflow/python/kernel_tests/tensor_array_ops_test.py
@@ -63,6 +63,8 @@ def _make_ta(size, name, dtype=dtypes.float32, infer_shape=False):
       dtype=dtype, tensor_array_name=name, size=size, infer_shape=infer_shape)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+@test_util.with_control_flow_v2
 class TensorArrayTest(test.TestCase):
 
   @classmethod
@@ -123,11 +125,9 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayWritePack(dtypes.complex128)
     self._testTensorArrayWritePack(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayWritePack(self):
     self._testTensorArrayWritePackMaybeLegacy()
 
-  @test_util.run_in_graph_and_eager_modes
   def testEmptyTensorArrayPack(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -161,7 +161,7 @@ class TensorArrayTest(test.TestCase):
           convert([[4.0, 5.0], [104.0, 105.0], [204.0, 205.0], [6.0, 7.0],
                    [106.0, 107.0], [8.0, 9.0]]), c0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testTensorArrayWriteConcat(self):
     self._testTensorArrayWriteConcat(dtypes.float32)
     self._testTensorArrayWriteConcat(dtypes.float64)
@@ -184,7 +184,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                           self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableFillsZeros()
 
@@ -200,7 +201,8 @@ class TensorArrayTest(test.TestCase):
     self.assertAllEqual([[0.0, 0.0], [4.0, 5.0], [0.0, 0.0]],
                         self.evaluate(ta.write(1, [[4.0, 5.0]]).concat()))
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/118890905")
+  @test_util.run_v1_only("b/118890905")
   def testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros(self):
     self._testTensorArrayReadOrPackNotAllValuesAvailableInferShapeFillsZeros()
 
@@ -251,7 +253,6 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArrayUnpackRead(dtypes.complex128)
     self._testTensorArrayUnpackRead(dtypes.string)
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayUnpackRead(self):
     self._testTensorArrayUnpackReadMaybeLegacy()
 
@@ -297,7 +298,7 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(convert([]).reshape(0, 2), d1)
       self.assertAllEqual(convert([[3.0, 301.0]]), d2)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testTensorArraySplitRead(self):
     self._testTensorArraySplitRead(dtypes.float32)
     self._testTensorArraySplitRead(dtypes.float64)
@@ -307,7 +308,9 @@ class TensorArrayTest(test.TestCase):
     self._testTensorArraySplitRead(dtypes.complex128)
     self._testTensorArraySplitRead(dtypes.string)
 
-  def testTensorGradArrayWriteRead(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -340,7 +343,29 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[2.0]], g_d1)
       self.assertAllEqual(-2.0, g_d2)
 
-  def testTensorGradArrayDynamicWriteRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradGrad(self):
+    if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+      self.skipTest("Legacy TensorArray does not support double derivatives.")
+    with self.test_session(use_gpu=True) as session:
+      x = constant_op.constant(4.0)
+
+      ta = tensor_array_ops.TensorArray(
+          dtype=dtypes.float32,
+          tensor_array_name="foo",
+          size=1,
+          infer_shape=False)
+      w0 = ta.write(0, x)
+      r0 = w0.read(0)
+      y = r0 * r0
+
+      g1 = gradients_impl.gradients(ys=[y], xs=[x])
+      g2 = gradients_impl.gradients(ys=[g1], xs=[x])
+      self.assertAllEqual([2.0], session.run(g2))
+
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradArrayDynamicWriteRead(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -381,7 +406,9 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(3, vs)
       self.assertAllEqual(3, g_vs)
 
-  def testTensorGradAccessTwiceReceiveSameObject(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorGradAccessTwiceReceiveSameObject(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -397,26 +424,41 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(t_g_ta_0, t_g_ta_1)
       self.assertAllEqual([[4.0, 5.0]], d_r1_0)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayWriteWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
       # Test writing the wrong datatype
-      with self.assertRaisesOpError(
-          "TensorArray dtype is (float|float32) but Op is trying to write "
-          "dtype string"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = ("Invalid data types; op elements string but list elements "
+                     "float")
+      else:
+        error_msg = (
+            "TensorArray dtype is (float|float32) but Op is trying to write "
+            "dtype string")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(0, "wrong_type_scalar").flow)
 
-      with self.assertRaisesOpError("index -1"):
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(-1, 3.0).flow)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to modify element 3 in a list with 3 elements"
+      else:
+        error_msg = ("Tried to write to index 3 but array is not "
+                     "resizeable and size is: 3")
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to write to index 3 but array is not "
-          "resizeable and size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.write(3, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayReadWrongIndexOrDataTypeFails(self):
     with self.session(use_gpu=True):
       ta = _make_ta(3, "foo", dtype=dtypes.float32)
@@ -424,23 +466,35 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.write(0, [[4.0, 5.0]])
 
       # Test reading wrong datatype (only possible when constructing graphs).
-      if not context.executing_eagerly():
+      if (not context.executing_eagerly() and
+          not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2):
         r0_bad = gen_data_flow_ops.tensor_array_read_v3(
             handle=w0.handle, index=0, dtype=dtypes.float64, flow_in=w0.flow)
         with self.assertRaisesOpError(
             "TensorArray dtype is float but Op requested dtype double."):
-          r0_bad.eval()
+          self.evaluate(r0_bad)
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element -1 in a list with 3 elements."
+      else:
+        error_msg = "index -1"
       # Test reading from a negative index, which is not allowed
-      with self.assertRaisesOpError("index -1"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(-1))
 
+      if (tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+          not context.executing_eagerly()):
+        error_msg = "Trying to access element 3 in a list with 3 elements."
+      else:
+        error_msg = "Tried to read from index 3 but array size is: 3"
       # Test reading from too large an index
-      with self.assertRaisesOpError(
-          "Tried to read from index 3 but array size is: 3"):
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.read(3))
 
-  def testTensorArrayWriteMultipleFails(self):
+  @test_util.disable_control_flow_v2("v2 allows multiple writes.")
+  @test_util.run_v1_only("v2 allows multiple writes.")
+  def testSkipEagerTensorArrayWriteMultipleFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -450,7 +504,7 @@ class TensorArrayTest(test.TestCase):
           "it has already been written to."):
         self.evaluate(ta.write(2, 3.0).write(2, 3.0).flow)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArrayConcatIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -482,7 +536,7 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaisesOpError("shape"):
         self.evaluate(w3.concat())
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTensorArraySplitIncompatibleShapesFails(self):
     with self.session(use_gpu=True):
       in_eager_mode = context.executing_eagerly()
@@ -495,22 +549,32 @@ class TensorArrayTest(test.TestCase):
           lengths = array_ops.placeholder(dtypes.int64)
           ta.split([1.0, 2.0, 3.0], lengths).flow.eval(feed_dict={lengths: 1})
 
-      with self.assertRaisesOpError(
-          r"Expected sum of lengths to be equal to values.shape\[0\], "
-          r"but sum of lengths is 1 and value's shape is: \[3\]"):
+      error_msg = ("Unused values in tensor. Length of tensor: 3 Values used: 1"
+                   if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and
+                   not in_eager_mode else
+                   r"Expected sum of lengths to be equal to values.shape\[0\], "
+                   r"but sum of lengths is 1 and value's shape is: \[3\]")
+      with self.assertRaisesOpError(error_msg):
         self.evaluate(ta.split([1.0, 2.0, 3.0], [1]).flow)
 
       ta = _make_ta(1, "baz")
-      with self.assertRaisesOpError(
-          r"Expected value to be at least a vector, but received shape: \[\]"):
-        self.evaluate(ta.split(1.0, [1]).flow)
+      if tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 and not in_eager_mode:
+        with self.assertRaisesRegexp(
+            ValueError, "Shape must be at least rank 1 but is rank 0"):
+          self.evaluate(ta.split(1.0, [1]).flow)
+      else:
+        with self.assertRaisesOpError(
+            r"Expected value to be at least a vector, but received shape: \[\]"
+        ):
+          self.evaluate(ta.split(1.0, [1]).flow)
 
-      ta = _make_ta(2, "buz")
-      with self.assertRaisesOpError(
-          r"TensorArray's size is not equal to the size of lengths "
-          r"\(2 vs. 1\), and the TensorArray is not marked as "
-          r"dynamically resizeable"):
-        self.evaluate(ta.split([1.0], [1]).flow)
+      if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2 or in_eager_mode:
+        ta = _make_ta(2, "buz")
+        with self.assertRaisesOpError(
+            r"TensorArray's size is not equal to the size of lengths "
+            r"\(2 vs. 1\), and the TensorArray is not marked as "
+            r"dynamically resizeable"):
+          self.evaluate(ta.split([1.0], [1]).flow)
 
   def _testTensorArrayWriteGradientAddMultipleAdds(self, dtype):
     with self.cached_session(use_gpu=True):
@@ -546,12 +610,16 @@ class TensorArrayTest(test.TestCase):
           r"existing shape is \[\] but the new input shape is \[1\]"):
         wb1_grad.flow.eval()
 
-  def testTensorArrayWriteGradientAddMultipleAdds(self):
+  @test_util.disable_control_flow_v2("v2 does not support TensorArray.grad.")
+  @test_util.run_v1_only("v2 does not support TensorArray.grad.")
+  def testSkipEagerTensorArrayWriteGradientAddMultipleAdds(self):
     for dtype in (dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64,
                   dtypes.complex64, dtypes.complex128):
       self._testTensorArrayWriteGradientAddMultipleAdds(dtype)
 
-  def testTensorArrayGradWithShapeKnownElementShape(self):
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  @test_util.run_v1_only("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeKnownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3,
@@ -580,7 +648,9 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  def testTensorArrayGradWithShapeUnknownElementShape(self):
+  @test_util.disable_control_flow_v2("Low level legacy TA op test.")
+  @test_util.run_v1_only("Low level legacy TA op test.")
+  def testSkipEagerTensorArrayGradWithShapeUnknownElementShape(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           size=3, dtype=dtypes.float32,
@@ -603,7 +673,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(fed_value,
                           sess.run(read_value, feed_dict={value: fed_value}))
 
-  @test_util.run_in_graph_and_eager_modes
   def testMultiTensorArray(self):
     with self.session(use_gpu=True):
       h1 = tensor_array_ops.TensorArray(
@@ -667,7 +736,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual(c([[3.0, 2.0]]), grad_vals[0])
       self.assertAllEqual(c(-2.0), grad_vals[1])
 
-  def testTensorArrayGradientWriteRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientWriteRead(self):
     for dtype in (np.float32, np.float64, np.complex64, np.complex128):
       self._testTensorArrayGradientWriteReadType(dtype)
 
@@ -698,15 +768,17 @@ class TensorArrayTest(test.TestCase):
                 [-0.5, 1.5],  # read(0) gradient
                 [20.0, 30.0, 40.0, 50.0]
             ])  # concat gradient
-      grad_vals = sess.run(grad_r)  # 2 + 2 entries
+      grad_vals = self.evaluate(grad_r)  # 2 + 2 entries
 
       self.assertAllClose([2.0 - 0.5 + 20.0, 3.0 + 1.5 + 30.0], grad_vals[0])
       self.assertAllEqual([4.0 + 40.0, 5.0 + 50.0], grad_vals[1])
 
-  def testTensorArrayGradientWritePackConcatAndRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientWritePackConcatAndRead(self):
     self._testTensorArrayGradientWritePackConcatAndRead()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("v2 does not support clear_after_read.")
+  @test_util.run_v1_only("v2 does not support clear_after_read.")
   def testTensorArrayReadTwice(self):
     with self.session(use_gpu=True):
       value = constant_op.constant([[1.0, -1.0], [10.0, -10.0]])
@@ -760,10 +832,12 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0 - 1.5, 3.0 + 1.5], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientUnpackRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientUnpackRead(self):
     self._testTensorArrayGradientUnpackRead()
 
-  def testTensorArrayGradientSplitConcat(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientSplitConcat(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=2,
@@ -808,17 +882,16 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(len(grad_vals), 1)
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  def testTensorArrayGradientDynamicUnpackRead(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradientDynamicUnpackRead(self):
     self._testTensorArrayGradientDynamicUnpackRead()
 
-  @test_util.run_in_graph_and_eager_modes
   def testCloseTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
       self.evaluate(ta.close())
 
-  @test_util.run_in_graph_and_eager_modes
   def testSizeTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -826,7 +899,6 @@ class TensorArrayTest(test.TestCase):
       s = ta.size()
       self.assertAllEqual(3, self.evaluate(s))
 
-  @test_util.run_in_graph_and_eager_modes
   def testWriteCloseTensorArray(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -924,7 +996,6 @@ class TensorArrayTest(test.TestCase):
       self.assertAllClose(grad_val.sum(axis=0), var_grad_t)
       self.assertAllClose(grad_val.sum(axis=0), state0_grad_t)
 
-  @test_util.run_in_graph_and_eager_modes
   def testWhileLoopWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=False, dtype=dtypes.float32)
@@ -932,11 +1003,28 @@ class TensorArrayTest(test.TestCase):
     # self._testWhileLoopWritePackGradients(
     #     dynamic_size=False, dtype=tf.int64)
 
-  def testWhileLoopDynamicWritePackGradients(self):
+  @test_util.disable_control_flow_v2("Testing v1 while_loop with v2 TA")
+  @test_util.enable_tensor_array_v2
+  def testWhileLoopV1WithTensorArrayV2(self):
+    size = 3
+    ta = tensor_array_ops.TensorArray(
+        dtype=dtypes.int32, size=size, element_shape=tensor_shape.scalar())
+
+    def Body(counter, ta):
+      return counter + 1, ta.write(counter, counter)
+
+    _, ta = control_flow_ops.while_loop(lambda i, _: i < size, Body, [0, ta])
+
+    for i in range(size):
+      self.assertEqual(self.evaluate(ta.read(i)), i)
+
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerWhileLoopDynamicWritePackGradients(self):
     self._testWhileLoopWritePackGradients(
         dynamic_size=True, dtype=dtypes.float32)
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/119323158")
   def testGradSerialTwoLoops(self):
     with self.session(use_gpu=True):
       def loop(x):
@@ -976,7 +1064,8 @@ class TensorArrayTest(test.TestCase):
         grad = gradients_impl.gradients(loop(x), [x])[0]
       self.assertAllClose(31.0, self.evaluate(grad))
 
-  def testSumOfTwoReadVariablesWithoutRepeatGrad(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerSumOfTwoReadVariablesWithoutRepeatGrad(self):
     with self.session(use_gpu=True) as session:
       a = array_ops.identity(
           np.arange(
@@ -1011,7 +1100,8 @@ class TensorArrayTest(test.TestCase):
   def _grad_source_for_name(self, name):
     return tensor_array_grad._GetGradSource(constant_op.constant(0, name=name))
 
-  def testGetGradSource_Invalid(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_Invalid(self):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("")
     with self.assertRaises(ValueError):
@@ -1019,7 +1109,8 @@ class TensorArrayTest(test.TestCase):
     with self.assertRaises(ValueError):
       self._grad_source_for_name("foo/bar")
 
-  def testGetGradSource_NoEnclosingScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_NoEnclosingScope(self):
     self.assertEqual("gradients:0", self._grad_source_for_name("gradients"))
     self.assertEqual("gradients_0:0", self._grad_source_for_name("gradients_0"))
     self.assertEqual("gradients", self._grad_source_for_name("gradients/foo"))
@@ -1030,7 +1121,8 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("gradients_0",
                      self._grad_source_for_name("gradients_0/foo/bar"))
 
-  def testGetGradSource_EnclosingScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_EnclosingScope(self):
     self.assertEqual("foo/gradients:0",
                      self._grad_source_for_name("foo/gradients"))
     self.assertEqual("foo/gradients_0:0",
@@ -1044,12 +1136,14 @@ class TensorArrayTest(test.TestCase):
     self.assertEqual("foo/bar/gradients_0",
                      self._grad_source_for_name("foo/bar/gradients_0/baz"))
 
-  def testGetGradSource_NestedUsesInnermost(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerGetGradSource_NestedUsesInnermost(self):
     self.assertEqual(
         "foo/gradients/bar/gradients_0",
         self._grad_source_for_name("foo/gradients/bar/gradients_0/baz"))
 
-  def testWriteShape(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=3)
@@ -1073,7 +1167,8 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w0.write(0, c2)
 
-  def testPartlyUnknownShape(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerPartlyUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, tensor_array_name="foo", size=6)
@@ -1113,7 +1208,6 @@ class TensorArrayTest(test.TestCase):
       r5 = w5.read(0)
       self.assertAllEqual([5, 4, 2, 3], r5.get_shape().as_list())
 
-  @test_util.run_in_graph_and_eager_modes
   def _testUnpackShape(self):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1144,10 +1238,12 @@ class TensorArrayTest(test.TestCase):
       with self.assertRaises(ValueError):
         w1.write(4, c2)
 
+  @test_util.disable_control_flow_v2("b/117943489 (dynamic_size)")
+  @test_util.run_v1_only("b/117943489")
   def testUnpackShape(self):
     self._testUnpackShape()
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.run_deprecated_v1
   def testSplitShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
@@ -1174,11 +1270,13 @@ class TensorArrayTest(test.TestCase):
         self.assertEqual((2, 2), w0.read(1).get_shape())
       else:
         self.assertEqual(r0.get_shape().ndims, None)
-        self.assertEqual(
-            tensor_shape.TensorShape(
-                ta1.handle.op.get_attr("element_shape")).ndims, None)
+        if not tensor_array_ops.ENABLE_TENSOR_ARRAY_V2:
+          self.assertEqual(
+              tensor_shape.TensorShape(
+                  ta1.handle.op.get_attr("element_shape")).ndims, None)
 
-  def testWriteUnknownShape(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerWriteUnknownShape(self):
     with self.session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -1201,7 +1299,11 @@ class TensorArrayTest(test.TestCase):
       grad_r0_vals = session.run(grad_r0)[0]
       self.assertAllEqual(grad_r0_vals, [1.0, 0.0])
 
-  def testGradientWhenNotAllComponentsRead(self):
+  # TODO(srbs): Figure out how to enable this. This is probably failing
+  # because we are trying to stack a TensorList with invalid tensors.
+  # That is because we do not receive gradients for all list indices.
+  # Figure out how TensorArray handles this.
+  def disabletestGradientWhenNotAllComponentsRead(self):
     self._testGradientWhenNotAllComponentsRead()
 
   def _testTensorArrayUnpackDynamic(self):
@@ -1212,14 +1314,18 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.unstack(x)
       w1 = w0.write(3, 4.0)
       r = w1.stack()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
-  def testTensorArrayUnpackDynamic(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayUnpackDynamic(self):
     self._testTensorArrayUnpackDynamic()
 
-  def testTensorArraySplitDynamic(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArraySplitDynamic(self):
     with self.session(use_gpu=True) as sess:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=3, dynamic_size=True)
@@ -1227,21 +1333,26 @@ class TensorArrayTest(test.TestCase):
       w0 = ta.split(x, [1, 1, 1])
       w1 = w0.write(3, [4.0])
       r = w1.concat()
-      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), r.eval())
+      self.assertAllEqual(np.array([1.0, 2.0, 3.0, 4.0]), self.evaluate(r))
       grad = gradients_impl.gradients(ys=[r], xs=[x])
-      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), sess.run(grad)[0])
+      self.assertAllEqual(np.array([1.0, 1.0, 1.0]), self.evaluate(grad)[0])
 
   def _testTensorArrayEvalEmpty(self):
     with self.cached_session(use_gpu=True):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=0, dynamic_size=False, infer_shape=False)
-      with self.assertRaisesOpError(
-          "TensorArray has size zero, but element shape <unknown> is not fully "
-          "defined. Currently only static shapes are supported when packing "
-          "zero-size TensorArrays."):
+      v2_msg = ("Tried to stack elements of a empty list with "
+                "non-fully-defined shape")
+      v1_msg = (
+          "TensorArray has size zero, but element shape <unknown> is not "
+          "fully defined. Currently only static shapes are supported when "
+          "packing zero-size TensorArrays.")
+      with self.assertRaisesOpError(v2_msg if tensor_array_ops
+                                    .ENABLE_TENSOR_ARRAY_V2 else v1_msg):
         ta.stack().eval()
 
-  def testTensorArrayEvalEmpty(self):
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayEvalEmpty(self):
     self._testTensorArrayEvalEmpty()
 
   # this test is ill-defined for Eager mode --- unpacking an empty tensor
@@ -1255,15 +1366,19 @@ class TensorArrayTest(test.TestCase):
       ta.unstack(array_ops.zeros([0, 3, 5])).mark_used()
       packed = ta.stack()
       concatenated = ta.concat()
-      self.assertAllEqual([0, 3, 5], packed.eval().shape)
+      self.assertAllEqual([0, 3, 5], self.evaluate(packed).shape)
       # Concatenating zero tensors along their first dimension gives a
       # first dimension of zero
-      self.assertAllEqual([0, 5], concatenated.eval().shape)
+      self.assertAllEqual([0, 5], self.evaluate(concatenated).shape)
 
-  def testTensorArrayEvalEmptyWithDefault(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayEvalEmptyWithDefault(self):
     self._testTensorArrayEvalEmptyWithDefault()
 
-  def testTensorArrayScatterReadAndGradients(self):
+  @test_util.disable_control_flow_v2("b/117943489")
+  @test_util.run_v1_only("b/117943489")
+  def testSkipEagerTensorArrayScatterReadAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32,
@@ -1289,7 +1404,8 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([10.0, -10.0], read_vals[1])
       self.assertAllEqual([[2.0, 3.0], [4.0, 5.0]], grad_vals[0])
 
-  @test_util.run_in_graph_and_eager_modes
+  @test_util.disable_control_flow_v2("b/117943286")
+  @test_util.run_v1_only("b/117943286")
   def testTensorArrayWriteGatherAndGradients(self):
     with self.session(use_gpu=True) as session:
       ta = tensor_array_ops.TensorArray(
@@ -1326,7 +1442,9 @@ class TensorArrayTest(test.TestCase):
       self.assertAllEqual([[1.0, -1.0], [8.0, -8.0]], g_vals[0])
       self.assertAllEqual(expected_grad, grad_vals[0])
 
-  def testTensorArrayGetsDeviceFromFirstWrite(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWrite(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       # this initial device will be ignored.
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
@@ -1374,7 +1492,9 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "/TensorArray" in s.node_name])
 
-  def testTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayGetsDeviceFromFirstWriteInWhileLoop(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2)
 
@@ -1403,7 +1523,9 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  def testTensorArrayDisabledColocateWithFirstWriteCall(self):
+  @test_util.disable_control_flow_v2("colocate_with not supported in v2.")
+  @test_util.run_v1_only("b/120545219")
+  def testSkipEagerTensorArrayDisabledColocateWithFirstWriteCall(self):
     with ops.device("/job:worker/task:0/cpu:0"):
       ta = tensor_array_ops.TensorArray(
           dtype=dtypes.float32, size=2, colocate_with_first_write_call=False)
@@ -1433,7 +1555,6 @@ class TensorArrayTest(test.TestCase):
         self.assertFalse(
             [s for s in dev_stats[d] if "TensorArray" == s.node_name])
 
-  @test_util.run_in_graph_and_eager_modes
   def testTensorArrayIdentity(self):
     with self.session(use_gpu=True):
       ta0 = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=2,
@@ -1486,7 +1607,8 @@ class TensorArrayTest(test.TestCase):
       self.assertEqual(size0_v, 2)
       self.assertEqual(size1_v, 4)
 
-  def testTensorArrayGradYsInCorrectScope(self):
+  @test_util.run_deprecated_v1
+  def testSkipEagerTensorArrayGradYsInCorrectScope(self):
     n_time = 1
     n_dim = 1
     x = constant_op.constant([[1.42]])
@@ -1501,10 +1623,10 @@ class TensorArrayTest(test.TestCase):
       # wrap it in the correct name scope.
       dx, = gradients_impl.gradients(ys=[y], xs=[x], grad_ys=[dy])
       with self.cached_session(use_gpu=True) as sess:
-        vdx, vdy = sess.run([dx, dy])
+        vdx, vdy = self.evaluate([dx, dy])
       self.assertAllClose(vdx, vdy)
 
-  def testTensorArrayInt64GPU(self):
+  def testSkipEagerTensorArrayInt64GPU(self):
     if not test.is_gpu_available():
       return
     with self.session(use_gpu=True, force_gpu=True) as sess:
diff --git a/tensorflow/python/kernel_tests/tensordot_op_test.py b/tensorflow/python/kernel_tests/tensordot_op_test.py
index 123c9b376c9de0b39b1b6a61548819501ec4bd59..febfe23b16d0a5b56102dd1c4c21d5cf16a0e1dc 100644
--- a/tensorflow/python/kernel_tests/tensordot_op_test.py
+++ b/tensorflow/python/kernel_tests/tensordot_op_test.py
@@ -20,9 +20,11 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test as test_lib
@@ -39,6 +41,7 @@ def _add_test(test, test_name, fn):
 
 class TensordotTest(test_lib.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_shape(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4], [5, 6]]
@@ -62,6 +65,7 @@ class TensordotTest(test_lib.TestCase):
                 axes_ph: (a_axes, b_axes)
             })
 
+  @test_util.run_v1_only("b/120545219")
   def test_invalid_axes(self):
     a = [[1, 2], [3, 4]]
     b = [[1, 2], [3, 4]]
@@ -99,11 +103,12 @@ class TensordotTest(test_lib.TestCase):
 
         tf_a = array_ops.ones((3, 3), dtype=dtypes.float32)
         tf_b = constant_op.constant([2, 3, 1], dtype=dtypes.float32)[None, None]
-        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value).eval()
+        tf_ans = math_ops.tensordot(tf_a, tf_b, axes_value)
 
         self.assertAllEqual(tf_ans.shape, np_ans.shape)
         self.assertAllEqual(tf_ans, np_ans)
 
+  @test_util.run_v1_only("b/120545219")
   def test_partial_shape_inference(self):
     for axes in ([1], [0]), 1:
       a = array_ops.placeholder(dtypes.float32)
@@ -178,7 +183,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
                   axes: (a_dims_np, b_dims_np)
               })
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np)).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, (a_dims_np, b_dims_np))
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -208,7 +213,7 @@ def _get_tensordot_tests(dtype_, rank_a_, rank_b_, num_dims_, dynamic_shape_):
           c = math_ops.tensordot(a, b, axes=axes)
           tf_ans = sess.run(c, feed_dict={a: a_np, b: b_np})
         else:
-          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes).eval()
+          tf_ans = math_ops.tensordot(a_np, b_np, axes=axes)
       self.assertAllClose(tf_ans, np_ans, rtol=tol, atol=tol)
       self.assertAllEqual(tf_ans.shape, np_ans.shape)
 
@@ -220,7 +225,8 @@ if __name__ == "__main__":
     for rank_a in 1, 2, 4, 5:
       for rank_b in 1, 2, 4, 5:
         for num_dims in range(0, min(rank_a, rank_b) + 1):
-          for dynamic_shape in False, True:
+          # TF2 does not support placeholders under eager so we skip it
+          for dynamic_shape in set([False, not tf2.enabled()]):
             for testcase in _get_tensordot_tests(dtype, rank_a, rank_b,
                                                  num_dims, dynamic_shape):
               name = "%s_%s_%s_%s_%s_%s" % (testcase.__name__, dtype.__name__,
diff --git a/tensorflow/python/kernel_tests/topk_op_test.py b/tensorflow/python/kernel_tests/topk_op_test.py
index d9f340de6b2af442fd23b9cc627bac7ae6602efd..5d46176bce87a94ac6f2c2ce51739c0289b38b80 100644
--- a/tensorflow/python/kernel_tests/topk_op_test.py
+++ b/tensorflow/python/kernel_tests/topk_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_ops
@@ -48,7 +49,7 @@ class TopKTest(test.TestCase):
     np_expected_indices = np.array(expected_indices)
     with self.cached_session(use_gpu=True) as sess:
       values_op, indices_op = nn_ops.top_k(inputs, k, sorted=sorted)
-      values, indices = sess.run([values_op, indices_op])
+      values, indices = self.evaluate([values_op, indices_op])
 
       self.assertShapeEqual(np_expected_values, values_op)
       self.assertShapeEqual(np_expected_indices, indices_op)
@@ -181,6 +182,7 @@ class TopKTest(test.TestCase):
     k = constant_op.constant(3)
     self._validateTopK(inputs, k, [19, 18, 17], [11, 3, 7])
 
+  @test_util.run_deprecated_v1
   def testKNegative(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.session(use_gpu=True):
@@ -189,12 +191,14 @@ class TopKTest(test.TestCase):
       with self.assertRaisesOpError("Need k >= 0, got -7"):
         values.eval(feed_dict={k: -7})
 
+  @test_util.run_deprecated_v1
   def testKTooLarge(self):
     inputs = [[0.1, 0.2], [0.3, 0.4]]
     with self.assertRaisesRegexp(ValueError,
                                  r"must have last dimension >= k = 4"):
       nn_ops.top_k(inputs, 4)
 
+  @test_util.run_deprecated_v1
   def testTopKGradients(self):
     with self.session(use_gpu=True) as sess:
       inputs = array_ops.placeholder(dtypes.float32, shape=[2, 5])
diff --git a/tensorflow/python/kernel_tests/trace_op_test.py b/tensorflow/python/kernel_tests/trace_op_test.py
index f1abaefb66b9dd4d6055d14fd4fb9a436a8acd62..52640c02c22770ba516a61488de7166b6d45ddf6 100644
--- a/tensorflow/python/kernel_tests/trace_op_test.py
+++ b/tensorflow/python/kernel_tests/trace_op_test.py
@@ -19,6 +19,7 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 
@@ -34,6 +35,7 @@ class TraceTest(test.TestCase):
       tf_ans = math_ops.trace(x).eval()
     self.assertAllClose(tf_ans, np_ans)
 
+  @test_util.run_deprecated_v1
   def testTrace(self):
     for dtype in [np.int32, np.float32, np.float64]:
       for shape in [[2, 2], [2, 3], [3, 2], [2, 3, 2], [2, 2, 2, 3]]:
diff --git a/tensorflow/python/kernel_tests/transpose_op_test.py b/tensorflow/python/kernel_tests/transpose_op_test.py
index 8c11c2070973cbd4780871e2d716bb9bd2cbb3f9..76e1002ee1b97cea9fa29763b39f39a486a0ec16 100644
--- a/tensorflow/python/kernel_tests/transpose_op_test.py
+++ b/tensorflow/python/kernel_tests/transpose_op_test.py
@@ -50,7 +50,7 @@ class TransposeTest(test.TestCase):
     with self.cached_session(use_gpu=False):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
       self.assertShapeEqual(np_ans, y)
       self.assertAllEqual(np_ans, tf_ans)
 
@@ -81,7 +81,7 @@ class TransposeTest(test.TestCase):
     with self.cached_session(use_gpu=True):
       inx = ops.convert_to_tensor(x)
       y = array_ops.transpose(inx, p, conjugate=conjugate)
-      tf_ans = y.eval()
+      tf_ans = self.evaluate(y)
 
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
@@ -168,7 +168,7 @@ class TransposeTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -189,7 +189,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -224,7 +224,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -246,7 +246,7 @@ class TransposeTest(test.TestCase):
         with self.cached_session(use_gpu=True):
           inx = ops.convert_to_tensor(inp)
           y = array_ops.transpose(inx, perm)
-          tf_ans = y.eval()
+          tf_ans = self.evaluate(y)
         self.assertAllEqual(np_ans, tf_ans)
         self.assertShapeEqual(np_ans, y)
 
@@ -267,7 +267,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
 
@@ -319,7 +319,7 @@ class TransposeTest(test.TestCase):
       with self.cached_session(use_gpu=True):
         inx = ops.convert_to_tensor(inp)
         y = array_ops.transpose(inx, perm)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
       self.assertAllEqual(np_ans, tf_ans)
       self.assertShapeEqual(np_ans, y)
       self._ClearCachedSession()
@@ -341,7 +341,7 @@ class TransposeTest(test.TestCase):
         inx = ops.convert_to_tensor(x)
         inp = constant_op.constant(p)
         y = array_ops.transpose(inx, inp)
-        tf_ans = y.eval()
+        tf_ans = self.evaluate(y)
         self.assertShapeEqual(np_ans, y)
         self.assertAllEqual(np_ans, tf_ans)
 
diff --git a/tensorflow/python/kernel_tests/unicode_decode_op_test.py b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c165021eea3eba54fbc77aa328acebaccd844a74
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_decode_op_test.py
@@ -0,0 +1,153 @@
+# -*- coding: utf-8 -*-
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for unicode_decode and unicode_decode_with_splits."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.platform import test
+
+
+# Account for python2 and python3 execution of the test.
+def codepoint(s):
+  if isinstance(s, bytes):
+    return ord(s.decode("utf-8"))
+  elif isinstance(s, str):
+    return ord(s)
+
+
+class UnicodeDecodeTest(test.TestCase):
+
+  def testBatchDecode(self):
+    text = constant_op.constant(
+        ["仅今年前", "分享介面終於迎來更新"])
+    row_splits, utf8_text, offsets = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+          codepoint("分"),
+          codepoint("享"),
+          codepoint("介"),
+          codepoint("面"),
+          codepoint("終"),
+          codepoint("於"),
+          codepoint("迎"),
+          codepoint("來"),
+          codepoint("更"),
+          codepoint("新")
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 4, 14], self.evaluate(row_splits).tolist())
+      self.assertAllEqual([0, 3, 6, 9, 0, 3, 6, 9, 12, 15, 18, 21, 24, 27],
+                          self.evaluate(offsets).tolist())
+
+  def testBasicDecodeWithOffset(self):
+    text = constant_op.constant(["仅今年前"])
+    row_splits, utf8_text, starts = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8")
+
+    with self.test_session():
+      self.assertAllEqual([
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual(self.evaluate(row_splits).tolist(), [0, 4])
+      self.assertAllEqual(self.evaluate(starts).tolist(), [0, 3, 6, 9])
+
+  @test_util.run_deprecated_v1
+  def testStrictError(self):
+    text = constant_op.constant([b"\xFEED"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="strict")
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testReplaceOnError(self):
+    text = constant_op.constant([b"\xFE"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [65533])
+
+  @test_util.run_deprecated_v1
+  def testBadReplacementChar(self):
+    text = constant_op.constant([b"\xFE"])
+    _, error, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="replace", replacement_char=11141111)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      with self.test_session():
+        self.evaluate(error)
+
+  def testIgnoreOnError(self):
+    text = constant_op.constant([b"\xFEhello"])
+
+    _, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", errors="ignore")
+
+    with self.test_session():
+      self.assertAllEqual(self.evaluate(utf8_text).tolist(), [
+          codepoint("h"),
+          codepoint("e"),
+          codepoint("l"),
+          codepoint("l"),
+          codepoint("o")
+      ])
+
+  @test_util.run_deprecated_v1
+  def testBadErrorPolicy(self):
+    text = constant_op.constant(["hippopotamus"])
+
+    with self.assertRaises(ValueError):
+      _, _, _ = gen_string_ops.unicode_decode_with_offsets(
+          text, "utf-8", errors="oranguatan")
+
+  def testReplaceControlChars(self):
+    text = constant_op.constant(["\x02仅今年前"])
+    row_splits, utf8_text, _ = gen_string_ops.unicode_decode_with_offsets(
+        text, "utf-8", replace_control_characters=True)
+
+    with self.test_session():
+      self.assertAllEqual([
+          65533,
+          codepoint("仅"),
+          codepoint("今"),
+          codepoint("年"),
+          codepoint("前"),
+      ],
+                          self.evaluate(utf8_text).tolist())
+      self.assertAllEqual([0, 5], self.evaluate(row_splits).tolist())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_encode_op_test.py b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f3cd8a6577e06fc4b3de81585d8b48231ae7076
--- /dev/null
+++ b/tensorflow/python/kernel_tests/unicode_encode_op_test.py
@@ -0,0 +1,271 @@
+# Copyright 2016 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for UnicodeEncode op from ragged_string_ops."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import errors_impl as errors
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops.ragged import ragged_string_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.platform import test
+
+
+class UnicodeEncodeOpTest(test.TestCase, parameterized.TestCase):
+
+  def assertRaggedEqual(self, rt, expected):
+    with self.cached_session() as sess:
+      value = sess.run(rt)
+      if isinstance(value, np.ndarray):
+        value = value.tolist()
+      elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+        value = value.to_list()
+      self.assertEqual(value, expected)
+
+  def testScalar(self):
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        ragged_string_ops.unicode_encode(72, "UTF-8")
+    with self.cached_session():
+      with self.assertRaises(ValueError):
+        ragged_string_ops.unicode_encode(constant_op.constant(72), "UTF-8")
+
+  def testRequireParams(self):
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode()
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode(72)
+    with self.cached_session():
+      with self.assertRaises(TypeError):
+        ragged_string_ops.unicode_encode(encoding="UTF-8")
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  def testStrictErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    with self.cached_session() as session:
+      with self.assertRaises(errors.InvalidArgumentError):
+        session.run(
+            ragged_string_ops.unicode_encode(test_value, encoding, "strict"))
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testIgnoreErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"Heo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "ignore")
+    with self.cached_session() as session:
+      result = session.run(unicode_encode_op)
+      self.assertIsInstance(result, bytes)
+      self.assertAllEqual(result, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testReplaceErrors(self, encoding):
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace")
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Test custom replacement character
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"Heooo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace", 111)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Verify "replace" is default
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    expected_value = u"He\U0000fffd\U0000fffdo".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Replacement_char must be within range
+    test_value = np.array([72, 101, 2147483647, -1, 111], np.int32)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding,
+                                                         "replace", 1114112)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(unicode_encode_op)
+
+  # -- regular Tensor tests -- #
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testVector(self, encoding):
+    test_value = np.array([72, 101, 108, 108, 111], np.int32)
+    expected_value = u"Hello".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    test_value = np.array([72, 101, 195, 195, 128516], np.int32)
+    expected_value = u"He\xc3\xc3\U0001f604".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    # Single character string
+    test_value = np.array([72], np.int32)
+    expected_value = u"H".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+    test_value = np.array([128516], np.int32)
+    expected_value = u"\U0001f604".encode(encoding)
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testMatrix(self, encoding):
+    test_value = np.array(
+        [[72, 128516, 108, 108, 111], [87, 128516, 114, 108, 100]], np.int32)
+    expected_value = [
+        u"H\U0001f604llo".encode(encoding), u"W\U0001f604rld".encode(encoding)
+    ]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertAllEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrix(self, encoding):
+    test_value = constant_op.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
+         [[102, 105, 120, 101, 100], [119, 111, 114, 100, 115]],
+         [[72, 121, 112, 101, 114], [99, 117, 98, 101, 46]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World".encode(encoding)],
+                      [u"fixed".encode(encoding), u"words".encode(encoding)],
+                      [u"Hyper".encode(encoding), u"cube.".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test4DimMatrix(self, encoding):
+    test_value = constant_op.constant(
+        [[[[72, 101, 108, 108, 111]], [[87, 111, 114, 108, 100]]],
+         [[[102, 105, 120, 101, 100]], [[119, 111, 114, 100, 115]]],
+         [[[72, 121, 112, 101, 114]], [[99, 117, 98, 101, 46]]]], np.int32)
+    expected_value = [[[u"Hello".encode(encoding)],
+                       [u"World".encode(encoding)]],
+                      [[u"fixed".encode(encoding)],
+                       [u"words".encode(encoding)]],
+                      [[u"Hyper".encode(encoding)],
+                       [u"cube.".encode(encoding)]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  # -- Ragged Tensor tests -- #
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testRaggedMatrix(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[72, 195, 108, 108, 111], [87, 128516, 114, 108, 100, 46]], np.int32)
+    expected_value = [
+        u"H\xc3llo".encode(encoding), u"W\U0001f604rld.".encode(encoding)
+    ]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged2ndDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]],
+         [[102, 105, 120, 101, 100]],
+         [[72, 121, 112, 101, 114], [119, 111, 114, 100, 115],
+          [99, 117, 98, 101, 46]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World".encode(encoding)],
+                      [u"fixed".encode(encoding)],
+                      [
+                          u"Hyper".encode(encoding), u"words".encode(encoding),
+                          u"cube.".encode(encoding)
+                      ]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged3rdDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]],
+         [[68, 111, 110, 39, 116], [119, 195, 114, 114, 121, 44, 32, 98, 101]],
+         [[128516], []]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World.".encode(encoding)],
+                      [
+                          u"Don't".encode(encoding),
+                          u"w\xc3rry, be".encode(encoding)
+                      ], [u"\U0001f604".encode(encoding), u"".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test3DimMatrixWithRagged2ndAnd3rdDim(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100, 46]], [],
+         [[128516]]], np.int32)
+    expected_value = [[u"Hello".encode(encoding), u"World.".encode(encoding)],
+                      [], [u"\U0001f604".encode(encoding)]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def test4DimRaggedMatrix(self, encoding):
+    test_value = ragged_factory_ops.constant(
+        [[[[72, 101, 108, 108, 111], [87, 111, 114, 108, 100]]],
+         [[[]], [[72, 121, 112, 101]]]], np.int32)
+    expected_value = [[[u"Hello".encode(encoding), u"World".encode(encoding)]],
+                      [[u"".encode(encoding)], [u"Hype".encode(encoding)]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+  @parameterized.parameters("UTF-8", "UTF-16-BE", "UTF-32-BE")
+  @test_util.run_v1_only("b/120545219")
+  def testRaggedMatrixWithMultiDimensionInnerValues(self, encoding):
+    test_flat_values = constant_op.constant([[[72, 101, 108, 108, 111],
+                                              [87, 111, 114, 108, 100]],
+                                             [[102, 105, 120, 101, 100],
+                                              [119, 111, 114, 100, 115]],
+                                             [[72, 121, 112, 101, 114],
+                                              [99, 117, 98, 101, 46]]])
+    test_row_splits = [
+        constant_op.constant([0, 2, 3], dtype=np.int64),
+        constant_op.constant([0, 1, 1, 3], dtype=np.int64)
+    ]
+    test_value = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        test_flat_values, test_row_splits)
+    expected_value = [[[[u"Hello".encode(encoding), u"World".encode(encoding)]],
+                       []],
+                      [[[u"fixed".encode(encoding), u"words".encode(encoding)],
+                        [u"Hyper".encode(encoding),
+                         u"cube.".encode(encoding)]]]]
+    unicode_encode_op = ragged_string_ops.unicode_encode(test_value, encoding)
+    self.assertRaggedEqual(unicode_encode_op, expected_value)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/kernel_tests/unicode_script_op_test.py b/tensorflow/python/kernel_tests/unicode_script_op_test.py
index 927e5459ed2cf56c6adc59323ef4e3a33eeb5dc7..83cfeb20216455a5fc11177991ef8aa7c5c44703 100644
--- a/tensorflow/python/kernel_tests/unicode_script_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_script_op_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
 
 class UnicodeScriptOpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testValidScripts(self):
     inputs = [
         ord("a"),
@@ -45,6 +47,7 @@ class UnicodeScriptOpTest(test.TestCase):
               0  # USCRIPT_COMMON (ZYYY)
           ])
 
+  @test_util.run_deprecated_v1
   def testInvalidScript(self):
     inputs = [-100, 0xffffff]
     with self.cached_session():
diff --git a/tensorflow/python/kernel_tests/unicode_transcode_op_test.py b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
index 2908e2bfc56981d15e54594ed155f30bc21b1aab..a3b4fd03474010b06009c52ad3afabf3e31ca024 100644
--- a/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
+++ b/tensorflow/python/kernel_tests/unicode_transcode_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for string_length_op."""
+"""Tests for unicode_transcode op."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -22,6 +22,7 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import string_ops
 from tensorflow.python.platform import test
 
@@ -42,7 +43,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
       outputs = string_ops.unicode_transcode(
@@ -52,7 +53,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
       outputs = string_ops.unicode_transcode(
@@ -62,7 +63,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
   def test_transcode_utf16_to_utf8(self):
@@ -77,7 +78,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   def test_transcode_bad_utf8(self):
@@ -90,7 +91,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=True)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"  ")
 
       outputs = string_ops.unicode_transcode(
@@ -100,7 +101,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00 ")
 
   def test_transcode_bad_utf8_with_some_good(self):
@@ -113,7 +114,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="replace",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"abc abcdefg")
 
   def test_transcode_bad_utf8_with_defaults(self):
@@ -121,7 +122,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00\xef\xbf\xbd")
 
   def test_transcode_bad_utf8_with_space_replacement(self):
@@ -130,9 +131,10 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8",
           replacement_char=ord(" "))
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00 ")
 
+  @test_util.run_deprecated_v1
   def test_transcode_bad_utf8_with_strict_errors(self):
     bad_string = b"\x00\xff"
     with self.cached_session() as sess:
@@ -143,8 +145,9 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="strict")
       with self.assertRaisesOpError(
           "Invalid formatting on input string"):
-        sess.run(outputs)
+        self.evaluate(outputs)
 
+  @test_util.run_deprecated_v1
   def test_transcode_bad_utf8_start_with_strict_errors(self):
     bad_string = b"\xffabcd"
     with self.cached_session() as sess:
@@ -155,7 +158,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           errors="strict")
       with self.assertRaisesOpError(
           "Invalid formatting on input string"):
-        sess.run(outputs)
+        self.evaluate(outputs)
 
   def test_transcode_bad_utf8_with_elision_of_malformatting(self):
     bad_string = b"\x00\xff"
@@ -165,7 +168,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           input_encoding="UTF-8",
           output_encoding="UTF-8",
           errors="ignore")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\x00")
 
   def test_transcode_bad_utf8_with_elision_including_control_chars(self):
@@ -177,7 +180,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           errors="ignore",
           replace_control_characters=True)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"")
 
   def test_transcode_bad_utf8_termination_with_defaults(self):
@@ -185,7 +188,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bad_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"a\xef\xbf\xbd")   # 0xFFFD
 
   def test_transcode_utf8_with_replacement_char(self):
@@ -194,13 +197,13 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
       outputs = string_ops.unicode_transcode(
           strings, input_encoding="UTF-8", output_encoding="UTF-8",
           errors="strict")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
 
       outputs = string_ops.unicode_transcode(
           strings, input_encoding="UTF-8", output_encoding="UTF-8",
           errors="replace", replacement_char=ord("?"))
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, [b"a\xef\xbf\xbd"])
 
   def test_transcode_utf8_to_utf16(self):
@@ -214,7 +217,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-16-BE",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       print("values=", values)
       self.assertAllEqual(values, expected)
 
@@ -230,7 +233,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   def test_transcode_utf8_to_utf32(self):
@@ -243,7 +246,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-32-BE",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, expected)
 
   # Documentation in ICU suggests that getNextUChar may produce a different
@@ -258,7 +261,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           output_encoding="UTF-8",
           replacement_char=ord(" "),
           replace_control_characters=False)
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, strings)
 
   def test_transcode_utf8_with_bom(self):
@@ -266,12 +269,12 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-8", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\xef\xbb\xbfabcdefg")  # BOM preserved
 
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-8", output_encoding="UTF-16-BE")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       utf16expected = bom_string.decode("UTF-8").encode("UTF-16-BE")
       self.assertAllEqual(values, utf16expected)
 
@@ -280,20 +283,20 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
     with self.cached_session() as sess:
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-BE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       # BOM is preserved in output
       self.assertAllEqual(values, b"\xef\xbb\xbfa")
 
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       # mangled BOM and value from (incorrect) LE encoding
       self.assertAllEqual(values, b"\xef\xbf\xbe\xe6\x84\x80")
 
       bom_string = b"\xff\xfe\x61\x00"  # Little-endian BOM with 'a' encoded
       outputs = string_ops.unicode_transcode(
           bom_string, input_encoding="UTF-16-LE", output_encoding="UTF-8")
-      values = sess.run(outputs)
+      values = self.evaluate(outputs)
       self.assertAllEqual(values, b"\xef\xbb\xbfa")
 
   @parameterized.parameters(
@@ -317,12 +320,14 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
       (b"\xfe\xff\x00<\xfe\xff\x00>", "UTF-16", b"<\xef\xbb\xbf>"),
       (b"\xff\xfe<\x00\xff\xfe>\x00", "UTF-16", b"<\xef\xbb\xbf>"),
   )
+  @test_util.run_deprecated_v1
   def test_bom_handling(self, string, input_encoding, expected):
     with self.test_session():
       output = string_ops.unicode_transcode(
           string, input_encoding=input_encoding, output_encoding="UTF-8")
       self.assertAllEqual(output.eval(), expected)
 
+  @test_util.run_deprecated_v1
   def test_invalid_encoding_causes_errors(self):
     strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
 
@@ -336,7 +341,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
           replace_control_characters=False)
       with self.assertRaisesOpError(
           "Could not create converter for input encoding: invalid"):
-        sess.run(outputs)
+        self.evaluate(outputs)
 
     with self.assertRaisesRegexp(ValueError, "Op passed string 'invalid'"):
       with self.cached_session() as sess:
@@ -347,8 +352,9 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
             errors="replace",
             replacement_char=ord(" "),
             replace_control_characters=False)
-        sess.run(outputs)
+        self.evaluate(outputs)
 
+  @test_util.run_deprecated_v1
   def test_invalid_error_policy_causes_errors(self):
     strings = [[b"a", b"abc"], [b"ABC", b"DEF"]]
 
@@ -362,7 +368,7 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
             errors="invalid",
             replacement_char=ord(" "),
             replace_control_characters=False)
-        sess.run(outputs)
+        self.evaluate(outputs)
 
   def test_forwarding(self):
     with self.cached_session():
@@ -378,6 +384,61 @@ class UnicodeTranscodeOpTest(test.TestCase, parameterized.TestCase):
 
       self.assertAllEqual([b"AbCdE", b"HiJkL"], transcoded)
 
+  @test_util.run_deprecated_v1
+  def test_cjk_encodings(self):
+    strings_ja = [
+        b"\x5c\x5c",  # Yen sign
+        b"\x8f\x70",  # kanji character "waza"
+        b"\x83\x4f"
+    ]  # katakana character "gu"
+    strings_zh_cn = [b"\xca\xf5"]  # simplified "shu4"
+    strings_zh_tw = [b"\xb3\x4e"]  # traditional "shu4"
+    strings_ko = [b"\xc7\xd1\xb9\xce"]  # hangul "hanmin"
+
+    expected_ja = [s.decode("shift_jis").encode("UTF-8") for s in strings_ja]
+    expected_zh_cn = [
+        s.decode("gb18030").encode("UTF-8") for s in strings_zh_cn
+    ]
+    expected_zh_tw = [s.decode("big5").encode("UTF-8") for s in strings_zh_tw]
+    expected_ko = [s.decode("euc_kr").encode("UTF-8") for s in strings_ko]
+
+    with self.cached_session() as sess:
+      outputs_ja = string_ops.unicode_transcode(
+          strings_ja,
+          input_encoding="shift_jis",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_cn = string_ops.unicode_transcode(
+          strings_zh_cn,
+          input_encoding="gb18030",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_zh_tw = string_ops.unicode_transcode(
+          strings_zh_tw,
+          input_encoding="big5",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      outputs_ko = string_ops.unicode_transcode(
+          strings_ko,
+          input_encoding="euc_kr",
+          output_encoding="UTF-8",
+          replacement_char=ord(" "),
+          replace_control_characters=False)
+
+      result_ja, result_zh_cn, result_zh_tw, result_ko = sess.run(
+          [outputs_ja, outputs_zh_cn, outputs_zh_tw, outputs_ko])
+
+      self.assertAllEqual(result_ja, expected_ja)
+      self.assertAllEqual(result_zh_cn, expected_zh_cn)
+      self.assertAllEqual(result_zh_tw, expected_zh_tw)
+      self.assertAllEqual(result_ko, expected_ko)
+
 
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/kernel_tests/unique_op_test.py b/tensorflow/python/kernel_tests/unique_op_test.py
index 316570e13e263a1a0b7bcef9c64448d58bb747af..f203263e0c567bb43ce1cb997bd343774d083d43 100644
--- a/tensorflow/python/kernel_tests/unique_op_test.py
+++ b/tensorflow/python/kernel_tests/unique_op_test.py
@@ -32,7 +32,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -43,7 +43,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x, out_idx=dtypes.int64)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -55,7 +55,7 @@ class UniqueTest(test.TestCase):
     x = [chr(i) for i in indx]
     with self.cached_session() as sess:
       y, idx = array_ops.unique(x)
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -67,9 +67,9 @@ class UniqueTest(test.TestCase):
       x = np.array([[1, 0, 0], [1, 0, 0], [2, 0, 0]])
       with self.cached_session() as sess:
         y0, idx0 = gen_array_ops.unique_v2(x, axis=np.array([0], dtype))
-        tf_y0, tf_idx0 = sess.run([y0, idx0])
+        tf_y0, tf_idx0 = self.evaluate([y0, idx0])
         y1, idx1 = gen_array_ops.unique_v2(x, axis=np.array([1], dtype))
-        tf_y1, tf_idx1 = sess.run([y1, idx1])
+        tf_y1, tf_idx1 = self.evaluate([y1, idx1])
       self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
       self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
       self.assertAllEqual(tf_y1, np.array([[1, 0], [1, 0], [2, 0]]))
@@ -81,7 +81,7 @@ class UniqueTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx = gen_array_ops.unique_v2(x, axis=np.array([], np.int32))
-      tf_y, tf_idx = sess.run([y, idx])
+      tf_y, tf_idx = self.evaluate([y, idx])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -95,7 +95,7 @@ class UniqueWithCountsTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -108,7 +108,7 @@ class UniqueWithCountsTest(test.TestCase):
     x = np.random.randint(2, high=10, size=7000)
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x, out_idx=dtypes.int64)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -123,7 +123,7 @@ class UniqueWithCountsTest(test.TestCase):
 
     with self.cached_session() as sess:
       y, idx, count = array_ops.unique_with_counts(x)
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
@@ -139,10 +139,10 @@ class UniqueWithCountsTest(test.TestCase):
       with self.cached_session() as sess:
         y0, idx0, count0 = gen_array_ops.unique_with_counts_v2(
             x, axis=np.array([0], dtype))
-        tf_y0, tf_idx0, tf_count0 = sess.run([y0, idx0, count0])
+        tf_y0, tf_idx0, tf_count0 = self.evaluate([y0, idx0, count0])
         y1, idx1, count1 = gen_array_ops.unique_with_counts_v2(
             x, axis=np.array([1], dtype))
-        tf_y1, tf_idx1, tf_count1 = sess.run([y1, idx1, count1])
+        tf_y1, tf_idx1, tf_count1 = self.evaluate([y1, idx1, count1])
       self.assertAllEqual(tf_y0, np.array([[1, 0, 0], [2, 0, 0]]))
       self.assertAllEqual(tf_idx0, np.array([0, 0, 1]))
       self.assertAllEqual(tf_count0, np.array([2, 1]))
@@ -157,7 +157,7 @@ class UniqueWithCountsTest(test.TestCase):
     with self.cached_session() as sess:
       y, idx, count = gen_array_ops.unique_with_counts_v2(
           x, axis=np.array([], np.int32))
-      tf_y, tf_idx, tf_count = sess.run([y, idx, count])
+      tf_y, tf_idx, tf_count = self.evaluate([y, idx, count])
 
     self.assertEqual(len(x), len(tf_idx))
     self.assertEqual(len(tf_y), len(np.unique(x)))
diff --git a/tensorflow/python/kernel_tests/unstack_op_test.py b/tensorflow/python/kernel_tests/unstack_op_test.py
index 6aea42990acf3541fa888f580ac8d82ea378096a..f5ba475e7adabc9bb5b057504ad854f550395440 100644
--- a/tensorflow/python/kernel_tests/unstack_op_test.py
+++ b/tensorflow/python/kernel_tests/unstack_op_test.py
@@ -41,7 +41,7 @@ class UnstackOpTest(test.TestCase):
 
   def testSimple(self):
     np.random.seed(7)
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [
             np.bool, np.float16, np.float32, np.float64, np.int32, np.int64
@@ -53,14 +53,15 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
   def testSimpleGpu(self):
     if not test_util.is_gpu_available():
       self.skipTest('No GPU available')
+
     np.random.seed(7)
-    with self.session(use_gpu=True, force_gpu=True):
+    with test_util.force_gpu():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
         for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
           data = np.random.randn(*shape).astype(dtype)
@@ -70,9 +71,10 @@ class UnstackOpTest(test.TestCase):
           cs = array_ops.unstack(x, num=shape[0])
           self.assertEqual(type(cs), list)
           self.assertEqual(len(cs), shape[0])
-          cs = [c.eval() for c in cs]
+          cs = [self.evaluate(c) for c in cs]
           self.assertAllEqual(cs, data)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis0(self):
     for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
       data = np.random.randn(*shape)
@@ -85,6 +87,7 @@ class UnstackOpTest(test.TestCase):
                                                         shapes[i])
           self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testGradientsAxis1(self):
     for shape in (2, 3), (3, 2), (4, 3, 2):
       data = np.random.randn(*shape)
@@ -98,6 +101,7 @@ class UnstackOpTest(test.TestCase):
                                                         out_shape)
           self.assertLess(err, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testInferNum(self):
     with self.cached_session():
       for shape in (2,), (3,), (2, 3), (3, 2), (4, 3, 2):
@@ -106,16 +110,19 @@ class UnstackOpTest(test.TestCase):
         self.assertEqual(type(cs), list)
         self.assertEqual(len(cs), shape[0])
 
+  @test_util.run_deprecated_v1
   def testCannotInferNumFromUnknownShape(self):
     x = array_ops.placeholder(np.float32)
     with self.assertRaisesRegexp(ValueError,
                                  r'Cannot infer num from shape <unknown>'):
       array_ops.unstack(x)
 
+  @test_util.run_deprecated_v1
   def testUnknownShapeOkWithNum(self):
     x = array_ops.placeholder(np.float32)
     array_ops.unstack(x, num=2)
 
+  @test_util.run_deprecated_v1
   def testCannotInferNumFromNoneShape(self):
     x = array_ops.placeholder(np.float32, shape=(None,))
     with self.assertRaisesRegexp(ValueError,
@@ -131,15 +138,13 @@ class UnstackOpTest(test.TestCase):
       for j in range(-i, i):
         expected = np_split_squeeze(a, j)
 
-        with self.cached_session() as sess:
-          actual_unstack = sess.run(array_ops.unstack(a, axis=j))
+        actual_unstack = self.evaluate(array_ops.unstack(a, axis=j))
 
         self.assertAllEqual(expected, actual_unstack)
 
   def testAxis0Default(self):
-    with self.cached_session() as sess:
-      a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
-      unstacked = sess.run(array_ops.unstack(a))
+    a = constant_op.constant([[1, 2, 3], [4, 5, 6]], name='a')
+    unstacked = self.evaluate(array_ops.unstack(a))
 
     self.assertEqual(len(unstacked), 2)
     self.assertAllEqual(unstacked[0], [1, 2, 3])
@@ -156,10 +161,9 @@ class UnstackOpTest(test.TestCase):
       array_ops.unstack(a, axis=-3)
 
   def testZeroLengthDim(self):
-    with self.cached_session():
-      x = array_ops.zeros(shape=(0, 1, 2))
-      y = array_ops.unstack(x, axis=1)[0].eval()
-      self.assertEqual(y.shape, (0, 2))
+    x = array_ops.zeros(shape=(0, 1, 2))
+    y = self.evaluate(array_ops.unstack(x, axis=1)[0])
+    self.assertEqual(y.shape, (0, 2))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/kernel_tests/variable_ops_test.py b/tensorflow/python/kernel_tests/variable_ops_test.py
index 3d2f8b61555f277cd67d65b27c43b81c2a45538e..0f3e261992537f6d57a2a6d7234ab255fe55e79c 100644
--- a/tensorflow/python/kernel_tests/variable_ops_test.py
+++ b/tensorflow/python/kernel_tests/variable_ops_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_state_ops
 from tensorflow.python.ops import math_ops
@@ -46,7 +47,7 @@ class VariableOpTest(test.TestCase):
       p = state_ops.variable_op(x.shape, tftype)
       op = state_ops.assign(p, x)
       op.op.run()
-      return p.eval()
+      return self.evaluate(p)
 
   def _testTypes(self, vals):
     for dtype in [np.float32, np.float64, np.int32, np.int64]:
@@ -59,15 +60,18 @@ class VariableOpTest(test.TestCase):
       # that Variable and Assign have GPU implementations for matching tf.
       self.assertAllEqual(x, self._initFetch(x, tftype, use_gpu=True))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     self._testTypes(np.arange(0, 20).reshape([4, 5]))
 
+  @test_util.run_deprecated_v1
   def testset_shape(self):
     p = state_ops.variable_op([1, 2], dtypes.float32)
     self.assertEqual([1, 2], p.get_shape())
     p = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     self.assertEqual(tensor_shape.unknown_shape(), p.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssign(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32)
@@ -75,6 +79,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoValidateShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32)
@@ -82,6 +87,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value, validate_shape=False)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoVarShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32, set_shape=False)
@@ -89,6 +95,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value)
     self.assertShapeEqual(value, assigned)
 
+  @test_util.run_deprecated_v1
   def testAssignNoVarShapeNoValidateShape(self):
     value = np.array([[42.0, 43.0]])
     var = state_ops.variable_op(value.shape, dtypes.float32, set_shape=False)
@@ -101,6 +108,7 @@ class VariableOpTest(test.TestCase):
     self.assertEqual(tensor_shape.unknown_shape(), tensor.get_shape())
     return tensor
 
+  @test_util.run_deprecated_v1
   def testAssignNoValueShape(self):
     value = self._NewShapelessTensor()
     shape = [1, 2]
@@ -109,6 +117,7 @@ class VariableOpTest(test.TestCase):
     self.assertEqual(shape, var.get_shape())
     self.assertEqual(shape, assigned.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoValueShapeNoValidateShape(self):
     value = self._NewShapelessTensor()
     shape = [1, 2]
@@ -117,6 +126,7 @@ class VariableOpTest(test.TestCase):
     assigned = state_ops.assign(var, value, validate_shape=False)
     self.assertEqual(tensor_shape.unknown_shape(), assigned.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoShape(self):
     with self.cached_session():
       value = self._NewShapelessTensor()
@@ -125,6 +135,7 @@ class VariableOpTest(test.TestCase):
       self.assertEqual(tensor_shape.unknown_shape(),
                        state_ops.assign(var, value).get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignNoShapeNoValidateShape(self):
     with self.cached_session():
       value = self._NewShapelessTensor()
@@ -135,6 +146,7 @@ class VariableOpTest(test.TestCase):
           state_ops.assign(
               var, value, validate_shape=False).get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdate(self):
     var = state_ops.variable_op([1, 2], dtypes.float32)
     added = state_ops.assign_add(var, [[2.0, 3.0]])
@@ -142,6 +154,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, [[12.0, 13.0]])
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoVarShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, [[2.0, 3.0]])
@@ -149,6 +162,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, [[12.0, 13.0]])
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoValueShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
@@ -156,6 +170,7 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual([1, 2], subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testAssignUpdateNoShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
@@ -163,24 +178,27 @@ class VariableOpTest(test.TestCase):
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
 
+  @test_util.run_deprecated_v1
   def testTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="foo")
       var = state_ops.assign(var, [[4.0, 5.0]])
       var = state_ops.assign_add(var, [[6.0, 7.0]])
       final = gen_state_ops.destroy_temporary_variable(var, var_name="foo")
-      self.assertAllClose([[10.0, 12.0]], final.eval())
+      self.assertAllClose([[10.0, 12.0]], self.evaluate(final))
 
+  @test_util.run_deprecated_v1
   def testDestroyNonexistentTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
       final = gen_state_ops.destroy_temporary_variable(var, var_name="bad")
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testDuplicateTemporaryVariable(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="dup")
       var1 = state_ops.assign(var1, [[1.0, 2.0]])
@@ -189,48 +207,53 @@ class VariableOpTest(test.TestCase):
       var2 = state_ops.assign(var2, [[3.0, 4.0]])
       final = var1 + var2
       with self.assertRaises(errors.AlreadyExistsError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testDestroyTemporaryVariableTwice(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable([1, 2], dtypes.float32)
       val1 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       val2 = gen_state_ops.destroy_temporary_variable(var, var_name="dup")
       final = val1 + val2
       with self.assertRaises(errors.NotFoundError):
-        final.eval()
+        self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testTemporaryVariableNoLeak(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="bar")
       final = array_ops.identity(var)
-      final.eval()
+      self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testTwoTemporaryVariablesNoLeaks(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       var1 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var1")
       var2 = gen_state_ops.temporary_variable(
           [1, 2], dtypes.float32, var_name="var2")
       final = var1 + var2
-      final.eval()
+      self.evaluate(final)
 
+  @test_util.run_deprecated_v1
   def testAssignDependencyAcrossDevices(self):
-    with self.test_session(use_gpu=True):
+    with test_util.use_gpu():
       # The variable and an op to increment it are on the GPU.
       var = state_ops.variable_op([1], dtypes.float32)
-      state_ops.assign(var, [1.0]).eval()
+      self.evaluate(state_ops.assign(var, [1.0]))
       increment = state_ops.assign_add(var, [1.0])
       with ops.control_dependencies([increment]):
-        with ops.device("/cpu:0"):
+        with test_util.force_cpu():
           # This mul op is pinned to the CPU, but reads the variable from the
           # GPU. The test ensures that the dependency on 'increment' is still
           # honored, i.e., the Send and Recv from GPU to CPU should take place
           # only after the increment.
           result = math_ops.multiply(var, var)
-      self.assertAllClose([4.0], result.eval())
+      self.assertAllClose([4.0], self.evaluate(result))
 
+  @test_util.run_deprecated_v1
   def testIsVariableInitialized(self):
     for use_gpu in [True, False]:
       with self.test_session(use_gpu=use_gpu):
diff --git a/tensorflow/python/kernel_tests/variable_scope_test.py b/tensorflow/python/kernel_tests/variable_scope_test.py
index 2ba064f8a502fa203f156895985169ce6b50a135..451eb3853062203a190def09f432f9d9e12f2edd 100644
--- a/tensorflow/python/kernel_tests/variable_scope_test.py
+++ b/tensorflow/python/kernel_tests/variable_scope_test.py
@@ -152,6 +152,7 @@ class VariableScopeTest(test.TestCase):
   # TypeError: Fetch argument <tf.Variable 'string:0' shape=() dtype=string>
   # has invalid type <class '...ResourceVariable'>, must be a string or Tensor.
   # (Can not convert a ResourceVariable into a Tensor or Operation.)
+  @test_util.run_deprecated_v1
   def testStringDefaultInitializer(self):
     with self.cached_session():
       v = variable_scope.get_variable("string", shape=[], dtype=dtypes.string)
@@ -236,7 +237,8 @@ class VariableScopeTest(test.TestCase):
         _ = d2(x)
         self.assertEqual(len(d2.variables), 2)
         v3, v4 = d2.variables
-        self.assertAllEqual([v1, v2], [v3, v4])
+        self.assertEqual(v1, v3)
+        self.assertEqual(v2, v4)
       f()
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
@@ -308,9 +310,9 @@ class VariableScopeTest(test.TestCase):
       self.evaluate(variables_lib.global_variables_initializer())
       self.assertAllEqual(self.evaluate(x.value()), self.evaluate(y.value()))
 
-  # TODO(alive): support variable partitioning/caching in eager mode.
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # InvalidArgumentError: /job:moo/replica:0/task:0/device:CPU:0 unknown device.
+  @test_util.run_deprecated_v1
   def testVarScopeCachingDevice(self):
     with self.cached_session():
       caching_device = "/job:moo"
@@ -425,6 +427,7 @@ class VariableScopeTest(test.TestCase):
   # invalid type <class '...ops.resource_variable_ops.ResourceVariable'>, must
   # be a string or Tensor. (Can not convert a ResourceVariable into a Tensor or
   # Operation.)
+  @test_util.run_deprecated_v1
   def testControlDeps(self):
     with self.cached_session() as sess:
       v0 = variable_scope.get_variable(
@@ -435,22 +438,23 @@ class VariableScopeTest(test.TestCase):
         add = v1 + v0
       # v0 should be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should be able to initialize and run v1 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual(1, sess.run(v1))
+      self.evaluate(v1.initializer)
+      self.assertEqual(1, self.evaluate(v1))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # AssertionError: True is not false (last assertFalse)
+  @test_util.run_deprecated_v1
   def testEnableResourceVariables(self):
     old = variable_scope._DEFAULT_USE_RESOURCE
     try:
@@ -465,6 +469,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # TypeError: Fetch argument None has invalid type <type 'NoneType'>
+  @test_util.run_deprecated_v1
   def testControlFlow(self):
     with self.cached_session() as sess:
       v0 = variable_scope.get_variable(
@@ -490,19 +495,19 @@ class VariableScopeTest(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should not be able to run 'add' yet.
       with self.assertRaisesRegexp(errors.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # TypeError: Expected tf.group() expected Tensor arguments not 'None' with
@@ -649,7 +654,7 @@ class VariableScopeTest(test.TestCase):
             "testVarScopeGetOrCreateReuse_bar",
             reuse=variable_scope.AUTO_REUSE):
           _ = variable_scope.get_variable("var", [])
-        self.assertEqual(value, x.eval())
+        self.assertEqual(value, self.evaluate(x))
 
       test_value(42.)  # Variable is created.
       test_value(13.)  # Variable is reused hereafter.
@@ -1149,6 +1154,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetCollection(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetCollection_a", [])
@@ -1205,6 +1211,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetTrainableVariablesWithGetVariable(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetTrainableVariables_a", [])
@@ -1243,6 +1250,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetTrainableVariablesWithVariable(self):
     with self.cached_session():
       _ = variable_scope.variable(1.0, name="testGetTrainableVariables_a")
@@ -1284,6 +1292,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetGlobalVariables(self):
     with self.cached_session():
       _ = variable_scope.get_variable("testGetGlobalVariables_a", [])
@@ -1296,6 +1305,7 @@ class VariableScopeTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testGetLocalVariables(self):
     with self.cached_session():
       _ = variable_scope.get_variable(
@@ -1313,6 +1323,28 @@ class VariableScopeTest(test.TestCase):
     # Ensure it is possible to do get_variable with a _ref dtype passed in.
     _ = variable_scope.get_variable("w", shape=[5, 6], dtype=v.dtype)
 
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesNoArgs(self):
+    v = variable_scope.get_variable("foo", initializer=lambda: [2])
+    self.assertEqual(v.name, "foo:0")
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesOptionalArgs(self):
+    v = variable_scope.get_variable("foo", initializer=lambda x=True: [2])
+    self.assertEqual(v.name, "foo:0")
+
+  @test_util.run_in_graph_and_eager_modes
+  @run_inside_wrap_function_in_eager_mode
+  def testGetVariableWithInitializerWhichTakesUnprovidedArgsAndNoShape(self):
+    with self.assertRaisesRegexp(
+        ValueError,
+        "The initializer passed is not valid. It should be a callable with no "
+        "arguments and the shape should not be provided or an instance of "
+        "`tf.keras.initializers.*' and `shape` should be fully defined."):
+      variable_scope.get_variable("foo", initializer=lambda x: [2])
+
   @test_util.run_in_graph_and_eager_modes
   @run_inside_wrap_function_in_eager_mode
   def testTwoGraphs(self):
@@ -1349,6 +1381,7 @@ class VariableScopeWithPartitioningTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testResultNameMatchesRequested(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1404,6 +1437,14 @@ class VariableScopeWithPartitioningTest(test.TestCase):
       v_reused = variable_scope.get_variable("name0")
     self.assertEqual(v, v_reused)
 
+  def testNoReuseInEagerByDefault(self):
+    with context.eager_mode():
+      with variable_scope.variable_scope(
+          "scope0", partitioner=axis0_into2_partitioner):
+        v1 = variable_scope.get_variable("name0", shape=(3, 1, 1))
+        v2 = variable_scope.get_variable("name0", shape=(3, 1, 1))
+        self.assertIsNot(v1, v2)
+
   @test_util.run_in_graph_and_eager_modes
   @run_inside_wrap_function_in_eager_mode
   def testPropagatePartitionerOnReopening(self):
@@ -1415,6 +1456,7 @@ class VariableScopeWithPartitioningTest(test.TestCase):
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
   # obtaining different results in the eager case compared to the graph one
+  @test_util.run_deprecated_v1
   def testScalarIgnoresPartitioner(self):
     with variable_scope.variable_scope(
         "scope0", partitioner=axis0_into2_partitioner):
@@ -1459,6 +1501,10 @@ class VariableScopeWithPartitioningTest(test.TestCase):
   def testPartitionConcatenatesAlongCorrectAxisResource(self):
     self._testPartitionConcatenatesAlongCorrectAxis(use_resource=True)
 
+  def testPartitionConcatenatesAlongCorrectAxisResourceInEager(self):
+    with context.eager_mode():
+      self._testPartitionConcatenatesAlongCorrectAxis(use_resource=True)
+
 
 class VariableScopeWithCustomGetterTest(test.TestCase):
 
@@ -1550,6 +1596,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
   # dtype=float32> cannot be interpreted as a Tensor. (Tensor
   # Tensor("custom_getter/add:0", shape=(1, 2, 3), dtype=float32) is not an
   # element of this graph.)
+  @test_util.run_deprecated_v1
   def testGetterThatCreatesTwoVariablesAndSumsThem(self):
 
     def custom_getter(getter, name, *args, **kwargs):
@@ -1569,7 +1616,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
     self.assertEqual("custom_getter/add:0", v.name)
     with self.cached_session() as sess:
       variables_lib.global_variables_initializer().run()
-      np_vars, np_v = sess.run([true_vars, v])
+      np_vars, np_v = self.evaluate([true_vars, v])
       self.assertAllClose(np_v, sum(np_vars))
 
   # TODO(mihaimaruseac): Not converted to use wrap_function because of
@@ -1577,6 +1624,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
   # dtype=float32> cannot be interpreted as a Tensor. (Tensor
   # Tensor("sum_getter_2/add:0", shape=(1, 2, 3), dtype=float32) is not an
   # element of this graph.)
+  @test_util.run_deprecated_v1
   def testNestedCustomGetters(self):
 
     def sum_getter(getter, name, *args, **kwargs):
@@ -1614,7 +1662,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
 
     with self.cached_session() as sess:
       variables_lib.global_variables_initializer().run()
-      np_vars, np_v = sess.run([true_vars, v])
+      np_vars, np_v = self.evaluate([true_vars, v])
       # take products of sums of products
       self.assertAllClose(
           np_v, (((np_vars[0] * np_vars[1]) + (np_vars[2] * np_vars[3])) + (
@@ -1637,7 +1685,7 @@ class VariableScopeWithCustomGetterTest(test.TestCase):
       with variable_scope.variable_creator_scope(creator_b):
         variable_scope.variable(1.0, name="one_name")
 
-    self.assertAllEqual(variable_names, ["forced_name"])
+    self.assertEqual(variable_names[0], "forced_name")
 
     called = [False]
 
diff --git a/tensorflow/python/kernel_tests/variables_test.py b/tensorflow/python/kernel_tests/variables_test.py
index b3eebf83168fecaf21fb3e6be7329f97dd207b52..336e9b0bca2339554339b655e2226ea35558bb00 100644
--- a/tensorflow/python/kernel_tests/variables_test.py
+++ b/tensorflow/python/kernel_tests/variables_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import operator
 
 import numpy as np
@@ -27,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_state_ops
@@ -41,6 +43,7 @@ from tensorflow.python.util import compat
 
 class VariablesTestCase(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testInitialization(self):
     with self.cached_session():
       var0 = variables.VariableV1(0.0)
@@ -58,16 +61,17 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var1.shape)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var0.eval()
+        self.evaluate(var0)
 
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        var1.eval()
+        self.evaluate(var1)
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(0.0, var0.eval())
-      self.assertAllClose(1.1, var1.eval())
+      self.assertAllClose(0.0, self.evaluate(var0))
+      self.assertAllClose(1.1, self.evaluate(var1))
 
+  @test_util.run_v1_only("b/120545219")
   def testInitializationOrder(self):
     with self.cached_session():
       rnd = variables.Variable(random_ops.random_uniform([3, 6]), name="rnd")
@@ -94,8 +98,9 @@ class VariablesTestCase(test.TestCase):
 
       variables.global_variables_initializer().run()
 
-      self.assertAllClose(rnd.eval(), dep.eval())
-      self.assertAllClose(rnd.eval() + dep.eval() + 2.0, depdep.eval())
+      self.assertAllClose(rnd.eval(), self.evaluate(dep))
+      self.assertAllClose(rnd.eval() + self.evaluate(dep) + 2.0,
+                          self.evaluate(depdep))
 
   def testIterable(self):
     with self.assertRaisesRegexp(TypeError, "not iterable"):
@@ -105,6 +110,7 @@ class VariablesTestCase(test.TestCase):
       for _ in variables.Variable([0.0, 1.0]):
         pass
 
+  @test_util.run_deprecated_v1
   def testAssignments(self):
     with self.cached_session():
       var = variables.Variable(0.0)
@@ -112,17 +118,18 @@ class VariablesTestCase(test.TestCase):
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      self.assertAllClose(1.0, plus_one.eval())
-      self.assertAllClose(1.0, var.eval())
+      self.assertAllClose(1.0, self.evaluate(plus_one))
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      self.assertAllClose(-1.0, minus_one.eval())
-      self.assertAllClose(-1.0, var.eval())
+      self.assertAllClose(-1.0, self.evaluate(minus_one))
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      self.assertAllClose(4.0, four.eval())
-      self.assertAllClose(4.0, var.eval())
+      self.assertAllClose(4.0, self.evaluate(four))
+      self.assertAllClose(4.0, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testResourceAssignments(self):
     with self.session(use_gpu=True):
       var = resource_variable_ops.ResourceVariable(0.0)
@@ -130,16 +137,16 @@ class VariablesTestCase(test.TestCase):
       minus_one = var.assign_sub(2.0)
       four = var.assign(4.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(0.0, var.eval())
+      self.assertAllClose(0.0, self.evaluate(var))
 
-      plus_one.eval()
-      self.assertAllClose(1.0, var.eval())
+      self.evaluate(plus_one)
+      self.assertAllClose(1.0, self.evaluate(var))
 
-      minus_one.eval()
-      self.assertAllClose(-1.0, var.eval())
+      self.evaluate(minus_one)
+      self.assertAllClose(-1.0, self.evaluate(var))
 
-      four.eval()
-      self.assertAllClose(4.0, var.eval())
+      self.evaluate(four)
+      self.assertAllClose(4.0, self.evaluate(var))
 
   def testZeroSizeStringAssign(self):
     with self.cached_session() as sess:
@@ -148,10 +155,10 @@ class VariablesTestCase(test.TestCase):
           name="foo",
           trainable=False,
           collections=[ops.GraphKeys.LOCAL_VARIABLES])
-      sess.run(variables.local_variables_initializer())
+      self.evaluate(variables.local_variables_initializer())
       old_value = array.value()
       copy_op = array.assign(old_value)
-      self.assertEqual([], list(sess.run(copy_op)))
+      self.assertEqual([], list(self.evaluate(copy_op)))
 
   def _countUpToTest(self, dtype):
     with self.cached_session():
@@ -160,31 +167,34 @@ class VariablesTestCase(test.TestCase):
       count_up_to = var.count_up_to(3)
 
       variables.global_variables_initializer().run()
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
-      self.assertEqual(0, count_up_to.eval())
-      self.assertEqual(1, var.eval())
+      self.assertEqual(0, self.evaluate(count_up_to))
+      self.assertEqual(1, self.evaluate(var))
 
-      self.assertEqual(1, count_up_to.eval())
-      self.assertEqual(2, var.eval())
+      self.assertEqual(1, self.evaluate(count_up_to))
+      self.assertEqual(2, self.evaluate(var))
 
-      self.assertEqual(2, count_up_to.eval())
-      self.assertEqual(3, var.eval())
+      self.assertEqual(2, self.evaluate(count_up_to))
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
       with self.assertRaisesOpError("Reached limit of 3"):
-        count_up_to.eval()
-      self.assertEqual(3, var.eval())
+        self.evaluate(count_up_to)
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testCountUpToInt32(self):
     self._countUpToTest(dtypes.int32)
 
+  @test_util.run_deprecated_v1
   def testCountUpToInt64(self):
     self._countUpToTest(dtypes.int64)
 
+  @test_util.run_v1_only("b/120545219")
   def testControlDepsNone(self):
     with self.cached_session():
       c = constant_op.constant(1.0)
@@ -198,6 +208,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([], var_x.value().op.control_inputs)
       self.assertEqual([], var_x._ref().op.control_inputs)  # pylint: disable=protected-access
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlow(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -220,20 +231,21 @@ class VariablesTestCase(test.TestCase):
       v2 = var_dict["v2"]
       # We should be able to initialize and run v1 and v2 without initializing
       # v0, even if the variable was created with a control dep on v0.
-      sess.run(v1.initializer)
-      self.assertEqual([1], sess.run(v1))
-      sess.run(v2.initializer)
-      self.assertEqual([2], sess.run(v2))
+      self.evaluate(v1.initializer)
+      self.assertEqual([1], self.evaluate(v1))
+      self.evaluate(v2.initializer)
+      self.assertEqual([2], self.evaluate(v2))
       # v0 should still be uninitialized.
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
-        sess.run(v0)
+        self.evaluate(v0)
       # We should not be able to run 'add' yet.
       with self.assertRaisesRegexp(errors_impl.OpError, "uninitialized"):
-        sess.run(add)
+        self.evaluate(add)
       # If we initialize v0 we should be able to run 'add'.
-      sess.run(v0.initializer)
-      sess.run(add)
+      self.evaluate(v0.initializer)
+      self.evaluate(add)
 
+  @test_util.run_v1_only("b/120545219")
   def testControlFlowInitialization(self):
     """Expects an error if an initializer is in a control-flow scope."""
     def cond(i, _):
@@ -247,15 +259,17 @@ class VariablesTestCase(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "inside a control-flow"):
       control_flow_ops.while_loop(cond, body, [0, 0])
 
+  @test_util.run_deprecated_v1
   def testUseVariableAsTensor(self):
     with self.cached_session():
       var_x = variables.Variable(2.0)
       var_y = variables.Variable(3.0)
       variables.global_variables_initializer().run()
-      self.assertAllClose(2.0, var_x.eval())
-      self.assertAllClose(3.0, var_y.eval())
+      self.assertAllClose(2.0, self.evaluate(var_x))
+      self.assertAllClose(3.0, self.evaluate(var_y))
       self.assertAllClose(5.0, math_ops.add(var_x, var_y).eval())
 
+  @test_util.run_deprecated_v1
   def testZeroSizeVarSameAsConst(self):
     with self.cached_session():
       zero_size_var = variables.Variable(array_ops.zeros([0, 2]))
@@ -264,10 +278,11 @@ class VariablesTestCase(test.TestCase):
       const_mul = math_ops.matmul(
           zero_size_const, zero_size_const, transpose_b=True)
       variables.global_variables_initializer().run()
-      variable_output = variable_mul.eval()
+      variable_output = self.evaluate(variable_mul)
       self.assertAllClose(const_mul.eval(), variable_output)
       self.assertAllClose([[0., 0.], [0., 0.]], variable_output)
 
+  @test_util.run_deprecated_v1
   def testCachingDevice(self):
     with self.cached_session():
       var = variables.Variable(2.0)
@@ -278,6 +293,7 @@ class VariablesTestCase(test.TestCase):
       self.assertFalse(var_cached.device.startswith("/job:foo"))
       self.assertTrue(var_cached.value().device.startswith("/job:foo"))
 
+  @test_util.run_deprecated_v1
   def testCollections(self):
     with self.cached_session():
       var_x = variables.VariableV1(2.0)
@@ -293,6 +309,7 @@ class VariablesTestCase(test.TestCase):
                        variables.global_variables())
       self.assertEqual([var_x, var_z, var_t], variables.trainable_variables())
 
+  @test_util.run_deprecated_v1
   def testCollectionsWithScope(self):
     with self.cached_session():
       with ops.name_scope("scope_1"):
@@ -308,6 +325,13 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual([var_x], variables.trainable_variables("scope_1"))
       self.assertEqual([var_y], variables.trainable_variables("scope_2"))
 
+  def testOperatorWrapping(self):
+    for attr in functools.WRAPPER_ASSIGNMENTS:
+      self.assertEqual(
+          getattr(variables.Variable.__add__, attr),
+          getattr(ops.Tensor.__add__, attr))
+
+  @test_util.run_deprecated_v1
   def testOperators(self):
     with self.cached_session():
       var_f = variables.Variable([2.0])
@@ -349,54 +373,46 @@ class VariablesTestCase(test.TestCase):
       rmatmul = var_m.__rmatmul__([[10.0], [20.0]])
 
       variables.global_variables_initializer().run()
-      self.assertAllClose([2.0], add.eval())
-      self.assertAllClose([3.0], radd.eval())
-      self.assertAllClose([1.0], sub.eval())
-      self.assertAllClose([-1.0], rsub.eval())
-      self.assertAllClose([20.0], mul.eval())
-      self.assertAllClose([20.0], rmul.eval())
-      self.assertAllClose([0.2], div.eval())
-      self.assertAllClose([5.0], rdiv.eval())
-      self.assertAllClose([-2.0], neg.eval())
-      self.assertAllClose([2.0], abs_v.eval())
-      self.assertAllClose([True], lt.eval())
-      self.assertAllClose([False], rlt.eval())
-      self.assertAllClose([True], le.eval())
-      self.assertAllClose([True], rle.eval())
-      self.assertAllClose([False], gt.eval())
-      self.assertAllClose([True], rgt.eval())
-      self.assertAllClose([True], ge.eval())
-      self.assertAllClose([True], rge.eval())
-
-      self.assertAllClose([6], mod.eval())
-      self.assertAllClose([3], rmod.eval())
-
-      self.assertAllClose([True, False], and_v.eval())
-      self.assertAllClose([True, True], or_v.eval())
-      self.assertAllClose([True, False], xor_v.eval())
-      self.assertAllClose([False, True], invert_v.eval())
-
-      self.assertAllClose(rnd[2, 0:0], slice_v.eval())
-
-      self.assertAllClose([[80.0]], matmul.eval())
-      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], rmatmul.eval())
-
+      self.assertAllClose([2.0], self.evaluate(add))
+      self.assertAllClose([3.0], self.evaluate(radd))
+      self.assertAllClose([1.0], self.evaluate(sub))
+      self.assertAllClose([-1.0], self.evaluate(rsub))
+      self.assertAllClose([20.0], self.evaluate(mul))
+      self.assertAllClose([20.0], self.evaluate(rmul))
+      self.assertAllClose([0.2], self.evaluate(div))
+      self.assertAllClose([5.0], self.evaluate(rdiv))
+      self.assertAllClose([-2.0], self.evaluate(neg))
+      self.assertAllClose([2.0], self.evaluate(abs_v))
+      self.assertAllClose([True], self.evaluate(lt))
+      self.assertAllClose([False], self.evaluate(rlt))
+      self.assertAllClose([True], self.evaluate(le))
+      self.assertAllClose([True], self.evaluate(rle))
+      self.assertAllClose([False], self.evaluate(gt))
+      self.assertAllClose([True], self.evaluate(rgt))
+      self.assertAllClose([True], self.evaluate(ge))
+      self.assertAllClose([True], self.evaluate(rge))
+
+      self.assertAllClose([6], self.evaluate(mod))
+      self.assertAllClose([3], self.evaluate(rmod))
+
+      self.assertAllClose([True, False], self.evaluate(and_v))
+      self.assertAllClose([True, True], self.evaluate(or_v))
+      self.assertAllClose([True, False], self.evaluate(xor_v))
+      self.assertAllClose([False, True], self.evaluate(invert_v))
+
+      self.assertAllClose(rnd[2, 0:0], self.evaluate(slice_v))
+
+      self.assertAllClose([[80.0]], self.evaluate(matmul))
+      self.assertAllClose([[20.0, 30.0], [40.0, 60.0]], self.evaluate(rmatmul))
+
+  @test_util.run_deprecated_v1
   def testSession(self):
     with self.cached_session() as sess:
       var = variables.Variable([1, 12])
       variables.global_variables_initializer().run()
-      self.assertAllClose([1, 12], sess.run(var))
-
-  def testDevicePlacement(self):
-    with self.cached_session() as sess:
-      with ops.device("/cpu:0"):
-        var = variables.Variable([1, 12])
-      init_value = var.initialized_value()
-      init_op = variables.global_variables_initializer()
-      self.assertEqual(var.op.device, init_value.device)
-      self.assertEqual(var.op.device, init_op.device)
-      sess.run(init_op)
+      self.assertAllClose([1, 12], self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testColocation(self):
     with ops.device("/job:ps"):
       var = variables.VariableV1(0, name="v")
@@ -405,6 +421,7 @@ class VariablesTestCase(test.TestCase):
     self.assertDeviceEqual("/job:ps", assign_op.device)
     self.assertEqual([b"loc:@v"], assign_op.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testInitializerFunction(self):
     value = [[-42], [133.7]]
     shape = [2, 1]
@@ -416,7 +433,7 @@ class VariablesTestCase(test.TestCase):
       self.assertEqual(shape, v1.shape)
       self.assertAllClose(value, v1.initial_value.eval())
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v1.eval()
+        self.evaluate(v1)
 
       v2 = variables.Variable(
           math_ops.negative(v1.initialized_value()), dtype=dtypes.float32)
@@ -425,9 +442,9 @@ class VariablesTestCase(test.TestCase):
       self.assertAllClose(np.negative(value), v2.initial_value.eval())
 
       with self.assertRaises(errors_impl.FailedPreconditionError):
-        v2.eval()
+        self.evaluate(v2)
       variables.global_variables_initializer().run()
-      self.assertAllClose(np.negative(value), v2.eval())
+      self.assertAllClose(np.negative(value), self.evaluate(v2))
 
   def testConstraintArg(self):
     constraint = lambda x: x
@@ -442,6 +459,7 @@ class VariablesTestCase(test.TestCase):
           lambda: constant_op.constant(1.),
           constraint=constraint)
 
+  @test_util.run_v1_only("b/120545219")
   def testNoRefDataRace(self):
     with self.cached_session():
       a = variables.Variable([1, 2, 3], dtype=dtypes.float32)
@@ -452,6 +470,7 @@ class VariablesTestCase(test.TestCase):
       self.assertAllEqual(b.eval(), [3, 4, 5])
       self.assertAllEqual(c.eval(), [5, 6, 7])
 
+  @test_util.run_deprecated_v1
   def testInitializerFunctionDevicePlacement(self):
     with self.cached_session():
       initializer = lambda: constant_op.constant(42.0)
@@ -470,6 +489,7 @@ class VariablesTestCase(test.TestCase):
       for i in v2.initializer.inputs:
         self.assertEqual(expected_group_v2, i.op.colocation_groups())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableDefInitializedInstances(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v_def = variables.Variable(
@@ -478,11 +498,11 @@ class VariablesTestCase(test.TestCase):
     with ops.Graph().as_default(), self.cached_session() as sess:
       # v describes a VariableDef-based variable without an initial value.
       v = variables.Variable(variable_def=v_def)
-      self.assertEqual(3.0, sess.run(v.initialized_value()))
+      self.assertEqual(3.0, self.evaluate(v.initialized_value()))
 
       # initialized_value should not rerun the initializer_op if the variable
       # has already been initialized elsewhere.
-      sess.run(v.assign(1.0))
+      self.evaluate(v.assign(1.0))
       self.assertEqual(1.0, v.initialized_value().eval())
 
     v_def.ClearField("initial_value_name")
@@ -494,7 +514,7 @@ class VariablesTestCase(test.TestCase):
       self.assertProtoEquals(v_def, v.to_proto())
       # But attempts to use initialized_value will result in errors.
       with self.assertRaises(ValueError):
-        sess.run(v.initialized_value())
+        self.evaluate(v.initialized_value())
 
   def testTrainableInProto(self):
     with ops.Graph().as_default():
@@ -513,14 +533,16 @@ class VariablesTestCase(test.TestCase):
           variables.Variable(variable_def=trainable_variable.to_proto())
           .trainable)
 
+  @test_util.run_deprecated_v1
   def testLoad(self):
     with self.cached_session():
       var = variables.Variable(np.zeros((5, 5), np.float32))
       variables.global_variables_initializer().run()
       var.load(np.ones((5, 5), np.float32))
 
-      self.assertAllClose(np.ones((5, 5), np.float32), var.eval())
+      self.assertAllClose(np.ones((5, 5), np.float32), self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testRepr(self):
     var = variables.VariableV1(np.zeros((5, 5), np.float32), name="noop")
     self.assertEqual(
@@ -542,7 +564,7 @@ class IsInitializedTest(test.TestCase):
   def testNoVars(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       uninited = variables.report_uninitialized_variables()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testAssertVariablesInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -550,27 +572,28 @@ class IsInitializedTest(test.TestCase):
       w = variables.Variable([3, 4], name="w")
       _ = v, w
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
       variables.global_variables_initializer().run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2], name="v")
       w = variables.VariableV1([3, 4], name="w")
       uninited = variables.report_uninitialized_variables()
-      self.assertAllEqual(np.array([b"v", b"w"]), sess.run(uninited))
-      sess.run(w.initializer)
-      self.assertAllEqual(np.array([b"v"]), sess.run(uninited))
+      self.assertAllEqual(np.array([b"v", b"w"]), self.evaluate(uninited))
+      self.evaluate(w.initializer)
+      self.assertAllEqual(np.array([b"v"]), self.evaluate(uninited))
       v.initializer.run()
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testZeroSizeVarInitialized(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.Variable(array_ops.zeros([0, 2]), name="v")
       uninited = variables.report_uninitialized_variables()
       v.initializer.run()  # not strictly necessary
-      self.assertEqual(0, sess.run(uninited).size)
+      self.assertEqual(0, self.evaluate(uninited).size)
 
   def testTrainingWithZeroSizeVar(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
@@ -581,8 +604,8 @@ class IsInitializedTest(test.TestCase):
       variables.global_variables_initializer().run()
       do_opt = gradient_descent.GradientDescentOptimizer(0.1).minimize(
           objective)
-      sess.run([do_opt])
-      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], b.eval())
+      self.evaluate([do_opt])
+      self.assertAllClose([[0.9, 0.9], [0.9, 0.9]], self.evaluate(b))
 
 
 class ObsoleteIsInitializedTest(test.TestCase):
@@ -591,6 +614,7 @@ class ObsoleteIsInitializedTest(test.TestCase):
     with ops.Graph().as_default():
       self.assertEqual(None, variables.assert_variables_initialized())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -598,10 +622,11 @@ class ObsoleteIsInitializedTest(test.TestCase):
       _ = v, w
       inited = variables.assert_variables_initialized()
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
-        sess.run(inited)
+        self.evaluate(inited)
       variables.global_variables_initializer().run()
-      sess.run(inited)
+      self.evaluate(inited)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableList(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
       v = variables.VariableV1([1, 2])
@@ -609,7 +634,7 @@ class ObsoleteIsInitializedTest(test.TestCase):
       inited = variables.assert_variables_initialized([v])
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
-      sess.run(w.initializer)
+      self.evaluate(w.initializer)
       with self.assertRaisesOpError("Attempting to use uninitialized value"):
         inited.op.run()
       v.initializer.run()
@@ -744,34 +769,34 @@ class PartitionedVariableTest(test.TestCase):
       variables.global_variables_initializer().run()
 
       self.assertEqual([1.0], plus_delta[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([3.0], plus_delta[1].eval())
-      self.assertEqual([3.0], v1.eval())
+      self.assertEqual([3.0], self.evaluate(v1))
 
       self.assertEqual([-2.0], minus_delta[0].eval())
-      self.assertEqual([-2.0], v0.eval())
+      self.assertEqual([-2.0], self.evaluate(v0))
       self.assertEqual([-1.0], minus_delta[1].eval())
-      self.assertEqual([-1.0], v1.eval())
+      self.assertEqual([-1.0], self.evaluate(v1))
 
       self.assertEqual([1.0], assign_ones[0].eval())
-      self.assertEqual([1.0], v0.eval())
+      self.assertEqual([1.0], self.evaluate(v0))
       self.assertEqual([1.0], assign_ones[1].eval())
-      self.assertEqual([1.0], v1.eval())
+      self.assertEqual([1.0], self.evaluate(v1))
 
       self.assertEqual([2.0], assign_list[0].eval())
-      self.assertEqual([2.0], v2.eval())
+      self.assertEqual([2.0], self.evaluate(v2))
       self.assertEqual([3.0], assign_list[1].eval())
-      self.assertEqual([3.0], v3.eval())
+      self.assertEqual([3.0], self.evaluate(v3))
 
       self.assertEqual([3.0], assign_part_value[0].eval())
-      self.assertEqual([3.0], v2.eval())
+      self.assertEqual([3.0], self.evaluate(v2))
       self.assertEqual([4.0], assign_part_value[1].eval())
-      self.assertEqual([4.0], v3.eval())
+      self.assertEqual([4.0], self.evaluate(v3))
 
       self.assertEqual([2.0], assign_part_var[0].eval())
-      self.assertEqual([2.0], v2.eval())
+      self.assertEqual([2.0], self.evaluate(v2))
       self.assertEqual([3.0], assign_part_var[1].eval())
-      self.assertEqual([3.0], v3.eval())
+      self.assertEqual([3.0], self.evaluate(v3))
 
 
 class VariableContainerTest(test.TestCase):
diff --git a/tensorflow/python/kernel_tests/weights_broadcast_test.py b/tensorflow/python/kernel_tests/weights_broadcast_test.py
index 85f9abc69f78b048c78d4d0ab908371e7a8650d3..677d8f2f22f0e2877553d3698ef02a6902986727 100644
--- a/tensorflow/python/kernel_tests/weights_broadcast_test.py
+++ b/tensorflow/python/kernel_tests/weights_broadcast_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import dtypes as dtypes_lib
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import test
@@ -51,40 +52,48 @@ class AssertBroadcastableTest(test.TestCase):
           values_placeholder: values,
       })
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     self._test_valid(weights=5, values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1x1(self):
     self._test_valid(
         weights=np.asarray((5,)).reshape((1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1xN(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11, 3)).reshape((1, 1, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1(self):
     self._test_valid(
         weights=np.asarray((5, 11)).reshape((1, 2, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNxN(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1(self):
     self._test_valid(
         weights=np.asarray((5, 7, 11)).reshape((3, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1xN(self):
     self._test_valid(
         weights=np.asarray((
             5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNxNxN(self):
     self._test_valid(
         weights=np.asarray((
@@ -107,29 +116,35 @@ class AssertBroadcastableTest(test.TestCase):
             values_placeholder: values,
         })
 
+  @test_util.run_deprecated_v1
   def testInvalid1(self):
     self._test_invalid(weights=np.asarray((5,)), values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalid1x1(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12)).reshape((3, 2)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12, 7, 5)).reshape((2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidOnesExtraDim(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -137,6 +152,7 @@ class AssertBroadcastableTest(test.TestCase):
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -158,24 +174,27 @@ class BroadcastWeightsTest(test.TestCase):
     dynamic_op = weights_broadcast_ops.broadcast_weights(
         weights=weights_placeholder, values=values_placeholder)
     with self.cached_session():
-      self.assertAllEqual(expected, static_op.eval())
+      self.assertAllEqual(expected, self.evaluate(static_op))
       self.assertAllEqual(expected, dynamic_op.eval(feed_dict={
           weights_placeholder: weights,
           values_placeholder: values,
       }))
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     self._test_valid(
         weights=5,
         values=_test_values((3, 2, 4)),
         expected=5 * np.ones((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1x1(self):
     self._test_valid(
         weights=np.asarray((5,)).reshape((1, 1, 1)),
         values=_test_values((3, 2, 4)),
         expected=5 * np.ones((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def test1x1xN(self):
     weights = np.asarray((5, 7, 11, 3)).reshape((1, 1, 4))
     self._test_valid(
@@ -183,6 +202,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def test1xNx1(self):
     weights = np.asarray((5, 11)).reshape((1, 2, 1))
     self._test_valid(
@@ -190,6 +210,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 1, 4)))
 
+  @test_util.run_deprecated_v1
   def test1xNxN(self):
     weights = np.asarray((5, 7, 11, 3, 2, 13, 7, 5)).reshape((1, 2, 4))
     self._test_valid(
@@ -197,6 +218,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(3, 1, 1)))
 
+  @test_util.run_deprecated_v1
   def testNx1x1(self):
     weights = np.asarray((5, 7, 11)).reshape((3, 1, 1))
     self._test_valid(
@@ -204,6 +226,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(1, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testNx1xN(self):
     weights = np.asarray((
         5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3)).reshape((3, 1, 4))
@@ -212,6 +235,7 @@ class BroadcastWeightsTest(test.TestCase):
         values=_test_values((3, 2, 4)),
         expected=np.tile(weights, reps=(1, 2, 1)))
 
+  @test_util.run_deprecated_v1
   def testNxNxN(self):
     weights = np.asarray((
         5, 7, 11, 3, 2, 12, 7, 5, 2, 17, 11, 3,
@@ -234,29 +258,35 @@ class BroadcastWeightsTest(test.TestCase):
             values_placeholder: values,
         })
 
+  @test_util.run_deprecated_v1
   def testInvalid1(self):
     self._test_invalid(weights=np.asarray((5,)), values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalid1x1(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12)).reshape((3, 2)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatch(self):
     self._test_invalid(
         weights=np.asarray((5, 7, 11, 3, 2, 12, 7, 5)).reshape((2, 4)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidOnesExtraDim(self):
     self._test_invalid(
         weights=np.asarray((5,)).reshape((1, 1, 1, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidPrefixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
@@ -264,6 +294,7 @@ class BroadcastWeightsTest(test.TestCase):
             2, 17, 11, 3, 5, 7, 11, 3, 2, 12, 7, 5)).reshape((3, 2, 4, 1)),
         values=_test_values((3, 2, 4)))
 
+  @test_util.run_deprecated_v1
   def testInvalidSuffixMatchExtraDim(self):
     self._test_invalid(
         weights=np.asarray((
diff --git a/tensorflow/python/kernel_tests/where_op_test.py b/tensorflow/python/kernel_tests/where_op_test.py
index fca45c3ece41a50d48583ef16baca823d4607602..56c1390411324acf1cf4ff36c30f1c473e1df95c 100644
--- a/tensorflow/python/kernel_tests/where_op_test.py
+++ b/tensorflow/python/kernel_tests/where_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -41,11 +42,11 @@ class WhereOpTest(test.TestCase):
       ans = array_ops.where(x)
       self.assertEqual([None, x.ndim], ans.get_shape().as_list())
       if expected_err_re is None:
-        tf_ans = ans.eval()
+        tf_ans = self.evaluate(ans)
         self.assertAllClose(tf_ans, truth, atol=1e-10)
       else:
         with self.assertRaisesOpError(expected_err_re):
-          ans.eval()
+          self.evaluate(ans)
 
   def testWrongNumbers(self):
     with self.session(use_gpu=True):
@@ -54,6 +55,7 @@ class WhereOpTest(test.TestCase):
       with self.assertRaises(ValueError):
         array_ops.where([False, True], None, [1, 2])
 
+  @test_util.run_deprecated_v1
   def testBasicVec(self):
     x = np.asarray([True, False])
     truth = np.asarray([[0]], dtype=np.int64)
@@ -67,11 +69,13 @@ class WhereOpTest(test.TestCase):
     truth = np.asarray([[2], [4]], dtype=np.int64)
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testRandomVec(self):
     x = np.random.rand(1000000) > 0.5
     truth = np.vstack([np.where(x)[0].astype(np.int64)]).T
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testBasicMat(self):
     x = np.asarray([[True, False], [True, False]])
 
@@ -80,6 +84,7 @@ class WhereOpTest(test.TestCase):
 
     self._testWhere(x, truth)
 
+  @test_util.run_deprecated_v1
   def testBasic3Tensor(self):
     x = np.asarray([[[True, False], [True, False]],
                     [[False, True], [False, True]],
@@ -99,36 +104,47 @@ class WhereOpTest(test.TestCase):
     truth = np.vstack(truth).T  # Convert to [num_true, indices].
     self._testWhere(x, truth, expected_err_re)
 
+  @test_util.run_deprecated_v1
   def testRandomBool(self):
     self._testRandom(np.bool)
 
+  @test_util.run_deprecated_v1
   def testRandomInt32(self):
     self._testRandom(np.int32)
 
+  @test_util.run_deprecated_v1
   def testRandomInt64(self):
     self._testRandom(np.int64)
 
+  @test_util.run_deprecated_v1
   def testRandomFloat(self):
     self._testRandom(np.float32)
 
+  @test_util.run_deprecated_v1
   def testRandomDouble(self):
     self._testRandom(np.float64)
 
+  @test_util.run_deprecated_v1
   def testRandomComplex64(self):
     self._testRandom(np.complex64)
 
+  @test_util.run_deprecated_v1
   def testRandomComplex128(self):
     self._testRandom(np.complex128)
 
+  @test_util.run_deprecated_v1
   def testRandomUint8(self):
     self._testRandom(np.uint8)
 
+  @test_util.run_deprecated_v1
   def testRandomInt8(self):
     self._testRandom(np.int8)
 
+  @test_util.run_deprecated_v1
   def testRandomInt16(self):
     self._testRandom(np.int16)
 
+  @test_util.run_deprecated_v1
   def testThreeArgument(self):
     x = np.array([[-2, 3, -1], [1, -3, -3]])
     np_val = np.where(x > 0, x * x, -x)
@@ -136,6 +152,7 @@ class WhereOpTest(test.TestCase):
       tf_val = array_ops.where(constant_op.constant(x) > 0, x * x, -x).eval()
     self.assertAllEqual(tf_val, np_val)
 
+  @test_util.run_deprecated_v1
   def testBatchSelect(self):
     x = np.array([[-2, 3, -1] * 64, [1, -3, -3] * 64] * 8192)  # [16384, 192]
     c_mat = np.array([[False] * 192, [True] * 192] * 8192)  # [16384, 192]
diff --git a/tensorflow/python/kernel_tests/while_v2_test.py b/tensorflow/python/kernel_tests/while_v2_test.py
index dc1bcb78b8066c83cb0d9693d7e23ce68b0463d6..cae459a34e934cc804a56f5738202377a1227274 100644
--- a/tensorflow/python/kernel_tests/while_v2_test.py
+++ b/tensorflow/python/kernel_tests/while_v2_test.py
@@ -20,14 +20,15 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.grappler import tf_optimizer
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import list_ops
@@ -42,14 +43,30 @@ from tensorflow.python.platform import test
 
 class WhileV2Test(test.TestCase, parameterized.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSingleLoopVar(self):
     x = constant_op.constant(2.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v * v, [x])
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * v, [x], return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+
+  @test_util.run_v1_only("b/120545219")
+  def testReturnSameStructureTrue(self):
+    x = constant_op.constant(2.)
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * v, [x], return_same_structure=True)
+    grad = gradients_impl.gradients(ret, [x])
+    with self.cached_session() as sess:
+      eval_result = sess.run(ret)
+      self.assertIsInstance(eval_result, list)
+      self.assertLen(eval_result, 1)
+      self.assertEqual(16., eval_result[0])
       self.assertSequenceEqual(sess.run(grad), [32.])
 
+  @test_util.run_deprecated_v1
   def testMultipleLoopVarsBasic(self):
     x = constant_op.constant(5.)
     y = constant_op.constant(3.)
@@ -58,15 +75,19 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # y = 3.
     # while x < 45.:
     #   x = x * y
-    ret = while_loop_v2(lambda v, _: v < 45., lambda v, w: (v * w, w), [x, y])
+    ret = while_loop_v2(
+        lambda v, _: v < 45.,
+        lambda v, w: (v * w, w), [x, y],
+        return_same_structure=False)
     # ret = [x*y^2, y]
 
     # Note: This is simply d_ret[0]/d_x since d_ret[1]/d_x is 0.
     grad = gradients_impl.gradients(ret, [x])  # [2*x*y]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [45., 3.])
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertSequenceEqual(self.evaluate(ret), [45., 3.])
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testMultipleLoopVars(self):
     x = constant_op.constant(5.)
     y = constant_op.constant(3.)
@@ -76,8 +97,10 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # while x < 45.:
     #   x = x * y
     #   y = x + y
-    ret = while_loop_v2(lambda v, _: v < 45., lambda v, w: (v * w, v + w),
-                        [x, y])
+    ret = while_loop_v2(
+        lambda v, _: v < 45.,
+        lambda v, w: (v * w, v + w), [x, y],
+        return_same_structure=False)
     # ret = [y*x**2 + x*y**2, x*y + x + y]
 
     gradx_0 = gradients_impl.gradients(ret[0], [x])  # [2*x*y + y**2]
@@ -87,34 +110,43 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grady_1 = gradients_impl.gradients(ret[1], [y])  # [x + 1]
     grady_2 = gradients_impl.gradients(ret, [y])  # [2*x*y + x**2 + x + 1]
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(ret), [120., 23.])
-      self.assertSequenceEqual(sess.run(gradx_0), [39.])
-      self.assertSequenceEqual(sess.run(gradx_1), [4.])
-      self.assertSequenceEqual(sess.run(gradx_2), [43.])
-      self.assertSequenceEqual(sess.run(grady_0), [55.])
-      self.assertSequenceEqual(sess.run(grady_1), [6.])
-      self.assertSequenceEqual(sess.run(grady_2), [61.])
-
+      self.assertSequenceEqual(self.evaluate(ret), [120., 23.])
+      self.assertSequenceEqual(self.evaluate(gradx_0), [39.])
+      self.assertSequenceEqual(self.evaluate(gradx_1), [4.])
+      self.assertSequenceEqual(self.evaluate(gradx_2), [43.])
+      self.assertSequenceEqual(self.evaluate(grady_0), [55.])
+      self.assertSequenceEqual(self.evaluate(grady_1), [6.])
+      self.assertSequenceEqual(self.evaluate(grady_2), [61.])
+
+  @test_util.run_deprecated_v1
   def testMultipleWhileLoops(self):
     x = constant_op.constant(2.)
-    ret1 = while_loop_v2(lambda v: v < 4., lambda v: v * v, [x])  # x**2
-    ret2 = while_loop_v2(lambda v: v < 16., lambda v: v * v, [ret1])  # x**4
+    ret1 = while_loop_v2(
+        lambda v: v < 4., lambda v: v * v, [x],
+        return_same_structure=False)  # x**2
+    ret2 = while_loop_v2(
+        lambda v: v < 16., lambda v: v * v, [ret1],
+        return_same_structure=False)  # x**4
     grad = gradients_impl.gradients(ret2, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
+  @test_util.run_deprecated_v1
   def testDoubleDerivative(self):
     x = constant_op.constant(2.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v**2, [x])  # x**4
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v**2, [x],
+        return_same_structure=False)  # x**4
     grad = gradients_impl.gradients(ret, [x])  # 4x**3
     grad_grad = gradients_impl.gradients(grad, [x])  # 12x**2
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
-      self.assertSequenceEqual(sess.run(grad_grad), [48.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad_grad), [48.])
 
+  @test_util.run_v1_only("b/120545219")
   def testPruning(self):
     x = constant_op.constant(1)
 
@@ -135,10 +167,12 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
 
     def GetOptimizedGraph():
       mg = meta_graph.create_meta_graph_def(graph=ops.get_default_graph())
-      rewriter_config = rewriter_config_pb2.RewriterConfig(
-          constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
-          memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL)
-      return tf_optimizer.OptimizeGraph(rewriter_config, mg)
+      config = config_pb2.ConfigProto()
+      config.graph_options.rewrite_options.CopyFrom(
+          rewriter_config_pb2.RewriterConfig(
+              constant_folding=rewriter_config_pb2.RewriterConfig.OFF,
+              memory_optimization=rewriter_config_pb2.RewriterConfig.MANUAL))
+      return tf_optimizer.OptimizeGraph(config, mg)
 
     g = GetOptimizedGraph()
     self.assertEqual(len([n for n in g.node if n.op == "Enter"]), 1)
@@ -148,24 +182,31 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     g = GetOptimizedGraph()
     self.assertEqual(len([n for n in g.node if n.op == "Enter"]), 2)
 
+  @test_util.run_deprecated_v1
   def testCaptureExternalTensorInCond(self):
     x = constant_op.constant(2.)
     y = constant_op.constant(1.)
-    ret = while_loop_v2(lambda v: v + y < 9., lambda v: v * 3., [x])
+    ret = while_loop_v2(
+        lambda v: v + y < 9.,
+        lambda v: v * 3., [x],
+        return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testCaptureExternalTensorInBody(self):
     x = constant_op.constant(2.)
     y = constant_op.constant(3.)
-    ret = while_loop_v2(lambda v: v < 8., lambda v: v * y, [x])
+    ret = while_loop_v2(
+        lambda v: v < 8., lambda v: v * y, [x], return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 18.)
-      self.assertSequenceEqual(sess.run(grad), [9.])
+      self.assertEqual(self.evaluate(ret), 18.)
+      self.assertSequenceEqual(self.evaluate(grad), [9.])
 
+  @test_util.run_deprecated_v1
   def testLoopWithTensorListPushBack(self):
     x = constant_op.constant(2.)
 
@@ -181,12 +222,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       tl = list_ops.tensor_list_push_back(tl, constant_op.constant(100.))
       return x**2., tl
 
-    ret = while_loop_v2(Cond, Body, [x, tensor_list])
+    ret = while_loop_v2(
+        Cond, Body, [x, tensor_list], return_same_structure=False)
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
+  @test_util.run_deprecated_v1
   def testDuplicateAccumulator(self):
     x = constant_op.constant(2.)
 
@@ -203,13 +246,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       tl = list_ops.tensor_list_push_back(tl, x)
       return x**2., tl
 
-    ret = while_loop_v2(Cond, Body, [x, tensor_list])
+    ret = while_loop_v2(
+        Cond, Body, [x, tensor_list], return_same_structure=False)
 
     for op in ops.get_default_graph().get_operations():
       if op.type == "While":
         while_op = op
 
-    body_graph = while_v2._get_body_graph(while_op)
+    body_graph = while_v2._get_graph(while_op, "body")
     # body_graph.inputs: [counter_arg, x_arg, tl_arg, *accumulators]
     x_input_t = body_graph.inputs[1]
     accumulator_count = len(
@@ -219,13 +263,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     grad = gradients_impl.gradients(ret[0], x)
     with self.cached_session() as sess:
       self.assertEqual(sess.run(ret[0]), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
   @parameterized.named_parameters(
       ("UnknownShape", None),
       ("PartiallyDefinedShape", [None, 2]),
       ("FullyDefinedShape", [1, 2]),
   )
+  @test_util.run_deprecated_v1
   def testAccumulatorElementShape(self, shape):
 
     def MatchShape(actual_tensor_shape):
@@ -239,19 +284,26 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         self.assertListEqual(actual_tensor_shape.as_list(), shape)
 
     def GetAccumulatorForInputAtIndex(while_op, idx):
-      body_graph = while_v2._get_body_graph(while_op)
+      body_graph = while_v2._get_graph(while_op, "body")
       y_input_t = body_graph.inputs[idx]
       push_back_node = [c for c in y_input_t.consumers()
                         if c.type == "TensorListPushBack"][0]
       output_idx = body_graph.outputs.index(push_back_node.outputs[0])
       return while_op.outputs[output_idx]
 
-    x = constant_op.constant(2.)
+    x = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
     y = array_ops.placeholder(dtype=dtypes.float32, shape=shape)
 
     # Forward pass.
-    ret = while_loop_v2(lambda v, u: v < 8., lambda v, u: (v * v, u), [x, y])
+    ret = while_loop_v2(lambda v, u: v < 8.,
+                        lambda v, u: (math_ops.pow(v, u), u),
+                        [x, y],
+                        return_same_structure=True)
     while_op = ret[0].op.inputs[0].op
+    # Gradient pass.
+    grad = gradients_impl.gradients(ret[0], x)
+    grad_while_op = grad[0].op.inputs[0].op
+
     # Get the TensorList output of While op containing the accumulated values
     # of y.
     # while_op.inputs: [counter_arg, x_arg, y_arg, *accumulators]
@@ -260,22 +312,24 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
 
-    # Gradient pass.
-    grad = gradients_impl.gradients(ret[1], y)
-    grad_while_op = grad[0].op
+    # Take second derivative to generate intermediate grad_while_op outputs
+    gradients_impl.gradients(grad, x)
+
     # Get the TensorList output of gradient While op containing the accumulated
-    # values of grad_y.
+    # values of grad_x (note that grad_x is needed by the second derivative).
     # grad_while_op.inputs:
     # [counter_arg, total_iters_arg, grad_x_arg, grad_y_arg, *other_args]
-    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 3)
+    grad_output = GetAccumulatorForInputAtIndex(grad_while_op, 2)
     _, val = list_ops.tensor_list_pop_back(grad_output,
                                            element_dtype=dtypes.float32)
     MatchShape(val.shape)
 
   def _createWhile(self, name):
     """Helper function testDefaultName."""
-    output = while_v2.while_loop(lambda i: i < 3, lambda i: i + 1,
-                                 [constant_op.constant(0)])
+    output = while_v2.while_loop(
+        lambda i: i < 3,
+        lambda i: i + 1, [constant_op.constant(0)],
+        return_same_structure=False)
     while_op = output.op.inputs[0].op
     self.assertEqual(while_op.type, "While")
     return while_op
@@ -305,19 +359,18 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         self.assertRegexpMatches(
             while2_op.get_attr("body").name, r"foo_while_1_body_\d*")
 
+  @test_util.enable_control_flow_v2
+  @test_util.run_deprecated_v1
   def testWhileAndTensorArray(self):
-    old_enable_while_v2 = control_flow_ops.ENABLE_WHILE_V2
-    control_flow_ops.ENABLE_WHILE_V2 = True
-    with self.cached_session() as sess:
-      param = constant_op.constant(2.0)
-      y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
-      # map_fn uses TensorArray internally.
-      r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
-      self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], sess.run(r))
-      r = gradients_impl.gradients(r, param)[0]
-      self.assertAllClose(21.0, sess.run(r))
-    control_flow_ops.ENABLE_WHILE_V2 = old_enable_while_v2
-
+    param = constant_op.constant(2.0)
+    y0 = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="elems")
+    # map_fn uses TensorArray internally.
+    r = functional_ops.map_fn(lambda x: math_ops.multiply(x, param), y0)
+    grad = gradients_impl.gradients(r, param)[0]
+    self.assertAllClose([2.0, 4.0, 6.0, 8.0, 10.0, 12.0], self.evaluate(r))
+    self.assertAllClose(21.0, self.evaluate(grad))
+
+  @test_util.run_deprecated_v1
   def testNestedWhile(self):
     # Compute sum of geometric progression: n^0 + n^1 + ... + n^m
     # We compute the pow using a while loop.
@@ -328,14 +381,20 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     def Body(i, previous_sum):
       prod = constant_op.constant(1.)
       return i - 1., previous_sum + while_loop_v2(
-          lambda c, _: c > 0, lambda c, v: (c - 1., v * n), [i, prod])[1]
-
-    result = while_loop_v2(lambda i, _: i >= 0, Body, [m, sum_of_powers])[1]
+          lambda c, _: c > 0,
+          lambda c, v: (c - 1., v * n), [i, prod],
+          return_same_structure=False)[1]
+
+    result = while_loop_v2(
+        lambda i, _: i >= 0,
+        Body, [m, sum_of_powers],
+        return_same_structure=False)[1]
     grad = gradients_impl.gradients(result, [n])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(result), 364.)
-      self.assertSequenceEqual(sess.run(grad), [547.])
+      self.assertEqual(self.evaluate(result), 364.)
+      self.assertSequenceEqual(self.evaluate(grad), [547.])
 
+  @test_util.run_deprecated_v1
   def testIdentityNodeInBody(self):
 
     def Body(v):
@@ -344,12 +403,14 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
       return v * v
 
     x = constant_op.constant(2.)
-    ret = while_loop_v2(lambda v: v < 8., Body, [x])
+    ret = while_loop_v2(
+        lambda v: v < 8., Body, [x], return_same_structure=False)
     grad = gradients_impl.gradients(ret, [x])
     with self.cached_session() as sess:
-      self.assertEqual(sess.run(ret), 16.)
-      self.assertSequenceEqual(sess.run(grad), [32.])
+      self.assertEqual(self.evaluate(ret), 16.)
+      self.assertSequenceEqual(self.evaluate(grad), [32.])
 
+  @test_util.run_deprecated_v1
   def testNestedWhileAndTensorArray(self):
     n = constant_op.constant(3.0)
 
@@ -362,13 +423,17 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
         return row, col + 1., ta, n
 
       # TODO(b/118457764): Remove n from loop_vars from both loops once fixed.
-      ta = while_loop_v2(lambda _, col, _1, n: col <= n, InnerBody,
-                         [row, constant_op.constant(1.), ta, n])[2]
+      ta = while_loop_v2(
+          lambda _, col, _1, n: col <= n,
+          InnerBody, [row, constant_op.constant(1.), ta, n],
+          return_same_structure=False)[2]
       return row + 1., ta, n
 
     ta = tensor_array_ops.TensorArray(dtype=dtypes.float32, size=9)
-    ta = while_loop_v2(lambda row, _, _1: row <= n, Body,
-                       [constant_op.constant(1.), ta, n])[1]
+    ta = while_loop_v2(
+        lambda row, _, _1: row <= n,
+        Body, [constant_op.constant(1.), ta, n],
+        return_same_structure=False)[1]
 
     output = array_ops.reshape(ta.stack(), [3, 3])
     self.assertAllEqual(
@@ -377,6 +442,26 @@ class WhileV2Test(test.TestCase, parameterized.TestCase):
     # grad = gradients_impl.gradients(output, [n])
     # self.assertEqual(self.evaluate(grad), 3.5)
 
+  @test_util.run_deprecated_v1
+  def testForwardPassRewrite(self):
+    x = constant_op.constant(1.0, name="x")
+    output = while_v2.while_loop(lambda x: x < 10.0,
+                                 lambda x: x * 2.0,
+                                 [x])[0]
+    while_op = output.op.inputs[0].op
+    self.assertEqual(while_op.type, "While")
+    # outputs = [loop_counter, x]
+    self.assertLen(while_op.outputs, 2)
+
+    gradients_impl.gradients(output, x)
+    # while_op should have been rewritten to output 2.0 intermediate.
+    # outputs = [loop_counter, x, 2.0_accumulator, x_accumulator]
+    self.assertLen(while_op.outputs, 4)
+
+    gradients_impl.gradients(output, x)
+    # Computing the gradient again shouldn't rewrite while_op again.
+    self.assertLen(while_op.outputs, 4)
+
 
 def ScalarShape():
   return ops.convert_to_tensor([], dtype=dtypes.int32)
diff --git a/tensorflow/python/kernel_tests/xent_op_test.py b/tensorflow/python/kernel_tests/xent_op_test.py
index c3c7f867a1e34efd98ca8a84e8f2d2a002b75ac9..f5d03c2370186e39cad2ba9aa29d03c454de9168 100644
--- a/tensorflow/python/kernel_tests/xent_op_test.py
+++ b/tensorflow/python/kernel_tests/xent_op_test.py
@@ -27,6 +27,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gradient_checker
@@ -56,7 +57,7 @@ class XentTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
           np_features, np_labels)
-      tf_loss, tf_backprop = sess.run([loss, backprop])
+      tf_loss, tf_backprop = self.evaluate([loss, backprop])
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
     self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
@@ -65,7 +66,7 @@ class XentTest(test.TestCase):
     with self.cached_session(use_gpu=use_gpu) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=np_labels, logits=np_features, dim=dim)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     print("np_loss:", np_loss)
     print("tf_loss:", tf_loss)
     self.assertAllCloseAccordingToType(np_loss, tf_loss)
@@ -80,7 +81,7 @@ class XentTest(test.TestCase):
         loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
             np.array([[1.], [-1.], [0.]]).astype(dtype),
             np.array([[-1.], [0.], [1.]]).astype(dtype))
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllClose([0.0, 0.0, 0.0], tf_loss)
       self.assertAllClose([[2.0], [1.0], [0.0]], tf_backprop)
 
@@ -88,6 +89,7 @@ class XentTest(test.TestCase):
     self._testSingleClass(True)
     self._testSingleClass(False)
 
+  @test_util.run_deprecated_v1
   def testRankTooLarge(self):
     for dtype in np.float16, np.float32:
       np_features = np.array([[[1., 1., 1., 1.]], [[1., 2., 3.,
@@ -148,16 +150,18 @@ class XentTest(test.TestCase):
       with self.cached_session(use_gpu=use_gpu) as sess:
         loss, backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
             tf_f, tf_l)
-        tf_loss, tf_backprop = sess.run([loss, backprop])
+        tf_loss, tf_backprop = self.evaluate([loss, backprop])
       self.assertAllCloseAccordingToType(np_loss, tf_loss)
       self.assertAllCloseAccordingToType(np_backprop, tf_backprop)
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
         gen_nn_ops.softmax_cross_entropy_with_logits(
             [[0., 1.], [2., 3.]], [[0., 1., 0.], [1., 0., 0.]])
 
+  @test_util.run_deprecated_v1
   def testNotMatrix(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -179,6 +183,7 @@ class XentTest(test.TestCase):
         np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64),
         np.array([[0., 0., 0., 1.], [0., .5, .5, 0.]]).astype(np.float64))
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     with self.cached_session() as sess:
       l = constant_op.constant(
@@ -206,6 +211,7 @@ class XentTest(test.TestCase):
     print("cross entropy gradient err = ", err)
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testGradientLabelWithV2(self):
     with self.cached_session():
       l = constant_op.constant(
@@ -224,6 +230,7 @@ class XentTest(test.TestCase):
 
     self.assertLess(err, 5e-8)
 
+  @test_util.run_deprecated_v1
   def testSecondGradient(self):
     with self.cached_session() as sess:
       l = constant_op.constant(
@@ -280,7 +287,7 @@ class XentTest(test.TestCase):
     with self.session(use_gpu=True) as sess:
       loss = nn_ops.softmax_cross_entropy_with_logits(
           labels=labels, logits=features)
-      tf_loss = sess.run(loss)
+      tf_loss = self.evaluate(loss)
     self.assertAllEqual(np_loss, tf_loss)
 
 
diff --git a/tensorflow/python/kernel_tests/zero_division_test.py b/tensorflow/python/kernel_tests/zero_division_test.py
index e68b96e670f914b0f243aa2617d378f2430fbdc2..3dd9ec4ba9459b95f690a2146c7f94ad75043d6d 100644
--- a/tensorflow/python/kernel_tests/zero_division_test.py
+++ b/tensorflow/python/kernel_tests/zero_division_test.py
@@ -21,13 +21,15 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 
 
 class ZeroDivisionTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testZeros(self):
-    with self.session(use_gpu=True):
+    with test_util.use_gpu():
       for dtype in dtypes.uint8, dtypes.int16, dtypes.int32, dtypes.int64:
         zero = constant_op.constant(0, dtype=dtype)
         one = constant_op.constant(1, dtype=dtype)
@@ -36,7 +38,7 @@ class ZeroDivisionTest(test.TestCase):
           bads.append(one % zero)
         for bad in bads:
           try:
-            result = bad.eval()
+            result = self.evaluate(bad)
           except errors_impl.OpError as e:
             # Ideally, we'd get a nice exception.  In theory, this should only
             # happen on CPU, but 32 bit integer GPU division is actually on
diff --git a/tensorflow/python/layers/base.py b/tensorflow/python/layers/base.py
index fccea484b0f8392ca4de31a3fc8a37f32433e0c1..bfe591f875556c9dbcf3001bec4fe836bca3593f 100644
--- a/tensorflow/python/layers/base.py
+++ b/tensorflow/python/layers/base.py
@@ -23,6 +23,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.keras.engine import base_layer
+from tensorflow.python.keras.engine import base_layer_utils
 from tensorflow.python.ops import variable_scope as vs
 from tensorflow.python.ops import variables as tf_variables
 from tensorflow.python.util import function_utils
@@ -30,10 +31,10 @@ from tensorflow.python.util import nest
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
 
-
+# Avoid breaking users who directly import this symbol from this file.
+# TODO(fchollet): remove this.
 InputSpec = base_layer.InputSpec  # pylint: disable=invalid-name
 
-
 _KERAS_STYLE_SCOPE = False
 
 
@@ -242,11 +243,11 @@ class Layer(base_layer.Layer):
   def _make_unique_name(self, name_uid_map=None, avoid_names=None,
                         namespace='', zero_based=False):
     base_name = base_layer.to_snake_case(self.__class__.__name__)
-    name = base_layer.unique_layer_name(base_name,
-                                        name_uid_map=name_uid_map,
-                                        avoid_names=avoid_names,
-                                        namespace=namespace,
-                                        zero_based=zero_based)
+    name = base_layer_utils.unique_layer_name(base_name,
+                                              name_uid_map=name_uid_map,
+                                              avoid_names=avoid_names,
+                                              namespace=namespace,
+                                              zero_based=zero_based)
     return (name, base_name)
 
   @property
diff --git a/tensorflow/python/layers/base_test.py b/tensorflow/python/layers/base_test.py
index 90abf35e87595adcf9917f075f4b9f26ecc820bc..d0ec4f4425f2ea92ba5699cf4ae2d81a86ea27dd 100644
--- a/tensorflow/python/layers/base_test.py
+++ b/tensorflow/python/layers/base_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.keras.engine import base_layer as keras_base_layer
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.layers import base as base_layers
 from tensorflow.python.layers import core as core_layers
 from tensorflow.python.ops import array_ops
@@ -142,6 +143,7 @@ class BaseLayerTest(test.TestCase):
           synchronization=variable_scope.VariableSynchronization.ON_READ,
           trainable=True)
 
+  @test_util.run_deprecated_v1
   def testReusePartitionedVaraiblesAndRegularizers(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     partitioner = partitioned_variables.fixed_size_partitioner(3)
@@ -251,7 +253,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(ndim=2)
+        self.input_spec = input_spec.InputSpec(ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -278,7 +280,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(min_ndim=2)
+        self.input_spec = input_spec.InputSpec(min_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -306,7 +308,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(max_ndim=2)
+        self.input_spec = input_spec.InputSpec(max_ndim=2)
 
       def call(self, inputs):
         return inputs
@@ -334,7 +336,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(dtype='float32')
+        self.input_spec = input_spec.InputSpec(dtype='float32')
 
       def call(self, inputs):
         return inputs
@@ -354,7 +356,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(axes={-1: 2})
+        self.input_spec = input_spec.InputSpec(axes={-1: 2})
 
       def call(self, inputs):
         return inputs
@@ -376,7 +378,7 @@ class BaseLayerTest(test.TestCase):
 
       def __init__(self):
         super(CustomerLayer, self).__init__()
-        self.input_spec = base_layers.InputSpec(shape=(None, 3))
+        self.input_spec = input_spec.InputSpec(shape=(None, 3))
 
       def call(self, inputs):
         return inputs
@@ -444,6 +446,7 @@ class BaseLayerTest(test.TestCase):
       self.assertTrue(isinstance(result, dict))
       self.assertEqual(set(['label', 'logits']), set(result.keys()))
 
+  @test_util.run_deprecated_v1
   def testActivityRegularizer(self):
     regularizer = math_ops.reduce_sum
     layer = base_layers.Layer(activity_regularizer=regularizer)
@@ -532,6 +535,7 @@ class BaseLayerTest(test.TestCase):
         self.assertEqual(len(layer.trainable_variables), 1)
         self.assertEqual(layer.variables[0].graph, outer_graph)
 
+  @test_util.run_deprecated_v1
   def testGetUpdateFor(self):
 
     class MyLayer(base_layers.Layer):
@@ -576,6 +580,7 @@ class BaseLayerTest(test.TestCase):
     self.assertEqual(len(layer.get_updates_for([intermediate_inputs])), 1)
     self.assertEqual(len(layer.get_updates_for([outputs])), 0)
 
+  @test_util.run_deprecated_v1
   def testGetLossesFor(self):
 
     class MyLayer(base_layers.Layer):
diff --git a/tensorflow/python/layers/convolutional_test.py b/tensorflow/python/layers/convolutional_test.py
index 257fa27156749713bd35f22f82b7cc6c81c23a70..a3e493edfeadfe6f68446616df8b81177e013921 100644
--- a/tensorflow/python/layers/convolutional_test.py
+++ b/tensorflow/python/layers/convolutional_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
@@ -59,6 +60,7 @@ class ConvTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv2d(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv2D(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -87,6 +89,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 4, height, width))
@@ -97,6 +100,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannels(self):
     images = array_ops.placeholder(dtypes.float32, (5, 7, 9, None))
     layer = conv_layers.Conv2D(32, [3, 3], activation=nn_ops.relu)
@@ -140,6 +144,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height / 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv1D(self):
     width = 7
     data = random_ops.random_uniform((5, width, 4))
@@ -156,6 +161,7 @@ class ConvTest(test.TestCase):
     output = conv_layers.conv1d(data, 32, 3, activation=nn_ops.relu)
     self.assertListEqual(output.get_shape().as_list(), [5, width - 2, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateConv1DChannelsFirst(self):
     width = 7
     data = random_ops.random_uniform((5, 4, width))
@@ -165,6 +171,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannelsConv1D(self):
     data = array_ops.placeholder(dtypes.float32, (5, 4, None))
     layer = conv_layers.Conv1D(32, 3, activation=nn_ops.relu)
@@ -180,6 +187,7 @@ class ConvTest(test.TestCase):
                                  'should be defined. Found `None`.'):
       _ = layer.apply(data)
 
+  @test_util.run_deprecated_v1
   def testCreateConv3D(self):
     depth, height, width = 6, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 4))
@@ -191,6 +199,7 @@ class ConvTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testUnknownInputChannelsConv3D(self):
     volumes = array_ops.placeholder(dtypes.float32, (5, 6, 7, 9, None))
     layer = conv_layers.Conv3D(32, [3, 3, 3], activation=nn_ops.relu)
@@ -199,6 +208,7 @@ class ConvTest(test.TestCase):
                                  'should be defined. Found `None`.'):
       _ = layer.apply(volumes)
 
+  @test_util.run_deprecated_v1
   def testConv2DKernelRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -210,6 +220,7 @@ class ConvTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -221,6 +232,7 @@ class ConvTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -247,6 +259,7 @@ class ConvTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, height - 2, 3, 32])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -255,6 +268,7 @@ class ConvTest(test.TestCase):
     conv_layers.conv2d(images, 32, [3, 3], name='conv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -265,6 +279,7 @@ class ConvTest(test.TestCase):
       conv_layers.conv2d(images, 32, [3, 3], name='conv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -276,13 +291,14 @@ class ConvTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -325,6 +341,7 @@ class ConvTest(test.TestCase):
     self.assertEqual(conv3d.kernel_constraint, k_constraint)
     self.assertEqual(conv3d.bias_constraint, b_constraint)
 
+  @test_util.run_deprecated_v1
   def testConv3DChannelsFirst(self):
     # Test case for GitHub issue 15655
     images = array_ops.placeholder(
@@ -358,6 +375,7 @@ class SeparableConv1DTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.separable_conv1d(data, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1D(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -379,6 +397,7 @@ class SeparableConv1DTest(test.TestCase):
     self.assertEqual(layer.pointwise_kernel.get_shape().as_list(), [1, 8, 32])
     self.assertEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1DChannelsFirst(self):
     length = 9
     data = random_ops.random_uniform((5, 4, length))
@@ -404,6 +423,7 @@ class SeparableConv1DTest(test.TestCase):
     output = layer.apply(data)
     self.assertEqual(output.get_shape().as_list(), [5, length // 2, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv1DWithStridesChannelsFirst(self):
     data_format = 'channels_first'
     length = 10
@@ -413,6 +433,7 @@ class SeparableConv1DTest(test.TestCase):
     output = layer.apply(data)
     self.assertEqual(output.get_shape().as_list(), [5, 32, length // 2])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DReuse(self):
     length = 10
     data = random_ops.random_uniform((5, length, 3), seed=1)
@@ -421,6 +442,7 @@ class SeparableConv1DTest(test.TestCase):
     conv_layers.separable_conv1d(data, 32, 3, name='sepconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       length = 10
@@ -431,6 +453,7 @@ class SeparableConv1DTest(test.TestCase):
       conv_layers.separable_conv1d(data, 32, 3, name='sepconv1')
       self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv1DNoReuse(self):
     length = 10
     data = random_ops.random_uniform((5, length, 3), seed=1)
@@ -439,6 +462,7 @@ class SeparableConv1DTest(test.TestCase):
     conv_layers.separable_conv1d(data, 32, 3)
     self.assertEqual(len(variables.trainable_variables()), 6)
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DDepthwiseRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -450,6 +474,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DPointwiseRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -461,6 +486,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DBiasRegularizer(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -472,6 +498,7 @@ class SeparableConv1DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv1DNoBias(self):
     length = 9
     data = random_ops.random_uniform((5, length, 4))
@@ -522,6 +549,7 @@ class SeparableConv2DTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.separable_conv2d(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv2D(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -562,6 +590,7 @@ class SeparableConv2DTest(test.TestCase):
                          [1, 1, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConv2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 4, height, width))
@@ -584,6 +613,7 @@ class SeparableConv2DTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, height, width, 64])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConvWithStrides(self):
     height, width = 6, 8
     # Test strides tuple
@@ -607,6 +637,7 @@ class SeparableConv2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height / 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testCreateSeparableConvWithStridesChannelsFirst(self):
     data_format = 'channels_first'
     height, width = 6, 8
@@ -632,6 +663,7 @@ class SeparableConv2DTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, 32, height / 2, width])
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -641,6 +673,7 @@ class SeparableConv2DTest(test.TestCase):
         images, 32, [3, 3], name='sepconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -651,6 +684,7 @@ class SeparableConv2DTest(test.TestCase):
       conv_layers.separable_conv2d(images, 32, [3, 3], name='sepconv1')
       self.assertEqual(len(variables.trainable_variables()), 3)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -663,14 +697,15 @@ class SeparableConv2DTest(test.TestCase):
         self.assertTrue('depthwise_kernel' in weights[0].name)
         self.assertTrue('pointwise_kernel' in weights[1].name)
         self.assertTrue('bias' in weights[2].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 1)))
         self.assertAllClose(weights[1], np.ones((1, 1, 3, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[2], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -679,6 +714,7 @@ class SeparableConv2DTest(test.TestCase):
     conv_layers.separable_conv2d(images, 32, [3, 3])
     self.assertEqual(len(variables.trainable_variables()), 6)
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DDepthwiseRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -690,6 +726,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DPointwiseRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -701,6 +738,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -712,6 +750,7 @@ class SeparableConv2DTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testSeparableConv2DNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -768,6 +807,7 @@ class Conv2DTransposeTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv2d_transpose(images, 32, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv2DTranspose(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -839,6 +879,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, height * 2, width, 32])
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeKernelRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -850,6 +891,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeBiasRegularizer(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -861,6 +903,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv2DTransposeNoBias(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 4))
@@ -873,6 +916,7 @@ class Conv2DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 32, 4])
     self.assertEqual(layer.bias, None)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -881,6 +925,7 @@ class Conv2DTransposeTest(test.TestCase):
     conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       height, width = 7, 9
@@ -891,6 +936,7 @@ class Conv2DTransposeTest(test.TestCase):
       conv_layers.conv2d_transpose(images, 32, [3, 3], name='deconv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -902,13 +948,14 @@ class Conv2DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 32, 3)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((32)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv2DTransposeNoReuse(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, height, width, 3), seed=1)
@@ -955,6 +1002,7 @@ class Conv3DTransposeTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, 'kernel_size'):
       conv_layers.conv3d_transpose(volumes, 4, None)
 
+  @test_util.run_deprecated_v1
   def testCreateConv3DTranspose(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -976,6 +1024,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertListEqual(layer.bias.get_shape().as_list(), [4])
 
+  @test_util.run_deprecated_v1
   def testCreateConv3DTransposeChannelsFirst(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, 32, depth, height, width))
@@ -1019,6 +1068,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(output.get_shape().as_list(),
                          [5, depth * 2, height, width, 4])
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeKernelRegularizer(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1030,6 +1080,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeBiasRegularizer(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1041,6 +1092,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.evaluate([v.initializer for v in layer.variables])
     self.assertListEqual(self.evaluate(layer.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testConv3DTransposeNoBias(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32))
@@ -1053,6 +1105,7 @@ class Conv3DTransposeTest(test.TestCase):
     self.assertListEqual(layer.kernel.get_shape().as_list(), [3, 3, 3, 4, 32])
     self.assertEqual(layer.bias, None)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeReuse(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
@@ -1062,6 +1115,7 @@ class Conv3DTransposeTest(test.TestCase):
         volumes, 4, [3, 3, 3], name='deconv1', reuse=True)
     self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeReuseFromScope(self):
     with variable_scope.variable_scope('scope'):
       depth, height, width = 5, 7, 9
@@ -1072,6 +1126,7 @@ class Conv3DTransposeTest(test.TestCase):
       conv_layers.conv3d_transpose(volumes, 4, [3, 3, 3], name='deconv1')
       self.assertEqual(len(variables.trainable_variables()), 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeInitializerFromScope(self):
     with self.cached_session() as sess:
       with variable_scope.variable_scope(
@@ -1084,13 +1139,14 @@ class Conv3DTransposeTest(test.TestCase):
         # Check the names of weights in order.
         self.assertTrue('kernel' in weights[0].name)
         self.assertTrue('bias' in weights[1].name)
-        sess.run(variables.global_variables_initializer())
-        weights = sess.run(weights)
+        self.evaluate(variables.global_variables_initializer())
+        weights = self.evaluate(weights)
         # Check that the kernel weights got initialized to ones (from scope)
         self.assertAllClose(weights[0], np.ones((3, 3, 3, 4, 32)))
         # Check that the bias still got initialized to zeros.
         self.assertAllClose(weights[1], np.zeros((4)))
 
+  @test_util.run_deprecated_v1
   def testFunctionalConv3DTransposeNoReuse(self):
     depth, height, width = 5, 7, 9
     volumes = random_ops.random_uniform((5, depth, height, width, 32), seed=1)
diff --git a/tensorflow/python/layers/core_test.py b/tensorflow/python/layers/core_test.py
index 0343bfa8bd2d0fdfd80bd49709fa734d8df8f7ec..b40a2682381ad50da67fe7499b75f4f862e00b3d 100644
--- a/tensorflow/python/layers/core_test.py
+++ b/tensorflow/python/layers/core_test.py
@@ -59,6 +59,7 @@ class DenseTest(test.TestCase):
     dense.apply(random_ops.random_uniform((5, 2)))
     self.assertEqual(dense.name, 'dense_2')
 
+  @test_util.run_deprecated_v1
   def testVariableInput(self):
     with self.cached_session():
       v = variable_scope.get_variable(
@@ -140,6 +141,7 @@ class DenseTest(test.TestCase):
     outputs = dense.apply(inputs)
     self.assertEqual(outputs.get_shape().as_list(), [1, 2, 4, 7])
 
+  @test_util.run_deprecated_v1
   def testCallOnPlaceHolder(self):
     inputs = array_ops.placeholder(dtype=dtypes.float32)
     dense = core_layers.Dense(4, name='my_dense')
@@ -179,6 +181,7 @@ class DenseTest(test.TestCase):
     if not context.executing_eagerly():
       self.assertEqual(outputs.op.name, 'dense2/BiasAdd')
 
+  @test_util.run_deprecated_v1
   def testActivityRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(
@@ -189,6 +192,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(len(loss_keys), 1)
     self.assertListEqual(dense.losses, loss_keys)
 
+  @test_util.run_deprecated_v1
   def testKernelRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(
@@ -200,6 +204,7 @@ class DenseTest(test.TestCase):
     self.evaluate([v.initializer for v in dense.variables])
     self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testKernelRegularizerWithReuse(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     inputs = random_ops.random_uniform((5, 3), seed=1)
@@ -212,6 +217,7 @@ class DenseTest(test.TestCase):
     self.assertEqual(
         len(ops.get_collection(ops.GraphKeys.REGULARIZATION_LOSSES)), 1)
 
+  @test_util.run_deprecated_v1
   def testBiasRegularizer(self):
     regularizer = lambda x: math_ops.reduce_sum(x) * 1e-3
     dense = core_layers.Dense(2, name='my_dense', bias_regularizer=regularizer)
@@ -222,6 +228,7 @@ class DenseTest(test.TestCase):
     self.evaluate([v.initializer for v in dense.variables])
     self.assertAllEqual(self.evaluate(dense.losses), self.evaluate(loss_keys))
 
+  @test_util.run_deprecated_v1
   def testFunctionalDense(self):
     with self.cached_session():
       inputs = random_ops.random_uniform((5, 3), seed=1)
@@ -231,6 +238,7 @@ class DenseTest(test.TestCase):
           len(ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES)), 2)
       self.assertEqual(outputs.op.name, 'my_dense/Relu')
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseTwice(self):
     inputs = random_ops.random_uniform((5, 3), seed=1)
     core_layers.dense(inputs, 2)
@@ -262,6 +270,7 @@ class DenseTest(test.TestCase):
         vars2 = variables.trainable_variables()
       self.assertEqual(vars1, vars2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseInitializerFromScope(self):
     with variable_scope.variable_scope(
         'scope',
@@ -307,6 +316,7 @@ class DenseTest(test.TestCase):
       core_layers.dense(inputs, 2)
     self.assertEqual(called[0], 2)
 
+  @test_util.run_deprecated_v1
   def testFunctionalDenseInScope(self):
     with self.cached_session():
       with variable_scope.variable_scope('test'):
@@ -393,6 +403,7 @@ class DropoutTest(test.TestCase):
     np_output = self.evaluate(dropped)
     self.assertAllClose(np.ones((5, 3)), np_output)
 
+  @test_util.run_deprecated_v1
   def testDynamicLearningPhase(self):
     with self.cached_session() as sess:
       dp = core_layers.Dropout(0.5, seed=1)
@@ -426,6 +437,7 @@ class DropoutTest(test.TestCase):
     self.assertAlmostEqual(0., np_output.min())
     self.assertAllClose(np_output[:, 0, :], np_output[:, 1, :])
 
+  @test_util.run_deprecated_v1
   def testFunctionalDropout(self):
     with self.cached_session():
       inputs = array_ops.ones((5, 5))
@@ -437,13 +449,14 @@ class DropoutTest(test.TestCase):
       np_output = self.evaluate(dropped)
       self.assertAllClose(np.ones((5, 5)), np_output)
 
+  @test_util.run_deprecated_v1
   def testDynamicRate(self):
     with self.cached_session() as sess:
       rate = array_ops.placeholder(dtype='float32', name='rate')
       dp = core_layers.Dropout(rate, name='dropout')
       inputs = array_ops.ones((5, 5))
       dropped = dp.apply(inputs, training=True)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       np_output = sess.run(dropped, feed_dict={rate: 0.5})
       self.assertAlmostEqual(0., np_output.min())
       np_output = sess.run(dropped, feed_dict={rate: 0.0})
@@ -452,6 +465,7 @@ class DropoutTest(test.TestCase):
 
 class FlattenTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testCreateFlatten(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
@@ -476,6 +490,7 @@ class FlattenTest(test.TestCase):
     shape = core_layers.Flatten().compute_output_shape((None, 3, None))
     self.assertEqual(shape.as_list(), [None, None])
 
+  @test_util.run_deprecated_v1
   def testDataFormat5d(self):
     np_input_channels_last = np.arange(
         120, dtype='float32').reshape([1, 5, 4, 3, 2])
@@ -493,6 +508,7 @@ class FlattenTest(test.TestCase):
 
       self.assertAllEqual(np_output_cl, np_output_cf)
 
+  @test_util.run_deprecated_v1
   def testDataFormat4d(self):
     np_input_channels_last = np.arange(
         24, dtype='float32').reshape([1, 4, 3, 2])
@@ -510,16 +526,22 @@ class FlattenTest(test.TestCase):
 
       self.assertAllEqual(np_output_cl, np_output_cf)
 
+  @test_util.run_deprecated_v1
   def testFunctionalFlatten(self):
     x = array_ops.placeholder(shape=(None, 2, 3), dtype='float32')
     y = core_layers.flatten(x, name='flatten')
     self.assertEqual(y.get_shape().as_list(), [None, 6])
 
-  def testFlattenValueError(self):
+  @test_util.run_deprecated_v1
+  def testFlatten0D(self):
     x = array_ops.placeholder(shape=(None,), dtype='float32')
-    with self.assertRaises(ValueError):
-      core_layers.Flatten()(x)
+    y = core_layers.Flatten()(x)
+    with self.cached_session() as sess:
+      np_output = sess.run(y, feed_dict={x: np.zeros((5,))})
+    self.assertEqual(list(np_output.shape), [5, 1])
+    self.assertEqual(y.shape.as_list(), [None, 1])
 
+  @test_util.run_deprecated_v1
   def testFlattenUnknownAxes(self):
     with self.cached_session() as sess:
       x = array_ops.placeholder(shape=(5, None, None), dtype='float32')
diff --git a/tensorflow/python/layers/layers.py b/tensorflow/python/layers/layers.py
index 11a2ebc040f0177e38d5b0f38cf609071f91fd07..93eec38a08c476a746fa5ee1604076ce1e4e904f 100644
--- a/tensorflow/python/layers/layers.py
+++ b/tensorflow/python/layers/layers.py
@@ -24,7 +24,7 @@ from __future__ import print_function
 
 # Base objects.
 from tensorflow.python.layers.base import Layer
-from tensorflow.python.layers.base import InputSpec
+from tensorflow.python.keras.engine.input_spec import InputSpec
 
 # Core layers.
 from tensorflow.python.layers.core import Dense
diff --git a/tensorflow/python/layers/normalization_test.py b/tensorflow/python/layers/normalization_test.py
index ba2bf10cf3aa558bde2253f6fe6d44f37f9efb4c..6535f74129ae166d41675aad494be09bdd0f5cd3 100644
--- a/tensorflow/python/layers/normalization_test.py
+++ b/tensorflow/python/layers/normalization_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.core.protobuf import saver_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import convolutional as conv_layers
 from tensorflow.python.layers import normalization as normalization_layers
 from tensorflow.python.ops import array_ops
@@ -37,6 +38,7 @@ from tensorflow.python.training import gradient_descent
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class BNTest(test.TestCase):
 
   def _simple_model(self, image, fused, freeze_mode):
@@ -78,7 +80,7 @@ class BNTest(test.TestCase):
       if restore:
         saver.restore(sess, checkpoint_path)
       else:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
       np.random.seed(0)
       for _ in range(2):
         image_val = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
@@ -321,9 +323,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 4, 1))
       np_beta = np.reshape(np_beta, (1, 4, 1))
 
@@ -336,8 +338,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 2))
       std = np.std(np_inputs, axis=(0, 2))
       variance = np.square(std)
@@ -363,8 +366,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3))
       np_beta = np.reshape(np_beta, (1, 1, 3))
       for _ in range(100):
@@ -376,8 +379,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1))
       std = np.std(np_inputs, axis=(0, 1))
       variance = np.square(std)
@@ -404,8 +408,8 @@ class BNTest(test.TestCase):
 
       with self.session(use_gpu=True) as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
-        np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+        self.evaluate(variables.global_variables_initializer())
+        np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
         for _ in range(100):
@@ -417,8 +421,9 @@ class BNTest(test.TestCase):
           self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
         # Verify that the statistics are updated during training.
-        moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        moving_mean, moving_var = self.evaluate(
+            [bn.moving_mean, bn.moving_variance])
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -444,8 +449,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 3, 1))
       np_beta = np.reshape(np_beta, (1, 1, 3, 1))
       for _ in range(100):
@@ -457,8 +462,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 3))
       std = np.std(np_inputs, axis=(0, 1, 3))
       variance = np.square(std)
@@ -484,8 +490,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -497,8 +503,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -524,8 +531,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -537,8 +544,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -565,8 +573,8 @@ class BNTest(test.TestCase):
 
       with self.cached_session() as sess:
         # Test training with placeholder learning phase.
-        sess.run(variables.global_variables_initializer())
-        np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+        self.evaluate(variables.global_variables_initializer())
+        np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
         np_gamma = np.reshape(np_gamma, (1, 4, 1, 1))
         np_beta = np.reshape(np_beta, (1, 4, 1, 1))
         for _ in range(100):
@@ -578,8 +586,9 @@ class BNTest(test.TestCase):
           self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
         # Verify that the statistics are updated during training.
-        moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-        np_inputs = sess.run(inputs)
+        moving_mean, moving_var = self.evaluate(
+            [bn.moving_mean, bn.moving_variance])
+        np_inputs = self.evaluate(inputs)
         mean = np.mean(np_inputs, axis=(0, 2, 3))
         std = np.std(np_inputs, axis=(0, 2, 3))
         variance = np.square(std)
@@ -605,8 +614,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -619,8 +628,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -646,8 +656,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -658,8 +668,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 1, 2))
       std = np.std(np_inputs, axis=(0, 1, 2))
       variance = np.square(std)
@@ -667,7 +678,7 @@ class BNTest(test.TestCase):
       self.assertAllClose(variance, moving_var, atol=1e-2)
 
       # Test inference with placeholder learning phase.
-      np_output = sess.run(outputs_infer)
+      np_output = self.evaluate(outputs_infer)
 
       # Verify that the axis is normalized during inference.
       normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
@@ -696,8 +707,8 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
-      np_gamma, np_beta = sess.run([gamma, beta])
+      self.evaluate(variables.global_variables_initializer())
+      np_gamma, np_beta = self.evaluate([gamma, beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       for _ in range(100):
@@ -709,8 +720,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs)
+      np_moving_mean, np_moving_var = self.evaluate(
+          [moving_mean, moving_variance])
+      np_inputs = self.evaluate(inputs)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -758,14 +770,15 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(100):
         np_output, _, _ = sess.run([outputs2] + updates,
                                    feed_dict={training: True})
 
       # Verify that the statistics are updated during training.
-      np_moving_mean, np_moving_var = sess.run([moving_mean, moving_variance])
-      np_inputs = sess.run(inputs2)
+      np_moving_mean, np_moving_var = self.evaluate(
+          [moving_mean, moving_variance])
+      np_inputs = self.evaluate(inputs2)
       np_mean = np.mean(np_inputs, axis=(0, 1, 2))
       np_std = np.std(np_inputs, axis=(0, 1, 2))
       np_variance = np.square(np_std)
@@ -773,7 +786,7 @@ class BNTest(test.TestCase):
       self.assertAllClose(np_variance, np_moving_var, atol=1e-2)
 
       # Verify that the axis is normalized during training.
-      np_gamma, np_beta = sess.run([gamma, beta])
+      np_gamma, np_beta = self.evaluate([gamma, beta])
       np_gamma = np.reshape(np_gamma, (1, 1, 1, 6))
       np_beta = np.reshape(np_beta, (1, 1, 1, 6))
       normed_np_output = ((np_output - epsilon) * np_gamma) + np_beta
@@ -885,7 +898,7 @@ class BNTest(test.TestCase):
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -937,7 +950,7 @@ class BNTest(test.TestCase):
     moving_mean = 0.
     moving_variance = 1.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -990,7 +1003,7 @@ class BNTest(test.TestCase):
     renorm_mean = renorm_stddev = 0.
     renorm_weight = 0.
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
         yt_val_train, adj_scale_val, adj_bias_val = sess.run(
@@ -1040,7 +1053,7 @@ class BNTest(test.TestCase):
         out1.shape.as_list(), out2.shape.as_list())
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(shape)
       y1, y2 = sess.run([out1, out2], feed_dict={inp: x})
@@ -1062,7 +1075,7 @@ class BNTest(test.TestCase):
         inp, virtual_batch_size=2)
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       x = np.random.random(np_shape)
       y = sess.run(out, feed_dict={inp: x})
@@ -1093,7 +1106,7 @@ class BNTest(test.TestCase):
                     shape[1]])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1146,7 +1159,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1200,7 +1213,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
@@ -1256,9 +1269,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
 
       for _ in range(100):
         np_output, _, _ = sess.run([outputs] + bn.updates,
@@ -1269,8 +1282,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=0, keepdims=True)
       std = np.std(np_inputs, axis=0, keepdims=True)
       variance = np.square(std)
@@ -1296,9 +1310,9 @@ class BNTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Test training with placeholder learning phase.
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
-      np_gamma, np_beta = sess.run([bn.gamma, bn.beta])
+      np_gamma, np_beta = self.evaluate([bn.gamma, bn.beta])
 
       for _ in range(100):
         np_output, _, _ = sess.run([outputs] + bn.updates,
@@ -1309,8 +1323,9 @@ class BNTest(test.TestCase):
         self.assertAlmostEqual(np.std(normed_np_output), 1., places=1)
 
       # Verify that the statistics are updated during training.
-      moving_mean, moving_var = sess.run([bn.moving_mean, bn.moving_variance])
-      np_inputs = sess.run(inputs)
+      moving_mean, moving_var = self.evaluate(
+          [bn.moving_mean, bn.moving_variance])
+      np_inputs = self.evaluate(inputs)
       mean = np.mean(np_inputs, axis=(0, 4), keepdims=True)
       std = np.std(np_inputs, axis=(0, 4), keepdims=True)
       variance = np.square(std)
@@ -1350,7 +1365,7 @@ class BNTest(test.TestCase):
                    shape[1:])
 
     with self.session(use_gpu=True) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in range(5):
         x = np.random.random(shape)
 
diff --git a/tensorflow/python/layers/pooling_test.py b/tensorflow/python/layers/pooling_test.py
index 7533674e5a0cf60f91551cd6333c8d802612e03d..cf1fa1e6915695cc3d4c130ef501b466a73a1953 100644
--- a/tensorflow/python/layers/pooling_test.py
+++ b/tensorflow/python/layers/pooling_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.layers import pooling as pooling_layers
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
@@ -64,6 +65,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 3, 4, 4])
 
+  @test_util.run_deprecated_v1
   def testCreateMaxPooling2DChannelsFirst(self):
     height, width = 7, 9
     images = random_ops.random_uniform((5, 2, height, width))
@@ -73,6 +75,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [5, 2, 6, 8])
 
+  @test_util.run_deprecated_v1
   def testCreateAveragePooling2DChannelsFirst(self):
     height, width = 5, 6
     images = random_ops.random_uniform((3, 4, height, width))
@@ -83,6 +86,7 @@ class PoolingTest(test.TestCase):
     output = layer.apply(images)
     self.assertListEqual(output.get_shape().as_list(), [3, 4, 4, 5])
 
+  @test_util.run_deprecated_v1
   def testCreateAveragePooling2DChannelsFirstWithNoneBatch(self):
     height, width = 5, 6
     images = array_ops.placeholder(dtype='float32',
diff --git a/tensorflow/python/lib/core/py_func.cc b/tensorflow/python/lib/core/py_func.cc
index 9364aec373df9575282ae9254bce50a307bf61a0..97bebe86177ee264ef00bc9b969b293389aa2122 100644
--- a/tensorflow/python/lib/core/py_func.cc
+++ b/tensorflow/python/lib/core/py_func.cc
@@ -302,15 +302,14 @@ Status DoCallPyFunc(PyCall* call, bool* out_log_on_error) {
 class NumpyTensorBuffer : public TensorBuffer {
  public:
   NumpyTensorBuffer(PyArrayObject* array, size_t len, void* data)
-      : array_(array), len_(len), data_(data) {}
+      : TensorBuffer(data), array_(array), len_(len) {}
 
   ~NumpyTensorBuffer() override {
     // Note: The session::run wrapper is responsible for freeing this while
     // holding the GIL.
-    DelayedNumpyDecref(data_, len_, array_);
+    DelayedNumpyDecref(data(), len_, array_);
   }
 
-  void* data() const override { return data_; }
   size_t size() const override { return len_; }
   TensorBuffer* root_buffer() override { return this; }
   void FillAllocationDescription(AllocationDescription* proto) const override {
@@ -329,7 +328,6 @@ class NumpyTensorBuffer : public TensorBuffer {
  private:
   PyArrayObject* array_;
   size_t len_;
-  void* data_;
 };
 
 Status PyObjectToString(PyObject* obj, string* str) {
diff --git a/tensorflow/python/lib/io/file_io.py b/tensorflow/python/lib/io/file_io.py
index f22fb253e4d59813226f0e9741cabcfbf0cdcd1a..ee55d89bffcbaca2a68cbb028ae9ca5157e6f6df 100644
--- a/tensorflow/python/lib/io/file_io.py
+++ b/tensorflow/python/lib/io/file_io.py
@@ -241,7 +241,7 @@ class FileIO(object):
     self._writable_file = None
 
 
-@tf_export("gfile.Exists")
+@tf_export(v1=["gfile.Exists"])
 def file_exists(filename):
   """Determines whether a path exists or not.
 
@@ -252,18 +252,35 @@ def file_exists(filename):
     True if the path exists, whether its a file or a directory.
     False if the path does not exist and there are no filesystem errors.
 
+  Raises:
+    errors.OpError: Propagates any errors reported by the FileSystem API.
+  """
+  return file_exists_v2(filename)
+
+
+@tf_export("io.gfile.exists")
+def file_exists_v2(path):
+  """Determines whether a path exists or not.
+
+  Args:
+    path: string, a path
+
+  Returns:
+    True if the path exists, whether its a file or a directory.
+    False if the path does not exist and there are no filesystem errors.
+
   Raises:
     errors.OpError: Propagates any errors reported by the FileSystem API.
   """
   try:
     with errors.raise_exception_on_not_ok_status() as status:
-      pywrap_tensorflow.FileExists(compat.as_bytes(filename), status)
+      pywrap_tensorflow.FileExists(compat.as_bytes(path), status)
   except errors.NotFoundError:
     return False
   return True
 
 
-@tf_export("gfile.Remove")
+@tf_export(v1=["gfile.Remove"])
 def delete_file(filename):
   """Deletes the file located at 'filename'.
 
@@ -274,8 +291,22 @@ def delete_file(filename):
     errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
     NotFoundError if the file does not exist.
   """
+  delete_file_v2(filename)
+
+
+@tf_export("io.gfile.remove")
+def delete_file_v2(path):
+  """Deletes the path located at 'path'.
+
+  Args:
+    path: string, a path
+
+  Raises:
+    errors.OpError: Propagates any errors reported by the FileSystem API.  E.g.,
+    NotFoundError if the path does not exist.
+  """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.DeleteFile(compat.as_bytes(filename), status)
+    pywrap_tensorflow.DeleteFile(compat.as_bytes(path), status)
 
 
 def read_file_to_string(filename, binary_mode=False):
@@ -314,7 +345,7 @@ def write_string_to_file(filename, file_content):
     f.write(file_content)
 
 
-@tf_export("gfile.Glob")
+@tf_export(v1=["gfile.Glob"])
 def get_matching_files(filename):
   """Returns a list of files that match the given pattern(s).
 
@@ -324,28 +355,44 @@ def get_matching_files(filename):
   Returns:
     A list of strings containing filenames that match the given pattern(s).
 
+  Raises:
+    errors.OpError: If there are filesystem / directory listing errors.
+  """
+  return get_matching_files_v2(filename)
+
+
+@tf_export("io.gfile.glob")
+def get_matching_files_v2(pattern):
+  """Returns a list of files that match the given pattern(s).
+
+  Args:
+    pattern: string or iterable of strings. The glob pattern(s).
+
+  Returns:
+    A list of strings containing filenames that match the given pattern(s).
+
   Raises:
     errors.OpError: If there are filesystem / directory listing errors.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    if isinstance(filename, six.string_types):
+    if isinstance(pattern, six.string_types):
       return [
           # Convert the filenames to string from bytes.
           compat.as_str_any(matching_filename)
           for matching_filename in pywrap_tensorflow.GetMatchingFiles(
-              compat.as_bytes(filename), status)
+              compat.as_bytes(pattern), status)
       ]
     else:
       return [
           # Convert the filenames to string from bytes.
           compat.as_str_any(matching_filename)
-          for single_filename in filename
+          for single_filename in pattern
           for matching_filename in pywrap_tensorflow.GetMatchingFiles(
               compat.as_bytes(single_filename), status)
       ]
 
 
-@tf_export("gfile.MkDir")
+@tf_export(v1=["gfile.MkDir"])
 def create_dir(dirname):
   """Creates a directory with the name 'dirname'.
 
@@ -356,14 +403,31 @@ def create_dir(dirname):
     The parent directories need to exist. Use recursive_create_dir instead if
     there is the possibility that the parent dirs don't exist.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  create_dir_v2(dirname)
+
+
+@tf_export("io.gfile.mkdir")
+def create_dir_v2(path):
+  """Creates a directory with the name given by 'path'.
+
+  Args:
+    path: string, name of the directory to be created
+
+  Notes:
+    The parent directories need to exist. Use recursive_create_dir instead if
+    there is the possibility that the parent dirs don't exist.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.CreateDir(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.CreateDir(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.MakeDirs")
+@tf_export(v1=["gfile.MakeDirs"])
 def recursive_create_dir(dirname):
   """Creates a directory and all parent/intermediate directories.
 
@@ -372,14 +436,29 @@ def recursive_create_dir(dirname):
   Args:
     dirname: string, name of the directory to be created
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  recursive_create_dir_v2(dirname)
+
+
+@tf_export("io.gfile.makedirs")
+def recursive_create_dir_v2(path):
+  """Creates a directory and all parent/intermediate directories.
+
+  It succeeds if path already exists and is writable.
+
+  Args:
+    path: string, name of the directory to be created
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.RecursivelyCreateDir(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.Copy")
+@tf_export(v1=["gfile.Copy"])
 def copy(oldpath, newpath, overwrite=False):
   """Copies data from oldpath to newpath.
 
@@ -389,15 +468,31 @@ def copy(oldpath, newpath, overwrite=False):
     overwrite: boolean, if false its an error for newpath to be occupied by an
         existing file.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  copy_v2(oldpath, newpath, overwrite)
+
+
+@tf_export("io.gfile.copy")
+def copy_v2(src, dst, overwrite=False):
+  """Copies data from src to dst.
+
+  Args:
+    src: string, name of the file whose contents need to be copied
+    dst: string, name of the file to which to copy to
+    overwrite: boolean, if false its an error for newpath to be occupied by an
+        existing file.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
     pywrap_tensorflow.CopyFile(
-        compat.as_bytes(oldpath), compat.as_bytes(newpath), overwrite, status)
+        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)
 
 
-@tf_export("gfile.Rename")
+@tf_export(v1=["gfile.Rename"])
 def rename(oldname, newname, overwrite=False):
   """Rename or move a file / directory.
 
@@ -407,12 +502,28 @@ def rename(oldname, newname, overwrite=False):
     overwrite: boolean, if false it's an error for `newname` to be occupied by
         an existing file.
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  rename_v2(oldname, newname, overwrite)
+
+
+@tf_export("io.gfile.rename")
+def rename_v2(src, dst, overwrite=False):
+  """Rename or move a file / directory.
+
+  Args:
+    src: string, pathname for a file
+    dst: string, pathname to which the file needs to be moved
+    overwrite: boolean, if false it's an error for `dst` to be occupied by
+        an existing file.
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
     pywrap_tensorflow.RenameFile(
-        compat.as_bytes(oldname), compat.as_bytes(newname), overwrite, status)
+        compat.as_bytes(src), compat.as_bytes(dst), overwrite, status)
 
 
 def atomic_write_string_to_file(filename, contents, overwrite=True):
@@ -439,35 +550,61 @@ def atomic_write_string_to_file(filename, contents, overwrite=True):
     raise
 
 
-@tf_export("gfile.DeleteRecursively")
+@tf_export(v1=["gfile.DeleteRecursively"])
 def delete_recursively(dirname):
   """Deletes everything under dirname recursively.
 
   Args:
     dirname: string, a path to a directory
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  delete_recursively_v2(dirname)
+
+
+@tf_export("io.gfile.rmtree")
+def delete_recursively_v2(path):
+  """Deletes everything under path recursively.
+
+  Args:
+    path: string, a path
+
   Raises:
     errors.OpError: If the operation fails.
   """
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.DeleteRecursively(compat.as_bytes(dirname), status)
+    pywrap_tensorflow.DeleteRecursively(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.IsDirectory")
+@tf_export(v1=["gfile.IsDirectory"])
 def is_directory(dirname):
   """Returns whether the path is a directory or not.
 
   Args:
     dirname: string, path to a potential directory
 
+  Returns:
+    True, if the path is a directory; False otherwise
+  """
+  return is_directory_v2(dirname)
+
+
+@tf_export("io.gfile.isdir")
+def is_directory_v2(path):
+  """Returns whether the path is a directory or not.
+
+  Args:
+    path: string, path to a potential directory
+
   Returns:
     True, if the path is a directory; False otherwise
   """
   status = c_api_util.ScopedTFStatus()
-  return pywrap_tensorflow.IsDirectory(compat.as_bytes(dirname), status)
+  return pywrap_tensorflow.IsDirectory(compat.as_bytes(path), status)
 
 
-@tf_export("gfile.ListDirectory")
+@tf_export(v1=["gfile.ListDirectory"])
 def list_directory(dirname):
   """Returns a list of entries contained within a directory.
 
@@ -483,7 +620,26 @@ def list_directory(dirname):
   Raises:
     errors.NotFoundError if directory doesn't exist
   """
-  if not is_directory(dirname):
+  return list_directory_v2(dirname)
+
+
+@tf_export("io.gfile.listdir")
+def list_directory_v2(path):
+  """Returns a list of entries contained within a directory.
+
+  The list is in arbitrary order. It does not contain the special entries "."
+  and "..".
+
+  Args:
+    path: string, path to a directory
+
+  Returns:
+    [filename1, filename2, ... filenameN] as strings
+
+  Raises:
+    errors.NotFoundError if directory doesn't exist
+  """
+  if not is_directory(path):
     raise errors.NotFoundError(None, None, "Could not find directory")
   with errors.raise_exception_on_not_ok_status() as status:
     # Convert each element to string, since the return values of the
@@ -491,11 +647,11 @@ def list_directory(dirname):
     return [
         compat.as_str_any(filename)
         for filename in pywrap_tensorflow.GetChildren(
-            compat.as_bytes(dirname), status)
+            compat.as_bytes(path), status)
     ]
 
 
-@tf_export("gfile.Walk")
+@tf_export(v1=["gfile.Walk"])
 def walk(top, in_order=True):
   """Recursive directory tree generator for directories.
 
@@ -505,6 +661,27 @@ def walk(top, in_order=True):
 
   Errors that happen while listing directories are ignored.
 
+  Yields:
+    Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
+    all its subdirectories and leaf files.
+    (dirname, [subdirname, subdirname, ...], [filename, filename, ...])
+    as strings
+  """
+  return walk_v2(top, in_order)
+
+
+@tf_export("io.gfile.walk")
+def walk_v2(top, topdown=True, onerror=None):
+  """Recursive directory tree generator for directories.
+
+  Args:
+    top: string, a Directory name
+    topdown: bool, Traverse pre order if True, post order if False.
+    onerror: optional handler for errors. Should be a function, it will be
+      called with the error as argument. Rethrowing the error aborts the walk.
+
+  Errors that happen while listing directories are ignored.
+
   Yields:
     Each yield is a 3-tuple:  the pathname of a directory, followed by lists of
     all its subdirectories and leaf files.
@@ -514,8 +691,11 @@ def walk(top, in_order=True):
   top = compat.as_str_any(top)
   try:
     listing = list_directory(top)
-  except errors.NotFoundError:
-    return
+  except errors.NotFoundError as err:
+    if onerror:
+      onerror(err)
+    else:
+      return
 
   files = []
   subdirs = []
@@ -528,18 +708,18 @@ def walk(top, in_order=True):
 
   here = (top, subdirs, files)
 
-  if in_order:
+  if topdown:
     yield here
 
   for subdir in subdirs:
-    for subitem in walk(os.path.join(top, subdir), in_order):
+    for subitem in walk_v2(os.path.join(top, subdir), topdown, onerror=onerror):
       yield subitem
 
-  if not in_order:
+  if not topdown:
     yield here
 
 
-@tf_export("gfile.Stat")
+@tf_export(v1=["gfile.Stat"])
 def stat(filename):
   """Returns file statistics for a given path.
 
@@ -549,12 +729,28 @@ def stat(filename):
   Returns:
     FileStatistics struct that contains information about the path
 
+  Raises:
+    errors.OpError: If the operation fails.
+  """
+  return stat_v2(filename)
+
+
+@tf_export("io.gfile.stat")
+def stat_v2(path):
+  """Returns file statistics for a given path.
+
+  Args:
+    path: string, path to a file
+
+  Returns:
+    FileStatistics struct that contains information about the path
+
   Raises:
     errors.OpError: If the operation fails.
   """
   file_statistics = pywrap_tensorflow.FileStatistics()
   with errors.raise_exception_on_not_ok_status() as status:
-    pywrap_tensorflow.Stat(compat.as_bytes(filename), file_statistics, status)
+    pywrap_tensorflow.Stat(compat.as_bytes(path), file_statistics, status)
     return file_statistics
 
 
diff --git a/tensorflow/python/lib/io/tf_record.py b/tensorflow/python/lib/io/tf_record.py
index b7fae8529559efd1369db1364e730fbbc5d1df5a..43086ab18d7774f54be2b393deccec6be180801f 100644
--- a/tensorflow/python/lib/io/tf_record.py
+++ b/tensorflow/python/lib/io/tf_record.py
@@ -150,10 +150,11 @@ class TFRecordOptions(object):
     return options
 
 
-@tf_export(
-    "io.tf_record_iterator",
-    v1=["io.tf_record_iterator", "python_io.tf_record_iterator"])
-@deprecation.deprecated_endpoints("python_io.tf_record_iterator")
+@tf_export(v1=["io.tf_record_iterator", "python_io.tf_record_iterator"])
+@deprecation.deprecated(
+    date=None,
+    instructions=("Use eager execution and: \n"
+                  "`tf.data.TFRecordDataset(path)`"))
 def tf_record_iterator(path, options=None):
   """An iterator that read the records from a TFRecords file.
 
diff --git a/tensorflow/python/ops/array_grad.py b/tensorflow/python/ops/array_grad.py
index 68c392bf28d19fda8e39905560f04e4810c203f7..45e741ef222b1dcde21b66ab6cdc3db9576a85ce 100644
--- a/tensorflow/python/ops/array_grad.py
+++ b/tensorflow/python/ops/array_grad.py
@@ -489,10 +489,12 @@ def _GatherNdGrad(op, grad):
 
 
 @ops.RegisterGradient("CheckNumerics")
-def _CheckNumericsGrad(_, grad):
+def _CheckNumericsGrad(op, grad):
   """Gradient for check_numerics op."""
   return array_ops.check_numerics(
-      grad, "Not a number (NaN) or infinity (Inf) values detected in gradient.")
+      grad,
+      "Not a number (NaN) or infinity (Inf) values detected in gradient. %s" %
+      op.get_attr("message"))
 
 
 @ops.RegisterGradient("PlaceholderWithDefault")
@@ -800,6 +802,32 @@ def _ScatterNdGrad(op, grad):
   return [None, updates_grad, None]
 
 
+@ops.RegisterGradient("TensorScatterUpdate")
+def _TensorScatterUpdateGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.tensor_scatter_update(
+      array_ops.identity(grad), indices,
+      array_ops.zeros_like(op.inputs[2], dtype=grad.dtype))
+  return [tensor_grad, None, updates_grad]
+
+
+@ops.RegisterGradient("TensorScatterAdd")
+def _TensorScatterAddGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.identity(grad)
+  return [tensor_grad, None, updates_grad]
+
+
+@ops.RegisterGradient("TensorScatterSub")
+def _TensorScatterSubGrad(op, grad):
+  indices = op.inputs[1]
+  updates_grad = array_ops.gather_nd(grad, indices)
+  tensor_grad = array_ops.identity(grad)
+  return [tensor_grad, None, -updates_grad]
+
+
 @ops.RegisterGradient("ScatterNdNonAliasingAdd")
 def _ScatterNdNonAliasingAddGrad(op, grad):
   indices = op.inputs[1]
diff --git a/tensorflow/python/ops/array_ops.py b/tensorflow/python/ops/array_ops.py
index bbf7d166bf9e2d5512a338666ee00832c77c769b..9dabbffb138093db6d3bd0dcf983d2f6cfdc5081 100644
--- a/tensorflow/python/ops/array_ops.py
+++ b/tensorflow/python/ops/array_ops.py
@@ -22,6 +22,7 @@ from __future__ import print_function
 import sys
 
 import numpy as np
+import six
 
 from tensorflow.python.eager import context
 from tensorflow.python.framework import common_shapes
@@ -40,6 +41,7 @@ from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops.gen_array_ops import *
 from tensorflow.python.ops.gen_array_ops import reverse_v2 as reverse  # pylint: disable=unused-import
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=wildcard-import
@@ -54,6 +56,7 @@ _BaseSlice = slice
 
 
 @tf_export("identity")
+@dispatch.add_dispatch_support
 def identity(input, name=None):  # pylint: disable=redefined-builtin
   r"""Return a tensor with the same shape and contents as input.
 
@@ -75,11 +78,15 @@ def identity(input, name=None):  # pylint: disable=redefined-builtin
       return input._copy()  # pylint: disable=protected-access
     return input
   else:
-    return gen_array_ops.identity(input, name=name)
+    ret = gen_array_ops.identity(input, name=name)
+    # Propagate handle data for happier shape inference for resource variables.
+    if hasattr(input, "_handle_data"):
+      ret._handle_data = input._handle_data  # pylint: disable=protected-access
+    return ret
 
 
 # pylint: disable=redefined-builtin,protected-access
-@tf_export("expand_dims")
+@tf_export(v1=["expand_dims"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead", "dim")
 def expand_dims(input, axis=None, name=None, dim=None):
   """Inserts a dimension of 1 into a tensor's shape.
@@ -133,6 +140,56 @@ def expand_dims(input, axis=None, name=None, dim=None):
   axis = deprecation.deprecated_argument_lookup("axis", axis, "dim", dim)
   if axis is None:
     raise ValueError("Must specify an axis argument to tf.expand_dims()")
+  return expand_dims_v2(input, axis, name)
+
+
+@tf_export("expand_dims", v1=[])
+@dispatch.add_dispatch_support
+def expand_dims_v2(input, axis, name=None):
+  """Inserts a dimension of 1 into a tensor's shape.
+
+  Given a tensor `input`, this operation inserts a dimension of 1 at the
+  dimension index `axis` of `input`'s shape. The dimension index `axis` starts
+  at zero; if you specify a negative number for `axis` it is counted backward
+  from the end.
+
+  This operation is useful if you want to add a batch dimension to a single
+  element. For example, if you have a single image of shape `[height, width,
+  channels]`, you can make it a batch of 1 image with `expand_dims(image, 0)`,
+  which will make the shape `[1, height, width, channels]`.
+
+  Other examples:
+
+  ```python
+  # 't' is a tensor of shape [2]
+  tf.shape(tf.expand_dims(t, 0))  # [1, 2]
+  tf.shape(tf.expand_dims(t, 1))  # [2, 1]
+  tf.shape(tf.expand_dims(t, -1))  # [2, 1]
+
+  # 't2' is a tensor of shape [2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 0))  # [1, 2, 3, 5]
+  tf.shape(tf.expand_dims(t2, 2))  # [2, 3, 1, 5]
+  tf.shape(tf.expand_dims(t2, 3))  # [2, 3, 5, 1]
+  ```
+
+  This operation requires that:
+
+  `-1-input.dims() <= dim <= input.dims()`
+
+  This operation is related to `squeeze()`, which removes dimensions of
+  size 1.
+
+  Args:
+    input: A `Tensor`.
+    axis: 0-D (scalar). Specifies the dimension index at which to
+      expand the shape of `input`. Must be in the range
+      `[-rank(input) - 1, rank(input)]`.
+    name: The name of the output `Tensor` (optional).
+
+  Returns:
+    A `Tensor` with the same data as `input`, but its shape has an additional
+    dimension of size 1 added.
+  """
   return gen_array_ops.expand_dims(input, axis, name)
 
 
@@ -219,7 +276,13 @@ def broadcast_static_shape(shape_x, shape_y):
   return common_shapes.broadcast_shape(shape_x, shape_y)
 
 
-@tf_export("shape")
+@tf_export("shape", v1=[])
+def shape_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return shape(input, name, out_type)
+
+
+@tf_export(v1=["shape"])
 def shape(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the shape of a tensor.
@@ -292,7 +355,13 @@ def shape_n(input, out_type=dtypes.int32, name=None):
   return gen_array_ops.shape_n(input, out_type=out_type, name=name)
 
 
-@tf_export("size")
+@tf_export("size", v1=[])
+def size_v2(input, out_type=dtypes.int32, name=None):
+  # pylint: disable=redefined-builtin
+  return size(input, name, out_type)
+
+
+@tf_export(v1=["size"])
 def size(input, name=None, out_type=dtypes.int32):
   # pylint: disable=redefined-builtin
   """Returns the size of a tensor.
@@ -341,7 +410,7 @@ def size_internal(input, name=None, optimize=True, out_type=dtypes.int32):
       input, (sparse_tensor.SparseTensor, sparse_tensor.SparseTensorValue)):
     input = ops.convert_to_tensor(input)
     np_out_type = out_type.as_numpy_dtype
-    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-acces:
+    num_elements = np.prod(input._shape_tuple(), dtype=np_out_type)  # pylint: disable=protected-access
     return ops.convert_to_tensor(num_elements, dtype=out_type)
   with ops.name_scope(name, "Size", [input]) as name:
     if isinstance(input, (sparse_tensor.SparseTensor,
@@ -431,7 +500,7 @@ _SUPPORTED_SLICE_DTYPES = (
 
 def _check_index(idx):
   """Check if a given value is a valid index into a tensor."""
-  if isinstance(idx, (int, tensor_shape.Dimension)):
+  if isinstance(idx, (six.integer_types, tensor_shape.Dimension)):
     return
 
   # Optimistic check. Assumptions:
@@ -878,6 +947,7 @@ def parallel_stack(values, name="parallel_stack"):
 
 
 @tf_export("stack")
+@dispatch.add_dispatch_support
 def stack(values, axis=0, name="stack"):
   """Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor.
 
@@ -1088,6 +1158,7 @@ def unstack(value, num=None, axis=0, name="unstack"):
 
 
 @tf_export("concat")
+@dispatch.add_dispatch_support
 def concat(values, axis, name="concat"):
   """Concatenates tensors along one dimension.
 
@@ -1265,6 +1336,7 @@ def boolean_mask(tensor, mask, name="boolean_mask", axis=None):
 
 
 @tf_export("boolean_mask", v1=[])
+@dispatch.add_dispatch_support
 def boolean_mask_v2(tensor, mask, axis=None, name="boolean_mask"):
   """Apply boolean mask to tensor.
 
@@ -1445,7 +1517,75 @@ def split(value, num_or_size_splits, axis=0, num=None, name="split"):
       value=value, size_splits=size_splits, axis=axis, num_split=num, name=name)
 
 
-@tf_export("transpose")
+@tf_export("transpose", v1=[])
+def transpose_v2(a, perm=None, conjugate=False, name="transpose"):
+  """Transposes `a`. Permutes the dimensions according to `perm`.
+
+  The returned tensor's dimension i will correspond to the input dimension
+  `perm[i]`. If `perm` is not given, it is set to (n-1...0), where n is
+  the rank of the input tensor. Hence by default, this operation performs a
+  regular matrix transpose on 2-D input Tensors. If conjugate is True and
+  `a.dtype` is either `complex64` or `complex128` then the values of `a`
+  are conjugated and transposed.
+
+  @compatibility(numpy)
+  In `numpy` transposes are memory-efficient constant time operations as they
+  simply return a new view of the same data with adjusted `strides`.
+
+  TensorFlow does not support strides, so `transpose` returns a new tensor with
+  the items permuted.
+  @end_compatibility
+
+  For example:
+
+  ```python
+  x = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.transpose(x)  # [[1, 4]
+                   #  [2, 5]
+                   #  [3, 6]]
+
+  # Equivalently
+  tf.transpose(x, perm=[1, 0])  # [[1, 4]
+                                #  [2, 5]
+                                #  [3, 6]]
+
+  # If x is complex, setting conjugate=True gives the conjugate transpose
+  x = tf.constant([[1 + 1j, 2 + 2j, 3 + 3j],
+                   [4 + 4j, 5 + 5j, 6 + 6j]])
+  tf.transpose(x, conjugate=True)  # [[1 - 1j, 4 - 4j],
+                                   #  [2 - 2j, 5 - 5j],
+                                   #  [3 - 3j, 6 - 6j]]
+
+  # 'perm' is more useful for n-dimensional tensors, for n > 2
+  x = tf.constant([[[ 1,  2,  3],
+                    [ 4,  5,  6]],
+                   [[ 7,  8,  9],
+                    [10, 11, 12]]])
+
+  # Take the transpose of the matrices in dimension-0
+  # (this common operation has a shorthand `linalg.transpose`)
+  tf.transpose(x, perm=[0, 2, 1])  # [[[1,  4],
+                                   #   [2,  5],
+                                   #   [3,  6]],
+                                   #  [[7, 10],
+                                   #   [8, 11],
+                                   #   [9, 12]]]
+  ```
+
+  Args:
+    a: A `Tensor`.
+    perm: A permutation of the dimensions of `a`.
+    conjugate: Optional bool. Setting it to `True` is mathematically equivalent
+      to tf.conj(tf.transpose(input)).
+    name: A name for the operation (optional).
+
+  Returns:
+    A transposed `Tensor`.
+  """
+  return transpose(a=a, perm=perm, name=name, conjugate=conjugate)
+
+
+@tf_export(v1=["transpose"])
 def transpose(a, perm=None, name="transpose", conjugate=False):
   """Transposes `a`. Permutes the dimensions according to `perm`.
 
@@ -1678,7 +1818,8 @@ def zeros(shape, dtype=dtypes.float32, name=None):
   return output
 
 
-@tf_export("zeros_like")
+@tf_export(v1=["zeros_like"])
+@dispatch.add_dispatch_support
 def zeros_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to zero.
 
@@ -1705,6 +1846,43 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to zero.
   """
+  return zeros_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("zeros_like", v1=[])
+@dispatch.add_dispatch_support
+def zeros_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.zeros_like(tensor)  # [[0, 0, 0], [0, 0, 0]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return zeros_like_impl(input, dtype, name, optimize=True)
+
+
+def zeros_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 zeros_like API calls."""
   with ops.name_scope(name, "zeros_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
 
@@ -1731,7 +1909,8 @@ def zeros_like(tensor, dtype=None, name=None, optimize=True):
       return gen_array_ops.zeros_like(tensor, name=name)
 
 
-@tf_export("ones_like")
+@tf_export(v1=["ones_like"])
+@dispatch.add_dispatch_support
 def ones_like(tensor, dtype=None, name=None, optimize=True):
   """Creates a tensor with all elements set to 1.
 
@@ -1758,6 +1937,43 @@ def ones_like(tensor, dtype=None, name=None, optimize=True):
   Returns:
     A `Tensor` with all elements set to 1.
   """
+  return ones_like_impl(tensor, dtype, name, optimize)
+
+
+@tf_export("ones_like", v1=[])
+@dispatch.add_dispatch_support
+def ones_like_v2(
+    input,  # pylint: disable=redefined-builtin
+    dtype=None,
+    name=None):
+  """Creates a tensor with all elements set to zero.
+
+  Given a single tensor (`tensor`), this operation returns a tensor of the
+  same type and shape as `tensor` with all elements set to zero. Optionally,
+  you can use `dtype` to specify a new type for the returned tensor.
+
+  For example:
+
+  ```python
+  tensor = tf.constant([[1, 2, 3], [4, 5, 6]])
+  tf.ones_like(tensor)  # [[1, 1, 1], [1, 1, 1]]
+  ```
+
+  Args:
+    input: A `Tensor`.
+    dtype: A type for the returned `Tensor`. Must be `float16`, `float32`,
+      `float64`, `int8`, `uint8`, `int16`, `uint16`, `int32`, `int64`,
+      `complex64`, `complex128`, `bool` or `string`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with all elements set to zero.
+  """
+  return ones_like_impl(input, dtype, name, optimize=True)
+
+
+def ones_like_impl(tensor, dtype, name, optimize=True):
+  """Internal implementation for the v1/v2 ones_like API calls."""
   with ops.name_scope(name, "ones_like", [tensor]) as name:
     tensor = ops.convert_to_tensor(tensor, name="tensor")
     ones_shape = shape_internal(tensor, optimize=optimize)
@@ -1955,7 +2171,65 @@ def sparse_placeholder(dtype, shape=None, name=None):
 # pylint: enable=redefined-outer-name
 
 
-@tf_export("pad")
+@tf_export("pad", v1=[])
+def pad_v2(tensor, paddings, mode="CONSTANT", constant_values=0, name=None):
+  """Pads a tensor.
+
+  This operation pads a `tensor` according to the `paddings` you specify.
+  `paddings` is an integer tensor with shape `[n, 2]`, where n is the rank of
+  `tensor`. For each dimension D of `input`, `paddings[D, 0]` indicates how
+  many values to add before the contents of `tensor` in that dimension, and
+  `paddings[D, 1]` indicates how many values to add after the contents of
+  `tensor` in that dimension. If `mode` is "REFLECT" then both `paddings[D, 0]`
+  and `paddings[D, 1]` must be no greater than `tensor.dim_size(D) - 1`. If
+  `mode` is "SYMMETRIC" then both `paddings[D, 0]` and `paddings[D, 1]` must be
+  no greater than `tensor.dim_size(D)`.
+
+  The padded size of each dimension D of the output is:
+
+  `paddings[D, 0] + tensor.dim_size(D) + paddings[D, 1]`
+
+  For example:
+
+  ```python
+  t = tf.constant([[1, 2, 3], [4, 5, 6]])
+  paddings = tf.constant([[1, 1,], [2, 2]])
+  # 'constant_values' is 0.
+  # rank of 't' is 2.
+  tf.pad(t, paddings, "CONSTANT")  # [[0, 0, 0, 0, 0, 0, 0],
+                                   #  [0, 0, 1, 2, 3, 0, 0],
+                                   #  [0, 0, 4, 5, 6, 0, 0],
+                                   #  [0, 0, 0, 0, 0, 0, 0]]
+
+  tf.pad(t, paddings, "REFLECT")  # [[6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1],
+                                  #  [6, 5, 4, 5, 6, 5, 4],
+                                  #  [3, 2, 1, 2, 3, 2, 1]]
+
+  tf.pad(t, paddings, "SYMMETRIC")  # [[2, 1, 1, 2, 3, 3, 2],
+                                    #  [2, 1, 1, 2, 3, 3, 2],
+                                    #  [5, 4, 4, 5, 6, 6, 5],
+                                    #  [5, 4, 4, 5, 6, 6, 5]]
+  ```
+
+  Args:
+    tensor: A `Tensor`.
+    paddings: A `Tensor` of type `int32`.
+    mode: One of "CONSTANT", "REFLECT", or "SYMMETRIC" (case-insensitive)
+    constant_values: In "CONSTANT" mode, the scalar pad value to use. Must be
+      same type as `tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `tensor`.
+
+  Raises:
+    ValueError: When mode is not one of "CONSTANT", "REFLECT", or "SYMMETRIC".
+  """
+  return pad(tensor, paddings, mode, name, constant_values)
+
+
+@tf_export(v1=["pad"])
 def pad(tensor, paddings, mode="CONSTANT", name=None, constant_values=0):  # pylint: disable=invalid-name
   """Pads a tensor.
 
@@ -2382,7 +2656,7 @@ def required_space_to_batch_paddings(input_shape,
     return result_paddings, result_crops
 
 
-@tf_export("nn.space_to_batch", v1=["nn.space_to_batch", "space_to_batch"])
+@tf_export(v1=["nn.space_to_batch", "space_to_batch"])
 @deprecation.deprecated_endpoints("space_to_batch")
 def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=redefined-builtin
   result = space_to_batch_nd(
@@ -2397,7 +2671,15 @@ def space_to_batch(input, paddings, block_size, name=None):  # pylint: disable=r
 space_to_batch.__doc__ = gen_array_ops.space_to_batch.__doc__
 
 
-@tf_export("nn.space_to_depth", v1=["nn.space_to_depth", "space_to_depth"])
+@tf_export("space_to_batch", "nn.space_to_batch", v1=[])
+def space_to_batch_v2(input, block_shape, paddings, name=None):  # pylint: disable=redefined-builtin
+  return space_to_batch_nd(input, block_shape, paddings, name)
+
+
+space_to_batch_v2.__doc__ = gen_array_ops.space_to_batch_nd.__doc__
+
+
+@tf_export(v1=["nn.space_to_depth", "space_to_depth"])
 @deprecation.deprecated_endpoints("space_to_depth")
 def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
@@ -2406,7 +2688,15 @@ def space_to_depth(input, block_size, name=None, data_format="NHWC"):  # pylint:
 space_to_depth.__doc__ = gen_array_ops.space_to_depth.__doc__
 
 
-@tf_export("nn.depth_to_space", v1=["nn.depth_to_space", "depth_to_space"])
+@tf_export("nn.space_to_depth", v1=[])
+def space_to_depth_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.space_to_depth(input, block_size, data_format, name=name)
+
+
+space_to_depth_v2.__doc__ = gen_array_ops.space_to_depth.__doc__
+
+
+@tf_export(v1=["nn.depth_to_space", "depth_to_space"])
 @deprecation.deprecated_endpoints("depth_to_space")
 def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint: disable=redefined-builtin
   return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
@@ -2415,7 +2705,15 @@ def depth_to_space(input, block_size, name=None, data_format="NHWC"):  # pylint:
 depth_to_space.__doc__ = gen_array_ops.depth_to_space.__doc__
 
 
-@tf_export("batch_to_space")
+@tf_export("nn.depth_to_space", v1=[])
+def depth_to_space_v2(input, block_size, data_format="NHWC", name=None):  # pylint: disable=redefined-builtin
+  return gen_array_ops.depth_to_space(input, block_size, data_format, name=name)
+
+
+depth_to_space_v2.__doc__ = gen_array_ops.depth_to_space.__doc__
+
+
+@tf_export(v1=["batch_to_space"])
 def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=redefined-builtin
   result = batch_to_space_nd(
       input,
@@ -2429,6 +2727,151 @@ def batch_to_space(input, crops, block_size, name=None):  # pylint: disable=rede
 batch_to_space.__doc__ = gen_array_ops.batch_to_space.__doc__
 
 
+@tf_export("batch_to_space", v1=[])
+def batch_to_space_v2(input, block_shape, crops, name=None):  # pylint: disable=redefined-builtin
+  """BatchToSpace for N-D tensors of type T.
+
+  This operation reshapes the "batch" dimension 0 into `M + 1` dimensions of
+  shape `block_shape + [batch]`, interleaves these blocks back into the grid
+  defined by the spatial dimensions `[1, ..., M]`, to obtain a result with the
+  same rank as the input.  The spatial dimensions of this intermediate result
+  are then optionally cropped according to `crops` to produce the output.  This
+  is the reverse of SpaceToBatch.  See below for a precise description.
+
+  Args:
+    input: A `Tensor`.
+      N-D with shape `input_shape = [batch] + spatial_shape + remaining_shape`,
+      where spatial_shape has M dimensions.
+    block_shape: A `Tensor`. Must be one of the following types:
+      `int32`, `int64`. 1-D with shape `[M]`, all values must be >= 1.
+      For backwards compatibility with TF 1.0, this parameter may be an int, in
+      which case it is converted to
+      `numpy.array([block_shape, block_shape], dtype=numpy.int64)`.
+    crops: A `Tensor`. Must be one of the following types: `int32`, `int64`.
+      2-D with shape `[M, 2]`, all values must be >= 0.
+        `crops[i] = [crop_start, crop_end]` specifies the amount to crop from
+        input dimension `i + 1`, which corresponds to spatial dimension `i`.  It
+        is required that
+        `crop_start[i] + crop_end[i] <= block_shape[i] * input_shape[i + 1]`.
+
+      This operation is equivalent to the following steps:
+
+      1. Reshape `input` to `reshaped` of shape:
+           [block_shape[0], ..., block_shape[M-1],
+            batch / prod(block_shape),
+            input_shape[1], ..., input_shape[N-1]]
+
+      2. Permute dimensions of `reshaped` to produce `permuted` of shape
+           [batch / prod(block_shape),
+
+            input_shape[1], block_shape[0],
+            ...,
+            input_shape[M], block_shape[M-1],
+
+            input_shape[M+1], ..., input_shape[N-1]]
+
+      3. Reshape `permuted` to produce `reshaped_permuted` of shape
+           [batch / prod(block_shape),
+
+            input_shape[1] * block_shape[0],
+            ...,
+            input_shape[M] * block_shape[M-1],
+
+            input_shape[M+1],
+            ...,
+            input_shape[N-1]]
+
+      4. Crop the start and end of dimensions `[1, ..., M]` of
+         `reshaped_permuted` according to `crops` to produce the
+         output of shape:
+           [batch / prod(block_shape),
+
+            input_shape[1] * block_shape[0] - crops[0,0] - crops[0,1],
+            ...,
+            input_shape[M] * block_shape[M-1] - crops[M-1,0] - crops[M-1,1],
+
+            input_shape[M+1], ..., input_shape[N-1]]
+
+      Some examples:
+
+      (1) For the following input of shape `[4, 1, 1, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      [[[[1]]], [[[2]]], [[[3]]], [[[4]]]]
+      ```
+
+      The output tensor has shape `[1, 2, 2, 1]` and value:
+
+      ```
+      x = [[[[1], [2]], [[3], [4]]]]
+      ```
+
+      (2) For the following input of shape `[4, 1, 1, 3]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      [[[1, 2, 3]], [[4, 5, 6]], [[7, 8, 9]], [[10, 11, 12]]]
+      ```
+
+      The output tensor has shape `[1, 2, 2, 3]` and value:
+
+      ```
+      x = [[[[1, 2, 3], [4, 5, 6]],
+            [[7, 8, 9], [10, 11, 12]]]]
+      ```
+
+      (3) For the following input of shape `[4, 2, 2, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [0, 0]]`:
+
+      ```
+      x = [[[[1], [3]], [[9], [11]]],
+           [[[2], [4]], [[10], [12]]],
+           [[[5], [7]], [[13], [15]]],
+           [[[6], [8]], [[14], [16]]]]
+      ```
+
+      The output tensor has shape `[1, 4, 4, 1]` and value:
+
+      ```
+      x = [[[1],   [2],  [3],  [4]],
+           [[5],   [6],  [7],  [8]],
+           [[9],  [10], [11],  [12]],
+           [[13], [14], [15],  [16]]]
+      ```
+
+      (4) For the following input of shape `[8, 1, 3, 1]`,
+          `block_shape = [2, 2]`, and `crops = [[0, 0], [2, 0]]`:
+
+      ```
+      x = [[[[0], [1], [3]]], [[[0], [9], [11]]],
+           [[[0], [2], [4]]], [[[0], [10], [12]]],
+           [[[0], [5], [7]]], [[[0], [13], [15]]],
+           [[[0], [6], [8]]], [[[0], [14], [16]]]]
+      ```
+
+      The output tensor has shape `[2, 2, 4, 1]` and value:
+
+      ```
+      x = [[[[1],   [2],  [3],  [4]],
+            [[5],   [6],  [7],  [8]]],
+           [[[9],  [10], [11],  [12]],
+            [[13], [14], [15],  [16]]]]
+      ```
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if isinstance(block_shape, int):
+    block_shape = np.array([block_shape, block_shape], dtype=np.int64)
+
+  return batch_to_space_nd(input=input,
+                           block_shape=block_shape,
+                           crops=crops,
+                           name=name)
+
+
 @tf_export("one_hot")
 def one_hot(indices,
             depth,
@@ -2652,7 +3095,7 @@ def sequence_mask(lengths, maxlen=None, dtype=dtypes.bool, name=None):
       return gen_math_ops.cast(result, dtype)
 
 
-@tf_export("squeeze")
+@tf_export(v1=["squeeze"])
 @deprecation.deprecated_args(None, "Use the `axis` argument instead",
                              "squeeze_dims")
 def squeeze(input, axis=None, name=None, squeeze_dims=None):
@@ -2702,7 +3145,14 @@ def squeeze(input, axis=None, name=None, squeeze_dims=None):
   return gen_array_ops.squeeze(input, axis, name)
 
 
+@tf_export("squeeze", v1=[])
+def squeeze_v2(input, axis=None, name=None):
+  # pylint: disable=redefined-builtin
+  return squeeze(input, axis, name)
+
+
 @tf_export("where")
+@dispatch.add_dispatch_support
 def where(condition, x=None, y=None, name=None):
   """Return the elements, either from `x` or `y`, depending on the `condition`.
 
@@ -2756,7 +3206,7 @@ def where(condition, x=None, y=None, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("reverse_sequence")
+@tf_export(v1=["reverse_sequence"])
 @deprecation.deprecated_args(
     None, "seq_dim is deprecated, use seq_axis instead", "seq_dim")
 @deprecation.deprecated_args(
@@ -2780,15 +3230,32 @@ def reverse_sequence(input,
       name=name)
 
 
-# pylint: enable=redefined-builtin
-
 reverse_sequence.__doc__ = deprecation.rewrite_argument_docstring(
     deprecation.rewrite_argument_docstring(
         gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
     "seq_dim", "seq_axis")
 
 
-@tf_export("gather")
+@tf_export("reverse_sequence", v1=[])
+def reverse_sequence_v2(
+    input, seq_lengths, seq_axis=None, batch_axis=None, name=None):
+  return gen_array_ops.reverse_sequence(
+      input=input,
+      seq_lengths=seq_lengths,
+      seq_dim=seq_axis,
+      batch_dim=batch_axis,
+      name=name)
+
+
+reverse_sequence_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        gen_array_ops.reverse_sequence.__doc__, "batch_dim", "batch_axis"),
+    "seq_dim", "seq_axis")
+
+# pylint: enable=redefined-builtin
+
+
+@tf_export(v1=["gather"])
 def gather(params, indices, validate_indices=None, name=None, axis=0):
   del validate_indices
   if axis != 0:
@@ -2804,10 +3271,19 @@ def gather(params, indices, validate_indices=None, name=None, axis=0):
     return gen_array_ops.gather_v2(params, indices, axis, name=name)
 
 
-gather.__doc__ = gen_array_ops.gather_v2.__doc__
+@tf_export("gather", v1=[])
+@dispatch.add_dispatch_support
+def gather_v2(params, indices, validate_indices=None, axis=0, name=None):
+  return gather(params, indices, validate_indices=validate_indices, name=name,
+                axis=axis)
+
+
+gather.__doc__ = gather_v2.__doc__ = gen_array_ops.gather_v2.__doc__
+
 
 
 @tf_export("batch_gather")
+@dispatch.add_dispatch_support
 def batch_gather(params, indices, name=None):
   """Gather slices from `params` according to `indices` with leading batch dims.
 
@@ -2885,7 +3361,7 @@ def batch_gather(params, indices, name=None):
 
 # Define quantize_v2 here in order to make name the second-to-last attribute,
 # because round_mode was added later.
-@tf_export("quantize_v2")
+@tf_export(v1=["quantize_v2"])
 @deprecation.deprecated(
     "2017-10-25",
     "`tf.quantize_v2` is deprecated, please use `tf.quantization.quantize` "
@@ -2906,7 +3382,7 @@ def quantize_v2(input,  # pylint: disable=redefined-builtin
                                    round_mode=round_mode)
 
 
-quantize_v2.__doc__ = """Please use `tf.quantize` instead."""
+quantize_v2.__doc__ = """Please use `tf.quantization.quantize` instead."""
 
 
 # We want to expose tf.quantize instead of tf.quantize_v2; we can deprecate
@@ -2992,3 +3468,48 @@ def searchsorted(sorted_sequence,
 
 
 quantize.__doc__ = gen_array_ops.quantize_v2.__doc__
+
+
+@tf_export("image.extract_image_patches", v1=[])
+def extract_image_patches_v2(
+    images,
+    sizes,
+    strides,
+    rates,
+    padding,
+    name=None):
+  # pylint: disable=line-too-long
+  r"""Extract `patches` from `images` and put them in the \"depth\" output dimension.
+
+  Args:
+    images: A 4-D Tensor with shape `[batch, in_rows, in_cols, depth]
+    sizes: The size of the sliding window for each dimension of `images`.
+    strides: A 1-D Tensor of length 4. How far the centers of two consecutive
+      patches are in the images. Must be: `[1, stride_rows, stride_cols, 1]`.
+    rates: A 1-D Tensor of length 4. Must be: `[1, rate_rows, rate_cols, 1]`.
+      This is the input stride, specifying how far two consecutive patch samples
+      are in the input. Equivalent to extracting patches with `patch_sizes_eff =
+      patch_sizes + (patch_sizes - 1) * (rates - 1)`, followed by subsampling
+      them spatially by a factor of `rates`. This is equivalent to `rate` in
+      dilated (a.k.a. Atrous) convolutions.
+    padding: The type of padding algorithm to use.
+      We specify the size-related attributes as: ```python ksizes = [1,
+        ksize_rows, ksize_cols, 1] strides = [1, strides_rows, strides_cols, 1]
+        rates = [1, rates_rows, rates_cols, 1]
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D Tensor. Has the same type as `images`, and with shape `[batch,
+    out_rows, out_cols, ksize_rows * ksize_cols * depth]` containing image
+    patches with size `ksize_rows x ksize_cols x depth` vectorized in the
+    \"depth\" dimension. Note `out_rows` and `out_cols` are the dimensions of
+    the output patches.
+  """
+  # pylint: enable=line-too-long
+  return gen_array_ops.extract_image_patches(
+      images, sizes, strides, rates, padding, name)
+
+extract_image_patches_deprecation = deprecation.deprecated_args(
+    None, "ksizes is deprecated, use sizes instead", "ksizes")
+tf_export(v1=["image.extract_image_patches", "extract_image_patches"])(
+    extract_image_patches_deprecation(gen_array_ops.extract_image_patches))
diff --git a/tensorflow/python/ops/bitwise_ops_test.py b/tensorflow/python/ops/bitwise_ops_test.py
index dfb40db2d5ae4ad4a319283d6f376c78e05271fa..d154b6759bfbc50ad2e5ea34e4f04b945ef2d397 100644
--- a/tensorflow/python/ops/bitwise_ops_test.py
+++ b/tensorflow/python/ops/bitwise_ops_test.py
@@ -34,6 +34,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
   def __init__(self, method_name="runTest"):
     super(BitwiseOpTest, self).__init__(method_name)
 
+  @test_util.run_deprecated_v1
   def testBinaryOps(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -59,16 +60,18 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
                   2**31 - 1, 2**31, 2**32 - 1, 2**32, -2**32 + 1, -2**32,
                   -2**63 + 1, 2**63 - 1]
     def count_bits(x):
-      return sum([bin(z).count("1") for z in six.iterbytes(x.tobytes())])
+      return sum(bin(z).count("1") for z in six.iterbytes(x.tobytes()))
     for dtype in dtype_list:
       with self.cached_session(use_gpu=True) as sess:
         print("PopulationCount test: ", dtype)
         inputs = np.array(raw_inputs, dtype=dtype.as_numpy_dtype)
         truth = [count_bits(x) for x in inputs]
         input_tensor = constant_op.constant(inputs, dtype=dtype)
-        popcnt_result = sess.run(gen_bitwise_ops.population_count(input_tensor))
+        popcnt_result = self.evaluate(
+            gen_bitwise_ops.population_count(input_tensor))
         self.assertAllEqual(truth, popcnt_result)
 
+  @test_util.run_deprecated_v1
   def testInvertOp(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16, dtypes.uint32, dtypes.uint64]
@@ -89,10 +92,11 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(not_a_or_a, [not_0] * 4)
         # For unsigned dtypes let's also check the result directly.
         if dtype.is_unsigned:
-          inverted = sess.run(bitwise_ops.invert(input_tensor))
+          inverted = self.evaluate(bitwise_ops.invert(input_tensor))
           expected = [dtype.max - x for x in inputs]
           self.assertAllEqual(inverted, expected)
 
+  @test_util.run_deprecated_v1
   def testShiftsWithPositiveLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64,
                   np.uint8, np.uint16, np.uint32, np.uint64]
@@ -107,6 +111,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(left_shift_result, np.left_shift(lhs, rhs))
         self.assertAllEqual(right_shift_result, np.right_shift(lhs, rhs))
 
+  @test_util.run_deprecated_v1
   def testShiftsWithNegativeLHS(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64]
 
@@ -120,6 +125,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(left_shift_result, np.left_shift(lhs, rhs))
         self.assertAllEqual(right_shift_result, np.right_shift(lhs, rhs))
 
+  @test_util.run_deprecated_v1
   def testImplementationDefinedShiftsDoNotCrash(self):
     dtype_list = [np.int8, np.int16, np.int32, np.int64]
 
@@ -135,6 +141,7 @@ class BitwiseOpTest(test_util.TensorFlowTestCase):
                   bitwise_ops.right_shift(lhs, rhs)])
 
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     dtype_list = [dtypes.int8, dtypes.int16, dtypes.int32, dtypes.int64,
                   dtypes.uint8, dtypes.uint16]
diff --git a/tensorflow/python/ops/candidate_sampling_ops.py b/tensorflow/python/ops/candidate_sampling_ops.py
index f0bfdb2b7a3c57b80e3ef01fa91da12b99cdb3d9..56f76a49d51bec99d35593041f3e72c2fcb580a4 100644
--- a/tensorflow/python/ops/candidate_sampling_ops.py
+++ b/tensorflow/python/ops/candidate_sampling_ops.py
@@ -151,7 +151,10 @@ def log_uniform_candidate_sampler(true_classes, num_true, num_sampled, unique,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.learned_unigram_candidate_sampler')
+@tf_export(
+    'random.learned_unigram_candidate_sampler',
+    'nn.learned_unigram_candidate_sampler')
+@deprecation.deprecated_endpoints(['nn.learned_unigram_candidate_sampler'])
 def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
                                       unique, range_max, seed=None, name=None):
   """Samples a set of classes from a distribution learned during training.
@@ -208,7 +211,8 @@ def learned_unigram_candidate_sampler(true_classes, num_true, num_sampled,
       seed2=seed2, name=name)
 
 
-@tf_export('nn.fixed_unigram_candidate_sampler')
+@tf_export('random.fixed_unigram_candidate_sampler',
+           'nn.fixed_unigram_candidate_sampler')
 def fixed_unigram_candidate_sampler(true_classes,
                                     num_true,
                                     num_sampled,
@@ -300,7 +304,7 @@ def fixed_unigram_candidate_sampler(true_classes,
       unigrams=unigrams, seed=seed1, seed2=seed2, name=name)
 
 
-@tf_export('nn.all_candidate_sampler')
+@tf_export('random.all_candidate_sampler', 'nn.all_candidate_sampler')
 def all_candidate_sampler(true_classes, num_true, num_sampled, unique,
                           seed=None, name=None):
   """Generate the set of all classes.
diff --git a/tensorflow/python/ops/check_ops.py b/tensorflow/python/ops/check_ops.py
index 5589bbc848597ce5d0b0d2a30375fbe2df1d177a..f1f36269cf2bd9bcd3d25638a82d776850bc6bb8 100644
--- a/tensorflow/python/ops/check_ops.py
+++ b/tensorflow/python/ops/check_ops.py
@@ -119,9 +119,31 @@ def assert_proper_iterable(values):
         'Expected argument "values" to be iterable.  Found: %s' % type(values))
 
 
-@tf_export(
-    'debugging.assert_negative',
-    v1=['debugging.assert_negative', 'assert_negative'])
+@tf_export('debugging.assert_negative', v1=[])
+def assert_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x < 0` holds element-wise.
+
+  This Op checks that `x[i] < 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not negative everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] < 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_negative(x=x, message=message, summarize=summarize, name=name)
+
+
+@tf_export(v1=['debugging.assert_negative', 'assert_negative'])
 @deprecation.deprecated_endpoints('assert_negative')
 def assert_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < 0` holds element-wise.
@@ -163,9 +185,31 @@ def assert_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less(x, zero, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_positive',
-    v1=['debugging.assert_positive', 'assert_positive'])
+@tf_export('debugging.assert_positive', v1=[])
+def assert_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x > 0` holds element-wise.
+
+  This Op checks that `x[i] > 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not positive everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] > 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_positive', 'assert_positive'])
 @deprecation.deprecated_endpoints('assert_positive')
 def assert_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > 0` holds element-wise.
@@ -206,9 +250,32 @@ def assert_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less(zero, x, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_non_negative',
-    v1=['debugging.assert_non_negative', 'assert_non_negative'])
+@tf_export('debugging.assert_non_negative', v1=[])
+def assert_non_negative_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x >= 0` holds element-wise.
+
+  This Op checks that `x[i] >= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not >= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_negative".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] >= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_negative(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_negative', 'assert_non_negative'])
 @deprecation.deprecated_endpoints('assert_non_negative')
 def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x >= 0` holds element-wise.
@@ -251,9 +318,32 @@ def assert_non_negative(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(zero, x, data=data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_non_positive',
-    v1=['debugging.assert_non_positive', 'assert_non_positive'])
+@tf_export('debugging.assert_non_positive', v1=[])
+def assert_non_positive_v2(x, message=None, summarize=None, name=None):
+  """Assert the condition `x <= 0` holds element-wise.
+
+  This Op checks that `x[i] <= 0` holds for every element of `x`. If `x` is
+  empty, this is trivially satisfied.
+
+  If `x` is not <= 0 everywhere, `message`, as well as the first `summarize`
+  entries of `x` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+      "assert_non_positive".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x[i] <= 0` is False. The check can be performed immediately during eager
+      execution or if `x` is statically known.
+  """
+  assert_non_positive(x=x, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_non_positive', 'assert_non_positive'])
 @deprecation.deprecated_endpoints('assert_non_positive')
 def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= 0` holds element-wise.
@@ -296,7 +386,33 @@ def assert_non_positive(x, data=None, summarize=None, message=None, name=None):
     return assert_less_equal(x, zero, data=data, summarize=summarize)
 
 
-@tf_export('debugging.assert_equal', 'assert_equal')
+@tf_export('debugging.assert_equal', 'assert_equal', v1=[])
+def assert_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x == y` holds element-wise.
+
+  This Op checks that `x[i] == y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` and `y` are not equal, `message`, as well as the first `summarize`
+  entries of `x` and `y` are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x == y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_equal', 'assert_equal'])
 def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x == y` holds element-wise.
 
@@ -396,9 +512,36 @@ def assert_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_none_equal',
-    v1=['debugging.assert_none_equal', 'assert_none_equal'])
+@tf_export('debugging.assert_none_equal', v1=[])
+def assert_none_equal_v2(x, y, summarize=None, message=None, name=None):
+  """Assert the condition `x != y` holds for all elements.
+
+  This Op checks that `x[i] != y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If any elements of `x` and `y` are equal, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    summarize: Print this many entries of each tensor.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+    "assert_none_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+  """
+  assert_none_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_none_equal', 'assert_none_equal'])
 @deprecation.deprecated_endpoints('assert_none_equal')
 def assert_none_equal(
     x, y, data=None, summarize=None, message=None, name=None):
@@ -450,7 +593,52 @@ def assert_none_equal(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_near', v1=['debugging.assert_near', 'assert_near'])
+@tf_export('debugging.assert_near', v1=[])
+def assert_near_v2(x, y, rtol=None, atol=None, message=None, summarize=None,
+                   name=None):
+  """Assert the condition `x` and `y` are close element-wise.
+
+  This Op checks that `x[i] - y[i] < atol + rtol * tf.abs(y[i])` holds for every
+  pair of (possibly broadcast) elements of `x` and `y`. If both `x` and `y` are
+  empty, this is trivially satisfied.
+
+  If any elements of `x` and `y` are not close, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError`
+  is raised.
+
+  The default `atol` and `rtol` is `10 * eps`, where `eps` is the smallest
+  representable positive number such that `1 + eps != 1`.  This is about
+  `1.2e-6` in `32bit`, `2.22e-15` in `64bit`, and `0.00977` in `16bit`.
+  See `numpy.finfo`.
+
+  Args:
+    x: Float or complex `Tensor`.
+    y: Float or complex `Tensor`, same dtype as and broadcastable to `x`.
+    rtol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The relative tolerance.  Default is `10 * eps`.
+    atol:  `Tensor`.  Same `dtype` as, and broadcastable to, `x`.
+      The absolute tolerance.  Default is `10 * eps`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_near".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x != y` is False for any pair of elements in `x` and `y`. The check can
+      be performed immediately during eager execution or if `x` and `y` are
+      statically known.
+
+  @compatibility(numpy)
+  Similar to `numpy.assert_allclose`, except tolerance depends on data type.
+  This is due to the fact that `TensorFlow` is often used with `32bit`, `64bit`,
+  and even `16bit` data.
+  @end_compatibility
+  """
+  assert_near(x=x, y=y, rtol=rtol, atol=atol, summarize=summarize,
+              message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_near', 'assert_near'])
 @deprecation.deprecated_endpoints('assert_near')
 def assert_near(
     x, y, rtol=None, atol=None, data=None, summarize=None, message=None,
@@ -529,7 +717,34 @@ def assert_near(
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_less', 'assert_less')
+@tf_export('debugging.assert_less', 'assert_less', v1=[])
+def assert_less_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x < y` holds element-wise.
+
+  This Op checks that `x[i] < y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_less".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x < y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less', 'assert_less'])
 def assert_less(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x < y` holds element-wise.
 
@@ -577,9 +792,34 @@ def assert_less(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_less_equal',
-    v1=['debugging.assert_less_equal', 'assert_less_equal'])
+@tf_export('debugging.assert_less_equal', v1=[])
+def assert_less_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x <= y` holds element-wise.
+
+  This Op checks that `x[i] <= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not less or equal than `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional). Defaults to "assert_less_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x <= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_less_equal(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_less_equal', 'assert_less_equal'])
 @deprecation.deprecated_endpoints('assert_less_equal')
 def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x <= y` holds element-wise.
@@ -628,7 +868,34 @@ def assert_less_equal(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_greater', 'assert_greater')
+@tf_export('debugging.assert_greater', 'assert_greater', v1=[])
+def assert_greater_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x > y` holds element-wise.
+
+  This Op checks that `x[i] > y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater than `y` element-wise, `message`, as well as the first
+  `summarize` entries of `x` and `y` are printed, and `InvalidArgumentError` is
+  raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to "assert_greater".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x > y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater(x=x, y=y, summarize=summarize, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_greater', 'assert_greater'])
 def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
   """Assert the condition `x > y` holds element-wise.
 
@@ -676,9 +943,36 @@ def assert_greater(x, y, data=None, summarize=None, message=None, name=None):
     return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_greater_equal',
-    v1=['debugging.assert_greater_equal', 'assert_greater_equal'])
+@tf_export('debugging.assert_greater_equal', v1=[])
+def assert_greater_equal_v2(x, y, message=None, summarize=None, name=None):
+  """Assert the condition `x >= y` holds element-wise.
+
+  This Op checks that `x[i] >= y[i]` holds for every pair of (possibly
+  broadcast) elements of `x` and `y`. If both `x` and `y` are empty, this is
+  trivially satisfied.
+
+  If `x` is not greater or equal to `y` element-wise, `message`, as well as the
+  first `summarize` entries of `x` and `y` are printed, and
+  `InvalidArgumentError` is raised.
+
+  Args:
+    x:  Numeric `Tensor`.
+    y:  Numeric `Tensor`, same dtype as and broadcastable to `x`.
+    message: A string to prefix to the default message.
+    summarize: Print this many entries of each tensor.
+    name: A name for this operation (optional).  Defaults to
+    "assert_greater_equal".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x >= y` is False. The check can be performed immediately during eager
+      execution or if `x` and `y` are statically known.
+  """
+  assert_greater_equal(x=x, y=y, summarize=summarize, message=message,
+                       name=name)
+
+
+@tf_export(v1=['debugging.assert_greater_equal', 'assert_greater_equal'])
 @deprecation.deprecated_endpoints('assert_greater_equal')
 def assert_greater_equal(x, y, data=None, summarize=None, message=None,
                          name=None):
@@ -777,7 +1071,31 @@ def _assert_rank_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export('debugging.assert_rank', 'assert_rank')
+@tf_export('debugging.assert_rank', 'assert_rank', v1=[])
+def assert_rank_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank equal to `rank`.
+
+  This Op checks that the rank of `x` is equal to `rank`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to
+      "assert_rank".
+
+  Raises:
+    InvalidArgumentError: if the check can be performed immediately and
+      `x` does not have rank `rank`. The check can be performed immediately
+      during eager execution or if the shape of `x` is statically known.
+  """
+  assert_rank(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank', 'assert_rank'])
 def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   """Assert `x` has rank equal to `rank`.
 
@@ -792,7 +1110,7 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
     x:  Numeric `Tensor`.
     rank:  Scalar integer `Tensor`.
     data:  The tensors to print out if the condition is False.  Defaults to
-      error message and first few entries of `x`.
+      error message and the shape of `x`.
     summarize: Print this many entries of each tensor.
     message: A string to prefix to the default message.
     name: A name for this operation (optional).  Defaults to "assert_rank".
@@ -839,9 +1157,31 @@ def assert_rank(x, rank, data=None, summarize=None, message=None, name=None):
   return assert_op
 
 
-@tf_export(
-    'debugging.assert_rank_at_least',
-    v1=['debugging.assert_rank_at_least', 'assert_rank_at_least'])
+@tf_export('debugging.assert_rank_at_least', v1=[])
+def assert_rank_at_least_v2(x, rank, message=None, name=None):
+  """Assert that `x` has rank of at least `rank`.
+
+  This Op checks that the rank of `x` is greater or equal to `rank`.
+
+  If `x` has a rank lower than `rank`, `message`, as well as the shape of `x`
+  are printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    rank: Scalar integer `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional).  Defaults to
+      "assert_rank_at_least".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank at least `rank`, but the rank
+      cannot be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_at_least(x=x, rank=rank, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_at_least', 'assert_rank_at_least'])
 @deprecation.deprecated_endpoints('assert_rank_at_least')
 def assert_rank_at_least(
     x, rank, data=None, summarize=None, message=None, name=None):
@@ -973,9 +1313,30 @@ def _assert_ranks_condition(
   return control_flow_ops.Assert(condition, data, summarize=summarize)
 
 
-@tf_export(
-    'debugging.assert_rank_in',
-    v1=['debugging.assert_rank_in', 'assert_rank_in'])
+@tf_export('debugging.assert_rank_in', v1=[])
+def assert_rank_in_v2(x, ranks, message=None, name=None):
+  """Assert that `x` has a rank in `ranks`.
+
+  This Op checks that the rank of `x` is in `ranks`.
+
+  If `x` has a different rank, `message`, as well as the shape of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: `Tensor`.
+    ranks: `Iterable` of scalar `Tensor` objects.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_rank_in".
+
+  Raises:
+    InvalidArgumentError: `x` does not have rank in `ranks`, but the rank cannot
+      be statically determined.
+    ValueError: If static checks determine `x` has mismatched rank.
+  """
+  assert_rank_in(x=x, ranks=ranks, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_rank_in', 'assert_rank_in'])
 @deprecation.deprecated_endpoints('assert_rank_in')
 def assert_rank_in(
     x, ranks, data=None, summarize=None, message=None, name=None):
@@ -1038,9 +1399,25 @@ def assert_rank_in(
   return assert_op
 
 
-@tf_export(
-    'debugging.assert_integer',
-    v1=['debugging.assert_integer', 'assert_integer'])
+@tf_export('debugging.assert_integer', v1=[])
+def assert_integer_v2(x, message=None, name=None):
+  """Assert that `x` is of integer dtype.
+
+  If `x` has a non-integer type, `message`, as well as the dtype of `x` are
+  printed, and `InvalidArgumentError` is raised.
+
+  Args:
+    x: A `Tensor`.
+    message: A string to prefix to the default message.
+    name: A name for this operation (optional). Defaults to "assert_integer".
+
+  Raises:
+    TypeError:  If `x.dtype` is not a non-quantized integer type.
+  """
+  assert_integer(x=x, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_integer', 'assert_integer'])
 @deprecation.deprecated_endpoints('assert_integer')
 def assert_integer(x, message=None, name=None):
   """Assert that `x` is of integer dtype.
@@ -1079,13 +1456,30 @@ def assert_integer(x, message=None, name=None):
     return control_flow_ops.no_op('statically_determined_was_integer')
 
 
-@tf_export('debugging.assert_type', v1=['debugging.assert_type', 'assert_type'])
+@tf_export('debugging.assert_type', v1=[])
+def assert_type_v2(tensor, tf_type, message=None, name=None):
+  """Asserts that the given `Tensor` is of the specified type.
+
+  Args:
+    tensor: A `Tensor`.
+    tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
+      etc).
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_type"
+
+  Raises:
+    TypeError: If the tensor's data type doesn't match `tf_type`.
+  """
+  assert_type(tensor=tensor, tf_type=tf_type, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_type', 'assert_type'])
 @deprecation.deprecated_endpoints('assert_type')
 def assert_type(tensor, tf_type, message=None, name=None):
   """Statically asserts that the given `Tensor` is of the specified type.
 
   Args:
-    tensor: A tensorflow `Tensor`.
+    tensor: A `Tensor`.
     tf_type: A tensorflow type (`dtypes.float32`, `tf.int64`, `dtypes.bool`,
       etc).
     message: A string to prefix to the default message.
@@ -1136,9 +1530,13 @@ def is_numeric_tensor(tensor):
 
 
 @tf_export(
-    'debugging.is_non_decreasing',
-    v1=['debugging.is_non_decreasing', 'is_non_decreasing'])
-@deprecation.deprecated_endpoints('is_non_decreasing')
+    'math.is_non_decreasing',
+    v1=[
+        'math.is_non_decreasing', 'debugging.is_non_decreasing',
+        'is_non_decreasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_non_decreasing',
+                                  'is_non_decreasing')
 def is_non_decreasing(x, name=None):
   """Returns `True` if `x` is non-decreasing.
 
@@ -1166,9 +1564,13 @@ def is_non_decreasing(x, name=None):
 
 
 @tf_export(
-    'debugging.is_strictly_increasing',
-    v1=['debugging.is_strictly_increasing', 'is_strictly_increasing'])
-@deprecation.deprecated_endpoints('is_strictly_increasing')
+    'math.is_strictly_increasing',
+    v1=[
+        'math.is_strictly_increasing', 'debugging.is_strictly_increasing',
+        'is_strictly_increasing'
+    ])
+@deprecation.deprecated_endpoints('debugging.is_strictly_increasing',
+                                  'is_strictly_increasing')
 def is_strictly_increasing(x, name=None):
   """Returns `True` if `x` is strictly increasing.
 
@@ -1260,8 +1662,10 @@ def assert_same_float_dtype(tensors=None, dtype=None):
     tensors: Tensors of input values. Can include `None` elements, which will be
         ignored.
     dtype: Expected type.
+
   Returns:
     Validated type.
+
   Raises:
     ValueError: if neither `tensors` nor `dtype` is supplied, or result is not
         float, or the common type of the inputs is not a floating point type.
@@ -1275,20 +1679,57 @@ def assert_same_float_dtype(tensors=None, dtype=None):
   return dtype
 
 
-@tf_export(
-    'debugging.assert_scalar', v1=['debugging.assert_scalar', 'assert_scalar'])
+@tf_export('debugging.assert_scalar', v1=[])
+def assert_scalar_v2(tensor, message=None, name=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    message: A string to prefix to the default message.
+    name:  A name for this operation. Defaults to "assert_scalar"
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
+  assert_scalar(tensor=tensor, message=message, name=name)
+
+
+@tf_export(v1=['debugging.assert_scalar', 'assert_scalar'])
 @deprecation.deprecated_endpoints('assert_scalar')
-def assert_scalar(tensor, name=None):
+def assert_scalar(tensor, name=None, message=None):
+  """Asserts that the given `tensor` is a scalar.
+
+  This function raises `ValueError` unless it can be certain that the given
+  `tensor` is a scalar. `ValueError` is also raised if the shape of `tensor` is
+  unknown.
+
+  Args:
+    tensor: A `Tensor`.
+    name:  A name for this operation. Defaults to "assert_scalar"
+    message: A string to prefix to the default message.
+
+  Returns:
+    The input tensor (potentially converted to a `Tensor`).
+
+  Raises:
+    ValueError: If the tensor is not scalar (rank 0), or if its shape is
+      unknown.
+  """
   with ops.name_scope(name, 'assert_scalar', [tensor]) as name_scope:
     tensor = ops.convert_to_tensor(tensor, name=name_scope)
     shape = tensor.get_shape()
     if shape.ndims != 0:
       if context.executing_eagerly():
-        raise ValueError('Expected scalar shape, saw shape: %s.'
-                         % (shape,))
+        raise ValueError('%sExpected scalar shape, saw shape: %s.'
+                         % (message or '', shape,))
       else:
-        raise ValueError('Expected scalar shape for %s, saw shape: %s.'
-                         % (tensor.name, shape))
+        raise ValueError('%sExpected scalar shape for %s, saw shape: %s.'
+                         % (message or '', tensor.name, shape))
     return tensor
 
 
diff --git a/tensorflow/python/ops/clip_ops.py b/tensorflow/python/ops/clip_ops.py
index 5cd626b92dc7711559ad1b9624eb9d818a4084a7..a237cfff826bf0fb4cacd0c25fe5d361e3d7b26e 100644
--- a/tensorflow/python/ops/clip_ops.py
+++ b/tensorflow/python/ops/clip_ops.py
@@ -31,10 +31,12 @@ from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 
 
 @tf_export("clip_by_value")
+@dispatch.add_dispatch_support
 def clip_by_value(t, clip_value_min, clip_value_max,
                   name=None):
   """Clips tensor values to a specified min and max.
@@ -300,7 +302,12 @@ def clip_by_global_norm(t_list, clip_norm, use_norm=None, name=None):
   return list_clipped, use_norm
 
 
-@tf_export("clip_by_average_norm")
+@deprecation.deprecated(
+    date=None,
+    instructions=
+    "clip_by_average_norm is deprecated in TensorFlow 2.0. Please use "
+    "clip_by_norm(t, clip_norm * tf.to_float(tf.size(t), name)) instead.")
+@tf_export(v1=["clip_by_average_norm"])
 def clip_by_average_norm(t, clip_norm, name=None):
   """Clips tensor values to a maximum average L2-norm.
 
diff --git a/tensorflow/python/ops/clip_ops_test.py b/tensorflow/python/ops/clip_ops_test.py
index 8aa9c4ffb3412ae48528baa356804db4ea41cfd3..a59a0c22d409e68a821c6b069827b15cd9ecca52 100644
--- a/tensorflow/python/ops/clip_ops_test.py
+++ b/tensorflow/python/ops/clip_ops_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import clip_ops
 from tensorflow.python.ops import numerics
 from tensorflow.python.platform import test
@@ -35,7 +36,7 @@ class ClipOpsTest(test.TestCase):
       input_op = constant_op.constant(inputs)
       clipped = clip_ops.clip_by_norm(input_op, max_norm)
       check_op = numerics.add_check_numerics_ops()
-      result, _ = sess.run([clipped, check_op])
+      result, _ = self.evaluate([clipped, check_op])
     self.assertAllClose(result, expected)
 
   def _testClipIndexedSlicesByNorm(self, values, indices, shape, max_norm,
@@ -54,9 +55,10 @@ class ClipOpsTest(test.TestCase):
       # Tensor mode
       dense_tensor = ops.convert_to_tensor(indixed_slices)
       dense_clipped = clip_ops.clip_by_norm(dense_tensor, max_norm, axes)
-      result, expected = sess.run([clipped, dense_clipped])
+      result, expected = self.evaluate([clipped, dense_clipped])
     self.assertAllClose(result, expected)
 
+  @test_util.run_deprecated_v1
   def testClipTensorByNorm(self):
     # Simple example
     self._testClipTensorByNorm([[-3.0, 0.0, 0.0], [4.0, 0.0, 0.0]], 4.0,
diff --git a/tensorflow/python/ops/collective_ops_test.py b/tensorflow/python/ops/collective_ops_test.py
index 9c772a93548b90eef0e52429545af393a06bcb4e..0fd9368d2194e875aa5c4ddfb716f0898d6a9c49 100644
--- a/tensorflow/python/ops/collective_ops_test.py
+++ b/tensorflow/python/ops/collective_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import collective_ops
 from tensorflow.python.platform import test
 
@@ -49,16 +50,19 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], expected, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], expected, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testCollectiveReduce(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], True)
 
+  @test_util.run_deprecated_v1
   def testCollectiveAutoGraphKey(self):
     self._testCollectiveReduce([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1],
                                [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3],
                                [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2], False)
 
+  @test_util.run_deprecated_v1
   def testCollectiveReduceScalar(self):
     self._testCollectiveReduce(0.1, 0.3, 0.2, True)
 
@@ -81,6 +85,7 @@ class CollectiveOpTest(test.TestCase):
     self.assertAllClose(results[0], t0, rtol=1e-5, atol=1e-5)
     self.assertAllClose(results[1], t0, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testCollectiveBroadcast(self):
     self._testCollectiveBroadcast([0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1])
 
diff --git a/tensorflow/python/ops/cond_v2.py b/tensorflow/python/ops/cond_v2.py
index 998c3e08f6f002b167310c607ac1960e993b6bd2..abc99c1205159bd4eb87e3a378fe95693ac84aa7 100644
--- a/tensorflow/python/ops/cond_v2.py
+++ b/tensorflow/python/ops/cond_v2.py
@@ -25,15 +25,19 @@ from __future__ import print_function
 
 import collections
 
-from tensorflow.core.framework import attr_value_pb2
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import func_graph as func_graph_module
 from tensorflow.python.framework import function_def_to_graph
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_util
 from tensorflow.python.ops import control_flow_util_v2 as util
+from tensorflow.python.ops import gen_dataset_ops
 from tensorflow.python.ops import gen_functional_ops
+from tensorflow.python.ops import gen_resource_variable_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.util import nest
+
 
 # NOTE(skyewm): TensorFlow uses protected class methods and fields to signify
 # that they aren't part of the official public API. These protected members
@@ -74,73 +78,14 @@ def cond_v2(pred, true_fn, false_fn, name="cond"):
             false_name, read_only_collections=False),
         add_control_dependencies=add_control_dependencies,
         op_return_value=pred)
-    _check_same_outputs(true_graph, false_graph)
-
-    # Add inputs to true_graph and false_graph to make them match. Note that
-    # this modifies true_graph and false_graph.
-    cond_inputs = _make_inputs_match(true_graph, false_graph,
-                                     true_graph.external_captures,
-                                     false_graph.external_captures)
-
-    # Add all intermediate tensors as function outputs so they're available for
-    # the gradient computation.
 
-    true_intermediates = _get_intermediates(true_graph)
-    false_intermediates = _get_intermediates(false_graph)
+    outputs = _build_cond(pred, true_graph, false_graph,
+                          true_graph.external_captures,
+                          false_graph.external_captures,
+                          name=scope)
 
-    # Save the original number of outputs to return to the caller.
-    num_cond_outputs = len(true_graph.outputs)
-
-    # Make the number/type of new intermediate outputs match.
-    extra_true_outputs, extra_false_outputs = _pad_params(
-        true_graph, false_graph, true_intermediates, false_intermediates)
-
-    true_graph.outputs.extend(extra_true_outputs)
-    false_graph.outputs.extend(extra_false_outputs)
-
-    # Create the If op.
-    tensors = gen_functional_ops._if(  # pylint: disable=protected-access
-        pred,
-        cond_inputs, [t.dtype for t in true_graph.outputs],
-        util.create_new_tf_function(true_graph),
-        util.create_new_tf_function(false_graph),
-        output_shapes=_get_output_shapes(true_graph.outputs,
-                                         false_graph.outputs),
-        name=scope)
-
-    # Set the flag to enable lowering on the `if` op if necessary
-    # Lowering allows cond_v2 to avoid some of the limitations of Functions,
-    # allowing users to specify devices & colocation inside of cond_v2 branches,
-    # and enabling non-strict evaluation & partial pruning of cond_v2 branches.
-    # This brings cond_v2 closer to feature parity with tf.cond.
-    #
-    # However, we do not lower `If` in the XLA context because it is easier for
-    # XLA to apply its own optimizations when dealing with un-lowered `If`
-    # operators than with lowered switch/merge control flow.
-    #
-    # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
-    if_op = tensors[0].op
-    if not control_flow_util.IsInXLAContext(if_op):
-      # pylint: disable=protected-access
-      if_op._set_attr("_lower_using_switch_merge",
-                      attr_value_pb2.AttrValue(b=True))
-      # pylint: enable=protected-access
-
-    # Return identities for each output of the If op, rather than the output of
-    # the If op directly. This makes pruning work if the output of cond() is
-    # fetched: the lowering pass converts the If outputs into IdentityN outputs,
-    # which if fetched will cause all ops in the taken branch to be run (since
-    # it takes all merge ops as input). After lowering, each output identity op
-    # will end up with only the appropriate merge op as input.
-    # TODO(b/79984175): this doesn't have to be a tuple once we covert to the
-    # correct output structure
-    tensors = tuple(array_ops.identity(t) for t in tensors)
-
-    result = tuple(tensors[:num_cond_outputs])
-    if len(result) == 1:
-      return result[0]
-    else:
-      return result
+    return func_graph_module.pack_sequence_as(true_graph.structured_outputs,
+                                              outputs)
 
 
 @ops.RegisterGradient("If")
@@ -163,44 +108,115 @@ def _IfGrad(op, *grads):  # pylint: disable=invalid-name
   assert ([t.dtype for t in true_grad_graph.outputs] ==
           [t.dtype for t in false_grad_graph.outputs])
 
+  if (true_grad_graph.if_op_needs_rewrite or
+      false_grad_graph.if_op_needs_rewrite):
+    # Modify 'op' to output the intermediates needed by the grad functions. Note
+    # that all needed intermediates are wrapped in optionals. Each optional
+    # intermediate output will have a value iff its corresponding branch is
+    # taken.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
+
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so output intermediates directly and
+      # make them match via FakeParams, which can be converted to zeros in XLA.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      true_intermediates = true_grad_graph.xla_intermediates
+      false_intermediates = false_grad_graph.xla_intermediates
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match_xla(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+    else:
+      true_intermediates = true_grad_graph.wrapped_intermediates
+      false_intermediates = false_grad_graph.wrapped_intermediates
+      # Make outputs match by adding none optionals.
+      extra_true_outputs, extra_false_outputs = _make_intermediates_match(
+          true_graph, false_graph, true_intermediates, false_intermediates)
+
+    true_graph.outputs.extend(extra_true_outputs)
+    false_graph.outputs.extend(extra_false_outputs)
+    # TODO(skyewm): indicate it's an internal bug if this fails.
+    _check_same_outputs(true_graph, false_graph)
+
+    true_graph.name += "_rewritten"
+    false_graph.name += "_rewritten"
+
+    op._set_func_attr("then_branch", util.create_new_tf_function(true_graph))
+    op._set_func_attr("else_branch", util.create_new_tf_function(false_graph))
+    op._set_type_list_attr("Tout", true_graph.output_types)
+    op._set_shape_list_attr("output_shapes", true_graph.output_shapes)
+    op._add_outputs(
+        [t.dtype for t in extra_true_outputs],
+        [t.shape for t in extra_true_outputs])
+
   # Resolve references to forward graph tensors in grad graphs and ensure
   # they are in-scope, i.e., belong to one of outer graphs of the grad graph.
   true_grad_inputs = _resolve_grad_inputs(true_graph, true_grad_graph)
   false_grad_inputs = _resolve_grad_inputs(false_graph, false_grad_graph)
 
-  # Make the inputs to true_grad_graph and false_grad_graph match. Note that
-  # this modifies true_grad_graph and false_grad_graph.
-  grad_inputs = _make_inputs_match(true_grad_graph, false_grad_graph,
-                                   true_grad_inputs, false_grad_inputs)
+  outputs = _build_cond(op.inputs[0], true_grad_graph, false_grad_graph,
+                        true_grad_inputs, false_grad_inputs)
 
-  # Add all intermediate tensors as function outputs so they're available for
-  # higher-order gradient computations.
+  # The predicate has no gradient.
+  return [None] + outputs
 
-  true_grad_intermediates = _get_intermediates(true_grad_graph)
-  false_grad_intermediates = _get_intermediates(false_grad_graph)
 
-  # Save the original number of gradient outputs to return.
-  num_grad_outputs = len(true_grad_graph.outputs)
+def _build_cond(pred, true_graph, false_graph, true_inputs, false_inputs,
+                name=None):
+  """Creates an If op from the specified predicate, branch functions and inputs.
 
-  # Make the number/type of new intermediate outputs match.
-  extra_true_grad_outputs, extra_false_grad_outputs = _pad_params(
-      true_grad_graph, false_grad_graph,
-      true_grad_intermediates, false_grad_intermediates)
+  Note that this modifies true_graph and false_graph to make the inputs match,
+  and to output all intermediates values so they're available for the gradient
+  computation.
 
-  true_grad_graph.outputs.extend(extra_true_grad_outputs)
-  false_grad_graph.outputs.extend(extra_false_grad_outputs)
+  true_graph and false_graph need not have the same input types, but they must
+  have the same outpute types.
 
-  # Create the gradient If op.
-  tensors = gen_functional_ops._if(
-      op.inputs[0],
-      grad_inputs, [t.dtype for t in true_grad_graph.outputs],
-      util.create_new_tf_function(true_grad_graph),
-      util.create_new_tf_function(false_grad_graph),
-      output_shapes=_get_output_shapes(true_grad_graph.outputs,
-                                       false_grad_graph.outputs))
+  Args:
+    pred: boolean Tensor
+    true_graph: FuncGraph
+    false_graph: FuncGraph
+    true_inputs: a list of Tensors to be passed to true_graph as input.
+    false_inputs: a list of Tensors to be passed to false_graph as input.
+    name: the name for the If op.
 
-  # The predicate has no gradient.
-  return [None] + tensors[:num_grad_outputs]
+  Returns:
+    A list of Tensors which are the outputs of the If op. Does not include added
+    intermediate outputs.
+  """
+  _check_same_outputs(true_graph, false_graph)
+
+  # Add inputs to true_graph and false_graph to make them match. Note that
+  # this modifies true_graph and false_graph.
+  cond_inputs = _make_inputs_match(true_graph, false_graph,
+                                   true_inputs, false_inputs)
+
+  # Create the If op.
+  tensors = gen_functional_ops._if(  # pylint: disable=protected-access
+      pred,
+      cond_inputs, [t.dtype for t in true_graph.outputs],
+      util.create_new_tf_function(true_graph),
+      util.create_new_tf_function(false_graph),
+      output_shapes=_get_output_shapes(true_graph.outputs,
+                                       false_graph.outputs),
+      name=name)
+
+  # TODO(b/110167197) this approach requires cond_v2 to have at least 1 output
+  if_op = tensors[0].op
+  util.maybe_set_lowering_attr(if_op)
+
+  # Return identities for each output of the If op, rather than the output of
+  # the If op directly. This makes pruning work if the output of cond() is
+  # fetched: the lowering pass converts the If outputs into IdentityN outputs,
+  # which if fetched will cause all ops in the taken branch to be run (since
+  # it takes all merge ops as input). After lowering, each output identity op
+  # will end up with only the appropriate merge op as input.
+  # TODO(b/79984175): this doesn't have to be a tuple once we covert to the
+  # correct output structure
+  tensors = [array_ops.identity(t) for t in tensors]
+
+  # Prevent fetching since the variant outputs can't be fetched directly.
+  if_op.graph.prevent_fetching(if_op)
+  return tensors
 
 
 def _get_func_graphs(if_op):
@@ -277,7 +293,11 @@ def _grad_fn(func_graph, grads):
   # both branches have zero gradient.
   for i in range(len(result)):
     if result[i] is None:
-      result[i] = array_ops.zeros_like(func_graph.inputs[i])
+      if func_graph.inputs[i].dtype == dtypes.resource:
+        result[i] = array_ops.zeros(
+            gen_resource_variable_ops.variable_shape(func_graph.inputs[i]))
+      else:
+        result[i] = array_ops.zeros_like(func_graph.inputs[i])
 
   return result
 
@@ -287,7 +307,7 @@ def _create_grad_func(func_graph, grads, name):
   return func_graph_module.func_graph_from_py_func(
       name,
       lambda: _grad_fn(func_graph, grads), [], {},
-      func_graph=util.CondBranchFuncGraph(name, read_only_collections=False))
+      func_graph=_CondGradFuncGraph(name, func_graph))
 
 
 def _resolve_grad_inputs(cond_graph, grad_graph):
@@ -369,28 +389,39 @@ def _separate_unique_inputs(true_inputs, false_inputs):
   return list(shared_inputs), list(true_only_inputs), list(false_only_inputs)
 
 
-def _pad_params(true_graph, false_graph, true_params, false_params):
-  """Returns new param lists that have matching signatures.
+def _make_intermediates_match(true_graph, false_graph,
+                              true_optionals, false_optionals):
+  """Returns new optionals lists that have matching signatures.
 
-  This is done by mirroring each param list in the other using dummy params.
-  There is no merging of params.
+  This is done by mirroring each list in the other using none optionals.
+  There is no merging of like optionals.
 
   Args:
     true_graph: FuncGraph
     false_graph: FuncGraph
-    true_params: a list of Tensors from true_graph
-    false_params: a list of Tensors from false_graph
+    true_optionals: a list of optional Tensors from true_graph
+    false_optionals: a list of optional Tensors from false_graph
 
   Returns:
     A new list of Tensors in true_graph and a new list of Tensors in
-    false_graph. The two lists have the same number of Tensors, with matching
-    types and shapes across the lists.
+    false_graph. The two lists have the same number of Tensors, all of which
+    will be optionals of the same shape/type.
   """
-  new_true_params = (true_params +
-                     _create_dummy_params(true_graph, false_params))
-  new_false_inputs = (_create_dummy_params(false_graph, true_params)
-                      + false_params)
-  return new_true_params, new_false_inputs
+  new_true_optionals = (true_optionals +
+                        _create_none_optionals(true_graph, false_optionals))
+  new_false_optionals = (_create_none_optionals(false_graph, true_optionals)
+                         + false_optionals)
+  return new_true_optionals, new_false_optionals
+
+
+def _make_intermediates_match_xla(true_graph, false_graph, true_intermediates,
+                                  false_intermediates):
+  """Like _make_intermediates_match but for the XLA case."""
+  new_true_intermediates = (true_intermediates +
+                            _create_fakeparams(true_graph, false_intermediates))
+  new_false_intermediates = (_create_fakeparams(false_graph, true_intermediates)
+                             + false_intermediates)
+  return new_true_intermediates, new_false_intermediates
 
 
 def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
@@ -425,11 +456,11 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   true_graph.inputs = (
       [true_input_to_param[t] for t in shared_inputs] +
       [true_input_to_param[t] for t in true_only_inputs] +
-      _create_dummy_params(true_graph, false_only_inputs))
+      _create_dummy_inputs(true_graph, false_only_inputs))
 
   false_graph.inputs = (
       [false_input_to_param[t] for t in shared_inputs] +
-      _create_dummy_params(false_graph, true_only_inputs) +
+      _create_dummy_inputs(false_graph, true_only_inputs) +
       [false_input_to_param[t] for t in false_only_inputs])
 
   # Rewrite the FuncGraphs' state to reflect the new inputs.
@@ -441,7 +472,12 @@ def _make_inputs_match(true_graph, false_graph, true_inputs, false_inputs):
   return new_inputs
 
 
-def _create_dummy_params(func_graph, template_tensors):
+def _wrap_intermediates(func_graph, intermediates):
+  with func_graph.as_default():
+    return [gen_dataset_ops.optional_from_value([t]) for t in intermediates]
+
+
+def _create_dummy_inputs(func_graph, template_tensors):
   """Creates tensors in func_graph to represent template_tensors.
 
   Args:
@@ -451,6 +487,27 @@ def _create_dummy_params(func_graph, template_tensors):
   Returns:
     A list of tensors in func_graph.
   """
+  with func_graph.as_default():
+    return [array_ops.placeholder(t.dtype, shape=t.shape)
+            for t in template_tensors]
+
+
+def _create_none_optionals(func_graph, template_tensors):
+  """Creates none optionals in func_graph to represent template_tensors.
+
+  Args:
+    func_graph: FuncGraph.
+    template_tensors: a list of tensors in func_graph.
+
+  Returns:
+    A list of tensors in func_graph.
+  """
+  with func_graph.as_default():
+    return [gen_dataset_ops.optional_none() for _ in template_tensors]
+
+
+def _create_fakeparams(func_graph, template_tensors):
+  """Create FakeParams for the XLA case."""
   with func_graph.as_default():
     return [gen_functional_ops.fake_param(dtype=t.dtype, shape=t.shape)
             for t in template_tensors]
@@ -462,12 +519,20 @@ def _check_same_outputs(true_graph, false_graph):
   false_output_types = [t.dtype for t in false_graph.outputs]
   if (len(true_graph.outputs) != len(false_graph.outputs) or
       true_output_types != false_output_types):
-    raise ValueError(
+    raise TypeError(
         "true_fn() and false_fn() must return the same number and type of "
         "arguments, got:\n"
         "  true_fn: %s\n"
         "  false_fn: %s" % (true_output_types, false_output_types))
 
+  # Make sure `structured_outputs` for both graphs have the same structure.
+  try:
+    nest.assert_same_structure(true_graph.structured_outputs,
+                               false_graph.structured_outputs)
+  except (ValueError, TypeError) as e:
+    raise ValueError("Outputs of true_fn and false_fn must have the same "
+                     "structure: %s" % str(e))
+
 
 def _get_output_shapes(true_graph_outputs, false_graph_outputs):
   output_shapes = [
@@ -475,3 +540,86 @@ def _get_output_shapes(true_graph_outputs, false_graph_outputs):
       for t_out, f_out in zip(true_graph_outputs, false_graph_outputs)
   ]
   return output_shapes
+
+
+class _CondGradFuncGraph(util.CondBranchFuncGraph):
+  """FuncGraph for the gradient function of the branch of an If op.
+
+  Handles wrapping and unwrapping intermediate values that are captured by the
+  gradient computation in optionals.
+
+  Attributes:
+    if_op_needs_rewrite: True if any intermediates were captured, meaning the
+      forward If op needs to be written to output the wrapped intermediates.
+  """
+
+  def __init__(self, name, forward_graph):
+    super(_CondGradFuncGraph, self).__init__(name, read_only_collections=False)
+    self.if_op_needs_rewrite = False
+    self._forward_graph = forward_graph
+    # Maps from forward intermediate tensor -> the unwrapped captured
+    # intermediate.
+    self._indirect_captures = {}
+    # Maps unwrapped intermediate -> optional-wrapped intermediate in the
+    # forward graph.
+    self._wrapped_intermediates = collections.OrderedDict()
+    # Raw intermediates captured from the forward graph. Populated iff we're in
+    # an XLA context.
+    self._xla_intermediates = []
+
+  @property
+  def wrapped_intermediates(self):
+    """The optional-wrapped intermediates captured from the forward graph."""
+    return list(self._wrapped_intermediates.values())
+
+  @property
+  def xla_intermediates(self):
+    """Raw intermediates captured from the forward graph if XLA is enabled."""
+    return self._xla_intermediates
+
+  def _capture_helper(self, tensor, name):
+    if (tensor.graph is not self._forward_graph or
+        tensor in self._forward_graph.inputs or
+        tensor in self._forward_graph.outputs):
+      return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
+
+    if control_flow_util.InXlaContext(ops.get_default_graph()):
+      # XLA does not yet support optionals, so capture intermediates directly.
+      # TODO(skyewm,jpienaar): can XLA support optionals?
+      if tensor not in self.captures:
+        self.xla_intermediates.append(tensor)
+        self.if_op_needs_rewrite = True
+      return super(_CondGradFuncGraph, self)._capture_helper(tensor, name)
+
+    captured_tensor = self._indirect_captures.get(tensor)
+    if captured_tensor is not None:
+      return captured_tensor
+
+    # 'tensor' is an uncaptured intermediate in the forward graph. We wrap it in
+    # an optional in the forward graph and capture the optional normally. We
+    # then unwrap the captured optional value in the gradient graph to get the
+    # raw intermediate value.
+
+    if tensor not in self._wrapped_intermediates:
+      # If the gradient has already been computed for this If op, 'tensor' may
+      # already be wrapped.
+      for consumer in tensor.consumers():
+        if (consumer.type == "OptionalFromValue"
+            and consumer.outputs[0] in self._forward_graph.outputs):
+          optional = consumer.outputs[0]
+          break
+      else:
+        # 'tensor' hasn't been wrapped, do it now.
+        with self._forward_graph.as_default():
+          optional = gen_dataset_ops.optional_from_value([tensor])
+        self.if_op_needs_rewrite = True
+
+      self._wrapped_intermediates[tensor] = optional
+
+    optional = self._wrapped_intermediates[tensor]
+    captured_optional = super(_CondGradFuncGraph, self)._capture_helper(
+        optional, name)
+    captured_tensor = gen_dataset_ops.optional_get_value(
+        captured_optional, [tensor.dtype], [tensor.shape])[0]
+    self._indirect_captures[tensor] = captured_tensor
+    return captured_tensor
diff --git a/tensorflow/python/ops/confusion_matrix.py b/tensorflow/python/ops/confusion_matrix.py
index b86b174afe5625e630511028952f0f16cce311a6..ccfe3b65c2d90b37836e2e48c3235f399f77df49 100644
--- a/tensorflow/python/ops/confusion_matrix.py
+++ b/tensorflow/python/ops/confusion_matrix.py
@@ -90,12 +90,13 @@ def remove_squeezable_dimensions(
     return labels, predictions
 
 
-@tf_export(
-    'math.confusion_matrix',
-    v1=['math.confusion_matrix', 'confusion_matrix'])
-@deprecation.deprecated_endpoints('confusion_matrix', 'train.confusion_matrix')
-def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
-                     name=None, weights=None):
+@tf_export('math.confusion_matrix', v1=[])
+def confusion_matrix(labels,
+                     predictions,
+                     num_classes=None,
+                     weights=None,
+                     dtype=dtypes.int32,
+                     name=None):
   """Computes the confusion matrix from predictions and labels.
 
   The matrix columns represent the prediction labels and the rows represent the
@@ -132,9 +133,9 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
     num_classes: The possible number of labels the classification task can
                  have. If this value is not provided, it will be calculated
                  using both predictions and labels array.
+    weights: An optional `Tensor` whose shape matches `predictions`.
     dtype: Data type of the confusion matrix.
     name: Scope name.
-    weights: An optional `Tensor` whose shape matches `predictions`.
 
   Returns:
     A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
@@ -193,3 +194,65 @@ def confusion_matrix(labels, predictions, num_classes=None, dtype=dtypes.int32,
     zero_matrix = array_ops.zeros(math_ops.to_int32(shape), dtype)
 
     return sparse_ops.sparse_add(zero_matrix, cm_sparse)
+
+
+@tf_export(v1=['math.confusion_matrix', 'confusion_matrix'])
+@deprecation.deprecated_endpoints('confusion_matrix', 'train.confusion_matrix')
+def confusion_matrix_v1(labels,
+                        predictions,
+                        num_classes=None,
+                        dtype=dtypes.int32,
+                        name=None,
+                        weights=None):
+  """Computes the confusion matrix from predictions and labels.
+
+  The matrix columns represent the prediction labels and the rows represent the
+  real labels. The confusion matrix is always a 2-D array of shape `[n, n]`,
+  where `n` is the number of valid labels for a given classification task. Both
+  prediction and labels must be 1-D arrays of the same shape in order for this
+  function to work.
+
+  If `num_classes` is `None`, then `num_classes` will be set to one plus the
+  maximum value in either predictions or labels. Class labels are expected to
+  start at 0. For example, if `num_classes` is 3, then the possible labels
+  would be `[0, 1, 2]`.
+
+  If `weights` is not `None`, then each prediction contributes its
+  corresponding weight to the total value of the confusion matrix cell.
+
+  For example:
+
+  ```python
+    tf.confusion_matrix([1, 2, 4], [2, 2, 4]) ==>
+        [[0 0 0 0 0]
+         [0 0 1 0 0]
+         [0 0 1 0 0]
+         [0 0 0 0 0]
+         [0 0 0 0 1]]
+  ```
+
+  Note that the possible labels are assumed to be `[0, 1, 2, 3, 4]`,
+  resulting in a 5x5 confusion matrix.
+
+  Args:
+    labels: 1-D `Tensor` of real labels for the classification task.
+    predictions: 1-D `Tensor` of predictions for a given classification.
+    num_classes: The possible number of labels the classification task can have.
+      If this value is not provided, it will be calculated using both
+      predictions and labels array.
+    dtype: Data type of the confusion matrix.
+    name: Scope name.
+    weights: An optional `Tensor` whose shape matches `predictions`.
+
+  Returns:
+    A `Tensor` of type `dtype` with shape `[n, n]` representing the confusion
+    matrix, where `n` is the number of possible labels in the classification
+    task.
+
+  Raises:
+    ValueError: If both predictions and labels are not 1-D vectors and have
+      mismatched shapes, or if `weights` is not `None` and its shape doesn't
+      match `predictions`.
+  """
+  return confusion_matrix(labels, predictions, num_classes, weights, dtype,
+                          name)
diff --git a/tensorflow/python/ops/control_flow_ops.py b/tensorflow/python/ops/control_flow_ops.py
index 0d04f0697df72c28901097216a3333f0cee55681..b7e50c1dae5ac1dc0968a3badb8f017e6b0384e1 100644
--- a/tensorflow/python/ops/control_flow_ops.py
+++ b/tensorflow/python/ops/control_flow_ops.py
@@ -158,7 +158,7 @@ def Assert(condition, data, summarize=None, name=None):
 
   with ops.name_scope(name, "Assert", [condition, data]) as name:
     xs = ops.convert_n_to_tensor(data)
-    if all([x.dtype in {dtypes.string, dtypes.int32} for x in xs]):
+    if all(x.dtype in {dtypes.string, dtypes.int32} for x in xs):
       # As a simple heuristic, we assume that string and int32 are
       # on host to avoid the need to use cond. If it is not case,
       # we will pay the price copying the tensor to host memory.
@@ -457,19 +457,19 @@ def merge(inputs, name=None):
     ValueError: If any of the inputs is None, or inputs are IndexedSlices and
       some but not all have a dense_shape property.
   """
-  if any([inp is None for inp in inputs]):
+  if any(inp is None for inp in inputs):
     raise ValueError("At least one of the merge inputs is None: %s" % inputs)
   with ops.name_scope(name, "Merge", inputs) as name:
     inputs = [
         ops.internal_convert_to_tensor_or_indexed_slices(inp, as_ref=True)
         for inp in inputs
     ]
-    if all([isinstance(v, ops.Tensor) for v in inputs]):
-      if all([v.dtype._is_ref_dtype for v in inputs]):  # pylint: disable=protected-access
+    if all(isinstance(v, ops.Tensor) for v in inputs):
+      if all(v.dtype._is_ref_dtype for v in inputs):  # pylint: disable=protected-access
         return gen_control_flow_ops.ref_merge(inputs, name)
       else:
         return gen_control_flow_ops.merge(inputs, name)
-    elif all([isinstance(v, sparse_tensor.SparseTensor) for v in inputs]):
+    elif all(isinstance(v, sparse_tensor.SparseTensor) for v in inputs):
       # Only handle the case when all inputs are SparseTensor.
       values, _ = merge([inp.values for inp in inputs], name=name)
       indices, chosen_index = gen_control_flow_ops.merge(
@@ -557,7 +557,7 @@ def _SetShapeInvariants(input_vars, enter_vars, shapes):
   if shapes is None:
     return
   flat_shapes = nest.flatten(shapes)
-  if not all([isinstance(s, tensor_shape.TensorShape) for s in flat_shapes]):
+  if not all(isinstance(s, tensor_shape.TensorShape) for s in flat_shapes):
     raise ValueError("`shapes` must be a (possibly nested) list of shapes.")
   # Check that the shapes of the inputs are less than the shape invariants,
   # and set the shapes of `enter_vars` to the shape invariants.
@@ -1976,7 +1976,7 @@ def _UnpackIfSingleton(res):
 
 # pylint: disable=redefined-outer-name
 # pylint: disable=g-doc-args
-@tf_export("cond")
+@tf_export(v1=["cond"])
 @deprecation.deprecated_args(
     None, "fn1/fn2 are deprecated in favor of the true_fn/false_fn arguments.",
     "fn1", "fn2")
@@ -2173,6 +2173,77 @@ def cond(pred,
 # pylint: enable=redefined-outer-name
 
 
+@tf_export("cond", v1=[])
+def cond_for_tf_v2(pred,
+                   true_fn=None,
+                   false_fn=None,
+                   name=None):
+  """Return `true_fn()` if the predicate `pred` is true else `false_fn()`.
+
+  `true_fn` and `false_fn` both return lists of output tensors. `true_fn` and
+  `false_fn` must have the same non-zero number and type of outputs.
+
+  **WARNING**: Any Tensors or Operations created outside of `true_fn` and
+  `false_fn` will be executed regardless of which branch is selected at runtime.
+
+  Although this behavior is consistent with the dataflow model of TensorFlow,
+  it has frequently surprised users who expected a lazier semantics.
+  Consider the following simple program:
+
+  ```python
+  z = tf.multiply(a, b)
+  result = tf.cond(x < y, lambda: tf.add(x, z), lambda: tf.square(y))
+  ```
+
+  If `x < y`, the `tf.add` operation will be executed and `tf.square`
+  operation will not be executed. Since `z` is needed for at least one
+  branch of the `cond`, the `tf.multiply` operation is always executed,
+  unconditionally.
+
+  Note that `cond` calls `true_fn` and `false_fn` *exactly once* (inside the
+  call to `cond`, and not at all during `Session.run()`). `cond`
+  stitches together the graph fragments created during the `true_fn` and
+  `false_fn` calls with some additional graph nodes to ensure that the right
+  branch gets executed depending on the value of `pred`.
+
+  `tf.cond` supports nested structures as implemented in
+  `tensorflow.python.util.nest`. Both `true_fn` and `false_fn` must return the
+  same (possibly nested) value structure of lists, tuples, and/or named tuples.
+  Singleton lists and tuples form the only exceptions to this: when returned by
+  `true_fn` and/or `false_fn`, they are implicitly unpacked to single values.
+
+  Args:
+    pred: A scalar determining whether to return the result of `true_fn` or
+      `false_fn`.
+    true_fn: The callable to be performed if pred is true.
+    false_fn: The callable to be performed if pred is false.
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    Tensors returned by the call to either `true_fn` or `false_fn`. If the
+    callables return a singleton list, the element is extracted from the list.
+
+  Raises:
+    TypeError: if `true_fn` or `false_fn` is not callable.
+    ValueError: if `true_fn` and `false_fn` do not return the same number of
+      tensors, or return tensors of different types.
+
+  Example:
+
+  ```python
+  x = tf.constant(2)
+  y = tf.constant(5)
+  def f1(): return tf.multiply(x, 17)
+  def f2(): return tf.add(y, 23)
+  r = tf.cond(tf.less(x, y), f1, f2)
+  # r is set to f1().
+  # Operations in f2 (e.g., tf.add) are not executed.
+  ```
+
+  """
+  return cond(pred, true_fn=true_fn, false_fn=false_fn, strict=True, name=name)
+
+
 def _resource_safe_shape(t):
   """Returns the shape of t or the variable it points to."""
   if t.dtype == dtypes.resource:
@@ -3065,7 +3136,186 @@ class WhileContext(ControlFlowContext):
 
 
 # pylint: disable=redefined-outer-name
-@tf_export("while_loop")
+@tf_export("while_loop", v1=[])
+def while_loop_v2(cond,
+                  body,
+                  loop_vars,
+                  shape_invariants=None,
+                  parallel_iterations=10,
+                  back_prop=True,
+                  swap_memory=False,
+                  maximum_iterations=None,
+                  name=None):
+  """Repeat `body` while the condition `cond` is true.
+
+  `cond` is a callable returning a boolean scalar tensor. `body` is a callable
+  returning a (possibly nested) tuple, namedtuple or list of tensors of the same
+  arity (length and structure) and types as `loop_vars`. `loop_vars` is a
+  (possibly nested) tuple, namedtuple or list of tensors that is passed to both
+  `cond` and `body`. `cond` and `body` both take as many arguments as there are
+  `loop_vars`.
+
+  In addition to regular Tensors or IndexedSlices, the body may accept and
+  return TensorArray objects.  The flows of the TensorArray objects will
+  be appropriately forwarded between loops and during gradient calculations.
+
+  Note that `while_loop` calls `cond` and `body` *exactly once* (inside the
+  call to `while_loop`, and not at all during `Session.run()`). `while_loop`
+  stitches together the graph fragments created during the `cond` and `body`
+  calls with some additional graph nodes to create the graph flow that
+  repeats `body` until `cond` returns false.
+
+  For correctness, `tf.while_loop()` strictly enforces shape invariants for
+  the loop variables. A shape invariant is a (possibly partial) shape that
+  is unchanged across the iterations of the loop. An error will be raised
+  if the shape of a loop variable after an iteration is determined to be more
+  general than or incompatible with its shape invariant. For example, a shape
+  of [11, None] is more general than a shape of [11, 17], and [11, 21] is not
+  compatible with [11, 17]. By default (if the argument `shape_invariants` is
+  not specified), it is assumed that the initial shape of each tensor in
+  `loop_vars` is the same in every iteration. The `shape_invariants` argument
+  allows the caller to specify a less specific shape invariant for each loop
+  variable, which is needed if the shape varies between iterations. The
+  `tf.Tensor.set_shape`
+  function may also be used in the `body` function to indicate that
+  the output loop variable has a particular shape. The shape invariant for
+  SparseTensor and IndexedSlices are treated specially as follows:
+
+  a) If a loop variable is a SparseTensor, the shape invariant must be
+  TensorShape([r]) where r is the rank of the dense tensor represented
+  by the sparse tensor. It means the shapes of the three tensors of the
+  SparseTensor are ([None], [None, r], [r]). NOTE: The shape invariant here
+  is the shape of the SparseTensor.dense_shape property. It must be the shape of
+  a vector.
+
+  b) If a loop variable is an IndexedSlices, the shape invariant must be
+  a shape invariant of the values tensor of the IndexedSlices. It means
+  the shapes of the three tensors of the IndexedSlices are (shape, [shape[0]],
+  [shape.ndims]).
+
+  `while_loop` implements non-strict semantics, enabling multiple iterations
+  to run in parallel. The maximum number of parallel iterations can be
+  controlled by `parallel_iterations`, which gives users some control over
+  memory consumption and execution order. For correct programs, `while_loop`
+  should return the same result for any parallel_iterations > 0.
+
+  For training, TensorFlow stores the tensors that are produced in the
+  forward inference and are needed in back propagation. These tensors are a
+  main source of memory consumption and often cause OOM errors when training
+  on GPUs. When the flag swap_memory is true, we swap out these tensors from
+  GPU to CPU. This for example allows us to train RNN models with very long
+  sequences and large batches.
+
+  Args:
+    cond: A callable that represents the termination condition of the loop.
+    body: A callable that represents the loop body.
+    loop_vars: A (possibly nested) tuple, namedtuple or list of numpy array,
+      `Tensor`, and `TensorArray` objects.
+    shape_invariants: The shape invariants for the loop variables.
+    parallel_iterations: The number of iterations allowed to run in parallel. It
+      must be a positive integer.
+    back_prop: Whether backprop is enabled for this while loop.
+    swap_memory: Whether GPU-CPU memory swap is enabled for this loop.
+    maximum_iterations: Optional maximum number of iterations of the while loop
+      to run.  If provided, the `cond` output is AND-ed with an additional
+      condition ensuring the number of iterations executed is no greater than
+      `maximum_iterations`.
+    name: Optional name prefix for the returned tensors.
+
+  Returns:
+    The output tensors for the loop variables after the loop. The return value
+      has the same structure as `loop_vars`.
+
+  Raises:
+    TypeError: if `cond` or `body` is not callable.
+    ValueError: if `loop_vars` is empty.
+
+  Example:
+
+  ```python
+  i = tf.constant(0)
+  c = lambda i: tf.less(i, 10)
+  b = lambda i: tf.add(i, 1)
+  r = tf.while_loop(c, b, [i])
+  ```
+
+  Example with nesting and a namedtuple:
+
+  ```python
+  import collections
+  Pair = collections.namedtuple('Pair', 'j, k')
+  ijk_0 = (tf.constant(0), Pair(tf.constant(1), tf.constant(2)))
+  c = lambda i, p: i < 10
+  b = lambda i, p: (i + 1, Pair((p.j + p.k), (p.j - p.k)))
+  ijk_final = tf.while_loop(c, b, ijk_0)
+  ```
+
+  Example using shape_invariants:
+
+  ```python
+  i0 = tf.constant(0)
+  m0 = tf.ones([2, 2])
+  c = lambda i, m: i < 10
+  b = lambda i, m: [i+1, tf.concat([m, m], axis=0)]
+  tf.while_loop(
+      c, b, loop_vars=[i0, m0],
+      shape_invariants=[i0.get_shape(), tf.TensorShape([None, 2])])
+  ```
+
+  Example which demonstrates non-strict semantics: In the following
+  example, the final value of the counter `i` does not depend on `x`. So
+  the `while_loop` can increment the counter parallel to updates of `x`.
+  However, because the loop counter at one loop iteration depends
+  on the value at the previous iteration, the loop counter itself cannot
+  be incremented in parallel. Hence if we just want the final value of the
+  counter (which we print on the line `print(sess.run(i))`), then
+  `x` will never be incremented, but the counter will be updated on a
+  single thread. Conversely, if we want the value of the output (which we
+  print on the line `print(sess.run(out).shape)`), then the counter may be
+  incremented on its own thread, while `x` can be incremented in
+  parallel on a separate thread. In the extreme case, it is conceivable
+  that the thread incrementing the counter runs until completion before
+  `x` is incremented even a single time. The only thing that can never
+  happen is that the thread updating `x` can never get ahead of the
+  counter thread because the thread incrementing `x` depends on the value
+  of the counter.
+
+  ```python
+  import tensorflow as tf
+
+  n = 10000
+  x = tf.constant(list(range(n)))
+  c = lambda i, x: i < n
+  b = lambda i, x: (tf.Print(i + 1, [i]), tf.Print(x + 1, [i], "x:"))
+  i, out = tf.while_loop(c, b, (0, x))
+  with tf.Session() as sess:
+      print(sess.run(i))  # prints [0] ... [9999]
+
+      # The following line may increment the counter and x in parallel.
+      # The counter thread may get ahead of the other thread, but not the
+      # other way around. So you may see things like
+      # [9996] x:[9987]
+      # meaning that the counter thread is on iteration 9996,
+      # while the other thread is on iteration 9987
+      print(sess.run(out).shape)
+  ```
+
+  """
+  return while_loop(
+      cond=cond,
+      body=body,
+      loop_vars=loop_vars,
+      shape_invariants=shape_invariants,
+      parallel_iterations=parallel_iterations,
+      back_prop=back_prop,
+      swap_memory=swap_memory,
+      name=name,
+      maximum_iterations=maximum_iterations,
+      return_same_structure=True)
+
+
+# pylint: disable=redefined-outer-name
+@tf_export(v1=["while_loop"])
 def while_loop(cond,
                body,
                loop_vars,
@@ -3244,7 +3494,8 @@ def while_loop(cond,
         loop_vars,
         shape_invariants=shape_invariants,
         maximum_iterations=maximum_iterations,
-        name=name)
+        name=name,
+        return_same_structure=return_same_structure)
 
   with ops.name_scope(name, "while", loop_vars):
     if not loop_vars:
@@ -3465,7 +3716,43 @@ def group(*inputs, **kwargs):
       return no_op(name=name)
 
 
-@tf_export("tuple")
+@tf_export("tuple", v1=[])
+def tuple_v2(tensors, control_inputs=None, name=None):
+  """Group tensors together.
+
+  This creates a tuple of tensors with the same values as the `tensors`
+  argument, except that the value of each tensor is only returned after the
+  values of all tensors have been computed.
+
+  `control_inputs` contains additional ops that have to finish before this op
+  finishes, but whose outputs are not returned.
+
+  This can be used as a "join" mechanism for parallel computations: all the
+  argument tensors can be computed in parallel, but the values of any tensor
+  returned by `tuple` are only available after all the parallel computations
+  are done.
+
+  See also `tf.group` and
+  `tf.control_dependencies`.
+
+  Args:
+    tensors: A list of `Tensor`s or `IndexedSlices`, some entries can be `None`.
+    control_inputs: List of additional ops to finish before returning.
+    name: (optional) A name to use as a `name_scope` for the operation.
+
+  Returns:
+    Same as `tensors`.
+
+  Raises:
+    ValueError: If `tensors` does not contain any `Tensor` or `IndexedSlices`.
+    TypeError: If `control_inputs` is not a list of `Operation` or `Tensor`
+      objects.
+
+  """
+  return tuple(tensors=tensors, name=name, control_inputs=control_inputs)  # pylint: disable=redefined-builtin
+
+
+@tf_export(v1=["tuple"])
 def tuple(tensors, name=None, control_inputs=None):  # pylint: disable=redefined-builtin
   """Group tensors together.
 
diff --git a/tensorflow/python/ops/control_flow_ops_test.py b/tensorflow/python/ops/control_flow_ops_test.py
index c3514c183c4c9e24a3af747123519f3f9e20f5c0..0c18b7208f5c4049722012504a26563f55aeca3c 100644
--- a/tensorflow/python/ops/control_flow_ops_test.py
+++ b/tensorflow/python/ops/control_flow_ops_test.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 import collections
 import numpy as np
 
+from tensorflow.python import tf2
 from tensorflow.core.framework import graph_pb2
 from tensorflow.core.framework import node_def_pb2
-from tensorflow.python.client import session
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -126,56 +127,56 @@ class GroupTestCase(test_util.TensorFlowTestCase):
       node { name: "root" op: "NoOp" input: "^a" input: "^b" }
     """, self._StripGraph(gd))
 
+  @test_util.run_deprecated_v1
   def testPassingNonTensors(self):
-    with ops.Graph().as_default():
-      with self.assertRaises(TypeError):
-        control_flow_ops.group(1, 2)
+    with self.assertRaises(TypeError):
+      control_flow_ops.group(1, 2)
 
 
 class ShapeTestCase(test_util.TensorFlowTestCase):
 
   def testShape(self):
-    with ops.Graph().as_default():
-      tensor = constant_op.constant([1.0, 2.0])
-      self.assertEquals([2], tensor.get_shape())
-      self.assertEquals([2],
-                        control_flow_ops.with_dependencies(
-                            [constant_op.constant(1.0)], tensor).get_shape())
+    tensor = constant_op.constant([1.0, 2.0])
+    self.assertEquals([2], tensor.get_shape())
+    self.assertEquals([2],
+                      control_flow_ops.with_dependencies(
+                          [constant_op.constant(1.0)], tensor).get_shape())
 
 
 class WithDependenciesTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testTupleDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          (increment_counter, constant_op.constant(42)),
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
-
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        (increment_counter, constant_op.constant(42)),
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
+
+  @test_util.run_deprecated_v1
   def testListDependencies(self):
-    with ops.Graph().as_default():
-      counter = variable_scope.get_variable(
-          "my_counter", shape=[], initializer=init_ops.zeros_initializer())
-      increment_counter = state_ops.assign_add(counter, 1)
-      const_with_dep = control_flow_ops.with_dependencies(
-          [increment_counter, constant_op.constant(42)],
-          constant_op.constant(7))
-      with self.cached_session():
-        variables.global_variables_initializer().run()
-        self.assertEquals(0, counter.eval())
-        self.assertEquals(7, const_with_dep.eval())
-        self.assertEquals(1, counter.eval())
+    counter = variable_scope.get_variable(
+        "my_counter", shape=[], initializer=init_ops.zeros_initializer())
+    increment_counter = state_ops.assign_add(counter, 1)
+    const_with_dep = control_flow_ops.with_dependencies(
+        [increment_counter, constant_op.constant(42)],
+        constant_op.constant(7))
+
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEquals(0, self.evaluate(counter))
+    self.assertEquals(7, self.evaluate(const_with_dep))
+    self.assertEquals(1, self.evaluate(counter))
 
 
 class SwitchTestCase(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesWithDenseShape(self):
     with self.cached_session():
       data = ops.IndexedSlices(
@@ -189,68 +190,64 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       self.assertAllEqual([1, 2, 3], switch_true.values.eval())
       self.assertAllEqual([0, 1], switch_true.indices.eval())
 
+  @test_util.run_deprecated_v1
   def testIndexedSlicesGradient(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer())
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
-
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      optimizer = momentum.MomentumOptimizer(0.1, 0.9)
-      train_op = optimizer.minimize(cost)
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        for _ in range(10):
-          sess.run([train_op])
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer())
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix + 0.0, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
+
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    optimizer = momentum.MomentumOptimizer(0.1, 0.9)
+    train_op = optimizer.minimize(cost)
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      for _ in range(10):
+        self.evaluate([train_op])
 
   def testResourceReadInLoop(self):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix",
-          initializer=[[2.0], [3.0]],
-          use_resource=True)
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", initializer=[[2.0], [3.0]], use_resource=True)
 
-      def cond(it, _):
-        return it < 5
+    def cond(it, _):
+      return it < 5
 
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost += math_ops.reduce_sum(embedding)
-        return it + 1, cost
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost += math_ops.reduce_sum(embedding)
+      return it + 1, cost
 
-      _, cost = control_flow_ops.while_loop(
-          cond, body, [constant_op.constant(0),
-                       constant_op.constant(0.0)])
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(10.0, cost.eval())
+    _, cost = control_flow_ops.while_loop(
+        cond, body, [constant_op.constant(0),
+                     constant_op.constant(0.0)])
+    with self.cached_session():
+      self.evaluate(variables.global_variables_initializer())
+      self.assertAllEqual(10.0, self.evaluate(cost))
 
   def doTestIndexedSlicesGradientInCondInWhileLoop(self, use_resource=False):
-    with ops.Graph().as_default():
-      embedding_matrix = variable_scope.get_variable(
-          "embedding_matrix", [5, 5],
-          initializer=init_ops.random_normal_initializer(),
-          use_resource=use_resource)
-
-      def cond(it, _):
-        return it < 5
-
-      def body(it, cost):
-        embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
-        cost = control_flow_ops.cond(
-            math_ops.equal(it, 3), lambda: math_ops.square(cost),
-            lambda: cost + math_ops.reduce_sum(embedding))
-        return it + 1, cost
+    embedding_matrix = variable_scope.get_variable(
+        "embedding_matrix", [5, 5],
+        initializer=init_ops.random_normal_initializer(),
+        use_resource=use_resource)
+
+    def cond(it, _):
+      return it < 5
+
+    def body(it, cost):
+      embedding = embedding_ops.embedding_lookup(embedding_matrix, [0])
+      cost = control_flow_ops.cond(
+          math_ops.equal(it, 3), lambda: math_ops.square(cost),
+          (lambda: cost + math_ops.reduce_sum(embedding)))
+      return it + 1, cost
 
       _, cost = control_flow_ops.while_loop(
           cond, body, [constant_op.constant(0),
@@ -268,9 +265,9 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
       static_grads = math_ops.segment_sum(static_grads.values,
                                           static_grads.indices)
 
-      with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertAllEqual(*sess.run([static_grads, dynamic_grads]))
+      with self.cached_session():
+        self.evaluate(variables.global_variables_initializer())
+        self.assertAllEqual(*self.evaluate([static_grads, dynamic_grads]))
 
   def testIndexedSlicesGradientInCondInWhileLoop(self):
     self.doTestIndexedSlicesGradientInCondInWhileLoop(use_resource=False)
@@ -278,6 +275,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
   def testIndexedSlicesGradientInCondInWhileLoopResource(self):
     self.doTestIndexedSlicesGradientInCondInWhileLoop(use_resource=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -307,6 +305,7 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 20)
         self.assertAllEqual(grad, [1] * num_steps)
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesWithDynamicShapeGradientInWhileLoop(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session() as sess:
@@ -334,105 +333,94 @@ class SwitchTestCase(test_util.TensorFlowTestCase):
         self.assertEquals(o, 6)
         self.assertAllEqual(grad, [1] * 3)
 
+  @test_util.run_deprecated_v1
   def testGradientThroughSingleBranchOutsideOfContext(self):
-    with self.cached_session():
-      x = constant_op.constant(2.)
-      s = constant_op.constant(True)
-      x_false, x_true = control_flow_ops.switch(x, s)
-      grad_x_true = gradients_impl.gradients(x_true, x)[0]
-      grad_x_false = gradients_impl.gradients(x_false, x)[0]
-      self.assertEquals(grad_x_true.eval(), 1.)
-      self.assertEquals(grad_x_false.eval(), 0.)
+    x = constant_op.constant(2.)
+    s = constant_op.constant(True)
+    x_false, x_true = control_flow_ops.switch(x, s)
+    grad_x_true = gradients_impl.gradients(x_true, x)[0]
+    grad_x_false = gradients_impl.gradients(x_false, x)[0]
+    self.assertEquals(self.evaluate(grad_x_true), 1.)
+    self.assertEquals(self.evaluate(grad_x_false), 0.)
 
 
 class CondTest(test_util.TensorFlowTestCase):
 
   def testCondTrue(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalse(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), lambda: math_ops.multiply(x, 17),
-            lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(
+            x,
+            y), lambda: math_ops.multiply(x, 17), lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
 
   def testCondTrueLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(5)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 34)
+    x = constant_op.constant(2)
+    y = constant_op.constant(5)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 34)
 
   def testCondFalseLegacy(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(2)
-        y = constant_op.constant(1)
-        z = control_flow_ops.cond(
-            math_ops.less(x, y), fn1=lambda: math_ops.multiply(x, 17),
-            fn2=lambda: math_ops.add(y, 23))
-        self.assertEquals(z.eval(), 24)
-
+    x = constant_op.constant(2)
+    y = constant_op.constant(1)
+    z = control_flow_ops.cond(
+        math_ops.less(x, y),
+        fn1=lambda: math_ops.multiply(x, 17),
+        fn2=lambda: math_ops.add(y, 23))
+    self.assertEquals(self.evaluate(z), 24)
+
+  @test_util.run_deprecated_v1
   def testCondModifyBoolPred(self):
     # This test in particular used to fail only when running in GPU, hence
     # use_gpu=True.
-    with ops.Graph().as_default():
-      with session.Session() as sess:
-        bool_var = variable_scope.get_variable("bool_var", dtype=dtypes.bool,
-                                               initializer=True)
-        cond_on_bool_var = control_flow_ops.cond(
-            pred=bool_var,
-            true_fn=lambda: state_ops.assign(bool_var, False),
-            false_fn=lambda: True)
-        sess.run(bool_var.initializer)
-        self.assertEquals(sess.run(cond_on_bool_var), False)
-        self.assertEquals(sess.run(cond_on_bool_var), True)
+    with test_util.use_gpu():
+      bool_var = variable_scope.get_variable(
+          "bool_var", dtype=dtypes.bool, initializer=True)
+      cond_on_bool_var = control_flow_ops.cond(
+          pred=bool_var,
+          true_fn=lambda: state_ops.assign(bool_var, False),
+          false_fn=lambda: True)
+      self.evaluate(bool_var.initializer)
+      self.assertEquals(self.evaluate(cond_on_bool_var), False)
+      self.assertEquals(self.evaluate(cond_on_bool_var), True)
 
   def testCondMissingArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, false_fn=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, false_fn=lambda: x)
 
   def testCondMissingArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x)
 
   def testCondDuplicateArg1(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn1=lambda: x)
 
   def testCondDuplicateArg2(self):
-    with ops.Graph().as_default():
-      with session.Session():
-        x = constant_op.constant(1)
-        with self.assertRaises(TypeError):
-          control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
+    x = constant_op.constant(1)
+    with self.assertRaises(TypeError):
+      control_flow_ops.cond(True, lambda: x, lambda: x, fn2=lambda: x)
 
 
 class ContextTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCondContext(self):
     with self.cached_session() as sess:
       x = constant_op.constant(2)
@@ -462,12 +450,15 @@ class ContextTest(test_util.TensorFlowTestCase):
               control_flow_ops.WhileContext.from_proto(
                   control_flow_context.to_proto()).to_proto())
 
+  @test_util.run_deprecated_v1
   def testWhileContext(self):
     self._testWhileContextHelper()
 
+  @test_util.run_deprecated_v1
   def testWhileContextWithMaximumIterations(self):
     self._testWhileContextHelper(maximum_iterations=10)
 
+  @test_util.run_deprecated_v1
   def testControlContextImportScope(self):
     class NoABCControlFlowContext(control_flow_ops.ControlFlowContext):
       """A noop wrapper around `ControlFlowContext`.
@@ -590,6 +581,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       if check_cond:
         self.assertAllEqualNested(result_case, expected_value_false)
 
+  @test_util.run_deprecated_v1
   def test_int(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1
@@ -599,6 +591,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape, strict=True)
     self._testReturnValues(fn_true, fn_false, 1, 2, strict=True)
 
+  @test_util.run_deprecated_v1
   def test_float(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: 1.0
@@ -606,12 +599,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 1.0, 2.0)
 
+  @test_util.run_deprecated_v1
   def test_noop(self):
     shape = tensor_shape.TensorShape(None)
     self._testShape(control_flow_ops.no_op, control_flow_ops.no_op, shape)
     self._testReturnValues(control_flow_ops.no_op, control_flow_ops.no_op,
                            True, False, check_cond=False)
 
+  @test_util.run_deprecated_v1
   def test_string(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: "abc"
@@ -619,6 +614,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, b"abc", b"xyz")
 
+  @test_util.run_deprecated_v1
   def test_variable(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: variables.Variable(3.0)
@@ -626,6 +622,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, 3.0, 4.0)
 
+  @test_util.run_v1_only("b/120553181")
   def test_none(self):
     fn_none = lambda: None
     fn_tensor = lambda: constant_op.constant(1)
@@ -636,6 +633,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     with self.assertRaises(ValueError):
       control_flow_ops.cond(constant_op.constant(True), fn_tensor, fn_none)
 
+  @test_util.run_deprecated_v1
   def test_tensors(self):
 
     def _build_true_branch(dtype):
@@ -664,6 +662,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              (np.zeros([2, 2]), np.ones([3, 3])),
                              (np.ones([2, 2]), np.zeros([3, 3])))
 
+  @test_util.run_deprecated_v1
   def test_tensors_unknown_shape(self):
 
     def _build_true_branch(dtype):
@@ -692,6 +691,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                              feed_dict={true_tensor: np.zeros([2, 2]),
                                         false_tensor: np.ones([2, 2])})
 
+  @test_util.run_deprecated_v1
   def test_sparse_tensors(self):
     shape = tensor_shape.TensorShape([None, None])
 
@@ -707,11 +707,14 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                              values=[1, 2], dense_shape=[3, 4])
     value2 = sparse_tensor.SparseTensorValue(indices=[[0, 0], [2, 1]],
                                              values=[3, 4], dense_shape=[3, 4])
-    self._testShape(true_fn, false_fn, shape)
-    self._testReturnValues(true_fn, false_fn, value1, value2)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(true_fn, false_fn, shape)
+      self._testReturnValues(true_fn, false_fn, value1, value2)
     self._testShape(true_fn, false_fn, [shape], strict=True)
     self._testReturnValues(true_fn, false_fn, [value1], [value2], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tensors_with_partially_specified_shapes(self):
 
     def _build_branch(dtype, shape):
@@ -741,6 +744,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
                                         true_tensors[2]: np.ones([3, 3]),
                                         false_tensors[2]: np.ones([3, 3])})
 
+  @test_util.run_deprecated_v1
   def test_tensor_arrays(self):
     element_shape = tensor_shape.TensorShape([2])
     ta1 = _create_tensor_array(4, element_shape)
@@ -750,6 +754,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta2
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_tensor_array_reads(self):
     shape = tensor_shape.TensorShape([2])
     ta = _create_tensor_array(4, shape)
@@ -757,6 +762,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     fn_false = lambda: ta.read(1)
     self._testShape(fn_true, fn_false, shape)
 
+  @test_util.run_deprecated_v1
   def test_list(self):
     shape = [tensor_shape.TensorShape([]), tensor_shape.TensorShape([]),
              tensor_shape.TensorShape([])]
@@ -765,6 +771,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, [1, 2, 3.0], [3, 4, 5.0])
 
+  @test_util.run_v1_only("Non-strict cond is only available in v1")
   def test_non_strict(self):
     shape = tensor_shape.TensorShape([])
     fn_tensor = lambda: constant_op.constant(1)
@@ -777,6 +784,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testReturnValues(fn_tensor, fn_tuple, 1, 3)
     self._testReturnValues(fn_list, fn_tuple, 2, 3)
 
+  @test_util.run_v1_only("b/120553181")
   def test_singleton_strict(self):
     fn_tensor = lambda: constant_op.constant(1)
     fn_list = lambda: [constant_op.constant(2)]
@@ -798,36 +806,46 @@ class DataTypesTest(test_util.TensorFlowTestCase):
       control_flow_ops.case([(constant_op.constant(True), fn_list)], fn_tuple,
                             strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_list(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: [constant_op.constant(1)]
     fn_false = lambda: [constant_op.constant(3)]
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, [shape], strict=True)
     self._testReturnValues(fn_true, fn_false, [1], [3], strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_tuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: (constant_op.constant(1),)
     fn_false = lambda: (constant_op.constant(3),)
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, (shape,), strict=True)
     self._testReturnValues(fn_true, fn_false, (1,), (3,),
                            strict=True)
 
+  @test_util.run_deprecated_v1
   def test_singleton_namedtuple(self):
     shape = tensor_shape.TensorShape([])
     fn_true = lambda: SingletonTestTuple(constant_op.constant(1))
     fn_false = lambda: SingletonTestTuple(constant_op.constant(3))
-    self._testShape(fn_true, fn_false, shape)
-    self._testReturnValues(fn_true, fn_false, 1, 3)
+    # Non-strict cond is only available in v1
+    if not tf2.enabled():
+      self._testShape(fn_true, fn_false, shape)
+      self._testReturnValues(fn_true, fn_false, 1, 3)
     self._testShape(fn_true, fn_false, SingletonTestTuple(shape),
                     strict=True)
     self._testReturnValues(fn_true, fn_false, SingletonTestTuple(1),
                            SingletonTestTuple(3), strict=True)
 
+  @test_util.run_deprecated_v1
   def test_tuple(self):
     shape = (tensor_shape.TensorShape([]), tensor_shape.TensorShape([]))
     fn_true = lambda: (constant_op.constant(1), 2)
@@ -835,6 +853,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, (1, 2), (3, 4))
 
+  @test_util.run_deprecated_v1
   def test_namedtuple(self):
     shape = TestTuple(tensor_shape.TensorShape([]),
                       tensor_shape.TensorShape([]))
@@ -843,6 +862,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
     self._testShape(fn_true, fn_false, shape)
     self._testReturnValues(fn_true, fn_false, TestTuple(1, 2), TestTuple(3, 4))
 
+  @test_util.run_deprecated_v1
   def test_nested(self):
     shape = [tensor_shape.TensorShape([]),
              TestTuple(tensor_shape.TensorShape([]),
@@ -868,6 +888,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
         [11, TestTuple(12, [13, 14]),
          np.ones([5, 5]), 16])
 
+  @test_util.run_deprecated_v1
   def test_cond_inside_while_loop(self):
 
     def body(i, matrix):
@@ -889,6 +910,7 @@ class DataTypesTest(test_util.TensorFlowTestCase):
 
 class CaseTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testCase_withDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -900,6 +922,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 6)
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -913,6 +936,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 2})
 
+  @test_util.run_deprecated_v1
   def testCase_multiple_matches_non_exclusive(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -925,6 +949,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       self.assertEqual(sess.run(output, feed_dict={x: 2}), 4)
       self.assertEqual(sess.run(output, feed_dict={x: 3}), 8)
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2)),
@@ -938,6 +963,7 @@ class CaseTest(test_util.TensorFlowTestCase):
       with self.assertRaisesRegexp(errors.InvalidArgumentError, "Input error:"):
         sess.run(output, feed_dict={x: 4})
 
+  @test_util.run_deprecated_v1
   def testCase_withoutDefault_oneCondition(self):
     x = array_ops.placeholder(dtype=dtypes.int32, shape=[])
     conditions = [(math_ops.equal(x, 1), lambda: constant_op.constant(2))]
@@ -979,6 +1005,7 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
     # Expect a tuple since that is what the body returns.
     self.assertEqual(self.evaluate(r), (10,))
 
+  @test_util.run_deprecated_v1
   def testWhileLoopSameReturnShape_False(self):
     i = constant_op.constant(0)
     c = lambda i, _: math_ops.less(i, 10)
@@ -1004,6 +1031,7 @@ class WhileLoopTestCase(test_util.TensorFlowTestCase):
 
 class AssertTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testAssert(self):
     i = constant_op.constant(0)
     c = control_flow_ops.Assert(i < 10, [i, [10], [i + 1]])
@@ -1014,6 +1042,18 @@ class AssertTest(test_util.TensorFlowTestCase):
     with self.assertRaises(errors.InvalidArgumentError):
       self.evaluate(c)
 
+  @test_util.run_in_graph_and_eager_modes
+  def testAssertInFunction(self):
+
+    @def_function.function
+    def whiny(value):
+      control_flow_ops.Assert(value, ["Raised false"])
+      return constant_op.constant(5)
+
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(whiny(False))
+
+    self.assertAllEqual(whiny(True), 5)
 
 if __name__ == "__main__":
   googletest.main()
diff --git a/tensorflow/python/ops/control_flow_util.py b/tensorflow/python/ops/control_flow_util.py
index 72c074ed1af208da274edd52572961ecaa613b34..cb628f4aa6441ec9cb03dfe873a79d06a66e37a1 100644
--- a/tensorflow/python/ops/control_flow_util.py
+++ b/tensorflow/python/ops/control_flow_util.py
@@ -38,6 +38,11 @@ def IsInXLAContext(op):
   return GetContainingXLAContext(ctxt) is not None
 
 
+def InXlaContext(graph):
+  ctxt = graph._get_control_flow_context()  # pylint: disable=protected-access
+  return GetContainingXLAContext(ctxt) is not None
+
+
 def IsInWhileLoop(op):
   ctxt = op._get_control_flow_context()  # pylint: disable=protected-access
   return GetContainingWhileContext(ctxt) is not None
diff --git a/tensorflow/python/ops/control_flow_util_v2.py b/tensorflow/python/ops/control_flow_util_v2.py
index cab1d7b02e10812bba4cd6b1697a0da60031fa75..5f56850884a5e9e424c77515406ef8c9b513e972 100644
--- a/tensorflow/python/ops/control_flow_util_v2.py
+++ b/tensorflow/python/ops/control_flow_util_v2.py
@@ -19,10 +19,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework.func_graph import FuncGraph
+from tensorflow.python.ops import control_flow_util
 
 
 class CondBranchFuncGraph(FuncGraph):
@@ -90,3 +92,31 @@ def unique_fn_name(scope, name):
 
 def unique_grad_fn_name(forward_name):
   return "%s_grad_%s" % (forward_name, ops.uid())
+
+
+def maybe_set_lowering_attr(op):
+  """Sets the flag to enable lowering on `op` if necessary.
+
+  Lowering allows cond_v2 and while_v2 to avoid some of the limitations of
+  Functions, allowing users to specify devices & colocation inside of cond_v2
+  and while_v2 input functions, and enabling non-strict evaluation & partial
+  pruning. This brings v2 control flow closer to feature parity with v1 control
+  flow.
+
+  However, we do not lower in the following cases:
+    - When the `If` or `While` ops are in the XLA context. Because it is easier
+      for XLA to apply its own optimizations when dealing with un-lowered
+      control flow operators than with low-level control flow primitives.
+    - When the eager execution context specifies the executor of functions to
+      be the single threaded executor (see context.function_executor_type()).
+      Because the single threaded executor does not support v1 control flow ops.
+
+  Args:
+    op: An `If` or `While` Operation.
+  """
+  if (not control_flow_util.IsInXLAContext(op) and
+      context.context().get_function_call_options().executor_type
+      != "SINGLE_THREADED_EXECUTOR"):
+    # pylint: disable=protected-access
+    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
+    # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/ctc_ops.py b/tensorflow/python/ops/ctc_ops.py
index e1071afd8e00728b896a7ac03eb2e07cea2dbe74..3a7eb9355a66a213d3d60f103b818ef22fd839bd 100644
--- a/tensorflow/python/ops/ctc_ops.py
+++ b/tensorflow/python/ops/ctc_ops.py
@@ -19,17 +19,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import function
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import functional_ops
 from tensorflow.python.ops import gen_ctc_ops
+from tensorflow.python.ops import inplace_ops
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.ops import sparse_ops
 from tensorflow.python.ops.nn_grad import _BroadcastMul
+from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
 # pylint: disable=protected-access, invalid-name
-@tf_export("nn.ctc_loss")
+@tf_export(v1=["nn.ctc_loss"])
 def ctc_loss(labels, inputs, sequence_length,
              preprocess_collapse_repeated=False,
              ctc_merge_repeated=True,
@@ -336,6 +346,785 @@ def ctc_beam_search_decoder_v2(inputs, sequence_length, beam_width=100,
 
 
 ops.NotDifferentiable("CTCGreedyDecoder")
+ops.NotDifferentiable("CTCBeamSearchDecoder")
 
 
-ops.NotDifferentiable("CTCBeamSearchDecoder")
+def _ctc_state_trans(label_seq):
+  """Compute CTC alignment model transition matrix.
+
+  Args:
+    label_seq: tensor of shape [batch_size, max_seq_length]
+
+  Returns:
+    tensor of shape [batch_size, states, states] with a state transition matrix
+    computed for each sequence of the batch.
+  """
+
+  with ops.name_scope("ctc_state_trans"):
+    label_seq = ops.convert_to_tensor(label_seq, name="label_seq")
+    batch_size = _get_dim(label_seq, 0)
+    num_labels = _get_dim(label_seq, 1)
+
+    num_label_states = num_labels + 1
+    num_states = 2 * num_label_states
+
+    label_states = math_ops.range(num_label_states)
+    blank_states = label_states + num_label_states
+
+    # Start state to first label.
+    start_to_label = [[1, 0]]
+
+    # Blank to label transitions.
+    blank_to_label = array_ops.stack([label_states[1:], blank_states[:-1]], 1)
+
+    # Label to blank transitions.
+    label_to_blank = array_ops.stack([blank_states, label_states], 1)
+
+    # Scatter transitions that don't depend on sequence.
+    indices = array_ops.concat(
+        [start_to_label, blank_to_label, label_to_blank], 0)
+    values = array_ops.ones([_get_dim(indices, 0)])
+    trans = array_ops.scatter_nd(
+        indices, values, shape=[num_states, num_states])
+    trans += linalg_ops.eye(num_states)  # Self-loops.
+
+    # Label to label transitions. Disallow transitions between repeated labels
+    # with no blank state in between.
+    batch_idx = array_ops.zeros_like(label_states[2:])
+    indices = array_ops.stack(
+        [batch_idx, label_states[2:], label_states[1:-1]], 1)
+    indices = array_ops.tile(
+        array_ops.expand_dims(indices, 0), [batch_size, 1, 1])
+    batch_idx = array_ops.expand_dims(math_ops.range(batch_size), 1) * [1, 0, 0]
+    indices += array_ops.expand_dims(batch_idx, 1)
+    repeats = math_ops.equal(label_seq[:, :-1], label_seq[:, 1:])
+    values = 1.0 - math_ops.cast(repeats, dtypes.float32)
+    batched_shape = [batch_size, num_states, num_states]
+    label_to_label = array_ops.scatter_nd(indices, values, batched_shape)
+
+    return array_ops.expand_dims(trans, 0) + label_to_label
+
+
+def ctc_state_log_probs(seq_lengths, max_seq_length):
+  """Computes CTC alignment initial and final state log probabilities.
+
+  Create the initial/final state values directly as log values to avoid
+  having to take a float64 log on tpu (which does not exist).
+
+  Args:
+    seq_lengths: int tensor of shape [batch_size], seq lengths in the batch.
+    max_seq_length: int, max sequence length possible.
+
+  Returns:
+    initial_state_log_probs, final_state_log_probs
+  """
+
+  batch_size = _get_dim(seq_lengths, 0)
+  num_label_states = max_seq_length + 1
+  num_duration_states = 2
+  num_states = num_duration_states * num_label_states
+  log_0 = math_ops.cast(
+      math_ops.log(math_ops.cast(0, dtypes.float64) + 1e-307),
+      dtypes.float32)
+
+  initial_state_log_probs = array_ops.one_hot(
+      indices=array_ops.zeros([batch_size], dtype=dtypes.int32),
+      depth=num_states,
+      on_value=0.0,
+      off_value=log_0, axis=1)
+
+  label_final_state_mask = array_ops.one_hot(
+      seq_lengths, depth=num_label_states, axis=0)
+  duration_final_state_mask = array_ops.ones(
+      [num_duration_states, 1, batch_size])
+  final_state_mask = duration_final_state_mask * label_final_state_mask
+  final_state_log_probs = (1.0 - final_state_mask) * log_0
+  final_state_log_probs = array_ops.reshape(
+      final_state_log_probs, [num_states, batch_size])
+
+  return initial_state_log_probs, array_ops.transpose(final_state_log_probs)
+
+
+def _ilabel_to_state(labels, num_labels, ilabel_log_probs):
+  """Project ilabel log probs to state log probs."""
+
+  num_label_states = _get_dim(labels, 1)
+  blank = ilabel_log_probs[:, :, :1]
+  blank = array_ops.tile(blank, [1, 1, num_label_states + 1])
+  one_hot = array_ops.one_hot(labels, depth=num_labels)
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  ilabel_log_probs = array_ops.expand_dims(ilabel_log_probs, axis=2)
+  state_log_probs = math_ops.reduce_sum(ilabel_log_probs * one_hot, axis=3)
+  state_log_probs = array_ops.concat([state_log_probs, blank], axis=2)
+  return array_ops.pad(
+      state_log_probs, [[0, 0], [0, 0], [1, 0]],
+      constant_values=math_ops.log(0.0))
+
+
+def _state_to_olabel(labels, num_labels, states):
+  """Sum state log probs to ilabel log probs."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+  one_hot = array_ops.one_hot(
+      labels - 1, depth=(num_labels - 1),
+      on_value=0.0, off_value=math_ops.log(0.0))
+  one_hot = array_ops.expand_dims(one_hot, axis=0)
+  label_states = array_ops.expand_dims(label_states, axis=3)
+  label_olabels = math_ops.reduce_logsumexp(label_states + one_hot, axis=2)
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+# pylint: disable=redefined-outer-name
+def _state_to_olabel_unique(labels, num_labels, states, unique):
+  """Sum state log probs to ilabel log probs using unique label indices."""
+
+  num_label_states = _get_dim(labels, 1) + 1
+  label_states = states[:, :, 1:num_label_states]
+  blank_states = states[:, :, num_label_states:]
+
+  unique_y, unique_idx = unique
+  mul_reduce = _sum_states(unique_idx, label_states)
+
+  num_frames = states.shape[0]
+  batch_size = states.shape[1]
+  num_states = num_label_states - 1
+  batch_state_major = array_ops.transpose(mul_reduce, perm=[1, 2, 0])
+  batch_state_major = array_ops.reshape(
+      batch_state_major, [batch_size * num_states, num_frames])
+  batch_offset = math_ops.range(batch_size, dtype=unique_y.dtype) * num_labels
+  indices = unique_y + array_ops.expand_dims(batch_offset, axis=-1)
+  indices = array_ops.reshape(indices, [-1, 1])
+  scatter = array_ops.scatter_nd(
+      indices=indices,
+      updates=batch_state_major,
+      shape=[batch_size * num_labels, num_frames])
+  scatter = array_ops.reshape(scatter, [batch_size, num_labels, num_frames])
+  scatter = array_ops.where(
+      math_ops.equal(scatter, 0.0),
+      array_ops.fill(array_ops.shape(scatter), math_ops.log(0.0)),
+      scatter)
+  label_olabels = array_ops.transpose(scatter, [2, 0, 1])
+  label_olabels = label_olabels[:, :, 1:]
+
+  blank_olabels = math_ops.reduce_logsumexp(
+      blank_states, axis=2, keepdims=True)
+
+  return array_ops.concat([blank_olabels, label_olabels], axis=-1)
+
+
+def ctc_loss_and_grad(logits, labels, label_length, logit_length, unique=None):
+  """Computes the CTC loss and gradients.
+
+  Most users will want fwd_bwd.ctc_loss
+
+  This function returns the computed gradient, it does not have a gradient
+  of its own defined.
+
+  Args:
+    logits: tensor of shape [frames, batch_size, num_labels]
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    unique: (optional) unique label indices as computed by unique(labels)
+      If supplied, enables an implementation that is faster and more memory
+      efficient on TPU.
+
+  Returns:
+    loss: tensor of shape [batch_size]
+    gradient: tensor of shape [frames, batch_size, num_labels]
+  """
+
+  num_labels = _get_dim(logits, 2)
+  max_label_seq_length = _get_dim(labels, 1)
+
+  ilabel_log_probs = nn_ops.log_softmax(logits)
+  state_log_probs = _ilabel_to_state(labels, num_labels, ilabel_log_probs)
+  state_trans_probs = _ctc_state_trans(labels)
+  initial_state_log_probs, final_state_log_probs = ctc_state_log_probs(
+      label_length, max_label_seq_length)
+  fwd_bwd_log_probs, log_likelihood = _forward_backward_log(
+      state_trans_log_probs=math_ops.log(state_trans_probs),
+      initial_state_log_probs=initial_state_log_probs,
+      final_state_log_probs=final_state_log_probs,
+      observed_log_probs=state_log_probs,
+      sequence_length=logit_length)
+
+  if unique:
+    olabel_log_probs = _state_to_olabel_unique(
+        labels, num_labels, fwd_bwd_log_probs, unique)
+  else:
+    olabel_log_probs = _state_to_olabel(labels, num_labels, fwd_bwd_log_probs)
+
+  grad = math_ops.exp(ilabel_log_probs) - math_ops.exp(olabel_log_probs)
+  loss = -log_likelihood
+  return loss, grad
+
+
+def _ctc_loss_grad(op, grad_loss, _):
+  grad = op.outputs[1]
+  grad = [array_ops.reshape(grad_loss, [1, -1, 1]) * grad]
+  grad += [None] * (len(op.inputs) - len(grad))
+  return grad
+
+
+def _ctc_loss_shape(op):
+  return [op.inputs[2].get_shape(), op.inputs[0].get_shape()]
+
+
+@tf_export("nn.ctc_loss", v1=["nn.ctc_loss_v2"])
+def ctc_loss_v2(labels, logits, label_length, logit_length,
+                logits_time_major=True, unique=None,
+                blank_index=None, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Notes:
+      - Same as the "Classic CTC" in TensorFlow 1.x's tf.nn.ctc_loss setting of
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+      - Labels may be supplied as either a dense, zero-padded tensor with a
+        vector of label sequence lengths OR as a SparseTensor.
+      - On TPU and GPU:
+          - Only dense padded labels are supported.
+      - On CPU:
+          - Caller may use SparseTensor or dense padded labels but calling with
+            a SparseTensor will be significantly faster.
+      - Default blank label is 0 rather num_classes - 1, unless overridden by
+        blank_index.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length] or SparseTensor
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size], None if labels is SparseTensor
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by
+      ctc_unique_labels(labels).  If supplied, enable a faster, memory
+      efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+  if isinstance(labels, sparse_tensor.SparseTensor):
+    if blank_index is None:
+      raise ValueError(
+          "blank_index must be given when using SparseTensor labels.")
+
+    if blank_index < 0:
+      blank_index += _get_dim(logits, 2)
+
+    if blank_index != _get_dim(logits, 2) - 1:
+      logits = array_ops.concat([
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+          logits[:, :, blank_index:blank_index+1],
+      ], axis=2)
+      labels = sparse_tensor.SparseTensor(
+          labels.indices,
+          array_ops.where(labels.values < blank_index,
+                          labels.values,
+                          labels.values - 1),
+          labels.dense_shape)
+
+    return ctc_loss(labels=labels,
+                    inputs=logits,
+                    sequence_length=logit_length,
+                    time_major=logits_time_major)
+
+  if blank_index is None:
+    blank_index = 0
+
+  return ctc_loss_dense(labels=labels,
+                        logits=logits,
+                        label_length=label_length,
+                        logit_length=logit_length,
+                        logits_time_major=logits_time_major,
+                        unique=unique,
+                        blank_index=blank_index,
+                        name=name)
+
+
+def ctc_loss_dense(labels, logits, label_length, logit_length,
+                   logits_time_major=True, unique=None,
+                   blank_index=0, name=None):
+  """Computes CTC (Connectionist Temporal Classification) loss.
+
+  This op implements the CTC loss as presented in the article:
+
+  [A. Graves, S. Fernandez, F. Gomez, J. Schmidhuber.
+  Connectionist Temporal Classification: Labeling Unsegmented Sequence Data
+  with Recurrent Neural Networks. ICML 2006, Pittsburgh, USA,
+  pp. 369-376.](http://www.cs.toronto.edu/~graves/icml_2006.pdf)
+
+  Using the batched forward backward algorithm described in:
+
+  [Sim, K. C., Narayanan, A., Bagby, T., Sainath, T. N., & Bacchiani, M.
+  Improving the efficiency of forward-backward algorithm using batched
+    computation in TensorFlow.
+  Automatic Speech Recognition and Understanding Workshop (ASRU),
+    2017 IEEE (pp. 258-264).
+  ](https://ieeexplore.ieee.org/iel7/8260578/8268903/08268944.pdf)
+
+  Notes:
+    Significant differences from tf.nn.ctc_loss:
+      Supports GPU and TPU (tf.nn.ctc_loss supports CPU only):
+        For batched operations, GPU and TPU are significantly faster than using
+        ctc_loss on CPU.
+        This implementation runs on CPU, but significantly slower than ctc_loss.
+      Blank label is 0 rather num_classes - 1, unless overridden by blank_index.
+      Logits and labels are dense arrays with padding rather than SparseTensor.
+      The only mode supported is the same as:
+        preprocess_collapse_repeated=False, ctc_merge_repeated=True
+        To collapse labels, the caller can preprocess label sequence first.
+
+    The dense implementation supports both CPU, GPU and TPU. A fast path is
+    provided that significantly improves memory use for large vocabulary if the
+    caller preprocesses label sequences to get unique label indices on the CPU
+    (eg. in the data input pipeline) using ctc_ops.unique and simplies this in
+    the optional "unique" kwarg. This is especially useful for TPU and GPU but
+    also works with if used on CPU.
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_seq_length]
+    logits: tensor of shape [frames, batch_size, num_labels],
+      if logits_time_major == False, shape is [batch_size, frames, num_labels].
+    label_length: tensor of shape [batch_size]
+      Length of reference label sequence in labels.
+    logit_length: tensor of shape [batch_size]
+      Length of input sequence in logits.
+    logits_time_major: (optional) If True (default), logits is shaped
+      [time, batch, logits]. If False, shape is [batch, time, logits]
+    unique: (optional) Unique label indices as computed by unique(labels).
+      If supplied, enable a faster, memory efficient implementation on TPU.
+    blank_index: (optional) Set the class index to use for the blank label.
+      Negative values will start from num_classes, ie, -1 will reproduce the
+      ctc_loss behavior of using num_classes - 1 for the blank symbol.
+      There is some memory/performance overhead to switching from the default
+      of 0 as an additional shifted copy of the logits may be created.
+    name: A name for this `Op`. Defaults to "ctc_loss_dense".
+
+  Returns:
+    loss: tensor of shape [batch_size], negative log probabilities.
+  """
+
+  with ops.name_scope(name, "ctc_loss_dense",
+                      [logits, labels, label_length, logit_length]):
+    logits = ops.convert_to_tensor(logits, name="logits")
+    labels = ops.convert_to_tensor(labels, name="labels")
+    label_length = ops.convert_to_tensor(label_length, name="label_length")
+    logit_length = ops.convert_to_tensor(logit_length, name="logit_length")
+
+    if not logits_time_major:
+      logits = array_ops.transpose(logits, perm=[1, 0, 2])
+
+    if blank_index != 0:
+      if blank_index < 0:
+        blank_index += _get_dim(logits, 2)
+      logits = array_ops.concat([
+          logits[:, :, blank_index:blank_index+1],
+          logits[:, :, :blank_index],
+          logits[:, :, blank_index+1:],
+      ], axis=2)
+      labels = array_ops.where(labels < blank_index, labels + 1, labels)
+
+    args = [logits, labels, label_length, logit_length]
+
+    if unique:
+      unique_y, unique_idx = unique
+      args.extend([unique_y, unique_idx])
+
+    # TODO(tombagby): Update to tfe.defun
+    @function.Defun(*[x.dtype for x in args],
+                    python_grad_func=_ctc_loss_grad,
+                    shape_func=_ctc_loss_shape)
+    def compute_ctc_loss(logits_t, labels_t, label_length_t, logit_length_t,
+                         *unique_t):
+      """Compute CTC loss."""
+      logits_t.set_shape(logits.shape)
+      labels_t.set_shape(labels.shape)
+      label_length_t.set_shape(label_length.shape)
+      logit_length_t.set_shape(logit_length.shape)
+      kwargs = dict(
+          logits=logits_t,
+          labels=labels_t,
+          label_length=label_length_t,
+          logit_length=logit_length_t)
+      if unique_t:
+        kwargs["unique"] = unique_t
+      return ctc_loss_and_grad(**kwargs)
+
+    return compute_ctc_loss(*args)[0]
+
+
+@tf_export("nn.collapse_repeated")
+def collapse_repeated(labels, seq_length, name=None):
+  """Merge repeated labels into single labels.
+
+  Args:
+    labels: Tensor of shape (batch, max value in seq_length)
+    seq_length: Tensor of shape (batch), sequence length of each batch element.
+    name: A name for this `Op`. Defaults to "collapse_repeated_labels".
+
+  Returns:
+    tuple of Tensor of shape (batch, max_seq_length) with repeated labels
+    collapsed and padded to max_seq_length, eg:
+        [[A, A, B, B, A],
+         [A, B, C, D, E]] => [[A, B, A, 0, 0],
+                              [A, B, C, D, E]]
+    and int tensor of shape [batch] with new sequence lengths.
+  """
+
+  with ops.name_scope(name, "collapse_repeated_labels",
+                      [labels, seq_length]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    seq_length = ops.convert_to_tensor(seq_length, name="seq_length")
+
+    # Mask labels that don't equal previous label.
+    label_mask = array_ops.concat(
+        [array_ops.ones_like(labels[:, :1], dtypes.bool),
+         math_ops.not_equal(labels[:, 1:], labels[:, :-1])],
+        axis=1)
+
+    # Filter labels that aren't in the original sequence.
+    maxlen = _get_dim(labels, 1)
+    seq_mask = array_ops.sequence_mask(seq_length, maxlen=maxlen)
+    label_mask = math_ops.logical_and(label_mask, seq_mask)
+
+    # Count masks for new sequence lengths.
+    new_seq_len = math_ops.reduce_sum(
+        math_ops.cast(label_mask, dtypes.int32), axis=1)
+
+    # Mask indexes based on sequence length mask.
+    new_maxlen = math_ops.reduce_max(new_seq_len)
+    idx_mask = array_ops.sequence_mask(new_seq_len, maxlen=new_maxlen)
+
+    # Flatten everything and mask out labels to keep and sparse indices.
+    flat_labels = array_ops.reshape(labels, [-1])
+    flat_label_mask = array_ops.reshape(label_mask, [-1])
+    flat_idx_mask = array_ops.reshape(idx_mask, [-1])
+    idx = math_ops.range(_get_dim(flat_idx_mask, 0))
+
+    # Scatter to flat shape.
+    flat = array_ops.scatter_nd(
+        indices=array_ops.expand_dims(
+            array_ops.boolean_mask(idx, flat_idx_mask), axis=1),
+        updates=array_ops.boolean_mask(flat_labels, flat_label_mask),
+        shape=array_ops.shape(flat_idx_mask))
+
+    # Reshape back to square batch.
+    batch_size = _get_dim(labels, 0)
+    new_shape = [batch_size, new_maxlen]
+    return (array_ops.reshape(flat, new_shape),
+            math_ops.cast(new_seq_len, seq_length.dtype))
+
+
+def dense_labels_to_sparse(dense, length):
+  """Convert dense labels with sequence lengths to sparse tensor.
+
+  Args:
+    dense: tensor of shape [batch, max_length]
+    length: int tensor of shape [batch]
+      The length of each sequence in dense.
+
+  Returns:
+    tf.SparseTensor with values only for the valid elements of sequences.
+  """
+
+  flat_values = array_ops.reshape(dense, [-1])
+  flat_indices = math_ops.range(
+      array_ops.shape(flat_values, out_type=dtypes.int64)[0])
+  mask = array_ops.sequence_mask(length, maxlen=array_ops.shape(dense)[1])
+  flat_mask = array_ops.reshape(mask, [-1])
+  indices = array_ops.expand_dims(
+      array_ops.boolean_mask(flat_indices, flat_mask), 1)
+  values = array_ops.boolean_mask(flat_values, flat_mask)
+  sparse = sparse_tensor.SparseTensor(
+      indices=indices, values=math_ops.cast(values, dtypes.int32),
+      dense_shape=array_ops.shape(flat_values, out_type=dtypes.int64))
+  reshaped = sparse_ops.sparse_reshape(sparse, array_ops.shape(dense))
+  max_length = math_ops.reduce_max(length)
+  return sparse_tensor.SparseTensor(
+      indices=reshaped.indices,
+      values=reshaped.values,
+      dense_shape=[
+          math_ops.cast(reshaped.dense_shape[0], dtypes.int64),
+          math_ops.cast(max_length, dtypes.int64)])
+
+
+@tf_export("nn.ctc_unique_labels")
+def ctc_unique_labels(labels, name=None):
+  """Get unique labels and indices for batched labels for tf.nn.ctc_loss.
+
+  For use with tf.nn.ctc_loss_v2 optional argument `unique`: This op can be
+  used to preprocess labels in input pipeline to for better speed/memory use
+  computing the ctc loss on TPU.
+
+  Example:
+    ctc_unique_labels([[3, 4, 4, 3]]) ->
+      unique labels padded with 0: [[3, 4, 0, 0]]
+      indices of original labels in unique: [0, 1, 1, 0]
+
+  Args:
+    labels: tensor of shape [batch_size, max_label_length] padded with 0.
+    name: A name for this `Op`. Defaults to "ctc_unique_labels".
+
+  Returns:
+    tuple of
+      - unique labels, tensor of shape `[batch_size, max_label_length]`
+      - indices into unique labels, shape `[batch_size, max_label_length]`
+  """
+
+  with ops.name_scope(name, "ctc_unique_labels", [labels]):
+    labels = ops.convert_to_tensor(labels, name="labels")
+    def _unique(x):
+      u = array_ops.unique(x)
+      y = array_ops.pad(
+          u.y, [[0, _get_dim(u.idx, 0) - _get_dim(u.y, 0)]])
+      y = math_ops.cast(y, dtypes.int64)
+      return [y, u.idx]
+    return functional_ops.map_fn(
+        _unique, labels, dtype=[dtypes.int64, dtypes.int32])
+
+
+def _sum_states(idx, states):
+  """Take logsumexp for each unique state out of all label states.
+
+  Args:
+    idx: tensor of shape [batch, label_length]
+      For each sequence, indices into a set of unique labels as computed by
+      calling unique.
+    states: tensor of shape [frames, batch, label_length]
+      Log probabilities for each label state.
+
+  Returns:
+    tensor of shape [frames, batch_size, label_length], log probabilites summed
+      for each unique label of the sequence.
+  """
+
+  with ops.name_scope("sum_states"):
+    idx = ops.convert_to_tensor(idx, name="idx")
+    num_states = _get_dim(states, 2)
+    states = array_ops.expand_dims(states, axis=2)
+    one_hot = array_ops.one_hot(
+        idx, depth=num_states, on_value=0.0, off_value=math_ops.log(0.0),
+        axis=1)
+    return math_ops.reduce_logsumexp(states + one_hot, axis=-1)
+
+
+def _forward_backward_log(state_trans_log_probs, initial_state_log_probs,
+                          final_state_log_probs, observed_log_probs,
+                          sequence_length):
+  """Forward-backward algorithm computed in log domain.
+
+  Args:
+    state_trans_log_probs: tensor of shape [states, states] or
+      if different transition matrix per batch [batch_size, states, states]
+    initial_state_log_probs: tensor of shape [batch_size, states]
+    final_state_log_probs: tensor of shape [batch_size, states]
+    observed_log_probs: tensor of shape [frames, batch_size, states]
+    sequence_length: tensor of shape [batch_size]
+
+  Returns:
+    forward backward log probabilites: tensor of shape [frames, batch, states]
+    log_likelihood: tensor of shape [batch_size]
+
+  Raises:
+    ValueError: If state_trans_log_probs has unknown or incorrect rank.
+  """
+
+  if state_trans_log_probs.shape.ndims == 2:
+    perm = [1, 0]
+  elif state_trans_log_probs.shape.ndims == 3:
+    perm = [0, 2, 1]
+  else:
+    raise ValueError(
+        "state_trans_log_probs rank must be known and == 2 or 3, is: %s" %
+        state_trans_log_probs.shape.ndims)
+
+  bwd_state_trans_log_probs = array_ops.transpose(state_trans_log_probs, perm)
+  batch_size = _get_dim(observed_log_probs, 1)
+
+  def _forward(state_log_prob, obs_log_prob):
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+    state_log_prob += obs_log_prob
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+    return state_log_prob
+
+  fwd = _scan(_forward, observed_log_probs, initial_state_log_probs,
+              inclusive=True)
+
+  def _backward(accs, elems):
+    """Calculate log probs and cumulative sum masked for sequence length."""
+    state_log_prob, cum_log_sum = accs
+    obs_log_prob, mask = elems
+    state_log_prob += obs_log_prob
+    state_log_prob = array_ops.expand_dims(state_log_prob, axis=1)  # Broadcast.
+    state_log_prob += bwd_state_trans_log_probs
+    state_log_prob = math_ops.reduce_logsumexp(state_log_prob, axis=-1)
+
+    log_prob_sum = math_ops.reduce_logsumexp(
+        state_log_prob, axis=-1, keepdims=True)
+    state_log_prob -= log_prob_sum
+
+    cum_log_sum += array_ops.squeeze(log_prob_sum) * mask
+    batched_mask = array_ops.expand_dims(mask, axis=1)
+    out = state_log_prob * batched_mask
+    out += final_state_log_probs * (1.0 - batched_mask)
+    return out, cum_log_sum
+
+  zero_log_sum = array_ops.zeros([batch_size])
+  maxlen = _get_dim(observed_log_probs, 0)
+  mask = array_ops.sequence_mask(sequence_length, maxlen, dtypes.float32)
+  mask = array_ops.transpose(mask, perm=[1, 0])
+
+  bwd, cum_log_sum = _scan(_backward, (observed_log_probs, mask),
+                           (final_state_log_probs, zero_log_sum),
+                           reverse=True, inclusive=True)
+
+  fwd_bwd_log_probs = fwd[1:] + bwd[1:]
+  fwd_bwd_log_probs_sum = math_ops.reduce_logsumexp(
+      fwd_bwd_log_probs, axis=2, keepdims=True)
+  fwd_bwd_log_probs -= fwd_bwd_log_probs_sum
+  fwd_bwd_log_probs += math_ops.log(array_ops.expand_dims(mask, axis=2))
+
+  log_likelihood = bwd[0, :, 0] + cum_log_sum[0]
+
+  return fwd_bwd_log_probs, log_likelihood
+
+
+# TODO(tombagby): This is currently faster for the ctc implementation than using
+# functional_ops.scan, but could be replaced by that or something similar if
+# things change.
+def _scan(fn, elems, initial, reverse=False, inclusive=False, final_only=False):
+  """Repeatedly applies callable `fn` to a sequence of elements.
+
+  Implemented by functional_ops.While, tpu friendly, no gradient.
+
+  This is similar to functional_ops.scan but significantly faster on tpu/gpu
+  for the forward backward use case.
+
+  Examples:
+    scan(lambda a, e: a + e, [1.0, 2.0, 3.0], 1.0) => [2.0, 3.0, 4.0]
+
+    Multiple accumulators:
+      scan(lambda a, e: (a[0] + e, a[1] * e), [1.0, 2.0, 3.0], (0.0, 1.0))
+
+    Multiple inputs:
+      scan(lambda a, e: a + (e[0] * e[1]), (elems1, elems2), 0.0)
+
+  Args:
+    fn: callable, fn(accumulators, element) return new accumulator values.
+      The (possibly nested) sequence of accumulators is the same as `initial`
+      and the return value must have the same structure.
+    elems: A (possibly nested) tensor which will be unpacked along the first
+      dimension. The resulting slices will be the second argument to fn. The
+      first dimension of all nested input tensors must be the same.
+    initial: A tensor or (possibly nested) sequence of tensors with initial
+      values for the accumulators.
+    reverse: (optional) True enables scan and output elems in reverse order.
+    inclusive: (optional) True includes the initial accumulator values in the
+      output. Length of output will be len(elem sequence) + 1. Not meaningful
+      if final_only is True.
+    final_only: (optional) When True, return only the final accumulated values,
+      not the concatenation of accumulated values for each input.
+
+  Returns:
+    A (possibly nested) sequence of tensors with the results of applying fn
+    to tensors unpacked from elems and previous accumulator values.
+  """
+
+  flat_elems = [ops.convert_to_tensor(x) for x in nest.flatten(elems)]
+  num_elems = array_ops.shape(flat_elems[0])[0]
+  pack_elems = lambda x: nest.pack_sequence_as(structure=elems, flat_sequence=x)
+  flat_initial = [ops.convert_to_tensor(x) for x in nest.flatten(initial)]
+  pack = lambda x: nest.pack_sequence_as(structure=initial, flat_sequence=x)
+  accum_dtypes = [x.dtype for x in flat_initial]
+  num_accums = len(flat_initial)
+
+  # Types for counter, [outputs], [accumulators] loop arguments.
+  if final_only:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes
+  else:
+    loop_dtypes = [dtypes.int32, dtypes.int32] + accum_dtypes + accum_dtypes
+
+  # TODO(tombagby): Update to tfe.defun
+  @function.Defun(*loop_dtypes)
+  def cond(i, num_elems, *args):
+    del args
+    return i >= 0 if reverse else i < num_elems
+
+  # The loop *args are [output tensors] + [accumulator tensors] which must
+  # be paired. Each output corresponds to one accumulator.
+  @function.Defun(*loop_dtypes)
+  def body(i, num_elems, *args):
+    """Loop body."""
+    i.set_shape([])
+    if final_only:
+      accum = args
+    else:
+      out, accum = args[:num_accums], args[num_accums:]
+    slices = [array_ops.gather(e, i) for e in flat_elems]
+    accum = fn(pack(accum), pack_elems(slices))
+    flat_accum = nest.flatten(accum)
+    if final_only:
+      new_out = []
+    else:
+      update_i = i + 1 if inclusive and not reverse else i
+      new_out = [inplace_ops.alias_inplace_update(x, update_i, y)
+                 for x, y in zip(out, flat_accum)]
+    i = i - 1 if reverse else i + 1
+    return [i, num_elems] + new_out + flat_accum
+
+  init_i = (array_ops.shape(flat_elems[0])[0] - 1 if reverse
+            else constant_op.constant(0, dtype=dtypes.int32))
+  outputs = []
+  if not final_only:
+    num_outputs = array_ops.shape(flat_elems[0])[0] + (1 if inclusive else 0)
+    for initial_accum in flat_initial:
+      out_shape = array_ops.concat(
+          [[num_outputs], array_ops.shape(initial_accum)], 0)
+      out = inplace_ops.empty(out_shape, dtype=initial_accum.dtype, init=True)
+      if inclusive:
+        out = inplace_ops.alias_inplace_add(
+            out, init_i + (1 if reverse else 0), initial_accum)
+      outputs.append(out)
+  loop_in = [init_i, num_elems] + outputs + flat_initial
+  hostmem = [
+      i for i, x in enumerate(loop_in)
+      if x.dtype.base_dtype in (dtypes.int32, dtypes.int64)
+  ]
+
+  # TODO(tombagby): Update to while_v2.
+  loop_results = functional_ops.While(loop_in, cond, body, hostmem=hostmem)
+  out = loop_results[2:num_accums + 2]
+  return pack(out)
+
+
+def _get_dim(tensor, i):
+  """Get value of tensor shape[i] preferring static value if available."""
+  return tensor.shape[i].value or array_ops.shape(tensor)[i]
diff --git a/tensorflow/python/ops/custom_gradient.py b/tensorflow/python/ops/custom_gradient.py
index 1426e8851c5f2a379c750f34d34f60fe0674cdf8..d96601ac21c7d7d62423b65a2e43d08449e23129 100644
--- a/tensorflow/python/ops/custom_gradient.py
+++ b/tensorflow/python/ops/custom_gradient.py
@@ -236,6 +236,10 @@ def _graph_mode_decorator(f, *args, **kwargs):
   original_tensors = all_tensors
   with ops.get_default_graph().gradient_override_map({"IdentityN": name}):
     all_tensors = array_ops.identity_n(all_tensors)
+  # Propagate handle data for happier shape inference for resource variables.
+  for i, t in enumerate(original_tensors):
+    if t.dtype == dtypes.resource and hasattr(t, "_handle_data"):
+      all_tensors[i]._handle_data = t._handle_data  # pylint: disable=protected-access
   tape_lib.record_operation(
       f.__name__, all_tensors, original_tensors, tape_grad_fn)
   for ot, t in zip(original_tensors, all_tensors):
diff --git a/tensorflow/python/ops/data_flow_ops.py b/tensorflow/python/ops/data_flow_ops.py
index cca8e12b43460917a51783e5e87322116403f5de..2030332e4eaec8574010217d26ef6ac52dd988d5 100644
--- a/tensorflow/python/ops/data_flow_ops.py
+++ b/tensorflow/python/ops/data_flow_ops.py
@@ -79,7 +79,7 @@ def _as_shape_list(shapes,
     shapes = [shapes]
   shapes = [tensor_shape.as_shape(shape) for shape in shapes]
   if not unknown_dim_allowed:
-    if any([not shape.is_fully_defined() for shape in shapes]):
+    if any(not shape.is_fully_defined() for shape in shapes):
       raise ValueError("All shapes must be fully defined: %s" % shapes)
   if not unknown_rank_allowed:
     if any([shape.dims is None for shape in shapes]):
@@ -171,7 +171,10 @@ class QueueBase(object):
       self._names = None
     self._queue_ref = queue_ref
     if context.executing_eagerly():
-      self._name = context.context().scope_name
+      if context.context().scope_name:
+        self._name = context.context().scope_name
+      else:
+        self._name = "Empty"
       self._resource_deleter = resource_variable_ops.EagerResourceDeleter(
           queue_ref, None)
     else:
@@ -198,11 +201,11 @@ class QueueBase(object):
       raise TypeError("A list of queues expected")
 
     dtypes = queues[0].dtypes
-    if not all([dtypes == q.dtypes for q in queues[1:]]):
+    if not all(dtypes == q.dtypes for q in queues[1:]):
       raise TypeError("Queues do not have matching component dtypes.")
 
     names = queues[0].names
-    if not all([names == q.names for q in queues[1:]]):
+    if not all(names == q.names for q in queues[1:]):
       raise TypeError("Queues do not have matching component names.")
 
     queue_shapes = [q.shapes for q in queues]
@@ -1148,7 +1151,7 @@ class Barrier(object):
         self._barrier_ref, name=name)
 
 
-@tf_export("ConditionalAccumulatorBase")
+@tf_export(v1=["ConditionalAccumulatorBase"])
 class ConditionalAccumulatorBase(object):
   """A conditional accumulator for aggregating gradients.
 
@@ -1227,7 +1230,7 @@ class ConditionalAccumulatorBase(object):
         name=name)
 
 
-@tf_export("ConditionalAccumulator")
+@tf_export(v1=["ConditionalAccumulator"])
 class ConditionalAccumulator(ConditionalAccumulatorBase):
   """A conditional accumulator for aggregating gradients.
 
diff --git a/tensorflow/python/ops/dequantize_op_test.py b/tensorflow/python/ops/dequantize_op_test.py
index 13e50273d863f3c157ee7a089532df0c925c0e5f..794985b2dbb77e4d7691753432c53ddf3ad31377 100644
--- a/tensorflow/python/ops/dequantize_op_test.py
+++ b/tensorflow/python/ops/dequantize_op_test.py
@@ -35,7 +35,7 @@ class DequantizeOpTest(test.TestCase):
     with self.cached_session():
       input_op = constant_op.constant(inputs, shape=[len(inputs)], dtype=dtype)
       dequantized = array_ops.dequantize(input_op, min_range, max_range)
-      tf_ans = dequantized.eval()
+      tf_ans = self.evaluate(dequantized)
 
     # TODO(vrv): Add support for DT_QINT32 quantization if needed.
     type_dict = {
diff --git a/tensorflow/python/ops/distributions/util.py b/tensorflow/python/ops/distributions/util.py
index 760e7a8a84bbf3316175136fd9203035165435d0..24314e8fc92b3aef2718dd6668ca5564764aa8f4 100644
--- a/tensorflow/python/ops/distributions/util.py
+++ b/tensorflow/python/ops/distributions/util.py
@@ -343,7 +343,7 @@ def embed_check_categorical_event_shape(
     x_dtype = x.dtype.base_dtype
     max_event_size = (_largest_integer_by_dtype(x_dtype)
                       if x_dtype.is_floating else 0)
-    if max_event_size is 0:
+    if max_event_size == 0:
       raise TypeError("Unable to validate size of unrecognized dtype "
                       "({}).".format(x_dtype.name))
     try:
diff --git a/tensorflow/python/ops/embedding_ops.py b/tensorflow/python/ops/embedding_ops.py
index 9ce024ad9653e11be6410d8becf0d2c469bc018c..d0291e2095bdb6574c707c7458e4cc335fc4b825 100644
--- a/tensorflow/python/ops/embedding_ops.py
+++ b/tensorflow/python/ops/embedding_ops.py
@@ -247,7 +247,7 @@ def _embedding_lookup_and_transform(params,
       return ret
 
 
-@tf_export("nn.embedding_lookup")
+@tf_export(v1=["nn.embedding_lookup"])
 def embedding_lookup(
     params,
     ids,
@@ -316,7 +316,66 @@ def embedding_lookup(
       transform_fn=None)
 
 
-@tf_export("nn.embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup", v1=[])
+def embedding_lookup_v2(
+    params,
+    ids,
+    partition_strategy="mod",
+    max_norm=None,
+    name=None):
+  """Looks up `ids` in a list of embedding tensors.
+
+  This function is used to perform parallel lookups on the list of
+  tensors in `params`.  It is a generalization of
+  `tf.gather`, where `params` is
+  interpreted as a partitioning of a large embedding tensor.  `params` may be
+  a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  If `len(params) > 1`, each element `id` of `ids` is partitioned between
+  the elements of `params` according to the `partition_strategy`.
+  In all strategies, if the id space does not evenly divide the number of
+  partitions, each of the first `(max_id + 1) % len(params)` partitions will
+  be assigned one more id.
+
+  If `partition_strategy` is `"mod"`, we assign each id to partition
+  `p = id % len(params)`. For instance,
+  13 ids are split across 5 partitions as:
+  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`
+
+  If `partition_strategy` is `"div"`, we assign ids to partitions in a
+  contiguous manner. In this case, 13 ids are split across 5 partitions as:
+  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`
+
+  The results of the lookup are concatenated into a dense
+  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.
+
+  Args:
+    params: A single tensor representing the complete embedding tensor,
+      or a list of P tensors all of same shape except for the first dimension,
+      representing sharded embedding tensors.  Alternatively, a
+      `PartitionedVariable`, created by partitioning along dimension 0. Each
+      element must be appropriately sized for the given `partition_strategy`.
+    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
+      up in `params`.
+    partition_strategy: A string specifying the partitioning strategy, relevant
+      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
+      is `"mod"`.
+    max_norm: If not `None`, each embedding is clipped if its l2-norm is
+      larger than this value.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` with the same type as the tensors in `params`.
+
+  Raises:
+    ValueError: If `params` is empty.
+  """
+  return embedding_lookup(params, ids, partition_strategy, name,
+                          max_norm=max_norm)
+
+
+@tf_export(v1=["nn.embedding_lookup_sparse"])
 def embedding_lookup_sparse(params,
                             sp_ids,
                             sp_weights,
@@ -491,7 +550,85 @@ def embedding_lookup_sparse(params,
     return embeddings
 
 
-@tf_export("nn.safe_embedding_lookup_sparse")
+@tf_export("nn.embedding_lookup_sparse", v1=[])
+def embedding_lookup_sparse_v2(params,
+                               sp_ids,
+                               sp_weights,
+                               partition_strategy="mod",
+                               combiner=None,
+                               max_norm=None,
+                               name=None):
+  return embedding_lookup_sparse_v2(
+      params, sp_ids, sp_weights, partition_strategy, name, combiner, max_norm)
+
+
+embedding_lookup_sparse_v2.__doc__ = embedding_lookup_sparse.__doc__
+
+
+@tf_export("nn.safe_embedding_lookup_sparse", v1=[])
+def safe_embedding_lookup_sparse_v2(embedding_weights,
+                                    sparse_ids,
+                                    sparse_weights=None,
+                                    combiner="mean",
+                                    default_id=None,
+                                    max_norm=None,
+                                    name=None):
+  """Lookup embedding results, accounting for invalid IDs and empty features.
+
+  The partitioned embedding in `embedding_weights` must all be the same shape
+  except for the first dimension. The first dimension is allowed to vary as the
+  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
+  may be a `PartitionedVariable` as returned by using `tf.get_variable()` with a
+  partitioner.
+
+  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
+  with non-positive weight. For an entry with no features, the embedding vector
+  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.
+
+  The ids and weights may be multi-dimensional. Embeddings are always aggregated
+  along the last dimension.
+
+  Note: when doing embedding lookup on `embedding_weights`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    embedding_weights:  A list of `P` float `Tensor`s or values representing
+      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
+      created by partitioning along dimension 0.  The total unpartitioned shape
+      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
+      and `e_1, ..., e_m` are the embedding dimensions.
+    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
+      ids. `d_0` is typically batch size.
+    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
+      float weights corresponding to `sparse_ids`, or `None` if all weights are
+      be assumed to be 1.0.
+    combiner: A string specifying how to combine embedding results for each
+      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
+      default.
+    default_id: The id to use for an entry with no features.
+    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
+      combining.
+    name: A name for this operation (optional).
+
+  Returns:
+    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
+
+  Raises:
+    ValueError: if `embedding_weights` is empty.
+  """
+  return safe_embedding_lookup_sparse(
+      embedding_weights,
+      sparse_ids,
+      sparse_weights=sparse_weights,
+      combiner=combiner,
+      default_id=default_id,
+      name=name,
+      partition_strategy="div",
+      max_norm=max_norm)
+
+
+@tf_export(v1=["nn.safe_embedding_lookup_sparse"])
 def safe_embedding_lookup_sparse(embedding_weights,
                                  sparse_ids,
                                  sparse_weights=None,
@@ -554,7 +691,10 @@ def safe_embedding_lookup_sparse(embedding_weights,
 
   dtype = sparse_weights.dtype if sparse_weights is not None else None
   embedding_weights = [
-      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
+      w if (isinstance(w, resource_variable_ops.ResourceVariable)
+            and dtype in (None, w.dtype))
+      else ops.convert_to_tensor(w, dtype=dtype)
+      for w in embedding_weights
   ]
 
   with ops.name_scope(name, 'embedding_lookup',
diff --git a/tensorflow/python/ops/functional_ops.py b/tensorflow/python/ops/functional_ops.py
index fecd7ddbf9ffd8dd68f84e772350e5b05946d12c..57542e3c7baa0f4eb3dc53431c9a3060f0998c5b 100644
--- a/tensorflow/python/ops/functional_ops.py
+++ b/tensorflow/python/ops/functional_ops.py
@@ -19,7 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.core.protobuf import rewriter_config_pb2
+from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.framework import attr_value_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
@@ -1027,9 +1027,10 @@ _rewriter_config_optimizer_disabled = None
 def _get_disabled_rewriter_config():
   global _rewriter_config_optimizer_disabled
   if _rewriter_config_optimizer_disabled is None:
-    rewriter_config = rewriter_config_pb2.RewriterConfig()
+    config = config_pb2.ConfigProto()
+    rewriter_config = config.graph_options.rewrite_options
     rewriter_config.disable_meta_optimizer = True
-    _rewriter_config_optimizer_disabled = rewriter_config.SerializeToString()
+    _rewriter_config_optimizer_disabled = config.SerializeToString()
   return _rewriter_config_optimizer_disabled
 
 
@@ -1048,7 +1049,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
       the signature of `f`.
     executing_eagerly: (Optional) A boolean indicating whether the context is
       executing eagerly. If `None`, fetched from the global context.
-    config: (Optional) A tensorflow::RewriterConfig proto, serialized. If
+    config: (Optional) A `tensorflow::ConfigProto` proto, serialized. If
       `None`, all optimizations are disabled. Currently only handled for eager
       defined functions.
     executor_type: (Optional) A string for the name of the executor to be used
@@ -1076,10 +1077,12 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   if executing_eagerly or len(tout):
     if f.stateful_ops:
       outputs = gen_functional_ops.stateful_partitioned_call(
-          args=args, Tout=tout, f=f, config=config, executor_type=executor_type)
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     else:
       outputs = gen_functional_ops.partitioned_call(
-          args=args, Tout=tout, f=f, config=config, executor_type=executor_type)
+          args=args, Tout=tout, f=f, config_proto=config,
+          executor_type=executor_type)
     return outputs if outputs else None
 
   # The generated binding returns an empty list for functions that don't
@@ -1098,7 +1101,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
   # When running in graph mode, the graph and function graphs are optimized
   # (i.e. run through grappler) per the session options, so we can disable any
   # eager-specific rewriting.
-  rewriter_config = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
+  config_proto = attr_value_pb2.AttrValue(s=_get_disabled_rewriter_config())
 
   graph = ops.get_default_graph()
   f.add_to_graph(graph)
@@ -1113,7 +1116,7 @@ def partitioned_call(args, f, tout=None, executing_eagerly=None, config=None,
           "Tin": tin_attr,
           "Tout": tout_attr,
           "f": func_attr,
-          "config": rewriter_config,
+          "config_proto": config_proto,
           "executor_type": executor_type_attr,
       })
   outputs = op.outputs
diff --git a/tensorflow/python/ops/gradient_checker.py b/tensorflow/python/ops/gradient_checker.py
index 1665219c80c4cc92e25d132f5d84e384b5b6a704..683f78ce9b21c5a1b5d8b60017588ee8a09686f2 100644
--- a/tensorflow/python/ops/gradient_checker.py
+++ b/tensorflow/python/ops/gradient_checker.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -158,7 +157,8 @@ def _compute_numeric_jacobian(x, x_shape, x_data, y, y_shape, delta,
   # as delta. Convert to float32 here. Since numeric_jacobian is expected to
   # be the groundtruth to compare against, it shouldn't lose any information.
   if x.dtype == dtypes.bfloat16:
-    x = math_ops.cast(x, dtypes.float32)
+    x = math_ops.cast(x, dtypes.float32)  # TODO(wangpeng): Now that the new x
+            # is an output of the old x, isn't feeding to the new x a mistake?
   if y.dtype == dtypes.bfloat16:
     y = math_ops.cast(y, dtypes.float32)
   if x_data.dtype == dtypes.bfloat16.as_numpy_dtype:
@@ -266,7 +266,7 @@ def _compute_gradient_list(x,
   return ret
 
 
-@tf_export("test.compute_gradient")
+@tf_export(v1=["test.compute_gradient"])
 def compute_gradient(x,
                      x_shape,
                      y,
@@ -301,7 +301,6 @@ def compute_gradient(x,
       as the initial value.
     delta: (optional) the amount of perturbation.
     init_targets: list of targets to run to initialize model params.
-      TODO(mrry): remove this argument.
     extra_feed_dict: dict that allows fixing specified tensor values
       during the Jacobian calculation.
 
@@ -311,6 +310,7 @@ def compute_gradient(x,
     where "x_size" is the number of elements in x and "y_size" is the
     number of elements in y. If x is a list, returns a list of two numpy arrays.
   """
+  # TODO(mrry): remove argument `init_targets`
   if extra_feed_dict is None:
     extra_feed_dict = {}
 
@@ -328,10 +328,17 @@ def compute_gradient(x,
     return ret
 
 
+def _compute_error(grad):
+  if isinstance(grad, tuple):
+    grad = [grad]
+  error = 0
+  for j_t, j_n in grad:
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
+
+
 @tf_export(v1=["test.compute_gradient_error"])
-@deprecation.deprecated_args(
-    None, "init_targets will be deprecated in TensorFlow 2.0",
-    ("init_targets", None))  # Do not trigger warning in V2
 def compute_gradient_error(x,
                            x_shape,
                            y,
@@ -373,59 +380,4 @@ def compute_gradient_error(x,
   """
   grad = compute_gradient(x, x_shape, y, y_shape, x_init_value, delta,
                           init_targets, extra_feed_dict=extra_feed_dict)
-  if isinstance(grad, tuple):
-    grad = [grad]
-  error = 0
-  for j_t, j_n in grad:
-    if j_t.size or j_n.size:  # Handle zero size tensors correctly
-      error = np.maximum(error, np.fabs(j_t - j_n).max())
-  return error
-
-
-@tf_export("test.compute_gradient_error", v1=[])
-def compute_gradient_error_v2(x,
-                              x_shape,
-                              y,
-                              y_shape,
-                              x_init_value=None,
-                              delta=1e-3,
-                              extra_feed_dict=None):
-  """Computes the gradient error.
-
-  Computes the maximum error for dy/dx between the computed Jacobian and the
-  numerically estimated Jacobian.
-
-  This function will modify the tensors passed in as it adds more operations
-  and hence changing the consumers of the operations of the input tensors.
-
-  This function adds operations to the current session. To compute the error
-  using a particular device, such as a GPU, use the standard methods for
-  setting a device (e.g. using with sess.graph.device() or setting a device
-  function in the session constructor).
-
-  Args:
-    x: a tensor or list of tensors
-    x_shape: the dimensions of x as a tuple or an array of ints. If x is a list,
-      then this is the list of shapes.
-    y: a tensor
-    y_shape: the dimensions of y as a tuple or an array of ints.
-    x_init_value: (optional) a numpy array of the same shape as "x" representing
-      the initial value of x. If x is a list, this should be a list of numpy
-      arrays.  If this is none, the function will pick a random tensor as the
-      initial value.
-    delta: (optional) the amount of perturbation.
-    extra_feed_dict: dict that allows fixing specified tensor values during the
-      Jacobian calculation.
-
-  Returns:
-    The maximum error in between the two Jacobians.
-  """
-  return compute_gradient_error(
-      x,
-      x_shape,
-      y,
-      y_shape,
-      x_init_value=x_init_value,
-      delta=delta,
-      init_targets=None,
-      extra_feed_dict=extra_feed_dict)
+  return _compute_error(grad)
diff --git a/tensorflow/python/ops/gradient_checker_test.py b/tensorflow/python/ops/gradient_checker_test.py
index 66c7b9a71b530b2b799a4ade3799dc18bfc526ea..4d2b5efac7beb258f2720055bb3db56e9790042f 100644
--- a/tensorflow/python/ops/gradient_checker_test.py
+++ b/tensorflow/python/ops/gradient_checker_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import math_ops
@@ -46,6 +47,7 @@ def _nan_grad(unused_op, grad):
 
 class GradientCheckerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAddSimple(self):
     np.random.seed(1)  # Fix seed to avoid flakiness
     with self.session(use_gpu=False):
@@ -60,6 +62,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x1 error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testAddSimpleGPU(self):
     np.random.seed(2)  # Fix seed to avoid flakiness
     with self.session(use_gpu=True):
@@ -74,6 +77,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x1 error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testAddCustomized(self):
     np.random.seed(3)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -92,6 +96,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("x2 error = %f", error)
     assert error < 1e-10
 
+  @test_util.run_deprecated_v1
   def testGather(self):
     np.random.seed(4)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -109,6 +114,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("gather error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testNestedGather(self):
     np.random.seed(5)  # Fix seed to avoid flakiness
     with self.cached_session():
@@ -130,6 +136,7 @@ class GradientCheckerTest(test.TestCase):
     tf_logging.info("nested gather error = %f", error)
     assert error < 1e-4
 
+  @test_util.run_deprecated_v1
   def testComplexMul(self):
     with self.cached_session():
       size = ()
@@ -144,6 +151,7 @@ class GradientCheckerTest(test.TestCase):
       self.assertLess(
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-4)
 
+  @test_util.run_deprecated_v1
   def testComplexConj(self):
     with self.cached_session():
       size = ()
@@ -157,6 +165,7 @@ class GradientCheckerTest(test.TestCase):
       self.assertLess(
           gradient_checker.compute_gradient_error(x, size, y, size), 2e-5)
 
+  @test_util.run_deprecated_v1
   def testEmptySucceeds(self):
     with self.cached_session():
       x = array_ops.placeholder(dtypes.float32)
@@ -279,18 +288,23 @@ class MiniMNISTTest(test.TestCase):
     tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
     return err
 
+  @test_util.run_deprecated_v1
   def testInputGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(0, "input"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testHiddenWeightGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(1, "hidden_weight"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testHiddenBiasGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(2, "hidden_bias"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testSoftmaxWeightGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(3, "softmax_weight"), 1e-8)
 
+  @test_util.run_deprecated_v1
   def testSoftmaxBiasGradient(self):
     self.assertLess(self._BuildAndTestMiniMNIST(4, "softmax_bias"), 1e-8)
 
diff --git a/tensorflow/python/ops/gradient_checker_v2.py b/tensorflow/python/ops/gradient_checker_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d473eeb5f4f00087672da53c5fef3ab63bdbd08
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2.py
@@ -0,0 +1,329 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Gradient checker for functions.
+
+The gradient checker verifies numerically that an function properly
+computes the gradients
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _product(t):
+  if isinstance(t, int):
+    return t
+  else:
+    y = 1
+    for x in t:
+      y *= x
+    return y
+
+
+def _eval_indexed_slices(a):
+  """Converts IndexedSlices to IndexedSlicesValue with numpy indices/values.
+
+  When eager execution is enabled, converts IndexedSlices
+  to IndexedSlicesValue with numpy indices/values.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is IndexedSlices and eager execution is enabled, calls numpy() on a's
+    fields. Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.IndexedSlices) and context.executing_eagerly():
+    return ops.IndexedSlicesValue(
+        indices=[x.numpy() for x in a.indices],
+        values=[x.numpy() for x in a.values],
+        dense_shape=a.dense_shape)
+  return a
+
+
+def _to_numpy(a):
+  """Converts Tensors and EagerTensors to numpy arrays.
+
+  Args:
+    a: any value.
+
+  Returns:
+    If a is EagerTensor or Tensor, returns the evaluation of a by calling
+    numpy() or run(). Otherwise returns a unchanged.
+  """
+  if isinstance(a, ops.EagerTensor):
+    return a.numpy()
+  if isinstance(a, ops.Tensor):
+    sess = ops.get_default_session()
+    return sess.run(a)
+  return a
+
+
+def _prepare(f, xs_dtypes):
+  """Return a function that executes 'f'.
+
+    In TF 2.x, this is the same as `f`.
+    In TF 1.x, returns a Python function that executes the graph defined by `f`
+    in a Session.
+
+  Args:
+    f: the function.
+    xs_dtypes: dtypes of f's arguments.
+
+  Returns:
+    a function that will be evaluated in both graph and eager mode
+  """
+  if context.executing_eagerly():
+
+    def decorated_eager(*xs_data):
+      return f(*map(ops.convert_to_tensor, xs_data))
+
+    return decorated_eager
+  xs = [array_ops.placeholder(x_dtype) for x_dtype in xs_dtypes]
+  y = f(*xs)
+  sess = ops.get_default_session()
+  def decorated_graph(*xs_data):
+    xs_data = [_to_numpy(a) for a in xs_data]
+    return sess.run(y, feed_dict=dict(zip(xs, xs_data)))
+  return decorated_graph
+
+
+def _compute_theoretical_jacobian(f, y_shape, y_dtype, xs, param):
+  """Computes the theoretical Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_shape: the shape of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  x = xs[param]
+  # Complex vectors are treated as vectors of twice as many reals.
+  x_shape = tuple(x.shape) + (2,) if x.dtype.is_complex else x.shape
+  y_factor = 2 if y_dtype.is_complex else 1
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors.
+  x_size = _product(x_shape)
+  x_val_size = _product(x_shape[1:])  # This is used for sparse gradients
+  y_size = _product(y_shape) * y_factor
+
+  # Allocate 2-D Jacobian, with x dimensions smashed into the first
+  # dimension and y dimensions smashed into the second.
+  jacobian = np.zeros((x_size, y_size), dtype=x.dtype.real_dtype.as_numpy_dtype)
+
+  # For each of the entry of dy, we set this to be 1 and
+  # everything else to be 0 and compute the gradients -- this will give us one
+  # one column of the Jacobian matrix.
+  dy_data = np.zeros(y_shape, dtype=y_dtype.as_numpy_dtype)
+  dy_data_flat = dy_data.ravel().view(y_dtype.real_dtype.as_numpy_dtype)
+  grad_fn_unprep = backprop.gradients_function(f, [param])
+  grad_fn = _prepare(lambda dy, *xs: grad_fn_unprep(*xs, dy=dy),
+                     [y_dtype] + [x.dtype for x in xs])
+  for col in range(y_size):
+    dy_data_flat[col] = 1
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    grad = _eval_indexed_slices(grad)
+    dy_data_flat[col] = 0
+    if isinstance(grad, ops.IndexedSlicesValue):
+      for i, v in zip(grad.indices, grad.values):
+        r_begin = i * x_val_size
+        r_end = r_begin + x_val_size
+        jacobian[r_begin:r_end, col] += v.flat
+    else:
+      jacobian[:, col] = grad.ravel().view(jacobian.dtype)
+
+  # If the output is empty, run the gradients at least once and make sure
+  # they produce zeros.
+  if y_size == 0:  # don't use 'not y_size', because y_size may not be an int
+    grad = _to_numpy(grad_fn(dy_data, *xs)[0])
+    if grad.shape != x.shape:
+      raise ValueError("Empty gradient has wrong shape: expected %s, got %s" %
+                       (x.shape, grad.shape))
+    if np.any(grad):
+      raise ValueError("Empty tensor with nonzero gradients")
+
+  logging.vlog(1, "Theoretical Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_numeric_jacobian(f, y_size, y_dtype, xs, param,
+                              delta):
+  """Computes the numeric Jacobian for f regarding xs[param].
+
+  One can think of the relation among f, xs and y as y = f(xs).
+
+  Args:
+    f: the function.
+    y_size: the number of elements of the result.
+    y_dtype: the dtype of the result.
+    xs: a list of tensors.
+    param: the index of the target parameter.
+    delta: the amount of perturbation we give to the input.
+
+  Returns:
+    A 2-d numpy array representing the Jacobian. It has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in xs[param]
+    and "y_size" is the number of elements in the result.
+  """
+  # bfloat16 doesn't have enough bits to represent high precision numbers such
+  # as delta. Convert to float32 here. Since numeric_jacobian is expected to
+  # be the groundtruth to compare against, it shouldn't lose any information.
+  x_shape = xs[param].shape
+  x_dtype = xs[param].dtype
+  if y_dtype == dtypes.bfloat16:
+    f = lambda *xs: math_ops.cast(f(*xs), dtypes.float32)
+    y_dtype = dtypes.float32
+
+  # To compute the jacobian, we treat x and y as one-dimensional vectors
+  x_size = _product(x_shape) * (2 if x_dtype.is_complex else 1)
+  y_size = y_size * (2 if y_dtype.is_complex else 1)
+  x_dtype = x_dtype.real_dtype.as_numpy_dtype
+  y_dtype = y_dtype.real_dtype.as_numpy_dtype
+
+  xs_dtypes = [x.dtype for x in xs]
+  # Converts xs to numpy arrays to do in-place perturbation.
+  # Calls asarray() to avoid copying in ravel() later.
+  xs = [np.asarray(_to_numpy(x)) for x in xs]
+  x = xs[param]
+
+  # Make sure we have the right types
+  scale = np.asarray(2 * delta, dtype=y_dtype)[()]
+
+  jacobian = np.zeros((x_size, y_size), dtype=x_dtype)
+  # For each of the entry of x, we slightly perturbs this by adding and
+  # subtracting a delta and then compute difference between the outputs. This
+  # will give us one row of the Jacobian matrix.
+
+  f = _prepare(f, xs_dtypes)
+  for row in range(x_size):
+    original = x.ravel().view(x_dtype)[row]
+    x.ravel().view(x_dtype)[row] += delta
+    y_pos = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    x.ravel().view(x_dtype)[row] -= delta
+    y_neg = _to_numpy(f(*xs))
+    x.ravel().view(x_dtype)[row] = original
+    diff = (y_pos - y_neg) / scale
+    jacobian[row, :] = diff.ravel().view(y_dtype)
+
+  logging.vlog(1, "Numeric Jacobian =\n%s", jacobian)
+  return jacobian
+
+
+def _compute_gradient(f,
+                      y_shape,
+                      y_dtype,
+                      xs,
+                      param,
+                      delta):
+  """Computes the theoretical and numerical jacobian."""
+  x = xs[param]
+  t = x.dtype
+  allowed_types = [dtypes.float16, dtypes.bfloat16, dtypes.float32,
+                   dtypes.float64, dtypes.complex64, dtypes.complex128]
+  assert t.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                         "unsupported type %s of argument %s" %
+                                         (t.name, param))
+  t2 = y_dtype
+  assert t2.base_dtype in allowed_types, ("Cannot compute gradient for"
+                                          "unsupported type %s of y" % t2.name)
+  y_size = _product(y_shape)
+  jacob_t = _compute_theoretical_jacobian(f, y_shape, y_dtype,
+                                          xs, param)
+  jacob_n = _compute_numeric_jacobian(f, y_size, y_dtype, xs,
+                                      param, delta)
+  return jacob_t, jacob_n
+
+
+def _compute_gradient_list(f, xs, delta):
+  """Compute gradients for a list of x values."""
+  # convert xs to tensors so that dtype and shape have uniform types
+  xs = list(map(ops.convert_to_tensor, xs))
+  # run the function to get info of the result
+  xs_dtypes = [x.dtype for x in xs]
+  f_temp = _prepare(f, xs_dtypes)
+  y = f_temp(*xs)
+  return zip(*[_compute_gradient(f, y.shape, dtypes.as_dtype(y.dtype),
+                                 xs, i, delta) for i in range(len(xs))])
+
+
+@tf_export("test.compute_gradient", v1=[])
+def compute_gradient(f, x, delta=1e-3):
+  """Computes the theoretical and numeric Jacobian of f.
+
+  With y = f(x), computes the theoretical and numeric Jacobian dy/dx.
+
+  Args:
+    f: the function.
+    x: a list of tensors.
+    delta: (optional) perturbation used to compute numeric Jacobian.
+
+  Returns:
+    A pair of lists, where the first is a list of 2-d numpy arrays representing
+    the theoretical Jacobians for each argument, and the second list is the
+    numerical ones. Each 2-d array has "x_size" rows
+    and "y_size" columns where "x_size" is the number of elements in the
+    corresponding argument and "y_size" is the number of elements in f(x).
+
+  Raises:
+    ValueError: If result is empty but the gradient is nonzero.
+  """
+  if not isinstance(x, list):
+    raise ValueError(
+        "`x` must be a list of Tensors (arguments to `f`), not a %s" % type(x))
+  return _compute_gradient_list(f, x, delta)
+
+
+def max_error(grad1, grad2):
+  """Computes maximum elementwise gap.
+
+  Computes the maximum elementwise gap between two lists of tensors of the same
+  shape.
+
+  Args:
+    grad1: a lists of tensors.
+    grad2: a lists of tensors with the same shape as grad1.
+
+  Returns:
+    The maximum elementwise gap between the two.
+  """
+  error = 0
+  for j_t, j_n in zip(grad1, grad2):
+    if j_t.size or j_n.size:  # Handle zero size tensors correctly
+      error = np.maximum(error, np.fabs(j_t - j_n).max())
+  return error
diff --git a/tensorflow/python/ops/gradient_checker_v2_test.py b/tensorflow/python/ops/gradient_checker_v2_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..191b2b6568104b7cf49aa2844f7929284c00d74d
--- /dev/null
+++ b/tensorflow/python/ops/gradient_checker_v2_test.py
@@ -0,0 +1,300 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for compute_gradient.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.eager import backprop
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import custom_gradient
+from tensorflow.python.ops import \
+gradient_checker_v2 as gradient_checker
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+# needs this to register gradient for SoftmaxCrossEntropyWithLogits:
+import tensorflow.python.ops.nn_grad  # pylint: disable=unused-import
+from tensorflow.python.platform import test
+from tensorflow.python.platform import tf_logging
+
+
+def _random_complex(shape, dtype):
+  data = np.random.random_sample(shape).astype(dtype.as_numpy_dtype)
+  if dtype.is_complex:
+    data.imag = np.random.random_sample(shape)
+  return data
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class GradientCheckerTest(test.TestCase):
+
+  def testAddSimple(self):
+    size = (2, 3)
+    x1 = constant_op.constant(2.0, shape=size, name="x1")
+    x2 = constant_op.constant(3.0, shape=size, name="x2")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x1: math_ops.add(x1, x2), [x1]))
+    tf_logging.info("x1 error = %f", error)
+    assert error < 1e-4
+
+  def testAddCustomized(self):
+    size = (2, 3)
+    x1 = constant_op.constant(
+        2.0, shape=size, dtype=dtypes.float64, name="x1")
+    x2 = np.asarray(np.arange(6, dtype=np.float64).reshape(2, 3))
+    # checkint gradients for x2 using a special delta
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        lambda x2: math_ops.add(x1, x2),
+        [x2], delta=1e-2))
+    tf_logging.info("x2 error = %f", error)
+    assert error < 1e-10
+
+  def testGather(self):
+    def f(params):
+      index_values = [1, 3]
+      indices = constant_op.constant(index_values, name="i")
+      return array_ops.gather(params, indices, name="y")
+    p_shape = (4, 2)
+    p_size = 8
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("gather error = %f", error)
+    assert error < 1e-4
+
+  def testNestedGather(self):
+    def f(params):
+      index_values = [1, 3, 5, 6]
+      indices = constant_op.constant(index_values, name="i")
+      y = array_ops.gather(params, indices, name="y")
+      index_values2 = [0, 2]
+      indices2 = constant_op.constant(index_values2, name="i2")
+      return array_ops.gather(y, indices2, name="y2")
+    p_shape = (8, 2)
+    p_size = 16
+    params = constant_op.constant(
+        np.arange(p_size).astype(np.float), shape=p_shape, name="p")
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [params]))
+    tf_logging.info("nested gather error = %f", error)
+    assert error < 1e-4
+
+  def testComplexMul(self):
+    c = constant_op.constant(5 + 7j, dtype=dtypes.complex64)
+    def f(x):
+      return c * x
+    x_shape = c.shape
+    x_dtype = c.dtype
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[5, 7], [-7, 5]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=1e-4)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 3e-4)
+
+  def testComplexConj(self):
+    def f(x):
+      return math_ops.conj(x)
+    x_shape = ()
+    x_dtype = dtypes.complex64
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    analytical, numerical = gradient_checker.compute_gradient(
+        f, [x])
+    correct = np.array([[1, 0], [0, -1]])
+    self.assertAllEqual(correct, analytical[0])
+    self.assertAllClose(correct, numerical[0], rtol=2e-5)
+    x = constant_op.constant(_random_complex(x_shape, x_dtype))
+    self.assertLess(
+        gradient_checker.max_error(*gradient_checker.compute_gradient(
+            f, [x])), 2e-5)
+
+  def testEmptySucceeds(self):
+    def f(x):
+      return array_ops.identity(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    for grad in gradient_checker.compute_gradient(f, [x]):
+      self.assertEqual(grad[0].shape, (0, 0))
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    self.assertEqual(error, 0)
+
+  def testEmptyFails(self):
+    @custom_gradient.custom_gradient
+    def id_bad_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        # dx = constant_op.constant(np.zeros((1, 4)), dtype=dtypes.float32)
+        dx = array_ops.transpose(dy)
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_bad_grad(x)
+    x = constant_op.constant(np.random.random_sample((0, 3)),
+                             dtype=dtypes.float32)
+    bad = r"Empty gradient has wrong shape: expected \(0, 3\), got \(3, 0\)"
+    with self.assertRaisesRegexp(ValueError, bad):
+      gradient_checker.compute_gradient(f, [x])
+
+  def testNaNGradFails(self):
+    @custom_gradient.custom_gradient
+    def id_nan_grad(x):
+      y = array_ops.identity(x)
+      def grad_fn(dy):
+        dx = np.nan * dy
+        # dx = dy
+        return dx
+      return y, grad_fn
+    def f(x):
+      return id_nan_grad(x)
+    x = constant_op.constant(np.random.random_sample((1, 1)),
+                             dtype=dtypes.float32)
+    error = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f, [x]))
+    # Typical test would assert error < max_err, so assert this test would
+    # raise AssertionError, since NaN is not < 1.0.
+    with self.assertRaisesRegexp(AssertionError, "False is not true"):
+      self.assertTrue(error < 1.0)
+
+  def testGradGrad(self):
+
+    def f(x):
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        y = math_ops.square(x)
+        z = math_ops.square(y)
+      return tape.gradient(z, x)
+
+    analytical, numerical = gradient_checker.compute_gradient(f, [2.0])
+    self.assertAllEqual([[[48.]]], analytical)
+    self.assertAllClose([[[48.]]], numerical, rtol=1e-4)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class MiniMNISTTest(test.TestCase):
+
+  # Gradient checker for MNIST.
+  def _BuildAndTestMiniMNIST(self, param_index, tag):
+    # Fix seed to avoid occasional flakiness
+    np.random.seed(6)
+
+    # Hyperparameters
+    batch = 3
+    inputs = 16
+    features = 32
+    classes = 10
+
+    # Define the parameters
+    inp_data = np.random.random_sample(inputs * batch)
+    hidden_weight_data = np.random.randn(inputs * features) / np.sqrt(inputs)
+    hidden_bias_data = np.random.random_sample(features)
+    sm_weight_data = np.random.randn(features * classes) / np.sqrt(features)
+    sm_bias_data = np.random.random_sample(classes)
+
+    # special care for labels since they need to be normalized per batch
+    label_data = np.random.random(batch * classes).reshape((batch, classes))
+    s = label_data.sum(axis=1)
+    label_data /= s[:, None]
+
+    # We treat the inputs as "parameters" here
+    inp = constant_op.constant(
+        inp_data.tolist(),
+        shape=[batch, inputs],
+        dtype=dtypes.float64,
+        name="inp")
+    hidden_weight = constant_op.constant(
+        hidden_weight_data.tolist(),
+        shape=[inputs, features],
+        dtype=dtypes.float64,
+        name="hidden_weight")
+    hidden_bias = constant_op.constant(
+        hidden_bias_data.tolist(),
+        shape=[features],
+        dtype=dtypes.float64,
+        name="hidden_bias")
+    softmax_weight = constant_op.constant(
+        sm_weight_data.tolist(),
+        shape=[features, classes],
+        dtype=dtypes.float64,
+        name="softmax_weight")
+    softmax_bias = constant_op.constant(
+        sm_bias_data.tolist(),
+        shape=[classes],
+        dtype=dtypes.float64,
+        name="softmax_bias")
+
+    # List all the parameter so that we can test them one at a time
+    all_params = [
+        inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias
+    ]
+
+    # Now, Building MNIST
+    def f(inp, hidden_weight, hidden_bias, softmax_weight, softmax_bias):
+      features = nn_ops.relu(
+          nn_ops.xw_plus_b(inp, hidden_weight, hidden_bias), name="features")
+      logits = nn_ops.xw_plus_b(
+          features, softmax_weight, softmax_bias, name="logits")
+      labels = constant_op.constant(
+          label_data.tolist(),
+          shape=[batch, classes],
+          dtype=dtypes.float64,
+          name="labels")
+      cost = nn_ops.softmax_cross_entropy_with_logits(
+          labels=labels, logits=logits, name="cost")
+      return cost
+
+    def f_restricted(x):
+      xs = all_params
+      i = param_index
+      # use x for the i-th parameter
+      xs = xs[0:i]+[x]+xs[i+1:]
+      return f(*xs)
+    # Test the gradients.
+    err = gradient_checker.max_error(*gradient_checker.compute_gradient(
+        f_restricted, [all_params[param_index]], delta=1e-5))
+
+    tf_logging.info("Mini MNIST: %s gradient error = %g", tag, err)
+    return err
+
+  def testInputGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(0, "input"), 1e-8)
+
+  def testHiddenWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(1, "hidden_weight"), 1e-8)
+
+  def testHiddenBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(2, "hidden_bias"), 1e-8)
+
+  def testSoftmaxWeightGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(3, "softmax_weight"), 1e-8)
+
+  def testSoftmaxBiasGradient(self):
+    self.assertLess(self._BuildAndTestMiniMNIST(4, "softmax_bias"), 1e-8)
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/ops/gradients_impl.py b/tensorflow/python/ops/gradients_impl.py
index 4f0fb54dcab8553043d464392ccd011a90a6bb62..0a70d6ee61e64f94c41c1f1d0a5b6c3610b45c04 100644
--- a/tensorflow/python/ops/gradients_impl.py
+++ b/tensorflow/python/ops/gradients_impl.py
@@ -49,9 +49,9 @@ from tensorflow.python.ops import logging_ops  # pylint: disable=unused-import
 from tensorflow.python.ops import manip_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import optional_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import random_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import spectral_grad  # pylint: disable=unused-import
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.unconnected_gradients import UnconnectedGradients
 from tensorflow.python.platform import tf_logging as logging
@@ -297,8 +297,12 @@ def _DefaultGradYs(grad_ys,
   return new_grad_ys
 
 
-def IsTrainable(tensor):
-  dtype = dtypes.as_dtype(tensor.dtype)
+def IsTrainable(tensor_or_dtype):
+  if isinstance(tensor_or_dtype, ops.Tensor):
+    dtype = tensor_or_dtype.dtype
+  else:
+    dtype = tensor_or_dtype
+  dtype = dtypes.as_dtype(dtype)
   return dtype.base_dtype in (dtypes.float16, dtypes.float32, dtypes.float64,
                               dtypes.complex64, dtypes.complex128,
                               dtypes.resource, dtypes.variant)
@@ -322,6 +326,10 @@ def _VerifyGeneratedGradients(grads, op):
     ValueError: if sizes of gradients and inputs don't match.
     TypeError: if type of any gradient is not valid for its input.
   """
+  # While ops have inputs added to them during the gradient computation, so we
+  # skip the below check. See while_v2 for details.
+  if op.type == "While": return
+
   if len(grads) != len(op.inputs):
     raise ValueError("Num gradients %d generated for op %s do not match num "
                      "inputs %d" % (len(grads), op.node_def, len(op.inputs)))
@@ -540,7 +548,7 @@ def _Consumers(t, func_graphs):
   return consumers
 
 
-@tf_export("gradients")
+@tf_export(v1=["gradients"])
 def gradients(ys,
               xs,
               grad_ys=None,
@@ -656,6 +664,119 @@ def gradients(ys,
                             unconnected_gradients)
 
 
+@tf_export("gradients", v1=[])
+def gradients_v2(ys,  # pylint: disable=invalid-name
+                 xs,
+                 grad_ys=None,
+                 name="gradients",
+                 gate_gradients=False,
+                 aggregation_method=None,
+                 stop_gradients=None,
+                 unconnected_gradients=UnconnectedGradients.NONE):
+  """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.
+
+  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
+  is a list of `Tensor`, holding the gradients received by the
+  `ys`. The list must be the same length as `ys`.
+
+  `gradients()` adds ops to the graph to output the derivatives of `ys` with
+  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
+  each tensor is the `sum(dy/dx)` for y in `ys`.
+
+  `grad_ys` is a list of tensors of the same length as `ys` that holds
+  the initial gradients for each y in `ys`.  When `grad_ys` is None,
+  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
+  user can provide their own initial `grad_ys` to compute the
+  derivatives using a different initial gradient for each y (e.g., if
+  one wanted to weight the gradient differently for each value in
+  each y).
+
+  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
+  with respect to all `xs`. These tensors will not be backpropagated through,
+  as though they had been explicitly disconnected using `stop_gradient`.  Among
+  other things, this allows computation of partial derivatives as opposed to
+  total derivatives. For example:
+
+  ```python
+  a = tf.constant(0.)
+  b = 2 * a
+  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
+  ```
+
+  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
+  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
+  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
+  equivalent to:
+
+  ```python
+  a = tf.stop_gradient(tf.constant(0.))
+  b = tf.stop_gradient(2 * a)
+  g = tf.gradients(a + b, [a, b])
+  ```
+
+  `stop_gradients` provides a way of stopping gradient after the graph has
+  already been constructed, as compared to `tf.stop_gradient` which is used
+  during graph construction.  When the two approaches are combined,
+  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
+  `stop_gradients`, whichever is encountered first.
+
+  All integer tensors are considered constant with respect to all `xs`, as if
+  they were included in `stop_gradients`.
+
+  `unconnected_gradients` determines the value returned for each x in xs if it
+  is unconnected in the graph to ys. By default this is None to safeguard
+  against errors. MAthematically these gradients are zero which can be requested
+  using the `'zero'` option. `tf.UnconnectedGradients` provides the
+  following options and behaviors:
+
+  ```python
+  a = tf.ones([1, 2])
+  b = tf.ones([3, 1])
+  g1 = tf.gradients([b], [a], unnconnected_gradients='none')
+  sess.run(g1)  # [None]
+
+  g2 = tf.gradients([b], [a], unconnected_gradients='zero')
+  sess.run(g2)  # [array([[0., 0.]], dtype=float32)]
+  ```
+
+
+  Args:
+    ys: A `Tensor` or list of tensors to be differentiated.
+    xs: A `Tensor` or list of tensors to be used for differentiation.
+    grad_ys: Optional. A `Tensor` or list of tensors the same size as
+      `ys` and holding the gradients computed for each y in `ys`.
+    name: Optional name to use for grouping all the gradient ops together.
+      defaults to 'gradients'.
+    gate_gradients: If True, add a tuple around the gradients returned
+      for an operations.  This avoids some race conditions.
+    aggregation_method: Specifies the method used to combine gradient terms.
+      Accepted values are constants defined in the class `AggregationMethod`.
+    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
+      through.
+    unconnected_gradients: Optional. Specifies the gradient value returned when
+      the given input tensors are unconnected. Accepted values are constants
+      defined in the class `tf.UnconnectedGradients` and the default value is
+      `none`.
+
+  Returns:
+    A list of `sum(dy/dx)` for each x in `xs`.
+
+  Raises:
+    LookupError: if one of the operations between `x` and `y` does not
+      have a registered gradient function.
+    ValueError: if the arguments are invalid.
+    RuntimeError: if called in Eager mode.
+
+  """
+  # Creating the gradient graph for control flow mutates Operations.
+  # _mutation_lock ensures a Session.run call cannot occur between creating and
+  # mutating new ops.
+  with ops.get_default_graph()._mutation_lock():  # pylint: disable=protected-access
+    return _GradientsHelper(ys, xs, grad_ys, name, True, gate_gradients,
+                            aggregation_method, stop_gradients,
+                            unconnected_gradients)
+
+
 def _GradientsHelper(ys,
                      xs,
                      grad_ys=None,
@@ -896,7 +1017,7 @@ def _HasAnyNotNoneGrads(grads, op):
     if isinstance(out_grad, (ops.Tensor, ops.IndexedSlices)):
       return True
     if out_grad and isinstance(out_grad, collections.Sequence):
-      if any([g is not None for g in out_grad]):
+      if any(g is not None for g in out_grad):
         return True
   return False
 
@@ -1111,11 +1232,11 @@ def _AggregatedGrads(grads,
         assert control_flow_util.IsLoopSwitch(op)
         continue
     # Grads have to be Tensors or IndexedSlices
-    if (isinstance(out_grad, collections.Sequence) and not all([
+    if (isinstance(out_grad, collections.Sequence) and not all(
         isinstance(g, (ops.Tensor, ops.IndexedSlices))
         for g in out_grad
         if g is not None
-    ])):
+    )):
       raise TypeError("gradients have to be either all Tensors "
                       "or all IndexedSlices")
     # Aggregate multiple gradients, and convert [] to None.
@@ -1123,7 +1244,7 @@ def _AggregatedGrads(grads,
       if len(out_grad) < 2:
         used = "nop"
         out_grads[i] = out_grad[0]
-      elif all([isinstance(g, ops.Tensor) for g in out_grad if g is not None]):
+      elif all(isinstance(g, ops.Tensor) for g in out_grad if g is not None):
         tensor_shape = _AccumulatorShape(out_grad)
         if (aggregation_method == AggregationMethod.EXPERIMENTAL_ACCUMULATE_N
             and len(out_grad) > 2 and tensor_shape.is_fully_defined()):
@@ -1240,7 +1361,7 @@ def _hessian_vector_product(ys, xs, v):
   return gradients(elemwise_products, xs)
 
 
-@tf_export("hessians")
+@tf_export(v1=["hessians"])
 def hessians(ys,
              xs,
              name="hessians",
@@ -1305,3 +1426,16 @@ def hessians(ys,
                                           array_ops.concat((_shape, _shape), 0))
     hessians.append(_reshaped_hessian)
   return hessians
+
+
+@tf_export("hessians", v1=[])
+def HessiansV2(ys,
+               xs,
+               gate_gradients=False,
+               aggregation_method=None,
+               name="hessians"):
+  return hessians(ys, xs, name=name, gate_gradients=gate_gradients,
+                  aggregation_method=aggregation_method)
+
+
+HessiansV2.__doc__ = hessians.__doc__
diff --git a/tensorflow/python/ops/gradients_test.py b/tensorflow/python/ops/gradients_test.py
index 103e3902b60f153531fa899d8f3d92df25a9e11c..abdcbc7a3ac3b2e6d42bacf4ae454e277220f497 100644
--- a/tensorflow/python/ops/gradients_test.py
+++ b/tensorflow/python/ops/gradients_test.py
@@ -144,7 +144,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
                                  gate_gradients=True)[0]
       with session.Session():
         # Make sure the placer doesn't complain.
-        gz_x.eval()
+        self.evaluate(gz_x)
 
   def testBoundaryStop(self):
     # Test that we don't differentiate 'x'. The gradient function for 'x' is
@@ -158,6 +158,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(z, [x])
       self.assertTrue(all(x is not None for x in grads))
 
+  @test_util.run_v1_only("b/120545219")
   def testBoundaryContinue(self):
     # Test that we differentiate both 'x' and 'y' correctly when x is a
     # predecessor of y.
@@ -169,6 +170,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertTrue(all(x is not None for x in grads))
       self.assertEqual(6.0, grads[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAccumulateN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -182,6 +184,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodAddN(self):
     with self.cached_session():
       x = constant(1.0)
@@ -193,6 +196,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       self.assertEqual(20.0, grads[0].eval())
       self.assertEqual(10.0, grads[1].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testAggregationMethodTree(self):
     with self.cached_session():
       x = constant(1.0)
@@ -239,6 +243,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
             [dx, dy], feed_dict={x: [1.0], dy.indices: [0], dy.values: [2.0]})
       self.assertEqual(vdx, vdy)
 
+  @test_util.run_v1_only("b/120545219")
   def testNonDifferentiableSwitchInWhileLoop(self):
     with ops.Graph().as_default():
       v = array_ops.placeholder(dtypes.float32, [])
@@ -270,6 +275,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(graph.as_graph_element(var), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariableRefGradient(self):
     with ops.Graph().as_default():
       init = constant_op.constant(100.0)
@@ -277,6 +283,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       gradient = gradients.gradients(var._ref(), var)
       self.assertIsNotNone(gradient)
 
+  @test_util.run_v1_only("b/120545219")
   def testDependentYs(self):
     with self.cached_session():
       x = constant_op.constant(3.0)
@@ -292,6 +299,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       g = gradients.gradients([z, z2], x)
       self.assertAllClose(17502.0, g[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testPartialDerivatives(self):
     with self.cached_session():
       x = constant_op.constant(1.)
@@ -302,6 +310,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       partialg = gradients.gradients(z, [x, y], stop_gradients=[x, y])
       self.assertEqual([1.0, 1.0], [g.eval() for g in partialg])
 
+  @test_util.run_v1_only("b/120545219")
   def testStopGradients(self):
     def _MakeGraph(rng, stop_gradients=()):
       def _FunctionOf(xs, k=3):
@@ -365,7 +374,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], sess.run(grads)[0])
+        self.assertAllEqual([[0.0, 0.0], [0.0, 0.0]], self.evaluate(grads)[0])
 
   def testUnconnectedGradientsZeroConnectedGradients(self):
     with ops.Graph().as_default():
@@ -374,7 +383,7 @@ class GradientsTest(test_util.TensorFlowTestCase):
       grad = gradients.gradients(
           [y], [x], unconnected_gradients="zero")
       with self.cached_session() as sess:
-        self.assertEquals(3.0, sess.run(grad)[0])
+        self.assertEquals(3.0, self.evaluate(grad)[0])
 
   def testUnknownUnconnectedGradientsValueGiven(self):
     with ops.Graph().as_default():
@@ -438,8 +447,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       grads = gradients.gradients(y, [x, b1])
 
       with self.cached_session() as sess:
-        self.assertAllEqual([40.0], sess.run(grads)[0])
-        self.assertAllEqual([10.0], sess.run(grads)[1])
+        self.assertAllEqual([40.0], self.evaluate(grads)[0])
+        self.assertAllEqual([10.0], self.evaluate(grads)[1])
 
   def testFunctionGradientsWithGradFunc(self):
     g = ops.Graph()
@@ -487,7 +496,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testGradientOfCaptured(self):
     with ops.Graph().as_default():
@@ -501,7 +510,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(f), 2.0)
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedResourceVariable(self):
     with ops.Graph().as_default():
@@ -515,8 +524,8 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       f = Foo()
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
-        self.assertEqual(sess.run(f), 2.0)
+        self.evaluate(variables.global_variables_initializer())
+        self.assertEqual(self.evaluate(f), 2.0)
 
   def testCapturedNested(self):
     with ops.Graph().as_default():
@@ -541,9 +550,9 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
       x1_grad, x2_grad = Outer()
       with self.cached_session() as sess:
         # 1.0 + None + 2.0 + 1.0 = 4.0
-        self.assertEqual(sess.run(x1_grad), 4.0)
+        self.assertEqual(self.evaluate(x1_grad), 4.0)
         # None + 1.0 + 1.0 + None = 2.0
-        self.assertEqual(sess.run(x2_grad), 2.0)
+        self.assertEqual(self.evaluate(x2_grad), 2.0)
 
   def testCapturedFromFunction(self):
     with ops.Graph().as_default():
@@ -563,7 +572,7 @@ class FunctionGradientsTest(test_util.TensorFlowTestCase):
 
       z_grad = Outer()
       with self.cached_session() as sess:
-        self.assertEqual(sess.run(z_grad), 3.0)
+        self.assertEqual(self.evaluate(z_grad), 3.0)
 
   def testCapturedEagerTensors(self):
     # Test that we can handle captured eager tensors unrelated to the gradient
@@ -606,6 +615,7 @@ class PreventGradientTest(test_util.TensorFlowTestCase):
 
 class HessianVectorProductTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianVectorProduct(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that HessianVectorProduct matches multiplication by the
@@ -628,12 +638,13 @@ class HessianVectorProductTest(test_util.TensorFlowTestCase):
         mat_x = math_ops.matmul(mat, x, name="Ax")
         x_mat_x = math_ops.matmul(array_ops.transpose(x), mat_x, name="xAx")
         hess_v = gradients_impl._hessian_vector_product(x_mat_x, [x], [v])[0]
-        hess_v_actual = hess_v.eval()
+        hess_v_actual = self.evaluate(hess_v)
       self.assertAllClose(hess_v_value, hess_v_actual)
 
 
 class HessianTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -648,9 +659,10 @@ class HessianTest(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_value)
       x_mat_x = math_ops.reduce_sum(x[:, None] * mat * x[None, :])
       hess = gradients.hessians(x_mat_x, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian1D_multi(self):
     # Test the computation of the hessian with respect to multiple tensors
     m = 4
@@ -671,6 +683,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     for hess_value, hess_actual in zip(hess_values, hessians_actual):
       self.assertAllClose(hess_value, hess_actual)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessianInvalidDimension(self):
     for shape in [(10, 10), None]:
       with self.cached_session(use_gpu=True):
@@ -679,6 +692,7 @@ class HessianTest(test_util.TensorFlowTestCase):
         with self.assertRaises(ValueError):
           gradients.hessians(x, x)
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_square_matrix(self):
     # Manually compute the Hessian explicitly for a low-dimensional problem
     # and check that `hessian` matches. Specifically, the Hessian of
@@ -692,7 +706,7 @@ class HessianTest(test_util.TensorFlowTestCase):
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((m, m)) for elem in vec]
         for vec in np.eye(m)
@@ -700,6 +714,7 @@ class HessianTest(test_util.TensorFlowTestCase):
     self.assertAllEqual((m, m, m, m), hess_actual.shape)
     self.assertAllClose(hess_value, hess_actual.reshape((m * m, m * m)))
 
+  @test_util.run_v1_only("b/120545219")
   def testHessian2D_non_square_matrix(self):
     m = 3
     n = 4
@@ -711,7 +726,7 @@ class HessianTest(test_util.TensorFlowTestCase):
           math_ops.matmul(array_ops.transpose(x), x) * 0.5
       )
       hess = gradients.hessians(x_square, x)[0]
-      hess_actual = hess.eval()
+      hess_actual = self.evaluate(hess)
     hess_value = np.bmat([
         [elem*np.ones((n, n)) for elem in vec]
         for vec in np.eye(m)
@@ -722,6 +737,7 @@ class HessianTest(test_util.TensorFlowTestCase):
 
 class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensor(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -729,8 +745,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
       c_sparse = math_ops._as_indexed_slices(c)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testIndexedSlicesToTensorList(self):
     with self.cached_session():
       numpy_list = []
@@ -745,8 +762,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
         sparse_list.append(c_sparse)
       packed_dense = array_ops.stack(dense_list)
       packed_sparse = array_ops.stack(sparse_list)
-      self.assertAllClose(packed_dense.eval(), packed_sparse.eval())
+      self.assertAllClose(packed_dense.eval(), self.evaluate(packed_sparse))
 
+  @test_util.run_v1_only("b/120545219")
   def testInt64Indices(self):
     with self.cached_session():
       np_val = np.random.rand(4, 4, 4, 4).astype(np.float32)
@@ -757,8 +775,9 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
           math_ops.cast(c_sparse.indices, dtypes.int64), c_sparse.dense_shape)
       self.assertAllEqual(np_val.shape, c_sparse.dense_shape.eval())
       c_dense = math_ops.multiply(c_sparse, 1.0)
-      self.assertAllClose(np_val, c_dense.eval())
+      self.assertAllClose(np_val, self.evaluate(c_dense))
 
+  @test_util.run_v1_only("b/120545219")
   def testWarnings(self):
     # TODO(gunan) Reenable after this issue is fixed:
     # https://github.com/google/protobuf/issues/2812
@@ -802,6 +821,7 @@ class IndexedSlicesToTensorTest(test_util.TensorFlowTestCase):
 
 class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testRealOnly(self):
     x = constant_op.constant(7+3j, dtype=dtypes.complex64)
     y = math_ops.square(x)
@@ -814,6 +834,7 @@ class OnlyRealGradientsTest(test_util.TensorFlowTestCase):
 
 class ResourceCondTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     gamma = resource_variable_ops.ResourceVariable(
         np.random.random((3,)),
@@ -853,7 +874,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyIdentity(MyIdentity(x))
       dy = gradients.gradients(y, x)[0]
       with session.Session():
-        self.assertEqual(9., dy.eval())
+        self.assertEqual(9., self.evaluate(dy))
 
   def testCustomGradient(self):
 
@@ -873,7 +894,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       y = MyMultiply(x1, x2)
       dy = gradients.gradients(y, [x1, x2])
       with session.Session() as sess:
-        self.assertAllEqual([3., 5.], sess.run(dy))
+        self.assertAllEqual([3., 5.], self.evaluate(dy))
 
   def testCustomGradientErrors(self):
 
@@ -914,7 +935,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       for g in grads:
         self.assertTrue(g is not None)
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         dw = sess.run(math_ops.reduce_sum(grads[1]))
         self.assertEqual(12., dw)
 
@@ -943,6 +964,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       self.assertEqual(6., math_ops.reduce_sum(dx).numpy())
       self.assertEqual(8., math_ops.reduce_sum(dw).numpy())
 
+  @test_util.run_v1_only("b/120545219")
   def testCustomGradientErrorsWithNonResourceVariables(self):
 
     def F(x, use_resource=False):
@@ -993,6 +1015,7 @@ class CustomGradientTest(test_util.TensorFlowTestCase):
       # Smoke test to ensure numpy inputs are accepted
       F(x)
 
+  @test_util.run_v1_only("b/120545219")
   def testRVGradientsDynamicCond(self):
     with self.cached_session():
       alpha = resource_variable_ops.ResourceVariable(
@@ -1074,7 +1097,7 @@ class TensorListGradientsTest(test_util.TensorFlowTestCase):
 
       grad = gradients.gradients(tl, a, grad_ys=grad_tl)[0]
       with self.cached_session() as sess:
-        self.assertEquals(sess.run(grad), 5.)
+        self.assertEquals(self.evaluate(grad), 5.)
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/ops/histogram_ops_test.py b/tensorflow/python/ops/histogram_ops_test.py
index e7fe0efba4e5a1e7216b471c248af650b3736328..b48ef67196bd7d1d56f51b61bc0b28ca2054d28d 100644
--- a/tensorflow/python/ops/histogram_ops_test.py
+++ b/tensorflow/python/ops/histogram_ops_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework import constant_op
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import histogram_ops
@@ -39,7 +40,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_values_int32_output(self):
     # Bins will be:
@@ -51,7 +52,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_1d_float64_values_int32_output(self):
     # Bins will be:
@@ -63,7 +64,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
   def test_2d_values(self):
     # Bins will be:
@@ -76,7 +77,7 @@ class BinValuesFixedWidth(test.TestCase):
       bins = histogram_ops.histogram_fixed_width_bins(
           values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, bins.dtype)
-      self.assertAllClose(expected_bins, bins.eval())
+      self.assertAllClose(expected_bins, self.evaluate(bins))
 
 
 class HistogramFixedWidthTest(test.TestCase):
@@ -84,6 +85,7 @@ class HistogramFixedWidthTest(test.TestCase):
   def setUp(self):
     self.rng = np.random.RandomState(0)
 
+  @test_util.run_deprecated_v1
   def test_with_invalid_value_range(self):
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     with self.assertRaisesRegexp(
@@ -92,6 +94,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "Dimension must be 2 but is 3"):
       histogram_ops.histogram_fixed_width(values, [1.0, 2.0, 3.0])
 
+  @test_util.run_deprecated_v1
   def test_with_invalid_nbins(self):
     values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]
     with self.assertRaisesRegexp(
@@ -110,7 +113,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_values_int64_output(self):
     # Bins will be:
@@ -122,7 +125,7 @@ class HistogramFixedWidthTest(test.TestCase):
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=5, dtype=dtypes.int64)
       self.assertEqual(dtypes.int64, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_1d_float64_values(self):
     # Bins will be:
@@ -133,7 +136,7 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
   def test_2d_values(self):
     # Bins will be:
@@ -144,8 +147,9 @@ class HistogramFixedWidthTest(test.TestCase):
     with self.session(use_gpu=True):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
+  @test_util.run_deprecated_v1
   def test_shape_inference(self):
     value_range = [0.0, 5.0]
     values = [[-1.0, 0.0, 1.5], [2.0, 5.0, 15]]
@@ -155,7 +159,7 @@ class HistogramFixedWidthTest(test.TestCase):
       hist = histogram_ops.histogram_fixed_width(values, value_range, nbins=5)
       self.assertAllEqual(hist.shape.as_list(), (5,))
       self.assertEqual(dtypes.int32, hist.dtype)
-      self.assertAllClose(expected_bin_counts, hist.eval())
+      self.assertAllClose(expected_bin_counts, self.evaluate(hist))
 
       hist = histogram_ops.histogram_fixed_width(
           values, value_range, nbins=placeholder)
diff --git a/tensorflow/python/ops/image_grad_test.py b/tensorflow/python/ops/image_grad_test.py
index 32c2f37c0b769de6564f968c44df2bb552cd7edc..c481266dd71c1300612dbc384d240d34b98b3599 100644
--- a/tensorflow/python/ops/image_grad_test.py
+++ b/tensorflow/python/ops/image_grad_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import image_ops
@@ -44,9 +45,10 @@ class ResizeNearestNeighborOpTest(test.TestCase):
                                                        out_shape[1:3])
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -62,6 +64,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -77,6 +80,7 @@ class ResizeNearestNeighborOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareGpuVsCpu(self):
     in_shape = [1, 4, 6, 3]
     out_shape = [1, 8, 16, 3]
@@ -113,9 +117,10 @@ class ResizeBilinearOpTest(test.TestCase):
       resize_out = image_ops.resize_bilinear(input_tensor, out_shape[1:3])
       self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-      resize_out = sess.run(resize_out)
+      resize_out = self.evaluate(resize_out)
       self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -129,6 +134,7 @@ class ResizeBilinearOpTest(test.TestCase):
           input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -142,6 +148,7 @@ class ResizeBilinearOpTest(test.TestCase):
           input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
     self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testCompareGpuVsCpu(self):
     in_shape = [2, 4, 6, 3]
     out_shape = [2, 8, 16, 3]
@@ -160,6 +167,7 @@ class ResizeBilinearOpTest(test.TestCase):
 
       self.assertAllClose(grad[False], grad[True], rtol=1e-4, atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testTypes(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -196,9 +204,10 @@ class ResizeBicubicOpTest(test.TestCase):
                                               align_corners=align_corners)
         self.assertEqual(out_shape, list(resize_out.get_shape()))
 
-        resize_out = sess.run(resize_out)
+        resize_out = self.evaluate(resize_out)
         self.assertEqual(out_shape, list(resize_out.shape))
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToLargerInBothDims(self):
     in_shape = [1, 2, 3, 1]
     out_shape = [1, 4, 6, 1]
@@ -214,6 +223,7 @@ class ResizeBicubicOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradFromResizeToSmallerInBothDims(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -229,6 +239,7 @@ class ResizeBicubicOpTest(test.TestCase):
             input_tensor, in_shape, resize_out, out_shape, x_init_value=x)
       self.assertLess(err, 1e-3)
 
+  @test_util.run_deprecated_v1
   def testGradOnUnsupportedType(self):
     in_shape = [1, 4, 6, 1]
     out_shape = [1, 2, 3, 1]
@@ -273,7 +284,7 @@ class CropAndResizeOpTest(test.TestCase):
           constant_op.constant(
               crop_size, shape=[2]))
       self.assertEqual(crops_shape, list(crops.get_shape()))
-      crops = sess.run(crops)
+      crops = self.evaluate(crops)
       self.assertEqual(crops_shape, list(crops.shape))
 
   def _randomUniformAvoidAnchors(self, low, high, anchors, radius, num_samples):
@@ -306,6 +317,7 @@ class CropAndResizeOpTest(test.TestCase):
         samples.append(sample)
     return samples
 
+  @test_util.run_deprecated_v1
   def testGradRandomBoxes(self):
     """Test that the gradient is correct for randomly generated boxes.
 
diff --git a/tensorflow/python/ops/image_ops_impl.py b/tensorflow/python/ops/image_ops_impl.py
index 3ab3695a03c080c9f1491a9c871a62808ee3f2cb..24d049b726fb93401d916d60c0d37fe85de30719 100644
--- a/tensorflow/python/ops/image_ops_impl.py
+++ b/tensorflow/python/ops/image_ops_impl.py
@@ -24,6 +24,7 @@ from tensorflow.python.compat import compat
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -37,6 +38,7 @@ from tensorflow.python.ops import nn_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable('RandomCrop')
@@ -511,15 +513,20 @@ def _rot90_4D(images, k, name_scope):
   result.set_shape([shape[0], None, None, shape[3]])
   return result
 
-@tf_export('image.transpose_image')
+
+@tf_export(v1=['image.transpose', 'image.transpose_image'])
 def transpose_image(image):
-  """Transpose image(s) by swapping the height and width dimension.
+  return transpose(image=image, name=None)
+
 
-  See also `transpose()`.
+@tf_export('image.transpose', v1=[])
+def transpose(image, name=None):
+  """Transpose image(s) by swapping the height and width dimension.
 
   Args:
     image: 4-D Tensor of shape `[batch, height, width, channels]` or
            3-D Tensor of shape `[height, width, channels]`.
+    name: A name for this operation (optional).
 
   Returns:
     If `image` was 4-D, a 4-D float Tensor of shape
@@ -530,14 +537,14 @@ def transpose_image(image):
   Raises:
     ValueError: if the shape of `image` not supported.
   """
-  with ops.name_scope(None, 'transpose_image', [image]):
+  with ops.name_scope(name, 'transpose', [image]):
     image = ops.convert_to_tensor(image, name='image')
     image = _AssertAtLeast3DImage(image)
     shape = image.get_shape()
     if shape.ndims == 3 or shape.ndims is None:
-      return array_ops.transpose(image, [1, 0, 2], name='transpose_image')
+      return array_ops.transpose(image, [1, 0, 2], name=name)
     elif shape.ndims == 4:
-      return array_ops.transpose(image, [0, 2, 1, 3], name='transpose_image')
+      return array_ops.transpose(image, [0, 2, 1, 3], name=name)
     else:
       raise ValueError('\'image\' must have either 3 or 4 dimensions.')
 
@@ -938,12 +945,28 @@ class ResizeMethod(object):
   AREA = 3
 
 
-@tf_export('image.resize_images')
+@tf_export(v1=['image.resize_images', 'image.resize'])
 def resize_images(images,
                   size,
                   method=ResizeMethod.BILINEAR,
                   align_corners=False,
                   preserve_aspect_ratio=False):
+  return resize_images_v2(
+      images=images,
+      size=size,
+      method=method,
+      align_corners=align_corners,
+      preserve_aspect_ratio=preserve_aspect_ratio,
+      name=None)
+
+
+@tf_export('image.resize', v1=[])
+def resize_images_v2(images,
+                     size,
+                     method=ResizeMethod.BILINEAR,
+                     align_corners=False,
+                     preserve_aspect_ratio=False,
+                     name=None):
   """Resize `images` to `size` using the specified `method`.
 
   Resized images will be distorted if their original aspect ratio is not
@@ -979,6 +1002,7 @@ def resize_images(images,
       then `images` will be resized to a size that fits in `size` while
       preserving the aspect ratio of the original image. Scales up the image if
       `size` is bigger than the current size of the `image`. Defaults to False.
+    name: A name for this operation (optional).
 
   Raises:
     ValueError: if the shape of `images` is incompatible with the
@@ -992,7 +1016,7 @@ def resize_images(images,
     If `images` was 3-D, a 3-D float Tensor of shape
     `[new_height, new_width, channels]`.
   """
-  with ops.name_scope(None, 'resize_images', [images, size]):
+  with ops.name_scope(name, 'resize', [images, size]):
     images = ops.convert_to_tensor(images, name='images')
     if images.get_shape().ndims is None:
       raise ValueError('\'images\' contains no shape.')
@@ -1736,7 +1760,7 @@ def adjust_saturation(image, saturation_factor, name=None):
         orig_dtype)
 
 
-@tf_export('image.is_jpeg')
+@tf_export('io.is_jpeg', 'image.is_jpeg', v1=['io.is_jpeg', 'image.is_jpeg'])
 def is_jpeg(contents, name=None):
   r"""Convenience function to check if the 'contents' encodes a JPEG image.
 
@@ -1771,8 +1795,28 @@ def _is_png(contents, name=None):
     substr = string_ops.substr(contents, 0, 3)
     return math_ops.equal(substr, b'\211PN', name=name)
 
+tf_export('io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg',
+          v1=['io.decode_and_crop_jpeg', 'image.decode_and_crop_jpeg'])(
+              gen_image_ops.decode_and_crop_jpeg)
 
-@tf_export('image.decode_image')
+tf_export('io.decode_bmp', 'image.decode_bmp',
+          v1=['io.decode_bmp', 'image.decode_bmp'])(gen_image_ops.decode_bmp)
+tf_export('io.decode_gif', 'image.decode_gif',
+          v1=['io.decode_gif', 'image.decode_gif'])(gen_image_ops.decode_gif)
+tf_export('io.decode_jpeg', 'image.decode_jpeg',
+          v1=['io.decode_jpeg', 'image.decode_jpeg'])(gen_image_ops.decode_jpeg)
+tf_export('io.decode_png', 'image.decode_png',
+          v1=['io.decode_png', 'image.decode_png'])(gen_image_ops.decode_png)
+
+tf_export('io.encode_jpeg', 'image.encode_jpeg',
+          v1=['io.encode_jpeg', 'image.encode_jpeg'])(gen_image_ops.encode_jpeg)
+tf_export('io.extract_jpeg_shape', 'image.extract_jpeg_shape',
+          v1=['io.extract_jpeg_shape', 'image.extract_jpeg_shape'])(
+              gen_image_ops.extract_jpeg_shape)
+
+
+@tf_export('io.decode_image', 'image.decode_image',
+           v1=['io.decode_image', 'image.decode_image'])
 def decode_image(contents, channels=None, dtype=dtypes.uint8, name=None):
   """Convenience function for `decode_bmp`, `decode_gif`, `decode_jpeg`,
   and `decode_png`.
@@ -1942,7 +1986,113 @@ def total_variation(images, name=None):
   return tot_var
 
 
-@tf_export('image.sample_distorted_bounding_box')
+@tf_export('image.sample_distorted_bounding_box', v1=[])
+def sample_distorted_bounding_box_v2(image_size,
+                                     bounding_boxes,
+                                     seed=0,
+                                     min_object_covered=0.1,
+                                     aspect_ratio_range=None,
+                                     area_range=None,
+                                     max_attempts=None,
+                                     use_image_if_no_bounding_boxes=None,
+                                     name=None):
+  """Generate a single randomly distorted bounding box for an image.
+
+  Bounding box annotations are often supplied in addition to ground-truth labels
+  in image recognition or object localization tasks. A common technique for
+  training such a system is to randomly distort an image while preserving
+  its content, i.e. *data augmentation*. This Op outputs a randomly distorted
+  localization of an object, i.e. bounding box, given an `image_size`,
+  `bounding_boxes` and a series of constraints.
+
+  The output of this Op is a single bounding box that may be used to crop the
+  original image. The output is returned as 3 tensors: `begin`, `size` and
+  `bboxes`. The first 2 tensors can be fed directly into `tf.slice` to crop the
+  image. The latter may be supplied to `tf.image.draw_bounding_boxes` to
+  visualize what the bounding box looks like.
+
+  Bounding boxes are supplied and returned as `[y_min, x_min, y_max, x_max]`.
+  The bounding box coordinates are floats in `[0.0, 1.0]` relative to the width
+  and height of the underlying image.
+
+  For example,
+
+  ```python
+      # Generate a single distorted bounding box.
+      begin, size, bbox_for_draw = tf.image.sample_distorted_bounding_box(
+          tf.shape(image),
+          bounding_boxes=bounding_boxes,
+          min_object_covered=0.1)
+
+      # Draw the bounding box in an image summary.
+      image_with_box = tf.image.draw_bounding_boxes(tf.expand_dims(image, 0),
+                                                    bbox_for_draw)
+      tf.summary.image('images_with_box', image_with_box)
+
+      # Employ the bounding box to distort the image.
+      distorted_image = tf.slice(image, begin, size)
+  ```
+
+  Note that if no bounding box information is available, setting
+  `use_image_if_no_bounding_boxes = true` will assume there is a single implicit
+  bounding box covering the whole image. If `use_image_if_no_bounding_boxes` is
+  false and no bounding boxes are supplied, an error is raised.
+
+  Args:
+    image_size: A `Tensor`. Must be one of the following types: `uint8`, `int8`,
+      `int16`, `int32`, `int64`.
+      1-D, containing `[height, width, channels]`.
+    bounding_boxes: A `Tensor` of type `float32`.
+      3-D with shape `[batch, N, 4]` describing the N bounding boxes
+      associated with the image.
+    seed: An optional `int`. Defaults to `0`.
+      If `seed` is set to non-zero, the random number generator is seeded by
+      the given `seed`.  Otherwise, it is seeded by a random seed.
+    min_object_covered: A Tensor of type `float32`. Defaults to `0.1`.
+      The cropped area of the image must contain at least this
+      fraction of any bounding box supplied. The value of this parameter should
+      be non-negative. In the case of 0, the cropped area does not need to
+      overlap any of the bounding boxes supplied.
+    aspect_ratio_range: An optional list of `floats`. Defaults to `[0.75,
+      1.33]`.
+      The cropped area of the image must have an aspect `ratio =
+      width / height` within this range.
+    area_range: An optional list of `floats`. Defaults to `[0.05, 1]`.
+      The cropped area of the image must contain a fraction of the
+      supplied image within this range.
+    max_attempts: An optional `int`. Defaults to `100`.
+      Number of attempts at generating a cropped region of the image
+      of the specified constraints. After `max_attempts` failures, return the
+      entire image.
+    use_image_if_no_bounding_boxes: An optional `bool`. Defaults to `False`.
+      Controls behavior if no bounding boxes supplied.
+      If true, assume an implicit bounding box covering the whole input. If
+      false, raise an error.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (begin, size, bboxes).
+
+    begin: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[offset_height, offset_width, 0]`. Provide as input to
+      `tf.slice`.
+    size: A `Tensor`. Has the same type as `image_size`. 1-D, containing
+    `[target_height, target_width, -1]`. Provide as input to
+      `tf.slice`.
+    bboxes: A `Tensor` of type `float32`. 3-D with shape `[1, 1, 4]` containing
+    the distorted bounding box.
+    Provide as input to `tf.image.draw_bounding_boxes`.
+  """
+  seed1, seed2 = random_seed.get_seed(seed) if seed else (0, 0)
+  return sample_distorted_bounding_box(
+      image_size, bounding_boxes, seed1, seed2, min_object_covered,
+      aspect_ratio_range, area_range, max_attempts,
+      use_image_if_no_bounding_boxes, name)
+
+
+@tf_export(v1=['image.sample_distorted_bounding_box'])
+@deprecation.deprecated(date=None, instructions='`seed2` arg is deprecated.'
+                        'Use sample_distorted_bounding_box_v2 instead.')
 def sample_distorted_bounding_box(image_size,
                                   bounding_boxes,
                                   seed=None,
@@ -2808,3 +2958,102 @@ def sobel_edges(image):
   output = array_ops.reshape(output, shape=shape)
   output.set_shape(static_image_shape.concatenate([num_kernels]))
   return output
+
+
+resize_area_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.AREA...)` instead.'))
+tf_export(v1=['image.resize_area'])(
+    resize_area_deprecation(gen_image_ops.resize_area))
+
+resize_bicubic_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BICUBIC...)` instead.'))
+tf_export(v1=['image.resize_bicubic'])(
+    resize_bicubic_deprecation(gen_image_ops.resize_bicubic))
+
+resize_bilinear_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.BILINEAR...)` instead.'))
+tf_export(v1=['image.resize_bilinear'])(
+    resize_bilinear_deprecation(gen_image_ops.resize_bilinear))
+
+resize_nearest_neighbor_deprecation = deprecation.deprecated(
+    date=None,
+    instructions=(
+        'Use `tf.image.resize(...method=ResizeMethod.NEAREST_NEIGHBOR...)` '
+        'instead.'))
+tf_export(v1=['image.resize_nearest_neighbor'])(
+    resize_nearest_neighbor_deprecation(gen_image_ops.resize_nearest_neighbor))
+
+
+@tf_export('image.crop_and_resize', v1=[])
+def crop_and_resize_v2(
+    image,
+    boxes,
+    box_indices,
+    crop_size,
+    method='bilinear',
+    extrapolation_value=0,
+    name=None):
+  """Extracts crops from the input image tensor and resizes them.
+
+  Extracts crops from the input image tensor and resizes them using bilinear
+  sampling or nearest neighbor sampling (possibly with aspect ratio change) to a
+  common output size specified by `crop_size`. This is more general than the
+  `crop_to_bounding_box` op which extracts a fixed size slice from the input
+  image and does not allow resizing or aspect ratio change.
+
+  Returns a tensor with `crops` from the input `image` at positions defined at
+  the bounding box locations in `boxes`. The cropped boxes are all resized (with
+  bilinear or nearest neighbor interpolation) to a fixed
+  `size = [crop_height, crop_width]`. The result is a 4-D tensor
+  `[num_boxes, crop_height, crop_width, depth]`. The resizing is corner aligned.
+  In particular, if `boxes = [[0, 0, 1, 1]]`, the method will give identical
+  results to using `tf.image.resize_bilinear()` or
+  `tf.image.resize_nearest_neighbor()`(depends on the `method` argument) with
+  `align_corners=True`.
+
+  Args:
+    image: A 4-D tensor of shape `[batch, image_height, image_width, depth]`.
+      Both `image_height` and `image_width` need to be positive.
+    boxes: A 2-D tensor of shape `[num_boxes, 4]`. The `i`-th row of the tensor
+      specifies the coordinates of a box in the `box_ind[i]` image and is
+      specified in normalized coordinates `[y1, x1, y2, x2]`. A normalized
+      coordinate value of `y` is mapped to the image coordinate at `y *
+      (image_height - 1)`, so as the `[0, 1]` interval of normalized image
+      height is mapped to `[0, image_height - 1]` in image height coordinates.
+      We do allow `y1` > `y2`, in which case the sampled crop is an up-down
+      flipped version of the original image. The width dimension is treated
+      similarly. Normalized coordinates outside the `[0, 1]` range are allowed,
+      in which case we use `extrapolation_value` to extrapolate the input image
+      values.
+    box_indices: A 1-D tensor of shape `[num_boxes]` with int32 values in `[0,
+      batch)`. The value of `box_ind[i]` specifies the image that the `i`-th box
+      refers to.
+    crop_size: A 1-D tensor of 2 elements, `size = [crop_height, crop_width]`.
+      All cropped image patches are resized to this size. The aspect ratio of
+      the image content is not preserved. Both `crop_height` and `crop_width`
+      need to be positive.
+    method: An optional string specifying the sampling method for resizing. It
+      can be either `"bilinear"` or `"nearest"` and default to `"bilinear"`.
+      Currently two sampling methods are supported: Bilinear and Nearest
+      Neighbor.
+    extrapolation_value: An optional `float`. Defaults to `0`. Value used for
+      extrapolation, when applicable.
+    name: A name for the operation (optional).
+
+  Returns:
+    A 4-D tensor of shape `[num_boxes, crop_height, crop_width, depth]`.
+  """
+  return gen_image_ops.crop_and_resize(
+      image, boxes, box_indices, crop_size, method, extrapolation_value, name)
+
+
+crop_and_resize_deprecation = deprecation.deprecated_args(
+    None, 'box_ind is deprecated, use box_indices instead', 'box_ind')
+tf_export(v1=['image.crop_and_resize'])(
+    crop_and_resize_deprecation(gen_image_ops.crop_and_resize))
diff --git a/tensorflow/python/ops/image_ops_test.py b/tensorflow/python/ops/image_ops_test.py
index a3aeb79586be2cad6eb5d6e84f9a19dcc582c07a..e7249333bd35d07821004a39c3c78e52c1ee904d 100644
--- a/tensorflow/python/ops/image_ops_test.py
+++ b/tensorflow/python/ops/image_ops_test.py
@@ -70,7 +70,8 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.hsv_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1)
@@ -84,7 +85,7 @@ class RGBToHSVTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=True):
         hsv = image_ops.rgb_to_hsv(rgb_np)
         rgb = image_ops.hsv_to_rgb(hsv)
-        rgb_tf = rgb.eval()
+        rgb_tf = self.evaluate(rgb)
       self.assertAllClose(rgb_tf, rgb_np)
 
 
@@ -109,7 +110,8 @@ class RGBToYIQTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.yiq_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
@@ -138,7 +140,8 @@ class RGBToYUVTest(test_util.TensorFlowTestCase):
         split2 = list(map(image_ops.yuv_to_rgb, split1))
         join1 = array_ops.stack(split1)
         join2 = array_ops.stack(split2)
-        batch1, batch2, join1, join2 = sess.run([batch1, batch2, join1, join2])
+        batch1, batch2, join1, join2 = self.evaluate(
+            [batch1, batch2, join1, join2])
 
       # Verify that processing batch elements together is the same as separate
       self.assertAllClose(batch1, join1, rtol=1e-4, atol=1e-4)
@@ -173,7 +176,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.rgb_to_grayscale(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBasicRGBToGrayscale(self):
@@ -195,7 +198,7 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
     # 3-D input with no batch dimension.
@@ -205,9 +208,10 @@ class GrayscaleToRGBTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.grayscale_to_rgb(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     # Shape inference works and produces expected output where possible
     rgb_shape = [7, None, 19, 3]
@@ -245,7 +249,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=1)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       y_np = x_np
 
       self.assertAllClose(y_tf, y_np, 1e-6)
@@ -268,6 +272,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       else:
         raise AssertionError("Exception not raised: %s" % err_msg)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_less_zero_tensor(self):
     """White image should be returned for gamma equal to zero"""
     with self.cached_session():
@@ -281,7 +286,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       err_msg = "Gamma should be a non-negative real number."
       try:
-        image.eval()
+        self.evaluate(image)
       except Exception as e:
         if err_msg not in str(e):
           raise
@@ -297,7 +302,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_gamma(x, gamma=0)
 
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       dtype = x.dtype.as_numpy_dtype
       y_np = np.array([dtypes.dtype_range[dtype][1]] * x_np.size)
@@ -305,6 +310,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_less_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to half"""
@@ -326,6 +332,7 @@ class AdjustGamma(test_util.TensorFlowTestCase):
 
       self.assertAllClose(y_tf, y_np, 1e-6)
 
+  @test_util.run_deprecated_v1
   def test_adjust_gamma_greater_one(self):
     """Verifying the output with expected results for gamma
     correction with gamma equal to two"""
@@ -360,7 +367,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testAdjustPositiveHue(self):
@@ -375,7 +382,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchAdjustHue(self):
@@ -390,7 +397,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_hue(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustHueNp(self, x_np, delta_h):
@@ -415,7 +422,7 @@ class AdjustHueTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_hue(x, delta_h)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testAdjustRandomHue(self):
@@ -488,11 +495,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -518,11 +525,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -548,11 +555,11 @@ class FlipImageBenchmark(test.Benchmark):
             trainable=False,
             dtype=dtypes.float32)
         run_op = image_ops.random_flip_left_right(inputs)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in xrange(warmup_rounds + benchmark_rounds):
           if i == warmup_rounds:
             start = time.time()
-          sess.run(run_op)
+          self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -610,11 +617,11 @@ class AdjustHueBenchmark(test.Benchmark):
       delta = constant_op.constant(0.1, dtype=dtypes.float32)
       outputs = image_ops.adjust_hue(inputs, delta)
       run_op = control_flow_ops.group(outputs)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for i in xrange(warmup_rounds + benchmark_rounds):
         if i == warmup_rounds:
           start = time.time()
-        sess.run(run_op)
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -653,12 +660,12 @@ class AdjustSaturationBenchmark(test.Benchmark):
       delta = constant_op.constant(0.1, dtype=dtypes.float32)
       outputs = image_ops.adjust_saturation(inputs, delta)
       run_op = control_flow_ops.group(outputs)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       for _ in xrange(warmup_rounds):
-        sess.run(run_op)
+        self.evaluate(run_op)
       start = time.time()
       for _ in xrange(benchmark_rounds):
-        sess.run(run_op)
+        self.evaluate(run_op)
     end = time.time()
     step_time = (end - start) / benchmark_rounds
     tag = device + "_%s" % (cpu_count if cpu_count is not None else "_all")
@@ -698,7 +705,7 @@ class ResizeBilinearBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -746,7 +753,7 @@ class ResizeBicubicBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -803,7 +810,7 @@ class ResizeAreaBenchmark(test.Benchmark):
       benchmark_op = control_flow_ops.group(*deps)
 
     with self.benchmark_session() as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       results = self.run_op_benchmark(
           sess,
           benchmark_op,
@@ -846,7 +853,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturation(self):
@@ -861,7 +868,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testBatchSaturation(self):
@@ -876,7 +883,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjust_saturation(self, image, saturation_factor):
@@ -899,7 +906,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTwiceSaturationFused(self):
@@ -914,7 +921,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = self._adjust_saturation(x, saturation_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def _adjustSaturationNp(self, x_np, scale):
@@ -935,6 +942,7 @@ class AdjustSaturationTest(test_util.TensorFlowTestCase):
       y_v[i][2] = b
     return y_v.reshape(x_np.shape)
 
+  @test_util.run_deprecated_v1
   def testAdjustRandomSaturation(self):
     x_shapes = [
         [2, 2, 3],
@@ -980,7 +988,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionLeftRightWithBatch(self):
@@ -990,9 +998,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(image_ops.flip_left_right(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1001,7 +1010,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
       self.assertTrue(y.op.name.startswith("flip_left_right"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testLeftRightWithBatch(self):
@@ -1015,9 +1024,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_left_right(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipLeftRight(self):
     x_np = np.array([[1, 2, 3], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[3, 2, 1], [3, 2, 1]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1031,7 +1041,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1046,6 +1056,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipLeftRightWithBatch(self):
     batch_size = 16
     seed = 42
@@ -1070,7 +1081,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1096,7 +1107,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionUpDownWithBatch(self):
@@ -1107,9 +1118,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(image_ops.flip_up_down(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1118,7 +1130,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
       self.assertTrue(y.op.name.startswith("flip_up_down"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testUpDownWithBatch(self):
@@ -1132,9 +1144,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.flip_up_down(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipUpDown(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[4, 5, 6], [1, 2, 3]], dtype=np.uint8).reshape([2, 3, 1])
@@ -1148,7 +1161,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         if y_tf[0][0] == 1:
           self.assertAllEqual(y_tf, x_np)
           count_unflipped += 1
@@ -1163,6 +1176,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       self.assertGreaterEqual(count_flipped, 20)
       self.assertGreaterEqual(count_unflipped, 20)
 
+  @test_util.run_deprecated_v1
   def testRandomFlipUpDownWithBatch(self):
     batch_size = 16
     seed = 42
@@ -1187,7 +1201,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       count_flipped = 0
       count_unflipped = 0
       for _ in range(100):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
 
         # check every element of the batch
         for i in range(batch_size):
@@ -1213,7 +1227,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
   def testInvolutionTransposeWithBatch(self):
@@ -1224,9 +1238,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(image_ops.transpose_image(x_tf))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, x_np)
 
+  @test_util.run_deprecated_v1
   def testTranspose(self):
     x_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.uint8).reshape([2, 3, 1])
     y_np = np.array([[1, 4], [2, 5], [3, 6]], dtype=np.uint8).reshape([3, 2, 1])
@@ -1234,8 +1249,8 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      self.assertTrue(y.op.name.startswith("transpose_image"))
-      y_tf = y.eval()
+      self.assertTrue(y.op.name.startswith("transpose"))
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
   def testTransposeWithBatch(self):
@@ -1250,9 +1265,10 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x_tf = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.transpose_image(x_tf)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
 
+  @test_util.run_deprecated_v1
   def testPartialShapes(self):
     p_unknown_rank = array_ops.placeholder(dtypes.uint8)
     p_unknown_dims_3 = array_ops.placeholder(
@@ -1301,7 +1317,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
   def testRot90GroupOrderWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
@@ -1309,8 +1325,9 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
       rotated = image
       for _ in xrange(4):
         rotated = image_ops.rot90(rotated)
-      self.assertAllEqual(image, rotated.eval())
+      self.assertAllEqual(image, self.evaluate(rotated))
 
+  @test_util.run_deprecated_v1
   def testRot90NumpyEquivalence(self):
     image = np.arange(24, dtype=np.uint8).reshape([2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1320,6 +1337,7 @@ class FlipTransposeRotateTest(test_util.TensorFlowTestCase):
         y_np = np.rot90(image, k=k)
         self.assertAllEqual(y_np, y_tf.eval({k_placeholder: k}))
 
+  @test_util.run_deprecated_v1
   def testRot90NumpyEquivalenceWithBatch(self):
     image = np.arange(48, dtype=np.uint8).reshape([2, 2, 4, 3])
     with self.test_session(use_gpu=True):
@@ -1335,7 +1353,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testDoubleContrastUint8(self):
@@ -1390,7 +1408,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np)
       y = image_ops.adjust_contrast(x, contrast_factor)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
     return y_tf
 
   def testRandomContrast(self):
@@ -1408,6 +1426,7 @@ class AdjustContrastTest(test_util.TensorFlowTestCase):
       y_tf = self._adjustContrastTf(x_np, contrast_factor)
       self.assertAllClose(y_tf, y_np, rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testContrastFactorShape(self):
     x_shape = [1, 2, 2, 3]
     x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1]
@@ -1423,7 +1442,7 @@ class AdjustBrightnessTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_np.shape)
       y = image_ops.adjust_brightness(x, delta)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, 1e-6)
 
   def testPositiveDeltaUint8(self):
@@ -1471,6 +1490,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     y /= stddev
     return y
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     x_shape = [13, 9, 3]
     x_np = np.arange(0, np.prod(x_shape), dtype=np.int32).reshape(x_shape)
@@ -1480,7 +1500,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.per_image_standardization(x)
       self.assertTrue(y.op.name.startswith("per_image_standardization"))
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllClose(y_tf, y_np, atol=1e-4)
 
   def testUniformImage(self):
@@ -1488,7 +1508,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     im = constant_op.constant(im_np)
     whiten = image_ops.per_image_standardization(im)
     with self.test_session(use_gpu=True):
-      whiten_np = whiten.eval()
+      whiten_np = self.evaluate(whiten)
       self.assertFalse(np.any(np.isnan(whiten_np)))
 
   def testBatchWhitening(self):
@@ -1497,7 +1517,7 @@ class PerImageWhiteningTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       imgs = constant_op.constant(imgs_np)
       whiten = image_ops.per_image_standardization(imgs)
-      whiten_tf = whiten.eval()
+      whiten_tf = self.evaluate(whiten)
       for w_tf, w_np in zip(whiten_tf, whiten_np):
         self.assertAllClose(w_tf, w_np, atol=1e-4)
 
@@ -1571,11 +1591,13 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     y = image_ops.crop_to_bounding_box(image, 0, 0, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
     self._assertReturns(x, x_shape, 0, 0, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testCrop(self):
     x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
@@ -1600,6 +1622,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     y = [1, 2, 4, 5, 7, 8]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([59, 69, 3], 55, 66, [55, 66, 3])
@@ -1613,6 +1636,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -1624,6 +1648,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
@@ -1655,6 +1680,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
           "assertion failed:",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [4, 4, 1]
     x = np.zeros(x_shape)
@@ -1672,6 +1698,7 @@ class CropToBoundingBoxTest(test_util.TensorFlowTestCase):
     for params, err_msg in test_config:
       self._assertRaises(x, x_shape, *params, err_msg=err_msg)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.crop_to_bounding_box(image, 0, 0, 55, 66)
@@ -1688,6 +1715,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     else:
       self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shapes = [[13, 9, 3], [5, 13, 9, 3]]
     for x_shape in x_shapes:
@@ -1696,7 +1724,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
         with self.test_session(use_gpu=use_gpu):
           x = constant_op.constant(x_np, shape=x_shape)
           y = image_ops.central_crop(x, 1.0)
-          y_tf = y.eval()
+          y_tf = self.evaluate(y)
           self.assertAllEqual(y_tf, x_np)
           self.assertEqual(y.op.name, x.op.name)
 
@@ -1711,7 +1739,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
       with self.test_session(use_gpu=use_gpu):
         x = constant_op.constant(x_np, shape=x_shape)
         y = image_ops.central_crop(x, 0.5)
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         self.assertAllEqual(y_tf, y_np)
         self.assertAllEqual(y_tf.shape, y_np.shape)
 
@@ -1727,10 +1755,11 @@ class CentralCropTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       x = constant_op.constant(x_np, shape=x_shape)
       y = image_ops.central_crop(x, 0.5)
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
       self.assertAllEqual(y_tf, y_np)
       self.assertAllEqual(y_tf.shape, y_np.shape)
 
+  @test_util.run_deprecated_v1
   def testCropping2(self):
     # Test case for 10315
     x_shapes = [[240, 320, 3], [5, 240, 320, 3]]
@@ -1747,6 +1776,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
           self.assertAllEqual(y_tf, y_np)
           self.assertAllEqual(y_tf.shape, y_np.shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     # Test no-op fraction=1.0, with 3-D tensors.
     self._assertShapeInference([50, 60, 3], 1.0, [50, 60, 3])
@@ -1807,6 +1837,7 @@ class CentralCropTest(test_util.TensorFlowTestCase):
           with self.assertRaises(ValueError):
             _ = image_ops.central_crop(x, 0.5)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     x_shape = [13, 9, 3]
     x_np = np.ones(x_shape, dtype=np.float32)
@@ -1897,14 +1928,16 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     i = constant_op.constant([1, 0, 4, 3], dtype=dtypes.int64)
     y_tf = image_ops.pad_to_bounding_box(x, i[0], i[1], i[2], i[3])
     with self.test_session(use_gpu=True):
-      self.assertAllClose(y, y_tf.eval())
+      self.assertAllClose(y, self.evaluate(y_tf))
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
     offset_height, offset_width = [0, 0]
     self._assertReturns(x, x_shape, offset_height, offset_width, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPadding(self):
     x = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     x_shape = [3, 3, 1]
@@ -1929,6 +1962,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     y_shape = [3, 4, 1]
     self._assertReturns(x, x_shape, offset_height, offset_width, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([50, 60, 3], 55, 66, [55, 66, 3])
@@ -1942,6 +1976,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -1953,6 +1988,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
                          target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     # Each line is a test configuration:
@@ -1985,6 +2021,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
           "all dims of \\'image.shape\\' must be > 0",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [3, 3, 1]
     x = np.zeros(x_shape)
@@ -1999,6 +2036,7 @@ class PadToBoundingBoxTest(test_util.TensorFlowTestCase):
     for config_item in test_config:
       self._assertRaises(x, x_shape, *config_item)
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[55, 66, 3])
     y = image_ops.pad_to_bounding_box(image, 0, 0, 55, 66)
@@ -2040,7 +2078,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       y = array_ops.strided_slice(image_tf, begin, begin + size)
 
       for _ in xrange(num_iter):
-        y_tf = y.eval()
+        y_tf = self.evaluate(y)
         crop_height = y_tf.shape[0]
         crop_width = y_tf.shape[1]
         aspect_ratio = float(crop_width) / float(crop_height)
@@ -2106,6 +2144,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
     # TODO(wicke, shlens, dga): Restore this test so that it is no longer flaky.
     # self.assertGreaterEqual(min(fraction_object_covered), min_object_covered)
 
+  @test_util.run_deprecated_v1
   def testWholeImageBoundingBox(self):
     height = 40
     width = 50
@@ -2120,6 +2159,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
         aspect_ratio_range=(0.75, 1.33),
         area_range=(0.05, 1.0))
 
+  @test_util.run_deprecated_v1
   def testWithBoundingBox(self):
     height = 40
     width = 50
@@ -2150,6 +2190,7 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
         aspect_ratio_range=(0.75, 1.33),
         area_range=(0.05, 1.0))
 
+  @test_util.run_deprecated_v1
   def testSampleDistortedBoundingBoxShape(self):
     with self.test_session(use_gpu=True):
       image_size = constant_op.constant(
@@ -2171,9 +2212,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
       begin, end, bbox_for_drawing = image_ops.sample_distorted_bounding_box(
           image_size=image_size,
@@ -2207,9 +2248,9 @@ class SelectDistortedCropBoxTest(test_util.TensorFlowTestCase):
       self.assertAllEqual([3], end.get_shape().as_list())
       self.assertAllEqual([1, 1, 4], bbox_for_drawing.get_shape().as_list())
       # Actual run to make sure shape is correct inside Compute().
-      begin = begin.eval()
-      end = end.eval()
-      bbox_for_drawing = bbox_for_drawing.eval()
+      begin = self.evaluate(begin)
+      end = self.evaluate(end)
+      bbox_for_drawing = self.evaluate(bbox_for_drawing)
 
 
 class ResizeImagesTest(test_util.TensorFlowTestCase):
@@ -2245,6 +2286,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     else:
       return False
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     img_shape = [1, 6, 4, 1]
     single_shape = [6, 4, 1]
@@ -2265,7 +2307,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(image, [target_height, target_width], opt)
           yshape = array_ops.shape(y)
-          resized, newshape = sess.run([y, yshape])
+          resized, newshape = self.evaluate([y, yshape])
           self.assertAllEqual(img_shape, newshape)
           self.assertAllClose(resized, img_np, atol=1e-5)
 
@@ -2276,9 +2318,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         y = image_ops.resize_images(image, [target_height, target_width],
                                     self.OPTIONS[0])
         yshape = array_ops.shape(y)
-        newshape = yshape.eval()
+        newshape = self.evaluate(yshape)
         self.assertAllEqual(single_shape, newshape)
 
+  @test_util.run_deprecated_v1
   def testTensorArguments(self):
     img_shape = [1, 6, 4, 1]
     single_shape = [6, 4, 1]
@@ -2340,6 +2383,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       _ = image_ops.resize_images(image, [6, None],
                                   image_ops.ResizeMethod.BILINEAR)
 
+  @test_util.run_deprecated_v1
   def testReturnDtype(self):
     target_shapes = [[6, 4], [3, 2], [
         array_ops.placeholder(dtypes.int32),
@@ -2379,7 +2423,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
         image = constant_op.constant(img_np, shape=img_shape)
         y = image_ops.resize_images(image, [height, width], opt)
         yshape = array_ops.shape(y)
-        resized, newshape = sess.run([y, yshape])
+        resized, newshape = self.evaluate([y, yshape])
         self.assertAllEqual(img_shape, newshape)
         self.assertAllClose(resized, img_np, atol=1e-5)
 
@@ -2411,7 +2455,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
               y = image_ops.resize_images(image, [target_height, target_width],
                                           opt)
               expected = np.array(expected_data).reshape(target_shape)
-              resized = y.eval()
+              resized = self.evaluate(y)
               self.assertAllClose(resized, expected, atol=1e-5)
 
   def testResizeUpAlignCornersFalse(self):
@@ -2446,7 +2490,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=False)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2482,7 +2526,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
           image = constant_op.constant(img_np, shape=img_shape)
           y = image_ops.resize_images(
               image, [target_height, target_width], opt, align_corners=True)
-          resized = y.eval()
+          resized = self.evaluate(y)
           expected = np.array(expected_data[opt]).reshape(
               [1, target_height, target_width, 1])
           self.assertAllClose(resized, expected, atol=1e-05)
@@ -2509,7 +2553,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
       image = constant_op.constant(img_np, shape=img_shape)
       y = image_ops.resize_images(image, [target_height, target_width],
                                   image_ops.ResizeMethod.BICUBIC)
-      resized = y.eval()
+      resized = self.evaluate(y)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
       self.assertAllClose(resized, expected, atol=1)
@@ -2534,7 +2578,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                   image_ops.ResizeMethod.AREA)
       expected = np.array(expected_data).reshape(
           [1, target_height, target_width, 1])
-      resized = y.eval()
+      resized = self.evaluate(y)
       self.assertAllClose(resized, expected, atol=1)
 
   def testCompareNearestNeighbor(self):
@@ -2554,7 +2598,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            gpu_val = out_op.eval()
+            gpu_val = self.evaluate(out_op)
           with self.test_session(use_gpu=False):
             image = constant_op.constant(img_np, shape=input_shape)
             new_size = constant_op.constant([target_height, target_width])
@@ -2563,7 +2607,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                 new_size,
                 image_ops.ResizeMethod.NEAREST_NEIGHBOR,
                 align_corners=align_corners)
-            cpu_val = out_op.eval()
+            cpu_val = self.evaluate(out_op)
           self.assertAllClose(cpu_val, gpu_val, rtol=1e-5, atol=1e-5)
 
   def testCompareBilinear(self):
@@ -2585,9 +2629,10 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                   new_size,
                   image_ops.ResizeMethod.BILINEAR,
                   align_corners=align_corners)
-              value[use_gpu] = out_op.eval()
+              value[use_gpu] = self.evaluate(out_op)
           self.assertAllClose(value[True], value[False], rtol=1e-5, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([50, 60, 3], [55, 66], [55, 66, 3])
     self._assertShapeInference([55, 66, 3], [55, 66], [55, 66, 3])
@@ -2608,12 +2653,13 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([59, 60, None], [55, 66], [55, 66, None])
     self._assertShapeInference([None, None, None], [55, 66], [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     img_shape = [1, 3, 2, 1]
     with self.test_session(use_gpu=True):
       single_image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
       y = image_ops.resize_images(single_image, [55, 66])
-      self.assertTrue(y.op.name.startswith("resize_images"))
+      self.assertTrue(y.op.name.startswith("resize"))
 
   def _ResizeImageCall(self, x, max_h, max_w, preserve_aspect_ratio,
                        use_tensor_inputs):
@@ -2658,6 +2704,7 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
                                    preserve_aspect_ratio, use_tensor_inputs)
       self.assertShapeEqual(y, ops.convert_to_tensor(y_tf))
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioMultipleImages(self):
     x_shape = [10, 100, 100, 10]
     x = np.random.uniform(size=x_shape)
@@ -2665,36 +2712,42 @@ class ResizeImagesTest(test_util.TensorFlowTestCase):
     self._assertResizeCheckShape(x, x_shape, [250, 250], [10, 250, 250, 10],
                                  preserve_aspect_ratio=False)
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeEqual(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSmaller(self):
     x_shape = [100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [75, 50], [50, 50, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSmallerMultipleImages(self):
     x_shape = [10, 100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [75, 50], [10, 50, 50, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioLarger(self):
     x_shape = [100, 100, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [150, 200], [150, 150, 10])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSameRatio(self):
     x_shape = [1920, 1080, 3]
     x = np.random.uniform(size=x_shape)
 
     self._assertResizeCheckShape(x, x_shape, [3840, 2160], [3840, 2160, 3])
 
+  @test_util.run_deprecated_v1
   def testPreserveAspectRatioSquare(self):
     x_shape = [299, 299, 3]
     x = np.random.uniform(size=x_shape)
@@ -2764,12 +2817,14 @@ class ResizeImageWithPadTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_image_with_pad(image, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertReturns(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPad(self):
     # Reduce vertical dimension
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2860,12 +2915,14 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     y = image_ops.resize_image_with_crop_or_pad(image, height, width)
     self.assertEqual(y.get_shape().as_list(), post_shape)
 
+  @test_util.run_deprecated_v1
   def testNoOp(self):
     x_shape = [10, 10, 10]
     x = np.random.uniform(size=x_shape)
 
     self._assertReturns(x, x_shape, x, x_shape)
 
+  @test_util.run_deprecated_v1
   def testPad(self):
     # Pad even along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2903,6 +2960,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testCrop(self):
     # Crop even along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2940,6 +2998,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testCropAndPad(self):
     # Pad along row but crop along col.
     x = [1, 2, 3, 4, 5, 6, 7, 8]
@@ -2959,6 +3018,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
 
     self._assertReturns(x, x_shape, y, y_shape)
 
+  @test_util.run_deprecated_v1
   def testShapeInference(self):
     self._assertShapeInference([50, 60, 3], 55, 66, [55, 66, 3])
     self._assertShapeInference([55, 66, 3], 55, 66, [55, 66, 3])
@@ -2980,6 +3040,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     self._assertShapeInference([None, None, None], 55, 66, [55, 66, None])
     self._assertShapeInference(None, 55, 66, [55, 66, None])
 
+  @test_util.run_deprecated_v1
   def testNon3DInput(self):
     # Input image is not 3D
     x = [0] * 15
@@ -2993,6 +3054,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
       self._assertRaises(x, x_shape, target_height, target_width,
                          "'image' must have either 3 or 4 dimensions.")
 
+  @test_util.run_deprecated_v1
   def testZeroLengthInput(self):
     # Input image has 0-length dimension(s).
     target_height, target_width = [1, 1]
@@ -3018,6 +3080,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
           "all dims of \\'image.shape\\' must be > 0",
           use_tensor_inputs_options=[True])
 
+  @test_util.run_deprecated_v1
   def testBadParams(self):
     x_shape = [4, 4, 1]
     x = np.zeros(x_shape)
@@ -3032,6 +3095,7 @@ class ResizeImageWithCropOrPadTest(test_util.TensorFlowTestCase):
     self._assertRaises(x, x_shape, target_height, target_width,
                        "target_width must be > 0")
 
+  @test_util.run_deprecated_v1
   def testNameScope(self):
     image = array_ops.placeholder(dtypes.float32, shape=[50, 60, 3])
     y = image_ops.resize_image_with_crop_or_pad(image, 55, 66)
@@ -3066,7 +3130,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       jpeg0 = io_ops.read_file(path)
       image0 = image_ops.decode_jpeg(jpeg0)
       image1 = image_ops.decode_jpeg(image_ops.encode_jpeg(image0))
-      jpeg0, image0, image1 = sess.run([jpeg0, image0, image1])
+      jpeg0, image0, image1 = self.evaluate([jpeg0, image0, image1])
       self.assertEqual(len(jpeg0), 3771)
       self.assertEqual(image0.shape, (256, 128, 3))
       self.assertLess(self.averageError(image0, image1), 1.4)
@@ -3083,7 +3147,7 @@ class JpegTest(test_util.TensorFlowTestCase):
             io_ops.read_file(rgb_path), channels=channels)
         cmyk = image_ops.decode_jpeg(
             io_ops.read_file(cmyk_path), channels=channels)
-        rgb, cmyk = sess.run([rgb, cmyk])
+        rgb, cmyk = self.evaluate([rgb, cmyk])
         self.assertEqual(rgb.shape, shape)
         self.assertEqual(cmyk.shape, shape)
         error = self.averageError(rgb, cmyk)
@@ -3112,9 +3176,10 @@ class JpegTest(test_util.TensorFlowTestCase):
                             image2.get_shape().as_list())
 
         # CropAndDecode should be equal to DecodeJpeg+Crop.
-        image1_crop, image2 = sess.run([image1_crop, image2])
+        image1_crop, image2 = self.evaluate([image1_crop, image2])
         self.assertAllEqual(image1_crop, image2)
 
+  @test_util.run_deprecated_v1
   def testCropAndDecodeJpegWithInvalidCropWindow(self):
     with self.cached_session() as sess:
       # Encode it, then decode it, then encode it
@@ -3131,7 +3196,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         with self.assertRaisesWithPredicateMatch(
             errors.InvalidArgumentError,
             lambda e: "Invalid JPEG data or crop window" in str(e)):
-          sess.run(result)
+          self.evaluate(result)
 
   def testSynthetic(self):
     with self.test_session(use_gpu=True) as sess:
@@ -3141,7 +3206,8 @@ class JpegTest(test_util.TensorFlowTestCase):
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_ACCURATE")
       image2 = image_ops.decode_jpeg(
           image_ops.encode_jpeg(image1), dct_method="INTEGER_ACCURATE")
-      jpeg0, image0, image1, image2 = sess.run([jpeg0, image0, image1, image2])
+      jpeg0, image0, image1, image2 = self.evaluate(
+          [jpeg0, image0, image1, image2])
 
       # The decoded-encoded image should be similar to the input
       self.assertLess(self.averageError(image0, image1), 0.6)
@@ -3161,7 +3227,8 @@ class JpegTest(test_util.TensorFlowTestCase):
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_FAST")
       image2 = image_ops.decode_jpeg(
           image_ops.encode_jpeg(image1), dct_method="INTEGER_FAST")
-      jpeg0, image0, image1, image2 = sess.run([jpeg0, image0, image1, image2])
+      jpeg0, image0, image1, image2 = self.evaluate(
+          [jpeg0, image0, image1, image2])
 
       # The decoded-encoded image should be similar to the input, but
       # note this is worse than the slower algorithm because it is
@@ -3184,11 +3251,12 @@ class JpegTest(test_util.TensorFlowTestCase):
       jpeg0 = image_ops.encode_jpeg(image0)
       image1 = image_ops.decode_jpeg(jpeg0, dct_method="INTEGER_FAST")
       image2 = image_ops.decode_jpeg(jpeg0)
-      image1, image2 = sess.run([image1, image2])
+      image1, image2 = self.evaluate([image1, image2])
 
       # The images should be the same.
       self.assertAllClose(image1, image2)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
       jpeg = constant_op.constant("nonsense")
@@ -3197,6 +3265,7 @@ class JpegTest(test_util.TensorFlowTestCase):
         self.assertEqual(image.get_shape().as_list(),
                          [None, None, channels or None])
 
+  @test_util.run_deprecated_v1
   def testExtractJpegShape(self):
     # Read a real jpeg and verify shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
@@ -3207,6 +3276,7 @@ class JpegTest(test_util.TensorFlowTestCase):
       [image_shape] = sess.run([image_ops.extract_jpeg_shape(jpeg)])
       self.assertEqual(image_shape.tolist(), [256, 128, 3])
 
+  @test_util.run_deprecated_v1
   def testExtractJpegShapeforCmyk(self):
     # Read a cmyk jpeg image, and verify its shape.
     path = ("tensorflow/core/lib/jpeg/testdata/"
@@ -3230,11 +3300,11 @@ class PngTest(test_util.TensorFlowTestCase):
         with self.test_session(use_gpu=True) as sess:
           png0 = io_ops.read_file(prefix + filename)
           image0 = image_ops.decode_png(png0, channels=channels)
-          png0, image0 = sess.run([png0, image0])
+          png0, image0 = self.evaluate([png0, image0])
           self.assertEqual(image0.shape, (26, 51, channels or channels_in))
           if channels == channels_in:
             image1 = image_ops.decode_png(image_ops.encode_png(image0))
-            self.assertAllEqual(image0, image1.eval())
+            self.assertAllEqual(image0, self.evaluate(image1))
 
   def testSynthetic(self):
     with self.test_session(use_gpu=True) as sess:
@@ -3242,7 +3312,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(_SimpleColorRamp())
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
 
       # PNG is lossless
       self.assertAllEqual(image0, image1)
@@ -3257,7 +3327,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(_SimpleColorRamp(), dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0, dtype=dtypes.uint16)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
 
       # PNG is lossless
       self.assertAllEqual(image0, image1)
@@ -3273,7 +3343,7 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(gray_alpha)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
       self.assertEqual(2, image0.shape[-1])
       self.assertAllEqual(image0, image1)
 
@@ -3284,10 +3354,11 @@ class PngTest(test_util.TensorFlowTestCase):
       image0 = constant_op.constant(gray_alpha, dtype=dtypes.uint16)
       png0 = image_ops.encode_png(image0, compression=7)
       image1 = image_ops.decode_png(png0, dtype=dtypes.uint16)
-      png0, image0, image1 = sess.run([png0, image0, image1])
+      png0, image0, image1 = self.evaluate([png0, image0, image1])
       self.assertEqual(2, image0.shape[-1])
       self.assertAllEqual(image0, image1)
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True):
       png = constant_op.constant("nonsense")
@@ -3310,7 +3381,7 @@ class GifTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True) as sess:
       gif0 = io_ops.read_file(prefix + filename)
       image0 = image_ops.decode_gif(gif0)
-      gif0, image0 = sess.run([gif0, image0])
+      gif0, image0 = self.evaluate([gif0, image0])
 
       self.assertEqual(image0.shape, shape)
 
@@ -3332,6 +3403,7 @@ class GifTest(test_util.TensorFlowTestCase):
     self._testValid("scan.gif")
     self._testValid("optimized.gif")
 
+  @test_util.run_deprecated_v1
   def testShape(self):
     with self.test_session(use_gpu=True) as sess:
       gif = constant_op.constant("nonsense")
@@ -3358,6 +3430,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
         self.assertTrue(y_saturate.dtype == output_dtype)
         self.assertAllClose(y_saturate.eval(), y_np, atol=1e-5)
 
+  @test_util.run_deprecated_v1
   def testNoConvert(self):
     # Make sure converting to the same data type creates only an identity op
     with self.test_session(use_gpu=True):
@@ -3367,6 +3440,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self.assertEquals(y.op.type, "Identity")
       self.assertEquals(y.op.inputs[0], image)
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenInteger(self):
     # Make sure converting to between integer types scales appropriately
     with self.test_session(use_gpu=True):
@@ -3375,6 +3449,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([0, 2**32], dtypes.int64, dtypes.int32, [0, 1])
       self._convert([0, 1], dtypes.int32, dtypes.int64, [0, 2**32])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenFloat(self):
     # Make sure converting to between float types does nothing interesting
     with self.test_session(use_gpu=True):
@@ -3383,6 +3458,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([-1.0, 0, 1.0, 200000], dtypes.float64, dtypes.float32,
                     [-1.0, 0, 1.0, 200000])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenIntegerAndFloat(self):
     # Make sure converting from and to a float type scales appropriately
     with self.test_session(use_gpu=True):
@@ -3391,6 +3467,7 @@ class ConvertImageTest(test_util.TensorFlowTestCase):
       self._convert([0, 1.1 / 255.0, 1], dtypes.float32, dtypes.uint8,
                     [0, 1, 255])
 
+  @test_util.run_deprecated_v1
   def testConvertBetweenInt16AndInt8(self):
     with self.test_session(use_gpu=True):
       # uint8, uint16
@@ -3431,7 +3508,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
       y = image_ops.total_variation(images=x_tf)
 
       # Run the TensorFlow session to calculate the result.
-      y_tf = y.eval()
+      y_tf = self.evaluate(y)
 
       # Assert that the results are as expected within
       # some small error-bound in case they are float-values.
@@ -3582,6 +3659,7 @@ class TotalVariationTest(test_util.TensorFlowTestCase):
 
 class FormatTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFormats(self):
     prefix = "tensorflow/core/lib"
     paths = ("png/testdata/lena_gray.png", "jpeg/testdata/jpeg_merge_test1.jpg",
@@ -3614,6 +3692,7 @@ class FormatTest(test_util.TensorFlowTestCase):
 
 class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSelectFromThreeClusters(self):
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
                 [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
@@ -3629,6 +3708,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
           boxes, scores, max_output_size, iou_threshold).eval()
       self.assertAllClose(selected_indices, [3, 0, 5])
 
+  @test_util.run_deprecated_v1
   def testInvalidShape(self):
     # The boxes should be 2D of shape [num_boxes, 4].
     with self.assertRaisesRegexp(ValueError,
@@ -3671,6 +3751,7 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
       scores = constant_op.constant([0.9])
       image_ops.non_max_suppression(boxes, scores, 3, [[0.5]])
 
+  @test_util.run_deprecated_v1
   def testDataTypes(self):
     # Test case for GitHub issue 20199.
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
@@ -3709,12 +3790,13 @@ class NonMaxSuppressionTest(test_util.TensorFlowTestCase):
         iou_threshold = constant_op.constant(iou_threshold_np)
         selected_indices, _ = gen_image_ops.non_max_suppression_v4(
             boxes, scores, max_output_size, iou_threshold, score_threshold)
-        selected_indices = selected_indices.eval()
+        selected_indices = self.evaluate(selected_indices)
         self.assertAllClose(selected_indices, [3, 0, 5])
 
 
 class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSelectFromThreeClusters(self):
     boxes_np = [[0, 0, 1, 1], [0, 0.1, 1, 1.1], [0, -0.1, 1, 0.9],
                 [0, 10, 1, 11], [0, 10.1, 1, 11.1], [0, 100, 1, 101]]
@@ -3747,6 +3829,7 @@ class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
       self.assertAllClose(selected_indices.eval(), [3, 0, 5])
       self.assertEqual(num_valid.eval(), 3)
 
+  @test_util.run_deprecated_v1
   def testSelectFromContinuousOverLap(self):
     boxes_np = [[0, 0, 1, 1], [0, 0.2, 1, 1.2], [0, 0.4, 1, 1.4],
                 [0, 0.6, 1, 1.6], [0, 0.8, 1, 1.8], [0, 2, 1, 2]]
@@ -3774,6 +3857,7 @@ class NonMaxSuppressionPaddedTest(test_util.TensorFlowTestCase):
 
 class NonMaxSuppressionWithOverlapsTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testSelectOneFromThree(self):
     overlaps_np = [
         [1.0, 0.7, 0.2],
@@ -3799,6 +3883,7 @@ class NonMaxSuppressionWithOverlapsTest(test_util.TensorFlowTestCase):
 class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
   """Tests utility function used by ssim() and psnr()."""
 
+  @test_util.run_deprecated_v1
   def testWrongDims(self):
     img = array_ops.placeholder(dtype=dtypes.float32)
     img_np = np.array((2, 2))
@@ -3808,6 +3893,7 @@ class VerifyCompatibleImageShapesTest(test_util.TensorFlowTestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sess.run(checks, {img: img_np})
 
+  @test_util.run_deprecated_v1
   def testShapeMismatch(self):
     img1 = array_ops.placeholder(dtype=dtypes.float32)
     img2 = array_ops.placeholder(dtype=dtypes.float32)
@@ -3829,7 +3915,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/psnr/testdata", filename))
     im = image_ops.decode_jpeg(content, dct_method="INTEGER_ACCURATE")
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -3848,6 +3934,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testPSNRSingleImage(self):
     image1 = self._RandomImage((8, 8, 1), 1)
     image2 = self._RandomImage((8, 8, 1), 1)
@@ -3861,6 +3948,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1.0, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testPSNRMultiImage(self):
     image1 = self._RandomImage((10, 8, 8, 1), 1)
     image2 = self._RandomImage((10, 8, 8, 1), 1)
@@ -3874,6 +3962,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_image1, tf_image2, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGoldenPSNR(self):
     q20, q72, q95 = self._LoadTestImages()
 
@@ -3898,6 +3987,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       self.assertAllClose(psnr2, tf_psnr2, atol=0.001)
       self.assertAllClose(psnr3, tf_psnr3, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     q20, _, _ = self._LoadTestImages()
     psnr = self._PSNR_NumPy(q20, q20, 1)
@@ -3906,6 +3996,7 @@ class PSNRTest(test_util.TensorFlowTestCase):
       tf_psnr = image_ops.psnr(tf_q20, tf_q20, 1, "psnr").eval()
       self.assertAllClose(psnr, tf_psnr, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((10, 8, 8, 1), 255)
     img2 = self._RandomImage((10, 8, 8, 1), 255)
@@ -3916,7 +4007,8 @@ class PSNRTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     psnr_float32 = image_ops.psnr(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(psnr_uint8.eval(), psnr_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          psnr_uint8.eval(), self.evaluate(psnr_float32), atol=0.001)
 
 
 class SSIMTest(test_util.TensorFlowTestCase):
@@ -3935,7 +4027,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/ssim/testdata", filename))
     im = image_ops.decode_png(content)
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -3946,6 +4038,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testAgainstMatlab(self):
     """Tests against values produced by Matlab."""
     img = self._LoadTestImages()
@@ -3969,7 +4062,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     ssim = image_ops.ssim(constant_op.constant(img1),
                           constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
   def testBroadcast(self):
     img = self._LoadTestImages()[:2]
@@ -3981,8 +4074,9 @@ class SSIMTest(test_util.TensorFlowTestCase):
 
     ssim = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, ssim.eval(), atol=1e-4)
+      self.assertAllClose(expected, self.evaluate(ssim), atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testNegative(self):
     """Tests against negative SSIM index."""
     step = np.expand_dims(np.arange(0, 256, 16, dtype=np.uint8), axis=0)
@@ -3997,6 +4091,7 @@ class SSIMTest(test_util.TensorFlowTestCase):
     with self.test_session(use_gpu=True):
       self.assertLess(ssim.eval(), 0)
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((1, 16, 16, 3), 255)
     img2 = self._RandomImage((1, 16, 16, 3), 255)
@@ -4007,7 +4102,8 @@ class SSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
@@ -4026,7 +4122,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
         "tensorflow/core/lib/ssim/testdata", filename))
     im = image_ops.decode_png(content)
     im = image_ops.convert_image_dtype(im, dtypes.float32)
-    im, = sess.run([im])
+    im, = self.evaluate([im])
     return np.expand_dims(im, axis=0)
 
   def _LoadTestImages(self):
@@ -4037,6 +4133,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     """Returns an image or image batch with given shape."""
     return np.random.rand(*shape).astype(np.float32) * max_val
 
+  @test_util.run_deprecated_v1
   def testAgainstMatlab(self):
     """Tests against MS-SSIM computed with Matlab implementation.
 
@@ -4053,6 +4150,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     self.assertAllClose(expected, np.squeeze(scores), atol=1e-4)
 
+  @test_util.run_deprecated_v1
   def testUnweightedIsDifferentiable(self):
     img = self._LoadTestImages()
     ph = [array_ops.placeholder(dtype=dtypes.float32) for _ in range(2)]
@@ -4077,7 +4175,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     msssim = image_ops.ssim_multiscale(constant_op.constant(img1),
                                        constant_op.constant(img2), 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, msssim.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(msssim), 1e-4)
 
   def testBroadcast(self):
     """Tests MS-SSIM broadcasting."""
@@ -4090,7 +4188,7 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
 
     score_tensor = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(expected, score_tensor.eval(), 1e-4)
+      self.assertAllClose(expected, self.evaluate(score_tensor), 1e-4)
 
   def testRange(self):
     """Tests against low MS-SSIM score.
@@ -4108,12 +4206,13 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
       images = [ops.convert_to_tensor(x, dtype=dtypes.float32) for x in images]
       msssim_ops = [image_ops.ssim_multiscale(x, y, 1.0)
                     for x, y in itertools.combinations(images, 2)]
-      msssim = sess.run(msssim_ops)
+      msssim = self.evaluate(msssim_ops)
       msssim = np.squeeze(msssim)
 
     self.assertTrue(np.all(msssim >= 0.0))
     self.assertTrue(np.all(msssim <= 1.0))
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     img1 = self._RandomImage((1, 180, 240, 3), 255)
     img2 = self._RandomImage((1, 180, 240, 3), 255)
@@ -4124,7 +4223,8 @@ class MultiscaleSSIMTest(test_util.TensorFlowTestCase):
     img2 = image_ops.convert_image_dtype(img2, dtypes.float32)
     ssim_float32 = image_ops.ssim_multiscale(img1, img2, 1.0)
     with self.test_session(use_gpu=True):
-      self.assertAllClose(ssim_uint8.eval(), ssim_float32.eval(), atol=0.001)
+      self.assertAllClose(
+          ssim_uint8.eval(), self.evaluate(ssim_float32), atol=0.001)
 
 
 class ImageGradientsTest(test_util.TensorFlowTestCase):
@@ -4139,8 +4239,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
 
     dy, dx = image_ops.image_gradients(img)
     with self.cached_session():
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4164,8 +4264,8 @@ class ImageGradientsTest(test_util.TensorFlowTestCase):
     assert batch.get_shape().as_list() == [2, 2, 3, 2]
     dy, dx = image_ops.image_gradients(batch)
     with self.test_session(use_gpu=True):
-      actual_dy = dy.eval()
-      actual_dx = dx.eval()
+      actual_dy = self.evaluate(dy)
+      actual_dx = self.evaluate(dx)
       self.assertAllClose(expected_dy, actual_dy)
       self.assertAllClose(expected_dx, actual_dx)
 
@@ -4185,7 +4285,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
                            [[0, 0], [0, 12], [0, 0]]], [1, 2, 3, 1, 2])
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected, actual_sobel)
 
   def testSobelEdges5x3x4x2(self):
@@ -4207,7 +4307,7 @@ class SobelEdgesTest(test_util.TensorFlowTestCase):
 
     sobel = image_ops.sobel_edges(img)
     with self.test_session(use_gpu=True):
-      actual_sobel = sobel.eval()
+      actual_sobel = self.evaluate(sobel)
       self.assertAllClose(expected_batch, actual_sobel)
 
 
@@ -4220,7 +4320,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testPngUint16(self):
@@ -4230,7 +4330,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(png0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(
           image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testGifUint16(self):
@@ -4240,7 +4340,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(gif0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testBmpUint16(self):
@@ -4250,7 +4350,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.uint16)
       image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
                                              dtypes.uint16)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testJpegFloat32(self):
@@ -4260,7 +4360,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(jpeg0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_jpeg(jpeg0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testPngFloat32(self):
@@ -4270,7 +4370,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(png0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(
           image_ops.decode_png(png0, dtype=dtypes.uint16), dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testGifFloat32(self):
@@ -4280,7 +4380,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(gif0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_gif(gif0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
   def testBmpFloat32(self):
@@ -4290,7 +4390,7 @@ class DecodeImageTest(test_util.TensorFlowTestCase):
       image0 = image_ops.decode_image(bmp0, dtype=dtypes.float32)
       image1 = image_ops.convert_image_dtype(image_ops.decode_bmp(bmp0),
                                              dtypes.float32)
-      image0, image1 = sess.run([image0, image1])
+      image0, image1 = self.evaluate([image0, image1])
       self.assertAllEqual(image0, image1)
 
 
diff --git a/tensorflow/python/ops/init_ops.py b/tensorflow/python/ops/init_ops.py
index 4fe6d05620f6a9d1e29ddc0831642335f893ad7d..c0a4bcd51dd10f352366b74955241e5f97133130 100644
--- a/tensorflow/python/ops/init_ops.py
+++ b/tensorflow/python/ops/init_ops.py
@@ -55,6 +55,15 @@ class Initializer(object):
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
+    """Returns a tensor object initialized as specified by the initializer.
+
+    Args:
+      shape: Shape of the tensor.
+      dtype: Optional dtype of the tensor. If not provided use the initializer
+        dtype.
+      partition_info: Optional information about the possible partitioning of a
+        tensor.
+    """
     raise NotImplementedError
 
   def get_config(self):
@@ -143,7 +152,8 @@ class Constant(Initializer):
     value: A Python scalar, list or tuple of values, or a N-dimensional numpy
       array. All elements of the initialized variable will be set to the
       corresponding value in the `value` argument.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
     verify_shape: Boolean that enables verification of the shape of `value`. If
       `True`, the initializer will throw an error if the shape of `value` is not
       compatible with the shape of the initialized tensor.
@@ -216,7 +226,7 @@ class Constant(Initializer):
       dtype = self.dtype
     if verify_shape is None:
       verify_shape = self._verify_shape
-    return constant_op.constant(
+    return constant_op.constant_v1(
         self.value, dtype=dtype, shape=shape, verify_shape=verify_shape)
 
   def get_config(self):
@@ -239,7 +249,8 @@ class RandomUniform(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer.
   """
 
   def __init__(self, minval=0, maxval=None, seed=None, dtype=dtypes.float32):
@@ -275,7 +286,8 @@ class RandomNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -316,7 +328,8 @@ class TruncatedNormal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, mean=0.0, stddev=1.0, seed=None, dtype=dtypes.float32):
@@ -360,8 +373,7 @@ class UniformUnitScaling(Initializer):
   A similar calculation for convolutional networks gives an analogous result
   with `dim` equal to the product of the first 3 dimensions.  When
   nonlinearities are present, we need to multiply this by a constant `factor`.
-  See [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
-  ([pdf](http://arxiv.org/pdf/1412.6558.pdf)) for deeper motivation, experiments
+  See (Sussillo et al., 2014) for deeper motivation, experiments
   and the calculation of constants. In section 2.3 there, the constants were
   numerically computed: for a linear layer it's 1.0, relu: ~1.43, tanh: ~1.15.
 
@@ -370,7 +382,12 @@ class UniformUnitScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Sussillo et al., 2014](https://arxiv.org/abs/1412.6558)
+      ([pdf](http://arxiv.org/pdf/1412.6558.pdf))
   """
 
   @deprecated(None,
@@ -434,7 +451,8 @@ class VarianceScaling(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
 
   Raises:
     ValueError: In case of an invalid value for the "scale", mode" or
@@ -480,7 +498,7 @@ class VarianceScaling(Initializer):
     else:
       scale /= max(1., (fan_in + fan_out) / 2.)
     if self.distribution == "normal" or self.distribution == "truncated_normal":
-      # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
+    # constant taken from scipy.stats.truncnorm.std(a=-2, b=2, loc=0., scale=1.)
       stddev = math.sqrt(scale) / .87962566103423978
       return random_ops.truncated_normal(
           shape, 0.0, stddev, dtype, seed=self.seed)
@@ -531,7 +549,12 @@ class Orthogonal(Initializer):
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Saxe et al., 2014](https://openreview.net/forum?id=_wzZwKpTDF_9C)
+      ([pdf](https://arxiv.org/pdf/1312.6120.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -576,16 +599,21 @@ class ConvolutionDeltaOrthogonal(Initializer):
   The shape of the tensor must have length 3, 4 or 5. The number of input
   filters must not exceed the number of output filters. The center pixels of the
   tensor form an orthogonal matrix. Other pixels are set to be zero. See
-  algorithm 2 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  algorithm 2 in (Xiao et al., 2018).
 
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -613,7 +641,7 @@ class ConvolutionDeltaOrthogonal(Initializer):
     d = array_ops.diag_part(r)
     q *= math_ops.sign(d)
     q = q[:shape[-2], :]
-    q *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    q *= math_ops.cast(self.gain, dtype=dtype)
     if len(shape) == 3:
       weight = array_ops.scatter_nd([[(shape[0]-1)//2]],
                                     array_ops.expand_dims(q, 0), shape)
@@ -636,12 +664,17 @@ class ConvolutionOrthogonal(Initializer):
   Base class used to construct 1D, 2D and 3D orthogonal kernels for convolution.
 
   Args:
-    gain: multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __init__(self, gain=1.0, seed=None, dtype=dtypes.float32):
@@ -698,15 +731,20 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      This has the effect of scaling the output 2-norm by a factor of
-      `sqrt(gain)`.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. This has the effect of scaling the output 2-norm by
+      a factor of `gain`.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -722,7 +760,7 @@ class ConvolutionOrthogonal2D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[2], shape[3])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2):
@@ -834,16 +872,21 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 in [Xiao et al., 2018]: https://arxiv.org/abs/1806.05393
+  See algorithm 1 in (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -856,7 +899,7 @@ class ConvolutionOrthogonal1D(ConvolutionOrthogonal):
       raise ValueError("In_filters cannot be greater than out_filters.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k):
@@ -951,15 +994,20 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
   filters must not exceed the number of output filters.
   The orthogonality(==isometry) is exact when the inputs are circular padded.
   There are finite-width effects with non-circular padding (e.g. zero padding).
-  See algorithm 1 [Xiao et al., 2018] in: https://arxiv.org/abs/1806.05393
+  See algorithm 1 (Xiao et al., 2018).
 
   Args:
-    gain: Multiplicative factor to apply to the orthogonal matrix. Default is 1.
-      The 2-norm of an input is multiplied by a factor of 'sqrt(gain)' after
-      applying this convolution.
+    gain: Multiplicative factor to apply to the orthogonal
+      matrix. Default is 1. The 2-norm of an input is multiplied by a factor of
+      `gain` after applying this convolution.
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed` for behavior.
-    dtype: The data type.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Xiao et al., 2018](http://proceedings.mlr.press/v80/xiao18a.html)
+      ([pdf](http://proceedings.mlr.press/v80/xiao18a/xiao18a.pdf))
   """
 
   def __call__(self, shape, dtype=None, partition_info=None):
@@ -975,7 +1023,7 @@ class ConvolutionOrthogonal3D(ConvolutionOrthogonal):
       raise ValueError("Kernel sizes must be equal.")
 
     kernel = self._orthogonal_kernel(shape[0], shape[-2], shape[-1])
-    kernel *= math_ops.sqrt(math_ops.cast(self.gain, dtype=dtype))
+    kernel *= math_ops.cast(self.gain, dtype=dtype)
     return kernel
 
   def _dict_to_tensor(self, x, k1, k2, k3):
@@ -1105,7 +1153,8 @@ class Identity(Initializer):
 
   Args:
     gain: Multiplicative factor to apply to the identity matrix.
-    dtype: The type of the output.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
   """
 
   def __init__(self, gain=1.0, dtype=dtypes.float32):
@@ -1139,18 +1188,19 @@ class GlorotUniform(VarianceScaling):
   where `fan_in` is the number of input units in the weight tensor
   and `fan_out` is the number of output units in the weight tensor.
 
-  Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-
   Args:
     seed: A Python integer. Used to create random seeds. See
       `tf.set_random_seed`
       for behavior.
-    dtype: The data type. Only floating point types are supported.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotUniform, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1159,10 +1209,7 @@ class GlorotUniform(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
 @tf_export(
@@ -1181,18 +1228,18 @@ class GlorotNormal(VarianceScaling):
   where `fan_in` is the number of input units in the weight tensor
   and `fan_out` is the number of output units in the weight tensor.
 
-  Reference: http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf
-
   Args:
     seed: A Python integer. Used to create random seeds. See
-      `tf.set_random_seed`
-      for behavior.
-    dtype: The data type. Only floating point types are supported.
+      `tf.set_random_seed` for behavior.
+    dtype: Default data type, used if no `dtype` argument is provided when
+      calling the initializer. Only floating point types are supported.
+
+  References:
+      [Glorot et al., 2010](http://proceedings.mlr.press/v9/glorot10a.html)
+      ([pdf](http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf))
   """
 
-  def __init__(self,
-               seed=None,
-               dtype=dtypes.float32):
+  def __init__(self, seed=None, dtype=dtypes.float32):
     super(GlorotNormal, self).__init__(
         scale=1.0,
         mode="fan_avg",
@@ -1201,10 +1248,7 @@ class GlorotNormal(VarianceScaling):
         dtype=dtype)
 
   def get_config(self):
-    return {
-        "seed": self.seed,
-        "dtype": self.dtype.name
-    }
+    return {"seed": self.seed, "dtype": self.dtype.name}
 
 
 # Aliases.
@@ -1244,9 +1288,11 @@ def lecun_normal(seed=None):
       An initializer.
 
   References:
-      - [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515)
-      - [Efficient
-      Backprop](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
   """
   return VarianceScaling(
       scale=1., mode="fan_in", distribution="truncated_normal", seed=seed)
@@ -1267,8 +1313,11 @@ def lecun_uniform(seed=None):
       An initializer.
 
   References:
-      LeCun 98, Efficient Backprop,
-      http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf
+      - Self-Normalizing Neural Networks,
+      [Klambauer et al., 2017](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks)
+      ([pdf](https://papers.nips.cc/paper/6698-self-normalizing-neural-networks.pdf))
+      - Efficient Backprop,
+      [Lecun et al., 1998](http://yann.lecun.com/exdb/publis/pdf/lecun-98b.pdf)
   """
   return VarianceScaling(
       scale=1., mode="fan_in", distribution="uniform", seed=seed)
@@ -1289,7 +1338,8 @@ def he_normal(seed=None):
       An initializer.
 
   References:
-      He et al., http://arxiv.org/abs/1502.01852
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="truncated_normal", seed=seed)
@@ -1310,7 +1360,8 @@ def he_uniform(seed=None):
       An initializer.
 
   References:
-      He et al., http://arxiv.org/abs/1502.01852
+      [He et al., 2015](https://www.cv-foundation.org/openaccess/content_iccv_2015/html/He_Delving_Deep_into_ICCV_2015_paper.html)
+      ([pdf](https://www.cv-foundation.org/openaccess/content_iccv_2015/papers/He_Delving_Deep_into_ICCV_2015_paper.pdf))
   """
   return VarianceScaling(
       scale=2., mode="fan_in", distribution="uniform", seed=seed)
diff --git a/tensorflow/python/ops/init_ops_test.py b/tensorflow/python/ops/init_ops_test.py
index 5693c3caaf5ca80fd6528c94bb952acc7bc8957c..1f22248004697438d2c8c05dc0c6762a20902d31 100644
--- a/tensorflow/python/ops/init_ops_test.py
+++ b/tensorflow/python/ops/init_ops_test.py
@@ -45,8 +45,8 @@ class InitializersTest(test.TestCase):
       output = variable.numpy()
     else:
       sess = ops.get_default_session()
-      sess.run(variable.initializer)
-      output = sess.run(variable)
+      self.evaluate(variable.initializer)
+      output = self.evaluate(variable)
     lim = 3e-2
     if target_std is not None:
       self.assertGreater(lim, abs(output.std() - target_std))
diff --git a/tensorflow/python/ops/linalg/BUILD b/tensorflow/python/ops/linalg/BUILD
index c7314d77749130e4696d58896249b73cc2de4a12..5df2d6b83816334f46ef45eec675ed9b7e35bd00 100644
--- a/tensorflow/python/ops/linalg/BUILD
+++ b/tensorflow/python/ops/linalg/BUILD
@@ -18,6 +18,7 @@ py_library(
         "//tensorflow/python:random_ops",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
+        "//tensorflow/python/ops/signal",
         "//third_party/py/numpy",
         "@six_archive//:six",
     ],
diff --git a/tensorflow/python/ops/linalg/cholesky_registrations.py b/tensorflow/python/ops/linalg/cholesky_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5284cf22ac2981f79c0d3c7a6a60635c9d0bf02
--- /dev/null
+++ b/tensorflow/python/ops/linalg/cholesky_registrations.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.cholesky."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops import linalg_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_block_diag
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_kronecker
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+
+
+# By default, compute the Cholesky of the dense matrix, and return a
+# LowerTriangular operator. Methods below specialize this registration.
+@linear_operator_algebra.RegisterCholesky(linear_operator.LinearOperator)
+def _cholesky_linear_operator(linop):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      linalg_ops.cholesky(linop.to_dense()),
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_diag.LinearOperatorDiag)
+def _cholesky_diag(diag_operator):
+  return linear_operator_diag.LinearOperatorDiag(
+      math_ops.sqrt(diag_operator.diag),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorIdentity)
+def _cholesky_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      batch_shape=identity_operator.batch_shape,
+      dtype=identity_operator.dtype,
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _cholesky_scaled_identity(identity_operator):
+  return linear_operator_identity.LinearOperatorScaledIdentity(
+      num_rows=identity_operator._num_rows,  # pylint: disable=protected-access
+      multiplier=math_ops.sqrt(identity_operator.multiplier),
+      is_non_singular=True,
+      is_self_adjoint=True,
+      is_positive_definite=True,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_block_diag.LinearOperatorBlockDiag)
+def _cholesky_block_diag(block_diag_operator):
+    # We take the cholesky of each block on the diagonal.
+  return linear_operator_block_diag.LinearOperatorBlockDiag(
+      operators=[
+          operator.cholesky() for operator in block_diag_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterCholesky(
+    linear_operator_kronecker.LinearOperatorKronecker)
+def _cholesky_kronecker(kronecker_operator):
+    # Cholesky decomposition of a Kronecker product is the Kronecker product
+    # of cholesky decompositions.
+  return linear_operator_kronecker.LinearOperatorKronecker(
+      operators=[
+          operator.cholesky() for operator in kronecker_operator.operators],
+      is_non_singular=True,
+      is_self_adjoint=False,
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg/linalg.py b/tensorflow/python/ops/linalg/linalg.py
index c29b5033bb137e8376e1c19985755b4fc72e8834..ac4fd4ebc6059a187828c757c852a470d8ee69a8 100644
--- a/tensorflow/python/ops/linalg/linalg.py
+++ b/tensorflow/python/ops/linalg/linalg.py
@@ -20,6 +20,9 @@ from __future__ import print_function
 
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import,unused-import
+from tensorflow.python.ops.linalg import cholesky_registrations as _cholesky_registrations
+from tensorflow.python.ops.linalg import linear_operator_algebra as _linear_operator_algebra
+from tensorflow.python.ops.linalg import matmul_registrations as _matmul_registrations
 from tensorflow.python.ops.linalg.linalg_impl import *
 from tensorflow.python.ops.linalg.linear_operator import *
 from tensorflow.python.ops.linalg.linear_operator_block_diag import *
diff --git a/tensorflow/python/ops/linalg/linalg_impl.py b/tensorflow/python/ops/linalg/linalg_impl.py
index 08d50ce622f68b3c116a2ccc3fa4d546a635e9c1..df2bd887cdde6f651db572c2bdfebd2bc0170716 100644
--- a/tensorflow/python/ops/linalg/linalg_impl.py
+++ b/tensorflow/python/ops/linalg/linalg_impl.py
@@ -44,6 +44,7 @@ einsum = special_math_ops.einsum
 eye = linalg_ops.eye
 inv = linalg_ops.matrix_inverse
 logm = gen_linalg_ops.matrix_logarithm
+lu = gen_linalg_ops.lu
 tf_export('linalg.logm')(logm)
 lstsq = linalg_ops.matrix_solve_ls
 norm = linalg_ops.norm
@@ -88,7 +89,7 @@ def logdet(matrix, name=None):
     chol = gen_linalg_ops.cholesky(matrix)
     return 2.0 * math_ops.reduce_sum(
         math_ops.log(math_ops.real(array_ops.matrix_diag_part(chol))),
-        reduction_indices=[-1])
+        axis=[-1])
 
 
 @tf_export('linalg.adjoint')
diff --git a/tensorflow/python/ops/linalg/linear_operator.py b/tensorflow/python/ops/linalg/linear_operator.py
index 9ef6c42b04c2e34d019c67faa4cf9e7568705b54..8efafda3a1e7424442163a76aca95d14af4b8a70 100644
--- a/tensorflow/python/ops/linalg/linear_operator.py
+++ b/tensorflow/python/ops/linalg/linear_operator.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import linalg_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator_algebra
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util.tf_export import tf_export
@@ -284,7 +285,7 @@ class LinearOperator(object):
     `[B1,...,Bb, M, N]`, equivalent to `tf.shape(A)`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -318,7 +319,7 @@ class LinearOperator(object):
     `[B1,...,Bb]`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`
@@ -340,7 +341,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       Python integer, or None if the tensor rank is undefined.
@@ -356,7 +357,7 @@ class LinearOperator(object):
     `A.shape = [B1,...,Bb, M, N]`, then this returns `b + 2`.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `int32` `Tensor`, determined at runtime.
@@ -581,16 +582,29 @@ class LinearOperator(object):
     ```
 
     Args:
-      x: `Tensor` with compatible shape and same `dtype` as `self`.
-        See class docstring for definition of compatibility.
+      x: `LinearOperator` or `Tensor` with compatible shape and same `dtype` as
+        `self`. See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
       adjoint_arg:  Python `bool`.  If `True`, compute `A x^H` where `x^H` is
         the hermitian transpose (transposition and complex conjugation).
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
-      A `Tensor` with shape `[..., M, R]` and same `dtype` as `self`.
+      A `LinearOperator` or `Tensor` with shape `[..., M, R]` and same `dtype`
+        as `self`.
     """
+    if isinstance(x, LinearOperator):
+      if adjoint or adjoint_arg:
+        raise ValueError(".matmul not supported with adjoints.")
+      if (x.range_dimension is not None and
+          self.domain_dimension is not None and
+          x.range_dimension != self.domain_dimension):
+        raise ValueError(
+            "Operators are incompatible. Expected `x` to have dimension"
+            " {} but got {}.".format(self.domain_dimension, x.range_dimension))
+      with self._name_scope(name):
+        return linear_operator_algebra.matmul(self, x)
+
     with self._name_scope(name, values=[x]):
       x = ops.convert_to_tensor(x, name="x")
       self._check_input_dtype(x)
@@ -630,7 +644,7 @@ class LinearOperator(object):
         dimensions, the last dimension defines a vector.
         See class docstring for definition of compatibility.
       adjoint: Python `bool`.  If `True`, left multiply by the adjoint: `A^H x`.
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       A `Tensor` with shape `[..., M]` and same `dtype` as `self`.
@@ -655,7 +669,7 @@ class LinearOperator(object):
     """Determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -676,7 +690,7 @@ class LinearOperator(object):
         "  Requires conversion to a dense matrix and O(N^3) operations.")
     if self._can_use_cholesky():
       diag = array_ops.matrix_diag_part(linalg_ops.cholesky(self.to_dense()))
-      return 2 * math_ops.reduce_sum(math_ops.log(diag), reduction_indices=[-1])
+      return 2 * math_ops.reduce_sum(math_ops.log(diag), axis=[-1])
     _, log_abs_det = linalg.slogdet(self.to_dense())
     return log_abs_det
 
@@ -684,7 +698,7 @@ class LinearOperator(object):
     """Log absolute value of determinant for every batch member.
 
     Args:
-      name:  A name for this `Op.
+      name:  A name for this `Op`.
 
     Returns:
       `Tensor` with shape `self.batch_shape` and same `dtype` as `self`.
@@ -830,6 +844,31 @@ class LinearOperator(object):
 
       return self._solvevec(rhs, adjoint=adjoint)
 
+  def cholesky(self, name="cholesky"):
+    """Returns a Cholesky factor as a `LinearOperator`.
+
+    Given `A` representing this `LinearOperator`, if `A` is positive definite
+    self-adjoint, return `L`, where `A = L L^T`, i.e. the cholesky
+    decomposition.
+
+    Args:
+      name:  A name for this `Op`.
+
+    Returns:
+      `LinearOperator` which represents the lower triangular matrix
+      in the Cholesky decomposition.
+
+    Raises:
+      ValueError: When the `LinearOperator` is not hinted to be positive
+        definite and self adjoint.
+    """
+
+    if not self._can_use_cholesky():
+      raise ValueError("Cannot take the Cholesky decomposition: "
+                       "Not a positive definite self adjoint matrix.")
+    with self._name_scope(name):
+      return linear_operator_algebra.cholesky(self)
+
   def _to_dense(self):
     """Generic and often inefficient implementation.  Override often."""
     logging.warn("Using (possibly slow) default implementation of to_dense."
@@ -922,6 +961,4 @@ class LinearOperator(object):
       return self._add_to_tensor(x)
 
   def _can_use_cholesky(self):
-    # TODO(langmore) Add complex types when tf.cholesky can use them.
-    return (not self.dtype.is_complex and self.is_self_adjoint and
-            self.is_positive_definite)
+    return self.is_self_adjoint and self.is_positive_definite
diff --git a/tensorflow/python/ops/linalg/linear_operator_adjoint.py b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
new file mode 100644
index 0000000000000000000000000000000000000000..858e224b9adda57b4d472ae2f61b2b6cda74c243
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_adjoint.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Takes the adjoint of a `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.linalg import linalg_impl as linalg
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorAdjoint")
+class LinearOperatorAdjoint(linear_operator.LinearOperator):
+  """`LinearOperator` representing the adjoint of another operator.
+
+  This operator represents the adjoint of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1 - i., 3.], [0., 1. + i]])
+  operator_adjoint = LinearOperatorAdjoint(operator)
+
+  operator_adjoint.to_dense()
+  ==> [[1. + i, 0.]
+       [3., 1 - i]]
+
+  operator_adjoint.shape
+  ==> [2, 2]
+
+  operator_adjoint.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_adjoint.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.matmul(x, adjoint=True)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorAdjoint` depends on the underlying
+  operators performance.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorAdjoint`.
+
+    `LinearOperatorAdjoint` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods effectively flip the `adjoint` argument.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorAdjoint(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x, adjoint=True) == B.matvec(x, adjoint=False)
+    ```
+
+    Args:
+      operator: `LinearOperator` object.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_adjoint"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its adjoint is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its adjoint is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its adjoint is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its adjoint is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_adjoint"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorAdjoint, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before taking the adjoint."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(
+        x, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    if self.is_self_adjoint:
+      return self.operator.determinant()
+    return math_ops.conj(self.operator.determinant())
+
+  def _log_abs_determinant(self):
+    return self.operator.log_abs_determinant()
+
+  def _trace(self):
+    if self.is_self_adjoint:
+      return self.operator.trace()
+    return math_ops.conj(self.operator.trace())
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(
+        rhs, adjoint=(not adjoint), adjoint_arg=adjoint_arg)
+
+  def _to_dense(self):
+    if self.is_self_adjoint:
+      return self.operator.to_dense()
+    return linalg.adjoint(self.operator.to_dense())
diff --git a/tensorflow/python/ops/linalg/linear_operator_algebra.py b/tensorflow/python/ops/linalg/linear_operator_algebra.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b99066e4c121ebd7546dfad1039c0dfa46bca11
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_algebra.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Registration mechanisms for various n-ary operations on LinearOperators."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.framework import ops
+from tensorflow.python.util import tf_inspect
+
+
+_CHOLESKY_DECOMPS = {}
+_MATMUL = {}
+
+
+def _registered_function(type_list, registry):
+  """Given a list of classes, finds the most specific function registered."""
+  enumerated_hierarchies = [enumerate(tf_inspect.getmro(t)) for t in type_list]
+  # Get all possible combinations of hierarchies.
+  cls_combinations = list(itertools.product(*enumerated_hierarchies))
+
+  def hierarchy_distance(cls_combination):
+    candidate_distance = sum(c[0] for c in cls_combination)
+    if tuple(c[1] for c in cls_combination) in registry:
+      return candidate_distance
+    return 10000
+
+  registered_combination = min(cls_combinations, key=hierarchy_distance)
+  return registry.get(tuple(r[1] for r in registered_combination), None)
+
+
+def _registered_cholesky(type_a):
+  """Get the Cholesky function registered for class a."""
+  return _registered_function([type_a], _CHOLESKY_DECOMPS)
+
+
+def _registered_matmul(type_a, type_b):
+  """Get the Matmul function registered for classes a and b."""
+  return _registered_function([type_a, type_b], _MATMUL)
+
+
+def cholesky(lin_op_a, name=None):
+  """Get the Cholesky factor associated to lin_op_a.
+
+  Args:
+    lin_op_a: The LinearOperator to decompose.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the lower Cholesky factor of `lin_op_a`.
+
+  Raises:
+    NotImplementedError: If no Cholesky method is defined for the LinearOperator
+      type of `lin_op_a`.
+  """
+  cholesky_fn = _registered_cholesky(type(lin_op_a))
+  if cholesky_fn is None:
+    raise ValueError("No cholesky decomposition registered for {}".format(
+        type(lin_op_a)))
+
+  with ops.name_scope(name, "Cholesky"):
+    return cholesky_fn(lin_op_a)
+
+
+def matmul(lin_op_a, lin_op_b, name=None):
+  """Compute lin_op_a.matmul(lin_op_b).
+
+  Args:
+    lin_op_a: The LinearOperator on the left.
+    lin_op_b: The LinearOperator on the right.
+    name: Name to use for this operation.
+
+  Returns:
+    A LinearOperator that represents the matmul between `lin_op_a` and
+      `lin_op_b`.
+
+  Raises:
+    NotImplementedError: If no matmul method is defined between types of
+      `lin_op_a` and `lin_op_b`.
+  """
+  matmul_fn = _registered_matmul(type(lin_op_a), type(lin_op_b))
+  if matmul_fn is None:
+    raise ValueError("No matmul registered for {}.matmul({})".format(
+        type(lin_op_a), type(lin_op_b)))
+
+  with ops.name_scope(name, "Matmul"):
+    return matmul_fn(lin_op_a, lin_op_b)
+
+
+class RegisterCholesky(object):
+  """Decorator to register a Cholesky implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterCholesky(lin_op.LinearOperatorIdentity)
+  def _cholesky_identity(lin_op_a):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to decompose.
+    """
+    self._key = (lin_op_cls_a,)
+
+  def __call__(self, cholesky_fn):
+    """Perform the Cholesky registration.
+
+    Args:
+      cholesky_fn: The function to use for the Cholesky.
+
+    Returns:
+      cholesky_fn
+
+    Raises:
+      TypeError: if cholesky_fn is not a callable.
+      ValueError: if a Cholesky function has already been registered for
+        the given argument classes.
+    """
+    if not callable(cholesky_fn):
+      raise TypeError(
+          "cholesky_fn must be callable, received: {}".format(cholesky_fn))
+    if self._key in _CHOLESKY_DECOMPS:
+      raise ValueError("Cholesky({}) has already been registered to: {}".format(
+          self._key[0].__name__, _CHOLESKY_DECOMPS[self._key]))
+    _CHOLESKY_DECOMPS[self._key] = cholesky_fn
+    return cholesky_fn
+
+
+class RegisterMatmul(object):
+  """Decorator to register a Matmul implementation function.
+
+  Usage:
+
+  @linear_operator_algebra.RegisterMatmul(
+    lin_op.LinearOperatorIdentity,
+    lin_op.LinearOperatorIdentity)
+  def _matmul_identity(a, b):
+    # Return the identity matrix.
+  """
+
+  def __init__(self, lin_op_cls_a, lin_op_cls_b):
+    """Initialize the LinearOperator registrar.
+
+    Args:
+      lin_op_cls_a: the class of the LinearOperator to multiply.
+      lin_op_cls_b: the class of the second LinearOperator to multiply.
+    """
+    self._key = (lin_op_cls_a, lin_op_cls_b)
+
+  def __call__(self, matmul_fn):
+    """Perform the Matmul registration.
+
+    Args:
+      matmul_fn: The function to use for the Matmul.
+
+    Returns:
+      matmul_fn
+
+    Raises:
+      TypeError: if matmul_fn is not a callable.
+      ValueError: if a Matmul function has already been registered for
+        the given argument classes.
+    """
+    if not callable(matmul_fn):
+      raise TypeError(
+          "matmul_fn must be callable, received: {}".format(matmul_fn))
+    if self._key in _MATMUL:
+      raise ValueError("Matmul({}, {}) has already been registered.".format(
+          self._key[0].__name__,
+          self._key[1].__name__))
+    _MATMUL[self._key] = matmul_fn
+    return matmul_fn
diff --git a/tensorflow/python/ops/linalg/linear_operator_block_diag.py b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
index 438c3496bdf4277e239c488d947ac743165179a5..b0b418c99706ad9468668d52e48e79f2add7552d 100644
--- a/tensorflow/python/ops/linalg/linear_operator_block_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_block_diag.py
@@ -29,9 +29,7 @@ from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorBlockDiag",
-]
+__all__ = ["LinearOperatorBlockDiag"]
 
 
 @tf_export("linalg.LinearOperatorBlockDiag")
diff --git a/tensorflow/python/ops/linalg/linear_operator_circulant.py b/tensorflow/python/ops/linalg/linear_operator_circulant.py
index 021ef47383673dd1ccd42e58d04631ef2f3b2e7a..b74baa5dfdb0a70f035ee5a2633ba571147aa5e6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_circulant.py
+++ b/tensorflow/python/ops/linalg/linear_operator_circulant.py
@@ -30,6 +30,7 @@ from tensorflow.python.ops.distributions import util as distribution_util
 from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.ops.linalg import linear_operator_util
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -39,8 +40,8 @@ __all__ = [
 ]
 
 # Different FFT Ops will be used for different block depths.
-_FFT_OP = {1: math_ops.fft, 2: math_ops.fft2d, 3: math_ops.fft3d}
-_IFFT_OP = {1: math_ops.ifft, 2: math_ops.ifft2d, 3: math_ops.ifft3d}
+_FFT_OP = {1: fft_ops.fft, 2: fft_ops.fft2d, 3: fft_ops.fft3d}
+_IFFT_OP = {1: fft_ops.ifft, 2: fft_ops.ifft2d, 3: fft_ops.ifft3d}
 
 # This is the only dtype allowed with fft ops.
 # TODO(langmore) Add other types once available.
@@ -417,15 +418,13 @@ class _BaseLinearOperatorCirculant(linear_operator.LinearOperator):
     return math_ops.cast(y, self.dtype)
 
   def _determinant(self):
-    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
-    det = math_ops.reduce_prod(
-        self.spectrum, reduction_indices=reduction_indices)
+    axis = [-(i + 1) for i in range(self.block_depth)]
+    det = math_ops.reduce_prod(self.spectrum, axis=axis)
     return math_ops.cast(det, self.dtype)
 
   def _log_abs_determinant(self):
-    reduction_indices = [-(i + 1) for i in range(self.block_depth)]
-    lad = math_ops.reduce_sum(
-        math_ops.log(self._abs_spectrum), reduction_indices=reduction_indices)
+    axis = [-(i + 1) for i in range(self.block_depth)]
+    lad = math_ops.reduce_sum(math_ops.log(self._abs_spectrum), axis=axis)
     return math_ops.cast(lad, self.dtype)
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
diff --git a/tensorflow/python/ops/linalg/linear_operator_composition.py b/tensorflow/python/ops/linalg/linear_operator_composition.py
index 0292bc51dcf9809941087dd4aa1ea4c760c064d1..f499b3066129bce83706a94d93d943422ccc1ffd 100644
--- a/tensorflow/python/ops/linalg/linear_operator_composition.py
+++ b/tensorflow/python/ops/linalg/linear_operator_composition.py
@@ -275,6 +275,3 @@ class LinearOperatorComposition(linear_operator.LinearOperator):
     for operator in solve_order_list[1:]:
       solution = operator.solve(solution, adjoint=adjoint)
     return solution
-
-  def _add_to_tensor(self, x):
-    return self.to_dense() + x
diff --git a/tensorflow/python/ops/linalg/linear_operator_diag.py b/tensorflow/python/ops/linalg/linear_operator_diag.py
index ed53decc00dc90df5c6c97d9fd9d5cb124ddf660..be893c705c970bcf100a686d64171806e2d9ace6 100644
--- a/tensorflow/python/ops/linalg/linear_operator_diag.py
+++ b/tensorflow/python/ops/linalg/linear_operator_diag.py
@@ -228,11 +228,11 @@ class LinearOperatorDiag(linear_operator.LinearOperator):
     return diag_mat * x
 
   def _determinant(self):
-    return math_ops.reduce_prod(self._diag, reduction_indices=[-1])
+    return math_ops.reduce_prod(self._diag, axis=[-1])
 
   def _log_abs_determinant(self):
     log_det = math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
+        math_ops.log(math_ops.abs(self._diag)), axis=[-1])
     if self.dtype.is_complex:
       log_det = math_ops.cast(log_det, dtype=self.dtype)
     return log_det
diff --git a/tensorflow/python/ops/linalg/linear_operator_inversion.py b/tensorflow/python/ops/linalg/linear_operator_inversion.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aa4b40e16bd82941357e394101a0a9d55c7a7fe
--- /dev/null
+++ b/tensorflow/python/ops/linalg/linear_operator_inversion.py
@@ -0,0 +1,207 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Inverts a non-singular `LinearOperator`."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.util.tf_export import tf_export
+
+__all__ = []
+
+
+@tf_export("linalg.LinearOperatorInversion")
+class LinearOperatorInversion(linear_operator.LinearOperator):
+  """`LinearOperator` representing the inverse of another operator.
+
+  This operator represents the inverse of another operator.
+
+  ```python
+  # Create a 2 x 2 linear operator.
+  operator = LinearOperatorFullMatrix([[1., 0.], [0., 2.]])
+  operator_inv = LinearOperatorInversion(operator)
+
+  operator_inv.to_dense()
+  ==> [[1., 0.]
+       [0., 0.5]]
+
+  operator_inv.shape
+  ==> [2, 2]
+
+  operator_inv.log_abs_determinant()
+  ==> - log(2)
+
+  x = ... Shape [2, 4] Tensor
+  operator_inv.matmul(x)
+  ==> Shape [2, 4] Tensor, equal to operator.solve(x)
+  ```
+
+  #### Performance
+
+  The performance of `LinearOperatorInversion` depends on the underlying
+  operators performance:  `solve` and `matmul` are swapped, and determinant is
+  inverted.
+
+  #### Matrix property hints
+
+  This `LinearOperator` is initialized with boolean flags of the form `is_X`,
+  for `X = non_singular, self_adjoint, positive_definite, square`.
+  These have the following meaning:
+
+  * If `is_X == True`, callers should expect the operator to have the
+    property `X`.  This is a promise that should be fulfilled, but is *not* a
+    runtime assert.  For example, finite floating point precision may result
+    in these promises being violated.
+  * If `is_X == False`, callers should expect the operator to not have `X`.
+  * If `is_X == None` (the default), callers should have no expectation either
+    way.
+  """
+
+  def __init__(self,
+               operator,
+               is_non_singular=None,
+               is_self_adjoint=None,
+               is_positive_definite=None,
+               is_square=None,
+               name=None):
+    r"""Initialize a `LinearOperatorInversion`.
+
+    `LinearOperatorInversion` is initialized with an operator `A`.  The `solve`
+    and `matmul` methods are effectively swapped.  E.g.
+
+    ```
+    A = MyLinearOperator(...)
+    B = LinearOperatorInversion(A)
+    x = [....]  # a vector
+
+    assert A.matvec(x) == B.solvevec(x)
+    ```
+
+    Args:
+      operator: `LinearOperator` object. If `operator.is_non_singular == False`,
+        an exception is raised.  We do allow `operator.is_non_singular == None`,
+        in which case this operator will have `is_non_singular == None`.
+        Similarly for `is_self_adjoint` and `is_positive_definite`.
+      is_non_singular:  Expect that this operator is non-singular.
+      is_self_adjoint:  Expect that this operator is equal to its hermitian
+        transpose.
+      is_positive_definite:  Expect that this operator is positive definite,
+        meaning the quadratic form `x^H A x` has positive real part for all
+        nonzero `x`.  Note that we do not require the operator to be
+        self-adjoint to be positive-definite.  See:
+        https://en.wikipedia.org/wiki/Positive-definite_matrix#Extension_for_non-symmetric_matrices
+      is_square:  Expect that this operator acts like square [batch] matrices.
+      name: A name for this `LinearOperator`. Default is `operator.name +
+        "_inv"`.
+
+    Raises:
+      ValueError:  If `operator.is_non_singular` is False.
+    """
+
+    self._operator = operator
+
+    # Auto-set and check hints.
+    if operator.is_non_singular is False or is_non_singular is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_non_singular` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_non_singular,
+                                               is_non_singular))
+    if operator.is_square is False or is_square is False:
+      raise ValueError(
+          "operator and supplied hints must have `is_square` equal to "
+          "`True` or `None`.  Found %s, %s" % (operator.is_square, is_square))
+
+    # The congruency of is_non_singular and is_self_adjoint was checked in the
+    # base operator.  Other hints are, in this special case of inversion, ones
+    # that must be the same for base/derived operator.
+    def _combined_hint(hint_str, provided_hint_value, message):
+      """Get combined hint in the case where operator.hint should equal hint."""
+      op_hint = getattr(operator, hint_str)
+      if op_hint is False and provided_hint_value:
+        raise ValueError(message)
+      if op_hint and provided_hint_value is False:
+        raise ValueError(message)
+      return (op_hint or provided_hint_value) or None
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    is_non_singular = _combined_hint(
+        "is_non_singular", is_non_singular,
+        "An operator is non-singular if and only if its inverse is "
+        "non-singular.")
+
+    is_self_adjoint = _combined_hint(
+        "is_self_adjoint", is_self_adjoint,
+        "An operator is self-adjoint if and only if its inverse is "
+        "self-adjoint.")
+
+    is_positive_definite = _combined_hint(
+        "is_positive_definite", is_positive_definite,
+        "An operator is positive-definite if and only if its inverse is "
+        "positive-definite.")
+
+    is_square = _combined_hint(
+        "is_square", is_square,
+        "An operator is square if and only if its inverse is square.")
+
+    # Initialization.
+    if name is None:
+      name = operator.name + "_inv"
+    with ops.name_scope(name, values=operator.graph_parents):
+      super(LinearOperatorInversion, self).__init__(
+          dtype=operator.dtype,
+          graph_parents=operator.graph_parents,
+          is_non_singular=is_non_singular,
+          is_self_adjoint=is_self_adjoint,
+          is_positive_definite=is_positive_definite,
+          is_square=is_square,
+          name=name)
+
+  @property
+  def operator(self):
+    """The operator before inversion."""
+    return self._operator
+
+  def _assert_non_singular(self):
+    return self.operator.assert_non_singular()
+
+  def _assert_positive_definite(self):
+    return self.operator.assert_positive_definite()
+
+  def _assert_self_adjoint(self):
+    return self.operator.assert_self_adjoint()
+
+  def _shape(self):
+    return self.operator.shape
+
+  def _shape_tensor(self):
+    return self.operator.shape_tensor()
+
+  def _matmul(self, x, adjoint=False, adjoint_arg=False):
+    return self.operator.solve(x, adjoint=adjoint, adjoint_arg=adjoint_arg)
+
+  def _determinant(self):
+    return 1. / self.operator.determinant()
+
+  def _log_abs_determinant(self):
+    return -1. * self.operator.log_abs_determinant()
+
+  def _solve(self, rhs, adjoint=False, adjoint_arg=False):
+    return self.operator.matmul(rhs, adjoint=adjoint, adjoint_arg=adjoint_arg)
diff --git a/tensorflow/python/ops/linalg/linear_operator_kronecker.py b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
index 1fd5073c17832f0689616f2842c33c95d186e487..f7e785caa5d8cc290f037944378f709633423a74 100644
--- a/tensorflow/python/ops/linalg/linear_operator_kronecker.py
+++ b/tensorflow/python/ops/linalg/linear_operator_kronecker.py
@@ -30,9 +30,7 @@ from tensorflow.python.ops.linalg import linalg_impl as linalg
 from tensorflow.python.ops.linalg import linear_operator
 from tensorflow.python.util.tf_export import tf_export
 
-__all__ = [
-    "LinearOperatorKronecker",
-]
+__all__ = ["LinearOperatorKronecker"]
 
 
 def _vec(x):
diff --git a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
index c4288ff8f87c5c152dd590be0a22b2e3c511a8d9..aa0500aff06e0c9eddf7a3059ebf9480b670ca9d 100644
--- a/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
+++ b/tensorflow/python/ops/linalg/linear_operator_low_rank_update.py
@@ -391,7 +391,7 @@ class LinearOperatorLowRankUpdate(linear_operator.LinearOperator):
     if self._use_cholesky:
       chol_cap_diag = array_ops.matrix_diag_part(self._chol_capacitance)
       log_abs_det_c = 2 * math_ops.reduce_sum(
-          math_ops.log(chol_cap_diag), reduction_indices=[-1])
+          math_ops.log(chol_cap_diag), axis=[-1])
     else:
       det_c = linalg_ops.matrix_determinant(self._capacitance)
       log_abs_det_c = math_ops.log(math_ops.abs(det_c))
diff --git a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
index ca6d3f54051d7bf0ff748804d3cd314b144c2f88..d33fe17e042bfc53ab2f53aa6f79ee5dfa24c4a2 100644
--- a/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
+++ b/tensorflow/python/ops/linalg/linear_operator_lower_triangular.py
@@ -195,11 +195,11 @@ class LinearOperatorLowerTriangular(linear_operator.LinearOperator):
         self._tril, x, adjoint_a=adjoint, adjoint_b=adjoint_arg)
 
   def _determinant(self):
-    return math_ops.reduce_prod(self._diag, reduction_indices=[-1])
+    return math_ops.reduce_prod(self._diag, axis=[-1])
 
   def _log_abs_determinant(self):
     return math_ops.reduce_sum(
-        math_ops.log(math_ops.abs(self._diag)), reduction_indices=[-1])
+        math_ops.log(math_ops.abs(self._diag)), axis=[-1])
 
   def _solve(self, rhs, adjoint=False, adjoint_arg=False):
     rhs = linalg.adjoint(rhs) if adjoint_arg else rhs
diff --git a/tensorflow/python/ops/linalg/linear_operator_test_util.py b/tensorflow/python/ops/linalg/linear_operator_test_util.py
index 76d659f1097579a9b5c92a90938f71b90268503f..e50f572b5f431ae8b7cf3470ee799f170e83656c 100644
--- a/tensorflow/python/ops/linalg/linear_operator_test_util.py
+++ b/tensorflow/python/ops/linalg/linear_operator_test_util.py
@@ -102,7 +102,9 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     raise NotImplementedError("operator_build_infos has not been implemented.")
 
   @abc.abstractmethod
-  def _operator_and_matrix(self, build_info, dtype, use_placeholder):
+  def _operator_and_matrix(
+      self, build_info, dtype, use_placeholder,
+      ensure_self_adjoint_and_pd=False):
     """Build a batch matrix and an Operator that should have similar behavior.
 
     Every operator acts like a (batch) matrix.  This method returns both
@@ -114,6 +116,11 @@ class LinearOperatorDerivedClassTest(test.TestCase):
       dtype:  Numpy dtype.  Data type of returned array/operator.
       use_placeholder:  Python bool.  If True, initialize the operator with a
         placeholder of undefined shape and correct dtype.
+      ensure_self_adjoint_and_pd: If `True`,
+        construct this operator to be Hermitian Positive Definite, as well
+        as ensuring the hints `is_positive_definite` and `is_self_adjoint`
+        are set.
+        This is useful for testing methods such as `cholesky`.
 
     Returns:
       operator:  `LinearOperator` subclass instance.
@@ -271,6 +278,21 @@ class LinearOperatorDerivedClassTest(test.TestCase):
     self._skip_if_tests_to_skip_contains("matmul_with_broadcast")
     self._test_matmul(with_batch=False)
 
+  def test_cholesky(self):
+    self._skip_if_tests_to_skip_contains("cholesky")
+    for use_placeholder in self._use_placeholder_options:
+      for build_info in self._operator_build_infos:
+        for dtype in self._dtypes_to_test:
+          with self.test_session(graph=ops.Graph()) as sess:
+            sess.graph.seed = random_seed.DEFAULT_GRAPH_SEED
+            operator, mat = self._operator_and_matrix(
+                build_info, dtype, use_placeholder=use_placeholder,
+                ensure_self_adjoint_and_pd=True)
+            op_chol = operator.cholesky().to_dense()
+            mat_chol = linalg_ops.cholesky(mat)
+            op_chol_v, mat_chol_v = sess.run([op_chol, mat_chol])
+            self.assertAC(mat_chol_v, op_chol_v)
+
   def _test_solve(self, with_batch):
     for use_placeholder in self._use_placeholder_options:
       for build_info in self._operator_build_infos:
@@ -441,7 +463,7 @@ class NonSquareLinearOperatorDerivedClassTest(LinearOperatorDerivedClassTest):
   @property
   def _tests_to_skip(self):
     """List of test names to skip."""
-    return ["solve", "solve_with_broadcast", "det", "log_abs_det"]
+    return ["cholesky", "solve", "solve_with_broadcast", "det", "log_abs_det"]
 
   @property
   def _operator_build_infos(self):
diff --git a/tensorflow/python/ops/linalg/matmul_registrations.py b/tensorflow/python/ops/linalg/matmul_registrations.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0ac988ba274dd99b03733eff38b07055d68543b
--- /dev/null
+++ b/tensorflow/python/ops/linalg/matmul_registrations.py
@@ -0,0 +1,252 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Registrations for LinearOperator.matmul."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.ops.linalg import linear_operator
+from tensorflow.python.ops.linalg import linear_operator_algebra
+from tensorflow.python.ops.linalg import linear_operator_circulant
+from tensorflow.python.ops.linalg import linear_operator_composition
+from tensorflow.python.ops.linalg import linear_operator_diag
+from tensorflow.python.ops.linalg import linear_operator_identity
+from tensorflow.python.ops.linalg import linear_operator_lower_triangular
+from tensorflow.python.ops.linalg import linear_operator_zeros
+
+
+def _combined_self_adjoint_hint(operator_a, operator_b):
+  """Get combined hint for self-adjoint-ness."""
+  # Note: only use this method in the commuting case.
+  # The property is preserved under composition when the operators commute.
+  if operator_a.is_self_adjoint and operator_b.is_self_adjoint:
+    return True
+
+  # The property is not preserved when an operator with the property is composed
+  # with an operator without the property.
+  if ((operator_a.is_self_adjoint is True and
+       operator_b.is_self_adjoint is False) or
+      (operator_a.is_self_adjoint is False and
+       operator_b.is_self_adjoint is True)):
+    return False
+
+  # The property is not known when operators are not known to have the property
+  # or both operators don't have the property (the property for the complement
+  # class is not closed under composition).
+  return None
+
+
+def _is_square(operator_a, operator_b):
+  """Return a hint to whether the composition is square."""
+  if operator_a.is_square and operator_b.is_square:
+    return True
+  if operator_a.is_square is False and operator_b.is_square is False:
+    # Let A have shape [B, M, N], B have shape [B, N, L].
+    m = operator_a.range_dimension
+    l = operator_b.domain_dimension
+    if m is not None and l is not None:
+      return m == l
+
+    return None
+
+
+def _combined_positive_definite_hint(operator_a, operator_b):
+  """Get combined PD hint for compositions."""
+  # Note: Positive definiteness is only guaranteed to be preserved
+  # when the operators commute and are symmetric. Only use this method in
+  # commuting cases.
+
+  if (operator_a.is_positive_definite is True and
+      operator_a.is_self_adjoint is True and
+      operator_b.is_positive_definite is True and
+      operator_b.is_self_adjoint is True):
+    return True
+
+  return None
+
+
+def _combined_non_singular_hint(operator_a, operator_b):
+  """Get combined hint for when ."""
+  # If either operator is not-invertible the composition isn't.
+  if (operator_a.is_non_singular is False or
+      operator_b.is_non_singular is False):
+    return False
+
+  return operator_a.is_non_singular and operator_b.is_non_singular
+
+
+# By default, use a LinearOperatorComposition to delay the computation.
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator, linear_operator.LinearOperator)
+def _matmul_linear_operator(linop_a, linop_b):
+  """Generic matmul of two `LinearOperator`s."""
+  is_square = _is_square(linop_a, linop_b)
+  is_non_singular = None
+  is_self_adjoint = None
+  is_positive_definite = None
+
+  if is_square:
+    is_non_singular = _combined_non_singular_hint(linop_a, linop_b)
+    is_self_adjoint = _combined_self_adjoint_hint(linop_a, linop_b)
+  elif is_square is False:
+    is_non_singular = False
+    is_self_adjoint = False
+    is_positive_definite = False
+
+  return linear_operator_composition.LinearOperatorComposition(
+      operators=[linop_a, linop_b],
+      is_non_singular=is_non_singular,
+      is_self_adjoint=is_self_adjoint,
+      is_positive_definite=is_positive_definite,
+      is_square=is_square,
+  )
+
+# Identity
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorIdentity,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_identity_left(identity, linop):
+  del identity
+  return linop
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_identity.LinearOperatorIdentity)
+def _matmul_linear_operator_identity_right(linop, identity):
+  del identity
+  return linop
+
+
+# Zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator.LinearOperator,
+    linear_operator_zeros.LinearOperatorZeros)
+def _matmul_linear_operator_zeros_right(linop, zeros):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_zeros.LinearOperatorZeros,
+    linear_operator.LinearOperator)
+def _matmul_linear_operator_zeros_left(zeros, linop):
+  if not zeros.is_square or not linop.is_square:
+    raise ValueError("Matmul with non-square `LinearOperator`s or non-square "
+                     "`LinearOperatorZeros` not supported at this time.")
+  return zeros
+
+
+# Diag.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag(linop_a, linop_b):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_a.diag * linop_b.diag,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_identity.LinearOperatorScaledIdentity)
+def _matmul_linear_operator_diag_scaled_identity_right(
+    linop_diag, linop_scaled_identity):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_identity.LinearOperatorScaledIdentity,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_diag_scaled_identity_left(
+    linop_scaled_identity, linop_diag):
+  return linear_operator_diag.LinearOperatorDiag(
+      diag=linop_diag.diag * linop_scaled_identity.multiplier,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_scaled_identity),
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_scaled_identity),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_diag, linop_scaled_identity),
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_diag.LinearOperatorDiag,
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular)
+def _matmul_linear_operator_diag_tril(linop_diag, linop_triangular):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_diag.diag[..., None] * linop_triangular.to_dense(),
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_lower_triangular.LinearOperatorLowerTriangular,
+    linear_operator_diag.LinearOperatorDiag)
+def _matmul_linear_operator_tril_diag(linop_triangular, linop_diag):
+  return linear_operator_lower_triangular.LinearOperatorLowerTriangular(
+      tril=linop_triangular.to_dense() * linop_diag.diag,
+      is_non_singular=_combined_non_singular_hint(
+          linop_diag, linop_triangular),
+      # This is safe to do since the Triangular matrix is only self-adjoint
+      # when it is a diagonal matrix, and hence commutes.
+      is_self_adjoint=_combined_self_adjoint_hint(
+          linop_diag, linop_triangular),
+      is_positive_definite=None,
+      is_square=True)
+
+# Circulant.
+
+
+@linear_operator_algebra.RegisterMatmul(
+    linear_operator_circulant.LinearOperatorCirculant,
+    linear_operator_circulant.LinearOperatorCirculant)
+def _matmul_linear_operator_circulant_circulant(linop_a, linop_b):
+  return linear_operator_circulant.LinearOperatorCirculant(
+      spectrum=linop_a.spectrum * linop_b.spectrum,
+      is_non_singular=_combined_non_singular_hint(linop_a, linop_b),
+      is_self_adjoint=_combined_self_adjoint_hint(linop_a, linop_b),
+      is_positive_definite=_combined_positive_definite_hint(
+          linop_a, linop_b),
+      is_square=True)
diff --git a/tensorflow/python/ops/linalg_ops.py b/tensorflow/python/ops/linalg_ops.py
index bbccc7e0369886a0d6bc5eac139c09b8f399d366..1a9e7112b45cacb711ac176b92cb3bef0dc72f00 100644
--- a/tensorflow/python/ops/linalg_ops.py
+++ b/tensorflow/python/ops/linalg_ops.py
@@ -423,7 +423,78 @@ def svd(tensor, full_matrices=False, compute_uv=True, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export('norm', 'linalg.norm')
+@tf_export('norm', 'linalg.norm', v1=[])
+def norm_v2(tensor,
+            ord='euclidean',
+            axis=None,
+            keepdims=None,
+            name=None):
+  r"""Computes the norm of vectors, matrices, and tensors.
+
+  This function can compute several different vector norms (the 1-norm, the
+  Euclidean or 2-norm, the inf-norm, and in general the p-norm for p > 0) and
+  matrix norms (Frobenius, 1-norm, 2-norm and inf-norm).
+
+  Args:
+    tensor: `Tensor` of types `float32`, `float64`, `complex64`, `complex128`
+    ord: Order of the norm. Supported values are 'fro', 'euclidean',
+      `1`, `2`, `np.inf` and any positive real number yielding the corresponding
+      p-norm. Default is 'euclidean' which is equivalent to Frobenius norm if
+      `tensor` is a matrix and equivalent to 2-norm for vectors.
+      Some restrictions apply:
+        a) The Frobenius norm `fro` is not defined for vectors,
+        b) If axis is a 2-tuple (matrix norm), only 'euclidean', 'fro', `1`,
+           `2`, `np.inf` are supported.
+      See the description of `axis` on how to compute norms for a batch of
+      vectors or matrices stored in a tensor.
+    axis: If `axis` is `None` (the default), the input is considered a vector
+      and a single vector norm is computed over the entire set of values in the
+      tensor, i.e. `norm(tensor, ord=ord)` is equivalent to
+      `norm(reshape(tensor, [-1]), ord=ord)`.
+      If `axis` is a Python integer, the input is considered a batch of vectors,
+      and `axis` determines the axis in `tensor` over which to compute vector
+      norms.
+      If `axis` is a 2-tuple of Python integers it is considered a batch of
+      matrices and `axis` determines the axes in `tensor` over which to compute
+      a matrix norm.
+      Negative indices are supported. Example: If you are passing a tensor that
+      can be either a matrix or a batch of matrices at runtime, pass
+      `axis=[-2,-1]` instead of `axis=None` to make sure that matrix norms are
+      computed.
+    keepdims: If True, the axis indicated in `axis` are kept with size 1.
+      Otherwise, the dimensions in `axis` are removed from the output shape.
+    name: The name of the op.
+
+  Returns:
+    output: A `Tensor` of the same type as tensor, containing the vector or
+      matrix norms. If `keepdims` is True then the rank of output is equal to
+      the rank of `tensor`. Otherwise, if `axis` is none the output is a scalar,
+      if `axis` is an integer, the rank of `output` is one less than the rank
+      of `tensor`, if `axis` is a 2-tuple the rank of `output` is two less
+      than the rank of `tensor`.
+
+  Raises:
+    ValueError: If `ord` or `axis` is invalid.
+
+  @compatibility(numpy)
+  Mostly equivalent to numpy.linalg.norm.
+  Not supported: ord <= 0, 2-norm for matrices, nuclear norm.
+  Other differences:
+    a) If axis is `None`, treats the flattened `tensor` as a vector
+     regardless of rank.
+    b) Explicitly supports 'euclidean' norm as the default, including for
+     higher order tensors.
+  @end_compatibility
+  """
+  return norm(tensor=tensor,
+              ord=ord,
+              axis=axis,
+              keepdims=keepdims,
+              name=name)
+
+
+# pylint: disable=redefined-builtin
+@tf_export(v1=['norm', 'linalg.norm'])
 @deprecation.deprecated_args(
     None, 'keep_dims is deprecated, use keepdims instead', 'keep_dims')
 def norm(tensor,
diff --git a/tensorflow/python/ops/list_ops.py b/tensorflow/python/ops/list_ops.py
index b4a1fc6af613da7f6ae3047ba43d1afee16a19a4..dbaae886d43e46ac193d1e7f28a6367192d2a640 100644
--- a/tensorflow/python/ops/list_ops.py
+++ b/tensorflow/python/ops/list_ops.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_list_ops
 # go/tf-wildcard-import
@@ -29,7 +30,9 @@ from tensorflow.python.ops.gen_list_ops import *
 # pylint: enable=wildcard-import
 
 
-ops.NotDifferentiable("TensorListConcat")
+ops.NotDifferentiable("TensorListConcatLists")
+ops.NotDifferentiable("TensorListElementShape")
+ops.NotDifferentiable("TensorListLength")
 ops.NotDifferentiable("TensorListPushBackBatch")
 
 
@@ -41,12 +44,42 @@ def empty_tensor_list(element_shape,
     max_num_elements = -1
 
   return gen_list_ops.empty_tensor_list(
-      element_shape=element_shape,
+      element_shape=_build_element_shape(element_shape),
       element_dtype=element_dtype,
       max_num_elements=max_num_elements,
       name=name)
 
 
+def tensor_list_reserve(element_shape, num_elements, element_dtype, name=None):
+  return gen_list_ops.tensor_list_reserve(
+      element_shape=_build_element_shape(element_shape),
+      num_elements=num_elements,
+      element_dtype=element_dtype,
+      name=name)
+
+
+def tensor_list_from_tensor(tensor, element_shape, name=None):
+  return gen_list_ops.tensor_list_from_tensor(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      name=name)
+
+
+def tensor_list_concat(input_handle, element_dtype, name=None):
+  # Ignore the lengths output of TensorListConcat. It is only used during
+  # gradient computation.
+  return gen_list_ops.tensor_list_concat(
+      input_handle=input_handle, element_dtype=element_dtype, name=name)[0]
+
+
+def tensor_list_split(tensor, element_shape, lengths, name=None):
+  return gen_list_ops.tensor_list_split(
+      tensor=tensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=lengths,
+      name=name)
+
+
 @ops.RegisterGradient("TensorListPushBack")
 def _PushBackGrad(op, dresult):
   return gen_list_ops.tensor_list_pop_back(
@@ -65,14 +98,32 @@ def _PopBackGrad(op, dlist, delement):
 
 @ops.RegisterGradient("TensorListStack")
 def _TensorListStackGrad(unused_op, dtensor):
-  return gen_list_ops.tensor_list_from_tensor(dtensor,
-                                              element_shape=dtensor.shape[1:])
+  return tensor_list_from_tensor(dtensor, element_shape=dtensor.shape[1:])
+
+
+@ops.RegisterGradient("TensorListConcat")
+def _TensorListConcatGrad(op, dtensor, unused_dlengths):
+  # TODO(srbs): We lose the element_shape information in tensor_list_concat.
+  # Consider providing that as an output of TensorListConcat?
+  if dtensor.shape.rank is None:
+    element_shape = None
+  else:
+    element_shape = [None] + dtensor.shape.as_list()[1:]
+  return tensor_list_split(
+      dtensor,
+      element_shape=_build_element_shape(element_shape),
+      lengths=op.outputs[1])
+
+
+@ops.RegisterGradient("TensorListSplit")
+def _TensorListSplitGrad(op, dlist):
+  return tensor_list_concat(dlist, element_dtype=op.inputs[0].dtype), None, None
 
 
 @ops.RegisterGradient("TensorListFromTensor")
 def _TensorListFromTensorGrad(op, dlist):
   """Gradient for TensorListFromTensor."""
-  if op.inputs[0].shape.dims[0].value is not None:
+  if op.inputs[0].shape.dims and op.inputs[0].shape.dims[0].value is not None:
     num_elements = op.inputs[0].shape.dims[0].value
   else:
     num_elements = None
@@ -126,3 +177,40 @@ def _TensorListScatterGrad(op, dlist):
   t, indices, _ = op.inputs
   return gen_list_ops.tensor_list_gather(
       dlist, indices, element_dtype=t.dtype), None
+
+
+def _build_element_shape(shape):
+  """Converts shape to a format understood by list_ops for element_shape.
+
+  If `shape` is already a `Tensor` it is returned as-is. We do not perform a
+  type check here.
+
+  If shape is None or a TensorShape with unknown rank, -1 is returned.
+
+  If shape is a scalar, an int32 tensor with empty list is returned. Note we
+  do directly return an empty list since ops.convert_to_tensor would conver it
+  to a float32 which is not a valid type for element_shape.
+
+  If shape is a sequence of dims, None's in the list are replaced with -1. We
+  do not check the dtype of the other dims.
+
+  Args:
+    shape: Could be None, Tensor, TensorShape or a list of dims (each dim could
+      be a None, scalar or Tensor).
+
+  Returns:
+    A None-free shape that can be converted to a tensor.
+  """
+  if isinstance(shape, ops.Tensor):
+    return shape
+  if isinstance(shape, tensor_shape.TensorShape):
+    # `TensorShape.as_list` requires rank to be known.
+    shape = shape.as_list() if shape else None
+  # Shape is unknown.
+  if shape is None:
+    return -1
+  # Shape is a scalar.
+  if not shape:
+    return ops.convert_to_tensor(shape, dtype=dtypes.int32)
+  # Shape is a sequence of dimensions. Convert None dims to -1.
+  return [d if d is not None else -1 for d in shape]
diff --git a/tensorflow/python/ops/lookup_ops.py b/tensorflow/python/ops/lookup_ops.py
index 397d56ef40936c02d879c719027ceb5cfd10d93a..758cb8041da63956c7a451e2030b9e9d98016f42 100644
--- a/tensorflow/python/ops/lookup_ops.py
+++ b/tensorflow/python/ops/lookup_ops.py
@@ -39,6 +39,7 @@ from tensorflow.python.ops import string_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_lookup_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.training.checkpointable import base as checkpointable_base
 from tensorflow.python.training.checkpointable import tracking as checkpointable
 from tensorflow.python.util import compat
 from tensorflow.python.util.deprecation import deprecated
@@ -160,7 +161,9 @@ class InitializableLookupTableBase(LookupInterface):
     self._default_value = ops.convert_to_tensor(
         default_value, dtype=self._value_dtype)
     self._default_value.get_shape().merge_with(tensor_shape.scalar())
-    self._initializer = initializer
+    if isinstance(initializer, checkpointable_base.CheckpointableBase):
+      self._initializer = self._track_checkpointable(
+          initializer, "_initializer")
     self._resource_handle = self.create_resource()
     self._init_op = self.initialize()
 
@@ -309,7 +312,7 @@ class HashTable(InitializableLookupTableBase):
     return exported_keys, exported_values
 
 
-class TableInitializerBase(object):
+class TableInitializerBase(checkpointable_base.CheckpointableBase):
   """Base class for lookup table initializers."""
 
   def __init__(self, key_dtype, value_dtype):
@@ -522,12 +525,14 @@ class TextFileInitializer(TableInitializerBase):
     if (vocab_size is not None) and (vocab_size <= 0):
       raise ValueError("Invalid vocab_size %s." % vocab_size)
 
-    self._filename = filename
     self._key_index = key_index
     self._value_index = value_index
     self._vocab_size = vocab_size
     self._delimiter = delimiter
     self._name = name
+    self._filename = self._track_checkpointable(
+        checkpointable.TrackableAsset(filename),
+        "_filename")
 
     super(TextFileInitializer, self).__init__(key_dtype, value_dtype)
 
diff --git a/tensorflow/python/ops/losses/losses_impl.py b/tensorflow/python/ops/losses/losses_impl.py
index 53c09ee8ddf22420b768ec58056933088d9e7881..20397612bca9a9b81d9816ac1626ce15024d45f6 100644
--- a/tensorflow/python/ops/losses/losses_impl.py
+++ b/tensorflow/python/ops/losses/losses_impl.py
@@ -18,7 +18,6 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.compat import compat
 from tensorflow.python.eager import context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -34,28 +33,50 @@ from tensorflow.python.util.deprecation import deprecated_argument_lookup
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("losses.Reduction")
-class Reduction(object):
+@tf_export("losses.Reduction", "keras.losses.Reduction", v1=[])
+class ReductionV2(object):
   """Types of loss reduction.
 
   Contains the following values:
-  `NONE`: Un-reduced weighted losses with the same shape as input.
-  `SUM`: Scalar sum of weighted losses.
-  `MEAN`: Scalar `SUM` divided by sum of weights.
-  `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
-  `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
-     weights.
-  `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
   """
 
   NONE = "none"
+  SUM = "sum"
+  SUM_OVER_BATCH_SIZE = "sum_over_batch_size"
 
-  SUM = "weighted_sum"
+  @classmethod
+  def all(cls):
+    return (cls.NONE, cls.SUM, cls.SUM_OVER_BATCH_SIZE)
 
-  MEAN = "weighted_mean"
+  @classmethod
+  def validate(cls, key):
+    if key not in cls.all():
+      raise ValueError("Invalid Reduction Key %s." % key)
 
-  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
 
+@tf_export(v1=["losses.Reduction"])
+class Reduction(object):
+  """Types of loss reduction.
+
+  Contains the following values:
+
+  * `NONE`: Un-reduced weighted losses with the same shape as input.
+  * `SUM`: Scalar sum of weighted losses.
+  * `MEAN`: Scalar `SUM` divided by sum of weights. DEPRECATED.
+  * `SUM_OVER_BATCH_SIZE`: Scalar `SUM` divided by number of elements in losses.
+  * `SUM_OVER_NONZERO_WEIGHTS`: Scalar `SUM` divided by number of non-zero
+     weights. DEPRECATED.
+  * `SUM_BY_NONZERO_WEIGHTS`: Same as `SUM_OVER_NONZERO_WEIGHTS`.
+  """
+
+  NONE = "none"
+  SUM = "weighted_sum"
+  SUM_OVER_BATCH_SIZE = "weighted_sum_over_batch_size"
+  MEAN = "weighted_mean"
   SUM_BY_NONZERO_WEIGHTS = "weighted_sum_by_nonzero_weights"
   SUM_OVER_NONZERO_WEIGHTS = SUM_BY_NONZERO_WEIGHTS
 
@@ -72,35 +93,7 @@ class Reduction(object):
   @classmethod
   def validate(cls, key):
     if key not in cls.all():
-      raise ValueError("Invalid ReductionKey %s." % key)
-
-
-def _safe_div(numerator, denominator, name="value"):
-  """Computes a safe divide which returns 0 if the denominator is zero.
-
-  Note that the function contains an additional conditional check that is
-  necessary for avoiding situations where the loss is zero causing NaNs to
-  creep into the gradient computation.
-
-  Args:
-    numerator: An arbitrary `Tensor`.
-    denominator: A `Tensor` whose shape matches `numerator` and whose values are
-      assumed to be non-negative.
-    name: An optional name for the returned op.
-
-  Returns:
-    The element-wise value of the numerator divided by the denominator.
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator, name=name)
-  return array_ops.where(
-      math_ops.greater(denominator, 0),
-      math_ops.div(numerator,
-                   array_ops.where(
-                       math_ops.equal(denominator, 0),
-                       array_ops.ones_like(denominator), denominator)),
-      array_ops.zeros_like(numerator),
-      name=name)
+      raise ValueError("Invalid Reduction Key %s." % key)
 
 
 def _safe_mean(losses, num_present):
@@ -115,7 +108,7 @@ def _safe_mean(losses, num_present):
       then zero is returned.
   """
   total_loss = math_ops.reduce_sum(losses)
-  return _safe_div(total_loss, num_present)
+  return math_ops.div_no_nan(total_loss, num_present, name="value")
 
 
 def _num_present(losses, weights, per_batch=False):
@@ -166,7 +159,7 @@ def _num_elements(losses):
     return math_ops.cast(array_ops.size(losses, name=scope), dtype=losses.dtype)
 
 
-@tf_export("losses.compute_weighted_loss")
+@tf_export(v1=["losses.compute_weighted_loss"])
 def compute_weighted_loss(
     losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
     reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -236,7 +229,7 @@ def compute_weighted_loss(
       return loss
 
 
-@tf_export("losses.absolute_difference")
+@tf_export(v1=["losses.absolute_difference"])
 def absolute_difference(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -289,7 +282,7 @@ def absolute_difference(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.cosine_distance")
+@tf_export(v1=["losses.cosine_distance"])
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def cosine_distance(
     labels, predictions, axis=None, weights=1.0, scope=None,
@@ -345,7 +338,7 @@ def cosine_distance(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.hinge_loss")
+@tf_export(v1=["losses.hinge_loss"])
 def hinge_loss(labels, logits, weights=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -395,7 +388,7 @@ def hinge_loss(labels, logits, weights=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.huber_loss")
+@tf_export(v1=["losses.huber_loss"])
 def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
                loss_collection=ops.GraphKeys.LOSSES,
                reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -473,7 +466,7 @@ def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.log_loss")
+@tf_export(v1=["losses.log_loss"])
 def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
              loss_collection=ops.GraphKeys.LOSSES,
              reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
@@ -530,7 +523,7 @@ def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
 
 
 # TODO(b/37208492): Add reduction arg.
-@tf_export("losses.mean_pairwise_squared_error")
+@tf_export(v1=["losses.mean_pairwise_squared_error"])
 def mean_pairwise_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES):
@@ -595,26 +588,24 @@ def mean_pairwise_squared_error(
 
       diffs = math_ops.subtract(predictions, labels)
 
-      reduction_indices = math_ops.range(1, array_ops.rank(diffs))
+      axis = math_ops.range(1, array_ops.rank(diffs))
 
       sum_squares_diff_per_batch = math_ops.reduce_sum(
-          math_ops.square(diffs),
-          reduction_indices=reduction_indices,
-          keepdims=True)
+          math_ops.square(diffs), axis=axis, keepdims=True)
       num_present_per_batch = _num_present(diffs, weights, per_batch=True)
 
-      term1 = 2.0 * _safe_div(
+      term1 = 2.0 * math_ops.div_no_nan(
           sum_squares_diff_per_batch,
-          math_ops.maximum(num_present_per_batch - 1, 0))
+          math_ops.maximum(num_present_per_batch - 1, 0),
+          name="value")
 
-      sum_diff = math_ops.reduce_sum(
-          diffs, reduction_indices=reduction_indices, keepdims=True)
-      term2 = 2.0 * _safe_div(
+      sum_diff = math_ops.reduce_sum(diffs, axis=axis, keepdims=True)
+      term2 = 2.0 * math_ops.div_no_nan(
           math_ops.square(sum_diff),
           math_ops.maximum(
               math_ops.multiply(num_present_per_batch,
-                                num_present_per_batch - 1),
-              0))
+                                num_present_per_batch - 1), 0),
+          name="value")
 
       weighted_losses = math_ops.multiply(term1 - term2, weights)
       loss = math_ops.reduce_sum(weighted_losses)
@@ -628,7 +619,7 @@ def mean_pairwise_squared_error(
       return mean_loss
 
 
-@tf_export("losses.mean_squared_error")
+@tf_export(v1=["losses.mean_squared_error"])
 def mean_squared_error(
     labels, predictions, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -681,7 +672,7 @@ def mean_squared_error(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.sigmoid_cross_entropy")
+@tf_export(v1=["losses.sigmoid_cross_entropy"])
 def sigmoid_cross_entropy(
     multi_class_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -745,7 +736,7 @@ def sigmoid_cross_entropy(
         losses, weights, scope, loss_collection, reduction=reduction)
 
 
-@tf_export("losses.softmax_cross_entropy")
+@tf_export(v1=["losses.softmax_cross_entropy"])
 def softmax_cross_entropy(
     onehot_labels, logits, weights=1.0, label_smoothing=0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
@@ -867,7 +858,7 @@ def _remove_squeezable_dimensions(
   return labels, predictions, weights
 
 
-@tf_export("losses.sparse_softmax_cross_entropy")
+@tf_export(v1=["losses.sparse_softmax_cross_entropy"])
 def sparse_softmax_cross_entropy(
     labels, logits, weights=1.0, scope=None,
     loss_collection=ops.GraphKeys.LOSSES,
diff --git a/tensorflow/python/ops/losses/util_test.py b/tensorflow/python/ops/losses/util_test.py
index df2e60e2e45c9aa38184e36e126f519fdb8beb5e..22a8eaae2666634c7132bdbd537fac5d731ed2f6 100644
--- a/tensorflow/python/ops/losses/util_test.py
+++ b/tensorflow/python/ops/losses/util_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops.losses import util
 from tensorflow.python.platform import test
 
 
 class LossesUtilTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testGetRegularizationLoss(self):
     # Empty regularization collection should evaluate to 0.0.
     with self.cached_session():
diff --git a/tensorflow/python/ops/math_grad.py b/tensorflow/python/ops/math_grad.py
index 35278d9680408aa44c81ec3276e61cd382a58c57..c7ec1c57d1b07232e2bdb05fc30f5456b792890f 100644
--- a/tensorflow/python/ops/math_grad.py
+++ b/tensorflow/python/ops/math_grad.py
@@ -1041,11 +1041,12 @@ def _PowGrad(op, grad):
   # Avoid false singularity at x = 0
   if x.dtype.is_complex:
     # real(x) < 0 is fine for the complex case
-    log_x = array_ops.where(
-        math_ops.not_equal(x, 0), math_ops.log(x), array_ops.zeros_like(x))
+    mask = math_ops.not_equal(x, 0)
   else:
     # There's no sensible real value to return if x < 0, so return 0
-    log_x = array_ops.where(x > 0, math_ops.log(x), array_ops.zeros_like(x))
+    mask = x > 0
+  safe_x = array_ops.where(mask, x, array_ops.ones_like(x))
+  log_x = array_ops.where(mask, math_ops.log(safe_x), array_ops.zeros_like(x))
   gy = array_ops.reshape(math_ops.reduce_sum(grad * z * log_x, ry), sy)
   return gx, gy
 
diff --git a/tensorflow/python/ops/math_grad_test.py b/tensorflow/python/ops/math_grad_test.py
index d1fe834fc78f2c39996a0690af96ba17d28d8706..822f89768c53c45def3bb93a53382b2375944528 100644
--- a/tensorflow/python/ops/math_grad_test.py
+++ b/tensorflow/python/ops/math_grad_test.py
@@ -20,9 +20,13 @@ from __future__ import print_function
 
 import numpy as np
 
+from tensorflow.python.eager import backprop
+from tensorflow.python.eager import context
+from tensorflow.python.eager import execution_callbacks
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients
@@ -52,6 +56,7 @@ class SquaredDifferenceOpTest(test.TestCase):
     self.assertLess(left_err, 1e-10)
     self.assertLess(right_err, 1e-10)
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     self._testGrad([1, 2, 3, 2], [3, 2])
     self._testGrad([2, 4], [3, 2, 4])
@@ -83,6 +88,7 @@ class AbsOpTest(test.TestCase):
           value, shape, output, output.get_shape().as_list())
     self.assertLess(error, max_error)
 
+  @test_util.run_deprecated_v1
   def testComplexAbs(self):
     # Bias random test values away from zero to avoid numeric instabilities.
     self._testGrad(
@@ -99,6 +105,7 @@ class AbsOpTest(test.TestCase):
 
 class MinOrMaxGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMinGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_min(array_ops.concat([inputs, inputs], 0))
@@ -106,6 +113,7 @@ class MinOrMaxGradientTest(test.TestCase):
       error = gradient_checker.compute_gradient_error(inputs, [1], outputs, [])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testMaxGradient(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     outputs = math_ops.reduce_max(array_ops.concat([inputs, inputs], 0))
@@ -116,6 +124,7 @@ class MinOrMaxGradientTest(test.TestCase):
 
 class MaximumOrMinimumGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testMaximumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.maximum(inputs, 3.0)
@@ -123,6 +132,7 @@ class MaximumOrMinimumGradientTest(test.TestCase):
       error = gradient_checker.compute_gradient_error(inputs, [4], outputs, [4])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testMinimumGradient(self):
     inputs = constant_op.constant([1.0, 2.0, 3.0, 4.0], dtype=dtypes.float32)
     outputs = math_ops.minimum(inputs, 2.0)
@@ -133,6 +143,7 @@ class MaximumOrMinimumGradientTest(test.TestCase):
 
 class ProdGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testProdGradient(self):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
@@ -143,6 +154,7 @@ class ProdGradientTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientForNegativeAxis(self):
     inputs = constant_op.constant([[1., 2.], [3., 4.]],
                                   dtype=dtypes.float32)
@@ -153,6 +165,7 @@ class ProdGradientTest(test.TestCase):
           outputs, outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientComplex(self):
     for dtype in dtypes.complex64, dtypes.complex128:
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
@@ -164,6 +177,7 @@ class ProdGradientTest(test.TestCase):
             outputs, outputs.get_shape().as_list())
         self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testProdGradientForNegativeAxisComplex(self):
     for dtype in dtypes.complex64, dtypes.complex128:
       inputs = constant_op.constant([[1 + 3j, 2 - 1j], [3j, 4]],
@@ -178,6 +192,7 @@ class ProdGradientTest(test.TestCase):
 
 class SegmentMinOrMaxGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testSegmentMinGradient(self):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
@@ -187,6 +202,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [2])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMaxGradient(self):
     data = constant_op.constant([1.0, 2.0, 3.0], dtype=dtypes.float32)
     segment_ids = constant_op.constant([0, 0, 1], dtype=dtypes.int64)
@@ -196,6 +212,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [2])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMinGradientWithTies(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     data = array_ops.concat([inputs, inputs], 0)
@@ -206,6 +223,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
                                                       [1])
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testSegmentMaxGradientWithTies(self):
     inputs = constant_op.constant([1.0], dtype=dtypes.float32)
     data = array_ops.concat([inputs, inputs], 0)
@@ -219,6 +237,7 @@ class SegmentMinOrMaxGradientTest(test.TestCase):
 
 class FloorModGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testFloorModGradient(self):
     # Making sure the input is not near the discontinuity point where
     # x/y == floor(x/y)
@@ -233,6 +252,7 @@ class FloorModGradientTest(test.TestCase):
 
 class DivNoNanGradientTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicGradient(self):
     inputs = constant_op.constant(np.arange(-3, 3),
                                   dtype=dtypes.float32)
@@ -244,6 +264,7 @@ class DivNoNanGradientTest(test.TestCase):
           outputs.get_shape().as_list())
       self.assertLess(error, 1e-4)
 
+  @test_util.run_deprecated_v1
   def testGradientWithDenominatorIsZero(self):
     x = constant_op.constant(np.arange(-3, 3),
                              dtype=dtypes.float32)
@@ -263,6 +284,7 @@ class XlogyTest(test.TestCase):
     xlogy_ygrad = self.evaluate(gradients.gradients(math_ops.xlogy(x, y), y)[0])
     return xlogy_xgrad, xlogy_ygrad
 
+  @test_util.run_deprecated_v1
   def testNonZeroValuesGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -273,6 +295,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(xlogy_expected_xgrad, xlogy_xgrad)
       self.assertAllClose(xlogy_expected_ygrad, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -282,6 +305,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(zero, xlogy_xgrad)
       self.assertAllClose(zero, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -290,6 +314,7 @@ class XlogyTest(test.TestCase):
       self.assertAllClose(-np.inf, xlogy_xgrad)
       self.assertAllClose(np.inf, xlogy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -307,6 +332,7 @@ class XdivyTest(test.TestCase):
     xdivy_ygrad = self.evaluate(gradients.gradients(math_ops.xdivy(x, y), y)[0])
     return xdivy_xgrad, xdivy_ygrad
 
+  @test_util.run_deprecated_v1
   def testNonZeroValuesGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -317,6 +343,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(xdivy_expected_xgrad, xdivy_xgrad)
       self.assertAllClose(xdivy_expected_ygrad, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -326,6 +353,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(zero, xdivy_xgrad)
       self.assertAllClose(zero, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0.1, dtype=dtype)
@@ -334,6 +362,7 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(np.inf, xdivy_xgrad)
       self.assertAllClose(-np.inf, xdivy_ygrad)
 
+  @test_util.run_deprecated_v1
   def testZeroXYGrad(self):
     for dtype in [dtypes.float16, dtypes.float32, dtypes.float64]:
       x = constant_op.constant(0., dtype=dtype)
@@ -344,5 +373,25 @@ class XdivyTest(test.TestCase):
       self.assertAllClose(zero, xdivy_ygrad)
 
 
+@test_util.run_all_in_graph_and_eager_modes
+class PowGradTest(test.TestCase):
+
+  def test_zero_grad_tf_gradients(self):
+    if context.executing_eagerly():
+      self.skipTest("tf.gradients not supported in eager.")
+
+    x = constant_op.constant([-1., 0., 1.])
+    g = self.evaluate(gradients.gradients(math_ops.pow(x, 2), x)[0])
+    self.assertAllClose([-2., 0., 2.], g)
+
+  def test_zero_grad_tape(self):
+    with execution_callbacks.errstate(inf_or_nan=execution_callbacks.RAISE):
+      x = constant_op.constant([-1, 0., 1.])
+      with backprop.GradientTape() as tape:
+        tape.watch(x)
+        g = tape.gradient(math_ops.pow(x, 2), x)
+      g = self.evaluate(g)
+      self.assertAllClose([-2., 0., 2.], g)
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/ops/math_ops.py b/tensorflow/python/ops/math_ops.py
index 39b1ca8993e2c38a2a1fd4a95540be7d18a092df..e2b634ee8f8d18e1e0e43a9e10cb7f2532bbbf12 100644
--- a/tensorflow/python/ops/math_ops.py
+++ b/tensorflow/python/ops/math_ops.py
@@ -36,7 +36,6 @@ from tensorflow.python.ops import gen_data_flow_ops
 from tensorflow.python.ops import gen_math_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import gen_sparse_ops
-from tensorflow.python.ops import gen_spectral_ops
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_math_ops import *
@@ -44,6 +43,7 @@ from tensorflow.python.ops.gen_math_ops import *
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
@@ -52,8 +52,8 @@ linspace = gen_math_ops.lin_space
 
 arg_max = deprecation.deprecated(None, "Use `tf.math.argmax` instead")(arg_max)  # pylint: disable=used-before-assignment
 arg_min = deprecation.deprecated(None, "Use `tf.math.argmin` instead")(arg_min)  # pylint: disable=used-before-assignment
-tf_export("arg_max")(arg_max)
-tf_export("arg_min")(arg_min)
+tf_export(v1=["arg_max"])(arg_max)
+tf_export(v1=["arg_min"])(arg_min)
 
 # This is set by resource_variable_ops.py. It is included in this way since
 # there is a circular dependency between math_ops and resource_variable_ops
@@ -83,8 +83,6 @@ def argmax(input,
            output_type=dtypes.int64):
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis, "dimension", dimension)
-  if axis is None:
-    axis = 0
   return argmax_v2(input, axis, output_type, name)
 
 
@@ -112,6 +110,8 @@ def argmax_v2(input,
   Returns:
     A `Tensor` of type `output_type`.
   """
+  if axis is None:
+    axis = 0
   return gen_math_ops.arg_max(input, axis, name=name, output_type=output_type)
 
 
@@ -128,8 +128,6 @@ def argmin(input,
            output_type=dtypes.int64):
   axis = deprecation.deprecated_argument_lookup(
       "axis", axis, "dimension", dimension)
-  if axis is None:
-    axis = 0
   return argmin_v2(input, axis, output_type, name)
 
 
@@ -157,6 +155,8 @@ def argmin_v2(input,
   Returns:
     A `Tensor` of type `output_type`.
   """
+  if axis is None:
+    axis = 0
   return gen_math_ops.arg_min(input, axis, name=name, output_type=output_type)
 
 
@@ -166,6 +166,7 @@ def argmin_v2(input,
 # pylint: disable=anomalous-backslash-in-string,protected-access
 # pylint: disable=g-docstring-has-escape
 @tf_export("math.abs", "abs")
+@dispatch.add_dispatch_support
 def abs(x, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the absolute value of a tensor.
 
@@ -190,22 +191,10 @@ def abs(x, name=None):  # pylint: disable=redefined-builtin
       of type `float32` or `float64`, respectively.
   """
   with ops.name_scope(name, "Abs", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      if x.values.dtype.is_complex:
-        x_abs = gen_math_ops.complex_abs(
-            x.values, Tout=x.values.dtype.real_dtype, name=name)
-        return sparse_tensor.SparseTensor(
-            indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-      x_abs = gen_math_ops._abs(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_abs, dense_shape=x.dense_shape)
-    else:
-      x = ops.convert_to_tensor(x, name="x")
-      if x.dtype.is_complex:
-        return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
-      return gen_math_ops._abs(x, name=name)
-
-
+    x = ops.convert_to_tensor(x, name="x")
+    if x.dtype.is_complex:
+      return gen_math_ops.complex_abs(x, Tout=x.dtype.real_dtype, name=name)
+    return gen_math_ops._abs(x, name=name)
 # pylint: enable=g-docstring-has-escape
 
 
@@ -241,6 +230,7 @@ class DivideDelegateWithName(object):
 
 
 @tf_export("math.divide", "divide")
+@dispatch.add_dispatch_support
 def divide(x, y, name=None):
   """Computes Python style division of `x` by `y`."""
 
@@ -253,6 +243,7 @@ def divide(x, y, name=None):
 
 
 @tf_export("math.multiply", "multiply")
+@dispatch.add_dispatch_support
 def multiply(x, y, name=None):
   return gen_math_ops.mul(x, y, name)
 
@@ -273,6 +264,7 @@ _mul.__doc__ = (
 
 
 @tf_export("math.subtract", "subtract")
+@dispatch.add_dispatch_support
 def subtract(x, y, name=None):
   return gen_math_ops.sub(x, y, name)
 
@@ -292,31 +284,7 @@ _sub.__doc__ = (
     gen_math_ops.sub.__doc__ + ("" if _sub.__doc__ is None else _sub.__doc__))
 
 
-# pylint: disable=g-docstring-has-escape
-@tf_export("math.negative", "negative")
-def negative(x, name=None):
-  """Computes numerical negative value element-wise.
-
-  I.e., \\(y = -x\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Neg", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_neg = gen_math_ops.neg(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_neg, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.neg(x, name=name)
-
-
-# pylint: enable=g-docstring-has-escape
+negative = gen_math_ops.neg
 
 
 # pylint: disable=g-docstring-has-escape
@@ -342,107 +310,8 @@ def _neg(x, name=None):
 # pylint: enable=g-docstring-has-escape
 
 
-@tf_export("math.sign", "sign")
-def sign(x, name=None):
-  """Returns an element-wise indication of the sign of a number.
-
-  `y = sign(x) = -1` if `x < 0`; 0 if `x == 0` or `tf.is_nan(x)`; 1 if `x > 0`.
-
-  Zero is returned for NaN inputs.
-
-  For complex numbers, `y = sign(x) = x / |x|` if `x != 0`, otherwise `y = 0`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(numpy)
-  Equivalent to numpy.sign except for the behavior for input values of NaN.
-  @end_compatibility
-  """
-  with ops.name_scope(name, "Sign", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sign = gen_math_ops.sign(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sign, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sign(x, name=name)
-
-
-@tf_export("math.square", "square")
-def square(x, name=None):
-  r"""Computes square of x element-wise.
-
-  I.e., \\(y = x * x = x^2\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `int32`, `int64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Square", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_square = gen_math_ops.square(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_square, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.square(x, name=name)
-
-
-@tf_export("math.sqrt", "sqrt")
-def sqrt(x, name=None):
-  r"""Computes square root of x element-wise.
-
-  I.e., \\(y = \sqrt{x} = x^{1/2}\\).
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`, `complex64`, `complex128`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Sqrt", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_sqrt = gen_math_ops.sqrt(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_sqrt, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.sqrt(x, name=name)
-
-
-@tf_export("math.erf", v1=["math.erf", "erf"])
-@deprecation.deprecated_endpoints("erf")
-def erf(x, name=None):
-  """Computes the Gauss error function of `x` element-wise.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-  """
-  with ops.name_scope(name, "Erf", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_erf = gen_math_ops.erf(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_erf, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.erf(x, name=name)
-
-
-@tf_export("math.scalar_mul", "scalar_mul")
-def scalar_mul(scalar, x):
+@tf_export(v1=["math.scalar_mul", "scalar_mul"])
+def scalar_mul(scalar, x, name=None):
   """Multiplies a scalar times a `Tensor` or `IndexedSlices` object.
 
   Intended for use in gradient code which might deal with `IndexedSlices`
@@ -452,6 +321,7 @@ def scalar_mul(scalar, x):
   Args:
     scalar: A 0-D scalar `Tensor`. Must have known shape.
     x: A `Tensor` or `IndexedSlices` to be scaled.
+    name: A name for the operation (optional).
 
   Returns:
     `scalar * x` of the same type (`Tensor` or `IndexedSlices`) as `x`.
@@ -464,14 +334,23 @@ def scalar_mul(scalar, x):
   shape = scalar.get_shape()
   if shape.ndims == 0:
     if isinstance(x, ops.IndexedSlices):
-      return ops.IndexedSlices(scalar * x.values, x.indices, x.dense_shape)
+      return ops.IndexedSlices(gen_math_ops.mul(scalar, x.values, name),
+                               x.indices, x.dense_shape)
     else:
-      return scalar * x
+      return gen_math_ops.mul(scalar, x, name)
   else:
     raise ValueError("Only scalar multiply works, got shape %s" % shape)
 
 
+@tf_export("math.scalar_mul", "scalar_mul", v1=[])
+@_set_doc(scalar_mul.__doc__)
+def scalar_mul_v2(scalar, x, name=None):
+  with ops.name_scope(name, "scalar_mul", [x]) as name:
+    return scalar_mul(scalar, x, name)
+
+
 @tf_export("math.pow", "pow")
+@dispatch.add_dispatch_support
 def pow(x, y, name=None):  # pylint: disable=redefined-builtin
   r"""Computes the power of one value to another.
 
@@ -500,6 +379,7 @@ def pow(x, y, name=None):  # pylint: disable=redefined-builtin
 
 # pylint: disable=redefined-builtin,redefined-outer-name
 @tf_export("dtypes.complex", "complex")
+@dispatch.add_dispatch_support
 def complex(real, imag, name=None):
   r"""Converts two real numbers to a complex number.
 
@@ -543,6 +423,7 @@ def complex(real, imag, name=None):
 
 @tf_export("math.real", v1=["math.real", "real"])
 @deprecation.deprecated_endpoints("real")
+@dispatch.add_dispatch_support
 def real(input, name=None):
   r"""Returns the real part of a complex (or real) tensor.
 
@@ -575,6 +456,7 @@ def real(input, name=None):
 
 @tf_export("math.imag", v1=["math.imag", "imag"])
 @deprecation.deprecated_endpoints("imag")
+@dispatch.add_dispatch_support
 def imag(input, name=None):
   r"""Returns the imaginary part of a complex (or real) tensor.
 
@@ -606,6 +488,7 @@ def imag(input, name=None):
 
 @tf_export("math.angle", v1=["math.angle", "angle"])
 @deprecation.deprecated_endpoints("angle")
+@dispatch.add_dispatch_support
 def angle(input, name=None):
   r"""Returns the element-wise argument of a complex (or real) tensor.
 
@@ -645,6 +528,7 @@ def angle(input, name=None):
 
 
 @tf_export("math.round", "round")
+@dispatch.add_dispatch_support
 def round(x, name=None):  # pylint: disable=redefined-builtin
   """Rounds the values of a tensor to the nearest integer, element-wise.
 
@@ -672,6 +556,7 @@ def round(x, name=None):  # pylint: disable=redefined-builtin
 
 
 @tf_export("dtypes.cast", "cast")
+@dispatch.add_dispatch_support
 def cast(x, dtype, name=None):
   """Casts a tensor to a new type.
 
@@ -735,6 +620,7 @@ def cast(x, dtype, name=None):
 
 
 @tf_export("dtypes.saturate_cast", "saturate_cast")
+@dispatch.add_dispatch_support
 def saturate_cast(value, dtype, name=None):
   """Performs a safe saturating cast of `value` to `dtype`.
 
@@ -1060,6 +946,7 @@ def _div_python2(x, y, name=None):
 
 
 @tf_export("math.truediv", "truediv")
+@dispatch.add_dispatch_support
 def truediv(x, y, name=None):
   """Divides x / y elementwise (using Python 3 division operator semantics).
 
@@ -1091,7 +978,10 @@ def truediv(x, y, name=None):
   return _truediv_python3(x, y, name)
 
 
-@tf_export("div")
+@deprecation.deprecated(
+    date=None,
+    instructions="Deprecated in favor of operator or tf.math.divide.")
+@tf_export(v1=["div"])
 def div(x, y, name=None):
   """Divides x / y elementwise (using Python 2 division operator semantics).
 
@@ -1114,6 +1004,7 @@ def div(x, y, name=None):
 
 
 @tf_export("div_no_nan")
+@dispatch.add_dispatch_support
 def div_no_nan(x, y, name=None):
   """Computes an unsafe divide which returns 0 if the y is zero.
 
@@ -1143,6 +1034,7 @@ mod = gen_math_ops.floor_mod
 # TODO(aselle): Deprecate this once all internal functionality uses
 # tf.truncatediv
 @tf_export("math.floordiv", v1=["math.floordiv", "floordiv"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("floordiv")
 def floordiv(x, y, name=None):
   """Divides `x / y` elementwise, rounding toward the most negative integer.
@@ -1172,16 +1064,11 @@ def floordiv(x, y, name=None):
 
 
 realdiv = gen_math_ops.real_div
-tf_export("realdiv")(realdiv)
 truncatediv = gen_math_ops.truncate_div
-tf_export("truncatediv")(truncatediv)
 # TODO(aselle): Rename this to floordiv when we can.
 floor_div = gen_math_ops.floor_div
-tf_export("floor_div")(floor_div)
 truncatemod = gen_math_ops.truncate_mod
-tf_export("truncatemod")(truncatemod)
 floormod = gen_math_ops.floor_mod
-tf_export("floormod", "mod")(floormod)
 
 
 def _mul_dispatch(x, y, name=None):
@@ -1217,6 +1104,7 @@ _OverrideBinaryOperatorHelper(pow, "pow")
 
 
 @tf_export("math.logical_xor", v1=["math.logical_xor", "logical_xor"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("logical_xor")
 def logical_xor(x, y, name="LogicalXor"):
   """x ^ y = (x | y) & ~(x & y)."""
@@ -1312,7 +1200,7 @@ def range(start, limit=None, delta=1, dtype=None, name="range"):  # pylint: disa
 
 
 # Reduction operations
-def _ReductionDims(x, axis, reduction_indices):
+def _ReductionDims(x, axis, reduction_indices=None):  # pylint: disable=invalid-name
   """Returns range(0, rank(x)) if reduction_indices is None."""
   # TODO(aselle): Remove this after deprecation
   if reduction_indices is not None:
@@ -1335,23 +1223,23 @@ def _ReductionDims(x, axis, reduction_indices):
     return range(0, array_ops.rank(x))
 
 
-def _may_reduce_to_scalar(keepdims, axis, reduction_indices, output):
+def _may_reduce_to_scalar(keepdims, axis, output):
   """Set a reduction's output shape to be a scalar if we are certain."""
   if not common_shapes.has_fully_defined_shape(output) and (not keepdims) and (
-      axis is None) and (reduction_indices is None):
+      axis is None):
     output.set_shape(())
   return output
 
 
-@tf_export("math.reduce_sum", "reduce_sum")
+@tf_export(v1=["math.reduce_sum", "reduce_sum"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_sum(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
+def reduce_sum_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
   """Computes the sum of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1391,21 +1279,62 @@ def reduce_sum(input_tensor,
   int64 while tensorflow returns the same dtype as the input.
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
+  return reduce_sum(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_sum", "reduce_sum", v1=[])
+@dispatch.add_dispatch_support
+def reduce_sum(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the sum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
 
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._sum(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
 
+  For example:
 
-@tf_export("math.count_nonzero", "count_nonzero")
+  ```python
+  x = tf.constant([[1, 1, 1], [1, 1, 1]])
+  tf.reduce_sum(x)  # 6
+  tf.reduce_sum(x, 0)  # [2, 2, 2]
+  tf.reduce_sum(x, 1)  # [3, 3]
+  tf.reduce_sum(x, 1, keepdims=True)  # [[3], [3]]
+  tf.reduce_sum(x, [0, 1])  # 6
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor, of the same dtype as the input_tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.sum apart the fact that numpy upcast uint8 and int32 to
+  int64 while tensorflow returns the same dtype as the input.
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._sum(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.count_nonzero", "count_nonzero"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
 def count_nonzero(input_tensor,
@@ -1466,32 +1395,89 @@ def count_nonzero(input_tensor,
   """
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis,
+      "reduction_indices", reduction_indices
+      )
+
+  return count_nonzero_v2(input_tensor, axis, keepdims, dtype, name)
+
+
+@tf_export("math.count_nonzero", v1=[])
+def count_nonzero_v2(input,  # pylint: disable=redefined-builtin
+                     axis=None,
+                     keepdims=None,
+                     dtype=dtypes.int64,
+                     name=None):
+  """Computes number of nonzero elements across dimensions of a tensor.
+
+  Reduces `input` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  **NOTE** Floating point comparison to zero is done by exact floating point
+  equality check.  Small values are **not** rounded to zero for purposes of
+  the nonzero check.
+
+  For example:
+
+  ```python
+  x = tf.constant([[0, 1, 0], [1, 1, 0]])
+  tf.count_nonzero(x)  # 3
+  tf.count_nonzero(x, 0)  # [1, 2, 0]
+  tf.count_nonzero(x, 1)  # [1, 2]
+  tf.count_nonzero(x, 1, keepdims=True)  # [[1], [2]]
+  tf.count_nonzero(x, [0, 1])  # 3
+  ```
+
+  **NOTE** Strings are compared against zero-length empty string `""`. Any
+  string with a size greater than zero is already considered as nonzero.
+
+  For example:
+  ```python
+  x = tf.constant(["", "a", "  ", "b", ""])
+  tf.count_nonzero(x) # 3, with "a", "  ", and "b" as nonzero strings.
+  ```
+
+  Args:
+    input: The tensor to reduce. Should be of numeric type, `bool`,
+      or `string`.
+    axis: The dimensions to reduce. If `None` (the default),
+      reduces all dimensions. Must be in the range
+      `[-rank(input), rank(input))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    dtype: The output dtype; defaults to `tf.int64`.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor (number of nonzero values).
+  """
   if keepdims is None:
     keepdims = False
-
-  with ops.name_scope(name, "count_nonzero", [input_tensor]):
-    input_tensor = ops.convert_to_tensor(input_tensor, name="input_tensor")
+  with ops.name_scope(name, "count_nonzero", [input]):
+    input = ops.convert_to_tensor(input, name="input")
     # A scalar of 'zero' is enough as `not_equal` will broadcast.
-    zero = array_ops.zeros([], dtype=input_tensor.dtype)
+    zero = array_ops.zeros([], dtype=input.dtype)
     return cast(
         reduce_sum(
             # int64 reduction happens on GPU
-            to_int64(gen_math_ops.not_equal(input_tensor, zero)),
+            to_int64(gen_math_ops.not_equal(input, zero)),
             axis=axis,
-            keepdims=keepdims,
-            reduction_indices=reduction_indices),
+            keepdims=keepdims),
         dtype=dtype)
 
 
-@tf_export("math.reduce_mean", "reduce_mean")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_mean(input_tensor,
-                axis=None,
-                keepdims=None,
-                name=None,
-                reduction_indices=None,
-                keep_dims=None):
+@tf_export(v1=["math.reduce_mean", "reduce_mean"])
+def reduce_mean_v1(input_tensor,
+                   axis=None,
+                   keepdims=None,
+                   name=None,
+                   reduction_indices=None,
+                   keep_dims=None):
   """Computes the mean of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1541,22 +1527,73 @@ def reduce_mean(input_tensor,
 
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
+  return reduce_mean(input_tensor, axis, keepdims, name)
 
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops.mean(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+
+@tf_export("math.reduce_mean", "reduce_mean", v1=[])
+@dispatch.add_dispatch_support
+def reduce_mean(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the mean of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[1., 1.], [2., 2.]])
+  tf.reduce_mean(x)  # 1.5
+  tf.reduce_mean(x, 0)  # [1.5, 1.5]
+  tf.reduce_mean(x, 1)  # [1.,  2.]
+  ```
+
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.mean
+
+  Please note that `np.mean` has a `dtype` parameter that could be used to
+  specify the output type. By default this is `dtype=float64`. On the other
+  hand, `tf.reduce_mean` has an aggressive type inference from `input_tensor`,
+  for example:
+
+  ```python
+  x = tf.constant([1, 0, 1, 0])
+  tf.reduce_mean(x)  # 0
+  y = tf.constant([1., 0., 1., 0.])
+  tf.reduce_mean(y)  # 0.5
+  ```
+
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.mean(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
 
 
 @tf_export("math.reduce_variance")
-def reduce_variance(input_tensor, axis=None, keepdims=None, name=None):
+def reduce_variance(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the variance of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1599,12 +1636,12 @@ def reduce_variance(input_tensor, axis=None, keepdims=None, name=None):
   name = name if name else "reduce_variance"
   with ops.name_scope(name):
     means = reduce_mean(input_tensor, axis=axis, keepdims=True)
-    squared_deviations = square(input_tensor - means)
+    squared_deviations = gen_math_ops.square(input_tensor - means)
     return reduce_mean(squared_deviations, axis=axis, keepdims=keepdims)
 
 
 @tf_export("math.reduce_std")
-def reduce_std(input_tensor, axis=None, keepdims=None, name=None):
+def reduce_std(input_tensor, axis=None, keepdims=False, name=None):
   """Computes the standard deviation of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1646,18 +1683,54 @@ def reduce_std(input_tensor, axis=None, keepdims=None, name=None):
   name = name if name else "reduce_std"
   with ops.name_scope(name):
     variance = reduce_variance(input_tensor, axis=axis, keepdims=keepdims)
-    return sqrt(variance)
+    return gen_math_ops.sqrt(variance)
+
+
+@tf_export("math.reduce_prod", "reduce_prod", v1=[])
+@dispatch.add_dispatch_support
+def reduce_prod(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the product of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
 
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
 
-@tf_export("math.reduce_prod", "reduce_prod")
+  Args:
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default),
+      reduces all dimensions. Must be in the range
+      `[-rank(input_tensor), rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.prod
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops.prod(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_prod", "reduce_prod"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_prod(input_tensor,
-                axis=None,
-                keepdims=None,
-                name=None,
-                reduction_indices=None,
-                keep_dims=None):
+def reduce_prod_v1(input_tensor,
+                   axis=None,
+                   keepdims=None,
+                   name=None,
+                   reduction_indices=None,
+                   keep_dims=None):
   """Computes the product of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1670,6 +1743,128 @@ def reduce_prod(input_tensor,
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+    reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.prod
+  @end_compatibility
+  """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  return reduce_prod(input_tensor, axis, keepdims, name)
+
+
+@tf_export(v1=["math.reduce_min", "reduce_min"])
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def reduce_min_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the minimum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+    reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.min
+  @end_compatibility
+  """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  return reduce_min(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_min", "reduce_min", v1=[])
+@dispatch.add_dispatch_support
+def reduce_min(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the minimum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.min
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._min(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_max", "reduce_max"])
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def reduce_max_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the maximum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
     axis: The dimensions to reduce. If `None` (the default),
       reduces all dimensions. Must be in the range
       `[-rank(input_tensor), rank(input_tensor))`.
@@ -1682,33 +1877,108 @@ def reduce_prod(input_tensor,
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.prod
+  Equivalent to np.max
+  @end_compatibility
+  """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
+  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
+                                                    "keep_dims", keep_dims)
+  return reduce_max(input_tensor, axis, keepdims, name)
+
+
+@tf_export("math.reduce_max", "reduce_max", v1=[])
+@dispatch.add_dispatch_support
+def reduce_max(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the maximum of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  Args:
+    input_tensor: The tensor to reduce. Should have real numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.max
+  @end_compatibility
+  """
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._max(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
+
+
+@tf_export(v1=["math.reduce_all", "reduce_all"])
+@deprecation.deprecated_args(
+    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+def reduce_all_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the "logical and" of elements across dimensions of a tensor.
+
+  Reduces `input_tensor` along the dimensions given in `axis`.
+  Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
+  entry in `axis`. If `keepdims` is true, the reduced dimensions
+  are retained with length 1.
+
+  If `axis` is None, all dimensions are reduced, and a
+  tensor with a single element is returned.
+
+  For example:
+
+  ```python
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_all(x)  # False
+  tf.reduce_all(x, 0)  # [False, False]
+  tf.reduce_all(x, 1)  # [True, False]
+  ```
+
+  Args:
+    input_tensor: The boolean tensor to reduce.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
+    keepdims: If true, retains reduced dimensions with length 1.
+    name: A name for the operation (optional).
+    reduction_indices: The old (deprecated) name for axis.
+    keep_dims: Deprecated alias for `keepdims`.
+
+  Returns:
+    The reduced tensor.
+
+  @compatibility(numpy)
+  Equivalent to np.all
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops.prod(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_all(input_tensor, axis, keepdims, name)
 
 
-@tf_export("math.reduce_min", "reduce_min")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_min(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
-  """Computes the minimum of elements across dimensions of a tensor.
+@tf_export("reduce_all", "math.reduce_all", v1=[])
+@dispatch.add_dispatch_support
+def reduce_all(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the "logical and" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
   Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
@@ -1718,46 +1988,48 @@ def reduce_min(input_tensor,
   If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
+  For example:
+
+  ```python
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_all(x)  # False
+  tf.reduce_all(x, 0)  # [False, False]
+  tf.reduce_all(x, 1)  # [True, False]
+  ```
+
   Args:
-    input_tensor: The tensor to reduce. Should have real numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    input_tensor: The boolean tensor to reduce.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
-    reduction_indices: The old (deprecated) name for axis.
-    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.min
+  Equivalent to np.all
   @end_compatibility
   """
-  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-                                                    "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._min(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._all(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
 
 
-@tf_export("math.reduce_max", "reduce_max")
+@tf_export(v1=["math.reduce_any", "reduce_any"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_max(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
-  """Computes the maximum of elements across dimensions of a tensor.
+def reduce_any_v1(input_tensor,
+                  axis=None,
+                  keepdims=None,
+                  name=None,
+                  reduction_indices=None,
+                  keep_dims=None):
+  """Computes the "logical or" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
   Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
@@ -1767,11 +2039,20 @@ def reduce_max(input_tensor,
   If `axis` is None, all dimensions are reduced, and a
   tensor with a single element is returned.
 
+  For example:
+
+  ```python
+  x = tf.constant([[True,  True], [False, False]])
+  tf.reduce_any(x)  # True
+  tf.reduce_any(x, 0)  # [True, True]
+  tf.reduce_any(x, 1)  # [True, False]
+  ```
+
   Args:
-    input_tensor: The tensor to reduce. Should have real numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    input_tensor: The boolean tensor to reduce.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1781,32 +2062,20 @@ def reduce_max(input_tensor,
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.max
+  Equivalent to np.any
   @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._max(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_any(input_tensor, axis, keepdims, name)
 
 
-@tf_export("math.reduce_all", "reduce_all")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_all(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
-  """Computes the "logical and" of elements across dimensions of a tensor.
+@tf_export("math.reduce_any", "reduce_any", v1=[])
+@dispatch.add_dispatch_support
+def reduce_any(input_tensor, axis=None, keepdims=False, name=None):
+  """Computes the "logical or" of elements across dimensions of a tensor.
 
   Reduces `input_tensor` along the dimensions given in `axis`.
   Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
@@ -1820,74 +2089,73 @@ def reduce_all(input_tensor,
 
   ```python
   x = tf.constant([[True,  True], [False, False]])
-  tf.reduce_all(x)  # False
-  tf.reduce_all(x, 0)  # [False, False]
-  tf.reduce_all(x, 1)  # [True, False]
+  tf.reduce_any(x)  # True
+  tf.reduce_any(x, 0)  # [True, True]
+  tf.reduce_any(x, 1)  # [True, False]
   ```
 
   Args:
     input_tensor: The boolean tensor to reduce.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
-    reduction_indices: The old (deprecated) name for axis.
-    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
 
   @compatibility(numpy)
-  Equivalent to np.all
+  Equivalent to np.any
   @end_compatibility
   """
-  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-                                                    "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._all(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  keepdims = False if keepdims is None else keepdims
+  return _may_reduce_to_scalar(
+      keepdims, axis,
+      gen_math_ops._any(
+          input_tensor, _ReductionDims(input_tensor, axis), keepdims,
+          name=name))
 
 
-@tf_export("math.reduce_any", "reduce_any")
+@tf_export(v1=["math.reduce_logsumexp", "reduce_logsumexp"])
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_any(input_tensor,
-               axis=None,
-               keepdims=None,
-               name=None,
-               reduction_indices=None,
-               keep_dims=None):
-  """Computes the "logical or" of elements across dimensions of a tensor.
+def reduce_logsumexp_v1(input_tensor,
+                        axis=None,
+                        keepdims=None,
+                        name=None,
+                        reduction_indices=None,
+                        keep_dims=None):
+  """Computes log(sum(exp(elements across dimensions of a tensor))).
 
   Reduces `input_tensor` along the dimensions given in `axis`.
   Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
   entry in `axis`. If `keepdims` is true, the reduced dimensions
   are retained with length 1.
 
-  If `axis` is None, all dimensions are reduced, and a
+  If `axis` has no entries, all dimensions are reduced, and a
   tensor with a single element is returned.
 
+  This function is more numerically stable than log(sum(exp(input))). It avoids
+  overflows caused by taking the exp of large inputs and underflows caused by
+  taking the log of small inputs.
+
   For example:
 
   ```python
-  x = tf.constant([[True,  True], [False, False]])
-  tf.reduce_any(x)  # True
-  tf.reduce_any(x, 0)  # [True, True]
-  tf.reduce_any(x, 1)  # [True, False]
+  x = tf.constant([[0., 0., 0.], [0., 0., 0.]])
+  tf.reduce_logsumexp(x)  # log(6)
+  tf.reduce_logsumexp(x, 0)  # [log(2), log(2), log(2)]
+  tf.reduce_logsumexp(x, 1)  # [log(3), log(3)]
+  tf.reduce_logsumexp(x, 1, keepdims=True)  # [[log(3)], [log(3)]]
+  tf.reduce_logsumexp(x, [0, 1])  # log(6)
   ```
 
   Args:
-    input_tensor: The boolean tensor to reduce.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    input_tensor: The tensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
     reduction_indices: The old (deprecated) name for axis.
@@ -1895,33 +2163,16 @@ def reduce_any(input_tensor,
 
   Returns:
     The reduced tensor.
-
-  @compatibility(numpy)
-  Equivalent to np.any
-  @end_compatibility
   """
+  axis = deprecation.deprecated_argument_lookup(
+      "axis", axis, "reduction_indices", reduction_indices)
   keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
                                                     "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
-  return _may_reduce_to_scalar(keepdims, axis, reduction_indices,
-                               gen_math_ops._any(
-                                   input_tensor,
-                                   _ReductionDims(input_tensor, axis,
-                                                  reduction_indices),
-                                   keepdims,
-                                   name=name))
+  return reduce_logsumexp(input_tensor, axis, keepdims, name)
 
 
-@tf_export("math.reduce_logsumexp", "reduce_logsumexp")
-@deprecation.deprecated_args(
-    None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
-def reduce_logsumexp(input_tensor,
-                     axis=None,
-                     keepdims=None,
-                     name=None,
-                     reduction_indices=None,
-                     keep_dims=None):
+@tf_export("math.reduce_logsumexp", "reduce_logsumexp", v1=[])
+def reduce_logsumexp(input_tensor, axis=None, keepdims=False, name=None):
   """Computes log(sum(exp(elements across dimensions of a tensor))).
 
   Reduces `input_tensor` along the dimensions given in `axis`.
@@ -1949,27 +2200,21 @@ def reduce_logsumexp(input_tensor,
 
   Args:
     input_tensor: The tensor to reduce. Should have numeric type.
-    axis: The dimensions to reduce. If `None` (the default),
-      reduces all dimensions. Must be in the range
-      `[-rank(input_tensor), rank(input_tensor))`.
+    axis: The dimensions to reduce. If `None` (the default), reduces all
+      dimensions. Must be in the range `[-rank(input_tensor),
+      rank(input_tensor))`.
     keepdims: If true, retains reduced dimensions with length 1.
     name: A name for the operation (optional).
-    reduction_indices: The old (deprecated) name for axis.
-    keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
     The reduced tensor.
   """
-  keepdims = deprecation.deprecated_argument_lookup("keepdims", keepdims,
-                                                    "keep_dims", keep_dims)
-  if keepdims is None:
-    keepdims = False
+  keepdims = False if keepdims is None else keepdims
   input_tensor = ops.convert_to_tensor(input_tensor)
   with ops.name_scope(name, "ReduceLogSumExp", [input_tensor]) as name:
     raw_max = reduce_max(
         input_tensor,
         axis=axis,
-        reduction_indices=reduction_indices,
         keepdims=True)
     my_max = array_ops.stop_gradient(
         array_ops.where(
@@ -1979,12 +2224,11 @@ def reduce_logsumexp(input_tensor,
         reduce_sum(
             gen_math_ops.exp(gen_math_ops.sub(input_tensor, my_max)),
             axis,
-            keepdims=keepdims,
-            reduction_indices=reduction_indices))
+            keepdims=keepdims))
     if not keepdims:
       my_max = array_ops.reshape(my_max, array_ops.shape(result))
     result = gen_math_ops.add(result, my_max)
-    return _may_reduce_to_scalar(keepdims, axis, reduction_indices, result)
+    return _may_reduce_to_scalar(keepdims, axis, result)
 
 
 @tf_export("linalg.trace", v1=["linalg.trace", "trace"])
@@ -2311,7 +2555,8 @@ def matvec(a,
 
 _OverrideBinaryOperatorHelper(matmul, "matmul")
 
-sparse_matmul = gen_math_ops.sparse_mat_mul
+sparse_matmul = deprecation.deprecated(None, "Use `tf.linalg.matmul` instead")(
+    gen_math_ops.sparse_mat_mul)
 tf_export(v1=["sparse_matmul"])(sparse_matmul)
 
 
@@ -2391,6 +2636,7 @@ def _as_indexed_slices_list(inputs, optimize=True):
 
 
 @tf_export("math.add_n", "add_n")
+@dispatch.add_dispatch_support
 def add_n(inputs, name=None):
   """Adds all input tensors element-wise.
 
@@ -2536,6 +2782,7 @@ def sigmoid(x, name=None):
 
 
 @tf_export("math.log_sigmoid", v1=["math.log_sigmoid", "log_sigmoid"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("log_sigmoid")
 def log_sigmoid(x, name=None):
   """Computes log sigmoid of `x` element-wise.
@@ -2555,34 +2802,64 @@ def log_sigmoid(x, name=None):
     return gen_math_ops.neg(gen_nn_ops.softplus(-x), name=name)
 
 
-@tf_export("math.tanh", "nn.tanh", "tanh")
-def tanh(x, name=None):
-  """Computes hyperbolic tangent of `x` element-wise.
+@tf_export("math.bincount", v1=[])
+def bincount(arr,
+             weights=None,
+             minlength=None,
+             maxlength=None,
+             dtype=dtypes.int32,
+             name=None):
+  """Counts the number of occurrences of each value in an integer array.
+
+  If `minlength` and `maxlength` are not given, returns a vector with length
+  `tf.reduce_max(arr) + 1` if `arr` is non-empty, and length 0 otherwise.
+  If `weights` are non-None, then index `i` of the output stores the sum of the
+  value in `weights` at each index where the corresponding value in `arr` is
+  `i`.
 
   Args:
-    x: A Tensor or SparseTensor with type `float16`, `float32`, `double`,
-      `complex64`, or `complex128`.
-    name: A name for the operation (optional).
+    arr: An int32 tensor of non-negative values.
+    weights: If non-None, must be the same shape as arr. For each value in
+      `arr`, the bin will be incremented by the corresponding weight instead of
+      1.
+    minlength: If given, ensures the output has length at least `minlength`,
+      padding with zeros at the end if necessary.
+    maxlength: If given, skips values in `arr` that are equal or greater than
+      `maxlength`, ensuring that the output has length at most `maxlength`.
+    dtype: If `weights` is None, determines the type of the output bins.
+    name: A name scope for the associated operations (optional).
 
   Returns:
-    A Tensor or SparseTensor respectively with the same type as `x`.
+    A vector with the same dtype as `weights` or the given `dtype`. The bin
+    values.
   """
-  with ops.name_scope(name, "Tanh", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_tanh = gen_math_ops.tanh(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_tanh, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.tanh(x, name=name)
-
-
-@tf_export("math.bincount", v1=["math.bincount", "bincount"])
+  name = "bincount" if name is None else name
+  with ops.name_scope(name):
+    arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
+    array_is_nonempty = reduce_prod(array_ops.shape(arr)) > 0
+    output_size = cast(array_is_nonempty, dtypes.int32) * (reduce_max(arr) + 1)
+    if minlength is not None:
+      minlength = ops.convert_to_tensor(
+          minlength, name="minlength", dtype=dtypes.int32)
+      output_size = gen_math_ops.maximum(minlength, output_size)
+    if maxlength is not None:
+      maxlength = ops.convert_to_tensor(
+          maxlength, name="maxlength", dtype=dtypes.int32)
+      output_size = gen_math_ops.minimum(maxlength, output_size)
+    if weights is not None:
+      weights = ops.convert_to_tensor(weights, name="weights")
+      return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
+    weights = constant_op.constant([], dtype)
+    return gen_math_ops.bincount(arr, output_size, weights)
+
+
+@tf_export(v1=["math.bincount", "bincount"])
 @deprecation.deprecated_endpoints("bincount")
-def bincount(arr,
-             weights=None,
-             minlength=None,
-             maxlength=None,
-             dtype=dtypes.int32):
+def bincount_v1(arr,
+                weights=None,
+                minlength=None,
+                maxlength=None,
+                dtype=dtypes.int32):
   """Counts the number of occurrences of each value in an integer array.
 
   If `minlength` and `maxlength` are not given, returns a vector with length
@@ -2594,34 +2871,19 @@ def bincount(arr,
   Args:
     arr: An int32 tensor of non-negative values.
     weights: If non-None, must be the same shape as arr. For each value in
-        `arr`, the bin will be incremented by the corresponding weight instead
-        of 1.
+      `arr`, the bin will be incremented by the corresponding weight instead of
+      1.
     minlength: If given, ensures the output has length at least `minlength`,
-        padding with zeros at the end if necessary.
+      padding with zeros at the end if necessary.
     maxlength: If given, skips values in `arr` that are equal or greater than
-        `maxlength`, ensuring that the output has length at most `maxlength`.
+      `maxlength`, ensuring that the output has length at most `maxlength`.
     dtype: If `weights` is None, determines the type of the output bins.
 
   Returns:
     A vector with the same dtype as `weights` or the given `dtype`. The bin
     values.
   """
-  arr = ops.convert_to_tensor(arr, name="arr", dtype=dtypes.int32)
-  array_is_nonempty = reduce_prod(array_ops.shape(arr)) > 0
-  output_size = cast(array_is_nonempty, dtypes.int32) * (reduce_max(arr) + 1)
-  if minlength is not None:
-    minlength = ops.convert_to_tensor(
-        minlength, name="minlength", dtype=dtypes.int32)
-    output_size = gen_math_ops.maximum(minlength, output_size)
-  if maxlength is not None:
-    maxlength = ops.convert_to_tensor(
-        maxlength, name="maxlength", dtype=dtypes.int32)
-    output_size = gen_math_ops.minimum(maxlength, output_size)
-  if weights is not None:
-    weights = ops.convert_to_tensor(weights, name="weights")
-    return gen_math_ops.unsorted_segment_sum(weights, arr, output_size)
-  weights = constant_op.constant([], dtype)
-  return gen_math_ops.bincount(arr, output_size, weights)
+  return bincount(arr, weights, minlength, maxlength, dtype)
 
 
 @tf_export("math.cumsum", "cumsum")
@@ -2730,6 +2992,7 @@ def cumprod(x, axis=0, exclusive=False, reverse=False, name=None):
 
 
 @tf_export("math.conj", v1=["math.conj", "conj"])
+@dispatch.add_dispatch_support
 @deprecation.deprecated_endpoints("conj")
 def conj(x, name=None):
   r"""Returns the complex conjugate of a complex number.
@@ -2834,6 +3097,7 @@ def _unsorted_segment_N(data, segment_ids, num_segments):
     "math.unsorted_segment_mean",
     v1=["math.unsorted_segment_mean", "unsorted_segment_mean"])
 @deprecation.deprecated_endpoints("unsorted_segment_mean")
+@dispatch.add_dispatch_support
 def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
   r"""Computes the mean along segments of a tensor.
 
@@ -2879,6 +3143,7 @@ def unsorted_segment_mean(data, segment_ids, num_segments, name=None):
     "math.unsorted_segment_sqrt_n",
     v1=["math.unsorted_segment_sqrt_n", "unsorted_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("unsorted_segment_sqrt_n")
+@dispatch.add_dispatch_support
 def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
   r"""Computes the sum along segments of a tensor divided by the sqrt(N).
 
@@ -2923,8 +3188,7 @@ def unsorted_segment_sqrt_n(data, segment_ids, num_segments, name=None):
     return summed / gen_math_ops.sqrt(N)
 
 
-@tf_export(
-    "sparse.segment_sum", v1=["sparse.segment_sum", "sparse_segment_sum"])
+@tf_export(v1=["sparse.segment_sum", "sparse_segment_sum"])
 @deprecation.deprecated_endpoints("sparse_segment_sum")
 def sparse_segment_sum(data, indices, segment_ids, name=None,
                        num_segments=None):
@@ -2998,8 +3262,17 @@ def sparse_segment_sum(data, indices, segment_ids, name=None,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export(
-    "sparse.segment_mean", v1=["sparse.segment_mean", "sparse_segment_mean"])
+@tf_export("sparse.segment_sum", v1=[])
+def sparse_segment_sum_v2(data,
+                          indices,
+                          segment_ids,
+                          num_segments=None,
+                          name=None):
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_mean", "sparse_segment_mean"])
 @deprecation.deprecated_endpoints("sparse_segment_mean")
 def sparse_segment_mean(data,
                         indices,
@@ -3045,9 +3318,44 @@ def sparse_segment_mean(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
-@tf_export(
-    "sparse.segment_sqrt_n",
-    v1=["sparse.segment_sqrt_n", "sparse_segment_sqrt_n"])
+@tf_export("sparse.segment_mean", v1=[])
+def sparse_segment_mean_v2(data,
+                           indices,
+                           segment_ids,
+                           num_segments=None,
+                           name=None):
+  r"""Computes the mean along sparse segments of a tensor.
+
+  Read [the section on
+  segmentation](https://tensorflow.org/api_guides/python/math_ops#Segmentation)
+  for an explanation of segments.
+
+  Like `SegmentMean`, but `segment_ids` can have rank less than `data`'s first
+  dimension, selecting a subset of dimension 0, specified by `indices`.
+  `segment_ids` is allowed to have missing ids, in which case the output will
+  be zeros at those indices. In those cases `num_segments` is used to determine
+  the size of the output.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_mean(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
+@tf_export(v1=["sparse.segment_sqrt_n", "sparse_segment_sqrt_n"])
 @deprecation.deprecated_endpoints("sparse_segment_sqrt_n")
 def sparse_segment_sqrt_n(data,
                           indices,
@@ -3085,6 +3393,35 @@ def sparse_segment_sqrt_n(data,
         data=data, indices=indices, segment_ids=segment_ids, name=name)
 
 
+@tf_export("sparse.segment_sqrt_n", v1=[])
+def sparse_segment_sqrt_n_v2(data,
+                             indices,
+                             segment_ids,
+                             num_segments=None,
+                             name=None):
+  r"""Computes the sum along sparse segments of a tensor divided by the sqrt(N).
+
+  `N` is the size of the segment being reduced.
+
+  Args:
+    data: A `Tensor` with data that will be assembled in the output.
+    indices: A 1-D `Tensor` with indices into `data`. Has same rank as
+      `segment_ids`.
+    segment_ids: A 1-D `Tensor` with indices into the output `Tensor`. Values
+      should be sorted and can be repeated.
+    num_segments: An optional int32 scalar. Indicates the size of the output
+      `Tensor`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `tensor` of the shape as data, except for dimension 0 which
+    has size `k`, the number of segments specified via `num_segments` or
+    inferred for the last element in `segments_ids`.
+  """
+  return sparse_segment_sqrt_n(
+      data, indices, segment_ids, name=name, num_segments=num_segments)
+
+
 @tf_export("tensordot", "linalg.tensordot")
 def tensordot(a, b, axes, name=None):
   r"""Tensor contraction of a and b along specified axes.
@@ -3118,12 +3455,11 @@ def tensordot(a, b, axes, name=None):
     a: `Tensor` of type `float32` or `float64`.
     b: `Tensor` with the same type as `a`.
     axes: Either a scalar `N`, or a list or an `int32` `Tensor` of shape [2, k].
-     If axes is a scalar, sum over the last N axes of a and the first N axes
-     of b in order.
-     If axes is a list or `Tensor` the first and second row contain the set of
-     unique integers specifying axes along which the contraction is computed,
-     for `a` and `b`, respectively. The number of axes for `a` and `b` must
-     be equal.
+      If axes is a scalar, sum over the last N axes of a and the first N axes of
+      b in order. If axes is a list or `Tensor` the first and second row contain
+      the set of unique integers specifying axes along which the contraction is
+      computed, for `a` and `b`, respectively. The number of axes for `a` and
+      `b` must be equal.
     name: A name for the operation (optional).
 
   Returns:
@@ -3295,73 +3631,3 @@ def polyval(coeffs, x, name=None):
     for c in coeffs[1:]:
       p = c + p * x
     return p
-
-
-@tf_export("math.bessel_i0e")
-def bessel_i0e(x, name=None):
-  """Computes the Bessel i0e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 0 defined as
-  `bessel_i0e(x) = exp(-abs(x)) bessel_i0(x)`.
-
-  This function is faster and numerically stabler than `bessel_i0(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i0e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i0e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i0e = gen_math_ops.bessel_i0e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i0e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i0e(x, name=name)
-
-
-@tf_export("math.bessel_i1e")
-def bessel_i1e(x, name=None):
-  """Computes the Bessel i1e function of `x` element-wise.
-
-  Exponentially scaled modified Bessel function of order 1 defined as
-  `bessel_i1e(x) = exp(-abs(x)) bessel_i1(x)`.
-
-  This function is faster and numerically stabler than `bessel_i1(x)`.
-
-  Args:
-    x: A `Tensor` or `SparseTensor`. Must be one of the following types: `half`,
-      `float32`, `float64`.
-    name: A name for the operation (optional).
-
-  Returns:
-    A `Tensor` or `SparseTensor`, respectively. Has the same type as `x`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.special.i1e
-  @end_compatibility
-  """
-  with ops.name_scope(name, "bessel_i1e", [x]) as name:
-    if isinstance(x, sparse_tensor.SparseTensor):
-      x_i1e = gen_math_ops.bessel_i1e(x.values, name=name)
-      return sparse_tensor.SparseTensor(
-          indices=x.indices, values=x_i1e, dense_shape=x.dense_shape)
-    else:
-      return gen_math_ops.bessel_i1e(x, name=name)
-
-
-# FFT ops were moved to tf.spectral. tf.fft symbols were part of the TensorFlow
-# 1.0 API so we leave these here for backwards compatibility.
-fft = gen_spectral_ops.fft
-ifft = gen_spectral_ops.ifft
-fft2d = gen_spectral_ops.fft2d
-ifft2d = gen_spectral_ops.ifft2d
-fft3d = gen_spectral_ops.fft3d
-ifft3d = gen_spectral_ops.ifft3d
diff --git a/tensorflow/python/ops/math_ops_test.py b/tensorflow/python/ops/math_ops_test.py
index a4da0c6c33959511f5f713ab7c7cd246b198b081..e185dbcd230906270b6c92fe70e6a350c34f030f 100644
--- a/tensorflow/python/ops/math_ops_test.py
+++ b/tensorflow/python/ops/math_ops_test.py
@@ -92,6 +92,7 @@ class ReduceTest(test_util.TensorFlowTestCase):
 
 class LogSumExpTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testReduceLogSumExp(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
@@ -104,22 +105,23 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
       with self.cached_session(use_gpu=True):
-        y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=[0])
+        y_tf = math_ops.reduce_logsumexp(x_np, axis=[0])
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
   def testReductionIndices2(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
       with self.cached_session(use_gpu=True):
-        y_tf = math_ops.reduce_logsumexp(x_np, reduction_indices=0)
+        y_tf = math_ops.reduce_logsumexp(x_np, axis=0)
         y_np = log(np.sum(exp(x_np), axis=0))
         self.assertShapeEqual(y_np, y_tf)
-        y_tf_np = y_tf.eval()
+        y_tf_np = self.evaluate(y_tf)
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testKeepDims(self):
     for dtype in [np.float16, np.float32, np.double]:
       x_np = np.random.rand(5, 5).astype(dtype)
@@ -129,6 +131,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_np = log(np.sum(exp(x_np), keepdims=True))
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testOverflow(self):
     x = [1000, 1001, 1002, 1003]
     for dtype in [np.float16, np.float32, np.double]:
@@ -146,6 +149,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_np = log(np.sum(exp(x_np - max_np))) + max_np
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testUnderflow(self):
     x = [-1000, -1001, -1002, -1003]
     for dtype in [np.float16, np.float32, np.double]:
@@ -163,6 +167,7 @@ class LogSumExpTest(test_util.TensorFlowTestCase):
         y_np = log(np.sum(exp(x_np - max_np))) + max_np
         self.assertAllClose(y_tf_np, y_np)
 
+  @test_util.run_deprecated_v1
   def testInfinity(self):
     with self.session(use_gpu=True):
       res = math_ops.reduce_logsumexp(-np.inf).eval()
@@ -186,6 +191,7 @@ class RoundTest(test_util.TensorFlowTestCase):
 
 class ModTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     x = [0.5, 0.7, 0.3]
     for dtype in [np.float32, np.double]:
@@ -195,7 +201,7 @@ class ModTest(test_util.TensorFlowTestCase):
         with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.fmod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np, atol=1e-2)
 
@@ -208,7 +214,7 @@ class ModTest(test_util.TensorFlowTestCase):
         with self.cached_session(use_gpu=True):
           x_tf = constant_op.constant(x_np, shape=x_np.shape)
           y_tf = math_ops.mod(x_tf, denom)
-          y_tf_np = y_tf.eval()
+          y_tf_np = self.evaluate(y_tf)
           y_np = np.mod(x_np, denom)
         self.assertAllClose(y_tf_np, y_np)
 
@@ -217,7 +223,7 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
 
   @test_util.run_in_graph_and_eager_modes
   def testSquaredDifference(self):
-    for dtype in [np.int32, np.float16]:
+    for dtype in [np.float16, np.float32, np.float64, np.int32, np.int64]:
       x = np.array([[1, 2, 3], [4, 5, 6]], dtype=dtype)
       y = np.array([-3, -2, -1], dtype=dtype)
       z = (x - y) * (x - y)
@@ -225,6 +231,17 @@ class SquaredDifferenceTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.squared_difference(x, y))
         self.assertAllClose(z, z_tf)
 
+  @test_util.run_in_graph_and_eager_modes()
+  def testComplexSquaredDifference(self):
+    for dtype in [np.complex64, np.complex128]:
+      x = np.array([[1 + 3j, 2 + 2j, 3 + 1j], [4 - 1j, 5 - 2j, 6 - 3j]],
+                   dtype=dtype)
+      y = np.array([-3 + 1j, -2 + 2j, -1 + 3j], dtype=dtype)
+      z = np.conj(x - y) * (x - y)
+      with test_util.device(use_gpu=False):
+        z_tf = self.evaluate(math_ops.squared_difference(x, y))
+        self.assertAllClose(z, z_tf)
+
 
 class ApproximateEqualTest(test_util.TensorFlowTestCase):
 
@@ -256,6 +273,7 @@ class ApproximateEqualTest(test_util.TensorFlowTestCase):
         z_tf = self.evaluate(math_ops.approximate_equal(x, y, tolerance=0.0001))
         self.assertAllEqual(z, z_tf)
 
+  @test_util.run_deprecated_v1
   def testApproximateEqualShape(self):
     for dtype in [np.float32, np.double]:
       x = np.array([1, 2], dtype=dtype)
@@ -309,6 +327,7 @@ class ScalarMulTest(test_util.TensorFlowTestCase):
 
 class AccumulateNTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     x = [np.random.random((1, 2, 3, 4, 5)) - 0.5 for _ in range(5)]
@@ -317,6 +336,7 @@ class AccumulateNTest(test_util.TensorFlowTestCase):
       self.assertAllClose(sum(x), math_ops.accumulate_n(tf_x).eval())
       self.assertAllClose(x[0] * 5, math_ops.accumulate_n([tf_x[0]] * 5).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     x = [np.random.randint(-128, 128, (5, 4, 3, 2, 1)) for _ in range(6)]
@@ -328,6 +348,7 @@ class AccumulateNTest(test_util.TensorFlowTestCase):
 
 class AddNTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testPartials(self):
     """Test that previously revealed a bug in buffer forwarding for AddN."""
     partials = []
@@ -341,6 +362,7 @@ class AddNTest(test_util.TensorFlowTestCase):
     with self.session(use_gpu=True):
       self.assertAllEqual(res.eval(), 100)
 
+  @test_util.run_deprecated_v1
   def testFloat(self):
     np.random.seed(12345)
     for num_inputs in range(1, 10):
@@ -351,6 +373,7 @@ class AddNTest(test_util.TensorFlowTestCase):
         self.assertAllClose(x[0] * num_inputs,
                             math_ops.add_n([tf_x[0]] * num_inputs).eval())
 
+  @test_util.run_deprecated_v1
   def testInt(self):
     np.random.seed(54321)
     for num_inputs in range(1, 10):
@@ -364,6 +387,7 @@ class AddNTest(test_util.TensorFlowTestCase):
         self.assertAllEqual(x[0] * num_inputs,
                             math_ops.add_n([tf_x[0]] * num_inputs).eval())
 
+  @test_util.run_deprecated_v1
   def testGrad(self):
     np.random.seed(42)
     for num_inputs in range(1, 10):
@@ -373,7 +397,7 @@ class AddNTest(test_util.TensorFlowTestCase):
             for i in range(0, num_inputs)
         ]
         addn = math_ops.add_n(input_vars)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         add_n_grad = gradients.gradients(addn, input_vars)
         self.assertAllEqual(np.repeat(1.0, num_inputs), # d/dx (x + y + ...) = 1
                             [g.eval() for g in add_n_grad])
@@ -392,6 +416,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
     divs = np.arange(-3, 0, .25).reshape(1, 12)
     return nums, divs
 
+  @test_util.run_deprecated_v1
   def testFloorModInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -401,6 +426,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = nums % divs
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testFloorModFloat(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -412,6 +438,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               % array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testTruncateModInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -419,6 +446,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testTruncateModFloat(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -426,6 +454,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.fmod(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testDivideInt(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -437,12 +466,14 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       #               // array_ops.constant(divs)).eval()
       # self.assertAllEqual(tf2_result, tf_result)
 
+  @test_util.run_deprecated_v1
   def testDivideName(self):
     with self.cached_session():
       op = math_ops.divide(
           array_ops.constant(3), array_ops.constant(4), name="my_cool_divide")
       self.assertEqual(op.name, "my_cool_divide:0")
 
+  @test_util.run_deprecated_v1
   def testRealDiv(self):
     nums, divs = self.floatTestData()
     with self.cached_session():
@@ -450,26 +481,30 @@ class DivAndModTest(test_util.TensorFlowTestCase):
       np_result = np.divide(nums, divs)
       self.assertAllEqual(tf_result, np_result)
 
+  @test_util.run_deprecated_v1
   def testComplexDiv(self):
     foo = array_ops.constant([1. + 3.j])
     with self.cached_session():
       _ = math_ops.divide(foo, 1.).eval()
       _ = math_ops.div(foo, 2.).eval()
 
+  @test_util.run_deprecated_v1
   def testFloorDivGrad(self):
     with self.cached_session():
       a = variables.Variable(2.)
       b = variables.Variable(4.)
       with self.cached_session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         c_grad = gradients.gradients(math_ops.divide(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.div(a, b), [a, b])
         self.assertAllEqual([x.eval() for x in c_grad], [.25, -.125])
         c_grad = gradients.gradients(math_ops.floordiv(a, b), [a, b])
-        self.assertAllEqual([None if x is None else x.eval()
-                             for x in c_grad], [None, None])
+        self.assertAllEqual(
+            [None if x is None else self.evaluate(x) for x in c_grad],
+            [None, None])
 
+  @test_util.run_deprecated_v1
   def testConsistent(self):
     nums, divs = self.intTestData()
     with self.cached_session():
@@ -496,6 +531,7 @@ class DivAndModTest(test_util.TensorFlowTestCase):
 
 class DivNoNanTest(test_util.TensorFlowTestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for dtype in [np.float32, np.float64]:
       nums = np.arange(-10, 10, .25, dtype=dtype).reshape(80, 1)
diff --git a/tensorflow/python/ops/metrics_impl.py b/tensorflow/python/ops/metrics_impl.py
index e86a3b85360ae2d65fa2013556cfce7dcc250d06..ec39b1790e340a0d194dea8ab3419ca78fc9d126 100644
--- a/tensorflow/python/ops/metrics_impl.py
+++ b/tensorflow/python/ops/metrics_impl.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.compat import compat
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -35,7 +35,6 @@ from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import weights_broadcast_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
@@ -213,26 +212,6 @@ def _maybe_expand_labels(labels, predictions):
         lambda: array_ops.expand_dims(labels, -1, name=scope), lambda: labels)
 
 
-def _safe_div(numerator, denominator, name):
-  """Divides two tensors element-wise, returning 0 if the denominator is <= 0.
-
-  Args:
-    numerator: A real `Tensor`.
-    denominator: A real `Tensor`, with dtype matching `numerator`.
-    name: Name for the returned op.
-
-  Returns:
-    0 if `denominator` <= 0, else `numerator` / `denominator`
-  """
-  if compat.forward_compatible(2018, 11, 1):
-    return math_ops.div_no_nan(numerator, denominator, name=name)
-  t = math_ops.truediv(numerator, denominator)
-  zero = array_ops.zeros_like(t, dtype=denominator.dtype)
-  condition = math_ops.greater(denominator, zero)
-  zero = math_ops.cast(zero, t.dtype)
-  return array_ops.where(condition, t, zero, name=name)
-
-
 def _safe_scalar_div(numerator, denominator, name):
   """Divides two values, returning 0 if the denominator is 0.
 
@@ -246,7 +225,7 @@ def _safe_scalar_div(numerator, denominator, name):
   """
   numerator.get_shape().with_rank_at_most(1)
   denominator.get_shape().with_rank_at_most(1)
-  return _safe_div(numerator, denominator, name=name)
+  return math_ops.div_no_nan(numerator, denominator, name=name)
 
 
 def _streaming_confusion_matrix(labels, predictions, num_classes, weights=None):
@@ -302,7 +281,7 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
   """Aggregate metric value across replicas."""
   def fn(distribution, *a):
     """Call `metric_value_fn` in the correct control flow context."""
-    if hasattr(distribution, '_outer_control_flow_context'):
+    if hasattr(distribution.extended, '_outer_control_flow_context'):
       # If there was an outer context captured before this method was called,
       # then we enter that context to create the metric value op. If the
       # caputred context is `None`, ops.control_dependencies(None) gives the
@@ -315,13 +294,13 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
       # once the update ops have been evaluted.
 
       # pylint: disable=protected-access
-      if distribution._outer_control_flow_context is None:
+      if distribution.extended._outer_control_flow_context is None:
         with ops.control_dependencies(None):
           metric_value = metric_value_fn(distribution, *a)
       else:
-        distribution._outer_control_flow_context.Enter()
+        distribution.extended._outer_control_flow_context.Enter()
         metric_value = metric_value_fn(distribution, *a)
-        distribution._outer_control_flow_context.Exit()
+        distribution.extended._outer_control_flow_context.Exit()
         # pylint: enable=protected-access
     else:
       metric_value = metric_value_fn(distribution, *a)
@@ -330,10 +309,10 @@ def _aggregate_across_replicas(metrics_collections, metric_value_fn, *args):
     return metric_value
 
   return distribution_strategy_context.get_replica_context().merge_call(
-      fn, *args)
+      fn, args=args)
 
 
-@tf_export('metrics.mean')
+@tf_export(v1=['metrics.mean'])
 def mean(values,
          weights=None,
          metrics_collections=None,
@@ -401,13 +380,12 @@ def mean(values,
       update_count_op = state_ops.assign_add(count, num_values)
 
     def compute_mean(_, t, c):
-      return _safe_div(t, math_ops.maximum(c, 0), name='value')
+      return math_ops.div_no_nan(t, math_ops.maximum(c, 0), name='value')
 
     mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
-    update_op = _safe_div(update_total_op,
-                          math_ops.maximum(update_count_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
 
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
@@ -415,7 +393,7 @@ def mean(values,
     return mean_t, update_op
 
 
-@tf_export('metrics.accuracy')
+@tf_export(v1=['metrics.accuracy'])
 def accuracy(labels,
              predictions,
              weights=None,
@@ -647,7 +625,7 @@ def _aggregate_variable(v, collections):
   return _aggregate_across_replicas(collections, f, v)
 
 
-@tf_export('metrics.auc')
+@tf_export(v1=['metrics.auc'])
 def auc(labels,
         predictions,
         weights=None,
@@ -779,19 +757,19 @@ def auc(labels,
       """
       dtp = tp[:num_thresholds - 1] - tp[1:]
       p = tp + fp
-      prec_slope = _safe_div(
+      prec_slope = math_ops.div_no_nan(
           dtp,
           math_ops.maximum(p[:num_thresholds - 1] - p[1:], 0),
           name='prec_slope')
       intercept = tp[1:] - math_ops.multiply(prec_slope, p[1:])
       safe_p_ratio = array_ops.where(
           math_ops.logical_and(p[:num_thresholds - 1] > 0, p[1:] > 0),
-          _safe_div(p[:num_thresholds - 1],
-                    math_ops.maximum(p[1:], 0),
-                    name='recall_relative_ratio'),
-          array_ops.ones_like(p[1:]))
+          math_ops.div_no_nan(
+              p[:num_thresholds - 1],
+              math_ops.maximum(p[1:], 0),
+              name='recall_relative_ratio'), array_ops.ones_like(p[1:]))
       return math_ops.reduce_sum(
-          _safe_div(
+          math_ops.div_no_nan(
               prec_slope * (dtp + intercept * math_ops.log(safe_p_ratio)),
               math_ops.maximum(tp[1:] + fn[1:], 0),
               name='pr_auc_increment'),
@@ -852,7 +830,7 @@ def auc(labels,
     return auc_value, update_op
 
 
-@tf_export('metrics.mean_absolute_error')
+@tf_export(v1=['metrics.mean_absolute_error'])
 def mean_absolute_error(labels,
                         predictions,
                         weights=None,
@@ -913,7 +891,7 @@ def mean_absolute_error(labels,
               updates_collections, name or 'mean_absolute_error')
 
 
-@tf_export('metrics.mean_cosine_distance')
+@tf_export(v1=['metrics.mean_cosine_distance'])
 def mean_cosine_distance(labels,
                          predictions,
                          dim,
@@ -970,7 +948,7 @@ def mean_cosine_distance(labels,
       predictions=predictions, labels=labels, weights=weights)
   radial_diffs = math_ops.multiply(predictions, labels)
   radial_diffs = math_ops.reduce_sum(
-      radial_diffs, reduction_indices=[
+      radial_diffs, axis=[
           dim,
       ], keepdims=True)
   mean_distance, update_op = mean(radial_diffs, weights, None, None, name or
@@ -987,7 +965,7 @@ def mean_cosine_distance(labels,
   return mean_distance, update_op
 
 
-@tf_export('metrics.mean_per_class_accuracy')
+@tf_export(v1=['metrics.mean_per_class_accuracy'])
 def mean_per_class_accuracy(labels,
                             predictions,
                             num_classes,
@@ -1074,7 +1052,7 @@ def mean_per_class_accuracy(labels,
     update_count_op = state_ops.scatter_add(count, labels, is_correct)
 
     def compute_mean_accuracy(_, count, total):
-      per_class_accuracy = _safe_div(
+      per_class_accuracy = math_ops.div_no_nan(
           count, math_ops.maximum(total, 0), name=None)
       mean_accuracy_v = math_ops.reduce_mean(
           per_class_accuracy, name='mean_accuracy')
@@ -1083,16 +1061,15 @@ def mean_per_class_accuracy(labels,
     mean_accuracy_v = _aggregate_across_replicas(
         metrics_collections, compute_mean_accuracy, count, total)
 
-    update_op = _safe_div(update_count_op,
-                          math_ops.maximum(update_total_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_count_op, math_ops.maximum(update_total_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
     return mean_accuracy_v, update_op
 
 
-@tf_export('metrics.mean_iou')
+@tf_export(v1=['metrics.mean_iou'])
 def mean_iou(labels,
              predictions,
              num_classes,
@@ -1193,7 +1170,7 @@ def mean_iou(labels,
     return mean_iou_v, update_op
 
 
-@tf_export('metrics.mean_relative_error')
+@tf_export(v1=['metrics.mean_relative_error'])
 def mean_relative_error(labels,
                         predictions,
                         normalizer,
@@ -1262,7 +1239,7 @@ def mean_relative_error(labels,
               updates_collections, name or 'mean_relative_error')
 
 
-@tf_export('metrics.mean_squared_error')
+@tf_export(v1=['metrics.mean_squared_error'])
 def mean_squared_error(labels,
                        predictions,
                        weights=None,
@@ -1323,7 +1300,7 @@ def mean_squared_error(labels,
               name or 'mean_squared_error')
 
 
-@tf_export('metrics.mean_tensor')
+@tf_export(v1=['metrics.mean_tensor'])
 def mean_tensor(values,
                 weights=None,
                 metrics_collections=None,
@@ -1394,22 +1371,21 @@ def mean_tensor(values,
     with ops.control_dependencies([values]):
       update_count_op = state_ops.assign_add(count, num_values)
 
-    compute_mean = lambda _, t, c: _safe_div(
+    compute_mean = lambda _, t, c: math_ops.div_no_nan(  # pylint: disable=g-long-lambda
         t, math_ops.maximum(c, 0), name='value')
 
     mean_t = _aggregate_across_replicas(
         metrics_collections, compute_mean, total, count)
 
-    update_op = _safe_div(update_total_op,
-                          math_ops.maximum(update_count_op, 0),
-                          name='update_op')
+    update_op = math_ops.div_no_nan(
+        update_total_op, math_ops.maximum(update_count_op, 0), name='update_op')
     if updates_collections:
       ops.add_to_collections(updates_collections, update_op)
 
     return mean_t, update_op
 
 
-@tf_export('metrics.percentage_below')
+@tf_export(v1=['metrics.percentage_below'])
 def percentage_below(values,
                      threshold,
                      weights=None,
@@ -1509,7 +1485,7 @@ def _count_condition(values,
   return value_tensor, update_op
 
 
-@tf_export('metrics.false_negatives')
+@tf_export(v1=['metrics.false_negatives'])
 def false_negatives(labels,
                     predictions,
                     weights=None,
@@ -1561,7 +1537,7 @@ def false_negatives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.false_negatives_at_thresholds')
+@tf_export(v1=['metrics.false_negatives_at_thresholds'])
 def false_negatives_at_thresholds(labels,
                                   predictions,
                                   thresholds,
@@ -1617,7 +1593,7 @@ def false_negatives_at_thresholds(labels,
     return fn_value, update_ops['fn']
 
 
-@tf_export('metrics.false_positives')
+@tf_export(v1=['metrics.false_positives'])
 def false_positives(labels,
                     predictions,
                     weights=None,
@@ -1670,7 +1646,7 @@ def false_positives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.false_positives_at_thresholds')
+@tf_export(v1=['metrics.false_positives_at_thresholds'])
 def false_positives_at_thresholds(labels,
                                   predictions,
                                   thresholds,
@@ -1726,7 +1702,7 @@ def false_positives_at_thresholds(labels,
     return fp_value, update_ops['fp']
 
 
-@tf_export('metrics.true_negatives')
+@tf_export(v1=['metrics.true_negatives'])
 def true_negatives(labels,
                    predictions,
                    weights=None,
@@ -1779,7 +1755,7 @@ def true_negatives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.true_negatives_at_thresholds')
+@tf_export(v1=['metrics.true_negatives_at_thresholds'])
 def true_negatives_at_thresholds(labels,
                                  predictions,
                                  thresholds,
@@ -1835,7 +1811,7 @@ def true_negatives_at_thresholds(labels,
     return tn_value, update_ops['tn']
 
 
-@tf_export('metrics.true_positives')
+@tf_export(v1=['metrics.true_positives'])
 def true_positives(labels,
                    predictions,
                    weights=None,
@@ -1888,7 +1864,7 @@ def true_positives(labels,
                             updates_collections)
 
 
-@tf_export('metrics.true_positives_at_thresholds')
+@tf_export(v1=['metrics.true_positives_at_thresholds'])
 def true_positives_at_thresholds(labels,
                                  predictions,
                                  thresholds,
@@ -1944,7 +1920,7 @@ def true_positives_at_thresholds(labels,
     return tp_value, update_ops['tp']
 
 
-@tf_export('metrics.precision')
+@tf_export(v1=['metrics.precision'])
 def precision(labels,
               predictions,
               weights=None,
@@ -2039,7 +2015,7 @@ def precision(labels,
     return p, update_op
 
 
-@tf_export('metrics.precision_at_thresholds')
+@tf_export(v1=['metrics.precision_at_thresholds'])
 def precision_at_thresholds(labels,
                             predictions,
                             thresholds,
@@ -2120,7 +2096,7 @@ def precision_at_thresholds(labels,
     return prec, update_op
 
 
-@tf_export('metrics.recall')
+@tf_export(v1=['metrics.recall'])
 def recall(labels,
            predictions,
            weights=None,
@@ -2471,7 +2447,7 @@ def _streaming_sparse_false_negative_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fn, name='update')
 
 
-@tf_export('metrics.recall_at_k')
+@tf_export(v1=['metrics.recall_at_k'])
 def recall_at_k(labels,
                 predictions,
                 k,
@@ -2564,7 +2540,7 @@ def recall_at_k(labels,
         name=scope)
 
 
-@tf_export('metrics.recall_at_top_k')
+@tf_export(v1=['metrics.recall_at_top_k'])
 def recall_at_top_k(labels,
                     predictions_idx,
                     k=None,
@@ -2648,7 +2624,7 @@ def recall_at_top_k(labels,
     return metric, update
 
 
-@tf_export('metrics.recall_at_thresholds')
+@tf_export(v1=['metrics.recall_at_thresholds'])
 def recall_at_thresholds(labels,
                          predictions,
                          thresholds,
@@ -2726,7 +2702,7 @@ def recall_at_thresholds(labels,
     return rec, update_op
 
 
-@tf_export('metrics.root_mean_squared_error')
+@tf_export(v1=['metrics.root_mean_squared_error'])
 def root_mean_squared_error(labels,
                             predictions,
                             weights=None,
@@ -2797,7 +2773,7 @@ def root_mean_squared_error(labels,
   return rmse, update_rmse_op
 
 
-@tf_export('metrics.sensitivity_at_specificity')
+@tf_export(v1=['metrics.sensitivity_at_specificity'])
 def sensitivity_at_specificity(labels,
                                predictions,
                                specificity,
@@ -3069,7 +3045,7 @@ def _sparse_average_precision_at_top_k(labels, predictions_idx):
 
     # Reduce along k dimension to get the sum, yielding a [D1, ... DN] tensor.
     precision_sum = math_ops.reduce_sum(
-        relevant_precision_per_k, reduction_indices=(-1,), name='precision_sum')
+        relevant_precision_per_k, axis=(-1,), name='precision_sum')
 
     # Divide by number of relevant items to get average precision. These are
     # the "num_relevant_items" and "AveP" terms from the formula above.
@@ -3170,7 +3146,7 @@ def _streaming_sparse_average_precision_at_top_k(labels,
     return mean_average_precision, update
 
 
-@tf_export('metrics.sparse_average_precision_at_k')
+@tf_export(v1=['metrics.sparse_average_precision_at_k'])
 @deprecated(None, 'Use average_precision_at_k instead')
 def sparse_average_precision_at_k(labels,
                                   predictions,
@@ -3190,7 +3166,7 @@ def sparse_average_precision_at_k(labels,
       name=name)
 
 
-@tf_export('metrics.average_precision_at_k')
+@tf_export(v1=['metrics.average_precision_at_k'])
 def average_precision_at_k(labels,
                            predictions,
                            k,
@@ -3364,7 +3340,7 @@ def _streaming_sparse_false_positive_at_k(labels,
     return var, state_ops.assign_add(var, batch_total_fp, name='update')
 
 
-@tf_export('metrics.precision_at_top_k')
+@tf_export(v1=['metrics.precision_at_top_k'])
 def precision_at_top_k(labels,
                        predictions_idx,
                        k=None,
@@ -3453,7 +3429,7 @@ def precision_at_top_k(labels,
     return metric, update
 
 
-@tf_export('metrics.sparse_precision_at_k')
+@tf_export(v1=['metrics.sparse_precision_at_k'])
 @deprecated(None, 'Use precision_at_k instead')
 def sparse_precision_at_k(labels,
                           predictions,
@@ -3475,7 +3451,7 @@ def sparse_precision_at_k(labels,
       name=name)
 
 
-@tf_export('metrics.precision_at_k')
+@tf_export(v1=['metrics.precision_at_k'])
 def precision_at_k(labels,
                    predictions,
                    k,
@@ -3569,7 +3545,7 @@ def precision_at_k(labels,
         name=scope)
 
 
-@tf_export('metrics.specificity_at_sensitivity')
+@tf_export(v1=['metrics.specificity_at_sensitivity'])
 def specificity_at_sensitivity(labels,
                                predictions,
                                sensitivity,
diff --git a/tensorflow/python/ops/nccl_ops_test.py b/tensorflow/python/ops/nccl_ops_test.py
index 1b496fec4739a8c8f05e257a4e44e01605774d29..3b2e2b0175f109bf698cf52e695d452ae5eae3ec 100644
--- a/tensorflow/python/ops/nccl_ops_test.py
+++ b/tensorflow/python/ops/nccl_ops_test.py
@@ -102,7 +102,7 @@ class NcclTestCase(test.TestCase):
             continue
 
           # Test execution and results.
-          for t in sess.run(result_tensors):
+          for t in self.evaluate(result_tensors):
             self.assertAllClose(t, np_ans)
 
   def _TestGradient(self, nccl_reduce, numpy_fn):
diff --git a/tensorflow/python/ops/nn_batchnorm_test.py b/tensorflow/python/ops/nn_batchnorm_test.py
index c8a5b58e4584e6deeb33380b53c02be564989206..e978f1d32601890f8eb9b54fdd5738f626b7f863 100644
--- a/tensorflow/python/ops/nn_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_batchnorm_test.py
@@ -71,6 +71,7 @@ class BatchNormalizationTest(test.TestCase):
                                        gamma if scale_after_normalization else
                                        None, epsilon)
 
+  @test_util.run_deprecated_v1
   def testBatchNorm(self):
     x_shape = [3, 5, 4, 2]
     param_shape = [2]
@@ -169,16 +170,20 @@ class BatchNormalizationTest(test.TestCase):
                                       shift_after_normalization, v,
                                       err_tolerance)
 
+  @test_util.run_deprecated_v1
   def testBatchNormInputGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(0, "x")
 
+  @test_util.run_deprecated_v1
   def testBatchNormMeanGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(1, "mean")
 
+  @test_util.run_deprecated_v1
   def testBatchNormVarianceGradient(self):
     self._testBatchNormGradientInAllNeedConfigs(
         2, "variance", err_tolerance=1e-03)
 
+  @test_util.run_deprecated_v1
   def testBatchNormBetaGradient(self):
     # Since beta does not exist when scale_after_normalization=False, we only
     # test for scale_after_normalization=True.
@@ -187,6 +192,7 @@ class BatchNormalizationTest(test.TestCase):
         self._testBatchNormGradient(3, "beta", scale_after_normalization, True,
                                     v)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGammaGradient(self):
     # If scale_after_normalization is False, backprop for gamma in v1
     # will be 0. In version 2 of the API, if scale_after_normalization is False,
@@ -199,6 +205,7 @@ class BatchNormalizationTest(test.TestCase):
       self._testBatchNormGradient(4, "gamma", True, shift_after_normalization,
                                   2)
 
+  @test_util.run_deprecated_v1
   def testBatchNormGradImpl(self):
     x_shape = [7, 5, 4, 6]
     param_shape = [6]
@@ -235,15 +242,17 @@ class BatchNormalizationTest(test.TestCase):
           odx, odm, odv, odb, odg = gradients_impl.gradients(
               [on], [x, m, v, beta, gamma], [backprop])
           if scale_after_normalization:
-            all_grads = sess.run([dx, dm, dv, db, dg, odx, odm, odv, odb, odg])
+            all_grads = self.evaluate(
+                [dx, dm, dv, db, dg, odx, odm, odv, odb, odg])
             to_check = ["dx", "dm", "dv", "db", "dg"]
           else:
-            all_grads = sess.run([dx, dm, dv, db, odx, odm, odv, odb])
+            all_grads = self.evaluate([dx, dm, dv, db, odx, odm, odv, odb])
             to_check = ["dx", "dm", "dv", "db"]
           for i, _ in enumerate(to_check):
             self.assertAllClose(
                 all_grads[i + len(to_check)], all_grads[i], atol=0.000001)
 
+  @test_util.run_deprecated_v1
   def testBatchNormKeepDims(self):
     """Test for tf.nn.moments(..., keep_dims=True / False).
 
@@ -318,7 +327,7 @@ class BatchNormalizationTest(test.TestCase):
                                               gamma_val, epsilon,
                                               scale_after_normalization,
                                               shift_after_normalization)
-            [tf_batch_norm] = sess.run([bn])
+            [tf_batch_norm] = self.evaluate([bn])
             self.assertEquals(x_shape, np_batch_norm.shape)
             self.assertEquals(x_shape, tf_batch_norm.shape)
             self.assertAllClose(np_batch_norm, tf_batch_norm, atol=atol)
@@ -371,9 +380,9 @@ class SufficientStatisticsTest(test.TestCase):
           x.set_shape(x_shape)
           op_c, op_m, op_v, op_s = self._opSuffStats(x, axes, shift, keep_dims)
           if shift:
-            tf_c, tf_m, tf_v, tf_s = sess.run([op_c, op_m, op_v, op_s])
+            tf_c, tf_m, tf_v, tf_s = self.evaluate([op_c, op_m, op_v, op_s])
           else:
-            tf_c, tf_m, tf_v = sess.run([op_c, op_m, op_v])
+            tf_c, tf_m, tf_v = self.evaluate([op_c, op_m, op_v])
         else:
           x = array_ops.placeholder(
               dtype=dtypes.float32, shape=[None] * len(x_shape), name="x")
@@ -390,6 +399,7 @@ class SufficientStatisticsTest(test.TestCase):
         if shift:
           self.assertAllClose(np_s, tf_s, atol=0.000001)
 
+  @test_util.run_deprecated_v1
   def testSuffStats(self):
     for has_shape in [True, False]:
       for keep_dims in [True, False]:
@@ -432,7 +442,7 @@ class NormalizeMomentsTest(test.TestCase):
           tf_shift_v = None
         opm, opv = self._opNormalizeMoments(tf_counts, tf_mean_ss,
                                             tf_variance_ss, tf_shift_v)
-        tfm, tfv = sess.run([opm, opv])
+        tfm, tfv = self.evaluate([opm, opv])
         self.assertAllClose(npm, tfm, atol=0.000001)
         self.assertAllClose(npv, tfv, atol=0.000001)
 
@@ -507,9 +517,10 @@ class MomentsTest(test.TestCase):
       expected_variance = expected_x_squared - expected_mean_squared
 
       # Check that the moments are correct.
-      self.assertAllCloseAccordingToType(expected_mean, mean.eval())
-      self.assertAllCloseAccordingToType(expected_variance, var.eval())
+      self.assertAllCloseAccordingToType(expected_mean, self.evaluate(mean))
+      self.assertAllCloseAccordingToType(expected_variance, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -518,6 +529,7 @@ class MomentsTest(test.TestCase):
         self.RunMomentTestWithDynamicShape(
             shape=[2, 3, 5, 4], axes=[0], keep_dims=keep_dims, dtype=dtype)
 
+  @test_util.run_deprecated_v1
   def testGlobalNormalization(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -532,6 +544,7 @@ class MomentsTest(test.TestCase):
             keep_dims=keep_dims,
             dtype=dtype)
 
+  @test_util.run_deprecated_v1
   def testAxes(self):
     for keep_dims in [False, True]:
       for dtype in [dtypes.float32, dtypes.float16]:
@@ -572,9 +585,11 @@ class MomentsTest(test.TestCase):
         print("Moments %s gradient err vs input %d = %g" % (from_y, i, err))
         self.assertLess(err, 1e-11)
 
+  @test_util.run_deprecated_v1
   def testMeanGlobalGradient(self):
     self._testGlobalGradient(from_y="mean")
 
+  @test_util.run_deprecated_v1
   def testVarGlobalGradient(self):
     self._testGlobalGradient(from_y="var")
 
diff --git a/tensorflow/python/ops/nn_fused_batchnorm_test.py b/tensorflow/python/ops/nn_fused_batchnorm_test.py
index 5ac8eba6f7345af79fda2a68dad3e289ba5df5e9..4bc33ff8bdb845510a9872db26c8adfdf1f50995 100644
--- a/tensorflow/python/ops/nn_fused_batchnorm_test.py
+++ b/tensorflow/python/ops/nn_fused_batchnorm_test.py
@@ -50,7 +50,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval()
+    return self.evaluate(y)
 
   def _test_inference(self,
                       x_shape,
@@ -82,7 +82,7 @@ class BatchNormalizationTest(test.TestCase):
           epsilon=epsilon,
           data_format=data_format,
           is_training=False)
-      y_val = sess.run(y)
+      y_val = self.evaluate(y)
       y_ref = self._inference_ref(x, scale, offset, mean, var, epsilon,
                                   data_format)
     # An atol value of 1e-3 is too small for float16's, because some adjacent
@@ -102,7 +102,7 @@ class BatchNormalizationTest(test.TestCase):
     y = self._batch_norm(x, mean, var, offset, scale, epsilon)
     if data_format == 'NCHW':
       y = array_ops.transpose(y, [0, 3, 1, 2])
-    return y.eval(), mean.eval(), var.eval()
+    return self.evaluate(y), self.evaluate(mean), self.evaluate(var)
 
   def _test_training(self,
                      x_shape,
@@ -127,7 +127,7 @@ class BatchNormalizationTest(test.TestCase):
           epsilon=epsilon,
           data_format=data_format,
           is_training=True)
-      y_val, mean_val, var_val = sess.run([y, mean, var])
+      y_val, mean_val, var_val = self.evaluate([y, mean, var])
       y_ref, mean_ref, var_ref = self._training_ref(x, scale, offset, epsilon,
                                                     data_format)
     y_atol = 2e-3 if x_dtype == np.float16 else 1e-3
@@ -277,10 +277,10 @@ class BatchNormalizationTest(test.TestCase):
       if is_training:
         epsilon = y.op.get_attr('epsilon')
         data_format = y.op.get_attr('data_format')
-        grad_vals = sess.run([grad_x, grad_scale, grad_offset])
+        grad_vals = self.evaluate([grad_x, grad_scale, grad_offset])
         grad_internal = nn_grad._BatchNormGrad(grad_y, x, scale, pop_mean,
                                                pop_var, epsilon, data_format)
-        grad_internal_vals = sess.run(list(grad_internal))
+        grad_internal_vals = self.evaluate(list(grad_internal))
         for grad_val, grad_internal_val in zip(grad_vals, grad_internal_vals):
           self.assertAllClose(grad_val, grad_internal_val, atol=err_tolerance)
 
diff --git a/tensorflow/python/ops/nn_grad.py b/tensorflow/python/ops/nn_grad.py
index 902653befc44eb9b2a8c0df9d5ce69a7a0138fed..34404edc9a1250710d4cd7a50e04ad8d187a5d7f 100644
--- a/tensorflow/python/ops/nn_grad.py
+++ b/tensorflow/python/ops/nn_grad.py
@@ -18,13 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_nn_ops
-from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import nn_ops
 
@@ -948,10 +948,14 @@ def _FusedBatchNormGradGrad(op, *grad):
   grad_grad_x = grad[0]
   grad_grad_scale = grad[1]
   grad_grad_offset = grad[2]
-  grad_x, grad_scale, grad_offset = _BatchNormGrad(
-      grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
-  grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
-  grad_grad_y, grad_x, grad_scale = gradients_impl.gradients(
+  with backprop.GradientTape() as tape:
+    tape.watch(grad_y)
+    tape.watch(x)
+    tape.watch(scale)
+    grad_x, grad_scale, grad_offset = _BatchNormGrad(
+        grad_y, x, scale, pop_mean, pop_var, epsilon, data_format, is_training)
+    grad_initial = [grad_grad_x, grad_grad_scale, grad_grad_offset]
+  grad_grad_y, grad_x, grad_scale = tape.gradient(
       [grad_x, grad_scale, grad_offset], [grad_y, x, scale], grad_initial)
   return grad_grad_y, grad_x, grad_scale, None, None
 
diff --git a/tensorflow/python/ops/nn_grad_test.py b/tensorflow/python/ops/nn_grad_test.py
index 8065df4b1658dc1bac068bee1ae7c6052f82d4f1..95e05a977b856505f0b608442e85fda8468ead1f 100644
--- a/tensorflow/python/ops/nn_grad_test.py
+++ b/tensorflow/python/ops/nn_grad_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_grad  # pylint: disable=unused-import
@@ -31,6 +32,7 @@ from tensorflow.python.platform import test
 
 class Relu6OpTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRelu6GradGrad(self):
     inputs = constant_op.constant(
         [[-2, -1, 1, 3], [5, 7, 8, 9]], dtype=dtypes.float32)
diff --git a/tensorflow/python/ops/nn_impl.py b/tensorflow/python/ops/nn_impl.py
index ef763a4b6147fdf58d4eb9f6b55d789d74aac086..48dcab4842864b7322610e4328c1771f95ee352d 100644
--- a/tensorflow/python/ops/nn_impl.py
+++ b/tensorflow/python/ops/nn_impl.py
@@ -262,7 +262,7 @@ def weighted_cross_entropy_with_logits(targets, logits, pos_weight, name=None):
         name=name)
 
 
-@tf_export("nn.relu_layer")
+@tf_export(v1=["nn.relu_layer"])
 def relu_layer(x, weights, biases, name=None):
   """Computes Relu(x * weight + biases).
 
@@ -329,7 +329,7 @@ def swish(features):
   return features * math_ops.sigmoid(features)
 
 
-@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize")
+@tf_export(v1=["math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize"])
 @deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
   """Normalizes along dimension `axis` using an L2 norm.
@@ -350,11 +350,36 @@ def l2_normalize(x, axis=None, epsilon=1e-12, name=None, dim=None):
     name: A name for this operation (optional).
     dim: Deprecated alias for axis.
 
+  Returns:
+    A `Tensor` with the same shape as `x`.
+  """
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  return l2_normalize_v2(x, axis, epsilon, name)
+
+
+@tf_export("math.l2_normalize", "linalg.l2_normalize", "nn.l2_normalize", v1=[])
+def l2_normalize_v2(x, axis=None, epsilon=1e-12, name=None):
+  """Normalizes along dimension `axis` using an L2 norm.
+
+  For a 1-D tensor with `axis = 0`, computes
+
+      output = x / sqrt(max(sum(x**2), epsilon))
+
+  For `x` with more dimensions, independently normalizes each 1-D slice along
+  dimension `axis`.
+
+  Args:
+    x: A `Tensor`.
+    axis: Dimension along which to normalize.  A scalar or a vector of
+      integers.
+    epsilon: A lower bound value for the norm. Will use `sqrt(epsilon)` as the
+      divisor if `norm < sqrt(epsilon)`.
+    name: A name for this operation (optional).
+
   Returns:
     A `Tensor` with the same shape as `x`.
   """
   with ops.name_scope(name, "l2_normalize", [x]) as name:
-    axis = deprecated_argument_lookup("axis", axis, "dim", dim)
     x = ops.convert_to_tensor(x, name="x")
     square_sum = math_ops.reduce_sum(math_ops.square(x), axis, keepdims=True)
     x_inv_norm = math_ops.rsqrt(math_ops.maximum(square_sum, epsilon))
@@ -424,7 +449,7 @@ def zero_fraction(value, name=None):
 
 
 # pylint: disable=redefined-builtin
-@tf_export("nn.depthwise_conv2d")
+@tf_export(v1=["nn.depthwise_conv2d"])
 def depthwise_conv2d(input,
                      filter,
                      strides,
@@ -497,11 +522,68 @@ def depthwise_conv2d(input,
         op=op)
 
 
+@tf_export("nn.depthwise_conv2d", v1=[])
+def depthwise_conv2d_v2(input,
+                        filter,
+                        strides,
+                        padding,
+                        data_format=None,
+                        dilations=None,
+                        name=None):
+  """Depthwise 2-D convolution.
+
+  Given a 4D input tensor ('NHWC' or 'NCHW' data formats)
+  and a filter tensor of shape
+  `[filter_height, filter_width, in_channels, channel_multiplier]`
+  containing `in_channels` convolutional filters of depth 1, `depthwise_conv2d`
+  applies a different filter to each input channel (expanding from 1 channel
+  to `channel_multiplier` channels for each), then concatenates the results
+  together.  The output has `in_channels * channel_multiplier` channels.
+
+  In detail,
+
+      output[b, i, j, k * channel_multiplier + q] = sum_{di, dj}
+           filter[di, dj, k, q] * input[b, strides[1] * i + rate[0] * di,
+                                           strides[2] * j + rate[1] * dj, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the
+  same horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D with shape according to `data_format`.
+    filter: 4-D with shape
+      `[filter_height, filter_width, in_channels, channel_multiplier]`.
+    strides: 1-D of size 4.  The stride of the sliding window for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`. The padding algorithm.
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to `data_format`.  E.g., for
+    "NHWC" format, shape is
+    `[batch, out_height, out_width, in_channels * channel_multiplier].`
+  """
+  return depthwise_conv2d(input=input,
+                          filter=filter,
+                          strides=strides,
+                          padding=padding,
+                          rate=dilations,
+                          name=name,
+                          data_format=data_format)
+
 # pylint: enable=redefined-builtin
 
 
 # pylint: disable=redefined-builtin,line-too-long
-@tf_export("nn.separable_conv2d")
+@tf_export(v1=["nn.separable_conv2d"])
 def separable_conv2d(input,
                      depthwise_filter,
                      pointwise_filter,
@@ -599,10 +681,76 @@ def separable_conv2d(input,
         name=name)
 
 
+@tf_export("nn.separable_conv2d", v1=[])
+def separable_conv2d_v2(
+    input,
+    depthwise_filter,
+    pointwise_filter,
+    strides,
+    padding,
+    data_format=None,
+    dilations=None,
+    name=None,
+):
+  """2-D convolution with separable filters.
+
+  Performs a depthwise convolution that acts separately on channels followed by
+  a pointwise convolution that mixes channels.  Note that this is separability
+  between dimensions `[1, 2]` and `3`, not spatial separability between
+  dimensions `1` and `2`.
+
+  In detail,
+
+      output[b, i, j, k] = sum_{di, dj, q, r}
+          input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+          depthwise_filter[di, dj, q, r] *
+          pointwise_filter[0, 0, q * channel_multiplier + r, k]
+
+  `strides` controls the strides for the depthwise convolution only, since
+  the pointwise convolution has implicit strides of `[1, 1, 1, 1]`.  Must have
+  `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertical strides, `strides = [1, stride, stride, 1]`.
+  If any value in `rate` is greater than 1, we perform atrous depthwise
+  convolution, in which case all values in the `strides` tensor must be equal
+  to 1.
+
+  Args:
+    input: 4-D `Tensor` with shape according to `data_format`.
+    depthwise_filter: 4-D `Tensor` with shape `[filter_height, filter_width,
+      in_channels, channel_multiplier]`. Contains `in_channels` convolutional
+      filters of depth 1.
+    pointwise_filter: 4-D `Tensor` with shape `[1, 1, channel_multiplier *
+      in_channels, out_channels]`.  Pointwise filter to mix channels after
+      `depthwise_filter` has convolved spatially.
+    strides: 1-D of size 4.  The strides for the depthwise convolution for each
+      dimension of `input`.
+    padding: A string, either `'VALID'` or `'SAME'`.  The padding algorithm. See
+      the "returns" section of `tf.nn.convolution` for details.
+    data_format: The data format for input. Either "NHWC" (default) or "NCHW".
+    dilations: 1-D of size 2. The dilation rate in which we sample input values
+      across the `height` and `width` dimensions in atrous convolution. If it is
+      greater than 1, then all values of strides must be 1.
+    name: A name for this operation (optional).
+
+  Returns:
+    A 4-D `Tensor` with shape according to 'data_format'. For
+      example, with data_format="NHWC", shape is [batch, out_height,
+      out_width, out_channels].
+  """
+  return separable_conv2d(
+      input,
+      depthwise_filter,
+      pointwise_filter,
+      strides,
+      padding,
+      rate=dilations,
+      name=name,
+      data_format=data_format)
+
 # pylint: enable=redefined-builtin,line-too-long
 
 
-@tf_export("nn.sufficient_statistics")
+@tf_export(v1=["nn.sufficient_statistics"])
 def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   """Calculate the sufficient statistics for the mean and variance of `x`.
 
@@ -652,6 +800,35 @@ def sufficient_statistics(x, axes, shift=None, keep_dims=False, name=None):
   return counts, m_ss, v_ss, shift
 
 
+@tf_export("nn.sufficient_statistics", v1=[])
+def sufficient_statistics_v2(x, axes, shift=None, keepdims=False, name=None):
+  """Calculate the sufficient statistics for the mean and variance of `x`.
+
+  These sufficient statistics are computed using the one pass algorithm on
+  an input that's optionally shifted. See:
+  https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Computing_shifted_data
+
+  Args:
+    x: A `Tensor`.
+    axes: Array of ints. Axes along which to compute mean and variance.
+    shift: A `Tensor` containing the value by which to shift the data for
+      numerical stability, or `None` if no shift is to be performed. A shift
+      close to the true mean provides the most numerically stable results.
+    keepdims: produce statistics with the same dimensionality as the input.
+    name: Name used to scope the operations that compute the sufficient stats.
+
+  Returns:
+    Four `Tensor` objects of the same type as `x`:
+
+    * the count (number of elements to average over).
+    * the (possibly shifted) sum of the elements in the array.
+    * the (possibly shifted) sum of squares of the elements in the array.
+    * the shift by which the mean must be corrected or None if `shift` is None.
+  """
+  return sufficient_statistics(
+      x=x, axes=axes, shift=shift, keep_dims=keepdims, name=name)
+
+
 @tf_export("nn.normalize_moments")
 def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   """Calculate the mean and variance of based on the sufficient statistics.
@@ -684,7 +861,7 @@ def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
   return (mean, variance)
 
 
-@tf_export("nn.moments")
+@tf_export(v1=["nn.moments"])
 def moments(
     x,
     axes,
@@ -743,7 +920,43 @@ def moments(
       return (mean, variance)
 
 
-@tf_export("nn.weighted_moments")
+@tf_export("nn.moments", v1=[])
+def moments_v2(
+    x,
+    axes,
+    shift=None,
+    keepdims=False,
+    name=None):
+  """Calculates the mean and variance of `x`.
+
+  The mean and variance are calculated by aggregating the contents of `x`
+  across `axes`.  If `x` is 1-D and `axes = [0]` this is just the mean
+  and variance of a vector.
+
+  Note: shift is currently not used; the true mean is computed and used.
+
+  When using these moments for batch normalization (see
+  `tf.nn.batch_normalization`):
+
+   * for so-called "global normalization", used with convolutional filters with
+     shape `[batch, height, width, depth]`, pass `axes=[0, 1, 2]`.
+   * for simple batch normalization pass `axes=[0]` (batch only).
+
+  Args:
+    x: A `Tensor`.
+    axes: Array of ints.  Axes along which to compute mean and
+      variance.
+    shift: Not used in the current implementation.
+    keepdims: produce moments with the same dimensionality as the input.
+    name: Name used to scope the operations that compute the moments.
+
+  Returns:
+    Two `Tensor` objects: `mean` and `variance`.
+  """
+  return moments(x=x, axes=axes, shift=shift, name=name, keep_dims=keepdims)
+
+
+@tf_export(v1=["nn.weighted_moments"])
 def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
   """Returns the frequency-weighted mean and variance of `x`.
 
@@ -815,6 +1028,30 @@ def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
     return weighted_mean, weighted_variance
 
 
+@tf_export("nn.weighted_moments", v1=[])
+def weighted_moments_v2(x, axes, frequency_weights, keepdims=False, name=None):
+  """Returns the frequency-weighted mean and variance of `x`.
+
+  Args:
+    x: A tensor.
+    axes: 1-d tensor of int32 values; these are the axes along which
+      to compute mean and variance.
+    frequency_weights: A tensor of positive weights which can be
+      broadcast with x.
+    keepdims: Produce moments with the same dimensionality as the input.
+    name: Name used to scope the operation.
+
+  Returns:
+    Two tensors: `weighted_mean` and `weighted_variance`.
+  """
+  return weighted_moments(
+      x=x,
+      axes=axes,
+      frequency_weights=frequency_weights,
+      name=name,
+      keep_dims=keepdims)
+
+
 @tf_export("nn.batch_normalization")
 def batch_normalization(x,
                         mean,
@@ -875,7 +1112,7 @@ def batch_normalization(x,
         offset - mean * inv if offset is not None else -mean * inv, x.dtype)
 
 
-@tf_export("nn.fused_batch_norm")
+@tf_export(v1=["nn.fused_batch_norm"])
 def fused_batch_norm(
     x,
     scale,
@@ -946,7 +1183,7 @@ def fused_batch_norm(
   return y, batch_mean, batch_var
 
 
-@tf_export("nn.batch_norm_with_global_normalization")
+@tf_export(v1=["nn.batch_norm_with_global_normalization"])
 def batch_norm_with_global_normalization(t,
                                          m,
                                          v,
@@ -984,6 +1221,53 @@ def batch_norm_with_global_normalization(t,
                              else None, variance_epsilon, name)
 
 
+# pylint: disable=redefined-builtin,line-too-long
+@tf_export("nn.batch_norm_with_global_normalization", v1=[])
+def batch_norm_with_global_normalization_v2(input,
+                                            mean,
+                                            variance,
+                                            beta,
+                                            gamma,
+                                            variance_epsilon,
+                                            scale_after_normalization,
+                                            name=None):
+  """Batch normalization.
+
+  This op is deprecated. See `tf.nn.batch_normalization`.
+
+  Args:
+    input: A 4D input Tensor.
+    mean: A 1D mean Tensor with size matching the last dimension of t.
+      This is the first output from tf.nn.moments,
+      or a saved moving average thereof.
+    variance: A 1D variance Tensor with size matching the last dimension of t.
+      This is the second output from tf.nn.moments,
+      or a saved moving average thereof.
+    beta: A 1D beta Tensor with size matching the last dimension of t.
+      An offset to be added to the normalized tensor.
+    gamma: A 1D gamma Tensor with size matching the last dimension of t.
+      If "scale_after_normalization" is true, this tensor will be multiplied
+      with the normalized tensor.
+    variance_epsilon: A small float number to avoid dividing by 0.
+    scale_after_normalization: A bool indicating whether the resulted tensor
+      needs to be multiplied with gamma.
+    name: A name for this operation (optional).
+
+  Returns:
+     A batch-normalized `t`.
+  """
+  return batch_norm_with_global_normalization(t=input,
+                                              m=mean,
+                                              v=variance,
+                                              beta=beta,
+                                              gamma=gamma,
+                                              variance_epsilon=variance_epsilon,
+                                              scale_after_normalization=scale_after_normalization,
+                                              name=name)
+
+# pylint: enable=redefined-builtin,line-too-long
+
+
 def _sum_rows(x):
   """Returns a vector summing up each row of the matrix x."""
   # _sum_rows(x) is equivalent to math_ops.reduce_sum(x, 1) when x is
@@ -1178,7 +1462,111 @@ def _compute_sampled_logits(weights,
     return out_logits, out_labels
 
 
-@tf_export("nn.nce_loss")
+@tf_export("nn.nce_loss", v1=[])
+def nce_loss_v2(weights,
+                biases,
+                labels,
+                inputs,
+                num_sampled,
+                num_classes,
+                num_true=1,
+                sampled_values=None,
+                remove_accidental_hits=False,
+                name="nce_loss"):
+  """Computes and returns the noise-contrastive estimation training loss.
+
+  See [Noise-contrastive estimation: A new estimation principle for
+  unnormalized statistical
+  models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+  Also see our [Candidate Sampling Algorithms
+  Reference](https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.nce_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.sigmoid_cross_entropy_with_logits(
+        labels=labels_one_hot,
+        logits=logits)
+    loss = tf.reduce_sum(loss, axis=1)
+  ```
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Note: By default this uses a log-uniform (Zipfian) distribution for sampling,
+  so your labels must be sorted in order of decreasing frequency to achieve
+  good results.  For more details, see
+  `tf.nn.log_uniform_candidate_sampler`.
+
+  Note: In the case where `num_true` > 1, we assign to each target class
+  the target probability 1 / `num_true` so that the target probabilities
+  sum to 1 per-example.
+
+  Note: It would be useful to allow a variable number of target classes per
+  example.  We hope to provide this functionality in a future release.
+  For now, if you have a variable number of target classes, you can pad them
+  out to a constant number by either repeating them or by padding
+  with an otherwise unused class.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-partitioned) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of negative classes to randomly sample
+      per batch. This single sample of negative classes is evaluated for each
+      element in the batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  Whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  If set to `True`,
+      this is a "Sampled Logistic" loss instead of NCE, and we are learning to
+      generate log-odds instead of log probabilities.  See our [Candidate
+      Sampling Algorithms Reference]
+        (https://www.tensorflow.org/extras/candidate_sampling.pdf). Default is
+          False.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example NCE losses.
+  """
+  # TODO(yuefengz): get partition_strategy from either variables or distribution
+  # strategies.
+  return nce_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name)
+
+
+@tf_export(v1=["nn.nce_loss"])
 def nce_loss(weights,
              biases,
              labels,
@@ -1289,7 +1677,98 @@ def nce_loss(weights,
   return _sum_rows(sampled_losses)
 
 
-@tf_export("nn.sampled_softmax_loss")
+@tf_export("nn.sampled_softmax_loss", v1=[])
+def sampled_softmax_loss_v2(weights,
+                            biases,
+                            labels,
+                            inputs,
+                            num_sampled,
+                            num_classes,
+                            num_true=1,
+                            sampled_values=None,
+                            remove_accidental_hits=True,
+                            seed=None,
+                            name="sampled_softmax_loss"):
+  """Computes and returns the sampled softmax training loss.
+
+  This is a faster way to train a softmax classifier over a huge number of
+  classes.
+
+  This operation is for training only.  It is generally an underestimate of
+  the full softmax loss.
+
+  A common use case is to use this method for training, and calculate the full
+  sigmoid loss for evaluation or inference as in the following example:
+
+  ```python
+  if mode == "train":
+    loss = tf.nn.sampled_softmax_loss(
+        weights=weights,
+        biases=biases,
+        labels=labels,
+        inputs=inputs,
+        ...)
+  elif mode == "eval":
+    logits = tf.matmul(inputs, tf.transpose(weights))
+    logits = tf.nn.bias_add(logits, biases)
+    labels_one_hot = tf.one_hot(labels, n_classes)
+    loss = tf.nn.softmax_cross_entropy_with_logits_v2(
+        labels=labels_one_hot,
+        logits=logits)
+  ```
+
+  See our [Candidate Sampling Algorithms Reference]
+  (https://www.tensorflow.org/extras/candidate_sampling.pdf)
+
+  Also see Section 3 of [Jean et al., 2014](http://arxiv.org/abs/1412.2007)
+  ([pdf](http://arxiv.org/pdf/1412.2007.pdf)) for the math.
+
+  Note: when doing embedding lookup on `weights` and `bias`, "div" partition
+  strategy will be used. Support for other partition strategy will be added
+  later.
+
+  Args:
+    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
+      objects whose concatenation along dimension 0 has shape [num_classes,
+      dim].  The (possibly-sharded) class embeddings.
+    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
+    labels: A `Tensor` of type `int64` and shape `[batch_size, num_true]`. The
+      target classes.  Note that this format differs from the `labels` argument
+      of `nn.softmax_cross_entropy_with_logits_v2`.
+    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward activations of
+      the input network.
+    num_sampled: An `int`.  The number of classes to randomly sample per batch.
+    num_classes: An `int`. The number of possible classes.
+    num_true: An `int`.  The number of target classes per training example.
+    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
+      `sampled_expected_count`) returned by a `*_candidate_sampler` function.
+      (if None, we default to `log_uniform_candidate_sampler`)
+    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
+      where a sampled class equals one of the target classes.  Default is True.
+    seed: random seed for candidate sampling. Default to None, which doesn't set
+      the op-level random seed for candidate sampling.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `batch_size` 1-D tensor of per-example sampled softmax losses.
+
+  """
+  return sampled_softmax_loss(
+      weights,
+      biases,
+      labels,
+      inputs,
+      num_sampled,
+      num_classes,
+      num_true=num_true,
+      sampled_values=sampled_values,
+      remove_accidental_hits=remove_accidental_hits,
+      partition_strategy="div",
+      name=name,
+      seed=seed)
+
+
+@tf_export(v1=["nn.sampled_softmax_loss"])
 def sampled_softmax_loss(weights,
                          biases,
                          labels,
diff --git a/tensorflow/python/ops/nn_ops.py b/tensorflow/python/ops/nn_ops.py
index bc195993c2e3582e5c9a9663400ef5cc4b6868d9..611bfdac9a1b10a808cafeed585ac6e3427d18e9 100644
--- a/tensorflow/python/ops/nn_ops.py
+++ b/tensorflow/python/ops/nn_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import graph_util
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import random_seed
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
@@ -35,13 +36,14 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_nn_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
-
 # go/tf-wildcard-import
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_nn_ops import *
 # pylint: enable=wildcard-import
-
 from tensorflow.python.util import deprecation
+from tensorflow.python.util.deprecation import deprecated_args
+from tensorflow.python.util.deprecation import deprecated_argument_lookup
+
 from tensorflow.python.util.tf_export import tf_export
 
 # Aliases for some automatically-generated names.
@@ -206,6 +208,73 @@ class _NonAtrousConvolution(object):
         name=self.name)
 
 
+@tf_export("nn.dilation2d", v1=[])
+def dilation2d_v2(
+    input,   # pylint: disable=redefined-builtin
+    filters,  # pylint: disable=redefined-builtin
+    strides,
+    padding,
+    data_format,
+    dilations,
+    name=None):
+  """Computes the grayscale dilation of 4-D `input` and 3-D `filters` tensors.
+
+  The `input` tensor has shape `[batch, in_height, in_width, depth]` and the
+  `filters` tensor has shape `[filter_height, filter_width, depth]`, i.e., each
+  input channel is processed independently of the others with its own
+  structuring function. The `output` tensor has shape
+  `[batch, out_height, out_width, depth]`. The spatial dimensions of the output
+  tensor depend on the `padding` algorithm. We currently only support the
+  default "NHWC" `data_format`.
+
+  In detail, the grayscale morphological 2-D dilation is the max-sum correlation
+  (for consistency with `conv2d`, we use unmirrored filters):
+
+      output[b, y, x, c] =
+         max_{dy, dx} input[b,
+                            strides[1] * y + rates[1] * dy,
+                            strides[2] * x + rates[2] * dx,
+                            c] +
+                      filters[dy, dx, c]
+
+  Max-pooling is a special case when the filter has size equal to the pooling
+  kernel size and contains all zeros.
+
+  Note on duality: The dilation of `input` by the `filters` is equal to the
+  negation of the erosion of `-input` by the reflected `filters`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
+      `uint32`, `uint64`.
+      4-D with shape `[batch, in_height, in_width, depth]`.
+    filters: A `Tensor`. Must have the same type as `input`.
+      3-D with shape `[filter_height, filter_width, depth]`.
+    strides: A list of `ints` that has length `>= 4`.
+      The stride of the sliding window for each dimension of the input
+      tensor. Must be: `[1, stride_height, stride_width, 1]`.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: A `string`, only `"NCHW"` is currently supported.
+    dilations: A list of `ints` that has length `>= 4`.
+      The input stride for atrous morphological dilation. Must be:
+      `[1, rate_height, rate_width, 1]`.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if data_format != "NCHW":
+    raise ValueError("Data formats other than NCHW are not yet supported")
+
+  return gen_nn_ops.dilation2d(input=input,
+                               filter=filters,
+                               strides=strides,
+                               rates=dilations,
+                               padding=padding,
+                               name=name)
+
+
 @tf_export("nn.with_space_to_batch")
 def with_space_to_batch(
     input,  # pylint: disable=redefined-builtin
@@ -644,7 +713,7 @@ def _get_strides_and_dilation_rate(num_spatial_dims, strides, dilation_rate):
   return strides, dilation_rate
 
 
-@tf_export("nn.convolution")
+@tf_export(v1=["nn.convolution"])
 def convolution(
     input,  # pylint: disable=redefined-builtin
     filter,  # pylint: disable=redefined-builtin
@@ -782,6 +851,30 @@ def convolution(
     return op(input, filter)
 
 
+@tf_export("nn.convolution", v1=[])
+def convolution_v2(
+    input,  # pylint: disable=redefined-builtin
+    filters,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  return convolution(
+      input,  # pylint: disable=redefined-builtin
+      filters,
+      padding=padding,
+      strides=strides,
+      dilation_rate=dilations,
+      name=name,
+      data_format=data_format)
+
+convolution_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        convolution.__doc__, "dilation_rate", "dilations"),
+    "filter", "filters")
+
+
 class Convolution(object):
   """Helper class for convolution.
 
@@ -873,7 +966,7 @@ class Convolution(object):
     return self.conv_op(inp, filter)
 
 
-@tf_export("nn.pool")
+@tf_export(v1=["nn.pool"])
 def pool(
     input,  # pylint: disable=redefined-builtin
     window_shape,
@@ -1044,6 +1137,105 @@ def pool(
         filter_shape=window_shape)
 
 
+@tf_export("nn.pool", v1=[])
+def pool_v2(
+    input,  # pylint: disable=redefined-builtin
+    window_shape,
+    pooling_type,
+    strides=None,
+    padding="VALID",
+    data_format=None,
+    dilations=None,
+    name=None):
+  # pylint: disable=line-too-long
+  """Performs an N-D pooling operation.
+
+  In the case that `data_format` does not start with "NC", computes for
+      0 <= b < batch_size,
+      0 <= x[i] < output_spatial_shape[i],
+      0 <= c < num_channels:
+
+  ```
+    output[b, x[0], ..., x[N-1], c] =
+      REDUCE_{z[0], ..., z[N-1]}
+        input[b,
+              x[0] * strides[0] - pad_before[0] + dilation_rate[0]*z[0],
+              ...
+              x[N-1]*strides[N-1] - pad_before[N-1] + dilation_rate[N-1]*z[N-1],
+              c],
+  ```
+
+  where the reduction function REDUCE depends on the value of `pooling_type`,
+  and pad_before is defined based on the value of `padding` as described in
+  the "returns" section of `tf.nn.convolution` for details.
+  The reduction never includes out-of-bounds positions.
+
+  In the case that `data_format` starts with `"NC"`, the `input` and output are
+  simply transposed as follows:
+
+  ```
+    pool(input, data_format, **kwargs) =
+      tf.transpose(pool(tf.transpose(input, [0] + range(2,N+2) + [1]),
+                        **kwargs),
+                   [0, N+1] + range(1, N+1))
+  ```
+
+  Args:
+    input: Tensor of rank N+2, of shape `[batch_size] + input_spatial_shape +
+      [num_channels]` if data_format does not start with "NC" (default), or
+      `[batch_size, num_channels] + input_spatial_shape` if data_format starts
+      with "NC".  Pooling happens over the spatial dimensions only.
+    window_shape: Sequence of N ints >= 1.
+    pooling_type: Specifies pooling operation, must be "AVG" or "MAX".
+    strides: Optional. Sequence of N ints >= 1.  Defaults to [1]*N. If any value of
+      strides is > 1, then all values of dilation_rate must be 1.
+    padding: The padding algorithm, must be "SAME" or "VALID". Defaults to "SAME".
+      See the "returns" section of `tf.nn.convolution` for details.
+    data_format: A string or None.  Specifies whether the channel dimension of
+      the `input` and output is the last dimension (default, or if `data_format`
+      does not start with "NC"), or the second dimension (if `data_format`
+      starts with "NC").  For N=1, the valid values are "NWC" (default) and
+      "NCW".  For N=2, the valid values are "NHWC" (default) and "NCHW". For
+      N=3, the valid values are "NDHWC" (default) and "NCDHW".
+    dilations: Optional.  Dilation rate.  List of N ints >= 1. Defaults to
+      [1]*N.  If any value of dilation_rate is > 1, then all values of strides
+      must be 1.
+    name: Optional. Name of the op.
+
+  Returns:
+    Tensor of rank N+2, of shape
+      [batch_size] + output_spatial_shape + [num_channels]
+
+    if data_format is None or does not start with "NC", or
+
+      [batch_size, num_channels] + output_spatial_shape
+
+    if data_format starts with "NC",
+    where `output_spatial_shape` depends on the value of padding:
+
+    If padding = "SAME":
+      output_spatial_shape[i] = ceil(input_spatial_shape[i] / strides[i])
+
+    If padding = "VALID":
+      output_spatial_shape[i] =
+        ceil((input_spatial_shape[i] - (window_shape[i] - 1) * dilation_rate[i])
+             / strides[i]).
+
+  Raises:
+    ValueError: if arguments are invalid.
+
+  """
+  return pool(
+      input=input,
+      window_shape=window_shape,
+      pooling_type=pooling_type,
+      padding=padding,
+      dilation_rate=dilations,
+      strides=strides,
+      name=name,
+      data_format=data_format)
+
+
 @tf_export("nn.atrous_conv2d")
 def atrous_conv2d(value, filters, rate, padding, name=None):
   """Atrous convolution (a.k.a. convolution with holes or dilated convolution).
@@ -1181,7 +1373,208 @@ def atrous_conv2d(value, filters, rate, padding, name=None):
       name=name)
 
 
-@tf_export("nn.conv2d_transpose")
+@tf_export("nn.conv2d", v1=[])
+def conv2d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              strides,
+              padding,
+              data_format="NHWC",
+              dilations=None,
+              name=None):
+  # pylint: disable=line-too-long
+  r"""Computes a 2-D convolution given 4-D `input` and `filters` tensors.
+
+  Given an input tensor of shape `[batch, in_height, in_width, in_channels]`
+  and a filter / kernel tensor of shape
+  `[filter_height, filter_width, in_channels, out_channels]`, this op
+  performs the following:
+
+  1. Flattens the filter to a 2-D matrix with shape
+     `[filter_height * filter_width * in_channels, output_channels]`.
+  2. Extracts image patches from the input tensor to form a *virtual*
+     tensor of shape `[batch, out_height, out_width,
+     filter_height * filter_width * in_channels]`.
+  3. For each patch, right-multiplies the filter matrix and the image patch
+     vector.
+
+  In detail, with the default NHWC format,
+
+      output[b, i, j, k] =
+          sum_{di, dj, q} input[b, strides[1] * i + di, strides[2] * j + dj, q] *
+                          filter[di, dj, q, k]
+
+  Must have `strides[0] = strides[3] = 1`.  For the most common case of the same
+  horizontal and vertices strides, `strides = [1, stride, stride, 1]`.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      A 4-D tensor. The dimension order is interpreted according to the value
+      of `data_format`, see below for details.
+    filters: A `Tensor`. Must have the same type as `input`.
+      A 4-D tensor of shape
+      `[filter_height, filter_width, in_channels, out_channels]`
+    strides: A list of `ints`.
+      1-D tensor of length 4.  The stride of the sliding window for each
+      dimension of `input`. The dimension order is determined by the value of
+      `data_format`, see below for details.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, height, width, channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, channels, height, width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by the
+      value of `data_format`, see above for details. Dilations in the batch and
+      depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  # pylint: enable=line-too-long
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           use_cudnn_on_gpu=True,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv2d"])(gen_nn_ops.conv2d)
+
+
+@tf_export("nn.conv2d_backprop_filter", v1=[])
+def conv2d_backprop_filter_v2(input,  # pylint: disable=redefined-builtin
+                              filter_sizes,
+                              out_backprop,
+                              strides,
+                              padding,
+                              data_format="NHWC",
+                              dilations=None,
+                              name=None):
+  r"""Computes the gradients of convolution with respect to the filter.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape `[batch, in_height, in_width, in_channels]`.
+    filter_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the tensor shape of `filter`,
+      where `filter` is a 4-D
+      `[filter_height, filter_width, in_channels, out_channels]` tensor.
+    out_backprop: A `Tensor`. Must have the same type as `input`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `input`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_filter(input,  # pylint: disable=redefined-builtin
+                                           filter_sizes,
+                                           out_backprop,
+                                           strides,
+                                           padding,
+                                           use_cudnn_on_gpu=True,
+                                           data_format=data_format,
+                                           dilations=dilations,
+                                           name=name)
+tf_export(v1=["nn.conv2d_backprop_filter"])(
+    gen_nn_ops.conv2d_backprop_filter)
+
+
+@tf_export("nn.conv2d_backprop_input", v1=[])
+def conv2d_backprop_input_v2(input_sizes,
+                             filters,
+                             out_backprop,
+                             strides,
+                             padding,
+                             data_format="NHWC",
+                             dilations=None,
+                             name=None):
+  r"""Computes the gradients of convolution with respect to the input.
+
+  Args:
+    input_sizes: A `Tensor` of type `int32`.
+      An integer vector representing the shape of `input`,
+      where `input` is a 4-D `[batch, height, width, channels]` tensor.
+    filters: A `Tensor`. Must be one of the following types:
+      `half`, `bfloat16`, `float32`, `float64`.
+      4-D with shape
+      `[filter_height, filter_width, in_channels, out_channels]`.
+    out_backprop: A `Tensor`. Must have the same type as `filters`.
+      4-D with shape `[batch, out_height, out_width, out_channels]`.
+      Gradients w.r.t. the output of the convolution.
+    strides: A list of `ints`.
+      The stride of the sliding window for each dimension of the input
+      of the convolution. Must be in the same order as the dimension specified
+      with format.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string` from: `"NHWC", "NCHW"`.
+      Defaults to `"NHWC"`.
+      Specify the data format of the input and output data. With the
+      default format "NHWC", the data is stored in the order of:
+          [batch, in_height, in_width, in_channels].
+      Alternatively, the format could be "NCHW", the data storage order of:
+          [batch, in_channels, in_height, in_width].
+    dilations: An optional list of `ints`. Defaults to `[1, 1, 1, 1]`.
+      1-D tensor of length 4.  The dilation factor for each dimension of
+      `input`. If set to k > 1, there will be k-1 skipped cells between each
+      filter element on that dimension. The dimension order is determined by
+      the value of `data_format`, see above for details. Dilations in the batch
+      and depth dimensions must be 1.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `filters`.
+  """
+  if dilations is None:
+    dilations = [1, 1, 1, 1]
+  return gen_nn_ops.conv2d_backprop_input(input_sizes,
+                                          filters,
+                                          out_backprop,
+                                          strides,
+                                          padding,
+                                          use_cudnn_on_gpu=True,
+                                          data_format=data_format,
+                                          dilations=dilations,
+                                          name=name)
+tf_export(v1=["nn.conv2d_backprop_input"])(
+    gen_nn_ops.conv2d_backprop_input)
+
+
+@tf_export(v1=["nn.conv2d_transpose"])
 def conv2d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1261,6 +1654,31 @@ def conv2d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv2d_transpose", v1=[])
+def conv2d_transpose_v2(
+    input,
+    filters,  # pylint: disable=redefined-builtin
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NHWC",
+    name=None):
+  return conv2d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv2d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv2d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.atrous_conv2d_transpose")
 def atrous_conv2d_transpose(value,
                             filters,
@@ -1409,7 +1827,29 @@ def atrous_conv2d_transpose(value,
         input=value, crops=batch_to_space_crop, block_size=rate)
 
 
-@tf_export("nn.conv3d_transpose")
+@tf_export("nn.conv3d", v1=[])
+def conv3d_v2(input,  # pylint: disable=redefined-builtin,missing-docstring
+              filters,
+              strides,
+              padding,
+              data_format="NDHWC",
+              dilations=None,
+              name=None):
+  if dilations is None:
+    dilations = [1, 1, 1, 1, 1]
+  return gen_nn_ops.conv3d(input,  # pylint: disable=redefined-builtin
+                           filters,
+                           strides,
+                           padding,
+                           data_format=data_format,
+                           dilations=dilations,
+                           name=name)
+tf_export(v1=["nn.conv3d"])(gen_nn_ops.conv3d)
+conv3d_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    gen_nn_ops.conv3d.__doc__, "filter", "filters")
+
+
+@tf_export(v1=["nn.conv3d_transpose"])
 def conv3d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -1487,6 +1927,31 @@ def conv3d_transpose(
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.conv3d_transpose", v1=[])
+def conv3d_transpose_v2(
+    input,
+    filters,
+    output_shape,
+    strides,
+    padding="SAME",
+    data_format="NDHWC",
+    name=None):
+  return conv3d_transpose(
+      input,
+      filters,
+      output_shape,
+      strides,
+      padding=padding,
+      data_format=data_format,
+      name=name)
+# pylint: enable=redefined-builtin
+conv3d_transpose_v2.__doc__ = deprecation.rewrite_argument_docstring(
+    deprecation.rewrite_argument_docstring(
+        conv3d_transpose.__doc__, "filter", "filters"),
+    "value", "input")
+
+
 @tf_export("nn.bias_add")
 def bias_add(value, bias, data_format=None, name=None):
   """Adds `bias` to `value`.
@@ -1542,7 +2007,7 @@ def bias_add_v1(value, bias, name=None):
     return gen_nn_ops.bias_add_v1(value, bias, name=name)
 
 
-@tf_export("nn.crelu")
+@tf_export(v1=["nn.crelu"])
 def crelu(features, name=None, axis=-1):
   """Computes Concatenated ReLU.
 
@@ -1568,6 +2033,12 @@ def crelu(features, name=None, axis=-1):
     return gen_nn_ops.relu(c)
 
 
+@tf_export("nn.crelu", v1=[])
+def crelu_v2(features, axis=-1, name=None):
+  return crelu(features, name=name, axis=axis)
+crelu_v2.__doc__ = crelu.__doc__
+
+
 @tf_export("nn.relu6")
 def relu6(features, name=None):
   """Computes Rectified Linear 6: `min(max(features, 0), 6)`.
@@ -1715,7 +2186,7 @@ def _softmax(logits, compute_op, dim=-1, name=None):
   return output
 
 
-@tf_export("nn.softmax", "math.softmax")
+@tf_export(v1=["nn.softmax", "math.softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def softmax(logits, axis=None, name=None, dim=None):
   """Computes softmax activations.
@@ -1745,7 +2216,34 @@ def softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.softmax, axis, name)
 
 
-@tf_export("nn.log_softmax", "math.log_softmax")
+@tf_export("nn.softmax", "math.softmax", v1=[])
+def softmax_v2(logits, axis=None, name=None):
+  """Computes softmax activations.
+
+  This function performs the equivalent of
+
+      softmax = tf.exp(logits) / tf.reduce_sum(tf.exp(logits), axis)
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type and shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops.softmax, axis, name)
+
+
+@tf_export(v1=["nn.log_softmax", "math.log_softmax"])
 @deprecation.deprecated_args(None, "dim is deprecated, use axis instead", "dim")
 def log_softmax(logits, axis=None, name=None, dim=None):
   """Computes log softmax activations.
@@ -1775,6 +2273,33 @@ def log_softmax(logits, axis=None, name=None, dim=None):
   return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
 
 
+@tf_export("nn.log_softmax", "math.log_softmax", v1=[])
+def log_softmax_v2(logits, axis=None, name=None):
+  """Computes log softmax activations.
+
+  For each batch `i` and class `j` we have
+
+      logsoftmax = logits - log(reduce_sum(exp(logits), axis))
+
+  Args:
+    logits: A non-empty `Tensor`. Must be one of the following types: `half`,
+      `float32`, `float64`.
+    axis: The dimension softmax would be performed on. The default is -1 which
+      indicates the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`. Has the same type as `logits`. Same shape as `logits`.
+
+  Raises:
+    InvalidArgumentError: if `logits` is empty or `axis` is beyond the last
+      dimension of `logits`.
+  """
+  if axis is None:
+    axis = -1
+  return _softmax(logits, gen_nn_ops.log_softmax, axis, name)
+
+
 def _ensure_xent_args(name, sentinel, labels, logits):
   # Make sure that all arguments were passed as named arguments.
   if sentinel is not None:
@@ -1784,9 +2309,8 @@ def _ensure_xent_args(name, sentinel, labels, logits):
     raise ValueError("Both labels and logits must be provided.")
 
 
-@tf_export("nn.softmax_cross_entropy_with_logits",
-           v1=["nn.softmax_cross_entropy_with_logits_v2"])
-def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
+@tf_export("nn.softmax_cross_entropy_with_logits", v1=[])
+def softmax_cross_entropy_with_logits_v2(labels, logits, axis=-1, name=None):
   """Computes softmax cross entropy between `logits` and `labels`.
 
   Measures the probability error in discrete classification tasks in which the
@@ -1808,7 +2332,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
 
   A common use case is to have logits and labels of shape
   `[batch_size, num_classes]`, but higher dimensions are supported, with
-  the `dim` argument specifying the class dimension.
+  the `axis` argument specifying the class dimension.
 
   `logits` and `labels` must have the same dtype (either `float16`, `float32`,
   or `float64`).
@@ -1826,8 +2350,64 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
       `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
       probability distribution.
     logits: Unscaled log probabilities.
-    dim: The class dimension. Defaulted to -1 which is the last dimension.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` that contains the softmax cross entropy loss. Its type is the
+    same as `logits` and its shape is the same as `labels` except that it does
+    not have the last dimension of `labels`.
+  """
+  return softmax_cross_entropy_with_logits_v2_helper(
+      labels=labels, logits=logits, axis=axis, name=name)
+
+
+@tf_export(v1=["nn.softmax_cross_entropy_with_logits_v2"])
+@deprecated_args(None, "dim is deprecated, use axis instead", "dim")
+def softmax_cross_entropy_with_logits_v2_helper(
+    labels, logits, axis=None, name=None, dim=None):
+  """Computes softmax cross entropy between `logits` and `labels`.
+
+  Measures the probability error in discrete classification tasks in which the
+  classes are mutually exclusive (each entry is in exactly one class).  For
+  example, each CIFAR-10 image is labeled with one and only one label: an image
+  can be a dog or a truck, but not both.
+
+  **NOTE:**  While the classes are mutually exclusive, their probabilities
+  need not be.  All that is required is that each row of `labels` is
+  a valid probability distribution.  If they are not, the computation of the
+  gradient will be incorrect.
+
+  If using exclusive `labels` (wherein one and only
+  one class is true at a time), see `sparse_softmax_cross_entropy_with_logits`.
+
+  **WARNING:** This op expects unscaled logits, since it performs a `softmax`
+  on `logits` internally for efficiency.  Do not call this op with the
+  output of `softmax`, as it will produce incorrect results.
+
+  A common use case is to have logits and labels of shape
+  `[batch_size, num_classes]`, but higher dimensions are supported, with
+  the `axis` argument specifying the class dimension.
+
+  `logits` and `labels` must have the same dtype (either `float16`, `float32`,
+  or `float64`).
+
+  Backpropagation will happen into both `logits` and `labels`.  To disallow
+  backpropagation into `labels`, pass label tensors through `tf.stop_gradient`
+  before feeding it to this function.
+
+  **Note that to avoid confusion, it is required to pass only named arguments to
+  this function.**
+
+  Args:
+    labels: Each vector along the class dimension should hold a valid
+      probability distribution e.g. for the case in which labels are of shape
+      `[batch_size, num_classes]`, each row of `labels[i]` must be a valid
+      probability distribution.
+    logits: Unscaled log probabilities.
+    axis: The class dimension. Defaulted to -1 which is the last dimension.
     name: A name for the operation (optional).
+    dim: Deprecated alias for axis.
 
   Returns:
     A `Tensor` that contains the softmax cross entropy loss. Its type is the
@@ -1837,6 +2417,10 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
   # TODO(pcmurray) Raise an error when the labels do not sum to 1. Note: This
   # could break users who call this with bad labels, but disregard the bad
   # results.
+  axis = deprecated_argument_lookup("axis", axis, "dim", dim)
+  del dim
+  if axis is None:
+    axis = -1
 
   with ops.name_scope(name, "softmax_cross_entropy_with_logits",
                       [logits, labels]) as name:
@@ -1853,7 +2437,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     shape = logits.get_shape()
 
     # Move the dim to the end if dim is not the last dimension.
-    if dim is not -1:
+    if axis != -1:
 
       def _move_dim_to_end(tensor, dim_index, rank):
         return array_ops.transpose(
@@ -1863,8 +2447,8 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
                 math_ops.range(dim_index + 1, rank), [dim_index]
             ], 0))
 
-      precise_logits = _move_dim_to_end(precise_logits, dim, input_rank)
-      labels = _move_dim_to_end(labels, dim, input_rank)
+      precise_logits = _move_dim_to_end(precise_logits, axis, input_rank)
+      labels = _move_dim_to_end(labels, axis, input_rank)
 
     input_shape = array_ops.shape(precise_logits)
 
@@ -1878,7 +2462,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     cost, unused_backprop = gen_nn_ops.softmax_cross_entropy_with_logits(
         precise_logits, labels, name=name)
 
-    # The output cost shape should be the input minus dim.
+    # The output cost shape should be the input minus axis.
     output_shape = array_ops.slice(input_shape, [0],
                                    [math_ops.subtract(input_rank, 1)])
     cost = array_ops.reshape(cost, output_shape)
@@ -1888,7 +2472,7 @@ def softmax_cross_entropy_with_logits_v2(labels, logits, dim=-1, name=None):
     if not context.executing_eagerly(
     ) and shape is not None and shape.dims is not None:
       shape = shape.as_list()
-      del shape[dim]
+      del shape[axis]
       cost.set_shape(shape)
 
     if convert_to_float32:
@@ -1966,7 +2550,7 @@ def softmax_cross_entropy_with_logits(
     labels = array_ops.stop_gradient(labels, name="labels_stop_gradient")
 
   return softmax_cross_entropy_with_logits_v2(
-      labels=labels, logits=logits, dim=dim, name=name)
+      labels=labels, logits=logits, axis=dim, name=name)
 
 
 @tf_export("nn.sparse_softmax_cross_entropy_with_logits")
@@ -1993,8 +2577,9 @@ def sparse_softmax_cross_entropy_with_logits(
   on `logits` internally for efficiency.  Do not call this op with the
   output of `softmax`, as it will produce incorrect results.
 
-  A common use case is to have logits and labels of shape
-  `[batch_size, num_classes]`, but higher dimensions are supported, in which
+  A common use case is to have logits of shape
+  `[batch_size, num_classes]` and have labels of shape
+  `[batch_size]`, but higher dimensions are supported, in which
   case the `dim`-th dimension is assumed to be of size `num_classes`.
   `logits` must have the dtype of `float16`, `float32`, or `float64`, and
   `labels` must have the dtype of `int32` or `int64`.
@@ -2155,6 +2740,67 @@ def max_pool(value, ksize, strides, padding, data_format="NHWC", name=None):
         name=name)
 
 
+# pylint: disable=redefined-builtin
+@tf_export("nn.max_pool_with_argmax", v1=[])
+def max_pool_with_argmax_v2(input,
+                            ksize,
+                            strides,
+                            padding,
+                            data_format="NHWC",
+                            output_dtype=dtypes.int64,
+                            name=None):
+  """Performs max pooling on the input and outputs both max values and indices.
+
+  The indices in `argmax` are flattened, so that a maximum value at position
+  `[b, y, x, c]` becomes flattened index
+  `((b * height + y) * width + x) * channels + c`.
+
+  The indices returned are always in `[0, height) x [0, width)` before
+  flattening, even if padding is involved and the mathematically correct answer
+  is outside (either negative or too large).  This is a bug, but fixing it is
+  difficult to do in a safe backwards compatible way, especially due to
+  flattening.
+
+  Args:
+    input: A `Tensor`. Must be one of the following types: `float32`, `float64`,
+      `int32`, `uint8`, `int16`, `int8`, `int64`, `bfloat16`, `uint16`, `half`,
+      `uint32`, `uint64`.
+      4-D with shape `[batch, height, width, channels]`.  Input to pool over.
+    ksize: A list of `ints` that has length `>= 4`.
+      The size of the window for each dimension of the input tensor.
+    strides: A list of `ints` that has length `>= 4`.
+      The stride of the sliding window for each dimension of the
+      input tensor.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: An optional `string`, must be set to `"NHWC"`. Defaults to
+      `"NHWC"`.
+      Specify the data format of the input and output data.
+    output_dtype: An optional `tf.DType` from: `tf.int32, tf.int64`.
+      Defaults to `tf.int64`.
+      The dtype of the returned argmax tensor.
+    name: A name for the operation (optional).
+
+  Returns:
+    A tuple of `Tensor` objects (output, argmax).
+
+    output: A `Tensor`. Has the same type as `input`.
+    argmax: A `Tensor` of type `output_dtype`.
+  """
+
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than 'NHWC' are not yet supported")
+
+  return gen_nn_ops.max_pool_with_argmax(input=input,
+                                         ksize=ksize,
+                                         strides=strides,
+                                         padding=padding,
+                                         Targmax=output_dtype,
+                                         name=name)
+
+# pylint: enable=redefined-builtin
+
+
 @ops.RegisterStatistics("Conv2D", "flops")
 def _calc_conv_flops(graph, node):
   """Calculates the compute resources needed for Conv2D."""
@@ -2199,7 +2845,7 @@ def _calc_bias_add_flops(graph, node):
   return ops.OpStats("flops", input_count)
 
 
-@tf_export("nn.xw_plus_b")
+@tf_export(v1=["nn.xw_plus_b"])
 def xw_plus_b(x, weights, biases, name=None):  # pylint: disable=invalid-name
   """Computes matmul(x, weights) + biases.
 
@@ -2271,12 +2917,16 @@ def _get_noise_shape(x, noise_shape):
   return noise_shape
 
 
-@tf_export("nn.dropout")
-def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+@tf_export(v1=["nn.dropout"])
+@deprecation.deprecated_args(None, "Please use `rate` instead of `keep_prob`. "
+                             "Rate should be set to `rate = 1 - keep_prob`.",
+                             "keep_prob")
+def dropout(x, keep_prob=None, noise_shape=None, seed=None, name=None,
+            rate=None):  # pylint: disable=invalid-name
   """Computes dropout.
 
-  With probability `keep_prob`, outputs the input element scaled up by
-  `1 / keep_prob`, otherwise outputs `0`.  The scaling is so that the expected
+  For each element of `x`, with probability `rate`, outputs `0`, and otherwise
+  scales up the input by `1 / (1-rate)`. The scaling is such that the expected
   sum is unchanged.
 
   By default, each element is kept or dropped independently.  If `noise_shape`
@@ -2289,8 +2939,59 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
 
   Args:
     x: A floating point tensor.
-    keep_prob: A scalar `Tensor` with the same type as x. The probability
-      that each element is kept.
+    keep_prob: (deprecated) A deprecated alias for `(1-rate)`.
+    noise_shape: A 1-D `Tensor` of type `int32`, representing the
+      shape for randomly generated keep/drop flags.
+    seed: A Python integer. Used to create random seeds. See
+      `tf.set_random_seed` for behavior.
+    name: A name for this operation (optional).
+    rate: A scalar `Tensor` with the same type as `x`. The probability that each
+      element of `x` is discarded.
+
+  Returns:
+    A Tensor of the same shape of `x`.
+
+  Raises:
+    ValueError: If `rate` is not in `[0, 1)` or if `x` is not a floating
+      point tensor.
+  """
+  try:
+    keep = 1. - keep_prob if keep_prob is not None else None
+  except TypeError:
+    raise ValueError("keep_prob must be a floating point number or Tensor "
+                     "(got %r)" % keep_prob)
+
+  rate = deprecation.deprecated_argument_lookup(
+      "rate", rate,
+      "keep_prob", keep)
+
+  if rate is None:
+    raise ValueError("You must provide a rate to dropout.")
+
+  return dropout_v2(x, rate, noise_shape=noise_shape, seed=seed, name=name)
+
+
+@tf_export("nn.dropout", v1=[])
+def dropout_v2(x, rate, noise_shape=None, seed=None, name=None):  # pylint: disable=invalid-name
+  """Computes dropout.
+
+  With probability `rate`, drops elements of `x`. Input that are kept are
+  scaled up by `1 / (1 - rate)`, otherwise outputs `0`.  The scaling is so that
+  the expected sum is unchanged.
+
+  By default, each element is kept or dropped independently.  If `noise_shape`
+  is specified, it must be
+  [broadcastable](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
+  to the shape of `x`, and only dimensions with `noise_shape[i] == shape(x)[i]`
+  will make independent decisions.  For example, if `shape(x) = [k, l, m, n]`
+  and `noise_shape = [k, 1, 1, n]`, each batch and channel component will be
+  kept independently and each row and column will be kept or not kept together.
+
+  Args:
+    x: A floating point tensor.
+    rate: A scalar `Tensor` with the same type as x. The probability
+      that each element is dropped. For example, setting rate=0.1 would drop
+      10% of input elements.
     noise_shape: A 1-D `Tensor` of type `int32`, representing the
       shape for randomly generated keep/drop flags.
     seed: A Python integer. Used to create random seeds. See
@@ -2310,35 +3011,36 @@ def dropout(x, keep_prob, noise_shape=None, seed=None, name=None):  # pylint: di
     if not x.dtype.is_floating:
       raise ValueError("x has to be a floating point tensor since it's going to"
                        " be scaled. Got a %s tensor instead." % x.dtype)
-    if isinstance(keep_prob, numbers.Real) and not 0 < keep_prob <= 1:
-      raise ValueError("keep_prob must be a scalar tensor or a float in the "
-                       "range (0, 1], got %g" % keep_prob)
+    if isinstance(rate, numbers.Real) and not (rate >= 0 and rate < 1):
+      raise ValueError("rate must be a scalar tensor or a float in the "
+                       "range [0, 1), got %g" % rate)
 
     # Early return if nothing needs to be dropped.
-    if isinstance(keep_prob, float) and keep_prob == 1:
+    if isinstance(rate, numbers.Real) and rate == 0:
       return x
     if context.executing_eagerly():
-      if isinstance(keep_prob, ops.EagerTensor):
-        if keep_prob.numpy() == 1:
+      if isinstance(rate, ops.EagerTensor):
+        if rate.numpy() == 0:
           return x
     else:
-      keep_prob = ops.convert_to_tensor(
-          keep_prob, dtype=x.dtype, name="keep_prob")
-      keep_prob.get_shape().assert_is_compatible_with(tensor_shape.scalar())
+      rate = ops.convert_to_tensor(
+          rate, dtype=x.dtype, name="rate")
+      rate.get_shape().assert_is_compatible_with(tensor_shape.scalar())
 
-      # Do nothing if we know keep_prob == 1
-      if tensor_util.constant_value(keep_prob) == 1:
+      # Do nothing if we know rate == 0
+      if tensor_util.constant_value(rate) == 0:
         return x
 
     noise_shape = _get_noise_shape(x, noise_shape)
 
+    keep_prob = 1 - rate
     # uniform [keep_prob, 1.0 + keep_prob)
     random_tensor = keep_prob
     random_tensor += random_ops.random_uniform(
         noise_shape, seed=seed, dtype=x.dtype)
     # 0. if [keep_prob, 1.0) and 1. if [1.0, 1.0 + keep_prob)
     binary_tensor = math_ops.floor(random_tensor)
-    ret = math_ops.div(x, keep_prob) * binary_tensor
+    ret = math_ops.divide(x, keep_prob) * binary_tensor
     if not context.executing_eagerly():
       ret.set_shape(x.get_shape())
     return ret
@@ -2402,7 +3104,293 @@ def nth_element(input, n, reverse=False, name=None):  # pylint: disable=redefine
   return gen_nn_ops.nth_element(input, n, reverse=reverse, name=name)
 
 
-@tf_export("nn.conv1d")
+@tf_export(v1=["nn.fractional_max_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_max_pool_v2.")
+def fractional_max_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):   # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  This is a deprecated version of `fractional_max_pool`.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_max_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_max_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name)
+
+
+@tf_export("nn.fractional_max_pool", v1=[])
+def fractional_max_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional max pooling on the input.
+
+  Fractional max pooling is slightly different than regular max pooling.  In
+  regular max pooling, you downsize an input set by taking the maximum value of
+  smaller N x N subsections of the set (often 2x2), and try to reduce the set by
+  a factor of N, where N is an integer.  Fractional max pooling, as you might
+  expect from the word "fractional", means that the overall reduction ratio N
+  does not have to be an integer.
+
+  The sizes of the pooling regions are generated randomly but are fairly
+  uniform.  For example, let's look at the height dimension, and the constraints
+  on the list of rows that will be pool boundaries.
+
+  First we define the following:
+
+  1.  input_row_length : the number of rows from the input set
+  2.  output_row_length : which will be smaller than the input
+  3.  alpha = input_row_length / output_row_length : our reduction ratio
+  4.  K = floor(alpha)
+  5.  row_pooling_sequence : this is the result list of pool boundary rows
+
+  Then, row_pooling_sequence should satisfy:
+
+  1.  a[0] = 0 : the first value of the sequence is 0
+  2.  a[end] = input_row_length : the last value of the sequence is the size
+  3.  K <= (a[i+1] - a[i]) <= K+1 : all intervals are K or K+1 size
+  4.  length(row_pooling_sequence) = output_row_length+1
+
+  For more details on fractional max pooling, see this paper: [Benjamin Graham,
+  Fractional Max-Pooling](http://arxiv.org/abs/1412.6071)
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional max pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional max pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_max_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.fractional_avg_pool"])
+@deprecation.deprecated(date=None, instructions="`seed2` and `deterministic` "
+                        "args are deprecated.  Use fractional_avg_pool_v2.")
+def fractional_avg_pool(value,
+                        pooling_ratio,
+                        pseudo_random=False,
+                        overlapping=False,
+                        deterministic=False,
+                        seed=0,
+                        seed2=0,
+                        name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  This is a deprecated version of `fractional_avg_pool`.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    deterministic: An optional `bool`.  Deprecated; use `fractional_avg_pool_v2`
+      instead.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    seed2: An optional `int`.  Deprecated; use `fractional_avg_pool_v2` instead.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                        overlapping, deterministic, seed, seed2,
+                                        name=name)
+
+
+@tf_export("nn.fractional_avg_pool", v1=[])
+def fractional_avg_pool_v2(value,
+                           pooling_ratio,
+                           pseudo_random=False,
+                           overlapping=False,
+                           seed=0,
+                           name=None):  # pylint: disable=redefined-builtin
+  r"""Performs fractional average pooling on the input.
+
+  Fractional average pooling is similar to Fractional max pooling in the pooling
+  region generation step. The only difference is that after pooling regions are
+  generated, a mean operation is performed instead of a max operation in each
+  pooling region.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, height, width, channels]`.
+    pooling_ratio: A list of `floats` that has length >= 4.  Pooling ratio for
+      each dimension of `value`, currently only supports row and col dimension
+      and should be >= 1.0. For example, a valid pooling ratio looks like [1.0,
+      1.44, 1.73, 1.0]. The first and last elements must be 1.0 because we don't
+      allow pooling on batch and channels dimensions.  1.44 and 1.73 are pooling
+      ratio on height and width dimensions respectively.
+    pseudo_random: An optional `bool`.  Defaults to `False`. When set to `True`,
+      generates the pooling sequence in a pseudorandom fashion, otherwise, in a
+      random fashion. Check paper [Benjamin Graham, Fractional
+      Max-Pooling](http://arxiv.org/abs/1412.6071) for difference between
+      pseudorandom and random.
+    overlapping: An optional `bool`.  Defaults to `False`.  When set to `True`,
+      it means when pooling, the values at the boundary of adjacent pooling
+      cells are used by both cells. For example:
+      `index  0  1  2  3  4`
+      `value  20 5  16 3  7`
+      If the pooling sequence is [0, 2, 4], then 16, at index 2 will be used
+      twice.  The result would be [20, 16] for fractional avg pooling.
+    seed: An optional `int`.  Defaults to `0`.  If set to be non-zero, the
+      random number generator is seeded by the given seed.  Otherwise it is
+      seeded by a random seed.
+    name: A name for the operation (optional).
+
+  Returns:
+  A tuple of `Tensor` objects (`output`, `row_pooling_sequence`,
+  `col_pooling_sequence`).
+    output: Output `Tensor` after fractional avg pooling.  Has the same type as
+      `value`.
+    row_pooling_sequence: A `Tensor` of type `int64`.
+    col_pooling_sequence: A `Tensor` of type `int64`.
+  """
+  if seed == 0:
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=False,
+                                          seed=0, seed2=0, name=name)
+  else:
+    seed1, seed2 = random_seed.get_seed(seed)
+    return gen_nn_ops.fractional_avg_pool(value, pooling_ratio, pseudo_random,
+                                          overlapping, deterministic=True,
+                                          seed=seed1, seed2=seed2, name=name)
+
+
+@tf_export(v1=["nn.conv1d"])
 @deprecation.deprecated_arg_values(
     None,
     "`NCHW` for data_format is deprecated, use `NCW` instead",
@@ -2487,6 +3475,64 @@ def conv1d(value,
     return array_ops.squeeze(result, [spatial_start_dim])
 
 
+@tf_export("nn.conv1d", v1=[])
+def conv1d_v2(input,  # pylint: disable=redefined-builtin
+              filters,
+              stride,
+              padding,
+              data_format=None,
+              name=None):
+  r"""Computes a 1-D convolution given 3-D input and filter tensors.
+
+  Given an input tensor of shape
+    [batch, in_width, in_channels]
+  if data_format is "NWC", or
+    [batch, in_channels, in_width]
+  if data_format is "NCW",
+  and a filter / kernel tensor of shape
+  [filter_width, in_channels, out_channels], this op reshapes
+  the arguments to pass them to conv2d to perform the equivalent
+  convolution operation.
+
+  Internally, this op reshapes the input tensors and invokes `tf.nn.conv2d`.
+  For example, if `data_format` does not start with "NC", a tensor of shape
+    [batch, in_width, in_channels]
+  is reshaped to
+    [batch, 1, in_width, in_channels],
+  and the filter is reshaped to
+    [1, filter_width, in_channels, out_channels].
+  The result is then reshaped back to
+    [batch, out_width, out_channels]
+  \(where out_width is a function of the stride and padding as in conv2d\) and
+  returned to the caller.
+
+  Args:
+    input: A 3D `Tensor`.  Must be of type `float16`, `float32`, or `float64`.
+    filters: A 3D `Tensor`.  Must have the same type as `input`.
+    stride: An `integer`.  The number of entries by which
+      the filter is moved right at each step.
+    padding: 'SAME' or 'VALID'
+    data_format: An optional `string` from `"NWC", "NCW"`.  Defaults
+      to `"NWC"`, the data is stored in the order of
+      [batch, in_width, in_channels].  The `"NCW"` format stores
+      data as [batch, in_channels, in_width].
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor`.  Has the same type as input.
+
+  Raises:
+    ValueError: if `data_format` is invalid.
+  """
+  return conv1d(input,  # pylint: disable=redefined-builtin
+                filters,
+                stride,
+                padding,
+                use_cudnn_on_gpu=True,
+                data_format=data_format,
+                name=name)
+
+
 def conv1d_transpose(
     value,
     filter,  # pylint: disable=redefined-builtin
@@ -2602,7 +3648,7 @@ def _calc_dilation2d_flops(graph, node):
   return ops.OpStats("flops", (output_count * filter_height * filter_width * 2))
 
 
-@tf_export("nn.erosion2d")
+@tf_export(v1=["nn.erosion2d"])
 def erosion2d(value, kernel, strides, rates, padding, name=None):
   """Computes the grayscale erosion of 4-D `value` and 3-D `kernel` tensors.
 
@@ -2661,7 +3707,76 @@ def erosion2d(value, kernel, strides, rates, padding, name=None):
             name=name))
 
 
-@tf_export("math.in_top_k", "nn.in_top_k")
+@tf_export("nn.erosion2d", v1=[])
+def erosion2d_v2(value,
+                 filters,
+                 strides,
+                 padding,
+                 data_format,
+                 dilations,
+                 name=None):
+  """Computes the grayscale erosion of 4-D `value` and 3-D `filters` tensors.
+
+  The `value` tensor has shape `[batch, in_height, in_width, depth]` and the
+  `filters` tensor has shape `[filters_height, filters_width, depth]`, i.e.,
+  each input channel is processed independently of the others with its own
+  structuring function. The `output` tensor has shape
+  `[batch, out_height, out_width, depth]`. The spatial dimensions of the
+  output tensor depend on the `padding` algorithm. We currently only support the
+  default "NHWC" `data_format`.
+
+  In detail, the grayscale morphological 2-D erosion is given by:
+
+      output[b, y, x, c] =
+         min_{dy, dx} value[b,
+                            strides[1] * y - dilations[1] * dy,
+                            strides[2] * x - dilations[2] * dx,
+                            c] -
+                      filters[dy, dx, c]
+
+  Duality: The erosion of `value` by the `filters` is equal to the negation of
+  the dilation of `-value` by the reflected `filters`.
+
+  Args:
+    value: A `Tensor`. 4-D with shape `[batch, in_height, in_width, depth]`.
+    filters: A `Tensor`. Must have the same type as `value`.
+      3-D with shape `[filters_height, filters_width, depth]`.
+    strides: A list of `ints` that has length `>= 4`.
+      1-D of length 4. The stride of the sliding window for each dimension of
+      the input tensor. Must be: `[1, stride_height, stride_width, 1]`.
+    padding: A `string` from: `"SAME", "VALID"`.
+      The type of padding algorithm to use.
+    data_format: A `string`, only `"NHWC"` is currently supported.
+    dilations: A list of `ints` that has length `>= 4`.
+      1-D of length 4. The input stride for atrous morphological dilation.
+      Must be: `[1, rate_height, rate_width, 1]`.
+    name: A name for the operation (optional). If not specified "erosion2d"
+      is used.
+
+  Returns:
+    A `Tensor`. Has the same type as `value`.
+    4-D with shape `[batch, out_height, out_width, depth]`.
+
+  Raises:
+    ValueError: If the `value` depth does not match `filters`' shape, or if
+      padding is other than `'VALID'` or `'SAME'`.
+  """
+  if data_format != "NHWC":
+    raise ValueError("Data formats other than NHWC are not yet supported")
+
+  with ops.name_scope(name, "erosion2d", [value, filters]) as name:
+    # Reduce erosion to dilation by duality.
+    return math_ops.negative(
+        gen_nn_ops.dilation2d(
+            input=math_ops.negative(value),
+            filter=array_ops.reverse_v2(filters, [0, 1]),
+            strides=strides,
+            rates=dilations,
+            padding=padding,
+            name=name))
+
+
+@tf_export(v1=["math.in_top_k", "nn.in_top_k"])
 def in_top_k(predictions, targets, k, name=None):
   r"""Says whether the targets are in the top `K` predictions.
 
@@ -2693,3 +3808,17 @@ def in_top_k(predictions, targets, k, name=None):
   """
   with ops.name_scope(name, "in_top_k"):
     return gen_nn_ops.in_top_kv2(predictions, targets, k, name=name)
+
+
+@tf_export("math.in_top_k", "nn.in_top_k", v1=[])
+def in_top_k_v2(targets, predictions, k, name=None):
+  return in_top_k(predictions, targets, k, name)
+
+
+in_top_k_v2.__doc__ = in_top_k.__doc__
+
+
+tf_export(v1=["nn.quantized_avg_pool"])(gen_nn_ops.quantized_avg_pool)
+tf_export(v1=["nn.quantized_conv2d"])(gen_nn_ops.quantized_conv2d)
+tf_export(v1=["nn.quantized_relu_x"])(gen_nn_ops.quantized_relu_x)
+tf_export(v1=["nn.quantized_max_pool"])(gen_nn_ops.quantized_max_pool)
diff --git a/tensorflow/python/ops/nn_test.py b/tensorflow/python/ops/nn_test.py
index 152b2020ebb5d7fa7137a33de9a5493d00f1c58c..82fab741830fddd4ee0ba5c8e2644702ec199b4d 100644
--- a/tensorflow/python/ops/nn_test.py
+++ b/tensorflow/python/ops/nn_test.py
@@ -49,36 +49,39 @@ class ZeroFractionTest(test_lib.TestCase):
     nonzeros = np.count_nonzero(x.flatten())
     return 1.0 - nonzeros / total_elements
 
+  @test_util.run_deprecated_v1
   def testZeroFraction(self):
     x_shape = [5, 17]
     x_np = np.random.randint(0, 2, size=x_shape).astype(np.float32)
     y_np = self._ZeroFraction(x_np)
-    with self.cached_session():
-      x_tf = constant_op.constant(x_np)
-      x_tf.set_shape(x_shape)
-      y_tf = nn_impl.zero_fraction(x_tf)
-      y_tf_np = y_tf.eval()
+
+    x_tf = constant_op.constant(x_np)
+    x_tf.set_shape(x_shape)
+    y_tf = nn_impl.zero_fraction(x_tf)
+    y_tf_np = self.evaluate(y_tf)
+
     eps = 1e-8
     self.assertAllClose(y_tf_np, y_np, eps)
 
+  @test_util.run_deprecated_v1
   def testZeroFractionEmpty(self):
-    with self.cached_session():
-      x = np.zeros(0)
-      y = nn_impl.zero_fraction(x).eval()
-      self.assertTrue(np.isnan(y))
+    x = np.zeros(0)
+    y = self.evaluate(nn_impl.zero_fraction(x))
+    self.assertTrue(np.isnan(y))
 
+  @test_util.run_deprecated_v1
   def testZeroFraction2_27Zeros(self):
     sparsity = nn_impl.zero_fraction(
         array_ops.zeros([int(2**27 * 1.01)], dtype=dtypes.int8))
-    with self.cached_session():
-      self.assertAllClose(1.0, sparsity.eval())
+    self.assertAllClose(1.0, self.evaluate(sparsity))
 
+  @test_util.run_deprecated_v1
   def testZeroFraction2_27Ones(self):
     sparsity = nn_impl.zero_fraction(
         array_ops.ones([int(2**27 * 1.01)], dtype=dtypes.int8))
-    with self.cached_session():
-      self.assertAllClose(0.0, sparsity.eval())
+    self.assertAllClose(0.0, self.evaluate(sparsity))
 
+  @test_util.run_deprecated_v1
   def testUnknownSize(self):
     value = array_ops.placeholder(dtype=dtypes.float32)
     sparsity = nn_impl.zero_fraction(value)
@@ -103,8 +106,8 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     x_np = np.random.randn(*x_shape).astype(np.float32)
     y_np = self._softmax(x_np)
     x_tf = constant_op.constant(x_np)
-    y_tf = nn_ops.softmax(x_tf)
-    y_tf_last_dim = nn_ops.softmax(x_tf, 1)
+    y_tf = nn_ops.softmax_v2(x_tf)
+    y_tf_last_dim = nn_ops.softmax_v2(x_tf, 1)
     y_tf_np = self.evaluate(y_tf)
     y_tf_last_dim_np = self.evaluate(y_tf_last_dim)
     eps = 1e-3
@@ -113,9 +116,9 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
 
   def testSoftmaxAxes(self):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
-    x_neg_axis = nn_ops.softmax(arr, axis=-2)
-    y_pos_axis = nn_ops.softmax(arr, axis=0)
-    z_gt_axis = nn_ops.softmax(arr, axis=0)
+    x_neg_axis = nn_ops.softmax_v2(arr, axis=-2)
+    y_pos_axis = nn_ops.softmax_v2(arr, axis=0)
+    z_gt_axis = nn_ops.softmax_v2(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -124,11 +127,12 @@ class SoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
 
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
+  @test_util.run_deprecated_v1
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
     with self.cached_session():
       x_tf = constant_op.constant(x_np)
-      y_tf = nn_ops.softmax(x_tf)
+      y_tf = nn_ops.softmax_v2(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                     x_shape)
     eps = 2e-8
@@ -159,6 +163,7 @@ class LogPoissonLossTest(test_lib.TestCase):
     self.assertAllClose(y_tf_np, y_np, eps)
     self.assertAllClose(y_tf_np_stirling, y_np_stirling, eps)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [5, 10]
     x_np = np.random.randn(*x_shape).astype(np.float64)
@@ -191,16 +196,16 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     x_np = np.random.randn(*x_shape).astype(np.float32)
     y_np = self._log_softmax(x_np)
     x_tf = constant_op.constant(x_np)
-    y_tf = nn_ops.log_softmax(x_tf)
+    y_tf = nn_ops.log_softmax_v2(x_tf)
     y_tf_np = self.evaluate(y_tf)
     eps = 1e-3
     self.assertAllClose(y_tf_np, y_np, eps)
 
   def testLogSoftmaxAxes(self):
     arr = np.linspace(0., 1, 12).reshape(3, 4)
-    x_neg_axis = nn_ops.log_softmax(arr, axis=-2)
-    y_pos_axis = nn_ops.log_softmax(arr, axis=0)
-    z_gt_axis = nn_ops.log_softmax(arr, axis=0)
+    x_neg_axis = nn_ops.log_softmax_v2(arr, axis=-2)
+    y_pos_axis = nn_ops.log_softmax_v2(arr, axis=0)
+    z_gt_axis = nn_ops.log_softmax_v2(arr, axis=0)
     x_neg_axis_tf = self.evaluate(x_neg_axis)
     y_pos_axis_tf = self.evaluate(y_pos_axis)
     z_gt_axis_tf = self.evaluate(z_gt_axis)
@@ -209,11 +214,12 @@ class LogSoftmaxTest(test_lib.TestCase, parameterized.TestCase):
     self.assertAllClose(y_pos_axis_tf, z_gt_axis_tf, eps)
 
   @parameterized.parameters(((5, 10),), ((2, 3, 4),))
+  @test_util.run_deprecated_v1
   def testGradient(self, x_shape):
     x_np = np.random.randn(*x_shape).astype(np.float64)
     with self.cached_session():
       x_tf = constant_op.constant(x_np)
-      y_tf = nn_ops.log_softmax(x_tf)
+      y_tf = nn_ops.log_softmax_v2(x_tf)
       err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                     x_shape)
     eps = 1e-7
@@ -231,6 +237,7 @@ class L2LossTest(test_lib.TestCase):
       value = self.evaluate(l2loss)
       self.assertAllClose(7.0, value)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     x_shape = [20, 7, 3]
     np.random.seed(1)  # Make it reproducible.
@@ -264,7 +271,7 @@ class L2NormalizeTest(test_lib.TestCase):
     for dim in range(len(x_shape)):
       y_np = self._l2Normalize(x_np, dim)
       x_tf = constant_op.constant(x_np, name="x")
-      y_tf = nn_impl.l2_normalize(x_tf, dim)
+      y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
       self.assertAllClose(y_np, self.evaluate(y_tf))
 
   @test_util.run_in_graph_and_eager_modes
@@ -275,9 +282,10 @@ class L2NormalizeTest(test_lib.TestCase):
     dim = [1, 2]
     y_np = self._l2Normalize(x_np, dim)
     x_tf = constant_op.constant(x_np, name="x")
-    y_tf = nn_impl.l2_normalize(x_tf, dim)
+    y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
     self.assertAllClose(y_np, self.evaluate(y_tf))
 
+  @test_util.run_deprecated_v1
   def testL2NormalizeGradient(self):
     x_shape = [20, 7, 3]
     np.random.seed(1)
@@ -285,7 +293,7 @@ class L2NormalizeTest(test_lib.TestCase):
     for dim in range(len(x_shape)):
       with self.cached_session():
         x_tf = constant_op.constant(x_np, name="x")
-        y_tf = nn_impl.l2_normalize(x_tf, dim)
+        y_tf = nn_impl.l2_normalize_v2(x_tf, dim)
         err = gradient_checker.compute_gradient_error(x_tf, x_shape, y_tf,
                                                       x_shape)
       print("L2Normalize gradient err = %g " % err)
@@ -302,19 +310,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob)
-        final_count = 0
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob)
+      final_count = 0
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -330,19 +337,18 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
@@ -355,18 +361,17 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 30
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          # Verifies that each y column as only one type of activation.
-          for i in xrange(x_dim):
-            sorted_value = np.unique(np.sort(value[i, :]))
-            self.assertEqual(sorted_value.size, 1)
-
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[x_dim, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        # Verifies that each y column as only one type of activation.
+        for i in xrange(x_dim):
+          sorted_value = np.unique(np.sort(value[i, :]))
+          self.assertEqual(sorted_value.size, 1)
+
+  @test_util.run_deprecated_v1
   def testDropoutPlaceholderKeepProb(self):
     # Runs dropout with 0-1 tensor 10 times, sum the number of ones and validate
     # that it is producing approximately the right number of ones over a large
@@ -395,6 +400,7 @@ class DropoutTest(test_lib.TestCase):
       print(rel_error)
       self.assertTrue(rel_error < 0.15)
 
+  @test_util.run_deprecated_v1
   def testShapedDropoutUnknownShape(self):
     x_dim = 40
     y_dim = 30
@@ -409,26 +415,26 @@ class DropoutTest(test_lib.TestCase):
     y_dim = 3
     num_iter = 10
     for keep_prob in [0.1, 0.5, 0.8]:
-      with self.cached_session():
-        t = constant_op.constant(
-            1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
-        # Set noise_shape=[None, 1] which means [x_dim, 1].
-        dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
-        self.assertEqual([x_dim, y_dim], dropout.get_shape())
-        final_count = 0
-        for _ in xrange(0, num_iter):
-          value = dropout.eval()
-          final_count += np.count_nonzero(value)
-          # Verifies that there are only two values: 0 and 1/keep_prob.
-          sorted_value = np.unique(np.sort(value))
-          self.assertEqual(0, sorted_value[0])
-          self.assertAllClose(1 / keep_prob, sorted_value[1])
+      t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+      # Set noise_shape=[None, 1] which means [x_dim, 1].
+      dropout = nn_ops.dropout(t, keep_prob, noise_shape=[None, 1])
+      self.assertEqual([x_dim, y_dim], dropout.get_shape())
+      final_count = 0
+      for _ in xrange(0, num_iter):
+        value = self.evaluate(dropout)
+        final_count += np.count_nonzero(value)
+        # Verifies that there are only two values: 0 and 1/keep_prob.
+        sorted_value = np.unique(np.sort(value))
+        self.assertEqual(0, sorted_value[0])
+        self.assertAllClose(1 / keep_prob, sorted_value[1])
+
       # Check that we are in the 15% error range
       expected_count = x_dim * y_dim * keep_prob * num_iter
       rel_error = math.fabs(final_count - expected_count) / expected_count
       print(rel_error)
       self.assertTrue(rel_error < 0.15)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepProb(self):
     x_dim = 40
     y_dim = 30
@@ -444,6 +450,19 @@ class DropoutTest(test_lib.TestCase):
     with self.assertRaises(ValueError):
       nn_ops.dropout(t, array_ops.placeholder(dtypes.float32, shape=[2]))
 
+  @test_util.run_deprecated_v1
+  def testInvalidRate(self):
+    x_dim = 40
+    y_dim = 30
+    t = constant_op.constant(1.0, shape=[x_dim, y_dim], dtype=dtypes.float32)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, -1.0)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, 1.1)
+    with self.assertRaises(ValueError):
+      nn_ops.dropout_v2(t, [0.0, 1.0])
+
+  @test_util.run_deprecated_v1
   def testShapedDropoutShapeError(self):
     # Runs shaped dropout and verifies an error is thrown on misshapen noise.
     x_dim = 40
@@ -466,9 +485,11 @@ class DropoutTest(test_lib.TestCase):
 
   def testNoDropoutFast(self):
     x = array_ops.zeros((5,))
-    for p in 1, constant_op.constant(1.0):
-      y = nn_ops.dropout(x, keep_prob=p)
-      self.assertTrue(x is y)
+    y = nn_ops.dropout(x, keep_prob=1)
+    self.assertTrue(x is y)
+
+    y = nn_ops.dropout_v2(x, rate=0)
+    self.assertTrue(x is y)
 
   def testDropoutWithIntegerInputs(self):
     x = constant_op.constant([1, 1, 1, 1, 1])
@@ -563,78 +584,78 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
           initializer=constant_op.constant(biases))
       with self.session(graph=g) as sess:
         variables.global_variables_initializer().run()
-        return sess.run([list(sharded_weights), list(sharded_biases)])
+        return self.evaluate([list(sharded_weights), list(sharded_biases)])
 
   def testShapes(self):
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
-        self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertEqual(exp_logits.shape, got_logits.shape, self._eps)
+      self.assertEqual(exp_labels.shape, got_labels.shape, self._eps)
 
   def testBasic(self):
     """Without accidental hit removal or subtract_log_q."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_basic_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_basic_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testAccidentalHitRemoval(self):
     """With accidental hit removal, no subtract_log_q."""
@@ -642,118 +663,118 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     num_classes = 5
     batch_size = 3
     sampled = [1, 0, 2, 3]
-    with self.cached_session():
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, _,
-         _) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=sampled,
-             subtract_log_q=False)
-        logits_tensor, _ = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=len(sampled),
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=True,
-            partition_strategy="div",
-            name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
-        # Test that the exponentiated logits of accidental hits are near 0.
-        # First we need to find the hits in this random test run:
-        labels_reshape = labels.reshape((batch_size, num_true))
-        got_logits = logits_tensor.eval()
-        for row in xrange(batch_size):
-          row_labels = labels_reshape[row, :]
-          for col in xrange(len(sampled)):
-            if sampled[col] in row_labels:
-              # We need to add the num_true_test offset into logits_*
-              self.assertNear(
-                  np.exp(got_logits[row, col + num_true]), 0., self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, _,
+       _) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=sampled,
+           subtract_log_q=False)
+      logits_tensor, _ = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=len(sampled),
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=True,
+          partition_strategy="div",
+          name="sampled_logits_accidental_hit_removal_num_true_%d" % num_true)
+      # Test that the exponentiated logits of accidental hits are near 0.
+      # First we need to find the hits in this random test run:
+      labels_reshape = labels.reshape((batch_size, num_true))
+      got_logits = self.evaluate(logits_tensor)
+      for row in xrange(batch_size):
+        row_labels = labels_reshape[row, :]
+        for col in xrange(len(sampled)):
+          if sampled[col] in row_labels:
+            # We need to add the num_true_test offset into logits_*
+            self.assertNear(
+                np.exp(got_logits[row, col + num_true]), 0., self._eps)
 
   def testSubtractLogQ(self):
     """With subtract_log_q, no accidental hit removal."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=True)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=constant_op.constant(weights),
-            biases=constant_op.constant(biases),
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=True,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=True)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=constant_op.constant(weights),
+          biases=constant_op.constant(biases),
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=True,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_subtract_log_q_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testSharded(self):
     """With sharded weights and sharded biases."""
     np.random.seed(0)
     num_classes = 5
     batch_size = 3
-    with self.cached_session() as sess:
-      for num_true in range(1, 5):
-        labels = np.random.randint(
-            low=0, high=num_classes, size=batch_size * num_true)
-        (weights, biases, hidden_acts, sampled_vals, exp_logits,
-         exp_labels) = self._GenerateTestData(
-             num_classes=num_classes,
-             dim=10,
-             batch_size=batch_size,
-             num_true=num_true,
-             labels=labels,
-             sampled=[1, 0, 2, 3],
-             subtract_log_q=False)
-        weight_shards, bias_shards = self._ShardTestEmbeddings(
-            weights, biases, num_shards=3)
-        logits_tensor, labels_tensor = _compute_sampled_logits(
-            weights=[constant_op.constant(shard) for shard in weight_shards],
-            biases=[constant_op.constant(shard) for shard in bias_shards],
-            labels=constant_op.constant(
-                labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
-            inputs=constant_op.constant(hidden_acts),
-            num_sampled=4,
-            num_classes=num_classes,
-            num_true=num_true,
-            sampled_values=sampled_vals,
-            subtract_log_q=False,
-            remove_accidental_hits=False,
-            partition_strategy="div",
-            name="sampled_logits_sharded_num_true_%d" % num_true)
-        got_logits, got_labels = sess.run([logits_tensor, labels_tensor])
-        self.assertAllClose(exp_logits, got_logits, self._eps)
-        self.assertAllClose(exp_labels, got_labels, self._eps)
+
+    for num_true in range(1, 5):
+      labels = np.random.randint(
+          low=0, high=num_classes, size=batch_size * num_true)
+      (weights, biases, hidden_acts, sampled_vals, exp_logits,
+       exp_labels) = self._GenerateTestData(
+           num_classes=num_classes,
+           dim=10,
+           batch_size=batch_size,
+           num_true=num_true,
+           labels=labels,
+           sampled=[1, 0, 2, 3],
+           subtract_log_q=False)
+      weight_shards, bias_shards = self._ShardTestEmbeddings(
+          weights, biases, num_shards=3)
+      logits_tensor, labels_tensor = _compute_sampled_logits(
+          weights=[constant_op.constant(shard) for shard in weight_shards],
+          biases=[constant_op.constant(shard) for shard in bias_shards],
+          labels=constant_op.constant(
+              labels, dtype=dtypes.int64, shape=(batch_size, num_true)),
+          inputs=constant_op.constant(hidden_acts),
+          num_sampled=4,
+          num_classes=num_classes,
+          num_true=num_true,
+          sampled_values=sampled_vals,
+          subtract_log_q=False,
+          remove_accidental_hits=False,
+          partition_strategy="div",
+          name="sampled_logits_sharded_num_true_%d" % num_true)
+      got_logits, got_labels = self.evaluate([logits_tensor, labels_tensor])
+      self.assertAllClose(exp_logits, got_logits, self._eps)
+      self.assertAllClose(exp_labels, got_labels, self._eps)
 
   def testNCELoss(self):
     # A simple test to verify the numerics.
@@ -782,35 +803,32 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_nce_loss = np.sum(
         _SigmoidCrossEntropyWithLogits(exp_logits, exp_labels), 1)
 
-    with self.cached_session():
-      got_nce_loss = nn_impl.nce_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_nce_loss = nn_impl.nce_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_nce_loss, got_nce_loss.eval(), 1e-4)
+    got_nce_loss = nn_impl.nce_loss_v2(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals)
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_nce_loss = nn_impl.nce_loss_v2(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals)
+
+    self.assertAllClose(exp_nce_loss, self.evaluate(got_nce_loss), 1e-4)
 
   def testSampledSoftmaxLoss(self):
     # A simple test to verify the numerics.
@@ -839,39 +857,36 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=constant_op.constant(weights),
-          biases=constant_op.constant(biases),
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
-
-      # Test with sharded weights and sharded biases.
-      weight_shards, bias_shards = self._ShardTestEmbeddings(
-          weights, biases, num_shards=3)
-      got_sampled_softmax_loss = nn_impl.sampled_softmax_loss(
-          weights=[constant_op.constant(shard) for shard in weight_shards],
-          biases=[constant_op.constant(shard) for shard in bias_shards],
-          labels=constant_op.constant(labels, shape=(batch_size, 1)),
-          inputs=constant_op.constant(hidden_acts),
-          num_sampled=4,
-          num_classes=num_classes,
-          num_true=1,
-          sampled_values=sampled_vals,
-          remove_accidental_hits=False,
-          partition_strategy="div")
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-4)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
+        weights=constant_op.constant(weights),
+        biases=constant_op.constant(biases),
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
+
+    # Test with sharded weights and sharded biases.
+    weight_shards, bias_shards = self._ShardTestEmbeddings(
+        weights, biases, num_shards=3)
+    got_sampled_softmax_loss = nn_impl.sampled_softmax_loss_v2(
+        weights=[constant_op.constant(shard) for shard in weight_shards],
+        biases=[constant_op.constant(shard) for shard in bias_shards],
+        labels=constant_op.constant(labels, shape=(batch_size, 1)),
+        inputs=constant_op.constant(hidden_acts),
+        num_sampled=4,
+        num_classes=num_classes,
+        num_true=1,
+        sampled_values=sampled_vals,
+        remove_accidental_hits=False)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-4)
 
   def testSampledSoftmaxLossBf16(self):
     # A simple test to verify the numerics for bfloat16.
@@ -900,29 +915,29 @@ class ComputeSampledLogitsTest(test_lib.TestCase):
     exp_sampled_softmax_loss = _SoftmaxCrossEntropyWithLogits(
         exp_logits, exp_labels)
 
-    with self.cached_session():
-      true_exp_bf16 = np.full(
-          [batch_size, 1], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_exp_bf16 = np.full(
-          [len(sampled)], fill_value=0.5, dtype=dtypes.bfloat16.as_numpy_dtype)
-      sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
-
-      got_sampled_softmax_loss = math_ops.cast(
-          nn_impl.sampled_softmax_loss(
-              weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
-              biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
-              labels=constant_op.constant(
-                  labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
-              inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
-              num_sampled=4,
-              num_classes=num_classes,
-              num_true=1,
-              sampled_values=sampled_vals_bf16,
-              remove_accidental_hits=False,
-              partition_strategy="div"), dtypes.float32)
-
-      self.assertAllClose(exp_sampled_softmax_loss,
-                          got_sampled_softmax_loss.eval(), 1e-1)
+    true_exp_bf16 = np.full([batch_size, 1],
+                            fill_value=0.5,
+                            dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_exp_bf16 = np.full([len(sampled)],
+                               fill_value=0.5,
+                               dtype=dtypes.bfloat16.as_numpy_dtype)
+    sampled_vals_bf16 = (sampled, true_exp_bf16, sampled_exp_bf16)
+
+    got_sampled_softmax_loss = math_ops.cast(
+        nn_impl.sampled_softmax_loss_v2(
+            weights=constant_op.constant(weights, dtype=dtypes.bfloat16),
+            biases=constant_op.constant(biases, dtype=dtypes.bfloat16),
+            labels=constant_op.constant(
+                labels, shape=(batch_size, 1), dtype=dtypes.bfloat16),
+            inputs=constant_op.constant(hidden_acts, dtype=dtypes.bfloat16),
+            num_sampled=4,
+            num_classes=num_classes,
+            num_true=1,
+            sampled_values=sampled_vals_bf16,
+            remove_accidental_hits=False), dtypes.float32)
+
+    self.assertAllClose(exp_sampled_softmax_loss,
+                        self.evaluate(got_sampled_softmax_loss), 1e-1)
 
 
 class CReluTest(test_lib.TestCase):
@@ -931,9 +946,9 @@ class CReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.concatenate([x * (x > 0), -x * (x < 0)], axis=1)
-    with self.cached_session():
-      z = nn_ops.crelu(constant_op.constant(x)).eval()
-      self.assertAllClose(y, z, 1e-4)
+
+    z = self.evaluate(nn_ops.crelu(constant_op.constant(x)))
+    self.assertAllClose(y, z, 1e-4)
 
 
 class ReluTest(test_lib.TestCase):
@@ -942,10 +957,11 @@ class ReluTest(test_lib.TestCase):
     np.random.seed(1)  # Make it reproducible.
     x = np.random.randn(3, 4).astype(np.float32)
     y = np.maximum(x, 0.0)
-    with self.cached_session():
-      z = nn_ops.relu(constant_op.constant(x)).eval()
-      self.assertAllEqual(y, z)
 
+    z = self.evaluate(nn_ops.relu(constant_op.constant(x)))
+    self.assertAllEqual(y, z)
+
+  @test_util.run_deprecated_v1
   def testNaNs(self):
     # Test that relu(nan) = nan for various sizes.
     for i in range(18):
@@ -967,22 +983,26 @@ class LeakyReluTest(test_lib.TestCase):
 
     outputs = nn_ops.leaky_relu(inputs)
     self.assertEquals(inputs.shape, outputs.shape)
-    with self.cached_session() as sess:
-      inputs, outputs = sess.run([inputs, outputs])
+
+    inputs, outputs = self.evaluate([inputs, outputs])
+
     self.assertGreaterEqual(outputs.min(), 0.0)
     self.assertLessEqual(outputs.max(), 1.0)
     self.assertAllClose(inputs, outputs)
 
+  @test_util.run_deprecated_v1
   def testValues(self):
     for dtype in [np.int32, np.int64, np.float16, np.float32, np.float64]:
       np_values = np.array([-2, -1, 0, 1, 2], dtype=dtype)
       outputs = nn_ops.leaky_relu(constant_op.constant(np_values))
-      with self.cached_session() as sess:
-        outputs = sess.run(outputs)
+
+      outputs = self.evaluate(outputs)
+
       tol = 2e-3 if dtype == np.float16 else 1e-6
       self.assertAllClose(
           outputs, [-0.4, -0.2, 0.0, 1.0, 2.0], rtol=tol, atol=tol)
 
+  @test_util.run_deprecated_v1
   def testName(self):
     np_values = np.array([-2, -1, 0, 1, 2], dtype=np.float64)
     outputs_with_name_set = nn_ops.leaky_relu(
@@ -996,6 +1016,7 @@ class LeakyReluTest(test_lib.TestCase):
 
 class SwishTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testValues(self):
     np_values = np.array(
         [np.linspace(-10.0, 0.0, 100),
@@ -1004,11 +1025,13 @@ class SwishTest(test_lib.TestCase):
     tf_values = constant_op.constant(np_values)
     actual_tf_outputs = nn_impl.swish(tf_values)
     expected_tf_outputs = tf_values * math_ops.sigmoid(tf_values)
-    with self.cached_session() as sess:
-      actual_outputs, expected_outputs = sess.run(
-          [actual_tf_outputs, expected_tf_outputs])
+
+    actual_outputs, expected_outputs = self.evaluate(
+        [actual_tf_outputs, expected_tf_outputs])
+
     self.assertAllClose(actual_outputs, expected_outputs)
 
+  @test_util.run_deprecated_v1
   def testGradients(self):
     shape = [5, 3, 4]
     sigma = 5
@@ -1039,8 +1062,8 @@ class MomentsTest(test_lib.TestCase):
             with self.session(graph=g) as sess:
               inputs = constant_op.constant(
                   input_values, shape=input_shape, dtype=dtypes.float32)
-              mean, variance = nn_impl.moments(
-                  inputs, moments_axes, keep_dims=keep_dims)
+              mean, variance = nn_impl.moments_v2(
+                  inputs, moments_axes, keepdims=keep_dims)
 
               if check_gradients:
                 err = gradient_checker.compute_gradient_error(
@@ -1051,7 +1074,7 @@ class MomentsTest(test_lib.TestCase):
                 self.assertLess(err, 1e-3)
 
               # Evaluate.
-              [mean, variance] = sess.run([mean, variance])
+              [mean, variance] = self.evaluate([mean, variance])
               # Make sure that there are no NaNs
               self.assertFalse(np.isnan(mean).any())
               self.assertFalse(np.isnan(variance).any())
@@ -1094,9 +1117,9 @@ class DataFormatDimMapTest(test_lib.TestCase):
   def _test(self, x_val, y_val_expected):
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x)
-    with self.cached_session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
-      self.assertAllEqual(y_val, y_val_expected)
+
+    y_val = self.evaluate(y)
+    self.assertAllEqual(y_val, y_val_expected)
 
   def test(self):
     self._test(0, 0)
@@ -1117,8 +1140,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 2, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="NCHW")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoHWNC(self):
@@ -1126,8 +1149,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [2, 0, 1, 3, 2, 0, 1, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testNHWCtoWHCN(self):
@@ -1135,8 +1158,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 1, 0, 2, 3, 1, 0, 2]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="NHWC", dst_format="WHCN")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
   def testArbitraryASCII(self):
@@ -1144,8 +1167,8 @@ class DataFormatDimMapTest(test_lib.TestCase):
     y_val_expected = [3, 2, 1, 0, 3, 2, 1, 0]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_dim_map(x, src_format="qwer", dst_format="rewq")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, y_val_expected)
 
 
@@ -1155,64 +1178,64 @@ class DataFormatVectorPermuteTest(test_lib.TestCase):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 3, 4, 9])
 
   def testNCHWToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [7, 9, 3, 4])
 
   def testNHWCToHWNC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [4, 9, 7, 3])
 
   def testHWNCToNHWC(self):
     x_val = [7, 4, 9, 3]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [9, 7, 4, 3])
 
   def testNHWCToNCHW2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x)
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [5, 1], [9, 3], [4, 5]])
 
   def testNHWCToHWNC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NHWC", dst_format="HWNC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[9, 3], [4, 5], [7, 4], [5, 1]])
 
   def testHWNCToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="HWNC", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[4, 5], [7, 4], [9, 3], [5, 1]])
 
   def testNCHWToNHWC2D(self):
     x_val = [[7, 4], [9, 3], [4, 5], [5, 1]]
     x = constant_op.constant(x_val)
     y = nn_ops.data_format_vec_permute(x, src_format="NCHW", dst_format="NHWC")
-    with self.session(use_gpu=test_lib.is_gpu_available()) as sess:
-      y_val = sess.run(y)
+    with test_util.use_gpu():
+      y_val = self.evaluate(y)
       self.assertAllEqual(y_val, [[7, 4], [4, 5], [5, 1], [9, 3]])
 
 
diff --git a/tensorflow/python/ops/nn_xent_test.py b/tensorflow/python/ops/nn_xent_test.py
index 57ce4fd0a995f5fe04de3c8e9bbc371412687c32..3e5c198fc6a6658c7dcdc3bf3ead9df65db63607 100644
--- a/tensorflow/python/ops/nn_xent_test.py
+++ b/tensorflow/python/ops/nn_xent_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import gradient_checker
 from tensorflow.python.ops import gradients_impl
 from tensorflow.python.ops import nn_impl
@@ -53,6 +54,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     losses = np.array(self._SigmoidCrossEntropyWithLogits(x, y)).reshape(*sizes)
     return logits, targets, losses
 
+  @test_util.run_deprecated_v1
   def testConstructionNamed(self):
     with self.cached_session():
       logits, targets, _ = self._Inputs()
@@ -68,7 +70,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testLogisticOutputMultiDim(self):
@@ -79,9 +81,10 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
           loss = nn_impl.sigmoid_cross_entropy_with_logits(
               labels=targets, logits=logits)
           np_loss = np.array(losses).astype(np.float32)
-          tf_loss = loss.eval()
+          tf_loss = self.evaluate(loss)
         self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     sizes = [4, 2]
     with self.cached_session():
@@ -92,6 +95,7 @@ class SigmoidCrossEntropyWithLogitsTest(test.TestCase):
     print("logistic loss gradient err = ", err)
     self.assertLess(err, 1e-7)
 
+  @test_util.run_deprecated_v1
   def testGradientAtZero(self):
     with self.cached_session():
       logits = constant_op.constant([0.0, 0.0], dtype=dtypes.float64)
@@ -129,6 +133,7 @@ class WeightedCrossEntropyTest(test.TestCase):
     losses = np.array(self._WeightedCrossEntropy(x, y, q)).reshape(*sizes)
     return logits, targets, q, losses
 
+  @test_util.run_deprecated_v1
   def testConstructionNamed(self):
     with self.cached_session():
       logits, targets, pos_weight, _ = self._Inputs()
@@ -143,7 +148,7 @@ class WeightedCrossEntropyTest(test.TestCase):
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
   def testOutputMultiDim(self):
@@ -154,9 +159,10 @@ class WeightedCrossEntropyTest(test.TestCase):
         loss = nn_impl.weighted_cross_entropy_with_logits(
             targets=targets, logits=logits, pos_weight=pos_weight)
         np_loss = np.array(losses).astype(np.float32)
-        tf_loss = loss.eval()
+        tf_loss = self.evaluate(loss)
       self.assertAllClose(np_loss, tf_loss, atol=0.001)
 
+  @test_util.run_deprecated_v1
   def testGradient(self):
     sizes = [4, 2]
     with self.cached_session():
diff --git a/tensorflow/python/ops/numerics.py b/tensorflow/python/ops/numerics.py
index 1a235de90cf59cf5cde55d967c8ccd73ff0303a0..0ab39ad0a8edd60c78a6bea3ae31e4f025c9e0bd 100644
--- a/tensorflow/python/ops/numerics.py
+++ b/tensorflow/python/ops/numerics.py
@@ -28,9 +28,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(
-    "debugging.assert_all_finite",
-    v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
+@tf_export(v1=["debugging.assert_all_finite", "verify_tensor_all_finite"])
 @deprecation.deprecated_endpoints("verify_tensor_all_finite")
 def verify_tensor_all_finite(t, msg, name=None):
   """Assert that the tensor does not contain any NaN's or Inf's.
@@ -43,11 +41,26 @@ def verify_tensor_all_finite(t, msg, name=None):
   Returns:
     Same tensor as `t`.
   """
-  with ops.name_scope(name, "VerifyFinite", [t]) as name:
-    t = ops.convert_to_tensor(t, name="t")
-    with ops.colocate_with(t):
-      verify_input = array_ops.check_numerics(t, message=msg)
-      out = control_flow_ops.with_dependencies([verify_input], t)
+  return verify_tensor_all_finite_v2(t, msg, name)
+
+
+@tf_export("debugging.assert_all_finite", v1=[])
+def verify_tensor_all_finite_v2(x, message, name=None):
+  """Assert that the tensor does not contain any NaN's or Inf's.
+
+  Args:
+    x: Tensor to check.
+    message: Message to log on failure.
+    name: A name for this operation (optional).
+
+  Returns:
+    Same tensor as `x`.
+  """
+  with ops.name_scope(name, "VerifyFinite", [x]) as name:
+    x = ops.convert_to_tensor(x, name="x")
+    with ops.colocate_with(x):
+      verify_input = array_ops.check_numerics(x, message=message)
+      out = control_flow_ops.with_dependencies([verify_input], x)
   return out
 
 
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py b/tensorflow/python/ops/optional_grad.py
similarity index 62%
rename from tensorflow/tools/compatibility/testdata/test_file_v1_10.py
rename to tensorflow/python/ops/optional_grad.py
index e5ca8d3e2e24161310fe9878b349dfd524d31efc..0d1eae3cda4bd9b6558313abc8abe7f4e815b816 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v1_10.py
+++ b/tensorflow/python/ops/optional_grad.py
@@ -12,23 +12,22 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for tf upgrader."""
+"""Gradient functions for optional ops."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import tensorflow as tf
-from tensorflow.python.framework import test_util
-from tensorflow.python.platform import test as test_lib
 
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import gen_dataset_ops
 
-class TestUpgrade(test_util.TensorFlowTestCase):
-  """Test various APIs that have been changed in 2.0."""
 
-  def testRenames(self):
-    with self.cached_session():
-      self.assertAllClose(1.04719755, tf.acos(0.5).eval())
-      self.assertAllClose(0.5, tf.rsqrt(4.0).eval())
+@ops.RegisterGradient("OptionalFromValue")
+def _OptionalFromValueGrad(op, grad):
+  return gen_dataset_ops.optional_get_value(
+      grad, [t.dtype for t in op.inputs], [t.shape for t in op.inputs])
 
-if __name__ == "__main__":
-  test_lib.main()
+
+@ops.RegisterGradient("OptionalGetValue")
+def _OptionalGetValueGrad(unused_op, *grads):
+  return gen_dataset_ops.optional_from_value(grads)
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops.py b/tensorflow/python/ops/parallel_for/control_flow_ops.py
index ead7ae5478c74aad4f67296ed68895c1f54f7333..8f652e9c5097db318a77c3cec8c6597c6bb1d87c 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops.py
@@ -17,16 +17,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.eager import function
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops.parallel_for.pfor import PFor
 from tensorflow.python.util import nest
 
 
-def for_loop(loop_fn, loop_fn_dtypes, iters):
+def for_loop(loop_fn, loop_fn_dtypes, iters, parallel_iterations=None):
   """Runs `loop_fn` `iters` times and stacks the outputs.
 
 
@@ -39,6 +43,8 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       objects. The shape of these outputs should not depend on the input.
     loop_fn_dtypes: dtypes for the outputs of loop_fn.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: The number of iterations that can be dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked output tensor objects with the same
@@ -66,11 +72,16 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
       outputs.append(ta)
     return tuple([i + 1] + outputs)
 
+  if parallel_iterations is not None:
+    extra_args = {"parallel_iterations": parallel_iterations}
+  else:
+    extra_args = {}
   ta_list = control_flow_ops.while_loop(
-      lambda i, *ta: i < iters, while_body, [0] + [
-          tensor_array_ops.TensorArray(dtype, iters)
-          for dtype in flat_loop_fn_dtypes
-      ])[1:]
+      lambda i, *ta: i < iters,
+      while_body,
+      [0] + [tensor_array_ops.TensorArray(dtype, iters)
+             for dtype in flat_loop_fn_dtypes],
+      **extra_args)[1:]
 
   # TODO(rachelim): enable this for sparse tensors
 
@@ -79,7 +90,15 @@ def for_loop(loop_fn, loop_fn_dtypes, iters):
   return nest.pack_sequence_as(loop_fn_dtypes, output)
 
 
-def pfor(loop_fn, iters):
+def _flatten_first_two_dims(x):
+  """Flattens the first two dimensions of x into a single dimension."""
+  old_shape = array_ops.shape(x)
+  new_shape = array_ops.concat([[old_shape[0] * old_shape[1]], old_shape[2:]],
+                               axis=0)
+  return array_ops.reshape(x, new_shape)
+
+
+def pfor(loop_fn, iters, parallel_iterations=None):
   """Equivalent to running `loop_fn` `iters` times and stacking the outputs.
 
   `pfor` has functionality similar to `for_loop`, i.e. running `loop_fn` `iters`
@@ -99,8 +118,8 @@ def pfor(loop_fn, iters):
       reads, etc).
     - Conversion works only on a limited set of kernels for which a converter
       has been registered.
-    - loop_fn cannot currently contain control flow operations like
-      tf.while_loop or tf.cond.
+    - loop_fn has limited support for control flow operations. tf.cond in
+      particular is not supported.
     - `loop_fn` should return nested structure of Tensors or Operations. However
       if an Operation is returned, it should have zero outputs.
     - The shape and dtype of `loop_fn` outputs should not depend on the input
@@ -109,22 +128,92 @@ def pfor(loop_fn, iters):
   Args:
     loop_fn: A function that takes an int32 scalar tf.Tensor object representing
       the iteration number, and returns a possibly nested structure of Tensor or
-      Operation objects.
+      Operation objects. Note that if setting `parallel_iterations` argument to
+      something other than None, `loop_fn` may be called more than once during
+      graph construction. So it may need to avoid mutating global state.
     iters: Number of iterations for which to run loop_fn.
+    parallel_iterations: A knob to control how many iterations are vectorized
+      and dispatched in parallel. The default value of None corresponds to
+      vectorizing all the iterations.  If `parallel_iterations` is smaller than
+      `iters`, then chunks of at most that many iterations are dispatched in
+      sequence. This knob can be used to control the total memory usage.
 
   Returns:
     Returns a nested structure of stacked tensor objects with the same nested
     structure as the output of `loop_fn`.
+  Raises:
+    ValueError: If parallel_iterations is not None and not an integer > 1.
   """
+  def f():
+    return _pfor_impl(loop_fn, iters, parallel_iterations=parallel_iterations)
+  if context.executing_eagerly():
+    f = function.defun(f)
+  return f()
+
+
+def _pfor_impl(loop_fn, iters, parallel_iterations=None):
+  """Implementation of pfor."""
   existing_ops = set(ops.get_default_graph().get_operations())
   with ops.name_scope("loop_body"):
     loop_var = array_ops.placeholder(dtypes.int32, shape=[])
     loop_fn_outputs = loop_fn(loop_var)
   new_ops = set(ops.get_default_graph().get_operations()) - existing_ops
   iters = ops.convert_to_tensor(iters)
-  with ops.name_scope("pfor"):
-    converter = PFor(loop_var, iters, new_ops)
-    outputs = []
-    for loop_fn_output in nest.flatten(loop_fn_outputs):
-      outputs.append(converter.convert(loop_fn_output))
-    return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  if parallel_iterations is not None:
+    if parallel_iterations < 1:
+      raise ValueError("parallel_iterations must be None or a positive integer")
+    if parallel_iterations == 1:
+      raise ValueError("Found parallel_iterations == 1. Use for_loop instead.")
+    iters_value = tensor_util.constant_value(iters)
+    if iters_value is not None and iters_value < parallel_iterations:
+      parallel_iterations = None
+  if parallel_iterations is None:
+    with ops.name_scope("pfor"):
+      converter = PFor(loop_var, iters, new_ops)
+      outputs = []
+      for loop_fn_output in nest.flatten(loop_fn_outputs):
+        outputs.append(converter.convert(loop_fn_output))
+      return nest.pack_sequence_as(loop_fn_outputs, outputs)
+  else:
+    num_tiled_iterations = iters // parallel_iterations
+    num_remaining_iterations = iters % parallel_iterations
+    # TODO(agarwal): Avoid calling loop_fn twice. Generate the loop body inside
+    # a tf.function and extract the graph from there to vectorize it.
+    with ops.name_scope("pfor_untiled"):
+      converter = PFor(loop_var, num_remaining_iterations, new_ops)
+      remaining_outputs = []
+      flattened_loop_fn_outputs = nest.flatten(loop_fn_outputs)
+      for loop_fn_output in flattened_loop_fn_outputs:
+        remaining_outputs.append(converter.convert(loop_fn_output))
+
+    with ops.name_scope("pfor_tiled"):
+      loop_fn_dtypes = [ops.convert_to_tensor(x).dtype
+                        for x in flattened_loop_fn_outputs]
+
+      def tiled_loop_body(j):
+        offset = j * parallel_iterations + num_remaining_iterations
+
+        def tiled_loop_fn(i):
+          return nest.flatten(loop_fn(i + offset))
+
+        return pfor(tiled_loop_fn, parallel_iterations)
+
+      tiled_outputs = for_loop(tiled_loop_body, loop_fn_dtypes,
+                               num_tiled_iterations, parallel_iterations=1)
+      tiled_outputs = [_flatten_first_two_dims(y) for y in tiled_outputs]
+
+    with ops.name_scope("pfor"):
+      iters_value = tensor_util.constant_value(iters)
+      if iters_value is None or iters_value % parallel_iterations:
+        outputs = control_flow_ops.cond(
+            math_ops.equal(num_remaining_iterations, 0),
+            lambda: tiled_outputs,
+            lambda: [array_ops.concat([x, y], axis=0)
+                     for x, y in zip(remaining_outputs, tiled_outputs)])
+      else:
+        outputs = tiled_outputs
+      return nest.pack_sequence_as(loop_fn_outputs, nest.flatten(outputs))
+
+
+
+
diff --git a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
index 171369b724a6b7c7cbb6c3cac5b49a31926af2dd..933bddd8ccaa830a394c8d69e4f1b33311315c99 100644
--- a/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
+++ b/tensorflow/python/ops/parallel_for/control_flow_ops_test.py
@@ -26,10 +26,12 @@ import numpy as np
 from tensorflow.core.example import example_pb2
 from tensorflow.core.example import feature_pb2
 from tensorflow.python.client import session
+from tensorflow.python.eager import backprop
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import bitwise_ops
 from tensorflow.python.ops import clip_ops
@@ -52,6 +54,7 @@ from tensorflow.python.platform import test
 from tensorflow.python.util import nest
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class PForTest(test.TestCase):
 
   def _run_targets(self, targets1, targets2=None, run_init=True):
@@ -73,9 +76,13 @@ class PForTest(test.TestCase):
       else:
         self.assertAllEqual(outputs[i + n], outputs[i])
 
-  def _test_loop_fn(self, loop_fn, iters, loop_fn_dtypes=dtypes.float32):
-    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters)
-    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters)
+  def _test_loop_fn(self, loop_fn, iters,
+                    loop_fn_dtypes=dtypes.float32,
+                    parallel_iterations=None):
+    t1 = pfor_control_flow_ops.pfor(loop_fn, iters=iters,
+                                    parallel_iterations=parallel_iterations)
+    t2 = pfor_control_flow_ops.for_loop(loop_fn, loop_fn_dtypes, iters=iters,
+                                        parallel_iterations=parallel_iterations)
     self.run_and_assert_equal(t1, t2)
 
   def test_op_conversion_fallback_to_while_loop(self):
@@ -96,7 +103,32 @@ class PForTest(test.TestCase):
         loop_fn, 3, loop_fn_dtypes=[dtypes.float32, dtypes.int32])
     flags.FLAGS.op_conversion_fallback_to_while_loop = False
 
+  def test_parallel_iterations(self):
+    for parallel_iterations in [2, 3, 8, 10]:
+      x = random_ops.random_uniform([8, 3])
 
+      # pylint: disable=cell-var-from-loop
+      def loop_fn(i):
+        return array_ops.gather(x, i)
+      # pylint: enable=cell-var-from-loop
+
+      self._test_loop_fn(loop_fn, 8, parallel_iterations=parallel_iterations)
+      self._test_loop_fn(loop_fn, 4 * constant_op.constant(2),
+                         parallel_iterations=parallel_iterations)
+
+  def test_parallel_iterations_zero(self):
+    with self.assertRaisesRegexp(ValueError, "positive integer"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=0)
+    with self.assertRaisesRegexp(TypeError, "positive integer"):
+      pfor_control_flow_ops.for_loop(lambda i: 1, dtypes.int32, 8,
+                                     parallel_iterations=0)
+
+  def test_parallel_iterations_one(self):
+    with self.assertRaisesRegexp(ValueError, "Use for_loop instead"):
+      pfor_control_flow_ops.pfor(lambda i: 1, 8, parallel_iterations=1)
+
+
+@test_util.run_all_in_graph_and_eager_modes
 class ArrayTest(PForTest):
 
   def test_gather(self):
@@ -288,14 +320,17 @@ class ArrayTest(PForTest):
 
   def test_unary_cwise_ops(self):
     for op in [array_ops.identity, array_ops.stop_gradient]:
-      x = random_ops.random_uniform([3, 5])
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
 
       # pylint: disable=cell-var-from-loop
       def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y = op(x1) + x1
-        loss = nn.l2_loss(y)
-        return op(x), y, gradient_ops.gradients(loss, x1)
+        with g:
+          x1 = array_ops.gather(x, i)
+          y = op(x1) + x1
+          loss = nn.l2_loss(y)
+        return op(x), y, g.gradient(loss, x1)
 
       # pylint: enable=cell-var-from-loop
 
@@ -318,17 +353,21 @@ class ArrayTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32])
 
   def test_strided_slice(self):
-    x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 4, 4, 2, 2, 2])
+      g.watch(x)
 
     def loop_fn(i):
-      x_i = array_ops.gather(x, i)
-      y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
-      loss = nn.l2_loss(y)
-      return y, gradient_ops.gradients(loss, x_i)
+      with g:
+        x_i = array_ops.gather(x, i)
+        y = x_i[:2, ::2, 1::3, ..., array_ops.newaxis, 1]
+        loss = nn.l2_loss(y)
+      return y, g.gradient(loss, x_i)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class BitwiseTest(PForTest):
 
   def test_unary_cwise(self):
@@ -368,6 +407,7 @@ class BitwiseTest(PForTest):
       self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=output_dtypes)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class MathTest(PForTest):
 
   def test_unary_cwise_ops(self):
@@ -424,22 +464,29 @@ class MathTest(PForTest):
         nn.softsign,
     ]
     for op in complex_ops + real_ops:
-      x = random_ops.random_uniform([3, 5])
-      if op in complex_ops:
-        y = random_ops.random_uniform([3, 5])
-        x = math_ops.complex(x, y)
+      with backprop.GradientTape(persistent=True) as g:
+        x = random_ops.random_uniform([3, 5])
+        g.watch(x)
+        if op in complex_ops:
+          y = random_ops.random_uniform([3, 5])
+          g.watch(y)
+          x = math_ops.complex(x, y)
 
       # pylint: disable=cell-var-from-loop
       output_dtypes = []
       def loop_fn(i):
-        x1 = array_ops.gather(x, i)
-        y1 = op(x1)
-        outputs = [op(x), y1]
-        if y1.dtype == dtypes.float32:
-          loss = math_ops.reduce_sum(y1 * y1)
-          grad = gradient_ops.gradients(loss, x1)
-          if grad and grad[0] is not None:
-            outputs.extend(grad)
+        with g:
+          x1 = array_ops.gather(x, i)
+          y1 = op(x1)
+          outputs = [op(x), y1]
+          if y1.dtype == dtypes.float32:
+            loss = math_ops.reduce_sum(y1 * y1)
+          else:
+            loss = None
+        if loss is not None:
+          grad = g.gradient(loss, x1)
+          if grad is not None:
+            outputs.append(grad)
         del output_dtypes[:]
         output_dtypes.extend([t.dtype for t in outputs])
         return outputs
@@ -656,17 +703,19 @@ class MathTest(PForTest):
     x_shape = [2, 3, 4, 5, 6]
     x = random_ops.random_uniform(x_shape)
     for data_format in ("NCHW", "NHWC"):
-      bias_dim = 2 if data_format == "NCHW" else -1
-      bias_shape = x_shape[bias_dim]
-      bias = random_ops.random_uniform([bias_shape])
+      with backprop.GradientTape(persistent=True) as g:
+        bias_dim = 2 if data_format == "NCHW" else -1
+        bias_shape = x_shape[bias_dim]
+        bias = random_ops.random_uniform([bias_shape])
+        g.watch(bias)
 
       # pylint: disable=cell-var-from-loop
       def loop_fn(i):
-        a = array_ops.gather(x, i)
-        y = nn.bias_add(a, bias, data_format=data_format)
-        loss = math_ops.reduce_sum(y * y)
-        return y, gradient_ops.gradients(loss, bias)
-
+        with g:
+          a = array_ops.gather(x, i)
+          y = nn.bias_add(a, bias, data_format=data_format)
+          loss = math_ops.reduce_sum(y * y)
+        return y, g.gradient(loss, bias)
       # pylint: enable=cell-var-from-loop
 
       self._test_loop_fn(
@@ -727,6 +776,7 @@ class MathTest(PForTest):
       self._test_loop_fn(loop_fn, 2)
 
 
+@test_util.run_all_in_graph_and_eager_modes
 class NNTest(PForTest):
 
   def test_conv2d(self):
@@ -779,30 +829,60 @@ class NNTest(PForTest):
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
   def test_avg_pool(self):
-    x = random_ops.random_uniform([3, 2, 12, 12, 3])
-    ksize = [1, 3, 3, 1]
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 3, 3, 1]
 
     def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      output = nn.avg_pool(
-          x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC")
-      loss = nn.l2_loss(output)
-      return output, gradient_ops.gradients(loss, x1)
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.avg_pool(
+            x1, ksize, strides=[1, 2, 2, 1], padding="VALID",
+            data_format="NHWC")
+        loss = nn.l2_loss(output)
+      return output, g.gradient(loss, x1)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
   def test_max_pool(self):
-    x = random_ops.random_uniform([3, 2, 12, 12, 3])
-    ksize = [1, 3, 3, 1]
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 3, 3, 1]
+      strides = [1, 2, 2, 1]
 
     def loop_fn(i):
-      x1 = array_ops.gather(x, i)
-      output = nn.max_pool(
-          x1, ksize, strides=[1, 2, 2, 1], padding="VALID", data_format="NHWC")
-      loss = nn.l2_loss(output)
-      ones = array_ops.ones_like(output)
-      grad = gradient_ops.gradients(loss, x1, grad_ys=ones)
-      grad_grad = gradient_ops.gradients(grad, ones)
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.max_pool(
+            x1, ksize, strides=strides, padding="VALID", data_format="NHWC")
+        loss = nn.l2_loss(output)
+        ones = array_ops.ones_like(output)
+        g.watch(ones)
+        grad = g.gradient(loss, x1, output_gradients=ones)
+      grad_grad = g.gradient(grad, ones)
+      return output, grad, grad_grad
+
+    self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
+
+  def test_max_pool3d(self):
+    with backprop.GradientTape(persistent=True) as g:
+      x = random_ops.random_uniform([3, 3, 2, 12, 12, 3])
+      g.watch(x)
+      ksize = [1, 1, 3, 3, 1]
+      strides = [1, 1, 2, 2, 1]
+
+    def loop_fn(i):
+      with g:
+        x1 = array_ops.gather(x, i)
+        output = nn.max_pool3d(
+            x1, ksize, strides=strides, padding="VALID", data_format="NDHWC")
+        loss = nn.l2_loss(output)
+        ones = array_ops.ones_like(output)
+        g.watch(ones)
+        grad = g.gradient(loss, x1, output_gradients=ones)
+      grad_grad = g.gradient(grad, ones)
       return output, grad, grad_grad
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 3)
@@ -813,36 +893,44 @@ class NNTest(PForTest):
       data_formats.append("NCHW")
     for is_training in (True, False):
       for data_format in data_formats:
-        if data_format == "NCHW":
-          x = random_ops.random_uniform([3, 1, 2, 5, 5])
-        else:
-          x = random_ops.random_uniform([3, 1, 5, 5, 2])
-        scale = random_ops.random_uniform([2])
-        offset = random_ops.random_uniform([2])
-        mean = None if is_training else random_ops.random_uniform([2])
-        variance = None if is_training else random_ops.random_uniform([2])
+        with backprop.GradientTape(persistent=True) as g:
+          if data_format == "NCHW":
+            x = random_ops.random_uniform([3, 1, 2, 5, 5])
+          else:
+            x = random_ops.random_uniform([3, 1, 5, 5, 2])
+          g.watch(x)
+          scale = random_ops.random_uniform([2])
+          g.watch(scale)
+          offset = random_ops.random_uniform([2])
+          g.watch(offset)
+          mean = None if is_training else random_ops.random_uniform([2])
+          variance = None if is_training else random_ops.random_uniform([2])
 
         # pylint: disable=cell-var-from-loop
         def loop_fn(i):
-          x1 = array_ops.gather(x, i)
-          outputs = nn.fused_batch_norm(
-              x1,
-              scale,
-              offset,
-              mean=mean,
-              variance=variance,
-              epsilon=0.01,
-              data_format=data_format,
-              is_training=is_training)
-          outputs = list(outputs)
-          # We only test the first value of outputs when is_training is False.
-          # It looks like CPU and GPU have different outputs for batch_mean and
-          # batch_variance for this case.
-          if not is_training:
-            outputs[1] = constant_op.constant(0.)
-            outputs[2] = constant_op.constant(0.)
-          loss = nn.l2_loss(outputs[0])
-          gradients = gradient_ops.gradients(loss, [x1, scale, offset])
+          with g:
+            x1 = array_ops.gather(x, i)
+            outputs = nn.fused_batch_norm(
+                x1,
+                scale,
+                offset,
+                mean=mean,
+                variance=variance,
+                epsilon=0.01,
+                data_format=data_format,
+                is_training=is_training)
+            outputs = list(outputs)
+            # We only test the first value of outputs when is_training is False.
+            # It looks like CPU and GPU have different outputs for batch_mean
+            # and batch_variance for this case.
+            if not is_training:
+              outputs[1] = constant_op.constant(0.)
+              outputs[2] = constant_op.constant(0.)
+            loss = nn.l2_loss(outputs[0])
+          if is_training:
+            gradients = g.gradient(loss, [x1, scale, offset])
+          else:
+            gradients = [constant_op.constant(0.)] * 3
           return outputs + gradients
 
         # pylint: enable=cell-var-from-loop
@@ -850,16 +938,20 @@ class NNTest(PForTest):
         self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 6)
 
   def test_softmax_cross_entropy_with_logits(self):
-    logits = random_ops.random_uniform([3, 2, 4])
-    labels = random_ops.random_uniform([3, 2, 4])
-    labels /= math_ops.reduce_sum(labels, axis=[2], keepdims=True)
+    with backprop.GradientTape(persistent=True) as g:
+      logits = random_ops.random_uniform([3, 2, 4])
+      g.watch(logits)
+      labels = random_ops.random_uniform([3, 2, 4])
+      labels /= math_ops.reduce_sum(labels, axis=[2], keepdims=True)
 
     def loop_fn(i):
-      logits_i = array_ops.gather(logits, i)
-      labels_i = array_ops.gather(labels, i)
-      loss = nn.softmax_cross_entropy_with_logits(
-          labels=labels_i, logits=logits_i)
-      return loss, gradient_ops.gradients(math_ops.reduce_sum(loss), logits_i)
+      with g:
+        logits_i = array_ops.gather(logits, i)
+        labels_i = array_ops.gather(labels, i)
+        loss = nn.softmax_cross_entropy_with_logits(
+            labels=labels_i, logits=logits_i)
+        total_loss = math_ops.reduce_sum(loss)
+      return loss, g.gradient(total_loss, logits_i)
 
     self._test_loop_fn(loop_fn, 3, loop_fn_dtypes=[dtypes.float32] * 2)
 
@@ -1278,13 +1370,12 @@ class ControlFlowTest(PForTest):
     pfor_out, pfor_out_grad = pfor_control_flow_ops.pfor(loop_fn, 4)
     # Note that tf.while_loop does not work in the setup above. So we manually
     # construct the equivalent computation of the above loops here.
-    real_out = math_ops.reduce_sum(inp, reduction_indices=[0])
-    real_out = math_ops.reduce_prod(real_out, reduction_indices=[1])
+    real_out = math_ops.reduce_sum(inp, axis=[0])
+    real_out = math_ops.reduce_prod(real_out, axis=[1])
     # Note that gradients of real_out will accumulate the gradients across the
     # output value. Hence we do the same aggregation on pfor_out_grad.
     real_out_grad = gradient_ops.gradients(real_out, inp)[0]
-    sum_pfor_out_grad = math_ops.reduce_sum(
-        pfor_out_grad, reduction_indices=[0])
+    sum_pfor_out_grad = math_ops.reduce_sum(pfor_out_grad, axis=[0])
 
     with session.Session() as sess:
       v1, v2, v1_grad, v2_grad = sess.run(
diff --git a/tensorflow/python/ops/parallel_for/gradients.py b/tensorflow/python/ops/parallel_for/gradients.py
index 1f026b3660c39066b3a8cf741b0fbd1929b22665..3ba1bde347698acf3b1229808fe63cef2e3255af 100644
--- a/tensorflow/python/ops/parallel_for/gradients.py
+++ b/tensorflow/python/ops/parallel_for/gradients.py
@@ -25,7 +25,7 @@ from tensorflow.python.ops.parallel_for import control_flow_ops
 from tensorflow.python.util import nest
 
 
-def jacobian(output, inputs, use_pfor=True):
+def jacobian(output, inputs, use_pfor=True, parallel_iterations=None):
   """Computes jacobian of `output` w.r.t. `inputs`.
 
   Args:
@@ -33,6 +33,8 @@ def jacobian(output, inputs, use_pfor=True):
     inputs: A tensor or a nested structure of tensor objects.
     use_pfor: If true, uses pfor for computing the jacobian. Else uses
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor or a nested strucutre of tensors with the same structure as
@@ -56,10 +58,14 @@ def jacobian(output, inputs, use_pfor=True):
     output_size = array_ops.shape(output)[0]
 
   if use_pfor:
-    pfor_outputs = control_flow_ops.pfor(loop_fn, output_size)
+    pfor_outputs = control_flow_ops.pfor(
+        loop_fn, output_size, parallel_iterations=parallel_iterations)
   else:
     pfor_outputs = control_flow_ops.for_loop(
-        loop_fn, [output.dtype] * len(flat_inputs), output_size)
+        loop_fn,
+        [output.dtype] * len(flat_inputs),
+        output_size,
+        parallel_iterations=parallel_iterations)
 
   for i, out in enumerate(pfor_outputs):
     if out is not None:
@@ -72,7 +78,7 @@ def jacobian(output, inputs, use_pfor=True):
   return nest.pack_sequence_as(inputs, pfor_outputs)
 
 
-def batch_jacobian(output, inp, use_pfor=True):
+def batch_jacobian(output, inp, use_pfor=True, parallel_iterations=None):
   """Computes and stacks jacobians of `output[i,...]` w.r.t. `input[i,...]`.
 
   e.g.
@@ -87,6 +93,8 @@ def batch_jacobian(output, inp, use_pfor=True):
     inp: A tensor with shape [b, x1, ..., x_m]
     use_pfor: If true, uses pfor for computing the Jacobian. Else uses a
       tf.while_loop.
+    parallel_iterations: A knob to control how many iterations and dispatched in
+      parallel. This knob can be used to control the total memory usage.
 
   Returns:
     A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
@@ -118,10 +126,13 @@ def batch_jacobian(output, inp, use_pfor=True):
     return gradient_ops.gradients(y, inp)[0]
 
   if use_pfor:
-    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size)
+    pfor_output = control_flow_ops.pfor(loop_fn, output_row_size,
+                                        parallel_iterations=parallel_iterations)
   else:
-    pfor_output = control_flow_ops.for_loop(loop_fn, output.dtype,
-                                            output_row_size)
+    pfor_output = control_flow_ops.for_loop(
+        loop_fn, output.dtype,
+        output_row_size,
+        parallel_iterations=parallel_iterations)
   if pfor_output is None:
     return None
   pfor_output = array_ops.reshape(pfor_output,
diff --git a/tensorflow/python/ops/parallel_for/gradients_test.py b/tensorflow/python/ops/parallel_for/gradients_test.py
index 5a058bae82554eac98ec69ea4b9e809a0c06b223..4342833e3eb362e81ff9f60b4649cc5b8de6250f 100644
--- a/tensorflow/python/ops/parallel_for/gradients_test.py
+++ b/tensorflow/python/ops/parallel_for/gradients_test.py
@@ -416,6 +416,12 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    y = math_ops.matmul(x, x)
+    self.assertAllClose(gradients.jacobian(y, x, parallel_iterations=2),
+                        gradients.jacobian(y, x, parallel_iterations=3))
+
   def test_batch_jacobian_bad_shapes(self):
     x = random_ops.random_uniform([2, 2])
     y = random_ops.random_uniform([3, 2])
@@ -459,6 +465,13 @@ class GradientsTest(test.TestCase):
       self.assertAllClose(ans, pfor_value)
       self.assertAllClose(ans, while_value)
 
+  def test_batch_jacobian_parallel_iterations(self):
+    x = constant_op.constant([[1., 2], [3, 4]])
+    w = constant_op.constant([[1., 2, 3, 4], [5, 6, 7, 8]])
+    y = math_ops.matmul(x, w)
+    self.assertAllClose(gradients.batch_jacobian(y, x, parallel_iterations=2),
+                        gradients.batch_jacobian(y, x, parallel_iterations=3))
+
   def test_fc_batch_jacobian(self):
     pfor_jacobian, while_jacobian = create_fc_batch_jacobian(8, 4, 2)
     self.run_and_assert_equal(pfor_jacobian, while_jacobian)
@@ -471,8 +484,8 @@ class GradientsTest(test.TestCase):
     pfor_jacobian, while_gradients = create_dynamic_lstm_batch_jacobian(8, 4, 3)
     with session.Session() as sess:
       init = variables.global_variables_initializer()
-      sess.run(init)
-      pfor = sess.run(pfor_jacobian)
+      self.evaluate(init)
+      pfor = self.evaluate(pfor_jacobian)
       for i in range(4):
         while_i = sess.run(while_gradients[i])
         self.assertAllClose(while_i, pfor[:, i, ...])
@@ -547,11 +560,11 @@ class GradientsBenchmarks(test.Benchmark):
     sess = session.Session()
     with sess:
       init = variables.global_variables_initializer()
-      sess.run(init)
-      sess.run(targets)
+      self.evaluate(init)
+      self.evaluate(targets)
       begin = time.time()
       for _ in range(iters):
-        sess.run(targets)
+        self.evaluate(targets)
       end = time.time()
     avg_time_ms = 1000 * (end - begin) / iters
     self.report_benchmark(iters=iters, wall_time=avg_time_ms, name=name)
diff --git a/tensorflow/python/ops/parallel_for/pfor.py b/tensorflow/python/ops/parallel_for/pfor.py
index e6f140a9410bcf00d35f8a611c49583c56e70188..a22c1126c93915da7acc5221594567f855557b84 100644
--- a/tensorflow/python/ops/parallel_for/pfor.py
+++ b/tensorflow/python/ops/parallel_for/pfor.py
@@ -1152,9 +1152,8 @@ class PFor(object):
           continue
 
         converted_inputs = [self._conversion_map[inp] for inp in y_op.inputs]
-        some_input_converted = any(
-            [self._was_converted(x) for x in y_op.inputs])
-        some_input_stacked = any([x.is_stacked for x in converted_inputs])
+        some_input_converted = any(self._was_converted(x) for x in y_op.inputs)
+        some_input_stacked = any(x.is_stacked for x in converted_inputs)
 
         converted_control_ops = set()
         some_control_input_converted = False
@@ -1198,7 +1197,7 @@ class PFor(object):
           # All inputs are unstacked or uncoverted but some control inputs are
           # converted.
           # TODO(rachelim): Handle the case where some inputs are sparsely
-          # stacked (i.e. any([x.is_sparse_stacked for x in converted_inputs]))
+          # stacked (i.e. any(x.is_sparse_stacked for x in converted_inputs))
           new_op = _create_op(y_op.type, [x.t for x in converted_inputs],
                               [x.dtype for x in y_op.outputs],
                               y_op.node_def.attr)
@@ -1303,7 +1302,10 @@ def _inputs_with_flattening(pfor_input, input_indices):
 @RegisterPForWithArgs("Conv2D", dims=[0])
 @RegisterPForWithArgs("AvgPool", dims=[0])
 @RegisterPForWithArgs("MaxPool", dims=[0])
+@RegisterPForWithArgs("MaxPool3D", dims=[0])
+@RegisterPForWithArgs("MaxPool3DGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("MaxPoolGrad", dims=[0, 1, 2])
+@RegisterPForWithArgs("MaxPool3DGradGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("MaxPoolGradGrad", dims=[0, 1, 2])
 @RegisterPForWithArgs("SoftmaxCrossEntropyWithLogits", dims=[0, 1])
 def _convert_flatten_batch(pfor_input, op_type, dims):
diff --git a/tensorflow/python/ops/parsing_ops.py b/tensorflow/python/ops/parsing_ops.py
index 484caf017968103c0949dce2205cf60fbc494439..a84af6c5cf27f2e021b3950f4a60a87cb5324942 100644
--- a/tensorflow/python/ops/parsing_ops.py
+++ b/tensorflow/python/ops/parsing_ops.py
@@ -363,7 +363,7 @@ def _prepend_none_dimension(features):
     return features
 
 
-@tf_export("io.parse_example", v1=["io.parse_example", "parse_example"])
+@tf_export(v1=["io.parse_example", "parse_example"])
 def parse_example(serialized, features, name=None, example_names=None):
   # pylint: disable=line-too-long
   """Parses `Example` protos into a `dict` of tensors.
@@ -574,6 +574,223 @@ def parse_example(serialized, features, name=None, example_names=None):
   Returns:
     A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
 
+  Raises:
+    ValueError: if any feature is invalid.
+  """
+  return parse_example_v2(serialized, features, example_names, name)
+
+
+@tf_export("io.parse_example", v1=[])
+def parse_example_v2(serialized, features, example_names=None, name=None):
+  # pylint: disable=line-too-long
+  """Parses `Example` protos into a `dict` of tensors.
+
+  Parses a number of serialized [`Example`](https://www.tensorflow.org/code/tensorflow/core/example/example.proto)
+  protos given in `serialized`. We refer to `serialized` as a batch with
+  `batch_size` many entries of individual `Example` protos.
+
+  `example_names` may contain descriptive names for the corresponding serialized
+  protos. These may be useful for debugging purposes, but they have no effect on
+  the output. If not `None`, `example_names` must be the same length as
+  `serialized`.
+
+  This op parses serialized examples into a dictionary mapping keys to `Tensor`
+  and `SparseTensor` objects. `features` is a dict from keys to `VarLenFeature`,
+  `SparseFeature`, and `FixedLenFeature` objects. Each `VarLenFeature`
+  and `SparseFeature` is mapped to a `SparseTensor`, and each
+  `FixedLenFeature` is mapped to a `Tensor`.
+
+  Each `VarLenFeature` maps to a `SparseTensor` of the specified type
+  representing a ragged matrix. Its indices are `[batch, index]` where `batch`
+  identifies the example in `serialized`, and `index` is the value's index in
+  the list of values associated with that feature and example.
+
+  Each `SparseFeature` maps to a `SparseTensor` of the specified type
+  representing a Tensor of `dense_shape` `[batch_size] + SparseFeature.size`.
+  Its `values` come from the feature in the examples with key `value_key`.
+  A `values[i]` comes from a position `k` in the feature of an example at batch
+  entry `batch`. This positional information is recorded in `indices[i]` as
+  `[batch, index_0, index_1, ...]` where `index_j` is the `k-th` value of
+  the feature in the example at with key `SparseFeature.index_key[j]`.
+  In other words, we split the indices (except the first index indicating the
+  batch entry) of a `SparseTensor` by dimension into different features of the
+  `Example`. Due to its complexity a `VarLenFeature` should be preferred over a
+  `SparseFeature` whenever possible.
+
+  Each `FixedLenFeature` `df` maps to a `Tensor` of the specified type (or
+  `tf.float32` if not specified) and shape `(serialized.size(),) + df.shape`.
+
+  `FixedLenFeature` entries with a `default_value` are optional. With no default
+  value, we will fail if that `Feature` is missing from any example in
+  `serialized`.
+
+  Each `FixedLenSequenceFeature` `df` maps to a `Tensor` of the specified type
+  (or `tf.float32` if not specified) and shape
+  `(serialized.size(), None) + df.shape`.
+  All examples in `serialized` will be padded with `default_value` along the
+  second dimension.
+
+  Examples:
+
+  For example, if one expects a `tf.float32` `VarLenFeature` `ft` and three
+  serialized `Example`s are provided:
+
+  ```
+  serialized = [
+    features
+      { feature { key: "ft" value { float_list { value: [1.0, 2.0] } } } },
+    features
+      { feature []},
+    features
+      { feature { key: "ft" value { float_list { value: [3.0] } } }
+  ]
+  ```
+
+  then the output will look like:
+
+  ```python
+  {"ft": SparseTensor(indices=[[0, 0], [0, 1], [2, 0]],
+                      values=[1.0, 2.0, 3.0],
+                      dense_shape=(3, 2)) }
+  ```
+
+  If instead a `FixedLenSequenceFeature` with `default_value = -1.0` and
+  `shape=[]` is used then the output will look like:
+
+  ```python
+  {"ft": [[1.0, 2.0], [3.0, -1.0]]}
+  ```
+
+  Given two `Example` input protos in `serialized`:
+
+  ```
+  [
+    features {
+      feature { key: "kw" value { bytes_list { value: [ "knit", "big" ] } } }
+      feature { key: "gps" value { float_list { value: [] } } }
+    },
+    features {
+      feature { key: "kw" value { bytes_list { value: [ "emmy" ] } } }
+      feature { key: "dank" value { int64_list { value: [ 42 ] } } }
+      feature { key: "gps" value { } }
+    }
+  ]
+  ```
+
+  And arguments
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "kw": VarLenFeature(tf.string),
+      "dank": VarLenFeature(tf.int64),
+      "gps": VarLenFeature(tf.float32),
+  }
+  ```
+
+  Then the output is a dictionary:
+
+  ```python
+  {
+    "kw": SparseTensor(
+        indices=[[0, 0], [0, 1], [1, 0]],
+        values=["knit", "big", "emmy"]
+        dense_shape=[2, 2]),
+    "dank": SparseTensor(
+        indices=[[1, 0]],
+        values=[42],
+        dense_shape=[2, 1]),
+    "gps": SparseTensor(
+        indices=[],
+        values=[],
+        dense_shape=[2, 0]),
+  }
+  ```
+
+  For dense results in two serialized `Example`s:
+
+  ```
+  [
+    features {
+      feature { key: "age" value { int64_list { value: [ 0 ] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+     },
+     features {
+      feature { key: "age" value { int64_list { value: [] } } }
+      feature { key: "gender" value { bytes_list { value: [ "f" ] } } }
+    }
+  ]
+  ```
+
+  We can use arguments:
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "age": FixedLenFeature([], dtype=tf.int64, default_value=-1),
+      "gender": FixedLenFeature([], dtype=tf.string),
+  }
+  ```
+
+  And the expected output is:
+
+  ```python
+  {
+    "age": [[0], [-1]],
+    "gender": [["f"], ["f"]],
+  }
+  ```
+
+  An alternative to `VarLenFeature` to obtain a `SparseTensor` is
+  `SparseFeature`. For example, given two `Example` input protos in
+  `serialized`:
+
+  ```
+  [
+    features {
+      feature { key: "val" value { float_list { value: [ 0.5, -1.0 ] } } }
+      feature { key: "ix" value { int64_list { value: [ 3, 20 ] } } }
+    },
+    features {
+      feature { key: "val" value { float_list { value: [ 0.0 ] } } }
+      feature { key: "ix" value { int64_list { value: [ 42 ] } } }
+    }
+  ]
+  ```
+
+  And arguments
+
+  ```
+  example_names: ["input0", "input1"],
+  features: {
+      "sparse": SparseFeature(
+          index_key="ix", value_key="val", dtype=tf.float32, size=100),
+  }
+  ```
+
+  Then the output is a dictionary:
+
+  ```python
+  {
+    "sparse": SparseTensor(
+        indices=[[0, 3], [0, 20], [1, 42]],
+        values=[0.5, -1.0, 0.0]
+        dense_shape=[2, 100]),
+  }
+  ```
+
+  Args:
+    serialized: A vector (1-D Tensor) of strings, a batch of binary
+      serialized `Example` protos.
+    features: A `dict` mapping feature keys to `FixedLenFeature`,
+      `VarLenFeature`, and `SparseFeature` values.
+    example_names: A vector (1-D Tensor) of strings (optional), the names of
+      the serialized protos in the batch.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
   Raises:
     ValueError: if any feature is invalid.
   """
@@ -764,8 +981,7 @@ def _process_raw_parameters(names, dense_defaults, sparse_keys, sparse_types,
           dense_shapes_as_proto, dense_shapes)
 
 
-@tf_export("io.parse_single_example",
-           v1=["io.parse_single_example", "parse_single_example"])
+@tf_export(v1=["io.parse_single_example", "parse_single_example"])
 def parse_single_example(serialized, features, name=None, example_names=None):
   """Parses a single `Example` proto.
 
@@ -795,6 +1011,48 @@ def parse_single_example(serialized, features, name=None, example_names=None):
   Returns:
     A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
 
+  Raises:
+    ValueError: if any feature is invalid.
+  """
+  return parse_single_example_v2_unoptimized(
+      serialized, features, example_names, name
+      )
+
+
+# TODO(b/70890287): Combine the implementation of this op and
+# `parse_single_example_v2()` after 1/10/2018.
+@tf_export("io.parse_single_example", v1=[])
+def parse_single_example_v2_unoptimized(
+    serialized, features, example_names=None, name=None
+    ):
+  """Parses a single `Example` proto.
+
+  Similar to `parse_example`, except:
+
+  For dense tensors, the returned `Tensor` is identical to the output of
+  `parse_example`, except there is no batch dimension, the output shape is the
+  same as the shape given in `dense_shape`.
+
+  For `SparseTensor`s, the first (batch) column of the indices matrix is removed
+  (the indices matrix is a column vector), the values vector is unchanged, and
+  the first (`batch_size`) entry of the shape vector is removed (it is now a
+  single element vector).
+
+  One might see performance advantages by batching `Example` protos with
+  `parse_example` instead of using this function directly.
+
+  Args:
+    serialized: A scalar string Tensor, a single serialized Example.
+      See `_parse_single_example_raw` documentation for more details.
+    features: A `dict` mapping feature keys to `FixedLenFeature` or
+      `VarLenFeature` values.
+    example_names: (Optional) A scalar string Tensor, the associated name.
+      See `_parse_single_example_raw` documentation for more details.
+    name: A name for this operation (optional).
+
+  Returns:
+    A `dict` mapping feature keys to `Tensor` and `SparseTensor` values.
+
   Raises:
     ValueError: if any feature is invalid.
   """
@@ -1570,7 +1828,7 @@ def _parse_single_sequence_example_raw(serialized,
 
 
 # Swap `name` and `na_value` for backward compatibility.
-@tf_export("io.decode_csv", v1=["io.decode_csv", "decode_csv"])
+@tf_export(v1=["io.decode_csv", "decode_csv"])
 @deprecation.deprecated_endpoints("decode_csv")
 def decode_csv(records,
                record_defaults,
@@ -1609,6 +1867,54 @@ def decode_csv(records,
     A list of `Tensor` objects. Has the same type as `record_defaults`.
     Each tensor will have the same shape as records.
 
+  Raises:
+    ValueError: If any of the arguments is malformed.
+  """
+  return decode_csv_v2(
+      records, record_defaults,
+      field_delim, use_quote_delim,
+      na_value, select_cols, name
+      )
+
+
+@tf_export("io.decode_csv", v1=[])
+def decode_csv_v2(records,
+                  record_defaults,
+                  field_delim=",",
+                  use_quote_delim=True,
+                  na_value="",
+                  select_cols=None,
+                  name=None):
+  """Convert CSV records to tensors. Each column maps to one tensor.
+
+  RFC 4180 format is expected for the CSV records.
+  (https://tools.ietf.org/html/rfc4180)
+  Note that we allow leading and trailing spaces with int or float field.
+
+  Args:
+    records: A `Tensor` of type `string`.
+      Each string is a record/row in the csv and all records should have
+      the same format.
+    record_defaults: A list of `Tensor` objects with specific types.
+      Acceptable types are `float32`, `float64`, `int32`, `int64`, `string`.
+      One tensor per column of the input record, with either a
+      scalar default value for that column or an empty vector if the column is
+      required.
+    field_delim: An optional `string`. Defaults to `","`.
+      char delimiter to separate fields in a record.
+    use_quote_delim: An optional `bool`. Defaults to `True`.
+      If false, treats double quotation marks as regular
+      characters inside of the string fields (ignoring RFC 4180, Section 2,
+      Bullet 5).
+    na_value: Additional string to recognize as NA/NaN.
+    select_cols: Optional sorted list of column indices to select. If specified,
+      only this subset of columns will be parsed and returned.
+    name: A name for the operation (optional).
+
+  Returns:
+    A list of `Tensor` objects. Has the same type as `record_defaults`.
+    Each tensor will have the same shape as records.
+
   Raises:
     ValueError: If any of the arguments is malformed.
   """
diff --git a/tensorflow/python/ops/partitioned_variables.py b/tensorflow/python/ops/partitioned_variables.py
index 7743b634e8fa418572f334130f2072dcfe8d029c..c1084c25592045734ae016c9d5a84b5264a38032 100644
--- a/tensorflow/python/ops/partitioned_variables.py
+++ b/tensorflow/python/ops/partitioned_variables.py
@@ -57,7 +57,7 @@ import math
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import variable_scope
-from tensorflow.python.platform import tf_logging as logging
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 __all__ = [
@@ -68,7 +68,7 @@ __all__ = [
 ]
 
 
-@tf_export("variable_axis_size_partitioner")
+@tf_export(v1=["variable_axis_size_partitioner"])
 def variable_axis_size_partitioner(
     max_shard_bytes, axis=0, bytes_per_string_element=16, max_shards=None):
   """Get a partitioner for VariableScope to keep shards below `max_shard_bytes`.
@@ -96,7 +96,7 @@ def variable_axis_size_partitioner(
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
 
   Raises:
     ValueError: If any of the byte counts are non-positive.
@@ -154,7 +154,7 @@ def variable_axis_size_partitioner(
   return _partitioner
 
 
-@tf_export("min_max_variable_partitioner")
+@tf_export(v1=["min_max_variable_partitioner"])
 def min_max_variable_partitioner(max_partitions=1, axis=0,
                                  min_slice_size=256 << 10,
                                  bytes_per_string_element=16):
@@ -175,7 +175,7 @@ def min_max_variable_partitioner(max_partitions=1, axis=0,
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
 
   """
   def _partitioner(shape, dtype):
@@ -218,7 +218,7 @@ def min_max_variable_partitioner(max_partitions=1, axis=0,
   return _partitioner
 
 
-@tf_export("fixed_size_partitioner")
+@tf_export(v1=["fixed_size_partitioner"])
 def fixed_size_partitioner(num_shards, axis=0):
   """Partitioner to specify a fixed number of shards along given axis.
 
@@ -228,7 +228,7 @@ def fixed_size_partitioner(num_shards, axis=0):
 
   Returns:
     A partition function usable as the `partitioner` argument to
-    `variable_scope`, `get_variable`, and `get_partitioned_variable_list`.
+    `variable_scope` and `get_variable`.
   """
   def _partitioner(shape, **unused_args):
     partitions_list = [1] * len(shape)
@@ -237,7 +237,10 @@ def fixed_size_partitioner(num_shards, axis=0):
   return _partitioner
 
 
-@tf_export("create_partitioned_variables")
+@tf_export(v1=["create_partitioned_variables"])
+@deprecation.deprecated(
+    date=None,
+    instructions="Use tf.get_variable with a partitioner set.")
 def create_partitioned_variables(
     shape, slicing, initializer, dtype=dtypes.float32,
     trainable=True, collections=None, name=None, reuse=None):
@@ -282,11 +285,6 @@ def create_partitioned_variables(
   Raises:
     ValueError: If any of the arguments is malformed.
   """
-  logging.warn(
-      "create_partitioned_variables is deprecated.  Use "
-      "tf.get_variable with a partitioner set, or "
-      "tf.get_partitioned_variable_list, instead.")
-
   if len(shape) != len(slicing):
     raise ValueError("The 'shape' and 'slicing' of a partitioned Variable "
                      "must have the length: shape: %s, slicing: %s" %
diff --git a/tensorflow/python/ops/quantized_conv_ops_test.py b/tensorflow/python/ops/quantized_conv_ops_test.py
index f7fa264461ee7a1a80d8e7c0cf7d71c4d23225bf..6b469a954f6531641f4bc61396563581b7c368fe 100644
--- a/tensorflow/python/ops/quantized_conv_ops_test.py
+++ b/tensorflow/python/ops/quantized_conv_ops_test.py
@@ -73,7 +73,7 @@ class Conv2DTest(test.TestCase):
           max_input=x1_max,
           min_filter=x2_min,
           max_filter=x2_max)
-      value = sess.run(conv)
+      value = self.evaluate(conv)
     quantized_output = value[0]
     output_min = value[1]
     output_max = value[2]
diff --git a/tensorflow/python/ops/quantized_ops_test.py b/tensorflow/python/ops/quantized_ops_test.py
index 0f3b04e4ad07749e53042216d5abbff8551dc04b..b81843d17482bdff910827125ed8affd4094b942 100644
--- a/tensorflow/python/ops/quantized_ops_test.py
+++ b/tensorflow/python/ops/quantized_ops_test.py
@@ -41,7 +41,7 @@ class QuantizedOpsTest(test.TestCase):
       x_min = 0.0
       x_max = 255.0
       op = array_ops.quantize(x, x_min, x_max, dtypes.quint8, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value.output, 0.1)
 
   def testDequantizeOp(self):
@@ -52,7 +52,7 @@ class QuantizedOpsTest(test.TestCase):
       x_min = 0.0
       x_max = 255.0
       op = array_ops.dequantize(x, x_min, x_max, mode="MIN_FIRST")
-      value = sess.run(op)
+      value = self.evaluate(op)
       self.assertArrayNear(expected_output, value, 0.1)
 
 
diff --git a/tensorflow/python/ops/ragged/BUILD b/tensorflow/python/ops/ragged/BUILD
index 152c6dc84169524853f4a3eb7211c299c5ff6065..d88543c400f2432ea620ccddcab983337abe3fc2 100644
--- a/tensorflow/python/ops/ragged/BUILD
+++ b/tensorflow/python/ops/ragged/BUILD
@@ -25,14 +25,16 @@ py_library(
     deps = [
         ":ragged_array_ops",
         ":ragged_conversion_ops",
-        ":ragged_elementwise_ops",
+        ":ragged_dispatch",
         ":ragged_factory_ops",
         ":ragged_functional_ops",
         ":ragged_getitem",
         ":ragged_map_ops",
         ":ragged_math_ops",
         ":ragged_operators",
+        ":ragged_string_ops",
         ":ragged_tensor",
+        ":ragged_tensor_shape",
         ":ragged_tensor_value",
         ":ragged_util",
         ":segment_id_ops",
@@ -149,31 +151,29 @@ py_library(
 )
 
 py_library(
-    name = "ragged_elementwise_ops",
-    srcs = ["ragged_elementwise_ops.py"],
+    name = "ragged_operators",
+    srcs = ["ragged_operators.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_factory_ops",
+        ":ragged_getitem",
         ":ragged_tensor",
-        ":ragged_util",
-        "//tensorflow/python:array_ops",
-        "//tensorflow/python:clip_ops",
-        "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:parsing_ops",
-        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
     ],
 )
 
 py_library(
-    name = "ragged_operators",
-    srcs = ["ragged_operators.py"],
+    name = "ragged_string_ops",
+    srcs = ["ragged_string_ops.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_elementwise_ops",
-        ":ragged_getitem",
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
         ":ragged_tensor",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:string_ops",
         "//tensorflow/python:util",
     ],
 )
@@ -184,12 +184,34 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged_tensor_value",
+        ":ragged_util",
+        ":segment_id_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
     ],
 )
 
+py_library(
+    name = "ragged_tensor_shape",
+    srcs = ["ragged_tensor_shape.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_conversion_ops",
+        ":ragged_factory_ops",
+        ":ragged_tensor",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:constant_op",
+        "//tensorflow/python:control_flow_ops",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:tensor_util",
+    ],
+)
+
 py_library(
     name = "ragged_tensor_value",
     srcs = ["ragged_tensor_value.py"],
@@ -207,6 +229,7 @@ py_library(
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
+        "//tensorflow/python:ragged_math_ops_gen",
     ],
 )
 
@@ -247,17 +270,57 @@ py_library(
     ],
 )
 
+py_library(
+    name = "ragged_dispatch",
+    srcs = ["ragged_dispatch.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_array_ops",
+        ":ragged_factory_ops",
+        ":ragged_math_ops",
+        ":ragged_tensor",
+        ":ragged_tensor_shape",
+        ":ragged_util",
+        "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
+        "//tensorflow/python:sparse_tensor",
+        "//tensorflow/python:string_ops",
+        "//tensorflow/python:util",
+        "//third_party/py/numpy",
+    ],
+)
+
 #-------------------------------------------------------------------------------
 # RaggedTensor Tests
 #-------------------------------------------------------------------------------
 
+py_library(
+    name = "ragged_test_util",
+    srcs = ["ragged_test_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged_tensor",
+        ":ragged_tensor_value",
+        "//tensorflow/python:framework_ops",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+    ],
+)
+
 py_test(
     name = "ragged_tensor_test",
     size = "medium",
     srcs = ["ragged_tensor_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -276,6 +339,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -289,6 +353,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -301,6 +366,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
@@ -312,6 +378,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -324,6 +391,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -340,6 +408,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -355,6 +424,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -370,6 +440,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -382,6 +453,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -394,6 +466,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
@@ -407,13 +480,18 @@ py_test(
     name = "ragged_to_sparse_op_test",
     srcs = ["ragged_to_sparse_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:gradients_impl",
+        "//tensorflow/python:math_ops",
         "//tensorflow/python:platform_test",
     ],
 )
@@ -424,6 +502,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -439,6 +518,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
@@ -453,6 +533,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:errors",
@@ -468,6 +549,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -479,11 +561,12 @@ py_test(
 )
 
 py_test(
-    name = "ragged_map_inner_values_op_test",
-    srcs = ["ragged_map_inner_values_op_test.py"],
+    name = "ragged_map_flat_values_op_test",
+    srcs = ["ragged_map_flat_values_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -500,8 +583,8 @@ py_test(
     srcs = ["ragged_const_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_factory_ops",
-        ":ragged_tensor",
+        ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -513,8 +596,12 @@ py_test(
     name = "ragged_constant_value_op_test",
     srcs = ["ragged_constant_value_op_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "no_windows",
+    ],
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "//third_party/py/numpy",
@@ -528,6 +615,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -543,6 +631,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
@@ -558,6 +647,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -574,6 +664,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
@@ -586,8 +677,8 @@ py_test(
     srcs = ["ragged_tile_op_test.py"],
     srcs_version = "PY2AND3",
     deps = [
-        ":ragged_array_ops",
-        ":ragged_factory_ops",
+        ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:dtypes",
@@ -602,6 +693,7 @@ py_test(
     srcs = ["ragged_util_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":ragged_test_util",
         ":ragged_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
@@ -618,6 +710,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -630,6 +723,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
         "@absl_py//absl/testing:parameterized",
@@ -637,17 +731,22 @@ py_test(
 )
 
 py_test(
-    name = "ragged_elementwise_ops_test",
-    srcs = ["ragged_elementwise_ops_test.py"],
+    name = "ragged_dispatch_test",
+    srcs = ["ragged_dispatch_test.py"],
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:clip_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:math_ops",
+        "//tensorflow/python:parsing_ops",
         "//tensorflow/python:platform_test",
+        "//tensorflow/python:string_ops",
         "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
@@ -659,6 +758,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:platform_test",
     ],
@@ -671,6 +771,7 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":ragged",
+        ":ragged_test_util",
         "//tensorflow/python:array_ops",
         "//tensorflow/python:dtypes",
         "//tensorflow/python:framework_test_lib",
@@ -678,6 +779,22 @@ py_test(
         "//tensorflow/python:platform_test",
         "//tensorflow/python:string_ops",
         "//tensorflow/python/keras:backend",
+        "//third_party/py/numpy",
+        "@absl_py//absl/testing:parameterized",
+    ],
+)
+
+py_test(
+    name = "ragged_tensor_shape_test",
+    srcs = ["ragged_tensor_shape_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ragged",
+        ":ragged_test_util",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework_test_lib",
+        "//tensorflow/python:platform_test",
+        "//third_party/py/numpy",
         "@absl_py//absl/testing:parameterized",
     ],
 )
diff --git a/tensorflow/python/ops/ragged/__init__.py b/tensorflow/python/ops/ragged/__init__.py
index 3a2884854546493a2ba1bd0f7be4986c27fd3482..3d915ee269b45571c9338ea1d734ddaa4b884a98 100644
--- a/tensorflow/python/ops/ragged/__init__.py
+++ b/tensorflow/python/ops/ragged/__init__.py
@@ -1,78 +1,14 @@
 """Ragged Tensors.
 
-This package defines the [`RaggedTensor`](ragged/RaggedTensor.md) class, which
+This package defines the `tf.RaggedTensor` class, which
 represents tensors with non-uniform shapes.  In particular, each `RaggedTensor`
 has one or more *ragged dimensions*, which are dimensions whose slices may have
 different lengths.  For example, the inner (column) dimension of
 `rt=[[3, 1, 4, 1], [], [5, 9, 2], [6], []]` is ragged, since the column slices
 (`rt[0, :]`, ..., `rt[4, :]`) have different lengths.  For a more detailed
-description of ragged tensors, see the [`RaggedTensor`](ragged/RaggedTensor.md)
+description of ragged tensors, see the `tf.RaggedTensor`
 class documentation.
 
-## RaggedTensor Operations
-
-This package also defines a collection of operations for manipulating
-ragged tensors.
-
-### RaggedTensor Versions of Standard Tensor Operations
-
-Many of the operations defined by this package are analogous to
-[`Tensor`](https://www.tensorflow.org/api_docs/python/tf/Tensor)
-operations, but they accept `RaggedTensor`s as input and can return
-`RaggedTensor`s as output.  For example, `ragged.add` performs elementwise
-addition just like `tf.add`, but can be used on `RaggedTensor`s.
-
-These `RaggedTensor` versions of the standard `Tensor` operations can also be
-used with standard `Tensors`; and for the most part, they will return the same
-value that the standard `Tensor` operation would return.  However, there are
-a few notable exceptions:
-
-* For [`ragged.stack(...)`](ragged/stack.md) and
-  [`ragged.concat(...)`](ragged/concat.md), the input tensors are not required
-  to have matching shapes.  In the returned tensor, all dimensions up to the
-  `axis` dimension will be ragged.
-
-### Ragged-Tensor Specific Operations
-
-The following operations are specific to ragged tensors:
-
-* **Factory ops**:
-  [`constant(...)`](ragged/constant.md),
-  [`from_row_splits(...)`](ragged/from_row_splits.md),
-  [`from_row_lengths(...)`](ragged/from_row_lengths.md),
-  [`from_row_starts(...)`](ragged/from_row_starts.md),
-  [`from_row_limits(...)`](ragged/from_row_limits.md),
-  [`from_value_rowids(...)`](ragged/from_value_rowids.md),
-  [`from_nested_row_splits(...)`](ragged/from_nested_row_splits.md),
-  [`from_nested_value_rowids(...)`](ragged/from_nested_value_rowids.md).
-
-* **Conversion ops**:
-  [`from_tensor(...)`](ragged/from_tensor.md),
-  [`to_tensor(...)`](ragged/to_tensor.md),
-  [`from_sparse(...)`](ragged/from_sparse.md),
-  [`to_sparse(...)`](ragged/to_sparse.md),
-  [`from_variant(...)`](ragged/from_variant.md),
-  [`to_variant(...)`](ragged/to_variant.md),
-  [`convert_to_tensor_or_ragged_tensor(...)`](
-  ragged/convert_to_tensor_or_ragged_tensor.md).
-
-* **Shape ops**:
-  [`row_splits(...)`](ragged/row_splits.md),
-  [`row_lengths(...)`](ragged/row_lengths.md),
-  [`row_starts(...)`](ragged/row_starts.md),
-  [`row_limits(...)`](ragged/row_limits.md),
-  [`value_rowids(...)`](ragged/value_rowids.md),
-  [`nrows(...)`](ragged/nrows.md),
-  [`nested_row_splits(...)`](ragged/nested_row_splits.md),
-  [`row_splits_to_segment_ids(...)`](ragged/row_splits_to_segment_ids.md),
-  [`segment_ids_to_row_splits(...)`](ragged/segment_ids_to_row_splits.md),
-  [`bounding_shape(...)`](ragged/bounding_shape.md).
-
-* **Functional ops**:
-  [`map_inner_values(...)`](ragged/map_inner_values.md),
-  [`make_elementwise_op(...)`](ragged/make_elementwise_op.md).
-
-
 <!-- Ragged Classes & related helper functions -->
 @@RaggedTensor
 @@RaggedTensorType
@@ -80,15 +16,9 @@ The following operations are specific to ragged tensors:
 @@is_ragged
 
 <!-- Factory Ops -->
+@@ragged_factory_ops
 @@constant
 @@constant_value
-@@from_row_splits
-@@from_row_lengths
-@@from_row_starts
-@@from_row_limits
-@@from_value_rowids
-@@from_nested_row_splits
-@@from_nested_value_rowids
 @@convert_to_tensor_or_ragged_tensor
 
 <!-- Conversion Ops -->
@@ -100,14 +30,6 @@ The following operations are specific to ragged tensors:
 @@segment_ids_to_row_splits
 
 <!-- Array Ops -->
-@@row_splits
-@@row_lengths
-@@row_starts
-@@row_limits
-@@value_rowids
-@@nrows
-@@nested_row_splits
-@@bounding_shape
 @@gather
 @@batch_gather
 @@gather_nd
@@ -137,35 +59,31 @@ The following operations are specific to ragged tensors:
 @@reduce_any
 
 <!-- Functional Ops -->
-@@map_inner_values
+@@map_flat_values
 @@map_fn
 
-<!-- Elementwise Ops -->
-@@make_elementwise_op
-
-<!-- Symbols from  ragged_elementwise_ops._symbols_to_export are whitelisted -->
+<!-- Shape & broadcasting -->
+@@RaggedTensorDynamicShape
+@@broadcast_to
+@@broadcast_dynamic_shape
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.ops.ragged import ragged_dispatch
 from tensorflow.python.ops.ragged import ragged_operators
+from tensorflow.python.ops.ragged import ragged_string_ops
 
 from tensorflow.python.ops.ragged.ragged_array_ops import batch_gather
 from tensorflow.python.ops.ragged.ragged_array_ops import boolean_mask
-from tensorflow.python.ops.ragged.ragged_array_ops import bounding_shape
 from tensorflow.python.ops.ragged.ragged_array_ops import concat
 from tensorflow.python.ops.ragged.ragged_array_ops import expand_dims
 from tensorflow.python.ops.ragged.ragged_array_ops import gather
 from tensorflow.python.ops.ragged.ragged_array_ops import gather_nd
-from tensorflow.python.ops.ragged.ragged_array_ops import nrows
-from tensorflow.python.ops.ragged.ragged_array_ops import row_lengths
-from tensorflow.python.ops.ragged.ragged_array_ops import row_limits
-from tensorflow.python.ops.ragged.ragged_array_ops import row_starts
 from tensorflow.python.ops.ragged.ragged_array_ops import stack
 from tensorflow.python.ops.ragged.ragged_array_ops import tile
-from tensorflow.python.ops.ragged.ragged_array_ops import value_rowids
 from tensorflow.python.ops.ragged.ragged_array_ops import where
 
 from tensorflow.python.ops.ragged.ragged_conversion_ops import from_sparse
@@ -173,23 +91,10 @@ from tensorflow.python.ops.ragged.ragged_conversion_ops import from_tensor
 from tensorflow.python.ops.ragged.ragged_conversion_ops import to_sparse
 from tensorflow.python.ops.ragged.ragged_conversion_ops import to_tensor
 
-# pylint: disable=protected-access, wildcard-import
-from tensorflow.python.ops.ragged.ragged_elementwise_ops import *
-from tensorflow.python.ops.ragged.ragged_elementwise_ops import _symbols_to_export as _elementwise_ops
-# pylint: enable=protected-access, wildcard-import
-
 from tensorflow.python.ops.ragged.ragged_factory_ops import constant
 from tensorflow.python.ops.ragged.ragged_factory_ops import constant_value
-from tensorflow.python.ops.ragged.ragged_factory_ops import convert_to_tensor_or_ragged_tensor
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_row_splits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_nested_value_rowids
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_lengths
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_limits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_splits
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_row_starts
-from tensorflow.python.ops.ragged.ragged_factory_ops import from_value_rowids
 
-from tensorflow.python.ops.ragged.ragged_functional_ops import map_inner_values
+from tensorflow.python.ops.ragged.ragged_functional_ops import map_flat_values
 
 from tensorflow.python.ops.ragged.ragged_map_ops import map_fn
 
@@ -210,10 +115,15 @@ from tensorflow.python.ops.ragged.ragged_math_ops import segment_prod
 from tensorflow.python.ops.ragged.ragged_math_ops import segment_sqrt_n
 from tensorflow.python.ops.ragged.ragged_math_ops import segment_sum
 
+from tensorflow.python.ops.ragged.ragged_tensor import convert_to_tensor_or_ragged_tensor
 from tensorflow.python.ops.ragged.ragged_tensor import is_ragged
 from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensor
 from tensorflow.python.ops.ragged.ragged_tensor import RaggedTensorType
 
+from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_dynamic_shape
+from tensorflow.python.ops.ragged.ragged_tensor_shape import broadcast_to
+from tensorflow.python.ops.ragged.ragged_tensor_shape import RaggedTensorDynamicShape
+
 from tensorflow.python.ops.ragged.ragged_tensor_value import RaggedTensorValue
 
 from tensorflow.python.ops.ragged.segment_id_ops import row_splits_to_segment_ids
@@ -221,6 +131,10 @@ from tensorflow.python.ops.ragged.segment_id_ops import segment_ids_to_row_split
 
 from tensorflow.python.util import all_util as _all_util
 
+
+# Register OpDispatchers that override standard TF ops to work w/ RaggedTensors.
+__doc__ += ragged_dispatch.register_dispatchers()  # pylint: disable=redefined-builtin
+
 # Any symbol that is not referenced (with "@@name") in the module docstring
-# above, or included in the "_elementwise_ops" whitelist, will be removed.
-_all_util.remove_undocumented(__name__, _elementwise_ops)
+# above will be removed.
+_all_util.remove_undocumented(__name__)
diff --git a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
index b43470dfa11f1694cfb6017f3f1552704337fbed..b88f18c8b61a2fbc33aeca1f799c8e518cac4bf6 100644
--- a/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/convert_to_tensor_or_ragged_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.convert_to_tensor_or_ragged_tensor."""
+"""Tests for ragged.convert_to_tensor_or_ragged."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -25,11 +25,13 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
-                                              parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConvertToTensorOrRaggedTensorTest(
+    ragged_test_util.RaggedTensorTestCase, parameterized.TestCase):
 
   #=============================================================================
   # Tests where the 'value' param is a RaggedTensor
@@ -101,8 +103,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
         value, dtype, preferred_dtype)
     self.assertEqual(value.ragged_rank, converted.ragged_rank)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
-    with self.test_session():
-      self.assertEqual(value.tolist(), converted.eval().tolist())
+    self.assertEqual(value.to_list(), self.eval_to_list(converted))
 
   @parameterized.parameters([
       dict(
@@ -130,8 +131,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
     tensor = constant_op.constant(pylist)
     converted = ragged.convert_to_tensor_or_ragged_tensor(
         tensor, dtype, preferred_dtype)
-    with self.test_session():
-      self.assertIs(tensor, converted)
+    self.assertIs(tensor, converted)
 
   @parameterized.parameters([
       dict(
@@ -187,8 +187,7 @@ class RaggedConvertToTensorOrRaggedTensorTest(test_util.TensorFlowTestCase,
     converted = ragged.convert_to_tensor_or_ragged_tensor(
         value, dtype, preferred_dtype)
     self.assertEqual(dtypes.as_dtype(expected_dtype), converted.dtype)
-    with self.test_session():
-      self.assertAllEqual(value, converted)
+    self.assertAllEqual(value, converted)
 
   @parameterized.parameters([
       dict(
diff --git a/tensorflow/python/ops/ragged/ragged_array_ops.py b/tensorflow/python/ops/ragged/ragged_array_ops.py
index 425f3957c38550f43ceb74fff7f236bff1ace69c..b5917bc4ee6f6f5fb1d46f3e75cbdb66ef156bad 100644
--- a/tensorflow/python/ops/ragged/ragged_array_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_array_ops.py
@@ -27,266 +27,18 @@ from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_ragged_array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_conversion_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.ops.ragged import segment_id_ops
 
-#===============================================================================
-# Row Partitioning
-#===============================================================================
-
-
-def value_rowids(rt_input, name=None):
-  """Returns the row indices for the `values` in the given ragged tensor.
-
-  `value_rowids(rt)` corresponds one-to-one with the outermost dimension of
-  `rt.values`, and specifies the row containing each value.  In particular,
-  the row `rt[row]` consists of the values `rt.values[j]` where
-  `value_rowids(rt)[j] == row`.
-
-  Args:
-    rt_input: The RaggedTensor whose row indices should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> rt.values.eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.value_rowids(rt).eval()
-    [0, 0, 0, 0, 2, 2, 2, 3]  # corresponds 1:1 with rt.values
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_value_rowids is not None):
-    return rt_input.cached_value_rowids
-
-  with ops.name_scope(name, 'RaggedValueRowIds', [rt_input]):
-    return segment_id_ops.row_splits_to_segment_ids(rt_input.row_splits)
-
-
-def nrows(rt_input, out_type=dtypes.int64, name=None):
-  """Returns the number of rows in the given potentially ragged tensor.
-
-  I.e., the size of the outermost dimension of the tensor.
-
-  Args:
-    rt_input: The potentially ragged tensor whose number of rows should be
-      returned.
-    out_type: `dtype` for the returned tensor.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A scalar `Tensor` with dtype `out_type`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.nrows(rt).eval()  # rt has 5 rows.
-    5
-    ```
-  """
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_nrows is not None):
-    return rt_input.cached_nrows
-
-  with ops.name_scope(name, 'RaggedNRows', [rt_input]):
-    if ragged_tensor.is_ragged(rt_input):
-      return array_ops.shape(rt_input.row_splits, out_type=out_type)[0] - 1
-    else:
-      return array_ops.shape(rt_input, out_type=out_type)[0]
-
-
-def row_starts(rt_input, name=None):
-  """Returns the start indices for rows in the given ragged tensor.
-
-  These indices specify where the values for each row begin in
-  `rt_input.values`.  `ragged.row_starts(rt_input)` is equal to
-  `rt_input.row_splits[:-1]`.
-
-  Args:
-    rt_input: The RaggedTensor whose row starts should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D Tensor of int64 with shape `[nrows]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.values(rt).eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.row_starts(rt).eval()  # indices of row starts in ragged.values
-    [0, 4, 4, 7, 8]
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedRowStarts', [rt_input]):
-    return rt_input.row_splits[:-1]
-
-
-def row_limits(rt_input, name=None):
-  """Returns the limit indices for rows in the given ragged tensor.
-
-  These indices specify where the values for each row end in
-  `rt_input.values`.  `ragged.row_limits(rt_input)` is equal to
-  `rt_input.row_splits[:-1]`.
-
-  Args:
-    rt_input: The RaggedTensor whose row limits should be returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A 1-D Tensor of int64 with shape `[nrows]`.
-    The returned tensor is nonnegative, and is sorted in ascending order.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-    >>> ragged.values(rt).eval()
-    [3, 1, 4, 1, 5, 9, 2, 6]
-    >>> ragged.row_limits(rt).eval()  # indices of row limits in ragged.values
-    [4, 4, 7, 8, 8]
-    ```
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError(
-        'rt_input expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedRowLimits', [rt_input]):
-    return rt_input.row_splits[1:]
-
-
-def row_lengths(rt_input, axis=1, name=None):
-  """Returns the lengths of the rows in the given potentially ragged tensor.
-
-  `ragged.row_lengths(rt_input)[i]` indicates the number of values in the
-  `i`th row of `rt_input`.
-
-  Args:
-    rt_input: The potentially ragged tensor whose row lengths should be
-      returned.  Must have at least `axis+1` dimensions.
-    axis: An integer constant indicating the axis whose row lengths should be
-      returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    A potentially Tensor of int64 with shape `rt_input.shape[:axis]`.
-
-  Raises:
-    ValueError: If rt_input is a scalar, or `axis` is out of bounds.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
-    >>> ragged.row_lengths(rt).eval()  # lengths of rows in rt
-    [2, 0, 2, 1, 0]
-    >>> ragged.row_lengths(rt, axis=2).eval()  # lengths of axis=2 rows.
-    [[3, 1], [], [2, 1], [1], []]
-    ```
-  """
-  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
-      rt_input.cached_row_lengths is not None):
-    return rt_input.cached_row_lengths
-
-  with ops.name_scope(name, 'RaggedRowLengths', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    ndims = rt_input.shape.ndims
-    if ndims is not None:
-      if ndims == 0:
-        raise ValueError('rt_input may not be a scalar.')
-      elif not -ndims <= axis < ndims:
-        raise ValueError('axis=%d out of bounds: expected %d<=axis<%d.' %
-                         (axis, -ndims, ndims))
-    if ragged_tensor.is_ragged(rt_input):
-      axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
-      if axis == 0:
-        return nrows(rt_input)
-      elif axis == 1:
-        splits = rt_input.row_splits
-        return splits[1:] - splits[:-1]
-      else:
-        return rt_input.with_values(row_lengths(rt_input.values, axis - 1))
-    else:
-      shape = array_ops.shape(rt_input, out_type=dtypes.int64)
-      return array_ops.ones(shape[:axis], dtypes.int64) * shape[axis]
-
-
-#===============================================================================
-# Bounding Shape
-#===============================================================================
-def bounding_shape(rt_input, axis=None, name=None):
-  """Returns the tight bounding box shape for a potentially ragged tensor.
-
-  Args:
-    rt_input: A potentially ragged tensor.
-    axis: An integer scalar or vector indicating which axes to return the
-      bounding box for.  If not specified, then the full bounding box is
-      returned.
-    name: A name prefix for the returned tensor (optional).
-
-  Returns:
-    An int64 `Tensor`.  If `axis` is not specified, then `output`
-    is a vector with `output.shape=[rt_input.shape.ndims]`.  If `axis` is a
-    scalar, then the `output` is a scalar.  If `axis` is a vector, then
-    `output` is a vector, where `output[i]` is the bounding size for
-    dimension `axis[i]`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
-    >>> ragged.bounding_shape(rt).eval().tolist()
-    [5, 4]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedBoundingBox', [rt_input, axis]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    if not ragged_tensor.is_ragged(rt_input):
-      bbox = array_ops.shape(rt_input)
-      return bbox if axis is None else array_ops.gather(bbox, axis)
-
-    nested_splits = rt_input.nested_row_splits
-    rt_inner_values = rt_input.inner_values
-
-    # Optimized special cases for when axis=0 or axis=1:
-    if isinstance(axis, int):
-      if axis == 0:
-        return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
-      elif axis == 1:
-        return math_ops.maximum(math_ops.reduce_max(row_lengths(rt_input)), 0)
-
-    splits_shape = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)
-    inner_values_shape = array_ops.shape(rt_inner_values, out_type=dtypes.int64)
-
-    ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
-        math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
-        for splits in nested_splits
-    ])
-    inner_dimensions = inner_values_shape[1:]
-
-    bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
-    return bbox if axis is None else array_ops.gather(bbox, axis)
-
 
 #===============================================================================
 # ragged_gather
 #===============================================================================
 # TODO(edloper): Add an `axis` argument
-def gather(params, indices, name=None):
+def gather(params, indices, validate_indices=None, axis=0, name=None):
   """Gathers ragged slices from `params` axis `0` according to `indices`.
 
   Returns `RaggedTensor` output, such that:
@@ -309,13 +61,13 @@ def gather(params, indices, name=None):
   >>> ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
   >>> ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
 
-  >>> print ragged.gather(params, ragged_indices).eval().tolist()
+  >>> print ragged.gather(params, ragged_indices)
   [['d', 'b', 'c'], ['b'], [], ['a']]
 
-  >>> print ragged.gather(ragged_params, indices).eval().tolist()
+  >>> print ragged.gather(ragged_params, indices)
   [['e'], ['d'], [], ['d'], ['a', 'b', 'c']]
 
-  >>> print ragged.gather(ragged_params, ragged_indices).eval().tolist()
+  >>> print ragged.gather(ragged_params, ragged_indices)
   [[['e'], ['d'], []], [['d']], [], [['a', 'b', 'c']]]
   ```
 
@@ -325,6 +77,8 @@ def gather(params, indices, name=None):
     indices: The potentially ragged tensor indicating which values to gather.
       Must have dtype `int32` or `int64`.  Values must be in the range `[0,
       params.shape[0]]`.
+    validate_indices: Ignored.
+    axis: Must be zero.
     name: A name for the operation (optional).
 
   Returns:
@@ -335,10 +89,13 @@ def gather(params, indices, name=None):
   Raises:
     ValueError: If indices.shape.ndims is not known statically.
   """
+  del validate_indices
+  if not isinstance(axis, int) or axis != 0:
+    raise ValueError('axis>0 is not supported for ragged gather yet.')
   with ops.name_scope(name, 'RaggedGather', [params, indices]):
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
 
     if ragged_tensor.is_ragged(indices):
@@ -353,13 +110,13 @@ def gather(params, indices, name=None):
 
     result = gen_ragged_array_ops.ragged_gather(
         indices=indices,
-        params_dense_values=params.inner_values,
+        params_dense_values=params.flat_values,
         params_nested_splits=params.nested_row_splits,
         OUTPUT_RAGGED_RANK=indices.shape.ndims + len(params.nested_row_splits) -
         1)
 
     # Compose the RaggedTensor from splits & values.
-    return ragged_factory_ops.from_nested_row_splits(
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
         result.output_dense_values, result.output_nested_splits)
 
 
@@ -402,9 +159,9 @@ def batch_gather(params, indices, name=None):
     return array_ops.batch_gather(params, indices, name)
 
   with ops.name_scope(name, 'RaggedBatchGather', [params, indices]):
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
     indices_ndims = indices.shape.ndims
     if indices_ndims is None:
@@ -421,7 +178,7 @@ def batch_gather(params, indices, name=None):
                            'not match params shape')
         checks = [check_ops.assert_equal(params.row_splits, indices.row_splits)]
         with ops.control_dependencies(checks):
-          return ragged_factory_ops.from_row_splits(
+          return ragged_tensor.RaggedTensor.from_row_splits(
               batch_gather(params.values, indices.values), indices.row_splits)
 
       # Otherwise, indices is a 2D ragged tensor with 1 ragged dimension.
@@ -435,11 +192,11 @@ def batch_gather(params, indices, name=None):
 
         # Adjust indices from within-batch to global (in params.values), and
         # then use ragged.gather to gather them.
-        num_indices = row_lengths(indices)
-        params_starts = row_starts(params)
+        num_indices = indices.row_lengths()
+        params_starts = params.row_starts()
         adjustments = ragged_util.repeat(params_starts, num_indices, axis=0)
         adjusted_index_values = math_ops.to_int64(indices.values) + adjustments
-        return ragged_factory_ops.from_row_splits(
+        return ragged_tensor.RaggedTensor.from_row_splits(
             gather(params.values, adjusted_index_values), indices.row_splits)
 
     else:  # params is a RaggedTensor and indices is a Tensor.
@@ -447,12 +204,11 @@ def batch_gather(params, indices, name=None):
         return gather(params, indices)
       elif indices_ndims == 2:
         # Adjust indices from batch-local to global (in params.values)
-        adjustments = array_ops.expand_dims(row_starts(params), 1)
+        adjustments = array_ops.expand_dims(params.row_starts(), 1)
         adjusted_indices = math_ops.to_int64(indices) + adjustments
         return gather(params.values, adjusted_indices)
       else:
-        raise ValueError(
-            'batch shape from indices does not match params shape')
+        raise ValueError('batch shape from indices does not match params shape')
 
 
 #===============================================================================
@@ -506,9 +262,9 @@ def gather_nd(params, indices, name=None):
 
   with ops.name_scope(name, 'RaggedGatherNd', [params, indices]):
 
-    params = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    params = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         params, name='params')
-    indices = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    indices = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         indices, name='indices')
     indices_shape = indices.shape
     indices_ndims = indices_shape.ndims
@@ -522,7 +278,7 @@ def gather_nd(params, indices, name=None):
 
     # `index_size` is the "n" in "gather_nd" -- i.e., the number of dimensions
     # that each index slices into.
-    index_size = indices_shape[-1].value
+    index_size = tensor_shape.dimension_value(indices_shape[-1])
     if index_size is None:
       raise ValueError('indices.shape[-1] must be statically known.')
 
@@ -534,8 +290,7 @@ def gather_nd(params, indices, name=None):
       if indices_is_dense:
         indices = ragged_conversion_ops.from_tensor(
             indices, ragged_rank=indices_ndims - 2)
-      result = indices.with_inner_values(
-          gather_nd(params, indices.inner_values))
+      result = indices.with_flat_values(gather_nd(params, indices.flat_values))
       if (indices_is_dense and ragged_tensor.is_ragged(result) and
           result.ragged_rank == indices_ndims - 2):
         result = ragged_conversion_ops.to_tensor(result)
@@ -549,7 +304,7 @@ def gather_nd(params, indices, name=None):
     # Handle corner case: An empty index tuple selects the entire `params`
     # value.  So if `index_size` is zero, then tile `params`.
     if index_size == 0:
-      params_ndims = params.ragged_rank + array_ops.rank(params.inner_values)
+      params_ndims = params.ragged_rank + array_ops.rank(params.flat_values)
       for dim in range(indices_ndims - 1):
         params = expand_dims(params, axis=0)
       multiples = array_ops.concat([
@@ -587,7 +342,7 @@ def gather_nd(params, indices, name=None):
           return array_ops.gather_nd(flattened_params, flattened_index_tuples)
 
         flattened_index_tuples = array_ops.gather(
-            row_starts(flattened_params), flattened_index_tuples)
+            flattened_params.row_starts(), flattened_index_tuples)
         flattened_index_tuples += indices[..., dim]
         flattened_params = flattened_params.values
 
@@ -683,9 +438,8 @@ def boolean_mask(data, mask, keepdims=False, name=None):
   """
   with ops.name_scope(name, 'RaggedMask', [data, mask]):
     # Convert inputs to tensors.
-    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        data, name='data')
-    mask = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    mask = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         mask, dtypes.bool, name='mask')
 
     # Get static rank of mask.
@@ -716,10 +470,10 @@ def boolean_mask(data, mask, keepdims=False, name=None):
           else:
             # Count the number of True mask values in each row to find the
             # lengths of the filtered rows; then convert to splits.
-            int_mask = ragged_functional_ops.map_inner_values(
+            int_mask = ragged_functional_ops.map_flat_values(
                 math_ops.cast, mask, dtype=dtypes.int64)
             masked_row_lengths = ragged_math_ops.reduce_sum(int_mask, axis=1)
-            splits.append(_lengths_to_splits(masked_row_lengths))
+            splits.append(ragged_util.lengths_to_splits(masked_row_lengths))
           mask = mask.values
           data = data.values
 
@@ -728,7 +482,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 
         # Add the ragged `splits` back to the result.
         if keepdims:
-          masked_values = ragged_factory_ops.from_nested_row_splits(
+          masked_values = ragged_tensor.RaggedTensor.from_nested_row_splits(
               masked_values, splits)
 
         return masked_values
@@ -739,9 +493,9 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       # Get the masked splits: first get the length of each row, then filter
       # out the rows that we are deleting, and convert that filtered set of
       # masks back to a splits tensor.
-      lengths = row_lengths(data)
+      lengths = data.row_lengths()
       masked_lengths = array_ops.boolean_mask(lengths, mask)
-      masked_splits = _lengths_to_splits(masked_lengths)
+      masked_splits = ragged_util.lengths_to_splits(masked_lengths)
 
       # Get the masked values: first get row ids corresponding to each
       # value, then use tf.gather to build a boolean mask that's false for
@@ -751,7 +505,8 @@ def boolean_mask(data, mask, keepdims=False, name=None):
       segment_mask = array_ops.gather(mask, segment_ids)
       masked_values = boolean_mask(data.values, segment_mask, keepdims=False)
 
-      return ragged_factory_ops.from_row_splits(masked_values, masked_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(masked_values,
+                                                        masked_splits)
 
     # If mask is non-ragged and has rank>1, then convert it to be ragged,
     # with a ragged rank matching data.
@@ -772,7 +527,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
         # and values to get the innermost ragged tensor.
         masked_lengths = math_ops.count_nonzero(mask, axis=-1)
         flattened_masked_lengths = array_ops.reshape(masked_lengths, [-1])
-        masked_values = ragged_factory_ops.from_row_lengths(
+        masked_values = ragged_tensor.RaggedTensor.from_row_lengths(
             masked_values, flattened_masked_lengths)
 
         # Wrap remaining ragged dimensions.
@@ -782,7 +537,7 @@ def boolean_mask(data, mask, keepdims=False, name=None):
           for dim in range(mask.shape.ndims - 3, -1, -1):
             elt_size = mask_shape[dim + 1]
             masked_splits = math_ops.range(split_size[dim]) * elt_size
-            masked_values = ragged_factory_ops.from_row_splits(
+            masked_values = ragged_tensor.RaggedTensor.from_row_splits(
                 masked_values, masked_splits)
 
       return masked_values
@@ -791,29 +546,29 @@ def boolean_mask(data, mask, keepdims=False, name=None):
 #===============================================================================
 # Concatenation and Stacking
 #===============================================================================
-def concat(rt_inputs, axis, name=None):
+def concat(values, axis, name=None):
   """Concatenates potentially ragged tensors along one dimension.
 
   Given a list of tensors with the same rank `K` (`K >= axis`), returns a
   rank-`K` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  concatenation of `[rt[i0...iaxis] for rt in rt_inputs]`.
+  concatenation of `[rt[i0...iaxis] for rt in values]`.
 
   Args:
-    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
-      `rt_inputs` must have the same rank and the same dtype; but unlike
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
       `tf.concat`, they can have arbitrary shapes.
     axis: A python integer, indicating the dimension along which to concatenate.
       (Note: Unlike `tf.concat`, the `axis` parameter must be statically known.)
         Negative values are supported only if the rank of at least one
-        `rt_inputs` value is statically known.
+        `values` value is statically known.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
     A `RaggedTensor` with rank `K`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
 
   Raises:
-    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
       the input tensors have different ranks.
 
   #### Example:
@@ -826,35 +581,35 @@ def concat(rt_inputs, axis, name=None):
     [[1, 2, 6], [3, 4, 5, 7, 8, 9]]
     ```
   """
-  if not isinstance(rt_inputs, (list, tuple)):
-    rt_inputs = [rt_inputs]
-  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
-    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=False)
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=False)
 
 
-def stack(rt_inputs, axis, name=None):
+def stack(values, axis, name=None):
   """Stacks potentially ragged tensors along one dimension.
 
   Given a list of tensors with the same rank `K` (`K >= axis`), returns a
   rank-`K+1` `RaggedTensor` `result` such that `result[i0...iaxis]` is the
-  list `[rt[i0...iaxis] for rt in rt_inputs]`.
+  list `[rt[i0...iaxis] for rt in values]`.
 
   Args:
-    rt_inputs: A list of potentially ragged tensors.  May not be empty. All
-      `rt_inputs` must have the same rank and the same dtype; but unlike
+    values: A list of potentially ragged tensors.  May not be empty. All
+      `values` must have the same rank and the same dtype; but unlike
       `tf.concat`, they can have arbitrary shapes.
     axis: A python integer, indicating the dimension along which to stack.
       (Note: Unlike `tf.stack`, the `axis` parameter must be statically known.)
         Negative values are supported only if the rank of at least one
-        `rt_inputs` value is statically known.
+        `values` value is statically known.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
     A `RaggedTensor` with rank `K+1`.
-    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in rt_inputs]))`.
+    `result.ragged_rank=max(axis, max(rt.ragged_rank for rt in values]))`.
 
   Raises:
-    ValueError: If `rt_inputs` is empty, if `axis` is out of bounds or if
+    ValueError: If `values` is empty, if `axis` is out of bounds or if
       the input tensors have different ranks.
 
   #### Example:
@@ -867,10 +622,10 @@ def stack(rt_inputs, axis, name=None):
     [[[1, 2], [6]], [[3, 4, 5], [7, 8, 9]]]
     ```
   """
-  if not isinstance(rt_inputs, (list, tuple)):
-    rt_inputs = [rt_inputs]
-  with ops.name_scope(name, 'RaggedConcat', rt_inputs):
-    return _ragged_stack_concat_helper(rt_inputs, axis, stack_values=True)
+  if not isinstance(values, (list, tuple)):
+    values = [values]
+  with ops.name_scope(name, 'RaggedConcat', values):
+    return _ragged_stack_concat_helper(values, axis, stack_values=True)
 
 
 def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
@@ -893,7 +648,7 @@ def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
 
   # Convert input tensors.
   rt_inputs = [
-      ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+      ragged_tensor.convert_to_tensor_or_ragged_tensor(
           rt_input, name='rt_input') for rt_input in rt_inputs
   ]
 
@@ -944,7 +699,7 @@ def _ragged_stack_concat_helper(rt_inputs, axis, stack_values):
     values = [rt.values for rt in rt_inputs]
     splits = [[rt_input.row_splits] for rt_input in rt_inputs]
     with ops.control_dependencies(ragged_util.assert_splits_match(splits)):
-      return ragged_factory_ops.from_row_splits(
+      return ragged_tensor.RaggedTensor.from_row_splits(
           _ragged_stack_concat_helper(values, axis - 1, stack_values),
           splits[0][0])
 
@@ -961,8 +716,8 @@ def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
     A RaggedTensor.
   """
   # Concatenate the inner values together.
-  inner_values = [rt.inner_values for rt in rt_inputs]
-  concatenated_inner_values = array_ops.concat(inner_values, axis=0)
+  flat_values = [rt.flat_values for rt in rt_inputs]
+  concatenated_flat_values = array_ops.concat(flat_values, axis=0)
 
   # Concatenate the splits together for each ragged dimension (adjusting
   # split offsets as necessary).
@@ -976,12 +731,12 @@ def _ragged_stack_concat_axis_0(rt_inputs, stack_values):
 
   # If we are performing a stack operation, then add another splits.
   if stack_values:
-    stack_lengths = array_ops.stack([nrows(rt) for rt in rt_inputs])
-    stack_splits = _lengths_to_splits(stack_lengths)
+    stack_lengths = array_ops.stack([_nrows(rt) for rt in rt_inputs])
+    stack_splits = ragged_util.lengths_to_splits(stack_lengths)
     concatenated_nested_splits.insert(0, stack_splits)
 
-  return ragged_factory_ops.from_nested_row_splits(concatenated_inner_values,
-                                                   concatenated_nested_splits)
+  return ragged_tensor.RaggedTensor.from_nested_row_splits(
+      concatenated_flat_values, concatenated_nested_splits)
 
 
 def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
@@ -997,10 +752,10 @@ def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
   """
   num_inputs = len(rt_inputs)
 
-  rt_nrows = nrows(rt_inputs[0])
+  rt_nrows = _nrows(rt_inputs[0])
   nrows_msg = 'Input tensors have incompatible shapes.'
   nrows_checks = [
-      check_ops.assert_equal(nrows(rt), rt_nrows, message=nrows_msg)
+      check_ops.assert_equal(_nrows(rt), rt_nrows, message=nrows_msg)
       for rt in rt_inputs[1:]
   ]
 
@@ -1024,14 +779,15 @@ def _ragged_stack_concat_axis_1(rt_inputs, stack_values):
       # Add a new splits tensor to group together the values.
       stack_splits = math_ops.range(0, rt_nrows * num_inputs + 1, num_inputs)
       _copy_row_shape(rt_inputs, stack_splits)
-      return ragged_factory_ops.from_row_splits(permuted_rt, stack_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt,
+                                                        stack_splits)
     else:
       # Merge together adjacent rows by dropping the row-split indices that
       # separate them.
       concat_splits = permuted_rt.row_splits[::num_inputs]
       _copy_row_shape(rt_inputs, concat_splits)
-      return ragged_factory_ops.from_row_splits(permuted_rt.values,
-                                                concat_splits)
+      return ragged_tensor.RaggedTensor.from_row_splits(permuted_rt.values,
+                                                        concat_splits)
 
 
 def _copy_row_shape(rt_inputs, splits):
@@ -1044,53 +800,53 @@ def _copy_row_shape(rt_inputs, splits):
 #===============================================================================
 # Tiling
 #===============================================================================
-def tile(rt_input, multiples, name=None):
+def tile(input, multiples, name=None):  # pylint: disable=redefined-builtin
   """Constructs a `RaggedTensor` by tiling a given `RaggedTensor`.
 
-  The values of `rt_input` are replicated `multiples[i]` times along the
+  The values of `input` are replicated `multiples[i]` times along the
   `i`th dimension (for each dimension `i`).  For every dimension `axis` in
-  `rt_input`, the length of each output element in that dimension is the
+  `input`, the length of each output element in that dimension is the
   length of corresponding input element multiplied by `multiples[axis]`.
 
   Args:
-    rt_input: A `RaggedTensor`.
+    input: A `RaggedTensor`.
     multiples: A 1-D integer `Tensor`.  Length must be the same as the number of
-      dimensions in `rt_input`.
+      dimensions in `input`.
     name: A name for the operation (optional).
 
   Returns:
-    A `RaggedTensor` with the same type, rank, and ragged_rank as `rt_input`.
+    A `RaggedTensor` with the same type, rank, and ragged_rank as `input`.
 
   #### Example:
     ```python
     >>> rt = ragged.constant([[1, 2], [3]])
-    >>> ragged.tile(rt, [3, 2]).eval().tolist()
+    >>> ragged.tile(rt, [3, 2])
     [[1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3], [1, 2, 1, 2], [3, 3]]
     ```
   """
-  with ops.name_scope(name, 'RaggedTile', [rt_input, multiples]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
+  with ops.name_scope(name, 'RaggedTile', [input, multiples]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
     multiples = ragged_util.convert_to_int_tensor(
         multiples, name='multiples', dtype=dtypes.int64)
     multiples.shape.assert_has_rank(1)
-    if not ragged_tensor.is_ragged(rt_input):
-      return array_ops.tile(rt_input, multiples, name)
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.tile(input, multiples, name)
 
     # If the constant value of `multiples` is available, then we can use it
     # to skip tiling dimensions where `multiples=1`.
     const_multiples = tensor_util.constant_value(multiples)
 
-    return ragged_factory_ops.from_nested_row_splits(
-        _tile_ragged_values(rt_input, multiples, const_multiples),
-        _tile_ragged_splits(rt_input, multiples, const_multiples))
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        _tile_ragged_values(input, multiples, const_multiples),
+        _tile_ragged_splits(input, multiples, const_multiples))
 
 
 def _tile_ragged_values(rt_input, multiples, const_multiples=None):
-  """Builds inner_values tensor for a tiled `RaggedTensor`.
+  """Builds flat_values tensor for a tiled `RaggedTensor`.
 
   Returns a tensor that repeats the values in
-  `rt_input.inner_values` in the
+  `rt_input.flat_values` in the
   appropriate pattern to construct a `RaggedTensor` that tiles `rt_input` as
   specified by `multiples`.
 
@@ -1102,19 +858,19 @@ def _tile_ragged_values(rt_input, multiples, const_multiples=None):
       dimensions where `multiples=1`.
 
   Returns:
-    A `Tensor` with the same type and rank as `rt_input.inner_values`.
+    A `Tensor` with the same type and rank as `rt_input.flat_values`.
 
   #### Example:
     ```python
     >>> rt = ragged.constant([[1, 2], [3]])
-    >>> _tile_ragged_values(rt, [3, 2]).eval().tolist()
+    >>> _tile_ragged_values(rt, [3, 2])
     [1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3, 1, 2, 1, 2, 3, 3]
     ```
   """
   ragged_rank = rt_input.ragged_rank
   nested_splits = rt_input.nested_row_splits
 
-  # Pointers to the values in `rt_input.inner_values`.
+  # Pointers to the values in `rt_input.flat_values`.
   inner_value_ids = math_ops.range(nested_splits[-1][-1])
 
   # For each ragged dimension (working from the innermost to outermost),
@@ -1131,14 +887,15 @@ def _tile_ragged_values(rt_input, multiples, const_multiples=None):
 
     # Repeat each element in this ragged dimension `multiples[axis]` times.
     if const_multiples is None or const_multiples[axis] != 1:
-      inner_value_ids = _repeat_ranges(inner_value_ids, splits, multiples[axis])
+      inner_value_ids = ragged_util.repeat_ranges(inner_value_ids, splits,
+                                                  multiples[axis])
 
     prev_splits = splits
 
   # Gather the tiled inner values.
-  ragged_tiled_values = array_ops.gather(rt_input.inner_values, inner_value_ids)
+  ragged_tiled_values = array_ops.gather(rt_input.flat_values, inner_value_ids)
 
-  # Tile the inner_values for the uniform dimensions (i.e., for `axis=0` plus
+  # Tile the flat_values for the uniform dimensions (i.e., for `axis=0` plus
   # `axis=range(ragged_rank, rank)`).
   inner_repeats = array_ops.concat([multiples[:1], multiples[ragged_rank + 1:]],
                                    axis=0)
@@ -1165,13 +922,24 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
   #### Example:
     ```python
     >>> rt = ragged.constant([[1, 2], [3]])
-    >>> _tile_ragged_splits(rt, [3, 2]).eval().tolist()
+    >>> _tile_ragged_splits(rt, [3, 2])
     [0, 4, 6, 10, 12, 16, 18]
     ```
   """
   ragged_rank = rt_input.ragged_rank
   nested_splits = rt_input.nested_row_splits
 
+  # projected_splits[src_axis, dst_axis] contains the split points that divide
+  # the rows from src_axis in the list of dst_axis values.  E.g.,
+  # projected_splits[i, i] = nested_splits[i], and
+  # projected_splits[i, i+1] = gather(nested_splits[i+1], nested_splits[i]).
+  projected_splits = [{i: nested_splits[i]} for i in range(ragged_rank)]
+  for src_axis in range(ragged_rank):
+    for dst_axis in range(src_axis + 1, ragged_rank - 1):
+      projected_splits[src_axis][dst_axis] = array_ops.gather(
+          nested_splits[dst_axis],
+          projected_splits[src_axis][dst_axis - 1])
+
   # For each ragged dimension: nested_splits[axis] -> result_splits[axis].
   result_splits = []
   for axis in range(ragged_rank):
@@ -1188,16 +956,16 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
     repeats = 1
     for d in range(axis - 1, -1, -1):
       if const_multiples is None or const_multiples[d + 1] != 1:
-        splits = nested_splits[d] * repeats
-        output_lengths = _repeat_ranges(output_lengths, splits,
-                                        multiples[d + 1])
+        splits = projected_splits[d][axis - 1] * repeats
+        output_lengths = ragged_util.repeat_ranges(output_lengths, splits,
+                                                   multiples[d + 1])
       repeats *= multiples[d + 1]
 
     # Tile splits for the outermost (uniform) dimension.
     output_lengths = array_ops.tile(output_lengths, multiples[:1])
 
     # Convert to splits.
-    result_splits.append(_lengths_to_splits(output_lengths))
+    result_splits.append(ragged_util.lengths_to_splits(output_lengths))
 
   return result_splits
 
@@ -1207,26 +975,26 @@ def _tile_ragged_splits(rt_input, multiples, const_multiples=None):
 #===============================================================================
 
 
-def expand_dims(rt_input, axis, name=None):
+def expand_dims(input, axis, name=None):  # pylint: disable=redefined-builtin
   """Inserts a dimension with shape 1 into a potentially ragged tensor's shape.
 
-  Given a potentially ragged tenor `rt_input`, this operation inserts a
-  dimension with size 1 at the dimension `axis` of `rt_input`'s shape.
+  Given a potentially ragged tenor `input`, this operation inserts a
+  dimension with size 1 at the dimension `axis` of `input`'s shape.
 
-  * If `rt_input` is a `Tensor`, then this is equivalent to
+  * If `input` is a `Tensor`, then this is equivalent to
     `tf.expand_dims`.
-  * If `rt_input` is ragged, and `axis=0`, then the new dimension will be
+  * If `input` is ragged, and `axis=0`, then the new dimension will be
     uniform; but the previously outermost dimension will become ragged.
-  * If `rt_input` is ragged, and `0 < axis < rt_input.ragged_rank`, then the
+  * If `input` is ragged, and `0 < axis < input.ragged_rank`, then the
     new dimension will be ragged.
-  * If `rt_input` is ragged, and axis >= rt_input.ragged_rank`, then the new
+  * If `input` is ragged, and axis >= input.ragged_rank`, then the new
     dimension will be uniform.
 
   The following table gives some examples showing how `ragged.expand_dims`
   impacts the shapes of different input tensors.  Ragged dimensions are
   indicated by enclosing them in parentheses.
 
-  rt_input.shape          | axis | result.shape
+  input.shape             | axis | result.shape
   ----------------------- | ---- | -----------------------------
   `[D1, D2]`              |  `0` | `[1, D1, D2]`
   `[D1, D2]`              |  `1` | `[D1, 1, D2]`
@@ -1238,14 +1006,14 @@ def expand_dims(rt_input, axis, name=None):
   `[D1, (D2), (D3), D4]`  |  `4` | `[D1, (D2), (D3), D4, 1]`
 
   Args:
-    rt_input: The potentially tensor that should be expanded with a new
+    input: The potentially tensor that should be expanded with a new
       dimension.
     axis: An integer constant indicating where the new dimension should be
       inserted.
     name: A name for the operation (optional).
 
   Returns:
-    A tensor with the same values as `rt_input`, with an added dimension of
+    A tensor with the same values as `input`, with an added dimension of
     size 1 at `axis`.
 
   #### Examples:
@@ -1255,38 +1023,38 @@ def expand_dims(rt_input, axis, name=None):
     TensorShape([2, None])
 
     >>> expanded = ragged.expand_dims(rt, axis=0)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([1, None, None]) [[[1, 2], [3]]]
 
     >>> expanded = ragged.expand_dims(rt, axis=1)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([2, None, None]) [[[1, 2]], [[3]]]
 
     >>> expanded = ragged.expand_dims(rt, axis=2)
-    >>> print(expanded.shape, expanded.eval().tolist())
+    >>> print(expanded.shape, expanded)
     TensorShape([2, None, 1]) [[[1], [2]], [[3]]]
     ```
   """
-  with ops.name_scope(name, 'RaggedExpandDims', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
+  with ops.name_scope(name, 'RaggedExpandDims', [input]):
+    input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
+        input, name='input')
 
-    if not ragged_tensor.is_ragged(rt_input):
-      return array_ops.expand_dims(rt_input, axis)
+    if not ragged_tensor.is_ragged(input):
+      return array_ops.expand_dims(input, axis)
 
-    ndims = None if rt_input.shape.ndims is None else rt_input.shape.ndims + 1
+    ndims = None if input.shape.ndims is None else input.shape.ndims + 1
     axis = ragged_util.get_positive_axis(axis, ndims)
     if axis == 0:
-      values = rt_input
-      splits = array_ops.stack([0, nrows(rt_input)])
+      values = input
+      splits = array_ops.stack([0, input.nrows()])
     elif axis == 1:
-      values = rt_input
-      splits = math_ops.range(nrows(rt_input) + 1)
+      values = input
+      splits = math_ops.range(input.nrows() + 1)
     else:
-      values = expand_dims(rt_input.values, axis - 1)
-      splits = rt_input.row_splits
+      values = expand_dims(input.values, axis - 1)
+      splits = input.row_splits
 
-    return ragged_factory_ops.from_row_splits(values, splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(values, splits)
 
 
 #===============================================================================
@@ -1363,13 +1131,13 @@ def where(condition, x=None, y=None, name=None):
   if (x is None) != (y is None):
     raise ValueError('x and y must be either both None or both non-None')
   with ops.name_scope('RaggedWhere', name, [condition, x, y]):
-    condition = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    condition = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         condition, name='condition')
     if x is None:
       return _coordinate_where(condition)
     else:
-      x = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(x, name='x')
-      y = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(y, name='y')
+      x = ragged_tensor.convert_to_tensor_or_ragged_tensor(x, name='x')
+      y = ragged_tensor.convert_to_tensor_or_ragged_tensor(y, name='y')
       return _elementwise_where(condition, x, y)
 
 
@@ -1383,15 +1151,15 @@ def _elementwise_where(condition, x, y):
     return array_ops.where(condition, x, y)
 
   elif condition_is_ragged and x_is_ragged and y_is_ragged:
-    return ragged_functional_ops.map_inner_values(array_ops.where, condition, x,
-                                                  y)
+    return ragged_functional_ops.map_flat_values(array_ops.where, condition, x,
+                                                 y)
   elif not condition_is_ragged:
     # Concatenate x and y, and then use `gather` to assemble the selected rows.
     condition.shape.assert_has_rank(1)
-    x_nrows = nrows(x)
+    x_nrows = _nrows(x)
     x_and_y = concat([x, y], axis=0)
     indices = array_ops.where(condition, math_ops.range(x_nrows),
-                              x_nrows + math_ops.range(nrows(y)))
+                              x_nrows + math_ops.range(_nrows(y)))
     return gather(x_and_y, indices)
 
   else:
@@ -1408,7 +1176,7 @@ def _coordinate_where(condition):
 
   # Convert the first index in each coordinate to a row index and column index.
   first_index = selected_coords[:, 0]
-  selected_rows = array_ops.gather(value_rowids(condition), first_index)
+  selected_rows = array_ops.gather(condition.value_rowids(), first_index)
   selected_row_starts = array_ops.gather(condition.row_splits, selected_rows)
   selected_cols = first_index - selected_row_starts
 
@@ -1425,11 +1193,6 @@ def _coordinate_where(condition):
 #===============================================================================
 
 
-def _lengths_to_splits(lengths):
-  """Returns splits corresponding to the given lengths."""
-  return array_ops.concat([[0], math_ops.cumsum(lengths)], axis=0)
-
-
 def _increase_ragged_rank_to(rt_input, ragged_rank):
   """Adds ragged dimensions to `rt_input` so it has the desired ragged rank."""
   if ragged_rank > 0:
@@ -1451,43 +1214,10 @@ def _concat_ragged_splits(splits_list):
   return array_ops.concat(pieces, axis=0)
 
 
-def _repeat_ranges(params, splits, multiple):
-  """Repeats each range of `params` (as specified by `splits`) `multiple` times.
-
-  Let the `i`th range of `params` be defined as
-  `params[splits[i]:splits[i + 1]]`.  Then this function returns a tensor
-  containing range 0 repeated `multiple` times, followed by range 1 repeated
-  `multiple`, ..., followed by the last range repeated `multiple` times.
-
-  Args:
-    params: The `Tensor` whose values should be repeated.
-    splits: A splits tensor indicating the ranges of `params` that should be
-      repeated.
-    multiple: The number of times each range should be repeated.
-
-  Returns:
-    A `Tensor` with the same rank and type as `params`.
+def _nrows(rt_input, out_type=dtypes.int64, name=None):
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    return rt_input.nrows(out_type=out_type, name=name)
+  else:
+    with ops.name_scope(name, 'RaggedNRows', [rt_input]):
+      return array_ops.shape(rt_input, out_type=out_type)[0]
 
-  #### Example:
-    ```python
-    >>> _repeat_ranges(['a', 'b', 'c'], [0, 2, 3], 3)
-    ['a', 'b', 'a', 'b', 'a', 'b', 'c', 'c', 'c']
-    ```
-  """
-  # Repeat each split value `multiple` times.  E.g., if `splits=[0 3 4]` and
-  # `multiples=3`, then `repeated_splits=[0 0 0 3 3 3 4 4 4]`.
-  repeated_splits = array_ops.tile(
-      array_ops.expand_dims(splits, axis=1), array_ops.stack([1, multiple]))
-  repeated_splits = array_ops.reshape(repeated_splits, [-1])
-
-  # Divide the splits into repeated starts & repeated limits.  E.g., if
-  # `repeated_splits=[0 0 0 3 3 3 4 4 4]` then `repeated_starts=[0 0 0 3 3 3]`
-  # and `repeated_limits=[3 3 3 4 4 4]`.
-  n_splits = array_ops.shape(repeated_splits, out_type=dtypes.int64)[0]
-  repeated_starts = repeated_splits[:n_splits - multiple]
-  repeated_limits = repeated_splits[multiple:]
-
-  # Get indices for each range from starts to limits, and use those to gather
-  # the values in the desired repetition pattern.
-  offsets = ragged_math_ops.range(repeated_starts, repeated_limits).values
-  return array_ops.gather(params, offsets)
diff --git a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
index 79a2ecd87ae11b2c6aadb888074bc8721123cba3..79f1ae591f9f2c9dfcf5b405b1c4d7370ab853a6 100644
--- a/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_batch_gather_op_test.py
@@ -20,15 +20,18 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBatchGatherOpTest(ragged_test_util.RaggedTensorTestCase,
                               parameterized.TestCase):
 
   @parameterized.parameters([
@@ -137,17 +140,14 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
   ])
   def testRaggedBatchGather(self, descr, params, indices, expected):
     result = ragged.batch_gather(params, indices)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(result.eval().tolist(), expected)
+    self.assertRaggedEqual(result, expected)
 
   def testRaggedBatchGatherUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     params = [['a', 'b'], ['c', 'd']]
     indices = array_ops.placeholder(dtypes.int32, shape=None)
-    ragged_indices = ragged.from_row_splits(indices, [0, 2, 4])
+    ragged_indices = ragged.RaggedTensor.from_row_splits(indices, [0, 2, 4])
 
     with self.assertRaisesRegexp(
         ValueError, 'batch_gather does not allow indices with unknown shape.'):
@@ -159,37 +159,39 @@ class RaggedBatchGatherOpTest(test_util.TensorFlowTestCase,
 
   @parameterized.parameters([
       dict(
-          params=ragged.constant([['a'], ['b'], ['c']]),
-          indices=ragged.constant([[0], [0]]),
+          params=ragged.constant_value([['a'], ['b'], ['c']]),
+          indices=ragged.constant_value([[0], [0]]),
           message='Dimensions 3 and 2 are not compatible'),
       dict(
           params=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
-          indices=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
+          indices=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
           message='batch shape from indices does not match params shape'),
+      dict(  # rank mismatch
+          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]]]),
+          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          error=(ValueError, errors.InvalidArgumentError)),
       dict(
-          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]]]),
-          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
-          message='Dimensions must be equal, but are 3 and 4'),
-      dict(
-          params=ragged.constant([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
-          indices=ragged.constant([[[0, 0]], [[0, 0, 0]], [[0]]]),
+          params=ragged.constant_value([[[0, 0], [0, 0, 0]], [[0]], [[0]]]),
+          indices=ragged.constant_value([[[0, 0]], [[0, 0, 0]], [[0]]]),
           error=errors.InvalidArgumentError,
-          message='Condition x == y did not hold element-wise'),
+          message='.*Condition x == y did not hold.*'),
+      dict(
+          params=ragged.constant_value(['a', 'b', 'c']),
+          indices=ragged.constant_value([[0], [0]]),
+          message='batch shape from indices does not match params shape'),
+      dict(
+          params=ragged.constant_value([['a']]),
+          indices=0,
+          message='indices.rank must be at least 1.'),
       dict(
-          params=ragged.constant(['a', 'b', 'c']),
-          indices=ragged.constant([[0], [0]]),
+          params=ragged.constant_value([['a']]),
+          indices=[[[0]]],
           message='batch shape from indices does not match params shape'),
-      dict(params=ragged.constant_value([['a']]),
-           indices=0,
-           message='indices.rank must be at least 1.'),
-      dict(params=ragged.constant_value([['a']]),
-           indices=[[[0]]],
-           message='batch shape from indices does not match params shape'),
   ])
   def testRaggedBatchGatherStaticError(self,
                                        params,
                                        indices,
-                                       message,
+                                       message=None,
                                        error=ValueError):
     with self.assertRaisesRegexp(error, message):
       ragged.batch_gather(params, indices)
diff --git a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
index b3279c1e84036d605443fee1f82a426ec2b5340b..b0f7459322792aeafaadd4db18ecd30105e8e74c 100644
--- a/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_boolean_mask_op_test.py
@@ -20,15 +20,18 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedBooleanMaskOpTest(ragged_test_util.RaggedTensorTestCase,
                               parameterized.TestCase):
   # Define short constants for true & false, so the data & mask can be lined
   # up in the examples below.  This makes it easier to read the examples, to
@@ -300,22 +303,16 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
   ])  # pyformat: disable
   def testBooleanMask(self, descr, data, mask, keepdims, expected):
     actual = ragged.boolean_mask(data, mask, keepdims=keepdims)
-    self.assertEqual(
-        getattr(actual, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      if isinstance(expected, ragged.RaggedTensorValue):
-        expected = expected.tolist()
-      self.assertEqual(actual.eval().tolist(), expected)
+    self.assertRaggedEqual(actual, expected)
 
   def testErrors(self):
-    self.assertRaisesRegexp(ValueError,
-                            r'mask\.shape\.ndims must be kown statically',
-                            ragged.boolean_mask, [[1, 2]],
-                            array_ops.placeholder(dtypes.bool))
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(ValueError,
+                              r'mask\.shape\.ndims must be kown statically',
+                              ragged.boolean_mask, [[1, 2]],
+                              array_ops.placeholder(dtypes.bool))
 
-    self.assertRaisesRegexp(TypeError,
-                            "Expected bool, got 0 of type 'int' instead.",
-                            ragged.boolean_mask, [[1, 2]], [[0, 1]])
+    self.assertRaises(TypeError, ragged.boolean_mask, [[1, 2]], [[0, 1]])
     self.assertRaisesRegexp(
         ValueError, 'Tensor conversion requested dtype bool for '
         'RaggedTensor with dtype int32', ragged.boolean_mask,
@@ -325,15 +322,6 @@ class RaggedBooleanMaskOpTest(test_util.TensorFlowTestCase,
         ValueError, r'Shapes \(1, 2\) and \(1, 3\) are incompatible',
         ragged.boolean_mask, [[1, 2]], [[True, False, True]])
 
-    # self.assertRaisesRegexp(ValueError,
-    #                         r'data=.* is non-ragged but mask=.* is ragged',
-    #                         ragged.boolean_mask, [[1, 2]],
-    #                         ragged.constant([[True, False]]))
-
-    # self.assertRaisesRegexp(
-    #     ValueError, r'data=.* is ragged but mask=.* is non-ragged',
-    #     ragged.boolean_mask, ragged.constant([[1, 2]]), [[True, False]])
-
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits',
                             ragged.boolean_mask, ragged.constant([[1, 2]]),
diff --git a/tensorflow/python/ops/ragged/ragged_concat_op_test.py b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
index 6b1a602d049fbc3509951c51e27d783e084ed523..e72afb0448f5e7f7f4ab9aebefb712bfd7816133 100644
--- a/tensorflow/python/ops/ragged/ragged_concat_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_concat_op_test.py
@@ -20,16 +20,20 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConcatOpTest(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
 
   def _rt_inputs_to_tensors(self, rt_inputs, ragged_ranks=None):
     if ragged_ranks is None:
@@ -41,6 +45,11 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     ]
 
   @parameterized.parameters(
+      dict(
+          descr='Two rank-2 inputs with empty value axis=1',
+          rt_inputs=([[]], [[]]),
+          axis=1,
+          expected=[[]]),
       dict(
           descr='Two rank-2 inputs (ragged_rank=1), axis=0',
           rt_inputs=(
@@ -230,8 +239,7 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertEqual(concatenated.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
       self.assertEqual(concatenated.shape.as_list(), expected_shape)
-    with self.test_session():
-      self.assertEqual(concatenated.eval().tolist(), expected)
+    self.assertRaggedEqual(concatenated, expected)
 
   @parameterized.parameters(
       dict(
@@ -258,10 +266,14 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           ragged_ranks=(0, 0),
           rt_inputs=([[1, 2]], [[3, 4], [5, 6]]),
           axis=1,
-          error=ValueError,
-          message='Dimension 0 in both shapes must be equal'),
+          error=(ValueError, errors.InvalidArgumentError)),
   )
-  def testStaticError(self, rt_inputs, axis, error, message, ragged_ranks=None):
+  def testStaticError(self,
+                      rt_inputs,
+                      axis,
+                      error,
+                      message=None,
+                      ragged_ranks=None):
     rt_inputs = self._rt_inputs_to_tensors(rt_inputs, ragged_ranks)
     self.assertRaisesRegexp(error, message, ragged.concat, rt_inputs, axis)
 
@@ -275,14 +287,18 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   ])
   def testRuntimeError(self, rt_inputs, axis, error, message,
                        ragged_ranks=None):
+    if context.executing_eagerly():
+      return
     rt_inputs = [
         array_ops.placeholder_with_default(rt, shape=None) for rt in rt_inputs
     ]
     concatenated = ragged.concat(rt_inputs, axis)
-    with self.test_session():
-      self.assertRaisesRegexp(error, message, concatenated.eval)
+    with self.assertRaisesRegexp(error, message):
+      self.evaluate(concatenated)
 
   def testNegativeAxisWithUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     rt_inputs = [
         array_ops.placeholder(dtypes.int64),
         array_ops.placeholder(dtypes.int64)
@@ -300,8 +316,7 @@ class RaggedConcatOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     """
     rt_inputs = ragged.constant([[1, 2], [3, 4]])
     concatenated = ragged.concat(rt_inputs, 0)
-    with self.test_session():
-      self.assertEqual(concatenated.eval().tolist(), [[1, 2], [3, 4]])
+    self.assertRaggedEqual(concatenated, [[1, 2], [3, 4]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_const_op_test.py b/tensorflow/python/ops/ragged/ragged_const_op_test.py
index 13f79c57292cef91d704e25c20237082b15bce7d..c014f7103016104d3cc2e3ecbd18bbf3337a0153 100644
--- a/tensorflow/python/ops/ragged/ragged_const_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_const_op_test.py
@@ -20,15 +20,16 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
-
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters(
       #=========================================================================
@@ -156,7 +157,7 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       expected_dtype: The expected dtype for the resulting ragged tensor (used
         to test default/inferred types when dtype=None).
     """
-    rt = ragged_factory_ops.constant(
+    rt = ragged.constant(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
 
     # If dtype was explicitly specified, check it.
@@ -167,31 +168,22 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     # If ragged_rank was explicitly specified, check it.
     if ragged_rank is not None:
-      if isinstance(rt, ragged_tensor.RaggedTensor):
+      if isinstance(rt, ragged.RaggedTensor):
         self.assertEqual(rt.ragged_rank, ragged_rank)
       else:
         self.assertEqual(0, ragged_rank)
 
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
-      if isinstance(rt, ragged_tensor.RaggedTensor):
-        self.assertEqual(rt.inner_values.shape.as_list()[1:], list(inner_shape))
+      if isinstance(rt, ragged.RaggedTensor):
+        self.assertEqual(rt.flat_values.shape.as_list()[1:], list(inner_shape))
       else:
         self.assertEqual(rt.shape.as_list(), list(inner_shape))
 
     if expected_shape is not None:
       self.assertEqual(tuple(rt.shape.as_list()), expected_shape)
 
-    with self.test_session():
-      result = rt.eval()
-      if rt.shape.ndims > 0:
-        self.assertEqual(result.tolist(), pylist)
-        if expected_shape is not None:
-          self.assertEqual(result.shape, expected_shape)
-      else:
-        self.assertEqual(result, pylist)
-        if expected_shape is not None:
-          self.assertEqual((), expected_shape)
+    self.assertRaggedEqual(rt, pylist)
 
   @parameterized.parameters(
       dict(
@@ -235,11 +227,7 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           exception=ValueError,
           message='pylist has scalar values depth 2, but ragged_rank=2 '
           'requires scalar value depth greater than 2'),
-      dict(
-          pylist=[1, 2, 3],
-          inner_shape=(1, 1),
-          exception=ValueError,
-          message='Too many elements provided.'),
+      dict(pylist=[1, 2, 3], inner_shape=(1, 1), exception=TypeError),
       dict(
           pylist=[[[1, 2], [3, 4]], [[5, 6], [7, 8]]],
           inner_shape=(2, 2),
@@ -269,7 +257,7 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertRaisesRegexp(
         exception,
         message,
-        ragged_factory_ops.constant,
+        ragged.constant,
         pylist,
         dtype=dtype,
         ragged_rank=ragged_rank,
@@ -308,10 +296,10 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if exception is not None:
       self.assertRaisesRegexp(
           exception, message,
-          ragged_factory_ops._find_scalar_and_max_depth, pylist)
+          ragged.ragged_factory_ops._find_scalar_and_max_depth, pylist)
     else:
       self.assertEqual(
-          ragged_factory_ops._find_scalar_and_max_depth(pylist),
+          ragged.ragged_factory_ops._find_scalar_and_max_depth(pylist),
           (scalar_depth, max_depth))
 
   @parameterized.parameters([
@@ -358,11 +346,11 @@ class RaggedConstOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     if exception is not None:
       self.assertRaisesRegexp(
           exception, message,
-          ragged_factory_ops._default_inner_shape_for_pylist, pylist,
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist, pylist,
           ragged_rank)
     else:
       self.assertEqual(
-          ragged_factory_ops._default_inner_shape_for_pylist(
+          ragged.ragged_factory_ops._default_inner_shape_for_pylist(
               pylist, ragged_rank), inner_shape)
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
index d80518930dbb74b5e044269df73002e68c0df2d2..56768a9a479d0d3b568f4ff4b7f102837e26171d 100644
--- a/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_constant_value_op_test.py
@@ -23,10 +23,12 @@ import numpy as np
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedConstantValueOpTest(ragged_test_util.RaggedTensorTestCase,
                                 parameterized.TestCase):
 
   @parameterized.parameters(
@@ -144,7 +146,7 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
                        inner_shape=None,
                        expected_shape=None,
                        expected_dtype=None):
-    """Tests that `ragged_value(pylist).tolist() == pylist`."""
+    """Tests that `ragged_value(pylist).to_list() == pylist`."""
     rt = ragged.constant_value(
         pylist, dtype=dtype, ragged_rank=ragged_rank, inner_shape=inner_shape)
 
@@ -164,7 +166,7 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
     # If inner_shape was explicitly specified, check it.
     if inner_shape is not None:
       if isinstance(rt, ragged.RaggedTensorValue):
-        self.assertEqual(rt.inner_values.shape[1:], inner_shape)
+        self.assertEqual(rt.flat_values.shape[1:], inner_shape)
       else:
         self.assertEqual(rt.shape, inner_shape)
 
@@ -172,7 +174,10 @@ class RaggedConstantValueOpTest(test_util.TensorFlowTestCase,
       self.assertEqual(tuple(rt.shape), expected_shape)
 
     if rt.shape:
-      self.assertEqual(rt.tolist(), pylist)
+      if isinstance(rt, ragged.RaggedTensorValue):
+        self.assertEqual(rt.to_list(), pylist)
+      else:
+        self.assertEqual(rt.tolist(), pylist)
       if expected_shape is not None:
         self.assertEqual(rt.shape, expected_shape)
     else:
diff --git a/tensorflow/python/ops/ragged/ragged_conversion_ops.py b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
index 3ec246ccaf1a53d0651a12a3b5660a05078c9ad3..854c5b303c81d089baf78119ca8525a51e7a83c4 100644
--- a/tensorflow/python/ops/ragged/ragged_conversion_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_conversion_ops.py
@@ -18,392 +18,27 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.framework import sparse_tensor
-from tensorflow.python.framework import tensor_shape
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import gen_ragged_conversion_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.ops.ragged import ragged_util
 
 
-#===============================================================================
-# RaggedTensor <-> Tensor conversion
-#===============================================================================
 def from_tensor(tensor, lengths=None, padding=None, ragged_rank=1, name=None):
-  """Converts a `Tensor` into a `RaggedTensor`.
-
-  The set of absent/default values may be specified using a vector of lengths
-  or a padding value (but not both).  If `lengths` is specified, then the
-  output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
-  If `padding` is specified, then any row *suffix* consisting entirely of
-  `padding` will be excluded from the returned `RaggedTensor`.  If neither
-  `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
-  have no absent/default values.
-
-  Examples:
-
-  ```python
-  >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
-  >>> ragged.from_tensor(dt).eval().tolist()
-  [[5, 7, 0], [0, 3, 0], [6, 0, 0]]
-  >>> ragged.from_tensor(dt, lengths=[2, 0, 3]).eval().tolist()
-  [[5, 7], [], [6, 0, 0]]
-  >>> ragged.from_tensor(dt, padding=0).eval().tolist()
-  [[5, 7], [0, 3], [6]]
-  ```
-
-  Args:
-    tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
-      higher.
-    lengths: An optional set of row lengths, specified using a 1-D integer
-      `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows in
-      `tensor`).  If specified, then `output[row]` will contain
-      `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
-    padding: An optional padding value.  If specified, then any row suffix
-      consisting entirely of `padding` will be excluded from the returned
-      RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
-      and with `shape=tensor.shape[ragged_rank + 1:]`.
-    ragged_rank: Integer specifying the ragged rank for the returned
-      `RaggedTensor`.  Must be greater than zero.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
-    returned ragged tensor is compatible with the shape of `tensor`.
-  Raises:
-    ValueError: If both `lengths` and `padding` are specified.
-  """
-  if lengths is not None and padding is not None:
-    raise ValueError('Specify lengths or padding, but not both')
-  if not isinstance(ragged_rank, int):
-    raise TypeError('ragged_rank expected int, got %r' % ragged_rank)
-  if ragged_rank <= 0:
-    raise ValueError('ragged_rank must be greater than 0; got %s' % ragged_rank)
-
-  with ops.name_scope(name, 'RaggedFromTensor', [tensor, lengths, padding]):
-    tensor = ops.convert_to_tensor(tensor, name='tensor')
-    tensor.shape.with_rank_at_least(ragged_rank + 1)
-    input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
-    ncols = input_shape[1]
-
-    # Handle ragged_rank>1 via recursion:
-    # If the output should have multiple ragged dimensions, then first
-    # flatten the tensor to eliminate all but the last ragged dimension,
-    # and recursively convert that flattened tensor.  Then add on the splits
-    # for the dimensions that we flattened out.
-    if ragged_rank > 1:
-      # Flatten `tensor` to eliminate all but the last ragged dimension.
-      new_shape = array_ops.concat(
-          [constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]],
-          axis=0)
-      flattened = array_ops.reshape(tensor, new_shape)
-      # Recursively convert the flattened tensor.
-      values = from_tensor(flattened, lengths, padding)
-      # The total number of elements in each  dimension.  E.g., if
-      # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
-      dim_size = math_ops.cumprod(input_shape)
-      # Construct splits tensors for the dimensions that were flattened.
-      new_splits = [
-          math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
-          for dim in range(1, ragged_rank)
-      ]
-      return ragged_factory_ops.from_nested_row_splits(values, new_splits)
-
-    # If padding was specified, then use it to find row lengths.
-    if padding is not None:
-      padding = ops.convert_to_tensor(
-          padding, name='padding', dtype=tensor.dtype)
-      padding.shape.assert_is_compatible_with(tensor.shape[2:])
-
-      # Find places where the padding is equal to the tensor.  (This will
-      # broadcast `padding` across the outermost 2 dimensions of `tensor`,
-      # so `has_default_value.shape = tensor.shape`.)
-      has_default_value = math_ops.equal(padding, tensor)
-
-      # If the padding isn't a scalar, then require that all values in the
-      # padding match each item in the tensor.  After this block of code,
-      # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
-      # use reduce_all for both cases, becaue when you pass an empty `axis`
-      # list to reduce_all, it reduces all axes; but we want it to reduce no
-      # axes -- i.e., to be a no-op.)
-      tensor_rank = array_ops.rank(tensor)
-      reduce_axis = math_ops.range(2, tensor_rank)
-      has_default = control_flow_ops.cond(
-          tensor_rank > 2,
-          lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
-          lambda: has_default_value)
-      has_default.set_shape(tensor_shape.TensorShape([None, None]))
-      has_default.set_shape(tensor.shape[:2])
-
-      # Use has_default it to find the length of each row: for each non-default
-      # item in a row, calculate the length that the row needs to have to
-      # include that item; and then take the max of those values (across each
-      # row).
-      has_nondefault = math_ops.logical_not(has_default)
-      has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
-      length_for_nondefault_value = (
-          has_nondefault * array_ops.expand_dims(
-              math_ops.range(1, ncols + 1), 0))
-      lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
-
-    # If we have lengths (either directly supplied, or computed from paddings),
-    # then use those to construct splits; and then use masking to get the
-    # corresponding values.
-    if lengths is not None:
-      lengths = ragged_util.convert_to_int_tensor(lengths, 'lengths',
-                                                  dtypes.int64)
-      lengths.shape.assert_has_rank(1)
-      lengths = math_ops.minimum(lengths, ncols)
-      lengths = math_ops.maximum(lengths, 0)
-      limits = math_ops.cumsum(lengths)
-      splits = array_ops.concat(
-          [array_ops.zeros([1], dtypes.int64), limits], axis=0)
-      mask = array_ops.sequence_mask(lengths, maxlen=ncols)
-      values = array_ops.boolean_mask(tensor, mask)
-      return ragged_factory_ops.from_row_splits(values, splits)
-
-    # If neither padding nor lengths were specified, then create a splits
-    # vector that contains no default values, and reshape the input tensor
-    # to form the values for the RaggedTensor.
-    nrows = input_shape[0]
-    nvals = nrows * ncols
-    splits = math_ops.range(nrows + 1) * ncols
-    values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
-    values = array_ops.reshape(tensor, values_shape)
-    return ragged_factory_ops.from_row_splits(values, splits)
+  if ragged_tensor.is_ragged(tensor):
+    return tensor
+  else:
+    return ragged_tensor.RaggedTensor.from_tensor(tensor, lengths, padding,
+                                                  ragged_rank, name)
 
 
 def to_tensor(rt_input, default_value=None, name=None):
-  """Converts a `RaggedTensor` into a `Tensor`.
-
-  Example:
-
-  ```python
-  >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
-  >>> print ragged.to_tensor(rt).eval()
-  [[9 8 7]
-   [0 0 0]
-   [6 5 0]
-   [4 0 0]]
-  ```
-
-  Args:
-    rt_input: The input `RaggedTensor`.
-    default_value: Value to set for indices not specified in `rt_input`.
-      Defaults to zero.  `default_value.shape` must be equal to
-      `rt_input.shape[rt_input.ragged_rank + 1:]`.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `Tensor` with shape `ragged.bounding_shape(rt_input)` and the
-    values specified by the non-empty values in `rt_input`.  Empty values are
-    assigned `default_value`.
-  """
-  with ops.name_scope(name, 'RaggedToTensor', [rt_input, default_value]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    if not ragged_tensor.is_ragged(rt_input):
-      return rt_input  # already dense
-
-    # If ragged_rank > 1, then recursively convert the ragged values into a
-    # `Tensor` before we proceed.
-    values = rt_input.values
-    if ragged_tensor.is_ragged(values):
-      values = to_tensor(values, default_value)
-
-    # Get the expected dense shape ([nrows, ncols] + value_shape).
-    rt_row_lengths = [rt_input.row_splits[1:] - rt_input.row_splits[:-1]]
-    nrows = array_ops.shape(rt_input.row_splits, out_type=dtypes.int64)[0] - 1
-    ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
-    values_shape = array_ops.shape(values, out_type=dtypes.int64)
-    value_shape = values_shape[1:]
-    nvals = values_shape[0]
-
-    # Build a default value if none was supplied.
-    if default_value is None:
-      default_value = array_ops.zeros(value_shape, dtype=values.dtype)
-    else:
-      default_value = ops.convert_to_tensor(
-          default_value, name='default_value', dtype=values.dtype)
-    default_value.shape.assert_is_compatible_with(values.shape[1:])
-    default_value.set_shape(values.shape[1:])
-
-    # Get the row start indices, and expand to shape=[nrows, 1].
-    starts = array_ops.expand_dims(rt_input.row_splits[:-1], 1)
-
-    # Get the row limit indices, and expand to shape=[nrows, 1].
-    limits = array_ops.expand_dims(rt_input.row_splits[1:], 1)
-
-    # Get the column indices, and expand to shape=[1, ncols].
-    columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
-
-    # Build a list containing the values plus the default value.  We will use
-    # tf.gather to collect values from this list for the `Tensor` (using
-    # nvals as the index for the default value).
-    values_and_default = array_ops.concat(
-        [values, array_ops.stack([default_value])], axis=0)
-
-    # Construct a matrix "indices" pointing into values_and_default.  I.e.,
-    # output[r, c] = values_and_default[indices[r, c].
-    nondefault_index = starts + columns
-    has_value = nondefault_index < limits
-    default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
-    indices = array_ops.where(has_value, nondefault_index, default_index)
-
-    # Gather the results into a `Tensor`.
-    return array_ops.gather(values_and_default, indices)
+  if ragged_tensor.is_ragged(rt_input):
+    return rt_input.to_tensor(default_value, name)
+  else:
+    return rt_input
 
 
-#===============================================================================
-# RaggedTensor <-> SparseTensor conversion
-#===============================================================================
 def to_sparse(rt_input, name=None):
-  """Converts a `RaggedTensor` into a sparse tensor.
-
-  Example:
-
-  ```python
-  >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
-  >>> ragged.to_sparse(rt).eval()
-  SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
-                    values=[1, 2, 3, 4, 5, 6],
-                    dense_shape=[4, 3])
-  ```
-
-  Args:
-    rt_input: The input `RaggedTensor`.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A SparseTensor with the same values as `rt_input`.
-  """
-  if not ragged_tensor.is_ragged(rt_input):
-    raise TypeError('Expected RaggedTensor, got %s' % type(rt_input).__name__)
-  with ops.name_scope(name, 'RaggedToSparse', [rt_input]):
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        rt_input, name='rt_input')
-    result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
-        rt_input.nested_row_splits, rt_input.inner_values, name=name)
-    return sparse_tensor.SparseTensor(
-        result.sparse_indices, result.sparse_values, result.sparse_dense_shape)
-
-
-@ops.RegisterGradient('RaggedTensorToSparse')
-def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
-                                      sparse_values_grad,
-                                      unused_sparse_shape_grad):
-  """Gradient for ragged.to_sparse."""
-  op_inputs_nested_row_splits = op.inputs[:-1]
-  op_inputs_inner_values = op.inputs[-1]
-
-  # No gradient for the RaggedTensor's nested_row_splits.
-  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
-
-  # Gradient for the RaggedTensor's inner_values is formed by reshaping
-  # the gradient for the SparseTensor's values.
-  inner_values_shape = array_ops.shape(op_inputs_inner_values)
-  inner_values_gradient = array_ops.reshape(sparse_values_grad,
-                                            inner_values_shape)
-
-  return nested_row_splits_gradient + [inner_values_gradient]
+  return rt_input.to_sparse(name)
 
 
 def from_sparse(st_input, name=None):
-  """Converts a 2D `SparseTensor` to a `RaggedTensor`.
-
-  Each row of the `output` `RaggedTensor` will contain the explicit values from
-  the same row in `st_input`.  `st_input` must be ragged-right.  If not it is
-  not ragged-right, then an error will be generated.
-
-  Example:
-
-  ```python
-  >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
-  ...                   values=[1, 2, 3, 4, 5],
-  ...                   dense_shape=[4, 3])
-  >>> ragged.from_sparse(st).eval().tolist()
-  [[1, 2, 3], [4], [], [5]]
-  ```
-
-  Currently, only two-dimensional `SparseTensors` are supported.
-
-  Args:
-    st_input: The sparse tensor to convert.  Must have rank 2.
-    name: A name prefix for the returned tensors (optional).
-
-  Returns:
-    A `RaggedTensor` with the same values as `st_input`.
-    `output.ragged_rank = rank(st_input) - 1`.
-    `output.shape = [st_input.dense_shape[0], None]`.
-  Raises:
-    ValueError: If the number of dimensions in `st_input` is not known
-      statically, or is not two.
-  """
-  if not sparse_tensor.is_sparse(st_input):
-    raise TypeError('Expected SparseTensor, got %s' % type(st_input).__name__)
-  with ops.name_scope(name, 'RaggedFromSparse', [st_input]):
-    st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
-        st_input, name='rt_input')
-
-    if (st_input.dense_shape.shape.ndims != 2 and
-        st_input.indices.shape.ndims is None or
-        st_input.indices.shape.dims[1].value != 2):
-      raise ValueError('rank(st_input) must be 2')
-
-    with ops.control_dependencies(
-        _assert_sparse_indices_are_ragged_right(st_input.indices)):
-      # Treat sparse row indices as segment ids to generate a splits tensor that
-      # we can pair with the sparse tensor values.  (Ignore sparse column
-      # indices.)
-      segment_ids = st_input.indices[:, 0]
-      num_segments = st_input.dense_shape[0]
-      return ragged_factory_ops.from_value_rowids(st_input.values, segment_ids,
-                                                  num_segments)
-
-
-def _assert_sparse_indices_are_ragged_right(indices):
-  """Checks that the given SparseTensor.indices tensor is ragged-right.
-
-  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
-  because the entry `[3, 1]` skips a cell.
-
-  Args:
-    indices: The SparseTensor indices to check.
-
-  Returns:
-    A list of control dependency op tensors.
-  """
-  index_prefix = indices[:, :-1]
-  index_suffix = indices[:, -1]
-
-  # Check whether each index is starting a new row in the innermost dimension
-  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
-  # (Note: this skips the first index; we will check that separately below.)
-  index_prefix_changed = math_ops.reduce_any(
-      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
-
-  # Check two cases:
-  #   * For indices that start a new row: index_suffix[i] must be zero.
-  #   * For indices that continue a row: index_suffix[i] must be equal to
-  #     index_suffix[i-1]+1.
-  index_ok = array_ops.where(
-      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
-      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
-
-  # Also check that the very first index didn't skip any cells.  The first
-  # index starts a new row (by definition), so its suffix should be zero.
-  sparse_indices_are_ragged_right = math_ops.logical_and(
-      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
-      math_ops.reduce_all(index_ok))
-
-  message = [
-      'SparseTensor is not right-ragged',
-      'SparseTensor.indices =', indices
-  ]
-  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
+  return ragged_tensor.RaggedTensor.from_sparse(st_input, name)
diff --git a/tensorflow/python/ops/ragged/ragged_dispatch.py b/tensorflow/python/ops/ragged/ragged_dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c74f7be62de0746418f57b2b2c06c31f2a5a4f5
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_dispatch.py
@@ -0,0 +1,440 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Operator dispatch for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import collections
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_math_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_tensor_shape
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+
+# @TODO(edloper): Set this to True in the CL that exports RaggedTensors.
+_UPDATE_DOCSTRINGS = False
+
+# Information about an argument to an operation: The name of the argument, its
+# position in the argument list, and a boolean flag indicating whether it
+# expects a list of tensors.
+_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
+
+
+def _get_arg_infos(func, arg_names):
+  """Returns an `_ArgInfo` for each argument of `func` specified by `arg_names`.
+
+  Args:
+    func: The function whose arguments should be described.
+    arg_names: The names of the arguments to get info for.
+
+  Returns:
+    A tuple of `_ArgInfo`s.
+  """
+  arg_infos = []
+
+  # Inspect the func's argspec to find the position of each arg.
+  arg_spec = tf_inspect.getargspec(func)
+  for argname in arg_names:
+    assert isinstance(argname, str)
+    is_list = argname.startswith('[') and argname.endswith(']')
+    if is_list:
+      argname = argname[1:-1]
+    if argname not in arg_spec.args:
+      raise ValueError('Argument %r not found function in %s.  Args=%s' %
+                       (argname, func, arg_spec.args))
+    arg_infos.append(_ArgInfo(argname, arg_spec.args.index(argname), is_list))
+  return arg_infos
+
+
+def _is_convertible_to_tensor(value):
+  """Returns true if `value` is convertible to a `Tensor`."""
+  if isinstance(value,
+                (ops.Tensor, variables.Variable, np.ndarray, int, float, str)):
+    return True
+  elif isinstance(value, (sparse_tensor.SparseTensor,)):
+    return False
+  else:
+    try:
+      ops.convert_to_tensor(value)
+      return True
+    except (TypeError, ValueError):
+      return False
+
+
+class UnaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that map a base op across ragged values."""
+
+  def __init__(self, original_op, arg_is_list=False):
+    self._original_op = original_op
+    self._arg_is_list = arg_is_list
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` may be a `tf.RaggedTensor`.\n'.format(x=self._x))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if x is None:
+      return self.NOT_SUPPORTED
+    if self._arg_is_list:
+      found_ragged = False
+      for elt in x:
+        if ragged_tensor.is_ragged(elt):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(elt):
+          return self.NOT_SUPPORTED
+      if found_ragged:
+        nested_splits_lists = [
+            elt.nested_row_splits for elt in x if ragged_tensor.is_ragged(elt)
+        ]
+        flat_values = [
+            elt.flat_values if ragged_tensor.is_ragged(elt) else elt
+            for elt in x
+        ]
+        with ops.control_dependencies(
+            ragged_util.assert_splits_match(nested_splits_lists)):
+          return ragged_tensor.RaggedTensor.from_nested_row_splits(
+              self._original_op(flat_values, *args, **kwargs),
+              nested_splits_lists[0])
+      else:
+        return self.NOT_SUPPORTED
+    else:
+      found_ragged = ragged_tensor.is_ragged(x)
+      if found_ragged:
+        mapped_values = self._original_op(x.flat_values, *args, **kwargs)
+        return x.with_flat_values(mapped_values)
+      else:
+        return self.NOT_SUPPORTED
+
+
+class BinaryRaggedElementwiseDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for binary ops that map a base op across ragged values.
+
+  Supports broadcasting.
+  """
+
+  def __init__(self, original_op):
+    self._original_op = original_op
+    arg_names = tf_inspect.getfullargspec(original_op)[0]
+    self._x = arg_names[0]
+    self._y = arg_names[1]
+    if _UPDATE_DOCSTRINGS:
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    `{x}` and `{y}` may be a `tf.RaggedTensor`.\n'.format(
+              x=self._x, y=self._y))
+
+  def handle(self, args, kwargs):
+    # Extract the binary args.
+    if len(args) > 1:
+      x = args[0]
+      y = args[1]
+      args = args[2:]
+    elif args:
+      kwargs = kwargs.copy()
+      x = args[0]
+      y = kwargs.pop(self._y, None)
+      args = args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+      y = kwargs.pop(self._y, None)
+
+    # Bail if we don't have at least one ragged argument.
+    x_is_ragged = ragged_tensor.is_ragged(x)
+    y_is_ragged = ragged_tensor.is_ragged(y)
+    if not (x_is_ragged or y_is_ragged):
+      return self.NOT_SUPPORTED
+
+    # Convert args to tensors.  Bail if conversion fails.
+    try:
+      if not x_is_ragged:
+        x = ops.convert_to_tensor(x, name=self._x, preferred_dtype=y.dtype)
+      if not y_is_ragged:
+        y = ops.convert_to_tensor(y, name=self._y, preferred_dtype=x.dtype)
+    except (TypeError, ValueError):
+      return self.NOT_SUPPORTED
+
+    if ((x_is_ragged and y_is_ragged) or
+        (x_is_ragged and x.flat_values.shape.ndims <= y.shape.ndims) or
+        (y_is_ragged and y.flat_values.shape.ndims <= x.shape.ndims)):
+      bcast_shape = ragged_tensor_shape.broadcast_dynamic_shape(
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(x),
+          ragged_tensor_shape.RaggedTensorDynamicShape.from_tensor(y))
+      x = ragged_tensor_shape.broadcast_to(
+          x, bcast_shape, broadcast_inner_dimensions=False)
+      y = ragged_tensor_shape.broadcast_to(
+          y, bcast_shape, broadcast_inner_dimensions=False)
+
+    x_values = x.flat_values if ragged_tensor.is_ragged(x) else x
+    y_values = y.flat_values if ragged_tensor.is_ragged(y) else y
+    mapped_values = self._original_op(x_values, y_values, *args, **kwargs)
+    if ragged_tensor.is_ragged(x):
+      return x.with_flat_values(mapped_values)
+    else:
+      return y.with_flat_values(mapped_values)
+
+
+class RaggedDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for ragged ops.
+
+  Dispatches to a wrapped op-handler if at least one of the `tensor_args`
+  arguments is a RaggedTensor or a RaggedTensorValue; and all of the
+  `tensor_args` arguments are convertible to Tensor or RaggedTensor.
+  """
+
+  def __init__(self, original_op, ragged_op, ragged_args):
+    op_arg_names = tf_inspect.getfullargspec(original_op)[0]
+    ragged_arg_names = tf_inspect.getfullargspec(ragged_op)[0]
+    if op_arg_names != ragged_arg_names:
+      raise AssertionError(
+          'Signature must exactly match when overriding %s with %s: %s vs %s' %
+          (original_op, ragged_op, op_arg_names, ragged_arg_names))
+    self._ragged_op = ragged_op
+    self._ragged_args = _get_arg_infos(ragged_op, ragged_args)
+    if _UPDATE_DOCSTRINGS:
+      arg_list = ' and '.join('`%s`' % arg for arg in ragged_args)
+      original_op.__doc__ = (
+          original_op.__doc__.rstrip() + '\n\n' +
+          '    {0} may be a `tf.RaggedTensor`.\n'.format(arg_list))
+
+  def handle(self, args, kwargs):
+    if self.is_supported(args, kwargs):
+      return self._ragged_op(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+  def is_supported(self, args, kwargs):
+    found_ragged = False
+    for arg_info in self._ragged_args:
+      if arg_info.position < len(args):
+        arg = args[arg_info.position]
+      else:
+        arg = kwargs.get(arg_info.name, None)
+
+      if arg_info.is_list:
+        if not isinstance(arg, (list, tuple)):
+          return False
+        for elt in arg:
+          if ragged_tensor.is_ragged(elt):
+            found_ragged = True
+          elif not _is_convertible_to_tensor(elt):
+            return False
+      else:
+        if ragged_tensor.is_ragged(arg):
+          found_ragged = True
+        elif not _is_convertible_to_tensor(arg):
+          return False
+    return found_ragged
+
+
+def ragged_dispatch(original_op, tensor_args):
+
+  def decorator(ragged_op):
+    dispatch.RaggedDispatcher(original_op, ragged_op,
+                              tensor_args).register(original_op)
+    return ragged_op
+
+  return decorator
+
+
+_UNARY_ELEMENTWISE_OPS = [
+    array_ops.check_numerics,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.ones_like_v2,
+    array_ops.zeros_like,
+    array_ops.zeros_like_v2,
+    clip_ops.clip_by_value,
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.cast,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.logical_not,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.saturate_cast,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    parsing_ops.decode_compressed,
+    string_ops.string_to_number,
+    string_ops.string_to_hash_bucket,
+    string_ops.as_string,
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.regex_full_match,
+    string_ops.regex_replace,
+    string_ops.string_strip,
+    string_ops.string_to_hash_bucket,
+    string_ops.string_to_hash_bucket_fast,
+    string_ops.string_to_hash_bucket_strong,
+    string_ops.substr,
+    string_ops.substr_v2,
+    string_ops.string_length,
+    string_ops.string_length_v2,
+    string_ops.unicode_script,
+]
+
+_UNARY_LIST_ELEMENTWISE_OPS = [
+    math_ops.add_n,
+    string_ops.string_join,
+]
+
+_BINARY_ELEMENTWISE_OPS = [
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
+    math_ops.truncatediv,
+    math_ops.truncatemod,
+]
+
+# (original_op, ragged_op, ragged_args)
+_RAGGED_DISPATCH_OPS = [
+    (array_ops.batch_gather, ragged_array_ops.batch_gather,
+     ['params', 'indices']),
+    (array_ops.concat, ragged_array_ops.concat, ['values']),
+    (array_ops.expand_dims_v2, ragged_array_ops.expand_dims, ['input']),
+    (array_ops.gather_v2, ragged_array_ops.gather, ['params', 'indices']),
+    (array_ops.gather_nd, ragged_array_ops.gather_nd, ['params', 'indices']),
+    (array_ops.stack, ragged_array_ops.stack, ['values']),
+    (array_ops.tile, ragged_array_ops.tile, ['input']),
+    (array_ops.where, ragged_array_ops.where, ['condition', 'x', 'y']),
+    (math_ops.unsorted_segment_sum, ragged_math_ops.segment_sum,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_prod, ragged_math_ops.segment_prod,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_min, ragged_math_ops.segment_min,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_max, ragged_math_ops.segment_max,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_mean, ragged_math_ops.segment_mean,
+     ['data', 'segment_ids']),
+    (math_ops.unsorted_segment_sqrt_n, ragged_math_ops.segment_sqrt_n,
+     ['data', 'segment_ids']),
+    (math_ops.reduce_sum, ragged_math_ops.reduce_sum, ['input_tensor']),
+    (math_ops.reduce_prod, ragged_math_ops.reduce_prod, ['input_tensor']),
+    (math_ops.reduce_min, ragged_math_ops.reduce_min, ['input_tensor']),
+    (math_ops.reduce_max, ragged_math_ops.reduce_max, ['input_tensor']),
+    (math_ops.reduce_mean, ragged_math_ops.reduce_mean, ['input_tensor']),
+    (math_ops.reduce_any, ragged_math_ops.reduce_any, ['input_tensor']),
+    (math_ops.reduce_all, ragged_math_ops.reduce_all, ['input_tensor']),
+]
+
+
+def register_dispatchers():
+  """Constructs & registers OpDispatchers for ragged ops."""
+
+  op_list = (
+      _UNARY_ELEMENTWISE_OPS + _UNARY_LIST_ELEMENTWISE_OPS +
+      _BINARY_ELEMENTWISE_OPS + [x[0] for x in _RAGGED_DISPATCH_OPS])
+  for op in op_list:
+    _, undecorated_op = tf_decorator.unwrap(op)
+    if not hasattr(undecorated_op, tf_export.API_ATTRS['tensorflow'].names):
+      raise AssertionError('Expected %s to be an exported symbol '
+                           '(while adding a RaggedTensor dispatcher)')
+
+  for op in _UNARY_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op).register(op)
+
+  for op in _UNARY_LIST_ELEMENTWISE_OPS:
+    UnaryRaggedElementwiseDispatcher(op, True).register(op)
+
+  for op in _BINARY_ELEMENTWISE_OPS:
+    BinaryRaggedElementwiseDispatcher(op).register(op)
+
+  for (original_op, ragged_op, args) in _RAGGED_DISPATCH_OPS:
+    RaggedDispatcher(original_op, ragged_op, args).register(original_op)
+
+  docstring = (
+      '\n\n### Additional ops that support `RaggedTensor`\n\n' + '\n'.join([
+          '* `tf.%s`' % tf_export.get_canonical_name_for_symbol(op)
+          for op in op_list
+      ]))
+
+  return docstring
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
similarity index 62%
rename from tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
rename to tensorflow/python/ops/ragged/ragged_dispatch_test.py
index 5dfa5cff45d0022300d47bd7257552ddf315352c..82827aa2aafe22e7d6c61977ca6321cb69bd0db5 100644
--- a/tensorflow/python/ops/ragged/ragged_elementwise_ops_test.py
+++ b/tensorflow/python/ops/ragged/ragged_dispatch_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.elementwise_ops."""
+"""Tests for RaggedTensor operator dispatch."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -21,110 +21,115 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
-
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import clip_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import parsing_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
-# Constants listing various op types to test.  Each elementwise operation
+# Constants listing various op types to test.  Each operation
 # should be included in at least one list below, or tested separately if
 # necessary (e.g., because it expects additional arguments).
 UNARY_FLOAT_OPS = [
-    ragged.abs,
-    ragged.acos,
-    ragged.acosh,
-    ragged.angle,
-    ragged.asin,
-    ragged.asinh,
-    ragged.atan,
-    ragged.atanh,
-    ragged.ceil,
-    ragged.conj,
-    ragged.cos,
-    ragged.cosh,
-    ragged.digamma,
-    ragged.erf,
-    ragged.erfc,
-    ragged.exp,
-    ragged.expm1,
-    ragged.floor,
-    ragged.imag,
-    ragged.is_finite,
-    ragged.is_inf,
-    ragged.is_nan,
-    ragged.lgamma,
-    ragged.log,
-    ragged.log1p,
-    ragged.log_sigmoid,
-    ragged.negative,
-    ragged.real,
-    ragged.reciprocal,
-    ragged.rint,
-    ragged.round,
-    ragged.rsqrt,
-    ragged.sign,
-    ragged.sin,
-    ragged.sinh,
-    ragged.sqrt,
-    ragged.square,
-    ragged.tan,
-    ragged.as_string,
-    ragged.identity,
-    ragged.ones_like,
-    ragged.zeros_like,
+    math_ops.abs,
+    math_ops.acos,
+    math_ops.acosh,
+    math_ops.angle,
+    math_ops.asin,
+    math_ops.asinh,
+    math_ops.atan,
+    math_ops.atanh,
+    math_ops.ceil,
+    math_ops.conj,
+    math_ops.cos,
+    math_ops.cosh,
+    math_ops.digamma,
+    math_ops.erf,
+    math_ops.erfc,
+    math_ops.exp,
+    math_ops.expm1,
+    math_ops.floor,
+    math_ops.imag,
+    math_ops.is_finite,
+    math_ops.is_inf,
+    math_ops.is_nan,
+    math_ops.lgamma,
+    math_ops.log,
+    math_ops.log1p,
+    math_ops.log_sigmoid,
+    math_ops.negative,
+    math_ops.real,
+    math_ops.reciprocal,
+    math_ops.rint,
+    math_ops.round,
+    math_ops.rsqrt,
+    math_ops.sign,
+    math_ops.sin,
+    math_ops.sinh,
+    math_ops.sqrt,
+    math_ops.square,
+    math_ops.tan,
+    array_ops.identity,
+    array_ops.ones_like,
+    array_ops.zeros_like,
 ]
 UNARY_BOOL_OPS = [
-    ragged.logical_not,
+    math_ops.logical_not,
 ]
 UNARY_STRING_OPS = [
-    ragged.decode_base64,
-    ragged.encode_base64,
-    ragged.string_strip,
-    ragged.decode_compressed,
+    string_ops.decode_base64,
+    string_ops.encode_base64,
+    string_ops.string_strip,
+    parsing_ops.decode_compressed,
 ]
 BINARY_FLOAT_OPS = [
-    ragged.add,
-    ragged.atan2,
-    ragged.complex,
-    ragged.div,
-    ragged.div_no_nan,
-    ragged.divide,
-    ragged.equal,
-    ragged.floordiv,
-    ragged.floormod,
-    ragged.greater,
-    ragged.greater_equal,
-    ragged.less,
-    ragged.less_equal,
-    ragged.maximum,
-    ragged.minimum,
-    ragged.multiply,
-    ragged.not_equal,
-    ragged.pow,
-    ragged.realdiv,
-    ragged.squared_difference,
-    ragged.subtract,
-    ragged.truediv,
+    math_ops.add,
+    math_ops.atan2,
+    math_ops.complex,
+    math_ops.div_no_nan,
+    math_ops.divide,
+    math_ops.equal,
+    math_ops.floordiv,
+    math_ops.floormod,
+    math_ops.greater,
+    math_ops.greater_equal,
+    math_ops.less,
+    math_ops.less_equal,
+    math_ops.maximum,
+    math_ops.minimum,
+    math_ops.multiply,
+    math_ops.not_equal,
+    math_ops.pow,
+    math_ops.realdiv,
+    math_ops.squared_difference,
+    math_ops.subtract,
+    math_ops.truediv,
 ]
 BINARY_BOOL_OPS = [
-    ragged.logical_and,
-    ragged.logical_or,
-    ragged.logical_xor,
+    math_ops.logical_and,
+    math_ops.logical_or,
+    math_ops.logical_xor,
 ]
 UNARY_INT_OPS = [
-    ragged.unicode_script,
+    string_ops.unicode_script,
 ]
 BINARY_INT_OPS = [
-    ragged.truncatediv,
-    ragged.truncatemod,
+    math_ops.truncatediv,
+    math_ops.truncatemod,
 ]
 
 
-class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase,
                                parameterized.TestCase):
 
   def assertSameShape(self, x, y):
@@ -135,7 +140,7 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
       for (x_splits, y_splits) in zip(x.nested_row_splits, y.nested_row_splits):
         self.assertAllEqual(x_splits, y_splits)
       self.assertAllEqual(
-          array_ops.shape(x.inner_values), array_ops.shape(y.inner_values))
+          array_ops.shape(x.flat_values), array_ops.shape(y.flat_values))
     else:
       self.assertIsInstance(y, ops.Tensor)
       self.assertAllEqual(array_ops.shape(x), array_ops.shape(y))
@@ -171,61 +176,59 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
       [{'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]), 'op': op}
        for op in UNARY_STRING_OPS] +
       [
-          {'op': ragged.clip_by_value,
+          {'op': clip_ops.clip_by_value,
            'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
            'clip_value_min': 0.1, 'clip_value_max': 4.0},
-          {'op': ragged.cast,
+          {'op': math_ops.cast,
            'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
            'dtype': dtypes.int32},
-          {'op': ragged.saturate_cast,
+          {'op': math_ops.saturate_cast,
            'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
            'dtype': dtypes.int32},
-          {'op': ragged.string_to_hash_bucket,
+          {'op': string_ops.string_to_hash_bucket,
            'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000},
-          {'op': ragged.string_to_hash_bucket_fast,
+          {'op': string_ops.string_to_hash_bucket_fast,
            'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000},
-          {'op': ragged.string_to_hash_bucket_strong,
+          {'op': string_ops.string_to_hash_bucket_strong,
            'x': ragged.constant_value([['abcd', 'efgh'], ['aabbccdd']]),
            'num_buckets': 1000,
            'key': [1231, 12512]},
-          {'op': ragged.string_to_number,
+          {'op': string_ops.string_to_number,
            'x': ragged.constant_value([['-2.0', '3.0'], ['-3.0']])},
-          {'op': ragged.regex_full_match,
+          {'op': string_ops.regex_full_match,
            'x': ragged.constant_value([['hello', '123'], ['1+1']]),
            'pattern': r'\w+'},
-          {'op': ragged.regex_replace,
+          {'op': string_ops.regex_replace,
            'x': ragged.constant_value([['hello', '123'], ['1+1']]),
            'pattern': r'\d',
            'rewrite': '#'},
-          {'op': ragged.substr,
+          {'op': string_ops.substr,
            'x': ragged.constant_value([['hello', '123'], ['1+1']]),
            'pos': 2, 'len': 3},
-          {'op': ragged.check_numerics,
+          {'op': array_ops.check_numerics,
            'x': ragged.constant_value([[-2.0, 3.0], [-3.0]]),
            'message': 'check-numerics'},
       ]
       )  # pyformat: disable
-  def testUnaryOp(self, x, op=ragged.abs, **extra_args):
+  def testUnaryElementwiseOp(self, x, op=math_ops.abs, **extra_args):
     x = ragged.convert_to_tensor_or_ragged_tensor(x)
     result = op(x, **extra_args)
 
     # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
-    expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_x, **extra_args), [-1])
+    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+    expected_flat_values = array_ops.reshape(op(dense_x, **extra_args), [-1])
 
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(x, result)
+    # Check that the result has the expected shape.
+    self.assertSameShape(x, result)
 
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
 
   @parameterized.parameters(
       [
@@ -285,12 +288,17 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
           #=====================================================================
           {'x': ragged.constant_value([[[1, 2], [3], [4]], [[], [5, 7, 8]]]),
            'y': ragged.constant_value([[[3, 8], [2], [5]], [[], [1, 9, 8]]]),
-           'use_kwargs': True},
+           'use_kwargs': ('x', 'y')},
           {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
                                       ragged_rank=1),
            'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
                                       ragged_rank=1),
-           'use_kwargs': True},
+           'use_kwargs': ('x', 'y')},
+          {'x': ragged.constant_value([[[1, 2]], [[3, 4], [5, 6], [7, 8]]],
+                                      ragged_rank=1),
+           'y': ragged.constant_value([[[9, 3]], [[5, 2], [3, 4], [7, 6]]],
+                                      ragged_rank=1),
+           'use_kwargs': ('x',)},
       ] +
       #=========================================================================
       # Test each unary op.
@@ -306,35 +314,34 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
       [{'x': ragged.constant_value([[True, True], [False]]),
         'y': ragged.constant_value([[False, True], [False]]),
         'op': op}
-       for op in BINARY_BOOL_OPS] +
-      [
-      ]
+       for op in BINARY_BOOL_OPS]
       )  # pyformat: disable
-  def testBinaryOp(self, x, y, op=ragged.add, **extra_args):
-    use_kwargs = extra_args.pop('use_kwargs', False)
+  def testBinaryElementwiseOp(self, x, y, op=math_ops.add, **extra_args):
+    use_kwargs = extra_args.pop('use_kwargs', ())
     x = ragged.convert_to_tensor_or_ragged_tensor(x)
     y = ragged.convert_to_tensor_or_ragged_tensor(y)
-    if use_kwargs:
+    if 'x' in use_kwargs and 'y' in use_kwargs:
       result = op(x=x, y=y, **extra_args)
+    elif 'y' in use_kwargs:
+      result = op(x, y=y, **extra_args)
     else:
       result = op(x, y, **extra_args)
 
     # Run the wrapped op on the dense values, for comparison.
-    dense_x = x.inner_values if isinstance(x, ragged.RaggedTensor) else x
-    dense_y = y.inner_values if isinstance(y, ragged.RaggedTensor) else y
+    dense_x = x.flat_values if isinstance(x, ragged.RaggedTensor) else x
+    dense_y = y.flat_values if isinstance(y, ragged.RaggedTensor) else y
     expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_x, dense_y, **extra_args), [-1])
+        op(dense_x, dense_y, **extra_args), [-1])
 
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(y, result)
+    # Check that the result has the expected shape.
+    self.assertSameShape(y, result)
 
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
 
   @parameterized.parameters(
       [
@@ -358,16 +365,17 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
                       ragged.constant_value([[[2, 9], [12]], [[8]]])),
            'use_kwargs': True},
       ] + [
-          {'op': ragged.add_n,
+          {'op': math_ops.add_n,
            'inputs': (ragged.constant_value([[1, 3], [-3]]),
                       ragged.constant_value([[4, 7], [88]]),
                       ragged.constant_value([[2, 9], [12]]))},
-          {'op': ragged.string_join,
+          {'op': string_ops.string_join,
            'inputs': (ragged.constant_value([['a', 'b'], ['c']]),
                       ragged.constant_value([['foo', 'bar'], ['baz']]),
                       ragged.constant_value([['2', '9'], ['12']]))},
       ])  # pyformat: disable
-  def testListValuedOp(self, inputs, op=ragged.add_n, **extra_args):
+  def testListValuedElementwiseOp(self, inputs, op=math_ops.add_n,
+                                  **extra_args):
     use_kwargs = extra_args.pop('use_kwargs', False)
     inputs = [ragged.convert_to_tensor_or_ragged_tensor(x) for x in inputs]
     if use_kwargs:
@@ -377,72 +385,66 @@ class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase,
 
     # Run the wrapped op on the dense values, for comparison.
     dense_inputs = [
-        x.inner_values if isinstance(x, ragged.RaggedTensor) else x
+        x.flat_values if isinstance(x, ragged.RaggedTensor) else x
         for x in inputs
     ]
     expected_flat_values = array_ops.reshape(
-        op.__wrapped__(dense_inputs, **extra_args), [-1])
+        op(dense_inputs, **extra_args), [-1])
 
-    with self.test_session():
-      # Check that the result has the expected shape.
-      self.assertSameShape(inputs[0], result)
+    # Check that the result has the expected shape.
+    self.assertSameShape(inputs[0], result)
 
-      # Check that the result has the expected (flattened) values.
-      if isinstance(result, ragged.RaggedTensor):
-        result_flat_values = array_ops.reshape(result.inner_values, [-1])
-      else:
-        result_flat_values = array_ops.reshape(result, [-1])
-      self.assertAllEqual(expected_flat_values, result_flat_values)
+    # Check that the result has the expected (flattened) values.
+    if isinstance(result, ragged.RaggedTensor):
+      result_flat_values = array_ops.reshape(result.flat_values, [-1])
+    else:
+      result_flat_values = array_ops.reshape(result, [-1])
+    self.assertAllEqual(expected_flat_values, result_flat_values)
 
-  def testUnknownRankError(self):
+  def testElementwiseOpUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     x = ragged.constant([[1, 2], [3]])
-    y = ragged.from_row_splits(
+    y = ragged.RaggedTensor.from_row_splits(
         array_ops.placeholder_with_default([1, 2, 3], shape=None), x.row_splits)
-    with self.assertRaisesRegexp(
-        ValueError, r'Ragged elementwise ops require that rank \(number '
-        r'of dimensions\) be statically known.'):
-      ragged.add(x, y)
-
-  def testBroadcastError1(self):
-    x = ragged.constant([[1, 2], [3]])
-    y = [[12]]
-    with self.assertRaisesRegexp(
-        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
-      ragged.add(x, y)
-
-  def testBroadcastError2(self):
-    x = ragged.constant([[[1, 2], [3, 4]], [[5]]], ragged_rank=2)
-    y = ragged.constant([[[8], [3]], [[2]]], ragged_rank=1)
     with self.assertRaisesRegexp(ValueError,
-                                 'Inputs must have identical ragged splits'):
-      ragged.add(x, y)
-
-  def testBroadcastError3(self):
-    x = ragged.constant([[[1, 2], [3]], [[4, 5], [6]]], ragged_rank=2)
-    y = ragged.constant([[7, 8], [9]], ragged_rank=1)
-    with self.assertRaisesRegexp(
-        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
-      ragged.add(x, y)
+                                 r'Unable to broadcast: unknown rank'):
+      math_ops.add(x, y)
 
-  def testBroadcastError4(self):
-    x = ragged.constant([[[1]]])
-    y = ragged.constant([[1]])
-    with self.assertRaisesRegexp(
-        ValueError, 'Ragged elementwise ops do not support broadcasting yet'):
-      ragged.add(x, y)
+  @parameterized.parameters([
+      dict(
+          x=ragged.constant_value([[1, 2], [3]]),
+          y=[[10]],
+          expected=[[11, 12], [13]]),
+      dict(
+          x=ragged.constant_value([[[1, 2], [3, 4]], [[5]]], ragged_rank=2),
+          y=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
+          expected=[[[11, 12], [23, 24]], [[35]]]),
+      dict(
+          x=ragged.constant_value([[[1]]]),
+          y=ragged.constant_value([[1]]),
+          expected=[[[2]]]),
+  ])
+  def testElementwiseOpBroadcast(self, x, y, expected):
+    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    result = x + y
+    self.assertRaggedEqual(result, expected)
 
-  def testShapeMismatch(self):
+  def testElementwiseOpShapeMismatch(self):
     x = ragged.constant([[1, 2, 3], [4, 5]])
     y = ragged.constant([[1, 2, 3], [4, 5, 6]])
-    with self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                 'Inputs must have identical ragged splits'):
-      ragged.add(x, y)
+    with self.assertRaises(errors.InvalidArgumentError):
+      self.evaluate(math_ops.add(x, y))
+
+  def testBinaryOpSparseAndRagged(self):
+    x = ragged.constant([[1, 2, 3], [4, 5]])
+    y = sparse_tensor.SparseTensor([[0, 0], [0, 1], [2, 0]], [1, 2, 3], [3, 2])
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add(x, y))
 
-  def testDocstring(self):
-    self.assertRegexpMatches(
-        ragged.add.__doc__,
-        'Ragged version of the elementwise operation `tf.math.add`')
-    self.assertEqual(ragged.add.__name__, 'add')
+    with self.assertRaises((TypeError, ValueError)):
+      self.evaluate(math_ops.add_n([x, y]))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_eager_test.py b/tensorflow/python/ops/ragged/ragged_eager_test.py
index 731ff742aa18bfa45c68813d5e19f4dbe2307cdb..f1befbf9613fefc4efd5efd3d8ebf17db9038581 100644
--- a/tensorflow/python/ops/ragged/ragged_eager_test.py
+++ b/tensorflow/python/ops/ragged/ragged_eager_test.py
@@ -17,17 +17,17 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
-import sys
 
 from absl.testing import parameterized
 
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
 
   @parameterized.parameters([
       dict(pylist=[[b'a', b'b'], [b'c']]),
@@ -36,21 +36,15 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   ])
   def testRaggedTensorToList(self, pylist, ragged_rank=None):
     rt = ragged.constant(pylist, ragged_rank)
-    self.assertEqual(rt.tolist(), pylist)
-
-  expected = "RaggedTensor([['a', 'b'], ['c']])"
-  if sys.version_info[0] == 3:
-    expected = "RaggedTensor([[b'a', b'b'], [b'c']])"
+    self.assertRaggedEqual(rt, pylist)
 
   @parameterized.parameters([
-      dict(pylist=[['a', 'b'], ['c']],
-           expected=expected),
-      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]],
-           expected='RaggedTensor([[[1, 2], [3]], [[4, 5, 6], [], [7]]])'),
+      dict(pylist=[[b'a', b'b'], [b'c']]),
+      dict(pylist=[[[1, 2], [3]], [[4, 5, 6], [], [7]]]),
   ])
-  def testRaggedTensorStr(self, pylist, expected):
+  def testRaggedTensorStr(self, pylist):
     rt = ragged.constant(pylist)
-    self.assertEqual(str(rt), expected)
+    self.assertEqual(str(rt), '<tf.RaggedTensor %s>' % pylist)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_elementwise_ops.py b/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
deleted file mode 100644
index 23d0e8b5fc44d23ba4ada0ae69084b8547d42064..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/ragged/ragged_elementwise_ops.py
+++ /dev/null
@@ -1,367 +0,0 @@
-# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Elementwise operations for RaggedTensors."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import collections
-
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import clip_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import parsing_ops
-from tensorflow.python.ops import string_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
-from tensorflow.python.ops.ragged import ragged_tensor
-from tensorflow.python.ops.ragged import ragged_util
-from tensorflow.python.util import tf_decorator
-from tensorflow.python.util import tf_export
-from tensorflow.python.util import tf_inspect
-
-# Information about an argument to an operation: The name of the argument, its
-# position in the argument list, and a boolean flag indicating whether it
-# expects a list of tensors.
-_ArgInfo = collections.namedtuple('ArgInfo', ['name', 'position', 'is_list'])
-
-
-def make_elementwise_op(op, *elementwise_args):
-  """Returns a ragged-tensor version of the elementwise operation `op`.
-
-  The returned operation will:
-
-  1. Broadcast the elementwise arguments to have a compatible shape.
-     An exception is raised if the tensors not broadcast-compatible.
-  2. Call `op`, substituting the dense values of the broadcasted tensor for
-     each elementwise argument.
-  3. Return a potentially ragged tensor constructed from the output of `op`
-     and the broadcasted tensors' nested row splits.
-
-  For example, you can construct a ragged-tensor version of the standard
-  operation `tf.add` by calling `make_elementwise_op(tf.add, 'x', 'y')`.
-
-  Args:
-    op: The operation to wrap.
-    *elementwise_args: The names of arguments to `op` that are treated as
-      elementwise.  Arguments that take a list of tensors should have their
-      names wrapped in square brackets (e.g. "[inputs]").
-
-  Raises:
-    ValueError: If any name specified in `elementwise_args` is not the name
-      of an argument to `op`.
-  """
-  elementwise_arg_infos = _get_arg_infos(op, elementwise_args)
-
-  def ragged_op(*args, **kwargs):
-    """Ragged version of `op`."""
-    args = list(args)
-
-    # Collect all of the elementwise arguments, and put them in a single
-    # dict whose values are the (potentially ragged) tensors that need to
-    # be broadcast to a common shape.  The keys of this dict are tuples
-    # (argkey, index), where argkey is an int for poitional args or a string
-    # for keyword args; and index is None for non-list args and the index of the
-    # tensor for list args.
-    elementwise_args = {}
-    for (name, position, is_list) in elementwise_arg_infos.values():
-      if position < len(args):
-        if is_list:
-          args[position] = list(args[position])
-          for (index, arg) in enumerate(args[position]):
-            elementwise_args[position, index] = arg
-        else:
-          elementwise_args[position, None] = args[position]
-      elif name in kwargs:
-        if is_list:
-          kwargs[name] = list(kwargs[name])
-          for (i, arg) in enumerate(kwargs[name]):
-            elementwise_args[name, i] = arg
-        else:
-          elementwise_args[name, None] = kwargs[name]
-
-    with ops.name_scope(None, op.__name__, elementwise_args.values()):
-      # Convert all inputs to tensors or ragged tensors.
-      for ((key, index), tensor) in elementwise_args.items():
-        argname = elementwise_arg_infos[key].name
-        converted = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-            tensor, name=argname)
-        elementwise_args[key, index] = converted
-
-      # Broadcast tensors to have compatible shapes.
-      broadcast_args, result_splits, broadcast_check_ops = \
-          _broadcast_elementwise_args(elementwise_args)
-
-      # Replace tensor arguments with their dense values.
-      for ((key, index), tensor) in broadcast_args.items():
-        if ragged_tensor.is_ragged(tensor):
-          if isinstance(key, int) and index is None:
-            args[key] = tensor.inner_values
-          elif isinstance(key, int) and index is not None:
-            args[key][index] = tensor.inner_values
-          elif isinstance(key, str) and index is None:
-            kwargs[key] = tensor.inner_values
-          else:
-            assert isinstance(key, str) and index is not None
-            kwargs[key][index] = tensor.inner_values
-
-      # Call the elementwise op on the broadcasted dense values.
-      with ops.control_dependencies(broadcast_check_ops):
-        result_values = op(*args, **kwargs)
-
-      # Restore any ragged dimensions that we stripped off, and return the
-      # result.
-      return ragged_factory_ops.from_nested_row_splits(result_values,
-                                                       result_splits)
-
-  # Construct the docstring.
-  op_name = tf_export.get_canonical_name_for_symbol(op)
-  assert op_name is not None, op
-  argnames = ', '.join('`%s`' % s.strip('[]') for s in elementwise_args)
-  docstring = _ELEMENTWISE_DOCSTRING % dict(op_name=op_name, argnames=argnames)
-
-  # Update name, docstring, signature, etc., for the wrapper, and return it.
-  return tf_decorator.make_decorator(op, ragged_op, decorator_doc=docstring)
-
-
-_ELEMENTWISE_DOCSTRING = """\
-Ragged version of the elementwise operation `tf.%(op_name)s`.
-
-  The following elementwise arguments may be ragged or dense:
-  %(argnames)s.
-  These arguments will be broadcast to a compatible shape if necessary.
-  """
-
-
-def _get_arg_infos(func, elementwise_args):
-  """Returns `_ArgInfo`s for each `func` arg specified by `elementwise_args`.
-
-  Args:
-    func: The function whose arguments should be described.
-    elementwise_args: The names of the arguments to get info for.
-
-  Returns:
-    A dictionary that maps both names and positions of arguments to
-    `_ArgInfo` tuples.
-  """
-  arg_infos = {}
-
-  # Inspect the func's argspec to find the position of each arg.
-  arg_spec = tf_inspect.getargspec(func)
-  for argname in elementwise_args:
-    assert isinstance(argname, str)
-    is_list = argname.startswith('[') and argname.endswith(']')
-    if is_list:
-      argname = argname[1:-1]
-    assert argname in arg_spec.args, (func, argname, arg_spec.args)
-    arg_info = _ArgInfo(argname, arg_spec.args.index(argname), is_list)
-    arg_infos[arg_info.name] = arg_info
-    arg_infos[arg_info.position] = arg_info
-  return arg_infos
-
-
-def _broadcast_elementwise_args(elementwise_args):
-  """Broadcasts the values of `elementwise_args` to have compatible shapes.
-
-  Args:
-    elementwise_args: A dictionary whose keys are potentially ragged tensors.
-
-  Returns:
-    A tuple `(broadcast_args, broadcast_splits, checks)` where:
-
-    * `broadcast_args` is a dictionary with the same keys as
-      `elementwise_args`, mapping to broadcasted tensors.
-    * `broadcast_splits` is the broadcasted nested row splits.
-    * `checks` is a possibly empty tuple of assertion operations that should
-      be added as control dependencies.
-
-  Raises:
-    ValueError: If broadcasting fails.
-  """
-  # No elementwise arguments were used: nothing to do!
-  if not elementwise_args:
-    return elementwise_args, (), ()
-
-  # A single elementwise argument was used: no broadcasting necessary.
-  if len(elementwise_args) == 1:
-    arg = list(elementwise_args.values())[0]
-    if ragged_tensor.is_ragged(arg):
-      return elementwise_args, arg.nested_row_splits, ()
-    else:
-      return elementwise_args, (), ()
-
-  # Multiple elementwise arguments.
-  else:
-    is_ragged = [ragged_tensor.is_ragged(t) for t in elementwise_args.values()]
-    if not any(is_ragged):
-      return elementwise_args, (), ()
-
-    # Support limited broadcasting (namely, scalar + ragged).  Full
-    # broadcasting support will be added later.
-    if all((ragged_tensor.is_ragged(t) or t.shape.ndims == 0)
-           for t in elementwise_args.values()):
-      nested_splits_lists = [
-          t.nested_row_splits
-          for t in elementwise_args.values()
-          if ragged_tensor.is_ragged(t)
-      ]
-      if len(nested_splits_lists) == 1:
-        checks = ()
-      else:
-        if any(t.shape.ndims is None for t in elementwise_args.values()):
-          raise ValueError('Ragged elementwise ops require that rank (number '
-                           'of dimensions) be statically known.')
-        if len(set(t.shape.ndims for t in elementwise_args.values())) != 1:
-          raise ValueError('Ragged elementwise ops do not support '
-                           'broadcasting yet')
-        checks = ragged_util.assert_splits_match(nested_splits_lists)
-      return (elementwise_args, nested_splits_lists[0], checks)
-    else:
-      raise ValueError('Ragged elementwise ops do not support broadcasting yet')
-
-
-# A list of symbols that should be exported in the "ragged" package.
-_symbols_to_export = []
-
-
-def _add_elementwise_ops_to_this_module(specs, verbose=False):
-  """Adds ragged versions of the given ops to this module.
-
-  Args:
-    specs: A list of tuples containing the arguments for `make_elementwise_op`.
-    verbose: If true, then display each op that gets added.
-  """
-  for spec in specs:
-    original_op = spec[0]
-    ragged_op = make_elementwise_op(*spec)
-    canonical_name = tf_export.get_canonical_name_for_symbol(original_op)
-    if '.' not in canonical_name:
-      op_name = canonical_name
-    else:
-      op_name = original_op.__name__
-    if verbose:
-      print('Adding ragged_elementwise_op: tf.ragged.%s (based on tf.%s)' %
-            (op_name, canonical_name))
-    globals()[op_name] = ragged_op
-    _symbols_to_export.append(op_name)
-
-
-# A list of tuples containing arguments for `make_elementwise_op`, for each
-# elementwise operation that should have a ragged version built.  Each tuple
-# contains a standard `Tensor` operation, and the names of any arguments
-# that are processed in elementwise fashion.
-_TF_ELEMENTWISE_OPS = [
-    # Unary math operations.
-    (clip_ops.clip_by_value, 't'),
-    (math_ops.abs, 'x'),
-    (math_ops.acos, 'x'),
-    (math_ops.acosh, 'x'),
-    (math_ops.angle, 'input'),
-    (math_ops.asin, 'x'),
-    (math_ops.asinh, 'x'),
-    (math_ops.atan, 'x'),
-    (math_ops.atanh, 'x'),
-    (math_ops.cast, 'x'),
-    (math_ops.ceil, 'x'),
-    (math_ops.conj, 'x'),
-    (math_ops.cos, 'x'),
-    (math_ops.cosh, 'x'),
-    (math_ops.digamma, 'x'),
-    (math_ops.erf, 'x'),
-    (math_ops.erfc, 'x'),
-    (math_ops.exp, 'x'),
-    (math_ops.expm1, 'x'),
-    (math_ops.floor, 'x'),
-    (math_ops.imag, 'input'),
-    (math_ops.is_finite, 'x'),
-    (math_ops.is_inf, 'x'),
-    (math_ops.is_nan, 'x'),
-    (math_ops.lgamma, 'x'),
-    (math_ops.log, 'x'),
-    (math_ops.log1p, 'x'),
-    (math_ops.log_sigmoid, 'x'),
-    (math_ops.logical_not, 'x'),
-    (math_ops.negative, 'x'),
-    (math_ops.real, 'input'),
-    (math_ops.reciprocal, 'x'),
-    (math_ops.rint, 'x'),
-    (math_ops.round, 'x'),
-    (math_ops.rsqrt, 'x'),
-    (math_ops.saturate_cast, 'value'),
-    (math_ops.sign, 'x'),
-    (math_ops.sin, 'x'),
-    (math_ops.sinh, 'x'),
-    (math_ops.sqrt, 'x'),
-    (math_ops.square, 'x'),
-    (math_ops.tan, 'x'),
-
-    # Binary math operations
-    (math_ops.add, 'x', 'y'),
-    (math_ops.atan2, 'y', 'x'),
-    (math_ops.complex, 'real', 'imag'),
-    (math_ops.div, 'x', 'y'),
-    (math_ops.div_no_nan, 'x', 'y'),
-    (math_ops.divide, 'x', 'y'),
-    (math_ops.equal, 'x', 'y'),
-    (math_ops.floordiv, 'x', 'y'),
-    (math_ops.floormod, 'x', 'y'),
-    (math_ops.greater, 'x', 'y'),
-    (math_ops.greater_equal, 'x', 'y'),
-    (math_ops.less, 'x', 'y'),
-    (math_ops.less_equal, 'x', 'y'),
-    (math_ops.logical_and, 'x', 'y'),
-    (math_ops.logical_or, 'x', 'y'),
-    (math_ops.logical_xor, 'x', 'y'),
-    (math_ops.maximum, 'x', 'y'),
-    (math_ops.minimum, 'x', 'y'),
-    (math_ops.multiply, 'x', 'y'),
-    (math_ops.not_equal, 'x', 'y'),
-    (math_ops.pow, 'x', 'y'),
-    (math_ops.realdiv, 'x', 'y'),
-    (math_ops.squared_difference, 'x', 'y'),
-    (math_ops.subtract, 'x', 'y'),
-    (math_ops.truediv, 'x', 'y'),
-    (math_ops.truncatediv, 'x', 'y'),
-    (math_ops.truncatemod, 'x', 'y'),
-
-    # N-ary math operations
-    (math_ops.add_n, '[inputs]'),
-
-    # String operations
-    (string_ops.as_string, 'input'),
-    (string_ops.decode_base64, 'input'),
-    (string_ops.encode_base64, 'input'),
-    (string_ops.regex_full_match, 'input'),
-    (string_ops.regex_replace, 'input'),
-    (string_ops.string_join, '[inputs]'),
-    (string_ops.string_strip, 'input'),
-    (string_ops.string_to_hash_bucket, 'string_tensor'),
-    (string_ops.string_to_hash_bucket_fast, 'input'),
-    (string_ops.string_to_hash_bucket_strong, 'input'),
-    (string_ops.substr, 'input'),
-    (string_ops.unicode_script, 'input'),
-
-    # Array ops
-    (array_ops.check_numerics, 'tensor'),
-    (array_ops.identity, 'input'),
-    (array_ops.ones_like, 'tensor'),
-    (array_ops.zeros_like, 'tensor'),
-
-    # Parsing ops
-    (parsing_ops.decode_compressed, 'bytes'),
-    (parsing_ops.string_to_number, 'string_tensor'),
-]
-_add_elementwise_ops_to_this_module(_TF_ELEMENTWISE_OPS)
diff --git a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
index 0c4fd458c230a13ebf48d6a94028497a266ea1bf..072f330e3c1c0a20ac7cecd84ec6b0e47003a3a0 100644
--- a/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_expand_dims_op_test.py
@@ -22,10 +22,12 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedExpandDimsOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedExpandDimsOpTest(ragged_test_util.RaggedTensorTestCase,
                              parameterized.TestCase):
 
   # An example 4-d ragged tensor with shape [3, (D2), (D3), 2], and the
@@ -117,8 +119,7 @@ class RaggedExpandDimsOpTest(test_util.TensorFlowTestCase,
     if expected_shape is not None:
       self.assertEqual(expanded.shape.as_list(), expected_shape)
 
-    with self.test_session():
-      self.assertEqual(expanded.eval().tolist(), expected)
+    self.assertRaggedEqual(expanded, expected)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_factory_ops.py b/tensorflow/python/ops/ragged/ragged_factory_ops.py
index de3a2d5b10be0e22c22f24ea8cb959c28cb741fd..2c63e1c7994c31b6ed53e37e65498a843e2bb595 100644
--- a/tensorflow/python/ops/ragged/ragged_factory_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_factory_ops.py
@@ -21,11 +21,7 @@ from __future__ import print_function
 import numpy as np
 
 from tensorflow.python.framework import constant_op
-from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
-from tensorflow.python.framework import tensor_util
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_tensor_value
 
@@ -56,8 +52,8 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
       `pylist`.
     ragged_rank: An integer specifying the ragged rank of the returned
       `RaggedTensor`.  Must be nonnegative and less than `K`. Defaults to
-      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to
-      `max(0, K - 1 - len(inner_shape))` if `inner_shape` is specified.
+      `max(0, K - 1)` if `inner_shape` is not specified.  Defaults to `max(0, K
+      - 1 - len(inner_shape))` if `inner_shape` is specified.
     inner_shape: A tuple of integers specifying the shape for individual inner
       values in the returned `RaggedTensor`.  Defaults to `()` if `ragged_rank`
       is not specified.  If `ragged_rank` is specified, then a default is chosen
@@ -72,9 +68,10 @@ def constant(pylist, dtype=None, ragged_rank=None, inner_shape=None, name=None):
     ValueError: If the scalar values in `pylist` have inconsistent nesting
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
-  with ops.name_scope(name, 'RaggedConstant'):
-    return _constant_value(from_row_splits, constant_op.constant, pylist, dtype,
-                           ragged_rank, inner_shape)
+  with ops.name_scope(name, "RaggedConstant"):
+    return _constant_value(ragged_tensor.RaggedTensor.from_row_splits,
+                           constant_op.constant, pylist, dtype, ragged_rank,
+                           inner_shape)
 
 
 def constant_value(pylist, dtype=None, ragged_rank=None, inner_shape=None):
@@ -153,29 +150,29 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
       depth; or if ragged_rank or inner_shape are incompatible with `pylist`.
   """
   if ragged_tensor.is_ragged(pylist):
-    raise TypeError('pylist may not be a RaggedTensor or RaggedTensorValue.')
+    raise TypeError("pylist may not be a RaggedTensor or RaggedTensorValue.")
 
   if not isinstance(pylist, (list, tuple)):
     # Scalar value
     if ragged_rank is not None and ragged_rank != 0:
-      raise ValueError('Invalid pylist=%r: incompatible with ragged_rank=%d' %
+      raise ValueError("Invalid pylist=%r: incompatible with ragged_rank=%d" %
                        (pylist, ragged_rank))
     if inner_shape is not None and inner_shape:
       raise ValueError(
-          'Invalid pylist=%r: incompatible with dim(inner_shape)=%d' %
+          "Invalid pylist=%r: incompatible with dim(inner_shape)=%d" %
           (pylist, len(inner_shape)))
     return inner_factory(pylist, dtype, ())
 
   if ragged_rank is not None and ragged_rank < 0:
     raise ValueError(
-        'Invalid ragged_rank=%r: must be nonnegative' % ragged_rank)
+        "Invalid ragged_rank=%r: must be nonnegative" % ragged_rank)
 
   # Find the depth of scalar values in `pylist`.
   scalar_depth, max_depth = _find_scalar_and_max_depth(pylist)
   if scalar_depth is not None:
     if max_depth > scalar_depth:
-      raise ValueError('Invalid pylist=%r: empty list nesting is greater '
-                       'than scalar value nesting' % pylist)
+      raise ValueError("Invalid pylist=%r: empty list nesting is greater "
+                       "than scalar value nesting" % pylist)
 
   # If both inner_shape and ragged_rank were specified, then check that
   # they are compatible with pylist.
@@ -184,8 +181,8 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
     if ((scalar_depth is not None and expected_depth != scalar_depth) or
         (scalar_depth is None and expected_depth < max_depth)):
       raise ValueError(
-          'Invalid pylist=%r: incompatible with ragged_rank=%d '
-          'and dim(inner_shape)=%d' % (pylist, ragged_rank, len(inner_shape)))
+          "Invalid pylist=%r: incompatible with ragged_rank=%d "
+          "and dim(inner_shape)=%d" % (pylist, ragged_rank, len(inner_shape)))
 
   # Check if the result is a `Tensor`.
   if (ragged_rank == 0 or
@@ -221,7 +218,7 @@ def _constant_value(ragged_factory, inner_factory, pylist, dtype, ragged_rank,
     values = concatenated_values
 
   values = inner_factory(
-      values, dtype=dtype, shape=(len(values),) + inner_shape, name='values')
+      values, dtype=dtype, shape=(len(values),) + inner_shape, name="values")
   for row_splits in reversed(nested_splits):
     values = ragged_factory(values, row_splits)
   return values
@@ -249,7 +246,7 @@ def _find_scalar_and_max_depth(pylist):
       child_scalar_depth, child_max_depth = _find_scalar_and_max_depth(child)
       if child_scalar_depth is not None:
         if scalar_depth is not None and scalar_depth != child_scalar_depth + 1:
-          raise ValueError('all scalar values must have the same nesting depth')
+          raise ValueError("all scalar values must have the same nesting depth")
         scalar_depth = child_scalar_depth + 1
       max_depth = max(max_depth, child_max_depth + 1)
     return (scalar_depth, max_depth)
@@ -273,406 +270,24 @@ def _default_inner_shape_for_pylist(pylist, ragged_rank):
     """Checks that `item` has a consistent shape matching `shape`."""
     is_nested = isinstance(item, (list, tuple))
     if is_nested != bool(shape):
-      raise ValueError('inner values have inconsistent shape')
+      raise ValueError("inner values have inconsistent shape")
     if is_nested:
       if shape[0] != len(item):
-        raise ValueError('inner values have inconsistent shape')
+        raise ValueError("inner values have inconsistent shape")
       for child in item:
         check_inner_shape(child, shape[1:])
 
   # Collapse the ragged layers to get the list of inner values.
-  inner_values = pylist
+  flat_values = pylist
   for dim in range(ragged_rank):
-    if not all(isinstance(v, (list, tuple)) for v in inner_values):
-      raise ValueError('pylist has scalar values depth %d, but ragged_rank=%d '
-                       'requires scalar value depth greater than %d' %
+    if not all(isinstance(v, (list, tuple)) for v in flat_values):
+      raise ValueError("pylist has scalar values depth %d, but ragged_rank=%d "
+                       "requires scalar value depth greater than %d" %
                        (dim + 1, ragged_rank, ragged_rank))
-    inner_values = sum((list(v) for v in inner_values), [])
+    flat_values = sum((list(v) for v in flat_values), [])
 
   # Compute the inner shape looking only at the leftmost elements; and then
   # use check_inner_shape to verify that other elements have the same shape.
-  inner_shape = get_inner_shape(inner_values)
-  check_inner_shape(inner_values, inner_shape)
+  inner_shape = get_inner_shape(flat_values)
+  check_inner_shape(flat_values, inner_shape)
   return inner_shape[1:]
-
-
-#===============================================================================
-# Convert value -> tensor
-#===============================================================================
-def convert_to_tensor_or_ragged_tensor(value,
-                                       dtype=None,
-                                       preferred_dtype=None,
-                                       name=None):
-  """Converts value to a `RaggedTensor` or `Tensor`.
-
-  * If `value` is a `RaggedTensor`, then return it as-is.
-  * If `value` is a `RaggedTensorValue`, return a corresponding constant
-    `RaggedTensor`.
-  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
-
-  Args:
-    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
-      a registered `Tensor` conversion function.
-    dtype: Optional element type for the returned tensor.  If missing the type
-      is inferred from the type of `value`.
-    preferred_dtype: Optional element type for the returned tensor, used when
-      dtype is None.  This argument has no effect if `value` is already a
-      tensor, or when conversion is not possible.
-    name: Optional name to use if a new `Tensor` is created.
-
-  Returns:
-    A `Tensor` or `RaggedTensor`.
-  """
-  if isinstance(value, ragged_tensor.RaggedTensor):
-    if dtype and not dtype.is_compatible_with(value.dtype):
-      raise ValueError('Tensor conversion requested dtype %s for '
-                       'RaggedTensor with dtype %s: %r' %
-                       (dtype.name, value.dtype.name, value))
-    return value
-  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
-    with ops.name_scope(name, 'ConvertToTensorOrRaggedTensor', []):
-      inner_values = ops.convert_to_tensor(
-          value=value.inner_values,
-          dtype=dtype,
-          preferred_dtype=preferred_dtype,
-          name='inner_values')
-      return from_nested_row_splits(inner_values, value.nested_row_splits)
-  else:
-    return ops.convert_to_tensor(
-        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
-
-
-#===============================================================================
-# Ops to construct RaggedTensor from row-partitioned values.
-#===============================================================================
-
-
-def from_value_rowids(values, value_rowids, nrows=None, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
-            for row in range(nrows)]
-  ```
-
-  Warning: currently, this needs to cast value_rowids to int64 before
-  converting, since `tf.bincount` only supports `int32`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
-      one-to-one with `values`, and specifies each value's row index.  Must be
-      nonnegative, and must be sorted in ascending order.
-    nrows: An int64 scalar specifying the number of rows.  This should be
-      specified if the `RaggedTensor` may containing empty training rows.  Must
-      be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
-      Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  Raises:
-    ValueError: If `nrows` is incompatible with `value_rowids`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_value_rowids(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
-    ...     nrows=5)
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromValueRowIds',
-                      [values, value_rowids, nrows]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    value_rowids = ops.convert_to_tensor(
-        value_rowids, dtypes.int64, name='value_rowids')
-    if nrows is None:
-      const_rowids = tensor_util.constant_value(value_rowids)
-      if const_rowids is None:
-        nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
-        const_nrows = None
-      else:
-        const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
-        nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name='nrows')
-    else:
-      nrows = ops.convert_to_tensor(nrows, dtypes.int64, 'nrows')
-      const_nrows = tensor_util.constant_value(nrows)
-      if const_nrows is not None:
-        if const_nrows < 0:
-          raise ValueError('Expected nrows >= 0; got %d' % const_nrows)
-        const_rowids = tensor_util.constant_value(value_rowids)
-        if const_rowids is not None and const_rowids.size > 0:
-          if not const_nrows >= const_rowids[-1] + 1:
-            raise ValueError(
-                'Expected nrows >= value_rowids[-1] + 1; got nrows=%d, '
-                'value_rowids[-1]=%d' % (const_nrows, const_rowids[-1]))
-
-    value_rowids.shape.assert_has_rank(1)
-    nrows.shape.assert_has_rank(0)
-    values.shape[:1].assert_is_compatible_with(value_rowids.shape)
-
-    # Convert value_rowids & nrows to row_splits.
-    # Note: we don't use segment_ids_to_row_splits() here because we want
-    # to save the intermediate value `row_lengths`, so we can cache it.
-    # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the cast
-    # (Remove the warning in the docstring when we do.)
-    value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
-    nrows_int32 = math_ops.cast(nrows, dtypes.int32)
-    row_lengths = math_ops.bincount(
-        value_rowids_int32,
-        minlength=nrows_int32,
-        maxlength=nrows_int32,
-        dtype=dtypes.int64)
-    row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
-    if const_nrows is not None:
-      row_lengths.set_shape([const_nrows])
-      row_splits.set_shape([const_nrows + 1])
-
-    return ragged_tensor.RaggedTensor(
-        values,
-        row_splits,
-        cached_row_lengths=row_lengths,
-        cached_value_rowids=value_rowids,
-        cached_nrows=nrows,
-        internal=True)
-
-
-def from_row_splits(values, row_splits, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [values[row_splits[i]:row_splits[i + 1]]
-            for i in range(len(row_splits) - 1)]
-  ```
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
-      and must be sorted in ascending order.  `row_splits[0]` must be zero and
-      `row_splits[-1]` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  Raises:
-    ValueError: If `row_splits` is an empty list.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_splits(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_splits=[0, 4, 4, 7, 8, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  if isinstance(row_splits, (list, tuple)) and not row_splits:
-    raise ValueError('row_splits tensor may not be empty.')
-  with ops.name_scope(name, 'RaggedFromRowSplits', [values, row_splits]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, 'row_splits')
-    row_splits.shape.assert_has_rank(1)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_row_lengths(values, row_lengths, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
-
-  The returned `RaggedTensor` corresponds with the python list defined by:
-
-  ```python
-  result = [[values.pop(0) for i in range(length)]
-            for length in row_lengths]
-  ```
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative.
-      `sum(row_lengths)` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_lengths(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_lengths=[4, 0, 3, 1, 0])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowLengths', [values, row_lengths]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
-                                        'row_lengths')
-    row_lengths.shape.assert_has_rank(1)
-    row_limits = math_ops.cumsum(row_lengths)
-    row_splits = array_ops.concat([[0], row_limits], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values,
-        row_splits=row_splits,
-        cached_row_lengths=row_lengths,
-        internal=True)
-
-
-def from_row_starts(values, row_starts, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
-
-  Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
-      and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must be
-      zero.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_starts(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_starts=[0, 4, 4, 7, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowStarts', [values, row_starts]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, 'row_starts')
-    row_starts.shape.assert_has_rank(1)
-    nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
-    row_splits = array_ops.concat([row_starts, nvals], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_row_limits(values, row_limits, name=None):
-  """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
-
-  Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
-
-  Args:
-    values: A potentially ragged tensor with shape `[nvals, ...]`.
-    row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
-      ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor`.  `result.rank = values.rank + 1`.
-    `result.ragged_rank = values.ragged_rank + 1`.
-
-  #### Example:
-    ```python
-    >>> rt = ragged.from_row_limits(
-    ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
-    ...     row_limits=[4, 4, 7, 8, 8])
-    >>> rt.eval().tolist()
-    [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
-    ```
-  """
-  with ops.name_scope(name, 'RaggedFromRowLimits', [values, row_limits]):
-    values = convert_to_tensor_or_ragged_tensor(values, name='values')
-    row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, 'row_limits')
-    row_limits.shape.assert_has_rank(1)
-    zero = array_ops.zeros([1], dtypes.int64)
-    row_splits = array_ops.concat([zero, row_limits], axis=0)
-    return ragged_tensor.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
-
-
-def from_nested_value_rowids(inner_values,
-                             nested_value_rowids,
-                             nested_nrows=None,
-                             name=None):
-  """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
-
-  Equivalent to:
-
-  ```python
-  result = inner_values
-  for (value_rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
-    result = from_value_rowids(result, value_rowids, nrows)
-  ```
-
-  Args:
-    inner_values: A potentially ragged tensor.
-    nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is used
-      as the `value_rowids` for the `i`th ragged dimension.
-    nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
-      `nrows` for the `i`th ragged dimension.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor` (or `inner_values` if `nested_value_rowids` is empty).
-
-  Raises:
-    ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
-  """
-  if isinstance(nested_value_rowids, ops.Tensor):
-    raise TypeError('nested_value_rowids must be a list of Tensors')
-  if nested_nrows is None:
-    nested_nrows = [None] * len(nested_value_rowids)
-  else:
-    if isinstance(nested_nrows, ops.Tensor):
-      raise TypeError('nested_nrows must be a list of Tensors')
-    if len(nested_nrows) != len(nested_value_rowids):
-      raise ValueError('nested_nrows must have the same length as '
-                       'nested_value_rowids')
-
-  with ops.name_scope(
-      name, 'RaggedFromNestedValueRowIds',
-      [inner_values] + list(nested_value_rowids) + list(nested_nrows)):
-    result = inner_values
-    for value_rowids, nrows in reversed(
-        list(zip(nested_value_rowids, nested_nrows))):
-      result = from_value_rowids(result, value_rowids, nrows)
-    return result
-
-
-def from_nested_row_splits(inner_values, nested_row_splits, name=None):
-  """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
-
-  Equivalent to:
-
-  ```python
-  result = inner_values
-  for row_splits in reversed(nested_row_splits):
-    result = from_row_splits(result, row_splits)
-  ```
-
-  Args:
-    inner_values: A potentially ragged tensor.
-    nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used as
-      the `row_splits` for the `i`th ragged dimension.
-    name: A name prefix for the RaggedTensor (optional).
-
-  Returns:
-    A `RaggedTensor` (or `inner_values` if `nested_row_splits` is empty).
-  """
-  if isinstance(nested_row_splits, ops.Tensor):
-    raise TypeError('nested_row_splits must be a list of Tensors')
-  with ops.name_scope(name, 'RaggedFromNestedRowSplits',
-                      [inner_values] + list(nested_row_splits)):
-    result = inner_values
-    for splits in reversed(nested_row_splits):
-      result = from_row_splits(result, splits)
-    return result
diff --git a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
index ff19ddedebf6108055747ad43c97ec8316e9fadc..07cf910202770192f146328844dec8c12be542a7 100644
--- a/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_sparse_op_test.py
@@ -12,57 +12,76 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.from_sparse."""
+"""Tests for RaggedTensor.from_sparse."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     st = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0]],
         values=[1, 2, 3, 4, 5],
         dense_shape=[4, 3])
-    rt = ragged.from_sparse(st)
+    rt = RaggedTensor.from_sparse(st)
 
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), [[1, 2, 3], [4], [], [5]])
+    self.assertRaggedEqual(rt, [[1, 2, 3], [4], [], [5]])
 
   def testEmpty(self):
     st = sparse_tensor.SparseTensor(
         indices=array_ops.zeros([0, 2], dtype=dtypes.int64),
         values=[],
         dense_shape=[4, 3])
-    rt = ragged.from_sparse(st)
+    rt = RaggedTensor.from_sparse(st)
 
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), [[], [], [], []])
+    self.assertRaggedEqual(rt, [[], [], [], []])
 
   def testBadSparseTensorRank(self):
     st1 = sparse_tensor.SparseTensor(indices=[[0]], values=[0], dense_shape=[3])
+    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                            RaggedTensor.from_sparse, st1)
+
     st2 = sparse_tensor.SparseTensor(
         indices=[[0, 0, 0]], values=[0], dense_shape=[3, 3, 3])
-    st3 = sparse_tensor.SparseTensor(
-        indices=array_ops.placeholder(dtypes.int64),
-        values=[0],
-        dense_shape=array_ops.placeholder(dtypes.int64))
-    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st1)
     self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st2)
-    self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
-                            ragged.from_sparse, st3)
+                            RaggedTensor.from_sparse, st2)
+
+    if not context.executing_eagerly():
+      st3 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      self.assertRaisesRegexp(ValueError, r'rank\(st_input\) must be 2',
+                              RaggedTensor.from_sparse, st3)
+
+  def testGoodPartialSparseTensorRank(self):
+    if not context.executing_eagerly():
+      st1 = sparse_tensor.SparseTensor(
+          indices=[[0, 0]],
+          values=[0],
+          dense_shape=array_ops.placeholder(dtypes.int64))
+      st2 = sparse_tensor.SparseTensor(
+          indices=array_ops.placeholder(dtypes.int64),
+          values=[0],
+          dense_shape=[4, 3])
+
+      # Shouldn't throw ValueError
+      RaggedTensor.from_sparse(st1)
+      RaggedTensor.from_sparse(st2)
 
   def testNonRaggedSparseTensor(self):
     # "index_suffix" means the value of the innermost dimension of the index
@@ -73,22 +92,21 @@ class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
     # index_suffix of first index is not zero.
     st1 = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [2, 0]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st1))
     # index_suffix of an index that starts a new row is not zero.
     st2 = sparse_tensor.SparseTensor(
         indices=[[0, 0], [0, 1], [2, 1]], values=[1, 2, 3], dense_shape=[3, 3])
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st2))
     # index_suffix of an index that continues a row skips a cell.
     st3 = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 1], [0, 3]], values=[1, 2, 3], dense_shape=[3, 3])
-    rt1 = ragged.from_sparse(st1)
-    rt2 = ragged.from_sparse(st2)
-    rt3 = ragged.from_sparse(st3)
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt1.eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt2.eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'.*SparseTensor is not right-ragged', rt3.eval)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*SparseTensor is not right-ragged'):
+      self.evaluate(RaggedTensor.from_sparse(st3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
index eb237f4c95604d6ee44d3427dc6b8589897fa955..6a3d639c5e35f23db7d53994e0a0bfe5231e664b 100644
--- a/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_from_tensor_op_test.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.from_tensor."""
+"""Tests for RaggedTensor.from_tensor."""
 
 from __future__ import absolute_import
 from __future__ import division
@@ -24,28 +24,26 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
-class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
-                             parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase,
+                                 parameterized.TestCase):
 
   def testDocStringExamples(self):
-    # The examples from ragged.from_tensor.__doc__.
+    # The examples from RaggedTensor.from_tensor.__doc__.
     dt = constant_op.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.from_tensor(dt).eval().tolist(),
-          [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt), [[5, 7, 0], [0, 3, 0], [6, 0, 0]])
 
-      self.assertEqual(
-          ragged.from_tensor(dt, lengths=[1, 0, 3]).eval().tolist(),
-          [[5], [], [6, 0, 0]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, lengths=[1, 0, 3]), [[5], [], [6, 0, 0]])
 
-      self.assertEqual(
-          ragged.from_tensor(dt, padding=0).eval().tolist(),
-          [[5, 7], [0, 3], [6]])
+    self.assertRaggedEqual(
+        RaggedTensor.from_tensor(dt, padding=0), [[5, 7], [0, 3], [6]])
 
   @parameterized.parameters(
       # 2D test cases, no length or padding.
@@ -269,14 +267,13 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
                            padding=None,
                            ragged_rank=1):
     dt = constant_op.constant(tensor)
-    rt = ragged.from_tensor(dt, lengths, padding, ragged_rank)
-    self.assertEqual(type(rt), ragged.RaggedTensor)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding, ragged_rank)
+    self.assertEqual(type(rt), RaggedTensor)
     self.assertEqual(rt.ragged_rank, ragged_rank)
     self.assertTrue(
         dt.shape.is_compatible_with(rt.shape),
         '%s is incompatible with %s' % (dt.shape, rt.shape))
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), expected)
+    self.assertRaggedEqual(rt, expected)
 
   def testHighDimensions(self):
     # Use distinct prime numbers for all dimension shapes in this test, so
@@ -284,14 +281,13 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
     dt = array_ops.reshape(
         math_ops.range(3 * 5 * 7 * 11 * 13 * 17), [3, 5, 7, 11, 13, 17])
     for ragged_rank in range(1, 4):
-      rt = ragged.from_tensor(dt, ragged_rank=ragged_rank)
-      self.assertEqual(type(rt), ragged.RaggedTensor)
+      rt = RaggedTensor.from_tensor(dt, ragged_rank=ragged_rank)
+      self.assertEqual(type(rt), RaggedTensor)
       self.assertEqual(rt.ragged_rank, ragged_rank)
       self.assertTrue(
           dt.shape.is_compatible_with(rt.shape),
           '%s is incompatible with %s' % (dt.shape, rt.shape))
-      with self.test_session():
-        self.assertEqual(rt.eval().tolist(), dt.eval().tolist())
+      self.assertRaggedEqual(rt, self.evaluate(dt).tolist())
 
   @parameterized.parameters(
       # With no padding or lengths
@@ -397,12 +393,11 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
   )
   def testEmpty(self, dt_shape, expected, lengths=None, padding=None):
     dt = array_ops.zeros(dt_shape)
-    rt = ragged.from_tensor(dt, lengths, padding)
-    self.assertEqual(type(rt), ragged.RaggedTensor)
+    rt = RaggedTensor.from_tensor(dt, lengths, padding)
+    self.assertEqual(type(rt), RaggedTensor)
     self.assertEqual(rt.ragged_rank, 1)
     self.assertTrue(dt.shape.is_compatible_with(rt.shape))
-    with self.test_session():
-      self.assertEqual(rt.eval().tolist(), expected)
+    self.assertRaggedEqual(rt, expected)
 
   @parameterized.parameters(
       {
@@ -419,7 +414,7 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
       {
           'tensor': [[1]],
           'padding': 'a',
-          'error': (TypeError, "Expected int32, got 'a'.*")
+          'error': (TypeError, '.*')
       },
       {
           'tensor': [[1]],
@@ -454,8 +449,8 @@ class RaggedFromTensorOpTest(test_util.TensorFlowTestCase,
                  ragged_rank=1,
                  error=None):
     dt = constant_op.constant(tensor)
-    self.assertRaisesRegexp(error[0], error[1], ragged.from_tensor, dt, lengths,
-                            padding, ragged_rank)
+    self.assertRaisesRegexp(error[0], error[1], RaggedTensor.from_tensor, dt,
+                            lengths, padding, ragged_rank)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_functional_ops.py b/tensorflow/python/ops/ragged/ragged_functional_ops.py
index 6b71d88435c91d1c130c1c24a033ebcf4a7959cb..751f2c73592c676d0dd5eec4f9dc45430cd646b1 100644
--- a/tensorflow/python/ops/ragged/ragged_functional_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_functional_ops.py
@@ -19,15 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
 
 
-def map_inner_values(op, *args, **kwargs):
+def map_flat_values(op, *args, **kwargs):
   """Applies `op` to the inner values of one or more RaggedTensors.
 
-  Replaces any `RaggedTensor` in `args` or `kwargs` with its `inner_values`
+  Replaces any `RaggedTensor` in `args` or `kwargs` with its `flat_values`
   tensor, and then calls `op`.  Returns a `RaggedTensor` that is constructed
   from the input `RaggedTensor`s' `splits` and the value returned by
   the `op`.
@@ -39,20 +38,20 @@ def map_inner_values(op, *args, **kwargs):
 
   ```python
   >>> rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
-  >>> ragged.map_inner_values(tf.ones_like, rt).eval().tolist()
+  >>> ragged.map_flat_values(tf.ones_like, rt).eval().tolist()
   [[1, 1, 1], [], [1, 1], [1]]
-  >>> ragged.map_inner_values(tf.multiply, rt, rt).eval().tolist()
+  >>> ragged.map_flat_values(tf.multiply, rt, rt).eval().tolist()
   [[1, 4, 9], [], [16, 25], [36]]
-  >>> ragged.map_inner_values(tf.add, rt, 5).eval().tolist()
+  >>> ragged.map_flat_values(tf.add, rt, 5).eval().tolist()
   [[6, 7, 8], [], [9, 10], [11]]
   ```
 
   Args:
-    op: The operation that should be applied to the RaggedTensor `inner_values`.
+    op: The operation that should be applied to the RaggedTensor `flat_values`.
       `op` is typically an element-wise operation (such as math_ops.add), but
       any operation that preserves the size of the outermost dimension can be
       used.  I.e., `shape[0]` of the value returned by `op` must match
-      `shape[0]` of the `RaggedTensor`s' `inner_values` tensors.
+      `shape[0]` of the `RaggedTensor`s' `flat_values` tensors.
     *args: Arguments for `op`.
     **kwargs: Keyword arguments for `op`.
 
@@ -66,8 +65,8 @@ def map_inner_values(op, *args, **kwargs):
   # Replace RaggedTensors with their values; and collect the splits tensors
   # from each RaggedTensor.
   nested_splits_lists = []
-  inner_args = _replace_ragged_with_inner_values(args, nested_splits_lists)
-  inner_kwargs = _replace_ragged_with_inner_values(kwargs, nested_splits_lists)
+  inner_args = _replace_ragged_with_flat_values(args, nested_splits_lists)
+  inner_kwargs = _replace_ragged_with_flat_values(kwargs, nested_splits_lists)
   if not nested_splits_lists:
     return op(*args, **kwargs)
 
@@ -75,15 +74,15 @@ def map_inner_values(op, *args, **kwargs):
       ragged_util.assert_splits_match(nested_splits_lists)):
     # Delegate to op, and then compose the result from the transformed values
     # and the splits.
-    return ragged_factory_ops.from_nested_row_splits(
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
         op(*inner_args, **inner_kwargs), nested_splits_lists[0])
 
 
-def _replace_ragged_with_inner_values(value, nested_splits_lists):
-  """Replace RaggedTensors with their inner_values, and record their splits.
+def _replace_ragged_with_flat_values(value, nested_splits_lists):
+  """Replace RaggedTensors with their flat_values, and record their splits.
 
   Returns a copy of `value`, with any nested `RaggedTensor`s replaced by their
-  `inner_values` tensor.  Looks inside lists, tuples, and dicts.
+  `flat_values` tensor.  Looks inside lists, tuples, and dicts.
 
   Appends each `RaggedTensor`'s `nested_splits` to `nested_splits_lists`.
 
@@ -97,13 +96,13 @@ def _replace_ragged_with_inner_values(value, nested_splits_lists):
   """
   # Base case
   if ragged_tensor.is_ragged(value):
-    value = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(value)
+    value = ragged_tensor.convert_to_tensor_or_ragged_tensor(value)
     nested_splits_lists.append(value.nested_row_splits)
-    return value.inner_values
+    return value.flat_values
 
   # Recursion cases
   def recurse(v):
-    return _replace_ragged_with_inner_values(v, nested_splits_lists)
+    return _replace_ragged_with_flat_values(v, nested_splits_lists)
 
   if isinstance(value, list):
     return [recurse(v) for v in value]
diff --git a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
index dcf1feaa696c6f5e516afd9796c661edccf66813..6673192752e613f671c175193fce83fbba60e48d 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_nd_op_test.py
@@ -21,14 +21,18 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherNdOpTest(ragged_test_util.RaggedTensorTestCase,
                            parameterized.TestCase):
 
   DOCSTRING_PARAMS = [[['000', '001'], ['010']],
@@ -185,14 +189,11 @@ class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
   ])  # pyformat: disable
   def testRaggedGatherNd(self, descr, params, indices, expected):
     result = ragged.gather_nd(params, indices)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session() as sess:
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(sess.run(result).tolist(), expected)
+    self.assertRaggedEqual(result, expected)
 
   def testRaggedGatherNdUnknownRankError(self):
+    if context.executing_eagerly():
+      return
     params = ragged.constant([['a', 'b'], ['c', 'd']])
     indices1 = array_ops.placeholder(dtypes.int32, shape=None)
     indices2 = array_ops.placeholder(dtypes.int32, shape=[None])
@@ -208,21 +209,20 @@ class RaggedGatherNdOpTest(test_util.TensorFlowTestCase,
       dict(
           params=['a'],
           indices=0,
-          message='Shape must be at least rank 1 but is rank 0'
-          " for 'GatherNd'"),
+          error=(ValueError, errors.InvalidArgumentError)),
       dict(
           params=ragged.constant_value([['a']]),
           indices=0,
           message='indices.rank must be at least 1.'),
       dict(
           params=['a', 'b', 'c'],
-          indices=ragged.constant([[0]]),
+          indices=ragged.constant_value([[0]]),
           message='The innermost dimension of indices may not be ragged'),
   ])
   def testRaggedGatherNdStaticError(self,
                                     params,
                                     indices,
-                                    message,
+                                    message=None,
                                     error=ValueError):
     with self.assertRaisesRegexp(error, message):
       ragged.gather_nd(params, indices)
diff --git a/tensorflow/python/ops/ragged/ragged_gather_op_test.py b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
index bb52d05c32ea2f4a47ade9cc84ae3415789e3b8b..42efdc8a7d384744041454b5e0bb90e5618b7184 100644
--- a/tensorflow/python/ops/ragged/ragged_gather_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_gather_op_test.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -25,81 +26,74 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorOpsTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedGatherOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExamples(self):
     params = constant_op.constant(['a', 'b', 'c', 'd', 'e'])
     indices = constant_op.constant([3, 1, 2, 1, 0])
     ragged_params = ragged.constant([['a', 'b', 'c'], ['d'], [], ['e']])
     ragged_indices = ragged.constant([[3, 1, 2], [1], [], [0]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, ragged_indices).eval().tolist(),
-          [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
-      self.assertEqual(
-          ragged.gather(ragged_params, indices).eval().tolist(),
-          [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
-      self.assertEqual(
-          ragged.gather(ragged_params, ragged_indices).eval().tolist(),
-          [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
+    self.assertRaggedEqual(
+        ragged.gather(params, ragged_indices),
+        [[b'd', b'b', b'c'], [b'b'], [], [b'a']])
+    self.assertRaggedEqual(
+        ragged.gather(ragged_params, indices),
+        [[b'e'], [b'd'], [], [b'd'], [b'a', b'b', b'c']])
+    self.assertRaggedEqual(
+        ragged.gather(ragged_params, ragged_indices),
+        [[[b'e'], [b'd'], []], [[b'd']], [], [[b'a', b'b', b'c']]])
 
   def testTensorParamsAndTensorIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = [2, 0, 2, 1]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [b'c', b'a', b'c', b'b'])
-      self.assertEqual(type(ragged.gather(params, indices)), ops.Tensor)
+    self.assertRaggedEqual(
+        ragged.gather(params, indices), [b'c', b'a', b'c', b'b'])
+    self.assertIsInstance(ragged.gather(params, indices), ops.Tensor)
 
   def testRaggedParamsAndTensorIndices(self):
     params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
     indices = [2, 0, 2, 1]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[b'f'], [b'a', b'b'], [b'f'], [b'c', b'd', b'e']])
 
   def testTensorParamsAndRaggedIndices(self):
     params = ['a', 'b', 'c', 'd', 'e']
     indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[b'c', b'b'], [b'b', b'c', b'a'], [b'd']])
 
   def testRaggedParamsAndRaggedIndices(self):
     params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
     indices = ragged.constant([[2, 1], [1, 2, 0], [3]])
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
-           [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
-           [[]]]                                        #  [p[3]            ]]
-      )  # pyformat: disable
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[b'f'], [b'c', b'd', b'e']],                # [[p[2], p[1]      ],
+         [[b'c', b'd', b'e'], [b'f'], [b'a', b'b']],  #  [p[1], p[2], p[0]],
+         [[]]]                                        #  [p[3]            ]]
+    )  # pyformat: disable
 
   def testRaggedParamsAndScalarIndices(self):
     params = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
     indices = 1
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(), [b'c', b'd', b'e'])
+    self.assertRaggedEqual(ragged.gather(params, indices), [b'c', b'd', b'e'])
 
   def test3DRaggedParamsAnd2DTensorIndices(self):
     params = ragged.constant([[['a', 'b'], []], [['c', 'd'], ['e'], ['f']],
                               [['g']]])
     indices = [[1, 2], [0, 1], [2, 2]]
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
-           [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
-           [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
-      )  # pyformat: disable
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[[b'c', b'd'], [b'e'], [b'f']], [[b'g']]],            # [[p1, p2],
+         [[[b'a', b'b'], []], [[b'c', b'd'], [b'e'], [b'f']]],  #  [p0, p1],
+         [[[b'g']], [[b'g']]]]                                  #  [p2, p2]]
+    )  # pyformat: disable
 
   def testTensorParamsAnd4DRaggedIndices(self):
     indices = ragged.constant(
@@ -108,30 +102,30 @@ class RaggedTensorOpsTest(test_util.TensorFlowTestCase):
         ragged_rank=2,
         inner_shape=(2,))
     params = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
-    with self.test_session():
-      self.assertEqual(
-          ragged.gather(params, indices).eval().tolist(),
-          [[[[b'd', b'e'], [b'a', b'g']], []],
-           [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
-           [[[b'b', b'a']]]])  # pyformat: disable
+    self.assertRaggedEqual(
+        ragged.gather(params, indices),
+        [[[[b'd', b'e'], [b'a', b'g']], []],
+         [[[b'c', b'b'], [b'b', b'a']], [[b'c', b'f']], [[b'c', b'd']]],
+         [[[b'b', b'a']]]])  # pyformat: disable
 
   def testOutOfBoundsError(self):
     tensor_params = ['a', 'b', 'c']
     tensor_indices = [0, 1, 2]
     ragged_params = ragged.constant([['a', 'b'], ['c']])
     ragged_indices = ragged.constant([[0, 3]])
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[1\] = 3 is not in \[0, 3\)',
-                              ragged.gather(tensor_params, ragged_indices).eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[2\] = 2 is not in \[0, 2\)',
-                              ragged.gather(ragged_params, tensor_indices).eval)
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'indices\[1\] = 3 is not in \[0, 2\)',
-                              ragged.gather(ragged_params, ragged_indices).eval)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 3\)'):
+      self.evaluate(ragged.gather(tensor_params, ragged_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[2\] = 2 is not in \[0, 2\)'):
+      self.evaluate(ragged.gather(ragged_params, tensor_indices))
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'indices\[1\] = 3 is not in \[0, 2\)'):
+      self.evaluate(ragged.gather(ragged_params, ragged_indices))
 
   def testUnknownIndicesRankError(self):
+    if context.executing_eagerly():
+      return
     params = ragged.constant([], ragged_rank=1)
     indices = constant_op.constant([0], dtype=dtypes.int64)
     indices = array_ops.placeholder_with_default(indices, None)
diff --git a/tensorflow/python/ops/ragged/ragged_getitem.py b/tensorflow/python/ops/ragged/ragged_getitem.py
index 9821695046c577627298c413fcfc7716b71f8019..0fa72a36581150cd9408aa7bf12467bfaaab8893 100644
--- a/tensorflow/python/ops/ragged/ragged_getitem.py
+++ b/tensorflow/python/ops/ragged/ragged_getitem.py
@@ -24,7 +24,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_math_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 
@@ -137,8 +136,8 @@ def _ragged_getitem(rt_input, key_list):
   if row_key is array_ops.newaxis:
     inner_rt = _ragged_getitem(rt_input, inner_keys)
     nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
-    return ragged_factory_ops.from_row_splits(inner_rt,
-                                              array_ops.stack([0, nsplits - 1]))
+    return ragged_tensor.RaggedTensor.from_row_splits(
+        inner_rt, array_ops.stack([0, nsplits - 1]))
 
   # Slicing a range of rows: first slice the outer dimension, and then
   # call `_ragged_getitem_inner_dimensions` to handle the inner keys.
@@ -184,7 +183,7 @@ def _slice_ragged_row_dimension(rt_input, row_key):
         axis=0)
     values_start = new_splits[0]
     values_limit = new_splits[-1]
-    return ragged_factory_ops.from_row_splits(
+    return ragged_tensor.RaggedTensor.from_row_splits(
         rt_input.values[values_start:values_limit], new_splits - values_start)
 
   # If there is a slice step (aka a strided slice), then use ragged_gather to
@@ -225,7 +224,8 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list):
   if column_key is array_ops.newaxis:
     inner_rt = _ragged_getitem_inner_dimensions(rt_input, key_list[1:])
     nsplits = array_ops.shape(inner_rt.row_splits, out_type=dtypes.int64)[0]
-    return ragged_factory_ops.from_row_splits(inner_rt, math_ops.range(nsplits))
+    return ragged_tensor.RaggedTensor.from_row_splits(inner_rt,
+                                                      math_ops.range(nsplits))
 
   # Slicing a range of columns in a ragged inner dimension.  We use a
   # recursive call to process the values, and then assemble a RaggedTensor
@@ -239,7 +239,7 @@ def _ragged_getitem_inner_dimensions(rt_input, key_list):
     else:
       # Nontrivial slice: use ragged_gather to extract the indicated slice as
       # a new RaggedTensor (inner_rt), and then recursively process its values.
-      # The splits can be taken from ragged.row_splits(inner_rt).
+      # The splits can be taken from inner_rt.row_splits().
       inner_rt_starts = rt_input.row_splits[:-1]
       inner_rt_limits = rt_input.row_splits[1:]
       if column_key.start is not None and column_key.start != 0:
diff --git a/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
similarity index 83%
rename from tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
rename to tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
index 798d7c3ce81e77d7134752757387d8da27fed411..8b28cac99db29e9ab2a2758db3449413b83cd747 100644
--- a/tensorflow/python/ops/ragged/ragged_map_inner_values_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_flat_values_op_test.py
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Tests for ragged.map_inner_values."""
+"""Tests for ragged.map_flat_values."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from absl.testing import parameterized
-
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
@@ -27,11 +25,12 @@ from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
-                                 parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapInnerValuesOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def assertRaggedMapInnerValuesReturns(self,
                                         op,
@@ -39,20 +38,20 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
                                         args=(),
                                         kwargs=None):
     kwargs = kwargs or {}
-    result = ragged.map_inner_values(op, *args, **kwargs)
+    result = ragged.map_flat_values(op, *args, **kwargs)
     with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected)
+      self.assertRaggedEqual(result, expected)
 
   def testDocStringExamples(self):
     """Test the examples in apply_op_to_ragged_values.__doc__."""
     rt = ragged.constant([[1, 2, 3], [], [4, 5], [6]])
-    v1 = ragged.map_inner_values(array_ops.ones_like, rt)
-    v2 = ragged.map_inner_values(math_ops.multiply, rt, rt)
-    v3 = ragged.map_inner_values(math_ops.add, rt, 5)
+    v1 = ragged.map_flat_values(array_ops.ones_like, rt)
+    v2 = ragged.map_flat_values(math_ops.multiply, rt, rt)
+    v3 = ragged.map_flat_values(math_ops.add, rt, 5)
     with self.test_session():
-      self.assertEqual(v1.eval().tolist(), [[1, 1, 1], [], [1, 1], [1]])
-      self.assertEqual(v2.eval().tolist(), [[1, 4, 9], [], [16, 25], [36]])
-      self.assertEqual(v3.eval().tolist(), [[6, 7, 8], [], [9, 10], [11]])
+      self.assertRaggedEqual(v1, [[1, 1, 1], [], [1, 1], [1]])
+      self.assertRaggedEqual(v2, [[1, 4, 9], [], [16, 25], [36]])
+      self.assertRaggedEqual(v3, [[6, 7, 8], [], [9, 10], [11]])
 
   def testOpWithSingleRaggedTensorArg(self):
     tensor = ragged.constant([[1, 2, 3], [], [4, 5]])
@@ -124,9 +123,8 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
     x0 = [3, 1, 4, 1, 5, 9, 2, 6, 5]
     y0 = [1, 2, 3, 4, 5, 6, 7, 8, 9]
     with self.test_session():
-      self.assertEqual(
-          math_ops.multiply(x0, y0).eval().tolist(),
-          [3, 2, 12, 4, 25, 54, 14, 48, 45])
+      self.assertRaggedEqual(
+          math_ops.multiply(x0, y0), [3, 2, 12, 4, 25, 54, 14, 48, 45])
 
     # ragged_rank=1
     x1 = ragged.constant([[3, 1, 4], [], [1, 5], [9, 2], [6, 5]])
@@ -182,28 +180,25 @@ class RaggedMapInnerValuesOpTest(test_util.TensorFlowTestCase,
     y = ragged.constant([[[3, 1, 4], []], [], [[1, 5]]])
     self.assertRaisesRegexp(ValueError,
                             r'Inputs must have identical ragged splits.*',
-                            ragged.map_inner_values, math_ops.add, x, y)
+                            ragged.map_flat_values, math_ops.add, x, y)
 
   def testRaggedTensorSplitsValueMismatchError(self):
     x = ragged.constant([[3, 1, 4], [], [1, 5]])
     y = ragged.constant([[1], [2, 3], [4, 5]])
     self.assertRaisesRegexp(errors.InvalidArgumentError,
                             r'Inputs must have identical ragged splits.*',
-                            ragged.map_inner_values, math_ops.add, x, y)
+                            ragged.map_flat_values, math_ops.add, x, y)
 
   def testRaggedTensorSplitsMismatchErrorAtRuntime(self):
     splits1 = array_ops.placeholder_with_default(
         constant_op.constant([0, 3, 3, 5], dtypes.int64), None)
     splits2 = array_ops.placeholder_with_default(
         constant_op.constant([0, 1, 3, 5], dtypes.int64), None)
-    x = ragged.from_row_splits([3, 1, 4, 1, 5], splits1)
-    y = ragged.from_row_splits([1, 2, 3, 4, 5], splits2)
-    result = ragged.map_inner_values(math_ops.add, x, y)
-    with self.test_session():
-      self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          r'\[Inputs must have identical ragged splits\] '
-          r'\[Condition x == y did not hold element-wise:\].*', result.eval)
+    x = ragged.RaggedTensor.from_row_splits([3, 1, 4, 1, 5], splits1)
+    y = ragged.RaggedTensor.from_row_splits([1, 2, 3, 4, 5], splits2)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'.*Inputs must have identical ragged splits'):
+      self.evaluate(ragged.map_flat_values(math_ops.add, x, y))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
index 6f3f33b4441caff3f76ff4244b679360fe02793f..49c0996b24f30dd33219d3292446239717bbf487 100644
--- a/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_map_fn_op_test.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 from absl.testing import parameterized
+import numpy as np
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
@@ -26,10 +27,14 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops as mo
 from tensorflow.python.ops import ragged
 from tensorflow.python.ops import string_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedMapOpTest(ragged_test_util.RaggedTensorTestCase,
+                      parameterized.TestCase):
+
   @parameterized.parameters([
       # The following test sets map over a RaggedTensor and apply a
       # transformation that returns with shape:
@@ -52,57 +57,58 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[2, 6], [4.5, 9], [6.5, 13]],
           dtype=dtypes.float32,
+          expected_ragged_rank=0,
       ),
       # [d1, (d2)] -> [d1, (d2)]
       dict(
-          fn=lambda x: x+1,
+          fn=lambda x: x + np.int64(1),
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[2, 3, 4], [5, 6], [7, 8]],
           dtype=dtypes.int64,
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), d3] -> [d1, (d2), d3]
       dict(
-          fn=lambda x: x+1,
+          fn=lambda x: x + np.int64(1),
           elems=[[[1, 2], [3, 4]], [], [[5, 6], [7, 8], [9, 0]]],
           elems_ragged_rank=1,
           expected_ragged_rank=1,
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
           expected_output=[[[2, 3], [4, 5]], [], [[6, 7], [8, 9], [10, 1]]],
       ),
       # [d1, (d2)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.from_row_starts(x, [0]),
+          fn=lambda x: ragged.RaggedTensor.from_row_starts(x, [0]),
           elems=[[1, 2, 3], [4, 5], [6, 7]],
           expected_output=[[[1, 2, 3]], [[4, 5]], [[6, 7]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2), (d3)]
       dict(
-          fn=lambda x: ragged.map_inner_values(mo.add, x, 1),
+          fn=lambda x: ragged.map_flat_values(mo.add, x, 1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[[2, 3, 4]], [[5, 6], [7, 8]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3)] -> [d1, (d2)]
       dict(
           fn=lambda x: ragged.reduce_sum(x, axis=1),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[6], [9, 13]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1, (d3)]
       dict(
           fn=lambda x: ragged.reduce_sum(x, axis=0),
           elems=[[[1, 2, 3]], [[4, 5], [6, 7]]],
           expected_output=[[1, 2, 3], [10, 12]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1, (d2), (d3)] -> [d1]
       dict(
@@ -116,27 +122,26 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           fn=mo.range,
           elems=[4, 0, 2],
           expected_output=[[0, 1, 2, 3], [], [0, 1]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=1),
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=1),
       ),
       # [d1] -> [d1, (d2), (d3)]
       dict(
           fn=lambda x: ragged.range(mo.range(x)),
           elems=[5, 0, 3],
-          expected_output=[
-              [[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [], [[], [0], [0, 1]]
-          ],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=2),
+          expected_output=[[[], [0], [0, 1], [0, 1, 2], [0, 1, 2, 3]], [],
+                           [[], [0], [0, 1]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=2),
       ),
       # [d1, (d2), (d3), (d4a), (d5)] ->  [d1, (d2), (d3), (d4b), (d5)]
       dict(
-          fn=lambda x: ragged.add(x, 1),
+          fn=lambda x: x + np.int64(1),
           elems=[[[[[1, 2, 3]], [[4], [5]]]], [[[[6, 7]]], [[[8], []]]]],
-          expected_output=[[[[[2, 3, 4]], [[5], [6]]]],
-                           [[[[7, 8]]], [[[9], []]]]],
-          result_dtype=ragged.RaggedTensorType(dtype=dtypes.int64,
-                                               ragged_rank=4),
+          expected_output=[[[[[2, 3, 4]], [[5], [6]]]], [[[[7, 8]]], [[[9],
+                                                                       []]]]],
+          result_dtype=ragged.RaggedTensorType(
+              dtype=dtypes.int64, ragged_rank=4),
       ),
   ])
 
@@ -158,16 +163,12 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     expected_rt = ragged.constant(
         expected_output, ragged_rank=expected_ragged_rank)
-    with self.test_session():
-      if ragged.is_ragged(expected_output):
-        self.assertEqual(output.ragged_rank, expected_rt.ragged_rank)
-      output_values = output.eval()
-      self.assertAllEqual(expected_output, output_values.tolist())
+    self.assertRaggedEqual(expected_rt, output)
 
   def testRaggedMapOnStructure(self):
     batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+    robin = ragged.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
@@ -180,21 +181,20 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         dtype=dtypes.int32,
     )
 
-    with self.test_session():
-      self.assertAllEqual(output.eval().tolist(), [66, 44, 198])
+    self.assertRaggedEqual(output, [66, 44, 198])
 
   # Test mapping over a dict of RTs can produce a dict of RTs.
   def testRaggedMapOnStructure_RaggedOutputs(self):
     batman = ragged.constant([[1, 2, 3], [4], [5, 6, 7]])
     # [[10, 20, 30], [40], [50, 60, 70]]
-    robin = ragged.map_inner_values(mo.multiply, batman, 10)
+    robin = ragged.map_flat_values(mo.multiply, batman, 10)
 
     features = {'batman': batman, 'robin': robin}
 
     def _increment(f):
       return {
-          'batman': ragged.add(f['batman'], 1),
-          'robin': ragged.add(f['robin'], 1),
+          'batman': f['batman'] + 1,
+          'robin': f['robin'] + 1,
       }
 
     output = ragged.map_fn(
@@ -209,17 +209,13 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         },
     )
 
-    with self.test_session():
-      self.assertAllEqual(output['batman'].eval().tolist(),
-                          [[2, 3, 4], [5], [6, 7, 8]])
-      self.assertAllEqual(output['robin'].eval().tolist(),
-                          [[11, 21, 31], [41], [51, 61, 71]])
+    self.assertRaggedEqual(output['batman'], [[2, 3, 4], [5], [6, 7, 8]])
+    self.assertRaggedEqual(output['robin'], [[11, 21, 31], [41], [51, 61, 71]])
 
   def testZip(self):
     x = ragged.constant([[10, 20], [30, 40], [50, 60], [70], [80, 90, 100]],
                         dtypes.int64)
-    y = array_ops.expand_dims(
-        mo.range(ragged.nrows(x), dtype=dtypes.int64), axis=1)
+    y = array_ops.expand_dims(mo.range(x.nrows(), dtype=dtypes.int64), axis=1)
 
     def _zip(foo):
       y_val, x_val = foo
@@ -231,11 +227,9 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         dtype=ragged.RaggedTensorType(dtype=dtypes.int64, ragged_rank=1),
         infer_shape=False)
 
-    with self.test_session():
-      result = output.eval().tolist()
-      self.assertAllEqual(
-          result, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
-                   [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
+    self.assertRaggedEqual(
+        output, [[[0, 10], [0, 20]], [[1, 30], [1, 40]], [[2, 50], [2, 60]],
+                 [[3, 70]], [[4, 80], [4, 90], [4, 100]]])
 
   def testBatchGather(self):
     tokens = ragged.constant([['hello', '.', 'there'], ['merhaba'],
@@ -253,10 +247,8 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         dtype=ragged.RaggedTensorType(dtype=dtypes.string, ragged_rank=1),
         infer_shape=False)
 
-    with self.test_session():
-      self.assertAllEqual(
-          out.eval().tolist(),
-          [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
+    self.assertRaggedEqual(
+        out, [[b'hello', b'there'], [b'merhaba'], [b'bonjour', b'ca va']])
 
   def testMismatchRaggedRank(self):
     elems = ragged.constant([[[1, 2, 3]], [[4, 5], [6, 7]]])
@@ -270,7 +262,7 @@ class RaggedMapOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testMismatchRaggedRank2(self):
     elems = ragged.constant([[1, 2, 3], [4, 5], [6, 7]])
-    fn = lambda x: ragged.from_row_starts(x, [0])
+    fn = lambda x: ragged.RaggedTensor.from_row_starts(x, [0])
     with self.assertRaisesWithLiteralMatch(
         ValueError, r'The declared ragged rank (10) mismatches the result (1)'):
       _ = ragged.map_fn(
diff --git a/tensorflow/python/ops/ragged/ragged_map_ops.py b/tensorflow/python/ops/ragged/ragged_map_ops.py
index fafa23b8dcbbf128723c1b8e51611a958087fdeb..af40352b1d02fe8ccce242d31fb33e2f8a21f1ce 100644
--- a/tensorflow/python/ops/ragged/ragged_map_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_map_ops.py
@@ -27,12 +27,11 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import tensor_shape
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import tensor_array_ops
 from tensorflow.python.ops import variable_scope as vs
-from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import nest
@@ -216,8 +215,8 @@ def map_fn(fn,
         varscope_caching_device_was_none = True
 
     elems_flat = [
-        ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-            elem, name="elem") for elem in elems_flat
+        ragged_tensor.convert_to_tensor_or_ragged_tensor(elem, name="elem")
+        for elem in elems_flat
     ]
 
     # We can either infer the output, or we can assume that it will be the same
@@ -226,7 +225,7 @@ def map_fn(fn,
 
     # Find the number of iterations, n may be known statically.
     if isinstance(elems_flat[0], ragged_tensor.RaggedTensor):
-      n = ragged_array_ops.nrows(elems_flat[0], out_type=dtypes.int32)
+      n = elems_flat[0].nrows(out_type=dtypes.int32)
     else:
       static_shape = elems_flat[0].shape
       if static_shape.ndims is not None and static_shape.ndims < 1:
@@ -236,7 +235,8 @@ def map_fn(fn,
         else:
           raise ValueError(
               "elements in elems must be 1+ dimensional Tensors, not scalars")
-      n = static_shape[0].value or array_ops.shape(elems_flat[0])[0]
+      n = (tensor_shape.dimension_value(static_shape[0]) or
+           array_ops.shape(elems_flat[0])[0])
 
     # Create a flat list of TAs.
 
@@ -334,7 +334,7 @@ def map_fn(fn,
 class _RaggedTensorComponents(
     collections.namedtuple(
         "_RaggedTensorComponents",
-        ["inner_values", "nested_row_lengths", "outer_row_length"])):
+        ["flat_values", "nested_row_lengths", "outer_row_length"])):
   """A namedtuple of components which represent a `RaggedTensor`.
 
   _RaggedTensorComponents is a list of components which can be used to create a
@@ -344,7 +344,7 @@ class _RaggedTensorComponents(
 
   The following are a list of components for a `RaggedTensor`:
 
-  inner_values: The flat and inner values of a RaggedTensor. This could be
+  flat_values: The flat and inner values of a RaggedTensor. This could be
     a `Tensor`, a `TensorArray`, or a data type.
   nested_row_lengths: a tuple containing the row lengths of each rank. The
     elements of the tuple could be `Tensor`s or `TensorArray`s.
@@ -357,12 +357,12 @@ class _RaggedTensorComponents(
 
 
 def _concat_ragged_tensor_components(rt_ta):
-  inner_values = rt_ta.inner_values.concat()
+  flat_values = rt_ta.flat_values.concat()
   nested_row_lengths = tuple(
       row_lengths_ta.concat() for row_lengths_ta in rt_ta.nested_row_lengths)
   outer_row_length = rt_ta.outer_row_length.concat()
   return _RaggedTensorComponents(
-      inner_values=inner_values,
+      flat_values=flat_values,
       nested_row_lengths=nested_row_lengths,
       outer_row_length=outer_row_length)
 
@@ -374,17 +374,17 @@ def _maybe_decompose_tensor(rt):
 
   # The three component pieces we need:
   # - inner values
-  inner_values = rt.inner_values
+  flat_values = rt.flat_values
 
   # - row_splits of the RT
   splits = rt.nested_row_splits
   nested_row_lengths = tuple(split[1:] - split[:-1] for split in splits)
 
   # - outer row length
-  outer_row_length = array_ops.expand_dims(ragged_array_ops.nrows(rt), axis=0)
+  outer_row_length = array_ops.expand_dims(rt.nrows(), axis=0)
 
   return _RaggedTensorComponents(
-      inner_values=inner_values,
+      flat_values=flat_values,
       nested_row_lengths=nested_row_lengths,
       outer_row_length=outer_row_length,
   )
@@ -395,11 +395,12 @@ def _maybe_recompose_tensor(t):
   if not isinstance(t, _RaggedTensorComponents):
     return t
 
-  values = t.inner_values
+  values = t.flat_values
   nested_row_lengths = tuple(t.nested_row_lengths)
   for nested_row_length in reversed(nested_row_lengths):
-    values = ragged_factory_ops.from_row_lengths(values, nested_row_length)
-  return ragged_factory_ops.from_row_lengths(values, t.outer_row_length)
+    values = ragged_tensor.RaggedTensor.from_row_lengths(
+        values, nested_row_length)
+  return ragged_tensor.RaggedTensor.from_row_lengths(values, t.outer_row_length)
 
 
 def _maybe_decompose_dtype(d):
@@ -408,7 +409,7 @@ def _maybe_decompose_dtype(d):
     return d
 
   result = _RaggedTensorComponents(
-      inner_values=d.dtype,
+      flat_values=d.dtype,
       nested_row_lengths=tuple(dtypes.int64 for i in range(d.ragged_rank - 1)),
       outer_row_length=dtypes.int64,
   )
@@ -435,10 +436,13 @@ def _convert_declared(fn_output_flat, output_declared):
               "The declared ragged rank (%d) mismatches the result (1)" %
               declared.ragged_rank)
 
-        row_length = array_ops.expand_dims(
-            ragged_array_ops.nrows(current), axis=0)
+        if isinstance(current, ragged_tensor.RaggedTensor):
+          nrows = current.nrows()
+        else:
+          nrows = array_ops.shape(current, out_type=dtypes.int64)[0]
+        row_length = array_ops.expand_dims(nrows, axis=0)
         rt = _RaggedTensorComponents(
-            inner_values=current,
+            flat_values=current,
             nested_row_lengths=(),
             outer_row_length=row_length)
         yield rt
diff --git a/tensorflow/python/ops/ragged/ragged_math_ops.py b/tensorflow/python/ops/ragged/ragged_math_ops.py
index 857b8dbfa361901108bf88949ac167a277991e36..92f82be84aca06ae723f00103dccbdeb5c64371f 100644
--- a/tensorflow/python/ops/ragged/ragged_math_ops.py
+++ b/tensorflow/python/ops/ragged/ragged_math_ops.py
@@ -18,6 +18,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import numpy as np
+
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
@@ -25,7 +27,6 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
 from tensorflow.python.ops import gen_ragged_math_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
 from tensorflow.python.ops.ragged import ragged_functional_ops
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.ops.ragged import ragged_util
@@ -97,8 +98,8 @@ def range(starts, limits=None, deltas=1, dtype=None, name=None):
           [dtypes.int32, dtypes.int64, dtypes.float32, dtypes.float64])
 
     result = gen_ragged_math_ops.ragged_range(starts, limits, deltas, name=name)
-    return ragged_factory_ops.from_row_splits(result.rt_dense_values,
-                                              result.rt_nested_splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(result.rt_dense_values,
+                                                      result.rt_nested_splits)
 
 
 def _infer_matching_dtype(tensors, dtype_hierarchy):
@@ -143,8 +144,11 @@ Computes the %(combination)s along segments of a RaggedTensor.
 """
 
 
-def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
-                              num_segments, name=None):
+def _ragged_segment_aggregate(unsorted_segment_op,
+                              data,
+                              segment_ids,
+                              num_segments,
+                              name=None):
   """Aggregates along segments of a RaggedTensor using `unsorted_segment_op`.
 
   Returns a RaggedTensor `output` with `num_segments` rows, where the row
@@ -181,9 +185,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
 
   with ops.name_scope(name, 'RaggedSegment',
                       [data, segment_ids, num_segments]) as name:
-    data = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
-        data, name='data')
-    segment_ids = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    data = ragged_tensor.convert_to_tensor_or_ragged_tensor(data, name='data')
+    segment_ids = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         segment_ids, name='segment_ids')
 
     if ragged_tensor.is_ragged(segment_ids):
@@ -212,12 +215,11 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
     assert output_row_lengths.dtype == dtypes.int64
 
     # Build the splits tensor for the output RaggedTensor.
-    output_splits = array_ops.concat(
-        [
-            array_ops.zeros([1], dtypes.int64),
-            math_ops.cumsum(output_row_lengths)
-        ],
-        axis=0)
+    output_splits = array_ops.concat([
+        array_ops.zeros([1], dtypes.int64),
+        math_ops.cumsum(output_row_lengths)
+    ],
+                                     axis=0)
 
     # For each row in `data`, find the start & limit position where that row's
     # values will be aggregated in output.values.
@@ -234,7 +236,8 @@ def _ragged_segment_aggregate(unsorted_segment_op, data, segment_ids,
     output_values = _ragged_segment_aggregate(unsorted_segment_op, data.values,
                                               data_val_to_out_val_index,
                                               output_splits[-1])
-    return ragged_factory_ops.from_row_splits(output_values, output_splits)
+    return ragged_tensor.RaggedTensor.from_row_splits(output_values,
+                                                      output_splits)
 
 
 def segment_sum(data, segment_ids, num_segments, name=None):
@@ -270,11 +273,11 @@ def segment_mean(data, segment_ids, num_segments, name=None):
   with ops.name_scope(name, 'RaggedSegmentMean',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
-    ones = ragged_factory_ops.from_nested_row_splits(
-        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_factory_ops.from_nested_row_splits(
-        total.inner_values / count.inner_values, total.nested_row_splits)
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        total.flat_values / count.flat_values, total.nested_row_splits)
 
 
 def segment_sqrt_n(data, segment_ids, num_segments, name=None):
@@ -282,11 +285,11 @@ def segment_sqrt_n(data, segment_ids, num_segments, name=None):
   with ops.name_scope(name, 'RaggedSegmentSqrtN',
                       [data, segment_ids, num_segments]):
     total = segment_sum(data, segment_ids, num_segments)
-    ones = ragged_factory_ops.from_nested_row_splits(
-        array_ops.ones_like(data.inner_values), data.nested_row_splits)
+    ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+        array_ops.ones_like(data.flat_values), data.nested_row_splits)
     count = segment_sum(ones, segment_ids, num_segments)
-    return ragged_factory_ops.from_nested_row_splits(
-        total.inner_values / math_ops.sqrt(count.inner_values),
+    return ragged_tensor.RaggedTensor.from_nested_row_splits(
+        total.flat_values / math_ops.sqrt(count.flat_values),
         total.nested_row_splits)
 
 
@@ -311,7 +314,7 @@ _set_ragged_segment_docstring(segment_sqrt_n, 'sum divided by sqrt(N)',
 _RAGGED_REDUCE_DOCSTRING = """\
 Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
 
-  Reduces `rt_input` along the dimensions given in `axis` by taking the
+  Reduces `input_tensor` along the dimensions given in `axis` by taking the
   %(combination)s of values.  If a reduced dimension has no elements for
   some index, then the value for that index will be %(default)s.
 
@@ -319,18 +322,18 @@ Computes the %(combination)s of elements across dimensions of a `RaggedTensor`.
   `axis` is not specified, then all dimensions are reduced, and a scalar
   value is returned.
   Args:
-    rt_input: A `RaggedTensor` containing the values to be %(combined)s.
+    input_tensor: A `RaggedTensor` containing the values to be %(combined)s.
     axis: The dimensions to reduce.  May be `None` (to reduce all axes), an
       `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce
       a given set of axes), or a `Tensor` with a constant value.  Must be in
-      the range `[0, rt_input.rank]`.
+      the range `[0, input_tensor.rank]`.
     name: A name prefix for the returned tensor (optional).
   Returns:
     A `RaggedTensor` containing the %(combined)s values.  The returned tensor
     has the same dtype as `data`, and its shape is given by removing the
-    dimensions specified in `axis` from `rt_input.shape`.  The `ragged_rank`
+    dimensions specified in `axis` from `input_tensor.shape`.  The `ragged_rank`
     of the returned tensor is given by substracting any ragged dimensions
-    specified in `axis` from `rt_input.ragged_rank`.
+    specified in `axis` from `input_tensor.ragged_rank`.
   Raises:
     ValueError: If `axis` contains a `Tensor` whose value is not constant.
   ####Example:
@@ -387,7 +390,11 @@ _RAGGED_REDUCE_ANY_EXAMPLE = """
 """
 
 
-def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
+def _ragged_reduce_aggregate(reduce_op,
+                             unsorted_segment_op,
+                             rt_input,
+                             axis,
+                             keepdims,
                              name=None):
   """Aggregates across axes of a RaggedTensor using the given `Tensor` ops.
 
@@ -412,6 +419,7 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
       `int` (to reduce a single axis), a `list` or `tuple` of `int` (to reduce a
       given set of axes), or a `Tensor` with a constant value.  Must be in the
       range `[0, rt_input.rank)`.
+    keepdims: If true, retains reduced dimensions with length 1.
     name: A name prefix for the returned tensor (optional).
 
   Returns:
@@ -426,14 +434,19 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
   if not ragged_tensor.is_ragged(rt_input):
     return reduce_op(rt_input, axis, name=name)
 
+  if keepdims:
+    raise ValueError('keepdims=True is not supported for RaggedTensors.')
+
   if isinstance(axis, ops.Tensor):
     axis = tensor_util.constant_value(axis)
     if axis is None:
       raise ValueError('axis must be known at graph construction time.')
+    if isinstance(axis, np.ndarray):
+      axis = axis.tolist()
 
   # When reducing all axes, just ignore splits & reduce the inner values.
   if axis is None:
-    return reduce_op(rt_input.inner_values, None, name=name)
+    return reduce_op(rt_input.flat_values, None, name=name)
 
   with ops.name_scope(name, 'RaggedReduce', [rt_input, axis]):
     if isinstance(axis, (tuple, list)):
@@ -448,13 +461,13 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
         # once will probably require a nontrivial c++ op.
         axis = sorted(axis)
         inner_reduced = _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                                 rt_input, axis[-1])
+                                                 rt_input, axis[-1], keepdims)
         return _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                        inner_reduced, axis[:-1])
+                                        inner_reduced, axis[:-1], keepdims)
 
     axis = ragged_util.get_positive_axis(axis, rt_input.shape.ndims)
 
-    rt_input = ragged_factory_ops.convert_to_tensor_or_ragged_tensor(
+    rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(
         rt_input, name='rt_input')
 
     if axis == 0:
@@ -476,69 +489,74 @@ def _ragged_reduce_aggregate(reduce_op, unsorted_segment_op, rt_input, axis,
       #     sum_{j} rt_input [i_0, ..., i_[axis-1], j, i_axis+1], ..., i_N]
       return rt_input.with_values(
           _ragged_reduce_aggregate(reduce_op, unsorted_segment_op,
-                                   rt_input.values, axis - 1))
+                                   rt_input.values, axis - 1, keepdims))
 
 
-def reduce_sum(rt_input, axis=None, name=None):
+def reduce_sum(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_sum,
-                                  math_ops.unsorted_segment_sum, rt_input, axis,
-                                  name or 'RaggedReduceSum')
+                                  math_ops.unsorted_segment_sum, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceSum')
 
 
-def reduce_prod(rt_input, axis=None, name=None):
+def reduce_prod(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_prod,
-                                  math_ops.unsorted_segment_prod, rt_input,
-                                  axis, name or 'RaggedReduceProd')
+                                  math_ops.unsorted_segment_prod, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceProd')
 
 
-def reduce_min(rt_input, axis=None, name=None):
+def reduce_min(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_min,
-                                  math_ops.unsorted_segment_min, rt_input, axis,
-                                  name or 'RaggedReduceMin')
+                                  math_ops.unsorted_segment_min, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMin')
 
 
-def reduce_max(rt_input, axis=None, name=None):
+def reduce_max(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
   return _ragged_reduce_aggregate(math_ops.reduce_max,
-                                  math_ops.unsorted_segment_max, rt_input, axis,
-                                  name or 'RaggedReduceMax')
+                                  math_ops.unsorted_segment_max, input_tensor,
+                                  axis, keepdims, name or 'RaggedReduceMax')
 
 
-def reduce_mean(rt_input, axis=None, name=None):
+def reduce_mean(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceMean', [rt_input, axis]):
-    total = reduce_sum(rt_input, axis)
-    if ragged_tensor.is_ragged(rt_input):
-      ones = ragged_factory_ops.from_nested_row_splits(
-          array_ops.ones_like(rt_input.inner_values),
-          rt_input.nested_row_splits)
+  with ops.name_scope(name, 'RaggedReduceMean', [input_tensor, axis]):
+    total = reduce_sum(input_tensor, axis, keepdims)
+    if ragged_tensor.is_ragged(input_tensor):
+      ones = ragged_tensor.RaggedTensor.from_nested_row_splits(
+          array_ops.ones_like(input_tensor.flat_values),
+          input_tensor.nested_row_splits)
     else:
-      ones = array_ops.ones_like(rt_input)
-    count = reduce_sum(ones, axis)
+      ones = array_ops.ones_like(input_tensor)
+    count = reduce_sum(ones, axis, keepdims)
     if ragged_tensor.is_ragged(total):
-      return ragged_factory_ops.from_nested_row_splits(
-          total.inner_values / count.inner_values, total.nested_row_splits)
+      return ragged_tensor.RaggedTensor.from_nested_row_splits(
+          total.flat_values / count.flat_values, total.nested_row_splits)
     else:
       return total / count
 
 
-def _cast(rt_input, dtype):
-  return ragged_functional_ops.map_inner_values(math_ops.cast, rt_input, dtype)
+def _cast(input_tensor, dtype):
+  return ragged_functional_ops.map_flat_values(math_ops.cast, input_tensor,
+                                               dtype)
 
 
-def reduce_all(rt_input, axis=None, name=None):
+def reduce_all(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceAll', [rt_input, axis]):
-    return _cast(reduce_prod(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+  with ops.name_scope(name, 'RaggedReduceAll', [input_tensor, axis]):
+    return _cast(
+        reduce_prod(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
 
 
-def reduce_any(rt_input, axis=None, name=None):
+def reduce_any(input_tensor, axis=None, keepdims=None, name=None):
   """For docs, see: _RAGGED_REDUCE_DOCSTRING."""
-  with ops.name_scope(name, 'RaggedReduceAny', [rt_input, axis]):
-    return _cast(reduce_sum(_cast(rt_input, dtypes.int32), axis), dtypes.bool)
+  with ops.name_scope(name, 'RaggedReduceAny', [input_tensor, axis]):
+    return _cast(
+        reduce_sum(_cast(input_tensor, dtypes.int32), axis, keepdims),
+        dtypes.bool)
 
 
 def _set_ragged_reduce_docstring(func, combination, combined, default, example):
@@ -554,9 +572,11 @@ _set_ragged_reduce_docstring(reduce_sum, 'sum', 'summed', '0',
 _set_ragged_reduce_docstring(reduce_prod, 'product', 'multiplied', '1',
                              _RAGGED_REDUCE_PROD_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_min, 'minimum', 'minimized',
-                             '`rt_input.dtype.min`', _RAGGED_REDUCE_MIN_EXAMPLE)
+                             '`input_tensor.dtype.min`',
+                             _RAGGED_REDUCE_MIN_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_max, 'maximum', 'maximized',
-                             '`rt_input.dtype.max`', _RAGGED_REDUCE_MAX_EXAMPLE)
+                             '`input_tensor.dtype.max`',
+                             _RAGGED_REDUCE_MAX_EXAMPLE)
 _set_ragged_reduce_docstring(reduce_mean, 'mean', 'averaged', 'NaN',
                              _RAGGED_REDUCE_MEAN_EXAMPLE)
 
diff --git a/tensorflow/python/ops/ragged/ragged_operators.py b/tensorflow/python/ops/ragged/ragged_operators.py
index 223ba0d2e7f050650a0849fdb4987afb38cebd2e..7654fa22b1e3a6d783a7a3295bca2d1a0b2ea757 100644
--- a/tensorflow/python/ops/ragged/ragged_operators.py
+++ b/tensorflow/python/ops/ragged/ragged_operators.py
@@ -18,7 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.ops.ragged import ragged_elementwise_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_getitem
 from tensorflow.python.ops.ragged import ragged_tensor
 from tensorflow.python.util import tf_decorator
@@ -33,40 +33,39 @@ def _right(operator):
 ragged_tensor.RaggedTensor.__getitem__ = ragged_getitem.ragged_tensor_getitem
 
 # Ordering operators
-ragged_tensor.RaggedTensor.__ge__ = ragged_elementwise_ops.greater_equal
-ragged_tensor.RaggedTensor.__gt__ = ragged_elementwise_ops.greater
-ragged_tensor.RaggedTensor.__le__ = ragged_elementwise_ops.less_equal
-ragged_tensor.RaggedTensor.__lt__ = ragged_elementwise_ops.less
+ragged_tensor.RaggedTensor.__ge__ = math_ops.greater_equal
+ragged_tensor.RaggedTensor.__gt__ = math_ops.greater
+ragged_tensor.RaggedTensor.__le__ = math_ops.less_equal
+ragged_tensor.RaggedTensor.__lt__ = math_ops.less
 
 # Logical operators
-ragged_tensor.RaggedTensor.__and__ = ragged_elementwise_ops.logical_and
-ragged_tensor.RaggedTensor.__rand__ = _right(ragged_elementwise_ops.logical_and)
-ragged_tensor.RaggedTensor.__invert__ = ragged_elementwise_ops.logical_not
-ragged_tensor.RaggedTensor.__ror__ = _right(ragged_elementwise_ops.logical_or)
-ragged_tensor.RaggedTensor.__or__ = ragged_elementwise_ops.logical_or
-ragged_tensor.RaggedTensor.__xor__ = ragged_elementwise_ops.logical_xor
-ragged_tensor.RaggedTensor.__rxor__ = _right(ragged_elementwise_ops.logical_xor)
+ragged_tensor.RaggedTensor.__and__ = math_ops.logical_and
+ragged_tensor.RaggedTensor.__rand__ = _right(math_ops.logical_and)
+ragged_tensor.RaggedTensor.__invert__ = math_ops.logical_not
+ragged_tensor.RaggedTensor.__ror__ = _right(math_ops.logical_or)
+ragged_tensor.RaggedTensor.__or__ = math_ops.logical_or
+ragged_tensor.RaggedTensor.__xor__ = math_ops.logical_xor
+ragged_tensor.RaggedTensor.__rxor__ = _right(math_ops.logical_xor)
 
 # Arithmetic operators
-ragged_tensor.RaggedTensor.__abs__ = ragged_elementwise_ops.abs
-ragged_tensor.RaggedTensor.__add__ = ragged_elementwise_ops.add
-ragged_tensor.RaggedTensor.__radd__ = _right(ragged_elementwise_ops.add)
-ragged_tensor.RaggedTensor.__div__ = ragged_elementwise_ops.div
-ragged_tensor.RaggedTensor.__rdiv__ = _right(ragged_elementwise_ops.div)
-ragged_tensor.RaggedTensor.__floordiv__ = ragged_elementwise_ops.floordiv
-ragged_tensor.RaggedTensor.__rfloordiv__ = _right(
-    ragged_elementwise_ops.floordiv)
-ragged_tensor.RaggedTensor.__mod__ = ragged_elementwise_ops.floormod
-ragged_tensor.RaggedTensor.__rmod__ = _right(ragged_elementwise_ops.floormod)
-ragged_tensor.RaggedTensor.__mul__ = ragged_elementwise_ops.multiply
-ragged_tensor.RaggedTensor.__rmul__ = _right(ragged_elementwise_ops.multiply)
-ragged_tensor.RaggedTensor.__neg__ = ragged_elementwise_ops.negative
-ragged_tensor.RaggedTensor.__pow__ = ragged_elementwise_ops.pow
-ragged_tensor.RaggedTensor.__rpow__ = _right(ragged_elementwise_ops.pow)
-ragged_tensor.RaggedTensor.__sub__ = ragged_elementwise_ops.subtract
-ragged_tensor.RaggedTensor.__rsub__ = _right(ragged_elementwise_ops.subtract)
-ragged_tensor.RaggedTensor.__truediv__ = ragged_elementwise_ops.truediv
-ragged_tensor.RaggedTensor.__rtruediv__ = _right(ragged_elementwise_ops.truediv)
+ragged_tensor.RaggedTensor.__abs__ = math_ops.abs
+ragged_tensor.RaggedTensor.__add__ = math_ops.add
+ragged_tensor.RaggedTensor.__radd__ = _right(math_ops.add)
+ragged_tensor.RaggedTensor.__div__ = math_ops.div
+ragged_tensor.RaggedTensor.__rdiv__ = _right(math_ops.div)
+ragged_tensor.RaggedTensor.__floordiv__ = math_ops.floordiv
+ragged_tensor.RaggedTensor.__rfloordiv__ = _right(math_ops.floordiv)
+ragged_tensor.RaggedTensor.__mod__ = math_ops.floormod
+ragged_tensor.RaggedTensor.__rmod__ = _right(math_ops.floormod)
+ragged_tensor.RaggedTensor.__mul__ = math_ops.multiply
+ragged_tensor.RaggedTensor.__rmul__ = _right(math_ops.multiply)
+ragged_tensor.RaggedTensor.__neg__ = math_ops.negative
+ragged_tensor.RaggedTensor.__pow__ = math_ops.pow
+ragged_tensor.RaggedTensor.__rpow__ = _right(math_ops.pow)
+ragged_tensor.RaggedTensor.__sub__ = math_ops.subtract
+ragged_tensor.RaggedTensor.__rsub__ = _right(math_ops.subtract)
+ragged_tensor.RaggedTensor.__truediv__ = math_ops.truediv
+ragged_tensor.RaggedTensor.__rtruediv__ = _right(math_ops.truediv)
 
 
 # Dummy methods
diff --git a/tensorflow/python/ops/ragged/ragged_operators_test.py b/tensorflow/python/ops/ragged/ragged_operators_test.py
index a99d788ef79f5893eb09cad2b9f336c435704783..78bb37c341e9261a972445cbd34f8e1b0fc674d9 100644
--- a/tensorflow/python/ops/ragged/ragged_operators_test.py
+++ b/tensorflow/python/ops/ragged/ragged_operators_test.py
@@ -20,78 +20,71 @@ from __future__ import print_function
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedElementwiseOpsTest(test_util.TensorFlowTestCase):
-  # @TODO(edloper): Test right-handed versions of operators once we add
-  # broadcasting support for elementwise ops.
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedElementwiseOpsTest(ragged_test_util.RaggedTensorTestCase):
 
   def testOrderingOperators(self):
     x = ragged.constant([[1, 5], [3]])
     y = ragged.constant([[4, 5], [1]])
-    with self.test_session():
-      self.assertEqual((x > y).eval().tolist(), [[False, False], [True]])
-      self.assertEqual((x >= y).eval().tolist(), [[False, True], [True]])
-      self.assertEqual((x < y).eval().tolist(), [[True, False], [False]])
-      self.assertEqual((x <= y).eval().tolist(), [[True, True], [False]])
-
-  def assertEqual(self, a, b):
-    if a != b:
-      print('%30s %s' % (b, a))
+    self.assertRaggedEqual((x > y), [[False, False], [True]])
+    self.assertRaggedEqual((x >= y), [[False, True], [True]])
+    self.assertRaggedEqual((x < y), [[True, False], [False]])
+    self.assertRaggedEqual((x <= y), [[True, True], [False]])
 
   def testArithmeticOperators(self):
     x = ragged.constant([[1.0, -2.0], [8.0]])
     y = ragged.constant([[4.0, 4.0], [2.0]])
-    with self.test_session():
-      self.assertEqual(abs(x).eval().tolist(), [[1.0, 2.0], [8.0]])
+    self.assertRaggedEqual(abs(x), [[1.0, 2.0], [8.0]])
 
-      self.assertEqual((-x).eval().tolist(), [[-1.0, 2.0], [-8.0]])
+    self.assertRaggedEqual((-x), [[-1.0, 2.0], [-8.0]])
 
-      self.assertEqual((x + y).eval().tolist(), [[5.0, 2.0], [10.0]])
-      self.assertEqual((3.0 + y).eval().tolist(), [[7.0, 7.0], [5.0]])
-      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+    self.assertRaggedEqual((x + y), [[5.0, 2.0], [10.0]])
+    self.assertRaggedEqual((3.0 + y), [[7.0, 7.0], [5.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
 
-      self.assertEqual((x - y).eval().tolist(), [[-3.0, -6.0], [6.0]])
-      self.assertEqual((3.0 - y).eval().tolist(), [[-1.0, -1.0], [1.0]])
-      self.assertEqual((x + 3.0).eval().tolist(), [[4.0, 1.0], [11.0]])
+    self.assertRaggedEqual((x - y), [[-3.0, -6.0], [6.0]])
+    self.assertRaggedEqual((3.0 - y), [[-1.0, -1.0], [1.0]])
+    self.assertRaggedEqual((x + 3.0), [[4.0, 1.0], [11.0]])
 
-      self.assertEqual((x * y).eval().tolist(), [[4.0, -8.0], [16.0]])
-      self.assertEqual((3.0 * y).eval().tolist(), [[12.0, 12.0], [6.0]])
-      self.assertEqual((x * 3.0).eval().tolist(), [[3.0, -6.0], [24.0]])
+    self.assertRaggedEqual((x * y), [[4.0, -8.0], [16.0]])
+    self.assertRaggedEqual((3.0 * y), [[12.0, 12.0], [6.0]])
+    self.assertRaggedEqual((x * 3.0), [[3.0, -6.0], [24.0]])
 
-      self.assertEqual((x / y).eval().tolist(), [[0.25, -0.5], [4.0]])
-      self.assertEqual((y / x).eval().tolist(), [[4.0, -2.0], [0.25]])
-      self.assertEqual((2.0 / y).eval().tolist(), [[0.5, 0.5], [1.0]])
-      self.assertEqual((x / 2.0).eval().tolist(), [[0.5, -1.0], [4.0]])
+    self.assertRaggedEqual((x / y), [[0.25, -0.5], [4.0]])
+    self.assertRaggedEqual((y / x), [[4.0, -2.0], [0.25]])
+    self.assertRaggedEqual((2.0 / y), [[0.5, 0.5], [1.0]])
+    self.assertRaggedEqual((x / 2.0), [[0.5, -1.0], [4.0]])
 
-      self.assertEqual((x // y).eval().tolist(), [[0.0, -1.0], [4.0]])
-      self.assertEqual((y // x).eval().tolist(), [[4.0, -2.0], [0.0]])
-      self.assertEqual((2.0 // y).eval().tolist(), [[0.0, 0.0], [1.0]])
-      self.assertEqual((x // 2.0).eval().tolist(), [[0.0, -1.0], [4.0]])
+    self.assertRaggedEqual((x // y), [[0.0, -1.0], [4.0]])
+    self.assertRaggedEqual((y // x), [[4.0, -2.0], [0.0]])
+    self.assertRaggedEqual((2.0 // y), [[0.0, 0.0], [1.0]])
+    self.assertRaggedEqual((x // 2.0), [[0.0, -1.0], [4.0]])
 
-      self.assertEqual((x % y).eval().tolist(), [[1.0, 2.0], [0.0]])
-      self.assertEqual((y % x).eval().tolist(), [[0.0, -0.0], [2.0]])
-      self.assertEqual((2.0 % y).eval().tolist(), [[2.0, 2.0], [0.0]])
-      self.assertEqual((x % 2.0).eval().tolist(), [[1.0, 0.0], [0.0]])
+    self.assertRaggedEqual((x % y), [[1.0, 2.0], [0.0]])
+    self.assertRaggedEqual((y % x), [[0.0, -0.0], [2.0]])
+    self.assertRaggedEqual((2.0 % y), [[2.0, 2.0], [0.0]])
+    self.assertRaggedEqual((x % 2.0), [[1.0, 0.0], [0.0]])
 
   def testLogicalOperators(self):
     a = ragged.constant([[True, True], [False]])
     b = ragged.constant([[True, False], [False]])
-    with self.test_session():
-      self.assertEqual((~a).eval().tolist(), [[False, False], [True]])
+    self.assertRaggedEqual((~a), [[False, False], [True]])
 
-      self.assertEqual((a & b).eval().tolist(), [[True, False], [False]])
-      self.assertEqual((a & True).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((True & b).eval().tolist(), [[True, False], [False]])
+    self.assertRaggedEqual((a & b), [[True, False], [False]])
+    self.assertRaggedEqual((a & True), [[True, True], [False]])
+    self.assertRaggedEqual((True & b), [[True, False], [False]])
 
-      self.assertEqual((a | b).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((a | False).eval().tolist(), [[True, True], [False]])
-      self.assertEqual((False | b).eval().tolist(), [[True, False], [False]])
+    self.assertRaggedEqual((a | b), [[True, True], [False]])
+    self.assertRaggedEqual((a | False), [[True, True], [False]])
+    self.assertRaggedEqual((False | b), [[True, False], [False]])
 
-      self.assertEqual((a ^ b).eval().tolist(), [[False, True], [False]])
-      self.assertEqual((a ^ True).eval().tolist(), [[False, False], [True]])
-      self.assertEqual((True ^ b).eval().tolist(), [[False, True], [True]])
+    self.assertRaggedEqual((a ^ b), [[False, True], [False]])
+    self.assertRaggedEqual((a ^ True), [[False, False], [True]])
+    self.assertRaggedEqual((True ^ b), [[False, True], [True]])
 
   def testDummyOperators(self):
     a = ragged.constant([[True, True], [False]])
diff --git a/tensorflow/python/ops/ragged/ragged_range_op_test.py b/tensorflow/python/ops/ragged/ragged_range_op_test.py
index 3c6a6fb75c8a85f7d10f4f3e501f2f53f28a48e5..5ab3d4abc3988b05add4bf98e31e472d2d5b2e88 100644
--- a/tensorflow/python/ops/ragged/ragged_range_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_range_op_test.py
@@ -21,102 +21,102 @@ from __future__ import print_function
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedRangeOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRangeOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExamples(self):
     """Examples from ragged_range.__doc__."""
-    with self.test_session():
-      rt1 = ragged.range([3, 5, 2]).eval().tolist()
-      self.assertEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
+    rt1 = ragged.range([3, 5, 2])
+    self.assertRaggedEqual(rt1, [[0, 1, 2], [0, 1, 2, 3, 4], [0, 1]])
 
-      rt2 = ragged.range([0, 5, 8], [3, 3, 12]).eval().tolist()
-      self.assertEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
+    rt2 = ragged.range([0, 5, 8], [3, 3, 12])
+    self.assertRaggedEqual(rt2, [[0, 1, 2], [], [8, 9, 10, 11]])
 
-      rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2).eval().tolist()
-      self.assertEqual(rt3, [[0, 2], [], [8, 10]])
+    rt3 = ragged.range([0, 5, 8], [3, 3, 12], 2)
+    self.assertRaggedEqual(rt3, [[0, 2], [], [8, 10]])
 
   def testBasicRanges(self):
-    with self.test_session():
-      # Specify limits only.
-      self.assertEqual(
-          ragged.range([0, 3, 5]).eval().tolist(),
-          [list(range(0)), list(range(3)), list(range(5))])
-
-      # Specify starts and limits.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [2, 3, 10]).eval().tolist(),
-          [list(range(0, 2)), list(range(3, 3)), list(range(5, 10))])
-
-      # Specify starts, limits, and deltas.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]).eval().tolist(),
-          [list(range(0, 4, 2)), list(range(3, 4, 3)),
-           list(range(5, 15, 4))])
+    # Specify limits only.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5]),
+        [list(range(0)), list(range(3)),
+         list(range(5))])
+
+    # Specify starts and limits.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [2, 3, 10]),
+        [list(range(0, 2)),
+         list(range(3, 3)),
+         list(range(5, 10))])
+
+    # Specify starts, limits, and deltas.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [4, 4, 15], [2, 3, 4]),
+        [list(range(0, 4, 2)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 4))])
 
   def testFloatRanges(self):
-    with self.test_session():
-      expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
-                  [5.0, 7.2, 9.4, 11.6, 13.8]]
-      actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0],
-                            [0.4, 1.5, 2.2]).eval().tolist()
-      self.assertEqual(expected, [[round(v, 5) for v in row] for row in actual])
+    expected = [[0.0, 0.4, 0.8, 1.2, 1.6, 2.0, 2.4, 2.8, 3.2, 3.6], [3.0],
+                [5.0, 7.2, 9.4, 11.6, 13.8]]
+    actual = ragged.range([0.0, 3.0, 5.0], [3.9, 4.0, 15.0], [0.4, 1.5, 2.2])
+    self.assertEqual(
+        expected,
+        [[round(v, 5) for v in row] for row in self.eval_to_list(actual)])
 
   def testNegativeDeltas(self):
-    with self.test_session():
-      self.assertEqual(
-          ragged.range([0, 3, 5], limits=0, deltas=-1).eval().tolist(),
-          [list(range(0, 0, -1)), list(range(3, 0, -1)),
-           list(range(5, 0, -1))])
-
-      self.assertEqual(
-          ragged.range([0, -3, 5], limits=0, deltas=[-1, 1,
-                                                     -2]).eval().tolist(),
-          [list(range(0, 0, -1)), list(range(-3, 0, 1)),
-           list(range(5, 0, -2))])
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], limits=0, deltas=-1),
+        [list(range(0, 0, -1)),
+         list(range(3, 0, -1)),
+         list(range(5, 0, -1))])
+
+    self.assertRaggedEqual(
+        ragged.range([0, -3, 5], limits=0, deltas=[-1, 1, -2]),
+        [list(range(0, 0, -1)),
+         list(range(-3, 0, 1)),
+         list(range(5, 0, -2))])
 
   def testBroadcast(self):
-    with self.test_session():
-      # Specify starts and limits, broadcast deltas.
-      self.assertEqual(
-          ragged.range([0, 3, 5], [4, 4, 15], 3).eval().tolist(),
-          [list(range(0, 4, 3)), list(range(3, 4, 3)),
-           list(range(5, 15, 3))])
+    # Specify starts and limits, broadcast deltas.
+    self.assertRaggedEqual(
+        ragged.range([0, 3, 5], [4, 4, 15], 3),
+        [list(range(0, 4, 3)),
+         list(range(3, 4, 3)),
+         list(range(5, 15, 3))])
 
-      # Broadcast all arguments.
-      self.assertEqual(
-          ragged.range(0, 5, 1).eval().tolist(), [list(range(0, 5, 1))])
+    # Broadcast all arguments.
+    self.assertRaggedEqual(ragged.range(0, 5, 1), [list(range(0, 5, 1))])
 
   def testEmptyRanges(self):
     rt1 = ragged.range([0, 5, 3], [0, 3, 5])
     rt2 = ragged.range([0, 5, 5], [0, 3, 5], -1)
-    with self.test_session():
-      self.assertEqual(rt1.eval().tolist(), [[], [], [3, 4]])
-      self.assertEqual(rt2.eval().tolist(), [[], [5, 4], []])
+    self.assertRaggedEqual(rt1, [[], [], [3, 4]])
+    self.assertRaggedEqual(rt2, [[], [5, 4], []])
 
   def testShapeFnErrors(self):
-    with self.test_session():
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, [[0]], 5)
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, 0, [[5]])
-      self.assertRaisesRegexp(ValueError, r'Shape must be at most rank 1.*',
-                              ragged.range, 0, 5, [[0]])
-      self.assertRaisesRegexp(ValueError, r'Dimensions must be equal.*',
-                              ragged.range, [0], [1, 2])
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      [[0]], 5)
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      0, [[5]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      0, 5, [[0]])
+    self.assertRaises((ValueError, errors.InvalidArgumentError), ragged.range,
+                      [0], [1, 2])
 
   def testKernelErrors(self):
-    with self.test_session():
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              r'Requires delta != 0',
-                              ragged.range(0, 0, 0).eval)
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 r'Requires delta != 0'):
+      self.evaluate(ragged.range(0, 0, 0))
 
   def testShape(self):
-    self.assertEqual(ragged.range(0, 0, 0).shape.as_list(), [1, None])
-    self.assertEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
-    self.assertEqual(
+    self.assertRaggedEqual(ragged.range(0, 0, 1).shape.as_list(), [1, None])
+    self.assertRaggedEqual(ragged.range([1, 2, 3]).shape.as_list(), [3, None])
+    self.assertRaggedEqual(
         ragged.range([1, 2, 3], [4, 5, 6]).shape.as_list(), [3, None])
 
 
diff --git a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
index 93176c738df0d9ae2d6287838b97756e4eda2eb3..890460221bf9fdebe134d6ced77b1fca2dbdffd5 100644
--- a/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_reduce_op_test.py
@@ -21,11 +21,13 @@ from __future__ import print_function
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 _MAX_INT32 = dtypes.int32.max
@@ -37,7 +39,9 @@ def mean(*values):
   return 1.0 * sum(values) / len(values)
 
 
-class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedReduceOpsTest(ragged_test_util.RaggedTensorTestCase,
+                          parameterized.TestCase):
 
   @parameterized.parameters(
       #=========================================================================
@@ -303,8 +307,7 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def testReduce(self, ragged_reduce_op, rt_input, axis, expected):
     rt_input = ragged.constant(rt_input)
     reduced = ragged_reduce_op(rt_input, axis)
-    with self.test_session():
-      self.assertEqual(reduced.eval().tolist(), expected)
+    self.assertRaggedEqual(reduced, expected)
 
   def assertEqualWithNan(self, actual, expected):
     """Like assertEqual, but NaN==NaN."""
@@ -318,22 +321,22 @@ class RaggedReduceOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
             [4, 1, 0, 2, 1, 2]))
     rt_input = ragged.constant(rt_as_list)
     reduced = ragged.reduce_mean(rt_input, axis=1)
-    with self.test_session():
-      self.assertEqualWithNan(reduced.eval(), expected)
+    self.assertEqualWithNan(self.evaluate(reduced), expected)
 
   def testMeanWithTensorInputs(self):
     tensor = [[1.0, 2.0, 3.0], [10.0, 20.0, 30.0]]
     expected = [2.0, 20.0]
     reduced = ragged.reduce_mean(tensor, axis=1)
-    with self.test_session():
-      self.assertAllEqual(reduced.eval(), expected)
+    self.assertRaggedEqual(reduced, expected)
 
   def testErrors(self):
     rt_input = ragged.constant([[1, 2, 3], [4, 5]])
     axis = array_ops.placeholder_with_default(constant_op.constant([0]), None)
-    self.assertRaisesRegexp(ValueError,
-                            r'axis must be known at graph construction time.',
-                            ragged.reduce_sum, rt_input, axis)
+
+    if not context.executing_eagerly():
+      self.assertRaisesRegexp(
+          ValueError, r'axis must be known at graph construction time.',
+          ragged.reduce_sum, rt_input, axis)
     self.assertRaisesRegexp(TypeError,
                             r'axis must be an int; got str.*',
                             ragged.reduce_sum, rt_input, ['x'])
diff --git a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
index 4d5a0a5d11c92cccef54f27fdeaf36608a61980c..15112d6c9c56b0e15247fc7c2f0b8410a5b9d376 100644
--- a/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_lengths_op_test.py
@@ -20,12 +20,16 @@ from __future__ import print_function
 
 from absl.testing import parameterized
 
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedRowLengthsOp(ragged_test_util.RaggedTensorTestCase,
+                         parameterized.TestCase):
 
   @parameterized.parameters([
       # Docstring Example
@@ -37,24 +41,6 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=2,
           expected=[[3, 1], [], [2, 1], [1], []]),
 
-      # 1D tensor
-      dict(
-          rt_input=[1, 2, 3, 4, 5],
-          ragged_rank=0,
-          axis=0,
-          expected=5),
-
-      # 2D Tensor (0 ragged dimensions)
-      dict(
-          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
-          ragged_rank=0,
-          expected=[2, 2, 2, 2]),
-      dict(
-          rt_input=[[1, 2], [3, 4], [5, 6], [7, 8]],
-          ragged_rank=0,
-          axis=0,
-          expected=4),
-
       # 2D Tensor (1 ragged dimension)
       dict(
           rt_input=[['a'], ['b', 'c', 'd'], ['e'], [], ['f']],
@@ -79,24 +65,6 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
           axis=0,
           expected=0),
 
-      # 3D Tensor (0 ragged dimensions)
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=0,
-          expected=2),
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=1,
-          expected=[3, 3]),
-      dict(
-          rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]],
-          ragged_rank=0,
-          axis=2,
-          expected=[[2, 2, 2], [2, 2, 2]],
-          expected_ragged_rank=0),
-
       # 3D Tensor (1 ragged dimension)
       dict(
           rt_input=[[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10]]],
@@ -150,34 +118,28 @@ class RaggedRowLengthsOp(test_util.TensorFlowTestCase, parameterized.TestCase):
                      ragged_rank=None,
                      expected_ragged_rank=None):
     rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    lengths = ragged.row_lengths(rt, axis)
-    with self.test_session():
-      self.assertEqual(lengths.eval().tolist(), expected)
-      if expected_ragged_rank is not None:
-        if isinstance(lengths, ragged.RaggedTensor):
-          self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
-        else:
-          self.assertEqual(0, expected_ragged_rank)
+    lengths = rt.row_lengths(axis)
+    self.assertRaggedEqual(lengths, expected)
+    if expected_ragged_rank is not None:
+      if isinstance(lengths, ragged.RaggedTensor):
+        self.assertEqual(lengths.ragged_rank, expected_ragged_rank)
+      else:
+        self.assertEqual(0, expected_ragged_rank)
 
   @parameterized.parameters([
-      dict(
-          rt_input=10,
-          exception=ValueError,
-          message='rt_input may not be a scalar.'),
-      dict(
-          rt_input=[10, 20],
-          axis=1,
-          exception=ValueError,
-          message='axis=1 out of bounds: expected -1<=axis<1.'),
-      dict(
+      dict(  # axis=2 out of bounds: expected -2<=axis<2.
+          rt_input=[[10, 20], [30]],
+          axis=2,
+          exception=(ValueError, errors.InvalidArgumentError)),
+      dict(  # axis=-3 out of bounds: expected -2<=axis<2.
           rt_input=[[2, 3, 0], [4, 1, 2]],
           axis=-3,
-          exception=ValueError,
-          message='axis=-3 out of bounds: expected -2<=axis<2.'),
+          exception=(ValueError, errors.InvalidArgumentError)),
   ])
-  def testErrors(self, rt_input, exception, message, axis=1):
+  def testErrors(self, rt_input, exception, message=None, axis=1):
+    rt = ragged.constant(rt_input)
     with self.assertRaisesRegexp(exception, message):
-      ragged.row_lengths(rt_input, axis)
+      rt.row_lengths(axis)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
index f246bf35524084c958f66caecceae3547012ee9a..2970540f3e585a7e9399dbe561f148a5abc9ee2c 100644
--- a/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_row_splits_to_segment_ids_op_test.py
@@ -21,23 +21,23 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     splits = [0, 3, 3, 5, 6, 9]
     expected = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     segment_ids = ragged.row_splits_to_segment_ids(splits)
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), expected)
+    self.assertAllEqual(segment_ids, expected)
 
   def testEmptySplits(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
     segment_ids = ragged.row_splits_to_segment_ids([0])
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), [])
+    self.assertAllEqual(segment_ids, [])
 
   def testErrors(self):
     self.assertRaisesRegexp(ValueError, r'Invalid row_splits: \[\]',
diff --git a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
index fa7adf66b0bbd5ae8091c7b7f47bfaae56d9d266..4ed962676700ade62adb76b035a9b4e1dc5c5d73 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_ids_to_row_splits_op_test.py
@@ -21,23 +21,23 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSplitsToSegmentIdsOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     segment_ids = [0, 0, 0, 2, 2, 3, 4, 4, 4]
     expected = [0, 3, 3, 5, 6, 9]
     splits = ragged.segment_ids_to_row_splits(segment_ids)
-    with self.test_session():
-      self.assertEqual(splits.eval().tolist(), expected)
+    self.assertAllEqual(splits, expected)
 
   def testEmptySegmentIds(self):
     # Note: the splits for an empty ragged tensor contains a single zero.
     segment_ids = ragged.segment_ids_to_row_splits([])
-    with self.test_session():
-      self.assertEqual(segment_ids.eval().tolist(), [0])
+    self.assertAllEqual(segment_ids, [0])
 
   def testErrors(self):
     self.assertRaisesRegexp(TypeError,
@@ -54,8 +54,7 @@ class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
     num_segments = 7
     expected = [0, 3, 3, 5, 6, 9, 9, 9]
     splits = ragged.segment_ids_to_row_splits(segment_ids, num_segments)
-    with self.test_session():
-      self.assertEqual(splits.eval().tolist(), expected)
+    self.assertAllEqual(splits, expected)
 
   def testUnsortedSegmentIds(self):
     # Segment ids are not required to be sorted.
@@ -65,9 +64,8 @@ class RaggedSplitsToSegmentIdsOpTest(test_util.TensorFlowTestCase):
 
     splits2 = ragged.segment_ids_to_row_splits(segment_ids, 7)
     expected2 = [0, 3, 3, 5, 6, 9, 9, 9]
-    with self.test_session():
-      self.assertEqual(splits1.eval().tolist(), expected1)
-      self.assertEqual(splits2.eval().tolist(), expected2)
+    self.assertAllEqual(splits1, expected1)
+    self.assertAllEqual(splits2, expected2)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_segment_op_test.py b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
index 7d41eb7f7532b0015b89a42197db62163b05a544..be1f39afef0e720c0c23d9d8571fc70907696d6d 100644
--- a/tensorflow/python/ops/ragged/ragged_segment_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_segment_op_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
@@ -45,21 +46,10 @@ def sqrt_n(values):
   return 1.0 * sum(values) / math.sqrt(len(values))
 
 
-class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedSegmentOpsTest(ragged_test_util.RaggedTensorTestCase,
                            parameterized.TestCase):
 
-  def assertNestedListAmostEqual(self, lhs, rhs, places=7, context='value'):
-    self.assertEqual(type(lhs), type(rhs))
-    if isinstance(lhs, (list, tuple)):
-      self.assertEqual(len(lhs), len(rhs), 'Length differs for %s' % context)
-      for i in range(len(lhs)):
-        self.assertNestedListAmostEqual(lhs[i], rhs[i], places,
-                                        '%s[%s]' % (context, i))
-    else:
-      self.assertAlmostEqual(
-          lhs, rhs, places,
-          '%s != %s within %s places at %s' % (lhs, rhs, places, context))
-
   def expected_value(self, data, segment_ids, num_segments, combiner):
     """Find the expected value for a call to ragged_segment_<aggregate>.
 
@@ -118,8 +108,7 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
                                    combiner)
 
     segmented = segment_op(rt, segment_ids, num_segments)
-    with self.test_session():
-      self.assertListEqual(segmented.eval().tolist(), expected)
+    self.assertRaggedEqual(segmented, expected)
 
   @parameterized.parameters(
       (ragged.segment_sum, sum, [0, 0, 1, 1, 2, 2]),
@@ -155,9 +144,7 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
                                    combiner)
 
     segmented = segment_op(rt, segment_ids, num_segments)
-    with self.test_session():
-      self.assertNestedListAmostEqual(
-          segmented.eval().tolist(), expected, places=5)
+    self.assertRaggedAlmostEqual(segmented, expected, places=5)
 
   def testRaggedRankTwo(self):
     rt = ragged.constant([
@@ -172,16 +159,14 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
                  [],                                # row 1
                  [[411, 412], [321, 322], [331]]    # row 2
                 ]  # pyformat: disable
-    with self.test_session():
-      self.assertEqual(segmented1.eval().tolist(), expected1)
+    self.assertRaggedEqual(segmented1, expected1)
 
     segment_ids2 = [1, 2, 1, 1]
     segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
     expected2 = [[],
                  [[111+411, 112+412, 113, 114], [121+321, 322], [331]],
                  []]  # pyformat: disable
-    with self.test_session():
-      self.assertEqual(segmented2.eval().tolist(), expected2)
+    self.assertRaggedEqual(segmented2, expected2)
 
   def testRaggedSegmentIds(self):
     rt = ragged.constant([
@@ -195,8 +180,7 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
     expected = [[],
                 [111+321, 112+322, 113, 114],
                 [121+331+411, 412]]  # pyformat: disable
-    with self.test_session():
-      self.assertEqual(segmented.eval().tolist(), expected)
+    self.assertRaggedEqual(segmented, expected)
 
   def testShapeMismatchError1(self):
     dt = constant_op.constant([1, 2, 3, 4, 5, 6])
@@ -222,14 +206,13 @@ class RaggedSegmentOpsTest(test_util.TensorFlowTestCase,
         ragged.segment_sum, rt, segment_ids, 3)
 
     # Otherwise, error is raised when we run the graph.
-    segment_ids2 = ragged.from_row_splits(
+    segment_ids2 = ragged.RaggedTensor.from_row_splits(
         array_ops.placeholder_with_default(segment_ids.values, None),
         array_ops.placeholder_with_default(segment_ids.row_splits, None))
-    segmented2 = ragged.segment_sum(rt, segment_ids2, 3)
-    with self.test_session():
-      self.assertRaisesRegexp(
-          errors.InvalidArgumentError,
-          'segment_ids.shape must be a prefix of data.shape.*', segmented2.eval)
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        'segment_ids.shape must be a prefix of data.shape.*'):
+      self.evaluate(ragged.segment_sum(rt, segment_ids2, 3))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_stack_op_test.py b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
index d474a749f049b24543e6d0406479fead5f44a908..17d80b5aadc936cfe11c3f65628cc57bf2c60361 100644
--- a/tensorflow/python/ops/ragged/ragged_stack_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_stack_op_test.py
@@ -23,10 +23,13 @@ from absl.testing import parameterized
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedStackOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters(
       dict(
@@ -285,8 +288,7 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       self.assertEqual(stacked.ragged_rank, expected_ragged_rank)
     if expected_shape is not None:
       self.assertEqual(stacked.shape.as_list(), expected_shape)
-    with self.test_session():
-      self.assertEqual(stacked.eval().tolist(), expected)
+    self.assertRaggedEqual(stacked, expected)
 
   @parameterized.parameters(
       dict(
@@ -322,8 +324,7 @@ class RaggedStackOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     """
     rt_inputs = ragged.constant([[1, 2], [3, 4]])
     stacked = ragged.stack(rt_inputs, 0)
-    with self.test_session():
-      self.assertEqual(stacked.eval().tolist(), [[[1, 2], [3, 4]]])
+    self.assertRaggedEqual(stacked, [[[1, 2], [3, 4]]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_string_ops.py b/tensorflow/python/ops/ragged/ragged_string_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f9f0abe4f04bf0a9a2822df28af842cd18fc553
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_string_ops.py
@@ -0,0 +1,118 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ragged operations for working with string Tensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_string_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.util.tf_export import tf_export
+
+
+# pylint: disable=redefined-builtin
+@tf_export("strings.unicode_encode")
+def unicode_encode(input, output_encoding, errors="replace",
+                   replacement_char=65533, name=None):
+  r"""Encodes each sequence of Unicode code points in `input` into a string.
+
+  `result[i1...iN]` is the string formed by concatenating the Unicode
+  codepoints `input[1...iN, :]`, encoded using `output_encoding`.
+
+  Args:
+    input: An `N+1` dimensional potentially ragged integer tensor with
+        shape `[D1...DN, num_chars]`.
+    output_encoding: Unicode encoding that should be used to encode each
+      codepoint sequence.  Can be `"UTF-8"`, `"UTF-16-BE"`, or `"UTF-32-BE"`.
+    errors: Specifies the response when an invalid codepoint is encountered
+      (optional). One of:
+            * `'replace'`: Replace invalid codepoint with the
+              `replacement_char`. (default)
+            * `'ignore'`: Skip invalid codepoints.
+            * `'strict'`: Raise an exception for any invalid codepoint.
+    replacement_char: The replacement character codepoint to be used in place of
+      any invalid input when `errors='replace'`. Any valid unicode codepoint may
+      be used. The default value is the default unicode replacement character
+      which is 0xFFFD (U+65533).
+    name: A name for the operation (optional).
+
+  Returns:
+    A `N` dimensional `string` tensor with shape `[D1...DN]`.
+
+  #### Example:
+    ```python
+      >>> input = [[71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]
+      >>> unicode_encode(input, 'UTF8')
+      ['G\xc3\xb6\xc3\xb6dnight', '\xf0\x9f\x98\x8a']
+    ```
+  """
+  with ops.name_scope(name, "UnicodeEncode", [input]):
+    input_tensor = ragged_tensor.convert_to_tensor_or_ragged_tensor(input)
+    if input_tensor.shape.ndims is None:
+      raise ValueError("Rank of input_tensor must be statically known.")
+    if ragged_tensor.is_ragged(input_tensor):
+      if input_tensor.flat_values.shape.ndims > 1:
+        # If the flat_values of our ragged tensor is multi-dimensional, we can
+        # process it separately and our output will have the same nested splits
+        # as our input.
+        return input_tensor.with_flat_values(
+            unicode_encode(input_tensor.flat_values, output_encoding, errors,
+                           replacement_char))
+      elif input_tensor.ragged_rank > 1:
+        # Recursively process the values of the ragged tensor.
+        return input_tensor.with_values(
+            unicode_encode(input_tensor.values, output_encoding, errors,
+                           replacement_char))
+      else:
+        # Our ragged tensor is of the correct shape (rank 1 flat_values tensor
+        # with ragged_rank of 1) so we can process it as normal.
+        return gen_string_ops.unicode_encode(
+            input_values=input_tensor.values,
+            input_splits=input_tensor.row_splits,
+            output_encoding=output_encoding,
+            errors=errors,
+            replacement_char=replacement_char)
+    else:
+      if input_tensor.shape.ndims == 2:
+        # The input tensor is of the correct 2-D shape, it's just not ragged.
+        return unicode_encode(ragged_conversion_ops.from_tensor(input_tensor),
+                              output_encoding, errors, replacement_char)
+      elif input_tensor.shape.ndims > 2:
+        # We need to initially flatten the input tensor to 2-D, and then can
+        # reshape the output of our processed flattened tensor.
+        flat_input_tensor = array_ops.reshape(
+            input_tensor,
+            array_ops.stack([-1, array_ops.shape(input_tensor)[-1]]))
+        flat_output_tensor = unicode_encode(flat_input_tensor, output_encoding,
+                                            errors, replacement_char)
+        return array_ops.reshape(flat_output_tensor, input_tensor.shape[:-1])
+      elif input_tensor.shape.ndims == 0:
+        raise ValueError("input_tensor's rank must be at least 1.")
+      else:
+        # Our input tensor is rank 1, so we create a ragged tensor with an added
+        # dimension to create the correct input shape & type, and then remove
+        # the additional dimension from the output and return the string scalar.
+        ragged_input_tensor = ragged_tensor.RaggedTensor.from_row_splits(
+            input_tensor,
+            array_ops.stack(
+                [0, array_ops.shape(input_tensor, out_type=dtypes.int64)[0]]))
+        output_tensor = unicode_encode(ragged_input_tensor, output_encoding,
+                                       errors, replacement_char)
+        return array_ops.reshape(output_tensor, [])
diff --git a/tensorflow/python/ops/ragged/ragged_tensor.py b/tensorflow/python/ops/ragged/ragged_tensor.py
index abb27fc3c0812fc5eec0bcd078c916c23e815d19..567c50203af592e57168063e20787b3ed621b8c8 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor.py
@@ -19,9 +19,19 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.client import session
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import gen_ragged_conversion_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.ragged import ragged_tensor_value
+from tensorflow.python.ops.ragged import ragged_util
+from tensorflow.python.ops.ragged import segment_id_ops
 
 # pylint: disable=protected-access
 _eval_using_default_session = ops._eval_using_default_session
@@ -64,7 +74,7 @@ class RaggedTensor(object):
   a 3-D `RaggedTensor` that stores the fixed-size word embedding for each
   word in a sentence, for each sentence in a batch, could be written as
   `[num_sentences, (num_words), embedding_size]`.  The parentheses around
-  `(num_words)` indicate that that dimension is ragged, and that the length
+  `(num_words)` indicate that dimension is ragged, and that the length
   of each element list in that dimension may vary for each item.
 
   ### Component Tensors
@@ -84,10 +94,10 @@ class RaggedTensor(object):
   Example:
 
   ```python
-  >>> rt = ragged.from_row_splits(values=[3, 1, 4, 1, 5, 9, 2, 6],
-  ...                             row_splits=[0, 4, 4, 7, 8, 8])
-  >>> rt.tolist()
-  [[3, 1, 4, 1], [], [5, 9, 2], [6], []]
+  >>> print(tf.RaggedTensor.from_row_splits(
+  ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     row_splits=[0, 4, 4, 7, 8, 8]))
+  <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
   ```
 
   ### Alternative Row-Partitioning Schemes
@@ -116,13 +126,12 @@ class RaggedTensor(object):
 
   ```python
   >>> values = [3, 1, 4, 1, 5, 9, 2, 6]
-  >>> rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
-  >>> rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
-  >>> rt3 = ragged.from_value_rowids(values,
-  ...                                value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
-  ...                                nrows=5)
-  >>> rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
-  >>> rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+  >>> rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+  >>> rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+  >>> rt3 = RaggedTensor.from_value_rowids(
+  ...     values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
+  >>> rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+  >>> rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
   ```
 
   ### Multiple Ragged Dimensions
@@ -132,24 +141,24 @@ class RaggedTensor(object):
   adds a single ragged dimension.
 
   ```python
-  >>> inner_rt = ragged.from_row_splits(  # =rt1 from above
+  >>> inner_rt = RaggedTensor.from_row_splits(  # =rt1 from above
   ...     values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-  >>> outer_rt = ragged.from_row_splits(
+  >>> outer_rt = RaggedTensor.from_row_splits(
   ...     values=inner_rt, row_splits=[0, 3, 3, 5])
-  >>> print outer_rt.tolist()
+  >>> print outer_rt.to_list()
   [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
   >>> print outer_rt.ragged_rank
   2
   ```
 
-  The factory function `ragged.from_nested_row_splits` may be used to
+  The factory function `RaggedTensor.from_nested_row_splits` may be used to
   construct a `RaggedTensor` with multiple ragged dimensions directly, by
   providing a list of `row_splits` tensors:
 
   ```python
-  >>> ragged.from_nested_row_splits(
-  ...     inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
-  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).tolist()
+  >>> RaggedTensor.from_nested_row_splits(
+  ...     flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
+  ...     nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8])).to_list()
   [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]
   ```
 
@@ -159,12 +168,13 @@ class RaggedTensor(object):
   by using a multidimensional `Tensor` for `values`.
 
   ```python
-  >>> rt = ragged.from_row_splits(values=tf.ones([5, 3]), row_splits=[0, 2, 5])
-  >>> print rt.tolist()
+  >>> rt = RaggedTensor.from_row_splits(values=tf.ones([5, 3]),
+  ..                                    row_splits=[0, 2, 5])
+  >>> print rt.to_list()
   [[[1, 1, 1], [1, 1, 1]],
    [[1, 1, 1], [1, 1, 1], [1, 1, 1]]]
-   >>> print rt.shape.as_list()
-   [2, None, 3]
+   >>> print rt.shape
+   (2, ?, 3)
   ```
 
   ### RaggedTensor Shape Restrictions
@@ -181,31 +191,6 @@ class RaggedTensor(object):
   dimension followed by a ragged dimension.
   """
 
-  #=============================================================================
-  # Implementation notes
-  #=============================================================================
-  # Currently, the RaggedTensor class uses a single row-partitioning scheme
-  # (row_splits).
-  #
-  # We are considering adding value_rowids+nvals as a secondary
-  # row-partitioning scheme.  This change would not impact the functional
-  # interface of the RaggedTensor class, but it would impact the efficiency
-  # of several operations.  In particular:
-  #
-  #   * The functions `ragged.value_rowids` and `ragged.nrows` would always
-  #     return pre-existing tensors; they would not need to add any ops to
-  #     the graph.
-  #
-  #   * The `RaggedTensor` constructor would construct all row-partitioning
-  #     tensors (row_splits, value_rowids, and nvals).  In eager mode, this
-  #     would mean that conversion operations would occur whenever a
-  #     `RaggedTensor` is constructed.  But in graph mode, the converted
-  #     row-partitioning tensors would only be evaluated if they are used.
-  #
-  # Since this change impacts efficiency but not functionality, we would like
-  # to perform additional profiling with real-world use cases before we
-  # decide whether to make this change.
-
   #=============================================================================
   # Constructor (private)
   #=============================================================================
@@ -221,13 +206,14 @@ class RaggedTensor(object):
     This constructor is private -- please use one of the following ops to
     build `RaggedTensor`s:
 
-      * [`ragged.from_row_lengths()`](from_row_lengths.md)
-      * [`ragged.from_value_rowids()`](from_value_rowids.md)
-      * [`ragged.from_row_splits()`](from_row_splits.md)
-      * [`ragged.from_row_starts()`](from_row_starts.md)
-      * [`ragged.from_row_limits()`](from_row_limits.md)
-      * [`ragged.from_nested_row_splits()`](from_nested_row_splits.md)
-      * [`ragged.from_nested_value_rowids()`](from_nested_value_rowids.md)
+      * `tf.RaggedTensor.from_row_lengths`
+      * `tf.RaggedTensor.from_value_rowids`
+      * `tf.RaggedTensor.from_row_splits`
+      * `tf.RaggedTensor.from_row_starts`
+      * `tf.RaggedTensor.from_row_limits`
+      * `tf.RaggedTensor.from_nested_row_splits`
+      * `tf.RaggedTensor.from_nested_row_lengths`
+      * `tf.RaggedTensor.from_nested_value_rowids`
 
     Args:
       values: A potentially ragged tensor of any dtype and shape `[nvals, ...]`.
@@ -248,7 +234,7 @@ class RaggedTensor(object):
     if not internal:
       raise ValueError("RaggedTensor constructor is private; please use one "
                        "of the factory methods instead (e.g., "
-                       "ragged.from_row_lengths())")
+                       "RaggedTensor.from_row_lengths())")
 
     # Validate the arguments.
     if not isinstance(values, (RaggedTensor, ops.Tensor)):
@@ -257,6 +243,7 @@ class RaggedTensor(object):
       raise TypeError("Row-partitioning argument must be a Tensor.")
     values.shape.with_rank_at_least(1)
     row_splits.shape.assert_has_rank(1)
+    row_splits.set_shape([None])
 
     self._values = values
     self._row_splits = row_splits
@@ -271,6 +258,364 @@ class RaggedTensor(object):
     self._cached_value_rowids = cached_value_rowids
     self._cached_nrows = cached_nrows
 
+  #=============================================================================
+  # Factory Methods
+  #=============================================================================
+
+  @classmethod
+  def from_value_rowids(cls, values, value_rowids, nrows=None, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `value_rowids`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values[i] for i in range(len(values)) if value_rowids[i] == row]
+              for row in range(nrows)]
+    ```
+
+    Warning: currently, this needs to cast value_rowids to int64 before
+    converting, since `tf.bincount` only supports `int32`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      value_rowids: A 1-D int64 tensor with shape `[nvals]`, which corresponds
+        one-to-one with `values`, and specifies each value's row index.  Must be
+        nonnegative, and must be sorted in ascending order.
+      nrows: An int64 scalar specifying the number of rows.  This should be
+        specified if the `RaggedTensor` may containing empty training rows. Must
+        be greater than `value_rowids[-1]` (or zero if `value_rowids` is empty).
+        Defaults to `value_rowids[-1]` (or zero if `value_rowids` is empty).
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `nrows` is incompatible with `value_rowids`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_value_rowids(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     value_rowids=[0, 0, 0, 0, 2, 2, 2, 3],
+      ...     nrows=5))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromValueRowIds",
+                        [values, value_rowids, nrows]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      value_rowids = ops.convert_to_tensor(
+          value_rowids, dtypes.int64, name="value_rowids")
+      if nrows is None:
+        const_rowids = tensor_util.constant_value(value_rowids)
+        if const_rowids is None:
+          nrows = array_ops.concat([value_rowids[-1:], [-1]], axis=0)[0] + 1
+          const_nrows = None
+        else:
+          const_nrows = const_rowids[-1] + 1 if const_rowids.size > 0 else 0
+          nrows = ops.convert_to_tensor(const_nrows, dtypes.int64, name="nrows")
+      else:
+        nrows = ops.convert_to_tensor(nrows, dtypes.int64, "nrows")
+        const_nrows = tensor_util.constant_value(nrows)
+        if const_nrows is not None:
+          if const_nrows < 0:
+            raise ValueError("Expected nrows >= 0; got %d" % const_nrows)
+          const_rowids = tensor_util.constant_value(value_rowids)
+          if const_rowids is not None and const_rowids.size > 0:
+            if not const_nrows >= const_rowids[-1] + 1:
+              raise ValueError(
+                  "Expected nrows >= value_rowids[-1] + 1; got nrows=%d, "
+                  "value_rowids[-1]=%d" % (const_nrows, const_rowids[-1]))
+
+      value_rowids.shape.assert_has_rank(1)
+      nrows.shape.assert_has_rank(0)
+      values.shape[:1].assert_is_compatible_with(value_rowids.shape)
+
+      # Convert value_rowids & nrows to row_splits.
+      # Note: we don't use segment_ids_to_row_splits() here because we want
+      # to save the intermediate value `row_lengths`, so we can cache it.
+      # TODO(b/116708836) Upgrade bincount to accept int64 so we can skip the
+      # cast (Remove the warning in the docstring when we do.)
+      value_rowids_int32 = math_ops.cast(value_rowids, dtypes.int32)
+      nrows_int32 = math_ops.cast(nrows, dtypes.int32)
+      row_lengths = math_ops.bincount(
+          value_rowids_int32,
+          minlength=nrows_int32,
+          maxlength=nrows_int32,
+          dtype=dtypes.int64)
+      row_splits = array_ops.concat([[0], math_ops.cumsum(row_lengths)], axis=0)
+      if const_nrows is not None:
+        row_lengths.set_shape([const_nrows])
+        row_splits.set_shape([const_nrows + 1])
+
+      return cls(
+          values,
+          row_splits,
+          cached_row_lengths=row_lengths,
+          cached_value_rowids=value_rowids,
+          cached_nrows=nrows,
+          internal=True)
+
+  @classmethod
+  def from_row_splits(cls, values, row_splits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_splits`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [values[row_splits[i]:row_splits[i + 1]]
+              for i in range(len(row_splits) - 1)]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_splits: A 1-D int64 tensor with shape `[nrows+1]`.  Must not be empty,
+        and must be sorted in ascending order.  `row_splits[0]` must be zero and
+        `row_splits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    Raises:
+      ValueError: If `row_splits` is an empty list.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_splits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_splits=[0, 4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    if isinstance(row_splits, (list, tuple)) and not row_splits:
+      raise ValueError("row_splits tensor may not be empty.")
+    with ops.name_scope(name, "RaggedFromRowSplits", [values, row_splits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_splits = ops.convert_to_tensor(row_splits, dtypes.int64, "row_splits")
+      row_splits.shape.assert_has_rank(1)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_lengths(cls, values, row_lengths, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_lengths`.
+
+    The returned `RaggedTensor` corresponds with the python list defined by:
+
+    ```python
+    result = [[values.pop(0) for i in range(length)]
+              for length in row_lengths]
+    ```
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_lengths: A 1-D int64 tensor with shape `[nrows]`.  Must be
+        nonnegative.  `sum(row_lengths)` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_lengths(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_lengths=[4, 0, 3, 1, 0]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []])>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLengths", [values, row_lengths]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_lengths = ops.convert_to_tensor(row_lengths, dtypes.int64,
+                                          "row_lengths")
+      row_lengths.shape.assert_has_rank(1)
+      row_limits = math_ops.cumsum(row_lengths)
+      row_splits = array_ops.concat([[0], row_limits], axis=0)
+      return cls(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=row_lengths,
+          internal=True)
+
+  @classmethod
+  def from_row_starts(cls, values, row_starts, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_starts`.
+
+    Equivalent to: `from_row_splits(values, concat([row_starts, nvals]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_starts: A 1-D int64 tensor with shape `[nrows]`.  Must be nonnegative
+        and sorted in ascending order.  If `nrows>0`, then `row_starts[0]` must
+        be zero.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_starts(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_starts=[0, 4, 4, 7, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowStarts", [values, row_starts]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_starts = ops.convert_to_tensor(row_starts, dtypes.int64, "row_starts")
+      row_starts.shape.assert_has_rank(1)
+      nvals = array_ops.shape(values, out_type=dtypes.int64)[:1]
+      row_splits = array_ops.concat([row_starts, nvals], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_row_limits(cls, values, row_limits, name=None):
+    """Creates a `RaggedTensor` with rows partitioned by `row_limits`.
+
+    Equivalent to: `from_row_splits(values, concat([0, row_limits]))`.
+
+    Args:
+      values: A potentially ragged tensor with shape `[nvals, ...]`.
+      row_limits: A 1-D int64 tensor with shape `[nrows]`.  Must be sorted in
+        ascending order.  If `nrows>0`, then `row_limits[-1]` must be `nvals`.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor`.  `result.rank = values.rank + 1`.
+      `result.ragged_rank = values.ragged_rank + 1`.
+
+    #### Example:
+      ```python
+      >>> print(tf.RaggedTensor.from_row_limits(
+      ...     values=[3, 1, 4, 1, 5, 9, 2, 6],
+      ...     row_limits=[4, 4, 7, 8, 8]))
+      <tf.RaggedTensor [[3, 1, 4, 1], [], [5, 9, 2], [6], []]>
+      ```
+    """
+    with ops.name_scope(name, "RaggedFromRowLimits", [values, row_limits]):
+      values = convert_to_tensor_or_ragged_tensor(values, name="values")
+      row_limits = ops.convert_to_tensor(row_limits, dtypes.int64, "row_limits")
+      row_limits.shape.assert_has_rank(1)
+      zero = array_ops.zeros([1], dtypes.int64)
+      row_splits = array_ops.concat([zero, row_limits], axis=0)
+      return cls(values=values, row_splits=row_splits, internal=True)
+
+  @classmethod
+  def from_nested_value_rowids(cls,
+                               flat_values,
+                               nested_value_rowids,
+                               nested_nrows=None,
+                               name=None):
+    """Creates a `RaggedTensor` from a nested list of `value_rowids` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for (rowids, nrows) in reversed(zip(nested_value_rowids, nested_nrows)):
+      result = from_value_rowids(result, rowids, nrows)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_value_rowids: A list of 1-D int64 tensors.  The `i`th tensor is
+        used as the `value_rowids` for the `i`th ragged dimension.
+      nested_nrows: A list of int64 scalars.  The `i`th scalar is used as the
+        `nrows` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_value_rowids` is empty).
+
+    Raises:
+      ValueError: If `len(nested_values_rowids) != len(nested_nrows)`.
+    """
+    if isinstance(nested_value_rowids, ops.Tensor):
+      raise TypeError("nested_value_rowids must be a list of Tensors")
+    if nested_nrows is None:
+      nested_nrows = [None] * len(nested_value_rowids)
+    else:
+      if isinstance(nested_nrows, ops.Tensor):
+        raise TypeError("nested_nrows must be a list of Tensors")
+      if len(nested_nrows) != len(nested_value_rowids):
+        raise ValueError("nested_nrows must have the same length as "
+                         "nested_value_rowids")
+
+    with ops.name_scope(
+        name, "RaggedFromNestedValueRowIds",
+        [flat_values] + list(nested_value_rowids) + list(nested_nrows)):
+      result = flat_values
+      for value_rowids, nrows in reversed(
+          list(zip(nested_value_rowids, nested_nrows))):
+        result = cls.from_value_rowids(result, value_rowids, nrows)
+      return result
+
+  @classmethod
+  def from_nested_row_splits(cls, flat_values, nested_row_splits, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_splits` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_splits in reversed(nested_row_splits):
+      result = from_row_splits(result, row_splits)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_splits: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_splits` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_splits` is empty).
+    """
+    if isinstance(nested_row_splits, ops.Tensor):
+      raise TypeError("nested_row_splits must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowSplits",
+                        [flat_values] + list(nested_row_splits)):
+      result = flat_values
+      for splits in reversed(nested_row_splits):
+        result = cls.from_row_splits(result, splits)
+      return result
+
+  @classmethod
+  def from_nested_row_lengths(cls, flat_values, nested_row_lengths, name=None):
+    """Creates a `RaggedTensor` from a nested list of `row_lengths` tensors.
+
+    Equivalent to:
+
+    ```python
+    result = flat_values
+    for row_lengths in reversed(nested_row_lengths):
+      result = from_row_lengths(result, row_lengths)
+    ```
+
+    Args:
+      flat_values: A potentially ragged tensor.
+      nested_row_lengths: A list of 1-D int64 tensors.  The `i`th tensor is used
+        as the `row_lengths` for the `i`th ragged dimension.
+      name: A name prefix for the RaggedTensor (optional).
+
+    Returns:
+      A `RaggedTensor` (or `flat_values` if `nested_row_lengths` is empty).
+    """
+    if isinstance(nested_row_lengths, ops.Tensor):
+      raise TypeError("nested_row_lengths must be a list of Tensors")
+    with ops.name_scope(name, "RaggedFromNestedRowlengths",
+                        [flat_values] + list(nested_row_lengths)):
+      result = flat_values
+      for lengths in reversed(nested_row_lengths):
+        result = cls.from_row_lengths(result, lengths)
+      return result
+
   #=============================================================================
   # Accessors
   #=============================================================================
@@ -333,8 +678,8 @@ class RaggedTensor(object):
     #### Example:
       ```python
       >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-      >>> rt.values.eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
+      >>> print rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
       ```
     """
     return self._values
@@ -356,26 +701,24 @@ class RaggedTensor(object):
     #### Example:
       ```python
       >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
-      >>> rt.values.eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
-      >>> rt.row_splits.eval()  # indices of row splits in ragged.values
-      [0, 4, 4, 7, 8, 8]
+      >>> print rt.row_splits  # indices of row splits in rt.values
+      tf.Tensor([0, 4, 4, 7, 8, 8])
       ```
     """
     return self._row_splits
 
   @property
-  def inner_values(self):
+  def flat_values(self):
     """The innermost `values` tensor for this ragged tensor.
 
-    Concretely, if `rt.values` is a `Tensor`, then `rt.inner_values` is
-    `rt.values`; otherwise, `rt.inner_values` is `rt.values.inner_values`.
+    Concretely, if `rt.values` is a `Tensor`, then `rt.flat_values` is
+    `rt.values`; otherwise, `rt.flat_values` is `rt.values.flat_values`.
 
-    Conceptually, `inner_values` is the tensor formed by flattening the
+    Conceptually, `flat_values` is the tensor formed by flattening the
     outermost dimension and all of the ragged dimensions into a single
     dimension.
 
-    `rt.inner_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
+    `rt.flat_values.shape = [nvals] + rt.shape[rt.ragged_rank + 1:]`
     (where `nvals` is the number of items in the flattened dimensions).
 
     Returns:
@@ -385,8 +728,8 @@ class RaggedTensor(object):
 
       ```python
       >>> rt = ragged.constant([[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
-      >>> ragged.inner_values(rt).eval()
-      [3, 1, 4, 1, 5, 9, 2, 6]
+      >>> print rt.flat_values()
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
       ```
     """
     rt_values = self.values
@@ -412,8 +755,8 @@ class RaggedTensor(object):
 
       ```python
       >>> rt = ragged.constant([[[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]]])
-      >>> for i, splits in enumerate(ragged.nested_row_splits(rt)):
-      ...   print('Splits for dimension %d: %s' % (i+1, splits.eval()))
+      >>> for i, splits in enumerate(rt.nested_row_splits()):
+      ...   print('Splits for dimension %d: %s' % (i+1, splits))
       Splits for dimension 1: [0, 1]
       Splits for dimension 2: [0, 3, 3, 5]
       Splits for dimension 3: [0, 4, 4, 7, 8, 8]
@@ -427,38 +770,220 @@ class RaggedTensor(object):
       rt_values = rt_values.values
     return tuple(rt_nested_splits)
 
-  @property
-  def cached_value_rowids(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+  def value_rowids(self, name=None):
+    """Returns the row indices for the `values` in this ragged tensor.
+
+    `rt.value_rowids()` corresponds one-to-one with the outermost dimension of
+    `rt.values`, and specifies the row containing each value.  In particular,
+    the row `rt[row]` consists of the values `rt.values[j]` where
+    `rt.value_rowids()[j] == row`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `value_rowids` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+      A 1-D `int64` `Tensor` with shape `self.values.shape[:1]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.value_rowids()
+      tf.Tensor([0, 0, 0, 0, 2, 2, 2, 3])  # corresponds 1:1 with rt.values
+      ```
     """
-    return self._cached_value_rowids
+    if self._cached_value_rowids is not None:
+      return self._cached_value_rowids
 
-  @property
-  def cached_nrows(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+    with ops.name_scope(name, "RaggedValueRowIds", [self]):
+      return segment_id_ops.row_splits_to_segment_ids(self.row_splits)
+
+  def nrows(self, out_type=dtypes.int64, name=None):
+    """Returns the number of rows in this ragged tensor.
+
+    I.e., the size of the outermost dimension of the tensor.
+
+    Args:
+      out_type: `dtype` for the returned tensor.
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `nrows` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_value_rowids`](from_value_rowids.md); or `None` otherwise.
+      A scalar `Tensor` with dtype `out_type`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.nrows()  # rt has 5 rows.
+      5
+      ```
     """
-    return self._cached_nrows
+    if self._cached_nrows is not None:
+      return self._cached_nrows
 
-  @property
-  def cached_row_lengths(self):
-    """The row lengths for this `RaggedTensor`, or `None`.
+    with ops.name_scope(name, "RaggedNRows", [self]):
+      return array_ops.shape(self.row_splits, out_type=out_type)[0] - 1
+
+  def row_starts(self, name=None):
+    """Returns the start indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row begin in
+    `self.values`.  `rt.row_starts()` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_starts()  # indices of row starts in rt.values
+      tf.Tensor([0, 4, 4, 7, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowStarts", [self]):
+      return self.row_splits[:-1]
+
+  def row_limits(self, name=None):
+    """Returns the limit indices for rows in this ragged tensor.
+
+    These indices specify where the values for each row end in
+    `self.values`.  `rt.row_limits(self)` is equal to `rt.row_splits[:-1]`.
+
+    Args:
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A 1-D Tensor of int64 with shape `[nrows]`.
+      The returned tensor is nonnegative, and is sorted in ascending order.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      >>> rt.values
+      tf.Tensor([3, 1, 4, 1, 5, 9, 2, 6])
+      >>> rt.row_limits()  # indices of row limits in rt.values
+      tf.Tensor([4, 4, 7, 8, 8])
+      ```
+    """
+    with ops.name_scope(name, "RaggedRowLimits", [self]):
+      return self.row_splits[1:]
+
+  def row_lengths(self, axis=1, name=None):
+    """Returns the lengths of the rows in this ragged tensor.
+
+    `rt.row_lengths()[i]` indicates the number of values in the
+    `i`th row of `rt`.
+
+    Args:
+      axis: An integer constant indicating the axis whose row lengths should be
+        returned.
+      name: A name prefix for the returned tensor (optional).
+
+    Returns:
+      A potentially ragged Tensor of int64 with shape `self.shape[:axis]`.
+
+    Raises:
+      ValueError: If `axis` is out of bounds.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[[3, 1, 4], [1]], [], [[5, 9], [2]], [[6]], []])
+      >>> rt.row_lengths(rt)  # lengths of rows in rt
+      tf.Tensor([2, 0, 2, 1, 0])
+      >>> rt.row_lengths(axis=2)  # lengths of axis=2 rows.
+      <tf.RaggedTensor [[3, 1], [], [2, 1], [1], []]>
+      ```
+    """
+    if self._cached_row_lengths is not None:
+      return self._cached_row_lengths
+
+    with ops.name_scope(name, "RaggedRowLengths", [self]):
+      axis = ragged_util.get_positive_axis(axis, self.shape.ndims)
+      if axis == 0:
+        return self.nrows()
+      elif axis == 1:
+        splits = self.row_splits
+        return splits[1:] - splits[:-1]
+      elif isinstance(self.values, RaggedTensor):
+        return self.with_values(self.values.row_lengths(axis - 1))
+      else:
+        shape = array_ops.shape(self.values, out_type=dtypes.int64)
+        return self.with_values(
+            array_ops.ones(shape[:axis - 1], dtypes.int64) * shape[axis - 1])
+
+  def nested_row_lengths(self, name=None):
+    """Returns a tuple containing the row_lengths for all ragged dimensions.
+
+    `rtnested_row_lengths()` is a tuple containing the `row_lengths` tensors for
+    all ragged dimensions in `rt`, ordered from outermost to innermost.
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `tuple` of 1-D `int64` `Tensors`.  The length of the tuple is equal to
+      `self.ragged_rank`.
+    """
+    with ops.name_scope(name, "RaggedNestedRowLengths", [self]):
+      rt_nested_row_lengths = []
+      rt = self
+      while isinstance(rt, RaggedTensor):
+        rt_nested_row_lengths.append(rt.row_lengths())
+        rt = rt.values
+      return tuple(rt_nested_row_lengths)
+
+  def bounding_shape(self, axis=None, name=None):
+    """Returns the tight bounding box shape for this `RaggedTensor`.
+
+    Args:
+      axis: An integer scalar or vector indicating which axes to return the
+        bounding box for.  If not specified, then the full bounding box is
+        returned.
+      name: A name prefix for the returned tensor (optional).
 
     Returns:
-      The `row_lengths` tensor that was used to construct this `RaggedTensor`
-      if it was constructed using
-      [`ragged.from_row_lengths`](from_row_lengths.md); or `None` otherwise.
+      An int64 `Tensor`.  If `axis` is not specified, then `output`
+      is a vector with `output.shape=[self.shape.ndims]`.  If `axis` is a
+      scalar, then the `output` is a scalar.  If `axis` is a vector, then
+      `output` is a vector, where `output[i]` is the bounding size for
+      dimension `axis[i]`.
+
+    #### Example:
+      ```python
+      >>> rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
+      >>> rt.bounding_shape()
+      [5, 4]
+      ```
     """
-    return self._cached_row_lengths
+    with ops.name_scope(name, "RaggedBoundingBox", [self, axis]):
+      nested_splits = self.nested_row_splits
+      rt_flat_values = self.flat_values
+
+      # Optimized special cases for when axis=0 or axis=1:
+      if isinstance(axis, int):
+        if axis == 0:
+          return array_ops.shape(nested_splits[0], out_type=dtypes.int64)[0] - 1
+        elif axis == 1:
+          return math_ops.maximum(math_ops.reduce_max(self.row_lengths()), 0)
+
+      splits_shape = array_ops.shape(self.row_splits, out_type=dtypes.int64)
+      flat_values_shape = array_ops.shape(rt_flat_values, out_type=dtypes.int64)
+
+      ragged_dimensions = array_ops.stack([splits_shape[0] - 1] + [
+          math_ops.maximum(math_ops.reduce_max(splits[1:] - splits[:-1]), 0)
+          for splits in nested_splits
+      ])
+      inner_dimensions = flat_values_shape[1:]
+
+      bbox = array_ops.concat([ragged_dimensions, inner_dimensions], axis=0)
+      return bbox if axis is None else array_ops.gather(bbox, axis)
 
   #=============================================================================
   # Transformation
@@ -480,7 +1005,7 @@ class RaggedTensor(object):
       `result.ragged_rank = 1 + new_values.ragged_rank`
     """
     new_values.shape.with_rank_at_least(1)
-    self.values.shape[0].assert_is_compatible_with(new_values.shape[0])
+    self.values.shape[:1].assert_is_compatible_with(new_values.shape[:1])
     return RaggedTensor(
         new_values,
         self._row_splits,
@@ -489,16 +1014,16 @@ class RaggedTensor(object):
         self._cached_nrows,
         internal=True)
 
-  def with_inner_values(self, new_values):
-    """Returns a copy of `self` with `inner_values` replaced by `new_value`.
+  def with_flat_values(self, new_values):
+    """Returns a copy of `self` with `flat_values` replaced by `new_value`.
 
     Preserves cached row-partitioning tensors such as `self.cached_nrows` and
     `self.cached_value_rowids` if they have values.
 
     Args:
       new_values: Potentially ragged tensor that should replace
-      `self.inner_values`.  Must have `rank > 0`, and must have the same
-      number of rows as `self.inner_values`.
+      `self.flat_values`.  Must have `rank > 0`, and must have the same
+      number of rows as `self.flat_values`.
 
     Returns:
       A `RaggedTensor`.
@@ -508,46 +1033,369 @@ class RaggedTensor(object):
     if isinstance(self._values, ops.Tensor):
       return self.with_values(new_values)
     else:
-      return self.with_values(self.values.with_inner_values(new_values))
+      return self.with_values(self.values.with_flat_values(new_values))
+
+  #=============================================================================
+  # Tensor Type Conversions
+  #=============================================================================
+
+  @classmethod
+  def from_tensor(cls,
+                  tensor,
+                  lengths=None,
+                  padding=None,
+                  ragged_rank=1,
+                  name=None):
+    """Converts a `tf.Tensor` into a `RaggedTensor`.
+
+    The set of absent/default values may be specified using a vector of lengths
+    or a padding value (but not both).  If `lengths` is specified, then the
+    output tensor will satisfy `output[row] = tensor[row][:lengths[row]]`.
+    If `padding` is specified, then any row *suffix* consisting entirely of
+    `padding` will be excluded from the returned `RaggedTensor`.  If neither
+    `lengths` nor `padding` is specified, then the returned `RaggedTensor` will
+    have no absent/default values.
+
+    Examples:
+
+    ```python
+    >>> dt = tf.constant([[5, 7, 0], [0, 3, 0], [6, 0, 0]])
+    >>> tf.RaggedTensor.from_tensor(dt)
+    <tf.RaggedTensor [[5, 7, 0], [0, 3, 0], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, lengths=[2, 0, 3])
+    <tf.RaggedTensor [[5, 7], [], [6, 0, 0]]>
+    >>> tf.RaggedTensor.from_tensor(dt, padding=0)
+    <tf.RaggedTensor [[5, 7], [0, 3], [6]]>
+    ```
+
+    Args:
+      tensor: The `Tensor` to convert.  Must have rank `ragged_rank + 1` or
+        higher.
+      lengths: An optional set of row lengths, specified using a 1-D integer
+        `Tensor` whose length is equal to `tensor.shape[0]` (the number of rows
+        in `tensor`).  If specified, then `output[row]` will contain
+        `tensor[row][:lengths[row]]`.  Negative lengths are treated as zero.
+      padding: An optional padding value.  If specified, then any row suffix
+        consisting entirely of `padding` will be excluded from the returned
+        RaggedTensor.  `padding` is a `Tensor` with the same dtype as `tensor`
+        and with `shape=tensor.shape[ragged_rank + 1:]`.
+      ragged_rank: Integer specifying the ragged rank for the returned
+        `RaggedTensor`.  Must be greater than zero.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the specified `ragged_rank`.  The shape of the
+      returned ragged tensor is compatible with the shape of `tensor`.
+    Raises:
+      ValueError: If both `lengths` and `padding` are specified.
+    """
+    if lengths is not None and padding is not None:
+      raise ValueError("Specify lengths or padding, but not both")
+    if not isinstance(ragged_rank, int):
+      raise TypeError("ragged_rank expected int, got %r" % ragged_rank)
+    if ragged_rank <= 0:
+      raise ValueError(
+          "ragged_rank must be greater than 0; got %s" % ragged_rank)
+
+    with ops.name_scope(name, "RaggedFromTensor", [tensor, lengths, padding]):
+      tensor = ops.convert_to_tensor(tensor, name="tensor")
+      tensor.shape.with_rank_at_least(ragged_rank + 1)
+      input_shape = array_ops.shape(tensor, out_type=dtypes.int64)
+      ncols = input_shape[1]
+
+      # Handle ragged_rank>1 via recursion:
+      # If the output should have multiple ragged dimensions, then first
+      # flatten the tensor to eliminate all but the last ragged dimension,
+      # and recursively convert that flattened tensor.  Then add on the splits
+      # for the dimensions that we flattened out.
+      if ragged_rank > 1:
+        # Flatten `tensor` to eliminate all but the last ragged dimension.
+        new_shape = array_ops.concat([
+            constant_op.constant([-1], dtypes.int64), input_shape[ragged_rank:]
+        ],
+                                     axis=0)
+        flattened = array_ops.reshape(tensor, new_shape)
+        # Recursively convert the flattened tensor.
+        values = cls.from_tensor(flattened, lengths, padding)
+        # The total number of elements in each  dimension.  E.g., if
+        # input_shape=[3, 4, 5, 6], then dim[2] has 3*4*5 elements in total.
+        dim_size = math_ops.cumprod(input_shape)
+        # Construct splits tensors for the dimensions that were flattened.
+        new_splits = [
+            math_ops.range(0, dim_size[dim - 1] + 1) * input_shape[dim]
+            for dim in range(1, ragged_rank)
+        ]
+        return cls.from_nested_row_splits(values, new_splits)
+
+      # If padding was specified, then use it to find row lengths.
+      if padding is not None:
+        padding = ops.convert_to_tensor(
+            padding, name="padding", dtype=tensor.dtype)
+        padding.shape.assert_is_compatible_with(tensor.shape[2:])
+
+        # Find places where the padding is equal to the tensor.  (This will
+        # broadcast `padding` across the outermost 2 dimensions of `tensor`,
+        # so `has_default_value.shape = tensor.shape`.)
+        has_default_value = math_ops.equal(padding, tensor)
+
+        # If the padding isn't a scalar, then require that all values in the
+        # padding match each item in the tensor.  After this block of code,
+        # `has_default.shape = tensor.shape[:2]`.  (Unfortunately, we can't just
+        # use reduce_all for both cases, becaue when you pass an empty `axis`
+        # list to reduce_all, it reduces all axes; but we want it to reduce no
+        # axes -- i.e., to be a no-op.)
+        tensor_rank = array_ops.rank(tensor)
+        reduce_axis = math_ops.range(2, tensor_rank)
+        has_default = control_flow_ops.cond(
+            tensor_rank > 2,
+            lambda: math_ops.reduce_all(has_default_value, axis=reduce_axis),
+            lambda: has_default_value)
+        has_default.set_shape(tensor_shape.TensorShape([None, None]))
+        has_default.set_shape(tensor.shape[:2])
+
+        # Use has_default it to find the length of each row: for each
+        # non-default item in a row, calculate the length that the row needs to
+        # have to include that item; and then take the max of those values
+        # (across each row).
+        has_nondefault = math_ops.logical_not(has_default)
+        has_nondefault = math_ops.cast(has_nondefault, dtypes.int64)
+        length_for_nondefault_value = (
+            has_nondefault * array_ops.expand_dims(
+                math_ops.range(1, ncols + 1), 0))
+        lengths = math_ops.reduce_max(length_for_nondefault_value, axis=1)
+
+      # If we have lengths (either directly supplied, or computed from
+      # paddings), then use those to construct splits; and then use masking
+      # to get the corresponding values.
+      if lengths is not None:
+        lengths = ragged_util.convert_to_int_tensor(lengths, "lengths",
+                                                    dtypes.int64)
+        lengths.shape.assert_has_rank(1)
+        lengths = math_ops.minimum(lengths, ncols)
+        lengths = math_ops.maximum(lengths, 0)
+        limits = math_ops.cumsum(lengths)
+        splits = array_ops.concat([array_ops.zeros([1], dtypes.int64), limits],
+                                  axis=0)
+        mask = array_ops.sequence_mask(lengths, maxlen=ncols)
+        values = array_ops.boolean_mask(tensor, mask)
+        return cls.from_row_splits(values, splits)
+
+      # If neither padding nor lengths were specified, then create a splits
+      # vector that contains no default values, and reshape the input tensor
+      # to form the values for the RaggedTensor.
+      nrows = input_shape[0]
+      nvals = nrows * ncols
+      splits = math_ops.range(nrows + 1) * ncols
+      values_shape = array_ops.concat([[nvals], input_shape[2:]], axis=0)
+      values = array_ops.reshape(tensor, values_shape)
+      return cls.from_row_splits(values, splits)
+
+  def to_tensor(self, default_value=None, name=None):
+    """Converts this `RaggedTensor` into a `tf.Tensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
+    >>> print rt.to_tensor()
+    [[9 8 7]
+     [0 0 0]
+     [6 5 0]
+     [4 0 0]]
+    ```
+
+    Args:
+      default_value: Value to set for indices not specified in `self`. Defaults
+        to zero.  `default_value` must be broadcastable to
+        `self.shape[self.ragged_rank + 1:]`.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `Tensor` with shape `ragged.bounding_shape(self)` and the
+      values specified by the non-empty values in `self`.  Empty values are
+      assigned `default_value`.
+    """
+    with ops.name_scope(name, "RaggedToTensor", [self, default_value]):
+      if default_value is not None:
+        default_value = ops.convert_to_tensor(
+            default_value, name="default_value", dtype=self.dtype)
+
+      # If ragged_rank > 1, then recursively convert the ragged values into a
+      # `Tensor` before we proceed.
+      values = self.values
+      if is_ragged(values):
+        values = values.to_tensor(default_value)
+
+      # Tile the default value, if necessary.
+      if default_value is not None:
+        if values.shape.ndims is not None:
+          default_value.shape.with_rank_at_most(values.shape.ndims - 1)
+        if (values.shape.ndims is None or default_value.shape.ndims is None or
+            values.shape.ndims != default_value.shape.ndims + 1):
+          value_shape = array_ops.shape(values)[1:]
+          default_value = array_ops.broadcast_to(default_value, value_shape)
+        default_value.shape.assert_is_compatible_with(values.shape[1:])
+
+      # Get the expected dense shape ([nrows, ncols] + value_shape).
+      rt_row_lengths = [self.row_splits[1:] - self.row_splits[:-1]]
+      nrows = array_ops.shape(self.row_splits, out_type=dtypes.int64)[0] - 1
+      ncols = math_ops.maximum(math_ops.reduce_max(rt_row_lengths), 0)
+      values_shape = array_ops.shape(values, out_type=dtypes.int64)
+      value_shape = values_shape[1:]
+      nvals = values_shape[0]
+
+      # Build a default value if none was supplied.
+      if default_value is None:
+        default_value = array_ops.zeros(value_shape, dtype=values.dtype)
+      default_value.shape.assert_is_compatible_with(values.shape[1:])
+      default_value.set_shape(values.shape[1:])
+
+      # Get the row start indices, and expand to shape=[nrows, 1].
+      starts = array_ops.expand_dims(self.row_splits[:-1], 1)
+
+      # Get the row limit indices, and expand to shape=[nrows, 1].
+      limits = array_ops.expand_dims(self.row_splits[1:], 1)
+
+      # Get the column indices, and expand to shape=[1, ncols].
+      columns = array_ops.expand_dims(math_ops.range(0, ncols), 0)
+
+      # Build a list containing the values plus the default value.  We will use
+      # tf.gather to collect values from this list for the `Tensor` (using
+      # nvals as the index for the default value).
+      values_and_default = array_ops.concat(
+          [values, array_ops.stack([default_value])], axis=0)
+
+      # Construct a matrix "indices" pointing into values_and_default.  I.e.,
+      # output[r, c] = values_and_default[indices[r, c].
+      nondefault_index = starts + columns
+      has_value = nondefault_index < limits
+      default_index = array_ops.fill(array_ops.stack([nrows, ncols]), nvals)
+      indices = array_ops.where(has_value, nondefault_index, default_index)
+
+      # Gather the results into a `Tensor`.
+      return array_ops.gather(values_and_default, indices)
+
+  @classmethod
+  def from_sparse(cls, st_input, name=None):
+    """Converts a 2D `tf.SparseTensor` to a `RaggedTensor`.
+
+    Each row of the `output` `RaggedTensor` will contain the explicit values
+    from the same row in `st_input`.  `st_input` must be ragged-right.  If not
+    it is not ragged-right, then an error will be generated.
+
+    Example:
+
+    ```python
+    >>> st = SparseTensor(indices=[[0, 1], [0, 2], [0, 3], [1, 0], [3, 0]],
+    ...                   values=[1, 2, 3, 4, 5],
+    ...                   dense_shape=[4, 3])
+    >>> rt.RaggedTensor.from_sparse(st).eval().tolist()
+    [[1, 2, 3], [4], [], [5]]
+    ```
+
+    Currently, only two-dimensional `SparseTensors` are supported.
+
+    Args:
+      st_input: The sparse tensor to convert.  Must have rank 2.
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A `RaggedTensor` with the same values as `st_input`.
+      `output.ragged_rank = rank(st_input) - 1`.
+      `output.shape = [st_input.dense_shape[0], None]`.
+    Raises:
+      ValueError: If the number of dimensions in `st_input` is not known
+        statically, or is not two.
+    """
+    if not sparse_tensor.is_sparse(st_input):
+      raise TypeError("Expected SparseTensor, got %s" % type(st_input).__name__)
+    with ops.name_scope(name, "RaggedFromSparse", [st_input]):
+      st_input = sparse_tensor.convert_to_tensor_or_sparse_tensor(
+          st_input, name="st_input")
+
+      if st_input.dense_shape.shape.ndims is None:
+        static_rank_from_dense_shape = None
+      else:
+        static_rank_from_dense_shape = st_input.dense_shape.shape.dims[0].value
+
+      if st_input.indices.shape.ndims is None:
+        static_rank_from_indices = None
+      else:
+        static_rank_from_indices = st_input.indices.shape.dims[1].value
+
+      if static_rank_from_dense_shape != 2 and static_rank_from_indices != 2:
+        raise ValueError("rank(st_input) must be 2")
+
+      with ops.control_dependencies(
+          _assert_sparse_indices_are_ragged_right(st_input.indices)):
+        # Treat sparse row indices as segment ids to generate a splits tensor
+        # thta we can pair with the sparse tensor values.  (Ignore sparse column
+        # indices.)
+        segment_ids = st_input.indices[:, 0]
+        num_segments = st_input.dense_shape[0]
+        return cls.from_value_rowids(st_input.values, segment_ids, num_segments)
+
+  def to_sparse(self, name=None):
+    """Converts this `RaggedTensor` into a `tf.SparseTensor`.
+
+    Example:
+
+    ```python
+    >>> rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
+    >>> rt.to_sparse().eval()
+    SparseTensorValue(indices=[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]],
+                      values=[1, 2, 3, 4, 5, 6],
+                      dense_shape=[4, 3])
+    ```
+
+    Args:
+      name: A name prefix for the returned tensors (optional).
+
+    Returns:
+      A SparseTensor with the same values as `self`.
+    """
+    with ops.name_scope(name, "RaggedToSparse", [self]):
+      result = gen_ragged_conversion_ops.ragged_tensor_to_sparse(
+          self.nested_row_splits, self.flat_values, name=name)
+      return sparse_tensor.SparseTensor(result.sparse_indices,
+                                        result.sparse_values,
+                                        result.sparse_dense_shape)
 
   #=============================================================================
   # String Encoding
   #=============================================================================
   def __str__(self):
     if self._is_eager():
-      return "RaggedTensor(%s)" % self.tolist()
+      return "<tf.RaggedTensor %s>" % self.to_list()
     else:
       return self.__repr__()
 
   def __repr__(self):
-    return "RaggedTensor(values=%s, row_splits=%s)" % (self._values,
-                                                       self._row_splits)
+    return "tf.RaggedTensor(values=%s, row_splits=%s)" % (self._values,
+                                                          self._row_splits)
 
   #=============================================================================
   # Eager Execution Mode
   #=============================================================================
 
-  def tolist(self):
+  def to_list(self):
     """Returns a nested Python `list` with the values for this `RaggedTensor`.
 
-    If a `RaggedTensor` `rt` was constructed in graph execution mode, then
-    `rt.tolist()` is equivalent to `rt.eval().tolist()`.
-
-    If a `RaggedTensor` `rt` was constructed in eager execution mode, then
-    `rt.tolist()` builds the Python list based on `rt`'s `EagerTensor`
-    components.
+    Requires that `rt` was constructed in eager execution mode.
 
     Returns:
       A nested Python `list`.
     """
     if self._is_eager():
-      return self._eager_value().tolist()
+      return self._eager_value().to_list()
     else:
-      return self.eval().tolist()
+      raise ValueError("RaggedTensor.to_list() is only supported in eager "
+                       "mode; in graph mode, evaluate the RaggedTensor first "
+                       "and then use RaggedTensorValue.to_list().")
 
   def _eager_value(self):
     """Returns a RaggedTensorValue for self.  Requires self._is_eager()=true."""
-    value = self.inner_values.numpy()
+    value = self.flat_values.numpy()
     for row_splits in reversed(self.nested_row_splits):
       value = ragged_tensor_value.RaggedTensorValue(value, row_splits.numpy())
     return value
@@ -561,24 +1409,6 @@ class RaggedTensor(object):
       rt = rt.values
     return isinstance(rt, ops.EagerTensor)
 
-  #=============================================================================
-  # Evaluation
-  #=============================================================================
-  def eval(self, feed_dict=None, session=None):  # pylint: disable=redefined-outer-name
-    """Evaluates this ragged tensor in a `Session`.
-
-    Args:
-      feed_dict: A dictionary that maps `Tensor` objects to feed values. See
-        `tf.Session.run` for a description of the valid feed values.
-      session: The `Session` to be used to evaluate this ragged tensor. If none,
-        the default session will be used.
-
-    Returns:
-      A `RaggedTensorValue` object.
-    """
-    return _eval_using_default_session(self, feed_dict,
-                                       self._as_graph_element().graph, session)
-
   #=============================================================================
   # Indexing & Slicing
   #=============================================================================
@@ -612,6 +1442,53 @@ def is_ragged(value):
                     (RaggedTensor, ragged_tensor_value.RaggedTensorValue))
 
 
+#===============================================================================
+# Convert value -> tensor
+#===============================================================================
+def convert_to_tensor_or_ragged_tensor(value,
+                                       dtype=None,
+                                       preferred_dtype=None,
+                                       name=None):
+  """Converts value to a `RaggedTensor` or `Tensor`.
+
+  * If `value` is a `RaggedTensor`, then return it as-is.
+  * If `value` is a `RaggedTensorValue`, return a corresponding constant
+    `RaggedTensor`.
+  * Otherwise, use `convert_to_tensor` to convert `value` to a `Tensor`.
+
+  Args:
+    value: A `RaggedTensor`, a `RaggedTensorValue`, or an object whose type has
+      a registered `Tensor` conversion function.
+    dtype: Optional element type for the returned tensor.  If missing the type
+      is inferred from the type of `value`.
+    preferred_dtype: Optional element type for the returned tensor, used when
+      dtype is None.  This argument has no effect if `value` is already a
+      tensor, or when conversion is not possible.
+    name: Optional name to use if a new `Tensor` is created.
+
+  Returns:
+    A `Tensor` or `RaggedTensor`.
+  """
+  if isinstance(value, RaggedTensor):
+    if dtype and not dtype.is_compatible_with(value.dtype):
+      raise ValueError("Tensor conversion requested dtype %s for "
+                       "RaggedTensor with dtype %s: %r" %
+                       (dtype.name, value.dtype.name, value))
+    return value
+  elif isinstance(value, ragged_tensor_value.RaggedTensorValue):
+    with ops.name_scope(name, "ConvertToTensorOrRaggedTensor", []):
+      flat_values = ops.convert_to_tensor(
+          value=value.flat_values,
+          dtype=dtype,
+          preferred_dtype=preferred_dtype,
+          name="flat_values")
+      return RaggedTensor.from_nested_row_splits(flat_values,
+                                                 value.nested_row_splits)
+  else:
+    return ops.convert_to_tensor(
+        value=value, dtype=dtype, preferred_dtype=preferred_dtype, name=name)
+
+
 #===============================================================================
 # Register RaggedTensor for use with session.run.
 #===============================================================================
@@ -624,18 +1501,18 @@ def _ragged_tensor_value_from_components(components):
 
 
 def _ragged_tensor_session_fetch(rt):
-  components = rt.nested_row_splits + (rt.inner_values,)
+  components = rt.nested_row_splits + (rt.flat_values,)
   return (components, _ragged_tensor_value_from_components)
 
 
 def _ragged_tensor_session_feed(feed_key, feed_val):
-  key_components = feed_key.nested_row_splits + (feed_key.inner_values,)
-  val_components = feed_val.nested_row_splits + (feed_val.inner_values,)
+  key_components = feed_key.nested_row_splits + (feed_key.flat_values,)
+  val_components = feed_val.nested_row_splits + (feed_val.flat_values,)
   return zip(key_components, val_components)
 
 
 def _ragged_tensor_session_feed_for_partial_run(feed_key):
-  return feed_key.nested_row_splits + (feed_key.inner_values,)
+  return feed_key.nested_row_splits + (feed_key.flat_values,)
 
 
 session.register_session_run_conversion_functions(
@@ -643,6 +1520,9 @@ session.register_session_run_conversion_functions(
     _ragged_tensor_session_feed_for_partial_run)
 
 
+#===============================================================================
+# RaggedTensorType
+#===============================================================================
 class RaggedTensorType(object):
   """Encoding of a static type for a `RaggedTensor`.
 
@@ -662,3 +1542,67 @@ class RaggedTensorType(object):
 
   dtype = property(lambda self: self._dtype)
   ragged_rank = property(lambda self: self._ragged_rank)
+
+
+#===============================================================================
+# Helper Functions
+#===============================================================================
+def _assert_sparse_indices_are_ragged_right(indices):
+  """Checks that the given SparseTensor.indices tensor is ragged-right.
+
+  Example: `indices = [[0, 0], [0, 1], [2, 0], [3, 1]]` is not ragged right
+  because the entry `[3, 1]` skips a cell.
+
+  Args:
+    indices: The SparseTensor indices to check.
+
+  Returns:
+    A list of control dependency op tensors.
+  """
+  index_prefix = indices[:, :-1]
+  index_suffix = indices[:, -1]
+
+  # Check whether each index is starting a new row in the innermost dimension
+  # (prefix[i] != prefix[i-1]) or continuing a row (prefix[i] == prefix[i-1]).
+  # (Note: this skips the first index; we will check that separately below.)
+  index_prefix_changed = math_ops.reduce_any(
+      math_ops.not_equal(index_prefix[1:], index_prefix[:-1]), axis=1)
+
+  # Check two cases:
+  #   * For indices that start a new row: index_suffix[i] must be zero.
+  #   * For indices that continue a row: index_suffix[i] must be equal to
+  #     index_suffix[i-1]+1.
+  index_ok = array_ops.where(
+      index_prefix_changed, math_ops.equal(index_suffix[1:], 0),
+      math_ops.equal(index_suffix[1:], index_suffix[:-1] + 1))
+
+  # Also check that the very first index didn't skip any cells.  The first
+  # index starts a new row (by definition), so its suffix should be zero.
+  sparse_indices_are_ragged_right = math_ops.logical_and(
+      math_ops.reduce_all(math_ops.equal(index_suffix[:1], 0)),
+      math_ops.reduce_all(index_ok))
+
+  message = [
+      "SparseTensor is not right-ragged", "SparseTensor.indices =", indices
+  ]
+  return [control_flow_ops.Assert(sparse_indices_are_ragged_right, message)]
+
+
+@ops.RegisterGradient("RaggedTensorToSparse")
+def _ragged_tensor_to_sparse_gradient(op, unused_sparse_indices_grad,
+                                      sparse_values_grad,
+                                      unused_sparse_shape_grad):
+  """Gradient for RaggedTensorToSparse."""
+  op_inputs_nested_row_splits = op.inputs[:-1]
+  op_inputs_flat_values = op.inputs[-1]
+
+  # No gradient for the RaggedTensor's nested_row_splits.
+  nested_row_splits_gradient = [None] * len(op_inputs_nested_row_splits)
+
+  # Gradient for the RaggedTensor's flat_values is formed by reshaping
+  # the gradient for the SparseTensor's values.
+  flat_values_shape = array_ops.shape(op_inputs_flat_values)
+  flat_values_gradient = array_ops.reshape(sparse_values_grad,
+                                           flat_values_shape)
+
+  return nested_row_splits_gradient + [flat_values_gradient]
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
index a1c10aff9de5c961962cc2227442789cc9f7e9b0..4e6ebdf332e6f53b7a3af5679af1cbf27ec9f792 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_bounding_shape_op_test.py
@@ -20,49 +20,42 @@ from __future__ import print_function
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorBoundingShapeOp(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     # This is the example from ragged.bounding_shape.__doc__.
     rt = ragged.constant([[1, 2, 3, 4], [5], [], [6, 7, 8, 9], [10]])
-    with self.test_session():
-      self.assertEqual(ragged.bounding_shape(rt).eval().tolist(), [5, 4])
+    self.assertRaggedEqual(rt.bounding_shape(), [5, 4])
 
   def test2DRaggedTensorWithOneRaggedDimension(self):
     values = ['a', 'b', 'c', 'd', 'e', 'f', 'g']
-    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.from_row_splits(values, [0, 7])
-    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
-    with self.test_session():
-      self.assertEqual(ragged.bounding_shape(rt1).eval().tolist(), [5, 3])
-      self.assertEqual(ragged.bounding_shape(rt2).eval().tolist(), [1, 7])
-      self.assertEqual(ragged.bounding_shape(rt3).eval().tolist(), [3, 7])
+    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7])
 
   def test3DRaggedTensorWithOneRaggedDimension(self):
     values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
-    rt1 = ragged.from_row_splits(values, [0, 2, 5, 6, 6, 7])
-    rt2 = ragged.from_row_splits(values, [0, 7])
-    rt3 = ragged.from_row_splits(values, [0, 0, 7, 7])
-    with self.test_session():
-      self.assertEqual(ragged.bounding_shape(rt1).eval().tolist(), [5, 3, 2])
-      self.assertEqual(ragged.bounding_shape(rt2).eval().tolist(), [1, 7, 2])
-      self.assertEqual(ragged.bounding_shape(rt3).eval().tolist(), [3, 7, 2])
-
-  def testNonRaggedTensor(self):
-    dt = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]
-    with self.test_session():
-      self.assertEqual(ragged.bounding_shape(dt).eval().tolist(), [4, 3])
+    rt1 = ragged.RaggedTensor.from_row_splits(values, [0, 2, 5, 6, 6, 7])
+    rt2 = ragged.RaggedTensor.from_row_splits(values, [0, 7])
+    rt3 = ragged.RaggedTensor.from_row_splits(values, [0, 0, 7, 7])
+    self.assertRaggedEqual(rt1.bounding_shape(), [5, 3, 2])
+    self.assertRaggedEqual(rt2.bounding_shape(), [1, 7, 2])
+    self.assertRaggedEqual(rt3.bounding_shape(), [3, 7, 2])
 
   def testExplicitAxisOptimizations(self):
-    rt = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
-    with self.test_session():
-      self.assertEqual(ragged.bounding_shape(rt, 0).eval().tolist(), 5)
-      self.assertEqual(ragged.bounding_shape(rt, 1).eval().tolist(), 3)
-      self.assertEqual(
-          ragged.bounding_shape(rt, [1, 0]).eval().tolist(), [3, 5])
+    rt = ragged.RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                             [0, 2, 5, 6, 6, 7])
+    self.assertRaggedEqual(rt.bounding_shape(0), 5)
+    self.assertRaggedEqual(rt.bounding_shape(1), 3)
+    self.assertRaggedEqual(rt.bounding_shape([1, 0]), [3, 5])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape.py b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..706881da74a46137171d4d4771b82e652d4ad4c8
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape.py
@@ -0,0 +1,572 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Shapes & broadcasting for RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops.ragged import ragged_array_ops
+from tensorflow.python.ops.ragged import ragged_conversion_ops
+from tensorflow.python.ops.ragged import ragged_tensor
+from tensorflow.python.ops.ragged import ragged_util
+
+
+class RaggedTensorDynamicShape(object):
+  """A collection of tensors encoding the shape of a potentially ragged tensor.
+
+  Each `RaggedTensorDynamicShape` consists of an ordered list of dimension
+  sizes.  There are two dimension types:
+
+    * "Uniform dimensions" are dimenisons where all slices have the same
+      length.  `RaggedTensorDynamicShape` records the size of each uniform
+      dimension using a single scalar integer.
+
+    * "Ragged dimensions" are dimensions whose slices may have different
+      lengths.  `RaggedTensorDynamicShape` records the size of each ragged
+      dimension using an integer vector containing the slice lengths for all
+      the slices across that dimension.
+
+  Furthermore, there are two ways a dimension might be encoded:
+
+    * "Partitioned dimensions" are dimensions that are encoded using a
+      `RaggedTensor`'s `nested_row_splits`.  The outermostmost partitioned
+      dimension must be uniform, and the innermost partitioned dimension must
+      be ragged.
+
+    * "Inner dimensions" are dimensions that are encoded using a
+      `RaggedTensor`'s `flat_values`.  Inner dimensions are always uniform.
+
+  The sizes of partitioned dimensions are recorded using `partitioned_dim_sizes`
+  and `inner_dim_sizes`:
+
+    * `paritioned_dim_sizes` is a list of tensors (one for each partitioned
+      dimension).
+
+      * For uniform dimensions, the tensor is an integer scalar specifying the
+        size of all slices across that dimension.
+      * For ragged dimensions, the tensor is an integer vector specifying the
+        size of each slice across that dimension.
+
+    * `inner_dim_sizes` is a single integer vector, where each element
+      specifies the size of a single inner dimension.
+
+  Examples:
+
+  Tensor                         | Ragged | Partitioned Dim Sizes  | Inner Dim
+                                 : Rank   :                        : Sizes
+  ------------------------------ | ------ | ---------------------- | ----------
+  `[[1, 2, 3], [4, 5, 6]]`       |      0 |                        | `2, 3`
+  `[[1, 2], [], [3, 4, 5]]`      |      1 | `3, (2, 0, 3)`         |
+  `[[[1, 2], [3, 4]], [[5, 6]]]` |      1 | `2, (2, 1)`            | 2
+  `[[[1, 2], [3]], [[4, 5]]]`    |      2 | `2, (2, 1), (2, 1, 2)` |
+  """
+
+  def __init__(self, partitioned_dim_sizes, inner_dim_sizes):
+    """Creates a RaggedTensorDynamicShape.
+
+    Args:
+      partitioned_dim_sizes: A `list` of 0-D or 1-D integer `Tensor`, one for
+        each partitioned dimension.  If dimension `d` is uniform, then
+        `partitioned_dim_sizes[d]` must be an integer scalar, specifying the
+        size of all slices across dimension `d`.  If dimension `d` is ragged,
+        then `partitioned_dim_sizes[d]` must be an integer vector, specifying
+        the size of each slice across dimension `d`.
+      inner_dim_sizes: A 1-D integer `Tensor`, whose length is equal to the
+        number of inner dimensions.  `inner_dim_sizes[n]` is the size of all
+        slices across the `n`th inner dimension (which is the
+        `(len(partitioned_dim_sizes)+n)`th dimension in the overall tensor.
+    """
+    assert isinstance(partitioned_dim_sizes, (list, tuple))
+    with ops.name_scope(None, 'RaggedTensorDynamicShape',
+                        (partitioned_dim_sizes, inner_dim_sizes)):
+      partitioned_dim_sizes = tuple(
+          ragged_util.convert_to_int_tensor(
+              size, dtype=dtypes.int64, name='partitioned_dimension_size')
+          for size in partitioned_dim_sizes)
+      inner_dim_sizes = ragged_util.convert_to_int_tensor(
+          inner_dim_sizes, dtype=dtypes.int64, name='inner_dim_sizes')
+
+      # Validate shapes.
+      if partitioned_dim_sizes:
+        for axis, dimension_size in enumerate(partitioned_dim_sizes):
+          if dimension_size.shape.ndims is None:
+            raise ValueError(
+                'rank of partitioned_dim_sizes[%d] is unknown' % axis)
+          dimension_size.shape.with_rank_at_most(1)
+        if partitioned_dim_sizes[0].shape.ndims == 1:
+          raise ValueError('outermost partitioned dimension must be uniform')
+        if partitioned_dim_sizes[-1].shape.ndims == 0:
+          raise ValueError('innermost partitioned dimension must be ragged')
+      inner_dim_sizes.shape.assert_has_rank(1)
+
+      self._partitioned_dim_sizes = partitioned_dim_sizes
+      self._inner_dim_sizes = inner_dim_sizes
+
+  def __repr__(self):
+    return ('RaggedTensorDynamicShape'
+            '(partitioned_dim_sizes=%r, inner_dim_sizes=%r)' %
+            (self._partitioned_dim_sizes, self._inner_dim_sizes))
+
+  @staticmethod
+  def from_dim_sizes(dim_sizes):
+    """Constructs a ragged shape from a list of dimension sizes.
+
+    This list contains a single tensor for each dimension, where the tensor
+    is a scalar if the dimension is uniform, or a vector if the dimension is
+    ragged.
+
+    Args:
+      dim_sizes: List of int64 scalars or vectors.
+
+    Returns:
+      A RaggedTensorDynamicShape.
+    """
+    with ops.name_scope(None, 'RaggedTensorDynamicShapeFromDimensionSizes',
+                        [dim_sizes]):
+      dim_sizes = tuple(
+          ragged_util.convert_to_int_tensor(
+              size, dtype=dtypes.int64, name='dim_sizes') for size in dim_sizes)
+      # Split the dimensions into partitioned & inner dimensions.
+      inner_split = 0
+      for dim, dim_size in enumerate(dim_sizes):
+        if dim_size.shape.ndims == 1:
+          inner_split = dim + 1
+        elif dim_size.shape.ndims != 0:
+          raise ValueError('Each dim_size must be a scalar or a vector')
+      return RaggedTensorDynamicShape(dim_sizes[:inner_split],
+                                      dim_sizes[inner_split:])
+
+  @classmethod
+  def from_tensor(cls, rt_input):
+    """Constructs a ragged shape for a potentially ragged tensor."""
+    with ops.name_scope(None, 'RaggedTensorDynamicShapeFromTensor', [rt_input]):
+      rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
+      if not ragged_tensor.is_ragged(rt_input):
+        return cls([], array_ops.shape(rt_input))
+      else:
+        partitioned_dim_sizes = (
+            (rt_input.nrows(),) + rt_input.nested_row_lengths())
+        return RaggedTensorDynamicShape(
+            partitioned_dim_sizes,
+            array_ops.shape(rt_input.flat_values)[1:])
+
+  def dimension_size(self, axis):
+    """Returns the size of slices across the specified dimension."""
+    if not isinstance(axis, int):
+      raise TypeError('axis must be an integer')
+    partitioned_ndims = len(self._partitioned_dim_sizes)
+    if axis < partitioned_ndims:
+      return self._partitioned_dim_sizes[axis]
+    else:
+      return self._inner_dim_sizes[axis - partitioned_ndims]
+
+  def is_ragged(self, axis):
+    """Returns true if the indicated dimension is ragged."""
+    if not isinstance(axis, int):
+      raise TypeError('axis must be an integer')
+    rank = self.rank
+    if axis < 0:
+      raise ValueError('Negative axis values are not supported')
+    elif rank is not None and axis >= rank:
+      raise ValueError('Expected axis=%s < rank=%s' % (axis, rank))
+    else:
+      return (axis > 0 and axis < len(self._partitioned_dim_sizes) and
+              self._partitioned_dim_sizes[axis].shape.ndims == 1)
+
+  @property
+  def rank(self):
+    """The number of dimensions in this shape, or None if unknown."""
+    inner_ndims = tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
+    if inner_ndims is None:
+      return None
+    else:
+      return len(self._partitioned_dim_sizes) + inner_ndims
+
+  @property
+  def partitioned_dim_sizes(self):
+    """The partitioned dimension sizes for this shape.
+
+    Returns:
+      A `list` of 0-D or 1-D integer `Tensor`.
+    """
+    return self._partitioned_dim_sizes
+
+  @property
+  def inner_dim_sizes(self):
+    """The inner dimension sizes for this shape.
+
+    Returns:
+      A 1-D integer `Tensor`.
+    """
+    return self._inner_dim_sizes
+
+  @property
+  def num_partitioned_dimensions(self):
+    """The number of partitioned dimensions in this shape."""
+    return len(self._partitioned_dim_sizes)
+
+  @property
+  def num_inner_dimensions(self):
+    """The number of inner dimensions, or `None` if not statically known."""
+    return tensor_shape.dimension_value(self._inner_dim_sizes.shape[0])
+
+  def broadcast_to_rank(self, rank):
+    """Adds leading size-1 dimensions to broadcast `self` to the given rank.
+
+    E.g., if `shape1` is `[3, (D2), 4]`, then `shape1.broadcast_to_rank(5)`
+    is `[1, 1, 3, (D2), 4]`.
+
+    Args:
+      rank: The rank for the returned shape.
+
+    Returns:
+      A RaggedTensorDynamicShape with `rank` dimensions, whose inner dimensions
+      have the same size as `self` and whose outer dimensions have size `1`.
+
+    Raises:
+      ValueError: If `self.rank` is unknown or greater than `rank`.
+    """
+    if self.rank is None:
+      raise ValueError('Unable to broadcast: self.rank is unknown')
+    dims_to_add = rank - self.rank
+    if dims_to_add < 0:
+      raise ValueError('Unable to broadcast: rank=%d must be greater than '
+                       'self.rank=%d.' % (rank, self.rank))
+    elif dims_to_add == 0:
+      return self
+    elif self._partitioned_dim_sizes:
+      partitioned_dims = (1,) * dims_to_add + self._partitioned_dim_sizes
+      return RaggedTensorDynamicShape(partitioned_dims, self._inner_dim_sizes)
+    else:
+      inner_dims = array_ops.concat(
+          [array_ops.ones([dims_to_add], dtypes.int64), self.inner_dim_sizes],
+          axis=0)
+      return RaggedTensorDynamicShape([], inner_dims)
+
+  def broadcast_dimension(self, axis, lengths):
+    """Returns a shape that is broadcast-compatible with self & lengths.
+
+    * If dimension[axis] is uniform and lengths is a scalar, the check
+      that either lengths==1 or axis==1 or lengths==axis, and tile
+      dimension[axis] with tf.where(lengths==axis, 1, axis) repeats.
+
+    * If dimension[axis] is uniform and lengths is a vector, then check
+      that dimension[axis]==1, and raggedly tile dimension[axis] with
+      lengths repeats.  (we can skip tiling if we statically know that
+      slice_lengths == 1??)
+
+    * If dimension[axis] is ragged and lengths is a scalar, then check
+      that lengths==1.
+
+    * If dimension[axis] is ragged and lengths is a vector, then check
+      that self.dimension_size(axis) == lengths.
+
+    Args:
+      axis: `int`.  The dimension to broadcast.
+      lengths: 0-D or 1-D integer `Tensor`.
+
+    Returns:
+      A `RaggedTensorDynamicShape`.
+    """
+    lengths = ragged_util.convert_to_int_tensor(
+        lengths, name='lengths', dtype=dtypes.int64)
+    # Check whether lengths is a scalar (for uniform dimensions) or
+    # vector (for ragged dimensions).
+    if lengths.shape.ndims is None:
+      raise ValueError('lengths must have a known rank.')
+    elif lengths.shape.ndims > 1:
+      raise ValueError('lengths must be a scalar or vector')
+    else:
+      lengths_is_scalar = (lengths.shape.ndims == 0)
+
+    # Verify that the shapes are compatible.
+    if self.is_ragged(axis):
+      if lengths_is_scalar:
+        condition = math_ops.equal(lengths, 1)
+      else:
+        condition = math_ops.reduce_all(
+            math_ops.equal(lengths, self.dimension_size(axis)))
+    else:
+      axis_dim_size = self.dimension_size(axis)
+      if lengths_is_scalar:
+        condition = (
+            math_ops.equal(lengths, 1) | math_ops.equal(axis_dim_size, 1)
+            | math_ops.equal(axis_dim_size, lengths))
+      else:
+        condition = math_ops.equal(axis_dim_size, 1)
+    broadcast_err = [
+        'Unable to broadcast: dimension size mismatch in dimension', axis,
+        'lengths=', lengths, 'dim_size=',
+        self.dimension_size(axis)
+    ]
+    broadcast_check = control_flow_ops.Assert(
+        condition, data=broadcast_err, summarize=10)
+
+    with ops.control_dependencies([broadcast_check]):
+      # Partitioned dimensions:
+      if axis < self.num_partitioned_dimensions:
+        if self.is_ragged(axis):
+          # Use an identity op to make sure the check actually gets run.
+          return RaggedTensorDynamicShape(
+              self._partitioned_dim_sizes,
+              array_ops.identity(self.inner_dim_sizes))
+        else:
+          return self._broadcast_uniform_partitioned_dimension(axis, lengths)
+
+      # Inner dimensions:
+      else:
+        if lengths_is_scalar:
+          return self._broadcast_inner_dimension_to_uniform(axis, lengths)
+        else:
+          if axis == 0:
+            raise ValueError('Unable to broadcast: '
+                             'outermost dimension must be uniform.')
+          return self._broadcast_inner_dimension_to_ragged(axis, lengths)
+
+  def num_slices_in_dimension(self, axis):
+    """Returns the total number of slices across the indicated dimension."""
+    if axis < 0:
+      return constant_op.constant(1, dtype=dtypes.int64)
+    elif self.is_ragged(axis):
+      return math_ops.reduce_sum(self._partitioned_dim_sizes[axis])
+    else:
+      return self.dimension_size(axis) * self.num_slices_in_dimension(axis - 1)
+
+  def _broadcast_uniform_partitioned_dimension(self, axis, lengths):
+    """Broadcasts the partitioned dimension `axis` to match `lengths`."""
+    axis_dim_size = self.dimension_size(axis)
+    partitioned_sizes = list(self._partitioned_dim_sizes[:axis])
+
+    if lengths.shape.ndims == 0:
+      lengths = array_ops.where(
+          math_ops.equal(axis_dim_size, 1), lengths, axis_dim_size)
+      repeats = array_ops.where(math_ops.equal(axis_dim_size, 1), lengths, 1)
+      splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
+    else:
+      splits = math_ops.range(
+          array_ops.size(lengths, out_type=dtypes.int64) + 1)
+      repeats = lengths
+
+    partitioned_sizes.append(lengths)
+
+    for dim_size in self._partitioned_dim_sizes[axis + 1:]:
+      if dim_size.shape.ndims == 0:
+        partitioned_sizes.append(dim_size)
+        splits *= dim_size
+      else:
+        partitioned_sizes.append(
+            ragged_util.repeat_ranges(dim_size, splits, repeats))
+        splits = array_ops.gather(
+            ragged_util.lengths_to_splits(dim_size), splits)
+    inner_sizes = self._inner_dim_sizes
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+  def _broadcast_inner_dimension_to_uniform(self, axis, length):
+    """Broadcasts the inner dimension `axis` to match `lengths`."""
+    dim_size = self.dimension_size(axis)
+    axis_in_inner_dims = axis - self.num_partitioned_dimensions
+    partitioned_sizes = self._partitioned_dim_sizes
+    inner_sizes = array_ops.concat([
+        self._inner_dim_sizes[:axis_in_inner_dims],
+        [array_ops.where(math_ops.equal(dim_size, 1), length, dim_size)],
+        self._inner_dim_sizes[axis_in_inner_dims + 1:]
+    ],
+                                   axis=0)
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+  def _broadcast_inner_dimension_to_ragged(self, axis, lengths):
+    axis_in_inner_dims = axis - self.num_partitioned_dimensions
+    partitioned_sizes = (
+        self._partitioned_dim_sizes + tuple([
+            self._inner_dim_sizes[i] for i in range(axis_in_inner_dims)
+        ]) + (lengths,))
+    inner_sizes = self._inner_dim_sizes[axis_in_inner_dims + 1:]
+    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
+
+
+def broadcast_dynamic_shape(shape_x, shape_y):
+  """Returns the shape formed by broadcasting two shapes to be compatible.
+
+  Args:
+    shape_x: A `RaggedTensorDynamicShape`
+    shape_y: A `RaggedTensorDynamicShape`
+
+  Returns:
+    A `RaggedTensorDynamicShape`.
+  Raises:
+    ValueError: If `shape_x` and `shape_y` are not broadcast-compatible.
+  """
+  if not isinstance(shape_x, RaggedTensorDynamicShape):
+    raise TypeError('shape_x must be a RaggedTensorDynamicShape')
+  if not isinstance(shape_y, RaggedTensorDynamicShape):
+    raise TypeError('shape_y must be a RaggedTensorDynamicShape')
+
+  # Broadcast both shapes to have the same rank.
+  if shape_x.rank is None or shape_y.rank is None:
+    raise ValueError('Unable to broadcast: unknown rank')
+  broadcast_rank = max(shape_x.rank, shape_y.rank)
+  shape_x = shape_x.broadcast_to_rank(broadcast_rank)
+  shape_y = shape_y.broadcast_to_rank(broadcast_rank)
+
+  # Broadcast dimensions one at a time, starting from the outermost dimension.
+  for axis in range(broadcast_rank):
+    shape_x = shape_x.broadcast_dimension(axis, shape_y.dimension_size(axis))
+    shape_y = shape_y.broadcast_dimension(axis, shape_x.dimension_size(axis))
+
+  return shape_x
+
+
+def broadcast_to(rt_input, shape, broadcast_inner_dimensions=True):
+  """Broadcasts a potentially ragged tensor to a ragged shape.
+
+  Tiles `rt_input` as necessary to match the given shape.
+
+  Behavior is undefined if `rt_input` is not broadcast-compatible with `shape`.
+
+  Args:
+    rt_input: The potentially ragged tensor to broadcast.
+    shape: A `RaggedTensorDynamicShape`
+    broadcast_inner_dimensions: If false, then inner dimensions will not be
+      tiled.
+
+  Returns:
+    A potentially ragged tensor whose values are taken from
+    `rt_input`, and whose shape matches `shape`.
+  """
+  if not isinstance(shape, RaggedTensorDynamicShape):
+    raise TypeError('shape must be a RaggedTensorDynamicShape')
+  rt_input = ragged_tensor.convert_to_tensor_or_ragged_tensor(rt_input)
+
+  # Broadcasting to a uniform shape.
+  if shape.num_partitioned_dimensions == 0:
+    return _broadcast_to_uniform_shape(rt_input, shape,
+                                       broadcast_inner_dimensions)
+  else:
+    return _broadcast_to_ragged_shape(rt_input, shape,
+                                      broadcast_inner_dimensions)
+
+
+def _broadcast_to_uniform_shape(rt_input, shape, broadcast_inner_dimensions):
+  """Broadcasts rt_input to the uniform shape `shape`."""
+  if isinstance(rt_input, ragged_tensor.RaggedTensor):
+    raise ValueError('Incompatible with shape: ragged rank mismatch')
+  if broadcast_inner_dimensions:
+    return array_ops.broadcast_to(rt_input, shape.inner_dim_sizes)
+  else:
+    return rt_input
+
+
+def _broadcast_to_ragged_shape(rt_input, dst_shape, broadcast_inner_dimensions):
+  """Broadcasts rt_input to the ragged shape `dst_shape`."""
+  # dst_shape's rank and ragged_rank must be greater than or equal to rt_input's
+  if rt_input.shape.ndims is None or dst_shape.rank is None:
+    raise ValueError('Unable to broadcast: unknown rank')
+  if rt_input.shape.ndims > dst_shape.rank:
+    raise ValueError('Incompatible with shape: rank mismatch')
+  if (isinstance(rt_input, ragged_tensor.RaggedTensor) and
+      rt_input.ragged_rank >= dst_shape.num_partitioned_dimensions):
+    raise ValueError('Incompatible with shape: ragged rank mismatch')
+
+  src_shape = RaggedTensorDynamicShape.from_tensor(rt_input)
+  src_shape = src_shape.broadcast_to_rank(dst_shape.rank)
+
+  # Add dimensions to rt_input so its rank and ragged_rank matches dst_shape.
+  if dst_shape.rank > rt_input.shape.ndims:
+    if rt_input.shape.ndims < dst_shape.num_inner_dimensions + 1:
+      rt_input = array_ops.reshape(
+          rt_input, array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0))
+    for _ in range(dst_shape.rank - rt_input.shape.ndims):
+      if ragged_tensor.is_ragged(rt_input):
+        nrows = rt_input.nrows()
+      else:
+        nrows = array_ops.shape(rt_input, out_type=dtypes.int64)[0]
+      rt_input = ragged_tensor.RaggedTensor.from_row_lengths(rt_input, [nrows])
+
+  # Add ragged dimensions to match dst_shape.
+  if ragged_tensor.is_ragged(rt_input):
+    inner_rank_diff = (
+        rt_input.flat_values.shape.ndims - 1 - dst_shape.num_inner_dimensions)
+    if inner_rank_diff > 0:
+      rt_input = rt_input.with_flat_values(
+          ragged_conversion_ops.from_tensor(
+              rt_input.flat_values, ragged_rank=inner_rank_diff))
+  else:
+    rt_input = ragged_conversion_ops.from_tensor(
+        rt_input, ragged_rank=dst_shape.num_partitioned_dimensions - 1)
+
+  # Do broadcasting for any dimensions that will remain uniform.  We can do
+  # these all at once, since they're independent of one another.
+  multiples = [1] * dst_shape.rank
+  for axis in range(dst_shape.num_partitioned_dimensions):
+    if not src_shape.is_ragged(axis) and not dst_shape.is_ragged(axis):
+      src_size = src_shape.dimension_size(axis)
+      dst_size = dst_shape.dimension_size(axis)
+      if ((tensor_util.constant_value(src_size) in (1, None)) and
+          (tensor_util.constant_value(dst_size) != 1)):
+        multiples[axis] = array_ops.where(
+            math_ops.equal(src_size, 1), dst_size, 1)
+  if not all(isinstance(v, int) and v == 1 for v in multiples):
+    multiples = array_ops.stack(multiples, axis=0)
+    rt_input = ragged_array_ops.tile(rt_input, multiples)
+
+  if broadcast_inner_dimensions:
+    rt_input = rt_input.with_flat_values(
+        array_ops.reshape(
+            rt_input.flat_values,
+            array_ops.concat([[-1], dst_shape.inner_dim_sizes], axis=0)))
+
+  # Do broadcasting for dimensions that become ragged.  We must do these from
+  # outermost to innermost.
+  for axis in range(dst_shape.num_partitioned_dimensions):
+    if not src_shape.is_ragged(axis) and dst_shape.is_ragged(axis):
+      dst_size = dst_shape.dimension_size(axis)
+      rt_input = _ragged_tile_axis(rt_input, axis, dst_size)
+
+  return rt_input
+
+
+def _ragged_tile_axis(rt_input, axis, repeats):
+  """Tile a dimension of a RaggedTensor to match a ragged shape."""
+  assert axis > 0  # Outermost dimension may not be ragged.
+
+  if not ragged_tensor.is_ragged(rt_input):
+    rt_input = ragged_conversion_ops.from_tensor(rt_input, ragged_rank=1)
+
+  if axis > 1:
+    return rt_input.with_values(
+        _ragged_tile_axis(rt_input.values, axis - 1, repeats))
+  else:
+    src_row_splits = rt_input.nested_row_splits
+    src_row_lengths = rt_input.nested_row_lengths()
+    splits = src_row_splits[0]
+
+    dst_row_lengths = [repeats]
+    for i in range(1, len(src_row_lengths)):
+      dst_row_lengths.append(
+          ragged_util.repeat_ranges(src_row_lengths[i], splits, repeats))
+      splits = array_ops.gather(src_row_splits[i], splits)
+    dst_values = ragged_util.repeat_ranges(rt_input.flat_values, splits,
+                                           repeats)
+    return ragged_tensor.RaggedTensor.from_nested_row_lengths(
+        dst_values, dst_row_lengths)
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec06aeaea546d679d65c7c8d64357393afd3eae2
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_tensor_shape_test.py
@@ -0,0 +1,479 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf.ragged.ragged_tensor_shape."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from absl.testing import parameterized
+import numpy as np
+
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.platform import googletest
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorBoundingShapeOp(ragged_test_util.RaggedTensorTestCase,
+                                  parameterized.TestCase):
+
+  def assertShapeEq(self, x, y):
+    assert isinstance(x, ragged.RaggedTensorDynamicShape)
+    assert isinstance(y, ragged.RaggedTensorDynamicShape)
+    x_partitioned_dim_sizes = [
+        self.eval_to_list(splits)  #
+        for splits in x.partitioned_dim_sizes
+    ]
+    y_partitioned_dim_sizes = [
+        self.eval_to_list(splits)  #
+        for splits in y.partitioned_dim_sizes
+    ]
+    self.assertEqual(x_partitioned_dim_sizes, y_partitioned_dim_sizes)
+    self.assertAllEqual(x.inner_dim_sizes, y.inner_dim_sizes)
+
+  @parameterized.parameters([
+      dict(value='x', expected_dim_sizes=[]),
+      dict(value=['a', 'b', 'c'], expected_dim_sizes=[3]),
+      dict(value=[['a', 'b', 'c'], ['d', 'e', 'f']], expected_dim_sizes=[2, 3]),
+      dict(
+          value=[[['a', 'b', 'c'], ['d', 'e', 'f']]],
+          expected_dim_sizes=[1, 2, 3]),
+      dict(
+          value=ragged.constant_value([['a', 'b', 'c'], ['d', 'e']]),
+          expected_dim_sizes=[2, [3, 2]]),
+      dict(
+          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e']]]),
+          expected_dim_sizes=[1, [2], [3, 2]]),
+      dict(
+          value=ragged.constant_value([[['a', 'b', 'c'], ['d', 'e', 'f']]],
+                                      ragged_rank=1),
+          expected_dim_sizes=[1, [2], 3]),
+      dict(
+          value=ragged.constant_value([[[[1], [2]], [[3], [4]]],
+                                       [[[5], [6]]]], ragged_rank=1),
+          expected_dim_sizes=[2, [2, 1], 2, 1]),
+      dict(
+          value=ragged.constant_value([[10, 20], [30]]),
+          expected_dim_sizes=[2, [2, 1]]),
+      # Docstring examples:
+      dict(value=[[1, 2, 3], [4, 5, 6]], expected_dim_sizes=[2, 3]),
+      dict(
+          value=ragged.constant_value([[1, 2], [], [3, 4, 5]]),
+          expected_dim_sizes=[3, [2, 0, 3]]),
+      dict(
+          value=ragged.constant_value([[[1, 2], [3, 4]], [[5, 6]]],
+                                      ragged_rank=1),
+          expected_dim_sizes=[2, [2, 1], 2]),
+      dict(
+          value=ragged.constant_value([[[1, 2], [3]], [[4, 5]]]),
+          expected_dim_sizes=[2, [2, 1], [2, 1, 2]]),
+  ])
+  def testFromTensor(self, value, expected_dim_sizes):
+    shape = ragged.RaggedTensorDynamicShape.from_tensor(value)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        expected_dim_sizes)
+    self.assertShapeEq(shape, expected)
+
+  @parameterized.parameters([
+      dict(dim_sizes=[], rank=0, expected_dim_sizes=[]),
+      dict(dim_sizes=[], rank=3, expected_dim_sizes=[1, 1, 1]),
+      dict(dim_sizes=[3], rank=1, expected_dim_sizes=[3]),
+      dict(dim_sizes=[3], rank=3, expected_dim_sizes=[1, 1, 3]),
+      dict(dim_sizes=[2, 3], rank=3, expected_dim_sizes=[1, 2, 3]),
+      dict(dim_sizes=[3, [3, 2, 4]], rank=2, expected_dim_sizes=[3, [3, 2, 4]]),
+      dict(
+          dim_sizes=[3, [3, 2, 4]],
+          rank=4,
+          expected_dim_sizes=[1, 1, 3, [3, 2, 4]]),
+      dict(
+          dim_sizes=[3, [3, 2, 4], 2, 3],
+          rank=5,
+          expected_dim_sizes=[1, 3, [3, 2, 4], 2, 3]),
+  ])
+  def testBroadcastToRank(self, dim_sizes, rank, expected_dim_sizes):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        expected_dim_sizes)
+    broadcasted_shape = shape.broadcast_to_rank(rank)
+    self.assertShapeEq(broadcasted_shape, expected)
+    self.assertEqual(broadcasted_shape.rank, rank)
+
+  @parameterized.parameters([
+      #=========================================================================
+      # dimension[axis] is uniform inner; and row_lengths is a scalar
+      #=========================================================================
+      # shape: [BROADCAST(UNIFORM), UNIFORM, UNIFORM]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, 4, 5],
+           broadcast_dim_sizes=[3, 4, 5]),
+
+      # shape: [UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=5,
+           original_dim_sizes=[3, 4, 1],
+           broadcast_dim_sizes=[3, 4, 5]),
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=5,
+           original_dim_sizes=[3, [3, 2, 8], 1],
+           broadcast_dim_sizes=[3, [3, 2, 8], 5]),
+
+      # shape: [UNIFORM, RAGGED, RAGGED, UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=5,
+           row_length=5,
+           original_dim_sizes=[2, [2, 1], [3, 2, 8], 3, 4, 1],
+           broadcast_dim_sizes=[2, [2, 1], [3, 2, 8], 3, 4, 5]),
+
+      #=========================================================================
+      # dimension[axis] is uniform inner; and row_lengths is a vector
+      #=========================================================================
+      # shape: [UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=1,
+           row_length=[2, 0, 1],
+           original_dim_sizes=[3, 1],
+           broadcast_dim_sizes=[3, [2, 0, 1]]),
+      # shape: [UNIFORM, BROADCAST(UNIFORM), UNIFORM]
+      dict(axis=1,
+           row_length=[2, 0, 1],
+           original_dim_sizes=[3, 1, 5],
+           broadcast_dim_sizes=[3, [2, 0, 1], 5]),
+
+      # shape: [UNIFORM, UNIFORM, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=[2, 0, 1, 3, 8, 2, 3, 4, 1, 8, 7, 0],
+           original_dim_sizes=[4, 3, 1],
+           broadcast_dim_sizes=[4, 3, [2, 0, 1, 3, 8, 2, 3, 4, 1, 8, 7, 0]]),
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM)]
+      dict(axis=2,
+           row_length=[2, 5, 3],
+           original_dim_sizes=[2, [2, 1], 1],
+           broadcast_dim_sizes=[2, [2, 1], [2, 5, 3]]),
+
+      # shape: [UNIFORM, RAGGED, UNIFORM, UNIFORM, BROADCAST(UNIFORM), UNIFORM]
+      dict(axis=4,
+           row_length=list(range(18)),
+           original_dim_sizes=[2, [2, 1], 3, 2, 1, 8],
+           broadcast_dim_sizes=[2, [2, 1], 3, 2, list(range(18)), 8]),
+
+      #=========================================================================
+      # dimension[axis] is uniform partitioned; and row_lengths is a scalar
+      #=========================================================================
+      # shape: [BROADCAST(UNIFORM), RAGGED]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, [5]],
+           broadcast_dim_sizes=[3, [5, 5, 5]]),
+
+      # shape: [BROADCAST(UNIFORM), UNIFORM, RAGGED]
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 3, [3, 0, 2]],
+           broadcast_dim_sizes=[2, 3, [3, 0, 2, 3, 0, 2]]),
+
+      # shape: [BROADCAST(UNIFORM), RAGGED, RAGGED, UNIFORM, UNIFORM]
+      dict(axis=0,
+           row_length=3,
+           original_dim_sizes=[1, [3], [3, 5, 2], 9, 4, 5],
+           broadcast_dim_sizes=[3, [3, 3, 3], [3, 5, 2, 3, 5, 2, 3, 5, 2],
+                                9, 4, 5]),
+
+      # shape: [BROADCAST(UNIFORM), UNIFORM, RAGGED, UNIFORM]
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 2, [2, 1], [3, 5, 2], 2],
+           broadcast_dim_sizes=[2, 2, [2, 1, 2, 1], [3, 5, 2, 3, 5, 2], 2]),
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, UNIFORM]
+      dict(axis=1,
+           row_length=2,
+           original_dim_sizes=[3, 1, [4, 0, 2], 5],
+           broadcast_dim_sizes=[3, 2, [4, 0, 2, 4, 0, 2], 5]),
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED]
+      dict(axis=1,
+           row_length=1,
+           original_dim_sizes=[2, 3, (1, 2, 3, 4, 5, 6)],
+           broadcast_dim_sizes=[2, 3, (1, 2, 3, 4, 5, 6)]),
+
+      #=========================================================================
+      # dimension[axis] is uniform partitioned; and row_lengths is a vector
+      #=========================================================================
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, UNIFORM]
+      dict(axis=1,
+           row_length=[4, 1, 2],
+           original_dim_sizes=[
+               3,                          # axis=0
+               1,                          # axis=1 (broadcast)
+               [3, 1, 2],                  # axis=2
+               5],                         # axis=3
+           broadcast_dim_sizes=[
+               3,                          # axis=0
+               [4, 1, 2],                  # axis=1 (broadcast)
+               [3, 3, 3, 3, 1, 2, 2],      # axis=2
+               5]),                        # axis=3
+
+      # shape: [UNIFORM, BROADCAST(UNIFORM), RAGGED, RAGGED]
+      dict(axis=1,
+           row_length=[2, 0, 3],
+           original_dim_sizes=[
+               3,                                         # axis=0
+               1,                                         # axis=1 (broadcast)
+               [3, 1, 2],                                 # axis=2
+               [3, 1, 4, 1, 5, 9]],                       # axis=3
+           broadcast_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 3],                                 # axis=1 (broadcast)
+               [3, 3, 2, 2, 2],                           # axis=2
+               [3, 1, 4, 3, 1, 4, 5, 9, 5, 9, 5, 9]]),    # axis=3
+
+      # shape: [UNIFORM, RAGGED, BROADCAST(UNIFORM), RAGGED, RAGGED, UNIFORM]
+      dict(axis=2,
+           row_length=[4, 1, 2],
+           original_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 1],                                 # axis=1
+               1,                                         # axis=2 (broadcast)
+               [3, 2, 1],                                 # axis=3
+               [1, 0, 1, 0, 2, 3],                        # axis=4
+               5],                                        # axis=5
+           broadcast_dim_sizes=[
+               3,                                         # axis=0
+               [2, 0, 1],                                 # axis=2
+               [4, 1, 2],                                 # axis=2 (broadcast)
+               [3, 3, 3, 3, 2, 1, 1],                     # axis=3
+               [1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,    # axis=4
+                2, 3, 3],
+               5]),                                       # axis=5
+
+      dict(axis=0,
+           row_length=2,
+           original_dim_sizes=[1, 1, 2, (2, 1)],
+           broadcast_dim_sizes=[2, 1, 2, (2, 1, 2, 1)]),
+      dict(axis=1,
+           row_length=(2, 1),
+           original_dim_sizes=[2, 1, 2, (2, 1, 2, 1)],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      dict(axis=2,
+           row_length=2,
+           original_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      dict(axis=3,
+           row_length=(2, 1, 2, 1, 2, 1),
+           original_dim_sizes=[2, (2, 1), 2, 1],
+           broadcast_dim_sizes=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+  ])  # pyformat: disable
+  def testBroadcastDimension(self, axis, row_length, original_dim_sizes,
+                             broadcast_dim_sizes):
+    """Tests for the broadcast_dimension method.
+
+    Verifies that:
+
+    * `original.broadcast_dimension(axis, row_length) == broadcast`
+    * `broadcast.broadcast_dimension(axis, row_length) == broadcast`
+    * `broadcast.broadcast_dimension(axis, 1) == broadcast`
+
+    Args:
+      axis: The axis to broadcast
+      row_length: The slice lengths to broadcast to.
+      original_dim_sizes: The dimension sizes before broadcasting.
+        original_dim_sizes[axis] should be equal to `1` or `row_length`.
+      broadcast_dim_sizes: THe dimension sizes after broadcasting.
+    """
+    original_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        original_dim_sizes)
+    broadcast_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(
+        broadcast_dim_sizes)
+    self.assertEqual(original_shape.rank, broadcast_shape.rank)
+    # shape[axis].value == 1 and row_length > 1:
+    bcast1 = original_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == shape[axis].value:
+    bcast2 = broadcast_shape.broadcast_dimension(axis, row_length)
+    # shape[axis].value > 1 and row_length == 1:
+    bcast3 = broadcast_shape.broadcast_dimension(axis, 1)
+
+    self.assertShapeEq(bcast1, broadcast_shape)
+    self.assertShapeEq(bcast2, broadcast_shape)
+    self.assertShapeEq(bcast3, broadcast_shape)
+
+  @parameterized.parameters(
+      [
+          # Broadcast scalar
+          dict(x_dims=[], y_dims=[], expected_dims=[]),
+          dict(x_dims=[], y_dims=[2], expected_dims=[2]),
+          dict(x_dims=[], y_dims=[2, 3], expected_dims=[2, 3]),
+          dict(
+              x_dims=[],
+              y_dims=[2, (2, 3), (5, 7, 2, 0, 9)],
+              expected_dims=[2, (2, 3), (5, 7, 2, 0, 9)]),
+          # Broadcast vector
+          dict(x_dims=[3], y_dims=[4, 2, 3], expected_dims=[4, 2, 3]),
+          dict(x_dims=[1], y_dims=[4, 2, 3], expected_dims=[4, 2, 3]),
+          dict(x_dims=[3], y_dims=[4, 2, 1], expected_dims=[4, 2, 3]),
+          dict(
+              x_dims=[3],
+              y_dims=[3, (2, 3, 1), 1],
+              expected_dims=[3, (2, 3, 1), 3]),
+          dict(x_dims=[1], y_dims=[3, (2, 1, 3)], expected_dims=[3, (2, 1, 3)]),
+          dict(
+              x_dims=[1],
+              y_dims=[3, (2, 1, 3), 8],
+              expected_dims=[3, (2, 1, 3), 8]),
+          dict(
+              x_dims=[1],
+              y_dims=[2, (2, 3), (5, 7, 2, 0, 9)],
+              expected_dims=[2, (2, 3), (5, 7, 2, 0, 9)]),
+          # Mixed broadcasting
+          dict(
+              x_dims=[
+                  1,  # axis=0
+                  3,  # axis=1
+                  (3, 0, 2),  # axis=2
+                  1,  # axis=3
+                  2,  # axis=4
+              ],
+              y_dims=[
+                  2,  # axis=0
+                  1,  # axis=1
+                  1,  # axis=2
+                  (7, 2),  # axis=3
+                  1,  # axis=4
+              ],
+              expected_dims=[
+                  2,  # axis=0
+                  3,  # axis=1
+                  (3, 0, 2, 3, 0, 2),  # axis=2
+                  (7, 7, 7, 7, 7, 2, 2, 2, 2, 2),  # axis=3
+                  2,  # axis=4
+              ]),
+          dict(
+              x_dims=[2, (2, 1), 2, 1],
+              y_dims=[1, 1, 2, (2, 1)],
+              expected_dims=[2, (2, 1), 2, (2, 1, 2, 1, 2, 1)]),
+      ])
+  def testBroadcastDynamicShape(self, x_dims, y_dims, expected_dims):
+    x_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(x_dims)
+    y_shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(y_dims)
+    expected = ragged.RaggedTensorDynamicShape.from_dim_sizes(expected_dims)
+    result1 = ragged.broadcast_dynamic_shape(x_shape, y_shape)
+    result2 = ragged.broadcast_dynamic_shape(y_shape, x_shape)
+    self.assertShapeEq(expected, result1)
+    self.assertShapeEq(expected, result2)
+
+  def testRepr(self):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes([2, (2, 1), 2, 1])
+    self.assertRegexpMatches(
+        repr(shape),
+        r'RaggedTensorDynamicShape\('
+        r'partitioned_dim_sizes=\(<[^>]+>, <[^>]+>\), '
+        r'inner_dim_sizes=<[^>]+>\)')
+
+  @parameterized.parameters([
+      dict(
+          x=[[10], [20], [30]],  # shape=[3, 1]
+          dim_sizes=[3, 2],
+          expected=[[10, 10], [20, 20], [30, 30]]),
+      dict(
+          x=[[10], [20], [30]],  # shape=[3, 1]
+          dim_sizes=[3, [3, 0, 2]],
+          expected=ragged.constant_value([[10, 10, 10], [], [30, 30]],
+                                         dtype=np.int32)),
+      dict(
+          x=[[[1, 2, 3]], [[4, 5, 6]]],  # shape = [2, 1, 3]
+          dim_sizes=[2, [2, 3], 3],
+          expected=ragged.constant_value(
+              [[[1, 2, 3], [1, 2, 3]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]],
+              dtype=np.int32,
+              ragged_rank=1)),
+      dict(
+          x=[[[1]], [[2]]],  # shape = [2, 1, 1]
+          dim_sizes=[2, [2, 3], [0, 2, 1, 2, 0]],
+          expected=ragged.constant_value([[[], [1, 1]], [[2], [2, 2], []]],
+                                         dtype=np.int32,
+                                         ragged_rank=2)),
+      dict(
+          x=10,
+          dim_sizes=[3, [3, 0, 2]],
+          expected=ragged.constant_value([[10, 10, 10], [], [10, 10]])),
+  ])
+  def testRaggedBroadcastTo(self, x, dim_sizes, expected):
+    shape = ragged.RaggedTensorDynamicShape.from_dim_sizes(dim_sizes)
+    result = ragged.broadcast_to(x, shape)
+    self.assertEqual(
+        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
+    self.assertRaggedEqual(result, expected)
+
+  @parameterized.parameters([
+      dict(
+          doc='x.shape=[3, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
+          y=[[10], [20], [30]],
+          expected=ragged.constant_value([[11, 12, 13], [], [34, 35]])),
+      dict(
+          doc='x.shape=[3, (D1)]; y.shape=[]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3], [], [4, 5]], dtype=np.int32),
+          y=10,
+          expected=ragged.constant_value([[11, 12, 13], [], [14, 15]])),
+      dict(
+          doc='x.shape=[1, (D1)]; y.shape=[3, 1]; bcast.shape=[3, (D1)]',
+          x=ragged.constant_value([[1, 2, 3]], dtype=np.int32),
+          y=[[10], [20], [30]],
+          expected=ragged.constant_value(
+              [[11, 12, 13], [21, 22, 23], [31, 32, 33]], dtype=np.int32)),
+      dict(
+          doc=('x.shape=[2, (D1), 1]; y.shape=[1, (D2)]; '
+               'bcast.shape=[2, (D1), (D2)]'),
+          x=ragged.constant_value([[[1], [2], [3]], [[4]]], ragged_rank=1),
+          y=ragged.constant_value([[10, 20, 30]]),
+          expected=ragged.constant_value([[[11, 21, 31], [12, 22, 32],
+                                           [13, 23, 33]], [[14, 24, 34]]])),
+      dict(
+          doc=('x.shape=[2, (D1), 1]; y.shape=[1, 1, 4]; '
+               'bcast.shape=[2, (D1), 4]'),
+          x=ragged.constant_value([[[10], [20]], [[30]]], ragged_rank=1),
+          y=[[[1, 2, 3, 4]]],
+          expected=ragged.constant_value(
+              [[[11, 12, 13, 14], [21, 22, 23, 24]], [[31, 32, 33, 34]]],
+              ragged_rank=1)),
+      dict(
+          doc=('x.shape=[2, (D1), 2, 1]; y.shape=[2, (D2)]; '
+               'bcast.shape=[2, (D1), (2), (D2)'),
+          x=ragged.constant_value([[[[1], [2]], [[3], [4]]],
+                                   [[[5], [6]]]],
+                                  ragged_rank=1),
+          y=ragged.constant_value([[10, 20], [30]]),
+          expected=ragged.constant_value(
+              [[[[11, 21], [32]], [[13, 23], [34]]],
+               [[[15, 25], [36]]]])),
+  ])
+  def testRaggedAddWithBroadcasting(self, x, y, expected, doc):
+    expected_rrank = getattr(expected, 'ragged_rank', 0)
+    x = ragged.convert_to_tensor_or_ragged_tensor(x, dtype=dtypes.int32)
+    y = ragged.convert_to_tensor_or_ragged_tensor(y, dtype=dtypes.int32)
+    result = x + y
+    result_rrank = getattr(result, 'ragged_rank', 0)
+    self.assertEqual(expected_rrank, result_rrank)
+    if hasattr(expected, 'tolist'):
+      expected = expected.tolist()
+    self.assertRaggedEqual(result, expected)
+
+
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_test.py b/tensorflow/python/ops/ragged/ragged_tensor_test.py
index 61bfcb68090557bb17cbd1312dd47a1c18500c08..b8f1d97137d22376a39d9fa0e098f8c364383b65 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_test.py
@@ -19,17 +19,20 @@ from __future__ import division
 from __future__ import print_function
 
 import re
-import sys
 
 from absl.testing import parameterized
 import numpy as np
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
+from tensorflow.python.ops.ragged import RaggedTensor
 from tensorflow.python.platform import googletest
 
 
@@ -103,11 +106,13 @@ EXAMPLE_RAGGED_TENSOR_4D = [
 EXAMPLE_RAGGED_TENSOR_4D_SPLITS1 = [0, 2, 2, 3, 4]
 EXAMPLE_RAGGED_TENSOR_4D_SPLITS2 = [0, 3, 6, 9, 10]
 EXAMPLE_RAGGED_TENSOR_4D_VALUES = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10],
-                                   [11, 12], [13, 14], [15, 16], [17,
-                                                                  18], [19, 20]]
+                                   [11, 12], [13, 14], [15, 16], [17, 18],
+                                   [19, 20]]
 
 
-class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
   longMessage = True  # Property in unittest.Testcase. pylint: disable=invalid-name
 
   #=============================================================================
@@ -116,54 +121,49 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testClassDocStringExamples(self):
     # From section: "Component Tensors"
-    rt = ragged.from_row_splits(
+    rt = RaggedTensor.from_row_splits(
         values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-    with self.test_session():
-      self.assertEqual(rt.tolist(),
-                       [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+    self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
     del rt
 
     # From section: "Alternative Row-Partitioning Schemes"
     values = [3, 1, 4, 1, 5, 9, 2, 6]
-    rt1 = ragged.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
-    rt2 = ragged.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
-    rt3 = ragged.from_value_rowids(
+    rt1 = RaggedTensor.from_row_splits(values, row_splits=[0, 4, 4, 7, 8, 8])
+    rt2 = RaggedTensor.from_row_lengths(values, row_lengths=[4, 0, 3, 1, 0])
+    rt3 = RaggedTensor.from_value_rowids(
         values, value_rowids=[0, 0, 0, 0, 2, 2, 2, 3], nrows=5)
-    rt4 = ragged.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
-    rt5 = ragged.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
+    rt4 = RaggedTensor.from_row_starts(values, row_starts=[0, 4, 4, 7, 8])
+    rt5 = RaggedTensor.from_row_limits(values, row_limits=[4, 4, 7, 8, 8])
     for rt in (rt1, rt2, rt3, rt4, rt5):
-      with self.test_session():
-        self.assertEqual(rt.tolist(),
-                         [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
+      self.assertRaggedEqual(rt, [[3, 1, 4, 1], [], [5, 9, 2], [6], []])
     del rt1, rt2, rt3, rt4, rt5
 
     # From section: "Multiple Ragged Dimensions"
-    inner_rt = ragged.from_row_splits(
+    inner_rt = RaggedTensor.from_row_splits(
         values=[3, 1, 4, 1, 5, 9, 2, 6], row_splits=[0, 4, 4, 7, 8, 8])
-    outer_rt = ragged.from_row_splits(values=inner_rt, row_splits=[0, 3, 3, 5])
+    outer_rt = RaggedTensor.from_row_splits(
+        values=inner_rt, row_splits=[0, 3, 3, 5])
     self.assertEqual(outer_rt.ragged_rank, 2)
-    with self.test_session():
-      self.assertEqual(outer_rt.tolist(),
-                       [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    self.assertEqual(
+        self.eval_to_list(outer_rt),
+        [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
     del inner_rt, outer_rt
 
     # From section: "Multiple Ragged Dimensions"
-    rt = ragged.from_nested_row_splits(
-        inner_values=[3, 1, 4, 1, 5, 9, 2, 6],
+    rt = RaggedTensor.from_nested_row_splits(
+        flat_values=[3, 1, 4, 1, 5, 9, 2, 6],
         nested_row_splits=([0, 3, 3, 5], [0, 4, 4, 7, 8, 8]))
-    with self.test_session():
-      self.assertEqual(rt.tolist(),
-                       [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
+    self.assertEqual(
+        self.eval_to_list(rt), [[[3, 1, 4, 1], [], [5, 9, 2]], [], [[6], []]])
     del rt
 
     # From section: "Uniform Inner Dimensions"
-    rt = ragged.from_row_splits(
+    rt = RaggedTensor.from_row_splits(
         values=array_ops.ones([5, 3]), row_splits=[0, 2, 5])
-    with self.test_session():
-      self.assertEqual(
-          rt.tolist(),
-          [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]])
-      self.assertEqual(rt.shape.as_list(), [2, None, 3])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[1, 1, 1], [1, 1, 1]], [[1, 1, 1], [1, 1, 1], [1, 1, 1]]])
+    self.assertEqual(rt.shape.as_list(), [2, None, 3])
     del rt
 
   #=============================================================================
@@ -183,7 +183,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(splits, rt_value.row_splits)
     self.assertAllEqual(values, rt_value.values)
     self.assertAllEqual(splits, rt_value.nested_row_splits[0])
-    self.assertAllEqual(values, rt_value.inner_values)
+    self.assertAllEqual(values, rt_value.flat_values)
 
     # Test construction of a RaggedTensorValue with ragged_rank=2.
     rt_value = ragged.RaggedTensorValue(
@@ -196,7 +196,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     self.assertAllEqual(splits2, rt_value.nested_row_splits[0])
     self.assertAllEqual(splits, rt_value.nested_row_splits[1])
     self.assertAllEqual(values, rt_value.values.values)
-    self.assertAllEqual(values, rt_value.inner_values)
+    self.assertAllEqual(values, rt_value.flat_values)
 
   #=============================================================================
   # RaggedTensor Constructor (private)
@@ -205,12 +205,11 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def testRaggedTensorConstruction(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
-    rt = ragged.RaggedTensor(
-        values=values, row_splits=row_splits, internal=True)
+    rt = RaggedTensor(values=values, row_splits=row_splits, internal=True)
 
-    with self.test_session():
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testRaggedTensorConstructionErrors(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -218,28 +217,30 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     with self.assertRaisesRegexp(ValueError,
                                  'RaggedTensor constructor is private'):
-      ragged.RaggedTensor(values=values, row_splits=row_splits)
+      RaggedTensor(values=values, row_splits=row_splits)
 
     with self.assertRaisesRegexp(TypeError,
                                  'values must be a Tensor or RaggedTensor'):
-      ragged.RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
+      RaggedTensor(values=range(7), row_splits=row_splits, internal=True)
 
     with self.assertRaisesRegexp(TypeError,
                                  'Row-partitioning argument must be a Tensor'):
-      ragged.RaggedTensor(
-          values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
+      RaggedTensor(values=values, row_splits=[0, 2, 2, 5, 6, 7], internal=True)
 
     with self.assertRaisesRegexp(ValueError,
                                  r'Shape \(6, 1\) must have rank 1'):
-      ragged.RaggedTensor(
+      RaggedTensor(
           values=values,
           row_splits=array_ops.expand_dims(row_splits, 1),
           internal=True)
 
     with self.assertRaisesRegexp(TypeError,
                                  'Cached value must be a Tensor or None.'):
-      ragged.RaggedTensor(values=values, row_splits=row_splits,
-                          cached_row_lengths=[2, 3, 4], internal=True)
+      RaggedTensor(
+          values=values,
+          row_splits=row_splits,
+          cached_row_lengths=[2, 3, 4],
+          internal=True)
 
 
 #=============================================================================
@@ -251,22 +252,22 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
-    with self.test_session():
-      self.assertAllEqual(rt_value_rowids, value_rowids)
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromValueRowIdsWithDerivedNRowsDynamic(self):
     # nrows is not known at graph creation time.
@@ -274,165 +275,166 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     value_rowids = array_ops.placeholder_with_default(value_rowids, shape=None)
 
-    rt = ragged.from_value_rowids(values, value_rowids)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
-    self.assertEqual(rt.shape.as_list(), [None, None])
+    if context.executing_eagerly():
+      self.assertEqual(rt.shape.as_list(), [5, None])
+    else:
+      self.assertEqual(rt.shape.as_list(), [None, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
-    with self.test_session():
-      self.assertAllEqual(rt_value_rowids, value_rowids)
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromValueRowIdsWithExplicitNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(7, dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [7, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
     self.assertIs(rt_nrows, nrows)  # cached_nrows
-    with self.test_session():
-      self.assertEqual(
-          rt.tolist(),
-          [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g'], [], []])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g'], [], []])
 
   def testFromValueRowIdsWithExplicitNRowsEqualToDefault(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     nrows = constant_op.constant(5, dtypes.int64)
 
-    rt = ragged.from_value_rowids(values, value_rowids, nrows)
+    rt = RaggedTensor.from_value_rowids(values, value_rowids, nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_value_rowids, value_rowids)  # cached_value_rowids
     self.assertIs(rt_nrows, nrows)  # cached_nrows
-    with self.test_session():
-      self.assertAllEqual(rt_value_rowids, value_rowids)
-      self.assertAllEqual(rt_nrows, nrows)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertAllEqual(rt_value_rowids, value_rowids)
+    self.assertAllEqual(rt_nrows, nrows)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromValueRowIdsWithEmptyValues(self):
-    rt = ragged.from_value_rowids([], [])
-    rt_nrows = ragged.nrows(rt)
+    rt = RaggedTensor.from_value_rowids([], [])
+    rt_nrows = rt.nrows()
     self.assertEqual(rt.dtype, dtypes.float32)
     self.assertEqual(rt.shape.as_list(), [0, None])
     self.assertEqual(rt.ragged_rank, 1)
     self.assertEqual(rt.values.shape.as_list(), [0])
-    self.assertEqual(ragged.value_rowids(rt).shape.as_list(), [0])
-    with self.test_session():
-      self.assertEqual(rt_nrows.eval().tolist(), 0)
-      self.assertEqual(rt.tolist(), [])
+    self.assertEqual(rt.value_rowids().shape.as_list(), [0])
+    self.assertEqual(self.eval_to_list(rt_nrows), 0)
+    self.assertEqual(self.eval_to_list(rt), [])
 
   def testFromRowSplits(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = ragged.from_row_splits(values, row_splits)
+    rt = RaggedTensor.from_row_splits(values, row_splits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
     rt_row_splits = rt.row_splits
-    rt_nrows = ragged.nrows(rt)
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_row_splits, row_splits)
-    with self.test_session():
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromRowSplitsWithEmptySplits(self):
     err_msg = 'row_splits tensor may not be empty'
     with self.assertRaisesRegexp(ValueError, err_msg):
-      ragged.from_row_splits([], [])
+      RaggedTensor.from_row_splits([], [])
 
   def testFromRowStarts(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_starts = constant_op.constant([0, 2, 2, 5, 6], dtypes.int64)
 
-    rt = ragged.from_row_starts(values, row_starts)
+    rt = RaggedTensor.from_row_starts(values, row_starts)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_starts = ragged.row_starts(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_starts = rt.row_starts()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
-    with self.test_session():
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertAllEqual(rt_row_starts, row_starts)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_starts, row_starts)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromRowLimits(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_limits = constant_op.constant([2, 2, 5, 6, 7], dtypes.int64)
 
-    rt = ragged.from_row_limits(values, row_limits)
+    rt = RaggedTensor.from_row_limits(values, row_limits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_limits = ragged.row_limits(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_limits = rt.row_limits()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
-    with self.test_session():
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertAllEqual(rt_row_limits, row_limits)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_limits, row_limits)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromRowLengths(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_lengths = constant_op.constant([2, 0, 3, 1, 1], dtypes.int64)
 
-    rt = ragged.from_row_lengths(values, row_lengths)
+    rt = RaggedTensor.from_row_lengths(values, row_lengths)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [5, None])
     self.assertEqual(rt.ragged_rank, 1)
 
     rt_values = rt.values
-    rt_row_lengths = ragged.row_lengths(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_row_lengths = rt.row_lengths()
+    rt_nrows = rt.nrows()
 
     self.assertIs(rt_values, values)
     self.assertIs(rt_row_lengths, row_lengths)  # cached_nrows
-    with self.test_session():
-      self.assertEqual(rt_nrows.eval(), 5)
-      self.assertAllEqual(rt_row_lengths, row_lengths)
-      self.assertEqual(rt.tolist(),
-                       [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+    self.assertEqual(self.eval_to_list(rt_nrows), 5)
+    self.assertAllEqual(rt_row_lengths, row_lengths)
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
 
   def testFromNestedValueRowIdsWithDerivedNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -441,23 +443,22 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     ]
 
-    rt = ragged.from_nested_value_rowids(values, nested_value_rowids)
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [4, None, None])
     self.assertEqual(rt.ragged_rank, 2)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
+    rt_value_rowids = rt.value_rowids()
     rt_values_values = rt_values.values
-    rt_values_value_rowids = ragged.value_rowids(rt_values)
+    rt_values_value_rowids = rt_values.value_rowids()
 
     self.assertIs(rt_values_values, values)
-    with self.test_session():
-      self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
-      self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
-      self.assertEqual(
-          rt.tolist(),
-          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+    self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+    self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
 
   def testFromNestedValueRowIdsWithExplicitNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -470,27 +471,27 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant(6, dtypes.int64)
     ]
 
-    rt = ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+    rt = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids,
+                                               nrows)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [6, None, None])
     self.assertEqual(rt.ragged_rank, 2)
 
     rt_values = rt.values
-    rt_value_rowids = ragged.value_rowids(rt)
-    rt_nrows = ragged.nrows(rt)
+    rt_value_rowids = rt.value_rowids()
+    rt_nrows = rt.nrows()
     rt_values_values = rt_values.values
-    rt_values_value_rowids = ragged.value_rowids(rt_values)
-    rt_values_nrows = ragged.nrows(rt_values)
+    rt_values_value_rowids = rt_values.value_rowids()
+    rt_values_nrows = rt_values.nrows()
 
     self.assertIs(rt_values_values, values)
-    with self.test_session():
-      self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
-      self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
-      self.assertAllEqual(rt_nrows, nrows[0])
-      self.assertAllEqual(rt_values_nrows, nrows[1])
-      self.assertEqual(rt.tolist(),
-                       [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [],
-                        [[b'f'], [b'g'], []], [], []])
+    self.assertAllEqual(rt_value_rowids, nested_value_rowids[0])
+    self.assertAllEqual(rt_values_value_rowids, nested_value_rowids[1])
+    self.assertAllEqual(rt_nrows, nrows[0])
+    self.assertAllEqual(rt_values_nrows, nrows[1])
+    self.assertEqual(
+        self.eval_to_list(rt), [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [],
+                                [[b'f'], [b'g'], []], [], []])
 
   def testFromNestedValueRowIdsWithExplicitNRowsMismatch(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -502,27 +503,26 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, 'nested_nrows must have the same '
         'length as nested_value_rowids'):
-      ragged.from_nested_value_rowids(values, nested_value_rowids, nrows)
+      RaggedTensor.from_nested_value_rowids(values, nested_value_rowids, nrows)
 
   def testFromNestedValueRowIdsWithNonListInput(self):
     with self.assertRaisesRegexp(
         TypeError, 'nested_value_rowids must be a list of Tensors'):
-      ragged.from_nested_value_rowids([1, 2, 3],
-                                      constant_op.constant(
-                                          [[0, 1, 2], [0, 1, 2]], dtypes.int64))
+      RaggedTensor.from_nested_value_rowids(
+          [1, 2, 3], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
     with self.assertRaisesRegexp(TypeError,
                                  'nested_nrows must be a list of Tensors'):
-      ragged.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
-                                      constant_op.constant([3, 3]))
+      RaggedTensor.from_nested_value_rowids([1, 2, 3], [[0, 1, 2], [0, 1, 2]],
+                                            constant_op.constant([3, 3]))
 
   def testFromNestedRowSplits(self):
-    inner_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
+    flat_values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     nested_row_splits = [
         constant_op.constant([0, 2, 3, 3, 5], dtypes.int64),
         constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     ]
 
-    rt = ragged.from_nested_row_splits(inner_values, nested_row_splits)
+    rt = RaggedTensor.from_nested_row_splits(flat_values, nested_row_splits)
     self.assertEqual(rt.dtype, dtypes.string)
     self.assertEqual(rt.shape.as_list(), [4, None, None])
     self.assertEqual(rt.ragged_rank, 2)
@@ -532,20 +532,18 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     rt_values_values = rt_values.values
     rt_values_row_splits = rt_values.row_splits
 
-    self.assertIs(rt_values_values, inner_values)
+    self.assertIs(rt_values_values, flat_values)
     self.assertIs(rt_row_splits, nested_row_splits[0])
     self.assertIs(rt_values_row_splits, nested_row_splits[1])
-    with self.test_session():
-      self.assertEqual(
-          rt.tolist(),
-          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
 
   def testFromNestedRowSplitsWithNonListInput(self):
     with self.assertRaisesRegexp(TypeError,
                                  'nested_row_splits must be a list of Tensors'):
-      ragged.from_nested_row_splits([1, 2],
-                                    constant_op.constant([[0, 1, 2], [0, 1, 2]],
-                                                         dtypes.int64))
+      RaggedTensor.from_nested_row_splits(
+          [1, 2], constant_op.constant([[0, 1, 2], [0, 1, 2]], dtypes.int64))
 
   def testFromValueRowIdsWithBadNRows(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -553,7 +551,7 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     nrows = constant_op.constant(5, dtypes.int64)
 
     with self.assertRaisesRegexp(ValueError, r'Expected nrows >= 0; got -2'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=array_ops.placeholder_with_default(value_rowids, None),
           nrows=-2)
@@ -561,35 +559,37 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     with self.assertRaisesRegexp(
         ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=2, '
         r'value_rowids\[-1\]=4'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values, value_rowids=value_rowids, nrows=2)
 
     with self.assertRaisesRegexp(
         ValueError, r'Expected nrows >= value_rowids\[-1\] \+ 1; got nrows=4, '
         r'value_rowids\[-1\]=4'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values, value_rowids=value_rowids, nrows=4)
 
     with self.assertRaisesRegexp(ValueError,
                                  r'Shape \(7, 1\) must have rank 1'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=array_ops.expand_dims(value_rowids, 1),
           nrows=nrows)
 
     with self.assertRaisesRegexp(ValueError, r'Shape \(1,\) must have rank 0'):
-      ragged.from_value_rowids(
+      RaggedTensor.from_value_rowids(
           values=values,
           value_rowids=value_rowids,
           nrows=array_ops.expand_dims(nrows, 0))
 
   def testGraphMismatch(self):
-    with ops.Graph().as_default():
-      values = constant_op.constant([1, 2, 3])
-    with ops.Graph().as_default():
-      splits = constant_op.constant([0, 2, 3])
-    self.assertRaisesRegexp(ValueError, '.* must be from the same graph as .*',
-                            ragged.from_row_splits, values, splits)
+    if not context.executing_eagerly():
+      with ops.Graph().as_default():
+        values = constant_op.constant([1, 2, 3], dtypes.int64)
+      with ops.Graph().as_default():
+        splits = constant_op.constant([0, 2, 3], dtypes.int64)
+      self.assertRaisesRegexp(ValueError,
+                              '.* must be from the same graph as .*',
+                              RaggedTensor.from_row_splits, values, splits)
 
   #=============================================================================
   # Ragged Value & Row-Partitioning Tensor Accessors
@@ -599,58 +599,53 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
-    rt1 = ragged.from_row_splits(values, row_splits)
-    rt2 = ragged.from_value_rowids(values, value_rowids)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
 
     for rt in [rt1, rt2]:
-      with self.test_session():
-        self.assertEqual(rt.tolist(),
-                         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
-        self.assertEqual(rt.values.eval().tolist(),
-                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
-        self.assertEqual(rt.values.shape.dims[0].value, 7)
-        self.assertEqual(
-            ragged.value_rowids(rt).eval().tolist(), [0, 0, 2, 2, 2, 3, 4])
-        self.assertEqual(ragged.nrows(rt).eval().tolist(), 5)
-        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 2, 5, 6, 7])
-        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 2, 5, 6])
-        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 2, 5, 6, 7])
-        self.assertEqual(
-            ragged.row_lengths(rt).eval().tolist(), [2, 0, 3, 1, 1])
-        self.assertEqual(rt.inner_values.eval().tolist(),
-                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
-        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
-                         [[0, 2, 2, 5, 6, 7]])
+      self.assertRaggedEqual(
+          rt, [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+      self.assertAllEqual(rt.values, [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertEqual(rt.values.shape.dims[0].value, 7)
+      self.assertAllEqual(rt.value_rowids(), [0, 0, 2, 2, 2, 3, 4])
+      self.assertAllEqual(rt.nrows(), 5)
+      self.assertAllEqual(rt.row_splits, [0, 2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_starts(), [0, 2, 2, 5, 6])
+      self.assertAllEqual(rt.row_limits(), [2, 2, 5, 6, 7])
+      self.assertAllEqual(rt.row_lengths(), [2, 0, 3, 1, 1])
+      self.assertAllEqual(rt.flat_values,
+                          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertLen(rt.nested_row_splits, 1)
+      self.assertAllEqual(rt.nested_row_splits[0], [0, 2, 2, 5, 6, 7])
 
   def testRaggedTensorAccessors_3d_with_ragged_rank_1(self):
     values = [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]]
     row_splits = constant_op.constant([0, 2, 2, 5, 6, 7], dtypes.int64)
     value_rowids = constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
-    rt1 = ragged.from_row_splits(values, row_splits)
-    rt2 = ragged.from_value_rowids(values, value_rowids)
+    rt1 = RaggedTensor.from_row_splits(values, row_splits)
+    rt2 = RaggedTensor.from_value_rowids(values, value_rowids)
 
     for rt in [rt1, rt2]:
-      with self.test_session():
-        self.assertEqual(rt.tolist(),
-                         [[[0, 1], [2, 3]], [], [[4, 5], [6, 7], [8, 9]],
-                          [[10, 11]], [[12, 13]]])
-        self.assertEqual(
-            rt.values.eval().tolist(),
-            [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
-        self.assertEqual(rt.values.shape.dims[0].value, 7)
-        self.assertEqual(
-            ragged.value_rowids(rt).eval().tolist(), [0, 0, 2, 2, 2, 3, 4])
-        self.assertEqual(ragged.nrows(rt).eval().tolist(), 5)
-        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 2, 5, 6, 7])
-        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 2, 5, 6])
-        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 2, 5, 6, 7])
-        self.assertEqual(
-            ragged.row_lengths(rt).eval().tolist(), [2, 0, 3, 1, 1])
-        self.assertEqual(
-            rt.inner_values.eval().tolist(),
-            [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
-        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
-                         [[0, 2, 2, 5, 6, 7]])
+      self.assertEqual(
+          self.eval_to_list(rt),
+          [[[0, 1], [2, 3]], [], [[4, 5], [6, 7], [8, 9]], [[10, 11]],
+           [[12, 13]]])
+      self.assertEqual(
+          self.eval_to_list(rt.values),
+          [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+      self.assertEqual(rt.values.shape.dims[0].value, 7)
+      self.assertEqual(
+          self.eval_to_list(rt.value_rowids()), [0, 0, 2, 2, 2, 3, 4])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 5)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 2, 5, 6])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 2, 5, 6, 7])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 0, 3, 1, 1])
+      self.assertEqual(
+          self.eval_to_list(rt.flat_values),
+          [[0, 1], [2, 3], [4, 5], [6, 7], [8, 9], [10, 11], [12, 13]])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 2, 5, 6, 7]])
 
   def testRaggedTensorAccessors_3d_with_ragged_rank_2(self):
     values = constant_op.constant(['a', 'b', 'c', 'd', 'e', 'f', 'g'])
@@ -662,40 +657,28 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         constant_op.constant([0, 0, 1, 3, 3], dtypes.int64),
         constant_op.constant([0, 0, 2, 2, 2, 3, 4], dtypes.int64)
     ]
-    rt1 = ragged.from_nested_row_splits(values, nested_row_splits)
-    rt2 = ragged.from_nested_value_rowids(values, nested_value_rowids)
+    rt1 = RaggedTensor.from_nested_row_splits(values, nested_row_splits)
+    rt2 = RaggedTensor.from_nested_value_rowids(values, nested_value_rowids)
 
     for rt in [rt1, rt2]:
-      with self.test_session():
-        self.assertEqual(
-            rt.tolist(),
-            [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
-        self.assertEqual(rt.values.eval().tolist(),
-                         [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
-        self.assertEqual(rt.values.shape.dims[0].value, 5)
-        self.assertEqual(
-            ragged.value_rowids(rt).eval().tolist(), [0, 0, 1, 3, 3])
-        self.assertEqual(ragged.nrows(rt).eval().tolist(), 4)
-        self.assertEqual(rt.row_splits.eval().tolist(), [0, 2, 3, 3, 5])
-        self.assertEqual(ragged.row_starts(rt).eval().tolist(), [0, 2, 3, 3])
-        self.assertEqual(ragged.row_limits(rt).eval().tolist(), [2, 3, 3, 5])
-        self.assertEqual(ragged.row_lengths(rt).eval().tolist(), [2, 1, 0, 2])
-        self.assertEqual(rt.inner_values.eval().tolist(),
-                         [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
-        self.assertEqual([s.eval().tolist() for s in rt.nested_row_splits],
-                         [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
-
-  def testNRowsWithTensorInput(self):
-    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    nrows = ragged.nrows(dt)
-    with self.test_session():
-      self.assertEqual(nrows.eval(), 2)
-
-  def testRowLengthsWithTensorInput(self):
-    dt = constant_op.constant([[1, 2, 3], [4, 5, 6]])
-    row_lengths = ragged.row_lengths(dt)
-    with self.test_session():
-      self.assertEqual(row_lengths.eval().tolist(), [3, 3])
+      self.assertEqual(
+          self.eval_to_list(rt),
+          [[[b'a', b'b'], []], [[b'c', b'd', b'e']], [], [[b'f'], [b'g']]])
+      self.assertEqual(
+          self.eval_to_list(rt.values),
+          [[b'a', b'b'], [], [b'c', b'd', b'e'], [b'f'], [b'g']])
+      self.assertEqual(rt.values.shape.dims[0].value, 5)
+      self.assertEqual(self.eval_to_list(rt.value_rowids()), [0, 0, 1, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.nrows()), 4)
+      self.assertEqual(self.eval_to_list(rt.row_splits), [0, 2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_starts()), [0, 2, 3, 3])
+      self.assertEqual(self.eval_to_list(rt.row_limits()), [2, 3, 3, 5])
+      self.assertEqual(self.eval_to_list(rt.row_lengths()), [2, 1, 0, 2])
+      self.assertEqual(
+          self.eval_to_list(rt.flat_values),
+          [b'a', b'b', b'c', b'd', b'e', b'f', b'g'])
+      self.assertEqual([self.eval_to_list(s) for s in rt.nested_row_splits],
+                       [[0, 2, 3, 3, 5], [0, 2, 2, 5, 6, 7]])
 
   #=============================================================================
   # RaggedTensor.shape
@@ -703,28 +686,30 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testShape(self):
     """Tests for RaggedTensor.shape."""
-    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
+    rt1 = RaggedTensor.from_row_splits(b'a b c d e f g'.split(),
+                                       [0, 2, 5, 6, 6, 7])
     self.assertEqual(rt1.shape.as_list(), [5, None])
 
-    rt2 = ragged.from_row_splits(
+    rt2 = RaggedTensor.from_row_splits(
         [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12], [13, 14]],
         [0, 2, 5, 6, 6, 7])
     self.assertEqual(rt2.shape.as_list(), [5, None, 2])
 
-    rt3 = ragged.from_row_splits(
+    rt3 = RaggedTensor.from_row_splits(
         [[[1, 2], [3, 4]], [[5, 6], [7, 8]], [[9, 10], [11, 12]]], [0, 2, 2, 3])
     self.assertEqual(rt3.shape.as_list(), [3, None, 2, 2])
 
-    rt4 = ragged.from_row_splits(rt3, [0, 1, 3, 3])
+    rt4 = RaggedTensor.from_row_splits(rt3, [0, 1, 3, 3])
     self.assertEqual(rt4.shape.as_list(), [3, None, None, 2, 2])
 
-    rt5 = ragged.from_row_splits(
-        array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
-    self.assertEqual(rt5.shape.ndims, None)
+    if not context.executing_eagerly():
+      rt5 = RaggedTensor.from_row_splits(
+          array_ops.placeholder(dtype=dtypes.string), [0, 2, 3, 5])
+      self.assertEqual(rt5.shape.ndims, None)
 
-    rt6 = ragged.from_row_splits([1, 2, 3],
-                                 array_ops.placeholder(dtype=dtypes.int64))
-    self.assertEqual(rt6.shape.as_list(), [None, None])
+      rt6 = RaggedTensor.from_row_splits(
+          [1, 2, 3], array_ops.placeholder(dtype=dtypes.int64))
+      self.assertEqual(rt6.shape.as_list(), [None, None])
 
   #=============================================================================
   # RaggedTensor.__getitem__
@@ -748,29 +733,21 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       expected: The expected value of rt.__getitem__(slice_spec), as a python
         list; or an exception class.
     """
-    with self.test_session():
-      tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
-      tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False)
-      value1 = rt.__getitem__(slice_spec).eval()
-      value2 = rt.__getitem__(tensor_slice_spec1).eval()
-      value3 = rt.__getitem__(tensor_slice_spec2).eval()
-      if hasattr(value1, 'tolist'):
-        value1 = value1.tolist()
-      if hasattr(value2, 'tolist'):
-        value2 = value2.tolist()
-      if hasattr(value3, 'tolist'):
-        value3 = value3.tolist()
-      self.assertEqual(value1, expected, 'slice_spec=%s' % (slice_spec,))
-      self.assertEqual(value2, expected, 'slice_spec=%s' % (slice_spec,))
-      self.assertEqual(value3, expected, 'slice_spec=%s' % (slice_spec,))
+    tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+    tensor_slice_spec2 = _make_tensor_slice_spec(slice_spec, False)
+    value1 = self.eval_to_list(rt.__getitem__(slice_spec))
+    value2 = self.eval_to_list(rt.__getitem__(tensor_slice_spec1))
+    value3 = self.eval_to_list(rt.__getitem__(tensor_slice_spec2))
+    self.assertEqual(value1, expected, 'slice_spec=%s' % (slice_spec,))
+    self.assertEqual(value2, expected, 'slice_spec=%s' % (slice_spec,))
+    self.assertEqual(value3, expected, 'slice_spec=%s' % (slice_spec,))
 
   def _TestGetItemException(self, rt, slice_spec, expected, message):
     """Helper function for testing RaggedTensor.__getitem__ exceptions."""
-    with self.test_session():
-      tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
-      self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec)
-      self.assertRaisesRegexp(expected, message, rt.__getitem__,
-                              tensor_slice_spec1)
+    tensor_slice_spec1 = _make_tensor_slice_spec(slice_spec, True)
+    self.assertRaisesRegexp(expected, message, rt.__getitem__, slice_spec)
+    self.assertRaisesRegexp(expected, message, rt.__getitem__,
+                            tensor_slice_spec1)
 
   @parameterized.parameters(
       # Tests for rt[i]
@@ -839,20 +816,23 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def testRaggedTensorGetItemWithRaggedRank1(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
     # Ragged tensor
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
-                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
 
-    with self.test_session():
-      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItem(rt, slice_spec, expected)
 
   # pylint: disable=invalid-slice-index
   @parameterized.parameters(
       # Tests for out-of-bound errors
-      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[-6], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 2], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[3, 0], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-6],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 2],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[3, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
 
       # Indexing into an inner ragged dimension
       (SLICE_BUILDER[:, 3], ValueError,
@@ -864,8 +844,8 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
       # Tests for type errors
       (SLICE_BUILDER[0.5], TypeError, re.escape(array_ops._SLICE_TYPE_ERROR)),
-      (SLICE_BUILDER[1:3:0.5], TypeError,
-       re.escape(array_ops._SLICE_TYPE_ERROR)),
+      (SLICE_BUILDER[1:3:0.5], TypeError, re.escape(
+          array_ops._SLICE_TYPE_ERROR)),
       (SLICE_BUILDER[:, 1:3:0.5], TypeError,
        'slice strides must be integers or None'),
       (SLICE_BUILDER[:, 0.5:1.5], TypeError,
@@ -882,13 +862,10 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
     # Ragged tensor
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
-                                EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
-    # if sys.version_info[0] == 3:
-    #   message = 'must be str, not int'
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES,
+                                      EXAMPLE_RAGGED_TENSOR_2D_SPLITS)
 
-    with self.test_session():
-      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -959,11 +936,10 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   )
   def testRaggedTensorGetItemWithRaggedRank2(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_nested_row_splits(
+    rt = RaggedTensor.from_nested_row_splits(
         EXAMPLE_RAGGED_TENSOR_4D_VALUES,
         [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
-    with self.test_session():
-      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
@@ -974,19 +950,22 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
        'Cannot index into an inner ragged dimension.'),
 
       # Test for out-of-bounds errors.
-      (SLICE_BUILDER[1, 0], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 0, 3], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[5], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[0, 5], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[1, 0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 0, 3],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[0, 5],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithRaggedRank2(self, slice_spec, expected,
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_nested_row_splits(
+    rt = RaggedTensor.from_nested_row_splits(
         EXAMPLE_RAGGED_TENSOR_4D_VALUES,
         [EXAMPLE_RAGGED_TENSOR_4D_SPLITS1, EXAMPLE_RAGGED_TENSOR_4D_SPLITS2])
-    with self.test_session():
-      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_4D)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_4D)
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -996,17 +975,19 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   )
   def testRaggedTensorGetItemWithEmptyTensor(self, slice_spec, expected):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_row_splits([], [0])
+    rt = RaggedTensor.from_row_splits([], [0])
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
-      (SLICE_BUILDER[0], ValueError, '.*out of bounds.*'),
-      (SLICE_BUILDER[-1], ValueError, '.*out of bounds.*'),
+      (SLICE_BUILDER[0],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
+      (SLICE_BUILDER[-1],
+       (ValueError, errors.InvalidArgumentError), '.*out of bounds.*'),
   )
   def testRaggedTensorGetItemErrorsWithEmptyTensor(self, slice_spec, expected,
                                                    message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    rt = ragged.from_row_splits([], [0])
+    rt = RaggedTensor.from_row_splits([], [0])
     self._TestGetItemException(rt, slice_spec, expected, message)
 
   @parameterized.parameters(
@@ -1025,9 +1006,8 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     splits = constant_op.constant(
         EXAMPLE_RAGGED_TENSOR_2D_SPLITS, dtype=dtypes.int64)
     splits = array_ops.placeholder_with_default(splits, None)
-    rt = ragged.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
-    with self.test_session():
-      self.assertEqual(rt.tolist(), EXAMPLE_RAGGED_TENSOR_2D)
+    rt = RaggedTensor.from_row_splits(EXAMPLE_RAGGED_TENSOR_2D_VALUES, splits)
+    self.assertEqual(self.eval_to_list(rt), EXAMPLE_RAGGED_TENSOR_2D)
     self._TestGetItem(rt, slice_spec, expected)
 
   @parameterized.parameters(
@@ -1036,95 +1016,98 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def testRaggedTensorGetItemErrorsWithPlaceholderShapes(
       self, slice_spec, expected, message):
     """Test that rt.__getitem__(slice_spec) == expected."""
-    # Intentionally use an unknown shape for `values`.
-    values = array_ops.placeholder_with_default([0], None)
-    rt = ragged.from_row_splits(values, [0, 1])
-    self._TestGetItemException(rt, slice_spec, expected, message)
+    if not context.executing_eagerly():
+      # Intentionally use an unknown shape for `values`.
+      values = array_ops.placeholder_with_default([0], None)
+      rt = RaggedTensor.from_row_splits(values, [0, 1])
+      self._TestGetItemException(rt, slice_spec, expected, message)
 
-  # TODO(edloper): Remove this decorator once c shapes become the default.
-  @test_util.enable_c_shapes
   def testGetItemNewAxis(self):
     # rt: [[[['a', 'b'], ['c', 'd']], [], [['e', 'f']]], []]
     splits1 = [0, 3, 3]
     splits2 = [0, 2, 2, 3]
     values = constant_op.constant([['a', 'b'], ['c', 'd'], ['e', 'f']])
-    rt = ragged.from_nested_row_splits(values, [splits1, splits2])
-    with self.test_session():
-      rt_newaxis0 = rt[array_ops.newaxis]
-      rt_newaxis1 = rt[:, array_ops.newaxis]
-      rt_newaxis2 = rt[:, :, array_ops.newaxis]
-      rt_newaxis3 = rt[:, :, :, array_ops.newaxis]
-      rt_newaxis4 = rt[:, :, :, :, array_ops.newaxis]
-
-      self.assertEqual(rt.tolist(),
-                       [[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []])
-      self.assertEqual(
-          rt_newaxis0.tolist(),
-          [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []]])
-      self.assertEqual(
-          rt_newaxis1.tolist(),
-          [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]]], [[]]])
-      self.assertEqual(
-          rt_newaxis2.tolist(),
-          [[[[[b'a', b'b'], [b'c', b'd']]], [[]], [[[b'e', b'f']]]], []])
-      self.assertEqual(
-          rt_newaxis3.tolist(),
-          [[[[[b'a', b'b']], [[b'c', b'd']]], [], [[[b'e', b'f']]]], []])
-      self.assertEqual(
-          rt_newaxis4.tolist(),
-          [[[[[b'a'], [b'b']], [[b'c'], [b'd']]], [], [[[b'e'], [b'f']]]], []])
-
-      self.assertEqual(rt.ragged_rank, 2)
-      self.assertEqual(rt_newaxis0.ragged_rank, 3)
-      self.assertEqual(rt_newaxis1.ragged_rank, 3)
-      self.assertEqual(rt_newaxis2.ragged_rank, 3)
-      self.assertEqual(rt_newaxis3.ragged_rank, 2)
-      self.assertEqual(rt_newaxis4.ragged_rank, 2)
-
-      self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2])
-      self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2])
-      self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2])
-      self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2])
-      self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1])
+    rt = RaggedTensor.from_nested_row_splits(values, [splits1, splits2])
+    rt_newaxis0 = rt[array_ops.newaxis]
+    rt_newaxis1 = rt[:, array_ops.newaxis]
+    rt_newaxis2 = rt[:, :, array_ops.newaxis]
+    rt_newaxis3 = rt[:, :, :, array_ops.newaxis]
+    rt_newaxis4 = rt[:, :, :, :, array_ops.newaxis]
+
+    self.assertEqual(
+        self.eval_to_list(rt),
+        [[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis0),
+        [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]], []]])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis1),
+        [[[[[b'a', b'b'], [b'c', b'd']], [], [[b'e', b'f']]]], [[]]])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis2),
+        [[[[[b'a', b'b'], [b'c', b'd']]], [[]], [[[b'e', b'f']]]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis3),
+        [[[[[b'a', b'b']], [[b'c', b'd']]], [], [[[b'e', b'f']]]], []])
+    self.assertEqual(
+        self.eval_to_list(rt_newaxis4),
+        [[[[[b'a'], [b'b']], [[b'c'], [b'd']]], [], [[[b'e'], [b'f']]]], []])
+
+    self.assertEqual(rt.ragged_rank, 2)
+    self.assertEqual(rt_newaxis0.ragged_rank, 3)
+    self.assertEqual(rt_newaxis1.ragged_rank, 3)
+    self.assertEqual(rt_newaxis2.ragged_rank, 3)
+    self.assertEqual(rt_newaxis3.ragged_rank, 2)
+    self.assertEqual(rt_newaxis4.ragged_rank, 2)
+
+    self.assertEqual(rt_newaxis0.shape.as_list(), [1, None, None, None, 2])
+    self.assertEqual(rt_newaxis1.shape.as_list(), [2, None, None, None, 2])
+    self.assertEqual(rt_newaxis2.shape.as_list(), [2, None, None, None, 2])
+    self.assertEqual(rt_newaxis3.shape.as_list(), [2, None, None, 1, 2])
+    self.assertEqual(rt_newaxis4.shape.as_list(), [2, None, None, 2, 1])
 
   #=============================================================================
   # RaggedTensor.__str__
   #=============================================================================
   def testRaggedTensorStr(self):
-    rt1 = ragged.from_row_splits(b'a b c d e f g'.split(), [0, 2, 5, 6, 6, 7])
-    expected1 = ('RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
-                 'shape=(7,), dtype=string), row_splits='
-                 'Tensor("RaggedFromRowSplits/row_splits:0", '
-                 'shape=(6,), dtype=int64))')
-    self.assertEqual(str(rt1), expected1)
-    self.assertEqual(repr(rt1), expected1)
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
+    rt = RaggedTensor.from_row_splits(values, row_splits)
+    if context.executing_eagerly():
+      expected_str = '<tf.RaggedTensor {}>'.format([[b'a', b'b'],
+                                                    [b'c', b'd', b'e'], [b'f'],
+                                                    [], [b'g']])
+      expected_repr = (
+          'tf.RaggedTensor(values=tf.Tensor([{}], shape=(7,), dtype=string), '
+          'row_splits=tf.Tensor([{}], shape=(6,), dtype=int64))'.format(
+              ' '.join(repr(x) for x in values), ' '.join(
+                  repr(x) for x in row_splits)))
+      self.assertEqual(str(rt), expected_str)
+      self.assertEqual(repr(rt), expected_repr)
+    else:
+      expected_repr = (
+          'tf.RaggedTensor(values=Tensor("RaggedFromRowSplits/values:0", '
+          'shape=(7,), dtype=string), row_splits='
+          'Tensor("RaggedFromRowSplits/row_splits:0", '
+          'shape=(6,), dtype=int64))')
+      self.assertEqual(repr(rt), expected_repr)
+      self.assertEqual(str(rt), expected_repr)
 
   def testRaggedTensorValueStr(self):
+    values = [b'a', b'b', b'c', b'd', b'e', b'f', b'g']
+    row_splits = [0, 2, 5, 6, 6, 7]
     rt = ragged.RaggedTensorValue(
-        values=np.array(b'a b c d e f g'.split()),
-        row_splits=np.array([0, 2, 5, 6, 6, 7], dtype=np.int64))
-    if sys.version_info[0] == 2:
-      self.assertEqual(' '.join(str(rt).split()),
-                       (r"<RaggedTensorValue [['a', 'b'], ['c', 'd', 'e'], "
-                        "['f'], [], ['g']]>"))
-      self.assertEqual(
-          ' '.join(repr(rt).split()),
-          (r"RaggedTensorValue(values=array(['a', 'b', 'c', 'd', "
-           "'e', 'f', 'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
-           ' 6, 6, 7]))'))
-    else:
-      self.assertEqual(
-          ' '.join(str(rt).split()),
-          (r"<RaggedTensorValue [[b'a', b'b'], [b'c', b'd', b'e'], "
-           "[b'f'], [], [b'g']]>"))
-      self.assertEqual(
-          ' '.join(repr(rt).split()),
-          (r"RaggedTensorValue(values=array([b'a', b'b', b'c', b'd', "
-           "b'e', b'f', b'g'], dtype='|S1'), row_splits=array([0, 2, 5,"
-           ' 6, 6, 7]))'))
+        np.array(values), np.array(row_splits, dtype=np.int64))
+    expected_str = '<tf.RaggedTensorValue {}>'.format([[b'a', b'b'],
+                                                       [b'c', b'd', b'e'],
+                                                       [b'f'], [], [b'g']])
+    expected_repr = ("tf.RaggedTensorValue(values=array({}, dtype='|S1'), "
+                     'row_splits=array({}))'.format(values, row_splits))
+    self.assertEqual(' '.join(str(rt).split()), expected_str)
+    self.assertEqual(' '.join(repr(rt).split()), expected_repr)
 
   #=============================================================================
-  # RaggedTensor.with_values() and RaggedTensor.with_inner_values().
+  # RaggedTensor.with_values() and RaggedTensor.with_flat_values().
   #=============================================================================
 
   def testWithValues(self):
@@ -1132,55 +1115,72 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
     rt2 = ragged.constant([[[1, 2], [3, 4, 5]], [[6]], [], [[], [7]]])
 
     rt1_plus_10 = rt1.with_values(rt1.values + 10)
-    rt2_times_10 = rt2.with_inner_values(rt2.inner_values * 10)
+    rt2_times_10 = rt2.with_flat_values(rt2.flat_values * 10)
     rt1_expanded = rt1.with_values(array_ops.expand_dims(rt1.values, axis=1))
 
-    with self.test_session():
-      self.assertEqual(rt1_plus_10.tolist(),
-                       [[11, 12], [13, 14, 15], [16], [], [17]])
-      self.assertEqual(rt2_times_10.tolist(),
-                       [[[10, 20], [30, 40, 50]], [[60]], [], [[], [70]]])
-      self.assertEqual(rt1_expanded.tolist(),
-                       [[[1], [2]], [[3], [4], [5]], [[6]], [], [[7]]])
+    self.assertEqual(
+        self.eval_to_list(rt1_plus_10),
+        [[11, 12], [13, 14, 15], [16], [], [17]])
+    self.assertEqual(
+        self.eval_to_list(rt2_times_10),
+        [[[10, 20], [30, 40, 50]], [[60]], [], [[], [70]]])
+    self.assertEqual(
+        self.eval_to_list(rt1_expanded),
+        [[[1], [2]], [[3], [4], [5]], [[6]], [], [[7]]])
 
   #=============================================================================
   # Session.run
   #=============================================================================
   def testSessionRun(self):
+    if context.executing_eagerly():
+      return
+
     rt1 = ragged.constant([[1, 2, 3], [4]])
     rt2 = ragged.constant([[[], [1, 2]], [[3]]])
     with self.test_session() as session:
       result = session.run({'rt1': rt1, 'rt2': rt2})
       self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
-      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
-      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
   def testSessionRunFeed(self):
-    rt1 = ragged.from_row_splits(
+    if context.executing_eagerly():
+      return
+
+    rt1 = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32),
         array_ops.placeholder(dtypes.int64))
-    rt2 = ragged.from_nested_row_splits(
-        array_ops.placeholder(dtypes.int32),
-        [array_ops.placeholder(dtypes.int64),
-         array_ops.placeholder(dtypes.int64)])
+    rt2 = RaggedTensor.from_nested_row_splits(
+        array_ops.placeholder(dtypes.int32), [
+            array_ops.placeholder(dtypes.int64),
+            array_ops.placeholder(dtypes.int64)
+        ])
 
     rt1_feed_val = ragged.constant_value([[1, 2, 3], [4]])
     rt2_feed_val = ragged.constant_value([[[], [1, 2]], [[3]]])
 
     with self.test_session() as session:
-      result = session.run({'rt1': rt1, 'rt2': rt2},
-                           feed_dict={rt1: rt1_feed_val,
-                                      rt2: rt2_feed_val})
+      result = session.run({
+          'rt1': rt1,
+          'rt2': rt2
+      },
+                           feed_dict={
+                               rt1: rt1_feed_val,
+                               rt2: rt2_feed_val
+                           })
       self.assertCountEqual(sorted(result.keys()), ['rt1', 'rt2'])
-      self.assertEqual(result['rt1'].tolist(), [[1, 2, 3], [4]])
-      self.assertEqual(result['rt2'].tolist(), [[[], [1, 2]], [[3]]])
+      self.assertEqual(result['rt1'].to_list(), [[1, 2, 3], [4]])
+      self.assertEqual(result['rt2'].to_list(), [[[], [1, 2]], [[3]]])
 
   def testSessionPartialRunFeed(self):
+    if context.executing_eagerly():
+      return
+
     # Placeholder inputs.
-    a = ragged.from_row_splits(
+    a = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32, shape=[None], name='a.values'),
         array_ops.placeholder(dtypes.int64, name='a.row_splits'))
-    b = ragged.from_row_splits(
+    b = RaggedTensor.from_row_splits(
         array_ops.placeholder(dtypes.int32, shape=[None], name='b.values'),
         array_ops.placeholder(dtypes.int64, name='b.row_splits'))
     c = array_ops.placeholder(dtypes.int32, shape=[], name='c')
@@ -1198,11 +1198,10 @@ class RaggedTensorTest(test_util.TensorFlowTestCase, parameterized.TestCase):
       handle = session.partial_run_setup([r1, r2], [a, b, c])
 
       res1 = session.partial_run(handle, r1, feed_dict={a: a_val, b: b_val})
-      self.assertEqual(res1.tolist(), [22, 8])
+      self.assertAllEqual(res1, [22, 8])
 
       res2 = session.partial_run(handle, r2, feed_dict={c: c_val})
-      self.assertEqual(res2.tolist(), [15, 7])
-
+      self.assertAllEqual(res2, [15, 7])
 
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/ragged/ragged_tensor_value.py b/tensorflow/python/ops/ragged/ragged_tensor_value.py
index 39d3249c991674a090d2dab4da8fb385b7463f13..e94ca4afac63f3d1dafb148266683042c987934f 100644
--- a/tensorflow/python/ops/ragged/ragged_tensor_value.py
+++ b/tensorflow/python/ops/ragged/ragged_tensor_value.py
@@ -53,7 +53,7 @@ class RaggedTensorValue(object):
       doc="""The numpy dtype of values in this tensor.""")
 
   @property
-  def inner_values(self):
+  def flat_values(self):
     """The innermost `values` array for this ragged tensor value."""
     rt_values = self.values
     while isinstance(rt_values, RaggedTensorValue):
@@ -82,16 +82,26 @@ class RaggedTensorValue(object):
     return (self._row_splits.shape[0] - 1,) + (None,) + self._values.shape[1:]
 
   def __str__(self):
-    return "<RaggedTensorValue %s>" % self.tolist()
+    return "<tf.RaggedTensorValue %s>" % self.to_list()
 
   def __repr__(self):
-    return "RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
-                                                            self._row_splits)
+    return "tf.RaggedTensorValue(values=%r, row_splits=%r)" % (self._values,
+                                                               self._row_splits)
 
-  def tolist(self):
+  def to_list(self):
     """Returns this ragged tensor value as a nested Python list."""
-    values_as_list = self._values.tolist()
+    if isinstance(self._values, RaggedTensorValue):
+      values_as_list = self._values.to_list()
+    else:
+      values_as_list = self._values.tolist()
     return [
         values_as_list[self._row_splits[i]:self._row_splits[i + 1]]
         for i in range(len(self._row_splits) - 1)
     ]
+
+  def value_rowids(self, name=None):
+    del name
+    row_lengths = self._row_splits[1:] - self._row_splits[:-1]
+    nrows = self._row_splits.shape[-1] - 1
+    indices = np.arange(nrows)
+    return np.repeat(indices, repeats=row_lengths, axis=0)
diff --git a/tensorflow/python/ops/ragged/ragged_test_util.py b/tensorflow/python/ops/ragged/ragged_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..027417664d23683e0eb3906892b81c29c8847f6a
--- /dev/null
+++ b/tensorflow/python/ops/ragged/ragged_test_util.py
@@ -0,0 +1,95 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+# pylint: disable=invalid-name
+"""Test utils for tensorflow RaggedTensors."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import ragged
+
+
+class RaggedTensorTestCase(test_util.TensorFlowTestCase):
+  """Base class for RaggedTensor test cases."""
+
+  def _GetPyList(self, a):
+    """Converts a to a nested python list."""
+    if isinstance(a, ragged.RaggedTensor):
+      return self.evaluate(a).to_list()
+    elif isinstance(a, ops.Tensor):
+      a = self.evaluate(a)
+      return a.tolist() if isinstance(a, np.ndarray) else a
+    elif isinstance(a, np.ndarray):
+      return a.tolist()
+    elif isinstance(a, ragged.RaggedTensorValue):
+      return a.to_list()
+    else:
+      return np.array(a).tolist()
+
+  def assertRaggedEqual(self, a, b):
+    """Asserts that two potentially ragged tensors are equal."""
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertEqual(a_list, b_list)
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertRaggedAlmostEqual(self, a, b, places=7):
+    a_list = self._GetPyList(a)
+    b_list = self._GetPyList(b)
+    self.assertNestedListAlmostEqual(a_list, b_list, places, context='value')
+
+    if not (isinstance(a, (list, tuple)) or isinstance(b, (list, tuple))):
+      a_ragged_rank = a.ragged_rank if ragged.is_ragged(a) else 0
+      b_ragged_rank = b.ragged_rank if ragged.is_ragged(b) else 0
+      self.assertEqual(a_ragged_rank, b_ragged_rank)
+
+  def assertNestedListAlmostEqual(self, a, b, places=7, context='value'):
+    self.assertEqual(type(a), type(b))
+    if isinstance(a, (list, tuple)):
+      self.assertLen(a, len(b), 'Length differs for %s' % context)
+      for i in range(len(a)):
+        self.assertNestedListAlmostEqual(a[i], b[i], places,
+                                         '%s[%s]' % (context, i))
+    else:
+      self.assertAlmostEqual(
+          a, b, places,
+          '%s != %s within %s places at %s' % (a, b, places, context))
+
+  def eval_to_list(self, tensor):
+    value = self.evaluate(tensor)
+    if ragged.is_ragged(value):
+      return value.to_list()
+    elif isinstance(value, np.ndarray):
+      return value.tolist()
+    else:
+      return value
+
+  def _eval_tensor(self, tensor):
+    if ragged.is_ragged(tensor):
+      return ragged.RaggedTensorValue(
+          self._eval_tensor(tensor.values),
+          self._eval_tensor(tensor.row_splits))
+    else:
+      return test_util.TensorFlowTestCase._eval_tensor(self, tensor)
diff --git a/tensorflow/python/ops/ragged/ragged_tile_op_test.py b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
index bf62d96e7a9d2371ebb808548dfbb5b73677caa6..d3445571bff6c75e7a22e458bdf99d3886cd9614 100644
--- a/tensorflow/python/ops/ragged/ragged_tile_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_tile_op_test.py
@@ -24,12 +24,14 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
-from tensorflow.python.ops.ragged import ragged_array_ops
-from tensorflow.python.ops.ragged import ragged_factory_ops
+from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTileOpTest(ragged_test_util.RaggedTensorTestCase,
+                       parameterized.TestCase):
 
   @parameterized.parameters([
       #=========================================================================
@@ -170,6 +172,15 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
           rt_input=[[[[1], [2]], [[3]]], [[]], [[[4, 5]]]],
           multiples=[1, 1, 1, 0],
           expected=[[[[], []], [[]]], [[]], [[[]]]]),
+      #=========================================================================
+      # multiple=1
+      #=========================================================================
+      dict(
+          descr='rank=4, multiples=1 (no repeats)',
+          rt_input=[[[[1], [2]], [[3], [4]]], [[[5], [6]]]],
+          multiples=[1, 1, 1, 1],
+          expected=[[[[1], [2]], [[3], [4]]],
+                    [[[5], [6]]]]),
 
   ])  # pyformat: disable
   def testRaggedTile(self,
@@ -178,7 +189,7 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
                      multiples,
                      expected,
                      ragged_rank=None):
-    rt = ragged_factory_ops.constant(rt_input, ragged_rank)
+    rt = ragged.constant(rt_input, ragged_rank)
 
     expected_shape = [
         None if dim is None else dim * multiple
@@ -192,23 +203,21 @@ class RaggedTileOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
         const_multiples, shape=[len(multiples)])
 
     for multiples_tensor in (const_multiples, non_const_multiples):
-      tiled = ragged_array_ops.tile(rt, multiples_tensor)
+      tiled = ragged.tile(rt, multiples_tensor)
       self.assertEqual(tiled.ragged_rank, rt.ragged_rank)
       self.assertEqual(tiled.shape.ndims, rt.shape.ndims)
       if multiples_tensor is const_multiples:
         self.assertEqual(tiled.shape.as_list(), expected_shape)
-      with self.test_session():
-        self.assertEqual(tiled.eval().tolist(), expected)
+      self.assertRaggedEqual(tiled, expected)
 
   def testRaggedTileWithTensorInput(self):
     # When the input is a `Tensor`, ragged_tile just delegates to tf.tile.
     dt = constant_op.constant([[1, 2], [3, 4]])
-    tiled = ragged_array_ops.tile(dt, [3, 2])
+    tiled = ragged.tile(dt, [3, 2])
     expected = [[1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4],
                 [1, 2, 1, 2], [3, 4, 3, 4]]  # pyformat: disable
-    with self.test_session():
-      self.assertEqual(tiled.eval().tolist(), expected)
+    self.assertRaggedEqual(tiled, expected)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
index 2fd31837c62de43a1ecb1162f2c1818094d34633..46d7a56a7c8e0fa7a008625314e30786ffbbfefe 100644
--- a/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_sparse_op_test.py
@@ -18,118 +18,116 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gradients_impl
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToSparseOpTest(ragged_test_util.RaggedTensorTestCase):
 
   def testDocStringExample(self):
     rt = ragged.constant([[1, 2, 3], [4], [], [5, 6]])
-    st = ragged.to_sparse(rt)
-    expected = ('SparseTensorValue(indices='
-                'array([[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]]), '
-                'values=array([1, 2, 3, 4, 5, 6], dtype=int32), '
-                'dense_shape=array([4, 3]))')
-    with self.test_session():
-      self.assertEqual(' '.join(repr(st.eval()).split()), expected)
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]])
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6])
+    self.assertAllEqual(st.dense_shape, [4, 3])
 
   def test2DRaggedTensorWithOneRaggedDimension(self):
     rt = ragged.constant([['a', 'b'], ['c', 'd', 'e'], ['f'], [], ['g']])
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
-      self.assertAllEqual(st.values, b'a b c d e f g'.split())
-      self.assertAllEqual(st.dense_shape, [5, 3])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices, [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 0], [4, 0]])
+    self.assertAllEqual(st.values, b'a b c d e f g'.split())
+    self.assertAllEqual(st.dense_shape, [5, 3])
 
   def test3DRaggedTensorWithOneRaggedDimension(self):
     rt = ragged.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]],
                           [[11, 12]], [], [[13, 14]]],
                          ragged_rank=1)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices, [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
-                       [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
-                       [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
-      self.assertAllEqual(st.values,
-                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
-      self.assertAllEqual(st.dense_shape, [5, 3, 2])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.indices,
+                        [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1], [1, 0, 0],
+                         [1, 0, 1], [1, 1, 0], [1, 1, 1], [1, 2, 0], [1, 2, 1],
+                         [2, 0, 0], [2, 0, 1], [4, 0, 0], [4, 0, 1]])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [5, 3, 2])
 
   def test4DRaggedTensorWithOneRaggedDimension(self):
     rt = ragged.constant(
         [[[[1, 2], [3, 4]], [[5, 6], [7, 8]]], [], [[[9, 10], [11, 12]]]],
         ragged_rank=1)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
-      self.assertAllEqual(
-          st.indices,
-          [
-              [0, 0, 0, 0],  # index for value=1
-              [0, 0, 0, 1],  # index for value=2
-              [0, 0, 1, 0],  # index for value=3
-              [0, 0, 1, 1],  # index for value=4
-              [0, 1, 0, 0],  # index for value=5
-              [0, 1, 0, 1],  # index for value=6
-              [0, 1, 1, 0],  # index for value=7
-              [0, 1, 1, 1],  # index for value=8
-              [2, 0, 0, 0],  # index for value=9
-              [2, 0, 0, 1],  # index for value=10
-              [2, 0, 1, 0],  # index for value=11
-              [2, 0, 1, 1],  # index for value=12
-          ])
-      self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(st.values, [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12])
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [2, 0, 0, 0],  # index for value=9
+            [2, 0, 0, 1],  # index for value=10
+            [2, 0, 1, 0],  # index for value=11
+            [2, 0, 1, 1],  # index for value=12
+        ])
+    self.assertAllEqual(st.dense_shape, [3, 2, 2, 2])
 
   def test4DRaggedTensorWithTwoRaggedDimensions(self):
     rt = ragged.constant([[[[1, 2], [3, 4]], [[5, 6], [7, 8], [9, 10]]],
                           [[[11, 12]], [], [[13, 14]]], []],
                          ragged_rank=2)
-    with self.test_session():
-      st = ragged.to_sparse(rt).eval()
-      self.assertAllEqual(
-          st.indices,
-          [
-              [0, 0, 0, 0],  # index for value=1
-              [0, 0, 0, 1],  # index for value=2
-              [0, 0, 1, 0],  # index for value=3
-              [0, 0, 1, 1],  # index for value=4
-              [0, 1, 0, 0],  # index for value=5
-              [0, 1, 0, 1],  # index for value=6
-              [0, 1, 1, 0],  # index for value=7
-              [0, 1, 1, 1],  # index for value=8
-              [0, 1, 2, 0],  # index for value=9
-              [0, 1, 2, 1],  # index for value=10
-              [1, 0, 0, 0],  # index for value=11
-              [1, 0, 0, 1],  # index for value=12
-              [1, 2, 0, 0],  # index for value=13
-              [1, 2, 0, 1],  # index for value=14
-          ])
-      self.assertAllEqual(st.values,
-                          [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
-      self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
+    st = self.evaluate(rt.to_sparse())
+    self.assertAllEqual(
+        st.indices,
+        [
+            [0, 0, 0, 0],  # index for value=1
+            [0, 0, 0, 1],  # index for value=2
+            [0, 0, 1, 0],  # index for value=3
+            [0, 0, 1, 1],  # index for value=4
+            [0, 1, 0, 0],  # index for value=5
+            [0, 1, 0, 1],  # index for value=6
+            [0, 1, 1, 0],  # index for value=7
+            [0, 1, 1, 1],  # index for value=8
+            [0, 1, 2, 0],  # index for value=9
+            [0, 1, 2, 1],  # index for value=10
+            [1, 0, 0, 0],  # index for value=11
+            [1, 0, 0, 1],  # index for value=12
+            [1, 2, 0, 0],  # index for value=13
+            [1, 2, 0, 1],  # index for value=14
+        ])
+    self.assertAllEqual(st.values,
+                        [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14])
+    self.assertAllEqual(st.dense_shape, [3, 3, 3, 2])
 
   def testShape(self):
     rt = ragged.constant([[1, 2], [3, 4, 5], [6], [], [7]])
-    st = ragged.to_sparse(rt)
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 2])
     self.assertEqual(st.values.shape.as_list(), [7])
     self.assertEqual(st.dense_shape.shape.as_list(), [2])
 
     rt = ragged.constant([[[1, 2]], [], [[3, 4]], []], ragged_rank=1)
-    st = ragged.to_sparse(rt)
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [4, 3])
     self.assertEqual(st.values.shape.as_list(), [4])
     self.assertEqual(st.dense_shape.shape.as_list(), [3])
 
     rt = ragged.constant([[[1], [2, 3, 4, 5, 6, 7]], [[]]])
-    st = ragged.to_sparse(rt)
+    st = rt.to_sparse()
     self.assertEqual(st.indices.shape.as_list(), [7, 3])
     self.assertEqual(st.values.shape.as_list(), [7])
     self.assertEqual(st.dense_shape.shape.as_list(), [3])
@@ -140,53 +138,56 @@ class RaggedTensorToSparseOpTest(test_util.TensorFlowTestCase):
     empty_vector = array_ops.placeholder_with_default(
         array_ops.zeros([0], dtypes.int64), shape=None)
 
-    bad_rt1 = ragged.from_row_splits(row_splits=[2, 3], values=[1, 2, 3])
-    with self.test_session():
-      bad_split0_error = r'First value of ragged splits must be 0.*'
-      self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0_error,
-                              ragged.to_sparse(bad_rt1).eval)
+    bad_rt1 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[2, 3], values=[1, 2, 3])
+    bad_split0 = r'First value of ragged splits must be 0.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError, bad_split0):
+      self.evaluate(bad_rt1.to_sparse())
 
-    bad_rt2 = ragged.from_row_splits(row_splits=[0, 5], values=empty_vector)
-    bad_rt3 = ragged.from_row_splits(
+    bad_rt2 = ragged.RaggedTensor.from_row_splits(
+        row_splits=[0, 5], values=empty_vector)
+    bad_rt3 = ragged.RaggedTensor.from_row_splits(
         row_splits=[0, 1],
-        values=ragged.from_row_splits(row_splits=[0, 5], values=empty_vector))
-    with self.test_session():
-      split_mismatch1_error = r'Final value of ragged splits must match.*'
-      for rt in [bad_rt2, bad_rt3]:
-        self.assertRaisesRegexp(errors.InvalidArgumentError,
-                                split_mismatch1_error,
-                                ragged.to_sparse(rt).eval)
-
-    bad_rt4 = ragged.from_row_splits(
+        values=ragged.RaggedTensor.from_row_splits(
+            row_splits=[0, 5], values=empty_vector))
+    split_mismatch1_error = r'Final value of ragged splits must match.*'
+    for rt in [bad_rt2, bad_rt3]:
+      with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                   split_mismatch1_error):
+        self.evaluate(rt.to_sparse())
+
+    bad_rt4 = ragged.RaggedTensor.from_row_splits(
         row_splits=[0, 5],
-        values=ragged.from_row_splits(row_splits=[0], values=empty_vector))
-    with self.test_session():
-      split_mismatch2_error = r'Final value of ragged splits must match.*'
-      self.assertRaisesRegexp(errors.InvalidArgumentError,
-                              split_mismatch2_error,
-                              ragged.to_sparse(bad_rt4).eval)
-
-    bad_rt5 = ragged.from_row_splits(row_splits=empty_vector, values=[])
-    with self.test_session():
-      empty_splits_error = (r'ragged splits may not be empty.*')
-      self.assertRaisesRegexp(errors.InvalidArgumentError, empty_splits_error,
-                              ragged.to_sparse(bad_rt5).eval)
+        values=ragged.RaggedTensor.from_row_splits(
+            row_splits=[0], values=empty_vector))
+    split_mismatch2_error = r'Final value of ragged splits must match.*'
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 split_mismatch2_error):
+      self.evaluate(bad_rt4.to_sparse())
+
+    bad_rt5 = ragged.RaggedTensor.from_row_splits(
+        row_splits=empty_vector, values=[])
+    empty_splits_error = (r'ragged splits may not be empty.*')
+    with self.assertRaisesRegexp(errors.InvalidArgumentError,
+                                 empty_splits_error):
+      self.evaluate(bad_rt5.to_sparse())
 
   def testGradient(self):
+    if context.executing_eagerly():
+      return
     # rt1.shape == rt2.shape == [2, (D2), (D3), 2].
     rt1 = ragged.constant([[[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0]]]],
                           ragged_rank=2)
     rt2 = ragged.constant([[[[9.0, 8.0], [7.0, 6.0]], [[5.0, 4.0]]]],
                           ragged_rank=2)
-    rt = rt1 + rt2 * 2.0
-    st = ragged.to_sparse(rt)
+    rt = ragged.map_flat_values(math_ops.add, rt1, rt2 * 2.0)
+    st = rt.to_sparse()
 
-    g1, g2 = gradients_impl.gradients(st.values, [rt1.inner_values,
-                                                  rt2.inner_values])
+    g1, g2 = gradients_impl.gradients(st.values,
+                                      [rt1.flat_values, rt2.flat_values])
     print(g1, g2)
-    with self.test_session():
-      self.assertEqual(g1.eval().tolist(), [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
-      self.assertEqual(g2.eval().tolist(), [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
+    self.assertRaggedEqual(g1, [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]])
+    self.assertRaggedEqual(g2, [[2.0, 2.0], [2.0, 2.0], [2.0, 2.0]])
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
index 0ccc214a9c76fbf1ef9be244bcb2ecc9ab3f0a39..ffcc2be52e5538c6d99ee8bcb0ed5d368ac5ed42 100644
--- a/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_to_tensor_op_test.py
@@ -24,22 +24,19 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedTensorToTensorOpTest(ragged_test_util.RaggedTensorTestCase,
                                  parameterized.TestCase):
 
   def testDocStringExamples(self):
     """Example from ragged_to_tensor.__doc__."""
     rt = ragged.constant([[9, 8, 7], [], [6, 5], [4]])
-    dt = ragged.to_tensor(rt)
-    with self.test_session():
-      self.assertEqual(str(dt.eval()),
-                       '[[9 8 7]\n'
-                       ' [0 0 0]\n'
-                       ' [6 5 0]\n'
-                       ' [4 0 0]]')  # pyformat: disable
+    dt = rt.to_tensor()
+    self.assertAllEqual(dt, [[9, 8, 7], [0, 0, 0], [6, 5, 0], [4, 0, 0]])
 
   @parameterized.parameters(
       {
@@ -71,8 +68,30 @@ class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
               [[1, 2], [0, 0], [3, 4]],  #
               [[0, 0], [0, 0], [0, 0]],  #
               [[5, 0], [0, 0], [0, 0]],  #
-              [[6, 7], [8, 0], [0, 0]]
-          ]  #
+              [[6, 7], [8, 0], [0, 0]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1, 2], [], [3, 4]], [], [[5]], [[6, 7], [8]]],
+          'default':
+              9,
+          'expected': [
+              [[1, 2], [9, 9], [3, 4]],  #
+              [[9, 9], [9, 9], [9, 9]],  #
+              [[5, 9], [9, 9], [9, 9]],  #
+              [[6, 7], [8, 9], [9, 9]],  #
+          ]
+      },
+      {
+          'rt_input': [[[1], [2], [3]]],
+          'ragged_rank': 1,
+          'default': 0,
+          'expected': [[[1], [2], [3]]],
+      },
+      {
+          'rt_input': [[[[1], [2]], [], [[3]]]],
+          'default': 9,
+          'expected': [[[[1], [2]], [[9], [9]], [[3], [9]]]],
       },
   )
   def testRaggedTensorToTensor(self,
@@ -82,41 +101,37 @@ class RaggedTensorToTensorOpTest(test_util.TensorFlowTestCase,
                                default=None,
                                expected_shape=None):
     rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
-    dt = ragged.to_tensor(rt, default)
-    self.assertEqual(type(dt), ops.Tensor)
+    dt = rt.to_tensor(default)
+    self.assertIsInstance(dt, ops.Tensor)
     self.assertEqual(rt.dtype, dt.dtype)
     self.assertTrue(dt.shape.is_compatible_with(rt.shape))
-    with self.test_session():
-      self.assertEqual(dt.eval().tolist(), expected)
-      if expected_shape is not None:
-        dt_shape = array_ops.shape(dt)
-        self.assertEqual(dt_shape.eval().tolist(), expected_shape)
+    self.assertAllEqual(self.eval_to_list(dt), expected)
+    if expected_shape is not None:
+      dt_shape = array_ops.shape(dt)
+      self.assertAllEqual(dt_shape, expected_shape)
 
   @parameterized.parameters(
       {
           'rt_input': [[1, 2, 3]],
           'default': [0],
-          'error': (ValueError, r'Shapes \(1,\) and \(\) are incompatible'),
-      },
-      {
-          'rt_input': [[[1], [2], [3]]],
-          'default': 0,
-          'error': (ValueError, r'Shapes \(\) and \(1,\) are incompatible'),
+          'error': (ValueError, r'Shape \(1,\) must have rank at most 0'),
       },
       {
-          'rt_input': [[[[1], [2]], [], [[3]]]],
-          'default': 0,
-          'error': (ValueError, r'Shapes \(\) and \(1,\) are incompatible'),
+          'rt_input': [[[1, 2], [3, 4]], [[5, 6]]],
+          'ragged_rank': 1,
+          'default': [7, 8, 9],
+          'error': (ValueError, r'Shapes \(3,\) and \(2,\) are incompatible'),
       },
       {
           'rt_input': [[1, 2, 3]],
           'default': 'a',
-          'error': (TypeError, "Expected int32, got 'a' of type 'str' instead"),
+          'error': (TypeError, '.*'),
       },
   )
-  def testError(self, rt_input, default, error):
-    rt = ragged.constant(rt_input)
-    self.assertRaisesRegexp(error[0], error[1], ragged.to_tensor, rt, default)
+  def testError(self, rt_input, default, error, ragged_rank=None):
+    rt = ragged.constant(rt_input, ragged_rank=ragged_rank)
+    with self.assertRaisesRegexp(error[0], error[1]):
+      rt.to_tensor(default)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/ops/ragged/ragged_util.py b/tensorflow/python/ops/ragged/ragged_util.py
index 03f050de514e7f13de34ba7df23629d56b8ec453..a832f937d16a876a5c7c88866249101785122fb1 100644
--- a/tensorflow/python/ops/ragged/ragged_util.py
+++ b/tensorflow/python/ops/ragged/ragged_util.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import check_ops
+from tensorflow.python.ops import gen_ragged_math_ops
 from tensorflow.python.ops import math_ops
 
 
@@ -229,3 +230,51 @@ def _with_nonzero_rank(data):
     return array_ops.reshape(
         data,
         array_ops.concat([[1], data_shape], axis=0)[-data_ndims:])
+
+
+def lengths_to_splits(lengths):
+  """Returns splits corresponding to the given lengths."""
+  return array_ops.concat([[0], math_ops.cumsum(lengths)], axis=-1)
+
+
+def repeat_ranges(params, splits, repeats):
+  """Repeats each range of `params` (as specified by `splits`) `repeats` times.
+
+  Let the `i`th range of `params` be defined as
+  `params[splits[i]:splits[i + 1]]`.  Then this function returns a tensor
+  containing range 0 repeated `repeats[0]` times, followed by range 1 repeated
+  `repeats[1]`, ..., followed by the last range repeated `repeats[-1]` times.
+
+  Args:
+    params: The `Tensor` whose values should be repeated.
+    splits: A splits tensor indicating the ranges of `params` that should be
+      repeated.
+    repeats: The number of times each range should be repeated.  Supports
+      broadcasting from a scalar value.
+
+  Returns:
+    A `Tensor` with the same rank and type as `params`.
+
+  #### Example:
+    ```python
+    >>> repeat_ranges(['a', 'b', 'c'], [0, 2, 3], 3)
+    ['a', 'b', 'a', 'b', 'a', 'b', 'c', 'c', 'c']
+    ```
+  """
+  # Divide `splits` into starts and limits, and repeat them `repeats` times.
+  if repeats.shape.ndims != 0:
+    repeated_starts = repeat(splits[:-1], repeats, axis=0)
+    repeated_limits = repeat(splits[1:], repeats, axis=0)
+  else:
+    # Optimization: we can just call repeat once, and then slice the result.
+    repeated_splits = repeat(splits, repeats, axis=0)
+    n_splits = array_ops.shape(repeated_splits, out_type=dtypes.int64)[0]
+    repeated_starts = repeated_splits[:n_splits - repeats]
+    repeated_limits = repeated_splits[repeats:]
+
+  # Get indices for each range from starts to limits, and use those to gather
+  # the values in the desired repetition pattern.
+  one = array_ops.ones((), repeated_starts.dtype)
+  offsets = gen_ragged_math_ops.ragged_range(
+      repeated_starts, repeated_limits, one)
+  return array_ops.gather(params, offsets.rt_dense_values)
diff --git a/tensorflow/python/ops/ragged/ragged_util_test.py b/tensorflow/python/ops/ragged/ragged_util_test.py
index c24ea65353104f78f9f4e3e90b0c73edb923c7e2..72a4155930708a0e8eb5808807bf788c67de862f 100644
--- a/tensorflow/python/ops/ragged/ragged_util_test.py
+++ b/tensorflow/python/ops/ragged/ragged_util_test.py
@@ -24,6 +24,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.ops.ragged import ragged_util
 from tensorflow.python.platform import googletest
 
@@ -41,7 +42,9 @@ TENSOR_4D = [[[[('%d%d%d%d' % (i, j, k, l)).encode('utf-8')
              for i in range(4)]
 
 
-class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedUtilTest(ragged_test_util.RaggedTensorTestCase,
+                     parameterized.TestCase):
 
   @parameterized.parameters([
       # Docstring examples
@@ -90,7 +93,7 @@ class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   def testRepeat(self, data, repeats, expected, axis=None):
     result = ragged_util.repeat(data, repeats, axis)
     with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected)
+      self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(mode=mode, **args)
@@ -156,7 +159,7 @@ class RaggedRepeatTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
     result = ragged_util.repeat(data, repeats, axis)
     with self.test_session():
-      self.assertEqual(result.eval().tolist(), expected.tolist())
+      self.assertAllEqual(result, expected)
 
   @parameterized.parameters([
       dict(
diff --git a/tensorflow/python/ops/ragged/ragged_where_op_test.py b/tensorflow/python/ops/ragged/ragged_where_op_test.py
index 755333de3923154abd1cebebb0c0e661df55b962..b3cd5a2debe0db0b1bac2b6396c78b9e94c3f671 100644
--- a/tensorflow/python/ops/ragged/ragged_where_op_test.py
+++ b/tensorflow/python/ops/ragged/ragged_where_op_test.py
@@ -22,10 +22,13 @@ from absl.testing import parameterized
 
 from tensorflow.python.framework import test_util
 from tensorflow.python.ops import ragged
+from tensorflow.python.ops.ragged import ragged_test_util
 from tensorflow.python.platform import googletest
 
 
-class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
+@test_util.run_all_in_graph_and_eager_modes
+class RaggedWhereOpTest(ragged_test_util.RaggedTensorTestCase,
+                        parameterized.TestCase):
 
   @parameterized.parameters([
       #=========================================================================
@@ -167,15 +170,7 @@ class RaggedWhereOpTest(test_util.TensorFlowTestCase, parameterized.TestCase):
   ])   # pyformat: disable
   def testRaggedWhere(self, condition, expected, x=None, y=None):
     result = ragged.where(condition, x, y)
-    self.assertEqual(
-        getattr(result, 'ragged_rank', 0), getattr(expected, 'ragged_rank', 0))
-    with self.test_session():
-      result_value = result.eval()
-      if hasattr(result_value, 'tolist'):
-        result_value = result_value.tolist()
-      if hasattr(expected, 'tolist'):
-        expected = expected.tolist()
-      self.assertEqual(result_value, expected)
+    self.assertRaggedEqual(result, expected)
 
   @parameterized.parameters([
       dict(
diff --git a/tensorflow/python/ops/random_ops.py b/tensorflow/python/ops/random_ops.py
index 1f7db0af61f413b5a92e6c195fed7ff48be8c7b0..62e2f6d1025bb9802a5b2a09a4dbffbe15921ace 100644
--- a/tensorflow/python/ops/random_ops.py
+++ b/tensorflow/python/ops/random_ops.py
@@ -138,7 +138,9 @@ def parameterized_truncated_normal(shape,
     return rnd
 
 
-@tf_export("random.truncated_normal", "truncated_normal")
+@tf_export("random.truncated_normal",
+           v1=["random.truncated_normal", "truncated_normal"])
+@deprecation.deprecated_endpoints("truncated_normal")
 def truncated_normal(shape,
                      mean=0.0,
                      stddev=1.0,
@@ -325,7 +327,9 @@ def random_crop(value, size, seed=None, name=None):
     return array_ops.slice(value, offset, size, name=name)
 
 
-@tf_export("random.multinomial", "multinomial")
+@tf_export(v1=["random.multinomial", "multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.categorical instead.")
 def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
   """Draws samples from a multinomial distribution.
 
@@ -342,9 +346,7 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A Python integer. Used to create a random seed for the distribution.
-      See
-      `tf.set_random_seed`
-      for behavior.
+      See `tf.set_random_seed` for behavior.
     name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
 
@@ -352,10 +354,43 @@ def multinomial(logits, num_samples, seed=None, name=None, output_dtype=None):
     The drawn samples of shape `[batch_size, num_samples]`.
   """
   with ops.name_scope(name, "multinomial", [logits]):
-    logits = ops.convert_to_tensor(logits, name="logits")
-    seed1, seed2 = random_seed.get_seed(seed)
-    return gen_random_ops.multinomial(
-        logits, num_samples, seed=seed1, seed2=seed2, output_dtype=output_dtype)
+    return multinomial_categorical_impl(logits, num_samples, output_dtype, seed)
+
+
+@tf_export("random.categorical")
+def categorical(logits, num_samples, dtype=None, seed=None, name=None):
+  """Draws samples from a categorical distribution.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.categorical(tf.log([[10., 10.]]), 5)
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    dtype: integer type to use for the output. Defaults to int64.
+    seed: A Python integer. Used to create a random seed for the distribution.
+      See `tf.set_random_seed` for behavior.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "categorical", [logits]):
+    return multinomial_categorical_impl(logits, num_samples, dtype, seed)
+
+
+def multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for random.multinomial (v1) and random.categorical (v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  seed1, seed2 = random_seed.get_seed(seed)
+  return gen_random_ops.multinomial(
+      logits, num_samples, seed=seed1, seed2=seed2, output_dtype=dtype)
 
 
 ops.NotDifferentiable("Multinomial")
@@ -445,7 +480,7 @@ def random_gamma(shape,
             shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
 
 
-@tf_export("random.poisson", v1=["random.poisson", "random_poisson"])
+@tf_export(v1=["random.poisson", "random_poisson"])
 @deprecation.deprecated_endpoints("random_poisson")
 def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
   """Draws `shape` samples from each of the given Poisson distribution(s).
@@ -478,6 +513,45 @@ def random_poisson(lam, shape, dtype=dtypes.float32, seed=None, name=None):
       for behavior.
     name: Optional name for the operation.
 
+  Returns:
+    samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
+      with values of type `dtype`.
+  """
+  return random_poisson_v2(shape, lam, dtype, seed, name)
+
+
+@tf_export("random.poisson", v1=[])
+def random_poisson_v2(shape, lam, dtype=dtypes.float32, seed=None, name=None):
+  """Draws `shape` samples from each of the given Poisson distribution(s).
+
+  `lam` is the rate parameter describing the distribution(s).
+
+  Example:
+
+  ```python
+  samples = tf.random_poisson([10], [0.5, 1.5])
+  # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
+  # the samples drawn from each distribution
+
+  samples = tf.random_poisson([7, 5], [12.2, 3.3])
+  # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
+  # represents the 7x5 samples drawn from each of the two distributions
+  ```
+
+  Args:
+    shape: A 1-D integer Tensor or Python array. The shape of the output samples
+      to be drawn per "rate"-parameterized distribution.
+    lam: A Tensor or Python value or N-D array of type `dtype`.
+      `lam` provides the rate parameter(s) describing the poisson
+      distribution(s) to sample.
+    dtype: The type of the output: `float16`, `float32`, `float64`, `int32` or
+      `int64`.
+    seed: A Python integer. Used to create a random seed for the distributions.
+      See
+      `tf.set_random_seed`
+      for behavior.
+    name: Optional name for the operation.
+
   Returns:
     samples: a `Tensor` of shape `tf.concat([shape, tf.shape(lam)], axis=0)`
       with values of type `dtype`.
diff --git a/tensorflow/python/ops/resource_variable_ops.py b/tensorflow/python/ops/resource_variable_ops.py
index 488b6fcbcdb2fb5158b6d6a08b90f79aa4630047..1066b357b43bb60d5e5b078846fcd82e12e941c3 100644
--- a/tensorflow/python/ops/resource_variable_ops.py
+++ b/tensorflow/python/ops/resource_variable_ops.py
@@ -26,6 +26,7 @@ from tensorflow.core.framework import variable_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.eager import tape
+from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import cpp_shape_inference_pb2
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -64,6 +65,7 @@ def eager_safe_variable_handle(shape, dtype, shared_name, name, graph_mode):
                                                    name=name,
                                                    container=container)
   if graph_mode:
+    handle._handle_data = get_resource_handle_data(handle)  # pylint: disable=protected-access
     return handle
 
   # We do not want two distinct ResourceVariable objects for the same
@@ -519,7 +521,10 @@ class ResourceVariable(variables.RefVariable):
       snapshot = g.as_graph_element(
           ops.prepend_name_scope(
               variable_def.snapshot_name, import_scope=import_scope))
-      self._cached_value = snapshot
+      if snapshot.op.type != "ReadVariableOp":
+        self._cached_value = snapshot
+      else:
+        self._cached_value = None
       while snapshot.op.type != "ReadVariableOp":
         snapshot = snapshot.op.inputs[0]
       self._graph_element = snapshot
@@ -802,16 +807,6 @@ class ResourceVariable(variables.RefVariable):
     return ResourceVariable(
         variable_def=variable_def, import_scope=import_scope)
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
-    """Register overloads for all operators."""
-    for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      ResourceVariable._OverloadOperator(operator)
-    # For slicing, bind getitem differently than a tensor (use SliceHelperVar
-    # instead)
-    # pylint: disable=protected-access
-    setattr(ResourceVariable, "__getitem__", array_ops._SliceHelperVar)
-
   def _AsTensor(self):
     return self.value()
 
@@ -823,30 +818,6 @@ class ResourceVariable(variables.RefVariable):
     """Unsupported."""
     raise NotImplementedError("ResourceVariable does not implement set_shape()")
 
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
-    """Defer an operator overload to `ops.Tensor`.
-
-    We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
-
-    Args:
-      operator: string. The operator name.
-    """
-
-    tensor_oper = getattr(ops.Tensor, operator)
-    def _run_op(a, *args):
-      # pylint: disable=protected-access
-      value = a._AsTensor()
-      return tensor_oper(value, *args)
-
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = tensor_oper.__doc__
-    except AttributeError:
-      pass
-
-    setattr(ResourceVariable, operator, _run_op)
-
   __array_priority__ = 100
 
   def is_initialized(self, name=None):
@@ -1432,7 +1403,6 @@ ops.register_tensor_conversion_function(
     variables.Variable, variables.Variable._TensorConversionFunction)  # pylint: disable=protected-access
 
 # pylint: disable=protected-access
-ResourceVariable._OverloadAllOperators()
 ops.register_dense_tensor_like_type(ResourceVariable)
 
 
@@ -1442,13 +1412,23 @@ def _ReadGrad(_, grad):
   return grad
 
 
+def variable_shape(handle, out_type=dtypes.int32):
+  if getattr(
+      handle, "_handle_data", None) is None or not handle._handle_data.is_set:
+    return gen_resource_variable_ops.variable_shape(handle, out_type=out_type)
+  shape_proto = handle._handle_data.shape_and_type[0].shape
+  if shape_proto.unknown_rank or any(x.size == -1 for x in shape_proto.dim):
+    return gen_resource_variable_ops.variable_shape(handle, out_type=out_type)
+  return constant_op.constant([x.size for x in shape_proto.dim], dtype=out_type)
+
+
 @ops.RegisterGradient("ResourceGather")
 def _GatherGrad(op, grad):
   """Gradient for gather op."""
   # Build appropriately shaped IndexedSlices
   handle = op.inputs[0]
   indices = op.inputs[1]
-  params_shape = gen_resource_variable_ops.variable_shape(handle)
+  params_shape = variable_shape(handle)
   size = array_ops.expand_dims(array_ops.size(indices), 0)
   values_shape = array_ops.concat([size, params_shape[1:]], 0)
   values = array_ops.reshape(grad, values_shape)
@@ -1522,3 +1502,6 @@ def copy_to_graph_uninitialized(var):
   new_variable._maybe_initialize_checkpointable()
   # pylint: enable=protected-access
   return new_variable
+
+ops.NotDifferentiable("VarIsInitializedOp")
+ops.NotDifferentiable("VariableShape")
diff --git a/tensorflow/python/ops/resources.py b/tensorflow/python/ops/resources.py
index db6740643cffd9ca852d75653c837a39a1731d42..86477c924777e7fe7a093b72fc2c6acea1fdaa0e 100644
--- a/tensorflow/python/ops/resources.py
+++ b/tensorflow/python/ops/resources.py
@@ -21,6 +21,7 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import os
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
@@ -86,7 +87,9 @@ def report_uninitialized_resources(resource_list=None,
     resource_list = shared_resources() + local_resources()
   with ops.name_scope(name):
     # Run all operations on CPU
-    with ops.device("/cpu:0"):
+    local_device = os.environ.get(
+        "TF_DEVICE_FOR_UNINITIALIZED_VARIABLE_REPORTING", "/cpu:0")
+    with ops.device(local_device):
       if not resource_list:
         # Return an empty tensor so we only need to check for returned tensor
         # size being 0 as an indication of model ready.
diff --git a/tensorflow/python/ops/rnn.py b/tensorflow/python/ops/rnn.py
index 57ecb5055737931ff8e7fee16af64d441ffc6a55..ec48cab91d172c54b2f927d946312f086e808c9c 100644
--- a/tensorflow/python/ops/rnn.py
+++ b/tensorflow/python/ops/rnn.py
@@ -117,7 +117,7 @@ def _infer_state_dtype(explicit_dtype, state):
     inferred_dtypes = [element.dtype for element in nest.flatten(state)]
     if not inferred_dtypes:
       raise ValueError("Unable to infer dtype from empty state.")
-    all_same = all([x == inferred_dtypes[0] for x in inferred_dtypes])
+    all_same = all(x == inferred_dtypes[0] for x in inferred_dtypes)
     if not all_same:
       raise ValueError(
           "State has tensors of different inferred_dtypes. Unable to infer a "
@@ -348,7 +348,10 @@ def _reverse_seq(input_seq, lengths):
   return results
 
 
-@tf_export("nn.bidirectional_dynamic_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell))`, which is equivalent to "
+                        "this API")
+@tf_export(v1=["nn.bidirectional_dynamic_rnn"])
 def bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, sequence_length=None,
                               initial_state_fw=None, initial_state_bw=None,
                               dtype=None, parallel_iterations=None,
@@ -1490,7 +1493,10 @@ def static_state_saving_rnn(cell,
   return (outputs, state)
 
 
-@tf_export("nn.static_bidirectional_rnn")
+@deprecation.deprecated(None, "Please use `keras.layers.Bidirectional("
+                        "keras.layers.RNN(cell, unroll=True))`, which is "
+                        "equivalent to this API")
+@tf_export(v1=["nn.static_bidirectional_rnn"])
 def static_bidirectional_rnn(cell_fw,
                              cell_bw,
                              inputs,
diff --git a/tensorflow/python/ops/rnn_cell_impl.py b/tensorflow/python/ops/rnn_cell_impl.py
index 050b486893938b937bf3dc99a27019a36fdc4a1d..ffc45619a74e9b527047f3e55e94664581cb6591 100644
--- a/tensorflow/python/ops/rnn_cell_impl.py
+++ b/tensorflow/python/ops/rnn_cell_impl.py
@@ -36,6 +36,7 @@ from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.keras import activations
 from tensorflow.python.keras import initializers
+from tensorflow.python.keras.engine import input_spec
 from tensorflow.python.keras.utils import tf_utils
 from tensorflow.python.layers import base as base_layer
 from tensorflow.python.ops import array_ops
@@ -410,7 +411,7 @@ class BasicRNNCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -507,7 +508,7 @@ class GRUCell(LayerRNNCell):
                    "Please use tf.contrib.cudnn_rnn.CudnnGRU for better "
                    "performance on GPU.", self)
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     if activation:
@@ -683,7 +684,7 @@ class BasicLSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._forget_bias = forget_bias
@@ -871,7 +872,7 @@ class LSTMCell(LayerRNNCell):
                    "performance on GPU.", self)
 
     # Inputs must be 2-dimensional.
-    self.input_spec = base_layer.InputSpec(ndim=2)
+    self.input_spec = input_spec.InputSpec(ndim=2)
 
     self._num_units = num_units
     self._use_peepholes = use_peepholes
@@ -1394,7 +1395,7 @@ class DeviceWrapper(RNNCell):
       return self._cell(inputs, state, scope=scope)
 
 
-@tf_export("nn.rnn_cell.MultiRNNCell")
+@tf_export(v1=["nn.rnn_cell.MultiRNNCell"])
 class MultiRNNCell(RNNCell):
   """RNN cell composed sequentially of multiple simple cells.
 
@@ -1407,6 +1408,9 @@ class MultiRNNCell(RNNCell):
   ```
   """
 
+  @deprecated(None, "This class is equivalent as "
+                    "tf.keras.layers.StackedRNNCells, and will be replaced by "
+                    "that in Tensorflow 2.0.")
   def __init__(self, cells, state_is_tuple=True):
     """Create a RNN cell composed sequentially of a number of RNNCells.
 
@@ -1452,7 +1456,7 @@ class MultiRNNCell(RNNCell):
     if self._state_is_tuple:
       return tuple(cell.state_size for cell in self._cells)
     else:
-      return sum([cell.state_size for cell in self._cells])
+      return sum(cell.state_size for cell in self._cells)
 
   @property
   def output_size(self):
diff --git a/tensorflow/python/ops/sets_impl.py b/tensorflow/python/ops/sets_impl.py
index 21e08d03d213c173d12dfc6676fe7f009811e93f..ee9c9b6bc0b36a374957178653eaae4c91ad733c 100644
--- a/tensorflow/python/ops/sets_impl.py
+++ b/tensorflow/python/ops/sets_impl.py
@@ -31,7 +31,7 @@ _VALID_DTYPES = set([
     dtypes.uint8, dtypes.uint16, dtypes.string])
 
 
-@tf_export("sets.set_size")
+@tf_export("sets.size", v1=["sets.size", "sets.set_size"])
 def set_size(a, validate_indices=True):
   """Compute number of unique elements along last dimension of `a`.
 
@@ -133,7 +133,8 @@ def _set_operation(a, b, set_operation, validate_indices=True):
   return sparse_tensor.SparseTensor(indices, values, shape)
 
 
-@tf_export("sets.set_intersection")
+@tf_export(
+    "sets.intersection", v1=["sets.intersection", "sets.set_intersection"])
 def set_intersection(a, b, validate_indices=True):
   """Compute set intersection of elements in last dimension of `a` and `b`.
 
@@ -200,7 +201,8 @@ def set_intersection(a, b, validate_indices=True):
   return _set_operation(a, b, "intersection", validate_indices)
 
 
-@tf_export("sets.set_difference")
+@tf_export(
+	   "sets.difference", v1=["sets.difference", "sets.set_difference"])
 def set_difference(a, b, aminusb=True, validate_indices=True):
   """Compute set difference of elements in last dimension of `a` and `b`.
 
@@ -271,7 +273,8 @@ def set_difference(a, b, aminusb=True, validate_indices=True):
   return _set_operation(a, b, "a-b" if aminusb else "b-a", validate_indices)
 
 
-@tf_export("sets.set_union")
+@tf_export(
+	   "sets.union", v1=["sets.union", "sets.set_union"])
 def set_union(a, b, validate_indices=True):
   """Compute set union of elements in last dimension of `a` and `b`.
 
diff --git a/tensorflow/python/ops/signal/BUILD b/tensorflow/python/ops/signal/BUILD
index 0d04dc0c1bf485393b0446bbdf034c7228a36271..da2bf9c1d2d73aeae8dd2d61c4e690bb1ab93b70 100644
--- a/tensorflow/python/ops/signal/BUILD
+++ b/tensorflow/python/ops/signal/BUILD
@@ -6,16 +6,29 @@ exports_files(["LICENSE"])
 
 py_library(
     name = "signal",
-    srcs = glob(["*.py"]),
+    srcs = [
+        "dct_ops.py",
+        "fft_ops.py",
+        "mel_ops.py",
+        "mfcc_ops.py",
+        "reconstruction_ops.py",
+        "shape_ops.py",
+        "signal.py",
+        "spectral_ops.py",
+        "util_ops.py",
+        "window_ops.py",
+    ],
     srcs_version = "PY2AND3",
     deps = [
         "//tensorflow/python:array_ops",
         "//tensorflow/python:constant_op",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:dtypes",
+        "//tensorflow/python:framework",
+        "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:math_ops",
-        "//tensorflow/python:spectral_ops",
+        "//tensorflow/python:spectral_ops_gen",
         "//tensorflow/python:tensor_util",
         "//tensorflow/python:util",
         "//third_party/py/numpy",
diff --git a/tensorflow/python/ops/signal/__init__.py b/tensorflow/python/ops/signal/__init__.py
deleted file mode 100644
index 3fa4e94e5886ebf72c9b8b6bb3b7d5ab21308167..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/signal/__init__.py
+++ /dev/null
@@ -1,37 +0,0 @@
-"""Signal processing operations.
-
-See the [tf.signal](https://tensorflow.org/api_guides/python/contrib.signal)
-guide.
-
-@@frame
-@@hamming_window
-@@hann_window
-@@inverse_stft
-@@inverse_stft_window_fn
-@@mfccs_from_log_mel_spectrograms
-@@linear_to_mel_weight_matrix
-@@overlap_and_add
-@@stft
-
-[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
-[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
-[mel]: https://en.wikipedia.org/wiki/Mel_scale
-[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
-[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
-"""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-# pylint: disable=unused-import
-from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
-from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
-from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
-from tensorflow.python.ops.signal.shape_ops import frame
-from tensorflow.python.ops.signal.spectral_ops import inverse_stft
-from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
-from tensorflow.python.ops.signal.spectral_ops import stft
-from tensorflow.python.ops.signal.window_ops import hamming_window
-from tensorflow.python.ops.signal.window_ops import hann_window
-# pylint: enable=unused-import
diff --git a/tensorflow/python/ops/signal/dct_ops.py b/tensorflow/python/ops/signal/dct_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..d042c95c049538354836ef83f0b21d8babccedc8
--- /dev/null
+++ b/tensorflow/python/ops/signal/dct_ops.py
@@ -0,0 +1,192 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Discrete Cosine Transform ops."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math as _math
+
+from tensorflow.python.framework import dtypes as _dtypes
+from tensorflow.python.framework import ops as _ops
+from tensorflow.python.framework import tensor_shape
+from tensorflow.python.ops import array_ops as _array_ops
+from tensorflow.python.ops import math_ops as _math_ops
+from tensorflow.python.ops.signal import fft_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+def _validate_dct_arguments(input_tensor, dct_type, n, axis, norm):
+  """Checks that DCT/IDCT arguments are compatible and well formed."""
+  if n is not None:
+    raise NotImplementedError("The DCT length argument is not implemented.")
+  if axis != -1:
+    raise NotImplementedError("axis must be -1. Got: %s" % axis)
+  if dct_type not in (1, 2, 3):
+    raise ValueError("Only Types I, II and III (I)DCT are supported.")
+  if dct_type == 1:
+    if norm == "ortho":
+      raise ValueError("Normalization is not supported for the Type-I DCT.")
+    if input_tensor.shape[-1] is not None and input_tensor.shape[-1] < 2:
+      raise ValueError(
+          "Type-I DCT requires the dimension to be greater than one.")
+
+  if norm not in (None, "ortho"):
+    raise ValueError(
+        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.dct", v1=["signal.dct", "spectral.dct"])
+def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
+
+  Currently only Types I, II and III are supported.
+  Type I is implemented using a length `2N` padded `tf.spectral.rfft`.
+  Type II is implemented using a length `2N` padded `tf.spectral.rfft`, as
+  described here:
+  https://dsp.stackexchange.com/a/10606.
+  Type III is a fairly straightforward inverse of Type II
+  (i.e. using a length `2N` padded `tf.spectral.irfft`).
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.dct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to
+      take the DCT of.
+    type: The DCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+    ValueError: If `type` is `1` and `norm` is `ortho`.
+
+  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  with _ops.name_scope(name, "dct", [input]):
+    # We use the RFFT to compute the DCT and TensorFlow only supports float32
+    # for FFTs at the moment.
+    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
+
+    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
+                or _array_ops.shape(input)[-1])
+    axis_dim_float = _math_ops.to_float(axis_dim)
+
+    if type == 1:
+      dct1_input = _array_ops.concat([input, input[..., -2:0:-1]], axis=-1)
+      dct1 = _math_ops.real(fft_ops.rfft(dct1_input))
+      return dct1
+
+    if type == 2:
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+
+      # TODO(rjryan): Benchmark performance and memory usage of the various
+      # approaches to computing a DCT via the RFFT.
+      dct2 = _math_ops.real(
+          fft_ops.rfft(
+              input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
+
+      if norm == "ortho":
+        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(2.0)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        dct2 *= weights
+
+      return dct2
+
+    elif type == 3:
+      if norm == "ortho":
+        n1 = _math_ops.sqrt(axis_dim_float)
+        n2 = n1 * _math_ops.sqrt(0.5)
+        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
+        weights = _array_ops.pad(
+            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
+            constant_values=n2)
+        input *= weights
+      else:
+        input *= axis_dim_float
+      scale = 2.0 * _math_ops.exp(
+          _math_ops.complex(
+              0.0,
+              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
+              axis_dim_float))
+      dct3 = _math_ops.real(
+          fft_ops.irfft(
+              scale * _math_ops.complex(input, 0.0),
+              fft_length=[2 * axis_dim]))[..., :axis_dim]
+
+      return dct3
+
+
+# TODO(rjryan): Implement `n` and `axis` parameters.
+@tf_export("signal.idct", v1=["signal.idct", "spectral.idct"])
+def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
+  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
+
+  Currently only Types I, II and III are supported. Type III is the inverse of
+  Type II, and vice versa.
+
+  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
+  not `'ortho'`. That is:
+  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
+  When `norm='ortho'`, we have:
+  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
+
+  @compatibility(scipy)
+  Equivalent to scipy.fftpack.idct for Type-I, Type-II and Type-III DCT.
+  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
+  @end_compatibility
+
+  Args:
+    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
+      the DCT of.
+    type: The IDCT type to perform. Must be 1, 2 or 3.
+    n: For future expansion. The length of the transform. Must be `None`.
+    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
+    norm: The normalization to apply. `None` for no normalization or `'ortho'`
+      for orthonormal normalization.
+    name: An optional name for the operation.
+
+  Returns:
+    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
+
+  Raises:
+    ValueError: If `type` is not `1`, `2` or `3`, `n` is not `None, `axis` is
+      not `-1`, or `norm` is not `None` or `'ortho'`.
+
+  [idct]:
+  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
+  """
+  _validate_dct_arguments(input, type, n, axis, norm)
+  inverse_type = {1: 1, 2: 3, 3: 2}[type]
+  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
diff --git a/tensorflow/python/ops/spectral_ops.py b/tensorflow/python/ops/signal/fft_ops.py
similarity index 51%
rename from tensorflow/python/ops/spectral_ops.py
rename to tensorflow/python/ops/signal/fft_ops.py
index 4dcc90aefa978b89856ee6f8d77b73c3e7edb550..2d14b2bbd75864b6477bccc5cef562b617674c08 100644
--- a/tensorflow/python/ops/spectral_ops.py
+++ b/tensorflow/python/ops/signal/fft_ops.py
@@ -12,16 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Spectral operators (e.g. DCT, FFT, RFFT)."""
+"""Fast-Fourier Transform ops."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import math as _math
+import numpy as np
 
 from tensorflow.python.framework import dtypes as _dtypes
 from tensorflow.python.framework import ops as _ops
-from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util as _tensor_util
 from tensorflow.python.ops import array_ops as _array_ops
 from tensorflow.python.ops import gen_spectral_ops
@@ -112,6 +111,7 @@ def _rfft_wrapper(fft_fn, fft_rank, default_name):
   """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
 
   def _rfft(input_tensor, fft_length=None, name=None):
+    """Wrapper around gen_spectral_ops.rfft* that infers fft_length argument."""
     with _ops.name_scope(name, default_name,
                          [input_tensor, fft_length]) as name:
       input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.float32)
@@ -130,6 +130,7 @@ def _irfft_wrapper(ifft_fn, fft_rank, default_name):
   """Wrapper around gen_spectral_ops.irfft* that infers fft_length argument."""
 
   def _irfft(input_tensor, fft_length=None, name=None):
+    """Wrapper irfft* that infers fft_length argument."""
     with _ops.name_scope(name, default_name,
                          [input_tensor, fft_length]) as name:
       input_tensor = _ops.convert_to_tensor(input_tensor, _dtypes.complex64)
@@ -145,6 +146,8 @@ def _irfft_wrapper(ifft_fn, fft_rank, default_name):
   return _irfft
 
 
+# FFT/IFFT 1/2/3D are exported via
+# third_party/tensorflow/core/api_def/python_api/
 fft = gen_spectral_ops.fft
 ifft = gen_spectral_ops.ifft
 fft2d = gen_spectral_ops.fft2d
@@ -152,159 +155,176 @@ ifft2d = gen_spectral_ops.ifft2d
 fft3d = gen_spectral_ops.fft3d
 ifft3d = gen_spectral_ops.ifft3d
 rfft = _rfft_wrapper(gen_spectral_ops.rfft, 1, "rfft")
-tf_export("spectral.rfft")(rfft)
+tf_export("signal.rfft", v1=["signal.rfft", "spectral.rfft"])(rfft)
 irfft = _irfft_wrapper(gen_spectral_ops.irfft, 1, "irfft")
-tf_export("spectral.irfft")(irfft)
+tf_export("signal.irfft", v1=["signal.irfft", "spectral.irfft"])(irfft)
 rfft2d = _rfft_wrapper(gen_spectral_ops.rfft2d, 2, "rfft2d")
-tf_export("spectral.rfft2d")(rfft2d)
+tf_export("signal.rfft2d", v1=["signal.rfft2d", "spectral.rfft2d"])(rfft2d)
 irfft2d = _irfft_wrapper(gen_spectral_ops.irfft2d, 2, "irfft2d")
-tf_export("spectral.irfft2d")(irfft2d)
+tf_export("signal.irfft2d", v1=["signal.irfft2d", "spectral.irfft2d"])(irfft2d)
 rfft3d = _rfft_wrapper(gen_spectral_ops.rfft3d, 3, "rfft3d")
-tf_export("spectral.rfft3d")(rfft3d)
+tf_export("signal.rfft3d", v1=["signal.rfft3d", "spectral.rfft3d"])(rfft3d)
 irfft3d = _irfft_wrapper(gen_spectral_ops.irfft3d, 3, "irfft3d")
-tf_export("spectral.irfft3d")(irfft3d)
-
-
-def _validate_dct_arguments(dct_type, n, axis, norm):
-  if n is not None:
-    raise NotImplementedError("The DCT length argument is not implemented.")
-  if axis != -1:
-    raise NotImplementedError("axis must be -1. Got: %s" % axis)
-  if dct_type not in (2, 3):
-    raise ValueError("Only Types II and III (I)DCT are supported.")
-  if norm not in (None, "ortho"):
-    raise ValueError(
-        "Unknown normalization. Expected None or 'ortho', got: %s" % norm)
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.dct")
-def dct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Discrete Cosine Transform (DCT)][dct] of `input`.
-
-  Currently only Types II and III are supported. Type II is implemented using a
-  length `2N` padded `tf.spectral.rfft`, as described here:
-  https://dsp.stackexchange.com/a/10606. Type III is a fairly straightforward
-  inverse of Type II (i.e. using a length `2N` padded `tf.spectral.irfft`).
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.dct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.dct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to
-      take the DCT of.
-    type: The DCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the DCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [dct]: https://en.wikipedia.org/wiki/Discrete_cosine_transform
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  with _ops.name_scope(name, "dct", [input]):
-    # We use the RFFT to compute the DCT and TensorFlow only supports float32
-    # for FFTs at the moment.
-    input = _ops.convert_to_tensor(input, dtype=_dtypes.float32)
-
-    axis_dim = (tensor_shape.dimension_value(input.shape[-1])
-                or _array_ops.shape(input)[-1])
-    axis_dim_float = _math_ops.to_float(axis_dim)
-    if type == 2:
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0, -_math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-
-      # TODO(rjryan): Benchmark performance and memory usage of the various
-      # approaches to computing a DCT via the RFFT.
-      dct2 = _math_ops.real(
-          rfft(input, fft_length=[2 * axis_dim])[..., :axis_dim] * scale)
-
-      if norm == "ortho":
-        n1 = 0.5 * _math_ops.rsqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(2.0)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        dct2 *= weights
-
-      return dct2
-
-    elif type == 3:
-      if norm == "ortho":
-        n1 = _math_ops.sqrt(axis_dim_float)
-        n2 = n1 * _math_ops.sqrt(0.5)
-        # Use tf.pad to make a vector of [n1, n2, n2, n2, ...].
-        weights = _array_ops.pad(
-            _array_ops.expand_dims(n1, 0), [[0, axis_dim - 1]],
-            constant_values=n2)
-        input *= weights
-      else:
-        input *= axis_dim_float
-      scale = 2.0 * _math_ops.exp(
-          _math_ops.complex(
-              0.0,
-              _math_ops.range(axis_dim_float) * _math.pi * 0.5 /
-              axis_dim_float))
-      dct3 = _math_ops.real(
-          irfft(
-              scale * _math_ops.complex(input, 0.0),
-              fft_length=[2 * axis_dim]))[..., :axis_dim]
-
-      return dct3
-
-
-# TODO(rjryan): Implement `type`, `n` and `axis` parameters.
-@tf_export("spectral.idct")
-def idct(input, type=2, n=None, axis=-1, norm=None, name=None):  # pylint: disable=redefined-builtin
-  """Computes the 1D [Inverse Discrete Cosine Transform (DCT)][idct] of `input`.
-
-  Currently only Types II and III are supported. Type III is the inverse of
-  Type II, and vice versa.
-
-  Note that you must re-normalize by 1/(2n) to obtain an inverse if `norm` is
-  not `'ortho'`. That is:
-  `signal == idct(dct(signal)) * 0.5 / signal.shape[-1]`.
-  When `norm='ortho'`, we have:
-  `signal == idct(dct(signal, norm='ortho'), norm='ortho')`.
-
-  @compatibility(scipy)
-  Equivalent to scipy.fftpack.idct for Type-II and Type-III DCT.
-  https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.fftpack.idct.html
-  @end_compatibility
-
-  Args:
-    input: A `[..., samples]` `float32` `Tensor` containing the signals to take
-      the DCT of.
-    type: The IDCT type to perform. Must be 2 or 3.
-    n: For future expansion. The length of the transform. Must be `None`.
-    axis: For future expansion. The axis to compute the DCT along. Must be `-1`.
-    norm: The normalization to apply. `None` for no normalization or `'ortho'`
-      for orthonormal normalization.
-    name: An optional name for the operation.
-
-  Returns:
-    A `[..., samples]` `float32` `Tensor` containing the IDCT of `input`.
-
-  Raises:
-    ValueError: If `type` is not `2` or `3`, `n` is not `None, `axis` is not
-      `-1`, or `norm` is not `None` or `'ortho'`.
-
-  [idct]:
-  https://en.wikipedia.org/wiki/Discrete_cosine_transform#Inverse_transforms
-  """
-  _validate_dct_arguments(type, n, axis, norm)
-  inverse_type = {2: 3, 3: 2}[type]
-  return dct(input, type=inverse_type, n=n, axis=axis, norm=norm, name=name)
+tf_export("signal.irfft3d", v1=["signal.irfft3d", "spectral.irfft3d"])(irfft3d)
+
+
+def _fft_size_for_grad(grad, rank):
+  return _math_ops.reduce_prod(_array_ops.shape(grad)[-rank:])
+
+
+@_ops.RegisterGradient("FFT")
+def _fft_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype)
+  return ifft(grad) * size
+
+
+@_ops.RegisterGradient("IFFT")
+def _ifft_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 1), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT2D")
+def _fft2d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype)
+  return ifft2d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT2D")
+def _ifft2d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 2), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft2d(grad) * rsize
+
+
+@_ops.RegisterGradient("FFT3D")
+def _fft3d_grad(_, grad):
+  size = _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype)
+  return ifft3d(grad) * size
+
+
+@_ops.RegisterGradient("IFFT3D")
+def _ifft3d_grad(_, grad):
+  rsize = _math_ops.cast(
+      1. / _math_ops.cast(_fft_size_for_grad(grad, 3), grad.dtype.real_dtype),
+      grad.dtype)
+  return fft3d(grad) * rsize
+
+
+def _rfft_grad_helper(rank, irfft_fn):
+  """Returns a gradient function for an RFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for RFFT3D.
+  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
+    fft_length = op.inputs[1]
+    input_shape = _array_ops.shape(op.inputs[0])
+    is_even = _math_ops.cast(1 - (fft_length[-1] % 2), _dtypes.complex64)
+
+    def _tile_for_broadcasting(matrix, t):
+      expanded = _array_ops.reshape(
+          matrix,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(t) - 2], _dtypes.int32),
+              _array_ops.shape(matrix)
+          ], 0))
+      return _array_ops.tile(
+          expanded, _array_ops.concat([_array_ops.shape(t)[:-2], [1, 1]], 0))
+
+    def _mask_matrix(length):
+      """Computes t_n = exp(sqrt(-1) * pi * n^2 / line_len)."""
+      # TODO(rjryan): Speed up computation of twiddle factors using the
+      # following recurrence relation and cache them across invocations of RFFT.
+      #
+      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
+      # for n = 0, 1,..., line_len-1.
+      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
+      a = _array_ops.tile(
+          _array_ops.expand_dims(_math_ops.range(length), 0), (length, 1))
+      b = _array_ops.transpose(a, [1, 0])
+      return _math_ops.exp(
+          -2j * np.pi * _math_ops.cast(a * b, _dtypes.complex64) /
+          _math_ops.cast(length, _dtypes.complex64))
+
+    def _ymask(length):
+      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
+      return _math_ops.cast(1 - 2 * (_math_ops.range(length) % 2),
+                            _dtypes.complex64)
+
+    y0 = grad[..., 0:1]
+    if rank == 1:
+      ym = grad[..., -1:]
+      extra_terms = y0 + is_even * ym * _ymask(input_shape[-1])
+    elif rank == 2:
+      # Create a mask matrix for y0 and ym.
+      base_mask = _mask_matrix(input_shape[-2])
+
+      # Tile base_mask to match y0 in shape so that we can batch-matmul the
+      # inner 2 dimensions.
+      tiled_mask = _tile_for_broadcasting(base_mask, y0)
+
+      y0_term = _math_ops.matmul(tiled_mask, _math_ops.conj(y0))
+      extra_terms = y0_term
+
+      ym = grad[..., -1:]
+      ym_term = _math_ops.matmul(tiled_mask, _math_ops.conj(ym))
+
+      inner_dim = input_shape[-1]
+      ym_term = _array_ops.tile(
+          ym_term,
+          _array_ops.concat([
+              _array_ops.ones([_array_ops.rank(grad) - 1], _dtypes.int32),
+              [inner_dim]
+          ], 0)) * _ymask(inner_dim)
+
+      extra_terms += is_even * ym_term
+
+    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
+    # factor, plus some additional terms to make up for the components dropped
+    # due to Hermitian symmetry.
+    input_size = _math_ops.to_float(_fft_size_for_grad(op.inputs[0], rank))
+    the_irfft = irfft_fn(grad, fft_length)
+    return 0.5 * (the_irfft * input_size + _math_ops.real(extra_terms)), None
+
+  return _grad
+
+
+def _irfft_grad_helper(rank, rfft_fn):
+  """Returns a gradient function for an IRFFT of the provided rank."""
+  # Can't happen because we don't register a gradient for IRFFT3D.
+  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
+
+  def _grad(op, grad):
+    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
+    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
+    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
+    # graph we special-case the situation where the FFT length and last
+    # dimension of the input are known at graph construction time.
+    fft_length = op.inputs[1]
+    is_odd = _math_ops.mod(fft_length[-1], 2)
+    input_last_dimension = _array_ops.shape(op.inputs[0])[-1]
+    mask = _array_ops.concat(
+        [[1.0], 2.0 * _array_ops.ones([input_last_dimension - 2 + is_odd]),
+         _array_ops.ones([1 - is_odd])], 0)
+
+    rsize = _math_ops.reciprocal(_math_ops.to_float(
+        _fft_size_for_grad(grad, rank)))
+
+    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
+    # factor and a mask. The mask scales the gradient for the Hermitian
+    # symmetric components of the RFFT by a factor of two, since these
+    # components are de-duplicated in the RFFT.
+    the_rfft = rfft_fn(grad, fft_length)
+    return the_rfft * _math_ops.cast(rsize * mask, _dtypes.complex64), None
+
+  return _grad
+
+
+_ops.RegisterGradient("RFFT")(_rfft_grad_helper(1, irfft))
+_ops.RegisterGradient("IRFFT")(_irfft_grad_helper(1, rfft))
+_ops.RegisterGradient("RFFT2D")(_rfft_grad_helper(2, irfft2d))
+_ops.RegisterGradient("IRFFT2D")(_irfft_grad_helper(2, rfft2d))
diff --git a/tensorflow/python/ops/signal/mfcc_ops.py b/tensorflow/python/ops/signal/mfcc_ops.py
index 6ae3b222ba5934a032298a77bbcde494654acd1a..601409dea901f34cca02861971850c3238378163 100644
--- a/tensorflow/python/ops/signal/mfcc_ops.py
+++ b/tensorflow/python/ops/signal/mfcc_ops.py
@@ -22,7 +22,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import dct_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -106,5 +106,5 @@ def mfccs_from_log_mel_spectrograms(log_mel_spectrograms, name=None):
     else:
       num_mel_bins = array_ops.shape(log_mel_spectrograms)[-1]
 
-    dct2 = spectral_ops.dct(log_mel_spectrograms)
+    dct2 = dct_ops.dct(log_mel_spectrograms, type=2)
     return dct2 * math_ops.rsqrt(math_ops.to_float(num_mel_bins) * 2.0)
diff --git a/tensorflow/python/ops/signal/reconstruction_ops.py b/tensorflow/python/ops/signal/reconstruction_ops.py
index 0fc7fec23933d600c89513fb39d3a45856a8618b..4eaab4e0a0cd7958d56c9af3ccf2c5f69b35ee9b 100644
--- a/tensorflow/python/ops/signal/reconstruction_ops.py
+++ b/tensorflow/python/ops/signal/reconstruction_ops.py
@@ -18,46 +18,14 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops.signal import shape_ops
-from tensorflow.python.ops.signal import util_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _shuffle_to_front(input_tensor, k):
-  """Shuffles the last `k` indices of `input_tensor` to the front.
-
-  Transposes `input_tensor` to have the last `k` indices at the front. The input
-  may have arbitrary rank and unknown shape.
-
-  Args:
-    input_tensor: A `Tensor` of arbitrary rank and unknown shape.
-    k: A scalar `Tensor` specifying how many indices to shuffle.
-
-  Returns:
-    A transposed version of `input_tensor` with `k` indices shuffled to the
-    front.
-
-  Raises:
-    ValueError: If `input_tensor` is not at least rank `k` or `k` is not scalar.
-  """
-  k = ops.convert_to_tensor(k, name="k")
-  k.shape.with_rank(0)
-  k_static = tensor_util.constant_value(k)
-  if k_static is not None:
-    input_tensor.shape.with_rank_at_least(k_static)
-
-  rank = array_ops.rank(input_tensor)
-  outer_indices, inner_indices = array_ops.split(math_ops.range(rank),
-                                                 [rank - k, k])
-  permutation = array_ops.concat([inner_indices, outer_indices], 0)
-
-  return array_ops.transpose(input_tensor, perm=permutation)
-
-
 @tf_export("signal.overlap_and_add")
 def overlap_and_add(signal, frame_step, name=None):
   """Reconstructs a signal from a framed representation.
@@ -80,8 +48,8 @@ def overlap_and_add(signal, frame_step, name=None):
     frames of `signal`'s inner-most two dimensions.
 
   Raises:
-    ValueError: If `signal`'s rank is less than 2, `frame_step` is not a scalar
-      integer or `frame_step` is greater than `frame_length`.
+    ValueError: If `signal`'s rank is less than 2, or `frame_step` is not a
+      scalar integer.
   """
   with ops.name_scope(name, "overlap_and_add", [signal, frame_step]):
     signal = ops.convert_to_tensor(signal, name="signal")
@@ -97,56 +65,91 @@ def overlap_and_add(signal, frame_step, name=None):
     # All dimensions that are not part of the overlap-and-add. Can be empty for
     # rank 2 inputs.
     outer_dimensions = signal_shape[:-2]
+    outer_rank = array_ops.size(outer_dimensions)
+
+    def full_shape(inner_shape):
+      return array_ops.concat([outer_dimensions, inner_shape], 0)
 
-    # If frame_length and frame_step are known at graph construction time, check
-    # frame_step is less than or equal to frame_length.
-    frame_step_static = tensor_util.constant_value(frame_step)
-    if (frame_step_static is not None and signal.shape.ndims is not None and
-        signal.shape.dims[-1].value is not None):
-      if frame_step_static > signal.shape.dims[-1].value:
-        raise ValueError(
-            "frame_step (%d) must be less than or equal to "
-            "frame_length (%d)" % (
-                frame_step_static, signal.shape.dims[-1].value))
-      # If frame_length is equal to frame_step, there's no overlap so just
-      # reshape the tensor.
-      if frame_step_static == signal.shape.dims[-1].value:
-        return array_ops.reshape(signal, array_ops.concat(
-            [outer_dimensions, [-1]], 0))
-
-    signal_rank = array_ops.rank(signal)
-    frames = signal_shape[-2]
     frame_length = signal_shape[-1]
+    frames = signal_shape[-2]
 
-    subframe_length = util_ops.gcd(frame_length, frame_step)
-    subframe_step = frame_step // subframe_length
-    subframes_per_frame = frame_length // subframe_length
-    output_size = frame_step * (frames - 1) + frame_length
-    output_subframes = output_size // subframe_length
-
-    # To avoid overlap-adding sample-by-sample, we overlap-add at the "subframe"
-    # level, where a subframe is gcd(frame_length, frame_step). Reshape signal
-    # from [..., frames, frame_length] into [..., subframes, subframe_length].
-    subframe_shape = array_ops.concat(
-        [outer_dimensions, [-1, subframe_length]], 0)
-    subframe_signal = array_ops.reshape(signal, subframe_shape)
-
-    # Now we shuffle the last [subframes, subframe_length] dimensions to the
-    # front.
-    # TODO(rjryan): Add an axis argument to unsorted_segment_sum so we can
-    # avoid this pair of transposes.
-    subframe_signal = _shuffle_to_front(subframe_signal, 2)
-
-    # Use unsorted_segment_sum to add overlapping subframes together.
-    segment_ids = array_ops.reshape(shape_ops.frame(
-        math_ops.range(output_subframes), subframes_per_frame, subframe_step,
-        pad_end=False), [-1])
-    result = math_ops.unsorted_segment_sum(subframe_signal, segment_ids,
-                                           num_segments=output_subframes)
-
-    # result is a [subframes, subframe_length, ...outer_dimensions] tensor. We
-    # return a [...outer_dimensions, output_size] tensor with a transpose and
-    # reshape.
-    result_shape = array_ops.concat([outer_dimensions, [output_size]], 0)
-    return array_ops.reshape(_shuffle_to_front(result, signal_rank - 2),
-                             result_shape)
+    # Compute output length.
+    output_length = frame_length + frame_step * (frames - 1)
+
+    # If frame_length is equal to frame_step, there's no overlap so just
+    # reshape the tensor.
+    frame_step_static = tensor_util.constant_value(frame_step)
+    if (frame_step_static is not None and signal.shape.dims is not None and
+        frame_step_static == signal.shape.dims[-1].value):
+      output_shape = full_shape([output_length])
+      return array_ops.reshape(signal, output_shape, name="fast_path")
+
+    # The following code is documented using this example:
+    #
+    # frame_step = 2
+    # signal.shape = (3, 5)
+    # a b c d e
+    # f g h i j
+    # k l m n o
+
+    # Compute the number of segments, per frame.
+    segments = -(-frame_length // frame_step)  # Divide and round up.
+
+    # Pad the frame_length dimension to a multiple of the frame step.
+    # Pad the frames dimension by `segments` so that signal.shape = (6, 6)
+    # a b c d e 0
+    # f g h i j 0
+    # k l m n o 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    # 0 0 0 0 0 0
+    paddings = [[0, segments], [0, segments * frame_step - frame_length]]
+    outer_paddings = array_ops.zeros([outer_rank, 2], dtypes.int32)
+    paddings = array_ops.concat([outer_paddings, paddings], 0)
+    signal = array_ops.pad(signal, paddings)
+
+    # Reshape so that signal.shape = (3, 6, 2)
+    # ab cd e0
+    # fg hi j0
+    # kl mn o0
+    # 00 00 00
+    # 00 00 00
+    # 00 00 00
+    shape = full_shape([frames + segments, segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Transpose dimensions so that signal.shape = (3, 6, 2)
+    # ab fg kl 00 00 00
+    # cd hi mn 00 00 00
+    # e0 j0 o0 00 00 00
+    perm = array_ops.concat(
+        [math_ops.range(outer_rank), outer_rank + [1, 0, 2]], 0)
+    signal = array_ops.transpose(signal, perm)
+
+    # Reshape so that signal.shape = (18, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0 00 00 00
+    shape = full_shape([(frames + segments) * segments, frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate so that signal.shape = (15, 2)
+    # ab fg kl 00 00 00 cd hi mn 00 00 00 e0 j0 o0
+    signal = signal[..., :(frames + segments - 1) * segments, :]
+
+    # Reshape so that signal.shape = (3, 5, 2)
+    # ab fg kl 00 00
+    # 00 cd hi mn 00
+    # 00 00 e0 j0 o0
+    shape = full_shape([segments, (frames + segments - 1), frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Now, reduce over the columns, to achieve the desired sum.
+    signal = math_ops.reduce_sum(signal, -3)
+
+    # Flatten the array.
+    shape = full_shape([(frames + segments - 1) * frame_step])
+    signal = array_ops.reshape(signal, shape)
+
+    # Truncate to final length.
+    signal = signal[..., :output_length]
+
+    return signal
diff --git a/tensorflow/python/ops/signal/shape_ops.py b/tensorflow/python/ops/signal/shape_ops.py
index 02dd7c97e8f0fcb527b9e44d1fb1468ca9dcf18a..ae9c2ef28e4f1c857519838f22a4844ac2c9e7b4 100644
--- a/tensorflow/python/ops/signal/shape_ops.py
+++ b/tensorflow/python/ops/signal/shape_ops.py
@@ -71,7 +71,7 @@ def frame(signal, frame_length, frame_step, pad_end=False, pad_value=0, axis=-1,
   ```python
   pcm = tf.placeholder(tf.float32, [None, 9152])
   frames = tf.signal.frame(pcm, 512, 180)
-  magspec = tf.abs(tf.spectral.rfft(frames, [512]))
+  magspec = tf.abs(tf.signal.rfft(frames, [512]))
   image = tf.expand_dims(magspec, 3)
   ```
 
diff --git a/tensorflow/python/ops/signal/signal.py b/tensorflow/python/ops/signal/signal.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdc4d1c1911a8570ba28a0b42bd6da5d83fd40e1
--- /dev/null
+++ b/tensorflow/python/ops/signal/signal.py
@@ -0,0 +1,65 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Signal processing operations.
+
+See the [tf.signal](https://tensorflow.org/api_guides/python/contrib.signal)
+guide.
+
+@@frame
+@@hamming_window
+@@hann_window
+@@inverse_stft
+@@inverse_stft_window_fn
+@@mfccs_from_log_mel_spectrograms
+@@linear_to_mel_weight_matrix
+@@overlap_and_add
+@@stft
+
+[hamming]: https://en.wikipedia.org/wiki/Window_function#Hamming_window
+[hann]: https://en.wikipedia.org/wiki/Window_function#Hann_window
+[mel]: https://en.wikipedia.org/wiki/Mel_scale
+[mfcc]: https://en.wikipedia.org/wiki/Mel-frequency_cepstrum
+[stft]: https://en.wikipedia.org/wiki/Short-time_Fourier_transform
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+# pylint: disable=unused-import
+from tensorflow.python.ops.signal.dct_ops import dct
+from tensorflow.python.ops.signal.dct_ops import idct
+from tensorflow.python.ops.signal.fft_ops import fft
+from tensorflow.python.ops.signal.fft_ops import fft2d
+from tensorflow.python.ops.signal.fft_ops import fft3d
+from tensorflow.python.ops.signal.fft_ops import ifft
+from tensorflow.python.ops.signal.fft_ops import ifft2d
+from tensorflow.python.ops.signal.fft_ops import ifft3d
+from tensorflow.python.ops.signal.fft_ops import irfft
+from tensorflow.python.ops.signal.fft_ops import irfft2d
+from tensorflow.python.ops.signal.fft_ops import irfft3d
+from tensorflow.python.ops.signal.fft_ops import rfft
+from tensorflow.python.ops.signal.fft_ops import rfft2d
+from tensorflow.python.ops.signal.fft_ops import rfft3d
+from tensorflow.python.ops.signal.mel_ops import linear_to_mel_weight_matrix
+from tensorflow.python.ops.signal.mfcc_ops import mfccs_from_log_mel_spectrograms
+from tensorflow.python.ops.signal.reconstruction_ops import overlap_and_add
+from tensorflow.python.ops.signal.shape_ops import frame
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft
+from tensorflow.python.ops.signal.spectral_ops import inverse_stft_window_fn
+from tensorflow.python.ops.signal.spectral_ops import stft
+from tensorflow.python.ops.signal.window_ops import hamming_window
+from tensorflow.python.ops.signal.window_ops import hann_window
+# pylint: enable=unused-import
diff --git a/tensorflow/python/ops/signal/spectral_ops.py b/tensorflow/python/ops/signal/spectral_ops.py
index b0b7d964b93f4fca1bdb6f62be8193bc43736988..f029e0a8b59777b50e38ab4d8f801e811467c561 100644
--- a/tensorflow/python/ops/signal/spectral_ops.py
+++ b/tensorflow/python/ops/signal/spectral_ops.py
@@ -25,7 +25,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
+from tensorflow.python.ops.signal import fft_ops
 from tensorflow.python.ops.signal import reconstruction_ops
 from tensorflow.python.ops.signal import shape_ops
 from tensorflow.python.ops.signal import window_ops
@@ -86,9 +86,9 @@ def stft(signals, frame_length, frame_step, fft_length=None,
       window = window_fn(frame_length, dtype=framed_signals.dtype)
       framed_signals *= window
 
-    # spectral_ops.rfft produces the (fft_length/2 + 1) unique components of the
+    # fft_ops.rfft produces the (fft_length/2 + 1) unique components of the
     # FFT of the real windowed signals in framed_signals.
-    return spectral_ops.rfft(framed_signals, [fft_length])
+    return fft_ops.rfft(framed_signals, [fft_length])
 
 
 @tf_export('signal.inverse_stft_window_fn')
@@ -232,7 +232,7 @@ def inverse_stft(stfts,
       fft_length = ops.convert_to_tensor(fft_length, name='fft_length')
       fft_length.shape.assert_has_rank(0)
 
-    real_frames = spectral_ops.irfft(stfts, [fft_length])
+    real_frames = fft_ops.irfft(stfts, [fft_length])
 
     # frame_length may be larger or smaller than fft_length, so we pad or
     # truncate real_frames to frame_length.
diff --git a/tensorflow/python/ops/sort_ops.py b/tensorflow/python/ops/sort_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3e23d701ed546ca76e2dd08e999ff869e87c816
--- /dev/null
+++ b/tensorflow/python/ops/sort_ops.py
@@ -0,0 +1,197 @@
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Support for sorting tensors.
+
+@@argsort
+@@sort
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import numpy as np
+
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import ops as framework_ops
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.ops import nn_ops
+from tensorflow.python.util.tf_export import tf_export
+
+
+@tf_export('sort')
+def sort(values, axis=-1, direction='ASCENDING', name=None):
+  """Sorts a tensor.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    name: Optional name for the operation.
+
+  Returns:
+    A `Tensor` with the same dtype and shape as `values`, with the elements
+        sorted along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  with framework_ops.name_scope(name, 'sort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=False)
+
+
+@tf_export('argsort')
+def argsort(values, axis=-1, direction='ASCENDING', stable=False, name=None):
+  """Returns the indices of a tensor that give its sorted order along an axis.
+
+  For a 1D tensor, `tf.gather(values, tf.argsort(values))` is equivalent to
+  `tf.sort(values)`. For higher dimensions, the output has the same shape as
+  `values`, but along the given axis, values represent the index of the sorted
+  element in that slice of the tensor at the given position.
+
+  Args:
+    values: 1-D or higher numeric `Tensor`.
+    axis: The axis along which to sort. The default is -1, which sorts the last
+      axis.
+    direction: The direction in which to sort the values (`'ASCENDING'` or
+      `'DESCENDING'`).
+    stable: If True, equal elements in the original tensor will not be
+      re-ordered in the returned order. Unstable sort is not yet implemented,
+      but will eventually be the default for performance reasons. If you require
+      a stable order, pass `stable=True` for forwards compatibility.
+    name: Optional name for the operation.
+
+  Returns:
+    An int32 `Tensor` with the same shape as `values`. The indices that would
+        sort each slice of the given `values` along the given `axis`.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  del stable  # Unused.
+  with framework_ops.name_scope(name, 'argsort'):
+    return _sort_or_argsort(values, axis, direction, return_argsort=True)
+
+
+def _sort_or_argsort(values, axis, direction, return_argsort):
+  """Internal sort/argsort implementation.
+
+  Args:
+    values: The input values.
+    axis: The axis along which to sort.
+    direction: 'ASCENDING' or 'DESCENDING'.
+    return_argsort: Whether to return the argsort result.
+
+  Returns:
+    Either the sorted values, or the indices of the sorted values in the
+        original tensor. See the `sort` and `argsort` docstrings.
+
+  Raises:
+    ValueError: If axis is not a constant scalar, or the direction is invalid.
+  """
+  if direction not in _SORT_IMPL:
+    raise ValueError('%s should be one of %s' % (direction, ', '.join(
+        sorted(_SORT_IMPL.keys()))))
+  # Axis must be an integer, not a Tensor.
+  axis = framework_ops.convert_to_tensor(axis, name='axis')
+  axis_static = tensor_util.constant_value(axis)
+  if axis.shape.ndims != 0 or axis_static is None:
+    raise ValueError('axis must be a constant scalar')
+  axis_static = int(axis_static)  # Avoids NumPy casting error
+
+  values = framework_ops.convert_to_tensor(values, name='values')
+
+  return _SORT_IMPL[direction](values, axis_static, return_argsort)
+
+
+def _descending_sort(values, axis, return_argsort=False):
+  """Sorts values in reverse using `top_k`.
+
+  Args:
+    values: Tensor of numeric values.
+    axis: Index of the axis which values should be sorted along.
+    return_argsort: If False, return the sorted values. If True, return the
+      indices that would sort the values.
+
+  Returns:
+    The sorted values.
+  """
+  k = array_ops.shape(values)[axis]
+  rank = array_ops.rank(values)
+  static_rank = values.shape.ndims
+  # Fast path: sorting the last axis.
+  if axis == -1 or axis + 1 == values.get_shape().ndims:
+    top_k_input = values
+    transposition = None
+  else:
+    # Otherwise, transpose the array. Swap axes `axis` and `rank - 1`.
+    if axis < 0:
+      # Calculate the actual axis index if counting from the end. Use the static
+      # rank if available, or else make the axis back into a tensor.
+      axis += static_rank or rank
+    if static_rank is not None:
+      # Prefer to calculate the transposition array in NumPy and make it a
+      # constant.
+      transposition = constant_op.constant(
+          np.r_[
+              # Axes up to axis are unchanged.
+              np.arange(axis),
+              # Swap axis and rank - 1.
+              [static_rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              np.arange(axis + 1, static_rank - 1),
+              # Swap axis and rank - 1.
+              [axis]],
+          name='transposition')
+    else:
+      # Generate the transposition array from the tensors.
+      transposition = array_ops.concat(
+          [
+              # Axes up to axis are unchanged.
+              math_ops.range(axis),
+              # Swap axis and rank - 1.
+              [rank - 1],
+              # Axes in [axis + 1, rank - 1) are unchanged.
+              math_ops.range(axis + 1, rank - 1),
+              # Swap axis and rank - 1.
+              [axis]
+          ],
+          axis=0)
+    top_k_input = array_ops.transpose(values, transposition)
+
+  values, indices = nn_ops.top_k(top_k_input, k)
+  return_value = indices if return_argsort else values
+  if transposition is not None:
+    # transposition contains a single cycle of length 2 (swapping 2 elements),
+    # so it is an involution (it is its own inverse).
+    return_value = array_ops.transpose(return_value, transposition)
+  return return_value
+
+
+def _ascending_sort(values, axis, return_argsort=False):
+  # Negate the values to get the ascending order from descending sort.
+  values_or_indices = _descending_sort(-values, axis, return_argsort)
+  # If not argsort, negate the values again.
+  return values_or_indices if return_argsort else -values_or_indices
+
+
+_SORT_IMPL = {
+    'ASCENDING': _ascending_sort,
+    'DESCENDING': _descending_sort,
+}
diff --git a/tensorflow/contrib/framework/python/ops/sort_ops_test.py b/tensorflow/python/ops/sort_ops_test.py
similarity index 90%
rename from tensorflow/contrib/framework/python/ops/sort_ops_test.py
rename to tensorflow/python/ops/sort_ops_test.py
index 791b32cd1e2eea9f466a14585a8b15d085bd450f..17ce604cbf195427033aa71e4c7b4d7ceed61c50 100644
--- a/tensorflow/contrib/framework/python/ops/sort_ops_test.py
+++ b/tensorflow/python/ops/sort_ops_test.py
@@ -20,22 +20,25 @@ from __future__ import print_function
 
 import numpy as np
 
-from tensorflow.contrib.framework.python.ops import sort_ops
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_util
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
+from tensorflow.python.ops import sort_ops
 from tensorflow.python.platform import test
 
 
 class SortTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testRandom_lowDimensionality(self):
     self._testRandom_lowDimensionality(negative_axis=False)
 
+  @test_util.run_deprecated_v1
   def testRandom_lowDimensionality_negative(self):
     self._testRandom_lowDimensionality(negative_axis=True)
 
@@ -53,6 +56,7 @@ class SortTest(test.TestCase):
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
 
+  @test_util.run_deprecated_v1
   def testRandom_highDimensionality(self):
     np.random.seed(100)
     for _ in range(20):
@@ -65,6 +69,7 @@ class SortTest(test.TestCase):
             np.sort(arr, axis=sort_axis),
             sort_ops.sort(constant_op.constant(arr), axis=sort_axis).eval())
 
+  @test_util.run_deprecated_v1
   def testScalar(self):
     # Create an empty scalar where the static shape is unknown.
     zeros_length_1 = array_ops.zeros(
@@ -77,21 +82,22 @@ class SortTest(test.TestCase):
       with self.assertRaises(errors.InvalidArgumentError):
         sort.eval()
 
+  @test_util.run_deprecated_v1
   def testNegativeOutOfBounds_staticShape(self):
     arr = constant_op.constant([3, 4, 5])
     with self.assertRaises(ValueError):
       sort_ops.sort(arr, axis=-4)
 
+  @test_util.run_deprecated_v1
   def testDescending(self):
     arr = np.random.random((10, 5, 5))
     with self.cached_session():
       self.assertAllEqual(
           np.sort(arr, axis=0)[::-1],
           sort_ops.sort(
-              constant_op.constant(arr),
-              axis=0,
-              direction='DESCENDING').eval())
+              constant_op.constant(arr), axis=0, direction='DESCENDING').eval())
 
+  @test_util.run_deprecated_v1
   def testSort_staticallyKnownRank_constantTransposition(self):
     # The transposition array should be a constant if the rank of "values" is
     # statically known.
@@ -109,6 +115,7 @@ class SortTest(test.TestCase):
         tensor_util.constant_value(transposition),
         [0, 4, 2, 3, 1])
 
+  @test_util.run_deprecated_v1
   def testArgsort_1d(self):
     arr = np.random.random(42)
     with self.cached_session():
@@ -116,6 +123,7 @@ class SortTest(test.TestCase):
           np.sort(arr),
           array_ops.gather(arr, sort_ops.argsort(arr)).eval())
 
+  @test_util.run_deprecated_v1
   def testArgsort(self):
     arr = np.random.random((5, 6, 7, 8))
     for axis in range(4):
diff --git a/tensorflow/python/ops/sparse_grad.py b/tensorflow/python/ops/sparse_grad.py
index 1223b290ff6cfcfba27f40c05556c85b59e77148..2ca9c0c647d14b792b2575c8f977d9dbe39efb4b 100644
--- a/tensorflow/python/ops/sparse_grad.py
+++ b/tensorflow/python/ops/sparse_grad.py
@@ -195,7 +195,7 @@ def _SparseTensorDenseMatMulGrad(op, grad):
   parts_a = array_ops.gather(grad, rows if not adj_a else cols)
   parts_b = array_ops.gather(b if not adj_b else array_ops.transpose(b),
                              cols if not adj_a else rows)
-  a_values_grad = math_ops.reduce_sum(parts_a * parts_b, reduction_indices=1)
+  a_values_grad = math_ops.reduce_sum(parts_a * parts_b, axis=1)
 
   # gradients w.r.t. (a_indices, a_values, a_shape, b)
   return (None, a_values_grad, None, b_grad)
diff --git a/tensorflow/python/ops/sparse_ops.py b/tensorflow/python/ops/sparse_ops.py
index b98c7f5f65b641acf1003a259c0027287cb733f8..097b485a115fb8153f77d0ad24c63b872fb2e8ca 100644
--- a/tensorflow/python/ops/sparse_ops.py
+++ b/tensorflow/python/ops/sparse_ops.py
@@ -44,6 +44,9 @@ from tensorflow.python.ops.gen_sparse_ops import *
 # pylint: enable=wildcard-import
 from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
+from tensorflow.python.util import tf_inspect
+from tensorflow.python.util.tf_export import get_canonical_name_for_symbol
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -186,7 +189,7 @@ def sparse_eye(num_rows,
 
 
 # pylint: disable=protected-access
-@tf_export("sparse.concat", "sparse_concat")
+@tf_export(v1=["sparse.concat", "sparse_concat"])
 @deprecation.deprecated_endpoints("sparse_concat")
 @deprecation.deprecated_args(
     None, "concat_dim is deprecated, use axis instead", "concat_dim")
@@ -292,6 +295,11 @@ def sparse_concat(axis,
   """
   axis = deprecation.deprecated_argument_lookup("axis", axis, "concat_dim",
                                                 concat_dim)
+  return sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dim, name)
+
+
+@tf_export("sparse.concat", v1=[])
+def sparse_concat_v2(axis, sp_inputs, expand_nonconcat_dims=False, name=None):  # pylint: disable=missing-docstring
   sp_inputs = _convert_to_sparse_tensors(sp_inputs)
 
   if len(sp_inputs) == 1:  # Degenerate case of one tensor.
@@ -301,7 +309,7 @@ def sparse_concat(axis,
   vals = [sp_input.values for sp_input in sp_inputs]
   shapes = [sp_input.dense_shape for sp_input in sp_inputs]
 
-  if expand_nonconcat_dim:
+  if expand_nonconcat_dims:
     max_shape = math_ops.reduce_max(
         array_ops.concat(
             [array_ops.reshape(shape, [1, -1]) for shape in shapes], 0), 0)
@@ -319,9 +327,15 @@ def sparse_concat(axis,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.add", v1=["sparse.add", "sparse_add"])
+sparse_concat_v2.__doc__ = sparse_concat.__doc__.replace(
+    "    concat_dim: The old (deprecated) name for axis.\n", "")
+
+
+@tf_export(v1=["sparse.add", "sparse_add"])
 @deprecation.deprecated_endpoints("sparse_add")
-def sparse_add(a, b, thresh=0):
+@deprecation.deprecated_args(
+    None, "thresh is deprecated, use threshold instead", "thresh")
+def sparse_add(a, b, threshold=None, thresh=None):
   """Adds two tensors, at least one of each is a `SparseTensor`.
 
   If one `SparseTensor` and one `Tensor` are passed in, returns a `Tensor`.  If
@@ -359,12 +373,74 @@ def sparse_add(a, b, thresh=0):
 
   Args:
     a: The first operand; `SparseTensor` or `Tensor`.
-    b: The second operand; `SparseTensor` or `Tensor`.  At least one operand
+    b: The second operand; `SparseTensor` or `Tensor`. At least one operand
+      must be sparse.
+    threshold: An optional 0-D `Tensor` (defaults to `0`). The magnitude
+      threshold that determines if an output value/index pair takes space. Its
+      dtype should match that of the values if they are real; if the latter are
+      complex64/complex128, then the dtype should be float32/float64,
+      correspondingly.
+    thresh: Deprecated alias for `threshold`.
+
+  Returns:
+    A `SparseTensor` or a `Tensor`, representing the sum.
+
+  Raises:
+    TypeError: If both `a` and `b` are `Tensor`s.  Use `tf.add()` instead.
+  """
+  threshold = deprecation.deprecated_argument_lookup("threshold", threshold,
+                                                     "thresh", thresh)
+  if threshold is None:
+    threshold = 0
+  return sparse_add_v2(a, b, threshold)
+
+
+@tf_export("sparse.add", v1=[])
+def sparse_add_v2(a, b, threshold=0):
+  """Adds two tensors, at least one of each is a `SparseTensor`.
+
+  If one `SparseTensor` and one `Tensor` are passed in, returns a `Tensor`.  If
+  both arguments are `SparseTensor`s, this returns a `SparseTensor`.  The order
+  of arguments does not matter.  Use vanilla `tf.add()` for adding two dense
+  `Tensor`s.
+
+  The shapes of the two operands must match: broadcasting is not supported.
+
+  The indices of any input `SparseTensor` are assumed ordered in standard
+  lexicographic order.  If this is not the case, before this step run
+  `SparseReorder` to restore index ordering.
+
+  If both arguments are sparse, we perform "clipping" as follows.  By default,
+  if two values sum to zero at some index, the output `SparseTensor` would still
+  include that particular location in its index, storing a zero in the
+  corresponding value slot.  To override this, callers can specify `threshold`,
+  indicating that if the sum has a magnitude strictly smaller than `threshold`,
+  its corresponding value and index would then not be included.  In particular,
+  `threshold == 0.0` (default) means everything is kept and actual thresholding
+  happens only for a positive value.
+
+  For example, suppose the logical sum of two sparse operands is (densified):
+
+      [       2]
+      [.1     0]
+      [ 6   -.2]
+
+  Then,
+
+      * `threshold == 0` (the default): all 5 index/value pairs will be
+          returned.
+      * `threshold == 0.11`: only .1 and 0 will vanish, and the remaining three
+          index/value pairs will be returned.
+      * `threshold == 0.21`: .1, 0, and -.2 will vanish.
+
+  Args:
+    a: The first operand; `SparseTensor` or `Tensor`.
+    b: The second operand; `SparseTensor` or `Tensor`. At least one operand
       must be sparse.
-    thresh: A 0-D `Tensor`.  The magnitude threshold that determines if an
-    output value/index pair takes space.  Its dtype should match that of the
-    values if they are real; if the latter are complex64/complex128, then the
-    dtype should be float32/float64, correspondingly.
+    threshold: A 0-D `Tensor`. The magnitude threshold that determines if an
+      output value/index pair takes space. Its dtype should match that of the
+      values if they are real; if the latter are complex64/complex128, then the
+      dtype should be float32/float64, correspondingly.
 
   Returns:
     A `SparseTensor` or a `Tensor`, representing the sum.
@@ -380,11 +456,12 @@ def sparse_add(a, b, thresh=0):
   if all(isinstance(inp, sparse_classes) for inp in [a, b]):
     a = _convert_to_sparse_tensor(a)
     b = _convert_to_sparse_tensor(b)
-    thresh = ops.convert_to_tensor(
-        thresh, dtype=a.values.dtype.real_dtype.base_dtype, name="thresh")
+    threshold = ops.convert_to_tensor(
+        threshold, dtype=a.values.dtype.real_dtype.base_dtype, name="threshold")
     output_ind, output_val, output_shape = (
         gen_sparse_ops.sparse_add(a.indices, a.values, a.dense_shape,
-                                  b.indices, b.values, b.dense_shape, thresh))
+                                  b.indices, b.values, b.dense_shape,
+                                  threshold))
 
     # Attempt to get output_shape statically.
     a.get_shape().assert_is_compatible_with(b.get_shape())
@@ -705,7 +782,7 @@ class KeywordRequired(object):
     return "KeywordRequired()"
 
 
-@tf_export("sparse.split", "sparse_split")
+@tf_export(v1=["sparse.split", "sparse_split"])
 @deprecation.deprecated_endpoints("sparse_split")
 @deprecation.deprecated_args(
     None, "split_dim is deprecated, use axis instead", "split_dim")
@@ -779,6 +856,51 @@ def sparse_split(keyword_required=KeywordRequired(),
   return sparse_tensors
 
 
+@tf_export("sparse.split", v1=[])
+def sparse_split_v2(sp_input=None,
+                    num_split=None,
+                    axis=None,
+                    name=None):
+  """Split a `SparseTensor` into `num_split` tensors along `axis`.
+
+  If the `sp_input.dense_shape[axis]` is not an integer multiple of `num_split`
+  each slice starting from 0:`shape[axis] % num_split` gets extra one
+  dimension. For example, if `axis = 1` and `num_split = 2` and the
+  input is:
+
+      input_tensor = shape = [2, 7]
+      [    a   d e  ]
+      [b c          ]
+
+  Graphically the output tensors are:
+
+      output_tensor[0] =
+      [    a ]
+      [b c   ]
+
+      output_tensor[1] =
+      [ d e  ]
+      [      ]
+
+  Args:
+    sp_input: The `SparseTensor` to split.
+    num_split: A Python integer. The number of ways to split.
+    axis: A 0-D `int32` `Tensor`. The dimension along which to split.
+    name: A name for the operation (optional).
+
+  Returns:
+    `num_split` `SparseTensor` objects resulting from splitting `value`.
+
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return sparse_split(sp_input=sp_input,
+                      num_split=num_split,
+                      axis=axis,
+                      name=name,
+                      split_dim=None)
+
+
 @tf_export("sparse.slice", v1=["sparse.slice", "sparse_slice"])
 @deprecation.deprecated_endpoints("sparse_slice")
 def sparse_slice(sp_input, start, size, name=None):
@@ -829,7 +951,7 @@ def sparse_slice(sp_input, start, size, name=None):
                                       output_shape)
 
 
-@tf_export("sparse_to_dense")
+@tf_export(v1=["sparse_to_dense"])
 @deprecation.deprecated(
     None,
     "Create a `tf.sparse.SparseTensor` and use `tf.sparse.to_dense` instead.")
@@ -888,10 +1010,92 @@ def sparse_to_dense(sparse_indices,
       name=name)
 
 
-@tf_export("sparse.reduce_max", "sparse_reduce_max")
+@tf_export("sparse.reduce_max", v1=[])
+def sparse_reduce_max_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the max of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_max()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: A gradient is not defined for this function, so it can't be used
+  in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless
+  `keepdims` is true, the rank of the tensor is reduced by 1 for each entry in
+  `axis`. If `keepdims` is true, the reduced dimensions are retained
+  with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  The values not defined in `sp_input` don't participate in the reduce max,
+  as opposed to be implicitly assumed 0 -- hence it can return negative values
+  for sparse `axis`. But, in case there are no values in
+  `axis`, it will reduce to 0. See second example below.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 2]
+  #                 [?, 3, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_max(x) ==> 3
+  tf.sparse.reduce_max(x, 0) ==> [1, 3, 2]
+  tf.sparse.reduce_max(x, 1) ==> [2, 3]  # Can also use -1 as the axis.
+  tf.sparse.reduce_max(x, 1, keepdims=True) ==> [[2], [3]]
+  tf.sparse.reduce_max(x, [0, 1]) ==> 3
+
+  # 'y' represents [[-7, ?]
+  #                 [ 4, 3]
+  #                 [ ?, ?]
+  tf.sparse.reduce_max(x, 1) ==> [-7, 4, 0]
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_max_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_max(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_max", "sparse_reduce_max"])
 @deprecation.deprecated_endpoints("sparse_reduce_max")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_max(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the max of elements across dimensions of a SparseTensor.
@@ -940,7 +1144,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims:  Deprecated alias for `keepdims`.
 
   Returns:
@@ -956,7 +1160,7 @@ def sparse_reduce_max(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_max_sparse", "sparse_reduce_max_sparse")
+@tf_export(v1=["sparse.reduce_max_sparse", "sparse_reduce_max_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_max_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1007,10 +1211,80 @@ def sparse_reduce_max_sparse(sp_input,
   return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
 
 
-@tf_export("sparse.reduce_sum", "sparse_reduce_sum")
+@tf_export("sparse.reduce_sum", v1=[])
+def sparse_reduce_sum_v2(
+    sp_input, axis=None, keepdims=None, output_is_sparse=False, name=None):
+  """Computes the sum of elements across dimensions of a SparseTensor.
+
+  This Op takes a SparseTensor and is the sparse counterpart to
+  `tf.reduce_sum()`.  In particular, this Op also returns a dense `Tensor`
+  if `output_is_sparse` is `False`, or a `SparseTensor` if `output_is_sparse`
+  is `True`.
+
+  Note: if `output_is_sparse` is True, a gradient is not defined for this
+  function, so it can't be used in training models that need gradient descent.
+
+  Reduces `sp_input` along the dimensions given in `axis`.  Unless `keepdims` is
+  true, the rank of the tensor is reduced by 1 for each entry in `axis`. If
+  `keepdims` is true, the reduced dimensions are retained with length 1.
+
+  If `axis` has no entries, all dimensions are reduced, and a tensor
+  with a single element is returned.  Additionally, the axes can be negative,
+  similar to the indexing rules in Python.
+
+  For example:
+
+  ```python
+  # 'x' represents [[1, ?, 1]
+  #                 [?, 1, ?]]
+  # where ? is implicitly-zero.
+  tf.sparse.reduce_sum(x) ==> 3
+  tf.sparse.reduce_sum(x, 0) ==> [1, 1, 1]
+  tf.sparse.reduce_sum(x, 1) ==> [2, 1]  # Can also use -1 as the axis.
+  tf.sparse.reduce_sum(x, 1, keepdims=True) ==> [[2], [1]]
+  tf.sparse.reduce_sum(x, [0, 1]) ==> 3
+  ```
+
+  Args:
+    sp_input: The SparseTensor to reduce. Should have numeric type.
+    axis: The dimensions to reduce; list or scalar. If `None` (the
+      default), reduces all dimensions.
+    keepdims: If true, retain reduced dimensions with length 1.
+    output_is_sparse: If true, returns a `SparseTensor` instead of a dense
+      `Tensor` (the default).
+    name: A name for the operation (optional).
+
+  Returns:
+    The reduced Tensor or the reduced SparseTensor if `output_is_sparse` is
+    True.
+  """
+  if keepdims is None:
+    keepdims = False
+
+  # reduction_axes is the deprecated name for axis.
+  reduction_axes = None
+
+  if output_is_sparse:
+    output_ind, output_val, output_shape = (
+        gen_sparse_ops.sparse_reduce_sum_sparse(
+            sp_input.indices, sp_input.values, sp_input.dense_shape,
+            math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+            name=name))
+    return sparse_tensor.SparseTensor(output_ind, output_val, output_shape)
+
+  return gen_sparse_ops.sparse_reduce_sum(
+      sp_input.indices, sp_input.values, sp_input.dense_shape,
+      math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims,
+      name=name)
+
+
+@tf_export(v1=["sparse.reduce_sum", "sparse_reduce_sum"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
+@deprecation.deprecated_args(
+    None, "reduction_axes is deprecated, use axis instead",
+    "reduction_axes")
 def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
                       reduction_axes=None, keep_dims=None):
   """Computes the sum of elements across dimensions of a SparseTensor.
@@ -1046,7 +1320,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
     axis: The dimensions to reduce; list or scalar. If `None` (the
       default), reduces all dimensions.
     keepdims: If true, retain reduced dimensions with length 1.
-    reduction_axes: Deprecated name of axis.
+    reduction_axes: Deprecated name of `axis`.
     keep_dims: Deprecated alias for `keepdims`.
 
   Returns:
@@ -1062,7 +1336,7 @@ def sparse_reduce_sum(sp_input, axis=None, keepdims=None,
       math_ops._ReductionDims(sp_input, axis, reduction_axes), keepdims)
 
 
-@tf_export("sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse")
+@tf_export(v1=["sparse.reduce_sum_sparse", "sparse_reduce_sum_sparse"])
 @deprecation.deprecated_endpoints("sparse_reduce_sum_sparse")
 @deprecation.deprecated_args(
     None, "keep_dims is deprecated, use keepdims instead", "keep_dims")
@@ -1157,7 +1431,7 @@ def sparse_tensor_to_dense(sp_input,
   """
   sp_input = _convert_to_sparse_tensor(sp_input)
 
-  return sparse_to_dense(
+  return gen_sparse_ops.sparse_to_dense(
       sp_input.indices,
       sp_input.dense_shape,
       sp_input.values,
@@ -1231,8 +1505,8 @@ def sparse_to_indicator(sp_input, vocab_size, name=None):
         sp_new, default_value=False, validate_indices=False, name=name)
 
 
-@tf_export("sparse.merge", v1=["sparse.merge", "sparse_merge"])
-@deprecation.deprecated_endpoints("sparse_merge")
+@tf_export(v1=["sparse.merge", "sparse_merge"])
+@deprecation.deprecated(None, "No similar op available at this time.")
 def sparse_merge(sp_ids, sp_values, vocab_size, name=None,
                  already_sorted=False):
   """Combines a batch of feature ids and values into a single `SparseTensor`.
@@ -1593,8 +1867,7 @@ def sparse_fill_empty_rows(sp_input, default_value, name=None):
         dense_shape=sp_input.dense_shape), empty_row_indicator)
 
 
-@tf_export(
-    "io.serialize_sparse", v1=["io.serialize_sparse", "serialize_sparse"])
+@tf_export(v1=["io.serialize_sparse", "serialize_sparse"])
 @deprecation.deprecated_endpoints("serialize_sparse")
 def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
@@ -1608,6 +1881,25 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
     A 3-vector (1-D `Tensor`), with each column representing the serialized
     `SparseTensor`'s indices, values, and shape (respectively).
 
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return serialize_sparse_v2(sp_input, out_type, name)
+
+
+@tf_export("io.serialize_sparse", v1=[])
+def serialize_sparse_v2(sp_input, out_type=dtypes.string, name=None):
+  """Serialize a `SparseTensor` into a 3-vector (1-D `Tensor`) object.
+
+  Args:
+    sp_input: The input `SparseTensor`.
+    out_type: The `dtype` to use for serialization.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A 3-vector (1-D `Tensor`), with each column representing the serialized
+    `SparseTensor`'s indices, values, and shape (respectively).
+
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
@@ -1621,9 +1913,7 @@ def serialize_sparse(sp_input, name=None, out_type=dtypes.string):
       out_type=out_type)
 
 
-@tf_export(
-    "io.serialize_many_sparse",
-    v1=["io.serialize_many_sparse", "serialize_many_sparse"])
+@tf_export(v1=["io.serialize_many_sparse", "serialize_many_sparse"])
 @deprecation.deprecated_endpoints("serialize_many_sparse")
 def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
   """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
@@ -1646,6 +1936,34 @@ def serialize_many_sparse(sp_input, name=None, out_type=dtypes.string):
     represents serialized `SparseTensor`'s indices, values, and shape
     (respectively).
 
+  Raises:
+    TypeError: If `sp_input` is not a `SparseTensor`.
+  """
+  return serialize_many_sparse_v2(sp_input, out_type, name)
+
+
+@tf_export("io.serialize_many_sparse", v1=[])
+def serialize_many_sparse_v2(sp_input, out_type=dtypes.string, name=None):
+  """Serialize `N`-minibatch `SparseTensor` into an `[N, 3]` `Tensor`.
+
+  The `SparseTensor` must have rank `R` greater than 1, and the first dimension
+  is treated as the minibatch dimension.  Elements of the `SparseTensor`
+  must be sorted in increasing order of this first dimension.  The serialized
+  `SparseTensor` objects going into each row of the output `Tensor` will have
+  rank `R-1`.
+
+  The minibatch size `N` is extracted from `sparse_shape[0]`.
+
+  Args:
+    sp_input: The input rank `R` `SparseTensor`.
+    out_type: The `dtype` to use for serialization.
+    name: A name prefix for the returned tensors (optional).
+
+  Returns:
+    A matrix (2-D `Tensor`) with `N` rows and `3` columns. Each column
+    represents serialized `SparseTensor`'s indices, values, and shape
+    (respectively).
+
   Raises:
     TypeError: If `sp_input` is not a `SparseTensor`.
   """
@@ -1798,7 +2116,9 @@ def deserialize_many_sparse(serialized_sparse, dtype, rank=None, name=None):
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
 
 
-@tf_export("sparse.matmul", v1=["sparse.matmul", "sparse_tensor_dense_matmul"])
+@tf_export("sparse.sparse_dense_matmul",
+           v1=["sparse.sparse_dense_matmul", "sparse.matmul",
+               "sparse_tensor_dense_matmul"])
 @deprecation.deprecated_endpoints("sparse_tensor_dense_matmul")
 def sparse_tensor_dense_matmul(sp_a,
                                b,
@@ -2362,3 +2682,48 @@ def _take_many_sparse_from_tensors_map(sparse_map_op,
   output_shape.set_shape([rank])
 
   return sparse_tensor.SparseTensor(output_indices, output_values, output_shape)
+
+
+class _UnaryMapValueDispatcher(dispatch.OpDispatcher):
+  """OpDispatcher for unary ops that maps base function across sparse values."""
+
+  def __init__(self, original_func):
+    self._original_func = original_func
+    func_name = get_canonical_name_for_symbol(original_func)
+    arg_names = tf_inspect.getfullargspec(original_func)[0]
+    self._x = arg_names[0]
+    original_func.__doc__ = (
+        original_func.__doc__.rstrip() + "\n\n" +
+        ("    If `{x}` is a `SparseTensor`, returns\n"
+         "    `SparseTensor({x}.indices, tf.{func}({x}.values, ...), "
+         "{x}.dense_shape)`").format(x=self._x, func=func_name))
+
+  def handle(self, args, kwargs):
+    if args:
+      x, args = args[0], args[1:]
+    else:
+      kwargs = kwargs.copy()
+      x = kwargs.pop(self._x, None)
+    if isinstance(x, sparse_tensor.SparseTensor):
+      return sparse_tensor.SparseTensor(
+          indices=x.indices,
+          values=self._original_func(x.values, *args, **kwargs),
+          dense_shape=x.dense_shape)
+    else:
+      return self.NOT_SUPPORTED
+
+
+_UNARY_OPS = [
+    # TODO(b/120307967) Add dispatchers for additional TensorFlow ops.
+    math_ops.abs,
+    math_ops.negative,
+    math_ops.sign,
+    math_ops.square,
+    math_ops.sqrt,
+    math_ops.erf,
+    math_ops.tanh,
+    math_ops.bessel_i0e,
+    math_ops.bessel_i1e,
+]
+for unary_op in _UNARY_OPS:
+  _UnaryMapValueDispatcher(unary_op).register(unary_op)
diff --git a/tensorflow/python/ops/sparse_ops_test.py b/tensorflow/python/ops/sparse_ops_test.py
index 4ee1569249b5ccd3b38de7bb6c2bb5bce761c513..031069a0f017c5d7e80999d2aa6a3e5fd2cf10e6 100644
--- a/tensorflow/python/ops/sparse_ops_test.py
+++ b/tensorflow/python/ops/sparse_ops_test.py
@@ -18,18 +18,20 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from absl.testing import parameterized
 import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import test_util
+from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import sparse_ops
 from tensorflow.python.platform import googletest
 
 
 @test_util.run_all_in_graph_and_eager_modes
-class SparseOpsTest(test_util.TensorFlowTestCase):
+class SparseOpsTest(test_util.TensorFlowTestCase, parameterized.TestCase):
 
   def testSparseEye(self):
     def test_one(n, m, as_tensors):
@@ -77,5 +79,23 @@ class SparseOpsTest(test_util.TensorFlowTestCase):
           d = sparse_ops.sparse_to_dense(s.indices, s.dense_shape, s.values)
           self.assertAllEqual(self.evaluate(d), expected_after)
 
+  @parameterized.parameters([
+      (math_ops.abs, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 3.0, 4.0]),
+      (math_ops.negative, [1.0, -1.0, 3.0, -4.0], [-1.0, 1.0, -3.0, 4.0]),
+      (math_ops.sign, [3.0, -2.0, 0.0, -4.0], [1.0, -1.0, 0.0, -1.0]),
+      (math_ops.square, [1.0, -1.0, 3.0, -4.0], [1.0, 1.0, 9.0, 16.0]),
+  ])
+  def testUnarySparseDispatch(self, op, values, expected):
+    st = sparse_tensor.SparseTensor(
+        indices=[[0, 0], [0, 1], [2, 0], [2, 4]],
+        values=values,
+        dense_shape=[3, 6])
+    result = op(st)
+    result_value = self.evaluate(result)
+    self.assertAllEqual(result_value.indices, st.indices)
+    self.assertAllEqual(result_value.values, expected)
+    self.assertAllEqual(result_value.dense_shape, st.dense_shape)
+
+
 if __name__ == '__main__':
   googletest.main()
diff --git a/tensorflow/python/ops/special_math_ops.py b/tensorflow/python/ops/special_math_ops.py
index f44f694109e602c49a196bdc5767635b89c2ee67..21f4996798eda29c8c9090c12b096d888c0b12d8 100644
--- a/tensorflow/python/ops/special_math_ops.py
+++ b/tensorflow/python/ops/special_math_ops.py
@@ -70,8 +70,7 @@ def lbeta(x, name=None):
     x = ops.convert_to_tensor(x, name='x')
 
     # Note reduce_sum([]) = 0.
-    log_prod_gamma_x = math_ops.reduce_sum(
-        math_ops.lgamma(x), reduction_indices=[-1])
+    log_prod_gamma_x = math_ops.reduce_sum(math_ops.lgamma(x), axis=[-1])
 
     # Note lgamma(0) = infinity, so if x = []
     # log_gamma_sum_x = lgamma(0) = infinity, and
@@ -264,11 +263,11 @@ def einsum(equation, *inputs, **kwargs):
 
     missing_indices = set(temp_axis_labels) - set(output_axis_labels)
     if missing_indices:
-      reduction_indices = [
+      axis = [
           i for i, a in enumerate(temp_axis_labels)
           if a not in output_axis_labels
       ]
-      temp = math_ops.reduce_sum(temp, reduction_indices=reduction_indices)
+      temp = math_ops.reduce_sum(temp, axis=axis)
       temp_axis_labels = ''.join(
           a for a in temp_axis_labels if a in output_axis_labels)
 
diff --git a/tensorflow/python/ops/special_math_ops_test.py b/tensorflow/python/ops/special_math_ops_test.py
index 7438cdb3f11314a918838957db8edd1d14c8bcb9..94aaebed951a96a4aade8d05d36b3366e59708a5 100644
--- a/tensorflow/python/ops/special_math_ops_test.py
+++ b/tensorflow/python/ops/special_math_ops_test.py
@@ -46,6 +46,7 @@ class LBetaTest(test.TestCase):
           0.5, self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one_half))))
       self.assertEqual([], special_math_ops.lbeta(x_one).get_shape())
 
+  @test_util.run_deprecated_v1
   def test_one_dimensional_arg_dynamic(self):
     # Should evaluate to 1 and 1/2.
     x_one = [1, 1.]
@@ -57,6 +58,7 @@ class LBetaTest(test.TestCase):
       self.assertAllClose(0.5,
                           beta_ph.eval(feed_dict={ph: x_one_half}))
 
+  @test_util.run_deprecated_v1
   def test_four_dimensional_arg_with_partial_shape_dynamic(self):
     x_ = np.ones((3, 2, 3, 4))
     # Gamma(1) = 0! = 1
@@ -81,6 +83,7 @@ class LBetaTest(test.TestCase):
           self.evaluate(math_ops.exp(special_math_ops.lbeta(x_one_half))))
       self.assertEqual((2,), special_math_ops.lbeta(x_one_half).get_shape())
 
+  @test_util.run_deprecated_v1
   def test_two_dimensional_arg_dynamic(self):
     # Should evaluate to 1/2.
     x_one_half = [[2, 1.], [2, 1.]]
@@ -288,6 +291,7 @@ class EinsumTest(test.TestCase):
     for case in self.long_cases:
       self.run_test(case)
 
+  @test_util.run_deprecated_v1
   def test_invalid(self):
     for axes in self.invalid_cases:
       inputs = [
@@ -297,6 +301,7 @@ class EinsumTest(test.TestCase):
       with self.assertRaises(ValueError):
         _ = special_math_ops.einsum(axes, *inputs)
 
+  @test_util.run_deprecated_v1
   def test_invalid_keyword_arguments(self):
     m0 = array_ops.placeholder(dtypes.int32, shape=(1, None))
     m1 = array_ops.placeholder(dtypes.int32, shape=(None, 1))
@@ -311,11 +316,13 @@ class EinsumTest(test.TestCase):
           invalid1='value1',
           invalid2='value2')
 
+  @test_util.run_deprecated_v1
   def test_repeated_axis_single_input(self):
     x = array_ops.placeholder(dtypes.float32, shape=[2, 2])
     with self.assertRaises(ValueError):
       _ = special_math_ops.einsum('ii->', x)
 
+  @test_util.run_deprecated_v1
   def test_dim_mismatch(self):
     for axes, input_shapes in self.dim_mismatch_cases:
       inputs = [
diff --git a/tensorflow/python/ops/spectral_grad.py b/tensorflow/python/ops/spectral_grad.py
deleted file mode 100644
index 0af24114acbe5fa6283191f9d71e32805eba3f29..0000000000000000000000000000000000000000
--- a/tensorflow/python/ops/spectral_grad.py
+++ /dev/null
@@ -1,185 +0,0 @@
-# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Gradients for operators defined in spectral_ops.py."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import numpy as np
-
-from tensorflow.python.framework import dtypes
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import math_ops
-from tensorflow.python.ops import spectral_ops
-
-
-def _FFTSizeForGrad(grad, rank):
-  return math_ops.reduce_prod(array_ops.shape(grad)[-rank:])
-
-
-@ops.RegisterGradient("FFT")
-def _FFTGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype)
-  return spectral_ops.ifft(grad) * size
-
-
-@ops.RegisterGradient("IFFT")
-def _IFFTGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 1), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft(grad) * rsize
-
-
-@ops.RegisterGradient("FFT2D")
-def _FFT2DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype)
-  return spectral_ops.ifft2d(grad) * size
-
-
-@ops.RegisterGradient("IFFT2D")
-def _IFFT2DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 2), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft2d(grad) * rsize
-
-
-@ops.RegisterGradient("FFT3D")
-def _FFT3DGrad(_, grad):
-  size = math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype)
-  return spectral_ops.ifft3d(grad) * size
-
-
-@ops.RegisterGradient("IFFT3D")
-def _IFFT3DGrad(_, grad):
-  rsize = math_ops.cast(
-      1. / math_ops.cast(_FFTSizeForGrad(grad, 3), grad.dtype.real_dtype),
-      grad.dtype)
-  return spectral_ops.fft3d(grad) * rsize
-
-
-def _RFFTGradHelper(rank, irfft_fn):
-  """Returns a gradient function for an RFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for RFFT3D.
-  assert rank in (1, 2), "Gradient for RFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for RFFT with the provided `rank` and `irfft_fn`."""
-    fft_length = op.inputs[1]
-    input_shape = array_ops.shape(op.inputs[0])
-    is_even = math_ops.cast(1 - (fft_length[-1] % 2), dtypes.complex64)
-
-    def _TileForBroadcasting(matrix, t):
-      expanded = array_ops.reshape(
-          matrix,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(t) - 2], dtypes.int32),
-              array_ops.shape(matrix)
-          ], 0))
-      return array_ops.tile(
-          expanded, array_ops.concat([array_ops.shape(t)[:-2], [1, 1]], 0))
-
-    def _MaskMatrix(length):
-      # TODO(rjryan): Speed up computation of twiddle factors using the
-      # following recurrence relation and cache them across invocations of RFFT.
-      #
-      # t_n = exp(sqrt(-1) * pi * n^2 / line_len)
-      # for n = 0, 1,..., line_len-1.
-      # For n > 2, use t_n = t_{n-1}^2 / t_{n-2} * t_1^2
-      a = array_ops.tile(
-          array_ops.expand_dims(math_ops.range(length), 0), (length, 1))
-      b = array_ops.transpose(a, [1, 0])
-      return math_ops.exp(-2j * np.pi * math_ops.cast(a * b, dtypes.complex64) /
-                          math_ops.cast(length, dtypes.complex64))
-
-    def _YMMask(length):
-      """A sequence of [1+0j, -1+0j, 1+0j, -1+0j, ...] with length `length`."""
-      return math_ops.cast(1 - 2 * (math_ops.range(length) % 2),
-                           dtypes.complex64)
-
-    y0 = grad[..., 0:1]
-    if rank == 1:
-      ym = grad[..., -1:]
-      extra_terms = y0 + is_even * ym * _YMMask(input_shape[-1])
-    elif rank == 2:
-      # Create a mask matrix for y0 and ym.
-      base_mask = _MaskMatrix(input_shape[-2])
-
-      # Tile base_mask to match y0 in shape so that we can batch-matmul the
-      # inner 2 dimensions.
-      tiled_mask = _TileForBroadcasting(base_mask, y0)
-
-      y0_term = math_ops.matmul(tiled_mask, math_ops.conj(y0))
-      extra_terms = y0_term
-
-      ym = grad[..., -1:]
-      ym_term = math_ops.matmul(tiled_mask, math_ops.conj(ym))
-
-      inner_dim = input_shape[-1]
-      ym_term = array_ops.tile(
-          ym_term,
-          array_ops.concat([
-              array_ops.ones([array_ops.rank(grad) - 1], dtypes.int32),
-              [inner_dim]
-          ], 0)) * _YMMask(inner_dim)
-
-      extra_terms += is_even * ym_term
-
-    # The gradient of RFFT is the IRFFT of the incoming gradient times a scaling
-    # factor, plus some additional terms to make up for the components dropped
-    # due to Hermitian symmetry.
-    input_size = math_ops.to_float(_FFTSizeForGrad(op.inputs[0], rank))
-    irfft = irfft_fn(grad, fft_length)
-    return 0.5 * (irfft * input_size + math_ops.real(extra_terms)), None
-
-  return _Grad
-
-
-def _IRFFTGradHelper(rank, rfft_fn):
-  """Returns a gradient function for an IRFFT of the provided rank."""
-  # Can't happen because we don't register a gradient for IRFFT3D.
-  assert rank in (1, 2), "Gradient for IRFFT3D is not implemented."
-
-  def _Grad(op, grad):
-    """A gradient function for IRFFT with the provided `rank` and `rfft_fn`."""
-    # Generate a simple mask like [1.0, 2.0, ..., 2.0, 1.0] for even-length FFTs
-    # and [1.0, 2.0, ..., 2.0] for odd-length FFTs. To reduce extra ops in the
-    # graph we special-case the situation where the FFT length and last
-    # dimension of the input are known at graph construction time.
-    fft_length = op.inputs[1]
-    is_odd = math_ops.mod(fft_length[-1], 2)
-    input_last_dimension = array_ops.shape(op.inputs[0])[-1]
-    mask = array_ops.concat(
-        [[1.0], 2.0 * array_ops.ones([input_last_dimension - 2 + is_odd]),
-         array_ops.ones([1 - is_odd])], 0)
-
-    rsize = math_ops.reciprocal(math_ops.to_float(_FFTSizeForGrad(grad, rank)))
-
-    # The gradient of IRFFT is the RFFT of the incoming gradient times a scaling
-    # factor and a mask. The mask scales the gradient for the Hermitian
-    # symmetric components of the RFFT by a factor of two, since these
-    # components are de-duplicated in the RFFT.
-    rfft = rfft_fn(grad, fft_length)
-    return rfft * math_ops.cast(rsize * mask, dtypes.complex64), None
-
-  return _Grad
-
-
-ops.RegisterGradient("RFFT")(_RFFTGradHelper(1, spectral_ops.irfft))
-ops.RegisterGradient("IRFFT")(_IRFFTGradHelper(1, spectral_ops.rfft))
-ops.RegisterGradient("RFFT2D")(_RFFTGradHelper(2, spectral_ops.irfft2d))
-ops.RegisterGradient("IRFFT2D")(_IRFFTGradHelper(2, spectral_ops.rfft2d))
diff --git a/tensorflow/python/ops/standard_ops.py b/tensorflow/python/ops/standard_ops.py
index 4f1662ab086796af0fff9acbf4ad425c6460e37d..c614d072badbdf7927d6c889288e1cf4e8d988ef 100644
--- a/tensorflow/python/ops/standard_ops.py
+++ b/tensorflow/python/ops/standard_ops.py
@@ -31,7 +31,6 @@ from tensorflow.python.ops import manip_grad
 from tensorflow.python.ops import math_grad
 from tensorflow.python.ops import random_grad
 from tensorflow.python.ops import sparse_grad
-from tensorflow.python.ops import spectral_grad
 from tensorflow.python.ops import state_grad
 from tensorflow.python.ops import tensor_array_grad
 
@@ -51,6 +50,7 @@ from tensorflow.python.ops.control_flow_ops import group
 from tensorflow.python.ops.control_flow_ops import no_op
 from tensorflow.python.ops.control_flow_ops import tuple  # pylint: disable=redefined-builtin
 # pylint: enable=redefined-builtin
+from tensorflow.python.eager import wrap_function
 from tensorflow.python.ops.control_flow_ops import while_loop
 from tensorflow.python.ops.data_flow_ops import *
 from tensorflow.python.ops.functional_ops import *
@@ -72,6 +72,7 @@ from tensorflow.python.ops.partitioned_variables import *
 from tensorflow.python.ops.random_ops import *
 from tensorflow.python.ops.script_ops import py_func
 from tensorflow.python.ops.session_ops import *
+from tensorflow.python.ops.sort_ops import *
 from tensorflow.python.ops.sparse_ops import *
 from tensorflow.python.ops.state_ops import assign
 from tensorflow.python.ops.state_ops import assign_add
diff --git a/tensorflow/python/ops/state_ops.py b/tensorflow/python/ops/state_ops.py
index 76684f89f8ac9347486a115c12e0b4f5ff49ba30..3ac69c1c202d71b91e42f0f4a5bdd80c881ef97d 100644
--- a/tensorflow/python/ops/state_ops.py
+++ b/tensorflow/python/ops/state_ops.py
@@ -32,6 +32,7 @@ from tensorflow.python.ops import gen_state_ops
 # pylint: disable=wildcard-import
 from tensorflow.python.ops.gen_state_ops import *
 # pylint: enable=wildcard-import
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -595,7 +596,9 @@ def scatter_nd_sub(ref, indices, updates, use_locking=False, name=None):
       name=name))
 
 
-@tf_export("batch_scatter_update")
+@tf_export(v1=["batch_scatter_update"])
+@deprecation.deprecated(
+    "2018-11-29", "Use the batch_scatter_update method of Variable instead.")
 def batch_scatter_update(ref, indices, updates, use_locking=True, name=None):
   """Generalization of `tf.scatter_update` to axis different than 0.
 
diff --git a/tensorflow/python/ops/stateless_random_ops.py b/tensorflow/python/ops/stateless_random_ops.py
index c6defabacdbe227d7e4ed20badae9c3ce0c553b0..b119049b163dd57aee08f078e5ab5ca913f61706 100644
--- a/tensorflow/python/ops/stateless_random_ops.py
+++ b/tensorflow/python/ops/stateless_random_ops.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import math_ops
+from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 ops.NotDifferentiable("StatelessMultinomial")
@@ -179,7 +180,9 @@ def stateless_truncated_normal(shape,
     return math_ops.add(rnd * stddev, mean, name=name)
 
 
-@tf_export("random.stateless_multinomial")
+@tf_export(v1=["random.stateless_multinomial"])
+@deprecation.deprecated(
+    date=None, instructions="Use tf.random.stateless_categorical instead.")
 def stateless_multinomial(logits,
                           num_samples,
                           seed,
@@ -207,13 +210,58 @@ def stateless_multinomial(logits,
       `[i, :]` represents the unnormalized log-probabilities for all classes.
     num_samples: 0-D.  Number of independent samples to draw for each row slice.
     seed: A shape [2] integer Tensor of seeds to the random number generator.
-    name: Optional name for the operation.
     output_dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
 
   Returns:
     The drawn samples of shape `[batch_size, num_samples]`.
   """
   with ops.name_scope(name, "stateless_multinomial", [logits, seed]):
-    logits = ops.convert_to_tensor(logits, name="logits")
-    return gen_stateless_random_ops.stateless_multinomial(
-        logits, num_samples, seed, output_dtype=output_dtype)
+    return stateless_multinomial_categorical_impl(logits, num_samples,
+                                                  output_dtype, seed)
+
+
+@tf_export("random.stateless_categorical")
+def stateless_categorical(logits,
+                          num_samples,
+                          seed,
+                          dtype=dtypes.int64,
+                          name=None):
+  """Draws deterministic pseudorandom samples from a categorical distribution.
+
+  This is a stateless version of `tf.categorical`: if run twice with the
+  same seeds, it will produce the same pseudorandom numbers.  The output is
+  consistent across multiple runs on the same hardware (and between CPU
+  and GPU), but may change between versions of TensorFlow or on non-CPU/GPU
+  hardware.
+
+  Example:
+
+  ```python
+  # samples has shape [1, 5], where each value is either 0 or 1 with equal
+  # probability.
+  samples = tf.random.stateless_categorical(
+      tf.log([[10., 10.]]), 5, seed=[7, 17])
+  ```
+
+  Args:
+    logits: 2-D Tensor with shape `[batch_size, num_classes]`.  Each slice
+      `[i, :]` represents the unnormalized log-probabilities for all classes.
+    num_samples: 0-D.  Number of independent samples to draw for each row slice.
+    seed: A shape [2] integer Tensor of seeds to the random number generator.
+    dtype: integer type to use for the output. Defaults to int64.
+    name: Optional name for the operation.
+
+  Returns:
+    The drawn samples of shape `[batch_size, num_samples]`.
+  """
+  with ops.name_scope(name, "stateless_categorical", [logits, seed]):
+    return stateless_multinomial_categorical_impl(logits, num_samples, dtype,
+                                                  seed)
+
+
+def stateless_multinomial_categorical_impl(logits, num_samples, dtype, seed):
+  """Implementation for stateless multinomial/categorical ops (v1/v2)."""
+  logits = ops.convert_to_tensor(logits, name="logits")
+  return gen_stateless_random_ops.stateless_multinomial(
+      logits, num_samples, seed, output_dtype=dtype)
diff --git a/tensorflow/python/ops/string_ops.py b/tensorflow/python/ops/string_ops.py
index 25e86cadeb64960032d18cd5637b39764392431f..046459706c0881bd9a3cbd68e4d5553d0547947c 100644
--- a/tensorflow/python/ops/string_ops.py
+++ b/tensorflow/python/ops/string_ops.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_parsing_ops
 from tensorflow.python.ops import gen_string_ops
 from tensorflow.python.ops import math_ops
 
@@ -37,6 +38,7 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops.gen_string_ops import *
 from tensorflow.python.util import compat as util_compat
 from tensorflow.python.util import deprecation
+from tensorflow.python.util import dispatch
 from tensorflow.python.util.tf_export import tf_export
 # pylint: enable=g-bad-import-order
 # pylint: enable=wildcard-import
@@ -44,6 +46,7 @@ from tensorflow.python.util.tf_export import tf_export
 
 # pylint: disable=redefined-builtin
 @tf_export("strings.regex_full_match")
+@dispatch.add_dispatch_support
 def regex_full_match(input, pattern, name=None):
   r"""Match elements of `input` with regex `pattern`.
 
@@ -75,6 +78,7 @@ regex_full_match.__doc__ = gen_string_ops.regex_full_match.__doc__
 @tf_export(
     "strings.regex_replace", v1=["strings.regex_replace", "regex_replace"])
 @deprecation.deprecated_endpoints("regex_replace")
+@dispatch.add_dispatch_support
 def regex_replace(input, pattern, rewrite, replace_global=True, name=None):
   r"""Replace elements of `input` matching regex `pattern` with `rewrite`.
 
@@ -311,7 +315,7 @@ def _reduce_join_reduction_dims(x, axis, reduction_indices):
     return math_ops.range(array_ops.rank(x) - 1, -1, -1)
 
 
-@tf_export("strings.reduce_join", v1=["strings.reduce_join", "reduce_join"])
+@tf_export(v1=["strings.reduce_join", "reduce_join"])
 @deprecation.deprecated_endpoints("reduce_join")
 def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
                 keep_dims=False,
@@ -329,6 +333,17 @@ def reduce_join(inputs, axis=None,  # pylint: disable=missing-docstring
       name=name)
 
 
+@tf_export("strings.reduce_join", v1=[])
+def reduce_join_v2(  # pylint: disable=missing-docstring
+    inputs,
+    axis=None,
+    keepdims=False,
+    separator="",
+    name=None):
+  return reduce_join(
+      inputs, axis, keep_dims=keepdims, separator=separator, name=name)
+
+
 reduce_join.__doc__ = deprecation.rewrite_argument_docstring(
     gen_string_ops.reduce_join.__doc__, "reduction_indices", "axis")
 reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
@@ -337,11 +352,18 @@ reduce_join.__doc__ = reduce_join.__doc__.replace("tf.reduce_join(",
 
 # This wrapper provides backwards compatibility for code that predates the
 # unit argument and that passed 'name' as a positional argument.
-@tf_export("strings.length")
+@tf_export(v1=["strings.length"])
+@dispatch.add_dispatch_support
 def string_length(input, name=None, unit="BYTE"):
   return gen_string_ops.string_length(input, unit=unit, name=name)
 
 
+@tf_export("strings.length", v1=[])
+@dispatch.add_dispatch_support
+def string_length_v2(input, unit="BYTE", name=None):
+  return string_length(input, name, unit)
+
+
 string_length.__doc__ = gen_string_ops.string_length.__doc__
 
 
@@ -353,11 +375,18 @@ def substr_deprecated(input, pos, len, name=None, unit="BYTE"):
 substr_deprecated.__doc__ = gen_string_ops.substr.__doc__
 
 
-@tf_export("strings.substr")
+@tf_export(v1=["strings.substr"])
+@dispatch.add_dispatch_support
 def substr(input, pos, len, name=None, unit="BYTE"):
   return gen_string_ops.substr(input, pos, len, unit=unit, name=name)
 
 
+@tf_export("strings.substr", v1=[])
+@dispatch.add_dispatch_support
+def substr_v2(input, pos, len, unit="BYTE", name=None):
+  return substr(input, pos, len, name=name, unit=unit)
+
+
 substr.__doc__ = gen_string_ops.substr.__doc__
 
 
@@ -371,3 +400,55 @@ ops.NotDifferentiable("StringSplit")
 ops.NotDifferentiable("AsString")
 ops.NotDifferentiable("EncodeBase64")
 ops.NotDifferentiable("DecodeBase64")
+
+
+@tf_export("strings.to_number", v1=[])
+@dispatch.add_dispatch_support
+def string_to_number(input, out_type=dtypes.float32, name=None):
+  r"""Converts each string in the input Tensor to the specified numeric type.
+
+  (Note that int32 overflow results in an error while float overflow
+  results in a rounded value.)
+
+  Args:
+    input: A `Tensor` of type `string`.
+    out_type: An optional `tf.DType` from: `tf.float32, tf.float64, tf.int32,
+      tf.int64`. Defaults to `tf.float32`.
+      The numeric type to interpret each string in `string_tensor` as.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `out_type`.
+  """
+  return gen_parsing_ops.string_to_number(input, out_type, name)
+tf_export(v1=["strings.to_number", "string_to_number"])(
+    gen_parsing_ops.string_to_number
+    )
+
+
+@tf_export("strings.to_hash_bucket", v1=[])
+@dispatch.add_dispatch_support
+def string_to_hash_bucket(input, num_buckets, name=None):
+  # pylint: disable=line-too-long
+  r"""Converts each string in the input Tensor to its hash mod by a number of buckets.
+
+  The hash function is deterministic on the content of the string within the
+  process.
+
+  Note that the hash function may change from time to time.
+  This functionality will be deprecated and it's recommended to use
+  `tf.string_to_hash_bucket_fast()` or `tf.string_to_hash_bucket_strong()`.
+
+  Args:
+    input: A `Tensor` of type `string`.
+    num_buckets: An `int` that is `>= 1`. The number of buckets.
+    name: A name for the operation (optional).
+
+  Returns:
+    A `Tensor` of type `int64`.
+  """
+  # pylint: enable=line-too-long
+  return gen_string_ops.string_to_hash_bucket(input, num_buckets, name)
+tf_export(v1=["strings.to_hash_bucket", "string_to_hash_bucket"])(
+    gen_string_ops.string_to_hash_bucket
+    )
diff --git a/tensorflow/python/ops/summary_op_util.py b/tensorflow/python/ops/summary_op_util.py
index 14aa44a920863b06221c8ba2d65c246979f3301d..93d8d50842ba681688e6d42890445ab4e6879124 100644
--- a/tensorflow/python/ops/summary_op_util.py
+++ b/tensorflow/python/ops/summary_op_util.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 import contextlib
 import re
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_util
 from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import distribution_strategy_context
 
 
 def collect(val, collections, default_collections):
@@ -44,13 +45,27 @@ _INVALID_TAG_CHARACTERS = re.compile(r'[^-/\w\.]')
 
 
 def skip_summary():
-  # If using multiple replicas in distributed strategy, skip summaries on all
-  # replicas except the first one (replica_id=0).
+  """Determines if summary should be skipped.
+
+  If using multiple replicas in distributed strategy, skip summaries on all
+  replicas except the first one (replica_id=0).
+
+  Returns:
+    True if the summary is skipped; False otherwise.
+  """
+
   # TODO(priyag): Add a new optional argument that will provide multiple
   # alternatives to override default behavior. (e.g. run on last replica,
   # compute sum or mean across replicas).
   replica_context = distribution_strategy_context.get_replica_context()
-  return replica_context and replica_context.replica_id > 0
+  if not replica_context:
+    return False
+  # TODO(b/118385803): when replica_id of _TPUReplicaContext is properly
+  # initialized, remember to change here as well.
+  replica_id = replica_context.replica_id_in_sync_group
+  if isinstance(replica_id, ops.Tensor):
+    replica_id = tensor_util.constant_value(replica_id)
+  return replica_id and replica_id > 0
 
 
 def clean_tag(name):
diff --git a/tensorflow/python/ops/summary_ops_v2.py b/tensorflow/python/ops/summary_ops_v2.py
index 18cefb8e1c4f1eb03d2ac746e1864a48c9aec6b8..3f99b9f8773b3d26cf334044e0d127bf7443bfea 100644
--- a/tensorflow/python/ops/summary_ops_v2.py
+++ b/tensorflow/python/ops/summary_ops_v2.py
@@ -40,11 +40,14 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import summary_op_util
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import training_util
+from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
+from tensorflow.python.util.tf_export import tf_export
 
 
-# A global dictionary mapping graph keys to boolean values indicating whether
-# we should record summaries for this particular graph or not.
+# Dictionary mapping graph keys to a boolean Tensor (or callable returning
+# a boolean Tensor) indicating whether we should record summaries for the
+# graph identified by the key of the dictionary.
 _SHOULD_RECORD_SUMMARIES = {}
 
 # A global dictionary mapping graph keys to a list of summary writer init ops.
@@ -59,58 +62,67 @@ def should_record_summaries():
   """Returns boolean Tensor which is true if summaries should be recorded."""
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  return _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  should = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
+  return should() if callable(should) else should
 
 
-# TODO(apassos) consider how to handle local step here.
 @tf_contextlib.contextmanager
-def record_summaries_every_n_global_steps(n, global_step=None):
-  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
-  if global_step is None:
-    global_step = training_util.get_or_create_global_step()
+def _record_summaries(boolean=True):
+  """Sets summary recording on or off per the provided boolean value.
+
+  The provided value can be a python boolean, a scalar boolean Tensor, or
+  or a callable providing such a value; if a callable is passed it will be
+  invoked each time should_record_summaries() is called to determine whether
+  summary writing should be enabled.
+
+  Args:
+    boolean: can be True, False, a bool Tensor, or a callable providing such.
+      Defaults to True.
+
+  Yields:
+    Returns a context manager that sets this value on enter and restores the
+    previous value on exit.
+  """
+  # TODO(nickfelt): make this threadlocal
   global _SHOULD_RECORD_SUMMARIES
   key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
   old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
   try:
-    with ops.device("cpu:0"):
-      _SHOULD_RECORD_SUMMARIES[key] = math_ops.equal(global_step % n, 0)
+    _SHOULD_RECORD_SUMMARIES[key] = boolean
     yield
   finally:
     _SHOULD_RECORD_SUMMARIES[key] = old
 
 
-@tf_contextlib.contextmanager
+# TODO(apassos) consider how to handle local step here.
+def record_summaries_every_n_global_steps(n, global_step=None):
+  """Sets the should_record_summaries Tensor to true if global_step % n == 0."""
+  if global_step is None:
+    global_step = training_util.get_or_create_global_step()
+  with ops.device("cpu:0"):
+    should = lambda: math_ops.equal(global_step % n, 0)
+    if not context.executing_eagerly():
+      should = should()
+  return _record_summaries(should)
+
+
 def always_record_summaries():
   """Sets the should_record_summaries Tensor to always true."""
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
-  try:
-    _SHOULD_RECORD_SUMMARIES[key] = True
-    yield
-  finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+  return _record_summaries(True)
 
 
-@tf_contextlib.contextmanager
 def never_record_summaries():
   """Sets the should_record_summaries Tensor to always false."""
-  global _SHOULD_RECORD_SUMMARIES
-  key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
-  old = _SHOULD_RECORD_SUMMARIES.setdefault(key, False)
-  try:
-    _SHOULD_RECORD_SUMMARIES[key] = False
-    yield
-  finally:
-    _SHOULD_RECORD_SUMMARIES[key] = old
+  return _record_summaries(False)
 
 
+@tf_export("summary.SummaryWriter", v1=[])
 class SummaryWriter(object):
   """Encapsulates a stateful summary writer resource.
 
   See also:
-  - `tf.contrib.summary.create_file_writer`
-  - `tf.contrib.summary.create_db_writer`
+  - `tf.summary.create_file_writer`
+  - `tf.summary.create_db_writer`
   """
 
   def  __init__(self, resource, init_op_fn):
@@ -205,6 +217,7 @@ def initialize(
     session.run(_graph(x, 0), feed_dict={x: data})
 
 
+@tf_export("summary.create_file_writer", v1=[])
 def create_file_writer(logdir,
                        max_queue=None,
                        flush_millis=None,
@@ -280,7 +293,7 @@ def create_db_writer(db_uri,
       `tf.Graph`.
 
   Returns:
-    A `tf.contrib.summary.SummaryWriter` instance.
+    A `tf.summary.SummaryWriter` instance.
   """
   with ops.device("cpu:0"):
     if experiment_name is None:
@@ -329,7 +342,7 @@ def _nothing():
 def all_summary_ops():
   """Graph-mode only. Returns all summary ops.
 
-  Please note this excludes `tf.contrib.summary.graph` ops.
+  Please note this excludes `tf.summary.graph` ops.
 
   Returns:
     The summary ops.
@@ -497,7 +510,7 @@ def graph(param, step=None, name=None):
   """Writes a TensorFlow graph to the summary interface.
 
   The graph summary is, strictly speaking, not a summary. Conditions
-  like `tf.contrib.summary.never_record_summaries` do not apply. Only
+  like `tf.summary.should_record_summaries` do not apply. Only
   a single graph can be associated with a particular run. If multiple
   graphs are written, then only the last one will be considered by
   TensorBoard.
@@ -541,14 +554,13 @@ def graph(param, step=None, name=None):
 _graph = graph  # for functions with a graph parameter
 
 
+@tf_export("summary.import_event", v1=[])
 def import_event(tensor, name=None):
   """Writes a `tf.Event` binary proto.
 
-  When using create_db_writer(), this can be used alongside
-  `tf.TFRecordReader` to load event logs into the database. Please
-  note that this is lower level than the other summary functions and
-  will ignore any conditions set by methods like
-  `tf.contrib.summary.should_record_summaries`.
+  This can be used to import existing event logs into a new summary writer sink.
+  Please note that this is lower level than the other summary functions and
+  will ignore the `tf.summary.should_record_summaries` setting.
 
   Args:
     tensor: A `tf.Tensor` of type `string` containing a serialized
@@ -562,13 +574,14 @@ def import_event(tensor, name=None):
       context.context().summary_writer_resource, tensor, name=name)
 
 
+@tf_export("summary.flush", v1=[])
 def flush(writer=None, name=None):
   """Forces summary writer to send any buffered data to storage.
 
   This operation blocks until that finishes.
 
   Args:
-    writer: The `tf.contrib.summary.SummaryWriter` resource to flush.
+    writer: The `tf.summary.SummaryWriter` resource to flush.
       The thread default will be used if this parameter is None.
       Otherwise a `tf.no_op` is returned.
     name: A name for the operation (optional).
@@ -595,6 +608,8 @@ def eval_dir(model_dir, name=None):
   return os.path.join(model_dir, "eval" if not name else "eval_" + name)
 
 
+@deprecation.deprecated(date=None,
+                        instructions="Renamed to create_file_writer().")
 def create_summary_file_writer(*args, **kwargs):
   """Please use `tf.contrib.summary.create_file_writer`."""
   logging.warning("Deprecation Warning: create_summary_file_writer was renamed "
diff --git a/tensorflow/python/ops/tensor_array_ops.py b/tensorflow/python/ops/tensor_array_ops.py
index f86dfb35276f608c5cb323fe5deceb58733be007..d1516949517f1f5df9291add96756eeacea29f51 100644
--- a/tensorflow/python/ops/tensor_array_ops.py
+++ b/tensorflow/python/ops/tensor_array_ops.py
@@ -20,8 +20,10 @@ from __future__ import division
 from __future__ import print_function
 
 import contextlib
+import os
 import weakref
 
+from tensorflow.python import tf2
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -30,12 +32,18 @@ from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
 from tensorflow.python.framework import tensor_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import gen_control_flow_ops
 from tensorflow.python.ops import gen_data_flow_ops
+from tensorflow.python.ops import list_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.util import tf_should_use
 from tensorflow.python.util.tf_export import tf_export
 
 
+ENABLE_TENSOR_ARRAY_V2 = (
+    tf2.enabled() or os.getenv("TF_ENABLE_TENSOR_ARRAY_V2") is not None)
+
+
 # _GraphTensorArray accesses many of the hidden generated ops, but is in
 # fact built to wrap these methods.
 # pylint: disable=protected-access
@@ -393,6 +401,273 @@ class _GraphTensorArray(object):
     return gen_data_flow_ops.tensor_array_close_v3(
         handle=self._handle, name=name)
 
+
+class _GraphTensorArrayV2(object):
+  """Graph-mode implementation of TensorArray backed by TensorLists.
+
+  The backing tensor of this TensorArray is a TensorList variant tensor which is
+  stored in the `flow`. The `handle` is always none here. The reason we use the
+  `flow` field and not the `handle` field is to ensure backwards compatibility
+  with legacy control flow.
+  """
+
+  def __init__(self,
+               dtype,
+               size=None,
+               dynamic_size=None,
+               clear_after_read=None,
+               tensor_array_name=None,
+               handle=None,
+               flow=None,
+               infer_shape=True,
+               element_shape=None,
+               colocate_with_first_write_call=True,
+               name=None):
+    """Constructs a graph mode TensorArray.
+
+    Args:
+      dtype: (required) data type of the TensorArray.
+      size: (optional) int32 scalar `Tensor`: the size of the TensorArray.
+        Required if flow is not provided.
+      dynamic_size: (optional) Python bool: If true, writes to the TensorArray
+        can grow the TensorArray past its initial size.  Default: False.
+      clear_after_read: (optional) unused. Not supported in TensorLists.
+      tensor_array_name: (optional) unused.
+      handle: (optional) Must always be None.
+      flow: (optional) A variant `Tensor` scalar for a TensorList.
+      infer_shape: (optional, default: True) If True, shape inference is
+        enabled.  In this case, all elements must have the same shape.
+      element_shape: (optional, default: None) A `TensorShape` object specifying
+        the shape constraints of each of the elements of the TensorArray. Need
+        not be fully defined.
+      colocate_with_first_write_call: (optional). unused.
+      name: (optional) A name for the operation.
+
+    Raises:
+      ValueError: if both handle and tensor_array_name are provided.
+      TypeError: if handle is provided but is not a Tensor.
+    """
+    assert handle is None
+    del handle
+    del clear_after_read
+    del tensor_array_name
+    del colocate_with_first_write_call
+
+    del dynamic_size  # TODO(b/117943489): Unused for now.
+
+    if (flow is not None and
+        (not isinstance(flow, ops.Tensor) or flow.dtype != dtypes.variant)):
+      raise TypeError("flow must be a variant tensor")
+    if flow is None and size is None:
+      raise ValueError("Size must be provided if flow is not provided")
+    if flow is not None and size is not None:
+      raise ValueError("Cannot provide both a flow and size "
+                       "at the same time")
+    if flow is not None and element_shape is not None:
+      raise ValueError("Cannot provide both a flow and element_shape "
+                       "at the same time")
+
+    self._dtype = dtype
+
+    # Record the current static shape for the array elements. The element
+    # shape is defined either by `element_shape` or the shape of the tensor
+    # of the first write. If `infer_shape` is true, all writes checks for
+    # shape equality.
+    if element_shape is None:
+      self._infer_shape = infer_shape
+      self._element_shape = []
+    else:
+      self._infer_shape = True
+      self._element_shape = [tensor_shape.TensorShape(element_shape)]
+    with ops.name_scope(name, "TensorArrayV2", [size, flow]) as scope:
+      if flow is None:
+        self._flow = list_ops.tensor_list_reserve(
+            element_shape=element_shape,
+            num_elements=size,
+            element_dtype=dtype,
+            name=scope)
+      else:
+        self._flow = flow
+
+    # For backwards compatibility.
+    self._colocate_with_first_write_call = None
+    self._colocate_with = None
+
+  @property
+  def flow(self):
+    return self._flow
+
+  @property
+  def dtype(self):
+    return self._dtype
+
+  @property
+  def handle(self):
+    # We intentionally do not raise an error so that legacy while_loop does not
+    # complain.
+    return None
+
+  def _merge_element_shape(self, shape):
+    """Changes the element shape of the array given a shape to merge with.
+
+    Args:
+      shape: A `TensorShape` object to merge with.
+
+    Raises:
+      ValueError: if the provided shape is incompatible with the current
+          element shape of the `TensorArray`.
+    """
+
+    if self._element_shape:
+      if not shape.is_compatible_with(self._element_shape[0]):
+        raise ValueError(
+            "Inconsistent shapes: saw %s but expected %s "
+            "(and infer_shape=True)" % (shape, self._element_shape[0]))
+      self._element_shape[0] = self._element_shape[0].merge_with(shape)
+    else:
+      self._element_shape.append(shape)
+
+  def identity(self):
+    """See TensorArray."""
+    flow = array_ops.identity(self._flow)
+    ta = TensorArray(
+        dtype=self._dtype, flow=flow, infer_shape=self._infer_shape)
+    ta._element_shape = self._element_shape
+    return ta
+
+  def grad(self, source, flow=None, name=None):
+    """Not supported."""
+    raise NotImplementedError()
+
+  def read(self, index, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_get_item(
+        input_handle=self._flow,
+        index=index,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape:
+      value.set_shape(self._element_shape[0].dims)
+    return value
+
+  @tf_should_use.should_use_result
+  def write(self, index, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Write", [self._flow, index, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape:
+        self._merge_element_shape(value.shape)
+      flow_out = list_ops.tensor_list_set_item(
+          input_handle=self._flow, index=index, item=value, name=name)
+      ta = TensorArray(dtype=self._dtype, handle=None, flow=flow_out)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      return ta
+
+  def stack(self, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayV2Stack", [self._flow]):
+      value = list_ops.tensor_list_stack(
+          input_handle=self._flow, element_dtype=self._dtype)
+      if self._element_shape and self._element_shape[0].dims is not None:
+        value.set_shape([None] + self._element_shape[0].dims)
+      return value
+
+  def gather(self, indices, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_gather(
+        input_handle=self._flow,
+        indices=indices,
+        element_dtype=self._dtype,
+        name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims)
+    return value
+
+  def concat(self, name=None):
+    """See TensorArray."""
+    value = list_ops.tensor_list_concat(
+        input_handle=self._flow, element_dtype=self._dtype, name=name)
+    if self._element_shape and self._element_shape[0].dims is not None:
+      value.set_shape([None] + self._element_shape[0].dims[1:])
+    return value
+
+  @tf_should_use.should_use_result
+  def unstack(self, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayUnstack", [self._flow, value]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_from_tensor(
+          tensor=value, element_shape=value.shape[1:])
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def scatter(self, indices, value, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArrayScatter",
+                        [self._flow, value, indices]):
+      value = ops.convert_to_tensor(value, name="value")
+      if self._infer_shape and not context.executing_eagerly():
+        self._merge_element_shape(value.shape[1:])
+      flow_out = list_ops.tensor_list_scatter(
+          tensor=value, indices=indices, element_shape=-1)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  @tf_should_use.should_use_result
+  def split(self, value, lengths, name=None):
+    """See TensorArray."""
+    with ops.name_scope(name, "TensorArraySplit", [self._flow, value, lengths]):
+      value = ops.convert_to_tensor(value, name="value")
+      lengths_64 = math_ops.to_int64(lengths)
+      if self._infer_shape and not context.executing_eagerly():
+        clengths = tensor_util.constant_value(lengths_64)
+        if value.shape.dims is not None:
+          if clengths is not None and clengths.max() == clengths.min():
+            self._merge_element_shape(
+                tensor_shape.TensorShape([clengths[0]]).concatenate(
+                    value.shape[1:]))
+      flow_out = list_ops.tensor_list_split(
+          tensor=value,
+          lengths=lengths_64,
+          element_shape=self._element_shape[0] if self._element_shape else None,
+          name=name)
+      ta = TensorArray(
+          dtype=self._dtype,
+          handle=self.handle,
+          flow=flow_out,
+          colocate_with_first_write_call=self._colocate_with_first_write_call)
+      ta._infer_shape = self._infer_shape
+      ta._element_shape = self._element_shape
+      ta._colocate_with = self._colocate_with
+      return ta
+
+  def size(self, name=None):
+    """See TensorArray."""
+    return list_ops.tensor_list_length(input_handle=self._flow, name=name)
+
+  @tf_should_use.should_use_result
+  def close(self, name=None):
+    """See TensorArray."""
+    return gen_control_flow_ops.no_op(name=name)
+
 # pylint: enable=protected-access
 
 
@@ -738,8 +1013,10 @@ class TensorArray(object):
     if context.executing_eagerly():
       implementation = _EagerTensorArray
     else:
-      implementation = _GraphTensorArray
-
+      if ENABLE_TENSOR_ARRAY_V2:
+        implementation = _GraphTensorArrayV2
+      else:
+        implementation = _GraphTensorArray
     self._implementation = implementation(
         dtype,
         size=size,
@@ -768,7 +1045,7 @@ class TensorArray(object):
   @property
   def handle(self):
     """The reference to the TensorArray."""
-    return self._implementation._handle
+    return self._implementation.handle
 
   @property
   def _infer_shape(self):
@@ -953,4 +1230,16 @@ class TensorArray(object):
     """Close the current TensorArray."""
     return self._implementation.close(name=name)
 
+
+def build_ta_with_new_flow(old_ta, flow):
+  ta = TensorArray(
+      dtype=old_ta.dtype,
+      handle=old_ta.handle,
+      flow=flow,
+      infer_shape=old_ta._infer_shape,
+      colocate_with_first_write_call=old_ta._colocate_with_first_write_call)
+  ta._colocate_with = old_ta._colocate_with
+  ta._element_shape = old_ta._element_shape
+  return ta
+
 # pylint: enable=protected-access
diff --git a/tensorflow/python/ops/tensor_forest_ops.py b/tensorflow/python/ops/tensor_forest_ops.py
new file mode 100644
index 0000000000000000000000000000000000000000..842f0c648b12551624fc6306a6fa869392dd4465
--- /dev/null
+++ b/tensorflow/python/ops/tensor_forest_ops.py
@@ -0,0 +1,103 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Ops for tensor_forest."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python import ops
+from tensorflow.python.ops import gen_tensor_forest_ops
+from tensorflow.python.ops import resources
+from tensorflow.python.training import saver
+
+
+class TreeVariableSaveable(saver.BaseSaverBuilder.SaveableObject):
+  """Resource that holds a tree."""
+
+  def __init__(self, type_name, name, container, config, resource_handle_func,
+               create_op_func, is_initialized_op_func, serialize_op_func,
+               deserialize_op_func):
+
+    with ops.name_scope(name, type_name) as name:
+      self._resource_handle = resource_handle_func(
+          container, shared_name=name, name=name)
+
+    self._is_initialized_op = is_initialized_op_func(self._resource_handle)
+    tensor = serialize_op_func(self._resource_handle)
+    self._create_op = create_op_func(self._resource_handle, config)
+    # slice_spec is useful for saving a slice from a variable.
+    # It's not meaningful the tree variable. So we just pass an empty
+    # value.
+    slice_spec = ''
+    specs = [saver.BaseSaverBuilder.SaveSpec(tensor, slice_spec, name)]
+    super(TreeVariableSaveable, self).__init__(self._resource_handle, specs,
+                                               name)
+
+    ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, self)
+
+    resources.register_resource(self._resource_handle, self._create_op,
+                                self._is_initialized_op)
+    self._deserialize_op_func = deserialize_op_func
+
+  def restore(self, restored_tensors, unused_restored_shapes):
+    """Restores the associated tree from 'restored_tensors'.
+
+    Args:
+      restored_tensors: the tensors that were loaded from a checkpoint.
+      unused_restored_shapes: the shapes this object should conform to after
+        restore. Not meaningful for trees.
+
+    Returns:
+      The operation that restores the state of the tree variable.
+    """
+    with ops.control_dependencies([self._create_op]):
+      return self._deserialize_op_func(
+          self._resource_handle,
+          restored_tensors[0],
+      )
+
+  @property
+  def resource(self):
+    return self._resource_handle
+
+
+def tree_variable(tree_config, name, container=None):
+  return TreeVariableSaveable(
+      'TreeVariable', name, container, tree_config,
+      gen_tensor_forest_ops.tensor_forest_tree_resource_handle_op,
+      gen_tensor_forest_ops.tensor_forest_create_tree_variable,
+      gen_tensor_forest_ops.tensor_forest_tree_is_initialized_op,
+      gen_tensor_forest_ops.tensor_forest_tree_serialize,
+      gen_tensor_forest_ops.tensor_forest_tree_deserialize).resource
+
+
+class ForestVariables(object):
+  """Resource that holds all trees from a forest."""
+
+  def __init__(self, params, tree_configs=None):
+
+    self._variables = []
+
+    for i in range(params.n_trees):
+      tree_config = ''
+      if tree_configs is not None:
+        tree_config = tree_configs[i]
+      self._variables.append(tree_variable(
+          tree_config,
+          'tree-%s' % i,
+      ))
+
+  def __getitem__(self, t):
+    return self._variables[t]
diff --git a/tensorflow/python/ops/variable_scope.py b/tensorflow/python/ops/variable_scope.py
index fe93bfb61f7ded83f5d3a04313b41585337adf69..ccce9e2f93bac26a69d8cadab9ece4cc2482c4e1 100644
--- a/tensorflow/python/ops/variable_scope.py
+++ b/tensorflow/python/ops/variable_scope.py
@@ -646,14 +646,8 @@ class _VariableStore(object):
         when violating reuse during variable creation, or if an existing
         sharded variable exists for the given name but with different sharding.
     """
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
-
     initializing_from_value = initializer is not None and isinstance(
         initializer, ops.Tensor)
-    reuse_without_partition = reuse and not partitioner
-
     if name in self._vars:
       raise ValueError(
           "A partitioner was provided, but an unpartitioned version of the "
@@ -664,30 +658,9 @@ class _VariableStore(object):
     if initializing_from_value:
       shape = shape.merge_with(initializer.get_shape())
 
-    if not reuse_without_partition:
-      if not shape.is_fully_defined():
-        raise ValueError("Shape of a new partitioned variable (%s) must be "
-                         "fully defined, but instead was %s." % (name, shape))
-
-      if shape.ndims < 1:
-        raise ValueError("A partitioned Variable must have rank at least 1, "
-                         "shape: %s" % shape)
-
-      partitions = partitioner(shape=shape, dtype=dtype)
-
-      if not isinstance(partitions, collections_lib.Sequence):
-        raise ValueError("Partitioner must return a sequence, but saw: %s"
-                         % partitions)
-
-      if len(partitions) != shape.ndims:
-        raise ValueError(
-            "Partitioner returned a partition list that does not match the "
-            "Variable's rank: %s vs. %s" % (partitions, shape))
-
-      if any([p < 1 for p in partitions]):
-        raise ValueError(
-            "Partitioner returned zero partitions for some axes: %s" %
-            partitions)
+    partitions = None
+    if not reuse or partitioner:
+      partitions = _call_partitioner(partitioner, shape, dtype)
 
     if name in self._partitioned_vars:
       if reuse is False:
@@ -709,7 +682,7 @@ class _VariableStore(object):
             % (name, dtype.name, existing_var.dtype.name))
 
       # pylint: disable=protected-access
-      if (not reuse_without_partition and
+      if (partitions is not None and
           existing_var._get_partitions() != partitions):
         raise ValueError(
             "Trying to reuse partitioned variable %s, but specified partitions "
@@ -724,14 +697,7 @@ class _VariableStore(object):
                        "created with tf.get_variable(). Did you mean to set "
                        "reuse=False or reuse=tf.AUTO_REUSE in VarScope?" % name)
 
-    slice_dim, slice_shape = _compute_slice_dim_and_shape(
-        shape.as_list(), partitions)
-
-    vs = []
-    num_slices = partitions[slice_dim]
-    num_slices_with_excess = shape.dims[slice_dim].value % num_slices
-
-    slice_offset = [0] * shape.ndims
+    slice_dim, num_slices = _get_slice_dim_and_num_slices(partitions)
 
     if "%s/part_0" % name in self._vars:
       if "%s/part_%d" % (name, num_slices - 1) not in self._vars:
@@ -747,15 +713,14 @@ class _VariableStore(object):
             "%s/part_0 was found, but so was the extra shard %s/part_%d."
             % (num_slices, name, name, num_slices))
 
-    for i in xrange(num_slices):
-      var_shape = slice_shape[:]
-      var_offset = slice_offset[:]
+    vs = []
+    for i, (var_offset, var_shape) in enumerate(_iter_slices(
+        shape.as_list(),
+        num_slices,
+        slice_dim
+    )):
       partition_info = _PartitionInfo(
           full_shape=shape.as_list(), var_offset=var_offset)
-      if i < num_slices_with_excess:
-        var_shape[slice_dim] += 1
-      slice_offset[slice_dim] += var_shape[slice_dim]
-
       var_full_name = "%s/part_%d" % (name, i)
       with ops.name_scope(var_full_name + "/PartitionedInitializer"):
         # Create the tensor to initialize the variable with default value.
@@ -803,15 +768,13 @@ class _VariableStore(object):
       vs.append(var)
       # pylint: enable=protected-access
 
-      # pylint: disable=protected-access
     partitioned_var = variables.PartitionedVariable(name=name,
                                                     shape=shape,
                                                     dtype=dtype,
                                                     variable_list=vs,
                                                     partitions=partitions)
-    # pylint: enable=protected-access
-
-    self._partitioned_vars[name] = partitioned_var
+    if not context.executing_eagerly() or self._store_eager_variables:
+      self._partitioned_vars[name] = partitioned_var
     return partitioned_var
 
   def _get_single_variable(self,
@@ -913,20 +876,22 @@ class _VariableStore(object):
         variable_dtype = None
       else:
         # Instantiate initializer if provided initializer is a type object.
-        if isinstance(initializer, type(init_ops.Initializer)):
+        if tf_inspect.isclass(initializer):
           initializer = initializer(dtype=dtype)
-        if shape and shape.is_fully_defined():
+        if shape is not None and shape.is_fully_defined():
           init_val = lambda: initializer(  # pylint: disable=g-long-lambda
               shape.as_list(), dtype=dtype, partition_info=partition_info)
-        elif not tf_inspect.getargspec(initializer).args:
+          variable_dtype = dtype.base_dtype
+        elif len(tf_inspect.getargspec(initializer).args) == len(
+            tf_inspect.getargspec(initializer).defaults or []):
           init_val = initializer
+          variable_dtype = None
         else:
-          raise ValueError("You can only pass an initializer function that "
-                           "expects no arguments to its callable when the "
-                           "shape is not fully defined. The given initializer "
-                           "function expects the following args %s" %
-                           tf_inspect.getargspec(initializer).args)
-        variable_dtype = dtype.base_dtype
+          raise ValueError("The initializer passed is not valid. It should "
+                           "be a callable with no arguments and the "
+                           "shape should not be provided or an instance of "
+                           "`tf.keras.initializers.*' and `shape` should be "
+                           "fully defined.")
 
     # Create the variable.
     if use_resource is None:
@@ -1080,9 +1045,6 @@ class VariableScope(object):
       if self._caching_device is not None:
         raise NotImplementedError("Caching devices is not yet supported "
                                   "when eager execution is enabled.")
-      if self._partitioner is not None:
-        raise NotImplementedError("Partitioned variables are not yet supported "
-                                  "when eager execution is enabled.")
       self._reuse = AUTO_REUSE
       self._use_resource = True
 
@@ -1162,9 +1124,6 @@ class VariableScope(object):
 
   def set_partitioner(self, partitioner):
     """Set partitioner for this scope."""
-    if partitioner and context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     self._partitioner = partitioner
 
   def set_custom_getter(self, custom_getter):
@@ -1277,9 +1236,6 @@ class VariableScope(object):
                                 synchronization=VariableSynchronization.AUTO,
                                 aggregation=VariableAggregation.NONE):
     """Gets an existing variable with this name or create a new one."""
-    if context.executing_eagerly():
-      raise NotImplementedError("Partitioned variables are not yet supported "
-                                "when eager execution is enabled.")
     if initializer is None:
       initializer = self._initializer
     if regularizer is None:
@@ -2246,8 +2202,8 @@ class variable_scope(object):
 
     try:
       return self._enter_scope_uncached()
-    except:
-      if not self._building_function:
+    except Exception:
+      if self._in_graph_mode and not self._building_function:
         if self._graph_context_manager is not None:
           self._graph_context_manager.__exit__(*sys.exc_info())
       raise
@@ -2413,34 +2369,71 @@ def variable_op_scope(values,
     yield scope
 
 
-def _compute_slice_dim_and_shape(full_shape, slicing):
-  """Computes which dimension is being sliced and the typical slice shape."""
-
-  slice_shape = [0] * len(full_shape)
-  slice_dim = None
-  for dim, num_slices in enumerate(slicing):
-    dim_size = full_shape[dim]
-    if num_slices <= 0 or dim_size < num_slices:
-      raise ValueError("Cannot create %d slices for size %d. shape: %s, "
-                       "slicing: %s" %
-                       (num_slices, full_shape[dim], full_shape, slicing))
-    if num_slices == 1:
-      # Not slicing in this dimension.
-      slice_shape[dim] = dim_size
-    elif slice_dim is not None:
-      # We only support slicing along one of the dimensions.
-      raise ValueError("Can only slice a variable along one dimension: "
-                       "shape: %s, slicing: %s" % (full_shape, slicing))
-    else:
-      # Note: We will add any extras onto the last slice, later.
-      slice_dim = dim
-      slice_shape[dim] = dim_size // num_slices
+def _call_partitioner(partitioner, shape, dtype):
+  """Call partitioner validating its inputs/output.
 
-  # Degenerate case: If "slicing" was all ones, pretend we are slicing along
-  # the first dimension.
-  if slice_dim is None:
+  Args:
+    partitioner: a function mapping `Tensor` shape and dtype to a
+        list of partitions.
+    shape: shape of the `Tensor` to partition, must have at least two
+        dimensions.
+    dtype: dtype of the elements in the `Tensor`.
+
+  Returns:
+    A list with elements >=1 and exactly one >1. The index of that
+    element corresponds to the partitioning axis.
+  """
+  if not shape.is_fully_defined():
+    raise ValueError("Shape of a new partitioned variable must be "
+                     "fully defined, but instead was %s." % (shape,))
+  if shape.ndims < 1:
+    raise ValueError("A partitioned Variable must have rank at least 1, "
+                     "shape: %s" % shape)
+
+  slicing = partitioner(shape=shape, dtype=dtype)
+  if not isinstance(slicing, collections_lib.Sequence):
+    raise ValueError("Partitioner must return a sequence, but saw: %s"
+                     % slicing)
+  if len(slicing) != shape.ndims:
+    raise ValueError(
+        "Partitioner returned a partition list that does not match the "
+        "Variable's rank: %s vs. %s" % (slicing, shape))
+  if any(p < 1 for p in slicing):
+    raise ValueError(
+        "Partitioner returned zero partitions for some axes: %s" %
+        slicing)
+  if sum(p > 1 for p in slicing) > 1:
+    raise ValueError(
+        "Can only slice a variable along one dimension: "
+        "shape: %s, partitioning: %s" % (shape, slicing))
+  return slicing
+
+
+# TODO(slebedev): could be inlined, but
+# `_VariableStore._get_partitioned_variable` is too complex even
+# without this logic.
+def _get_slice_dim_and_num_slices(slicing):
+  """Get slicing dimension and number of slices from the partitioner output."""
+  for slice_dim, num_slices in enumerate(slicing):
+    if num_slices > 1:
+      break
+  else:
+    # Degenerate case: no partitioning applied.
     slice_dim = 0
-  return slice_dim, slice_shape
+    num_slices = 1
+  return slice_dim, num_slices
+
+
+def _iter_slices(full_shape, num_slices, slice_dim):
+  """Slices a given a shape along the specified dimension."""
+  num_slices_with_excess = full_shape[slice_dim] % num_slices
+  offset = [0] * len(full_shape)
+  min_slice_len = full_shape[slice_dim] // num_slices
+  for i in xrange(num_slices):
+    shape = full_shape[:]
+    shape[slice_dim] = min_slice_len + bool(i < num_slices_with_excess)
+    yield offset[:], shape
+    offset[slice_dim] += shape[slice_dim]
 
 
 def _get_trainable_value(synchronization, trainable):
diff --git a/tensorflow/python/ops/variables.py b/tensorflow/python/ops/variables.py
index e43736069e38a69e26a1dae3c393ceca0eb94f71..a31ce655183f8fb7e6331c2d6a4b3af8076902c8 100644
--- a/tensorflow/python/ops/variables.py
+++ b/tensorflow/python/ops/variables.py
@@ -18,7 +18,8 @@ from __future__ import division
 from __future__ import print_function
 
 import enum  # pylint: disable=g-bad-import-order
-
+import functools
+import os
 import six
 
 from tensorflow.core.framework import attr_value_pb2
@@ -636,37 +637,84 @@ class Variable(six.with_metaclass(VariableMetaclass,
     """
     raise NotImplementedError
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    raise NotImplementedError
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    Assuming the variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_sub(indices, updates)
+        op = v.scatter_nd_sub(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, -9, 3, -6, -6, 6, 7, -4]
 
@@ -690,34 +738,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_add(self, indices, updates, name=None):
     """Applies sparse addition to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        add = ref.scatter_nd_add(indices, updates)
+        add = v.scatter_nd_add(indices, updates)
         with tf.Session() as sess:
           print sess.run(add)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 13, 3, 14, 14, 6, 7, 20]
 
@@ -741,34 +789,34 @@ class Variable(six.with_metaclass(VariableMetaclass,
   def scatter_nd_update(self, indices, updates, name=None):
     """Applies sparse assignment to individual values or slices in a Variable.
 
-    `ref` is a `Tensor` with rank `P` and `indices` is a `Tensor` of rank `Q`.
+    The Variable has rank `P` and `indices` is a `Tensor` of rank `Q`.
 
-    `indices` must be integer tensor, containing indices into `ref`.
+    `indices` must be integer tensor, containing indices into self.
     It must be shape `[d_0, ..., d_{Q-2}, K]` where `0 < K <= P`.
 
     The innermost dimension of `indices` (with length `K`) corresponds to
     indices into elements (if `K = P`) or slices (if `K < P`) along the `K`th
-    dimension of `ref`.
+    dimension of self.
 
     `updates` is `Tensor` of rank `Q-1+P-K` with shape:
 
     ```
-    [d_0, ..., d_{Q-2}, ref.shape[K], ..., ref.shape[P-1]].
+    [d_0, ..., d_{Q-2}, self.shape[K], ..., self.shape[P-1]].
     ```
 
     For example, say we want to add 4 scattered elements to a rank-1 tensor to
     8 elements. In Python, that update would look like this:
 
     ```python
-        ref = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
+        v = tf.Variable([1, 2, 3, 4, 5, 6, 7, 8])
         indices = tf.constant([[4], [3], [1] ,[7]])
         updates = tf.constant([9, 10, 11, 12])
-        op = ref.scatter_nd_assign(indices, updates)
+        op = v.scatter_nd_assign(indices, updates)
         with tf.Session() as sess:
           print sess.run(op)
     ```
 
-    The resulting update to ref would look like this:
+    The resulting update to v would look like this:
 
         [1, 11, 3, 10, 9, 6, 7, 12]
 
@@ -860,18 +908,18 @@ class Variable(six.with_metaclass(VariableMetaclass,
     else:
       return v.value()
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
+  @classmethod
+  def _OverloadAllOperators(cls):  # pylint: disable=invalid-name
     """Register overloads for all operators."""
     for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      Variable._OverloadOperator(operator)
+      cls._OverloadOperator(operator)
     # For slicing, bind getitem differently than a tensor (use SliceHelperVar
     # instead)
     # pylint: disable=protected-access
-    setattr(Variable, "__getitem__", array_ops._SliceHelperVar)
+    setattr(cls, "__getitem__", array_ops._SliceHelperVar)
 
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
+  @classmethod
+  def _OverloadOperator(cls, operator):  # pylint: disable=invalid-name
     """Defer an operator overload to `ops.Tensor`.
 
     We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
@@ -879,17 +927,26 @@ class Variable(six.with_metaclass(VariableMetaclass,
     Args:
       operator: string. The operator name.
     """
+    tensor_oper = getattr(ops.Tensor, operator)
 
-    def _run_op(a, *args):
+    def _run_op(a, *args, **kwargs):
       # pylint: disable=protected-access
-      return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = getattr(ops.Tensor, operator).__doc__
-    except AttributeError:
-      pass
+      return tensor_oper(a._AsTensor(), *args, **kwargs)
+
+    functools.update_wrapper(_run_op, tensor_oper)
+    setattr(cls, operator, _run_op)
+
+  def __iter__(self):
+    """Dummy method to prevent iteration. Do not call.
+
+    NOTE(mrry): If we register __getitem__ as an overloaded operator,
+    Python will valiantly attempt to iterate over the variable's Tensor from 0
+    to infinity.  Declaring this method prevents this unintended behavior.
 
-    setattr(Variable, operator, _run_op)
+    Raises:
+      TypeError: when invoked.
+    """
+    raise TypeError("'Variable' object is not iterable.")
 
   # NOTE(mrry): This enables the Variable's overloaded "right" binary
   # operators to run when the left operand is an ndarray, because it
@@ -1045,27 +1102,6 @@ class Variable(six.with_metaclass(VariableMetaclass,
       else:
         return None
 
-  def __iadd__(self, other):
-    raise NotImplementedError
-
-  def __isub__(self, other):
-    raise NotImplementedError
-
-  def __imul__(self, other):
-    raise NotImplementedError
-
-  def __idiv__(self, other):
-    raise NotImplementedError
-
-  def __itruediv__(self, other):
-    raise NotImplementedError
-
-  def __irealdiv__(self, other):
-    raise NotImplementedError
-
-  def __ipow__(self, other):
-    raise NotImplementedError
-
 
 @tf_export(v1=["Variable"])
 class VariableV1(Variable):
@@ -1576,18 +1612,6 @@ class RefVariable(VariableV1):
     """
     return self._snapshot
 
-  def __iter__(self):
-    """Dummy method to prevent iteration. Do not call.
-
-    NOTE(mrry): If we register __getitem__ as an overloaded operator,
-    Python will valiantly attempt to iterate over the variable's Tensor from 0
-    to infinity.  Declaring this method prevents this unintended behavior.
-
-    Raises:
-      TypeError: when invoked.
-    """
-    raise TypeError("'Variable' object is not iterable.")
-
   def value(self):
     """Returns the last snapshot of this variable.
 
@@ -1865,6 +1889,55 @@ class RefVariable(VariableV1):
         use_locking=use_locking,
         name=name)
 
+  def batch_scatter_update(self, sparse_delta, use_locking=False, name=None):
+    """Assigns `IndexedSlices` to this variable batch-wise.
+
+    Analogous to `batch_gather`. This assumes that this variable and the
+    sparse_delta IndexedSlices have a series of leading dimensions that are the
+    same for all of them, and the updates are performed on the last dimension of
+    indices. In other words, the dimensions should be the following:
+
+    `num_prefix_dims = sparse_delta.indices.ndims - 1`
+    `batch_dim = num_prefix_dims + 1`
+    `sparse_delta.updates.shape = sparse_delta.indices.shape + var.shape[
+         batch_dim:]`
+
+    where
+
+    `sparse_delta.updates.shape[:num_prefix_dims]`
+    `== sparse_delta.indices.shape[:num_prefix_dims]`
+    `== var.shape[:num_prefix_dims]`
+
+    And the operation performed can be expressed as:
+
+    `var[i_1, ..., i_n,
+         sparse_delta.indices[i_1, ..., i_n, j]] = sparse_delta.updates[
+            i_1, ..., i_n, j]`
+
+    When sparse_delta.indices is a 1D tensor, this operation is equivalent to
+    `scatter_update`.
+
+    To avoid this operation one can looping over the first `ndims` of the
+    variable and using `scatter_update` on the subtensors that result of slicing
+    the first dimension. This is a valid option for `ndims = 1`, but less
+    efficient than this implementation.
+
+    Args:
+      sparse_delta: `IndexedSlices` to be assigned to this variable.
+      use_locking: If `True`, use locking during the operation.
+      name: the name of the operation.
+
+    Returns:
+      A `Tensor` that will hold the new value of this variable after
+      the scattered subtraction has completed.
+
+    Raises:
+      ValueError: if `sparse_delta` is not an `IndexedSlices`.
+    """
+    return state_ops.batch_scatter_update(
+        self, sparse_delta.indices, sparse_delta.values,
+        use_locking=use_locking, name=name)
+
   def scatter_nd_sub(self, indices, updates, name=None):
     """Applies sparse subtraction to individual values or slices in a Variable.
 
@@ -2123,37 +2196,6 @@ class RefVariable(VariableV1):
     else:
       return v.value()
 
-  @staticmethod
-  def _OverloadAllOperators():  # pylint: disable=invalid-name
-    """Register overloads for all operators."""
-    for operator in ops.Tensor.OVERLOADABLE_OPERATORS:
-      Variable._OverloadOperator(operator)  # pylint: disable=protected-access
-    # For slicing, bind getitem differently than a tensor (use SliceHelperVar
-    # instead)
-    # pylint: disable=protected-access
-    setattr(Variable, "__getitem__", array_ops._SliceHelperVar)
-
-  @staticmethod
-  def _OverloadOperator(operator):  # pylint: disable=invalid-name
-    """Defer an operator overload to `ops.Tensor`.
-
-    We pull the operator out of ops.Tensor dynamically to avoid ordering issues.
-
-    Args:
-      operator: string. The operator name.
-    """
-
-    def _run_op(a, *args):
-      # pylint: disable=protected-access
-      return getattr(ops.Tensor, operator)(a._AsTensor(), *args)
-    # Propagate __doc__ to wrapper
-    try:
-      _run_op.__doc__ = getattr(ops.Tensor, operator).__doc__
-    except AttributeError:
-      pass
-
-    setattr(Variable, operator, _run_op)
-
   def _gather_saveables_for_checkpoint(self):
     """For implementing `Checkpointable`. This object is saveable on its own."""
     return {checkpointable.VARIABLE_VALUE_KEY: self}
@@ -2457,34 +2499,6 @@ class PartitionedVariable(object):
   @end_compatibility
   """
 
-  class PartitionedVariableIterator(object):
-    """An iterator that allows accessing the underlying `Variable` objects.
-
-    This iterator is necessary to control order of access when Variables
-    are not partitioned in a standard way along a single axis.
-
-    Allows e.g. `list(partitioned_variable)` to return a proper list.
-    """
-
-    def __init__(self, partitioned_variable):
-      self._ix = 0
-      self._partitioned_variable = partitioned_variable
-
-    def __iter__(self):
-      return self
-
-    def __next__(self):  # For python3 compatibility.
-      return self.next()
-
-    def next(self):
-      # pylint: disable=protected-access
-      if self._ix >= len(self._partitioned_variable._variable_list):
-        raise StopIteration()
-      variable = self._partitioned_variable._variable_list[self._ix]
-      # pylint: enable=protected-access
-      self._ix += 1
-      return variable
-
   def __init__(self, name, shape, dtype, variable_list, partitions):
     """Creates a new partitioned variable wrapper.
 
@@ -2504,31 +2518,27 @@ class PartitionedVariable(object):
         `partitions` is not a list.
       ValueError: If `variable_list` is empty, or the `Variable` shape
         information does not match `shape`, or `partitions` has invalid values.
-      RuntimeError: If eager execution is enabled
     """
-    if context.executing_eagerly():
-      raise RuntimeError(
-          "tf.PartitionedVariable not supported with eager execution enabled.")
     if not isinstance(variable_list, (list, tuple)):
       raise TypeError(
           "variable_list is not a list or tuple: %s" % variable_list)
     if not isinstance(partitions, (list, tuple)):
       raise TypeError("partitions is not a list or tuple: %s" % partitions)
-    if not all([p >= 1 for p in partitions]):
+    if not all(p >= 1 for p in partitions):
       raise ValueError("partition values must be positive: %s" % partitions)
     if not variable_list:
       raise ValueError("variable_list may not be empty")
     # pylint: disable=protected-access
     for v in variable_list:
       # Sort the variable_list lexicographically according to var offset value.
-      if not all([v._get_save_slice_info() is not None for v in variable_list]):
+      if not all(v._get_save_slice_info() is not None for v in variable_list):
         raise ValueError(
             "All variables must have a save_slice_info available: %s"
             % [v.name for v in variable_list])
       if len(shape) != len(partitions):
         raise ValueError("len(shape) != len(partitions): %s vs. %s"
                          % (shape, partitions))
-      if not all([v._get_save_slice_info().full_shape == shape]):
+      if v._get_save_slice_info().full_shape != shape:
         raise ValueError(
             "All variables' full shapes must match shape: %s; "
             "but full shapes were: %s"
@@ -2545,7 +2555,7 @@ class PartitionedVariable(object):
 
   def __iter__(self):
     """Return an iterable for accessing the underlying partition Variables."""
-    return self.PartitionedVariableIterator(self)
+    return iter(self._variable_list)
 
   def __len__(self):
     num_partition_axes = len(self._partition_axes())
@@ -2555,7 +2565,7 @@ class PartitionedVariable(object):
     return len(self._variable_list)
 
   def _partition_axes(self):
-    if all([p == 1 for p in self._partitions]):
+    if all(p == 1 for p in self._partitions):
       return [0]
     else:
       return [i for i, p in enumerate(self._partitions) if p > 1]
@@ -2995,7 +3005,9 @@ def report_uninitialized_variables(var_list=None,
     # Run all operations on CPU
     if var_list:
       init_vars = [state_ops.is_variable_initialized(v) for v in var_list]
-    with ops.device("/cpu:0"):
+    local_device = os.environ.get(
+        "TF_DEVICE_FOR_UNINITIALIZED_VARIABLE_REPORTING", "/cpu:0")
+    with ops.device(local_device):
       if not var_list:
         # Return an empty tensor so we only need to check for returned tensor
         # size being 0 as an indication of model ready.
diff --git a/tensorflow/python/ops/while_v2.py b/tensorflow/python/ops/while_v2.py
index 7b0f0ed4fc85828bae4c63ebc43d3bee944c49fb..d00c158d156b225553b52437324accd019c76aee 100644
--- a/tensorflow/python/ops/while_v2.py
+++ b/tensorflow/python/ops/while_v2.py
@@ -65,7 +65,8 @@ def while_loop(cond,
                loop_vars,
                shape_invariants=None,
                maximum_iterations=None,
-               name=None):
+               name=None,
+               return_same_structure=True):
   """Like tf.while_loop, except emits a single While op."""
   maximum_iterations = _validate_and_convert_to_tensor(maximum_iterations)
   # Keep the original loop_vars around to know which args were TensorArrays.
@@ -100,7 +101,8 @@ def while_loop(cond,
     # Add loop counter needed for computing gradients.
     loop_vars = [loop_counter] + loop_vars
 
-    shape_invariants = [tensor_shape.scalar()] + shape_invariants
+    shape_invariants = type(shape_invariants)([tensor_shape.scalar()
+                                              ]) + shape_invariants
 
     # Automatic control dependencies are added in defuns, but not in v1
     # graphs. Propagate that behavior here.
@@ -133,9 +135,8 @@ def while_loop(cond,
     # the value of that tensor in each iteration is the same as it was at the
     # beginning of the loop execution.
     loop_vars = loop_vars + cond_graph.external_captures
-    shape_invariants = shape_invariants + [
-        t.shape for t in cond_graph.external_captures
-    ]
+    shape_invariants = shape_invariants + type(shape_invariants)(
+        [t.shape for t in cond_graph.external_captures])
 
     def wrapped_body(loop_counter, *args):
       """Loop body augmented with counter update.
@@ -200,31 +201,6 @@ def while_loop(cond,
             " this as a loop variable." % str(external_capture))
         cond_graph.capture(external_capture)
 
-    # Export all tensors in the loop body that may be needed for gradient
-    # computation. We do this by accumulating the intermediate values in
-    # TensorLists.
-    intermediate_tensors = _get_intermediates(body_graph)
-
-    for intermediate_tensor in intermediate_tensors:
-      tensor_list = list_ops.empty_tensor_list(
-          element_dtype=intermediate_tensor.dtype,
-          element_shape=_get_tensor_convertible_shape(
-              intermediate_tensor.shape),
-          max_num_elements=maximum_iterations)
-      loop_vars.append(tensor_list)
-      with cond_graph.as_default():
-        # Add a placeholder to cond_graph's inputs corresponding to the
-        # tensor_list.
-        cond_graph.capture(tensor_list)
-      with body_graph.as_default():
-        # Push the intermediate tensor to the tensor list. This captures the
-        # `tensor_list` as well.
-        appended_tensor_list = list_ops.tensor_list_push_back(
-            tensor_list,
-            intermediate_tensor)
-        # Add this modified tensor list to the list of outputs.
-        body_graph.outputs.append(appended_tensor_list)
-
     # Make sure that the shapes of the loop outputs are compatible with the
     # shape invariants, or the shapes of the loop vars if the invariants are not
     # specified.
@@ -245,7 +221,7 @@ def while_loop(cond,
         name=scope)
 
     _copy_handle_data(body_graph.outputs, outputs)
-    _maybe_set_lowering_attr(outputs[0].op)
+    util.maybe_set_lowering_attr(outputs[0].op)
     _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
 
     # Return identities for each output of the While op, rather than the output
@@ -257,17 +233,29 @@ def while_loop(cond,
     outputs = tuple(array_ops.identity(t) for t in outputs)
 
   # First var is loop counter.
-  if num_flattened_outputs == 1:
-    return outputs[1]
+  outputs = _pack_sequence_as(orig_loop_vars,
+                              outputs[1:1 + num_flattened_outputs])
+
+  if return_same_structure:
+    return outputs
+
+  flattened_outputs = nest.flatten(outputs)
+  if len(flattened_outputs) == 1:
+    return flattened_outputs[0]
   else:
-    return _pack_sequence_as(orig_loop_vars,
-                             outputs[1:1 + num_flattened_outputs])
+    return outputs
 
 
 @ops.RegisterGradient("While")
 def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   """The gradient of a While op produced by while_loop."""
-  body_graph = _get_body_graph(op)
+  cond_graph = _get_graph(op, "cond")
+  body_graph = _get_graph(op, "body")
+  orig_num_params = len(body_graph.outputs)
+
+  maximum_iterations = op.get_attr(
+      "_maximum_iterations") if _is_in_xla_context() else None
+  assert not _is_in_xla_context() or maximum_iterations is not None
 
   # Set the incoming gradient of TensorArray handles to None. The gradient
   # implementation currently assumes all resource tensors correspond to float32
@@ -285,15 +273,13 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   # TODO(b/118712257): Remove the IsTrainable filter once we can handle None
   # output grads in _grad_fn.
   grads = [
-      None if _is_tensor_array_handle(output) or
-      not gradients_impl.IsTrainable(output) else grad
-      for grad, output in zip(grads, op.outputs)
+      None if _is_tensor_array_handle(output) or not _is_trainable(output)
+      else grad for grad, output in zip(grads, body_graph.outputs)
   ]
 
   # Ensure that all non-resource trainable outputs have incoming gradients.
-  assert all(g is not None or o.dtype == dtypes.resource or
-             not gradients_impl.IsTrainable(o)
-             for o, g in zip(op.outputs, grads)
+  assert all(g is not None or o.dtype == dtypes.resource or not _is_trainable(o)
+             for o, g in zip(body_graph.outputs, grads)
             ), "All trainable loop vars must receive incoming gradients."
   # We compute the gradient for the sub-graph between trainable ys and xs
   # with non-None incoming gradients. We later pad the None's to the list of
@@ -302,32 +288,36 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       body_graph.outputs, body_graph.inputs, grads) if grad is not None])
 
   body_grad_graph, args = _create_grad_func(
-      ys, xs, non_none_grads, body_graph,
-      util.unique_grad_fn_name(body_graph.name), op)
+      ys, xs, non_none_grads, cond_graph, body_graph,
+      util.unique_grad_fn_name(body_graph.name), op, maximum_iterations)
 
-  intermediate_tensors = _get_intermediates(body_grad_graph)
+  if body_grad_graph.while_op_needs_rewrite:
+    # Modify 'op' to output the intermediate accumulators needed by the grad
+    # function.
+    # NOTE(skyewm): if there are any active sessions, this modification to `op`
+    # may make them unrunnable!
 
-  maximum_iterations = op.get_attr(
-      "_maximum_iterations") if _is_in_xla_context() else None
-  assert not _is_in_xla_context() or maximum_iterations is not None
-  for intermediate_tensor in intermediate_tensors:
-    tensor_list = list_ops.empty_tensor_list(
-        element_dtype=intermediate_tensor.dtype,
-        element_shape=_get_tensor_convertible_shape(intermediate_tensor.shape),
-        max_num_elements=maximum_iterations)
-
-    with body_grad_graph.as_default():
-      tensor_list_ph = body_grad_graph.capture(tensor_list, whitelisted=True)
-      # Push the intermediate tensor to the tensor list.
-      appended_tensor_list = list_ops.tensor_list_push_back(tensor_list_ph,
-                                                            intermediate_tensor)
-      # Add this modified tensor list to the list of outputs.
-      body_grad_graph.outputs.append(appended_tensor_list)
+    cond_graph.name += "_rewritten"
+    body_graph.name += "_rewritten"
+
+    new_inputs = body_grad_graph.empty_tensor_lists
+    new_outputs = body_graph.outputs[orig_num_params:]
+
+    op._set_func_attr("cond", util.create_new_tf_function(cond_graph))
+    op._set_func_attr("body", util.create_new_tf_function(body_graph))
+    op._set_type_list_attr("T", body_graph.output_types)
+    op._set_shape_list_attr("output_shapes", body_graph.output_shapes)
+    op._add_while_inputs(new_inputs)
+    op._add_outputs([t.dtype for t in new_outputs],
+                    [t.shape for t in new_outputs])
+    _copy_handle_data(new_outputs, op.outputs[orig_num_params:])
+
+  captured_inputs = _resolve_grad_captures(body_graph, body_grad_graph, op)
+  loop_vars = args + captured_inputs
 
   def grad_cond(counter, max_iters, *unused_args):
     return counter < max_iters
 
-  loop_vars = args + body_grad_graph.external_captures
   grad_cond_name = util.unique_grad_fn_name(op.get_attr("cond").name)
   cond_grad_graph = func_graph_module.func_graph_from_py_func(
       grad_cond_name, grad_cond, loop_vars, {},
@@ -343,9 +333,12 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
       name="%s_grad" % op.name)
 
   _copy_handle_data(body_grad_graph.outputs, outputs)
-  _maybe_set_lowering_attr(outputs[0].op)
+  util.maybe_set_lowering_attr(outputs[0].op)
   _maybe_set_maximum_iterations_attr(outputs[0].op, maximum_iterations)
 
+  # See comment in while_loop.
+  outputs = [array_ops.identity(t) for t in outputs]
+
   # Set None as the output gradient for tensors with None input gradient
   # e.g. TensorArray handles.
   # outputs[0] is the loop counter.
@@ -361,6 +354,24 @@ def _WhileGrad(op, *grads):  # pylint: disable=invalid-name
   return none_padded_outputs
 
 
+def _is_trainable(tensor):
+  """Returns whether the given tensor is trainable."""
+  if not gradients_impl.IsTrainable(tensor):
+    return False
+
+  # Special case: untrainable accumulator output. The gradients algorithm
+  # doesn't know about tensor lists of untrainable elements. In theory the
+  # tensor list gradient functions should return None as appropriate, but
+  # because we can't return None from the gradient function we filter out
+  # untrainable accumulator output here to avoid computing the gradient at all.
+  if tensor.op.type == "TensorListPopBack" and tensor.value_index == 0:
+    assert tensor.dtype == dtypes.variant
+    element_type = tensor.op.get_attr("element_dtype")
+    return gradients_impl.IsTrainable(element_type)
+
+  return True
+
+
 def _validate_and_convert_to_tensor(maximum_iterations):
   """Checks that `maximum_iterations` is valid.
 
@@ -402,20 +413,21 @@ def _validate_and_convert_to_tensor(maximum_iterations):
 
 
 # TODO(srbs): Pull this into common utils for cond_v2 and while_v2.
-def _get_body_graph(while_op):
-  """Returns `FuncGraph` for the while body.
+def _get_graph(while_op, func_attr_name):
+  """Returns `FuncGraph` for the given function attribute.
 
   Args:
     while_op: The While Operation.
+    func_attr_name: string
 
   Returns:
-    `FuncGraph` for the while body.
+    `FuncGraph`
   """
   # TODO(srbs): Handle TensorShapeProto in function_def_to_graph.input_shapes.
   input_shapes = [
       tensor_shape.TensorShape(s) for s in while_op.get_attr("output_shapes")
   ]
-  func_name = while_op.get_attr("body").name
+  func_name = while_op.get_attr(func_attr_name).name
   fdef = while_op.graph._get_function(func_name).definition
   # `while_op.graph` may not be the same as `ops.get_default_graph()` e.g.
   # if the `while_op` is in the body of another if/while/defun. We build the
@@ -428,7 +440,8 @@ def _get_body_graph(while_op):
   return func_graph
 
 
-def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
+def _create_grad_func(ys, xs, grads, cond_graph, body_graph, name, while_op,
+                      max_iters):
   """Builds and returns the gradient FuncGraph of `func_graph` and its args.
 
   The returned grad_func_graph must be called with the returned
@@ -438,9 +451,11 @@ def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
     ys: A `Tensor` or list of tensors to be differentiated.
     xs: A `Tensor` or list of tensors to be used for differentiation.
     grads: The incoming grads for `ys`.
-    func_graph: FuncGraph for the forward body function.
+    cond_graph: FuncGraph for the forward cond function.
+    body_graph: FuncGraph for the forward body function.
     name: Name of the returned gradient function.
     while_op: The forward While op.
+    max_iters: the maximum number of iterations, or None if no limit.
 
   Returns:
     2-tuple of (grad_func_graph, args).
@@ -456,9 +471,10 @@ def _create_grad_func(ys, xs, grads, func_graph, name, while_op):
   # `external_captures`.
   grad_func_graph = func_graph_module.func_graph_from_py_func(
       name,
-      lambda *args: _grad_fn(ys, xs, args, func_graph),
+      lambda *args: _grad_fn(ys, xs, args, body_graph),
       args, {},
-      func_graph=_WhileBodyGradFuncGraph(name, func_graph))
+      func_graph=_WhileBodyGradFuncGraph(name, cond_graph, body_graph,
+                                         max_iters))
 
   # Add the popped accumulators to the list of outputs.
   for internal_capture in grad_func_graph.internal_captures:
@@ -498,56 +514,59 @@ def _grad_fn(ys, xs, args, func_graph):
   # Build the gradient graph. Note that this builds the gradient computation of
   # func_graph in the current graph, which requires capturing tensors from
   # func_graph. The captured func_graph tensors are resolved to external tensors
-  # in _resolve_grad_inputs.
+  # after the forward While op has been rewritten in _resolve_grad_captures.
   # TODO(srbs): Mark GradientsHelper as public?
   grad_outs = gradients_impl._GradientsHelper(
-      ys, xs, grad_ys=grad_ys, src_graph=func_graph)
+      ys, xs, grad_ys=grad_ys, src_graph=func_graph,
+      unconnected_gradients="zero")
 
   # TODO(b/118712257): Handle the case when grad_outs has None's e.g. when there
   # is a tf.StopGradient in the loop body.
-  assert all([g is not None for g in grad_outs])
+  assert all(g is not None for g in grad_outs)
   counter = args[0]
   total_iters = args[1]
   return [counter + 1, total_iters] + grad_outs
 
 
-def _get_intermediates(func_graph):
-  """Returns all tensors in `func_graph` that should be accumulated."""
-  # We currently accumulate output tensors of most ops in the function and rely
-  # on the pruning pass to get rid of the unused accumulators at runtime.
-  # However, this can bloat the GraphDef and make debugging harder so we perform
-  # some optimizations.
-  #
-  # Optimization we currently perform:
-  # 1. We do not accumulate tensors which already have an accumulator
-  #    in the loop body.
-  # 2. We do not accumulate outputs of Identity nodes. When building the
-  #    FuncGraph, we add an Identity node for each output (see
-  #    `AutomaticControlDependencies.mark_as_return`). Accumulating outputs
-  #    of all these nodes bloats the GraphDef quite a bit so we remove those.
-  #    Since the gradient of an Identity node does not rely on its forward op's
-  #    input this is safe to do.
-  #
-  # Other possible optimizations:
-  # 1. Only accumulate tensors that will be required by the backward pass.
-  #    This will require running the gradient pass and hence would increase the
-  #    graph building time for the forward pass.
-  # 2. Do not accumulate Const nodes created inside the loop body.
-  # 3. Do not accumulate inputs that are passed as-is, e.g. loop invariants.
-  # TODO(srbs): 2 and 3 may be hard optimizations for the runtime optimizer
-  # since it requires knowledge of the while loop semantics. If so, consider
-  # doing those here.
-  intermediates = []
-
-  for op in func_graph.get_operations():
-    if op.type == "Identity":
-      continue
-    for o in op.outputs:
-      if (o != func_graph.inputs[0] and  # Loop counter.
-          o.dtype != dtypes.resource and  # Do not accumulate resource tensors.
-          _get_accumulator(o) is None):  # Has existing accumulator.
-        intermediates.append(o)
-  return intermediates
+def _resolve_grad_captures(body_graph, body_grad_graph, while_op):
+  """Returns the tensors to pass as captured inputs to `body_grad_graph`.
+
+  `body_grad_graph` may have external references to:
+  1. Its outer graph containing the input gradients. These are left as-is.
+  2. Accumulators captured from the forward-pass graph. These should have been
+     added as `while_op` outputs after the gradient graph was built. We replace
+     these with the corresponding output of `while_op`, i.e. a tensor in
+     `body_graph.outer_graph`. In the case of nested control flow or functions,
+     the gradient logic handling `body_grad_graph.outer_graph` will make sure
+     the tensor from `body_graph.outer_graph` is also correctly captured.
+
+  Args:
+    body_graph: FuncGraph. The forward-pass body function.
+    body_grad_graph: FuncGraph. The body gradients function.
+    while_op: The forward-pass While Operation calling `body_graph`.
+
+  Returns:
+    A list of input tensors to be passed as the captured inputs to
+      `body_grad_graph`.
+  """
+  new_capture_inputs = []
+  for t in body_grad_graph.external_captures:
+    # All values captured by gradient computation should be from the forward
+    # graph or a captured resource variable (note that input gradients are
+    # regular non-captured inputs).
+    if t.graph == body_graph:
+      # Captured accumulator
+      t = while_op.outputs[t.graph.outputs.index(t)]
+      # Note: We rely on the capturing logic of the gradient While op graph to
+      # correctly capture the tensors in `body_graph.outer_graph`. Both cond_v2
+      # and while_v2 handle this while building their gradient functions.
+      assert t.graph == body_graph.outer_graph
+    else:
+      # Captured resource variable
+      assert t.dtype == dtypes.resource
+
+    new_capture_inputs.append(t)
+  return new_capture_inputs
 
 
 def _get_accumulator(tensor):
@@ -621,9 +640,10 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
      b. Lookup the corresponding resource tensor in the forward outer graph and
         try to capture that.
   2. If the tensor is not of resource type:
-     a. Find the accumulator for that tensor.
-     b. Capture the forward While op output tensor corresponding to the
-        accumulator in this FuncGraph.
+     a. Create an accumulator for that tensor and output it from the forward
+        pass. Note this also requires adding it as an input to the forward pass.
+     b. Capture the accumulator from the forward pass in this FuncGraph. This
+        will later be resolved to the correct output of the forward While op.
      c. Pop a value from the captured placeholder and use it as the captured
         value for the forward pass tensor.
 
@@ -637,16 +657,25 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
   tensor.
 
   Attributes:
-    popped_tensor_lists: Dict from the captured accumulator placeholder to the
+    while_op_needs_rewrite: True if any non-resource intermediates were
+      captured, meaning the forward While op needs to be rewritten to output the
+      corresponding accumulators.
+    empty_tensor_lists: list of EmptyTensorList tensors to be used as initial
+      input to the new accumulators in the forward graph.
+    popped_tensor_lists: dict from the captured accumulator placeholder to the
       TensorList obtained after popping the intermediate tensor from it. The
       values of this dict need to be added to the list of outputs.
   """
 
-  def __init__(self, name, forward_graph):
+  def __init__(self, name, forward_cond_graph, forward_body_graph, max_iters):
     super(_WhileBodyGradFuncGraph, self).__init__(name)
+    self.empty_tensor_lists = []
     self.popped_tensor_lists = {}
     # FuncGraph for the body of the forward While op.
-    self._forward_graph = forward_graph
+    self._forward_graph = forward_body_graph
+    # FuncGraph for the cond of the forward While op.
+    self._forward_cond_graph = forward_cond_graph
+    self._maximum_iterations = max_iters
     # Dict from forward intermediate tensor to its indirectly captured tensor
     # in this graph. Indirect capturing happens in two ways:
     # 1. For non-resource tensors we capture their accumulators from the forward
@@ -655,13 +684,10 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
     # 2. For resource tensors we directly capture their corresponding tensor
     #    in the forward outer graph.
     self._indirect_captures = {}
-    # Dict from forward graph tensor to its corresponding tensor in
-    # `forward_graph.outer_graph`. For a non-resource tensor the value is the
-    # forward While op's "output" corresponding its accumulator. For a resource
-    # tensor it is the While op's "input" for the resource. Note: We disallow
-    # creation of resources inside the while loop so if a resource tensor exists
-    # inside while loop it must be a loop input.
-    self._inner_to_outer_tensor = {}
+
+  @property
+  def while_op_needs_rewrite(self):
+    return self.empty_tensor_lists
 
   def capture(self, tensor, name=None, whitelisted=False):
     """Selectively captures external tensors.
@@ -700,10 +726,6 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
 
     captured_tensor = self._indirect_captures.get(tensor)
     if captured_tensor is not None:
-      # For GradientTape housekeeping.
-      assert self._inner_to_outer_tensor[tensor] in self.captures
-      super(_WhileBodyGradFuncGraph, self)._capture_helper(
-          self._inner_to_outer_tensor[tensor], name)
       return captured_tensor
 
     if tensor.dtype == dtypes.resource:
@@ -728,36 +750,47 @@ class _WhileBodyGradFuncGraph(util.WhileBodyFuncGraph):
           index], "Resource tensors must be loop invariants %s." % str(
               self._forward_graph._while.inputs[index])
       tensor_in_outer_graph = self._forward_graph._while.inputs[index]
-      self._inner_to_outer_tensor[tensor] = tensor_in_outer_graph
       self._indirect_captures[tensor] = self.capture(
           tensor_in_outer_graph, whitelisted=True)
       return self._indirect_captures[tensor]
 
-    assert tensor not in self._inner_to_outer_tensor
-
-    accumulator = None
-
-    # Find the TensorList that was used to accumulate the tensors of this
-    # intermediate tensor.
+    # Create or find an existing accumulator output for `tensor` in the forward
+    # graph, and fetch from this accumulator in the gradient graph to get the
+    # raw intermediate value.
     accumulator = _get_accumulator(tensor)
     if accumulator is None:
-      raise ValueError("Reference to un-accumulated intermediate tensor: ",
-                       tensor.name)
-    assert accumulator.graph == self._forward_graph
-    # Get the While op output corresponding to the accumulator.
-    accumulator = self._forward_graph._while.outputs[self._forward_graph.outputs
-                                                     .index(accumulator)]
-
-    assert accumulator.graph == self._forward_graph.outer_graph
-    self._inner_to_outer_tensor[tensor] = accumulator
-
-    # Capture the `accumulator`.
-    accumulator_ph = super(_WhileBodyGradFuncGraph, self)._capture_helper(
+      # Create the initial empty tensor list.
+      with self._forward_graph.outer_graph.as_default():
+        tensor_list = list_ops.empty_tensor_list(
+            element_dtype=tensor.dtype, element_shape=tensor.shape,
+            max_num_elements=self._maximum_iterations)
+      self.empty_tensor_lists.append(tensor_list)
+
+      # Push the intermediate tensor to the tensor list. This captures
+      # `tensor_list`.
+      with self._forward_graph.as_default():
+        accumulator = list_ops.tensor_list_push_back(tensor_list, tensor)
+      # Add the modified tensor list to the list of outputs. This output will be
+      # all the accumulated values.
+      self._forward_graph.outputs.append(accumulator)
+
+      # Capture in the cond graph as well so the forward cond and body inputs
+      # match.
+      with self._forward_cond_graph.as_default():
+        self._forward_cond_graph.capture(tensor_list)
+
+    # Capture the accumulator tensor list in the gradient graph directly from
+    # the forward graph -- we'll later modify this to capture the final list
+    # output by the forward While op instead.
+    captured_accumulator = super(_WhileBodyGradFuncGraph, self)._capture_helper(
         accumulator, name)
+
+    # Pop the intermediate value from the tensor list in the gradient graph.
     new_tensor_list, captured_tensor = list_ops.tensor_list_pop_back(
-        accumulator_ph, element_dtype=tensor.dtype)
+        captured_accumulator, element_dtype=tensor.dtype)
+
     self._indirect_captures[tensor] = captured_tensor
-    self.popped_tensor_lists[accumulator_ph] = new_tensor_list
+    self.popped_tensor_lists[captured_accumulator] = new_tensor_list
     return captured_tensor
 
 
@@ -792,29 +825,6 @@ def _copy_handle_data(src_tensors, tgt_tensors):
     custom_gradient.copy_handle_data(src_t, tgt_t)
 
 
-# TODO(srbs): Move to common utils for cond_v2 and while_v2.
-def _maybe_set_lowering_attr(op):
-  """Sets the flag to enable lowering on the `While` op if necessary.
-
-  Lowering allows while_v2 to avoid some of the limitations of Functions,
-  allowing users to specify devices & colocation inside of while_v2
-  branches, and enabling non-strict evaluation & partial pruning of while_v2
-  branches. This brings while_v2 closer to feature parity with
-  tf.while_loop.
-
-  However, we do not lower `While` in the XLA context because it is easier
-  for XLA to apply its own optimizations when dealing with un-lowered
-  `While` operators than with low-level control flow primitives.
-
-  Args:
-    op: The While op.
-  """
-  if not control_flow_util.IsInXLAContext(op):
-    # pylint: disable=protected-access
-    op._set_attr("_lower_using_switch_merge", attr_value_pb2.AttrValue(b=True))
-    # pylint: enable=protected-access
-
-
 def _maybe_set_maximum_iterations_attr(op, maximum_iterations):
   if control_flow_util.IsInXLAContext(op):
     # Store the maximum_iterations to use in the gradient pass.
@@ -837,18 +847,6 @@ def _is_in_xla_context():
   return control_flow_util.GetContainingXLAContext(cur_ctxt) is not None
 
 
-def _get_tensor_convertible_shape(shape):
-  assert isinstance(shape, tensor_shape.TensorShape)
-  if shape.is_fully_defined():
-    return shape
-  if not shape:  # Unknown shape.
-    return -1
-  # Partially defined shape.
-  shape_list = shape.as_list()
-  shape_list = [s if s is not None else -1 for s in shape_list]
-  return ops.convert_to_tensor(shape_list)
-
-
 def _graph_name(graph):
   if isinstance(graph, func_graph_module.FuncGraph):
     return graph.name
@@ -870,6 +868,10 @@ def _is_tensor_array_handle(tensor):
   # TODO(b/118452219): add test coverage for this.
   tensor = func_graph_module.maybe_captured(tensor)
 
+  if isinstance(tensor, ops.EagerTensor):
+    # Eager execution doesn't quite support legacy tensorarray
+    return False
+
   return tensor.op.type in TENSOR_ARRAY_HANDLE_OPS
 
 
diff --git a/tensorflow/python/platform/__init__.py b/tensorflow/python/platform/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/python/platform/app.py b/tensorflow/python/platform/app.py
index 4c91bc3652dc77274acfbf43859c03fad8a46a38..7b917235c0a73421552b7aebaa3192de969e5f3a 100644
--- a/tensorflow/python/platform/app.py
+++ b/tensorflow/python/platform/app.py
@@ -108,7 +108,7 @@ def _define_help_flags():
     _define_help_flags_called = True
 
 
-@tf_export('app.run')
+@tf_export(v1=['app.run'])
 def run(main=None, argv=None):
   """Runs the program with an optional 'main' function and 'argv' list."""
 
diff --git a/tensorflow/python/platform/benchmark.py b/tensorflow/python/platform/benchmark.py
index 4f7abb311a75ee85688adbebb6d90f1f97356d89..d6773d7b8136f93080b122f52b77513305aecdb6 100644
--- a/tensorflow/python/platform/benchmark.py
+++ b/tensorflow/python/platform/benchmark.py
@@ -30,6 +30,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.core.util import test_log_pb2
 from tensorflow.python.client import timeline
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import app
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
@@ -299,6 +300,18 @@ class TensorFlowBenchmark(Benchmark):
     benchmark_values["extras"].update(unreported_extras)
     return benchmark_values
 
+  def evaluate(self, tensors):
+    """Evaluates tensors and returns numpy values.
+
+    Args:
+      tensors: A Tensor or a nested list/tuple of Tensors.
+
+    Returns:
+      tensors numpy values.
+    """
+    sess = ops.get_default_session() or self.cached_session()
+    return sess.run(tensors)
+
 
 def _run_benchmarks(regex):
   """Run benchmarks that match regex `regex`.
diff --git a/tensorflow/python/platform/gfile.py b/tensorflow/python/platform/gfile.py
index 5927bc2409bb2744c2f6f003b90c0682e5ba5eb9..d0159e9e9816ba730c843d2b46936b142d47ff79 100644
--- a/tensorflow/python/platform/gfile.py
+++ b/tensorflow/python/platform/gfile.py
@@ -37,7 +37,7 @@ from tensorflow.python.util.deprecation import deprecated
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('gfile.GFile', 'gfile.Open')
+@tf_export(v1=['gfile.GFile', 'gfile.Open'], v2=['io.gfile.GFile'])
 class GFile(_FileIO):
   """File I/O wrappers without thread locking.
 
@@ -52,7 +52,7 @@ class GFile(_FileIO):
     super(GFile, self).__init__(name=name, mode=mode)
 
 
-@tf_export('gfile.FastGFile')
+@tf_export(v1=['gfile.FastGFile'])
 class FastGFile(_FileIO):
   """File I/O wrappers without thread locking.
 
diff --git a/tensorflow/python/platform/googletest.py b/tensorflow/python/platform/googletest.py
index 8141cf92c568f257a5e9810318182d71f445dfa1..5b20e36a693b2ae283ffe4cefa2210c0cb61dcfc 100644
--- a/tensorflow/python/platform/googletest.py
+++ b/tensorflow/python/platform/googletest.py
@@ -104,10 +104,13 @@ def GetTempDir():
   """Return a temporary directory for tests to use."""
   global _googletest_temp_dir
   if not _googletest_temp_dir:
-    first_frame = tf_inspect.stack()[-1][0]
-    temp_dir = os.path.join(tempfile.gettempdir(),
-                            os.path.basename(tf_inspect.getfile(first_frame)))
-    temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
+    if os.environ.get('TEST_TMPDIR'):
+      temp_dir = tempfile.mkdtemp(prefix=os.environ['TEST_TMPDIR'])
+    else:
+      first_frame = tf_inspect.stack()[-1][0]
+      temp_dir = os.path.join(tempfile.gettempdir(),
+                              os.path.basename(tf_inspect.getfile(first_frame)))
+      temp_dir = tempfile.mkdtemp(prefix=temp_dir.rstrip('.py'))
 
     def delete_temp_dir(dirname=temp_dir):
       try:
@@ -139,7 +142,7 @@ def StatefulSessionAvailable():
   return False
 
 
-@tf_export('test.StubOutForTesting')
+@tf_export(v1=['test.StubOutForTesting'])
 class StubOutForTesting(object):
   """Support class for stubbing methods out for unit testing.
 
diff --git a/tensorflow/python/platform/test.py b/tensorflow/python/platform/test.py
index d084870b2555b8f4e32b2b216e4ba8ca4676a833..943832af7a2c58d40cb2143048ddd6517596e406 100644
--- a/tensorflow/python/platform/test.py
+++ b/tensorflow/python/platform/test.py
@@ -46,9 +46,9 @@ from tensorflow.python.util.tf_export import tf_export
 if sys.version_info.major == 2:
   import mock                # pylint: disable=g-import-not-at-top,unused-import
 else:
-  from unittest import mock  # pylint: disable=g-import-not-at-top
+  from unittest import mock  # pylint: disable=g-import-not-at-top,g-importing-member
 
-tf_export('test.mock')(mock)
+tf_export(v1=['test.mock'])(mock)
 
 # Import Benchmark class
 Benchmark = _googletest.Benchmark  # pylint: disable=invalid-name
diff --git a/tensorflow/python/platform/tf_logging.py b/tensorflow/python/platform/tf_logging.py
index 59e60856ae80db76caa7ecd23db0db597bf60c6f..813bcb89beac01de97f53f9cb9ff97119f552a09 100644
--- a/tensorflow/python/platform/tf_logging.py
+++ b/tensorflow/python/platform/tf_logging.py
@@ -37,7 +37,7 @@ import six
 
 from tensorflow.python.util.tf_export import tf_export
 
-# Don't use this directly. Use _get_logger() instead.
+# Don't use this directly. Use get_logger() instead.
 _logger = None
 _logger_lock = threading.Lock()
 
@@ -78,7 +78,8 @@ else:
       return '(unknown file)', 0, '(unknown function)'
 
 
-def _get_logger():
+@tf_export('get_logger')
+def get_logger():
   """Return TF logger instance."""
   global _logger
 
@@ -130,39 +131,39 @@ def _get_logger():
     _logger_lock.release()
 
 
-@tf_export('logging.log')
+@tf_export(v1=['logging.log'])
 def log(level, msg, *args, **kwargs):
-  _get_logger().log(level, msg, *args, **kwargs)
+  get_logger().log(level, msg, *args, **kwargs)
 
 
-@tf_export('logging.debug')
+@tf_export(v1=['logging.debug'])
 def debug(msg, *args, **kwargs):
-  _get_logger().debug(msg, *args, **kwargs)
+  get_logger().debug(msg, *args, **kwargs)
 
 
-@tf_export('logging.error')
+@tf_export(v1=['logging.error'])
 def error(msg, *args, **kwargs):
-  _get_logger().error(msg, *args, **kwargs)
+  get_logger().error(msg, *args, **kwargs)
 
 
-@tf_export('logging.fatal')
+@tf_export(v1=['logging.fatal'])
 def fatal(msg, *args, **kwargs):
-  _get_logger().fatal(msg, *args, **kwargs)
+  get_logger().fatal(msg, *args, **kwargs)
 
 
-@tf_export('logging.info')
+@tf_export(v1=['logging.info'])
 def info(msg, *args, **kwargs):
-  _get_logger().info(msg, *args, **kwargs)
+  get_logger().info(msg, *args, **kwargs)
 
 
-@tf_export('logging.warn')
+@tf_export(v1=['logging.warn'])
 def warn(msg, *args, **kwargs):
-  _get_logger().warn(msg, *args, **kwargs)
+  get_logger().warn(msg, *args, **kwargs)
 
 
-@tf_export('logging.warning')
+@tf_export(v1=['logging.warning'])
 def warning(msg, *args, **kwargs):
-  _get_logger().warning(msg, *args, **kwargs)
+  get_logger().warning(msg, *args, **kwargs)
 
 
 _level_names = {
@@ -183,20 +184,20 @@ _log_prefix = None  # later set to google2_log_prefix
 _log_counter_per_token = {}
 
 
-@tf_export('logging.TaskLevelStatusMessage')
+@tf_export(v1=['logging.TaskLevelStatusMessage'])
 def TaskLevelStatusMessage(msg):
   error(msg)
 
 
-@tf_export('logging.flush')
+@tf_export(v1=['logging.flush'])
 def flush():
   raise NotImplementedError()
 
 
 # Code below is taken from pyglib/logging
-@tf_export('logging.vlog')
+@tf_export(v1=['logging.vlog'])
 def vlog(level, msg, *args, **kwargs):
-  _get_logger().log(level, msg, *args, **kwargs)
+  get_logger().log(level, msg, *args, **kwargs)
 
 
 def _GetNextLogCountPerToken(token):
@@ -214,7 +215,7 @@ def _GetNextLogCountPerToken(token):
   return _log_counter_per_token[token]
 
 
-@tf_export('logging.log_every_n')
+@tf_export(v1=['logging.log_every_n'])
 def log_every_n(level, msg, n, *args):
   """Log 'msg % args' at level 'level' once per 'n' times.
 
@@ -231,7 +232,7 @@ def log_every_n(level, msg, n, *args):
   log_if(level, msg, not (count % n), *args)
 
 
-@tf_export('logging.log_first_n')
+@tf_export(v1=['logging.log_first_n'])
 def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   """Log 'msg % args' at level 'level' only first 'n' times.
 
@@ -247,7 +248,7 @@ def log_first_n(level, msg, n, *args):  # pylint: disable=g-bad-name
   log_if(level, msg, count < n, *args)
 
 
-@tf_export('logging.log_if')
+@tf_export(v1=['logging.log_if'])
 def log_if(level, msg, condition, *args):
   """Log 'msg % args' at level 'level' only if condition is fulfilled."""
   if condition:
@@ -296,16 +297,16 @@ def google2_log_prefix(level, timestamp=None, file_and_line=None):
   return s
 
 
-@tf_export('logging.get_verbosity')
+@tf_export(v1=['logging.get_verbosity'])
 def get_verbosity():
   """Return how much logging output will be produced."""
-  return _get_logger().getEffectiveLevel()
+  return get_logger().getEffectiveLevel()
 
 
-@tf_export('logging.set_verbosity')
+@tf_export(v1=['logging.set_verbosity'])
 def set_verbosity(v):
   """Sets the threshold for what messages will be logged."""
-  _get_logger().setLevel(v)
+  get_logger().setLevel(v)
 
 
 def _get_thread_id():
@@ -318,8 +319,8 @@ def _get_thread_id():
 
 _log_prefix = google2_log_prefix
 
-tf_export('logging.DEBUG').export_constant(__name__, 'DEBUG')
-tf_export('logging.ERROR').export_constant(__name__, 'ERROR')
-tf_export('logging.FATAL').export_constant(__name__, 'FATAL')
-tf_export('logging.INFO').export_constant(__name__, 'INFO')
-tf_export('logging.WARN').export_constant(__name__, 'WARN')
+tf_export(v1=['logging.DEBUG']).export_constant(__name__, 'DEBUG')
+tf_export(v1=['logging.ERROR']).export_constant(__name__, 'ERROR')
+tf_export(v1=['logging.FATAL']).export_constant(__name__, 'FATAL')
+tf_export(v1=['logging.INFO']).export_constant(__name__, 'INFO')
+tf_export(v1=['logging.WARN']).export_constant(__name__, 'WARN')
diff --git a/tensorflow/python/profiler/internal/run_metadata_test.py b/tensorflow/python/profiler/internal/run_metadata_test.py
index 216cc3dd54b7851810baee23be91e321ede06b42..f96d721f46e162ee6753377569aacb439cd591d5 100644
--- a/tensorflow/python/profiler/internal/run_metadata_test.py
+++ b/tensorflow/python/profiler/internal/run_metadata_test.py
@@ -26,6 +26,7 @@ from tensorflow.core.protobuf import config_pb2
 from tensorflow.core.protobuf import rewriter_config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variables
@@ -154,6 +155,7 @@ class RunMetadataTest(test.TestCase):
     # deallocates the memory after matmul started.
     self.assertGreater(random_allocs[1].alloc_micros, mm.all_start_micros)
 
+  @test_util.run_deprecated_v1
   def testCPU(self):
     ops.reset_default_graph()
     with ops.device('/cpu:0'):
@@ -167,6 +169,7 @@ class RunMetadataTest(test.TestCase):
     ret = _extract_node(run_meta, 'MatMul:MatMul')
     self.assertEqual(len(ret), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testLoopCPU(self):
     ops.reset_default_graph()
     with ops.device('/cpu:0'):
diff --git a/tensorflow/python/profiler/model_analyzer.py b/tensorflow/python/profiler/model_analyzer.py
index 5f19eac0436c305d3e5d162b472210d630bcd545..4b2d9052b7879ceaf4a250ba56f438f3798b669b 100644
--- a/tensorflow/python/profiler/model_analyzer.py
+++ b/tensorflow/python/profiler/model_analyzer.py
@@ -122,7 +122,7 @@ def _build_advisor_options(options):
   return opts
 
 
-@tf_export('profiler.Profiler')
+@tf_export(v1=['profiler.Profiler'])
 class Profiler(object):
   """TensorFlow multi-step profiler.
 
@@ -306,7 +306,7 @@ class Profiler(object):
     print_mdl.WriteProfile(filename)
 
 
-@tf_export('profiler.profile')
+@tf_export(v1=['profiler.profile'])
 def profile(graph=None,
             run_meta=None,
             op_log=None,
@@ -381,7 +381,7 @@ def profile(graph=None,
   return tfprof_node
 
 
-@tf_export('profiler.advise')
+@tf_export(v1=['profiler.advise'])
 def advise(graph=None, run_meta=None, options=_DEFAULT_ADVISE_OPTIONS):
   """Auto profile and advise.
 
diff --git a/tensorflow/python/profiler/model_analyzer_test.py b/tensorflow/python/profiler/model_analyzer_test.py
index 94c685274a764bb099da6c0501b397d73d239f35..1c7c15be4fe5920ff06241175aff57bc52ac338e 100644
--- a/tensorflow/python/profiler/model_analyzer_test.py
+++ b/tensorflow/python/profiler/model_analyzer_test.py
@@ -76,6 +76,7 @@ class PrintModelAnalysisTest(test.TestCase):
                          '  ScalarW (1, 1/1 params)\n',
                          lib.CheckAndRemoveDoc(f.read()))
 
+  @test_util.run_v1_only('b/120545219')
   def testSelectEverythingDetail(self):
     ops.reset_default_graph()
     dev = '/device:GPU:0' if test.is_gpu_available() else '/device:CPU:0'
@@ -93,10 +94,10 @@ class PrintModelAnalysisTest(test.TestCase):
           config=self._no_rewrite_session_config()) as sess, ops.device(dev):
         x = lib.BuildSmallModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
         pctx.dump_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
 
         pctx.profiler.profile_name_scope(options=opts)
 
@@ -160,7 +161,7 @@ class PrintModelAnalysisTest(test.TestCase):
                         ) as sess, ops.device('/device:CPU:0'):
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -186,7 +187,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -203,6 +204,7 @@ class PrintModelAnalysisTest(test.TestCase):
             lib.CheckAndRemoveDoc(f.read())[0:80])
         # pylint: enable=line-too-long
 
+  @test_util.run_v1_only('b/120545219')
   def testComplexCodeView(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -220,9 +222,9 @@ class PrintModelAnalysisTest(test.TestCase):
       with session.Session(config=self._no_rewrite_session_config()) as sess:
         x = lib.BuildFullModel()
 
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         pctx.trace_next_step()
-        _ = sess.run(x)
+        _ = self.evaluate(x)
         tfprof_node = pctx.profiler.profile_python(options=opts)
 
         # pylint: disable=line-too-long
@@ -281,7 +283,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -309,7 +311,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -345,7 +347,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -391,7 +393,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -424,7 +426,7 @@ class PrintModelAnalysisTest(test.TestCase):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildFullModel()
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(
           x,
@@ -490,7 +492,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -555,7 +557,7 @@ class PrintModelAnalysisTest(test.TestCase):
 
     with session.Session(config=self._no_rewrite_session_config()) as sess:
       x = lib.BuildSmallModel()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       run_meta = config_pb2.RunMetadata()
       _ = sess.run(x,
                    options=config_pb2.RunOptions(
@@ -587,10 +589,10 @@ class PrintModelAnalysisTest(test.TestCase):
   def _trainLoop(self, train_op, train_steps, time_dir, time_step,
                  memory_dir, memory_step, profile_dir, dump_step):
     with session.Session(config=self._no_rewrite_session_config()) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # start from 1 because variable_initializer took one step.
       for i in range(1, train_steps + 1):
-        _ = sess.run(train_op)
+        _ = self.evaluate(train_op)
         if i in time_step:
           ret = gfile.ListDirectory(time_dir)
           self.assertEqual(len(ret), 1)
@@ -619,6 +621,7 @@ class PrintModelAnalysisTest(test.TestCase):
           else:
             self.assertEqual(len(gfile.ListDirectory(profile_dir)), 0)
 
+  @test_util.run_v1_only('b/120545219')
   def testAutoProfiling(self):
     ops.reset_default_graph()
     time_dir = os.path.join(test.get_temp_dir(), 'time')
@@ -706,6 +709,7 @@ class PrintModelAnalysisTest(test.TestCase):
                       exception_str)
       self.assertTrue(mat is None)
 
+  @test_util.run_v1_only('b/120545219')
   def testTrackPersistentBytes(self):
     ops.reset_default_graph()
     a = array_ops.constant(np.ones((100, 100)))
diff --git a/tensorflow/python/profiler/option_builder.py b/tensorflow/python/profiler/option_builder.py
index 2ad7adf76933df65ca795dca361397f436adb995..9d8f7683a658e74c649d9ea337e7dbc10f870ef2 100644
--- a/tensorflow/python/profiler/option_builder.py
+++ b/tensorflow/python/profiler/option_builder.py
@@ -23,7 +23,7 @@ from tensorflow.python.profiler import tfprof_logger
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('profiler.ProfileOptionBuilder')
+@tf_export(v1=['profiler.ProfileOptionBuilder'])
 class ProfileOptionBuilder(object):
   # pylint: disable=line-too-long
   """Option Builder for Profiling API.
diff --git a/tensorflow/python/profiler/pprof_profiler_test.py b/tensorflow/python/profiler/pprof_profiler_test.py
index 11a3487360c1396f86e150bfba47357a6c28a5fd..3f5bd9e79be2254779e4b64507ef91baec3db49c 100644
--- a/tensorflow/python/profiler/pprof_profiler_test.py
+++ b/tensorflow/python/profiler/pprof_profiler_test.py
@@ -24,6 +24,7 @@ from proto import profile_pb2
 from tensorflow.core.framework import step_stats_pb2
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
@@ -135,6 +136,7 @@ comment: 9
       profile.ParseFromString(profile_contents)
       self.assertEquals(expected_proto, str(profile))
 
+  @test_util.run_v1_only('b/120545219')
   def testProfileWithWhileLoop(self):
     options = config_pb2.RunOptions()
     options.trace_level = config_pb2.RunOptions.FULL_TRACE
diff --git a/tensorflow/python/profiler/profile_context_test.py b/tensorflow/python/profiler/profile_context_test.py
index 107ad443c32e20ab69f3c2fb71c652d97a9c0cc6..885f08ca4b9c049aa78d0d8a202cca48aa813bce 100644
--- a/tensorflow/python/profiler/profile_context_test.py
+++ b/tensorflow/python/profiler/profile_context_test.py
@@ -21,6 +21,7 @@ import os
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ builder = option_builder.ProfileOptionBuilder
 
 class ProfilerContextTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasics(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), "dump")
@@ -48,10 +50,10 @@ class ProfilerContextTest(test.TestCase):
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       pctx.add_auto_profiling("op", options=opts, profile_steps=[15, 50, 100])
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         total_steps = 101
         for i in range(total_steps):
-          sess.run(x)
+          self.evaluate(x)
           if i == 14 or i == 49:
             self.assertTrue(gfile.Exists(outfile))
             gfile.Remove(outfile)
@@ -69,45 +71,47 @@ class ProfilerContextTest(test.TestCase):
       with gfile.Open(outfile, "r") as f:
         self.assertEqual(profile_str, f.read())
 
+  @test_util.run_deprecated_v1
   def testAutoTracingInDeubMode(self):
     ops.reset_default_graph()
     x = lib.BuildFullModel()
 
     with profile_context.ProfileContext(test.get_temp_dir(), debug=True):
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
           for f in gfile.ListDirectory(test.get_temp_dir()):
             # Warm up, no tracing.
             self.assertFalse("run_meta" in f)
-        sess.run(x)
+        self.evaluate(x)
         self.assertTrue(
             gfile.Exists(os.path.join(test.get_temp_dir(), "run_meta_11")))
         gfile.Remove(os.path.join(test.get_temp_dir(), "run_meta_11"))
         # fetched already.
-        sess.run(x)
+        self.evaluate(x)
         for f in gfile.ListDirectory(test.get_temp_dir()):
           self.assertFalse("run_meta" in f)
 
+  @test_util.run_deprecated_v1
   def testDisabled(self):
     ops.reset_default_graph()
     x = lib.BuildFullModel()
     with profile_context.ProfileContext(test.get_temp_dir(),
                                         enabled=False) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
       self.assertTrue(pctx.profiler is None)
       self.assertTrue(
           getattr(session.BaseSession, "profile_context", None) is None)
 
     with profile_context.ProfileContext(test.get_temp_dir()) as pctx:
       with session.Session() as sess:
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for _ in range(10):
-          sess.run(x)
+          self.evaluate(x)
       self.assertFalse(pctx.profiler is None)
       self.assertFalse(
           getattr(session.BaseSession, "profile_context", None) is None)
diff --git a/tensorflow/python/profiler/profiler.py b/tensorflow/python/profiler/profiler.py
index efbdd1ba6842d85e82149346e9b4559527a1aacd..5f62690b54e2ff6e2c655eb5256299cce169f59a 100644
--- a/tensorflow/python/profiler/profiler.py
+++ b/tensorflow/python/profiler/profiler.py
@@ -49,7 +49,7 @@ _allowed_symbols.extend([
 ])
 
 # Export protos
-tf_export('profiler.GraphNodeProto')(GraphNodeProto)
-tf_export('profiler.MultiGraphNodeProto')(MultiGraphNodeProto)
-tf_export('profiler.AdviceProto')(AdviceProto)
-tf_export('profiler.OpLogProto')(OpLogProto)
+tf_export(v1=['profiler.GraphNodeProto'])(GraphNodeProto)
+tf_export(v1=['profiler.MultiGraphNodeProto'])(MultiGraphNodeProto)
+tf_export(v1=['profiler.AdviceProto'])(AdviceProto)
+tf_export(v1=['profiler.OpLogProto'])(OpLogProto)
diff --git a/tensorflow/python/profiler/profiler_test.py b/tensorflow/python/profiler/profiler_test.py
index eacb7d21e6aeb1c803165762e5f8f40e23247f64..e4f7361e5d711b58c5786a9e43e9d459c43dff4b 100644
--- a/tensorflow/python/profiler/profiler_test.py
+++ b/tensorflow/python/profiler/profiler_test.py
@@ -21,6 +21,7 @@ import os
 from tensorflow.core.protobuf import config_pb2
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -35,6 +36,7 @@ builder = option_builder.ProfileOptionBuilder
 
 class ProfilerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testProfileBasic(self):
     ops.reset_default_graph()
     outfile = os.path.join(test.get_temp_dir(), 'dump')
@@ -171,6 +173,7 @@ class ProfilerTest(test.TestCase):
       checker = advice_pb.checkers['ExpensiveOperationChecker']
       self.assertGreater(len(checker.reports), 0)
 
+  @test_util.run_deprecated_v1
   def testMultipleProfilePerStep(self):
     ops.reset_default_graph()
     opts = (builder(builder.trainable_variables_parameter())
diff --git a/tensorflow/python/profiler/tfprof_logger.py b/tensorflow/python/profiler/tfprof_logger.py
index e651de32ea3bce32a965bfbeefc76ff08a79ac38..6ccd0e0ff3b5f9f067f49b7a1b64e62af7c7af5d 100644
--- a/tensorflow/python/profiler/tfprof_logger.py
+++ b/tensorflow/python/profiler/tfprof_logger.py
@@ -188,7 +188,7 @@ def merge_default_with_oplog(graph, op_log=None, run_meta=None,
   return tmp_op_log
 
 
-@tf_export('profiler.write_op_log')
+@tf_export(v1=['profiler.write_op_log'])
 def write_op_log(graph, log_dir, op_log=None, run_meta=None, add_trace=True):
   """Log provided 'op_log', and add additional model information below.
 
diff --git a/tensorflow/python/saved_model/BUILD b/tensorflow/python/saved_model/BUILD
index e7a3b8afd5daf279569d1866a700d8084633dfa9..53d0640542f257bff707047cd405a0dad5055449 100644
--- a/tensorflow/python/saved_model/BUILD
+++ b/tensorflow/python/saved_model/BUILD
@@ -12,6 +12,8 @@ licenses(["notice"])  # Apache 2.0
 exports_files(["LICENSE"])
 
 load("//tensorflow:tensorflow.bzl", "py_test")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 
 py_library(
     name = "saved_model",
@@ -21,6 +23,7 @@ py_library(
     deps = [
         ":builder",
         ":constants",
+        ":load",
         ":loader",
         ":main_op",
         ":save",
@@ -83,12 +86,13 @@ py_library(
     srcs_version = "PY2AND3",
     deps = [
         ":constants",
+        ":signature_def_utils",
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:framework_for_generated_wrappers",
         "//tensorflow/python:lib",
         "//tensorflow/python:platform",
-        "//tensorflow/python:training",
+        "//tensorflow/python:saver",
         "//tensorflow/python:util",
         "//tensorflow/python:variables",
     ],
@@ -114,6 +118,7 @@ py_test(
         "//tensorflow/python:state_ops",
         "//tensorflow/python:training",
         "//tensorflow/python:variables",
+        "@absl_py//absl/testing:parameterized",
     ],
 )
 
@@ -166,14 +171,15 @@ py_test(
         ":signature_def_utils",
         ":tag_constants",
         "//tensorflow/core:protos_all_py",
-        "//tensorflow/python:client",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:control_flow_ops",
         "//tensorflow/python:errors",
         "//tensorflow/python:framework_for_generated_wrappers",
+        "//tensorflow/python:framework_test_lib",
         "//tensorflow/python:lib",
         "//tensorflow/python:math_ops",
         "//tensorflow/python:saver_test_utils",
+        "//tensorflow/python:session",
         "//tensorflow/python:state_ops",
         "//tensorflow/python:test_ops",
         "//tensorflow/python:training",
@@ -264,6 +270,14 @@ py_test(
     ],
 )
 
+tf_proto_library(
+    name = "saved_object_graph",
+    srcs = ["saved_object_graph.proto"],
+    cc_api_version = 2,
+    protodeps = tf_additional_all_protos(),
+    visibility = ["//tensorflow:internal"],
+)
+
 py_library(
     name = "save",
     srcs = [
@@ -271,16 +285,26 @@ py_library(
     ],
     srcs_version = "PY2AND3",
     deps = [
+        ":builder",
+        ":constants",
         ":loader",
+        ":saved_object_graph_py",
         ":signature_constants",
         ":signature_def_utils",
+        ":tag_constants",
         ":utils",
         "//tensorflow/core:protos_all_py",
         "//tensorflow/python:array_ops",
+        "//tensorflow/python:framework",
         "//tensorflow/python:framework_ops",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:resource_variable_ops",
         "//tensorflow/python:util",
+        "//tensorflow/python/eager:context",
         "//tensorflow/python/eager:def_function",
-        "//tensorflow/python/eager:test",
+        "//tensorflow/python/eager:function",
+        "//tensorflow/python/training/checkpointable:base",
+        "//tensorflow/python/training/checkpointable:util",
     ],
 )
 
@@ -289,13 +313,42 @@ py_test(
     srcs = ["save_test.py"],
     srcs_version = "PY2AND3",
     deps = [
+        ":loader",
         ":save",
         ":signature_constants",
         ":tag_constants",
         "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
         "@absl_py//absl/testing:parameterized",
     ],
 )
 
-# -----------------------------------------------------------------------------
-# Google-internal targets.  These must be at the end for syncrepo.
+py_library(
+    name = "load",
+    srcs = [
+        "load.py",
+    ],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":loader",
+        ":saved_object_graph_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:util",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
+
+py_test(
+    name = "load_test",
+    srcs = ["load_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":load",
+        ":save",
+        "//tensorflow/python:dtypes",
+        "//tensorflow/python:tensor_spec",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/eager:test",
+        "//tensorflow/python/training/checkpointable:tracking",
+    ],
+)
diff --git a/tensorflow/python/saved_model/builder.py b/tensorflow/python/saved_model/builder.py
index be49c70c60476ae8b95c07007abb32a222466958..b929934eebb14a340d89fbb570a322b2b7144154 100644
--- a/tensorflow/python/saved_model/builder.py
+++ b/tensorflow/python/saved_model/builder.py
@@ -24,5 +24,6 @@ from __future__ import division
 from __future__ import print_function
 
 # pylint: disable=unused-import
+from tensorflow.python.saved_model.builder_impl import _SavedModelBuilder
 from tensorflow.python.saved_model.builder_impl import SavedModelBuilder
 # pylint: enable=unused-import
diff --git a/tensorflow/python/saved_model/builder_impl.py b/tensorflow/python/saved_model/builder_impl.py
index 4f68f7c5aeac4e8526dd3181a2eb347d52b8f550..f37d283a2a2cbb50faf62f1ae24cd69bd0f29d74 100644
--- a/tensorflow/python/saved_model/builder_impl.py
+++ b/tensorflow/python/saved_model/builder_impl.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import functools
 import os
 
 from google.protobuf.any_pb2 import Any
@@ -32,6 +33,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
@@ -39,8 +41,9 @@ from tensorflow.python.util.deprecation import deprecated_args
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export(v1=["saved_model.Builder", "saved_model.builder.SavedModelBuilder"])
-class SavedModelBuilder(object):
+# Base class for the SavedModelBuilder that is only used by Tensorflow
+# internally. Please use tf.compat.v1.saved_model.SavedModelBuilder instead.
+class _SavedModelBuilder(object):
   """Builds the `SavedModel` protocol buffer and saves variables and assets.
 
   The `SavedModelBuilder` class provides functionality to build a `SavedModel`
@@ -68,7 +71,7 @@ class SavedModelBuilder(object):
     builder.add_meta_graph_and_variables(sess,
                                     ["foo-tag"],
                                     signature_def_map=foo_signatures,
-                                    assets_collection=foo_assets)
+                                    assets_list=foo_assets)
   ...
 
   with tf.Session(graph=tf.Graph()) as sess:
@@ -105,82 +108,24 @@ class SavedModelBuilder(object):
     # weights.
     self._has_saved_variables = False
 
-  def _save_and_write_assets(self, assets_collection_to_add=None):
+  def _save_and_write_assets(self, meta_graph_def, assets_list=None):
     """Saves asset to the meta graph and writes asset files to disk.
 
     Args:
-      assets_collection_to_add: The collection where the asset paths are setup.
+      meta_graph_def: The meta graph def to which the assets will be added.
+      assets_list: The list where the asset paths are setup.
     """
-    asset_filename_map = _maybe_save_assets(assets_collection_to_add)
+    # Creates a function that adds assets into the meta graph def.
+    write_fn = functools.partial(_add_asset_to_metagraph, meta_graph_def)
+    asset_filename_map = _maybe_save_assets(write_fn, assets_list)
 
     # Return if there are no assets to write.
     if not asset_filename_map:
       tf_logging.info("No assets to write.")
       return
 
-    assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
-        self._export_dir)
-
-    # Copy each asset from source path to destination path.
-    for asset_basename, asset_source_filepath in asset_filename_map.items():
-      asset_destination_filepath = os.path.join(
-          compat.as_bytes(assets_destination_dir),
-          compat.as_bytes(asset_basename))
-
-      # Only copy the asset file to the destination if it does not already
-      # exist. This is to ensure that an asset with the same name defined as
-      # part of multiple graphs is only copied the first time.
-      if not file_io.file_exists(asset_destination_filepath):
-        file_io.copy(asset_source_filepath, asset_destination_filepath)
-
-    tf_logging.info("Assets written to: %s",
-                    compat.as_text(assets_destination_dir))
-
-  def _maybe_add_main_op(self, main_op):
-    """Adds main op to the SavedModel.
-
-    Args:
-      main_op: Main op to run as part of graph initialization. If None, no
-        main op will be added to the graph.
-
-    Raises:
-      TypeError: if main op is provided but is not of type `Operation`.
-      ValueError: if the Graph already contains an init op.
-    """
-    if main_op is None:
-      return
-
-    if not isinstance(main_op, ops.Operation):
-      raise TypeError("main_op needs to be an Operation: %r" % main_op)
-
-    # Validate that no other init ops have been added to this graph already.
-    # We check main_op and legacy_init_op for thoroughness and explicitness.
-    for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
-      if ops.get_collection(init_op_key):
-        raise ValueError(
-            "Graph already contains one or more main ops under the "
-            "collection {}.".format(init_op_key))
-
-    ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
-
-  def _add_train_op(self, train_op):
-    """Add train op to the SavedModel.
-
-    Note that this functionality is in development, and liable to be
-    moved elsewhere.
-
-    Args:
-      train_op: Op or group of ops that are used for training. These are
-        stored as a collection with key TRAIN_OP_KEY, but not executed.
-
-    Raises:
-      TypeError if Train op is not of type `Operation`.
-    """
-    if train_op is not None:
-      if (not isinstance(train_op, ops.Tensor) and
-          not isinstance(train_op, ops.Operation)):
-        raise TypeError("train_op needs to be a Tensor or Op: %r" % train_op)
-      ops.add_to_collection(constants.TRAIN_OP_KEY, train_op)
+    # Copy assets from source path to destination path.
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
 
   def _tag_and_add_meta_graph(self, meta_graph_def, tags, signature_def_map):
     """Tags the meta graph def and adds it to the SavedModel.
@@ -237,30 +182,32 @@ class SavedModelBuilder(object):
 
     Validation of entries in the signature def map includes ensuring that the
     `name` and `dtype` fields of the TensorInfo protos of the `inputs` and
-    `outputs` of each `SignatureDef` are populated.
+    `outputs` of each `SignatureDef` are populated. Also ensures that reserved
+    SigantureDef keys for the initialization and train ops are not used.
 
     Args:
       signature_def_map: The map of signature defs to be validated.
-    """
-    if signature_def_map is not None:
-      for signature_def_key in signature_def_map:
-        signature_def = signature_def_map[signature_def_key]
-        inputs = signature_def.inputs
-        outputs = signature_def.outputs
-        for inputs_key in inputs:
-          self._validate_tensor_info(inputs[inputs_key])
-        for outputs_key in outputs:
-          self._validate_tensor_info(outputs[outputs_key])
-
-  def _add_collections(
-      self, assets_collection, main_op, train_op):
-    """Add asset and op collections to be saved."""
-    # Save asset files and write them to disk, if any.
-    self._save_and_write_assets(assets_collection)
 
-    self._maybe_add_main_op(main_op)
-
-    self._add_train_op(train_op)
+    Raises:
+      AssertionError: If a TensorInfo is not valid.
+      KeyError: If a reserved signature key is used in the map.
+    """
+    for signature_def_key in signature_def_map:
+      signature_def = signature_def_map[signature_def_key]
+      inputs = signature_def.inputs
+      outputs = signature_def.outputs
+      for inputs_key in inputs:
+        self._validate_tensor_info(inputs[inputs_key])
+      for outputs_key in outputs:
+        self._validate_tensor_info(outputs[outputs_key])
+    if constants.INIT_OP_SIGNATURE_KEY in signature_def_map:
+      raise KeyError(
+          "SignatureDef map key \"{}\" is reserved for initialization. Please "
+          "use a different key.".format(constants.INIT_OP_SIGNATURE_KEY))
+    if constants.TRAIN_OP_SIGNATURE_KEY in signature_def_map:
+      raise KeyError(
+          "SignatureDef map key \"{}\" is reserved for the train op. Please "
+          "use a different key.".format(constants.TRAIN_OP_SIGNATURE_KEY))
 
   def _maybe_create_saver(self, saver=None):
     """Creates a sharded saver if one does not already exist."""
@@ -274,19 +221,14 @@ class SavedModelBuilder(object):
           allow_empty=True)
     return saver
 
-  @deprecated_args(None,
-                   "Pass your op to the equivalent parameter main_op instead.",
-                   "legacy_init_op")
   def add_meta_graph(self,
                      tags,
                      signature_def_map=None,
-                     assets_collection=None,
-                     legacy_init_op=None,
+                     assets_list=None,
                      clear_devices=False,
-                     main_op=None,
-                     strip_default_attrs=False,
+                     init_op=None,
+                     train_op=None,
                      saver=None):
-    # pylint: disable=line-too-long
     """Adds the current meta graph to the SavedModel.
 
     Creates a Saver in the current scope and uses the Saver to export the meta
@@ -297,19 +239,17 @@ class SavedModelBuilder(object):
       tags: The set of tags to annotate the meta graph def with.
       signature_def_map: The map of signature defs to be added to the meta graph
           def.
-      assets_collection: Assets collection to be saved with SavedModel. Note
-          that this collection should be a subset of the assets saved as part of
+      assets_list: Assets to be saved with SavedModel. Note
+          that this list should be a subset of the assets saved as part of
           the first meta graph in the SavedModel.
-      legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load. Deprecated; please use main_op instead.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
-      main_op: Op or group of ops to execute when the graph is loaded. Note
-          that when the main_op is specified it is run after the restore op at
+      init_op: Op or group of ops to execute when the graph is loaded. Note
+          that when the init_op is specified it is run after the restore op at
           load-time.
-      strip_default_attrs: Boolean. If `True`, default-valued attributes will be
-        removed from the NodeDefs. For a detailed guide, see
-        [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
+      train_op: Op or group of opts that trains the model when run. This will
+        not be run automatically when the graph is loaded, instead saved in
+        a SignatureDef accessible through the exported MetaGraph.
       saver: An instance of tf.train.Saver that will be used to export the
         metagraph. If None, a sharded Saver that restores all variables will
         be used.
@@ -318,7 +258,6 @@ class SavedModelBuilder(object):
       AssertionError: If the variables for the SavedModel have not been saved
           yet, or if the graph already contains one or more legacy init ops.
     """
-    # pylint: enable=line-too-long
     if not self._has_saved_variables:
       raise AssertionError(
           "Graph state including variables and assets has not been saved yet. "
@@ -326,14 +265,15 @@ class SavedModelBuilder(object):
 
     # Validate the signature def map to ensure all included TensorInfos are
     # properly populated.
+    signature_def_map = signature_def_map or {}
     self._validate_signature_def_map(signature_def_map)
 
-    # legacy_init_op is deprecated, and going away in TF 2.0.
-    # Re-mapping to main_op, as treatment is identical regardless.
-    main_op = main_op or legacy_init_op
-
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Create a SignatureDef pointing to the graph initialization op, which will
+    # be added to the MetaGraphDef.
+    _add_op_to_signature_def_map(signature_def_map, init_op,
+                                 constants.INIT_OP_SIGNATURE_KEY)
+    _add_op_to_signature_def_map(signature_def_map, train_op,
+                                 constants.TRAIN_OP_SIGNATURE_KEY)
 
     saver = self._maybe_create_saver(saver)
 
@@ -345,22 +285,22 @@ class SavedModelBuilder(object):
     # resolved, we just leave the option set to False for now.
     # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
     meta_graph_def = saver.export_meta_graph(
-        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+        clear_devices=clear_devices, strip_default_attrs=True)
+
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
 
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
-  @deprecated_args(None,
-                   "Pass your op to the equivalent parameter main_op instead.",
-                   "legacy_init_op")
   def add_meta_graph_and_variables(self,
                                    sess,
                                    tags,
                                    signature_def_map=None,
-                                   assets_collection=None,
-                                   legacy_init_op=None,
+                                   assets_list=None,
                                    clear_devices=False,
-                                   main_op=None,
+                                   init_op=None,
+                                   train_op=None,
                                    strip_default_attrs=False,
                                    saver=None):
     # pylint: disable=line-too-long
@@ -378,14 +318,15 @@ class SavedModelBuilder(object):
       tags: The set of tags with which to save the meta graph.
       signature_def_map: The map of signature def map to add to the meta graph
         def.
-      assets_collection: Assets collection to be saved with SavedModel.
-      legacy_init_op: Legacy support for op or group of ops to execute after the
-          restore op upon a load. Deprecated; please use main_op instead.
+      assets_list: Assets to be saved with SavedModel.
       clear_devices: Set to true if the device info on the default graph should
           be cleared.
-      main_op: Op or group of ops to execute when the graph is loaded. Note
-          that when the main_op is specified it is run after the restore op at
+      init_op: Op or group of ops to execute when the graph is loaded. Note
+          that when the init_op is specified it is run after the restore op at
           load-time.
+      train_op: Op or group of ops that trains the model when run. This will
+        not be run automatically when the graph is loaded, instead saved in
+        a SignatureDef accessible through the exported MetaGraph.
       strip_default_attrs: Boolean. If `True`, default-valued attributes will be
         removed from the NodeDefs. For a detailed guide, see
         [Stripping Default-Valued Attributes](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md#stripping-default-valued-attributes).
@@ -402,14 +343,15 @@ class SavedModelBuilder(object):
 
     # Validate the signature def map to ensure all included TensorInfos are
     # properly populated.
+    signature_def_map = signature_def_map or {}
     self._validate_signature_def_map(signature_def_map)
 
-    # legacy_init_op is deprecated, and going away in TF 2.0.
-    # Re-mapping to main_op, as treatment is identical regardless.
-    main_op = main_op or legacy_init_op
-
-    # Add assets and ops
-    self._add_collections(assets_collection, main_op, None)
+    # Create a SignatureDef pointing to the graph initialization op, which will
+    # be added to the MetaGraphDef.
+    _add_op_to_signature_def_map(signature_def_map, init_op,
+                                 constants.INIT_OP_SIGNATURE_KEY)
+    _add_op_to_signature_def_map(signature_def_map, train_op,
+                                 constants.TRAIN_OP_SIGNATURE_KEY)
 
     saved_model_utils.get_or_create_variables_dir(self._export_dir)
     variables_path = saved_model_utils.get_variables_path(self._export_dir)
@@ -434,6 +376,9 @@ class SavedModelBuilder(object):
     meta_graph_def = saver.export_meta_graph(
         clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
 
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(meta_graph_def, assets_list)
+
     # Tag the meta graph def and add it to the SavedModel.
     self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
 
@@ -471,11 +416,205 @@ class SavedModelBuilder(object):
     return path
 
 
-def _maybe_save_assets(assets_collection_to_add=None):
+@tf_export(v1=["saved_model.Builder", "saved_model.builder.SavedModelBuilder"])  # pylint: disable=missing-docstring
+class SavedModelBuilder(_SavedModelBuilder):
+  __doc__ = _SavedModelBuilder.__doc__.replace("assets_list",
+                                               "assets_collection")
+
+  def __init__(self, export_dir):
+    super(SavedModelBuilder, self).__init__(export_dir=export_dir)
+
+  def _add_collections(self, assets_collection, main_op, train_op):
+    """Add asset and op collections to be saved."""
+    # Save asset files and write them to disk, if any.
+    self._save_and_write_assets(assets_collection)
+
+    self._maybe_add_main_op(main_op)
+
+    self._add_train_op(train_op)
+
+  def _save_and_write_assets(self, assets_collection_to_add=None):
+    """Saves asset to the meta graph and writes asset files to disk.
+
+    Args:
+      assets_collection_to_add: The collection where the asset paths are setup.
+    """
+    # Add assets to the collection with key `constants.ASSETS_KEY`, in the
+    # graph.
+    asset_filename_map = _maybe_save_assets(_add_asset_to_collection,
+                                            assets_collection_to_add)
+
+    # Return if there are no assets to write.
+    if not asset_filename_map:
+      tf_logging.info("No assets to write.")
+      return
+
+    # Copy assets from source path to destination path.
+    copy_assets_to_destination_dir(asset_filename_map, self._export_dir)
+
+  def _maybe_add_main_op(self, main_op):
+    """Adds main op to the SavedModel.
+
+    Args:
+      main_op: Main op to run as part of graph initialization. If None, no main
+        op will be added to the graph.
+
+    Raises:
+      TypeError: if main op is provided but is not of type `Operation`.
+      ValueError: if the Graph already contains an init op.
+    """
+    if main_op is None:
+      return
+
+    if not isinstance(main_op, ops.Operation):
+      raise TypeError("main_op needs to be an Operation: %r" % main_op)
+
+    # Validate that no other init ops have been added to this graph already.
+    # We check main_op and legacy_init_op for thoroughness and explicitness.
+    for init_op_key in (constants.MAIN_OP_KEY, constants.LEGACY_INIT_OP_KEY):
+      if ops.get_collection(init_op_key):
+        raise ValueError(
+            "Graph already contains one or more main ops under the "
+            "collection {}.".format(init_op_key))
+
+    ops.add_to_collection(constants.MAIN_OP_KEY, main_op)
+
+  def _add_train_op(self, train_op):
+    """Add train op to the SavedModel.
+
+    Note that this functionality is in development, and liable to be
+    moved elsewhere.
+
+    Args:
+      train_op: Op or group of ops that are used for training. These are stored
+        as a collection with key TRAIN_OP_KEY, but not executed.
+
+    Raises:
+      TypeError if Train op is not of type `Operation`.
+    """
+    if train_op is not None:
+      if (not isinstance(train_op, ops.Tensor) and
+          not isinstance(train_op, ops.Operation)):
+        raise TypeError("train_op needs to be a Tensor or Op: %r" % train_op)
+      ops.add_to_collection(constants.TRAIN_OP_KEY, train_op)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph(self,
+                     tags,
+                     signature_def_map=None,
+                     assets_collection=None,
+                     legacy_init_op=None,
+                     clear_devices=False,
+                     main_op=None,
+                     strip_default_attrs=False,
+                     saver=None):
+    if not self._has_saved_variables:
+      raise AssertionError(
+          "Graph state including variables and assets has not been saved yet. "
+          "Please invoke `add_meta_graph_and_variables()` first.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    signature_def_map = signature_def_map or {}
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saver = self._maybe_create_saver(saver)
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+  @deprecated_args(None,
+                   "Pass your op to the equivalent parameter main_op instead.",
+                   "legacy_init_op")
+  def add_meta_graph_and_variables(self,
+                                   sess,
+                                   tags,
+                                   signature_def_map=None,
+                                   assets_collection=None,
+                                   legacy_init_op=None,
+                                   clear_devices=False,
+                                   main_op=None,
+                                   strip_default_attrs=False,
+                                   saver=None):
+    if self._has_saved_variables:
+      raise AssertionError("Graph state including variables and assets has "
+                           "already been saved. Please invoke "
+                           "`add_meta_graph()` instead.")
+
+    # Validate the signature def map to ensure all included TensorInfos are
+    # properly populated.
+    signature_def_map = signature_def_map or {}
+    self._validate_signature_def_map(signature_def_map)
+
+    # legacy_init_op is deprecated, and going away in TF 2.0.
+    # Re-mapping to main_op, as treatment is identical regardless.
+    main_op = main_op or legacy_init_op
+
+    # Add assets and ops
+    self._add_collections(assets_collection, main_op, None)
+
+    saved_model_utils.get_or_create_variables_dir(self._export_dir)
+    variables_path = saved_model_utils.get_variables_path(self._export_dir)
+
+    saver = self._maybe_create_saver(saver)
+
+    # Save the variables. Also, disable writing the checkpoint state proto. The
+    # file is not used during SavedModel loading. In addition, since a
+    # SavedModel can be copied or moved, this avoids the checkpoint state to
+    # become outdated.
+    saver.save(sess, variables_path, write_meta_graph=False, write_state=False)
+
+    # Export the meta graph def.
+
+    # The graph almost certainly previously contained at least one Saver, and
+    # possibly several (e.g. one for loading a pretrained embedding, and another
+    # for the model weights).  Removing the preexisting ones was the
+    # motivation for the clear_extraneous_savers option, but it turns out that
+    # there are edge cases where that option breaks the graph.  Until that is
+    # resolved, we just leave the option set to False for now.
+    # TODO(soergel): Reinstate clear_extraneous_savers=True when possible.
+    meta_graph_def = saver.export_meta_graph(
+        clear_devices=clear_devices, strip_default_attrs=strip_default_attrs)
+
+    # Tag the meta graph def and add it to the SavedModel.
+    self._tag_and_add_meta_graph(meta_graph_def, tags, signature_def_map)
+
+    # Mark this instance of SavedModel as having saved variables, such that
+    # subsequent attempts to save variables will fail.
+    self._has_saved_variables = True
+
+  add_meta_graph.__doc__ = _SavedModelBuilder.add_meta_graph.__doc__.replace(
+      "assets_list", "assets_collection")
+  add_meta_graph_and_variables.__doc__ = \
+      _SavedModelBuilder.add_meta_graph_and_variables.__doc__.replace(
+          "assets_list", "assets_collection")
+
+
+def _maybe_save_assets(write_fn, assets_to_add=None):
   """Saves assets to the meta graph.
 
   Args:
-    assets_collection_to_add: The collection where the asset paths are setup.
+    write_fn: A function callback that writes asset into meta graph.
+    assets_to_add: The list where the asset paths are setup.
 
   Returns:
     A dict of asset basenames for saving to the original full path to the asset.
@@ -486,25 +625,25 @@ def _maybe_save_assets(assets_collection_to_add=None):
   # Map of target file names to original filenames
   asset_filename_map = {}
 
-  if assets_collection_to_add is None:
+  if assets_to_add is None:
     tf_logging.info("No assets to save.")
     return asset_filename_map
 
-  # Iterate over the supplied asset collection, build the `AssetFile` proto
-  # and add them to the collection with key `constants.ASSETS_KEY`, in the
-  # graph.
-  for asset_tensor in assets_collection_to_add:
+  # Iterate over the supplied assets, build the `AssetFile` proto and add them
+  # to the meta graph.
+  for asset_tensor in assets_to_add:
     asset_source_filepath = _asset_path_from_tensor(asset_tensor)
     if not asset_source_filepath:
       raise ValueError("Invalid asset filepath tensor %s" % asset_tensor)
 
-    asset_filename = _get_asset_filename_to_add(
+    asset_filename = get_asset_filename_to_add(
         asset_source_filepath, asset_filename_map)
 
-    # Build `AssetFile` proto and add it to the asset collection in the graph.
+    # Call the passed-in function that builds AssetFileDef proto and adds it
+    # to either the collection or asset_file_def field of the meta graph.
     # Note that this should be done even when the file is a duplicate of an
     # already-added file, as the tensor reference should still exist.
-    _add_asset_to_collection(asset_filename, asset_tensor)
+    write_fn(asset_filename, asset_tensor)
 
     # In the cases where we are adding a duplicate, this will result in the
     # last of the filepaths being the one used for copying the file to the
@@ -516,7 +655,7 @@ def _maybe_save_assets(assets_collection_to_add=None):
   return asset_filename_map
 
 
-def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
+def get_asset_filename_to_add(asset_filepath, asset_filename_map):
   """Get a unique basename to add to the SavedModel if this file is unseen.
 
   Assets come from users as full paths, and we save them out to the
@@ -542,7 +681,7 @@ def _get_asset_filename_to_add(asset_filepath, asset_filename_map):
 
   other_asset_filepath = asset_filename_map[asset_filename]
   if other_asset_filepath == asset_filepath:
-    # This is the same file, stored twice in the collection list. No need
+    # This is the same file, stored twice in the list. No need
     # to make unique.
     return asset_filename
 
@@ -589,6 +728,41 @@ def _asset_path_from_tensor(path_tensor):
   return str_values[0]
 
 
+def _add_asset_to_metagraph(meta_graph_def, asset_filename, asset_tensor):
+  """Builds an asset proto and adds it to the meta graph def.
+
+  Args:
+    meta_graph_def: The meta graph def to which the asset will be added.
+    asset_filename: The filename of the asset to be added.
+    asset_tensor: The asset tensor used to populate the tensor info of the asset
+      proto.
+  """
+  asset_proto = meta_graph_def.asset_file_def.add()
+  asset_proto.filename = asset_filename
+  asset_proto.tensor_info.name = asset_tensor.name
+
+
+def copy_assets_to_destination_dir(asset_filename_map, destination_dir):
+  """Copy all assets from source path to destination path."""
+  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
+      destination_dir)
+
+  # Copy each asset from source path to destination path.
+  for asset_basename, asset_source_filepath in asset_filename_map.items():
+    asset_destination_filepath = os.path.join(
+        compat.as_bytes(assets_destination_dir),
+        compat.as_bytes(asset_basename))
+
+    # Only copy the asset file to the destination if it does not already
+    # exist. This is to ensure that an asset with the same name defined as
+    # part of multiple graphs is only copied the first time.
+    if not file_io.file_exists(asset_destination_filepath):
+      file_io.copy(asset_source_filepath, asset_destination_filepath)
+
+  tf_logging.info("Assets written to: %s",
+                  compat.as_text(assets_destination_dir))
+
+
 def _add_asset_to_collection(asset_filename, asset_tensor):
   """Builds an asset proto and adds it to the asset collection of the graph.
 
@@ -604,3 +778,8 @@ def _add_asset_to_collection(asset_filename, asset_tensor):
   asset_any_proto = Any()
   asset_any_proto.Pack(asset_proto)
   ops.add_to_collection(constants.ASSETS_KEY, asset_any_proto)
+
+
+def _add_op_to_signature_def_map(signature_def_map, op, key):
+  if op is not None:
+    signature_def_map[key] = signature_def_utils.op_signature_def(op, key)
diff --git a/tensorflow/python/saved_model/constants.py b/tensorflow/python/saved_model/constants.py
index 0addbdc9686316bb27f6eaf65006daf4e88a4706..90511a409ed7eb34bede12f1ce9d665e0f1cc913 100644
--- a/tensorflow/python/saved_model/constants.py
+++ b/tensorflow/python/saved_model/constants.py
@@ -29,6 +29,9 @@ tf_export(
         "saved_model.ASSETS_DIRECTORY", "saved_model.constants.ASSETS_DIRECTORY"
     ]).export_constant(__name__, "ASSETS_DIRECTORY")
 
+# Subdirectory name containing unmanaged files from higher-level APIs.
+EXTRA_ASSETS_DIRECTORY = "assets.extra"
+
 # CollectionDef key containing SavedModel assets.
 ASSETS_KEY = "saved_model_assets"
 tf_export(
@@ -40,7 +43,6 @@ tf_export(
 # CollectionDef key for the legacy init op.
 LEGACY_INIT_OP_KEY = "legacy_init_op"
 tf_export(
-    "saved_model.LEGACY_INIT_OP_KEY",
     v1=[
         "saved_model.LEGACY_INIT_OP_KEY",
         "saved_model.constants.LEGACY_INIT_OP_KEY"
@@ -49,13 +51,12 @@ tf_export(
 # CollectionDef key for the SavedModel main op.
 MAIN_OP_KEY = "saved_model_main_op"
 tf_export(
-    "saved_model.MAIN_OP_KEY",
     v1=["saved_model.MAIN_OP_KEY",
         "saved_model.constants.MAIN_OP_KEY"]).export_constant(
             __name__, "MAIN_OP_KEY")
 
 # CollectionDef key for the SavedModel train op.
-# Not exported while export_all_saved_models is in contrib.
+# Not exported while export_all_saved_models is experimental.
 TRAIN_OP_KEY = "saved_model_train_op"
 
 # Schema version for SavedModel.
@@ -106,3 +107,8 @@ tf_export(
         "saved_model.VARIABLES_FILENAME",
         "saved_model.constants.VARIABLES_FILENAME"
     ]).export_constant(__name__, "VARIABLES_FILENAME")
+
+# The initialization and train ops for a MetaGraph are stored in the
+# signature def map. The ops are added to the map with the following keys.
+INIT_OP_SIGNATURE_KEY = "__saved_model_init_op"
+TRAIN_OP_SIGNATURE_KEY = "__saved_model_train_op"
diff --git a/tensorflow/python/saved_model/load.py b/tensorflow/python/saved_model/load.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3095f4ee5e09ae0973164acc748e2d922e8a991
--- /dev/null
+++ b/tensorflow/python/saved_model/load.py
@@ -0,0 +1,96 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Import a checkpointable object from a SavedModel."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import loader_impl
+from tensorflow.python.saved_model import saved_object_graph_pb2
+from tensorflow.python.saved_model import utils_impl as saved_model_utils
+from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.util import compat
+
+
+class _Loader(object):
+  """Helper class to load an object-based SavedModel."""
+
+  def __init__(self, object_graph_proto, saved_model_proto, export_dir):
+    self._asset_file_def = saved_model_proto.meta_graphs[0].asset_file_def
+    self._proto = object_graph_proto
+    self._export_dir = export_dir
+    self._load_all()
+
+  def _load_all(self):
+    self._nodes = [self._recreate(proto) for proto in self._proto.nodes]
+    # After creating the objects, construct the edges between the objects.
+    for obj, object_proto in zip(self._nodes, self._proto.nodes):
+      for reference in object_proto.children:
+        setattr(obj, reference.local_name, self._nodes[reference.node_id])
+
+  def get(self, node_id):
+    return self._nodes[node_id]
+
+  def _recreate(self, proto):
+    factory = {
+        "user_object": lambda: self._recreate_user_object(proto.user_object),
+        "asset": lambda: self._recreate_asset(proto.asset),
+    }
+    kind = proto.WhichOneof("kind")
+    if kind not in factory:
+      raise ValueError("Unknown SavedObject type: %r" % kind)
+    return factory[kind]()
+
+  def _recreate_user_object(self, proto):
+    del proto
+    return tracking.Checkpointable()
+
+  def _recreate_asset(self, proto):
+    filename = os.path.join(
+        saved_model_utils.get_assets_dir(self._export_dir),
+        self._asset_file_def[proto.asset_file_def_index].filename)
+    return tracking.TrackableAsset(filename)
+
+
+def _load_saved_object_graph_proto(filename):
+  with file_io.FileIO(filename, "rb") as f:
+    contents = f.read()
+    return saved_object_graph_pb2.SavedObjectGraph.FromString(contents)
+
+
+def load(export_dir):
+  """Load a SavedModel from `export_dir`."""
+  saved_model_proto = loader_impl.parse_saved_model(export_dir)
+  object_graph_filename = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY),
+      compat.as_bytes("object_graph.pb"))
+  if file_io.file_exists(object_graph_filename):
+    object_graph_proto = _load_saved_object_graph_proto(object_graph_filename)
+    loader = _Loader(object_graph_proto,
+                     saved_model_proto,
+                     export_dir)
+    root = loader.get(0)
+  else:
+    raise NotImplementedError(
+        "Currently only SavedModels exported with `tf.saved_model.save` may be "
+        "imported. Other SavedModels may eventually be supported via load().")
+  # TODO(allenl): load functions from the SavedModel into the eager context
+  return root
diff --git a/tensorflow/python/saved_model/load_test.py b/tensorflow/python/saved_model/load_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2971101cdb5ae93613df65f0379866244a7a3fe
--- /dev/null
+++ b/tensorflow/python/saved_model/load_test.py
@@ -0,0 +1,101 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for checkpointable object SavedModel loading."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import tempfile
+
+from tensorflow.python.eager import def_function
+from tensorflow.python.eager import test
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import tensor_spec
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.saved_model import load
+from tensorflow.python.saved_model import save
+from tensorflow.python.training.checkpointable import tracking
+
+
+class LoadTest(test.TestCase):
+
+  def test_structure_import(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.dep_one = tracking.Checkpointable()
+    root.dep_two = tracking.Checkpointable()
+    root.dep_two.dep = tracking.Checkpointable()
+    root.dep_three = root.dep_two.dep
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    imported = load.load(save_dir)
+    self.assertIs(imported.dep_three, imported.dep_two.dep)
+    self.assertIsNot(imported.dep_one, imported.dep_two)
+
+  def _make_asset(self, contents):
+    filename = tempfile.mktemp(prefix=self.get_temp_dir())
+    with open(filename, "w") as f:
+      f.write(contents)
+    return filename
+
+  def test_assets_import(self):
+    file1 = self._make_asset("contents 1")
+    file2 = self._make_asset("contents 2")
+
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset1 = tracking.TrackableAsset(file1)
+    root.asset2 = tracking.TrackableAsset(file2)
+
+    save_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, save_dir)
+
+    file_io.delete_file(file1)
+    file_io.delete_file(file2)
+    load_dir = os.path.join(self.get_temp_dir(), "load_dir")
+    file_io.rename(save_dir, load_dir)
+
+    imported = load.load(load_dir)
+    with open(imported.asset1.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 1", f.read())
+    with open(imported.asset2.asset_path.numpy(), "r") as f:
+      self.assertEquals("contents 2", f.read())
+
+  def test_assets_dedup(self):
+    vocab = self._make_asset("contents")
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+
+    root.asset1 = tracking.TrackableAsset(vocab)
+    root.asset2 = tracking.TrackableAsset(vocab)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
+    imported = load.load(export_dir)
+
+    self.assertEqual(imported.asset1.asset_path.numpy(),
+                     imported.asset2.asset_path.numpy())
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/saved_model/loader_impl.py b/tensorflow/python/saved_model/loader_impl.py
index 8c8eaf038a1b908e48ee7ad23a48d064f06102ca..e5be03aae4905f4465ac87590da610a7d46e2ae4 100644
--- a/tensorflow/python/saved_model/loader_impl.py
+++ b/tensorflow/python/saved_model/loader_impl.py
@@ -31,6 +31,7 @@ from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import signature_def_utils
 from tensorflow.python.saved_model import utils_impl as saved_model_utils
 from tensorflow.python.training import saver as tf_saver
 from tensorflow.python.util import compat
@@ -38,7 +39,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-def _parse_saved_model(export_dir):
+def parse_saved_model(export_dir):
   """Reads the savedmodel.pb or savedmodel.pbtxt file containing `SavedModel`.
 
   Args:
@@ -82,6 +83,11 @@ def _parse_saved_model(export_dir):
                    constants.SAVED_MODEL_FILENAME_PB))
 
 
+# TODO(b/120594573): Make this symbol also available as private, so that
+# tensorflow_transform and tensorflow_estimator do not break.
+_parse_saved_model = parse_saved_model
+
+
 def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   """Gets the asset tensors, if defined in the meta graph def to load.
 
@@ -99,22 +105,29 @@ def _get_asset_tensors(export_dir, meta_graph_def_to_load, import_scope=None):
   collection_def = meta_graph_def_to_load.collection_def
 
   asset_tensor_dict = {}
-  if constants.ASSETS_KEY in collection_def:
-    # Location of the assets for SavedModel.
-    assets_directory = os.path.join(
-        compat.as_bytes(export_dir),
-        compat.as_bytes(constants.ASSETS_DIRECTORY))
+  asset_protos = []
+
+  if meta_graph_def_to_load.asset_file_def:
+    asset_protos = meta_graph_def_to_load.asset_file_def
+  elif constants.ASSETS_KEY in collection_def:
     assets_any_proto = collection_def[constants.ASSETS_KEY].any_list.value
-    # Process each asset and add it to the asset tensor dictionary.
     for asset_any_proto in assets_any_proto:
       asset_proto = meta_graph_pb2.AssetFileDef()
       asset_any_proto.Unpack(asset_proto)
-      tensor_name = asset_proto.tensor_info.name
-      if import_scope:
-        tensor_name = "%s/%s" % (import_scope, tensor_name)
-      asset_tensor_dict[tensor_name] = os.path.join(
-          compat.as_bytes(assets_directory),
-          compat.as_bytes(asset_proto.filename))
+      asset_protos.append(asset_proto)
+
+  # Location of the assets for SavedModel.
+  assets_directory = os.path.join(
+      compat.as_bytes(export_dir), compat.as_bytes(constants.ASSETS_DIRECTORY))
+  # Process each asset and add it to the asset tensor dictionary.
+  for asset_proto in asset_protos:
+    tensor_name = asset_proto.tensor_info.name
+    if import_scope:
+      tensor_name = "%s/%s" % (import_scope, tensor_name)
+    asset_tensor_dict[tensor_name] = os.path.join(
+        compat.as_bytes(assets_directory),
+        compat.as_bytes(asset_proto.filename))
+
   return asset_tensor_dict
 
 
@@ -134,23 +147,53 @@ def _get_main_op_tensor(
     RuntimeError: If the collection def corresponding to the main op key has
         other than exactly one tensor.
   """
+  # TODO(kathywu): Rename this method to _get_op_from_collection when
+  # dependency from SavedModelEstimator is removed.
   collection_def = meta_graph_def_to_load.collection_def
-  main_op_tensor = None
+  init_op = None
   if init_op_key in collection_def:
-    main_ops = collection_def[init_op_key].node_list.value
-    if len(main_ops) != 1:
-      raise RuntimeError("Expected exactly one SavedModel main op. "
-                         "Found: {}".format(main_ops))
-    main_op_tensor = ops.get_collection(init_op_key)[0]
-  return main_op_tensor
+    init_op_list = collection_def[init_op_key].node_list.value
+    if len(init_op_list) != 1:
+      raise RuntimeError("Expected exactly one SavedModel init op. "
+                         "Found: {}".format(init_op_list))
+    init_op = ops.get_collection(init_op_key)[0]
+  return init_op
 
 
-@tf_export(
+def _get_op_from_collection(meta_graph_def, op_key):
+  return _get_main_op_tensor(meta_graph_def, op_key)
+
+
+def _get_op_from_signature_def(meta_graph_def, op_signature_key, import_scope):
+  """Retrieve op stored in the imported meta graph's signature def."""
+  if op_signature_key in meta_graph_def.signature_def:
+    return signature_def_utils.load_op_from_signature_def(
+        meta_graph_def.signature_def[op_signature_key], op_signature_key,
+        import_scope)
+  else:
+    return None
+
+
+def get_init_op(meta_graph_def, import_scope=None):
+  return (_get_op_from_signature_def(
+      meta_graph_def, constants.INIT_OP_SIGNATURE_KEY, import_scope) or
+          _get_op_from_collection(meta_graph_def, constants.MAIN_OP_KEY) or
+          _get_op_from_collection(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
+
+
+def get_train_op(meta_graph_def, import_scope=None):
+  train_op = _get_op_from_signature_def(
+      meta_graph_def, constants.TRAIN_OP_SIGNATURE_KEY, import_scope)
+  if train_op is None:
+    train_op = _get_op_from_collection(meta_graph_def, constants.TRAIN_OP_KEY)
+  return train_op
+
+
+@tf_export(v1=[
+    "saved_model.contains_saved_model",
     "saved_model.maybe_saved_model_directory",
-    v1=[
-        "saved_model.maybe_saved_model_directory",
-        "saved_model.loader.maybe_saved_model_directory"
-    ])
+    "saved_model.loader.maybe_saved_model_directory"
+])
 @deprecation.deprecated_endpoints(
     "saved_model.loader.maybe_saved_model_directory")
 def maybe_saved_model_directory(export_dir):
@@ -173,6 +216,25 @@ def maybe_saved_model_directory(export_dir):
   return file_io.file_exists(txt_path) or file_io.file_exists(pb_path)
 
 
+@tf_export("saved_model.contains_saved_model", v1=[])
+def contains_saved_model(export_dir):
+  """Checks whether the provided export directory could contain a SavedModel.
+
+  Note that the method does not load any data by itself. If the method returns
+  `false`, the export directory definitely does not contain a SavedModel. If the
+  method returns `true`, the export directory may contain a SavedModel but
+  provides no guarantee that it can be loaded.
+
+  Args:
+    export_dir: Absolute string path to possible export location. For example,
+                '/my/foo/model'.
+
+  Returns:
+    True if the export directory contains SavedModel files, False otherwise.
+  """
+  return maybe_saved_model_directory(export_dir)
+
+
 @tf_export(v1=["saved_model.load", "saved_model.loader.load"])
 @deprecation.deprecated(
     None,
@@ -219,7 +281,7 @@ class SavedModelLoader(object):
     """
     self._export_dir = export_dir
     self._variables_path = saved_model_utils.get_variables_path(export_dir)
-    self._saved_model = _parse_saved_model(export_dir)
+    self._saved_model = parse_saved_model(export_dir)
 
   @property
   def export_dir(self):
@@ -334,11 +396,9 @@ class SavedModelLoader(object):
       asset_tensors_dictionary = _get_asset_tensors(
           self._export_dir, meta_graph_def, import_scope=import_scope)
 
-      main_op_tensor = (
-          _get_main_op_tensor(meta_graph_def, constants.MAIN_OP_KEY) or
-          _get_main_op_tensor(meta_graph_def, constants.LEGACY_INIT_OP_KEY))
-      if main_op_tensor is not None:
-        sess.run(fetches=[main_op_tensor], feed_dict=asset_tensors_dictionary)
+      init_op = get_init_op(meta_graph_def, import_scope)
+      if init_op is not None:
+        sess.run(fetches=[init_op], feed_dict=asset_tensors_dictionary)
 
   def load(self, sess, tags, import_scope=None, **saver_kwargs):
     """Load the MetaGraphDef graph and restore variable values into the session.
diff --git a/tensorflow/python/saved_model/loader_test.py b/tensorflow/python/saved_model/loader_test.py
index 924b2e7c0655130df9c0f7c5fe7742fc5ebaddc6..3e27c0801cd43eb43d1e0636f8aac1b1bc054485 100644
--- a/tensorflow/python/saved_model/loader_test.py
+++ b/tensorflow/python/saved_model/loader_test.py
@@ -19,11 +19,14 @@ from __future__ import division
 from __future__ import print_function
 
 import os
+import shutil
+
+from absl.testing import parameterized
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
-from tensorflow.python.lib.io import file_io
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variables
@@ -42,55 +45,74 @@ SIMPLE_ADD_SAVED_MODEL = _get_export_dir("simple_add_saved_model")
 SAVED_MODEL_WITH_MAIN_OP = _get_export_dir("saved_model_with_main_op")
 
 
-class SavedModelLoaderTest(test.TestCase):
-
-  def setUp(self):
-    """Write test SavedModels to a temp directory."""
-    with session.Session(graph=ops.Graph()) as sess:
-      x = variables.VariableV1(5, name="x")
-      y = variables.VariableV1(11, name="y")
-      z = x + y
-      sess.run(variables.global_variables_initializer())
-
-      foo_sig_def = signature_def_utils.build_signature_def(
-          {"foo_input": utils.build_tensor_info(x)},
-          {"foo_output": utils.build_tensor_info(z)})
-      bar_sig_def = signature_def_utils.build_signature_def(
-          {"bar_x": utils.build_tensor_info(x),
-           "bar_y": utils.build_tensor_info(y)},
-          {"bar_z": utils.build_tensor_info(z)})
-
-      builder = saved_model_builder.SavedModelBuilder(SIMPLE_ADD_SAVED_MODEL)
-      builder.add_meta_graph_and_variables(
-          sess, ["foo_graph"], {"foo": foo_sig_def, "bar": bar_sig_def})
+def build_graph_helper():
+  g = ops.Graph()
+  with g.as_default():
+    x = variables.VariableV1(5, name="x")
+    y = variables.VariableV1(11, name="y")
+    z = x + y
+
+    foo_sig_def = signature_def_utils.build_signature_def({
+        "foo_input": utils.build_tensor_info(x)
+    }, {"foo_output": utils.build_tensor_info(z)})
+    bar_sig_def = signature_def_utils.build_signature_def({
+        "bar_x": utils.build_tensor_info(x),
+        "bar_y": utils.build_tensor_info(y)
+    }, {"bar_z": utils.build_tensor_info(z)})
+  return g, {"foo": foo_sig_def, "bar": bar_sig_def}, y
+
+
+@parameterized.parameters((saved_model_builder.SavedModelBuilder,),
+                          (saved_model_builder._SavedModelBuilder,))
+class SavedModelLoaderTest(test.TestCase, parameterized.TestCase):
+
+  def export_simple_graph(self, builder_cls):
+    g, sig_def_map, _ = build_graph_helper()
+    with session.Session(graph=g) as sess:
+      self.evaluate(variables.global_variables_initializer())
+      builder = builder_cls(SIMPLE_ADD_SAVED_MODEL)
+      builder.add_meta_graph_and_variables(sess, ["foo_graph"], sig_def_map)
       builder.save()
 
-      # Write SavedModel with a main_op
+  def export_graph_with_main_op(self, builder_cls):
+    g, sig_def_map, y = build_graph_helper()
+    with session.Session(graph=g) as sess:
+      self.evaluate(variables.global_variables_initializer())
       assign_op = control_flow_ops.group(state_ops.assign(y, 7))
 
-      builder = saved_model_builder.SavedModelBuilder(SAVED_MODEL_WITH_MAIN_OP)
-      builder.add_meta_graph_and_variables(
-          sess, ["foo_graph"], {"foo": foo_sig_def, "bar": bar_sig_def},
-          main_op=assign_op)
+      builder = builder_cls(SAVED_MODEL_WITH_MAIN_OP)
+
+      if builder_cls == saved_model_builder._SavedModelBuilder:
+        builder.add_meta_graph_and_variables(
+            sess, ["foo_graph"], sig_def_map, init_op=assign_op)
+      else:
+        builder.add_meta_graph_and_variables(
+            sess, ["foo_graph"], sig_def_map, main_op=assign_op)
       builder.save()
 
   def tearDown(self):
-    file_io.delete_recursively(test.get_temp_dir())
+    super(SavedModelLoaderTest, self).tearDown()
+    shutil.rmtree(test.get_temp_dir(), ignore_errors=True)
 
-  def test_load_function(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_function(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
+    self.export_graph_with_main_op(builder_cls)
     loader2 = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       loader2.load(sess, ["foo_graph"])
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_load_graph(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_graph(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     graph = ops.Graph()
     loader.load_graph(graph, ["foo_graph"])
@@ -101,14 +123,16 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaises(KeyError):
       graph.get_tensor_by_name("z:0")
 
-    with self.session(graph=graph) as sess:
+    with self.session(graph=graph):
       # Check that x and y are not initialized
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(x)
+        self.evaluate(x)
       with self.assertRaises(errors.FailedPreconditionError):
-        sess.run(y)
+        self.evaluate(y)
 
-  def test_load_with_import_scope(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_with_import_scope(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       saver, _ = loader.load_graph(
@@ -119,7 +143,13 @@ class SavedModelLoaderTest(test.TestCase):
         loader.restore_variables(sess, tf_saver.Saver())
 
       loader.restore_variables(sess, saver)
-      loader.run_init_ops(sess, ["foo_graph"])
+
+      if builder_cls == saved_model_builder._SavedModelBuilder:
+        with self.assertRaises(errors.NotFoundError):
+          loader.run_init_ops(sess, ["foo_graph"])
+        loader.run_init_ops(sess, ["foo_graph"], import_scope="baz")
+      else:
+        loader.run_init_ops(sess, ["foo_graph"])
 
       self.assertEqual(5, sess.graph.get_tensor_by_name("baz/x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("baz/y:0").eval())
@@ -131,23 +161,27 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("baa/x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("baa/y:0").eval())
 
-  def test_restore_variables(self):
+  @test_util.run_deprecated_v1
+  def test_restore_variables(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     with self.session(graph=ops.Graph()) as sess:
       x = variables.VariableV1(0, name="x")
       y = variables.VariableV1(0, name="y")
       z = x * y
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       # There are variables to restore, so a saver must be created.
       with self.assertRaises(ValueError):
         loader.restore_variables(sess, None)
 
       loader.restore_variables(sess, tf_saver.Saver())
-      self.assertEqual(55, z.eval())
+      self.assertEqual(55, self.evaluate(z))
 
-  def test_run_init_op(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_run_init_op(self, builder_cls):
+    self.export_graph_with_main_op(builder_cls)
     loader = loader_impl.SavedModelLoader(SAVED_MODEL_WITH_MAIN_OP)
     graph = ops.Graph()
     saver, _ = loader.load_graph(graph, ["foo_graph"])
@@ -160,14 +194,16 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(7, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_parse_saved_model(self):
+  def test_parse_saved_model(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     meta_graph = loader.get_meta_graph_def_from_tags(["foo_graph"])
     self.assertIsNotNone(meta_graph)
     self.assertIn("foo", meta_graph.signature_def)
     self.assertIn("bar", meta_graph.signature_def)
 
-  def test_load_invalid_meta_graph(self):
+  def test_load_invalid_meta_graph(self, builder_cls):
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     with self.assertRaises(RuntimeError):
       loader.get_meta_graph_def_from_tags([])
@@ -176,13 +212,17 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaises(RuntimeError):
       loader.get_meta_graph_def_from_tags(["not_a_graph"])
 
-  def test_load_saved_model_with_no_variables(self):
+  @test_util.run_v1_only("b/120545219")
+  def test_load_saved_model_with_no_variables(self, builder_cls):
     """Test that SavedModel runs saver when there appear to be no variables.
 
     When no variables are detected, this may mean that the variables were saved
     to different collections, or the collections weren't saved to the
     SavedModel. If the SavedModel MetaGraphDef contains a saver, it should still
     run in either of these cases.
+
+    Args:
+      builder_cls: SavedModelBuilder or _SavedModelBuilder class
     """
     path = _get_export_dir("no_variable_saved_model")
     with session.Session(graph=ops.Graph()) as sess:
@@ -192,7 +232,7 @@ class SavedModelLoaderTest(test.TestCase):
           11, name="y", collections=["not_global_variable"])
       self.assertFalse(variables._all_saveable_objects())
       z = x + y
-      sess.run(variables.variables_initializer([x, y]))
+      self.evaluate(variables.variables_initializer([x, y]))
 
       foo_sig_def = signature_def_utils.build_signature_def(
           {"foo_input": utils.build_tensor_info(x)},
@@ -215,8 +255,9 @@ class SavedModelLoaderTest(test.TestCase):
       self.assertEqual(5, sess.graph.get_tensor_by_name("x:0").eval())
       self.assertEqual(11, sess.graph.get_tensor_by_name("y:0").eval())
 
-  def test_load_saved_model_graph_with_return_elements(self):
+  def test_load_saved_model_graph_with_return_elements(self, builder_cls):
     """Ensure that the correct elements are returned."""
+    self.export_simple_graph(builder_cls)
     loader = loader_impl.SavedModelLoader(SIMPLE_ADD_SAVED_MODEL)
     graph = ops.Graph()
     _, ret = loader.load_graph(graph, ["foo_graph"],
@@ -228,5 +269,6 @@ class SavedModelLoaderTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "not found in graph"):
       loader.load_graph(graph, ["foo_graph"], return_elements=["z:0"])
 
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/save.py b/tensorflow/python/saved_model/save.py
index 63575f631eb0c0197c84d1598515782836b58b4d..ab6fcb7196fcc243d69b53b595b53b0dd00071f4 100644
--- a/tensorflow/python/saved_model/save.py
+++ b/tensorflow/python/saved_model/save.py
@@ -19,39 +19,82 @@ from __future__ import division
 from __future__ import print_function
 
 import collections
+import functools
 import os
 
+from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.core.protobuf import saved_model_pb2
 from tensorflow.python.eager import context
 from tensorflow.python.eager import def_function
-from tensorflow.python.eager import function
+from tensorflow.python.eager import function as defun
+from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import tensor_spec
 from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.saved_model import builder_impl
 from tensorflow.python.saved_model import constants
+from tensorflow.python.saved_model import saved_object_graph_pb2
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.saved_model import utils_impl
 from tensorflow.python.training.checkpointable import base
+from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
 from tensorflow.python.util import compat
 from tensorflow.python.util import nest
 from tensorflow.python.util.tf_export import tf_export
 
 
+def _check_for_functional_keras_model(root):
+  """Makes an export signature for `root` if it's a functional Keras Model."""
+  # If nothing is decorated yet but this is a functional Keras Model (duck
+  # typed), we'll try to make a signature ourselves.
+  try:
+    inputs = root.inputs
+    input_names = root.input_names
+  except AttributeError:
+    return None
+  input_signature = []
+  for input_tensor, input_name in zip(inputs, input_names):
+    input_signature.append(tensor_spec.TensorSpec(
+        shape=input_tensor.shape, dtype=input_tensor.dtype,
+        name=input_name))
+
+  @def_function.function(input_signature=input_signature)
+  def _wrapped_model(*args):
+    outputs_list = nest.flatten(root(inputs=list(args)))
+    return {name: output for name, output
+            in zip(root.output_names, outputs_list)}
+  return _wrapped_model
+
+
 def _find_function_to_export(root):
   """Iterate over `root`'s attributes, finding traced functions."""
-  functions = []
-  function_attribute_names = []
+  exported_function = None
+  previous_attribute_name = None
   for attribute_name in dir(root):
     attribute_value = getattr(root, attribute_name, None)
     if isinstance(attribute_value, def_function.PolymorphicFunction):
-      functions.append(attribute_value)
-      function_attribute_names.append(attribute_name)
-  # TODO(allenl): Automatically infer signatures for Keras functional models?
-  if not functions:
+      if exported_function is not None:
+        raise ValueError(
+            ("Exporting an object with no "
+             "tf.saved_model.save(..., signatures=...) "
+             "argument specified, and with more than one "
+             "@tf.function-decorated method attached to it: {}. The signature "
+             "keys for these functions are ambiguous. Specify signature "
+             "functions explicitly.").format(
+                 [previous_attribute_name, attribute_name]))
+      exported_function = attribute_value
+      previous_attribute_name = attribute_name
+  if exported_function is None:
+    exported_function = _check_for_functional_keras_model(root)
+  if exported_function is None:
     raise ValueError(
         ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
          "argument specified, and with no @tf.function-decorated methods "
@@ -60,14 +103,7 @@ def _find_function_to_export(root):
          "signatures does not make sense, as the only consumers will expect "
          "signatures. Either decorate a method or specify a signature function "
          "explicitly."))
-  elif len(functions) > 1:
-    raise ValueError(
-        ("Exporting an object with no tf.saved_model.save(..., signatures=...) "
-         "argument specified, and with more than one @tf.function-decorated "
-         "method attached to it: {}. The signature keys for these functions "
-         "are ambiguous. Specify signature functions explicitly.").format(
-             function_attribute_names))
-  return functions[0]
+  return exported_function
 
 
 def _canonicalize_signatures(signatures):
@@ -77,7 +113,7 @@ def _canonicalize_signatures(signatures):
         signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY: signatures}
   concrete_signatures = {}
   for serving_key, signature_function in signatures.items():
-    if isinstance(signature_function, (function.PolymorphicFunction,
+    if isinstance(signature_function, (defun.PolymorphicFunction,
                                        def_function.PolymorphicFunction)):
       input_signature = signature_function._input_signature  # pylint: disable=protected-access
       if input_signature is None:
@@ -88,7 +124,7 @@ def _canonicalize_signatures(signatures):
              "converted to concrete functions using "
              "`f.get_concrete_function(...)`.").format(signature_function))
       signature_function = signature_function.get_concrete_function()
-    elif not isinstance(signature_function, function.Function):
+    elif not isinstance(signature_function, defun.Function):
       raise ValueError(
           ("Expected a TensorFlow function to generate a signature for, but "
            "got {}. Python functions may be decorated with "
@@ -145,64 +181,65 @@ def _tensor_dict_to_tensorinfo(tensor_dict):
           for key, value in tensor_dict.items()}
 
 
-def _map_captured_resources_to_created_resources(
+def _map_captures_to_created_tensors(
     original_captures, resource_map):
-  """Maps eager resources captured by a function to Graph resources for export.
+  """Maps eager tensors captured by a function to Graph resources for export.
 
   Args:
-    original_captures: A dictionary mapping from resource tensors captured by
-      the function to interior placeholders for those resources (inside the
-      function body).
+    original_captures: A dictionary mapping from tensors captured by the
+      function to interior placeholders for those tensors (inside the function
+      body).
     resource_map: A dictionary mapping from resource tensors owned by the eager
       context to resource tensors in the exported graph.
 
   Returns:
-    A dictionary mapping from interior placeholders in the function body to
-    exterior stand-in resource tensors which belong to the exported graph.
+    A list of stand-in tensors which belong to the exported graph, corresponding
+    to the function's captures.
 
   Raises:
     AssertionError: If the function references a resource which is not part of
       `resource_map`.
   """
-  export_captures = {}
+  export_captures = []
   for exterior, interior in original_captures.items():
     mapped_resource = resource_map.get(exterior, None)
     if mapped_resource is None:
-      raise AssertionError(
-          ("Tried to export a function which references untracked stateful "
-           "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
-           "be tracked by the main object. Objects may be tracked by "
-           "assigning them to an attribute of another tracked object, or to "
-           "an attribute of the main object directly.")
-          .format(interior))
-    export_captures[interior] = mapped_resource
+      if exterior.dtype == dtypes.resource:
+        raise AssertionError(
+            ("Tried to export a function which references untracked stateful "
+             "object {}. Stateful TensorFlow objects (e.g. tf.Variable) must "
+             "be tracked by the main object. Objects may be tracked by "
+             "assigning them to an attribute of another tracked object, or to "
+             "an attribute of the main object directly.")
+            .format(interior))
+      else:
+        # This is a captured Tensor, but it's not a resource. We'll just add it
+        # to the graph as a constant.
+        mapped_resource = constant_op.constant(exterior.numpy())
+    export_captures.append(mapped_resource)
   return export_captures
 
 
-def _map_function_inputs_to_created_inputs(
-    function_inputs, export_captures, signature_key, function_name):
-  """Creates exterior placeholders in the exported graph for function inputs.
+def _map_function_arguments_to_created_inputs(
+    function_arguments, signature_key, function_name):
+  """Creates exterior placeholders in the exported graph for function arguments.
 
   Functions have two types of inputs: tensors captured from the outside (eager)
   context, and arguments to the function which we expect to receive from the
-  user at each call. `_map_captured_resources_to_created_resources` replaces
+  user at each call. `_map_captures_to_created_tensors` replaces
   captured tensors with stand-ins (typically these are resource dtype tensors
   associated with variables). `_map_function_inputs_to_created_inputs` runs over
-  every input, either captured or argument. For captures, it uses the mapped
-  resource from `export_captures`. For arguments, it creates a new placeholder
-  which will belong to the exported graph rather than the function body.
+  every argument, creating a new placeholder for each which will belong to the
+  exported graph rather than the function body.
 
   Args:
-    function_inputs: A list of all placeholders in the function body.
-    export_captures: A dictionary mapping from interior placeholders in the
-      function body to exterior stand-in resource tensors which belong to the
-      exported graph (see `_map_captured_resources_to_created_resources`).
+    function_arguments: A list of argument placeholders in the function body.
     signature_key: The name of the signature being exported, for error messages.
     function_name: The name of the function, for error messages.
 
   Returns:
     A tuple of (mapped_inputs, exterior_placeholders)
-      mapped_inputs: A list with entries corresponding to `function_inputs`
+      mapped_inputs: A list with entries corresponding to `function_arguments`
         containing all of the inputs of the function gathered from the exported
         graph (both captured resources and arguments).
       exterior_argument_placeholders: A dictionary mapping from argument names
@@ -220,12 +257,7 @@ def _map_function_inputs_to_created_inputs(
   # MetaGraph.
   exterior_argument_placeholders = {}
   mapped_inputs = []
-  for placeholder in function_inputs:
-    mapped_resource_tensor = export_captures.get(placeholder, None)
-    if mapped_resource_tensor is not None:
-      # This is a captured resource.
-      mapped_inputs.append(mapped_resource_tensor)
-      continue
+  for placeholder in function_arguments:
     # `export_captures` contains an exhaustive set of captures, so if we don't
     # find the input there then we now know we have an argument.
     user_input_name = compat.as_str_any(
@@ -258,6 +290,20 @@ def _map_function_inputs_to_created_inputs(
   return mapped_inputs, exterior_argument_placeholders
 
 
+def _call_function_with_mapped_captures(function, args, resource_map):
+  """Calls `function` in the exported graph, using mapped resource captures."""
+  export_captures = _map_captures_to_created_tensors(
+      function.graph.captures, resource_map)
+  mapped_inputs = args + export_captures
+  # Calls the function quite directly, since we have new captured resource
+  # tensors we need to feed in which weren't part of the original function
+  # definition.
+  # pylint: disable=protected-access
+  outputs = function._build_call_outputs(
+      function._inference_function.call(context.context(), mapped_inputs))
+  return outputs
+
+
 def _generate_signatures(signature_functions, resource_map):
   """Validates and calls `signature_functions` in the default graph.
 
@@ -287,35 +333,81 @@ def _generate_signatures(signature_functions, resource_map):
     SignatureDefs as part of that MetaGraph.
   """
   signatures = {}
-  for signature_key, func in sorted(signature_functions.items()):
-    # Register the inference function for this signature in the exported
-    # graph. There is no direct use for the gradient of this function, so we
-    # don't generate/register a gradient function here (but may end up with one
-    # if another function relies on it). Users can still take symbolic gradients
-    # of the function on import, the gradient just won't be in the saved
-    # graph. When exporting a signature which already computes gradients, this
-    # stops us from taking needless second-order gradients.
-    func.add_to_graph(register_gradient_functions=False)
-    export_captures = _map_captured_resources_to_created_resources(
-        func.graph.captures, resource_map)
+  for signature_key, function in sorted(signature_functions.items()):
+    if function.graph.captures:
+      argument_inputs = function.graph.inputs[:-len(function.graph.captures)]
+    else:
+      argument_inputs = function.graph.inputs
     mapped_inputs, exterior_argument_placeholders = (
-        _map_function_inputs_to_created_inputs(
-            func.inputs, export_captures, signature_key, func.name))
-    # Calls the function quite directly, since we have new captured resource
-    # tensors we need to feed in which weren't part of the original function
-    # definition.
-    # pylint: disable=protected-access
+        _map_function_arguments_to_created_inputs(
+            argument_inputs, signature_key, function.name))
     outputs = _normalize_outputs(
-        func._build_call_outputs(
-            func._inference_function.call(context.context(), mapped_inputs)),
-        func.name, signature_key)
-    # pylint: enable=protected-access
+        _call_function_with_mapped_captures(
+            function, mapped_inputs, resource_map),
+        function.name, signature_key)
     signatures[signature_key] = signature_def_utils.build_signature_def(
         _tensor_dict_to_tensorinfo(exterior_argument_placeholders),
         _tensor_dict_to_tensorinfo(outputs))
   return signatures
 
 
+def _trace_resource_initializers(accessible_objects):
+  """Create concrete functions from `TrackableResource` objects."""
+  resource_initializers = []
+
+  def _wrap_initializer(obj):
+    obj.initialize()
+    return constant_op.constant(1.)  # Dummy control output
+
+  for obj in accessible_objects:
+    if isinstance(obj, tracking.TrackableResource):
+      resource_initializers.append(def_function.function(
+          functools.partial(_wrap_initializer, obj),
+          # All inputs are captures.
+          input_signature=[]).get_concrete_function())
+  return resource_initializers
+
+
+_AssetInfo = collections.namedtuple(
+    "_AssetInfo", [
+        # List of AssetFileDef protocol buffers
+        "asset_defs",
+        # Map from asset variable resource Tensors to their init ops
+        "asset_initializers_by_resource",
+        # Map from base asset filenames to full paths
+        "asset_filename_map",
+        # Map from TrackableAsset to index of corresponding AssetFileDef
+        "asset_index"])
+
+
+def _process_asset(trackable_asset, asset_info, resource_map):
+  """Add `trackable_asset` to `asset_info` and `resource_map`."""
+  original_variable = trackable_asset.asset_path
+  with context.eager_mode():
+    original_path = original_variable.numpy()
+  path = builder_impl.get_asset_filename_to_add(
+      asset_filepath=original_path,
+      asset_filename_map=asset_info.asset_filename_map)
+  # TODO(andresp): Instead of mapping 1-1 between trackable asset
+  # and asset in the graph def consider deduping the assets that
+  # point to the same file.
+  asset_path_initializer = array_ops.placeholder(
+      shape=original_variable.shape,
+      dtype=dtypes.string,
+      name="asset_path_initializer")
+  asset_variable = resource_variable_ops.ResourceVariable(
+      asset_path_initializer)
+  asset_info.asset_filename_map[path] = original_path
+  asset_def = meta_graph_pb2.AssetFileDef()
+  asset_def.filename = path
+  asset_def.tensor_info.name = asset_path_initializer.name
+  asset_info.asset_defs.append(asset_def)
+  asset_info.asset_initializers_by_resource[original_variable.handle] = (
+      asset_variable.initializer)
+  asset_info.asset_index[trackable_asset] = len(asset_info.asset_defs) - 1
+  resource_map[original_variable.handle] = asset_variable.handle
+
+
 def _map_resources(accessible_objects):
   """Makes new resource handle ops corresponding to existing resource tensors.
 
@@ -329,34 +421,82 @@ def _map_resources(accessible_objects):
       to create replacements for.
 
   Returns:
-    A tuple of (object_map, resource_map):
+    A tuple of (object_map, resource_map, asset_info):
       object_map: A dictionary mapping from object in `accessible_objects` to
         replacement objects created to hold the new resource tensors.
       resource_map: A dictionary mapping from resource tensors extracted from
         `accessible_objects` to newly created resource tensors.
+      asset_info: An _AssetInfo tuple describing external assets referenced from
+        accessible_objects.
   """
-  # TODO(allenl, rohanj): Map generic resources rather than just variables.
   # TODO(allenl): Handle MirroredVariables and other types of variables which
   # may need special casing.
   object_map = {}
   resource_map = {}
+  asset_info = _AssetInfo(
+      asset_defs=[],
+      asset_initializers_by_resource={},
+      asset_filename_map={},
+      asset_index={})
   for obj in accessible_objects:
-    if resource_variable_ops.is_resource_variable(obj):
+    if isinstance(obj, tracking.TrackableResource):
+      new_resource = obj.create_resource()
+      resource_map[obj.resource_handle] = new_resource
+    elif resource_variable_ops.is_resource_variable(obj):
       new_variable = resource_variable_ops.copy_to_graph_uninitialized(obj)
       object_map[obj] = new_variable
       resource_map[obj.handle] = new_variable.handle
-  return object_map, resource_map
+    elif isinstance(obj, tracking.TrackableAsset):
+      _process_asset(obj, asset_info, resource_map)
+  return object_map, resource_map, asset_info
 
 
-def _make_graph_def(root, signature_functions, object_saver):
-  """Generates and exports call ops for `signature_functions`."""
+def _fill_meta_graph_def(meta_graph_def, obj, signature_functions,
+                         object_saver):
+  """Generates a MetaGraph which calls `signature_functions`.
+
+  Args:
+    meta_graph_def: The MetaGraphDef proto to fill.
+    obj: The checkpointable object being exported.
+    signature_functions: A dictionary mapping signature keys to concrete
+      functions containing signatures to add to the MetaGraph.
+    object_saver: A CheckpointableSaver to add to the MetaGraph.
+
+  Returns:
+    An _AssetInfo, which contains information to help creating the SavedModel.
+  """
   signatures = {}
   # List objects from the eager context to make sure Optimizers give us the
   # right Graph-dependent variables.
-  accessible_objects = util.list_objects(root)
+  accessible_objects = util.list_objects(obj)
+  resource_initializer_functions = _trace_resource_initializers(
+      accessible_objects)
   exported_graph = ops.Graph()
+  resource_initializer_ops = []
   with exported_graph.as_default():
-    object_map, resource_map = _map_resources(accessible_objects)
+    object_map, resource_map, asset_info = _map_resources(accessible_objects)
+    for resource_initializer_function in resource_initializer_functions:
+      asset_dependencies = []
+      for capture in resource_initializer_function.graph.external_captures:
+        asset_initializer = asset_info.asset_initializers_by_resource.get(
+            capture, None)
+        if asset_initializer is not None:
+          asset_dependencies.append(asset_initializer)
+      with ops.control_dependencies(asset_dependencies):
+        resource_initializer_ops.append(
+            _call_function_with_mapped_captures(
+                resource_initializer_function, [], resource_map))
+    with ops.control_dependencies(resource_initializer_ops):
+      init_op = control_flow_ops.no_op()
+    # Add the same op to the main_op collection and to the init_op
+    # signature. The collection is for compatibility with older loader APIs;
+    # only one will be executed.
+    meta_graph_def.collection_def[constants.MAIN_OP_KEY].node_list.value.append(
+        init_op.name)
+    meta_graph_def.signature_def[constants.INIT_OP_SIGNATURE_KEY].CopyFrom(
+        signature_def_utils.op_signature_def(
+            init_op, constants.INIT_OP_SIGNATURE_KEY))
+
   # Saving an object-based checkpoint again gathers variables. We need to do the
   # gathering from the eager context so Optimizers save the right set of
   # variables, but want any operations associated with the save/restore to be in
@@ -365,14 +505,54 @@ def _make_graph_def(root, signature_functions, object_saver):
   with exported_graph.as_default():
     signatures = _generate_signatures(signature_functions, resource_map)
     saver_def = saver.to_proto()
+    meta_graph_def.saver_def.CopyFrom(saver_def)
   graph_def = exported_graph.as_graph_def(add_shapes=True)
   # Clean reference cycles so repeated export()s don't make work for the garbage
   # collector.
   ops.dismantle_graph(exported_graph)
-  return graph_def, signatures, saver_def
 
+  meta_graph_def.graph_def.CopyFrom(graph_def)
+  meta_graph_def.meta_info_def.tags.append(tag_constants.SERVING)
+  meta_graph_def.asset_file_def.extend(asset_info.asset_defs)
+  for signature_key, signature in signatures.items():
+    meta_graph_def.signature_def[signature_key].CopyFrom(signature)
+  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
+  return asset_info
+
+
+def _write_object_graph(root, export_dir, asset_file_def_index):
+  """Save a SavedObjectGraph proto for `root`."""
+  # SavedObjectGraph is similar to the CheckpointableObjectGraph proto in the
+  # checkpoint. It will eventually go into the SavedModel.
+  proto = saved_object_graph_pb2.SavedObjectGraph()
+
+  checkpointable_objects, node_ids, slot_variables = util.find_objects(root)
+  util.fill_object_graph_proto(checkpointable_objects, node_ids, slot_variables,
+                               proto)
+
+  for obj, obj_proto in zip(checkpointable_objects, proto.nodes):
+    _write_object_proto(obj, obj_proto, asset_file_def_index)
 
-@tf_export("saved_model.save", v1=["saved_model.experimental.save"])
+  extra_asset_dir = os.path.join(
+      compat.as_bytes(export_dir),
+      compat.as_bytes(constants.EXTRA_ASSETS_DIRECTORY))
+  file_io.recursive_create_dir(extra_asset_dir)
+  object_graph_filename = os.path.join(
+      extra_asset_dir, compat.as_bytes("object_graph.pb"))
+  file_io.write_string_to_file(object_graph_filename, proto.SerializeToString())
+
+
+def _write_object_proto(obj, proto, asset_file_def_index):
+  """Saves an object into SavedObject proto."""
+  if isinstance(obj, tracking.TrackableAsset):
+    proto.asset.SetInParent()
+    proto.asset.asset_file_def_index = asset_file_def_index[obj]
+  else:
+    proto.user_object.SetInParent()
+
+
+@tf_export("saved_model.save",
+           v1=["saved_model.save", "saved_model.experimental.save"])
 def save(obj, export_dir, signatures=None):
   # pylint: disable=line-too-long
   """Exports the Checkpointable object `obj` to [SavedModel format](https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/saved_model/README.md).
@@ -450,6 +630,19 @@ def save(obj, export_dir, signatures=None):
           tf.TensorSpec(shape=[None, 3], dtype=tf.float32, name="inp")))
   ```
 
+  `tf.keras.Model` instances constructed from inputs and outputs already have a
+  signature and so do not require a `@tf.function` decorator or a `signatures`
+  argument. If neither are specified, the model's forward pass is exported.
+
+  ```python
+  x = input_layer.Input((4,), name="x")
+  y = core.Dense(5, name="out")(x)
+  model = training.Model(x, y)
+  tf.saved_model.save(model, '/tmp/saved_model/')
+  # The exported SavedModel takes "x" with shape [None, 4] and returns "out"
+  # with shape [None, 5]
+  ```
+
   Variables must be tracked by assigning them to an attribute of a tracked
   object or to an attribute of `obj` directly. TensorFlow objects (e.g. layers
   from `tf.keras.layers`, optimizers from `tf.train`) track their variables
@@ -515,26 +708,27 @@ def save(obj, export_dir, signatures=None):
     # Note that we run this before saving the checkpoint, since looping over
     # attributes may have the side effect of creating variables in some cases.
     signatures = _find_function_to_export(obj)
-  object_saver = util.CheckpointableSaver(obj)
-  utils_impl.get_or_create_variables_dir(export_dir)
-  object_saver.save(utils_impl.get_variables_path(export_dir))
 
   signatures = _canonicalize_signatures(signatures)
-  graph_def, signatures, saver_def = _make_graph_def(
-      obj, signatures, object_saver)
-  saved_model = saved_model_pb2.SavedModel()
-  saved_model.saved_model_schema_version = (
-      constants.SAVED_MODEL_SCHEMA_VERSION)
-  meta_graph_def = saved_model.meta_graphs.add()
-  meta_graph_def.saver_def.CopyFrom(saver_def)
   # TODO(allenl): Factor out some subset of SavedModelBuilder which is 2.x
   # compatible (no sessions) and share it with this export API rather than
   # making a SavedModel proto and writing it directly.
-  meta_graph_def.graph_def.MergeFrom(graph_def)
-  for signature_key, signature in signatures.items():
-    meta_graph_def.signature_def[signature_key].MergeFrom(signature)
-  meta_graph.strip_graph_default_valued_attrs(meta_graph_def)
+  saved_model = saved_model_pb2.SavedModel()
+  meta_graph_def = saved_model.meta_graphs.add()
+  object_saver = util.CheckpointableSaver(obj)
+  asset_info = _fill_meta_graph_def(
+      meta_graph_def, obj, signatures, object_saver)
+  saved_model.saved_model_schema_version = (
+      constants.SAVED_MODEL_SCHEMA_VERSION)
+  # So far we've just been generating protocol buffers with no I/O. Now we write
+  # the checkpoint, copy assets into the assets directory, and write out the
+  # SavedModel proto itself.
+  utils_impl.get_or_create_variables_dir(export_dir)
+  object_saver.save(utils_impl.get_variables_path(export_dir))
+  builder_impl.copy_assets_to_destination_dir(asset_info.asset_filename_map,
+                                              export_dir)
   path = os.path.join(
       compat.as_bytes(export_dir),
       compat.as_bytes(constants.SAVED_MODEL_FILENAME_PB))
   file_io.write_string_to_file(path, saved_model.SerializeToString())
+  _write_object_graph(obj, export_dir, asset_info.asset_index)
diff --git a/tensorflow/python/saved_model/save_test.py b/tensorflow/python/saved_model/save_test.py
index 42ff508b38ae2bda9e273f6b4a2ac405bebdf53d..97218a98eae38decc9c296a420074b7d4ec1f5e3 100644
--- a/tensorflow/python/saved_model/save_test.py
+++ b/tensorflow/python/saved_model/save_test.py
@@ -21,6 +21,9 @@ from __future__ import print_function
 import os
 import sys
 
+import numpy
+
+from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
@@ -29,13 +32,19 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_spec
 from tensorflow.python.framework import test_util
+from tensorflow.python.keras.engine import input_layer
 from tensorflow.python.keras.engine import training
 from tensorflow.python.keras.layers import core
+from tensorflow.python.keras.layers import merge
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import lookup_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.saved_model import loader
 from tensorflow.python.saved_model import save
 from tensorflow.python.saved_model import signature_constants
+from tensorflow.python.saved_model import tag_constants
 from tensorflow.python.training import adam
 from tensorflow.python.training.checkpointable import tracking
 from tensorflow.python.training.checkpointable import util
@@ -60,26 +69,27 @@ class _ModelWithOptimizer(training.Model):
     return {"loss": loss}
 
 
-class SaveTest(test.TestCase):
+def _import_and_infer(
+    save_dir, inputs,
+    signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
+  """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
+  graph = ops.Graph()
+  with graph.as_default(), session_lib.Session() as session:
+    model = loader.load(session, [tag_constants.SERVING], save_dir)
+    signature = model.signature_def[signature_key]
+    assert set(inputs.keys()) == set(signature.inputs.keys())
+    feed_dict = {}
+    for arg_name in inputs.keys():
+      feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
+          inputs[arg_name])
+    output_dict = {}
+    for output_name, output_tensor_info in signature.outputs.items():
+      output_dict[output_name] = graph.get_tensor_by_name(
+          output_tensor_info.name)
+    return session.run(output_dict, feed_dict=feed_dict)
 
-  def _import_and_infer(
-      self, save_dir, inputs,
-      signature_key=signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY):
-    """Import a SavedModel into a TF 1.x-style graph and run `signature_key`."""
-    graph = ops.Graph()
-    with graph.as_default(), self.session(graph) as session:
-      model = loader.load(session, [], save_dir)
-      signature = model.signature_def[signature_key]
-      self.assertEqual(set(inputs.keys()), set(signature.inputs.keys()))
-      feed_dict = {}
-      for arg_name in inputs.keys():
-        feed_dict[graph.get_tensor_by_name(signature.inputs[arg_name].name)] = (
-            inputs[arg_name])
-      output_dict = {}
-      for output_name, output_tensor_info in signature.outputs.items():
-        output_dict[output_name] = graph.get_tensor_by_name(
-            output_tensor_info.name)
-      return session.run(output_dict, feed_dict=feed_dict)
+
+class SaveTest(test.TestCase):
 
   def test_method_save_signature(self):
     root = tracking.Checkpointable()
@@ -91,7 +101,7 @@ class SaveTest(test.TestCase):
     save.save(root, save_dir, root.f)
     self.assertEqual(
         {"output_0": 2.},
-        self._import_and_infer(save_dir, {"x": 1.}))
+        _import_and_infer(save_dir, {"x": 1.}))
 
   def test_method_save_concrete(self):
     root = tracking.Checkpointable()
@@ -106,7 +116,7 @@ class SaveTest(test.TestCase):
             tensor_spec.TensorSpec(None, dtypes.float32))})
     self.assertEqual(
         {"out": 2.},
-        self._import_and_infer(
+        _import_and_infer(
             save_dir, {"z": 1.}, signature_key="non_default_key"))
 
   def test_non_concrete_error(self):
@@ -142,9 +152,9 @@ class SaveTest(test.TestCase):
       save.save(root, save_dir, to_save)
 
   def test_nested_dict_outputs(self):
-    root = tracking.Checkpointable()
-    root.f = def_function.function(
-        lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)})
+    root = util.Checkpoint(
+        f=def_function.function(
+            lambda x: {"a": 2. * x, "b": (3. * x, 4. * x)}))
     root.f(constant_op.constant(1.))
     to_save = root.f.get_concrete_function(constant_op.constant(1.))
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -163,7 +173,7 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(root, save_dir, to_save)
     self.assertAllEqual({"output_0": 12.},
-                        self._import_and_infer(save_dir, {"x": 2.}))
+                        _import_and_infer(save_dir, {"x": 2.}))
 
   def test_optimizer(self):
     x = constant_op.constant([[3., 4.]])
@@ -176,7 +186,7 @@ class SaveTest(test.TestCase):
     self.assertNotEqual(first_loss, second_loss)
     self.assertAllClose(
         second_loss,
-        self._import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
+        _import_and_infer(save_dir, {"x": [[3., 4.]], "y": [2.]}))
 
   def test_trivial_save_exception(self):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
@@ -191,8 +201,8 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
     self.assertIn("loss",
-                  self._import_and_infer(save_dir,
-                                         {"x": [[3., 4.]], "y": [2.]}))
+                  _import_and_infer(save_dir,
+                                    {"x": [[3., 4.]], "y": [2.]}))
 
   def test_single_function_default_signature(self):
     model = tracking.Checkpointable()
@@ -201,7 +211,7 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(model, save_dir)
     self.assertAllClose({"output_0": 3.},
-                        self._import_and_infer(save_dir, {}))
+                        _import_and_infer(save_dir, {}))
 
   def test_ambiguous_signatures(self):
     model = _ModelWithOptimizer()
@@ -213,6 +223,19 @@ class SaveTest(test.TestCase):
     with self.assertRaisesRegexp(ValueError, "call.*second_function"):
       save.save(model, save_dir)
 
+  def test_subclassed_no_signature(self):
+
+    class Subclassed(training.Model):
+
+      def call(self, inputs):
+        return inputs * 2.
+
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    model = Subclassed()
+    with self.assertRaisesRegexp(
+        ValueError, "no @tf.function-decorated methods"):
+      save.save(model, save_dir)
+
   def test_docstring(self):
 
     class Adder(util.Checkpoint):
@@ -227,7 +250,7 @@ class SaveTest(test.TestCase):
     save_dir = os.path.join(self.get_temp_dir(), "saved_model")
     save.save(to_save, save_dir)
     self.assertAllClose({"output_0": 7.},
-                        self._import_and_infer(save_dir, {"x": 3.}))
+                        _import_and_infer(save_dir, {"x": 3.}))
 
   def test_default_attr_stripping(self):
 
@@ -246,13 +269,103 @@ class SaveTest(test.TestCase):
     save.save(to_save, save_dir)
     graph = ops.Graph()
     with graph.as_default(), self.session(graph) as session:
-      loader.load(session, [], save_dir)
+      loader.load(session, [tag_constants.SERVING], save_dir)
       func, = graph._functions.values()
       complex_node, = [
           node for node in func.definition.node_def if node.op == "Complex"]
       self.assertNotIn("T", complex_node.attr)
       self.assertNotIn("Tout", complex_node.attr)
 
+  def test_export_functional_keras_model(self):
+    x = input_layer.Input((4,), name="x")
+    y = core.Dense(4, name="out")(x)
+    model = training.Model(x, y)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose(
+        {"out": model(array_ops.ones([1, 4]))},
+        _import_and_infer(save_dir, {"x": [[1., 1., 1., 1.]]}))
+
+  @test_util.run_v1_only("b/120545219")
+  def test_export_functional_keras_model_after_fit(self):
+    x = input_layer.Input((1,))
+    y = core.Dense(1, name="y")(x)
+    model = training.Model(x, y)
+    model.compile(optimizer="sgd", loss="mse")
+    model.fit(x=numpy.array([[1.]]),
+              y=numpy.array([2.]), epochs=2)
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    self.assertAllClose(
+        {"y": model(constant_op.constant([[1.], [2.]]))},
+        _import_and_infer(save_dir, {"input_1": [[1.], [2.]]}))
+
+  def test_export_multi_input_functional_keras_model(self):
+    x1 = input_layer.Input((2,), name="x1")
+    x2 = input_layer.Input((2,), name="x2")
+    y1 = core.Dense(4)(merge.Add()([x1, x2]))
+    y2 = core.Dense(4)(merge.Multiply()([x1, x2]))
+    model = training.Model([x1, x2], [y1, y2])
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(model, save_dir)
+    outputs = model([array_ops.ones([1, 2]), 2. * array_ops.ones([1, 2])])
+    self.assertAllClose(
+        {"dense": outputs[0], "dense_1": outputs[1]},
+        _import_and_infer(
+            save_dir,
+            {"x1": [[1., 1.]],
+             "x2": [[2., 2.]]}))
+
+
+class AssetTests(test.TestCase):
+
+  def setUp(self):
+    super(AssetTests, self).setUp()
+    self._vocab_path = os.path.join(self.get_temp_dir(), "vocab.txt")
+    with open(self._vocab_path, "w") as f:
+      f.write("alpha\nbeta\ngamma\n")
+
+  def test_table(self):
+    initializer = lookup_ops.TextFileInitializer(
+        self._vocab_path,
+        key_dtype=dtypes.string,
+        key_index=lookup_ops.TextFileIndex.WHOLE_LINE,
+        value_dtype=dtypes.int64,
+        value_index=lookup_ops.TextFileIndex.LINE_NUMBER)
+    root = util.Checkpoint(table=lookup_ops.HashTable(
+        initializer, default_value=-1))
+    root.table_user = def_function.function(
+        root.table.lookup,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.string)])
+    self.assertEqual(
+        2,
+        self.evaluate(root.table_user(constant_op.constant("gamma"))))
+    save_dir = os.path.join(self.get_temp_dir(), "saved_model")
+    save.save(root, save_dir)
+    file_io.delete_file(self._vocab_path)
+    self.assertAllClose(
+        {"output_0": [2, 0]},
+        _import_and_infer(save_dir, {"keys": ["gamma", "alpha"]}))
+    second_dir = os.path.join(self.get_temp_dir(), "second_dir")
+    # Asset paths should track the location the SavedModel is loaded from.
+    file_io.rename(save_dir, second_dir)
+    self.assertAllClose(
+        {"output_0": [2, 1]},
+        _import_and_infer(second_dir, {"keys": ["gamma", "beta"]}))
+
+  def test_unused_asset(self):
+    root = tracking.Checkpointable()
+    root.f = def_function.function(
+        lambda x: 2. * x,
+        input_signature=[tensor_spec.TensorSpec(None, dtypes.float32)])
+    root.asset = tracking.TrackableAsset(self._vocab_path)
+
+    export_dir = os.path.join(self.get_temp_dir(), "save_dir")
+    save.save(root, export_dir)
+    self.assertAllClose(
+        {"output_0": [0.2]},
+        _import_and_infer(export_dir, {"x": [0.1]}))
+
 
 class MemoryTests(test.TestCase):
 
diff --git a/tensorflow/python/saved_model/saved_model_test.py b/tensorflow/python/saved_model/saved_model_test.py
index 5d6167ab38f5a07d56143f608770d1aadb17a2fb..8d94c7c989d12df965bd5cc5954d30972238ff3c 100644
--- a/tensorflow/python/saved_model/saved_model_test.py
+++ b/tensorflow/python/saved_model/saved_model_test.py
@@ -54,15 +54,15 @@ def tearDownModule():
   file_io.delete_recursively(test.get_temp_dir())
 
 
-class SavedModelTest(test.TestCase):
+class SavedModelTestBase(test.TestCase):
 
   def _get_export_dir(self, label):
     return os.path.join(test.get_temp_dir(), label)
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.VariableV1(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
 
   def _build_asset_collection(self, asset_file_name, asset_file_contents,
                               asset_file_tensor_name, asset_subdir=""):
@@ -78,14 +78,16 @@ class SavedModelTest(test.TestCase):
     asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
     return asset_collection
 
-  def _validate_asset_collection(self, export_dir, graph_collection_def,
-                                 expected_asset_file_name,
-                                 expected_asset_file_contents,
-                                 expected_asset_tensor_name,
-                                 asset_id=0):
-    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
-    asset = meta_graph_pb2.AssetFileDef()
-    assets_any[asset_id].Unpack(asset)
+
+class SavedModelTest(SavedModelTestBase):
+
+  def _validate_assets(self,
+                       export_dir,
+                       asset_file_def,
+                       expected_asset_file_name,
+                       expected_asset_file_contents,
+                       expected_asset_tensor_name,
+                       asset_id=0):
     assets_path = os.path.join(
         compat.as_bytes(export_dir),
         compat.as_bytes(constants.ASSETS_DIRECTORY),
@@ -93,8 +95,10 @@ class SavedModelTest(test.TestCase):
     actual_asset_contents = file_io.read_file_to_string(assets_path)
     self.assertEqual(expected_asset_file_contents,
                      compat.as_text(actual_asset_contents))
-    self.assertEqual(expected_asset_file_name, asset.filename)
-    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+    self.assertEqual(expected_asset_file_name,
+                     asset_file_def[asset_id].filename)
+    self.assertEqual(expected_asset_tensor_name,
+                     asset_file_def[asset_id].tensor_info.name)
 
   def _validate_inputs_tensor_info_fail(self, builder, tensor_info):
     with self.session(graph=ops.Graph()) as sess:
@@ -142,6 +146,18 @@ class SavedModelTest(test.TestCase):
           sess, ["foo"],
           signature_def_map={"foo_key": foo_signature})
 
+  def _validate_sig_def_keys(self, builder, valid_tensor_info, invalid_key):
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      foo_signature = signature_def_utils.build_signature_def(
+          dict(), {"foo_key": valid_tensor_info}, "foo")
+      self.assertRaises(
+          KeyError,
+          builder.add_meta_graph_and_variables,
+          sess, ["foo"],
+          signature_def_map={invalid_key: foo_signature})
+
   def testMaybeSavedModelDir(self):
     base_path = test.test_src_dir_path("/python/saved_model")
     self.assertFalse(loader.maybe_saved_model_directory(base_path))
@@ -183,9 +199,10 @@ class SavedModelTest(test.TestCase):
                                    constants.SAVED_MODEL_FILENAME_PBTXT):
         loader.load(sess, ["foo"], export_dir)
 
+  @test_util.run_deprecated_v1
   def testVerifySessionGraphUsage(self):
     export_dir = self._get_export_dir("test_verify_session_graph_usage")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -203,9 +220,10 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  @test_util.run_deprecated_v1
   def testSequence(self):
     export_dir = self._get_export_dir("test_sequence")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Expect an assertion error since add_meta_graph_and_variables() should be
     # invoked before any add_meta_graph() calls.
@@ -220,9 +238,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(AssertionError, builder.add_meta_graph_and_variables,
                         sess, ["baz"])
 
+  @test_util.run_deprecated_v1
   def testTags(self):
     export_dir = self._get_export_dir("test_tags")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -309,9 +328,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(RuntimeError, loader.load, sess, ["foo", "baz"],
                         export_dir)
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     export_dir = self._get_export_dir("test_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with two variables. SavedModel invoked to:
     # - add with weights.
@@ -361,9 +381,10 @@ class SavedModelTest(test.TestCase):
       self.assertRaises(errors.NotFoundError, loader.load, sess, ["baz"],
                         export_dir)
 
+  @test_util.run_deprecated_v1
   def testGraphWithoutVariables(self):
     export_dir = self._get_export_dir("test_graph_has_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with no variables.
     with self.session(graph=ops.Graph()) as sess:
@@ -385,7 +406,7 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_5_name)
       b = constant_op.constant(6.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
     # Restore the graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -394,11 +415,12 @@ class SavedModelTest(test.TestCase):
       a = ops.get_default_graph().get_tensor_by_name(constant_6_name)
       b = constant_op.constant(5.0)
       c = a * b
-      self.assertEqual(30.0, sess.run(c))
+      self.assertEqual(30.0, self.evaluate(c))
 
+  @test_util.run_deprecated_v1
   def testNoOverwrite(self):
     export_dir = self._get_export_dir("test_no_overwrite")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -417,12 +439,13 @@ class SavedModelTest(test.TestCase):
 
     # An attempt to create another builder with the same export directory should
     # result in an assertion error.
-    self.assertRaises(AssertionError, saved_model_builder.SavedModelBuilder,
+    self.assertRaises(AssertionError, saved_model_builder._SavedModelBuilder,
                       export_dir)
 
+  @test_util.run_deprecated_v1
   def testSaveAsText(self):
     export_dir = self._get_export_dir("test_astext")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable. SavedModel invoked to:
     # - add with weights.
@@ -451,17 +474,18 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testCollections(self):
     export_dir = self._get_export_dir("test_collections")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable added to a collection. SavedModel invoked to:
     # - add with weights.
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(42, name="v")
       ops.add_to_collection("foo_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(42, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(42, self.evaluate(v))
       builder.add_meta_graph_and_variables(sess, ["foo"])
 
     # Graph with the same single variable added to a different collection.
@@ -470,8 +494,8 @@ class SavedModelTest(test.TestCase):
     with self.session(graph=ops.Graph()) as sess:
       v = variables.VariableV1(43, name="v")
       ops.add_to_collection("bar_vars", v)
-      sess.run(variables.global_variables_initializer())
-      self.assertEqual(43, v.eval())
+      self.evaluate(variables.global_variables_initializer())
+      self.assertEqual(43, self.evaluate(v))
       builder.add_meta_graph(["bar"])
 
     # Save the SavedModel to disk.
@@ -501,9 +525,10 @@ class SavedModelTest(test.TestCase):
 
       self.assertEqual(len(ops.get_collection("foo_vars")), 0)
 
+  @test_util.run_deprecated_v1
   def testSignatureDefs(self):
     export_dir = self._get_export_dir("test_signature_defs")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Graph with a single variable and a single entry in the signature def map.
     # SavedModel is invoked to add with weights.
@@ -563,7 +588,7 @@ class SavedModelTest(test.TestCase):
 
   def testSignatureDefValidationFails(self):
     export_dir = self._get_export_dir("test_signature_def_validation_fail")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     tensor_without_encoding = meta_graph_pb2.TensorInfo()
     tensor_without_encoding.dtype = types_pb2.DT_FLOAT
@@ -579,19 +604,30 @@ class SavedModelTest(test.TestCase):
     self._validate_inputs_tensor_info_fail(builder, tensor_empty)
     self._validate_outputs_tensor_info_fail(builder, tensor_empty)
 
+    valid_tensor_info = meta_graph_pb2.TensorInfo()
+    valid_tensor_info.name = "foo"
+    valid_tensor_info.dtype = types_pb2.DT_FLOAT
+
+    self._validate_sig_def_keys(builder, valid_tensor_info,
+                                constants.INIT_OP_SIGNATURE_KEY)
+    self._validate_sig_def_keys(builder, valid_tensor_info,
+                                constants.TRAIN_OP_SIGNATURE_KEY)
+
+  @test_util.run_deprecated_v1
   def testSignatureDefValidationSucceedsWithName(self):
     tensor_with_name = meta_graph_pb2.TensorInfo()
     tensor_with_name.name = "foo"
     tensor_with_name.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_name)
 
     export_dir = self._get_export_dir("test_signature_def_validation_name_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_name)
 
+  @test_util.run_deprecated_v1
   def testSignatureDefValidationSucceedsWithCoo(self):
     tensor_with_coo = meta_graph_pb2.TensorInfo()
     # TODO(soergel) test validation of each of the fields of coo_sparse
@@ -599,16 +635,17 @@ class SavedModelTest(test.TestCase):
     tensor_with_coo.dtype = types_pb2.DT_FLOAT
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_1")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_inputs_tensor_info_accept(builder, tensor_with_coo)
 
     export_dir = self._get_export_dir("test_signature_def_validation_coo_2")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
     self._validate_outputs_tensor_info_accept(builder, tensor_with_coo)
 
+  @test_util.run_deprecated_v1
   def testAssets(self):
     export_dir = self._get_export_dir("test_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
@@ -618,145 +655,151 @@ class SavedModelTest(test.TestCase):
           compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
       file_io.write_string_to_file(ignored_filepath, "will be ignored")
 
-      asset_collection = self._build_asset_collection("hello42.txt",
-                                                      "foo bar baz",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("ignored.txt"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionDiffFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_diff_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar bak", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar bak", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar bak",
-                                      "asset_file_tensor:0")
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt_1", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
-
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar bak", "asset_file_tensor:0")
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt_1",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
+
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionSameFilepath(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_path")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1")
+      asset_list = self._build_asset_collection("hello42.txt", "foo bar baz",
+                                                "asset_file_tensor_1")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("hello42.txt_1"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionSameFile(self):
     export_dir = self._get_export_dir("test_assets_name_collision_same_file")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor",
-          asset_subdir="1")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor", asset_subdir="1")
 
-      asset_collection = self._build_asset_collection(
-          "hello42.txt", "foo bar baz", "asset_file_tensor_1",
-          asset_subdir="2")
+      asset_list = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor_1", asset_subdir="2")
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz", "asset_file_tensor:0")
       # The second tensor should be recorded, but the same.
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz",
-                                      "asset_file_tensor_1:0",
-                                      asset_id=1)
+      self._validate_assets(
+          export_dir,
+          foo_graph.asset_file_def,
+          "hello42.txt",
+          "foo bar baz",
+          "asset_file_tensor_1:0",
+          asset_id=1)
       ignored_asset_path = os.path.join(
           compat.as_bytes(export_dir),
           compat.as_bytes(constants.ASSETS_DIRECTORY),
           compat.as_bytes("hello42.txt_1"))
       self.assertFalse(file_io.file_exists(ignored_asset_path))
 
+  @test_util.run_deprecated_v1
   def testAssetsNameCollisionManyFiles(self):
     export_dir = self._get_export_dir("test_assets_name_collision_many_files")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       for i in range(5):
         idx = str(i)
-        asset_collection = self._build_asset_collection(
-            "hello42.txt", "foo bar baz " + idx, "asset_file_tensor_" + idx,
+        asset_list = self._build_asset_collection(
+            "hello42.txt",
+            "foo bar baz " + idx,
+            "asset_file_tensor_" + idx,
             asset_subdir=idx)
 
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -765,18 +808,21 @@ class SavedModelTest(test.TestCase):
       foo_graph = loader.load(sess, ["foo"], export_dir)
       for i in range(1, 5):
         idx = str(i)
-        self._validate_asset_collection(
-            export_dir, foo_graph.collection_def, "hello42.txt_" + idx,
-            "foo bar baz " + idx, "asset_file_tensor_{}:0".format(idx),
+        self._validate_assets(
+            export_dir,
+            foo_graph.asset_file_def,
+            "hello42.txt_" + idx,
+            "foo bar baz " + idx,
+            "asset_file_tensor_{}:0".format(idx),
             asset_id=i)
 
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "hello42.txt", "foo bar baz 0",
-                                      "asset_file_tensor_0:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "hello42.txt",
+                            "foo bar baz 0", "asset_file_tensor_0:0")
 
-  def testCustomMainOp(self):
+  @test_util.run_v1_only("b/120545219")
+  def testCustomInitOp(self):
     export_dir = self._get_export_dir("test_main_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -792,11 +838,11 @@ class SavedModelTest(test.TestCase):
       # Set up an assignment op to be run as part of the main_op.
       with ops.control_dependencies([main_op.main_op()]):
         add_v1_v2 = math_ops.add(v1._ref(), v2._ref())
-        custom_main_op = control_flow_ops.group(state_ops.assign(v3, add_v1_v2))
+        custom_init_op = control_flow_ops.group(state_ops.assign(v3, add_v1_v2))
 
-      sess.run(custom_main_op)
+      self.evaluate(custom_init_op)
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], main_op=custom_main_op)
+          sess, ["foo"], init_op=custom_init_op)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -809,83 +855,10 @@ class SavedModelTest(test.TestCase):
       # the main_op, following a restore.
       self.assertEqual(3, ops.get_collection("v")[2].eval())
 
-  def testLegacyInitOp(self):
-    export_dir = self._get_export_dir("test_legacy_init_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
-
-    with self.session(graph=ops.Graph()) as sess:
-      # Add `v1` and `v2` variables to the graph.
-      v1 = variables.VariableV1(1, name="v1")
-      ops.add_to_collection("v", v1)
-      v2 = variables.VariableV1(2, name="v2")
-      ops.add_to_collection("v", v2)
-
-      # Initialize another variable `v3` to 42.
-      v3 = variables.VariableV1(42, name="v3", trainable=False, collections=[])
-      ops.add_to_collection("v", v3)
-
-      # Set up an assignment op to be run as part of the legacy_init_op.
-      assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
-      legacy_init_op = control_flow_ops.group(assign_v3, name="legacy_init_op")
-
-      sess.run(variables.global_variables_initializer())
-      builder.add_meta_graph_and_variables(
-          sess, ["foo"], legacy_init_op=legacy_init_op)
-
-    # Save the SavedModel to disk.
-    builder.save()
-
-    with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
-      self.assertEqual(1, ops.get_collection("v")[0].eval())
-      self.assertEqual(2, ops.get_collection("v")[1].eval())
-      # Evaluates to the sum of the first two variables and assigned as part of
-      # the legacy_init_op, following a restore.
-      self.assertEqual(3, ops.get_collection("v")[2].eval())
-
-  def testLegacyInitOpWithNonEmptyCollection(self):
-    export_dir = self._get_export_dir(
-        "test_legacy_init_op_with_non_empty_collection")
-    self._testInitOpsWithNonEmptyCollection(
-        export_dir, constants.LEGACY_INIT_OP_KEY)
-
-  def testMainOpWithNonEmptyCollection(self):
-    export_dir = self._get_export_dir(
-        "test_main_op_with_non_empty_collection")
-    self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
-
-  def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
-
-    g = ops.Graph()
-    with self.session(graph=g) as sess:
-      # Initialize variable `v1` to 1.
-      v1 = variables.VariableV1(1, name="v1")
-      ops.add_to_collection("v", v1)
-
-      # Initialize another variable `v2` to 42.
-      v2 = variables.VariableV1(42, name="v2", trainable=False, collections=[])
-      ops.add_to_collection("v", v2)
-
-      # Set up an assignment op to be run as part of the init op.
-      assign_v2 = state_ops.assign(v2, v1)
-      init_op = control_flow_ops.group(assign_v2, name="init_op")
-
-      sess.run(variables.global_variables_initializer())
-
-      ops.add_to_collection(key, control_flow_ops.no_op())
-      # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
-      # is not empty and we don't support multiple init ops.
-      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
-        builder.add_meta_graph_and_variables(
-            sess, ["foo"], legacy_init_op=init_op)
-      # We shouldn't be able to add as MAIN_OP, either.
-      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
-        builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
-
+  @test_util.run_v1_only("b/120545219")
   def testTrainOp(self):
     export_dir = self._get_export_dir("test_train_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -894,27 +867,26 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = state_ops.assign_add(v1, v2)
 
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph_and_variables(sess, ["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertEqual(3, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+          loader_impl.get_train_op(meta_graph_def), ops.Tensor)
 
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpGroup(self):
     export_dir = self._get_export_dir("test_train_op_group")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -923,27 +895,26 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       train_op = control_flow_ops.group()
 
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph_and_variables(sess, ["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph_and_variables(sess, ["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Operation)
+          loader_impl.get_train_op(meta_graph_def), ops.Operation)
 
+  @test_util.run_v1_only("b/120545219")
   def testTrainOpAfterVariables(self):
     export_dir = self._get_export_dir("test_train_op_after_variables")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       # Add `v1` and `v2` variables to the graph.
@@ -952,51 +923,50 @@ class SavedModelTest(test.TestCase):
       v2 = variables.VariableV1(2, name="v2")
       ops.add_to_collection("v", v2)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["pre_foo"])
 
       train_op = state_ops.assign_add(v1, v2)
-      sess.run(train_op)
-      # TODO(karmel): remove explicit call when in the public method.
-      builder._add_train_op(train_op)
-      builder.add_meta_graph(["foo"])
+      self.evaluate(train_op)
+      builder.add_meta_graph(["foo"], train_op=train_op)
 
     # Save the SavedModel to disk.
     builder.save()
 
     with self.session(graph=ops.Graph()) as sess:
-      loader.load(sess, ["foo"], export_dir)
+      meta_graph_def = loader.load(sess, ["foo"], export_dir)
       self.assertIsInstance(
-          ops.get_collection(constants.TRAIN_OP_KEY)[0], ops.Tensor)
+          loader_impl.get_train_op(meta_graph_def), ops.Tensor)
 
     with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["pre_foo"], export_dir)
       self.assertFalse(ops.get_collection(constants.TRAIN_OP_KEY))
 
+  @test_util.run_deprecated_v1
   def testMultipleAssets(self):
     export_dir = self._get_export_dir("test_multiple_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `foo` graph.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection specific to `bar` graph.
-      asset_collection = self._build_asset_collection("bar.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("bar.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1004,43 +974,42 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
       bar_graph = loader.load(sess, ["bar"], export_dir)
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "bar.txt", "content_bar",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "bar.txt",
+                            "content_bar", "asset_file_tensor:0")
 
+  @test_util.run_deprecated_v1
   def testDuplicateAssets(self):
     export_dir = self._get_export_dir("test_duplicate_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `foo` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "foo".
       builder.add_meta_graph_and_variables(
-          sess, ["foo"], assets_collection=asset_collection)
+          sess, ["foo"], assets_list=asset_list)
 
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
 
       # Build an asset collection with `foo.txt` that has `bar` specific
       # content.
-      asset_collection = self._build_asset_collection("foo.txt", "content_bar",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_bar",
+                                                "asset_file_tensor")
 
       # Add the asset collection as part of the graph with tag "bar".
-      builder.add_meta_graph(["bar"], assets_collection=asset_collection)
+      builder.add_meta_graph(["bar"], assets_list=asset_list)
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1048,9 +1017,8 @@ class SavedModelTest(test.TestCase):
     # Check assets restored for graph with tag "foo".
     with self.session(graph=ops.Graph()) as sess:
       foo_graph = loader.load(sess, ["foo"], export_dir)
-      self._validate_asset_collection(export_dir, foo_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, foo_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
     # Check assets restored for graph with tag "bar".
     with self.session(graph=ops.Graph()) as sess:
@@ -1059,13 +1027,13 @@ class SavedModelTest(test.TestCase):
       # Validate the assets for `bar` graph. `foo.txt` should contain the
       # original contents corresponding to `foo` graph since an asset with the
       # same name across multiple graphs is only stored the first time
-      self._validate_asset_collection(export_dir, bar_graph.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, bar_graph.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
+  @test_util.run_v1_only("b/120545219")
   def testOp(self):
     export_dir = self._get_export_dir("test_op")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1086,7 +1054,7 @@ class SavedModelTest(test.TestCase):
       ops.add_to_collection("v", v3)
       ops.add_to_collection("init_op", init_op)
 
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       self.assertEqual(1, ops.get_collection("v")[0].eval())
       self.assertEqual(2, ops.get_collection("v")[1].eval())
 
@@ -1108,7 +1076,7 @@ class SavedModelTest(test.TestCase):
 
   def testCustomSaveable(self):
     export_dir = self._get_export_dir("custom_saveable")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with session.Session(
         graph=ops.Graph(),
@@ -1135,13 +1103,14 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(b"k1", v1.keys().eval())
       self.assertEqual(3.0, v1.values().eval())
 
+  @test_util.run_deprecated_v1
   def testCustomSaver(self):
     export_dir = self._get_export_dir("test_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       custom_saver = training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"], saver=custom_saver)
 
@@ -1157,13 +1126,14 @@ class SavedModelTest(test.TestCase):
         self.assertEqual(
             saved_graph.saver_def.restore_op_name, "my_saver/restore_all")
 
+  @test_util.run_deprecated_v1
   def testNoCustomSaver(self):
     export_dir = self._get_export_dir("test_no_custom_saver")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       training.Saver(name="my_saver")
       builder.add_meta_graph_and_variables(sess, ["tag"])
 
@@ -1179,13 +1149,14 @@ class SavedModelTest(test.TestCase):
         self.assertEqual(
             saved_graph.saver_def.restore_op_name, "save/restore_all")
 
+  @test_util.run_deprecated_v1
   def testMultipleCustomSavers(self):
     export_dir = self._get_export_dir("test_multiple_custom_savers")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     with self.session(graph=ops.Graph()) as sess:
       variables.VariableV1(1, name="v1")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(sess, ["tag_0"])
 
       saver_1 = training.Saver()
@@ -1209,21 +1180,22 @@ class SavedModelTest(test.TestCase):
     _validate_custom_saver("tag_1", "save_1/restore_all")
     _validate_custom_saver("tag_2", "save_2/restore_all")
 
+  @test_util.run_deprecated_v1
   def testImportScope(self):
     export_dir = self._get_export_dir("test_scoped_assets")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Build a SavedModel with a variable, an asset, and a constant tensor.
     with self.session(graph=ops.Graph()) as sess:
       self._init_and_validate_variable(sess, "v", 42)
-      asset_collection = self._build_asset_collection("foo.txt", "content_foo",
-                                                      "asset_file_tensor")
+      asset_list = self._build_asset_collection("foo.txt", "content_foo",
+                                                "asset_file_tensor")
       constant_op.constant("constant value", name="constant_tensor_name")
       builder.add_meta_graph_and_variables(
-          sess, ["tag_name"], assets_collection=asset_collection)
+          sess, ["tag_name"], assets_list=asset_list)
 
       # Save the asset file path for later comparison.
-      asset_file_path = asset_collection[0].eval()
+      asset_file_path = asset_list[0].eval()
 
     # Save the SavedModel to disk.
     builder.save()
@@ -1244,16 +1216,14 @@ class SavedModelTest(test.TestCase):
 
       # The loaded asset tensor should be scoped, but the asset file path and
       # contents should be unchanged.
-      asset_collection = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
-      self.assertEqual(1, len(asset_collection))
-      self.assertEqual(asset_file_path, asset_collection[0].eval())
-      self.assertEqual("scope_name/asset_file_tensor:0",
-                       asset_collection[0].name)
+      asset_list = ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS)
+      self.assertEqual(1, len(asset_list))
+      self.assertEqual(asset_file_path, asset_list[0].eval())
+      self.assertEqual("scope_name/asset_file_tensor:0", asset_list[0].name)
       # The static asset data inside graph_proto.collection_def should not be
       # scoped.
-      self._validate_asset_collection(export_dir, graph_proto.collection_def,
-                                      "foo.txt", "content_foo",
-                                      "asset_file_tensor:0")
+      self._validate_assets(export_dir, graph_proto.asset_file_def, "foo.txt",
+                            "content_foo", "asset_file_tensor:0")
 
       # The constant tensor should be scoped, but its contents should be
       # unchanged.
@@ -1262,9 +1232,10 @@ class SavedModelTest(test.TestCase):
           ops.get_default_graph().get_tensor_by_name(
               "scope_name/constant_tensor_name:0").eval())
 
+  @test_util.run_deprecated_v1
   def testClearDevices(self):
     export_dir = self._get_export_dir("test_clear_devices")
-    builder = saved_model_builder.SavedModelBuilder(export_dir)
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
 
     # Specify a device and save a variable.
     ops.reset_default_graph()
@@ -1286,6 +1257,174 @@ class SavedModelTest(test.TestCase):
       self.assertEqual(
           42, ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES)[0].eval())
 
+  # Tests the behavior of loading SavedModels that having missing attrs or attrs
+  # with incorrect types.
+  def testInconsistentConsumerDefaultAttrs(self):
+    export_dir = self._get_export_dir(
+        "test_strip_default_attrs_no_consumer_defaults")
+    builder = saved_model_builder._SavedModelBuilder(export_dir)
+
+    # Add a graph with a single variable and a test op with a defaultless
+    # float32 attr, "test_attr".
+    with session.Session(graph=ops.Graph()) as sess:
+      variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
+      test_ops.test_attr(T=dtypes.float32, name="test_attr")
+      self.evaluate(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(sess, ["foo"])
+
+    # Save the SavedModel to disk in text format.
+    builder.save(as_text=True)
+
+    # Rewrite the SavedModel to remove the T attr from "test_attr".
+    saved_model_file = os.path.join(
+        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
+    with open(saved_model_file) as f:
+      original_saved_model = f.read()
+
+    no_attr_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", "")
+    with open(saved_model_file, "w") as f:
+      f.write(no_attr_saved_model)
+
+    # Loading the SavedModel via the loader must fail because the SavedModel
+    # does not have any attr values for the "TestAttr" node, and there is no
+    # default specified in the TestAttr OpDef.
+    sess = session.Session(graph=ops.Graph())
+    with self.assertRaisesRegexp(
+        ValueError, "NodeDef missing attr 'T' from Op<name=TestAttr"):
+      loader.load(sess, ["foo"], export_dir)
+
+    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
+    bad_type_saved_model = original_saved_model.replace("""
+      attr {
+        key: "T"
+        value {
+          type: DT_FLOAT
+        }
+      }""", """
+      attr {
+        key: "T"
+        value {
+          type: DT_DOUBLE
+        }
+      }""")
+    with open(saved_model_file, "w") as f:
+      f.write(bad_type_saved_model)
+
+    # Loading the SavedModel via the loader must fail because there is no
+    # OpKernel registered to handle T = double.
+    sess = session.Session(graph=ops.Graph())
+    with self.assertRaisesRegexp(
+        errors.InvalidArgumentError,
+        "No OpKernel was registered to support Op 'TestAttr' used by node "
+        "test_attr \\(defined at .*\\) with these attrs: \\[.*\\]\n"
+        "Registered devices:.*\n"
+        "Registered kernels:.*"
+    ):
+      loader.load(sess, ["foo"], export_dir)
+
+
+class SavedModelV1Test(SavedModelTestBase):
+
+  def _validate_asset_collection(self,
+                                 export_dir,
+                                 graph_collection_def,
+                                 expected_asset_file_name,
+                                 expected_asset_file_contents,
+                                 expected_asset_tensor_name,
+                                 asset_id=0):
+    assets_any = graph_collection_def[constants.ASSETS_KEY].any_list.value
+    asset = meta_graph_pb2.AssetFileDef()
+    assets_any[asset_id].Unpack(asset)
+    assets_path = os.path.join(
+        compat.as_bytes(export_dir),
+        compat.as_bytes(constants.ASSETS_DIRECTORY),
+        compat.as_bytes(expected_asset_file_name))
+    actual_asset_contents = file_io.read_file_to_string(assets_path)
+    self.assertEqual(expected_asset_file_contents,
+                     compat.as_text(actual_asset_contents))
+    self.assertEqual(expected_asset_file_name, asset.filename)
+    self.assertEqual(expected_asset_tensor_name, asset.tensor_info.name)
+
+  @test_util.run_deprecated_v1
+  def testWritingAssetsToCollection(self):
+    export_dir = self._get_export_dir("test_writing_assets_to_collection")
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    with self.session(graph=ops.Graph()) as sess:
+      self._init_and_validate_variable(sess, "v", 42)
+
+      # Build an asset list.
+      ignored_filepath = os.path.join(
+          compat.as_bytes(test.get_temp_dir()), compat.as_bytes("ignored.txt"))
+      file_io.write_string_to_file(ignored_filepath, "will be ignored")
+
+      asset_collection = self._build_asset_collection(
+          "hello42.txt", "foo bar baz", "asset_file_tensor")
+
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], assets_collection=asset_collection)
+
+    # Save the SavedModel to disk.
+    builder.save()
+
+    with self.session(graph=ops.Graph()) as sess:
+      foo_graph = loader.load(sess, ["foo"], export_dir)
+      self._validate_asset_collection(export_dir, foo_graph.collection_def,
+                                      "hello42.txt", "foo bar baz",
+                                      "asset_file_tensor:0")
+      ignored_asset_path = os.path.join(
+          compat.as_bytes(export_dir),
+          compat.as_bytes(constants.ASSETS_DIRECTORY),
+          compat.as_bytes("ignored.txt"))
+      self.assertFalse(file_io.file_exists(ignored_asset_path))
+
+  @test_util.run_deprecated_v1
+  def testLegacyInitOpWithNonEmptyCollection(self):
+    export_dir = self._get_export_dir(
+        "test_legacy_init_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(export_dir,
+                                            constants.LEGACY_INIT_OP_KEY)
+
+  @test_util.run_deprecated_v1
+  def testMainOpWithNonEmptyCollection(self):
+    export_dir = self._get_export_dir("test_main_op_with_non_empty_collection")
+    self._testInitOpsWithNonEmptyCollection(export_dir, constants.MAIN_OP_KEY)
+
+  def _testInitOpsWithNonEmptyCollection(self, export_dir, key):
+    builder = saved_model_builder.SavedModelBuilder(export_dir)
+
+    g = ops.Graph()
+    with self.session(graph=g) as sess:
+      # Initialize variable `v1` to 1.
+      v1 = variables.VariableV1(1, name="v1")
+      ops.add_to_collection("v", v1)
+
+      # Initialize another variable `v2` to 42.
+      v2 = variables.VariableV1(42, name="v2", trainable=False, collections=[])
+      ops.add_to_collection("v", v2)
+
+      # Set up an assignment op to be run as part of the init op.
+      assign_v2 = state_ops.assign(v2, v1)
+      init_op = control_flow_ops.group(assign_v2, name="init_op")
+
+      self.evaluate(variables.global_variables_initializer())
+
+      ops.add_to_collection(key, control_flow_ops.no_op())
+      # ValueError should be raised since the LEGACY_INIT_OP_KEY collection
+      # is not empty and we don't support multiple init ops.
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+        builder.add_meta_graph_and_variables(
+            sess, ["foo"], legacy_init_op=init_op)
+      # We shouldn't be able to add as MAIN_OP, either.
+      with self.assertRaisesRegexp(ValueError, "Graph already contains"):
+        builder.add_meta_graph_and_variables(sess, ["foo"], main_op=init_op)
+
   def testStripDefaultAttrs(self):
     export_dir = self._get_export_dir("test_strip_default_attrs")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
@@ -1296,7 +1435,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph_and_variables(
           sess, ["foo"], strip_default_attrs=True)
 
@@ -1306,7 +1445,7 @@ class SavedModelTest(test.TestCase):
       real_num = variables.VariableV1(1.0, dtype=dtypes.float32, name="real")
       imag_num = variables.VariableV1(2.0, dtype=dtypes.float32, name="imag")
       math_ops.complex(real_num, imag_num, name="complex")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       builder.add_meta_graph(["bar"], strip_default_attrs=False)
 
     # Save the SavedModel to disk in text format.
@@ -1322,10 +1461,8 @@ class SavedModelTest(test.TestCase):
     self.assertIn("Tout", complex_node.attr)
 
     # Load graph "foo" from disk as-is to verify default attrs are stripped.
-    # pylint: disable=protected-access
-    saved_model_pb = loader_impl._parse_saved_model(export_dir)
+    saved_model_pb = loader_impl.parse_saved_model(export_dir)
     self.assertIsNotNone(saved_model_pb)
-    # pylint: enable=protected-access
 
     meta_graph_foo_def = None
     meta_graph_bar_def = None
@@ -1356,76 +1493,40 @@ class SavedModelTest(test.TestCase):
     self.assertIn("T", node_def.attr)
     self.assertIn("Tout", node_def.attr)
 
-  # Tests the behavior of loading SavedModels that having missing attrs or attrs
-  # with incorrect types.
-  def testInconsistentConsumerDefaultAttrs(self):
-    export_dir = self._get_export_dir(
-        "test_strip_default_attrs_no_consumer_defaults")
+  @test_util.run_v1_only("b/120545219")
+  def testLegacyInitOp(self):
+    export_dir = self._get_export_dir("test_legacy_init_op")
     builder = saved_model_builder.SavedModelBuilder(export_dir)
 
-    # Add a graph with a single variable and a test op with a defaultless
-    # float32 attr, "test_attr".
-    with session.Session(graph=ops.Graph()) as sess:
-      variables.VariableV1(1.0, dtype=dtypes.float64, name="var")
-      test_ops.test_attr(T=dtypes.float32, name="test_attr")
-      sess.run(variables.global_variables_initializer())
-      builder.add_meta_graph_and_variables(sess, ["foo"])
-
-    # Save the SavedModel to disk in text format.
-    builder.save(as_text=True)
+    with self.session(graph=ops.Graph()) as sess:
+      # Add `v1` and `v2` variables to the graph.
+      v1 = variables.VariableV1(1, name="v1")
+      ops.add_to_collection("v", v1)
+      v2 = variables.VariableV1(2, name="v2")
+      ops.add_to_collection("v", v2)
 
-    # Rewrite the SavedModel to remove the T attr from "test_attr".
-    saved_model_file = os.path.join(
-        export_dir, constants.SAVED_MODEL_FILENAME_PBTXT)
-    with open(saved_model_file) as f:
-      original_saved_model = f.read()
+      # Initialize another variable `v3` to 42.
+      v3 = variables.VariableV1(42, name="v3", trainable=False, collections=[])
+      ops.add_to_collection("v", v3)
 
-    no_attr_saved_model = original_saved_model.replace("""
-      attr {
-        key: "T"
-        value {
-          type: DT_FLOAT
-        }
-      }""", "")
-    with open(saved_model_file, "w") as f:
-      f.write(no_attr_saved_model)
+      # Set up an assignment op to be run as part of the init_op.
+      assign_v3 = state_ops.assign(v3, math_ops.add(v1, v2))
+      legacy_init_op = control_flow_ops.group(assign_v3, name="legacy_init_op")
 
-    # Loading the SavedModel via the loader must fail because the SavedModel
-    # does not have any attr values for the "TestAttr" node, and there is no
-    # default specified in the TestAttr OpDef.
-    sess = session.Session(graph=ops.Graph())
-    with self.assertRaisesRegexp(
-        ValueError, "NodeDef missing attr 'T' from Op<name=TestAttr"):
-      loader.load(sess, ["foo"], export_dir)
+      self.evaluate(variables.global_variables_initializer())
+      builder.add_meta_graph_and_variables(
+          sess, ["foo"], legacy_init_op=legacy_init_op)
 
-    # Rewrite the SavedModel to change the type of the T attr in "test_attr"
-    bad_type_saved_model = original_saved_model.replace("""
-      attr {
-        key: "T"
-        value {
-          type: DT_FLOAT
-        }
-      }""", """
-      attr {
-        key: "T"
-        value {
-          type: DT_DOUBLE
-        }
-      }""")
-    with open(saved_model_file, "w") as f:
-      f.write(bad_type_saved_model)
+    # Save the SavedModel to disk.
+    builder.save()
 
-    # Loading the SavedModel via the loader must fail because there is no
-    # OpKernel registered to handle T = double.
-    sess = session.Session(graph=ops.Graph())
-    with self.assertRaisesRegexp(
-        errors.InvalidArgumentError,
-        "No OpKernel was registered to support Op 'TestAttr' used by node "
-        "test_attr \\(defined at .*\\) with these attrs: \\[.*\\]\n"
-        "Registered devices:.*\n"
-        "Registered kernels:.*"
-    ):
+    with self.session(graph=ops.Graph()) as sess:
       loader.load(sess, ["foo"], export_dir)
+      self.assertEqual(1, ops.get_collection("v")[0].eval())
+      self.assertEqual(2, ops.get_collection("v")[1].eval())
+      # Evaluates to the sum of the first two variables and assigned as part of
+      # the legacy_init_op, following a restore.
+      self.assertEqual(3, ops.get_collection("v")[2].eval())
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/saved_model/saved_object_graph.proto b/tensorflow/python/saved_model/saved_object_graph.proto
new file mode 100644
index 0000000000000000000000000000000000000000..3991fbede42655e39bec93226b6295603c394cf4
--- /dev/null
+++ b/tensorflow/python/saved_model/saved_object_graph.proto
@@ -0,0 +1,73 @@
+syntax = "proto3";
+
+import "tensorflow/core/protobuf/checkpointable_object_graph.proto";
+
+option cc_enable_arenas = true;
+
+package tensorflow;
+
+// A SavedObjectGraph is part of object-based SavedModels in TF 2.0. It
+// describes the directed graph of Python objects (or equivalent in other
+// languages) that make up a model, with nodes[0] at the root.
+
+// SavedObjectGraph shares some structure with CheckpointableObjectGraph, but
+// ObjectGraph belongs to the SavedModel and contains pointers to functions and
+// type information, while CheckpointableObjectGraph lives in the checkpoint and
+// contains pointers only to variable values.
+
+// NOTE: This protocol buffer format is experimental and subject to change.
+
+message SavedObjectGraph {
+  // List of objects in the SavedModel.
+  //
+  // The position of the object in this list indicates its id.
+  // Nodes[0] is considered the root node.
+  repeated SavedObject nodes = 1;
+}
+
+message SavedObject {
+  // Objects which this object depends on: named edges in the dependency
+  // graph.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.ObjectReference
+      children = 1;
+
+  // Removed when forking from CheckpointableObjectGraph.
+  reserved "attributes";
+  reserved 2;
+
+  // Slot variables owned by this object. This describes the three-way
+  // (optimizer, variable, slot variable) relationship; none of the three
+  // depend on the others directly.
+  //
+  // Note: only valid if kind == "object".
+  repeated CheckpointableObjectGraph.CheckpointableObject.SlotVariableReference
+      slot_variables = 3;
+
+  oneof kind {
+    SavedUserObject user_object = 4;
+    SavedAsset asset = 5;
+  }
+}
+
+// A SavedUserObject is an object (in the object-oriented language of the
+// TensorFlow program) of some user- or framework-defined class other than
+// those handled specifically by the other kinds of SavedObjects.
+//
+// This object cannot be evaluated as a tensor, and therefore cannot be bound
+// to an input of a function.
+message SavedUserObject {}
+
+// A SavedAsset represents a file in a SavedModel.
+//
+// When bound to a function this object evaluates to a Variable from which the
+// absolute filename can be read. Users should not expect the filename to be
+// maintained.
+message SavedAsset {
+  // Index into `MetaGraphDef.asset_file_def[]` that describes the Asset.
+  //
+  // Only the field `AssetFileDef.filename` is used. Other fields, such as
+  // `AssetFileDef.tensor_info`, MUST be ignored.
+  uint32 asset_file_def_index = 1;
+}
diff --git a/tensorflow/python/saved_model/signature_constants.py b/tensorflow/python/saved_model/signature_constants.py
index 96460717ec5f74e61fb2052b9080f1f0470e243b..0efe1763430eade223801b63f958405212eebe34 100644
--- a/tensorflow/python/saved_model/signature_constants.py
+++ b/tensorflow/python/saved_model/signature_constants.py
@@ -135,7 +135,7 @@ tf_export(
 
 ################################################################################
 # Train/Eval API constants.
-# Not exported while export_all_saved_models is in contrib.
+# Not exported while export_all_saved_models is experimental.
 
 SUPERVISED_TRAIN_METHOD_NAME = "tensorflow/supervised/training"
 
diff --git a/tensorflow/python/saved_model/signature_def_utils.py b/tensorflow/python/saved_model/signature_def_utils.py
index 27d6b70e9dce3ff67b7912efcea8e2994d138dc6..6a3c0aaf385ec360f90f748c5aadcae7e354b621 100644
--- a/tensorflow/python/saved_model/signature_def_utils.py
+++ b/tensorflow/python/saved_model/signature_def_utils.py
@@ -24,6 +24,8 @@ from __future__ import print_function
 from tensorflow.python.saved_model.signature_def_utils_impl import build_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import classification_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import is_valid_signature
+from tensorflow.python.saved_model.signature_def_utils_impl import load_op_from_signature_def
+from tensorflow.python.saved_model.signature_def_utils_impl import op_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import regression_signature_def
 from tensorflow.python.saved_model.signature_def_utils_impl import supervised_eval_signature_def
diff --git a/tensorflow/python/saved_model/signature_def_utils_impl.py b/tensorflow/python/saved_model/signature_def_utils_impl.py
index 6e5e3bc682234b18b6ae540ae92f6988c68e17e5..f6e6e1d13ecdea684f14dcaaa39f1c66f72ac352 100644
--- a/tensorflow/python/saved_model/signature_def_utils_impl.py
+++ b/tensorflow/python/saved_model/signature_def_utils_impl.py
@@ -21,9 +21,10 @@ from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
+from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.saved_model import signature_constants
-from tensorflow.python.saved_model import utils
+from tensorflow.python.saved_model import utils_impl as utils
 from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
@@ -349,3 +350,51 @@ def _is_valid_classification_signature(signature_def):
     return False
 
   return True
+
+
+def op_signature_def(op, key):
+  """Creates a signature def with the output pointing to an op.
+
+  Note that op isn't strictly enforced to be an Op object, and may be a Tensor.
+  It is recommended to use the build_signature_def() function for Tensors.
+
+  Args:
+    op: An Op (or possibly Tensor).
+    key: Key to graph element in the SignatureDef outputs.
+
+  Returns:
+    A SignatureDef with a single output pointing to the op.
+  """
+  # Use build_tensor_info_from_op, which creates a TensorInfo from the element's
+  # name.
+  return build_signature_def(outputs={key: utils.build_tensor_info_from_op(op)})
+
+
+def load_op_from_signature_def(signature_def, key, import_scope=None):
+  """Load an Op from a SignatureDef created by op_signature_def().
+
+  Args:
+    signature_def: a SignatureDef proto
+    key: string key to op in the SignatureDef outputs.
+    import_scope: Scope used to import the op
+
+  Returns:
+    Op (or possibly Tensor) in the graph with the same name as saved in the
+      SignatureDef.
+
+  Raises:
+    NotFoundError: If the op could not be found in the graph.
+  """
+  tensor_info = signature_def.outputs[key]
+  try:
+    # The init and train ops are not strictly enforced to be operations, so
+    # retrieve any graph element (can be either op or tensor).
+    return utils.get_element_from_tensor_info(
+        tensor_info, import_scope=import_scope)
+  except KeyError:
+    raise errors.NotFoundError(
+        None, None,
+        'The {0} could not be found in the graph. Please make sure the '
+        'SavedModel was created by the internal _SavedModelBuilder. If you '
+        'are using the public API, please make sure the SignatureDef in the '
+        'SavedModel does not contain the key "{0}".'.format(key))
diff --git a/tensorflow/python/saved_model/signature_def_utils_test.py b/tensorflow/python/saved_model/signature_def_utils_test.py
index 18c55d8d33221054f033c8baf73c757b3e03a849..d1347eb0178423f9293022e4f36eeb90caac833e 100644
--- a/tensorflow/python/saved_model/signature_def_utils_test.py
+++ b/tensorflow/python/saved_model/signature_def_utils_test.py
@@ -22,7 +22,9 @@ from tensorflow.core.framework import types_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import math_ops
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import signature_constants
 from tensorflow.python.saved_model import signature_def_utils_impl
@@ -58,6 +60,7 @@ def _make_signature(inputs, outputs, name=None):
 
 class SignatureDefUtilsTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBuildSignatureDef(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -88,6 +91,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, y_tensor_info_actual.dtype)
     self.assertEqual(0, len(y_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testRegressionSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     output1 = constant_op.constant(2.2, name="output-1")
@@ -113,6 +117,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, y_tensor_info_actual.dtype)
     self.assertEqual(0, len(y_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testClassificationSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     output1 = constant_op.constant("b", name="output-1")
@@ -144,6 +149,7 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_FLOAT, scores_tensor_info_actual.dtype)
     self.assertEqual(0, len(scores_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testPredictionSignatureDef(self):
     input1 = constant_op.constant("a", name="input-1")
     input2 = constant_op.constant("b", name="input-2")
@@ -180,11 +186,13 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_STRING, output2_tensor_info_actual.dtype)
     self.assertEqual(0, len(output2_tensor_info_actual.tensor_shape.dim))
 
+  @test_util.run_deprecated_v1
   def testTrainSignatureDef(self):
     self._testSupervisedSignatureDef(
         signature_def_utils_impl.supervised_train_signature_def,
         signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
 
+  @test_util.run_deprecated_v1
   def testEvalSignatureDef(self):
     self._testSupervisedSignatureDef(
         signature_def_utils_impl.supervised_eval_signature_def,
@@ -238,11 +246,13 @@ class SignatureDefUtilsTest(test.TestCase):
     self.assertEqual(
         types_pb2.DT_FLOAT, signature_def.outputs["metrics/value"].dtype)
 
+  @test_util.run_deprecated_v1
   def testTrainSignatureDefMissingInputs(self):
     self._testSupervisedSignatureDefMissingInputs(
         signature_def_utils_impl.supervised_train_signature_def,
         signature_constants.SUPERVISED_TRAIN_METHOD_NAME)
 
+  @test_util.run_deprecated_v1
   def testEvalSignatureDefMissingInputs(self):
     self._testSupervisedSignatureDefMissingInputs(
         signature_def_utils_impl.supervised_eval_signature_def,
@@ -413,5 +423,24 @@ class SignatureDefUtilsTest(test.TestCase):
         {},
         signature_constants.PREDICT_METHOD_NAME)
 
+  @test_util.run_v1_only("b/120545219")
+  def testOpSignatureDef(self):
+    key = "adding_1_and_2_key"
+    add_op = math_ops.add(1, 2, name="adding_1_and_2")
+    signature_def = signature_def_utils_impl.op_signature_def(add_op, key)
+    self.assertIn(key, signature_def.outputs)
+    self.assertEqual(add_op.name, signature_def.outputs[key].name)
+
+  @test_util.run_v1_only("b/120545219")
+  def testLoadOpFromSignatureDef(self):
+    key = "adding_1_and_2_key"
+    add_op = math_ops.add(1, 2, name="adding_1_and_2")
+    signature_def = signature_def_utils_impl.op_signature_def(add_op, key)
+
+    self.assertEqual(
+        add_op,
+        signature_def_utils_impl.load_op_from_signature_def(signature_def, key))
+
+
 if __name__ == "__main__":
   test.main()
diff --git a/tensorflow/python/saved_model/simple_save_test.py b/tensorflow/python/saved_model/simple_save_test.py
index 18f82daadad6ae7142c249c66e61ea13782b33ac..21c2e9df2fae9f1d078b9ca95ffa52242b6756f7 100644
--- a/tensorflow/python/saved_model/simple_save_test.py
+++ b/tensorflow/python/saved_model/simple_save_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import os
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.saved_model import loader
@@ -33,8 +34,8 @@ class SimpleSaveTest(test.TestCase):
 
   def _init_and_validate_variable(self, sess, variable_name, variable_value):
     v = variables.Variable(variable_value, name=variable_name)
-    sess.run(variables.global_variables_initializer())
-    self.assertEqual(variable_value, v.eval())
+    self.evaluate(variables.global_variables_initializer())
+    self.assertEqual(variable_value, self.evaluate(v))
     return v
 
   def _check_variable_info(self, actual_variable, expected_variable):
@@ -53,6 +54,7 @@ class SimpleSaveTest(test.TestCase):
       self.assertEqual(actual_tensor_info.tensor_shape.dim[i].size,
                        expected_tensor.shape[i])
 
+  @test_util.run_deprecated_v1
   def testSimpleSave(self):
     """Test simple_save that uses the default parameters."""
     export_dir = os.path.join(test.get_temp_dir(),
diff --git a/tensorflow/python/saved_model/utils_impl.py b/tensorflow/python/saved_model/utils_impl.py
index 10667419761214fe1830199d86e9cf9bf577d7dd..5caabe59fec1a0819629bd9ff16ad5be19f0890a 100644
--- a/tensorflow/python/saved_model/utils_impl.py
+++ b/tensorflow/python/saved_model/utils_impl.py
@@ -141,6 +141,27 @@ def get_tensor_from_tensor_info(tensor_info, graph=None, import_scope=None):
     raise ValueError("Invalid TensorInfo.encoding: %s" % encoding)
 
 
+def get_element_from_tensor_info(tensor_info, graph=None, import_scope=None):
+  """Returns the element in the graph described by a TensorInfo proto.
+
+  Args:
+    tensor_info: A TensorInfo proto describing an Op or Tensor by name.
+    graph: The tf.Graph in which tensors are looked up. If None, the current
+      default graph is used.
+    import_scope: If not None, names in `tensor_info` are prefixed with this
+      string before lookup.
+
+  Returns:
+    Op or tensor in `graph` described by `tensor_info`.
+
+  Raises:
+    KeyError: If `tensor_info` does not correspond to an op or tensor in `graph`
+  """
+  graph = graph or ops.get_default_graph()
+  return graph.as_graph_element(
+      ops.prepend_name_scope(tensor_info.name, import_scope=import_scope))
+
+
 # Path helpers.
 
 
diff --git a/tensorflow/python/saved_model/utils_test.py b/tensorflow/python/saved_model/utils_test.py
index 0888dcb411e34b030416362663fe4e2d11899cfd..2afe8abfd646f26f0562d7cc56b82c5781a586ef 100644
--- a/tensorflow/python/saved_model/utils_test.py
+++ b/tensorflow/python/saved_model/utils_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ from tensorflow.python.saved_model import utils
 
 class UtilsTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoOp(self):
     x = constant_op.constant(1, name="x")
     y = constant_op.constant(2, name="y")
@@ -41,6 +43,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_INVALID, z_op_info.dtype)
     self.assertEqual(0, len(z_op_info.tensor_shape.dim))
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoDefunOp(self):
     @function.defun
     def my_init_fn(x, y):
@@ -54,6 +57,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(types_pb2.DT_INVALID, init_op_info.dtype)
     self.assertEqual(0, len(init_op_info.tensor_shape.dim))
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoDense(self):
     x = array_ops.placeholder(dtypes.float32, 1, name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -62,6 +66,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(1, len(x_tensor_info.tensor_shape.dim))
     self.assertEqual(1, x_tensor_info.tensor_shape.dim[0].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testBuildTensorInfoSparse(self):
     x = array_ops.sparse_placeholder(dtypes.float32, [42, 69], name="x")
     x_tensor_info = utils.build_tensor_info(x)
@@ -76,6 +81,7 @@ class UtilsTest(test.TestCase):
     self.assertEqual(42, x_tensor_info.tensor_shape.dim[0].size)
     self.assertEqual(69, x_tensor_info.tensor_shape.dim[1].size)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoDense(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -83,6 +89,7 @@ class UtilsTest(test.TestCase):
     self.assertIsInstance(actual, ops.Tensor)
     self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoSparse(self):
     expected = array_ops.sparse_placeholder(dtypes.float32, name="x")
     tensor_info = utils.build_tensor_info(expected)
@@ -122,6 +129,7 @@ class UtilsTest(test.TestCase):
                                                  import_scope="foo")
       self.assertEqual(expected.name, actual.name)
 
+  @test_util.run_v1_only("b/120545219")
   def testGetTensorFromInfoRaisesErrors(self):
     expected = array_ops.placeholder(dtypes.float32, 1, name="x")
     tensor_info = utils.build_tensor_info(expected)
diff --git a/tensorflow/python/summary/summary.py b/tensorflow/python/summary/summary.py
index 9e9e6ed90353fc6d2778c45812ee66bced6f4167..0c13016712f316e113723c4c0c250ef636a3fcf0 100644
--- a/tensorflow/python/summary/summary.py
+++ b/tensorflow/python/summary/summary.py
@@ -52,7 +52,7 @@ from tensorflow.python.util import compat as _compat
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export('summary.scalar')
+@tf_export(v1=['summary.scalar'])
 def scalar(name, tensor, collections=None, family=None):
   """Outputs a `Summary` protocol buffer containing a single scalar value.
 
@@ -82,7 +82,7 @@ def scalar(name, tensor, collections=None, family=None):
   return val
 
 
-@tf_export('summary.image')
+@tf_export(v1=['summary.image'])
 def image(name, tensor, max_outputs=3, collections=None, family=None):
   """Outputs a `Summary` protocol buffer with images.
 
@@ -138,7 +138,7 @@ def image(name, tensor, max_outputs=3, collections=None, family=None):
   return val
 
 
-@tf_export('summary.histogram')
+@tf_export(v1=['summary.histogram'])
 def histogram(name, values, collections=None, family=None):
   # pylint: disable=line-too-long
   """Outputs a `Summary` protocol buffer with a histogram.
@@ -179,7 +179,7 @@ def histogram(name, values, collections=None, family=None):
   return val
 
 
-@tf_export('summary.audio')
+@tf_export(v1=['summary.audio'])
 def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
           family=None):
   # pylint: disable=line-too-long
@@ -228,7 +228,7 @@ def audio(name, tensor, sample_rate, max_outputs=3, collections=None,
   return val
 
 
-@tf_export('summary.text')
+@tf_export(v1=['summary.text'])
 def text(name, tensor, collections=None):
   """Summarizes textual data.
 
@@ -269,7 +269,7 @@ def text(name, tensor, collections=None):
   return t_summary
 
 
-@tf_export('summary.tensor_summary')
+@tf_export(v1=['summary.tensor_summary'])
 def tensor_summary(name,
                    tensor,
                    summary_description=None,
@@ -325,7 +325,7 @@ def tensor_summary(name,
   return val
 
 
-@tf_export('summary.merge')
+@tf_export(v1=['summary.merge'])
 def merge(inputs, collections=None, name=None):
   # pylint: disable=line-too-long
   """Merges summaries.
@@ -371,7 +371,7 @@ def merge(inputs, collections=None, name=None):
   return val
 
 
-@tf_export('summary.merge_all')
+@tf_export(v1=['summary.merge_all'])
 def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
   """Merges all summaries collected in the default graph.
 
@@ -404,7 +404,7 @@ def merge_all(key=_ops.GraphKeys.SUMMARIES, scope=None, name=None):
     return merge(summary_ops, name=name)
 
 
-@tf_export('summary.get_summary_description')
+@tf_export(v1=['summary.get_summary_description'])
 def get_summary_description(node_def):
   """Given a TensorSummary node_def, retrieve its SummaryDescription.
 
diff --git a/tensorflow/python/summary/summary_test.py b/tensorflow/python/summary/summary_test.py
index cacc28cc596f9f0bb0694f7675e56d92fe1a6d6d..64f0f315c5888b9dd7d2217693962f30e77b3b08 100644
--- a/tensorflow/python/summary/summary_test.py
+++ b/tensorflow/python/summary/summary_test.py
@@ -30,6 +30,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -38,6 +39,7 @@ from tensorflow.python.summary import summary as summary_lib
 
 class SummaryTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testScalarSummary(self):
     with self.cached_session() as s:
       i = constant_op.constant(3)
@@ -51,6 +53,7 @@ class SummaryTest(test.TestCase):
     self.assertEqual(values[0].tag, 'outer/inner')
     self.assertEqual(values[0].simple_value, 3.0)
 
+  @test_util.run_deprecated_v1
   def testScalarSummaryWithFamily(self):
     with self.cached_session() as s:
       i = constant_op.constant(7)
@@ -74,6 +77,7 @@ class SummaryTest(test.TestCase):
     self.assertEqual(values[0].tag, 'family/outer/family/inner_1')
     self.assertEqual(values[0].simple_value, 7.0)
 
+  @test_util.run_deprecated_v1
   def testSummarizingVariable(self):
     with self.cached_session() as s:
       c = constant_op.constant(42.0)
@@ -89,6 +93,7 @@ class SummaryTest(test.TestCase):
     self.assertEqual(value.tag, 'summary')
     self.assertEqual(value.simple_value, 42.0)
 
+  @test_util.run_deprecated_v1
   def testImageSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -103,6 +108,7 @@ class SummaryTest(test.TestCase):
     expected = sorted('outer/inner/image/{}'.format(i) for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testImageSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 2, 3, 1))
@@ -119,6 +125,7 @@ class SummaryTest(test.TestCase):
                       for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testHistogramSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -130,6 +137,7 @@ class SummaryTest(test.TestCase):
     self.assertEqual(len(summary.value), 1)
     self.assertEqual(summary.value[0].tag, 'outer/inner')
 
+  @test_util.run_deprecated_v1
   def testHistogramSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 4, 4, 3))
@@ -148,6 +156,7 @@ class SummaryTest(test.TestCase):
       const = constant_op.constant(10, dtype=dtype)
       summary_lib.histogram('h', const)
 
+  @test_util.run_deprecated_v1
   def testAudioSummary(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
@@ -162,6 +171,7 @@ class SummaryTest(test.TestCase):
     expected = sorted('outer/inner/audio/{}'.format(i) for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testAudioSummaryWithFamily(self):
     with self.cached_session() as s:
       i = array_ops.ones((5, 3, 4))
@@ -178,6 +188,7 @@ class SummaryTest(test.TestCase):
                       for i in xrange(3))
     self.assertEqual(tags, expected)
 
+  @test_util.run_deprecated_v1
   def testTextSummary(self):
     with self.cached_session():
       with self.assertRaises(ValueError):
@@ -193,6 +204,7 @@ class SummaryTest(test.TestCase):
       summ = summary_lib.text('foo', array_ops.constant('one'))
       self.assertEqual(summ.op.type, 'TensorSummaryV2')
 
+  @test_util.run_deprecated_v1
   def testSummaryNameConversion(self):
     c = constant_op.constant(3)
     s = summary_lib.scalar('name with spaces', c)
@@ -204,6 +216,7 @@ class SummaryTest(test.TestCase):
     s3 = summary_lib.scalar('/name/with/leading/slash', c)
     self.assertEqual(s3.op.name, 'name/with/leading/slash')
 
+  @test_util.run_deprecated_v1
   def testSummaryWithFamilyMetaGraphExport(self):
     with ops.name_scope('outer'):
       i = constant_op.constant(11)
diff --git a/tensorflow/python/summary/writer/writer_test.py b/tensorflow/python/summary/writer/writer_test.py
index 09d4b63fbb61780db1aa9341cd2d98010b839989..d702ddc0a274cc22798519319220dbd37046c580 100644
--- a/tensorflow/python/summary/writer/writer_test.py
+++ b/tensorflow/python/summary/writer/writer_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import summary_ops_v2
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import test
@@ -100,6 +101,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testAddingSummaryGraphAndRunMetadata(self):
     test_dir = self._CleanTestDir("basics")
     sw = self._FileWriter(test_dir)
@@ -173,6 +175,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testGraphAsNamed(self):
     test_dir = self._CleanTestDir("basics_named_graph")
     with ops.Graph().as_default() as g:
@@ -181,6 +184,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
+  @test_util.run_deprecated_v1
   def testGraphAsPositional(self):
     test_dir = self._CleanTestDir("basics_positional_graph")
     with ops.Graph().as_default() as g:
@@ -189,6 +193,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, True)
 
+  @test_util.run_deprecated_v1
   def testGraphDefAsNamed(self):
     test_dir = self._CleanTestDir("basics_named_graph_def")
     with ops.Graph().as_default() as g:
@@ -198,6 +203,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
+  @test_util.run_deprecated_v1
   def testGraphDefAsPositional(self):
     test_dir = self._CleanTestDir("basics_positional_graph_def")
     with ops.Graph().as_default() as g:
@@ -207,6 +213,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertEventsWithGraph(test_dir, g, False)
 
+  @test_util.run_deprecated_v1
   def testGraphAndGraphDef(self):
     with self.assertRaises(ValueError):
       test_dir = self._CleanTestDir("basics_graph_and_graph_def")
@@ -216,12 +223,14 @@ class FileWriterTestCase(test.TestCase):
       sw = self._FileWriter(test_dir, graph=g, graph_def=gd)
       sw.close()
 
+  @test_util.run_deprecated_v1
   def testNeitherGraphNorGraphDef(self):
     with self.assertRaises(TypeError):
       test_dir = self._CleanTestDir("basics_string_instead_of_graph")
       sw = self._FileWriter(test_dir, "string instead of graph object")
       sw.close()
 
+  @test_util.run_deprecated_v1
   def testCloseAndReopen(self):
     test_dir = self._CleanTestDir("close_and_reopen")
     sw = self._FileWriter(test_dir)
@@ -265,6 +274,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testNonBlockingClose(self):
     test_dir = self._CleanTestDir("non_blocking_close")
     sw = self._FileWriter(test_dir)
@@ -274,6 +284,7 @@ class FileWriterTestCase(test.TestCase):
     sw.close()
     self._assertRecent(time_before_close)
 
+  @test_util.run_deprecated_v1
   def testUseAfterClose(self):
     test_dir = self._CleanTestDir("use_after_close")
     sw = self._FileWriter(test_dir)
@@ -289,6 +300,7 @@ class FileWriterTestCase(test.TestCase):
     for w in triggered:
       self.assertEqual(w.category, UserWarning)
 
+  @test_util.run_deprecated_v1
   def testWithStatement(self):
     test_dir = self._CleanTestDir("with_statement")
     with self._FileWriter(test_dir) as sw:
@@ -299,6 +311,7 @@ class FileWriterTestCase(test.TestCase):
   # Checks that values returned from session Run() calls are added correctly to
   # summaries.  These are numpy types so we need to check they fit in the
   # protocol buffers correctly.
+  @test_util.run_deprecated_v1
   def testAddingSummariesFromSessionRunCalls(self):
     test_dir = self._CleanTestDir("global_step")
     sw = self._FileWriter(test_dir)
@@ -309,12 +322,11 @@ class FileWriterTestCase(test.TestCase):
       summ = summary_pb2.Summary(
           value=[summary_pb2.Summary.Value(
               tag="i", simple_value=1.0)])
-      sw.add_summary(summ.SerializeToString(), i.eval())
+      sw.add_summary(summ.SerializeToString(), self.evaluate(i))
       sw.add_summary(
           summary_pb2.Summary(
-              value=[summary_pb2.Summary.Value(
-                  tag="l", simple_value=2.0)]),
-          l.eval())
+              value=[summary_pb2.Summary.Value(tag="l", simple_value=2.0)]),
+          self.evaluate(l))
       sw.close()
 
     rr = self._EventsReader(test_dir)
@@ -346,6 +358,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testPluginMetadataStrippedFromSubsequentEvents(self):
     test_dir = self._CleanTestDir("basics")
     sw = self._FileWriter(test_dir)
@@ -405,6 +418,7 @@ class FileWriterTestCase(test.TestCase):
     # We should be done.
     self.assertRaises(StopIteration, lambda: next(rr))
 
+  @test_util.run_deprecated_v1
   def testFileWriterWithSuffix(self):
     test_dir = self._CleanTestDir("test_suffix")
     sw = self._FileWriter(test_dir, filename_suffix="_test_suffix")
diff --git a/tensorflow/python/tf2.py b/tensorflow/python/tf2.py
index c9782a71199f73a1fc6207ea4e9568b6bac9c00a..75748f8f2c5ba2b78a2d220011e3e28e12276b62 100644
--- a/tensorflow/python/tf2.py
+++ b/tensorflow/python/tf2.py
@@ -25,6 +25,21 @@ from __future__ import print_function
 import os
 
 
+_force_enable = False
+
+
+def enable():
+  """Enables v2 behaviors."""
+  global _force_enable
+  _force_enable = True
+
+
+def disable():
+  """Disables v2 behaviors (TF2_BEHAVIOR env variable is still respected)."""
+  global _force_enable
+  _force_enable = False
+
+
 def enabled():
   """Returns True iff TensorFlow 2.0 behavior should be enabled."""
-  return os.getenv("TF2_BEHAVIOR") is not None
+  return _force_enable or os.getenv("TF2_BEHAVIOR", "0") != "0"
diff --git a/tensorflow/python/tools/BUILD b/tensorflow/python/tools/BUILD
index 384c7a82d27b786839545a6ad979e12a73ee88c1..901d6bc335f3a10439e2f02d0db2b237a89fece0 100644
--- a/tensorflow/python/tools/BUILD
+++ b/tensorflow/python/tools/BUILD
@@ -29,6 +29,8 @@ py_library(
         ":optimize_for_inference_lib",
         ":selective_registration_header_lib",
         ":strip_unused_lib",
+        # Include the TF upgrade script to users can run it directly after install TF
+        "//tensorflow/tools/compatibility:tf_upgrade_v2",
     ],
 )
 
diff --git a/tensorflow/python/tools/api/generator/api_gen.bzl b/tensorflow/python/tools/api/generator/api_gen.bzl
index 2e5d875a58ae4af1fb164694f925383b0d952fc3..5e64cc64d2408fa459b6daa0c9134793bd9d5327 100644
--- a/tensorflow/python/tools/api/generator/api_gen.bzl
+++ b/tensorflow/python/tools/api/generator/api_gen.bzl
@@ -20,7 +20,8 @@ def gen_api_init_files(
         packages = ["tensorflow.python", "tensorflow.lite.python.lite"],
         package_deps = ["//tensorflow/python:no_contrib"],
         output_package = "tensorflow",
-        output_dir = ""):
+        output_dir = "",
+        root_file_name = "__init__.py"):
     """Creates API directory structure and __init__.py files.
 
     Creates a genrule that generates a directory structure with __init__.py
@@ -54,13 +55,14 @@ def gen_api_init_files(
       output_package: Package where generated API will be added to.
       output_dir: Subdirectory to output API to.
         If non-empty, must end with '/'.
+      root_file_name: Name of the root file with all the root imports.
     """
     root_init_template_flag = ""
     if root_init_template:
         root_init_template_flag = "--root_init_template=$(location " + root_init_template + ")"
 
     primary_package = packages[0]
-    api_gen_binary_target = ("create_" + primary_package + "_api_%d") % api_version
+    api_gen_binary_target = ("create_" + primary_package + "_api_%d_%s") % (api_version, name)
     native.py_binary(
         name = api_gen_binary_target,
         srcs = ["//tensorflow/python/tools/api/generator:create_python_api.py"],
@@ -73,6 +75,11 @@ def gen_api_init_files(
         ],
     )
 
+    # Replace name of root file with root_file_name.
+    output_files = [
+        root_file_name if f == "__init__.py" else f
+        for f in output_files
+    ]
     all_output_files = ["%s%s" % (output_dir, f) for f in output_files]
     compat_api_version_flags = ""
     for compat_api_version in compat_api_versions:
diff --git a/tensorflow/python/tools/api/generator/api_init_files.bzl b/tensorflow/python/tools/api/generator/api_init_files.bzl
index 5699d86e6d5ca20b75a50057584d49a651e8a12a..0245ac50a65a99a4e93733de17d680fe816e7db1 100644
--- a/tensorflow/python/tools/api/generator/api_init_files.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files.bzl
@@ -4,17 +4,17 @@
 TENSORFLOW_API_INIT_FILES = [
     # BEGIN GENERATED FILES
     "__init__.py",
-    "app/__init__.py",
     "bitwise/__init__.py",
     "compat/__init__.py",
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
+    "distribute/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
     "experimental/__init__.py",
     "feature_column/__init__.py",
-    "gfile/__init__.py",
+    "io/gfile/__init__.py",
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
@@ -62,20 +62,16 @@ TENSORFLOW_API_INIT_FILES = [
     "linalg/__init__.py",
     "lite/__init__.py",
     "lite/constants/__init__.py",
-    "logging/__init__.py",
     "losses/__init__.py",
     "math/__init__.py",
-    "metrics/__init__.py",
     "nn/__init__.py",
     "nn/rnn_cell/__init__.py",
-    "profiler/__init__.py",
     "quantization/__init__.py",
     "random/__init__.py",
     "saved_model/__init__.py",
     "sets/__init__.py",
     "signal/__init__.py",
     "sparse/__init__.py",
-    "spectral/__init__.py",
     "strings/__init__.py",
     "summary/__init__.py",
     "sysconfig/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
index 89c817f60907175bce33cb30667ba81cefc32a70..e35b9c43740d4e59e9478cca978b15c7451ac96e 100644
--- a/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
+++ b/tensorflow/python/tools/api/generator/api_init_files_v1.bzl
@@ -10,12 +10,14 @@ TENSORFLOW_API_INIT_FILES_V1 = [
     "data/__init__.py",
     "data/experimental/__init__.py",
     "debugging/__init__.py",
+    "distribute/__init__.py",
     "distributions/__init__.py",
     "dtypes/__init__.py",
     "errors/__init__.py",
     "experimental/__init__.py",
     "feature_column/__init__.py",
     "gfile/__init__.py",
+    "io/gfile/__init__.py",
     "graph_util/__init__.py",
     "image/__init__.py",
     "io/__init__.py",
diff --git a/tensorflow/python/tools/api/generator/create_python_api.py b/tensorflow/python/tools/api/generator/create_python_api.py
index f62580342136938d847af1b48ed62856fc8c522e..51c2bfba7c13dee1c321f157fda3e221726f79b8 100644
--- a/tensorflow/python/tools/api/generator/create_python_api.py
+++ b/tensorflow/python/tools/api/generator/create_python_api.py
@@ -45,10 +45,10 @@ _GENERATED_FILE_HEADER = """# This file is MACHINE GENERATED! Do not edit.
 \"\"\"%s
 \"\"\"
 
-from __future__ import print_function
+from __future__ import print_function as _print_function
 
 """
-_GENERATED_FILE_FOOTER = '\n\ndel print_function\n'
+_GENERATED_FILE_FOOTER = '\n\ndel _print_function\n'
 
 
 class SymbolExposedTwiceError(Exception):
@@ -463,8 +463,9 @@ def create_api_files(output_files, packages, root_init_template, output_dir,
     raise ValueError(
         """Missing outputs for genrule:\n%s. Be sure to add these targets to
 tensorflow/python/tools/api/generator/api_init_files_v1.bzl and
-tensorflow/python/tools/api/generator/api_init_files.bzl""" % ',\n'.join(
-    sorted(missing_output_files)))
+tensorflow/python/tools/api/generator/api_init_files.bzl (tensorflow repo), or
+tensorflow_estimator/python/estimator/api/api_gen.bzl (estimator repo)"""
+        % ',\n'.join(sorted(missing_output_files)))
 
 
 def main():
diff --git a/tensorflow/python/tools/api/generator/doc_srcs.py b/tensorflow/python/tools/api/generator/doc_srcs.py
index 479d5006d1ed808c14ad91c639565de7d154bfef..abb5886deb3d9dd2e6981ee5822b0323a87eef1d 100644
--- a/tensorflow/python/tools/api/generator/doc_srcs.py
+++ b/tensorflow/python/tools/api/generator/doc_srcs.py
@@ -35,10 +35,11 @@ DocSource.__new__.__defaults__ = (None,) * len(DocSource._fields)
 
 _TENSORFLOW_DOC_SOURCES = {
     'app': DocSource(docstring_module_name='platform.app'),
+    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'compat': DocSource(docstring_module_name='util.compat'),
+    'distribute': DocSource(docstring_module_name='distribute.distribute_lib'),
     'distributions': DocSource(
         docstring_module_name='ops.distributions.distributions'),
-    'bitwise': DocSource(docstring_module_name='ops.bitwise_ops'),
     'errors': DocSource(docstring_module_name='framework.errors'),
     'gfile': DocSource(docstring_module_name='platform.gfile'),
     'graph_util': DocSource(docstring_module_name='framework.graph_util'),
@@ -56,9 +57,8 @@ _TENSORFLOW_DOC_SOURCES = {
     'resource_loader': DocSource(
         docstring_module_name='platform.resource_loader'),
     'sets': DocSource(docstring_module_name='ops.sets'),
-    'signal': DocSource(docstring_module_name='ops.signal'),
+    'signal': DocSource(docstring_module_name='ops.signal.signal'),
     'sparse': DocSource(docstring_module_name='ops.sparse_ops'),
-    'spectral': DocSource(docstring_module_name='ops.spectral_ops'),
     'strings': DocSource(docstring_module_name='ops.string_ops'),
     'sysconfig': DocSource(docstring_module_name='platform.sysconfig'),
     'test': DocSource(docstring_module_name='platform.test'),
diff --git a/tensorflow/python/tools/freeze_graph_test.py b/tensorflow/python/tools/freeze_graph_test.py
index 5dc14a6961e1ba7f1c317519a2d3b63eacba2220..de2672db3c4c4e6b94d3803767a749a943910d2c 100644
--- a/tensorflow/python/tools/freeze_graph_test.py
+++ b/tensorflow/python/tools/freeze_graph_test.py
@@ -161,9 +161,11 @@ class FreezeGraphTest(test_util.TensorFlowTestCase):
             },)
         builder.save(as_text=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV1(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V1)
 
+  @test_util.run_v1_only("b/120545219")
   def testFreezeGraphV2(self):
     self._testFreezeGraph(saver_pb2.SaverDef.V2)
 
diff --git a/tensorflow/python/tools/inspect_checkpoint.py b/tensorflow/python/tools/inspect_checkpoint.py
index 6504fbc10755c5c543016b8d56d6d53f3311b249..ea1f6aa55553f0d35e526557ca114f9929b8af7d 100644
--- a/tensorflow/python/tools/inspect_checkpoint.py
+++ b/tensorflow/python/tools/inspect_checkpoint.py
@@ -63,7 +63,7 @@ def print_tensors_in_checkpoint_file(file_name, tensor_name, all_tensors,
       print("It's likely that your checkpoint file has been compressed "
             "with SNAPPY.")
     if ("Data loss" in str(e) and
-        (any([e in file_name for e in [".index", ".meta", ".data"]]))):
+        any(e in file_name for e in [".index", ".meta", ".data"])):
       proposed_file = ".".join(file_name.split(".")[0:-1])
       v2_file_error_template = """
 It's likely that this is a V2 checkpoint and you need to provide the filename
diff --git a/tensorflow/python/tools/optimize_for_inference_test.py b/tensorflow/python/tools/optimize_for_inference_test.py
index 10bfb0dc70613533a71551d13d7abd7020407e19..310776ff1b06a9d210e271b7c31ee6e00903da84 100644
--- a/tensorflow/python/tools/optimize_for_inference_test.py
+++ b/tensorflow/python/tools/optimize_for_inference_test.py
@@ -128,6 +128,7 @@ class OptimizeForInferenceTest(test.TestCase):
         graph_def, [], [add_name], dtypes.float32.as_datatype_enum)
     self.assertProtoEquals(expected_output, output)
 
+  @test_util.run_deprecated_v1
   def testFoldBatchNorms(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -171,6 +172,7 @@ class OptimizeForInferenceTest(test.TestCase):
     for node in optimized_graph_def.node:
       self.assertNotEqual("BatchNormWithGlobalNormalization", node.op)
 
+  @test_util.run_deprecated_v1
   def testFoldFusedBatchNorms(self):
     for data_format, use_gpu in [("NHWC", False), ("NCHW", True)]:
       with self.cached_session(use_gpu=use_gpu) as sess:
@@ -222,6 +224,7 @@ class OptimizeForInferenceTest(test.TestCase):
       for node in optimized_graph_def.node:
         self.assertNotEqual("FusedBatchNorm", node.op)
 
+  @test_util.run_deprecated_v1
   def testFuseResizePadAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -253,6 +256,7 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("MirrorPad", node.op)
       self.assertNotEqual("ResizeBilinear", node.op)
 
+  @test_util.run_deprecated_v1
   def testFuseResizeAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
@@ -282,6 +286,7 @@ class OptimizeForInferenceTest(test.TestCase):
       self.assertNotEqual("MirrorPad", node.op)
 
 
+  @test_util.run_deprecated_v1
   def testFusePadAndConv(self):
     with self.cached_session() as sess:
       inputs = [1, 4, 2, 5, 3, 6, -1, -4, -2, -5, -3, -6]
diff --git a/tensorflow/python/tools/strip_unused_test.py b/tensorflow/python/tools/strip_unused_test.py
index 7cf0c3e3ed9b5748b263913566150eff8acf857a..e906ff94ba8c0ad5ebb5014f244b0ef128d23a7a 100644
--- a/tensorflow/python/tools/strip_unused_test.py
+++ b/tensorflow/python/tools/strip_unused_test.py
@@ -50,7 +50,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           wanted_input_node, 2.0, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(-4.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
@@ -113,7 +113,7 @@ class StripUnusedTest(test_util.TensorFlowTestCase):
           input_node1, input_node2, name="output_node")
       math_ops.add(output_node, 2.0, name="later_node")
       sess = session.Session()
-      output = sess.run(output_node)
+      output = self.evaluate(output_node)
       self.assertNear(6.0, output, 0.00001)
       graph_io.write_graph(sess.graph, self.get_temp_dir(), input_graph_name)
 
diff --git a/tensorflow/python/training/adadelta.py b/tensorflow/python/training/adadelta.py
index 95eca76496992f7ac66643a4c94d7e9e812cecf8..dd210160004760f1fe8cde945c6a728a530ebf33 100644
--- a/tensorflow/python/training/adadelta.py
+++ b/tensorflow/python/training/adadelta.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdadeltaOptimizer")
+@tf_export(v1=["train.AdadeltaOptimizer"])
 class AdadeltaOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adadelta algorithm.
 
diff --git a/tensorflow/python/training/adadelta_test.py b/tensorflow/python/training/adadelta_test.py
index a14ac895ac096e351cad91aa8a53ca0026b18c9d..0e5af5a92224a5c3a54cc45eef11cf728c78945c 100644
--- a/tensorflow/python/training/adadelta_test.py
+++ b/tensorflow/python/training/adadelta_test.py
@@ -166,6 +166,7 @@ class AdadeltaOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_resource=True, use_callable_params=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -177,12 +178,11 @@ class AdadeltaOptimizerTest(test.TestCase):
             1.0, 1.0, 1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval())
+        self.assertAllCloseAccordingToType([[-111, -138]], self.evaluate(var0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/adagrad.py b/tensorflow/python/training/adagrad.py
index cc0da26b2792bde3dec5fdbbafdd069eef1d81d7..10c043bae175d1da60f54a31caff37329641d86b 100644
--- a/tensorflow/python/training/adagrad.py
+++ b/tensorflow/python/training/adagrad.py
@@ -28,7 +28,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdagradOptimizer")
+@tf_export(v1=["train.AdagradOptimizer"])
 class AdagradOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adagrad algorithm.
 
diff --git a/tensorflow/python/training/adagrad_da.py b/tensorflow/python/training/adagrad_da.py
index 5ba403554f570d9df33a5d525a40de2eb0d11138..e23b7134b3bb609b4a217c68e2bc30caee7b0f8a 100644
--- a/tensorflow/python/training/adagrad_da.py
+++ b/tensorflow/python/training/adagrad_da.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdagradDAOptimizer")
+@tf_export(v1=["train.AdagradDAOptimizer"])
 class AdagradDAOptimizer(optimizer.Optimizer):
   """Adagrad Dual Averaging algorithm for sparse linear models.
 
diff --git a/tensorflow/python/training/adagrad_da_test.py b/tensorflow/python/training/adagrad_da_test.py
index 00801be3b4da878619cac753707b088352afe803..aacfe6faf4eff2b334197d86794380a273bcbb5e 100644
--- a/tensorflow/python/training/adagrad_da_test.py
+++ b/tensorflow/python/training/adagrad_da_test.py
@@ -22,6 +22,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -54,14 +55,14 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllClose([0.0, 0.0], v0_val)
         self.assertAllClose([0.0, 0.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         # Let g to be gradient accumulator, gg to be gradient squared
         # accumulator, T be the global step, lr is the learning rate, and k the
         # initial gradient squared accumulator value.
@@ -73,12 +74,15 @@ class AdagradDAOptimizerTest(test.TestCase):
         self.assertAllCloseAccordingToType(
             np.array([-0.094821, -0.189358]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithoutRegularizationBasic1(self):
     self.doTestAdagradDAwithoutRegularizationBasic1()
 
+  @test_util.run_deprecated_v1
   def testResourceAdagradDAWithoutRegularizationBasic1(self):
     self.doTestAdagradDAwithoutRegularizationBasic1(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -92,13 +96,15 @@ class AdagradDAOptimizerTest(test.TestCase):
             1.0, global_step).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-1, -1]], var0.eval(), rtol=0.01)
+        self.assertAllCloseAccordingToType([[-1, -1]],
+                                           self.evaluate(var0),
+                                           rtol=0.01)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAwithoutRegularizationBasic2(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -118,19 +124,20 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.904534, -1.603567]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.094821, -0.189358]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithL1(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -150,19 +157,20 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.895489, -1.59555]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.085339, -0.17989]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testAdagradDAWithL1_L2(self):
     for dtype in [dtypes.float64, dtypes.float32]:
       with self.cached_session() as sess:
@@ -182,14 +190,14 @@ class AdagradDAOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run a step of AdagradDA
         update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.046907, -0.093659]), v0_val)
         self.assertAllCloseAccordingToType(
diff --git a/tensorflow/python/training/adagrad_test.py b/tensorflow/python/training/adagrad_test.py
index 7caf01f64d5e1cf7a4084444721aff9c55a9fb0b..1e2d29b337338985fb8ac27ab11d65667d22ee21 100644
--- a/tensorflow/python/training/adagrad_test.py
+++ b/tensorflow/python/training/adagrad_test.py
@@ -96,6 +96,7 @@ class AdagradOptimizerTest(test.TestCase):
   def testBasicLocked(self):
     self.doTestBasic(use_locking=True)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -107,14 +108,16 @@ class AdagradOptimizerTest(test.TestCase):
         sgd_op = adagrad.AdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType(
-            [[1.0, 2.0], [3.0, 4.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0], [3.0, 4.0]],
+                                           self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1], [3, 4]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1], [3, 4]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -128,17 +131,20 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 3 steps of adagrad
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -159,17 +165,18 @@ class AdagradOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([[1.0], [2.0]], var0.eval())
-        self.assertAllClose([[3.0], [4.0]], var1.eval())
+        self.assertAllClose([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllClose([[3.0], [4.0]], self.evaluate(var1))
         # Run 3 step of sgd
         for _ in range(3):
           ada_update.run()
         # Validate updated params
         self.assertAllCloseAccordingToType(
-            np.array([[-1.6026098728179932], [2.0]]), var0.eval())
+            np.array([[-1.6026098728179932], [2.0]]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([[3.0], [3.715679168701172]]), var1.eval())
+            np.array([[3.0], [3.715679168701172]]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -193,13 +200,14 @@ class AdagradOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndicesResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -217,13 +225,14 @@ class AdagradOptimizerTest(test.TestCase):
             2.0).minimize(loss_aggregated)
         variables.global_variables_initializer().run()
         self.assertAllCloseAccordingToType(
-            var_repeated.eval(), var_aggregated.eval())
+            self.evaluate(var_repeated), self.evaluate(var_aggregated))
         for _ in range(3):
           update_op_repeated.run()
           update_op_aggregated.run()
           self.assertAllCloseAccordingToType(
-              var_repeated.eval(), var_aggregated.eval())
+              self.evaluate(var_repeated), self.evaluate(var_aggregated))
 
+  @test_util.run_deprecated_v1
   def testSparseStability(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -253,13 +262,14 @@ class AdagradOptimizerTest(test.TestCase):
           init.run()
           ada_update.run()
           self.assertAllCloseAccordingToType(
-              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), slot0.eval())
+              np.array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1]]), self.evaluate(slot0))
           self.assertAllCloseAccordingToType(
               np.array([[
                   0.00891194, -0.10712013, 0.11047515, 0.22636929, -0.0144573,
                   -0.01029443
-              ]]), var0.eval())
+              ]]), self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -282,18 +292,21 @@ class AdagradOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values.
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Mix the first and the second adagrad for 3 steps.
         ada_update1.run()
         ada_update2.run()
         ada_update1.run()
         # Validate updated params (the same as with only 1 Adagrad).
         self.assertAllCloseAccordingToType(
-            np.array([-1.6026098728179932, -0.6026098728179932]), var0.eval())
+            np.array([-1.6026098728179932, -0.6026098728179932]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([2.715679168701172, 3.715679168701172]), var1.eval())
+            np.array([2.715679168701172, 3.715679168701172]),
+            self.evaluate(var1))
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariable_Ok(self):
     with self.cached_session():
       v = variable_scope.get_variable("v", initializer=constant_op.constant(1.),
@@ -302,6 +315,7 @@ class AdagradOptimizerTest(test.TestCase):
       # Creating optimizer should cause no exception.
       adagrad.AdagradOptimizer(3.0, initial_accumulator_value=0.1)
 
+  @test_util.run_v1_only("b/120545219")
   def testDynamicShapeVariableWithCallableInit(self):
     var0 = variable_scope.get_variable("var0",
                                        initializer=constant_op.constant(1.),
diff --git a/tensorflow/python/training/adam.py b/tensorflow/python/training/adam.py
index 704ad6d3fe8a03b74012d260a54c64da67a1b0a3..0c701f47122caf7ae561ddfa84b98925226930e0 100644
--- a/tensorflow/python/training/adam.py
+++ b/tensorflow/python/training/adam.py
@@ -29,7 +29,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.AdamOptimizer")
+@tf_export(v1=["train.AdamOptimizer"])
 class AdamOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Adam algorithm.
 
diff --git a/tensorflow/python/training/adam_test.py b/tensorflow/python/training/adam_test.py
index 0d42cc7b9c690d9c5582bc6282739b7abb4739c1..b0bae275773cf05b4e6233706b60f60ca13c9ac0 100644
--- a/tensorflow/python/training/adam_test.py
+++ b/tensorflow/python/training/adam_test.py
@@ -83,30 +83,34 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     self.doTestSparse(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceSparse(self):
     self.doTestSparse(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testSparseDevicePlacement(self):
     for index_dtype in [dtypes.int32, dtypes.int64]:
       with self.cached_session(force_gpu=test.is_gpu_available()):
@@ -120,6 +124,7 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
         minimize_op.run()
 
+  @test_util.run_deprecated_v1
   def testSparseRepeatedIndices(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -143,12 +148,12 @@ class AdamOptimizerTest(test.TestCase):
             [(grad_aggregated, aggregated_update_var)])
         variables.global_variables_initializer().run()
         self.assertAllClose(aggregated_update_var.eval(),
-                            repeated_index_update_var.eval())
+                            self.evaluate(repeated_index_update_var))
         for _ in range(3):
           repeated_update.run()
           aggregated_update.run()
           self.assertAllClose(aggregated_update_var.eval(),
-                              repeated_index_update_var.eval())
+                              self.evaluate(repeated_index_update_var))
 
   def doTestBasic(self, use_resource=False, use_callable_params=False):
     for i, dtype in enumerate([dtypes.half, dtypes.float32, dtypes.float64]):
@@ -235,6 +240,7 @@ class AdamOptimizerTest(test.TestCase):
     with context.eager_mode():
       self.doTestBasic(use_resource=True, use_callable_params=True)
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -254,24 +260,26 @@ class AdamOptimizerTest(test.TestCase):
         variables.global_variables_initializer().run()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Run 3 steps of Adam
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           update.run()
 
           var0_np, m0, v0 = adam_update_numpy(var0_np, grads0_np, t, m0, v0)
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -294,13 +302,14 @@ class AdamOptimizerTest(test.TestCase):
         beta1_power, beta2_power = opt._get_beta_accumulators()
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 3 steps of intertwined Adam1 and Adam2.
         for t in range(1, 4):
-          self.assertAllCloseAccordingToType(0.9**t, beta1_power.eval())
-          self.assertAllCloseAccordingToType(0.999**t, beta2_power.eval())
+          self.assertAllCloseAccordingToType(0.9**t, self.evaluate(beta1_power))
+          self.assertAllCloseAccordingToType(0.999**t,
+                                             self.evaluate(beta2_power))
           if t % 2 == 0:
             update1.run()
           else:
@@ -310,8 +319,8 @@ class AdamOptimizerTest(test.TestCase):
           var1_np, m1, v1 = adam_update_numpy(var1_np, grads1_np, t, m1, v1)
 
           # Validate updated params
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
 
   def testTwoSessions(self):
     optimizer = adam.AdamOptimizer()
diff --git a/tensorflow/python/training/basic_loops_test.py b/tensorflow/python/training/basic_loops_test.py
index 5f5718e64a6c356e9fd4207c6a71a5b2628e3cb9..511a8334d56e60308c25927f47e3485d49b75dc6 100644
--- a/tensorflow/python/training/basic_loops_test.py
+++ b/tensorflow/python/training/basic_loops_test.py
@@ -23,6 +23,7 @@ import shutil
 
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import basic_loops
 from tensorflow.python.training import supervisor
@@ -37,6 +38,7 @@ def _test_dir(test_name):
 
 class BasicTrainLoopTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoop(self):
     logdir = _test_dir("basic_train_loop")
     sv = supervisor.Supervisor(logdir=logdir)
@@ -55,6 +57,7 @@ class BasicTrainLoopTest(test.TestCase):
           sv, train_fn, args=(sv, "y"), kwargs={"a": "A"})
       self.assertEqual(3, num_calls[0])
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoopExceptionAborts(self):
     logdir = _test_dir("basic_train_loop_exception_aborts")
     sv = supervisor.Supervisor(logdir=logdir)
@@ -71,6 +74,7 @@ class BasicTrainLoopTest(test.TestCase):
       with self.assertRaisesRegexp(RuntimeError, "Failed"):
         basic_loops.basic_train_loop(sv, train_fn)
 
+  @test_util.run_deprecated_v1
   def testBasicTrainLoopRetryOnAborted(self):
     logdir = _test_dir("basic_train_loop_exception_aborts")
     sv = supervisor.Supervisor(logdir=logdir)
diff --git a/tensorflow/python/training/basic_session_run_hooks.py b/tensorflow/python/training/basic_session_run_hooks.py
index 1efabcd854d7f72c51e39dcf1f5ce65b0168cbcc..86718ab45fc539d6c7d90878860ca510cda31e47 100644
--- a/tensorflow/python/training/basic_session_run_hooks.py
+++ b/tensorflow/python/training/basic_session_run_hooks.py
@@ -83,7 +83,7 @@ class _HookTimer(object):
     raise NotImplementedError
 
 
-@tf_export("train.SecondOrStepTimer")
+@tf_export(v1=["train.SecondOrStepTimer"])
 class SecondOrStepTimer(_HookTimer):
   """Timer that triggers at most once every N seconds or once every N steps.
   """
@@ -163,7 +163,7 @@ class NeverTriggerTimer(_HookTimer):
     return None
 
 
-@tf_export("train.LoggingTensorHook")
+@tf_export(v1=["train.LoggingTensorHook"])
 class LoggingTensorHook(session_run_hook.SessionRunHook):
   """Prints the given tensors every N local steps, every N seconds, or at end.
 
@@ -373,7 +373,7 @@ class _MultiStepStopAtStepHook(session_run_hook.SessionRunHook):
       self._update_steps_per_run_variable(global_step, run_context.session)
 
 
-@tf_export("train.StopAtStepHook")
+@tf_export(v1=["train.StopAtStepHook"])
 class StopAtStepHook(session_run_hook.SessionRunHook):
   """Hook that requests stop at a specified step."""
 
@@ -429,7 +429,7 @@ class StopAtStepHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
-@tf_export("train.CheckpointSaverListener")
+@tf_export(v1=["train.CheckpointSaverListener"])
 class CheckpointSaverListener(object):
   """Interface for listeners that take action before or after checkpoint save.
 
@@ -495,7 +495,7 @@ class CheckpointSaverListener(object):
     pass
 
 
-@tf_export("train.CheckpointSaverHook")
+@tf_export(v1=["train.CheckpointSaverHook"])
 class CheckpointSaverHook(session_run_hook.SessionRunHook):
   """Saves checkpoints every N steps or seconds."""
 
@@ -634,7 +634,7 @@ class CheckpointSaverHook(session_run_hook.SessionRunHook):
     return savers[0]
 
 
-@tf_export("train.StepCounterHook")
+@tf_export(v1=["train.StepCounterHook"])
 class StepCounterHook(session_run_hook.SessionRunHook):
   """Hook that counts steps per second."""
 
@@ -718,14 +718,14 @@ class StepCounterHook(session_run_hook.SessionRunHook):
     self._last_global_step = stale_global_step
 
 
-@tf_export("train.NanLossDuringTrainingError")
+@tf_export(v1=["train.NanLossDuringTrainingError"])
 class NanLossDuringTrainingError(RuntimeError):
 
   def __str__(self):
     return "NaN loss during training."
 
 
-@tf_export("train.NanTensorHook")
+@tf_export(v1=["train.NanTensorHook"])
 class NanTensorHook(session_run_hook.SessionRunHook):
   """Monitors the loss tensor and stops training if loss is NaN.
 
@@ -757,7 +757,7 @@ class NanTensorHook(session_run_hook.SessionRunHook):
         run_context.request_stop()
 
 
-@tf_export("train.SummarySaverHook")
+@tf_export(v1=["train.SummarySaverHook"])
 class SummarySaverHook(session_run_hook.SessionRunHook):
   """Saves summaries every N steps."""
 
@@ -866,7 +866,7 @@ class SummarySaverHook(session_run_hook.SessionRunHook):
     return summary_op
 
 
-@tf_export("train.GlobalStepWaiterHook")
+@tf_export(v1=["train.GlobalStepWaiterHook"])
 class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
   """Delays execution until global step reaches `wait_until_step`.
 
@@ -914,7 +914,7 @@ class GlobalStepWaiterHook(session_run_hook.SessionRunHook):
       time.sleep(0.5)
 
 
-@tf_export("train.FinalOpsHook")
+@tf_export(v1=["train.FinalOpsHook"])
 class FinalOpsHook(session_run_hook.SessionRunHook):
   """A hook which evaluates `Tensors` at the end of a session."""
 
@@ -958,7 +958,7 @@ class FinalOpsHook(session_run_hook.SessionRunHook):
         raise e
 
 
-@tf_export("train.FeedFnHook")
+@tf_export(v1=["train.FeedFnHook"])
 class FeedFnHook(session_run_hook.SessionRunHook):
   """Runs `feed_fn` and sets the `feed_dict` accordingly."""
 
@@ -976,7 +976,7 @@ class FeedFnHook(session_run_hook.SessionRunHook):
         fetches=None, feed_dict=self.feed_fn())
 
 
-@tf_export("train.ProfilerHook")
+@tf_export(v1=["train.ProfilerHook"])
 class ProfilerHook(session_run_hook.SessionRunHook):
   """Captures CPU/GPU profiling information every N steps or seconds.
 
diff --git a/tensorflow/python/training/basic_session_run_hooks_test.py b/tensorflow/python/training/basic_session_run_hooks_test.py
index 2d469634e0ec99d71e244eb85c8f493759c79738..1af27626ba764b0bf4a2787e492983a72c1491e9 100644
--- a/tensorflow/python/training/basic_session_run_hooks_test.py
+++ b/tensorflow/python/training/basic_session_run_hooks_test.py
@@ -22,7 +22,6 @@ from __future__ import print_function
 import os.path
 import shutil
 import tempfile
-import threading
 import time
 
 from tensorflow.contrib.framework.python.framework import checkpoint_utils
@@ -35,6 +34,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import state_ops
@@ -52,6 +52,11 @@ from tensorflow.python.training import session_run_hook
 from tensorflow.python.training import training_util
 
 
+# Provide a realistic start time for unit tests where we need to mock out
+# calls to time.time().
+MOCK_START_TIME = 1484695987.209386
+
+
 class MockCheckpointSaverListener(
     basic_session_run_hooks.CheckpointSaverListener):
 
@@ -87,15 +92,19 @@ class MockCheckpointSaverListener(
 
 class SecondOrStepTimerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SecondOrStepTimer(every_secs=2.0, every_steps=10)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.SecondOrStepTimer()
 
-  def test_every_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_every_secs(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     timer = basic_session_run_hooks.SecondOrStepTimer(every_secs=1.0)
     self.assertTrue(timer.should_trigger_for_step(1))
 
@@ -103,7 +112,7 @@ class SecondOrStepTimerTest(test.TestCase):
     self.assertFalse(timer.should_trigger_for_step(1))
     self.assertFalse(timer.should_trigger_for_step(2))
 
-    time.sleep(1.0)
+    mock_time.return_value += 1.0
     self.assertFalse(timer.should_trigger_for_step(1))
     self.assertTrue(timer.should_trigger_for_step(2))
 
@@ -243,7 +252,7 @@ class LoggingTensorHookTest(test.TestCase):
           tensors=[t.name], at_end=True)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       self.logged_message = ''
       for _ in range(3):
         mon_sess.run(train_op)
@@ -261,7 +270,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_iter=10, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
     for _ in range(3):
@@ -308,13 +317,13 @@ class LoggingTensorHookTest(test.TestCase):
           tensors={'foo': t}, every_n_iter=1)
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertRegexpMatches(str(self.logged_message), 'foo')
       # in first run, elapsed time is None.
       self.assertEqual(str(self.logged_message).find('sec'), -1)
 
-  def _validate_print_every_n_secs(self, sess, at_end):
+  def _validate_print_every_n_secs(self, sess, at_end, mock_time):
     t = constant_op.constant(42.0, name='foo')
     train_op = constant_op.constant(3)
 
@@ -322,7 +331,7 @@ class LoggingTensorHookTest(test.TestCase):
         tensors=[t.name], every_n_secs=1.0, at_end=at_end)
     hook.begin()
     mon_sess = monitored_session._HookedSession(sess, [hook])
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
 
     mon_sess.run(train_op)
     self.assertRegexpMatches(str(self.logged_message), t.name)
@@ -331,7 +340,7 @@ class LoggingTensorHookTest(test.TestCase):
     self.logged_message = ''
     mon_sess.run(train_op)
     self.assertEqual(str(self.logged_message).find(t.name), -1)
-    time.sleep(1.0)
+    mock_time.return_value += 1.0
 
     self.logged_message = ''
     mon_sess.run(train_op)
@@ -345,17 +354,21 @@ class LoggingTensorHookTest(test.TestCase):
       # assertNotRegexpMatches is not supported by python 3.1 and later
       self.assertEqual(str(self.logged_message).find(t.name), -1)
 
-  def test_print_every_n_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_print_every_n_secs(self, mock_time):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      self._validate_print_every_n_secs(sess, at_end=False)
+      mock_time.return_value = MOCK_START_TIME
+      self._validate_print_every_n_secs(sess, at_end=False, mock_time=mock_time)
       # Verify proper reset.
-      self._validate_print_every_n_secs(sess, at_end=False)
+      self._validate_print_every_n_secs(sess, at_end=False, mock_time=mock_time)
 
-  def test_print_every_n_secs_and_end(self):
+  @test.mock.patch.object(time, 'time')
+  def test_print_every_n_secs_and_end(self, mock_time):
     with ops.Graph().as_default(), session_lib.Session() as sess:
-      self._validate_print_every_n_secs(sess, at_end=True)
+      mock_time.return_value = MOCK_START_TIME
+      self._validate_print_every_n_secs(sess, at_end=True, mock_time=mock_time)
       # Verify proper reset.
-      self._validate_print_every_n_secs(sess, at_end=True)
+      self._validate_print_every_n_secs(sess, at_end=True, mock_time=mock_time)
 
   def test_print_formatter(self):
     with ops.Graph().as_default(), session_lib.Session() as sess:
@@ -366,7 +379,7 @@ class LoggingTensorHookTest(test.TestCase):
           formatter=lambda items: 'qqq=%s' % items[t.name])
       hook.begin()
       mon_sess = monitored_session._HookedSession(sess, [hook])
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess.run(train_op)
       self.assertEqual(self.logged_message[0], 'qqq=42.0')
 
@@ -403,11 +416,13 @@ class CheckpointSaverHookTest(test.TestCase):
       basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, saver=self.scaffold.saver, scaffold=self.scaffold)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, save_secs=10, save_steps=20)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.CheckpointSaverHook(self.model_dir)
@@ -562,11 +577,8 @@ class CheckpointSaverHookTest(test.TestCase):
 
   @test.mock.patch.object(time, 'time')
   def test_save_secs_saves_periodically(self, mock_time):
-    # Let's have a realistic start time
-    current_time = 1484695987.209386
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       hook = basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir, save_secs=2, scaffold=self.scaffold)
       hook.begin()
@@ -576,10 +588,10 @@ class CheckpointSaverHookTest(test.TestCase):
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
 
-        mock_time.return_value = current_time
+        mock_time.return_value = MOCK_START_TIME
         mon_sess.run(self.train_op)  # Saved.
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)  # Not saved.
 
         self.assertEqual(1,
@@ -587,13 +599,13 @@ class CheckpointSaverHookTest(test.TestCase):
                                                         self.global_step.name))
 
         # Simulate 2.5 seconds of sleep.
-        mock_time.return_value = current_time + 2.5
+        mock_time.return_value = MOCK_START_TIME + 2.5
         mon_sess.run(self.train_op)  # Saved.
 
-        mock_time.return_value = current_time + 2.6
+        mock_time.return_value = MOCK_START_TIME + 2.6
         mon_sess.run(self.train_op)  # Not saved.
 
-        mock_time.return_value = current_time + 2.7
+        mock_time.return_value = MOCK_START_TIME + 2.7
         mon_sess.run(self.train_op)  # Not saved.
 
         self.assertEqual(3,
@@ -601,7 +613,7 @@ class CheckpointSaverHookTest(test.TestCase):
                                                         self.global_step.name))
 
         # Simulate 7.5 more seconds of sleep (10 seconds from start.
-        mock_time.return_value = current_time + 10
+        mock_time.return_value = MOCK_START_TIME + 10
         mon_sess.run(self.train_op)  # Saved.
         self.assertEqual(6,
                          checkpoint_utils.load_variable(self.model_dir,
@@ -609,11 +621,8 @@ class CheckpointSaverHookTest(test.TestCase):
 
   @test.mock.patch.object(time, 'time')
   def test_save_secs_calls_listeners_periodically(self, mock_time):
-    # Let's have a realistic start time
-    current_time = 1484695987.209386
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       listener = MockCheckpointSaverListener()
       hook = basic_session_run_hooks.CheckpointSaverHook(
           self.model_dir,
@@ -626,28 +635,28 @@ class CheckpointSaverHookTest(test.TestCase):
         sess.run(self.scaffold.init_op)
         mon_sess = monitored_session._HookedSession(sess, [hook])
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 0.5
+        mock_time.return_value = MOCK_START_TIME + 0.5
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 3.0
+        mock_time.return_value = MOCK_START_TIME + 3.0
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 3.5
+        mock_time.return_value = MOCK_START_TIME + 3.5
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 4.0
+        mock_time.return_value = MOCK_START_TIME + 4.0
         mon_sess.run(self.train_op)
 
-        mock_time.return_value = current_time + 6.5
+        mock_time.return_value = MOCK_START_TIME + 6.5
         mon_sess.run(self.train_op)  # hook runs here
 
-        mock_time.return_value = current_time + 7.0
+        mock_time.return_value = MOCK_START_TIME + 7.0
         mon_sess.run(self.train_op)  # hook won't run here, so it does at end
 
-        mock_time.return_value = current_time + 7.5
+        mock_time.return_value = MOCK_START_TIME + 7.5
         hook.end(sess)  # hook runs here
       self.assertEqual({
           'begin': 1,
@@ -913,7 +922,9 @@ class StepCounterHookTest(test.TestCase):
   def tearDown(self):
     shutil.rmtree(self.log_dir, ignore_errors=True)
 
-  def test_step_counter_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_step_counter_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(1)
@@ -921,11 +932,11 @@ class StepCounterHookTest(test.TestCase):
       hook = basic_session_run_hooks.StepCounterHook(
           summary_writer=summary_writer, every_n_steps=10)
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       with test.mock.patch.object(tf_logging, 'warning') as mock_log:
         for _ in range(30):
-          time.sleep(0.01)
+          mock_time.return_value += 0.01
           mon_sess.run(train_op)
         # logging.warning should not be called.
         self.assertIsNone(mock_log.call_args)
@@ -941,7 +952,9 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_step_counter_every_n_secs(self):
+  @test.mock.patch.object(time, 'time')
+  def test_step_counter_every_n_secs(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(1)
@@ -950,12 +963,12 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=None, every_n_secs=0.1)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
-      time.sleep(0.2)
+      mock_time.return_value += 0.2
       mon_sess.run(train_op)
-      time.sleep(0.2)
+      mock_time.return_value += 0.2
       mon_sess.run(train_op)
       hook.end(sess)
 
@@ -987,7 +1000,7 @@ class StepCounterHookTest(test.TestCase):
           summary_writer=summary_writer, every_n_steps=1, every_n_secs=None)
 
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       mon_sess.run(train_op)
       mon_sess.run(train_op)
@@ -1007,7 +1020,7 @@ class StepCounterHookTest(test.TestCase):
     with ops.Graph().as_default(), session_lib.Session() as sess:
       variables.get_or_create_global_step()
       train_op = training_util._increment_global_step(0)  # keep same.
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       hook = basic_session_run_hooks.StepCounterHook(
           every_n_steps=1, every_n_secs=None)
       hook.begin()
@@ -1034,16 +1047,18 @@ class StepCounterHookTest(test.TestCase):
         summary_writer=self.summary_writer, every_n_steps=every_n_steps)
     self.hook._set_steps_per_run(steps_per_run)
     self.hook.begin()
-    sess.run(variables_lib.global_variables_initializer())
+    self.evaluate(variables_lib.global_variables_initializer())
     self.mon_sess = monitored_session._HookedSession(sess, [self.hook])
 
-  def test_steps_per_run_less_than_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_less_than_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(10, 5, g, sess)
 
       # Logs at 15, 25
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1058,13 +1073,15 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_steps_per_run_equal_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_equal_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(5, 5, g, sess)
 
       # Logs at 10, 15, 20, 25
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1080,13 +1097,15 @@ class StepCounterHookTest(test.TestCase):
         self.assertEqual('global_step/sec', summary_value.tag)
         self.assertGreater(summary_value.simple_value, 0)
 
-  def test_steps_per_run_greater_than_every_n_steps(self):
+  @test.mock.patch.object(time, 'time')
+  def test_steps_per_run_greater_than_every_n_steps(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     with ops.Graph().as_default() as g, session_lib.Session() as sess:
       self._setup_steps_per_run_test(5, 10, g, sess)
 
       # Logs at 20, 30, 40, 50
       for _ in range(5):
-        time.sleep(0.01)
+        mock_time.return_value += 0.01
         self.mon_sess.run(self.train_op)
 
       self.hook.end(sess)
@@ -1103,6 +1122,7 @@ class StepCounterHookTest(test.TestCase):
         self.assertGreater(summary_value.simple_value, 0)
 
 
+@test_util.run_v1_only('b/120545219')
 class SummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1147,7 +1167,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
@@ -1179,7 +1199,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(10):
         mon_sess.run(self.train_op)
@@ -1199,7 +1219,9 @@ class SummarySaverHookTest(test.TestCase):
             },
         })
 
-  def test_save_secs_saving_once_every_step(self):
+  @test.mock.patch.object(time, 'time')
+  def test_save_secs_saving_once_every_step(self, mock_time):
+    mock_time.return_value = MOCK_START_TIME
     hook = basic_session_run_hooks.SummarySaverHook(
         save_secs=0.5,
         summary_writer=self.summary_writer,
@@ -1207,11 +1229,11 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(4):
         mon_sess.run(self.train_op)
-        time.sleep(0.5)
+        mock_time.return_value += 0.5
       hook.end(sess)
 
     self.summary_writer.assert_summaries(
@@ -1242,7 +1264,7 @@ class SummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(8):
         mon_sess.run(self.train_op)
@@ -1279,27 +1301,43 @@ class GlobalStepWaiterHookTest(test.TestCase):
             session_run_hook.SessionRunContext(
                 original_args=None, session=sess))
 
-  def test_wait_for_step(self):
+  @test.mock.patch.object(time, 'sleep')
+  def test_wait_for_step(self, mock_sleep):
     with ops.Graph().as_default():
       gstep = variables.get_or_create_global_step()
       hook = basic_session_run_hooks.GlobalStepWaiterHook(wait_until_step=1000)
       hook.begin()
+
       with session_lib.Session() as sess:
-        sess.run(variables_lib.global_variables_initializer())
-        waiter = threading.Thread(
-            target=hook.before_run,
-            args=(session_run_hook.SessionRunContext(
-                original_args=None, session=sess),))
-        waiter.daemon = True
-        waiter.start()
-        time.sleep(1.0)
-        self.assertTrue(waiter.is_alive())
-        sess.run(state_ops.assign(gstep, 500))
-        time.sleep(1.0)
-        self.assertTrue(waiter.is_alive())
-        sess.run(state_ops.assign(gstep, 1100))
-        time.sleep(1.2)
-        self.assertFalse(waiter.is_alive())
+        # Mock out calls to time.sleep() to update the global step.
+
+        class Context(object):
+          counter = 0
+
+        def mock_sleep_side_effect(seconds):
+          del seconds  # argument is ignored
+          Context.counter += 1
+          if Context.counter == 1:
+            # The first time sleep() is called, we update the global_step from
+            # 0 to 500.
+            sess.run(state_ops.assign(gstep, 500))
+          elif Context.counter == 2:
+            # The second time sleep() is called, we update the global_step from
+            # 500 to 1100.
+            sess.run(state_ops.assign(gstep, 1100))
+          else:
+            raise AssertionError(
+                'Expected before_run() to terminate after the second call to '
+                'time.sleep()')
+
+        mock_sleep.side_effect = mock_sleep_side_effect
+
+        # Run the mocked-out interaction with the hook.
+        self.evaluate(variables_lib.global_variables_initializer())
+        run_context = session_run_hook.SessionRunContext(
+            original_args=None, session=sess)
+        hook.before_run(run_context)
+        self.assertEqual(Context.counter, 2)
 
 
 class FinalOpsHookTest(test.TestCase):
@@ -1333,7 +1371,7 @@ class FinalOpsHookTest(test.TestCase):
   def test_final_ops_triggers_out_of_range_error(self):
     with ops.Graph().as_default():
       dataset = dataset_ops.Dataset.range(1)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       read_ops = iterator.get_next()
       final_ops = read_ops
 
@@ -1366,6 +1404,7 @@ class FinalOpsHookTest(test.TestCase):
                              hook.final_ops_values.tolist())
 
 
+@test_util.run_v1_only('b/120545219')
 class ResourceSummarySaverHookTest(test.TestCase):
 
   def setUp(self):
@@ -1390,7 +1429,7 @@ class ResourceSummarySaverHookTest(test.TestCase):
 
     with self.cached_session() as sess:
       hook.begin()
-      sess.run(variables_lib.global_variables_initializer())
+      self.evaluate(variables_lib.global_variables_initializer())
       mon_sess = monitored_session._HookedSession(sess, [hook])
       for _ in range(30):
         mon_sess.run(self.train_op)
@@ -1446,10 +1485,12 @@ class ProfilerHookTest(test.TestCase):
   def _count_timeline_files(self):
     return len(gfile.Glob(self.filepattern))
 
+  @test_util.run_deprecated_v1
   def test_raise_in_both_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.ProfilerHook(save_secs=10, save_steps=20)
 
+  @test_util.run_deprecated_v1
   def test_raise_in_none_secs_and_steps(self):
     with self.assertRaises(ValueError):
       basic_session_run_hooks.ProfilerHook(save_secs=None, save_steps=None)
@@ -1465,29 +1506,27 @@ class ProfilerHookTest(test.TestCase):
   @test.mock.patch.object(time, 'time')
   def test_save_secs_saves_periodically(self, mock_time):
     # Pick a fixed start time.
-    current_time = 1484863632.
-
     with self.graph.as_default():
-      mock_time.return_value = current_time
+      mock_time.return_value = MOCK_START_TIME
       hook = basic_session_run_hooks.ProfilerHook(
           save_secs=2, output_dir=self.output_dir)
       with monitored_session.SingularMonitoredSession(hooks=[hook]) as sess:
         sess.run(self.train_op)  # Not saved.
         self.assertEqual(0, self._count_timeline_files())
         # Simulate 2.5 seconds of sleep.
-        mock_time.return_value = current_time + 2.5
+        mock_time.return_value = MOCK_START_TIME + 2.5
         sess.run(self.train_op)  # Saved.
         self.assertEqual(1, self._count_timeline_files())
 
         # Pretend some small amount of time has passed.
-        mock_time.return_value = current_time + 2.6
+        mock_time.return_value = MOCK_START_TIME + 2.6
         sess.run(self.train_op)  # Not saved.
         # Edge test just before we should save the timeline.
-        mock_time.return_value = current_time + 4.4
+        mock_time.return_value = MOCK_START_TIME + 4.4
         sess.run(self.train_op)  # Not saved.
         self.assertEqual(1, self._count_timeline_files())
 
-        mock_time.return_value = current_time + 4.5
+        mock_time.return_value = MOCK_START_TIME + 4.5
         sess.run(self.train_op)  # Saved.
         self.assertEqual(2, self._count_timeline_files())
 
diff --git a/tensorflow/python/training/checkpoint_management_test.py b/tensorflow/python/training/checkpoint_management_test.py
index 3a061bcb35c1c1a6ef31645c8e0ef892e9d9aa62..8606ec4a206ffbce85cf4071934deeb5a545b055 100644
--- a/tensorflow/python/training/checkpoint_management_test.py
+++ b/tensorflow/python/training/checkpoint_management_test.py
@@ -62,6 +62,7 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
     finally:
       shutil.rmtree(tempdir)
 
+  @test_util.run_deprecated_v1
   def testNameCollision(self):
     # Make sure we have a clean directory to work in.
     with self.tempDir() as tempdir:
@@ -99,6 +100,7 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           self.assertIsNotNone(
               checkpoint_management.latest_checkpoint(traindir))
 
+  @test_util.run_deprecated_v1
   def testRelativePath(self):
     # Make sure we have a clean directory to work in.
     with self.tempDir() as tempdir:
@@ -123,9 +125,9 @@ class LatestCheckpointWithRelativePaths(test.TestCase):
           # Record a short training history.
           variables.global_variables_initializer().run()
           save.save(sess, filepath, global_step=0)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=1)
-          inc.eval()
+          self.evaluate(inc)
           save.save(sess, filepath, global_step=2)
 
         with self.cached_session() as sess:
@@ -270,6 +272,7 @@ class SaverUtilsTest(test.TestCase):
   def tearDown(self):
     gfile.DeleteRecursively(self._base_dir)
 
+  @test_util.run_deprecated_v1
   def testCheckpointExists(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
@@ -288,6 +291,7 @@ class SaverUtilsTest(test.TestCase):
           ckpt_prefix = checkpoint_management.latest_checkpoint(self._base_dir)
           self.assertTrue(checkpoint_management.checkpoint_exists(ckpt_prefix))
 
+  @test_util.run_deprecated_v1
   def testGetCheckpointMtimes(self):
     prefixes = []
     for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
@@ -302,6 +306,7 @@ class SaverUtilsTest(test.TestCase):
     self.assertEqual(2, len(mtimes))
     self.assertTrue(mtimes[1] >= mtimes[0])
 
+  @test_util.run_deprecated_v1
   def testRemoveCheckpoint(self):
     for sharded in (False, True):
       for version in (saver_pb2.SaverDef.V2, saver_pb2.SaverDef.V1):
diff --git a/tensorflow/python/training/checkpoint_ops_test.py b/tensorflow/python/training/checkpoint_ops_test.py
index dde84314977f6ffc8c93e90f6ad76e13c2f02cb0..c48154713929b91050e070051add9fee7c428805 100644
--- a/tensorflow/python/training/checkpoint_ops_test.py
+++ b/tensorflow/python/training/checkpoint_ops_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import partitioned_variables
@@ -33,6 +34,7 @@ from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import saver as saver_lib
 
 
+@test_util.run_v1_only('b/120545219')
 class LoadAndRemapWrappersTest(test.TestCase):
   """Tests for the functionality of the Python wrappers."""
 
@@ -47,7 +49,7 @@ class LoadAndRemapWrappersTest(test.TestCase):
       with variable_scope.variable_scope('some_scope'):
         variable_scope.get_variable(name='embeddings', shape=[5, 16],
                                     initializer=initializer)
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_lib.Saver()
       saver.save(sess, checkpoint_prefix, global_step=5)
     self.checkpoint_file = '{}-5'.format(checkpoint_prefix)
@@ -115,7 +117,8 @@ class LoadAndRemapWrappersTest(test.TestCase):
         axis=1)
 
     with self.cached_session():
-      self.assertAllClose(expected_remapped_matrix, remapped_matrix.eval())
+      self.assertAllClose(expected_remapped_matrix,
+                          self.evaluate(remapped_matrix))
 
   def test_load_and_remap_output_layer_weight_initializer_linear(self):
     """Tests for the output layer initializer in the linear multi-class case."""
diff --git a/tensorflow/python/training/checkpoint_utils.py b/tensorflow/python/training/checkpoint_utils.py
index 857da431db2cb94338291d3af1beb9afdbd94b06..74b46179e75423b530191cce5a52034879712eaa 100644
--- a/tensorflow/python/training/checkpoint_utils.py
+++ b/tensorflow/python/training/checkpoint_utils.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 import six
 
 from tensorflow.python import pywrap_tensorflow
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -29,8 +30,7 @@ from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -101,7 +101,7 @@ def list_variables(ckpt_dir_or_file):
   return result
 
 
-@tf_export("train.init_from_checkpoint")
+@tf_export(v1=["train.init_from_checkpoint"])
 def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
   """Replaces `tf.Variable` initializers so they load from a checkpoint file.
 
@@ -187,7 +187,7 @@ def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
     _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
   else:
     distribution_strategy_context.get_replica_context().merge_call(
-        _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
+        _init_from_checkpoint, args=(ckpt_dir_or_file, assignment_map))
 
 
 def _init_from_checkpoint(_, ckpt_dir_or_file, assignment_map):
@@ -311,10 +311,10 @@ def _set_checkpoint_initializer(variable,
     restore_op = io_ops.restore_v2(
         ckpt_file, [tensor_name], [slice_spec], [base_type], name=name)[0]
 
-    names_to_saveables = saver.BaseSaverBuilder.OpListToDict([variable])
+    names_to_saveables = saveable_object_util.op_list_to_dict([variable])
     saveable_objects = []
     for name, op in names_to_saveables.items():
-      for s in saver.BaseSaverBuilder.SaveableObjectsForOp(op, name):
+      for s in saveable_object_util.saveable_objects_for_op(op, name):
         saveable_objects.append(s)
 
     assert len(saveable_objects) == 1  # Should be only one variable.
diff --git a/tensorflow/python/training/checkpointable/BUILD b/tensorflow/python/training/checkpointable/BUILD
index d26932c1aae7831f8e266d04777db53baa13330f..26a0ac35b763e4b8a2c9143d88a2a97259715262 100644
--- a/tensorflow/python/training/checkpointable/BUILD
+++ b/tensorflow/python/training/checkpointable/BUILD
@@ -25,9 +25,9 @@ py_library(
         "//tensorflow/python:framework_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:platform",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:util",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/training/saving:saveable_object",
     ],
 )
 
@@ -114,7 +114,6 @@ py_library(
         "//tensorflow/python:init_ops",
         "//tensorflow/python:io_ops_gen",
         "//tensorflow/python:pywrap_tensorflow",
-        "//tensorflow/python:saveable_object",
         "//tensorflow/python:saver",
         "//tensorflow/python:session",
         "//tensorflow/python:tensor_shape",
@@ -122,6 +121,10 @@ py_library(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python:variables",
         "//tensorflow/python/eager:context",
+        "//tensorflow/python/eager:def_function",
+        "//tensorflow/python/training/saving:functional_saver",
+        "//tensorflow/python/training/saving:saveable_object",
+        "//tensorflow/python/training/saving:saveable_object_util",
     ],
 )
 
@@ -152,7 +155,7 @@ py_test(
         "//tensorflow/python:variable_scope",
         "//tensorflow/python/eager:backprop",
         "//tensorflow/python/eager:context",
-        "//tensorflow/python/eager:function",
+        "//tensorflow/python/eager:def_function",
         "//tensorflow/python/eager:test",
         "//tensorflow/python/keras:engine",
         "//tensorflow/python/keras:layers",
diff --git a/tensorflow/python/training/checkpointable/base.py b/tensorflow/python/training/checkpointable/base.py
index 095a90ddd4f831e5af63f8eb7e231eacb5a91975..3cd1c6f9c8b0b5b5acf517e5f5801db66d0045b2 100644
--- a/tensorflow/python/training/checkpointable/base.py
+++ b/tensorflow/python/training/checkpointable/base.py
@@ -25,7 +25,6 @@ import weakref
 
 import six
 
-from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -34,7 +33,7 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.platform import tf_logging as logging
-from tensorflow.python.training import saveable_object
+from tensorflow.python.training.saving import saveable_object
 from tensorflow.python.util import nest
 from tensorflow.python.util import serialization
 from tensorflow.python.util import tf_decorator
@@ -374,41 +373,10 @@ class _CheckpointPosition(object):
       eagerly.
     """
     (restore_ops,
-     named_saveables,
+     tensor_saveables,
      python_saveables) = self._gather_ops_or_named_saveables()
-
-    # Eagerly run restorations for Python state.
-    reader = pywrap_tensorflow.NewCheckpointReader(
-        self._checkpoint.save_path_string)
-    for saveable in python_saveables:
-      spec_names = [spec.name for spec in saveable.specs]
-      saveable.python_restore(
-          [reader.get_tensor(name) for name in spec_names])
-
-    # If we have new SaveableObjects, extract and cache restore ops.
-    if named_saveables:
-      validated_saveables = (
-          self._checkpoint.builder._ValidateAndSliceInputs(named_saveables))  # pylint: disable=protected-access
-      validated_names = set(saveable.name for saveable in validated_saveables)
-      if set(named_saveables.keys()) != validated_names:
-        raise AssertionError(
-            ("Saveable keys changed when validating. Got back %s, was "
-             "expecting %s") % (named_saveables.keys(), validated_names))
-      all_tensors = self._checkpoint.builder.bulk_restore(
-          filename_tensor=self._checkpoint.save_path_tensor,
-          saveables=validated_saveables, preferred_shard=-1,
-          restore_sequentially=False)
-      saveable_index = 0
-      for saveable in validated_saveables:
-        num_specs = len(saveable.specs)
-        saveable_tensors = all_tensors[
-            saveable_index:saveable_index + num_specs]
-        saveable_index += num_specs
-        restore_op = saveable.restore(saveable_tensors, restored_shapes=None)
-        if not context.executing_eagerly():
-          assert saveable.name not in self._checkpoint.restore_ops_by_name
-          self._checkpoint.restore_ops_by_name[saveable.name] = restore_op
-          restore_ops.append(restore_op)
+    restore_ops.extend(self._checkpoint.restore_saveables(
+        tensor_saveables, python_saveables))
     return restore_ops
 
   @property
diff --git a/tensorflow/python/training/checkpointable/data_structures.py b/tensorflow/python/training/checkpointable/data_structures.py
index c29e5db0753c0a2d96ebb0ed43e4e78aac629526..817552f32696e34d123d1da5057388c1bd96139c 100644
--- a/tensorflow/python/training/checkpointable/data_structures.py
+++ b/tensorflow/python/training/checkpointable/data_structures.py
@@ -111,9 +111,6 @@ class CheckpointableDataStructure(base.CheckpointableBase):
   """Base class for data structures which contain checkpointable objects."""
 
   def __init__(self):
-    # An append-only ordered set
-    self._layers = []
-
     self.trainable = True
     self._extra_variables = []
 
@@ -128,21 +125,30 @@ class CheckpointableDataStructure(base.CheckpointableBase):
           ("Only checkpointable objects (such as Layers or Optimizers) may be "
            "stored in a List object. Got %s, which does not inherit from "
            "CheckpointableBase.") % (value,))
-    if (isinstance(value, CheckpointableDataStructure)
-        or layer_utils.is_layer(value)
-        or layer_utils.has_weights(value)):
-      # Check for object-identity rather than with __eq__ to avoid
-      # de-duplicating empty container types. Automatically generated list
-      # wrappers keep things like "[] == []" true, which means "[] in [[]]" is
-      # also true. This becomes not true once one of the lists is mutated.
-      if not any((layer is value for layer in self._layers)):
-        self._layers.append(value)
-        if hasattr(value, "_use_resource_variables"):
-          # In subclassed models, legacy layers (tf.layers) must always use
-          # resource variables.
-          value._use_resource_variables = True  # pylint: disable=protected-access
+    if hasattr(value, "_use_resource_variables"):
+      # In subclassed models, legacy layers (tf.layers) must always use
+      # resource variables.
+      value._use_resource_variables = True  # pylint: disable=protected-access
     return value
 
+  @property
+  def _values(self):
+    """An iterable/sequence which may contain checkpointable objects."""
+    raise NotImplementedError("Abstract method")
+
+  @property
+  def _layers(self):
+    """All Layers and Layer containers, including empty containers."""
+    # Filter objects on demand so that wrapper objects use values from the thing
+    # they're wrapping if out of sync.
+    collected = []
+    for obj in self._values:
+      if (isinstance(obj, CheckpointableDataStructure)
+          or layer_utils.is_layer(obj)
+          or layer_utils.has_weights(obj)):
+        collected.append(obj)
+    return collected
+
   @property
   def layers(self):
     return layer_utils.filter_empty_layer_containers(self._layers)
@@ -265,6 +271,10 @@ class List(CheckpointableDataStructure, collections.Sequence):
   def _name_element(self, index):
     return "%d" % (index,)
 
+  @property
+  def _values(self):
+    return self
+
   def append(self, value):
     """Add a new checkpointable value."""
     value = self._track_value(value, self._name_element(len(self._storage)))
@@ -479,6 +489,14 @@ class Mapping(CheckpointableDataStructure, collections.Mapping):
   def _make_storage(self, *args, **kwargs):
     return dict(*args, **kwargs)
 
+  @property
+  def _values(self):
+    # Sort items deterministically by key
+    ordered = list(zip(*sorted(self.items(), key=lambda it: it[0])))
+    if ordered:
+      return ordered[1]
+    return []
+
   def _name_element(self, key):
     if not isinstance(key, six.string_types):
       raise TypeError(
diff --git a/tensorflow/python/training/checkpointable/data_structures_test.py b/tensorflow/python/training/checkpointable/data_structures_test.py
index ff7d1f1d2d7dfcbeab5b85d90a514088dc832b9b..bcec6e01001eec6c164cf4bb17db3d4ed55b0935 100644
--- a/tensorflow/python/training/checkpointable/data_structures_test.py
+++ b/tensorflow/python/training/checkpointable/data_structures_test.py
@@ -73,6 +73,7 @@ class HasList(training.Model):
 class ListTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasList()
     output = model(array_ops.ones([32, 2]))
@@ -105,6 +106,7 @@ class ListTests(test.TestCase):
     self.assertIn(v, model.trainable_variables)
     self.assertNotIn(v, model.non_trainable_variables)
 
+  @test_util.run_v1_only("b/120545219")
   def testUpdatesForwarded(self):
     with context.graph_mode():
       model = HasList()
@@ -121,6 +123,7 @@ class ListTests(test.TestCase):
       self.assertEqual(0, len(model.updates))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testLossesForwarded(self):
     model = HasList()
     model_input = array_ops.ones([32, 2])
@@ -253,6 +256,13 @@ class ListTests(test.TestCase):
     l.append(1)
     self.assertEqual([1], l_wrapper)
 
+  def testLayerCollectionWithExternalMutation(self):
+    l = []
+    l_wrapper = data_structures._ListWrapper(l)
+    layer = core.Dense(1)
+    l.append(layer)
+    self.assertEqual([layer], l_wrapper.layers)
+
   def testHashing(self):
     has_sequences = set([data_structures.List(),
                          data_structures.List()])
@@ -288,6 +298,7 @@ class HasMapping(training.Model):
 class MappingTests(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testTracking(self):
     model = HasMapping()
     output = model(array_ops.ones([32, 2]))
@@ -324,6 +335,20 @@ class MappingTests(test.TestCase):
     with self.assertRaises(TypeError):
       mapping[1] = data_structures.List()
 
+  def testLayerCollectionWithExternalMutation(self):
+    d = {}
+    root = tracking.Checkpointable()
+    root.wrapper = d
+    self.assertEqual([], root.wrapper.layers)
+    self.assertEqual([], root.wrapper.trainable_weights)
+    layer1 = core.Dense(1)
+    layer2 = core.Dense(1)
+    d["a"] = layer1
+    d["b"] = layer2
+    self.assertEqual([layer1, layer2], root.wrapper.layers)
+    # The layers have still not created variables
+    self.assertEqual([], root.wrapper.trainable_weights)
+
   def testHashing(self):
     has_mappings = set([data_structures.Mapping(),
                         data_structures.Mapping()])
diff --git a/tensorflow/python/training/checkpointable/tracking.py b/tensorflow/python/training/checkpointable/tracking.py
index c85b208d47985553ced692ccf0ef1627f9428a89..4e96aee0c51d441c4a32ce68943e27dbf592349c 100644
--- a/tensorflow/python/training/checkpointable/tracking.py
+++ b/tensorflow/python/training/checkpointable/tracking.py
@@ -17,6 +17,10 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.eager import context
+from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.util import tf_contextlib
@@ -145,3 +149,36 @@ class TrackableResource(base.CheckpointableBase):
     if self._resource_handle is None:
       self._resource_handle = self.create_resource()
     return self._resource_handle
+
+
+class TrackableAsset(base.CheckpointableBase):
+  """Base class for asset files which need to be tracked."""
+
+  def __init__(self, path):
+    """Record the full path to the asset."""
+    # We use a variable here so that @tf.functions do not capture a literal
+    # value. The init_scope prevents functions from capturing `path` in an
+    # initialization graph, since it is transient and should not end up in a
+    # serialized function body. When serialized in a SavedModel, the variable
+    # will be set during the loading process to its location in the assets/
+    # directory.
+    with ops.init_scope():
+      if context.executing_eagerly():
+        self._path = self._no_dependency(
+            resource_variable_ops.ResourceVariable(
+                path, dtype=dtypes.string,
+                name="asset_path"))
+      else:
+        # Adding a variable is too disruptive when v1-style graph building,
+        # since things may get fed and local variable initializers would then
+        # need to be run.
+        self._path = path
+
+  @property
+  def asset_path(self):
+    """Fetch the current asset path."""
+    return self._path
+
+ops.register_tensor_conversion_function(
+    TrackableAsset,
+    lambda asset, **kw: ops.internal_convert_to_tensor(asset.asset_path, **kw))
diff --git a/tensorflow/python/training/checkpointable/util.py b/tensorflow/python/training/checkpointable/util.py
index f45f7445f137058b7d78ad7a9c3e2e6a1cd008d7..a54f41a54fa1364af417a85e7faa9ee0693fada1 100644
--- a/tensorflow/python/training/checkpointable/util.py
+++ b/tensorflow/python/training/checkpointable/util.py
@@ -26,11 +26,13 @@ from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import context
+from tensorflow.python.eager import def_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
 from tensorflow.python.framework import tensor_shape
+from tensorflow.python.lib.io import file_io
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import gen_io_ops as io_ops
 from tensorflow.python.ops import init_ops
@@ -38,11 +40,14 @@ from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.training import checkpoint_management
 from tensorflow.python.training import optimizer as optimizer_lib
-from tensorflow.python.training import saveable_object as saveable_object_lib
-from tensorflow.python.training import saver as saver_lib
+from tensorflow.python.training import saver as v1_saver_lib
 from tensorflow.python.training.checkpointable import base
 from tensorflow.python.training.checkpointable import data_structures
 from tensorflow.python.training.checkpointable import tracking
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object as saveable_object_lib
+from tensorflow.python.training.saving import saveable_object_util
+from tensorflow.python.util import compat
 from tensorflow.python.util import deprecation
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util.tf_export import tf_export
@@ -87,7 +92,6 @@ class _CheckpointRestoreCoordinator(object):
         referenced every restore (e.g. for Python state); otherwise they would
         create their own ops every restore.
     """
-    self.builder = saver_lib.BulkSaverBuilder()
     self.object_graph_proto = object_graph_proto
     self.restore_uid = ops.uid()
     # Maps from objects to lists of attributes which were in the checkpoint but
@@ -142,6 +146,57 @@ class _CheckpointRestoreCoordinator(object):
     if self.new_restore_ops_callback:
       self.new_restore_ops_callback(new_ops)  # pylint: disable=not-callable
 
+  def restore_saveables(self, tensor_saveables, python_saveables):
+    """Run or build restore operations for SaveableObjects.
+
+    Args:
+      tensor_saveables: `SaveableObject`s which correspond to Tensors.
+      python_saveables: `PythonStateSaveable`s which correspond to Python
+        values.
+
+    Returns:
+      When graph building, a list of restore operations, either cached or newly
+      created, to restore `tensor_saveables`.
+    """
+    restore_ops = []
+    # Eagerly run restorations for Python state.
+    reader = pywrap_tensorflow.NewCheckpointReader(
+        self.save_path_string)
+    for saveable in python_saveables:
+      spec_names = [spec.name for spec in saveable.specs]
+      saveable.python_restore(
+          [reader.get_tensor(name) for name in spec_names])
+
+    # If we have new SaveableObjects, extract and cache restore ops.
+    if tensor_saveables:
+      validated_saveables = saveable_object_util.validate_and_slice_inputs(
+          tensor_saveables)
+      validated_names = set(saveable.name for saveable in validated_saveables)
+      if set(tensor_saveables.keys()) != validated_names:
+        raise AssertionError(
+            ("Saveable keys changed when validating. Got back %s, was "
+             "expecting %s") % (tensor_saveables.keys(), validated_names))
+      for saveable in validated_saveables:
+        if saveable.device:
+          device = saveable_object_util.set_cpu0(saveable.device)
+        else:
+          device = None
+        with ops.device(device):
+          tensors = []
+          for spec in saveable.specs:
+            tensors.append(
+                io_ops.restore_v2(
+                    self.save_path_tensor,
+                    [spec.name],
+                    [spec.slice_spec],
+                    [spec.dtype])[0])
+          restore_op = saveable.restore(tensors, restored_shapes=None)
+        if not context.executing_eagerly():
+          assert saveable.name not in self.restore_ops_by_name
+          self.restore_ops_by_name[saveable.name] = restore_op
+          restore_ops.append(restore_op)
+    return restore_ops
+
 
 class _NameBasedRestoreCoordinator(object):
   """Keeps the status of a name-based checkpoint restore."""
@@ -181,11 +236,11 @@ class _NameBasedRestoreCoordinator(object):
           continue
       else:
         saveable = saveable_factory
-      names_to_saveables = saver_lib.BaseSaverBuilder.OpListToDict(
+      names_to_saveables = saveable_object_util.op_list_to_dict(
           [saveable],
           convert_variable_to_tensor=False)
       for name, op in names_to_saveables.items():
-        for saveable_object in saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+        for saveable_object in saveable_object_util.saveable_objects_for_op(
             op=op, name=name):
           yield saveable_object
 
@@ -549,13 +604,11 @@ def _serialize_slot_variables(checkpointable_objects, node_ids, object_names):
   return slot_variables
 
 
-def _serialize_checkpointables(
-    checkpointable_objects, node_ids, object_names, slot_variables,
+def _add_attributes_to_object_graph(
+    checkpointable_objects, object_graph_proto, node_ids, object_names,
     saveables_cache, object_map):
-  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
-  object_graph_proto = (
-      checkpointable_object_graph_pb2.CheckpointableObjectGraph())
-  named_saveables = []
+  """Create SaveableObjects and corresponding SerializedTensor protos."""
+  named_saveable_objects = []
   if saveables_cache is None:
     # No SaveableObject caching. Either we're executing eagerly, or building a
     # static save which is specialized to the current Python state.
@@ -564,10 +617,9 @@ def _serialize_checkpointables(
     # If we are caching SaveableObjects, we need to build up a feed_dict with
     # functions computing volatile Python state to be saved with the checkpoint.
     feed_additions = {}
-  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+  for checkpoint_id, (checkpointable, object_proto) in enumerate(
+      zip(checkpointable_objects, object_graph_proto.nodes)):
     assert node_ids[checkpointable] == checkpoint_id
-    object_proto = object_graph_proto.nodes.add()
-    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     object_name = object_names[checkpointable]
     if object_map:
       object_to_save = object_map.get(checkpointable, checkpointable)
@@ -607,10 +659,10 @@ def _serialize_checkpointables(
           # Figure out the name-based Saver's name for this variable. If it's
           # already a SaveableObject we'd just get the checkpoint key back, so
           # we leave full_name blank.
-          saver_dict = saver_lib.BaseSaverBuilder.OpListToDict(
+          saver_dict = saveable_object_util.op_list_to_dict(
               [maybe_saveable], convert_variable_to_tensor=False)
           full_name, = saver_dict.keys()
-          saveables = tuple(saver_lib.BaseSaverBuilder.SaveableObjectsForOp(
+          saveables = tuple(saveable_object_util.saveable_objects_for_op(
               op=maybe_saveable, name=attribute.checkpoint_key))
           for saveable in saveables:
             saveable.full_name = full_name
@@ -645,14 +697,28 @@ def _serialize_checkpointables(
                      "value.")
                     % (checkpointable, new_feed_key))
             feed_additions.update(saveable_feed_dict)
-        named_saveables.append(saveable)
+        named_saveable_objects.append(saveable)
+
+  return named_saveable_objects, feed_additions
+
 
+def fill_object_graph_proto(checkpointable_objects,
+                            node_ids,
+                            slot_variables,
+                            object_graph_proto=None):
+  """Name non-slot `Checkpointable`s and add them to `object_graph_proto`."""
+  if object_graph_proto is None:
+    object_graph_proto = (
+        checkpointable_object_graph_pb2.CheckpointableObjectGraph())
+  for checkpoint_id, checkpointable in enumerate(checkpointable_objects):
+    assert node_ids[checkpointable] == checkpoint_id
+    object_proto = object_graph_proto.nodes.add()
+    object_proto.slot_variables.extend(slot_variables.get(checkpointable, ()))
     for child in checkpointable._checkpoint_dependencies:  # pylint: disable=protected-access
       child_proto = object_proto.children.add()
       child_proto.node_id = node_ids[child.ref]
       child_proto.local_name = child.name
-
-  return named_saveables, object_graph_proto, feed_additions
+  return object_graph_proto
 
 
 def _serialize_gathered_objects(
@@ -668,13 +734,18 @@ def _serialize_gathered_objects(
       checkpointable_objects=checkpointable_objects,
       node_ids=node_ids,
       object_names=object_names)
-  return _serialize_checkpointables(
+  object_graph_proto = fill_object_graph_proto(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      slot_variables=slot_variables)
+  named_saveable_objects, feed_additions = _add_attributes_to_object_graph(
       checkpointable_objects=checkpointable_objects,
+      object_graph_proto=object_graph_proto,
       node_ids=node_ids,
       object_names=object_names,
-      slot_variables=slot_variables,
       saveables_cache=saveables_cache,
       object_map=object_map)
+  return named_saveable_objects, object_graph_proto, feed_additions
 
 
 def _serialize_object_graph(root_checkpointable, saveables_cache):
@@ -716,6 +787,23 @@ def named_saveables(root_checkpointable):
   return _serialize_object_graph(root_checkpointable, None)[0]
 
 
+def find_objects(root_checkpointable):
+  """Find and number objects which are dependencies of `root_checkpointable`."""
+  checkpointable_objects, path_to_root = (
+      _breadth_first_checkpointable_traversal(root_checkpointable))
+  object_names = _ObjectIdentityDictionary()
+  for obj, path in path_to_root.items():
+    object_names[obj] = _object_prefix_from_path(path)
+  node_ids = _ObjectIdentityDictionary()
+  for node_id, node in enumerate(checkpointable_objects):
+    node_ids[node] = node_id
+  slot_variables = _serialize_slot_variables(
+      checkpointable_objects=checkpointable_objects,
+      node_ids=node_ids,
+      object_names=object_names)
+  return checkpointable_objects, node_ids, slot_variables
+
+
 def list_objects(root_checkpointable):
   """Traverse the object graph and list all accessible objects.
 
@@ -730,20 +818,7 @@ def list_objects(root_checkpointable):
   Returns:
     A flat list of objects.
   """
-  # TODO(allenl): Extract out gathering logic so the naming logic doesn't have
-  # to run.
-  checkpointable_objects, path_to_root = (
-      _breadth_first_checkpointable_traversal(root_checkpointable))
-  object_names = _ObjectIdentityDictionary()
-  for obj, path in path_to_root.items():
-    object_names[obj] = _object_prefix_from_path(path)
-  node_ids = _ObjectIdentityDictionary()
-  for node_id, node in enumerate(checkpointable_objects):
-    node_ids[node] = node_id
-  _serialize_slot_variables(
-      checkpointable_objects=checkpointable_objects,
-      node_ids=node_ids,
-      object_names=object_names)
+  checkpointable_objects, _, _ = find_objects(root_checkpointable)
   return checkpointable_objects
 
 
@@ -1204,7 +1279,7 @@ class NameBasedSaverStatus(_LoadStatus):
       session = ops.get_default_session()
     with ops.device("/cpu:0"):
       saveables = self._gather_saveable_objects()
-      saver_lib.Saver(saveables).restore(
+      v1_saver_lib.Saver(saveables).restore(
           sess=session, save_path=self._checkpoint.save_path)
 
   def initialize_or_restore(self, session=None):
@@ -1229,18 +1304,6 @@ class _SessionWithFeedDictAdditions(session_lib.SessionInterface):
         fetches=fetches, feed_dict=feed_dict, **kwargs)
 
 
-def _copy_saver_with_new_var_list(old_saver, new_var_list):
-  """Copy a `tf.train.Saver`'s state to a new Saver with different variables."""
-  new_saver = saver_lib.Saver(var_list=new_var_list, max_to_keep=None)
-  # TODO(allenl): Move to copying functionality to Saver?
-  # pylint: disable=protected-access
-  new_saver._last_checkpoints = old_saver._last_checkpoints
-  new_saver._checkpoints_to_be_deleted = old_saver._checkpoints_to_be_deleted
-  new_saver._next_checkpoint_time = old_saver._next_checkpoint_time
-  # pylint: enable=protected-access
-  return new_saver
-
-
 class CheckpointableSaver(object):
   """Saves and restores a `Checkpointable` object and its dependencies.
 
@@ -1279,7 +1342,8 @@ class CheckpointableSaver(object):
     # Op caching for save
     self._object_graph_feed_tensor = None
     self._last_save_object_graph = None
-    self._last_save_saver = None
+    self._file_prefix_feed_tensor = None
+    self._cached_save_operation = None
 
     # Op caching for restore, shared between _CheckpointRestoreCoordinators
     self._restore_op_cache = {}
@@ -1346,13 +1410,16 @@ class CheckpointableSaver(object):
           base.NoRestoreSaveable(
               tensor=object_graph_tensor,
               name=base.OBJECT_GRAPH_PROTO_KEY))
-      # TODO(allenl, haoliang): Swap in a function-based saver here.
-      return saver_lib.Saver(
+      # TODO(allenl): Swap in a function-based saver here once it can serialize
+      # to a SaverDef.
+      return v1_saver_lib.Saver(
           var_list=named_saveable_objects, max_to_keep=None)
 
-  def _prepare_save(self,
-                    object_graph_tensor=None,
-                    saveable_object_cache=None):
+  def _save_cached_when_graph_building(
+      self,
+      file_prefix,
+      object_graph_tensor=None,
+      saveable_object_cache=None):
     """Create or retrieve save ops.
 
     When graph building, `saveable_object_cache` will typically be non-`None`,
@@ -1361,15 +1428,17 @@ class CheckpointableSaver(object):
     unnecessarily re-creating save ops.
 
     Args:
+      file_prefix: The prefix for saved checkpoint files.
       object_graph_tensor: A `Tensor` to which the current object graph will be
         fed.
       saveable_object_cache: A dictionary; if specified, used to cache
         `SaveableObject`s.
 
     Returns:
-      A two-element tuple with a `tf.train.Saver` and a feed_dict of `Tensor`s
-      to feed when running save ops. The feed dict contains the current object
-      graph and any Python state to be saved in the checkpoint.
+      A two-element tuple with a filename tensor and a feed_dict of tensors to
+      feed when running it (if graph building). The feed dict contains the
+      current object graph and any Python state to be saved in the
+      checkpoint. When executing eagerly only the first argument is meaningful.
     """
     (named_saveable_objects, graph_proto,
      feed_additions) = self._gather_saveables(
@@ -1381,15 +1450,11 @@ class CheckpointableSaver(object):
         # constructors. That means the Saver needs to be copied with a new
         # var_list.
         or context.executing_eagerly()):
-      if self._last_save_object_graph is not None:
-        self._last_save_saver = _copy_saver_with_new_var_list(
-            old_saver=self._last_save_saver,
-            new_var_list=named_saveable_objects)
-      else:
-        self._last_save_saver = saver_lib.Saver(
-            var_list=named_saveable_objects, max_to_keep=None)
+      saver = functional_saver.Saver(named_saveable_objects)
+      with ops.device("/cpu:0"):
+        self._cached_save_operation = saver.save(file_prefix)
       self._last_save_object_graph = graph_proto
-    return self._last_save_saver, feed_additions
+    return self._cached_save_operation, feed_additions
 
   def save(self, file_prefix, checkpoint_number=None, session=None):
     """Save a training checkpoint.
@@ -1413,35 +1478,42 @@ class CheckpointableSaver(object):
     Returns:
       The full path to the checkpoint.
     """
-    feed_additions = {}
+    feed_dict = {}
     graph_building = not context.executing_eagerly()
+    if checkpoint_number:
+      file_prefix = "%s-%d" % (file_prefix, checkpoint_number)
     if graph_building:
       if self._object_graph_feed_tensor is None:
         with ops.device("/cpu:0"):
           self._object_graph_feed_tensor = constant_op.constant(
               "", dtype=dtypes.string)
+          self._file_prefix_feed_tensor = constant_op.constant(
+              "", dtype=dtypes.string)
       object_graph_tensor = self._object_graph_feed_tensor
+      file_prefix_tensor = self._file_prefix_feed_tensor
+      feed_dict[file_prefix_tensor] = file_prefix
     else:
+      with ops.device("/cpu:0"):
+        file_prefix_tensor = constant_op.constant(
+            file_prefix, dtype=dtypes.string)
       object_graph_tensor = None
 
-    saver, new_feed_additions = self._prepare_save(
+    file_io.recursive_create_dir(os.path.dirname(file_prefix))
+    save_path, new_feed_additions = self._save_cached_when_graph_building(
+        file_prefix=file_prefix_tensor,
         object_graph_tensor=object_graph_tensor,
         saveable_object_cache=self._saveable_object_cache)
     if new_feed_additions:
-      feed_additions.update(new_feed_additions)
+      feed_dict.update(new_feed_additions)
     if not graph_building:
       session = None
     elif session is None:
       session = ops.get_default_session()
 
-    with ops.device("/cpu:0"):
-      save_path = saver.save(
-          sess=_SessionWithFeedDictAdditions(
-              session=session, feed_additions=feed_additions),
-          save_path=file_prefix,
-          write_meta_graph=False,
-          write_state=False,
-          global_step=checkpoint_number)
+    if session:
+      save_path = session.run(save_path, feed_dict=feed_dict)
+    else:
+      save_path = save_path.numpy()
     return save_path
 
   def restore(self, save_path):
@@ -1684,7 +1756,8 @@ class Checkpoint(tracking.Checkpointable):
     """
     super(Checkpoint, self).__init__()
     for k, v in sorted(kwargs.items(), key=lambda item: item[0]):
-      if not isinstance(v, base.CheckpointableBase):
+      if not isinstance(v, (base.CheckpointableBase,
+                            def_function.PolymorphicFunction)):
         raise ValueError(
             ("`Checkpoint` was expecting a checkpointable object (an object "
              "derived from `CheckpointableBase`), got %s. If you believe this "
@@ -1729,9 +1802,9 @@ class Checkpoint(tracking.Checkpointable):
     Returns:
       The full path to the checkpoint (i.e. `file_prefix`).
     """
-    return self._saver.save(
+    return compat.as_str(self._saver.save(
         file_prefix=file_prefix,
-        session=session)
+        session=session))
 
   @property
   def save_counter(self):
diff --git a/tensorflow/python/training/checkpointable/util_test.py b/tensorflow/python/training/checkpointable/util_test.py
index 19955140123afcf7addfa94550a8352f3acf810f..3bdab4cb0bf990543a18cab885f540b8d1f78ed8 100644
--- a/tensorflow/python/training/checkpointable/util_test.py
+++ b/tensorflow/python/training/checkpointable/util_test.py
@@ -26,7 +26,7 @@ from tensorflow.python import pywrap_tensorflow
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
-from tensorflow.python.eager import function
+from tensorflow.python.eager import def_function
 from tensorflow.python.eager import test
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
@@ -616,6 +616,7 @@ class CheckpointingTests(test.TestCase):
 
   # pylint: disable=cell-var-from-loop
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testWithDefun(self):
     num_training_steps = 2
     checkpoint_directory = self.get_temp_dir()
@@ -632,7 +633,7 @@ class CheckpointingTests(test.TestCase):
             checkpoint_directory)
         status = root.restore(save_path=checkpoint_path)
         def train_fn():
-          @function.defun
+          @def_function.function
           def _call_model(x):
             return model(x)
           with backprop.GradientTape() as tape:
diff --git a/tensorflow/python/training/coordinator.py b/tensorflow/python/training/coordinator.py
index 0ff97d85e37e6167f1200ba56940f4a663c259a2..b7e5c98c78ec91078de03de82ecc70d522e59f2b 100644
--- a/tensorflow/python/training/coordinator.py
+++ b/tensorflow/python/training/coordinator.py
@@ -408,7 +408,7 @@ class Coordinator(object):
 
 
 # Threads for the standard services.
-@tf_export("train.LooperThread")
+@tf_export(v1=["train.LooperThread"])
 class LooperThread(threading.Thread):
   """A thread that runs code repeatedly, optionally on a timer.
 
diff --git a/tensorflow/python/training/device_setter.py b/tensorflow/python/training/device_setter.py
index be80c3657158b52d063b5d2b7731f25d184794a0..5874a1ff4152d835263cdc1ad87002b64c026eb8 100644
--- a/tensorflow/python/training/device_setter.py
+++ b/tensorflow/python/training/device_setter.py
@@ -130,7 +130,7 @@ class _ReplicaDeviceChooser(object):
     return worker_device.to_string()
 
 
-@tf_export("train.replica_device_setter")
+@tf_export(v1=["train.replica_device_setter"])
 def replica_device_setter(ps_tasks=0, ps_device="/job:ps",
                           worker_device="/job:worker", merge_devices=True,
                           cluster=None, ps_ops=None, ps_strategy=None):
diff --git a/tensorflow/python/training/device_setter_test.py b/tensorflow/python/training/device_setter_test.py
index 85b75502ab0943013f12a34002e72b71d187bf68..3cff87b326f1de8243a230bf87d64cc6963026d3 100644
--- a/tensorflow/python/training/device_setter_test.py
+++ b/tensorflow/python/training/device_setter_test.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ class DeviceSetterTest(test.TestCase):
       "worker": ["worker0:2222", "worker1:2222", "worker2:2222"]
   })
 
+  @test_util.run_deprecated_v1
   def testCPUOverride(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -47,12 +49,14 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker/cpu:0", a.device)
 
+  @test_util.run_deprecated_v1
   def testResource(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
       v = resource_variable_ops.ResourceVariable([1, 2])
       self.assertDeviceEqual("/job:ps/task:0", v.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterSpecClass(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -65,6 +69,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksPinVariableToJob(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec)):
@@ -82,6 +87,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", x.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksUseCpuForPS(self):
     with ops.device(
         device_setter.replica_device_setter(ps_tasks=1, ps_device="/cpu:0")):
@@ -95,6 +101,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:moon/cpu:0", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksNoMerging(self):
     with ops.device(
         device_setter.replica_device_setter(
@@ -109,6 +116,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterSpecDict(self):
     with ops.device(
         device_setter.replica_device_setter(cluster=self._cluster_spec.as_dict(
@@ -122,6 +130,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithClusterDef(self):
     with ops.device(
         device_setter.replica_device_setter(
@@ -135,6 +144,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:ps/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:worker", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithDevice(self):
     cluster_spec = server_lib.ClusterSpec({
         "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
@@ -155,6 +165,7 @@ class DeviceSetterTest(test.TestCase):
       self.assertDeviceEqual("/job:moon/task:1", w.initializer.device)
       self.assertDeviceEqual("/job:sun", a.device)
 
+  @test_util.run_deprecated_v1
   def testPS2TasksWithCPUConstraint(self):
     cluster_spec = server_lib.ClusterSpec({
         "sun": ["sun0:2222", "sun1:2222", "sun2:2222"],
diff --git a/tensorflow/python/training/distribute.py b/tensorflow/python/training/distribute.py
index 95104ad5779eafcf28b3fd190c8c3dee3b49e7e4..ad27bc8a7025f060b25c47c5391dd8e473c0e466 100644
--- a/tensorflow/python/training/distribute.py
+++ b/tensorflow/python/training/distribute.py
@@ -12,1234 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Class DistributionStrategy, ReplicaContext, and supporting APIs."""
+"""Deprecated, please use ../distribute/distribute_lib.py."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import threading
-
-from tensorflow.python.data.ops import dataset_ops
-from tensorflow.python.framework import ops
-from tensorflow.python.ops import array_ops
-from tensorflow.python.ops import control_flow_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import variable_scope
-from tensorflow.python.ops.losses import losses_impl
-from tensorflow.python.platform import tf_logging
-from tensorflow.python.training import device_util
-from tensorflow.python.training import distribution_strategy_context
-from tensorflow.python.util import nest
-
-
-# ------------------------------------------------------------------------------
-# Context tracking whether in a distribution.update() or .update_non_slot()
-# call.
-
-
-_update_device = threading.local()
-
-
-def get_update_device():
-  """Get the current device if in a `DistributionStrategy.update()` call."""
-  try:
-    return _update_device.current
-  except AttributeError:
-    return None
-
-
-class UpdateContext(object):
-  """Context manager when you are in `update()` or `update_non_slot()`."""
-
-  def __init__(self, device):
-    self._device = device
-    self._old_device = None
-
-  def __enter__(self):
-    self._old_device = get_update_device()
-    _update_device.current = self._device
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    del exception_type, exception_value, traceback
-    _update_device.current = self._old_device
-
-
-# ------------------------------------------------------------------------------
-# Public utility functions.
-
-
-def get_loss_reduction():
-  """Reduce `aggregation` corresponding to the last loss reduction."""
-  loss_reduction = ops.get_default_graph()._last_loss_reduction  # pylint: disable=protected-access
-  if loss_reduction == losses_impl.Reduction.SUM:
-    return variable_scope.VariableAggregation.SUM
-  return variable_scope.VariableAggregation.MEAN
-
-
-# ------------------------------------------------------------------------------
-# Internal API for validating the current thread mode
-
-
-def _require_cross_replica_context(distribution_strategy):
-  """Verify in cross-replica context for `distribution_strategy`."""
-  context = _get_per_thread_mode()
-  if context.cross_replica_context is distribution_strategy: return
-  # We have an error to report, figure out the right message.
-  if context.distribution_strategy is not distribution_strategy:
-    if not distribution_strategy_context.has_distribution_strategy():
-      raise RuntimeError(
-          'Need to be inside "with distribution_strategy.scope()" for %s' %
-          (distribution_strategy,))
-    else:
-      raise RuntimeError(
-          "Mixing different DistributionStrategy objects: %s is not %s" %
-          (context.distribution_strategy, distribution_strategy))
-  assert context.cross_replica_context is None
-  raise RuntimeError("Method requires being in cross-replica context, use "
-                     "get_replica_context().merge_call()")
-
-
-def require_replica_context(replica_ctx):
-  """Verify in `replica_ctx` replica context."""
-  context = _get_per_thread_mode()
-  if context.replica_context is replica_ctx: return
-  # We have an error to report, figure out the right message.
-  if context.replica_context is None:
-    raise RuntimeError("Need to be inside `call_for_each_replica()`")
-  if context.distribution_strategy is replica_ctx.distribution_strategy:
-    # Two different ReplicaContexts with the same DistributionStrategy.
-    raise RuntimeError("Mismatching replica context.")
-  raise RuntimeError(
-      "Mismatching DistributionStrategy objects: %s is not %s." %
-      (context.distribution_strategy, replica_ctx.distribution_strategy))
-
-
-def _require_distribution_strategy_scope(distribution_strategy):
-  """Verify in a `distribution_strategy.scope()` in this thread."""
-  context = _get_per_thread_mode()
-  if context.distribution_strategy is distribution_strategy: return
-  # We have an error to report, figure out the right message.
-  if not distribution_strategy_context.has_distribution_strategy():
-    raise RuntimeError(
-        'Need to be inside "with distribution_strategy.scope()" for %s' %
-        (distribution_strategy,))
-  else:
-    raise RuntimeError(
-        "Mixing different DistributionStrategy objects: %s is not %s" %
-        (context.distribution_strategy, distribution_strategy))
-
-
-# ------------------------------------------------------------------------------
-# Internal context managers used to implement the DistributionStrategy
-# base class
-
-
-class _CurrentDistributionContext(object):
-  """Context manager for setting the `DistributionStrategy` and var creator."""
-
-  def __init__(self,
-               distribution_strategy,
-               var_creator_scope,
-               var_scope=None,
-               default_device=None):
-    self._context = distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
-        distribution_strategy)
-    self._var_creator_scope = var_creator_scope
-    self._var_scope = var_scope
-    if default_device:
-      self._device_scope = ops.device(default_device)
-    else:
-      self._device_scope = None
-
-  def __enter__(self):
-    _push_per_thread_mode(self._context)
-    if self._var_scope:
-      self._var_scope.__enter__()
-    self._var_creator_scope.__enter__()
-    if self._device_scope:
-      self._device_scope.__enter__()
-    return self._context.distribution_strategy
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    if self._device_scope:
-      self._device_scope.__exit__(exception_type, exception_value, traceback)
-    self._var_creator_scope.__exit__(exception_type, exception_value, traceback)
-    if self._var_scope:
-      self._var_scope.__exit__(exception_type, exception_value, traceback)
-    _pop_per_thread_mode()
-
-
-class _SameScopeAgainContext(object):
-  """Trivial context manager when you are already in `scope()`."""
-
-  def __init__(self, distribution_strategy):
-    self._distribution_strategy = distribution_strategy
-
-  def __enter__(self):
-    return self._distribution_strategy
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    del exception_type, exception_value, traceback
-
-
-# ------------------------------------------------------------------------------
-# Base classes for all distribution strategies.
-
-
-class DistributionStrategy(object):
-  """A list of devices with a state & compute distribution policy.
-
-  See [tensorflow/contrib/distribute/README.md](
-  https://www.tensorflow.org/code/tensorflow/contrib/distribute/README.md)
-  for overview and examples.
-
-  The intent is that you can write an algorithm in a stylized way and
-  it will be usable with a variety of different `DistributionStrategy`
-  implementations. Each descendant will implement a different strategy
-  for distributing the algorithm across multiple devices/machines.
-  Furthermore, these changes can be hidden inside the specific layers
-  and other library classes that need special treatment to run in a
-  distributed setting, so that most users' model definition code can
-  run unchanged. The `DistributionStrategy` API works the same way
-  with eager and graph execution.
-
-  First let's introduce a few high-level concepts:
-
-  * _Data parallelism_ is where we run multiple copies of the model
-    on different slices of the input data. This is in contrast to
-    _model parallelism_ where we divide up a single copy of a model
-    across multiple devices.
-    Note: we only support data parallelism for now, but
-    hope to add support for model parallelism in the future.
-  * A _replica_ is one copy of the model, running on one slice of the
-    input data.
-  * _Synchronous_, or more commonly _sync_, training is where the
-    updates from each replica are aggregated together before updating
-    the model variables. This is in contrast to _asynchronous_, or
-    _async_ training, where each replica updates the model variables
-    independently.
-  * Furthermore you might run your computation on multiple devices
-    on one machine (or "host"), or on multiple machines/hosts.
-    If you are running on multiple machines, you might have a
-    single master host that drives computation across all of them,
-    or you might have multiple clients driving the computation
-    asynchronously.
-
-  To distribute an algorithm, we might use some of these ingredients:
-
-  * Parameter servers: These are hosts that hold a single copy of
-    parameters/variables. All replicas that want to operate on a variable
-    retrieve it at the beginning of a step and send an update to be
-    applied at the end of the step. Can support either sync or async
-    training.
-  * Mirrored variables: These are variables that are copied to multiple
-    devices, where we keep the copies in sync by applying the same
-    updates to every copy. Normally would only be used with sync training.
-  * Reductions and Allreduce: A _reduction_ is some method of
-    aggregating multiple values into one value, like "sum" or
-    "mean". If doing sync training, we will perform a reduction on the
-    gradients to a parameter from all replicas before applying the
-    update. Allreduce is an algorithm for performing a reduction on
-    values from multiple devices and making the result available on
-    all of those devices.
-  * In the future we will have support for TensorFlow's partitioned
-    variables, where a single variable is split across multiple
-    devices.
-
-  We have then a few approaches we want to support:
-
-  * Code written (as if) with no knowledge of class `DistributionStrategy`.
-    This code should work as before, even if some of the layers, etc.
-    used by that code are written to be distribution-aware. This is done
-    by having a default `DistributionStrategy` that gives ordinary behavior,
-    and by default being in a single replica context.
-  * Ordinary model code that you want to run using a specific
-    `DistributionStrategy`. This can be as simple as:
-
-    ```
-    with my_distribution.scope():
-      iterator = my_distribution.distribute_dataset(
-          dataset).make_one_shot_iterator()
-      replica_train_ops = my_distribution.call_for_each_replica(
-          replica_fn, args=(iterator.get_next(),))
-      train_op = tf.group(my_distribution.unwrap(replica_train_ops))
-    ```
-
-    This takes an ordinary `dataset` and `replica_fn` and runs it
-    distributed using a particular `DistributionStrategy` in
-    `my_distribution`. Any variables created in `replica_fn` are created
-    using `my_distribution`'s policy, and library functions called by
-    `replica_fn` can use the `get_replica_context()` API to get enhanced
-    behavior in this case.
-
-    You can also create an initializable iterator instead of a one-shot
-    iterator. In that case, you will need to ensure that you initialize the
-    iterator before calling get_next.
-    ```
-    iterator = my_distribution.distribute_dataset(
-        dataset).make_initializable_iterator())
-    session.run(iterator.initializer)
-    ```
-
-  * If you want to write a distributed algorithm, you may use any of
-    the `DistributionStrategy` APIs inside a
-    `with my_distribution.scope():` block of code.
-
-  Lower-level concepts:
-
-  * Wrapped values: In order to represent values parallel across devices
-    (either replicas or the devices associated with a particular value), we
-    wrap them in a "PerReplica" or "Mirrored" object that contains a map
-    from device to values. "PerReplica" is used when the value may be
-    different across replicas, and "Mirrored" when the value are the same.
-  * Unwrapping and merging: Consider calling a function `fn` on
-    multiple replicas, like `call_for_each_replica(fn, args=[w])` with an
-    argument `w` that is a wrapped value. This means `w` will have a
-    map taking replica device `d0` to `w0`, replica device `d1` to `w1`,
-    etc. `call_for_each_replica()` unwraps `w` before calling `fn`, so
-    it calls `fn(w0)` on `d0`, `fn(w1)` on `d1`, etc.  It then merges
-    the return values from `fn()`, which can possibly result in
-    wrapped values. For example, let's say `fn()` returns a tuple with
-    three components: `(x, a, v0)` from replica 0, `(x, b, v1)` on replica 1,
-    etc. If the first component is the same object `x` from every
-    replica, then the first component of the merged result will also be
-    `x`. If the second component is different (`a`, `b`, ...)  from
-    each replica, then the merged value will have a wrapped map from
-    replica device to the different values. If the third component is
-    the members of a mirrored variable (`v` maps `d0` to `v0`, `d1` to
-    `v1`, etc.), then the merged result will be that mirrored variable
-    (`v`).
-  * Replica context vs. Cross-replica context: _replica context_ is when we
-    are in some function that is being called once for each replica.
-    Otherwise we are in cross-replica context, which is useful for
-    calling `DistributionStrategy` methods which operate across the
-    replicas (like `reduce()`). By default you start in a replica context
-    (the default "single replica context") and then some methods can
-    switch you back and forth, as described below.
-  * Worker devices vs. parameter devices: Most replica computations will
-    happen on worker devices. Since we don't yet support model
-    parallelism, there will be one worker device per replica. When using
-    parameter servers (see above), the set of devices holding
-    variables may be different, otherwise the parameter devices might
-    match the worker devices.
-  * Non-slot devices are some subset of the parameter devices where we
-    put all the non-slot variables. We need to ensure that all
-    non-slot variables are allocated on the same device, or mirrored
-    across the same set of devices. If you have some variable you want
-    to colocate all the non-slot variables with, you can use
-    `colocate_vars_with()` to get the remaining non-slot variables on
-    the same device.  Otherwise you can use `non_slot_devices()` to
-    pick a consistent set of devices to pass to both
-    `colocate_vars_with()` and `update_non_slot()`.
-
-  When using a `DistributionStrategy`, we have a new type dimension
-  called _locality_ that says what values are compatible with which
-  APIs:
-
-  * T: different value for each replica (e.g. a PerReplica-wrapped value).
-  * M: value is "mirrored" across replicas, i.e. there are copies with the
-    same value on each replica (e.g. a Mirrored-wrapped value).
-  * V(`v`): value is "mirrored" across all the devices which have a
-    copy of variable `v` (also a Mirrored-wrapped value, but over
-    parameter devices instead of worker devices).
-  * N: value is "mirrored" across all the "non-slot" devices
-
-  Rules for methods with respect to locality and single-replica vs.
-  cross-replica context:
-
-  * `with d.scope()`: default single-replica context -> cross-replica context
-    for `d`
-  * `with d.colocate_vars_with(v)`: in replica/cross-replica context, variables
-    will be created with locality V(`v`). That is, if we write
-    `with d.colocate_vars_with(v1): v2 = tf.get_variable(...)`, then
-    `v2` will have locality V(`v1`), i.e. locality V(`v2`) will equal
-    V(`v1`).
-  * `with d.colocate_vars_with(d.non_slot_devices(...))`: in
-    replica/cross-replica context, variables will be created with locality N
-  * `v = tf.get_variable(...)`: in replica/cross-replica context, creates
-    a variable (which by definition will have locality V(`v`), though
-    will match another locality if inside a `colocate_vars_with`
-    scope).
-  * `d.distribute_dataset(dataset).make_one_shot_iterator()`: in cross-replica
-    context, produces an iterator with locality T
-  * `d.broadcast(t)`: in cross-replica context, produces a value with locality M
-  * `d.broadcast(t, v)`: in cross-replica context, produces a value with
-    locality V(`v`)
-  * `d.call_for_each_replica(fn, ...)`: in cross-replica context, runs
-    `fn()` in a replica context (and so may call `get_replica_context()` and
-    use its API, including `merge_call()` to get back to cross-replica
-    context), once for each replica. May use values with locality T or
-    M, and any variable.
-  * `d.reduce(m, t, t)`: in cross-replica context, accepts t with locality T
-    and produces a value with locality M.
-  * `d.reduce(m, t, v)`: in cross-replica context, accepts t with
-    locality T and produces a value with locality V(`v`).
-  * `d.batch_reduce(m, [(t, v)]): see `d.reduce()`
-  * `d.update(v, fn, ...)`: in cross-replica context, runs `fn()` once
-    for each device `v` is copied to, all inputs should have locality
-    V(`v`), output will have locality V(`v`) as well.
-  * `d.update_non_slot(d.non_slot_devices(), fn)`: in cross-replica
-    context, like `d.update()` except with locality N.
-  * `d.read_var(v)`: Gets the (read-only) value of the variable `v` (on
-    the device determined by the current device scope), aggregating
-    across replicas for replica-local variables. Frequently, this will be
-    done automatically when using `v` in an expression or fetching it in
-    a cross-replica context, but this function can be used to force that
-    conversion happens at a particular point in time (for example, to
-    add the result of the conversion to a graph collection).
-
-  The standard pattern for updating variables is to:
-
-  1. Wrap your input dataset in `d.distribute_dataset()` and create an iterator.
-  2. Define each replica `d.call_for_each_replica()` up to the point of
-     getting a list of gradient, variable pairs.
-  3. Call `d.reduce(VariableAggregation.SUM, t, v)` or `d.batch_reduce()` to sum
-     the gradients (with locality T) into values with locality V(`v`).
-  4. Call `d.update(v)` for each variable to update its value.
-
-  Steps 3 and 4 are done automatically by class `Optimizer` if you call
-  its `apply_gradients` method in a replica context. Otherwise you can
-  manually call its `_distributed_apply` method in a cross-replica context.
-
-  Another thing you might want to do in the middle of your replica function
-  is an all-reduce of some intermediate value, using `d.reduce()` or
-  `d.batch_reduce()`. You simply provide the same tensor as the input and
-  destination.
-
-  Layers should expect to be called in a replica context, and can use
-  the `get_replica_context()` function to get a `ReplicaContext` object. The
-  `ReplicaContext` object has a `merge_call()` method for entering
-  cross-replica context where you can use `reduce()` (or
-  `batch_reduce()`) and then optionally `update()` to update state.
-
-  You may use this API whether or not a `DistributionStrategy` is
-  being used, since there is a default implementation of
-  `ReplicaContext` and `DistributionStrategy`.
-  """
-
-  # TODO(josh11b): Raise an exception if variable partitioning requested before
-  #   we add support.
-  # TODO(josh11b): Also `parameter_device_index` property?
-  # TODO(josh11b): `map()`
-  # TODO(josh11b): ClusterSpec/ClusterResolver
-  # TODO(josh11b): Partitioned computations, state; sharding
-  # TODO(josh11b): Model parallelism: "replicas" with multiple devices; shuffling
-  # TODO(josh11b): List of replicas with their worker and parameter devices
-  #   (where the parameter devices may overlap in the ps case).
-
-  def __init__(self):
-    self._default_device = None
-    # This property is used to determine if we should set drop_remainder=True
-    # when creating Datasets from numpy array inputs.
-    self._require_static_shapes = False
-
-  def scope(self):
-    """Returns a context manager selecting this DistributionStrategy as current.
-
-    Inside a `with distribution_strategy.scope():` code block, this thread
-    will use a variable creator set by `distribution_strategy`, and will
-    enter its "cross-replica context".
-
-    Returns:
-      A context manager.
-    """
-    if distribution_strategy_context.has_distribution_strategy():
-      _require_cross_replica_context(self)
-      return _SameScopeAgainContext(self)
-
-    def creator_with_resource_vars(*args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      kwargs["use_resource"] = True
-      return self._create_variable(*args, **kwargs)
-
-    def distributed_getter(getter, *args, **kwargs):
-      if not self._allow_variable_partition():
-        if kwargs.pop("partitioner", None) is not None:
-          tf_logging.log_first_n(
-              tf_logging.WARN, "Partitioned variables are disabled when using "
-              "current DistributionStrategy.", 1)
-      return getter(*args, **kwargs)
-
-    return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator_with_resource_vars),
-        variable_scope.variable_scope(
-            variable_scope.get_variable_scope(),
-            custom_getter=distributed_getter), self._default_device)
-
-  def _allow_variable_partition(self):
-    return False
-
-  def _create_variable(self, next_creator, *args, **kwargs):
-    # Note: should support "colocate_with" argument.
-    raise NotImplementedError("must be implemented in descendants")
-
-  def read_var(self, v):
-    """Reads the value of a variable.
-
-    Returns the aggregate value of a replica-local variable, or the
-    (read-only) value of any other variable.
-
-    Args:
-      v: A variable allocated within the scope of this `DistributionStrategy`.
-
-    Returns:
-      A tensor representing the value of `v`, aggregated across replicas if
-      necessary.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def colocate_vars_with(self, colocate_with_variable):
-    """Scope that controls which devices variables will be created on.
-
-    No operations should be added to the graph inside this scope, it
-    should only be used when creating variables (some implementations
-    work by changing variable creation, others work by using a
-    tf.colocate_with() scope).
-
-    This may only be used inside `self.scope()`.
-
-    Example usage:
-
-    ```
-    with distribution_strategy.scope():
-      var1 = tf.get_variable(...)
-      with distribution_strategy.colocate_vars_with(v1):
-        # var2 and var3 will be created on the same device(s) as var1
-        var2 = tf.get_variable(...)
-        var3 = tf.get_variable(...)
-
-      def fn(v1, v2, v3):
-        # operates on v1 from var1, v2 from var2, and v3 from var3
-
-      # `fn` runs on every device `v1` is on, `v2` and `v3` will be there too.
-      distribution_strategy.update(v1, fn, v2, v3)
-    ```
-
-    Args:
-      colocate_with_variable: A created in `self.scope()`. Variables created
-        while in the returned context manager will be on the same set of
-        devices as `colocate_with_variable`.
-
-    Returns:
-      A context manager.
-    """
-    def create_colocated_variable(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      kwargs["use_resource"] = True
-      kwargs["colocate_with"] = colocate_with_variable
-      return next_creator(*args, **kwargs)
-
-    _require_distribution_strategy_scope(self)
-    return variable_scope.variable_creator_scope(create_colocated_variable)
-
-  def _call_dataset_fn(self, dataset_fn):
-    result = dataset_fn()
-    if not isinstance(result, dataset_ops.Dataset):
-      raise ValueError(
-          "dataset_fn() must return a tf.data.Dataset when using a "
-          "DistributionStrategy.")
-    return result
-
-  # TODO(josh11b): `PerReplicaDataset` currently only implements a few methods of
-  # Dataset API such as make_one_shot_iterator and make_initializable_iterator.
-  # Extend to implement more functionality of datasets.
-  def distribute_dataset(self, dataset_fn):
-    """Return a `dataset` split across all replicas.
-
-    Suitable for providing input to for `call_for_each_replica()` by creating an
-    iterator:
-
-    ```
-    def dataset_fn():
-      return tf.data.Dataset.from_tensors([[1.]]).repeat()
-    with distribution_strategy.scope():
-      distributed_dataset = distribution_strategy.distribute_dataset(dataset_fn)
-      iterator = distributed_dataset.make_one_shot_iterator()
-      replica_results = distribution_strategy.call_for_each_replica(
-          replica_fn, args=(iterator.get_next(),))
-    ```
-
-    Args:
-      dataset_fn: A function that returns a `tf.data.Dataset`.
-
-    Returns:
-      A `PerReplicaDataset` that will produce data for each replica.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def broadcast(self, tensor, destinations=None):
-    """Mirror a tensor on one device to all worker devices.
-
-    Args:
-      tensor: A Tensor value to broadcast.
-      destinations: An optional mirrored variable, device string, or
-        list of device strings, specifying the destination devices
-        to copy `tensor` to. Defaults to `self.worker_devices`.
-
-    Returns:
-      A value mirrored to `destinations` devices.
-    """
-    # TODO(josh11b): More docstring
-    _require_cross_replica_context(self)
-    return self._broadcast(tensor, destinations)
-
-  def _broadcast(self, tensor, destinations):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def initialize(self):
-    """Any initialization to be done before running any computations.
-
-    In eager mode, it executes any initialization as a side effect.
-    In graph mode, it creates the initialization ops and returns them.
-
-    For example, TPU initialize_system ops.
-
-    Returns:
-      A list of ops to execute.
-    """
-    return []
-
-  def finalize(self):
-    """Any final actions to be done at the end of all computations.
-
-    In eager mode, it executes any finalize actions as a side effect.
-    In graph mode, it creates the finalize ops and returns them.
-
-    For example, TPU shutdown ops.
-
-    Returns:
-      A list of ops to execute.
-    """
-    return []
-
-  def run_steps_on_dataset(self, fn, iterator, iterations=1,
-                           initial_loop_values=None):
-    """Run `fn` with input from `iterator` for `iterations` times.
-
-    This method can be used to run a step function for training a number of
-    times using input from a dataset.
-
-    Args:
-      fn: function to run using this distribution strategy. The function must
-        have the following signature: `def fn(context, *inputs)`.
-        `context` is an instance of `MultiStepContext` that will be passed when
-        `fn` is run. `context` can be used to specify the outputs to be returned
-        from `fn` by calling `context.set_last_step_output`. It can also be used
-        to capture non tensor outputs by `context.set_non_tensor_output`.
-        See `MultiStepContext` documentation for more information.
-        `inputs` will have same type/structure as `iterator.get_next()`. If the
-        `iterator.get_next()` returns a tuple say `return x, y` then whose will
-        be unpacked and passed to the `step_fn`; and step_fn signature would
-        look like `def step_fn(context, x, y)`. If the iterator returns a single
-        value say `return x` then the value is passed as is; the step_fn
-        signature would look like `def step_fn(context, x)`.
-        Typically, `fn` will use `call_for_each_replica` method of the strategy
-        to distribute the computation over multiple replicas.
-      iterator: Iterator of a dataset that represents the input for `fn`. The
-        caller is responsible for initializing the iterator as needed.
-      iterations: (Optional) Number of iterations that `fn` should be run.
-        Defaults to 1.
-      initial_loop_values: (Optional) Initial values to be passed into the
-        loop that runs `fn`. Defaults to `None`. # TODO(priyag): Remove
-        initial_loop_values argument when we have a mechanism to infer the
-        outputs of `fn`.
-
-    Returns:
-      Returns the `MultiStepContext` object which has the following properties,
-      among other things:
-        - run_op: An op that runs `fn` `iterations` times.
-        - last_step_outputs: A dictionary containing tensors set using
-        `context.set_last_step_output`. Evaluating this returns the value of
-        the tensors after the last iteration.
-        - non_tensor_outputs: A dictionatry containing anything that was set by
-          `fn` by calling `context.set_non_tensor_output`.
-    """
-    _require_cross_replica_context(self)
-    return self._run_steps_on_dataset(fn, iterator, iterations,
-                                      initial_loop_values)
-
-  def _run_steps_on_dataset(self, fn, iterator, iterations,
-                            initial_loop_values):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def call_for_each_replica(self, fn, *args, **kwargs):
-    """Run `fn` once per replica.
-
-    `fn` may call `tf.get_replica_context()` to access methods such as
-    `replica_id()` and `merge_call()`.
-
-    `merge_call()` is used to communicate between the replicas and
-    re-enter the cross-replica context. All replicas pause their execution
-    having encountered a `merge_call()` call. After that the
-    `merge_fn`-function is executed. Its results are then unwrapped and
-    given back to each replica call. After that execution resumes until
-    `fn` is complete or encounters another `merge_call()`.  Example:
-
-    ```python
-    # Called once in "cross-replica" context.
-    def merge_fn(distribution, three_plus_replica_id):
-      # sum the values across replicas
-      return sum(distribution.unwrap(three_plus_replica_id))
-
-    # Called once per replica in `distribution`, in a "replica" context.
-    def fn(three):
-      replica_ctx = tf.get_replica_context()
-      v = three + replica_ctx.replica_id
-      # Computes the sum of the `v` values across all replicas.
-      s = replica_ctx.merge_call(merge_fn, args=(v,))
-      return s + v
-
-    with distribution.scope():
-      # in "cross-replica" context
-      ...
-      merged_results = distribution.call_for_each_replica(fn, args=[3])
-      # merged_results has the values from every replica execution of `fn`.
-      print(distribution.unwrap(merged_results))  # Prints a list
-    ```
-
-    Args:
-      fn: function to run (will be run once per replica).
-      args: Tuple or list with positional arguments for `fn`.
-      kwargs: Dict with keyword arguments for `fn`.
-
-    Returns:
-      Merged return value of `fn` across all replicas.
-    """
-    _require_cross_replica_context(self)
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to call_for_each_replica")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to call_for_each_replica")
-      kwargs = k
-    kwargs.pop("run_concurrently", None)  # Ignore old option.
-    return self._call_for_each_replica(fn, args, kwargs)
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def reduce(self, aggregation, value, destinations):
-    """Combine (via e.g. sum or mean) values across replicas.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_REPLICA`.
-      value: A per-replica value with one value per replica.
-      destinations: A mirrored variable, a per-replica tensor, a device string,
-        or list of device strings. The return value will be copied to all
-        destination devices (or all the devices where the `destinations` value
-        resides). To perform an all-reduction, pass `value` to `destinations`.
-
-    Returns:
-      A value mirrored to `destinations`.
-    """
-    # TODO(josh11b): More docstring
-    # TODO(josh11b): Return an unwrapped value if colocate_with is a
-    # single device.
-    _require_cross_replica_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-    ]
-    return self._reduce(aggregation, value, destinations)
-
-  def _reduce(self, aggregation, value, destinations):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def batch_reduce(self, aggregation, value_destination_pairs):
-    """Combine multiple `reduce` calls into one for faster execution.
-
-    Args:
-      aggregation: Indicates how a variable will be aggregated. Accepted values
-        are `tf.VariableAggregation.SUM`, `tf.VariableAggregation.MEAN`,
-        `tf.VariableAggregation.ONLY_FIRST_REPLICA`.
-      value_destination_pairs: A sequence of (value, destinations)
-        pairs. See `reduce()` for a description.
-
-    Returns:
-      A list of mirrored values, one per pair in `value_destination_pairs`.
-    """
-    # TODO(josh11b): More docstring
-    _require_cross_replica_context(self)
-    assert aggregation in [
-        variable_scope.VariableAggregation.SUM,
-        variable_scope.VariableAggregation.MEAN,
-        variable_scope.VariableAggregation.ONLY_FIRST_REPLICA
-    ]
-    return self._batch_reduce(aggregation, value_destination_pairs)
-
-  def _batch_reduce(self, aggregation, value_destination_pairs):
-    return [
-        self.reduce(aggregation, t, destinations=v)
-        for t, v in value_destination_pairs
-    ]
-
-  def update(self, var, fn, *args, **kwargs):
-    """Run `fn` to update `var` using inputs mirrored to the same devices.
-
-    If `var` is mirrored across multiple devices, then this implements
-    logic like:
-
-    ```
-    results = {}
-    for device, v in var:
-      with tf.device(device):
-        # *args and **kwargs will be unwrapped if they are mirrored.
-        results[device] = fn(v, *args, **kwargs)
-    return merged(results)
-    ```
-
-    Otherwise this returns `fn(var, *args, **kwargs)` colocated with `var`.
-
-    Neither `*args` nor `**kwargs` may contain per-replica values.
-    If they contain mirrored values, they will be unwrapped before
-    calling `fn`.
-
-    Args:
-      var: Variable, possibly mirrored to multiple devices, to operate on.
-      fn: Function to call. Should take the variable as the first argument.
-      *args: Additional positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped.
-
-    Returns:
-      By default, the merged return value of `fn` across all replicas.  The
-      merged result has dependencies to make sure that if it is evaluated at
-      all, the side effects (updates) will happen on every replica. If instead
-      "grouped=False" is specified, this function will return a nest of lists
-      where each list has an element per replica, and the caller is responsible
-      for ensuring all elements are executed.
-    """
-    _require_cross_replica_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update(var, options, fn, *args, **kwargs)
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def update_non_slot(self, colocate_with, fn, *args, **kwargs):
-    """Runs `fn(*args, **kwargs)` on `colocate_with` devices.
-
-    Args:
-      colocate_with: The return value of `non_slot_devices()`.
-      fn: Function to execute.
-      *args: Positional arguments to pass to `fn()`.
-      **kwargs: Keyword arguments to pass to `fn()`. If "grouped=False" is
-        specified, the return value will be unwrapped and the caller is
-        responsible for ensuring all elements are executed.
-
-    Returns:
-      Return value of `fn`, possibly merged across devices.
-    """
-    _require_cross_replica_context(self)
-    options = {"grouped": kwargs.pop("grouped", True)}
-    return self._update_non_slot(colocate_with, options, fn, *args, **kwargs)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def unwrap(self, value):
-    """Returns the list of all per-replica values contained in `value`.
-
-    Args:
-      value: A value returned by `call_for_each_replica()` or a variable
-        created in `scope()`.
-
-    Returns:
-      A list of values contained in `value`. If `value` represents a single
-      value, this returns `[value].`
-    """
-    return self._unwrap(value)
-
-  def value_container(self, value):
-    """Returns the container that this per-replica `value` belongs to.
-
-    Args:
-      value: A value returned by `call_for_each_replica()` or a variable
-        created in `scope()`.
-
-    Returns:
-      A container that `value` belongs to.
-      If value does not belong to any container (including the case of
-      container having been destroyed), returns the value itself.
-      `value in unwrap(value_container(value))` will always be true.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def _unwrap(self, distributed_value):
-    raise NotImplementedError("must be implemented in descendants")
-
-  def group(self, value, name=None):
-    """Shortcut for `tf.group(distribution.unwrap(value))`."""
-    value = nest.flatten(self.unwrap(value))
-
-    if len(value) != 1 or name is not None:
-      return control_flow_ops.group(value, name=name)
-    # Special handling for the common case of one op.
-    v, = value
-    if hasattr(v, "op"):
-      v = v.op
-    return v
-
-  @property
-  def require_static_shapes(self):
-    return self._require_static_shapes
-
-  @property
-  def num_replicas(self):
-    """Returns number of replicas, for purposes of averaging across replicas.
-
-    DEPRECATED: use `num_replicas_in_sync` instead.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def num_replicas_in_sync(self):
-    """Returns number of replicas over which gradients are aggregated."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def worker_devices(self):
-    """Returns the list of devices used to run `call_for_each_replica()` calls.
-    """
-    # TODO(josh11b): More docstring
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def parameter_devices(self):
-    """Returns the list of devices used for variable and `update` placement."""
-    # TODO(josh11b): More docstring
-    raise NotImplementedError("must be implemented in descendants")
-
-  def non_slot_devices(self, var_list):
-    """Device(s) for non-slot variables.
-
-    Create variables on these devices in a
-    `with colocate_vars_with(non_slot_devices(...)):` block.
-    Update those using `update_non_slot()`.
-
-    Args:
-      var_list: The list of variables being optimized, needed with the
-        default `DistributionStrategy`.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def worker_device_index(self):
-    """An object mapping worker device to an id.
-
-    This might be passed as an argument to `call_for_each_replica()`, as in:
-
-    ```
-    with distribution_strategy.scope():
-
-      def fn(device_id):
-        # device_id is an integer. `fn` is being executed on device:
-        #    distribution_strategy.worker_devices[device_id].
-
-      distribution_strategy.call_for_each_replica(
-          fn, distribution_strategy.worker_device_index)
-    ```
-
-    Returns:
-      An index object, or the integer 0 if there is only a single replica.
-    """
-    _require_cross_replica_context(self)
-    return self._worker_device_index()
-
-  def _worker_device_index(self):
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def between_graph(self):
-    """Whether the strategy uses between-graph replication or not.
-
-      This is expected to return a constant value that will not be changed
-      throughout its life cycle.
-    """
-    raise NotImplementedError("must be implemented in descendants")
-
-  def configure(self,
-                session_config=None,
-                cluster_spec=None,
-                task_type=None,
-                task_id=None):
-    """Configures the strategy class."""
-    del session_config, cluster_spec, task_type, task_id
-
-  @property
-  def should_init(self):
-    """Whether initialization is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def should_checkpoint(self):
-    """Whether checkpointing is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-  @property
-  def should_save_summary(self):
-    """Whether saving summaries is needed."""
-    raise NotImplementedError("must be implemented in descendants")
-
-
-# A note about the difference between the context managers
-# `ReplicaContext` (defined here) and `_CurrentDistributionContext`
-# (defined above) used by `DistributionStrategy.scope()`:
-#
-# * a ReplicaContext is only present during a `call_for_each_replica()`
-#   call (except during a `merge_run` call) and in such a scope it
-#   will be returned by calls to `get_replica_context()`.  Implementers of new
-#   DistributionStrategy descendants will frequently also need to
-#   define a descendant of ReplicaContext, and are responsible for
-#   entering and exiting this context.
-#
-# * DistributionStrategy.scope() sets up a variable_creator scope that
-#   changes variable creation calls (e.g. to make mirrored
-#   variables). This is intended as an outer scope that users enter once
-#   around their model creation and graph definition. There is no
-#   anticipated need to define descendants of _CurrentDistributionContext.
-#   It sets the current DistributionStrategy for purposes of
-#   `get_distribution_strategy()` and `has_distribution_strategy()`
-#   and switches the thread mode to a "cross-replica context".
-class ReplicaContext(object):
-  """DistributionStrategy API inside a `call_for_each_replica()` call."""
-
-  def __init__(self, distribution_strategy, replica_id):
-    self._distribution_strategy = distribution_strategy
-    self._thread_context = distribution_strategy_context._InReplicaThreadMode(  # pylint: disable=protected-access
-        self)
-    self._replica_id = replica_id
-
-  def __enter__(self):
-    _push_per_thread_mode(self._thread_context)
-
-  def __exit__(self, exception_type, exception_value, traceback):
-    _pop_per_thread_mode()
-
-  def merge_call(self, merge_fn, *args, **kwargs):
-    """Merge args across replicas and run `merge_fn` in a cross-replica context.
-
-    This allows communication and coordination when there are multiple calls
-    to a model function triggered by a call to
-    `distribution.call_for_each_replica(model_fn, ...)`.
-
-    See `MirroredDistribution.call_for_each_replica()` for an explanation.
-
-    Otherwise, this is equivalent to:
-
-    ```
-    distribution = get_distribution_strategy()
-    with cross-replica-context(distribution):
-      return merge_fn(distribution, *args, **kwargs)
-    ```
-
-    Args:
-      merge_fn: function that joins arguments from threads that are given as
-        PerReplica. It accepts `DistributionStrategy` object as the first
-        argument.
-      args: List or tuple with positional per-thread arguments for `merge_fn`
-      kwargs: Dict with keyword per-thread arguments for `merge_fn`.
-
-    Returns:
-      The return value of `merge_fn`, except for `PerReplica` values which are
-      unpacked.
-    """
-    require_replica_context(self)
-    # Handle old *args, **kwargs, and new args=(...), kwargs={...}, to
-    # allow transition.
-    a = kwargs.pop("args", None)
-    if a is not None:
-      if args:
-        raise ValueError(
-            "Can't pass *args and args=... to merge_call")
-      args = a
-    k = kwargs.pop("kwargs", None)
-    if k is not None:
-      if kwargs:
-        raise ValueError(
-            "Can't pass **kwargs and kwargs=... to merge_call")
-      kwargs = k
-    return self._merge_call(merge_fn, args, kwargs)
-
-  def _merge_call(self, merge_fn, args, kwargs):
-    """Default implementation for single replica."""
-    _push_per_thread_mode(  # thread-local, so not needed with multiple threads
-        distribution_strategy_context._CrossReplicaThreadMode(  # pylint: disable=protected-access
-            self._distribution_strategy))
-    try:
-      return merge_fn(self._distribution_strategy, *args, **kwargs)
-    finally:
-      _pop_per_thread_mode()
-
-  @property
-  def num_replicas(self):
-    """Returns number of replicas, for purposes of averaging across replicas."""
-    return self._distribution_strategy.num_replicas
-
-  @property
-  def num_replicas_in_sync(self):
-    """Returns number of replicas over which gradients are aggregated."""
-    return self._distribution_strategy.num_replicas_in_sync
-
-  @property
-  def replica_id(self):
-    """Which replica is being defined, a number from 0 to `num_replicas - 1`."""
-    require_replica_context(self)
-    return self._replica_id
-
-  @property
-  def distribution_strategy(self):
-    """The current `DistributionStrategy` object."""
-    return self._distribution_strategy
-
-  @property
-  def device(self):
-    """BEING DELETED: use .devices instead."""
-    raise RuntimeError("Use .devices instead")
-
-  @property
-  def devices(self):
-    """The devices this replica is to be executed on, as a list of strings."""
-    require_replica_context(self)
-    return [device_util.current()]
-
-  # TODO(josh11b): Implement `start_all_reduce(method, t)` for efficient
-  # all-reduce. It would return a function returning the result of reducing `t`
-  # across all replicas. The caller would wait to call this function until they
-  # needed the reduce result, allowing an efficient implementation:
-  # * With eager execution, the reduction could be performed asynchronously
-  #   in the background, not blocking until the result was needed.
-  # * When constructing a graph, it could batch up all reduction requests up
-  #   to that point that the first result is needed. Most likely this can be
-  #   implemented in terms of `merge_call()` and `batch_reduce()`.
-
-# ------------------------------------------------------------------------------
-
-
-class _DefaultDistributionStrategy(DistributionStrategy):
-  """Default `DistributionStrategy` if none is explicitly selected."""
-
-  def scope(self):
-    """Context manager setting a variable creator and `self` as current."""
-    if distribution_strategy_context.has_distribution_strategy():
-      raise RuntimeError("Must not nest DistributionStrategy scopes.")
-
-    def creator(next_creator, *args, **kwargs):
-      _require_distribution_strategy_scope(self)
-      return next_creator(*args, **kwargs)
-
-    return _CurrentDistributionContext(
-        self, variable_scope.variable_creator_scope(creator))
-
-  def colocate_vars_with(self, colocate_with_variable):
-    """Does not require `self.scope`."""
-    _require_distribution_strategy_scope(self)
-    return ops.colocate_with(colocate_with_variable)
-
-  def distribute_dataset(self, dataset_fn):
-    return self._call_dataset_fn(dataset_fn)
-
-  def _broadcast(self, tensor, destinations):
-    if destinations is None:
-      return tensor
-    else:
-      raise NotImplementedError("TODO")
-
-  def _call_for_each_replica(self, fn, args, kwargs):
-    with ReplicaContext(self, replica_id=0):
-      return fn(*args, **kwargs)
-
-  def _reduce(self, aggregation, value, destinations):
-    # TODO(josh11b): Use destinations?
-    del aggregation, destinations
-    return value
-
-  def _update(self, var, options, fn, *args, **kwargs):
-    # The implementations of _update() and _update_non_slot() are identical
-    # except _update() passes `var` as the first argument to `fn()`.
-    return self._update_non_slot(var, options, fn, var, *args, **kwargs)
-
-  def _update_non_slot(self, colocate_with, options, fn, *args, **kwargs):
-    should_group = options.pop("grouped")
-    assert not options  # Validate that we are processing all of the options.
-    # TODO(josh11b): Figure out what we should be passing to UpdateContext()
-    # once that value is used for something.
-    with ops.colocate_with(colocate_with), UpdateContext(colocate_with):
-      result = fn(*args, **kwargs)
-      if should_group:
-        return result
-      else:
-        return nest.map_structure(self._unwrap, result)
-
-  def read_var(self, replica_local_var):
-    return array_ops.identity(replica_local_var)
-
-  def _unwrap(self, distributed_value):
-    return [distributed_value]
-
-  def value_container(self, value):
-    return value
-
-  @property
-  def num_replicas(self):
-    return 1
-
-  @property
-  def num_replicas_in_sync(self):
-    return 1
-
-  @property
-  def worker_devices(self):
-    raise RuntimeError(
-        "worker_devices() method unsupported by _DefaultDistributionStrategy.")
-
-  @property
-  def parameter_devices(self):
-    raise RuntimeError("parameter_devices() method unsupported by "
-                       "_DefaultDistributionStrategy.")
-
-  def non_slot_devices(self, var_list):
-    return min(var_list, key=lambda x: x.name)
-
-  def _worker_device_index(self):
-    raise RuntimeError("worker_device_index() method unsupported by "
-                       "_DefaultDistributionStrategy.")
-
-
-# ------------------------------------------------------------------------------
-# We haven't yet implemented deserialization for DistributedVariables.
-# So here we catch any attempts to deserialize variables
-# when using distribution strategies.
-# pylint: disable=protected-access
-_original_from_proto = resource_variable_ops._from_proto_fn
-
-
-def _from_proto_fn(v, import_scope=None):
-  if distribution_strategy_context.has_distribution_strategy():
-    raise NotImplementedError(
-        "Deserialization of variables is not yet supported when using"
-        "distributed strategies.")
-  else:
-    return _original_from_proto(v, import_scope=import_scope)
-
-resource_variable_ops._from_proto_fn = _from_proto_fn
-# pylint: enable=protected-access
-
-
-#-------------------------------------------------------------------------------
-# Shorthand for some methods from distribution_strategy_context.
-_push_per_thread_mode = distribution_strategy_context._push_per_thread_mode  # pylint: disable=protected-access
-_get_per_thread_mode = distribution_strategy_context._get_per_thread_mode  # pylint: disable=protected-access
-_pop_per_thread_mode = distribution_strategy_context._pop_per_thread_mode  # pylint: disable=protected-access
+# pylint: disable=wildcard-import
+from tensorflow.python.distribute.distribute_lib import *
diff --git a/tensorflow/python/training/distribution_strategy_context.py b/tensorflow/python/training/distribution_strategy_context.py
index 278f35b97e4fda5cb7a5ed7590f2ecfbc628a88b..7391bf3b22dfd1a6f1b76e287132828fcc570c67 100644
--- a/tensorflow/python/training/distribution_strategy_context.py
+++ b/tensorflow/python/training/distribution_strategy_context.py
@@ -12,195 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""Utility to get distribution strategy related contexts."""
+"""Deprecated, please use ../distribute/distribution_strategy_context.py."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow.python.framework import ops
-from tensorflow.python.util.lazy_loader import LazyLoader
-
-
-# There is a circular dependency between this and `distribute` module. So we
-# load it lazily to workaround this.
-distribute_lib = LazyLoader(
-    "distribute_lib", globals(),
-    "tensorflow.python.training.distribute")
-
-# ------------------------------------------------------------------------------
-# Internal API for setting the current thread mode as being either in a
-# replica or cross-replica context for a particular distribution strategy.
-
-
-class _ThreadMode(object):
-
-  def __init__(self, dist, cross, replica):
-    self.distribution_strategy = dist
-    self.cross_replica_context = cross
-    self.replica_context = replica
-
-
-class _CrossReplicaThreadMode(_ThreadMode):
-
-  def __init__(self, distribution_strategy):
-    _ThreadMode.__init__(
-        self, distribution_strategy, distribution_strategy, None)
-
-
-class _InReplicaThreadMode(_ThreadMode):
-
-  def __init__(self, replica_ctx):
-    _ThreadMode.__init__(
-        self, replica_ctx.distribution_strategy, None, replica_ctx)
-
-
-def _push_per_thread_mode(context):
-  ops.get_default_graph()._distribution_strategy_stack.append(context)  # pylint: disable=protected-access
-
-
-def _pop_per_thread_mode():
-  ops.get_default_graph()._distribution_strategy_stack.pop(-1)  # pylint: disable=protected-access
-
-
-class _DefaultReplicaThreadMode(_ThreadMode):
-  """Type of default value returned by `_get_per_thread_mode()`.
-
-  Used when the thread-local stack is empty.
-  """
-
-  def __init__(self):
-    _ThreadMode.__init__(self, _get_default_distribution_strategy(), None,
-                         _get_default_replica_context())
-
-
-def _get_per_thread_mode():
-  try:
-    return ops.get_default_graph()._distribution_strategy_stack[-1]  # pylint: disable=protected-access
-  except (AttributeError, IndexError):
-    return _get_default_replica_mode()
-
-
-# ------------------------------------------------------------------------------
-# Public API for accessing the current thread mode
-
-
-def get_replica_context():
-  """Returns the current ReplicaContext or None if in a cross-replica context.
-
-  Note that execution:
-
-  1. starts in the default (single-replica) replica context (this function
-     will return the default ReplicaContext object);
-  2. switches to cross-replica context (in which case this will return
-     None) when entering a `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) replica context inside
-     `call_for_each_replica(fn, ...)`;
-  4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-replica context (and again
-     this function will return None).
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-replica context for the default `DistributionStrategy`. You may
-  also switch from the cross-replica context of 4 to a replica context by
-  calling `call_for_each_replica()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-replica context, in a replica context you should use the
-  `ReplicaContext` API instead.
-
-  Returns:
-    The current `ReplicaContext` object when in a replica context scope,
-    else None.
-
-    Exactly one of `get_replica_context()` and `get_cross_replica_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().replica_context
-
-
-def get_cross_replica_context():
-  """Returns the current DistributionStrategy if in a cross-replica context.
-
-  Note that execution:
-
-  1. starts in the default (single-replica) replica context;
-  2. switches to cross-replica context when entering a
-     `with DistributionStrategy.scope():` block;
-  3. switches to a (non-default) replica context inside
-     `call_for_each_replica(fn, ...)`;
-  4. if `fn` calls `get_replica_context()->merge_call(merge_fn, ...)`, then
-     inside `merge_fn` you are back in the cross-replica context.
-
-  Note that you can also go directly from step 1 to 4 to switch to a
-  cross-replica context for the default `DistributionStrategy`. You may
-  also switch from the cross-replica context of 4 to a replica context by
-  calling `call_for_each_replica()`, jumping back to step 3.
-
-  Most `DistributionStrategy` methods may only be executed in
-  a cross-replica context.
-
-  Returns:
-    Returns the current `DistributionStrategy` object in a cross-replica
-    context, or None.
-
-    Exactly one of `get_replica_context()` and `get_cross_replica_context()`
-    will return None in a particular block.
-  """
-  return _get_per_thread_mode().cross_replica_context
-
-
-def get_distribution_strategy():
-  """Returns the current `DistributionStrategy` object.
-
-  Prefer to use `get_replica_context()` or `get_cross_replica_context()`
-  instead when possible.
-
-  Returns:
-    A `DistributionStrategy` object. Inside a
-    `with distribution_strategy.scope()` block, it returns
-    `distribution_strategy`, otherwise it returns the default
-    (single-replica) `DistributionStrategy` object.
-  """
-  return _get_per_thread_mode().distribution_strategy
-
-
-def has_distribution_strategy():
-  """Return if there is a current non-default `DistributionStrategy`.
-
-  Returns:
-    True if inside a `with distribution_strategy.scope():`.
-  """
-  return get_distribution_strategy() is not _get_default_distribution_strategy()
-
-
-# ------------------------------------------------------------------------------
-# Defaults that are used when no distribution strategy is explicitly created.
-# We create them lazily in a function so that we can workaround the circular
-# dependency on distribute_lib. See lazy loader at the top of this file.
-
-_defaults = {
-    "distribution_strategy": None,
-    "replica_context": None,
-    "replica_mode": None
-}
-
-
-def _get_default_distribution_strategy():
-  if _defaults["distribution_strategy"] is None:
-    _defaults["distribution_strategy"] = (
-        distribute_lib._DefaultDistributionStrategy())  # pylint: disable=protected-access
-  return _defaults["distribution_strategy"]
-
-
-def _get_default_replica_context():
-  if _defaults["replica_context"] is None:
-    _defaults["replica_context"] = distribute_lib.ReplicaContext(
-        _get_default_distribution_strategy(), replica_id=0)
-  return _defaults["replica_context"]
-
-
-def _get_default_replica_mode():
-  if _defaults["replica_mode"] is None:
-    _defaults["replica_mode"] = _DefaultReplicaThreadMode()
-  return _defaults["replica_mode"]
+# pylint: disable=wildcard-import
+from tensorflow.python.distribute.distribution_strategy_context import *
diff --git a/tensorflow/python/training/evaluation.py b/tensorflow/python/training/evaluation.py
index 2c4eb02d533201cd1be2ea655c8823198dd714d5..a10178f8cfe3af1ac45a5084b8e16abe1beee267 100644
--- a/tensorflow/python/training/evaluation.py
+++ b/tensorflow/python/training/evaluation.py
@@ -230,7 +230,7 @@ def _evaluate_once(checkpoint_path,
   hooks = list(hooks or [])
 
   if eval_ops is not None:
-    if any([isinstance(h, _MultiStepStopAfterNEvalsHook) for h in hooks]):
+    if any(isinstance(h, _MultiStepStopAfterNEvalsHook) for h in hooks):
       steps_per_run_variable = \
           basic_session_run_hooks.get_or_create_steps_per_run_variable()
       update_eval_step = state_ops.assign_add(
diff --git a/tensorflow/python/training/ftrl.py b/tensorflow/python/training/ftrl.py
index 2fafc9a2d803d3530b969125d27e9b564c0e56dd..a2ef3c76b4e79b0ddefd26fcc54fb1afa27a94dd 100644
--- a/tensorflow/python/training/ftrl.py
+++ b/tensorflow/python/training/ftrl.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.FtrlOptimizer")
+@tf_export(v1=["train.FtrlOptimizer"])
 class FtrlOptimizer(optimizer.Optimizer):
   """Optimizer that implements the FTRL algorithm.
 
diff --git a/tensorflow/python/training/ftrl_test.py b/tensorflow/python/training/ftrl_test.py
index 15c50bc8788c3939a135920b8f917a2bb46f3ceb..39b299c64a35a907859416961fb72932423c18e3 100644
--- a/tensorflow/python/training/ftrl_test.py
+++ b/tensorflow/python/training/ftrl_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -54,7 +55,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllClose([0.0, 0.0], v0_val)
         self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -62,18 +63,21 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(3):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-2.60260963, -4.29698515]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.28432083, -0.56694895]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithoutRegularization(self):
     self.doTestFtrlwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceFtrlWithoutRegularization(self):
     self.doTestFtrlwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testFtrlwithoutRegularization2(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -90,19 +94,20 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run 3 steps FTRL
         for _ in range(3):
           update.run()
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-2.55607247, -3.98729396]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.28232238, -0.56096673]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -113,12 +118,15 @@ class FtrlOptimizerTest(test.TestCase):
         sgd_op = ftrl.FtrlOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType([[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -135,19 +143,20 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
         # Run 10 steps FTRL
         for _ in range(10):
           update.run()
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-7.66718769, -10.91273689]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.93460727, -1.86147261]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session() as sess:
@@ -164,7 +173,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
@@ -172,12 +181,13 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.24059935, -0.46829352]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.02406147, -0.04830509]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2_L2Shrinkage(self):
     """Test the new FTRL op with support for l2 shrinkage.
 
@@ -201,7 +211,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([4.0, 3.0], v1_val)
 
@@ -209,12 +219,13 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType(
             np.array([-0.22578995, -0.44345796]), v0_val)
         self.assertAllCloseAccordingToType(
             np.array([-0.14378493, -0.13229476]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL1_L2_L2ShrinkageSparse(self):
     """Tests the new FTRL op with support for l2 shrinkage on sparse grads."""
     for dtype in [dtypes.half, dtypes.float32]:
@@ -237,7 +248,7 @@ class FtrlOptimizerTest(test.TestCase):
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([[1.0], [2.0]], v0_val)
         self.assertAllCloseAccordingToType([[4.0], [3.0]], v1_val)
 
@@ -245,10 +256,11 @@ class FtrlOptimizerTest(test.TestCase):
         for _ in range(10):
           update.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([[-0.22578995], [2.]], v0_val)
         self.assertAllCloseAccordingToType([[4.], [-0.13229476]], v1_val)
 
+  @test_util.run_deprecated_v1
   def testFtrlWithL2ShrinkageDoesNotChangeLrSchedule(self):
     """Verifies that l2 shrinkage in FTRL does not change lr schedule."""
     for dtype in [dtypes.half, dtypes.float32]:
@@ -273,7 +285,7 @@ class FtrlOptimizerTest(test.TestCase):
         update1 = opt1.apply_gradients([(grads1, var1)])
         variables.global_variables_initializer().run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         self.assertAllCloseAccordingToType([1.0, 2.0], v0_val)
         self.assertAllCloseAccordingToType([1.0, 2.0], v1_val)
 
@@ -282,12 +294,12 @@ class FtrlOptimizerTest(test.TestCase):
           update0.run()
           update1.run()
 
-        v0_val, v1_val = sess.run([var0, var1])
+        v0_val, v1_val = self.evaluate([var0, var1])
         # var0 is experiencing L2 shrinkage so it should be smaller than var1
         # in magnitude.
         self.assertTrue((v0_val**2 < v1_val**2).all())
-        accum0 = list(sess.run(opt0._slots)["accum"].values())[0]
-        accum1 = list(sess.run(opt1._slots)["accum"].values())[0]
+        accum0 = list(self.evaluate(opt0._slots)["accum"].values())[0]
+        accum1 = list(self.evaluate(opt1._slots)["accum"].values())[0]
         # L2 shrinkage should not change how we update grad accumulator.
         self.assertAllCloseAccordingToType(accum0, accum1)
 
@@ -311,7 +323,7 @@ class FtrlOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllCloseAccordingToType([[0.0], [0.0]], v0_val)
       self.assertAllCloseAccordingToType([[0.0], [0.0]], v1_val)
@@ -323,7 +335,7 @@ class FtrlOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
   # When variables are initialized with Zero, FTRL-Proximal has two properties:
@@ -333,6 +345,7 @@ class FtrlOptimizerTest(test.TestCase):
   # with Adagrad.
   # So, basing on these two properties, we test if our implementation of
   # FTRL-Proximal performs same updates as Adagrad or GradientDescent.
+  @test_util.run_deprecated_v1
   def testEquivAdagradwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -353,6 +366,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseAdagradwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -376,6 +390,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseGradientDescentwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
@@ -399,6 +414,7 @@ class FtrlOptimizerTest(test.TestCase):
       self.assertAllCloseAccordingToType(val0, val2)
       self.assertAllCloseAccordingToType(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivGradientDescentwithoutRegularization(self):
     for dtype in [dtypes.half, dtypes.float32]:
       with self.cached_session():
diff --git a/tensorflow/python/training/gradient_descent.py b/tensorflow/python/training/gradient_descent.py
index ef50f6315dd623647e000b9b713d3ae557c31427..1a527345ef6bdbefa1e2b2a679fa1d0072c3e515 100644
--- a/tensorflow/python/training/gradient_descent.py
+++ b/tensorflow/python/training/gradient_descent.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.GradientDescentOptimizer")
+@tf_export(v1=["train.GradientDescentOptimizer"])
 class GradientDescentOptimizer(optimizer.Optimizer):
   """Optimizer that implements the gradient descent algorithm.
   """
diff --git a/tensorflow/python/training/gradient_descent_test.py b/tensorflow/python/training/gradient_descent_test.py
index 1ddea598e52b3b86b821553b0cc74674fe5389d5..5a6c5cfa7470d66c3710ba11ad0ae8772234d2c9 100644
--- a/tensorflow/python/training/gradient_descent_test.py
+++ b/tensorflow/python/training/gradient_descent_test.py
@@ -24,6 +24,7 @@ from tensorflow.python.eager import function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -35,6 +36,7 @@ from tensorflow.python.training import gradient_descent
 
 class GradientDescentOptimizerTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -47,17 +49,18 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
         self.assertEqual(0, len(optimizer.variables()))
 
+  @test_util.run_deprecated_v1
   def testBasicResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -73,16 +76,17 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testBasicCallableParams(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -99,16 +103,17 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testMinimizeResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -124,17 +129,18 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         resources.initialize_resources([var0, var1]).run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -151,17 +157,18 @@ class GradientDescentOptimizerTest(test.TestCase):
         # a long-term solution for this.
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         np_pred = 1.0 * 4.0 + 2.0 * 5.0 + 3.0
         np_grad = 2 * np_pred
         self.assertAllCloseAccordingToType(
-            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], var0.eval())
-        self.assertAllCloseAccordingToType([3.0 - np_grad], var1.eval())
+            [[1.0 - np_grad * 4.0, 2.0 - np_grad * 5.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0 - np_grad], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRate(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -174,16 +181,17 @@ class GradientDescentOptimizerTest(test.TestCase):
             lrate).apply_gradients(zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testGradWrtRef(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -193,8 +201,9 @@ class GradientDescentOptimizerTest(test.TestCase):
         grads_and_vars = opt.compute_gradients(vars_[0] + vars_[1], vars_)
         variables.global_variables_initializer().run()
         for grad, _ in grads_and_vars:
-          self.assertAllCloseAccordingToType([1.0], grad.eval())
+          self.assertAllCloseAccordingToType([1.0], self.evaluate(grad))
 
+  @test_util.run_deprecated_v1
   def testWithGlobalStep(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -207,17 +216,18 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]), global_step=global_step)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([1.0, 2.0], var0.eval())
-        self.assertAllCloseAccordingToType([3.0, 4.0], var1.eval())
+        self.assertAllCloseAccordingToType([1.0, 2.0], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params and global_step
         self.assertAllCloseAccordingToType([1.0 - 3.0 * 0.1, 2.0 - 3.0 * 0.1],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([3.0 - 3.0 * 0.01, 4.0 - 3.0 * 0.01],
-                                           var1.eval())
-        self.assertAllCloseAccordingToType(1, global_step.eval())
+                                           self.evaluate(var1))
+        self.assertAllCloseAccordingToType(1, self.evaluate(global_step))
 
+  @test_util.run_deprecated_v1
   def testSparseBasic(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -237,15 +247,15 @@ class GradientDescentOptimizerTest(test.TestCase):
             zip([grads0, grads1], [var0, var1]))
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0], [2.0]], var0.eval())
-        self.assertAllCloseAccordingToType([[3.0], [4.0]], var1.eval())
+        self.assertAllCloseAccordingToType([[1.0], [2.0]], self.evaluate(var0))
+        self.assertAllCloseAccordingToType([[3.0], [4.0]], self.evaluate(var1))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
         self.assertAllCloseAccordingToType([[1.0 - 3.0 * 0.1], [2.0]],
-                                           var0.eval())
+                                           self.evaluate(var0))
         self.assertAllCloseAccordingToType([[3.0], [4.0 - 3.0 * 0.01]],
-                                           var1.eval())
+                                           self.evaluate(var1))
 
   def testCapturingInDefunWhileExecutingEagerly(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/input_test.py b/tensorflow/python/training/input_test.py
index 085b77d1d6aee7411d2e354d08518e7e9e17bcb9..d89f5f3bbd879a32ab55cf70e366c5c82ef0f266 100644
--- a/tensorflow/python/training/input_test.py
+++ b/tensorflow/python/training/input_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import sparse_tensor
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
@@ -41,6 +42,7 @@ from tensorflow.python.util import compat
 
 class MatchFilenamesOnceTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def test(self):
     temp_dir = self.get_temp_dir()
     filenames = [os.path.join(temp_dir, n) for n in os.listdir(temp_dir)]
@@ -58,35 +60,41 @@ class MatchFilenamesOnceTest(test_lib.TestCase):
       one = inp.match_filenames_once(additional[1])
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertItemsEqual(map(compat.as_bytes, filenames), star.eval())
-      self.assertItemsEqual(map(compat.as_bytes, additional), question.eval())
-      self.assertItemsEqual([compat.as_bytes(additional[1])], one.eval())
+      self.assertItemsEqual(
+          map(compat.as_bytes, filenames), self.evaluate(star))
+      self.assertItemsEqual(
+          map(compat.as_bytes, additional), self.evaluate(question))
+      self.assertItemsEqual([compat.as_bytes(additional[1])],
+                            self.evaluate(one))
 
 
 class LimitEpochsTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoLimit(self):
     with self.cached_session():
       seven = constant_op.constant(7)
       seven_forever = inp.limit_epochs(seven)
       variables.local_variables_initializer().run()
       for _ in range(100):
-        self.assertEqual(7, seven_forever.eval())
+        self.assertEqual(7, self.evaluate(seven_forever))
 
+  @test_util.run_deprecated_v1
   def testLimit(self):
     with self.cached_session():
       love_me = constant_op.constant("Love Me")
       love_me_two_times = inp.limit_epochs(love_me, num_epochs=2)
       variables.global_variables_initializer().run()
       variables.local_variables_initializer().run()
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
-      self.assertEqual(b"Love Me", love_me_two_times.eval())
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
+      self.assertEqual(b"Love Me", self.evaluate(love_me_two_times))
       with self.assertRaises(errors_impl.OutOfRangeError):
-        love_me_two_times.eval()
+        self.evaluate(love_me_two_times)
 
 
 class InputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       input_tensor = [[1, 2, 3, 4],
@@ -102,14 +110,16 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_tensor * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_tensor * num_epochs,
+                          self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testNoShapeInference(self):
     with self.cached_session():
       # Disable shape inference for the input.
@@ -127,14 +137,15 @@ class InputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      self.assertAllEqual(input_value * num_epochs, dequeue_many.eval())
+      self.assertAllEqual(input_value * num_epochs, self.evaluate(dequeue_many))
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShapeError(self):
     input_tensor = array_ops.placeholder(dtypes.float32, None)
     with self.assertRaisesRegexp(ValueError, "fully defined shape"):
@@ -143,6 +154,7 @@ class InputProducerTest(test_lib.TestCase):
 
 class StringInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -156,15 +168,16 @@ class StringInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(strings * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session():
       strings = [b"a", b"b", b"c"]
@@ -184,7 +197,7 @@ class StringInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = b"".join(output)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -200,7 +213,7 @@ class StringInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
@@ -210,6 +223,7 @@ class StringInputProducerTest(test_lib.TestCase):
       with self.assertRaises(ValueError):
         _ = inp.string_input_producer([])
 
+  @test_util.run_deprecated_v1
   def testNullString(self):
     # Runtime check for empty string list.  This is slightly oblique:
     # The queue runner should die with an assertion error on the null
@@ -224,11 +238,12 @@ class StringInputProducerTest(test_lib.TestCase):
       variables.local_variables_initializer().run()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -237,6 +252,7 @@ class StringInputProducerTest(test_lib.TestCase):
       self.assertProtoEquals("s: 'SHARED_NAME_XYZ'",
                              queue.queue_ref.op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testConstructionRace(self):
     with self.cached_session() as sess:
       strings = [b"to", b"be", b"or", b"not", b"to", b"be"]
@@ -252,13 +268,14 @@ class StringInputProducerTest(test_lib.TestCase):
           # writing of the `tf.Graph` object. However, many users
           # write code this way, so we include this test to ensure
           # that we can support it.
-          self.assertEquals(string, sess.run(queue.dequeue()))
+          self.assertEquals(string, self.evaluate(queue.dequeue()))
       coord.request_stop()
       coord.join(threads)
 
 
 class RangeInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session():
       num_epochs = 3
@@ -272,15 +289,16 @@ class RangeInputProducerTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       # No randomness, so just see repeated copies of the input.
-      output = dequeue_many.eval()
+      output = self.evaluate(dequeue_many)
       self.assertAllEqual(list(xrange(range_size)) * num_epochs, output)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session():
       num_epochs = 200
@@ -300,7 +318,7 @@ class RangeInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = dequeue_many.eval()
+        output = self.evaluate(dequeue_many)
         key = 10 * (output[0] + 1) + (output[1] + 1)
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -316,10 +334,11 @@ class RangeInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        dequeue.eval()
+        self.evaluate(dequeue)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       range_size = 5
@@ -331,6 +350,7 @@ class RangeInputProducerTest(test_lib.TestCase):
 
 class SliceInputProducerTest(test_lib.TestCase):
 
+  @test_util.run_deprecated_v1
   def testNoShuffle(self):
     with self.cached_session() as sess:
       num_epochs = 3
@@ -344,17 +364,18 @@ class SliceInputProducerTest(test_lib.TestCase):
 
       # No randomness, so just see repeated copies of the input.
       num_items = len(source_strings) * num_epochs
-      output = [sess.run(slices) for _ in range(num_items)]
+      output = [self.evaluate(slices) for _ in range(num_items)]
       out_strings, out_ints = zip(*output)
       self.assertAllEqual(source_strings * num_epochs, out_strings)
       self.assertAllEqual(source_ints * num_epochs, out_ints)
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(slices)
+        self.evaluate(slices)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testShuffle(self):
     with self.cached_session() as sess:
       num_epochs = 1200
@@ -379,7 +400,7 @@ class SliceInputProducerTest(test_lib.TestCase):
       for e in expected:
         frequency[e] = 0
       for _ in range(num_epochs):
-        output = [sess.run(slices) for _ in range(len(source_strings))]
+        output = [self.evaluate(slices) for _ in range(len(source_strings))]
         key = b",".join([s + compat.as_bytes(str(i)) for s, i in output])
         self.assertIn(key, expected)
         frequency[key] += 1
@@ -395,10 +416,11 @@ class SliceInputProducerTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(slices)
+        self.evaluate(slices)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       source_strings = ["A", "B", "D", "G"]
@@ -470,7 +492,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -487,38 +509,43 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThread(self):
     self._testOneThreadHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testOneThreadDict(self):
     self._testOneThreadHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testUint32DataTypes(self):
     values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint32)
     batched = inp.batch([values], batch_size=2)
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-      sess.run(batched)
+      self.evaluate(batched)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testUint64DataTypes(self):
     values = constant_op.constant([0, 1, 2, 3, 4, 5], dtype=dtypes.uint64)
     batched = inp.batch([values], batch_size=2)
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(sess=sess, coord=coord)
-      sess.run(batched)
+      self.evaluate(batched)
       coord.request_stop()
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadDynamicPad(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -535,7 +562,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         expected_results = np.arange(i * batch_size, (i + 1) * batch_size)
         max_len = expected_results[-1]
         self.assertAllEqual(results[0], expected_results)
@@ -545,10 +572,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadEnqueueMany(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -567,7 +595,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -580,10 +608,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreads(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -606,7 +635,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -620,10 +649,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThreadSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -647,7 +677,7 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(results[0],
                             np.arange(i * batch_size, (i + 1) * batch_size))
         self.assertAllEqual(
@@ -663,7 +693,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0],
                           np.arange(num_batches * batch_size,
                                     num_batches * batch_size + extra_elements))
@@ -677,10 +707,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -705,7 +736,7 @@ class BatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertAllEqual(results[0], results[1].values)
@@ -717,7 +748,7 @@ class BatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), extra_elements)
       self.assertAllEqual(results[0], results[1].values)
@@ -732,10 +763,11 @@ class BatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -753,12 +785,14 @@ class BatchTest(test_lib.TestCase):
           "s: 'SHARED_NAME_XYZ'",
           batched[0].op.inputs[0].op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testCannotInferRankError(self):
     with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.int64)
       with self.assertRaisesRegexp(ValueError, "Cannot infer Tensor's rank"):
         inp.batch([x], batch_size=2)
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -766,6 +800,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -773,6 +808,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -782,6 +818,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -791,6 +828,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.batch([sparse], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testSingleElementDict(self):
     x = inp.batch({"c": [12, 12]}, batch_size=8)
     self.assertAllEqual((8, 2), x["c"].get_shape().as_list())
@@ -823,35 +861,42 @@ class BatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testMaybeEnqueuePerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadMaybeEnqueuePerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -873,6 +918,7 @@ class BatchTest(test_lib.TestCase):
                       batch_size=1,
                       enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -880,6 +926,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.maybe_batch([sparse], keep_input=True, batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -888,6 +935,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -896,6 +944,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -905,6 +954,7 @@ class BatchTest(test_lib.TestCase):
     batched = inp.maybe_batch([sparse], keep_input=True, batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -915,6 +965,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -925,6 +976,7 @@ class BatchTest(test_lib.TestCase):
         [sparse], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchCorrectValues(self):
     sparse_t = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
@@ -938,7 +990,7 @@ class BatchTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1016,7 +1068,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(batch_size, len(results[0]))
         self.assertEqual(batch_size, len(results[2]))
@@ -1047,16 +1099,19 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testTwoThreads(self):
     self._testTwoThreadsHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsDict(self):
     self._testTwoThreadsHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testMismatchedDictKeys(self):
     with self.assertRaisesRegexp(ValueError, "must have the same keys"):
       inp.batch_join(
@@ -1071,6 +1126,7 @@ class BatchJoinTest(test_lib.TestCase):
           }],
           batch_size=8)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsDynamicPad(self):
     with self.cached_session() as sess:
       # Two threads, the first generates (0..69, ["a"] * 1..70).
@@ -1112,7 +1168,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertEqual(2, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1144,10 +1200,11 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       extra_elements = 2
@@ -1197,7 +1254,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -1217,7 +1274,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -1245,10 +1302,11 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsDynamicPadSmallerBatch(self):
     with self.cached_session() as sess:
       extra_elements = 2
@@ -1292,7 +1350,7 @@ class BatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[1]), batch_size)
@@ -1312,7 +1370,7 @@ class BatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached the final batch with 2 * extra_elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       tf_logging.info("Last Batch: %s", results[0])
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertEqual(len(results[1]), 2 * extra_elements)
@@ -1343,10 +1401,11 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -1369,12 +1428,14 @@ class BatchJoinTest(test_lib.TestCase):
           "s: 'SHARED_NAME_XYZ'",
           batched[0].op.inputs[0].op.node_def.attr["shared_name"])
 
+  @test_util.run_deprecated_v1
   def testCannotInferRankError(self):
     with self.cached_session():
       x = array_ops.placeholder(dtype=dtypes.int64)
       with self.assertRaisesRegexp(ValueError, "Cannot infer Tensor's rank"):
         inp.batch_join([[x]], batch_size=2)
 
+  @test_util.run_deprecated_v1
   def testSingleElementDict(self):
     x = inp.batch_join([{"c": [12, 12]}], batch_size=8)
     self.assertAllEqual((8, 2), x["c"].get_shape().as_list())
@@ -1406,7 +1467,7 @@ class BatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual(
             [0] * batch_size,
             np.mod(results[0], 2),)
@@ -1417,28 +1478,35 @@ class BatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -1460,6 +1528,7 @@ class BatchJoinTest(test_lib.TestCase):
                            batch_size=1,
                            enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1467,6 +1536,7 @@ class BatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_batch_join([[sparse]], keep_input=True, batch_size=2)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1475,6 +1545,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -1483,6 +1554,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1492,6 +1564,7 @@ class BatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_batch_join([[sparse]], keep_input=True, batch_size=2)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1502,6 +1575,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=True, batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1512,6 +1586,7 @@ class BatchJoinTest(test_lib.TestCase):
         [[sparse]], keep_input=[True, False], batch_size=2, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchCorrectValues(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0, 1], [0, 2], [1, 0], [1, 3]],
@@ -1525,7 +1600,7 @@ class BatchJoinTest(test_lib.TestCase):
       coord = coordinator.Coordinator()
       threads = queue_runner_impl.start_queue_runners(coord=coord)
 
-      batched_np = batched.eval()
+      batched_np = self.evaluate(batched)
 
       coord.request_stop()
       for thread in threads:
@@ -1575,7 +1650,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1593,16 +1668,19 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testOneThread(self):
     self._testOneThreadHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testOneThreadDict(self):
     self._testOneThreadHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testOneThreadSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1630,7 +1708,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for _ in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
         self.assertAllEqual(
@@ -1641,7 +1719,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
       all_counts.extend(results[0])
@@ -1655,10 +1733,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreads(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1683,7 +1762,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1702,10 +1781,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testManyThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       batch_size = 10
@@ -1733,7 +1813,7 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       all_counts = []
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         all_counts.extend(results[0])
@@ -1745,7 +1825,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         self.assertAllEqual(results[2], [b"string"] * batch_size)
 
       # Reached the final batch with extra elements.
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertAllEqual(results[0].shape, [extra_elements])
       self.assertAllEqual(results[1].dense_shape, [extra_elements, 1])
       self.assertAllEqual(results[2], [b"string"] * extra_elements)
@@ -1760,10 +1840,11 @@ class ShuffleBatchTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -1813,35 +1894,42 @@ class ShuffleBatchTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -1860,6 +1948,7 @@ class ShuffleBatchTest(test_lib.TestCase):
                               keep_input=array_ops.placeholder(dtypes.bool),
                               enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1867,6 +1956,7 @@ class ShuffleBatchTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch([sparse], 2, 10, 1, True)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -1875,6 +1965,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, True, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -1883,6 +1974,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, [True, False], enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1892,6 +1984,7 @@ class ShuffleBatchTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch([sparse], 2, 10, 1, True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1902,6 +1995,7 @@ class ShuffleBatchTest(test_lib.TestCase):
         [sparse], 2, 10, 1, True, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -1986,7 +2080,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched_fetch)
+        results = self.evaluate(batched_fetch)
         self.assertEqual(3, len(results))
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2016,16 +2110,19 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched_fetch)
+        self.evaluate(batched_fetch)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testTwoThreads(self):
     self._testTwoThreadsHelper(use_dict=False)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsDict(self):
     self._testTwoThreadsHelper(use_dict=True)
 
+  @test_util.run_deprecated_v1
   def testTwoThreadsSmallerBatch(self):
     with self.cached_session() as sess:
       # Two threads, the first generates (0..26, "a").
@@ -2078,7 +2175,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       saw_both = 0
       num_batches = (num_a + num_b) // batch_size
       for i in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         tf_logging.info("Batch %d: %s", i, results[0])
         self.assertEqual(len(results[0]), batch_size)
         self.assertEqual(len(results[2]), batch_size)
@@ -2098,7 +2195,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
                             [results[0][i] for i in which_b])
 
       # Reached end with 2 * extra_elements left
-      results = sess.run(batched)
+      results = self.evaluate(batched)
       self.assertEqual(len(results[0]), 2 * extra_elements)
       self.assertAllEqual(results[1].dense_shape, [2 * extra_elements, 1])
       self.assertEqual(len(results[2]), 2 * extra_elements)
@@ -2125,10 +2222,11 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_deprecated_v1
   def testMismatchedDictKeys(self):
     with self.assertRaisesRegexp(ValueError, "must have the same keys"):
       inp.shuffle_batch_join(
@@ -2146,6 +2244,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
           min_after_dequeue=16,
           seed=223607)
 
+  @test_util.run_deprecated_v1
   def testSharedName(self):
     with self.cached_session():
       batch_size = 10
@@ -2199,35 +2298,42 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
       threads = queue_runner_impl.start_queue_runners()
 
       for _ in range(num_batches):
-        results = sess.run(batched)
+        results = self.evaluate(batched)
         self.assertAllEqual([0] * batch_size, np.mod(results[0], 2))
         self.assertAllEqual([0] * batch_size, np.mod(results[1].values, 2))
         self.assertAllEqual([b"string"] * batch_size, results[2])
 
       # Reached the limit.
       with self.assertRaises(errors_impl.OutOfRangeError):
-        sess.run(batched)
+        self.evaluate(batched)
       for thread in threads:
         thread.join()
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInput(self):
     self._testKeepInputHelper(1, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSingleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(1, True)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInput(self):
     self._testKeepInputHelper(5, False)
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleThreadKeepInputEnqueueMany(self):
     self._testKeepInputHelper(5, True)
 
+  @test_util.run_deprecated_v1
   def testSingleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(1, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testMultipleThreadKeepInputPerExample(self):
     self._testKeepInputHelper(5, True, keep_input_vector=True)
 
+  @test_util.run_deprecated_v1
   def testInvalidKeepInputVector(self):
     # Can't have vector `keep_input` with `enqueue_many=False`.
     with self.assertRaisesRegexp(ValueError, "`keep_input` cannot be a vector"):
@@ -2249,6 +2355,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
           keep_input=array_ops.placeholder(dtypes.bool),
           enqueue_many=True)
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShape(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -2256,6 +2363,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True)
     self.assertAllEqual((2,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0]], values=[1.0], dense_shape=[1])
@@ -2264,6 +2372,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, True, enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeEnqueueManyPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=[[0], [0]], values=[1.0, 2.0], dense_shape=[2])
@@ -2272,6 +2381,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, [True, False], enqueue_many=True)
     self.assertAllEqual((1,), batched.dense_shape.get_shape().as_list())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRank(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -2281,6 +2391,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
     batched = inp.maybe_shuffle_batch_join([[sparse]], 2, 10, 1, True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankEnqueueMany(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
@@ -2291,6 +2402,7 @@ class ShuffleBatchJoinTest(test_lib.TestCase):
         [[sparse]], 2, 10, 1, True, enqueue_many=True)
     self.assertIs(None, batched.dense_shape.get_shape().num_elements())
 
+  @test_util.run_deprecated_v1
   def testMaybeBatchedSparseTensorInferredShapeUnknownRankPerExample(self):
     sparse = sparse_tensor.SparseTensor(
         indices=array_ops.placeholder(dtypes.int64),
diff --git a/tensorflow/python/training/learning_rate_decay.py b/tensorflow/python/training/learning_rate_decay.py
index 29b546532193320961393ccf00fd3b190802c11a..c52e89db1f47eb303b7160cef77c01bcb46aebba 100644
--- a/tensorflow/python/training/learning_rate_decay.py
+++ b/tensorflow/python/training/learning_rate_decay.py
@@ -100,7 +100,7 @@ def exponential_decay(learning_rate,
   return decayed_lr
 
 
-@tf_export(v1=["train.piecewise_constant"])
+@tf_export(v1=["train.piecewise_constant_decay", "train.piecewise_constant"])
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
diff --git a/tensorflow/python/training/learning_rate_decay_test.py b/tensorflow/python/training/learning_rate_decay_test.py
index 03a32f6ca099a4b02de950d7e4dde6f88944695d..1029d4cea8f67d0e8614983ff106ccc57ccb9064 100644
--- a/tensorflow/python/training/learning_rate_decay_test.py
+++ b/tensorflow/python/training/learning_rate_decay_test.py
@@ -61,24 +61,24 @@ class LRDecayTest(test_util.TensorFlowTestCase):
       self.evaluate(step.assign(100))
       self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
-    with self.cached_session():
-      step = variables.VariableV1(1)
-      assign_1 = step.assign(1)
-      assign_2 = step.assign(2)
-      assign_100 = step.assign(100)
-      decayed_lr = learning_rate_decay.exponential_decay(.1, step, 3, 0.96,
-                                                         staircase=True)
-      variables.global_variables_initializer().run()
-      # No change to learning rate
-      assign_1.op.run()
-      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
-      assign_2.op.run()
-      self.assertAllClose(decayed_lr.eval(), .1, 1e-6)
-      # Decayed learning rate
-      assign_100.op.run()
-      expected = .1 * 0.96 ** (100 // 3)
-      self.assertAllClose(decayed_lr.eval(), expected, 1e-6)
+    step = variables.VariableV1(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_decay.exponential_decay(
+        .1, step, 3, 0.96, staircase=True)
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr), expected, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
   def testPiecewiseConstant(self):
@@ -101,6 +101,7 @@ class LRDecayTest(test_util.TensorFlowTestCase):
     self.assertAllClose(self.evaluate(decayed_lr), 0.001, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testPiecewiseConstantEdgeCases(self):
     x_int = resource_variable_ops.ResourceVariable(
         0, dtype=variables.dtypes.int32)
diff --git a/tensorflow/python/training/learning_rate_decay_v2.py b/tensorflow/python/training/learning_rate_decay_v2.py
index 9c5e144be6b0f1aa70d58ab90850ca18e2c90d57..eb69feb17d3983ddb494cdf63ae30edee7062915 100644
--- a/tensorflow/python/training/learning_rate_decay_v2.py
+++ b/tensorflow/python/training/learning_rate_decay_v2.py
@@ -117,7 +117,7 @@ def exponential_decay(learning_rate,
                            decay_rate, staircase, name)
 
 
-@tf_export("train.piecewise_constant", v1=[])
+@tf_export("train.piecewise_constant_decay", v1=[])
 def piecewise_constant(x, boundaries, values, name=None):
   """Piecewise constant from boundaries and interval values.
 
diff --git a/tensorflow/python/training/learning_rate_decay_v2_test.py b/tensorflow/python/training/learning_rate_decay_v2_test.py
index b2ac93f06fe3c3e9ada6d0ef6254078d3f444975..cb96773e299a37db1d5792c84d6a837147e09d04 100644
--- a/tensorflow/python/training/learning_rate_decay_v2_test.py
+++ b/tensorflow/python/training/learning_rate_decay_v2_test.py
@@ -61,24 +61,24 @@ class LRDecayTestV2(test_util.TensorFlowTestCase):
       self.evaluate(step.assign(100))
       self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
 
+  @test_util.run_deprecated_v1
   def testVariables(self):
-    with self.cached_session():
-      step = variables.Variable(1)
-      assign_1 = step.assign(1)
-      assign_2 = step.assign(2)
-      assign_100 = step.assign(100)
-      decayed_lr = learning_rate_decay_v2.exponential_decay(.1, step, 3, 0.96,
-                                                            staircase=True)
-      variables.global_variables_initializer().run()
-      # No change to learning rate
-      assign_1.op.run()
-      self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
-      assign_2.op.run()
-      self.assertAllClose(decayed_lr().eval(), .1, 1e-6)
-      # Decayed learning rate
-      assign_100.op.run()
-      expected = .1 * 0.96 ** (100 // 3)
-      self.assertAllClose(decayed_lr().eval(), expected, 1e-6)
+    step = variables.Variable(1)
+    assign_1 = step.assign(1)
+    assign_2 = step.assign(2)
+    assign_100 = step.assign(100)
+    decayed_lr = learning_rate_decay_v2.exponential_decay(
+        .1, step, 3, 0.96, staircase=True)
+    self.evaluate(variables.global_variables_initializer())
+    # No change to learning rate
+    self.evaluate(assign_1.op)
+    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
+    self.evaluate(assign_2.op)
+    self.assertAllClose(self.evaluate(decayed_lr()), .1, 1e-6)
+    # Decayed learning rate
+    self.evaluate(assign_100.op)
+    expected = .1 * 0.96**(100 // 3)
+    self.assertAllClose(self.evaluate(decayed_lr()), expected, 1e-6)
 
   @test_util.run_in_graph_and_eager_modes
   def testPiecewiseConstant(self):
diff --git a/tensorflow/python/training/localhost_cluster_performance_test.py b/tensorflow/python/training/localhost_cluster_performance_test.py
index 7c097b943d05cd1a049886af6ef1d018d7b2c9ab..c4cbc8a55dc5d40b9aeae2fed400b1d29d6c7499 100644
--- a/tensorflow/python/training/localhost_cluster_performance_test.py
+++ b/tensorflow/python/training/localhost_cluster_performance_test.py
@@ -25,6 +25,7 @@ import numpy as np
 from tensorflow.python.client import session as session_lib
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import partitioned_variables
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
@@ -34,6 +35,7 @@ from tensorflow.python.training import device_setter
 
 class CreateLocalClusterTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateLocalCluster(self):
     workers, _ = test.create_local_cluster(num_workers=2, num_ps=2)
     worker_sessions = [session_lib.Session(w.target) for w in workers]
diff --git a/tensorflow/python/training/momentum.py b/tensorflow/python/training/momentum.py
index 4a280e7c514abfa1c148c27212741ff85c839cf5..f3bc83bbfa1fe6a225b1302655a187bca52c995c 100644
--- a/tensorflow/python/training/momentum.py
+++ b/tensorflow/python/training/momentum.py
@@ -25,7 +25,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.MomentumOptimizer")
+@tf_export(v1=["train.MomentumOptimizer"])
 class MomentumOptimizer(optimizer.Optimizer):
   """Optimizer that implements the Momentum algorithm.
 
diff --git a/tensorflow/python/training/momentum_test.py b/tensorflow/python/training/momentum_test.py
index 8a21c39d32344d0027793f1eac3d4f9f43a8d920..ba155fa6c646df5935feddbacffb2a4f9763c90a 100644
--- a/tensorflow/python/training/momentum_test.py
+++ b/tensorflow/python/training/momentum_test.py
@@ -160,6 +160,7 @@ class MomentumOptimizerTest(test.TestCase):
       self.assertStartsWith(optimizer_variables[1].name, "var3")
       self.assertEquals(2, len(optimizer_variables))
 
+  @test_util.run_deprecated_v1
   def testNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -183,9 +184,10 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testSparseNesterovMomentum(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -224,8 +226,8 @@ class MomentumOptimizerTest(test.TestCase):
           var1_np, accum1_np = self._update_nesterov_momentum_numpy(var1_np,
                                                                     accum1_np,
                                                                     3, 2.0, 0.9)
-          self.assertAllClose(var0_np, var0.eval())
-          self.assertAllClose(var1_np, var1.eval())
+          self.assertAllClose(var0_np, self.evaluate(var0))
+          self.assertAllClose(var1_np, self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes(reset_test=True)
   def testMinimizeSparseResourceVariable(self):
@@ -280,6 +282,7 @@ class MomentumOptimizerTest(test.TestCase):
     self.evaluate(sgd_op)
     self.assertAllCloseAccordingToType([[1, 1], [0, 0]], self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testTensorLearningRateAndMomentum(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -303,37 +306,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertFalse(slot1 in variables.trainable_variables())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
   def _dbParamsMom01(self):
     """Return dist-belief momentum values.
@@ -434,6 +443,7 @@ class MomentumOptimizerTest(test.TestCase):
     # pylint: enable=line-too-long
     return db_grad, db_out
 
+  @test_util.run_deprecated_v1
   def testLikeDistBeliefMom01(self):
     with self.cached_session():
       db_grad, db_out = self._dbParamsMom01()
@@ -445,8 +455,9 @@ class MomentumOptimizerTest(test.TestCase):
       variables.global_variables_initializer().run()
       for i in xrange(num_samples):
         mom_update.run(feed_dict={grads0: db_grad[i]})
-        self.assertAllClose(np.array(db_out[i]), var0.eval())
+        self.assertAllClose(np.array(db_out[i]), self.evaluate(var0))
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -476,46 +487,59 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([0, 0], var0.eval()[0])
-        self.assertAllClose([0, 0], var0.eval()[1])
-        self.assertAllClose([1, 1], var1.eval()[2])
+        self.assertAllClose([0, 0], self.evaluate(var0)[0])
+        self.assertAllClose([0, 0], self.evaluate(var0)[1])
+        self.assertAllClose([1, 1], self.evaluate(var1)[2])
 
         # Step 1: the momentum accumulators are 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), slot0.eval()[0])
-        self.assertAllCloseAccordingToType(np.array([.1, .1]), slot0.eval()[1])
         self.assertAllCloseAccordingToType(
-            np.array([.01, .01]), slot1.eval()[2])
+            np.array([0, 0]),
+            self.evaluate(slot0)[0])
+        self.assertAllCloseAccordingToType(
+            np.array([.1, .1]),
+            self.evaluate(slot0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([.01, .01]),
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllCloseAccordingToType(np.array([0, 0]), var0.eval()[0])
         self.assertAllCloseAccordingToType(
-            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]), var0.eval()[1])
+            np.array([0, 0]),
+            self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]), var1.eval()[2])
+            np.array([-(0.1 * 2.0), -(0.1 * 2.0)]),
+            self.evaluate(var0)[1])
+        self.assertAllCloseAccordingToType(
+            np.array([1.0 - (0.01 * 2.0), 1.0 - (0.01 * 2.0)]),
+            self.evaluate(var1)[2])
         # Step 2: the momentum accumulators contain the previous update.
         mom_update.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllClose(np.array([0, 0]), slot0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(slot0)[0])
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval()[1])
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0)[1])
         self.assertAllCloseAccordingToType(
             np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
-            slot1.eval()[2])
+            self.evaluate(slot1)[2])
         # Check that the parameters have been updated.
-        self.assertAllClose(np.array([0, 0]), var0.eval()[0])
+        self.assertAllClose(np.array([0, 0]), self.evaluate(var0)[0])
         self.assertAllCloseAccordingToType(
             np.array([
-                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0), -(0.1 * 2.0) - (
-                    (0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval()[1])
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
+                -(0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
+            ]),
+            self.evaluate(var0)[1])
         self.assertAllCloseAccordingToType(
             np.array([
-                0.98 - ((0.9 * 0.01 + 0.01) * 2.0), 0.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval()[2])
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                0.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]),
+            self.evaluate(var1)[2])
 
+  @test_util.run_deprecated_v1
   def testSharing(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -538,37 +562,43 @@ class MomentumOptimizerTest(test.TestCase):
         self.assertEquals(slot1.get_shape(), var1.get_shape())
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the momentum accumulators where 0. So we should see a normal
         # update: v -= grad * learning_rate
         mom_update1.run()
         # Check that the momentum accumulators have been updated.
-        self.assertAllCloseAccordingToType(np.array([0.1, 0.1]), slot0.eval())
-        self.assertAllCloseAccordingToType(np.array([0.01, 0.01]), slot1.eval())
+        self.assertAllCloseAccordingToType(
+            np.array([0.1, 0.1]), self.evaluate(slot0))
+        self.assertAllCloseAccordingToType(
+            np.array([0.01, 0.01]), self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]), var0.eval())
+            np.array([1.0 - (0.1 * 2.0), 2.0 - (0.1 * 2.0)]),
+            self.evaluate(var0))
         self.assertAllCloseAccordingToType(
-            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]), var1.eval())
+            np.array([3.0 - (0.01 * 2.0), 4.0 - (0.01 * 2.0)]),
+            self.evaluate(var1))
         # Step 2: the second momentum accumulators contain the previous update.
         mom_update2.run()
         # Check that the momentum accumulators have been updated.
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]), slot0.eval())
+            np.array([(0.9 * 0.1 + 0.1), (0.9 * 0.1 + 0.1)]),
+            self.evaluate(slot0))
         self.assertAllCloseAccordingToType(
-            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]), slot1.eval())
+            np.array([(0.9 * 0.01 + 0.01), (0.9 * 0.01 + 0.01)]),
+            self.evaluate(slot1))
         # Check that the parameters have been updated.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0),
                 2.0 - (0.1 * 2.0) - ((0.9 * 0.1 + 0.1) * 2.0)
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
-                2.98 - ((0.9 * 0.01 + 0.01) * 2.0), 3.98 - (
-                    (0.9 * 0.01 + 0.01) * 2.0)
-            ]), var1.eval())
+                2.98 - ((0.9 * 0.01 + 0.01) * 2.0),
+                3.98 - ((0.9 * 0.01 + 0.01) * 2.0)
+            ]), self.evaluate(var1))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/monitored_session.py b/tensorflow/python/training/monitored_session.py
index 162fef971db0aca468ae619d249972bc4110f825..6a7d27df5c322bfad37cf1ef207f66353d636111 100644
--- a/tensorflow/python/training/monitored_session.py
+++ b/tensorflow/python/training/monitored_session.py
@@ -54,7 +54,7 @@ _PREEMPTION_ERRORS = (errors.AbortedError, errors.UnavailableError)
 USE_DEFAULT = object()
 
 
-@tf_export('train.Scaffold')
+@tf_export(v1=['train.Scaffold'])
 class Scaffold(object):
   """Structure to create or gather pieces commonly needed to train a model.
 
@@ -508,7 +508,7 @@ def MonitoredTrainingSession(master='',  # pylint: disable=invalid-name
       stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.SessionCreator')
+@tf_export(v1=['train.SessionCreator'])
 @six.add_metaclass(abc.ABCMeta)
 class SessionCreator(object):
   """A factory for tf.Session."""
@@ -519,7 +519,7 @@ class SessionCreator(object):
         'create_session is not implemented for {}.'.format(self))
 
 
-@tf_export('train.ChiefSessionCreator')
+@tf_export(v1=['train.ChiefSessionCreator'])
 class ChiefSessionCreator(SessionCreator):
   """Creates a tf.Session for a chief."""
 
@@ -571,7 +571,7 @@ class ChiefSessionCreator(SessionCreator):
         init_fn=self._scaffold.init_fn)
 
 
-@tf_export('train.WorkerSessionCreator')
+@tf_export(v1=['train.WorkerSessionCreator'])
 class WorkerSessionCreator(SessionCreator):
   """Creates a tf.Session for a worker."""
 
@@ -840,10 +840,18 @@ class _MonitoredSession(object):
     return self._coordinated_creator.tf_sess is None
 
   def _tf_sess(self):
+    """Return underlying tf.Session object.
+
+    Warning: accessing the returned object in user code is likely to cause races
+    or "flaky tests".
+
+    Returns:
+      A tf.Session object.
+    """
     return self._coordinated_creator.tf_sess
 
 
-@tf_export('train.MonitoredSession')
+@tf_export(v1=['train.MonitoredSession'])
 class MonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, recovery and hooks.
 
@@ -926,7 +934,7 @@ class MonitoredSession(_MonitoredSession):
         stop_grace_period_secs=stop_grace_period_secs)
 
 
-@tf_export('train.SingularMonitoredSession')
+@tf_export(v1=['train.SingularMonitoredSession'])
 class SingularMonitoredSession(_MonitoredSession):
   """Session-like object that handles initialization, restoring, and hooks.
 
diff --git a/tensorflow/python/training/monitored_session_test.py b/tensorflow/python/training/monitored_session_test.py
index c870d99de9e3daed2c167455e6ee6ab5efa33a7b..99ee9ea7e2e4d32f9a24513d9c46f9de4fa2d797 100644
--- a/tensorflow/python/training/monitored_session_test.py
+++ b/tensorflow/python/training/monitored_session_test.py
@@ -37,6 +37,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -382,6 +383,16 @@ class MonitoredTrainingSessionTest(test.TestCase):
         self.assertEqual(0, session.run(gstep))
 
 
+class MockExtended(object):
+
+  def __init__(self, between_graph, should_init, should_checkpoint,
+               should_save_summary):
+    self.experimental_between_graph = between_graph
+    self.experimental_should_init = should_init
+    self.should_checkpoint = should_checkpoint
+    self.should_save_summary = should_save_summary
+
+
 class MockStrategy(object):
 
   def __init__(self,
@@ -389,26 +400,8 @@ class MockStrategy(object):
                should_init=True,
                should_checkpoint=None,
                should_save_summary=None):
-    self._between_graph = between_graph
-    self._should_init = should_init
-    self._should_checkpoint = should_checkpoint
-    self._should_save_summary = should_save_summary
-
-  @property
-  def between_graph(self):
-    return self._between_graph
-
-  @property
-  def should_init(self):
-    return self._should_init
-
-  @property
-  def should_checkpoint(self):
-    return self._should_checkpoint
-
-  @property
-  def should_save_summary(self):
-    return self._should_save_summary
+    self.extended = MockExtended(between_graph, should_init, should_checkpoint,
+                                 should_save_summary)
 
 
 class MonitoredTrainingSessionWithDistributeCoordinatorTest(test.TestCase):
@@ -512,6 +505,7 @@ class StopAtNSession(monitored_session._WrappedSession):
 class WrappedSessionTest(test.TestCase):
   """_WrappedSession tests."""
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -519,6 +513,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertEquals(sess.graph, wrapped_sess.graph)
       self.assertEquals(sess.sess_str, wrapped_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_close(self):
     with self.cached_session() as sess:
       wrapped_sess = monitored_session._WrappedSession(sess)
@@ -526,6 +521,7 @@ class WrappedSessionTest(test.TestCase):
       wrapped_sess.close()
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_uses_check_stop(self):
     with self.cached_session() as sess:
       wrapped_sess = StopAtNSession(sess, 3)
@@ -534,6 +530,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertFalse(wrapped_sess.should_stop())
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_delegates_to_wrapped_session(self):
     with self.cached_session() as sess:
       wrapped_sess0 = StopAtNSession(sess, 4)
@@ -544,6 +541,7 @@ class WrappedSessionTest(test.TestCase):
       self.assertFalse(wrapped_sess1.should_stop())
       self.assertTrue(wrapped_sess1.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_close_twice(self):
     with self.cached_session() as sess:
       wrapped_sess = monitored_session._WrappedSession(sess)
@@ -552,6 +550,7 @@ class WrappedSessionTest(test.TestCase):
       wrapped_sess.close()
       self.assertTrue(wrapped_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -569,6 +568,7 @@ def busy_wait_for_coord_stop(coord):
 class CoordinatedSessionTest(test.TestCase):
   """_CoordinatedSession tests."""
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -577,6 +577,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertEquals(sess.graph, coord_sess.graph)
       self.assertEquals(sess.sess_str, coord_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -585,6 +586,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord_sess = monitored_session._CoordinatedSession(sess, coord)
       self.assertEqual(42, coord_sess.run(v, feed_dict={c: 42}))
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_close(self):
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
@@ -593,6 +595,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord_sess.close()
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_should_stop_on_coord_stop(self):
     with self.cached_session() as sess:
       coord = coordinator.Coordinator()
@@ -601,6 +604,7 @@ class CoordinatedSessionTest(test.TestCase):
       coord.request_stop()
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_dont_request_stop_on_exception_in_main_thread(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -615,6 +619,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertFalse(coord.should_stop())
       self.assertFalse(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_stop_threads_on_close_after_exception(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -662,6 +667,7 @@ class CoordinatedSessionTest(test.TestCase):
       self.assertTrue(coord.should_stop())
       self.assertTrue(coord_sess.should_stop())
 
+  @test_util.run_deprecated_v1
   def test_propagates_exception_trace(self):
     assertion = control_flow_ops.Assert(False, ['This should fail.'])
     with self.cached_session() as sess:
@@ -809,6 +815,7 @@ class RecoverableSessionTest(test.TestCase):
     def create_session(self):
       return self._sess
 
+  @test_util.run_deprecated_v1
   def test_properties(self):
     with self.cached_session() as sess:
       constant_op.constant(0.0)
@@ -817,6 +824,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertEquals(sess.graph, recoverable_sess.graph)
       self.assertEquals(sess.sess_str, recoverable_sess.sess_str)
 
+  @test_util.run_deprecated_v1
   def test_run(self):
     with self.cached_session() as sess:
       c = constant_op.constant(0)
@@ -825,6 +833,7 @@ class RecoverableSessionTest(test.TestCase):
           self._SessionReturner(sess))
       self.assertEqual(51, recoverable_sess.run(v, feed_dict={c: 51}))
 
+  @test_util.run_deprecated_v1
   def test_recovery(self):
     with self.cached_session() as sess:
 
@@ -871,6 +880,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaisesRegexp(IndexError, 'pop from empty list'):
         recoverable_sess.run(v, feed_dict={c: -12})
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_coordinator_exception(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -896,6 +906,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -925,6 +936,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -949,6 +961,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_step_fn_recovery_from_coordinator_exception_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -979,6 +992,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1013,6 +1027,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck_when_run_hooks(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1057,6 +1072,7 @@ class RecoverableSessionTest(test.TestCase):
     # exception.
     return session
 
+  @test_util.run_deprecated_v1
   def test_step_fn_recovery_from_coordinator_exception_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1089,6 +1105,7 @@ class RecoverableSessionTest(test.TestCase):
       self.assertFalse(session.should_stop())
       self.assertEqual(2, session_creator.number_of_sessions_created)
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_non_preemption_in_coordinator_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1126,6 +1143,7 @@ class RecoverableSessionTest(test.TestCase):
       with self.assertRaises(errors_impl.UnknownError):
         session.close()
 
+  @test_util.run_deprecated_v1
   def test_recovery_from_session_getting_stuck_with_raw_session(self):
     with self.cached_session() as test_session:
       session_creator = CountingSessionCreator(test_session)
@@ -1178,7 +1196,7 @@ class HookedSessionTest(test.TestCase):
       mock_run = FakeSession(sess)
       mon_sess = monitored_session._HookedSession(sess=mock_run, hooks=[])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       output = mon_sess.run(fetches=a_tensor,
                             feed_dict='a_feed',
                             options='an_option',
@@ -1197,7 +1215,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       a_tensor = constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       mon_sess.run(a_tensor)
 
       for hook in [mock_hook, mock_hook2]:
@@ -1222,7 +1240,7 @@ class HookedSessionTest(test.TestCase):
       mon_sess = monitored_session._HookedSession(
           sess=sess, hooks=[mock_hook, mock_hook2])
       constant_op.constant([0], name='a_tensor')
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       mon_sess.run(fetches='a_tensor')
       self.assertFalse(mon_sess.should_stop())
@@ -1242,7 +1260,7 @@ class HookedSessionTest(test.TestCase):
       third_tensor = constant_op.constant([10], name='third_tensor')
       mock_hook.request = session_run_hook.SessionRunArgs([another_tensor])
       mock_hook2.request = session_run_hook.SessionRunArgs([third_tensor])
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       output = mon_sess.run(fetches=a_tensor)
       self.assertEqual(output, [0])
@@ -1262,7 +1280,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       self.assertEqual(mon_sess.run(fetches=add_tensor), [15])
 
@@ -1280,7 +1298,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       feed_dict = {c_tensor: [20]}
       self.assertEqual(
@@ -1301,7 +1319,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={a_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor)
@@ -1319,7 +1337,7 @@ class HookedSessionTest(test.TestCase):
           None, feed_dict={a_tensor: [5]})
       mock_hook2.request = session_run_hook.SessionRunArgs(
           None, feed_dict={b_tensor: [10]})
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
 
       with self.assertRaisesRegexp(RuntimeError, 'Same tensor is fed'):
         mon_sess.run(fetches=add_tensor, feed_dict={b_tensor: [10]})
@@ -1451,6 +1469,7 @@ class MonitoredSessionTest(test.TestCase):
   # This set of tests, verifies the supervised session behavior when exceptions
   # are raised next to the innermost session run() call.
 
+  @test_util.run_deprecated_v1
   def test_recovery(self):
     logdir = _test_dir(self.get_temp_dir(), 'test_recovery')
     with ops.Graph().as_default():
@@ -1803,6 +1822,7 @@ class MonitoredSessionTest(test.TestCase):
             isinstance(hook.run_metadata_list[0], config_pb2.RunMetadata))
         self.assertGreater(len(hook.run_metadata_list[0].partition_graphs), 0)
 
+  @test_util.run_deprecated_v1
   def test_with_statement_and_close(self):
     # Test case for https://github.com/tensorflow/tensorflow/issues/12224
     # where close() inside the with should have a better error message.
diff --git a/tensorflow/python/training/moving_averages.py b/tensorflow/python/training/moving_averages.py
index fc9eb479cc3a0c3fd3dba4de7269b7894d3ec84c..72670f0ca39f67b151abcb1813ede7ee36c6544b 100644
--- a/tensorflow/python/training/moving_averages.py
+++ b/tensorflow/python/training/moving_averages.py
@@ -17,6 +17,8 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import control_flow_ops
@@ -25,7 +27,6 @@ from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.training import slot_creator
 from tensorflow.python.util.tf_export import tf_export
 
@@ -95,11 +96,11 @@ def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
       # In a replica context, we update variable using the mean of value across
       # replicas.
       def merge_fn(strategy, v, value):
-        value = strategy.reduce(
-            variable_scope.VariableAggregation.MEAN, value, v)
+        value = strategy.extended.reduce_to(
+            ds_reduce_util.ReduceOp.MEAN, value, v)
         return strategy.update(v, update_fn, value)
 
-      return replica_context.merge_call(merge_fn, variable, value)
+      return replica_context.merge_call(merge_fn, args=(variable, value))
     else:
       strategy = distribution_strategy_context.get_cross_replica_context()
       return strategy.update(variable, update_fn, value)
diff --git a/tensorflow/python/training/moving_averages_test.py b/tensorflow/python/training/moving_averages_test.py
index bb2fca66e3c1eed8f3143fa98fe0100a8eb71bbe..03bcde9c8498ed03d2eaf52c7f1e2d4211e0ddc6 100644
--- a/tensorflow/python/training/moving_averages_test.py
+++ b/tensorflow/python/training/moving_averages_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.training import saver as saver_lib
 
 class MovingAveragesTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageWithoutZeroDebias(self):
     with self.cached_session():
       var = variables.Variable([10.0, 11.0])
@@ -43,12 +44,13 @@ class MovingAveragesTest(test.TestCase):
       assign = moving_averages.assign_moving_average(
           var, val, decay, zero_debias=False)
       variables.global_variables_initializer().run()
-      self.assertAllClose([10.0, 11.0], var.eval())
+      self.assertAllClose([10.0, 11.0], self.evaluate(var))
       assign.op.run()
       self.assertAllClose(
           [10.0 * 0.25 + 1.0 * (1.0 - 0.25), 11.0 * 0.25 + 2.0 * (1.0 - 0.25)],
-          var.eval())
+          self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverage(self):
     with self.cached_session():
       var = variables.Variable([0.0, 0.0])
@@ -56,12 +58,13 @@ class MovingAveragesTest(test.TestCase):
       decay = 0.25
       assign = moving_averages.assign_moving_average(var, val, decay)
       variables.global_variables_initializer().run()
-      self.assertAllClose([0.0, 0.0], var.eval())
+      self.assertAllClose([0.0, 0.0], self.evaluate(var))
       assign.op.run()
-      self.assertAllClose([
-          1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)
-      ], var.eval())
+      self.assertAllClose(
+          [1.0 * (1.0 - 0.25) / (1 - 0.25), 2.0 * (1.0 - 0.25) / (1 - 0.25)],
+          self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageNewNamingMultipleCalls(self):
     with variable_scope.variable_scope("scope1") as vs1:
       with variable_scope.variable_scope("scope2"):
@@ -76,6 +79,7 @@ class MovingAveragesTest(test.TestCase):
     actual_names = [v.name for v in vs1.global_variables()]
     self.assertSetEqual(set(expected_names), set(actual_names))
 
+  @test_util.run_deprecated_v1
   def testAssignMovingAverageNewNamingMultipleCallsWithReuse(self):
     with variable_scope.variable_scope("scope1") as vs1:
       var = variable_scope.get_variable("Var", shape=[])
@@ -86,6 +90,7 @@ class MovingAveragesTest(test.TestCase):
       moving_averages.assign_moving_average(var, 0.0, 0.99)
       moving_averages.assign_moving_average(var, 0.0, 0.99)
 
+  @test_util.run_deprecated_v1
   def testWeightedMovingAverage(self):
     with self.cached_session() as sess:
       decay = 0.5
@@ -111,6 +116,7 @@ class MovingAveragesTest(test.TestCase):
       denominator_2 = denominator_1 * decay + weight_2 * (1.0 - decay)
       self.assertAllClose(numerator_2 / denominator_2, wma_array)
 
+  @test_util.run_deprecated_v1
   def testWeightedMovingAverageBfloat16(self):
     bfloat16 = pywrap_tensorflow.TF_bfloat16_type()
     with self.cached_session() as sess:
@@ -179,66 +185,72 @@ class ExponentialMovingAverageTest(test.TestCase):
     self.assertEqual("add/ExponentialMovingAverage:0", avg2.name)
 
     # Check initial values.
-    self.assertAllClose(tens, var0.eval())
-    self.assertAllClose(thirties, var1.eval())
-    self.assertAllClose(_Repeat(10.0 + 30.0, dim), tensor2.eval())
+    self.assertAllClose(tens, self.evaluate(var0))
+    self.assertAllClose(thirties, self.evaluate(var1))
+    self.assertAllClose(_Repeat(10.0 + 30.0, dim), self.evaluate(tensor2))
 
     # Check that averages are initialized correctly.
-    self.assertAllClose(tens, avg0.eval())
-    self.assertAllClose(thirties, avg1.eval())
+    self.assertAllClose(tens, self.evaluate(avg0))
+    self.assertAllClose(thirties, self.evaluate(avg1))
     # Note that averages of Tensor's initialize to zeros_like since no value
     # of the Tensor is known because the Op has not been run (yet).
-    self.assertAllClose(_Repeat(0.0, dim), avg2.eval())
+    self.assertAllClose(_Repeat(0.0, dim), self.evaluate(avg2))
 
     # Update the averages and check.
     update.run()
     dk = actual_decay
 
     expected = _Repeat(10.0 * dk + 10.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat(30.0 * dk + 30.0 * (1 - dk), dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(0.0 * dk + (10.0 + 30.0) * (1 - dk) / _Scale(dk, 1), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
     # Again, update the averages and check.
     update.run()
     expected = _Repeat((10.0 * dk + 10.0 * (1 - dk)) * dk + 10.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg0.eval())
+    self.assertAllClose(expected, self.evaluate(avg0))
     expected = _Repeat((30.0 * dk + 30.0 * (1 - dk)) * dk + 30.0 * (1 - dk),
                        dim)
-    self.assertAllClose(expected, avg1.eval())
+    self.assertAllClose(expected, self.evaluate(avg1))
     expected = _Repeat(((0.0 * dk + (10.0 + 30.0) * (1 - dk)) * dk +
                         (10.0 + 30.0) * (1 - dk)) / _Scale(dk, 2), dim)
-    self.assertAllClose(expected, avg2.eval())
+    self.assertAllClose(expected, self.evaluate(avg2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNoNumUpdates_Vector_Debias(self):
     with self.cached_session():
       ema = moving_averages.ExponentialMovingAverage(0.25, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.25, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Scalar_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -246,12 +258,14 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=1)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
       ema = moving_averages.ExponentialMovingAverage(0.25, num_updates=1)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNumUpdates_Vector_Debias(self):
     with self.cached_session():
       # With num_updates 1, the decay applied is 0.1818
@@ -259,6 +273,7 @@ class ExponentialMovingAverageTest(test.TestCase):
           0.25, num_updates=1, zero_debias=True)
       self._CheckDecay(ema, actual_decay=0.181818, dim=5)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesWithControlDeps(self):
     with self.cached_session() as sess:
       v0 = variables.Variable(0, name="v0")
@@ -274,16 +289,17 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual([], v1_avg.value().op.control_inputs)
       self.assertEqual([], v1_avg.value().op.control_inputs)
       # We should be able to initialize v1_avg before v0.
-      sess.run(v1_avg.initializer)
-      sess.run(v0.initializer)
-      self.assertEqual([10.0], sess.run(v1_avg))
+      self.evaluate(v1_avg.initializer)
+      self.evaluate(v0.initializer)
+      self.assertEqual([10.0], self.evaluate(v1_avg))
       # running ema_op should add to v0 (in addition to updating v1_avg)
-      sess.run(assign_to_v1)
-      sess.run(ema_op)
-      self.assertEqual(1, sess.run(v0))
-      self.assertEqual([17.5], sess.run(v1_avg))
+      self.evaluate(assign_to_v1)
+      self.evaluate(ema_op)
+      self.assertEqual(1, self.evaluate(v0))
+      self.assertEqual([17.5], self.evaluate(v1_avg))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testBasicEager(self):
     v0 = variables.Variable(1.0)
     v1 = variables.Variable(2.0)
@@ -339,9 +355,11 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNames(self):
     self.averageVariablesNamesHelper(zero_debias=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesNoDebias(self):
     self.averageVariablesNamesHelper(zero_debias=False)
 
@@ -387,12 +405,15 @@ class ExponentialMovingAverageTest(test.TestCase):
         self.assertEqual(
             ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScope(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesNamesRespectScopeNoDebias(self):
     self.averageVariablesNamesRespectScopeHelper(zero_debias=False)
 
+  @test_util.run_v1_only("b/120545219")
   def testSubsetAverageVariablesNames(self):
     with self.cached_session():
       v0 = variables.Variable(10.0, name="v0")
@@ -421,6 +442,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       self.assertEqual(ema.average(v1).op.name, ema.average_name(v1))
       self.assertEqual(ema.average(tensor2).op.name, ema.average_name(tensor2))
 
+  @test_util.run_v1_only("b/120545219")
   def testAverageVariablesDeviceAssignment(self):
     with ops.device("/job:dev_v0"):
       v0 = variables.Variable(10.0, name="v0")
@@ -451,6 +473,7 @@ class ExponentialMovingAverageTest(test.TestCase):
       _ = saver_lib.import_meta_graph(meta_graph)
     return graph_copy
 
+  @test_util.run_deprecated_v1
   def testImportedGraphVariablesToRestore(self):
     g = ops.Graph()
     with g.as_default():
diff --git a/tensorflow/python/training/optimizer.py b/tensorflow/python/training/optimizer.py
index 9dfa9d2afb288e0e7467de614553402d63e821ea..d9ebdcad1f3c83c0e0d4b8496d601fce2669fbff 100644
--- a/tensorflow/python/training/optimizer.py
+++ b/tensorflow/python/training/optimizer.py
@@ -24,6 +24,9 @@ import abc
 
 import six
 
+from tensorflow.python.distribute import distribute_lib
+from tensorflow.python.distribute import distribution_strategy_context as distribute_ctx
+from tensorflow.python.distribute import reduce_util as ds_reduce_util
 from tensorflow.python.eager import backprop
 from tensorflow.python.eager import context
 from tensorflow.python.framework import dtypes
@@ -36,8 +39,6 @@ from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribute as distribute_lib
-from tensorflow.python.training import distribution_strategy_context as distribute_ctx
 from tensorflow.python.training import slot_creator
 from tensorflow.python.training.checkpointable import base as checkpointable
 from tensorflow.python.util import nest
@@ -200,8 +201,7 @@ def _get_processor(v):
       return _TensorProcessor(v)
     else:
       return _DenseResourceVariableProcessor(v)
-  if isinstance(
-      v, resource_variable_ops.ResourceVariable) and not v._in_graph_mode:  # pylint: disable=protected-access
+  if resource_variable_ops.is_resource_variable(v) and not v._in_graph_mode:  # pylint: disable=protected-access
     # True if and only if `v` was initialized eagerly.
     return _DenseResourceVariableProcessor(v)
   if v.op.type == "VarHandleOp":
@@ -213,7 +213,7 @@ def _get_processor(v):
   raise NotImplementedError("Trying to optimize unsupported type ", v)
 
 
-@tf_export("train.Optimizer")
+@tf_export(v1=["train.Optimizer"])
 class Optimizer(
     # Optimizers inherit from CheckpointableBase rather than Checkpointable
     # since they do most of their dependency management themselves (slot
@@ -520,8 +520,7 @@ class Optimizer(
 
   @staticmethod
   def _scale_loss(loss_value):
-    if (distribute_lib.get_loss_reduction() ==
-        variable_scope.VariableAggregation.MEAN):
+    if distribute_lib.get_loss_reduction() == ds_reduce_util.ReduceOp.MEAN:
       num_replicas = \
         distribute_ctx.get_distribution_strategy().num_replicas_in_sync
       if num_replicas > 1:
@@ -565,7 +564,7 @@ class Optimizer(
     if distribute_ctx.has_distribution_strategy():
       grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
       return distribute_ctx.get_replica_context().merge_call(
-          self._distributed_apply, grads_and_vars, global_step, name)
+          self._distributed_apply, args=(grads_and_vars, global_step, name))
 
     # No DistributionStrategy case.
     grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
@@ -658,14 +657,16 @@ class Optimizer(
     Returns:
       An `Operation` that applies the specified gradients across all
       replicas. If `global_step` was not None, that operation also
-      increments `global_step`.
+      increments `global_step`
     """
-    reduced_grads = distribution.batch_reduce(
-        variable_scope.VariableAggregation.SUM, grads_and_vars)
+    reduced_grads = distribution.extended.batch_reduce_to(
+        ds_reduce_util.ReduceOp.SUM, grads_and_vars)
     var_list = [v for _, v in grads_and_vars]
     grads_and_vars = zip(reduced_grads, var_list)
+
     # Note that this is called in a cross-replica context.
-    self._create_slots(var_list)
+    with ops.init_scope():
+      self._create_slots(var_list)
 
     def update(v, g):
       """Apply gradients to a replica variable."""
@@ -682,7 +683,13 @@ class Optimizer(
             "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
       p = _get_processor(v)
 
-      scope_name = "" if context.executing_eagerly() else v.op.name
+      if context.executing_eagerly() or (
+          resource_variable_ops.is_resource_variable(v) and
+          not v._in_graph_mode):  # pylint: disable=protected-access
+        scope_name = v.name.split(":")[0]
+      else:
+        scope_name = v.op.name
+
       # device_policy is set because non-mirrored tensors will be read in
       # `update_op`. `_resource_apply_dense`, `lr_t`, `beta1_t` and `beta2_t`
       # is an example.
@@ -695,21 +702,23 @@ class Optimizer(
       update_ops = [
           op
           for grad, var in grads_and_vars
-          for op in distribution.update(var, update, grad, grouped=False)
+          for op in distribution.extended.update(
+              var, update, args=(grad,), group=False)
       ]
 
       def finish(self, update_ops):
         return self._finish(update_ops, "update")
 
-      non_slot_devices = distribution.non_slot_devices(var_list)
-      finish_updates = distribution.update_non_slot(
-          non_slot_devices, finish, self, update_ops, grouped=False)
+      non_slot_devices = distribution.extended.non_slot_devices(var_list)
+      finish_updates = distribution.extended.update_non_slot(
+          non_slot_devices, finish, args=(self, update_ops), group=False)
       if global_step is None:
         apply_updates = distribution.group(finish_updates, name=name)
       else:
         with ops.control_dependencies(finish_updates):
-          apply_updates = distribution.update(
-              global_step, state_ops.assign_add, 1, name=name)
+          apply_updates = distribution.extended.update(
+              global_step, state_ops.assign_add, args=(1,),
+              kwargs={"name": name})
 
       if not context.executing_eagerly():
         if isinstance(apply_updates, ops.Tensor):
@@ -747,7 +756,7 @@ class Optimizer(
       # `_resource_apply_dense`.
       distributed_container = var._distributed_container()
       assert distributed_container is not None
-      if context.executing_eagerly():
+      if ops.executing_eagerly_outside_functions():
         key = distributed_container._unique_id
       else:
         key = (distributed_container.graph, distributed_container._shared_name)
diff --git a/tensorflow/python/training/optimizer_test.py b/tensorflow/python/training/optimizer_test.py
index 7a7d01d50e0b6dc639d0d511f03d121c3a9e5c73..e175b5a79989e4c7b6b4c736eefe0250e9ebbcc9 100644
--- a/tensorflow/python/training/optimizer_test.py
+++ b/tensorflow/python/training/optimizer_test.py
@@ -62,6 +62,7 @@ class OptimizerTest(test.TestCase):
       self.assertAllClose([-14., -13.], self.evaluate(var0))
       self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testAggregationMethod(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -79,14 +80,15 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
-        self.assertAllClose([-14., -13.], var0.eval())
-        self.assertAllClose([-6., -5.], var1.eval())
+        self.assertAllClose([-14., -13.], self.evaluate(var0))
+        self.assertAllClose([-6., -5.], self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testPrecomputedGradient(self):
     for dtype in [dtypes.half, dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -102,15 +104,15 @@ class OptimizerTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Run 1 step of sgd through optimizer
         opt_op.run()
         # Validate updated params
         self.assertAllClose([1.0 - 3 * 5 * 42.0, 2.0 - 3 * 5 * (-42.0)],
-                            var0.eval())
+                            self.evaluate(var0))
         self.assertAllClose([3.0 - 3 * 3 * 42.0, 4.0 - 3 * 3 * (-42.0)],
-                            var1.eval())
+                            self.evaluate(var1))
 
   @test_util.run_in_graph_and_eager_modes
   def testNoVariables(self):
@@ -230,6 +232,7 @@ class OptimizerTest(test.TestCase):
     with self.assertRaises(NotImplementedError):
       sgd_op.apply_gradients(grads_and_vars)
 
+  @test_util.run_deprecated_v1
   def testTrainOp(self):
     with self.cached_session():
       var0 = variables.Variable([1.0, 2.0])
@@ -241,6 +244,7 @@ class OptimizerTest(test.TestCase):
       opt_op = sgd_op.minimize(cost, global_step, [var0, var1])
       self.assertTrue(opt_op in ops.get_collection(ops.GraphKeys.TRAIN_OP))
 
+  @test_util.run_deprecated_v1
   def testConstraint(self):
     constraint_01 = lambda x: clip_ops.clip_by_value(x, -0.1, 0.)
     constraint_0 = lambda x: clip_ops.clip_by_value(x, 0., 1.)
@@ -257,13 +261,13 @@ class OptimizerTest(test.TestCase):
 
       variables.global_variables_initializer().run()
       # Fetch params to validate initial values
-      self.assertAllClose([1.0, 2.0], var0.eval())
-      self.assertAllClose([3.0, 4.0], var1.eval())
+      self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+      self.assertAllClose([3.0, 4.0], self.evaluate(var1))
       # Run 1 step of sgd through optimizer
       opt_op.run()
       # Validate updated params
-      self.assertAllClose([-0.1, -0.1], var0.eval())
-      self.assertAllClose([0., 0.], var1.eval())
+      self.assertAllClose([-0.1, -0.1], self.evaluate(var0))
+      self.assertAllClose([0., 0.], self.evaluate(var1))
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/python/training/proximal_adagrad.py b/tensorflow/python/training/proximal_adagrad.py
index 9bd677b8efcd447f74ec2a3cbe94d63eeb9a4dd1..2ea628a56b47b36a423b5ebdd3d8afef5f41c6bc 100644
--- a/tensorflow/python/training/proximal_adagrad.py
+++ b/tensorflow/python/training/proximal_adagrad.py
@@ -26,7 +26,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.ProximalAdagradOptimizer")
+@tf_export(v1=["train.ProximalAdagradOptimizer"])
 class ProximalAdagradOptimizer(optimizer.Optimizer):
   # pylint: disable=line-too-long
   """Optimizer that implements the Proximal Adagrad algorithm.
diff --git a/tensorflow/python/training/proximal_adagrad_test.py b/tensorflow/python/training/proximal_adagrad_test.py
index 74e06a5e2e68adc1b214110c6fc2268e50b30879..ce214ac418a01455b113ad261971434727994a3e 100644
--- a/tensorflow/python/training/proximal_adagrad_test.py
+++ b/tensorflow/python/training/proximal_adagrad_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -48,7 +49,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([0.0, 0.0], v0_val)
       self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -56,7 +57,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-2.60260963, -4.29698515]), v0_val)
       self.assertAllClose(np.array([-0.28432083, -0.56694895]), v1_val)
       opt_vars = opt.variables()
@@ -64,12 +65,15 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       self.assertStartsWith(opt_vars[1].name, var1._shared_name)
       self.assertEqual(2, len(opt_vars))
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradwithoutRegularization(self):
     self.doTestProximalAdagradwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceProximalAdagradwithoutRegularization(self):
     self.doTestProximalAdagradwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradwithoutRegularization2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -85,17 +89,18 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
       # Run 3 steps Proximal Adagrad.
       for _ in range(3):
         update.run()
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-1.60261, -2.296985]), v0_val)
       self.assertAllClose(np.array([3.715679, 2.433051]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -106,13 +111,15 @@ class ProximalAdagradOptimizerTest(test.TestCase):
         sgd_op = proximal_adagrad.ProximalAdagradOptimizer(1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0, 1]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0, 1]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradWithL1(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -128,17 +135,18 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
       # Run 10 steps Proximal Adagrad
       for _ in range(10):
         update.run()
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-6.663634, -9.190331]), v0_val)
       self.assertAllClose(np.array([2.959304, 1.029232]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testProximalAdagradWithL1_L2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -154,7 +162,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -162,7 +170,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
       for _ in range(10):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.0495, -0.0995]), v0_val)
       self.assertAllClose(np.array([-0.0045, -0.0095]), v1_val)
 
@@ -190,7 +198,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllClose([[1.0], [2.0]], v0_val)
       self.assertAllClose([[3.0], [4.0]], v1_val)
@@ -202,9 +210,10 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
+  @test_util.run_deprecated_v1
   def testEquivAdagradwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
@@ -222,6 +231,7 @@ class ProximalAdagradOptimizerTest(test.TestCase):
     self.assertAllClose(val0, val2)
     self.assertAllClose(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivSparseAdagradwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
diff --git a/tensorflow/python/training/proximal_gradient_descent_test.py b/tensorflow/python/training/proximal_gradient_descent_test.py
index f77f68b23432a59f509e73158ee6893021bbc138..25b206605dc7315216e48a22d597f7342742a5ca 100644
--- a/tensorflow/python/training/proximal_gradient_descent_test.py
+++ b/tensorflow/python/training/proximal_gradient_descent_test.py
@@ -23,6 +23,7 @@ import numpy as np
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -50,7 +51,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([0.0, 0.0], v0_val)
       self.assertAllClose([0.0, 0.0], v1_val)
 
@@ -58,16 +59,19 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.9, -1.8]), v0_val)
       self.assertAllClose(np.array([-0.09, -0.18]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentwithoutRegularization(self):
     self.doTestProximalGradientDescentwithoutRegularization(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testResourceProximalGradientDescentwithoutRegularization(self):
     self.doTestProximalGradientDescentwithoutRegularization(use_resource=True)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentwithoutRegularization2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -80,7 +84,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -88,10 +92,11 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(3):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([0.1, 0.2]), v0_val)
       self.assertAllClose(np.array([3.91, 2.82]), v1_val)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -103,13 +108,15 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
             1.0).minimize(loss)
         variables.global_variables_initializer().run()
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
         sgd_op.run()
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testProximalGradientDescentWithL1_L2(self):
     with self.cached_session() as sess:
       var0 = variables.Variable([1.0, 2.0])
@@ -122,7 +129,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
       variables.global_variables_initializer().run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose([1.0, 2.0], v0_val)
       self.assertAllClose([4.0, 3.0], v1_val)
 
@@ -130,7 +137,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
       for _ in range(10):
         update.run()
 
-      v0_val, v1_val = sess.run([var0, var1])
+      v0_val, v1_val = self.evaluate([var0, var1])
       self.assertAllClose(np.array([-0.0495, -0.0995]), v0_val)
       self.assertAllClose(np.array([-0.0045, -0.0095]), v1_val)
 
@@ -158,7 +165,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     variables.global_variables_initializer().run()
 
     sess = ops.get_default_session()
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     if is_sparse:
       self.assertAllClose([[1.0], [2.0]], v0_val)
       self.assertAllClose([[3.0], [4.0]], v1_val)
@@ -170,9 +177,10 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     for _ in range(steps):
       update.run()
 
-    v0_val, v1_val = sess.run([var0, var1])
+    v0_val, v1_val = self.evaluate([var0, var1])
     return v0_val, v1_val
 
+  @test_util.run_deprecated_v1
   def testEquivSparseGradientDescentwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
@@ -189,6 +197,7 @@ class ProximalGradientDescentOptimizerTest(test.TestCase):
     self.assertAllClose(val0, val2)
     self.assertAllClose(val1, val3)
 
+  @test_util.run_deprecated_v1
   def testEquivGradientDescentwithoutRegularization(self):
     with self.cached_session():
       val0, val1 = self.applyOptimizer(
diff --git a/tensorflow/python/training/quantize_training_test.py b/tensorflow/python/training/quantize_training_test.py
index 6edbf7665fbd59eea04294551452b764856563a9..2352af7e99b5bab99826fb9a628a98846e25444c 100644
--- a/tensorflow/python/training/quantize_training_test.py
+++ b/tensorflow/python/training/quantize_training_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import importer
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -52,6 +53,7 @@ class PywrapQuantizeTrainingTest(test.TestCase):
 
   # Test that save/restoring works for EMA variables generated in the
   # quantized training rewrite.
+  @test_util.run_v1_only('b/120545219')
   def testQuantizedSaveRestore(self):
     save_path = os.path.join(self.get_temp_dir(), 'quantized_save_restore')
 
@@ -73,11 +75,11 @@ class PywrapQuantizeTrainingTest(test.TestCase):
       _ = importer.import_graph_def(result, name='')
 
       # Initialize the variable.
-      sess.run(g.get_operation_by_name(init_op.name))
+      self.evaluate(g.get_operation_by_name(init_op.name))
 
       # Run the graph for one step to assign values to the quantization min/max
       # variables.
-      sess.run(g.get_tensor_by_name(c.name))
+      self.evaluate(g.get_tensor_by_name(c.name))
 
       saver.save(sess, save_path)
 
diff --git a/tensorflow/python/training/queue_runner_test.py b/tensorflow/python/training/queue_runner_test.py
index 15fe42bbd851fec831ef2a84401c1c7f1cac1973..2f6e924f98e5068d9f50e6efe93c58771b9acade 100644
--- a/tensorflow/python/training/queue_runner_test.py
+++ b/tensorflow/python/training/queue_runner_test.py
@@ -26,6 +26,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import variables
@@ -40,6 +41,7 @@ _MockOp = collections.namedtuple("MockOp", ["name"])
 
 class QueueRunnerTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testBasic(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -58,8 +60,9 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_v1_only("b/120545219")
   def testTwoOps(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -80,9 +83,10 @@ class QueueRunnerTest(test.TestCase):
       for t in threads:
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
-      self.assertEqual(3, var0.eval())
-      self.assertEqual(30, var1.eval())
+      self.assertEqual(3, self.evaluate(var0))
+      self.assertEqual(30, self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testExceptionsCaptured(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
@@ -99,6 +103,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
       self.assertTrue("Operation not in the graph" in str(exceptions[1]))
 
+  @test_util.run_deprecated_v1
   def testRealDequeueEnqueue(self):
     with self.cached_session() as sess:
       q0 = data_flow_ops.FIFOQueue(3, dtypes.float32)
@@ -121,12 +126,13 @@ class QueueRunnerTest(test.TestCase):
       # It should have terminated cleanly.
       self.assertEqual(0, len(qr.exceptions_raised))
       # The 2 values should be in queue1.
-      self.assertEqual(10.0, dequeue1.eval())
-      self.assertEqual(10.0, dequeue1.eval())
+      self.assertEqual(10.0, self.evaluate(dequeue1))
+      self.assertEqual(10.0, self.evaluate(dequeue1))
       # And queue1 should now be closed.
       with self.assertRaisesRegexp(errors_impl.OutOfRangeError, "is closed"):
-        dequeue1.eval()
+        self.evaluate(dequeue1)
 
+  @test_util.run_v1_only("b/120545219")
   def testRespectCoordShouldStop(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -149,8 +155,9 @@ class QueueRunnerTest(test.TestCase):
       coord.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 0.
-      self.assertEqual(0, var.eval())
+      self.assertEqual(0, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testRequestStopOnException(self):
     with self.cached_session() as sess:
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32)
@@ -163,6 +170,7 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(ValueError, "Operation not in the graph"):
         coord.join()
 
+  @test_util.run_deprecated_v1
   def testGracePeriod(self):
     with self.cached_session() as sess:
       # The enqueue will quickly block.
@@ -180,6 +188,7 @@ class QueueRunnerTest(test.TestCase):
       # the queue to be closed and the enqueue to terminate.
       coord.join(stop_grace_period_secs=1.0)
 
+  @test_util.run_deprecated_v1
   def testMultipleSessions(self):
     with self.cached_session() as sess:
       with session.Session() as other_sess:
@@ -195,6 +204,7 @@ class QueueRunnerTest(test.TestCase):
         other_threads = qr.create_threads(other_sess, coord=coord)
         self.assertEqual(len(threads), len(other_threads))
 
+  @test_util.run_deprecated_v1
   def testIgnoreMultiStarts(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -211,6 +221,7 @@ class QueueRunnerTest(test.TestCase):
       new_threads = qr.create_threads(sess, coord=coord)
       self.assertEqual([], new_threads)
 
+  @test_util.run_v1_only("b/120545219")
   def testThreads(self):
     with self.cached_session() as sess:
       # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
@@ -238,6 +249,7 @@ class QueueRunnerTest(test.TestCase):
       self.assertEqual(1, len(exceptions))
       self.assertTrue("Operation not in the graph" in str(exceptions[0]))
 
+  @test_util.run_deprecated_v1
   def testName(self):
     with ops.name_scope("scope"):
       queue = data_flow_ops.FIFOQueue(10, dtypes.float32, name="queue")
@@ -247,6 +259,7 @@ class QueueRunnerTest(test.TestCase):
     self.assertEqual(
         1, len(ops.get_collection(ops.GraphKeys.QUEUE_RUNNERS, "scope")))
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunners(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
@@ -263,8 +276,9 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersRaisesIfNotASession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -278,6 +292,7 @@ class QueueRunnerTest(test.TestCase):
       with self.assertRaisesRegexp(TypeError, "tf.Session"):
         queue_runner_impl.start_queue_runners("NotASession")
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersIgnoresMonitoredSession(self):
     zero64 = constant_op.constant(0, dtype=dtypes.int64)
     var = variables.VariableV1(zero64)
@@ -292,6 +307,7 @@ class QueueRunnerTest(test.TestCase):
           monitored_session.MonitoredSession())
       self.assertFalse(threads)
 
+  @test_util.run_deprecated_v1
   def testStartQueueRunnersNonDefaultGraph(self):
     # CountUpTo will raise OUT_OF_RANGE when it reaches the count.
     graph = ops.Graph()
@@ -310,7 +326,7 @@ class QueueRunnerTest(test.TestCase):
         t.join()
       self.assertEqual(0, len(qr.exceptions_raised))
       # The variable should be 3.
-      self.assertEqual(3, var.eval())
+      self.assertEqual(3, self.evaluate(var))
 
   def testQueueRunnerSerializationRoundTrip(self):
     graph = ops.Graph()
diff --git a/tensorflow/python/training/rmsprop.py b/tensorflow/python/training/rmsprop.py
index f38c9861d64aa258cde07ccd3041d3c50932c33b..fb53b5883f5b0246e3e99cb00f972dcf4eb9c409 100644
--- a/tensorflow/python/training/rmsprop.py
+++ b/tensorflow/python/training/rmsprop.py
@@ -50,7 +50,7 @@ from tensorflow.python.training import training_ops
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.RMSPropOptimizer")
+@tf_export(v1=["train.RMSPropOptimizer"])
 class RMSPropOptimizer(optimizer.Optimizer):
   """Optimizer that implements the RMSProp algorithm.
 
diff --git a/tensorflow/python/training/rmsprop_test.py b/tensorflow/python/training/rmsprop_test.py
index b63abe0529515b570c420f53919d24b51c1e2665..8f029d5310e9422e4f6dbc1c874f118d3c05d95d 100644
--- a/tensorflow/python/training/rmsprop_test.py
+++ b/tensorflow/python/training/rmsprop_test.py
@@ -28,6 +28,7 @@ from tensorflow.python.eager import context
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import embedding_ops
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import resource_variable_ops
@@ -88,11 +89,12 @@ class RMSPropOptimizerTest(test.TestCase):
       var_t[gindex] = var[gindex] - mom_t[gindex]
     return var_t, mg_t, rms_t, mom_t
 
+  @test_util.run_deprecated_v1
   def testDense(self):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay, momentum,
          epsilon, centered, use_resource) in _TESTPARAMS:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1, 0.2], dtype=dtype.as_numpy_dtype)
@@ -115,7 +117,7 @@ class RMSPropOptimizerTest(test.TestCase):
             centered=centered)
 
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -138,12 +140,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._rmsprop_update_numpy(
               var0_np, grads0_np, mg0_np, rms0_np, mom0_np, learning_rate,
@@ -154,15 +156,16 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariable(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -176,15 +179,17 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=0.0,
             centered=False).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[0., 1.]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[0., 1.]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testMinimizeSparseResourceVariableCentered(self):
     for dtype in [dtypes.float32, dtypes.float64]:
       with self.cached_session():
@@ -198,20 +203,22 @@ class RMSPropOptimizerTest(test.TestCase):
             momentum=0.0,
             epsilon=1.0,
             centered=True).minimize(loss)
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
         # Fetch params to validate initial values
-        self.assertAllCloseAccordingToType([[1.0, 2.0]], var0.eval())
+        self.assertAllCloseAccordingToType([[1.0, 2.0]], self.evaluate(var0))
         # Run 1 step of sgd
-        sgd_op.run()
+        self.evaluate(sgd_op)
         # Validate updated params
-        self.assertAllCloseAccordingToType(
-            [[-111, -138]], var0.eval(), atol=0.01)
+        self.assertAllCloseAccordingToType([[-111, -138]],
+                                           self.evaluate(var0),
+                                           atol=0.01)
 
+  @test_util.run_deprecated_v1
   def testSparse(self):
     # TODO(yori): Use ParameterizedTest when available
     for (dtype, learning_rate, decay,
          momentum, epsilon, centered, _) in _TESTPARAMS:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         # Initialize variables for numpy implementation.
         var0_np = np.array([1.0, 2.0], dtype=dtype.as_numpy_dtype)
         grads0_np = np.array([0.1], dtype=dtype.as_numpy_dtype)
@@ -235,7 +242,7 @@ class RMSPropOptimizerTest(test.TestCase):
             epsilon=epsilon,
             centered=centered)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         mg0 = opt.get_slot(var0, "mg")
         self.assertEqual(mg0 is not None, centered)
@@ -258,12 +265,12 @@ class RMSPropOptimizerTest(test.TestCase):
         mom1_np = np.array([0.0, 0.0], dtype=dtype.as_numpy_dtype)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
 
         # Run 4 steps of RMSProp
         for _ in range(1, 5):
-          update.run()
+          self.evaluate(update)
 
           var0_np, mg0_np, rms0_np, mom0_np = self._sparse_rmsprop_update_numpy(
               var0_np, grads0_np_indices, grads0_np, mg0_np, rms0_np, mom0_np,
@@ -274,18 +281,19 @@ class RMSPropOptimizerTest(test.TestCase):
 
           # Validate updated params
           if centered:
-            self.assertAllCloseAccordingToType(mg0_np, mg0.eval())
-            self.assertAllCloseAccordingToType(mg1_np, mg1.eval())
-          self.assertAllCloseAccordingToType(rms0_np, rms0.eval())
-          self.assertAllCloseAccordingToType(rms1_np, rms1.eval())
-          self.assertAllCloseAccordingToType(mom0_np, mom0.eval())
-          self.assertAllCloseAccordingToType(mom1_np, mom1.eval())
-          self.assertAllCloseAccordingToType(var0_np, var0.eval())
-          self.assertAllCloseAccordingToType(var1_np, var1.eval())
-
+            self.assertAllCloseAccordingToType(mg0_np, self.evaluate(mg0))
+            self.assertAllCloseAccordingToType(mg1_np, self.evaluate(mg1))
+          self.assertAllCloseAccordingToType(rms0_np, self.evaluate(rms0))
+          self.assertAllCloseAccordingToType(rms1_np, self.evaluate(rms1))
+          self.assertAllCloseAccordingToType(mom0_np, self.evaluate(mom0))
+          self.assertAllCloseAccordingToType(mom1_np, self.evaluate(mom1))
+          self.assertAllCloseAccordingToType(var0_np, self.evaluate(var0))
+          self.assertAllCloseAccordingToType(var1_np, self.evaluate(var1))
+
+  @test_util.run_deprecated_v1
   def testWithoutMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -293,7 +301,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.0, epsilon=1.0)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -305,34 +313,36 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: the rms accumulators where 1. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         # Check the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
@@ -340,18 +350,19 @@ class RMSPropOptimizerTest(test.TestCase):
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1.0)) -
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1.0))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1.0)) -
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 1e-5 + 1.0))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
+  @test_util.run_deprecated_v1
   def testWithMomentum(self):
     for dtype in [dtypes.half, dtypes.float32]:
-      with self.cached_session(use_gpu=True):
+      with test_util.use_gpu():
         var0 = variables.Variable([1.0, 2.0], dtype=dtype)
         var1 = variables.Variable([3.0, 4.0], dtype=dtype)
         grads0 = constant_op.constant([0.1, 0.1], dtype=dtype)
@@ -360,7 +371,7 @@ class RMSPropOptimizerTest(test.TestCase):
         opt = rmsprop.RMSPropOptimizer(
             learning_rate=2.0, decay=0.9, momentum=0.5, epsilon=1e-5)
         update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
-        variables.global_variables_initializer().run()
+        self.evaluate(variables.global_variables_initializer())
 
         rms0 = opt.get_slot(var0, "rms")
         self.assertTrue(rms0 is not None)
@@ -372,57 +383,61 @@ class RMSPropOptimizerTest(test.TestCase):
         self.assertTrue(mom1 is not None)
 
         # Fetch params to validate initial values
-        self.assertAllClose([1.0, 2.0], var0.eval())
-        self.assertAllClose([3.0, 4.0], var1.eval())
+        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
+        self.assertAllClose([3.0, 4.0], self.evaluate(var1))
         # Step 1: rms = 1, mom = 0. So we should see a normal
         # update: v -= grad * learning_rate
-        update.run()
+        self.evaluate(update)
         # Check the root mean square accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901, 0.901]), rms0.eval())
+            np.array([0.901, 0.901]), self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001, 0.90001]), rms1.eval())
+            np.array([0.90001, 0.90001]), self.evaluate(rms1))
         # Check the momentum accumulators
         self.assertAllCloseAccordingToType(
             np.array([(0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
-                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]), mom0.eval())
+                      (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))]),
+            self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([(0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
-                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]), mom1.eval())
+                      (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))]),
+            self.evaluate(mom1))
 
         # Check that the parameters.
         self.assertAllCloseAccordingToType(
             np.array([
                 1.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)),
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
         self.assertAllCloseAccordingToType(
             np.array([
                 3.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)),
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
         # Step 2: the root mean square accumulators contain the previous update.
-        update.run()
+        self.evaluate(update)
         # Check the rms accumulators.
         self.assertAllCloseAccordingToType(
-            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]), rms0.eval())
+            np.array([0.901 * 0.9 + 0.001, 0.901 * 0.9 + 0.001]),
+            self.evaluate(rms0))
         self.assertAllCloseAccordingToType(
-            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]), rms1.eval())
+            np.array([0.90001 * 0.9 + 1e-5, 0.90001 * 0.9 + 1e-5]),
+            self.evaluate(rms1))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)),
                 0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                 (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5))
-            ]), mom0.eval())
+            ]), self.evaluate(mom0))
         self.assertAllCloseAccordingToType(
             np.array([
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)),
                 0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                 (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5))
-            ]), mom1.eval())
+            ]), self.evaluate(mom1))
 
         # Check the parameters.
         self.assertAllCloseAccordingToType(
@@ -433,7 +448,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 2.0 - (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) -
                 (0.5 * (0.1 * 2.0 / math.sqrt(0.901 + 1e-5)) +
                  (0.1 * 2.0 / math.sqrt(0.901 * 0.9 + 0.001 + 1e-5)))
-            ]), var0.eval())
+            ]), self.evaluate(var0))
 
         self.assertAllCloseAccordingToType(
             np.array([
@@ -443,7 +458,7 @@ class RMSPropOptimizerTest(test.TestCase):
                 4.0 - (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) -
                 (0.5 * (0.01 * 2.0 / math.sqrt(0.90001 + 1e-5)) +
                  (0.01 * 2.0 / math.sqrt(0.90001 * 0.9 + 2e-5)))
-            ]), var1.eval())
+            ]), self.evaluate(var1))
 
   def testCallableParams(self):
     with context.eager_mode():
diff --git a/tensorflow/python/training/saver.py b/tensorflow/python/training/saver.py
index a29926a57df847fd6553e0813a5e2dfeebb3885e..348b8bf1ef0a89a971eb26c9cb7e5f9d01c51a4b 100644
--- a/tensorflow/python/training/saver.py
+++ b/tensorflow/python/training/saver.py
@@ -14,7 +14,11 @@
 # ==============================================================================
 
 # pylint: disable=invalid-name
-"""Save and restore variables."""
+"""Save and restore variables.
+
+Symbols in this file are deprecated. See replacements in
+tensorflow/python/training/checkpointable and tensorflow/python/training/saving.
+"""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -25,7 +29,6 @@ import time
 import uuid
 
 import numpy as np
-import six
 
 from tensorflow.core.protobuf import checkpointable_object_graph_pb2
 from tensorflow.core.protobuf import meta_graph_pb2
@@ -42,16 +45,15 @@ from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import gen_io_ops
 from tensorflow.python.ops import io_ops
-from tensorflow.python.ops import resource_variable_ops
-from tensorflow.python.ops import state_ops
 from tensorflow.python.ops import string_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import gfile
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import saveable_object
 from tensorflow.python.training import training_util
 from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util import compat
 from tensorflow.python.util.tf_export import tf_export
 
@@ -67,31 +69,6 @@ get_checkpoint_mtimes = checkpoint_management.get_checkpoint_mtimes
 remove_checkpoint = checkpoint_management.remove_checkpoint
 
 
-# Op names which identify variable reads which should be saved.
-_VARIABLE_OPS = set(["Variable",
-                     "VariableV2",
-                     "AutoReloadVariable",
-                     "VarHandleOp",
-                     "ReadVariableOp"])
-
-
-def _set_cpu0(device_string):
-  """Creates a new device string based on `device_string` but using /CPU:0.
-
-  If the device is already on /CPU:0, this is a no-op.
-
-  Args:
-    device_string: A device string.
-
-  Returns:
-    A device string.
-  """
-  parsed_device = pydev.DeviceSpec.from_string(device_string)
-  parsed_device.device_type = "CPU"
-  parsed_device.device_index = 0
-  return parsed_device.to_string()
-
-
 class BaseSaverBuilder(object):
   """Base class for Savers.
 
@@ -101,64 +78,9 @@ class BaseSaverBuilder(object):
   SaveSpec = saveable_object.SaveSpec
   SaveableObject = saveable_object.SaveableObject
 
-  class VariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles Variables."""
-
-    def __init__(self, var, slice_spec, name):
-      spec = BaseSaverBuilder.SaveSpec(var, slice_spec, name, dtype=var.dtype)
-      super(BaseSaverBuilder.VariableSaveable, self).__init__(var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      return state_ops.assign(
-          self.op,
-          restored_tensor,
-          validate_shape=restored_shapes is None and
-          self.op.get_shape().is_fully_defined())
-
-  class ResourceVariableSaveable(SaveableObject):
-    """SaveableObject implementation that handles ResourceVariables."""
-
-    def __init__(self, var, slice_spec, name):
-      self._var_device = var.device
-      self._var_shape = var.shape
-      if isinstance(var, ops.Tensor):
-        self.handle_op = var.op.inputs[0]
-        tensor = var
-      elif isinstance(var, resource_variable_ops.ResourceVariable):
-
-        def _read_variable_closure(v):
-          def f():
-            with ops.device(v.device):
-              x = v.read_value()
-              # To allow variables placed on non-CPU devices to be checkpointed,
-              # we copy them to CPU on the same machine first.
-              with ops.device("/device:CPU:0"):
-                return array_ops.identity(x)
-          return f
-
-        self.handle_op = var.handle
-        tensor = _read_variable_closure(var)
-      else:
-        raise ValueError(
-            "Saveable is neither a resource variable nor a read operation."
-            " Got: %s" % repr(var))
-      spec = BaseSaverBuilder.SaveSpec(tensor, slice_spec, name,
-                                       dtype=var.dtype)
-      super(BaseSaverBuilder.ResourceVariableSaveable, self).__init__(
-          var, [spec], name)
-
-    def restore(self, restored_tensors, restored_shapes):
-      restored_tensor = restored_tensors[0]
-      if restored_shapes is not None:
-        restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
-      # Copy the restored tensor to the variable's device.
-      with ops.device(self._var_device):
-        restored_tensor = array_ops.identity(restored_tensor)
-        return resource_variable_ops.shape_safe_assign_variable_handle(
-            self.handle_op, self._var_shape, restored_tensor)
+  # Aliases for code which was moved but still has lots of users.
+  VariableSaveable = saveable_object_util.ReferenceVariableSaveable
+  ResourceVariableSaveable = saveable_object_util.ResourceVariableSaveable
 
   def __init__(self, write_version=saver_pb2.SaverDef.V2):
     self._write_version = write_version
@@ -224,7 +146,11 @@ class BaseSaverBuilder(object):
     del restore_sequentially
     all_tensors = []
     for saveable in saveables:
-      with ops.device(_set_cpu0(saveable.device) if saveable.device else None):
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
         all_tensors.extend(
             self.restore_op(filename_tensor, saveable, preferred_shard))
     return all_tensors
@@ -336,7 +262,7 @@ class BaseSaverBuilder(object):
     last_device = None
     for shard, (device, saveables) in enumerate(per_device):
       last_device = device
-      with ops.device(_set_cpu0(device)):
+      with ops.device(saveable_object_util.set_cpu0(device)):
         sharded_filename = self.sharded_filename(tmp_checkpoint_prefix, shard,
                                                  num_shards_tensor)
         sharded_prefixes.append(sharded_filename)
@@ -344,7 +270,7 @@ class BaseSaverBuilder(object):
 
     with ops.control_dependencies([x.op for x in sharded_saves]):
       # Co-locates the merge step with the last device.
-      with ops.device(_set_cpu0(last_device)):
+      with ops.device(saveable_object_util.set_cpu0(last_device)):
         # V2 format write path consists of a metadata merge step.  Once merged,
         # attempts to delete the temporary directory, "<user-fed prefix>_temp".
         merge_step = gen_io_ops.merge_v2_checkpoints(
@@ -459,10 +385,6 @@ class BaseSaverBuilder(object):
                 name="restore_shard"))
     return control_flow_ops.group(*sharded_restores, name="restore_all")
 
-  @staticmethod
-  def _IsVariable(v):
-    return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
-
   def _GroupByDevices(self, saveables):
     """Group Variable tensor slices per device.
 
@@ -490,220 +412,6 @@ class BaseSaverBuilder(object):
       per_device[canonical_device.pop()].append(saveable)
     return sorted(per_device.items(), key=lambda t: t[0])
 
-  @staticmethod
-  def OpListToDict(op_list, convert_variable_to_tensor=True):
-    """Create a dictionary of names to operation lists.
-
-    Args:
-      op_list: A list, tuple, or set of Variables or SaveableObjects.
-      convert_variable_to_tensor: Whether or not to convert single Variables
-        with no slice info into Tensors.
-
-    Returns:
-      A dictionary of names to the operations that must be saved under
-      that name.  Variables with save_slice_info are grouped together under the
-      same key in no particular order.
-
-    Raises:
-      TypeError: If the type of op_list or its elements is not supported.
-      ValueError: If at least two saveables share the same name.
-    """
-    if not isinstance(op_list, (list, tuple, set)):
-      raise TypeError("Variables to save should be passed in a dict or a "
-                      "list: %s" % op_list)
-    # When ResourceVariables are converted to Tensors, read ops are added to the
-    # graph. Sorting the op_list ensures that the resulting graph is always
-    # constructed in a deterministic way:
-    op_list = sorted(op_list, key=lambda x: x.name)
-    names_to_saveables = {}
-    # pylint: disable=protected-access
-    for var in op_list:
-      if isinstance(var, BaseSaverBuilder.SaveableObject):
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.PartitionedVariable):
-        if var.name in names_to_saveables:
-          raise ValueError("At least two variables have the same name: %s" %
-                           var.name)
-        names_to_saveables[var.name] = var
-      elif isinstance(var, variables.Variable) and var._save_slice_info:
-        name = var._save_slice_info.full_name
-        if name in names_to_saveables:
-          if not isinstance(names_to_saveables[name], list):
-            raise ValueError("Mixing slices and non-slices with the same name: "
-                             "%s" % name)
-          names_to_saveables[name].append(var)
-        else:
-          names_to_saveables[name] = [var]
-      elif (isinstance(var, checkpointable.CheckpointableBase)
-            and not isinstance(var, variables.Variable)):
-        checkpointable_saveables = [
-            (factory() if callable(factory) else factory)
-            for factory in var._gather_saveables_for_checkpoint().values()]
-        names_to_saveables.update(
-            BaseSaverBuilder.OpListToDict(checkpointable_saveables))
-      else:
-        if context.executing_eagerly():
-          if not isinstance(var, resource_variable_ops.ResourceVariable):
-            raise ValueError(
-                "Can only save/restore ResourceVariables when eager execution "
-                "is enabled, type: %s." % type(var))
-          set_var = names_to_saveables.setdefault(var._shared_name, var)
-          if set_var is not var:
-            raise ValueError(
-                ("Two different ResourceVariable objects with the same "
-                 "shared_name '%s' were passed to the Saver. This likely means "
-                 "that they were created in different Graphs or isolation "
-                 "contexts, and may not be checkpointed together.") %
-                (var._shared_name,))
-        else:
-          if convert_variable_to_tensor:
-            if isinstance(var, resource_variable_ops.ResourceVariable):
-              var = var._graph_element  # pylint: disable=protected-access
-            else:
-              var = ops.internal_convert_to_tensor(var, as_ref=True)
-            if not BaseSaverBuilder._IsVariable(var):
-              raise TypeError("Variable to save is not a Variable: %s" % var)
-          if var.op.type == "ReadVariableOp":
-            name = var.op.inputs[0].op.name
-          else:
-            name = var.op.name
-          if name in names_to_saveables:
-            raise ValueError("At least two variables have the same name: %s" %
-                             name)
-          names_to_saveables[name] = var
-
-      # pylint: enable=protected-access
-    return names_to_saveables
-
-  @staticmethod
-  def SaveableObjectsForOp(op, name):
-    """Create `SaveableObject`s from an operation.
-
-    Args:
-      op: A variable, operation, or SaveableObject to coerce into a
-        SaveableObject.
-      name: A string name for the SaveableObject.
-
-    Yields:
-      `SaveableObject`s which together save/restore `op`.
-
-    Raises:
-      TypeError: If `name` is not a string.
-      ValueError: For operations with no known conversion to SaveableObject.
-    """
-    if not isinstance(name, six.string_types):
-      raise TypeError(
-          "names_to_saveables must be a dict mapping string names to "
-          "checkpointable operations. Name is not a string: %s" % name)
-    if isinstance(op, BaseSaverBuilder.SaveableObject):
-      yield op
-    elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
-      if isinstance(op, variables.PartitionedVariable):
-        op = list(op)
-      # A set of slices.
-      slice_name = None
-      # pylint: disable=protected-access
-      for variable in op:
-        if not isinstance(variable, variables.Variable):
-          raise ValueError("Slices must all be Variables: %s" % variable)
-        if not variable._save_slice_info:
-          raise ValueError("Slices must all be slices: %s" % variable)
-        if slice_name is None:
-          slice_name = variable._save_slice_info.full_name
-        elif slice_name != variable._save_slice_info.full_name:
-          raise ValueError(
-              "Slices must all be from the same tensor: %s != %s" %
-              (slice_name, variable._save_slice_info.full_name))
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, variable._save_slice_info.spec, name)
-      # pylint: enable=protected-access
-    elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
-        op, variables.Variable):
-      # pylint: disable=protected-access
-      for attr, factory in op._gather_saveables_for_checkpoint().items():
-        if attr == checkpointable.VARIABLE_VALUE_KEY:
-          # Keep original name for classes masquerading as variables.
-          full_name = name
-        else:
-          full_name = name + "_" + attr
-        op = (factory(full_name) if callable(factory) else factory)
-        for op in BaseSaverBuilder.SaveableObjectsForOp(op, op.name):
-          yield op
-      # pylint: enable=protected-access
-    else:
-      # A variable or tensor.
-      if context.executing_eagerly():
-        if not isinstance(op, resource_variable_ops.ResourceVariable):
-          raise ValueError("Can only save/restore ResourceVariable eager "
-                           "mode is enabled, type: %s." % type(op))
-        yield BaseSaverBuilder.ResourceVariableSaveable(op, "", name)
-      else:
-        if isinstance(op, resource_variable_ops.ResourceVariable):
-          variable = op._graph_element  # pylint: disable=protected-access
-        else:
-          variable = ops.internal_convert_to_tensor(op, as_ref=True)
-        if not BaseSaverBuilder._IsVariable(variable):
-          raise TypeError("names_to_saveables must be a dict mapping string "
-                          "names to Tensors/Variables. Not a variable: %s" %
-                          variable)
-        if variable.op.type in ["Variable", "VariableV2",
-                                "AutoReloadVariable"]:
-          yield BaseSaverBuilder.VariableSaveable(variable, "", name)
-        else:
-          yield BaseSaverBuilder.ResourceVariableSaveable(
-              variable, "", name)
-
-  def _ValidateAndSliceInputs(self, names_to_saveables):
-    """Returns the variables and names that will be used for a Saver.
-
-    Args:
-      names_to_saveables: A dict (k, v) where k is the name of an operation and
-         v is an operation to save or a BaseSaverBuilder.Saver.
-
-    Returns:
-      A list of BaseSaverBuilder.SaveableObject objects.
-
-    Raises:
-      TypeError: If any of the keys are not strings or any of the
-        values are not one of Tensor or Variable or a checkpointable operation.
-      ValueError: If the same operation is given in more than one value
-        (this also applies to slices of SlicedVariables).
-    """
-    if not isinstance(names_to_saveables, dict):
-      names_to_saveables = BaseSaverBuilder.OpListToDict(names_to_saveables)
-
-    saveables = []
-    seen_ops = set()
-    for name, op in sorted(names_to_saveables.items(),
-                           # Avoid comparing ops, sort only by name.
-                           key=lambda x: x[0]):
-      for converted_saveable_object in self.SaveableObjectsForOp(op, name):
-        self._AddSaveable(saveables, seen_ops, converted_saveable_object)
-    return saveables
-
-  def _AddSaveable(self, saveables, seen_ops, saveable):
-    """Adds the saveable to the saveables list.
-
-    Args:
-      saveables: List to append the SaveableObject to.
-      seen_ops: Set of the ops of the saveables already processed.  Used to
-        check that each saveable is only saved once.
-      saveable: The saveable.
-
-    Raises:
-      ValueError: If the saveable has already been processed.
-    """
-    if saveable.op in seen_ops:
-      raise ValueError("The same saveable will be restored with two names: %s" %
-                       saveable.name)
-    saveables.append(saveable)
-    seen_ops.add(saveable.op)
-
   def build(self,
             names_to_saveables,
             reshape=False,
@@ -775,7 +483,8 @@ class BaseSaverBuilder(object):
       raise ValueError("save and restore operations need to be built together "
                        " when eager execution is not enabled.")
 
-    saveables = self._ValidateAndSliceInputs(names_to_saveables)
+    saveables = saveable_object_util.validate_and_slice_inputs(
+        names_to_saveables)
     if max_to_keep is None:
       max_to_keep = 0
 
@@ -1077,16 +786,28 @@ class Saver(object):
     @compatibility(eager)
     When eager execution is enabled, `var_list` must specify a `list` or `dict`
     of variables to save. Otherwise, a `RuntimeError` will be raised.
+
+    Although Saver works in some cases when executing eagerly, it is
+    fragile. Please switch to `tf.train.Checkpoint` or
+    `tf.keras.Model.save_weights`, which perform a more robust object-based
+    saving. These APIs will load checkpoints written by `Saver`.
     @end_compatibility
     """
     if defer_build and var_list:
       raise ValueError(
           "If `var_list` is provided then build cannot be deferred. "
           "Either set defer_build=False or var_list=None.")
-    if context.executing_eagerly() and var_list is None:
-      raise RuntimeError(
-          "When eager execution is enabled, `var_list` must specify a list or "
-          "dict of variables to save")
+    if context.executing_eagerly():
+      logging.warning(
+          "Saver is deprecated, please switch to tf.train.Checkpoint or "
+          "tf.keras.Model.save_weights for training checkpoints. When "
+          "executing eagerly variables do not necessarily have unique names, "
+          "and so the variable.name-based lookups Saver performs are "
+          "error-prone.")
+      if var_list is None:
+        raise RuntimeError(
+            "When eager execution is enabled, `var_list` must specify a list "
+            "or dict of variables to save")
     self._var_list = var_list
     self._reshape = reshape
     self._sharded = sharded
@@ -1656,6 +1377,37 @@ def import_meta_graph(meta_graph_or_file, clear_devices=False,
   NOTE: Restarting training from saved `meta_graph` only works if the
   device assignments have not changed.
 
+  Example 2:
+  Variables, placeholders, and independent operations can also be stored, as
+  shown in the following example.
+
+  ```Python
+  # Saving contents and operations.
+  v1 = tf.placeholder(tf.float32, name="v1")
+  v2 = tf.placeholder(tf.float32, name="v2")
+  v3 = tf.mul(v1, v2)
+  vx = tf.Variable(10.0, name="vx")
+  v4 = tf.add(v3, vx, name="v4")
+  saver = tf.train.Saver([vx])
+  sess = tf.Session()
+  sess.run(tf.initialize_all_variables())
+  sess.run(vx.assign(tf.add(vx, vx)))
+  result = sess.run(v4, feed_dict={v1:12.0, v2:3.3})
+  print(result)
+  saver.save(sess, "./model_ex1")
+  ```
+
+  Later this model can be restored and contents loaded.
+
+  ```Python
+  # Restoring variables and running operations.
+  saver = tf.train.import_meta_graph("./model_ex1.meta")
+  sess = tf.Session()
+  saver.restore(sess, "./model_ex1")
+  result = sess.run("v4:0", feed_dict={"v1:0": 12.0, "v2:0": 3.3})
+  print(result)
+  ```
+
   Args:
     meta_graph_or_file: `MetaGraphDef` protocol buffer or filename (including
       the path) containing a `MetaGraphDef`.
@@ -1898,17 +1650,41 @@ def saver_from_object_based_checkpoint(
   if builder is None:
     builder = BulkSaverBuilder()
 
-  saveables = builder._ValidateAndSliceInputs(var_list)  # pylint: disable=protected-access
+  saveables = saveable_object_util.validate_and_slice_inputs(var_list)
+  current_names = set()
+  for saveable in saveables:
+    for spec in saveable.specs:
+      current_names.add(spec.name)
+  previous_names = set(names_to_keys.keys())
+  missing_names = current_names - previous_names
+  if missing_names:
+    extra_names = previous_names - current_names
+    intersecting_names = previous_names.intersection(current_names)
+    raise errors.NotFoundError(
+        None, None,
+        message=(
+            "\n\nExisting variables not in the checkpoint: %s\n\n"
+            "Variables names when this checkpoint was written which don't "
+            "exist now: %s\n\n"
+            "(%d variable name(s) did match)\n\n"
+            "Could not find some variables in the checkpoint (see names "
+            "above). Saver was attempting to load an object-based checkpoint "
+            "(saved using tf.train.Checkpoint or tf.keras.Model.save_weights) "
+            "using variable names. If the checkpoint was written with eager "
+            "execution enabled, it's possible that variable names have "
+            "changed (for example missing a '_1' suffix). It's also "
+            "possible that there are new variables which did not exist "
+            "when the checkpoint was written. You can construct a "
+            "Saver(var_list=...) with only the variables which previously "
+            "existed, and if variable names have changed you may need to "
+            "make this a dictionary with the old names as keys. If you're "
+            "using an Estimator, you'll need to return a tf.train.Saver "
+            "inside a tf.train.Scaffold from your model_fn.")
+        % (", ".join(sorted(missing_names)), ", ".join(sorted(extra_names)),
+           len(intersecting_names)))
   for saveable in saveables:
     for spec in saveable.specs:
-      if spec.name not in names_to_keys:
-        raise errors.NotFoundError(
-            None, None,
-            message=("Attempting to load an object-based checkpoint using "
-                     "variable names, but could not find %s in the "
-                     "checkpoint.") % spec.name)
       spec.name = names_to_keys[spec.name]
-
   if cached_saver is None:
     return Saver(saveables)
   return cached_saver
diff --git a/tensorflow/python/training/saver_large_partitioned_variable_test.py b/tensorflow/python/training/saver_large_partitioned_variable_test.py
index 1a44511cfeb99e350f8c3394fa51c5cfbf0f3b6c..84458836d0613ea632f3ffcd13315f4e7d7c3927 100644
--- a/tensorflow/python/training/saver_large_partitioned_variable_test.py
+++ b/tensorflow/python/training/saver_large_partitioned_variable_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import partitioned_variables
+from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import saver
@@ -44,8 +45,12 @@ class SaverLargePartitionedVariableTest(test.TestCase):
         # split into smaller sized variables.
         init = lambda shape, dtype, partition_info: constant_op.constant(
             True, dtype, shape)
-        partitioned_var = partitioned_variables.create_partitioned_variables(
-            [1 << 31], [4], init, dtype=dtypes.bool, name=var_name)
+        partitioned_var = list(variable_scope.get_variable(
+            var_name,
+            shape=[1 << 31],
+            partitioner=partitioned_variables.fixed_size_partitioner(4),
+            initializer=init,
+            dtype=dtypes.bool))
         variables.global_variables_initializer().run()
         save = saver.Saver(partitioned_var)
         val = save.save(sess, save_path)
diff --git a/tensorflow/python/training/saver_test.py b/tensorflow/python/training/saver_test.py
index eb2690985d5a18c62772c1eba66f9f0ddcbaa5d4..95c21cb815fd8cf9aa5e9efb98efd6be7108f51a 100644
--- a/tensorflow/python/training/saver_test.py
+++ b/tensorflow/python/training/saver_test.py
@@ -170,6 +170,7 @@ class SaverTest(test.TestCase):
   def testResourceBasic(self):
     self.basicSaveRestore(resource_variable_ops.ResourceVariable)
 
+  @test_util.run_deprecated_v1
   def testResourceColocation(self):
     partitioner = partitioned_variables.fixed_size_partitioner(num_shards=2)
     with ops_lib.device("/job:ps/device:GPU:0"):
@@ -227,7 +228,7 @@ class SaverTest(test.TestCase):
         w1 = resource_variable_ops.ResourceVariable(1.0, name="w1")
         w2 = resource_variable_ops.ResourceVariable(2.0, name="w2")
         graph_saver = saver_module.Saver([w1, w2])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.save(sess, graph_ckpt_prefix)
 
     with context.eager_mode():
@@ -260,7 +261,7 @@ class SaverTest(test.TestCase):
         w3 = resource_variable_ops.ResourceVariable(0.0, name="w3")
         w4 = resource_variable_ops.ResourceVariable(0.0, name="w4")
         graph_saver = saver_module.Saver([w3, w4])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         graph_saver.restore(sess, eager_ckpt_prefix)
         self.assertAllEqual(w3.eval(), 3.0)
         self.assertAllEqual(w4.eval(), 4.0)
@@ -300,6 +301,7 @@ class SaverTest(test.TestCase):
             not op.name.startswith("saver2/save/"))]
     self.assertEqual(ops_in_saver2_scope_but_not_save_scope, [])
 
+  @test_util.run_deprecated_v1
   def testSaveCopyRestoreWithSaveRelativePaths(self):
     """Save, copy checkpoint dir and restore from copied dir.
 
@@ -326,7 +328,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -369,6 +371,7 @@ class SaverTest(test.TestCase):
       self.assertEqual(b"k1", v2.keys().eval())
       self.assertEqual(30.0, v2.values().eval())
 
+  @test_util.run_deprecated_v1
   def testFilenameTensor(self):
     v0 = variables.VariableV1(0, name="v0")
     filename = b"somerandomfilename"
@@ -376,7 +379,7 @@ class SaverTest(test.TestCase):
     with self.cached_session() as sess:
       tensor = sess.graph.get_tensor_by_name(
           save.saver_def.filename_tensor_name)
-      self.assertEqual(sess.run(tensor), filename)
+      self.assertEqual(self.evaluate(tensor), filename)
 
   def testInvalidPath(self):
     v0 = variables.VariableV1(0, name="v0")
@@ -387,6 +390,7 @@ class SaverTest(test.TestCase):
             ValueError, "The passed save_path is not a valid checkpoint:"):
           save.restore(sess, "invalid path")
 
+  @test_util.run_v1_only("b/120545219")
   def testInt64(self):
     save_path = os.path.join(self.get_temp_dir(), "int64")
 
@@ -407,7 +411,7 @@ class SaverTest(test.TestCase):
 
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v" in e.message):
-        sess.run(v)
+        self.evaluate(v)
 
       # Restore the saved values in the parameter nodes.
       save.restore(sess, save_path)
@@ -462,6 +466,7 @@ class SaverTest(test.TestCase):
       # Verify non-duplicate names work.
       saver_module.Saver({"v0": v0, "v2": v2.saveable})
 
+  @test_util.run_v1_only("b/120545219")
   def testBasicsWithListOfVariables(self):
     save_path = os.path.join(self.get_temp_dir(), "basics_with_list")
 
@@ -497,10 +502,10 @@ class SaverTest(test.TestCase):
 
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v0" in e.message):
-        sess.run(v0)
+        self.evaluate(v0)
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
-        sess.run(v1)
+        self.evaluate(v1)
       self.assertEqual(0, len(v2.keys().eval()))
       self.assertEqual(0, len(v2.values().eval()))
 
@@ -557,6 +562,7 @@ class SaverTest(test.TestCase):
     # The cached readers should know to re-read the file.
     self._SaveAndLoad("var1", 1.1, 2.2, save_path)
 
+  @test_util.run_deprecated_v1
   def testAllowEmpty(self):
     save_path = os.path.join(self.get_temp_dir(), "allow_empty")
     with self.cached_session() as sess:
@@ -661,6 +667,7 @@ class SaverTest(test.TestCase):
       self.assertAllClose(1.0, one.eval())
       self.assertAllClose([2.0, 2.0, 2.0], twos.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testReshape(self):
     save_path = os.path.join(self.get_temp_dir(), "variables_reshape")
     with session.Session("", graph=ops_lib.Graph()) as sess:
@@ -719,6 +726,7 @@ class SaverTest(test.TestCase):
   def testSaveWithGlobalStepWithPadding(self):
     self.testSaveWithGlobalStep(pad_step_number=True)
 
+  @test_util.run_v1_only("b/120545219")
   def testSaveToNonexistingPath(self):
     file_io.write_string_to_file(
         os.path.join(self.get_temp_dir(), "actually_a_file"), "")
@@ -742,7 +750,7 @@ class SaverTest(test.TestCase):
       try:
         with self.cached_session() as sess:
           # Initialize all variables
-          sess.run(init_all_op)
+          self.evaluate(init_all_op)
 
           # Check that the parameter nodes have been initialized.
           self.assertEqual(10.0, v0.eval())
@@ -761,6 +769,7 @@ class SaverTest(test.TestCase):
         error_msg_template = "Parent directory of {} doesn't exist, can't save."
         self.assertEqual(error_msg_template.format(save_path), str(exc))
 
+  @test_util.run_deprecated_v1
   def testSaveToURI(self):
     # ParseURI functions don't work on Windows yet.
     # TODO(jhseu): Remove this check when it works.
@@ -777,7 +786,7 @@ class SaverTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initialize all variables
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
 
       # Check that the parameter nodes have been initialized.
       self.assertEqual(10.0, v0.eval())
@@ -824,11 +833,11 @@ class SaverTest(test.TestCase):
     save_graph = ops_lib.Graph()
     with save_graph.as_default(), self.session(graph=save_graph) as sess:
       orig_vars = _model()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       save = saver_module.Saver(max_to_keep=1)
       variables.global_variables_initializer().run()
       save.save(sess, save_dir)
-      orig_vals = sess.run(orig_vars)
+      orig_vals = self.evaluate(orig_vars)
 
     restore_graph = ops_lib.Graph()
     with restore_graph.as_default(), self.session(
@@ -836,7 +845,7 @@ class SaverTest(test.TestCase):
       restored_vars = _model()
       save = saver_module.Saver(max_to_keep=1)
       save.restore(sess, save_dir)
-      restored_vals = sess.run(restored_vars)
+      restored_vals = self.evaluate(restored_vars)
 
     for orig, restored in zip(orig_vals, restored_vals):
       self.assertAllEqual(orig, restored)
@@ -982,6 +991,7 @@ class SaveRestoreShardedTest(test.TestCase):
           checkpoint_management.latest_checkpoint(self.get_temp_dir()),
           os.path.join(self.get_temp_dir(), "sharded_basics"))
 
+  @test_util.run_deprecated_v1
   def testSaverDef(self):
     with self.cached_session():
       v0 = variables.VariableV1(123, name="v0")
@@ -998,19 +1008,12 @@ class SaveRestoreShardedTest(test.TestCase):
 
     call_saver_with_dict = False  # updated by test loop below
 
-    def _save(slices=None, partitioner=None):
+    def _save(partitioner=None):
       with self.session(graph=ops_lib.Graph()) as sess:
         # Calls .eval() to return the ndarray that makes up the full variable.
         rnd = random_ops.random_uniform(var_full_shape).eval()
 
-        if slices:
-          assert not partitioner
-          # TODO(apassos): make create_partitioned_variables take use_resource
-          # option to make this test passable without creating a named
-          # variable_scope.
-          vs = partitioned_variables.create_partitioned_variables(
-              var_full_shape, slices, rnd, name=var_name)
-        elif partitioner:
+        if partitioner:
           vs = [
               variable_scope.get_variable(
                   var_name,
@@ -1027,7 +1030,7 @@ class SaveRestoreShardedTest(test.TestCase):
 
         variables.global_variables_initializer().run()
         if call_saver_with_dict:
-          saver = saver_module.Saver({var_name: (vs if slices else vs[0])})
+          saver = saver_module.Saver({var_name: vs[0]})
         else:
           saver = saver_module.Saver(vs)
         actual_path = saver.save(sess, saved_path)
@@ -1035,16 +1038,9 @@ class SaveRestoreShardedTest(test.TestCase):
 
         return rnd
 
-    def _restore(slices=None, partitioner=None):
+    def _restore(partitioner=None):
       with self.session(graph=ops_lib.Graph()) as sess:
-        if slices:
-          assert not partitioner
-          new_vs = partitioned_variables.create_partitioned_variables(
-              var_full_shape,
-              slices,
-              array_ops.zeros(var_full_shape),  # != original contents.
-              name=var_name)
-        elif partitioner:
+        if partitioner:
           new_vs = [
               variable_scope.get_variable(
                   var_name,
@@ -1063,7 +1059,7 @@ class SaveRestoreShardedTest(test.TestCase):
         variables.global_variables_initializer().run()
         if call_saver_with_dict:
           saver = saver_module.Saver({
-              var_name: (new_vs if slices else new_vs[0])
+              var_name: new_vs[0]
           })
         else:
           saver = saver_module.Saver(new_vs)
@@ -1071,11 +1067,7 @@ class SaveRestoreShardedTest(test.TestCase):
 
         if partitioner:
           return new_vs[0].as_tensor().eval()
-        elif slices and slices[0] != 1:
-          return array_ops.concat(new_vs, 0).eval()
-        elif slices and slices[1] != 1:
-          return array_ops.concat(new_vs, 1).eval()
-        else:  # Non-sliced.
+        else:
           return new_vs[0].eval()
 
     for call_saver_with_dict in {False, True}:
@@ -1086,32 +1078,30 @@ class SaveRestoreShardedTest(test.TestCase):
       restored_full = _restore()
       self.assertAllEqual(saved_full, restored_full)
 
-      # Saves 10 horizontal parts of a partitioned variable.
-      # Restores into a full variable, non-sliced.
-      saved_full = _save(slices=[10, 1])
-      restored_full = _restore()
-      self.assertAllEqual(saved_full, restored_full)
-
-      # Restores into a different number/orientation of slices.
-      restored_full = _restore(slices=[2, 1])  # 2 horizon parts.
-      self.assertAllEqual(saved_full, restored_full)
-      restored_full = _restore(slices=[1, 3])  # 3 vertical parts.
+      # Restores into the same number of partitions.
+      restored_full = _restore(
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              num_shards=2))
       self.assertAllEqual(saved_full, restored_full)
 
-      # Restores into a PartitionedVariable
+      # Restores into a different number of partitions.
       restored_full = _restore(
           partitioner=partitioned_variables.fixed_size_partitioner(
-              num_shards=2))
+              num_shards=3))
       self.assertAllEqual(saved_full, restored_full)
 
-      # Now, saves a full variable and restores in slices.
+      # Now, saves a full variable and restores PartitionedVariable.
       saved_full = _save()
-      restored_full = _restore(slices=[1, 3])
+      restored_full = _restore(
+          partitioner=partitioned_variables.fixed_size_partitioner(
+              num_shards=3))
       self.assertAllEqual(saved_full, restored_full)
 
+  @test_util.run_deprecated_v1
   def testPartitionedVariable(self):
     self._testPartitionedVariables(use_resource=False)
 
+  @test_util.run_deprecated_v1
   def testPartitionedResourceVariable(self):
     self._testPartitionedVariables(use_resource=True)
 
@@ -1206,6 +1196,7 @@ class MaxToKeepTest(test.TestCase):
       # Deleted by the first helper.
       self.assertFalse(checkpoint_management.checkpoint_exists(s3))
 
+  @test_util.run_deprecated_v1
   def testNonSharded(self):
     save_dir = self._get_test_dir("max_to_keep_non_sharded")
 
@@ -1443,6 +1434,7 @@ class MaxToKeepTest(test.TestCase):
       self.assertTrue(
           gfile.Exists(checkpoint_management.meta_graph_filename(s3)))
 
+  @test_util.run_deprecated_v1
   def testNoMaxToKeep(self):
     save_dir = self._get_test_dir("no_max_to_keep")
     save_dir2 = self._get_test_dir("max_to_keep_0")
@@ -1471,6 +1463,7 @@ class MaxToKeepTest(test.TestCase):
       self.assertEqual([], save2.last_checkpoints)
       self.assertTrue(checkpoint_management.checkpoint_exists(s2))
 
+  @test_util.run_deprecated_v1
   def testNoMetaGraph(self):
     save_dir = self._get_test_dir("no_meta_graph")
 
@@ -1494,6 +1487,7 @@ class KeepCheckpointEveryNHoursTest(test.TestCase):
 
   @test_util.run_in_graph_and_eager_modes
   @test.mock.patch.object(saver_module, "time")
+  @test_util.run_deprecated_v1
   def testNonSharded(self, mock_time):
     save_dir = self._get_test_dir("keep_checkpoint_every_n_hours")
 
@@ -1613,6 +1607,7 @@ class SaveRestoreWithVariableNameMap(test.TestCase):
       self.assertEqual(20.0, self.evaluate(v1))
 
   @test_util.run_in_graph_and_eager_modes
+  @test_util.run_v1_only("b/120545219")
   def testNonReshapeResourceVariable(self):
     self._testNonReshape(resource_variable_ops.ResourceVariable)
 
@@ -1627,6 +1622,7 @@ class MetaGraphTest(test.TestCase):
     gfile.MakeDirs(test_dir)
     return test_dir
 
+  @test_util.run_v1_only("b/120545219")
   def testAddCollectionDef(self):
     test_dir = self._get_test_dir("good_collection")
     filename = os.path.join(test_dir, "metafile")
@@ -1769,18 +1765,20 @@ class MetaGraphTest(test.TestCase):
       self.assertEqual([], v1.get_shape())
       with self.assertRaisesWithPredicateMatch(
           errors_impl.OpError, lambda e: "uninitialized value v1" in e.message):
-        sess.run(v1)
+        self.evaluate(v1)
       # Retrieves saver1. Verifies that new_saver1 can restore v1.
       new_saver1 = savers[1]
       new_saver1.restore(sess, saver1_ckpt)
       v1 = sess.graph.get_tensor_by_name("v1:0")
       self.assertEqual(11.0, v1.eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testMultiSaverCollection(self):
     test_dir = self._get_test_dir("saver_collection")
     self._testMultiSaverCollectionSave(test_dir)
     self._testMultiSaverCollectionRestore(test_dir)
 
+  @test_util.run_v1_only("b/120545219")
   def testClearExtraneousSavers(self):
     test_dir = self._get_test_dir("clear_extraneous_savers")
     filename = os.path.join(test_dir, "metafile")
@@ -1835,6 +1833,7 @@ class MetaGraphTest(test.TestCase):
       self.assertEqual(33, len(meta_graph_def0.graph_def.node))
       self.assertEqual(21, len(meta_graph_def1.graph_def.node))
 
+  @test_util.run_deprecated_v1
   def testBinaryAndTextFormat(self):
     test_dir = self._get_test_dir("binary_and_text")
     filename = os.path.join(test_dir, "metafile")
@@ -1867,6 +1866,7 @@ class MetaGraphTest(test.TestCase):
                                                lambda e: "does not exist"):
         saver_module.import_meta_graph(filename)
 
+  @test_util.run_v1_only("b/120545219")
   def testSliceVariable(self):
     test_dir = self._get_test_dir("slice_saver")
     filename = os.path.join(test_dir, "metafile")
@@ -1949,9 +1949,9 @@ class MetaGraphTest(test.TestCase):
 
     with self.cached_session() as sess:
       # Initializes all the variables.
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Runs to logit.
-      sess.run(logits)
+      self.evaluate(logits)
       # Creates a saver.
       saver0 = saver_module.Saver()
       saver0.save(sess, saver0_ckpt)
@@ -1991,7 +1991,7 @@ class MetaGraphTest(test.TestCase):
       ops_lib.add_to_collection("train_op", train_op)
 
       # Runs train_op.
-      sess.run(train_op)
+      self.evaluate(train_op)
 
       # Generates MetaGraphDef.
       saver_module.export_meta_graph(train_filename)
@@ -2005,8 +2005,9 @@ class MetaGraphTest(test.TestCase):
       # Restores from checkpoint.
       new_saver.restore(sess, saver0_ckpt)
       train_op = ops_lib.get_collection("train_op")[0]
-      sess.run(train_op)
+      self.evaluate(train_op)
 
+  @test_util.run_deprecated_v1
   def testGraphExtension(self):
     test_dir = self._get_test_dir("graph_extension")
     self._testGraphExtensionSave(test_dir)
@@ -2037,8 +2038,8 @@ class MetaGraphTest(test.TestCase):
 
       # Generate a MetaGraphDef containing the while loop.
       with session.Session() as sess:
-        sess.run(init_op)
-        sess.run(output)
+        self.evaluate(init_op)
+        self.evaluate(output)
         saver = saver_module.Saver()
         saver.save(sess, saver_ckpt)
         saver.export_meta_graph(filename)
@@ -2053,8 +2054,8 @@ class MetaGraphTest(test.TestCase):
       no_constfold_config.graph_options.rewrite_options.constant_folding = (
           rewriter_config_pb2.RewriterConfig.OFF)
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        expected_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        expected_grad_value = self.evaluate(grad)
 
     # Restore the MetaGraphDef into a new Graph.
     with ops_lib.Graph().as_default():
@@ -2070,8 +2071,8 @@ class MetaGraphTest(test.TestCase):
       init_op = variables.global_variables_initializer()
 
       with session.Session(config=no_constfold_config) as sess:
-        sess.run(init_op)
-        actual_grad_value = sess.run(grad)
+        self.evaluate(init_op)
+        actual_grad_value = self.evaluate(grad)
         self.assertEqual(expected_grad_value, actual_grad_value)
 
   def _testWhileLoopAndGradientSerDes(self, outer_body_fn):
@@ -2092,6 +2093,7 @@ class MetaGraphTest(test.TestCase):
       return i + 1, x + r
     self._testWhileLoopAndGradientSerDes(body)
 
+  @test_util.run_deprecated_v1
   def testNestedControlFlowSerDes(self):
     # Test while loop in a cond in a while loop.
     # pylint: disable=g-long-lambda
@@ -2120,6 +2122,7 @@ class MetaGraphTest(test.TestCase):
                                       lambda: math_ops.multiply(x, -1.0))))
     # pylint: enable=g-long-lambda
 
+  @test_util.run_v1_only("b/120545219")
   def testStrippedOpListDef(self):
     with self.cached_session():
       # Creates a graph.
@@ -2157,6 +2160,7 @@ class MetaGraphTest(test.TestCase):
         self.assertEqual(o.summary, "")
         self.assertEqual(o.description, "")
 
+  @test_util.run_deprecated_v1
   def testStripDefaultValuedAttrs(self):
     """Verifies that default valued attrs are stripped, unless disabled."""
 
@@ -2193,6 +2197,7 @@ class MetaGraphTest(test.TestCase):
       self.assertIn("T", node_def.attr)
       self.assertIn("Tout", node_def.attr)
 
+  @test_util.run_deprecated_v1
   def testImportIntoNamescope(self):
     # Test that we can import a meta graph into a namescope.
     test_dir = self._get_test_dir("import_into_namescope")
@@ -2209,7 +2214,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2246,7 +2251,7 @@ class MetaGraphTest(test.TestCase):
 
       # Create a variable in graph_2 under scope "my_scope".
       variables.VariableV1(array_ops.zeros([10]), name="my_scope/my_var")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       # Restore the checkpoint into a different scope "subgraph_2".
       new_saver_2 = saver_module.import_meta_graph(
           filename + ".meta", graph=graph_2, import_scope="subgraph_2")
@@ -2263,6 +2268,7 @@ class MetaGraphTest(test.TestCase):
           filename + ".meta", graph=graph_2, import_scope="my_scope")
       self.assertIsInstance(new_saver_3, saver_module.Saver)
 
+  @test_util.run_deprecated_v1
   def testImportIntoImplicitNamescope(self):
     # Test that we can import a meta graph into an implicit namescope.
     test_dir = self._get_test_dir("import_into_namescope")
@@ -2279,7 +2285,7 @@ class MetaGraphTest(test.TestCase):
                                                       logits=logit, name="cost")
       adam.AdamOptimizer().minimize(cost, name="optimize")
       saver = saver_module.Saver()
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver.save(sess, filename)
 
     graph = ops_lib.Graph()
@@ -2316,12 +2322,12 @@ class MetaGraphTest(test.TestCase):
           meta_graph_def, clear_devices=False, import_scope="new_model")
       # Device refers to GPU, which is not available here.
       with self.assertRaises(errors_impl.InvalidArgumentError):
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(
           meta_graph_def, clear_devices=True, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2348,7 +2354,7 @@ class MetaGraphTest(test.TestCase):
 
     with session.Session(graph=ops_lib.Graph()) as sess:
       saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       sess.run(["new_model/optimize"], {
           "new_model/image:0": np.random.random([1, 784]),
           "new_model/label:0": np.random.randint(
@@ -2358,7 +2364,7 @@ class MetaGraphTest(test.TestCase):
   def testPreserveDatasetAndFunctions(self):
     with ops_lib.Graph().as_default() as g:
       dataset = dataset_ops.Dataset.range(10).map(lambda x: x * x)
-      iterator = dataset.make_one_shot_iterator()
+      iterator = dataset_ops.make_one_shot_iterator(dataset)
       next_element = iterator.get_next()
       _ = array_ops.identity(next_element, name="output")
 
@@ -2374,7 +2380,7 @@ class MetaGraphTest(test.TestCase):
                            meta_graph_def_from_graph_def]:
       with session.Session(graph=ops_lib.Graph()) as sess:
         saver_module.import_meta_graph(meta_graph_def, import_scope="new_model")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         for i in range(10):
           self.assertEqual(i * i, sess.run("new_model/output:0"))
         with self.assertRaises(errors.OutOfRangeError):
@@ -2385,6 +2391,7 @@ class CheckpointReaderTest(test.TestCase):
 
   _WRITE_VERSION = saver_pb2.SaverDef.V1
 
+  @test_util.run_deprecated_v1
   def testDebugString(self):
     # Builds a graph.
     v0 = variables.VariableV1(
@@ -2400,7 +2407,7 @@ class CheckpointReaderTest(test.TestCase):
     save_path = os.path.join(self.get_temp_dir(),
                              "ckpt_for_debug_string" + str(self._WRITE_VERSION))
     with self.cached_session() as sess:
-      sess.run(init_all_op)
+      self.evaluate(init_all_op)
       # Saves a checkpoint.
       save.save(sess, save_path)
 
@@ -2546,7 +2553,7 @@ class ScopedGraphTest(test.TestCase):
       self.assertEqual(["biases:0", "weights:0"], sorted(var_list.keys()))
 
     with self.session(graph=graph) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.save(sess, os.path.join(test_dir, ckpt_filename), write_state=False)
 
@@ -2609,13 +2616,14 @@ class ScopedGraphTest(test.TestCase):
       saver = saver_module.Saver(var_list=var_list, max_to_keep=1)
       saver.restore(sess, os.path.join(test_dir, ckpt_filename))
       # Verify that we have restored weights1 and biases1.
-      sess.run([weights1, biases1])
+      self.evaluate([weights1, biases1])
       # Initialize the rest of the variables and run logits.
-      sess.run(init_rest_op)
-      sess.run(logits)
+      self.evaluate(init_rest_op)
+      self.evaluate(logits)
 
   # Verifies that we can save the subgraph under "hidden1" and restore it
   # into "new_hidden1" in the new graph.
+  @test_util.run_deprecated_v1
   def testScopedSaveAndRestore(self):
     test_dir = self._get_test_dir("scoped_export_import")
     ckpt_filename = "ckpt"
@@ -2625,6 +2633,7 @@ class ScopedGraphTest(test.TestCase):
 
   # Verifies that we can copy the subgraph under "hidden1" and copy it
   # to different name scope in the same graph or different graph.
+  @test_util.run_deprecated_v1
   def testCopyScopedGraph(self):
     test_dir = self._get_test_dir("scoped_copy")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
@@ -2640,7 +2649,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2681,6 +2690,7 @@ class ScopedGraphTest(test.TestCase):
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
 
+  @test_util.run_deprecated_v1
   def testExportGraphDefWithScope(self):
     test_dir = self._get_test_dir("export_graph_def")
     saver0_ckpt = os.path.join(test_dir, "saver0.ckpt")
@@ -2696,7 +2706,7 @@ class ScopedGraphTest(test.TestCase):
 
     # Run the graph and save scoped checkpoint.
     with self.session(graph=graph1) as sess:
-      sess.run(variables.global_variables_initializer())
+      self.evaluate(variables.global_variables_initializer())
       _, var_list_1 = meta_graph.export_scoped_meta_graph(
           graph_def=graph1.as_graph_def(), export_scope="hidden1")
       saver = saver_module.Saver(var_list=var_list_1, max_to_keep=1)
@@ -2717,6 +2727,7 @@ class ScopedGraphTest(test.TestCase):
       saver3.restore(sess, saver0_ckpt)
       self.assertAllClose(expected, sess.run("new_hidden1/relu:0"))
 
+  @test_util.run_deprecated_v1
   def testSerializeSaverWithScope(self):
     test_dir = self._get_test_dir("export_graph_def")
     saver1_ckpt = os.path.join(test_dir, "saver1.ckpt")
@@ -2964,7 +2975,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
     a_saver = saver_module.Saver([a])
     b_saver = saver_module.Saver([b])
     with self.cached_session() as sess:
-      sess.run(a.initializer)
+      self.evaluate(a.initializer)
       save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
       with self.assertRaisesRegexp(
           errors.NotFoundError, "Key b not found in checkpoint"):
@@ -2977,6 +2988,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       # exception" block in Python 3.
       self.assertNotIn("NewCheckpointReader", cs.exception.message)
 
+  @test_util.run_v1_only("b/120545219")
   def testGraphChangedForRestoreErrorRaised(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -2986,7 +2998,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       a_saver = saver_module.Saver([a])
 
       with self.session(graph=g) as sess:
-        sess.run(a.initializer)
+        self.evaluate(a.initializer)
         save_path = a_saver.save(sess=sess, save_path=checkpoint_prefix)
 
     with ops_lib.Graph().as_default() as g:
@@ -2998,6 +3010,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
             "a mismatch between the current graph and the graph"):
           a_saver.restore(sess=sess, save_path=save_path)
 
+  @test_util.run_v1_only("b/120545219")
   def testLoadFromObjectBasedGraph(self):
     checkpoint_directory = self.get_temp_dir()
     checkpoint_prefix = os.path.join(checkpoint_directory, "ckpt")
@@ -3029,7 +3042,7 @@ class CheckpointableCompatibilityTests(test.TestCase):
       self.assertEqual(before_second_restore_ops,
                        restore_graph.get_operations())
       with self.assertRaisesRegexp(errors.NotFoundError,
-                                   "could not find a_variable"):
+                                   "Could not find some variables"):
         saver.restore(sess=sess, save_path=second_path)
 
   def testLoadFromObjectBasedEager(self):
diff --git a/tensorflow/python/training/saving/BUILD b/tensorflow/python/training/saving/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..67ccd59b88c289a11791c9098a2014c48e6c33fb
--- /dev/null
+++ b/tensorflow/python/training/saving/BUILD
@@ -0,0 +1,55 @@
+# Description:
+#   Low-level utilities for reading and writing checkpoints.
+
+package(
+    default_visibility = [
+        "//tensorflow:internal",
+    ],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+load("//tensorflow:tensorflow.bzl", "cuda_py_test")
+
+py_library(
+    name = "functional_saver",
+    srcs = ["functional_saver.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":saveable_object",
+        ":saveable_object_util",
+        "//tensorflow/python/eager:def_function",
+    ],
+)
+
+cuda_py_test(
+    name = "functional_saver_test",
+    size = "medium",
+    srcs = [
+        "functional_saver_test.py",
+    ],
+    additional_deps = [
+        ":functional_saver",
+        "//tensorflow/python/eager:test",
+    ],
+)
+
+py_library(
+    name = "saveable_object",
+    srcs = ["saveable_object.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "saveable_object_util",
+    srcs = ["saveable_object_util.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow/python:resource_variable_ops",
+        "//tensorflow/python:variables",
+        "//tensorflow/python/training/checkpointable:base",
+        "@six_archive//:six",
+    ],
+)
diff --git a/tensorflow/python/training/saving/functional_saver.py b/tensorflow/python/training/saving/functional_saver.py
new file mode 100644
index 0000000000000000000000000000000000000000..7eed3336626ef63942a40702f9787e6b5847b97b
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver.py
@@ -0,0 +1,101 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Saves and restore variables inside traced @tf.functions."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import control_flow_ops
+from tensorflow.python.ops import io_ops
+from tensorflow.python.training.saving import saveable_object
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class Saver(object):
+  """A minimal utility class for saving and restoring checkpoints.
+
+  Note that this is a low-level utility which stores Tensors in the keys
+  specified by `SaveableObject`s. Higher-level utilities for object-based
+  checkpointing are built on top of it.
+  """
+
+  def __init__(self, saveable_objects):
+    """Specify a list of `SaveableObject`s to save and restore.
+
+    Args:
+      saveable_objects: A list of `SaveableObject`s.
+    """
+    saveable_objects = list(saveable_objects)
+    for saveable in saveable_objects:
+      if not isinstance(saveable, saveable_object.SaveableObject):
+        raise ValueError(
+            "Saver expected a list of SaveableObjects, got %s." % (saveable,))
+    self._saveable_objects = saveable_objects
+
+  # TODO(b/120569892): Use tf.function here
+  def save(self, file_prefix):
+    """Save the saveable objects to a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix to
+        save under.
+    Returns:
+      A scalar string Tensor containing `file_prefix` with control dependencies
+      on the save ops.
+    """
+    tensor_names = []
+    tensors = []
+    tensor_slices = []
+    for saveable in self._saveable_objects:
+      for spec in saveable.specs:
+        tensor_names.append(spec.name)
+        tensors.append(spec.tensor)
+        tensor_slices.append(spec.slice_spec)
+    with ops.control_dependencies(
+        [io_ops.save_v2(file_prefix, tensor_names, tensor_slices, tensors)]):
+      return array_ops.identity(file_prefix)
+
+  # TODO(b/120569892): Use tf.function here
+  def restore(self, file_prefix):
+    """Restore the saveable objects from a checkpoint with `file_prefix`.
+
+    Args:
+      file_prefix: A string or scalar string Tensor containing the prefix for
+        files to read from.
+
+    Returns:
+      An operation which restores the `Saver`'s `SaveableObject`s when run, or
+      None if executing eagerly.
+    """
+    restore_ops = []
+    for saveable in self._saveable_objects:
+      if saveable.device:
+        device = saveable_object_util.set_cpu0(saveable.device)
+      else:
+        device = None
+      with ops.device(device):
+        tensors = []
+        for spec in saveable.specs:
+          tensors.append(
+              io_ops.restore_v2(
+                  file_prefix,
+                  [spec.name],
+                  [spec.slice_spec],
+                  [spec.dtype])[0])
+        restore_ops.append(saveable.restore(tensors, restored_shapes=None))
+    return control_flow_ops.group(restore_ops)
diff --git a/tensorflow/python/training/saving/functional_saver_test.py b/tensorflow/python/training/saving/functional_saver_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..40002255aacd4b3579bab6ea44bc9e5ee98f9177
--- /dev/null
+++ b/tensorflow/python/training/saving/functional_saver_test.py
@@ -0,0 +1,50 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =============================================================================
+"""Tests for the functional saver."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+
+from tensorflow.python.eager import test
+from tensorflow.python.framework import constant_op
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.training.saving import functional_saver
+from tensorflow.python.training.saving import saveable_object_util
+
+
+class SaverTest(test.TestCase):
+
+  def test_resource_variable(self):
+    v1 = resource_variable_ops.ResourceVariable(2.)
+    saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v1, "x"))
+    prefix = os.path.join(self.get_temp_dir(), "ckpt")
+    save_path = saver.save(constant_op.constant(prefix))
+    v1.assign(1.)
+    saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v1))
+
+    v2 = resource_variable_ops.ResourceVariable(3.)
+    second_saver = functional_saver.Saver(
+        saveable_object_util.saveable_objects_for_op(v2, "x"))
+    second_saver.restore(save_path)
+    self.assertEqual(2., self.evaluate(v2))
+
+
+if __name__ == "__main__":
+  test.main()
diff --git a/tensorflow/python/training/saveable_object.py b/tensorflow/python/training/saving/saveable_object.py
similarity index 100%
rename from tensorflow/python/training/saveable_object.py
rename to tensorflow/python/training/saving/saveable_object.py
diff --git a/tensorflow/python/training/saving/saveable_object_util.py b/tensorflow/python/training/saving/saveable_object_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa88d2c6ebd2f29c2d2de7583a918dcbc6b28b51
--- /dev/null
+++ b/tensorflow/python/training/saving/saveable_object_util.py
@@ -0,0 +1,340 @@
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Utilities for working with and creating SaveableObjects."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import six
+
+from tensorflow.python.eager import context
+from tensorflow.python.framework import device as pydev
+from tensorflow.python.framework import ops
+from tensorflow.python.ops import array_ops
+from tensorflow.python.ops import resource_variable_ops
+from tensorflow.python.ops import state_ops
+from tensorflow.python.ops import variables
+from tensorflow.python.training.checkpointable import base as checkpointable
+from tensorflow.python.training.saving import saveable_object
+
+
+# Op names which identify variable reads which should be saved.
+_VARIABLE_OPS = set(["Variable",
+                     "VariableV2",
+                     "AutoReloadVariable",
+                     "VarHandleOp",
+                     "ReadVariableOp"])
+
+
+def set_cpu0(device_string):
+  """Creates a new device string based on `device_string` but using /CPU:0.
+
+  If the device is already on /CPU:0, this is a no-op.
+
+  Args:
+    device_string: A device string.
+
+  Returns:
+    A device string.
+  """
+  parsed_device = pydev.DeviceSpec.from_string(device_string)
+  parsed_device.device_type = "CPU"
+  parsed_device.device_index = 0
+  return parsed_device.to_string()
+
+
+class ReferenceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles reference variables."""
+
+  def __init__(self, var, slice_spec, name):
+    spec = saveable_object.SaveSpec(var, slice_spec, name, dtype=var.dtype)
+    super(ReferenceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    return state_ops.assign(
+        self.op,
+        restored_tensor,
+        validate_shape=restored_shapes is None and
+        self.op.get_shape().is_fully_defined())
+
+
+class ResourceVariableSaveable(saveable_object.SaveableObject):
+  """SaveableObject implementation that handles ResourceVariables."""
+
+  def __init__(self, var, slice_spec, name):
+    self._var_device = var.device
+    self._var_shape = var.shape
+    if isinstance(var, ops.Tensor):
+      self.handle_op = var.op.inputs[0]
+      tensor = var
+    elif isinstance(var, resource_variable_ops.ResourceVariable):
+
+      def _read_variable_closure(v):
+        def f():
+          with ops.device(v.device):
+            x = v.read_value()
+            # To allow variables placed on non-CPU devices to be checkpointed,
+            # we copy them to CPU on the same machine first.
+            with ops.device("/device:CPU:0"):
+              return array_ops.identity(x)
+        return f
+
+      self.handle_op = var.handle
+      tensor = _read_variable_closure(var)
+    else:
+      raise ValueError(
+          "Saveable is neither a resource variable nor a read operation."
+          " Got: %s" % repr(var))
+    spec = saveable_object.SaveSpec(tensor, slice_spec, name,
+                                    dtype=var.dtype)
+    super(ResourceVariableSaveable, self).__init__(var, [spec], name)
+
+  def restore(self, restored_tensors, restored_shapes):
+    restored_tensor = restored_tensors[0]
+    if restored_shapes is not None:
+      restored_tensor = array_ops.reshape(restored_tensor, restored_shapes[0])
+    # Copy the restored tensor to the variable's device.
+    with ops.device(self._var_device):
+      restored_tensor = array_ops.identity(restored_tensor)
+      return resource_variable_ops.shape_safe_assign_variable_handle(
+          self.handle_op, self._var_shape, restored_tensor)
+
+
+def _tensor_comes_from_variable(v):
+  return isinstance(v, ops.Tensor) and v.op.type in _VARIABLE_OPS
+
+
+def saveable_objects_for_op(op, name):
+  """Create `SaveableObject`s from an operation.
+
+  Args:
+    op: A variable, operation, or SaveableObject to coerce into a
+      SaveableObject.
+    name: A string name for the SaveableObject.
+
+  Yields:
+    `SaveableObject`s which together save/restore `op`.
+
+  Raises:
+    TypeError: If `name` is not a string.
+    ValueError: For operations with no known conversion to SaveableObject.
+  """
+  if not isinstance(name, six.string_types):
+    raise TypeError(
+        "names_to_saveables must be a dict mapping string names to "
+        "checkpointable operations. Name is not a string: %s" % name)
+  if isinstance(op, saveable_object.SaveableObject):
+    yield op
+  elif isinstance(op, (list, tuple, variables.PartitionedVariable)):
+    if isinstance(op, variables.PartitionedVariable):
+      op = list(op)
+    # A set of slices.
+    slice_name = None
+    # pylint: disable=protected-access
+    for variable in op:
+      if not isinstance(variable, variables.Variable):
+        raise ValueError("Slices must all be Variables: %s" % variable)
+      if not variable._save_slice_info:
+        raise ValueError("Slices must all be slices: %s" % variable)
+      if slice_name is None:
+        slice_name = variable._save_slice_info.full_name
+      elif slice_name != variable._save_slice_info.full_name:
+        raise ValueError(
+            "Slices must all be from the same tensor: %s != %s" %
+            (slice_name, variable._save_slice_info.full_name))
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, variable._save_slice_info.spec, name)
+    # pylint: enable=protected-access
+  elif isinstance(op, checkpointable.CheckpointableBase) and not isinstance(
+      op, variables.Variable):
+    # pylint: disable=protected-access
+    for attr, factory in op._gather_saveables_for_checkpoint().items():
+      if attr == checkpointable.VARIABLE_VALUE_KEY:
+        # Keep original name for classes masquerading as variables.
+        full_name = name
+      else:
+        full_name = name + "_" + attr
+      op = (factory(full_name) if callable(factory) else factory)
+      for op in saveable_objects_for_op(op, op.name):
+        yield op
+    # pylint: enable=protected-access
+  else:
+    # A variable or tensor.
+    if isinstance(op, resource_variable_ops.ResourceVariable):
+      # pylint: disable=protected-access
+      if op._in_graph_mode:
+        variable = op._graph_element
+      else:
+        variable = op
+      # pylint: enable=protected-access
+      yield ResourceVariableSaveable(variable, "", name)
+    else:
+      with ops.init_scope():
+        if context.executing_eagerly():
+          raise ValueError("Can only save/restore ResourceVariables when "
+                           "executing eagerly, got type: %s." % type(op))
+
+      variable = ops.internal_convert_to_tensor(op, as_ref=True)
+      if not _tensor_comes_from_variable(variable):
+        raise TypeError("names_to_saveables must be a dict mapping string "
+                        "names to Tensors/Variables. Not a variable: %s" %
+                        variable)
+      if variable.op.type in ["Variable", "VariableV2",
+                              "AutoReloadVariable"]:
+        yield ReferenceVariableSaveable(variable, "", name)
+      else:
+        yield ResourceVariableSaveable(
+            variable, "", name)
+
+
+def op_list_to_dict(op_list, convert_variable_to_tensor=True):
+  """Create a dictionary of names to operation lists.
+
+  Args:
+    op_list: A list, tuple, or set of Variables or SaveableObjects.
+    convert_variable_to_tensor: Whether or not to convert single Variables
+      with no slice info into Tensors.
+
+  Returns:
+    A dictionary of names to the operations that must be saved under
+    that name.  Variables with save_slice_info are grouped together under the
+    same key in no particular order.
+
+  Raises:
+    TypeError: If the type of op_list or its elements is not supported.
+    ValueError: If at least two saveables share the same name.
+  """
+  if not isinstance(op_list, (list, tuple, set)):
+    raise TypeError("Variables to save should be passed in a dict or a "
+                    "list: %s" % op_list)
+  # When ResourceVariables are converted to Tensors, read ops are added to the
+  # graph. Sorting the op_list ensures that the resulting graph is always
+  # constructed in a deterministic way:
+  op_list = sorted(op_list, key=lambda x: x.name)
+  names_to_saveables = {}
+  # pylint: disable=protected-access
+  for var in op_list:
+    if isinstance(var, saveable_object.SaveableObject):
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.PartitionedVariable):
+      if var.name in names_to_saveables:
+        raise ValueError("At least two variables have the same name: %s" %
+                         var.name)
+      names_to_saveables[var.name] = var
+    elif isinstance(var, variables.Variable) and var._save_slice_info:
+      name = var._save_slice_info.full_name
+      if name in names_to_saveables:
+        if not isinstance(names_to_saveables[name], list):
+          raise ValueError("Mixing slices and non-slices with the same name: "
+                           "%s" % name)
+        names_to_saveables[name].append(var)
+      else:
+        names_to_saveables[name] = [var]
+    elif (isinstance(var, checkpointable.CheckpointableBase)
+          and not isinstance(var, variables.Variable)):
+      checkpointable_saveables = [
+          (factory() if callable(factory) else factory)
+          for factory in var._gather_saveables_for_checkpoint().values()]
+      names_to_saveables.update(
+          op_list_to_dict(checkpointable_saveables))
+    else:
+      if context.executing_eagerly():
+        if not isinstance(var, resource_variable_ops.ResourceVariable):
+          raise ValueError(
+              "Can only save/restore ResourceVariables when eager execution "
+              "is enabled, type: %s." % type(var))
+        set_var = names_to_saveables.setdefault(var._shared_name, var)
+        if set_var is not var:
+          raise ValueError(
+              ("Two different ResourceVariable objects with the same "
+               "shared_name '%s' were passed to the Saver. This likely means "
+               "that they were created in different Graphs or isolation "
+               "contexts, and may not be checkpointed together.") %
+              (var._shared_name,))
+      else:
+        if convert_variable_to_tensor:
+          if isinstance(var, resource_variable_ops.ResourceVariable):
+            var = var._graph_element  # pylint: disable=protected-access
+          else:
+            var = ops.internal_convert_to_tensor(var, as_ref=True)
+          if not _tensor_comes_from_variable(var):
+            raise TypeError("Variable to save is not a Variable: %s" % var)
+        if var.op.type == "ReadVariableOp":
+          name = var.op.inputs[0].op.name
+        else:
+          name = var.op.name
+        if name in names_to_saveables:
+          raise ValueError("At least two variables have the same name: %s" %
+                           name)
+        names_to_saveables[name] = var
+
+    # pylint: enable=protected-access
+  return names_to_saveables
+
+
+def _add_saveable(saveables, seen_ops, saveable):
+  """Adds the saveable to the saveables list.
+
+  Args:
+    saveables: List to append the SaveableObject to.
+    seen_ops: Set of the ops of the saveables already processed.  Used to
+      check that each saveable is only saved once.
+    saveable: The saveable.
+
+  Raises:
+    ValueError: If the saveable has already been processed.
+  """
+  if saveable.op in seen_ops:
+    raise ValueError("The same saveable will be restored with two names: %s" %
+                     saveable.name)
+  saveables.append(saveable)
+  seen_ops.add(saveable.op)
+
+
+def validate_and_slice_inputs(names_to_saveables):
+  """Returns the variables and names that will be used for a Saver.
+
+  Args:
+    names_to_saveables: A dict (k, v) where k is the name of an operation and
+       v is an operation to save or a BaseSaverBuilder.Saver.
+
+  Returns:
+    A list of SaveableObjects.
+
+  Raises:
+    TypeError: If any of the keys are not strings or any of the
+      values are not one of Tensor or Variable or a checkpointable operation.
+    ValueError: If the same operation is given in more than one value
+      (this also applies to slices of SlicedVariables).
+  """
+  if not isinstance(names_to_saveables, dict):
+    names_to_saveables = op_list_to_dict(names_to_saveables)
+
+  saveables = []
+  seen_ops = set()
+  for name, op in sorted(names_to_saveables.items(),
+                         # Avoid comparing ops, sort only by name.
+                         key=lambda x: x[0]):
+    for converted_saveable_object in saveable_objects_for_op(op, name):
+      _add_saveable(saveables, seen_ops, converted_saveable_object)
+  return saveables
diff --git a/tensorflow/python/training/server_lib_multiple_containers_test.py b/tensorflow/python/training/server_lib_multiple_containers_test.py
index f599e9b55b9f8d6ac3b66a9c72e6c33c7b127c58..fb6118942bdb7332a1f96a25927d4958796b6ba2 100644
--- a/tensorflow/python/training/server_lib_multiple_containers_test.py
+++ b/tensorflow/python/training/server_lib_multiple_containers_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -33,6 +34,7 @@ class MultipleContainersTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testMultipleContainers(self):
     with ops.container("test0"):
       v0 = variables.Variable(1.0, name="v0")
diff --git a/tensorflow/python/training/server_lib_same_variables_clear_container_test.py b/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
index 11e6f28ab05b5d7e7ca8b90a0407f1dbdb283738..e0ab21bbd979ab8c7e6d825573c584325bcdaf7b 100644
--- a/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_clear_container_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
@@ -32,6 +33,7 @@ class SameVariablesClearContainerTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSameVariablesClearContainer(self):
     # Starts two servers with different names so they map to different
     # resource "containers".
@@ -60,9 +62,9 @@ class SameVariablesClearContainerTest(test.TestCase):
     session.Session.reset(server0.target, ["local0"])
     sess = session.Session(server0.target)
     with self.assertRaises(errors_impl.FailedPreconditionError):
-      sess.run(v0)
+      self.evaluate(v0)
     # Reinitializes v0 for the following test.
-    sess.run(v0.initializer)
+    self.evaluate(v0.initializer)
 
     # Verifies that v1 is still valid.
     self.assertAllEqual(2.0, sess_1.run(v1))
@@ -71,10 +73,10 @@ class SameVariablesClearContainerTest(test.TestCase):
     session.Session.reset(server1.target, ["local1"])
     sess = session.Session(server1.target)
     with self.assertRaises(errors_impl.FailedPreconditionError):
-      sess.run(v1)
+      self.evaluate(v1)
     # Verifies that v0 is still valid.
     sess = session.Session(server0.target)
-    self.assertAllEqual(1.0, sess.run(v0))
+    self.assertAllEqual(1.0, self.evaluate(v0))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/server_lib_same_variables_clear_test.py b/tensorflow/python/training/server_lib_same_variables_clear_test.py
index 4682f1ab84d719cafd1d94669a9ee3ca5f1797fc..7b147af6c55894575e4f98436daaa3f3f33bd16c 100644
--- a/tensorflow/python/training/server_lib_same_variables_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_clear_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import errors_impl
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -32,6 +33,7 @@ class SameVariablesClearTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSameVariablesClear(self):
     server = server_lib.Server.create_local_server()
 
diff --git a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
index 5aa7f45c2b350a795016ed645a981e34f7626561..ff3fab9f372aecae28adf84a3d800759e3487665 100644
--- a/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
+++ b/tensorflow/python/training/server_lib_same_variables_no_clear_test.py
@@ -20,6 +20,7 @@ from __future__ import print_function
 
 from tensorflow.python.client import session
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import math_ops
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -33,6 +34,7 @@ class SameVariablesNoClearTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_v1_only("b/120545219")
   def testSameVariablesNoClear(self):
     server = server_lib.Server.create_local_server()
 
diff --git a/tensorflow/python/training/server_lib_sparse_job_test.py b/tensorflow/python/training/server_lib_sparse_job_test.py
index 1a6b44b90e8d4d4c3faf9f0ac596942a7ff3d09f..93b06e621608f0754fd4560ec4faa6c530209ac7 100644
--- a/tensorflow/python/training/server_lib_sparse_job_test.py
+++ b/tensorflow/python/training/server_lib_sparse_job_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.client import session
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.training import server_lib
 
@@ -30,13 +31,14 @@ class SparseJobTest(test.TestCase):
   # TODO(b/34465411): Starting multiple servers with different configurations
   # in the same test is flaky. Move this test case back into
   # "server_lib_test.py" when this is no longer the case.
+  @test_util.run_deprecated_v1
   def testSparseJob(self):
     server = server_lib.Server({"local": {37: "localhost:0"}})
     with ops.device("/job:local/task:37"):
       a = constant_op.constant(1.0)
 
     with session.Session(server.target) as sess:
-      self.assertEqual(1.0, sess.run(a))
+      self.assertEqual(1.0, self.evaluate(a))
 
 
 if __name__ == "__main__":
diff --git a/tensorflow/python/training/server_lib_test.py b/tensorflow/python/training/server_lib_test.py
index cf995707fc56448e7fe5354d162581947604f382..92cdc1c4ad0832fc3f8593bebabe76d4e6dc0cc0 100644
--- a/tensorflow/python/training/server_lib_test.py
+++ b/tensorflow/python/training/server_lib_test.py
@@ -29,6 +29,7 @@ from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import data_flow_ops
 from tensorflow.python.ops import math_ops
@@ -55,6 +56,7 @@ class GrpcServerTest(test.TestCase):
       self.assertAllEqual([[4]], sess.run(e))
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
+  @test_util.run_v1_only("b/120545219")
   def testMultipleSessions(self):
     server = self._cached_server
 
@@ -73,6 +75,7 @@ class GrpcServerTest(test.TestCase):
     # TODO(mrry): Add `server.stop()` and `server.join()` when these work.
 
   # Verifies various reset failures.
+  @test_util.run_v1_only("b/120545219")
   def testResetFails(self):
     # Creates variable with container name.
     with ops.container("test0"):
@@ -146,6 +149,7 @@ class GrpcServerTest(test.TestCase):
       self.assertEqual(0.5, min_val)
       self.assertEqual(0.5, max_val)
 
+  @test_util.run_v1_only("b/120545219")
   def testCloseCancelsBlockingOperation(self):
     server = self._cached_server
     sess = session.Session(server.target, config=self._useRPCConfig())
@@ -174,7 +178,7 @@ class GrpcServerTest(test.TestCase):
     # is not supported, but it should successfully ignore it.
     sess = session.InteractiveSession(server.target)
     c = constant_op.constant(42.0)
-    self.assertEqual(42.0, c.eval())
+    self.assertEqual(42.0, self.evaluate(c))
     sess.close()
 
   def testSetConfiguration(self):
@@ -207,6 +211,7 @@ class GrpcServerTest(test.TestCase):
               "local": ["localhost"]
           }, job_name="local", task_index=0)
 
+  @test_util.run_v1_only("b/120545219")
   def testTimeoutRaisesException(self):
     server = self._cached_server
     q = data_flow_ops.FIFOQueue(1, [dtypes.float32])
@@ -241,6 +246,7 @@ class GrpcServerTest(test.TestCase):
       queue_runner_impl.start_queue_runners(sess)
       sess.run(var.assign(3.0))
 
+  @test_util.run_v1_only("b/120545219")
   def testIsolateSessionState(self):
     server = self._cached_server
 
@@ -296,6 +302,7 @@ class GrpcServerTest(test.TestCase):
     self.assertAllEqual(37, isolate_sess_0.run(v))
     self.assertAllEqual([19, 86], isolate_sess_1.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testShapeChangingIsolateState(self):
     server = self._cached_server
     sharing_config = config_pb2.ConfigProto(isolate_session_state=False)
diff --git a/tensorflow/python/training/session_manager.py b/tensorflow/python/training/session_manager.py
index cd313c2ce053cdecd6b7856f55eb8969d31eac5a..0f68fcfe8bb4cb81e54ba27d35bfb0b2e3888a1b 100644
--- a/tensorflow/python/training/session_manager.py
+++ b/tensorflow/python/training/session_manager.py
@@ -25,7 +25,6 @@ from tensorflow.python.framework import errors
 from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_management
-from tensorflow.python.training import distribution_strategy_context
 from tensorflow.python.util.tf_export import tf_export
 
 
@@ -46,7 +45,7 @@ def _maybe_name(obj):
     return "<no name for %s>" % type(obj)
 
 
-@tf_export("train.SessionManager")
+@tf_export(v1=["train.SessionManager"])
 class SessionManager(object):
   """Training helper that restores from checkpoint and creates session.
 
@@ -183,12 +182,6 @@ class SessionManager(object):
     """
     self._target = master
     sess = session.Session(self._target, graph=self._graph, config=config)
-    # TODO(jhseu): Delete once tpu.initialize_system() goes away.
-    initialize_ops = (
-        distribution_strategy_context.get_distribution_strategy().initialize()
-    )
-    if initialize_ops:
-      sess.run(initialize_ops)
 
     if checkpoint_dir and checkpoint_filename_with_path:
       raise ValueError("Can not provide both checkpoint_dir and "
diff --git a/tensorflow/python/training/session_manager_test.py b/tensorflow/python/training/session_manager_test.py
index 2b5c3b01defeedd6f59fd9b1eee9385d3101b584..c9a0c56ffc1e78f1f654b4ec224bf8480d53ad9b 100644
--- a/tensorflow/python/training/session_manager_test.py
+++ b/tensorflow/python/training/session_manager_test.py
@@ -25,6 +25,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
 from tensorflow.python.ops import variables
@@ -68,6 +69,7 @@ class SessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -152,6 +154,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -184,6 +187,7 @@ class SessionManagerTest(test.TestCase):
           checkpoint_filename_with_path=checkpoint_management.latest_checkpoint(
               checkpoint_dir))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
@@ -206,6 +210,7 @@ class SessionManagerTest(test.TestCase):
               variables.global_variables()),
           local_init_op=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(),
@@ -259,6 +264,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(v))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionWithReadyForLocalInitOpFailsToReadyLocal(self):
     # We use ready_for_local_init_op=tf.report_uninitialized_variables(),
     # which causes recover_session to not run local_init_op, and to return
@@ -315,6 +321,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionNoChkptStillRunsLocalInitOp(self):
     # This test checks for backwards compatibility.
     # In particular, we continue to ensure that recover_session will execute
@@ -343,6 +350,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSessionFailsStillRunsLocalInitOp(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(
@@ -386,6 +394,7 @@ class SessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("w:0")).eval(session=sess))
       self.assertEquals(1, sess.run(w))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionLocalInit(self):
     server = server_lib.Server.create_local_server()
     with ops.Graph().as_default() as graph:
@@ -437,6 +446,7 @@ class SessionManagerTest(test.TestCase):
         # because of overly restrictive ready_for_local_init_op
         sm.wait_for_session("", max_wait_secs=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default() as graph:
       v = variables.VariableV1(1, name="v")
@@ -454,6 +464,7 @@ class SessionManagerTest(test.TestCase):
                                  "Session was not ready after waiting.*"):
       sm.wait_for_session("", max_wait_secs=3)
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithReadyForLocalInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -493,6 +504,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w))
       self.assertEquals(3, sess.run(x))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithPartialInitOp(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -559,6 +571,7 @@ class SessionManagerTest(test.TestCase):
       self.assertEquals(1, sess.run(w_res))
       self.assertEquals(3, sess.run(x_res))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithCyclicInitializer(self):
     # Regression test. Previously Variable._build_initializer_expr would enter
     # into an infinite recursion when the variable's initial_value involved
@@ -632,6 +645,7 @@ class SessionManagerTest(test.TestCase):
           "Init operations did not make model ready for local_init"):
         sm2.prepare_session("", init_op=None)
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionWithInsufficientReadyForLocalInitCheck(self):
     with ops.Graph().as_default():
       v = variables.VariableV1(1, name="v")
@@ -684,6 +698,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           "", init_fn=lambda sess: sess.run(v.initializer))
       self.assertAllClose([125], sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testPrepareSessionFails(self):
     checkpoint_dir = os.path.join(self.get_temp_dir(), "prepare_session")
     checkpoint_dir2 = os.path.join(self.get_temp_dir(), "prepare_session2")
@@ -745,6 +760,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
           variables.is_variable_initialized(
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
 
+  @test_util.run_v1_only("b/120545219")
   def testRecoverSession(self):
     # Create a checkpoint.
     checkpoint_dir = os.path.join(self.get_temp_dir(), "recover_session")
@@ -783,6 +799,7 @@ class ObsoleteSessionManagerTest(test.TestCase):
               sess.graph.get_tensor_by_name("v:0")).eval(session=sess))
       self.assertEquals(1, sess.run(v))
 
+  @test_util.run_v1_only("b/120545219")
   def testWaitForSessionReturnsNoneAfterTimeout(self):
     with ops.Graph().as_default():
       variables.VariableV1(1, name="v")
diff --git a/tensorflow/python/training/session_run_hook.py b/tensorflow/python/training/session_run_hook.py
index 5daea9312886599f4119b088096434a8b2a258de..e9a61def7430fec0190c8f7b788fd7b72492e432 100644
--- a/tensorflow/python/training/session_run_hook.py
+++ b/tensorflow/python/training/session_run_hook.py
@@ -186,7 +186,7 @@ class SessionRunHook(object):
     pass
 
 
-@tf_export("train.SessionRunArgs")
+@tf_export(v1=["train.SessionRunArgs"])
 class SessionRunArgs(
     collections.namedtuple("SessionRunArgs",
                            ["fetches", "feed_dict", "options"])):
@@ -211,7 +211,7 @@ class SessionRunArgs(
     return super(SessionRunArgs, cls).__new__(cls, fetches, feed_dict, options)
 
 
-@tf_export("train.SessionRunContext")
+@tf_export(v1=["train.SessionRunContext"])
 class SessionRunContext(object):
   """Provides information about the `session.run()` call being made.
 
@@ -263,7 +263,7 @@ class SessionRunContext(object):
     self._stop_requested = True
 
 
-@tf_export("train.SessionRunValues")
+@tf_export(v1=["train.SessionRunValues"])
 class SessionRunValues(
     collections.namedtuple("SessionRunValues",
                            ["results", "options", "run_metadata"])):
diff --git a/tensorflow/python/training/slot_creator.py b/tensorflow/python/training/slot_creator.py
index d76b22acd86956e9b7bbd768299e3db7f630a4d5..ecf5a96ed49146fe4cafce6a809925aab5bdc6fb 100644
--- a/tensorflow/python/training/slot_creator.py
+++ b/tensorflow/python/training/slot_creator.py
@@ -39,13 +39,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.eager import context
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import init_ops
 from tensorflow.python.ops import resource_variable_ops
 from tensorflow.python.ops import variable_scope
 from tensorflow.python.ops import variables
-from tensorflow.python.training import distribution_strategy_context
 
 
 def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
diff --git a/tensorflow/python/training/slot_creator_test.py b/tensorflow/python/training/slot_creator_test.py
index 6d6364169fd4b9afa6f64fb9aadc283aab261cbb..f1f0d58a6913a542093ada7a948969f47928a43b 100644
--- a/tensorflow/python/training/slot_creator_test.py
+++ b/tensorflow/python/training/slot_creator_test.py
@@ -21,6 +21,7 @@ from __future__ import print_function
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import random_ops
 from tensorflow.python.ops import variable_scope
@@ -31,6 +32,7 @@ from tensorflow.python.training import slot_creator
 
 class SlotCreatorTest(test.TestCase):
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariable(self):
     with self.cached_session():
       v = variables.Variable([1.0, 2.5], name="var")
@@ -41,8 +43,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([1.0, 2.5], slot.eval())
+      self.assertAllEqual([1.0, 2.5], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateSlotFromTensor(self):
     with self.cached_session():
       v = constant_op.constant([1.0, 2.5], name="const")
@@ -53,8 +56,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([2.0, 5.0], slot.eval())
+      self.assertAllEqual([2.0, 5.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromVariable(self):
     with self.cached_session():
       v = variables.Variable([1.0, 2.5], name="var")
@@ -67,8 +71,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateZerosSlotFromDynamicShapedVariable(self):
     with self.cached_session():
       dyn_shape = constant_op.constant([2], dtype=dtypes.int32)
@@ -88,8 +93,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("var/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromTensor(self):
     with self.cached_session():
       v = constant_op.constant([1.0, 2.5], name="const")
@@ -101,8 +107,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], slot.get_shape().as_list())
       self.assertEqual(dtypes.float32, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_deprecated_v1
   def testCreateZerosSlotFromDynamicShapedTensor(self):
     with self.cached_session():
       v = random_ops.random_uniform([2], dtype=dtypes.float64)
@@ -116,8 +123,9 @@ class SlotCreatorTest(test.TestCase):
       self.assertEqual("const/slot", slot.op.name)
       self.assertEqual([2], array_ops.shape(slot).eval())
       self.assertEqual(dtypes.float64, slot.dtype.base_dtype)
-      self.assertAllEqual([0.0, 0.0], slot.eval())
+      self.assertAllEqual([0.0, 0.0], self.evaluate(slot))
 
+  @test_util.run_v1_only("b/120545219")
   def testCreateSlotFromVariableRespectsScope(self):
     # See discussion on #2740.
     with self.cached_session():
diff --git a/tensorflow/python/training/supervisor.py b/tensorflow/python/training/supervisor.py
index a5e626d3204f2e9a2993c07df6044ba99df0f68f..de60dd456ff81884398ba16abd03bdfde267d6f4 100644
--- a/tensorflow/python/training/supervisor.py
+++ b/tensorflow/python/training/supervisor.py
@@ -40,7 +40,7 @@ from tensorflow.python.util import deprecation
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.Supervisor")
+@tf_export(v1=["train.Supervisor"])
 class Supervisor(object):
   """A training helper that checkpoints models and computes summaries.
 
diff --git a/tensorflow/python/training/supervisor_test.py b/tensorflow/python/training/supervisor_test.py
index 7cd99d86801e659b369419796848babb49ac9ff4..180ddb52876635c584a12aad26c3703f0fae9d9a 100644
--- a/tensorflow/python/training/supervisor_test.py
+++ b/tensorflow/python/training/supervisor_test.py
@@ -35,6 +35,7 @@ from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import errors_impl
 from tensorflow.python.framework import meta_graph
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import io_ops
 from tensorflow.python.ops import parsing_ops
@@ -100,7 +101,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       sess = sv.prepare_or_wait_for_session("")
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -111,7 +112,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir=logdir)
       with sv.managed_session("") as sess:
         for _ in xrange(10):
-          sess.run(my_op)
+          self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
 
@@ -128,7 +129,7 @@ class SupervisorTest(test.TestCase):
             if step == 1:
               raise RuntimeError("failing here")
             else:
-              sess.run(my_op)
+              self.evaluate(my_op)
       # Supervisor has been stopped.
       self.assertTrue(sv.should_stop())
       self.assertEqual(1, last_step)
@@ -146,7 +147,7 @@ class SupervisorTest(test.TestCase):
             raise errors_impl.OutOfRangeError(my_op.op.node_def, my_op.op,
                                               "all done")
           else:
-            sess.run(my_op)
+            self.evaluate(my_op)
       # Supervisor has been stopped.  OutOfRangeError was not thrown.
       self.assertTrue(sv.should_stop())
       self.assertEqual(3, last_step)
@@ -335,7 +336,7 @@ class SupervisorTest(test.TestCase):
       sess = sv.prepare_or_wait_for_session(
           "", config=config_pb2.ConfigProto(device_count={"CPU": 2}))
       for _ in xrange(10):
-        sess.run(my_op)
+        self.evaluate(my_op)
       sess.close()
       sv.stop()
 
@@ -420,6 +421,7 @@ class SupervisorTest(test.TestCase):
       with self.assertRaisesRegexp(RuntimeError, "requires a summary writer"):
         sv.summary_computed(sess, sess.run(summ))
 
+  @test_util.run_v1_only("b/120545219")
   def testLogdirButExplicitlyNoSummaryWriter(self):
     logdir = self._test_dir("explicit_no_summary_writer")
     with ops.Graph().as_default():
@@ -505,6 +507,7 @@ class SupervisorTest(test.TestCase):
       sv = supervisor.Supervisor(logdir="", session_manager=sm)
       sv.prepare_or_wait_for_session("")
 
+  @test_util.run_v1_only("b/120545219")
   def testInitOp(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -514,6 +517,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitFn(self):
     logdir = self._test_dir("default_init_op")
     with ops.Graph().as_default():
@@ -527,6 +531,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testInitOpWithFeedDict(self):
     logdir = self._test_dir("feed_dict_init_op")
     with ops.Graph().as_default():
@@ -540,6 +545,7 @@ class SupervisorTest(test.TestCase):
       self.assertAllClose([1.0, 2.0, 3.0], sess.run(v))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOp(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("default_ready_for_local_init_op")
@@ -582,6 +588,7 @@ class SupervisorTest(test.TestCase):
     sv0.stop()
     sv1.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testReadyForLocalInitOpRestoreFromCheckpoint(self):
     server = server_lib.Server.create_local_server()
     logdir = self._test_dir("ready_for_local_init_op_restore")
@@ -713,6 +720,7 @@ class SupervisorTest(test.TestCase):
                                    "Variables not initialized: w"):
         sv.prepare_or_wait_for_session(server.target)
 
+  @test_util.run_v1_only("b/120545219")
   def testSetupFail(self):
     logdir = self._test_dir("setup_fail")
     with ops.Graph().as_default():
@@ -723,6 +731,7 @@ class SupervisorTest(test.TestCase):
       variables.VariableV1([1.0, 2.0, 3.0], name="v")
       supervisor.Supervisor(logdir=logdir, is_chief=False)
 
+  @test_util.run_v1_only("b/120545219")
   def testDefaultGlobalStep(self):
     logdir = self._test_dir("default_global_step")
     with ops.Graph().as_default():
@@ -732,6 +741,7 @@ class SupervisorTest(test.TestCase):
       self.assertEquals(287, sess.run(sv.global_step))
       sv.stop()
 
+  @test_util.run_v1_only("b/120545219")
   def testRestoreFromMetaGraph(self):
     logdir = self._test_dir("restore_from_meta_graph")
     with ops.Graph().as_default():
@@ -753,6 +763,7 @@ class SupervisorTest(test.TestCase):
   # This test is based on the fact that the standard services start
   # right away and get to run once before sv.stop() returns.
   # We still sleep a bit to make the test robust.
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithoutGlobalStep(self):
     logdir = self._test_dir("standard_services_without_global_step")
     # Create a checkpoint.
@@ -799,10 +810,11 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([10.10], name="foo")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(1.0, v.eval()[0])
+      self.assertEqual(1.0, self.evaluate(v)[0])
 
   # Same as testStandardServicesNoGlobalStep but with a global step.
   # We should get a summary about the step time.
+  @test_util.run_v1_only("b/120545219")
   def testStandardServicesWithGlobalStep(self):
     logdir = self._test_dir("standard_services_with_global_step")
     # Create a checkpoint.
@@ -863,7 +875,7 @@ class SupervisorTest(test.TestCase):
       v = variables.VariableV1([-12], name="global_step")
       sav = saver_lib.Saver([v])
       sav.restore(sess, save_path)
-      self.assertEqual(123, v.eval()[0])
+      self.assertEqual(123, self.evaluate(v)[0])
 
   def testNoQueueRunners(self):
     with ops.Graph().as_default(), self.cached_session() as sess:
diff --git a/tensorflow/python/training/sync_replicas_optimizer.py b/tensorflow/python/training/sync_replicas_optimizer.py
index fbde8fe3c2a5ee720df4eef9659a1b9ebae9922c..cd4590db7f6550f8790ad683c9aaecf145ad12da 100644
--- a/tensorflow/python/training/sync_replicas_optimizer.py
+++ b/tensorflow/python/training/sync_replicas_optimizer.py
@@ -19,6 +19,7 @@ from __future__ import division
 from __future__ import print_function
 
 from tensorflow.core.framework import types_pb2
+from tensorflow.python.distribute import distribution_strategy_context
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
 from tensorflow.python.ops import control_flow_ops
@@ -44,6 +45,9 @@ from tensorflow.python.util.tf_export import tf_export
 class SyncReplicasOptimizer(optimizer.Optimizer):
   """Class to synchronize, aggregate gradients and pass them to the optimizer.
 
+  This class is deprecated. For synchrononous training, please use [Distribution
+  Strategies](https://github.com/tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).
+
   In a typical asynchronous training environment, it's common to have some
   stale gradients. For example, with a N-replica asynchronous training,
   gradients will be applied to the variables N times independently. Depending
@@ -142,9 +146,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
 
   @deprecation.deprecated(
       None,
-      "The `SyncReplicaOptimizer` is deprecated. For synchrononous training, "
-      "please use [Distribution Strategies](https://github.com/tensorflow/"
-      "tensorflow/tree/master/tensorflow/contrib/distribute).",
+      "The `SyncReplicaOptimizer` class is deprecated. For synchrononous "
+      "training, please use [Distribution Strategies](https://github.com/"
+      "tensorflow/tensorflow/tree/master/tensorflow/contrib/distribute).",
       warn_once=True)
   def __init__(self,
                opt,
@@ -256,7 +260,9 @@ class SyncReplicasOptimizer(optimizer.Optimizer):
     # local_anchor op will be placed on this worker task by default.
     local_anchor = control_flow_ops.no_op()
     # Colocating local_step variable prevents it being placed on the PS.
-    with ops.colocate_with(local_anchor):
+    distribution_strategy = (
+        distribution_strategy_context.get_distribution_strategy())
+    with distribution_strategy.colocate_vars_with(local_anchor):
       self._local_step = variable_scope.variable(
           initial_value=0,
           trainable=False,
diff --git a/tensorflow/python/training/sync_replicas_optimizer_test.py b/tensorflow/python/training/sync_replicas_optimizer_test.py
index 1ef8756ef671b652e2fb1b7616d813db7089fec2..428583d048ab30c8ccad0a5e32b47455c5c9bc3c 100644
--- a/tensorflow/python/training/sync_replicas_optimizer_test.py
+++ b/tensorflow/python/training/sync_replicas_optimizer_test.py
@@ -22,6 +22,7 @@ import time
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.test_util import create_local_cluster
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
@@ -88,6 +89,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
   def _run(self, train_op, sess):
     sess.run(train_op)
 
+  @test_util.run_v1_only("b/120545219")
   def test2Workers(self):
     num_workers = 2
     replicas_to_aggregate = 2
@@ -178,6 +180,7 @@ class SyncReplicasOptimizerTest(test.TestCase):
                         sessions[1].run(var_1_g_1))
 
   # 3 workers and one of them is backup.
+  @test_util.run_v1_only("b/120545219")
   def test3Workers1Backup(self):
     num_workers = 3
     replicas_to_aggregate = 2
@@ -266,6 +269,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
                                  "apply_gradient should be called"):
       hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testCanCreatedBeforeMinimizeCalled(self):
     """This behavior is required to be integrated with Estimators."""
     opt = training.SyncReplicasOptimizer(
@@ -278,6 +282,7 @@ class SyncReplicasOptimizerHookTest(test.TestCase):
     opt.minimize(v, global_step=global_step)
     hook.begin()
 
+  @test_util.run_v1_only("b/120545219")
   def testFetchVariableList(self):
     opt = training.SyncReplicasOptimizer(
         opt=adam.AdamOptimizer(0.01),
diff --git a/tensorflow/python/training/tensorboard_logging_test.py b/tensorflow/python/training/tensorboard_logging_test.py
index 5af6a0aa7b430cd6dc3d2e9f54392cf9ffafa63a..5088ab07e5e387c880aadc8de7385b53df911a29 100644
--- a/tensorflow/python/training/tensorboard_logging_test.py
+++ b/tensorflow/python/training/tensorboard_logging_test.py
@@ -25,6 +25,7 @@ import tempfile
 import time
 
 from tensorflow.core.util import event_pb2
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.summary import summary_iterator
@@ -32,6 +33,7 @@ from tensorflow.python.summary.writer import writer
 from tensorflow.python.training import tensorboard_logging
 
 
+@test_util.run_v1_only("b/120545219")
 class EventLoggingTest(test.TestCase):
 
   def setUp(self):
@@ -85,6 +87,7 @@ class EventLoggingTest(test.TestCase):
                                   (event_pb2.LogMessage.ERROR, "format")])
     self.assertEqual(2, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testVerbosity(self):
     tensorboard_logging.set_summary_writer(self._sw)
     tensorboard_logging.set_verbosity(tensorboard_logging.ERROR)
@@ -112,6 +115,7 @@ class EventLoggingTest(test.TestCase):
     tensorboard_logging.warn("this should work")
     self.assertEqual(1, self.logged_message_count)
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaryWriterFailsAfterClear(self):
     tensorboard_logging._clear_summary_writer()
     with self.assertRaises(RuntimeError):
diff --git a/tensorflow/python/training/training_ops_test.py b/tensorflow/python/training/training_ops_test.py
index 02164828250e786cae1f21d1a604863829a9f6eb..ba0f40999b48ffb8411c2cd0e7f4608f84ff292b 100644
--- a/tensorflow/python/training/training_ops_test.py
+++ b/tensorflow/python/training/training_ops_test.py
@@ -24,6 +24,7 @@ import numpy as np
 
 from tensorflow.python.framework import constant_op
 from tensorflow.python.framework import dtypes
+from tensorflow.python.framework import test_util
 from tensorflow.python.framework.test_util import TensorFlowTestCase
 # Import resource_variable_ops for the variables-to-tensor implicit conversion.
 from tensorflow.python.ops import resource_variable_ops  # pylint: disable=unused-import
@@ -53,12 +54,13 @@ class TrainingOpsTest(TensorFlowTestCase):
     with self.session(use_gpu=use_gpu):
       var = variables.VariableV1(x)
       variables.global_variables_initializer().run()
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_sgd = training_ops.apply_gradient_descent(var, alpha, delta)
-      out = apply_sgd.eval()
+      out = self.evaluate(apply_sgd)
       self.assertShapeEqual(out, apply_sgd)
       self.assertAllCloseAccordingToType(x - alpha * delta, out)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyGradientDescent(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -74,13 +76,13 @@ class TrainingOpsTest(TensorFlowTestCase):
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_adagrad = training_ops.apply_adagrad(var, accum, lr, grad)
-      out = apply_adagrad.eval()
+      out = self.evaluate(apply_adagrad)
       self.assertShapeEqual(out, apply_adagrad)
       self.assertAllCloseAccordingToType(x - lr * grad * (y + grad * grad)**
                                          (-0.5), out)
-      self.assertAllCloseAccordingToType(y + grad * grad, accum.eval())
+      self.assertAllCloseAccordingToType(y + grad * grad, self.evaluate(accum))
 
   def _testTypesForFtrl(self,
                         x,
@@ -99,10 +101,10 @@ class TrainingOpsTest(TensorFlowTestCase):
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       apply_ftrl = training_ops.apply_ftrl(var, accum, linear, grad, lr, l1, l2,
                                            lr_power)
-      out = apply_ftrl.eval()
+      out = self.evaluate(apply_ftrl)
       self.assertShapeEqual(out, apply_ftrl)
       accum_update = y + grad * grad
       linear_update = z + grad - (accum_update**(-lr_power) - y**
@@ -112,19 +114,22 @@ class TrainingOpsTest(TensorFlowTestCase):
           np.sign(linear_update[i]) * l1 - linear_update[i]) / (quadratic[i]) if
                                np.abs(linear_update[i]) > l1 else 0.0
                                for i in range(linear_update.size)])
-      self.assertAllCloseAccordingToType(accum_update, accum.eval())
+      self.assertAllCloseAccordingToType(accum_update, self.evaluate(accum))
       if x.dtype == np.float16:
         # The calculations here really are not very precise in float16.
-        self.assertAllClose(linear_update, linear.eval(), rtol=2e-2, atol=2e-2)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=2e-2, atol=2e-2)
         self.assertAllClose(expected_out, out, rtol=2e-2, atol=2e-2)
       elif x.dtype == np.float32:
         # The calculations here not sufficiently precise in float32.
-        self.assertAllClose(linear_update, linear.eval(), rtol=1e-5, atol=1e-5)
+        self.assertAllClose(
+            linear_update, self.evaluate(linear), rtol=1e-5, atol=1e-5)
         self.assertAllClose(expected_out, out, rtol=1e-5, atol=1e-5)
       else:
-        self.assertAllClose(linear_update, linear.eval())
+        self.assertAllClose(linear_update, self.evaluate(linear))
         self.assertAllClose(expected_out, out)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdagrad(self):
     for (dtype, use_gpu) in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -134,6 +139,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       grad = np.arange(100).astype(dtype)
       self._testTypesForAdagrad(x, y, lr, grad, use_gpu)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyFtrl(self):
     for dtype in [np.float16, np.float32, np.float64]:
       x = np.arange(100).astype(dtype)
@@ -152,19 +158,19 @@ class TrainingOpsTest(TensorFlowTestCase):
       accum = variables.VariableV1(y)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_adagrad = training_ops.sparse_apply_adagrad(
           var, accum, lr, grad,
           constant_op.constant(indices, self._toType(indices.dtype)))
-      out = sparse_apply_adagrad.eval()
+      out = self.evaluate(sparse_apply_adagrad)
       self.assertShapeEqual(out, sparse_apply_adagrad)
 
       for (i, index) in enumerate(indices):
         self.assertAllCloseAccordingToType(
             x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**(-0.5),
-            var.eval()[index])
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
   def _testTypesForSparseFtrl(self,
                               x,
@@ -183,7 +189,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       linear = variables.VariableV1(z)
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(x, var.eval())
+      self.assertAllCloseAccordingToType(x, self.evaluate(var))
       sparse_apply_ftrl = training_ops.sparse_apply_ftrl(
           var,
           accum,
@@ -194,16 +200,18 @@ class TrainingOpsTest(TensorFlowTestCase):
           l1,
           l2,
           lr_power=lr_power)
-      out = sparse_apply_ftrl.eval()
+      out = self.evaluate(sparse_apply_ftrl)
       self.assertShapeEqual(out, sparse_apply_ftrl)
 
       for (i, index) in enumerate(indices):
-        self.assertAllCloseAccordingToType(x[index] - lr * grad[i] *
-                                           (y[index] + grad[i] * grad[i])**
-                                           (lr_power), var.eval()[index])
+        self.assertAllCloseAccordingToType(
+            x[index] - lr * grad[i] * (y[index] + grad[i] * grad[i])**
+            (lr_power),
+            self.evaluate(var)[index])
         self.assertAllCloseAccordingToType(y[index] + grad[i] * grad[i],
-                                           accum.eval()[index])
+                                           self.evaluate(accum)[index])
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagrad(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -217,6 +225,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyAdagradDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -230,6 +239,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseAdagrad(x, y, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testSparseApplyFtrlDim1(self):
     for (dtype, index_type) in itertools.product(
         [np.float16, np.float32, np.float64], [np.int32, np.int64]):
@@ -245,6 +255,7 @@ class TrainingOpsTest(TensorFlowTestCase):
       indices = np.array([0, 2]).astype(index_type)
       self._testTypesForSparseFtrl(x, y, z, lr, grad, indices)
 
+  @test_util.run_v1_only("b/120545219")
   def testApplyAdam(self):
     for dtype, use_gpu in itertools.product(
         [np.float16, np.float32, np.float64], [False, True]):
@@ -276,13 +287,13 @@ class TrainingOpsTest(TensorFlowTestCase):
       epsilon_t = constant_op.constant(epsilon, self._toType(var.dtype), [])
       variables.global_variables_initializer().run()
 
-      self.assertAllCloseAccordingToType(var, var_t.eval())
+      self.assertAllCloseAccordingToType(var, self.evaluate(var_t))
       new_var, _, _ = self._adamUpdateNumpy(var, grad, t, m, v, lr, beta1,
                                             beta2, epsilon)
       apply_adam = training_ops.apply_adam(var_t, m_t, v_t, beta1_power_t,
                                            beta2_power_t, lr_t, beta1_t,
                                            beta2_t, epsilon_t, grad)
-      out = apply_adam.eval()
+      out = self.evaluate(apply_adam)
       self.assertShapeEqual(out, apply_adam)
       self.assertAllCloseAccordingToType(new_var, out)
 
diff --git a/tensorflow/python/training/training_util_test.py b/tensorflow/python/training/training_util_test.py
index ba64e785ac660a383e26651a37138f17e3e7cd17..3f9858a33bafc6ae0750695ec55e97ad5800119b 100644
--- a/tensorflow/python/training/training_util_test.py
+++ b/tensorflow/python/training/training_util_test.py
@@ -20,12 +20,14 @@ from __future__ import print_function
 
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
 from tensorflow.python.ops import variables
 from tensorflow.python.platform import test
 from tensorflow.python.training import monitored_session
 from tensorflow.python.training import training_util
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepTest(test.TestCase):
 
   def _assert_global_step(self, global_step, expected_dtype=dtypes.int64):
@@ -90,6 +92,7 @@ class GlobalStepTest(test.TestCase):
       self._assert_global_step(training_util.get_or_create_global_step(g))
 
 
+@test_util.run_v1_only('b/120545219')
 class GlobalStepReadTest(test.TestCase):
 
   def test_global_step_read_is_none_if_there_is_no_global_step(self):
diff --git a/tensorflow/python/training/warm_starting_util.py b/tensorflow/python/training/warm_starting_util.py
index 78dbb465b55254ed05c5e1a9d87eb7ac2f7f3d82..1382b8ce72e93b19a16e60ac597a2413941b638e 100644
--- a/tensorflow/python/training/warm_starting_util.py
+++ b/tensorflow/python/training/warm_starting_util.py
@@ -28,11 +28,11 @@ from tensorflow.python.ops import variables as variables_lib
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.training import checkpoint_ops
 from tensorflow.python.training import checkpoint_utils
-from tensorflow.python.training import saver
+from tensorflow.python.training.saving import saveable_object_util
 from tensorflow.python.util.tf_export import tf_export
 
 
-@tf_export("train.VocabInfo")
+@tf_export(v1=["train.VocabInfo"])
 class VocabInfo(
     collections.namedtuple("VocabInfo", [
         "new_vocab",
@@ -139,7 +139,7 @@ def _infer_var_name(var):
   Returns:
     Name of the `var`
   """
-  name_to_var_dict = saver.BaseSaverBuilder.OpListToDict(var)
+  name_to_var_dict = saveable_object_util.op_list_to_dict(var)
   if len(name_to_var_dict) > 1:
     raise TypeError("`var` = %s passed as arg violates the constraints.  "
                     "name_to_var_dict = %s" % (var, name_to_var_dict))
@@ -248,7 +248,7 @@ def _warm_start_var_with_vocab(var,
     prev_tensor_name = _infer_var_name(var)
 
   # TODO(eddz): Fix functionality for rank-1 Variables (like FC biases).
-  total_v_first_axis = sum([v.get_shape().as_list()[0] for v in var])
+  total_v_first_axis = sum(v.get_shape().as_list()[0] for v in var)
   for v in var:
     v_shape = v.get_shape().as_list()
     slice_info = v._get_save_slice_info()
@@ -333,12 +333,12 @@ def _get_grouped_variables(vars_to_warm_start):
         ops.GraphKeys.TRAINABLE_VARIABLES,
         scope=vars_to_warm_start)
   elif isinstance(vars_to_warm_start, list):
-    if all([isinstance(v, str) for v in vars_to_warm_start]):
+    if all(isinstance(v, str) for v in vars_to_warm_start):
       list_of_vars = []
       for v in vars_to_warm_start:
         list_of_vars += ops.get_collection(ops.GraphKeys.GLOBAL_VARIABLES,
                                            scope=v)
-    elif all([checkpoint_utils._is_variable(v) for v in vars_to_warm_start]):  # pylint: disable=protected-access
+    elif all(checkpoint_utils._is_variable(v) for v in vars_to_warm_start):  # pylint: disable=protected-access
       list_of_vars = vars_to_warm_start
     else:
       raise ValueError("If `vars_to_warm_start` is a list, it must be all "
@@ -360,7 +360,7 @@ def _get_grouped_variables(vars_to_warm_start):
   return grouped_variables
 
 
-@tf_export("train.warm_start")
+@tf_export(v1=["train.warm_start"])
 def warm_start(ckpt_to_initialize_from,
                vars_to_warm_start=".*",
                var_name_to_vocab_info=None,
diff --git a/tensorflow/python/training/warm_starting_util_test.py b/tensorflow/python/training/warm_starting_util_test.py
index 91a0b53b3a8771d8a1d826c6c63df91c91eec954..fa1f370f41efeda0d823d85e4d755038362fd37e 100644
--- a/tensorflow/python/training/warm_starting_util_test.py
+++ b/tensorflow/python/training/warm_starting_util_test.py
@@ -22,7 +22,7 @@ import os
 import numpy as np
 import six
 
-from tensorflow.python.feature_column import feature_column as fc
+from tensorflow.python.feature_column import feature_column_lib as fc
 from tensorflow.python.framework import dtypes
 from tensorflow.python.framework import ops
 from tensorflow.python.ops import array_ops
@@ -49,7 +49,7 @@ class WarmStartingUtilTest(test.TestCase):
     return vocab_file
 
   def _write_checkpoint(self, sess):
-    sess.run(variables.global_variables_initializer())
+    self.evaluate(variables.global_variables_initializer())
     saver = saver_lib.Saver()
     ckpt_prefix = os.path.join(self.get_temp_dir(), "model")
     saver.save(sess, ckpt_prefix, global_step=0)
@@ -70,7 +70,7 @@ class WarmStartingUtilTest(test.TestCase):
         if partitioner:
           self.assertTrue(isinstance(var, variables.PartitionedVariable))
           var = var._get_variable_list()
-        return var, sess.run(var)
+        return var, self.evaluate(var)
 
   def _create_prev_run_vars(self,
                             var_names,
@@ -86,7 +86,7 @@ class WarmStartingUtilTest(test.TestCase):
               shape=shape,
               initializer=initializer))
         self._write_checkpoint(sess)
-        return [sess.run(var) for var in all_vars]
+        return [self.evaluate(var) for var in all_vars]
 
   def _create_dummy_inputs(self):
     return {
@@ -125,7 +125,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarPrevVarPartitioned(self):
@@ -143,7 +143,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose(prev_val, fruit_weights.eval(sess))
 
   def testWarmStartVarCurrentVarPartitioned(self):
@@ -162,7 +162,7 @@ class WarmStartingUtilTest(test.TestCase):
         prev_tensor_name, var = ws_util._get_var_info(fruit_weights)
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -189,7 +189,7 @@ class WarmStartingUtilTest(test.TestCase):
             fruit_weights, prev_tensor_name="old_scope/fruit_weights")
         checkpoint_utils.init_from_checkpoint(self.get_temp_dir(),
                                               {prev_tensor_name: var})
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         fruit_weights = fruit_weights._get_variable_list()
         new_val = np.concatenate(
             [fruit_weights[0].eval(sess), fruit_weights[1].eval(sess)], axis=0)
@@ -211,7 +211,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -236,7 +236,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -261,7 +261,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             previous_vocab_size=2)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Old vocabulary limited to ['apple', 'banana'].
         self.assertAllClose([[0.], [0.], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
@@ -285,7 +285,7 @@ class WarmStartingUtilTest(test.TestCase):
             "fruit_weights", initializer=[[0.], [0.], [0.], [0.], [0.]])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 5,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[2.], [1.5], [1.], [0.5], [0.]],
                             fruit_weights.eval(sess))
 
@@ -312,7 +312,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertAllClose([[0.3, 0.5, 0.], [0.8, 1.0, 0.], [1.2, 1.5, 0.],
                              [2.3, 2., 0.]], fruit_output_layer.eval(sess))
 
@@ -340,7 +340,7 @@ class WarmStartingUtilTest(test.TestCase):
             self.get_temp_dir(),
             prev_vocab_path,
             current_oov_buckets=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -372,7 +372,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -404,7 +404,7 @@ class WarmStartingUtilTest(test.TestCase):
             partitioner=lambda shape, dtype: [2, 1])
         ws_util._warm_start_var_with_vocab(fruit_weights, new_vocab_path, 6,
                                            self.get_temp_dir(), prev_vocab_path)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_weights, variables.PartitionedVariable))
         fruit_weights_vars = fruit_weights._get_variable_list()
@@ -438,7 +438,7 @@ class WarmStartingUtilTest(test.TestCase):
                                            prev_ckpt=self.get_temp_dir(),
                                            prev_vocab_path=prev_vocab_path,
                                            axis=1)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         self.assertTrue(
             isinstance(fruit_output_layer, variables.PartitionedVariable))
         fruit_output_layer_vars = fruit_output_layer._get_variable_list()
@@ -463,7 +463,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=[var])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
@@ -483,7 +483,7 @@ class WarmStartingUtilTest(test.TestCase):
             shape=[10, 1],
             initializer=zeros())
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=["v1"])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started (init overridden to ones).
         self.assertAllEqual(var.eval(), prev_int_val)
 
@@ -519,7 +519,7 @@ class WarmStartingUtilTest(test.TestCase):
                            # This warm-starts both v1 and v1/Momentum, but only
                            # v2 (and not v2/Momentum).
                            vars_to_warm_start=["v1", "v2[^/]"])
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify the selection of weights were correctly warm-started (init
         # overridden to ones).
         self.assertAllEqual(v1.eval(), prev_v1_val)
@@ -542,7 +542,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [np.zeros([10, 1])]},
@@ -553,7 +553,7 @@ class WarmStartingUtilTest(test.TestCase):
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_int], partitioner)
         ws_util.warm_start(self.get_temp_dir(), vars_to_warm_start=".*sc_int.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_int: [prev_int_val]}, sess)
 
@@ -571,7 +571,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [np.zeros([15, 1])]},
@@ -583,7 +583,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([sc_hash], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_hash.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_hash: [prev_hash_val]},
                                   sess)
@@ -605,7 +605,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -619,7 +619,7 @@ class WarmStartingUtilTest(test.TestCase):
         # vocab is assumed to be same as new vocab.
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -641,7 +641,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([4, 1])]},
@@ -657,7 +657,7 @@ class WarmStartingUtilTest(test.TestCase):
             # Explicitly provide the file prefix instead of just the dir.
             os.path.join(self.get_temp_dir(), "model-0"),
             vars_to_warm_start=".*sc_vocab.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [prev_vocab_val]},
                                   sess)
@@ -686,7 +686,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([sc_vocab], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [np.zeros([2, 1])]},
@@ -708,7 +708,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  'banana' isn't in the
         # first two entries of the old vocabulary, so it's newly initialized.
         self._assert_cols_to_vars(cols_to_vars, {sc_vocab: [[[1], [0]]]}, sess)
@@ -729,7 +729,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, the weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars,
@@ -741,7 +741,7 @@ class WarmStartingUtilTest(test.TestCase):
         cols_to_vars = self._create_linear_model([real_bucket], partitioner)
         ws_util.warm_start(
             self.get_temp_dir(), vars_to_warm_start=".*real_bucketized.*")
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars,
                                   {real_bucket: [prev_bucket_val]}, sess)
@@ -800,7 +800,7 @@ class WarmStartingUtilTest(test.TestCase):
     with ops.Graph().as_default() as g:
       with self.session(graph=g) as sess:
         cols_to_vars = self._create_linear_model(all_linear_cols, partitioner)
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Without warm-starting, all weights should be initialized using default
         # initializer (which is init_ops.zeros_initializer).
         self._assert_cols_to_vars(cols_to_vars, {
@@ -826,7 +826,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab/weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.
         self._assert_cols_to_vars(cols_to_vars, {
             sc_int: [prev_int_val],
@@ -865,7 +865,7 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     def _partitioner(shape, dtype):  # pylint:disable=unused-argument
       # Partition each var into 2 equal slices.
@@ -892,7 +892,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -933,7 +933,7 @@ class WarmStartingUtilTest(test.TestCase):
             "linear_model/sc_vocab/weights",
             initializer=[[0.5], [1.], [2.], [3.]])
         self._write_checkpoint(sess)
-        prev_keys_val = sess.run(sc_keys_weights)
+        prev_keys_val = self.evaluate(sc_keys_weights)
 
     # New graph, new session with warm-starting.
     with ops.Graph().as_default() as g:
@@ -955,7 +955,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_hash should not be warm-started.  Var corresponding to sc_vocab
         # should be correctly warm-started after vocab remapping.
@@ -1024,7 +1024,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[sc_keys]):
                     "some_other_name"
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started.  Var corresponding to
         # sc_vocab should be correctly warm-started after vocab remapping,
         # and neither of the other two should be warm-started..
@@ -1091,7 +1091,7 @@ class WarmStartingUtilTest(test.TestCase):
                 ws_util._infer_var_name(cols_to_vars[emb_vocab_column]):
                     vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab_column should be correctly warm-started after vocab
         # remapping. Missing values are filled in with the EmbeddingColumn's
@@ -1163,7 +1163,7 @@ class WarmStartingUtilTest(test.TestCase):
             var_name_to_vocab_info={
                 "linear_model/sc_vocab_embedding/embedding_weights": vocab_info
             })
-        sess.run(variables.global_variables_initializer())
+        self.evaluate(variables.global_variables_initializer())
         # Verify weights were correctly warm-started. Var corresponding to
         # emb_vocab should be correctly warm-started after vocab remapping.
         # Missing values are filled in with the EmbeddingColumn's initializer.
diff --git a/tensorflow/python/util/deprecation.py b/tensorflow/python/util/deprecation.py
index 4c68d1aaae3272ddae27bd44ab98c6c68dbaa9b6..9aaf0c2de9756718645e77de416c653182994019 100644
--- a/tensorflow/python/util/deprecation.py
+++ b/tensorflow/python/util/deprecation.py
@@ -28,6 +28,7 @@ from tensorflow.python.util import is_in_graph_mode
 from tensorflow.python.util import tf_contextlib
 from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_inspect
+from tensorflow.python.util import tf_stack
 
 
 # Allow deprecation warnings to be silenced temporarily with a context manager.
@@ -98,21 +99,9 @@ def _validate_deprecation_args(date, instructions):
 
 def _call_location(outer=False):
   """Returns call location given level up from current call."""
-  frame = tf_inspect.currentframe()
-  if frame:
-    # CPython internals are available, use them for performance.
-    # walk back two frames to get to deprecated function caller.
-    frame = frame.f_back
-    if frame.f_back:
-      frame = frame.f_back
-    if outer and frame.f_back:
-      frame = frame.f_back
-    return '%s:%d' % (frame.f_code.co_filename, frame.f_lineno)
-  else:
-    # Slow fallback path
-    stack = tf_inspect.stack(0)  # 0 avoids generating unused context
-    entry = stack[3 if outer else 2]
-    return '%s:%d' % (entry[1], entry[2])
+  stack = tf_stack.extract_stack()
+  frame = stack[-4 if outer else -3]
+  return '{filename}:{lineno}'.format(filename=frame[0], lineno=frame[1])
 
 
 def _wrap_decorator(wrapped_function):
diff --git a/tensorflow/python/util/deprecation_test.py b/tensorflow/python/util/deprecation_test.py
index 34cbca52a1b42869e6ef106328b85435ec2877be..035c416d793e04ab26adbe0f4b321594343a2286 100644
--- a/tensorflow/python/util/deprecation_test.py
+++ b/tensorflow/python/util/deprecation_test.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
 from tensorflow.python.util import deprecation
@@ -174,6 +175,7 @@ class DeprecationTest(test.TestCase):
                         set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -214,6 +216,7 @@ class DeprecationTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -239,6 +242,7 @@ class DeprecationTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -488,6 +492,7 @@ class DeprecatedArgsTest(test.TestCase):
       deprecation.deprecated_args(date, instructions, "missing")(_fn)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -535,6 +540,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -565,6 +571,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -595,6 +602,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_varargs(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -615,6 +623,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_kwargs(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -635,6 +644,7 @@ class DeprecatedArgsTest(test.TestCase):
     self._assert_subset(set(["after " + date, instructions]), set(args[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_positional_and_named(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -660,6 +670,7 @@ class DeprecatedArgsTest(test.TestCase):
                         set(args2[1:]))
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_positional_and_named_with_ok_vals(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -692,6 +703,7 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(0, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_deprecated_args_once(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -708,6 +720,7 @@ class DeprecatedArgsTest(test.TestCase):
     self.assertEqual(1, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_deprecated_multiple_args_once_each(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -752,6 +765,7 @@ class DeprecatedArgValuesTest(test.TestCase):
       deprecation.deprecated_arg_values(date, instructions)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -804,6 +818,7 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(2, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_with_one_line_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
@@ -839,6 +854,7 @@ class DeprecatedArgValuesTest(test.TestCase):
     self.assertEqual(2, mock_warning.call_count)
 
   @test.mock.patch.object(logging, "warning", autospec=True)
+  @test_util.run_deprecated_v1
   def test_static_fn_no_doc(self, mock_warning):
     date = "2016-07-04"
     instructions = "This is how you update..."
diff --git a/tensorflow/python/util/dispatch.py b/tensorflow/python/util/dispatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..e94e3345348b119bc64dd487c3c2a14603a2ce09
--- /dev/null
+++ b/tensorflow/python/util/dispatch.py
@@ -0,0 +1,191 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Type-based dispatch for TensorFlow ops.
+
+"Operation dispatchers" can be used to override the behavior for TensorFlow ops
+when they are called with otherwise unsupported argument types.  In particular,
+when an operation is called with arguments that would cause it to raise a
+TypeError, it falls back on its registered operation dispatchers.  If any
+registered dispatchers can handle the arguments, then its result is returned.
+Otherwise, the original TypeError is raised.
+
+By default, dispatch support is added to the generated op wrappers for any
+visible ops by default.  Ops that are implemented in Python can opt in to
+dispatch support using the `add_dispatch_support` decorator.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import itertools
+
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_inspect
+
+# Private function attribute used to store a list of dispatchers.
+DISPATCH_ATTR = "_tf_dispatchers"
+
+
+class OpDispatcher(object):
+  """Abstract base class for TensorFlow operator dispatchers.
+
+  Each operation dispatcher acts as an override handler for a single
+  TensorFlow operation, and its results are used when the handler indicates
+  that it can handle the operation's arguments (by returning any value other
+  than `OpDispatcher.NOT_SUPPORTED`).
+  """
+
+  # Sentinel value that can be returned to indicate that an operation
+  # dispatcher does not support a given set of arguments.
+  NOT_SUPPORTED = object()
+
+  def handle(self, args, kwargs):  # pylint: disable=unused-argument
+    """Handle this dispatcher's operation with the specified arguments.
+
+    If this operation dispatcher can handle the given arguments, then
+    return an appropriate value (or raise an appropriate exception).
+
+    Args:
+      args: The arguments to the operation.
+      kwargs: They keyword arguments to the operation.
+
+    Returns:
+      The result of the operation, or `OpDispatcher.NOT_SUPPORTED` if this
+      dispatcher can not handle the given arguments.
+    """
+    return self.NOT_SUPPORTED
+
+  def register(self, op):
+    """Register this dispatcher as a handler for `op`.
+
+    Args:
+      op: Python function: the TensorFlow operation that should be handled. Must
+        have a dispatch list (which is added automatically for generated ops,
+        and can be added to Python ops using the `add_dispatch_support`
+        decorator).
+    """
+    if not hasattr(op, DISPATCH_ATTR):
+      raise AssertionError("Dispatching not enabled for %s" % op)
+    getattr(op, DISPATCH_ATTR).append(self)
+
+
+def dispatch(op, *args, **kwargs):
+  """Returns the result from the first successful dispatcher for a given op.
+
+  Calls the `handle` method of each `OpDispatcher` that has been registered
+  to handle `op`, and returns the value from the first successful handler.
+
+  Args:
+    op: Python function: the operation to dispatch for.
+    *args: The arguments to the operation.
+    **kwargs: They keyword arguments to the operation.
+
+  Returns:
+    The result of the operation, or `NOT_SUPPORTED` if no registered
+    dispatcher can handle the given arguments.
+  """
+  for dispatcher in getattr(op, DISPATCH_ATTR):
+    result = dispatcher.handle(args, kwargs)
+    if result is not OpDispatcher.NOT_SUPPORTED:
+      return result
+  return OpDispatcher.NOT_SUPPORTED
+
+
+class _TypeBasedDispatcher(OpDispatcher):
+  """Dispatcher that handles op if any arguments have a specified type.
+
+  Checks the types of the arguments and keyword arguments (including elements
+  of lists or tuples), and if any argument values have the indicated type(s),
+  then delegates to an override function.
+  """
+
+  def __init__(self, override_func, types):
+    self._types = types
+    self._override_func = override_func
+
+  def _handles(self, args, kwargs):
+    for arg in itertools.chain(args, kwargs.values()):
+      if (isinstance(arg, self._types) or
+          (isinstance(arg, (list, tuple)) and
+           any(isinstance(elt, self._types) for elt in arg))):
+        return True
+    return False
+
+  def handle(self, args, kwargs):
+    if self._handles(args, kwargs):
+      return self._override_func(*args, **kwargs)
+    else:
+      return self.NOT_SUPPORTED
+
+
+# pylint: disable=g-doc-return-or-yield
+def dispatch_for_types(op, *types):
+  """Decorator to declare that a Python function overrides an op for a type.
+
+  The decorated function is used to override `op` if any of the arguments or
+  keyword arguments (including elements of lists or tuples) have one of the
+  specified types.
+
+  Example:
+
+  ```python
+  @dispatch_for_types(math_ops.add, RaggedTensor, RaggedTensorValue)
+  def ragged_add(x, y, name=None): ...
+  ```
+
+  Args:
+    op: Python function: the operation that should be overridden.
+    *types: The argument types for which this function should be used.
+  """
+
+  def decorator(func):
+    if tf_inspect.getargspec(func) != tf_inspect.getargspec(op):
+      raise AssertionError("The decorated function's signature must exactly "
+                           "match the signature of the overridden op.")
+    _TypeBasedDispatcher(func, types).register(op)
+    return func
+
+  return decorator
+
+
+# pylint: enable=g-doc-return-or-yield
+
+
+def add_dispatch_list(target):
+  """Decorator that adds a dispatch_list attribute to an op."""
+  if hasattr(target, DISPATCH_ATTR):
+    raise AssertionError("%s already has a dispatch list" % target)
+  setattr(target, DISPATCH_ATTR, [])
+  return target
+
+
+def add_dispatch_support(target):
+  """Decorator that adds a dispatch handling wrapper to an op."""
+  def wrapper(*args, **kwargs):
+    """Call target, and fall back on dispatchers if there is a TypeError."""
+    try:
+      return target(*args, **kwargs)
+    except (TypeError, ValueError):
+      # Note: convert_to_eager_tensor currently raises a ValueError, not a
+      # TypeError, when given unexpected types.  So we need to catch both.
+      result = dispatch(wrapper, *args, **kwargs)
+      if result is not OpDispatcher.NOT_SUPPORTED:
+        return result
+      else:
+        raise
+
+  add_dispatch_list(wrapper)
+  return tf_decorator.make_decorator(target, wrapper)
diff --git a/tensorflow/python/util/dispatch_test.py b/tensorflow/python/util/dispatch_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7c5c8eca8dbb8c810351291d9445404897a9d5f
--- /dev/null
+++ b/tensorflow/python/util/dispatch_test.py
@@ -0,0 +1,120 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for operator dispatch."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from tensorflow.python.framework import ops
+from tensorflow.python.framework import test_util
+from tensorflow.python.ops import gen_math_ops
+from tensorflow.python.ops import math_ops
+from tensorflow.python.platform import googletest
+from tensorflow.python.util import dispatch
+from tensorflow.python.util.tf_export import tf_export
+
+
+class CustomTensor(object):
+  """A fake composite tensor class, for testing type-based dispatching."""
+
+  def __init__(self, tensor, score):
+    self.tensor = ops.convert_to_tensor(tensor)
+    self.score = score
+
+
+@tf_export("test_op")
+@dispatch.add_dispatch_support
+def test_op(x, y, z):
+  """A fake op for testing dispatch of Python ops."""
+  return x + (2 * y) + (3 * z)
+
+
+@test_util.run_all_in_graph_and_eager_modes
+class DispatchTest(test_util.TensorFlowTestCase):
+
+  def testAddDispatchForTypes_With_CppOp(self):
+    original_handlers = gen_math_ops.add._tf_dispatchers[:]
+
+    # Override the behavior of gen_math_ops.add.
+    @dispatch.dispatch_for_types(gen_math_ops.add, CustomTensor)
+    def custom_add(x, y, name=None):  # pylint: disable=unused-variable
+      return CustomTensor(gen_math_ops.add(x.tensor, y.tensor, name),
+                          (x.score+y.score) / 2.0)
+    self.assertEqual(len(math_ops.add._tf_dispatchers),
+                     len(original_handlers) + 1)
+
+    # Test that we see the overridden behavior when using CustomTensors.
+    x = CustomTensor([1, 2, 3], 2.0)
+    y = CustomTensor([7, 8, 2], 0.0)
+    x_plus_y = gen_math_ops.add(x, y)
+    self.assertAllEqual(self.evaluate(x_plus_y.tensor), [8, 10, 5])
+    self.assertNear(x_plus_y.score, 1.0, 0.001)
+
+    # Test that we still get the right behavior when using normal Tensors.
+    a = [1, 2, 3]
+    b = [4, 5, 6]
+    a_plus_b = gen_math_ops.add(a, b)
+    self.assertAllEqual(a_plus_b, [5, 7, 9])
+
+    # Test that we still get a TypeError or ValueError if we pass some
+    # type that's not supported by any dispatcher.
+    with self.assertRaises((TypeError, ValueError)):
+      gen_math_ops.add(a, None)
+
+    # Clean up
+    gen_math_ops.add._tf_dispatchers = original_handlers
+
+  def testAddDispatchForTypes_With_PythonOp(self):
+    original_handlers = test_op._tf_dispatchers[:]
+
+    @dispatch.dispatch_for_types(test_op, CustomTensor)
+    def override_for_test_op(x, y, z):  # pylint: disable=unused-variable
+      return CustomTensor(test_op(x.tensor, y.tensor, z.tensor),
+                          (x.score + y.score + z.score) / 3.0)
+
+    x = CustomTensor([1, 2, 3], 0.2)
+    y = CustomTensor([7, 8, 2], 0.4)
+    z = CustomTensor([0, 1, 2], 0.6)
+
+    result = test_op(x, y, z)
+    self.assertAllEqual(self.evaluate(result.tensor), [15, 21, 13])
+    self.assertNear(result.score, 0.4, 0.001)
+
+    # Clean up
+    test_op._tf_dispatchers = original_handlers
+
+  def testDispatchForTypes_SignatureMismatch(self):
+    with self.assertRaisesRegexp(AssertionError, "The decorated function's "
+                                 "signature must exactly match.*"):
+      @dispatch.dispatch_for_types(test_op, CustomTensor)
+      def override_for_test_op(a, b, c):  # pylint: disable=unused-variable
+        return CustomTensor(test_op(a.tensor, b.tensor, c.tensor),
+                            (a.score + b.score + c.score) / 3.0)
+
+  def testDispatchForTypes_OpDoesNotSupportDispatch(self):
+    def some_op(x, y):
+      return x + y
+
+    with self.assertRaisesRegexp(AssertionError, "Dispatching not enabled for"):
+      @dispatch.dispatch_for_types(some_op, CustomTensor)
+      def override_for_some_op(x, y):  # pylint: disable=unused-variable
+        return x if x.score > 0 else y
+
+
+if __name__ == "__main__":
+  googletest.main()
+
+
diff --git a/tensorflow/python/util/nest_test.py b/tensorflow/python/util/nest_test.py
index 997a3c5c36f083faf157d764afc583aa2e5ad1cf..d0d0c5f7935ba0a4d2b867b3c6fb6bd52c7cd54a 100644
--- a/tensorflow/python/util/nest_test.py
+++ b/tensorflow/python/util/nest_test.py
@@ -482,6 +482,7 @@ class NestTest(parameterized.TestCase, test.TestCase):
     self.assertEqual(nt.a[1][::-1], rev_nt.a[1])
     self.assertEqual(nt.b[::-1], rev_nt.b)
 
+  @test_util.run_deprecated_v1
   def testMapStructureOverPlaceholders(self):
     inp_a = (array_ops.placeholder(dtypes.float32, shape=[3, 4]),
              array_ops.placeholder(dtypes.float32, shape=[3, 7]))
diff --git a/tensorflow/python/util/py_checkpoint_reader.i b/tensorflow/python/util/py_checkpoint_reader.i
index 1c73f7f06f1937a8db0bd858421c2e884892e25b..a1b98a2a75991ee8555c3d3de3aca826fba07a7e 100644
--- a/tensorflow/python/util/py_checkpoint_reader.i
+++ b/tensorflow/python/util/py_checkpoint_reader.i
@@ -165,7 +165,6 @@ def NewCheckpointReader(filepattern):
     from tensorflow.python.util import compat
     return CheckpointReader(compat.as_bytes(filepattern), status)
 
-NewCheckpointReader._tf_api_names = ['train.NewCheckpointReader']
 NewCheckpointReader._tf_api_names_v1 = ['train.NewCheckpointReader']
 %}
 
diff --git a/tensorflow/python/util/tf_export.py b/tensorflow/python/util/tf_export.py
index 0924b36ade8359a2ccb76024697779774e714aaa..ec70cae7d2fc00f793e8ffa0aec331e32e11115f 100644
--- a/tensorflow/python/util/tf_export.py
+++ b/tensorflow/python/util/tf_export.py
@@ -50,6 +50,10 @@ from tensorflow.python.util import tf_decorator
 ESTIMATOR_API_NAME = 'estimator'
 TENSORFLOW_API_NAME = 'tensorflow'
 
+# List of subpackage names used by TensorFlow components. Have to check that
+# TensorFlow core repo does not export any symbols under these names.
+SUBPACKAGE_NAMESPACES = [ESTIMATOR_API_NAME]
+
 _Attributes = collections.namedtuple(
     'ExportedApiAttributes', ['names', 'constants'])
 
@@ -78,6 +82,11 @@ class SymbolAlreadyExposedError(Exception):
   pass
 
 
+class InvalidSymbolNameError(Exception):
+  """Raised when trying to export symbol as an invalid or unallowed name."""
+  pass
+
+
 def get_canonical_name_for_symbol(
     symbol, api_name=TENSORFLOW_API_NAME,
     add_prefix_to_v1_names=False):
@@ -163,6 +172,37 @@ class api_export(object):  # pylint: disable=invalid-name
     self._overrides = kwargs.get('overrides', [])
     self._allow_multiple_exports = kwargs.get('allow_multiple_exports', False)
 
+    self._validate_symbol_names()
+
+  def _validate_symbol_names(self):
+    """Validate you are exporting symbols under an allowed package.
+
+    We need to ensure things exported by tf_export, estimator_export, etc.
+    export symbols under disjoint top-level package names.
+
+    For TensorFlow, we check that it does not export anything under subpackage
+    names used by components (estimator, keras, etc.).
+
+    For each component, we check that it exports everything under its own
+    subpackage.
+
+    Raises:
+      InvalidSymbolNameError: If you try to export symbol under disallowed name.
+    """
+    all_symbol_names = set(self._names) | set(self._names_v1)
+    if self._api_name == TENSORFLOW_API_NAME:
+      for subpackage in SUBPACKAGE_NAMESPACES:
+        if any(n.startswith(subpackage) for n in all_symbol_names):
+          raise InvalidSymbolNameError(
+              '@tf_export is not allowed to export symbols under %s.*' % (
+                  subpackage))
+    else:
+      if not all(n.startswith(self._api_name) for n in all_symbol_names):
+        raise InvalidSymbolNameError(
+            'Can only export symbols under package name of component. '
+            'e.g. tensorflow_estimator must export all symbols under '
+            'tf.estimator')
+
   def __call__(self, func):
     """Calls this decorator.
 
diff --git a/tensorflow/python/util/tf_export_test.py b/tensorflow/python/util/tf_export_test.py
index 4ae1dc55e06b434aeb4a95e2ca9aa68e4eef56de..a0fac8bf362627e6802821e3b33c0f107c5c97ce 100644
--- a/tensorflow/python/util/tf_export_test.py
+++ b/tensorflow/python/util/tf_export_test.py
@@ -130,6 +130,26 @@ class ValidateExportTest(test.TestCase):
     with self.assertRaises(tf_export.SymbolAlreadyExposedError):
       export_decorator(_test_function)
 
+  def testRaisesExceptionIfInvalidSymbolName(self):
+    # TensorFlow code is not allowed to export symbols under package
+    # tf.estimator
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('estimator.invalid')
+
+    # All symbols exported by Estimator must be under tf.estimator package.
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('Estimator.invalid')
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('invalid.estimator')
+
+  def testRaisesExceptionIfInvalidV1SymbolName(self):
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.tf_export('valid', v1=['estimator.invalid'])
+    with self.assertRaises(tf_export.InvalidSymbolNameError):
+      tf_export.estimator_export('estimator.valid', v1=['invalid'])
+
   def testOverridesFunction(self):
     _test_function2._tf_api_names = ['abc']
 
diff --git a/tensorflow/python/util/tf_should_use.py b/tensorflow/python/util/tf_should_use.py
index ca6710bcf2178db0fcf63c9bdfdf27531651f7ed..63de4a7a96c162f38aa3cba1512cc639df09adcf 100644
--- a/tensorflow/python/util/tf_should_use.py
+++ b/tensorflow/python/util/tf_should_use.py
@@ -23,6 +23,7 @@ import traceback
 
 import six  # pylint: disable=unused-import
 
+from tensorflow.python.framework import ops
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_decorator
 # pylint: enable=g-bad-import-order,g-import-not-at-top
@@ -32,7 +33,8 @@ class _TFShouldUseHelper(object):
   """Object stored in TFShouldUse-wrapped objects.
 
   When it is deleted it will emit a warning or error if its `sate` method
-  has not been called by time of deletion.
+  has not been called by time of deletion, and Tensorflow is not executing
+  eagerly outside of functions.
   """
 
   def __init__(self, type_, repr_, stack_frame, fatal_error_if_unsated):
@@ -50,6 +52,8 @@ class _TFShouldUseHelper(object):
     self._logging_module = None
 
   def __del__(self):
+    if ops.executing_eagerly_outside_functions():
+      return
     if self._sated:
       return
     if self._fatal_error_if_unsated:
diff --git a/tensorflow/python/util/tf_should_use_test.py b/tensorflow/python/util/tf_should_use_test.py
index fedbe1dff6a7bd6e2524355e9946a99fa740f597..65d848cf2a530593857cd912f92a77983d35099b 100644
--- a/tensorflow/python/util/tf_should_use_test.py
+++ b/tensorflow/python/util/tf_should_use_test.py
@@ -24,6 +24,7 @@ import gc
 import sys
 
 from tensorflow.python.framework import constant_op
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging
 from tensorflow.python.util import tf_should_use
@@ -39,6 +40,7 @@ def reroute_error():
 
 class TfShouldUseTest(test.TestCase):
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenNotUsed(self):
     c = constant_op.constant(0, name='blah0')
     def in_this_function():
@@ -52,6 +54,7 @@ class TfShouldUseTest(test.TestCase):
     self.assertIn('in_this_function', msg)
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseFatalWhenNotUsed(self):
     c = constant_op.constant(0, name='blah0')
     def in_this_function():
@@ -74,6 +77,7 @@ class TfShouldUseTest(test.TestCase):
     error.assert_not_called()
     fatal.assert_not_called()
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenUsedWithAdd(self):
     def add(h):
       _ = h + 1
@@ -81,6 +85,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testAddShouldUseWarningWhenUsedWithGetName(self):
     def get_name(h):
       _ = h.name
@@ -88,6 +93,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testShouldUseResult(self):
     @tf_should_use.should_use_result
     def return_const(value):
@@ -101,6 +107,7 @@ class TfShouldUseTest(test.TestCase):
     gc.collect()
     self.assertFalse(gc.garbage)
 
+  @test_util.run_deprecated_v1
   def testShouldUseResultWhenNotReallyUsed(self):
     @tf_should_use.should_use_result
     def return_const(value):
@@ -111,7 +118,7 @@ class TfShouldUseTest(test.TestCase):
         # Creating another op and executing it does not mark the
         # unused op as being "used".
         v = constant_op.constant(1.0, name='meh')
-        v.eval()
+        self.evaluate(v)
     msg = '\n'.join(error.call_args[0])
     self.assertIn('Object was never used', msg)
     self.assertIn('blah3:0', msg)
diff --git a/tensorflow/python/util/util.cc b/tensorflow/python/util/util.cc
index 11eb9ce94768f47e5afe48355fadab30744224b1..e69eec73a0ef8b37f042d9a0f5bf63569b6f5b39 100644
--- a/tensorflow/python/util/util.cc
+++ b/tensorflow/python/util/util.cc
@@ -172,7 +172,7 @@ class CachedTypeCheck {
     auto* type = Py_TYPE(o);
 
     {
-      mutex_lock l(type_to_sequence_map_mu_);
+      tf_shared_lock l(type_to_sequence_map_mu_);
       auto it = type_to_sequence_map_.find(type);
       if (it != type_to_sequence_map_.end()) {
         return it->second;
@@ -195,7 +195,12 @@ class CachedTypeCheck {
       mutex_lock l(type_to_sequence_map_mu_);
       if (type_to_sequence_map_.size() < kMaxItemsInCache) {
         Py_INCREF(type);
-        type_to_sequence_map_.insert({type, check_result});
+        auto insert_result = type_to_sequence_map_.insert({type, check_result});
+        if (!insert_result.second) {
+          // The type was added to the cache by a concurrent thread after we
+          // looked it up above.
+          Py_DECREF(type);
+        }
       }
     }
 
diff --git a/tensorflow/stream_executor/BUILD b/tensorflow/stream_executor/BUILD
index 5c9d85acf4ee3a912ace5753ed891b3929d05478..4c764a7b099010a980c007c5cdff7f20f7ba2106 100644
--- a/tensorflow/stream_executor/BUILD
+++ b/tensorflow/stream_executor/BUILD
@@ -1,6 +1,8 @@
 licenses(["restricted"])
 
 load("@local_config_cuda//cuda:build_defs.bzl", "if_cuda_is_configured")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_proto_library")
+load("//tensorflow/core:platform/default/build_config.bzl", "tf_additional_all_protos")
 load("//tensorflow/core:platform/default/build_config_root.bzl", "if_static")
 load("//tensorflow:tensorflow.bzl", "cc_header_only_library")
 
@@ -13,6 +15,14 @@ STREAM_EXECUTOR_HEADERS = glob([
     "platform/**/*.h",
 ])
 
+tf_proto_library(
+    name = "dnn_proto",
+    srcs = ["dnn.proto"],
+    cc_api_version = 2,
+    default_header = True,
+    protodeps = tf_additional_all_protos(),
+)
+
 cc_library(
     name = "stream_executor_impl",
     srcs = glob(
@@ -35,6 +45,7 @@ cc_library(
     }),
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc_impl",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/container:flat_hash_map",
@@ -51,6 +62,7 @@ cc_library(
     hdrs = STREAM_EXECUTOR_HEADERS,
     visibility = ["//visibility:public"],
     deps = [
+        ":dnn_proto_cc",
         "//tensorflow/core:lib",
         "//tensorflow/core:ptr_util",
         "@com_google_absl//absl/strings",
@@ -96,11 +108,8 @@ cc_library(
         "@local_config_cuda//cuda:cuda_headers",
     ] + if_cuda_is_configured([
         "//tensorflow/core:cuda",
-        "@local_config_cuda//cuda:cublas",
         "@local_config_cuda//cuda:cuda_driver",
         "@local_config_cuda//cuda:cudnn",
-        "@local_config_cuda//cuda:cufft",
-        "@local_config_cuda//cuda:curand",
     ]),
     alwayslink = 1,
 )
diff --git a/tensorflow/stream_executor/cuda/cuda_blas.cc b/tensorflow/stream_executor/cuda/cuda_blas.cc
index 7fabb35e28ce5a4107882a8739c1c0d641e05828..957f6c98da564500f81d7185ce6a151003549ee5 100644
--- a/tensorflow/stream_executor/cuda/cuda_blas.cc
+++ b/tensorflow/stream_executor/cuda/cuda_blas.cc
@@ -58,6 +58,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/cuda/cuda_timer.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -76,21 +81,8 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuBlasPlugin);
 
 namespace wrap {
 
-#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
-  struct WrapperShim__##__name {                                    \
-    static const char *kName;                                       \
-    template <typename... Args>                                     \
-    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
-      cuda::ScopedActivateExecutorContext sac{parent};              \
-      return ::__name(args...);                                     \
-    }                                                               \
-  } __name;                                                         \
-  const char *WrapperShim__##__name::kName = #__name;
-
-#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
-  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
-
-#define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
+// clang-format off
+#define CUBLAS_ROUTINE_EACH(__macro)      \
   __macro(cublasSnrm2)                    \
   __macro(cublasDnrm2)                    \
   __macro(cublasScnrm2)                   \
@@ -262,6 +254,58 @@ namespace wrap {
   __macro(cublasCdgmm)                    \
   __macro(cublasZdgmm)
 
+// clang-format off
+
+#ifdef PLATFORM_GOOGLE
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                         \
+  struct WrapperShim__##__name {                                    \
+    static const char *kName;                                       \
+    template <typename... Args>                                     \
+    cublasStatus_t operator()(CUDAExecutor *parent, Args... args) { \
+      cuda::ScopedActivateExecutorContext sac{parent};              \
+      return ::__name(args...);                                     \
+    }                                                               \
+  } __name;                                                         \
+  const char *WrapperShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#else
+
+#define STREAM_EXECUTOR_CUBLAS_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char* kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void* GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCublasDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void* f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cublas DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cublasStatus_t operator()(CUDAExecutor* parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char* DynLoadShim__##__name::kName = #__name;
+
+#define STREAM_EXECUTOR_CUBLAS_V2_WRAP(__name) \
+  STREAM_EXECUTOR_CUBLAS_WRAP(__name)
+
+#endif
+
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasCreate)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasDestroy)
 STREAM_EXECUTOR_CUBLAS_V2_WRAP(cublasSetStream)
@@ -271,7 +315,7 @@ STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasDgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasCgemmBatched)
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasZgemmBatched)
-CUBLAS_BLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
+CUBLAS_ROUTINE_EACH(STREAM_EXECUTOR_CUBLAS_V2_WRAP)
 
 #if CUDA_VERSION >= 7050
 STREAM_EXECUTOR_CUBLAS_WRAP(cublasSgemmEx)
@@ -424,7 +468,8 @@ class ScopedCublasMathMode {
   // Note that when false is returned, an appropriate error has already been
   // logged.
   bool Init(cublasMath_t new_mode) {
-    cublasStatus_t ret = wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
+    cublasStatus_t ret =
+        wrap::cublasGetMathMode(parent_, handle_, &old_mode_);
     if (ret != CUBLAS_STATUS_SUCCESS) {
       LOG(ERROR) << "failed to get old cublas math mode: " << ToString(ret);
       return ok_ = false;
@@ -442,7 +487,8 @@ class ScopedCublasMathMode {
   // successful in the first place.
   ~ScopedCublasMathMode() {
     if (ok_) {
-      cublasStatus_t ret = wrap::cublasSetMathMode(parent_, handle_, old_mode_);
+      cublasStatus_t ret =
+          wrap::cublasSetMathMode(parent_, handle_, old_mode_);
       if (ret != CUBLAS_STATUS_SUCCESS) {
         LOG(ERROR) << "failed to set former cublas math mode: "
                    << ToString(ret);
@@ -675,16 +721,16 @@ bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAsum(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDzasum, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDzasum, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasAxpy(Stream *stream, uint64 elem_count, float alpha,
@@ -835,16 +881,16 @@ bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<float>> &x, int incx,
                           DeviceMemory<float> *result) {
   return DoBlasInternal(
-      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasScnrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasNrm2(Stream *stream, uint64 elem_count,
                           const DeviceMemory<std::complex<double>> &x, int incx,
                           DeviceMemory<double> *result) {
   return DoBlasInternal(
-      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasDznrm2, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasRot(Stream *stream, uint64 elem_count,
@@ -1060,48 +1106,48 @@ bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamax(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamax, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamax, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<float> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIsamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIsamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<double> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIdamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIdamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<float>> &x, int incx,
                            DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIcamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIcamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasIamin(Stream *stream, uint64 elem_count,
                            const DeviceMemory<std::complex<double>> &x,
                            int incx, DeviceMemory<int> *result) {
   return DoBlasInternal(
-      wrap::cublasIzamin, stream, false /* = pointer_mode_host */, elem_count,
-      CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
+      wrap::cublasIzamin, stream, false /* = pointer_mode_host */,
+      elem_count, CUDAComplex(CUDAMemory(x)), incx, CUDAMemoryMutable(result));
 }
 
 bool CUDABlas::DoBlasGbmv(Stream *stream, blas::Transpose trans, uint64 m,
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.cc b/tensorflow/stream_executor/cuda/cuda_dnn.cc
index 19397c7dbf21c35466ca04371b437879e7da2403..1f2e2f48bbddf5f638135129e502cfe233d5952f 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.cc
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.cc
@@ -132,43 +132,6 @@ string ToString(cudnnStatus_t status) {
   }
 }
 
-template <typename T>
-cudnnDataType_t GetCudnnDataType(
-    dnn::DataLayout = dnn::DataLayout::kBatchDepthYX);
-
-template <>
-cudnnDataType_t GetCudnnDataType<double>(dnn::DataLayout) {
-  return CUDNN_DATA_DOUBLE;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<float>(dnn::DataLayout) {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<Eigen::half>(dnn::DataLayout) {
-  return CUDNN_DATA_HALF;
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int8>(dnn::DataLayout layout) {
-  switch (layout) {
-    case dnn::DataLayout::kYXDepthBatch:
-    case dnn::DataLayout::kYXBatchDepth:
-    case dnn::DataLayout::kBatchYXDepth:
-    case dnn::DataLayout::kBatchDepthYX:
-      return CUDNN_DATA_INT8;
-    case dnn::DataLayout::kBatchDepthYX4:
-      return CUDNN_DATA_INT8x4;
-  }
-}
-
-template <>
-cudnnDataType_t GetCudnnDataType<int32>(dnn::DataLayout) {
-  return CUDNN_DATA_INT32;
-}
-
 // RAII wrapper for all calls to cuDNN with a cuDNN handle argument.
 //
 // See CudnnAccess::GetHandle() for details.
@@ -685,10 +648,10 @@ class CudnnConvolutionDescriptor {
     CHECK_CUDNN_OK(cudnnSetConvolutionNdDescriptor(
         handle_.get(), convolution_descriptor.ndims(), padding.data(),
         strides.data(), dilations.data(),
-        // NOTE(keveman): cuDNN supports convolution and cross correlation.
-        // However, almost all the use cases do cross correlation, so just
-        // hard coding it here.
-        CUDNN_CROSS_CORRELATION, data_type));
+        convolution_descriptor.convolution_not_crosscorr()
+            ? CUDNN_CONVOLUTION
+            : CUDNN_CROSS_CORRELATION,
+        data_type));
 
     // NOTE(benbarsdell): This only applies if tensor op math is enabled
     //                      and algo selection is set to Default.
@@ -861,11 +824,19 @@ cudnnDataType_t ToCudnnDataType(
     case dnn::DataType::kInt8:
       return data_layout == dnn::DataLayout::kBatchDepthYX4 ? CUDNN_DATA_INT8x4
                                                             : CUDNN_DATA_INT8;
+    case dnn::DataType::kInt32:
+      return CUDNN_DATA_INT32;
     default:
       LOG(FATAL) << "Invalid DNN data type: " << static_cast<int>(data_type);
   }
 }
 
+template <typename T>
+cudnnDataType_t GetCudnnDataType(
+    dnn::DataLayout data_layout = dnn::DataLayout::kBatchDepthYX) {
+  return ToCudnnDataType(dnn::ToDataType<T>::value, data_layout);
+}
+
 cudnnRNNInputMode_t ToCudnnRnnInputMode(dnn::RnnInputMode input_mode) {
   switch (input_mode) {
     case dnn::RnnInputMode::kRnnLinearSkip:
@@ -2345,27 +2316,6 @@ struct ConvDoFP32ComputationFP16Input {
   static constexpr bool kDefaultFlag = true;
 };
 
-// A group of helper functions to return the internal compute type for
-// convolutions in cudnn.
-template <typename T>
-cudnnDataType_t GetConvComputeType() {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<Eigen::half>() {
-  if (CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()) {
-    return CUDNN_DATA_FLOAT;
-  } else {
-    return CUDNN_DATA_HALF;
-  }
-}
-
-template <>
-cudnnDataType_t GetConvComputeType<double>() {
-  return CUDNN_DATA_DOUBLE;
-}
-
 // A helper struct to decide whether to use FP32 as the internal compute type
 // for rnn when the input data type is FP16. At present it is turned off,
 // users can explicitly control them through an env-var
@@ -2437,7 +2387,7 @@ port::Status CudnnSupport::DoConvolveImpl(
     const DeviceMemory<T>& filter_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& output_descriptor, DeviceMemory<T>* output_data,
-    ScratchAllocator* scratch_allocator,
+    dnn::DataType accumulator_type, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -2445,7 +2395,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   CudnnTensorDescriptor output_nd(output_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
   // Alpha is the scaling factor for input.
@@ -2536,8 +2486,7 @@ port::Status CudnnSupport::DoConvolveImpl(
   return port::Status::OK();
 }
 
-template <typename AccumulatorType, typename ElementType, typename BiasType,
-          typename ScaleType>
+template <typename ElementType, typename BiasType, typename ScaleType>
 port::Status CudnnSupport::DoFusedConvolveImpl(
     Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
     const DeviceMemory<ElementType>& conv_input_data,
@@ -2548,7 +2497,8 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
     ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
     const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
     const dnn::BatchDescriptor& output_descriptor,
-    DeviceMemory<ElementType>* output_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   if (activation_mode != dnn::ActivationMode::kRelu &&
@@ -2569,7 +2519,7 @@ port::Status CudnnSupport::DoFusedConvolveImpl(
       GetCudnnDataType<ElementType>(conv_input_descriptor.layout()));
   CudnnTensorDescriptor bias_nd(bias_descriptor, GetCudnnDataType<BiasType>());
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetCudnnDataType<AccumulatorType>());
+                                  ToCudnnDataType(accumulator_type));
 
   auto cudnn = cudnn_->GetHandle(parent_, stream);
 
@@ -2938,10 +2888,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<float>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kFloat, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2956,10 +2906,10 @@ bool CudnnSupport::DoConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveImpl<double>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, dnn::DataType::kDouble, scratch_allocator,
+                     algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2973,11 +2923,15 @@ bool CudnnSupport::DoConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveImpl<Eigen::half>(
-          stream, batch_descriptor, input_data, filter_descriptor, filter_data,
-          convolution_descriptor, output_descriptor, output_data,
-          scratch_allocator, algorithm_config, output_profile_result),
+      DoConvolveImpl(stream, batch_descriptor, input_data, filter_descriptor,
+                     filter_data, convolution_descriptor, output_descriptor,
+                     output_data, acc_type, scratch_allocator, algorithm_config,
+                     output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -2995,12 +2949,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<double>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kDouble, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3018,12 +2973,13 @@ bool CudnnSupport::DoFusedConvolve(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kFloat, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3041,13 +2997,17 @@ bool CudnnSupport::DoFusedConvolve(
     DeviceMemory<Eigen::half>* output_data, ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoFusedConvolveImpl<float>(
+      DoFusedConvolveImpl(
           stream, conv_input_descriptor, conv_input_data, conv_input_scale,
           filter_descriptor, filter_data, convolution_descriptor,
           side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+          activation_mode, output_descriptor, output_data, acc_type,
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3073,12 +3033,13 @@ bool CudnnSupport::DoFusedConvolve(
     return false;
   }
   return IsStatusOk(
-      DoFusedConvolveImpl<int32>(
-          stream, conv_input_descriptor, conv_input_data, conv_input_scale,
-          filter_descriptor, filter_data, convolution_descriptor,
-          side_input_data, side_input_scale, bias_descriptor, biases,
-          activation_mode, output_descriptor, output_data, scratch_allocator,
-          algorithm_config, output_profile_result),
+      DoFusedConvolveImpl(stream, conv_input_descriptor, conv_input_data,
+                          conv_input_scale, filter_descriptor, filter_data,
+                          convolution_descriptor, side_input_data,
+                          side_input_scale, bias_descriptor, biases,
+                          activation_mode, output_descriptor, output_data,
+                          dnn::DataType::kInt32, scratch_allocator,
+                          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3112,7 +3073,8 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::BatchDescriptor& input_descriptor,
-    DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3133,7 +3095,7 @@ port::Status CudnnSupport::DoConvolveBackwardDataImpl(
   CudnnTensorDescriptor in_back_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3213,11 +3175,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kDouble, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3233,11 +3195,11 @@ bool CudnnSupport::DoConvolveBackwardData(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, dnn::DataType::kFloat, scratch_allocator,
+          algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3252,12 +3214,16 @@ bool CudnnSupport::DoConvolveBackwardData(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardDataImpl(stream, filter_descriptor, filter_data,
-                                 output_descriptor, backward_output_data,
-                                 convolution_descriptor, input_descriptor,
-                                 backward_input_data, scratch_allocator,
-                                 algorithm_config, output_profile_result),
+      DoConvolveBackwardDataImpl(
+          stream, filter_descriptor, filter_data, output_descriptor,
+          backward_output_data, convolution_descriptor, input_descriptor,
+          backward_input_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3269,7 +3235,8 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
     DeviceMemory<T> backward_output_data,
     const dnn::ConvolutionDescriptor& convolution_descriptor,
     const dnn::FilterDescriptor& filter_descriptor,
-    DeviceMemory<T>* backward_filter_data, ScratchAllocator* scratch_allocator,
+    DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
+    ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   cudnnDataType_t cudnn_type = GetCudnnDataType<T>();
@@ -3290,7 +3257,7 @@ port::Status CudnnSupport::DoConvolveBackwardFilterImpl(
   CudnnTensorDescriptor input_nd(input_descriptor, cudnn_type);
   CudnnFilterDescriptor filter(filter_descriptor, cudnn_type);
   CudnnConvolutionDescriptor conv(convolution_descriptor,
-                                  GetConvComputeType<T>());
+                                  ToCudnnDataType(accumulator_type));
 
   const bool is_profiling = output_profile_result != nullptr;
 
@@ -3406,11 +3373,12 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, dnn::DataType::kDouble,
+
+          scratch_allocator, algorithm_config, output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
@@ -3425,13 +3393,14 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
-  return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
-      /*report_error=*/!output_profile_result);
+  return IsStatusOk(DoConvolveBackwardFilterImpl(
+                        stream, input_descriptor, input_data, output_descriptor,
+                        backward_output_data, convolution_descriptor,
+                        filter_descriptor, backward_filter_data,
+
+                        dnn::DataType::kFloat, scratch_allocator,
+                        algorithm_config, output_profile_result),
+                    /*report_error=*/!output_profile_result);
 }
 
 bool CudnnSupport::DoConvolveBackwardFilter(
@@ -3445,12 +3414,16 @@ bool CudnnSupport::DoConvolveBackwardFilter(
     ScratchAllocator* scratch_allocator,
     const dnn::AlgorithmConfig& algorithm_config,
     dnn::ProfileResult* output_profile_result) {
+  dnn::DataType acc_type =
+      CudnnEnvVar<ConvDoFP32ComputationFP16Input>::IsEnabled()
+          ? dnn::DataType::kFloat
+          : dnn::DataType::kHalf;
   return IsStatusOk(
-      DoConvolveBackwardFilterImpl(stream, input_descriptor, input_data,
-                                   output_descriptor, backward_output_data,
-                                   convolution_descriptor, filter_descriptor,
-                                   backward_filter_data, scratch_allocator,
-                                   algorithm_config, output_profile_result),
+      DoConvolveBackwardFilterImpl(
+          stream, input_descriptor, input_data, output_descriptor,
+          backward_output_data, convolution_descriptor, filter_descriptor,
+          backward_filter_data, acc_type, scratch_allocator, algorithm_config,
+          output_profile_result),
       /*report_error=*/!output_profile_result);
 }
 
diff --git a/tensorflow/stream_executor/cuda/cuda_dnn.h b/tensorflow/stream_executor/cuda/cuda_dnn.h
index 74f6f935b84cfbea27e1e9165b5f7241f74a9cbb..0641be140d2f19651696b0bcac498870a4db2960 100644
--- a/tensorflow/stream_executor/cuda/cuda_dnn.h
+++ b/tensorflow/stream_executor/cuda/cuda_dnn.h
@@ -670,12 +670,12 @@ class CudnnSupport : public dnn::DnnSupport {
       const DeviceMemory<T>& filter_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<T>* output_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* output_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
-  template <typename AccumulatorType, typename ElementType, typename BiasType,
-            typename ScaleType>
+  template <typename ElementType, typename BiasType, typename ScaleType>
   port::Status DoFusedConvolveImpl(
       Stream* stream, const dnn::BatchDescriptor& conv_input_descriptor,
       const DeviceMemory<ElementType>& conv_input_data,
@@ -687,7 +687,7 @@ class CudnnSupport : public dnn::DnnSupport {
       ScaleType side_input_scale, const dnn::BatchDescriptor& bias_descriptor,
       const DeviceMemory<BiasType>& biases, dnn::ActivationMode activation_mode,
       const dnn::BatchDescriptor& output_descriptor,
-      DeviceMemory<ElementType>* output_data,
+      DeviceMemory<ElementType>* output_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
@@ -700,7 +700,8 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::BatchDescriptor& input_descriptor,
-      DeviceMemory<T>* backward_input_data, ScratchAllocator* scratch_allocator,
+      DeviceMemory<T>* backward_input_data, dnn::DataType accumulator_type,
+      ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
 
@@ -712,7 +713,7 @@ class CudnnSupport : public dnn::DnnSupport {
       DeviceMemory<T> backward_output_data,
       const dnn::ConvolutionDescriptor& convolution_descriptor,
       const dnn::FilterDescriptor& filter_descriptor,
-      DeviceMemory<T>* backward_filter_data,
+      DeviceMemory<T>* backward_filter_data, dnn::DataType accumulator_type,
       ScratchAllocator* scratch_allocator,
       const dnn::AlgorithmConfig& algorithm_config,
       dnn::ProfileResult* output_profile_result);
diff --git a/tensorflow/stream_executor/cuda/cuda_fft.cc b/tensorflow/stream_executor/cuda/cuda_fft.cc
index cbf388a0f892b0feee0e4f45f67fcb0be5c32537..acac7d6368885537b1f5727779388d550680e90d 100644
--- a/tensorflow/stream_executor/cuda/cuda_fft.cc
+++ b/tensorflow/stream_executor/cuda/cuda_fft.cc
@@ -23,6 +23,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -38,6 +43,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuFftPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 // This macro wraps a global identifier, given by __name, in a callable
 // structure that loads the DLL symbol out of the DSO handle in a thread-safe
 // manner on first use. This dynamic loading technique is used to avoid DSO
@@ -52,22 +58,69 @@ namespace wrap {
     }                                                            \
   } __name;
 
-#define CUFFT_ROUTINE_EACH(__macro)                                            \
-  __macro(cufftDestroy) __macro(cufftSetStream) __macro(cufftPlan1d)           \
-      __macro(cufftPlan2d) __macro(cufftPlan3d) __macro(cufftPlanMany)         \
-          __macro(cufftExecD2Z) __macro(cufftExecZ2D) __macro(cufftExecC2C)    \
-              __macro(cufftExecC2R) __macro(cufftExecZ2Z)                      \
-                  __macro(cufftExecR2C) __macro(cufftCreate)                   \
-                      __macro(cufftSetAutoAllocation)                          \
-                          __macro(cufftSetWorkArea) __macro(cufftGetSize1d)    \
-                              __macro(cufftMakePlan1d) __macro(cufftGetSize2d) \
-                                  __macro(cufftMakePlan2d)                     \
-                                      __macro(cufftGetSize3d)                  \
-                                          __macro(cufftMakePlan3d)             \
-                                              __macro(cufftGetSizeMany)        \
-                                                  __macro(cufftMakePlanMany)
+#else
+
+#define STREAM_EXECUTOR_CUFFT_WRAP(__name)                                \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCufftDsoHandle();            \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in cufft DSO; dlerror: " << s.error_message();   \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    cufftResult operator()(CUDAExecutor *parent, Args... args) {          \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+
+#endif
+
+// clang-format off
+
+#define CUFFT_ROUTINE_EACH(__macro)                                     \
+  __macro(cufftDestroy)                                                 \
+  __macro(cufftSetStream)                                               \
+  __macro(cufftPlan1d)                                                  \
+  __macro(cufftPlan2d)                                                  \
+  __macro(cufftPlan3d)                                                  \
+  __macro(cufftPlanMany)                                                \
+  __macro(cufftExecD2Z)                                                 \
+  __macro(cufftExecZ2D)                                                 \
+  __macro(cufftExecC2C)                                                 \
+  __macro(cufftExecC2R)                                                 \
+  __macro(cufftExecZ2Z)                                                 \
+  __macro(cufftExecR2C)                                                 \
+  __macro(cufftCreate)                                                  \
+  __macro(cufftSetAutoAllocation)                                       \
+  __macro(cufftSetWorkArea)                                             \
+  __macro(cufftGetSize1d)                                               \
+  __macro(cufftMakePlan1d)                                              \
+  __macro(cufftGetSize2d)                                               \
+  __macro(cufftMakePlan2d)                                              \
+  __macro(cufftGetSize3d)                                               \
+  __macro(cufftMakePlan3d)                                              \
+  __macro(cufftGetSizeMany)                                             \
+  __macro(cufftMakePlanMany)
+
+// clang-format on
 
 CUFFT_ROUTINE_EACH(STREAM_EXECUTOR_CUFFT_WRAP)
+#undef CUFFT_ROUTINE_EACH
 
 }  // namespace wrap
 
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
index ad9154226c4634bb5a54c97c7b4abbf64d4d8c37..4874d096ad54fa352fd6e9ad3b7b87c1fff59f73 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.cc
@@ -662,8 +662,13 @@ bool CUDAExecutor::MemcpyDeviceToDevice(Stream *stream,
 }
 
 bool CUDAExecutor::HostCallback(Stream *stream,
-                                std::function<void()> callback) {
-  auto callback_ptr = new std::function<void()>(callback);
+                                std::function<port::Status()> callback) {
+  auto callback_ptr = new std::function<void()>([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return CUDADriver::AddStreamCallback(context_, AsCUDAStreamValue(stream),
                                        InternalHostCallback, callback_ptr);
 }
diff --git a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
index 90bf1c0242fb24ae38421864124e3fc8a1caef59..ae8e4abf92024626bf3d2bd3d334244708f55737 100644
--- a/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
+++ b/tensorflow/stream_executor/cuda/cuda_gpu_executor.h
@@ -148,7 +148,8 @@ class CUDAExecutor : public internal::StreamExecutorInterface {
                             const DeviceMemoryBase &gpu_src,
                             uint64 size) override;
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   bool AllocateStream(Stream *stream) override;
 
diff --git a/tensorflow/stream_executor/cuda/cuda_rng.cc b/tensorflow/stream_executor/cuda/cuda_rng.cc
index 88c4f15792737aac8dfafefba4c7fce74c434320..7f920719321637360fdf5c098e83dfaa49164e6c 100644
--- a/tensorflow/stream_executor/cuda/cuda_rng.cc
+++ b/tensorflow/stream_executor/cuda/cuda_rng.cc
@@ -21,6 +21,11 @@ limitations under the License.
 #include "tensorflow/stream_executor/cuda/cuda_platform_id.h"
 #include "tensorflow/stream_executor/cuda/cuda_stream.h"
 #include "tensorflow/stream_executor/device_memory.h"
+
+#ifndef PLATFORM_GOOGLE
+#include "tensorflow/stream_executor/dso_loader.h"
+#endif
+
 #include "tensorflow/stream_executor/lib/env.h"
 #include "tensorflow/stream_executor/lib/initialize.h"
 #include "tensorflow/stream_executor/lib/status.h"
@@ -61,6 +66,7 @@ PLUGIN_REGISTRY_DEFINE_PLUGIN_ID(kCuRandPlugin);
 
 namespace wrap {
 
+#ifdef PLATFORM_GOOGLE
 #define STREAM_EXECUTOR_CURAND_WRAP(__name)                         \
   struct WrapperShim__##__name {                                    \
     template <typename... Args>                                     \
@@ -70,6 +76,36 @@ namespace wrap {
     }                                                               \
   } __name;
 
+#else
+#define STREAM_EXECUTOR_CURAND_WRAP(__name)                               \
+  struct DynLoadShim__##__name {                                          \
+    static const char *kName;                                             \
+    using FuncPtrT = std::add_pointer<decltype(::__name)>::type;          \
+    static void *GetDsoHandle() {                                         \
+      auto s = internal::CachedDsoLoader::GetCurandDsoHandle();           \
+      return s.ValueOrDie();                                              \
+    }                                                                     \
+    static FuncPtrT LoadOrDie() {                                         \
+      void *f;                                                            \
+      auto s = port::Env::Default()->GetSymbolFromLibrary(GetDsoHandle(), \
+                                                          kName, &f);     \
+      CHECK(s.ok()) << "could not find " << kName                         \
+                    << " in curand DSO; dlerror: " << s.error_message();  \
+      return reinterpret_cast<FuncPtrT>(f);                               \
+    }                                                                     \
+    static FuncPtrT DynLoad() {                                           \
+      static FuncPtrT f = LoadOrDie();                                    \
+      return f;                                                           \
+    }                                                                     \
+    template <typename... Args>                                           \
+    curandStatus_t operator()(CUDAExecutor *parent, Args... args) {       \
+      cuda::ScopedActivateExecutorContext sac{parent};                    \
+      return DynLoad()(args...);                                          \
+    }                                                                     \
+  } __name;                                                               \
+  const char *DynLoadShim__##__name::kName = #__name;
+#endif
+
 STREAM_EXECUTOR_CURAND_WRAP(curandCreateGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandDestroyGenerator);
 STREAM_EXECUTOR_CURAND_WRAP(curandSetStream);
diff --git a/tensorflow/stream_executor/device_description.cc b/tensorflow/stream_executor/device_description.cc
index 4120e230dbffb132f194e4d787ae9fd4e44be696..0b991b7ba8cdad7f342adc6c8ff25b88d91e2bd2 100644
--- a/tensorflow/stream_executor/device_description.cc
+++ b/tensorflow/stream_executor/device_description.cc
@@ -140,21 +140,11 @@ void CalculateDimensionality(const DeviceDescription &device_description,
                              uint64 element_count, uint64 *threads_per_block,
                              uint64 *block_count) {
   *threads_per_block = device_description.threads_per_block_limit();
-  *block_count = DivideCeil(element_count, *threads_per_block);
+  *block_count = port::MathUtil::CeilOfRatio(element_count, *threads_per_block);
   if (*block_count == 1) {
     CHECK_LE(element_count, *threads_per_block);
     *threads_per_block = element_count;
   }
 }
 
-// Round value up to a multiple of n.
-static uint64 RoundUp(uint64 value, uint64 n) {
-  return port::MathUtil::CeilOfRatio(value, n) * n;
-}
-
-// Round value down to a multiple of n.
-static uint64 RoundDown(uint64 value, uint64 n) {
-  return port::MathUtil::FloorOfRatio(value, n) * n;
-}
-
 }  // namespace stream_executor
diff --git a/tensorflow/stream_executor/dnn.cc b/tensorflow/stream_executor/dnn.cc
index 3d8e691ab28c1b422a2e3e7cd2f0bbf1e1b80b91..faa662211ebb366b8e20cdc3e33ca651c64cf73a 100644
--- a/tensorflow/stream_executor/dnn.cc
+++ b/tensorflow/stream_executor/dnn.cc
@@ -23,7 +23,7 @@ namespace stream_executor {
 namespace dnn {
 
 uint64 AlgorithmDesc::hash() const {
-  return ::tensorflow::Hash64Combine(algo_, tensor_ops_enabled_);
+  return ::tensorflow::Hash64Combine(algo_id(), tensor_ops_enabled());
 }
 
 bool DnnSupport::GetConvolveAlgorithms(
@@ -187,6 +187,9 @@ std::tuple<int, int, int> GetDimIndices(const DataLayout& layout,
       batch_idx = 0;
       spatial_idx = 2;
       break;
+
+    default:
+      LOG(FATAL) << "Unknown layout " << layout;
   }
 
   return std::make_tuple(depth_idx, batch_idx, spatial_idx);
@@ -233,28 +236,27 @@ string AlgorithmConfig::ToString() const {
 // -- BatchDescriptor
 
 BatchDescriptor::BatchDescriptor(int ndims)
-    : count_(0),
-      feature_map_count_(0),
-      spatial_size_(ndims, 0),
-      value_max_(0.0),
+    : value_max_(0.0),
       value_min_(0.0),
-      layout_(DataLayout::kYXDepthBatch),
-      ndims_(ndims),
-      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {}
+      quantized_activation_mode_(QuantizedActivationMode::k8Bit) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(DataLayout::kYXDepthBatch);
+}
 
 BatchDescriptor::BatchDescriptor() : BatchDescriptor(/*ndims=*/2) {}
 
 std::vector<int64> BatchDescriptor::full_dims(const DataLayout& layout) const {
-  std::vector<int64> bdyx_dims(ndims_ + 2);
+  std::vector<int64> bdyx_dims(ndims() + 2);
   bdyx_dims[0] = count();
   bdyx_dims[1] = feature_map_count();
-  std::copy(spatial_size_.begin(), spatial_size_.end(), bdyx_dims.begin() + 2);
+  std::copy(spatial_size().begin(), spatial_size().end(),
+            bdyx_dims.begin() + 2);
   return ReorderDims(bdyx_dims, DataLayout::kBatchDepthYX, layout);
 }
 
 std::vector<int64> BatchDescriptor::full_strides(
     const DataLayout& layout) const {
-  if (layout_ == DataLayout::kBatchDepthYX4) {
+  if (this->layout() == DataLayout::kBatchDepthYX4) {
     LOG(FATAL)
         << "Cannot compute full strides for batch descriptor " << ToString()
         << ", because its layout is kBatchDepthYX4. In fact, "
@@ -262,36 +264,32 @@ std::vector<int64> BatchDescriptor::full_strides(
            "Use cudnnSetTensor4DDescriptor to set cudnnTensorDescriptor_t "
            "instead.";
   }
-  std::vector<int64> phys_dims = full_dims(layout_);
+  std::vector<int64> phys_dims = full_dims(this->layout());
   std::vector<int64> phys_strides(phys_dims.size());
-  phys_strides[ndims_ + 1] = 1;
-  for (int i = ndims_; i >= 0; i--) {
+  phys_strides[ndims() + 1] = 1;
+  for (int i = ndims(); i >= 0; i--) {
     phys_strides[i] = phys_strides[i + 1] * phys_dims[i + 1];
   }
-  return ReorderDims(phys_strides, layout_, layout);
+  return ReorderDims(phys_strides, this->layout(), layout);
 }
 
 void BatchDescriptor::CloneFrom(const BatchDescriptor& other) {
-  count_ = other.count_;
-  feature_map_count_ = other.feature_map_count_;
-  spatial_size_ = other.spatial_size_;
+  tensor_ = other.tensor_;
   value_max_ = other.value_max_;
   value_min_ = other.value_min_;
-  layout_ = other.layout_;
-  ndims_ = other.ndims_;
   quantized_activation_mode_ = other.quantized_activation_mode_;
 }
 
 string BatchDescriptor::ToString() const {
   string spatial;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
   return port::Printf(
       "{count: %lld feature_map_count: %lld spatial: %s "
       "value_min: %f value_max: %f layout: %s}",
-      count_, feature_map_count_, spatial.c_str(), value_min_, value_max_,
-      DataLayoutString(layout_).c_str());
+      count(), feature_map_count(), spatial.c_str(), value_min_, value_max_,
+      DataLayoutString(layout()).c_str());
 }
 
 string BatchDescriptor::ToShortString() const {
@@ -302,8 +300,8 @@ string BatchDescriptor::ToShortString() const {
   string batch = absl::StrCat("b", count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", spatial_size_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", spatial_size()[i]);
   }
 
   string suffix;
@@ -333,18 +331,18 @@ string BatchDescriptor::ToShortString() const {
 
 int64 BatchDescriptor::NodesPerFeatureMap() const {
   int64 ret = 1;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= spatial_size_[i];
+  for (int i = 0; i < ndims(); i++) {
+    ret *= spatial_size()[i];
   }
   return ret;
 }
 
 int64 BatchDescriptor::NodesAcrossFeatureMaps() const {
-  return NodesPerFeatureMap() * feature_map_count_;
+  return NodesPerFeatureMap() * feature_map_count();
 }
 
 int64 BatchDescriptor::ElementCount() const {
-  return count_ * feature_map_count_ * NodesPerFeatureMap();
+  return count() * feature_map_count() * NodesPerFeatureMap();
 }
 
 int64 BatchDescriptor::FullyConnectedWeightCount(
@@ -372,33 +370,27 @@ BatchDescriptor BatchDescriptor::DepthConcatenateOutputDescriptor(
 
 // -- FilterDescriptor
 
-FilterDescriptor::FilterDescriptor(int ndims)
-    : output_feature_map_count_(0),
-      input_feature_map_count_(0),
-      input_filter_dims_(ndims, 0),
-      ndims_(ndims),
-      layout_(FilterLayout::kOutputInputYX) {}
+FilterDescriptor::FilterDescriptor(int ndims) {
+  tensor_.mutable_dimensions()->Resize(ndims + 2, 0);
+  set_layout(FilterLayout::kOutputInputYX);
+}
 
 FilterDescriptor::FilterDescriptor() : FilterDescriptor(/*ndims=*/2) {}
 
 FilterDescriptor::~FilterDescriptor() {}
 
 void FilterDescriptor::CloneFrom(const FilterDescriptor& other) {
-  set_output_feature_map_count(other.output_feature_map_count())
-      .set_input_feature_map_count(other.input_feature_map_count())
-      .set_layout(other.layout());
-  input_filter_dims_ = other.input_filter_dims_;
-  ndims_ = other.ndims_;
+  tensor_ = other.tensor_;
 }
 
 string FilterDescriptor::ToString() const {
   string desc = port::Printf(
       "{output_feature_map_count: %lld input_feature_map_count: %lld "
       "layout: %s shape: ",
-      output_feature_map_count_, input_feature_map_count_,
-      FilterLayoutString(layout_).c_str());
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "%lld ", input_filter_dims_[i]);
+      output_feature_map_count(), input_feature_map_count(),
+      FilterLayoutString(layout()).c_str());
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "%lld ", input_filter_dims()[i]);
   }
   absl::StrAppend(&desc, "}");
 
@@ -409,15 +401,15 @@ string FilterDescriptor::ToShortString() const {
   // All the constituent strings are less than 15 characters, so the
   // small string optimization ensures that there will be at most one
   // heap memory allocation.
-  string od = absl::StrCat("od", output_feature_map_count_);
-  string id = absl::StrCat("id", input_feature_map_count_);
+  string od = absl::StrCat("od", output_feature_map_count());
+  string id = absl::StrCat("id", input_feature_map_count());
 
   string spatial = "s";
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&spatial, "%lld ", input_filter_dims_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&spatial, "%lld ", input_filter_dims()[i]);
   }
 
-  switch (layout_) {
+  switch (layout()) {
     case FilterLayout::kOutputInputYX:
       return absl::StrCat(od, id, spatial);
     case FilterLayout::kOutputYXInput:
@@ -429,27 +421,28 @@ string FilterDescriptor::ToShortString() const {
     case FilterLayout::kYXInputOutput:
       return absl::StrCat(spatial, id, od);
     default:
-      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout_);
+      LOG(FATAL) << "Unknown layout " << static_cast<int32>(layout());
       return "";  // Avoid return warning (unreachable)
   }
 }
 
 int64 FilterDescriptor::ComputeWeightCount() const {
-  int64 ret = output_feature_map_count_ * input_feature_map_count_;
-  for (int i = 0; i < ndims_; i++) {
-    ret *= input_filter_dims_[i];
+  int64 ret = output_feature_map_count() * input_feature_map_count();
+  for (int i = 0; i < ndims(); i++) {
+    ret *= input_filter_dims()[i];
   }
   return ret;
 }
 
 // -- ConvolutionDescriptor
 
-ConvolutionDescriptor::ConvolutionDescriptor(int ndims)
-    : zero_padding_(ndims, 0),
-      filter_strides_(ndims, 1),
-      dilation_rates_(ndims, 1),
-      group_count_(1),
-      ndims_(ndims) {}
+ConvolutionDescriptor::ConvolutionDescriptor(int ndims) {
+  proto_.mutable_paddings()->Resize(ndims, 0);
+  proto_.mutable_strides()->Resize(ndims, 1);
+  proto_.mutable_dilations()->Resize(ndims, 1);
+  proto_.set_group_count(1);
+  proto_.set_convolution_mode(ConvolutionMode::CROSS_CORRELATION);
+}
 
 ConvolutionDescriptor::ConvolutionDescriptor()
     : ConvolutionDescriptor(/*ndims=*/2) {}
@@ -460,10 +453,10 @@ string ConvolutionDescriptor::ToString() const {
   string padding;
   string strides;
   string dilations;
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&padding, "%lld ", zero_padding_[i]);
-    port::Appendf(&strides, "%lld ", filter_strides_[i]);
-    port::Appendf(&dilations, "%lld ", dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&padding, "%lld ", this->padding()[i]);
+    port::Appendf(&strides, "%lld ", this->strides()[i]);
+    port::Appendf(&dilations, "%lld ", this->dilations()[i]);
   }
 
   return port::Printf(
@@ -475,15 +468,15 @@ string ConvolutionDescriptor::ToString() const {
 
 string ConvolutionDescriptor::ToShortString() const {
   string desc;
-  for (int i = 0; i < ndims_; i++) {
+  for (int i = 0; i < ndims(); i++) {
     if (i > 0) port::Appendf(&desc, "_");
-    port::Appendf(&desc, "p%d:%lld", i, zero_padding_[i]);
+    port::Appendf(&desc, "p%d:%lld", i, padding()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_s%d:%lld", i, filter_strides_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_s%d:%lld", i, strides()[i]);
   }
-  for (int i = 0; i < ndims_; i++) {
-    port::Appendf(&desc, "_d%d:%lld", i, dilation_rates_[i]);
+  for (int i = 0; i < ndims(); i++) {
+    port::Appendf(&desc, "_d%d:%lld", i, dilations()[i]);
   }
   return desc;
 }
diff --git a/tensorflow/stream_executor/dnn.h b/tensorflow/stream_executor/dnn.h
index c934301829daa2c08a34cb9b112c3ee482d12330..c044a356efb38c333c3153f024092a22fbdf56db 100644
--- a/tensorflow/stream_executor/dnn.h
+++ b/tensorflow/stream_executor/dnn.h
@@ -29,7 +29,9 @@ limitations under the License.
 
 #include "absl/types/optional.h"
 #include "absl/types/span.h"
+#include "tensorflow/core/platform/protobuf.h"
 #include "tensorflow/stream_executor/device_memory.h"
+#include "tensorflow/stream_executor/dnn.pb.h"
 #include "tensorflow/stream_executor/lib/array_slice.h"
 #include "tensorflow/stream_executor/lib/status.h"
 #include "tensorflow/stream_executor/lib/statusor.h"
@@ -48,19 +50,6 @@ class ScratchAllocator;
 
 namespace dnn {
 
-// Describes how an input or output layer's data is formatted.
-// Specify int64 so there's no padding in BatchDescriptor.
-enum class DataLayout : int64 {
-  kYXDepthBatch = 0,  // Same as dist_belief::DF_DEPTH_MAJOR.
-  kYXBatchDepth,      // Same as dist_belief::DF_BATCH_MAJOR.
-  kBatchYXDepth,      // Same as run_brain output, and tensorflow's layout.
-  kBatchDepthYX,      // cuDNN's NCHW layout, data laid out as image, feature
-                      // maps, rows, columns.
-  kBatchDepthYX4,     // cuDNN's NCHW_VECT_C layout, data laid out the same as
-                      // kBatchDepthYX but each element is a vector of 4 feature
-                      // maps.
-};
-
 // Specifies an index to use when accessing specific spatial dimensions.
 enum class DimIndex : int {
   X = 0,
@@ -73,8 +62,27 @@ inline int64 GetDim(absl::Span<const int64> data, DimIndex dim) {
   return data.rbegin()[static_cast<int64>(dim)];
 }
 
+inline void SetDim(absl::Span<int64> data, DimIndex dim, int64 value) {
+  data.rbegin()[static_cast<int64>(dim)] = value;
+}
+
 inline void SetDim(std::vector<int64>* data, DimIndex dim, int64 value) {
-  data->rbegin()[static_cast<int64>(dim)] = value;
+  return SetDim(absl::MakeSpan(*data), dim, value);
+}
+
+// tensorflow::int64 is not the same type as tensorflow::protobuf_int64 in
+// open-source. Wrapper function that gives an int64 array slice view of a
+// repeated int64 protobuf field.
+inline absl::Span<const int64> AsInt64Slice(
+    const tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>& v) {
+  return absl::Span<const int64>(reinterpret_cast<const int64*>(v.data()),
+                                 v.size());
+}
+
+inline absl::Span<int64> AsInt64Slice(
+    tensorflow::protobuf::RepeatedField<tensorflow::protobuf_int64>* v) {
+  return absl::Span<int64>(reinterpret_cast<int64*>(v->mutable_data()),
+                           v->size());
 }
 
 // Returns a string representation of the given data layout.
@@ -87,14 +95,6 @@ enum class QuantizedActivationMode {
   k32Bit = 4,
 };
 
-// Specifies the data type used by an operation.
-enum class DataType {
-  kFloat = 0,
-  kDouble = 1,
-  kHalf = 2,
-  kInt8 = 3,
-};
-
 // A helper class to convert C/C++ types to the proper enums.
 template <typename T>
 struct ToDataType;
@@ -114,6 +114,10 @@ template <>
 struct ToDataType<int8> {
   static constexpr DataType value = DataType::kInt8;
 };
+template <>
+struct ToDataType<int32> {
+  static constexpr DataType value = DataType::kInt32;
+};
 
 // Specifies the types of a RNN model.
 enum class RnnMode {
@@ -245,15 +249,15 @@ class BatchDescriptor {
   string ToShortString() const;
 
   // Accessors.
-  int64 count() const { return count_; }
-  int64 feature_map_count() const { return feature_map_count_; }
-  int64 height() const { return GetDim(spatial_size_, DimIndex::Y); }
-  int64 width() const { return GetDim(spatial_size_, DimIndex::X); }
-  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size_, dim); }
-  int ndims() const { return ndims_; }
+  int64 count() const { return tensor_.dimensions(0); }
+  int64 feature_map_count() const { return tensor_.dimensions(1); }
+  int64 height() const { return GetDim(spatial_size(), DimIndex::Y); }
+  int64 width() const { return GetDim(spatial_size(), DimIndex::X); }
+  int64 spatial_dim(DimIndex dim) const { return GetDim(spatial_size(), dim); }
+  int ndims() const { return spatial_size().size(); }
   float value_max() const { return value_max_; }
   float value_min() const { return value_min_; }
-  DataLayout layout() const { return layout_; }
+  DataLayout layout() const { return tensor_.data_layout(); }
   QuantizedActivationMode quantized_activation_mode() const {
     return quantized_activation_mode_;
   }
@@ -267,23 +271,23 @@ class BatchDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   BatchDescriptor& set_count(int64 value) {
-    count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   BatchDescriptor& set_feature_map_count(int64 value) {
-    feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   BatchDescriptor& set_height(int64 value) {
-    SetDim(&spatial_size_, DimIndex::Y, value);
+    SetDim(spatial_size(), DimIndex::Y, value);
     return *this;
   }
   BatchDescriptor& set_width(int64 value) {
-    SetDim(&spatial_size_, DimIndex::X, value);
+    SetDim(spatial_size(), DimIndex::X, value);
     return *this;
   }
   BatchDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&spatial_size_, dim, value);
+    SetDim(spatial_size(), dim, value);
     return *this;
   }
   BatchDescriptor& set_value_max(float value) {
@@ -295,7 +299,7 @@ class BatchDescriptor {
     return *this;
   }
   BatchDescriptor& set_layout(DataLayout layout) {
-    layout_ = layout;
+    tensor_.set_data_layout(layout);
     return *this;
   }
   BatchDescriptor& set_quantized_activation_mode(
@@ -334,31 +338,20 @@ class BatchDescriptor {
       port::ArraySlice<dnn::BatchDescriptor> inputs);
 
  private:
-  int64 count_;
-  int64 feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> spatial_size_;
+  absl::Span<const int64> spatial_size() const {
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
+  }
+
+  absl::Span<int64> spatial_size() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
   float value_max_;
   float value_min_;
-  DataLayout layout_;
-  int ndims_;
   QuantizedActivationMode quantized_activation_mode_;
 };
 
-// Describes how a filter is laid out in the memory.
-// Specify int64 so there's no padding in FilterDescriptor.
-enum class FilterLayout : int64 {
-  kOutputInputYX = 0,  // cuDNN's default filter layout, laid out as:
-                       // (major) output feature maps >> input feature maps >>
-                       // rows >> columns (minor).
-  kOutputYXInput,      // major to minor:
-                       //   (output features, row, columns, input features)
-  kOutputInputYX4,  // laid out the same as kOutputInputYX but each element is a
-                    // vector of 4 feature maps.
-  kInputYXOutput,   // Same as dist_belief's default filter layout.
-  kYXInputOutput,   // Same as tensorflow's default filter layout.
-};
-
 // Returns a string representation of the given filter layout.
 string FilterLayoutString(FilterLayout layout);
 
@@ -398,30 +391,30 @@ class FilterDescriptor {
 
   // Named-argument helpers for avoiding user error during construction.
   FilterDescriptor& set_output_feature_map_count(int64 value) {
-    output_feature_map_count_ = value;
+    tensor_.set_dimensions(0, value);
     return *this;
   }
   FilterDescriptor& set_input_feature_map_count(int64 value) {
-    input_feature_map_count_ = value;
+    tensor_.set_dimensions(1, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_height(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::Y, value);
+    SetDim(input_filter_dims(), DimIndex::Y, value);
     return *this;
   }
   FilterDescriptor& set_input_filter_width(int64 value) {
-    SetDim(&input_filter_dims_, DimIndex::X, value);
+    SetDim(input_filter_dims(), DimIndex::X, value);
     return *this;
   }
   FilterDescriptor& set_layout(FilterLayout layout) {
-    layout_ = layout;
+    tensor_.set_filter_layout(layout);
     return *this;
   }
   FilterDescriptor& set_spatial_dim(DimIndex dim, int64 value) {
-    SetDim(&input_filter_dims_, dim, value);
+    SetDim(input_filter_dims(), dim, value);
     return *this;
   }
-  int ndims() const { return ndims_; }
+  int ndims() const { return input_filter_dims().size(); }
 
   void CloneFrom(const FilterDescriptor& other);
 
@@ -434,32 +427,32 @@ class FilterDescriptor {
 
   // Returns the number of biases required as parameters for a convolution
   // using this filter descriptor.
-  int64 bias_count() const { return output_feature_map_count_; }
+  int64 bias_count() const { return output_feature_map_count(); }
 
-  int64 output_feature_map_count() const { return output_feature_map_count_; }
-  int64 input_feature_map_count() const { return input_feature_map_count_; }
+  int64 output_feature_map_count() const { return tensor_.dimensions(0); }
+  int64 input_feature_map_count() const { return tensor_.dimensions(1); }
   int64 input_filter_height() const {
-    return GetDim(input_filter_dims_, DimIndex::Y);
+    return GetDim(input_filter_dims(), DimIndex::Y);
   }
   int64 input_filter_width() const {
-    return GetDim(input_filter_dims_, DimIndex::X);
+    return GetDim(input_filter_dims(), DimIndex::X);
   }
   int64 input_filter_dim(DimIndex dim) const {
-    return GetDim(input_filter_dims_, dim);
+    return GetDim(input_filter_dims(), dim);
   }
 
-  FilterLayout layout() const { return layout_; }
+  FilterLayout layout() const { return tensor_.filter_layout(); }
+
   absl::Span<const int64> input_filter_dims() const {
-    return input_filter_dims_;
+    return AsInt64Slice(tensor_.dimensions()).subspan(2);
   }
 
  private:
-  int64 output_feature_map_count_;
-  int64 input_feature_map_count_;
-  // Stored as: ..., y, x.
-  std::vector<int64> input_filter_dims_;
-  int ndims_;
-  FilterLayout layout_;
+  absl::Span<int64> input_filter_dims() {
+    return AsInt64Slice(tensor_.mutable_dimensions()).subspan(2);
+  }
+
+  TensorDescriptorProto tensor_;
 };
 
 // Describes how padding should be aligned when the total number of pad
@@ -500,6 +493,11 @@ std::ostream& operator<<(std::ostream& str, dnn::PadAlignment alignment);
 //   cells between each filter element in the "y dimension".
 // - horizontal_dilation_rate: there will be (horizontal_dilation_rate - 1)
 //   skipped cells between each filter element in the "x dimension".
+// - convolution_not_crosscor: By default (convolution_not_crosscor == false),
+//   we perform cross correlation rather than convolution. With the flag set,
+//   we perform convolution. Convolution and cross correlation are related by
+//   rotating the filter by 180 degrees (or equivalently flipping all spatial
+//   dimensions).
 class ConvolutionDescriptor {
  public:
   // By default construction, there is no zero-padding and the filter stride is
@@ -513,84 +511,102 @@ class ConvolutionDescriptor {
   string ToShortString() const;
 
   ConvolutionDescriptor& set_zero_padding_height(int64 value) {
-    SetDim(&zero_padding_, DimIndex::Y, value);
+    SetDim(padding(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding_width(int64 value) {
-    SetDim(&zero_padding_, DimIndex::X, value);
+    SetDim(padding(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_zero_padding(DimIndex dim, int64 value) {
-    SetDim(&zero_padding_, dim, value);
+    SetDim(padding(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::Y, value);
+    SetDim(strides(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_filter_stride(int64 value) {
-    SetDim(&filter_strides_, DimIndex::X, value);
+    SetDim(strides(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_filter_stride(DimIndex dim, int64 value) {
-    SetDim(&filter_strides_, dim, value);
+    SetDim(strides(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_vertical_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::Y, value);
+    SetDim(dilations(), DimIndex::Y, value);
     return *this;
   }
   ConvolutionDescriptor& set_horizontal_dilation_rate(int64 value) {
-    SetDim(&dilation_rates_, DimIndex::X, value);
+    SetDim(dilations(), DimIndex::X, value);
     return *this;
   }
   ConvolutionDescriptor& set_dilation_rate(DimIndex dim, int64 value) {
-    SetDim(&dilation_rates_, dim, value);
+    SetDim(dilations(), dim, value);
     return *this;
   }
   ConvolutionDescriptor& set_group_count(int group_count) {
-    group_count_ = group_count;
+    proto_.set_group_count(group_count);
     return *this;
   }
-  int64 zero_padding_height() const {
-    return GetDim(zero_padding_, DimIndex::Y);
-  }
-  int64 zero_padding_width() const {
-    return GetDim(zero_padding_, DimIndex::X);
+  ConvolutionDescriptor& set_convolution_not_crosscorr(bool conv) {
+    proto_.set_convolution_mode(conv ? ConvolutionMode::CONVOLUTION
+                                     : ConvolutionMode::CROSS_CORRELATION);
+    return *this;
   }
+  int64 zero_padding_height() const { return GetDim(padding(), DimIndex::Y); }
+  int64 zero_padding_width() const { return GetDim(padding(), DimIndex::X); }
   int64 vertical_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::Y);
+    return GetDim(strides(), DimIndex::Y);
   }
   int64 horizontal_filter_stride() const {
-    return GetDim(filter_strides_, DimIndex::X);
+    return GetDim(strides(), DimIndex::X);
   }
   int64 vertical_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::Y);
+    return GetDim(dilations(), DimIndex::Y);
   }
   int64 horizontal_dilation_rate() const {
-    return GetDim(dilation_rates_, DimIndex::X);
+    return GetDim(dilations(), DimIndex::X);
   }
 
-  int zero_padding(DimIndex dim) const { return GetDim(zero_padding_, dim); }
-  int filter_stride(DimIndex dim) const { return GetDim(filter_strides_, dim); }
-  int dilation_rate(DimIndex dim) const { return GetDim(dilation_rates_, dim); }
+  int zero_padding(DimIndex dim) const { return GetDim(padding(), dim); }
+  int filter_stride(DimIndex dim) const { return GetDim(strides(), dim); }
+  int dilation_rate(DimIndex dim) const { return GetDim(dilations(), dim); }
   // TODO(timshen): remove this function. No users of this class is setting a
   // non-default pad alignment.
   PadAlignment pad_alignment() const { return PadAlignment::kDefault; }
-  int group_count() const { return group_count_; }
-  int ndims() const { return ndims_; }
+  int group_count() const { return proto_.group_count(); }
+  int ndims() const { return padding().size(); }
+  bool convolution_not_crosscorr() const {
+    return proto_.convolution_mode() == ConvolutionMode::CONVOLUTION;
+  }
+
+  absl::Span<const int64> strides() const {
+    return AsInt64Slice(proto_.strides());
+  }
 
-  absl::Span<const int64> strides() const { return filter_strides_; }
-  absl::Span<const int64> dilations() const { return dilation_rates_; }
-  absl::Span<const int64> padding() const { return zero_padding_; }
+  absl::Span<const int64> dilations() const {
+    return AsInt64Slice(proto_.dilations());
+  }
+
+  absl::Span<const int64> padding() const {
+    return AsInt64Slice(proto_.paddings());
+  }
 
  private:
-  // Stored as: .. y, x.
-  std::vector<int64> zero_padding_;
-  std::vector<int64> filter_strides_;
-  std::vector<int64> dilation_rates_;
-  int group_count_;
-  int ndims_;
+  absl::Span<int64> strides() { return AsInt64Slice(proto_.mutable_strides()); }
+
+  absl::Span<int64> dilations() {
+    return AsInt64Slice(proto_.mutable_dilations());
+  }
+
+  absl::Span<int64> padding() {
+    return AsInt64Slice(proto_.mutable_paddings());
+  }
+
+  ConvolutionDescriptorProto proto_;
+
   // TODO(leary) cudnn provides these fields, but need to characterize what
   // their effect is -- they may be boolean rather than integral.
   // int64 upscale_input_x;
@@ -714,21 +730,23 @@ class PoolingDescriptor {
 class AlgorithmDesc {
  public:
   typedef int64 Index;
-  AlgorithmDesc(Index a, bool use_tensor_ops)
-      : algo_(a), tensor_ops_enabled_(use_tensor_ops) {
-    DCHECK_NE(a, -1);
+  AlgorithmDesc(Index a, bool use_tensor_ops) {
+    proto_.set_algo_id(a);
+    proto_.set_math_type(use_tensor_ops ? AlgorithmProto::TENSOR_OP_MATH
+                                        : AlgorithmProto::DEFAULT_MATH);
+  }
+  bool tensor_ops_enabled() const {
+    return proto_.math_type() == AlgorithmProto::TENSOR_OP_MATH;
   }
-  bool tensor_ops_enabled() const { return tensor_ops_enabled_; }
-  Index algo_id() const { return algo_; }
+  Index algo_id() const { return proto_.algo_id(); }
   bool operator==(const AlgorithmDesc& other) const {
-    return this->algo_ == other.algo_ &&
-           this->tensor_ops_enabled_ == other.tensor_ops_enabled_;
+    return algo_id() == other.algo_id() &&
+           tensor_ops_enabled() == other.tensor_ops_enabled();
   }
   uint64 hash() const;
 
  private:
-  Index algo_;
-  bool tensor_ops_enabled_;
+  AlgorithmProto proto_;
 };
 
 // Describes the result from a perf experiment.
@@ -872,24 +890,6 @@ class NormalizeDescriptor {
   int32 segment_size_;
 };
 
-// Describes a kind of non-linearity (threshold-like mathematical function).
-enum class ActivationMode {
-  kNone = 0,
-  kSigmoid,
-  // Rectified linear activation: f(x) = x < 0 ? 0 : x
-  kRelu,
-  // Rectified linear activation, where upper maximum is 6.0.
-  kRelu6,
-  // Rectified linear activation, where upper maximum specified by
-  // BatchDescriptor::value_max().
-  kReluX,
-  kTanh,
-  // Like ReluX, but passes all values in the range [-X,X].
-  kBandPass,
-
-  kNumActivationModes,  // Always in the end.
-};
-
 // Returns a string representation of the given activation mode.
 string ActivationModeString(ActivationMode mode);
 
diff --git a/tensorflow/stream_executor/dnn.proto b/tensorflow/stream_executor/dnn.proto
new file mode 100644
index 0000000000000000000000000000000000000000..56b079c3f5b962636e7c75b46449adca8e13a43e
--- /dev/null
+++ b/tensorflow/stream_executor/dnn.proto
@@ -0,0 +1,103 @@
+// LINT: LEGACY_NAMES
+syntax = "proto3";
+
+package stream_executor.dnn;
+
+// Specifies the data type used by an operation.
+enum DataType {
+  kFloat = 0;
+  kDouble = 1;
+  kHalf = 2;
+  kInt8 = 3;
+  kInt32 = 4;
+}
+
+// Describes how a convolution input or output layer's data is formatted.
+enum DataLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Batch <-> batch, or N
+  // Depth <-> feature, or channel
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kYXDepthBatch = 0;
+  kYXBatchDepth = 1;
+  kBatchYXDepth = 2;   // cuDNN's NHWC layout
+  kBatchDepthYX = 3;   // cuDNN's NCHW layout
+  kBatchDepthYX4 = 4;  // cuDNN's NCHW_VECT_C layout
+}
+
+// Describes how a convolution filter is laid out in the memory.
+enum FilterLayout {
+  // Naming convention:
+  // Y <-> row or height
+  // X <-> column or width
+  // Output <-> output feature, or N
+  // Input <-> input feature, or N
+  // TODO(timshen): turn them into cuDNN names, e.g. kNCHW.
+  kOutputInputYX = 0;   // cuDNN's NCHW layout
+  kOutputYXInput = 1;   // cuDNN's NHWC layout
+  kOutputInputYX4 = 2;  // cuDNN's NCHW_VECT_C layout
+  kInputYXOutput = 3;
+  kYXInputOutput = 4;
+}
+
+// Describes a kind of non-linearity (threshold-like mathematical function).
+enum ActivationMode {
+  kNone = 0;
+  kSigmoid = 1;
+  // Rectified linear activation: f(x) = x < 0 ? 0 : x
+  kRelu = 2;
+  // Rectified linear activation; where upper maximum is 6.0.
+  kRelu6 = 3;
+  // Rectified linear activation; where upper maximum specified by
+  // BatchDescriptor::value_max().
+  kReluX = 4;
+  kTanh = 5;
+  // Like ReluX; but passes all values in the range [-X,X].
+  kBandPass = 6;
+}
+
+// Describe the math definition for the conv op. The popular behavior is
+// actually called cross-correlation in math, despite the operation is often
+// referred as convolution. See cuDNN cudnnConvolutionMode_t.
+enum ConvolutionMode {
+  CROSS_CORRELATION = 0;
+  CONVOLUTION = 1;
+}
+
+// Generic tensor representation.
+message TensorDescriptorProto {
+  repeated int64 dimensions = 1;
+  DataType data_type = 2;
+  oneof layout_oneof {
+    DataLayout data_layout = 3;
+    FilterLayout filter_layout = 4;
+  }
+}
+
+// Generic algorithm representation.
+message AlgorithmProto {
+  enum MathType {
+    DEFAULT_MATH = 0;
+    // The GPU may operate 4x4 matrix FMA.
+    // See cuDNN's documentation for CUDNN_TENSOR_OP_MATH.
+    TENSOR_OP_MATH = 1;
+  }
+  int64 algo_id = 1;
+  MathType math_type = 2;
+}
+
+// Convolution-specific parameters.
+message ConvolutionDescriptorProto {
+  repeated int64 paddings = 1;
+  repeated int64 strides = 2;
+  repeated int64 dilations = 3;
+  // The "accumulator" type. For example, use F32 as an accumulator for F16
+  // convolutions.
+  // See cuDNN's cudnnConvolutionMode_t.
+  DataType compute_mode = 4;
+  // See cuDNN's group count.
+  int32 group_count = 5;
+  ConvolutionMode convolution_mode = 6;
+}
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.cc b/tensorflow/stream_executor/host/host_gpu_executor.cc
index 8adf739b170c42e5aeda5ccf3ea469f2c3cea07c..1396a83dfb1e0217b795d463323aafbcce081e65 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.cc
+++ b/tensorflow/stream_executor/host/host_gpu_executor.cc
@@ -148,8 +148,13 @@ port::Status HostExecutor::SynchronousMemcpyDeviceToDevice(
 }
 
 bool HostExecutor::HostCallback(Stream *stream,
-                                std::function<void()> callback) {
-  AsHostStream(stream)->EnqueueTask(callback);
+                                std::function<port::Status()> callback) {
+  AsHostStream(stream)->EnqueueTask([callback]() {
+    port::Status s = callback();
+    if (!s.ok()) {
+      LOG(WARNING) << "Host callback failed: " << s;
+    }
+  });
   return true;
 }
 
diff --git a/tensorflow/stream_executor/host/host_gpu_executor.h b/tensorflow/stream_executor/host/host_gpu_executor.h
index 7ba1f181015e057b66e7e7287a592d5f2af1ead2..56e3c2aa6a946357b588f84fdd4c2375ee7e50ff 100644
--- a/tensorflow/stream_executor/host/host_gpu_executor.h
+++ b/tensorflow/stream_executor/host/host_gpu_executor.h
@@ -103,7 +103,8 @@ class HostExecutor : public internal::StreamExecutorInterface {
                                                const DeviceMemoryBase &gpu_src,
                                                uint64 size) override;
 
-  bool HostCallback(Stream *stream, std::function<void()> callback) override;
+  bool HostCallback(Stream *stream,
+                    std::function<port::Status()> callback) override;
 
   port::Status AllocateEvent(Event *event) override {
     return port::Status(port::error::UNIMPLEMENTED, "");
diff --git a/tensorflow/stream_executor/stream.cc b/tensorflow/stream_executor/stream.cc
index 5421e4f4a5edc933a9fdbffda81678fab458483a..3edc66cde8045d7f6ae53095e8136d1697fb1d23 100644
--- a/tensorflow/stream_executor/stream.cc
+++ b/tensorflow/stream_executor/stream.cc
@@ -191,8 +191,11 @@ string ToVlogString(dnn::DataType data_type) {
       return "dnn::DataType::kHalf";
     case dnn::DataType::kInt8:
       return "dnn::DataType::kInt8";
+    case dnn::DataType::kInt32:
+      return "dnn::DataType::kInt32";
+    default:
+      return "unknown DataType";
   }
-  return "unknown DataType";
 }
 
 // Used together with PARAM to VLOG calls made to the stream. Intended
diff --git a/tensorflow/stream_executor/stream.h b/tensorflow/stream_executor/stream.h
index e1629b5b3084e6641bcdf80d1de00f33f1c81940..0fc90cf83d6b4e3e0ede84747f8149c1a25289ca 100644
--- a/tensorflow/stream_executor/stream.h
+++ b/tensorflow/stream_executor/stream.h
@@ -2033,9 +2033,20 @@ class Stream {
   // transferred to the caller.
   internal::StreamInterface *implementation() { return implementation_.get(); }
 
+  // Entrains onto the stream a callback to the host (from the device).
+  // Behaves as ThenDoHostCallbackWithStatus below, but the callback should
+  // never fail or its failure is inconsequential.
+  //
+  // This is kept for backward compatibility. Future code should use
+  // ThenDoHostCallbackWithStatus and explicitly return a success status.
+  // TODO(b/112125301): Eventually remove this method.
+  Stream &ThenDoHostCallback(std::function<void()> callback);
+
   // Entrains onto the stream a callback to the host (from the device).
   // Host callbacks block/occupy the stream just as device functions
   // (execute one at a time, block later stream operations).
+  // Whether the callback return status affects the result of BlockHostUntilDone
+  // is platform-dependent.
   //
   // Behavior is undefined when synchronizing using OpenCL user events.
   // Behavior is undefined if host callbacks call device routines or insert
@@ -2043,11 +2054,6 @@ class Stream {
   //
   // On certain platforms, ThenDoHostCallback is expected to have significant
   // negative effects on performance.
-  Stream &ThenDoHostCallback(std::function<void()> callback);
-
-  // Entrains onto the stream a callback to the host (from the device).
-  // Behaves as ThenDoHostCallback above, but returns a Status instead of void.
-  // This overload should be preferred if the callback could fail.
   Stream &ThenDoHostCallbackWithStatus(std::function<port::Status()> callback);
 
   // Returns the StreamExecutor (parent object) associated with this stream.
diff --git a/tensorflow/stream_executor/stream_executor_internal.cc b/tensorflow/stream_executor/stream_executor_internal.cc
index 7df6a361c6810b9a15c97f15704435d145dccb8e..341c6edccd3c1bfd314127c5356f03a15a85e1d3 100644
--- a/tensorflow/stream_executor/stream_executor_internal.cc
+++ b/tensorflow/stream_executor/stream_executor_internal.cc
@@ -36,16 +36,15 @@ StreamExecutorFactory* MakeOpenCLExecutorImplementation() {
 
 StreamExecutorFactory MakeHostExecutorImplementation;
 
-// TODO(b/112125301): Consolodate this down to one implementation of
-// HostCallback, taking a callback that returns a Status.
-bool StreamExecutorInterface::HostCallback(
-    Stream* stream, std::function<port::Status()> callback) {
-  return HostCallback(stream, [callback]() {
-    port::Status s = callback();
-    if (!s.ok()) {
-      LOG(WARNING) << "HostCallback failed: " << s;
-    }
-  });
+// The default implementation just calls the other HostCallback method.
+// It should make all existing code that uses a void() callback still work.
+bool StreamExecutorInterface::HostCallback(Stream* stream,
+                                           std::function<void()> callback) {
+  return HostCallback(
+      stream, std::function<port::Status()>([callback]() -> port::Status {
+        callback();
+        return port::Status::OK();
+      }));
 }
 
 }  // namespace internal
diff --git a/tensorflow/stream_executor/stream_executor_internal.h b/tensorflow/stream_executor/stream_executor_internal.h
index 32f75fd1bc10b412389af71ecd3f6ff112e22144..0c2c33cfca227b2d67fcdc633dd94274a65b92bb 100644
--- a/tensorflow/stream_executor/stream_executor_internal.h
+++ b/tensorflow/stream_executor/stream_executor_internal.h
@@ -237,9 +237,9 @@ class StreamExecutorInterface {
   virtual bool MemcpyDeviceToDevice(Stream *stream, DeviceMemoryBase *gpu_dst,
                                     const DeviceMemoryBase &gpu_src,
                                     uint64 size) = 0;
-  virtual bool HostCallback(Stream *stream, std::function<void()> callback) = 0;
+  virtual bool HostCallback(Stream *stream, std::function<void()> callback);
   virtual bool HostCallback(Stream *stream,
-                            std::function<port::Status()> callback);
+                            std::function<port::Status()> callback) = 0;
   virtual port::Status AllocateEvent(Event *event) = 0;
   virtual port::Status DeallocateEvent(Event *event) = 0;
   virtual port::Status RecordEvent(Stream *stream, Event *event) = 0;
diff --git a/tensorflow/tensorflow.bzl b/tensorflow/tensorflow.bzl
index 8e5ab94b5361d8e7223cdde01941a5976a09970f..ed1de5a31cae98bf5855fde0676162f0264d998e 100644
--- a/tensorflow/tensorflow.bzl
+++ b/tensorflow/tensorflow.bzl
@@ -203,8 +203,12 @@ def if_override_eigen_strong_inline(a):
         "//conditions:default": [],
     })
 
-def if_not_tx2_llvm_or_windows_cuda(a):
-    return if_not_windows_cuda(a)
+def if_nccl(a):
+    return select({
+        "//tensorflow:no_nccl_support": [],
+        "//tensorflow:windows": [],
+        "//conditions:default": a,
+    })
 
 def get_win_copts(is_external = False):
     WINDOWS_COPTS = [
@@ -1307,13 +1311,13 @@ def _py_wrap_cc_impl(ctx):
         ctx.outputs.py_out.dirname,
     ]
     args += ["-l" + f.path for f in ctx.files.swig_includes]
-    args += ["-I" + i for i in swig_include_dirs]
+    args += ["-I" + i for i in swig_include_dirs.to_list()]
     args += [src.path]
     outputs = [ctx.outputs.cc_out, ctx.outputs.py_out]
     ctx.action(
         executable = ctx.executable._swig,
         arguments = args,
-        inputs = list(inputs),
+        inputs = inputs.to_list(),
         outputs = outputs,
         mnemonic = "PythonSwig",
         progress_message = "SWIGing " + src.path,
@@ -1493,7 +1497,7 @@ check_deps = rule(
     },
 )
 
-def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [], **kwargs):
+def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [], copts = [], **kwargs):
     """Helper to build a dynamic library (.so) from the sources containing implementations of custom ops and kernels.
     """
     cuda_deps = [
@@ -1505,12 +1509,18 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         clean_dep("//tensorflow/core:stream_executor_headers_lib"),
     ]
     deps = deps + tf_custom_op_library_additional_deps()
+
+    # Override EIGEN_STRONG_INLINE to inline when
+    # --define=override_eigen_strong_inline=true to avoid long compiling time.
+    # See https://github.com/tensorflow/tensorflow/issues/10521
+    copts = copts + if_override_eigen_strong_inline(["/DEIGEN_STRONG_INLINE=inline"])
+
     if gpu_srcs:
         basename = name.split(".")[0]
         native.cc_library(
             name = basename + "_gpu",
             srcs = gpu_srcs,
-            copts = _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
+            copts = copts + _cuda_copts() + if_tensorrt(["-DGOOGLE_TENSORRT=1"]),
             features = if_cuda(["-use_header_modules"]),
             deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
             **kwargs
@@ -1531,7 +1541,7 @@ def tf_custom_op_library(name, srcs = [], gpu_srcs = [], deps = [], linkopts = [
         srcs = srcs,
         deps = deps + if_cuda_is_configured_compat(cuda_deps) + if_rocm_is_configured(rocm_deps),
         data = if_static([name + "_check_deps"]),
-        copts = tf_copts(is_external = True),
+        copts = copts + tf_copts(is_external = True),
         features = ["windows_export_all_symbols"],
         linkopts = linkopts + select({
             "//conditions:default": [
@@ -2022,3 +2032,6 @@ register_extension_info(
     extension_name = "cc_library_with_android_deps",
     label_regex_for_dep = "{extension_name}",
 )
+
+def tensorflow_opensource_extra_deps():
+    return []
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
index f7491649c22738c625e3f63944f2347358d2e525..a1083d732a1bb1b3212457f445323e5e868ef162 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.-experimental.pbtxt
@@ -20,7 +20,13 @@ tf_proto {
       label: LABEL_OPTIONAL
       type: TYPE_INT32
     }
-    reserved_range {
+    field {
+      name: "use_numa_affinity"
+      number: 5
+      label: LABEL_OPTIONAL
+      type: TYPE_BOOL
+    }
+     reserved_range {
       start: 2
       end: 3
     }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
index 53b532beab344db8cff9d1ccac4821b8f280af67..b505d813509c2049fa6e3f60df553492d6f66613 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-config-proto.pbtxt
@@ -143,6 +143,12 @@ tf_proto {
         label: LABEL_OPTIONAL
         type: TYPE_INT32
       }
+      field {
+        name: "use_numa_affinity"
+        number: 5
+        label: LABEL_OPTIONAL
+        type: TYPE_BOOL
+      }
       reserved_range {
         start: 2
         end: 3
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
index 0a16d6ab92faac1db63470f0aedadf69341be29b..2299a009d3d5335553e1de025c42b23a57592de3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-gradient-tape.pbtxt
@@ -6,10 +6,18 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'persistent\', \'watch_accessed_variables\'], varargs=None, keywords=None, defaults=[\'False\', \'True\'], "
   }
+  member_method {
+    name: "batch_jacobian"
+    argspec: "args=[\'self\', \'target\', \'source\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "gradient"
     argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
   }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
index af7fc9d4efebc62c282bb82f8a71cd0f5cdfb827..62d8ea9208f7f5f031b80be168cedfd538f18a22 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.-variable.pbtxt
@@ -64,6 +64,10 @@ tf_class {
     name: "assign_sub"
     argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "count_up_to"
     argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..f7d388d33d050eac2c9f14682bc7068c745a46bc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-dataset.pbtxt
@@ -1,18 +1,19 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 81358cecbc020061fb77a199afcb4a23862fc364..d73168b070e374a749a00f74b24b77a715d2f37e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
index 9d032d43de1094f212e5f749013f1fac5a898459..72fc2c3a9ee5b985723ce2dba9643ba796362dc7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-options.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.Options"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "experimental_autotune"
@@ -11,47 +12,19 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_hoist_random_uniform"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_and_batch_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_and_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_parallelization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_vectorization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_noop_elimination"
+    name: "experimental_numa_aware"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_numa_aware"
+    name: "experimental_optimization"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_shuffle_and_repeat_fusion"
+    name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_stats"
+    name: "experimental_threading"
     mtype: "<type \'property\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..51224cd6b45f0a1efdfbb3ba6a3ca377d37fd00b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,9 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
index 1c305abf68c2e535b989d89f8331c33afc4cc8b6..a10add1b7e38f9875e699903b3e3c103d73e647e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..71b597c19c512879b8f18b34843b160efecc6bec 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.data.experimental.OptimizationOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..20646e87b5fbe23d89ad31ca632a64bf958339f6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..86c5ff5b0bd7b42d61a92a44c8888852a48677be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,9 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1Adapter\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV1\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
index f423eed42cc2d7115fd50b3ad533f3790736a850..892f8c1fb897dfc8bf4964c118aeb641dffd3caa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-stats-options.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.experimental.StatsOptions"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "aggregator"
@@ -20,6 +21,6 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b5ebf108018e75b6de28287a68a25a03b294b64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.-threading-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.ThreadingOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.threading_options.ThreadingOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "max_intra_op_parallelism"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "private_threadpool_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
index 4c253bb8adf63c7603cd3ffed0a10edb5b76f320..2d115904925eb96164484300baf628d41d3fcff4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.experimental.pbtxt
@@ -12,10 +12,30 @@ tf_module {
     name: "CsvDataset"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OptimizationOptions"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
@@ -24,6 +44,10 @@ tf_module {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
     mtype: "<type \'type\'>"
@@ -36,10 +60,26 @@ tf_module {
     name: "StatsOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ThreadingOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -48,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
@@ -64,6 +108,10 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
+  member_method {
+    name: "filter_for_shard"
+    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -90,7 +138,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
index 509bbae8332fe767a34c14a33d5af1855b3ffdac..aa474680592a1a3996ca3db970b814ba167cd801 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.data.pbtxt
@@ -28,4 +28,12 @@ tf_module {
     name: "experimental"
     mtype: "<type \'module\'>"
   }
+  member_method {
+    name: "make_initializable_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_one_shot_iterator"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..8a7f1e9363b8211d83d39d31da11507cb4c805eb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.debugging.pbtxt
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583cbc66549223e5c954b715e2043efa5417ef18
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,138 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df707e8920e4488ed6b40a7f93f56b5624188c84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77706e57133e1186d9e98fcf9205ed4c91772eda
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9eb73d2c0d9069ec4b818abe1825503f0ea36fc9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,137 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.distribute.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
index 848fc303aa5748348b2aee69ec1e869807327d3d..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.dtypes.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "half"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member {
     name: "int32"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
index 32b84e90ce6ae99e80208905d701d690227a0cf7..ee3a72bfce71d64cc6d780ac7c4e0091ad5f0da9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
index 94933e7ffd6b0f4838f7b2e9254a4056c9cbf245..38b27f735ffa546d46b6e1cb0b2de3de06358184 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.BaselineEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
index db7776b5bf67879cc806bed1b8463b99a082a50e..3874b84d5a6204ee18c520fcdb0042e3175f63bb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index d530c71482a2f4db1b1006283dd86f15fd6c27fe..e138ce936ec73c05f8f790fb63c381e56ae2f654 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 4703c0f561a301c3d7eb609d5ad2c0284ff626be..eae0a292a962680a53d8c683ee2d2b97e24937a6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
index c3037baa8c951ecd9b60267ee7cc8674ead88dbe..f9e1504b494e3863f770df23f9f9a92e004b8713 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.CheckpointSaverHook"
+path: "tensorflow.estimator.CheckpointSaverHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
similarity index 93%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
index 9d3688e565761758e765d00086de8b59dcc3801b..111b7583f2cd005912c7f06d977565cd17f265b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-checkpoint-saver-listener.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.CheckpointSaverListener"
+path: "tensorflow.estimator.CheckpointSaverListener"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
   is_instance: "<type \'object\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
index ce6040d0f279361ea789b54dee489996d9787ea7..b54133b294e3283cb84316dd0f71670e5bf49333 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
index 4635a1544c35cca12caaa51f665141a55fe00d8c..09e0d3819244da026762eb1c4f31d25aba68fd27 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index e85007e16edcf5f7f59768c7c17b9340e15bc7b6..5a1d85a9b1028b164250819203be49700d81b336 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index a23f5daeac4d6690a599d0d92a2cb5ffdc4937c3..e311f96d3dca0007619df2322ee5ca0295c55ac6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNLinearCombinedEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 8a55bb835ff4b118a8b2ee45561f3e29639bab90..db4780e4c0159b7ff553a736c24ac7701eae7f14 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 2c4128ec480cc8302e01aa56b61fdb8a7db35b0a..a44e719099e5edd23e5e4bc4fd7eb0a1c3c5799d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.DNNRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
index 9d270a87ab8fe9788988f9277ad0e652f2b4860a..bff6c86cd75358847430954c690fa021a027dca6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -30,9 +31,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
index 7bec4d032cedc0711ca07049d5d04490e8bc3f30..f24de493f24a363190cd1d323adaa75b32b0d8e3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-feed-fn-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.FeedFnHook"
+path: "tensorflow.estimator.FeedFnHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
index 31cf9aaeb2c640f8db205c0753f20acc75338fe0..6651170ba33f491d5a5342bcd6e6814e1b973832 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-final-ops-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.FinalOpsHook"
+path: "tensorflow.estimator.FinalOpsHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
index 147448618e2df9f71ac794e369b108629e10ce0a..37db48bc64e2f0e955105e8094d51c851c25558b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-global-step-waiter-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.GlobalStepWaiterHook"
+path: "tensorflow.estimator.GlobalStepWaiterHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
index 4b5de2e245090a3dd265a3ab9d062bf8e43169d7..2c8e82517beea5262e19503edfb480d9ec880f9c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-classifier.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.LinearClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
index 3d6b03098aac475e3ebedd0f87128315ca654122..2148374fdee77af7c4bc669f4a1ebd65b45c5e13 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-estimator.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.LinearEstimator"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimator\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
index 0d1510e9ab1371d9cead321cfb6def4fcb417b16..1bdc6124fe93ff903ef55ce586e294cffc43e1be 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-linear-regressor.pbtxt
@@ -2,6 +2,7 @@ path: "tensorflow.estimator.LinearRegressor"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,9 +32,13 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
index 9801c05df181ee65cc8ce0ad2e886566c0145fd5..425f0167a161104891c3bb76816fe8c5094de28a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-logging-tensor-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.LoggingTensorHook"
+path: "tensorflow.estimator.LoggingTensorHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
similarity index 83%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
index e415819b3d76a13335163d0a9bf5b91217ca4354..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-loss-during-training-error.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.NanLossDuringTrainingError"
+path: "tensorflow.estimator.NanLossDuringTrainingError"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
   is_instance: "<type \'exceptions.RuntimeError\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
index 7d1c89f9b37b5e63ecf2cf766986cb8faa5872c4..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-nan-tensor-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.NanTensorHook"
+path: "tensorflow.estimator.NanTensorHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
index 4df6c4156a8bfe6d3bc0fb6746512cb3025c2604..65b5fb16b0874e7c6469ef11420db146be1f0b5f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-profiler-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-profiler-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.ProfilerHook"
+path: "tensorflow.estimator.ProfilerHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
similarity index 95%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
index 3c5a6ac13cc2d8a4d464ab48da6edaa0a9ccc14b..64051d2bd6b69614cd210d902552ddeb8b6c8e5e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-second-or-step-timer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.SecondOrStepTimer"
+path: "tensorflow.estimator.SecondOrStepTimer"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
index 13261f6dde1cf8e6fd228950600303370947b7ea..4368e04df3f86834b540bb5306bf66dd82ac440c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-step-counter-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.StepCounterHook"
+path: "tensorflow.estimator.StepCounterHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
index e388599b0bf63379fa95a3276e3f4859eab86d6d..938b189a8c30237bb15bf73083a348e6366fbfc4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-stop-at-step-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.StopAtStepHook"
+path: "tensorflow.estimator.StopAtStepHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
similarity index 96%
rename from tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
rename to tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
index 697c3667b09f42f208dec38938f5a1ce0cc09029..104157315f5982efb4f6b9f39e0ece905a225e10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-summary-saver-hook.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -1,4 +1,4 @@
-path: "tensorflow.train.SummarySaverHook"
+path: "tensorflow.estimator.SummarySaverHook"
 tf_class {
   is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
   is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a2a01cd5325ba7e02d9b549293dd09a4a57e167
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\', \'input_fn\', \'steps\', \'hooks\', \'name\', \'every_n_iter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'100\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
index cabca3e883fbceecb399e048e09722acd4efcad4..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.experimental.pbtxt
@@ -1,9 +1,17 @@
 path: "tensorflow.estimator.experimental"
 tf_module {
+  member {
+    name: "InMemoryEvaluatorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearSDCA"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "build_raw_supervised_input_receiver_fn"
+    argspec: "args=[\'features\', \'labels\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "call_logit_fn"
     argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
@@ -20,6 +28,10 @@ tf_module {
     name: "make_early_stopping_hook"
     argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
   }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
   member_method {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
index c5b0085b8d3ec58b4215d4a756957e1509501841..d3656ae0455971ccd98062a52ec0412bf6af06f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.estimator.pbtxt
@@ -24,6 +24,14 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
@@ -64,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -84,14 +104,46 @@ tf_module {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..15d0e099bab3052553671d52d396239b27383a8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.image.pbtxt
@@ -172,6 +172,10 @@ tf_module {
     name: "random_saturation"
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
+  }
   member_method {
     name: "resize_area"
     argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -240,6 +244,10 @@ tf_module {
     name: "total_variation"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "transpose"
+    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "transpose_image"
     argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa3372b12bfe32eed4311c89b6448c0359c0913
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.gfile.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.io.gfile"
+tf_module {
+  member_method {
+    name: "copy"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "exists"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "glob"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "listdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "makedirs"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mkdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rename"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "rmtree"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stat"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "walk"
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
index 64b63ed1a4a5611d369cd4aa01589aee2076b24f..b760ec38906e7ae23445857b46e9941a418a5c29 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.io.pbtxt
@@ -44,10 +44,22 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
   member_method {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
@@ -56,10 +68,26 @@ tf_module {
     name: "decode_csv"
     argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
   }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
   member_method {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -72,6 +100,18 @@ tf_module {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "match_filenames_once"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
index 8ccba990bdd94c4748efb2017cbd2eaabb842b9b..a3254cbd947d9ef70617131e9f4b17f44f059840 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -105,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -225,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -247,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -259,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
index 27aa91a64529a0509451e59d0adbf00e8aa831e4..b70e9ee98d5bc4900420ddb1307abf9adcd8cad0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -110,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -242,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -264,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -276,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
index b0e5d2bde7d04ede1d9e33a02f1198c56a605b12..8cd0c6ea5f027fa1f30b60a742450b651242d406 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.backend.pbtxt
@@ -398,7 +398,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -512,6 +512,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index db69e25c5b70a0cdd76dba6aa570d0c634a31279..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 014f5828fad1152b19a0b0e3d2ffee7cee4c999b..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index a6e4856de9b63c946b77b745a6ede28dedf44afc..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..b3d3c84f92e6491601f670739b2b45f79313e8f5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -88,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 381839d6deb8355a54fd883596f94e38fa1356d4..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 2933f9f4b3ab854a9bff6a200da2c2b912bfd4d8..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 9c9c7461c8b8b43acfc7b7db94fd961a15d57817..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 44ca598724a5c7b6d40ba460dd866675971015c3..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index b207c6800050a6a3f8b9525315fffa14341758cc..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 2d7a09ceda90fa8564de4410bf9b553cf1594c97..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
index 9e24bb8ae6a1dc2e438038d4ac9225a7f17c4598..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index a8094c0bde3e46b49cd253d7861922c90f1ffbd1..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 3ebe162f573f630400a17d9a7ccad1615b8e9c78..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index c0a53b847b4afc1fb098fa06eb8e8e27a96c3459..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index ff6c6f3ec4d501a87a858c7c9bf365590cfb4fdc..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 1b1ccbe118069e5fa5c3acd4ebea2d7fc14395b2..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
index d26da270e747898412b05f6e07e3bbc23f287e0b..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 524c5fd69e508bac55b05502e62650b92cf2be5c..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
index 3358f26aebf8a8f845278d54b4524f4405751f5d..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3d70cf8b65972bda1f2bf4d78e126c0946a7c2a5..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index b05e5ec84de1eb4899b6fae437dc0d4bd1ab402d..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712bb2ecd3526c354cbcf640e689526b2e415a13
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fe362da89b47a925cd4708909e1c882a9a23aca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5718533500d9508c558d25d13fc6b61518a73a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..200006db355ca4dc8eb2f509bcb9da7543145548
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
index eca6b915388ebff0103f7ad16f43c6be0df60b7d..9e26ddbdca0c45df195dd566952379887dcfcff3 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,29 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -22,11 +46,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -106,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2db07df5235e150f691a12d6b332c6d0d241ac19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..904ad3a21a05895b23e30dab82a89a31c74dcfca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17b74924fab4f596a010d6b9731b474433a8153e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f577e1367aece126449923f77f4f6c89493e99
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8baf858669a446a11b44e044f36bfde61e440bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40fe64bbd2cec45b9a8c4e9b041d3fa858af1327
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae6a85026da80cd071984aede8d0ec4e9cd571c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-precision.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31068a51d510a7b95f62f61f03d37176c0fca55d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-recall.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa77d1972cea42184fbbdb91e117b08ba38328fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c17452292a031d42f3da0d5844e99d1272dad25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67857aa89f1769c736d810cf5f73739021afeddf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5eb8d0de53960c3a98409119709c1307aa6379
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
index a296e131586504a3fadc9e6fe54079ee0f8270ba..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,57 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -26,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -34,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -110,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
index ccff809f2b25fd25180400d82510ec3ee3683615..c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -105,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -225,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -247,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -259,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
index b0fc7f97f1d9a18f6482abd548d2f60f60b82555..473a1c16fb1edfbf37a7752e273566c1310853af 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -110,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -242,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -264,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -276,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
index be4496e753f8bdcd76a4761f9bd1804a77380359..8177cc71ed34ed5d0ae57d25ee2da70067411ccc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.keras.utils.-progbar.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\', \'unit_name\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\', \'step\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
index c82e67526b21696a7d56517dc2cb6998882dc7a5..059c91f724aae187055f8323c7748dc99f153302 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
index 1d031cb5f8461145127b0f13d77e6b8774f5a0b3..d06c8e81ee5d2a8b487d7c3c3714a1f4ed2c8e80 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
index a8dda6655df1d06ca77b74f0a992c8fd7e7a357d..6be8e7c210f3f0a28ed8ad8a6672bc4323eb7f9d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-average-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
index 97f65ed89436bd0b4027bb0cbeb80b6f1419269c..16d9ecce10cfb3c28cd1cf47fd65c987680bda41 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-batch-normalization.pbtxt
@@ -1,7 +1,8 @@
 path: "tensorflow.layers.BatchNormalization"
 tf_class {
   is_instance: "<class \'tensorflow.python.layers.normalization.BatchNormalization\'>"
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV1\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
@@ -98,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
index ccd9578f0d62bd70ea252ddeac587d59c926b018..21c695935ce7751df67e09091c961e9e0cfbbf7c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
index 9cbb58d721bb49bde562a57728a9ee46968e611e..f24d0307207588610c1f764bf43912b64c3ea2c6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
index c75ea3911e17bc879d140068ef54521effd2824e..0a510ece355435d8e75e39d5f7cdc6cebefe32cf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
index 5dc834e5141e58d255357e02d7446a06e6e2aa45..d0ee44bed3c739da27cc83f0e643e1ea9dd98078 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d-transpose.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
index 96ab209874ac14d6acf2e8115e7f04fc35c4b2bd..546de3cdab3aa0519450f74c6c6d0fe74ddc000c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-conv3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
index 7e9656b3525c1d53940b869607616ff414a466cf..3ad311581eba815c2d1b0155a1380db80dd61c5d 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dense.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
index e9a2269a6e8de1f9a12f1b54d2e6dced3d4f8902..9b83271350cf90a2d430303dfecfd28facad272b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-dropout.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
index 7d2eaaab2a8cb9159214a16ba65473d0b6870ac4..87a7fb3d843e3e8e3e2fe5a56ec0b181355a6d7b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-flatten.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
index fd02c919aeb5a536bd052324618983af699e7c47..80834e08f7ada08f02c660017ae0b735bb31e20e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
index 8bc3eb26e9ca0bf0f129db336b7ca23466fd036f..32b17e90ade7aa0054a390256e3abadfc7011cbe 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-layer.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
index 6a0dcce56ac0184ffe995662fd62b89e16257a29..643c469717c258207046ddd93a318f47753de46b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling1-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
index b6c84edf2a2f86240369b4053cd7351d0b59442d..434e25adc12c2f2f704b07087b8552781ac2d024 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling2-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
index 062a02fa590537b9efbf540a874eeaa6d36697f3..089fc6f9243c85937500b6275da034eb0748ecd4 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-max-pooling3-d.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
index eaad0fb23ef7501c8c5b7acee6a9677665b7057f..bc3d58b9ca9789b43bc91f9283a81811f2b6a4e9 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv1-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
index ece28a8ce962d8fafb3f7a397a814b903e915d48..fe7d71af3a4a46bed4ea9e62cbd7ad17987517c7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.layers.-separable-conv2-d.pbtxt
@@ -100,6 +100,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
index 1a4098d121b71d25fc0aaa9c7e6e4f096b01e033..9f7b422fabcd55aed98bc93f01143d35698c0399 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.linalg.pbtxt
@@ -132,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt
index 08845553e55d3bd2b280331676195246632e56e2..ef6c777665c8662be3332dc74b7bd7dd5044c086 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.constants.pbtxt
@@ -2,7 +2,7 @@ path: "tensorflow.lite.constants"
 tf_module {
   member {
     name: "FLOAT"
-    mtype: "<type \'int\'>"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "GRAPHVIZ_DOT"
@@ -10,19 +10,19 @@ tf_module {
   }
   member {
     name: "INT32"
-    mtype: "<type \'int\'>"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "INT64"
-    mtype: "<type \'int\'>"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "QUANTIZED_UINT8"
-    mtype: "<type \'int\'>"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "STRING"
-    mtype: "<type \'int\'>"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
   member {
     name: "TFLITE"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
index f5013c250be8477bb630d3d57ae88a501bb60b9b..154dd00821794ef4a5118e98d67e32beca38bebf 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.lite.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "OpHint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
index a8334fdd1d1ec277d71853939f929843fcc1a723..f34e2c2aa5a5b30e037157bc84894da5dce78538 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.math.pbtxt
@@ -176,6 +176,26 @@ tf_module {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
     argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
@@ -298,7 +318,7 @@ tf_module {
   }
   member_method {
     name: "reduce_std"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
@@ -306,7 +326,7 @@ tf_module {
   }
   member_method {
     name: "reduce_variance"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "rint"
@@ -322,7 +342,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "segment_max"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
index 93f2fda2acf8f6566f6099a60d1ee4287b4d2ae6..40e20f8c919e64362e5697bd00ded70d0c2292a0 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "bidirectional_dynamic_rnn"
     argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
   }
+  member_method {
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "compute_accidental_hits"
     argspec: "args=[\'true_classes\', \'sampled_candidates\', \'num_true\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -72,6 +76,10 @@ tf_module {
     name: "conv3d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
+  member_method {
+    name: "conv3d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "conv3d_backprop_filter_v2"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
@@ -104,6 +112,14 @@ tf_module {
     name: "ctc_loss"
     argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
   }
+  member_method {
+    name: "ctc_loss_v2"
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "depth_to_space"
     argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
@@ -112,6 +128,14 @@ tf_module {
     name: "depthwise_conv2d"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "depthwise_conv2d_backprop_filter"
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
+  member_method {
+    name: "depthwise_conv2d_backprop_input"
+    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+  }
   member_method {
     name: "depthwise_conv2d_native"
     argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
@@ -130,7 +154,7 @@ tf_module {
   }
   member_method {
     name: "dropout"
-    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\', \'rate\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "dynamic_rnn"
@@ -302,7 +326,7 @@ tf_module {
   }
   member_method {
     name: "softmax_cross_entropy_with_logits_v2"
-    argspec: "args=[\'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
index 88b8f37c4ff0cfaf562293c845e505f06119e227..f7f9978c063ceae89c7228b476f54694e25bc249 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
index a4483fefa279957ce503857021c063254a9abf83..f9e898484b9813373a49e6f117578f822cdeb156 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-basic-r-n-n-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
index a4bb3219c792708cd02a8345541d8685485c8d05..5fd9b329bdeb40b5a57fe68564977f61b5349ae5 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-g-r-u-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
index 715bfd5fc7c18993d4997caeefe3188ba88f741c..76c8cff22b1e65e65d0ac3d6705541dc3f16f80c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-l-s-t-m-cell.pbtxt
@@ -107,6 +107,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
index b66c0f89cc904c1318787651a3e8e629319c14fb..f53567af52f7ed6baa78bcc75bfc0e38de02e548 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
index f4dce81659dd8ed2773f09301319c3917c8684be..584c74f99d896e45de06fa020413b8edd4440afb 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.pbtxt
@@ -244,6 +244,10 @@ tf_module {
     name: "TensorShape"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorSpec"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TextLineReader"
     mtype: "<type \'type\'>"
@@ -320,6 +324,10 @@ tf_module {
     name: "debugging"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "distribute"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "distributions"
     mtype: "<type \'module\'>"
@@ -692,6 +700,10 @@ tf_module {
     name: "argmin"
     argspec: "args=[\'input\', \'axis\', \'name\', \'dimension\', \'output_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int64\'>\"], "
   }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
@@ -778,7 +790,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'name\', \'message\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -1040,10 +1052,18 @@ tf_module {
     name: "dimension_value"
     argspec: "args=[\'dimension\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_eager_execution"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "disable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "disable_v2_tensorshape"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1084,6 +1104,10 @@ tf_module {
     name: "enable_resource_variables"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "enable_v2_behavior"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "enable_v2_tensorshape"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
@@ -1232,6 +1256,10 @@ tf_module {
     name: "get_local_variable"
     argspec: "args=[\'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'collections\', \'caching_device\', \'partitioner\', \'validate_shape\', \'use_resource\', \'custom_getter\', \'constraint\', \'synchronization\', \'aggregation\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'None\', \'None\', \'None\', \'True\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
+  member_method {
+    name: "get_logger"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
@@ -1466,7 +1494,7 @@ tf_module {
   }
   member_method {
     name: "make_tensor_proto"
-    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'values\', \'dtype\', \'shape\', \'verify_shape\', \'allow_broadcast\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "map_fn"
@@ -1810,7 +1838,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "scan"
@@ -1948,6 +1976,10 @@ tf_module {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
+  }
   member_method {
     name: "space_to_batch"
     argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1962,7 +1994,7 @@ tf_module {
   }
   member_method {
     name: "sparse_add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\', \'thresh\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sparse_concat"
@@ -2156,6 +2188,18 @@ tf_module {
     name: "tanh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tensor_scatter_add"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_sub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_update"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tensordot"
     argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -2296,6 +2340,10 @@ tf_module {
     name: "while_loop"
     argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
   }
+  member_method {
+    name: "wrap_function"
+    argspec: "args=[\'fn\', \'signature\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
index 2948b7318ea0d72913d38bf33c9a47f5c64b2931..632c2f8f83c8effb188d110bfacaf7f22c0c74cc 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.quantization.pbtxt
@@ -34,7 +34,7 @@ tf_module {
   }
   member_method {
     name: "quantize_and_dequantize"
-    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
   }
   member_method {
     name: "quantized_concat"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
index 160c09798d02653ba0c090db53124450b956ef05..1eefb1c70ce4d825402155a5e068c736defff02f 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.random.pbtxt
@@ -1,5 +1,17 @@
 path: "tensorflow.random"
 tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
   member_method {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
@@ -8,6 +20,10 @@ tf_module {
     name: "get_seed"
     argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -32,6 +48,10 @@ tf_module {
     name: "shuffle"
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
+  member_method {
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+  }
   member_method {
     name: "stateless_multinomial"
     argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
index 67457de070830d45a48230835fc4827e36f70058..e4cc0061a953c81729d8499530e43f5b43a2210e 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.Builder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
index 83bd7035409534abf036c7e2b0d66fcc060ada3a..44860b11720e1af87d8baa3aec5f4f3169410d82 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.builder.-saved-model-builder.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.saved_model.builder.SavedModelBuilder"
 tf_class {
   is_instance: "<class \'tensorflow.python.saved_model.builder_impl.SavedModelBuilder\'>"
+  is_instance: "<class \'tensorflow.python.saved_model.builder_impl._SavedModelBuilder\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
index 2055bfbf066cbb66f1becc770e23ba5151c73b27..2a7c78910526f83fdfcd963c21996b4f4dc4bc28 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.saved_model.pbtxt
@@ -148,6 +148,10 @@ tf_module {
     name: "classification_signature_def"
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_tensor_from_tensor_info"
     argspec: "args=[\'tensor_info\', \'graph\', \'import_scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -176,6 +180,10 @@ tf_module {
     name: "regression_signature_def"
     argspec: "args=[\'examples\', \'predictions\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "save"
+    argspec: "args=[\'obj\', \'export_dir\', \'signatures\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "simple_save"
     argspec: "args=[\'session\', \'export_dir\', \'inputs\', \'outputs\', \'legacy_init_op\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..09d6f1424b785e266854ede48b26ebbdf571288b 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sets.pbtxt
@@ -1,5 +1,13 @@
 path: "tensorflow.sets"
 tf_module {
+  member_method {
+    name: "difference"
+    argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "intersection"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
   member_method {
     name: "set_difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
@@ -16,4 +24,12 @@ tf_module {
     name: "set_union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
+  member_method {
+    name: "size"
+    argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "union"
+    argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
 }
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
index 2c50c41f186363e8331488df9e2d328e0d0b4d26..ea717b4d719d6709e05182faca964ae544abc39c 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.signal.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.signal"
 tf_module {
+  member_method {
+    name: "dct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
   member_method {
     name: "fft"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -24,6 +28,10 @@ tf_module {
     name: "hann_window"
     argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "idct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
   member_method {
     name: "ifft"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -44,6 +52,18 @@ tf_module {
     name: "inverse_stft_window_fn"
     argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
   }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "linear_to_mel_weight_matrix"
     argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
@@ -56,6 +76,18 @@ tf_module {
     name: "overlap_and_add"
     argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "stft"
     argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
index 32bd8d5f8edb24ee1f5a5672487499337bd1c0dd..33e342bc75486be0bccffc1e36a94e147f934432 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.sparse.pbtxt
@@ -10,7 +10,7 @@ tf_module {
   }
   member_method {
     name: "add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\', \'thresh\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "concat"
@@ -112,6 +112,10 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
     argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
index 03144cbe709fe59afc3a818ea7c157ace72b713d..a1cd581a86bc2132bfa04ac3f3433e84b6365b19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.strings.pbtxt
@@ -52,6 +52,10 @@ tf_module {
     name: "to_number"
     argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "unicode_encode"
+    argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
+  }
   member_method {
     name: "unicode_script"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
index df528e26b60f8d8ddcc1eaf0ed292cc7ff0ebd94..6fc489c86043d074ac832d0ec9dbefd2cbbb4f19 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.test.-benchmark.pbtxt
@@ -6,6 +6,10 @@ tf_class {
   member_method {
     name: "__init__"
   }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_abstract"
     argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
index 877c55c6b3820294bcadf249304bd67f82bcaee6..bdb3ea2197c78dd17357f2753f05638c3c054bd6 100644
--- a/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v1/tensorflow.train.pbtxt
@@ -396,6 +396,10 @@ tf_module {
     name: "piecewise_constant"
     argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "piecewise_constant_decay"
+    argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "polynomial_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
deleted file mode 100644
index f1dffd595285098afaeb0ff04e5db35d594f7fac..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.-list-value.pbtxt
+++ /dev/null
@@ -1,70 +0,0 @@
-path: "tensorflow.AttrValue.ListValue"
-tf_proto {
-  descriptor {
-    name: "ListValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_BYTES
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_INT64
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_FLOAT
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_BOOL
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-    }
-    field {
-      name: "func"
-      number: 9
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
deleted file mode 100644
index 6ccd64f428c3b87c807d0af82f67a884187f738c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-attr-value.pbtxt
+++ /dev/null
@@ -1,151 +0,0 @@
-path: "tensorflow.AttrValue"
-tf_proto {
-  descriptor {
-    name: "AttrValue"
-    field {
-      name: "s"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BYTES
-      oneof_index: 0
-    }
-    field {
-      name: "i"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-      oneof_index: 0
-    }
-    field {
-      name: "f"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_FLOAT
-      oneof_index: 0
-    }
-    field {
-      name: "b"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-      oneof_index: 0
-    }
-    field {
-      name: "type"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-      oneof_index: 0
-    }
-    field {
-      name: "shape"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-      oneof_index: 0
-    }
-    field {
-      name: "tensor"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorProto"
-      oneof_index: 0
-    }
-    field {
-      name: "list"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue.ListValue"
-      oneof_index: 0
-    }
-    field {
-      name: "func"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList"
-      oneof_index: 0
-    }
-    field {
-      name: "placeholder"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    nested_type {
-      name: "ListValue"
-      field {
-        name: "s"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_BYTES
-      }
-      field {
-        name: "i"
-        number: 3
-        label: LABEL_REPEATED
-        type: TYPE_INT64
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "f"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_FLOAT
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "b"
-        number: 5
-        label: LABEL_REPEATED
-        type: TYPE_BOOL
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "type"
-        number: 6
-        label: LABEL_REPEATED
-        type: TYPE_ENUM
-        type_name: ".tensorflow.DataType"
-        options {
-          packed: true
-        }
-      }
-      field {
-        name: "shape"
-        number: 7
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      field {
-        name: "tensor"
-        number: 8
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorProto"
-      }
-      field {
-        name: "func"
-        number: 9
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.NameAttrList"
-      }
-    }
-    oneof_decl {
-      name: "value"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
deleted file mode 100644
index c9a32c16b34a78bd5a182b7c0635a559bddc611d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator-base.pbtxt
+++ /dev/null
@@ -1,29 +0,0 @@
-path: "tensorflow.ConditionalAccumulatorBase"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'accumulator_ref\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
deleted file mode 100644
index 15e0ab76b6fd97b83019589e79ac290bbce11053..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-conditional-accumulator.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.ConditionalAccumulator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulator\'>"
-  is_instance: "<class \'tensorflow.python.ops.data_flow_ops.ConditionalAccumulatorBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "accumulator_ref"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "dtype"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'dtype\', \'shape\', \'shared_name\', \'name\', \'reduction_type\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'conditional_accumulator\', \'MEAN\'], "
-  }
-  member_method {
-    name: "apply_grad"
-    argspec: "args=[\'self\', \'grad\', \'local_step\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
-  }
-  member_method {
-    name: "num_accumulated"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "set_global_step"
-    argspec: "args=[\'self\', \'new_global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "take_grad"
-    argspec: "args=[\'self\', \'num_required\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
deleted file mode 100644
index d9b142682899bf5d9fd5d942437359adf8962466..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-device-count-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.ConfigProto.DeviceCountEntry"
-tf_proto {
-  descriptor {
-    name: "DeviceCountEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
deleted file mode 100644
index f7491649c22738c625e3f63944f2347358d2e525..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.-experimental.pbtxt
+++ /dev/null
@@ -1,28 +0,0 @@
-path: "tensorflow.ConfigProto.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_group_leader"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "executor_type"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "recv_buf_max_chunk"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    reserved_range {
-      start: 2
-      end: 3
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
deleted file mode 100644
index 53b532beab344db8cff9d1ccac4821b8f280af67..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-config-proto.pbtxt
+++ /dev/null
@@ -1,152 +0,0 @@
-path: "tensorflow.ConfigProto"
-tf_proto {
-  descriptor {
-    name: "ConfigProto"
-    field {
-      name: "device_count"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.DeviceCountEntry"
-    }
-    field {
-      name: "intra_op_parallelism_threads"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "inter_op_parallelism_threads"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "use_per_session_threads"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "session_inter_op_thread_pool"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ThreadPoolOptionProto"
-    }
-    field {
-      name: "placement_period"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "device_filters"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "gpu_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions"
-    }
-    field {
-      name: "allow_soft_placement"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "log_device_placement"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "graph_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphOptions"
-    }
-    field {
-      name: "operation_timeout_in_ms"
-      number: 11
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "rpc_options"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RPCOptions"
-    }
-    field {
-      name: "cluster_def"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ClusterDef"
-    }
-    field {
-      name: "isolate_session_state"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.ConfigProto.Experimental"
-    }
-    nested_type {
-      name: "DeviceCountEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_group_leader"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "executor_type"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "recv_buf_max_chunk"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      reserved_range {
-        start: 2
-        end: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
deleted file mode 100644
index 92e535c341447628a50d8941998a4065e78d12a5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-device-spec.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.DeviceSpec"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.device.DeviceSpec\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "job"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "replica"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "task"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'job\', \'replica\', \'task\', \'device_type\', \'device_index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_string"
-    argspec: "args=[\'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_from"
-    argspec: "args=[\'self\', \'dev\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "parse_from_string"
-    argspec: "args=[\'self\', \'spec\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
deleted file mode 100644
index a9ab27719b4d71f3d7ed10963ad896ccafa82f15..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-dimension.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-path: "tensorflow.Dimension"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.tensor_shape.Dimension\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "value"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "assert_is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_compatible_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "merge_with"
-    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
deleted file mode 100644
index a2cc07483a4e10918891f555ca9459fb7503bb32..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-g-p-u-options.pbtxt
+++ /dev/null
@@ -1,98 +0,0 @@
-path: "tensorflow.GPUOptions"
-tf_proto {
-  descriptor {
-    name: "GPUOptions"
-    field {
-      name: "per_process_gpu_memory_fraction"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "allow_growth"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "allocator_type"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "deferred_deletion_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "visible_device_list"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "polling_active_delay_usecs"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "polling_inactive_delay_msecs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "force_gpu_compatible"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GPUOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "virtual_devices"
-        number: 1
-        label: LABEL_REPEATED
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.GPUOptions.Experimental.VirtualDevices"
-      }
-      field {
-        name: "use_unified_memory"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-      field {
-        name: "num_dev_to_dev_copy_streams"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "collective_ring_order"
-        number: 4
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      nested_type {
-        name: "VirtualDevices"
-        field {
-          name: "memory_limit_mb"
-          number: 1
-          label: LABEL_REPEATED
-          type: TYPE_FLOAT
-        }
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
index 0a16d6ab92faac1db63470f0aedadf69341be29b..2299a009d3d5335553e1de025c42b23a57592de3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-gradient-tape.pbtxt
@@ -6,10 +6,18 @@ tf_class {
     name: "__init__"
     argspec: "args=[\'self\', \'persistent\', \'watch_accessed_variables\'], varargs=None, keywords=None, defaults=[\'False\', \'True\'], "
   }
+  member_method {
+    name: "batch_jacobian"
+    argspec: "args=[\'self\', \'target\', \'source\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "gradient"
     argspec: "args=[\'self\', \'target\', \'sources\', \'output_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'UnconnectedGradients.NONE\'], "
   }
+  member_method {
+    name: "jacobian"
+    argspec: "args=[\'self\', \'target\', \'sources\', \'unconnected_gradients\', \'parallel_iterations\', \'experimental_use_pfor\'], varargs=None, keywords=None, defaults=[\'UnconnectedGradients.NONE\', \'None\', \'True\'], "
+  }
   member_method {
     name: "reset"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
deleted file mode 100644
index 19eccff03d24719d95ea84ccdad4014aa777ccd5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-def.pbtxt
+++ /dev/null
@@ -1,36 +0,0 @@
-path: "tensorflow.GraphDef"
-tf_proto {
-  descriptor {
-    name: "GraphDef"
-    field {
-      name: "node"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef"
-    }
-    field {
-      name: "versions"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.VersionDef"
-    }
-    field {
-      name: "version"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-      options {
-        deprecated: true
-      }
-    }
-    field {
-      name: "library"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.FunctionDefLibrary"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
deleted file mode 100644
index ffe479093397a9bf98d10aa4e054c643e64d5f5d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-keys.pbtxt
+++ /dev/null
@@ -1,140 +0,0 @@
-path: "tensorflow.GraphKeys"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.ops.GraphKeys\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "ACTIVATIONS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "ASSET_FILEPATHS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "BIASES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "CONCATENATED_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "COND_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "EVAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_STEP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "GLOBAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOCAL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "METRIC_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MODEL_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MOVING_AVERAGE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "QUEUE_RUNNERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_FOR_LOCAL_INIT_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "READY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "REGULARIZATION_LOSSES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "RESOURCES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVEABLE_OBJECTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SAVERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARIES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "SUMMARY_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TABLE_INITIALIZERS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_RESOURCE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAINABLE_VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "TRAIN_OP"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "UPDATE_OPS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "VARIABLES"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "WHILE_CONTEXT"
-    mtype: "<type \'str\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
deleted file mode 100644
index a9f99bc171cc3661031981f467f583b122e43476..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-graph-options.pbtxt
+++ /dev/null
@@ -1,67 +0,0 @@
-path: "tensorflow.GraphOptions"
-tf_proto {
-  descriptor {
-    name: "GraphOptions"
-    field {
-      name: "enable_recv_scheduling"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "optimizer_options"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OptimizerOptions"
-    }
-    field {
-      name: "build_cost_model"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "build_cost_model_after"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "infer_shapes"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "place_pruned_graph"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "enable_bfloat16_sendrecv"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "timeline_step"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "rewrite_options"
-      number: 10
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RewriterConfig"
-    }
-    reserved_range {
-      start: 1
-      end: 2
-    }
-    reserved_name: "skip_common_subexpression_elimination"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
deleted file mode 100644
index d4402f330b8a28eaa61eb2b74c9ca412dce06b62..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-histogram-proto.pbtxt
+++ /dev/null
@@ -1,54 +0,0 @@
-path: "tensorflow.HistogramProto"
-tf_proto {
-  descriptor {
-    name: "HistogramProto"
-    field {
-      name: "min"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "max"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "num"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "sum_squares"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_DOUBLE
-    }
-    field {
-      name: "bucket_limit"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-    field {
-      name: "bucket"
-      number: 7
-      label: LABEL_REPEATED
-      type: TYPE_DOUBLE
-      options {
-        packed: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
deleted file mode 100644
index 5023aa96bf3b4f3f550421db5f41872d9f62b70d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-log-message.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.LogMessage"
-tf_proto {
-  descriptor {
-    name: "LogMessage"
-    field {
-      name: "level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.LogMessage.Level"
-    }
-    field {
-      name: "message"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "UNKNOWN"
-        number: 0
-      }
-      value {
-        name: "DEBUGGING"
-        number: 10
-      }
-      value {
-        name: "INFO"
-        number: 20
-      }
-      value {
-        name: "WARN"
-        number: 30
-      }
-      value {
-        name: "ERROR"
-        number: 40
-      }
-      value {
-        name: "FATAL"
-        number: 50
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
deleted file mode 100644
index 0ba09bec4b3fa6e9eaf59978beaa958ebc038b4c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-collection-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.CollectionDefEntry"
-tf_proto {
-  descriptor {
-    name: "CollectionDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CollectionDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
deleted file mode 100644
index 41c62a407b8577288016f2376c35ba6ec1c3c1ca..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-meta-info-def.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.MetaGraphDef.MetaInfoDef"
-tf_proto {
-  descriptor {
-    name: "MetaInfoDef"
-    field {
-      name: "meta_graph_version"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_op_list"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.OpList"
-    }
-    field {
-      name: "any_info"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".google.protobuf.Any"
-    }
-    field {
-      name: "tags"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_version"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensorflow_git_version"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "stripped_default_attrs"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
deleted file mode 100644
index 73dc414a779ded3d1f896e743b7f1f1a443352f0..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.-signature-def-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.MetaGraphDef.SignatureDefEntry"
-tf_proto {
-  descriptor {
-    name: "SignatureDefEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SignatureDef"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
deleted file mode 100644
index d71c2358c93e9597726665fdf8f92e648b2ea772..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-meta-graph-def.pbtxt
+++ /dev/null
@@ -1,133 +0,0 @@
-path: "tensorflow.MetaGraphDef"
-tf_proto {
-  descriptor {
-    name: "MetaGraphDef"
-    field {
-      name: "meta_info_def"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.MetaInfoDef"
-    }
-    field {
-      name: "graph_def"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-    field {
-      name: "saver_def"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.SaverDef"
-    }
-    field {
-      name: "collection_def"
-      number: 4
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.CollectionDefEntry"
-    }
-    field {
-      name: "signature_def"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.MetaGraphDef.SignatureDefEntry"
-    }
-    field {
-      name: "asset_file_def"
-      number: 6
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AssetFileDef"
-    }
-    nested_type {
-      name: "MetaInfoDef"
-      field {
-        name: "meta_graph_version"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_op_list"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.OpList"
-      }
-      field {
-        name: "any_info"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".google.protobuf.Any"
-      }
-      field {
-        name: "tags"
-        number: 4
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_version"
-        number: 5
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "tensorflow_git_version"
-        number: 6
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "stripped_default_attrs"
-        number: 7
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    nested_type {
-      name: "CollectionDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.CollectionDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "SignatureDefEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.SignatureDef"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
deleted file mode 100644
index b119b208772199e5c3596be142f3e0f62d3ed50e..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NameAttrList.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
deleted file mode 100644
index fcdb411ffce9b68ac28696f86ca11a47f9e64e8f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-name-attr-list.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.NameAttrList"
-tf_proto {
-  descriptor {
-    name: "NameAttrList"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NameAttrList.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
deleted file mode 100644
index 622e4c3d0f60ce4842a6fd4cc421551aa795fcbf..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.-attr-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.NodeDef.AttrEntry"
-tf_proto {
-  descriptor {
-    name: "AttrEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.AttrValue"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
deleted file mode 100644
index 646fa8abb9b22dbd908ff821cbe66a33ad02ba64..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-node-def.pbtxt
+++ /dev/null
@@ -1,56 +0,0 @@
-path: "tensorflow.NodeDef"
-tf_proto {
-  descriptor {
-    name: "NodeDef"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "op"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "input"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "device"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "attr"
-      number: 5
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.NodeDef.AttrEntry"
-    }
-    nested_type {
-      name: "AttrEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.AttrValue"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
deleted file mode 100644
index 3ccf9d459b133b48e5456f02e4780ade8d3042c8..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-optimizer-options.pbtxt
+++ /dev/null
@@ -1,74 +0,0 @@
-path: "tensorflow.OptimizerOptions"
-tf_proto {
-  descriptor {
-    name: "OptimizerOptions"
-    field {
-      name: "do_common_subexpression_elimination"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "do_constant_folding"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "max_folded_constant_in_bytes"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "do_function_inlining"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "opt_level"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.Level"
-    }
-    field {
-      name: "global_jit_level"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.OptimizerOptions.GlobalJitLevel"
-    }
-    enum_type {
-      name: "Level"
-      value {
-        name: "L1"
-        number: 0
-      }
-      value {
-        name: "L0"
-        number: -1
-      }
-    }
-    enum_type {
-      name: "GlobalJitLevel"
-      value {
-        name: "DEFAULT"
-        number: 0
-      }
-      value {
-        name: "OFF"
-        number: -1
-      }
-      value {
-        name: "ON_1"
-        number: 1
-      }
-      value {
-        name: "ON_2"
-        number: 2
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
deleted file mode 100644
index 1287940326c0196e76fff2cf6363622226092504..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-metadata.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.RunMetadata"
-tf_proto {
-  descriptor {
-    name: "RunMetadata"
-    field {
-      name: "step_stats"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.StepStats"
-    }
-    field {
-      name: "cost_graph"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.CostGraphDef"
-    }
-    field {
-      name: "partition_graphs"
-      number: 3
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.GraphDef"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
deleted file mode 100644
index 47b5b56faf63edba9ce4f08bf744f3acf4f67f5f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.-experimental.pbtxt
+++ /dev/null
@@ -1,18 +0,0 @@
-path: "tensorflow.RunOptions.Experimental"
-tf_proto {
-  descriptor {
-    name: "Experimental"
-    field {
-      name: "collective_graph_key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "use_run_handler_pool"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
deleted file mode 100644
index c0c2e7b9f8d71be9b96e7195b561d0a934d24057..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-run-options.pbtxt
+++ /dev/null
@@ -1,89 +0,0 @@
-path: "tensorflow.RunOptions"
-tf_proto {
-  descriptor {
-    name: "RunOptions"
-    field {
-      name: "trace_level"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.RunOptions.TraceLevel"
-    }
-    field {
-      name: "timeout_in_ms"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "inter_op_thread_pool"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "output_partition_graphs"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "debug_options"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.DebugOptions"
-    }
-    field {
-      name: "report_tensor_allocations_upon_oom"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_BOOL
-    }
-    field {
-      name: "experimental"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.RunOptions.Experimental"
-    }
-    nested_type {
-      name: "Experimental"
-      field {
-        name: "collective_graph_key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "use_run_handler_pool"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_BOOL
-      }
-    }
-    enum_type {
-      name: "TraceLevel"
-      value {
-        name: "NO_TRACE"
-        number: 0
-      }
-      value {
-        name: "SOFTWARE_TRACE"
-        number: 1
-      }
-      value {
-        name: "HARDWARE_TRACE"
-        number: 2
-      }
-      value {
-        name: "FULL_TRACE"
-        number: 3
-      }
-    }
-    reserved_range {
-      start: 4
-      end: 5
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
deleted file mode 100644
index 259f2418740cbfe47cdb4bd871d4f5c6306d25f5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
deleted file mode 100644
index d33fd4d5d7b6b3e2eb7454b5326d993c139f0490..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-sparse-tensor-value.pbtxt
+++ /dev/null
@@ -1,26 +0,0 @@
-path: "tensorflow.SparseTensorValue"
-tf_class {
-  is_instance: "<class \'tensorflow.python.framework.sparse_tensor.SparseTensorValue\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "dense_shape"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "indices"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "values"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
deleted file mode 100644
index 0064c8460cb374f1e3f108085a2efed4131dd205..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.-coo-sparse.pbtxt
+++ /dev/null
@@ -1,24 +0,0 @@
-path: "tensorflow.TensorInfo.CooSparse"
-tf_proto {
-  descriptor {
-    name: "CooSparse"
-    field {
-      name: "values_tensor_name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "indices_tensor_name"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "dense_shape_tensor_name"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
deleted file mode 100644
index 63566c808e55cb4d3b630f0a017fa3a2c8a30de3..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-info.pbtxt
+++ /dev/null
@@ -1,59 +0,0 @@
-path: "tensorflow.TensorInfo"
-tf_proto {
-  descriptor {
-    name: "TensorInfo"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-      oneof_index: 0
-    }
-    field {
-      name: "coo_sparse"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorInfo.CooSparse"
-      oneof_index: 0
-    }
-    field {
-      name: "dtype"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.DataType"
-    }
-    field {
-      name: "tensor_shape"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    nested_type {
-      name: "CooSparse"
-      field {
-        name: "values_tensor_name"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "indices_tensor_name"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "dense_shape_tensor_name"
-        number: 3
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-    }
-    oneof_decl {
-      name: "encoding"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..493dcba8922d7f6c51a61d337f48e09d168e6bac
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-tensor-spec.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.TensorSpec"
+tf_class {
+  is_instance: "<class \'tensorflow.python.framework.tensor_spec.TensorSpec\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shape"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'shape\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_spec"
+    argspec: "args=[\'cls\', \'spec\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "from_tensor"
+    argspec: "args=[\'cls\', \'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'spec_or_tensor\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
index e85949f23c968046f8a9cfa50ffd206a18e767e7..6136c8fbe79ef8d3851c39b8f11ac3c33f6050f2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.-variable.pbtxt
@@ -63,6 +63,10 @@ tf_class {
     name: "assign_sub"
     argspec: "args=[\'self\', \'delta\', \'use_locking\', \'name\', \'read_value\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'True\'], "
   }
+  member_method {
+    name: "batch_scatter_update"
+    argspec: "args=[\'self\', \'sparse_delta\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+  }
   member_method {
     name: "count_up_to"
     argspec: "args=[\'self\', \'limit\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
deleted file mode 100644
index 67e1b76caba8a278eabe4a54e5c2fe85c5c2e099..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.app.pbtxt
+++ /dev/null
@@ -1,7 +0,0 @@
-path: "tensorflow.app"
-tf_module {
-  member_method {
-    name: "run"
-    argspec: "args=[\'main\', \'argv\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
index 8b7f63e43e237864d4ef24d3b251b23199f9ee17..d877339409d781f95f7ff75a553d21d82c27fc40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-dataset.pbtxt
@@ -1,22 +1,21 @@
 path: "tensorflow.data.Dataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_shapes"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member {
     name: "output_types"
-    mtype: "<class \'abc.abstractproperty\'>"
+    mtype: "<type \'property\'>"
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "apply"
@@ -46,10 +45,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -66,14 +61,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -102,10 +89,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
index 81358cecbc020061fb77a199afcb4a23862fc364..f1573512438b3f40db7653bf94fd4ad282a40acd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-fixed-length-record-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.FixedLengthRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.FixedLengthRecordDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
deleted file mode 100644
index 4f0147a52381c748eccbfee29df0d3537ba5d14a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-iterator.pbtxt
+++ /dev/null
@@ -1,46 +0,0 @@
-path: "tensorflow.data.Iterator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.iterator_ops.Iterator\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_classes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_shapes"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "output_types"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'iterator_resource\', \'initializer\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "from_string_handle"
-    argspec: "args=[\'string_handle\', \'output_types\', \'output_shapes\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "from_structure"
-    argspec: "args=[\'output_types\', \'output_shapes\', \'shared_name\', \'output_classes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_next"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_initializer"
-    argspec: "args=[\'self\', \'dataset\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "string_handle"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
index 9d032d43de1094f212e5f749013f1fac5a898459..72fc2c3a9ee5b985723ce2dba9643ba796362dc7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-options.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.Options"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Options\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "experimental_autotune"
@@ -11,47 +12,19 @@ tf_class {
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_hoist_random_uniform"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_and_batch_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_and_filter_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_fusion"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_parallelization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_map_vectorization"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "experimental_noop_elimination"
+    name: "experimental_numa_aware"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_numa_aware"
+    name: "experimental_optimization"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_shuffle_and_repeat_fusion"
+    name: "experimental_stats"
     mtype: "<type \'property\'>"
   }
   member {
-    name: "experimental_stats"
+    name: "experimental_threading"
     mtype: "<type \'property\'>"
   }
   member_method {
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
index 7b7a9ebaf08b1e9fdb5e4c5b7448175611a9b2c4..690da98b1ac2097c4241ba3218caa3b476dbf397 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-t-f-record-dataset.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.data.TFRecordDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV2\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -47,10 +47,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -67,14 +63,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -103,10 +91,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
index 1c305abf68c2e535b989d89f8331c33afc4cc8b6..fe0bc1a4db5d4a5e78ec7479e414545b522ec2df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.-text-line-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.TextLineDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.readers.TextLineDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
index 2520e28a3c708f45942eb2e73911b7a5226646e5..261129b132189ef504678058f11651dd22bdce8c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-csv-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.CsvDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.CsvDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..dcb304f763ea44d0d7314248170e615115b0794c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-dataset-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.DatasetStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'element_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b066e563cc6196650b1ba561da7c16a80a8656
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-nested-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.NestedStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.NestedStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'nested_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..3b7ad64f51f88ae9c860e061db5c1ad6b5f2bcf8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optimization-options.pbtxt
@@ -0,0 +1,50 @@
+path: "tensorflow.data.experimental.OptimizationOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.optimization_options.OptimizationOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "apply_default_optimizations"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "hoist_random_uniform"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_batch_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_and_filter_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_parallelization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "map_vectorization"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "noop_elimination"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "shuffle_and_repeat_fusion"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..bf41c1d1d696d94ef9da5fc64272349d1533816e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-optional-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.OptionalStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.ops.optional_ops.OptionalStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'value_structure\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
index 1dd53b1eabdf15b662a839a07176ba4eaf8bda37..0b34bbc94269280d6cca77bca789fb74f76629be 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-random-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.RandomDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.random_ops.RandomDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f97376b328cf34eb04918bec7bacf08d254d8db5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sparse-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.SparseTensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.SparseTensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'dense_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
index 8fdd9dc52e332abdeed039bd85d31f6318d013e9..0e61890eee42a8b5b0df7bda0f99d189c4911eb9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-sql-dataset.pbtxt
@@ -1,8 +1,8 @@
 path: "tensorflow.data.experimental.SqlDataset"
 tf_class {
-  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDataset\'>"
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.readers.SqlDatasetV2\'>"
   is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetSource\'>"
-  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.Dataset\'>"
+  is_instance: "<class \'tensorflow.python.data.ops.dataset_ops.DatasetV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "output_classes"
@@ -48,10 +48,6 @@ tf_class {
     name: "from_generator"
     argspec: "args=[\'generator\', \'output_types\', \'output_shapes\', \'args\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "from_sparse_tensor_slices"
-    argspec: "args=[\'sparse_tensor\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "from_tensor_slices"
     argspec: "args=[\'tensors\'], varargs=None, keywords=None, defaults=None"
@@ -68,14 +64,6 @@ tf_class {
     name: "list_files"
     argspec: "args=[\'file_pattern\', \'shuffle\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "make_initializable_iterator"
-    argspec: "args=[\'self\', \'shared_name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "make_one_shot_iterator"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "map"
     argspec: "args=[\'self\', \'map_func\', \'num_parallel_calls\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -104,10 +92,6 @@ tf_class {
     name: "repeat"
     argspec: "args=[\'self\', \'count\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "shard"
-    argspec: "args=[\'self\', \'num_shards\', \'index\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shuffle"
     argspec: "args=[\'self\', \'buffer_size\', \'seed\', \'reshuffle_each_iteration\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
index f423eed42cc2d7115fd50b3ad533f3790736a850..892f8c1fb897dfc8bf4964c118aeb641dffd3caa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-stats-options.pbtxt
@@ -1,6 +1,7 @@
 path: "tensorflow.data.experimental.StatsOptions"
 tf_class {
   is_instance: "<class \'tensorflow.python.data.experimental.ops.stats_options.StatsOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "aggregator"
@@ -20,6 +21,6 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'aggregator\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a99db4542e0deb506d00c00f889299dd22d67e1e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-structure.pbtxt
@@ -0,0 +1,16 @@
+path: "tensorflow.data.experimental.Structure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f5c8864a9dd98058c659e72ba8059182a666ea39
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-tensor-structure.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.TensorStructure"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.util.structure.TensorStructure\'>"
+  is_instance: "<class \'tensorflow.python.data.util.structure.Structure\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'dtype\', \'shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_value"
+    argspec: "args=[\'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "is_compatible_with"
+    argspec: "args=[\'self\', \'other\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b5ebf108018e75b6de28287a68a25a03b294b64
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.-threading-options.pbtxt
@@ -0,0 +1,18 @@
+path: "tensorflow.data.experimental.ThreadingOptions"
+tf_class {
+  is_instance: "<class \'tensorflow.python.data.experimental.ops.threading_options.ThreadingOptions\'>"
+  is_instance: "<class \'tensorflow.python.data.util.options.OptionsBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "max_intra_op_parallelism"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "private_threadpool_size"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
index 4c253bb8adf63c7603cd3ffed0a10edb5b76f320..2d115904925eb96164484300baf628d41d3fcff4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.experimental.pbtxt
@@ -12,10 +12,30 @@ tf_module {
     name: "CsvDataset"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DatasetStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "INFINITE_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
+  member {
+    name: "NestedStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "OptimizationOptions"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "Optional"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OptionalStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RandomDataset"
     mtype: "<type \'type\'>"
@@ -24,6 +44,10 @@ tf_module {
     name: "Reducer"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SparseTensorStructure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "SqlDataset"
     mtype: "<type \'type\'>"
@@ -36,10 +60,26 @@ tf_module {
     name: "StatsOptions"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "Structure"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TFRecordWriter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "TensorStructure"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ThreadingOptions"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "UNKNOWN_CARDINALITY"
+    mtype: "<type \'int\'>"
+  }
   member_method {
     name: "Counter"
     argspec: "args=[\'start\', \'step\', \'dtype\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \"<dtype: \'int64\'>\"], "
@@ -48,6 +88,10 @@ tf_module {
     name: "bucket_by_sequence_length"
     argspec: "args=[\'element_length_func\', \'bucket_boundaries\', \'bucket_batch_sizes\', \'padded_shapes\', \'padding_values\', \'pad_to_bucket_boundary\', \'no_padding\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'False\'], "
   }
+  member_method {
+    name: "cardinality"
+    argspec: "args=[\'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "choose_from_datasets"
     argspec: "args=[\'datasets\', \'choice_dataset\'], varargs=None, keywords=None, defaults=None"
@@ -64,6 +108,10 @@ tf_module {
     name: "enumerate_dataset"
     argspec: "args=[\'start\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
+  member_method {
+    name: "filter_for_shard"
+    argspec: "args=[\'num_shards\', \'shard_index\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "get_next_as_optional"
     argspec: "args=[\'iterator\'], varargs=None, keywords=None, defaults=None"
@@ -90,7 +138,7 @@ tf_module {
   }
   member_method {
     name: "make_batched_features_dataset"
-    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDataset\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
+    argspec: "args=[\'file_pattern\', \'batch_size\', \'features\', \'reader\', \'label_key\', \'reader_args\', \'num_epochs\', \'shuffle\', \'shuffle_buffer_size\', \'shuffle_seed\', \'prefetch_buffer_size\', \'reader_num_threads\', \'parser_num_threads\', \'sloppy_ordering\', \'drop_final_batch\'], varargs=None, keywords=None, defaults=[\"<class \'tensorflow.python.data.ops.readers.TFRecordDatasetV1\'>\", \'None\', \'None\', \'None\', \'True\', \'10000\', \'None\', \'-1\', \'1\', \'2\', \'False\', \'False\'], "
   }
   member_method {
     name: "make_csv_dataset"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
index 509bbae8332fe767a34c14a33d5af1855b3ffdac..4c3d6ddd85233c356aa22ebc9a9ec395ca0a8b74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.data.pbtxt
@@ -8,10 +8,6 @@ tf_module {
     name: "FixedLengthRecordDataset"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "Iterator"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Options"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
index ab6287f8cd080621d76fc34e2cb437960a217800..314aedda909cda8b1d8a209333b85a7792c19bd5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.debugging.pbtxt
@@ -6,19 +6,19 @@ tf_module {
   }
   member_method {
     name: "assert_all_finite"
-    argspec: "args=[\'t\', \'msg\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'x\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_integer"
@@ -26,35 +26,35 @@ tf_module {
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_near"
-    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'rtol\', \'atol\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_negative"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_non_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_none_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_positive"
-    argspec: "args=[\'x\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_proper_iterable"
@@ -62,15 +62,15 @@ tf_module {
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_at_least"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank_in"
-    argspec: "args=[\'x\', \'ranks\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'ranks\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_same_float_dtype"
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "assert_scalar"
-    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "assert_type"
@@ -88,28 +88,8 @@ tf_module {
     name: "check_numerics"
     argspec: "args=[\'tensor\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "is_finite"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_inf"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_nan"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "is_non_decreasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "is_numeric_tensor"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "is_strictly_increasing"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..583cbc66549223e5c954b715e2043efa5417ef18
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-context.pbtxt
@@ -0,0 +1,25 @@
+path: "tensorflow.distribute.InputContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.InputContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "input_pipeline_id"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_input_pipelines"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_input_pipelines\', \'input_pipeline_id\', \'num_replicas_in_sync\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'1\'], "
+  }
+  member_method {
+    name: "get_per_replica_batch_size"
+    argspec: "args=[\'self\', \'global_batch_size\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a3a97aa0927b81708311d4b8b28fced217c00
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-input-replication-mode.pbtxt
@@ -0,0 +1,8 @@
+path: "tensorflow.distribute.InputReplicationMode"
+tf_class {
+  is_instance: "<enum \'InputReplicationMode\'>"
+  member {
+    name: "PER_WORKER"
+    mtype: "<enum \'InputReplicationMode\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a613e2d3d1dcefacdf0ec336587a46ff7e0bcb90
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-mirrored-strategy.pbtxt
@@ -0,0 +1,138 @@
+path: "tensorflow.distribute.MirroredStrategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.mirrored_strategy.MirroredStrategy\'>"
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'devices\', \'cross_device_ops\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4899f38cad253167ce0b94f79388cb97fe534197
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-reduce-op.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.distribute.ReduceOp"
+tf_class {
+  is_instance: "<enum \'ReduceOp\'>"
+  member {
+    name: "MEAN"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<enum \'ReduceOp\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..df707e8920e4488ed6b40a7f93f56b5624188c84
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-replica-context.pbtxt
@@ -0,0 +1,33 @@
+path: "tensorflow.distribute.ReplicaContext"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.ReplicaContext\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "distribution_strategy"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "replica_id_in_sync_group"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "strategy"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'strategy\', \'replica_id_in_sync_group\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "merge_call"
+    argspec: "args=[\'self\', \'merge_fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..77706e57133e1186d9e98fcf9205ed4c91772eda
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy-extended.pbtxt
@@ -0,0 +1,81 @@
+path: "tensorflow.distribute.StrategyExtended"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategyExtended\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "experimental_between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "experimental_should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'container_strategy\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast_to"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'()\', \'None\'], "
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_run_steps_on_iterator"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce_to"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\', \'destinations\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\', \'args\', \'kwargs\', \'group\'], varargs=None, keywords=None, defaults=[\'()\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..9eb73d2c0d9069ec4b818abe1825503f0ea36fc9
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.-strategy.pbtxt
@@ -0,0 +1,137 @@
+path: "tensorflow.distribute.Strategy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.distribute.distribute_lib.DistributionStrategy\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "between_graph"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "extended"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "num_replicas_in_sync"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "parameter_devices"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "require_static_shapes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_checkpoint"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_init"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "should_save_summary"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "worker_devices"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'extended\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "batch_reduce"
+    argspec: "args=[\'self\', \'aggregation\', \'value_destination_pairs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "broadcast"
+    argspec: "args=[\'self\', \'tensor\', \'destinations\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "call_for_each_replica"
+    argspec: "args=[\'self\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "colocate_vars_with"
+    argspec: "args=[\'self\', \'colocate_with_variable\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "configure"
+    argspec: "args=[\'self\', \'session_config\', \'cluster_spec\', \'task_type\', \'task_id\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "distribute_dataset"
+    argspec: "args=[\'self\', \'dataset_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "experimental_initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "finalize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "group"
+    argspec: "args=[\'self\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "initialize"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_dataset_iterator"
+    argspec: "args=[\'self\', \'dataset\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "make_input_fn_iterator"
+    argspec: "args=[\'self\', \'input_fn\', \'replication_mode\'], varargs=None, keywords=None, defaults=[\'InputReplicationMode.PER_WORKER\'], "
+  }
+  member_method {
+    name: "non_slot_devices"
+    argspec: "args=[\'self\', \'var_list\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "read_var"
+    argspec: "args=[\'self\', \'v\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reduce"
+    argspec: "args=[\'self\', \'reduce_op\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "run_steps_on_dataset"
+    argspec: "args=[\'self\', \'fn\', \'iterator\', \'iterations\', \'initial_loop_values\'], varargs=None, keywords=None, defaults=[\'1\', \'None\'], "
+  }
+  member_method {
+    name: "scope"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "unwrap"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update"
+    argspec: "args=[\'self\', \'var\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "update_config_proto"
+    argspec: "args=[\'self\', \'config_proto\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_non_slot"
+    argspec: "args=[\'self\', \'colocate_with\', \'fn\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "value_container"
+    argspec: "args=[\'self\', \'value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b0dd73ca1d4179b4a3323fa0a9be2fde4e22799c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.distribute.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.distribute"
+tf_module {
+  member {
+    name: "InputContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "InputReplicationMode"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "MirroredStrategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ReduceOp"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
+  member {
+    name: "ReplicaContext"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Strategy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StrategyExtended"
+    mtype: "<type \'type\'>"
+  }
+  member_method {
+    name: "get_loss_reduction"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "has_strategy"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "in_cross_replica_context"
+    argspec: "args=[], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
index 848fc303aa5748348b2aee69ec1e869807327d3d..01b870a81639807489ec2a09dcc185137aae1665 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.dtypes.pbtxt
@@ -44,6 +44,10 @@ tf_module {
     name: "half"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
+  member {
+    name: "int16"
+    mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
+  }
   member {
     name: "int32"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
index 32b84e90ce6ae99e80208905d701d690227a0cf7..efe9e74697096b4a7bac912f10c1092470daadec 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BaselineClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifier\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
index 94933e7ffd6b0f4838f7b2e9254a4056c9cbf245..382d392f39e4044916ff16718c9708b71043bcf4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-estimator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BaselineEstimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimator\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
index db7776b5bf67879cc806bed1b8463b99a082a50e..a7300bf06bb5bbb01c02b9050f8779910b11919e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-baseline-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.BaselineRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressor\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.baseline.BaselineRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'loss_reduction\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'weighted_sum_over_batch_size\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
index d530c71482a2f4db1b1006283dd86f15fd6c27fe..e138ce936ec73c05f8f790fb63c381e56ae2f654 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-classifier.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesClassifier\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
index 4703c0f561a301c3d7eb609d5ad2c0284ff626be..eae0a292a962680a53d8c683ee2d2b97e24937a6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-boosted-trees-regressor.pbtxt
@@ -3,6 +3,7 @@ tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees.BoostedTreesRegressor\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.boosted_trees._BoostedTreesBase\'>"
   is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,6 +33,10 @@ tf_class {
     name: "evaluate"
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
+  member_method {
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+  }
   member_method {
     name: "experimental_feature_importances"
     argspec: "args=[\'self\', \'normalize\'], varargs=None, keywords=None, defaults=[\'False\'], "
@@ -42,7 +47,7 @@ tf_class {
   }
   member_method {
     name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "export_savedmodel"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f9e1504b494e3863f770df23f9f9a92e004b8713
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.CheckpointSaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'checkpoint_dir\', \'save_secs\', \'save_steps\', \'saver\', \'checkpoint_basename\', \'scaffold\', \'listeners\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'model.ckpt\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..111b7583f2cd005912c7f06d977565cd17f265b8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-checkpoint-saver-listener.pbtxt
@@ -0,0 +1,24 @@
+path: "tensorflow.estimator.CheckpointSaverListener"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.CheckpointSaverListener\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "after_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_save"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\', \'global_step_value\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
index ce6040d0f279361ea789b54dee489996d9787ea7..a540085aba48c1d7c877b41831475cb2dacf8ec9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifier\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'2\', \'None\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
index 4635a1544c35cca12caaa51f665141a55fe00d8c..d1b29d670a0cbd3628569ea1c401a329f336c960 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-estimator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNEstimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimator\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
index e85007e16edcf5f7f59768c7c17b9340e15bc7b6..f6c3910a9fe5c76bafe03a636a4e91014055ce81 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNLinearCombinedClassifier"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifier\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedClassifierV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'n_classes\', \'weight_column\', \'label_vocabulary\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'2\', \'None\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
index a23f5daeac4d6690a599d0d92a2cb5ffdc4937c3..b78527279ca32decc71185a98f9f8270b4cd41a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-estimator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNLinearCombinedEstimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimator\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
index 8a55bb835ff4b118a8b2ee45561f3e29639bab90..9133f0d3b280dc8d2d5a263e25731594e0be2ef0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-linear-combined-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNLinearCombinedRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressor\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn_linear_combined.DNNLinearCombinedRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\', \'sum\'], "
+    argspec: "args=[\'self\', \'model_dir\', \'linear_feature_columns\', \'linear_optimizer\', \'dnn_feature_columns\', \'dnn_optimizer\', \'dnn_hidden_units\', \'dnn_activation_fn\', \'dnn_dropout\', \'label_dimension\', \'weight_column\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\', \'linear_sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Ftrl\', \'None\', \'Adagrad\', \'None\', \'<function relu instance>\', \'None\', \'1\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
index 2c4128ec480cc8302e01aa56b61fdb8a7db35b0a..a58d733302da9e69fe0d46d7d327e1b7868e198e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-d-n-n-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.DNNRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressor\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.dnn.DNNRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum\', \'False\'], "
+    argspec: "args=[\'self\', \'hidden_units\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'activation_fn\', \'dropout\', \'input_layer_partitioner\', \'config\', \'warm_start_from\', \'loss_reduction\', \'batch_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Adagrad\', \'<function relu instance>\', \'None\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'False\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
index 9d270a87ab8fe9788988f9277ad0e652f2b4860a..a1f0e76c8b87bac01e21850528e035e1baa7f3d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-estimator.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.estimator.Estimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -31,12 +31,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f24de493f24a363190cd1d323adaa75b32b0d8e3
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-feed-fn-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.FeedFnHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FeedFnHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feed_fn\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6651170ba33f491d5a5342bcd6e6814e1b973832
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-final-ops-hook.pbtxt
@@ -0,0 +1,34 @@
+path: "tensorflow.estimator.FinalOpsHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.FinalOpsHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "final_ops_values"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'final_ops\', \'final_ops_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..37db48bc64e2f0e955105e8094d51c851c25558b
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-global-step-waiter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.GlobalStepWaiterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.GlobalStepWaiterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'wait_until_step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
index 4acbff2cfffec47f1952ac9802a627efe24c870f..47de660a386c3362cf880ba9eed189f2bea047cd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-classifier.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LinearClassifier"
 tf_class {
   is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearClassifierV2\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
index 3d6b03098aac475e3ebedd0f87128315ca654122..66a127606a5be7c356a48ff7eb0751dd7db0eb02 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-estimator.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LinearEstimator"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimator\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearEstimatorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
index 0d1510e9ab1371d9cead321cfb6def4fcb417b16..5c094fe1318565443fb0864750fdf532d465cc04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-linear-regressor.pbtxt
@@ -1,7 +1,7 @@
 path: "tensorflow.estimator.LinearRegressor"
 tf_class {
-  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressor\'>"
-  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.Estimator\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.canned.linear.LinearRegressorV2\'>"
+  is_instance: "<class \'tensorflow_estimator.python.estimator.estimator.EstimatorV2\'>"
   is_instance: "<type \'object\'>"
   member {
     name: "config"
@@ -21,7 +21,7 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum\', \'sum\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'model_dir\', \'label_dimension\', \'weight_column\', \'optimizer\', \'config\', \'partitioner\', \'warm_start_from\', \'loss_reduction\', \'sparse_combiner\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'Ftrl\', \'None\', \'None\', \'None\', \'weighted_sum_over_batch_size\', \'sum\'], "
   }
   member_method {
     name: "eval_dir"
@@ -32,12 +32,12 @@ tf_class {
     argspec: "args=[\'self\', \'input_fn\', \'steps\', \'hooks\', \'checkpoint_path\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "export_saved_model"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "experimental_export_all_saved_models"
+    argspec: "args=[\'self\', \'export_dir_base\', \'input_receiver_fn_map\', \'assets_extra\', \'as_text\', \'checkpoint_path\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
-    name: "export_savedmodel"
-    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'strip_default_attrs\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'False\'], "
+    name: "export_saved_model"
+    argspec: "args=[\'self\', \'export_dir_base\', \'serving_input_receiver_fn\', \'assets_extra\', \'as_text\', \'checkpoint_path\', \'experimental_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\', \'infer\'], "
   }
   member_method {
     name: "get_variable_names"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..425f0167a161104891c3bb76816fe8c5094de28a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-logging-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.LoggingTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.LoggingTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'tensors\', \'every_n_iter\', \'every_n_secs\', \'at_end\', \'formatter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..6cf6e17e4352b0f909b31327a57bbdca3bc0e02a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-loss-during-training-error.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.estimator.NanLossDuringTrainingError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError\'>"
+  is_instance: "<type \'exceptions.RuntimeError\'>"
+  member {
+    name: "args"
+    mtype: "<type \'getset_descriptor\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..82293c2c0c4e7204d9aba83f43ed2fac6bc46b19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-nan-tensor-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.NanTensorHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.NanTensorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'loss_tensor\', \'fail_on_nan_loss\'], varargs=None, keywords=None, defaults=[\'True\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..65b5fb16b0874e7c6469ef11420db146be1f0b5f
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-profiler-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.ProfilerHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.ProfilerHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'show_dataflow\', \'show_memory\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'True\', \'False\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..64051d2bd6b69614cd210d902552ddeb8b6c8e5e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-second-or-step-timer.pbtxt
@@ -0,0 +1,26 @@
+path: "tensorflow.estimator.SecondOrStepTimer"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SecondOrStepTimer\'>"
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks._HookTimer\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_secs\', \'every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "last_triggered_step"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "should_trigger_for_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_last_triggered_step"
+    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..4368e04df3f86834b540bb5306bf66dd82ac440c
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-step-counter-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StepCounterHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StepCounterHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'every_n_steps\', \'every_n_secs\', \'output_dir\', \'summary_writer\'], varargs=None, keywords=None, defaults=[\'100\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..938b189a8c30237bb15bf73083a348e6366fbfc4
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-stop-at-step-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.StopAtStepHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.StopAtStepHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'num_steps\', \'last_step\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..104157315f5982efb4f6b9f39e0ece905a225e10
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.-summary-saver-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.SummarySaverHook"
+tf_class {
+  is_instance: "<class \'tensorflow.python.training.basic_session_run_hooks.SummarySaverHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'save_steps\', \'save_secs\', \'output_dir\', \'summary_writer\', \'scaffold\', \'summary_op\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5a2a01cd5325ba7e02d9b549293dd09a4a57e167
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.-in-memory-evaluator-hook.pbtxt
@@ -0,0 +1,30 @@
+path: "tensorflow.estimator.experimental.InMemoryEvaluatorHook"
+tf_class {
+  is_instance: "<class \'tensorflow_estimator.python.estimator.hooks.hooks.InMemoryEvaluatorHook\'>"
+  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunHook\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'estimator\', \'input_fn\', \'steps\', \'hooks\', \'name\', \'every_n_iter\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'100\'], "
+  }
+  member_method {
+    name: "after_create_session"
+    argspec: "args=[\'self\', \'session\', \'coord\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "after_run"
+    argspec: "args=[\'self\', \'run_context\', \'run_values\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "before_run"
+    argspec: "args=[\'self\', \'run_context\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "begin"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "end"
+    argspec: "args=[\'self\', \'session\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
index cabca3e883fbceecb399e048e09722acd4efcad4..f0fd7ce782db71ff5e790fe50e93556bf5d19e1e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.experimental.pbtxt
@@ -1,9 +1,17 @@
 path: "tensorflow.estimator.experimental"
 tf_module {
+  member {
+    name: "InMemoryEvaluatorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LinearSDCA"
     mtype: "<type \'type\'>"
   }
+  member_method {
+    name: "build_raw_supervised_input_receiver_fn"
+    argspec: "args=[\'features\', \'labels\', \'default_batch_size\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "call_logit_fn"
     argspec: "args=[\'logit_fn\', \'features\', \'mode\', \'params\', \'config\'], varargs=None, keywords=None, defaults=None"
@@ -20,6 +28,10 @@ tf_module {
     name: "make_early_stopping_hook"
     argspec: "args=[\'estimator\', \'should_stop_fn\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'60\', \'None\'], "
   }
+  member_method {
+    name: "make_stop_at_checkpoint_step_hook"
+    argspec: "args=[\'estimator\', \'last_step\', \'wait_after_file_check_secs\'], varargs=None, keywords=None, defaults=[\'30\'], "
+  }
   member_method {
     name: "stop_if_higher_hook"
     argspec: "args=[\'estimator\', \'metric_name\', \'threshold\', \'eval_dir\', \'min_steps\', \'run_every_secs\', \'run_every_steps\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'60\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
index c5b0085b8d3ec58b4215d4a756957e1509501841..d3656ae0455971ccd98062a52ec0412bf6af06f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.estimator.pbtxt
@@ -24,6 +24,14 @@ tf_module {
     name: "BoostedTreesRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "CheckpointSaverHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CheckpointSaverListener"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DNNClassifier"
     mtype: "<type \'type\'>"
@@ -64,10 +72,22 @@ tf_module {
     name: "Exporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FeedFnHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "FinalExporter"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "FinalOpsHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "GlobalStepWaiterHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LatestExporter"
     mtype: "<type \'type\'>"
@@ -84,14 +104,46 @@ tf_module {
     name: "LinearRegressor"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LoggingTensorHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "ModeKeys"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "NanLossDuringTrainingError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "NanTensorHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "ProfilerHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "RunConfig"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "SecondOrStepTimer"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StepCounterHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "StopAtStepHook"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SummarySaverHook"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TrainSpec"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
index f06e7989537eef2b0e6fa4b720e90614366b41ee..3aadd7dc341ae97fdbfa83cd3fc96fc75249a4c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.feature_column.pbtxt
@@ -14,7 +14,7 @@ tf_module {
   }
   member_method {
     name: "categorical_column_with_vocabulary_file"
-    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'num_oov_buckets\', \'default_value\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'key\', \'vocabulary_file\', \'vocabulary_size\', \'dtype\', \'default_value\', \'num_oov_buckets\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\", \'None\', \'0\'], "
   }
   member_method {
     name: "categorical_column_with_vocabulary_list"
@@ -32,14 +32,6 @@ tf_module {
     name: "indicator_column"
     argspec: "args=[\'categorical_column\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "input_layer"
-    argspec: "args=[\'features\', \'feature_columns\', \'weight_collections\', \'trainable\', \'cols_to_vars\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "linear_model"
-    argspec: "args=[\'features\', \'feature_columns\', \'units\', \'sparse_combiner\', \'weight_collections\', \'trainable\', \'cols_to_vars\'], varargs=None, keywords=None, defaults=[\'1\', \'sum\', \'None\', \'True\', \'None\'], "
-  }
   member_method {
     name: "make_parse_example_spec"
     argspec: "args=[\'feature_columns\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
deleted file mode 100644
index eecfaffd0a6f6e611eba8bf3f5bb709bc9e0157f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-fast-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.FastGFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.FastGFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
deleted file mode 100644
index 305251059d90b52aa2e76e99a4ec65e68b73fb79..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-g-file.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.GFile"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
deleted file mode 100644
index 6e8894180a4a685d5a35ba02df53c6e054db01b9..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.-open.pbtxt
+++ /dev/null
@@ -1,58 +0,0 @@
-path: "tensorflow.gfile.Open"
-tf_class {
-  is_instance: "<class \'tensorflow.python.platform.gfile.GFile\'>"
-  is_instance: "<class \'tensorflow.python.lib.io.file_io.FileIO\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "mode"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'name\', \'mode\'], varargs=None, keywords=None, defaults=[\'r\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "next"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "read"
-    argspec: "args=[\'self\', \'n\'], varargs=None, keywords=None, defaults=[\'-1\'], "
-  }
-  member_method {
-    name: "readline"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "readlines"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "seek"
-    argspec: "args=[\'self\', \'offset\', \'whence\', \'position\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "size"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "tell"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "write"
-    argspec: "args=[\'self\', \'file_content\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
deleted file mode 100644
index 65b55a8b7c4e30e349c1ea256664002b19191c82..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.gfile.pbtxt
+++ /dev/null
@@ -1,63 +0,0 @@
-path: "tensorflow.gfile"
-tf_module {
-  member {
-    name: "FastGFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GFile"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Open"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "Copy"
-    argspec: "args=[\'oldpath\', \'newpath\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "DeleteRecursively"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Exists"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Glob"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "IsDirectory"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "ListDirectory"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "MakeDirs"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "MkDir"
-    argspec: "args=[\'dirname\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Remove"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Rename"
-    argspec: "args=[\'oldname\', \'newname\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
-  }
-  member_method {
-    name: "Stat"
-    argspec: "args=[\'filename\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Walk"
-    argspec: "args=[\'top\', \'in_order\'], varargs=None, keywords=None, defaults=[\'True\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
index 0a231f1b65155b8662bb38943bfd97c5283b9385..3c6ed1cfb8340b6e8f2599360e3c321c562e37ff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.image.pbtxt
@@ -38,7 +38,7 @@ tf_module {
   }
   member_method {
     name: "crop_and_resize"
-    argspec: "args=[\'image\', \'boxes\', \'box_ind\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
+    argspec: "args=[\'image\', \'boxes\', \'box_indices\', \'crop_size\', \'method\', \'extrapolation_value\', \'name\'], varargs=None, keywords=None, defaults=[\'bilinear\', \'0\', \'None\'], "
   }
   member_method {
     name: "crop_to_bounding_box"
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "extract_image_patches"
-    argspec: "args=[\'images\', \'ksizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'images\', \'sizes\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_jpeg_shape"
@@ -173,16 +173,8 @@ tf_module {
     argspec: "args=[\'image\', \'lower\', \'upper\', \'seed\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "resize_area"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bicubic"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
-  member_method {
-    name: "resize_bilinear"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
+    name: "resize"
+    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
   }
   member_method {
     name: "resize_image_with_crop_or_pad"
@@ -192,14 +184,6 @@ tf_module {
     name: "resize_image_with_pad"
     argspec: "args=[\'image\', \'target_height\', \'target_width\', \'method\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
-  member_method {
-    name: "resize_images"
-    argspec: "args=[\'images\', \'size\', \'method\', \'align_corners\', \'preserve_aspect_ratio\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\'], "
-  }
-  member_method {
-    name: "resize_nearest_neighbor"
-    argspec: "args=[\'images\', \'size\', \'align_corners\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
-  }
   member_method {
     name: "rgb_to_grayscale"
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -222,7 +206,7 @@ tf_module {
   }
   member_method {
     name: "sample_distorted_bounding_box"
-    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'seed2\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'image_size\', \'bounding_boxes\', \'seed\', \'min_object_covered\', \'aspect_ratio_range\', \'area_range\', \'max_attempts\', \'use_image_if_no_bounding_boxes\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'0.1\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sobel_edges"
@@ -241,8 +225,8 @@ tf_module {
     argspec: "args=[\'images\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "transpose_image"
-    argspec: "args=[\'image\'], varargs=None, keywords=None, defaults=None"
+    name: "transpose"
+    argspec: "args=[\'image\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "yiq_to_rgb"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..cfa3372b12bfe32eed4311c89b6448c0359c0913
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.gfile.pbtxt
@@ -0,0 +1,51 @@
+path: "tensorflow.io.gfile"
+tf_module {
+  member_method {
+    name: "copy"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "exists"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "glob"
+    argspec: "args=[\'pattern\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "isdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "listdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "makedirs"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "mkdir"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "remove"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "rename"
+    argspec: "args=[\'src\', \'dst\', \'overwrite\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "rmtree"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "stat"
+    argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "walk"
+    argspec: "args=[\'top\', \'topdown\', \'onerror\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
index 64b63ed1a4a5611d369cd4aa01589aee2076b24f..8906329742c61ed08a25bcc252ec0d1dfa9e374e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.io.pbtxt
@@ -44,22 +44,50 @@ tf_module {
     name: "VarLenFeature"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "gfile"
+    mtype: "<type \'module\'>"
+  }
+  member_method {
+    name: "decode_and_crop_jpeg"
+    argspec: "args=[\'contents\', \'crop_window\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
+  }
   member_method {
     name: "decode_base64"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_bmp"
+    argspec: "args=[\'contents\', \'channels\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'None\'], "
+  }
   member_method {
     name: "decode_compressed"
     argspec: "args=[\'bytes\', \'compression_type\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'None\'], "
   }
   member_method {
     name: "decode_csv"
-    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'name\', \'na_value\', \'select_cols\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'None\', \'\', \'None\'], "
+    argspec: "args=[\'records\', \'record_defaults\', \'field_delim\', \'use_quote_delim\', \'na_value\', \'select_cols\', \'name\'], varargs=None, keywords=None, defaults=[\',\', \'True\', \'\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "decode_gif"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "decode_image"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
+  member_method {
+    name: "decode_jpeg"
+    argspec: "args=[\'contents\', \'channels\', \'ratio\', \'fancy_upscaling\', \'try_recover_truncated\', \'acceptable_fraction\', \'dct_method\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'1\', \'True\', \'False\', \'1\', \'\', \'None\'], "
   }
   member_method {
     name: "decode_json_example"
     argspec: "args=[\'json_examples\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "decode_png"
+    argspec: "args=[\'contents\', \'channels\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \"<dtype: \'uint8\'>\", \'None\'], "
+  }
   member_method {
     name: "decode_raw"
     argspec: "args=[\'bytes\', \'out_type\', \'little_endian\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
@@ -72,6 +100,18 @@ tf_module {
     name: "encode_base64"
     argspec: "args=[\'input\', \'pad\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
+  member_method {
+    name: "encode_jpeg"
+    argspec: "args=[\'image\', \'format\', \'quality\', \'progressive\', \'optimize_size\', \'chroma_downsampling\', \'density_unit\', \'x_density\', \'y_density\', \'xmp_metadata\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'95\', \'False\', \'False\', \'True\', \'in\', \'300\', \'300\', \'\', \'None\'], "
+  }
+  member_method {
+    name: "extract_jpeg_shape"
+    argspec: "args=[\'contents\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "is_jpeg"
+    argspec: "args=[\'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "match_filenames_once"
     argspec: "args=[\'pattern\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -82,7 +122,7 @@ tf_module {
   }
   member_method {
     name: "parse_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'serialized\', \'features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "parse_sequence_example"
@@ -90,7 +130,7 @@ tf_module {
   }
   member_method {
     name: "parse_single_example"
-    argspec: "args=[\'serialized\', \'features\', \'name\', \'example_names\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'serialized\', \'features\', \'example_names\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "parse_single_sequence_example"
@@ -106,20 +146,16 @@ tf_module {
   }
   member_method {
     name: "serialize_many_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'sp_input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
   }
   member_method {
     name: "serialize_sparse"
-    argspec: "args=[\'sp_input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'string\'>\"], "
+    argspec: "args=[\'sp_input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'string\'>\", \'None\'], "
   }
   member_method {
     name: "serialize_tensor"
     argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "tf_record_iterator"
-    argspec: "args=[\'path\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "write_file"
     argspec: "args=[\'filename\', \'contents\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
index 8ccba990bdd94c4748efb2017cbd2eaabb842b9b..a3254cbd947d9ef70617131e9f4b17f44f059840 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -105,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -225,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -247,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -259,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
index 27aa91a64529a0509451e59d0adbf00e8aa831e4..b70e9ee98d5bc4900420ddb1307abf9adcd8cad0 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -110,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -242,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -264,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -276,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
index b30778b2a082972e153eed9cf015d5b32d98379a..d200d3d26d7c1b7d54eda596a8056a66e29be0b6 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.backend.pbtxt
@@ -394,7 +394,7 @@ tf_module {
   }
   member_method {
     name: "rnn"
-    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\'], "
+    argspec: "args=[\'step_function\', \'inputs\', \'initial_states\', \'go_backwards\', \'mask\', \'constants\', \'unroll\', \'input_length\', \'time_major\', \'zero_output_for_mask\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'None\', \'False\', \'None\', \'False\', \'False\'], "
   }
   member_method {
     name: "round"
@@ -508,6 +508,10 @@ tf_module {
     name: "temporal_padding"
     argspec: "args=[\'x\', \'padding\'], varargs=None, keywords=None, defaults=[\'(1, 1)\'], "
   }
+  member_method {
+    name: "tile"
+    argspec: "args=[\'x\', \'n\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "to_dense"
     argspec: "args=[\'tensor\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
index db69e25c5b70a0cdd76dba6aa570d0c634a31279..1d814b2c8b553f1b2a07f9d9b97dc70ec0674969 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.experimental.-peephole-l-s-t-m-cell.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
index 5510465d7b015e4989472b06c9d00ec9772373cf..b84629540e700f242f885064c92309c294693a11 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activation.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
index 38ec8a0aff0b9321f3a7ab2cfd9e6b75a8228e4a..5918a13ad8629582829049485e896688ecad9579 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-activity-regularization.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
index 41cb8e30bfb57068ebe787f14f69ccc467047f26..599da06427dfe4f28e757a7aac8d8a14856a4556 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-add.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
index 9a7aaa8e961528aa750248e02f44403cab10a413..f9ff1538c8134d96051ad81d35c73e59c6a8cc57 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-alpha-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
index 014f5828fad1152b19a0b0e3d2ffee7cee4c999b..723fc9cdb0d0ad93470e22fd8c147d3ecc92af91 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
index cc303bf7b98bb81cb0646fc18df0a4c5c70f1917..957ce2f0ce86f8df3eb8b57606229fb661eb52f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
index 628447ce3555628b651536d6c5b2a7716d59085c..a52c0af68175420dc2a1993d1f025d36705538e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
index f03c986c22210906ad7bdc8b880753469b31aa1b..a004db62ddcaaae02a411d8db51f4026ece1384d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-average.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
index a6e4856de9b63c946b77b745a6ede28dedf44afc..44f83d1387cb2ec681f50f7b1f0297f3f74594ed 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
index a01eaf8a12626257e97d135f50c06c7ea32fca27..8378faf7188ec594865d4b68c8ea8cae284183ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
index 0d6698f2ef4c674bf8a4dfc026eb209a83dcb8e7..9d5655c9644e3a2394a346bed78fc478cf60ba8d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
index f1b23be48f7fec2051f1985381058d769eb8c2f8..5da79268129fc5c08cbd37686333847cbb32730d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-batch-normalization.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.BatchNormalization"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalization\'>"
+  is_instance: "<class \'tensorflow.python.keras.layers.normalization.BatchNormalizationV2\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
index 0672cd5b7b8fdb1967e39c9163635372f73459b7..d37a6b47105225d7b83b6a264b944ceeb583a6c4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-bidirectional.pbtxt
@@ -97,6 +97,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
index b25ae1e82e8a1f315553337a261a2d8a46301fa0..1ad7a91be0ba48d0dbab19da8c7cd9ca89095918 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-concatenate.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
index bb1918eba65659d9ede888400c24b3a5121d6052..cb9abc25396bb63a3c40de5cc52f9df7ed20071e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv-l-s-t-m2-d.pbtxt
@@ -178,6 +178,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
index 16e0fd5a3131723b3ba3ef3ae6d93fa6426dbd47..47dba1d81f8f97a60fe72ec521f82a78ee5f3505 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
index 381839d6deb8355a54fd883596f94e38fa1356d4..fd649418961301f150aac3dabc1bdf0ade4a9c28 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
index 543bae6fa96fa3ae51775e865bf95ea6f79c8e94..1b1425d53197db8b59abf51fe93c0b0c45299956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
index 2933f9f4b3ab854a9bff6a200da2c2b912bfd4d8..1741063fe8b09acf3865e0a135e96bb715dcdcfa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
index 072943dc2c709a7cee26c3439e02e11455187282..50feb4f458ad1a9cb2b2bfe5d67997b7551eed74 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-conv3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
index 222a1ef4fc5d19afe2c111c169c2f0bd38c331d6..faaa535df9fe03ad07862f0793f8ebea67b405ca 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
index 9c9c7461c8b8b43acfc7b7db94fd961a15d57817..4079329d1ee2a61270fee38426bb8a0859c38ce3 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
index f93906717814d4df7dfbf983d6cdbef358e9a55c..32e56696e1617f7810792e3416a2ebb2037d23c2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
index 44ca598724a5c7b6d40ba460dd866675971015c3..381abe73401fa3a588873d643324fc020c159e30 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d-transpose.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
index 471b18ef8500a279fb07bc893e2c8100d76d7bf1..b3e4bf9689dc7e9db63de7f43e9dfa9ac4d42b02 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-convolution3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
index 0f250a09b7eb69871e7e89d30da817aeb1d896fc..7aeff8003c322e8a8168dd70481a8b30b08762a8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
index f52128483c67321e4f0e5f0cf5a9fd3c65794561..a1728d9d4f9a1e677646db04c4d0df9572e21208 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
index 98daf3bab128357ffdde2e8ffa4f61fd5c6493f7..8d8fd142cc64ee113c4b6a7e4e2462ecc69b6028 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cropping3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
index b207c6800050a6a3f8b9525315fffa14341758cc..7758209adf8fe7a1306fa5ef125935dafd925c3e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-g-r-u.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
index 2d7a09ceda90fa8564de4410bf9b553cf1594c97..7c463ff1257599366be049edce6cc06140906286 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-cu-d-n-n-l-s-t-m.pbtxt
@@ -98,6 +98,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
similarity index 77%
rename from tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
index b66c0f89cc904c1318787651a3e8e629319c14fb..0781a93bd56c5ebc77e1fb650497621e49d7ee1f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-multi-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense-features.pbtxt
@@ -1,8 +1,6 @@
-path: "tensorflow.nn.rnn_cell.MultiRNNCell"
+path: "tensorflow.keras.layers.DenseFeatures"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.MultiRNNCell\'>"
-  is_instance: "<class \'tensorflow.python.ops.rnn_cell_impl.RNNCell\'>"
-  is_instance: "<class \'tensorflow.python.layers.base.Layer\'>"
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.DenseFeatures\'>"
   is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
   is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
   is_instance: "<type \'object\'>"
@@ -14,10 +12,6 @@ tf_class {
     name: "dtype"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "inbound_nodes"
     mtype: "<type \'property\'>"
@@ -66,18 +60,6 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
-  member {
-    name: "output_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "scope_name"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "state_size"
-    mtype: "<type \'property\'>"
-  }
   member {
     name: "trainable_variables"
     mtype: "<type \'property\'>"
@@ -100,12 +82,16 @@ tf_class {
   }
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'cells\', \'state_is_tuple\'], varargs=None, keywords=None, defaults=[\'True\'], "
+    argspec: "args=[\'self\', \'feature_columns\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'True\', \'None\'], "
   }
   member_method {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -116,7 +102,7 @@ tf_class {
   }
   member_method {
     name: "add_weight"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'use_resource\', \'synchronization\', \'aggregation\', \'partitioner\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\', \'None\'], "
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
   }
   member_method {
     name: "apply"
@@ -128,7 +114,7 @@ tf_class {
   }
   member_method {
     name: "call"
-    argspec: "args=[\'self\', \'inputs\', \'state\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'self\', \'features\', \'cols_to_output_tensors\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_mask"
@@ -150,10 +136,6 @@ tf_class {
     name: "get_config"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "get_initial_state"
-    argspec: "args=[\'self\', \'inputs\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "get_input_at"
     argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
@@ -194,8 +176,4 @@ tf_class {
     name: "set_weights"
     argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "zero_state"
-    argspec: "args=[\'self\', \'batch_size\', \'dtype\'], varargs=None, keywords=None, defaults=None"
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
index 3ac3825759391b7ea21fd6e3b3b149bb9e731479..4960d0264e96e872ea5c49a8841cef20bd5eb37c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dense.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
index 280ec8c25fabe1be63c9aa9a2c7f168315c219d7..8fad7535f882718462a11e27e75732e3097cb87d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-depthwise-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
index 560f66f9c7a1f7e42e27c739a6c71671f8bd147b..5b425f2d4d7a8a897280490e26922766d8bf7065 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dot.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
index c0543529c3884f20383911f32ea04c07fec4a050..f6c4d0a438ed027635b40ec992eb1bbcb5c9a3a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
index 04eb2824b9b14cf45eaef263282ffc6778bf709d..82b761fc1761bb3e7638f7a80bc80c6433162d04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-e-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
index f400432915f8ce892a3297a23078f140eb96db7b..c9ff323877e06b6dff274644744d425e3a9b7932 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-embedding.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
index ab176b441a246d93b88c00cd6decb34af175ad86..9b4165d4cbf88fefd2bb684dae70ea8afc01357b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-flatten.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
index c3895a0ac127bc663f2a323661c1371a428159b0..f225f7c4309615919fb05df05f2ae664bde80097 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
index 9e24bb8ae6a1dc2e438038d4ac9225a7f17c4598..855d001700179fb634d1dff78585d340420abe7f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-g-r-u.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
index 55e0d7ef023ac4ca5e89f640c5ebb79199c31afa..2c404c99cd2175cdc8b60b229e4410bf280ebcb7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-dropout.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
index 38fbff5e4a3d2c892b0601c54e52690dae5760bd..6f109d59d0f6fcd2b4650719e3b4f653baec7d23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-gaussian-noise.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
index a8094c0bde3e46b49cd253d7861922c90f1ffbd1..69f8a9031d32eb73bb44291cdf330d738d745cf9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
index 929f48df23180a2c5e21c110e0e1d343596ecd76..4299f765e525b136e289bba169becec06e19ffb1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
index 2e6d59337f1df94e327b506248eb74ab11bd6013..9153a1a2406b6fc4ab60c80fee2f8d6d69b00b72 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-average-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
index 3ebe162f573f630400a17d9a7ccad1615b8e9c78..625e81fd2322ceba153fa65c138948ce43843089 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
index 4e3e258430cdacaf55aed5d46411d2b74c9bdf2e..2fc769742c70c5665c9cb77ad246fcdb49366d5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
index fb9166316f6a641eb12a5664100e31d652148a84..e307a65c7c565660e1f2b6b6b74dc5970425eaa4 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-avg-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
index c0a53b847b4afc1fb098fa06eb8e8e27a96c3459..4394ad0364e89fd3531d6625e52540991cadf973 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
index 87b7f6797a0d5bef8c5a4ff582c30433eaced2d4..050ed39fe98dc7cfdf6febe45e235d3ae7cbf486 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
index 98bf96fa0c251c5f6de8878d48e651ac3346ff38..436191821ef4689351b6124cf2a20afad917e4ab 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
index ff6c6f3ec4d501a87a858c7c9bf365590cfb4fdc..4ba540aa6adc72b572aa9340f89967d69ab78a3c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
index c9d4158d1c434655abb11b92269e6e70ad2d1f91..a2e9322cb3fd4e56af708d5c4e17b660f7bc2247 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
index 9953102ff991bfd4f0568120dd7aef07f75ea208..5d16a57fc1aeff9939220de8043fcae39e3d953e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-global-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
index 2617f5a95fa631cf0b92e1fd2feef7457f96fd80..9dd29c1251ef2eacaf535a3f10f3d42dc36624a2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-layer.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
index 5fd0a47a68c0d4ad218c4c64cc6be8f603d9673a..bc3ceb67a4e7506b42fccd6b227891b9eef8147f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-input-spec.pbtxt
@@ -1,6 +1,6 @@
 path: "tensorflow.keras.layers.InputSpec"
 tf_class {
-  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.InputSpec\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.input_spec.InputSpec\'>"
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
index e9f6ef45aaf1c775ea1b8dd157737f65c87e232f..0045d5775e2c19df21428bd4420b6e5612c8002b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
index 1b1ccbe118069e5fa5c3acd4ebea2d7fc14395b2..529c750f98715ec30313ed34c9023a845061a3df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-l-s-t-m.pbtxt
@@ -161,6 +161,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
index 2e0b6bac24fd63988b28c1099d40581989b783df..d4d1bc6b6bbf0ce39742b740aff6dc0c1cd464a1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-lambda.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
index 1e93d1118a4d306d5427d9b6873de1746d93b764..e1f5491180903f7d6931cc09755cabb715bbf233 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-layer.pbtxt
@@ -87,6 +87,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
index bfd36012a7edb8a74198a87a86577278be3fdcd4..9b69d9a9447f42907236b5cc8c7672012f96c38a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-leaky-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2b66576c96b8503d3ebb90f02ed19233223a269a
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-linear-model.pbtxt
@@ -0,0 +1,289 @@
+path: "tensorflow.keras.layers.LinearModel"
+tf_class {
+  is_instance: "<class \'tensorflow.python.feature_column.feature_column_v2.LinearModel\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.training.Model\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.network.Network\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "bias"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_spec"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "layers"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "state_updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "stateful"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'feature_columns\', \'units\', \'sparse_combiner\', \'trainable\', \'name\'], varargs=None, keywords=kwargs, defaults=[\'1\', \'sum\', \'True\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\', \'partitioner\', \'use_resource\', \'synchronization\', \'aggregation\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'VariableSynchronization.AUTO\', \'VariableAggregation.NONE\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'features\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compile"
+    argspec: "args=[\'self\', \'optimizer\', \'loss\', \'metrics\', \'loss_weights\', \'sample_weight_mode\', \'weighted_metrics\', \'target_tensors\', \'distribute\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'verbose\', \'sample_weight\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'1\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "evaluate_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "fit"
+    argspec: "args=[\'self\', \'x\', \'y\', \'batch_size\', \'epochs\', \'verbose\', \'callbacks\', \'validation_split\', \'validation_data\', \'shuffle\', \'class_weight\', \'sample_weight\', \'initial_epoch\', \'steps_per_epoch\', \'validation_steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=kwargs, defaults=[\'None\', \'None\', \'None\', \'1\', \'1\', \'None\', \'0.0\', \'None\', \'True\', \'None\', \'None\', \'0\', \'None\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "fit_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps_per_epoch\', \'epochs\', \'verbose\', \'callbacks\', \'validation_data\', \'validation_steps\', \'class_weight\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'shuffle\', \'initial_epoch\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'1\', \'None\', \'None\', \'None\', \'None\', \'10\', \'1\', \'False\', \'True\', \'0\'], "
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\', \'custom_objects\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_layer"
+    argspec: "args=[\'self\', \'name\', \'index\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "load_weights"
+    argspec: "args=[\'self\', \'filepath\', \'by_name\'], varargs=None, keywords=None, defaults=[\'False\'], "
+  }
+  member_method {
+    name: "predict"
+    argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\', \'10\', \'1\', \'False\'], "
+  }
+  member_method {
+    name: "predict_generator"
+    argspec: "args=[\'self\', \'generator\', \'steps\', \'max_queue_size\', \'workers\', \'use_multiprocessing\', \'verbose\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'1\', \'False\', \'0\'], "
+  }
+  member_method {
+    name: "predict_on_batch"
+    argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "save"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'include_optimizer\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
+  }
+  member_method {
+    name: "save_weights"
+    argspec: "args=[\'self\', \'filepath\', \'overwrite\', \'save_format\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "summary"
+    argspec: "args=[\'self\', \'line_length\', \'positions\', \'print_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "test_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+  }
+  member_method {
+    name: "to_json"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "to_yaml"
+    argspec: "args=[\'self\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "train_on_batch"
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
index 5ad5990d7e624c4f6b1dde92b4608c65aeb19db1..fd52259432577ac94dc702d4411ad5c0eed1ff10 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
index 40d03369a5235f394832e3e2f48710bb069e9aac..5fc8af0d03564c649dff6e9df70d10731319de40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-locally-connected2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
index 86666b51bb8c8dc22deb95f05cb9edfb10688015..7f8932270e63bc02852c5b64e53694e7e26be08b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-masking.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
index d26da270e747898412b05f6e07e3bbc23f287e0b..4723b99cb0792e1ce0bdc45e46908da8c2b5359c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
index 85f23df671d2772995ec01bb09e191237d60e6a7..173c5d4a8b149c4e23683cf375e8d793db7faa5a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
index 235806b96500473fe95dd1b25aafe7f091bdb36b..14e1899e145224e411d65cbf481060a3b2cec0f1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pool3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
index 524c5fd69e508bac55b05502e62650b92cf2be5c..a708e652bf0e82dea0f58034a81a040a39550dc9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
index fda2562fc8c51623f5c4b33e23319ed35229905e..e6706b5cf9f32bda78adc4e2db5916a5750cc82e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
index 71d2d09a8d1d7addf91d7dc4ca109f8c2d45aed9..a73c082d1bba0453b742f76bacf0ad6116ba79a7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-max-pooling3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
index 12949b39a6f7affa657d1dccdc49ad0dc37e9c2f..f3f195554bbf4a43efaf2af0fd278a23bf270994 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-maximum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
index ab16d0021e627e6a2a821a0185ad71eb5bef1835..f345d1d67b2ce0200c64b1aeea5f39821d070bac 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-minimum.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
index 61ccbf5962791ee1c0b35cc4aba422ff5cacd456..31cb8bc177c7a9e365101e75108a29900fbda124 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-multiply.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
index ce2320d7030d05ba1e065f5bbcf8a18014891b5e..44cccc92bd2f1ff0335c22f2967865dc88a96ff7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-p-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
index 69848af8cf876ad1232a0bf7c419f52ed68af9f0..b55e191ff1ad6997550966bbb6154a81a489575d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-permute.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
index 3358f26aebf8a8f845278d54b4524f4405751f5d..e9575436e5b14ac8c52a0b59c86937886eab5f40 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-r-n-n.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
index 413f45f018ae0ce9ccf0e459b24d544c456e4c7c..98223b207f2ecfd5b7af8a53390166e53a7d4f73 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
index 9c61ff602744c00f9105a3f297151b49a8a3dead..2df918b16b2552323d75083bfa80e328c0639cfe 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-repeat-vector.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
index baa91804c49f86a31093aed0c0a56613f7c1afee..ce5f9e21290eeddc0052257191ac4a6d068c1366 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-reshape.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
index 15a5d6ac9ea6e087dc0d76a2ab48b08448bfb6ee..a0bb917775fd9edb5d909bf850310e0596a88209 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
index be43bd5b3c13632711a49cbbe6c85527d46d46ec..d7942f201bdbfa8d1577813be461a5905b5c6c90 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-conv2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
index 6105992c7a3a92d00718fe3287412af3c752db1d..f7ac9042d46f46ab35d18c62e5d8841679a18ca9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution1-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
index 1b6cf1e9ecb08a789212da141971434bd63988a6..e5a92688220f6e227b317d71a70fde01df4c432b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-separable-convolution2-d.pbtxt
@@ -90,6 +90,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
index 29488a37f8f29f953d2b8b7e447c331df3244c84..0fe2c974a762784a82a6b97e116357be2a61d84f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n-cell.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
index 3d70cf8b65972bda1f2bf4d78e126c0946a7c2a5..2ee5873f0f11688019dec3a6cd69db06d99b9caa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-simple-r-n-n.pbtxt
@@ -149,6 +149,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
index d29731ecf9d5387a324104865af5f563d287c60b..5b8f64aa35725d0ea44fc5c5b81952fd839503e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-softmax.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
index a6d7494ca7d2230298a442b86766f46bc58a6d54..240cb6e562f77467d94ef95db2374150e318bc04 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout1-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
index c36e802693df564702100a652f3ccc2e95e4c40d..6226c469f8a534f96f6ea991fa5e7d2cf0019e3f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout2-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
index 9c46cfe40fd6959b526d6ca271bda3182daa1188..34dabce6d8dd0b1b6fe50a008a981e1f06a77edf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-spatial-dropout3-d.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
index 8982f787940dd65291580781b5dc95941d804071..0ddf628ace582db259ebe0b211aba6e6362b5d5b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-stacked-r-n-n-cells.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
index ec2cc502984d302b243803b04b4f9d60cee43d05..12eb35ad154a514afd9c900cb2dbece8af28c49f 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-subtract.pbtxt
@@ -89,6 +89,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
index d7bc1980f32e523781a68e80312905bc355f0509..c41020c2b45cc88c9b63f3b7a45c35066794dfe2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-thresholded-re-l-u.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
index fec2de6b49ec1ffaf45b9ee9048bcce37425e919..479f89cf6ae93e8d6ae02e304a51a145164df7de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-time-distributed.pbtxt
@@ -93,6 +93,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
index 3d285e7f17db3e8cdfbacf0056a4c56ffa7e67cb..233363ce02614f184b43a059889c7475b6a8c50b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
index b05e5ec84de1eb4899b6fae437dc0d4bd1ab402d..cb6228ac446bd236df88f94eb6e9e717ea38463d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
index 728eca415a80842291d5684e55632689ceea4099..03bad3ccb613a225ad56e128ea680fc9312151e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-up-sampling3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
index da64e77c39c0e116ff725bb05526882541dd6056..158996792a47fab0e7aa26d21d4bb7f281ca76d2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-wrapper.pbtxt
@@ -92,6 +92,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
index 2f505f9293f429490543ba2c569668f4b2ba3ca4..63a56cd3eebe271f66258c9a0acb974764555b34 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding1-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
index f82c77072e6969dd57f89f4a971e59e28b4bfc63..965a4cca04651e123c5bd93484200a58b39918ba 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding2-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
index 54e01a99177cde5fbfaf5e1e0ac310bef3ea8eae..1a624308878a68f1b48cb0f8b5e08dafbbfa0333 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.-zero-padding3-d.pbtxt
@@ -88,6 +88,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
index 9d7e5bb8c7808689bedd8abb835e61c1f38fdb1d..3b4724ef104878df0caada75b0ba68740dc93f8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.layers.pbtxt
@@ -124,6 +124,10 @@ tf_module {
     name: "Dense"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "DenseFeatures"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "DepthwiseConv2D"
     mtype: "<type \'type\'>"
@@ -240,6 +244,10 @@ tf_module {
     name: "LeakyReLU"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "LinearModel"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "LocallyConnected1D"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2f7da93f6f412ca559aec2f6acde2b80a5c93c86
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-binary-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.BinaryCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.BinaryCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..b3a7cd80973259bd5cdfe382c656a9478f8933d8
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-categorical-crossentropy.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.CategoricalCrossentropy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.CategoricalCrossentropy\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'from_logits\', \'label_smoothing\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'0\', \'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..712bb2ecd3526c354cbcf640e689526b2e415a13
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsoluteError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsoluteError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..7fe362da89b47a925cd4708909e1c882a9a23aca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-absolute-percentage-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanAbsolutePercentageError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanAbsolutePercentageError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..a5718533500d9508c558d25d13fc6b61518a73a0
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..200006db355ca4dc8eb2f509bcb9da7543145548
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-mean-squared-logarithmic-error.pbtxt
@@ -0,0 +1,22 @@
+path: "tensorflow.keras.losses.MeanSquaredLogarithmicError"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.losses.MeanSquaredLogarithmicError\'>"
+  is_instance: "<class \'tensorflow.python.keras.losses.Loss\'>"
+  is_instance: "<type \'object\'>"
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'reduction\', \'name\'], varargs=None, keywords=None, defaults=[\'sum_over_batch_size\', \'None\'], "
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..f20ed26e2ea2819554159a9bcecb4141601e4a19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.-reduction.pbtxt
@@ -0,0 +1,28 @@
+path: "tensorflow.keras.losses.Reduction"
+tf_class {
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "NONE"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM"
+    mtype: "<type \'str\'>"
+  }
+  member {
+    name: "SUM_OVER_BATCH_SIZE"
+    mtype: "<type \'str\'>"
+  }
+  member_method {
+    name: "__init__"
+  }
+  member_method {
+    name: "all"
+    argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "validate"
+    argspec: "args=[\'cls\', \'key\'], varargs=None, keywords=None, defaults=None"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
index eca6b915388ebff0103f7ad16f43c6be0df60b7d..c198096d252cd9a3706bcbf6f1e4a1199ec7a1f7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.losses.pbtxt
@@ -1,5 +1,33 @@
 path: "tensorflow.keras.losses"
 tf_module {
+  member {
+    name: "BinaryCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalCrossentropy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsoluteError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanAbsolutePercentageError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "MeanSquaredLogarithmicError"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Reduction"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -22,11 +50,11 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_hinge"
@@ -106,7 +134,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "squared_hinge"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..2db07df5235e150f691a12d6b332c6d0d241ac19
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.Accuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Accuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..904ad3a21a05895b23e30dab82a89a31c74dcfca
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-binary-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.BinaryAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.BinaryAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\', \'threshold\'], varargs=None, keywords=None, defaults=[\'binary_accuracy\', \'None\', \'0.5\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..17b74924fab4f596a010d6b9731b474433a8153e
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.CategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.CategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..49f577e1367aece126449923f77f4f6c89493e99
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalseNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalseNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..e8baf858669a446a11b44e044f36bfde61e440bb
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-false-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.FalsePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.FalsePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..40fe64bbd2cec45b9a8c4e9b041d3fa858af1327
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-mean.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Mean"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'mean\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'values\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..ae6a85026da80cd071984aede8d0ec4e9cd571c5
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-precision.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Precision"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Precision\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..31068a51d510a7b95f62f61f03d37176c0fca55d
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-recall.pbtxt
@@ -0,0 +1,192 @@
+path: "tensorflow.keras.metrics.Recall"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.Recall\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..aa77d1972cea42184fbbdb91e117b08ba38328fd
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sensitivity-at-specificity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SensitivityAtSpecificity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivityAtSpecificity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'specificity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..0c17452292a031d42f3da0d5844e99d1272dad25
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-sparse-categorical-accuracy.pbtxt
@@ -0,0 +1,194 @@
+path: "tensorflow.keras.metrics.SparseCategoricalAccuracy"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SparseCategoricalAccuracy\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.MeanMetricWrapper\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Mean\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'sparse_categorical_accuracy\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..67857aa89f1769c736d810cf5f73739021afeddf
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-specificity-at-sensitivity.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.SpecificityAtSensitivity"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.SpecificityAtSensitivity\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.SensitivitySpecificityBase\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'sensitivity\', \'num_thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'200\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..1b5eb8d0de53960c3a98409119709c1307aa6379
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-negatives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TrueNegatives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TrueNegatives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..5b9c470e32d7e038f9ba11e4f96ab6eaa6b60a87
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.-true-positives.pbtxt
@@ -0,0 +1,193 @@
+path: "tensorflow.keras.metrics.TruePositives"
+tf_class {
+  is_instance: "<class \'tensorflow.python.keras.metrics.TruePositives\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics._ConfusionMatrixConditionCount\'>"
+  is_instance: "<class \'tensorflow.python.keras.metrics.Metric\'>"
+  is_instance: "<class \'tensorflow.python.keras.engine.base_layer.Layer\'>"
+  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
+  is_instance: "<type \'object\'>"
+  member {
+    name: "activity_regularizer"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "dtype"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "inbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "input_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "losses"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "name"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "non_trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "outbound_nodes"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_mask"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "output_shape"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "trainable_weights"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "updates"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "variables"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "weights"
+    mtype: "<type \'property\'>"
+  }
+  member_method {
+    name: "__init__"
+    argspec: "args=[\'self\', \'thresholds\', \'name\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_loss"
+    argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "add_update"
+    argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "add_variable"
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "add_weight"
+    argspec: "args=[\'self\', \'name\', \'shape\', \'aggregation\', \'synchronization\', \'initializer\'], varargs=None, keywords=None, defaults=[\'()\', \'VariableAggregation.SUM\', \'VariableSynchronization.ON_READ\', \'None\'], "
+  }
+  member_method {
+    name: "apply"
+    argspec: "args=[\'self\', \'inputs\'], varargs=args, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "build"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "call"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=kwargs, defaults=None"
+  }
+  member_method {
+    name: "compute_mask"
+    argspec: "args=[\'self\', \'inputs\', \'mask\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "compute_output_shape"
+    argspec: "args=[\'self\', \'input_shape\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "count_params"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "from_config"
+    argspec: "args=[\'cls\', \'config\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_config"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_input_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_losses_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_mask_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_output_shape_at"
+    argspec: "args=[\'self\', \'node_index\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_updates_for"
+    argspec: "args=[\'self\', \'inputs\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "get_weights"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "reset_states"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "result"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "set_weights"
+    argspec: "args=[\'self\', \'weights\'], varargs=None, keywords=None, defaults=None"
+  }
+  member_method {
+    name: "update_state"
+    argspec: "args=[\'self\', \'y_true\', \'y_pred\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
index a296e131586504a3fadc9e6fe54079ee0f8270ba..905021dd790205e64a6f9839218200db98941927 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.metrics.pbtxt
@@ -1,5 +1,57 @@
 path: "tensorflow.keras.metrics"
 tf_module {
+  member {
+    name: "Accuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "BinaryAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "CategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalseNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "FalsePositives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Mean"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Precision"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "Recall"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SensitivityAtSpecificity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SparseCategoricalAccuracy"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "SpecificityAtSensitivity"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TrueNegatives"
+    mtype: "<type \'type\'>"
+  }
+  member {
+    name: "TruePositives"
+    mtype: "<type \'type\'>"
+  }
   member_method {
     name: "KLD"
     argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
@@ -26,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "binary_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "categorical_accuracy"
@@ -34,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "cosine"
@@ -110,7 +162,7 @@ tf_module {
   }
   member_method {
     name: "sparse_categorical_crossentropy"
-    argspec: "args=[\'y_true\', \'y_pred\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'y_true\', \'y_pred\', \'from_logits\'], varargs=None, keywords=None, defaults=[\'False\'], "
   }
   member_method {
     name: "sparse_top_k_categorical_accuracy"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
index ccff809f2b25fd25180400d82510ec3ee3683615..c58c7bef22dd4bff95d8ff07a10e20bb1bc463ad 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-model.pbtxt
@@ -41,6 +41,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -69,6 +77,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -105,13 +117,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -225,6 +241,10 @@ tf_class {
     name: "predict_on_batch"
     argspec: "args=[\'self\', \'x\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -247,7 +267,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -259,6 +279,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
index b0fc7f97f1d9a18f6482abd548d2f60f60b82555..473a1c16fb1edfbf37a7752e273566c1310853af 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.models.-sequential.pbtxt
@@ -42,6 +42,14 @@ tf_class {
     name: "losses"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "metrics"
+    mtype: "<type \'property\'>"
+  }
+  member {
+    name: "metrics_names"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "name"
     mtype: "<type \'property\'>"
@@ -70,6 +78,10 @@ tf_class {
     name: "output_shape"
     mtype: "<type \'property\'>"
   }
+  member {
+    name: "run_eagerly"
+    mtype: "<type \'property\'>"
+  }
   member {
     name: "state_updates"
     mtype: "<type \'property\'>"
@@ -110,13 +122,17 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "add_variable"
-    argspec: "args=[\'self\', \'name\', \'shape\', \'dtype\', \'initializer\', \'regularizer\', \'trainable\', \'constraint\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
   }
   member_method {
     name: "add_weight"
@@ -242,6 +258,10 @@ tf_class {
     name: "predict_proba"
     argspec: "args=[\'self\', \'x\', \'batch_size\', \'verbose\'], varargs=None, keywords=None, defaults=[\'32\', \'0\'], "
   }
+  member_method {
+    name: "reset_metrics"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "reset_states"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
@@ -264,7 +284,7 @@ tf_class {
   }
   member_method {
     name: "test_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
   }
   member_method {
     name: "to_json"
@@ -276,6 +296,6 @@ tf_class {
   }
   member_method {
     name: "train_on_batch"
-    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'self\', \'x\', \'y\', \'sample_weight\', \'class_weight\', \'reset_metrics\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
index be4496e753f8bdcd76a4761f9bd1804a77380359..8177cc71ed34ed5d0ae57d25ee2da70067411ccc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.keras.utils.-progbar.pbtxt
@@ -4,7 +4,7 @@ tf_class {
   is_instance: "<type \'object\'>"
   member_method {
     name: "__init__"
-    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\'], "
+    argspec: "args=[\'self\', \'target\', \'width\', \'verbose\', \'interval\', \'stateful_metrics\', \'unit_name\'], varargs=None, keywords=None, defaults=[\'30\', \'1\', \'0.05\', \'None\', \'step\'], "
   }
   member_method {
     name: "add"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
index 973705dae2fabbef0eafb38ad12e96c747aeee27..773c74e64d13ca4a840b7f599fc2cbe9c161cd03 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-block-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
index de917706d55214cc59f3205f0778d600a356a5b1..533544d21f2753f785113a30518f4fcbcff96cd7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
index c4e6a21c3ac9324f5dd445dc65415c2abb4c6e9f..e3926eb6d4714731d09ff9c5b75a89830c06e7c1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant2-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
index 2e085a8e289e21173789041efb9254e992bd723b..ba209df7824a9cc076499458e35acd7dcf1eaf35 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-circulant3-d.pbtxt
@@ -96,6 +96,10 @@ tf_class {
     name: "block_shape_tensor"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "convolution_kernel"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'convolution_kernel\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
index 42d22bce42d8850a784afae3f67771ef1cfe5403..081fb0e08bcd1b35ab44459d1c8eb0857dd14956 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-composition.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
index d6749fdcec69425e83a044409ec695d2661f782e..2014a04301618c20af5cf6f1144eb4dbda2479e1 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-diag.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
index d9f363d1336210623536e8293a6290d9ebfc2fe1..9a87ae9687741090485bd8d4d0d07d359a2015e7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-full-matrix.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
index aac7ee31ed62c22b2e86d287d48c68c7e905fd00..33afb835ce1d524991c0024bfb87c29a72aac08e 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-identity.pbtxt
@@ -76,6 +76,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
index c11d39082939eda4520b3955b767022bd485b5be..a9078c8ab5cca078237a29febabdbbd4a8b6c89c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-kronecker.pbtxt
@@ -79,6 +79,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
index 3ee800269e617390c25248a2c847cbe259b18e79..4cfa3bb30d7382f3cf3cc0d5ce412d230d2a4287 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-low-rank-update.pbtxt
@@ -99,6 +99,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
index 63a1bc2321e35645700778c5906d1b8659eb4a32..a87649133fd207ad59f2124c6b0b5aa44916e5a5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-lower-triangular.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
index e2c5a505a7d2f9abbee5b3bb4f92ee8843198c51..32656467840fbbc0c8708ea68aac5aa75c11a540 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-scaled-identity.pbtxt
@@ -80,6 +80,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
index a1b0e06b4753488bc9fcbe9aeb0d260092745f9c..49d8890c8942bc0021886ee6c9bc4e7625452655 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator-zeros.pbtxt
@@ -75,6 +75,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
index 6d849dc040f61b498b100820bf7be3d4bc264bb4..c89dc067b331603e227d9d578147e2dd1ee4a900 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.-linear-operator.pbtxt
@@ -74,6 +74,10 @@ tf_class {
     name: "batch_shape_tensor"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'batch_shape_tensor\'], "
   }
+  member_method {
+    name: "cholesky"
+    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'cholesky\'], "
+  }
   member_method {
     name: "determinant"
     argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=[\'det\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
index 1a4098d121b71d25fc0aaa9c7e6e4f096b01e033..3e1e2e3d54de3e2442299a783f933a60dfd2db6d 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.linalg.pbtxt
@@ -118,7 +118,7 @@ tf_module {
   }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "logdet"
@@ -132,6 +132,10 @@ tf_module {
     name: "lstsq"
     argspec: "args=[\'matrix\', \'rhs\', \'l2_regularizer\', \'fast\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'True\', \'None\'], "
   }
+  member_method {
+    name: "lu"
+    argspec: "args=[\'input\', \'output_idx_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
+  }
   member_method {
     name: "matmul"
     argspec: "args=[\'a\', \'b\', \'transpose_a\', \'transpose_b\', \'adjoint_a\', \'adjoint_b\', \'a_is_sparse\', \'b_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'False\', \'False\', \'False\', \'None\'], "
@@ -142,7 +146,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "qr"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
new file mode 100644
index 0000000000000000000000000000000000000000..68c651a3c9969f2f16fca39f4466cebbb44eea28
--- /dev/null
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.-ops-set.pbtxt
@@ -0,0 +1,12 @@
+path: "tensorflow.lite.OpsSet"
+tf_class {
+  is_instance: "<enum \'OpsSet\'>"
+  member {
+    name: "SELECT_TF_OPS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+  member {
+    name: "TFLITE_BUILTINS"
+    mtype: "<enum \'OpsSet\'>"
+  }
+}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt
index 08845553e55d3bd2b280331676195246632e56e2..4d5c4893b410120bf9d66e8d5d99ba0df5eaf164 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.constants.pbtxt
@@ -1,29 +1,9 @@
 path: "tensorflow.lite.constants"
 tf_module {
-  member {
-    name: "FLOAT"
-    mtype: "<type \'int\'>"
-  }
   member {
     name: "GRAPHVIZ_DOT"
     mtype: "<type \'int\'>"
   }
-  member {
-    name: "INT32"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "INT64"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "QUANTIZED_UINT8"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "STRING"
-    mtype: "<type \'int\'>"
-  }
   member {
     name: "TFLITE"
     mtype: "<type \'int\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
index f5013c250be8477bb630d3d57ae88a501bb60b9b..154dd00821794ef4a5118e98d67e32beca38bebf 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.lite.pbtxt
@@ -8,6 +8,10 @@ tf_module {
     name: "OpHint"
     mtype: "<type \'type\'>"
   }
+  member {
+    name: "OpsSet"
+    mtype: "<class \'enum.EnumMeta\'>"
+  }
   member {
     name: "TFLiteConverter"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
deleted file mode 100644
index 85bb15455da624962744a0cc856e79e0a6d57d7c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.logging.pbtxt
+++ /dev/null
@@ -1,83 +0,0 @@
-path: "tensorflow.logging"
-tf_module {
-  member {
-    name: "DEBUG"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "ERROR"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "FATAL"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "INFO"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "WARN"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "TaskLevelStatusMessage"
-    argspec: "args=[\'msg\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "debug"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "error"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "fatal"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "flush"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_verbosity"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "info"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "log_every_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_first_n"
-    argspec: "args=[\'level\', \'msg\', \'n\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "log_if"
-    argspec: "args=[\'level\', \'msg\', \'condition\'], varargs=args, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "set_verbosity"
-    argspec: "args=[\'v\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "vlog"
-    argspec: "args=[\'level\', \'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warn"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-  member_method {
-    name: "warning"
-    argspec: "args=[\'msg\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
index 258ad5047eb6e82eeb9c0941b0acf0573e5ca61d..6a44e4ce66c9dfcb9912c96d0106e4f4fd9fdcff 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.-reduction.pbtxt
@@ -1,11 +1,7 @@
 path: "tensorflow.losses.Reduction"
 tf_class {
-  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.Reduction\'>"
+  is_instance: "<class \'tensorflow.python.ops.losses.losses_impl.ReductionV2\'>"
   is_instance: "<type \'object\'>"
-  member {
-    name: "MEAN"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "NONE"
     mtype: "<type \'str\'>"
@@ -14,18 +10,10 @@ tf_class {
     name: "SUM"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_BY_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "SUM_OVER_BATCH_SIZE"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "SUM_OVER_NONZERO_WEIGHTS"
-    mtype: "<type \'str\'>"
-  }
   member_method {
     name: "__init__"
   }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
index c1d190ae116e94ec8f837237e54b6fcff7358254..233b1a0131a4d292574be161de2d547cb0060c23 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.losses.pbtxt
@@ -4,22 +4,10 @@ tf_module {
     name: "Reduction"
     mtype: "<type \'type\'>"
   }
-  member_method {
-    name: "absolute_difference"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
   member_method {
     name: "add_loss"
     argspec: "args=[\'loss\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'losses\'], "
   }
-  member_method {
-    name: "compute_weighted_loss"
-    argspec: "args=[\'losses\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "cosine_distance"
-    argspec: "args=[\'labels\', \'predictions\', \'axis\', \'weights\', \'scope\', \'loss_collection\', \'reduction\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\', \'None\'], "
-  }
   member_method {
     name: "get_losses"
     argspec: "args=[\'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'None\', \'losses\'], "
@@ -36,36 +24,4 @@ tf_module {
     name: "get_total_loss"
     argspec: "args=[\'add_regularization_losses\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'total_loss\'], "
   }
-  member_method {
-    name: "hinge_loss"
-    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "huber_loss"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'delta\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "log_loss"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'epsilon\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'1e-07\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "mean_pairwise_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\'], "
-  }
-  member_method {
-    name: "mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "sigmoid_cross_entropy"
-    argspec: "args=[\'multi_class_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "softmax_cross_entropy"
-    argspec: "args=[\'onehot_labels\', \'logits\', \'weights\', \'label_smoothing\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
-  member_method {
-    name: "sparse_softmax_cross_entropy"
-    argspec: "args=[\'labels\', \'logits\', \'weights\', \'scope\', \'loss_collection\', \'reduction\'], varargs=None, keywords=None, defaults=[\'1.0\', \'None\', \'losses\', \'weighted_sum_by_nonzero_weights\'], "
-  }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
index a441e42b0a9ff85b990b93d57c516d2c8a10bb18..4ac0484050054abee9496bcf09d90ff58bbfb9d7 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.math.pbtxt
@@ -78,7 +78,7 @@ tf_module {
   }
   member_method {
     name: "bincount"
-    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'arr\', \'weights\', \'minlength\', \'maxlength\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "ceil"
@@ -86,7 +86,7 @@ tf_module {
   }
   member_method {
     name: "confusion_matrix"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'dtype\', \'name\', \'weights\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\", \'None\', \'None\'], "
+    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "conj"
@@ -102,7 +102,7 @@ tf_module {
   }
   member_method {
     name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'keepdims\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "cumprod"
@@ -170,15 +170,35 @@ tf_module {
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "invert_permutation"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "is_finite"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_inf"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_nan"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_non_decreasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "is_strictly_increasing"
+    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "lbeta"
@@ -210,7 +230,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "logical_and"
@@ -270,43 +290,43 @@ tf_module {
   }
   member_method {
     name: "reduce_all"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_any"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_mean"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_min"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_prod"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_std"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_variance"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "rint"
@@ -322,7 +342,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "segment_max"
@@ -362,7 +382,7 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softplus"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
deleted file mode 100644
index e9b996c9f53e9062dcdd39ef22f99eef5175eb35..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.metrics.pbtxt
+++ /dev/null
@@ -1,135 +0,0 @@
-path: "tensorflow.metrics"
-tf_module {
-  member_method {
-    name: "accuracy"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "auc"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'curve\', \'name\', \'summation_method\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'ROC\', \'None\', \'trapezoidal\'], "
-  }
-  member_method {
-    name: "average_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_negatives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_negatives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_positives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "false_positives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean"
-    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_absolute_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_cosine_distance"
-    argspec: "args=[\'labels\', \'predictions\', \'dim\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_iou"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_per_class_accuracy"
-    argspec: "args=[\'labels\', \'predictions\', \'num_classes\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_relative_error"
-    argspec: "args=[\'labels\', \'predictions\', \'normalizer\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "mean_tensor"
-    argspec: "args=[\'values\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "percentage_below"
-    argspec: "args=[\'values\', \'threshold\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "precision_at_top_k"
-    argspec: "args=[\'labels\', \'predictions_idx\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recall_at_top_k"
-    argspec: "args=[\'labels\', \'predictions_idx\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "root_mean_squared_error"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sensitivity_at_specificity"
-    argspec: "args=[\'labels\', \'predictions\', \'specificity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_average_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_precision_at_k"
-    argspec: "args=[\'labels\', \'predictions\', \'k\', \'class_id\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "specificity_at_sensitivity"
-    argspec: "args=[\'labels\', \'predictions\', \'sensitivity\', \'weights\', \'num_thresholds\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_negatives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_negatives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_positives"
-    argspec: "args=[\'labels\', \'predictions\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "true_positives_at_thresholds"
-    argspec: "args=[\'labels\', \'predictions\', \'thresholds\', \'weights\', \'metrics_collections\', \'updates_collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
index 2dc5c48aa6e260200eb80474af3e2a81b6ea3eb2..c75c75f2ef7ca50cce15fe1dffb4d0de3f6815de 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.pbtxt
@@ -30,7 +30,7 @@ tf_module {
   }
   member_method {
     name: "batch_norm_with_global_normalization"
-    argspec: "args=[\'t\', \'m\', \'v\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'mean\', \'variance\', \'beta\', \'gamma\', \'variance_epsilon\', \'scale_after_normalization\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "batch_normalization"
@@ -41,8 +41,8 @@ tf_module {
     argspec: "args=[\'value\', \'bias\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "bidirectional_dynamic_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'sequence_length\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'parallel_iterations\', \'swap_memory\', \'time_major\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'False\', \'False\', \'None\'], "
+    name: "collapse_repeated"
+    argspec: "args=[\'labels\', \'seq_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "compute_accidental_hits"
@@ -50,43 +50,43 @@ tf_module {
   }
   member_method {
     name: "conv1d"
-    argspec: "args=[\'value\', \'filters\', \'stride\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'stride\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_filter"
-    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_backprop_input"
-    argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'use_cudnn_on_gpu\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input_sizes\', \'filters\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\', \'None\'], "
   }
   member_method {
     name: "conv2d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NHWC\', \'None\'], "
   }
   member_method {
     name: "conv3d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'None\', \'None\'], "
   }
   member_method {
-    name: "conv3d_backprop_filter_v2"
+    name: "conv3d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NDHWC\', \'[1, 1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "conv3d_transpose"
-    argspec: "args=[\'value\', \'filter\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'output_shape\', \'strides\', \'padding\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'SAME\', \'NDHWC\', \'None\'], "
   }
   member_method {
     name: "convolution"
-    argspec: "args=[\'input\', \'filter\', \'padding\', \'strides\', \'dilation_rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "crelu"
-    argspec: "args=[\'features\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'-1\'], "
+    argspec: "args=[\'features\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "ctc_beam_search_decoder"
@@ -98,35 +98,35 @@ tf_module {
   }
   member_method {
     name: "ctc_loss"
-    argspec: "args=[\'labels\', \'inputs\', \'sequence_length\', \'preprocess_collapse_repeated\', \'ctc_merge_repeated\', \'ignore_longer_outputs_than_inputs\', \'time_major\'], varargs=None, keywords=None, defaults=[\'False\', \'True\', \'False\', \'True\'], "
+    argspec: "args=[\'labels\', \'logits\', \'label_length\', \'logit_length\', \'logits_time_major\', \'unique\', \'blank_index\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "depth_to_space"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    name: "ctc_unique_labels"
+    argspec: "args=[\'labels\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    name: "depth_to_space"
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
+    name: "depthwise_conv2d"
+    argspec: "args=[\'input\', \'filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_filter"
+    name: "depthwise_conv2d_backprop_filter"
     argspec: "args=[\'input\', \'filter_sizes\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
-    name: "depthwise_conv2d_native_backprop_input"
+    name: "depthwise_conv2d_backprop_input"
     argspec: "args=[\'input_sizes\', \'filter\', \'out_backprop\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'[1, 1, 1, 1]\', \'None\'], "
   }
   member_method {
     name: "dilation2d"
-    argspec: "args=[\'input\', \'filter\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "dropout"
-    argspec: "args=[\'x\', \'keep_prob\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rate\', \'noise_shape\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "elu"
@@ -134,15 +134,15 @@ tf_module {
   }
   member_method {
     name: "embedding_lookup"
-    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'name\', \'validate_indices\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'True\', \'None\'], "
+    argspec: "args=[\'params\', \'ids\', \'partition_strategy\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\'], "
   }
   member_method {
     name: "embedding_lookup_sparse"
-    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'name\', \'combiner\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'params\', \'sp_ids\', \'sp_weights\', \'partition_strategy\', \'combiner\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'mod\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "erosion2d"
-    argspec: "args=[\'value\', \'kernel\', \'strides\', \'rates\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'value\', \'filters\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "fixed_unigram_candidate_sampler"
@@ -150,19 +150,15 @@ tf_module {
   }
   member_method {
     name: "fractional_avg_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "fractional_max_pool"
-    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'deterministic\', \'seed\', \'seed2\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'False\', \'0\', \'0\', \'None\'], "
-  }
-  member_method {
-    name: "fused_batch_norm"
-    argspec: "args=[\'x\', \'scale\', \'offset\', \'mean\', \'variance\', \'epsilon\', \'data_format\', \'is_training\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0.001\', \'NHWC\', \'True\', \'None\'], "
+    argspec: "args=[\'value\', \'pooling_ratio\', \'pseudo_random\', \'overlapping\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'0\', \'None\'], "
   }
   member_method {
     name: "in_top_k"
-    argspec: "args=[\'predictions\', \'targets\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'targets\', \'predictions\', \'k\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "l2_loss"
@@ -170,7 +166,7 @@ tf_module {
   }
   member_method {
     name: "l2_normalize"
-    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'axis\', \'epsilon\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1e-12\', \'None\'], "
   }
   member_method {
     name: "leaky_relu"
@@ -190,7 +186,7 @@ tf_module {
   }
   member_method {
     name: "log_softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "lrn"
@@ -206,15 +202,15 @@ tf_module {
   }
   member_method {
     name: "max_pool_with_argmax"
-    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'Targmax\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'ksize\', \'strides\', \'padding\', \'data_format\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "moments"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "nce_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'mod\', \'nce_loss\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'False\', \'nce_loss\'], "
   }
   member_method {
     name: "normalize_moments"
@@ -222,23 +218,7 @@ tf_module {
   }
   member_method {
     name: "pool"
-    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'padding\', \'dilation_rate\', \'strides\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_avg_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_conv2d"
-    argspec: "args=[\'input\', \'filter\', \'min_input\', \'max_input\', \'min_filter\', \'max_filter\', \'strides\', \'padding\', \'out_type\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'qint32\'>\", \'[1, 1, 1, 1]\', \'None\'], "
-  }
-  member_method {
-    name: "quantized_max_pool"
-    argspec: "args=[\'input\', \'min_input\', \'max_input\', \'ksize\', \'strides\', \'padding\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "quantized_relu_x"
-    argspec: "args=[\'features\', \'max_value\', \'min_features\', \'max_features\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'quint8\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'window_shape\', \'pooling_type\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'VALID\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "relu"
@@ -248,17 +228,13 @@ tf_module {
     name: "relu6"
     argspec: "args=[\'features\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "relu_layer"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "safe_embedding_lookup_sparse"
-    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'name\', \'partition_strategy\', \'max_norm\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'div\', \'None\'], "
+    argspec: "args=[\'embedding_weights\', \'sparse_ids\', \'sparse_weights\', \'combiner\', \'default_id\', \'max_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'mean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sampled_softmax_loss"
-    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'partition_strategy\', \'name\', \'seed\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'mod\', \'sampled_softmax_loss\', \'None\'], "
+    argspec: "args=[\'weights\', \'biases\', \'labels\', \'inputs\', \'num_sampled\', \'num_classes\', \'num_true\', \'sampled_values\', \'remove_accidental_hits\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'1\', \'None\', \'True\', \'None\', \'sampled_softmax_loss\'], "
   }
   member_method {
     name: "selu"
@@ -266,7 +242,7 @@ tf_module {
   }
   member_method {
     name: "separable_conv2d"
-    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'rate\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'depthwise_filter\', \'pointwise_filter\', \'strides\', \'padding\', \'data_format\', \'dilations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "sigmoid"
@@ -278,11 +254,11 @@ tf_module {
   }
   member_method {
     name: "softmax"
-    argspec: "args=[\'logits\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "softmax_cross_entropy_with_logits"
-    argspec: "args=[\'labels\', \'logits\', \'dim\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
+    argspec: "args=[\'labels\', \'logits\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'None\'], "
   }
   member_method {
     name: "softplus"
@@ -294,27 +270,23 @@ tf_module {
   }
   member_method {
     name: "space_to_batch"
-    argspec: "args=[\'input\', \'paddings\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "space_to_depth"
-    argspec: "args=[\'input\', \'block_size\', \'name\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'NHWC\'], "
+    argspec: "args=[\'input\', \'block_size\', \'data_format\', \'name\'], varargs=None, keywords=None, defaults=[\'NHWC\', \'None\'], "
   }
   member_method {
     name: "sparse_softmax_cross_entropy_with_logits"
     argspec: "args=[\'_sentinel\', \'labels\', \'logits\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "static_bidirectional_rnn"
-    argspec: "args=[\'cell_fw\', \'cell_bw\', \'inputs\', \'initial_state_fw\', \'initial_state_bw\', \'dtype\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "static_state_saving_rnn"
     argspec: "args=[\'cell\', \'inputs\', \'state_saver\', \'state_name\', \'sequence_length\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "sufficient_statistics"
-    argspec: "args=[\'x\', \'axes\', \'shift\', \'keep_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'x\', \'axes\', \'shift\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "tanh"
@@ -330,16 +302,12 @@ tf_module {
   }
   member_method {
     name: "weighted_moments"
-    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
+    argspec: "args=[\'x\', \'axes\', \'frequency_weights\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "with_space_to_batch"
     argspec: "args=[\'input\', \'dilation_rate\', \'padding\', \'op\', \'filter_shape\', \'spatial_dims\', \'data_format\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
-  member_method {
-    name: "xw_plus_b"
-    argspec: "args=[\'x\', \'weights\', \'biases\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "zero_fraction"
     argspec: "args=[\'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
index 381c4975d7d778599ce34a9023d0e46b20753cba..9e52a4252619ffc19b287fc1818fa6f772847335 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-device-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
index 912365a28b1277962f648b2b0655d280bca1427c..9836433d08cba809107f9bb5dbccf2e971865b8a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-dropout-wrapper.pbtxt
@@ -110,6 +110,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
index faeb4f3513362919fca8f0c2ef7c491d7938cb92..d3b68e4f2976912ed65ba7916284c951fda03b05 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-r-n-n-cell.pbtxt
@@ -105,6 +105,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
index caa2e600800178e4b2d36ae263da23d0b4608dd2..1f7840ab919baeeb0077904592ba8dcc1d4c91fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.-residual-wrapper.pbtxt
@@ -106,6 +106,10 @@ tf_class {
     name: "add_loss"
     argspec: "args=[\'self\', \'losses\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "add_metric"
+    argspec: "args=[\'self\', \'value\', \'aggregation\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "add_update"
     argspec: "args=[\'self\', \'updates\', \'inputs\'], varargs=None, keywords=None, defaults=[\'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
index 3c78b07b3943879995aa0d822c20e0b393137f69..b1f687f52964e20a6dfa6f81f68e61d2a67513c9 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.nn.rnn_cell.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "LSTMStateTuple"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "MultiRNNCell"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "RNNCell"
     mtype: "<type \'type\'>"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
index 078b471a4c62e83940cf5a64891698b6cb5f9b02..4432cae53b64b66e5a5c906f87af94f61bcf36bd 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.pbtxt
@@ -4,34 +4,10 @@ tf_module {
     name: "AggregationMethod"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "AttrValue"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "ConditionalAccumulator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ConditionalAccumulatorBase"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ConfigProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "DType"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "DeviceSpec"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Dimension"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Event"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -40,10 +16,6 @@ tf_module {
     name: "FIFOQueue"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GPUOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "GradientTape"
     mtype: "<type \'type\'>"
@@ -52,74 +24,22 @@ tf_module {
     name: "Graph"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "GraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphKeys"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GraphOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "HistogramProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "IndexedSlices"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "LogMessage"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MetaGraphDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NameAttrList"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "NodeDef"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Operation"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "OptimizerOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "RegisterGradient"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RunMetadata"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "RunOptions"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "SparseTensor"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SparseTensorValue"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -137,11 +57,11 @@ tf_module {
     mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorInfo"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
+    name: "TensorShape"
+    mtype: "<type \'type\'>"
   }
   member {
-    name: "TensorShape"
+    name: "TensorSpec"
     mtype: "<type \'type\'>"
   }
   member {
@@ -160,10 +80,6 @@ tf_module {
     name: "VariableSynchronization"
     mtype: "<class \'enum.EnumMeta\'>"
   }
-  member {
-    name: "app"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "bfloat16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -200,6 +116,10 @@ tf_module {
     name: "debugging"
     mtype: "<type \'module\'>"
   }
+  member {
+    name: "distribute"
+    mtype: "<type \'module\'>"
+  }
   member {
     name: "double"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -224,10 +144,6 @@ tf_module {
     name: "feature_column"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "flags"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "float16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -240,10 +156,6 @@ tf_module {
     name: "float64"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
   }
-  member {
-    name: "gfile"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "glorot_uniform_initializer"
     mtype: "<type \'type\'>"
@@ -296,10 +208,6 @@ tf_module {
     name: "lite"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "logging"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "losses"
     mtype: "<type \'module\'>"
@@ -308,10 +216,6 @@ tf_module {
     name: "math"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "metrics"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "name_scope"
     mtype: "<type \'type\'>"
@@ -328,14 +232,6 @@ tf_module {
     name: "ones_initializer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "profiler"
-    mtype: "<type \'module\'>"
-  }
-  member {
-    name: "pywrap_tensorflow"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "qint16"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -392,10 +288,6 @@ tf_module {
     name: "sparse"
     mtype: "<type \'module\'>"
   }
-  member {
-    name: "spectral"
-    mtype: "<type \'module\'>"
-  }
   member {
     name: "string"
     mtype: "<class \'tensorflow.python.framework.dtypes.DType\'>"
@@ -476,14 +368,6 @@ tf_module {
     name: "add_n"
     argspec: "args=[\'inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "arg_max"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
-  member_method {
-    name: "arg_min"
-    argspec: "args=[\'input\', \'dimension\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
-  }
   member_method {
     name: "argmax"
     argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
@@ -492,6 +376,10 @@ tf_module {
     name: "argmin"
     argspec: "args=[\'input\', \'axis\', \'output_type\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int64\'>\", \'None\'], "
   }
+  member_method {
+    name: "argsort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'stable\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'False\', \'None\'], "
+  }
   member_method {
     name: "as_dtype"
     argspec: "args=[\'type_value\'], varargs=None, keywords=None, defaults=None"
@@ -510,19 +398,19 @@ tf_module {
   }
   member_method {
     name: "assert_equal"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_greater"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_less"
-    argspec: "args=[\'x\', \'y\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'y\', \'message\', \'summarize\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "assert_rank"
-    argspec: "args=[\'x\', \'rank\', \'data\', \'summarize\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'x\', \'rank\', \'message\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "atan"
@@ -540,16 +428,8 @@ tf_module {
     name: "batch_gather"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "batch_scatter_update"
-    argspec: "args=[\'ref\', \'indices\', \'updates\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'None\'], "
-  }
   member_method {
     name: "batch_to_space"
-    argspec: "args=[\'input\', \'crops\', \'block_size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "batch_to_space_nd"
     argspec: "args=[\'input\', \'block_shape\', \'crops\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
@@ -580,10 +460,6 @@ tf_module {
     name: "cast"
     argspec: "args=[\'x\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "clip_by_average_norm"
-    argspec: "args=[\'t\', \'clip_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "clip_by_global_norm"
     argspec: "args=[\'t_list\', \'clip_norm\', \'use_norm\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
@@ -606,11 +482,11 @@ tf_module {
   }
   member_method {
     name: "cond"
-    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'strict\', \'name\', \'fn1\', \'fn2\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'pred\', \'true_fn\', \'false_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "constant"
-    argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\', \'verify_shape\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\', \'False\'], "
+    argspec: "args=[\'value\', \'dtype\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'Const\'], "
   }
   member_method {
     name: "control_dependencies"
@@ -628,14 +504,6 @@ tf_module {
     name: "cosh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "count_nonzero"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'dtype\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \"<dtype: \'int64\'>\", \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "create_partitioned_variables"
-    argspec: "args=[\'shape\', \'slicing\', \'initializer\', \'dtype\', \'trainable\', \'collections\', \'name\', \'reuse\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'True\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "cumsum"
     argspec: "args=[\'x\', \'axis\', \'exclusive\', \'reverse\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'False\', \'False\', \'None\'], "
@@ -648,10 +516,6 @@ tf_module {
     name: "device"
     argspec: "args=[\'device_name\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "div"
-    argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "div_no_nan"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -676,10 +540,6 @@ tf_module {
     name: "einsum"
     argspec: "args=[\'equation\'], varargs=inputs, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "enable_eager_execution"
-    argspec: "args=[\'config\', \'device_policy\', \'execution_mode\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "ensure_shape"
     argspec: "args=[\'x\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -698,7 +558,7 @@ tf_module {
   }
   member_method {
     name: "expand_dims"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "extract_volume_patches"
@@ -712,10 +572,6 @@ tf_module {
     name: "fill"
     argspec: "args=[\'dims\', \'value\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "fixed_size_partitioner"
-    argspec: "args=[\'num_shards\', \'axis\'], varargs=None, keywords=None, defaults=[\'0\'], "
-  }
   member_method {
     name: "floor"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -738,31 +594,23 @@ tf_module {
   }
   member_method {
     name: "function"
-    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
+    argspec: "args=[\'func\', \'input_signature\', \'autograph\', \'experimental_autograph_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\', \'None\'], "
   }
   member_method {
     name: "gather"
-    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'name\', \'axis\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
+    argspec: "args=[\'params\', \'indices\', \'validate_indices\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'None\'], "
   }
   member_method {
     name: "gather_nd"
     argspec: "args=[\'params\', \'indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "get_collection"
-    argspec: "args=[\'key\', \'scope\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "get_collection_ref"
-    argspec: "args=[\'key\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_default_graph"
+    name: "get_logger"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "gradients"
-    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
+    argspec: "args=[\'ys\', \'xs\', \'grad_ys\', \'name\', \'gate_gradients\', \'aggregation_method\', \'stop_gradients\', \'unconnected_gradients\'], varargs=None, keywords=None, defaults=[\'None\', \'gradients\', \'False\', \'None\', \'None\', \'UnconnectedGradients.NONE\'], "
   }
   member_method {
     name: "greater"
@@ -782,7 +630,7 @@ tf_module {
   }
   member_method {
     name: "hessians"
-    argspec: "args=[\'ys\', \'xs\', \'name\', \'colocate_gradients_with_ops\', \'gate_gradients\', \'aggregation_method\'], varargs=None, keywords=None, defaults=[\'hessians\', \'False\', \'False\', \'None\'], "
+    argspec: "args=[\'ys\', \'xs\', \'gate_gradients\', \'aggregation_method\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\', \'hessians\'], "
   }
   member_method {
     name: "histogram_fixed_width"
@@ -816,18 +664,10 @@ tf_module {
     name: "less_equal"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "lin_space"
-    argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "linspace"
     argspec: "args=[\'start\', \'stop\', \'num\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "load_file_system_library"
-    argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "load_library"
     argspec: "args=[\'library_location\'], varargs=None, keywords=None, defaults=None"
@@ -836,14 +676,6 @@ tf_module {
     name: "load_op_library"
     argspec: "args=[\'library_filename\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "log"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "log1p"
-    argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
   member_method {
     name: "logical_and"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -880,10 +712,6 @@ tf_module {
     name: "meshgrid"
     argspec: "args=[], varargs=args, keywords=kwargs, defaults=None"
   }
-  member_method {
-    name: "min_max_variable_partitioner"
-    argspec: "args=[\'max_partitions\', \'axis\', \'min_slice_size\', \'bytes_per_string_element\'], varargs=None, keywords=None, defaults=[\'1\', \'0\', \'262144\', \'16\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -892,10 +720,6 @@ tf_module {
     name: "mod"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "multiply"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -918,7 +742,7 @@ tf_module {
   }
   member_method {
     name: "norm"
-    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'tensor\', \'ord\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'euclidean\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "not_equal"
@@ -934,11 +758,11 @@ tf_module {
   }
   member_method {
     name: "ones_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "pad"
-    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'name\', \'constant_values\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'None\', \'0\'], "
+    argspec: "args=[\'tensor\', \'paddings\', \'mode\', \'constant_values\', \'name\'], varargs=None, keywords=None, defaults=[\'CONSTANT\', \'0\', \'None\'], "
   }
   member_method {
     name: "parallel_stack"
@@ -956,10 +780,6 @@ tf_module {
     name: "py_function"
     argspec: "args=[\'func\', \'inp\', \'Tout\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "quantize_v2"
-    argspec: "args=[\'input\', \'min_range\', \'max_range\', \'T\', \'mode\', \'name\', \'round_mode\'], varargs=None, keywords=None, defaults=[\'MIN_COMBINED\', \'None\', \'HALF_AWAY_FROM_ZERO\'], "
-  }
   member_method {
     name: "range"
     argspec: "args=[\'start\', \'limit\', \'delta\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'range\'], "
@@ -974,35 +794,35 @@ tf_module {
   }
   member_method {
     name: "reduce_all"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_any"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_logsumexp"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_mean"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_min"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_prod"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\', \'reduction_indices\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input_tensor\', \'axis\', \'keepdims\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "register_tensor_conversion_function"
@@ -1012,10 +832,6 @@ tf_module {
     name: "required_space_to_batch_paddings"
     argspec: "args=[\'input_shape\', \'block_shape\', \'base_paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "reset_default_graph"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "reshape"
     argspec: "args=[\'tensor\', \'shape\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1026,7 +842,7 @@ tf_module {
   }
   member_method {
     name: "reverse_sequence"
-    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\', \'seq_dim\', \'batch_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'seq_lengths\', \'seq_axis\', \'batch_axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "roll"
@@ -1042,7 +858,7 @@ tf_module {
   }
   member_method {
     name: "scalar_mul"
-    argspec: "args=[\'scalar\', \'x\'], varargs=None, keywords=None, defaults=None"
+    argspec: "args=[\'scalar\', \'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "scan"
@@ -1076,13 +892,9 @@ tf_module {
     name: "sequence_mask"
     argspec: "args=[\'lengths\', \'maxlen\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'bool\'>\", \'None\'], "
   }
-  member_method {
-    name: "set_random_seed"
-    argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "shape"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "shape_n"
@@ -1106,43 +918,23 @@ tf_module {
   }
   member_method {
     name: "size"
-    argspec: "args=[\'input\', \'name\', \'out_type\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'int32\'>\"], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int32\'>\", \'None\'], "
   }
   member_method {
     name: "slice"
     argspec: "args=[\'input_\', \'begin\', \'size\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "space_to_batch_nd"
-    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "sparse_concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    name: "sort"
+    argspec: "args=[\'values\', \'axis\', \'direction\', \'name\'], varargs=None, keywords=None, defaults=[\'-1\', \'ASCENDING\', \'None\'], "
   }
   member_method {
-    name: "sparse_reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "sparse_split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    name: "space_to_batch"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
-    name: "sparse_to_dense"
-    argspec: "args=[\'sparse_indices\', \'output_shape\', \'sparse_values\', \'default_value\', \'validate_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'0\', \'True\', \'None\'], "
+    name: "space_to_batch_nd"
+    argspec: "args=[\'input\', \'block_shape\', \'paddings\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "split"
@@ -1158,7 +950,7 @@ tf_module {
   }
   member_method {
     name: "squeeze"
-    argspec: "args=[\'input\', \'axis\', \'name\', \'squeeze_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+    argspec: "args=[\'input\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "stack"
@@ -1192,6 +984,18 @@ tf_module {
     name: "tanh"
     argspec: "args=[\'x\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "tensor_scatter_add"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_sub"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
+  member_method {
+    name: "tensor_scatter_update"
+    argspec: "args=[\'tensor\', \'indices\', \'updates\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+  }
   member_method {
     name: "tensordot"
     argspec: "args=[\'a\', \'b\', \'axes\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1206,16 +1010,12 @@ tf_module {
   }
   member_method {
     name: "transpose"
-    argspec: "args=[\'a\', \'perm\', \'name\', \'conjugate\'], varargs=None, keywords=None, defaults=[\'None\', \'transpose\', \'False\'], "
+    argspec: "args=[\'a\', \'perm\', \'conjugate\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'transpose\'], "
   }
   member_method {
     name: "truediv"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "truncated_normal"
-    argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
-  }
   member_method {
     name: "truncatediv"
     argspec: "args=[\'x\', \'y\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -1226,7 +1026,7 @@ tf_module {
   }
   member_method {
     name: "tuple"
-    argspec: "args=[\'tensors\', \'name\', \'control_inputs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'tensors\', \'control_inputs\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "unique"
@@ -1244,10 +1044,6 @@ tf_module {
     name: "unstack"
     argspec: "args=[\'value\', \'num\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'unstack\'], "
   }
-  member_method {
-    name: "variable_axis_size_partitioner"
-    argspec: "args=[\'max_shard_bytes\', \'axis\', \'bytes_per_string_element\', \'max_shards\'], varargs=None, keywords=None, defaults=[\'0\', \'16\', \'None\'], "
-  }
   member_method {
     name: "variable_creator_scope"
     argspec: "args=[\'variable_creator\'], varargs=None, keywords=None, defaults=None"
@@ -1258,7 +1054,7 @@ tf_module {
   }
   member_method {
     name: "while_loop"
-    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'name\', \'maximum_iterations\', \'return_same_structure\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\', \'False\'], "
+    argspec: "args=[\'cond\', \'body\', \'loop_vars\', \'shape_invariants\', \'parallel_iterations\', \'back_prop\', \'swap_memory\', \'maximum_iterations\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'10\', \'True\', \'False\', \'None\', \'None\'], "
   }
   member_method {
     name: "zeros"
@@ -1266,6 +1062,6 @@ tf_module {
   }
   member_method {
     name: "zeros_like"
-    argspec: "args=[\'tensor\', \'dtype\', \'name\', \'optimize\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
+    argspec: "args=[\'input\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
deleted file mode 100644
index e09c44cc9ce71305692740ba2d63b0940b2e0573..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checker.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.Checker"
-tf_proto {
-  descriptor {
-    name: "Checker"
-    field {
-      name: "reports"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
deleted file mode 100644
index 87462435496fd2eedeb0bc8d92e8a833671b6531..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.-checkers-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.AdviceProto.CheckersEntry"
-tf_proto {
-  descriptor {
-    name: "CheckersEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
deleted file mode 100644
index a8a8858ccd5af3fb3dac612eef44e5cb450df914..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-advice-proto.pbtxt
+++ /dev/null
@@ -1,41 +0,0 @@
-path: "tensorflow.profiler.AdviceProto"
-tf_proto {
-  descriptor {
-    name: "AdviceProto"
-    field {
-      name: "checkers"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.AdviceProto.CheckersEntry"
-    }
-    nested_type {
-      name: "CheckersEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.tfprof.AdviceProto.Checker"
-      }
-      options {
-        map_entry: true
-      }
-    }
-    nested_type {
-      name: "Checker"
-      field {
-        name: "reports"
-        number: 2
-        label: LABEL_REPEATED
-        type: TYPE_STRING
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
deleted file mode 100644
index afec73f537aadd5d1a274db8d57e37b8c6fa3e74..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.-input-shapes-entry.pbtxt
+++ /dev/null
@@ -1,22 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto.InputShapesEntry"
-tf_proto {
-  descriptor {
-    name: "InputShapesEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT32
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
deleted file mode 100644
index 3c83177005323a277f929d8c769cd7b1eeff4d51..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,191 +0,0 @@
-path: "tensorflow.profiler.GraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "GraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "tensor_value"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.TFProfTensorProto"
-    }
-    field {
-      name: "run_count"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 24
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 25
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 26
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "devices"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_STRING
-    }
-    field {
-      name: "total_definition_count"
-      number: 23
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_run_count"
-      number: 22
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 27
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 28
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 29
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "shapes"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.TensorShapeProto"
-    }
-    field {
-      name: "input_shapes"
-      number: 16
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto.InputShapesEntry"
-    }
-    field {
-      name: "children"
-      number: 12
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    nested_type {
-      name: "InputShapesEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT32
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_MESSAGE
-        type_name: ".tensorflow.TensorShapeProto"
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
deleted file mode 100644
index 2b08a05437f90b91160fc08e670b2466ae163149..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-multi-graph-node-proto.pbtxt
+++ /dev/null
@@ -1,134 +0,0 @@
-path: "tensorflow.profiler.MultiGraphNodeProto"
-tf_proto {
-  descriptor {
-    name: "MultiGraphNodeProto"
-    field {
-      name: "name"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "exec_micros"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "accelerator_exec_micros"
-      number: 12
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "cpu_exec_micros"
-      number: 13
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "requested_bytes"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "peak_bytes"
-      number: 16
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "residual_bytes"
-      number: 17
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "output_bytes"
-      number: 18
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "parameters"
-      number: 4
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "float_ops"
-      number: 5
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_exec_micros"
-      number: 6
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_accelerator_exec_micros"
-      number: 14
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_cpu_exec_micros"
-      number: 15
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_requested_bytes"
-      number: 7
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_peak_bytes"
-      number: 19
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_residual_bytes"
-      number: 20
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_output_bytes"
-      number: 21
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_parameters"
-      number: 8
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "total_float_ops"
-      number: 9
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "graph_nodes"
-      number: 10
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.GraphNodeProto"
-    }
-    field {
-      name: "children"
-      number: 11
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.MultiGraphNodeProto"
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
deleted file mode 100644
index b3adc50c7e14152a81a148df9deccc5272189aad..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.-id-to-string-entry.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.profiler.OpLogProto.IdToStringEntry"
-tf_proto {
-  descriptor {
-    name: "IdToStringEntry"
-    field {
-      name: "key"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_INT64
-    }
-    field {
-      name: "value"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    options {
-      map_entry: true
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
deleted file mode 100644
index 7510c566ba574e9370f5e54c29023ef4fb5ee804..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-op-log-proto.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.profiler.OpLogProto"
-tf_proto {
-  descriptor {
-    name: "OpLogProto"
-    field {
-      name: "log_entries"
-      number: 1
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogEntry"
-    }
-    field {
-      name: "id_to_string"
-      number: 2
-      label: LABEL_REPEATED
-      type: TYPE_MESSAGE
-      type_name: ".tensorflow.tfprof.OpLogProto.IdToStringEntry"
-    }
-    nested_type {
-      name: "IdToStringEntry"
-      field {
-        name: "key"
-        number: 1
-        label: LABEL_OPTIONAL
-        type: TYPE_INT64
-      }
-      field {
-        name: "value"
-        number: 2
-        label: LABEL_OPTIONAL
-        type: TYPE_STRING
-      }
-      options {
-        map_entry: true
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
deleted file mode 100644
index 19ff38a3900c2d358faaa40e7316cc3a9da73040..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profile-option-builder.pbtxt
+++ /dev/null
@@ -1,93 +0,0 @@
-path: "tensorflow.profiler.ProfileOptionBuilder"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.option_builder.ProfileOptionBuilder\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "account_displayed_op_only"
-    argspec: "args=[\'self\', \'is_true\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "build"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "float_operation"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "order_by"
-    argspec: "args=[\'self\', \'attribute\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "select"
-    argspec: "args=[\'self\', \'attributes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "time_and_memory"
-    argspec: "args=[\'min_micros\', \'min_bytes\', \'min_accelerator_micros\', \'min_cpu_micros\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'1\', \'1\', \'0\', \'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "trainable_variables_parameter"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_accounted_types"
-    argspec: "args=[\'self\', \'account_type_regexes\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_empty_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_file_output"
-    argspec: "args=[\'self\', \'outfile\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_max_depth"
-    argspec: "args=[\'self\', \'max_depth\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_execution_time"
-    argspec: "args=[\'self\', \'min_micros\', \'min_accelerator_micros\', \'min_cpu_micros\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_float_operations"
-    argspec: "args=[\'self\', \'min_float_ops\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_memory"
-    argspec: "args=[\'self\', \'min_bytes\', \'min_peak_bytes\', \'min_residual_bytes\', \'min_output_bytes\'], varargs=None, keywords=None, defaults=[\'0\', \'0\', \'0\', \'0\'], "
-  }
-  member_method {
-    name: "with_min_occurrence"
-    argspec: "args=[\'self\', \'min_occurrence\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_min_parameters"
-    argspec: "args=[\'self\', \'min_params\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_node_names"
-    argspec: "args=[\'self\', \'start_name_regexes\', \'show_name_regexes\', \'hide_name_regexes\', \'trim_name_regexes\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "with_pprof_output"
-    argspec: "args=[\'self\', \'pprof_file\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_stdout_output"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_step"
-    argspec: "args=[\'self\', \'step\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "with_timeline_output"
-    argspec: "args=[\'self\', \'timeline_file\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
deleted file mode 100644
index acb61dae9f0d184ba998aa820ec40de5bc38c3eb..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.-profiler.pbtxt
+++ /dev/null
@@ -1,37 +0,0 @@
-path: "tensorflow.profiler.Profiler"
-tf_class {
-  is_instance: "<class \'tensorflow.python.profiler.model_analyzer.Profiler\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'op_log\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "add_step"
-    argspec: "args=[\'self\', \'step\', \'run_meta\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_graph"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_name_scope"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_operations"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "profile_python"
-    argspec: "args=[\'self\', \'options\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "serialize_to_string"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
deleted file mode 100644
index 7b4d3ac522abc4229c5623da25c4ec818d86f829..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.profiler.pbtxt
+++ /dev/null
@@ -1,39 +0,0 @@
-path: "tensorflow.profiler"
-tf_module {
-  member {
-    name: "AdviceProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "GraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "MultiGraphNodeProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "OpLogProto"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
-  member {
-    name: "ProfileOptionBuilder"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Profiler"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "advise"
-    argspec: "args=[\'graph\', \'run_meta\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'0\'], "
-  }
-  member_method {
-    name: "profile"
-    argspec: "args=[\'graph\', \'run_meta\', \'op_log\', \'cmd\', \'options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'scope\', \'0\'], "
-  }
-  member_method {
-    name: "write_op_log"
-    argspec: "args=[\'graph\', \'log_dir\', \'op_log\', \'run_meta\', \'add_trace\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'True\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
index 2948b7318ea0d72913d38bf33c9a47f5c64b2931..632c2f8f83c8effb188d110bfacaf7f22c0c74cc 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.quantization.pbtxt
@@ -34,7 +34,7 @@ tf_module {
   }
   member_method {
     name: "quantize_and_dequantize"
-    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'None\'], "
+    argspec: "args=[\'input\', \'input_min\', \'input_max\', \'signed_input\', \'num_bits\', \'range_given\', \'round_mode\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \'8\', \'False\', \'HALF_TO_EVEN\', \'None\'], "
   }
   member_method {
     name: "quantized_concat"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
index 160c09798d02653ba0c090db53124450b956ef05..d49c23e59cf036f05758f5c50208febf4b7381d5 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.random.pbtxt
@@ -1,31 +1,39 @@
 path: "tensorflow.random"
 tf_module {
+  member_method {
+    name: "all_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
+  }
+  member_method {
+    name: "fixed_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'vocab_file\', \'distortion\', \'num_reserved_ids\', \'num_shards\', \'shard\', \'unigrams\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'\', \'1.0\', \'0\', \'1\', \'0\', \'()\', \'None\', \'None\'], "
+  }
   member_method {
     name: "gamma"
     argspec: "args=[\'shape\', \'alpha\', \'beta\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "get_seed"
-    argspec: "args=[\'op_seed\'], varargs=None, keywords=None, defaults=None"
+    name: "learned_unigram_candidate_sampler"
+    argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "log_uniform_candidate_sampler"
     argspec: "args=[\'true_classes\', \'num_true\', \'num_sampled\', \'unique\', \'range_max\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
-  member_method {
-    name: "multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'name\', \'output_dtype\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "normal"
     argspec: "args=[\'shape\', \'mean\', \'stddev\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'1.0\', \"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
     name: "poisson"
-    argspec: "args=[\'lam\', \'shape\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
+    argspec: "args=[\'shape\', \'lam\', \'dtype\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\', \'None\'], "
   }
   member_method {
-    name: "set_random_seed"
+    name: "set_seed"
     argspec: "args=[\'seed\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
@@ -33,8 +41,8 @@ tf_module {
     argspec: "args=[\'value\', \'seed\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "stateless_multinomial"
-    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'output_dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
+    name: "stateless_categorical"
+    argspec: "args=[\'logits\', \'num_samples\', \'seed\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'int64\'>\", \'None\'], "
   }
   member_method {
     name: "stateless_normal"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
index d57936a2f1cb9eb0be9647a9ae9d8d00221e93c7..63bebb20bcae08c645d9aaaecab2ea2de4cc49aa 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.saved_model.pbtxt
@@ -32,14 +32,6 @@ tf_module {
     name: "GPU"
     mtype: "<type \'str\'>"
   }
-  member {
-    name: "LEGACY_INIT_OP_KEY"
-    mtype: "<type \'str\'>"
-  }
-  member {
-    name: "MAIN_OP_KEY"
-    mtype: "<type \'str\'>"
-  }
   member {
     name: "PREDICT_INPUTS"
     mtype: "<type \'str\'>"
@@ -105,12 +97,12 @@ tf_module {
     argspec: "args=[\'examples\', \'classes\', \'scores\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "is_valid_signature"
-    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
+    name: "contains_saved_model"
+    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "maybe_saved_model_directory"
-    argspec: "args=[\'export_dir\'], varargs=None, keywords=None, defaults=None"
+    name: "is_valid_signature"
+    argspec: "args=[\'signature_def\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "predict_signature_def"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
index 8a196b1a556e283671cc75af28df3eaa62532975..900d08ff47ca062fdda4f0f2f6ac20ee9822d1df 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sets.pbtxt
@@ -1,19 +1,19 @@
 path: "tensorflow.sets"
 tf_module {
   member_method {
-    name: "set_difference"
+    name: "difference"
     argspec: "args=[\'a\', \'b\', \'aminusb\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\', \'True\'], "
   }
   member_method {
-    name: "set_intersection"
+    name: "intersection"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_size"
+    name: "size"
     argspec: "args=[\'a\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
   member_method {
-    name: "set_union"
+    name: "union"
     argspec: "args=[\'a\', \'b\', \'validate_indices\'], varargs=None, keywords=None, defaults=[\'True\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
index 2c50c41f186363e8331488df9e2d328e0d0b4d26..ea717b4d719d6709e05182faca964ae544abc39c 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.signal.pbtxt
@@ -1,5 +1,9 @@
 path: "tensorflow.signal"
 tf_module {
+  member_method {
+    name: "dct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
   member_method {
     name: "fft"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -24,6 +28,10 @@ tf_module {
     name: "hann_window"
     argspec: "args=[\'window_length\', \'periodic\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'True\', \"<dtype: \'float32\'>\", \'None\'], "
   }
+  member_method {
+    name: "idct"
+    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
+  }
   member_method {
     name: "ifft"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -44,6 +52,18 @@ tf_module {
     name: "inverse_stft_window_fn"
     argspec: "args=[\'frame_step\', \'forward_window_fn\', \'name\'], varargs=None, keywords=None, defaults=[\'<function hann_window instance>\', \'None\'], "
   }
+  member_method {
+    name: "irfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "irfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "linear_to_mel_weight_matrix"
     argspec: "args=[\'num_mel_bins\', \'num_spectrogram_bins\', \'sample_rate\', \'lower_edge_hertz\', \'upper_edge_hertz\', \'dtype\', \'name\'], varargs=None, keywords=None, defaults=[\'20\', \'129\', \'8000\', \'125.0\', \'3800.0\', \"<dtype: \'float32\'>\", \'None\'], "
@@ -56,6 +76,18 @@ tf_module {
     name: "overlap_and_add"
     argspec: "args=[\'signal\', \'frame_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "rfft"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft2d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
+  member_method {
+    name: "rfft3d"
+    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+  }
   member_method {
     name: "stft"
     argspec: "args=[\'signals\', \'frame_length\', \'frame_step\', \'fft_length\', \'window_fn\', \'pad_end\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'<function hann_window instance>\', \'False\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
index 9c9c4d838e9e26ac22349a5e3f4655db8374f685..b8bd2c0b72c1a78fb2abbfb319073fec267f56fb 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.sparse.pbtxt
@@ -10,11 +10,11 @@ tf_module {
   }
   member_method {
     name: "add"
-    argspec: "args=[\'a\', \'b\', \'thresh\'], varargs=None, keywords=None, defaults=[\'0\'], "
+    argspec: "args=[\'a\', \'b\', \'threshold\'], varargs=None, keywords=None, defaults=[\'0\'], "
   }
   member_method {
     name: "concat"
-    argspec: "args=[\'axis\', \'sp_inputs\', \'name\', \'expand_nonconcat_dim\', \'concat_dim\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'None\'], "
+    argspec: "args=[\'axis\', \'sp_inputs\', \'expand_nonconcat_dims\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
   }
   member_method {
     name: "cross"
@@ -40,37 +40,21 @@ tf_module {
     name: "mask"
     argspec: "args=[\'a\', \'mask_indices\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "matmul"
-    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
-  }
   member_method {
     name: "maximum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'sp_ids\', \'sp_values\', \'vocab_size\', \'name\', \'already_sorted\'], varargs=None, keywords=None, defaults=[\'None\', \'False\'], "
-  }
   member_method {
     name: "minimum"
     argspec: "args=[\'sp_a\', \'sp_b\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "reduce_max"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_max_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reduce_sum"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "reduce_sum_sparse"
-    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'reduction_axes\', \'keep_dims\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'axis\', \'keepdims\', \'output_is_sparse\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'False\', \'None\'], "
   }
   member_method {
     name: "reorder"
@@ -90,15 +74,15 @@ tf_module {
   }
   member_method {
     name: "segment_mean"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sqrt_n"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "segment_sum"
-    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'name\', \'num_segments\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    argspec: "args=[\'data\', \'indices\', \'segment_ids\', \'num_segments\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
     name: "slice"
@@ -108,9 +92,13 @@ tf_module {
     name: "softmax"
     argspec: "args=[\'sp_input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
+  member_method {
+    name: "sparse_dense_matmul"
+    argspec: "args=[\'sp_a\', \'b\', \'adjoint_a\', \'adjoint_b\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'False\', \'None\'], "
+  }
   member_method {
     name: "split"
-    argspec: "args=[\'keyword_required\', \'sp_input\', \'num_split\', \'axis\', \'name\', \'split_dim\'], varargs=None, keywords=None, defaults=[\'KeywordRequired()\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
+    argspec: "args=[\'sp_input\', \'num_split\', \'axis\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
     name: "to_dense"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
deleted file mode 100644
index b0f0783e300652c9063a248e5157b5f0174c5598..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.spectral.pbtxt
+++ /dev/null
@@ -1,35 +0,0 @@
-path: "tensorflow.spectral"
-tf_module {
-  member_method {
-    name: "dct"
-    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "idct"
-    argspec: "args=[\'input\', \'type\', \'n\', \'axis\', \'norm\', \'name\'], varargs=None, keywords=None, defaults=[\'2\', \'None\', \'-1\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "irfft"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "irfft2d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "irfft3d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft2d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "rfft3d"
-    argspec: "args=[\'input_tensor\', \'fft_length\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
index 03144cbe709fe59afc3a818ea7c157ace72b713d..f6e32ed08c8339413374c11c6fc75aec92bffec2 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.strings.pbtxt
@@ -10,11 +10,11 @@ tf_module {
   }
   member_method {
     name: "length"
-    argspec: "args=[\'input\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
+    argspec: "args=[\'input\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
     name: "reduce_join"
-    argspec: "args=[\'inputs\', \'axis\', \'keep_dims\', \'separator\', \'name\', \'reduction_indices\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\', \'None\'], "
+    argspec: "args=[\'inputs\', \'axis\', \'keepdims\', \'separator\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'False\', \'\', \'None\'], "
   }
   member_method {
     name: "regex_full_match"
@@ -34,11 +34,11 @@ tf_module {
   }
   member_method {
     name: "substr"
-    argspec: "args=[\'input\', \'pos\', \'len\', \'name\', \'unit\'], varargs=None, keywords=None, defaults=[\'None\', \'BYTE\'], "
+    argspec: "args=[\'input\', \'pos\', \'len\', \'unit\', \'name\'], varargs=None, keywords=None, defaults=[\'BYTE\', \'None\'], "
   }
   member_method {
     name: "to_hash_bucket"
-    argspec: "args=[\'string_tensor\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    argspec: "args=[\'input\', \'num_buckets\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "to_hash_bucket_fast"
@@ -50,7 +50,11 @@ tf_module {
   }
   member_method {
     name: "to_number"
-    argspec: "args=[\'string_tensor\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+    argspec: "args=[\'input\', \'out_type\', \'name\'], varargs=None, keywords=None, defaults=[\"<dtype: \'float32\'>\", \'None\'], "
+  }
+  member_method {
+    name: "unicode_encode"
+    argspec: "args=[\'input\', \'output_encoding\', \'errors\', \'replacement_char\', \'name\'], varargs=None, keywords=None, defaults=[\'replace\', \'65533\', \'None\'], "
   }
   member_method {
     name: "unicode_script"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
deleted file mode 100644
index 73de73869c8d1a6808b16fe8853fd21cc8891879..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.-session-log.pbtxt
+++ /dev/null
@@ -1,44 +0,0 @@
-path: "tensorflow.summary.SessionLog"
-tf_proto {
-  descriptor {
-    name: "SessionLog"
-    field {
-      name: "status"
-      number: 1
-      label: LABEL_OPTIONAL
-      type: TYPE_ENUM
-      type_name: ".tensorflow.SessionLog.SessionStatus"
-    }
-    field {
-      name: "checkpoint_path"
-      number: 2
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    field {
-      name: "msg"
-      number: 3
-      label: LABEL_OPTIONAL
-      type: TYPE_STRING
-    }
-    enum_type {
-      name: "SessionStatus"
-      value {
-        name: "STATUS_UNSPECIFIED"
-        number: 0
-      }
-      value {
-        name: "START"
-        number: 1
-      }
-      value {
-        name: "STOP"
-        number: 2
-      }
-      value {
-        name: "CHECKPOINT"
-        number: 3
-      }
-    }
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
similarity index 50%
rename from tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
rename to tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
index e02a0c6097c5ea4dae905b25cd0e381f5e257105..6715c14e168d6a30ce8aa35470525521069de40a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-stub-out-for-testing.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.-summary-writer.pbtxt
@@ -1,28 +1,29 @@
-path: "tensorflow.test.StubOutForTesting"
+path: "tensorflow.summary.SummaryWriter"
 tf_class {
-  is_instance: "<class \'tensorflow.python.platform.googletest.StubOutForTesting\'>"
+  is_instance: "<class \'tensorflow.python.ops.summary_ops_v2.SummaryWriter\'>"
+  is_instance: "<type \'object\'>"
   member_method {
-    name: "CleanUp"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
+    name: "__init__"
+    argspec: "args=[\'self\', \'resource\', \'init_op_fn\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "Set"
-    argspec: "args=[\'self\', \'parent\', \'child_name\', \'new_child\'], varargs=None, keywords=None, defaults=None"
+    name: "as_default"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "SmartSet"
-    argspec: "args=[\'self\', \'obj\', \'attr_name\', \'new_attr\'], varargs=None, keywords=None, defaults=None"
+    name: "close"
+    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "SmartUnsetAll"
+    name: "flush"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "UnsetAll"
+    name: "init"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
-    name: "__init__"
+    name: "set_as_default"
     argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
index 7ed9cd77a01c2eadb5ea43a02306d60d505127a0..5cf4d7cfd9ac54eeccea5094ad789aede29540b8 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.summary.pbtxt
@@ -12,10 +12,6 @@ tf_module {
     name: "FileWriterCache"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SessionLog"
-    mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
-  }
   member {
     name: "Summary"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -24,44 +20,24 @@ tf_module {
     name: "SummaryDescription"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
+  member {
+    name: "SummaryWriter"
+    mtype: "<type \'type\'>"
+  }
   member {
     name: "TaggedRunMetadata"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
   member_method {
-    name: "audio"
-    argspec: "args=[\'name\', \'tensor\', \'sample_rate\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "get_summary_description"
-    argspec: "args=[\'node_def\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "histogram"
-    argspec: "args=[\'name\', \'values\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "image"
-    argspec: "args=[\'name\', \'tensor\', \'max_outputs\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'3\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "merge"
-    argspec: "args=[\'inputs\', \'collections\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "merge_all"
-    argspec: "args=[\'key\', \'scope\', \'name\'], varargs=None, keywords=None, defaults=[\'summaries\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "scalar"
-    argspec: "args=[\'name\', \'tensor\', \'collections\', \'family\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
+    name: "create_file_writer"
+    argspec: "args=[\'logdir\', \'max_queue\', \'flush_millis\', \'filename_suffix\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\'], "
   }
   member_method {
-    name: "tensor_summary"
-    argspec: "args=[\'name\', \'tensor\', \'summary_description\', \'collections\', \'summary_metadata\', \'family\', \'display_name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\'], "
+    name: "flush"
+    argspec: "args=[\'writer\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
   }
   member_method {
-    name: "text"
-    argspec: "args=[\'name\', \'tensor\', \'collections\'], varargs=None, keywords=None, defaults=[\'None\'], "
+    name: "import_event"
+    argspec: "args=[\'tensor\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
 }
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
index df528e26b60f8d8ddcc1eaf0ed292cc7ff0ebd94..6fc489c86043d074ac832d0ec9dbefd2cbbb4f19 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.-benchmark.pbtxt
@@ -6,6 +6,10 @@ tf_class {
   member_method {
     name: "__init__"
   }
+  member_method {
+    name: "evaluate"
+    argspec: "args=[\'self\', \'tensors\'], varargs=None, keywords=None, defaults=None"
+  }
   member_method {
     name: "is_abstract"
     argspec: "args=[\'cls\'], varargs=None, keywords=None, defaults=None"
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
index af3f06d8de3a90d55654ed706faa9182adc9cae1..980e96ac254aebf229ae52d98f607ed87d334e7a 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.test.pbtxt
@@ -4,34 +4,18 @@ tf_module {
     name: "Benchmark"
     mtype: "<class \'tensorflow.python.platform.benchmark._BenchmarkRegistrar\'>"
   }
-  member {
-    name: "StubOutForTesting"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "TestCase"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "mock"
-    mtype: "<type \'module\'>"
-  }
   member_method {
     name: "assert_equal_graph_def"
-    argspec: "args=[\'actual\', \'expected\', \'checkpoint_v2\'], varargs=None, keywords=None, defaults=[\'False\'], "
+    argspec: "args=[\'actual\', \'expected\'], varargs=None, keywords=None, defaults=None"
   }
   member_method {
     name: "benchmark_config"
     argspec: "args=[], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "compute_gradient"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'init_targets\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradient_error"
-    argspec: "args=[\'x\', \'x_shape\', \'y\', \'y_shape\', \'x_init_value\', \'delta\', \'extra_feed_dict\'], varargs=None, keywords=None, defaults=[\'None\', \'0.001\', \'None\'], "
-  }
   member_method {
     name: "create_local_cluster"
     argspec: "args=[\'num_workers\', \'num_ps\', \'protocol\', \'worker_config\', \'ps_config\'], varargs=None, keywords=None, defaults=[\'grpc\', \'None\', \'None\'], "
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
deleted file mode 100644
index 1f1d8b6f9e2cde4800cdef9c417191b1a0ce07b5..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adadelta-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdadeltaOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adadelta.AdadeltaOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'rho\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.95\', \'1e-08\', \'False\', \'Adadelta\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
deleted file mode 100644
index a7c05d484905a0af26c80a52d92623ef4a3eb6c4..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-d-a-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdagradDAOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adagrad_da.AdagradDAOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'global_step\', \'initial_gradient_squared_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'AdagradDA\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
deleted file mode 100644
index bc8b92389c6ed7dcb0fa23ff3abd86bb0d1c488a..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adagrad-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdagradOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adagrad.AdagradOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'False\', \'Adagrad\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
deleted file mode 100644
index 5d17be9378fd130b89e199544f85e03a23a71d3c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-adam-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.AdamOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.adam.AdamOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'beta1\', \'beta2\', \'epsilon\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.001\', \'0.9\', \'0.999\', \'1e-08\', \'False\', \'Adam\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
deleted file mode 100644
index abbe273be32c6fd20b1a6464f3e99966bd3c8953..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-chief-session-creator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.train.ChiefSessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.ChiefSessionCreator\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
deleted file mode 100644
index d265fdeb01c38d8a1347e630d7f7bff111999634..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-ftrl-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.FtrlOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.ftrl.FtrlOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'learning_rate_power\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\', \'accum_name\', \'linear_name\', \'l2_shrinkage_regularization_strength\'], varargs=None, keywords=None, defaults=[\'-0.5\', \'0.1\', \'0.0\', \'0.0\', \'False\', \'Ftrl\', \'None\', \'None\', \'0.0\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
deleted file mode 100644
index c673e29cd4dd6cd3c01582abfbc306c092818892..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-gradient-descent-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.GradientDescentOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.gradient_descent.GradientDescentOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'GradientDescent\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
deleted file mode 100644
index c61859004e897a14b580dc0b55957edfa6ae6860..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-looper-thread.pbtxt
+++ /dev/null
@@ -1,73 +0,0 @@
-path: "tensorflow.train.LooperThread"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.coordinator.LooperThread\'>"
-  is_instance: "<class \'threading.Thread\'>"
-  member {
-    name: "daemon"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ident"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "name"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "getName"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "isAlive"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "isDaemon"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "is_alive"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "join"
-    argspec: "args=[\'self\', \'timeout\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "loop"
-    argspec: "args=[\'coord\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setDaemon"
-    argspec: "args=[\'self\', \'daemonic\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "setName"
-    argspec: "args=[\'self\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "stop_loop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
deleted file mode 100644
index 8199f63b9b8c64c73a3d62294277838cdc240280..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-momentum-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.MomentumOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.momentum.MomentumOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'momentum\', \'use_locking\', \'name\', \'use_nesterov\'], varargs=None, keywords=None, defaults=[\'False\', \'Momentum\', \'False\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
deleted file mode 100644
index 03efe6639e0e3d2c6c280bd30d2b59b5d654f995..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.-step-context.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.MonitoredSession.StepContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.StepContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session\', \'run_with_hooks_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_with_hooks"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
deleted file mode 100644
index 09b7b3fb538fb8d87dcfd622089818081a1fb79b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-monitored-session.pbtxt
+++ /dev/null
@@ -1,34 +0,0 @@
-path: "tensorflow.train.MonitoredSession"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.MonitoredSession\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "StepContext"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session_creator\', \'hooks\', \'stop_grace_period_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'120\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "run_step_fn"
-    argspec: "args=[\'self\', \'step_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
deleted file mode 100644
index 876bb35e391885e751066a415967af848280c714..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-optimizer.pbtxt
+++ /dev/null
@@ -1,50 +0,0 @@
-path: "tensorflow.train.Optimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
deleted file mode 100644
index 14349a74efb61124fc7b5568d5ec023f08b1b62f..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-proximal-adagrad-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.ProximalAdagradOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.proximal_adagrad.ProximalAdagradOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'initial_accumulator_value\', \'l1_regularization_strength\', \'l2_regularization_strength\', \'use_locking\', \'name\'], varargs=None, keywords=None, defaults=[\'0.1\', \'0.0\', \'0.0\', \'False\', \'ProximalAdagrad\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
deleted file mode 100644
index 906384a2875bf7b05ac26fc43207f4ef9b5a7472..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-r-m-s-prop-optimizer.pbtxt
+++ /dev/null
@@ -1,51 +0,0 @@
-path: "tensorflow.train.RMSPropOptimizer"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.rmsprop.RMSPropOptimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.optimizer.Optimizer\'>"
-  is_instance: "<class \'tensorflow.python.training.checkpointable.base.CheckpointableBase\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "GATE_GRAPH"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_NONE"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "GATE_OP"
-    mtype: "<type \'int\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'learning_rate\', \'decay\', \'momentum\', \'epsilon\', \'use_locking\', \'centered\', \'name\'], varargs=None, keywords=None, defaults=[\'0.9\', \'0.0\', \'1e-10\', \'False\', \'False\', \'RMSProp\'], "
-  }
-  member_method {
-    name: "apply_gradients"
-    argspec: "args=[\'self\', \'grads_and_vars\', \'global_step\', \'name\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "compute_gradients"
-    argspec: "args=[\'self\', \'loss\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'1\', \'None\', \'False\', \'None\'], "
-  }
-  member_method {
-    name: "get_name"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot"
-    argspec: "args=[\'self\', \'var\', \'name\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_slot_names"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "minimize"
-    argspec: "args=[\'self\', \'loss\', \'global_step\', \'var_list\', \'gate_gradients\', \'aggregation_method\', \'colocate_gradients_with_ops\', \'name\', \'grad_loss\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'1\', \'None\', \'False\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "variables"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
deleted file mode 100644
index 38cc98b48e78aa93f7614a9baff236f7b119f99d..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-scaffold.pbtxt
+++ /dev/null
@@ -1,53 +0,0 @@
-path: "tensorflow.train.Scaffold"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.Scaffold\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "init_feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_fn"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_for_local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "saver"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_op"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'init_op\', \'init_feed_dict\', \'init_fn\', \'ready_op\', \'ready_for_local_init_op\', \'local_init_op\', \'summary_op\', \'saver\', \'copy_from_scaffold\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "default_local_init_op"
-    argspec: "args=[], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "finalize"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "get_or_default"
-    argspec: "args=[\'arg_name\', \'collection_key\', \'default_constructor\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
deleted file mode 100644
index beb232715f725047dd8c03054b899a90fa81eec2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-creator.pbtxt
+++ /dev/null
@@ -1,12 +0,0 @@
-path: "tensorflow.train.SessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
deleted file mode 100644
index 448764fe081b250e1e22633f118268ad638cb9dd..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-manager.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.SessionManager"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_manager.SessionManager\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'local_init_op\', \'ready_op\', \'ready_for_local_init_op\', \'graph\', \'recovery_wait_secs\', \'local_init_run_options\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'30\', \'None\'], "
-  }
-  member_method {
-    name: "prepare_session"
-    argspec: "args=[\'self\', \'master\', \'init_op\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\', \'init_feed_dict\', \'init_fn\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'None\', \'False\', \'7200\', \'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "recover_session"
-    argspec: "args=[\'self\', \'master\', \'saver\', \'checkpoint_dir\', \'checkpoint_filename_with_path\', \'wait_for_checkpoint\', \'max_wait_secs\', \'config\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\', \'False\', \'7200\', \'None\'], "
-  }
-  member_method {
-    name: "wait_for_session"
-    argspec: "args=[\'self\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'inf\'], "
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
deleted file mode 100644
index 442990893e33c92bd05a72b198a6584bc979b2fe..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-args.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.train.SessionRunArgs"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunArgs\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "fetches"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "options"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
deleted file mode 100644
index d5adb15c95f8a6ebde4ca0e0c535dfebc5edfbf2..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-context.pbtxt
+++ /dev/null
@@ -1,25 +0,0 @@
-path: "tensorflow.train.SessionRunContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "original_args"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "stop_requested"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'original_args\', \'session\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
deleted file mode 100644
index 0b401d59c400f1d08f47daa2d264a9a5bfc91538..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-session-run-values.pbtxt
+++ /dev/null
@@ -1,27 +0,0 @@
-path: "tensorflow.train.SessionRunValues"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
-  is_instance: "<class \'tensorflow.python.training.session_run_hook.SessionRunValues\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "options"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "results"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "run_metadata"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
deleted file mode 100644
index 36d8ce7ff82e02300b59705400be40d7cc3f65ae..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.-step-context.pbtxt
+++ /dev/null
@@ -1,21 +0,0 @@
-path: "tensorflow.train.SingularMonitoredSession.StepContext"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.StepContext\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "session"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'session\', \'run_with_hooks_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run_with_hooks"
-    argspec: "args=[\'self\'], varargs=args, keywords=kwargs, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
deleted file mode 100644
index de0f2c1c1a2497ef4e541ee6583d416e31f48826..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-singular-monitored-session.pbtxt
+++ /dev/null
@@ -1,38 +0,0 @@
-path: "tensorflow.train.SingularMonitoredSession"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SingularMonitoredSession\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session._MonitoredSession\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "StepContext"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "graph"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'hooks\', \'scaffold\', \'master\', \'config\', \'checkpoint_dir\', \'stop_grace_period_secs\', \'checkpoint_filename_with_path\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'\', \'None\', \'None\', \'120\', \'None\'], "
-  }
-  member_method {
-    name: "close"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "raw_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "run"
-    argspec: "args=[\'self\', \'fetches\', \'feed_dict\', \'options\', \'run_metadata\'], varargs=None, keywords=None, defaults=[\'None\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "run_step_fn"
-    argspec: "args=[\'self\', \'step_fn\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
deleted file mode 100644
index 9677e5a98e4a8308093f51a84d8b1edae405cd2b..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-supervisor.pbtxt
+++ /dev/null
@@ -1,153 +0,0 @@
-path: "tensorflow.train.Supervisor"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.supervisor.Supervisor\'>"
-  is_instance: "<type \'object\'>"
-  member {
-    name: "USE_DEFAULT"
-    mtype: "<type \'int\'>"
-  }
-  member {
-    name: "coord"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "global_step"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_feed_dict"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "is_chief"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_for_local_init_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "ready_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_model_secs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_path"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "save_summaries_secs"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "saver"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "session_manager"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_op"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "summary_writer"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "Loop"
-    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "PrepareSession"
-    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
-  }
-  member_method {
-    name: "RequestStop"
-    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "ShouldStop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "StartQueueRunners"
-    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "StartStandardServices"
-    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "Stop"
-    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "StopOnException"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "SummaryComputed"
-    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "WaitForStop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'graph\', \'ready_op\', \'ready_for_local_init_op\', \'is_chief\', \'init_op\', \'init_feed_dict\', \'local_init_op\', \'logdir\', \'summary_op\', \'saver\', \'global_step\', \'save_summaries_secs\', \'save_model_secs\', \'recovery_wait_secs\', \'stop_grace_secs\', \'checkpoint_basename\', \'session_manager\', \'summary_writer\', \'init_fn\', \'local_init_run_options\'], varargs=None, keywords=None, defaults=[\'None\', \'0\', \'0\', \'True\', \'0\', \'None\', \'0\', \'None\', \'0\', \'0\', \'0\', \'120\', \'600\', \'30\', \'120\', \'model.ckpt\', \'None\', \'0\', \'None\', \'None\'], "
-  }
-  member_method {
-    name: "loop"
-    argspec: "args=[\'self\', \'timer_interval_secs\', \'target\', \'args\', \'kwargs\'], varargs=None, keywords=None, defaults=[\'None\', \'None\'], "
-  }
-  member_method {
-    name: "managed_session"
-    argspec: "args=[], varargs=args, keywords=kwds, defaults=None"
-  }
-  member_method {
-    name: "prepare_or_wait_for_session"
-    argspec: "args=[\'self\', \'master\', \'config\', \'wait_for_checkpoint\', \'max_wait_secs\', \'start_standard_services\'], varargs=None, keywords=None, defaults=[\'\', \'None\', \'False\', \'7200\', \'True\'], "
-  }
-  member_method {
-    name: "request_stop"
-    argspec: "args=[\'self\', \'ex\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "should_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "start_queue_runners"
-    argspec: "args=[\'self\', \'sess\', \'queue_runners\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "start_standard_services"
-    argspec: "args=[\'self\', \'sess\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "stop"
-    argspec: "args=[\'self\', \'threads\', \'close_summary_writer\', \'ignore_live_threads\'], varargs=None, keywords=None, defaults=[\'None\', \'True\', \'False\'], "
-  }
-  member_method {
-    name: "stop_on_exception"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-  member_method {
-    name: "summary_computed"
-    argspec: "args=[\'self\', \'sess\', \'summary\', \'global_step\'], varargs=None, keywords=None, defaults=[\'None\'], "
-  }
-  member_method {
-    name: "wait_for_stop"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
deleted file mode 100644
index 39b946b82f3d5caadbdeac6253e5554df69a2776..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-vocab-info.pbtxt
+++ /dev/null
@@ -1,43 +0,0 @@
-path: "tensorflow.train.VocabInfo"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
-  is_instance: "<class \'tensorflow.python.training.warm_starting_util.VocabInfo\'>"
-  is_instance: "<type \'tuple\'>"
-  member {
-    name: "axis"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "backup_initializer"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "new_vocab"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "new_vocab_size"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "num_oov_buckets"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "old_vocab"
-    mtype: "<type \'property\'>"
-  }
-  member {
-    name: "old_vocab_size"
-    mtype: "<type \'property\'>"
-  }
-  member_method {
-    name: "__init__"
-  }
-  member_method {
-    name: "count"
-  }
-  member_method {
-    name: "index"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
deleted file mode 100644
index ac263580687e53bb3fcffd5268f73f8b67aa43a1..0000000000000000000000000000000000000000
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.-worker-session-creator.pbtxt
+++ /dev/null
@@ -1,14 +0,0 @@
-path: "tensorflow.train.WorkerSessionCreator"
-tf_class {
-  is_instance: "<class \'tensorflow.python.training.monitored_session.WorkerSessionCreator\'>"
-  is_instance: "<class \'tensorflow.python.training.monitored_session.SessionCreator\'>"
-  is_instance: "<type \'object\'>"
-  member_method {
-    name: "__init__"
-    argspec: "args=[\'self\', \'scaffold\', \'master\', \'config\', \'max_wait_secs\'], varargs=None, keywords=None, defaults=[\'None\', \'\', \'None\', \'1800\'], "
-  }
-  member_method {
-    name: "create_session"
-    argspec: "args=[\'self\'], varargs=None, keywords=None, defaults=None"
-  }
-}
diff --git a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
index a091daa2985b49b51c121862fa80dadfaa35a98d..8c327f88f32357bc15b1cdcbbc2ffad674063f6b 100644
--- a/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
+++ b/tensorflow/tools/api/golden/v2/tensorflow.train.pbtxt
@@ -1,21 +1,5 @@
 path: "tensorflow.train"
 tf_module {
-  member {
-    name: "AdadeltaOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdagradDAOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdagradOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "AdamOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "BytesList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -28,18 +12,6 @@ tf_module {
     name: "CheckpointManager"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "CheckpointSaverHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "CheckpointSaverListener"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ChiefSessionCreator"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ClusterDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -76,30 +48,10 @@ tf_module {
     name: "Features"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FeedFnHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "FinalOpsHook"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "FloatList"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "FtrlOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GlobalStepWaiterHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "GradientDescentOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "Int64List"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -108,58 +60,10 @@ tf_module {
     name: "JobDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "LoggingTensorHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "LooperThread"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MomentumOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "MonitoredSession"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NanLossDuringTrainingError"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "NanTensorHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Optimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ProfilerHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "ProximalAdagradOptimizer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "ProximalGradientDescentOptimizer"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "RMSPropOptimizer"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Scaffold"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SecondOrStepTimer"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SequenceExample"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
@@ -172,62 +76,10 @@ tf_module {
     name: "ServerDef"
     mtype: "<class \'google.protobuf.pyext.cpp_message.GeneratedProtocolMessageType\'>"
   }
-  member {
-    name: "SessionCreator"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionManager"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionRunArgs"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SessionRunContext"
-    mtype: "<type \'type\'>"
-  }
   member {
     name: "SessionRunHook"
     mtype: "<type \'type\'>"
   }
-  member {
-    name: "SessionRunValues"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SingularMonitoredSession"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StepCounterHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "StopAtStepHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "SummarySaverHook"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "Supervisor"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "VocabInfo"
-    mtype: "<type \'type\'>"
-  }
-  member {
-    name: "WorkerSessionCreator"
-    mtype: "<type \'type\'>"
-  }
-  member_method {
-    name: "NewCheckpointReader"
-    argspec: "args=[\'filepattern\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "cosine_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'alpha\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0\', \'None\'], "
@@ -244,10 +96,6 @@ tf_module {
     name: "get_checkpoint_state"
     argspec: "args=[\'checkpoint_dir\', \'latest_filename\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
-  member_method {
-    name: "init_from_checkpoint"
-    argspec: "args=[\'ckpt_dir_or_file\', \'assignment_map\'], varargs=None, keywords=None, defaults=None"
-  }
   member_method {
     name: "inverse_time_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'decay_rate\', \'staircase\', \'name\'], varargs=None, keywords=None, defaults=[\'False\', \'None\'], "
@@ -281,17 +129,13 @@ tf_module {
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'initial_variance\', \'variance_decay\', \'num_periods\', \'alpha\', \'beta\', \'name\'], varargs=None, keywords=None, defaults=[\'1.0\', \'0.55\', \'0.5\', \'0.0\', \'0.001\', \'None\'], "
   }
   member_method {
-    name: "piecewise_constant"
+    name: "piecewise_constant_decay"
     argspec: "args=[\'x\', \'boundaries\', \'values\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
   }
   member_method {
     name: "polynomial_decay"
     argspec: "args=[\'learning_rate\', \'global_step\', \'decay_steps\', \'end_learning_rate\', \'power\', \'cycle\', \'name\'], varargs=None, keywords=None, defaults=[\'0.0001\', \'1.0\', \'False\', \'None\'], "
   }
-  member_method {
-    name: "replica_device_setter"
-    argspec: "args=[\'ps_tasks\', \'ps_device\', \'worker_device\', \'merge_devices\', \'cluster\', \'ps_ops\', \'ps_strategy\'], varargs=None, keywords=None, defaults=[\'0\', \'/job:ps\', \'/job:worker\', \'True\', \'None\', \'None\', \'None\'], "
-  }
   member_method {
     name: "sdca_fprint"
     argspec: "args=[\'input\', \'name\'], varargs=None, keywords=None, defaults=[\'None\'], "
@@ -308,8 +152,4 @@ tf_module {
     name: "summary_iterator"
     argspec: "args=[\'path\'], varargs=None, keywords=None, defaults=None"
   }
-  member_method {
-    name: "warm_start"
-    argspec: "args=[\'ckpt_to_initialize_from\', \'vars_to_warm_start\', \'var_name_to_vocab_info\', \'var_name_to_prev_var_name\'], varargs=None, keywords=None, defaults=[\'.*\', \'None\', \'None\'], "
-  }
 }
diff --git a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
index 70df38ba8b8c46a51640b14591b6437dea639450..5102066730533c717a029c6fd52ef0e2d10a520d 100644
--- a/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
+++ b/tensorflow/tools/api/lib/python_object_to_proto_visitor.py
@@ -37,6 +37,9 @@ _CORNER_CASES = {
     'train.NanLossDuringTrainingError': {
         'message': {}
     },
+    'estimator.NanLossDuringTrainingError': {
+        'message': {}
+    },
 }
 
 # Python 2 vs. 3 differences
diff --git a/tensorflow/tools/api/tests/api_compatibility_test.py b/tensorflow/tools/api/tests/api_compatibility_test.py
index fb489ea80fbdad0612f5ae0af9d91fa0df534115..723fceef413d86675e885debd37e73e5facd7f7c 100644
--- a/tensorflow/tools/api/tests/api_compatibility_test.py
+++ b/tensorflow/tools/api/tests/api_compatibility_test.py
@@ -33,12 +33,13 @@ import re
 import sys
 
 import tensorflow as tf
-from tensorflow._api import v2 as tf_v2
+from tensorflow._api.v2 import v2 as tf_v2
 
 from google.protobuf import message
 from google.protobuf import text_format
 
 from tensorflow.python.lib.io import file_io
+from tensorflow.python.framework import test_util
 from tensorflow.python.platform import resource_loader
 from tensorflow.python.platform import test
 from tensorflow.python.platform import tf_logging as logging
@@ -126,9 +127,9 @@ def _FilterNonCoreGoldenFiles(golden_file_list):
   filtered_file_list = []
   filtered_package_prefixes = ['tensorflow.%s.' % p for p in _NON_CORE_PACKAGES]
   for f in golden_file_list:
-    if any([
+    if any(
         f.rsplit('/')[-1].startswith(pre) for pre in filtered_package_prefixes
-    ]):
+    ):
       continue
     filtered_file_list.append(f)
   return filtered_file_list
@@ -276,6 +277,9 @@ class ApiCompatibilityTest(test.TestCase):
 
     public_api_visitor = public_api.PublicAPIVisitor(visitor)
     public_api_visitor.private_map['tf'] = ['contrib']
+    if api_version == 2:
+      public_api_visitor.private_map['tf'].append('enable_v2_behavior')
+
     public_api_visitor.do_not_descend_map['tf.GPUOptions'] = ['Experimental']
     if FLAGS.only_test_core_api:
       public_api_visitor.do_not_descend_map['tf'].extend(_NON_CORE_PACKAGES)
@@ -310,6 +314,7 @@ class ApiCompatibilityTest(test.TestCase):
         update_goldens=FLAGS.update_goldens,
         api_version=api_version)
 
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibility(self):
     api_version = 1
     golden_file_pattern = os.path.join(
@@ -328,6 +333,7 @@ class ApiCompatibilityTest(test.TestCase):
         'tensorflow.python.util.lazy_loader.LazyLoader'
         in str(type(tf.contrib)))
 
+  @test_util.run_v1_only('b/120545219')
   def testAPIBackwardsCompatibilityV1(self):
     api_version = 1
     golden_file_pattern = os.path.join(
diff --git a/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
new file mode 100644
index 0000000000000000000000000000000000000000..03de89b7176b702cf8fdee84bb4372002ad94707
--- /dev/null
+++ b/tensorflow/tools/ci_build/Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04
@@ -0,0 +1,75 @@
+# To push a new version, run:
+# $ docker build -f Dockerfile.rbe.cuda10.0-cudnn7-ubuntu14.04 \
+#       --tag "gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04" .
+# $ docker push gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04
+
+FROM ubuntu:14.04
+LABEL maintainer="Manuel Klimek <klimek@google.com>"
+
+RUN apt-get update && apt-get install -y --no-install-recommends ca-certificates apt-transport-https gnupg-curl && \
+    rm -rf /var/lib/apt/lists/* && \
+    NVIDIA_GPGKEY_SUM=d1be581509378368edeec8c1eb2958702feedf3bc3d17011adbf24efacce4ab5 && \
+    NVIDIA_GPGKEY_FPR=ae09fe4bbd223a84b2ccfce3f60f4b3d7fa2af80 && \
+    apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64/7fa2af80.pub && \
+    apt-key adv --export --no-emit-version -a $NVIDIA_GPGKEY_FPR | tail -n +2 > cudasign.pub && \
+    echo "$NVIDIA_GPGKEY_SUM  cudasign.pub" | sha256sum -c --strict - && rm cudasign.pub && \
+    echo "deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/cuda.list && \
+    echo "deb https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1604/x86_64 /" > /etc/apt/sources.list.d/nvidia-ml.list
+
+ENV CUDA_VERSION 10.0.130
+ENV CUDA_PKG_VERSION 10-0=$CUDA_VERSION-1
+ENV CUDNN_VERSION 7.3.1.20
+ENV NCCL_VERSION 2.3.5
+ENV NVIDIA_DRIVER_CAPABILITIES compute,utility
+ENV NVIDIA_REQUIRE_CUDA "cuda>=10.0,driver>=410"
+ENV NVIDIA_VISIBLE_DEVICES all
+ENV PATH /usr/local/cuda/bin:${PATH}
+
+# TODO(b/110903506): /usr/loca/cuda/lib64/stubs should not be needed in
+# LD_LIBRARY_PATH. The stubs/libcuda.so is not meant to used at runtime. The
+# correct way to pass the path to bfd-ld is to pass
+# -Wl,-rpath-link=/usr/local/cuda/lib64/stubs to all binaries transitively
+# depending on libcuda. Optimally, builds targeting cuda would do that
+# internally.
+ENV LD_LIBRARY_PATH /usr/local/cuda/lib64/stubs
+
+LABEL com.nvidia.cudnn.version="${CUDNN_VERSION}"
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        cuda-command-line-tools-$CUDA_PKG_VERSION \
+        cuda-compat-10-0=410.48-1 \
+        cuda-cudart-$CUDA_PKG_VERSION \
+        cuda-libraries-$CUDA_PKG_VERSION \
+        cuda-libraries-dev-$CUDA_PKG_VERSION \
+        cuda-minimal-build-$CUDA_PKG_VERSION \
+        cuda-nvml-dev-$CUDA_PKG_VERSION \
+        cuda-nvtx-$CUDA_PKG_VERSION \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7=$CUDNN_VERSION-1+cuda10.0 \
+        libcudnn7-dev=$CUDNN_VERSION-1+cuda10.0 \
+        libnccl2=$NCCL_VERSION-2+cuda10.0 \
+        libnccl-dev=$NCCL_VERSION-2+cuda10.0 && \
+    ln -s cuda-10.0 /usr/local/cuda && \
+    apt-mark hold libcudnn7 && \
+    apt-mark hold libnccl2 && \
+    rm -rf /var/lib/apt/lists/*
+
+# TODO(b/110903506): Provide a link to the SONAME of libcuda.so.
+# https://github.com/NVIDIA/nvidia-docker/issues/775
+RUN ln -s libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+# TODO(klimek): Once the TODO in tensorflow's configure.py to correctly find
+# libnccl is resolved, delete this block.
+RUN ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so \
+ && ln -s /usr/lib/x86_64-linux-gnu/libnccl.so /usr/lib/libnccl.so.2
+
+# Copy and run the install scripts.
+COPY install/*.sh /install/
+ARG DEBIAN_FRONTEND=noninteractive
+RUN /install/install_bootstrap_deb_packages.sh
+RUN add-apt-repository -y ppa:openjdk-r/ppa && \
+    add-apt-repository -y ppa:george-edison55/cmake-3.x
+RUN /install/install_deb_packages.sh
+RUN /install/install_pip_packages.sh
+RUN /install/install_golang.sh
+
diff --git a/tensorflow/tools/ci_build/builds/libtensorflow.sh b/tensorflow/tools/ci_build/builds/libtensorflow.sh
index 9b3ff0cba7dcacc0f68a417299c31f7a0f413430..44abcc309b9ff238059d6f298c42c7edb3fecd32 100755
--- a/tensorflow/tools/ci_build/builds/libtensorflow.sh
+++ b/tensorflow/tools/ci_build/builds/libtensorflow.sh
@@ -55,6 +55,7 @@ function build_libtensorflow_tarball() {
   export CC_OPT_FLAGS='-mavx'
   if [ "${TF_NEED_CUDA}" == "1" ]; then
     BAZEL_OPTS="${BAZEL_OPTS} --config=cuda"
+    export TF_NEED_ROCM=0
   fi
   bazel clean --expunge
   yes "" | ./configure
diff --git a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
index 177ef390dbd2f27a34f7a4e230f682b92648ca84..62e1eaa366865616c063d9f9785b863033a32706 100644
--- a/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/cpu/pip/build_tf_windows.sh
@@ -58,6 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -65,16 +67,32 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
-  case "$ARG" in
+#for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
+  case "$1" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
     --enable_remote_cache) set_remote_cache_options ;;
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -88,7 +106,11 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  EXTRA_PIP_FLAG="--nightly_flag"
+  if [ -z ${PROJECT_NAME} ]; then
+    EXTRA_PIP_FLAGS="--nightly_flag"
+  else
+    EXTRA_PIP_FLAGS="--project_name ${PROJECT_NAME} --nightly_flag"
+  fi
 fi
 
 # Enable short object file path to avoid long path issue on Windows.
@@ -100,7 +122,9 @@ fi
 
 run_configure_for_cpu_build
 
-bazel build --announce_rc --config=opt tensorflow/tools/pip_package:build_pip_package || exit $?
+bazel build --announce_rc --config=opt ${EXTRA_BUILD_FLAGS} \
+  tensorflow/tools/pip_package:build_pip_package \
+  --incompatible_remove_native_http_archive=false || exit $?
 
 if [[ "$SKIP_TEST" == 1 ]]; then
   exit 0
@@ -109,7 +133,7 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAG}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" "${EXTRA_PIP_FLAGS}"
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
@@ -126,8 +150,8 @@ N_JOBS="${NUMBER_OF_PROCESSORS}"
 # which will result testing system installed tensorflow
 bazel test --announce_rc --config=opt -k --test_output=errors \
   --define=no_tensorflow_py_deps=true --test_lang_filters=py \
-  --test_tag_filters=-no_pip,-no_windows,-no_oss \
-  --build_tag_filters=-no_pip,-no_windows,-no_oss --build_tests_only \
+  --test_tag_filters=-no_pip,-no_windows,-no_oss,-gpu \
+  --build_tag_filters=-no_pip,-no_windows,-no_oss,-gpu --build_tests_only \
   --test_size_filters=small,medium \
   --jobs="${N_JOBS}" --test_timeout="300,450,1200,3600" \
   --flaky_test_attempts=3 \
diff --git a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
index 6178d7794dfc1f510846d3097e7c2607a44c3014..acafd9ebce3afa634c1a1aafd4d9ac5c57935d80 100644
--- a/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
+++ b/tensorflow/tools/ci_build/windows/gpu/pip/build_tf_windows.sh
@@ -58,6 +58,8 @@ PY_TEST_DIR="py_test_dir"
 SKIP_TEST=0
 RELEASE_BUILD=0
 TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
+PROJECT_NAME=""
+EXTRA_BUILD_FLAGS=""
 
 # --skip_test            Skip running tests
 # --enable_remote_cache  Add options to enable remote cache for build and test
@@ -65,7 +67,7 @@ TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..."
 #                        ensure performance
 # --test_core_only       Use tensorflow/python/... as test target
 # --test_contrib_only    Use tensorflow/contrib/... as test target
-for ARG in "$@"; do
+while [[ $# -gt 0 ]]; do
   case "$ARG" in
     --tf_nightly) TF_NIGHTLY=1 ;;
     --skip_test) SKIP_TEST=1 ;;
@@ -73,8 +75,23 @@ for ARG in "$@"; do
     --release_build) RELEASE_BUILD=1 ;;
     --test_core_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/python/..." ;;
     --test_contrib_only) TEST_TARGET="//${PY_TEST_DIR}/tensorflow/contrib/..." ;;
+    --extra_build_flags)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      EXTRA_BUILD_FLAGS="$1"
+      ;;
+    --project_name)
+      shift
+      if [[ -z "$1" ]]; then
+        break
+      fi
+      PROJECT_NAME="$1"
+      ;;
     *)
   esac
+  shift
 done
 
 if [[ "$RELEASE_BUILD" == 1 ]]; then
@@ -88,7 +105,11 @@ fi
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   python tensorflow/tools/ci_build/update_version.py --nightly
-  EXTRA_PIP_FLAG="--nightly_flag"
+  if [ -z ${PROJECT_NAME} ]; then
+    EXTRA_PIP_FLAGS="--nightly_flag"
+  else
+    EXTRA_PIP_FLAGS="--project_name=${PROJECT_NAME} --nightly_flag"
+  fi
 fi
 
 # Enable short object file path to avoid long path issue on Windows.
@@ -104,6 +125,7 @@ fi
 run_configure_for_gpu_build
 
 bazel build --announce_rc --config=opt --define=no_tensorflow_py_deps=true \
+  ${EXTRA_BUILD_FLAGS} \
   tensorflow/tools/pip_package:build_pip_package || exit $?
 
 if [[ "$SKIP_TEST" == 1 ]]; then
@@ -113,7 +135,8 @@ fi
 # Create a python test directory to avoid package name conflict
 create_python_test_dir "${PY_TEST_DIR}"
 
-./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" --gpu "${EXTRA_PIP_FLAG}"
+./bazel-bin/tensorflow/tools/pip_package/build_pip_package "$PWD/${PY_TEST_DIR}" \
+  --gpu "${EXTRA_PIP_FLAGS}"
 
 if [[ "$TF_NIGHTLY" == 1 ]]; then
   exit 0
diff --git a/tensorflow/tools/compatibility/BUILD b/tensorflow/tools/compatibility/BUILD
index 5f619c4e62ab76047966316c227c1ca9e7a10ba7..a9902d77f5ec103fe2000a4a470d425e3998f45e 100644
--- a/tensorflow/tools/compatibility/BUILD
+++ b/tensorflow/tools/compatibility/BUILD
@@ -14,6 +14,18 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_test(
+    name = "ast_edits_test",
+    srcs = ["ast_edits_test.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ast_edits",
+        "//tensorflow/python:client_testlib",
+        "//tensorflow/python:framework_test_lib",
+        "@six_archive//:six",
+    ],
+)
+
 py_binary(
     name = "tf_upgrade",
     srcs = ["tf_upgrade.py"],
@@ -39,14 +51,32 @@ py_library(
     srcs_version = "PY2AND3",
 )
 
+py_library(
+    name = "reorders_v2",
+    srcs = ["reorders_v2.py"],
+    srcs_version = "PY2AND3",
+)
+
+py_library(
+    name = "tf_upgrade_v2_lib",
+    srcs = ["tf_upgrade_v2.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        ":ast_edits",
+        ":renames_v2",
+        ":reorders_v2",
+    ],
+)
+
 py_binary(
     name = "tf_upgrade_v2",
-    srcs = [
-        "renames_v2.py",
-        "tf_upgrade_v2.py",
-    ],
+    srcs = ["tf_upgrade_v2_main.py"],
+    main = "tf_upgrade_v2_main.py",
     srcs_version = "PY2AND3",
-    deps = [":ast_edits"],
+    deps = [
+        ":ast_edits",
+        ":tf_upgrade_v2_lib",
+    ],
 )
 
 py_test(
@@ -55,8 +85,11 @@ py_test(
     srcs_version = "PY2AND3",
     deps = [
         ":tf_upgrade_v2",
+        "//tensorflow:tensorflow_py",
         "//tensorflow/python:client_testlib",
         "//tensorflow/python:framework_test_lib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
         "@six_archive//:six",
     ],
 )
@@ -100,18 +133,28 @@ py_test(
 genrule(
     name = "generate_upgraded_file_v2",
     testonly = 1,
-    srcs = ["testdata/test_file_v1_10.py"],
+    srcs = ["testdata/test_file_v1_12.py"],
     outs = [
         "test_file_v2_0.py",
         "report_v2.txt",
     ],
     cmd = ("$(location :tf_upgrade_v2)" +
-           " --infile $(location testdata/test_file_v1_10.py)" +
+           " --infile $(location testdata/test_file_v1_12.py)" +
            " --outfile $(location test_file_v2_0.py)" +
            " --reportfile $(location report_v2.txt)"),
     tools = [":tf_upgrade_v2"],
 )
 
+py_test(
+    name = "test_file_v1_12",
+    size = "small",
+    srcs = ["testdata/test_file_v1_12.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+    ],
+)
+
 py_test(
     name = "test_file_v2_0",
     size = "small",
@@ -128,6 +171,6 @@ exports_files(
         "tf_upgrade.py",
         "renames_v2.py",
         "testdata/test_file_v0_11.py",
-        "testdata/test_file_v1_10.py",
+        "testdata/test_file_v1_12.py",
     ],
 )
diff --git a/tensorflow/tools/compatibility/README.md b/tensorflow/tools/compatibility/README.md
index aabc7b253d68eb43d3e6c1d5cecd55697a0cab59..6ff42b1fefe983d2119ddc7841d14d888443b49a 100644
--- a/tensorflow/tools/compatibility/README.md
+++ b/tensorflow/tools/compatibility/README.md
@@ -1,60 +1,77 @@
 # TensorFlow Python API Upgrade Utility
 
 This tool allows you to upgrade your existing TensorFlow Python scripts.
-This script can be run on a single Python file:
+Specifically: \
+`tf_upgrade_v2.py`: upgrades code from TensorFlow 1.12 to TensorFlow 2.0 preview. \
+`tf_upgrade.py`: upgrades code to TensorFlow 1.0 from TensorFlow 0.11.
+
+## Running the script from pip package
+
+First, install TensorFlow pip package. See
+https://www.tensorflow.org/install/pip.
+
+Upgrade script can be run on a single Python file:
 
 ```
-tf_upgrade.py --infile foo.py --outfile foo-upgraded.py
+tf_upgrade_v2 --infile foo.py --outfile foo-upgraded.py
 ```
 
 It will print a list of errors it finds that it can't fix. You can also run
 it on a directory tree:
 
 ```
+# upgrade the .py files and copy all the other files to the outtree
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded
+
 # just upgrade the .py files
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded
-# after upgrade the .py files, then copy all the other files to the outtree
-tf_upgrade.py --intree coolcode --outtree coolcode-upgraded --copyotherfiles True
+tf_upgrade_v2 --intree coolcode --outtree coolcode-upgraded --copyotherfiles False
 ```
 
-In either case, it will also dump out a report e.g. which will detail changes
+
+## Report
+
+The script will also dump out a report e.g. which will detail changes
 e.g.:
 
 ```
-third_party/tensorflow/tools/compatibility/test_file_v0.11.py Line 125
+'tensorflow/tools/compatibility/testdata/test_file_v1_12.py' Line 65
+--------------------------------------------------------------------------------
+
+Added keyword 'input' to reordered function 'tf.argmax'
+Renamed keyword argument from 'dimension' to 'axis'
 
-Renamed keyword argument from `dim` to `axis`
-Renamed keyword argument from `squeeze_dims` to `axis`
+    Old:         tf.argmax([[1, 3, 2]], dimension=0))
+                                        ~~~~~~~~~~
+    New:         tf.argmax(input=[[1, 3, 2]], axis=0))
 
-    Old:                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
-                                        ~~~~    ~~~~~~~~~~~~~
-    New:                   [[1, 2, 3]], axis=1), axis=[1]).eval(),
-                                        ~~~~~    ~~~~~
 ```
 
 ## Caveats
 
 - Don't update parts of your code manually before running this script. In
-particular, functions that have had reordered arguments like `tf.concat`
-or `tf.split` will cause the script to incorrectly add keyword arguments that
-mismap arguments.
+particular, functions that have had reordered arguments like `tf.argmax`
+or `tf.batch_to_space` will cause the script to incorrectly add keyword
+arguments that mismap arguments.
 
 - This script wouldn't actually reorder arguments. Instead, the script will add
 keyword arguments to functions that had their arguments reordered.
 
 - This script is not able to upgrade all functions. One notable example is
-`tf.reverse()` which has been changed to take a list of indices rather than
-a tensor of bools. If the script detects this, it will report this to stdout
+`tf.nn.conv2d` that no longer takes `use_cudnn_on_gpu` argument.
+If the script detects this, it will report this to stdout
 (and in the report), and you can fix it manually. For example if you have
-`tf.reverse(a, [False, True, True])` you will need to manually change it to
-`tf.reverse(a, [1, 2])`.
+`tf.nn.conv2d(inputs, filters, strides, padding, use_cudnn_on_gpu=True)`
+you will need to manually change it to
+`tf.nn.conv2d(input, filters, strides, padding)`.
 
 - There are some syntaxes that are not handleable with this script as this
-script was designed to use only standard python packages. If the script fails
-with "A necessary keyword argument failed to be inserted." or
+script was designed to use only standard python packages.
+There is an alternative available for TensorFlow 0.* to 1.0 upgrade script.
+If the script fails with "A necessary keyword argument failed to be inserted." or
 "Failed to find keyword lexicographically. Fix manually.", you can try
 [@machrisaa's fork of this script](https://github.com/machrisaa/tf0to1).
 [@machrisaa](https://github.com/machrisaa) has used the
 [RedBaron Python refactoring engine](https://redbaron.readthedocs.io/en/latest/)
 which is able to localize syntactic elements more reliably than the built-in
-`ast` module this script is based upon.
+`ast` module this script is based upon. Note that the alternative script is not
+available for TensorFlow 2.0 upgrade.
diff --git a/tensorflow/tools/compatibility/ast_edits.py b/tensorflow/tools/compatibility/ast_edits.py
index 56c67b8356524e9169f59d24af6d455e2cd82706..eac2150502d6511da127a42fbb46c92bea7fe364 100644
--- a/tensorflow/tools/compatibility/ast_edits.py
+++ b/tensorflow/tools/compatibility/ast_edits.py
@@ -21,11 +21,16 @@ from __future__ import print_function
 import ast
 import collections
 import os
+import re
 import shutil
 import sys
 import tempfile
 import traceback
 
+# Some regular expressions we will need for parsing
+FIND_OPEN = re.compile(r"^\s*(\[).*$")
+FIND_STRING_CHARS = re.compile(r"['\"]")
+
 
 class APIChangeSpec(object):
   """This class defines the transformations that need to happen.
@@ -40,6 +45,10 @@ class APIChangeSpec(object):
   * `function_reorders`: maps functions whose argument order has changed to the
     list of arguments in the new order
   * `function_handle`: maps function names to custom handlers for the function
+  * `function_warnings`: maps full names of functions to warnings that will be
+    printed out if the function is used. (e.g. tf.nn.convolution())
+  * `unrestricted_function_warnings`: maps names of functions to warnings that
+    will be printed out when the function is used (e.g. foo.convolution()).
 
   For an example, see `TFAPIChangeSpec`.
   """
@@ -53,7 +62,7 @@ class _FileEditTuple(
   Fields:
     comment: A description of the edit and why it was made.
     line: The line number in the file where the edit occurs (1-indexed).
-    start: The line number in the file where the edit occurs (0-indexed).
+    start: The column number in the file where the edit occurs (0-indexed).
     old: text string to remove (this must match what was in file).
     new: text string to add in place of `old`.
   """
@@ -195,6 +204,29 @@ class _ASTCallVisitor(ast.NodeVisitor):
     except KeyError:
       pass
 
+  def _print_warning_for_function_unrestricted(self, node):
+    """Print a warning when specific functions are called.
+
+    The function _print_warning_for_function matches the full name of the called
+    function, e.g., tf.foo.bar(). This function matches the function name that
+    is called, as long as the function is an attribute. For example,
+    `tf.foo.bar()` and `foo.bar()` are matched, but not `bar()`.
+
+    Args:
+      node: ast.Call object
+    """
+    function_warnings = getattr(
+        self._api_change_spec, "unrestricted_function_warnings", {})
+    if isinstance(node.func, ast.Attribute):
+      function_name = node.func.attr
+      try:
+        warning_message = function_warnings[function_name]
+        self._file_edit.add(warning_message,
+                            node.lineno, node.col_offset, "", "",
+                            error="%s requires manual check." % function_name)
+      except KeyError:
+        pass
+
   def _get_attribute_full_path(self, node):
     """Traverse an attribute to generate a full name e.g. tf.foo.bar.
 
@@ -209,11 +241,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
     items = []
     while not isinstance(curr, ast.Name):
       if not isinstance(curr, ast.Attribute):
-        return None
+        return None, None
       items.append(curr.attr)
       curr = curr.value
     items.append(curr.id)
-    return ".".join(reversed(items))
+    return ".".join(reversed(items)), items[0]
 
   def _find_true_position(self, node):
     """Return correct line number and column offset for a given node.
@@ -221,13 +253,12 @@ class _ASTCallVisitor(ast.NodeVisitor):
     This is necessary mainly because ListComp's location reporting reports
     the next token after the list comprehension list opening.
 
+    Returns:
+      lineno, offset for the given node
+
     Args:
       node: Node for which we wish to know the lineno and col_offset
     """
-    import re
-    find_open = re.compile("^\s*(\\[).*$")
-    find_string_chars = re.compile("['\"]")
-
     if isinstance(node, ast.ListComp):
       # Strangely, ast.ListComp returns the col_offset of the first token
       # after the '[' token which appears to be a bug. Workaround by
@@ -241,7 +272,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
         reversed_preceding_text = text[:col][::-1]
         # First find if a [ can be found with only whitespace between it and
         # col.
-        m = find_open.match(reversed_preceding_text)
+        m = FIND_OPEN.match(reversed_preceding_text)
         if m:
           new_col_offset = col - m.start(1) - 1
           return line, new_col_offset
@@ -260,7 +291,7 @@ class _ASTCallVisitor(ast.NodeVisitor):
             comment_start = prev_line.find("#")
             if comment_start == -1:
               col = len(prev_line) - 1
-            elif find_string_chars.search(prev_line[comment_start:]) is None:
+            elif FIND_STRING_CHARS.search(prev_line[comment_start:]) is None:
               col = comment_start
             else:
               return None, None
@@ -276,9 +307,10 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Current Node
     """
+    self._print_warning_for_function_unrestricted(node)
 
     # Find a simple attribute name path e.g. "tf.foo.bar"
-    full_name = self._get_attribute_full_path(node.func)
+    full_name, name = self._get_attribute_full_path(node.func)
 
     # Make sure the func is marked as being part of a call
     node.func.is_function_for_call = True
@@ -286,6 +318,9 @@ class _ASTCallVisitor(ast.NodeVisitor):
     if full_name:
       # Call special handlers
       function_handles = self._api_change_spec.function_handle
+      glob_name = "*.{}".format(name)
+      if glob_name in function_handles:
+        function_handles[glob_name](self._file_edit, node)
       if full_name in function_handles:
         function_handles[full_name](self._file_edit, node)
 
@@ -358,10 +393,11 @@ class _ASTCallVisitor(ast.NodeVisitor):
     Args:
       node: Node that is of type ast.Attribute
     """
-    full_name = self._get_attribute_full_path(node)
+    full_name, _ = self._get_attribute_full_path(node)
     if full_name:
-      self._rename_functions(node, full_name)
+      # Make sure the warning comes first, otherwise the name may have changed
       self._print_warning_for_function(node, full_name)
+      self._rename_functions(node, full_name)
     if full_name in self._api_change_spec.change_to_function:
       if not hasattr(node, "is_function_for_call"):
         new_text = full_name + "()"
diff --git a/tensorflow/tools/compatibility/ast_edits_test.py b/tensorflow/tools/compatibility/ast_edits_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..99f20a026fcb9b60e0d4365dd2690946f0d833fc
--- /dev/null
+++ b/tensorflow/tools/compatibility/ast_edits_test.py
@@ -0,0 +1,420 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for ast_edits which is used in tf upgraders.
+
+All of the tests assume that we want to change from an API containing
+
+    def f(a, b, kw1, kw2): ...
+    def g(a, b, kw1, c, kw1_alias): ...
+    def g2(a, b, kw1, c, d, kw1_alias): ...
+    def h(a, kw1, kw2, kw1_alias, kw2_alias): ...
+
+and the changes to the API consist of renaming, reordering, and/or removing
+arguments. Thus, we want to be able to generate changes to produce each of the
+following new APIs:
+
+    def f(a, b, kw1, kw3): ...
+    def f(a, b, kw2, kw1): ...
+    def f(a, b, kw3, kw1): ...
+    def g(a, b, kw1, c): ...
+    def g(a, b, c, kw1): ...
+    def g2(a, b, kw1, c, d): ...
+    def g2(a, b, c, d, kw1): ...
+    def h(a, kw1, kw2): ...
+
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import six
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+from tensorflow.tools.compatibility import ast_edits
+
+
+class NoUpdateSpec(ast_edits.APIChangeSpec):
+  """A specification of an API change which doesn't change anything."""
+
+  def __init__(self):
+    self.function_handle = {}
+    self.function_reorders = {}
+    self.function_keyword_renames = {}
+    self.symbol_renames = {}
+    self.function_warnings = {}
+    self.unrestricted_function_warnings = {}
+    self.change_to_function = {}
+
+
+class RenameKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets renamed to kw3.
+
+  The new API is
+
+    def f(a, b, kw1, kw3): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_renames()
+
+  def update_renames(self):
+    self.function_keyword_renames["f"] = {"kw2": "kw3"}
+
+
+class ReorderKeywordSpec(NoUpdateSpec):
+  """A specification where kw2 gets moved in front of kw1.
+
+  The new API is
+
+    def f(a, b, kw2, kw1): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.update_reorders()
+
+  def update_reorders(self):
+    # Note that these should be in the old order.
+    self.function_reorders["f"] = ["a", "b", "kw1", "kw2"]
+
+
+class ReorderAndRenameKeywordSpec(ReorderKeywordSpec, RenameKeywordSpec):
+  """A specification where kw2 gets moved in front of kw1 and is changed to kw3.
+
+  The new API is
+
+    def f(a, b, kw3, kw1): ...
+
+  """
+
+  def __init__(self):
+    ReorderKeywordSpec.__init__(self)
+    RenameKeywordSpec.__init__(self)
+    self.update_renames()
+    self.update_reorders()
+
+
+class RemoveDeprecatedAliasKeyword(NoUpdateSpec):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, kw1, c): ...
+    def g2(a, b, kw1, c, d): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["g"] = {"kw1_alias": "kw1"}
+    self.function_keyword_renames["g2"] = {"kw1_alias": "kw1"}
+
+
+class RemoveDeprecatedAliasAndReorderRest(RemoveDeprecatedAliasKeyword):
+  """A specification where kw1_alias is removed in g.
+
+  The new API is
+
+    def g(a, b, c, kw1): ...
+    def g2(a, b, c, d, kw1): ...
+
+  """
+
+  def __init__(self):
+    RemoveDeprecatedAliasKeyword.__init__(self)
+    # Note that these should be in the old order.
+    self.function_reorders["g"] = ["a", "b", "kw1", "c"]
+    self.function_reorders["g2"] = ["a", "b", "kw1", "c", "d"]
+
+
+class RemoveMultipleKeywordArguments(NoUpdateSpec):
+  """A specification where both keyword aliases are removed from h.
+
+  The new API is
+
+    def h(a, kw1, kw2): ...
+
+  """
+
+  def __init__(self):
+    NoUpdateSpec.__init__(self)
+    self.function_keyword_renames["h"] = {
+        "kw1_alias": "kw1",
+        "kw2_alias": "kw2",
+    }
+
+
+class TestAstEdits(test_util.TensorFlowTestCase):
+
+  def _upgrade(self, spec, old_file_text):
+    in_file = six.StringIO(old_file_text)
+    out_file = six.StringIO()
+    upgrader = ast_edits.ASTCodeUpgrader(spec)
+    count, report, errors = (
+        upgrader.process_opened_file("test.py", in_file,
+                                     "test_out.py", out_file))
+    return (count, report, errors), out_file.getvalue()
+
+  def testNoTransformIfNothingIsSupplied(self):
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(NoUpdateSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordRename(self):
+    """Test that we get the expected result if renaming kw2 to kw3."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    expected = "f(a, b, kw1=c, kw3=d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, expected)
+
+    # No keywords specified, no reordering, so we should get input as output
+    text = "f(a, b, c, d)\n"
+    _, new_text = self._upgrade(RenameKeywordSpec(), text)
+    self.assertEqual(new_text, text)
+
+  def testKeywordReorder(self):
+    """Test that we get the expected result if kw2 is now before kw1."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        # No change is a valid output
+        text,
+        # Just reordering the kw.. args is also ok
+        "f(a, b, kw2=d, kw1=c)\n",
+        # Also cases where all arguments are fully specified are allowed
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw2=d)\n",
+        "f(a=a, b=b, kw2=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testKeywordReorderAndRename(self):
+    """Test that we get the expected result if kw2 is renamed and moved."""
+    text = "f(a, b, kw1=c, kw2=d)\n"
+    acceptable_outputs = [
+        "f(a, b, kw3=d, kw1=c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "f(a, b, c, d)\n"
+    acceptable_outputs = [
+        "f(a, b, d, c)\n",
+        "f(a=a, b=b, kw1=c, kw3=d)\n",
+        "f(a=a, b=b, kw3=d, kw1=c)\n",
+    ]
+    _, new_text = self._upgrade(ReorderAndRenameKeywordSpec(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAlias(self):
+    """Test that we get the expected result if a keyword alias is removed."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        # Not using deprecated alias, so original is ok
+        text,
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # No keyword used, should be no change
+    text = "g(a, b, x, c)\n"
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertEqual(new_text, text)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder(self):
+    """Test for when a keyword alias is removed and args are reordered."""
+    text = "g(a, b, kw1=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g(a, b, x, c)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g(a, b, c, x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g(a, b, kw1_alias=x, c=c)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's last
+    text = "g(a, b, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g(a, b, kw1=x, c=c)\n",
+        "g(a, b, c=c, kw1=x)\n",
+        "g(a=a, b=b, kw1=x, c=c)\n",
+        "g(a=a, b=b, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveDeprecatedKeywordAndReorder2(self):
+    """Same as testRemoveDeprecatedKeywordAndReorder but on g2 (more args)."""
+    text = "g2(a, b, kw1=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Keywords are reordered, so we should reorder arguments too
+    text = "g2(a, b, x, c, d)\n"
+    # Don't accept an output which doesn't reorder c and d
+    acceptable_outputs = [
+        "g2(a, b, c, d, x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasAndReorderRest(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # If we used the alias, it should get renamed
+    text = "g2(a, b, kw1_alias=x, c=c, d=d)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # It should get renamed and reordered even if it's not in order
+    text = "g2(a, b, d=d, c=c, kw1_alias=x)\n"
+    acceptable_outputs = [
+        "g2(a, b, kw1=x, c=c, d=d)\n",
+        "g2(a, b, c=c, d=d, kw1=x)\n",
+        "g2(a, b, d=d, c=c, kw1=x)\n",
+        "g2(a=a, b=b, kw1=x, c=c, d=d)\n",
+        "g2(a=a, b=b, c=c, d=d, kw1=x)\n",
+        "g2(a=a, b=b, d=d, c=c, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveDeprecatedAliasKeyword(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testRemoveMultipleKeywords(self):
+    """Remove multiple keywords at once."""
+    # Not using deprecated keywords -> no rename
+    text = "h(a, kw1=x, kw2=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Using positional arguments (in proper order) -> no change
+    text = "h(a, x, y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertEqual(new_text, text)
+
+    # Use only the old names, in order
+    text = "h(a, kw1_alias=x, kw2_alias=y)\n"
+    acceptable_outputs = [
+        "h(a, x, y)\n",
+        "h(a, kw1=x, kw2=y)\n",
+        "h(a=a, kw1=x, kw2=y)\n",
+        "h(a, kw2=y, kw1=x)\n",
+        "h(a=a, kw2=y, kw1=x)\n",
+    ]
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Use only the old names, in reverse order, should give one of same outputs
+    text = "h(a, kw2_alias=y, kw1_alias=x)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+    # Mix old and new names
+    text = "h(a, kw1=x, kw2_alias=y)\n"
+    _, new_text = self._upgrade(RemoveMultipleKeywordArguments(), text)
+    self.assertIn(new_text, acceptable_outputs)
+
+  def testUnrestrictedFunctionWarnings(self):
+    class FooWarningSpec(NoUpdateSpec):
+      """Usages of function attribute foo() prints out a warning."""
+
+      def __init__(self):
+        NoUpdateSpec.__init__(self)
+        self.unrestricted_function_warnings = {"foo": "not good"}
+    texts = ["object.foo()", "get_object().foo()",
+             "get_object().foo()", "object.foo().bar()"]
+    for text in texts:
+      (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
+      self.assertIn("not good", report)
+
+    # Note that foo() won't result in a warning, because in this case foo is
+    # not an attribute, but a name.
+    false_alarms = ["foo", "foo()", "foo.bar()", "obj.run_foo()", "obj.foo"]
+    for text in false_alarms:
+      (_, report, _), _ = self._upgrade(FooWarningSpec(), text)
+      self.assertNotIn("not good", report)
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/renames_v2.py b/tensorflow/tools/compatibility/renames_v2.py
index 5ea2fbcc4cd2134f97f820bdbd48d5d52a20a8b3..b757ad4647c6d92e21feccd7d90da887df379531 100644
--- a/tensorflow/tools/compatibility/renames_v2.py
+++ b/tensorflow/tools/compatibility/renames_v2.py
@@ -26,22 +26,38 @@ from __future__ import print_function
 
 renames = {
     'tf.AUTO_REUSE': 'tf.compat.v1.AUTO_REUSE',
+    'tf.AttrValue': 'tf.compat.v1.AttrValue',
     'tf.COMPILER_VERSION': 'tf.version.COMPILER_VERSION',
     'tf.CXX11_ABI_FLAG': 'tf.sysconfig.CXX11_ABI_FLAG',
+    'tf.ConditionalAccumulator': 'tf.compat.v1.ConditionalAccumulator',
+    'tf.ConditionalAccumulatorBase': 'tf.compat.v1.ConditionalAccumulatorBase',
+    'tf.ConfigProto': 'tf.compat.v1.ConfigProto',
+    'tf.DeviceSpec': 'tf.compat.v1.DeviceSpec',
+    'tf.Dimension': 'tf.compat.v1.Dimension',
     'tf.FixedLenFeature': 'tf.io.FixedLenFeature',
     'tf.FixedLenSequenceFeature': 'tf.io.FixedLenSequenceFeature',
     'tf.FixedLengthRecordReader': 'tf.compat.v1.FixedLengthRecordReader',
     'tf.GIT_VERSION': 'tf.version.GIT_VERSION',
+    'tf.GPUOptions': 'tf.compat.v1.GPUOptions',
     'tf.GRAPH_DEF_VERSION': 'tf.version.GRAPH_DEF_VERSION',
     'tf.GRAPH_DEF_VERSION_MIN_CONSUMER': 'tf.version.GRAPH_DEF_VERSION_MIN_CONSUMER',
     'tf.GRAPH_DEF_VERSION_MIN_PRODUCER': 'tf.version.GRAPH_DEF_VERSION_MIN_PRODUCER',
+    'tf.GraphDef': 'tf.compat.v1.GraphDef',
+    'tf.GraphKeys': 'tf.compat.v1.GraphKeys',
+    'tf.GraphOptions': 'tf.compat.v1.GraphOptions',
+    'tf.HistogramProto': 'tf.compat.v1.HistogramProto',
     'tf.IdentityReader': 'tf.compat.v1.IdentityReader',
     'tf.InteractiveSession': 'tf.compat.v1.InteractiveSession',
     'tf.LMDBReader': 'tf.compat.v1.LMDBReader',
+    'tf.LogMessage': 'tf.compat.v1.LogMessage',
     'tf.MONOLITHIC_BUILD': 'tf.sysconfig.MONOLITHIC_BUILD',
+    'tf.MetaGraphDef': 'tf.compat.v1.MetaGraphDef',
+    'tf.NameAttrList': 'tf.compat.v1.NameAttrList',
     'tf.NoGradient': 'tf.no_gradient',
+    'tf.NodeDef': 'tf.compat.v1.NodeDef',
     'tf.NotDifferentiable': 'tf.no_gradient',
     'tf.OpError': 'tf.errors.OpError',
+    'tf.OptimizerOptions': 'tf.compat.v1.OptimizerOptions',
     'tf.PaddingFIFOQueue': 'tf.io.PaddingFIFOQueue',
     'tf.Print': 'tf.compat.v1.Print',
     'tf.PriorityQueue': 'tf.io.PriorityQueue',
@@ -49,16 +65,18 @@ renames = {
     'tf.QueueBase': 'tf.io.QueueBase',
     'tf.RandomShuffleQueue': 'tf.io.RandomShuffleQueue',
     'tf.ReaderBase': 'tf.compat.v1.ReaderBase',
+    'tf.RunMetadata': 'tf.compat.v1.RunMetadata',
+    'tf.RunOptions': 'tf.compat.v1.RunOptions',
     'tf.Session': 'tf.compat.v1.Session',
+    'tf.SessionLog': 'tf.compat.v1.SessionLog',
     'tf.SparseConditionalAccumulator': 'tf.sparse.SparseConditionalAccumulator',
     'tf.SparseFeature': 'tf.io.SparseFeature',
+    'tf.SparseTensorValue': 'tf.compat.v1.SparseTensorValue',
     'tf.TFRecordReader': 'tf.compat.v1.TFRecordReader',
-    'tf.TensorShape': 'tf.compat.v1.TensorShape',
+    'tf.TensorInfo': 'tf.compat.v1.TensorInfo',
     'tf.TextLineReader': 'tf.compat.v1.TextLineReader',
     'tf.VERSION': 'tf.version.VERSION',
     'tf.VarLenFeature': 'tf.io.VarLenFeature',
-    'tf.Variable': 'tf.compat.v1.Variable',
-    'tf.VariableAggregation': 'tf.compat.v1.VariableAggregation',
     'tf.VariableScope': 'tf.compat.v1.VariableScope',
     'tf.WholeFileReader': 'tf.compat.v1.WholeFileReader',
     'tf.accumulate_n': 'tf.math.accumulate_n',
@@ -67,59 +85,66 @@ renames = {
     'tf.add_to_collections': 'tf.compat.v1.add_to_collections',
     'tf.all_variables': 'tf.compat.v1.all_variables',
     'tf.angle': 'tf.math.angle',
-    'tf.argmax': 'tf.compat.v1.argmax',
-    'tf.argmin': 'tf.compat.v1.argmin',
-    'tf.assert_greater_equal': 'tf.debugging.assert_greater_equal',
-    'tf.assert_integer': 'tf.debugging.assert_integer',
-    'tf.assert_less_equal': 'tf.debugging.assert_less_equal',
-    'tf.assert_near': 'tf.debugging.assert_near',
-    'tf.assert_negative': 'tf.debugging.assert_negative',
-    'tf.assert_non_negative': 'tf.debugging.assert_non_negative',
-    'tf.assert_non_positive': 'tf.debugging.assert_non_positive',
-    'tf.assert_none_equal': 'tf.debugging.assert_none_equal',
-    'tf.assert_positive': 'tf.debugging.assert_positive',
+    'tf.app.run': 'tf.compat.v1.app.run',
+    'tf.assert_greater_equal': 'tf.compat.v1.assert_greater_equal',
+    'tf.assert_integer': 'tf.compat.v1.assert_integer',
+    'tf.assert_less_equal': 'tf.compat.v1.assert_less_equal',
+    'tf.assert_near': 'tf.compat.v1.assert_near',
+    'tf.assert_negative': 'tf.compat.v1.assert_negative',
+    'tf.assert_non_negative': 'tf.compat.v1.assert_non_negative',
+    'tf.assert_non_positive': 'tf.compat.v1.assert_non_positive',
+    'tf.assert_none_equal': 'tf.compat.v1.assert_none_equal',
+    'tf.assert_positive': 'tf.compat.v1.assert_positive',
     'tf.assert_proper_iterable': 'tf.debugging.assert_proper_iterable',
-    'tf.assert_rank_at_least': 'tf.debugging.assert_rank_at_least',
-    'tf.assert_rank_in': 'tf.debugging.assert_rank_in',
+    'tf.assert_rank_at_least': 'tf.compat.v1.assert_rank_at_least',
+    'tf.assert_rank_in': 'tf.compat.v1.assert_rank_in',
     'tf.assert_same_float_dtype': 'tf.debugging.assert_same_float_dtype',
-    'tf.assert_scalar': 'tf.debugging.assert_scalar',
-    'tf.assert_type': 'tf.debugging.assert_type',
+    'tf.assert_scalar': 'tf.compat.v1.assert_scalar',
+    'tf.assert_type': 'tf.compat.v1.assert_type',
     'tf.assert_variables_initialized': 'tf.compat.v1.assert_variables_initialized',
     'tf.assign': 'tf.compat.v1.assign',
     'tf.assign_add': 'tf.compat.v1.assign_add',
     'tf.assign_sub': 'tf.compat.v1.assign_sub',
+    'tf.batch_scatter_update': 'tf.compat.v1.batch_scatter_update',
     'tf.betainc': 'tf.math.betainc',
-    'tf.bincount': 'tf.math.bincount',
     'tf.ceil': 'tf.math.ceil',
     'tf.check_numerics': 'tf.debugging.check_numerics',
     'tf.cholesky': 'tf.linalg.cholesky',
     'tf.cholesky_solve': 'tf.linalg.cholesky_solve',
+    'tf.clip_by_average_norm': 'tf.compat.v1.clip_by_average_norm',
     'tf.colocate_with': 'tf.compat.v1.colocate_with',
-    'tf.confusion_matrix': 'tf.math.confusion_matrix',
     'tf.conj': 'tf.math.conj',
     'tf.container': 'tf.compat.v1.container',
-    'tf.convert_to_tensor': 'tf.compat.v1.convert_to_tensor',
     'tf.convert_to_tensor_or_indexed_slices': 'tf.compat.v1.convert_to_tensor_or_indexed_slices',
     'tf.convert_to_tensor_or_sparse_tensor': 'tf.compat.v1.convert_to_tensor_or_sparse_tensor',
+    'tf.count_nonzero': 'tf.compat.v1.count_nonzero',
     'tf.count_up_to': 'tf.compat.v1.count_up_to',
+    'tf.create_partitioned_variables': 'tf.compat.v1.create_partitioned_variables',
     'tf.cross': 'tf.linalg.cross',
     'tf.cumprod': 'tf.math.cumprod',
+    'tf.data.make_initializable_iterator': 'tf.compat.v1.data.make_initializable_iterator',
+    'tf.data.make_one_shot_iterator': 'tf.compat.v1.data.make_one_shot_iterator',
+    'tf.debugging.is_finite': 'tf.math.is_finite',
+    'tf.debugging.is_inf': 'tf.math.is_inf',
+    'tf.debugging.is_nan': 'tf.math.is_nan',
+    'tf.debugging.is_non_decreasing': 'tf.math.is_non_decreasing',
+    'tf.debugging.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.decode_base64': 'tf.io.decode_base64',
     'tf.decode_compressed': 'tf.io.decode_compressed',
-    'tf.decode_csv': 'tf.io.decode_csv',
     'tf.decode_json_example': 'tf.io.decode_json_example',
     'tf.decode_raw': 'tf.io.decode_raw',
     'tf.delete_session_tensor': 'tf.compat.v1.delete_session_tensor',
-    'tf.depth_to_space': 'tf.nn.depth_to_space',
+    'tf.depth_to_space': 'tf.compat.v1.depth_to_space',
     'tf.dequantize': 'tf.quantization.dequantize',
     'tf.deserialize_many_sparse': 'tf.io.deserialize_many_sparse',
-    'tf.device': 'tf.compat.v1.device',
     'tf.diag': 'tf.linalg.tensor_diag',
     'tf.diag_part': 'tf.linalg.tensor_diag_part',
     'tf.digamma': 'tf.math.digamma',
     'tf.dimension_at_index': 'tf.compat.v1.dimension_at_index',
     'tf.dimension_value': 'tf.compat.v1.dimension_value',
+    'tf.disable_eager_execution': 'tf.compat.v1.disable_eager_execution',
     'tf.disable_resource_variables': 'tf.compat.v1.disable_resource_variables',
+    'tf.disable_v2_behavior': 'tf.compat.v1.disable_v2_behavior',
     'tf.disable_v2_tensorshape': 'tf.compat.v1.disable_v2_tensorshape',
     'tf.distributions.Bernoulli': 'tf.compat.v1.distributions.Bernoulli',
     'tf.distributions.Beta': 'tf.compat.v1.distributions.Beta',
@@ -139,30 +164,41 @@ renames = {
     'tf.distributions.StudentT': 'tf.compat.v1.distributions.StudentT',
     'tf.distributions.Uniform': 'tf.compat.v1.distributions.Uniform',
     'tf.distributions.kl_divergence': 'tf.compat.v1.distributions.kl_divergence',
+    'tf.div': 'tf.compat.v1.div',
+    'tf.enable_eager_execution': 'tf.compat.v1.enable_eager_execution',
     'tf.enable_resource_variables': 'tf.compat.v1.enable_resource_variables',
+    'tf.enable_v2_behavior': 'tf.compat.v1.enable_v2_behavior',
     'tf.enable_v2_tensorshape': 'tf.compat.v1.enable_v2_tensorshape',
     'tf.encode_base64': 'tf.io.encode_base64',
     'tf.erf': 'tf.math.erf',
     'tf.erfc': 'tf.math.erfc',
     'tf.expm1': 'tf.math.expm1',
-    'tf.extract_image_patches': 'tf.image.extract_image_patches',
     'tf.fake_quant_with_min_max_args': 'tf.quantization.fake_quant_with_min_max_args',
     'tf.fake_quant_with_min_max_args_gradient': 'tf.quantization.fake_quant_with_min_max_args_gradient',
     'tf.fake_quant_with_min_max_vars': 'tf.quantization.fake_quant_with_min_max_vars',
     'tf.fake_quant_with_min_max_vars_gradient': 'tf.quantization.fake_quant_with_min_max_vars_gradient',
     'tf.fake_quant_with_min_max_vars_per_channel': 'tf.quantization.fake_quant_with_min_max_vars_per_channel',
     'tf.fake_quant_with_min_max_vars_per_channel_gradient': 'tf.quantization.fake_quant_with_min_max_vars_per_channel_gradient',
+    'tf.feature_column.input_layer': 'tf.compat.v1.feature_column.input_layer',
+    'tf.feature_column.linear_model': 'tf.compat.v1.feature_column.linear_model',
     'tf.fft': 'tf.signal.fft',
     'tf.fft2d': 'tf.signal.fft2d',
     'tf.fft3d': 'tf.signal.fft3d',
+    'tf.fixed_size_partitioner': 'tf.compat.v1.fixed_size_partitioner',
     'tf.floordiv': 'tf.math.floordiv',
+    'tf.get_collection': 'tf.compat.v1.get_collection',
+    'tf.get_collection_ref': 'tf.compat.v1.get_collection_ref',
+    'tf.get_default_graph': 'tf.compat.v1.get_default_graph',
     'tf.get_default_session': 'tf.compat.v1.get_default_session',
     'tf.get_local_variable': 'tf.compat.v1.get_local_variable',
-    'tf.get_seed': 'tf.random.get_seed',
+    'tf.get_seed': 'tf.compat.v1.get_seed',
     'tf.get_session_handle': 'tf.compat.v1.get_session_handle',
     'tf.get_session_tensor': 'tf.compat.v1.get_session_tensor',
     'tf.get_variable': 'tf.compat.v1.get_variable',
     'tf.get_variable_scope': 'tf.compat.v1.get_variable_scope',
+    'tf.gfile.FastGFile': 'tf.compat.v1.gfile.FastGFile',
+    'tf.gfile.GFile': 'tf.compat.v1.gfile.GFile',
+    'tf.gfile.Open': 'tf.compat.v1.gfile.Open',
     'tf.global_norm': 'tf.linalg.global_norm',
     'tf.global_variables': 'tf.compat.v1.global_variables',
     'tf.global_variables_initializer': 'tf.compat.v1.global_variables_initializer',
@@ -178,6 +214,12 @@ renames = {
     'tf.igamma': 'tf.math.igamma',
     'tf.igammac': 'tf.math.igammac',
     'tf.imag': 'tf.math.imag',
+    'tf.image.resize_area': 'tf.compat.v1.image.resize_area',
+    'tf.image.resize_bicubic': 'tf.compat.v1.image.resize_bicubic',
+    'tf.image.resize_bilinear': 'tf.compat.v1.image.resize_bilinear',
+    'tf.image.resize_images': 'tf.compat.v1.image.resize_images',
+    'tf.image.resize_nearest_neighbor': 'tf.compat.v1.image.resize_nearest_neighbor',
+    'tf.image.transpose_image': 'tf.compat.v1.image.transpose_image',
     'tf.initialize_all_tables': 'tf.compat.v1.initialize_all_tables',
     'tf.initialize_all_variables': 'tf.compat.v1.initialize_all_variables',
     'tf.initialize_local_variables': 'tf.compat.v1.initialize_local_variables',
@@ -187,12 +229,13 @@ renames = {
     'tf.initializers.tables_initializer': 'tf.compat.v1.initializers.tables_initializer',
     'tf.initializers.variables': 'tf.compat.v1.initializers.variables',
     'tf.invert_permutation': 'tf.math.invert_permutation',
-    'tf.is_finite': 'tf.debugging.is_finite',
-    'tf.is_inf': 'tf.debugging.is_inf',
-    'tf.is_nan': 'tf.debugging.is_nan',
-    'tf.is_non_decreasing': 'tf.debugging.is_non_decreasing',
+    'tf.io.tf_record_iterator': 'tf.compat.v1.io.tf_record_iterator',
+    'tf.is_finite': 'tf.math.is_finite',
+    'tf.is_inf': 'tf.math.is_inf',
+    'tf.is_nan': 'tf.math.is_nan',
+    'tf.is_non_decreasing': 'tf.math.is_non_decreasing',
     'tf.is_numeric_tensor': 'tf.debugging.is_numeric_tensor',
-    'tf.is_strictly_increasing': 'tf.debugging.is_strictly_increasing',
+    'tf.is_strictly_increasing': 'tf.math.is_strictly_increasing',
     'tf.is_variable_initialized': 'tf.compat.v1.is_variable_initialized',
     'tf.keras.backend.get_session': 'tf.compat.v1.keras.backend.get_session',
     'tf.layers.AveragePooling1D': 'tf.compat.v1.layers.AveragePooling1D',
@@ -235,13 +278,46 @@ renames = {
     'tf.layers.separable_conv2d': 'tf.compat.v1.layers.separable_conv2d',
     'tf.lbeta': 'tf.math.lbeta',
     'tf.lgamma': 'tf.math.lgamma',
+    'tf.lin_space': 'tf.linspace',
     'tf.local_variables': 'tf.compat.v1.local_variables',
     'tf.local_variables_initializer': 'tf.compat.v1.local_variables_initializer',
+    'tf.log': 'tf.math.log',
+    'tf.log1p': 'tf.math.log1p',
     'tf.log_sigmoid': 'tf.math.log_sigmoid',
+    'tf.logging.DEBUG': 'tf.compat.v1.logging.DEBUG',
+    'tf.logging.ERROR': 'tf.compat.v1.logging.ERROR',
+    'tf.logging.FATAL': 'tf.compat.v1.logging.FATAL',
+    'tf.logging.INFO': 'tf.compat.v1.logging.INFO',
+    'tf.logging.TaskLevelStatusMessage': 'tf.compat.v1.logging.TaskLevelStatusMessage',
+    'tf.logging.WARN': 'tf.compat.v1.logging.WARN',
+    'tf.logging.debug': 'tf.compat.v1.logging.debug',
+    'tf.logging.error': 'tf.compat.v1.logging.error',
+    'tf.logging.fatal': 'tf.compat.v1.logging.fatal',
+    'tf.logging.flush': 'tf.compat.v1.logging.flush',
+    'tf.logging.get_verbosity': 'tf.compat.v1.logging.get_verbosity',
+    'tf.logging.info': 'tf.compat.v1.logging.info',
+    'tf.logging.log': 'tf.compat.v1.logging.log',
+    'tf.logging.log_every_n': 'tf.compat.v1.logging.log_every_n',
+    'tf.logging.log_first_n': 'tf.compat.v1.logging.log_first_n',
+    'tf.logging.log_if': 'tf.compat.v1.logging.log_if',
+    'tf.logging.set_verbosity': 'tf.compat.v1.logging.set_verbosity',
+    'tf.logging.vlog': 'tf.compat.v1.logging.vlog',
+    'tf.logging.warn': 'tf.compat.v1.logging.warn',
+    'tf.logging.warning': 'tf.compat.v1.logging.warning',
     'tf.logical_xor': 'tf.math.logical_xor',
+    'tf.losses.absolute_difference': 'tf.compat.v1.losses.absolute_difference',
+    'tf.losses.compute_weighted_loss': 'tf.compat.v1.losses.compute_weighted_loss',
+    'tf.losses.cosine_distance': 'tf.compat.v1.losses.cosine_distance',
+    'tf.losses.hinge_loss': 'tf.compat.v1.losses.hinge_loss',
+    'tf.losses.huber_loss': 'tf.compat.v1.losses.huber_loss',
+    'tf.losses.log_loss': 'tf.compat.v1.losses.log_loss',
+    'tf.losses.mean_pairwise_squared_error': 'tf.compat.v1.losses.mean_pairwise_squared_error',
+    'tf.losses.mean_squared_error': 'tf.compat.v1.losses.mean_squared_error',
+    'tf.losses.sigmoid_cross_entropy': 'tf.compat.v1.losses.sigmoid_cross_entropy',
+    'tf.losses.softmax_cross_entropy': 'tf.compat.v1.losses.softmax_cross_entropy',
+    'tf.losses.sparse_softmax_cross_entropy': 'tf.compat.v1.losses.sparse_softmax_cross_entropy',
     'tf.make_template': 'tf.compat.v1.make_template',
     'tf.make_tensor_proto': 'tf.compat.v1.make_tensor_proto',
-    'tf.manip.batch_to_space_nd': 'tf.batch_to_space_nd',
     'tf.manip.gather_nd': 'tf.gather_nd',
     'tf.manip.reshape': 'tf.reshape',
     'tf.manip.reverse': 'tf.reverse',
@@ -250,8 +326,6 @@ renames = {
     'tf.manip.space_to_batch_nd': 'tf.space_to_batch_nd',
     'tf.manip.tile': 'tf.tile',
     'tf.matching_files': 'tf.io.matching_files',
-    'tf.math.argmax': 'tf.compat.v1.math.argmax',
-    'tf.math.argmin': 'tf.compat.v1.math.argmin',
     'tf.matrix_band_part': 'tf.linalg.band_part',
     'tf.matrix_determinant': 'tf.linalg.det',
     'tf.matrix_diag': 'tf.linalg.diag',
@@ -262,49 +336,104 @@ renames = {
     'tf.matrix_solve_ls': 'tf.linalg.lstsq',
     'tf.matrix_transpose': 'tf.linalg.transpose',
     'tf.matrix_triangular_solve': 'tf.linalg.triangular_solve',
+    'tf.metrics.accuracy': 'tf.compat.v1.metrics.accuracy',
+    'tf.metrics.auc': 'tf.compat.v1.metrics.auc',
+    'tf.metrics.average_precision_at_k': 'tf.compat.v1.metrics.average_precision_at_k',
+    'tf.metrics.false_negatives': 'tf.compat.v1.metrics.false_negatives',
+    'tf.metrics.false_negatives_at_thresholds': 'tf.compat.v1.metrics.false_negatives_at_thresholds',
+    'tf.metrics.false_positives': 'tf.compat.v1.metrics.false_positives',
+    'tf.metrics.false_positives_at_thresholds': 'tf.compat.v1.metrics.false_positives_at_thresholds',
+    'tf.metrics.mean': 'tf.compat.v1.metrics.mean',
+    'tf.metrics.mean_absolute_error': 'tf.compat.v1.metrics.mean_absolute_error',
+    'tf.metrics.mean_cosine_distance': 'tf.compat.v1.metrics.mean_cosine_distance',
+    'tf.metrics.mean_iou': 'tf.compat.v1.metrics.mean_iou',
+    'tf.metrics.mean_per_class_accuracy': 'tf.compat.v1.metrics.mean_per_class_accuracy',
+    'tf.metrics.mean_relative_error': 'tf.compat.v1.metrics.mean_relative_error',
+    'tf.metrics.mean_squared_error': 'tf.compat.v1.metrics.mean_squared_error',
+    'tf.metrics.mean_tensor': 'tf.compat.v1.metrics.mean_tensor',
+    'tf.metrics.percentage_below': 'tf.compat.v1.metrics.percentage_below',
+    'tf.metrics.precision': 'tf.compat.v1.metrics.precision',
+    'tf.metrics.precision_at_k': 'tf.compat.v1.metrics.precision_at_k',
+    'tf.metrics.precision_at_thresholds': 'tf.compat.v1.metrics.precision_at_thresholds',
+    'tf.metrics.precision_at_top_k': 'tf.compat.v1.metrics.precision_at_top_k',
+    'tf.metrics.recall': 'tf.compat.v1.metrics.recall',
+    'tf.metrics.recall_at_k': 'tf.compat.v1.metrics.recall_at_k',
+    'tf.metrics.recall_at_thresholds': 'tf.compat.v1.metrics.recall_at_thresholds',
+    'tf.metrics.recall_at_top_k': 'tf.compat.v1.metrics.recall_at_top_k',
+    'tf.metrics.root_mean_squared_error': 'tf.compat.v1.metrics.root_mean_squared_error',
+    'tf.metrics.sensitivity_at_specificity': 'tf.compat.v1.metrics.sensitivity_at_specificity',
+    'tf.metrics.sparse_average_precision_at_k': 'tf.compat.v1.metrics.sparse_average_precision_at_k',
+    'tf.metrics.sparse_precision_at_k': 'tf.compat.v1.metrics.sparse_precision_at_k',
+    'tf.metrics.specificity_at_sensitivity': 'tf.compat.v1.metrics.specificity_at_sensitivity',
+    'tf.metrics.true_negatives': 'tf.compat.v1.metrics.true_negatives',
+    'tf.metrics.true_negatives_at_thresholds': 'tf.compat.v1.metrics.true_negatives_at_thresholds',
+    'tf.metrics.true_positives': 'tf.compat.v1.metrics.true_positives',
+    'tf.metrics.true_positives_at_thresholds': 'tf.compat.v1.metrics.true_positives_at_thresholds',
+    'tf.min_max_variable_partitioner': 'tf.compat.v1.min_max_variable_partitioner',
     'tf.model_variables': 'tf.compat.v1.model_variables',
     'tf.moving_average_variables': 'tf.compat.v1.moving_average_variables',
-    'tf.nn.ctc_beam_search_decoder': 'tf.compat.v1.nn.ctc_beam_search_decoder',
+    'tf.nn.bidirectional_dynamic_rnn': 'tf.compat.v1.nn.bidirectional_dynamic_rnn',
+    'tf.nn.conv3d_backprop_filter_v2': 'tf.nn.conv3d_backprop_filter',
     'tf.nn.ctc_beam_search_decoder_v2': 'tf.nn.ctc_beam_search_decoder',
+    'tf.nn.ctc_loss_v2': 'tf.nn.ctc_loss',
+    'tf.nn.depthwise_conv2d_native': 'tf.compat.v1.nn.depthwise_conv2d_native',
+    'tf.nn.depthwise_conv2d_native_backprop_filter': 'tf.nn.depthwise_conv2d_backprop_filter',
+    'tf.nn.depthwise_conv2d_native_backprop_input': 'tf.nn.depthwise_conv2d_backprop_input',
     'tf.nn.dynamic_rnn': 'tf.compat.v1.nn.dynamic_rnn',
     'tf.nn.log_uniform_candidate_sampler': 'tf.random.log_uniform_candidate_sampler',
+    'tf.nn.quantized_avg_pool': 'tf.compat.v1.nn.quantized_avg_pool',
+    'tf.nn.quantized_conv2d': 'tf.compat.v1.nn.quantized_conv2d',
+    'tf.nn.quantized_max_pool': 'tf.compat.v1.nn.quantized_max_pool',
+    'tf.nn.quantized_relu_x': 'tf.compat.v1.nn.quantized_relu_x',
     'tf.nn.raw_rnn': 'tf.compat.v1.nn.raw_rnn',
+    'tf.nn.relu_layer': 'tf.compat.v1.nn.relu_layer',
     'tf.nn.rnn_cell.BasicLSTMCell': 'tf.compat.v1.nn.rnn_cell.BasicLSTMCell',
     'tf.nn.rnn_cell.BasicRNNCell': 'tf.compat.v1.nn.rnn_cell.BasicRNNCell',
     'tf.nn.rnn_cell.GRUCell': 'tf.compat.v1.nn.rnn_cell.GRUCell',
     'tf.nn.rnn_cell.LSTMCell': 'tf.compat.v1.nn.rnn_cell.LSTMCell',
-    'tf.nn.softmax_cross_entropy_with_logits': 'tf.compat.v1.nn.softmax_cross_entropy_with_logits',
-    'tf.nn.softmax_cross_entropy_with_logits_v2': 'tf.nn.softmax_cross_entropy_with_logits',
+    'tf.nn.rnn_cell.MultiRNNCell': 'tf.compat.v1.nn.rnn_cell.MultiRNNCell',
+    'tf.nn.static_bidirectional_rnn': 'tf.compat.v1.nn.static_bidirectional_rnn',
     'tf.nn.static_rnn': 'tf.compat.v1.nn.static_rnn',
     'tf.nn.uniform_candidate_sampler': 'tf.random.uniform_candidate_sampler',
+    'tf.nn.xw_plus_b': 'tf.compat.v1.nn.xw_plus_b',
     'tf.op_scope': 'tf.compat.v1.op_scope',
     'tf.orthogonal_initializer': 'tf.keras.initializers.Orthogonal',
-    'tf.parse_example': 'tf.io.parse_example',
-    'tf.parse_single_example': 'tf.io.parse_single_example',
     'tf.parse_single_sequence_example': 'tf.io.parse_single_sequence_example',
     'tf.parse_tensor': 'tf.io.parse_tensor',
     'tf.placeholder': 'tf.compat.v1.placeholder',
     'tf.placeholder_with_default': 'tf.compat.v1.placeholder_with_default',
     'tf.polygamma': 'tf.math.polygamma',
+    'tf.profiler.AdviceProto': 'tf.compat.v1.profiler.AdviceProto',
+    'tf.profiler.GraphNodeProto': 'tf.compat.v1.profiler.GraphNodeProto',
+    'tf.profiler.MultiGraphNodeProto': 'tf.compat.v1.profiler.MultiGraphNodeProto',
+    'tf.profiler.OpLogProto': 'tf.compat.v1.profiler.OpLogProto',
+    'tf.profiler.ProfileOptionBuilder': 'tf.compat.v1.profiler.ProfileOptionBuilder',
+    'tf.profiler.Profiler': 'tf.compat.v1.profiler.Profiler',
+    'tf.profiler.advise': 'tf.compat.v1.profiler.advise',
+    'tf.profiler.profile': 'tf.compat.v1.profiler.profile',
+    'tf.profiler.write_op_log': 'tf.compat.v1.profiler.write_op_log',
+    'tf.py_func': 'tf.compat.v1.py_func',
     'tf.python_io.TFRecordCompressionType': 'tf.io.TFRecordCompressionType',
     'tf.python_io.TFRecordOptions': 'tf.io.TFRecordOptions',
     'tf.python_io.TFRecordWriter': 'tf.io.TFRecordWriter',
-    'tf.python_io.tf_record_iterator': 'tf.io.tf_record_iterator',
+    'tf.python_io.tf_record_iterator': 'tf.compat.v1.python_io.tf_record_iterator',
     'tf.qr': 'tf.linalg.qr',
     'tf.quantize': 'tf.quantization.quantize',
     'tf.quantized_concat': 'tf.quantization.quantized_concat',
+    'tf.random.get_seed': 'tf.compat.v1.random.get_seed',
+    'tf.random.set_random_seed': 'tf.compat.v1.random.set_random_seed',
     'tf.random_crop': 'tf.image.random_crop',
     'tf.random_gamma': 'tf.random.gamma',
     'tf.random_normal': 'tf.random.normal',
-    'tf.random_poisson': 'tf.random.poisson',
+    'tf.random_poisson': 'tf.compat.v1.random_poisson',
     'tf.random_shuffle': 'tf.random.shuffle',
     'tf.random_uniform': 'tf.random.uniform',
     'tf.read_file': 'tf.io.read_file',
     'tf.real': 'tf.math.real',
     'tf.reciprocal': 'tf.math.reciprocal',
-    'tf.reduce_join': 'tf.strings.reduce_join',
     'tf.regex_replace': 'tf.strings.regex_replace',
     'tf.report_uninitialized_variables': 'tf.compat.v1.report_uninitialized_variables',
+    'tf.reset_default_graph': 'tf.compat.v1.reset_default_graph',
     'tf.resource_loader.get_data_files_path': 'tf.compat.v1.resource_loader.get_data_files_path',
     'tf.resource_loader.get_path_to_datafile': 'tf.compat.v1.resource_loader.get_path_to_datafile',
     'tf.resource_loader.get_root_dir_with_all_resources': 'tf.compat.v1.resource_loader.get_root_dir_with_all_resources',
@@ -314,13 +443,15 @@ renames = {
     'tf.rint': 'tf.math.rint',
     'tf.rsqrt': 'tf.math.rsqrt',
     'tf.saved_model.Builder': 'tf.compat.v1.saved_model.Builder',
+    'tf.saved_model.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.MAIN_OP_KEY': 'tf.compat.v1.saved_model.MAIN_OP_KEY',
     'tf.saved_model.TRAINING': 'tf.saved_model.TRANING',
     'tf.saved_model.build_tensor_info': 'tf.compat.v1.saved_model.build_tensor_info',
     'tf.saved_model.builder.SavedModelBuilder': 'tf.compat.v1.saved_model.builder.SavedModelBuilder',
     'tf.saved_model.constants.ASSETS_DIRECTORY': 'tf.saved_model.ASSETS_DIRECTORY',
     'tf.saved_model.constants.ASSETS_KEY': 'tf.saved_model.ASSETS_KEY',
-    'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.saved_model.LEGACY_INIT_OP_KEY',
-    'tf.saved_model.constants.MAIN_OP_KEY': 'tf.saved_model.MAIN_OP_KEY',
+    'tf.saved_model.constants.LEGACY_INIT_OP_KEY': 'tf.compat.v1.saved_model.constants.LEGACY_INIT_OP_KEY',
+    'tf.saved_model.constants.MAIN_OP_KEY': 'tf.compat.v1.saved_model.constants.MAIN_OP_KEY',
     'tf.saved_model.constants.SAVED_MODEL_FILENAME_PB': 'tf.saved_model.SAVED_MODEL_FILENAME_PB',
     'tf.saved_model.constants.SAVED_MODEL_FILENAME_PBTXT': 'tf.saved_model.SAVED_MODEL_FILENAME_PBTXT',
     'tf.saved_model.constants.SAVED_MODEL_SCHEMA_VERSION': 'tf.saved_model.SAVED_MODEL_SCHEMA_VERSION',
@@ -330,10 +461,11 @@ renames = {
     'tf.saved_model.get_tensor_from_tensor_info': 'tf.compat.v1.saved_model.get_tensor_from_tensor_info',
     'tf.saved_model.load': 'tf.compat.v1.saved_model.load',
     'tf.saved_model.loader.load': 'tf.compat.v1.saved_model.loader.load',
-    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.saved_model.maybe_saved_model_directory',
+    'tf.saved_model.loader.maybe_saved_model_directory': 'tf.compat.v1.saved_model.loader.maybe_saved_model_directory',
     'tf.saved_model.main_op.main_op': 'tf.compat.v1.saved_model.main_op.main_op',
     'tf.saved_model.main_op.main_op_with_restore': 'tf.compat.v1.saved_model.main_op.main_op_with_restore',
     'tf.saved_model.main_op_with_restore': 'tf.compat.v1.saved_model.main_op_with_restore',
+    'tf.saved_model.maybe_saved_model_directory': 'tf.compat.v1.saved_model.maybe_saved_model_directory',
     'tf.saved_model.signature_constants.CLASSIFY_INPUTS': 'tf.saved_model.CLASSIFY_INPUTS',
     'tf.saved_model.signature_constants.CLASSIFY_METHOD_NAME': 'tf.saved_model.CLASSIFY_METHOD_NAME',
     'tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES': 'tf.saved_model.CLASSIFY_OUTPUT_CLASSES',
@@ -370,51 +502,81 @@ renames = {
     'tf.segment_sum': 'tf.math.segment_sum',
     'tf.self_adjoint_eig': 'tf.linalg.eigh',
     'tf.self_adjoint_eigvals': 'tf.linalg.eigvalsh',
-    'tf.serialize_many_sparse': 'tf.io.serialize_many_sparse',
-    'tf.serialize_sparse': 'tf.io.serialize_sparse',
+    'tf.serialize_many_sparse': 'tf.compat.v1.serialize_many_sparse',
+    'tf.serialize_sparse': 'tf.compat.v1.serialize_sparse',
     'tf.serialize_tensor': 'tf.io.serialize_tensor',
+    'tf.set_random_seed': 'tf.compat.v1.set_random_seed',
     'tf.setdiff1d': 'tf.compat.v1.setdiff1d',
-    'tf.space_to_batch': 'tf.nn.space_to_batch',
-    'tf.space_to_depth': 'tf.nn.space_to_depth',
+    'tf.sets.set_difference': 'tf.sets.difference',
+    'tf.sets.set_intersection': 'tf.sets.intersection',
+    'tf.sets.set_size': 'tf.sets.size',
+    'tf.sets.set_union': 'tf.sets.union',
+    'tf.space_to_depth': 'tf.compat.v1.space_to_depth',
+    'tf.sparse.matmul': 'tf.sparse.sparse_dense_matmul',
+    'tf.sparse.merge': 'tf.compat.v1.sparse.merge',
     'tf.sparse.placeholder': 'tf.compat.v1.sparse.placeholder',
-    'tf.sparse_add': 'tf.sparse.add',
+    'tf.sparse.reduce_max_sparse': 'tf.compat.v1.sparse.reduce_max_sparse',
+    'tf.sparse.reduce_sum_sparse': 'tf.compat.v1.sparse.reduce_sum_sparse',
     'tf.sparse_fill_empty_rows': 'tf.sparse.fill_empty_rows',
     'tf.sparse_mask': 'tf.sparse.mask',
-    'tf.sparse_matmul': 'tf.compat.v1.sparse_matmul',
     'tf.sparse_maximum': 'tf.sparse.maximum',
-    'tf.sparse_merge': 'tf.sparse.merge',
+    'tf.sparse_merge': 'tf.compat.v1.sparse_merge',
     'tf.sparse_minimum': 'tf.sparse.minimum',
     'tf.sparse_placeholder': 'tf.compat.v1.sparse_placeholder',
+    'tf.sparse_reduce_max': 'tf.compat.v1.sparse_reduce_max',
+    'tf.sparse_reduce_max_sparse': 'tf.compat.v1.sparse_reduce_max_sparse',
+    'tf.sparse_reduce_sum': 'tf.compat.v1.sparse_reduce_sum',
+    'tf.sparse_reduce_sum_sparse': 'tf.compat.v1.sparse_reduce_sum_sparse',
     'tf.sparse_reorder': 'tf.sparse.reorder',
     'tf.sparse_reset_shape': 'tf.sparse.reset_shape',
     'tf.sparse_reshape': 'tf.sparse.reshape',
     'tf.sparse_retain': 'tf.sparse.retain',
-    'tf.sparse_segment_mean': 'tf.sparse.segment_mean',
-    'tf.sparse_segment_sqrt_n': 'tf.sparse.segment_sqrt_n',
-    'tf.sparse_segment_sum': 'tf.sparse.segment_sum',
+    'tf.sparse_segment_mean': 'tf.compat.v1.sparse_segment_mean',
+    'tf.sparse_segment_sqrt_n': 'tf.compat.v1.sparse_segment_sqrt_n',
+    'tf.sparse_segment_sum': 'tf.compat.v1.sparse_segment_sum',
     'tf.sparse_slice': 'tf.sparse.slice',
     'tf.sparse_softmax': 'tf.sparse.softmax',
-    'tf.sparse_tensor_dense_matmul': 'tf.sparse.matmul',
+    'tf.sparse_tensor_dense_matmul': 'tf.sparse.sparse_dense_matmul',
     'tf.sparse_tensor_to_dense': 'tf.sparse.to_dense',
+    'tf.sparse_to_dense': 'tf.compat.v1.sparse_to_dense',
     'tf.sparse_to_indicator': 'tf.sparse.to_indicator',
     'tf.sparse_transpose': 'tf.sparse.transpose',
+    'tf.spectral.dct': 'tf.signal.dct',
     'tf.spectral.fft': 'tf.signal.fft',
     'tf.spectral.fft2d': 'tf.signal.fft2d',
     'tf.spectral.fft3d': 'tf.signal.fft3d',
+    'tf.spectral.idct': 'tf.signal.idct',
     'tf.spectral.ifft': 'tf.signal.ifft',
     'tf.spectral.ifft2d': 'tf.signal.ifft2d',
     'tf.spectral.ifft3d': 'tf.signal.ifft3d',
+    'tf.spectral.irfft': 'tf.signal.irfft',
+    'tf.spectral.irfft2d': 'tf.signal.irfft2d',
+    'tf.spectral.irfft3d': 'tf.signal.irfft3d',
+    'tf.spectral.rfft': 'tf.signal.rfft',
+    'tf.spectral.rfft2d': 'tf.signal.rfft2d',
+    'tf.spectral.rfft3d': 'tf.signal.rfft3d',
     'tf.squared_difference': 'tf.math.squared_difference',
     'tf.string_join': 'tf.strings.join',
     'tf.string_strip': 'tf.strings.strip',
-    'tf.string_to_hash_bucket': 'tf.strings.to_hash_bucket',
     'tf.string_to_hash_bucket_fast': 'tf.strings.to_hash_bucket_fast',
     'tf.string_to_hash_bucket_strong': 'tf.strings.to_hash_bucket_strong',
-    'tf.string_to_number': 'tf.strings.to_number',
+    'tf.summary.SessionLog': 'tf.compat.v1.summary.SessionLog',
+    'tf.summary.audio': 'tf.compat.v1.summary.audio',
+    'tf.summary.get_summary_description': 'tf.compat.v1.summary.get_summary_description',
+    'tf.summary.histogram': 'tf.compat.v1.summary.histogram',
+    'tf.summary.image': 'tf.compat.v1.summary.image',
+    'tf.summary.merge': 'tf.compat.v1.summary.merge',
+    'tf.summary.merge_all': 'tf.compat.v1.summary.merge_all',
+    'tf.summary.scalar': 'tf.compat.v1.summary.scalar',
+    'tf.summary.tensor_summary': 'tf.compat.v1.summary.tensor_summary',
+    'tf.summary.text': 'tf.compat.v1.summary.text',
     'tf.svd': 'tf.linalg.svd',
     'tf.tables_initializer': 'tf.compat.v1.tables_initializer',
+    'tf.test.StubOutForTesting': 'tf.compat.v1.test.StubOutForTesting',
+    'tf.test.compute_gradient': 'tf.compat.v1.test.compute_gradient',
     'tf.test.compute_gradient_error': 'tf.compat.v1.test.compute_gradient_error',
     'tf.test.get_temp_dir': 'tf.compat.v1.test.get_temp_dir',
+    'tf.test.mock': 'tf.compat.v1.test.mock',
     'tf.test.test_src_dir_path': 'tf.compat.v1.test.test_src_dir_path',
     'tf.to_bfloat16': 'tf.compat.v1.to_bfloat16',
     'tf.to_complex128': 'tf.compat.v1.to_complex128',
@@ -424,22 +586,56 @@ renames = {
     'tf.to_int32': 'tf.compat.v1.to_int32',
     'tf.to_int64': 'tf.compat.v1.to_int64',
     'tf.trace': 'tf.linalg.trace',
+    'tf.train.AdadeltaOptimizer': 'tf.compat.v1.train.AdadeltaOptimizer',
+    'tf.train.AdagradDAOptimizer': 'tf.compat.v1.train.AdagradDAOptimizer',
+    'tf.train.AdagradOptimizer': 'tf.compat.v1.train.AdagradOptimizer',
+    'tf.train.AdamOptimizer': 'tf.compat.v1.train.AdamOptimizer',
+    'tf.train.CheckpointSaverHook': 'tf.estimator.CheckpointSaverHook',
+    'tf.train.CheckpointSaverListener': 'tf.estimator.CheckpointSaverListener',
+    'tf.train.ChiefSessionCreator': 'tf.compat.v1.train.ChiefSessionCreator',
+    'tf.train.FeedFnHook': 'tf.estimator.FeedFnHook',
+    'tf.train.FinalOpsHook': 'tf.estimator.FinalOpsHook',
+    'tf.train.FtrlOptimizer': 'tf.compat.v1.train.FtrlOptimizer',
+    'tf.train.GlobalStepWaiterHook': 'tf.estimator.GlobalStepWaiterHook',
+    'tf.train.GradientDescentOptimizer': 'tf.compat.v1.train.GradientDescentOptimizer',
+    'tf.train.LoggingTensorHook': 'tf.estimator.LoggingTensorHook',
+    'tf.train.LooperThread': 'tf.compat.v1.train.LooperThread',
+    'tf.train.MomentumOptimizer': 'tf.compat.v1.train.MomentumOptimizer',
+    'tf.train.MonitoredSession': 'tf.compat.v1.train.MonitoredSession',
     'tf.train.MonitoredTrainingSession': 'tf.compat.v1.train.MonitoredTrainingSession',
+    'tf.train.NanLossDuringTrainingError': 'tf.estimator.NanLossDuringTrainingError',
+    'tf.train.NanTensorHook': 'tf.estimator.NanTensorHook',
+    'tf.train.NewCheckpointReader': 'tf.compat.v1.train.NewCheckpointReader',
+    'tf.train.Optimizer': 'tf.compat.v1.train.Optimizer',
+    'tf.train.ProfilerHook': 'tf.estimator.ProfilerHook',
+    'tf.train.ProximalAdagradOptimizer': 'tf.compat.v1.train.ProximalAdagradOptimizer',
     'tf.train.QueueRunner': 'tf.compat.v1.train.QueueRunner',
+    'tf.train.RMSPropOptimizer': 'tf.compat.v1.train.RMSPropOptimizer',
     'tf.train.Saver': 'tf.compat.v1.train.Saver',
     'tf.train.SaverDef': 'tf.compat.v1.train.SaverDef',
+    'tf.train.Scaffold': 'tf.compat.v1.train.Scaffold',
+    'tf.train.SecondOrStepTimer': 'tf.estimator.SecondOrStepTimer',
+    'tf.train.SessionCreator': 'tf.compat.v1.train.SessionCreator',
+    'tf.train.SessionManager': 'tf.compat.v1.train.SessionManager',
+    'tf.train.SessionRunArgs': 'tf.compat.v1.train.SessionRunArgs',
+    'tf.train.SessionRunContext': 'tf.compat.v1.train.SessionRunContext',
+    'tf.train.SessionRunValues': 'tf.compat.v1.train.SessionRunValues',
+    'tf.train.SingularMonitoredSession': 'tf.compat.v1.train.SingularMonitoredSession',
+    'tf.train.StepCounterHook': 'tf.estimator.StepCounterHook',
+    'tf.train.StopAtStepHook': 'tf.estimator.StopAtStepHook',
+    'tf.train.SummarySaverHook': 'tf.estimator.SummarySaverHook',
+    'tf.train.Supervisor': 'tf.compat.v1.train.Supervisor',
     'tf.train.SyncReplicasOptimizer': 'tf.compat.v1.train.SyncReplicasOptimizer',
+    'tf.train.VocabInfo': 'tf.estimator.VocabInfo',
+    'tf.train.WorkerSessionCreator': 'tf.compat.v1.train.WorkerSessionCreator',
     'tf.train.add_queue_runner': 'tf.compat.v1.train.add_queue_runner',
     'tf.train.assert_global_step': 'tf.compat.v1.train.assert_global_step',
     'tf.train.basic_train_loop': 'tf.compat.v1.train.basic_train_loop',
     'tf.train.batch': 'tf.compat.v1.train.batch',
     'tf.train.batch_join': 'tf.compat.v1.train.batch_join',
     'tf.train.checkpoint_exists': 'tf.compat.v1.train.checkpoint_exists',
-    'tf.train.cosine_decay': 'tf.compat.v1.train.cosine_decay',
-    'tf.train.cosine_decay_restarts': 'tf.compat.v1.train.cosine_decay_restarts',
     'tf.train.create_global_step': 'tf.compat.v1.train.create_global_step',
     'tf.train.do_quantize_training_on_graphdef': 'tf.compat.v1.train.do_quantize_training_on_graphdef',
-    'tf.train.exponential_decay': 'tf.compat.v1.train.exponential_decay',
     'tf.train.export_meta_graph': 'tf.compat.v1.train.export_meta_graph',
     'tf.train.generate_checkpoint_state_proto': 'tf.compat.v1.train.generate_checkpoint_state_proto',
     'tf.train.get_checkpoint_mtimes': 'tf.compat.v1.train.get_checkpoint_mtimes',
@@ -447,32 +643,31 @@ renames = {
     'tf.train.get_or_create_global_step': 'tf.compat.v1.train.get_or_create_global_step',
     'tf.train.global_step': 'tf.compat.v1.train.global_step',
     'tf.train.import_meta_graph': 'tf.compat.v1.train.import_meta_graph',
+    'tf.train.init_from_checkpoint': 'tf.compat.v1.train.init_from_checkpoint',
     'tf.train.input_producer': 'tf.compat.v1.train.input_producer',
-    'tf.train.inverse_time_decay': 'tf.compat.v1.train.inverse_time_decay',
     'tf.train.limit_epochs': 'tf.compat.v1.train.limit_epochs',
-    'tf.train.linear_cosine_decay': 'tf.compat.v1.train.linear_cosine_decay',
     'tf.train.match_filenames_once': 'tf.io.match_filenames_once',
     'tf.train.maybe_batch': 'tf.compat.v1.train.maybe_batch',
     'tf.train.maybe_batch_join': 'tf.compat.v1.train.maybe_batch_join',
     'tf.train.maybe_shuffle_batch': 'tf.compat.v1.train.maybe_shuffle_batch',
     'tf.train.maybe_shuffle_batch_join': 'tf.compat.v1.train.maybe_shuffle_batch_join',
-    'tf.train.natural_exp_decay': 'tf.compat.v1.train.natural_exp_decay',
-    'tf.train.noisy_linear_cosine_decay': 'tf.compat.v1.train.noisy_linear_cosine_decay',
     'tf.train.piecewise_constant': 'tf.compat.v1.train.piecewise_constant',
-    'tf.train.polynomial_decay': 'tf.compat.v1.train.polynomial_decay',
     'tf.train.queue_runner.QueueRunner': 'tf.compat.v1.train.queue_runner.QueueRunner',
     'tf.train.queue_runner.add_queue_runner': 'tf.compat.v1.train.queue_runner.add_queue_runner',
     'tf.train.queue_runner.start_queue_runners': 'tf.compat.v1.train.queue_runner.start_queue_runners',
     'tf.train.range_input_producer': 'tf.compat.v1.train.range_input_producer',
     'tf.train.remove_checkpoint': 'tf.compat.v1.train.remove_checkpoint',
+    'tf.train.replica_device_setter': 'tf.compat.v1.train.replica_device_setter',
     'tf.train.shuffle_batch': 'tf.compat.v1.train.shuffle_batch',
     'tf.train.shuffle_batch_join': 'tf.compat.v1.train.shuffle_batch_join',
     'tf.train.slice_input_producer': 'tf.compat.v1.train.slice_input_producer',
     'tf.train.start_queue_runners': 'tf.compat.v1.train.start_queue_runners',
     'tf.train.string_input_producer': 'tf.compat.v1.train.string_input_producer',
     'tf.train.update_checkpoint_state': 'tf.compat.v1.train.update_checkpoint_state',
+    'tf.train.warm_start': 'tf.compat.v1.train.warm_start',
     'tf.train.write_graph': 'tf.io.write_graph',
     'tf.trainable_variables': 'tf.compat.v1.trainable_variables',
+    'tf.truncated_normal': 'tf.random.truncated_normal',
     'tf.uniform_unit_scaling_initializer': 'tf.initializers.uniform_unit_scaling',
     'tf.unsorted_segment_max': 'tf.math.unsorted_segment_max',
     'tf.unsorted_segment_mean': 'tf.math.unsorted_segment_mean',
@@ -480,12 +675,13 @@ renames = {
     'tf.unsorted_segment_prod': 'tf.math.unsorted_segment_prod',
     'tf.unsorted_segment_sqrt_n': 'tf.math.unsorted_segment_sqrt_n',
     'tf.unsorted_segment_sum': 'tf.math.unsorted_segment_sum',
-    'tf.variable_creator_scope': 'tf.compat.v1.variable_creator_scope',
+    'tf.variable_axis_size_partitioner': 'tf.compat.v1.variable_axis_size_partitioner',
     'tf.variable_op_scope': 'tf.compat.v1.variable_op_scope',
     'tf.variable_scope': 'tf.compat.v1.variable_scope',
     'tf.variables_initializer': 'tf.compat.v1.variables_initializer',
     'tf.variance_scaling_initializer': 'tf.keras.initializers.VarianceScaling',
-    'tf.verify_tensor_all_finite': 'tf.debugging.assert_all_finite',
+    'tf.verify_tensor_all_finite': 'tf.compat.v1.verify_tensor_all_finite',
+    'tf.wrap_function': 'tf.compat.v1.wrap_function',
     'tf.write_file': 'tf.io.write_file',
     'tf.zeta': 'tf.math.zeta'
 }
diff --git a/tensorflow/tools/compatibility/reorders_v2.py b/tensorflow/tools/compatibility/reorders_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..44494ac148cb878d500ef504eae8a6c388cc89df
--- /dev/null
+++ b/tensorflow/tools/compatibility/reorders_v2.py
@@ -0,0 +1,115 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+"""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+reorders = {
+    'tf.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.batch_to_space': ['input', 'crops', 'block_size', 'name'],
+    'tf.boolean_mask': ['tensor', 'mask', 'name', 'axis'],
+    'tf.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.convert_to_tensor': ['value', 'dtype', 'name', 'preferred_dtype'],
+    'tf.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.feature_column.categorical_column_with_vocabulary_file': ['key', 'vocabulary_file', 'vocabulary_size', 'num_oov_buckets', 'default_value', 'dtype'],
+    'tf.io.decode_csv': ['records', 'record_defaults', 'field_delim', 'use_quote_delim', 'name', 'na_value', 'select_cols'],
+    'tf.io.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.io.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.io.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.linalg.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.math.argmax': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.argmin': ['input', 'axis', 'name', 'dimension', 'output_type'],
+    'tf.math.confusion_matrix': ['labels', 'predictions', 'num_classes', 'dtype', 'name', 'weights'],
+    'tf.math.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.math.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.math.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.nn.convolution': ['input', 'filter', 'padding', 'strides', 'dilation_rate', 'name', 'data_format'],
+    'tf.nn.crelu': ['features', 'name', 'axis'],
+    'tf.nn.depth_to_space': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.depthwise_conv2d': ['input', 'filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.embedding_lookup': ['params', 'ids', 'partition_strategy', 'name', 'validate_indices', 'max_norm'],
+    'tf.nn.embedding_lookup_sparse': ['params', 'sp_ids', 'sp_weights', 'partition_strategy', 'name', 'combiner', 'max_norm'],
+    'tf.nn.in_top_k': ['predictions', 'targets', 'k', 'name'],
+    'tf.nn.moments': ['x', 'axes', 'shift', 'name', 'keep_dims'],
+    'tf.nn.pool': ['input', 'window_shape', 'pooling_type', 'padding', 'dilation_rate', 'strides', 'name', 'data_format'],
+    'tf.nn.separable_conv2d': ['input', 'depthwise_filter', 'pointwise_filter', 'strides', 'padding', 'rate', 'name', 'data_format'],
+    'tf.nn.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.nn.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.nn.weighted_moments': ['x', 'axes', 'frequency_weights', 'name', 'keep_dims'],
+    'tf.norm': ['tensor', 'ord', 'axis', 'keepdims', 'name', 'keep_dims'],
+    'tf.pad': ['tensor', 'paddings', 'mode', 'name', 'constant_values'],
+    'tf.parse_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.parse_single_example': ['serialized', 'features', 'name', 'example_names'],
+    'tf.quantize_v2': ['input', 'min_range', 'max_range', 'T', 'mode', 'name', 'round_mode'],
+    'tf.random.multinomial': ['logits', 'num_samples', 'seed', 'name', 'output_dtype'],
+    'tf.random.poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.random_poisson': ['lam', 'shape', 'dtype', 'seed', 'name'],
+    'tf.reduce_all': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_any': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.reduce_logsumexp': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_max': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_mean': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_min': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_prod': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reduce_sum': ['input_tensor', 'axis', 'keepdims', 'name', 'reduction_indices', 'keep_dims'],
+    'tf.reverse_sequence': ['input', 'seq_lengths', 'seq_axis', 'batch_axis', 'name', 'seq_dim', 'batch_dim'],
+    'tf.serialize_many_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.serialize_sparse': ['sp_input', 'name', 'out_type'],
+    'tf.shape': ['input', 'name', 'out_type'],
+    'tf.size': ['input', 'name', 'out_type'],
+    'tf.space_to_batch': ['input', 'paddings', 'block_size', 'name'],
+    'tf.space_to_depth': ['input', 'block_size', 'name', 'data_format'],
+    'tf.sparse.add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse.concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse.reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse.segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse.split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.sparse_add': ['a', 'b', 'threshold', 'thresh'],
+    'tf.sparse_concat': ['axis', 'sp_inputs', 'name', 'expand_nonconcat_dim', 'concat_dim'],
+    'tf.sparse_matmul': ['a', 'b', 'transpose_a', 'transpose_b', 'a_is_sparse', 'b_is_sparse', 'name'],
+    'tf.sparse_reduce_max': ['sp_input', 'axis', 'keepdims', 'reduction_axes', 'keep_dims'],
+    'tf.sparse_segment_mean': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sqrt_n': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_segment_sum': ['data', 'indices', 'segment_ids', 'name', 'num_segments'],
+    'tf.sparse_split': ['keyword_required', 'sp_input', 'num_split', 'axis', 'name', 'split_dim'],
+    'tf.strings.length': ['input', 'name', 'unit'],
+    'tf.strings.reduce_join': ['inputs', 'axis', 'keep_dims', 'separator', 'name', 'reduction_indices'],
+    'tf.strings.substr': ['input', 'pos', 'len', 'name', 'unit'],
+    'tf.transpose': ['a', 'perm', 'name', 'conjugate'],
+    'tf.tuple': ['tensors', 'name', 'control_inputs'],
+    'tf.while_loop': ['cond', 'body', 'loop_vars', 'shape_invariants', 'parallel_iterations', 'back_prop', 'swap_memory', 'name', 'maximum_iterations', 'return_same_structure']
+}
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
index 68ba7a2630cec9cf23e9fbe3d1e9822c31ae3c0c..917236da4b4b75a1a1ca65e11d49d722cc178571 100644
--- a/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
+++ b/tensorflow/tools/compatibility/testdata/test_file_v0_11.py
@@ -34,6 +34,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   a unit test if the converter is successful.
   """
 
+  @test_util.run_v1_only("b/120545219")
   def testArgRenames(self):
     with self.cached_session():
 
@@ -97,6 +98,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.expand_dims([[1, 2], [3, 4]], axis=1).eval(),
           [[[1, 2]], [[3, 4]]])
 
+  @test_util.run_v1_only("b/120545219")
   def testArgMinMax(self):
     with self.cached_session():
       self.assertAllEqual(
@@ -112,6 +114,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.argmax([[1, 2, 3], [4, 1, 0]], dimension=0).eval(),
           [1, 0, 0])
 
+  @test_util.run_v1_only("b/120545219")
   def testExpandAndSqueeze(self):
     with self.cached_session():
 
@@ -139,6 +142,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   [[1, 2, 3]], dim=1), squeeze_dims=[1]).eval(),
           a)
 
+  @test_util.run_v1_only("b/120545219")
   def testArithmeticRenames(self):
     with self.cached_session() as s:
       stuff = tf.split(1, 2, [[1, 2, 3, 4], [4, 5, 6, 7]])
@@ -163,6 +167,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       #     # TODO(aselle): (tf.batch_*)
       # ]
 
+  @test_util.run_v1_only("b/120545219")
   def testBatchAndSvd(self):
     with self.cached_session():
       mat = [[1., 2.], [2., 3.]]
@@ -174,6 +179,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.svd(mat, False, True).eval(),
           tf.svd(mat, compute_uv=False, full_matrices=True).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testCrossEntropy(self):
     # TODO(aselle): Test sparse_softmax_...
     with self.cached_session():
@@ -190,6 +196,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
           tf.nn.sigmoid_cross_entropy_with_logits(
               labels=labels, logits=logits).eval())
 
+  @test_util.run_v1_only("b/120545219")
   def testVariables(self):
     with self.cached_session() as s:
 
@@ -200,6 +207,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
       _ = [v.name for v in tf.all_variables()]
       _ = [v.name for v in tf.local_variables()]
 
+  @test_util.run_v1_only("b/120545219")
   def testSummaries(self):
     with self.cached_session() as s:
       var = tf.Variable([1, 2, 3], dtype=tf.float32)
diff --git a/tensorflow/tools/compatibility/testdata/test_file_v1_12.py b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ce4dd49adc940dbc56e19915a188cdb6b8de1d1
--- /dev/null
+++ b/tensorflow/tools/compatibility/testdata/test_file_v1_12.py
@@ -0,0 +1,75 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Tests for tf upgrader."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+import tensorflow as tf
+from tensorflow.python.framework import test_util
+from tensorflow.python.platform import test as test_lib
+
+
+class TestUpgrade(test_util.TensorFlowTestCase):
+  """Test various APIs that have been changed in 2.0."""
+
+  def setUp(self):
+    tf.enable_eager_execution()
+
+  @test_util.run_v1_only("b/120545219")
+  def testRenames(self):
+    with self.cached_session():
+      self.assertAllClose(1.04719755, tf.acos(0.5))
+      self.assertAllClose(0.5, tf.rsqrt(4.0))
+
+  @test_util.run_v1_only("b/120545219")
+  def testSerializeSparseTensor(self):
+    sp_input = tf.SparseTensor(
+        indices=tf.constant([[1]], dtype=tf.int64),
+        values=tf.constant([2], dtype=tf.int64),
+        dense_shape=[2])
+
+    with self.cached_session():
+      serialized_sp = tf.serialize_sparse(sp_input, 'serialize_name', tf.string)
+      self.assertEqual((3,), serialized_sp.shape)
+      self.assertTrue(serialized_sp[0].numpy())  # check non-empty
+
+  @test_util.run_v1_only("b/120545219")
+  def testSerializeManySparse(self):
+    sp_input = tf.SparseTensor(
+        indices=tf.constant([[0, 1]], dtype=tf.int64),
+        values=tf.constant([2], dtype=tf.int64),
+        dense_shape=[1, 2])
+
+    with self.cached_session():
+      serialized_sp = tf.serialize_many_sparse(
+          sp_input, 'serialize_name', tf.string)
+      self.assertEqual((1, 3), serialized_sp.shape)
+
+  @test_util.run_v1_only("b/120545219")
+  def testArgMaxMin(self):
+    self.assertAllClose(
+        [1],
+        tf.argmax([[1, 3, 2]], name='abc', dimension=1))
+    self.assertAllClose(
+        [0, 0, 0],
+        tf.argmax([[1, 3, 2]], dimension=0))
+    self.assertAllClose(
+        [0],
+        tf.argmin([[1, 3, 2]], name='abc', dimension=1))
+
+
+if __name__ == "__main__":
+  test_lib.main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2.py b/tensorflow/tools/compatibility/tf_upgrade_v2.py
index 0df8b0f3769f4c96ebca4060363fa9afb25df0bd..ea86da42f6bbb8170c56d08e02ab38cf72acf3f7 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2.py
@@ -18,10 +18,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import argparse
-
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import renames_v2
+from tensorflow.tools.compatibility import reorders_v2
 
 
 class TFAPIChangeSpec(ast_edits.APIChangeSpec):
@@ -31,46 +30,638 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
     # Maps from a function name to a dictionary that describes how to
     # map from an old argument keyword to the new argument keyword.
     self.function_keyword_renames = {
+        "tf.argmin": {
+            "dimension": "axis",
+        },
+        "tf.argmax": {
+            "dimension": "axis",
+        },
+        "tf.arg_min": {
+            "dimension": "axis",
+        },
+        "tf.arg_max": {
+            "dimension": "axis",
+        },
+        "tf.math.argmin": {
+            "dimension": "axis",
+        },
+        "tf.math.argmax": {
+            "dimension": "axis",
+        },
+        "tf.image.crop_and_resize": {
+            "box_ind": "box_indices",
+        },
+        "tf.image.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.extract_image_patches": {
+            "ksizes": "sizes",
+        },
+        "tf.expand_dims": {
+            "dim": "axis",
+        },
+        "tf.batch_to_space": {
+            "block_size": "block_shape",
+        },
+        "tf.space_to_batch": {
+            "block_size": "block_shape",
+        },
+        "tf.nn.space_to_batch": {
+            "block_size": "block_shape",
+        },
+        "tf.constant": {
+            "verify_shape": "verify_shape_is_now_always_true",
+        },
         "tf.convert_to_tensor": {
             "preferred_dtype": "dtype_hint"
         },
+        "tf.nn.softmax_cross_entropy_with_logits_v2": {
+            "dim": "axis"
+        },
+        "tf.linalg.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.linalg.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.norm": {
+            "keep_dims": "keepdims",
+        },
+        "tf.load_file_system_library": {
+            "library_filename": "library_location",
+        },
+        "tf.math.count_nonzero": {
+            "input_tensor": "input",
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis",
+        },
+        "tf.nn.erosion2d": {
+            "kernel": "filters",
+            "rates": "dilations",
+        },
+        "tf.math.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.math.log_softmax": {
+            "dim": "axis",
+        },
+        "tf.math.softmax": {
+            "dim": "axis"
+        },
+        "tf.nn.l2_normalize": {
+            "dim": "axis",
+        },
+        "tf.nn.log_softmax": {
+            "dim": "axis",
+        },
+        "tf.nn.moments": {
+            "keep_dims": "keepdims",
+        },
+        "tf.nn.pool": {
+            "dilation_rate": "dilations"
+        },
+        "tf.nn.separable_conv2d": {
+            "rate": "dilations"
+        },
+        "tf.nn.depthwise_conv2d": {
+            "rate": "dilations"
+        },
+        "tf.nn.softmax": {
+            "dim": "axis"
+        },
+        "tf.nn.sufficient_statistics": {
+            "keep_dims": "keepdims"
+        },
+        "tf.debugging.assert_all_finite": {
+            "t": "x",
+            "msg": "message",
+        },
+        "tf.sparse.add": {
+            "thresh": "threshold",
+        },
+        "tf.sparse_add": {
+            "thresh": "threshold",
+        },
+        "tf.sparse.concat": {
+            "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
+        },
+        "tf.sparse_concat": {
+            "concat_dim": "axis",
+            "expand_nonconcat_dim": "expand_nonconcat_dims",
+        },
+        "tf.sparse.split": {
+            "split_dim": "axis",
+        },
+        "tf.sparse_split": {
+            "split_dim": "axis",
+        },
+        "tf.sparse.reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_max": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse.reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.sparse_reduce_sum": {
+            "reduction_axes": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.nn.max_pool_with_argmax": {
+            "Targmax": "output_dtype",
+        },
+        "tf.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.random.multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.reverse_sequence": {
+            "seq_dim": "seq_axis",
+            "batch_dim": "batch_axis",
+        },
+        "tf.nn.batch_norm_with_global_normalization": {
+            "t": "input",
+            "m": "mean",
+            "v": "variance",
+        },
+        "tf.nn.dilation2d": {
+            "filter": "filters",
+            "rates": "dilations",
+        },
+        "tf.nn.conv3d": {
+            "filter": "filters"
+        },
+        "tf.zeros_like": {
+            "tensor": "input",
+        },
+        "tf.ones_like": {
+            "tensor": "input",
+        },
+        "tf.nn.conv2d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
+        "tf.nn.conv3d_transpose": {
+            "value": "input",
+            "filter": "filters",
+        },
+        "tf.nn.convolution": {
+            "filter": "filters",
+            "dilation_rate": "dilations",
+        },
+        "tf.gfile.Exists": {
+            "filename": "path",
+        },
+        "tf.gfile.Remove": {
+            "filename": "path",
+        },
+        "tf.gfile.Stat": {
+            "filename": "path",
+        },
+        "tf.gfile.Glob": {
+            "filename": "pattern",
+        },
+        "tf.gfile.MkDir": {
+            "dirname": "path",
+        },
+        "tf.gfile.MakeDirs": {
+            "dirname": "path",
+        },
+        "tf.gfile.DeleteRecursively": {
+            "dirname": "path",
+        },
+        "tf.gfile.IsDirectory": {
+            "dirname": "path",
+        },
+        "tf.gfile.ListDirectory": {
+            "dirname": "path",
+        },
+        "tf.gfile.Copy": {
+            "oldpath": "src",
+            "newpath": "dst",
+        },
+        "tf.gfile.Rename": {
+            "oldname": "src",
+            "newname": "dst",
+        },
+        "tf.gfile.Walk": {
+            "in_order": "topdown",
+        },
+        "tf.random.stateless_multinomial": {
+            "output_dtype": "dtype",
+        },
+        "tf.string_to_number": {
+            "string_tensor": "input",
+        },
+        "tf.strings.to_number": {
+            "string_tensor": "input",
+        },
+        "tf.string_to_hash_bucket": {
+            "string_tensor": "input",
+        },
+        "tf.strings.to_hash_bucket": {
+            "string_tensor": "input",
+        },
+        "tf.reduce_all": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_all": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_any": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_any": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_min": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_min": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_max": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_max": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_sum": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_sum": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_mean": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_mean": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_prod": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_prod": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_logsumexp": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.math.reduce_logsumexp": {
+            "reduction_indices": "axis",
+            "keep_dims": "keepdims",
+        },
+        "tf.reduce_join": {
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis"
+        },
+        "tf.strings.reduce_join": {
+            "keep_dims": "keepdims",
+            "reduction_indices": "axis"
+        },
+        "tf.squeeze": {
+            "squeeze_dims": "axis",
+        },
+        "tf.nn.weighted_moments": {
+            "keep_dims": "keepdims"
+        },
     }
 
-    # Mapping from function to the new name of the function
-    self.symbol_renames = renames_v2.renames
     # pylint: disable=line-too-long
     # Add additional renames not in renames_v2.py here.
-    self.symbol_renames.update({
-    })
+    # IMPORTANT: For the renames in here, if you also need to add to
+    # function_reorders or function_keyword_renames, use the OLD function name.
+    # These renames happen after the arguments have been processed.
+    self.manual_symbol_renames = {
+        "tf.batch_to_space_nd":
+            "tf.batch_to_space",
+        "tf.space_to_batch_nd":
+            "tf.space_to_batch",
+        "tf.nn.space_to_batch":
+            "tf.space_to_batch",
+        "tf.extract_image_patches":
+            "tf.image.extract_image_patches",
+        "tf.gfile.Copy":
+            "tf.io.gfile.copy",
+        "tf.gfile.DeleteRecursively":
+            "tf.io.gfile.rmtree",
+        "tf.gfile.Exists":
+            "tf.io.gfile.exists",
+        "tf.gfile.Glob":
+            "tf.io.gfile.glob",
+        "tf.gfile.IsDirectory":
+            "tf.io.gfile.isdir",
+        "tf.gfile.ListDirectory":
+            "tf.io.gfile.listdir",
+        "tf.gfile.MakeDirs":
+            "tf.io.gfile.makedirs",
+        "tf.gfile.MkDir":
+            "tf.io.gfile.mkdir",
+        "tf.gfile.Remove":
+            "tf.io.gfile.remove",
+        "tf.gfile.Rename":
+            "tf.io.gfile.rename",
+        "tf.gfile.Stat":
+            "tf.io.gfile.stat",
+        "tf.gfile.Walk":
+            "tf.io.gfile.walk",
+        "tf.contrib.data.AUTOTUNE":
+            "tf.data.experimental.AUTOTUNE",
+        "tf.contrib.data.Counter":
+            "tf.data.experimental.Counter",
+        "tf.contrib.data.CheckpointInputPipelineHook":
+            "tf.data.experimental.CheckpointInputPipelineHook",
+        "tf.contrib.data.CsvDataset":
+            "tf.data.experimental.CsvDataset",
+        "tf.contrib.data.Optional":
+            "tf.data.experimental.Optional",
+        "tf.contrib.data.RandomDataset":
+            "tf.data.experimental.RandomDataset",
+        "tf.contrib.data.Reducer":
+            "tf.data.experimental.Reducer",
+        "tf.contrib.data.SqlDataset":
+            "tf.data.experimental.SqlDataset",
+        "tf.contrib.data.StatsAggregator":
+            "tf.data.experimental.StatsAggregator",
+        "tf.contrib.data.TFRecordWriter":
+            "tf.data.experimental.TFRecordWriter",
+        "tf.contrib.data.assert_element_shape":
+            "tf.data.experimental.assert_element_shape",
+        "tf.contrib.data.batch_and_drop_remainder":
+            "tf.compat.v1.contrib.data.batch_and_drop_remainder",
+        "tf.contrib.data.bucket_by_sequence_length":
+            "tf.data.experimental.bucket_by_sequence_length",
+        "tf.contrib.data.choose_from_datasets":
+            "tf.data.experimental.choose_from_datasets",
+        "tf.contrib.data.copy_to_device":
+            "tf.data.experimental.copy_to_device",
+        "tf.contrib.data.dense_to_sparse_batch":
+            "tf.data.experimental.dense_to_sparse_batch",
+        "tf.contrib.data.enumerate_dataset":
+            "tf.data.experimental.enumerate_dataset",
+        "tf.contrib.data.get_next_as_optional":
+            "tf.data.experimental.get_next_as_optional",
+        "tf.contrib.data.get_single_element":
+            "tf.data.experimental.get_single_element",
+        "tf.contrib.data.group_by_reducer":
+            "tf.data.experimental.group_by_reducer",
+        "tf.contrib.data.group_by_window":
+            "tf.data.experimental.group_by_window",
+        "tf.contrib.data.ignore_errors":
+            "tf.data.experimental.ignore_errors",
+        "tf.contrib.data.latency_stats":
+            "tf.data.experimental.latency_stats",
+        "tf.contrib.data.make_batched_features_dataset":
+            "tf.data.experimental.make_batched_features_dataset",
+        "tf.contrib.data.make_csv_dataset":
+            "tf.data.experimental.make_csv_dataset",
+        "tf.contrib.data.make_saveable_from_iterator":
+            "tf.data.experimental.make_saveable_from_iterator",
+        "tf.contrib.data.map_and_batch":
+            "tf.data.experimental.map_and_batch",
+        "tf.contrib.data.padded_batch_and_drop_remainder":
+            "tf.compat.v1.contrib.data.padded_batch_and_drop_remainder",
+        "tf.contrib.data.parallel_interleave":
+            "tf.data.experimental.parallel_interleave",
+        "tf.contrib.data.parse_example_dataset":
+            "tf.data.experimental.parse_example_dataset",
+        "tf.contrib.data.prefetch_to_device":
+            "tf.data.experimental.prefetch_to_device",
+        "tf.contrib.data.read_batch_features":
+            "tf.compat.v1.contrib.data.read_batch_features",
+        "tf.contrib.data.reduce_dataset":
+            "tf.compat.v1.contrib.data.reduce_dataset",
+        "tf.contrib.data.rejection_resample":
+            "tf.data.experimental.rejection_resample",
+        "tf.contrib.data.sample_from_datasets":
+            "tf.data.experimental.sample_from_datasets",
+        "tf.contrib.data.scan":
+            "tf.data.experimental.scan",
+        "tf.contrib.data.set_stats_aggregator":
+            "tf.data.experimental.set_stats_aggregator",
+        "tf.contrib.data.shuffle_and_repeat":
+            "tf.data.experimental.shuffle_and_repeat",
+        "tf.contrib.data.sliding_window_batch":
+            "tf.compat.v1.contrib.data.sliding_window_batch",
+        "tf.contrib.data.sloppy_interleave":
+            "tf.compat.v1.contrib.data.sloppy_interleave",
+        "tf.contrib.data.unbatch":
+            "tf.data.experimental.unbatch",
+        "tf.contrib.data.unique":
+            "tf.data.experimental.unique",
+        "tf.contrib.framework.sort":
+            "tf.sort",
+        "tf.contrib.framework.argsort":
+            "tf.argsort",
+        "tf.manip.batch_to_space_nd":
+            "tf.batch_to_space",
+        "tf.quantize_v2":
+            "tf.quantization.quantize",
+        "tf.sparse_add":
+            "tf.sparse.add",
+        "tf.sparse_concat":
+            "tf.sparse.concat",
+        "tf.sparse_split":
+            "tf.sparse.split",
+        "tf.sparse_matmul":
+            "tf.linalg.matmul",
+        "tf.sparse_reduce_sum":
+            "tf.sparse.reduce_sum",
+        "tf.sparse_reduce_max":
+            "tf.sparse.reduce_max",
+        "tf.random.stateless_multinomial":
+            "tf.random.stateless_categorical",
+        "tf.string_to_hash_bucket":
+            "tf.strings.to_hash_bucket",
+        "tf.string_to_number":
+            "tf.strings.to_number",
+        "tf.multinomial":
+            "tf.random.categorical",
+        "tf.random.multinomial":
+            "tf.random.categorical",
+        "tf.reduce_join":
+            "tf.strings.reduce_join",
+        "tf.load_file_system_library":
+            "tf.load_library",
+        "tf.pywrap_tensorflow":
+            "tf.compat.v1.pywrap_tensorflow",
+        "tf.bincount":
+            "tf.math.bincount",
+        "tf.confusion_matrix":
+            "tf.math.confusion_matrix",
+        "tf.train.confusion_matrix":
+            "tf.math.confusion_matrix",
+        "tf.decode_csv":
+            "tf.io.decode_csv",
+        "tf.data.Iterator":
+            "tf.compat.v1.data.Iterator",
+        "tf.parse_example":
+            "tf.io.parse_example",
+        "tf.parse_single_example":
+            "tf.io.parse_single_example",
+        "tf.nn.fused_batch_norm":
+            "tf.compat.v1.nn.fused_batch_norm",
+        "tf.nn.softmax_cross_entropy_with_logits_v2":
+            "tf.nn.softmax_cross_entropy_with_logits",
+        "tf.losses.Reduction.MEAN":
+            "tf.compat.v1.losses.Reduction.MEAN",
+        "tf.losses.Reduction.SUM_BY_NONZERO_WEIGHTS":
+            "tf.compat.v1.losses.Reduction.SUM_BY_NONZERO_WEIGHTS",
+        "tf.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS":
+            "tf.compat.v1.losses.Reduction.SUM_OVER_NONZERO_WEIGHTS",
+        "tf.lite.constants.FLOAT":
+            "tf.float32",
+        "tf.lite.constants.INT32":
+            "tf.int32",
+        "tf.lite.constants.INT64":
+            "tf.int64",
+        "tf.lite.constants.STRING":
+            "tf.string",
+        "tf.lite.constants.QUANTIZED_UINT8":
+            "tf.uint8",
+        "tf.arg_max":
+            "tf.argmax",
+        "tf.arg_min":
+            "tf.argmin",
+        # tf.nn.ctc_loss is still available in 2.0 but behavior
+        # changed significantly.
+        "tf.nn.ctc_loss":
+            "tf.compat.v1.nn.ctc_loss",
+    }
     # pylint: enable=line-too-long
 
-    # For custom behavior and if auto-generate rename in renames_v2.py
-    # is incorrect, add the op name here to exclude it from renames_v2.py.
-    excluded_renames = [
-    ]
+    # Mapping from function to the new name of the function
+    self.symbol_renames = renames_v2.renames
+    self.symbol_renames.update(self.manual_symbol_renames)
 
     # Variables that should be changed to functions.
     self.change_to_function = {}
 
+    # pylint: disable=line-too-long
+    # This list should just contain names of functions that had
+    # their arguments reordered. After adding a function name to the list
+    # run the following to update reorders_v2.py:
+    # bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+    # bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+    # pylint: enable=line-too-long
+    self.reordered_function_names = {
+        "tf.io.serialize_sparse",
+        "tf.io.serialize_many_sparse",
+        "tf.argmax",
+        "tf.argmin",
+        "tf.batch_to_space",
+        "tf.nn.space_to_batch",
+        "tf.boolean_mask",
+        "tf.convert_to_tensor",
+        "tf.nn.moments",
+        "tf.nn.convolution",
+        "tf.nn.crelu",
+        "tf.nn.weighted_moments",
+        "tf.nn.pool",
+        "tf.nn.separable_conv2d",
+        "tf.nn.depthwise_conv2d",
+        "tf.multinomial",
+        "tf.random.multinomial",
+        "tf.pad",
+        "tf.quantize_v2",
+        "tf.feature_column.categorical_column_with_vocabulary_file",
+        "tf.shape",
+        "tf.size",
+        "tf.random.poisson",
+        "tf.sparse.add",
+        "tf.sparse_add",
+        "tf.sparse.concat",
+        "tf.sparse_concat",
+        "tf.sparse.segment_mean",
+        "tf.sparse.segment_sqrt_n",
+        "tf.sparse.segment_sum",
+        "tf.sparse_matmul",
+        "tf.sparse.reduce_max",
+        "tf.sparse_reduce_max",
+        "tf.io.decode_csv",
+        "tf.strings.substr",
+        "tf.strings.reduce_join",
+        "tf.strings.length",
+        "tf.transpose",
+        "tf.tuple",
+        "tf.parse_example",
+        "tf.parse_single_example",
+        "tf.io.parse_example",
+        "tf.io.parse_single_example",
+        "tf.while_loop",
+        "tf.reduce_all",
+        "tf.math.reduce_all",
+        "tf.reduce_any",
+        "tf.math.reduce_any",
+        "tf.reduce_min",
+        "tf.math.reduce_min",
+        "tf.reduce_max",
+        "tf.math.reduce_max",
+        "tf.reduce_sum",
+        "tf.math.reduce_sum",
+        "tf.reduce_mean",
+        "tf.math.reduce_mean",
+        "tf.reduce_prod",
+        "tf.math.reduce_prod",
+        "tf.reduce_logsumexp",
+        "tf.math.reduce_logsumexp",
+        "tf.reduce_join",
+        "tf.confusion_matrix",
+        "tf.math.confusion_matrix",
+        "tf.math.in_top_k",
+        "tf.nn.depth_to_space",
+        "tf.nn.embedding_lookup",
+        "tf.nn.embedding_lookup_sparse",
+        "tf.nn.in_top_k",
+        "tf.nn.space_to_depth",
+        "tf.linalg.norm",
+        "tf.norm",
+        "tf.reverse_sequence",
+        "tf.sparse_split",
+    }
+
     # Functions that were reordered should be changed to the new keyword args
     # for safety, if positional arguments are used. If you have reversed the
     # positional arguments yourself, this could do the wrong thing.
-    self.function_reorders = {
-        "tf.convert_to_tensor": ["value", "dtype", "preferred_dtype", "name"],
-        "tf.argmin": ["input", "axis", "output_type", "name"],
-        "tf.argmax": ["input", "axis", "output_type", "name"],
-        "tf.boolean_mask": ["tensor", "mask", "name", "axis"],
-    }
+    self.function_reorders = reorders_v2.reorders
 
     # Specially handled functions.
-    self.function_handle = {}
+    self.function_handle = {
+        "tf.nn.dropout": self._dropout_handler,
+        "tf.gradients": self._colocate_handler("tf.gradients"),
+        "*.minimize": self._colocate_handler("Optimizer.minimize"),
+        "*.compute_gradients":
+            self._colocate_handler("Optimizer.compute_gradients"),
+    }
 
     decay_function_comment = (
-        "ERROR: <function name> has been changed to return a callable instead "
-        "of a tensor when graph building, but its functionality remains "
+        "WARNING: <function name> has been changed to return a callable instead"
+        " of a tensor when graph building, but its functionality remains "
         "unchanged during eager execution (returns a callable like "
         "before). The converter cannot detect and fix this reliably, so "
-        "you need to inspect this usage manually.\n"
+        "this usage has been converted to compat.v1 (even though it may already"
+        " be correct).\n"
     )
 
     # TODO(b/118888586): add default value change to update script.
@@ -79,99 +670,256 @@ class TFAPIChangeSpec(ast_edits.APIChangeSpec):
         "SUM_OVER_BATCH_SIZE.\n"
     )
 
+    assert_return_type_comment = (
+        "WARNING: assert_* functions have been changed to return None, the "
+        "data argument has been removed, and arguments have been reordered."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    assert_rank_comment = (
+        "WARNING: assert_rank_* functions have been changed to return None, and"
+        " the data and summarize arguments have been removed."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    tf_01s_like_no_optimize_comment = (
+        "WARNING: tf.zeros_like and tf.ones_like no longer have the optimize "
+        "argument in TF 2.0 or after (also, `tensor' argument is renamed to "
+        "`input')."
+        "\nThe calls have been converted to compat.v1 for safety (even though "
+        " they may already have been correct)."
+    )
+
+    deprecate_partition_strategy_comment = (
+        "WARNING: `partition_strategy` has been removed from `%s` "
+        " The 'div' strategy is used by default.")
+
     # Function warnings. <function name> placeholder inside warnings will be
     # replaced by function name.
     self.function_warnings = {
-        "tf.train.exponential_decay": decay_function_comment,
-        "tf.train.piecewise_constant": decay_function_comment,
-        "tf.train.polynomial_decay": decay_function_comment,
-        "tf.train.natural_exp_decay": decay_function_comment,
-        "tf.train.inverse_time_decay": decay_function_comment,
-        "tf.train.cosine_decay": decay_function_comment,
-        "tf.train.cosine_decay_restarts": decay_function_comment,
-        "tf.train.linear_cosine_decay": decay_function_comment,
-        "tf.train.noisy_linear_cosine_decay": decay_function_comment,
-        "tf.estimator.LinearClassifier": default_loss_reduction_changed,
+        "tf.assert_greater":
+            assert_return_type_comment,
+        "tf.assert_equal":
+            assert_return_type_comment,
+        "tf.assert_less":
+            assert_return_type_comment,
+        "tf.assert_rank":
+            assert_rank_comment,
+        "tf.cond": "tf.cond no longer takes 'strict'. "
+                   "Now 'strict' defaults to True."
+                   "fn1/fn2 arguments are replaced by true_fn/false_fn.",
+        "tf.debugging.assert_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater":
+            assert_return_type_comment,
+        "tf.debugging.assert_greater_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_integer":
+            assert_return_type_comment,
+        "tf.debugging.assert_less":
+            assert_return_type_comment,
+        "tf.debugging.assert_less_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_near":
+            assert_return_type_comment,
+        "tf.debugging.assert_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_negative":
+            assert_return_type_comment,
+        "tf.debugging.assert_non_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_none_equal":
+            assert_return_type_comment,
+        "tf.debugging.assert_positive":
+            assert_return_type_comment,
+        "tf.debugging.assert_rank":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_at_least":
+            assert_rank_comment,
+        "tf.debugging.assert_rank_in":
+            assert_rank_comment,
+        "tf.device": "tf.device no longer takes function as an argument. "
+                     "'devide_name_or_function' argument has been renamed to "
+                     "'device_name'.",
+        "tf.flags":
+            "tf.flags has been removed, please use the argparse or absl"
+            " module if you need command line parsing.",
+        "tf.train.exponential_decay":
+            decay_function_comment,
+        "tf.train.piecewise_constant_decay":
+            decay_function_comment,
+        "tf.train.polynomial_decay":
+            decay_function_comment,
+        "tf.train.natural_exp_decay":
+            decay_function_comment,
+        "tf.train.inverse_time_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay":
+            decay_function_comment,
+        "tf.train.cosine_decay_restarts":
+            decay_function_comment,
+        "tf.train.linear_cosine_decay":
+            decay_function_comment,
+        "tf.train.noisy_linear_cosine_decay":
+            decay_function_comment,
+        "tf.estimator.LinearClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.LinearRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNLinearCombinedRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNRegressor":
+            default_loss_reduction_changed,
+        "tf.estimator.DNNClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineClassifier":
+            default_loss_reduction_changed,
+        "tf.estimator.BaselineRegressor":
+            default_loss_reduction_changed,
+        "tf.hessians": "tf.hessians no longer takes "
+                       "'colocate_gradients_with_ops' argument. Also, "
+                       "arguments have been reordered so that 'name' is the "
+                       "last argument.",
+        "tf.nn.conv1d":
+            "WARNING: use_cudnn_on_gpu argument has been removed and \"value\""
+            " was renamed to \"input\"",
+        "tf.nn.conv2d":
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
+        "tf.nn.conv2d_backprop_filter":
+            "WARNING: use_cudnn_on_gpu argument has been removed",
+        "tf.nn.conv2d_backprop_input":
+            "WARNING: use_cudnn_on_gpu argument has been removed and "
+            "\"filter\" was renamed to \"filters\"",
+        "tf.nn.erosion2d":
+            "WARNING: <function name> now requires a data_format argument",
+        "tf.nn.nce_loss":
+            deprecate_partition_strategy_comment % "tf.nn.nce_loss",
+        "tf.nn.safe_embedding_lookup_sparse":
+            deprecate_partition_strategy_comment %
+            "tf.nn.safe_embedding_lookup_sparse",
+        "tf.nn.sampled_softmax_loss":
+            deprecate_partition_strategy_comment % "tf.nn.sampled_softmax_loss",
+        "tf.zeros_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.ones_like":
+            tf_01s_like_no_optimize_comment,
+        "tf.nn.embedding_lookup":
+            "WARNING: validate_indices argument has been removed.",
+        "tf.while_loop":
+            "tf.while_loop no longer takes 'return_same_structure' argument. "
+            "'return_same_structure' now defaults to True. Also, 'name'"
+            "argument is now the last argument.",
+        "tf.image.sample_distorted_bounding_box":
+            "tf.image.sample_distorted_bounding_box no longer takes 'seed2' "
+            "argument.",
+        "tf.nn.ctc_beam_search_decoder":
+            "tf.nn.ctc_beam_search_decoder no longer takes 'merge_repeated' "
+            "argument. 'merge_repeated' now defaults to False.",
+        "tf.nn.fractional_avg_pool":
+            "tf.nn.fractional_avg_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.fractional_max_pool":
+            "tf.nn.fractional_max_pool no longer takes 'seed2' and "
+            "'deterministic' arguments. Now it takes a single 'seed' arg. If "
+            "'seed' is zero, the execution is random and deterministic "
+            "otherwise",
+        "tf.nn.softmax_cross_entropy_with_logits":
+            "tf.nn.softmax_cross_entropy_with_logits behavior has changed. "
+            "'labels' needs to be wrapped with tf.stop_gradient to keep the "
+            "old behavior. Also, 'dim' argument has been renamed to 'axis'.",
+        "tf.test.assert_equal_graph_def":
+            "tf.assert_equal_graph_def no longer takes 'checkpoint_v2' "
+            "argument. 'checkpoint_v2' now defaults to True.",
     }
-    # Right now we can't have both a rename and a warning.
+
     self.symbol_renames = {
         name: new_name
         for name, new_name in self.symbol_renames.items()
-        if name not in self.function_warnings and name not in excluded_renames
     }
 
+    export_saved_model_renamed = (
+        "(Manual edit required) Please rename the method export_savedmodel() "
+        "to export_saved_model(). Two things to note:\n\t(1) The argument "
+        "strip_default_attributes has been removed. The function will always "
+        "strip the default attributes from ops. If this breaks your code, "
+        "please switch to tf.compat.v1.estimator.Estimator.\n\t(2) This change "
+        "only effects core estimator. If you are using "
+        "tf.contrib.learn.Estimator, please switch to using core estimator.")
+
+    make_initializable_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_initializable_iterator()` method has been "
+        "removed. If you are using the Estimator API, you can return a dataset "
+        "directly from your input functions without creating an iterator. "
+        "As a last resort, please replace calls to that method on `dataset` "
+        "with a call to "
+        "`tf.compat.v1.data.make_initializable_iterator(dataset)`.")
+
+    make_one_shot_iterator_deprecation = (
+        "(Manual edit required) The "
+        "`tf.data.Dataset.make_one_shot_iterator()` method has been "
+        "removed. If you are using eager execution, you can iterate over "
+        "`dataset` using a Python `for` loop. If you are using the Estimator "
+        "API, you can return a dataset directly from your input functions "
+        "without creating an iterator. As a last resort, please replace calls "
+        "to that method on `dataset` with a call to "
+        "`tf.compat.v1.data.make_one_shot_iterator(dataset)`.")
+
+    # Specify warnings for functions that aren't restricted to the tf.x.y.z
+    # format. This should only be used for methods with unique names, e.g.
+    # export_savedmodel, which is only defined in Estimator objects.
+    self.unrestricted_function_warnings = {
+        "export_savedmodel": export_saved_model_renamed,
+        "make_initializable_iterator": make_initializable_iterator_deprecation,
+        "make_one_shot_iterator": make_one_shot_iterator_deprecation,
+    }
+
+  @staticmethod
+  def _dropout_handler(file_edit_recorder, node):
+    if len(node.args) < 2:
+      comment = ("ERROR: tf.nn.dropout did not take arguments, so automatic "
+                 "transformation was disabled. tf.nn.dropout has changed "
+                 "the semantics of the second argument.")
+      file_edit_recorder.add(
+          comment,
+          node.lineno,
+          node.col_offset,
+          "tf.nn.dropout",
+          "tf.nn.dropout",
+          error="tf.nn.dropout requires manual check.")
+    else:
+      comment = ("WARNING: tf.nn.dropout has changed the semantics of the "
+                 "second argument. Please check the transformation.\n")
+      file_edit_recorder.add(
+          comment,
+          node.args[1].lineno,
+          node.args[1].col_offset,
+          "",
+          "1 - ")
 
-if __name__ == "__main__":
-  parser = argparse.ArgumentParser(
-      formatter_class=argparse.RawDescriptionHelpFormatter,
-      description="""Convert a TensorFlow Python file to 2.0
-
-Simple usage:
-  tf_convert_v2.py --infile foo.py --outfile bar.py
-  tf_convert_v2.py --intree ~/code/old --outtree ~/code/new
-""")
-  parser.add_argument(
-      "--infile",
-      dest="input_file",
-      help="If converting a single file, the name of the file "
-      "to convert")
-  parser.add_argument(
-      "--outfile",
-      dest="output_file",
-      help="If converting a single file, the output filename.")
-  parser.add_argument(
-      "--intree",
-      dest="input_tree",
-      help="If converting a whole tree of files, the directory "
-      "to read from (relative or absolute).")
-  parser.add_argument(
-      "--outtree",
-      dest="output_tree",
-      help="If converting a whole tree of files, the output "
-      "directory (relative or absolute).")
-  parser.add_argument(
-      "--copyotherfiles",
-      dest="copy_other_files",
-      help=("If converting a whole tree of files, whether to "
-            "copy the other files."),
-      type=bool,
-      default=False)
-  parser.add_argument(
-      "--reportfile",
-      dest="report_filename",
-      help=("The name of the file where the report log is "
-            "stored."
-            "(default: %(default)s)"),
-      default="report.txt")
-  args = parser.parse_args()
-
-  upgrade = ast_edits.ASTCodeUpgrader(TFAPIChangeSpec())
-  report_text = None
-  report_filename = args.report_filename
-  files_processed = 0
-  if args.input_file:
-    if not args.output_file:
-      raise ValueError(
-          "--outfile=<output file> argument is required when converting a "
-          "single file.")
-    files_processed, report_text, errors = upgrade.process_file(
-        args.input_file, args.output_file)
-    files_processed = 1
-  elif args.input_tree:
-    if not args.output_tree:
-      raise ValueError(
-          "--outtree=<output directory> argument is required when converting a "
-          "file tree.")
-    files_processed, report_text, errors = upgrade.process_tree(
-        args.input_tree, args.output_tree, args.copy_other_files)
-  else:
-    parser.print_help()
-  if report_text:
-    open(report_filename, "w").write(report_text)
-    print("TensorFlow 2.0 Upgrade Script")
-    print("-----------------------------")
-    print("Converted %d files\n" % files_processed)
-    print("Detected %d errors that require attention" % len(errors))
-    print("-" * 80)
-    print("\n".join(errors))
-    print("\nMake sure to read the detailed log %r\n" % report_filename)
+  @staticmethod
+  def _colocate_handler(name):
+    def _helper(file_edit_recorder, node):
+      for keyword in node.keywords:
+        if keyword.arg == "colocate_gradients_with_ops":
+          # TODO(jhseu): Since ast_edit.py does string replacement, there's no
+          # straightforward way to remove the argument. Try to fix before 2.0 is
+          # final.
+          comment = ("For tf.gradients and tf.Optimizer.minimize, "
+                     "colocate_gradients_with_op has been removed and now "
+                     "defaults to True.")
+          file_edit_recorder.add(
+              comment,
+              node.lineno,
+              node.col_offset,
+              "",
+              "",
+              error="{} requires manual check.".format(name))
+    return _helper
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_main.py b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
new file mode 100644
index 0000000000000000000000000000000000000000..543d0786423f5b3f9bc59895c1325d19b6241cf7
--- /dev/null
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_main.py
@@ -0,0 +1,104 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+"""Upgrader for Python scripts from 1.* TensorFlow to 2.0 TensorFlow."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+
+from tensorflow.tools.compatibility import ast_edits
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+def main():
+  parser = argparse.ArgumentParser(
+      formatter_class=argparse.RawDescriptionHelpFormatter,
+      description="""Convert a TensorFlow Python file to 2.0
+
+Simple usage:
+  tf_upgrade_v2.py --infile foo.py --outfile bar.py
+  tf_upgrade_v2.py --intree ~/code/old --outtree ~/code/new
+""")
+  parser.add_argument(
+      "--infile",
+      dest="input_file",
+      help="If converting a single file, the name of the file "
+      "to convert")
+  parser.add_argument(
+      "--outfile",
+      dest="output_file",
+      help="If converting a single file, the output filename.")
+  parser.add_argument(
+      "--intree",
+      dest="input_tree",
+      help="If converting a whole tree of files, the directory "
+      "to read from (relative or absolute).")
+  parser.add_argument(
+      "--outtree",
+      dest="output_tree",
+      help="If converting a whole tree of files, the output "
+      "directory (relative or absolute).")
+  parser.add_argument(
+      "--copyotherfiles",
+      dest="copy_other_files",
+      help=("If converting a whole tree of files, whether to "
+            "copy the other files."),
+      type=bool,
+      default=True)
+  parser.add_argument(
+      "--reportfile",
+      dest="report_filename",
+      help=("The name of the file where the report log is "
+            "stored."
+            "(default: %(default)s)"),
+      default="report.txt")
+  args = parser.parse_args()
+
+  upgrade = ast_edits.ASTCodeUpgrader(tf_upgrade_v2.TFAPIChangeSpec())
+  report_text = None
+  report_filename = args.report_filename
+  files_processed = 0
+  if args.input_file:
+    if not args.output_file:
+      raise ValueError(
+          "--outfile=<output file> argument is required when converting a "
+          "single file.")
+    files_processed, report_text, errors = upgrade.process_file(
+        args.input_file, args.output_file)
+    files_processed = 1
+  elif args.input_tree:
+    if not args.output_tree:
+      raise ValueError(
+          "--outtree=<output directory> argument is required when converting a "
+          "file tree.")
+    files_processed, report_text, errors = upgrade.process_tree(
+        args.input_tree, args.output_tree, args.copy_other_files)
+  else:
+    parser.print_help()
+  if report_text:
+    open(report_filename, "w").write(report_text)
+    print("TensorFlow 2.0 Upgrade Script")
+    print("-----------------------------")
+    print("Converted %d files\n" % files_processed)
+    print("Detected %d errors that require attention" % len(errors))
+    print("-" * 80)
+    print("\n".join(errors))
+    print("\nMake sure to read the detailed log %r\n" % report_filename)
+
+
+if __name__ == "__main__":
+  main()
diff --git a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
index 9060b1c71f105add64115e0a60bdd16d39343e2a..0fc7a18734219cd0216816873768dd9dada16cc5 100644
--- a/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
+++ b/tensorflow/tools/compatibility/tf_upgrade_v2_test.py
@@ -17,15 +17,90 @@
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
+
+import inspect
 import os
 import tempfile
+
 import six
+import tensorflow as tf
+# OSS TF V2 import placeholder.
+
 from tensorflow.python.framework import test_util
 from tensorflow.python.platform import test as test_lib
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
 from tensorflow.tools.compatibility import ast_edits
 from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = set()
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.update(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.update(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def get_symbol_for_name(root, name):
+  name_parts = name.split(".")
+  symbol = root
+  # Iterate starting with second item since 1st item is "tf.".
+  for part in name_parts[1:]:
+    symbol = getattr(symbol, part)
+  return symbol
+
+
+def get_args(symbol):
+  if hasattr(inspect, "signature"):
+    signature = inspect.signature(symbol)
+    # Ignore *args and **kwargs for now.
+    return [param.name for param in signature.parameters.values()
+            if param.kind == param.POSITIONAL_OR_KEYWORD]
+  return tf_inspect.getargspec(symbol)[0]
+
+
+def get_func_and_args_from_str(call_str):
+  """Parse call string to get function and argument names.
+
+  Args:
+    call_str: Call string must be in the form:
+              `tf.foo(arg1=val1, arg2=val2, ...)`.
+
+  Returns:
+    (function_name, list of arg names) tuple.
+  """
+  open_paren_index = call_str.find("(")
+  close_paren_index = call_str.rfind(")")
+
+  function_name = call_str[:call_str.find("(")]
+  args = call_str[open_paren_index+1:close_paren_index].split(",")
+  args = [arg.split("=")[0].strip() for arg in args]
+  args = [arg for arg in args if arg]  # filter out empty strings
+  return function_name, args
+
+
 class TestUpgrade(test_util.TensorFlowTestCase):
   """Test various APIs that have been changed in 2.0.
 
@@ -34,6 +109,22 @@ class TestUpgrade(test_util.TensorFlowTestCase):
   work when run with current TensorFlow.
   """
 
+  @classmethod
+  def setUpClass(cls):
+    cls.v2_symbols = {}
+    if not hasattr(tf.compat, "v2"):
+      return
+
+    def symbol_collector(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names_v2 = get_v2_names(attr)
+        for name in api_names_v2:
+          cls.v2_symbols["tf." + name] = attr
+
+    visitor = public_api.PublicAPIVisitor(symbol_collector)
+    traverse.traverse(tf.compat.v2, visitor)
+
   def _upgrade(self, old_file_text):
     in_file = six.StringIO(old_file_text)
     out_file = six.StringIO()
@@ -64,6 +155,199 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, "tf.math.rsqrt(tf.math.log_sigmoid(3.8))\n")
 
+  def testAllAPI(self):
+    if not hasattr(tf.compat, "v2"):
+      return
+
+    # Converts all symbols in the v1 namespace to the v2 namespace, raising
+    # an error if the target of the conversion is not in the v2 namespace.
+    # Please regenerate the renames file or edit any manual renames if this
+    # test fails.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names = get_v1_names(attr)
+        for name in api_names:
+          _, _, _, text = self._upgrade("tf." + name)
+          if (text and
+              not text.startswith("tf.compat.v1") and
+              text not in self.v2_symbols):
+            self.assertFalse(
+                True, "Symbol %s generated from %s not in v2 API" % (
+                    text, name))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testAllAPIV1(self):
+    collect = True
+    v1_symbols = set([])
+
+    # Converts all symbols in the v1 namespace to the v2 namespace, raising
+    # an error if the target of the conversion is not in the v1 namespace.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        api_names = get_v1_names(attr)
+        for name in api_names:
+          if collect:
+            v1_symbols.add("tf." + name)
+          else:
+            _, _, _, text = self._upgrade("tf." + name)
+            if (text and
+                not text.startswith("tf.compat.v1") and
+                not text.startswith("tf.estimator") and
+                text not in v1_symbols):
+              self.assertFalse(
+                  True, "Symbol %s generated from %s not in v1 API" % (
+                      text, name))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+    collect = False
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV1KeywordArgNames(self):
+    all_keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that verifies V1 argument names.
+    def arg_test_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        names_v1 = get_v1_names(attr)
+
+        for name in names_v1:
+          name = "tf.%s" % name
+          if name not in all_keyword_renames:
+            continue
+          arg_names_v1 = tf_inspect.getargspec(attr)[0]
+          keyword_renames = all_keyword_renames[name]
+          self.assertEqual(type(keyword_renames), dict)
+
+          # Assert that v1 function has valid v1 argument names.
+          for from_name, _ in keyword_renames.items():
+            self.assertIn(
+                from_name, arg_names_v1,
+                "%s not found in %s arguments: %s" %
+                (from_name, name, str(arg_names_v1)))
+
+    visitor = public_api.PublicAPIVisitor(arg_test_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testV2KeywordArgNames(self):
+    # This test converts a call of the form:
+    # tf.foo(arg1=0, arg2=1, ...)
+    # to 2.0. Then, checks that converted function has valid argument names.
+    if not hasattr(tf.compat, "v2"):
+      return
+    v2_arg_exceptions = {
+        "verify_shape_is_now_always_true",
+        # These arguments should not be used, they just specify
+        # that a function takes named arguments.
+        "keyword_required",
+        "_sentinel",
+    }
+    v1_name_exceptions = {
+        "tf.print",  # requires print_function import
+    }
+    function_warnings = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_warnings)
+    function_handles = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_handle)
+    keyword_renames = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_keyword_renames)
+
+    # Visitor that converts to V2 and checks V2 argument names.
+    def conversion_visitor(unused_path, unused_parent, children):
+      for child in children:
+        _, attr = tf_decorator.unwrap(child[1])
+        if not tf_inspect.isfunction(attr):
+          continue
+        names_v1 = get_v1_names(attr)
+        arg_names_v1 = get_args(attr)
+
+        for name in names_v1:
+          tf_name = "tf.%s" % name
+          if tf_name in function_warnings or tf_name in function_handles:
+            continue  # These require manual change
+          if tf_name in v1_name_exceptions:
+            continue
+          # Assert that arg names after converting to v2 are present in
+          # v2 function.
+          # 1. First, create an input of the form:
+          #    tf.foo(arg1=val1, arg2=val2, ...)
+          args = ",".join(
+              ["%s=%d" % (from_name, from_index)
+               for from_index, from_name in enumerate(arg_names_v1)])
+          text_input = "%s(%s)" % (tf_name, args)
+          # 2. Convert the input to V2.
+          _, _, _, text = self._upgrade(text_input)
+          new_function_name, new_args = get_func_and_args_from_str(text)
+          if new_function_name == "tf.compat.v1.%s" % name:
+            if tf_name in keyword_renames:
+              # If we rename arguments, new function must be available in 2.0.
+              # We should not be using compat.v1 in this case.
+              self.assertFalse(
+                  "Function '%s' is not in 2.0 when converting\n%s\nto\n%s" %
+                  (new_function_name, text_input, text))
+            continue
+          # 3. Verify V2 function and arguments.
+          args_v2 = get_args(self.v2_symbols[new_function_name])
+          args_v2.extend(v2_arg_exceptions)
+          for new_arg in new_args:
+            self.assertIn(
+                new_arg, args_v2,
+                "Invalid argument '%s' in 2.0 when converting\n%s\nto\n%s.\n"
+                "Supported arguments: %s" % (
+                    new_arg, text_input, text, str(args_v2)))
+
+    visitor = public_api.PublicAPIVisitor(conversion_visitor)
+    visitor.do_not_descend_map["tf"].append("contrib")
+    visitor.private_map["tf.compat"] = ["v1", "v2"]
+    traverse.traverse(tf.compat.v1, visitor)
+
+  def testReorderFileNeedsUpdate(self):
+    reordered_function_names = (
+        tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+    function_reorders = (
+        tf_upgrade_v2.TFAPIChangeSpec().function_reorders)
+
+    added_names_message = """Some function names in
+self.reordered_function_names are not in reorders_v2.py.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    removed_names_message = """%s in self.reorders_v2 does not match
+any name in self.reordered_function_names.
+Please run the following commands to update reorders_v2.py:
+bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+    self.assertTrue(
+        reordered_function_names.issubset(function_reorders),
+        added_names_message)
+    # function_reorders should contain reordered_function_names
+    # and their TensorFlow V1 aliases.
+    for name in function_reorders:
+      # get other names for this function
+      attr = get_symbol_for_name(tf.compat.v1, name)
+      _, attr = tf_decorator.unwrap(attr)
+      v1_names = get_v1_names(attr)
+      self.assertTrue(v1_names)
+      v1_names = ["tf.%s" % n for n in v1_names]
+      # check if any other name is in
+      self.assertTrue(
+          any(n in reordered_function_names for n in v1_names),
+          removed_names_message % name)
+
   def testRenameConstant(self):
     text = "tf.MONOLITHIC_BUILD\n"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
@@ -72,6 +356,16 @@ class TestUpgrade(test_util.TensorFlowTestCase):
     _, unused_report, unused_errors, new_text = self._upgrade(text)
     self.assertEqual(new_text, "some_call(tf.sysconfig.MONOLITHIC_BUILD)\n")
 
+  def testRenameArgs(self):
+    text = ("tf.nn.pool(input_a, window_shape_a, pooling_type_a, padding_a, "
+            "dilation_rate_a, strides_a, name_a, data_format_a)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text,
+                     ("tf.nn.pool(input=input_a, window_shape=window_shape_a,"
+                      " pooling_type=pooling_type_a, padding=padding_a, "
+                      "dilations=dilation_rate_a, strides=strides_a, "
+                      "name=name_a, data_format=data_format_a)\n"))
+
   def testReorder(self):
     text = "tf.boolean_mask(a, b, c, d)\n"
     _, unused_report, unused_errors, new_text = self._upgrade(text)
@@ -79,7 +373,7 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                      "tf.boolean_mask(tensor=a, mask=b, name=c, axis=d)\n")
 
   def testLearningRateDecay(self):
-    for decay in ["tf.train.exponential_decay", "tf.train.piecewise_constant",
+    for decay in ["tf.train.exponential_decay",
                   "tf.train.polynomial_decay", "tf.train.natural_exp_decay",
                   "tf.train.inverse_time_decay", "tf.train.cosine_decay",
                   "tf.train.cosine_decay_restarts",
@@ -87,18 +381,362 @@ class TestUpgrade(test_util.TensorFlowTestCase):
                   "tf.train.noisy_linear_cosine_decay"]:
 
       text = "%s(a, b)\n" % decay
-      _, report, errors, new_text = self._upgrade(text)
-      self.assertEqual(text, new_text)
+      _, report, errors, _ = self._upgrade(text)
       self.assertEqual(errors, ["test.py:1: %s requires manual check." % decay])
       self.assertIn("%s has been changed" % decay, report)
 
-  def testEstimatorLossReductionChangege(self):
-    text = "tf.estimator.LinearClassifier(a, b)\n"
-    _, report, errors, new_text = self._upgrade(text)
+  def testPiecewiseDecay(self):
+    text = "tf.train.piecewise_constant_decay(a, b)\n"
+    _, report, errors, _ = self._upgrade(text)
+    self.assertEqual(
+        errors,
+        ["test.py:1: tf.train.piecewise_constant_decay requires manual check."])
+    self.assertIn("tf.train.piecewise_constant_decay has been changed", report)
+
+  def testEstimatorLossReductionChange(self):
+    classes = [
+        "LinearClassifier", "LinearRegressor", "DNNLinearCombinedClassifier",
+        "DNNLinearCombinedRegressor", "DNNRegressor", "DNNClassifier",
+        "BaselineClassifier", "BaselineRegressor"
+    ]
+    for c in classes:
+      ns = "tf.estimator." + c
+      text = ns + "(a, b)"
+      _, report, errors, new_text = self._upgrade(text)
+      self.assertEqual(text, new_text)
+      self.assertEqual(errors, ["test.py:1: %s requires manual check." % ns])
+      self.assertIn("loss_reduction has been changed", report)
+
+  def testDropout(self):
+    text = "tf.nn.dropout(x, keep_prob, name=\"foo\")\n"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(
+        new_text,
+        "tf.nn.dropout(x, 1 - keep_prob, name=\"foo\")\n",
+    )
+
+    text = "tf.nn.dropout(x)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)
+    self.assertEqual(
+        errors,
+        ["test.py:1: tf.nn.dropout requires manual check."]
+    )
+
+  def testCountNonZeroChanges(self):
+    text = (
+        "tf.math.count_nonzero(input_tensor=input, dtype=dtype, name=name, "
+        "reduction_indices=axis, keep_dims=keepdims)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.math.count_nonzero(input=input, dtype=dtype, name=name, "
+        "axis=axis, keepdims=keepdims)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testRandomMultinomialToRandomCategorical(self):
+    text = (
+        "tf.random.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.multinomial(logits, samples, seed, name, output_dtype)\n"
+        )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.random.categorical(logits=logits, num_samples=samples, seed=seed, "
+        "name=name, dtype=output_dtype)\n"
+        )
+    self.assertEqual(new_text, expected_text)
+
+  def testConvolutionOpUpdate(self):
+    text = (
+        "tf.nn.convolution(input, filter, padding, strides, dilation_rate, "
+        "name, data_format)"
+    )
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    expected_text = (
+        "tf.nn.convolution(input=input, filters=filter, padding=padding, "
+        "strides=strides, dilations=dilation_rate, name=name, "
+        "data_format=data_format)"
+    )
+    self.assertEqual(new_text, expected_text)
+
+  def testColocateGradientsWithOps(self):
+    text = "tf.gradients(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "tf.gradients(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, ["test.py:1: tf.gradients requires manual check."])
+
+    text = "optimizer.minimize(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "optimizer.minimize(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
     self.assertEqual(text, new_text)
-    self.assertEqual(errors, ["test.py:1: %s requires manual check."
-                              % "tf.estimator.LinearClassifier"])
-    self.assertIn("loss_reduction has been changed", report)
+    self.assertEqual(errors,
+                     ["test.py:1: Optimizer.minimize requires manual check."])
+
+    text = "optimizer.compute_gradients(a, foo=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors, [])
+
+    text = "optimizer.compute_gradients(a, colocate_gradients_with_ops=False)\n"
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(text, new_text)
+    self.assertEqual(errors,
+                     ["test.py:1: Optimizer.compute_gradients "
+                      "requires manual check."])
+
+  def testExportSavedModelRename(self):
+    text = "self.est.export_savedmodel(path)"
+    _, report, unused_errors, unused_new_text = self._upgrade(text)
+    self.assertIn(
+        "rename the method export_savedmodel() to export_saved_model()",
+        report)
+
+  def testArgmin(self):
+    text = "tf.argmin(input, name=n, dimension=1, output_type=type)"
+    expected_text = "tf.argmin(input=input, name=n, axis=1, output_type=type)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.argmin(input, 0)"
+    expected_text = "tf.argmin(input=input, axis=0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.arg_min(input, 0)"
+    expected_text = "tf.argmin(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testArgmax(self):
+    text = "tf.argmax(input, name=n, dimension=1, output_type=type)"
+    expected_text = "tf.argmax(input=input, name=n, axis=1, output_type=type)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.argmax(input, 0)"
+    expected_text = "tf.argmax(input=input, axis=0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.arg_max(input, 0)"
+    expected_text = "tf.argmax(input, 0)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testBatchToSpace(self):
+    text = "tf.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.batch_to_space(input, crops, block_size, name)"
+    expected_text = (
+        "tf.batch_to_space(input=input, crops=crops, block_shape=block_size, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.manip.batch_to_space_nd(input, block_shape, crops, name)"
+    expected_text = "tf.batch_to_space(input, block_shape, crops, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testExtractImagePatches(self):
+    text = (
+        "tf.extract_image_patches(images, ksizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    expected_text = (
+        "tf.image.extract_image_patches(images, sizes=ksizes, strides=strides,"
+        "rates=rates, padding=padding, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testStatelessMultinomial(self):
+    text = (
+        "tf.random.stateless_multinomial(logits, num_samples, seed, "
+        "output_dtype=dtype, name=name)")
+    expected_text = (
+        "tf.random.stateless_categorical(logits, num_samples, seed, "
+        "dtype=dtype, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSoftMaxCrossEntropyWithLogitsV2(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits_v2(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, axis=2)")
+    _, unused_report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    self.assertFalse(errors)
+
+  def testSoftMaxCrossEntropyWithLogits(self):
+    text = "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)"
+    expected_text = (
+        "tf.nn.softmax_cross_entropy_with_logits(labels, logits, dim=2)")
+    _, report, errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits requires manual check.",
+        errors[0])
+    self.assertIn(
+        "tf.nn.softmax_cross_entropy_with_logits behavior has changed. ",
+        report)
+
+  def testSparseMatmul(self):
+    text = ("tf.sparse_matmul(a, b, c, d, e, f, g)\n")
+    expected_text = ("tf.linalg.matmul(a=a, b=b, transpose_a=c, transpose_b=d, "
+                     "a_is_sparse=e, b_is_sparse=f, name=g)\n")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testWeightedMoments(self):
+    text = "tf.nn.weighted_moments(x, axes, freq, name, kd)"
+    expected_text = (
+        "tf.nn.weighted_moments(x=x, axes=axes, frequency_weights=freq, "
+        "name=name, keepdims=kd)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseAdd(self):
+    text = "tf.sparse.add(a, b, t)"
+    expected_text = "tf.sparse.add(a=a, b=b, threshold=t)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSparseConcat(self):
+    text = "tf.sparse.concat(ax, inp, name, exp, concat)"
+    expected_text = (
+        "tf.sparse.concat(axis=ax, sp_inputs=inp, name=name, "
+        "expand_nonconcat_dims=exp, axis=concat)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSeparableConv2D(self):
+    text = "tf.nn.separable_conv2d(inp, d, pt, strides, pad, rate, name, fmt)"
+    expected_text = (
+        "tf.nn.separable_conv2d(input=inp, depthwise_filter=d, "
+        "pointwise_filter=pt, strides=strides, padding=pad, "
+        "dilations=rate, name=name, data_format=fmt)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpacetoBatch(self):
+    text = "tf.space_to_batch_nd(input, shape, paddings, name)"
+    expected_text = "tf.space_to_batch(input, shape, paddings, name)"
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = "tf.nn.space_to_batch(input, paddings, block_size, name)"
+    expected_text = (
+        "tf.space_to_batch(input=input, paddings=paddings, "
+        "block_shape=block_size, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testInTopK(self):
+    text = "tf.math.in_top_k(a, b, c, n)"
+    expected_text = (
+        "tf.math.in_top_k(predictions=a, targets=b, k=c, name=n)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testDepthToSpace(self):
+    text = "tf.nn.depth_to_space(input, block_size, name, data_format)"
+    expected_text = (
+        "tf.nn.depth_to_space(input=input, block_size=block_size, "
+        "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookup(self):
+    text = ("tf.nn.embedding_lookup(params, ids, partition_strategy, name, "
+            "validate_indices, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup(params=params, ids=ids, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "validate_indices=validate_indices, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testEmbeddingLookupSparse(self):
+    text = ("tf.nn.embedding_lookup_sparse(params, sp_ids, sp_weights, "
+            "partition_strategy, name, combiner, max_norm)")
+    expected_text = ("tf.nn.embedding_lookup_sparse(params=params, "
+                     "sp_ids=sp_ids, sp_weights=sp_weights, "
+                     "partition_strategy=partition_strategy, name=name, "
+                     "combiner=combiner, max_norm=max_norm)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testNnInTopK(self):
+    text = "tf.nn.in_top_k(predictions, targets, k, name)"
+    expected_text = ("tf.nn.in_top_k(predictions=predictions, "
+                     "targets=targets, k=k, name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testSpaceToDepth(self):
+    text = "tf.nn.space_to_depth(input, block_size, name, data_format)"
+    expected_text = ("tf.nn.space_to_depth(input=input, block_size=block_size, "
+                     "name=name, data_format=data_format)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+  def testPrint(self):
+    # tf.print() cannot be parsed unless we import print_function
+    text = """from __future__ import print_function
+tf.print()
+tf.print('abc')
+"""
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, text)  # Text should stay the same
+
+  def testSparseSplit(self):
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, axis=axis, "
+        "name=name)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse_split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
+
+    text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, split_dim=axis)")
+    expected_text = (
+        "tf.sparse.split(sp_input=sp_input, num_split=num_split, "
+        "name=name, axis=axis)")
+    _, unused_report, unused_errors, new_text = self._upgrade(text)
+    self.assertEqual(new_text, expected_text)
 
 
 class TestUpgradeFiles(test_util.TensorFlowTestCase):
diff --git a/tensorflow/tools/compatibility/update/BUILD b/tensorflow/tools/compatibility/update/BUILD
index 0ee4550815568dececd4e88ca520743b8f81948f..75bb0cfd2b7569c899fb72aa5ac9f4e608c3decc 100644
--- a/tensorflow/tools/compatibility/update/BUILD
+++ b/tensorflow/tools/compatibility/update/BUILD
@@ -12,5 +12,20 @@ py_binary(
         "//tensorflow/python:no_contrib",
         "//tensorflow/tools/common:public_api",
         "//tensorflow/tools/common:traverse",
+        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
+    ],
+)
+
+py_binary(
+    name = "generate_v2_reorders_map",
+    srcs = ["generate_v2_reorders_map.py"],
+    srcs_version = "PY2AND3",
+    deps = [
+        "//tensorflow:tensorflow_py",
+        "//tensorflow/python:lib",
+        "//tensorflow/python:no_contrib",
+        "//tensorflow/tools/common:public_api",
+        "//tensorflow/tools/common:traverse",
+        "//tensorflow/tools/compatibility:tf_upgrade_v2_lib",
     ],
 )
diff --git a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
index 43aa8e057e12b179b17f4b7b4667cd87d17ea90d..19ad6c3a2a5c723cbbff2c76c8bfe6517ca4a4f0 100644
--- a/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
+++ b/tensorflow/tools/compatibility/update/generate_v2_renames_map.py
@@ -32,6 +32,7 @@ from tensorflow.python.util import tf_decorator
 from tensorflow.python.util import tf_export
 from tensorflow.tools.common import public_api
 from tensorflow.tools.common import traverse
+from tensorflow.tools.compatibility import tf_upgrade_v2
 
 
 _OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/renames_v2.py'
@@ -71,6 +72,50 @@ _TENSORFLOW_CONSTANTS_ATTR_V1 = (
 _TENSORFLOW_CONSTANTS_ATTR = (
     tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
 
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+_ESTIMATOR_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
+_ESTIMATOR_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def get_v1_constants(module):
+  constants_v1 = []
+  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
+    constants_v1.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1))
+  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1):
+    constants_v1.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR_V1))
+  return constants_v1
+
+
+def get_v2_constants(module):
+  constants_v2 = []
+  if hasattr(module, _TENSORFLOW_CONSTANTS_ATTR):
+    constants_v2.extend(getattr(module, _TENSORFLOW_CONSTANTS_ATTR))
+  if hasattr(module, _ESTIMATOR_CONSTANTS_ATTR):
+    constants_v2.extend(getattr(module, _ESTIMATOR_CONSTANTS_ATTR))
+  return constants_v2
+
 
 def get_canonical_name(v2_names, v1_name):
   if v2_names:
@@ -78,18 +123,34 @@ def get_canonical_name(v2_names, v1_name):
   return 'compat.v1.%s' % v1_name
 
 
+def get_all_v2_names():
+  """Get a set of function/class names available in TensorFlow 2.0."""
+  v2_names = set()  # All op names in TensorFlow 2.0
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects TF 2.0 names."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      api_names_v2 = get_v2_names(attr)
+      for name in api_names_v2:
+        v2_names.add(name)
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  traverse.traverse(tf.compat.v2, visitor)
+  return v2_names
+
+
 def collect_constant_renames():
   """Looks for constants that need to be renamed in TF 2.0.
 
   Returns:
-    List of tuples of the form (current name, new name).
+    Set of tuples of the form (current name, new name).
   """
   renames = set()
   for module in sys.modules.values():
-    if not hasattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1):
-      continue
-    constants_v1_list = getattr(module, _TENSORFLOW_CONSTANTS_ATTR_V1)
-    constants_v2_list = getattr(module, _TENSORFLOW_CONSTANTS_ATTR)
+    constants_v1_list = get_v1_constants(module)
+    constants_v2_list = get_v2_constants(module)
 
     # _tf_api_constants attribute contains a list of tuples:
     # (api_names_list, constant_name)
@@ -115,26 +176,21 @@ def collect_function_renames():
   """Looks for functions/classes that need to be renamed in TF 2.0.
 
   Returns:
-    List of tuples of the form (current name, new name).
+    Set of tuples of the form (current name, new name).
   """
   # Set of rename lines to write to output file in the form:
   #   'tf.deprecated_name': 'tf.canonical_name'
   renames = set()
-  v2_names = set()  # All op names in TensorFlow 2.0
 
   def visit(unused_path, unused_parent, children):
     """Visitor that collects rename strings to add to rename_line_set."""
     for child in children:
       _, attr = tf_decorator.unwrap(child[1])
-      if not hasattr(attr, '__dict__'):
-        continue
-      api_names_v1 = attr.__dict__.get(_TENSORFLOW_API_ATTR_V1, [])
-      api_names_v2 = attr.__dict__.get(_TENSORFLOW_API_ATTR, [])
+      api_names_v1 = get_v1_names(attr)
+      api_names_v2 = get_v2_names(attr)
       deprecated_api_names = set(api_names_v1) - set(api_names_v2)
       for name in deprecated_api_names:
         renames.add((name, get_canonical_name(api_names_v2, name)))
-      for name in api_names_v2:
-        v2_names.add(name)
 
   visitor = public_api.PublicAPIVisitor(visit)
   visitor.do_not_descend_map['tf'].append('contrib')
@@ -144,8 +200,9 @@ def collect_function_renames():
   # It is possible that a different function is exported with the
   # same name. For e.g. when creating a different function to
   # rename arguments. Exclude it from renames in this case.
-  renames = {name: new_name for name, new_name in renames.items()
-             if name not in v2_names}
+  v2_names = get_all_v2_names()
+  renames = set((name, new_name) for name, new_name in renames
+                if name not in v2_names)
   return renames
 
 
@@ -163,12 +220,15 @@ def update_renames_v2(output_file_path):
   function_renames = collect_function_renames()
   constant_renames = collect_constant_renames()
   all_renames = function_renames.union(constant_renames)
+  manual_renames = set(
+      tf_upgrade_v2.TFAPIChangeSpec().manual_symbol_renames.keys())
 
   # List of rename lines to write to output file in the form:
   #   'tf.deprecated_name': 'tf.canonical_name'
   rename_lines = [
       get_rename_line(name, canonical_name)
-      for name, canonical_name in all_renames]
+      for name, canonical_name in all_renames
+      if 'tf.' + name not in manual_renames]
   renames_file_text = '%srenames = {\n%s\n}\n' % (
       _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
   file_io.write_string_to_file(output_file_path, renames_file_text)
diff --git a/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
new file mode 100644
index 0000000000000000000000000000000000000000..63541771bf36fb243ae241fbf1b4c4a83cf19fd7
--- /dev/null
+++ b/tensorflow/tools/compatibility/update/generate_v2_reorders_map.py
@@ -0,0 +1,166 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+"""Script for updating tensorflow/tools/compatibility/reorders_v2.py.
+
+To update reorders_v2.py, run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+"""
+# pylint: enable=line-too-long
+import tensorflow as tf
+
+# This import is needed so that TensorFlow python modules are in sys.modules.
+from tensorflow import python as tf_python  # pylint: disable=unused-import
+from tensorflow.python.lib.io import file_io
+from tensorflow.python.platform import app
+from tensorflow.python.util import tf_decorator
+from tensorflow.python.util import tf_export
+from tensorflow.python.util import tf_inspect
+from tensorflow.tools.common import public_api
+from tensorflow.tools.common import traverse
+from tensorflow.tools.compatibility import tf_upgrade_v2
+
+
+_OUTPUT_FILE_PATH = 'third_party/tensorflow/tools/compatibility/reorders_v2.py'
+_FILE_HEADER = """# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+# pylint: disable=line-too-long
+\"\"\"List of renames to apply when converting from TF 1.0 to TF 2.0.
+
+THIS FILE IS AUTOGENERATED: To update, please run:
+  bazel build tensorflow/tools/compatibility/update:generate_v2_reorders_map
+  bazel-bin/tensorflow/tools/compatibility/update/generate_v2_reorders_map
+This file should be updated whenever a function is added to
+self.reordered_function_names in tf_upgrade_v2.py.
+\"\"\"
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+"""
+
+_TENSORFLOW_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].names)
+_TENSORFLOW_API_ATTR = tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].names
+_TENSORFLOW_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.TENSORFLOW_API_NAME].constants)
+_TENSORFLOW_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.TENSORFLOW_API_NAME].constants)
+
+_ESTIMATOR_API_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].names)
+_ESTIMATOR_API_ATTR = tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].names
+_ESTIMATOR_CONSTANTS_ATTR_V1 = (
+    tf_export.API_ATTRS_V1[tf_export.ESTIMATOR_API_NAME].constants)
+_ESTIMATOR_CONSTANTS_ATTR = (
+    tf_export.API_ATTRS[tf_export.ESTIMATOR_API_NAME].constants)
+
+
+def get_v1_names(symbol):
+  names_v1 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _TENSORFLOW_API_ATTR_V1))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR_V1):
+    names_v1.extend(getattr(symbol, _ESTIMATOR_API_ATTR_V1))
+  return names_v1
+
+
+def get_v2_names(symbol):
+  names_v2 = []
+  if hasattr(symbol, _TENSORFLOW_API_ATTR):
+    names_v2.extend(getattr(symbol, _TENSORFLOW_API_ATTR))
+  if hasattr(symbol, _ESTIMATOR_API_ATTR):
+    names_v2.extend(getattr(symbol, _ESTIMATOR_API_ATTR))
+  return list(names_v2)
+
+
+def collect_function_arg_names(function_names):
+  """Determines argument names for reordered function signatures.
+
+  Args:
+    function_names: Functions to collect arguments for.
+
+  Returns:
+    Dictionary mapping function name to its arguments.
+  """
+  # Map from reordered function name to its arguments.
+  function_to_args = {}
+
+  def visit(unused_path, unused_parent, children):
+    """Visitor that collects arguments for reordered functions."""
+    for child in children:
+      _, attr = tf_decorator.unwrap(child[1])
+      api_names_v1 = get_v1_names(attr)
+      api_names_v1 = ['tf.%s' % name for name in api_names_v1]
+      matches_function_names = any(
+          name in function_names for name in api_names_v1)
+      if matches_function_names:
+        arg_list = tf_inspect.getargspec(attr)[0]
+        for name in api_names_v1:
+          function_to_args[name] = arg_list
+
+  visitor = public_api.PublicAPIVisitor(visit)
+  visitor.do_not_descend_map['tf'].append('contrib')
+  visitor.do_not_descend_map['tf.compat'] = ['v1', 'v2']
+  traverse.traverse(tf, visitor)
+
+  return function_to_args
+
+
+def get_reorder_line(name, arg_list):
+  return '    \'%s\': %s' % (name, str(arg_list))
+
+
+def update_reorders_v2(output_file_path):
+  """Writes a Python dictionary mapping function name to argument order.
+
+  Args:
+    output_file_path: File path to write output to. Any existing contents
+      would be replaced.
+  """
+  reordered_function_names = (
+      tf_upgrade_v2.TFAPIChangeSpec().reordered_function_names)
+
+  all_reorders = collect_function_arg_names(reordered_function_names)
+
+  # List of reorder lines to write to output file in the form:
+  #   'tf.function_name': ['arg1', 'arg2', ...]
+  rename_lines = [
+      get_reorder_line(name, arg_names)
+      for name, arg_names in all_reorders.items()]
+  renames_file_text = '%sreorders = {\n%s\n}\n' % (
+      _FILE_HEADER, ',\n'.join(sorted(rename_lines)))
+  file_io.write_string_to_file(output_file_path, renames_file_text)
+
+
+def main(unused_argv):
+  update_reorders_v2(_OUTPUT_FILE_PATH)
+
+
+if __name__ == '__main__':
+  app.run(main=main)
diff --git a/tensorflow/tools/docker/Dockerfile b/tensorflow/tools/docker/Dockerfile
index 205128ad58a39d113d4f3414e009f547a5cb38af..6676de02a41dcd53e92b905e98e811cd71833e20 100644
--- a/tensorflow/tools/docker/Dockerfile
+++ b/tensorflow/tools/docker/Dockerfile
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
@@ -8,7 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python \
diff --git a/tensorflow/tools/docker/Dockerfile.devel b/tensorflow/tools/docker/Dockerfile.devel
index a3893a2713d6b265d7e3464f34753a00fd1c97c2..c256dd364ef5a29ba7f8a2afa6e772ee9c566cb8 100644
--- a/tensorflow/tools/docker/Dockerfile.devel
+++ b/tensorflow/tools/docker/Dockerfile.devel
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Craig Citro <craigcitro@google.com>"
 
@@ -9,7 +9,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python-dev \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl b/tensorflow/tools/docker/Dockerfile.devel-mkl
index bd2883ddba0a65014a44266628575211f7c11497..2341c0e8ccfc5f88356ed38f33cca356c207214f 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         libssl-dev \
         pkg-config \
diff --git a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
index df084e029c856d06a5e562896f66e332d69a7940..5e24617b2190f1d564d63f4c9be6321aa03cd8fb 100755
--- a/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.devel-mkl-horovod
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
@@ -16,7 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libcurl3-dev \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python-dev \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl b/tensorflow/tools/docker/Dockerfile.mkl
index ac41cffe4bcc4742d7cc9256b11ceb0964515b84..dad27697fa142ac80d7237510b8b7d7ebda2b621 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl
+++ b/tensorflow/tools/docker/Dockerfile.mkl
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Clayne Robison <clayne.b.robison@intel.com>"
 
@@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         ${PYTHON} \
diff --git a/tensorflow/tools/docker/Dockerfile.mkl-horovod b/tensorflow/tools/docker/Dockerfile.mkl-horovod
index 0432cd5e80c19e0807fa8a70d6e6b6cc5a201a9d..19dc45c62cbc79bf931d89f275b5a7816e9924c8 100755
--- a/tensorflow/tools/docker/Dockerfile.mkl-horovod
+++ b/tensorflow/tools/docker/Dockerfile.mkl-horovod
@@ -1,4 +1,4 @@
-FROM ubuntu:16.04
+FROM ubuntu:18.04
 
 LABEL maintainer="Cong Xu <cong.xu@intel.com>"
 
@@ -17,7 +17,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         curl \
         libfreetype6-dev \
         libhdf5-serial-dev \
-        libpng12-dev \
+        libpng-dev \
         libzmq3-dev \
         pkg-config \
         python \
diff --git a/tensorflow/tools/dockerfiles/.gitignore b/tensorflow/tools/dockerfiles/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..d7efa472a92b23dfde1277acfe4b543f14842678
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/.gitignore
@@ -0,0 +1 @@
+dockerfiles/*.temp.Dockerfile
diff --git a/tensorflow/tools/dockerfiles/README.md b/tensorflow/tools/dockerfiles/README.md
index 7c8ca1d1c7a21989c616c7ed93ed737f7664b585..07bfd5960e686d1198548c080df9c733955a2903 100644
--- a/tensorflow/tools/dockerfiles/README.md
+++ b/tensorflow/tools/dockerfiles/README.md
@@ -1,9 +1,13 @@
 # TensorFlow Dockerfiles
 
-This directory houses TensorFlow's Dockerfiles. **DO NOT EDIT THE DOCKERFILES
-MANUALLY!** They are maintained by `assembler.py`, which builds Dockerfiles from
-the files in `partials/` and the rules in `spec.yml`. See [the Contributing
-section](#contributing) for more information.
+This directory houses TensorFlow's Dockerfiles and the infrastructure used to
+create and deploy them to
+[Docker Hub](https://hub.docker.com/r/tensorflow/tensorflow).
+
+**DO NOT EDIT THE DOCKERFILES/ DIRECTORY MANUALLY!** The files within are
+maintained by `assembler.py`, which builds Dockerfiles from the files in
+`partials/` and the rules in `spec.yml`. See
+[the Contributing section](#contributing) for more information.
 
 These Dockerfiles are planned to replace the Dockerfiles used to generate
 [TensorFlow's official Docker images](https://hub.docker.com/r/tensorflow/tensorflow).
@@ -20,10 +24,10 @@ $ docker build -f ./dockerfiles/cpu.Dockerfile -t tf .
 Each Dockerfile has its own set of available `--build-arg`s which are documented
 in the Dockerfile itself.
 
-## Running
+## Running Locally Built Images
 
 After building the image with the tag `tf` (for example), use `docker run` to
-run the images. Examples are below.
+run the images.
 
 Note for new Docker users: the `-v` and `-u` flags share directories between
 the Docker container and your machine, and very important. Without
@@ -42,8 +46,10 @@ $ docker run -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 # GPU-based images (set up nvidia-docker2 first)
 $ docker run --runtime=nvidia -u $(id -u):$(id -g) -v $(pwd):/my-devel -it tf
 
-# Images with Jupyter run on port 8888, and needs a volume for notebooks
-$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(pwd):/notebooks -it tf
+# Images with Jupyter run on port 8888 and need a volume for your notebooks
+# You can change $(PWD) to the full path to a directory if your notebooks
+# live outside the current directory.
+$ docker run --user $(id -u):$(id -g) -p 8888:8888 -v $(PWD):/tf/notebooks -it tf
 ```
 
 These images do not come with the TensorFlow source code -- but the development
@@ -60,11 +66,32 @@ You can use the `Dockerfile` in this directory to build an editing environment
 that has all of the Python dependencies you'll need:
 
 ```bash
-$ docker build -t tf-assembler -f assembler.Dockerfile .
+# Build the tools-helper image so you can run the assembler
+$ docker build -t tf-tools -f tools.Dockerfile .
 
 # Set --user to set correct permissions on generated files
-$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-assembler bash 
+$ docker run --user $(id -u):$(id -g) -it -v $(pwd):/tf tf-tools bash
+
+# Next you can make a handy alias depending on what you're doing. When building
+# Docker images, you need to run as root with docker.sock mounted so that the
+# container can run Docker commands. When assembling Dockerfiles, though, you'll
+# want to run as your user so that new files have the right permissions.
+
+# If you're BUILDING OR DEPLOYING DOCKER IMAGES, run as root with docker.sock:
+$ alias asm_images="docker run --rm -v $(pwd):/tf -v /var/run/docker.sock:/var/run/docker.sock tf-tools python3 assembler.py "
+
+# If you're REBUILDING OR ADDING DOCKERFILES, remove docker.sock and add -u:
+$ alias asm_dockerfiles="docker run --rm -u $(id -u):$(id -g) -v $(pwd):/tf tf-tools python3 assembler.py "
+
+# Check flags
+$ asm_dockerfiles --help
+
+# Assemble all of the Dockerfiles
+$ asm_dockerfiles --release ubuntu-dockerfiles --construct_dockerfiles
+
+# Build all of the "nightly" images on your local machine:
+$ asm_images --release nightly --build_images
 
-# In the container...
-/tf $ python3 ./assembler.py -o dockerfiles -s spec.yml
+# Build version release for version 99.0, except "gpu" tags:
+$ asm_images --release versioned --arg _TAG_PREFIX=99.0 --build_images --exclude_tags_matching '*.gpu.*'
 ```
diff --git a/tensorflow/tools/dockerfiles/assembler.py b/tensorflow/tools/dockerfiles/assembler.py
index 9cdd9bb0cb0841e95d8d334293026207f093ab90..67a0320241d273bbb7a2439b2e09723905db0765 100644
--- a/tensorflow/tools/dockerfiles/assembler.py
+++ b/tensorflow/tools/dockerfiles/assembler.py
@@ -11,63 +11,144 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# ==============================================================================
-"""Assemble common TF Dockerfiles from many parts.
+# ============================================================================
+"""Multipurpose TensorFlow Docker Helper.
 
-This script constructs TF's Dockerfiles by aggregating partial
-Dockerfiles. See README.md for usage examples.
+- Assembles Dockerfiles
+- Builds images (and optionally runs image tests)
+- Pushes images to Docker Hub (provided with credentials)
+
+Read README.md (in this directory) for instructions!
 """
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
+import collections
 import copy
 import errno
+import itertools
+import multiprocessing
 import os
-import os.path
 import re
 import shutil
-import textwrap
+import sys
 
 from absl import app
 from absl import flags
 import cerberus
+import docker
 import yaml
 
 FLAGS = flags.FLAGS
 
+flags.DEFINE_string('hub_username', None,
+                    'Dockerhub username, only used with --upload_to_hub')
+
+flags.DEFINE_string(
+    'hub_password', None,
+    ('Dockerhub password, only used with --upload_to_hub. Use from an env param'
+     'so your password isn\'t in your history.'))
+
+flags.DEFINE_integer('hub_timeout', 3600,
+                     'Abort Hub upload if it takes longer than this.')
+
+flags.DEFINE_string(
+    'repository', 'tensorflow',
+    'Tag local images as {repository}:tag (in addition to the '
+    'hub_repository, if uploading to hub)')
+
+flags.DEFINE_string(
+    'hub_repository', None,
+    'Push tags to this Docker Hub repository, e.g. tensorflow/tensorflow')
+
+flags.DEFINE_boolean(
+    'upload_to_hub',
+    False,
+    ('Push built images to Docker Hub (you must also provide --hub_username, '
+     '--hub_password, and --hub_repository)'),
+    short_name='u',
+)
+
+flags.DEFINE_boolean(
+    'construct_dockerfiles', False, 'Do not build images', short_name='d')
+
+flags.DEFINE_boolean(
+    'keep_temp_dockerfiles',
+    False,
+    'Retain .temp.Dockerfiles created while building images.',
+    short_name='k')
+
 flags.DEFINE_boolean(
-    'dry_run', False, 'Do not actually generate Dockerfiles', short_name='n')
+    'build_images', False, 'Do not build images', short_name='b')
 
 flags.DEFINE_string(
-    'spec_file',
-    './spec.yml',
-    'Path to a YAML specification file',
-    short_name='s')
+    'run_tests_path', None,
+    ('Execute test scripts on generated Dockerfiles before pushing them. '
+     'Flag value must be a full path to the "tests" directory, which is usually'
+     ' $(realpath ./tests). A failed tests counts the same as a failed build.'))
+
+flags.DEFINE_boolean(
+    'stop_on_failure', False,
+    ('Stop processing tags if any one build fails. If False or not specified, '
+     'failures are reported but do not affect the other images.'))
+
+flags.DEFINE_boolean(
+    'dry_run',
+    False,
+    'Do not build or deploy anything at all.',
+    short_name='n',
+)
 
 flags.DEFINE_string(
-    'output_dir',
-    './dockerfiles', ('Path to an output directory for Dockerfiles. '
-                      'Will be created if it doesn\'t exist.'),
+    'exclude_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it matches. Must '
+     'match entire string, e.g. ".*gpu.*" ignores all GPU tags.'),
+    short_name='x')
+
+flags.DEFINE_string(
+    'only_tags_matching',
+    None,
+    ('Regular expression that skips processing on any tag it does not match. '
+     'Must match entire string, e.g. ".*gpu.*" includes only GPU tags.'),
+    short_name='i')
+
+flags.DEFINE_string(
+    'dockerfile_dir',
+    './dockerfiles', 'Path to an output directory for Dockerfiles.'
+    ' Will be created if it doesn\'t exist.'
+    ' Existing files in this directory will be deleted when new Dockerfiles'
+    ' are made.',
     short_name='o')
 
 flags.DEFINE_string(
     'partial_dir',
     './partials',
-    'Path to a directory containing foo.partial.Dockerfile partial files.',
+    'Path to a directory containing foo.partial.Dockerfile partial files.'
+    ' can have subdirectories, e.g. "bar/baz.partial.Dockerfile".',
     short_name='p')
 
-flags.DEFINE_boolean(
-    'quiet_dry_run',
-    True,
-    'Do not print contents of dry run Dockerfiles.',
-    short_name='q')
+flags.DEFINE_multi_string(
+    'release', [],
+    'Set of releases to build and tag. Defaults to every release type.',
+    short_name='r')
 
-flags.DEFINE_boolean(
-    'validate', True, 'Validate generated Dockerfiles', short_name='c')
+flags.DEFINE_multi_string(
+    'arg', [],
+    ('Extra build arguments. These are used for expanding tag names if needed '
+     '(e.g. --arg _TAG_PREFIX=foo) and for using as build arguments (unused '
+     'args will print a warning).'),
+    short_name='a')
 
-# Schema to verify the contents of spec.yml with Cerberus.
+flags.DEFINE_string(
+    'spec_file',
+    './spec.yml',
+    'Path to the YAML specification file',
+    short_name='s')
+
+# Schema to verify the contents of tag-spec.yml with Cerberus.
 # Must be converted to a dict from yaml to work.
 # Note: can add python references with e.g.
 # !!python/name:builtins.str
@@ -76,79 +157,78 @@ SCHEMA_TEXT = """
 header:
   type: string
 
-partials:
+slice_sets:
   type: dict
   keyschema:
     type: string
   valueschema:
-    type: dict
-    schema:
-      desc:
-        type: string
-      args:
+     type: list
+     schema:
         type: dict
-        keyschema:
-          type: string
-        valueschema:
-          anyof:
-            - type: [ boolean, number, string ]
-            - type: dict
-              schema:
-                 default:
-                    type: [ boolean, number, string ]
-                 desc:
-                    type: string
-                 options:
-                    type: list
-                    schema:
-                       type: string
-
-images:
+        schema:
+           add_to_name:
+             type: string
+           dockerfile_exclusive_name:
+             type: string
+           dockerfile_subdirectory:
+             type: string
+           partials:
+             type: list
+             schema:
+               type: string
+               ispartial: true
+           test_runtime:
+             type: string
+             required: false
+           tests:
+             type: list
+             default: []
+             schema:
+               type: string
+           args:
+             type: list
+             default: []
+             schema:
+               type: string
+               isfullarg: true
+
+releases:
+  type: dict
   keyschema:
     type: string
   valueschema:
     type: dict
     schema:
-      desc:
-        type: string
-      arg-defaults:
-        type: list
-        schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                arg_in_use: true
-              valueschema:
-                type: string
-            - type: string
-              isimage: true
-      create-dockerfile:
+      is_dockerfiles:
         type: boolean
-      partials:
+        required: false
+        default: false
+      upload_images:
+        type: boolean
+        required: false
+        default: true
+      tag_specs:
         type: list
+        required: true
         schema:
-          anyof:
-            - type: dict
-              keyschema:
-                type: string
-                regex: image
-              valueschema:
-                type: string
-                isimage: true
-            - type: string
-              ispartial: true
+          type: string
 """
 
 
-class TfDockerValidator(cerberus.Validator):
-  """Custom Cerberus validator for TF dockerfile spec.
+class TfDockerTagValidator(cerberus.Validator):
+  """Custom Cerberus validator for TF tag spec.
 
   Note: Each _validate_foo function's docstring must end with a segment
   describing its own validation schema, e.g. "The rule's arguments are...". If
   you add a new validator, you can copy/paste that section.
   """
 
+  def __init__(self, *args, **kwargs):
+    # See http://docs.python-cerberus.org/en/stable/customize.html
+    if 'partials' in kwargs:
+      self.partials = kwargs['partials']
+    super(cerberus.Validator, self).__init__(*args, **kwargs)
+
   def _validate_ispartial(self, ispartial, field, value):
     """Validate that a partial references an existing partial spec.
 
@@ -156,398 +236,431 @@ class TfDockerValidator(cerberus.Validator):
       ispartial: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if ispartial and value not in self.root_document.get('partials', dict()):
-      self._error(field, '{} is not an existing partial.'.format(value))
+    if ispartial and value not in self.partials:
+      self._error(field,
+                  '{} is not present in the partials directory.'.format(value))
 
-  def _validate_isimage(self, isimage, field, value):
-    """Validate that an image references an existing partial spec.
+  def _validate_isfullarg(self, isfullarg, field, value):
+    """Validate that a string is either a FULL=arg or NOT.
 
     Args:
-      isimage: Value of the rule, a bool
+      isfullarg: Value of the rule, a bool
       field: The field being validated
       value: The field's value
-
     The rule's arguments are validated against this schema:
     {'type': 'boolean'}
     """
-    if isimage and value not in self.root_document.get('images', dict()):
-      self._error(field, '{} is not an existing image.'.format(value))
-
-  def _validate_arg_in_use(self, arg_in_use, field, value):
-    """Validate that an arg references an existing partial spec's args.
-
-    Args:
-      arg_in_use: Value of the rule, a bool
-      field: The field being validated
-      value: The field's value
-
-    The rule's arguments are validated against this schema:
-    {'type': 'boolean'}
-    """
-    if arg_in_use:
-      for partial in self.root_document.get('partials', dict()).values():
-        if value in partial.get('args', tuple()):
-          return
-
-      self._error(field, '{} is not an arg used in any partial.'.format(value))
-
-
-def build_partial_description(partial_spec):
-  """Create the documentation lines for a specific partial.
-
-  Generates something like this:
-
-    # This is the partial's description, from spec.yml.
-    # --build-arg ARG_NAME=argdefault
-    #    this is one of the args.
-    # --build-arg ANOTHER_ARG=(some|choices)
-    #    another arg.
+    if isfullarg and '=' not in value:
+      self._error(field, '{} should be of the form ARG=VALUE.'.format(value))
+    if not isfullarg and '=' in value:
+      self._error(field, '{} should be of the form ARG (no =).'.format(value))
 
-  Args:
-    partial_spec: A dict representing one of the partials from spec.yml. Doesn't
-      include the name of the partial; is a dict like { desc: ..., args: ... }.
-
-  Returns:
-    A commented string describing this partial.
-  """
 
-  # Start from linewrapped desc field
-  lines = []
-  wrapper = textwrap.TextWrapper(
-      initial_indent='# ', subsequent_indent='# ', width=80)
-  description = wrapper.fill(partial_spec.get('desc', '( no comments )'))
-  lines.extend(['#', description])
-
-  # Document each arg
-  for arg, arg_data in partial_spec.get('args', dict()).items():
-    # Wrap arg description with comment lines
-    desc = arg_data.get('desc', '( no description )')
-    desc = textwrap.fill(
-        desc,
-        initial_indent='#    ',
-        subsequent_indent='#    ',
-        width=80,
-        drop_whitespace=False)
-
-    # Document (each|option|like|this)
-    if 'options' in arg_data:
-      arg_options = ' ({})'.format('|'.join(arg_data['options']))
-    else:
-      arg_options = ''
+def eprint(*args, **kwargs):
+  print(*args, file=sys.stderr, flush=True, **kwargs)
 
-    # Add usage sample
-    arg_use = '# --build-arg {}={}{}'.format(arg,
-                                             arg_data.get('default', '(unset)'),
-                                             arg_options)
-    lines.extend([arg_use, desc])
 
-  return '\n'.join(lines)
+def aggregate_all_slice_combinations(spec, slice_set_names):
+  """Figure out all of the possible slice groupings for a tag spec."""
+  slice_sets = copy.deepcopy(spec['slice_sets'])
 
+  for name in slice_set_names:
+    for slice_set in slice_sets[name]:
+      slice_set['set_name'] = name
 
-def construct_contents(partial_specs, image_spec):
-  """Assemble the dockerfile contents for an image spec.
+  slices_grouped_but_not_keyed = [slice_sets[name] for name in slice_set_names]
+  all_slice_combos = list(itertools.product(*slices_grouped_but_not_keyed))
+  return all_slice_combos
 
-  It assembles a concrete list of partial references into a single, large
-  string.
-  Also expands argument defaults, so that the resulting Dockerfile doesn't have
-  to be configured with --build-arg=... every time. That is, any ARG directive
-  will be updated with a new default value.
 
-  Args:
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: One of the dict values from spec.yml["images"].
+def build_name_from_slices(format_string, slices, args, is_dockerfile=False):
+  """Build the tag name (cpu-devel...) from a list of slices."""
+  name_formatter = copy.deepcopy(args)
+  name_formatter.update({s['set_name']: s['add_to_name'] for s in slices})
+  name_formatter.update({
+      s['set_name']: s['dockerfile_exclusive_name']
+      for s in slices
+      if is_dockerfile and 'dockerfile_exclusive_name' in s
+  })
+  name = format_string.format(**name_formatter)
+  return name
 
-  Returns:
-    A string containing a valid Dockerfile based on the partials listed in
-    image_spec.
-  """
-  processed_partial_strings = []
-  for partial_name in image_spec['partials']:
-    # Apply image arg-defaults to existing arg defaults
-    partial_spec = copy.deepcopy(partial_specs[partial_name])
-    args = partial_spec.get('args', dict())
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Read partial file contents
-    filename = partial_spec.get('file', partial_name)
-    partial_path = os.path.join(FLAGS.partial_dir,
-                                '{}.partial.Dockerfile'.format(filename))
-    with open(partial_path, 'r') as f_partial:
-      partial_contents = f_partial.read()
-
-    # Replace ARG FOO=BAR with ARG FOO=[new-default]
-    for arg, arg_data in args.items():
-      if 'default' in arg_data and arg_data['default']:
-        default = '={}'.format(arg_data['default'])
-      else:
-        default = ''
-      partial_contents = re.sub(r'ARG {}.*'.format(arg), 'ARG {}{}'.format(
-          arg, default), partial_contents)
-
-    # Store updated partial contents
-    processed_partial_strings.append(partial_contents)
-
-  # Join everything together
-  return '\n'.join(processed_partial_strings)
 
-
-def mkdir_p(path):
-  """Create a directory and its parents, even if it already exists."""
-  try:
-    os.makedirs(path)
-  except OSError as e:
-    if e.errno != errno.EEXIST:
-      raise
+def update_args_dict(args_dict, updater):
+  """Update a dict of arg values with more values from a list or dict."""
+  if isinstance(updater, list):
+    for arg in updater:
+      key, sep, value = arg.partition('=')
+      if sep == '=':
+        args_dict[key] = value
+  if isinstance(updater, dict):
+    for key, value in updater.items():
+      args_dict[key] = value
+  return args_dict
 
 
-def construct_documentation(header, partial_specs, image_spec):
-  """Assemble all of the documentation for a single dockerfile.
+def get_slice_sets_and_required_args(slice_sets, tag_spec):
+  """Extract used-slice-sets and required CLI arguments from a spec string.
 
-  Builds explanations of included partials and available build args.
+  For example, {FOO}{bar}{bat} finds FOO, bar, and bat. Assuming bar and bat
+  are both named slice sets, FOO must be specified on the command line.
 
   Args:
-    header: The string from spec.yml["header"]; will be commented and wrapped.
-    partial_specs: The dict from spec.yml["partials"].
-    image_spec: The spec for the dockerfile being built.
+     slice_sets: Dict of named slice sets
+     tag_spec: The tag spec string, e.g. {_FOO}{blep}
 
   Returns:
-    A string containing a commented header that documents the contents of the
-    dockerfile.
-
+     (used_slice_sets, required_args), a tuple of lists
   """
-  # Comment and wrap header and image description
-  commented_header = '\n'.join(
-      [('# ' + l).rstrip() for l in header.splitlines()])
-  commented_desc = '\n'.join(
-      ['# ' + l for l in image_spec.get('desc', '').splitlines()])
-  partial_descriptions = []
-
-  # Build documentation for each partial in the image
-  for partial in image_spec['partials']:
-    # Copy partial data for default args unique to this image
-    partial_spec = copy.deepcopy(partial_specs[partial])
-    args = partial_spec.get('args', dict())
-
-    # Overwrite any existing arg defaults
-    for k_v in image_spec.get('arg-defaults', []):
-      arg, value = list(k_v.items())[0]
-      if arg in args:
-        args[arg]['default'] = value
-
-    # Build the description from new args
-    partial_description = build_partial_description(partial_spec)
-    partial_descriptions.append(partial_description)
-
-  contents = [commented_header, '#', commented_desc] + partial_descriptions
-  return '\n'.join(contents) + '\n'
-
-
-def normalize_partial_args(partial_specs):
-  """Normalize the shorthand form of a partial's args specification.
-
-  Turns this:
-
-    partial:
-      args:
-        SOME_ARG: arg_value
-
-  Into this:
-
-    partial:
-       args:
-         SOME_ARG:
-            default: arg_value
-
-  Args:
-    partial_specs: The dict from spec.yml["partials"]. This dict is modified in
-      place.
-
-  Returns:
-    The modified contents of partial_specs.
-
-  """
-  for _, partial in partial_specs.items():
-    args = partial.get('args', dict())
-    for arg, value in args.items():
-      if not isinstance(value, dict):
-        new_value = {'default': value}
-        args[arg] = new_value
-
-  return partial_specs
-
-
-def flatten_args_references(image_specs):
-  """Resolve all default-args in each image spec to a concrete dict.
-
-  Turns this:
-
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
-
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - example_image
-
-  Into this:
+  required_args = []
+  used_slice_sets = []
+
+  extract_bracketed_words = re.compile(r'\{([^}]+)\}')
+  possible_args_or_slice_set_names = extract_bracketed_words.findall(tag_spec)
+  for name in possible_args_or_slice_set_names:
+    if name in slice_sets:
+      used_slice_sets.append(name)
+    else:
+      required_args.append(name)
 
-    example-image:
-      arg-defaults:
-        - MY_ARG: ARG_VALUE
+  return (used_slice_sets, required_args)
 
-    another-example:
-      arg-defaults:
-        - ANOTHER_ARG: ANOTHER_VALUE
-        - MY_ARG: ARG_VALUE
 
-  Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
-
-  Returns:
-    The modified contents of image_specs.
-  """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while str in map(type, image_spec.get('arg-defaults', [])) and too_deep < 5:
-      new_args = []
-      for arg in image_spec['arg-defaults']:
-        if isinstance(arg, str):
-          new_args.extend(image_specs[arg]['arg-defaults'])
-        else:
-          new_args.append(arg)
+def gather_tag_args(slices, cli_input_args, required_args):
+  """Build a dictionary of all the CLI and slice-specified args for a tag."""
+  args = dict()
 
-      image_spec['arg-defaults'] = new_args
-      too_deep += 1
+  for s in slices:
+    args = update_args_dict(args, s['args'])
 
-  return image_specs
+  args = update_args_dict(args, cli_input_args)
+  for arg in required_args:
+    if arg not in args:
+      eprint(('> Error: {} is not a valid slice_set, and also isn\'t an arg '
+              'provided on the command line. If it is an arg, please specify '
+              'it with --arg. If not, check the slice_sets list.'.format(arg)))
+      exit(1)
 
+  return args
 
-def flatten_partial_references(image_specs):
-  """Resolve all partial references in each image spec to a concrete list.
 
-  Turns this:
+def gather_slice_list_items(slices, key):
+  """For a list of slices, get the flattened list of all of a certain key."""
+  return list(itertools.chain(*[s[key] for s in slices if key in s]))
 
-    example-image:
-      partials:
-        - foo
 
-    another-example:
-      partials:
-        - bar
-        - image: example-image
-        - bat
+def find_first_slice_value(slices, key):
+  """For a list of slices, get the first value for a certain key."""
+  for s in slices:
+    if key in s and s[key] is not None:
+      return s[key]
+  return None
 
-  Into this:
 
-    example-image:
-      partials:
-        - foo
+def assemble_tags(spec, cli_args, enabled_releases, all_partials):
+  """Gather all the tags based on our spec.
 
-    another-example:
-      partials:
-        - bar
-        - foo
-        - bat
   Args:
-    image_specs: A dict of image_spec dicts; should be the contents of the
-      "images" key in the global spec.yaml. This dict is modified in place and
-      then returned.
+    spec: Nested dict containing full Tag spec
+    cli_args: List of ARG=foo arguments to pass along to Docker build
+    enabled_releases: List of releases to parse. Empty list = all
+    all_partials: Dict of every partial, for reference
 
   Returns:
-    The modified contents of image_specs.
+    Dict of tags and how to build them
   """
-  for _, image_spec in image_specs.items():
-    too_deep = 0
-    while dict in map(type, image_spec['partials']) and too_deep < 5:
-      new_partials = []
-      for partial in image_spec['partials']:
-        if isinstance(partial, str):
-          new_partials.append(partial)
-        else:
-          new_partials.extend(image_specs[partial['image']]['partials'])
+  tag_data = collections.defaultdict(list)
+
+  for name, release in spec['releases'].items():
+    for tag_spec in release['tag_specs']:
+      if enabled_releases and name not in enabled_releases:
+        eprint('> Skipping release {}'.format(name))
+        continue
+
+      used_slice_sets, required_cli_args = get_slice_sets_and_required_args(
+          spec['slice_sets'], tag_spec)
+
+      slice_combos = aggregate_all_slice_combinations(spec, used_slice_sets)
+      for slices in slice_combos:
+
+        tag_args = gather_tag_args(slices, cli_args, required_cli_args)
+        tag_name = build_name_from_slices(tag_spec, slices, tag_args,
+                                          release['is_dockerfiles'])
+        used_partials = gather_slice_list_items(slices, 'partials')
+        used_tests = gather_slice_list_items(slices, 'tests')
+        test_runtime = find_first_slice_value(slices, 'test_runtime')
+        dockerfile_subdirectory = find_first_slice_value(
+            slices, 'dockerfile_subdirectory')
+        dockerfile_contents = merge_partials(spec['header'], used_partials,
+                                             all_partials)
+
+        tag_data[tag_name].append({
+            'release': name,
+            'tag_spec': tag_spec,
+            'is_dockerfiles': release['is_dockerfiles'],
+            'upload_images': release['upload_images'],
+            'cli_args': tag_args,
+            'dockerfile_subdirectory': dockerfile_subdirectory or '',
+            'partials': used_partials,
+            'tests': used_tests,
+            'test_runtime': test_runtime,
+            'dockerfile_contents': dockerfile_contents,
+        })
+
+  return tag_data
+
+
+def merge_partials(header, used_partials, all_partials):
+  """Merge all partial contents with their header."""
+  used_partials = list(used_partials)
+  return '\n'.join([header] + [all_partials[u] for u in used_partials])
+
+
+def upload_in_background(hub_repository, dock, image, tag):
+  """Upload a docker image (to be used by multiprocessing)."""
+  image.tag(hub_repository, tag=tag)
+  print(dock.images.push(hub_repository, tag=tag))
 
-      image_spec['partials'] = new_partials
-      too_deep += 1
 
-  return image_specs
+def mkdir_p(path):
+  """Create a directory and its parents, even if it already exists."""
+  try:
+    os.makedirs(path)
+  except OSError as e:
+    if e.errno != errno.EEXIST:
+      raise
 
 
-def construct_dockerfiles(tf_spec):
-  """Generate a mapping of {"cpu": <cpu dockerfile contents>, ...}.
+def gather_existing_partials(partial_path):
+  """Find and read all available partials.
 
   Args:
-    tf_spec: The full spec.yml loaded as a python object.
+    partial_path (string): read partials from this directory.
 
   Returns:
-    A string:string dict of short names ("cpu-devel") to Dockerfile contents.
+    Dict[string, string] of partial short names (like "ubuntu/python" or
+      "bazel") to the full contents of that partial.
   """
-  names_to_contents = dict()
-  image_specs = tf_spec['images']
-  image_specs = flatten_partial_references(image_specs)
-  image_specs = flatten_args_references(image_specs)
-  partial_specs = tf_spec['partials']
-  partial_specs = normalize_partial_args(partial_specs)
-
-  for name, image_spec in image_specs.items():
-    if not image_spec.get('create-dockerfile', True):
-      continue
-    documentation = construct_documentation(tf_spec['header'], partial_specs,
-                                            image_spec)
-    contents = construct_contents(partial_specs, image_spec)
-    names_to_contents[name] = '\n'.join([documentation, contents])
-
-  return names_to_contents
+  partials = dict()
+  for path, _, files in os.walk(partial_path):
+    for name in files:
+      fullpath = os.path.join(path, name)
+      if '.partial.Dockerfile' not in fullpath:
+        eprint(('> Probably not a problem: skipping {}, which is not a '
+                'partial.').format(fullpath))
+        continue
+      # partial_dir/foo/bar.partial.Dockerfile -> foo/bar
+      simple_name = fullpath[len(partial_path) + 1:-len('.partial.dockerfile')]
+      with open(fullpath, 'r') as f:
+        partial_contents = f.read()
+      partials[simple_name] = partial_contents
+  return partials
 
 
 def main(argv):
   if len(argv) > 1:
-    raise app.UsageError('Unexpected command line args found: {}'.format(argv))
+    raise app.UsageError('Too many command-line arguments.')
 
+  # Read the full spec file, used for everything
   with open(FLAGS.spec_file, 'r') as spec_file:
-    tf_spec = yaml.load(spec_file)
+    tag_spec = yaml.load(spec_file)
+
+  # Get existing partial contents
+  partials = gather_existing_partials(FLAGS.partial_dir)
 
   # Abort if spec.yaml is invalid
-  if FLAGS.validate:
-    schema = yaml.load(SCHEMA_TEXT)
-    v = TfDockerValidator(schema)
-    if not v.validate(tf_spec):
-      print('>> ERROR: {} is an invalid spec! The errors are:'.format(
-          FLAGS.spec_file))
-      print(yaml.dump(v.errors, indent=2))
+  schema = yaml.load(SCHEMA_TEXT)
+  v = TfDockerTagValidator(schema, partials=partials)
+  if not v.validate(tag_spec):
+    eprint('> Error: {} is an invalid spec! The errors are:'.format(
+        FLAGS.spec_file))
+    eprint(yaml.dump(v.errors, indent=2))
+    exit(1)
+  tag_spec = v.normalized(tag_spec)
+
+  # Assemble tags and images used to build them
+  all_tags = assemble_tags(tag_spec, FLAGS.arg, FLAGS.release, partials)
+
+  # Empty Dockerfile directory if building new Dockerfiles
+  if FLAGS.construct_dockerfiles:
+    eprint('> Emptying Dockerfile dir "{}"'.format(FLAGS.dockerfile_dir))
+    shutil.rmtree(FLAGS.dockerfile_dir, ignore_errors=True)
+    mkdir_p(FLAGS.dockerfile_dir)
+
+  # Set up Docker helper
+  dock = docker.from_env()
+
+  # Login to Docker if uploading images
+  if FLAGS.upload_to_hub:
+    if not FLAGS.hub_username:
+      eprint('> Error: please set --hub_username when uploading to Dockerhub.')
       exit(1)
-  else:
-    print('>> WARNING: Not validating {}'.format(FLAGS.spec_file))
-
-  # Generate mapping of { "cpu-devel": "<cpu-devel dockerfile contents>", ... }
-  names_to_contents = construct_dockerfiles(tf_spec)
-
-  # Write each completed Dockerfile
-  if not FLAGS.dry_run:
-    print('>> Emptying destination dir "{}"'.format(FLAGS.output_dir))
-    shutil.rmtree(FLAGS.output_dir, ignore_errors=True)
-    mkdir_p(FLAGS.output_dir)
-  else:
-    print('>> Skipping creation of {} (dry run)'.format(FLAGS.output_dir))
-  for name, contents in names_to_contents.items():
-    path = os.path.join(FLAGS.output_dir, name + '.Dockerfile')
-    if FLAGS.dry_run:
-      print('>> Skipping writing contents of {} (dry run)'.format(path))
-      print(contents)
-    else:
-      mkdir_p(FLAGS.output_dir)
-      print('>> Writing {}'.format(path))
-      with open(path, 'w') as f:
-        f.write(contents)
+    if not FLAGS.hub_repository:
+      eprint(
+          '> Error: please set --hub_repository when uploading to Dockerhub.')
+      exit(1)
+    if not FLAGS.hub_password:
+      eprint('> Error: please set --hub_password when uploading to Dockerhub.')
+      exit(1)
+    dock.login(
+        username=FLAGS.hub_username,
+        password=FLAGS.hub_password,
+    )
+
+  # Each tag has a name ('tag') and a definition consisting of the contents
+  # of its Dockerfile, its build arg list, etc.
+  failed_tags = []
+  for tag, tag_defs in all_tags.items():
+    for tag_def in tag_defs:
+      eprint('> Working on {}'.format(tag))
+
+      if FLAGS.exclude_tags_matching and re.match(FLAGS.exclude_tags_matching,
+                                                  tag):
+        eprint('>> Excluded due to match against "{}".'.format(
+            FLAGS.exclude_tags_matching))
+        continue
+
+      if FLAGS.only_tags_matching and not re.match(FLAGS.only_tags_matching,
+                                                   tag):
+        eprint('>> Excluded due to failure to match against "{}".'.format(
+            FLAGS.only_tags_matching))
+        continue
+
+      # Write releases marked "is_dockerfiles" into the Dockerfile directory
+      if FLAGS.construct_dockerfiles and tag_def['is_dockerfiles']:
+        path = os.path.join(FLAGS.dockerfile_dir,
+                            tag_def['dockerfile_subdirectory'],
+                            tag + '.Dockerfile')
+        eprint('>> Writing {}...'.format(path))
+        if not FLAGS.dry_run:
+          mkdir_p(os.path.dirname(path))
+          with open(path, 'w') as f:
+            f.write(tag_def['dockerfile_contents'])
+
+      # Don't build any images for dockerfile-only releases
+      if not FLAGS.build_images:
+        continue
+
+      # Generate a temporary Dockerfile to use to build, since docker-py
+      # needs a filepath relative to the build context (i.e. the current
+      # directory)
+      dockerfile = os.path.join(FLAGS.dockerfile_dir, tag + '.temp.Dockerfile')
+      if not FLAGS.dry_run:
+        with open(dockerfile, 'w') as f:
+          f.write(tag_def['dockerfile_contents'])
+      eprint('>> (Temporary) writing {}...'.format(dockerfile))
+
+      repo_tag = '{}:{}'.format(FLAGS.repository, tag)
+      eprint('>> Building {} using build args:'.format(repo_tag))
+      for arg, value in tag_def['cli_args'].items():
+        eprint('>>> {}={}'.format(arg, value))
+
+      # Note that we are NOT using cache_from, which appears to limit
+      # available cache layers to those from explicitly specified layers. Many
+      # of our layers are similar between local builds, so we want to use the
+      # implied local build cache.
+      tag_failed = False
+      image, logs = None, []
+      if not FLAGS.dry_run:
+        try:
+          image, logs = dock.images.build(
+              timeout=FLAGS.hub_timeout,
+              path='.',
+              dockerfile=dockerfile,
+              buildargs=tag_def['cli_args'],
+              tag=repo_tag)
+
+          # Print logs after finishing
+          log_lines = [l.get('stream', '') for l in logs]
+          eprint(''.join(log_lines))
+
+          # Run tests if requested, and dump output
+          # Could be improved by backgrounding, but would need better
+          # multiprocessing support to track failures properly.
+          if FLAGS.run_tests_path:
+            if not tag_def['tests']:
+              eprint('>>> No tests to run.')
+            for test in tag_def['tests']:
+              eprint('>> Testing {}...'.format(test))
+              container, = dock.containers.run(
+                  image,
+                  '/tests/' + test,
+                  working_dir='/',
+                  log_config={'type': 'journald'},
+                  detach=True,
+                  stderr=True,
+                  stdout=True,
+                  volumes={
+                      FLAGS.run_tests_path: {
+                          'bind': '/tests',
+                          'mode': 'ro'
+                      }
+                  },
+                  runtime=tag_def['test_runtime']),
+              ret = container.wait()
+              code = ret['StatusCode']
+              out = container.logs(stdout=True, stderr=False)
+              err = container.logs(stdout=False, stderr=True)
+              container.remove()
+              if out:
+                eprint('>>> Output stdout:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard out.')
+              if err:
+                eprint('>>> Output stderr:')
+                eprint(out.decode('utf-8'))
+              else:
+                eprint('>>> No test standard err.')
+              if code != 0:
+                eprint('>> {} failed tests with status: "{}"'.format(
+                    repo_tag, code))
+                failed_tags.append(tag)
+                tag_failed = True
+                if FLAGS.stop_on_failure:
+                  eprint('>> ABORTING due to --stop_on_failure!')
+                  exit(1)
+              else:
+                eprint('>> Tests look good!')
+
+        except docker.errors.BuildError as e:
+          eprint('>> {} failed to build with message: "{}"'.format(
+              repo_tag, e.msg))
+          eprint('>> Build logs follow:')
+          log_lines = [l.get('stream', '') for l in e.build_log]
+          eprint(''.join(log_lines))
+          failed_tags.append(tag)
+          tag_failed = True
+          if FLAGS.stop_on_failure:
+            eprint('>> ABORTING due to --stop_on_failure!')
+            exit(1)
+
+        # Clean temporary dockerfiles if they were created earlier
+        if not FLAGS.keep_temp_dockerfiles:
+          os.remove(dockerfile)
+
+      # Upload new images to DockerHub as long as they built + passed tests
+      if FLAGS.upload_to_hub:
+        if not tag_def['upload_images']:
+          continue
+        if tag_failed:
+          continue
+
+        eprint('>> Uploading to {}:{}'.format(FLAGS.hub_repository, tag))
+        if not FLAGS.dry_run:
+          p = multiprocessing.Process(
+              target=upload_in_background,
+              args=(FLAGS.hub_repository, dock, image, tag))
+          p.start()
+
+  if failed_tags:
+    eprint(
+        '> Some tags failed to build or failed testing, check scrollback for '
+        'errors: {}'.format(','.join(failed_tags)))
+    exit(1)
 
 
 if __name__ == '__main__':
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
index f889ed6f91d077fb5fb6044e55a9504c2a5b56c9..d8fabadec280cc136bd6cc9a30e79390a9a167bd 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu-jupyter.Dockerfile
@@ -16,31 +16,14 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -52,21 +35,37 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
index 182a534bed9855bf9e57c4f495822fe78523dcc3..857b5e20471a82bd162e55b146854d0a5c165db8 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/cpu.Dockerfile
@@ -16,29 +16,14 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for using TensorFlow
-#
-# Start from Ubuntu (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
 
-ARG USE_PYTHON_3_NOT_2=True
+FROM ubuntu:${UBUNTU_VERSION} as base
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -50,10 +35,18 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
 ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
similarity index 60%
rename from tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
index dab7178db3adde78be55fc5b9d4b4254a131924e..43265676f8b7ab19dc14f2c1475de1af67054c6a 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu-jupyter.Dockerfile
@@ -16,27 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -48,7 +33,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -59,8 +43,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -72,10 +62,13 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
@@ -84,6 +77,20 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
@@ -93,11 +100,19 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
similarity index 71%
rename from tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
index 68566ccc8aa954c64f7504d380cfcf5968f3c449..5c5b2f91634ff43fb2a047c66a856ac787858a47 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/cpu-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-cpu.Dockerfile
@@ -16,25 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, CPU-only environment for developing changes for TensorFlow.
-#
-# Start from Ubuntu, with TF development packages (no GPU support)
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -46,7 +33,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -57,8 +43,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -70,10 +62,13 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
@@ -82,6 +77,20 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
similarity index 63%
rename from tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
index 17faa84a682d90000538c913ecac545b7c4b4445..8769e4e9cd619a2c31e37ee838e45ea050e42712 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu-jupyter.Dockerfile
@@ -16,28 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow, with Jupyter included.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -60,6 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -82,11 +67,23 @@ RUN mkdir /usr/local/cuda-9.0/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -98,10 +95,13 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
@@ -110,6 +110,20 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
@@ -119,11 +133,19 @@ RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
similarity index 72%
rename from tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
index a3ba02a684ce6be99cddb917b3b069b3631e9436..809cda679ea7e33b64e4b4180cfa1af2d05f8ff3 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-devel.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/devel-gpu.Dockerfile
@@ -16,26 +16,12 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for TensorFlow.
-#
-# Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF development
-# packages.
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the latest version of Bazel and Python development tools.
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
 
 ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -58,6 +44,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -80,11 +67,23 @@ RUN mkdir /usr/local/cuda-9.0/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
 
-ARG USE_PYTHON_3_NOT_2=True
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -96,10 +95,13 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
 RUN apt-get update && apt-get install -y \
     build-essential \
     curl \
@@ -108,6 +110,20 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
similarity index 62%
rename from tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
index fbdea4628ad5008de9c4eee5009bca2884c47a2a..acfe4d8607d56b6192926eb50ef9a3d58a07efe2 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia-jupyter.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu-jupyter.Dockerfile
@@ -16,30 +16,13 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with Jupyter included.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
-#
-# Launch Jupyter on execution instead of a bash prompt.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
-# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         cuda-command-line-tools-9-0 \
@@ -48,6 +31,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-curand-9-0 \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
+        curl \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libnccl2=2.2.13-1+cuda9.0 \
         libfreetype6-dev \
@@ -55,6 +39,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        rsync \
         software-properties-common \
         unzip \
         && \
@@ -66,7 +51,10 @@ RUN apt-get update && \
         apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0
 
-ARG USE_PYTHON_3_NOT_2=True
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -78,21 +66,37 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
-ARG TF_PACKAGE=tensorflow-gpu
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
 RUN chmod a+rwx /etc/bash.bashrc
 
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
similarity index 69%
rename from tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
rename to tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
index e0312dbc2949797f3b6af35224bcfe66664c3cbd..f36a21eaf0cce02cf77db7c88358696c6f392cf4 100644
--- a/tensorflow/tools/dockerfiles/dockerfiles/nvidia.Dockerfile
+++ b/tensorflow/tools/dockerfiles/dockerfiles/gpu.Dockerfile
@@ -16,28 +16,13 @@
 # THIS IS A GENERATED DOCKERFILE.
 #
 # This file was assembled from multiple pieces, whose use is documented
-# below. Please refer to the the TensorFlow dockerfiles documentation for
-# more information. Build args are documented as their default value.
-#
-# Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-#
-# NVIDIA with CUDA and CuDNN, no dev stuff
-# --build-arg UBUNTU_VERSION=16.04
-#    ( no description )
-#
-# Python is required for TensorFlow and other libraries.
-# --build-arg USE_PYTHON_3_NOT_2=True
-#    Install python 3 over Python 2
-#
-# Install the TensorFlow Python package.
-# --build-arg TF_PACKAGE=tensorflow-gpu (tensorflow|tensorflow-gpu|tf-nightly|tf-nightly-gpu)
-#    The specific TensorFlow Python package to install
-#
-# Configure TensorFlow's shell prompt and login tools.
+# throughout. Please refer to the TensorFlow dockerfiles documentation
+# for more information.
+
+ARG UBUNTU_VERSION=16.04
 
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
-# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         cuda-command-line-tools-9-0 \
@@ -46,6 +31,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-curand-9-0 \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
+        curl \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libnccl2=2.2.13-1+cuda9.0 \
         libfreetype6-dev \
@@ -53,6 +39,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        rsync \
         software-properties-common \
         unzip \
         && \
@@ -64,7 +51,10 @@ RUN apt-get update && \
         apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0
 
-ARG USE_PYTHON_3_NOT_2=True
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+
+ARG USE_PYTHON_3_NOT_2
 ARG _PY_SUFFIX=${USE_PYTHON_3_NOT_2:+3}
 ARG PYTHON=python${_PY_SUFFIX}
 ARG PIP=pip${_PY_SUFFIX}
@@ -76,11 +66,19 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
 
-ARG TF_PACKAGE=tensorflow-gpu
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
 
 COPY bashrc /etc/bash.bashrc
diff --git a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
index 2c9b9f3f9a081e97c96cedf1bbdf0936a9961d46..c4ec6095c0cae43b9d5756cd4391ca3ddd329fbe 100644
--- a/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/jupyter.partial.Dockerfile
@@ -1,8 +1,16 @@
-RUN ${PIP} install jupyter
+RUN ${PIP} install jupyter matplotlib
 
-RUN mkdir /notebooks && chmod a+rwx /notebooks
+RUN mkdir -p /tf/tensorflow-tutorials && chmod -R a+rwx /tf/
 RUN mkdir /.local && chmod a+rwx /.local
-WORKDIR /notebooks
+RUN apt-get install -y --no-install-recommends wget
+WORKDIR /tf/tensorflow-tutorials
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_classification.ipynb
+RUN wget https://raw.githubusercontent.com/tensorflow/docs/master/site/en/tutorials/keras/basic_text_classification.ipynb
+COPY readme-for-jupyter.md README.md
+RUN apt-get autoremove -y && apt-get remove -y wget
+WORKDIR /tf
 EXPOSE 8888
 
-CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/notebooks --ip 0.0.0.0 --no-browser --allow-root"]
+RUN ${PYTHON} -m ipykernel.kernelspec
+
+CMD ["bash", "-c", "source /etc/bash.bashrc && jupyter notebook --notebook-dir=/tf --ip 0.0.0.0 --no-browser --allow-root"]
diff --git a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
index 96e79547f0c67c232565019e0ae64d24d55d1516..76758bd147ef9d52b3db072bd0091190e132667c 100644
--- a/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/tensorflow.partial.Dockerfile
@@ -1,2 +1,7 @@
-ARG TF_PACKAGE
+# Options:
+#   tensorflow
+#   tensorflow-gpu
+#   tf-nightly
+#   tf-nightly-gpu
+ARG TF_PACKAGE=tensorflow
 RUN ${PIP} install ${TF_PACKAGE}
diff --git a/tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/test-import.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
deleted file mode 100644
index 0a50735bf83364446919254010f0acab0e26404c..0000000000000000000000000000000000000000
--- a/tensorflow/tools/dockerfiles/partials/ubuntu.partial.Dockerfile
+++ /dev/null
@@ -1,2 +0,0 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
diff --git a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
similarity index 58%
rename from tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
index b08d8bdd14b638b87ac8fbd57cf2b3e8c4564582..156bb019914554e650421fb23bcebc935658abdb 100644
--- a/tensorflow/tools/dockerfiles/partials/bazel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/bazel.partial.Dockerfile
@@ -6,6 +6,20 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON}-dev \
     swig
 
+RUN ${PIP} --no-cache-dir install \
+    Pillow \
+    h5py \
+    keras_applications \
+    keras_preprocessing \
+    matplotlib \
+    mock \
+    numpy \
+    scipy \
+    sklearn \
+    pandas \
+    && test "${USE_PYTHON_3_NOT_2}" -eq 1 && true || ${PIP} --no-cache-dir install \
+    enum34
+
 # Install bazel
 RUN echo "deb [arch=amd64] http://storage.googleapis.com/bazel-apt stable jdk1.8" | tee /etc/apt/sources.list.d/bazel.list && \
     curl https://bazel.build/bazel-release.pub.gpg | apt-key add - && \
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..d01b26e27f6ffb35affb95f8e40b7ce3b8e52d0a
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/cpu.partial.Dockerfile
@@ -0,0 +1 @@
+FROM ubuntu:${UBUNTU_VERSION} as base
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
similarity index 64%
rename from tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
index bc792722766e07d1af3d6944f14a8eb26f43dc1a..a61dfbbe54eb163b25160490f3ee245c36d21ffe 100644
--- a/tensorflow/tools/dockerfiles/partials/ubuntu-devel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-cpu.partial.Dockerfile
@@ -1,5 +1,4 @@
-ARG UBUNTU_VERSION=16.04
-FROM ubuntu:${UBUNTU_VERSION}
+FROM ubuntu:${UBUNTU_VERSION} AS base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -11,7 +10,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
-        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -22,3 +20,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         && \
     apt-get clean && \
     rm -rf /var/lib/apt/lists/*
+ 
+ENV CI_BUILD_PYTHON python
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
similarity index 70%
rename from tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
index 45159f711fcbdd0e6bb7083169d2abb39ab8dea5..95f9875012d2a552be4af6f59cb6a5c60d99dce5 100644
--- a/tensorflow/tools/dockerfiles/partials/nvidia-devel.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/devel-nvidia.partial.Dockerfile
@@ -1,5 +1,4 @@
-ARG UBUNTU_VERSION=16.04
-FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION}
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
@@ -22,6 +21,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        python-dev \
         rsync \
         software-properties-common \
         unzip \
@@ -44,6 +44,18 @@ RUN mkdir /usr/local/cuda-9.0/lib &&  \
     ln -s /usr/lib/x86_64-linux-gnu/libnccl.so.2 /usr/local/cuda/lib/libnccl.so.2 && \
     ln -s /usr/include/nccl.h /usr/local/cuda/include/nccl.h
 
-# TODO(tobyboyd): Remove after license is excluded from BUILD file.
-RUN gunzip /usr/share/doc/libnccl2/NCCL-SLA.txt.gz && \
-    cp /usr/share/doc/libnccl2/NCCL-SLA.txt /usr/local/cuda/
+# Configure the build for our CUDA configuration.
+ENV CI_BUILD_PYTHON python
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
+ENV TF_NEED_CUDA 1
+ENV TF_NEED_TENSORRT 1
+ENV TF_CUDA_COMPUTE_CAPABILITIES=3.5,5.2,6.0,6.1,7.0
+ENV TF_CUDA_VERSION=9.0
+ENV TF_CUDNN_VERSION=7
+
+# NCCL 2.x
+ENV TF_NCCL_VERSION=2
+
+# Check out TensorFlow source code if --build_arg CHECKOUT_TENSORFLOW=1
+ARG CHECKOUT_TF_SRC=0
+RUN test "${CHECKOUT_TF_SRC}" -eq 1 && git clone https://github.com/tensorflow/tensorflow.git /tensorflow_src
diff --git a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
similarity index 78%
rename from tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
index 1064390af3b5006a8e539ad2b006d692e51692ae..1dc8e43aaddc606efde2cbd84215f7ef7131e251 100644
--- a/tensorflow/tools/dockerfiles/partials/nvidia.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/nvidia.partial.Dockerfile
@@ -1,6 +1,5 @@
-FROM nvidia/cuda:9.0-base-ubuntu16.04
+FROM nvidia/cuda:9.0-base-ubuntu${UBUNTU_VERSION} as base
 
-# Pick up some TF dependencies
 RUN apt-get update && apt-get install -y --no-install-recommends \
         build-essential \
         cuda-command-line-tools-9-0 \
@@ -9,6 +8,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         cuda-curand-9-0 \
         cuda-cusolver-9-0 \
         cuda-cusparse-9-0 \
+        curl \
         libcudnn7=7.2.1.38-1+cuda9.0 \
         libnccl2=2.2.13-1+cuda9.0 \
         libfreetype6-dev \
@@ -16,6 +16,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
         libpng12-dev \
         libzmq3-dev \
         pkg-config \
+        rsync \
         software-properties-common \
         unzip \
         && \
@@ -26,3 +27,6 @@ RUN apt-get update && \
         apt-get install nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 && \
         apt-get update && \
         apt-get install libnvinfer4=4.1.2-1+cuda9.0
+
+# For CUDA profiling, TensorFlow requires CUPTI.
+ENV LD_LIBRARY_PATH /usr/local/cuda/extras/CUPTI/lib64:$LD_LIBRARY_PATH
diff --git a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
similarity index 66%
rename from tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
rename to tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
index ee08af73a8e3bdd50537209c6624f4c143da9ad7..6af473195380801bded2e6849e97432caf07816b 100644
--- a/tensorflow/tools/dockerfiles/partials/python.partial.Dockerfile
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/python.partial.Dockerfile
@@ -10,6 +10,9 @@ RUN apt-get update && apt-get install -y \
     ${PYTHON} \
     ${PYTHON}-pip
 
-RUN ${PIP} install --upgrade \
+RUN ${PIP} --no-cache-dir install --upgrade \
     pip \
     setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which ${PYTHON}) /usr/local/bin/python 
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/test-devel.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6ecd2b8b1acd59e50c172c3fc9c5574626ed5608
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/partials/ubuntu/version.partial.Dockerfile
@@ -0,0 +1 @@
+ARG UBUNTU_VERSION=16.04
diff --git a/tensorflow/tools/dockerfiles/readme-for-jupyter.md b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
new file mode 100644
index 0000000000000000000000000000000000000000..f104a7533b884bea06c46e9670d07d92bca87ea1
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/readme-for-jupyter.md
@@ -0,0 +1,3 @@
+Want more tutorials like these?
+
+Check out tensorflow.org/tutorials!
diff --git a/tensorflow/tools/dockerfiles/spec.yml b/tensorflow/tools/dockerfiles/spec.yml
index 28bf9a55da123a0a45cd4b0e54971f14c355d794..19d96e7a3df4468ff82f2029a1945a02b1e58932 100644
--- a/tensorflow/tools/dockerfiles/spec.yml
+++ b/tensorflow/tools/dockerfiles/spec.yml
@@ -1,195 +1,148 @@
-# ======
-# HEADER
-# ======
-#
-# This is commented-out and prepended to each generated Dockerfile.
 header: |
-    Copyright 2018 The TensorFlow Authors. All Rights Reserved.
-
-    Licensed under the Apache License, Version 2.0 (the "License");
-    you may not use this file except in compliance with the License.
-    You may obtain a copy of the License at
-
-        http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing, software
-    distributed under the License is distributed on an "AS IS" BASIS,
-    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-    See the License for the specific language governing permissions and
-    limitations under the License.
-    ============================================================================
-
-    THIS IS A GENERATED DOCKERFILE.
-
-    This file was assembled from multiple pieces, whose use is documented
-    below. Please refer to the the TensorFlow dockerfiles documentation for
-    more information. Build args are documented as their default value.
-
-# ========
-# PARTIALS
-# ========
+    # Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+    #
+    # Licensed under the Apache License, Version 2.0 (the "License");
+    # you may not use this file except in compliance with the License.
+    # You may obtain a copy of the License at
+    #
+    #     http://www.apache.org/licenses/LICENSE-2.0
+    #
+    # Unless required by applicable law or agreed to in writing, software
+    # distributed under the License is distributed on an "AS IS" BASIS,
+    # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    # See the License for the specific language governing permissions and
+    # limitations under the License.
+    # ============================================================================
+    #
+    # THIS IS A GENERATED DOCKERFILE.
+    #
+    # This file was assembled from multiple pieces, whose use is documented
+    # throughout. Please refer to the TensorFlow dockerfiles documentation
+    # for more information.
+
+# A combinatorial explosion of Docker images and Dockerfiles.
+# Each "release" defines all of the ways to combine related but separate chunks
+# of functionality ("slices") by listing all of the "slice sets" to use when
+# building.
 #
-# Represent and document pieces of a Dockerfile. Spec:
-# 
-# name: the name of the partial, is referenced from the images section
-#   desc: A description, inserted later into the Dockerfile
-#   file: Alternative file prefix, e.g. file.partial.Dockerfile. The default is
-#         the name of the partial.
-#   args: A dict of ARGs in the Dockerfile; each entry has the format
-#      ARG_NAME: VALUE where VALUE is one of:
-#         - a dict:
-#             desc: Documentation for the arg
-#             default: Default value for the arg; is written to the Dockerfile
-#             options: List of strings, part of documentation
-#         - a concrete value: the same as a dictionary with default: [value].
-
-partials:
-    ubuntu:
-        desc: Start from Ubuntu (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    ubuntu-devel:
-        desc: Start from Ubuntu, with TF development packages (no GPU support)
-        args:
-            UBUNTU_VERSION: 16.04
-
-    bazel:
-        desc: Install the latest version of Bazel and Python development tools.
-
-    nvidia:
-        desc: NVIDIA with CUDA and CuDNN, no dev stuff
-        args:
-            UBUNTU_VERSION: 16.04
-
-    nvidia-devel:
-        desc: >
-            Start from Nvidia's Ubuntu base image with CUDA and CuDNN, with TF
-            development packages.
-        args:
-            UBUNTU_VERSION: 16.04
+# For example, a release that uses {nightly}{py} would create 4 Dockerfiles
+# (which could become images or concrete Dockerfiles), because the "nightly"
+# and "py" slice sets both have two entries:
+#
+#   - nightly (no -py2 because the Python 2 slice set has add_to_name: ""
+#   - nightly-py3
+#   - nightly-gpu (similar)
+#   - nightly-gpu-py3
+#
+# Releases are all treated differently by TensorFlow's CI systems.
+releases:
+    # Built Nightly and pushed to tensorflow/tensorflow
+    nightly:
+        tag_specs:
+            - "{nightly}{py}{jupyter}"
+            - "{ubuntu-devel}{py}"
+
+    # Built per-release and pushed to tensorflow/tensorflow
+    # --arg _TAG_PREFIX=<val> should be set to "1.11" (for example) or "latest".
+    versioned:
+        tag_specs:
+            - "{_TAG_PREFIX}{ubuntu}{py}{jupyter}"
+
+    # Dockerfiles stored in the TF repo; not pushed anywhere
+    dockerfiles:
+        is_dockerfiles: true
+        upload_images: false
+        tag_specs:
+            - "{ubuntu}{jupyter}"
+            - "{ubuntu-devel}{jupyter}"
+
+slice_sets:
+
+    py:
+        - add_to_name: ""
+          args:
+              - USE_PYTHON_3_NOT_2=
+        - add_to_name: "-py3"
+          args:
+              - USE_PYTHON_3_NOT_2=1
 
-    python:
-        desc: Python is required for TensorFlow and other libraries.
-        args:
-            USE_PYTHON_3_NOT_2:
-                default: true
-                desc: Install python 3 over Python 2
-                
-    tensorflow:
-        desc: Install the TensorFlow Python package.
-        args:
-            TF_PACKAGE:
-                default: tensorflow
-                options:
-                    - tensorflow
-                    - tensorflow-gpu
-                    - tf-nightly
-                    - tf-nightly-gpu
-                desc: The specific TensorFlow Python package to install
-    shell:
-        desc: Configure TensorFlow's shell prompt and login tools.
     jupyter:
-        desc: Launch Jupyter on execution instead of a bash prompt.
-
-# ======
-# IMAGES
-# ======
-# 
-# Represent Dockerfiles. Spec:
-# 
-# name: the name of the image, possibly referenced by other images
-#   desc: A description, inserted later into the Dockerfile
-#   create-dockerfile: Create a dockerfile based on this. Useful for creating
-#      extensible base images that don't need a file. Default is true.
-#   partials: List of VALUEs, where a VALUE is either:
-#      - the name of a partial, which inserts that partial into this image
-#      - image: [name of another image], which inserts the partials from that
-#        image into this image
-#   arg-defaults: List of VALUEs, where a VALUE is either:
-#      - ARG_NAME: VALUE, which sets the ARG_NAME to VALUE wherever it appears
-#        in this image's partials
-#      - [name of another image], which loads the default args from that image
-images:
-
-    nodev:
-        create-dockerfile: false
-        partials:
-            - python
-            - tensorflow
-            - shell
-
-    dev:
-        create-dockerfile: false
-        partials:
-            - python
-            - bazel
-            - shell
-
-    cpu:
-      desc: Ubuntu-based, CPU-only environment for using TensorFlow
-      partials:
-        - ubuntu
-        - image: nodev
-
-    cpu-devel:
-      desc: >
-          Ubuntu-based, CPU-only environment for developing changes for
-          TensorFlow.
-      partials:
-        - ubuntu-devel
-        - image: dev
+        - add_to_name: ""
+        - add_to_name: "-jupyter"
+          partials:
+              - jupyter
 
-    nvidia:
-      desc: Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia
-        - image: nodev
-
-    nvidia-devel:
-      desc: >
-          Ubuntu-based, Nvidia-GPU-enabled environment for developing changes
-          for TensorFlow.
-      arg-defaults: 
-        - TF_PACKAGE: tensorflow-gpu
-      partials:
-        - nvidia-devel
-        - image: dev
-
-    cpu-jupyter:
-      desc: >
-          Ubuntu-based, CPU-only environment for using TensorFlow, with Jupyter
-          included.
-      partials:
-        - image: cpu
-        - jupyter
-
-    cpu-devel-jupyter:
-      desc: >
-         Ubuntu-based, CPU-only environment for developing changes for
-         TensorFlow, with Jupyter included.
-      partials:
-        - image: cpu-devel
-        - jupyter
-
-    nvidia-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for using TensorFlow, with
-        Jupyter included.
-      arg-defaults: 
-        - nvidia
-      partials:
-        - image: nvidia
-        - jupyter
+    ubuntu:
+        - add_to_name: ""
+          dockerfile_exclusive_name: "cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+        - add_to_name: "-gpu"
+          dockerfile_exclusive_name: "gpu"
+          args:
+              - TF_PACKAGE=tensorflow-gpu
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          tests:
+              - import-gpu.sh
+          test_runtime: nvidia
 
-    nvidia-devel-jupyter:
-      desc: >
-        Ubuntu-based, Nvidia-GPU-enabled environment for developing changes for
-        TensorFlow, with Jupyter included.
-      arg-defaults: 
-        - nvidia-devel
-      partials:
-        - image: nvidia-devel
-        - jupyter
+    ubuntu-devel:
+        - add_to_name: "devel"
+          dockerfile_exclusive_name: "devel-cpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-cpu
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-cpu.sh
+          args:
+              - CHECKOUT_TF_SRC=1
+        - add_to_name: "devel-gpu"
+          dockerfile_exclusive_name: "devel-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/devel-nvidia
+              - ubuntu/python
+              - ubuntu/bazel
+              - shell
+          tests:
+              - build-gpu.sh
+          test_runtime: nvidia
+          args:
+              - CHECKOUT_TF_SRC=1
+
+    nightly:
+        - add_to_name: "nightly"
+          partials:
+              - ubuntu/version
+              - ubuntu/cpu
+              - ubuntu/python
+              - tensorflow
+              - shell
+          args:
+              - TF_PACKAGE=tf-nightly
+          tests:
+              - import.sh
+        - add_to_name: "nightly-gpu"
+          partials:
+              - ubuntu/version
+              - ubuntu/nvidia
+              - ubuntu/python
+              - tensorflow
+              - shell
+          test_runtime: nvidia
+          tests:
+              - import-gpu.sh
+          args:
+              - TF_PACKAGE=tf-nightly-gpu
diff --git a/tensorflow/tools/dockerfiles/tests/build-cpu.sh b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bcdc4c2139c83e65c15998d3dd6be2f29e27bff3
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-cpu.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+# For optimized builds appropriate for the hardware platform of your choosing, uncomment below...
+# For ivy-bridge or sandy-bridge
+# --copt=-march="ivybridge" \
+# for haswell, broadwell, or skylake
+# --copt=-march="haswell" \
+tensorflow/tools/ci_build/builds/configured CPU \
+  bazel build -c opt --copt=-mavx --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+      tensorflow/tools/pip_package:build_pip_package && \
+  bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+  pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+  rm -rf /tmp/pip && \
+  rm -rf /root/.cache
+
diff --git a/tensorflow/tools/dockerfiles/tests/build-gpu.sh b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..76b25d5a7419b9a07a6799f14fa5175fb6fa36d5
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/build-gpu.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Download and build TensorFlow.
+set -euxo pipefail
+git clone --branch=master --depth=1 https://github.com/tensorflow/tensorflow.git /tensorflow
+cd /tensorflow
+
+ln -s $(which ${PYTHON}) /usr/local/bin/python 
+
+ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
+
+LD_LIBRARY_PATH=/usr/local/cuda/lib64/stubs:${LD_LIBRARY_PATH} \
+tensorflow/tools/ci_build/builds/configured GPU \
+bazel build -c opt --copt=-mavx --config=cuda \
+    --cxxopt="-D_GLIBCXX_USE_CXX11_ABI=0" \
+    tensorflow/tools/pip_package:build_pip_package && \
+rm /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
+bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/pip && \
+pip --no-cache-dir install --upgrade /tmp/pip/tensorflow-*.whl && \
+rm -rf /tmp/pip && \
+rm -rf /root/.cache
diff --git a/tensorflow/tools/dockerfiles/tests/import-gpu.sh b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
new file mode 100755
index 0000000000000000000000000000000000000000..6559210dcbfbb5fe3c76c369c5ae211920f46d15
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import-gpu.sh
@@ -0,0 +1,18 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+python -c 'import tensorflow as tf; tf.test.is_gpu_available() or exit(1)'
diff --git a/tensorflow/tools/dockerfiles/tests/import.sh b/tensorflow/tools/dockerfiles/tests/import.sh
new file mode 100755
index 0000000000000000000000000000000000000000..b73bd86a8529e2b7634f0b027196b978f8245da0
--- /dev/null
+++ b/tensorflow/tools/dockerfiles/tests/import.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+set -euxo pipefail
+python -c 'import tensorflow as tf'
diff --git a/tensorflow/tools/dockerfiles/assembler.Dockerfile b/tensorflow/tools/dockerfiles/tools.Dockerfile
similarity index 95%
rename from tensorflow/tools/dockerfiles/assembler.Dockerfile
rename to tensorflow/tools/dockerfiles/tools.Dockerfile
index 7a8e07fced3465e188f47727013fa92d14424c7c..e8929295a5ee397acbe46ebf96894174ca01fca2 100644
--- a/tensorflow/tools/dockerfiles/assembler.Dockerfile
+++ b/tensorflow/tools/dockerfiles/tools.Dockerfile
@@ -20,8 +20,9 @@
 FROM debian:stretch
 LABEL maintainer="Austin Anderson <angerson@google.com>"
 
-RUN apt-get update && apt-get install -y python3 python3-pip bash
-RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus
+RUN apt-get update && apt-get install -y python3 python3-pip bash curl
+RUN curl -sSL https://get.docker.com/ | sh
+RUN pip3 install --upgrade pip setuptools pyyaml absl-py cerberus docker
 
 WORKDIR /tf
 VOLUME ["/tf"]
diff --git a/tensorflow/tools/docs/BUILD b/tensorflow/tools/docs/BUILD
index 1a53f241773a199a6fa1f5388d2c0a4dcf463503..b072853a4ec298ce5c15afc1307a966ecefb743f 100644
--- a/tensorflow/tools/docs/BUILD
+++ b/tensorflow/tools/docs/BUILD
@@ -142,17 +142,30 @@ py_test(
     ],
 )
 
-py_binary(
-    name = "generate_1_0",
-    srcs = ["generate_1_0.py"],
+py_test(
+    name = "generate2_test",
+    srcs = ["generate2_test.py"],
     srcs_version = "PY2AND3",
+    tags = [
+        "manual",
+        # No reason to run sanitizers or fastbuild for this test.
+        "noasan",
+        "nomsan",
+        "notsan",
+        "optonly",
+    ],
     deps = [
-        ":generate_lib",
-        "//tensorflow:tensorflow_py",
-        "//tensorflow/python/debug:debug_py",
+        ":generate2",
     ],
 )
 
+py_binary(
+    name = "generate2",
+    srcs = ["generate2.py"],
+    srcs_version = "PY2AND3",
+    deps = ["//tensorflow:tensorflow_py"],
+)
+
 py_library(
     name = "py_guide_parser",
     srcs = ["py_guide_parser.py"],
diff --git a/tensorflow/tools/docs/generate2.py b/tensorflow/tools/docs/generate2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fba909d26defffad2d7dbaffa4463695685ae50c
--- /dev/null
+++ b/tensorflow/tools/docs/generate2.py
@@ -0,0 +1,82 @@
+# Copyright 2018 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+r"""A tool to generate api_docs for TensorFlow2.
+
+```
+python generate2.py --output_dir=/tmp/out
+```
+
+Requires a local installation of:
+  https://github.com/tensorflow/docs/tree/master/tools
+  tf-nightly-2.0-preview
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+from os import path
+
+from absl import app
+from absl import flags
+
+import tensorflow as tf
+
+from tensorflow_docs.api_generator import generate_lib
+
+FLAGS = flags.FLAGS
+
+flags.DEFINE_string(
+    "code_url_prefix",
+    "/code/stable/tensorflow/",
+    "A url to prepend to code paths when creating links to defining code")
+
+flags.DEFINE_string(
+    "output_dir", "/tmp/out",
+    "A directory, where the docs will be output to.")
+
+flags.DEFINE_bool("search_hints", True,
+                  "Include meta-data search hints at the top of each file.")
+
+
+def build_docs(output_dir, code_url_prefix, search_hints=True):
+  """Build api docs for tensorflow v2.
+
+  Args:
+    output_dir: A string path, where to put the files.
+    code_url_prefix: prefix for "Defined in" links.
+    search_hints: Bool. Include meta-data search hints at the top of each file.
+  """
+  base_dir = path.dirname(tf.__file__)
+  doc_generator = generate_lib.DocGenerator(
+      root_title="TensorFlow 2.0 Preview",
+      py_modules=[("tf", tf)],
+      base_dir=base_dir,
+      search_hints=search_hints,
+      code_url_prefix=code_url_prefix,
+      site_path="api_docs/")
+
+  doc_generator.build(output_dir)
+
+
+def main(argv):
+  del argv
+  build_docs(output_dir=FLAGS.output_dir,
+             code_url_prefix=FLAGS.code_url_prefix,
+             search_hints=FLAGS.search_hints)
+
+
+if __name__ == "__main__":
+  app.run(main)
diff --git a/tensorflow/contrib/estimator/python/estimator/linear.py b/tensorflow/tools/docs/generate2_test.py
similarity index 60%
rename from tensorflow/contrib/estimator/python/estimator/linear.py
rename to tensorflow/tools/docs/generate2_test.py
index b6a4444f66c2dd9ce104053613997af1f9c543eb..774d45c536ba158d6d4cf4f4ac6043e76b88912f 100644
--- a/tensorflow/contrib/estimator/python/estimator/linear.py
+++ b/tensorflow/tools/docs/generate2_test.py
@@ -12,21 +12,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ==============================================================================
-"""linear python module.
-
-Importing from tensorflow.python.estimator is unsupported
-and will soon break!
-"""
-# pylint: disable=unused-import,g-bad-import-order,g-import-not-at-top,wildcard-import
+"""Tests for tensorflow.tools.docs.generate2."""
 
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-from tensorflow_estimator.contrib.estimator.python.estimator import linear
+import os
+import shutil
+
+from tensorflow.python.platform import googletest
+from tensorflow.tools.docs import generate2
+
+
+class Generate2Test(googletest.TestCase):
+
+  def test_end_to_end(self):
+    output_dir = os.path.join(googletest.GetTempDir(), 'output')
+    if os.path.exists(output_dir):
+      shutil.rmtree(output_dir)
+    os.makedirs(output_dir)
+    generate2.build_docs(output_dir=output_dir, code_url_prefix='')
 
-# Include attrs that start with single underscore.
-_HAS_DYNAMIC_ATTRIBUTES = True
-linear.__all__ = [s for s in dir(linear) if not s.startswith('__')]
 
-from tensorflow_estimator.contrib.estimator.python.estimator.linear import *
+if __name__ == '__main__':
+  googletest.main()
diff --git a/tensorflow/tools/docs/generate_1_0.py b/tensorflow/tools/docs/generate_1_0.py
deleted file mode 100644
index f4384e0ced77718c80d4d146a2d72072588a0541..0000000000000000000000000000000000000000
--- a/tensorflow/tools/docs/generate_1_0.py
+++ /dev/null
@@ -1,92 +0,0 @@
-# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-"""Generate docs for the TensorFlow Python API."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import sys
-
-import tensorflow as tf
-
-from tensorflow.python import debug as tf_debug
-from tensorflow.python.util import tf_inspect
-from tensorflow.tools.docs import generate_lib
-
-if __name__ == '__main__':
-  doc_generator = generate_lib.DocGenerator()
-  doc_generator.add_output_dir_argument()
-  doc_generator.add_src_dir_argument()
-
-  # This doc generator works on the TensorFlow codebase. Since this script lives
-  # at tensorflow/tools/docs, and all code is defined somewhere inside
-  # tensorflow/, we can compute the base directory (two levels up), which is
-  # valid unless we're trying to apply this to a different code base, or are
-  # moving the script around.
-  script_dir = os.path.dirname(tf_inspect.getfile(tf_inspect.currentframe()))
-  default_base_dir = os.path.join(script_dir, '..', '..')
-  doc_generator.add_base_dir_argument(default_base_dir)
-
-  flags = doc_generator.parse_known_args()
-
-  # tf_debug is not imported with tf, it's a separate module altogether
-  doc_generator.set_py_modules([('tf', tf), ('tfdbg', tf_debug)])
-
-  doc_generator.set_do_not_descend_map({
-      'tf': ['cli', 'lib', 'wrappers'],
-      'tf.contrib': [
-          'compiler',
-          'factorization',
-          'grid_rnn',
-          'labeled_tensor',
-          'quantization',
-          'session_bundle',
-          'slim',
-          'solvers',
-          'specs',
-          'tensor_forest',
-          'tensorboard',
-          'testing',
-          'training',
-          'tfprof',
-      ],
-      'tf.contrib.bayesflow': [
-          'entropy', 'monte_carlo', 'special_math',
-          'stochastic_gradient_estimators', 'stochastic_graph',
-          'stochastic_tensor', 'stochastic_variables', 'variational_inference'
-      ],
-      'tf.contrib.distributions': ['bijector'],
-      'tf.contrib.ffmpeg': ['ffmpeg_ops'],
-      'tf.contrib.graph_editor': [
-          'edit', 'match', 'reroute', 'subgraph', 'transform', 'select', 'util'
-      ],
-      'tf.contrib.layers': ['feature_column', 'summaries'],
-      'tf.contrib.learn': [
-          'datasets',
-          'head',
-          'graph_actions',
-          'io',
-          'models',
-          'monitors',
-          'ops',
-          'preprocessing',
-          'utils',
-      ],
-      'tf.contrib.util': ['loader'],
-  })
-
-  sys.exit(doc_generator.build(flags))
diff --git a/tensorflow/tools/graph_transforms/backports.cc b/tensorflow/tools/graph_transforms/backports.cc
index 5c153e8cefc900728c78340dd43a56737d887b21..041e7eedfb7a38f0eeb7ec17b44c92010041dc29 100644
--- a/tensorflow/tools/graph_transforms/backports.cc
+++ b/tensorflow/tools/graph_transforms/backports.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/flatten_atrous.cc b/tensorflow/tools/graph_transforms/flatten_atrous.cc
index a6f7cb0ed8b45dc537b6fe8c7b9d7e09685feef9..c80b28fbbca7e3d29f5abdef30a130934f17c9c0 100644
--- a/tensorflow/tools/graph_transforms/flatten_atrous.cc
+++ b/tensorflow/tools/graph_transforms/flatten_atrous.cc
@@ -18,7 +18,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
index 975b17380f6ca7fbd94783c6226f54c89e730cde..16a0f7d58df66be06224d58de623ee7e2dc41880 100644
--- a/tensorflow/tools/graph_transforms/fold_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_constants_lib.cc b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
index 6df2718e61074daab7bdfd75ca923035ffe5fba4..dcc36b1a8557cf30ac030302fcb7545da55c7886 100644
--- a/tensorflow/tools/graph_transforms/fold_constants_lib.cc
+++ b/tensorflow/tools/graph_transforms/fold_constants_lib.cc
@@ -33,7 +33,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/numbers.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
index 156636ab8215d9abdc9e0ed461df550f1c7ed09c..fd546f812c0dafc5d2e71c94710c3c3f5b75250e 100644
--- a/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
+++ b/tensorflow/tools/graph_transforms/fold_old_batch_norms.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_convolutions.cc b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
index df6e9e6dc2864872fa8f30741735a7d5985a3104..7754dde9c68753ea648ce31e0f87329826e10828 100644
--- a/tensorflow/tools/graph_transforms/fuse_convolutions.cc
+++ b/tensorflow/tools/graph_transforms/fuse_convolutions.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
index bd021d094efcea5ca5f512929d1b84e933a17d84..5aa2dd4f99b89f0ea03fe69db854c55f3f2f3c38 100644
--- a/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
+++ b/tensorflow/tools/graph_transforms/fuse_quantized_convolution.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/fold_constants_lib.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
diff --git a/tensorflow/tools/graph_transforms/insert_logging.cc b/tensorflow/tools/graph_transforms/insert_logging.cc
index 377665448c244aeace78f231ba0c263613afd9a0..ccc48540eb9731514ecbff41de86df956ff91a3b 100644
--- a/tensorflow/tools/graph_transforms/insert_logging.cc
+++ b/tensorflow/tools/graph_transforms/insert_logging.cc
@@ -22,7 +22,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/obfuscate_names.cc b/tensorflow/tools/graph_transforms/obfuscate_names.cc
index c470b51b96096a36eacdc67a74431ec02e0515d0..ee8ca3d097d71fef91d0ee50057ff6d215891596 100644
--- a/tensorflow/tools/graph_transforms/obfuscate_names.cc
+++ b/tensorflow/tools/graph_transforms/obfuscate_names.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_nodes.cc b/tensorflow/tools/graph_transforms/quantize_nodes.cc
index a022f5792676c62c52fd1197b0d8c436f7161a47..b139dad2ddd13ade70a4563a50b0db2db298ef36 100644
--- a/tensorflow/tools/graph_transforms/quantize_nodes.cc
+++ b/tensorflow/tools/graph_transforms/quantize_nodes.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/quantize_weights.cc b/tensorflow/tools/graph_transforms/quantize_weights.cc
index cccae8a992a64b0f49798eda71513a2fe62ad656..a1a6e27171ee5a48dec91d64a3b15f6caa88dbf8 100644
--- a/tensorflow/tools/graph_transforms/quantize_weights.cc
+++ b/tensorflow/tools/graph_transforms/quantize_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_attribute.cc b/tensorflow/tools/graph_transforms/remove_attribute.cc
index b1a04c0f283bf6bc03da702447694558c5b98538..0a76c2b2052a2c26ee66691b361fff2be70bbf30 100644
--- a/tensorflow/tools/graph_transforms/remove_attribute.cc
+++ b/tensorflow/tools/graph_transforms/remove_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_device.cc b/tensorflow/tools/graph_transforms/remove_device.cc
index 975fa3706335dd38e4f0992ff4c155addfc5e6a9..fdd43168a117b89884187e6b7a29e5f44f14fd33 100644
--- a/tensorflow/tools/graph_transforms/remove_device.cc
+++ b/tensorflow/tools/graph_transforms/remove_device.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/remove_nodes.cc b/tensorflow/tools/graph_transforms/remove_nodes.cc
index 05f036a86a09b2a6a94e9c1a1220803eabc64da5..aa0288689d9e093a39e8aa6b9156bac19ef40491 100644
--- a/tensorflow/tools/graph_transforms/remove_nodes.cc
+++ b/tensorflow/tools/graph_transforms/remove_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_attribute.cc b/tensorflow/tools/graph_transforms/rename_attribute.cc
index bd066aab5b9ab69a38e313c0b0437457b3a2bb52..62897d43a8ca774418c7b45c1f886cd8cd7fd850 100644
--- a/tensorflow/tools/graph_transforms/rename_attribute.cc
+++ b/tensorflow/tools/graph_transforms/rename_attribute.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/rename_op.cc b/tensorflow/tools/graph_transforms/rename_op.cc
index e1e13c1be43a531355e5df4530183bd55836fe4c..9deee8bbffbbda41c1e59480c5e642d4c6ce1de9 100644
--- a/tensorflow/tools/graph_transforms/rename_op.cc
+++ b/tensorflow/tools/graph_transforms/rename_op.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/round_weights.cc b/tensorflow/tools/graph_transforms/round_weights.cc
index 72927e439b7f4177a8db035d022ba450a924ad98..3a145ac1f6b0ef238383f4eb75dd5de023503c47 100644
--- a/tensorflow/tools/graph_transforms/round_weights.cc
+++ b/tensorflow/tools/graph_transforms/round_weights.cc
@@ -23,7 +23,6 @@ limitations under the License.
 #include "tensorflow/core/kernels/quantization_utils.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
index 43152d20fcc1aa477983c8d792dcab2e74664e73..548f5ba4820a82718676d995cbd7a09332051bf4 100644
--- a/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
+++ b/tensorflow/tools/graph_transforms/sort_by_execution_order.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/graph_transforms/sparsify_gather.cc b/tensorflow/tools/graph_transforms/sparsify_gather.cc
index cc82100148117c7846ba5781e1a97e172ad7f03c..bed51f89821032862ec3d24077cb51d9c676be94 100644
--- a/tensorflow/tools/graph_transforms/sparsify_gather.cc
+++ b/tensorflow/tools/graph_transforms/sparsify_gather.cc
@@ -25,7 +25,6 @@ limitations under the License.
 #include "tensorflow/core/lib/strings/str_util.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/core/util/tensor_bundle/tensor_bundle.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
diff --git a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
index ae9d0aa20999c86fe2ea8902204604807f0f298c..d466f21c17ddfec9c0b0181f844b1b608f95246a 100644
--- a/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
+++ b/tensorflow/tools/graph_transforms/strip_unused_nodes.cc
@@ -21,7 +21,6 @@ limitations under the License.
 #include "tensorflow/core/graph/subgraph.h"
 #include "tensorflow/core/platform/init_main.h"
 #include "tensorflow/core/public/session.h"
-#include "tensorflow/core/util/command_line_flags.h"
 #include "tensorflow/tools/graph_transforms/transform_utils.h"
 
 namespace tensorflow {
diff --git a/tensorflow/tools/lib_package/BUILD b/tensorflow/tools/lib_package/BUILD
index 7aaa845ae92eed26fd72a13c4fdd74692163458f..1186189844aa887ba011b532df3a73d89ffe52b8 100644
--- a/tensorflow/tools/lib_package/BUILD
+++ b/tensorflow/tools/lib_package/BUILD
@@ -112,6 +112,7 @@ pkg_tar(
 genrule(
     name = "clicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -168,7 +169,6 @@ genrule(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
     ),
@@ -180,6 +180,7 @@ genrule(
 genrule(
     name = "jnilicenses_generate",
     srcs = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
@@ -206,7 +207,6 @@ genrule(
         "@zlib_archive//:zlib.h",
         "@grpc//:LICENSE",
         "@grpc//third_party/address_sorting:LICENSE",
-        "@grpc//third_party/nanopb:LICENSE.txt",
     ] + select({
         "//tensorflow:android": [],
         "//tensorflow:ios": [],
diff --git a/tensorflow/tools/pip_package/BUILD b/tensorflow/tools/pip_package/BUILD
index 3a863d3c5238259ff8a5b44cf3ecfb08d2a5c920..baacb8723961d0a78b29338f1c4f212e46573b2c 100644
--- a/tensorflow/tools/pip_package/BUILD
+++ b/tensorflow/tools/pip_package/BUILD
@@ -88,6 +88,9 @@ COMMON_PIP_DEPS = [
     "//tensorflow/contrib/timeseries:timeseries_pip",
     "//tensorflow/contrib/tpu",
     "//tensorflow/examples/tutorials/mnist:package",
+    "//tensorflow/lite/python:interpreter_test_data",
+    "//tensorflow/lite/python:tflite_convert",
+    "//tensorflow/lite/toco/python:toco_from_protos",
     # "//tensorflow/python/autograph/converters:converters",
     # "//tensorflow/python/autograph/core:core",
     "//tensorflow/python/autograph/core:test_lib",
@@ -110,6 +113,7 @@ COMMON_PIP_DEPS = [
     "//tensorflow/python/eager:eager_pip",
     "//tensorflow/python/kernel_tests/signal:test_util",
     "//tensorflow/python/kernel_tests/testdata:self_adjoint_eig_op_test_files",
+    "//tensorflow/python/ops/ragged:ragged_test_util",
     "//tensorflow/python/saved_model:saved_model",
     "//tensorflow/python/tools:tools_pip",
     "//tensorflow/python/tools/api/generator:create_python_api",
@@ -124,7 +128,7 @@ COMMON_PIP_DEPS = [
 py_binary(
     name = "simple_console_for_windows",
     srcs = ["simple_console_for_windows.py"],
-    data = COMMON_PIP_DEPS,
+    data = COMMON_PIP_DEPS + ["//tensorflow/python:pywrap_tensorflow_import_lib_file"],
     srcs_version = "PY2AND3",
     deps = ["//tensorflow:tensorflow_py"],
 )
@@ -132,6 +136,7 @@ py_binary(
 filegroup(
     name = "licenses",
     data = [
+        "//third_party/icu/data:LICENSE",
         "//third_party/eigen3:LICENSE",
         "//third_party/fft2d:LICENSE",
         "//third_party/hadoop:LICENSE.txt",
@@ -210,7 +215,6 @@ filegroup(
         "grpc",
         [
             "@grpc//:LICENSE",
-            "@grpc//third_party/nanopb:LICENSE.txt",
             "@grpc//third_party/address_sorting:LICENSE",
         ],
     ) + if_ngraph([
@@ -227,15 +231,9 @@ sh_binary(
     data = select({
         "//tensorflow:windows": [
             ":simple_console_for_windows",
-            "//tensorflow/lite/python:interpreter_test_data",
-            "//tensorflow/lite/python:tflite_convert",
-            "//tensorflow/lite/toco/python:toco_from_protos",
         ],
         "//conditions:default": COMMON_PIP_DEPS + [
             ":simple_console",
-            "//tensorflow/lite/python:interpreter_test_data",
-            "//tensorflow/lite/python:tflite_convert",
-            "//tensorflow/lite/toco/python:toco_from_protos",
         ],
     }) + if_mkl_ml(["//third_party/mkl:intel_binary_blob"]),
 )
diff --git a/tensorflow/tools/pip_package/setup.py b/tensorflow/tools/pip_package/setup.py
index 07475cc0c4de6b3cd71795575637a3c06da7c041..85c913f158863c5ff3718ae3f305829e15237b22 100644
--- a/tensorflow/tools/pip_package/setup.py
+++ b/tensorflow/tools/pip_package/setup.py
@@ -87,7 +87,8 @@ if 'tf_nightly' in project_name:
   for i, pkg in enumerate(REQUIRED_PACKAGES):
     if 'tensorboard' in pkg:
       REQUIRED_PACKAGES[i] = 'tb-nightly >= 1.13.0a0, < 1.14.0a0'
-      break
+    if 'tensorflow_estimator' in pkg:
+      REQUIRED_PACKAGES[i] = 'tf-estimator-nightly'
 
 # weakref.finalize and enum were introduced in Python 3.4
 if sys.version_info < (3, 4):
@@ -106,6 +107,7 @@ CONSOLE_SCRIPTS = [
     # TensorBoard command, pip will inappropriately remove it during install,
     # even though the command is not removed, just moved to a different wheel.
     'tensorboard = tensorboard.main:run_main',
+    'tf_upgrade_v2 = tensorflow.tools.compatibility.tf_upgrade_v2_main:main',
 ]
 # pylint: enable=line-too-long
 
diff --git a/tensorflow/workspace.bzl b/tensorflow/workspace.bzl
index 6d3562caef68d526dbe5a94952466c3f3d18bc4c..60dcca3207f88f4bba9e0d11c263f657d44ed1b5 100755
--- a/tensorflow/workspace.bzl
+++ b/tensorflow/workspace.bzl
@@ -77,31 +77,31 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     mkl_repository(
         name = "mkl_linux",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "e2233534a9d15c387e22260997af4312a39e9f86f791768409be273b5453c4e6",
-        strip_prefix = "mklml_lnx_2019.0.20180710",
+        sha256 = "f00dc3b142a5be399bdeebd7e7ea369545a35d4fb84c86f98b6b048d72685295",
+        strip_prefix = "mklml_lnx_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_lnx_2019.0.20180710.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_lnx_2019.0.1.20180928.tgz",
         ],
     )
     mkl_repository(
         name = "mkl_windows",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "3fdcff17b018a0082491adf3ba143358265336a801646e46e0191ec8d58d24a2",
-        strip_prefix = "mklml_win_2019.0.20180710",
+        sha256 = "efef90b7b9613fab10f44c8ac4ff28db613a112c64ed94826d7e44df09c44b0b",
+        strip_prefix = "mklml_win_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_win_2019.0.20180710.zip",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_win_2019.0.1.20180928.zip",
         ],
     )
     mkl_repository(
         name = "mkl_darwin",
         build_file = clean_dep("//third_party/mkl:mkl.BUILD"),
-        sha256 = "411a30014a938eb83fb9f37b3dbe8e371b106fc1dd621fc23123cadc72737ce6",
-        strip_prefix = "mklml_mac_2019.0.20180710",
+        sha256 = "83f02938a0c095274db7b8b7b694157abafa3837c5cbaef740440d466c86a477",
+        strip_prefix = "mklml_mac_2019.0.1.20180928",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
-            "https://github.com/intel/mkl-dnn/releases/download/v0.16/mklml_mac_2019.0.20180710.tgz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
+            "https://github.com/intel/mkl-dnn/releases/download/v0.17-rc/mklml_mac_2019.0.1.20180928.tgz",
         ],
     )
 
@@ -112,34 +112,33 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "mkl_dnn",
         build_file = clean_dep("//third_party/mkl_dnn:mkldnn.BUILD"),
-        sha256 = "363cc9239eacf8e7917753c6d8c94f767e4cd049160d0654a61ef32d5e1b3049",
-        strip_prefix = "mkl-dnn-4e333787e0d66a1dca1218e99a891d493dbc8ef1",
+        sha256 = "b100f57af4a2b59a3a37a1ba38f77b644d2107d758a1a7f4e51310063cd21e73",
+        strip_prefix = "mkl-dnn-733fc908874c71a5285043931a1cf80aa923165c",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
-            "https://github.com/intel/mkl-dnn/archive/4e333787e0d66a1dca1218e99a891d493dbc8ef1.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
+            "https://github.com/intel/mkl-dnn/archive/733fc908874c71a5285043931a1cf80aa923165c.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_google_absl",
         build_file = clean_dep("//third_party:com_google_absl.BUILD"),
-        sha256 = "3cf6132129ba87f0781c383bfaf381b7174b5818e81fffcc5d04bb451154f0f2",
-        strip_prefix = "abseil-cpp-f95179062eb65ce40895cc76f1398cce25394369",
+        sha256 = "3ad76de484192b2d5afd49d90492b5ed0bc59eb1a4e8e0deecc7a2a077a90251",
+        strip_prefix = "abseil-cpp-f197d7c72a54064cfde5a2058f1513a4a0ee36fb",
         urls = [
-            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
-            "https://github.com/abseil/abseil-cpp/archive/f95179062eb65ce40895cc76f1398cce25394369.tar.gz",
+            "https://mirror.bazel.build/github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
+            "https://github.com/abseil/abseil-cpp/archive/f197d7c72a54064cfde5a2058f1513a4a0ee36fb.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "eigen_archive",
         build_file = clean_dep("//third_party:eigen.BUILD"),
-        patch_file = clean_dep("//third_party:eigen_reshaped.patch"),
-        sha256 = "d66cec3b54b3dfaa4666c1d49481a7197f93fc078cd53c54e2b4a8893a529c9f",
-        strip_prefix = "eigen-eigen-b4890dc6bc34",
+        sha256 = "aae7a680d141c978301dfae2c7945c06039f65849fcf64269595a9cdbba82638",
+        strip_prefix = "eigen-eigen-729d33d11c81",
         urls = [
-            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/b4890dc6bc34.tar.gz",
-            "https://bitbucket.org/eigen/eigen/get/b4890dc6bc34.tar.gz",
+            "https://mirror.bazel.build/bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
+            "https://bitbucket.org/eigen/eigen/get/729d33d11c81.tar.gz",
         ],
     )
 
@@ -169,26 +168,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
 
     tf_http_archive(
         name = "com_googlesource_code_re2",
-        sha256 = "803c7811146edeef8f91064de37c6f19136ff01a2a8cdb3230e940b2fd9f07fe",
-        strip_prefix = "re2-2018-07-01",
+        sha256 = "a31397714a353587413d307337d0b58f8a2e20e2b9d02f2e24e3463fa4eeda81",
+        strip_prefix = "re2-2018-10-01",
         system_build_file = clean_dep("//third_party/systemlibs:re2.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/google/re2/archive/2018-07-01.tar.gz",
-            "https://github.com/google/re2/archive/2018-07-01.tar.gz",
+            "https://mirror.bazel.build/github.com/google/re2/archive/2018-10-01.tar.gz",
+            "https://github.com/google/re2/archive/2018-10-01.tar.gz",
         ],
     )
 
     tf_http_archive(
         name = "com_github_googlecloudplatform_google_cloud_cpp",
-        sha256 = "fdd3b3aecce60987e5525e55bf3a21d68a8695320bd5b980775af6507eec3944",
-        strip_prefix = "google-cloud-cpp-14760a86c4ffab9943b476305c4fe927ad95db1c",
+        sha256 = "3ade2072e6588ff56c0434abe6c63aa5f3f2d56be15a299bafc7e9cdf0a12c17",
+        strip_prefix = "google-cloud-cpp-0.3.0",
         system_build_file = clean_dep("//third_party/systemlibs:google_cloud_cpp.BUILD"),
         system_link_files = {
             "//third_party/systemlibs:google_cloud_cpp.google.cloud.bigtable.BUILD": "google/cloud/bigtable/BUILD",
         },
         urls = [
-            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
-            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/14760a86c4ffab9943b476305c4fe927ad95db1c.tar.gz",
+            "https://mirror.bazel.build/github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
+            "https://github.com/GoogleCloudPlatform/google-cloud-cpp/archive/v0.3.0.tar.gz",
         ],
     )
 
@@ -348,16 +347,20 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     PROTOBUF_URLS = [
-        "https://mirror.bazel.build/github.com/google/protobuf/archive/v3.6.1.tar.gz",
-        "https://github.com/google/protobuf/archive/v3.6.1.tar.gz",
+        "https://mirror.bazel.build/github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
+        "https://github.com/protocolbuffers/protobuf/archive/v3.6.1.2.tar.gz",
     ]
-    PROTOBUF_SHA256 = "3d4e589d81b2006ca603c1ab712c9715a76227293032d05b26fca603f90b3f5b"
-    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1"
+    PROTOBUF_SHA256 = "2244b0308846bb22b4ff0bcc675e99290ff9f1115553ae9671eba1030af31bc0"
+    PROTOBUF_STRIP_PREFIX = "protobuf-3.6.1.2"
 
     tf_http_archive(
         name = "protobuf_archive",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -368,6 +371,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -375,6 +382,10 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         name = "com_google_protobuf_cc",
         sha256 = PROTOBUF_SHA256,
         strip_prefix = PROTOBUF_STRIP_PREFIX,
+        system_build_file = clean_dep("//third_party/systemlibs:protobuf.BUILD"),
+        system_link_files = {
+            "//third_party/systemlibs:protobuf.bzl": "protobuf.bzl",
+        },
         urls = PROTOBUF_URLS,
     )
 
@@ -446,14 +457,26 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
         ],
     )
 
+    # WARNING: make sure ncteisen@ and vpai@ are cc-ed on any CL to change the below rule
     tf_http_archive(
         name = "grpc",
-        sha256 = "50db9cf2221354485eb7c3bd55a4c27190caef7048a2a1a15fbe60a498f98b44",
-        strip_prefix = "grpc-1.13.0",
+        sha256 = "1aa84387232dda273ea8fdfe722622084f72c16f7b84bfc519ac7759b71cdc91",
+        strip_prefix = "grpc-69b6c047bc767b4d80e7af4d00ccb7c45b683dae",
         system_build_file = clean_dep("//third_party/systemlibs:grpc.BUILD"),
         urls = [
-            "https://mirror.bazel.build/github.com/grpc/grpc/archive/v1.13.0.tar.gz",
-            "https://github.com/grpc/grpc/archive/v1.13.0.tar.gz",
+            "https://mirror.bazel.build/github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+            "https://github.com/grpc/grpc/archive/69b6c047bc767b4d80e7af4d00ccb7c45b683dae.tar.gz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "com_github_nanopb_nanopb",
+        sha256 = "8bbbb1e78d4ddb0a1919276924ab10d11b631df48b657d960e0c795a25515735",
+        build_file = "@grpc//third_party:nanopb.BUILD",
+        strip_prefix = "nanopb-f8ac463766281625ad710900479130c7fcb4d63b",
+        urls = [
+            "https://mirror.bazel.build/github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
+            "https://github.com/nanopb/nanopb/archive/f8ac463766281625ad710900479130c7fcb4d63b.tar.gz",
         ],
     )
 
@@ -473,11 +496,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "llvm",
         build_file = clean_dep("//third_party/llvm:llvm.autogenerated.BUILD"),
-        sha256 = "a22a9b4c3af50a52ba0015b6987bba7202c3ec8e1d40ae76ee7d7643638936ae",
-        strip_prefix = "llvm-b4ace5f3454131a3070ef7c11e19e42fc9a80b4e",
+        sha256 = "34170a4aa07e434dd537d98a705dcf1b3901f73820fe1d6b9370e8c1c94e9157",
+        strip_prefix = "llvm-0487bd8f42c8b38166ff825d56014d0ff49db604",
         urls = [
-            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/b4ace5f3454131a3070ef7c11e19e42fc9a80b4e.tar.gz",
-            "https://github.com/llvm-mirror/llvm/archive/b4ace5f3454131a3070ef7c11e19e42fc9a80b4e.tar.gz",
+            "https://mirror.bazel.build/github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
+            "https://github.com/llvm-mirror/llvm/archive/0487bd8f42c8b38166ff825d56014d0ff49db604.tar.gz",
         ],
     )
 
@@ -690,11 +713,11 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     tf_http_archive(
         name = "arm_neon_2_x86_sse",
         build_file = clean_dep("//third_party:arm_neon_2_x86_sse.BUILD"),
-        sha256 = "c8d90aa4357f8079d427e87a6f4c493da1fa4140aee926c05902d7ec1533d9a5",
-        strip_prefix = "ARM_NEON_2_x86_SSE-0f77d9d182265259b135dad949230ecbf1a2633d",
+        sha256 = "213733991310b904b11b053ac224fee2d4e0179e46b52fe7f8735b8831e04dcc",
+        strip_prefix = "ARM_NEON_2_x86_SSE-1200fe90bb174a6224a525ee60148671a786a71f",
         urls = [
-            "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
-            "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/0f77d9d182265259b135dad949230ecbf1a2633d.tar.gz",
+            "https://mirror.bazel.build/github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz",
+            "https://github.com/intel/ARM_NEON_2_x86_SSE/archive/1200fe90bb174a6224a525ee60148671a786a71f.tar.gz",
         ],
     )
 
@@ -711,12 +734,22 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     )
 
     tf_http_archive(
-        name = "tflite_mobilenet",
-        build_file = clean_dep("//third_party:tflite_mobilenet.BUILD"),
-        sha256 = "23f814d1c076bdf03715dfb6cab3713aa4fbdf040fd5448c43196bd2e97a4c1b",
+        name = "tflite_mobilenet_float",
+        build_file = clean_dep("//third_party:tflite_mobilenet_float.BUILD"),
+        sha256 = "2fadeabb9968ec6833bee903900dda6e61b3947200535874ce2fe42a8493abc0",
+        urls = [
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224.tgz",
+        ],
+    )
+
+    tf_http_archive(
+        name = "tflite_mobilenet_quant",
+        build_file = clean_dep("//third_party:tflite_mobilenet_quant.BUILD"),
+        sha256 = "d32432d28673a936b2d6281ab0600c71cf7226dfe4cdcef3012555f691744166",
         urls = [
-            "https://mirror.bazel.build/storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
-            "https://storage.googleapis.com/download.tensorflow.org/models/tflite/mobilenet_v1_224_android_quant_2017_11_08.zip",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
+            "http://download.tensorflow.org/models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
         ],
     )
 
@@ -847,7 +880,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # important since we have set GRPC_ARES=0 in .bazelrc
     native.bind(
         name = "cares",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by Protobuf
@@ -879,7 +912,7 @@ def tf_workspace(path_prefix = "", tf_repo_name = ""):
     # Needed by gRPC
     native.bind(
         name = "nanopb",
-        actual = "@grpc//third_party/nanopb:nanopb",
+        actual = "@com_github_nanopb_nanopb//:nanopb",
     )
 
     # Needed by gRPC
diff --git a/third_party/clang_toolchain/download_clang.bzl b/third_party/clang_toolchain/download_clang.bzl
index 9023e250b2fe2d752782314d8f38d2e6630fbc96..7ced9027473e39ad9870ce138b64c7f7ec64ad01 100644
--- a/third_party/clang_toolchain/download_clang.bzl
+++ b/third_party/clang_toolchain/download_clang.bzl
@@ -39,15 +39,15 @@ def download_clang(repo_ctx, out_folder):
 
     # Latest CLANG_REVISION and CLANG_SUB_REVISION of the Chromiums's release
     # can be found in https://chromium.googlesource.com/chromium/src/tools/clang/+/master/scripts/update.py
-    CLANG_REVISION = "346388"
+    CLANG_REVISION = "347933"
     CLANG_SUB_REVISION = 1
 
     package_version = "%s-%s" % (CLANG_REVISION, CLANG_SUB_REVISION)
 
     checksums = {
-        "Linux_x64": "5e5564e4e743414c7eaec9fd9e739732ddd2a343e49bde4c88fc2530b1c598b9",
-        "Mac": "19271a7cc5c2bcaf9643d3dd622b5458569dc662bbc58f63b129cf6e3a4e3243",
-        "Win": "60b0bd1f11e53892109f4159e2aba0f803604823e07875ca98b82bd5628d7f4d",
+        "Linux_x64": "cae3643fdf5d46fc9bc8731212bb37573547148d90b64b083165e090133d11b0",
+        "Mac": "083a0e91a38c06e568652313ac7372b17a101268f7d65533d721ca30413442b4",
+        "Win": "43160487cfc7e88076a369a2b6e8e4a0f42e104c28d8903f3aaa62d630aba949",
     }
 
     platform_folder = _get_platform_folder(repo_ctx.os.name)
diff --git a/third_party/eigen_reshaped.patch b/third_party/eigen_reshaped.patch
deleted file mode 100644
index 7acfdcf9fefc1f0f83433994f5ea52306a71d7ea..0000000000000000000000000000000000000000
--- a/third_party/eigen_reshaped.patch
+++ /dev/null
@@ -1,48 +0,0 @@
---- a/Eigen/src/Core/util/ReshapedHelper.h	(date 1541195478000)
-+++ b/Eigen/src/Core/util/ReshapedHelper.h	(date 1541195478000)
-@@ -39,6 +39,11 @@
-   return total/other;
- }
-
-+template<int Flags, int Order>
-+struct get_compiletime_reshape_order {
-+  enum { value = Order == AutoOrder ? Flags & RowMajorBit : Order };
-+};
-+
- }
-
- } // end namespace Eigen
---- a/Eigen/src/plugins/ReshapedMethods.h	(date 1541195254000)
-+++ b/Eigen/src/plugins/ReshapedMethods.h	(date 1541195254000)
-@@ -105,13 +105,13 @@
- inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                 internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                 internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
--                (Order==AutoOrder?Flags&RowMajorBit:Order)>
-+                internal::get_compiletime_reshape_order<Flags,Order>::value>
- reshaped(NRowsType nRows, NColsType nCols) EIGEN_RESHAPED_METHOD_CONST
- {
-   return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived,
-                   internal::get_compiletime_reshape_size<NRowsType,NColsType,SizeAtCompileTime>::value,
-                   internal::get_compiletime_reshape_size<NColsType,NRowsType,SizeAtCompileTime>::value,
--                  (Order==AutoOrder?Flags&RowMajorBit:Order)>
-+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
-                 (derived(),
-                  internal::get_runtime_reshape_size(nRows,internal::get_runtime_value(nCols),size()),
-                  internal::get_runtime_reshape_size(nCols,internal::get_runtime_value(nRows),size()));
-@@ -128,11 +128,13 @@
-
- template<int Order>
- EIGEN_DEVICE_FUNC
--inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1, (Order==AutoOrder?Flags&RowMajorBit:Order)>
-+inline Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
-+                internal::get_compiletime_reshape_order<Flags,Order>::value>
- reshaped() EIGEN_RESHAPED_METHOD_CONST
- {
-   EIGEN_STATIC_ASSERT(Order==RowMajor || Order==ColMajor || Order==AutoOrder, INVALID_TEMPLATE_PARAMETER);
--  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1, (Order==AutoOrder?Flags&RowMajorBit:Order)>
-+  return Reshaped<EIGEN_RESHAPED_METHOD_CONST Derived, SizeAtCompileTime, 1,
-+                  internal::get_compiletime_reshape_order<Flags,Order>::value>
-                 (derived(), size(), 1);
- }
- 
\ No newline at end of file
diff --git a/third_party/googleapis.BUILD b/third_party/googleapis.BUILD
index 95e999af1886576317aa59d133e8d5c88ba368d3..b8871eda7280becb7c3f53412120600d52c0fb54 100644
--- a/third_party/googleapis.BUILD
+++ b/third_party/googleapis.BUILD
@@ -13,7 +13,9 @@
 # limitations under the License.
 
 package(default_visibility = ["//visibility:public"])
+
 licenses(["notice"])  # Apache 2.0
+
 exports_files(["LICENSE"])
 
 load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
@@ -21,6 +23,9 @@ load("@protobuf_archive//:protobuf.bzl", "cc_proto_library")
 cc_proto_library(
     name = "bigtable_protos",
     srcs = [
+        "google/api/annotations.proto",
+        "google/api/auth.proto",
+        "google/api/http.proto",
         "google/bigtable/admin/v2/bigtable_instance_admin.proto",
         "google/bigtable/admin/v2/bigtable_table_admin.proto",
         "google/bigtable/admin/v2/common.proto",
@@ -31,15 +36,12 @@ cc_proto_library(
         "google/iam/v1/iam_policy.proto",
         "google/iam/v1/policy.proto",
         "google/longrunning/operations.proto",
-        "google/rpc/status.proto",
         "google/rpc/error_details.proto",
-        "google/api/annotations.proto",
-        "google/api/auth.proto",
-        "google/api/http.proto",
+        "google/rpc/status.proto",
     ],
     include = ".",
-    protoc = "@protobuf_archive//:protoc",
     default_runtime = "@protobuf_archive//:protobuf",
-    deps = ["@protobuf_archive//:cc_wkt_protos"],
+    protoc = "@protobuf_archive//:protoc",
     use_grpc_plugin = True,
+    deps = ["@protobuf_archive//:cc_wkt_protos"],
 )
diff --git a/third_party/gpus/crosstool/CROSSTOOL.tpl b/third_party/gpus/crosstool/CROSSTOOL.tpl
index 3189cf8e31610c432f03f8f3a30efc3ada4d9652..921188cbb431d925df69fbd0cc06aac07fe1a1a9 100644
--- a/third_party/gpus/crosstool/CROSSTOOL.tpl
+++ b/third_party/gpus/crosstool/CROSSTOOL.tpl
@@ -184,7 +184,8 @@ toolchain {
       action: "c++-link-dynamic-library"
       action: "c++-link-nodeps-dynamic-library"
       flag_group {
-        flag:"-no-canonical-prefixes"
+        flag: "-no-canonical-prefixes"
+        %{extra_no_canonical_prefixes_flags}
       }
     }
   }
diff --git a/third_party/gpus/cuda_configure.bzl b/third_party/gpus/cuda_configure.bzl
index 831a3067b2413c2975a920dfa5edbf1838e9a5dc..03c67bcb3d75aca19bcad8b824d79283193dc115 100644
--- a/third_party/gpus/cuda_configure.bzl
+++ b/third_party/gpus/cuda_configure.bzl
@@ -1418,6 +1418,7 @@ def _create_local_cuda_repository(repository_ctx):
         flag: "-Wno-invalid-partial-specialization"
     """
     cuda_defines["%{host_compiler_includes}"] = host_compiler_includes
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = ""
     _tpl(repository_ctx, "crosstool:BUILD", {
         "%{linker_files}": ":empty",
         "%{win_linker_files}": ":empty"
@@ -1439,6 +1440,14 @@ def _create_local_cuda_repository(repository_ctx):
             repository_ctx, cuda_config) +
         "\n  cxx_builtin_include_directory: \"%s\"" % cupti_header_dir +
         "\n  cxx_builtin_include_directory: \"%s\"" % cudnn_header_dir)
+
+    # For gcc, do not canonicalize system header paths; some versions of gcc
+    # pick the shortest possible path for system includes when creating the
+    # .d file - given that includes that are prefixed with "../" multiple
+    # time quickly grow longer than the root of the tree, this can lead to
+    # bazel's header check failing.
+    cuda_defines["%{extra_no_canonical_prefixes_flags}"] = (
+        "flag: \"-fno-canonical-system-headers\"")
     nvcc_path = str(
         repository_ctx.path("%s/bin/nvcc%s" % (
             cuda_config.cuda_toolkit_path,
diff --git a/third_party/gpus/rocm_configure.bzl b/third_party/gpus/rocm_configure.bzl
index 9108639b0bf74ab4b14468d77a0570ff8913f107..6df6799bd7696d5dbcc70345bf7b5e19f709b8d4 100644
--- a/third_party/gpus/rocm_configure.bzl
+++ b/third_party/gpus/rocm_configure.bzl
@@ -105,7 +105,7 @@ def get_cxx_inc_directories(repository_ctx, cc):
     return includes_cpp + [
         inc
         for inc in includes_c
-        if inc not in includes_cpp_set
+        if inc not in includes_cpp_set.to_list()
     ]
 
 def auto_configure_fail(msg):
diff --git a/third_party/icu/data/BUILD.bazel b/third_party/icu/data/BUILD.bazel
new file mode 100644
index 0000000000000000000000000000000000000000..7db21566e4e65960d59caa9584c944ef8375bd7e
--- /dev/null
+++ b/third_party/icu/data/BUILD.bazel
@@ -0,0 +1,46 @@
+package(
+    default_visibility = ["//visibility:public"],
+)
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(["LICENSE"])
+
+# Data for core MIME/Unix/Windows encodings:
+# ISO 8859-2..9, 15; Windows-125x; EUC-CN; GBK (Windows cp936); GB 18030;
+# Big5 (Windows cp950); SJIS (Windows cp932); EUC-JP; EUC-KR, KS C 5601;
+# Windows cp949. Data is pre-processed for little-endian platforms. To replicate
+# this pre-processing (if you want additional encodings, for example), do the
+# following:
+#
+# First, download, build, and install ICU. This installs tools such as makeconv.
+# Then, run the following from your icu4c/source directory:
+#   $ cd data/mappings
+#   $ rm *.cnv  # there shouldn't be any .cnv files here to begin with
+#   $ grep \.ucm ucmcore.mk | \
+#     sed 's/\(UCM_SOURCE_CORE=\)\?\([^ ]\+\.ucm\)\\\?/\2/g' | \
+#     tr '\n' ' ' | xargs makeconv
+#   $ ls *.cnv > filelist.lst
+#   $ pkgdata -m common -p ucmcore filelist.lst
+#   $ genccode -f custom_conversion_data ucmcore.dat
+# This creates custom_conversion_data.c. You will need to change the target
+# :conversion_data to depend on your custom source instead of :conversion_data.c
+filegroup(
+    name = "conversion_files",
+    srcs = glob(["icu_conversion_data.c.gz.*"]),
+)
+
+# Data files are compressed and split to work around git performance degradation
+# around large files.
+genrule(
+    name = "merge_conversion_data",
+    srcs = [":conversion_files"],
+    outs = ["conversion_data.c"],
+    cmd = "cat $(locations :conversion_files) | gunzip > $@",
+)
+
+cc_library(
+    name = "conversion_data",
+    srcs = [":conversion_data.c"],
+    deps = ["@icu//:headers"],
+)
diff --git a/third_party/icu/data/LICENSE b/third_party/icu/data/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..25b6eb9d3415e62e99af6a349362349c091bc6c7
--- /dev/null
+++ b/third_party/icu/data/LICENSE
@@ -0,0 +1,414 @@
+COPYRIGHT AND PERMISSION NOTICE (ICU 58 and later)
+
+Copyright © 1991-2018 Unicode, Inc. All rights reserved.
+Distributed under the Terms of Use in http://www.unicode.org/copyright.html.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of the Unicode data files and any associated documentation
+(the "Data Files") or Unicode software and any associated documentation
+(the "Software") to deal in the Data Files or Software
+without restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, and/or sell copies of
+the Data Files or Software, and to permit persons to whom the Data Files
+or Software are furnished to do so, provided that either
+(a) this copyright and permission notice appear with all copies
+of the Data Files or Software, or
+(b) this copyright and permission notice appear in associated
+Documentation.
+
+THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
+WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
+NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
+DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+PERFORMANCE OF THE DATA FILES OR SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale,
+use or other dealings in these Data Files or Software without prior
+written authorization of the copyright holder.
+
+---------------------
+
+Third-Party Software Licenses
+
+This section contains third-party software notices and/or additional
+terms for licensed third-party software components included within ICU
+libraries.
+
+1. ICU License - ICU 1.8.1 to ICU 57.1
+
+COPYRIGHT AND PERMISSION NOTICE
+
+Copyright (c) 1995-2016 International Business Machines Corporation and others
+All rights reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, and/or sell copies of the Software, and to permit persons
+to whom the Software is furnished to do so, provided that the above
+copyright notice(s) and this permission notice appear in all copies of
+the Software and that both the above copyright notice(s) and this
+permission notice appear in supporting documentation.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT
+OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY
+SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER
+RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
+CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
+CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+
+Except as contained in this notice, the name of a copyright holder
+shall not be used in advertising or otherwise to promote the sale, use
+or other dealings in this Software without prior written authorization
+of the copyright holder.
+
+All trademarks and registered trademarks mentioned herein are the
+property of their respective owners.
+
+2. Chinese/Japanese Word Break Dictionary Data (cjdict.txt)
+
+ #     The Google Chrome software developed by Google is licensed under
+ # the BSD license. Other software included in this distribution is
+ # provided under other licenses, as set forth below.
+ #
+ #  The BSD License
+ #  http://opensource.org/licenses/bsd-license.php
+ #  Copyright (C) 2006-2008, Google Inc.
+ #
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ # modification, are permitted provided that the following conditions are met:
+ #
+ #  Redistributions of source code must retain the above copyright notice,
+ # this list of conditions and the following disclaimer.
+ #  Redistributions in binary form must reproduce the above
+ # copyright notice, this list of conditions and the following
+ # disclaimer in the documentation and/or other materials provided with
+ # the distribution.
+ #  Neither the name of  Google Inc. nor the names of its
+ # contributors may be used to endorse or promote products derived from
+ # this software without specific prior written permission.
+ #
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ # CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ # INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ # MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ # DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ # CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ # SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ # BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ #
+ #
+ #  The word list in cjdict.txt are generated by combining three word lists
+ # listed below with further processing for compound word breaking. The
+ # frequency is generated with an iterative training against Google web
+ # corpora.
+ #
+ #  * Libtabe (Chinese)
+ #    - https://sourceforge.net/project/?group_id=1519
+ #    - Its license terms and conditions are shown below.
+ #
+ #  * IPADIC (Japanese)
+ #    - http://chasen.aist-nara.ac.jp/chasen/distribution.html
+ #    - Its license terms and conditions are shown below.
+ #
+ #  ---------COPYING.libtabe ---- BEGIN--------------------
+ #
+ #  /*
+ #   * Copyright (c) 1999 TaBE Project.
+ #   * Copyright (c) 1999 Pai-Hsiang Hsiao.
+ #   * All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the TaBE Project nor the names of its
+ #   *   contributors may be used to endorse or promote products derived
+ #   *   from this software without specific prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  /*
+ #   * Copyright (c) 1999 Computer Systems and Communication Lab,
+ #   *                    Institute of Information Science, Academia
+ #       *                    Sinica. All rights reserved.
+ #   *
+ #   * Redistribution and use in source and binary forms, with or without
+ #   * modification, are permitted provided that the following conditions
+ #   * are met:
+ #   *
+ #   * . Redistributions of source code must retain the above copyright
+ #   *   notice, this list of conditions and the following disclaimer.
+ #   * . Redistributions in binary form must reproduce the above copyright
+ #   *   notice, this list of conditions and the following disclaimer in
+ #   *   the documentation and/or other materials provided with the
+ #   *   distribution.
+ #   * . Neither the name of the Computer Systems and Communication Lab
+ #   *   nor the names of its contributors may be used to endorse or
+ #   *   promote products derived from this software without specific
+ #   *   prior written permission.
+ #   *
+ #   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ #   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ #   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ #   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ #   * REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ #   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ #   * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ #   * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ #   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ #   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ #   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ #   * OF THE POSSIBILITY OF SUCH DAMAGE.
+ #   */
+ #
+ #  Copyright 1996 Chih-Hao Tsai @ Beckman Institute,
+ #      University of Illinois
+ #  c-tsai4@uiuc.edu  http://casper.beckman.uiuc.edu/~c-tsai4
+ #
+ #  ---------------COPYING.libtabe-----END--------------------------------
+ #
+ #
+ #  ---------------COPYING.ipadic-----BEGIN-------------------------------
+ #
+ #  Copyright 2000, 2001, 2002, 2003 Nara Institute of Science
+ #  and Technology.  All Rights Reserved.
+ #
+ #  Use, reproduction, and distribution of this software is permitted.
+ #  Any copy of this software, whether in its original form or modified,
+ #  must include both the above copyright notice and the following
+ #  paragraphs.
+ #
+ #  Nara Institute of Science and Technology (NAIST),
+ #  the copyright holders, disclaims all warranties with regard to this
+ #  software, including all implied warranties of merchantability and
+ #  fitness, in no event shall NAIST be liable for
+ #  any special, indirect or consequential damages or any damages
+ #  whatsoever resulting from loss of use, data or profits, whether in an
+ #  action of contract, negligence or other tortuous action, arising out
+ #  of or in connection with the use or performance of this software.
+ #
+ #  A large portion of the dictionary entries
+ #  originate from ICOT Free Software.  The following conditions for ICOT
+ #  Free Software applies to the current dictionary as well.
+ #
+ #  Each User may also freely distribute the Program, whether in its
+ #  original form or modified, to any third party or parties, PROVIDED
+ #  that the provisions of Section 3 ("NO WARRANTY") will ALWAYS appear
+ #  on, or be attached to, the Program, which is distributed substantially
+ #  in the same form as set out herein and that such intended
+ #  distribution, if actually made, will neither violate or otherwise
+ #  contravene any of the laws and regulations of the countries having
+ #  jurisdiction over the User or the intended distribution itself.
+ #
+ #  NO WARRANTY
+ #
+ #  The program was produced on an experimental basis in the course of the
+ #  research and development conducted during the project and is provided
+ #  to users as so produced on an experimental basis.  Accordingly, the
+ #  program is provided without any warranty whatsoever, whether express,
+ #  implied, statutory or otherwise.  The term "warranty" used herein
+ #  includes, but is not limited to, any warranty of the quality,
+ #  performance, merchantability and fitness for a particular purpose of
+ #  the program and the nonexistence of any infringement or violation of
+ #  any right of any third party.
+ #
+ #  Each user of the program will agree and understand, and be deemed to
+ #  have agreed and understood, that there is no warranty whatsoever for
+ #  the program and, accordingly, the entire risk arising from or
+ #  otherwise connected with the program is assumed by the user.
+ #
+ #  Therefore, neither ICOT, the copyright holder, or any other
+ #  organization that participated in or was otherwise related to the
+ #  development of the program and their respective officials, directors,
+ #  officers and other employees shall be held liable for any and all
+ #  damages, including, without limitation, general, special, incidental
+ #  and consequential damages, arising out of or otherwise in connection
+ #  with the use or inability to use the program or any product, material
+ #  or result produced or otherwise obtained by using the program,
+ #  regardless of whether they have been advised of, or otherwise had
+ #  knowledge of, the possibility of such damages at any time during the
+ #  project or thereafter.  Each user will be deemed to have agreed to the
+ #  foregoing by his or her commencement of use of the program.  The term
+ #  "use" as used herein includes, but is not limited to, the use,
+ #  modification, copying and distribution of the program and the
+ #  production of secondary products from the program.
+ #
+ #  In the case where the program, whether in its original form or
+ #  modified, was distributed or delivered to or received by a user from
+ #  any person, organization or entity other than ICOT, unless it makes or
+ #  grants independently of ICOT any specific warranty to the user in
+ #  writing, such person, organization or entity, will also be exempted
+ #  from and not be held liable to the user for any such damages as noted
+ #  above as far as the program is concerned.
+ #
+ #  ---------------COPYING.ipadic-----END----------------------------------
+
+3. Lao Word Break Dictionary Data (laodict.txt)
+
+ #  Copyright (c) 2013 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ # Project: http://code.google.com/p/lao-dictionary/
+ # Dictionary: http://lao-dictionary.googlecode.com/git/Lao-Dictionary.txt
+ # License: http://lao-dictionary.googlecode.com/git/Lao-Dictionary-LICENSE.txt
+ #              (copied below)
+ #
+ #  This file is derived from the above dictionary, with slight
+ #  modifications.
+ #  ----------------------------------------------------------------------
+ #  Copyright (C) 2013 Brian Eugene Wilson, Robert Martin Campbell.
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification,
+ #  are permitted provided that the following conditions are met:
+ #
+ #
+ # Redistributions of source code must retain the above copyright notice, this
+ #  list of conditions and the following disclaimer. Redistributions in
+ #  binary form must reproduce the above copyright notice, this list of
+ #  conditions and the following disclaimer in the documentation and/or
+ #  other materials provided with the distribution.
+ #
+ #
+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ # INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ # (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ # OF THE POSSIBILITY OF SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+4. Burmese Word Break Dictionary Data (burmesedict.txt)
+
+ #  Copyright (c) 2014 International Business Machines Corporation
+ #  and others. All Rights Reserved.
+ #
+ #  This list is part of a project hosted at:
+ #    github.com/kanyawtech/myanmar-karen-word-lists
+ #
+ #  --------------------------------------------------------------------------
+ #  Copyright (c) 2013, LeRoy Benjamin Sharon
+ #  All rights reserved.
+ #
+ #  Redistribution and use in source and binary forms, with or without
+ #  modification, are permitted provided that the following conditions
+ #  are met: Redistributions of source code must retain the above
+ #  copyright notice, this list of conditions and the following
+ #  disclaimer.  Redistributions in binary form must reproduce the
+ #  above copyright notice, this list of conditions and the following
+ #  disclaimer in the documentation and/or other materials provided
+ #  with the distribution.
+ #
+ #    Neither the name Myanmar Karen Word Lists, nor the names of its
+ #    contributors may be used to endorse or promote products derived
+ #    from this software without specific prior written permission.
+ #
+ #  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ #  CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ #  INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ #  MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ #  DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
+ #  BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ #  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ #  TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ #  DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ #  ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR
+ #  TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
+ #  THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ #  SUCH DAMAGE.
+ #  --------------------------------------------------------------------------
+
+5. Time Zone Database
+
+  ICU uses the public domain data and code derived from Time Zone
+Database for its time zone support. The ownership of the TZ database
+is explained in BCP 175: Procedure for Maintaining the Time Zone
+Database section 7.
+
+ # 7.  Database Ownership
+ #
+ #    The TZ database itself is not an IETF Contribution or an IETF
+ #    document.  Rather it is a pre-existing and regularly updated work
+ #    that is in the public domain, and is intended to remain in the
+ #    public domain.  Therefore, BCPs 78 [RFC5378] and 79 [RFC3979] do
+ #    not apply to the TZ Database or contributions that individuals make
+ #    to it.  Should any claims be made and substantiated against the TZ
+ #    Database, the organization that is providing the IANA
+ #    Considerations defined in this RFC, under the memorandum of
+ #    understanding with the IETF, currently ICANN, may act in accordance
+ #    with all competent court orders.  No ownership claims will be made
+ #    by ICANN or the IETF Trust on the database or the code.  Any person
+ #    making a contribution to the database or code waives all rights to
+ #    future claims in that contribution or in the TZ Database.
+
+6. Google double-conversion
+
+Copyright 2006-2011, the V8 project authors. All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+    * Neither the name of Google Inc. nor the names of its
+      contributors may be used to endorse or promote products derived
+      from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aa b/third_party/icu/data/icu_conversion_data.c.gz.aa
new file mode 100644
index 0000000000000000000000000000000000000000..b68a2c6516f8183e805c509a9139cf63d1ee3fa5
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aa differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ab b/third_party/icu/data/icu_conversion_data.c.gz.ab
new file mode 100644
index 0000000000000000000000000000000000000000..d60aa92d675c85f95e811221bffc012d65e6c29e
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ab differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ac b/third_party/icu/data/icu_conversion_data.c.gz.ac
new file mode 100644
index 0000000000000000000000000000000000000000..de9b69ff9474e0c9ccc799d40d092d2ab2ad98bb
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ac differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ad b/third_party/icu/data/icu_conversion_data.c.gz.ad
new file mode 100644
index 0000000000000000000000000000000000000000..d5abb06b8ca21e1e6116ef1732c661c815b1489a
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ad differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ae b/third_party/icu/data/icu_conversion_data.c.gz.ae
new file mode 100644
index 0000000000000000000000000000000000000000..0e54fdb9eaffd814477460f71bc194104c1b247d
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ae differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.af b/third_party/icu/data/icu_conversion_data.c.gz.af
new file mode 100644
index 0000000000000000000000000000000000000000..cfbeb165ad3428555276a463a90a1ed2e34740f0
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.af differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ag b/third_party/icu/data/icu_conversion_data.c.gz.ag
new file mode 100644
index 0000000000000000000000000000000000000000..bde20b6da6253d866f87fcadc7e6c3571bd64d44
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ag differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ah b/third_party/icu/data/icu_conversion_data.c.gz.ah
new file mode 100644
index 0000000000000000000000000000000000000000..ae31dffbe2afc8ad59ae1dc323447d8cf9d61032
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ah differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.ai b/third_party/icu/data/icu_conversion_data.c.gz.ai
new file mode 100644
index 0000000000000000000000000000000000000000..981b869561a615f21639482929b89d2b2e5ca360
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.ai differ
diff --git a/third_party/icu/data/icu_conversion_data.c.gz.aj b/third_party/icu/data/icu_conversion_data.c.gz.aj
new file mode 100644
index 0000000000000000000000000000000000000000..1ae6bce382a05570b46217e1a031414515439a42
Binary files /dev/null and b/third_party/icu/data/icu_conversion_data.c.gz.aj differ
diff --git a/third_party/icu/udata.patch b/third_party/icu/udata.patch
new file mode 100644
index 0000000000000000000000000000000000000000..d6d59100e48b8346fcaa54f0cbdebdc5e4658f92
--- /dev/null
+++ b/third_party/icu/udata.patch
@@ -0,0 +1,53 @@
+--- /icu4c/source/common/udata.cpp.old	2018-06-19 22:34:56.000000000 -0700
++++ /icu4c/source/common/udata.cpp	2018-10-19 14:26:09.778950855 -0700
+@@ -18,15 +18,15 @@
+ 
+ #include "unicode/utypes.h"  /* U_PLATFORM etc. */
+ 
+-#ifdef __GNUC__
+-/* if gcc
+-#define ATTRIBUTE_WEAK __attribute__ ((weak))
+-might have to #include some other header
+-*/
++#if defined(__GNUC__) || defined(__SUNPRO_CC)
++#  define ATTRIBUTE_WEAK __attribute__ ((weak))
++#else
++#  define ATTRIBUTE_WEAK
+ #endif
+ 
+ #include "unicode/putil.h"
+ #include "unicode/udata.h"
++#include "unicode/umachine.h"
+ #include "unicode/uversion.h"
+ #include "charstr.h"
+ #include "cmemory.h"
+@@ -641,10 +641,11 @@
+  * partial-data-library access functions where each returns a pointer
+  * to its data package, if it is linked in.
+  */
+-/*
+-extern const void *uprv_getICUData_collation(void) ATTRIBUTE_WEAK;
+-extern const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
+-*/
++
++//extern "C" const void *uprv_getICUData_collation(void);
++U_CDECL_BEGIN
++const void *uprv_getICUData_conversion(void) ATTRIBUTE_WEAK;
++U_CDECL_END
+ 
+ /*----------------------------------------------------------------------*
+  *                                                                      *
+@@ -702,10 +703,11 @@
+         if (uprv_getICUData_collation) {
+             setCommonICUDataPointer(uprv_getICUData_collation(), FALSE, pErrorCode);
+         }
++        */
+         if (uprv_getICUData_conversion) {
+-            setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
++          setCommonICUDataPointer(uprv_getICUData_conversion(), FALSE, pErrorCode);
+         }
+-        */
++
+ #if U_PLATFORM_HAS_WINUWP_API == 0 // Windows UWP Platform does not support dll icu data at this time
+         setCommonICUDataPointer(&U_ICUDATA_ENTRY_POINT, FALSE, pErrorCode);
+         {
diff --git a/third_party/icu/workspace.bzl b/third_party/icu/workspace.bzl
index a4f653e026138d233a9041bea484c809eefa4fdc..f100836b4101efa0a20e09e7d430b0b44953e89a 100644
--- a/third_party/icu/workspace.bzl
+++ b/third_party/icu/workspace.bzl
@@ -2,6 +2,11 @@
 
 load("//third_party:repo.bzl", "third_party_http_archive")
 
+# Sanitize a dependency so that it works correctly from code that includes
+# TensorFlow as a submodule.
+def clean_dep(dep):
+    return str(Label(dep))
+
 def repo():
     third_party_http_archive(
         name = "icu",
@@ -13,4 +18,5 @@ def repo():
         ],
         build_file = "//third_party/icu:BUILD.bazel",
         system_build_file = "//third_party/icu:BUILD.system",
+        patch_file = clean_dep("//third_party/icu:udata.patch"),
     )
diff --git a/third_party/libxsmm.BUILD b/third_party/libxsmm.BUILD
index ee49d281abcd54b566edde119f4a5b3e6b07d2a3..dc7dcc9517092e05999c067f9d2e04b4f36bb37a 100644
--- a/third_party/libxsmm.BUILD
+++ b/third_party/libxsmm.BUILD
@@ -38,8 +38,8 @@ genrule(
         ":libxsmm_interface",
     ],
     visibility = [
-        "//third_party/eigen3:__pkg__",
         "//tensorflow/core/kernels:__pkg__",
+        "//third_party/eigen3:__pkg__",
     ],
 )
 
diff --git a/third_party/llvm/llvm.autogenerated.BUILD b/third_party/llvm/llvm.autogenerated.BUILD
index 776935739ace7e2b8e337eff28e73e3a8a5b7f47..eb468aa65fce9c014bc7b53f1fb69729eb2a3718 100644
--- a/third_party/llvm/llvm.autogenerated.BUILD
+++ b/third_party/llvm/llvm.autogenerated.BUILD
@@ -823,6 +823,7 @@ cc_library(
     ]),
     copts = llvm_copts + ["-Iexternal/llvm/lib/Target/ARM"],
     deps = [
+        ":arm_asm_printer",
         ":arm_desc",
         ":arm_info",
         ":arm_utils",
@@ -2141,6 +2142,7 @@ cc_library(
         ":core",
         ":global_i_sel",
         ":mc",
+        ":profile_data",
         ":selection_dag",
         ":support",
         ":target",
diff --git a/third_party/llvm/llvm.bzl b/third_party/llvm/llvm.bzl
index 54ca86f3272cb6c91541e20d9ba5326d2cf726a0..5a977f82c417a9ae3e3022fa43534affe727cae2 100644
--- a/third_party/llvm/llvm.bzl
+++ b/third_party/llvm/llvm.bzl
@@ -250,6 +250,7 @@ linux_cmake_vars = {
 # CMake variables specific to the Darwin (Mac OS X) platform.
 darwin_cmake_vars = {
     "HAVE_MALLOC_MALLOC_H": 1,
+    "HAVE_MALLOC_ZONE_STATISTICS": 1,
 }
 
 # CMake variables specific to the Windows platform.
diff --git a/third_party/mkl_dnn/mkldnn.BUILD b/third_party/mkl_dnn/mkldnn.BUILD
index 597ac69e2ffed73210733fab98bed3d1227b0d23..d80c7135d6fd47f45a00b35bb29ceae0c0d1d003 100644
--- a/third_party/mkl_dnn/mkldnn.BUILD
+++ b/third_party/mkl_dnn/mkldnn.BUILD
@@ -42,8 +42,8 @@ cc_library(
         "src",
         "src/common",
         "src/cpu",
-        "src/cpu/xbyak",
         "src/cpu/gemm",
+        "src/cpu/xbyak",
     ],
     nocopts = "-fno-exceptions",
     visibility = ["//visibility:public"],
@@ -63,3 +63,27 @@ cc_library(
         "//conditions:default": [],
     }),
 )
+
+cc_library(
+    name = "mkldnn_single_threaded",
+    srcs = glob([
+        "src/common/*.cpp",
+        "src/cpu/*.cpp",
+        "src/cpu/gemm/*.cpp",
+    ]),
+    hdrs = glob(["include/*"]),
+    copts = [
+        "-fexceptions",
+        "-DMKLDNN_THR=MKLDNN_THR_SEQ",  # Disables threading.
+    ],
+    includes = [
+        "include",
+        "src",
+        "src/common",
+        "src/cpu",
+        "src/cpu/gemm",
+        "src/cpu/xbyak",
+    ],
+    nocopts = "-fno-exceptions",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/mpi/mpi.bzl b/third_party/mpi/mpi.bzl
index 38ce91c4d069fc311d5e7f17a49ff7904c9c67eb..3a483351d1f982eba09d6522db9842dd4f7eca84 100644
--- a/third_party/mpi/mpi.bzl
+++ b/third_party/mpi/mpi.bzl
@@ -2,16 +2,16 @@
 #based on the configuration options return one or the other
 
 def mpi_hdr():
-    MPI_LIB_IS_OPENMPI=True
-    hdrs = []    
+    MPI_LIB_IS_OPENMPI = True
+    hdrs = []
     if MPI_LIB_IS_OPENMPI:
-        hdrs = ["mpi.h", "mpi_portable_platform.h"]   #When using OpenMPI
+        hdrs = ["mpi.h", "mpi_portable_platform.h"]  #When using OpenMPI
     else:
-        hdrs = ["mpi.h",  "mpio.h", "mpicxx.h"]        #When using MVAPICH
+        hdrs = ["mpi.h", "mpio.h", "mpicxx.h"]  #When using MVAPICH
     return hdrs
 
 def if_mpi(if_true, if_false = []):
     return select({
         "//tensorflow:with_mpi_support": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
diff --git a/third_party/nccl/archive.BUILD b/third_party/nccl/archive.BUILD
index c0833828a736a1589a4aa7b76c15546bb0e8cd25..7a08f97ef328a7a731d7c76de8bda70c8d004dac 100644
--- a/third_party/nccl/archive.BUILD
+++ b/third_party/nccl/archive.BUILD
@@ -64,13 +64,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=0"] + rdc_copts(),
+    linkstatic = True,
     prefix = "sum_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -80,13 +80,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=1"] + rdc_copts(),
+    linkstatic = True,
     prefix = "_prod",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -96,13 +96,13 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=2"] + rdc_copts(),
+    linkstatic = True,
     prefix = "min_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
@@ -112,28 +112,28 @@ nccl_library(
         ":device_srcs",
     ],
     copts = ["-DNCCL_OP=3"] + rdc_copts(),
+    linkstatic = True,
     prefix = "max_",
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 nccl_library(
     name = "functions",
     srcs = [
-        ":device_hdrs",
         "src/collectives/device/functions.cu",
+        ":device_hdrs",
     ],
     copts = rdc_copts(),
+    linkstatic = True,
     deps = [
-        ":src_hdrs",
         ":include_hdrs",
+        ":src_hdrs",
         "@local_config_cuda//cuda:cuda_headers",
     ],
-    linkstatic = True,
 )
 
 rdc_library(
@@ -162,13 +162,13 @@ nccl_library(
         "src/nccl.h",
     ],
     hdrs = ["src/nccl.h"],
+    copts = cuda_default_copts(),
     include_prefix = "third_party/nccl",
     strip_include_prefix = "src",
-    copts = cuda_default_copts(),
+    visibility = ["//visibility:public"],
     deps = [
         ":device_code",
         ":include_hdrs",
         ":src_hdrs",
     ],
-    visibility = ["//visibility:public"],
 )
diff --git a/third_party/ngraph/ngraph.BUILD b/third_party/ngraph/ngraph.BUILD
index f556c5279df1580813e3528a95f9ca7d6f8e2ef9..63e9548c53262461cfc9c3fd160f4f17430319c7 100644
--- a/third_party/ngraph/ngraph.BUILD
+++ b/third_party/ngraph/ngraph.BUILD
@@ -97,13 +97,6 @@ cc_library(
         "src/ngraph/runtime/cpu/pass/cpu_workspace_insertion.cpp",
     ],
     hdrs = glob(["src/ngraph/runtime/cpu/**/*.hpp"]) + glob([]),
-    deps = [
-        ":ngraph_headers",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-        "@tbb",
-        "@mkl_dnn//:mkl_dnn",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
@@ -113,6 +106,13 @@ cc_library(
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@mkl_dnn",
+        "@nlohmann_json_lib",
+        "@tbb",
+    ],
     alwayslink = 1,
 )
 
@@ -138,12 +138,6 @@ cc_library(
         "src/ngraph/runtime/*.cpp",
         "src/ngraph/type/*.cpp",
     ]),
-    deps = [
-        ":ngraph_headers",
-        ":ngraph_cpu_backend",
-        "@eigen_archive//:eigen",
-        "@nlohmann_json_lib",
-    ],
     copts = [
         "-I external/ngraph/src",
         "-I external/nlohmann_json_lib/include/",
@@ -152,5 +146,11 @@ cc_library(
         '-D PROJECT_ROOT_DIR=\\"\\"',
     ],
     visibility = ["//visibility:public"],
+    deps = [
+        ":ngraph_cpu_backend",
+        ":ngraph_headers",
+        "@eigen_archive//:eigen",
+        "@nlohmann_json_lib",
+    ],
     alwayslink = 1,
 )
diff --git a/third_party/ngraph/ngraph_tf.BUILD b/third_party/ngraph/ngraph_tf.BUILD
index 068e411e81ba42ee44cdf2878aaa0cc3c01703b1..db9a66f9b5bcdaa29ec55175f1a8c76ac5f6f22a 100644
--- a/third_party/ngraph/ngraph_tf.BUILD
+++ b/third_party/ngraph/ngraph_tf.BUILD
@@ -10,6 +10,10 @@ load(
 cc_library(
     name = "ngraph_tf",
     srcs = [
+        "logging/ngraph_log.cc",
+        "logging/ngraph_log.h",
+        "logging/tf_graph_writer.cc",
+        "logging/tf_graph_writer.h",
         "src/ngraph_api.cc",
         "src/ngraph_api.h",
         "src/ngraph_assign_clusters.cc",
@@ -41,27 +45,23 @@ cc_library(
         "src/tf_deadness_analysis.h",
         "src/tf_graphcycles.cc",
         "src/tf_graphcycles.h",
-        "logging/ngraph_log.h",
-        "logging/ngraph_log.cc",
-        "logging/tf_graph_writer.h",
-        "logging/tf_graph_writer.cc",
-    ],
-    deps = [
-        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
-        "@org_tensorflow//tensorflow/core:framework_headers_lib",
-        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
-        "@ngraph//:ngraph_core",
-        "@com_google_absl//absl/container:container_memory",
-        "@com_google_absl//absl/container:flat_hash_set",
-        "@com_google_absl//absl/types:variant",
     ],
     copts = [
         "-I external/ngraph_tf/src",
         "-I external/ngraph_tf/logging",
         "-I external/ngraph/src",
     ],
-    alwayslink = 1,
     visibility = ["//visibility:public"],
+    deps = [
+        "@com_google_absl//absl/container:container_memory",
+        "@com_google_absl//absl/container:flat_hash_set",
+        "@com_google_absl//absl/types:variant",
+        "@ngraph//:ngraph_core",
+        "@org_tensorflow//tensorflow/core:core_cpu_headers_lib",
+        "@org_tensorflow//tensorflow/core:framework_headers_lib",
+        "@org_tensorflow//tensorflow/core:protos_all_proto_text",
+    ],
+    alwayslink = 1,
 )
 
 tf_cc_test(
@@ -82,6 +82,12 @@ tf_cc_test(
         "test/test_utilities.h",
         "test/tf_exec.cpp",
     ],
+    extra_copts = [
+        "-fexceptions ",
+        "-I external/ngraph_tf/src",
+        "-I external/ngraph_tf/logging",
+        "-I external/ngraph/src",
+    ],
     deps = [
         ":ngraph_tf",
         "@com_google_googletest//:gtest",
@@ -89,10 +95,4 @@ tf_cc_test(
         "@org_tensorflow//tensorflow/cc:client_session",
         "@org_tensorflow//tensorflow/core:tensorflow",
     ],
-    extra_copts = [
-        "-fexceptions ",
-        "-I external/ngraph_tf/src",
-        "-I external/ngraph_tf/logging",
-        "-I external/ngraph/src",
-    ],
 )
diff --git a/third_party/ngraph/tbb.BUILD b/third_party/ngraph/tbb.BUILD
index 04e6544ffb579a94db2ffeed123068a64afbfcb7..c78a2d79ddfff53ddede0a70427dac89d08fbdcc 100644
--- a/third_party/ngraph/tbb.BUILD
+++ b/third_party/ngraph/tbb.BUILD
@@ -14,6 +14,10 @@ genrule(
     srcs = glob(["**"]) + [
         "@local_config_cc//:toolchain",
     ],
+    outs = [
+        "libtbb.a",
+        "libtbbmalloc.a",
+    ],
     cmd = """
 	    set -e
 	    WORK_DIR=$$PWD
@@ -45,19 +49,15 @@ genrule(
         cp build/build_{release,debug}/*.a $$DEST_DIR
 		cd $$WORK_DIR
 	""",
-    outs = [
-        "libtbb.a",
-        "libtbbmalloc.a",
-    ],
 )
 
 cc_library(
     name = "tbb",
+    srcs = ["libtbb.a"],
     hdrs = glob([
         "include/serial/**",
         "include/tbb/**/**",
     ]),
-    srcs = ["libtbb.a"],
     includes = ["include"],
     visibility = ["//visibility:public"],
 )
diff --git a/third_party/png.BUILD b/third_party/png.BUILD
index c26a2897176e57220b42b7d2cc5b61d114ecfc5f..e82948648e42e14e97238726e7db5a932bbea946 100644
--- a/third_party/png.BUILD
+++ b/third_party/png.BUILD
@@ -44,11 +44,11 @@ cc_library(
         "png.h",
         "pngconf.h",
     ],
-    includes = ["."],
     copts = select({
         ":windows": ["-DPNG_INTEL_SSE_OPT=1"],
         "//conditions:default": [],
     }),
+    includes = ["."],
     linkopts = select({
         ":windows": [],
         "//conditions:default": ["-lm"],
diff --git a/third_party/repo.bzl b/third_party/repo.bzl
index 07b853ff11cb737f26a9b0ec37aaff6fd7ada203..bad6d20a08c0ee27345bf16a5a4f7c9e4d67a05f 100644
--- a/third_party/repo.bzl
+++ b/third_party/repo.bzl
@@ -84,7 +84,7 @@ def _apply_delete(ctx, paths):
 def _tf_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
@@ -150,7 +150,7 @@ ensure best practices are followed.
 def _third_party_http_archive(ctx):
     if ("mirror.bazel.build" not in ctx.attr.urls[0] and
         (len(ctx.attr.urls) < 2 and
-         ctx.attr.name not in _SINGLE_URL_WHITELIST)):
+         ctx.attr.name not in _SINGLE_URL_WHITELIST.to_list())):
         fail("tf_http_archive(urls) must have redundant URLs. The " +
              "mirror.bazel.build URL must be present and it must come first. " +
              "Even if you don't have permission to mirror the file, please " +
diff --git a/third_party/systemlibs/protobuf.BUILD b/third_party/systemlibs/protobuf.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..4b1cf396b9b7abef8feaa653c7c71e9e8a9e304e
--- /dev/null
+++ b/third_party/systemlibs/protobuf.BUILD
@@ -0,0 +1,104 @@
+load(
+    "@protobuf_archive//:protobuf.bzl",
+    "proto_gen",
+    "py_proto_library",
+    "cc_proto_library",
+)
+
+licenses(["notice"])
+
+filegroup(
+    name = "LICENSE",
+    visibility = ["//visibility:public"],
+)
+
+HEADERS = [
+    "google/protobuf/any.pb.h",
+    "google/protobuf/any.proto",
+    "google/protobuf/arena.h",
+    "google/protobuf/compiler/importer.h",
+    "google/protobuf/descriptor.h",
+    "google/protobuf/descriptor.pb.h",
+    "google/protobuf/descriptor.proto",
+    "google/protobuf/duration.pb.h",
+    "google/protobuf/duration.proto",
+    "google/protobuf/dynamic_message.h",
+    "google/protobuf/empty.pb.h",
+    "google/protobuf/empty.proto",
+    "google/protobuf/field_mask.pb.h",
+    "google/protobuf/field_mask.proto",
+    "google/protobuf/io/coded_stream.h",
+    "google/protobuf/io/zero_copy_stream.h",
+    "google/protobuf/io/zero_copy_stream_impl_lite.h",
+    "google/protobuf/map.h",
+    "google/protobuf/repeated_field.h",
+    "google/protobuf/text_format.h",
+    "google/protobuf/timestamp.pb.h",
+    "google/protobuf/timestamp.proto",
+    "google/protobuf/util/json_util.h",
+    "google/protobuf/util/type_resolver_util.h",
+    "google/protobuf/wrappers.pb.h",
+    "google/protobuf/wrappers.proto",
+]
+
+genrule(
+    name = "link_headers",
+    outs = HEADERS,
+    cmd = """
+      for i in $(OUTS); do
+        f=$${i#$(@D)/}
+        mkdir -p $(@D)/$${f%/*}
+        ln -sf $(INCLUDEDIR)/$$f $(@D)/$$f
+      done
+    """,
+)
+
+cc_library(
+    name = "protobuf",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protobuf_headers",
+    hdrs = HEADERS,
+    linkopts = ["-lprotobuf"],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "protoc_lib",
+    linkopts = ["-lprotoc"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "protoc",
+    outs = ["protoc.bin"],
+    cmd = "ln -s $$(which protoc) $@",
+    executable = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_proto_library(
+    name = "cc_wkt_protos",
+    hdrs = HEADERS,
+    internal_bootstrap_hack = 1,
+    protoc = ":protoc",
+    visibility = ["//visibility:public"],
+)
+
+proto_gen(
+    name = "protobuf_python_genproto",
+    includes = ["."],
+    protoc = "@protobuf_archive//:protoc",
+    visibility = ["//visibility:public"],
+)
+
+py_library(
+    name = "protobuf_python",
+    data = [":link_headers"],
+    srcs_version = "PY2AND3",
+    visibility = ["//visibility:public"],
+)
diff --git a/third_party/systemlibs/protobuf.bzl b/third_party/systemlibs/protobuf.bzl
new file mode 100644
index 0000000000000000000000000000000000000000..2aa75610a9313d12daeb7406ea0107e53231e814
--- /dev/null
+++ b/third_party/systemlibs/protobuf.bzl
@@ -0,0 +1,425 @@
+def _GetPath(ctx, path):
+    if ctx.label.workspace_root:
+        return ctx.label.workspace_root + "/" + path
+    else:
+        return path
+
+def _IsNewExternal(ctx):
+    # Bazel 0.4.4 and older have genfiles paths that look like:
+    #   bazel-out/local-fastbuild/genfiles/external/repo/foo
+    # After the exec root rearrangement, they look like:
+    #   ../repo/bazel-out/local-fastbuild/genfiles/foo
+    return ctx.label.workspace_root.startswith("../")
+
+def _GenDir(ctx):
+    if _IsNewExternal(ctx):
+        # We are using the fact that Bazel 0.4.4+ provides repository-relative paths
+        # for ctx.genfiles_dir.
+        return ctx.genfiles_dir.path + (
+            "/" + ctx.attr.includes[0] if ctx.attr.includes and ctx.attr.includes[0] else ""
+        )
+
+    # This means that we're either in the old version OR the new version in the local repo.
+    # Either way, appending the source path to the genfiles dir works.
+    return ctx.var["GENDIR"] + "/" + _SourceDir(ctx)
+
+def _SourceDir(ctx):
+    if not ctx.attr.includes:
+        return ctx.label.workspace_root
+    if not ctx.attr.includes[0]:
+        return _GetPath(ctx, ctx.label.package)
+    if not ctx.label.package:
+        return _GetPath(ctx, ctx.attr.includes[0])
+    return _GetPath(ctx, ctx.label.package + "/" + ctx.attr.includes[0])
+
+def _CcHdrs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.h" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.h" for s in srcs]
+    return ret
+
+def _CcSrcs(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + ".pb.cc" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + ".grpc.pb.cc" for s in srcs]
+    return ret
+
+def _CcOuts(srcs, use_grpc_plugin = False):
+    return _CcHdrs(srcs, use_grpc_plugin) + _CcSrcs(srcs, use_grpc_plugin)
+
+def _PyOuts(srcs, use_grpc_plugin = False):
+    ret = [s[:-len(".proto")] + "_pb2.py" for s in srcs]
+    if use_grpc_plugin:
+        ret += [s[:-len(".proto")] + "_pb2_grpc.py" for s in srcs]
+    return ret
+
+def _RelativeOutputPath(path, include, dest = ""):
+    if include == None:
+        return path
+
+    if not path.startswith(include):
+        fail("Include path %s isn't part of the path %s." % (include, path))
+
+    if include and include[-1] != "/":
+        include = include + "/"
+    if dest and dest[-1] != "/":
+        dest = dest + "/"
+
+    path = path[len(include):]
+    return dest + path
+
+def _proto_gen_impl(ctx):
+    """General implementation for generating protos"""
+    srcs = ctx.files.srcs
+    deps = []
+    deps += ctx.files.srcs
+    source_dir = _SourceDir(ctx)
+    gen_dir = _GenDir(ctx)
+    if source_dir:
+        import_flags = ["-I" + source_dir, "-I" + gen_dir]
+    else:
+        import_flags = ["-I."]
+
+    for dep in ctx.attr.deps:
+        import_flags += dep.proto.import_flags
+        deps += dep.proto.deps
+
+    args = []
+    if ctx.attr.gen_cc:
+        args += ["--cpp_out=" + gen_dir]
+    if ctx.attr.gen_py:
+        args += ["--python_out=" + gen_dir]
+
+    inputs = srcs + deps
+    if ctx.executable.plugin:
+        plugin = ctx.executable.plugin
+        lang = ctx.attr.plugin_language
+        if not lang and plugin.basename.startswith("protoc-gen-"):
+            lang = plugin.basename[len("protoc-gen-"):]
+        if not lang:
+            fail("cannot infer the target language of plugin", "plugin_language")
+
+        outdir = gen_dir
+        if ctx.attr.plugin_options:
+            outdir = ",".join(ctx.attr.plugin_options) + ":" + outdir
+        args += ["--plugin=protoc-gen-%s=%s" % (lang, plugin.path)]
+        args += ["--%s_out=%s" % (lang, outdir)]
+        inputs += [plugin]
+
+    if args:
+        ctx.action(
+            inputs = inputs,
+            outputs = ctx.outputs.outs,
+            arguments = args + import_flags + [s.path for s in srcs],
+            executable = ctx.executable.protoc,
+            mnemonic = "ProtoCompile",
+            use_default_shell_env = True,
+        )
+
+    return struct(
+        proto = struct(
+            srcs = srcs,
+            import_flags = import_flags,
+            deps = deps,
+        ),
+    )
+
+proto_gen = rule(
+    attrs = {
+        "srcs": attr.label_list(allow_files = True),
+        "deps": attr.label_list(providers = ["proto"]),
+        "includes": attr.string_list(),
+        "protoc": attr.label(
+            cfg = "host",
+            executable = True,
+            single_file = True,
+            mandatory = True,
+        ),
+        "plugin": attr.label(
+            cfg = "host",
+            allow_files = True,
+            executable = True,
+        ),
+        "plugin_language": attr.string(),
+        "plugin_options": attr.string_list(),
+        "gen_cc": attr.bool(),
+        "gen_py": attr.bool(),
+        "outs": attr.output_list(),
+    },
+    output_to_genfiles = True,
+    implementation = _proto_gen_impl,
+)
+"""Generates codes from Protocol Buffers definitions.
+
+This rule helps you to implement Skylark macros specific to the target
+language. You should prefer more specific `cc_proto_library `,
+`py_proto_library` and others unless you are adding such wrapper macros.
+
+Args:
+  srcs: Protocol Buffers definition files (.proto) to run the protocol compiler
+    against.
+  deps: a list of dependency labels; must be other proto libraries.
+  includes: a list of include paths to .proto files.
+  protoc: the label of the protocol compiler to generate the sources.
+  plugin: the label of the protocol compiler plugin to be passed to the protocol
+    compiler.
+  plugin_language: the language of the generated sources
+  plugin_options: a list of options to be passed to the plugin
+  gen_cc: generates C++ sources in addition to the ones from the plugin.
+  gen_py: generates Python sources in addition to the ones from the plugin.
+  outs: a list of labels of the expected outputs from the protocol compiler.
+"""
+
+def cc_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        cc_libs = [],
+        include = None,
+        protoc = "@com_google_protobuf//:protoc",
+        internal_bootstrap_hack = False,
+        use_grpc_plugin = False,
+        default_runtime = "@com_google_protobuf//:protobuf",
+        **kargs):
+    """Bazel rule to create a C++ protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the cc_proto_library.
+      srcs: the .proto files of the cc_proto_library.
+      deps: a list of dependency labels; must be cc_proto_library.
+      cc_libs: a list of other cc_library targets depended by the generated
+          cc_library.
+      include: a string indicating the include path of the .proto files.
+      protoc: the label of the protocol compiler to generate the sources.
+      internal_bootstrap_hack: a flag indicate the cc_proto_library is used only
+          for bootstraping. When it is set to True, no files will be generated.
+          The rule will simply be a provider for .proto files, so that other
+          cc_proto_library can depend on it.
+      use_grpc_plugin: a flag to indicate whether to call the grpc C++ plugin
+          when processing the proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated cc_library target.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    if internal_bootstrap_hack:
+        # For pre-checked-in generated files, we add the internal_bootstrap_hack
+        # which will skip the codegen action.
+        proto_gen(
+            name = name + "_genproto",
+            srcs = srcs,
+            deps = [s + "_genproto" for s in deps],
+            includes = includes,
+            protoc = protoc,
+            visibility = ["//visibility:public"],
+        )
+
+        # An empty cc_library to make rule dependency consistent.
+        native.cc_library(
+            name = name,
+            **kargs
+        )
+        return
+
+    grpc_cpp_plugin = None
+    if use_grpc_plugin:
+        grpc_cpp_plugin = "//external:grpc_cpp_plugin"
+
+    gen_srcs = _CcSrcs(srcs, use_grpc_plugin)
+    gen_hdrs = _CcHdrs(srcs, use_grpc_plugin)
+    outs = gen_srcs + gen_hdrs
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        plugin = grpc_cpp_plugin,
+        plugin_language = "grpc",
+        gen_cc = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+    )
+
+    if default_runtime and not default_runtime in cc_libs:
+        cc_libs = cc_libs + [default_runtime]
+    if use_grpc_plugin:
+        cc_libs = cc_libs + ["//external:grpc_lib"]
+
+    native.cc_library(
+        name = name,
+        srcs = gen_srcs,
+        hdrs = gen_hdrs,
+        deps = cc_libs + deps,
+        includes = includes,
+        **kargs
+    )
+
+def internal_gen_well_known_protos_java(srcs):
+    """Bazel rule to generate the gen_well_known_protos_java genrule
+
+    Args:
+      srcs: the well known protos
+    """
+    root = Label("%s//protobuf_java" % (REPOSITORY_NAME)).workspace_root
+    pkg = PACKAGE_NAME + "/" if PACKAGE_NAME else ""
+    if root == "":
+        include = " -I%ssrc " % pkg
+    else:
+        include = " -I%s/%ssrc " % (root, pkg)
+    native.genrule(
+        name = "gen_well_known_protos_java",
+        srcs = srcs,
+        outs = [
+            "wellknown.srcjar",
+        ],
+        cmd = "$(location :protoc) --java_out=$(@D)/wellknown.jar" +
+              " %s $(SRCS) " % include +
+              " && mv $(@D)/wellknown.jar $(@D)/wellknown.srcjar",
+        tools = [":protoc"],
+    )
+
+def internal_copied_filegroup(name, srcs, strip_prefix, dest, **kwargs):
+    """Macro to copy files to a different directory and then create a filegroup.
+
+    This is used by the //:protobuf_python py_proto_library target to work around
+    an issue caused by Python source files that are part of the same Python
+    package being in separate directories.
+
+    Args:
+      srcs: The source files to copy and add to the filegroup.
+      strip_prefix: Path to the root of the files to copy.
+      dest: The directory to copy the source files into.
+      **kwargs: extra arguments that will be passesd to the filegroup.
+    """
+    outs = [_RelativeOutputPath(s, strip_prefix, dest) for s in srcs]
+
+    native.genrule(
+        name = name + "_genrule",
+        srcs = srcs,
+        outs = outs,
+        cmd = " && ".join(
+            ["cp $(location %s) $(location %s)" %
+             (s, _RelativeOutputPath(s, strip_prefix, dest)) for s in srcs],
+        ),
+    )
+
+    native.filegroup(
+        name = name,
+        srcs = outs,
+        **kwargs
+    )
+
+def py_proto_library(
+        name,
+        srcs = [],
+        deps = [],
+        py_libs = [],
+        py_extra_srcs = [],
+        include = None,
+        default_runtime = "@com_google_protobuf//:protobuf_python",
+        protoc = "@com_google_protobuf//:protoc",
+        use_grpc_plugin = False,
+        **kargs):
+    """Bazel rule to create a Python protobuf library from proto source files
+
+    NOTE: the rule is only an internal workaround to generate protos. The
+    interface may change and the rule may be removed when bazel has introduced
+    the native rule.
+
+    Args:
+      name: the name of the py_proto_library.
+      srcs: the .proto files of the py_proto_library.
+      deps: a list of dependency labels; must be py_proto_library.
+      py_libs: a list of other py_library targets depended by the generated
+          py_library.
+      py_extra_srcs: extra source files that will be added to the output
+          py_library. This attribute is used for internal bootstrapping.
+      include: a string indicating the include path of the .proto files.
+      default_runtime: the implicitly default runtime which will be depended on by
+          the generated py_library target.
+      protoc: the label of the protocol compiler to generate the sources.
+      use_grpc_plugin: a flag to indicate whether to call the Python C++ plugin
+          when processing the proto files.
+      **kargs: other keyword arguments that are passed to cc_library.
+
+    """
+    outs = _PyOuts(srcs, use_grpc_plugin)
+
+    includes = []
+    if include != None:
+        includes = [include]
+
+    grpc_python_plugin = None
+    if use_grpc_plugin:
+        grpc_python_plugin = "//external:grpc_python_plugin"
+        # Note: Generated grpc code depends on Python grpc module. This dependency
+        # is not explicitly listed in py_libs. Instead, host system is assumed to
+        # have grpc installed.
+
+    proto_gen(
+        name = name + "_genproto",
+        srcs = srcs,
+        deps = [s + "_genproto" for s in deps],
+        includes = includes,
+        protoc = protoc,
+        gen_py = 1,
+        outs = outs,
+        visibility = ["//visibility:public"],
+        plugin = grpc_python_plugin,
+        plugin_language = "grpc",
+    )
+
+    if default_runtime and not default_runtime in py_libs + deps:
+        py_libs = py_libs + [default_runtime]
+
+    native.py_library(
+        name = name,
+        srcs = outs + py_extra_srcs,
+        deps = py_libs + deps,
+        imports = includes,
+        **kargs
+    )
+
+def internal_protobuf_py_tests(
+        name,
+        modules = [],
+        **kargs):
+    """Bazel rules to create batch tests for protobuf internal.
+
+    Args:
+      name: the name of the rule.
+      modules: a list of modules for tests. The macro will create a py_test for
+          each of the parameter with the source "google/protobuf/%s.py"
+      kargs: extra parameters that will be passed into the py_test.
+
+    """
+    for m in modules:
+        s = "python/google/protobuf/internal/%s.py" % m
+        native.py_test(
+            name = "py_%s" % m,
+            srcs = [s],
+            main = s,
+            **kargs
+        )
+
+def check_protobuf_required_bazel_version():
+    """For WORKSPACE files, to check the installed version of bazel.
+
+    This ensures bazel supports our approach to proto_library() depending on a
+    copied filegroup. (Fixed in bazel 0.5.4)
+    """
+    expected = apple_common.dotted_version("0.5.4")
+    current = apple_common.dotted_version(native.bazel_version)
+    if current.compare_to(expected) < 0:
+        fail("Bazel must be newer than 0.5.4")
diff --git a/third_party/systemlibs/syslibs_configure.bzl b/third_party/systemlibs/syslibs_configure.bzl
index dbf4fd6e32fe3ac91d2f553cac4176ca6c21961f..645d242c96c02a6a90b84334af1ac2fd11e437da 100644
--- a/third_party/systemlibs/syslibs_configure.bzl
+++ b/third_party/systemlibs/syslibs_configure.bzl
@@ -15,6 +15,8 @@ VALID_LIBS = [
     "boringssl",
     "com_github_googleapis_googleapis",
     "com_github_googlecloudplatform_google_cloud_cpp",
+    "com_google_protobuf",
+    "com_google_protobuf_cc",
     "com_googlesource_code_re2",
     "curl",
     "cython",
@@ -32,6 +34,7 @@ VALID_LIBS = [
     "org_sqlite",
     "pcre",
     "png_archive",
+    "protobuf_archive",
     "six_archive",
     "snappy",
     "swig",
diff --git a/third_party/tflite_mobilenet_float.BUILD b/third_party/tflite_mobilenet_float.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_float.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/tflite_mobilenet_quant.BUILD b/third_party/tflite_mobilenet_quant.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..de47ed61f9db9ad980468aa325e3c770e0aae4f1
--- /dev/null
+++ b/third_party/tflite_mobilenet_quant.BUILD
@@ -0,0 +1,12 @@
+package(default_visibility = ["//visibility:public"])
+
+licenses(["notice"])  # Apache 2.0
+
+exports_files(
+    glob(
+        ["**/*"],
+        exclude = [
+            "BUILD",
+        ],
+    ),
+)
diff --git a/third_party/toolchains/BUILD b/third_party/toolchains/BUILD
index a7b4687c020e3d9176a5e451bdf9e20aec569b5e..9da417fd5fe18619de6dc51032b8e3cde21b6ffb 100644
--- a/third_party/toolchains/BUILD
+++ b/third_party/toolchains/BUILD
@@ -35,3 +35,16 @@ platform(
             value:"docker://gcr.io/asci-toolchain/nosla-cuda9.0-cudnn7-ubuntu14.04@%s"
         }""" % container_digests["cuda9.0-cudnn7-ubuntu14.04"],
 )
+
+platform(
+    name = "rbe_cuda10.0-cudnn7-ubuntu14.04",
+    constraint_values = [
+        "@bazel_tools//platforms:x86_64",
+        "@bazel_tools//platforms:linux",
+    ],
+    remote_execution_properties = """
+        properties: {
+            name: "container-image"
+            value:"docker://gcr.io/asci-toolchain/nosla-cuda10.0-cudnn7-ubuntu14.04@%s"
+        }""" % container_digests["cuda10.0-cudnn7-ubuntu14.04"],
+)
diff --git a/third_party/toolchains/preconfig/generate/containers.bzl b/third_party/toolchains/preconfig/generate/containers.bzl
index 1f9e29d4402dc2a969d01291d7772219415bbf3e..7099b9bf3e4715706cbe725373add4cc98d304b8 100644
--- a/third_party/toolchains/preconfig/generate/containers.bzl
+++ b/third_party/toolchains/preconfig/generate/containers.bzl
@@ -1,4 +1,4 @@
 container_digests = {
     "cuda9.0-cudnn7-ubuntu14.04": "sha256:c26138f4c38c754da2bad44a8a068523abf7fbd71d58a57ce92e5342c5431bf5",
-    "cuda10.0-cudnn7-ubuntu14.04": "sha256:34c4a55e2376b300cdc2b903775fc32e62352f6e33f927df5653743324378bfc",
+    "cuda10.0-cudnn7-ubuntu14.04": "sha256:66e7d592c8149291d5562a0f3093655a15b09c22e0eb30a87b3b6469b7a30ffc",
 }
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..b61f572d6d2e4155a1b8c889407f9e0cb54b7674
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/WORKSPACE
@@ -0,0 +1,2 @@
+# DO NOT EDIT: automatically generated WORKSPACE file for cuda_configure rule
+workspace(name = "local_config_cuda")
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..c813efccf9b82578984b33d04fd513030c83e0b1
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/BUILD
@@ -0,0 +1,1275 @@
+licenses(["restricted"])  # MPL2, portions GPL v3, LGPL v3, BSD-like
+
+package(default_visibility = ["//visibility:public"])
+
+config_setting(
+    name = "using_nvcc",
+    values = {
+        "define": "using_cuda_nvcc=true",
+    },
+)
+
+config_setting(
+    name = "using_clang",
+    values = {
+        "define": "using_cuda_clang=true",
+    },
+)
+
+# Equivalent to using_clang && -c opt.
+config_setting(
+    name = "using_clang_opt",
+    values = {
+        "define": "using_cuda_clang=true",
+        "compilation_mode": "opt",
+    },
+)
+
+config_setting(
+    name = "darwin",
+    values = {"cpu": "darwin"},
+    visibility = ["//visibility:public"],
+)
+
+config_setting(
+    name = "freebsd",
+    values = {"cpu": "freebsd"},
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-include",
+        ":cudnn-include",
+    ],
+    includes = [
+        ".",
+        "cuda/include",
+        "cuda/include/crt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart_static",
+    srcs = ["cuda/lib/libcudart_static.a"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = select({
+        ":freebsd": [],
+        "//conditions:default": ["-ldl"],
+    }) + [
+        "-lpthread",
+        "-lrt",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda_driver",
+    srcs = ["cuda/lib/libcuda.so"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudart",
+    srcs = ["cuda/lib/libcudart.so.10.0"],
+    data = ["cuda/lib/libcudart.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cublas",
+    srcs = ["cuda/lib/libcublas.so.10.0"],
+    data = ["cuda/lib/libcublas.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cusolver",
+    srcs = ["cuda/lib/libcusolver.so.10.0"],
+    data = ["cuda/lib/libcusolver.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkopts = ["-lgomp"],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudnn",
+    srcs = ["cuda/lib/libcudnn.so.7"],
+    data = ["cuda/lib/libcudnn.so.7"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cudnn_header",
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cufft",
+    srcs = ["cuda/lib/libcufft.so.10.0"],
+    data = ["cuda/lib/libcufft.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "curand",
+    srcs = ["cuda/lib/libcurand.so.10.0"],
+    data = ["cuda/lib/libcurand.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    linkstatic = 1,
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cuda",
+    visibility = ["//visibility:public"],
+    deps = [
+        ":cublas",
+        ":cuda_headers",
+        ":cudart",
+        ":cudnn",
+        ":cufft",
+        ":curand",
+    ],
+)
+
+cc_library(
+    name = "cupti_headers",
+    hdrs = [
+        "cuda/cuda_config.h",
+        ":cuda-extras",
+    ],
+    includes = [
+        ".",
+        "cuda/extras/CUPTI/include/",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "cupti_dsos",
+    data = ["cuda/lib/libcupti.so.10.0"],
+    includes = [
+        ".",
+        "cuda/include",
+    ],
+    visibility = ["//visibility:public"],
+)
+
+cc_library(
+    name = "libdevice_root",
+    data = [":cuda-nvvm"],
+    visibility = ["//visibility:public"],
+)
+
+genrule(
+    name = "cuda-include",
+    outs = [
+        "cuda/include/CL/cl.h",
+        "cuda/include/CL/cl.hpp",
+        "cuda/include/CL/cl_egl.h",
+        "cuda/include/CL/cl_ext.h",
+        "cuda/include/CL/cl_gl.h",
+        "cuda/include/CL/cl_gl_ext.h",
+        "cuda/include/CL/cl_platform.h",
+        "cuda/include/CL/opencl.h",
+        "cuda/include/builtin_types.h",
+        "cuda/include/channel_descriptor.h",
+        "cuda/include/common_functions.h",
+        "cuda/include/cooperative_groups.h",
+        "cuda/include/cooperative_groups_helpers.h",
+        "cuda/include/crt/common_functions.h",
+        "cuda/include/crt/device_double_functions.h",
+        "cuda/include/crt/device_double_functions.hpp",
+        "cuda/include/crt/device_functions.h",
+        "cuda/include/crt/device_functions.hpp",
+        "cuda/include/crt/func_macro.h",
+        "cuda/include/crt/host_config.h",
+        "cuda/include/crt/host_defines.h",
+        "cuda/include/crt/host_runtime.h",
+        "cuda/include/crt/math_functions.h",
+        "cuda/include/crt/math_functions.hpp",
+        "cuda/include/crt/mma.h",
+        "cuda/include/crt/mma.hpp",
+        "cuda/include/crt/nvfunctional",
+        "cuda/include/crt/sm_70_rt.h",
+        "cuda/include/crt/sm_70_rt.hpp",
+        "cuda/include/crt/storage_class.h",
+        "cuda/include/cuComplex.h",
+        "cuda/include/cublas.h",
+        "cuda/include/cublasXt.h",
+        "cuda/include/cublas_api.h",
+        "cuda/include/cublas_v2.h",
+        "cuda/include/cuda.h",
+        "cuda/include/cudaEGL.h",
+        "cuda/include/cudaGL.h",
+        "cuda/include/cudaProfiler.h",
+        "cuda/include/cudaVDPAU.h",
+        "cuda/include/cuda_device_runtime_api.h",
+        "cuda/include/cuda_egl_interop.h",
+        "cuda/include/cuda_fp16.h",
+        "cuda/include/cuda_fp16.hpp",
+        "cuda/include/cuda_gl_interop.h",
+        "cuda/include/cuda_occupancy.h",
+        "cuda/include/cuda_profiler_api.h",
+        "cuda/include/cuda_runtime.h",
+        "cuda/include/cuda_runtime_api.h",
+        "cuda/include/cuda_surface_types.h",
+        "cuda/include/cuda_texture_types.h",
+        "cuda/include/cuda_vdpau_interop.h",
+        "cuda/include/cudalibxt.h",
+        "cuda/include/cudart_platform.h",
+        "cuda/include/cufft.h",
+        "cuda/include/cufftXt.h",
+        "cuda/include/cufftw.h",
+        "cuda/include/curand.h",
+        "cuda/include/curand_discrete.h",
+        "cuda/include/curand_discrete2.h",
+        "cuda/include/curand_globals.h",
+        "cuda/include/curand_kernel.h",
+        "cuda/include/curand_lognormal.h",
+        "cuda/include/curand_mrg32k3a.h",
+        "cuda/include/curand_mtgp32.h",
+        "cuda/include/curand_mtgp32_host.h",
+        "cuda/include/curand_mtgp32_kernel.h",
+        "cuda/include/curand_mtgp32dc_p_11213.h",
+        "cuda/include/curand_normal.h",
+        "cuda/include/curand_normal_static.h",
+        "cuda/include/curand_philox4x32_x.h",
+        "cuda/include/curand_poisson.h",
+        "cuda/include/curand_precalc.h",
+        "cuda/include/curand_uniform.h",
+        "cuda/include/cusolverDn.h",
+        "cuda/include/cusolverRf.h",
+        "cuda/include/cusolverSp.h",
+        "cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h",
+        "cuda/include/cusolver_common.h",
+        "cuda/include/cusparse.h",
+        "cuda/include/cusparse_v2.h",
+        "cuda/include/device_atomic_functions.h",
+        "cuda/include/device_atomic_functions.hpp",
+        "cuda/include/device_double_functions.h",
+        "cuda/include/device_functions.h",
+        "cuda/include/device_launch_parameters.h",
+        "cuda/include/device_types.h",
+        "cuda/include/driver_functions.h",
+        "cuda/include/driver_types.h",
+        "cuda/include/fatBinaryCtl.h",
+        "cuda/include/fatbinary.h",
+        "cuda/include/host_config.h",
+        "cuda/include/host_defines.h",
+        "cuda/include/library_types.h",
+        "cuda/include/math_constants.h",
+        "cuda/include/math_functions.h",
+        "cuda/include/mma.h",
+        "cuda/include/npp.h",
+        "cuda/include/nppcore.h",
+        "cuda/include/nppdefs.h",
+        "cuda/include/nppi.h",
+        "cuda/include/nppi_arithmetic_and_logical_operations.h",
+        "cuda/include/nppi_color_conversion.h",
+        "cuda/include/nppi_compression_functions.h",
+        "cuda/include/nppi_computer_vision.h",
+        "cuda/include/nppi_data_exchange_and_initialization.h",
+        "cuda/include/nppi_filtering_functions.h",
+        "cuda/include/nppi_geometry_transforms.h",
+        "cuda/include/nppi_linear_transforms.h",
+        "cuda/include/nppi_morphological_operations.h",
+        "cuda/include/nppi_statistics_functions.h",
+        "cuda/include/nppi_support_functions.h",
+        "cuda/include/nppi_threshold_and_compare_operations.h",
+        "cuda/include/npps.h",
+        "cuda/include/npps_arithmetic_and_logical_operations.h",
+        "cuda/include/npps_conversion_functions.h",
+        "cuda/include/npps_filtering_functions.h",
+        "cuda/include/npps_initialization.h",
+        "cuda/include/npps_statistics_functions.h",
+        "cuda/include/npps_support_functions.h",
+        "cuda/include/nppversion.h",
+        "cuda/include/nvToolsExt.h",
+        "cuda/include/nvToolsExtCuda.h",
+        "cuda/include/nvToolsExtCudaRt.h",
+        "cuda/include/nvToolsExtMeta.h",
+        "cuda/include/nvToolsExtSync.h",
+        "cuda/include/nvblas.h",
+        "cuda/include/nvfunctional",
+        "cuda/include/nvgraph.h",
+        "cuda/include/nvjpeg.h",
+        "cuda/include/nvml.h",
+        "cuda/include/nvrtc.h",
+        "cuda/include/nvtx3/nvToolsExt.h",
+        "cuda/include/nvtx3/nvToolsExtCuda.h",
+        "cuda/include/nvtx3/nvToolsExtCudaRt.h",
+        "cuda/include/nvtx3/nvToolsExtOpenCL.h",
+        "cuda/include/nvtx3/nvToolsExtSync.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImpl.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInit.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h",
+        "cuda/include/nvtx3/nvtxDetail/nvtxTypes.h",
+        "cuda/include/sm_20_atomic_functions.h",
+        "cuda/include/sm_20_atomic_functions.hpp",
+        "cuda/include/sm_20_intrinsics.h",
+        "cuda/include/sm_20_intrinsics.hpp",
+        "cuda/include/sm_30_intrinsics.h",
+        "cuda/include/sm_30_intrinsics.hpp",
+        "cuda/include/sm_32_atomic_functions.h",
+        "cuda/include/sm_32_atomic_functions.hpp",
+        "cuda/include/sm_32_intrinsics.h",
+        "cuda/include/sm_32_intrinsics.hpp",
+        "cuda/include/sm_35_atomic_functions.h",
+        "cuda/include/sm_35_intrinsics.h",
+        "cuda/include/sm_60_atomic_functions.h",
+        "cuda/include/sm_60_atomic_functions.hpp",
+        "cuda/include/sm_61_intrinsics.h",
+        "cuda/include/sm_61_intrinsics.hpp",
+        "cuda/include/sobol_direction_vectors.h",
+        "cuda/include/surface_functions.h",
+        "cuda/include/surface_functions.hpp",
+        "cuda/include/surface_indirect_functions.h",
+        "cuda/include/surface_indirect_functions.hpp",
+        "cuda/include/surface_types.h",
+        "cuda/include/texture_fetch_functions.h",
+        "cuda/include/texture_fetch_functions.hpp",
+        "cuda/include/texture_indirect_functions.h",
+        "cuda/include/texture_indirect_functions.hpp",
+        "cuda/include/texture_types.h",
+        "cuda/include/thrust/adjacent_difference.h",
+        "cuda/include/thrust/advance.h",
+        "cuda/include/thrust/binary_search.h",
+        "cuda/include/thrust/complex.h",
+        "cuda/include/thrust/copy.h",
+        "cuda/include/thrust/count.h",
+        "cuda/include/thrust/detail/adjacent_difference.inl",
+        "cuda/include/thrust/detail/advance.inl",
+        "cuda/include/thrust/detail/alignment.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.h",
+        "cuda/include/thrust/detail/allocator/allocator_traits.inl",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.h",
+        "cuda/include/thrust/detail/allocator/copy_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/default_construct_range.h",
+        "cuda/include/thrust/detail/allocator/default_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/destroy_range.h",
+        "cuda/include/thrust/detail/allocator/destroy_range.inl",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.h",
+        "cuda/include/thrust/detail/allocator/fill_construct_range.inl",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.h",
+        "cuda/include/thrust/detail/allocator/malloc_allocator.inl",
+        "cuda/include/thrust/detail/allocator/no_throw_allocator.h",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.h",
+        "cuda/include/thrust/detail/allocator/tagged_allocator.inl",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.h",
+        "cuda/include/thrust/detail/allocator/temporary_allocator.inl",
+        "cuda/include/thrust/detail/binary_search.inl",
+        "cuda/include/thrust/detail/complex/arithmetic.h",
+        "cuda/include/thrust/detail/complex/c99math.h",
+        "cuda/include/thrust/detail/complex/catrig.h",
+        "cuda/include/thrust/detail/complex/catrigf.h",
+        "cuda/include/thrust/detail/complex/ccosh.h",
+        "cuda/include/thrust/detail/complex/ccoshf.h",
+        "cuda/include/thrust/detail/complex/cexp.h",
+        "cuda/include/thrust/detail/complex/cexpf.h",
+        "cuda/include/thrust/detail/complex/clog.h",
+        "cuda/include/thrust/detail/complex/clogf.h",
+        "cuda/include/thrust/detail/complex/complex.inl",
+        "cuda/include/thrust/detail/complex/cpow.h",
+        "cuda/include/thrust/detail/complex/cproj.h",
+        "cuda/include/thrust/detail/complex/csinh.h",
+        "cuda/include/thrust/detail/complex/csinhf.h",
+        "cuda/include/thrust/detail/complex/csqrt.h",
+        "cuda/include/thrust/detail/complex/csqrtf.h",
+        "cuda/include/thrust/detail/complex/ctanh.h",
+        "cuda/include/thrust/detail/complex/ctanhf.h",
+        "cuda/include/thrust/detail/complex/math_private.h",
+        "cuda/include/thrust/detail/complex/stream.h",
+        "cuda/include/thrust/detail/config.h",
+        "cuda/include/thrust/detail/config/compiler.h",
+        "cuda/include/thrust/detail/config/compiler_fence.h",
+        "cuda/include/thrust/detail/config/config.h",
+        "cuda/include/thrust/detail/config/debug.h",
+        "cuda/include/thrust/detail/config/device_system.h",
+        "cuda/include/thrust/detail/config/exec_check_disable.h",
+        "cuda/include/thrust/detail/config/forceinline.h",
+        "cuda/include/thrust/detail/config/global_workarounds.h",
+        "cuda/include/thrust/detail/config/host_device.h",
+        "cuda/include/thrust/detail/config/host_system.h",
+        "cuda/include/thrust/detail/config/simple_defines.h",
+        "cuda/include/thrust/detail/contiguous_storage.h",
+        "cuda/include/thrust/detail/contiguous_storage.inl",
+        "cuda/include/thrust/detail/copy.h",
+        "cuda/include/thrust/detail/copy.inl",
+        "cuda/include/thrust/detail/copy_if.h",
+        "cuda/include/thrust/detail/copy_if.inl",
+        "cuda/include/thrust/detail/count.inl",
+        "cuda/include/thrust/detail/cstdint.h",
+        "cuda/include/thrust/detail/device_delete.inl",
+        "cuda/include/thrust/detail/device_free.inl",
+        "cuda/include/thrust/detail/device_malloc.inl",
+        "cuda/include/thrust/detail/device_new.inl",
+        "cuda/include/thrust/detail/device_ptr.inl",
+        "cuda/include/thrust/detail/device_reference.inl",
+        "cuda/include/thrust/detail/device_vector.inl",
+        "cuda/include/thrust/detail/dispatch/is_trivial_copy.h",
+        "cuda/include/thrust/detail/distance.inl",
+        "cuda/include/thrust/detail/equal.inl",
+        "cuda/include/thrust/detail/execute_with_allocator.h",
+        "cuda/include/thrust/detail/execution_policy.h",
+        "cuda/include/thrust/detail/extrema.inl",
+        "cuda/include/thrust/detail/fill.inl",
+        "cuda/include/thrust/detail/find.inl",
+        "cuda/include/thrust/detail/for_each.inl",
+        "cuda/include/thrust/detail/function.h",
+        "cuda/include/thrust/detail/functional.inl",
+        "cuda/include/thrust/detail/functional/actor.h",
+        "cuda/include/thrust/detail/functional/actor.inl",
+        "cuda/include/thrust/detail/functional/argument.h",
+        "cuda/include/thrust/detail/functional/composite.h",
+        "cuda/include/thrust/detail/functional/operators.h",
+        "cuda/include/thrust/detail/functional/operators/arithmetic_operators.h",
+        "cuda/include/thrust/detail/functional/operators/assignment_operator.h",
+        "cuda/include/thrust/detail/functional/operators/bitwise_operators.h",
+        "cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h",
+        "cuda/include/thrust/detail/functional/operators/logical_operators.h",
+        "cuda/include/thrust/detail/functional/operators/operator_adaptors.h",
+        "cuda/include/thrust/detail/functional/operators/relational_operators.h",
+        "cuda/include/thrust/detail/functional/placeholder.h",
+        "cuda/include/thrust/detail/functional/value.h",
+        "cuda/include/thrust/detail/gather.inl",
+        "cuda/include/thrust/detail/generate.inl",
+        "cuda/include/thrust/detail/get_iterator_value.h",
+        "cuda/include/thrust/detail/host_vector.inl",
+        "cuda/include/thrust/detail/inner_product.inl",
+        "cuda/include/thrust/detail/integer_math.h",
+        "cuda/include/thrust/detail/integer_traits.h",
+        "cuda/include/thrust/detail/internal_functional.h",
+        "cuda/include/thrust/detail/logical.inl",
+        "cuda/include/thrust/detail/malloc_and_free.h",
+        "cuda/include/thrust/detail/merge.inl",
+        "cuda/include/thrust/detail/minmax.h",
+        "cuda/include/thrust/detail/mismatch.inl",
+        "cuda/include/thrust/detail/mpl/math.h",
+        "cuda/include/thrust/detail/numeric_traits.h",
+        "cuda/include/thrust/detail/overlapped_copy.h",
+        "cuda/include/thrust/detail/pair.inl",
+        "cuda/include/thrust/detail/partition.inl",
+        "cuda/include/thrust/detail/pointer.h",
+        "cuda/include/thrust/detail/pointer.inl",
+        "cuda/include/thrust/detail/preprocessor.h",
+        "cuda/include/thrust/detail/range/head_flags.h",
+        "cuda/include/thrust/detail/range/tail_flags.h",
+        "cuda/include/thrust/detail/raw_pointer_cast.h",
+        "cuda/include/thrust/detail/raw_reference_cast.h",
+        "cuda/include/thrust/detail/reduce.inl",
+        "cuda/include/thrust/detail/reference.h",
+        "cuda/include/thrust/detail/reference.inl",
+        "cuda/include/thrust/detail/reference_forward_declaration.h",
+        "cuda/include/thrust/detail/remove.inl",
+        "cuda/include/thrust/detail/replace.inl",
+        "cuda/include/thrust/detail/reverse.inl",
+        "cuda/include/thrust/detail/scan.inl",
+        "cuda/include/thrust/detail/scatter.inl",
+        "cuda/include/thrust/detail/seq.h",
+        "cuda/include/thrust/detail/sequence.inl",
+        "cuda/include/thrust/detail/set_operations.inl",
+        "cuda/include/thrust/detail/sort.inl",
+        "cuda/include/thrust/detail/static_assert.h",
+        "cuda/include/thrust/detail/static_map.h",
+        "cuda/include/thrust/detail/swap.h",
+        "cuda/include/thrust/detail/swap.inl",
+        "cuda/include/thrust/detail/swap_ranges.inl",
+        "cuda/include/thrust/detail/tabulate.inl",
+        "cuda/include/thrust/detail/temporary_array.h",
+        "cuda/include/thrust/detail/temporary_array.inl",
+        "cuda/include/thrust/detail/temporary_buffer.h",
+        "cuda/include/thrust/detail/transform.inl",
+        "cuda/include/thrust/detail/transform_reduce.inl",
+        "cuda/include/thrust/detail/transform_scan.inl",
+        "cuda/include/thrust/detail/trivial_sequence.h",
+        "cuda/include/thrust/detail/tuple.inl",
+        "cuda/include/thrust/detail/tuple_meta_transform.h",
+        "cuda/include/thrust/detail/tuple_transform.h",
+        "cuda/include/thrust/detail/type_traits.h",
+        "cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h",
+        "cuda/include/thrust/detail/type_traits/function_traits.h",
+        "cuda/include/thrust/detail/type_traits/has_member_function.h",
+        "cuda/include/thrust/detail/type_traits/has_nested_type.h",
+        "cuda/include/thrust/detail/type_traits/has_trivial_assign.h",
+        "cuda/include/thrust/detail/type_traits/is_call_possible.h",
+        "cuda/include/thrust/detail/type_traits/is_metafunction_defined.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h",
+        "cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h",
+        "cuda/include/thrust/detail/type_traits/minimum_type.h",
+        "cuda/include/thrust/detail/type_traits/pointer_traits.h",
+        "cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h",
+        "cuda/include/thrust/detail/uninitialized_copy.inl",
+        "cuda/include/thrust/detail/uninitialized_fill.inl",
+        "cuda/include/thrust/detail/unique.inl",
+        "cuda/include/thrust/detail/use_default.h",
+        "cuda/include/thrust/detail/util/align.h",
+        "cuda/include/thrust/detail/util/blocking.h",
+        "cuda/include/thrust/detail/vector_base.h",
+        "cuda/include/thrust/detail/vector_base.inl",
+        "cuda/include/thrust/device_allocator.h",
+        "cuda/include/thrust/device_delete.h",
+        "cuda/include/thrust/device_free.h",
+        "cuda/include/thrust/device_malloc.h",
+        "cuda/include/thrust/device_malloc_allocator.h",
+        "cuda/include/thrust/device_new.h",
+        "cuda/include/thrust/device_new_allocator.h",
+        "cuda/include/thrust/device_ptr.h",
+        "cuda/include/thrust/device_reference.h",
+        "cuda/include/thrust/device_vector.h",
+        "cuda/include/thrust/distance.h",
+        "cuda/include/thrust/equal.h",
+        "cuda/include/thrust/execution_policy.h",
+        "cuda/include/thrust/extrema.h",
+        "cuda/include/thrust/fill.h",
+        "cuda/include/thrust/find.h",
+        "cuda/include/thrust/for_each.h",
+        "cuda/include/thrust/functional.h",
+        "cuda/include/thrust/gather.h",
+        "cuda/include/thrust/generate.h",
+        "cuda/include/thrust/host_vector.h",
+        "cuda/include/thrust/inner_product.h",
+        "cuda/include/thrust/iterator/constant_iterator.h",
+        "cuda/include/thrust/iterator/counting_iterator.h",
+        "cuda/include/thrust/iterator/detail/any_assign.h",
+        "cuda/include/thrust/iterator/detail/any_system_tag.h",
+        "cuda/include/thrust/iterator/detail/constant_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/counting_iterator.inl",
+        "cuda/include/thrust/iterator/detail/device_system_tag.h",
+        "cuda/include/thrust/iterator/detail/discard_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/distance_from_result.h",
+        "cuda/include/thrust/iterator/detail/host_system_tag.h",
+        "cuda/include/thrust/iterator/detail/is_iterator_category.h",
+        "cuda/include/thrust/iterator/detail/is_trivial_iterator.h",
+        "cuda/include/thrust/iterator/detail/iterator_adaptor_base.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_system.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h",
+        "cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h",
+        "cuda/include/thrust/iterator/detail/iterator_facade_category.h",
+        "cuda/include/thrust/iterator/detail/iterator_traits.inl",
+        "cuda/include/thrust/iterator/detail/iterator_traversal_tags.h",
+        "cuda/include/thrust/iterator/detail/join_iterator.h",
+        "cuda/include/thrust/iterator/detail/minimum_category.h",
+        "cuda/include/thrust/iterator/detail/minimum_system.h",
+        "cuda/include/thrust/iterator/detail/normal_iterator.h",
+        "cuda/include/thrust/iterator/detail/permutation_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/retag.h",
+        "cuda/include/thrust/iterator/detail/reverse_iterator.inl",
+        "cuda/include/thrust/iterator/detail/reverse_iterator_base.h",
+        "cuda/include/thrust/iterator/detail/tagged_iterator.h",
+        "cuda/include/thrust/iterator/detail/transform_iterator.inl",
+        "cuda/include/thrust/iterator/detail/transform_output_iterator.inl",
+        "cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h",
+        "cuda/include/thrust/iterator/detail/universal_categories.h",
+        "cuda/include/thrust/iterator/detail/zip_iterator.inl",
+        "cuda/include/thrust/iterator/detail/zip_iterator_base.h",
+        "cuda/include/thrust/iterator/discard_iterator.h",
+        "cuda/include/thrust/iterator/iterator_adaptor.h",
+        "cuda/include/thrust/iterator/iterator_categories.h",
+        "cuda/include/thrust/iterator/iterator_facade.h",
+        "cuda/include/thrust/iterator/iterator_traits.h",
+        "cuda/include/thrust/iterator/permutation_iterator.h",
+        "cuda/include/thrust/iterator/retag.h",
+        "cuda/include/thrust/iterator/reverse_iterator.h",
+        "cuda/include/thrust/iterator/transform_iterator.h",
+        "cuda/include/thrust/iterator/transform_output_iterator.h",
+        "cuda/include/thrust/iterator/zip_iterator.h",
+        "cuda/include/thrust/logical.h",
+        "cuda/include/thrust/memory.h",
+        "cuda/include/thrust/merge.h",
+        "cuda/include/thrust/mismatch.h",
+        "cuda/include/thrust/pair.h",
+        "cuda/include/thrust/partition.h",
+        "cuda/include/thrust/random.h",
+        "cuda/include/thrust/random/detail/discard_block_engine.inl",
+        "cuda/include/thrust/random/detail/linear_congruential_engine.inl",
+        "cuda/include/thrust/random/detail/linear_congruential_engine_discard.h",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl",
+        "cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h",
+        "cuda/include/thrust/random/detail/mod.h",
+        "cuda/include/thrust/random/detail/normal_distribution.inl",
+        "cuda/include/thrust/random/detail/normal_distribution_base.h",
+        "cuda/include/thrust/random/detail/random_core_access.h",
+        "cuda/include/thrust/random/detail/subtract_with_carry_engine.inl",
+        "cuda/include/thrust/random/detail/uniform_int_distribution.inl",
+        "cuda/include/thrust/random/detail/uniform_real_distribution.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine.inl",
+        "cuda/include/thrust/random/detail/xor_combine_engine_max.h",
+        "cuda/include/thrust/random/discard_block_engine.h",
+        "cuda/include/thrust/random/linear_congruential_engine.h",
+        "cuda/include/thrust/random/linear_feedback_shift_engine.h",
+        "cuda/include/thrust/random/normal_distribution.h",
+        "cuda/include/thrust/random/subtract_with_carry_engine.h",
+        "cuda/include/thrust/random/uniform_int_distribution.h",
+        "cuda/include/thrust/random/uniform_real_distribution.h",
+        "cuda/include/thrust/random/xor_combine_engine.h",
+        "cuda/include/thrust/reduce.h",
+        "cuda/include/thrust/remove.h",
+        "cuda/include/thrust/replace.h",
+        "cuda/include/thrust/reverse.h",
+        "cuda/include/thrust/scan.h",
+        "cuda/include/thrust/scatter.h",
+        "cuda/include/thrust/sequence.h",
+        "cuda/include/thrust/set_operations.h",
+        "cuda/include/thrust/sort.h",
+        "cuda/include/thrust/swap.h",
+        "cuda/include/thrust/system/cpp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cpp/detail/assign_value.h",
+        "cuda/include/thrust/system/cpp/detail/binary_search.h",
+        "cuda/include/thrust/system/cpp/detail/copy.h",
+        "cuda/include/thrust/system/cpp/detail/copy_if.h",
+        "cuda/include/thrust/system/cpp/detail/count.h",
+        "cuda/include/thrust/system/cpp/detail/equal.h",
+        "cuda/include/thrust/system/cpp/detail/execution_policy.h",
+        "cuda/include/thrust/system/cpp/detail/extrema.h",
+        "cuda/include/thrust/system/cpp/detail/fill.h",
+        "cuda/include/thrust/system/cpp/detail/find.h",
+        "cuda/include/thrust/system/cpp/detail/for_each.h",
+        "cuda/include/thrust/system/cpp/detail/gather.h",
+        "cuda/include/thrust/system/cpp/detail/generate.h",
+        "cuda/include/thrust/system/cpp/detail/get_value.h",
+        "cuda/include/thrust/system/cpp/detail/inner_product.h",
+        "cuda/include/thrust/system/cpp/detail/iter_swap.h",
+        "cuda/include/thrust/system/cpp/detail/logical.h",
+        "cuda/include/thrust/system/cpp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cpp/detail/memory.inl",
+        "cuda/include/thrust/system/cpp/detail/merge.h",
+        "cuda/include/thrust/system/cpp/detail/mismatch.h",
+        "cuda/include/thrust/system/cpp/detail/par.h",
+        "cuda/include/thrust/system/cpp/detail/partition.h",
+        "cuda/include/thrust/system/cpp/detail/reduce.h",
+        "cuda/include/thrust/system/cpp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/remove.h",
+        "cuda/include/thrust/system/cpp/detail/replace.h",
+        "cuda/include/thrust/system/cpp/detail/reverse.h",
+        "cuda/include/thrust/system/cpp/detail/scan.h",
+        "cuda/include/thrust/system/cpp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/scatter.h",
+        "cuda/include/thrust/system/cpp/detail/sequence.h",
+        "cuda/include/thrust/system/cpp/detail/set_operations.h",
+        "cuda/include/thrust/system/cpp/detail/sort.h",
+        "cuda/include/thrust/system/cpp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cpp/detail/tabulate.h",
+        "cuda/include/thrust/system/cpp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cpp/detail/transform.h",
+        "cuda/include/thrust/system/cpp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cpp/detail/transform_scan.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cpp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cpp/detail/unique.h",
+        "cuda/include/thrust/system/cpp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cpp/detail/vector.inl",
+        "cuda/include/thrust/system/cpp/execution_policy.h",
+        "cuda/include/thrust/system/cpp/memory.h",
+        "cuda/include/thrust/system/cpp/vector.h",
+        "cuda/include/thrust/system/cuda/config.h",
+        "cuda/include/thrust/system/cuda/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/cuda/detail/assign_value.h",
+        "cuda/include/thrust/system/cuda/detail/binary_search.h",
+        "cuda/include/thrust/system/cuda/detail/copy.h",
+        "cuda/include/thrust/system/cuda/detail/copy_if.h",
+        "cuda/include/thrust/system/cuda/detail/core/agent_launcher.h",
+        "cuda/include/thrust/system/cuda/detail/core/alignment.h",
+        "cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h",
+        "cuda/include/thrust/system/cuda/detail/core/util.h",
+        "cuda/include/thrust/system/cuda/detail/count.h",
+        "cuda/include/thrust/system/cuda/detail/cross_system.h",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/cub.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_device.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/util_type.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh",
+        "cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh",
+        "cuda/include/thrust/system/cuda/detail/equal.h",
+        "cuda/include/thrust/system/cuda/detail/error.inl",
+        "cuda/include/thrust/system/cuda/detail/execution_policy.h",
+        "cuda/include/thrust/system/cuda/detail/extrema.h",
+        "cuda/include/thrust/system/cuda/detail/fill.h",
+        "cuda/include/thrust/system/cuda/detail/find.h",
+        "cuda/include/thrust/system/cuda/detail/for_each.h",
+        "cuda/include/thrust/system/cuda/detail/gather.h",
+        "cuda/include/thrust/system/cuda/detail/generate.h",
+        "cuda/include/thrust/system/cuda/detail/get_value.h",
+        "cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h",
+        "cuda/include/thrust/system/cuda/detail/guarded_driver_types.h",
+        "cuda/include/thrust/system/cuda/detail/inner_product.h",
+        "cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h",
+        "cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h",
+        "cuda/include/thrust/system/cuda/detail/iter_swap.h",
+        "cuda/include/thrust/system/cuda/detail/logical.h",
+        "cuda/include/thrust/system/cuda/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/cuda/detail/memory.inl",
+        "cuda/include/thrust/system/cuda/detail/merge.h",
+        "cuda/include/thrust/system/cuda/detail/mismatch.h",
+        "cuda/include/thrust/system/cuda/detail/par.h",
+        "cuda/include/thrust/system/cuda/detail/par_to_seq.h",
+        "cuda/include/thrust/system/cuda/detail/parallel_for.h",
+        "cuda/include/thrust/system/cuda/detail/partition.h",
+        "cuda/include/thrust/system/cuda/detail/reduce.h",
+        "cuda/include/thrust/system/cuda/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/remove.h",
+        "cuda/include/thrust/system/cuda/detail/replace.h",
+        "cuda/include/thrust/system/cuda/detail/reverse.h",
+        "cuda/include/thrust/system/cuda/detail/scan.h",
+        "cuda/include/thrust/system/cuda/detail/scan_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/scatter.h",
+        "cuda/include/thrust/system/cuda/detail/sequence.h",
+        "cuda/include/thrust/system/cuda/detail/set_operations.h",
+        "cuda/include/thrust/system/cuda/detail/sort.h",
+        "cuda/include/thrust/system/cuda/detail/swap_ranges.h",
+        "cuda/include/thrust/system/cuda/detail/tabulate.h",
+        "cuda/include/thrust/system/cuda/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/cuda/detail/terminate.h",
+        "cuda/include/thrust/system/cuda/detail/transform.h",
+        "cuda/include/thrust/system/cuda/detail/transform_reduce.h",
+        "cuda/include/thrust/system/cuda/detail/transform_scan.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/cuda/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/cuda/detail/unique.h",
+        "cuda/include/thrust/system/cuda/detail/unique_by_key.h",
+        "cuda/include/thrust/system/cuda/detail/util.h",
+        "cuda/include/thrust/system/cuda/detail/vector.inl",
+        "cuda/include/thrust/system/cuda/error.h",
+        "cuda/include/thrust/system/cuda/execution_policy.h",
+        "cuda/include/thrust/system/cuda/experimental/pinned_allocator.h",
+        "cuda/include/thrust/system/cuda/memory.h",
+        "cuda/include/thrust/system/cuda/vector.h",
+        "cuda/include/thrust/system/detail/adl/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/adl/assign_value.h",
+        "cuda/include/thrust/system/detail/adl/binary_search.h",
+        "cuda/include/thrust/system/detail/adl/copy.h",
+        "cuda/include/thrust/system/detail/adl/copy_if.h",
+        "cuda/include/thrust/system/detail/adl/count.h",
+        "cuda/include/thrust/system/detail/adl/equal.h",
+        "cuda/include/thrust/system/detail/adl/extrema.h",
+        "cuda/include/thrust/system/detail/adl/fill.h",
+        "cuda/include/thrust/system/detail/adl/find.h",
+        "cuda/include/thrust/system/detail/adl/for_each.h",
+        "cuda/include/thrust/system/detail/adl/gather.h",
+        "cuda/include/thrust/system/detail/adl/generate.h",
+        "cuda/include/thrust/system/detail/adl/get_value.h",
+        "cuda/include/thrust/system/detail/adl/inner_product.h",
+        "cuda/include/thrust/system/detail/adl/iter_swap.h",
+        "cuda/include/thrust/system/detail/adl/logical.h",
+        "cuda/include/thrust/system/detail/adl/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/adl/merge.h",
+        "cuda/include/thrust/system/detail/adl/mismatch.h",
+        "cuda/include/thrust/system/detail/adl/partition.h",
+        "cuda/include/thrust/system/detail/adl/reduce.h",
+        "cuda/include/thrust/system/detail/adl/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/adl/remove.h",
+        "cuda/include/thrust/system/detail/adl/replace.h",
+        "cuda/include/thrust/system/detail/adl/reverse.h",
+        "cuda/include/thrust/system/detail/adl/scan.h",
+        "cuda/include/thrust/system/detail/adl/scan_by_key.h",
+        "cuda/include/thrust/system/detail/adl/scatter.h",
+        "cuda/include/thrust/system/detail/adl/sequence.h",
+        "cuda/include/thrust/system/detail/adl/set_operations.h",
+        "cuda/include/thrust/system/detail/adl/sort.h",
+        "cuda/include/thrust/system/detail/adl/swap_ranges.h",
+        "cuda/include/thrust/system/detail/adl/tabulate.h",
+        "cuda/include/thrust/system/detail/adl/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/adl/transform.h",
+        "cuda/include/thrust/system/detail/adl/transform_reduce.h",
+        "cuda/include/thrust/system/detail/adl/transform_scan.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/adl/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/adl/unique.h",
+        "cuda/include/thrust/system/detail/adl/unique_by_key.h",
+        "cuda/include/thrust/system/detail/bad_alloc.h",
+        "cuda/include/thrust/system/detail/errno.h",
+        "cuda/include/thrust/system/detail/error_category.inl",
+        "cuda/include/thrust/system/detail/error_code.inl",
+        "cuda/include/thrust/system/detail/error_condition.inl",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/generic/adjacent_difference.inl",
+        "cuda/include/thrust/system/detail/generic/advance.h",
+        "cuda/include/thrust/system/detail/generic/advance.inl",
+        "cuda/include/thrust/system/detail/generic/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/copy.h",
+        "cuda/include/thrust/system/detail/generic/copy.inl",
+        "cuda/include/thrust/system/detail/generic/copy_if.h",
+        "cuda/include/thrust/system/detail/generic/copy_if.inl",
+        "cuda/include/thrust/system/detail/generic/count.h",
+        "cuda/include/thrust/system/detail/generic/count.inl",
+        "cuda/include/thrust/system/detail/generic/distance.h",
+        "cuda/include/thrust/system/detail/generic/distance.inl",
+        "cuda/include/thrust/system/detail/generic/equal.h",
+        "cuda/include/thrust/system/detail/generic/equal.inl",
+        "cuda/include/thrust/system/detail/generic/extrema.h",
+        "cuda/include/thrust/system/detail/generic/extrema.inl",
+        "cuda/include/thrust/system/detail/generic/fill.h",
+        "cuda/include/thrust/system/detail/generic/find.h",
+        "cuda/include/thrust/system/detail/generic/find.inl",
+        "cuda/include/thrust/system/detail/generic/for_each.h",
+        "cuda/include/thrust/system/detail/generic/gather.h",
+        "cuda/include/thrust/system/detail/generic/gather.inl",
+        "cuda/include/thrust/system/detail/generic/generate.h",
+        "cuda/include/thrust/system/detail/generic/generate.inl",
+        "cuda/include/thrust/system/detail/generic/inner_product.h",
+        "cuda/include/thrust/system/detail/generic/inner_product.inl",
+        "cuda/include/thrust/system/detail/generic/logical.h",
+        "cuda/include/thrust/system/detail/generic/memory.h",
+        "cuda/include/thrust/system/detail/generic/memory.inl",
+        "cuda/include/thrust/system/detail/generic/merge.h",
+        "cuda/include/thrust/system/detail/generic/merge.inl",
+        "cuda/include/thrust/system/detail/generic/mismatch.h",
+        "cuda/include/thrust/system/detail/generic/mismatch.inl",
+        "cuda/include/thrust/system/detail/generic/partition.h",
+        "cuda/include/thrust/system/detail/generic/partition.inl",
+        "cuda/include/thrust/system/detail/generic/reduce.h",
+        "cuda/include/thrust/system/detail/generic/reduce.inl",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/generic/reduce_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/remove.h",
+        "cuda/include/thrust/system/detail/generic/remove.inl",
+        "cuda/include/thrust/system/detail/generic/replace.h",
+        "cuda/include/thrust/system/detail/generic/replace.inl",
+        "cuda/include/thrust/system/detail/generic/reverse.h",
+        "cuda/include/thrust/system/detail/generic/reverse.inl",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.h",
+        "cuda/include/thrust/system/detail/generic/scalar/binary_search.inl",
+        "cuda/include/thrust/system/detail/generic/scan.h",
+        "cuda/include/thrust/system/detail/generic/scan.inl",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.h",
+        "cuda/include/thrust/system/detail/generic/scan_by_key.inl",
+        "cuda/include/thrust/system/detail/generic/scatter.h",
+        "cuda/include/thrust/system/detail/generic/scatter.inl",
+        "cuda/include/thrust/system/detail/generic/select_system.h",
+        "cuda/include/thrust/system/detail/generic/sequence.h",
+        "cuda/include/thrust/system/detail/generic/sequence.inl",
+        "cuda/include/thrust/system/detail/generic/set_operations.h",
+        "cuda/include/thrust/system/detail/generic/set_operations.inl",
+        "cuda/include/thrust/system/detail/generic/sort.h",
+        "cuda/include/thrust/system/detail/generic/sort.inl",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.h",
+        "cuda/include/thrust/system/detail/generic/swap_ranges.inl",
+        "cuda/include/thrust/system/detail/generic/tabulate.h",
+        "cuda/include/thrust/system/detail/generic/tabulate.inl",
+        "cuda/include/thrust/system/detail/generic/tag.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/generic/temporary_buffer.inl",
+        "cuda/include/thrust/system/detail/generic/transform.h",
+        "cuda/include/thrust/system/detail/generic/transform.inl",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.h",
+        "cuda/include/thrust/system/detail/generic/transform_reduce.inl",
+        "cuda/include/thrust/system/detail/generic/transform_scan.h",
+        "cuda/include/thrust/system/detail/generic/transform_scan.inl",
+        "cuda/include/thrust/system/detail/generic/type_traits.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_copy.inl",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/generic/uninitialized_fill.inl",
+        "cuda/include/thrust/system/detail/generic/unique.h",
+        "cuda/include/thrust/system/detail/generic/unique.inl",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.h",
+        "cuda/include/thrust/system/detail/generic/unique_by_key.inl",
+        "cuda/include/thrust/system/detail/internal/decompose.h",
+        "cuda/include/thrust/system/detail/sequential/adjacent_difference.h",
+        "cuda/include/thrust/system/detail/sequential/assign_value.h",
+        "cuda/include/thrust/system/detail/sequential/binary_search.h",
+        "cuda/include/thrust/system/detail/sequential/copy.h",
+        "cuda/include/thrust/system/detail/sequential/copy.inl",
+        "cuda/include/thrust/system/detail/sequential/copy_backward.h",
+        "cuda/include/thrust/system/detail/sequential/copy_if.h",
+        "cuda/include/thrust/system/detail/sequential/count.h",
+        "cuda/include/thrust/system/detail/sequential/equal.h",
+        "cuda/include/thrust/system/detail/sequential/execution_policy.h",
+        "cuda/include/thrust/system/detail/sequential/extrema.h",
+        "cuda/include/thrust/system/detail/sequential/fill.h",
+        "cuda/include/thrust/system/detail/sequential/find.h",
+        "cuda/include/thrust/system/detail/sequential/for_each.h",
+        "cuda/include/thrust/system/detail/sequential/gather.h",
+        "cuda/include/thrust/system/detail/sequential/general_copy.h",
+        "cuda/include/thrust/system/detail/sequential/generate.h",
+        "cuda/include/thrust/system/detail/sequential/get_value.h",
+        "cuda/include/thrust/system/detail/sequential/inner_product.h",
+        "cuda/include/thrust/system/detail/sequential/insertion_sort.h",
+        "cuda/include/thrust/system/detail/sequential/iter_swap.h",
+        "cuda/include/thrust/system/detail/sequential/logical.h",
+        "cuda/include/thrust/system/detail/sequential/malloc_and_free.h",
+        "cuda/include/thrust/system/detail/sequential/merge.h",
+        "cuda/include/thrust/system/detail/sequential/merge.inl",
+        "cuda/include/thrust/system/detail/sequential/mismatch.h",
+        "cuda/include/thrust/system/detail/sequential/partition.h",
+        "cuda/include/thrust/system/detail/sequential/reduce.h",
+        "cuda/include/thrust/system/detail/sequential/reduce_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/remove.h",
+        "cuda/include/thrust/system/detail/sequential/replace.h",
+        "cuda/include/thrust/system/detail/sequential/reverse.h",
+        "cuda/include/thrust/system/detail/sequential/scan.h",
+        "cuda/include/thrust/system/detail/sequential/scan_by_key.h",
+        "cuda/include/thrust/system/detail/sequential/scatter.h",
+        "cuda/include/thrust/system/detail/sequential/sequence.h",
+        "cuda/include/thrust/system/detail/sequential/set_operations.h",
+        "cuda/include/thrust/system/detail/sequential/sort.h",
+        "cuda/include/thrust/system/detail/sequential/sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.h",
+        "cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl",
+        "cuda/include/thrust/system/detail/sequential/swap_ranges.h",
+        "cuda/include/thrust/system/detail/sequential/tabulate.h",
+        "cuda/include/thrust/system/detail/sequential/temporary_buffer.h",
+        "cuda/include/thrust/system/detail/sequential/transform.h",
+        "cuda/include/thrust/system/detail/sequential/transform_reduce.h",
+        "cuda/include/thrust/system/detail/sequential/transform_scan.h",
+        "cuda/include/thrust/system/detail/sequential/trivial_copy.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_copy.h",
+        "cuda/include/thrust/system/detail/sequential/uninitialized_fill.h",
+        "cuda/include/thrust/system/detail/sequential/unique.h",
+        "cuda/include/thrust/system/detail/sequential/unique_by_key.h",
+        "cuda/include/thrust/system/detail/system_error.inl",
+        "cuda/include/thrust/system/error_code.h",
+        "cuda/include/thrust/system/omp/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/omp/detail/assign_value.h",
+        "cuda/include/thrust/system/omp/detail/binary_search.h",
+        "cuda/include/thrust/system/omp/detail/copy.h",
+        "cuda/include/thrust/system/omp/detail/copy.inl",
+        "cuda/include/thrust/system/omp/detail/copy_if.h",
+        "cuda/include/thrust/system/omp/detail/copy_if.inl",
+        "cuda/include/thrust/system/omp/detail/count.h",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.h",
+        "cuda/include/thrust/system/omp/detail/default_decomposition.inl",
+        "cuda/include/thrust/system/omp/detail/equal.h",
+        "cuda/include/thrust/system/omp/detail/execution_policy.h",
+        "cuda/include/thrust/system/omp/detail/extrema.h",
+        "cuda/include/thrust/system/omp/detail/fill.h",
+        "cuda/include/thrust/system/omp/detail/find.h",
+        "cuda/include/thrust/system/omp/detail/for_each.h",
+        "cuda/include/thrust/system/omp/detail/for_each.inl",
+        "cuda/include/thrust/system/omp/detail/gather.h",
+        "cuda/include/thrust/system/omp/detail/generate.h",
+        "cuda/include/thrust/system/omp/detail/get_value.h",
+        "cuda/include/thrust/system/omp/detail/inner_product.h",
+        "cuda/include/thrust/system/omp/detail/iter_swap.h",
+        "cuda/include/thrust/system/omp/detail/logical.h",
+        "cuda/include/thrust/system/omp/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/omp/detail/memory.inl",
+        "cuda/include/thrust/system/omp/detail/merge.h",
+        "cuda/include/thrust/system/omp/detail/mismatch.h",
+        "cuda/include/thrust/system/omp/detail/par.h",
+        "cuda/include/thrust/system/omp/detail/partition.h",
+        "cuda/include/thrust/system/omp/detail/partition.inl",
+        "cuda/include/thrust/system/omp/detail/reduce.h",
+        "cuda/include/thrust/system/omp/detail/reduce.inl",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/omp/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/omp/detail/reduce_intervals.inl",
+        "cuda/include/thrust/system/omp/detail/remove.h",
+        "cuda/include/thrust/system/omp/detail/remove.inl",
+        "cuda/include/thrust/system/omp/detail/replace.h",
+        "cuda/include/thrust/system/omp/detail/reverse.h",
+        "cuda/include/thrust/system/omp/detail/scan.h",
+        "cuda/include/thrust/system/omp/detail/scan_by_key.h",
+        "cuda/include/thrust/system/omp/detail/scatter.h",
+        "cuda/include/thrust/system/omp/detail/sequence.h",
+        "cuda/include/thrust/system/omp/detail/set_operations.h",
+        "cuda/include/thrust/system/omp/detail/sort.h",
+        "cuda/include/thrust/system/omp/detail/sort.inl",
+        "cuda/include/thrust/system/omp/detail/swap_ranges.h",
+        "cuda/include/thrust/system/omp/detail/tabulate.h",
+        "cuda/include/thrust/system/omp/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/omp/detail/transform.h",
+        "cuda/include/thrust/system/omp/detail/transform_reduce.h",
+        "cuda/include/thrust/system/omp/detail/transform_scan.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/omp/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/omp/detail/unique.h",
+        "cuda/include/thrust/system/omp/detail/unique.inl",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.h",
+        "cuda/include/thrust/system/omp/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/omp/detail/vector.inl",
+        "cuda/include/thrust/system/omp/execution_policy.h",
+        "cuda/include/thrust/system/omp/memory.h",
+        "cuda/include/thrust/system/omp/vector.h",
+        "cuda/include/thrust/system/system_error.h",
+        "cuda/include/thrust/system/tbb/detail/adjacent_difference.h",
+        "cuda/include/thrust/system/tbb/detail/assign_value.h",
+        "cuda/include/thrust/system/tbb/detail/binary_search.h",
+        "cuda/include/thrust/system/tbb/detail/copy.h",
+        "cuda/include/thrust/system/tbb/detail/copy.inl",
+        "cuda/include/thrust/system/tbb/detail/copy_if.h",
+        "cuda/include/thrust/system/tbb/detail/copy_if.inl",
+        "cuda/include/thrust/system/tbb/detail/count.h",
+        "cuda/include/thrust/system/tbb/detail/equal.h",
+        "cuda/include/thrust/system/tbb/detail/execution_policy.h",
+        "cuda/include/thrust/system/tbb/detail/extrema.h",
+        "cuda/include/thrust/system/tbb/detail/fill.h",
+        "cuda/include/thrust/system/tbb/detail/find.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.h",
+        "cuda/include/thrust/system/tbb/detail/for_each.inl",
+        "cuda/include/thrust/system/tbb/detail/gather.h",
+        "cuda/include/thrust/system/tbb/detail/generate.h",
+        "cuda/include/thrust/system/tbb/detail/get_value.h",
+        "cuda/include/thrust/system/tbb/detail/inner_product.h",
+        "cuda/include/thrust/system/tbb/detail/iter_swap.h",
+        "cuda/include/thrust/system/tbb/detail/logical.h",
+        "cuda/include/thrust/system/tbb/detail/malloc_and_free.h",
+        "cuda/include/thrust/system/tbb/detail/memory.inl",
+        "cuda/include/thrust/system/tbb/detail/merge.h",
+        "cuda/include/thrust/system/tbb/detail/merge.inl",
+        "cuda/include/thrust/system/tbb/detail/mismatch.h",
+        "cuda/include/thrust/system/tbb/detail/par.h",
+        "cuda/include/thrust/system/tbb/detail/partition.h",
+        "cuda/include/thrust/system/tbb/detail/partition.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce.h",
+        "cuda/include/thrust/system/tbb/detail/reduce.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/reduce_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/reduce_intervals.h",
+        "cuda/include/thrust/system/tbb/detail/remove.h",
+        "cuda/include/thrust/system/tbb/detail/remove.inl",
+        "cuda/include/thrust/system/tbb/detail/replace.h",
+        "cuda/include/thrust/system/tbb/detail/reverse.h",
+        "cuda/include/thrust/system/tbb/detail/scan.h",
+        "cuda/include/thrust/system/tbb/detail/scan.inl",
+        "cuda/include/thrust/system/tbb/detail/scan_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/scatter.h",
+        "cuda/include/thrust/system/tbb/detail/sequence.h",
+        "cuda/include/thrust/system/tbb/detail/set_operations.h",
+        "cuda/include/thrust/system/tbb/detail/sort.h",
+        "cuda/include/thrust/system/tbb/detail/sort.inl",
+        "cuda/include/thrust/system/tbb/detail/swap_ranges.h",
+        "cuda/include/thrust/system/tbb/detail/tabulate.h",
+        "cuda/include/thrust/system/tbb/detail/temporary_buffer.h",
+        "cuda/include/thrust/system/tbb/detail/transform.h",
+        "cuda/include/thrust/system/tbb/detail/transform_reduce.h",
+        "cuda/include/thrust/system/tbb/detail/transform_scan.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_copy.h",
+        "cuda/include/thrust/system/tbb/detail/uninitialized_fill.h",
+        "cuda/include/thrust/system/tbb/detail/unique.h",
+        "cuda/include/thrust/system/tbb/detail/unique.inl",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.h",
+        "cuda/include/thrust/system/tbb/detail/unique_by_key.inl",
+        "cuda/include/thrust/system/tbb/detail/vector.inl",
+        "cuda/include/thrust/system/tbb/execution_policy.h",
+        "cuda/include/thrust/system/tbb/memory.h",
+        "cuda/include/thrust/system/tbb/vector.h",
+        "cuda/include/thrust/system_error.h",
+        "cuda/include/thrust/tabulate.h",
+        "cuda/include/thrust/transform.h",
+        "cuda/include/thrust/transform_reduce.h",
+        "cuda/include/thrust/transform_scan.h",
+        "cuda/include/thrust/tuple.h",
+        "cuda/include/thrust/uninitialized_copy.h",
+        "cuda/include/thrust/uninitialized_fill.h",
+        "cuda/include/thrust/unique.h",
+        "cuda/include/thrust/version.h",
+        "cuda/include/vector_functions.h",
+        "cuda/include/vector_functions.hpp",
+        "cuda/include/vector_types.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-10.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-10.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-10.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-10.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-10.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-10.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-10.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-10.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-10.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-10.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-10.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-10.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-10.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-10.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-10.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-10.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-10.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-10.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-10.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-10.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-10.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_egl_interop.h" "$(@D)/cuda/include/cuda_egl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-10.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-10.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-10.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-10.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-10.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-10.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-10.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-10.0/include/cudart_platform.h" "$(@D)/cuda/include/cudart_platform.h" && cp -f "/usr/local/cuda-10.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-10.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-10.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-10.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-10.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-10.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-10.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-10.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-10.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-10.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-10.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-10.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-10.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-10.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-10.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-10.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-10.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-10.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-10.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-10.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-10.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-10.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-10.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-10.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-10.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-10.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-10.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-10.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-10.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-10.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-10.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-10.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-10.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-10.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-10.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-10.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-10.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-10.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-10.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-10.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-10.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-10.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-10.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-10.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-10.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-10.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-10.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-10.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-10.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-10.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-10.0/include/nvjpeg.h" "$(@D)/cuda/include/nvjpeg.h" && cp -f "/usr/local/cuda-10.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-10.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExt.h" "$(@D)/cuda/include/nvtx3/nvToolsExt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCuda.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvtx3/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtOpenCL.h" "$(@D)/cuda/include/nvtx3/nvToolsExtOpenCL.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvToolsExtSync.h" "$(@D)/cuda/include/nvtx3/nvToolsExtSync.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImpl.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImpl.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCore.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCore.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCudaRt_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplCuda_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplOpenCL_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxImplSync_v3.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInit.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInit.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDecls.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDecls.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxInitDefs.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxInitDefs.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxLinkOnce.h" && cp -f "/usr/local/cuda-10.0/include/nvtx3/nvtxDetail/nvtxTypes.h" "$(@D)/cuda/include/nvtx3/nvtxDetail/nvtxTypes.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-10.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-10.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-10.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-10.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-10.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/alignment.h" "$(@D)/cuda/include/thrust/detail/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/preprocessor.h" "$(@D)/cuda/include/thrust/detail/preprocessor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-10.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-10.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-10.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-10.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-10.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-10.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-10.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-10.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-10.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-10.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-10.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+   """,
+)
+
+genrule(
+    name = "cuda-nvvm",
+    outs = [
+        "cuda/nvvm/libdevice/libdevice.10.bc",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+   """,
+)
+
+genrule(
+    name = "cuda-extras",
+    outs = [
+        "cuda/extras/CUPTI/include/GL/gl.h",
+        "cuda/extras/CUPTI/include/GL/glew.h",
+        "cuda/extras/CUPTI/include/GL/glext.h",
+        "cuda/extras/CUPTI/include/GL/glu.h",
+        "cuda/extras/CUPTI/include/GL/glut.h",
+        "cuda/extras/CUPTI/include/GL/glx.h",
+        "cuda/extras/CUPTI/include/GL/glxext.h",
+        "cuda/extras/CUPTI/include/GL/wglew.h",
+        "cuda/extras/CUPTI/include/GL/wglext.h",
+        "cuda/extras/CUPTI/include/cuda_stdint.h",
+        "cuda/extras/CUPTI/include/cupti.h",
+        "cuda/extras/CUPTI/include/cupti_activity.h",
+        "cuda/extras/CUPTI/include/cupti_callbacks.h",
+        "cuda/extras/CUPTI/include/cupti_driver_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_events.h",
+        "cuda/extras/CUPTI/include/cupti_metrics.h",
+        "cuda/extras/CUPTI/include/cupti_nvtx_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_result.h",
+        "cuda/extras/CUPTI/include/cupti_runtime_cbid.h",
+        "cuda/extras/CUPTI/include/cupti_version.h",
+        "cuda/extras/CUPTI/include/generated_cudaGL_meta.h",
+        "cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h",
+        "cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h",
+        "cuda/extras/CUPTI/include/generated_nvtx_meta.h",
+        "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
+        "cuda/extras/CUPTI/include/openmp/cupti_openmp.h",
+        "cuda/extras/CUPTI/include/openmp/ompt.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/cupti_openmp.h" "$(@D)/cuda/extras/CUPTI/include/openmp/cupti_openmp.h" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/include/openmp/ompt.h" "$(@D)/cuda/extras/CUPTI/include/openmp/ompt.h"
+   """,
+)
+
+genrule(
+    name = "cuda-lib",
+    outs = [
+        "cuda/lib/libcuda.so",
+        "cuda/lib/libcudart.so.10.0",
+        "cuda/lib/libcudart_static.a",
+        "cuda/lib/libcublas.so.10.0",
+        "cuda/lib/libcusolver.so.10.0",
+        "cuda/lib/libcurand.so.10.0",
+        "cuda/lib/libcufft.so.10.0",
+        "cuda/lib/libcudnn.so.7",
+        "cuda/lib/libcupti.so.10.0",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart.so.10.0.130" "$(@D)/cuda/lib/libcudart.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcublas.so.10.0.130" "$(@D)/cuda/lib/libcublas.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcusolver.so.10.0.130" "$(@D)/cuda/lib/libcusolver.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcurand.so.10.0.130" "$(@D)/cuda/lib/libcurand.so.10.0" && cp -f "/usr/local/cuda-10.0/targets/x86_64-linux/lib/libcufft.so.10.0.145" "$(@D)/cuda/lib/libcufft.so.10.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.3.1" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-10.0/extras/CUPTI/lib64/libcupti.so.10.0.130" "$(@D)/cuda/lib/libcupti.so.10.0"
+   """,
+)
+
+genrule(
+    name = "cudnn-include",
+    outs = [
+        "cuda/include/cudnn.h",
+    ],
+    cmd = """
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+   """,
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
new file mode 100755
index 0000000000000000000000000000000000000000..a53c891d8bba1b80a880ddd9c16091db27861a8d
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/build_defs.bzl
@@ -0,0 +1,31 @@
+# Macros for building CUDA code.
+def if_cuda(if_true, if_false = []):
+    """Shorthand for select()'ing on whether we're building with CUDA.
+
+    Returns a select statement which evaluates to if_true if we're building
+    with CUDA enabled.  Otherwise, the select statement evaluates to if_false.
+
+    """
+    return select({
+        "@local_config_cuda//cuda:using_nvcc": if_true,
+        "@local_config_cuda//cuda:using_clang": if_true,
+        "//conditions:default": if_false,
+    })
+
+def cuda_default_copts():
+    """Default options for all CUDA compilations."""
+    return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
+
+def cuda_is_configured():
+    """Returns true if CUDA was enabled during the configure process."""
+    return True
+
+def if_cuda_is_configured(x):
+    """Tests if the CUDA was enabled during the configure process.
+
+    Unlike if_cuda(), this does not require that we are building with
+    --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
+    """
+    if cuda_is_configured():
+        return x
+    return []
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
new file mode 100755
index 0000000000000000000000000000000000000000..0934618e0b538ab0db2a969870c85aa9c4053130
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda10.0-cudnn7/cuda/cuda/cuda_config.h
@@ -0,0 +1,26 @@
+/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+==============================================================================*/
+
+#ifndef CUDA_CUDA_CONFIG_H_
+#define CUDA_CUDA_CONFIG_H_
+
+#define TF_CUDA_CAPABILITIES CudaVersion("3.0")
+
+#define TF_CUDA_VERSION "10.0"
+#define TF_CUDNN_VERSION "7"
+
+#define TF_CUDA_TOOLKIT_PATH "/usr/local/cuda-10.0"
+
+#endif  // CUDA_CUDA_CONFIG_H_
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
index 247e0ace2432641c4a834ee26cfd4237f532d1ef..c6930904b564bf2cce70b484a0e7b0759f13b7c9 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/BUILD
@@ -1188,7 +1188,7 @@ genrule(
         "cuda/include/vector_types.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/include/CL/cl.h" "$(@D)/cuda/include/CL/cl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl.hpp" "$(@D)/cuda/include/CL/cl.hpp" && cp -f "/usr/local/cuda-9.0/include/CL/cl_egl.h" "$(@D)/cuda/include/CL/cl_egl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_ext.h" "$(@D)/cuda/include/CL/cl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl.h" "$(@D)/cuda/include/CL/cl_gl.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_gl_ext.h" "$(@D)/cuda/include/CL/cl_gl_ext.h" && cp -f "/usr/local/cuda-9.0/include/CL/cl_platform.h" "$(@D)/cuda/include/CL/cl_platform.h" && cp -f "/usr/local/cuda-9.0/include/CL/opencl.h" "$(@D)/cuda/include/CL/opencl.h" && cp -f "/usr/local/cuda-9.0/include/builtin_types.h" "$(@D)/cuda/include/builtin_types.h" && cp -f "/usr/local/cuda-9.0/include/channel_descriptor.h" "$(@D)/cuda/include/channel_descriptor.h" && cp -f "/usr/local/cuda-9.0/include/common_functions.h" "$(@D)/cuda/include/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups.h" "$(@D)/cuda/include/cooperative_groups.h" && cp -f "/usr/local/cuda-9.0/include/cooperative_groups_helpers.h" "$(@D)/cuda/include/cooperative_groups_helpers.h" && cp -f "/usr/local/cuda-9.0/include/crt/common_functions.h" "$(@D)/cuda/include/crt/common_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.h" "$(@D)/cuda/include/crt/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_double_functions.hpp" "$(@D)/cuda/include/crt/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.h" "$(@D)/cuda/include/crt/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/device_functions.hpp" "$(@D)/cuda/include/crt/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/func_macro.h" "$(@D)/cuda/include/crt/func_macro.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_config.h" "$(@D)/cuda/include/crt/host_config.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_defines.h" "$(@D)/cuda/include/crt/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/crt/host_runtime.h" "$(@D)/cuda/include/crt/host_runtime.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.h" "$(@D)/cuda/include/crt/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/crt/math_functions.hpp" "$(@D)/cuda/include/crt/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/mma.h" "$(@D)/cuda/include/crt/mma.h" && cp -f "/usr/local/cuda-9.0/include/crt/mma.hpp" "$(@D)/cuda/include/crt/mma.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/nvfunctional" "$(@D)/cuda/include/crt/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.h" "$(@D)/cuda/include/crt/sm_70_rt.h" && cp -f "/usr/local/cuda-9.0/include/crt/sm_70_rt.hpp" "$(@D)/cuda/include/crt/sm_70_rt.hpp" && cp -f "/usr/local/cuda-9.0/include/crt/storage_class.h" "$(@D)/cuda/include/crt/storage_class.h" && cp -f "/usr/local/cuda-9.0/include/cuComplex.h" "$(@D)/cuda/include/cuComplex.h" && cp -f "/usr/local/cuda-9.0/include/cublas.h" "$(@D)/cuda/include/cublas.h" && cp -f "/usr/local/cuda-9.0/include/cublasXt.h" "$(@D)/cuda/include/cublasXt.h" && cp -f "/usr/local/cuda-9.0/include/cublas_api.h" "$(@D)/cuda/include/cublas_api.h" && cp -f "/usr/local/cuda-9.0/include/cublas_v2.h" "$(@D)/cuda/include/cublas_v2.h" && cp -f "/usr/local/cuda-9.0/include/cuda.h" "$(@D)/cuda/include/cuda.h" && cp -f "/usr/local/cuda-9.0/include/cudaEGL.h" "$(@D)/cuda/include/cudaEGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaGL.h" "$(@D)/cuda/include/cudaGL.h" && cp -f "/usr/local/cuda-9.0/include/cudaProfiler.h" "$(@D)/cuda/include/cudaProfiler.h" && cp -f "/usr/local/cuda-9.0/include/cudaVDPAU.h" "$(@D)/cuda/include/cudaVDPAU.h" && cp -f "/usr/local/cuda-9.0/include/cuda_device_runtime_api.h" "$(@D)/cuda/include/cuda_device_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.h" "$(@D)/cuda/include/cuda_fp16.h" && cp -f "/usr/local/cuda-9.0/include/cuda_fp16.hpp" "$(@D)/cuda/include/cuda_fp16.hpp" && cp -f "/usr/local/cuda-9.0/include/cuda_gl_interop.h" "$(@D)/cuda/include/cuda_gl_interop.h" && cp -f "/usr/local/cuda-9.0/include/cuda_occupancy.h" "$(@D)/cuda/include/cuda_occupancy.h" && cp -f "/usr/local/cuda-9.0/include/cuda_profiler_api.h" "$(@D)/cuda/include/cuda_profiler_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime.h" "$(@D)/cuda/include/cuda_runtime.h" && cp -f "/usr/local/cuda-9.0/include/cuda_runtime_api.h" "$(@D)/cuda/include/cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/cuda_surface_types.h" "$(@D)/cuda/include/cuda_surface_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_texture_types.h" "$(@D)/cuda/include/cuda_texture_types.h" && cp -f "/usr/local/cuda-9.0/include/cuda_vdpau_interop.h" "$(@D)/cuda/include/cuda_vdpau_interop.h" && cp -f "/usr/local/cuda-9.0/include/cudalibxt.h" "$(@D)/cuda/include/cudalibxt.h" && cp -f "/usr/local/cuda-9.0/include/cufft.h" "$(@D)/cuda/include/cufft.h" && cp -f "/usr/local/cuda-9.0/include/cufftXt.h" "$(@D)/cuda/include/cufftXt.h" && cp -f "/usr/local/cuda-9.0/include/cufftw.h" "$(@D)/cuda/include/cufftw.h" && cp -f "/usr/local/cuda-9.0/include/curand.h" "$(@D)/cuda/include/curand.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete.h" "$(@D)/cuda/include/curand_discrete.h" && cp -f "/usr/local/cuda-9.0/include/curand_discrete2.h" "$(@D)/cuda/include/curand_discrete2.h" && cp -f "/usr/local/cuda-9.0/include/curand_globals.h" "$(@D)/cuda/include/curand_globals.h" && cp -f "/usr/local/cuda-9.0/include/curand_kernel.h" "$(@D)/cuda/include/curand_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_lognormal.h" "$(@D)/cuda/include/curand_lognormal.h" && cp -f "/usr/local/cuda-9.0/include/curand_mrg32k3a.h" "$(@D)/cuda/include/curand_mrg32k3a.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32.h" "$(@D)/cuda/include/curand_mtgp32.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_host.h" "$(@D)/cuda/include/curand_mtgp32_host.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32_kernel.h" "$(@D)/cuda/include/curand_mtgp32_kernel.h" && cp -f "/usr/local/cuda-9.0/include/curand_mtgp32dc_p_11213.h" "$(@D)/cuda/include/curand_mtgp32dc_p_11213.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal.h" "$(@D)/cuda/include/curand_normal.h" && cp -f "/usr/local/cuda-9.0/include/curand_normal_static.h" "$(@D)/cuda/include/curand_normal_static.h" && cp -f "/usr/local/cuda-9.0/include/curand_philox4x32_x.h" "$(@D)/cuda/include/curand_philox4x32_x.h" && cp -f "/usr/local/cuda-9.0/include/curand_poisson.h" "$(@D)/cuda/include/curand_poisson.h" && cp -f "/usr/local/cuda-9.0/include/curand_precalc.h" "$(@D)/cuda/include/curand_precalc.h" && cp -f "/usr/local/cuda-9.0/include/curand_uniform.h" "$(@D)/cuda/include/curand_uniform.h" && cp -f "/usr/local/cuda-9.0/include/cusolverDn.h" "$(@D)/cuda/include/cusolverDn.h" && cp -f "/usr/local/cuda-9.0/include/cusolverRf.h" "$(@D)/cuda/include/cusolverRf.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp.h" "$(@D)/cuda/include/cusolverSp.h" && cp -f "/usr/local/cuda-9.0/include/cusolverSp_LOWLEVEL_PREVIEW.h" "$(@D)/cuda/include/cusolverSp_LOWLEVEL_PREVIEW.h" && cp -f "/usr/local/cuda-9.0/include/cusolver_common.h" "$(@D)/cuda/include/cusolver_common.h" && cp -f "/usr/local/cuda-9.0/include/cusparse.h" "$(@D)/cuda/include/cusparse.h" && cp -f "/usr/local/cuda-9.0/include/cusparse_v2.h" "$(@D)/cuda/include/cusparse_v2.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.h" "$(@D)/cuda/include/device_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_atomic_functions.hpp" "$(@D)/cuda/include/device_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.h" "$(@D)/cuda/include/device_double_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_double_functions.hpp" "$(@D)/cuda/include/device_double_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions.h" "$(@D)/cuda/include/device_functions.h" && cp -f "/usr/local/cuda-9.0/include/device_functions.hpp" "$(@D)/cuda/include/device_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/device_functions_decls.h" "$(@D)/cuda/include/device_functions_decls.h" && cp -f "/usr/local/cuda-9.0/include/device_launch_parameters.h" "$(@D)/cuda/include/device_launch_parameters.h" && cp -f "/usr/local/cuda-9.0/include/device_types.h" "$(@D)/cuda/include/device_types.h" && cp -f "/usr/local/cuda-9.0/include/driver_functions.h" "$(@D)/cuda/include/driver_functions.h" && cp -f "/usr/local/cuda-9.0/include/driver_types.h" "$(@D)/cuda/include/driver_types.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda.h" "$(@D)/cuda/include/dynlink_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuda_cuda.h" "$(@D)/cuda/include/dynlink_cuda_cuda.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_cuviddec.h" "$(@D)/cuda/include/dynlink_cuviddec.h" && cp -f "/usr/local/cuda-9.0/include/dynlink_nvcuvid.h" "$(@D)/cuda/include/dynlink_nvcuvid.h" && cp -f "/usr/local/cuda-9.0/include/fatBinaryCtl.h" "$(@D)/cuda/include/fatBinaryCtl.h" && cp -f "/usr/local/cuda-9.0/include/fatbinary.h" "$(@D)/cuda/include/fatbinary.h" && cp -f "/usr/local/cuda-9.0/include/host_config.h" "$(@D)/cuda/include/host_config.h" && cp -f "/usr/local/cuda-9.0/include/host_defines.h" "$(@D)/cuda/include/host_defines.h" && cp -f "/usr/local/cuda-9.0/include/library_types.h" "$(@D)/cuda/include/library_types.h" && cp -f "/usr/local/cuda-9.0/include/math_constants.h" "$(@D)/cuda/include/math_constants.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.h" "$(@D)/cuda/include/math_functions.h" && cp -f "/usr/local/cuda-9.0/include/math_functions.hpp" "$(@D)/cuda/include/math_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.h" "$(@D)/cuda/include/math_functions_dbl_ptx3.h" && cp -f "/usr/local/cuda-9.0/include/math_functions_dbl_ptx3.hpp" "$(@D)/cuda/include/math_functions_dbl_ptx3.hpp" && cp -f "/usr/local/cuda-9.0/include/mma.h" "$(@D)/cuda/include/mma.h" && cp -f "/usr/local/cuda-9.0/include/npp.h" "$(@D)/cuda/include/npp.h" && cp -f "/usr/local/cuda-9.0/include/nppcore.h" "$(@D)/cuda/include/nppcore.h" && cp -f "/usr/local/cuda-9.0/include/nppdefs.h" "$(@D)/cuda/include/nppdefs.h" && cp -f "/usr/local/cuda-9.0/include/nppi.h" "$(@D)/cuda/include/nppi.h" && cp -f "/usr/local/cuda-9.0/include/nppi_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/nppi_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_color_conversion.h" "$(@D)/cuda/include/nppi_color_conversion.h" && cp -f "/usr/local/cuda-9.0/include/nppi_compression_functions.h" "$(@D)/cuda/include/nppi_compression_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_computer_vision.h" "$(@D)/cuda/include/nppi_computer_vision.h" && cp -f "/usr/local/cuda-9.0/include/nppi_data_exchange_and_initialization.h" "$(@D)/cuda/include/nppi_data_exchange_and_initialization.h" && cp -f "/usr/local/cuda-9.0/include/nppi_filtering_functions.h" "$(@D)/cuda/include/nppi_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_geometry_transforms.h" "$(@D)/cuda/include/nppi_geometry_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_linear_transforms.h" "$(@D)/cuda/include/nppi_linear_transforms.h" && cp -f "/usr/local/cuda-9.0/include/nppi_morphological_operations.h" "$(@D)/cuda/include/nppi_morphological_operations.h" && cp -f "/usr/local/cuda-9.0/include/nppi_statistics_functions.h" "$(@D)/cuda/include/nppi_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_support_functions.h" "$(@D)/cuda/include/nppi_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppi_threshold_and_compare_operations.h" "$(@D)/cuda/include/nppi_threshold_and_compare_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps.h" "$(@D)/cuda/include/npps.h" && cp -f "/usr/local/cuda-9.0/include/npps_arithmetic_and_logical_operations.h" "$(@D)/cuda/include/npps_arithmetic_and_logical_operations.h" && cp -f "/usr/local/cuda-9.0/include/npps_conversion_functions.h" "$(@D)/cuda/include/npps_conversion_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_filtering_functions.h" "$(@D)/cuda/include/npps_filtering_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_initialization.h" "$(@D)/cuda/include/npps_initialization.h" && cp -f "/usr/local/cuda-9.0/include/npps_statistics_functions.h" "$(@D)/cuda/include/npps_statistics_functions.h" && cp -f "/usr/local/cuda-9.0/include/npps_support_functions.h" "$(@D)/cuda/include/npps_support_functions.h" && cp -f "/usr/local/cuda-9.0/include/nppversion.h" "$(@D)/cuda/include/nppversion.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExt.h" "$(@D)/cuda/include/nvToolsExt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCuda.h" "$(@D)/cuda/include/nvToolsExtCuda.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtCudaRt.h" "$(@D)/cuda/include/nvToolsExtCudaRt.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtMeta.h" "$(@D)/cuda/include/nvToolsExtMeta.h" && cp -f "/usr/local/cuda-9.0/include/nvToolsExtSync.h" "$(@D)/cuda/include/nvToolsExtSync.h" && cp -f "/usr/local/cuda-9.0/include/nvblas.h" "$(@D)/cuda/include/nvblas.h" && cp -f "/usr/local/cuda-9.0/include/nvfunctional" "$(@D)/cuda/include/nvfunctional" && cp -f "/usr/local/cuda-9.0/include/nvgraph.h" "$(@D)/cuda/include/nvgraph.h" && cp -f "/usr/local/cuda-9.0/include/nvml.h" "$(@D)/cuda/include/nvml.h" && cp -f "/usr/local/cuda-9.0/include/nvrtc.h" "$(@D)/cuda/include/nvrtc.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.h" "$(@D)/cuda/include/sm_20_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_atomic_functions.hpp" "$(@D)/cuda/include/sm_20_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.h" "$(@D)/cuda/include/sm_20_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_20_intrinsics.hpp" "$(@D)/cuda/include/sm_20_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.h" "$(@D)/cuda/include/sm_30_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_30_intrinsics.hpp" "$(@D)/cuda/include/sm_30_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.h" "$(@D)/cuda/include/sm_32_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_atomic_functions.hpp" "$(@D)/cuda/include/sm_32_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.h" "$(@D)/cuda/include/sm_32_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_32_intrinsics.hpp" "$(@D)/cuda/include/sm_32_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_35_atomic_functions.h" "$(@D)/cuda/include/sm_35_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_35_intrinsics.h" "$(@D)/cuda/include/sm_35_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.h" "$(@D)/cuda/include/sm_60_atomic_functions.h" && cp -f "/usr/local/cuda-9.0/include/sm_60_atomic_functions.hpp" "$(@D)/cuda/include/sm_60_atomic_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.h" "$(@D)/cuda/include/sm_61_intrinsics.h" && cp -f "/usr/local/cuda-9.0/include/sm_61_intrinsics.hpp" "$(@D)/cuda/include/sm_61_intrinsics.hpp" && cp -f "/usr/local/cuda-9.0/include/sobol_direction_vectors.h" "$(@D)/cuda/include/sobol_direction_vectors.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.h" "$(@D)/cuda/include/surface_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_functions.hpp" "$(@D)/cuda/include/surface_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.h" "$(@D)/cuda/include/surface_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/surface_indirect_functions.hpp" "$(@D)/cuda/include/surface_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/surface_types.h" "$(@D)/cuda/include/surface_types.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.h" "$(@D)/cuda/include/texture_fetch_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_fetch_functions.hpp" "$(@D)/cuda/include/texture_fetch_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.h" "$(@D)/cuda/include/texture_indirect_functions.h" && cp -f "/usr/local/cuda-9.0/include/texture_indirect_functions.hpp" "$(@D)/cuda/include/texture_indirect_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/texture_types.h" "$(@D)/cuda/include/texture_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/adjacent_difference.h" "$(@D)/cuda/include/thrust/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/advance.h" "$(@D)/cuda/include/thrust/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/binary_search.h" "$(@D)/cuda/include/thrust/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/complex.h" "$(@D)/cuda/include/thrust/complex.h" && cp -f "/usr/local/cuda-9.0/include/thrust/copy.h" "$(@D)/cuda/include/thrust/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/count.h" "$(@D)/cuda/include/thrust/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/adjacent_difference.inl" "$(@D)/cuda/include/thrust/detail/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/advance.inl" "$(@D)/cuda/include/thrust/detail/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.h" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/allocator_traits.inl" "$(@D)/cuda/include/thrust/detail/allocator/allocator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/copy_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/copy_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/default_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/default_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.h" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/destroy_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/destroy_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.h" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/fill_construct_range.inl" "$(@D)/cuda/include/thrust/detail/allocator/fill_construct_range.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/malloc_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/malloc_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/no_throw_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/no_throw_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/tagged_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/tagged_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.h" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/allocator/temporary_allocator.inl" "$(@D)/cuda/include/thrust/detail/allocator/temporary_allocator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/binary_search.inl" "$(@D)/cuda/include/thrust/detail/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/arithmetic.h" "$(@D)/cuda/include/thrust/detail/complex/arithmetic.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/c99math.h" "$(@D)/cuda/include/thrust/detail/complex/c99math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrig.h" "$(@D)/cuda/include/thrust/detail/complex/catrig.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/catrigf.h" "$(@D)/cuda/include/thrust/detail/complex/catrigf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccosh.h" "$(@D)/cuda/include/thrust/detail/complex/ccosh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ccoshf.h" "$(@D)/cuda/include/thrust/detail/complex/ccoshf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexp.h" "$(@D)/cuda/include/thrust/detail/complex/cexp.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cexpf.h" "$(@D)/cuda/include/thrust/detail/complex/cexpf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clog.h" "$(@D)/cuda/include/thrust/detail/complex/clog.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/clogf.h" "$(@D)/cuda/include/thrust/detail/complex/clogf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/complex.inl" "$(@D)/cuda/include/thrust/detail/complex/complex.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpow.h" "$(@D)/cuda/include/thrust/detail/complex/cpow.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cpowf.h" "$(@D)/cuda/include/thrust/detail/complex/cpowf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/cproj.h" "$(@D)/cuda/include/thrust/detail/complex/cproj.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinh.h" "$(@D)/cuda/include/thrust/detail/complex/csinh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csinhf.h" "$(@D)/cuda/include/thrust/detail/complex/csinhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrt.h" "$(@D)/cuda/include/thrust/detail/complex/csqrt.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/csqrtf.h" "$(@D)/cuda/include/thrust/detail/complex/csqrtf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanh.h" "$(@D)/cuda/include/thrust/detail/complex/ctanh.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/ctanhf.h" "$(@D)/cuda/include/thrust/detail/complex/ctanhf.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/math_private.h" "$(@D)/cuda/include/thrust/detail/complex/math_private.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/complex/stream.h" "$(@D)/cuda/include/thrust/detail/complex/stream.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config.h" "$(@D)/cuda/include/thrust/detail/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler.h" "$(@D)/cuda/include/thrust/detail/config/compiler.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/compiler_fence.h" "$(@D)/cuda/include/thrust/detail/config/compiler_fence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/config.h" "$(@D)/cuda/include/thrust/detail/config/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/debug.h" "$(@D)/cuda/include/thrust/detail/config/debug.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/device_system.h" "$(@D)/cuda/include/thrust/detail/config/device_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/exec_check_disable.h" "$(@D)/cuda/include/thrust/detail/config/exec_check_disable.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/forceinline.h" "$(@D)/cuda/include/thrust/detail/config/forceinline.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/global_workarounds.h" "$(@D)/cuda/include/thrust/detail/config/global_workarounds.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_device.h" "$(@D)/cuda/include/thrust/detail/config/host_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/host_system.h" "$(@D)/cuda/include/thrust/detail/config/host_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/config/simple_defines.h" "$(@D)/cuda/include/thrust/detail/config/simple_defines.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.h" "$(@D)/cuda/include/thrust/detail/contiguous_storage.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/contiguous_storage.inl" "$(@D)/cuda/include/thrust/detail/contiguous_storage.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.h" "$(@D)/cuda/include/thrust/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy.inl" "$(@D)/cuda/include/thrust/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.h" "$(@D)/cuda/include/thrust/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/copy_if.inl" "$(@D)/cuda/include/thrust/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/count.inl" "$(@D)/cuda/include/thrust/detail/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/cstdint.h" "$(@D)/cuda/include/thrust/detail/cstdint.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_delete.inl" "$(@D)/cuda/include/thrust/detail/device_delete.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_free.inl" "$(@D)/cuda/include/thrust/detail/device_free.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_malloc.inl" "$(@D)/cuda/include/thrust/detail/device_malloc.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_new.inl" "$(@D)/cuda/include/thrust/detail/device_new.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_ptr.inl" "$(@D)/cuda/include/thrust/detail/device_ptr.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_reference.inl" "$(@D)/cuda/include/thrust/detail/device_reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/device_vector.inl" "$(@D)/cuda/include/thrust/detail/device_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/dispatch/is_trivial_copy.h" "$(@D)/cuda/include/thrust/detail/dispatch/is_trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/distance.inl" "$(@D)/cuda/include/thrust/detail/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/equal.inl" "$(@D)/cuda/include/thrust/detail/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execute_with_allocator.h" "$(@D)/cuda/include/thrust/detail/execute_with_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/execution_policy.h" "$(@D)/cuda/include/thrust/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/extrema.inl" "$(@D)/cuda/include/thrust/detail/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/fill.inl" "$(@D)/cuda/include/thrust/detail/fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/find.inl" "$(@D)/cuda/include/thrust/detail/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/for_each.inl" "$(@D)/cuda/include/thrust/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/function.h" "$(@D)/cuda/include/thrust/detail/function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional.inl" "$(@D)/cuda/include/thrust/detail/functional.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.h" "$(@D)/cuda/include/thrust/detail/functional/actor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/actor.inl" "$(@D)/cuda/include/thrust/detail/functional/actor.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/argument.h" "$(@D)/cuda/include/thrust/detail/functional/argument.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/composite.h" "$(@D)/cuda/include/thrust/detail/functional/composite.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/arithmetic_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/arithmetic_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/assignment_operator.h" "$(@D)/cuda/include/thrust/detail/functional/operators/assignment_operator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/bitwise_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/bitwise_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/compound_assignment_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/compound_assignment_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/logical_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/logical_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/operator_adaptors.h" "$(@D)/cuda/include/thrust/detail/functional/operators/operator_adaptors.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/operators/relational_operators.h" "$(@D)/cuda/include/thrust/detail/functional/operators/relational_operators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/placeholder.h" "$(@D)/cuda/include/thrust/detail/functional/placeholder.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/functional/value.h" "$(@D)/cuda/include/thrust/detail/functional/value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/gather.inl" "$(@D)/cuda/include/thrust/detail/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/generate.inl" "$(@D)/cuda/include/thrust/detail/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/get_iterator_value.h" "$(@D)/cuda/include/thrust/detail/get_iterator_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/host_vector.inl" "$(@D)/cuda/include/thrust/detail/host_vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/inner_product.inl" "$(@D)/cuda/include/thrust/detail/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_math.h" "$(@D)/cuda/include/thrust/detail/integer_math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/integer_traits.h" "$(@D)/cuda/include/thrust/detail/integer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/internal_functional.h" "$(@D)/cuda/include/thrust/detail/internal_functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/logical.inl" "$(@D)/cuda/include/thrust/detail/logical.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/merge.inl" "$(@D)/cuda/include/thrust/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/minmax.h" "$(@D)/cuda/include/thrust/detail/minmax.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mismatch.inl" "$(@D)/cuda/include/thrust/detail/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/mpl/math.h" "$(@D)/cuda/include/thrust/detail/mpl/math.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/numeric_traits.h" "$(@D)/cuda/include/thrust/detail/numeric_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/overlapped_copy.h" "$(@D)/cuda/include/thrust/detail/overlapped_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pair.inl" "$(@D)/cuda/include/thrust/detail/pair.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/partition.inl" "$(@D)/cuda/include/thrust/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.h" "$(@D)/cuda/include/thrust/detail/pointer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/pointer.inl" "$(@D)/cuda/include/thrust/detail/pointer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/head_flags.h" "$(@D)/cuda/include/thrust/detail/range/head_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/range/tail_flags.h" "$(@D)/cuda/include/thrust/detail/range/tail_flags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_pointer_cast.h" "$(@D)/cuda/include/thrust/detail/raw_pointer_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/raw_reference_cast.h" "$(@D)/cuda/include/thrust/detail/raw_reference_cast.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reduce.inl" "$(@D)/cuda/include/thrust/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.h" "$(@D)/cuda/include/thrust/detail/reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference.inl" "$(@D)/cuda/include/thrust/detail/reference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reference_forward_declaration.h" "$(@D)/cuda/include/thrust/detail/reference_forward_declaration.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/remove.inl" "$(@D)/cuda/include/thrust/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/replace.inl" "$(@D)/cuda/include/thrust/detail/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/reverse.inl" "$(@D)/cuda/include/thrust/detail/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scan.inl" "$(@D)/cuda/include/thrust/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/scatter.inl" "$(@D)/cuda/include/thrust/detail/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/seq.h" "$(@D)/cuda/include/thrust/detail/seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sequence.inl" "$(@D)/cuda/include/thrust/detail/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/set_operations.inl" "$(@D)/cuda/include/thrust/detail/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/sort.inl" "$(@D)/cuda/include/thrust/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_assert.h" "$(@D)/cuda/include/thrust/detail/static_assert.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/static_map.h" "$(@D)/cuda/include/thrust/detail/static_map.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.h" "$(@D)/cuda/include/thrust/detail/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap.inl" "$(@D)/cuda/include/thrust/detail/swap.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/swap_ranges.inl" "$(@D)/cuda/include/thrust/detail/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tabulate.inl" "$(@D)/cuda/include/thrust/detail/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.h" "$(@D)/cuda/include/thrust/detail/temporary_array.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_array.inl" "$(@D)/cuda/include/thrust/detail/temporary_array.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform.inl" "$(@D)/cuda/include/thrust/detail/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_reduce.inl" "$(@D)/cuda/include/thrust/detail/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/transform_scan.inl" "$(@D)/cuda/include/thrust/detail/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/trivial_sequence.h" "$(@D)/cuda/include/thrust/detail/trivial_sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple.inl" "$(@D)/cuda/include/thrust/detail/tuple.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_meta_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_meta_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/tuple_transform.h" "$(@D)/cuda/include/thrust/detail/tuple_transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" "$(@D)/cuda/include/thrust/detail/type_traits/algorithm/intermediate_type_from_function_and_iterators.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/function_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/function_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_member_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_member_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_nested_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_nested_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/has_trivial_assign.h" "$(@D)/cuda/include/thrust/detail/type_traits/has_trivial_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_call_possible.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_call_possible.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/is_metafunction_defined.h" "$(@D)/cuda/include/thrust/detail/type_traits/is_metafunction_defined.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/iterator/is_output_iterator.h" "$(@D)/cuda/include/thrust/detail/type_traits/iterator/is_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/minimum_type.h" "$(@D)/cuda/include/thrust/detail/type_traits/minimum_type.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/pointer_traits.h" "$(@D)/cuda/include/thrust/detail/type_traits/pointer_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/type_traits/result_of_adaptable_function.h" "$(@D)/cuda/include/thrust/detail/type_traits/result_of_adaptable_function.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/detail/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/unique.inl" "$(@D)/cuda/include/thrust/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/use_default.h" "$(@D)/cuda/include/thrust/detail/use_default.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/align.h" "$(@D)/cuda/include/thrust/detail/util/align.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/util/blocking.h" "$(@D)/cuda/include/thrust/detail/util/blocking.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.h" "$(@D)/cuda/include/thrust/detail/vector_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/detail/vector_base.inl" "$(@D)/cuda/include/thrust/detail/vector_base.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/device_allocator.h" "$(@D)/cuda/include/thrust/device_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_delete.h" "$(@D)/cuda/include/thrust/device_delete.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_free.h" "$(@D)/cuda/include/thrust/device_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc.h" "$(@D)/cuda/include/thrust/device_malloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_malloc_allocator.h" "$(@D)/cuda/include/thrust/device_malloc_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new.h" "$(@D)/cuda/include/thrust/device_new.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_new_allocator.h" "$(@D)/cuda/include/thrust/device_new_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_ptr.h" "$(@D)/cuda/include/thrust/device_ptr.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_reference.h" "$(@D)/cuda/include/thrust/device_reference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/device_vector.h" "$(@D)/cuda/include/thrust/device_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/distance.h" "$(@D)/cuda/include/thrust/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/equal.h" "$(@D)/cuda/include/thrust/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/execution_policy.h" "$(@D)/cuda/include/thrust/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/extrema.h" "$(@D)/cuda/include/thrust/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/fill.h" "$(@D)/cuda/include/thrust/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/find.h" "$(@D)/cuda/include/thrust/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/for_each.h" "$(@D)/cuda/include/thrust/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/functional.h" "$(@D)/cuda/include/thrust/functional.h" && cp -f "/usr/local/cuda-9.0/include/thrust/gather.h" "$(@D)/cuda/include/thrust/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/generate.h" "$(@D)/cuda/include/thrust/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/host_vector.h" "$(@D)/cuda/include/thrust/host_vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/inner_product.h" "$(@D)/cuda/include/thrust/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/constant_iterator.h" "$(@D)/cuda/include/thrust/iterator/constant_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/counting_iterator.h" "$(@D)/cuda/include/thrust/iterator/counting_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_assign.h" "$(@D)/cuda/include/thrust/iterator/detail/any_assign.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/any_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/any_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/constant_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/constant_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/counting_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/counting_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/device_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/device_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/discard_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/discard_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/distance_from_result.h" "$(@D)/cuda/include/thrust/iterator/detail/distance_from_result.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/host_system_tag.h" "$(@D)/cuda/include/thrust/iterator/detail/host_system_tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_iterator_category.h" "$(@D)/cuda/include/thrust/iterator/detail/is_iterator_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/is_trivial_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/is_trivial_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_adaptor_base.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_adaptor_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_system.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_to_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_to_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_category_with_system_and_traversal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_facade_category.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_facade_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traits.inl" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traits.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/iterator_traversal_tags.h" "$(@D)/cuda/include/thrust/iterator/detail/iterator_traversal_tags.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/join_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/join_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_category.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_category.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/minimum_system.h" "$(@D)/cuda/include/thrust/iterator/detail/minimum_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/normal_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/normal_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/permutation_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/permutation_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/retag.h" "$(@D)/cuda/include/thrust/iterator/detail/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/reverse_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/reverse_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tagged_iterator.h" "$(@D)/cuda/include/thrust/iterator/detail/tagged_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/transform_output_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/transform_output_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/tuple_of_iterator_references.h" "$(@D)/cuda/include/thrust/iterator/detail/tuple_of_iterator_references.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/universal_categories.h" "$(@D)/cuda/include/thrust/iterator/detail/universal_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator.inl" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/detail/zip_iterator_base.h" "$(@D)/cuda/include/thrust/iterator/detail/zip_iterator_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/discard_iterator.h" "$(@D)/cuda/include/thrust/iterator/discard_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_adaptor.h" "$(@D)/cuda/include/thrust/iterator/iterator_adaptor.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_categories.h" "$(@D)/cuda/include/thrust/iterator/iterator_categories.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_facade.h" "$(@D)/cuda/include/thrust/iterator/iterator_facade.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/iterator_traits.h" "$(@D)/cuda/include/thrust/iterator/iterator_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/permutation_iterator.h" "$(@D)/cuda/include/thrust/iterator/permutation_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/retag.h" "$(@D)/cuda/include/thrust/iterator/retag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/reverse_iterator.h" "$(@D)/cuda/include/thrust/iterator/reverse_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/transform_output_iterator.h" "$(@D)/cuda/include/thrust/iterator/transform_output_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/iterator/zip_iterator.h" "$(@D)/cuda/include/thrust/iterator/zip_iterator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/logical.h" "$(@D)/cuda/include/thrust/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/memory.h" "$(@D)/cuda/include/thrust/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/merge.h" "$(@D)/cuda/include/thrust/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/mismatch.h" "$(@D)/cuda/include/thrust/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/pair.h" "$(@D)/cuda/include/thrust/pair.h" && cp -f "/usr/local/cuda-9.0/include/thrust/partition.h" "$(@D)/cuda/include/thrust/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random.h" "$(@D)/cuda/include/thrust/random.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/discard_block_engine.inl" "$(@D)/cuda/include/thrust/random/detail/discard_block_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_congruential_engine_discard.h" "$(@D)/cuda/include/thrust/random/detail/linear_congruential_engine_discard.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine.inl" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" "$(@D)/cuda/include/thrust/random/detail/linear_feedback_shift_engine_wordmask.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/mod.h" "$(@D)/cuda/include/thrust/random/detail/mod.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/normal_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/normal_distribution_base.h" "$(@D)/cuda/include/thrust/random/detail/normal_distribution_base.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/random_core_access.h" "$(@D)/cuda/include/thrust/random/detail/random_core_access.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/subtract_with_carry_engine.inl" "$(@D)/cuda/include/thrust/random/detail/subtract_with_carry_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_int_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_int_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/uniform_real_distribution.inl" "$(@D)/cuda/include/thrust/random/detail/uniform_real_distribution.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine.inl" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/random/detail/xor_combine_engine_max.h" "$(@D)/cuda/include/thrust/random/detail/xor_combine_engine_max.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/discard_block_engine.h" "$(@D)/cuda/include/thrust/random/discard_block_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_congruential_engine.h" "$(@D)/cuda/include/thrust/random/linear_congruential_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/linear_feedback_shift_engine.h" "$(@D)/cuda/include/thrust/random/linear_feedback_shift_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/normal_distribution.h" "$(@D)/cuda/include/thrust/random/normal_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/subtract_with_carry_engine.h" "$(@D)/cuda/include/thrust/random/subtract_with_carry_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_int_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_int_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/uniform_real_distribution.h" "$(@D)/cuda/include/thrust/random/uniform_real_distribution.h" && cp -f "/usr/local/cuda-9.0/include/thrust/random/xor_combine_engine.h" "$(@D)/cuda/include/thrust/random/xor_combine_engine.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reduce.h" "$(@D)/cuda/include/thrust/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/remove.h" "$(@D)/cuda/include/thrust/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/replace.h" "$(@D)/cuda/include/thrust/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/reverse.h" "$(@D)/cuda/include/thrust/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scan.h" "$(@D)/cuda/include/thrust/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/scatter.h" "$(@D)/cuda/include/thrust/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sequence.h" "$(@D)/cuda/include/thrust/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/set_operations.h" "$(@D)/cuda/include/thrust/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/sort.h" "$(@D)/cuda/include/thrust/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/swap.h" "$(@D)/cuda/include/thrust/swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cpp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cpp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cpp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/count.h" "$(@D)/cuda/include/thrust/system/cpp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/equal.h" "$(@D)/cuda/include/thrust/system/cpp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cpp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/find.h" "$(@D)/cuda/include/thrust/system/cpp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cpp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/gather.h" "$(@D)/cuda/include/thrust/system/cpp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/generate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cpp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cpp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cpp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/logical.h" "$(@D)/cuda/include/thrust/system/cpp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cpp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/merge.h" "$(@D)/cuda/include/thrust/system/cpp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cpp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/par.h" "$(@D)/cuda/include/thrust/system/cpp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/partition.h" "$(@D)/cuda/include/thrust/system/cpp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/remove.h" "$(@D)/cuda/include/thrust/system/cpp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/replace.h" "$(@D)/cuda/include/thrust/system/cpp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cpp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cpp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cpp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/sort.h" "$(@D)/cuda/include/thrust/system/cpp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cpp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cpp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cpp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cpp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cpp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cpp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cpp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/execution_policy.h" "$(@D)/cuda/include/thrust/system/cpp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/memory.h" "$(@D)/cuda/include/thrust/system/cpp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cpp/vector.h" "$(@D)/cuda/include/thrust/system/cpp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/config.h" "$(@D)/cuda/include/thrust/system/cuda/config.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/cuda/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/cuda/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/cuda/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/agent_launcher.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/agent_launcher.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/alignment.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/alignment.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/triple_chevron_launch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/core/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/core/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/count.h" "$(@D)/cuda/include/thrust/system/cuda/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_downsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_radix_sort_upsweep.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_segment_fixup.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/agent_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/agent/single_pass_scan_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_adjacent_difference.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_discontinuity.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_exchange.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_rank.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_raking_layout.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_shuffle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/block_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/block_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_atomic.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_histogram_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_raking_commutative_only.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_reduce_warp_reductions.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_raking.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans2.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/block/specializations/block_scan_warp_scans3.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/cub.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/cub.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_partition.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_run_length_encode.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_segmented_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_select.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_select.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/device_spmv.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_histogram.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_radix_sort.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_reduce_by_key.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_rle.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_select_if.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_csrt.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_orig.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/device/dispatch/dispatch_spmv_row_based.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_barrier.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_even_share.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_mapping.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/grid/grid_queue.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/host/mutex.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/host/mutex.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/arg_index_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/cache_modified_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/constant_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/counting_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/discard_output_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_obj_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/tex_ref_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/iterator/transform_input_iterator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_load.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_operators.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_search.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/thread/thread_store.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_allocator.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_allocator.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_arch.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_arch.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_debug.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_debug.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_device.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_device.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_macro.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_macro.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_namespace.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_namespace.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_ptx.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_ptx.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/util_type.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/util_type.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_reduce_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_shfl.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/specializations/warp_scan_smem.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_reduce.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" "$(@D)/cuda/include/thrust/system/cuda/detail/cub/warp/warp_scan.cuh" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/equal.h" "$(@D)/cuda/include/thrust/system/cuda/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/error.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/extrema.h" "$(@D)/cuda/include/thrust/system/cuda/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/find.h" "$(@D)/cuda/include/thrust/system/cuda/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/for_each.h" "$(@D)/cuda/include/thrust/system/cuda/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/gather.h" "$(@D)/cuda/include/thrust/system/cuda/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/generate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/get_value.h" "$(@D)/cuda/include/thrust/system/cuda/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_cuda_runtime_api.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/guarded_driver_types.h" "$(@D)/cuda/include/thrust/system/cuda/detail/guarded_driver_types.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/cuda/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_cross_system.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_cross_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" "$(@D)/cuda/include/thrust/system/cuda/detail/internal/copy_device_to_device.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/cuda/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/logical.h" "$(@D)/cuda/include/thrust/system/cuda/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/cuda/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/memory_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/memory_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/merge.h" "$(@D)/cuda/include/thrust/system/cuda/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/cuda/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/par_to_seq.h" "$(@D)/cuda/include/thrust/system/cuda/detail/par_to_seq.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/parallel_for.h" "$(@D)/cuda/include/thrust/system/cuda/detail/parallel_for.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/partition.h" "$(@D)/cuda/include/thrust/system/cuda/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/remove.h" "$(@D)/cuda/include/thrust/system/cuda/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/replace.h" "$(@D)/cuda/include/thrust/system/cuda/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/reverse.h" "$(@D)/cuda/include/thrust/system/cuda/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/scatter.h" "$(@D)/cuda/include/thrust/system/cuda/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sequence.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/cuda/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/sort.h" "$(@D)/cuda/include/thrust/system/cuda/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/cuda/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/cuda/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/terminate.h" "$(@D)/cuda/include/thrust/system/cuda/detail/terminate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/cuda/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/cuda/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/cuda/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/util.h" "$(@D)/cuda/include/thrust/system/cuda/detail/util.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/detail/vector.inl" "$(@D)/cuda/include/thrust/system/cuda/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/error.h" "$(@D)/cuda/include/thrust/system/cuda/error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/execution_policy.h" "$(@D)/cuda/include/thrust/system/cuda/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/experimental/pinned_allocator.h" "$(@D)/cuda/include/thrust/system/cuda/experimental/pinned_allocator.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/memory.h" "$(@D)/cuda/include/thrust/system/cuda/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/cuda/vector.h" "$(@D)/cuda/include/thrust/system/cuda/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/adl/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/adl/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/adl/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/count.h" "$(@D)/cuda/include/thrust/system/detail/adl/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/equal.h" "$(@D)/cuda/include/thrust/system/detail/adl/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/extrema.h" "$(@D)/cuda/include/thrust/system/detail/adl/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/find.h" "$(@D)/cuda/include/thrust/system/detail/adl/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/for_each.h" "$(@D)/cuda/include/thrust/system/detail/adl/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/gather.h" "$(@D)/cuda/include/thrust/system/detail/adl/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/generate.h" "$(@D)/cuda/include/thrust/system/detail/adl/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/get_value.h" "$(@D)/cuda/include/thrust/system/detail/adl/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/adl/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/adl/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/logical.h" "$(@D)/cuda/include/thrust/system/detail/adl/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/adl/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/merge.h" "$(@D)/cuda/include/thrust/system/detail/adl/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/adl/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/partition.h" "$(@D)/cuda/include/thrust/system/detail/adl/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/remove.h" "$(@D)/cuda/include/thrust/system/detail/adl/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/replace.h" "$(@D)/cuda/include/thrust/system/detail/adl/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/reverse.h" "$(@D)/cuda/include/thrust/system/detail/adl/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/scatter.h" "$(@D)/cuda/include/thrust/system/detail/adl/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sequence.h" "$(@D)/cuda/include/thrust/system/detail/adl/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/adl/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/sort.h" "$(@D)/cuda/include/thrust/system/detail/adl/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/adl/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/adl/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/adl/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/adl/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/adl/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/adl/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/adl/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/bad_alloc.h" "$(@D)/cuda/include/thrust/system/detail/bad_alloc.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/errno.h" "$(@D)/cuda/include/thrust/system/detail/errno.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_category.inl" "$(@D)/cuda/include/thrust/system/detail/error_category.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_code.inl" "$(@D)/cuda/include/thrust/system/detail/error_code.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/error_condition.inl" "$(@D)/cuda/include/thrust/system/detail/error_condition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/adjacent_difference.inl" "$(@D)/cuda/include/thrust/system/detail/generic/adjacent_difference.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.h" "$(@D)/cuda/include/thrust/system/detail/generic/advance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/advance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/advance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/copy_if.inl" "$(@D)/cuda/include/thrust/system/detail/generic/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.h" "$(@D)/cuda/include/thrust/system/detail/generic/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/count.inl" "$(@D)/cuda/include/thrust/system/detail/generic/count.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.h" "$(@D)/cuda/include/thrust/system/detail/generic/distance.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/distance.inl" "$(@D)/cuda/include/thrust/system/detail/generic/distance.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.h" "$(@D)/cuda/include/thrust/system/detail/generic/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/equal.inl" "$(@D)/cuda/include/thrust/system/detail/generic/equal.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.h" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/extrema.inl" "$(@D)/cuda/include/thrust/system/detail/generic/extrema.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.h" "$(@D)/cuda/include/thrust/system/detail/generic/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/find.inl" "$(@D)/cuda/include/thrust/system/detail/generic/find.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/for_each.h" "$(@D)/cuda/include/thrust/system/detail/generic/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.h" "$(@D)/cuda/include/thrust/system/detail/generic/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/gather.inl" "$(@D)/cuda/include/thrust/system/detail/generic/gather.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.h" "$(@D)/cuda/include/thrust/system/detail/generic/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/generate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/generate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/inner_product.inl" "$(@D)/cuda/include/thrust/system/detail/generic/inner_product.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/logical.h" "$(@D)/cuda/include/thrust/system/detail/generic/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.h" "$(@D)/cuda/include/thrust/system/detail/generic/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/memory.inl" "$(@D)/cuda/include/thrust/system/detail/generic/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.h" "$(@D)/cuda/include/thrust/system/detail/generic/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/merge.inl" "$(@D)/cuda/include/thrust/system/detail/generic/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/mismatch.inl" "$(@D)/cuda/include/thrust/system/detail/generic/mismatch.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.h" "$(@D)/cuda/include/thrust/system/detail/generic/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/partition.inl" "$(@D)/cuda/include/thrust/system/detail/generic/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.h" "$(@D)/cuda/include/thrust/system/detail/generic/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/remove.inl" "$(@D)/cuda/include/thrust/system/detail/generic/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.h" "$(@D)/cuda/include/thrust/system/detail/generic/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/replace.inl" "$(@D)/cuda/include/thrust/system/detail/generic/replace.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.h" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/reverse.inl" "$(@D)/cuda/include/thrust/system/detail/generic/reverse.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scalar/binary_search.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scalar/binary_search.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scan_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scan_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.h" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/scatter.inl" "$(@D)/cuda/include/thrust/system/detail/generic/scatter.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/select_system.h" "$(@D)/cuda/include/thrust/system/detail/generic/select_system.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.h" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sequence.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sequence.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/set_operations.inl" "$(@D)/cuda/include/thrust/system/detail/generic/set_operations.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.h" "$(@D)/cuda/include/thrust/system/detail/generic/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/sort.inl" "$(@D)/cuda/include/thrust/system/detail/generic/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/swap_ranges.inl" "$(@D)/cuda/include/thrust/system/detail/generic/swap_ranges.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tabulate.inl" "$(@D)/cuda/include/thrust/system/detail/generic/tabulate.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/tag.h" "$(@D)/cuda/include/thrust/system/detail/generic/tag.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/temporary_buffer.inl" "$(@D)/cuda/include/thrust/system/detail/generic/temporary_buffer.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_reduce.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/transform_scan.inl" "$(@D)/cuda/include/thrust/system/detail/generic/transform_scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/type_traits.h" "$(@D)/cuda/include/thrust/system/detail/generic/type_traits.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_copy.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/uninitialized_fill.inl" "$(@D)/cuda/include/thrust/system/detail/generic/uninitialized_fill.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/generic/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/detail/generic/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/internal/decompose.h" "$(@D)/cuda/include/thrust/system/detail/internal/decompose.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/detail/sequential/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/assign_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/binary_search.h" "$(@D)/cuda/include/thrust/system/detail/sequential/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_backward.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_backward.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/copy_if.h" "$(@D)/cuda/include/thrust/system/detail/sequential/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/count.h" "$(@D)/cuda/include/thrust/system/detail/sequential/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/equal.h" "$(@D)/cuda/include/thrust/system/detail/sequential/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/execution_policy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/extrema.h" "$(@D)/cuda/include/thrust/system/detail/sequential/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/find.h" "$(@D)/cuda/include/thrust/system/detail/sequential/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/for_each.h" "$(@D)/cuda/include/thrust/system/detail/sequential/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/gather.h" "$(@D)/cuda/include/thrust/system/detail/sequential/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/general_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/general_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/generate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/get_value.h" "$(@D)/cuda/include/thrust/system/detail/sequential/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/inner_product.h" "$(@D)/cuda/include/thrust/system/detail/sequential/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/insertion_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/insertion_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/iter_swap.h" "$(@D)/cuda/include/thrust/system/detail/sequential/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/logical.h" "$(@D)/cuda/include/thrust/system/detail/sequential/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/detail/sequential/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.h" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/merge.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/mismatch.h" "$(@D)/cuda/include/thrust/system/detail/sequential/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/partition.h" "$(@D)/cuda/include/thrust/system/detail/sequential/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/remove.h" "$(@D)/cuda/include/thrust/system/detail/sequential/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/replace.h" "$(@D)/cuda/include/thrust/system/detail/sequential/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/reverse.h" "$(@D)/cuda/include/thrust/system/detail/sequential/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scan_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/scatter.h" "$(@D)/cuda/include/thrust/system/detail/sequential/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sequence.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/set_operations.h" "$(@D)/cuda/include/thrust/system/detail/sequential/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_merge_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_merge_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_primitive_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_primitive_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.h" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/stable_radix_sort.inl" "$(@D)/cuda/include/thrust/system/detail/sequential/stable_radix_sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/swap_ranges.h" "$(@D)/cuda/include/thrust/system/detail/sequential/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/tabulate.h" "$(@D)/cuda/include/thrust/system/detail/sequential/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/detail/sequential/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_reduce.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/transform_scan.h" "$(@D)/cuda/include/thrust/system/detail/sequential/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/trivial_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/trivial_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/detail/sequential/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/sequential/unique_by_key.h" "$(@D)/cuda/include/thrust/system/detail/sequential/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/detail/system_error.inl" "$(@D)/cuda/include/thrust/system/detail/system_error.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/error_code.h" "$(@D)/cuda/include/thrust/system/error_code.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/omp/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/omp/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/omp/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/count.h" "$(@D)/cuda/include/thrust/system/omp/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.h" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/default_decomposition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/default_decomposition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/equal.h" "$(@D)/cuda/include/thrust/system/omp/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/extrema.h" "$(@D)/cuda/include/thrust/system/omp/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/find.h" "$(@D)/cuda/include/thrust/system/omp/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.h" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/omp/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/gather.h" "$(@D)/cuda/include/thrust/system/omp/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/generate.h" "$(@D)/cuda/include/thrust/system/omp/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/get_value.h" "$(@D)/cuda/include/thrust/system/omp/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/omp/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/omp/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/logical.h" "$(@D)/cuda/include/thrust/system/omp/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/omp/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/memory.inl" "$(@D)/cuda/include/thrust/system/omp/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/merge.h" "$(@D)/cuda/include/thrust/system/omp/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/omp/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/par.h" "$(@D)/cuda/include/thrust/system/omp/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.h" "$(@D)/cuda/include/thrust/system/omp/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/partition.inl" "$(@D)/cuda/include/thrust/system/omp/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reduce_intervals.inl" "$(@D)/cuda/include/thrust/system/omp/detail/reduce_intervals.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.h" "$(@D)/cuda/include/thrust/system/omp/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/remove.inl" "$(@D)/cuda/include/thrust/system/omp/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/replace.h" "$(@D)/cuda/include/thrust/system/omp/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/reverse.h" "$(@D)/cuda/include/thrust/system/omp/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/scatter.h" "$(@D)/cuda/include/thrust/system/omp/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sequence.h" "$(@D)/cuda/include/thrust/system/omp/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/omp/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.h" "$(@D)/cuda/include/thrust/system/omp/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/sort.inl" "$(@D)/cuda/include/thrust/system/omp/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/omp/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/omp/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/omp/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/omp/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/omp/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/omp/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/detail/vector.inl" "$(@D)/cuda/include/thrust/system/omp/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/execution_policy.h" "$(@D)/cuda/include/thrust/system/omp/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/memory.h" "$(@D)/cuda/include/thrust/system/omp/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/omp/vector.h" "$(@D)/cuda/include/thrust/system/omp/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/system_error.h" "$(@D)/cuda/include/thrust/system/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/adjacent_difference.h" "$(@D)/cuda/include/thrust/system/tbb/detail/adjacent_difference.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/assign_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/assign_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/binary_search.h" "$(@D)/cuda/include/thrust/system/tbb/detail/binary_search.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.h" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/copy_if.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/copy_if.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/count.h" "$(@D)/cuda/include/thrust/system/tbb/detail/count.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/equal.h" "$(@D)/cuda/include/thrust/system/tbb/detail/equal.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/extrema.h" "$(@D)/cuda/include/thrust/system/tbb/detail/extrema.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/find.h" "$(@D)/cuda/include/thrust/system/tbb/detail/find.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.h" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/for_each.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/for_each.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/gather.h" "$(@D)/cuda/include/thrust/system/tbb/detail/gather.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/generate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/generate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/get_value.h" "$(@D)/cuda/include/thrust/system/tbb/detail/get_value.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/inner_product.h" "$(@D)/cuda/include/thrust/system/tbb/detail/inner_product.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/iter_swap.h" "$(@D)/cuda/include/thrust/system/tbb/detail/iter_swap.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/logical.h" "$(@D)/cuda/include/thrust/system/tbb/detail/logical.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/malloc_and_free.h" "$(@D)/cuda/include/thrust/system/tbb/detail/malloc_and_free.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/memory.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/memory.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.h" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/merge.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/merge.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/mismatch.h" "$(@D)/cuda/include/thrust/system/tbb/detail/mismatch.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/par.h" "$(@D)/cuda/include/thrust/system/tbb/detail/par.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.h" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/partition.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/partition.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reduce_intervals.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reduce_intervals.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.h" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/remove.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/remove.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/replace.h" "$(@D)/cuda/include/thrust/system/tbb/detail/replace.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/reverse.h" "$(@D)/cuda/include/thrust/system/tbb/detail/reverse.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/scan.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scan_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scan_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/scatter.h" "$(@D)/cuda/include/thrust/system/tbb/detail/scatter.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sequence.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sequence.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/set_operations.h" "$(@D)/cuda/include/thrust/system/tbb/detail/set_operations.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.h" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/sort.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/sort.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/swap_ranges.h" "$(@D)/cuda/include/thrust/system/tbb/detail/swap_ranges.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/tabulate.h" "$(@D)/cuda/include/thrust/system/tbb/detail/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/temporary_buffer.h" "$(@D)/cuda/include/thrust/system/tbb/detail/temporary_buffer.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_reduce.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/transform_scan.h" "$(@D)/cuda/include/thrust/system/tbb/detail/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_copy.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/uninitialized_fill.h" "$(@D)/cuda/include/thrust/system/tbb/detail/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.h" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/unique_by_key.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/unique_by_key.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/detail/vector.inl" "$(@D)/cuda/include/thrust/system/tbb/detail/vector.inl" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/execution_policy.h" "$(@D)/cuda/include/thrust/system/tbb/execution_policy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/memory.h" "$(@D)/cuda/include/thrust/system/tbb/memory.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system/tbb/vector.h" "$(@D)/cuda/include/thrust/system/tbb/vector.h" && cp -f "/usr/local/cuda-9.0/include/thrust/system_error.h" "$(@D)/cuda/include/thrust/system_error.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tabulate.h" "$(@D)/cuda/include/thrust/tabulate.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform.h" "$(@D)/cuda/include/thrust/transform.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_reduce.h" "$(@D)/cuda/include/thrust/transform_reduce.h" && cp -f "/usr/local/cuda-9.0/include/thrust/transform_scan.h" "$(@D)/cuda/include/thrust/transform_scan.h" && cp -f "/usr/local/cuda-9.0/include/thrust/tuple.h" "$(@D)/cuda/include/thrust/tuple.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_copy.h" "$(@D)/cuda/include/thrust/uninitialized_copy.h" && cp -f "/usr/local/cuda-9.0/include/thrust/uninitialized_fill.h" "$(@D)/cuda/include/thrust/uninitialized_fill.h" && cp -f "/usr/local/cuda-9.0/include/thrust/unique.h" "$(@D)/cuda/include/thrust/unique.h" && cp -f "/usr/local/cuda-9.0/include/thrust/version.h" "$(@D)/cuda/include/thrust/version.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.h" "$(@D)/cuda/include/vector_functions.h" && cp -f "/usr/local/cuda-9.0/include/vector_functions.hpp" "$(@D)/cuda/include/vector_functions.hpp" && cp -f "/usr/local/cuda-9.0/include/vector_types.h" "$(@D)/cuda/include/vector_types.h"
    """,
 )
 
@@ -1198,7 +1198,7 @@ genrule(
         "cuda/nvvm/libdevice/libdevice.10.bc",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/nvvm/libdevice/libdevice.10.bc" "$(@D)//libdevice.10.bc"
    """,
 )
 
@@ -1235,7 +1235,7 @@ genrule(
         "cuda/extras/CUPTI/include/openacc/cupti_openacc.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/gl.h" "$(@D)/cuda/extras/CUPTI/include/GL/gl.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glew.h" "$(@D)/cuda/extras/CUPTI/include/GL/glew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glu.h" "$(@D)/cuda/extras/CUPTI/include/GL/glu.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glut.h" "$(@D)/cuda/extras/CUPTI/include/GL/glut.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glx.h" "$(@D)/cuda/extras/CUPTI/include/GL/glx.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/glxext.h" "$(@D)/cuda/extras/CUPTI/include/GL/glxext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglew.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglew.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/GL/wglext.h" "$(@D)/cuda/extras/CUPTI/include/GL/wglext.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cuda_stdint.h" "$(@D)/cuda/extras/CUPTI/include/cuda_stdint.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti.h" "$(@D)/cuda/extras/CUPTI/include/cupti.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_activity.h" "$(@D)/cuda/extras/CUPTI/include/cupti_activity.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_callbacks.h" "$(@D)/cuda/extras/CUPTI/include/cupti_callbacks.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_driver_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_driver_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_events.h" "$(@D)/cuda/extras/CUPTI/include/cupti_events.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_metrics.h" "$(@D)/cuda/extras/CUPTI/include/cupti_metrics.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_nvtx_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_nvtx_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_result.h" "$(@D)/cuda/extras/CUPTI/include/cupti_result.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_runtime_cbid.h" "$(@D)/cuda/extras/CUPTI/include/cupti_runtime_cbid.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/cupti_version.h" "$(@D)/cuda/extras/CUPTI/include/cupti_version.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaGL_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaGL_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cudaVDPAU_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cudaVDPAU_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_gl_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_runtime_api_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_cuda_vdpau_interop_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/generated_nvtx_meta.h" "$(@D)/cuda/extras/CUPTI/include/generated_nvtx_meta.h" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/include/openacc/cupti_openacc.h" "$(@D)/cuda/extras/CUPTI/include/openacc/cupti_openacc.h"
    """,
 )
 
@@ -1253,7 +1253,7 @@ genrule(
         "cuda/lib/libcupti.so.9.0",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/stubs/libcuda.so" "$(@D)/cuda/lib/libcuda.so" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart.so.9.0.176" "$(@D)/cuda/lib/libcudart.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcudart_static.a" "$(@D)/cuda/lib/libcudart_static.a" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcublas.so.9.0.480" "$(@D)/cuda/lib/libcublas.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcusolver.so.9.0.176" "$(@D)/cuda/lib/libcusolver.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcurand.so.9.0.176" "$(@D)/cuda/lib/libcurand.so.9.0" && cp -f "/usr/local/cuda-9.0/targets/x86_64-linux/lib/libcufft.so.9.0.176" "$(@D)/cuda/lib/libcufft.so.9.0" && cp -f "/usr/lib/x86_64-linux-gnu/libcudnn.so.7.1.4" "$(@D)/cuda/lib/libcudnn.so.7" && cp -f "/usr/local/cuda-9.0/extras/CUPTI/lib64/libcupti.so.9.0.176" "$(@D)/cuda/lib/libcupti.so.9.0"
    """,
 )
 
@@ -1263,6 +1263,6 @@ genrule(
         "cuda/include/cudnn.h",
     ],
     cmd = """
-if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp "/usr/include/cudnn.h" "$(@D)/cudnn.h"
+if [ -d "$(@D)/extras" ]; then rm $(@D)/extras -drf; fi && if [ -d "$(@D)/include" ]; then rm $(@D)/include -drf; fi && if [ -d "$(@D)/lib" ]; then rm $(@D)/lib -drf; fi && if [ -d "$(@D)/nvvm" ]; then rm $(@D)/nvvm -drf; fi && cp -f "/usr/include/cudnn.h" "$(@D)/cudnn.h"
    """,
 )
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
index 5c6703aab4fbdaf92c5b63a5c0f2600ad699c0cf..a53c891d8bba1b80a880ddd9c16091db27861a8d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
+++ b/third_party/toolchains/preconfig/ubuntu14.04/cuda9.0-cudnn7/cuda/build_defs.bzl
@@ -9,15 +9,13 @@ def if_cuda(if_true, if_false = []):
     return select({
         "@local_config_cuda//cuda:using_nvcc": if_true,
         "@local_config_cuda//cuda:using_clang": if_true,
-        "//conditions:default": if_false
+        "//conditions:default": if_false,
     })
 
-
 def cuda_default_copts():
     """Default options for all CUDA compilations."""
     return if_cuda(["-x", "cuda", "-DGOOGLE_CUDA=1"] + [])
 
-
 def cuda_is_configured():
     """Returns true if CUDA was enabled during the configure process."""
     return True
@@ -29,5 +27,5 @@ def if_cuda_is_configured(x):
     --config=cuda. Used to allow non-CUDA code to depend on CUDA libraries.
     """
     if cuda_is_configured():
-      return x
+        return x
     return []
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..6442e7628a416e3298cfd2579cee275459780145
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/BUILD
@@ -0,0 +1,87 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..1c2e8bcae63ebc9b1ee22b5d677c185589b547f8
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-10.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..7ae59e9967adf9b1a980a8085e203459ba8a7c7b
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '10.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat
new file mode 100755
index 0000000000000000000000000000000000000000..e896e654fd7ecd578c80d102895f51ce18bbd4eb
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.bat
@@ -0,0 +1,20 @@
+:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+:: =============================================================================
+
+:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
+@echo OFF
+set arg0=%~0
+for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
+"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..00483951af966e0085e6f2b1d74290d9ee872963
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda10.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-10.0/bin/nvcc'
+NVCC_VERSION = '10.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
new file mode 100755
index 0000000000000000000000000000000000000000..6442e7628a416e3298cfd2579cee275459780145
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/BUILD
@@ -0,0 +1,87 @@
+licenses(["restricted"])
+
+package(default_visibility = ["//visibility:public"])
+
+toolchain(
+    name = "toolchain-linux-x86_64",
+    exec_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    target_compatible_with = [
+        "@bazel_tools//platforms:linux",
+        "@bazel_tools//platforms:x86_64",
+    ],
+    toolchain = ":cc-compiler-local",
+    toolchain_type = "@bazel_tools//tools/cpp:toolchain_type",
+)
+
+cc_toolchain_suite(
+    name = "toolchain",
+    toolchains = {
+        "local|compiler": ":cc-compiler-local",
+        "darwin|compiler": ":cc-compiler-darwin",
+        "x64_windows|msvc-cl": ":cc-compiler-windows",
+    },
+)
+
+cc_toolchain(
+    name = "cc-compiler-local",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "local",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    # To support linker flags that need to go to the start of command line
+    # we need the toolchain to support parameter files. Parameter files are
+    # last on the command line and contain all shared libraries to link, so all
+    # regular options will be left of them.
+    supports_param_files = 1,
+)
+
+cc_toolchain(
+    name = "cc-compiler-darwin",
+    all_files = ":crosstool_wrapper_driver_is_not_gcc",
+    compiler_files = ":empty",
+    cpu = "darwin",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":crosstool_wrapper_driver_is_not_gcc",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 0,
+)
+
+cc_toolchain(
+    name = "cc-compiler-windows",
+    all_files = ":windows_msvc_wrapper_files",
+    compiler_files = ":empty",
+    cpu = "x64_windows",
+    dwp_files = ":empty",
+    dynamic_runtime_libs = [":empty"],
+    linker_files = ":windows_msvc_wrapper_files",
+    objcopy_files = ":empty",
+    static_runtime_libs = [":empty"],
+    strip_files = ":empty",
+    supports_param_files = 1,
+)
+
+filegroup(
+    name = "empty",
+    srcs = [],
+)
+
+filegroup(
+    name = "crosstool_wrapper_driver_is_not_gcc",
+    srcs = ["clang/bin/crosstool_wrapper_driver_is_not_gcc"],
+)
+
+filegroup(
+    name = "windows_msvc_wrapper_files",
+    srcs = glob(["windows/msvc_*"]),
+)
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
new file mode 100755
index 0000000000000000000000000000000000000000..0d89a539b8d70788eb0f6924636824fba778a058
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/CROSSTOOL
@@ -0,0 +1,1431 @@
+major_version: "local"
+minor_version: ""
+default_target_cpu: "same_as_host"
+
+default_toolchain {
+  cpu: "k8"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "piii"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "arm"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "darwin"
+  toolchain_identifier: "local_darwin"
+}
+default_toolchain {
+  cpu: "ppc"
+  toolchain_identifier: "local_linux"
+}
+default_toolchain {
+  cpu: "x64_windows"
+  toolchain_identifier: "local_windows"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "local"
+  target_cpu: "local"
+  target_system_name: "local"
+  toolchain_identifier: "local_linux"
+
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lstdc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  feature {
+    name: "alwayslink"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,-no-as-needed"
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+        flag: "-Wl,-z,relro,-z,now"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "build-id"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        # Stamp the binary with a unique identifier.
+        flag: "-Wl,--build-id=md5"
+        flag: "-Wl,--hash-style=gnu"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-no-canonical-prefixes"
+        flag: "-fno-canonical-system-headers"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "alwayslink"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "build-id"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-Wl,--gc-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/ar" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  abi_version: "local"
+  abi_libc_version: "local"
+  compiler: "compiler"
+  host_system_name: "local"
+  needsPic: true
+  target_libc: "macosx"
+  target_cpu: "darwin"
+  target_system_name: "local"
+  toolchain_identifier: "local_darwin"
+  feature {
+    name: "c++11"
+    flag_set {
+      action: "c++-compile"
+      flag_group {
+        flag: "-std=c++11"
+      }
+    }
+  }
+
+  feature {
+    name: "stdlib"
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-lc++"
+      }
+    }
+  }
+
+  feature {
+    name: "determinism"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Make C++ compilation deterministic. Use linkstamping instead of these
+        # compiler symbols.
+        flag: "-Wno-builtin-macro-redefined"
+        flag: "-D__DATE__=\"redacted\""
+        flag: "-D__TIMESTAMP__=\"redacted\""
+        flag: "-D__TIME__=\"redacted\""
+      }
+    }
+  }
+
+  # This feature will be enabled for builds that support pic by bazel.
+  feature {
+    name: "pic"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        expand_if_all_available: "pic"
+        flag: "-fPIC"
+      }
+      flag_group {
+        expand_if_none_available: "pic"
+        flag: "-fPIE"
+      }
+    }
+  }
+
+  # Security hardening on by default.
+  feature {
+    name: "hardening"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # Conservative choice; -D_FORTIFY_SOURCE=2 may be unsafe in some cases.
+        # We need to undef it before redefining it as some distributions now
+        # have it enabled by default.
+        flag: "-U_FORTIFY_SOURCE"
+        flag: "-D_FORTIFY_SOURCE=1"
+        flag: "-fstack-protector"
+      }
+    }
+    flag_set {
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-pie"
+      }
+    }
+  }
+
+  feature {
+    name: "warnings"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # All warnings are enabled. Maybe enable -Werror as well?
+        flag: "-Wall"
+        
+      }
+    }
+  }
+
+  # Keep stack frames for debugging, even in opt mode.
+  feature {
+    name: "frame-pointer"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-fno-omit-frame-pointer"
+      }
+    }
+  }
+
+  feature {
+    name: "no-canonical-prefixes"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag:"-no-canonical-prefixes"
+      }
+    }
+  }
+
+  feature {
+    name: "disable-assertions"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: "linker-bin-path"
+
+    flag_set {
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "-B/usr/bin"
+      }
+    }
+  }
+
+  feature {
+    name: "undefined-dynamic"
+    flag_set {
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-executable"
+      flag_group {
+        flag: "-undefined"
+        flag: "dynamic_lookup"
+      }
+    }
+  }
+
+  feature {
+    name: "common"
+    implies: "stdlib"
+    implies: "c++11"
+    implies: "determinism"
+    implies: "hardening"
+    implies: "warnings"
+    implies: "frame-pointer"
+    implies: "no-canonical-prefixes"
+    implies: "linker-bin-path"
+    implies: "undefined-dynamic"
+  }
+
+  feature {
+    name: "opt"
+    implies: "common"
+    implies: "disable-assertions"
+
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        # No debug symbols.
+        # Maybe we should enable https://gcc.gnu.org/wiki/DebugFission for opt
+        # or even generally? However, that can't happen here, as it requires
+        # special handling in Bazel.
+        flag: "-g0"
+
+        # Conservative choice for -O
+        # -O3 can increase binary size and even slow down the resulting binaries.
+        # Profile first and / or use FDO if you need better performance than this.
+        flag: "-O2"
+
+        # Removal of unused code and data at link time (can this increase binary size in some cases?).
+        flag: "-ffunction-sections"
+        flag: "-fdata-sections"
+      }
+    }
+  }
+
+  feature {
+    name: "fastbuild"
+    implies: "common"
+  }
+
+  feature {
+    name: "dbg"
+    implies: "common"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      flag_group {
+        flag: "-g"
+      }
+    }
+  }
+
+  # Set clang as a C/C++ compiler.
+  tool_path { name: "gcc" path: "clang/bin/crosstool_wrapper_driver_is_not_gcc" }
+
+  # Use the default system toolchain for everything else.
+  tool_path { name: "ar" path: "/usr/bin/libtool" }
+  tool_path { name: "compat-ld" path: "/usr/bin/ld" }
+  tool_path { name: "cpp" path: "/usr/bin/cpp" }
+  tool_path { name: "dwp" path: "/usr/bin/dwp" }
+  tool_path { name: "gcov" path: "/usr/bin/gcov" }
+  tool_path { name: "ld" path: "/usr/bin/ld" }
+  tool_path { name: "nm" path: "/usr/bin/nm" }
+  tool_path { name: "objcopy" path: "/usr/bin/objcopy" }
+  tool_path { name: "objdump" path: "/usr/bin/objdump" }
+  tool_path { name: "strip" path: "/usr/bin/strip" }
+
+  # Enabled dynamic linking.
+  linking_mode_flags { mode: DYNAMIC }
+
+  cxx_builtin_include_directory: "/usr/include/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu/c++/4.8"
+  cxx_builtin_include_directory: "/usr/include/c++/4.8/backward"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include"
+  cxx_builtin_include_directory: "/usr/local/include"
+  cxx_builtin_include_directory: "/usr/lib/gcc/x86_64-linux-gnu/4.8/include-fixed"
+  cxx_builtin_include_directory: "/usr/include/x86_64-linux-gnu"
+  cxx_builtin_include_directory: "/usr/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/targets/x86_64-linux/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/include"
+  cxx_builtin_include_directory: "/usr/local/cuda-9.0/extras/CUPTI/include"
+  cxx_builtin_include_directory: "/usr/include"
+}
+
+toolchain {
+  toolchain_identifier: "local_windows"
+  host_system_name: "local"
+  target_system_name: "local"
+
+  abi_version: "local"
+  abi_libc_version: "local"
+  target_cpu: "x64_windows"
+  compiler: "msvc-cl"
+  target_libc: "msvcrt"
+
+
+
+  tool_path {
+    name: "ar"
+    path: ""
+  }
+  tool_path {
+    name: "ml"
+    path: ""
+  }
+  tool_path {
+    name: "cpp"
+    path: ""
+  }
+  tool_path {
+    name: "gcc"
+    path: ""
+  }
+  tool_path {
+    name: "gcov"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "ld"
+    path: ""
+  }
+  tool_path {
+    name: "nm"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objcopy"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "objdump"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  tool_path {
+    name: "strip"
+    path: "wrapper/bin/msvc_nop.bat"
+  }
+  supports_interface_shared_objects: true
+
+  # TODO(pcloudy): Review those flags below, they should be defined by cl.exe
+  compiler_flag: "/DCOMPILER_MSVC"
+
+  # Don't define min/max macros in windows.h.
+  compiler_flag: "/DNOMINMAX"
+
+  # Platform defines.
+  compiler_flag: "/D_WIN32_WINNT=0x0600"
+  # Turn off warning messages.
+  compiler_flag: "/D_CRT_SECURE_NO_DEPRECATE"
+  compiler_flag: "/D_CRT_SECURE_NO_WARNINGS"
+  compiler_flag: "/D_SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS"
+
+  # Useful options to have on for compilation.
+  # Increase the capacity of object files to 2^32 sections.
+  compiler_flag: "/bigobj"
+  # Allocate 500MB for precomputed headers.
+  compiler_flag: "/Zm500"
+  # Use unsigned char by default.
+  compiler_flag: "/J"
+  # Use function level linking.
+  compiler_flag: "/Gy"
+  # Use string pooling.
+  compiler_flag: "/GF"
+  # Catch C++ exceptions only and tell the compiler to assume that functions declared
+  # as extern "C" never throw a C++ exception.
+  compiler_flag: "/EHsc"
+
+  # Globally disabled warnings.
+  # Don't warn about elements of array being be default initialized.
+  compiler_flag: "/wd4351"
+  # Don't warn about no matching delete found.
+  compiler_flag: "/wd4291"
+  # Don't warn about diamond inheritance patterns.
+  compiler_flag: "/wd4250"
+  # Don't warn about insecure functions (e.g. non _s functions).
+  compiler_flag: "/wd4996"
+
+  linker_flag: "/MACHINE:X64"
+
+  feature {
+    name: "no_legacy_features"
+  }
+
+  # Suppress startup banner.
+  feature {
+    name: "nologo"
+    flag_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      flag_group {
+        flag: "/nologo"
+      }
+    }
+  }
+
+  feature {
+    name: 'has_configured_linker_path'
+  }
+
+  # This feature indicates strip is not supported, building stripped binary will just result a copy of orignial binary
+  feature {
+    name: 'no_stripping'
+  }
+
+  # This feature indicates this is a toolchain targeting Windows.
+  feature {
+    name: 'targets_windows'
+    implies: 'copy_dynamic_libraries_to_binary'
+    enabled: true
+  }
+
+  feature {
+    name: 'copy_dynamic_libraries_to_binary'
+  }
+
+  action_config {
+    config_name: 'assemble'
+    action_name: 'assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'preprocess-assemble'
+    action_name: 'preprocess-assemble'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'sysroot'
+  }
+
+  action_config {
+    config_name: 'c-compile'
+    action_name: 'c-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-compile'
+    action_name: 'c++-compile'
+    tool {
+      tool_path: ''
+    }
+    implies: 'compiler_input_flags'
+    implies: 'compiler_output_flags'
+    implies: 'legacy_compile_flags'
+    implies: 'nologo'
+    implies: 'msvc_env'
+    implies: 'parse_showincludes'
+    implies: 'user_compile_flags'
+    implies: 'sysroot'
+    implies: 'unfiltered_compile_flags'
+  }
+
+  action_config {
+    config_name: 'c++-link-executable'
+    action_name: 'c++-link-executable'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+  }
+
+  action_config {
+    config_name: 'c++-link-dynamic-library'
+    action_name: 'c++-link-dynamic-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'shared_flag'
+    implies: 'linkstamps'
+    implies: 'output_execpath_flags'
+    implies: 'input_param_flags'
+    implies: 'user_link_flags'
+    implies: 'legacy_link_flags'
+    implies: 'linker_subsystem_flag'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+    implies: 'no_stripping'
+    implies: 'has_configured_linker_path'
+    implies: 'def_file'
+  }
+
+  action_config {
+      config_name: 'c++-link-nodeps-dynamic-library'
+      action_name: 'c++-link-nodeps-dynamic-library'
+      tool {
+        tool_path: ''
+      }
+      implies: 'nologo'
+      implies: 'shared_flag'
+      implies: 'linkstamps'
+      implies: 'output_execpath_flags'
+      implies: 'input_param_flags'
+      implies: 'user_link_flags'
+      implies: 'legacy_link_flags'
+      implies: 'linker_subsystem_flag'
+      implies: 'linker_param_file'
+      implies: 'msvc_env'
+      implies: 'no_stripping'
+      implies: 'has_configured_linker_path'
+      implies: 'def_file'
+    }
+
+  action_config {
+    config_name: 'c++-link-static-library'
+    action_name: 'c++-link-static-library'
+    tool {
+      tool_path: ''
+    }
+    implies: 'nologo'
+    implies: 'archiver_flags'
+    implies: 'input_param_flags'
+    implies: 'linker_param_file'
+    implies: 'msvc_env'
+  }
+
+  # TODO(b/65151735): Remove legacy_compile_flags feature when legacy fields are
+  # not used in this crosstool
+  feature {
+    name: 'legacy_compile_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'legacy_compile_flags'
+        flag: '%{legacy_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: "msvc_env"
+    env_set {
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-module-compile"
+      action: "c++-module-codegen"
+      action: "c++-header-parsing"
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c++-link-executable"
+      action: "c++-link-dynamic-library"
+      action: "c++-link-nodeps-dynamic-library"
+      action: "c++-link-static-library"
+      env_entry {
+        key: "PATH"
+        value: ""
+      }
+      env_entry {
+        key: "INCLUDE"
+        value: ""
+      }
+      env_entry {
+        key: "LIB"
+        value: ""
+      }
+      env_entry {
+        key: "TMP"
+        value: ""
+      }
+      env_entry {
+        key: "TEMP"
+        value: ""
+      }
+    }
+  }
+
+  feature {
+    name: 'include_paths'
+    flag_set {
+      action: "assemble"
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      flag_group {
+        iterate_over: 'quote_include_paths'
+        flag: '/I%{quote_include_paths}'
+      }
+      flag_group {
+        iterate_over: 'include_paths'
+        flag: '/I%{include_paths}'
+      }
+      flag_group {
+        iterate_over: 'system_include_paths'
+        flag: '/I%{system_include_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: "preprocessor_defines"
+    flag_set {
+      action: "assemble"
+      action: "preprocess-assemble"
+      action: "c-compile"
+      action: "c++-compile"
+      action: "c++-header-parsing"
+      action: "c++-module-compile"
+      flag_group {
+        flag: "/D%{preprocessor_defines}"
+        iterate_over: "preprocessor_defines"
+      }
+    }
+  }
+
+  # Tell Bazel to parse the output of /showIncludes
+  feature {
+    name: 'parse_showincludes'
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-module-compile'
+      action: 'c++-header-parsing'
+      flag_group {
+        flag: "/showIncludes"
+      }
+    }
+  }
+
+
+  feature {
+    name: 'generate_pdb_file'
+    requires: {
+      feature: 'dbg'
+    }
+    requires: {
+      feature: 'fastbuild'
+    }
+  }
+
+  feature {
+    name: 'shared_flag'
+    flag_set {
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/DLL'
+      }
+    }
+  }
+
+  feature {
+    name: 'linkstamps'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      expand_if_all_available: 'linkstamp_paths'
+      flag_group {
+        iterate_over: 'linkstamp_paths'
+        flag: '%{linkstamp_paths}'
+      }
+    }
+  }
+
+  feature {
+    name: 'output_execpath_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'archiver_flags'
+    flag_set {
+      expand_if_all_available: 'output_execpath'
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '/OUT:%{output_execpath}'
+      }
+    }
+  }
+
+  feature {
+    name: 'input_param_flags'
+    flag_set {
+      expand_if_all_available: 'interface_library_output_path'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/IMPLIB:%{interface_library_output_path}"
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libopts'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'libopts'
+        flag: '%{libopts}'
+      }
+    }
+    flag_set {
+      expand_if_all_available: 'libraries_to_link'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        iterate_over: 'libraries_to_link'
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file_group'
+          }
+          iterate_over: 'libraries_to_link.object_files'
+          flag_group {
+            flag: '%{libraries_to_link.object_files}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'object_file'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'interface_library'
+          }
+          flag_group {
+            flag: '%{libraries_to_link.name}'
+          }
+        }
+        flag_group {
+          expand_if_equal: {
+            variable: 'libraries_to_link.type'
+            value: 'static_library'
+          }
+          flag_group {
+            expand_if_false: 'libraries_to_link.is_whole_archive'
+            flag: '%{libraries_to_link.name}'
+          }
+          flag_group {
+            expand_if_true: 'libraries_to_link.is_whole_archive'
+            flag: '/WHOLEARCHIVE:%{libraries_to_link.name}'
+          }
+        }
+      }
+    }
+  }
+
+  # Since this feature is declared earlier in the CROSSTOOL than
+  # "user_link_flags", this feature will be applied prior to it anwyhere they
+  # are both implied. And since "user_link_flags" contains the linkopts from
+  # the build rule, this allows the user to override the /SUBSYSTEM in the BUILD
+  # file.
+  feature {
+    name: 'linker_subsystem_flag'
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: '/SUBSYSTEM:CONSOLE'
+      }
+    }
+  }
+
+  # The "user_link_flags" contains user-defined linkopts (from build rules)
+  # so it should be defined after features that declare user-overridable flags.
+  # For example the "linker_subsystem_flag" defines a default "/SUBSYSTEM" flag
+  # but we want to let the user override it, therefore "link_flag_subsystem" is
+  # defined earlier in the CROSSTOOL file than "user_link_flags".
+  feature {
+    name: 'user_link_flags'
+    flag_set {
+      expand_if_all_available: 'user_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'user_link_flags'
+        flag: '%{user_link_flags}'
+      }
+    }
+  }
+  feature {
+    name: 'legacy_link_flags'
+    flag_set {
+      expand_if_all_available: 'legacy_link_flags'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'legacy_link_flags'
+        flag: '%{legacy_link_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'linker_param_file'
+    flag_set {
+      expand_if_all_available: 'linker_param_file'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      action: 'c++-link-static-library'
+      flag_group {
+        flag: '@%{linker_param_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'static_link_msvcrt'
+  }
+
+  feature {
+    name: 'static_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MT"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_no_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MD"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrt.lib"
+      }
+    }
+    requires: { feature: 'fastbuild'}
+    requires: { feature: 'opt'}
+  }
+
+  feature {
+    name: 'static_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MTd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:libcmtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dynamic_link_msvcrt_debug'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/MDd"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEFAULTLIB:msvcrtd.lib"
+      }
+    }
+    requires: { feature: 'dbg'}
+  }
+
+  feature {
+    name: 'dbg'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FULL"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'fastbuild'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/Od"
+        flag: "/Z7"
+        flag: "/DDEBUG"
+      }
+    }
+    flag_set {
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEBUG:FASTLINK"
+        flag: "/INCREMENTAL:NO"
+      }
+    }
+    implies: 'generate_pdb_file'
+  }
+
+  feature {
+    name: 'opt'
+    flag_set {
+      action: 'c-compile'
+      action: 'c++-compile'
+      flag_group {
+        flag: "/O2"
+        flag: "/DNDEBUG"
+      }
+    }
+  }
+
+  feature {
+    name: 'user_compile_flags'
+    flag_set {
+      expand_if_all_available: 'user_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'user_compile_flags'
+        flag: '%{user_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'sysroot'
+    flag_set {
+      expand_if_all_available: 'sysroot'
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        iterate_over: 'sysroot'
+        flag: '--sysroot=%{sysroot}'
+      }
+    }
+  }
+
+  feature {
+    name: 'unfiltered_compile_flags'
+    flag_set {
+      expand_if_all_available: 'unfiltered_compile_flags'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        iterate_over: 'unfiltered_compile_flags'
+        flag: '%{unfiltered_compile_flags}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_output_flags'
+    flag_set {
+      action: 'assemble'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+        flag: '/Zi'
+      }
+    }
+    flag_set {
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_none_available: 'output_assembly_file'
+        expand_if_none_available: 'output_preprocess_file'
+        flag: '/Fo%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_assembly_file'
+        flag: '/Fa%{output_file}'
+      }
+      flag_group {
+        expand_if_all_available: 'output_file'
+        expand_if_all_available: 'output_preprocess_file'
+        flag: '/P'
+        flag: '/Fi%{output_file}'
+      }
+    }
+  }
+
+  feature {
+    name: 'compiler_input_flags'
+    flag_set {
+      action: 'assemble'
+      action: 'preprocess-assemble'
+      action: 'c-compile'
+      action: 'c++-compile'
+      action: 'c++-header-parsing'
+      action: 'c++-module-compile'
+      action: 'c++-module-codegen'
+      flag_group {
+        expand_if_all_available: 'source_file'
+        flag: '/c'
+        flag: '%{source_file}'
+      }
+    }
+  }
+
+  feature {
+    name : 'def_file',
+    flag_set {
+      expand_if_all_available: 'def_file_path'
+      action: 'c++-link-executable'
+      action: 'c++-link-dynamic-library'
+      action: "c++-link-nodeps-dynamic-library"
+      flag_group {
+        flag: "/DEF:%{def_file_path}"
+        # We can specify a different DLL name in DEF file, /ignore:4070 suppresses
+        # the warning message about DLL name doesn't match the default one.
+        # See https://msdn.microsoft.com/en-us/library/sfkk2fz7.aspx
+        flag: "/ignore:4070"
+      }
+    }
+  }
+
+  feature {
+    name: 'windows_export_all_symbols'
+  }
+
+  feature {
+    name: 'no_windows_export_all_symbols'
+  }
+
+  linking_mode_flags { mode: DYNAMIC }
+}
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
new file mode 100755
index 0000000000000000000000000000000000000000..63893d3722f6b43579758e5f747076b1f1e73ed7
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/clang/bin/crosstool_wrapper_driver_is_not_gcc
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs.
+
+SYNOPSIS:
+  crosstool_wrapper_is_not_gcc [options passed in by cc_library()
+                                or cc_binary() rule]
+
+DESCRIPTION:
+  This script is expected to be called by the cc_library() or cc_binary() bazel
+  rules. When the option "-x cuda" is present in the list of arguments passed
+  to this script, it invokes the nvcc CUDA compiler. Most arguments are passed
+  as is as a string to --compiler-options of nvcc. When "-x cuda" is not
+  present, this wrapper invokes hybrid_driver_is_not_gcc with the input
+  arguments as is.
+
+NOTES:
+  Changes to the contents of this file must be propagated from
+  //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc to
+  //third_party/gpus/crosstool/v*/*/clang/bin/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+__author__ = 'keveman@google.com (Manjunath Kudlur)'
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+PREFIX_DIR = os.path.dirname(GCC_HOST_COMPILER_PATH)
+NVCC_VERSION = '9.0'
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from the argv list.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    option: The option whose value to extract, without the leading '-'.
+
+  Returns:
+    A list of values, either directly following the option,
+    (eg., -opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., -opt val1 -opt val2).
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-' + option, nargs='*', action='append')
+  args, _ = parser.parse_known_args(argv)
+  if not args or not vars(args)[option]:
+    return []
+  else:
+    return sum(vars(args)[option], [])
+
+
+def GetHostCompilerOptions(argv):
+  """Collect the -isystem, -iquote, and --sysroot option values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be used as the --compiler-options to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-isystem', nargs='*', action='append')
+  parser.add_argument('-iquote', nargs='*', action='append')
+  parser.add_argument('--sysroot', nargs=1)
+  parser.add_argument('-g', nargs='*', action='append')
+  parser.add_argument('-fno-canonical-system-headers', action='store_true')
+
+  args, _ = parser.parse_known_args(argv)
+
+  opts = ''
+
+  if args.isystem:
+    opts += ' -isystem ' + ' -isystem '.join(sum(args.isystem, []))
+  if args.iquote:
+    opts += ' -iquote ' + ' -iquote '.join(sum(args.iquote, []))
+  if args.g:
+    opts += ' -g' + ' -g'.join(sum(args.g, []))
+  if args.fno_canonical_system_headers:
+    opts += ' -fno-canonical-system-headers'
+  if args.sysroot:
+    opts += ' --sysroot ' + args.sysroot[0]
+
+  return opts
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    The string that can be passed directly to nvcc.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, _ = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return ' '.join(['--'+a for a in options])
+  return ''
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  host_compiler_options = GetHostCompilerOptions(argv)
+  nvcc_compiler_options = GetNvccOptions(argv)
+  opt_option = GetOptionValue(argv, 'O')
+  m_options = GetOptionValue(argv, 'm')
+  m_options = ''.join([' -m' + m for m in m_options if m in ['32', '64']])
+  include_options = GetOptionValue(argv, 'I')
+  out_file = GetOptionValue(argv, 'o')
+  depfiles = GetOptionValue(argv, 'MF')
+  defines = GetOptionValue(argv, 'D')
+  defines = ''.join([' -D' + define for define in defines])
+  undefines = GetOptionValue(argv, 'U')
+  undefines = ''.join([' -U' + define for define in undefines])
+  std_options = GetOptionValue(argv, 'std')
+  # currently only c++11 is supported by Cuda 7.0 std argument
+  nvcc_allowed_std_options = ["c++11"]
+  std_options = ''.join([' -std=' + define
+      for define in std_options if define in nvcc_allowed_std_options])
+
+  # The list of source files get passed after the -c option. I don't know of
+  # any other reliable way to just get the list of source files to be compiled.
+  src_files = GetOptionValue(argv, 'c')
+
+  # Pass -w through from host to nvcc, but don't do anything fancier with
+  # warnings-related flags, since they're not necessarily the same across
+  # compilers.
+  warning_options = ' -w' if '-w' in argv else ''
+
+  if len(src_files) == 0:
+    return 1
+  if len(out_file) != 1:
+    return 1
+
+  opt = (' -O2' if (len(opt_option) > 0 and int(opt_option[0]) > 0)
+         else ' -g -G')
+
+  includes = (' -I ' + ' -I '.join(include_options)
+              if len(include_options) > 0
+              else '')
+
+  # Unfortunately, there are other options that have -c prefix too.
+  # So allowing only those look like C/C++ files.
+  src_files = [f for f in src_files if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  srcs = ' '.join(src_files)
+  out = ' -o ' + out_file[0]
+
+  supported_cuda_compute_capabilities = [ "3.0" ]
+  nvccopts = '-D_FORCE_INLINES '
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += r'-gencode=arch=compute_%s,\"code=sm_%s,compute_%s\" ' % (
+        capability, capability, capability)
+  nvccopts += ' ' + nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += std_options
+  nvccopts += m_options
+  nvccopts += warning_options
+
+  if depfiles:
+    # Generate the dependency file
+    depfile = depfiles[0]
+    cmd = (NVCC_PATH + ' ' + nvccopts +
+           ' --compiler-options "' + host_compiler_options + '"' +
+           ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+           ' -I .' +
+           ' -x cu ' + opt + includes + ' ' + srcs + ' -M -o ' + depfile)
+    if log: Log(cmd)
+    exit_status = os.system(cmd)
+    if exit_status != 0:
+      return exit_status
+
+  cmd = (NVCC_PATH + ' ' + nvccopts +
+         ' --compiler-options "' + host_compiler_options + ' -fPIC"' +
+         ' --compiler-bindir=' + GCC_HOST_COMPILER_PATH +
+         ' -I .' +
+         ' -x cu ' + opt + includes + ' -c ' + srcs + out)
+
+  # TODO(zhengxq): for some reason, 'gcc' needs this help to find 'as'.
+  # Need to investigate and fix.
+  cmd = 'PATH=' + PREFIX_DIR + ':$PATH ' + cmd
+  if log: Log(cmd)
+  return os.system(cmd)
+
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
new file mode 100755
index 0000000000000000000000000000000000000000..e896e654fd7ecd578c80d102895f51ce18bbd4eb
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.bat
@@ -0,0 +1,20 @@
+:: Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+::
+:: Licensed under the Apache License, Version 2.0 (the "License");
+:: you may not use this file except in compliance with the License.
+:: You may obtain a copy of the License at
+::
+::     http://www.apache.org/licenses/LICENSE-2.0
+::
+:: Unless required by applicable law or agreed to in writing, software
+:: distributed under the License is distributed on an "AS IS" BASIS,
+:: WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+:: See the License for the specific language governing permissions and
+:: limitations under the License.
+:: =============================================================================
+
+:: Invoke msvc_wrapper_for_nvcc.py, which is located in the same directory.
+@echo OFF
+set arg0=%~0
+for %%F in ("%arg0%") do set DRIVER_BIN=%%~dpF
+"/usr/bin/python3" -B "%DRIVER_BIN%\msvc_wrapper_for_nvcc.py" %*
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
new file mode 100755
index 0000000000000000000000000000000000000000..859b3196d5dba9afadeae56f34be04247b00fe09
--- /dev/null
+++ b/third_party/toolchains/preconfig/ubuntu14.04/gcc-nvcc-cuda9.0/windows/msvc_wrapper_for_nvcc.py
@@ -0,0 +1,192 @@
+#!/usr/bin/env python
+# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Crosstool wrapper for compiling CUDA programs with nvcc on Windows.
+
+DESCRIPTION:
+  This script is the Windows version of //third_party/gpus/crosstool/crosstool_wrapper_is_not_gcc
+"""
+
+from __future__ import print_function
+
+from argparse import ArgumentParser
+import os
+import subprocess
+import re
+import sys
+import pipes
+
+# Template values set by cuda_autoconf.
+CPU_COMPILER = ('/usr/bin/gcc')
+GCC_HOST_COMPILER_PATH = ('/usr/bin/gcc')
+
+NVCC_PATH = '/usr/local/cuda-9.0/bin/nvcc'
+NVCC_VERSION = '9.0'
+NVCC_TEMP_DIR = "C:\\Windows\\Temp\\nvcc_inter_files_tmp_dir"
+supported_cuda_compute_capabilities = [ "3.0" ]
+
+def Log(s):
+  print('gpus/crosstool: {0}'.format(s))
+
+
+def GetOptionValue(argv, option):
+  """Extract the list of values for option from options.
+
+  Args:
+    option: The option whose value to extract, without the leading '/'.
+
+  Returns:
+    1. A list of values, either directly following the option,
+    (eg., /opt val1 val2) or values collected from multiple occurrences of
+    the option (eg., /opt val1 /opt val2).
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser(prefix_chars='/')
+  parser.add_argument('/' + option, nargs='*', action='append')
+  args, leftover = parser.parse_known_args(argv)
+  if args and vars(args)[option]:
+    return (sum(vars(args)[option], []), leftover)
+  return ([], leftover)
+
+def _update_options(nvcc_options):
+  if NVCC_VERSION in ("7.0",):
+    return nvcc_options
+
+  update_options = { "relaxed-constexpr" : "expt-relaxed-constexpr" }
+  return [ update_options[opt] if opt in update_options else opt
+                    for opt in nvcc_options ]
+
+def GetNvccOptions(argv):
+  """Collect the -nvcc_options values from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+
+  Returns:
+    1. The string that can be passed directly to nvcc.
+    2. The leftover options.
+  """
+
+  parser = ArgumentParser()
+  parser.add_argument('-nvcc_options', nargs='*', action='append')
+
+  args, leftover = parser.parse_known_args(argv)
+
+  if args.nvcc_options:
+    options = _update_options(sum(args.nvcc_options, []))
+    return (['--' + a for a in options], leftover)
+  return ([], leftover)
+
+
+def InvokeNvcc(argv, log=False):
+  """Call nvcc with arguments assembled from argv.
+
+  Args:
+    argv: A list of strings, possibly the argv passed to main().
+    log: True if logging is requested.
+
+  Returns:
+    The return value of calling os.system('nvcc ' + args)
+  """
+
+  src_files = [f for f in argv if
+               re.search('\.cpp$|\.cc$|\.c$|\.cxx$|\.C$', f)]
+  if len(src_files) == 0:
+    raise Error('No source files found for cuda compilation.')
+
+  out_file = [ f for f in argv if f.startswith('/Fo') ]
+  if len(out_file) != 1:
+    raise Error('Please sepecify exactly one output file for cuda compilation.')
+  out = ['-o', out_file[0][len('/Fo'):]]
+
+  nvcc_compiler_options, argv = GetNvccOptions(argv)
+
+  opt_option, argv = GetOptionValue(argv, 'O')
+  opt = ['-g', '-G']
+  if (len(opt_option) > 0 and opt_option[0] != 'd'):
+    opt = ['-O2']
+
+  include_options, argv = GetOptionValue(argv, 'I')
+  includes = ["-I " + include for include in include_options]
+
+  defines, argv = GetOptionValue(argv, 'D')
+  defines = ['-D' + define for define in defines]
+
+  undefines, argv = GetOptionValue(argv, 'U')
+  undefines = ['-U' + define for define in undefines]
+
+  # The rest of the unrecongized options should be passed to host compiler
+  host_compiler_options = [option for option in argv if option not in (src_files + out_file)]
+
+  m_options = ["-m64"]
+
+  nvccopts = ['-D_FORCE_INLINES']
+  for capability in supported_cuda_compute_capabilities:
+    capability = capability.replace('.', '')
+    nvccopts += [r'-gencode=arch=compute_%s,"code=sm_%s,compute_%s"' % (
+        capability, capability, capability)]
+  nvccopts += nvcc_compiler_options
+  nvccopts += undefines
+  nvccopts += defines
+  nvccopts += m_options
+  nvccopts += ['--compiler-options="' + " ".join(host_compiler_options) + '"']
+  nvccopts += ['-x', 'cu'] + opt + includes + out + ['-c'] + src_files
+  # If we don't specify --keep-dir, nvcc will generate intermediate files under TEMP
+  # Put them under NVCC_TEMP_DIR instead, then Bazel can ignore files under NVCC_TEMP_DIR during dependency check
+  # http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html#options-for-guiding-compiler-driver
+  # Different actions are sharing NVCC_TEMP_DIR, so we cannot remove it if the directory already exists.
+  if os.path.isfile(NVCC_TEMP_DIR):
+    os.remove(NVCC_TEMP_DIR)
+  if not os.path.exists(NVCC_TEMP_DIR):
+    os.makedirs(NVCC_TEMP_DIR)
+  nvccopts += ['--keep', '--keep-dir', NVCC_TEMP_DIR]
+  cmd = [NVCC_PATH] + nvccopts
+  if log:
+    Log(cmd)
+  proc = subprocess.Popen(cmd,
+                          stdout=sys.stdout,
+                          stderr=sys.stderr,
+                          env=os.environ.copy(),
+                          shell=True)
+  proc.wait()
+  return proc.returncode
+
+def main():
+  parser = ArgumentParser()
+  parser.add_argument('-x', nargs=1)
+  parser.add_argument('--cuda_log', action='store_true')
+  args, leftover = parser.parse_known_args(sys.argv[1:])
+
+  if args.x and args.x[0] == 'cuda':
+    if args.cuda_log: Log('-x cuda')
+    leftover = [pipes.quote(s) for s in leftover]
+    if args.cuda_log: Log('using nvcc')
+    return InvokeNvcc(leftover, log=args.cuda_log)
+
+  # Strip our flags before passing through to the CPU compiler for files which
+  # are not -x cuda. We can't just pass 'leftover' because it also strips -x.
+  # We not only want to pass -x to the CPU compiler, but also keep it in its
+  # relative location in the argv list (the compiler is actually sensitive to
+  # this).
+  cpu_compiler_flags = [flag for flag in sys.argv[1:]
+                             if not flag.startswith(('--cuda_log'))
+                             and not flag.startswith(('-nvcc_options'))]
+
+  return subprocess.call([CPU_COMPILER] + cpu_compiler_flags)
+
+if __name__ == '__main__':
+  sys.exit(main())
diff --git a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
index e021df9e1e3066b597dddc5dc78da3121ddd2430..460c879d32f1381454b6d043bded61e66b02f41d 100755
--- a/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
+++ b/third_party/toolchains/preconfig/ubuntu14.04/py3/BUILD
@@ -136,7 +136,7 @@ genrule(
         "python_include/weakrefobject.h",
     ],
     cmd = """
-cp "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
+cp -f "/usr/include/python3.4m/Python-ast.h" "$(@D)/python_include/Python-ast.h" && cp -f "/usr/include/python3.4m/Python.h" "$(@D)/python_include/Python.h" && cp -f "/usr/include/python3.4m/abstract.h" "$(@D)/python_include/abstract.h" && cp -f "/usr/include/python3.4m/accu.h" "$(@D)/python_include/accu.h" && cp -f "/usr/include/python3.4m/asdl.h" "$(@D)/python_include/asdl.h" && cp -f "/usr/include/python3.4m/ast.h" "$(@D)/python_include/ast.h" && cp -f "/usr/include/python3.4m/bitset.h" "$(@D)/python_include/bitset.h" && cp -f "/usr/include/python3.4m/bltinmodule.h" "$(@D)/python_include/bltinmodule.h" && cp -f "/usr/include/python3.4m/boolobject.h" "$(@D)/python_include/boolobject.h" && cp -f "/usr/include/python3.4m/bytearrayobject.h" "$(@D)/python_include/bytearrayobject.h" && cp -f "/usr/include/python3.4m/bytes_methods.h" "$(@D)/python_include/bytes_methods.h" && cp -f "/usr/include/python3.4m/bytesobject.h" "$(@D)/python_include/bytesobject.h" && cp -f "/usr/include/python3.4m/cellobject.h" "$(@D)/python_include/cellobject.h" && cp -f "/usr/include/python3.4m/ceval.h" "$(@D)/python_include/ceval.h" && cp -f "/usr/include/python3.4m/classobject.h" "$(@D)/python_include/classobject.h" && cp -f "/usr/include/python3.4m/code.h" "$(@D)/python_include/code.h" && cp -f "/usr/include/python3.4m/codecs.h" "$(@D)/python_include/codecs.h" && cp -f "/usr/include/python3.4m/compile.h" "$(@D)/python_include/compile.h" && cp -f "/usr/include/python3.4m/complexobject.h" "$(@D)/python_include/complexobject.h" && cp -f "/usr/include/python3.4m/datetime.h" "$(@D)/python_include/datetime.h" && cp -f "/usr/include/python3.4m/descrobject.h" "$(@D)/python_include/descrobject.h" && cp -f "/usr/include/python3.4m/dictobject.h" "$(@D)/python_include/dictobject.h" && cp -f "/usr/include/python3.4m/dtoa.h" "$(@D)/python_include/dtoa.h" && cp -f "/usr/include/python3.4m/dynamic_annotations.h" "$(@D)/python_include/dynamic_annotations.h" && cp -f "/usr/include/python3.4m/enumobject.h" "$(@D)/python_include/enumobject.h" && cp -f "/usr/include/python3.4m/errcode.h" "$(@D)/python_include/errcode.h" && cp -f "/usr/include/python3.4m/eval.h" "$(@D)/python_include/eval.h" && cp -f "/usr/include/python3.4m/fileobject.h" "$(@D)/python_include/fileobject.h" && cp -f "/usr/include/python3.4m/fileutils.h" "$(@D)/python_include/fileutils.h" && cp -f "/usr/include/python3.4m/floatobject.h" "$(@D)/python_include/floatobject.h" && cp -f "/usr/include/python3.4m/frameobject.h" "$(@D)/python_include/frameobject.h" && cp -f "/usr/include/python3.4m/funcobject.h" "$(@D)/python_include/funcobject.h" && cp -f "/usr/include/python3.4m/genobject.h" "$(@D)/python_include/genobject.h" && cp -f "/usr/include/python3.4m/graminit.h" "$(@D)/python_include/graminit.h" && cp -f "/usr/include/python3.4m/grammar.h" "$(@D)/python_include/grammar.h" && cp -f "/usr/include/python3.4m/import.h" "$(@D)/python_include/import.h" && cp -f "/usr/include/python3.4m/intrcheck.h" "$(@D)/python_include/intrcheck.h" && cp -f "/usr/include/python3.4m/iterobject.h" "$(@D)/python_include/iterobject.h" && cp -f "/usr/include/python3.4m/listobject.h" "$(@D)/python_include/listobject.h" && cp -f "/usr/include/python3.4m/longintrepr.h" "$(@D)/python_include/longintrepr.h" && cp -f "/usr/include/python3.4m/longobject.h" "$(@D)/python_include/longobject.h" && cp -f "/usr/include/python3.4m/marshal.h" "$(@D)/python_include/marshal.h" && cp -f "/usr/include/python3.4m/memoryobject.h" "$(@D)/python_include/memoryobject.h" && cp -f "/usr/include/python3.4m/metagrammar.h" "$(@D)/python_include/metagrammar.h" && cp -f "/usr/include/python3.4m/methodobject.h" "$(@D)/python_include/methodobject.h" && cp -f "/usr/include/python3.4m/modsupport.h" "$(@D)/python_include/modsupport.h" && cp -f "/usr/include/python3.4m/moduleobject.h" "$(@D)/python_include/moduleobject.h" && cp -f "/usr/include/python3.4m/namespaceobject.h" "$(@D)/python_include/namespaceobject.h" && cp -f "/usr/include/python3.4m/node.h" "$(@D)/python_include/node.h" && cp -f "/usr/include/python3.4m/object.h" "$(@D)/python_include/object.h" && cp -f "/usr/include/python3.4m/objimpl.h" "$(@D)/python_include/objimpl.h" && cp -f "/usr/include/python3.4m/opcode.h" "$(@D)/python_include/opcode.h" && cp -f "/usr/include/python3.4m/osdefs.h" "$(@D)/python_include/osdefs.h" && cp -f "/usr/include/python3.4m/parsetok.h" "$(@D)/python_include/parsetok.h" && cp -f "/usr/include/python3.4m/patchlevel.h" "$(@D)/python_include/patchlevel.h" && cp -f "/usr/include/python3.4m/pgen.h" "$(@D)/python_include/pgen.h" && cp -f "/usr/include/python3.4m/pgenheaders.h" "$(@D)/python_include/pgenheaders.h" && cp -f "/usr/include/python3.4m/py_curses.h" "$(@D)/python_include/py_curses.h" && cp -f "/usr/include/python3.4m/pyarena.h" "$(@D)/python_include/pyarena.h" && cp -f "/usr/include/python3.4m/pyatomic.h" "$(@D)/python_include/pyatomic.h" && cp -f "/usr/include/python3.4m/pycapsule.h" "$(@D)/python_include/pycapsule.h" && cp -f "/usr/include/python3.4m/pyconfig.h" "$(@D)/python_include/pyconfig.h" && cp -f "/usr/include/python3.4m/pyctype.h" "$(@D)/python_include/pyctype.h" && cp -f "/usr/include/python3.4m/pydebug.h" "$(@D)/python_include/pydebug.h" && cp -f "/usr/include/python3.4m/pyerrors.h" "$(@D)/python_include/pyerrors.h" && cp -f "/usr/include/python3.4m/pyexpat.h" "$(@D)/python_include/pyexpat.h" && cp -f "/usr/include/python3.4m/pyfpe.h" "$(@D)/python_include/pyfpe.h" && cp -f "/usr/include/python3.4m/pygetopt.h" "$(@D)/python_include/pygetopt.h" && cp -f "/usr/include/python3.4m/pyhash.h" "$(@D)/python_include/pyhash.h" && cp -f "/usr/include/python3.4m/pymacconfig.h" "$(@D)/python_include/pymacconfig.h" && cp -f "/usr/include/python3.4m/pymacro.h" "$(@D)/python_include/pymacro.h" && cp -f "/usr/include/python3.4m/pymath.h" "$(@D)/python_include/pymath.h" && cp -f "/usr/include/python3.4m/pymem.h" "$(@D)/python_include/pymem.h" && cp -f "/usr/include/python3.4m/pyport.h" "$(@D)/python_include/pyport.h" && cp -f "/usr/include/python3.4m/pystate.h" "$(@D)/python_include/pystate.h" && cp -f "/usr/include/python3.4m/pystrcmp.h" "$(@D)/python_include/pystrcmp.h" && cp -f "/usr/include/python3.4m/pystrtod.h" "$(@D)/python_include/pystrtod.h" && cp -f "/usr/include/python3.4m/pythonrun.h" "$(@D)/python_include/pythonrun.h" && cp -f "/usr/include/python3.4m/pythread.h" "$(@D)/python_include/pythread.h" && cp -f "/usr/include/python3.4m/pytime.h" "$(@D)/python_include/pytime.h" && cp -f "/usr/include/python3.4m/rangeobject.h" "$(@D)/python_include/rangeobject.h" && cp -f "/usr/include/python3.4m/setobject.h" "$(@D)/python_include/setobject.h" && cp -f "/usr/include/python3.4m/sliceobject.h" "$(@D)/python_include/sliceobject.h" && cp -f "/usr/include/python3.4m/structmember.h" "$(@D)/python_include/structmember.h" && cp -f "/usr/include/python3.4m/structseq.h" "$(@D)/python_include/structseq.h" && cp -f "/usr/include/python3.4m/symtable.h" "$(@D)/python_include/symtable.h" && cp -f "/usr/include/python3.4m/sysmodule.h" "$(@D)/python_include/sysmodule.h" && cp -f "/usr/include/python3.4m/token.h" "$(@D)/python_include/token.h" && cp -f "/usr/include/python3.4m/traceback.h" "$(@D)/python_include/traceback.h" && cp -f "/usr/include/python3.4m/tupleobject.h" "$(@D)/python_include/tupleobject.h" && cp -f "/usr/include/python3.4m/typeslots.h" "$(@D)/python_include/typeslots.h" && cp -f "/usr/include/python3.4m/ucnhash.h" "$(@D)/python_include/ucnhash.h" && cp -f "/usr/include/python3.4m/unicodeobject.h" "$(@D)/python_include/unicodeobject.h" && cp -f "/usr/include/python3.4m/warnings.h" "$(@D)/python_include/warnings.h" && cp -f "/usr/include/python3.4m/weakrefobject.h" "$(@D)/python_include/weakrefobject.h"
    """,
 )
 
@@ -171,6 +171,6 @@ genrule(
         "numpy_include/numpy/utils.h",
     ],
     cmd = """
-cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
+cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__multiarray_api.h" "$(@D)/numpy_include/numpy/__multiarray_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/__ufunc_api.h" "$(@D)/numpy_include/numpy/__ufunc_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_neighborhood_iterator_imp.h" "$(@D)/numpy_include/numpy/_neighborhood_iterator_imp.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/_numpyconfig.h" "$(@D)/numpy_include/numpy/_numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayobject.h" "$(@D)/numpy_include/numpy/arrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/arrayscalars.h" "$(@D)/numpy_include/numpy/arrayscalars.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/halffloat.h" "$(@D)/numpy_include/numpy/halffloat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/multiarray_api.txt" "$(@D)/numpy_include/numpy/multiarray_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarrayobject.h" "$(@D)/numpy_include/numpy/ndarrayobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ndarraytypes.h" "$(@D)/numpy_include/numpy/ndarraytypes.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/noprefix.h" "$(@D)/numpy_include/numpy/noprefix.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_1_7_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_1_7_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_3kcompat.h" "$(@D)/numpy_include/numpy/npy_3kcompat.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_common.h" "$(@D)/numpy_include/numpy/npy_common.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_cpu.h" "$(@D)/numpy_include/numpy/npy_cpu.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_endian.h" "$(@D)/numpy_include/numpy/npy_endian.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_interrupt.h" "$(@D)/numpy_include/numpy/npy_interrupt.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_math.h" "$(@D)/numpy_include/numpy/npy_math.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_no_deprecated_api.h" "$(@D)/numpy_include/numpy/npy_no_deprecated_api.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/npy_os.h" "$(@D)/numpy_include/numpy/npy_os.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/numpyconfig.h" "$(@D)/numpy_include/numpy/numpyconfig.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/old_defines.h" "$(@D)/numpy_include/numpy/old_defines.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/oldnumeric.h" "$(@D)/numpy_include/numpy/oldnumeric.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufunc_api.txt" "$(@D)/numpy_include/numpy/ufunc_api.txt" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/ufuncobject.h" "$(@D)/numpy_include/numpy/ufuncobject.h" && cp -f "/usr/local/lib/python3.4/dist-packages/numpy/core/include/numpy/utils.h" "$(@D)/numpy_include/numpy/utils.h"
    """,
 )
diff --git a/third_party/toolchains/preconfig/win_1803/BUILD b/third_party/toolchains/preconfig/win_1803/BUILD
index 45209d260d8dfdd5ec69112669ee32a3b529b2db..ac599bc2f3d758ad7094fd9f9b748929d2c8ef7a 100644
--- a/third_party/toolchains/preconfig/win_1803/BUILD
+++ b/third_party/toolchains/preconfig/win_1803/BUILD
@@ -17,7 +17,7 @@ platform(
     remote_execution_properties = """
         properties:{
           name:"container-image"
-          value:"docker://gcr.io/tensorflow-testing/tf-rbe-win@sha256:bd22c6bfff6afc1fa4304ec4411df2410d93645494117585332a4e2258358422"
+          value:"docker://gcr.io/tensorflow-testing/tf-rbe-win@sha256:fbc5713566011cc27fc3651183a6e7c2fd56fc6f006618c53f8fc71e742feebd"
         }
         properties:{
           name: "OSFamily" value: "Windows"
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
index c00f005e46cb727265886e98313c790875a85089..edd958364811d2e063b10f3c2e3a347b601794b5 100644
--- a/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/BUILD
@@ -39,6 +39,9 @@ cc_toolchain_suite(
         "x64_windows|msvc-cl": ":cc-compiler-x64_windows",
         "x64_windows|msys-gcc": ":cc-compiler-x64_windows_msys",
         "x64_windows|mingw-gcc": ":cc-compiler-x64_windows_mingw",
+        "x64_windows_msys": ":cc-compiler-x64_windows_msys",
+        "x64_windows": ":cc-compiler-x64_windows",
+        "armeabi-v7a": ":cc-compiler-armeabi-v7a",
     },
 )
 
@@ -54,6 +57,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "msys_x64",
 )
 
 toolchain(
@@ -83,6 +87,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 0,
+    toolchain_identifier = "msys_x64_mingw",
 )
 
 toolchain(
@@ -112,6 +117,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "msvc_x64",
 )
 
 toolchain(
@@ -140,6 +146,7 @@ cc_toolchain(
     static_runtime_libs = [":empty"],
     strip_files = ":empty",
     supports_param_files = 1,
+    toolchain_identifier = "stub_armeabi-v7a",
 )
 
 toolchain(
diff --git a/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
index 04c8bcae456ad71e961a2a2f7dfa05875f666260..38a80c22da32de50a98b78da6e157db936d03040 100644
--- a/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
+++ b/third_party/toolchains/preconfig/win_1803/bazel_018/CROSSTOOL
@@ -14,42 +14,6 @@
 
 major_version: "local"
 minor_version: ""
-default_target_cpu: "same_as_host"
-
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "local"
-  toolchain_identifier: "stub_armeabi-v7a"
-}
-
-default_toolchain {
-  cpu: "armeabi-v7a"
-  toolchain_identifier: "stub_armeabi-v7a"
-}
-
-default_toolchain {
-  cpu: "x64_windows"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "x64_windows_msvc"
-  toolchain_identifier: "msvc_x64"
-}
-
-default_toolchain {
-  cpu: "x64_windows_msys"
-  toolchain_identifier: "msys_x64"
-}
-
-default_toolchain {
-  cpu: "s390x"
-  toolchain_identifier: "msys_x64"
-}
 
 # Android tooling requires a default toolchain for the armeabi-v7a cpu.
 toolchain {
diff --git a/tools/bazel.rc b/tools/bazel.rc
index 8c2052ee8ad6e6e2f5088e0a2d92067238a0b5ca..1fdf51f53e29c7111cf89c016400b710051cf9c6 100644
--- a/tools/bazel.rc
+++ b/tools/bazel.rc
@@ -72,6 +72,7 @@ build:nogcp --define=no_gcp_support=true
 build:nohdfs --define=no_hdfs_support=true
 build:nokafka --define=no_kafka_support=true
 build:noignite --define=no_ignite_support=true
+build:nonccl --define=no_nccl_support=true
 
 build --define=use_fast_cpp_protos=true
 build --define=allow_oversize_protos=true